/**
 * Builds a column statistics object of the kind matching {@code type} and,
 * when source statistics are supplied, copies their min/max and null count
 * into it.
 *
 * @param statistics the source statistics; may be {@code null}, in which case
 *                   the freshly created statistics object is returned untouched
 * @param type       the primitive type used to select the statistics implementation
 * @return the statistics object created for {@code type}
 */
public static parquet.column.statistics.Statistics<?> readStats(Statistics statistics, PrimitiveTypeName type) {
    parquet.column.statistics.Statistics<?> stats =
            parquet.column.statistics.Statistics.getStatsBasedOnType(type);
    if (statistics == null) {
        return stats;
    }

    // Min and max are only applied when both are present.
    if (statistics.isSetMax() && statistics.isSetMin()) {
        // ByteBuffer.array() hands back the entire backing array: it ignores
        // arrayOffset(), position and limit, and throws for read-only or direct
        // buffers. Copy exactly the readable bytes through a duplicate view so
        // the source buffers' positions are left untouched.
        // NOTE(review): assumes min/max are java.nio.ByteBuffer fields - confirm.
        java.nio.ByteBuffer minView = statistics.min.duplicate();
        byte[] minBytes = new byte[minView.remaining()];
        minView.get(minBytes);

        java.nio.ByteBuffer maxView = statistics.max.duplicate();
        byte[] maxBytes = new byte[maxView.remaining()];
        maxView.get(maxBytes);

        stats.setMinMaxFromBytes(minBytes, maxBytes);
    }
    stats.setNumNulls(statistics.null_count);
    return stats;
}
/**
 * Checks that a predicate built from a single varchar value matches when the
 * column statistics carry that same value as both minimum and maximum.
 */
@Test
public void testMatchesWithStatistics()
        throws ParquetCorruptionException
{
    String value = "Test";
    ColumnDescriptor columnDescriptor = new ColumnDescriptor(new String[] {"path"}, BINARY, 0, 0);
    RichColumnDescriptor column = new RichColumnDescriptor(columnDescriptor, new PrimitiveType(OPTIONAL, BINARY, "Test column"));
    TupleDomain<ColumnDescriptor> effectivePredicate = getEffectivePredicate(column, createVarcharType(255), utf8Slice(value));
    TupleDomainParquetPredicate parquetPredicate = new TupleDomainParquetPredicate(effectivePredicate, singletonList(column));

    Statistics stats = getStatsBasedOnType(column.getType());
    stats.setNumNulls(1L);
    // The predicate side is built with utf8Slice, so encode the statistics
    // bytes explicitly as UTF-8 instead of relying on the platform default
    // charset. Two separate arrays are passed, as before.
    stats.setMinMaxFromBytes(
            value.getBytes(java.nio.charset.StandardCharsets.UTF_8),
            value.getBytes(java.nio.charset.StandardCharsets.UTF_8));

    assertTrue(parquetPredicate.matches(2, singletonMap(column, stats), ID, true));
}
/**
 * Replaces the current statistics with a newly created statistics object
 * chosen from the type of this writer's column path.
 */
private void resetStatistics() {
    statistics = Statistics.getStatsBasedOnType(path.getType());
}
/**
 * Replaces the current statistics with a newly created statistics object
 * chosen from the type of this writer's column path.
 */
private void resetStatistics() {
    statistics = Statistics.getStatsBasedOnType(path.getType());
}
/**
 * Creates a page writer for a single column.
 *
 * @param path        the descriptor of the column; its type selects the statistics implementation
 * @param compressor  the compressor stored on this writer
 * @param initialSize the size passed to the backing {@code CapacityByteArrayOutputStream}
 */
private ColumnChunkPageWriter(ColumnDescriptor path, BytesCompressor compressor, int initialSize) {
    this.path = path;
    this.compressor = compressor;
    this.buf = new CapacityByteArrayOutputStream(initialSize);
    // The parameter and the field assigned above refer to the same descriptor.
    this.totalStatistics = Statistics.getStatsBasedOnType(path.getType());
}
/**
 * Creates a page writer for a single column, buffering into a new
 * {@code ConcatenatingByteArrayCollector}.
 *
 * @param path       the descriptor of the column; its type selects the statistics implementation
 * @param compressor the compressor stored on this writer
 * @param pageSize   not read by this constructor; kept as part of the signature
 */
private ColumnChunkPageWriter(ColumnDescriptor path, BytesCompressor compressor, int pageSize) {
    this.path = path;
    this.compressor = compressor;
    this.buf = new ConcatenatingByteArrayCollector();
    // The parameter and the field assigned above refer to the same descriptor.
    this.totalStatistics = getStatsBasedOnType(path.getType());
}
/**
 * Builds a column statistics object of the kind matching {@code type} and,
 * when source statistics are supplied, copies their min/max and null count
 * into it.
 *
 * @param statistics the source statistics; may be {@code null}, in which case
 *                   the freshly created statistics object is returned untouched
 * @param type       the primitive type used to select the statistics implementation
 * @return the statistics object created for {@code type}
 */
public static parquet.column.statistics.Statistics<?> readStats(Statistics statistics, PrimitiveTypeName type) {
    parquet.column.statistics.Statistics<?> stats =
            parquet.column.statistics.Statistics.getStatsBasedOnType(type);
    if (statistics == null) {
        return stats;
    }

    // Min and max are only applied when both are present.
    if (statistics.isSetMax() && statistics.isSetMin()) {
        // ByteBuffer.array() hands back the entire backing array: it ignores
        // arrayOffset(), position and limit, and throws for read-only or direct
        // buffers. Copy exactly the readable bytes through a duplicate view so
        // the source buffers' positions are left untouched.
        // NOTE(review): assumes min/max are java.nio.ByteBuffer fields - confirm.
        java.nio.ByteBuffer minView = statistics.min.duplicate();
        byte[] minBytes = new byte[minView.remaining()];
        minView.get(minBytes);

        java.nio.ByteBuffer maxView = statistics.max.duplicate();
        byte[] maxBytes = new byte[maxView.remaining()];
        maxView.get(maxBytes);

        stats.setMinMaxFromBytes(minBytes, maxBytes);
    }
    stats.setNumNulls(statistics.null_count);
    return stats;
}
public static parquet.column.statistics.Statistics fromParquetStatistics(Statistics statistics, PrimitiveTypeName type) { // create stats object based on the column type parquet.column.statistics.Statistics stats = parquet.column.statistics.Statistics.getStatsBasedOnType(type); // If there was no statistics written to the footer, create an empty Statistics object and return if (statistics != null) { if (statistics.isSetMax() && statistics.isSetMin()) { stats.setMinMaxFromBytes(statistics.min.array(), statistics.max.array()); } stats.setNumNulls(statistics.null_count); } return stats; }
/** * start a column inside a block * @param descriptor the column descriptor * @param valueCount the value count in this column * @param statistics the statistics in this column * @param compressionCodecName * @throws IOException */ public void startColumn(ColumnDescriptor descriptor, long valueCount, CompressionCodecName compressionCodecName) throws IOException { state = state.startColumn(); if (DEBUG) LOG.debug(out.getPos() + ": start column: " + descriptor + " count=" + valueCount); currentEncodings = new HashSet<parquet.column.Encoding>(); currentChunkPath = ColumnPath.get(descriptor.getPath()); currentChunkType = descriptor.getType(); currentChunkCodec = compressionCodecName; currentChunkValueCount = valueCount; currentChunkFirstDataPage = out.getPos(); compressedLength = 0; uncompressedLength = 0; // need to know what type of stats to initialize to // better way to do this? currentStatistics = Statistics.getStatsBasedOnType(currentChunkType); }
/** * start a column inside a block * @param descriptor the column descriptor * @param valueCount the value count in this column * @param statistics the statistics in this column * @param compressionCodecName * @throws IOException */ public void startColumn(ColumnDescriptor descriptor, long valueCount, CompressionCodecName compressionCodecName) throws IOException { state = state.startColumn(); if (DEBUG) LOG.debug(out.getPos() + ": start column: " + descriptor + " count=" + valueCount); currentEncodings = new HashSet<parquet.column.Encoding>(); currentChunkPath = ColumnPath.get(descriptor.getPath()); currentChunkType = descriptor.getType(); currentChunkCodec = compressionCodecName; currentChunkValueCount = valueCount; currentChunkFirstDataPage = out.getPos(); compressedLength = 0; uncompressedLength = 0; // need to know what type of stats to initialize to // better way to do this? currentStatistics = Statistics.getStatsBasedOnType(currentChunkType); }
/**
 * Checks that a predicate built from a single varchar value matches when the
 * column statistics carry that same value as both minimum and maximum.
 */
@Test
public void testMatchesWithStatistics()
        throws ParquetCorruptionException
{
    String value = "Test";
    ColumnDescriptor columnDescriptor = new ColumnDescriptor(new String[] {"path"}, BINARY, 0, 0);
    RichColumnDescriptor column = new RichColumnDescriptor(columnDescriptor, new PrimitiveType(OPTIONAL, BINARY, "Test column"));
    TupleDomain<ColumnDescriptor> effectivePredicate = getEffectivePredicate(column, createVarcharType(255), utf8Slice(value));
    TupleDomainParquetPredicate parquetPredicate = new TupleDomainParquetPredicate(effectivePredicate, singletonList(column));

    Statistics stats = getStatsBasedOnType(column.getType());
    stats.setNumNulls(1L);
    // The predicate side is built with utf8Slice, so encode the statistics
    // bytes explicitly as UTF-8 instead of relying on the platform default
    // charset. Two separate arrays are passed, as before.
    stats.setMinMaxFromBytes(
            value.getBytes(java.nio.charset.StandardCharsets.UTF_8),
            value.getBytes(java.nio.charset.StandardCharsets.UTF_8));

    assertTrue(parquetPredicate.matches(2, singletonMap(column, stats), ID, true));
}