/**
 * Read the rowKey list from the given parquet file.
 *
 * @param configuration configuration to build fs object
 * @param filePath The parquet file path.
 * @return Set of row keys
 */
public static Set<String> readRowKeysFromParquet(Configuration configuration, Path filePath) {
  return filterParquetRowKeys(configuration, filePath, new HashSet<>());
}
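// Usage sketch (not from the original source): read every row key written into a parquet file.
// Assumes org.apache.hadoop.conf.Configuration and org.apache.hadoop.fs.Path are imported; the
// file path below is hypothetical.
public static void printRowKeys() {
  Configuration conf = new Configuration();
  Path filePath = new Path("/tmp/hoodie/2016/04/01/data.parquet"); // hypothetical parquet file
  // An empty filter set means "return all row keys in the file".
  Set<String> rowKeys = ParquetUtils.readRowKeysFromParquet(conf, filePath);
  System.out.println("Read " + rowKeys.size() + " row keys from " + filePath);
}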
private void initState(String fileName, String partitionPath) throws HoodieIndexException {
  try {
    Path filePath = new Path(basePath + "/" + partitionPath + "/" + fileName);
    bloomFilter = ParquetUtils
        .readBloomFilterFromParquetMetadata(metaClient.getHadoopConf(), filePath);
    candidateRecordKeys = new ArrayList<>();
    currentFile = fileName;
    currentPartitionPath = partitionPath;
  } catch (Exception e) {
    throw new HoodieIndexException("Error reading bloom filter from " + partitionPath + "/" + fileName, e);
  }
}
/**
 * Get the schema of the given parquet file.
 */
public static MessageType readSchema(Configuration configuration, Path parquetFilePath) {
  return readMetadata(configuration, parquetFilePath).getFileMetaData().getSchema();
}
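// Usage sketch (not from the original source): inspect the parquet schema of a file.
// Assumes org.apache.parquet.schema.MessageType is imported; the path is hypothetical.
public static void printParquetSchema() {
  Configuration conf = new Configuration();
  Path parquetFilePath = new Path("/tmp/hoodie/2016/04/01/data.parquet"); // hypothetical parquet file
  MessageType schema = ParquetUtils.readSchema(conf, parquetFilePath);
  System.out.println(schema.toString());
}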
@Test
public void testHoodieWriteSupport() throws Exception {
  List<String> rowKeys = new ArrayList<>();
  for (int i = 0; i < 1000; i++) {
    rowKeys.add(UUID.randomUUID().toString());
  }

  String filePath = basePath + "/test.parquet";
  writeParquetFile(filePath, rowKeys);

  // Read and verify
  List<String> rowKeysInFile = new ArrayList<>(
      ParquetUtils.readRowKeysFromParquet(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath)));
  Collections.sort(rowKeysInFile);
  Collections.sort(rowKeys);
  assertEquals("Did not read back the expected list of keys", rowKeys, rowKeysInFile);

  BloomFilter filterInFile =
      ParquetUtils.readBloomFilterFromParquetMetadata(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath));
  for (String rowKey : rowKeys) {
    assertTrue("key should be found in bloom filter", filterInFile.mightContain(rowKey));
  }
}
BloomFilter filter = ParquetUtils.readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(), parquetFilePath);
for (HoodieRecord record : records) {
  assertTrue(filter.mightContain(record.getRecordKey()));

List<GenericRecord> fileRecords = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), parquetFilePath);
GenericRecord newRecord;
int index = 0;

BloomFilter updatedFilter =
    ParquetUtils.readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(), updatedParquetFilePath);
for (HoodieRecord record : records) {
assertEquals("file should contain 100 records", ParquetUtils.readRowKeysFromParquet(jsc.hadoopConfiguration(), new Path(basePath, testPartitionPath + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(), 100); Path newFile = new Path(basePath, testPartitionPath + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1)); assertEquals("file should contain 140 records", ParquetUtils.readRowKeysFromParquet(jsc.hadoopConfiguration(), newFile).size(), 140); List<GenericRecord> records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), newFile); for (GenericRecord record : records) { String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); if (file.getFileName().contains(file1)) { assertEquals("Existing file should be expanded", commitTime3, file.getCommitTime()); records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), new Path(file.getPath())); for (GenericRecord record : records) { String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); } else { assertEquals("New file must be written for commit 3", commitTime3, file.getCommitTime()); records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), new Path(file.getPath())); for (GenericRecord record : records) { String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
public static String[] readMinMaxRecordKeys(Configuration configuration, Path parquetFilePath) {
  List<String> minMaxKeys = readParquetFooter(configuration, parquetFilePath,
      HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER, HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER);
  if (minMaxKeys.size() != 2) {
    throw new HoodieException(String.format(
        "Could not read min/max record key out of footer correctly from %s. Read: %s",
        parquetFilePath, minMaxKeys));
  }
  return new String[] {minMaxKeys.get(0), minMaxKeys.get(1)};
}
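// Usage sketch (not from the original source): read the min/max record keys stored in the footer,
// as the bloom index does when building per-file key ranges. The path is hypothetical; a
// HoodieException is thrown if either footer entry is missing.
public static void printKeyRange() {
  Configuration conf = new Configuration();
  Path parquetFilePath = new Path("/tmp/hoodie/2016/04/01/data.parquet"); // hypothetical parquet file
  String[] minMax = ParquetUtils.readMinMaxRecordKeys(conf, parquetFilePath);
  System.out.println("min key = " + minMax[0] + ", max key = " + minMax[1]);
}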
try {
  String[] minMaxKeys = ParquetUtils
      .readMinMaxRecordKeys(hoodieTable.getHadoopConf(), ft._2().getFileStatus().getPath());
  return new Tuple2<>(ft._1(),
      new BloomIndexFileInfo(ft._2().getFileName(), minMaxKeys[0], minMaxKeys[1]));
/**
 * Get the Avro schema of the given parquet file.
 */
public static Schema readAvroSchema(Configuration configuration, Path parquetFilePath) {
  return new AvroSchemaConverter().convert(readSchema(configuration, parquetFilePath));
}
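// Usage sketch (not from the original source): convert the parquet schema to an Avro schema,
// e.g. to construct Avro readers or writers for the file. Assumes org.apache.avro.Schema is
// imported; the path is hypothetical.
public static void printAvroSchema() {
  Configuration conf = new Configuration();
  Path parquetFilePath = new Path("/tmp/hoodie/2016/04/01/data.parquet"); // hypothetical parquet file
  Schema avroSchema = ParquetUtils.readAvroSchema(conf, parquetFilePath);
  System.out.println(avroSchema.toString(true)); // pretty-printed Avro schema JSON
}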
assertEquals("file should contain 100 records", ParquetUtils.readRowKeysFromParquet(jsc.hadoopConfiguration(), new Path(basePath, testPartitionPath + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(), 100); Path newFile = new Path(basePath, testPartitionPath + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1)); assertEquals("file should contain 140 records", ParquetUtils.readRowKeysFromParquet(jsc.hadoopConfiguration(), newFile).size(), 140); List<GenericRecord> records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), newFile); for (GenericRecord record : records) { String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); for (HoodieDataFile file : files) { assertEquals("All files must be at commit 3", commitTime3, file.getCommitTime()); records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), new Path(file.getPath())); totalInserts += records.size();
/**
 * Read out the bloom filter from the parquet file meta data.
 */
public static BloomFilter readBloomFilterFromParquetMetadata(Configuration configuration, Path parquetFilePath) {
  String footerVal = readParquetFooter(configuration, parquetFilePath,
      HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY).get(0);
  return new BloomFilter(footerVal);
}
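// Usage sketch (not from the original source): use the footer bloom filter as a cheap membership
// pre-check before touching the file's data pages. mightContain() can return false positives but
// never false negatives, so a "true" answer still needs confirmation against the actual row keys
// (see checkCandidatesAgainstFile below). The path is hypothetical.
public static boolean mayContainKey(Configuration conf, String recordKey) {
  Path parquetFilePath = new Path("/tmp/hoodie/2016/04/01/data.parquet"); // hypothetical parquet file
  BloomFilter filter = ParquetUtils.readBloomFilterFromParquetMetadata(conf, parquetFilePath);
  return filter.mightContain(recordKey);
}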
/**
 * Given a list of row keys and one file, return only row keys existing in that file.
 */
public static List<String> checkCandidatesAgainstFile(Configuration configuration,
    List<String> candidateRecordKeys, Path filePath) throws HoodieIndexException {
  List<String> foundRecordKeys = new ArrayList<>();
  try {
    // Load all rowKeys from the file, to double-confirm
    if (!candidateRecordKeys.isEmpty()) {
      Set<String> fileRowKeys = ParquetUtils.filterParquetRowKeys(configuration, filePath,
          new HashSet<>(candidateRecordKeys));
      foundRecordKeys.addAll(fileRowKeys);
      logger.info("After checking with row keys, we have " + foundRecordKeys.size() + " results, for file "
          + filePath + " => " + foundRecordKeys);
      if (logger.isDebugEnabled()) {
        logger.debug("Keys matching for file " + filePath + " => " + foundRecordKeys);
      }
    }
  } catch (Exception e) {
    throw new HoodieIndexException("Error checking candidate keys against file.", e);
  }
  return foundRecordKeys;
}
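// Usage sketch (not from the original source): confirm bloom-filter candidates against the row keys
// actually stored in the file. The candidate keys and path below are hypothetical.
public static void confirmCandidates(Configuration conf) throws HoodieIndexException {
  Path filePath = new Path("/tmp/hoodie/2016/04/01/data.parquet"); // hypothetical parquet file
  List<String> candidates = Arrays.asList("key-1", "key-2", "key-3"); // hypothetical candidate keys
  List<String> found = ParquetUtils.checkCandidatesAgainstFile(conf, candidates, filePath);
  System.out.println(found.size() + " of " + candidates.size() + " candidates exist in " + filePath);
}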
private static List<String> readParquetFooter(Configuration configuration, Path parquetFilePath,
    String... footerNames) {
  List<String> footerVals = new ArrayList<>();
  ParquetMetadata footer = readMetadata(configuration, parquetFilePath);
  Map<String, String> metadata = footer.getFileMetaData().getKeyValueMetaData();
  for (String footerName : footerNames) {
    if (metadata.containsKey(footerName)) {
      footerVals.add(metadata.get(footerName));
    } else {
      throw new MetadataNotFoundException("Could not find index in Parquet footer. "
          + "Looked for key " + footerName + " in " + parquetFilePath);
    }
  }
  return footerVals;
}
@Test
public void testFilterParquetRowKeys() throws Exception {
  List<String> rowKeys = new ArrayList<>();
  Set<String> filter = new HashSet<>();
  for (int i = 0; i < 1000; i++) {
    String rowKey = UUID.randomUUID().toString();
    rowKeys.add(rowKey);
    if (i % 100 == 0) {
      filter.add(rowKey);
    }
  }

  String filePath = basePath + "/test.parquet";
  writeParquetFile(filePath, rowKeys);

  // Read and verify
  Set<String> filtered =
      ParquetUtils.filterParquetRowKeys(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath), filter);
  assertEquals("Filtered count does not match", filter.size(), filtered.size());
  for (String rowKey : filtered) {
    assertTrue("filtered key must be in the given filter", filter.contains(rowKey));
  }
}