/**
 * Read the rowKey list from the given parquet file.
 *
 * @param configuration configuration to build fs object
 * @param filePath The parquet file path.
 * @return Set of row keys
 */
public static Set<String> readRowKeysFromParquet(Configuration configuration, Path filePath) {
  return filterParquetRowKeys(configuration, filePath, new HashSet<>());
}
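// Usage sketch (not from the original source): read every row key written into a parquet file.
// Assumes org.apache.hadoop.conf.Configuration and org.apache.hadoop.fs.Path are imported; the
// file path below is hypothetical.
public static void printRowKeys() {
  Configuration conf = new Configuration();
  Path filePath = new Path("/tmp/hoodie/2016/04/01/data.parquet"); // hypothetical parquet file
  // An empty filter set means "return all row keys in the file".
  Set<String> rowKeys = ParquetUtils.readRowKeysFromParquet(conf, filePath);
  System.out.println("Read " + rowKeys.size() + " row keys from " + filePath);
}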
private void initState(String fileName, String partitionPath) throws HoodieIndexException {
  try {
    Path filePath = new Path(basePath + "/" + partitionPath + "/" + fileName);
    bloomFilter = ParquetUtils
        .readBloomFilterFromParquetMetadata(metaClient.getHadoopConf(), filePath);
    candidateRecordKeys = new ArrayList<>();
    currentFile = fileName;
    currentPartitionPath = partitionPath;
  } catch (Exception e) {
    throw new HoodieIndexException("Error reading bloom filter from " + partitionPath + "/" + fileName, e);
  }
}
/**
 * Get the schema of the given parquet file.
 */
public static MessageType readSchema(Configuration configuration, Path parquetFilePath) {
  return readMetadata(configuration, parquetFilePath).getFileMetaData().getSchema();
}
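// Usage sketch (not from the original source): inspect the parquet schema of a file.
// Assumes org.apache.parquet.schema.MessageType is imported; the path is hypothetical.
public static void printParquetSchema() {
  Configuration conf = new Configuration();
  Path parquetFilePath = new Path("/tmp/hoodie/2016/04/01/data.parquet"); // hypothetical parquet file
  MessageType schema = ParquetUtils.readSchema(conf, parquetFilePath);
  System.out.println(schema.toString());
}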
@Test
public void testHoodieWriteSupport() throws Exception {
  List<String> rowKeys = new ArrayList<>();
  for (int i = 0; i < 1000; i++) {
    rowKeys.add(UUID.randomUUID().toString());
  }

  String filePath = basePath + "/test.parquet";
  writeParquetFile(filePath, rowKeys);

  // Read and verify
  List<String> rowKeysInFile = new ArrayList<>(
      ParquetUtils.readRowKeysFromParquet(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath)));
  Collections.sort(rowKeysInFile);
  Collections.sort(rowKeys);
  assertEquals("Did not read back the expected list of keys", rowKeys, rowKeysInFile);

  BloomFilter filterInFile =
      ParquetUtils.readBloomFilterFromParquetMetadata(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath));
  for (String rowKey : rowKeys) {
    assertTrue("key should be found in bloom filter", filterInFile.mightContain(rowKey));
  }
}
BloomFilter filter = ParquetUtils.readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(), parquetFilePath);
for (HoodieRecord record : records) {
  assertTrue(filter.mightContain(record.getRecordKey()));

List<GenericRecord> fileRecords = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), parquetFilePath);
GenericRecord newRecord;
int index = 0;

BloomFilter updatedFilter =
    ParquetUtils.readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(), updatedParquetFilePath);
for (HoodieRecord record : records) {
assertEquals("file should contain 100 records", ParquetUtils.readRowKeysFromParquet(jsc.hadoopConfiguration(), new Path(basePath, testPartitionPath + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(), 100); Path newFile = new Path(basePath, testPartitionPath + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1)); assertEquals("file should contain 140 records", ParquetUtils.readRowKeysFromParquet(jsc.hadoopConfiguration(), newFile).size(), 140); List<GenericRecord> records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), newFile); for (GenericRecord record : records) { String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); if (file.getFileName().contains(file1)) { assertEquals("Existing file should be expanded", commitTime3, file.getCommitTime()); records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), new Path(file.getPath())); for (GenericRecord record : records) { String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); } else { assertEquals("New file must be written for commit 3", commitTime3, file.getCommitTime()); records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), new Path(file.getPath())); for (GenericRecord record : records) { String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
public static String[] readMinMaxRecordKeys(Configuration configuration, Path parquetFilePath) {
  List<String> minMaxKeys = readParquetFooter(configuration, parquetFilePath,
      HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER, HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER);
  if (minMaxKeys.size() != 2) {
    throw new HoodieException(String.format(
        "Could not read min/max record key out of footer correctly from %s. Read: %s",
        parquetFilePath, minMaxKeys));
  }
  return new String[] {minMaxKeys.get(0), minMaxKeys.get(1)};
}
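// Usage sketch (not from the original source): read the min/max record keys stored in the footer,
// as the bloom index does when building per-file key ranges. The path is hypothetical; a
// HoodieException is thrown if either footer entry is missing.
public static void printKeyRange() {
  Configuration conf = new Configuration();
  Path parquetFilePath = new Path("/tmp/hoodie/2016/04/01/data.parquet"); // hypothetical parquet file
  String[] minMax = ParquetUtils.readMinMaxRecordKeys(conf, parquetFilePath);
  System.out.println("min key = " + minMax[0] + ", max key = " + minMax[1]);
}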
try {
  String[] minMaxKeys = ParquetUtils
      .readMinMaxRecordKeys(hoodieTable.getHadoopConf(), ft._2().getFileStatus().getPath());
  return new Tuple2<>(ft._1(),
      new BloomIndexFileInfo(ft._2().getFileName(), minMaxKeys[0], minMaxKeys[1]));
/**
 * Get the Avro schema of the given parquet file.
 */
public static Schema readAvroSchema(Configuration configuration, Path parquetFilePath) {
  return new AvroSchemaConverter().convert(readSchema(configuration, parquetFilePath));
}
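// Usage sketch (not from the original source): convert the parquet schema to an Avro schema,
// e.g. to construct Avro readers or writers for the file. Assumes org.apache.avro.Schema is
// imported; the path is hypothetical.
public static void printAvroSchema() {
  Configuration conf = new Configuration();
  Path parquetFilePath = new Path("/tmp/hoodie/2016/04/01/data.parquet"); // hypothetical parquet file
  Schema avroSchema = ParquetUtils.readAvroSchema(conf, parquetFilePath);
  System.out.println(avroSchema.toString(true)); // pretty-printed Avro schema JSON
}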
assertEquals("file should contain 100 records", ParquetUtils.readRowKeysFromParquet(jsc.hadoopConfiguration(), new Path(basePath, testPartitionPath + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(), 100); Path newFile = new Path(basePath, testPartitionPath + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1)); assertEquals("file should contain 140 records", ParquetUtils.readRowKeysFromParquet(jsc.hadoopConfiguration(), newFile).size(), 140); List<GenericRecord> records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), newFile); for (GenericRecord record : records) { String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); for (HoodieDataFile file : files) { assertEquals("All files must be at commit 3", commitTime3, file.getCommitTime()); records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), new Path(file.getPath())); totalInserts += records.size();
/**
 * Read out the bloom filter from the parquet file meta data.
 */
public static BloomFilter readBloomFilterFromParquetMetadata(Configuration configuration, Path parquetFilePath) {
  String footerVal = readParquetFooter(configuration, parquetFilePath,
      HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY).get(0);
  return new BloomFilter(footerVal);
}
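// Usage sketch (not from the original source): use the footer bloom filter as a cheap membership
// pre-check before touching the file's data pages. mightContain() can return false positives but
// never false negatives, so a "true" answer still needs confirmation against the actual row keys
// (see checkCandidatesAgainstFile below). The path is hypothetical.
public static boolean mayContainKey(Configuration conf, String recordKey) {
  Path parquetFilePath = new Path("/tmp/hoodie/2016/04/01/data.parquet"); // hypothetical parquet file
  BloomFilter filter = ParquetUtils.readBloomFilterFromParquetMetadata(conf, parquetFilePath);
  return filter.mightContain(recordKey);
}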
/**
 * Given a list of row keys and one file, return only row keys existing in that file.
 */
public static List<String> checkCandidatesAgainstFile(Configuration configuration,
    List<String> candidateRecordKeys, Path filePath) throws HoodieIndexException {
  List<String> foundRecordKeys = new ArrayList<>();
  try {
    // Load all rowKeys from the file, to double-confirm
    if (!candidateRecordKeys.isEmpty()) {
      Set<String> fileRowKeys = ParquetUtils.filterParquetRowKeys(configuration, filePath,
          new HashSet<>(candidateRecordKeys));
      foundRecordKeys.addAll(fileRowKeys);
      logger.info("After checking with row keys, we have " + foundRecordKeys.size() + " results, for file "
          + filePath + " => " + foundRecordKeys);
      if (logger.isDebugEnabled()) {
        logger.debug("Keys matching for file " + filePath + " => " + foundRecordKeys);
      }
    }
  } catch (Exception e) {
    throw new HoodieIndexException("Error checking candidate keys against file.", e);
  }
  return foundRecordKeys;
}
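// Usage sketch (not from the original source): confirm bloom-filter candidates against the row keys
// actually stored in the file. The candidate keys and path below are hypothetical.
public static void confirmCandidates(Configuration conf) throws HoodieIndexException {
  Path filePath = new Path("/tmp/hoodie/2016/04/01/data.parquet"); // hypothetical parquet file
  List<String> candidates = Arrays.asList("key-1", "key-2", "key-3"); // hypothetical candidate keys
  List<String> found = ParquetUtils.checkCandidatesAgainstFile(conf, candidates, filePath);
  System.out.println(found.size() + " of " + candidates.size() + " candidates exist in " + filePath);
}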
private static List<String> readParquetFooter(Configuration configuration, Path parquetFilePath,
    String... footerNames) {
  List<String> footerVals = new ArrayList<>();
  ParquetMetadata footer = readMetadata(configuration, parquetFilePath);
  Map<String, String> metadata = footer.getFileMetaData().getKeyValueMetaData();
  for (String footerName : footerNames) {
    if (metadata.containsKey(footerName)) {
      footerVals.add(metadata.get(footerName));
    } else {
      throw new MetadataNotFoundException("Could not find index in Parquet footer. "
          + "Looked for key " + footerName + " in " + parquetFilePath);
    }
  }
  return footerVals;
}
@Test
public void testFilterParquetRowKeys() throws Exception {
  List<String> rowKeys = new ArrayList<>();
  Set<String> filter = new HashSet<>();
  for (int i = 0; i < 1000; i++) {
    String rowKey = UUID.randomUUID().toString();
    rowKeys.add(rowKey);
    if (i % 100 == 0) {
      filter.add(rowKey);
    }
  }

  String filePath = basePath + "/test.parquet";
  writeParquetFile(filePath, rowKeys);

  // Read and verify
  Set<String> filtered =
      ParquetUtils.filterParquetRowKeys(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath), filter);
  assertEquals("Filtered count does not match", filter.size(), filtered.size());
  for (String rowKey : filtered) {
    assertTrue("filtered key must be in the given filter", filter.contains(rowKey));
  }
}