/**
 * Reads all row keys present in the given parquet file.
 *
 * <p>Delegates to {@link #filterParquetRowKeys} with an empty (mutable) filter set,
 * which by convention means "no filtering — return every row key".
 *
 * @param configuration configuration used to build the filesystem object
 * @param filePath      path of the parquet file to read
 * @return the set of all row keys found in the file
 */
public static Set<String> readRowKeysFromParquet(Configuration configuration, Path filePath) {
  // An empty filter set selects every row key in the file. A mutable set is passed
  // deliberately, since the callee may operate on it in place.
  Set<String> emptyFilter = new HashSet<>();
  return filterParquetRowKeys(configuration, filePath, emptyFilter);
}
/** * Given a list of row keys and one file, return only row keys existing in that file. */ public static List<String> checkCandidatesAgainstFile(Configuration configuration, List<String> candidateRecordKeys, Path filePath) throws HoodieIndexException { List<String> foundRecordKeys = new ArrayList<>(); try { // Load all rowKeys from the file, to double-confirm if (!candidateRecordKeys.isEmpty()) { Set<String> fileRowKeys = ParquetUtils.filterParquetRowKeys(configuration, filePath, new HashSet<>(candidateRecordKeys)); foundRecordKeys.addAll(fileRowKeys); logger.info("After checking with row keys, we have " + foundRecordKeys.size() + " results, for file " + filePath + " => " + foundRecordKeys); if (logger.isDebugEnabled()) { logger.debug("Keys matching for file " + filePath + " => " + foundRecordKeys); } } } catch (Exception e) { throw new HoodieIndexException("Error checking candidate keys against file.", e); } return foundRecordKeys; }
/** * Given a list of row keys and one file, return only row keys existing in that file. */ public static List<String> checkCandidatesAgainstFile(Configuration configuration, List<String> candidateRecordKeys, Path filePath) throws HoodieIndexException { List<String> foundRecordKeys = new ArrayList<>(); try { // Load all rowKeys from the file, to double-confirm if (!candidateRecordKeys.isEmpty()) { Set<String> fileRowKeys = ParquetUtils.filterParquetRowKeys(configuration, filePath, new HashSet<>(candidateRecordKeys)); foundRecordKeys.addAll(fileRowKeys); logger.info("After checking with row keys, we have " + foundRecordKeys.size() + " results, for file " + filePath + " => " + foundRecordKeys); if (logger.isDebugEnabled()) { logger.debug("Keys matching for file " + filePath + " => " + foundRecordKeys); } } } catch (Exception e) { throw new HoodieIndexException("Error checking candidate keys against file.", e); } return foundRecordKeys; }
@Test public void testFilterParquetRowKeys() throws Exception { List<String> rowKeys = new ArrayList<>(); Set<String> filter = new HashSet<>(); for (int i = 0; i < 1000; i++) { String rowKey = UUID.randomUUID().toString(); rowKeys.add(rowKey); if (i % 100 == 0) { filter.add(rowKey); } } String filePath = basePath + "/test.parquet"; writeParquetFile(filePath, rowKeys); // Read and verify Set<String> filtered = ParquetUtils.filterParquetRowKeys(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath), filter); assertEquals("Filtered count does not match", filter.size(), filtered.size()); for (String rowKey : filtered) { assertTrue("filtered key must be in the given filter", filter.contains(rowKey)); } }