JavaPairRDD<String, String> partitionRecordKeyPairRDD, int totalSubpartitions, HoodieTableMetaClient metaClient) { int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(), totalSubpartitions); explodeRecordRDDWithFileComparisons( partitionToFileIndexInfo, partitionRecordKeyPairRDD)
/** * Lookup the location for each record key and return the pair<record_key,location> for all record * keys already present and drop the record keys if not present */ private JavaPairRDD<String, String> lookupIndex( JavaPairRDD<String, String> partitionRecordKeyPairRDD, final JavaSparkContext jsc, final HoodieTable hoodieTable) { // Obtain records per partition, in the incoming records Map<String, Long> recordsPerPartition = partitionRecordKeyPairRDD.countByKey(); List<String> affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet()); // Step 2: Load all involved files as <Partition, filename> pairs List<Tuple2<String, BloomIndexFileInfo>> fileInfoList = loadInvolvedFiles(affectedPartitionPathList, jsc, hoodieTable); final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo = fileInfoList.stream() .collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList()))); // Step 3: Obtain a RDD, for each incoming record, that already exists, with the file id, // that contains it. int parallelism = autoComputeParallelism(recordsPerPartition, partitionToFileInfo, partitionRecordKeyPairRDD); return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD, parallelism, hoodieTable.getMetaClient()); }
public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys, JavaSparkContext jsc, HoodieTable<T> hoodieTable) { JavaPairRDD<String, String> partitionRecordKeyPairRDD = hoodieKeys .mapToPair(key -> new Tuple2<>(key.getPartitionPath(), key.getRecordKey())); // Lookup indexes for all the partition/recordkey pair JavaPairRDD<String, String> rowKeyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable); JavaPairRDD<String, HoodieKey> rowKeyHoodieKeyPairRDD = hoodieKeys .mapToPair(key -> new Tuple2<>(key.getRecordKey(), key)); return rowKeyHoodieKeyPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).mapToPair(keyPathTuple -> { Optional<String> recordLocationPath; if (keyPathTuple._2._2.isPresent()) { String fileName = keyPathTuple._2._2.get(); String partitionPath = keyPathTuple._2._1.getPartitionPath(); recordLocationPath = Optional .of(new Path(new Path(hoodieTable.getMetaClient().getBasePath(), partitionPath), fileName) .toUri().getPath()); } else { recordLocationPath = Optional.absent(); } return new Tuple2<>(keyPathTuple._2._1, recordLocationPath); }); }
@Test public void testRangePruning() { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); HoodieBloomIndex index = new HoodieBloomIndex(config); final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo = new HashMap<>(); partitionToFileIndexInfo.put("2017/10/22", Arrays.asList(new BloomIndexFileInfo("f1"), new BloomIndexFileInfo("f2", "000", "000"), new BloomIndexFileInfo("f3", "001", "003"), new BloomIndexFileInfo("f4", "002", "007"), new BloomIndexFileInfo("f5", "009", "010"))); JavaPairRDD<String, String> partitionRecordKeyPairRDD = jsc.parallelize(Arrays.asList( new Tuple2<>("2017/10/22", "003"), new Tuple2<>("2017/10/22", "002"), new Tuple2<>("2017/10/22", "005"), new Tuple2<>("2017/10/22", "004"))).mapToPair(t -> t); List<Tuple2<String, Tuple2<String, HoodieKey>>> comparisonKeyList = index.explodeRecordRDDWithFileComparisons( partitionToFileIndexInfo, partitionRecordKeyPairRDD).collect(); assertEquals(10, comparisonKeyList.size()); Map<String, List<String>> recordKeyToFileComps = comparisonKeyList.stream().collect(Collectors.groupingBy( t -> t._2()._2().getRecordKey(), Collectors.mapping(t -> t._2()._1().split("#")[0], Collectors.toList()))); assertEquals(4, recordKeyToFileComps.size()); assertEquals(Arrays.asList("f1", "f3", "f4"), recordKeyToFileComps.get("002")); assertEquals(Arrays.asList("f1", "f3", "f4"), recordKeyToFileComps.get("003")); assertEquals(Arrays.asList("f1", "f4"), recordKeyToFileComps.get("004")); assertEquals(Arrays.asList("f1", "f4"), recordKeyToFileComps.get("005")); }
@Test public void testTagLocationWithEmptyRDD() throws Exception { // We have some records to be tagged (two different partitions) JavaRDD<HoodieRecord> recordRDD = jsc.emptyRDD(); // Also create the metadata and config HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); HoodieTable table = HoodieTable.getHoodieTable(metadata, config, jsc); // Let's tag HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config); try { bloomIndex.tagLocation(recordRDD, jsc, table); } catch (IllegalArgumentException e) { fail("EmptyRDD should not result in IllegalArgumentException: Positive number of slices " + "required"); } }
@Test public void testLoadInvolvedFiles() throws IOException { HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); HoodieBloomIndex index = new HoodieBloomIndex(config); HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); HoodieTable table = HoodieTable.getHoodieTable(metadata, config, jsc); List<Tuple2<String, BloomIndexFileInfo>> filesList = index.loadInvolvedFiles(partitions, jsc, table); filesList = index.loadInvolvedFiles(partitions, jsc, table); assertEquals(filesList.size(), 4);
public static <T extends HoodieRecordPayload> HoodieIndex<T> createIndex(HoodieWriteConfig config, JavaSparkContext jsc) throws HoodieIndexException { switch (config.getIndexType()) { case HBASE: return new HBaseIndex<>(config); case INMEMORY: return new InMemoryHashIndex<>(config); case BLOOM: return new HoodieBloomIndex<>(config); case BUCKETED: return new BucketedIndex<>(config); default: throw new HoodieIndexException("Index type unspecified, set " + config.getIndexType()); } }
HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config); JavaPairRDD<HoodieKey, Optional<String>> taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, jsc, table); taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, jsc, table);
@Override public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, JavaSparkContext jsc, HoodieTable<T> hoodieTable) { // Step 0: cache the input record RDD if (config.getBloomIndexUseCaching()) { recordRDD.persist(config.getBloomIndexInputStorageLevel()); } // Step 1: Extract out thinner JavaPairRDD of (partitionPath, recordKey) JavaPairRDD<String, String> partitionRecordKeyPairRDD = recordRDD .mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey())); // Lookup indexes for all the partition/recordkey pair JavaPairRDD<String, String> rowKeyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable); // Cache the result, for subsequent stages. if (config.getBloomIndexUseCaching()) { rowKeyFilenamePairRDD.persist(StorageLevel.MEMORY_AND_DISK_SER()); } if (logger.isDebugEnabled()) { long totalTaggedRecords = rowKeyFilenamePairRDD.count(); logger.debug("Number of update records (ones tagged with a fileID): " + totalTaggedRecords); } // Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys // Cost: 4 sec. JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(rowKeyFilenamePairRDD, recordRDD); if (config.getBloomIndexUseCaching()) { recordRDD.unpersist(); // unpersist the input Record RDD rowKeyFilenamePairRDD.unpersist(); } return taggedRecordRDD; }
if (config.getBloomIndexPruneByRanges()) { totalComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo, partitionRecordKeyPairRDD).count(); } else {
/** * Load all involved files as <Partition, filename> pair RDD from all partitions in the table. */ @Override @VisibleForTesting List<Tuple2<String, BloomIndexFileInfo>> loadInvolvedFiles(List<String> partitions, final JavaSparkContext jsc, final HoodieTable hoodieTable) { HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); try { List<String> allPartitionPaths = FSUtils .getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(), config.shouldAssumeDatePartitioning()); return super.loadInvolvedFiles(allPartitionPaths, jsc, hoodieTable); } catch (IOException e) { throw new HoodieIOException("Failed to load all partitions", e); } }
if (shouldCompareWithFile(indexInfo, recordKey)) { recordComparisons.add( new Tuple2<>(String.format("%s#%s", indexInfo.getFileName(), recordKey),
HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config); JavaRDD<HoodieRecord> taggedRecordRDD = bloomIndex.tagLocation(recordRDD, jsc, table); table = HoodieTable.getHoodieTable(metadata, config, jsc); taggedRecordRDD = bloomIndex.tagLocation(recordRDD, jsc, table);
public static <T extends HoodieRecordPayload> HoodieIndex<T> createIndex(HoodieWriteConfig config, JavaSparkContext jsc) throws HoodieIndexException { switch (config.getIndexType()) { case HBASE: return new HBaseIndex<>(config); case INMEMORY: return new InMemoryHashIndex<>(config); case BLOOM: return new HoodieBloomIndex<>(config); case GLOBAL_BLOOM: return new HoodieGlobalBloomIndex<>(config); case BUCKETED: return new BucketedIndex<>(config); default: throw new HoodieIndexException("Index type unspecified, set " + config.getIndexType()); } }
@Override public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, JavaSparkContext jsc, HoodieTable<T> hoodieTable) { // Step 0: cache the input record RDD if (config.getBloomIndexUseCaching()) { recordRDD.persist(config.getBloomIndexInputStorageLevel()); } // Step 1: Extract out thinner JavaPairRDD of (partitionPath, recordKey) JavaPairRDD<String, String> partitionRecordKeyPairRDD = recordRDD .mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey())); // Lookup indexes for all the partition/recordkey pair JavaPairRDD<String, String> rowKeyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable); // Cache the result, for subsequent stages. if (config.getBloomIndexUseCaching()) { rowKeyFilenamePairRDD.persist(StorageLevel.MEMORY_AND_DISK_SER()); } if (logger.isDebugEnabled()) { long totalTaggedRecords = rowKeyFilenamePairRDD.count(); logger.debug("Number of update records (ones tagged with a fileID): " + totalTaggedRecords); } // Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys // Cost: 4 sec. JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(rowKeyFilenamePairRDD, recordRDD); if (config.getBloomIndexUseCaching()) { recordRDD.unpersist(); // unpersist the input Record RDD rowKeyFilenamePairRDD.unpersist(); } return taggedRecordRDD; }
if (config.getBloomIndexPruneByRanges()) { totalComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo, partitionRecordKeyPairRDD).count(); } else {
if (shouldCompareWithFile(indexInfo, recordKey)) { recordComparisons.add( new Tuple2<>(String.format("%s#%s", indexInfo.getFileName(), recordKey),
/** * Lookup the location for each record key and return the pair<record_key,location> for all record * keys already present and drop the record keys if not present */ private JavaPairRDD<String, String> lookupIndex( JavaPairRDD<String, String> partitionRecordKeyPairRDD, final JavaSparkContext jsc, final HoodieTable hoodieTable) { // Obtain records per partition, in the incoming records Map<String, Long> recordsPerPartition = partitionRecordKeyPairRDD.countByKey(); List<String> affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet()); // Step 2: Load all involved files as <Partition, filename> pairs List<Tuple2<String, BloomIndexFileInfo>> fileInfoList = loadInvolvedFiles(affectedPartitionPathList, jsc, hoodieTable); final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo = fileInfoList.stream() .collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList()))); // Step 3: Obtain a RDD, for each incoming record, that already exists, with the file id, // that contains it. int parallelism = autoComputeParallelism(recordsPerPartition, partitionToFileInfo, partitionRecordKeyPairRDD); return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD, parallelism, hoodieTable.getMetaClient()); }
HoodieTable table = HoodieTable.getHoodieTable(metadata, config, jsc); HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config); JavaRDD<HoodieRecord> taggedRecordRDD = bloomIndex.tagLocation(recordRDD, jsc, table);
JavaPairRDD<String, String> partitionRecordKeyPairRDD, int totalSubpartitions, HoodieTableMetaClient metaClient) { int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(), totalSubpartitions); explodeRecordRDDWithFileComparisons( partitionToFileIndexInfo, partitionRecordKeyPairRDD)