/** * Lookup the location for each record key and return the pair<record_key,location> for all record * keys already present and drop the record keys if not present */ private JavaPairRDD<String, String> lookupIndex( JavaPairRDD<String, String> partitionRecordKeyPairRDD, final JavaSparkContext jsc, final HoodieTable hoodieTable) { // Obtain records per partition, in the incoming records Map<String, Long> recordsPerPartition = partitionRecordKeyPairRDD.countByKey(); List<String> affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet()); // Step 2: Load all involved files as <Partition, filename> pairs List<Tuple2<String, BloomIndexFileInfo>> fileInfoList = loadInvolvedFiles(affectedPartitionPathList, jsc, hoodieTable); final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo = fileInfoList.stream() .collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList()))); // Step 3: Obtain a RDD, for each incoming record, that already exists, with the file id, // that contains it. int parallelism = autoComputeParallelism(recordsPerPartition, partitionToFileInfo, partitionRecordKeyPairRDD); return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD, parallelism, hoodieTable.getMetaClient()); }
/** * Lookup the location for each record key and return the pair<record_key,location> for all record * keys already present and drop the record keys if not present */ private JavaPairRDD<String, String> lookupIndex( JavaPairRDD<String, String> partitionRecordKeyPairRDD, final JavaSparkContext jsc, final HoodieTable hoodieTable) { // Obtain records per partition, in the incoming records Map<String, Long> recordsPerPartition = partitionRecordKeyPairRDD.countByKey(); List<String> affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet()); // Step 2: Load all involved files as <Partition, filename> pairs List<Tuple2<String, BloomIndexFileInfo>> fileInfoList = loadInvolvedFiles(affectedPartitionPathList, jsc, hoodieTable); final Map<String, List<BloomIndexFileInfo>> partitionToFileInfo = fileInfoList.stream() .collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList()))); // Step 3: Obtain a RDD, for each incoming record, that already exists, with the file id, // that contains it. int parallelism = autoComputeParallelism(recordsPerPartition, partitionToFileInfo, partitionRecordKeyPairRDD); return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD, parallelism, hoodieTable.getMetaClient()); }