com.uber.hoodie.index.bloom.HoodieBloomIndex Java code examples

 JavaPairRDD<String, String> partitionRecordKeyPairRDD, int totalSubpartitions, HoodieTableMetaClient metaClient) {
int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(),
  totalSubpartitions);
  explodeRecordRDDWithFileComparisons(
    partitionToFileIndexInfo, partitionRecordKeyPairRDD)

/**
 * Resolves the location of each incoming record key. The result holds a
 * pair&lt;record_key, location&gt; for every record key that is already present; record keys
 * that are not present are dropped.
 */
private JavaPairRDD<String, String> lookupIndex(
    JavaPairRDD<String, String> partitionRecordKeyPairRDD, final JavaSparkContext jsc,
    final HoodieTable hoodieTable) {
  // Number of incoming records per partition path
  Map<String, Long> incomingCountByPartition = partitionRecordKeyPairRDD.countByKey();
  List<String> touchedPartitions = new ArrayList<>(incomingCountByPartition.keySet());

  // Step 2: Load all involved files as <Partition, filename> pairs and bucket them by partition
  List<Tuple2<String, BloomIndexFileInfo>> involvedFiles =
      loadInvolvedFiles(touchedPartitions, jsc, hoodieTable);
  final Map<String, List<BloomIndexFileInfo>> filesByPartition = involvedFiles.stream()
      .collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList())));

  // Step 3: For each incoming record that already exists, obtain the file id that contains it
  int lookupParallelism = autoComputeParallelism(
      incomingCountByPartition, filesByPartition, partitionRecordKeyPairRDD);
  return findMatchingFilesForRecordKeys(
      filesByPartition, partitionRecordKeyPairRDD, lookupParallelism, hoodieTable.getMetaClient());
}

/**
 * Pairs every given {@link HoodieKey} with the path of the file the index lookup matched it to.
 * Keys without a match are paired with an absent location.
 *
 * @param hoodieKeys keys to look up
 * @param jsc Spark context handed to the index lookup
 * @param hoodieTable table whose base path is used to build the returned file paths
 * @return one (key, optional file path) pair per joined key
 */
public JavaPairRDD<HoodieKey, Optional<String>> fetchRecordLocation(JavaRDD<HoodieKey> hoodieKeys,
    JavaSparkContext jsc, HoodieTable<T> hoodieTable) {
  // (partitionPath, recordKey) view of the keys, which is what the index lookup consumes
  JavaPairRDD<String, String> partitionRecordKeyPairRDD = hoodieKeys
      .mapToPair(k -> new Tuple2<>(k.getPartitionPath(), k.getRecordKey()));

  // Lookup indexes for all the partition/recordkey pairs
  JavaPairRDD<String, String> fileNameByRecordKey =
      lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable);

  // Index the original keys by record key so they can be joined with the lookup result
  JavaPairRDD<String, HoodieKey> keyByRecordKey = hoodieKeys
      .mapToPair(k -> new Tuple2<>(k.getRecordKey(), k));

  return keyByRecordKey.leftOuterJoin(fileNameByRecordKey).mapToPair(joined -> {
    Optional<String> location;
    if (!joined._2._2.isPresent()) {
      // The lookup produced no file for this record key
      location = Optional.absent();
    } else {
      String fileName = joined._2._2.get();
      String partitionPath = joined._2._1.getPartitionPath();
      // <base path>/<partition path>/<file name>, reduced to the path component of its URI
      Path partitionDir = new Path(hoodieTable.getMetaClient().getBasePath(), partitionPath);
      location = Optional.of(new Path(partitionDir, fileName).toUri().getPath());
    }
    return new Tuple2<>(joined._2._1, location);
  });
}

/**
 * Checks which files each record key is compared against when files carry key-range information.
 */
@Test
public void testRangePruning() {
  HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
  HoodieBloomIndex index = new HoodieBloomIndex(config);

  // One partition with five files. NOTE(review): the three-argument constructor presumably takes
  // (file name, min record key, max record key); f1 is built without a range -- confirm.
  final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo = new HashMap<>();
  partitionToFileIndexInfo.put("2017/10/22", Arrays.asList(
      new BloomIndexFileInfo("f1"),
      new BloomIndexFileInfo("f2", "000", "000"),
      new BloomIndexFileInfo("f3", "001", "003"),
      new BloomIndexFileInfo("f4", "002", "007"),
      new BloomIndexFileInfo("f5", "009", "010")));

  // Four incoming (partition, record key) pairs, all in that partition
  JavaPairRDD<String, String> partitionRecordKeyPairRDD = jsc.parallelize(Arrays.asList(
      new Tuple2<>("2017/10/22", "003"),
      new Tuple2<>("2017/10/22", "002"),
      new Tuple2<>("2017/10/22", "005"),
      new Tuple2<>("2017/10/22", "004"))).mapToPair(pair -> pair);

  List<Tuple2<String, Tuple2<String, HoodieKey>>> comparisons = index
      .explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD)
      .collect();
  assertEquals(10, comparisons.size());

  // Group by record key; the file name is the part of the comparison key before '#'
  Map<String, List<String>> filesByRecordKey = comparisons.stream().collect(
      Collectors.groupingBy(
          cmp -> cmp._2()._2().getRecordKey(),
          Collectors.mapping(cmp -> cmp._2()._1().split("#")[0], Collectors.toList())));

  assertEquals(4, filesByRecordKey.size());
  assertEquals(Arrays.asList("f1", "f3", "f4"), filesByRecordKey.get("002"));
  assertEquals(Arrays.asList("f1", "f3", "f4"), filesByRecordKey.get("003"));
  assertEquals(Arrays.asList("f1", "f4"), filesByRecordKey.get("004"));
  assertEquals(Arrays.asList("f1", "f4"), filesByRecordKey.get("005"));
}

/**
 * Tagging an empty RDD must not fail with an IllegalArgumentException.
 */
@Test
public void testTagLocationWithEmptyRDD() throws Exception {
  // Nothing to tag: the input is an empty RDD
  JavaRDD<HoodieRecord> recordRDD = jsc.emptyRDD();

  // Table metadata, write config and table handle for the base path under test
  HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
  HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
  HoodieTable table = HoodieTable.getHoodieTable(metaClient, config, jsc);

  HoodieBloomIndex index = new HoodieBloomIndex(config);
  try {
    index.tagLocation(recordRDD, jsc, table);
  } catch (IllegalArgumentException iae) {
    fail("EmptyRDD should not result in IllegalArgumentException: Positive number of slices required");
  }
}

@Test
public void testLoadInvolvedFiles() throws IOException {
 HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
 HoodieBloomIndex index = new HoodieBloomIndex(config);
 HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
 HoodieTable table = HoodieTable.getHoodieTable(metadata, config, jsc);
 List<Tuple2<String, BloomIndexFileInfo>> filesList = index.loadInvolvedFiles(partitions, jsc, table);
 filesList = index.loadInvolvedFiles(partitions, jsc, table);
 assertEquals(filesList.size(), 4);

/**
 * Builds the {@link HoodieIndex} implementation that corresponds to the index type returned by
 * {@code config.getIndexType()}.
 *
 * @param config write config that supplies the index type and is passed to the index constructor
 * @param jsc Spark context; not referenced by this method body
 * @return a newly constructed index for the configured type
 * @throws HoodieIndexException if the configured type matches none of the handled cases
 */
public static <T extends HoodieRecordPayload> HoodieIndex<T> createIndex(HoodieWriteConfig config,
    JavaSparkContext jsc) throws HoodieIndexException {
  final HoodieIndex<T> index;
  switch (config.getIndexType()) {
    case HBASE:
      index = new HBaseIndex<>(config);
      break;
    case INMEMORY:
      index = new InMemoryHashIndex<>(config);
      break;
    case BLOOM:
      index = new HoodieBloomIndex<>(config);
      break;
    case BUCKETED:
      index = new BucketedIndex<>(config);
      break;
    default:
      throw new HoodieIndexException("Index type unspecified, set " + config.getIndexType());
  }
  return index;
}

HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config);
JavaPairRDD<HoodieKey, Optional<String>> taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, jsc, table);
taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, jsc, table);

/**
 * Tags the incoming records with the location found by the index lookup and returns the tagged
 * RDD. When bloom-index caching is enabled in the config, the input RDD and the lookup result are
 * persisted for the duration of the call and unpersisted before returning.
 */
@Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, JavaSparkContext jsc,
    HoodieTable<T> hoodieTable) {
  // Step 0: persist the input records at the configured storage level
  if (config.getBloomIndexUseCaching()) {
    recordRDD.persist(config.getBloomIndexInputStorageLevel());
  }

  // Step 1: Reduce each record to a thin (partitionPath, recordKey) pair
  JavaPairRDD<String, String> partitionRecordKeyPairRDD = recordRDD
      .mapToPair(rec -> new Tuple2<>(rec.getPartitionPath(), rec.getRecordKey()));

  // Run the index lookup over all partition/recordkey pairs
  JavaPairRDD<String, String> fileNameByRecordKey =
      lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable);

  // Keep the lookup result around for the stages that follow
  if (config.getBloomIndexUseCaching()) {
    fileNameByRecordKey.persist(StorageLevel.MEMORY_AND_DISK_SER());
  }
  // The count is only computed when debug logging is on
  if (logger.isDebugEnabled()) {
    logger.debug("Number of update records (ones tagged with a fileID): " + fileNameByRecordKey.count());
  }

  // Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys
  JavaRDD<HoodieRecord<T>> tagged = tagLocationBacktoRecords(fileNameByRecordKey, recordRDD);

  if (config.getBloomIndexUseCaching()) {
    recordRDD.unpersist(); // release the input record RDD
    fileNameByRecordKey.unpersist();
  }
  return tagged;
}

if (config.getBloomIndexPruneByRanges()) {
 totalComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo,
   partitionRecordKeyPairRDD).count();
} else {

/**
 * Load all involved files as {@code <Partition, filename>} pairs from all partitions in the
 * table. The {@code partitions} argument is not consulted: the partition list is read from the
 * table's file system and handed to the superclass implementation.
 *
 * @throws HoodieIOException if listing the partitions fails with an {@link IOException}
 */
@Override
@VisibleForTesting
List<Tuple2<String, BloomIndexFileInfo>> loadInvolvedFiles(List<String> partitions, final JavaSparkContext jsc,
    final HoodieTable hoodieTable) {
  HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
  try {
    // Delegate with every partition path found under the table's base path
    return super.loadInvolvedFiles(
        FSUtils.getAllPartitionPaths(
            metaClient.getFs(), metaClient.getBasePath(), config.shouldAssumeDatePartitioning()),
        jsc,
        hoodieTable);
  } catch (IOException ioe) {
    throw new HoodieIOException("Failed to load all partitions", ioe);
  }
}

if (shouldCompareWithFile(indexInfo, recordKey)) {
 recordComparisons.add(
   new Tuple2<>(String.format("%s#%s", indexInfo.getFileName(), recordKey),

HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config);
JavaRDD<HoodieRecord> taggedRecordRDD = bloomIndex.tagLocation(recordRDD, jsc, table);
table = HoodieTable.getHoodieTable(metadata, config, jsc);
taggedRecordRDD = bloomIndex.tagLocation(recordRDD, jsc, table);

/**
 * Builds the {@link HoodieIndex} implementation that corresponds to the index type returned by
 * {@code config.getIndexType()}.
 *
 * @param config write config that supplies the index type and is passed to the index constructor
 * @param jsc Spark context; not referenced by this method body
 * @return a newly constructed index for the configured type
 * @throws HoodieIndexException if the configured type matches none of the handled cases
 */
public static <T extends HoodieRecordPayload> HoodieIndex<T> createIndex(HoodieWriteConfig config,
    JavaSparkContext jsc) throws HoodieIndexException {
  final HoodieIndex<T> index;
  switch (config.getIndexType()) {
    case HBASE:
      index = new HBaseIndex<>(config);
      break;
    case INMEMORY:
      index = new InMemoryHashIndex<>(config);
      break;
    case BLOOM:
      index = new HoodieBloomIndex<>(config);
      break;
    case GLOBAL_BLOOM:
      index = new HoodieGlobalBloomIndex<>(config);
      break;
    case BUCKETED:
      index = new BucketedIndex<>(config);
      break;
    default:
      throw new HoodieIndexException("Index type unspecified, set " + config.getIndexType());
  }
  return index;
}

/**
 * Tags the incoming records with the location found by the index lookup and returns the tagged
 * RDD. When bloom-index caching is enabled in the config, the input RDD and the lookup result are
 * persisted for the duration of the call and unpersisted before returning.
 */
@Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, JavaSparkContext jsc,
    HoodieTable<T> hoodieTable) {
  // Step 0: persist the input records at the configured storage level
  if (config.getBloomIndexUseCaching()) {
    recordRDD.persist(config.getBloomIndexInputStorageLevel());
  }

  // Step 1: Reduce each record to a thin (partitionPath, recordKey) pair
  JavaPairRDD<String, String> partitionRecordKeyPairRDD = recordRDD
      .mapToPair(rec -> new Tuple2<>(rec.getPartitionPath(), rec.getRecordKey()));

  // Run the index lookup over all partition/recordkey pairs
  JavaPairRDD<String, String> fileNameByRecordKey =
      lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable);

  // Keep the lookup result around for the stages that follow
  if (config.getBloomIndexUseCaching()) {
    fileNameByRecordKey.persist(StorageLevel.MEMORY_AND_DISK_SER());
  }
  // The count is only computed when debug logging is on
  if (logger.isDebugEnabled()) {
    logger.debug("Number of update records (ones tagged with a fileID): " + fileNameByRecordKey.count());
  }

  // Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys
  JavaRDD<HoodieRecord<T>> tagged = tagLocationBacktoRecords(fileNameByRecordKey, recordRDD);

  if (config.getBloomIndexUseCaching()) {
    recordRDD.unpersist(); // release the input record RDD
    fileNameByRecordKey.unpersist();
  }
  return tagged;
}

if (config.getBloomIndexPruneByRanges()) {
 totalComparisons = explodeRecordRDDWithFileComparisons(partitionToFileInfo,
   partitionRecordKeyPairRDD).count();
} else {

if (shouldCompareWithFile(indexInfo, recordKey)) {
 recordComparisons.add(
   new Tuple2<>(String.format("%s#%s", indexInfo.getFileName(), recordKey),

/**
 * Resolves the location of each incoming record key. The result holds a
 * pair&lt;record_key, location&gt; for every record key that is already present; record keys
 * that are not present are dropped.
 */
private JavaPairRDD<String, String> lookupIndex(
    JavaPairRDD<String, String> partitionRecordKeyPairRDD, final JavaSparkContext jsc,
    final HoodieTable hoodieTable) {
  // Number of incoming records per partition path
  Map<String, Long> incomingCountByPartition = partitionRecordKeyPairRDD.countByKey();
  List<String> touchedPartitions = new ArrayList<>(incomingCountByPartition.keySet());

  // Step 2: Load all involved files as <Partition, filename> pairs and bucket them by partition
  List<Tuple2<String, BloomIndexFileInfo>> involvedFiles =
      loadInvolvedFiles(touchedPartitions, jsc, hoodieTable);
  final Map<String, List<BloomIndexFileInfo>> filesByPartition = involvedFiles.stream()
      .collect(groupingBy(Tuple2::_1, mapping(Tuple2::_2, toList())));

  // Step 3: For each incoming record that already exists, obtain the file id that contains it
  int lookupParallelism = autoComputeParallelism(
      incomingCountByPartition, filesByPartition, partitionRecordKeyPairRDD);
  return findMatchingFilesForRecordKeys(
      filesByPartition, partitionRecordKeyPairRDD, lookupParallelism, hoodieTable.getMetaClient());
}

HoodieTable table = HoodieTable.getHoodieTable(metadata, config, jsc);
HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config);
JavaRDD<HoodieRecord> taggedRecordRDD = bloomIndex.tagLocation(recordRDD, jsc, table);

 JavaPairRDD<String, String> partitionRecordKeyPairRDD, int totalSubpartitions, HoodieTableMetaClient metaClient) {
int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(),
  totalSubpartitions);
  explodeRecordRDDWithFileComparisons(
    partitionToFileIndexInfo, partitionRecordKeyPairRDD)

Javadoc

Indexing mechanism based on bloom filter. Each parquet file includes its row_key bloom filter in its metadata.

Most used methods

<init>
explodeRecordRDDWithFileComparisons
For each incoming record, produce N output records, 1 each for each file against which the record's
loadInvolvedFiles
Load all involved files as <Partition, filename> pair RDD.
autoComputeParallelism
The index lookup can be skewed in three dimensions : #files, #partitions, #records To be able to smo
determineParallelism
It's crucial to pick the right parallelism. totalSubPartitions: this is deemed a safe limit, to be nic
fetchRecordLocation
findMatchingFilesForRecordKeys
Find out pair. All workload grouped by file-level. Join PairRDD(PartitionPath, RecordKey) and PairRD
lookupIndex
Look up the location for each record key and return the pair<record_key,location> for all record keys already present and
shouldCompareWithFile
If we don't have key ranges, we still need to compare against the file; there is no other choice. If we do,
tagLocation
tagLocationBacktoRecords
Tag the back to the original HoodieRecord RDD.

tagLocationBacktoRecords

Popular in Java

Running tasks concurrently on multiple threads
putExtra (Intent)
scheduleAtFixedRate (Timer)
getExternalFilesDir (Context)
String (java.lang)
SocketTimeoutException (java.net)
This exception is thrown when a timeout expired on a socket read or accept operation.
Calendar (java.util)
Calendar is an abstract base class for converting between a Date object and a set of integer fields
BlockingQueue (java.util.concurrent)
A java.util.Queue that additionally supports operations that wait for the queue to become non-empty
Callable (java.util.concurrent)
A task that returns a result and may throw an exception. Implementors define a single method with no
Pattern (java.util.regex)
Patterns are compiled regular expressions. In many cases, convenience methods such as String#matches
CodeWhisperer alternatives

How to use HoodieBloomIndex in com.uber.hoodie.index.bloom

Best Java code snippets using com.uber.hoodie.index.bloom.HoodieBloomIndex (Showing top 20 results out of 315)

How to use
HoodieBloomIndex
in
com.uber.hoodie.index.bloom