/**
 * Decides whether a record key must be checked against the given file.
 *
 * <p>A file without min/max key-range metadata always has to be compared, since there is no way
 * to rule the key out. When the range is known, the file is only compared if the key falls
 * inside that range.
 *
 * @param indexInfo bloom-index metadata for the candidate file
 * @param recordKey the record key being looked up
 * @return true if the file must be consulted for this key
 */
boolean shouldCompareWithFile(BloomIndexFileInfo indexInfo, String recordKey) {
  if (indexInfo.hasKeyRanges()) {
    // Range metadata is available: prune the file when the key lies outside it.
    return indexInfo.isKeyInRange(recordKey);
  }
  // No range metadata: we cannot prune, so the file must always be compared.
  return true;
}
assertEquals(filesList.size(), 4); assertNull(filesList.get(0)._2().getMaxRecordKey()); assertNull(filesList.get(0)._2().getMinRecordKey()); assertFalse(filesList.get(1)._2().hasKeyRanges()); assertNotNull(filesList.get(2)._2().getMaxRecordKey()); assertNotNull(filesList.get(2)._2().getMinRecordKey()); assertTrue(filesList.get(3)._2().hasKeyRanges()); new Tuple2<>("2016/04/01", new BloomIndexFileInfo("2_0_20160401010101.parquet")), new Tuple2<>("2015/03/12", new BloomIndexFileInfo("1_0_20150312101010.parquet")), new Tuple2<>("2015/03/12", new BloomIndexFileInfo("3_0_20150312101010.parquet", "000", "000")), new Tuple2<>("2015/03/12", new BloomIndexFileInfo("4_0_20150312101010.parquet", "001", "003"))); assertEquals(expected, filesList);
if (shouldCompareWithFile(indexInfo, recordKey)) { recordComparisons.add( new Tuple2<>(String.format("%s#%s", indexInfo.getFileName(), recordKey), new Tuple2<>(indexInfo.getFileName(), new HoodieKey(recordKey, partitionPath))));
.readMinMaxRecordKeys(hoodieTable.getHadoopConf(), ft._2().getFileStatus().getPath()); return new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName(), minMaxKeys[0], minMaxKeys[1])); } catch (MetadataNotFoundException me) { logger.warn("Unable to find range metadata in file :" + ft._2()); return new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName())); .map(ft -> new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName()))) .collect(toList());
assertNull(filesMap.get("2016/04/01/2_0_20160401010101.parquet").getMaxRecordKey()); assertNull(filesMap.get("2016/04/01/2_0_20160401010101.parquet").getMinRecordKey()); assertFalse(filesMap.get("2015/03/12/1_0_20150312101010.parquet").hasKeyRanges()); assertNotNull(filesMap.get("2015/03/12/3_0_20150312101010.parquet").getMaxRecordKey()); assertNotNull(filesMap.get("2015/03/12/3_0_20150312101010.parquet").getMinRecordKey()); assertTrue(filesMap.get("2015/03/12/3_0_20150312101010.parquet").hasKeyRanges()); expected.put("2016/04/01/2_0_20160401010101.parquet", new BloomIndexFileInfo("2_0_20160401010101.parquet")); expected.put("2015/03/12/1_0_20150312101010.parquet", new BloomIndexFileInfo("1_0_20150312101010.parquet")); expected.put("2015/03/12/3_0_20150312101010.parquet", new BloomIndexFileInfo("3_0_20150312101010.parquet", "000", "000")); expected.put("2015/03/12/4_0_20150312101010.parquet", new BloomIndexFileInfo("4_0_20150312101010.parquet", "001", "003"));
if (shouldCompareWithFile(indexInfo, recordKey)) { recordComparisons.add( new Tuple2<>(String.format("%s#%s", indexInfo.getFileName(), recordKey), new Tuple2<>(indexInfo.getFileName(), new HoodieKey(recordKey, partitionPath))));
.readMinMaxRecordKeys(hoodieTable.getHadoopConf(), ft._2().getFileStatus().getPath()); return new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName(), minMaxKeys[0], minMaxKeys[1])); } catch (MetadataNotFoundException me) { logger.warn("Unable to find range metadata in file :" + ft._2()); return new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName())); .map(ft -> new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName()))) .collect(toList());
/**
 * Decides whether a record key must be checked against the given file.
 *
 * <p>Files lacking min/max key-range metadata can never be pruned and are always compared.
 * When a range is present, the file is compared only if the key falls within it.
 *
 * @param indexInfo bloom-index metadata for the candidate file
 * @param recordKey the record key being looked up
 * @return true if the file must be consulted for this key
 */
private boolean shouldCompareWithFile(BloomIndexFileInfo indexInfo, String recordKey) {
  if (indexInfo.hasKeyRanges()) {
    // Range metadata present: compare only when the key is inside the file's range.
    return indexInfo.isKeyInRange(recordKey);
  }
  // No range metadata: pruning is impossible, so the file must always be compared.
  return true;
}
if (shouldCompareWithFile(indexInfo._2(), recordKey)) { recordComparisons.add( new Tuple2<>(String.format("%s#%s", indexInfo._2().getFileName(), recordKey), new Tuple2<>(indexInfo._2().getFileName(), new HoodieKey(recordKey, indexInfo._1()))));
@Test
public void testRangePruning() {
  // Build an index over a single partition holding five files: f1 has no range metadata
  // (never prunable), the rest carry [min, max] record-key ranges.
  HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath(basePath).build();
  HoodieBloomIndex bloomIndex = new HoodieBloomIndex(writeConfig);

  final Map<String, List<BloomIndexFileInfo>> fileInfoPerPartition = new HashMap<>();
  fileInfoPerPartition.put("2017/10/22",
      Arrays.asList(
          new BloomIndexFileInfo("f1"),
          new BloomIndexFileInfo("f2", "000", "000"),
          new BloomIndexFileInfo("f3", "001", "003"),
          new BloomIndexFileInfo("f4", "002", "007"),
          new BloomIndexFileInfo("f5", "009", "010")));

  // Four incoming record keys, all in the same partition.
  JavaPairRDD<String, String> keyPairRdd = jsc.parallelize(Arrays.asList(
      new Tuple2<>("2017/10/22", "003"),
      new Tuple2<>("2017/10/22", "002"),
      new Tuple2<>("2017/10/22", "005"),
      new Tuple2<>("2017/10/22", "004"))).mapToPair(t -> t);

  List<Tuple2<String, Tuple2<String, HoodieKey>>> explodedComparisons =
      bloomIndex.explodeRecordRDDWithFileComparisons(fileInfoPerPartition, keyPairRdd).collect();

  // Pruning should leave 10 (file, key) comparison pairs in total.
  assertEquals(10, explodedComparisons.size());

  // Group the comparisons by record key; the "file#key" token carries the file name up front.
  Map<String, List<String>> filesPerRecordKey = explodedComparisons.stream().collect(
      Collectors.groupingBy(
          pair -> pair._2()._2().getRecordKey(),
          Collectors.mapping(pair -> pair._2()._1().split("#")[0], Collectors.toList())));

  assertEquals(4, filesPerRecordKey.size());
  // f1 (no range) is matched by every key; the ranged files match per their [min, max] bounds.
  assertEquals(Arrays.asList("f1", "f3", "f4"), filesPerRecordKey.get("002"));
  assertEquals(Arrays.asList("f1", "f3", "f4"), filesPerRecordKey.get("003"));
  assertEquals(Arrays.asList("f1", "f4"), filesPerRecordKey.get("004"));
  assertEquals(Arrays.asList("f1", "f4"), filesPerRecordKey.get("005"));
}
/**
 * Indexes (partition, file-info) pairs by their qualified path {@code "partition/fileName"}.
 *
 * @param filesList partition-to-file-info tuples to index
 * @return a map keyed by "partition/fileName" pointing at each file's index info
 */
private Map<String, BloomIndexFileInfo> toFileMap(List<Tuple2<String, BloomIndexFileInfo>> filesList) {
  Map<String, BloomIndexFileInfo> pathToFileInfo = new HashMap<>();
  // Key each entry by the partition path joined with the file name.
  filesList.forEach(entry ->
      pathToFileInfo.put(entry._1() + "/" + entry._2().getFileName(), entry._2()));
  return pathToFileInfo;
}
partitionToFileIndexInfo.put("2017/10/22", Arrays.asList(new BloomIndexFileInfo("f1"), new BloomIndexFileInfo("f2", "000", "000"), new BloomIndexFileInfo("f3", "001", "003"))); new BloomIndexFileInfo("f4", "002", "007"), new BloomIndexFileInfo("f5", "009", "010")));