/**
 * Creates the exception for a record that was not found in the partition.
 *
 * <p>The message reports the record's key, its partition path and its current location,
 * in that order.
 *
 * @param record the record that could not be found
 */
public HoodieRecordMissingException(HoodieRecord record) {
  super(new StringBuilder("Record ")
      .append(record.getRecordKey())
      .append(" with partition path ")
      .append(record.getPartitionPath())
      .append(" in current location ")
      .append(record.getCurrentLocation())
      .append(" is not found in the partition")
      .toString());
}
}
/**
 * Redistributes the de-duplicated records with the given partitioner.
 *
 * <p>Each record is paired with a composite key made of the record's key and an
 * {@code Option} wrapping its current location; the pairs are partitioned by
 * {@code partitioner} and the composite key is then dropped again.
 *
 * @param dedupedRecords records to redistribute
 * @param partitioner partitioner applied to the (key, optional location) pairs
 * @return the same records, laid out according to {@code partitioner}
 */
private JavaRDD<HoodieRecord<T>> partition(JavaRDD<HoodieRecord<T>> dedupedRecords,
    Partitioner partitioner) {
  return dedupedRecords
      .mapToPair(rec -> new Tuple2<>(
          new Tuple2<>(rec.getKey(), Option.apply(rec.getCurrentLocation())),
          rec))
      .partitionBy(partitioner)
      // Only the record itself is needed downstream; discard the routing key.
      .map(keyedRecord -> keyedRecord._2());
}
/**
 * Shuffles the de-duplicated records into the layout dictated by {@code partitioner}.
 *
 * <p>Records are keyed by (record key, {@code Option} of current location) purely so the
 * partitioner can route them; the key is stripped off before returning.
 *
 * @param dedupedRecords records to redistribute
 * @param partitioner partitioner that receives the (key, optional location) tuple
 * @return the records after partitioning
 */
private JavaRDD<HoodieRecord<T>> partition(JavaRDD<HoodieRecord<T>> dedupedRecords,
    Partitioner partitioner) {
  return dedupedRecords
      .mapToPair(hoodieRecord -> new Tuple2<>(
          new Tuple2<>(hoodieRecord.getKey(), Option.apply(hoodieRecord.getCurrentLocation())),
          hoodieRecord))
      .partitionBy(partitioner)
      // values() projects each pair onto its second element, i.e. the record.
      .values();
}
private void buildProfile() { Map<Tuple2<String, Option<HoodieRecordLocation>>, Long> partitionLocationCounts = taggedRecords .mapToPair(record -> new Tuple2<>( new Tuple2<>(record.getPartitionPath(), Option.apply(record.getCurrentLocation())), record)).countByKey(); for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e : partitionLocationCounts .entrySet()) { String partitionPath = e.getKey()._1(); Long count = e.getValue(); Option<HoodieRecordLocation> locOption = e.getKey()._2(); if (!partitionPathStatMap.containsKey(partitionPath)) { partitionPathStatMap.put(partitionPath, new WorkloadStat()); } if (locOption.isDefined()) { // update partitionPathStatMap.get(partitionPath).addUpdates(locOption.get(), count); globalStat.addUpdates(locOption.get(), count); } else { // insert partitionPathStatMap.get(partitionPath).addInserts(count); globalStat.addInserts(count); } } }
private void buildProfile() { Map<Tuple2<String, Option<HoodieRecordLocation>>, Long> partitionLocationCounts = taggedRecords .mapToPair(record -> new Tuple2<>( new Tuple2<>(record.getPartitionPath(), Option.apply(record.getCurrentLocation())), record)).countByKey(); for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e : partitionLocationCounts .entrySet()) { String partitionPath = e.getKey()._1(); Long count = e.getValue(); Option<HoodieRecordLocation> locOption = e.getKey()._2(); if (!partitionPathStatMap.containsKey(partitionPath)) { partitionPathStatMap.put(partitionPath, new WorkloadStat()); } if (locOption.isDefined()) { // update partitionPathStatMap.get(partitionPath).addUpdates(locOption.get(), count); globalStat.addUpdates(locOption.get(), count); } else { // insert partitionPathStatMap.get(partitionPath).addInserts(count); globalStat.addInserts(count); } } }
java.util.Optional<HoodieRecordLocation> loc = rec.getNewLocation(); if (loc.isPresent()) { if (rec.getCurrentLocation() != null) {
java.util.Optional<HoodieRecordLocation> loc = rec.getNewLocation(); if (loc.isPresent()) { if (rec.getCurrentLocation() != null) {
.addCommitMetadataToRecord((GenericRecord) avroRecord.get(), commitTime, seqId); if (hoodieRecord.getCurrentLocation() != null) { updatedRecordsWritten++; } else {
.addCommitMetadataToRecord((GenericRecord) avroRecord.get(), commitTime, seqId); if (hoodieRecord.getCurrentLocation() != null) { updatedRecordsWritten++; } else {
/**
 * Ensure records have location field set and that the location carries the given commit time.
 *
 * <p>Fails the test on the first record whose current location is not known, or whose
 * current location reports a commit time different from {@code commitTime}.
 *
 * @param taggedRecords Tagged Records
 * @param commitTime Commit Timestamp every record's current location is expected to carry
 */
void checkTaggedRecords(List<HoodieRecord> taggedRecords, String commitTime) {
  for (HoodieRecord rec : taggedRecords) {
    assertTrue("Record " + rec + " found with no location.", rec.isCurrentLocationKnown());
    // assertEquals takes (message, expected, actual): the commit time passed in is the
    // expectation, the value read from the record is what is being verified.
    assertEquals("All records should have commit time " + commitTime + ", since updates were made",
        commitTime, rec.getCurrentLocation().getCommitTime());
  }
}
/** * Tag the <rowKey, filename> back to the original HoodieRecord RDD. */ private JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords( JavaPairRDD<String, String> rowKeyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) { JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD = recordRDD .mapToPair(record -> new Tuple2<>(record.getRecordKey(), record)); // Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null), // so we do left outer join. return rowKeyRecordPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).values().map(v1 -> { HoodieRecord<T> record = v1._1(); if (v1._2().isPresent()) { String filename = v1._2().get(); if (filename != null && !filename.isEmpty()) { // When you have a record in multiple files in the same partition, then rowKeyRecordPairRDD will have 2 // entries with the same exact in memory copy of the HoodieRecord and the 2 separate filenames that the // record is found in. This will result in setting currentLocation 2 times and it will fail the second time. // This check will create a new in memory copy of the hoodie record. if (record.getCurrentLocation() != null) { record = new HoodieRecord<T>(record.getKey(), record.getData()); } record.setCurrentLocation(new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename))); } } return record; }); }
/** * Tag the <rowKey, filename> back to the original HoodieRecord RDD. */ private JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords( JavaPairRDD<String, String> rowKeyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) { JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD = recordRDD .mapToPair(record -> new Tuple2<>(record.getRecordKey(), record)); // Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null), // so we do left outer join. return rowKeyRecordPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).values().map(v1 -> { HoodieRecord<T> record = v1._1(); if (v1._2().isPresent()) { String filename = v1._2().get(); if (filename != null && !filename.isEmpty()) { // When you have a record in multiple files in the same partition, then rowKeyRecordPairRDD will have 2 // entries with the same exact in memory copy of the HoodieRecord and the 2 separate filenames that the // record is found in. This will result in setting currentLocation 2 times and it will fail the second time. // This check will create a new in memory copy of the hoodie record. if (record.getCurrentLocation() != null) { record = new HoodieRecord<T>(record.getKey(), record.getData()); } record.setCurrentLocation(new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename))); } } return record; }); }
assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename0))); } else if (record.getRecordKey().equals("001")) { assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename2))); } else if (record.getRecordKey().equals("002")) { assertTrue(!record.isCurrentLocationKnown()); } else if (record.getRecordKey().equals("004")) { assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename3)));
assert onDiskHoodieRecord.getKey().equals(records.get(dkey).getKey()); assert records.get(ikey).getCurrentLocation().getFileId().equals(SpillableMapTestUtils.DUMMY_FILE_ID); assert records.get(ikey).getCurrentLocation().getCommitTime().equals(SpillableMapTestUtils.DUMMY_COMMIT_TIME);
table = new HoodieCopyOnWriteTable(config, jsc); Iterator<List<WriteStatus>> iter = table .handleUpdate(newCommitTime, updatedRecord1.getCurrentLocation().getFileId(), updatedRecords.iterator());
assert (javaRDD.filter(record -> record.getCurrentLocation().getFileId() == null).collect().size() == 0); List<String> taggedFileIds = javaRDD.map(record -> record.getCurrentLocation().getFileId()).distinct().collect(); assert (javaRDD.filter(record -> record.getCurrentLocation() != null).collect().size() == 0);
/**
 * Test helper that builds an {@code UpsertPartitioner} for a synthetic workload.
 *
 * <p>It fakes commit "001" with a single data file "file1" of {@code fileSize} in
 * {@code testPartitionPath}, generates {@code numInserts} inserts and {@code numUpdates}
 * updates (the updates are tagged with location ("001", "file1")), profiles the combined
 * records and asks a copy-on-write table for the partitioner. Before returning it asserts
 * that the first update record maps to partition 0.
 *
 * @param smallFileSize value passed to {@code compactionSmallFileSize}
 * @param numInserts number of insert records to generate
 * @param numUpdates number of update records to generate
 * @param fileSize size used for the faked data file
 * @param testPartitionPath partition path used for the faked file and generated records
 * @param autoSplitInserts value passed to {@code autoTuneInsertSplits}
 * @return the partitioner produced for the generated workload
 * @throws Exception if any of the setup steps fails
 */
private UpsertPartitioner getUpsertPartitioner(int smallFileSize, int numInserts, int numUpdates,
    int fileSize, String testPartitionPath, boolean autoSplitInserts) throws Exception {
  HoodieWriteConfig writeConfig = makeHoodieClientConfigBuilder()
      .withCompactionConfig(HoodieCompactionConfig.newBuilder()
          .compactionSmallFileSize(smallFileSize)
          .insertSplitSize(100)
          .autoTuneInsertSplits(autoSplitInserts)
          .build())
      .withStorageConfig(HoodieStorageConfig.newBuilder()
          .limitFileSize(1000 * 1024)
          .build())
      .build();

  // Lay down a fake commit and one existing data file for the partitioner to look at.
  HoodieClientTestUtils.fakeCommitFile(basePath, "001");
  HoodieClientTestUtils.fakeDataFile(basePath, testPartitionPath, "001", "file1", fileSize);

  // Not referenced again below; still constructed here exactly as before.
  HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
  HoodieCopyOnWriteTable cowTable = new HoodieCopyOnWriteTable(writeConfig, jsc);

  HoodieTestDataGenerator generator = new HoodieTestDataGenerator(new String[]{testPartitionPath});
  List<HoodieRecord> inserts = generator.generateInserts("001", numInserts);
  List<HoodieRecord> updates = generator.generateUpdates("001", numUpdates);
  // Updates must carry a location so they are treated as belonging to the existing file.
  updates.forEach(update -> update.setCurrentLocation(new HoodieRecordLocation("001", "file1")));

  List<HoodieRecord> workload = new ArrayList<>(inserts);
  workload.addAll(updates);

  WorkloadProfile workloadProfile = new WorkloadProfile(jsc.parallelize(workload));
  HoodieCopyOnWriteTable.UpsertPartitioner upsertPartitioner =
      (HoodieCopyOnWriteTable.UpsertPartitioner) cowTable.getUpsertPartitioner(workloadProfile);

  HoodieRecord firstUpdate = updates.get(0);
  assertEquals("Update record should have gone to the 1 update partiton", 0,
      upsertPartitioner.getPartition(
          new Tuple2<>(firstUpdate.getKey(), Option.apply(firstUpdate.getCurrentLocation()))));
  return upsertPartitioner;
}
assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename1))); } else if (record.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) { assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename2))); } else if (record.getRecordKey().equals("3eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) { assertTrue(!record.isCurrentLocationKnown()); } else if (record.getRecordKey().equals("4eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) { assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename3)));
assertTrue(javaRDD.map(record -> record.getKey().getRecordKey()).distinct().count() == 200); assertTrue(javaRDD.filter( record -> (record.getCurrentLocation() != null && record.getCurrentLocation().getCommitTime() .equals(newCommitTime))).distinct().count() == 200);
assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename))); } else if (record.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) { assertFalse(record.isCurrentLocationKnown());