public WorkloadProfile(JavaRDD<HoodieRecord<T>> taggedRecords) {
  this.taggedRecords = taggedRecords;
  this.partitionPathStatMap = new HashMap<>();
  this.globalStat = new WorkloadStat();
  buildProfile();
}
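// A minimal sketch of what buildProfile() could look like, assuming names not shown
// in this file (WorkloadStat#addInserts / #addUpdates, HoodieRecord#getPartitionPath):
// every tagged record is keyed by (partitionPath, currentLocation) and counted; a
// missing location means a new insert, a present location means an update to an
// existing file. This is illustrative, not the confirmed implementation.
private void buildProfile() {
  Map<Tuple2<String, Option<HoodieRecordLocation>>, Long> partitionLocationCounts = taggedRecords
      .mapToPair(record -> new Tuple2<>(
          new Tuple2<>(record.getPartitionPath(), Option.apply(record.getCurrentLocation())), record))
      .countByKey();

  for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e : partitionLocationCounts.entrySet()) {
    String path = e.getKey()._1();
    Option<HoodieRecordLocation> location = e.getKey()._2();
    WorkloadStat stat = partitionPathStatMap.computeIfAbsent(path, k -> new WorkloadStat());
    if (location.isDefined()) {
      // record was tagged to an existing file: count it as an update of that location
      stat.addUpdates(location.get(), e.getValue());
      globalStat.addUpdates(location.get(), e.getValue());
    } else {
      // untagged record: count it as an insert into this partition
      stat.addInserts(e.getValue());
      globalStat.addInserts(e.getValue());
    }
  }
}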
private JavaRDD<WriteStatus> upsertRecordsInternal(JavaRDD<HoodieRecord<T>> preppedRecords, String commitTime,
    HoodieTable<T> hoodieTable, final boolean isUpsert) {
  // Cache the tagged records, so we don't end up computing both
  // TODO: Consistent contract in HoodieWriteClient regarding preppedRecord storage level handling
  if (preppedRecords.getStorageLevel() == StorageLevel.NONE()) {
    preppedRecords.persist(StorageLevel.MEMORY_AND_DISK_SER());
  } else {
    logger.info("RDD PreppedRecords was persisted at: " + preppedRecords.getStorageLevel());
  }

  WorkloadProfile profile = null;
  if (hoodieTable.isWorkloadProfileNeeded()) {
    profile = new WorkloadProfile(preppedRecords);
    logger.info("Workload profile :" + profile);
    saveWorkloadProfileMetadataToInflight(profile, hoodieTable, commitTime);
  }

  // partition using the insert partitioner
  final Partitioner partitioner = getPartitioner(hoodieTable, isUpsert, profile);
  JavaRDD<HoodieRecord<T>> partitionedRecords = partition(preppedRecords, partitioner);
  JavaRDD<WriteStatus> writeStatusRDD = partitionedRecords
      .mapPartitionsWithIndex((partition, recordItr) -> {
        if (isUpsert) {
          return hoodieTable.handleUpsertPartition(commitTime, partition, recordItr, partitioner);
        } else {
          return hoodieTable.handleInsertPartition(commitTime, partition, recordItr, partitioner);
        }
      }, true).flatMap(List::iterator);
  return updateIndexAndCommitIfNeeded(writeStatusRDD, hoodieTable, commitTime);
}
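// Hedged sketches of the two helpers upsertRecordsInternal relies on. getPartitioner
// dispatches to the table: getUpsertPartitioner is confirmed by the test further down,
// while getInsertPartitioner is an assumed counterpart. partition(...) keys each record
// by (HoodieKey, currentLocation) -- the same tuple shape the partitioner's
// getPartition receives in that test -- before shuffling.
private Partitioner getPartitioner(HoodieTable<T> hoodieTable, boolean isUpsert, WorkloadProfile profile) {
  if (isUpsert) {
    return hoodieTable.getUpsertPartitioner(profile);
  }
  return hoodieTable.getInsertPartitioner(profile);
}

private JavaRDD<HoodieRecord<T>> partition(JavaRDD<HoodieRecord<T>> preppedRecords, Partitioner partitioner) {
  return preppedRecords
      // key: (record key, optional current file location); value: the record itself
      .mapToPair(record -> new Tuple2<>(
          new Tuple2<>(record.getKey(), Option.apply(record.getCurrentLocation())), record))
      // route each record to the bucket chosen by the upsert/insert partitioner
      .partitionBy(partitioner)
      // drop the key again; the partition handlers only need the records
      .map(tuple -> tuple._2());
}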
try {
  HoodieCommitMetadata metadata = new HoodieCommitMetadata();
  profile.getPartitionPaths().stream().forEach(path -> {
    WorkloadStat partitionStat = profile.getWorkloadStat(path.toString());
    partitionStat.getUpdateLocationToCount().entrySet().stream().forEach(entry -> {
      HoodieWriteStat writeStat = new HoodieWriteStat();
      // completed from context (setter names are assumptions): record which file is
      // being updated, the commit that last wrote it, and the pending update count
      writeStat.setFileId(entry.getKey());
      writeStat.setPrevCommit(entry.getValue().getKey());
      writeStat.setNumUpdateWrites(entry.getValue().getValue());
      metadata.addWriteStat(path.toString(), writeStat);
    });
  });
  // persist the profile into the inflight commit file so a partially failed write
  // can be rolled back later (timeline API names here are assumptions)
  hoodieTable.getActiveTimeline().saveToInflight(
      new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, commitTime),
      Optional.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
} catch (IOException io) {
  throw new HoodieCommitException("Failed to save workload profile for commit " + commitTime, io);
}
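// Hedged sketch of the WorkloadStat shape implied above: the entry.getValue().getKey()
// / .getValue() calls suggest updates are tracked per file id as a (prevCommit, count)
// pair, while inserts are a plain counter. Field, method, and getter names (including
// HoodieRecordLocation#getInstantTime/#getFileId) are assumptions of this sketch.
public class WorkloadStat implements Serializable {
  private long numInserts = 0L;
  // fileId -> (instant that last wrote the file, number of incoming updates)
  private final HashMap<String, Pair<String, Long>> updateLocationToCount = new HashMap<>();

  long addInserts(long inserts) {
    return this.numInserts += inserts;
  }

  long addUpdates(HoodieRecordLocation location, long numUpdates) {
    updateLocationToCount.put(location.getFileId(), Pair.of(location.getInstantTime(), numUpdates));
    return numUpdates;
  }

  public long getNumInserts() {
    return numInserts;
  }

  public HashMap<String, Pair<String, Long>> getUpdateLocationToCount() {
    return updateLocationToCount;
  }
}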
private UpsertPartitioner getUpsertPartitioner(int smallFileSize, int numInserts, int numUpdates, int fileSize,
    String testPartitionPath, boolean autoSplitInserts) throws Exception {
  HoodieWriteConfig config = makeHoodieClientConfigBuilder()
      .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(smallFileSize)
          .insertSplitSize(100).autoTuneInsertSplits(autoSplitInserts).build())
      .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build())
      .build();

  HoodieClientTestUtils.fakeCommitFile(basePath, "001");
  HoodieClientTestUtils.fakeDataFile(basePath, testPartitionPath, "001", "file1", fileSize);
  HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
  HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);

  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[]{testPartitionPath});
  List<HoodieRecord> insertRecords = dataGenerator.generateInserts("001", numInserts);
  List<HoodieRecord> updateRecords = dataGenerator.generateUpdates("001", numUpdates);
  // tag every update to the pre-created small file, so the partitioner sees them as updates
  for (HoodieRecord updateRec : updateRecords) {
    updateRec.setCurrentLocation(new HoodieRecordLocation("001", "file1"));
  }
  List<HoodieRecord> records = new ArrayList<>();
  records.addAll(insertRecords);
  records.addAll(updateRecords);

  WorkloadProfile profile = new WorkloadProfile(jsc.parallelize(records));
  HoodieCopyOnWriteTable.UpsertPartitioner partitioner =
      (HoodieCopyOnWriteTable.UpsertPartitioner) table.getUpsertPartitioner(profile);
  assertEquals("Update records should have gone to the single update partition", 0, partitioner.getPartition(
      new Tuple2<>(updateRecords.get(0).getKey(), Option.apply(updateRecords.get(0).getCurrentLocation()))));
  return partitioner;
}
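// Hypothetical usage of the helper above (thresholds and the partition path are
// illustrative, not taken from the original tests): a mixed insert/update workload
// against one existing small file should yield at least one bucket, with the update
// bucket already asserted inside getUpsertPartitioner itself.
@Test
public void testUpsertPartitionerProducesBuckets() throws Exception {
  UpsertPartitioner partitioner = getUpsertPartitioner(1000 * 1024, 100, 50, 100 * 1024, "2016/01/31", false);
  assertTrue("Expected at least one bucket for inserts and updates", partitioner.numPartitions() > 0);
}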