@Override
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, HoodieCompactionPlan compactionPlan,
    HoodieTable hoodieTable, HoodieWriteConfig config, String compactionInstantTime) throws IOException {
  if (compactionPlan == null || (compactionPlan.getOperations() == null)
      || (compactionPlan.getOperations().isEmpty())) {
    return jsc.emptyRDD();
  }
  HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
  // Compacting is very similar to applying updates to an existing file
  HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);
  List<CompactionOperation> operations = compactionPlan.getOperations().stream()
      .map(CompactionOperation::convertFromAvroRecordInstance).collect(toList());
  log.info("Compactor compacting " + operations + " files");
  return jsc.parallelize(operations, operations.size())
      .map(s -> compact(table, metaClient, config, s, compactionInstantTime))
      .flatMap(List::iterator);
}
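// A minimal, hedged sketch of driving the compact() entry point above and checking its
// output. The names "compactor", "compactionPlan", and "hoodieTable" are assumptions
// for illustration; they are not defined in these snippets.
JavaRDD<WriteStatus> statuses =
    compactor.compact(jsc, compactionPlan, hoodieTable, config, compactionInstantTime);
// Cache before running multiple actions, otherwise the compaction work is recomputed.
statuses.cache();
long errorCount = statuses.filter(WriteStatus::hasErrors).count();
if (errorCount > 0) {
  throw new HoodieException("Compaction produced errors in " + errorCount + " write statuses");
}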
public static <T extends HoodieRecordPayload> HoodieTable<T> getHoodieTable(
    HoodieTableMetaClient metaClient, HoodieWriteConfig config, JavaSparkContext jsc) {
  switch (metaClient.getTableType()) {
    case COPY_ON_WRITE:
      return new HoodieCopyOnWriteTable<>(config, jsc);
    case MERGE_ON_READ:
      return new HoodieMergeOnReadTable<>(config, jsc);
    default:
      throw new HoodieException("Unsupported table type :" + metaClient.getTableType());
  }
}
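// A brief usage sketch for the factory above, assuming it lives on HoodieTable as in
// the Hudi codebase; "basePath" is an assumed variable pointing at the dataset root.
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
HoodieTable<HoodieRecordPayload> hoodieTable = HoodieTable.getHoodieTable(metaClient, config, jsc);
// For a COPY_ON_WRITE dataset this returns a HoodieCopyOnWriteTable, matching the
// direct constructions used throughout the snippets below.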
@Test
public void testFileSizeUpsertRecords() throws Exception {
  HoodieWriteConfig config = makeHoodieClientConfigBuilder().withStorageConfig(
      HoodieStorageConfig.newBuilder().limitFileSize(64 * 1024).parquetBlockSize(64 * 1024)
          .parquetPageSize(64 * 1024).build()).build();
  String commitTime = HoodieTestUtils.makeNewCommitTime();
  HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
  HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);

  List<HoodieRecord> records = new ArrayList<>();
  // Approx 1150 records are written for block size of 64KB
  for (int i = 0; i < 2000; i++) {
    String recordStr = "{\"_row_key\":\"" + UUID.randomUUID().toString()
        + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}";
    TestRawTripPayload rowChange = new TestRawTripPayload(recordStr);
    records.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange));
  }

  // Insert new records
  HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));

  // Check the updated file
  int counts = 0;
  for (File file : new File(basePath + "/2016/01/31").listFiles()) {
    if (file.getName().endsWith(".parquet") && FSUtils.getCommitTime(file.getName()).equals(commitTime)) {
      System.out.println(file.getName() + "-" + file.length());
      counts++;
    }
  }
  assertEquals("If the number of records is more than 1150, then there should be a new file", 3, counts);
}
@Test
public void testInsertWithPartialFailures() throws Exception {
  HoodieWriteConfig config = makeHoodieClientConfig();
  String commitTime = HoodieTestUtils.makeNewCommitTime();
  FileSystem fs = FSUtils.getFs(basePath, jsc.hadoopConfiguration());
  HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
  HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);

  // Write a few records, and get at least one file.
  // 10 records for partition 1, 1 record for partition 2.
  List<HoodieRecord> records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z");
  records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));

  // Simulate a crash after the first file
  List<WriteStatus> statuses = HoodieClientTestUtils
      .collectStatuses(table.handleInsert(commitTime, records.iterator()));
  WriteStatus status = statuses.get(0);
  Path partialFile = new Path(String.format("%s/%s/%s", basePath, status.getPartitionPath(),
      FSUtils.makeDataFileName(commitTime, 0, status.getFileId())));
  assertTrue(fs.exists(partialFile));

  // When we retry
  records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z");
  records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));
  statuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));
  status = statuses.get(0);
  Path retriedFile = new Path(String.format("%s/%s/%s", basePath, status.getPartitionPath(),
      FSUtils.makeDataFileName(commitTime, 0, status.getFileId())));
  assertTrue(fs.exists(retriedFile));
  assertFalse(fs.exists(partialFile));
}
String commitTime = HoodieTestUtils.makeNewCommitTime();
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);
System.out.println(fileId);
table = new HoodieCopyOnWriteTable(config, jsc);
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);
String newCommitTime = HoodieTestUtils.makeNewCommitTime();
metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
table = new HoodieCopyOnWriteTable(config, jsc);
Iterator<List<WriteStatus>> iter = table
    .handleUpdate(newCommitTime, updatedRecord1.getCurrentLocation().getFileId(),
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);
private UpsertPartitioner getUpsertPartitioner(int smallFileSize, int numInserts, int numUpdates,
    int fileSize, String testPartitionPath, boolean autoSplitInserts) throws Exception {
  HoodieWriteConfig config = makeHoodieClientConfigBuilder().withCompactionConfig(
      HoodieCompactionConfig.newBuilder().compactionSmallFileSize(smallFileSize).insertSplitSize(100)
          .autoTuneInsertSplits(autoSplitInserts).build()).withStorageConfig(
      HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build()).build();

  HoodieClientTestUtils.fakeCommitFile(basePath, "001");
  HoodieClientTestUtils.fakeDataFile(basePath, testPartitionPath, "001", "file1", fileSize);
  HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
  HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);

  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[]{testPartitionPath});
  List<HoodieRecord> insertRecords = dataGenerator.generateInserts("001", numInserts);
  List<HoodieRecord> updateRecords = dataGenerator.generateUpdates("001", numUpdates);
  for (HoodieRecord updateRec : updateRecords) {
    updateRec.setCurrentLocation(new HoodieRecordLocation("001", "file1"));
  }

  List<HoodieRecord> records = new ArrayList<>();
  records.addAll(insertRecords);
  records.addAll(updateRecords);
  WorkloadProfile profile = new WorkloadProfile(jsc.parallelize(records));
  HoodieCopyOnWriteTable.UpsertPartitioner partitioner =
      (HoodieCopyOnWriteTable.UpsertPartitioner) table.getUpsertPartitioner(profile);
  assertEquals("Update record should have gone to the 1 update partition", 0, partitioner.getPartition(
      new Tuple2<>(updateRecords.get(0).getKey(), Option.apply(updateRecords.get(0).getCurrentLocation()))));
  return partitioner;
}
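// A hedged example of invoking the helper above; the argument values are assumptions
// chosen for illustration. With a 1000KB small-file threshold and an existing 800KB
// file, "file1" counts as a small file, so inserts should be bin-packed into the same
// bucket as the updates rather than opening a fresh file group.
UpsertPartitioner partitioner =
    getUpsertPartitioner(1000 * 1024, 400, 100, 800 * 1024, "2016/01/31", false);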