/**
 * Build Avro generated Compaction operation payload from compaction operation POJO for serialization
 */
public static HoodieCompactionOperation buildHoodieCompactionOperation(CompactionOperation op) {
  return HoodieCompactionOperation.newBuilder().setFileId(op.getFileId())
      .setBaseInstantTime(op.getBaseInstantTime())
      .setPartitionPath(op.getPartitionPath())
      .setDataFilePath(op.getDataFilePath().isPresent() ? op.getDataFilePath().get() : null)
      .setDeltaFilePaths(op.getDeltaFilePaths())
      .setMetrics(op.getMetrics()).build();
}
/**
 * Build Compaction operation payload from Avro version for use in Spark executors
 *
 * @param hc HoodieCompactionOperation
 * @return Compaction operation POJO
 */
public static CompactionOperation buildCompactionOperation(HoodieCompactionOperation hc) {
  return CompactionOperation.convertFromAvroRecordInstance(hc);
}
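/*
 * A minimal round-trip sketch for the two converters above: build an Avro payload, convert it to
 * the POJO used on Spark executors, and convert it back for serialization. The instant time,
 * file-id, partition path and log-file name below are illustrative values only.
 */
HoodieCompactionOperation avroOp = HoodieCompactionOperation.newBuilder()
    .setFileId("fileid-000")                                    // hypothetical file-id
    .setBaseInstantTime("001")                                  // hypothetical base instant
    .setPartitionPath("2018/03/21")                             // hypothetical partition
    .setDataFilePath(null)                                      // no base file yet for this file group
    .setDeltaFilePaths(Arrays.asList(".fileid-000_001.log.1"))  // hypothetical delta (log) file
    .setMetrics(new HashMap<>())
    .build();
CompactionOperation pojo = CompactionUtils.buildCompactionOperation(avroOp);
HoodieCompactionOperation roundTripped = CompactionUtils.buildHoodieCompactionOperation(pojo);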
if (lastInstant.isPresent()) {
  java.util.Optional<FileSlice> fileSliceOptional =
      fileSystemView.getLatestUnCompactedFileSlices(operation.getPartitionPath())
          .filter(fs -> fs.getFileId().equals(operation.getFileId())).findFirst();
  if (fileSliceOptional.isPresent()) {
    FileSlice fs = fileSliceOptional.get();
    java.util.Optional<HoodieDataFile> df = fs.getDataFile();
    if (operation.getDataFilePath().isPresent()) {
      String expPath = metaClient.getFs().getFileStatus(new Path(operation.getDataFilePath().get())).getPath()
          .toString();
      // The base file referenced by the compaction operation must be present in the latest un-compacted slice
      Preconditions.checkArgument(df.isPresent(), "Data File must be present. File Slice was : " + fs);
    }
    // Every delta (log) file referenced by the compaction operation must be resolvable on the file system
    Set<HoodieLogFile> logFilesInCompactionOp = operation.getDeltaFilePaths().stream()
        .map(dp -> {
          try {
            // ... resolve each delta file path against the file system ...
          }
        })
    // ... validation of log files not covered by the plan, ending with:
            + "nor present after compaction request instant. Some of these :" + diff);
  } else {
    throw new CompactionValidationException("Unable to find file-slice for file-id (" + operation.getFileId()
        + "). Compaction operation is invalid.");
  }
List<Comparable[]> rows = new ArrayList<>();
res.stream().forEach(r -> {
  Comparable[] row = new Comparable[]{r.getOperation().getFileId(), r.getOperation().getBaseInstantTime(),
      r.getOperation().getDataFilePath().isPresent() ? r.getOperation().getDataFilePath().get() : "",
      r.getOperation().getDeltaFilePaths().size(), r.isSuccess(),
      r.getException().isPresent() ? r.getException().get().getMessage() : ""};
  rows.add(row);
});
// Latest merged file slice for the file-id, as of the last instant on the timeline
FileSlice merged =
    fileSystemView.getLatestMergedFileSlicesBeforeOrOn(operation.getPartitionPath(), lastInstant.getTimestamp())
        .filter(fs -> fs.getFileId().equals(operation.getFileId())).findFirst().get();
// Log files written against the (to be unscheduled) compaction instant; these need to be renamed back
List<HoodieLogFile> logFilesToRepair =
    merged.getLogFiles().filter(lf -> lf.getBaseCommitTime().equals(compactionInstant))
        .collect(Collectors.toList());
// File slice that was chosen as the base for the pending compaction
FileSlice fileSliceForCompaction =
    fileSystemView.getLatestFileSlicesBeforeOrOn(operation.getPartitionPath(), operation.getBaseInstantTime())
        .filter(fs -> fs.getFileId().equals(operation.getFileId())).findFirst().get();
// Highest log version already used in that slice (assumed default when the slice has no log files yet)
int maxUsedVersion = fileSliceForCompaction.getLogFiles().findFirst().map(HoodieLogFile::getLogVersion)
    .orElse(HoodieLogFile.LOGFILE_BASE_VERSION - 1);
for (HoodieLogFile toRepair : logFilesToRepair) {
  int version = maxUsedVersion + 1;
  HoodieLogFile newLf = new HoodieLogFile(new Path(parentPath, FSUtils.makeLogFileName(operation.getFileId(),
      logExtn, operation.getBaseInstantTime(), version)));
  result.add(Pair.of(toRepair, newLf));
  maxUsedVersion = version;
}
// Last instant on the commits + compaction timeline
HoodieInstant lastInstant = metaClient.getCommitsAndCompactionTimeline().lastInstant().get();
// Latest merged file slice for the file-id, as of that instant
FileSlice merged =
    fileSystemView.getLatestMergedFileSlicesBeforeOrOn(op.getPartitionPath(), lastInstant.getTimestamp())
        .filter(fs -> fs.getFileId().equals(op.getFileId())).findFirst().get();
// Highest log version named in the compaction plan for this file-id
final int maxVersion = op.getDeltaFilePaths().stream().map(lf -> FSUtils.getFileVersionFromLog(new Path(lf)))
    .reduce((x, y) -> x > y ? x : y).orElse(0);
// Log files beyond that version were written after the compaction request and need to be moved (renamed)
List<HoodieLogFile> logFilesToBeMoved =
    merged.getLogFiles().filter(lf -> lf.getLogVersion() > maxVersion).collect(Collectors.toList());
          return new CompactionOperation(dataFile, partitionPath, logFiles,
              config.getCompactionStrategy().captureMetrics(config, dataFile, partitionPath, logFiles));
        })
        .filter(c -> !c.getDeltaFilePaths().isEmpty())
        .collect(toList()).iterator()).collect().stream()
        .map(CompactionUtils::buildHoodieCompactionOperation)
        .collect(toList());
// Log files in the slice written by the compaction instant; these are expected to be renamed back
Set<HoodieLogFile> expLogFilesToBeRenamed = fsView.getLatestFileSlices(HoodieTestUtils.DEFAULT_PARTITION_PATHS[0])
    .filter(fs -> fs.getBaseInstantTime().equals(compactionInstant))
    .filter(fs -> fs.getFileId().equals(op.getFileId()))
    .flatMap(fs -> fs.getLogFiles())
    .collect(Collectors.toSet());
// Log-file counts per file-id in the merged slice before un-scheduling
fsView.getLatestMergedFileSlicesBeforeOrOn(HoodieTestUtils.DEFAULT_PARTITION_PATHS[0], compactionInstant)
    .filter(fs -> fs.getBaseInstantTime().equals(ingestionInstant))
    .filter(fs -> fs.getFileId().equals(op.getFileId()))
    .map(fs -> Pair.of(fs.getFileId(), fs.getLogFiles().count()))
    .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
// Un-schedule the file-id from pending compaction
client.unscheduleCompactionFileId(op.getFileId(), false, false);
// After un-scheduling, the slice at the compaction instant must hold neither a data file nor log files
    .filter(fs -> fs.getFileId().equals(op.getFileId())).forEach(fs -> {
      Assert.assertFalse("No Data file must be present", fs.getDataFile().isPresent());
      Assert.assertTrue("No Log Files", fs.getLogFiles().count() == 0);
    });
// Log-file counts per file-id as seen by a fresh view after un-scheduling
newFsView.getAllFileGroups(HoodieTestUtils.DEFAULT_PARTITION_PATHS[0]).flatMap(fg -> fg.getAllFileSlices())
    .filter(fs -> fs.getBaseInstantTime().equals(ingestionInstant))
    .filter(fs -> fs.getFileId().equals(op.getFileId()))
    .map(fs -> Pair.of(fs.getFileId(), fs.getLogFiles().count()))
    .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
/**
 * Determine whether a file slice needs to be preserved for a pending compaction.
 *
 * @param fileSlice File Slice
 * @return true if the file slice needs to be preserved, false otherwise.
 */
private boolean isFileSliceNeededForPendingCompaction(FileSlice fileSlice) {
  CompactionOperation op = fileIdToPendingCompactionOperations.get(fileSlice.getFileId());
  if (null != op) {
    // If the file slice's instant time is newer than or equal to that of the pending operation, do not clean it
    return HoodieTimeline.compareTimestamps(fileSlice.getBaseInstantTime(), op.getBaseInstantTime(),
        HoodieTimeline.GREATER_OR_EQUAL);
  }
  return false;
}
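/*
 * Illustrative sketch of the comparison driving the decision above, assuming a pending compaction
 * whose operation base instant is "003" (instant values are hypothetical). A slice created at or
 * after the compaction request compares GREATER_OR_EQUAL and is kept; an older slice does not,
 * and is therefore eligible for cleaning.
 */
boolean keepSlice = HoodieTimeline.compareTimestamps("003", "003", HoodieTimeline.GREATER_OR_EQUAL);  // true -> preserve
boolean cleanable = !HoodieTimeline.compareTimestamps("002", "003", HoodieTimeline.GREATER_OR_EQUAL); // true -> may clean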
/**
 * Convert Avro generated Compaction operation to POJO for Spark RDD operation
 *
 * @param operation Hoodie Compaction Operation
 * @return Compaction operation POJO
 */
public static CompactionOperation convertFromAvroRecordInstance(HoodieCompactionOperation operation) {
  CompactionOperation op = new CompactionOperation();
  op.baseInstantTime = operation.getBaseInstantTime();
  op.dataFilePath = Optional.fromNullable(operation.getDataFilePath());
  op.deltaFilePaths = new ArrayList<>(operation.getDeltaFilePaths());
  op.fileId = operation.getFileId();
  op.metrics = operation.getMetrics() == null ? new HashMap<>() : new HashMap<>(operation.getMetrics());
  op.partitionPath = operation.getPartitionPath();
  return op;
}
// Reader schema: table schema with the Hoodie metadata fields added
Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
log.info("Compacting base " + operation.getDataFilePath() + " with delta files " + operation.getDeltaFilePaths()
    + " for commit " + commitTime);
log.info("MaxMemoryPerCompaction => " + config.getMaxMemoryPerCompaction());
// Merge all delta (log) files for this file group into a spillable record map
HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, metaClient.getBasePath(),
    operation.getDeltaFilePaths(), readerSchema, maxInstantTime, config.getMaxMemoryPerCompaction(),
    config.getCompactionLazyBlockReadEnabled(), config.getCompactionReverseLogReadEnabled(),
    config.getMaxDFSStreamBufferSize(), config.getSpillableMapBasePath());
// Locate the current base (data) file, if any, for the file group being compacted
Optional<HoodieDataFile> oldDataFileOpt = ...
    .getLatestDataFilesOn(operation.getPartitionPath(), operation.getBaseInstantTime())
    .filter(df -> df.getFileId().equals(operation.getFileId())).findFirst();
if (operation.getDataFilePath().isPresent()) {
  result = hoodieCopyOnWriteTable
      .handleUpdate(commitTime, operation.getFileId(), scanner.getRecords(), oldDataFileOpt);
} else {
  result = hoodieCopyOnWriteTable
      .handleInsert(commitTime, operation.getPartitionPath(), operation.getFileId(), scanner.iterator());
}
s.getStat().setTotalLogFilesCompacted(scanner.getTotalLogFiles());
s.getStat().setTotalLogRecords(scanner.getTotalLogRecords());
s.getStat().setPartitionPath(operation.getPartitionPath());
s.getStat().setTotalLogSizeCompacted(operation.getMetrics().get(CompactionStrategy.TOTAL_LOG_FILE_SIZE).longValue());
s.getStat().setTotalLogBlocks(scanner.getTotalLogBlocks());
/**
 * Create a file system view, as of the given timeline
 */
public HoodieTableFileSystemView(HoodieTableMetaClient metaClient, HoodieTimeline visibleActiveTimeline) {
  this.metaClient = metaClient;
  this.visibleActiveTimeline = visibleActiveTimeline;
  this.fileGroupMap = new HashMap<>();
  this.partitionToFileGroupsMap = new HashMap<>();
  // Build fileId to Pending Compaction Instants
  List<HoodieInstant> pendingCompactionInstants =
      metaClient.getActiveTimeline().filterPendingCompactionTimeline().getInstants().collect(Collectors.toList());
  this.fileIdToPendingCompaction = ImmutableMap.copyOf(
      CompactionUtils.getAllPendingCompactionOperations(metaClient).entrySet().stream()
          .map(entry -> Pair.of(entry.getKey(),
              Pair.of(entry.getValue().getKey(),
                  CompactionOperation.convertFromAvroRecordInstance(entry.getValue().getValue()))))
          .collect(Collectors.toMap(Pair::getKey, Pair::getValue)));
}
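/*
 * A hedged construction sketch for the view above: the commits + compaction timeline is passed in
 * as the visible timeline, while pending compactions are looked up separately from the active
 * timeline inside the constructor. The meta-client constructor arguments (a Hadoop Configuration
 * named hadoopConf and the base path) are assumptions for illustration.
 */
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, "/tmp/hoodie/sample-table");
HoodieTableFileSystemView fsView =
    new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());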
/**
 * Generate renaming actions for unscheduling a fileId from pending compaction. NOTE: Can only be used safely when
 * no writer (ingestion/compaction) is running.
 *
 * @param metaClient Hoodie Table MetaClient
 * @param fileId FileId to remove compaction
 * @param fsViewOpt Cached File System View
 * @param skipValidation Skip Validation
 * @return list of (old, new) log-file pairs; each rename must be performed to successfully unschedule compaction.
 */
public List<Pair<HoodieLogFile, HoodieLogFile>> getRenamingActionsForUnschedulingCompactionForFileId(
    HoodieTableMetaClient metaClient, String fileId, Optional<HoodieTableFileSystemView> fsViewOpt,
    boolean skipValidation) throws IOException {
  Map<String, Pair<String, HoodieCompactionOperation>> allPendingCompactions =
      CompactionUtils.getAllPendingCompactionOperations(metaClient);
  if (allPendingCompactions.containsKey(fileId)) {
    Pair<String, HoodieCompactionOperation> opWithInstant = allPendingCompactions.get(fileId);
    return getRenamingActionsForUnschedulingCompactionOperation(metaClient, opWithInstant.getKey(),
        CompactionOperation.convertFromAvroRecordInstance(opWithInstant.getValue()), fsViewOpt, skipValidation);
  }
  throw new HoodieException("FileId " + fileId + " not in pending compaction");
}
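/*
 * Hedged usage sketch for the API above: compute the rename plan needed to unschedule one file-id,
 * then inspect the (old, new) log-file pairs. The file-id value is illustrative, and Optional.empty()
 * assumes the signature's Optional is java.util.Optional (use Optional.absent() if it is Guava's).
 */
List<Pair<HoodieLogFile, HoodieLogFile>> renames =
    getRenamingActionsForUnschedulingCompactionForFileId(metaClient, "fileid-000", Optional.empty(), false);
for (Pair<HoodieLogFile, HoodieLogFile> rename : renames) {
  log.info("Need to rename " + rename.getLeft().getPath() + " to " + rename.getRight().getPath());
}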
@Test
public void testUnscheduleCompactionFileId() throws Exception {
  int numEntriesPerInstant = 10;
  CompactionTestUtils.setupAndValidateCompactionOperations(metaClient, false, numEntriesPerInstant,
      numEntriesPerInstant, numEntriesPerInstant, numEntriesPerInstant);
  Map<String, CompactionOperation> instantsWithOp =
      Arrays.asList("001", "003", "005", "007").stream().map(instant -> {
        try {
          return Pair.of(instant, CompactionUtils.getCompactionPlan(metaClient, instant));
        } catch (IOException ioe) {
          throw new HoodieException(ioe);
        }
      }).map(instantWithPlan -> instantWithPlan.getRight().getOperations().stream()
          .map(op -> Pair.of(instantWithPlan.getLeft(), CompactionOperation.convertFromAvroRecordInstance(op)))
          .findFirst().get())
      .collect(Collectors.toMap(Pair::getLeft, Pair::getRight));
  // There are delta-commits after compaction instant
  validateUnScheduleFileId(client, "000", "001", instantsWithOp.get("001"), 2);
  // There are delta-commits after compaction instant
  validateUnScheduleFileId(client, "002", "003", instantsWithOp.get("003"), 2);
  // There are no delta-commits after compaction instant
  validateUnScheduleFileId(client, "004", "005", instantsWithOp.get("005"), 0);
  // There are no delta-commits after compaction instant
  validateUnScheduleFileId(client, "006", "007", instantsWithOp.get("007"), 0);
}