/** * Returns true if the file-group is under pending-compaction and the file-slice' baseInstant matches * compaction Instant * @param fileSlice File Slice * @return */ private boolean isFileSliceAfterPendingCompaction(FileSlice fileSlice) { Pair<String, CompactionOperation> compactionWithInstantTime = fileIdToPendingCompaction.get(fileSlice.getFileId()); return (null != compactionWithInstantTime) && fileSlice.getBaseInstantTime().equals(compactionWithInstantTime.getKey()); }
/** * If the file-slice is because of pending compaction instant, this method merges the file-slice with the one before * the compaction instant time * @param fileGroup File Group for which the file slice belongs to * @param fileSlice File Slice which needs to be merged * @return */ private FileSlice getMergedFileSlice(HoodieFileGroup fileGroup, FileSlice fileSlice) { // if the file-group is under construction, pick the latest before compaction instant time. if (fileIdToPendingCompaction.containsKey(fileSlice.getFileId())) { String compactionInstantTime = fileIdToPendingCompaction.get(fileSlice.getFileId()).getKey(); if (fileSlice.getBaseInstantTime().equals(compactionInstantTime)) { Optional<FileSlice> prevFileSlice = fileGroup.getLatestFileSliceBefore(compactionInstantTime); if (prevFileSlice.isPresent()) { return mergeCompactionPendingFileSlices(fileSlice, prevFileSlice.get()); } } } return fileSlice; }
@Override public Stream<FileSlice> getLatestUnCompactedFileSlices(String partitionPath) { return getAllFileGroups(partitionPath) .map(fileGroup -> { FileSlice fileSlice = fileGroup.getLatestFileSlice().get(); // if the file-group is under compaction, pick the latest before compaction instant time. if (isFileSliceAfterPendingCompaction(fileSlice)) { String compactionInstantTime = fileIdToPendingCompaction.get(fileSlice.getFileId()).getLeft(); return fileGroup.getLatestFileSliceBefore(compactionInstantTime); } return Optional.of(fileSlice); }) .map(Optional::get); }
/** * Determine if file slice needed to be preserved for pending compaction * @param fileSlice File Slice * @return true if file slice needs to be preserved, false otherwise. */ private boolean isFileSliceNeededForPendingCompaction(FileSlice fileSlice) { CompactionOperation op = fileIdToPendingCompactionOperations.get(fileSlice.getFileId()); if (null != op) { // If file slice's instant time is newer or same as that of operation, do not clean return HoodieTimeline.compareTimestamps(fileSlice.getBaseInstantTime(), op.getBaseInstantTime(), HoodieTimeline.GREATER_OR_EQUAL); } return false; } }
/** * Determine if file slice needed to be preserved for pending compaction * @param fileSlice File Slice * @return true if file slice needs to be preserved, false otherwise. */ private boolean isFileSliceNeededForPendingCompaction(FileSlice fileSlice) { CompactionOperation op = fileIdToPendingCompactionOperations.get(fileSlice.getFileId()); if (null != op) { // If file slice's instant time is newer or same as that of operation, do not clean return HoodieTimeline.compareTimestamps(fileSlice.getBaseInstantTime(), op.getBaseInstantTime(), HoodieTimeline.GREATER_OR_EQUAL); } return false; } }
/** * With async compaction, it is possible to see partial/complete data-files due to inflight-compactions, * Ignore those data-files * @param fileSlice File Slice * @return */ private FileSlice filterDataFileAfterPendingCompaction(FileSlice fileSlice) { if (isFileSliceAfterPendingCompaction(fileSlice)) { // Data file is filtered out of the file-slice as the corresponding compaction // instant not completed yet. FileSlice transformed = new FileSlice(fileSlice.getBaseInstantTime(), fileSlice.getFileId()); fileSlice.getLogFiles().forEach(transformed::addLogFile); return transformed; } return fileSlice; }
/** * Helper to merge last 2 file-slices. These 2 file-slices do not have compaction done yet. * * @param lastSlice Latest File slice for a file-group * @param penultimateSlice Penultimate file slice for a file-group in commit timeline order */ private static FileSlice mergeCompactionPendingFileSlices(FileSlice lastSlice, FileSlice penultimateSlice) { FileSlice merged = new FileSlice(penultimateSlice.getBaseInstantTime(), penultimateSlice.getFileId()); if (penultimateSlice.getDataFile().isPresent()) { merged.setDataFile(penultimateSlice.getDataFile().get()); } // Add Log files from penultimate and last slices penultimateSlice.getLogFiles().forEach(merged::addLogFile); lastSlice.getLogFiles().forEach(merged::addLogFile); return merged; }
FileSlice merged = fileSystemView.getLatestMergedFileSlicesBeforeOrOn(op.getPartitionPath(), lastInstant.getTimestamp()) .filter(fs -> fs.getFileId().equals(op.getFileId())).findFirst().get(); final int maxVersion = op.getDeltaFilePaths().stream().map(lf -> FSUtils.getFileVersionFromLog(new Path(lf)))
FileSlice merged = fileSystemView.getLatestMergedFileSlicesBeforeOrOn(operation.getPartitionPath(), lastInstant.getTimestamp()) .filter(fs -> fs.getFileId().equals(operation.getFileId())).findFirst().get(); List<HoodieLogFile> logFilesToRepair = merged.getLogFiles().filter(lf -> lf.getBaseCommitTime().equals(compactionInstant)) FileSlice fileSliceForCompaction = fileSystemView.getLatestFileSlicesBeforeOrOn(operation.getPartitionPath(), operation.getBaseInstantTime()) .filter(fs -> fs.getFileId().equals(operation.getFileId())).findFirst().get(); int maxUsedVersion = fileSliceForCompaction.getLogFiles().findFirst().map(HoodieLogFile::getLogVersion)
/** * Generate compaction operation from file-slice * * @param partitionPath Partition path * @param fileSlice File Slice * @param metricsCaptureFunction Metrics Capture function * @return Compaction Operation */ public static HoodieCompactionOperation buildFromFileSlice(String partitionPath, FileSlice fileSlice, Optional<Function<Pair<String, FileSlice>, Map<String, Double>>> metricsCaptureFunction) { HoodieCompactionOperation.Builder builder = HoodieCompactionOperation.newBuilder(); builder.setPartitionPath(partitionPath); builder.setFileId(fileSlice.getFileId()); builder.setBaseInstantTime(fileSlice.getBaseInstantTime()); builder.setDeltaFilePaths(fileSlice.getLogFiles().map(lf -> lf.getPath().toString()).collect(Collectors.toList())); if (fileSlice.getDataFile().isPresent()) { builder.setDataFilePath(fileSlice.getDataFile().get().getPath()); } if (metricsCaptureFunction.isPresent()) { builder.setMetrics(metricsCaptureFunction.get().apply(Pair.of(partitionPath, fileSlice))); } return builder.build(); }
.flatMap((FlatMapFunction<String, CompactionOperation>) partitionPath -> fileSystemView .getLatestFileSlices(partitionPath) .filter(slice -> !fileIdsWithPendingCompactions.contains(slice.getFileId())) .map( s -> {
.filter(fileSlice1 -> fileSlice1.getFileId().equals(fileId)).findFirst();
.filter(fileSlice1 -> fileSlice1.getFileId().equals(fileId)).findFirst();
assertEquals(3, slices.size()); for (FileSlice slice : slices) { if (slice.getFileId().equals(fileId1)) { assertEquals(slice.getBaseInstantTime(), commitTime3); assertTrue(slice.getDataFile().isPresent()); assertEquals(slice.getLogFiles().count(), 0); } else if (slice.getFileId().equals(fileId2)) { assertEquals(slice.getBaseInstantTime(), commitTime4); assertFalse(slice.getDataFile().isPresent()); assertEquals(slice.getLogFiles().count(), 1); } else if (slice.getFileId().equals(fileId3)) { assertEquals(slice.getBaseInstantTime(), commitTime4); assertTrue(slice.getDataFile().isPresent());
/** * Validates if generated compaction operation matches with input file slice and partition path * * @param slice File Slice * @param op HoodieCompactionOperation * @param expPartitionPath Partition path */ private void testFileSliceCompactionOpEquality(FileSlice slice, HoodieCompactionOperation op, String expPartitionPath) { Assert.assertEquals("Partition path is correct", expPartitionPath, op.getPartitionPath()); Assert.assertEquals("Same base-instant", slice.getBaseInstantTime(), op.getBaseInstantTime()); Assert.assertEquals("Same file-id", slice.getFileId(), op.getFileId()); if (slice.getDataFile().isPresent()) { Assert.assertEquals("Same data-file", slice.getDataFile().get().getPath(), op.getDataFilePath()); } List<String> paths = slice.getLogFiles().map(l -> l.getPath().toString()).collect(Collectors.toList()); IntStream.range(0, paths.size()).boxed().forEach(idx -> { Assert.assertEquals("Log File Index " + idx, paths.get(idx), op.getDeltaFilePaths().get(idx)); }); Assert.assertEquals("Metrics set", metrics, op.getMetrics()); } }
Set<HoodieLogFile> expLogFilesToBeRenamed = fsView.getLatestFileSlices(HoodieTestUtils.DEFAULT_PARTITION_PATHS[0]) .filter(fs -> fs.getBaseInstantTime().equals(compactionInstant)) .filter(fs -> fs.getFileId().equals(op.getFileId())) .flatMap(fs -> fs.getLogFiles()) .collect(Collectors.toSet()); fsView.getLatestMergedFileSlicesBeforeOrOn(HoodieTestUtils.DEFAULT_PARTITION_PATHS[0], compactionInstant) .filter(fs -> fs.getBaseInstantTime().equals(ingestionInstant)) .filter(fs -> fs.getFileId().equals(op.getFileId())) .map(fs -> Pair.of(fs.getFileId(), fs.getLogFiles().count())) .collect(Collectors.toMap(Pair::getKey, Pair::getValue)); .filter(fs -> fs.getFileId().equals(op.getFileId())).forEach(fs -> { Assert.assertFalse("No Data file must be present", fs.getDataFile().isPresent()); Assert.assertTrue("No Log Files", fs.getLogFiles().count() == 0); newFsView.getAllFileGroups(HoodieTestUtils.DEFAULT_PARTITION_PATHS[0]).flatMap(fg -> fg.getAllFileSlices()) .filter(fs -> fs.getBaseInstantTime().equals(ingestionInstant)) .filter(fs -> fs.getFileId().equals(op.getFileId())) .map(fs -> Pair.of(fs.getFileId(), fs.getLogFiles().count())) .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
private void validateRenameFiles(List<Pair<HoodieLogFile, HoodieLogFile>> renameFiles, String ingestionInstant, String compactionInstant, HoodieTableFileSystemView fsView) { // Ensure new names of log-files are on expected lines Set<HoodieLogFile> uniqNewLogFiles = new HashSet<>(); Set<HoodieLogFile> uniqOldLogFiles = new HashSet<>(); renameFiles.stream().forEach(lfPair -> { Assert.assertFalse("Old Log File Names do not collide", uniqOldLogFiles.contains(lfPair.getKey())); Assert.assertFalse("New Log File Names do not collide", uniqNewLogFiles.contains(lfPair.getValue())); uniqOldLogFiles.add(lfPair.getKey()); uniqNewLogFiles.add(lfPair.getValue()); }); renameFiles.stream().forEach(lfPair -> { HoodieLogFile oldLogFile = lfPair.getLeft(); HoodieLogFile newLogFile = lfPair.getValue(); Assert.assertEquals("Base Commit time is expected", ingestionInstant, newLogFile.getBaseCommitTime()); Assert.assertEquals("Base Commit time is expected", compactionInstant, oldLogFile.getBaseCommitTime()); Assert.assertEquals("File Id is expected", oldLogFile.getFileId(), newLogFile.getFileId()); HoodieLogFile lastLogFileBeforeCompaction = fsView.getLatestMergedFileSlicesBeforeOrOn(HoodieTestUtils.DEFAULT_PARTITION_PATHS[0], ingestionInstant) .filter(fs -> fs.getFileId().equals(oldLogFile.getFileId())) .map(fs -> fs.getLogFiles().findFirst().get()).findFirst().get(); Assert.assertEquals("Log Version expected", lastLogFileBeforeCompaction.getLogVersion() + oldLogFile.getLogVersion(), newLogFile.getLogVersion()); Assert.assertTrue("Log version does not collide", newLogFile.getLogVersion() > lastLogFileBeforeCompaction.getLogVersion()); }); }
/** * HELPER METHODS FOR TESTING **/ private void validateDeltaCommit(String latestDeltaCommit, final Map<String, Pair<String, HoodieCompactionOperation>> fileIdToCompactionOperation, HoodieWriteConfig cfg) throws IOException { HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath()); HoodieTable table = HoodieTable.getHoodieTable(metaClient, cfg, jsc); List<FileSlice> fileSliceList = getCurrentLatestFileSlices(table, cfg); fileSliceList.forEach(fileSlice -> { Pair<String, HoodieCompactionOperation> opPair = fileIdToCompactionOperation.get(fileSlice.getFileId()); if (opPair != null) { System.out.println("FileSlice :" + fileSlice); assertTrue("Expect baseInstant to match compaction Instant", fileSlice.getBaseInstantTime().equals(opPair.getKey())); assertTrue("Expect atleast one log file to be present where the latest delta commit was written", fileSlice.getLogFiles().count() > 0); assertFalse("Expect no data-file to be present", fileSlice.getDataFile().isPresent()); } else { assertTrue("Expect baseInstant to be less than or equal to latestDeltaCommit", fileSlice.getBaseInstantTime().compareTo(latestDeltaCommit) <= 0); } }); }
assertEquals(1, fileSliceList.size()); FileSlice fileSlice = fileSliceList.get(0); assertEquals("File-Id must be set correctly", fileId, fileSlice.getFileId()); assertFalse("Data file for base instant must be present", fileSlice.getDataFile().isPresent()); assertEquals("Base Instant for file-group set correctly", instantTime1, fileSlice.getBaseInstantTime()); assertEquals(1, fileSliceList.size()); fileSlice = fileSliceList.get(0); assertEquals("File-Id must be set correctly", fileId, fileSlice.getFileId()); assertFalse("Data file for base instant must be present", fileSlice.getDataFile().isPresent()); assertEquals("Base Instant for file-group set correctly", instantTime1, fileSlice.getBaseInstantTime()); assertEquals(1, fileSliceList.size()); fileSlice = fileSliceList.get(0); assertEquals("File-Id must be set correctly", fileId, fileSlice.getFileId()); assertFalse("Data file for base instant must be present", fileSlice.getDataFile().isPresent()); assertEquals("Base Instant for file-group set correctly", instantTime1, fileSlice.getBaseInstantTime());
fsView.getLatestMergedFileSlicesBeforeOrOn(HoodieTestUtils.DEFAULT_PARTITION_PATHS[0], compactionInstant) .filter(fs -> fs.getBaseInstantTime().equals(ingestionInstant)) .map(fs -> Pair.of(fs.getFileId(), fs.getLogFiles().count())) .collect(Collectors.toMap(Pair::getKey, Pair::getValue)); newFsView.getAllFileGroups(HoodieTestUtils.DEFAULT_PARTITION_PATHS[0]).flatMap(fg -> fg.getAllFileSlices()) .filter(fs -> fs.getBaseInstantTime().equals(ingestionInstant)) .map(fs -> Pair.of(fs.getFileId(), fs.getLogFiles().count())) .collect(Collectors.toMap(Pair::getKey, Pair::getValue));