public static Comparator<HoodieDataFile> getCommitTimeComparator() {
  return (o1, o2) -> {
    // reverse the order
    return o2.getCommitTime().compareTo(o1.getCommitTime());
  };
}
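// A minimal usage sketch of the reversed comparator above. FileVersion is a
// hypothetical stand-in for HoodieDataFile (only getCommitTime() is assumed);
// this illustrates the ordering, it is not part of the codebase.
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

public class CommitTimeOrderingSketch {
  // Hypothetical stand-in exposing only the commit time.
  static class FileVersion {
    private final String commitTime;
    FileVersion(String commitTime) { this.commitTime = commitTime; }
    String getCommitTime() { return commitTime; }
  }

  public static void main(String[] args) {
    List<FileVersion> files = new ArrayList<>();
    files.add(new FileVersion("001"));
    files.add(new FileVersion("003"));
    files.add(new FileVersion("002"));
    // Same shape as getCommitTimeComparator(): comparing o2 against o1 reverses
    // the natural ascending string order, so the latest commit sorts first.
    Comparator<FileVersion> latestFirst =
        (o1, o2) -> o2.getCommitTime().compareTo(o1.getCommitTime());
    files.sort(latestFirst);
    files.forEach(f -> System.out.println(f.getCommitTime())); // 003, 002, 001
  }
}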
/**
 * Add a new data file to the file group, creating a file slice for its commit time
 * if one does not exist yet.
 */
public void addDataFile(HoodieDataFile dataFile) {
  if (!fileSlices.containsKey(dataFile.getCommitTime())) {
    fileSlices.put(dataFile.getCommitTime(), new FileSlice(dataFile.getCommitTime(), id));
  }
  fileSlices.get(dataFile.getCommitTime()).setDataFile(dataFile);
}
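// A small sketch of the slice-map pattern behind addDataFile, using hypothetical
// stand-ins for HoodieDataFile and FileSlice: each commit time owns exactly one
// slice, and a repeated add for the same commit replaces that slice's data file
// instead of creating a new slice. Not part of the codebase.
import java.util.TreeMap;

public class FileSliceMapSketch {
  // Hypothetical stand-ins, for illustration only.
  static class DataFile {
    final String commitTime;
    DataFile(String commitTime) { this.commitTime = commitTime; }
  }
  static class Slice {
    DataFile dataFile;
  }

  private final TreeMap<String, Slice> fileSlices = new TreeMap<>();

  void addDataFile(DataFile df) {
    // computeIfAbsent expresses the same create-if-missing logic as the
    // containsKey/put pair in the method above.
    fileSlices.computeIfAbsent(df.commitTime, ct -> new Slice()).dataFile = df;
  }

  public static void main(String[] args) {
    FileSliceMapSketch group = new FileSliceMapSketch();
    group.addDataFile(new DataFile("001"));
    group.addDataFile(new DataFile("002"));
    group.addDataFile(new DataFile("002")); // replaces the slice's data file
    System.out.println(group.fileSlices.size()); // 2 -- one slice per commit
  }
}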
@Override
public Stream<HoodieDataFile> getLatestDataFilesInRange(List<String> commitsToReturn) {
  return fileGroupMap.values().stream()
      .map(fileGroup -> fileGroup.getAllDataFiles()
          .filter(dataFile -> commitsToReturn.contains(dataFile.getCommitTime())
              && !isDataFileDueToPendingCompaction(dataFile))
          .findFirst())
      .filter(Optional::isPresent)
      .map(Optional::get);
}
@Override
public Stream<HoodieDataFile> getLatestDataFilesOn(String partitionPath, String instantTime) {
  return getAllFileGroups(partitionPath)
      .map(fileGroup -> fileGroup.getAllDataFiles()
          .filter(dataFile -> HoodieTimeline.compareTimestamps(dataFile.getCommitTime(), instantTime,
              HoodieTimeline.EQUAL))
          .filter(df -> !isDataFileDueToPendingCompaction(df))
          .findFirst())
      .filter(Optional::isPresent)
      .map(Optional::get);
}
@Override
public Stream<HoodieDataFile> getLatestDataFilesBeforeOrOn(String partitionPath, String maxCommitTime) {
  return getAllFileGroups(partitionPath)
      .map(fileGroup -> fileGroup.getAllDataFiles()
          .filter(dataFile -> HoodieTimeline.compareTimestamps(dataFile.getCommitTime(), maxCommitTime,
              HoodieTimeline.LESSER_OR_EQUAL))
          .filter(df -> !isDataFileDueToPendingCompaction(df))
          .findFirst())
      .filter(Optional::isPresent)
      .map(Optional::get);
}
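// The three queries above share one shape: per file group, scan data files
// latest-first, keep the first whose commit time satisfies a timestamp predicate
// and which is not shadowed by a pending compaction. A condensed sketch of that
// shape with hypothetical stand-ins (FileGroup, DataFile); only the predicate
// changes between the three methods. Not part of the codebase.
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.function.Predicate;
import java.util.stream.Stream;

public class LatestFileQuerySketch {
  // Hypothetical stand-ins, for illustration only.
  static class DataFile {
    final String commitTime;
    DataFile(String commitTime) { this.commitTime = commitTime; }
  }
  static class FileGroup {
    final List<DataFile> filesLatestFirst; // assumed sorted latest-first
    FileGroup(List<DataFile> files) { this.filesLatestFirst = files; }
    Stream<DataFile> getAllDataFiles() { return filesLatestFirst.stream(); }
  }

  // Shared shape of getLatestDataFilesInRange/On/BeforeOrOn: membership,
  // equality, or lessThanOrEquals is plugged in as commitTimePredicate.
  static Stream<DataFile> latestMatching(Stream<FileGroup> groups,
      Predicate<String> commitTimePredicate, Predicate<DataFile> pendingCompaction) {
    return groups
        .map(fg -> fg.getAllDataFiles()
            .filter(df -> commitTimePredicate.test(df.commitTime))
            .filter(pendingCompaction.negate())
            .findFirst())
        .filter(Optional::isPresent)
        .map(Optional::get);
  }

  public static void main(String[] args) {
    FileGroup fg = new FileGroup(Arrays.asList(new DataFile("003"), new DataFile("002")));
    // "before or on 002", with no pending compactions: returns the 002 file.
    latestMatching(Stream.of(fg), ct -> ct.compareTo("002") <= 0, df -> false)
        .forEach(df -> System.out.println(df.commitTime)); // prints 002
  }
}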
public CompactionOperation(java.util.Optional<HoodieDataFile> dataFile, String partitionPath,
    List<HoodieLogFile> logFiles, Map<String, Double> metrics) {
  if (dataFile.isPresent()) {
    this.baseInstantTime = dataFile.get().getCommitTime();
    this.dataFilePath = Optional.of(dataFile.get().getPath());
    this.fileId = dataFile.get().getFileId();
    this.dataFileCommitTime = Optional.of(dataFile.get().getCommitTime());
  } else {
    assert logFiles.size() > 0;
    this.dataFilePath = Optional.absent();
    this.baseInstantTime = FSUtils.getBaseCommitTimeFromLogPath(logFiles.get(0).getPath());
    this.fileId = FSUtils.getFileIdFromLogPath(logFiles.get(0).getPath());
    this.dataFileCommitTime = Optional.absent();
  }
  this.partitionPath = partitionPath;
  this.deltaFilePaths = logFiles.stream().map(s -> s.getPath().toString())
      .collect(Collectors.toList());
  this.metrics = metrics;
}
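// A sketch of the constructor's two branches with hypothetical stand-ins: when a
// base data file exists, its commit time anchors the compaction; for log-only
// file groups the base instant is taken from the first log file (in Hudi it is
// parsed out of the log path by FSUtils; here the stand-in carries it directly).
// Not part of the codebase.
import java.util.Arrays;
import java.util.List;
import java.util.Optional;

public class CompactionOperationSketch {
  // Hypothetical stand-ins, for illustration only.
  static class DataFile {
    final String commitTime;
    DataFile(String commitTime) { this.commitTime = commitTime; }
  }
  static class LogFile {
    final String baseCommitTime; // in Hudi, encoded in the log file's path
    LogFile(String baseCommitTime) { this.baseCommitTime = baseCommitTime; }
  }

  final String baseInstantTime;

  // Same branch structure as the constructor above: prefer the base file's
  // commit time; otherwise fall back to the first log file's base commit.
  CompactionOperationSketch(Optional<DataFile> dataFile, List<LogFile> logFiles) {
    if (dataFile.isPresent()) {
      baseInstantTime = dataFile.get().commitTime;
    } else {
      if (logFiles.isEmpty()) {
        throw new IllegalArgumentException("log-only operation needs at least one log file");
      }
      baseInstantTime = logFiles.get(0).baseCommitTime;
    }
  }

  public static void main(String[] args) {
    System.out.println(new CompactionOperationSketch(
        Optional.empty(), Arrays.asList(new LogFile("005"))).baseInstantTime); // 005
  }
}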
/**
 * With async compaction, it is possible to see partial/complete data files written by
 * in-flight compactions. Such data files must be ignored.
 *
 * @param dataFile Data file to check
 * @return true if the data file was produced by a still-pending compaction
 */
private boolean isDataFileDueToPendingCompaction(HoodieDataFile dataFile) {
  Pair<String, CompactionOperation> compactionWithInstantTime =
      fileIdToPendingCompaction.get(dataFile.getFileId());
  return (compactionWithInstantTime != null) && (compactionWithInstantTime.getLeft() != null)
      && dataFile.getCommitTime().equals(compactionWithInstantTime.getKey());
}
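// A focused sketch of the check's semantics, assuming a hypothetical map from
// file id to the instant time of its pending compaction: a data file is hidden
// only when its commit time equals that pending instant (i.e. it was written by
// the compaction itself); older committed versions of the same file group stay
// visible. Not part of the codebase.
import java.util.HashMap;
import java.util.Map;

public class PendingCompactionFilterSketch {
  // Hypothetical stand-in: file id -> instant time of its pending compaction.
  static final Map<String, String> fileIdToPendingInstant = new HashMap<>();

  static boolean isDueToPendingCompaction(String fileId, String commitTime) {
    String pendingInstant = fileIdToPendingInstant.get(fileId);
    // Only files stamped with the pending compaction's own instant are hidden.
    return pendingInstant != null && pendingInstant.equals(commitTime);
  }

  public static void main(String[] args) {
    fileIdToPendingInstant.put("f1", "007");
    System.out.println(isDueToPendingCompaction("f1", "007")); // true  (hidden)
    System.out.println(isDueToPendingCompaction("f1", "005")); // false (visible)
    System.out.println(isDueToPendingCompaction("f2", "007")); // false (no pending)
  }
}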
private List<HoodieCompactionOperation> createCompactionOperations(HoodieWriteConfig config,
    Map<Long, List<Long>> sizesMap, Map<Long, String> keyToPartitionMap) {
  List<HoodieCompactionOperation> operations = Lists.newArrayListWithCapacity(sizesMap.size());
  sizesMap.forEach((k, v) -> {
    HoodieDataFile df = TestHoodieDataFile.newDataFile(k);
    String partitionPath = keyToPartitionMap.get(k);
    List<HoodieLogFile> logFiles = v.stream().map(TestHoodieLogFile::newLogFile).collect(Collectors.toList());
    operations.add(new HoodieCompactionOperation(df.getCommitTime(),
        logFiles.stream().map(s -> s.getPath().toString()).collect(Collectors.toList()),
        df.getPath(), df.getFileId(), partitionPath,
        config.getCompactionStrategy().captureMetrics(config, Optional.of(df), partitionPath, logFiles)));
  });
  return operations;
}
private static HoodieLogFile generateLogData(Path parquetFilePath, boolean isLogSchemaSimple)
    throws IOException, InterruptedException, URISyntaxException {
  Schema schema = (isLogSchemaSimple ? SchemaTestUtil.getSimpleSchema() : SchemaTestUtil.getEvolvedSchema());
  HoodieDataFile dataFile = new HoodieDataFile(fileSystem.getFileStatus(parquetFilePath));
  // Write a log file for this parquet file
  Writer logWriter = HoodieLogFormat.newWriterBuilder().onParentPath(parquetFilePath.getParent())
      .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
      .withFileId(dataFile.getFileId())
      .overBaseCommit(dataFile.getCommitTime()).withFs(fileSystem).build();
  List<IndexedRecord> records = (isLogSchemaSimple ? SchemaTestUtil.generateTestRecords(0, 100)
      : SchemaTestUtil.generateEvolvedTestRecords(100, 100));
  Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap();
  header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, dataFile.getCommitTime());
  header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
  HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header);
  logWriter.appendBlock(dataBlock);
  logWriter.close();
  return logWriter.getLogFile();
}
dataFiles = partitionPaths.stream().flatMap(s -> {
  return view1.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("003"));
}).collect(Collectors.toList());
assertEquals("The data files for commit 003 should be present", 3, dataFiles.size());

dataFiles = partitionPaths.stream().flatMap(s -> {
  return view1.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("002"));
}).collect(Collectors.toList());
assertEquals("The data files for commit 002 should be present", 3, dataFiles.size());

dataFiles = partitionPaths.stream().flatMap(s -> {
  return view2.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("004"));
}).collect(Collectors.toList());
assertEquals("The data files for commit 004 should be present", 3, dataFiles.size());

final TableFileSystemView.ReadOptimizedView view3 = table.getROFileSystemView();
dataFiles = partitionPaths.stream().flatMap(s -> {
  return view3.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("002"));
}).collect(Collectors.toList());
assertEquals("The data files for commit 002 should be available", 3, dataFiles.size());

dataFiles = partitionPaths.stream().flatMap(s -> {
  return view3.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("003"));
}).collect(Collectors.toList());
assertEquals("The data files for commit 003 should be rolled back", 0, dataFiles.size());

dataFiles = partitionPaths.stream().flatMap(s -> {
  return view3.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("004"));
}).collect(Collectors.toList());
assertEquals("The data files for commit 004 should be rolled back", 0, dataFiles.size());
java.util.Optional<HoodieDataFile> dataFileForCompactionPresent =
    fileGroup.getAllDataFiles().filter(df -> {
      return compactionFileIdToLatestFileSlice.get(fileGroup.getId())
          .getBaseInstantTime().equals(df.getCommitTime());
    }).findAny();
Assert.assertTrue("Data File selected for compaction is retained",
    dataFileForCompactionPresent.isPresent());

for (int i = 0; i < dataFiles.size(); i++) {
  assertEquals("File " + fileId + " does not have latest versions on commits " + commitedVersions,
      Iterables.get(dataFiles, i).getCommitTime(),
      commitedVersions.get(commitedVersions.size() - 1 - i));
}
fileGroup.getAllDataFiles().forEach(value -> {
  logger.debug("Data File - " + value);
  commitTimes.add(value.getCommitTime());
});
assertEquals("Only acceptable versions of the file should be present", acceptableCommits, commitTimes);
assertEquals("Expect only one data-file to be sent", 1, dataFiles.size()); dataFiles.stream().forEach(df -> { assertEquals("Expect data-file for instant 1 be returned", df.getCommitTime(), instantTime1); }); assertEquals("Expect only one data-file to be sent", 1, dataFiles.size()); dataFiles.stream().forEach(df -> { assertEquals("Expect data-file for instant 1 be returned", df.getCommitTime(), instantTime1); }); assertEquals("Expect only one data-file to be sent", 1, dataFiles.size()); dataFiles.stream().forEach(df -> { assertEquals("Expect data-file for instant 1 be returned", df.getCommitTime(), instantTime1); }); assertEquals("Expect only one data-file to be sent", 1, dataFiles.size()); dataFiles.stream().forEach(df -> { assertEquals("Expect data-file for instant 1 be returned", df.getCommitTime(), instantTime1); }); assertEquals("Expect only one data-file to be sent", 1, dataFiles.size()); dataFiles.stream().forEach(df -> { assertEquals("Expect data-file created by compaction be returned", df.getCommitTime(), compactionRequestedTime); }); assertEquals("Expect only one data-file to be sent", 1, dataFiles.size()); dataFiles.stream().forEach(df -> { assertEquals("Expect data-file created by compaction be returned", df.getCommitTime(),
for (HoodieDataFile file : files) {
  if (file.getFileName().contains(file1)) {
    assertEquals("Existing file should be expanded", commitTime3, file.getCommitTime());
    records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), new Path(file.getPath()));
    for (GenericRecord record : records) {
      // ...
    }
  } else {
    assertEquals("New file must be written for commit 3", commitTime3, file.getCommitTime());
    records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), new Path(file.getPath()));
    for (GenericRecord record : records) {
      // ...
    }
  }
}
assertEquals("All files must be at commit 3", commitTime3, file.getCommitTime()); records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), new Path(file.getPath())); totalInserts += records.size();
if (compactedCommitTime.equals(file.getCommitTime())) {
  return true;
} else {
  // ...
}
if (absentCommit.equals(file.getCommitTime())) {
  return true;
} else {
  // ...
}