/**
 * Creates a merge handle for the file identified by {@code fileId}.
 *
 * <p>The constructor captures the table's read-optimized file system view, runs the
 * record-based {@code init} (which yields the partition path), and then completes
 * initialization with the latest data file in that partition whose file id equals
 * {@code fileId}, if one exists.
 *
 * @param config      write configuration forwarded to the parent handle
 * @param commitTime  commit time forwarded to the parent handle
 * @param hoodieTable table that supplies the read-optimized file system view
 * @param recordItr   iterator over the incoming records, passed to the first {@code init}
 * @param fileId      id of the file this handle is created for
 */
public HoodieMergeHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable,
    Iterator<HoodieRecord<T>> recordItr, String fileId) {
  super(config, commitTime, hoodieTable);
  this.fileSystemView = hoodieTable.getROFileSystemView();

  // The first init pass returns the partition path used for the lookup below.
  final String partitionPath = init(fileId, recordItr);

  // Latest data file in the partition carrying this file id; empty when none matches.
  final java.util.Optional<HoodieDataFile> latestFileForId = this.fileSystemView
      .getLatestDataFiles(partitionPath)
      .filter(candidate -> candidate.getFileId().equals(fileId))
      .findFirst();

  init(fileId, partitionPath, latestFileForId);
}
} else { List<HoodieDataFile> filteredFiles = roView.getLatestDataFiles() .collect(Collectors.toList()); LOG.info("Total paths to process after hoodie filter " + filteredFiles.size());
/**
 * Creates a merge handle for the file identified by {@code fileId}.
 *
 * <p>The constructor captures the table's read-optimized file system view, runs the
 * record-based {@code init} (which yields the partition path), and then completes
 * initialization with the latest data file in that partition whose file id equals
 * {@code fileId}, if one exists.
 *
 * @param config      write configuration forwarded to the parent handle
 * @param commitTime  commit time forwarded to the parent handle
 * @param hoodieTable table that supplies the read-optimized file system view
 * @param recordItr   iterator over the incoming records, passed to the first {@code init}
 * @param fileId      id of the file this handle is created for
 */
public HoodieMergeHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable,
    Iterator<HoodieRecord<T>> recordItr, String fileId) {
  super(config, commitTime, hoodieTable);
  this.fileSystemView = hoodieTable.getROFileSystemView();

  // The first init pass returns the partition path used for the lookup below.
  final String partitionPath = init(fileId, recordItr);

  // Latest data file in the partition carrying this file id; empty when none matches.
  final java.util.Optional<HoodieDataFile> latestFileForId = this.fileSystemView
      .getLatestDataFiles(partitionPath)
      .filter(candidate -> candidate.getFileId().equals(fileId))
      .findFirst();

  init(fileId, partitionPath, latestFileForId);
}
dataFiles = roView.getLatestDataFiles().collect(Collectors.toList()); if (skipCreatingDataFile) { assertEquals("Expect no data file to be returned", 0, dataFiles.size()); }); dataFiles = roView.getLatestDataFiles(partitionPath).collect(Collectors.toList()); if (skipCreatingDataFile) { assertEquals("Expect no data file to be returned", 0, dataFiles.size()); dataFiles = roView.getLatestDataFiles().collect(Collectors.toList()); assertEquals("Expect only one data-file to be sent", 1, dataFiles.size()); dataFiles.stream().forEach(df -> { compactionRequestedTime); }); dataFiles = roView.getLatestDataFiles(partitionPath).collect(Collectors.toList()); assertEquals("Expect only one data-file to be sent", 1, dataFiles.size()); dataFiles.stream().forEach(df -> {
TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); Stream<HoodieDataFile> dataFilesToRead = roView.getLatestDataFiles(); assertTrue(!dataFilesToRead.findAny().isPresent()); dataFilesToRead = roView.getLatestDataFiles(); assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit", dataFilesToRead.findAny().isPresent()); List<String> dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList()); List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath); assertEquals(recordsRead.size(), 200); dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList()); recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath); assertEquals(recordsRead.size(), 200); dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList()); recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath); assertEquals(recordsRead.size(), 200); hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg, jsc); roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles); dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList()); recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath); hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg, jsc); roView = new HoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline(), allFiles); List<HoodieDataFile> dataFiles2 = roView.getLatestDataFiles().collect(Collectors.toList());
List<HoodieDataFile> statuses1 = roView.getLatestDataFiles().collect(Collectors.toList()); assertEquals(3, statuses1.size()); Set<String> filenames = Sets.newHashSet();
TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); Stream<HoodieDataFile> dataFilesToRead = roView.getLatestDataFiles(); assertTrue(!dataFilesToRead.findAny().isPresent()); dataFilesToRead = roView.getLatestDataFiles(); assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit", dataFilesToRead.findAny().isPresent()); dataFilesToRead = roView.getLatestDataFiles(); assertTrue(dataFilesToRead.findAny().isPresent());
TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline().filterCompletedInstants(), allFiles); Stream<HoodieDataFile> dataFilesToRead = roView.getLatestDataFiles(); Map<String, Long> parquetFileIdToSize = dataFilesToRead.collect( Collectors.toMap(HoodieDataFile::getFileId, HoodieDataFile::getFileSize)); dataFilesToRead = roView.getLatestDataFiles(); List<HoodieDataFile> dataFilesList = dataFilesToRead.collect(Collectors.toList()); assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit", roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(), allFiles); dataFilesToRead = roView.getLatestDataFiles(); List<HoodieDataFile> newDataFilesList = dataFilesToRead.collect(Collectors.toList()); Map<String, Long> parquetFileIdToNewSize = newDataFilesList.stream().collect( .filter(entry -> parquetFileIdToSize.get(entry.getKey()) < entry.getValue()).count() > 0); List<String> dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList()); List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath);
List<HoodieDataFile> dataFiles = roView.getLatestDataFiles().collect(Collectors.toList()); assertTrue("No data file expected", dataFiles.isEmpty()); List<FileSlice> fileSliceList = rtView.getLatestFileSlices(partitionPath).collect(Collectors.toList());
TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles); Stream<HoodieDataFile> dataFilesToRead = roView.getLatestDataFiles(); assertTrue(!dataFilesToRead.findAny().isPresent()); dataFilesToRead = roView.getLatestDataFiles(); assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit", dataFilesToRead.findAny().isPresent()); dataFilesToRead = roView.getLatestDataFiles(); assertTrue(dataFilesToRead.findAny().isPresent()); List<String> dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList()); List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath);
roView.getLatestDataFiles(partitionPath).filter(dfile -> dfile.getFileId().equals(fileId)).findFirst() .isPresent()); refreshFsView(null); assertFalse("No commit, should not find any data file", roView.getLatestDataFiles(partitionPath).filter(dfile -> dfile.getFileId().equals(fileId)).findFirst() .isPresent()); refreshFsView(null); assertEquals("", fileName1, roView.getLatestDataFiles(partitionPath).filter(dfile -> dfile.getFileId().equals(fileId)).findFirst().get() .getFileName()); refreshFsView(null); assertEquals("", fileName1, roView.getLatestDataFiles(partitionPath).filter(dfile -> dfile.getFileId().equals(fileId)).findFirst().get() .getFileName()); refreshFsView(null); assertEquals("", fileName2, roView.getLatestDataFiles(partitionPath).filter(dfile -> dfile.getFileId().equals(fileId)).findFirst().get() .getFileName());
/**
 * Reads the given paths under a hoodie dataset as a DataFrame.
 *
 * <p>Each entry of {@code paths} is expanded with {@code fs.globStatus}; a file system view
 * restricted to the completed instants of the commits timeline is built over the resulting
 * file statuses, and the paths of its latest data files are gathered. The gathered paths are
 * then loaded together as parquet.
 *
 * @param jsc        Spark context (not referenced by this method's body)
 * @param basePath   base path used to create the {@code HoodieTableMetaClient}
 * @param sqlContext SQL context used to read the parquet files
 * @param fs         file system that provides the configuration and resolves the path patterns
 * @param paths      path patterns to read
 * @return a dataset over the latest data files found under {@code paths}
 * @throws HoodieException wrapping any exception raised while resolving or reading the files
 */
public static Dataset<Row> read(JavaSparkContext jsc, String basePath, SQLContext sqlContext, FileSystem fs,
    String... paths) {
  List<String> filteredPaths = new ArrayList<>();
  try {
    HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), basePath, true);
    for (String pathPattern : paths) {
      // One view per pattern, limited to completed instants of the commits timeline.
      TableFileSystemView.ReadOptimizedView latestView = new HoodieTableFileSystemView(
          metaClient,
          metaClient.getCommitsTimeline().filterCompletedInstants(),
          fs.globStatus(new Path(pathPattern)));
      List<String> latestPaths = latestView.getLatestDataFiles()
          .map(HoodieDataFile::getPath)
          .collect(Collectors.toList());
      filteredPaths.addAll(latestPaths);
    }
    String[] parquetPaths = filteredPaths.toArray(new String[filteredPaths.size()]);
    return sqlContext.read().parquet(parquetPaths);
  } catch (Exception e) {
    throw new HoodieException("Error reading hoodie dataset as a dataframe", e);
  }
}