@Override
public Partitioner getInsertPartitioner(WorkloadProfile profile) {
  return getUpsertPartitioner(profile);
}
@Override
public Iterator<List<WriteStatus>> handleInsert(String commitTime,
    Iterator<HoodieRecord<T>> recordItr) throws Exception {
  // If the index can look up records in log files, write inserts to log files;
  // otherwise write inserts to parquet files
  if (index.canIndexLogFiles()) {
    return new MergeOnReadLazyInsertIterable<>(recordItr, config, commitTime, this);
  } else {
    return super.handleInsert(commitTime, recordItr);
  }
}
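/*
 * A minimal sketch of the routing decision above, with the index capability reduced to a
 * boolean and the two write paths passed in as suppliers. Indexes that can resolve record
 * locations inside log files let inserts be appended as log blocks; indexes that only
 * understand parquet (e.g. bloom-filter based) force inserts into new parquet files. The
 * names below are illustrative, not real Hoodie classes.
 */
class InsertRoutingSketch {

  // Pick the log-file path only when the index is able to look records up there later.
  static <R> R routeInsert(boolean canIndexLogFiles, java.util.function.Supplier<R> logFilePath,
      java.util.function.Supplier<R> parquetFilePath) {
    return canIndexLogFiles ? logFilePath.get() : parquetFilePath.get();
  }
}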
/**
 * Performs cleaning of partition paths according to the cleaning policy and returns the
 * clean stats for each partition cleaned. Handles skew across partitions by making the
 * individual file to clean the unit of task distribution.
 *
 * @throws IllegalArgumentException if an unknown cleaning policy is provided
 */
@Override
public List<HoodieCleanStat> clean(JavaSparkContext jsc) {
  try {
    FileSystem fs = getMetaClient().getFs();
    List<String> partitionsToClean = FSUtils.getAllPartitionPaths(fs,
        getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning());
    logger.info("Partitions to clean up : " + partitionsToClean + ", with policy "
        + config.getCleanerPolicy());
    if (partitionsToClean.isEmpty()) {
      logger.info("Nothing to clean. All partitions are already clean");
      return Collections.emptyList();
    }
    return cleanPartitionPaths(partitionsToClean, jsc);
  } catch (IOException e) {
    throw new HoodieIOException("Failed to clean up after commit", e);
  }
}
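/*
 * The control flow of clean() reduced to a standalone sketch, assuming partition listing
 * and the distributed per-partition cleaning are supplied as plain functions; the real
 * method delegates the second step to cleanPartitionPaths() over Spark. Names are
 * illustrative.
 */
class CleanFlowSketch {

  static <S> java.util.List<S> clean(
      java.util.function.Supplier<java.util.List<String>> listPartitions,
      java.util.function.Function<java.util.List<String>, java.util.List<S>> cleanPartitions) {
    java.util.List<String> partitionsToClean = listPartitions.get();
    if (partitionsToClean.isEmpty()) {
      return java.util.Collections.emptyList(); // mirrors the early return above
    }
    return cleanPartitions.apply(partitionsToClean);
  }
}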
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);

TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
records.add(new HoodieRecord(
    new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));

Iterator<List<WriteStatus>> insertResult = table.handleInsert("100", records.iterator());
Path commitFile = new Path(
    config.getBasePath() + "/.hoodie/" + HoodieTimeline.makeCommitFileName("100"));
FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf()).create(commitFile);

System.out.println(fileId);
table = new HoodieCopyOnWriteTable(config, jsc);
table.handleUpdate("101", fileId, records.iterator());
} catch (ClassCastException e) {
  fail("UpdateFunction could not read records written with exampleSchema.txt using the "
@Test
public void testInsertWithPartialFailures() throws Exception {
  HoodieWriteConfig config = makeHoodieClientConfig();
  String commitTime = HoodieTestUtils.makeNewCommitTime();
  FileSystem fs = FSUtils.getFs(basePath, jsc.hadoopConfiguration());
  HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
  HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);

  // Write a few records, and get at least one file.
  // 10 records for partition 1, 1 record for partition 2.
  List<HoodieRecord> records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z");
  records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));

  // Simulate a crash after the first file
  List<WriteStatus> statuses = HoodieClientTestUtils
      .collectStatuses(table.handleInsert(commitTime, records.iterator()));
  WriteStatus status = statuses.get(0);
  Path partialFile = new Path(String.format("%s/%s/%s", basePath, status.getPartitionPath(),
      FSUtils.makeDataFileName(commitTime, 0, status.getFileId())));
  assertTrue(fs.exists(partialFile));

  // When we retry
  records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z");
  records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));
  statuses = HoodieClientTestUtils
      .collectStatuses(table.handleInsert(commitTime, records.iterator()));
  status = statuses.get(0);
  Path retriedFile = new Path(String.format("%s/%s/%s", basePath, status.getPartitionPath(),
      FSUtils.makeDataFileName(commitTime, 0, status.getFileId())));
  assertTrue(fs.exists(retriedFile));
  assertFalse(fs.exists(partialFile));
}
    throws IOException {
  String actionType = metaClient.getCommitActionType();
  HoodieActiveTimeline activeTimeline = this.getActiveTimeline();
  List<String> inflights = this.getInflightCommitTimeline().getInstants()
      .map(HoodieInstant::getTimestamp).collect(Collectors.toList());

      .getAllPartitionPaths(metaClient.getFs(), getMetaClient().getBasePath(),
          config.shouldAssumeDatePartitioning()))
      .map((Function<String, HoodieRollbackStat>) partitionPath -> {
        Map<FileStatus, Boolean> results = deleteCleanedFiles(partitionPath, commits);
        return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
            .withDeletedFileResults(results).build();

  cleanTemporaryDataFiles(jsc);
Optional<HoodieDataFile> oldDataFileOpt = hoodieCopyOnWriteTable.getROFileSystemView()
    .getLatestDataFilesOn(operation.getPartitionPath(), operation.getBaseInstantTime())
    .filter(df -> df.getFileId().equals(operation.getFileId())).findFirst();

    .handleUpdate(commitTime, operation.getFileId(), scanner.getRecords(), oldDataFileOpt);
} else {
  result = hoodieCopyOnWriteTable.handleInsert(commitTime, operation.getPartitionPath(),
      operation.getFileId(), scanner.iterator());
@SuppressWarnings("unchecked") @Override public Iterator<List<WriteStatus>> handleUpsertPartition(String commitTime, Integer partition, Iterator recordItr, Partitioner partitioner) { UpsertPartitioner upsertPartitioner = (UpsertPartitioner) partitioner; BucketInfo binfo = upsertPartitioner.getBucketInfo(partition); BucketType btype = binfo.bucketType; try { if (btype.equals(BucketType.INSERT)) { return handleInsert(commitTime, recordItr); } else if (btype.equals(BucketType.UPDATE)) { return handleUpdate(commitTime, binfo.fileLoc, recordItr); } else { throw new HoodieUpsertException( "Unknown bucketType " + btype + " for partition :" + partition); } } catch (Throwable t) { String msg = "Error upserting bucketType " + btype + " for partition :" + partition; logger.error(msg, t); throw new HoodieUpsertException(msg, t); } }
private UpsertPartitioner getUpsertPartitioner(int smallFileSize, int numInserts, int numUpdates,
    int fileSize, String testPartitionPath, boolean autoSplitInserts) throws Exception {
  HoodieWriteConfig config = makeHoodieClientConfigBuilder()
      .withCompactionConfig(HoodieCompactionConfig.newBuilder()
          .compactionSmallFileSize(smallFileSize).insertSplitSize(100)
          .autoTuneInsertSplits(autoSplitInserts).build())
      .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build())
      .build();

  HoodieClientTestUtils.fakeCommitFile(basePath, "001");
  HoodieClientTestUtils.fakeDataFile(basePath, testPartitionPath, "001", "file1", fileSize);
  HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
  HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);

  HoodieTestDataGenerator dataGenerator =
      new HoodieTestDataGenerator(new String[]{testPartitionPath});
  List<HoodieRecord> insertRecords = dataGenerator.generateInserts("001", numInserts);
  List<HoodieRecord> updateRecords = dataGenerator.generateUpdates("001", numUpdates);
  for (HoodieRecord updateRec : updateRecords) {
    updateRec.setCurrentLocation(new HoodieRecordLocation("001", "file1"));
  }
  List<HoodieRecord> records = new ArrayList<>();
  records.addAll(insertRecords);
  records.addAll(updateRecords);

  WorkloadProfile profile = new WorkloadProfile(jsc.parallelize(records));
  HoodieCopyOnWriteTable.UpsertPartitioner partitioner =
      (HoodieCopyOnWriteTable.UpsertPartitioner) table.getUpsertPartitioner(profile);
  assertEquals("Update record should have gone to the 1 update partition", 0,
      partitioner.getPartition(new Tuple2<>(updateRecords.get(0).getKey(),
          Option.apply(updateRecords.get(0).getCurrentLocation()))));
  return partitioner;
}
@Override
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, HoodieCompactionPlan compactionPlan,
    HoodieTable hoodieTable, HoodieWriteConfig config, String compactionInstantTime)
    throws IOException {
  if (compactionPlan == null || (compactionPlan.getOperations() == null)
      || (compactionPlan.getOperations().isEmpty())) {
    return jsc.emptyRDD();
  }
  HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
  // Compacting is very similar to applying updates to existing file
  HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);
  List<CompactionOperation> operations = compactionPlan.getOperations().stream()
      .map(CompactionOperation::convertFromAvroRecordInstance).collect(toList());
  log.info("Compactor compacting " + operations + " files");

  return jsc.parallelize(operations, operations.size())
      .map(s -> compact(table, metaClient, config, s, compactionInstantTime))
      .flatMap(writeStatusesItr -> writeStatusesItr.iterator());
}
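/*
 * A runnable toy version of the fan-out pattern used by compact() above: one Spark task
 * per compaction operation, each producing a list of results that are flattened into a
 * single RDD. Strings stand in for CompactionOperation/WriteStatus, and the
 * Iterator-returning flatMap assumes the Spark 2.x Java API.
 */
import java.util.Arrays;
import java.util.List;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class CompactFanOutSketch {

  public static JavaRDD<String> run(JavaSparkContext jsc, List<String> operations) {
    if (operations == null || operations.isEmpty()) {
      return jsc.emptyRDD(); // mirror the early-out in compact()
    }
    return jsc.parallelize(operations, operations.size()) // one partition per operation
        .map(op -> Arrays.asList(op + "-status1", op + "-status2")) // "compact" an operation
        .flatMap(List::iterator); // flatten the per-operation status lists
  }
}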
@Override
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileId,
    Iterator<HoodieRecord<T>> recordItr) throws IOException {
  logger.info("Merging updates for commit " + commitTime + " for file " + fileId);

  if (!index.canIndexLogFiles()
      && mergeOnReadUpsertPartitioner.getSmallFileIds().contains(fileId)) {
    logger.info(
        "Small file corrections for updates for commit " + commitTime + " for file " + fileId);
    return super.handleUpdate(commitTime, fileId, recordItr);
  } else {
    HoodieAppendHandle<T> appendHandle =
        new HoodieAppendHandle<>(config, commitTime, this, fileId, recordItr);
    appendHandle.doAppend();
    appendHandle.close();
    return Collections.singletonList(Collections.singletonList(appendHandle.getWriteStatus()))
        .iterator();
  }
}
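/*
 * The update routing above condensed into a pure function over its two inputs: the index
 * capability and the partitioner's small-file list. Small parquet files are rewritten via
 * the copy-on-write merge path so they grow toward the configured file size; every other
 * update becomes a cheap log append. The return values are illustrative labels.
 */
class UpdateRoutingSketch {

  static String routeUpdate(boolean canIndexLogFiles, java.util.Set<String> smallFileIds,
      String fileId) {
    if (!canIndexLogFiles && smallFileIds.contains(fileId)) {
      return "merge-into-parquet"; // the super.handleUpdate(...) path above
    }
    return "append-to-log"; // the HoodieAppendHandle path above
  }
}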
private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean,
    JavaSparkContext jsc) {
  int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism());
  logger.info("Using cleanerParallelism: " + cleanerParallelism);
  List<Tuple2<String, PartitionCleanStat>> partitionCleanStats = jsc
      .parallelize(partitionsToClean, cleanerParallelism)
      .flatMapToPair(getFilesToDeleteFunc(this, config))
      .repartition(cleanerParallelism) // repartition to remove skews
      .mapPartitionsToPair(deleteFilesFunc(this))
      .reduceByKey( // merge partition-level clean stats
          (Function2<PartitionCleanStat, PartitionCleanStat, PartitionCleanStat>)
              (e1, e2) -> e1.merge(e2))
      .collect();

  Map<String, PartitionCleanStat> partitionCleanStatsMap = partitionCleanStats.stream()
      .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2));

  HoodieCleanHelper cleaner = new HoodieCleanHelper(this, config);

  // Return a HoodieCleanStat for each partition passed.
  return partitionsToClean.stream().map(partitionPath -> {
    PartitionCleanStat partitionCleanStat =
        (partitionCleanStatsMap.containsKey(partitionPath))
            ? partitionCleanStatsMap.get(partitionPath)
            : new PartitionCleanStat(partitionPath);
    return HoodieCleanStat.newBuilder().withPolicy(config.getCleanerPolicy())
        .withPartitionPath(partitionPath)
        .withEarliestCommitRetained(cleaner.getEarliestCommitToRetain())
        .withDeletePathPattern(partitionCleanStat.deletePathPatterns)
        .withSuccessfulDeletes(partitionCleanStat.successDeleteFiles)
        .withFailedDeletes(partitionCleanStat.failedDeleteFiles).build();
  }).collect(Collectors.toList());
}
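/*
 * A runnable toy version of the skew-handling pipeline in cleanPartitionPaths(): expand
 * each partition into (partition, file) pairs so the file becomes the unit of work,
 * repartition so one hot partition cannot pin a single task, then reduce back to
 * per-partition stats. The deletion is simulated with a count of 1 per file; only the
 * Spark shape matches the code above.
 */
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class CleanSkewSketch {

  public static Map<String, Integer> deletedPerPartition(JavaSparkContext jsc,
      Map<String, List<String>> filesByPartition, int parallelism) {
    // Expand up front into (partition, file) pairs; a skewed partition contributes
    // proportionally more pairs instead of one oversized task.
    List<Tuple2<String, String>> pairs = filesByPartition.entrySet().stream()
        .flatMap(e -> e.getValue().stream().map(f -> new Tuple2<String, String>(e.getKey(), f)))
        .collect(Collectors.toList());
    return jsc.parallelizePairs(pairs, parallelism)
        .repartition(parallelism) // spread files evenly across tasks
        .mapToPair(t -> new Tuple2<String, Integer>(t._1(), 1)) // pretend each file was deleted
        .reduceByKey(Integer::sum) // merge back into per-partition stats
        .collectAsMap();
  }
}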
final FileSystem fs = getMetaClient().getFs();
final Path finalPath = new Path(config.getBasePath(), writeStat.getPath());

cleanTemporaryDataFiles(jsc);
/**
 * Common method used for cleaning out parquet files under a partition path during rollback
 * of a set of commits
 */
protected Map<FileStatus, Boolean> deleteCleanedFiles(String partitionPath, List<String> commits)
    throws IOException {
  Map<FileStatus, Boolean> results = Maps.newHashMap();
  // PathFilter to get all parquet files belonging to the commits being rolled back
  PathFilter filter = (path) -> {
    if (path.toString().contains(".parquet")) {
      String fileCommitTime = FSUtils.getCommitTime(path.getName());
      return commits.contains(fileCommitTime);
    }
    return false;
  };
  deleteCleanedFiles(results, partitionPath, filter);
  return results;
}
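/*
 * A small standalone illustration of the PathFilter above, under the assumption (matching
 * FSUtils.makeDataFileName as used in the tests here) that data files are named
 * <fileId>_<taskPartitionId>_<commitTime>.parquet, so the commit time is the last
 * underscore-delimited token. The inline parsing is a stand-in for FSUtils.getCommitTime.
 */
import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class RollbackFilterSketch {

  // Match parquet files whose embedded commit time names one of the commits being rolled back.
  static PathFilter forCommits(List<String> commits) {
    return path -> {
      String name = path.getName();
      if (!name.endsWith(".parquet")) {
        return false;
      }
      String commitTime =
          name.substring(name.lastIndexOf('_') + 1, name.length() - ".parquet".length());
      return commits.contains(commitTime);
    };
  }

  public static void main(String[] args) {
    PathFilter filter = forCommits(Arrays.asList("101"));
    System.out.println(filter.accept(new Path("/tbl/2016/01/31/f1_0_101.parquet"))); // true
    System.out.println(filter.accept(new Path("/tbl/2016/01/31/f1_0_100.parquet"))); // false
  }
}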
@Override
public void finalizeWrite(JavaSparkContext jsc, List<HoodieWriteStat> stats)
    throws HoodieIOException {
  // delegate to base class for MOR tables
  super.finalizeWrite(jsc, stats);
}
private static PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat>
    deleteFilesFunc(HoodieTable table) {
  return (PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat>)
      iter -> {
        Map<String, PartitionCleanStat> partitionCleanStatMap = new HashMap<>();
        FileSystem fs = table.getMetaClient().getFs();
        while (iter.hasNext()) {
          Tuple2<String, String> partitionDelFileTuple = iter.next();
          String partitionPath = partitionDelFileTuple._1();
          String deletePathStr = partitionDelFileTuple._2();
          Boolean deletedFileResult = deleteFileAndGetResult(fs, deletePathStr);
          if (!partitionCleanStatMap.containsKey(partitionPath)) {
            partitionCleanStatMap.put(partitionPath, new PartitionCleanStat(partitionPath));
          }
          PartitionCleanStat partitionCleanStat = partitionCleanStatMap.get(partitionPath);
          partitionCleanStat.addDeleteFilePatterns(deletePathStr);
          partitionCleanStat.addDeletedFileResult(deletePathStr, deletedFileResult);
        }
        return partitionCleanStatMap.entrySet().stream()
            .map(e -> new Tuple2<>(e.getKey(), e.getValue()))
            .collect(Collectors.toList()).iterator();
      };
}
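/*
 * The per-task accumulation used by deleteFilesFunc() above, shown standalone: walk a
 * stream of (partition, file) pairs, do the per-file work, and fold the outcomes into one
 * stat object per partition so only a handful of records are shuffled back. Counting
 * stands in for PartitionCleanStat here.
 */
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

public class PerPartitionAccumulationSketch {

  // pairs are [partitionPath, deletePath]; the returned map has one entry per partition.
  static Map<String, Integer> countByPartition(Iterator<String[]> partitionFilePairs) {
    Map<String, Integer> statsByPartition = new HashMap<>();
    while (partitionFilePairs.hasNext()) {
      String partitionPath = partitionFilePairs.next()[0];
      statsByPartition.merge(partitionPath, 1, Integer::sum); // accumulate per partition
    }
    return statsByPartition;
  }
}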
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);
HoodieClientTestUtils.collectStatuses(table.handleInsert(firstCommitTime, records.iterator()));

String newCommitTime = HoodieTestUtils.makeNewCommitTime();
metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
table = new HoodieCopyOnWriteTable(config, jsc);
Iterator<List<WriteStatus>> iter = table.handleUpdate(newCommitTime,
    updatedRecord1.getCurrentLocation().getFileId(), updatedRecords.iterator());
@Test
public void testFileSizeUpsertRecords() throws Exception {
  HoodieWriteConfig config = makeHoodieClientConfigBuilder().withStorageConfig(
      HoodieStorageConfig.newBuilder().limitFileSize(64 * 1024).parquetBlockSize(64 * 1024)
          .parquetPageSize(64 * 1024).build()).build();
  String commitTime = HoodieTestUtils.makeNewCommitTime();
  HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
  HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);

  List<HoodieRecord> records = new ArrayList<>();
  // Approx 1150 records are written for a block size of 64KB
  for (int i = 0; i < 2000; i++) {
    String recordStr = "{\"_row_key\":\"" + UUID.randomUUID().toString()
        + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}";
    TestRawTripPayload rowChange = new TestRawTripPayload(recordStr);
    records.add(new HoodieRecord(
        new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange));
  }

  // Insert new records
  HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));

  // Check the written files
  int counts = 0;
  for (File file : new File(basePath + "/2016/01/31").listFiles()) {
    if (file.getName().endsWith(".parquet")
        && FSUtils.getCommitTime(file.getName()).equals(commitTime)) {
      System.out.println(file.getName() + "-" + file.length());
      counts++;
    }
  }
  assertEquals("If the number of records is more than 1150, there should be a new file", 3,
      counts);
}