private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean, JavaSparkContext jsc) { int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism()); logger.info("Using cleanerParallelism: " + cleanerParallelism); List<Tuple2<String, PartitionCleanStat>> partitionCleanStats = jsc .parallelize(partitionsToClean, cleanerParallelism) .flatMapToPair(getFilesToDeleteFunc(this, config)) .repartition(cleanerParallelism) // repartition to remove skews .mapPartitionsToPair(deleteFilesFunc(this)).reduceByKey( // merge partition level clean stats below (Function2<PartitionCleanStat, PartitionCleanStat, PartitionCleanStat>) (e1, e2) -> e1 .merge(e2)).collect(); Map<String, PartitionCleanStat> partitionCleanStatsMap = partitionCleanStats.stream() .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2)); HoodieCleanHelper cleaner = new HoodieCleanHelper(this, config); // Return PartitionCleanStat for each partition passed. return partitionsToClean.stream().map(partitionPath -> { PartitionCleanStat partitionCleanStat = (partitionCleanStatsMap.containsKey(partitionPath)) ? partitionCleanStatsMap .get(partitionPath) : new PartitionCleanStat(partitionPath); return HoodieCleanStat.newBuilder().withPolicy(config.getCleanerPolicy()) .withPartitionPath(partitionPath) .withEarliestCommitRetained(cleaner.getEarliestCommitToRetain()) .withDeletePathPattern(partitionCleanStat.deletePathPatterns) .withSuccessfulDeletes(partitionCleanStat.successDeleteFiles) .withFailedDeletes(partitionCleanStat.failedDeleteFiles).build(); }).collect(Collectors.toList()); }
private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean, JavaSparkContext jsc) { int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism()); logger.info("Using cleanerParallelism: " + cleanerParallelism); List<Tuple2<String, PartitionCleanStat>> partitionCleanStats = jsc .parallelize(partitionsToClean, cleanerParallelism) .flatMapToPair(getFilesToDeleteFunc(this, config)) .repartition(cleanerParallelism) // repartition to remove skews .mapPartitionsToPair(deleteFilesFunc(this)).reduceByKey( // merge partition level clean stats below (Function2<PartitionCleanStat, PartitionCleanStat, PartitionCleanStat>) (e1, e2) -> e1 .merge(e2)).collect(); Map<String, PartitionCleanStat> partitionCleanStatsMap = partitionCleanStats.stream() .collect(Collectors.toMap(e -> e._1(), e -> e._2())); HoodieCleanHelper cleaner = new HoodieCleanHelper(this, config); // Return PartitionCleanStat for each partition passed. return partitionsToClean.stream().map(partitionPath -> { PartitionCleanStat partitionCleanStat = (partitionCleanStatsMap.containsKey(partitionPath)) ? partitionCleanStatsMap .get(partitionPath) : new PartitionCleanStat(partitionPath); return HoodieCleanStat.newBuilder().withPolicy(config.getCleanerPolicy()) .withPartitionPath(partitionPath) .withEarliestCommitRetained(cleaner.getEarliestCommitToRetain()) .withDeletePathPattern(partitionCleanStat.deletePathPatterns) .withSuccessfulDeletes(partitionCleanStat.successDeleteFiles) .withFailedDeletes(partitionCleanStat.failedDeleteFiles).build(); }).collect(Collectors.toList()); }