@Override
public Partitioner getInsertPartitioner(WorkloadProfile profile) {
  return getUpsertPartitioner(profile);
}
@Override
public Iterator<List<WriteStatus>> handleInsert(String commitTime,
    Iterator<HoodieRecord<T>> recordItr) throws Exception {
  // If the index can look up records in log files, write inserts to log files;
  // otherwise write inserts to parquet files
  if (index.canIndexLogFiles()) {
    return new MergeOnReadLazyInsertIterable<>(recordItr, config, commitTime, this);
  } else {
    return super.handleInsert(commitTime, recordItr);
  }
}
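/*
 * A minimal sketch of the routing decision above, with the index capability reduced to a
 * boolean and the two write paths passed in as suppliers. Indexes that can resolve record
 * locations inside log files let inserts be appended as log blocks; indexes that only
 * understand parquet (e.g. bloom-filter based) force inserts into new parquet files. The
 * names below are illustrative, not real Hoodie classes.
 */
class InsertRoutingSketch {

  // Pick the log-file path only when the index is able to look records up there later.
  static <R> R routeInsert(boolean canIndexLogFiles, java.util.function.Supplier<R> logFilePath,
      java.util.function.Supplier<R> parquetFilePath) {
    return canIndexLogFiles ? logFilePath.get() : parquetFilePath.get();
  }
}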
/**
 * Performs cleaning of partition paths according to the cleaning policy and returns the
 * clean stats for each partition cleaned. Handles skew across partitions by making the
 * individual file to clean the unit of task distribution.
 *
 * @throws IllegalArgumentException if an unknown cleaning policy is provided
 */
@Override
public List<HoodieCleanStat> clean(JavaSparkContext jsc) {
  try {
    FileSystem fs = getMetaClient().getFs();
    List<String> partitionsToClean = FSUtils.getAllPartitionPaths(fs,
        getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning());
    logger.info("Partitions to clean up : " + partitionsToClean + ", with policy "
        + config.getCleanerPolicy());
    if (partitionsToClean.isEmpty()) {
      logger.info("Nothing to clean. All partitions are already clean");
      return Collections.emptyList();
    }
    return cleanPartitionPaths(partitionsToClean, jsc);
  } catch (IOException e) {
    throw new HoodieIOException("Failed to clean up after commit", e);
  }
}
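/*
 * The control flow of clean() reduced to a standalone sketch, assuming partition listing
 * and the distributed per-partition cleaning are supplied as plain functions; the real
 * method delegates the second step to cleanPartitionPaths() over Spark. Names are
 * illustrative.
 */
class CleanFlowSketch {

  static <S> java.util.List<S> clean(
      java.util.function.Supplier<java.util.List<String>> listPartitions,
      java.util.function.Function<java.util.List<String>, java.util.List<S>> cleanPartitions) {
    java.util.List<String> partitionsToClean = listPartitions.get();
    if (partitionsToClean.isEmpty()) {
      return java.util.Collections.emptyList(); // mirrors the early return above
    }
    return cleanPartitions.apply(partitionsToClean);
  }
}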
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);

TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
records.add(new HoodieRecord(
    new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));

Iterator<List<WriteStatus>> insertResult = table.handleInsert("100", records.iterator());
Path commitFile = new Path(
    config.getBasePath() + "/.hoodie/" + HoodieTimeline.makeCommitFileName("100"));
FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf()).create(commitFile);

System.out.println(fileId);
table = new HoodieCopyOnWriteTable(config, jsc);
table.handleUpdate("101", fileId, records.iterator());
} catch (ClassCastException e) {
  fail("UpdateFunction could not read records written with exampleSchema.txt using the "
@Test
public void testInsertWithPartialFailures() throws Exception {
  HoodieWriteConfig config = makeHoodieClientConfig();
  String commitTime = HoodieTestUtils.makeNewCommitTime();
  FileSystem fs = FSUtils.getFs(basePath, jsc.hadoopConfiguration());
  HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
  HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);

  // Write a few records, and get at least one file.
  // 10 records for partition 1, 1 record for partition 2.
  List<HoodieRecord> records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z");
  records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));

  // Simulate a crash after the first file
  List<WriteStatus> statuses = HoodieClientTestUtils
      .collectStatuses(table.handleInsert(commitTime, records.iterator()));
  WriteStatus status = statuses.get(0);
  Path partialFile = new Path(String.format("%s/%s/%s", basePath, status.getPartitionPath(),
      FSUtils.makeDataFileName(commitTime, 0, status.getFileId())));
  assertTrue(fs.exists(partialFile));

  // When we retry
  records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z");
  records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));
  statuses = HoodieClientTestUtils
      .collectStatuses(table.handleInsert(commitTime, records.iterator()));
  status = statuses.get(0);
  Path retriedFile = new Path(String.format("%s/%s/%s", basePath, status.getPartitionPath(),
      FSUtils.makeDataFileName(commitTime, 0, status.getFileId())));
  assertTrue(fs.exists(retriedFile));
  assertFalse(fs.exists(partialFile));
}
    throws IOException {
  String actionType = metaClient.getCommitActionType();
  HoodieActiveTimeline activeTimeline = this.getActiveTimeline();
  List<String> inflights = this.getInflightCommitTimeline().getInstants()
      .map(HoodieInstant::getTimestamp).collect(Collectors.toList());

      .getAllPartitionPaths(metaClient.getFs(), getMetaClient().getBasePath(),
          config.shouldAssumeDatePartitioning()))
      .map((Function<String, HoodieRollbackStat>) partitionPath -> {
        Map<FileStatus, Boolean> results = deleteCleanedFiles(partitionPath, commits);
        return HoodieRollbackStat.newBuilder().withPartitionPath(partitionPath)
            .withDeletedFileResults(results).build();

  cleanTemporaryDataFiles(jsc);
Optional<HoodieDataFile> oldDataFileOpt = hoodieCopyOnWriteTable.getROFileSystemView()
    .getLatestDataFilesOn(operation.getPartitionPath(), operation.getBaseInstantTime())
    .filter(df -> df.getFileId().equals(operation.getFileId())).findFirst();

    .handleUpdate(commitTime, operation.getFileId(), scanner.getRecords(), oldDataFileOpt);
} else {
  result = hoodieCopyOnWriteTable.handleInsert(commitTime, operation.getPartitionPath(),
      operation.getFileId(), scanner.iterator());
@SuppressWarnings("unchecked") @Override public Iterator<List<WriteStatus>> handleUpsertPartition(String commitTime, Integer partition, Iterator recordItr, Partitioner partitioner) { UpsertPartitioner upsertPartitioner = (UpsertPartitioner) partitioner; BucketInfo binfo = upsertPartitioner.getBucketInfo(partition); BucketType btype = binfo.bucketType; try { if (btype.equals(BucketType.INSERT)) { return handleInsert(commitTime, recordItr); } else if (btype.equals(BucketType.UPDATE)) { return handleUpdate(commitTime, binfo.fileLoc, recordItr); } else { throw new HoodieUpsertException( "Unknown bucketType " + btype + " for partition :" + partition); } } catch (Throwable t) { String msg = "Error upserting bucketType " + btype + " for partition :" + partition; logger.error(msg, t); throw new HoodieUpsertException(msg, t); } }
private UpsertPartitioner getUpsertPartitioner(int smallFileSize, int numInserts, int numUpdates,
    int fileSize, String testPartitionPath, boolean autoSplitInserts) throws Exception {
  HoodieWriteConfig config = makeHoodieClientConfigBuilder()
      .withCompactionConfig(HoodieCompactionConfig.newBuilder()
          .compactionSmallFileSize(smallFileSize).insertSplitSize(100)
          .autoTuneInsertSplits(autoSplitInserts).build())
      .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build())
      .build();

  HoodieClientTestUtils.fakeCommitFile(basePath, "001");
  HoodieClientTestUtils.fakeDataFile(basePath, testPartitionPath, "001", "file1", fileSize);
  HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
  HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);

  HoodieTestDataGenerator dataGenerator =
      new HoodieTestDataGenerator(new String[]{testPartitionPath});
  List<HoodieRecord> insertRecords = dataGenerator.generateInserts("001", numInserts);
  List<HoodieRecord> updateRecords = dataGenerator.generateUpdates("001", numUpdates);
  for (HoodieRecord updateRec : updateRecords) {
    updateRec.setCurrentLocation(new HoodieRecordLocation("001", "file1"));
  }
  List<HoodieRecord> records = new ArrayList<>();
  records.addAll(insertRecords);
  records.addAll(updateRecords);

  WorkloadProfile profile = new WorkloadProfile(jsc.parallelize(records));
  HoodieCopyOnWriteTable.UpsertPartitioner partitioner =
      (HoodieCopyOnWriteTable.UpsertPartitioner) table.getUpsertPartitioner(profile);
  assertEquals("Update record should have gone to the 1 update partition", 0,
      partitioner.getPartition(new Tuple2<>(updateRecords.get(0).getKey(),
          Option.apply(updateRecords.get(0).getCurrentLocation()))));
  return partitioner;
}
@Override
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, HoodieCompactionPlan compactionPlan,
    HoodieTable hoodieTable, HoodieWriteConfig config, String compactionInstantTime)
    throws IOException {
  if (compactionPlan == null || (compactionPlan.getOperations() == null)
      || (compactionPlan.getOperations().isEmpty())) {
    return jsc.emptyRDD();
  }
  HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
  // Compacting is very similar to applying updates to existing file
  HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);
  List<CompactionOperation> operations = compactionPlan.getOperations().stream()
      .map(CompactionOperation::convertFromAvroRecordInstance).collect(toList());
  log.info("Compactor compacting " + operations + " files");

  return jsc.parallelize(operations, operations.size())
      .map(s -> compact(table, metaClient, config, s, compactionInstantTime))
      .flatMap(writeStatusesItr -> writeStatusesItr.iterator());
}
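/*
 * A runnable toy version of the fan-out pattern used by compact() above: one Spark task
 * per compaction operation, each producing a list of results that are flattened into a
 * single RDD. Strings stand in for CompactionOperation/WriteStatus, and the
 * Iterator-returning flatMap assumes the Spark 2.x Java API.
 */
import java.util.Arrays;
import java.util.List;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class CompactFanOutSketch {

  public static JavaRDD<String> run(JavaSparkContext jsc, List<String> operations) {
    if (operations == null || operations.isEmpty()) {
      return jsc.emptyRDD(); // mirror the early-out in compact()
    }
    return jsc.parallelize(operations, operations.size()) // one partition per operation
        .map(op -> Arrays.asList(op + "-status1", op + "-status2")) // "compact" an operation
        .flatMap(List::iterator); // flatten the per-operation status lists
  }
}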
@Override
public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileId,
    Iterator<HoodieRecord<T>> recordItr) throws IOException {
  logger.info("Merging updates for commit " + commitTime + " for file " + fileId);

  if (!index.canIndexLogFiles()
      && mergeOnReadUpsertPartitioner.getSmallFileIds().contains(fileId)) {
    logger.info(
        "Small file corrections for updates for commit " + commitTime + " for file " + fileId);
    return super.handleUpdate(commitTime, fileId, recordItr);
  } else {
    HoodieAppendHandle<T> appendHandle =
        new HoodieAppendHandle<>(config, commitTime, this, fileId, recordItr);
    appendHandle.doAppend();
    appendHandle.close();
    return Collections.singletonList(Collections.singletonList(appendHandle.getWriteStatus()))
        .iterator();
  }
}
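/*
 * The update routing above condensed into a pure function over its two inputs: the index
 * capability and the partitioner's small-file list. Small parquet files are rewritten via
 * the copy-on-write merge path so they grow toward the configured file size; every other
 * update becomes a cheap log append. The return values are illustrative labels.
 */
class UpdateRoutingSketch {

  static String routeUpdate(boolean canIndexLogFiles, java.util.Set<String> smallFileIds,
      String fileId) {
    if (!canIndexLogFiles && smallFileIds.contains(fileId)) {
      return "merge-into-parquet"; // the super.handleUpdate(...) path above
    }
    return "append-to-log"; // the HoodieAppendHandle path above
  }
}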
private List<HoodieCleanStat> cleanPartitionPaths(List<String> partitionsToClean,
    JavaSparkContext jsc) {
  int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism());
  logger.info("Using cleanerParallelism: " + cleanerParallelism);
  List<Tuple2<String, PartitionCleanStat>> partitionCleanStats = jsc
      .parallelize(partitionsToClean, cleanerParallelism)
      .flatMapToPair(getFilesToDeleteFunc(this, config))
      .repartition(cleanerParallelism) // repartition to remove skews
      .mapPartitionsToPair(deleteFilesFunc(this))
      .reduceByKey( // merge partition-level clean stats
          (Function2<PartitionCleanStat, PartitionCleanStat, PartitionCleanStat>)
              (e1, e2) -> e1.merge(e2))
      .collect();

  Map<String, PartitionCleanStat> partitionCleanStatsMap = partitionCleanStats.stream()
      .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2));

  HoodieCleanHelper cleaner = new HoodieCleanHelper(this, config);

  // Return a HoodieCleanStat for each partition passed.
  return partitionsToClean.stream().map(partitionPath -> {
    PartitionCleanStat partitionCleanStat =
        (partitionCleanStatsMap.containsKey(partitionPath))
            ? partitionCleanStatsMap.get(partitionPath)
            : new PartitionCleanStat(partitionPath);
    return HoodieCleanStat.newBuilder().withPolicy(config.getCleanerPolicy())
        .withPartitionPath(partitionPath)
        .withEarliestCommitRetained(cleaner.getEarliestCommitToRetain())
        .withDeletePathPattern(partitionCleanStat.deletePathPatterns)
        .withSuccessfulDeletes(partitionCleanStat.successDeleteFiles)
        .withFailedDeletes(partitionCleanStat.failedDeleteFiles).build();
  }).collect(Collectors.toList());
}
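/*
 * A runnable toy version of the skew-handling pipeline in cleanPartitionPaths(): expand
 * each partition into (partition, file) pairs so the file becomes the unit of work,
 * repartition so one hot partition cannot pin a single task, then reduce back to
 * per-partition stats. The deletion is simulated with a count of 1 per file; only the
 * Spark shape matches the code above.
 */
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class CleanSkewSketch {

  public static Map<String, Integer> deletedPerPartition(JavaSparkContext jsc,
      Map<String, List<String>> filesByPartition, int parallelism) {
    // Expand up front into (partition, file) pairs; a skewed partition contributes
    // proportionally more pairs instead of one oversized task.
    List<Tuple2<String, String>> pairs = filesByPartition.entrySet().stream()
        .flatMap(e -> e.getValue().stream().map(f -> new Tuple2<String, String>(e.getKey(), f)))
        .collect(Collectors.toList());
    return jsc.parallelizePairs(pairs, parallelism)
        .repartition(parallelism) // spread files evenly across tasks
        .mapToPair(t -> new Tuple2<String, Integer>(t._1(), 1)) // pretend each file was deleted
        .reduceByKey(Integer::sum) // merge back into per-partition stats
        .collectAsMap();
  }
}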
final FileSystem fs = getMetaClient().getFs();
final Path finalPath = new Path(config.getBasePath(), writeStat.getPath());

cleanTemporaryDataFiles(jsc);
/**
 * Common method used for cleaning out parquet files under a partition path during rollback
 * of a set of commits
 */
protected Map<FileStatus, Boolean> deleteCleanedFiles(String partitionPath, List<String> commits)
    throws IOException {
  Map<FileStatus, Boolean> results = Maps.newHashMap();
  // PathFilter to get all parquet files belonging to the commits being rolled back
  PathFilter filter = (path) -> {
    if (path.toString().contains(".parquet")) {
      String fileCommitTime = FSUtils.getCommitTime(path.getName());
      return commits.contains(fileCommitTime);
    }
    return false;
  };
  deleteCleanedFiles(results, partitionPath, filter);
  return results;
}
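/*
 * A small standalone illustration of the PathFilter above, under the assumption (matching
 * FSUtils.makeDataFileName as used in the tests here) that data files are named
 * <fileId>_<taskPartitionId>_<commitTime>.parquet, so the commit time is the last
 * underscore-delimited token. The inline parsing is a stand-in for FSUtils.getCommitTime.
 */
import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class RollbackFilterSketch {

  // Match parquet files whose embedded commit time names one of the commits being rolled back.
  static PathFilter forCommits(List<String> commits) {
    return path -> {
      String name = path.getName();
      if (!name.endsWith(".parquet")) {
        return false;
      }
      String commitTime =
          name.substring(name.lastIndexOf('_') + 1, name.length() - ".parquet".length());
      return commits.contains(commitTime);
    };
  }

  public static void main(String[] args) {
    PathFilter filter = forCommits(Arrays.asList("101"));
    System.out.println(filter.accept(new Path("/tbl/2016/01/31/f1_0_101.parquet"))); // true
    System.out.println(filter.accept(new Path("/tbl/2016/01/31/f1_0_100.parquet"))); // false
  }
}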
@Override
public void finalizeWrite(JavaSparkContext jsc, List<HoodieWriteStat> stats)
    throws HoodieIOException {
  // delegate to base class for MOR tables
  super.finalizeWrite(jsc, stats);
}
private static PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat>
    deleteFilesFunc(HoodieTable table) {
  return (PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat>)
      iter -> {
        Map<String, PartitionCleanStat> partitionCleanStatMap = new HashMap<>();
        FileSystem fs = table.getMetaClient().getFs();
        while (iter.hasNext()) {
          Tuple2<String, String> partitionDelFileTuple = iter.next();
          String partitionPath = partitionDelFileTuple._1();
          String deletePathStr = partitionDelFileTuple._2();
          Boolean deletedFileResult = deleteFileAndGetResult(fs, deletePathStr);
          if (!partitionCleanStatMap.containsKey(partitionPath)) {
            partitionCleanStatMap.put(partitionPath, new PartitionCleanStat(partitionPath));
          }
          PartitionCleanStat partitionCleanStat = partitionCleanStatMap.get(partitionPath);
          partitionCleanStat.addDeleteFilePatterns(deletePathStr);
          partitionCleanStat.addDeletedFileResult(deletePathStr, deletedFileResult);
        }
        return partitionCleanStatMap.entrySet().stream()
            .map(e -> new Tuple2<>(e.getKey(), e.getValue()))
            .collect(Collectors.toList()).iterator();
      };
}
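/*
 * The per-task accumulation used by deleteFilesFunc() above, shown standalone: walk a
 * stream of (partition, file) pairs, do the per-file work, and fold the outcomes into one
 * stat object per partition so only a handful of records are shuffled back. Counting
 * stands in for PartitionCleanStat here.
 */
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

public class PerPartitionAccumulationSketch {

  // pairs are [partitionPath, deletePath]; the returned map has one entry per partition.
  static Map<String, Integer> countByPartition(Iterator<String[]> partitionFilePairs) {
    Map<String, Integer> statsByPartition = new HashMap<>();
    while (partitionFilePairs.hasNext()) {
      String partitionPath = partitionFilePairs.next()[0];
      statsByPartition.merge(partitionPath, 1, Integer::sum); // accumulate per partition
    }
    return statsByPartition;
  }
}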
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);
HoodieClientTestUtils.collectStatuses(table.handleInsert(firstCommitTime, records.iterator()));

String newCommitTime = HoodieTestUtils.makeNewCommitTime();
metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
table = new HoodieCopyOnWriteTable(config, jsc);
Iterator<List<WriteStatus>> iter = table.handleUpdate(newCommitTime,
    updatedRecord1.getCurrentLocation().getFileId(), updatedRecords.iterator());
@Test
public void testFileSizeUpsertRecords() throws Exception {
  HoodieWriteConfig config = makeHoodieClientConfigBuilder().withStorageConfig(
      HoodieStorageConfig.newBuilder().limitFileSize(64 * 1024).parquetBlockSize(64 * 1024)
          .parquetPageSize(64 * 1024).build()).build();
  String commitTime = HoodieTestUtils.makeNewCommitTime();
  HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
  HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);

  List<HoodieRecord> records = new ArrayList<>();
  // Approx 1150 records are written for a block size of 64KB
  for (int i = 0; i < 2000; i++) {
    String recordStr = "{\"_row_key\":\"" + UUID.randomUUID().toString()
        + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}";
    TestRawTripPayload rowChange = new TestRawTripPayload(recordStr);
    records.add(new HoodieRecord(
        new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange));
  }

  // Insert new records
  HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));

  // Check the written files
  int counts = 0;
  for (File file : new File(basePath + "/2016/01/31").listFiles()) {
    if (file.getName().endsWith(".parquet")
        && FSUtils.getCommitTime(file.getName()).equals(commitTime)) {
      System.out.println(file.getName() + "-" + file.length());
      counts++;
    }
  }
  assertEquals("If the number of records is more than 1150, there should be a new file", 3,
      counts);
}