public HoodieMergeHandle(HoodieWriteConfig config, String commitTime, HoodieTable<T> hoodieTable,
    Map<String, HoodieRecord<T>> keyToNewRecords, String fileId,
    Optional<HoodieDataFile> dataFileToBeMerged) {
  super(config, commitTime, hoodieTable);
  this.fileSystemView = hoodieTable.getROFileSystemView();
  this.keyToNewRecords = keyToNewRecords;
  init(fileId,
      keyToNewRecords.get(keyToNewRecords.keySet().stream().findFirst().get()).getPartitionPath(),
      dataFileToBeMerged);
}
@Override
public boolean canWrite(HoodieRecord record) {
  return storageWriter.canWrite()
      && record.getPartitionPath().equals(writeStatus.getPartitionPath());
}
@Override
public boolean canWrite(HoodieRecord record) {
  return storageWriter.canWrite()
      && record.getPartitionPath().equals(status.getPartitionPath());
}
  public HoodieRecordMissingException(HoodieRecord record) {
    super("Record " + record.getRecordKey() + " with partition path " + record.getPartitionPath()
        + " in current location " + record.getCurrentLocation() + " is not found in the partition");
  }
}
public static Optional<String> convertToString(HoodieRecord record) {
  try {
    String str = ((TestRawTripPayload) record.getData()).getJsonData();
    str = "{" + str.substring(str.indexOf("\"timestamp\":"));
    return Optional.of(str.replaceAll("}", ", \"partition\": \"" + record.getPartitionPath() + "\"}"));
  } catch (IOException e) {
    return Optional.empty();
  }
}
private void buildProfile() {
  Map<Tuple2<String, Option<HoodieRecordLocation>>, Long> partitionLocationCounts = taggedRecords
      .mapToPair(record -> new Tuple2<>(
          new Tuple2<>(record.getPartitionPath(), Option.apply(record.getCurrentLocation())), record))
      .countByKey();

  for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e : partitionLocationCounts.entrySet()) {
    String partitionPath = e.getKey()._1();
    Long count = e.getValue();
    Option<HoodieRecordLocation> locOption = e.getKey()._2();

    if (!partitionPathStatMap.containsKey(partitionPath)) {
      partitionPathStatMap.put(partitionPath, new WorkloadStat());
    }

    if (locOption.isDefined()) {
      // update
      partitionPathStatMap.get(partitionPath).addUpdates(locOption.get(), count);
      globalStat.addUpdates(locOption.get(), count);
    } else {
      // insert
      partitionPathStatMap.get(partitionPath).addInserts(count);
      globalStat.addInserts(count);
    }
  }
}
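For illustration, here is a minimal sketch of the same bookkeeping idea in plain Java, without Spark: count incoming records per (partitionPath, existing location) pair, then treat records that already have a location as updates and the rest as inserts. The types and values below are hypothetical, not Hudi's API.

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class WorkloadProfileSketch {
  public static void main(String[] args) {
    // Each entry is (partitionPath, fileId-or-null); a null fileId means the record has no
    // known location yet, i.e. it is an insert.
    List<String[]> taggedRecords = Arrays.asList(
        new String[]{"2016/05/04", "file-001"},
        new String[]{"2016/05/04", null},
        new String[]{"2016/05/05", null});

    // Equivalent of countByKey(): count records per (partitionPath, location) pair.
    Map<List<String>, Long> partitionLocationCounts = new HashMap<>();
    for (String[] rec : taggedRecords) {
      partitionLocationCounts.merge(Arrays.asList(rec[0], rec[1]), 1L, Long::sum);
    }

    // Classify each group: a defined location means updates, no location means inserts.
    for (Map.Entry<List<String>, Long> e : partitionLocationCounts.entrySet()) {
      String partitionPath = e.getKey().get(0);
      String location = e.getKey().get(1);
      long count = e.getValue();
      System.out.println(partitionPath + " -> "
          + (location != null ? count + " update(s) in " + location : count + " insert(s)"));
    }
  }
}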
@Override
public void writeAvroWithMetadata(R avroRecord, HoodieRecord record) throws IOException {
  String seqId = HoodieRecord.generateSequenceId(commitTime, TaskContext.getPartitionId(),
      recordIndex.getAndIncrement());
  HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) avroRecord, record.getRecordKey(),
      record.getPartitionPath(), file.getName());
  HoodieAvroUtils.addCommitMetadataToRecord((GenericRecord) avroRecord, commitTime, seqId);
  super.write(avroRecord);
  writeSupport.add(record.getRecordKey());
}
/**
 * Assert that there is no duplicate key at the partition level.
 *
 * @param records List of Hoodie records
 */
void assertNodupesWithinPartition(List<HoodieRecord> records) {
  Map<String, Set<String>> partitionToKeys = new HashMap<>();
  for (HoodieRecord r : records) {
    String key = r.getRecordKey();
    String partitionPath = r.getPartitionPath();
    if (!partitionToKeys.containsKey(partitionPath)) {
      partitionToKeys.put(partitionPath, new HashSet<>());
    }
    assertTrue("key " + key + " is duplicate within partition " + partitionPath,
        !partitionToKeys.get(partitionPath).contains(key));
    partitionToKeys.get(partitionPath).add(key);
  }
}
private JavaRDD<WriteStatus> bulkInsertInternal(JavaRDD<HoodieRecord<T>> dedupedRecords, String commitTime,
    HoodieTable<T> table, Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) {
  final JavaRDD<HoodieRecord<T>> repartitionedRecords;
  if (bulkInsertPartitioner.isDefined()) {
    repartitionedRecords = bulkInsertPartitioner.get()
        .repartitionRecords(dedupedRecords, config.getBulkInsertShuffleParallelism());
  } else {
    // Now, sort the records and line them up nicely for loading.
    repartitionedRecords = dedupedRecords.sortBy(record -> {
      // Use "partitionPath + key" as the sort key. Spark will ensure the records are split evenly
      // across RDD partitions, such that small partitions fit into one RDD partition, while big
      // ones spread evenly across multiple RDD partitions.
      return String.format("%s+%s", record.getPartitionPath(), record.getRecordKey());
    }, true, config.getBulkInsertShuffleParallelism());
  }
  JavaRDD<WriteStatus> writeStatusRDD = repartitionedRecords
      .mapPartitionsWithIndex(new BulkInsertMapFunction<T>(commitTime, config, table), true)
      .flatMap(writeStatuses -> writeStatuses.iterator());
  return updateIndexAndCommitIfNeeded(writeStatusRDD, table, commitTime);
}
@Test
public void testMakeNewPath() throws Exception {
  String fileName = UUID.randomUUID().toString();
  String partitionPath = "2016/05/04";
  int unitNumber = (int) (Math.random() * 10);
  HoodieRecord record = mock(HoodieRecord.class);
  when(record.getPartitionPath()).thenReturn(partitionPath);

  String commitTime = HoodieTestUtils.makeNewCommitTime();
  HoodieWriteConfig config = makeHoodieClientConfig();
  HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
  HoodieTable table = HoodieTable.getHoodieTable(metaClient, config, jsc);

  HoodieCreateHandle io = new HoodieCreateHandle(config, commitTime, table, partitionPath,
      UUID.randomUUID().toString());
  Path newPath = io.makeNewPath(record.getPartitionPath(), unitNumber, fileName);
  assertTrue(newPath.toString().equals(this.basePath + "/" + partitionPath + "/"
      + FSUtils.makeDataFileName(commitTime, unitNumber, fileName)));
}
/**
 * Load the new incoming records into a map and return the partitionPath.
 */
private String init(String fileId, Iterator<HoodieRecord<T>> newRecordsItr) {
  try {
    // Load the new records in a map
    logger.info("MaxMemoryPerPartitionMerge => " + config.getMaxMemoryPerPartitionMerge());
    this.keyToNewRecords = new ExternalSpillableMap<>(config.getMaxMemoryPerPartitionMerge(),
        config.getSpillableMapBasePath(), new DefaultSizeEstimator(),
        new HoodieRecordSizeEstimator(schema));
  } catch (IOException io) {
    throw new HoodieIOException("Cannot instantiate an ExternalSpillableMap", io);
  }
  String partitionPath = null;
  while (newRecordsItr.hasNext()) {
    HoodieRecord<T> record = newRecordsItr.next();
    partitionPath = record.getPartitionPath();
    keyToNewRecords.put(record.getRecordKey(), record);
    // update the new location of the record, so we know where to find it next
    record.setNewLocation(new HoodieRecordLocation(commitTime, fileId));
  }
  logger.info("Number of entries in MemoryBasedMap => "
      + ((ExternalSpillableMap) keyToNewRecords).getInMemoryMapNumEntries()
      + ", Total size in bytes of MemoryBasedMap => "
      + ((ExternalSpillableMap) keyToNewRecords).getCurrentInMemoryMapSize()
      + ", Number of entries in DiskBasedMap => "
      + ((ExternalSpillableMap) keyToNewRecords).getDiskBasedMapNumEntries()
      + ", Size of file spilled to disk => "
      + ((ExternalSpillableMap) keyToNewRecords).getSizeOfFileOnDiskInBytes());
  return partitionPath;
}
public static void writeRecordsToLogFiles(FileSystem fs, String basePath, Schema schema,
    List<HoodieRecord> updatedRecords) {
  Map<HoodieRecordLocation, List<HoodieRecord>> groupedUpdated = updatedRecords.stream()
      .collect(Collectors.groupingBy(HoodieRecord::getCurrentLocation));

  groupedUpdated.entrySet().forEach(s -> {
    HoodieRecordLocation location = s.getKey();
    String partitionPath = s.getValue().get(0).getPartitionPath();

    Writer logWriter;
    try {
      logWriter = HoodieLogFormat.newWriterBuilder().onParentPath(new Path(basePath, partitionPath))
          .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(location.getFileId())
          .overBaseCommit(location.getCommitTime()).withFs(fs).build();

      Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap();
      header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, location.getCommitTime());
      header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
      logWriter.appendBlock(new HoodieAvroDataBlock(s.getValue().stream().map(r -> {
        try {
          GenericRecord val = (GenericRecord) r.getData().getInsertValue(schema).get();
          HoodieAvroUtils.addHoodieKeyToRecord(val, r.getRecordKey(), r.getPartitionPath(), "");
          return (IndexedRecord) val;
        } catch (IOException e) {
          return null;
        }
      }).collect(Collectors.toList()), header));
      logWriter.close();
    } catch (Exception e) {
      fail(e.toString());
    }
  });
}
@Override
public JavaRDD<HoodieRecord<T>> tagLocation(JavaRDD<HoodieRecord<T>> recordRDD, JavaSparkContext jsc,
    HoodieTable<T> hoodieTable) {
  // Step 0: cache the input record RDD
  if (config.getBloomIndexUseCaching()) {
    recordRDD.persist(config.getBloomIndexInputStorageLevel());
  }

  // Step 1: Extract out thinner JavaPairRDD of (partitionPath, recordKey)
  JavaPairRDD<String, String> partitionRecordKeyPairRDD = recordRDD
      .mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()));

  // Step 2: Lookup indexes for all the partition/recordKey pairs
  JavaPairRDD<String, String> rowKeyFilenamePairRDD = lookupIndex(partitionRecordKeyPairRDD, jsc, hoodieTable);

  // Step 3: Cache the result, for subsequent stages.
  if (config.getBloomIndexUseCaching()) {
    rowKeyFilenamePairRDD.persist(StorageLevel.MEMORY_AND_DISK_SER());
  }
  if (logger.isDebugEnabled()) {
    long totalTaggedRecords = rowKeyFilenamePairRDD.count();
    logger.debug("Number of update records (ones tagged with a fileID): " + totalTaggedRecords);
  }

  // Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys
  // Cost: 4 sec.
  JavaRDD<HoodieRecord<T>> taggedRecordRDD = tagLocationBacktoRecords(rowKeyFilenamePairRDD, recordRDD);

  if (config.getBloomIndexUseCaching()) {
    recordRDD.unpersist(); // unpersist the input Record RDD
    rowKeyFilenamePairRDD.unpersist();
  }
  return taggedRecordRDD;
}
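The step-4 join above can be pictured with a small, Spark-free sketch: records whose key shows up in the index lookup get tagged with the file they already live in (updates), while the rest stay untagged (inserts). Everything below is a hypothetical illustration, not the tagLocationBacktoRecords implementation.

import java.util.Arrays;
import java.util.List;
import java.util.Map;

public class TagLocationSketch {
  // Hypothetical stand-in for a tagged record: key plus an optional fileId.
  static class TaggedRecord {
    final String key;
    String fileId; // null until the record is tagged; null means "insert"
    TaggedRecord(String key) { this.key = key; }
  }

  public static void main(String[] args) {
    // Result of the index lookup (conceptually, rowKeyFilenamePairRDD): recordKey -> fileId.
    Map<String, String> keyToFile = Map.of("key-1", "file-001");

    List<TaggedRecord> incoming = Arrays.asList(new TaggedRecord("key-1"), new TaggedRecord("key-2"));
    for (TaggedRecord r : incoming) {
      // The "join": a hit in the lookup tags the record with its existing file.
      r.fileId = keyToFile.get(r.key);
    }
    incoming.forEach(r -> System.out.println(
        r.key + " -> " + (r.fileId == null ? "insert" : "update of " + r.fileId)));
  }
}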
@Override
protected void processNextRecord(HoodieRecord<? extends HoodieRecordPayload> hoodieRecord) throws IOException {
  String key = hoodieRecord.getRecordKey();
  if (records.containsKey(key)) {
    // Merge and store the merged record. The HoodieRecordPayload implementation is free to decide what
    // should be done when a delete (empty payload) is encountered before or after an insert/update.
    HoodieRecordPayload combinedValue = records.get(key).getData().preCombine(hoodieRecord.getData());
    records.put(key, new HoodieRecord<>(new HoodieKey(key, hoodieRecord.getPartitionPath()), combinedValue));
  } else {
    // Put the record as is
    records.put(key, hoodieRecord);
  }
}
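As a rough illustration of the preCombine contract used above (not Hudi's payload classes), the sketch below keeps whichever payload has the larger ordering value when the same key is seen twice; the Payload type and its timestamp field are made up for this example.

import java.util.HashMap;
import java.util.Map;

public class PreCombineSketch {
  // Hypothetical payload with an ordering field, standing in for a HoodieRecordPayload.
  static class Payload {
    final long ts;
    final String value;
    Payload(long ts, String value) { this.ts = ts; this.value = value; }
    // Keep whichever payload has the larger timestamp, mirroring a typical preCombine.
    Payload preCombine(Payload other) { return other.ts > this.ts ? other : this; }
  }

  public static void main(String[] args) {
    Map<String, Payload> records = new HashMap<>();
    records.put("key-1", new Payload(1L, "old"));

    // Same branch as processNextRecord: the key already exists, so merge the two payloads.
    Payload incoming = new Payload(2L, "new");
    records.merge("key-1", incoming, Payload::preCombine);

    System.out.println(records.get("key-1").value); // prints "new"
  }
}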
record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey())).groupByKey().collectAsMap();
assertEquals(map.size(), 2);
List<String> list1 = Lists.newArrayList(map.get("2016/01/31"));