/**
 * Mark write as failed, optionally using given parameters for the purpose of calculating some
 * aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus
 * objects are collected in Spark Driver.
 *
 * @param record deflated {@code HoodieRecord} containing information that uniquely identifies it.
 * @param t the failure cause, recorded against the record's key.
 * @param optionalRecordMetadata optional metadata related to data contained in {@link HoodieRecord}
 *        before deflation.
 */
public void markFailure(HoodieRecord record, Throwable t,
    Optional<Map<String, String>> optionalRecordMetadata) {
  failedRecords.add(record);
  errors.put(record.getKey(), t);
  totalRecords++;
  totalErrorRecords++;
}
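// A minimal usage sketch for markFailure, assuming a driver-side write loop.
// writeWithErrorTracking and doWrite are hypothetical names for illustration;
// only markFailure itself comes from the snippet above.
private void writeWithErrorTracking(HoodieRecord record, WriteStatus writeStatus) {
  try {
    doWrite(record); // hypothetical write call that may throw
  } catch (Throwable t) {
    // no per-record metadata available here, so pass Optional.empty()
    writeStatus.markFailure(record, t, Optional.empty());
  }
}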
@Override
public String getRowKey(@NonNull final RawData rawdata) {
  try {
    return ((HoodieRecord) rawdata.getData()).getKey().getRecordKey();
  } catch (Exception e) {
    log.debug("Not able to extract Hadoop_row_key from RawData");
    return DEFAULT_ROW_KEY;
  }
}
private void writeToBuffer(HoodieRecord<T> record) {
  // update the new location of the record, so we know where to find it next
  record.setNewLocation(new HoodieRecordLocation(commitTime, fileId));
  Optional<IndexedRecord> indexedRecord = getIndexedRecord(record);
  if (indexedRecord.isPresent()) {
    recordList.add(indexedRecord.get());
  } else {
    keysToDelete.add(record.getKey());
  }
  numberOfRecords++;
}
@Override
public WriteStatus call(WriteStatus writeStatus) {
  for (HoodieRecord record : writeStatus.getWrittenRecords()) {
    if (!writeStatus.isErrored(record.getKey())) {
      HoodieKey key = record.getKey();
      java.util.Optional<HoodieRecordLocation> newLocation = record.getNewLocation();
      if (newLocation.isPresent()) {
        recordLocationMap.put(key, newLocation.get());
      } else {
        // Delete existing index for a deleted record
        recordLocationMap.remove(key);
      }
    }
  }
  return writeStatus;
}
@Override
public String getErrorSourceData(@NonNull final ErrorData errorData) {
  try {
    final HoodieRecord<HoodieRecordPayload> payload =
        (HoodieRecord) errorData.getRawData().getData();
    return String.format("%s. HoodieRecordPayload %s",
        payload.getKey().toString(), payload.getData().toString());
  } catch (Exception e) {
    log.debug("Not able to extract error source data from ErrorData");
    return DEFAULT_ERROR_SOURCE_DATA;
  }
}
private JavaRDD<HoodieRecord<T>> partition(JavaRDD<HoodieRecord<T>> dedupedRecords,
    Partitioner partitioner) {
  return dedupedRecords
      .mapToPair(record -> new Tuple2<>(
          new Tuple2<>(record.getKey(), Option.apply(record.getCurrentLocation())), record))
      .partitionBy(partitioner)
      .map(Tuple2::_2);
}
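// Hedged sketch of a Partitioner compatible with the pair key built above.
// Hashing on the HoodieKey alone is an assumption for illustration; it is not
// Hudi's actual UpsertPartitioner logic.
class KeyHashPartitioner extends org.apache.spark.Partitioner {
  private final int numPartitions;

  KeyHashPartitioner(int numPartitions) {
    this.numPartitions = numPartitions;
  }

  @Override
  public int numPartitions() {
    return numPartitions;
  }

  @Override
  public int getPartition(Object key) {
    // key is the Tuple2<HoodieKey, Option<HoodieRecordLocation>> built in partition()
    HoodieKey hoodieKey = ((Tuple2<HoodieKey, scala.Option<HoodieRecordLocation>>) key)._1();
    return Math.floorMod(hoodieKey.hashCode(), numPartitions);
  }
}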
public List<HoodieRecord> generateUpdates(String commitTime, List<HoodieRecord> baseRecords)
    throws IOException {
  List<HoodieRecord> updates = new ArrayList<>();
  for (HoodieRecord baseRecord : baseRecords) {
    HoodieRecord record = generateUpdateRecord(baseRecord.getKey(), commitTime);
    updates.add(record);
  }
  return updates;
}
public HoodieRecord generateDeleteRecord(HoodieRecord existingRecord) throws IOException {
  HoodieKey key = existingRecord.getKey();
  return generateDeleteRecord(key);
}
public List<HoodieRecord> generateSameKeyInserts(String commitTime, List<HoodieRecord> origin)
    throws IOException {
  List<HoodieRecord> copy = new ArrayList<>();
  for (HoodieRecord r : origin) {
    HoodieKey key = r.getKey();
    HoodieRecord record = new HoodieRecord(key, generateRandomValue(key, commitTime));
    copy.add(record);
  }
  return copy;
}
/**
 * Deduplicate Hoodie records, using the given deduplication function.
 */
JavaRDD<HoodieRecord<T>> deduplicateRecords(JavaRDD<HoodieRecord<T>> records, int parallelism) {
  boolean isIndexingGlobal = index.isGlobal();
  return records
      .mapToPair(record -> {
        HoodieKey hoodieKey = record.getKey();
        // If the index is global, records are expected to differ in their partitionPath,
        // so dedupe on the record key alone.
        Object key = isIndexingGlobal ? hoodieKey.getRecordKey() : hoodieKey;
        return new Tuple2<>(key, record);
      })
      .reduceByKey((rec1, rec2) -> {
        @SuppressWarnings("unchecked")
        T reducedData = (T) rec1.getData().preCombine(rec2.getData());
        // we cannot allow the user to change the key or partitionPath, since that will
        // affect everything, so pick it from one of the records.
        return new HoodieRecord<T>(rec1.getKey(), reducedData);
      }, parallelism)
      .map(Tuple2::_2);
}
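// Hedged sketch of the preCombine contract deduplicateRecords relies on: when two
// records share a key, preCombine decides the survivor. This toy payload is an
// assumption for illustration only; real writes use payloads like HoodieAvroPayload.
class LatestWinsPayload implements HoodieRecordPayload<LatestWinsPayload> {
  private final GenericRecord record;
  private final long orderingValue; // e.g. an event timestamp

  LatestWinsPayload(GenericRecord record, long orderingValue) {
    this.record = record;
    this.orderingValue = orderingValue;
  }

  @Override
  public LatestWinsPayload preCombine(LatestWinsPayload another) {
    // keep whichever side carries the larger ordering value
    return this.orderingValue >= another.orderingValue ? this : another;
  }

  @Override
  public Optional<IndexedRecord> combineAndGetUpdateValue(IndexedRecord current, Schema schema) {
    return Optional.of(record);
  }

  @Override
  public Optional<IndexedRecord> getInsertValue(Schema schema) {
    return Optional.of(record);
  }
}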
public void writeRecordsAndErrors(@NonNull final HoodieWriteResult result,
    final boolean isErrorTableEnabled) {
  try {
    if (result.getException().isPresent()) {
      throw result.getException().get();
    }
    if (result.getWriteStatuses().isPresent() && isErrorTableEnabled) {
      // TODO: Can we make this more readable, please?
      final JavaRDD<Tuple2<HoodieRecord, String>> hoodieRecordAndErrorTupleRDD =
          result.getWriteStatuses().get()
              .flatMap(ws -> ws.getFailedRecords().stream()
                  .map(fr -> new Tuple2<>(fr, ws.getErrors().get(fr.getKey()).getMessage()))
                  .iterator());
      final JavaRDD<ErrorData> errorRDD = hoodieRecordAndErrorTupleRDD
          .map(r -> new ErrorData(r._2, RawDataHelper.getRawData(r._1)));
      ErrorTableUtil.writeErrorRecordsToErrorTable(this.jsc.sc(), this.hoodieConf.getConf(),
          Optional.of(this.hoodieConf.getTableName()), new RDDWrapper<>(errorRDD),
          new HoodieSinkErrorExtractor());
    }
  } catch (HoodieInsertException | HoodieUpsertException e) {
    log.error("Error writing to hoodie", e);
    throw new JobRuntimeException("hoodie write failed: "
        + (result.getWriteStatuses().isPresent() ? result.getWriteStatuses().get().count() : -1),
        e);
  } catch (Exception e) {
    throw new JobRuntimeException("Error writing to hoodie", e);
  }
}
public static <T extends HoodieRecordPayload> JavaRDD<HoodieRecord<T>> combineRecords(
    final JavaRDD<HoodieRecord<T>> records,
    final Function<HoodieRecord<T>, Object> recordKeyFunc,
    final int parallelism) {
  return records
      .mapToPair(record -> new Tuple2<>(recordKeyFunc.call(record), record))
      .reduceByKey((rec1, rec2) -> {
        @SuppressWarnings("unchecked")
        T reducedData = (T) rec1.getData().preCombine(rec2.getData());
        return new HoodieRecord<T>(rec1.getKey(), reducedData);
      }, parallelism)
      .map(Tuple2::_2);
}
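// Hedged usage sketch for combineRecords: the key function sets the dedup scope.
// inputRecords and parallelism are assumed to be in scope. Keying on the record
// key alone (as here) combines duplicates across partition paths, while keying
// on the full HoodieKey would combine only within a partition.
JavaRDD<HoodieRecord<HoodieAvroPayload>> combined =
    combineRecords(inputRecords, rec -> rec.getKey().getRecordKey(), parallelism);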
public static List<HoodieRecord> updateHoodieTestRecordsWithoutHoodieMetadata(
    List<HoodieRecord> oldRecords, Schema schema, String fieldNameToUpdate, String newValue)
    throws IOException, URISyntaxException {
  return oldRecords.stream()
      .map(r -> {
        try {
          GenericRecord rec = (GenericRecord) r.getData().getInsertValue(schema).get();
          rec.put(fieldNameToUpdate, newValue);
          return new HoodieRecord<>(r.getKey(), new HoodieAvroPayload(Optional.of(rec)));
        } catch (IOException io) {
          throw new HoodieIOException("unable to get data from hoodie record", io);
        }
      })
      .collect(Collectors.toList());
}
/**
 * Tag the (rowKey, filename) pairs back onto the original HoodieRecord RDD.
 */
private JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
    JavaPairRDD<String, String> rowKeyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) {
  JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD = recordRDD
      .mapToPair(record -> new Tuple2<>(record.getRecordKey(), record));

  // recordRDD may contain more entries than rowKeyFilenamePairRDD (some row keys have a
  // null fileId), so we use a left outer join.
  return rowKeyRecordPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).values().map(v1 -> {
    HoodieRecord<T> record = v1._1();
    if (v1._2().isPresent()) {
      String filename = v1._2().get();
      if (filename != null && !filename.isEmpty()) {
        // When a record exists in multiple files of the same partition, rowKeyRecordPairRDD
        // holds two entries pointing at the same in-memory HoodieRecord, joined against the
        // two filenames. Setting currentLocation twice on the same object would fail the
        // second time, so make a fresh in-memory copy before re-tagging.
        if (record.getCurrentLocation() != null) {
          record = new HoodieRecord<T>(record.getKey(), record.getData());
        }
        record.setCurrentLocation(new HoodieRecordLocation(
            FSUtils.getCommitTime(filename), FSUtils.getFileId(filename)));
      }
    }
    return record;
  });
}
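// Hedged illustration of the guard above: calling setCurrentLocation twice on the
// same HoodieRecord object fails, so a fresh copy is made before re-tagging. The
// sharedRecord variable, commit time "100", and file ids are assumptions for illustration.
HoodieRecord<T> tagged = sharedRecord;
tagged.setCurrentLocation(new HoodieRecordLocation("100", "file-1")); // first tag succeeds
if (tagged.getCurrentLocation() != null) {
  // copy key + data into a new record whose location is unset
  tagged = new HoodieRecord<T>(tagged.getKey(), tagged.getData());
}
tagged.setCurrentLocation(new HoodieRecordLocation("100", "file-2")); // safe on the copy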
@Test
public void testAvroLogRecordReaderWithInvalidRollback()
    throws IOException, URISyntaxException, InterruptedException {
  Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema());
  // Set a small threshold so that every block is a new version
  Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath)
      .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1")
      .overBaseCommit("100").withFs(fs).build();

  // Write 1
  List<IndexedRecord> records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100);
  Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap();
  header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
  header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
  HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header);
  writer = writer.appendBlock(dataBlock);

  // Write invalid rollback for a failed write (possible for in-flight commits)
  header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "101");
  header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE,
      String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal()));
  HoodieCommandBlock commandBlock = new HoodieCommandBlock(header);
  writer = writer.appendBlock(commandBlock);

  List<String> allLogFiles = FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1",
      HoodieLogFile.DELTA_EXTENSION, "100").map(s -> s.getPath().toString())
      .collect(Collectors.toList());

  HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath,
      allLogFiles, schema, "100", 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH);
  assertEquals("We still would read 100 records", 100, scanner.getTotalLogRecords());
  final List<String> readKeys = new ArrayList<>(100);
  scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey()));
  assertEquals("Stream collect should return all 100 records", 100, readKeys.size());
}
private UpsertPartitioner getUpsertPartitioner(int smallFileSize, int numInserts, int numUpdates,
    int fileSize, String testPartitionPath, boolean autoSplitInserts) throws Exception {
  HoodieWriteConfig config = makeHoodieClientConfigBuilder()
      .withCompactionConfig(HoodieCompactionConfig.newBuilder()
          .compactionSmallFileSize(smallFileSize).insertSplitSize(100)
          .autoTuneInsertSplits(autoSplitInserts).build())
      .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build())
      .build();

  HoodieClientTestUtils.fakeCommitFile(basePath, "001");
  HoodieClientTestUtils.fakeDataFile(basePath, testPartitionPath, "001", "file1", fileSize);
  HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
  HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);

  HoodieTestDataGenerator dataGenerator =
      new HoodieTestDataGenerator(new String[]{testPartitionPath});
  List<HoodieRecord> insertRecords = dataGenerator.generateInserts("001", numInserts);
  List<HoodieRecord> updateRecords = dataGenerator.generateUpdates("001", numUpdates);
  for (HoodieRecord updateRec : updateRecords) {
    updateRec.setCurrentLocation(new HoodieRecordLocation("001", "file1"));
  }
  List<HoodieRecord> records = new ArrayList<>();
  records.addAll(insertRecords);
  records.addAll(updateRecords);

  WorkloadProfile profile = new WorkloadProfile(jsc.parallelize(records));
  HoodieCopyOnWriteTable.UpsertPartitioner partitioner =
      (HoodieCopyOnWriteTable.UpsertPartitioner) table.getUpsertPartitioner(profile);
  assertEquals("Update record should have gone to the 1 update partition", 0,
      partitioner.getPartition(new Tuple2<>(updateRecords.get(0).getKey(),
          Option.apply(updateRecords.get(0).getCurrentLocation()))));
  return partitioner;
}