/**
 * Wraps the given avro record in the configured payload class and returns it as a HoodieRecord.
 *
 * @param gr           avro record to wrap
 * @param orderingVal  ordering value handed to the payload for pre-combine resolution
 * @param hKey         hoodie key identifying the record
 * @param payloadClass fully qualified class name of the payload implementation to instantiate
 * @return a new HoodieRecord carrying the instantiated payload
 * @throws IOException if the payload cannot be created
 */
public static HoodieRecord createHoodieRecord(GenericRecord gr, Comparable orderingVal,
    HoodieKey hKey, String payloadClass) throws IOException {
  final HoodieRecordPayload recordPayload = DataSourceUtils.createPayload(payloadClass, gr, orderingVal);
  return new HoodieRecord<>(hKey, recordPayload);
}
}
/**
 * Builds a HoodieRecord for the given avro record, instantiating the requested payload class
 * with the supplied ordering value.
 *
 * @param gr           source avro record
 * @param orderingVal  value used by the payload to order competing versions
 * @param hKey         key identifying the record
 * @param payloadClass fully qualified payload class name
 * @return the assembled HoodieRecord
 * @throws IOException if payload instantiation fails
 */
public static HoodieRecord createHoodieRecord(GenericRecord gr, Comparable orderingVal,
    HoodieKey hKey, String payloadClass) throws IOException {
  return new HoodieRecord<>(hKey, DataSourceUtils.createPayload(payloadClass, gr, orderingVal));
}
/**
 * Converts an avro record that already carries Hudi metadata fields into a HoodieRecord,
 * loading the payload via reflection from the given payload class name.
 *
 * @param rec          avro record containing the Hudi record-key and partition-path metadata fields
 * @param payloadClazz fully qualified payload class name to instantiate reflectively
 * @return the resulting HoodieRecord, cast to the caller's expected type
 */
public static <R> R convertToHoodieRecordPayload(GenericRecord rec, String payloadClazz) {
  // Key and partition path are read back out of the record's own metadata columns.
  final String recordKey = rec.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
  final String partition = rec.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
  HoodieRecord<? extends HoodieRecordPayload> converted = new HoodieRecord<>(
      new HoodieKey(recordKey, partition),
      ReflectionUtils.loadPayload(payloadClazz, new Object[]{Optional.of(rec)}, Optional.class));
  return (R) converted;
}
/**
 * Converts a single Avro payload into exactly one HoodieRecord, wrapped in a ConverterResult.
 * The record key and partition path are derived from the payload itself.
 *
 * @param payload non-null avro payload to convert
 * @return a singleton list holding the converter result for the produced record
 * @throws Exception if key extraction or payload conversion fails
 */
@Override
protected final List<ConverterResult<AvroPayload, HoodieRecord<HoodieRecordPayload>>> convert(
    @NonNull final AvroPayload payload) throws Exception {
  final HoodieKey key = new HoodieKey(getRecordKey(payload), getPartitionPath(payload));
  final HoodieRecord<HoodieRecordPayload> record = new HoodieRecord<>(key, getPayload(payload));
  return Collections.singletonList(new ConverterResult<>(record));
}
/**
 * Creates a HoodieRecord whose payload wraps an empty Optional — i.e. a delete marker —
 * for the given key and partition path, loading the payload class reflectively.
 *
 * @param recKey        record key for the empty record
 * @param partitionPath partition path for the empty record
 * @param payloadClazz  fully qualified payload class name to instantiate
 * @return the empty-payload HoodieRecord, cast to the caller's expected type
 */
public static <R> R generateEmptyPayload(String recKey, String partitionPath, String payloadClazz) {
  HoodieRecord<? extends HoodieRecordPayload> emptyRecord = new HoodieRecord<>(
      new HoodieKey(recKey, partitionPath),
      ReflectionUtils.loadPayload(payloadClazz, new Object[]{Optional.empty()}, Optional.class));
  return (R) emptyRecord;
}
}
/**
 * Generates an update for an existing key: a new HoodieRecord with the same key but a freshly
 * generated random payload tagged with the given commit time.
 *
 * @param key        key of the record being updated
 * @param commitTime commit time embedded in the generated payload
 * @return the update record
 * @throws IOException if random payload generation fails
 */
public HoodieRecord generateUpdateRecord(HoodieKey key, String commitTime) throws IOException { return new HoodieRecord(key, generateRandomValue(key, commitTime)); }
public static List<HoodieRecord<HoodieAvroPayload>> loadFromFile(FileSystem fs, String deltaFilePath, Schema expectedSchema) { List<HoodieRecord<HoodieAvroPayload>> loadedRecords = Lists.newArrayList(); Path path = new Path(deltaFilePath); try { SeekableInput input = new FsInput(path, fs.getConf()); GenericDatumReader<GenericRecord> reader = new GenericDatumReader<>(); // Set the expected schema to be the current schema to account for schema evolution reader.setExpected(expectedSchema); FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader); for (GenericRecord deltaRecord : fileReader) { String key = deltaRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); String partitionPath = deltaRecord.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString(); loadedRecords.add(new HoodieRecord<>(new HoodieKey(key, partitionPath), new HoodieAvroPayload(Optional.of(deltaRecord)))); } fileReader.close(); // also closes underlying FsInput } catch (IOException e) { throw new HoodieIOException("Could not read avro records from path " + deltaFilePath, e); } return loadedRecords; }
/**
 * Produces one new insert per input record, reusing each original record's key but attaching a
 * freshly generated random payload for the given commit time.
 *
 * @param commitTime commit time embedded in the generated payloads
 * @param origin     records whose keys should be duplicated
 * @return new records sharing the original keys, in input order
 * @throws IOException if payload generation fails
 */
public List<HoodieRecord> generateSameKeyInserts(String commitTime, List<HoodieRecord> origin) throws IOException {
  List<HoodieRecord> duplicated = new ArrayList<>(origin.size());
  for (HoodieRecord source : origin) {
    HoodieKey sharedKey = source.getKey();
    duplicated.add(new HoodieRecord(sharedKey, generateRandomValue(sharedKey, commitTime)));
  }
  return duplicated;
}
/**
 * Generates test HoodieRecords that lack Hudi metadata columns: each generated avro record is
 * wrapped as-is, keyed by a random UUID under the fixed partition "0000/00/00".
 *
 * @param from  starting offset passed to the underlying record generator
 * @param limit number of records to generate
 * @return the generated records
 * @throws IOException        if record generation fails
 * @throws URISyntaxException if record generation fails resolving resources
 */
public static List<HoodieRecord> generateHoodieTestRecordsWithoutHoodieMetadata(int from, int limit)
    throws IOException, URISyntaxException {
  return generateTestRecords(from, limit).stream()
      .map(rec -> {
        HoodieKey randomKey = new HoodieKey(UUID.randomUUID().toString(), "0000/00/00");
        return new HoodieRecord<>(randomKey, new HoodieAvroPayload(Optional.of((GenericRecord) rec)));
      })
      .collect(Collectors.toList());
}
/**
 * Generates {@code n} new insert records spread uniformly (at random) across the known
 * partition paths, registering every generated key/partition pair in the existing-keys list so
 * later updates can target them.
 *
 * @param commitTime commit time embedded in the generated payloads
 * @param n          number of inserts to generate
 * @return the generated insert records
 * @throws IOException if payload generation fails
 */
public List<HoodieRecord> generateInserts(String commitTime, Integer n) throws IOException {
  List<HoodieRecord> inserts = new ArrayList<>();
  for (int idx = 0; idx < n; idx++) {
    String partition = partitionPaths[rand.nextInt(partitionPaths.length)];
    HoodieKey hoodieKey = new HoodieKey(UUID.randomUUID().toString(), partition);
    inserts.add(new HoodieRecord(hoodieKey, generateRandomValue(hoodieKey, commitTime)));
    // Remember the key so subsequent update generation can pick it up.
    KeyPartition tracked = new KeyPartition();
    tracked.key = hoodieKey;
    tracked.partitionPath = partition;
    existingKeysList.add(tracked);
  }
  return inserts;
}
/**
 * Builds {@code n} test records, each carrying a small JSON trip payload with a random row key,
 * the given time, and a sequence number.
 *
 * @param n    number of records to build
 * @param time time string embedded in each payload
 * @return the generated records
 * @throws Exception if payload parsing fails
 */
private List<HoodieRecord> newHoodieRecords(int n, String time) throws Exception {
  List<HoodieRecord> generated = new ArrayList<>(n);
  for (int seq = 0; seq < n; seq++) {
    String recordStr = String.format("{\"_row_key\":\"%s\",\"time\":\"%s\",\"number\":%d}",
        UUID.randomUUID().toString(), time, seq);
    TestRawTripPayload tripPayload = new TestRawTripPayload(recordStr);
    HoodieKey key = new HoodieKey(tripPayload.getRowKey(), tripPayload.getPartitionPath());
    generated.add(new HoodieRecord(key, tripPayload));
  }
  return generated;
}
/**
 * Builds a delete record for the given key: the payload wraps an empty Optional and is flagged
 * as a delete.
 *
 * @param key key of the record to delete
 * @return the delete record
 * @throws IOException if payload construction fails
 */
public HoodieRecord generateDeleteRecord(HoodieKey key) throws IOException {
  TestRawTripPayload deletePayload =
      new TestRawTripPayload(Optional.empty(), key.getRecordKey(), key.getPartitionPath(), null, true);
  return new HoodieRecord(key, deletePayload);
}
/**
 * Converts each indexed record into a HoodieRecord tagged with a dummy current location and
 * inserts it into the supplied map (keyed by record key), returning the record keys in input
 * order.
 *
 * @param iRecords avro records carrying Hudi metadata columns
 * @param records  map the converted records are written into, keyed by record key
 * @return record keys of the converted records, in input order
 */
public static List<String> upsertRecords(List<IndexedRecord> iRecords,
    Map<String, HoodieRecord<? extends HoodieRecordPayload>> records) {
  List<String> recordKeys = new ArrayList<>();
  // Plain loop instead of the original stream().forEach: building two outputs (recordKeys and
  // the caller's map) via side effects inside a stream pipeline is an anti-pattern and hid the
  // mutation; the loop also casts each record once instead of twice.
  for (IndexedRecord r : iRecords) {
    GenericRecord genericRecord = (GenericRecord) r;
    String key = genericRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
    String partitionPath = genericRecord.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
    recordKeys.add(key);
    HoodieRecord record = new HoodieRecord<>(new HoodieKey(key, partitionPath),
        new HoodieAvroPayload(Optional.of(genericRecord)));
    record.setCurrentLocation(new HoodieRecordLocation("DUMMY_COMMIT_TIME", "DUMMY_FILE_ID"));
    records.put(key, record);
  }
  return recordKeys;
}
}
/**
 * Combines records that share the same key (as produced by {@code recordKeyFunc}) into a single
 * record by repeatedly applying the payload's preCombine.
 *
 * @param records       records to combine
 * @param recordKeyFunc function extracting the grouping key from a record
 * @param parallelism   reduce parallelism
 * @return one record per distinct key
 */
public static <T extends HoodieRecordPayload> JavaRDD<HoodieRecord<T>> combineRecords(
    final JavaRDD<HoodieRecord<T>> records, final Function<HoodieRecord<T>, Object> recordKeyFunc,
    final int parallelism) {
  return records
      .mapToPair(rec -> new Tuple2<>(recordKeyFunc.call(rec), rec))
      .reduceByKey((left, right) -> {
        // preCombine picks the surviving payload; the key is taken from the left record.
        @SuppressWarnings("unchecked")
        T merged = (T) left.getData().preCombine(right.getData());
        return new HoodieRecord<T>(left.getKey(), merged);
      }, parallelism)
      .map(pair -> pair._2());
}
}
/**
 * Produces updated copies of the given records: each record's avro value is materialized with
 * the supplied schema, one field is overwritten with the new value, and the result is re-wrapped
 * under the original key.
 *
 * @param oldRecords        records to update
 * @param schema            schema used to materialize each record's avro value
 * @param fieldNameToUpdate name of the field to overwrite
 * @param newValue          value written into that field
 * @return updated records, in input order
 * @throws HoodieIOException if a record's data cannot be materialized
 */
public static List<HoodieRecord> updateHoodieTestRecordsWithoutHoodieMetadata(List<HoodieRecord> oldRecords,
    Schema schema, String fieldNameToUpdate, String newValue) throws IOException, URISyntaxException {
  return oldRecords.stream().map(original -> {
    try {
      GenericRecord avroValue = (GenericRecord) original.getData().getInsertValue(schema).get();
      avroValue.put(fieldNameToUpdate, newValue);
      return new HoodieRecord<>(original.getKey(), new HoodieAvroPayload(Optional.of(avroValue)));
    } catch (IOException io) {
      throw new HoodieIOException("unable to get data from hoodie record", io);
    }
  }).collect(Collectors.toList());
}
/**
 * Wraps a single indexed avro record as a HoodieRecord with an avro payload, under the given
 * key and partition path.
 *
 * @param iRecord       avro record to wrap (must be a GenericRecord)
 * @param key           record key
 * @param partitionPath partition path
 * @return the wrapped record
 */
private static HoodieRecord convertToHoodieRecords(IndexedRecord iRecord, String key, String partitionPath) {
  final HoodieKey hoodieKey = new HoodieKey(key, partitionPath);
  final HoodieAvroPayload payload = new HoodieAvroPayload(Optional.of((GenericRecord) iRecord));
  return new HoodieRecord<>(hoodieKey, payload);
}
/** * Deduplicate Hoodie records, using the given deduplication funciton. */ JavaRDD<HoodieRecord<T>> deduplicateRecords(JavaRDD<HoodieRecord<T>> records, int parallelism) { boolean isIndexingGlobal = index.isGlobal(); return records .mapToPair(record -> { HoodieKey hoodieKey = record.getKey(); // If index used is global, then records are expected to differ in their partitionPath Object key = isIndexingGlobal ? hoodieKey.getRecordKey() : hoodieKey; return new Tuple2<>(key, record); }) .reduceByKey((rec1, rec2) -> { @SuppressWarnings("unchecked") T reducedData = (T) rec1.getData() .preCombine(rec2.getData()); // we cannot allow the user to change the key or partitionPath, since that will affect // everything // so pick it from one of the records. return new HoodieRecord<T>(rec1.getKey(), reducedData); }, parallelism).map(Tuple2::_2); }
/** * Deduplicate Hoodie records, using the given deduplication funciton. */ JavaRDD<HoodieRecord<T>> deduplicateRecords(JavaRDD<HoodieRecord<T>> records, int parallelism) { boolean isIndexingGlobal = index.isGlobal(); return records .mapToPair(record -> { HoodieKey hoodieKey = record.getKey(); // If index used is global, then records are expected to differ in their partitionPath Object key = isIndexingGlobal ? hoodieKey.getRecordKey() : hoodieKey; return new Tuple2<>(key, record); }) .reduceByKey((rec1, rec2) -> { @SuppressWarnings("unchecked") T reducedData = (T) rec1.getData() .preCombine(rec2.getData()); // we cannot allow the user to change the key or partitionPath, since that will affect // everything // so pick it from one of the records. return new HoodieRecord<T>(rec1.getKey(), reducedData); }, parallelism).map(recordTuple -> recordTuple._2()); }
@Override protected void processNextRecord(HoodieRecord<? extends HoodieRecordPayload> hoodieRecord) throws IOException { String key = hoodieRecord.getRecordKey(); if (records.containsKey(key)) { // Merge and store the merged record. The HoodieRecordPayload implementation is free to decide what should be // done when a delete (empty payload) is encountered before or after an insert/update. HoodieRecordPayload combinedValue = records.get(key).getData().preCombine(hoodieRecord.getData()); records.put(key, new HoodieRecord<>(new HoodieKey(key, hoodieRecord.getPartitionPath()), combinedValue)); } else { // Put the record as is records.put(key, hoodieRecord); } }
/** * Tag the <rowKey, filename> back to the original HoodieRecord RDD. */ private JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords( JavaPairRDD<String, String> rowKeyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) { JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD = recordRDD .mapToPair(record -> new Tuple2<>(record.getRecordKey(), record)); // Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null), // so we do left outer join. return rowKeyRecordPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).values().map(v1 -> { HoodieRecord<T> record = v1._1(); if (v1._2().isPresent()) { String filename = v1._2().get(); if (filename != null && !filename.isEmpty()) { // When you have a record in multiple files in the same partition, then rowKeyRecordPairRDD will have 2 // entries with the same exact in memory copy of the HoodieRecord and the 2 separate filenames that the // record is found in. This will result in setting currentLocation 2 times and it will fail the second time. // This check will create a new in memory copy of the hoodie record. if (record.getCurrentLocation() != null) { record = new HoodieRecord<T>(record.getKey(), record.getData()); } record.setCurrentLocation(new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename))); } } return record; }); }