public HoodieInsertValueGenResult(T record, Schema schema) {
  this.record = record;
  try {
    this.insertValue = record.getData().getInsertValue(schema);
  } catch (Exception e) {
    this.exception = Optional.of(e);
  }
}
/**
 * Perform the actual writing of the given record into the backing file.
 */
public void write(HoodieRecord record, Optional<IndexedRecord> avroRecord, Optional<Exception> exception) {
  Optional recordMetadata = record.getData().getMetadata();
  if (exception.isPresent() && exception.get() instanceof Throwable) {
    // Not throwing exception from here, since we don't want to fail the entire job for a single record
    writeStatus.markFailure(record, exception.get(), recordMetadata);
    logger.error("Error writing record " + record, exception.get());
  } else {
    write(record, avroRecord);
  }
}
@Override
public String getErrorSourceData(@NonNull final ErrorData errorData) {
  try {
    HoodieRecord<HoodieRecordPayload> payload = (HoodieRecord) errorData.getRawData().getData();
    String data = String.format("%s. %s", payload.getKey().toString(),
        String.format("HoodieRecordPayload %s", payload.getData().toString()));
    return data;
  } catch (Exception e) {
    log.debug("Not able to extract Error source data from ErrorData");
    return DEFAULT_ERROR_SOURCE_DATA;
  }
}
/**
 * Transformer function to help transform a HoodieRecord. This transformer is used by BufferedIterator to offload
 * some expensive operations of transformation to the reader thread.
 */
static <T extends HoodieRecordPayload> Function<HoodieRecord<T>, Tuple2<HoodieRecord<T>, Optional<IndexedRecord>>>
    getTransformFunction(Schema schema) {
  return hoodieRecord -> {
    try {
      return new Tuple2<HoodieRecord<T>, Optional<IndexedRecord>>(hoodieRecord,
          hoodieRecord.getData().getInsertValue(schema));
    } catch (IOException e) {
      throw new HoodieException(e);
    }
  };
}
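// Hedged usage sketch (not from the source): shows how a reader thread might apply the transform so the
// Avro conversion happens before a record reaches the writer. Assumes java.util.function.Function
// (consistent with the lambda above wrapping IOException in an unchecked HoodieException); "MyPayload",
// "schema", and "recordIterator" are illustrative placeholders for the caller's own types and state.
Function<HoodieRecord<MyPayload>, Tuple2<HoodieRecord<MyPayload>, Optional<IndexedRecord>>> transform =
    getTransformFunction(schema);
while (recordIterator.hasNext()) {
  Tuple2<HoodieRecord<MyPayload>, Optional<IndexedRecord>> pair = transform.apply(recordIterator.next());
  // pair._2() already holds the materialized insert value, so the writer thread avoids that cost
}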
/**
 * Writes all records passed.
 */
public void write() {
  try {
    while (recordIterator.hasNext()) {
      HoodieRecord<T> record = recordIterator.next();
      write(record, record.getData().getInsertValue(schema));
    }
  } catch (IOException io) {
    throw new HoodieInsertException("Failed to insert records for path " + getStorageWriterPath(), io);
  }
}
public static Optional<String> convertToString(HoodieRecord record) {
  try {
    String str = ((TestRawTripPayload) record.getData()).getJsonData();
    str = "{" + str.substring(str.indexOf("\"timestamp\":"));
    return Optional.of(str.replaceAll("}", ", \"partition\": \"" + record.getPartitionPath() + "\"}"));
  } catch (IOException e) {
    return Optional.empty();
  }
}
private boolean writeRecord(HoodieRecord<T> hoodieRecord, Optional<IndexedRecord> indexedRecord) {
  Optional recordMetadata = hoodieRecord.getData().getMetadata();
  try {
    if (indexedRecord.isPresent()) {
      storageWriter.writeAvroWithMetadata(indexedRecord.get(), hoodieRecord);
      recordsWritten++;
    } else {
      recordsDeleted++;
    }
    writeStatus.markSuccess(hoodieRecord, recordMetadata);
    // Deflate the record payload after recording success, so users can still access the payload
    // while the record is being marked successful.
    hoodieRecord.deflate();
    return true;
  } catch (Exception e) {
    logger.error("Error writing record " + hoodieRecord, e);
    writeStatus.markFailure(hoodieRecord, e, recordMetadata);
  }
  return false;
}
public static <T extends HoodieRecordPayload> JavaRDD<HoodieRecord<T>> combineRecords(
    final JavaRDD<HoodieRecord<T>> records, final Function<HoodieRecord<T>, Object> recordKeyFunc,
    final int parallelism) {
  return records
      .mapToPair(record -> new Tuple2<>(recordKeyFunc.call(record), record))
      .reduceByKey((rec1, rec2) -> {
        @SuppressWarnings("unchecked")
        T reducedData = (T) rec1.getData().preCombine(rec2.getData());
        return new HoodieRecord<T>(rec1.getKey(), reducedData);
      }, parallelism)
      .map(recordTuple -> recordTuple._2());
}
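// Hedged usage sketch (not from the source): collapses records that share a record key before a write,
// letting each payload's preCombine() decide which survives. "inputRecords", "MyPayload", and the
// parallelism of 2 are illustrative placeholders.
JavaRDD<HoodieRecord<MyPayload>> deduped =
    combineRecords(inputRecords, rec -> rec.getRecordKey(), 2);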
@Override
public void write(HoodieRecord record, Optional<IndexedRecord> insertValue) {
  Optional recordMetadata = record.getData().getMetadata();
  try {
    init(record);
    flushToDiskIfRequired(record);
    writeToBuffer(record);
  } catch (Throwable t) {
    // Not throwing exception from here, since we don't want to fail the entire job
    // for a single record
    writeStatus.markFailure(record, t, recordMetadata);
    logger.error("Error writing record " + record, t);
  }
}
@Override
protected void processNextRecord(HoodieRecord<? extends HoodieRecordPayload> hoodieRecord) throws IOException {
  String key = hoodieRecord.getRecordKey();
  if (records.containsKey(key)) {
    // Merge and store the merged record. The HoodieRecordPayload implementation is free to decide what should be
    // done when a delete (empty payload) is encountered before or after an insert/update.
    HoodieRecordPayload combinedValue = records.get(key).getData().preCombine(hoodieRecord.getData());
    records.put(key, new HoodieRecord<>(new HoodieKey(key, hoodieRecord.getPartitionPath()), combinedValue));
  } else {
    // Put the record as is
    records.put(key, hoodieRecord);
  }
}
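// Hedged illustration (not the real HoodieRecordPayload interface): the usual preCombine contract is
// "pick one payload when two share a key", typically the one with the larger ordering value, with an
// empty payload standing in for a delete. Class and field names here are hypothetical.
static final class OrderedPayload {
  final long orderingVal;   // e.g. an event timestamp
  final byte[] avroBytes;   // serialized record content; empty can represent a delete

  OrderedPayload(long orderingVal, byte[] avroBytes) {
    this.orderingVal = orderingVal;
    this.avroBytes = avroBytes;
  }

  OrderedPayload preCombine(OrderedPayload other) {
    // Latest-wins resolution: keep whichever payload carries the higher ordering value.
    return other.orderingVal > this.orderingVal ? other : this;
  }
}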
/**
 * Deduplicate Hoodie records, using the given deduplication function.
 */
JavaRDD<HoodieRecord<T>> deduplicateRecords(JavaRDD<HoodieRecord<T>> records, int parallelism) {
  boolean isIndexingGlobal = index.isGlobal();
  return records
      .mapToPair(record -> {
        HoodieKey hoodieKey = record.getKey();
        // If the index used is global, then records are expected to differ in their partitionPath
        Object key = isIndexingGlobal ? hoodieKey.getRecordKey() : hoodieKey;
        return new Tuple2<>(key, record);
      })
      .reduceByKey((rec1, rec2) -> {
        @SuppressWarnings("unchecked")
        T reducedData = (T) rec1.getData().preCombine(rec2.getData());
        // We cannot allow the user to change the key or partitionPath, since that would affect everything,
        // so pick it from one of the records.
        return new HoodieRecord<T>(rec1.getKey(), reducedData);
      }, parallelism).map(Tuple2::_2);
}
/**
 * Deduplicate Hoodie records, using the given deduplication function.
 */
JavaRDD<HoodieRecord<T>> deduplicateRecords(JavaRDD<HoodieRecord<T>> records, int parallelism) {
  boolean isIndexingGlobal = index.isGlobal();
  return records
      .mapToPair(record -> {
        HoodieKey hoodieKey = record.getKey();
        // If the index used is global, then records are expected to differ in their partitionPath
        Object key = isIndexingGlobal ? hoodieKey.getRecordKey() : hoodieKey;
        return new Tuple2<>(key, record);
      })
      .reduceByKey((rec1, rec2) -> {
        @SuppressWarnings("unchecked")
        T reducedData = (T) rec1.getData().preCombine(rec2.getData());
        // We cannot allow the user to change the key or partitionPath, since that would affect everything,
        // so pick it from one of the records.
        return new HoodieRecord<T>(rec1.getKey(), reducedData);
      }, parallelism).map(recordTuple -> recordTuple._2());
}
/**
 * Perform the actual writing of the given record into the backing file.
 */
public void write(HoodieRecord record, Optional<IndexedRecord> avroRecord) {
  Optional recordMetadata = record.getData().getMetadata();
  try {
    if (avroRecord.isPresent()) {
      storageWriter.writeAvroWithMetadata(avroRecord.get(), record);
      // Update the new location of the record, so we know where to find it next
      record.setNewLocation(new HoodieRecordLocation(commitTime, writeStatus.getFileId()));
      recordsWritten++;
      insertRecordsWritten++;
    } else {
      recordsDeleted++;
    }
    writeStatus.markSuccess(record, recordMetadata);
    // Deflate the record payload after recording success, so users can still access the payload
    // while the record is being marked successful.
    record.deflate();
  } catch (Throwable t) {
    // Not throwing exception from here, since we don't want to fail the entire job
    // for a single record
    writeStatus.markFailure(record, t, recordMetadata);
    logger.error("Error writing record " + record, t);
  }
}
/**
 * Perform the actual writing of the given record into the backing file.
 */
public void write(HoodieRecord record, Optional<IndexedRecord> avroRecord) {
  Optional recordMetadata = record.getData().getMetadata();
  try {
    if (avroRecord.isPresent()) {
      storageWriter.writeAvroWithMetadata(avroRecord.get(), record);
      // Update the new location of the record, so we know where to find it next
      record.setNewLocation(new HoodieRecordLocation(commitTime, status.getFileId()));
      recordsWritten++;
      insertRecordsWritten++;
    } else {
      recordsDeleted++;
    }
    status.markSuccess(record, recordMetadata);
    // Deflate the record payload after recording success, so users can still access the payload
    // while the record is being marked successful.
    record.deflate();
  } catch (Throwable t) {
    // Not throwing exception from here, since we don't want to fail the entire job
    // for a single record
    status.markFailure(record, t, recordMetadata);
    logger.error("Error writing record " + record, t);
  }
}
public static List<HoodieRecord> updateHoodieTestRecordsWithoutHoodieMetadata(List<HoodieRecord> oldRecords,
    Schema schema, String fieldNameToUpdate, String newValue) throws IOException, URISyntaxException {
  return oldRecords
      .stream()
      .map(r -> {
        try {
          GenericRecord rec = (GenericRecord) r.getData().getInsertValue(schema).get();
          rec.put(fieldNameToUpdate, newValue);
          return new HoodieRecord<>(r.getKey(), new HoodieAvroPayload(Optional.of(rec)));
        } catch (IOException io) {
          throw new HoodieIOException("unable to get data from hoodie record", io);
        }
      }).collect(Collectors.toList());
}
public static void writeRecordsToLogFiles(FileSystem fs, String basePath, Schema schema,
    List<HoodieRecord> updatedRecords) {
  Map<HoodieRecordLocation, List<HoodieRecord>> groupedUpdated =
      updatedRecords.stream().collect(Collectors.groupingBy(HoodieRecord::getCurrentLocation));

  groupedUpdated.entrySet().forEach(s -> {
    HoodieRecordLocation location = s.getKey();
    String partitionPath = s.getValue().get(0).getPartitionPath();

    Writer logWriter;
    try {
      logWriter = HoodieLogFormat.newWriterBuilder().onParentPath(new Path(basePath, partitionPath))
          .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(location.getFileId())
          .overBaseCommit(location.getCommitTime()).withFs(fs).build();

      Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap();
      header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, location.getCommitTime());
      header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
      logWriter.appendBlock(new HoodieAvroDataBlock(s.getValue().stream().map(r -> {
        try {
          GenericRecord val = (GenericRecord) r.getData().getInsertValue(schema).get();
          HoodieAvroUtils.addHoodieKeyToRecord(val, r.getRecordKey(), r.getPartitionPath(), "");
          return (IndexedRecord) val;
        } catch (IOException e) {
          return null;
        }
      }).collect(Collectors.toList()), header));
      logWriter.close();
    } catch (Exception e) {
      fail(e.toString());
    }
  });
}
/**
 * Tag the <rowKey, filename> back to the original HoodieRecord RDD.
 */
private JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
    JavaPairRDD<String, String> rowKeyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) {
  JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD = recordRDD
      .mapToPair(record -> new Tuple2<>(record.getRecordKey(), record));

  // Since the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null),
  // we do a left outer join.
  return rowKeyRecordPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).values().map(v1 -> {
    HoodieRecord<T> record = v1._1();
    if (v1._2().isPresent()) {
      String filename = v1._2().get();
      if (filename != null && !filename.isEmpty()) {
        // When a record appears in multiple files in the same partition, rowKeyRecordPairRDD will contain two
        // entries with the same in-memory copy of the HoodieRecord and the two separate filenames the record is
        // found in. Setting currentLocation twice would fail the second time, so this check creates a new
        // in-memory copy of the hoodie record before re-tagging it.
        if (record.getCurrentLocation() != null) {
          record = new HoodieRecord<T>(record.getKey(), record.getData());
        }
        record.setCurrentLocation(new HoodieRecordLocation(FSUtils.getCommitTime(filename),
            FSUtils.getFileId(filename)));
      }
    }
    return record;
  });
}