/**
 * Mark write as failed, optionally using given parameters for the purpose of calculating some
 * aggregate metrics. This method is not meant to cache passed arguments, since WriteStatus
 * objects are collected in Spark Driver.
 *
 * @param record deflated {@code HoodieRecord} containing information that uniquely identifies it.
 * @param t the failure cause, recorded against the record's key.
 * @param optionalRecordMetadata optional metadata related to data contained in {@link HoodieRecord}
 *        before deflation.
 */
public void markFailure(HoodieRecord record, Throwable t,
    Optional<Map<String, String>> optionalRecordMetadata) {
  failedRecords.add(record);
  errors.put(record.getKey(), t);
  totalRecords++;
  totalErrorRecords++;
}
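// A minimal usage sketch for markFailure, assuming a driver-side write loop.
// writeWithErrorTracking and doWrite are hypothetical names for illustration;
// only markFailure itself comes from the snippet above.
private void writeWithErrorTracking(HoodieRecord record, WriteStatus writeStatus) {
  try {
    doWrite(record); // hypothetical write call that may throw
  } catch (Throwable t) {
    // no per-record metadata available here, so pass Optional.empty()
    writeStatus.markFailure(record, t, Optional.empty());
  }
}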
@Override
public String getRowKey(@NonNull final RawData rawdata) {
  try {
    return ((HoodieRecord) rawdata.getData()).getKey().getRecordKey();
  } catch (Exception e) {
    log.debug("Not able to extract Hadoop_row_key from RawData");
    return DEFAULT_ROW_KEY;
  }
}
private void writeToBuffer(HoodieRecord<T> record) {
  // update the new location of the record, so we know where to find it next
  record.setNewLocation(new HoodieRecordLocation(commitTime, fileId));
  Optional<IndexedRecord> indexedRecord = getIndexedRecord(record);
  if (indexedRecord.isPresent()) {
    recordList.add(indexedRecord.get());
  } else {
    keysToDelete.add(record.getKey());
  }
  numberOfRecords++;
}
@Override
public WriteStatus call(WriteStatus writeStatus) {
  for (HoodieRecord record : writeStatus.getWrittenRecords()) {
    if (!writeStatus.isErrored(record.getKey())) {
      HoodieKey key = record.getKey();
      java.util.Optional<HoodieRecordLocation> newLocation = record.getNewLocation();
      if (newLocation.isPresent()) {
        recordLocationMap.put(key, newLocation.get());
      } else {
        // Delete existing index for a deleted record
        recordLocationMap.remove(key);
      }
    }
  }
  return writeStatus;
}
@Override
public String getErrorSourceData(@NonNull final ErrorData errorData) {
  try {
    final HoodieRecord<HoodieRecordPayload> payload =
        (HoodieRecord) errorData.getRawData().getData();
    return String.format("%s. HoodieRecordPayload %s",
        payload.getKey().toString(), payload.getData().toString());
  } catch (Exception e) {
    log.debug("Not able to extract error source data from ErrorData");
    return DEFAULT_ERROR_SOURCE_DATA;
  }
}
private JavaRDD<HoodieRecord<T>> partition(JavaRDD<HoodieRecord<T>> dedupedRecords,
    Partitioner partitioner) {
  return dedupedRecords
      .mapToPair(record -> new Tuple2<>(
          new Tuple2<>(record.getKey(), Option.apply(record.getCurrentLocation())), record))
      .partitionBy(partitioner)
      .map(Tuple2::_2);
}
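// Hedged sketch of a Partitioner compatible with the pair key built above.
// Hashing on the HoodieKey alone is an assumption for illustration; it is not
// Hudi's actual UpsertPartitioner logic.
class KeyHashPartitioner extends org.apache.spark.Partitioner {
  private final int numPartitions;

  KeyHashPartitioner(int numPartitions) {
    this.numPartitions = numPartitions;
  }

  @Override
  public int numPartitions() {
    return numPartitions;
  }

  @Override
  public int getPartition(Object key) {
    // key is the Tuple2<HoodieKey, Option<HoodieRecordLocation>> built in partition()
    HoodieKey hoodieKey = ((Tuple2<HoodieKey, scala.Option<HoodieRecordLocation>>) key)._1();
    return Math.floorMod(hoodieKey.hashCode(), numPartitions);
  }
}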
public List<HoodieRecord> generateUpdates(String commitTime, List<HoodieRecord> baseRecords)
    throws IOException {
  List<HoodieRecord> updates = new ArrayList<>();
  for (HoodieRecord baseRecord : baseRecords) {
    HoodieRecord record = generateUpdateRecord(baseRecord.getKey(), commitTime);
    updates.add(record);
  }
  return updates;
}
public HoodieRecord generateDeleteRecord(HoodieRecord existingRecord) throws IOException {
  HoodieKey key = existingRecord.getKey();
  return generateDeleteRecord(key);
}
public List<HoodieRecord> generateSameKeyInserts(String commitTime, List<HoodieRecord> origin)
    throws IOException {
  List<HoodieRecord> copy = new ArrayList<>();
  for (HoodieRecord r : origin) {
    HoodieKey key = r.getKey();
    HoodieRecord record = new HoodieRecord(key, generateRandomValue(key, commitTime));
    copy.add(record);
  }
  return copy;
}
/**
 * Deduplicate Hoodie records, using the given deduplication function.
 */
JavaRDD<HoodieRecord<T>> deduplicateRecords(JavaRDD<HoodieRecord<T>> records, int parallelism) {
  boolean isIndexingGlobal = index.isGlobal();
  return records
      .mapToPair(record -> {
        HoodieKey hoodieKey = record.getKey();
        // If the index is global, records are expected to differ in their partitionPath,
        // so dedupe on the record key alone.
        Object key = isIndexingGlobal ? hoodieKey.getRecordKey() : hoodieKey;
        return new Tuple2<>(key, record);
      })
      .reduceByKey((rec1, rec2) -> {
        @SuppressWarnings("unchecked")
        T reducedData = (T) rec1.getData().preCombine(rec2.getData());
        // we cannot allow the user to change the key or partitionPath, since that will
        // affect everything, so pick it from one of the records.
        return new HoodieRecord<T>(rec1.getKey(), reducedData);
      }, parallelism)
      .map(Tuple2::_2);
}
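// Hedged sketch of the preCombine contract deduplicateRecords relies on: when two
// records share a key, preCombine decides the survivor. This toy payload is an
// assumption for illustration only; real writes use payloads like HoodieAvroPayload.
class LatestWinsPayload implements HoodieRecordPayload<LatestWinsPayload> {
  private final GenericRecord record;
  private final long orderingValue; // e.g. an event timestamp

  LatestWinsPayload(GenericRecord record, long orderingValue) {
    this.record = record;
    this.orderingValue = orderingValue;
  }

  @Override
  public LatestWinsPayload preCombine(LatestWinsPayload another) {
    // keep whichever side carries the larger ordering value
    return this.orderingValue >= another.orderingValue ? this : another;
  }

  @Override
  public Optional<IndexedRecord> combineAndGetUpdateValue(IndexedRecord current, Schema schema) {
    return Optional.of(record);
  }

  @Override
  public Optional<IndexedRecord> getInsertValue(Schema schema) {
    return Optional.of(record);
  }
}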
public void writeRecordsAndErrors(@NonNull final HoodieWriteResult result,
    final boolean isErrorTableEnabled) {
  try {
    if (result.getException().isPresent()) {
      throw result.getException().get();
    }
    if (result.getWriteStatuses().isPresent() && isErrorTableEnabled) {
      // TODO: Can we make this more readable, please?
      final JavaRDD<Tuple2<HoodieRecord, String>> hoodieRecordAndErrorTupleRDD =
          result.getWriteStatuses().get()
              .flatMap(ws -> ws.getFailedRecords().stream()
                  .map(fr -> new Tuple2<>(fr, ws.getErrors().get(fr.getKey()).getMessage()))
                  .iterator());
      final JavaRDD<ErrorData> errorRDD = hoodieRecordAndErrorTupleRDD
          .map(r -> new ErrorData(r._2, RawDataHelper.getRawData(r._1)));
      ErrorTableUtil.writeErrorRecordsToErrorTable(this.jsc.sc(), this.hoodieConf.getConf(),
          Optional.of(this.hoodieConf.getTableName()), new RDDWrapper<>(errorRDD),
          new HoodieSinkErrorExtractor());
    }
  } catch (HoodieInsertException | HoodieUpsertException e) {
    log.error("Error writing to hoodie", e);
    throw new JobRuntimeException("hoodie write failed: "
        + (result.getWriteStatuses().isPresent() ? result.getWriteStatuses().get().count() : -1),
        e);
  } catch (Exception e) {
    throw new JobRuntimeException("Error writing to hoodie", e);
  }
}
public static <T extends HoodieRecordPayload> JavaRDD<HoodieRecord<T>> combineRecords(
    final JavaRDD<HoodieRecord<T>> records,
    final Function<HoodieRecord<T>, Object> recordKeyFunc,
    final int parallelism) {
  return records
      .mapToPair(record -> new Tuple2<>(recordKeyFunc.call(record), record))
      .reduceByKey((rec1, rec2) -> {
        @SuppressWarnings("unchecked")
        T reducedData = (T) rec1.getData().preCombine(rec2.getData());
        return new HoodieRecord<T>(rec1.getKey(), reducedData);
      }, parallelism)
      .map(Tuple2::_2);
}
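// Hedged usage sketch for combineRecords: the key function sets the dedup scope.
// inputRecords and parallelism are assumed to be in scope. Keying on the record
// key alone (as here) combines duplicates across partition paths, while keying
// on the full HoodieKey would combine only within a partition.
JavaRDD<HoodieRecord<HoodieAvroPayload>> combined =
    combineRecords(inputRecords, rec -> rec.getKey().getRecordKey(), parallelism);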
public static List<HoodieRecord> updateHoodieTestRecordsWithoutHoodieMetadata(
    List<HoodieRecord> oldRecords, Schema schema, String fieldNameToUpdate, String newValue)
    throws IOException, URISyntaxException {
  return oldRecords.stream()
      .map(r -> {
        try {
          GenericRecord rec = (GenericRecord) r.getData().getInsertValue(schema).get();
          rec.put(fieldNameToUpdate, newValue);
          return new HoodieRecord<>(r.getKey(), new HoodieAvroPayload(Optional.of(rec)));
        } catch (IOException io) {
          throw new HoodieIOException("unable to get data from hoodie record", io);
        }
      })
      .collect(Collectors.toList());
}
/**
 * Tag the (rowKey, filename) pairs back onto the original HoodieRecord RDD.
 */
private JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
    JavaPairRDD<String, String> rowKeyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) {
  JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD = recordRDD
      .mapToPair(record -> new Tuple2<>(record.getRecordKey(), record));

  // recordRDD may contain more entries than rowKeyFilenamePairRDD (some row keys have a
  // null fileId), so we use a left outer join.
  return rowKeyRecordPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).values().map(v1 -> {
    HoodieRecord<T> record = v1._1();
    if (v1._2().isPresent()) {
      String filename = v1._2().get();
      if (filename != null && !filename.isEmpty()) {
        // When a record exists in multiple files of the same partition, rowKeyRecordPairRDD
        // holds two entries pointing at the same in-memory HoodieRecord, joined against the
        // two filenames. Setting currentLocation twice on the same object would fail the
        // second time, so make a fresh in-memory copy before re-tagging.
        if (record.getCurrentLocation() != null) {
          record = new HoodieRecord<T>(record.getKey(), record.getData());
        }
        record.setCurrentLocation(new HoodieRecordLocation(
            FSUtils.getCommitTime(filename), FSUtils.getFileId(filename)));
      }
    }
    return record;
  });
}
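// Hedged illustration of the guard above: calling setCurrentLocation twice on the
// same HoodieRecord object fails, so a fresh copy is made before re-tagging. The
// sharedRecord variable, commit time "100", and file ids are assumptions for illustration.
HoodieRecord<T> tagged = sharedRecord;
tagged.setCurrentLocation(new HoodieRecordLocation("100", "file-1")); // first tag succeeds
if (tagged.getCurrentLocation() != null) {
  // copy key + data into a new record whose location is unset
  tagged = new HoodieRecord<T>(tagged.getKey(), tagged.getData());
}
tagged.setCurrentLocation(new HoodieRecordLocation("100", "file-2")); // safe on the copy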
@Test
public void testAvroLogRecordReaderWithInvalidRollback()
    throws IOException, URISyntaxException, InterruptedException {
  Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema());
  // Set a small threshold so that every block is a new version
  Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath)
      .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1")
      .overBaseCommit("100").withFs(fs).build();

  // Write 1
  List<IndexedRecord> records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100);
  Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap();
  header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
  header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
  HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header);
  writer = writer.appendBlock(dataBlock);

  // Write invalid rollback for a failed write (possible for in-flight commits)
  header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "101");
  header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE,
      String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal()));
  HoodieCommandBlock commandBlock = new HoodieCommandBlock(header);
  writer = writer.appendBlock(commandBlock);

  List<String> allLogFiles = FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1",
      HoodieLogFile.DELTA_EXTENSION, "100").map(s -> s.getPath().toString())
      .collect(Collectors.toList());

  HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath,
      allLogFiles, schema, "100", 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH);
  assertEquals("We still would read 100 records", 100, scanner.getTotalLogRecords());
  final List<String> readKeys = new ArrayList<>(100);
  scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey()));
  assertEquals("Stream collect should return all 100 records", 100, readKeys.size());
}
private UpsertPartitioner getUpsertPartitioner(int smallFileSize, int numInserts, int numUpdates,
    int fileSize, String testPartitionPath, boolean autoSplitInserts) throws Exception {
  HoodieWriteConfig config = makeHoodieClientConfigBuilder()
      .withCompactionConfig(HoodieCompactionConfig.newBuilder()
          .compactionSmallFileSize(smallFileSize).insertSplitSize(100)
          .autoTuneInsertSplits(autoSplitInserts).build())
      .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build())
      .build();

  HoodieClientTestUtils.fakeCommitFile(basePath, "001");
  HoodieClientTestUtils.fakeDataFile(basePath, testPartitionPath, "001", "file1", fileSize);
  HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
  HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);

  HoodieTestDataGenerator dataGenerator =
      new HoodieTestDataGenerator(new String[]{testPartitionPath});
  List<HoodieRecord> insertRecords = dataGenerator.generateInserts("001", numInserts);
  List<HoodieRecord> updateRecords = dataGenerator.generateUpdates("001", numUpdates);
  for (HoodieRecord updateRec : updateRecords) {
    updateRec.setCurrentLocation(new HoodieRecordLocation("001", "file1"));
  }
  List<HoodieRecord> records = new ArrayList<>();
  records.addAll(insertRecords);
  records.addAll(updateRecords);

  WorkloadProfile profile = new WorkloadProfile(jsc.parallelize(records));
  HoodieCopyOnWriteTable.UpsertPartitioner partitioner =
      (HoodieCopyOnWriteTable.UpsertPartitioner) table.getUpsertPartitioner(profile);
  assertEquals("Update record should have gone to the 1 update partition", 0,
      partitioner.getPartition(new Tuple2<>(updateRecords.get(0).getKey(),
          Option.apply(updateRecords.get(0).getCurrentLocation()))));
  return partitioner;
}