/**
 * Tag the <rowKey, filename> pairs back onto the original HoodieRecord RDD.
 */
private JavaRDD<HoodieRecord<T>> tagLocationBacktoRecords(
    JavaPairRDD<String, String> rowKeyFilenamePairRDD, JavaRDD<HoodieRecord<T>> recordRDD) {
  JavaPairRDD<String, HoodieRecord<T>> rowKeyRecordPairRDD = recordRDD
      .mapToPair(record -> new Tuple2<>(record.getRecordKey(), record));

  // Since recordRDD may contain more records than rowKeyFilenamePairRDD (some row keys have a
  // null fileId), we do a left outer join.
  return rowKeyRecordPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).values().map(v1 -> {
    HoodieRecord<T> record = v1._1();
    if (v1._2().isPresent()) {
      String filename = v1._2().get();
      if (filename != null && !filename.isEmpty()) {
        // When a record exists in multiple files within the same partition, rowKeyRecordPairRDD
        // will have two entries sharing the exact same in-memory copy of the HoodieRecord, paired
        // with the two separate filenames the record was found in. That would set currentLocation
        // twice and fail the second time, so this check creates a new in-memory copy of the
        // hoodie record first.
        if (record.getCurrentLocation() != null) {
          record = new HoodieRecord<T>(record.getKey(), record.getData());
        }
        record.setCurrentLocation(
            new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename)));
      }
    }
    return record;
  });
}
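// Toy sketch of the left-outer-join semantics used above, assuming a local JavaSparkContext
// named jsc. Keys "a" and "b" have a filename while "c" does not, so "c" comes back with an
// absent Optional and its record would keep an unset currentLocation. The filenames here are
// illustrative, not real Hoodie file names.
JavaPairRDD<String, String> keyToFilename = jsc.parallelizePairs(Arrays.asList(
    new Tuple2<>("a", "f1_0_20170101000000.parquet"),
    new Tuple2<>("b", "f2_0_20170101000000.parquet")));
JavaPairRDD<String, Integer> keyToRecord = jsc.parallelizePairs(Arrays.asList(
    new Tuple2<>("a", 1), new Tuple2<>("b", 2), new Tuple2<>("c", 3)));
keyToRecord.leftOuterJoin(keyToFilename).values().collect()
    .forEach(v -> System.out.println(v._1() + " -> " + v._2()));
// prints, e.g.: 1 -> Optional(f1_0_20170101000000.parquet) ... 3 -> Optional.empty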
private Optional<IndexedRecord> getIndexedRecord(HoodieRecord<T> hoodieRecord) {
  Optional<Map<String, String>> recordMetadata = hoodieRecord.getData().getMetadata();
  try {
    Optional<IndexedRecord> avroRecord = hoodieRecord.getData().getInsertValue(schema);
    if (avroRecord.isPresent()) {
      String seqId = HoodieRecord.generateSequenceId(commitTime, TaskContext.getPartitionId(),
          recordIndex.getAndIncrement());
      HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) avroRecord.get(),
          hoodieRecord.getRecordKey(), hoodieRecord.getPartitionPath(), fileId);
      HoodieAvroUtils.addCommitMetadataToRecord((GenericRecord) avroRecord.get(), commitTime, seqId);
      if (hoodieRecord.getCurrentLocation() != null) {
        updatedRecordsWritten++;
      } else {
        insertRecordsWritten++; // assumed insert counter, mirroring updatedRecordsWritten
      }
    }
    // Free the memory held by the payload once the Avro record has been extracted.
    hoodieRecord.deflate();
    return avroRecord;
  } catch (Exception e) {
    // Assumed failure handling: record the error against the write status so the record is
    // reported as failed rather than aborting the whole write.
    logger.error("Error writing record " + hoodieRecord, e);
    writeStatus.markFailure(hoodieRecord, e, recordMetadata);
  }
  return Optional.empty();
}
/**
 * Filter out HoodieRecords that already exist in the output folder. This is useful in
 * deduplication.
 *
 * @param hoodieRecords Input RDD of Hoodie records.
 * @return A subset of hoodieRecords RDD, with existing records filtered out.
 */
public JavaRDD<HoodieRecord<T>> filterExists(JavaRDD<HoodieRecord<T>> hoodieRecords) {
  JavaRDD<HoodieRecord<T>> recordsWithLocation = tagLocation(hoodieRecords);
  return recordsWithLocation.filter(v1 -> !v1.isCurrentLocationKnown());
}
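// Hypothetical usage sketch: deduplicate an incoming batch against the dataset before writing.
// "client" (a write client exposing filterExists and bulkInsert), inputRecords and commitTime
// are assumptions for illustration, not names from the snippet above.
JavaRDD<HoodieRecord<TestRawTripPayload>> incoming = jsc.parallelize(inputRecords);
JavaRDD<HoodieRecord<TestRawTripPayload>> newOnly = client.filterExists(incoming);
client.bulkInsert(newOnly, commitTime);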
public HoodieRecordMissingException(HoodieRecord record) {
    super("Record " + record.getRecordKey() + " with partition path " + record.getPartitionPath()
        + " in current location " + record.getCurrentLocation() + " is not found in the partition");
  }
}
private void writeToBuffer(HoodieRecord<T> record) {
  // update the new location of the record, so we know where to find it next
  record.setNewLocation(new HoodieRecordLocation(commitTime, fileId));
  Optional<IndexedRecord> indexedRecord = getIndexedRecord(record);
  if (indexedRecord.isPresent()) {
    recordList.add(indexedRecord.get());
  } else {
    keysToDelete.add(record.getKey());
  }
  numberOfRecords++;
}
private void writeToBuffer(HoodieRecord<T> record) {
  // update the new location of the record, so we know where to find it next
  record.setNewLocation(new HoodieRecordLocation(commitTime, fileId));
  Optional<IndexedRecord> indexedRecord = getIndexedRecord(record);
  if (indexedRecord.isPresent()) {
    recordList.add(indexedRecord.get());
  } else {
    keysToDelete.add(record.getRecordKey());
  }
  numberOfRecords++;
}
HoodieRecord record1 = new HoodieRecord(
    new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
TestRawTripPayload rowChange2 = new TestRawTripPayload(
    "{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
HoodieRecord record2 = new HoodieRecord(
    new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
TestRawTripPayload rowChange3 = new TestRawTripPayload(
    "{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
HoodieRecord record3 = new HoodieRecord(
    new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);

// ... (rowChange4 and rowChange5 declarations elided)
HoodieRecord record4 = new HoodieRecord(
    new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
HoodieRecord record5 = new HoodieRecord(
    new HoodieKey(rowChange5.getRowKey(), rowChange5.getPartitionPath()), rowChange5);

// ...

if (record.getRecordKey().equals("000")) {
  assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename0)));
} else if (record.getRecordKey().equals("001")) {
  assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename2)));
} else if (record.getRecordKey().equals("002")) {
  assertTrue(!record.isCurrentLocationKnown());
} else if (record.getRecordKey().equals("004")) {
  assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename3)));
}
public static <T extends HoodieRecordPayload> JavaRDD<HoodieRecord<T>> combineRecords(
      final JavaRDD<HoodieRecord<T>> records, final Function<HoodieRecord<T>, Object> recordKeyFunc,
      final int parallelism) {
    return records
        .mapToPair(record -> new Tuple2<>(recordKeyFunc.call(record), record))
        .reduceByKey((rec1, rec2) -> {
          @SuppressWarnings("unchecked")
          T reducedData = (T) rec1.getData().preCombine(rec2.getData());
          return new HoodieRecord<T>(rec1.getKey(), reducedData);
        }, parallelism)
        .map(recordTuple -> recordTuple._2());
  }
}
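// Hypothetical usage sketch: collapse in-batch duplicates by record key. For each key,
// reduceByKey above keeps whichever payload preCombine(...) prefers; inputRecords and the
// parallelism of 1 are illustrative.
JavaRDD<HoodieRecord<TestRawTripPayload>> deduped =
    combineRecords(inputRecords, HoodieRecord::getRecordKey, 1);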
statements.add(generateStatement(rec.getRecordKey()));
currentBatchOfRecords.add(rec);

// ...

currentRecord = new HoodieRecord(
    new HoodieKey(currentRecord.getRecordKey(), partitionPath), currentRecord.getData());
currentRecord.setCurrentLocation(new HoodieRecordLocation(commitTs, fileId));
taggedRecords.add(currentRecord);
assert (currentRecord.getRecordKey().contentEquals(keyFromResult));
} else {
  // if the commit is invalid, treat this as a new tagged record
  taggedRecords.add(currentRecord);
@Override
protected void processNextRecord(HoodieRecord<? extends HoodieRecordPayload> hoodieRecord)
    throws IOException {
  String key = hoodieRecord.getRecordKey();
  if (records.containsKey(key)) {
    // Merge and store the merged record. The HoodieRecordPayload implementation is free to decide
    // what should be done when a delete (empty payload) is encountered before or after an
    // insert/update.
    HoodieRecordPayload combinedValue = records.get(key).getData().preCombine(hoodieRecord.getData());
    records.put(key, new HoodieRecord<>(new HoodieKey(key, hoodieRecord.getPartitionPath()),
        combinedValue));
  } else {
    // Put the record as is
    records.put(key, hoodieRecord);
  }
}
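// Illustrative sketch of the preCombine contract the merge above delegates to: keep whichever
// payload carries the larger ordering value. This toy class is an assumption for illustration,
// not a payload from the snippet; a real payload would keep the Avro record in serialized form
// so the class stays Serializable.
public class LatestWinsPayload implements HoodieRecordPayload<LatestWinsPayload> {

  private final GenericRecord record;   // null represents a delete
  private final Comparable orderingVal;

  public LatestWinsPayload(GenericRecord record, Comparable orderingVal) {
    this.record = record;
    this.orderingVal = orderingVal;
  }

  @Override
  public LatestWinsPayload preCombine(LatestWinsPayload another) {
    // The payload decides the winner; here the larger ordering value survives.
    return another.orderingVal.compareTo(this.orderingVal) > 0 ? another : this;
  }

  @Override
  public Optional<IndexedRecord> combineAndGetUpdateValue(IndexedRecord current, Schema schema)
      throws IOException {
    return getInsertValue(schema);
  }

  @Override
  public Optional<IndexedRecord> getInsertValue(Schema schema) throws IOException {
    // An empty Optional signals a delete to the code doing the merge.
    return record == null ? Optional.empty() : Optional.of(record);
  }
}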
try {
  for (HoodieRecord rec : writeStatus.getWrittenRecords()) {
    if (!writeStatus.isErrored(rec.getKey())) {
      java.util.Optional<HoodieRecordLocation> loc = rec.getNewLocation();
      if (loc.isPresent()) {
        if (rec.getCurrentLocation() != null) {
          Put put = new Put(Bytes.toBytes(rec.getRecordKey()));
          put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN,
              Bytes.toBytes(loc.get().getCommitTime()));
          put.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN,
              Bytes.toBytes(loc.get().getFileId()));
          put.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN,
              Bytes.toBytes(rec.getPartitionPath()));
          puts.add(put);
        } else {
          Delete delete = new Delete(Bytes.toBytes(rec.getRecordKey()));
          deletes.add(delete);
public static void writeRecordsToLogFiles(FileSystem fs, String basePath, Schema schema,
    List<HoodieRecord> updatedRecords) {
  Map<HoodieRecordLocation, List<HoodieRecord>> groupedUpdated = updatedRecords.stream()
      .collect(Collectors.groupingBy(HoodieRecord::getCurrentLocation));

  groupedUpdated.entrySet().forEach(s -> {
    HoodieRecordLocation location = s.getKey();
    String partitionPath = s.getValue().get(0).getPartitionPath();

    Writer logWriter;
    try {
      logWriter = HoodieLogFormat.newWriterBuilder().onParentPath(new Path(basePath, partitionPath))
          .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(location.getFileId())
          .overBaseCommit(location.getCommitTime()).withFs(fs).build();

      Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap();
      header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, location.getCommitTime());
      header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
      logWriter.appendBlock(new HoodieAvroDataBlock(s.getValue().stream().map(r -> {
        try {
          GenericRecord val = (GenericRecord) r.getData().getInsertValue(schema).get();
          HoodieAvroUtils.addHoodieKeyToRecord(val, r.getRecordKey(), r.getPartitionPath(), "");
          return (IndexedRecord) val;
        } catch (IOException e) {
          return null;
        }
      }).collect(Collectors.toList()), header));
      logWriter.close();
    } catch (Exception e) {
      fail(e.toString());
    }
  });
}
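// Hypothetical usage sketch for the test helper above: the records must already carry a
// currentLocation (fileId + base commit time), since that is what the helper groups by.
// fs, basePath, avroSchema and taggedUpdates are assumed to exist in the calling test.
writeRecordsToLogFiles(fs, basePath, avroSchema, taggedUpdates);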
public static HoodieRecord createHoodieRecord(GenericRecord gr, Comparable orderingVal,
    HoodieKey hKey, String payloadClass) throws IOException {
  HoodieRecordPayload payload = DataSourceUtils.createPayload(payloadClass, gr, orderingVal);
  return new HoodieRecord<>(hKey, payload);
}
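// Hypothetical usage sketch: wrap an Avro GenericRecord into a HoodieRecord. The field names
// ("_row_key", "partition", "timestamp") and the payload class name are assumptions for
// illustration; any HoodieRecordPayload with a (GenericRecord, Comparable) constructor works.
HoodieKey key = new HoodieKey(gr.get("_row_key").toString(), gr.get("partition").toString());
HoodieRecord record = createHoodieRecord(gr, (Comparable) gr.get("timestamp"), key,
    "com.uber.hoodie.OverwriteWithLatestAvroPayload");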
/**
 * Assert that there are no duplicate keys at the partition level.
 *
 * @param records List of Hoodie records
 */
void assertNodupesWithinPartition(List<HoodieRecord> records) {
  Map<String, Set<String>> partitionToKeys = new HashMap<>();
  for (HoodieRecord r : records) {
    String key = r.getRecordKey();
    String partitionPath = r.getPartitionPath();
    if (!partitionToKeys.containsKey(partitionPath)) {
      partitionToKeys.put(partitionPath, new HashSet<>());
    }
    assertTrue("key " + key + " is a duplicate within partition " + partitionPath,
        !partitionToKeys.get(partitionPath).contains(key));
    partitionToKeys.get(partitionPath).add(key);
  }
}
public HoodieInsertValueGenResult(T record, Schema schema) {
    this.record = record;
    try {
      this.insertValue = record.getData().getInsertValue(schema);
    } catch (Exception e) {
      this.exception = Optional.of(e);
    }
  }
}
records.add(new HoodieRecord(
    new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
records.add(new HoodieRecord(
    new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
records.add(new HoodieRecord(
    new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));

BloomFilter filter = ParquetUtils.readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(),
    parquetFilePath);
for (HoodieRecord record : records) {
  assertTrue(filter.mightContain(record.getRecordKey()));
}

int index = 0;
for (GenericRecord record : fileRecords) {
  assertTrue(record.get("_row_key").toString().equals(records.get(index).getRecordKey()));
  index++;
}

// ... (start of updateRecordStr1 elided)
    + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
TestRawTripPayload updateRowChanges1 = new TestRawTripPayload(updateRecordStr1);
HoodieRecord updatedRecord1 = new HoodieRecord(
    new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()),
    updateRowChanges1);
updatedRecord1.setCurrentLocation(
    new HoodieRecordLocation(null, FSUtils.getFileId(parquetFile.getName())));
HoodieRecord insertedRecord1 = new HoodieRecord(
    new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);

table = new HoodieCopyOnWriteTable(config, jsc);
Iterator<List<WriteStatus>> iter = table.handleUpdate(newCommitTime,
    updatedRecord1.getCurrentLocation().getFileId(), updatedRecords.iterator());
Optional<IndexedRecord> recordOptional = hoodieRecord.getData().getInsertValue(getReaderSchema());
ArrayWritable aWritable;
String key = hoodieRecord.getRecordKey();
if (recordOptional.isPresent()) {
  GenericRecord rec = (GenericRecord) recordOptional.get();
HoodieRecord record1 = new HoodieRecord(
    new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
HoodieRecord record2 = new HoodieRecord(
    new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
HoodieRecord record3 = new HoodieRecord(
    new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
HoodieRecord record4 = new HoodieRecord(
    new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);

// ... (start of the mapToPair expression elided)
    record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey())).groupByKey().collectAsMap();
assertEquals(map.size(), 2);
List<String> list1 = Lists.newArrayList(map.get("2016/01/31"));
@Override
public void writeAvroWithMetadata(R avroRecord, HoodieRecord record) throws IOException {
  String seqId = HoodieRecord.generateSequenceId(commitTime, TaskContext.getPartitionId(),
      recordIndex.getAndIncrement());
  HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) avroRecord, record.getRecordKey(),
      record.getPartitionPath(), file.getName());
  HoodieAvroUtils.addCommitMetadataToRecord((GenericRecord) avroRecord, commitTime, seqId);
  super.write(avroRecord);
  writeSupport.add(record.getRecordKey());
}
/**
 * Load the new incoming records into a map and return the partition path.
 */
private String init(String fileId, Iterator<HoodieRecord<T>> newRecordsItr) {
  try {
    // Load the new records in a map
    logger.info("MaxMemoryPerPartitionMerge => " + config.getMaxMemoryPerPartitionMerge());
    this.keyToNewRecords = new ExternalSpillableMap<>(config.getMaxMemoryPerPartitionMerge(),
        config.getSpillableMapBasePath(), new DefaultSizeEstimator(),
        new HoodieRecordSizeEstimator(schema));
  } catch (IOException io) {
    throw new HoodieIOException("Cannot instantiate an ExternalSpillableMap", io);
  }
  String partitionPath = null;
  while (newRecordsItr.hasNext()) {
    HoodieRecord<T> record = newRecordsItr.next();
    partitionPath = record.getPartitionPath();
    keyToNewRecords.put(record.getRecordKey(), record);
    // update the new location of the record, so we know where to find it next
    record.setNewLocation(new HoodieRecordLocation(commitTime, fileId));
  }
  logger.info("Number of entries in MemoryBasedMap => "
      + ((ExternalSpillableMap) keyToNewRecords).getInMemoryMapNumEntries()
      + ", Total size in bytes of MemoryBasedMap => "
      + ((ExternalSpillableMap) keyToNewRecords).getCurrentInMemoryMapSize()
      + ", Number of entries in DiskBasedMap => "
      + ((ExternalSpillableMap) keyToNewRecords).getDiskBasedMapNumEntries()
      + ", Size of file spilled to disk => "
      + ((ExternalSpillableMap) keyToNewRecords).getSizeOfFileOnDiskInBytes());
  return partitionPath;
}
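// A minimal sketch of using the spillable map directly, assuming the same constructor as in
// init(...) above. Entries live in memory up to the byte budget and spill to a file under the
// given base path beyond it; the budget and path here are illustrative, and the constructor
// throws IOException (handled here by the caller, as in init above).
ExternalSpillableMap<String, HoodieRecord<TestRawTripPayload>> spillableMap =
    new ExternalSpillableMap<>(16 * 1024 * 1024L, "/tmp/hoodie-spill",
        new DefaultSizeEstimator(), new HoodieRecordSizeEstimator(schema));
spillableMap.put(record.getRecordKey(), record);
HoodieRecord<TestRawTripPayload> lookedUp = spillableMap.get(record.getRecordKey());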