public static List<IndexedRecord> generateHoodieTestRecords(int from, int limit)
    throws IOException, URISyntaxException {
  List<IndexedRecord> records = generateTestRecords(from, limit);
  String commitTime = HoodieActiveTimeline.createNewCommitTime();
  Schema hoodieFieldsSchema = HoodieAvroUtils.addMetadataFields(getSimpleSchema());
  return records.stream()
      .map(s -> HoodieAvroUtils.rewriteRecord((GenericRecord) s, hoodieFieldsSchema))
      .map(p -> {
        p.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, UUID.randomUUID().toString());
        p.put(HoodieRecord.PARTITION_PATH_METADATA_FIELD, "0000/00/00");
        p.put(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime);
        return p;
      })
      .collect(Collectors.toList());
}
@Override
public void writeAvroWithMetadata(R avroRecord, HoodieRecord record) throws IOException {
  String seqId = HoodieRecord.generateSequenceId(commitTime, TaskContext.getPartitionId(),
      recordIndex.getAndIncrement());
  HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) avroRecord, record.getRecordKey(),
      record.getPartitionPath(), file.getName());
  HoodieAvroUtils.addCommitMetadataToRecord((GenericRecord) avroRecord, commitTime, seqId);
  super.write(avroRecord);
  writeSupport.add(record.getRecordKey());
}
public HoodieAvroPayload(Optional<GenericRecord> record) {
  try {
    if (record.isPresent()) {
      this.recordBytes = HoodieAvroUtils.avroToBytes(record.get());
    } else {
      this.recordBytes = new byte[0];
    }
  } catch (IOException io) {
    throw new HoodieIOException("Cannot convert record to bytes", io);
  }
}
@Override
public Optional<IndexedRecord> getInsertValue(Schema schema) throws IOException {
  if (recordBytes.length == 0) {
    return Optional.empty();
  }
  Optional<GenericRecord> record = Optional.of(HoodieAvroUtils.bytesToAvro(recordBytes, schema));
  return record.map(r -> HoodieAvroUtils.rewriteRecord(r, schema));
}
}
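// A minimal round-trip sketch, added by assumption and not part of the source: the constructor above
// serializes the record through HoodieAvroUtils.avroToBytes, and getInsertValue rebuilds it through
// bytesToAvro/rewriteRecord. `writeSchema` and `sourceRecord` are hypothetical names for a write
// schema and a populated GenericRecord matching it.
HoodieAvroPayload payload = new HoodieAvroPayload(Optional.of(sourceRecord));
Optional<IndexedRecord> roundTripped = payload.getInsertValue(writeSchema);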
public static Schema createHoodieWriteSchema(HoodieWriteConfig config) {
  return HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
}
@Override
public Optional<IndexedRecord> getInsertValue(Schema schema) throws IOException {
  return Optional.of(HoodieAvroUtils.rewriteRecord(record, schema));
}
}
@Test
public void testPropsPresent() {
  Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(EXAMPLE_SCHEMA));
  boolean piiPresent = false;
  for (Schema.Field field : schema.getFields()) {
    if (HoodieAvroUtils.isMetadataField(field.name())) {
      continue;
    }
    Assert.assertNotNull("field name is null", field.name());
    Map<String, JsonNode> props = field.getJsonProps();
    Assert.assertNotNull("The property map is null", props);
    if (field.name().equals("pii_col")) {
      piiPresent = true;
      Assert.assertTrue("sensitivity_level was removed from field 'pii_col'",
          props.containsKey("column_category"));
    } else {
      Assert.assertTrue("A property shows up on a field that should not have any", props.isEmpty());
    }
  }
  Assert.assertTrue("column pii_col doesn't show up", piiPresent);
}
}
    .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1")
    .overBaseCommit("100").withFs(fs).build();
Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema());
List<IndexedRecord> records = SchemaTestUtil.generateHoodieTestRecords(0, 100);
Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap();
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, new String(HoodieAvroUtils.compress(schema.toString())));
HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, schema);
byte[] content = dataBlock.getBytes(schema);
@Override
public Optional<IndexedRecord> getInsertValue(Schema schema) throws IOException {
  return Optional.of(HoodieAvroUtils.bytesToAvro(recordBytes, schema));
}
}
public static void writeRecordsToLogFiles(FileSystem fs, String basePath, Schema schema,
    List<HoodieRecord> updatedRecords) {
  Map<HoodieRecordLocation, List<HoodieRecord>> groupedUpdated = updatedRecords.stream()
      .collect(Collectors.groupingBy(HoodieRecord::getCurrentLocation));
  groupedUpdated.entrySet().forEach(s -> {
    HoodieRecordLocation location = s.getKey();
    String partitionPath = s.getValue().get(0).getPartitionPath();
    Writer logWriter;
    try {
      logWriter = HoodieLogFormat.newWriterBuilder().onParentPath(new Path(basePath, partitionPath))
          .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(location.getFileId())
          .overBaseCommit(location.getCommitTime()).withFs(fs).build();
      Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap();
      header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, location.getCommitTime());
      header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
      logWriter.appendBlock(new HoodieAvroDataBlock(s.getValue().stream().map(r -> {
        try {
          GenericRecord val = (GenericRecord) r.getData().getInsertValue(schema).get();
          HoodieAvroUtils.addHoodieKeyToRecord(val, r.getRecordKey(), r.getPartitionPath(), "");
          return (IndexedRecord) val;
        } catch (IOException e) {
          return null;
        }
      }).collect(Collectors.toList()), header));
      logWriter.close();
    } catch (Exception e) {
      fail(e.toString());
    }
  });
}
Schema readSchema = HoodieAvroUtils.getRecordKeySchema();
AvroReadSupport.setAvroReadSchema(conf, readSchema);
AvroReadSupport.setRequestedProjection(conf, readSchema);
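// Hedged usage sketch, not from the source: with the projection set on `conf` as above, a
// parquet-avro reader returns records that carry only the Hoodie record key field.
// `parquetFilePath` is a hypothetical org.apache.hadoop.fs.Path pointing at an existing data file.
ParquetReader<GenericRecord> reader =
    AvroParquetReader.<GenericRecord>builder(parquetFilePath).withConf(conf).build();
GenericRecord keyOnlyRecord;
while ((keyOnlyRecord = reader.read()) != null) {
  // only _hoodie_record_key is materialized under this projection
  String recordKey = keyOnlyRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
}
reader.close();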
byte[] schemaContent = HoodieAvroUtils.compress(schema.toString());
output.writeInt(schemaContent.length);
output.write(schemaContent);
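// A matching read-path sketch, given by assumption rather than taken from the source: read the
// length prefix, then the compressed bytes, and rebuild the schema. `input` is a hypothetical
// DataInputStream, and HoodieAvroUtils.decompress is assumed to be the counterpart of compress above.
int schemaLength = input.readInt();
byte[] compressedSchema = new byte[schemaLength];
input.readFully(compressedSchema);
Schema readBackSchema = new Schema.Parser().parse(HoodieAvroUtils.decompress(compressedSchema));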
parentFields.add(fileNameField);
for (Schema.Field field : schema.getFields()) {
  if (!isMetadataField(field.name())) {
    Schema.Field newField = new Schema.Field(field.name(), field.schema(), field.doc(), null);
    for (Map.Entry<String, JsonNode> prop : field.getJsonProps().entrySet()) {
/**
 * Generates a new Avro record in the schema format above, retaining the provided key.
 */
public static TestRawTripPayload generateRandomValue(HoodieKey key, String commitTime) throws IOException {
  GenericRecord rec = generateGenericRecord(key.getRecordKey(), "rider-" + commitTime,
      "driver-" + commitTime, 0.0);
  HoodieAvroUtils.addCommitMetadataToRecord(rec, commitTime, "-1");
  return new TestRawTripPayload(rec.toString(), key.getRecordKey(), key.getPartitionPath(),
      TRIP_EXAMPLE_SCHEMA);
}
@Override
public Optional<IndexedRecord> getInsertValue(final Schema schema) throws IOException {
  final Optional<GenericRecord> record = getRecord();
  return record.map(r -> HoodieAvroUtils.rewriteRecord(r, schema));
}
@Override
public Optional<IndexedRecord> getInsertValue(Schema schema) throws IOException {
  return Optional.of(HoodieAvroUtils.rewriteRecord(
      HoodieAvroUtils.bytesToAvro(recordBytes, new Schema.Parser().parse(schemaStr)), schema));
}
}
private void writeParquetFile(String filePath, List<String> rowKeys) throws Exception {
  // Write out a parquet file
  Schema schema = HoodieAvroUtils.getRecordKeySchema();
  BloomFilter filter = new BloomFilter(1000, 0.0001);
  HoodieAvroWriteSupport writeSupport =
      new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter);
  ParquetWriter writer = new ParquetWriter(new Path(filePath), writeSupport, CompressionCodecName.GZIP,
      120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE);
  for (String rowKey : rowKeys) {
    GenericRecord rec = new GenericData.Record(schema);
    rec.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, rowKey);
    writer.write(rec);
    filter.add(rowKey);
  }
  writer.close();
}
}
Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema());
    record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap();
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
@Before
public void init() throws IOException {
  // Initialize a local spark env
  jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest("TestHoodieBloomIndex"));
  // Create a temp folder as the base path
  TemporaryFolder folder = new TemporaryFolder();
  folder.create();
  basePath = folder.getRoot().getAbsolutePath();
  fs = FSUtils.getFs(basePath, jsc.hadoopConfiguration());
  HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath);
  // We have some records to be tagged (two different partitions)
  schemaStr = IOUtils.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8");
  schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(schemaStr));
}