@Override
public void write(@NonNull final JavaRDD<AvroPayload> data) {
    // Generate HoodieRecords from AvroPayloads. The conversion may produce error
    // records along the way, which are written to the error table.
    final RDDWrapper<HoodieRecord<HoodieRecordPayload>> hoodieRecords =
        this.hoodieSinkDataConverter.map(data);
    write(hoodieRecords);
}
/**
 * This constructor gives the option to only convert certain fields from the schema.
 *
 * @param inputSchema     the Avro schema of the input records
 * @param conf            the job configuration
 * @param fieldsToConvert optional subset of schema fields to convert; all fields are converted if absent
 * @param requiredFields  fields that must always be present in the converted output
 * @param timestampInfo   timestamp metadata to attach to converted records
 * @param errorExtractor  extractor used to build error records for the error table
 */
public CassandraSinkDataConverter(@NonNull final Schema inputSchema,
                                  @NonNull final Configuration conf,
                                  @NonNull final Optional<Set<String>> fieldsToConvert,
                                  @NonNull final List<String> requiredFields,
                                  @NonNull final TimestampInfo timestampInfo,
                                  @NonNull final ErrorExtractor errorExtractor) {
    super(conf, errorExtractor);
    if (fieldsToConvert.isPresent()) {
        validate(fieldsToConvert.get(), requiredFields);
    }
    this.inputSchemaJson = inputSchema.toString();
    this.requiredFields = Collections.unmodifiableList(requiredFields);
    this.fieldsToConvert = fieldsToConvert;
    this.timestampInfo = timestampInfo;
}
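For illustration, a minimal sketch of constructing this converter so that only a subset of the schema is converted. The schema and field names below are hypothetical and project imports are assumed; note that validate(...) in the constructor rejects any required field that is missing from fieldsToConvert.

// Hypothetical usage sketch; "rider", "rider_id", "trip_ts" and "notes" are
// illustrative names, not from the codebase.
final Schema inputSchema = SchemaBuilder.record("rider")
    .fields()
    .requiredString("rider_id")
    .requiredLong("trip_ts")
    .requiredString("notes")
    .endRecord();
final Set<String> fieldsToConvert = new HashSet<>(Arrays.asList("rider_id", "trip_ts"));
final List<String> requiredFields = Collections.singletonList("rider_id");
final CassandraSinkDataConverter converter = new CassandraSinkDataConverter(
    inputSchema,
    new Configuration(),
    Optional.of(fieldsToConvert),
    requiredFields,
    new TimestampInfo(Optional.<String>absent(), false),
    new ErrorExtractor());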
@Override
protected final List<ConverterResult<AvroPayload, HoodieRecord<HoodieRecordPayload>>> convert(
        @NonNull final AvroPayload payload) throws Exception {
    final HoodieKey hoodieKey = new HoodieKey(getRecordKey(payload), getPartitionPath(payload));
    final HoodieRecordPayload hoodiePayload = getPayload(payload);
    return Collections.singletonList(new ConverterResult<>(new HoodieRecord<>(hoodieKey, hoodiePayload)));
}
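getRecordKey, getPartitionPath and getPayload are not shown in this snippet. As a hedged sketch only, field-based implementations of the first two might look like the following, assuming AvroPayload.getData() exposes the underlying Avro record; the field names are illustrative.

// Hypothetical sketch: derive the Hoodie record key and partition path from
// record fields. "row_key" and "datestr" are assumed names, and getData()
// returning the Avro GenericRecord is an assumption.
protected String getRecordKey(@NonNull final AvroPayload payload) throws Exception {
    return String.valueOf(payload.getData().get("row_key"));
}

protected String getPartitionPath(@NonNull final AvroPayload payload) throws Exception {
    return String.valueOf(payload.getData().get("datestr"));
}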
@Test
public void testConvertAllWithCsv() {
    log.info("Starting testConvertAllWithCsv.");
    final String separator = " ";
    final JavaRDD<AvroPayload> payloadData =
        AvroPayloadUtil.generateTestData(this.jsc.get(), 10, StringTypes.EMPTY);
    final Configuration conf = initConf(separator, "csv");
    log.info("Starting to convert data.");
    final FileSinkDataConverter converter = new FileSinkDataConverter(conf, new ErrorExtractor());
    final JavaRDD<String> dataConverted = converter.convertAll(payloadData);
    int i = 1;
    for (final String line : dataConverted.collect()) {
        Assert.assertEquals(String.valueOf(i) + separator + String.valueOf(i) + separator + "true", line);
        i++;
    }
}
@Test
public void testGetHeaderWithCsv() {
    final String separator = ",";
    final JavaRDD<AvroPayload> payloadData =
        AvroPayloadUtil.generateTestData(this.jsc.get(), 10, StringTypes.EMPTY);
    final Configuration conf = initConf(separator, "csv");
    log.info("Starting to get data header.");
    final FileSinkDataConverter converter = new FileSinkDataConverter(conf, new ErrorExtractor());
    final String header = converter.getHeader(payloadData);
    final String expectedHeader = "int_field,string_field,boolean_field";
    Assert.assertEquals(expectedHeader, header);
    log.info("Header: {}", header);
}
final JavaRDD<String> convertedData = this.converter.convertAll(data);
final int partNum = getRepartitionNum(convertedData);
final int desiredDigit = (int) Math.floor(Math.log10(partNum) + 1);
// Repartition the converted data so the output file count matches partNum.
final JavaRDD<String> dataRepartitioned = convertedData.repartition(partNum);
final JavaRDD<String> dataToWrite;
if (this.conf.isColumnHeader()) {
    final String header = this.converter.getHeader(data);
    dataToWrite = addColumnHeader(header, dataRepartitioned);
} else {
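addColumnHeader is not defined in this fragment. One plausible, hedged sketch is to prepend the header only to the first partition so it appears once in the output; this is an assumption about its behavior, not the confirmed implementation.

// Hypothetical sketch: emit the header once, at the top of partition 0,
// leaving all other partitions untouched.
private JavaRDD<String> addColumnHeader(@NonNull final String header,
                                        @NonNull final JavaRDD<String> data) {
    return data.mapPartitionsWithIndex((index, lines) -> {
        if (index != 0) {
            return lines;
        }
        final List<String> withHeader = new ArrayList<>();
        withHeader.add(header);
        lines.forEachRemaining(withHeader::add);
        return withHeader.iterator();
    }, true);
}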
@Before
public void setupTest() {
    super.setupTest();
    this.testData1 = AvroPayloadUtil.generateTestData(this.jsc.get(), NUM_RECORD1, StringTypes.EMPTY);
    this.testData2 = AvroPayloadUtil.generateTestDataNew(this.jsc.get(), NUM_RECORD2, StringTypes.EMPTY);
    this.conf = initConfig(pathPrefix, PATH1, COMMA_SEPARATOR, TIMESTAMP1, SOURCE_SUB_PATH1, VERSION);
    this.converter = new FileSinkDataConverter(conf, new ErrorExtractor());
    final FileSinkConfiguration fileConf = new FileSinkConfiguration(conf);
    this.fileSink = spy(new HdfsFileSink(fileConf, converter));
    this.convertedData1 = this.converter.convertAll(this.testData1);
    this.convertedData2 = this.converter.convertAll(this.testData2);
}
@Override
public JavaRDD<AvroPayload> getData(@NonNull final ParquetWorkUnitCalculatorResult workUnitCalcResult) {
    Preconditions.checkState(workUnitCalcResult.hasWorkUnits(),
        "No work to process for: " + hiveConf.getDataPath());
    /*
     * The current implementation of HiveSource assumes that exactly one work unit exists,
     * corresponding to the single partition processed per job.
     */
    final List<String> workUnits = workUnitCalcResult.getWorkUnits();
    final String hdfsPath = new Path(this.hiveConf.getDataPath(), workUnits.get(0)).toString();
    log.info("Reading data from path: {}", hdfsPath);
    final Dataset<Row> data = this.sqlContext.read().parquet(hdfsPath);
    final int numPartitions = calculateHiveNumPartitions(data);
    log.info("Using {} partitions", numPartitions);
    final JavaRDD<AvroPayload> hiveRawData = data
        .coalesce(numPartitions)
        .javaRDD()
        .flatMap(row -> {
            final List<AvroPayload> payloads = new ArrayList<>();
            this.converter.convert(row)
                .forEach(d -> payloads.add(d.getSuccessData().get().getData()));
            return payloads.iterator();
        });
    return hiveRawData;
}
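calculateHiveNumPartitions is referenced above but not shown. A minimal sketch of one size-based heuristic, where MB_PER_PARTITION and estimateSizeInMb(...) are both assumed helpers rather than part of the code above:

// Hypothetical sketch, not the actual implementation: target a fixed number
// of megabytes per Spark partition, with a floor of one partition.
private static final long MB_PER_PARTITION = 256L;

private int calculateHiveNumPartitions(@NonNull final Dataset<Row> data) {
    final long estimatedMb = estimateSizeInMb(data); // assumed helper
    return (int) Math.max(1L, estimatedMb / MB_PER_PARTITION);
}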
private void testWriteGeneral(@NonNull final JavaRDD<AvroPayload> testData,
                              @NonNull final Configuration conf) throws IOException {
    final FileSinkDataConverter converter = new FileSinkDataConverter(conf, new ErrorExtractor());
    final FileSinkConfiguration fileConf = new FileSinkConfiguration(conf);
    final FileSink awsSink = new AwsFileSink(fileConf, converter);
    awsSink.write(testData);
}
new CassandraSinkCQLDataConverter(
    avroSchema,
    new Configuration(),
private void testWriteAllFieldsMockDataToCassandra(final boolean addLongTimestamp) {
    final JavaRDD<AvroPayload> testData =
        AvroPayloadUtil.generateTestData(this.jsc.get(), 100, StringTypes.EMPTY);
    final List<String> schemaFields = AvroPayloadUtil.getSchemaFields();
    final List<String> partitionKeys = Collections.singletonList(schemaFields.get(0));
    final List<ClusterKey> clusteringKeys = Collections.singletonList(
        new ClusterKey(schemaFields.get(1), ClusterKey.Order.DESC));
    final List<String> requiredFields = Arrays.asList(schemaFields.get(0), schemaFields.get(1));
    final Optional<String> timestamp = addLongTimestamp ? Optional.of(TEST_TIMESTAMP) : Optional.absent();
    final TimestampInfo tsInfo = new TimestampInfo(timestamp, true);
    final CassandraSinkDataConverter dataConverter = new CassandraSinkDataConverter(
        AvroPayloadUtil.getAvroTestDataSchema(StringTypes.EMPTY),
        new Configuration(),
        Optional.of(new HashSet<>(schemaFields)),
        requiredFields,
        tsInfo,
        new ErrorExtractor());
    final CassandraSchemaConverter schemaConverter =
        new CassandraSchemaConverter(KEY_SPACE, TABLE, tsInfo, Optional.absent());
    final CassandraSchema schema = schemaConverter.convertToExternalSchema(
        AvroPayloadUtil.getAvroTestDataSchema(StringTypes.EMPTY));
    final Optional<Long> ttl = Optional.of(10000L);
    final CassandraSinkSchemaManager schemaManager =
        new CassandraSinkSchemaManager(schema, partitionKeys, clusteringKeys, ttl);
    final CassandraSinkConfiguration conf = initializeConfiguration(false, addLongTimestamp);
    final CassandraSSTableSink sink = new CassandraSSTableSink(dataConverter, schemaManager, conf);
    sink.write(testData);
    validateCassandraTable(100, false, addLongTimestamp);
}
/**
 * This constructor gives the option to only convert certain fields from the schema.
 *
 * @param inputSchema     the Avro schema of the input records
 * @param conf            the job configuration, which also supplies the target keyspace and table name
 * @param fieldsToConvert optional subset of schema fields to convert; all fields are converted if absent
 * @param requiredFields  fields that must always be present in the converted output
 * @param timestampInfo   timestamp metadata to attach to converted records
 * @param errorExtractor  extractor used to build error records for the error table
 */
public CassandraSinkCQLDataConverter(@NonNull final Schema inputSchema,
                                     @NonNull final Configuration conf,
                                     @NonNull final Optional<Set<String>> fieldsToConvert,
                                     @NonNull final List<String> requiredFields,
                                     @NonNull final TimestampInfo timestampInfo,
                                     @NonNull final ErrorExtractor errorExtractor) {
    super(conf, errorExtractor);
    if (fieldsToConvert.isPresent()) {
        validate(fieldsToConvert.get(), requiredFields);
    }
    this.inputSchemaJson = inputSchema.toString();
    this.requiredFields = Collections.unmodifiableList(requiredFields);
    this.fieldsToConvert = fieldsToConvert;
    this.timestampInfo = timestampInfo;
    this.keyspaceName = conf.getProperty(CassandraSinkConfiguration.KEYSPACE, "");
    this.tableName = conf.getProperty(CassandraSinkConfiguration.TABLE_NAME, "");
}
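Since this converter also reads the target keyspace and table from the configuration (falling back to empty strings), both properties should be wired up before construction. A short sketch, assuming Configuration exposes a setProperty(String, String) mutator and reusing the hypothetical schema and field sets from the earlier sketch:

// Minimal sketch: the CQL converter pulls keyspace/table from the
// Configuration, so set both before constructing it. The keyspace and
// table names here are illustrative.
final Configuration conf = new Configuration();
conf.setProperty(CassandraSinkConfiguration.KEYSPACE, "marmaray_ks");
conf.setProperty(CassandraSinkConfiguration.TABLE_NAME, "rider_table");
final CassandraSinkCQLDataConverter cqlConverter = new CassandraSinkCQLDataConverter(
    inputSchema,
    conf,
    Optional.of(fieldsToConvert),
    requiredFields,
    new TimestampInfo(Optional.<String>absent(), false),
    new ErrorExtractor());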
/**
 * Converts an RDD of AvroPayload records to an RDD of String in the configured file format.
 * Currently only the csv file type is supported.
 *
 * @param data the RDD of AvroPayload records to convert
 * @return an RDD of converted lines, one per input record
 * @throws UnsupportedOperationException if the configured file type is not supported
 */
public JavaRDD<String> convertAll(@NonNull final JavaRDD<AvroPayload> data)
        throws UnsupportedOperationException {
    final JavaRDD<String> lines = data.map(row -> {
        final String line = this.convert(row).get(0).getSuccessData().get().getData();
        log.debug("Line: {}", line);
        return line;
    });
    return lines;
}
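A short usage sketch, mirroring the tests above: initConf(...) is the test helper that sets the separator and csv file type, and jsc is assumed to be an existing JavaSparkContext.

// Hedged usage sketch: convert a small RDD of AvroPayload records to csv lines.
final JavaRDD<AvroPayload> payloads =
    AvroPayloadUtil.generateTestData(jsc, 5, StringTypes.EMPTY);
final FileSinkDataConverter converter =
    new FileSinkDataConverter(initConf(",", "csv"), new ErrorExtractor());
final JavaRDD<String> lines = converter.convertAll(payloads);
lines.collect().forEach(line -> log.info("csv line: {}", line));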
public final RDDWrapper<OD> map(@NonNull final JavaRDD<ID> data) {
    final ForkOperator<IData> converter = new ForkOperator<>(
        data.map(r -> RawDataHelper.getRawData(r)),
        new DataConversionFunction(),
        this.conf);
    converter.execute();
    // Write error records to the error table.
    ErrorTableUtil.writeErrorRecordsToErrorTable(data.context(), this.conf, Optional.absent(),
        new RDDWrapper<>(converter.getRDD(ERROR_RECORD).map(r -> (ErrorData) r),
            converter.getCount(ERROR_RECORD)),
        errorExtractor);
    return new RDDWrapper<>(converter.getRDD(VALID_RECORD).map(r -> ((ValidData<OD>) r).getData()),
        converter.getCount(VALID_RECORD));
}
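For context, a minimal sketch of how a sink consumes the returned wrapper, in the same shape as the HoodieSink.write snippet at the top of this section; getData() and getCount() are assumed accessors on RDDWrapper.

// Hedged sketch: by this point map(...) has already forked error records to
// the error table, so the wrapper holds only valid records.
final RDDWrapper<HoodieRecord<HoodieRecordPayload>> hoodieRecords =
    this.hoodieSinkDataConverter.map(data);
log.info("Writing {} valid records", hoodieRecords.getCount()); // getCount() assumed
write(hoodieRecords);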
@Test
public void testConvertAllWithCsvSpecialChar() {
    log.info("Starting testConvertAllWithCsvSpecialChar.");
    final String separator = ",";
    final JavaRDD<AvroPayload> payloadData =
        AvroPayloadUtil.generateTestDataNew(this.jsc.get(), 10, StringTypes.EMPTY);
    final Configuration conf = initConf(separator, "csv");
    log.info("Starting to convert data.");
    final FileSinkDataConverter converter = new FileSinkDataConverter(conf, new ErrorExtractor());
    final JavaRDD<String> dataConverted = converter.convertAll(payloadData);
    int i = 1;
    for (final String line : dataConverted.collect()) {
        // The string field contains a quote and a backslash, so the converter is
        // expected to wrap it in double quotes and escape both characters.
        Assert.assertEquals(
            String.valueOf(i) + separator + "\"" + String.valueOf(i) + "\\\",try\\\\\"" + separator + "true",
            line);
        i++;
    }
}
private void testWriteToCsvCommon(@NonNull final String pathPrefix,
                                  @NonNull final String path,
                                  @NonNull final String separator,
                                  @NonNull final JavaRDD<AvroPayload> testData,
                                  final int partitionNum,
                                  @NonNull final String timeStamp,
                                  @NonNull final String sourceSubPath,
                                  @NonNull final String dispersalType) throws Exception {
    final Configuration conf = initConfig(pathPrefix, path, separator, timeStamp, sourceSubPath, dispersalType);
    final FileSinkDataConverter converter = new FileSinkDataConverter(conf, new ErrorExtractor());
    final FileSinkConfiguration fileConf = new FileSinkConfiguration(conf);
    final HdfsFileSink hdfsSink = spy(new HdfsFileSink(fileConf, converter));
    hdfsSink.write(testData);
    verify(hdfsSink, times(1)).write(Matchers.any(JavaRDD.class));
    verify(hdfsSink, times(1)).getRepartitionNum(Matchers.any(JavaRDD.class));
    verify(hdfsSink, times(1)).getRddSizeInMegaByte(Matchers.any(JavaRDD.class));
    final FileStatus[] status = this.fileSystem.get().listStatus(new Path(fileConf.getPathHdfs()));
    int fileNum = 0;
    for (final FileStatus fileStatus : status) {
        if (fileStatus.isFile()) {
            fileNum++;
        }
    }
    assertEquals(partitionNum, fileNum);
}
private void testWriteAllFieldsMockDataToCassandra(final boolean addLongTimestamp) {
    final JavaRDD<AvroPayload> testData =
        AvroPayloadUtil.generateTestData(this.jsc.get(), 100, StringTypes.EMPTY);
    final List<String> schemaFields = AvroPayloadUtil.getSchemaFields();
    final List<String> partitionKeys = Collections.singletonList(schemaFields.get(0));
    final List<ClusterKey> clusteringKeys = Collections.singletonList(
        new ClusterKey(schemaFields.get(1), ClusterKey.Order.DESC));
    final List<String> requiredFields = Arrays.asList(schemaFields.get(0), schemaFields.get(1));
    final Optional<String> timestamp = addLongTimestamp ? Optional.of(TEST_TIMESTAMP) : Optional.absent();
    final TimestampInfo tsInfo = new TimestampInfo(timestamp, true);
    final CassandraSinkCQLDataConverter converter = new CassandraSinkCQLDataConverter(
        AvroPayloadUtil.getAvroTestDataSchema(StringTypes.EMPTY),
        new Configuration(),
        Optional.of(new HashSet<>(schemaFields)),
        requiredFields,
        tsInfo,
        new ErrorExtractor());
    final CassandraSchemaConverter schemaConverter =
        new CassandraSchemaConverter(KEY_SPACE, TABLE, tsInfo, Optional.absent());
    final CassandraSchema schema = schemaConverter.convertToExternalSchema(
        AvroPayloadUtil.getAvroTestDataSchema(StringTypes.EMPTY));
    final Optional<Long> ttl = Optional.of(10000L);
    final CassandraSinkSchemaManager schemaManager =
        new CassandraSinkSchemaManager(schema, partitionKeys, clusteringKeys, ttl);
    final CassandraSinkConfiguration conf = initializeConfiguration(false, addLongTimestamp);
    final CassandraClientSink sink = new CassandraClientSink(converter, schemaManager, conf);
    sink.write(testData);
    validateCassandraTable(100, false, addLongTimestamp);
}
new CassandraSinkDataConverter(
    avroSchema,
    new Configuration(),
@Test(expected = SparkException.class)
public void testConvertAllWithJsonNotSupported() {
    log.info("Starting testConvertAllWithJsonNotSupported.");
    final String separator = ",";
    final JavaRDD<AvroPayload> payloadData =
        AvroPayloadUtil.generateTestData(this.jsc.get(), 10, StringTypes.EMPTY);
    final Configuration conf = initConf(separator, "json");
    log.info("Starting to convert data.");
    final FileSinkDataConverter converter = new FileSinkDataConverter(conf, new ErrorExtractor());
    final JavaRDD<String> dataConverted = converter.convertAll(payloadData);
    // collect() triggers the conversion and is expected to throw a SparkException,
    // since the json file type is not yet supported; the assertions below never run.
    int i = 1;
    for (final String line : dataConverted.collect()) {
        Assert.assertEquals(String.valueOf(i) + separator + String.valueOf(i) + separator + "true", line);
        i++;
    }
}
private String testWriteToMockS3General(@NonNull final Configuration conf) throws IOException {
    final JavaRDD<AvroPayload> testData =
        AvroPayloadUtil.generateTestDataNew(this.jsc.get(), NUM_RECORD, StringTypes.EMPTY);
    final FileSinkDataConverter converter = new FileSinkDataConverter(conf, new ErrorExtractor());
    final FileSinkConfiguration fileConf = new FileSinkConfiguration(conf);
    final AwsConfiguration awsConf = new AwsConfiguration(fileConf);
    final MockAwsFileSink awsMockSink = spy(new MockAwsFileSink(fileConf, converter));
    awsMockSink.write(testData);
    final AmazonS3 mockClient = awsMockSink.getS3Client();
    verify(awsMockSink, times(EXPECTED_INVOCATIONS)).write(Matchers.any(JavaRDD.class));
    verify(mockClient, times(EXPECTED_PARTITION_NUM)).putObject(Matchers.any(PutObjectRequest.class));
    assertTrue(mockClient.doesBucketExistV2(fileConf.getBucketName().get()));
    for (int i = 0; i < EXPECTED_PARTITION_NUM; i++) {
        final boolean objectExists = mockClient.doesObjectExist(fileConf.getBucketName().get(),
            awsConf.getS3FilePrefix() + "_0000" + i);
        assertTrue(objectExists);
    }
    return awsConf.getS3FilePrefix();
}