/**
 * Converts an RDD of {@link AvroPayload} data to an RDD of String in the specified file format.
 * Currently only CSV is supported.
 *
 * @param data the RDD of {@link AvroPayload} records to convert
 * @return an RDD of converted String lines
 * @throws UnsupportedOperationException if the requested file format is not supported
 */
public JavaRDD<String> convertAll(@NonNull final JavaRDD<AvroPayload> data) throws UnsupportedOperationException {
    final JavaRDD<String> lines = data.map(row -> {
        final String line = this.convert(row).get(0).getSuccessData().get().getData();
        log.debug("Line: {}", line);
        return line;
    });
    return lines;
}
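// Hedged usage sketch (not from the source): shows how convertAll might feed a text
// sink; `converter`, `inputPayloads`, and the output path are illustrative assumptions.
final JavaRDD<String> csvLines = converter.convertAll(inputPayloads);
csvLines.saveAsTextFile("hdfs:///tmp/example-csv-output"); // hypothetical destination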
@Override
public JavaRDD<AvroPayload> getData(@NonNull final ParquetWorkUnitCalculatorResult workUnitCalcResult) {
    Preconditions.checkState(workUnitCalcResult.hasWorkUnits(),
        "No work to process for: " + hiveConf.getDataPath());
    /*
     * The current implementation of HiveSource assumes that only a single work unit exists, which
     * corresponds to the single partition that is processed per job.
     */
    final List<String> workUnits = workUnitCalcResult.getWorkUnits();
    final String hdfsPath = new Path(this.hiveConf.getDataPath(), workUnits.get(0)).toString();
    log.info("Reading data from path: {}", hdfsPath);
    final Dataset<Row> data = this.sqlContext.read().parquet(hdfsPath);
    final int numPartitions = calculateHiveNumPartitions(data);
    log.info("Using {} partitions", numPartitions);
    final JavaRDD<AvroPayload> hiveRawData = data
        .coalesce(numPartitions)
        .javaRDD()
        .flatMap(row -> {
            final List<AvroPayload> payloads = new ArrayList<>();
            this.converter.convert(row)
                .forEach(d -> payloads.add(d.getSuccessData().get().getData()));
            return payloads.iterator();
        });
    return hiveRawData;
}
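// Hedged sketch, NOT the actual implementation: one plausible way calculateHiveNumPartitions
// could size partitions, by targeting a fixed number of rows per Spark partition. The
// rowsPerPartition constant and the counting strategy are assumptions for illustration only.
private int calculateHiveNumPartitions(@NonNull final Dataset<Row> data) {
    final long rowsPerPartition = 1_000_000L; // assumed target rows per partition
    final long rowCount = data.count();       // note: triggers a Spark job to count rows
    // Ceiling division, with a floor of one partition for empty or tiny inputs.
    return (int) Math.max(1L, (rowCount + rowsPerPartition - 1) / rowsPerPartition);
}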
@Test
public void convert() throws Exception {
    final Configuration conf = new Configuration();
    final Schema schema = new Schema.Parser().parse(
        getClass().getClassLoader().getResourceAsStream("schemas/schemasource/myTestSchema.1.avsc"));
    final JSONFileSourceDataConverter converter = new JSONFileSourceDataConverter(
        conf, new KafkaSourceConverterErrorExtractor(), schema);
    final List<ConverterResult<String, AvroPayload>> results = converter.convert(
        "{\"firstName\": 112, \"lastName\": \"Lname\", "
            + "\"address\": {\"line1\": \"1234 Main St\", \"city\": \"The City\", \"zip\": 12345}}");
    Assert.assertEquals(1, results.size());
    Assert.assertFalse(results.get(0).getErrorData().isPresent());
    final GenericRecord gr = results.get(0).getSuccessData().get().getData().getData();
    // The converter should be able to coerce an int to a string.
    Assert.assertEquals(new Utf8("112"), gr.get("firstName"));
    Assert.assertEquals(new Utf8("Lname"), gr.get("lastName"));
}
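// Hedged sketch: a plausible shape for schemas/schemasource/myTestSchema.1.avsc,
// reconstructed only from the fields this test asserts on; the real resource file
// may declare additional fields or different types.
//
// {
//   "type": "record",
//   "name": "myTestSchema",
//   "fields": [
//     {"name": "firstName", "type": "string"},
//     {"name": "lastName", "type": "string"},
//     {"name": "address", "type": {"type": "record", "name": "address", "fields": [
//       {"name": "line1", "type": "string"},
//       {"name": "city", "type": "string"},
//       {"name": "zip", "type": "long"}
//     ]}}
//   ]
// }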
public final RDDWrapper<OD> map(@NonNull final JavaRDD<ID> data) {
    final ForkOperator<IData> converter =
        new ForkOperator<>(data.map(r -> RawDataHelper.getRawData(r)),
            new DataConversionFunction(), this.conf);
    converter.execute();
    // Write error records.
    ErrorTableUtil.writeErrorRecordsToErrorTable(data.context(), this.conf, Optional.absent(),
        new RDDWrapper<>(converter.getRDD(ERROR_RECORD).map(r -> (ErrorData) r),
            converter.getCount(ERROR_RECORD)),
        errorExtractor);
    return new RDDWrapper<>(converter.getRDD(VALID_RECORD).map(r -> ((ValidData<OD>) r).getData()),
        converter.getCount(VALID_RECORD));
}
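// Hedged usage sketch: dataConverter, inputRdd, and the GenericRecord type binding are
// illustrative assumptions; RDDWrapper is assumed to expose getData()/getCount(), as
// suggested by how it is constructed above.
final RDDWrapper<GenericRecord> result = dataConverter.map(inputRdd);
log.info("Converted {} records successfully", result.getCount());
final JavaRDD<GenericRecord> validRecords = result.getData();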
final AvroPayload payload = converter.convert(rows.get(0)).get(0).getSuccessData().get().getData();
final GenericRecord gr = payload.getData();
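// Step-by-step equivalent of the chained calls above; the intermediate local names
// and the ConverterResult type parameters are hypothetical, added only to make each
// accessor explicit.
final List<ConverterResult<Row, AvroPayload>> results = converter.convert(rows.get(0)); // convert the first row
final AvroPayload firstSuccess = results.get(0).getSuccessData().get().getData();       // unwrap the first successful result
final GenericRecord record = firstSuccess.getData();                                    // extract the underlying Avro record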