@Override
public JavaRDD<AvroPayload> getData(@NonNull final ParquetWorkUnitCalculatorResult workUnitCalcResult) {
    Preconditions.checkState(workUnitCalcResult.hasWorkUnits(),
        "No work to process for: " + hiveConf.getDataPath());
    /*
     * The current implementation of HiveSource assumes that only a single work unit exists,
     * corresponding to the single partition that is processed per job.
     */
    final List<String> workUnits = workUnitCalcResult.getWorkUnits();
    final String hdfsPath = new Path(this.hiveConf.getDataPath(), workUnits.get(0)).toString();
    log.info("Reading data from path: {}", hdfsPath);

    final Dataset<Row> data = this.sqlContext.read().parquet(hdfsPath);
    final int numPartitions = calculateHiveNumPartitions(data);
    log.info("Using {} partitions", numPartitions);

    final JavaRDD<AvroPayload> hiveRawData = data
        .coalesce(numPartitions)
        .javaRDD()
        .flatMap(row -> {
            final List<AvroPayload> payloads = new ArrayList<>();
            this.converter.convert(row)
                .forEach(d -> payloads.add(d.getSuccessData().get().getData()));
            return payloads.iterator();
        });
    return hiveRawData;
}
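The helper calculateHiveNumPartitions is not shown above. A minimal sketch of one plausible heuristic follows, assuming the partition count is derived from a target row count per Spark partition; the rowsPerPartition value and the heuristic itself are illustrative assumptions, not taken from the source.

private int calculateHiveNumPartitions(@NonNull final Dataset<Row> data) {
    // Assumed heuristic (not from the source): size each Spark partition to roughly one million rows.
    final long rowsPerPartition = 1_000_000L;
    final long totalRows = data.count();
    return (int) Math.max(1L, (totalRows + rowsPerPartition - 1) / rowsPerPartition);
}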
new SparkSourceDataConverter(expectedSchema, commonSchema, new Configuration(),
    Collections.singleton(STRING_FIELD), new ErrorExtractor());
final AvroPayload payload = converter.convert(rows.get(0)).get(0).getSuccessData().get().getData();
final GenericRecord gr = payload.getData();
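A hedged follow-up to the single-row conversion above: one way the test might verify the result is to check that the extracted GenericRecord carries the common Avro schema and the converted string field. This assumes JUnit's Assert is in scope; the concrete assertions are illustrative, not taken from the source.

// Sketch of possible assertions on the converted record (assumed, not from the source).
Assert.assertEquals(commonSchema, gr.getSchema());
Assert.assertNotNull(gr.get(STRING_FIELD));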
new SparkSourceDataConverter(schema, commonSchema, new Configuration(),
    Collections.singleton(STRING_FIELD), new ErrorExtractor());
final JavaRDD<AvroPayload> payloadRDD = converter.map(df.javaRDD()).getData();
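A hedged sketch of how the mapped RDD might be verified: collect the payloads and confirm each one wraps a record with the common Avro schema. This assumes JUnit assertions and a local SparkContext, as in the surrounding test; the checks themselves are illustrative.

// Materialize the converted payloads and inspect their schemas (assumed verification, not from the source).
final List<AvroPayload> payloads = payloadRDD.collect();
Assert.assertFalse(payloads.isEmpty());
payloads.forEach(p -> Assert.assertEquals(commonSchema, p.getData().getSchema()));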
final HiveSourceConfiguration hiveConf = HiveTestUtil.initializeConfig(JOB_NAME, dataPath, "testMetadataPath");
final SparkSourceDataConverter converter =
    new SparkSourceDataConverter(dfSchema, avroSchema, hiveConf.getConf(),
        Sets.newHashSet(LEFT_FIELD, RIGHT_FIELD), new ErrorExtractor());
final HiveSource source = new HiveSource(hiveConf, this.sqlContext.get(), converter);
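With the source wired up, a hedged usage sketch: given a work-unit calculator result for the partition under test, getData returns that partition's rows as AvroPayloads. The workUnitCalcResult variable is assumed to be constructed elsewhere in the test; its setup is not shown in the source.

// Assumed usage: workUnitCalcResult names the single partition to read.
final JavaRDD<AvroPayload> hiveData = source.getData(workUnitCalcResult);
Assert.assertFalse(hiveData.isEmpty());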