@Override
public JavaRDD<AvroPayload> getData(@NonNull final ParquetWorkUnitCalculatorResult workUnitCalcResult) {
    Preconditions.checkState(workUnitCalcResult.hasWorkUnits(),
        "No work to process for: " + hiveConf.getDataPath());
    final List<String> workUnits = workUnitCalcResult.getWorkUnits();
    // NOTE(review): only the first work unit is read here even though workUnits is a
    // list — presumably one partition is processed per run; confirm this is intended.
    final String hdfsPath = new Path(this.hiveConf.getDataPath(), workUnits.get(0)).toString();
    log.info("Reading data from path: {}", hdfsPath);
    final Dataset<Row> data = this.sqlContext.read().parquet(hdfsPath);
    final int numPartitions = calculateHiveNumPartitions(data);
    log.info("Using {} partitions", numPartitions);
    final JavaRDD<AvroPayload> hiveRawData = data
        .coalesce(numPartitions)
        .javaRDD()
        .flatMap(row -> {
            final List<AvroPayload> payloads = new ArrayList<>();
            this.converter.convert(row).forEach(d -> {
                // Fail loudly with context instead of an unchecked Optional.get(),
                // which would throw a bare NoSuchElementException on a failed conversion.
                if (d.getSuccessData().isPresent()) {
                    payloads.add(d.getSuccessData().get().getData());
                } else {
                    throw new IllegalStateException(
                        "Error converting row to AvroPayload from path: " + hdfsPath);
                }
            });
            return payloads.iterator();
        });
    return hiveRawData;
}