/**
 * Reads the single pending partition's Parquet files and converts each Row into AvroPayloads.
 *
 * @param workUnitCalcResult result of work-unit calculation; must contain at least one work unit
 * @return RDD of AvroPayload records read from the partition's Parquet data
 */
@Override public JavaRDD<AvroPayload> getData(@NonNull final ParquetWorkUnitCalculatorResult workUnitCalcResult) {
    Preconditions.checkState(workUnitCalcResult.hasWorkUnits(),
        "No work to process for: " + hiveConf.getDataPath());
    /**
     * Current implementation of HiveSource assumes that only a single work unit exists which
     * corresponds to the single partition that is processed per job.
     */
    // Only the first work unit is read — see the assumption above.
    final List<String> workUnits = workUnitCalcResult.getWorkUnits();
    final String hdfsPath = new Path(this.hiveConf.getDataPath(), workUnits.get(0)).toString();
    log.info("Reading data from path: {}", hdfsPath);
    final Dataset<Row> data = this.sqlContext.read().parquet(hdfsPath);
    // Partition count is derived from the loaded dataset before handing it to the converter.
    final int numPartitions = calculateHiveNumPartitions(data);
    log.info("Using {} partitions", numPartitions);
    final JavaRDD<AvroPayload> hiveRawData = data
        .coalesce(numPartitions)
        .javaRDD()
        .flatMap(row -> {
            // Each Row may convert into multiple payloads; collect them all.
            final List<AvroPayload> payloads = new ArrayList<>();
            // NOTE(review): getSuccessData().get() assumes every conversion succeeds — a failed
            // conversion would throw NoSuchElementException here; confirm the converter's contract.
            this.converter.convert(row).forEach(d -> payloads.add(d.getSuccessData().get().getData()));
            return payloads.iterator();
        });
    return hiveRawData;
}
// At most one partition is processed per run, so the work-unit list is either a
// singleton holding the next partition or empty when nothing is pending.
// Fixed: raw-typed Collections.EMPTY_LIST (unchecked assignment) replaced with the
// generically typed Collections.emptyList() via the Optional mapping idiom.
final List<String> workUnits = nextPartition
    .map(Collections::singletonList)
    .orElseGet(Collections::emptyList);
return new ParquetWorkUnitCalculatorResult(workUnits, nextRunState);
/**
 * Runs one work-unit computation cycle and asserts that the expected partition is selected
 * and persisted as the checkpoint, without actually processing any data.
 *
 * <p>Fixed: {@code expectedLatestCheckpoint} was annotated {@code @NotEmpty}, which targets
 * strings/collections and is meaningless on an {@code Optional}; the intent (non-null argument)
 * is expressed with {@code @NonNull} instead.
 *
 * @param partitionManager         partition manager backing the fixture
 *                                 (NOTE(review): not referenced in this body — kept for
 *                                 signature compatibility; confirm whether it can be dropped)
 * @param metadataManager          metadata manager holding checkpoint state
 * @param expectedLatestCheckpoint checkpoint expected before the cycle runs (may be absent)
 * @param expectedNextPartition    partition expected to be selected by this cycle
 * @throws IOException if metadata persistence fails
 */
private void virtuallyProcessPartition(@NonNull final HDFSPartitionManager partitionManager,
                                       @NonNull final HDFSMetadataManager metadataManager,
                                       @NonNull final Optional<StringValue> expectedLatestCheckpoint,
                                       @NotEmpty final String expectedNextPartition) throws IOException {
    // Precondition on fixture state: the stored checkpoint matches what the caller expects.
    Assert.assertEquals(expectedLatestCheckpoint, getLatestCheckpoint(metadataManager));
    final ParquetWorkUnitCalculator calculator =
        new ParquetWorkUnitCalculator(this.hiveConfig, this.fileSystem);
    calculator.initPreviousRunState(metadataManager);
    final ParquetWorkUnitCalculatorResult result = calculator.computeWorkUnits();
    // Exactly one work unit is expected, and it must be the expected partition.
    final List<String> workUnits = result.getWorkUnits();
    Assert.assertEquals(1, workUnits.size());
    Assert.assertEquals(expectedNextPartition, workUnits.get(0));
    Assert.assertTrue(result.getNextRunState().getPartition().isPresent());
    Assert.assertEquals(expectedNextPartition, result.getNextRunState().getPartition().get());
    // Persist the run state and verify the checkpoint was advanced to the new partition.
    calculator.saveNextRunState(metadataManager, result.getNextRunState());
    Assert.assertEquals(expectedNextPartition,
        metadataManager.get(MetadataConstants.CHECKPOINT_KEY).get().getValue());
    metadataManager.saveChanges();
}
@Test public void testComputeWorkUnitsWithExistentCheckpoint() throws IOException { this.fileSystem.mkdirs(new Path(this.dataPath, PARTITION_1)); this.fileSystem.mkdirs(new Path(this.dataPath, PARTITION_3)); this.partitionManager = new HDFSPartitionManager(JOB_NAME, this.metadataPath, this.dataPath, this.fileSystem); this.metadataManager = new HDFSMetadataManager(this.fileSystem, new Path(this.metadataPath, JOB_NAME).toString(), new AtomicBoolean(true)); // partition 1 is in effect already processed since the checkpoint is larger final StringValue val1 = new StringValue(PARTITION_2); this.metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1); this.metadataManager.saveChanges(); final ParquetWorkUnitCalculator calculator = new ParquetWorkUnitCalculator(this.hiveConfig, this.fileSystem); calculator.initPreviousRunState(this.metadataManager); final IWorkUnitCalculator.IWorkUnitCalculatorResult iresult = calculator.computeWorkUnits(); Assert.assertTrue(iresult instanceof ParquetWorkUnitCalculatorResult); final ParquetWorkUnitCalculatorResult result = (ParquetWorkUnitCalculatorResult) iresult; final List<String> workUnits = result.getWorkUnits(); Assert.assertEquals(1, workUnits.size()); Assert.assertEquals(PARTITION_3, workUnits.get(0)); Assert.assertTrue(result.getNextRunState().getPartition().isPresent()); Assert.assertEquals(PARTITION_3, result.getNextRunState().getPartition().get()); }