final HiveRunState nextRunState = new HiveRunState(this.nextPartition);
final List<String> workUnits = this.nextPartition.isPresent()
    ? Collections.singletonList(this.nextPartition.get())
    : Collections.emptyList();
return new ParquetWorkUnitCalculatorResult(workUnits, nextRunState);
private void virtuallyProcessPartition(@NonNull final HDFSPartitionManager partitionManager,
                                       @NonNull final HDFSMetadataManager metadataManager,
                                       @NonNull final Optional<StringValue> expectedLatestCheckpoint,
                                       @NotEmpty final String expectedNextPartition) throws IOException {
    Assert.assertEquals(expectedLatestCheckpoint, getLatestCheckpoint(metadataManager));
    final ParquetWorkUnitCalculator calculator = new ParquetWorkUnitCalculator(this.hiveConfig, this.fileSystem);
    calculator.initPreviousRunState(metadataManager);
    final ParquetWorkUnitCalculatorResult result = calculator.computeWorkUnits();
    final List<String> workUnits = result.getWorkUnits();
    Assert.assertEquals(1, workUnits.size());
    Assert.assertEquals(expectedNextPartition, workUnits.get(0));
    Assert.assertTrue(result.getNextRunState().getPartition().isPresent());
    Assert.assertEquals(expectedNextPartition, result.getNextRunState().getPartition().get());
    calculator.saveNextRunState(metadataManager, result.getNextRunState());
    Assert.assertEquals(expectedNextPartition,
        metadataManager.get(MetadataConstants.CHECKPOINT_KEY).get().getValue());
    metadataManager.saveChanges();
}
@Override
public JavaRDD<AvroPayload> getData(@NonNull final ParquetWorkUnitCalculatorResult workUnitCalcResult) {
    Preconditions.checkState(workUnitCalcResult.hasWorkUnits(),
        "No work to process for: " + this.hiveConf.getDataPath());
    /*
     * The current implementation of HiveSource assumes that only a single work unit exists,
     * corresponding to the single partition that is processed per job.
     */
    final List<String> workUnits = workUnitCalcResult.getWorkUnits();
    final String hdfsPath = new Path(this.hiveConf.getDataPath(), workUnits.get(0)).toString();
    log.info("Reading data from path: {}", hdfsPath);
    final Dataset<Row> data = this.sqlContext.read().parquet(hdfsPath);
    final int numPartitions = calculateHiveNumPartitions(data);
    log.info("Using {} partitions", numPartitions);
    final JavaRDD<AvroPayload> hiveRawData = data
        .coalesce(numPartitions)
        .javaRDD()
        .flatMap(row -> {
            final List<AvroPayload> payloads = new ArrayList<>();
            this.converter.convert(row).forEach(d -> payloads.add(d.getSuccessData().get().getData()));
            return payloads.iterator();
        });
    return hiveRawData;
}
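For context, here is a minimal sketch of what a partition-count heuristic like calculateHiveNumPartitions could look like. The real implementation is not shown in this section; the rows-per-partition target and the constant name ROWS_PER_SPARK_PARTITION are assumptions for illustration only.

// Hypothetical sketch only: estimates how many Spark partitions to coalesce to,
// based on an assumed rows-per-partition target. The actual
// calculateHiveNumPartitions implementation is not shown in this section.
private static final long ROWS_PER_SPARK_PARTITION = 1_000_000L; // assumed tuning constant

private int calculateHiveNumPartitionsSketch(@NonNull final Dataset<Row> data) {
    // data.count() triggers a Spark job; a production implementation might prefer
    // file-size based estimates to avoid the extra pass over the data.
    final long rowCount = data.count();
    return (int) Math.max(1L, rowCount / ROWS_PER_SPARK_PARTITION);
}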
@Test
public void testComputeWorkUnitsWithExistentCheckpoint() throws IOException {
    this.fileSystem.mkdirs(new Path(this.dataPath, PARTITION_1));
    this.fileSystem.mkdirs(new Path(this.dataPath, PARTITION_3));
    this.partitionManager = new HDFSPartitionManager(JOB_NAME,
        this.metadataPath,
        this.dataPath,
        this.fileSystem);
    this.metadataManager = new HDFSMetadataManager(this.fileSystem,
        new Path(this.metadataPath, JOB_NAME).toString(),
        new AtomicBoolean(true));
    // Partition 1 is effectively already processed, since the checkpoint is larger.
    final StringValue val1 = new StringValue(PARTITION_2);
    this.metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1);
    this.metadataManager.saveChanges();
    final ParquetWorkUnitCalculator calculator = new ParquetWorkUnitCalculator(this.hiveConfig, this.fileSystem);
    calculator.initPreviousRunState(this.metadataManager);
    final IWorkUnitCalculator.IWorkUnitCalculatorResult iresult = calculator.computeWorkUnits();
    Assert.assertTrue(iresult instanceof ParquetWorkUnitCalculatorResult);
    final ParquetWorkUnitCalculatorResult result = (ParquetWorkUnitCalculatorResult) iresult;
    final List<String> workUnits = result.getWorkUnits();
    Assert.assertEquals(1, workUnits.size());
    Assert.assertEquals(PARTITION_3, workUnits.get(0));
    Assert.assertTrue(result.getNextRunState().getPartition().isPresent());
    Assert.assertEquals(PARTITION_3, result.getNextRunState().getPartition().get());
}
Assert.assertFalse(getLatestCheckpoint(this.metadataManager).isPresent());
virtuallyProcessPartition(this.partitionManager, this.metadataManager, Optional.absent(), this.dataPath);
final HDFSMetadataManager metadataManager2 = new HDFSMetadataManager(this.fileSystem,
    new Path(this.metadataPath, JOB_NAME).toString(),
    new AtomicBoolean(true));
final Optional<StringValue> latestCheckpoint2 = getLatestCheckpoint(metadataManager2);
Assert.assertTrue(latestCheckpoint2.isPresent());
Assert.assertEquals(this.dataPath, latestCheckpoint2.get().getValue());
final ParquetWorkUnitCalculator calc = new ParquetWorkUnitCalculator(this.hiveConfig, this.fileSystem);
calc.initPreviousRunState(metadataManager2);
Assert.assertTrue(calc.getNextPartition().isPresent());
Assert.assertEquals(this.dataPath, calc.getNextPartition().get());
final IWorkUnitCalculator.IWorkUnitCalculatorResult<String, HiveRunState> iresult = calc.computeWorkUnits();
calc.saveNextRunState(metadataManager2, iresult.getNextRunState());
Assert.assertEquals(this.dataPath, metadataManager2.get(MetadataConstants.CHECKPOINT_KEY).get().getValue());
final SparkSourceDataConverter converter = new SparkSourceDataConverter(dfSchema, avroSchema,
    hiveConf.getConf(), Sets.newHashSet(LEFT_FIELD, RIGHT_FIELD), new ErrorExtractor());
final HiveSource source = new HiveSource(hiveConf, this.sqlContext.get(), converter);
final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fileSystem.get(),
    new Path(hiveConf.getBaseMetadataPath(), hiveConf.getJobName()).toString(),
    new AtomicBoolean(true));
final ParquetWorkUnitCalculator calculator = new ParquetWorkUnitCalculator(hiveConf, this.fileSystem.get());
calculator.initPreviousRunState(metadataManager);
final IWorkUnitCalculator.IWorkUnitCalculatorResult<String, HiveRunState> results = calculator.computeWorkUnits();
final JavaRDD<AvroPayload> rddData = source.getData((ParquetWorkUnitCalculatorResult) results);
final List<AvroPayload> collectedData = rddData.collect();
Assert.assertEquals(1, collectedData.size());
@Test
public void testComputeWorkUnitsWithNoPrexistentCheckpointsMultiplePartitions() throws IOException {
    // No preexisting checkpoints for the work unit calculator.
    this.fileSystem.mkdirs(new Path(this.dataPath, PARTITION_2));
    this.fileSystem.mkdirs(new Path(this.dataPath, PARTITION_3));
    this.partitionManager = new HDFSPartitionManager(JOB_NAME,
        this.metadataPath,
        this.dataPath,
        this.fileSystem);
    this.metadataManager = new HDFSMetadataManager(this.fileSystem,
        new Path(this.metadataPath, JOB_NAME).toString(),
        new AtomicBoolean(true));
    Assert.assertFalse(this.partitionManager.isSinglePartition());
    final Optional<StringValue> latestCheckpoint = getLatestCheckpoint(this.metadataManager);
    Assert.assertFalse(latestCheckpoint.isPresent());
    virtuallyProcessPartition(this.partitionManager, this.metadataManager, Optional.absent(), PARTITION_2);
    final HDFSPartitionManager partitionManager2 = new HDFSPartitionManager(JOB_NAME,
        this.metadataPath,
        this.dataPath,
        this.fileSystem);
    final HDFSMetadataManager metadataManager2 = new HDFSMetadataManager(this.fileSystem,
        new Path(this.metadataPath, JOB_NAME).toString(),
        new AtomicBoolean(true));
    final Optional<StringValue> latestCheckpoint2 = getLatestCheckpoint(metadataManager2);
    Assert.assertTrue(latestCheckpoint2.isPresent());
    virtuallyProcessPartition(partitionManager2, metadataManager2, Optional.of(new StringValue(PARTITION_2)), PARTITION_3);
}
if (!checkpointGreaterThanNextPartition(latestCheckpoint)) {
    log.info("Saving next partition {} in metadata manager", this.nextPartition);
    metadataManager.set(MetadataConstants.CHECKPOINT_KEY, new StringValue(this.nextPartition.get()));
}
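For reference, a minimal sketch of what the checkpointGreaterThanNextPartition guard could look like, assuming partition names compare lexicographically (which holds for zero-padded date-style partitions such as 2018/06/01). The real implementation is not shown in this section.

// Hypothetical sketch only: treats partition names as lexicographically ordered
// strings, so a stored checkpoint of "2018/06/02" is "greater than" a next
// partition of "2018/06/01" and the checkpoint should not be overwritten.
private boolean checkpointGreaterThanNextPartition(@NonNull final Optional<StringValue> checkpoint) {
    return checkpoint.isPresent()
        && this.nextPartition.isPresent()
        && checkpoint.get().getValue().compareTo(this.nextPartition.get()) > 0;
}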
public ParquetWorkUnitCalculator(@NonNull final HiveSourceConfiguration hiveConf,
                                 @NonNull final FileSystem fs) throws IOException {
    this.hiveConf = hiveConf;
    final PartitionType partitionType = hiveConf.getPartitionType();
    log.info("Creating partition manager with partition type: {}", partitionType);
    if (partitionType.equals(PartitionType.NONE) || partitionType.equals(PartitionType.NORMAL)) {
        // Create the partition manager internally.
        this.partitionManager = new HDFSPartitionManager(hiveConf.getJobName(),
            hiveConf.getBaseMetadataPath(),
            hiveConf.getDataPath(),
            fs);
    } else if (partitionType.equals(PartitionType.DATE)) {
        this.partitionManager = new HDFSDatePartitionManager(hiveConf.getJobName(),
            hiveConf.getBaseMetadataPath(),
            hiveConf.getDataPath(),
            hiveConf.getPartitionKeyName().get(),
            hiveConf.getStartDate(),
            fs);
    } else {
        throw new JobRuntimeException("Error: Partition type is not supported. Partition type: " + partitionType);
    }
}
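As a usage sketch, wiring the calculator into a job run could look like the following. The helper loadHiveSourceConfiguration() and the metadataManager variable are placeholders for however the surrounding job builds those objects; only the calculator calls mirror the tests above.

// Hypothetical usage sketch: compute and persist work units for one run.
final HiveSourceConfiguration hiveConf = loadHiveSourceConfiguration(); // assumed helper
final FileSystem fs = FileSystem.get(new org.apache.hadoop.conf.Configuration());
final ParquetWorkUnitCalculator calculator = new ParquetWorkUnitCalculator(hiveConf, fs);
// metadataManager is assumed to be an HDFSMetadataManager built elsewhere.
calculator.initPreviousRunState(metadataManager);
final ParquetWorkUnitCalculatorResult result = calculator.computeWorkUnits();
calculator.saveNextRunState(metadataManager, result.getNextRunState());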
@Test
public void testBasicConfig() {
    final Configuration config = getValidHiveSourceConfiguration();
    final HiveSourceConfiguration hiveConfig = new HiveSourceConfiguration(config);
    Assert.assertEquals(JOB_NAME, hiveConfig.getJobName());
    Assert.assertEquals(DEFAULT_DATA_PATH, hiveConfig.getDataPath());
    Assert.assertEquals(DEFAULT_METADATA_PATH, hiveConfig.getBaseMetadataPath());
    Assert.assertTrue(hiveConfig.shouldSaveCheckpoint());
}
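The test helper getValidHiveSourceConfiguration() is not shown in this section; a sketch of what it could look like follows. The property-key constants referenced here are assumptions about HiveSourceConfiguration, not verified key names.

// Hypothetical sketch of the test helper. The constant names JOB_NAME,
// HIVE_DATA_PATH, and BASE_METADATA_PATH on HiveSourceConfiguration are
// assumed for illustration.
private Configuration getValidHiveSourceConfigurationSketch() {
    final Configuration config = new Configuration();
    config.setProperty(HiveSourceConfiguration.JOB_NAME, JOB_NAME);                        // assumed key constant
    config.setProperty(HiveSourceConfiguration.HIVE_DATA_PATH, DEFAULT_DATA_PATH);         // assumed key constant
    config.setProperty(HiveSourceConfiguration.BASE_METADATA_PATH, DEFAULT_METADATA_PATH); // assumed key constant
    return config;
}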