/**
 * Verifies that three partition directories created under the raw data path are
 * all reported by {@link HDFSPartitionManager#getExistingPartitions()} in sorted
 * order ("partition0".."partition2").
 */
@Test
public void testGetExistingPartitions() throws IOException {
    final Path partition0Path = new Path(RAW_DATA_PATH, PARTITION0);
    final Path partition1Path = new Path(RAW_DATA_PATH, PARTITION1);
    final Path partition2Path = new Path(RAW_DATA_PATH, PARTITION2);
    this.fileSystem.create(new Path(partition0Path, FILE1));
    this.fileSystem.create(new Path(partition1Path, FILE1));
    this.fileSystem.create(new Path(partition2Path, FILE1));
    final HDFSPartitionManager pm = new HDFSPartitionManager(JOBNAME,
        HDFSTestConstants.BASE_METADATA_PATH, RAW_DATA_PATH, this.fileSystem);
    // Snapshot the partition list once: the original called getExistingPartitions()
    // in the loop condition and body, re-listing the filesystem on every iteration.
    final List<String> partitions = pm.getExistingPartitions();
    Assert.assertEquals(3, partitions.size());
    for (int i = 0; i < partitions.size(); i++) {
        Assert.assertEquals("partition" + i, partitions.get(i));
    }
}
/**
 * When only a plain file (no partition directories) exists directly under the raw
 * data path, the manager reports a single partition and the next partition to
 * process is the raw data path itself.
 */
@Test
public void testGetExistingPartitionsOnlyFilesExist() throws IOException {
    // A file, not a directory, directly under the raw data root.
    this.fileSystem.create(new Path(RAW_DATA_PATH, PARTITION0));
    final HDFSPartitionManager pm = new HDFSPartitionManager(JOBNAME,
        HDFSTestConstants.BASE_METADATA_PATH, RAW_DATA_PATH, this.fileSystem);
    final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fileSystem,
        new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(),
        new AtomicBoolean(true));
    Assert.assertEquals(1, pm.getExistingPartitions().size());
    Assert.assertEquals(RAW_DATA_PATH,
        pm.getNextPartition(getLatestCheckpoint(metadataManager)).get());
}
/**
 * With no checkpoint ever saved, the first existing partition (PARTITION1) is
 * returned as the next partition to process.
 */
@Test
public void testGetNextPartitionWithNonExistentCheckpoint() throws InterruptedException, IOException {
    this.fileSystem.create(new Path(new Path(RAW_DATA_PATH, PARTITION1), FILE1));
    final HDFSPartitionManager pm = new HDFSPartitionManager(JOBNAME,
        HDFSTestConstants.BASE_METADATA_PATH, RAW_DATA_PATH, this.fileSystem);
    final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fileSystem,
        new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(),
        new AtomicBoolean(true));
    // No metadata was ever written, so the checkpoint lookup comes back absent.
    final Optional<String> nextPartition = pm.getNextPartition(getLatestCheckpoint(metadataManager));
    Assert.assertTrue(nextPartition.isPresent());
    Assert.assertEquals(PARTITION1, nextPartition.get());
}
@Test public void testGetNextPartitionCheckpointIsLargerThanPartition() throws InterruptedException, IOException { final Path partition2Path = new Path(RAW_DATA_PATH, PARTITION2); this.fileSystem.mkdirs(new Path(partition2Path, FILE1)); final StringValue val1 = new StringValue(PARTITION2); final HDFSPartitionManager pm = new HDFSPartitionManager(JOBNAME, HDFSTestConstants.BASE_METADATA_PATH, RAW_DATA_PATH, this.fileSystem); final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fileSystem, new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(), new AtomicBoolean(true)); metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1); metadataManager.saveChanges(); final Path partition1Path = new Path(RAW_DATA_PATH, PARTITION1); this.fileSystem.mkdirs(new Path(partition1Path, FILE1)); // Checkpoint value is greater than the partitions in the data folder so nothing new to process Assert.assertFalse(pm.getNextPartition(getLatestCheckpoint(metadataManager)).isPresent()); }
@Test public void testGetNextPartitionWithOnlyTempFileCheckpoints() throws InterruptedException, IOException { final Path partitionPath = new Path(RAW_DATA_PATH, PARTITION1); final Path filePath = new Path(partitionPath, FILE1); this.fileSystem.create(filePath); final HDFSPartitionManager pm = new HDFSPartitionManager(JOBNAME, HDFSTestConstants.BASE_METADATA_PATH, RAW_DATA_PATH, this.fileSystem); final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fileSystem, new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(), new AtomicBoolean(true)); // if this metadata was saved successfully we would say there's no partition to process // but this will be in a temp file so it will be ignored metadataManager.set(MetadataConstants.CHECKPOINT_KEY, new StringValue(PARTITION2)); metadataManager.saveChanges(); final Optional<FileStatus> fs = metadataManager.getLatestMetadataFile(); Assert.assertTrue(fs.isPresent()); // move the metadata file back to a temp location this.fileSystem.rename(fs.get().getPath(), new Path(fs.get().getPath().toString() + MetadataConstants.TEMP_FILE_EXTENSION)); final Optional<String> partition = pm.getNextPartition(getLatestCheckpoint(metadataManager)); Assert.assertTrue(partition.isPresent()); Assert.assertEquals(PARTITION1, partition.get()); }
/**
 * With a checkpoint at PARTITION1 and data present for PARTITION2 and
 * PARTITION3, the next partition to process is PARTITION2.
 */
@Test
public void testGetNextPartitionMultipleDataPartitions() throws IOException, InterruptedException {
    final StringValue checkpointValue = new StringValue(PARTITION1);
    final Path partition2Path = new Path(RAW_DATA_PATH, PARTITION2);
    final Path partition3Path = new Path(RAW_DATA_PATH, PARTITION3);
    this.fileSystem.create(new Path(partition2Path, FILE1));
    this.fileSystem.create(new Path(partition3Path, FILE1));
    final HDFSPartitionManager pm = new HDFSPartitionManager(JOBNAME,
        HDFSTestConstants.BASE_METADATA_PATH, RAW_DATA_PATH, this.fileSystem);
    final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fileSystem,
        new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(),
        new AtomicBoolean(true));
    metadataManager.set(MetadataConstants.CHECKPOINT_KEY, checkpointValue);
    metadataManager.saveChanges();
    final Optional<StringValue> latestCheckpoint = getLatestCheckpoint(metadataManager);
    // Call getNextPartition once and reuse the result: the original invoked it
    // twice, repeating the filesystem scan between the two assertions.
    final Optional<String> nextPartition = pm.getNextPartition(latestCheckpoint);
    Assert.assertTrue(nextPartition.isPresent());
    // assertEquals gives a useful expected-vs-actual message on failure.
    Assert.assertEquals(PARTITION2, nextPartition.get());
}
@Test public void testComputeWorkUnitsWithNoPrexistentCheckpointsMultiplePartitions() throws IOException { // No prexisting checkpoints for the workunit calculator this.fileSystem.mkdirs(new Path(this.dataPath, PARTITION_2)); this.fileSystem.mkdirs(new Path(this.dataPath, PARTITION_3)); this.partitionManager = new HDFSPartitionManager(JOB_NAME, this.metadataPath, this.dataPath, this.fileSystem); this.metadataManager = new HDFSMetadataManager(this.fileSystem, new Path(this.metadataPath, JOB_NAME).toString(), new AtomicBoolean(true)); Assert.assertFalse(this.partitionManager.isSinglePartition()); final Optional<StringValue> latestCheckpoint = getLatestCheckpoint(this.metadataManager); Assert.assertFalse(latestCheckpoint.isPresent()); virtuallyProcessPartition(this.partitionManager, this.metadataManager, Optional.absent(), PARTITION_2); final HDFSPartitionManager partitionManager2 = new HDFSPartitionManager(JOB_NAME, this.metadataPath, this.dataPath, this.fileSystem); final HDFSMetadataManager metadataManager2 = new HDFSMetadataManager(this.fileSystem, new Path(this.metadataPath, JOB_NAME).toString(), new AtomicBoolean(true)); final Optional<StringValue> latestCheckpoint2 = getLatestCheckpoint(metadataManager2); Assert.assertTrue(latestCheckpoint2.isPresent()); virtuallyProcessPartition(partitionManager2, metadataManager2, Optional.of(new StringValue(PARTITION_2)), PARTITION_3); }
/**
 * With a checkpoint at PARTITION1 and data only in PARTITION2, the next
 * partition to process is PARTITION2.
 */
@Test
public void testGetNextPartitionSinglePartition() throws IOException, InterruptedException {
    final Path partitionPath = new Path(RAW_DATA_PATH, PARTITION2);
    this.fileSystem.create(new Path(partitionPath, FILE1));
    final StringValue checkpointValue = new StringValue(PARTITION1);
    final HDFSPartitionManager pm = new HDFSPartitionManager(JOBNAME,
        HDFSTestConstants.BASE_METADATA_PATH, RAW_DATA_PATH, this.fileSystem);
    final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fileSystem,
        new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(),
        new AtomicBoolean(true));
    metadataManager.set(MetadataConstants.CHECKPOINT_KEY, checkpointValue);
    metadataManager.saveChanges();
    final Optional<StringValue> latestCheckpoint = getLatestCheckpoint(metadataManager);
    // Call getNextPartition once and reuse the result: the original invoked it
    // twice, repeating the filesystem scan between the two assertions.
    final Optional<String> nextPartition = pm.getNextPartition(latestCheckpoint);
    Assert.assertTrue(nextPartition.isPresent());
    // assertEquals gives a useful expected-vs-actual message on failure.
    Assert.assertEquals(PARTITION2, nextPartition.get());
}
/**
 * Creates a work-unit calculator and the partition manager matching the
 * configured partition type: NONE/NORMAL use a plain {@link HDFSPartitionManager},
 * DATE uses an {@link HDFSDatePartitionManager}.
 *
 * @param hiveConf source configuration supplying job name, paths, and partition type
 * @param fs       filesystem the partition manager operates on
 * @throws IOException         if partition manager construction fails
 * @throws JobRuntimeException if the configured partition type is unsupported
 */
public ParquetWorkUnitCalculator(@NonNull final HiveSourceConfiguration hiveConf,
                                 @NonNull final FileSystem fs) throws IOException {
    this.hiveConf = hiveConf;
    final PartitionType partitionType = hiveConf.getPartitionType();
    // Fixed typo in the log message ("manger" -> "manager").
    log.info("Create partition manager with partition type: {}", partitionType);
    // switch over the enum replaces the original equals() chain.
    switch (partitionType) {
        case NONE:
        case NORMAL:
            // create partition manager internally
            this.partitionManager = new HDFSPartitionManager(hiveConf.getJobName(),
                hiveConf.getBaseMetadataPath(), hiveConf.getDataPath(), fs);
            break;
        case DATE:
            // Consistency fix: use the hiveConf parameter directly instead of the
            // original getHiveConf().getStartDate() — the field was just assigned
            // from this parameter, so behavior is unchanged.
            // NOTE(review): getPartitionKeyName().get() assumes the key name is
            // always configured for DATE partitions — confirm upstream validation.
            this.partitionManager = new HDFSDatePartitionManager(hiveConf.getJobName(),
                hiveConf.getBaseMetadataPath(), hiveConf.getDataPath(),
                hiveConf.getPartitionKeyName().get(), hiveConf.getStartDate(), fs);
            break;
        default:
            throw new JobRuntimeException(
                "Error: Partition type is not supported. Partition type: " + partitionType);
    }
}
this.fileSystem.create(new Path(partition2Path, FILE1)); final HDFSPartitionManager pm = new HDFSPartitionManager(JOBNAME, HDFSTestConstants.BASE_METADATA_PATH, RAW_DATA_PATH, metadataManager.saveChanges(); final HDFSPartitionManager pm2 = new HDFSPartitionManager(JOBNAME, HDFSTestConstants.BASE_METADATA_PATH, RAW_DATA_PATH, final HDFSPartitionManager pm3 = new HDFSPartitionManager(JOBNAME, HDFSTestConstants.BASE_METADATA_PATH, RAW_DATA_PATH,
@Test public void testComputeWorkUnitsWithExistentCheckpoint() throws IOException { this.fileSystem.mkdirs(new Path(this.dataPath, PARTITION_1)); this.fileSystem.mkdirs(new Path(this.dataPath, PARTITION_3)); this.partitionManager = new HDFSPartitionManager(JOB_NAME, this.metadataPath, this.dataPath, this.fileSystem); this.metadataManager = new HDFSMetadataManager(this.fileSystem, new Path(this.metadataPath, JOB_NAME).toString(), new AtomicBoolean(true)); // partition 1 is in effect already processed since the checkpoint is larger final StringValue val1 = new StringValue(PARTITION_2); this.metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1); this.metadataManager.saveChanges(); final ParquetWorkUnitCalculator calculator = new ParquetWorkUnitCalculator(this.hiveConfig, this.fileSystem); calculator.initPreviousRunState(this.metadataManager); final IWorkUnitCalculator.IWorkUnitCalculatorResult iresult = calculator.computeWorkUnits(); Assert.assertTrue(iresult instanceof ParquetWorkUnitCalculatorResult); final ParquetWorkUnitCalculatorResult result = (ParquetWorkUnitCalculatorResult) iresult; final List<String> workUnits = result.getWorkUnits(); Assert.assertEquals(1, workUnits.size()); Assert.assertEquals(PARTITION_3, workUnits.get(0)); Assert.assertTrue(result.getNextRunState().getPartition().isPresent()); Assert.assertEquals(PARTITION_3, result.getNextRunState().getPartition().get()); }
this.fileSystem.create(new Path(this.dataPath, dataFileName)); this.partitionManager = new HDFSPartitionManager(JOB_NAME, this.metadataPath, this.dataPath, final HDFSPartitionManager pm2 = new HDFSPartitionManager(JOB_NAME, this.metadataPath, this.dataPath,