/**
 * Asserts that a deserialized metadata map matches what {@code this.metadataManager} holds.
 *
 * <p>Every key of {@code deserializedMap} must be present in the metadata manager with an equal
 * value, and the manager's full key set must equal the key set of {@code deserializedMap}.
 *
 * @param deserializedMap metadata entries read back from a metadata file
 */
private void validateDeserializedMapEqualsInMemoryMap(final Map<String, StringValue> deserializedMap) {
    for (final Map.Entry<String, StringValue> loaded : deserializedMap.entrySet()) {
        final String key = loaded.getKey();
        final Optional<StringValue> inMemory = this.metadataManager.get(key);
        Assert.assertTrue(inMemory.isPresent());

        // The in-memory value is treated as the expected side of the comparison.
        final StringValue expectedValue = inMemory.get();
        final StringValue loadedValue = loaded.getValue();
        Assert.assertEquals(expectedValue.getValue(), loadedValue.getValue());
    }

    // Equal key sets rule out keys that exist only in memory.
    Assert.assertEquals(this.metadataManager.getAllKeys(), deserializedMap.keySet());
}
/**
 * Creates partition directories {@code datestr=2017-05-01} and {@code datestr=2017-06-01},
 * configures a start date of 2017-05-15, and expects {@code getNextPartition} to return
 * {@code datestr=2017-06-01}. This test never writes a checkpoint itself; it passes along
 * whatever the metadata manager returns for the checkpoint key.
 */
@Test
public void testGetNextPartitionWithStartDateAndNoCheckpoint() throws IOException, ParseException {
    final Path olderPartition = new Path(RAW_DATA_PATH, "datestr=2017-05-01");
    final Path newerPartition = new Path(RAW_DATA_PATH, "datestr=2017-06-01");
    this.fs.mkdirs(olderPartition);
    this.fs.mkdirs(newerPartition);

    final Date startDate =
        new SimpleDateFormat(HiveSourceConfiguration.HIVE_START_DATE_FORMAT).parse("2017-05-15");

    final HDFSDatePartitionManager partitionManager = new HDFSDatePartitionManager(
        JOBNAME,
        HDFSTestConstants.BASE_METADATA_PATH,
        RAW_DATA_PATH,
        DATESTR,
        Optional.of(startDate),
        this.fs);

    final String jobMetadataPath = new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString();
    final HDFSMetadataManager jobMetadata =
        new HDFSMetadataManager(this.fs, jobMetadataPath, new AtomicBoolean(true));

    final Optional<StringValue> checkpoint = jobMetadata.get(MetadataConstants.CHECKPOINT_KEY);
    final Optional<String> nextPartition = partitionManager.getNextPartition(checkpoint);

    Assert.assertTrue(nextPartition.isPresent());
    Assert.assertEquals("datestr=2017-06-01", nextPartition.get());
}
@Test public void testDeletionIsPropagated() throws Exception { final StringValue val1 = new StringValue("testVal"); this.metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1); this.metadataManager.saveChanges(); Optional<FileStatus> fs = this.metadataManager.getLatestMetadataFile(); Assert.assertTrue(fs.isPresent()); Map<String, StringValue> loadedMap = this.metadataManager.loadMetadata(fs.get().getPath()); validateDeserializedMapEqualsInMemoryMap(loadedMap); // reload the configuration setupTest(); Assert.assertTrue(this.metadataManager.get(MetadataConstants.CHECKPOINT_KEY).isPresent()); this.metadataManager.remove(MetadataConstants.CHECKPOINT_KEY); Assert.assertFalse(this.metadataManager.get(MetadataConstants.CHECKPOINT_KEY).isPresent()); this.metadataManager.saveChanges(); fs = this.metadataManager.getLatestMetadataFile(); Assert.assertFalse(this.metadataManager.get(MetadataConstants.CHECKPOINT_KEY).isPresent()); loadedMap = this.metadataManager.loadMetadata(fs.get().getPath()); validateDeserializedMapEqualsInMemoryMap(loadedMap); }
/**
 * Sets the checkpoint key twice and verifies that the second value wins in memory, then saves
 * the metadata and verifies that the latest metadata file matches the in-memory state.
 */
@Test
public void testHDFSOverwriteCheckpointValue() throws IOException, InterruptedException {
    final StringValue val1 = new StringValue("testVal");
    this.metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1);

    final StringValue val2 = new StringValue("testVal2");
    this.metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val2);

    final Optional<StringValue> readValue = this.metadataManager.get(MetadataConstants.CHECKPOINT_KEY);
    Assert.assertTrue(readValue.isPresent());
    // assertEquals reports expected vs. actual on failure and tolerates a null actual value,
    // unlike assertTrue(actual.equals(expected)).
    Assert.assertEquals("testVal2", readValue.get().getValue());

    this.metadataManager.saveChanges();

    final Optional<FileStatus> fs = this.metadataManager.getLatestMetadataFile();
    Assert.assertTrue(fs.isPresent());

    final Map<String, StringValue> loadedMap = this.metadataManager.loadMetadata(fs.get().getPath());
    validateDeserializedMapEqualsInMemoryMap(loadedMap);
}
/**
 * Creates partition directories {@code datestr=2017-05-01} and {@code datestr=2017-05-02},
 * configures no start date, and expects {@code getNextPartition} to return
 * {@code datestr=2017-05-01}. This test does not write a checkpoint itself.
 */
@Test
public void testGetNextPartitionWitMultipleDatePartitionsAndNoCheckpoint() throws IOException {
    // Directories are created in the listed order.
    final String[] partitionNames = {"datestr=2017-05-01", "datestr=2017-05-02"};
    for (final String partitionName : partitionNames) {
        this.fs.mkdirs(new Path(RAW_DATA_PATH, partitionName));
    }

    final HDFSDatePartitionManager partitionManager = new HDFSDatePartitionManager(
        JOBNAME,
        HDFSTestConstants.BASE_METADATA_PATH,
        RAW_DATA_PATH,
        DATESTR,
        Optional.absent(),
        this.fs);

    final String jobMetadataPath = new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString();
    final HDFSMetadataManager jobMetadata =
        new HDFSMetadataManager(this.fs, jobMetadataPath, new AtomicBoolean(true));

    final Optional<StringValue> checkpoint = jobMetadata.get(MetadataConstants.CHECKPOINT_KEY);
    final Optional<String> nextPartition = partitionManager.getNextPartition(checkpoint);

    Assert.assertTrue(nextPartition.isPresent());
    Assert.assertEquals("datestr=2017-05-01", nextPartition.get());
}
@Test public void testHDFSReadWriteSingleMetadataFile() throws IOException { // Test in memory final StringValue val = new StringValue("testVal"); this.metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val); final Optional<StringValue> readValue = this.metadataManager.get(MetadataConstants.CHECKPOINT_KEY); Assert.assertTrue(readValue.isPresent()); Assert.assertTrue(readValue.get().getValue().equals("testVal")); this.metadataManager.set("foo", new StringValue("bar")); // Serialize the metadata map to a file this.metadataManager.saveChanges(); final Optional<FileStatus> fs = this.metadataManager.getLatestMetadataFile(); Assert.assertTrue(fs.isPresent()); // Deserialize the metadata map and check contents are the same final Map<String, StringValue> loadedMap = this.metadataManager.loadMetadata(fs.get().getPath()); validateDeserializedMapEqualsInMemoryMap(loadedMap); }
/**
 * Creates a single partition directory {@code datestr=2017-05-01}, configures no start date,
 * and expects {@code getNextPartition} to return that partition. This test does not write a
 * checkpoint itself.
 */
@Test
public void testGetNextPartitionWithNonexistentCheckpoint() throws IOException {
    this.fs.mkdirs(new Path(RAW_DATA_PATH, "datestr=2017-05-01"));

    final HDFSDatePartitionManager partitionManager = new HDFSDatePartitionManager(
        JOBNAME,
        HDFSTestConstants.BASE_METADATA_PATH,
        RAW_DATA_PATH,
        DATESTR,
        Optional.absent(),
        this.fs);

    final String jobMetadataPath = new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString();
    final HDFSMetadataManager jobMetadata =
        new HDFSMetadataManager(this.fs, jobMetadataPath, new AtomicBoolean(true));

    final Optional<StringValue> checkpoint = jobMetadata.get(MetadataConstants.CHECKPOINT_KEY);
    final Optional<String> nextPartition = partitionManager.getNextPartition(checkpoint);

    Assert.assertTrue(nextPartition.isPresent());
    Assert.assertEquals("datestr=2017-05-01", nextPartition.get());
}
/**
 * Creates partition directories for 2017-05-01, 2017-06-01 and 2017-07-01, configures a start
 * date of 2017-05-03, stores and saves the checkpoint {@code datestr=2017-06-02}, and expects
 * {@code getNextPartition} to return {@code datestr=2017-07-01}.
 */
@Test
public void testGetNextPartitionWithCheckpointLaterThanStartDate() throws IOException, ParseException {
    // Directories are created in the listed order.
    final String[] partitionNames = {"datestr=2017-05-01", "datestr=2017-06-01", "datestr=2017-07-01"};
    for (final String partitionName : partitionNames) {
        this.fs.mkdirs(new Path(RAW_DATA_PATH, partitionName));
    }

    final Date startDate =
        new SimpleDateFormat(HiveSourceConfiguration.HIVE_START_DATE_FORMAT).parse("2017-05-03");
    final StringValue checkpointValue = new StringValue("datestr=2017-06-02");

    final HDFSDatePartitionManager partitionManager = new HDFSDatePartitionManager(
        JOBNAME,
        HDFSTestConstants.BASE_METADATA_PATH,
        RAW_DATA_PATH,
        DATESTR,
        Optional.of(startDate),
        this.fs);

    final String jobMetadataPath = new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString();
    final HDFSMetadataManager jobMetadata =
        new HDFSMetadataManager(this.fs, jobMetadataPath, new AtomicBoolean(true));

    jobMetadata.set(MetadataConstants.CHECKPOINT_KEY, checkpointValue);
    jobMetadata.saveChanges();

    final Optional<StringValue> checkpoint = jobMetadata.get(MetadataConstants.CHECKPOINT_KEY);
    final Optional<String> nextPartition = partitionManager.getNextPartition(checkpoint);

    Assert.assertTrue(nextPartition.isPresent());
    Assert.assertEquals("datestr=2017-07-01", nextPartition.get());
}
/**
 * Creates partition directories for 2017-05-01, 2017-06-01 and 2017-07-01, configures a start
 * date of 2017-06-01, stores and saves the checkpoint {@code datestr=2017-05-02}, and expects
 * {@code getNextPartition} to return {@code datestr=2017-07-01}.
 */
@Test
public void testGetNextPartitionWithCheckpointBeforeThanStartDate() throws IOException, ParseException {
    // Directories are created in the listed order.
    final String[] partitionNames = {"datestr=2017-05-01", "datestr=2017-06-01", "datestr=2017-07-01"};
    for (final String partitionName : partitionNames) {
        this.fs.mkdirs(new Path(RAW_DATA_PATH, partitionName));
    }

    final Date startDate =
        new SimpleDateFormat(HiveSourceConfiguration.HIVE_START_DATE_FORMAT).parse("2017-06-01");
    final StringValue checkpointValue = new StringValue("datestr=2017-05-02");

    final HDFSDatePartitionManager partitionManager = new HDFSDatePartitionManager(
        JOBNAME,
        HDFSTestConstants.BASE_METADATA_PATH,
        RAW_DATA_PATH,
        DATESTR,
        Optional.of(startDate),
        this.fs);

    final String jobMetadataPath = new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString();
    final HDFSMetadataManager jobMetadata =
        new HDFSMetadataManager(this.fs, jobMetadataPath, new AtomicBoolean(true));

    jobMetadata.set(MetadataConstants.CHECKPOINT_KEY, checkpointValue);
    jobMetadata.saveChanges();

    final Optional<StringValue> checkpoint = jobMetadata.get(MetadataConstants.CHECKPOINT_KEY);
    final Optional<String> nextPartition = partitionManager.getNextPartition(checkpoint);

    Assert.assertTrue(nextPartition.isPresent());
    Assert.assertEquals("datestr=2017-07-01", nextPartition.get());
}
/**
 * Stores and saves the checkpoint {@code datestr=2017-05-01} while the only data partition is
 * {@code datestr=2017-05-02}, and expects {@code getNextPartition} to return
 * {@code datestr=2017-05-02}.
 */
@Test
public void testGetNextPartitionWithSmallerExistentCheckpoint() throws IOException, InterruptedException {
    final StringValue checkpointValue = new StringValue("datestr=2017-05-01");

    final Path dataPartition = new Path(RAW_DATA_PATH, "datestr=2017-05-02");
    final Path fileInPartition = new Path(dataPartition, FILE1);
    this.fs.mkdirs(fileInPartition);

    final HDFSDatePartitionManager partitionManager = new HDFSDatePartitionManager(
        JOBNAME,
        HDFSTestConstants.BASE_METADATA_PATH,
        RAW_DATA_PATH,
        DATESTR,
        Optional.absent(),
        this.fs);

    final String jobMetadataPath = new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString();
    final HDFSMetadataManager jobMetadata =
        new HDFSMetadataManager(this.fs, jobMetadataPath, new AtomicBoolean(true));

    jobMetadata.set(MetadataConstants.CHECKPOINT_KEY, checkpointValue);
    jobMetadata.saveChanges();

    final Optional<StringValue> checkpoint = jobMetadata.get(MetadataConstants.CHECKPOINT_KEY);
    final Optional<String> nextPartition = partitionManager.getNextPartition(checkpoint);

    Assert.assertTrue(nextPartition.isPresent());
    Assert.assertEquals("datestr=2017-05-02", nextPartition.get());
}
@Test public void testGetNextPartitionWithMultipleDatePartitionsAndOneCheckpoint() throws IOException { // Job has multiple data partitions, one is less than checkpoint and the other is larger final Path partition1 = new Path(RAW_DATA_PATH, "datestr=2017-05-01"); final Path partition2 = new Path(RAW_DATA_PATH, "datestr=2017-05-03"); this.fs.mkdirs(new Path(partition1, FILE1)); this.fs.mkdirs(new Path(partition2, FILE1)); final StringValue val1 = new StringValue("datestr=2017-05-02"); final HDFSDatePartitionManager pm = new HDFSDatePartitionManager(JOBNAME, HDFSTestConstants.BASE_METADATA_PATH, RAW_DATA_PATH, DATESTR, Optional.absent(), this.fs); final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fs, new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(), new AtomicBoolean(true)); metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1); metadataManager.saveChanges(); final Optional<StringValue> latestCheckpoint = metadataManager.get(MetadataConstants.CHECKPOINT_KEY); final Optional<String> partition = pm.getNextPartition(latestCheckpoint); Assert.assertTrue(partition.isPresent()); Assert.assertEquals("datestr=2017-05-03", partition.get()); }
@Test public void testGetNextPartitionWithLargerExistentCheckpoint() throws IOException, InterruptedException { // In this case the checkpoint is larger than the data partition so there is no "next" partition final StringValue val1 = new StringValue("datestr=2017-05-02"); final Path partition1 = new Path(RAW_DATA_PATH, "datestr=2017-05-01"); this.fs.mkdirs(new Path(partition1, FILE1)); final HDFSDatePartitionManager pm = new HDFSDatePartitionManager(JOBNAME, HDFSTestConstants.BASE_METADATA_PATH, RAW_DATA_PATH, DATESTR, Optional.absent(), this.fs); final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fs, new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(), new AtomicBoolean(true)); metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1); metadataManager.saveChanges(); final Optional<StringValue> latestCheckpoint = metadataManager.get(MetadataConstants.CHECKPOINT_KEY); final Optional<String> partition = pm.getNextPartition(latestCheckpoint); Assert.assertFalse(partition.isPresent()); } }
new AtomicBoolean(true)); final Optional<StringValue> latestCheckpoint2 = metadataManager2.get(MetadataConstants.CHECKPOINT_KEY); Assert.assertTrue(latestCheckpoint2.isPresent()); Assert.assertEquals(PARTITION1, latestCheckpoint2.get().getValue()); new AtomicBoolean(true)); final Optional<StringValue> latestCheckpoint3 = metadataManager3.get(MetadataConstants.CHECKPOINT_KEY); Assert.assertTrue(latestCheckpoint3.isPresent()); Assert.assertEquals(PARTITION2, latestCheckpoint3.get().getValue());
/**
 * Runs one work-unit calculation cycle against {@code metadataManager} and asserts each step.
 *
 * <p>Steps: verify the current checkpoint equals {@code expectedLatestCheckpoint}; build a
 * {@code ParquetWorkUnitCalculator}, initialise it from the metadata manager and compute work
 * units; verify exactly one work unit equal to {@code expectedNextPartition} is produced and
 * that the next run state carries the same partition; save the next run state and verify the
 * checkpoint key now holds {@code expectedNextPartition}; finally save the metadata changes.
 *
 * @param partitionManager not referenced by this method's body
 * @param metadataManager metadata manager that is read, updated and saved
 * @param expectedLatestCheckpoint checkpoint expected before the calculation runs
 * @param expectedNextPartition partition expected as the single work unit and new checkpoint
 * @throws IOException declared by this helper; the source is one of the calls below
 */
private void virtuallyProcessPartition(@NonNull final HDFSPartitionManager partitionManager,
                                       @NonNull final HDFSMetadataManager metadataManager,
                                       @NotEmpty final Optional<StringValue> expectedLatestCheckpoint,
                                       @NotEmpty final String expectedNextPartition) throws IOException {
    // Precondition: the stored checkpoint is what the caller expects.
    Assert.assertEquals(expectedLatestCheckpoint, getLatestCheckpoint(metadataManager));

    final ParquetWorkUnitCalculator workUnitCalculator =
        new ParquetWorkUnitCalculator(this.hiveConfig, this.fileSystem);
    workUnitCalculator.initPreviousRunState(metadataManager);
    final ParquetWorkUnitCalculatorResult calcResult = workUnitCalculator.computeWorkUnits();

    // Exactly one work unit, and it is the expected partition.
    final List<String> computedUnits = calcResult.getWorkUnits();
    Assert.assertEquals(1, computedUnits.size());
    Assert.assertEquals(expectedNextPartition, computedUnits.get(0));

    // The next run state must point at the same partition.
    Assert.assertTrue(calcResult.getNextRunState().getPartition().isPresent());
    Assert.assertEquals(expectedNextPartition, calcResult.getNextRunState().getPartition().get());

    // Saving the run state must update the checkpoint key in the metadata manager.
    workUnitCalculator.saveNextRunState(metadataManager, calcResult.getNextRunState());
    final Optional<StringValue> savedCheckpoint = metadataManager.get(MetadataConstants.CHECKPOINT_KEY);
    Assert.assertEquals(expectedNextPartition, savedCheckpoint.get().getValue());

    metadataManager.saveChanges();
}
final IWorkUnitCalculator.IWorkUnitCalculatorResult<String, HiveRunState> iresult = calc.computeWorkUnits(); calc.saveNextRunState(metadataManager2, iresult.getNextRunState()); Assert.assertEquals(this.dataPath, metadataManager2.get(MetadataConstants.CHECKPOINT_KEY).get().getValue());