/**
 * Remove the specified key from the metadata.
 *
 * @param key the key to remove
 * @return Optional of value if it was present, Optional.absent() if not
 */
@Override
public Optional<StringValue> remove(@NotEmpty final String key) {
    final String val = this.metadataMap.remove(key);
    return val == null ? Optional.absent() : Optional.of(new StringValue(val));
}
/**
 * Returns the value stored under the given metadata key, if present.
 *
 * @param key the key to look up
 * @return Optional of value if it was present, Optional.absent() if not
 */
public Optional<StringValue> get(@NotEmpty final String key) {
    final String val = this.metadataMap.get(key);
    return val == null ? Optional.absent() : Optional.of(new StringValue(val));
}
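// A minimal usage sketch (not part of the manager class itself) showing how callers are
// expected to handle the Guava Optional returned by get() and remove() above. The
// checkpointCleanup() name and the "last.processed.partition" key are illustrative
// assumptions, not identifiers from the original code.
private void checkpointCleanup(final IMetadataManager<StringValue> manager) {
    final Optional<StringValue> current = manager.get("last.processed.partition");
    if (current.isPresent()) {
        log.info("Current checkpoint: {}", current.get().getValue());
    }
    // remove() hands back the previous value, or Optional.absent() if the key was never set
    final Optional<StringValue> removed = manager.remove("last.processed.partition");
    log.info("Checkpoint removed: {}", removed.isPresent());
}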
/**
 * Sets the metadata for this DAG under the given key, but only if the value map is non-empty.
 *
 * @param key the metadata key to set
 * @param value the map of values to serialize as JSON and store
 */
public void set(@NotEmpty final String key, @NonNull final Map<String, String> value) {
    try {
        if (!value.isEmpty()) {
            this.metadataManager.set(key, new StringValue(mapper.writeValueAsString(value)));
        }
    } catch (JsonProcessingException e) {
        throw new MetadataException("Unable to set the JobManager metadata for key: " + key);
    }
}
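// A hedged sketch of the read-side counterpart to set() above: it fetches the stored
// StringValue and deserializes the JSON back into a Map. The getAsMap() name, the
// TypeReference usage, and the java.util.Collections / com.fasterxml.jackson.core.type
// imports are assumptions for illustration; only set()'s write path is shown in the original.
public Map<String, String> getAsMap(@NotEmpty final String key) {
    final Optional<StringValue> raw = this.metadataManager.get(key);
    if (!raw.isPresent()) {
        return Collections.emptyMap();
    }
    try {
        // mapper is assumed to be the same Jackson ObjectMapper used by set()
        return mapper.readValue(raw.get().getValue(), new TypeReference<Map<String, String>>() { });
    } catch (IOException e) {
        throw new MetadataException("Unable to read the JobManager metadata for key: " + key);
    }
}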
@Override
public void saveNextRunState(@NonNull final IMetadataManager<StringValue> metadataManager,
                             final KafkaRunState nextRunState) {
    final String topicName = this.conf.getTopicName();
    final String topicSpecificName = getTopicSpecificMetadataKey(topicName);
    nextRunState.getPartitionOffsets().entrySet().forEach(
        entry -> {
            metadataManager.set(topicSpecificName + entry.getKey(),
                new StringValue(entry.getValue().toString()));
        });
}
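// A hedged sketch of how a previous run's offsets might be read back, mirroring the write
// pattern in saveNextRunState() above. It assumes the caller already knows which partition
// ids to look up and that partition offsets are Integer -> Long; the loadOffsets() name,
// the partitionIds parameter, and the Map-based return type are illustrative assumptions,
// not the library's actual API.
private Map<Integer, Long> loadOffsets(final IMetadataManager<StringValue> metadataManager,
                                       final Collection<Integer> partitionIds) {
    final String topicSpecificName = getTopicSpecificMetadataKey(this.conf.getTopicName());
    final Map<Integer, Long> offsets = new HashMap<>();
    for (final Integer partitionId : partitionIds) {
        // keys were written as <topicSpecificName><partitionId> by saveNextRunState()
        final Optional<StringValue> value = metadataManager.get(topicSpecificName + partitionId);
        if (value.isPresent()) {
            offsets.put(partitionId, Long.parseLong(value.get().getValue()));
        }
    }
    return offsets;
}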
public void persist() {
    final Map<String, String> stats = new HashMap<>();
    if (!this.currentStat.isEmpty()) {
        this.sinkStatQ.add(this.currentStat);
    }
    // Keep only the most recent MAX_HISTORY_SIZE entries.
    while (this.sinkStatQ.size() > MAX_HISTORY_SIZE) {
        this.sinkStatQ.poll();
    }
    // Drain the queue into a map keyed by insertion order ("0", "1", ...).
    for (int i = 0; !this.sinkStatQ.isEmpty(); i++) {
        stats.put(Integer.toString(i), SinkStat.serialize(this.sinkStatQ.poll()));
    }
    this.metadataManager.set(getMetakey(), new StringValue(MapUtil.serializeMap(stats)));
}
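// A hedged sketch of a load-side counterpart to persist(): it reads the serialized history
// map back and rebuilds the stat queue in insertion order. MapUtil.deserializeMap() and
// SinkStat.deserialize() are assumed to be the inverses of the calls used above, and the
// loadHistory() name is illustrative; none of these are confirmed by the original code.
private void loadHistory() {
    final Optional<StringValue> raw = this.metadataManager.get(getMetakey());
    if (!raw.isPresent()) {
        return;
    }
    final Map<String, String> stats = MapUtil.deserializeMap(raw.get().getValue());
    // keys were written as "0", "1", ... by persist(), so re-add them in numeric order
    for (int i = 0; stats.containsKey(Integer.toString(i)); i++) {
        this.sinkStatQ.add(SinkStat.deserialize(stats.get(Integer.toString(i))));
    }
}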
public static Map<String, StringValue> deserialize(final ObjectInputStream ois) throws IOException {
    final int version = ois.readInt();
    if (version == SERIALIZATION_VERSION) {
        final Map<String, StringValue> map = new HashMap<>();
        final int numEntries = ois.readInt();
        for (int i = 0; i < numEntries; i++) {
            final String key = ois.readUTF();
            final StringValue value = new StringValue(ois.readUTF());
            log.info("Deserializing key: {} and value: {}", key, value.getValue());
            map.put(key, value);
        }
        if (ois.available() > 0) {
            throw new MetadataException("Deserialization error, not all bytes were read off the stream");
        }
        return map;
    } else {
        throw new MetadataException("Version: " + version + " is not supported");
    }
}
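// A hedged sketch of the write-side counterpart implied by deserialize() above: it writes
// the version marker, then the entry count, then each key/value pair as UTF strings, in the
// exact order the reader consumes them. The serialize() name and signature are assumptions;
// only the read path appears in the original code.
public static void serialize(final ObjectOutputStream oos,
                             final Map<String, StringValue> map) throws IOException {
    oos.writeInt(SERIALIZATION_VERSION);
    oos.writeInt(map.size());
    for (final Map.Entry<String, StringValue> entry : map.entrySet()) {
        oos.writeUTF(entry.getKey());
        oos.writeUTF(entry.getValue().getValue());
    }
    oos.flush();
}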
@Test
public void testGetNextPartitionCheckpointIsLargerThanPartition() throws InterruptedException, IOException {
    final Path partition2Path = new Path(RAW_DATA_PATH, PARTITION2);
    this.fileSystem.mkdirs(new Path(partition2Path, FILE1));
    final StringValue val1 = new StringValue(PARTITION2);
    final HDFSPartitionManager pm = new HDFSPartitionManager(JOBNAME,
        HDFSTestConstants.BASE_METADATA_PATH,
        RAW_DATA_PATH,
        this.fileSystem);
    final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fileSystem,
        new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(),
        new AtomicBoolean(true));
    metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1);
    metadataManager.saveChanges();
    final Path partition1Path = new Path(RAW_DATA_PATH, PARTITION1);
    this.fileSystem.mkdirs(new Path(partition1Path, FILE1));
    // Checkpoint value is greater than the partitions in the data folder, so there is nothing new to process.
    Assert.assertFalse(pm.getNextPartition(getLatestCheckpoint(metadataManager)).isPresent());
}
@Test
public void testGetNextPartitionWithCheckpointLaterThanStartDate() throws IOException, ParseException {
    this.fs.mkdirs(new Path(RAW_DATA_PATH, "datestr=2017-05-01"));
    this.fs.mkdirs(new Path(RAW_DATA_PATH, "datestr=2017-06-01"));
    this.fs.mkdirs(new Path(RAW_DATA_PATH, "datestr=2017-07-01"));
    final SimpleDateFormat sdf = new SimpleDateFormat(HiveSourceConfiguration.HIVE_START_DATE_FORMAT);
    final Date startDate = sdf.parse("2017-05-03");
    final StringValue val1 = new StringValue("datestr=2017-06-02");
    final HDFSDatePartitionManager pm = new HDFSDatePartitionManager(JOBNAME,
        HDFSTestConstants.BASE_METADATA_PATH,
        RAW_DATA_PATH,
        DATESTR,
        Optional.of(startDate),
        this.fs);
    final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fs,
        new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(),
        new AtomicBoolean(true));
    metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1);
    metadataManager.saveChanges();
    final Optional<StringValue> latestCheckpoint = metadataManager.get(MetadataConstants.CHECKPOINT_KEY);
    final Optional<String> partition = pm.getNextPartition(latestCheckpoint);
    Assert.assertTrue(partition.isPresent());
    Assert.assertEquals("datestr=2017-07-01", partition.get());
}
@Test
public void testGetNextPartitionWithCheckpointBeforeStartDate() throws IOException, ParseException {
    this.fs.mkdirs(new Path(RAW_DATA_PATH, "datestr=2017-05-01"));
    this.fs.mkdirs(new Path(RAW_DATA_PATH, "datestr=2017-06-01"));
    this.fs.mkdirs(new Path(RAW_DATA_PATH, "datestr=2017-07-01"));
    final SimpleDateFormat sdf = new SimpleDateFormat(HiveSourceConfiguration.HIVE_START_DATE_FORMAT);
    final Date startDate = sdf.parse("2017-06-01");
    final StringValue val1 = new StringValue("datestr=2017-05-02");
    final HDFSDatePartitionManager pm = new HDFSDatePartitionManager(JOBNAME,
        HDFSTestConstants.BASE_METADATA_PATH,
        RAW_DATA_PATH,
        DATESTR,
        Optional.of(startDate),
        this.fs);
    final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fs,
        new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(),
        new AtomicBoolean(true));
    metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1);
    metadataManager.saveChanges();
    final Optional<StringValue> latestCheckpoint = metadataManager.get(MetadataConstants.CHECKPOINT_KEY);
    final Optional<String> partition = pm.getNextPartition(latestCheckpoint);
    Assert.assertTrue(partition.isPresent());
    Assert.assertEquals("datestr=2017-07-01", partition.get());
}
@Test
public void testGetNextPartitionWithOnlyTempFileCheckpoints() throws InterruptedException, IOException {
    final Path partitionPath = new Path(RAW_DATA_PATH, PARTITION1);
    final Path filePath = new Path(partitionPath, FILE1);
    this.fileSystem.create(filePath);
    final HDFSPartitionManager pm = new HDFSPartitionManager(JOBNAME,
        HDFSTestConstants.BASE_METADATA_PATH,
        RAW_DATA_PATH,
        this.fileSystem);
    final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fileSystem,
        new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(),
        new AtomicBoolean(true));
    // If this metadata was saved successfully we would say there's no partition to process,
    // but this will be in a temp file so it will be ignored.
    metadataManager.set(MetadataConstants.CHECKPOINT_KEY, new StringValue(PARTITION2));
    metadataManager.saveChanges();
    final Optional<FileStatus> fs = metadataManager.getLatestMetadataFile();
    Assert.assertTrue(fs.isPresent());
    // Move the metadata file back to a temp location.
    this.fileSystem.rename(fs.get().getPath(),
        new Path(fs.get().getPath().toString() + MetadataConstants.TEMP_FILE_EXTENSION));
    final Optional<String> partition = pm.getNextPartition(getLatestCheckpoint(metadataManager));
    Assert.assertTrue(partition.isPresent());
    Assert.assertEquals(PARTITION1, partition.get());
}
@Test
public void testGetNextPartitionMultipleDataPartitions() throws IOException, InterruptedException {
    final StringValue val1 = new StringValue(PARTITION1);
    final Path partition2Path = new Path(RAW_DATA_PATH, PARTITION2);
    final Path partition3Path = new Path(RAW_DATA_PATH, PARTITION3);
    this.fileSystem.create(new Path(partition2Path, FILE1));
    this.fileSystem.create(new Path(partition3Path, FILE1));
    final HDFSPartitionManager pm = new HDFSPartitionManager(JOBNAME,
        HDFSTestConstants.BASE_METADATA_PATH,
        RAW_DATA_PATH,
        this.fileSystem);
    final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fileSystem,
        new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(),
        new AtomicBoolean(true));
    metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1);
    metadataManager.saveChanges();
    final Optional<StringValue> latestCheckpoint = getLatestCheckpoint(metadataManager);
    Assert.assertTrue(pm.getNextPartition(latestCheckpoint).isPresent());
    Assert.assertEquals(PARTITION2, pm.getNextPartition(latestCheckpoint).get());
}
@Test
public void testGetNextPartitionWithSmallerExistentCheckpoint() throws IOException, InterruptedException {
    final StringValue val1 = new StringValue("datestr=2017-05-01");
    final Path partition1 = new Path(RAW_DATA_PATH, "datestr=2017-05-02");
    this.fs.mkdirs(new Path(partition1, FILE1));
    final HDFSDatePartitionManager pm = new HDFSDatePartitionManager(JOBNAME,
        HDFSTestConstants.BASE_METADATA_PATH,
        RAW_DATA_PATH,
        DATESTR,
        Optional.absent(),
        this.fs);
    final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fs,
        new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(),
        new AtomicBoolean(true));
    metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1);
    metadataManager.saveChanges();
    final Optional<StringValue> latestCheckpoint = metadataManager.get(MetadataConstants.CHECKPOINT_KEY);
    final Optional<String> partition = pm.getNextPartition(latestCheckpoint);
    Assert.assertTrue(partition.isPresent());
    Assert.assertEquals("datestr=2017-05-02", partition.get());
}
@Test
public void testGetNextPartitionWithMultipleDatePartitionsAndOneCheckpoint() throws IOException {
    // Job has multiple data partitions: one is less than the checkpoint and the other is larger.
    final Path partition1 = new Path(RAW_DATA_PATH, "datestr=2017-05-01");
    final Path partition2 = new Path(RAW_DATA_PATH, "datestr=2017-05-03");
    this.fs.mkdirs(new Path(partition1, FILE1));
    this.fs.mkdirs(new Path(partition2, FILE1));
    final StringValue val1 = new StringValue("datestr=2017-05-02");
    final HDFSDatePartitionManager pm = new HDFSDatePartitionManager(JOBNAME,
        HDFSTestConstants.BASE_METADATA_PATH,
        RAW_DATA_PATH,
        DATESTR,
        Optional.absent(),
        this.fs);
    final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fs,
        new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(),
        new AtomicBoolean(true));
    metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1);
    metadataManager.saveChanges();
    final Optional<StringValue> latestCheckpoint = metadataManager.get(MetadataConstants.CHECKPOINT_KEY);
    final Optional<String> partition = pm.getNextPartition(latestCheckpoint);
    Assert.assertTrue(partition.isPresent());
    Assert.assertEquals("datestr=2017-05-03", partition.get());
}
@Test
public void testGetNextPartitionWithLargerExistentCheckpoint() throws IOException, InterruptedException {
    // In this case the checkpoint is larger than the data partition, so there is no "next" partition.
    final StringValue val1 = new StringValue("datestr=2017-05-02");
    final Path partition1 = new Path(RAW_DATA_PATH, "datestr=2017-05-01");
    this.fs.mkdirs(new Path(partition1, FILE1));
    final HDFSDatePartitionManager pm = new HDFSDatePartitionManager(JOBNAME,
        HDFSTestConstants.BASE_METADATA_PATH,
        RAW_DATA_PATH,
        DATESTR,
        Optional.absent(),
        this.fs);
    final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fs,
        new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(),
        new AtomicBoolean(true));
    metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1);
    metadataManager.saveChanges();
    final Optional<StringValue> latestCheckpoint = metadataManager.get(MetadataConstants.CHECKPOINT_KEY);
    final Optional<String> partition = pm.getNextPartition(latestCheckpoint);
    Assert.assertFalse(partition.isPresent());
}
@Test
public void testGetNextPartitionSinglePartition() throws IOException, InterruptedException {
    final Path partitionPath = new Path(RAW_DATA_PATH, PARTITION2);
    final Path filePath = new Path(partitionPath, FILE1);
    this.fileSystem.create(filePath);
    final StringValue val1 = new StringValue(PARTITION1);
    final HDFSPartitionManager pm = new HDFSPartitionManager(JOBNAME,
        HDFSTestConstants.BASE_METADATA_PATH,
        RAW_DATA_PATH,
        this.fileSystem);
    final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fileSystem,
        new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(),
        new AtomicBoolean(true));
    metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1);
    metadataManager.saveChanges();
    final Optional<StringValue> latestCheckpoint = getLatestCheckpoint(metadataManager);
    Assert.assertTrue(pm.getNextPartition(latestCheckpoint).isPresent());
    Assert.assertEquals(PARTITION2, pm.getNextPartition(latestCheckpoint).get());
}
@Test
public void testHDFSOverwriteCheckpointValue() throws IOException, InterruptedException {
    final StringValue val1 = new StringValue("testVal");
    this.metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1);
    final StringValue val2 = new StringValue("testVal2");
    this.metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val2);
    final Optional<StringValue> readValue = this.metadataManager.get(MetadataConstants.CHECKPOINT_KEY);
    Assert.assertTrue(readValue.isPresent());
    Assert.assertEquals("testVal2", readValue.get().getValue());
    this.metadataManager.saveChanges();
    final Optional<FileStatus> fs = this.metadataManager.getLatestMetadataFile();
    Assert.assertTrue(fs.isPresent());
    final Map<String, StringValue> loadedMap = this.metadataManager.loadMetadata(fs.get().getPath());
    validateDeserializedMapEqualsInMemoryMap(loadedMap);
}
@Test
public void testComputeWorkUnitsWithNoPrexistentCheckpointsMultiplePartitions() throws IOException {
    // No preexisting checkpoints for the work unit calculator.
    this.fileSystem.mkdirs(new Path(this.dataPath, PARTITION_2));
    this.fileSystem.mkdirs(new Path(this.dataPath, PARTITION_3));
    this.partitionManager = new HDFSPartitionManager(JOB_NAME,
        this.metadataPath,
        this.dataPath,
        this.fileSystem);
    this.metadataManager = new HDFSMetadataManager(this.fileSystem,
        new Path(this.metadataPath, JOB_NAME).toString(),
        new AtomicBoolean(true));
    Assert.assertFalse(this.partitionManager.isSinglePartition());
    final Optional<StringValue> latestCheckpoint = getLatestCheckpoint(this.metadataManager);
    Assert.assertFalse(latestCheckpoint.isPresent());
    virtuallyProcessPartition(this.partitionManager, this.metadataManager, Optional.absent(), PARTITION_2);
    final HDFSPartitionManager partitionManager2 = new HDFSPartitionManager(JOB_NAME,
        this.metadataPath,
        this.dataPath,
        this.fileSystem);
    final HDFSMetadataManager metadataManager2 = new HDFSMetadataManager(this.fileSystem,
        new Path(this.metadataPath, JOB_NAME).toString(),
        new AtomicBoolean(true));
    final Optional<StringValue> latestCheckpoint2 = getLatestCheckpoint(metadataManager2);
    Assert.assertTrue(latestCheckpoint2.isPresent());
    virtuallyProcessPartition(partitionManager2, metadataManager2, Optional.of(new StringValue(PARTITION_2)), PARTITION_3);
}
@Test
public void testHDFSReadWriteSingleMetadataFile() throws IOException {
    // Test in memory
    final StringValue val = new StringValue("testVal");
    this.metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val);
    final Optional<StringValue> readValue = this.metadataManager.get(MetadataConstants.CHECKPOINT_KEY);
    Assert.assertTrue(readValue.isPresent());
    Assert.assertEquals("testVal", readValue.get().getValue());
    this.metadataManager.set("foo", new StringValue("bar"));
    // Serialize the metadata map to a file
    this.metadataManager.saveChanges();
    final Optional<FileStatus> fs = this.metadataManager.getLatestMetadataFile();
    Assert.assertTrue(fs.isPresent());
    // Deserialize the metadata map and check the contents are the same
    final Map<String, StringValue> loadedMap = this.metadataManager.loadMetadata(fs.get().getPath());
    validateDeserializedMapEqualsInMemoryMap(loadedMap);
}
@Test
public void testComputeWorkUnitsWithExistentCheckpoint() throws IOException {
    this.fileSystem.mkdirs(new Path(this.dataPath, PARTITION_1));
    this.fileSystem.mkdirs(new Path(this.dataPath, PARTITION_3));
    this.partitionManager = new HDFSPartitionManager(JOB_NAME,
        this.metadataPath,
        this.dataPath,
        this.fileSystem);
    this.metadataManager = new HDFSMetadataManager(this.fileSystem,
        new Path(this.metadataPath, JOB_NAME).toString(),
        new AtomicBoolean(true));
    // Partition 1 is in effect already processed since the checkpoint is larger.
    final StringValue val1 = new StringValue(PARTITION_2);
    this.metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1);
    this.metadataManager.saveChanges();
    final ParquetWorkUnitCalculator calculator = new ParquetWorkUnitCalculator(this.hiveConfig, this.fileSystem);
    calculator.initPreviousRunState(this.metadataManager);
    final IWorkUnitCalculator.IWorkUnitCalculatorResult iresult = calculator.computeWorkUnits();
    Assert.assertTrue(iresult instanceof ParquetWorkUnitCalculatorResult);
    final ParquetWorkUnitCalculatorResult result = (ParquetWorkUnitCalculatorResult) iresult;
    final List<String> workUnits = result.getWorkUnits();
    Assert.assertEquals(1, workUnits.size());
    Assert.assertEquals(PARTITION_3, workUnits.get(0));
    Assert.assertTrue(result.getNextRunState().getPartition().isPresent());
    Assert.assertEquals(PARTITION_3, result.getNextRunState().getPartition().get());
}
@Test
public void testDeletionIsPropagated() throws Exception {
    final StringValue val1 = new StringValue("testVal");
    this.metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1);
    this.metadataManager.saveChanges();
    Optional<FileStatus> fs = this.metadataManager.getLatestMetadataFile();
    Assert.assertTrue(fs.isPresent());
    Map<String, StringValue> loadedMap = this.metadataManager.loadMetadata(fs.get().getPath());
    validateDeserializedMapEqualsInMemoryMap(loadedMap);
    // Reload the configuration.
    setupTest();
    Assert.assertTrue(this.metadataManager.get(MetadataConstants.CHECKPOINT_KEY).isPresent());
    this.metadataManager.remove(MetadataConstants.CHECKPOINT_KEY);
    Assert.assertFalse(this.metadataManager.get(MetadataConstants.CHECKPOINT_KEY).isPresent());
    this.metadataManager.saveChanges();
    fs = this.metadataManager.getLatestMetadataFile();
    Assert.assertFalse(this.metadataManager.get(MetadataConstants.CHECKPOINT_KEY).isPresent());
    loadedMap = this.metadataManager.loadMetadata(fs.get().getPath());
    validateDeserializedMapEqualsInMemoryMap(loadedMap);
}