/**
 * Constructs a JobManagerMetadataTracker backed by the storage type named in the configuration.
 *
 * @param config configuration holding the metadata storage type and (for HDFS) the base path
 * @throws IOException if the underlying metadata manager fails to initialize
 * @throws MissingPropertyException if the storage type or the HDFS base path is not configured
 */
public JobManagerMetadataTracker(@NonNull final Configuration config) throws IOException {
    final Optional<String> sourceType = config.getProperty(MetadataConstants.JOBMANAGER_METADATA_STORAGE);
    if (!sourceType.isPresent()) {
        throw new MissingPropertyException("Source information for the JobManager Metadata Tracker is missing.");
    }
    // Fix: equalsIgnoreCase avoids the locale-sensitive toUpperCase() comparison
    // (e.g. the Turkish dotless-i problem) while preserving case-insensitive matching.
    if (sourceType.get().equalsIgnoreCase(MetadataConstants.JOBMANAGER_METADATA_SOURCE_HDFS)) {
        final Optional<String> basePath = config.getProperty(MetadataConstants.JOBMANAGER_METADATA_HDFS_BASEPATH);
        if (!basePath.isPresent()) {
            throw new MissingPropertyException("Base Path for HDFS JobManager Metadata Tracker is missing.");
        }
        this.metadataManager = new HDFSMetadataManager(FSUtils.getFs(config), basePath.get(), new AtomicBoolean(true));
        this.shouldSaveChanges = new AtomicBoolean(true);
    }
    // NOTE(review): any storage type other than HDFS silently leaves this tracker
    // uninitialized (metadataManager stays null) — consider failing fast here;
    // confirm whether callers rely on the silent fall-through before changing it.
}
/**
 * Loads and deserializes the metadata map stored in the given file.
 *
 * <p>This method assumes that the path points explicitly to a metadata file and is
 * not a directory.
 *
 * @param path location of the serialized metadata file
 * @return map of metadata keys to their deserialized values
 * @throws IOException if the file cannot be opened or deserialized
 */
@VisibleForTesting
public Map<String, StringValue> loadMetadata(final Path path) throws IOException {
    // Fix: the original Javadoc sat between the annotation and the signature, where the
    // javadoc tool does not attach it; a doc comment must precede the annotations.
    // A single try-with-resources replaces the nested ones: both streams are closed
    // in reverse declaration order.
    try (final InputStream is = new BufferedInputStream(this.fileSystem.open(path));
         final ObjectInputStream input = new ObjectInputStream(is)) {
        return deserialize(input);
    }
}
@Test public void testDeletionIsPropagated() throws Exception { final StringValue val1 = new StringValue("testVal"); this.metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1); this.metadataManager.saveChanges(); Optional<FileStatus> fs = this.metadataManager.getLatestMetadataFile(); Assert.assertTrue(fs.isPresent()); Map<String, StringValue> loadedMap = this.metadataManager.loadMetadata(fs.get().getPath()); validateDeserializedMapEqualsInMemoryMap(loadedMap); // reload the configuration setupTest(); Assert.assertTrue(this.metadataManager.get(MetadataConstants.CHECKPOINT_KEY).isPresent()); this.metadataManager.remove(MetadataConstants.CHECKPOINT_KEY); Assert.assertFalse(this.metadataManager.get(MetadataConstants.CHECKPOINT_KEY).isPresent()); this.metadataManager.saveChanges(); fs = this.metadataManager.getLatestMetadataFile(); Assert.assertFalse(this.metadataManager.get(MetadataConstants.CHECKPOINT_KEY).isPresent()); loadedMap = this.metadataManager.loadMetadata(fs.get().getPath()); validateDeserializedMapEqualsInMemoryMap(loadedMap); }
/**
 * Loads the metadata map from the most recent metadata file under the base metadata path.
 *
 * @return map of metadata keys to values; an empty map when no metadata file exists
 * @throws IOException if the latest metadata file cannot be read
 */
public Map<String, StringValue> loadMetadata() throws IOException {
    log.info("Attempting to load metadata");
    final Optional<FileStatus> fs = getLatestMetadataFile();
    if (fs.isPresent()) {
        log.info("Loading metadata from: {}", fs.get().getPath());
        return loadMetadata(fs.get().getPath());
    }
    // No file yet (e.g. first run) — start from an empty map rather than failing.
    log.info("No metadata file found");
    // Diamond operator: type arguments are inferred from the declared return type.
    return new HashMap<>();
}
/**
 * With a checkpoint that sorts before the single data partition, the partition
 * manager should report that partition as the next one to process.
 */
@Test
public void testGetNextPartitionWithSmallerExistentCheckpoint() throws IOException, InterruptedException {
    final StringValue checkpoint = new StringValue("datestr=2017-05-01");
    final Path dataPartition = new Path(RAW_DATA_PATH, "datestr=2017-05-02");
    this.fs.mkdirs(new Path(dataPartition, FILE1));

    final HDFSDatePartitionManager pm =
        new HDFSDatePartitionManager(JOBNAME,
            HDFSTestConstants.BASE_METADATA_PATH,
            RAW_DATA_PATH,
            DATESTR,
            Optional.absent(),
            this.fs);
    final HDFSMetadataManager metadataManager =
        new HDFSMetadataManager(this.fs,
            new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(),
            new AtomicBoolean(true));
    metadataManager.set(MetadataConstants.CHECKPOINT_KEY, checkpoint);
    metadataManager.saveChanges();

    final Optional<StringValue> latestCheckpoint = metadataManager.get(MetadataConstants.CHECKPOINT_KEY);
    final Optional<String> nextPartition = pm.getNextPartition(latestCheckpoint);
    Assert.assertTrue(nextPartition.isPresent());
    Assert.assertEquals("datestr=2017-05-02", nextPartition.get());
}
@Test public void testGetNextPartitionCheckpointIsLargerThanPartition() throws InterruptedException, IOException { final Path partition2Path = new Path(RAW_DATA_PATH, PARTITION2); this.fileSystem.mkdirs(new Path(partition2Path, FILE1)); final StringValue val1 = new StringValue(PARTITION2); final HDFSPartitionManager pm = new HDFSPartitionManager(JOBNAME, HDFSTestConstants.BASE_METADATA_PATH, RAW_DATA_PATH, this.fileSystem); final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fileSystem, new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(), new AtomicBoolean(true)); metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1); metadataManager.saveChanges(); final Path partition1Path = new Path(RAW_DATA_PATH, PARTITION1); this.fileSystem.mkdirs(new Path(partition1Path, FILE1)); // Checkpoint value is greater than the partitions in the data folder so nothing new to process Assert.assertFalse(pm.getNextPartition(getLatestCheckpoint(metadataManager)).isPresent()); }
@Test public void testGetNextPartitionWithOnlyTempFileCheckpoints() throws InterruptedException, IOException { final Path partitionPath = new Path(RAW_DATA_PATH, PARTITION1); final Path filePath = new Path(partitionPath, FILE1); this.fileSystem.create(filePath); final HDFSPartitionManager pm = new HDFSPartitionManager(JOBNAME, HDFSTestConstants.BASE_METADATA_PATH, RAW_DATA_PATH, this.fileSystem); final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fileSystem, new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(), new AtomicBoolean(true)); // if this metadata was saved successfully we would say there's no partition to process // but this will be in a temp file so it will be ignored metadataManager.set(MetadataConstants.CHECKPOINT_KEY, new StringValue(PARTITION2)); metadataManager.saveChanges(); final Optional<FileStatus> fs = metadataManager.getLatestMetadataFile(); Assert.assertTrue(fs.isPresent()); // move the metadata file back to a temp location this.fileSystem.rename(fs.get().getPath(), new Path(fs.get().getPath().toString() + MetadataConstants.TEMP_FILE_EXTENSION)); final Optional<String> partition = pm.getNextPartition(getLatestCheckpoint(metadataManager)); Assert.assertTrue(partition.isPresent()); Assert.assertEquals(PARTITION1, partition.get()); }
/**
 * With no checkpoint and a start date between the two partitions, only the
 * partition on or after the start date is eligible as the next partition.
 */
@Test
public void testGetNextPartitionWithStartDateAndNoCheckpoint() throws IOException, ParseException {
    this.fs.mkdirs(new Path(RAW_DATA_PATH, "datestr=2017-05-01"));
    this.fs.mkdirs(new Path(RAW_DATA_PATH, "datestr=2017-06-01"));

    final SimpleDateFormat dateFormat = new SimpleDateFormat(HiveSourceConfiguration.HIVE_START_DATE_FORMAT);
    final Date startDate = dateFormat.parse("2017-05-15");

    final HDFSDatePartitionManager pm =
        new HDFSDatePartitionManager(JOBNAME,
            HDFSTestConstants.BASE_METADATA_PATH,
            RAW_DATA_PATH,
            DATESTR,
            Optional.of(startDate),
            this.fs);
    final HDFSMetadataManager metadataManager =
        new HDFSMetadataManager(this.fs,
            new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(),
            new AtomicBoolean(true));

    final Optional<StringValue> latestCheckpoint = metadataManager.get(MetadataConstants.CHECKPOINT_KEY);
    final Optional<String> nextPartition = pm.getNextPartition(latestCheckpoint);
    Assert.assertTrue(nextPartition.isPresent());
    Assert.assertEquals("datestr=2017-06-01", nextPartition.get());
}
// NOTE(review): this span appears to be a fragment of one or more larger test methods
// whose beginnings and ends are outside this view — the dangling "this.fileSystem);"
// tokens look like the tails of multi-line constructor calls. Code kept byte-identical;
// verify against the full file before restructuring.
this.fileSystem);
this.metadataManager = new HDFSMetadataManager(this.fileSystem,
    new Path(this.metadataPath, JOB_NAME).toString(), new AtomicBoolean(true));
this.fileSystem);
// Second metadata manager over the same job path, seeded with an empty checkpoint
// before the calculator computes and saves its work-unit state.
final HDFSMetadataManager metadataManager2 = new HDFSMetadataManager(this.fileSystem,
    new Path(this.metadataPath, JOB_NAME).toString(), new AtomicBoolean(true));
metadataManager2.set(MetadataConstants.CHECKPOINT_KEY, new StringValue(StringTypes.EMPTY));
calc.initPreviousRunState(metadataManager2);
final IWorkUnitCalculator.IWorkUnitCalculatorResult<String, HiveRunState> iresult = calc.computeWorkUnits();
calc.saveNextRunState(metadataManager2, iresult.getNextRunState());
// After saving the next-run state, the checkpoint should equal the data path.
Assert.assertEquals(this.dataPath, metadataManager2.get(MetadataConstants.CHECKPOINT_KEY).get().getValue());
/**
 * Creates a metadata manager rooted at the given base path and eagerly loads any
 * previously persisted metadata into memory.
 *
 * @param fs filesystem used for all metadata reads and writes
 * @param baseMetadataPath directory under which metadata files are stored
 * @param shouldSaveChanges flag controlling whether changes may be persisted
 * @throws IOException if existing metadata cannot be loaded
 */
public HDFSMetadataManager(@NonNull final FileSystem fs,
                           @NotEmpty final String baseMetadataPath,
                           @NonNull final AtomicBoolean shouldSaveChanges) throws IOException {
    this.shouldSaveChanges = shouldSaveChanges;
    this.baseMetadataPath = baseMetadataPath;
    this.fileSystem = fs;
    // Must run last: loadMetadata() reads the fields assigned above.
    this.metadataMap = loadMetadata();
}
/**
 * Asserts that the deserialized map matches the manager's in-memory state exactly:
 * identical key sets and equal values for every key.
 */
private void validateDeserializedMapEqualsInMemoryMap(final Map<String, StringValue> deserializedMap) {
    deserializedMap.forEach((key, value) -> {
        final Optional<StringValue> inMemory = this.metadataManager.get(key);
        Assert.assertTrue(inMemory.isPresent());
        Assert.assertEquals(inMemory.get().getValue(), value.getValue());
    });
    // Equal values over the deserialized keys plus identical key sets ⇒ the maps agree.
    Assert.assertEquals(this.metadataManager.getAllKeys(), deserializedMap.keySet());
}
// NOTE(review): fragment — the header of the enclosing lambda / anonymous Callable is
// outside this view; code kept byte-identical.
// Persist the in-memory map to a new metadata file, then trim old metadata files.
writeMetadataFile();
pruneMetadataFiles();
return null;
};
/**
 * Simulates one work-unit cycle for a partition: checks the current checkpoint,
 * computes the work units, asserts the expected single partition is produced,
 * then saves the next-run state and persists the checkpoint.
 *
 * @param partitionManager partition manager under test
 * @param metadataManager metadata manager holding the checkpoint state
 * @param expectedLatestCheckpoint checkpoint expected before processing
 * @param expectedNextPartition partition expected to be produced and checkpointed
 * @throws IOException if metadata cannot be read or saved
 */
private void virtuallyProcessPartition(@NonNull final HDFSPartitionManager partitionManager,
                                       @NonNull final HDFSMetadataManager metadataManager,
                                       // Fix: @NotEmpty targets strings/collections and is
                                       // misapplied to an Optional; @NonNull is the correct
                                       // null-check annotation here.
                                       @NonNull final Optional<StringValue> expectedLatestCheckpoint,
                                       @NotEmpty final String expectedNextPartition) throws IOException {
    Assert.assertEquals(expectedLatestCheckpoint, getLatestCheckpoint(metadataManager));
    final ParquetWorkUnitCalculator calculator = new ParquetWorkUnitCalculator(this.hiveConfig, this.fileSystem);
    calculator.initPreviousRunState(metadataManager);
    final ParquetWorkUnitCalculatorResult result = calculator.computeWorkUnits();
    final List<String> workUnits = result.getWorkUnits();
    Assert.assertEquals(1, workUnits.size());
    Assert.assertEquals(expectedNextPartition, workUnits.get(0));
    Assert.assertTrue(result.getNextRunState().getPartition().isPresent());
    Assert.assertEquals(expectedNextPartition, result.getNextRunState().getPartition().get());
    calculator.saveNextRunState(metadataManager, result.getNextRunState());
    Assert.assertEquals(expectedNextPartition,
        metadataManager.get(MetadataConstants.CHECKPOINT_KEY).get().getValue());
    metadataManager.saveChanges();
}
/**
 * Serializes the in-memory metadata map to a new timestamp-named file under the base
 * metadata path. The data is first written to a ".tmp"-suffixed file and then renamed
 * into place so readers never observe a partially written metadata file.
 *
 * @throws MetadataException if writing or publishing the metadata file fails
 */
private void writeMetadataFile() {
    // Primitive long avoids pointless boxing; Long.toString replaces currentTime.toString().
    final long currentTime = System.currentTimeMillis();
    final String fileLocation = new Path(this.baseMetadataPath, Long.toString(currentTime)).toString();
    // fileLocation is already a String — the original's fileLocation.toString() was redundant.
    final String tmpFileLocation = fileLocation + MetadataConstants.TEMP_FILE_EXTENSION;
    try (final OutputStream os = new BufferedOutputStream(
        this.fileSystem.create(
            new Path(tmpFileLocation)))) {
        try (final ObjectOutputStream oos = new ObjectOutputStream(os)) {
            serialize(oos);
        }
        log.info("Saving metadata to: {}", fileLocation);
        // Fix: FileSystem.rename signals failure via its boolean return value, not an
        // exception. The original ignored it, leaving only the temp file behind on
        // failure while reporting success.
        if (!this.fileSystem.rename(new Path(tmpFileLocation), new Path(fileLocation))) {
            throw new MetadataException(
                String.format("Failed to rename %s to %s while saving metadata", tmpFileLocation, fileLocation));
        }
    } catch (final IOException e) {
        final String errMsg = String.format("IOException occurred while saving changes. Message: %s", e.getMessage());
        throw new MetadataException(errMsg, e);
    }
}
@Test public void testGetNextPartitionWithLargerExistentCheckpoint() throws IOException, InterruptedException { // In this case the checkpoint is larger than the data partition so there is no "next" partition final StringValue val1 = new StringValue("datestr=2017-05-02"); final Path partition1 = new Path(RAW_DATA_PATH, "datestr=2017-05-01"); this.fs.mkdirs(new Path(partition1, FILE1)); final HDFSDatePartitionManager pm = new HDFSDatePartitionManager(JOBNAME, HDFSTestConstants.BASE_METADATA_PATH, RAW_DATA_PATH, DATESTR, Optional.absent(), this.fs); final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fs, new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(), new AtomicBoolean(true)); metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1); metadataManager.saveChanges(); final Optional<StringValue> latestCheckpoint = metadataManager.get(MetadataConstants.CHECKPOINT_KEY); final Optional<String> partition = pm.getNextPartition(latestCheckpoint); Assert.assertFalse(partition.isPresent()); } }
/**
 * With a checkpoint at PARTITION1 and data in PARTITION2 and PARTITION3, the next
 * partition returned must be PARTITION2 (the smallest partition after the checkpoint).
 */
@Test
public void testGetNextPartitionMultipleDataPartitions() throws IOException, InterruptedException {
    final StringValue val1 = new StringValue(PARTITION1);
    final Path partition2Path = new Path(RAW_DATA_PATH, PARTITION2);
    final Path partition3Path = new Path(RAW_DATA_PATH, PARTITION3);
    // Fix: FileSystem.create returns an open output stream; close it so the files are
    // flushed and the test does not leak stream handles.
    this.fileSystem.create(new Path(partition2Path, FILE1)).close();
    this.fileSystem.create(new Path(partition3Path, FILE1)).close();

    final HDFSPartitionManager pm =
        new HDFSPartitionManager(JOBNAME,
            HDFSTestConstants.BASE_METADATA_PATH,
            RAW_DATA_PATH,
            this.fileSystem);
    final HDFSMetadataManager metadataManager =
        new HDFSMetadataManager(this.fileSystem,
            new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(),
            new AtomicBoolean(true));
    metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1);
    metadataManager.saveChanges();

    final Optional<StringValue> latestCheckpoint = getLatestCheckpoint(metadataManager);
    Assert.assertTrue(pm.getNextPartition(latestCheckpoint).isPresent());
    // assertEquals replaces assertTrue(x.equals(y)) for a useful failure message.
    Assert.assertEquals(PARTITION2, pm.getNextPartition(latestCheckpoint).get());
}
/**
 * With no checkpoint stored, the earliest of the available date partitions is
 * returned as the next partition to process.
 */
@Test
public void testGetNextPartitionWitMultipleDatePartitionsAndNoCheckpoint() throws IOException {
    this.fs.mkdirs(new Path(RAW_DATA_PATH, "datestr=2017-05-01"));
    this.fs.mkdirs(new Path(RAW_DATA_PATH, "datestr=2017-05-02"));

    final HDFSDatePartitionManager pm =
        new HDFSDatePartitionManager(JOBNAME,
            HDFSTestConstants.BASE_METADATA_PATH,
            RAW_DATA_PATH,
            DATESTR,
            Optional.absent(),
            this.fs);
    final HDFSMetadataManager metadataManager =
        new HDFSMetadataManager(this.fs,
            new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(),
            new AtomicBoolean(true));

    final Optional<StringValue> latestCheckpoint = metadataManager.get(MetadataConstants.CHECKPOINT_KEY);
    final Optional<String> nextPartition = pm.getNextPartition(latestCheckpoint);
    Assert.assertTrue(nextPartition.isPresent());
    Assert.assertEquals("datestr=2017-05-01", nextPartition.get());
}
/**
 * Reloads metadata from disk and returns the persisted checkpoint value, if any.
 *
 * @param metadataManager manager whose persisted metadata is read
 * @return the checkpoint value, or absent when none has been persisted
 * @throws IOException if the metadata file cannot be read
 */
private Optional<StringValue> getLatestCheckpoint(@NonNull final HDFSMetadataManager metadataManager)
    throws IOException {
    final Map<String, StringValue> metadataMap = metadataManager.loadMetadata();
    // Fix: Optional.fromNullable performs a single map lookup, replacing the
    // containsKey + get double lookup with identical semantics (absent on null).
    return Optional.fromNullable(metadataMap.get(MetadataConstants.CHECKPOINT_KEY));
}
}
/**
 * Setting the same key twice must overwrite: only the second value is visible in
 * memory and in the metadata file written by saveChanges().
 */
@Test
public void testHDFSOverwriteCheckpointValue() throws IOException, InterruptedException {
    final StringValue firstValue = new StringValue("testVal");
    this.metadataManager.set(MetadataConstants.CHECKPOINT_KEY, firstValue);
    final StringValue secondValue = new StringValue("testVal2");
    this.metadataManager.set(MetadataConstants.CHECKPOINT_KEY, secondValue);

    final Optional<StringValue> readValue = this.metadataManager.get(MetadataConstants.CHECKPOINT_KEY);
    Assert.assertTrue(readValue.isPresent());
    Assert.assertTrue(readValue.get().getValue().equals("testVal2"));

    this.metadataManager.saveChanges();
    final Optional<FileStatus> latestFile = this.metadataManager.getLatestMetadataFile();
    Assert.assertTrue(latestFile.isPresent());
    final Map<String, StringValue> persisted = this.metadataManager.loadMetadata(latestFile.get().getPath());
    validateDeserializedMapEqualsInMemoryMap(persisted);
}
/**
 * When both a start date and a checkpoint exist and the checkpoint is the later of
 * the two, the next partition is the first one after the checkpoint.
 */
@Test
public void testGetNextPartitionWithCheckpointLaterThanStartDate() throws IOException, ParseException {
    this.fs.mkdirs(new Path(RAW_DATA_PATH, "datestr=2017-05-01"));
    this.fs.mkdirs(new Path(RAW_DATA_PATH, "datestr=2017-06-01"));
    this.fs.mkdirs(new Path(RAW_DATA_PATH, "datestr=2017-07-01"));

    final SimpleDateFormat dateFormat = new SimpleDateFormat(HiveSourceConfiguration.HIVE_START_DATE_FORMAT);
    final Date startDate = dateFormat.parse("2017-05-03");
    final StringValue checkpoint = new StringValue("datestr=2017-06-02");

    final HDFSDatePartitionManager pm =
        new HDFSDatePartitionManager(JOBNAME,
            HDFSTestConstants.BASE_METADATA_PATH,
            RAW_DATA_PATH,
            DATESTR,
            Optional.of(startDate),
            this.fs);
    final HDFSMetadataManager metadataManager =
        new HDFSMetadataManager(this.fs,
            new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(),
            new AtomicBoolean(true));
    metadataManager.set(MetadataConstants.CHECKPOINT_KEY, checkpoint);
    metadataManager.saveChanges();

    final Optional<StringValue> latestCheckpoint = metadataManager.get(MetadataConstants.CHECKPOINT_KEY);
    final Optional<String> nextPartition = pm.getNextPartition(latestCheckpoint);
    Assert.assertTrue(nextPartition.isPresent());
    Assert.assertEquals("datestr=2017-07-01", nextPartition.get());
}