state.setProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/publisher"); File recoveryDir = new File(RecoveryHelper.getPersistDir(state).get().toUri().getPath()); RecoveryHelper recoveryHelper = new RecoveryHelper(FileSystem.getLocal(new Configuration()), state); recoveryHelper.persistFile(state, copyableFile, new Path(file.getAbsolutePath())); recoveryHelper.findPersistedFile(state, copyableFile, Predicates.<FileStatus>alwaysTrue()); Assert.assertTrue(fileToRecover.isPresent()); Assert.assertEquals(fileToRecover.get().getPath().toUri().getPath(), fileInRecovery.getAbsolutePath()); recoveryHelper.findPersistedFile(state, copyableFile, Predicates.<FileStatus>alwaysFalse()); Assert.assertFalse(fileToRecover.isPresent());
/**
 * Checks that {@link RecoveryHelper#purgeOldPersistedFile()} removes only persisted files that are older than the
 * configured retention.
 *
 * <p>The retention property is set to "1" and two files are written into the persist directory. One of them is
 * back-dated by two hours, so exactly one file is expected to remain after the purge.</p>
 *
 * @throws Exception if the temporary files cannot be created or the purge fails.
 */
@Test
public void testPurge() throws Exception {
  String content = "contents";

  File persistDirBase = Files.createTempDir();
  persistDirBase.deleteOnExit();

  State state = new State();
  state.setProp(RecoveryHelper.PERSIST_DIR_KEY, persistDirBase.getAbsolutePath());
  state.setProp(RecoveryHelper.PERSIST_RETENTION_KEY, "1");

  RecoveryHelper recoveryHelper = new RecoveryHelper(FileSystem.getLocal(new Configuration()), state);

  // Resolve the local directory from the URI path component (as the sibling tests do), so a scheme-qualified
  // Path would not leak its scheme into the java.io.File name.
  File persistDir = new File(RecoveryHelper.getPersistDir(state).get().toUri().getPath());
  persistDir.mkdir();

  // This file is back-dated beyond the retention and is the one expected to be purged.
  File file = new File(persistDir, "file1");
  try (OutputStream os = new FileOutputStream(file)) {
    IOUtils.write(content, os);
  }
  // Fail right here if the timestamp cannot be changed, rather than with a confusing file count further down.
  Assert.assertTrue(file.setLastModified(System.currentTimeMillis() - TimeUnit.HOURS.toMillis(2)));

  // This file keeps its fresh modification time and is expected to survive the purge.
  File file2 = new File(persistDir, "file2");
  try (OutputStream os2 = new FileOutputStream(file2)) {
    IOUtils.write(content, os2);
  }

  Assert.assertEquals(persistDir.listFiles().length, 2);

  recoveryHelper.purgeOldPersistedFile();

  Assert.assertEquals(persistDir.listFiles().length, 1);
}
/**
 * Relocates an already copied path into the persist directory managed by gobblin-distcp. Intended for files that
 * were copied but could not be published, so that a later run can pick up the persisted copy instead of copying
 * the file again.
 *
 * <p>The file is stored under a directory named after the guid computed from {@code state} and {@code file}.
 * That directory is created, if missing, with permissions owner=ALL, group=READ, other=NONE. The stored file name
 * is derived from the origin path and shortened to at most {@code 250 - guid.length()} (presumably to stay within
 * file name length limits). After a successful rename the modification time of the target is set to the current
 * time; {@code -1} is passed as the access time.</p>
 *
 * @param state {@link State} containing job information.
 * @param file {@link CopyableFile} from which the input {@link Path} originated.
 * @param path {@link Path} to persist.
 * @return true if the file was moved into the persist directory; false if no persist directory is configured or
 *         the rename did not succeed.
 * @throws IOException if a file system operation fails.
 */
public boolean persistFile(State state, CopyableFile file, Path path) throws IOException {
  if (!this.persistDir.isPresent()) {
    return false;
  }

  final String fileGuid = computeGuid(state, file);
  final Path guidDir = new Path(this.persistDir.get(), fileGuid);

  final boolean guidDirMissing = !this.fs.exists(guidDir);
  if (guidDirMissing) {
    this.fs.mkdirs(guidDir, new FsPermission(FsAction.ALL, FsAction.READ, FsAction.NONE));
  }

  final String persistedName = shortenPathName(file.getOrigin().getPath(), 250 - fileGuid.length());
  final Path destination = new Path(guidDir, persistedName);
  log.info(String.format("Persisting file %s with guid %s to location %s.", path, fileGuid, destination));

  final boolean moved = this.fs.rename(path, destination);
  if (!moved) {
    return false;
  }

  // Refresh the modification time of the persisted copy.
  this.fs.setTimes(destination, System.currentTimeMillis(), -1);
  return true;
}
/** * Build a new {@link CopyDataPublisher} from {@link State}. The constructor expects the following to be set in the * {@link State}, * <ul> * <li>{@link ConfigurationKeys#WRITER_OUTPUT_DIR} * <li>{@link ConfigurationKeys#WRITER_FILE_SYSTEM_URI} * </ul> * */ public CopyDataPublisher(State state) throws IOException { super(state); // Extract LineageInfo from state if (state instanceof SourceState) { lineageInfo = LineageInfo.getLineageInfo(((SourceState) state).getBroker()); } else if (state instanceof WorkUnitState) { lineageInfo = LineageInfo.getLineageInfo(((WorkUnitState) state).getTaskBrokerNullable()); } else { lineageInfo = Optional.absent(); } String uri = this.state.getProp(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, ConfigurationKeys.LOCAL_FS_URI); this.fs = FileSystem.get(URI.create(uri), WriterUtils.getFsConfiguration(state)); FileAwareInputStreamDataWriterBuilder.setJobSpecificOutputPaths(state); this.writerOutputDir = new Path(state.getProp(ConfigurationKeys.WRITER_OUTPUT_DIR)); MetricContext metricContext = Instrumented.getMetricContext(state, CopyDataPublisher.class, GobblinMetrics.getCustomTagsFromState(state)); this.eventSubmitter = new EventSubmitter.Builder(metricContext, "org.apache.gobblin.copy.CopyDataPublisher").build(); this.recoveryHelper = new RecoveryHelper(this.fs, state); this.recoveryHelper.purgeOldPersistedFile(); }
log.error("Could not commit file %s.", outputFilePath); this.recoveryHelper.persistFile(this.state, copyableFile, stagingFilePath); throw ioe; } finally {
public RecoveryHelper(FileSystem fs, State state) throws IOException { this.fs = fs; this.persistDir = getPersistDir(state); this.retentionHours = state.getPropAsInt(PERSIST_RETENTION_KEY, DEFAULT_PERSIST_RETENTION); }
this.copyableDatasetMetadata = CopyableDatasetMetadata.deserialize(state.getProp(CopySource.SERIALIZED_COPYABLE_DATASET)); this.recoveryHelper = new RecoveryHelper(this.fs, state); this.actualProcessedCopyableFile = Optional.absent();
this.recoveryHelper.findPersistedFile(this.state, copyableFile, fileStatusAttributesFilter);
/**
 * Checks {@link RecoveryHelper#shortenPathName} for a path that fits within the requested length and for one that
 * has to be abbreviated.
 */
@Test
public void testShortenPathName() throws Exception {
  // Fits within 10 characters: only the path separator is replaced.
  String fittingName = RecoveryHelper.shortenPathName(new Path("/test"), 10);
  Assert.assertEquals(fittingName, "_test");

  // Too long for 9 characters: the middle is replaced by "...".
  String abbreviatedName = RecoveryHelper.shortenPathName(new Path("/relatively/long/path"), 9);
  Assert.assertEquals(abbreviatedName, "_re...ath");
}
}
/**
 * Looks in the persist directory for a persisted file that belongs to the input {@link CopyEntity}.
 *
 * <p>Candidates are the entries of the directory named after the guid computed from {@code state} and
 * {@code file}. The first candidate accepted by {@code filter} is returned.</p>
 *
 * @param state {@link State} containing job information.
 * @param file {@link CopyEntity} for which a persisted file should be found.
 * @param filter {@link Predicate} deciding whether a found {@link FileStatus} is acceptable.
 * @return the {@link FileStatus} of the first accepted persisted file; absent if no persist directory is
 *         configured, it does not exist, the guid directory does not exist, or no candidate is accepted.
 * @throws IOException if a file system operation fails.
 */
public Optional<FileStatus> findPersistedFile(State state, CopyEntity file, Predicate<FileStatus> filter)
    throws IOException {
  final boolean persistDirUsable = this.persistDir.isPresent() && this.fs.exists(this.persistDir.get());
  if (!persistDirUsable) {
    return Optional.absent();
  }

  final Path guidDir = new Path(this.persistDir.get(), computeGuid(state, file));

  FileStatus[] candidates;
  try {
    candidates = this.fs.listStatus(guidDir);
  } catch (FileNotFoundException fnfe) {
    // Nothing has been persisted under this guid.
    return Optional.absent();
  }

  for (int i = 0; i < candidates.length; i++) {
    FileStatus candidate = candidates[i];
    if (filter.apply(candidate)) {
      return Optional.of(candidate);
    }
  }
  return Optional.absent();
}
/** * Build a new {@link CopyDataPublisher} from {@link State}. The constructor expects the following to be set in the * {@link State}, * <ul> * <li>{@link ConfigurationKeys#WRITER_OUTPUT_DIR} * <li>{@link ConfigurationKeys#WRITER_FILE_SYSTEM_URI} * </ul> * */ public CopyDataPublisher(State state) throws IOException { super(state); // Extract LineageInfo from state if (state instanceof SourceState) { lineageInfo = LineageInfo.getLineageInfo(((SourceState) state).getBroker()); } else if (state instanceof WorkUnitState) { lineageInfo = LineageInfo.getLineageInfo(((WorkUnitState) state).getTaskBrokerNullable()); } else { lineageInfo = Optional.absent(); } String uri = this.state.getProp(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, ConfigurationKeys.LOCAL_FS_URI); this.fs = FileSystem.get(URI.create(uri), WriterUtils.getFsConfiguration(state)); FileAwareInputStreamDataWriterBuilder.setJobSpecificOutputPaths(state); this.writerOutputDir = new Path(state.getProp(ConfigurationKeys.WRITER_OUTPUT_DIR)); MetricContext metricContext = Instrumented.getMetricContext(state, CopyDataPublisher.class, GobblinMetrics.getCustomTagsFromState(state)); this.eventSubmitter = new EventSubmitter.Builder(metricContext, "org.apache.gobblin.copy.CopyDataPublisher").build(); this.recoveryHelper = new RecoveryHelper(this.fs, state); this.recoveryHelper.purgeOldPersistedFile(); }
/**
 * Hands the output file of every successful {@link CopyableFile} work unit to the {@link RecoveryHelper} for
 * persisting.
 *
 * <p>Work units that are not in {@link WorkingState#SUCCESSFUL} state, or whose copy entity is not a
 * {@link CopyableFile}, are skipped.</p>
 *
 * @param workUnitStates the work unit states to inspect.
 * @return the number of files the recovery helper reported as persisted.
 * @throws IOException if deserialization or persisting fails.
 */
private int persistFailedFileSet(Collection<? extends WorkUnitState> workUnitStates) throws IOException {
  int persistedCount = 0;

  for (WorkUnitState workUnit : workUnitStates) {
    if (workUnit.getWorkingState() != WorkingState.SUCCESSFUL) {
      continue;
    }

    CopyEntity copyEntity = CopySource.deserializeCopyEntity(workUnit);
    if (!(copyEntity instanceof CopyableFile)) {
      continue;
    }
    CopyableFile copyableFile = (CopyableFile) copyEntity;

    // Rebuild the location the writer produced for this file.
    Path taskOutputDir = FileAwareInputStreamDataWriter.getOutputDir(workUnit);
    CopyableDatasetMetadata datasetMetadata = CopySource.deserializeCopyableDataset(workUnit);
    Path writtenFile = FileAwareInputStreamDataWriter.getOutputFilePath(copyableFile, taskOutputDir,
        copyableFile.getDatasetAndPartition(datasetMetadata));

    boolean persisted = this.recoveryHelper.persistFile(workUnit, copyableFile, writtenFile);
    if (persisted) {
      persistedCount++;
    }
  }

  return persistedCount;
}
/**
 * Checks that {@link RecoveryHelper#getPersistDir} is absent for an empty {@link State} and, once
 * {@link RecoveryHelper#PERSIST_DIR_KEY} is set, resolves to a path located under the configured directory.
 */
@Test
public void testGetPersistDir() throws Exception {
  State state = new State();

  // Nothing configured yet: no persist directory.
  Assert.assertFalse(RecoveryHelper.getPersistDir(state).isPresent());

  final String configuredDir = this.tmpDir.getAbsolutePath();
  state.setProp(RecoveryHelper.PERSIST_DIR_KEY, configuredDir);

  Assert.assertTrue(RecoveryHelper.getPersistDir(state).isPresent());

  final String resolvedDir = RecoveryHelper.getPersistDir(state).get().toUri().getPath();
  Assert.assertTrue(resolvedDir.startsWith(this.tmpDir.getAbsolutePath()));
}
this.copyableDatasetMetadata = CopyableDatasetMetadata.deserialize(state.getProp(CopySource.SERIALIZED_COPYABLE_DATASET)); this.recoveryHelper = new RecoveryHelper(this.fs, state); this.actualProcessedCopyableFile = Optional.absent();
this.recoveryHelper.findPersistedFile(this.state, copyableFile, fileStatusAttributesFilter);
/**
 * Looks in the persist directory for a persisted file that belongs to the input {@link CopyEntity}.
 *
 * <p>Candidates are the entries of the directory named after the guid computed from {@code state} and
 * {@code file}. The first candidate accepted by {@code filter} is returned.</p>
 *
 * @param state {@link State} containing job information.
 * @param file {@link CopyEntity} for which a persisted file should be found.
 * @param filter {@link Predicate} deciding whether a found {@link FileStatus} is acceptable.
 * @return the {@link FileStatus} of the first accepted persisted file; absent if no persist directory is
 *         configured, it does not exist, the guid directory does not exist, or no candidate is accepted.
 * @throws IOException if a file system operation fails.
 */
public Optional<FileStatus> findPersistedFile(State state, CopyEntity file, Predicate<FileStatus> filter)
    throws IOException {
  final boolean persistDirUsable = this.persistDir.isPresent() && this.fs.exists(this.persistDir.get());
  if (!persistDirUsable) {
    return Optional.absent();
  }

  final Path guidDir = new Path(this.persistDir.get(), computeGuid(state, file));

  FileStatus[] candidates;
  try {
    candidates = this.fs.listStatus(guidDir);
  } catch (FileNotFoundException fnfe) {
    // Nothing has been persisted under this guid.
    return Optional.absent();
  }

  for (int i = 0; i < candidates.length; i++) {
    FileStatus candidate = candidates[i];
    if (filter.apply(candidate)) {
      return Optional.of(candidate);
    }
  }
  return Optional.absent();
}
/**
 * Relocates an already copied path into the persist directory managed by gobblin-distcp. Intended for files that
 * were copied but could not be published, so that a later run can pick up the persisted copy instead of copying
 * the file again.
 *
 * <p>The file is stored under a directory named after the guid computed from {@code state} and {@code file}.
 * That directory is created, if missing, with permissions owner=ALL, group=READ, other=NONE. The stored file name
 * is derived from the origin path and shortened to at most {@code 250 - guid.length()} (presumably to stay within
 * file name length limits). After a successful rename the modification time of the target is set to the current
 * time; {@code -1} is passed as the access time.</p>
 *
 * @param state {@link State} containing job information.
 * @param file {@link CopyableFile} from which the input {@link Path} originated.
 * @param path {@link Path} to persist.
 * @return true if the file was moved into the persist directory; false if no persist directory is configured or
 *         the rename did not succeed.
 * @throws IOException if a file system operation fails.
 */
public boolean persistFile(State state, CopyableFile file, Path path) throws IOException {
  if (!this.persistDir.isPresent()) {
    return false;
  }

  final String fileGuid = computeGuid(state, file);
  final Path guidDir = new Path(this.persistDir.get(), fileGuid);

  final boolean guidDirMissing = !this.fs.exists(guidDir);
  if (guidDirMissing) {
    this.fs.mkdirs(guidDir, new FsPermission(FsAction.ALL, FsAction.READ, FsAction.NONE));
  }

  final String persistedName = shortenPathName(file.getOrigin().getPath(), 250 - fileGuid.length());
  final Path destination = new Path(guidDir, persistedName);
  log.info(String.format("Persisting file %s with guid %s to location %s.", path, fileGuid, destination));

  final boolean moved = this.fs.rename(path, destination);
  if (!moved) {
    return false;
  }

  // Refresh the modification time of the persisted copy.
  this.fs.setTimes(destination, System.currentTimeMillis(), -1);
  return true;
}
log.error("Could not commit file %s.", outputFilePath); this.recoveryHelper.persistFile(this.state, copyableFile, stagingFilePath); throw ioe; } finally {
public RecoveryHelper(FileSystem fs, State state) throws IOException { this.fs = fs; this.persistDir = getPersistDir(state); this.retentionHours = state.getPropAsInt(PERSIST_RETENTION_KEY, DEFAULT_PERSIST_RETENTION); }
/**
 * Hands the output file of every successful {@link CopyableFile} work unit to the {@link RecoveryHelper} for
 * persisting.
 *
 * <p>Work units that are not in {@link WorkingState#SUCCESSFUL} state, or whose copy entity is not a
 * {@link CopyableFile}, are skipped.</p>
 *
 * @param workUnitStates the work unit states to inspect.
 * @return the number of files the recovery helper reported as persisted.
 * @throws IOException if deserialization or persisting fails.
 */
private int persistFailedFileSet(Collection<? extends WorkUnitState> workUnitStates) throws IOException {
  int persistedCount = 0;

  for (WorkUnitState workUnit : workUnitStates) {
    if (workUnit.getWorkingState() != WorkingState.SUCCESSFUL) {
      continue;
    }

    CopyEntity copyEntity = CopySource.deserializeCopyEntity(workUnit);
    if (!(copyEntity instanceof CopyableFile)) {
      continue;
    }
    CopyableFile copyableFile = (CopyableFile) copyEntity;

    // Rebuild the location the writer produced for this file.
    Path taskOutputDir = FileAwareInputStreamDataWriter.getOutputDir(workUnit);
    CopyableDatasetMetadata datasetMetadata = CopySource.deserializeCopyableDataset(workUnit);
    Path writtenFile = FileAwareInputStreamDataWriter.getOutputFilePath(copyableFile, taskOutputDir,
        copyableFile.getDatasetAndPartition(datasetMetadata));

    boolean persisted = this.recoveryHelper.persistFile(workUnit, copyableFile, writtenFile);
    if (persisted) {
      persistedCount++;
    }
  }

  return persistedCount;
}