/**
 * Recomputes {@code totalEntities} and {@code totalSize} from the current contents of
 * {@code generatedEntities}. Only {@link CopyableFile} entries contribute to the byte total.
 */
private void recomputeStats() {
  this.totalEntities = this.generatedEntities.size();
  long accumulatedBytes = 0;
  for (CopyEntity entity : this.generatedEntities) {
    if (entity instanceof CopyableFile) {
      accumulatedBytes += ((CopyableFile) entity).getOrigin().getLen();
    }
  }
  this.totalSize = accumulatedBytes;
}
/**
 * Submits an SLA event when a {@link org.apache.gobblin.data.management.copy.CopyableFile} is
 * published. The <code>workUnitState</code> passed should have the required {@link SlaEventKeys}
 * set.
 *
 * @see SlaEventSubmitter#submit()
 *
 * @param eventSubmitter {@link EventSubmitter} used to emit the SLA event
 * @param cf the {@link CopyableFile} that was published
 * @param workUnitState {@link WorkUnitState} carrying the {@link SlaEventKeys} properties
 */
static void submitSuccessfulFilePublish(EventSubmitter eventSubmitter, CopyableFile cf, WorkUnitState workUnitState) {
  // Pull the SLA attributes out of the work unit state first.
  String datasetUrn = workUnitState.getProp(SlaEventKeys.DATASET_URN_KEY);
  String partition = workUnitState.getProp(SlaEventKeys.PARTITION_KEY);
  String completenessPercentage = workUnitState.getProp(SlaEventKeys.COMPLETENESS_PERCENTAGE_KEY);
  String recordCount = workUnitState.getProp(SlaEventKeys.RECORD_COUNT_KEY);
  String previousPublishTimestamp = workUnitState.getProp(SlaEventKeys.PREVIOUS_PUBLISH_TS_IN_MILLI_SECS_KEY);
  String dedupeStatus = workUnitState.getProp(SlaEventKeys.DEDUPE_STATUS_KEY);

  SlaEventSubmitter.builder()
      .eventSubmitter(eventSubmitter)
      .eventName(FILE_PUBLISHED_EVENT_NAME)
      .datasetUrn(datasetUrn)
      .partition(partition)
      .originTimestamp(Long.toString(cf.getOriginTimestamp()))
      .upstreamTimestamp(Long.toString(cf.getUpstreamTimestamp()))
      .completenessPercentage(completenessPercentage)
      .recordCount(recordCount)
      .previousPublishTimestamp(previousPublishTimestamp)
      .dedupeStatus(dedupeStatus)
      // Source/target locations and size are attached as free-form metadata.
      .additionalMetadata(TARGET_PATH, cf.getDestination().toString())
      .additionalMetadata(SOURCE_PATH, cf.getOrigin().getPath().toString())
      .additionalMetadata(SIZE_IN_BYTES, Long.toString(cf.getOrigin().getLen()))
      .build()
      .submit();
}
}
/**
 * Stamps the {@code WORK_UNIT_WEIGHT} property on {@code workUnit}: the origin file length for a
 * {@link CopyableFile} (zero for other {@link CopyEntity} types), floored at {@code minWeight}.
 */
private static void setWorkUnitWeight(WorkUnit workUnit, CopyEntity copyEntity, long minWeight) {
  long fileLength = 0;
  if (copyEntity instanceof CopyableFile) {
    fileLength = ((CopyableFile) copyEntity).getOrigin().getLen();
  }
  workUnit.setProp(WORK_UNIT_WEIGHT, Long.toString(Math.max(fileLength, minWeight)));
}
Assert.assertEquals(copyableFile.getOrigin(), origin);
/**
 * Deletes the origin file of the {@link CopyEntity} serialized in {@code state} from the source
 * file system, together with its companion {@code .ready} marker file. Non-{@link CopyableFile}
 * entities are ignored.
 *
 * @param state {@link WorkUnitState} holding the serialized {@link CopyEntity}
 * @throws IOException if either deletion fails
 */
private void deleteFilesOnSource(WorkUnitState state) throws IOException {
  CopyEntity copyEntity = CopySource.deserializeCopyEntity(state);
  if (!(copyEntity instanceof CopyableFile)) {
    return;
  }
  Path originPath = ((CopyableFile) copyEntity).getOrigin().getPath();
  HadoopUtils.deletePath(this.sourceFs, originPath, true);
  HadoopUtils.deletePath(this.sourceFs,
      PathUtils.addExtension(originPath, ReadyCopyableFileFilter.READY_EXTENSION), true);
}
}
Assert.assertEquals(copyableFile.getOrigin(), origin);
Path readyFilePath = PathUtils.addExtension(cf.getOrigin().getPath(), READY_EXTENSION); try { if (sourceFs.exists(readyFilePath)) { filtered.add(cf); } else { log.info(String.format("Removing %s as the .ready file is not found", cf.getOrigin().getPath())); cf.getOrigin().getPath(), e.getMessage()));
/**
 * Moves a copied path into a persistent location managed by gobblin-distcp. This method is used
 * when an already copied file cannot be successfully published. In future runs, instead of
 * re-copying the file, distcp will use the persisted file.
 *
 * @param state {@link State} containing job information.
 * @param file {@link org.apache.gobblin.data.management.copy.CopyableFile} from which input
 *             {@link Path} originated.
 * @param path {@link Path} to persist.
 * @return true if persist was successful.
 * @throws IOException
 */
public boolean persistFile(State state, CopyableFile file, Path path) throws IOException {
  // Persistence is optional; without a configured persist dir there is nothing to do.
  if (!this.persistDir.isPresent()) {
    return false;
  }

  String guid = computeGuid(state, file);
  Path guidPath = new Path(this.persistDir.get(), guid);
  if (!this.fs.exists(guidPath)) {
    // rwxr-----: owner full access, group read-only, others none.
    this.fs.mkdirs(guidPath, new FsPermission(FsAction.ALL, FsAction.READ, FsAction.NONE));
  }

  // Keep the persisted name under the guid directory within a 250-char budget.
  Path targetPath = new Path(guidPath, shortenPathName(file.getOrigin().getPath(), 250 - guid.length()));
  log.info(String.format("Persisting file %s with guid %s to location %s.", path, guid, targetPath));

  if (!this.fs.rename(path, targetPath)) {
    return false;
  }
  // Refresh modification time so retention on the persist dir sees a fresh file.
  this.fs.setTimes(targetPath, System.currentTimeMillis(), -1);
  return true;
}
/**
 * Collects the origin file path (as a string) of the {@link CopyableFile} serialized in each of
 * the given {@link WorkUnit}s, preserving order.
 */
private List<String> extractPaths(List<WorkUnit> workUnits) {
  List<String> originPaths = Lists.newArrayList();
  for (WorkUnit workUnit : workUnits) {
    CopyableFile copyableFile = (CopyableFile) CopySource.deserializeCopyEntity(workUnit);
    originPaths.add(copyableFile.getOrigin().getPath().toString());
  }
  return originPaths;
}
/**
 * @return desired block size for destination file: the origin's block size when the
 *         {@link PreserveAttributes.Option#BLOCK_SIZE} attribute is preserved, otherwise the
 *         target file system's default for the destination path.
 */
public long getBlockSize(FileSystem targetFs) {
  if (getPreserve().preserve(PreserveAttributes.Option.BLOCK_SIZE)) {
    return getOrigin().getBlockSize();
  }
  return targetFs.getDefaultBlockSize(this.destination);
}
/**
 * @return desired replication for destination file: the origin's replication when the
 *         {@link PreserveAttributes.Option#REPLICATION} attribute is preserved, otherwise the
 *         target file system's default for the destination path.
 */
public short getReplication(FileSystem targetFs) {
  if (getPreserve().preserve(PreserveAttributes.Option.REPLICATION)) {
    return getOrigin().getReplication();
  }
  return targetFs.getDefaultReplication(this.destination);
}
/** Verifies the extractor emits exactly one record wrapping the file's content stream. */
@Test
public void testReadRecord() throws Exception {
  CopyableFile testFile = getTestCopyableFile("inputStreamExtractorTest/first.txt");
  FileAwareInputStreamExtractor extractor =
      new FileAwareInputStreamExtractor(FileSystem.getLocal(new Configuration()), testFile);

  // First call yields the stream for the requested file.
  FileAwareInputStream record = extractor.readRecord(null);
  Assert.assertEquals(record.getFile().getOrigin().getPath(), testFile.getOrigin().getPath());
  Assert.assertEquals(IOUtils.toString(record.getInputStream()), "first");

  // Subsequent calls yield null: the extractor is single-record.
  Assert.assertNull(extractor.readRecord(null));
}
/** Verifies that only files with a {@code .ready} marker survive {@link ReadyCopyableFileFilter}. */
@Test
public void testFilter() throws Exception {
  CopyableFileFilter readyFilter = new ReadyCopyableFileFilter();

  List<CopyableFile> copyableFiles = Lists.newArrayList();
  for (int i = 0; i < 3; i++) {
    copyableFiles.add(CopyableFileUtils.getTestCopyableFile());
  }

  // Only the second file has a .ready marker on the mocked source file system.
  FileSystem sourceFs = Mockito.mock(FileSystem.class);
  boolean[] readyFlags = {false, true, false};
  for (int i = 0; i < readyFlags.length; i++) {
    Mockito.when(sourceFs.exists(PathUtils.addExtension(copyableFiles.get(i).getOrigin().getPath(), ".ready")))
        .thenReturn(readyFlags[i]);
  }

  Collection<CopyableFile> filtered = readyFilter.filter(sourceFs, null, copyableFiles);
  Assert.assertEquals(filtered.size(), 1);
}
this.taskBroker.getSharedResource(new StreamThrottler.Factory<GobblinScopeTypes>(), new EmptyKey()); ThrottledInputStream throttledInputStream = throttler.throttleInputStream().inputStream(inputStream) .sourceURI(copyableFile.getOrigin().getPath().makeQualified(defaultFS.getUri(), defaultFS.getWorkingDirectory()).toUri()) .targetURI(this.fs.makeQualified(writeAt).toUri()).build(); StreamCopier copier = new StreamCopier(throttledInputStream, os, maxBytes).withBufferSize(this.bufferSize); log.info("File {}: Starting copy", copyableFile.getOrigin().getPath()); log.info("File {}: copied {} bytes, average rate: {} B/s", copyableFile.getOrigin().getPath(), this.copySpeedMeter.getCount(), this.copySpeedMeter.getMeanRate()); } else { log.info("File {} copied.", copyableFile.getOrigin().getPath());
/**
 * Partitions {@code copyEntities} into (1) an origin-to-destination map for every
 * {@link CopyableFile} and (2) the set of paths scheduled for deletion by any
 * {@link DeleteFileCommitStep} wrapped in a {@link CommitStepCopyEntity}.
 */
private ClassifiedFiles classifyFiles(Collection<? extends CopyEntity> copyEntities) {
  Map<Path, Path> pathsToCopy = Maps.newHashMap();
  Set<Path> pathsToDelete = Sets.newHashSet();

  for (CopyEntity entity : copyEntities) {
    if (entity instanceof CopyableFile) {
      CopyableFile copyableFile = (CopyableFile) entity;
      pathsToCopy.put(copyableFile.getOrigin().getPath(), copyableFile.getDestination());
    }
    if (entity instanceof CommitStepCopyEntity) {
      CommitStep step = ((CommitStepCopyEntity) entity).getStep();
      if (step instanceof DeleteFileCommitStep) {
        for (FileStatus toDelete : ((DeleteFileCommitStep) step).getPathsToDelete()) {
          pathsToDelete.add(toDelete.getPath());
        }
      }
    }
  }

  return new ClassifiedFiles(pathsToCopy, pathsToDelete);
}
/**
 * Returns a single {@link FileAwareInputStream} record for this extractor's file on the first
 * call, and {@code null} on every subsequent call (this extractor emits exactly one record).
 *
 * @param reuse ignored (deprecated record-reuse parameter)
 * @return the file's input-stream record, an empty-stream record for directories, or {@code null}
 *         once the record has been read
 * @throws IOException if the file system cannot be resolved or the file cannot be opened
 */
@Override public FileAwareInputStream readRecord(@Deprecated FileAwareInputStream reuse) throws DataRecordException, IOException {
  if (!this.recordRead) {
    // Prefer job-state configuration when available; otherwise build a fresh Configuration.
    Configuration conf = this.state == null ? HadoopUtils.newConfiguration() : HadoopUtils.getConfFromState(this.state);
    // Resolve the FileSystem from the origin path itself so its scheme/authority are honored.
    FileSystem fsFromFile = this.file.getOrigin().getPath().getFileSystem(conf);
    // Mark consumed before any I/O so a failed open is not retried on the next call.
    this.recordRead = true;
    FileAwareInputStream.FileAwareInputStreamBuilder builder = FileAwareInputStream.builder().file(this.file);
    // Directories carry no data: return an empty stream record.
    if (this.file.getFileStatus().isDirectory()) {
      return builder.inputStream(EmptyInputStream.instance).build();
    }
    FSDataInputStream dataInputStream = fsFromFile.open(this.file.getFileStatus().getPath());
    // When distcp splits large files, attach the split and seek to this work unit's offset.
    if (this.state != null && DistcpFileSplitter.isSplitWorkUnit(this.state)) {
      Optional<DistcpFileSplitter.Split> split = DistcpFileSplitter.getSplit(this.state);
      builder.split(split);
      if (split.isPresent()) {
        dataInputStream.seek(split.get().getLowPosition());
      }
    }
    // Wrap in a metered stream; NOTE(review): the stream is handed to the record, so the
    // consumer — not this method — is responsible for closing it.
    builder.inputStream(MeteredInputStream.builder().in(dataInputStream).build());
    return builder.build();
  }
  return null;
}
CopyableFile file = (CopyableFile) copyEntity; Path originRelativePath = PathUtils.relativizePath(PathUtils.getPathWithoutSchemeAndAuthority(file.getOrigin().getPath()), PathUtils.getPathWithoutSchemeAndAuthority(new Path(sourceDir))); Path targetRelativePath =
/** End-to-end check that {@link CopySource} produces one work unit per test file, all sharing one extract. */
@Test
public void testCopySource() throws Exception {
  SourceState state = new SourceState();
  state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, "file:///");
  state.setProp(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, "file:///");
  state.setProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/target/dir");
  state.setProp(DatasetUtils.DATASET_PROFILE_CLASS_KEY, TestCopyableDatasetFinder.class.getName());

  CopySource source = new CopySource();
  List<WorkUnit> flattened = JobLauncherUtils.flattenWorkUnits(source.getWorkunits(state));
  Assert.assertEquals(flattened.size(), TestCopyableDataset.FILE_COUNT);

  // Every work unit must carry the same extract as the first one.
  Extract sharedExtract = flattened.get(0).getExtract();
  for (WorkUnit workUnit : flattened) {
    CopyableFile copyableFile = (CopyableFile) CopySource.deserializeCopyEntity(workUnit);
    Assert.assertTrue(copyableFile.getOrigin().getPath().toString().startsWith(TestCopyableDataset.ORIGIN_PREFIX));
    Assert.assertEquals(copyableFile.getDestinationOwnerAndPermission(), TestCopyableDataset.OWNER_AND_PERMISSION);
    Assert.assertEquals(workUnit.getExtract(), sharedExtract);
  }
}
Assert.assertTrue(copyableFile.getOrigin().getPath().toString().startsWith(TestCopyableDataset.ORIGIN_PREFIX)); Assert.assertEquals(copyableFile.getDestinationOwnerAndPermission(), TestCopyableDataset.OWNER_AND_PERMISSION); if (Integer.parseInt(copyableFile.getOrigin().getPath().getName()) < TestCopyablePartitionableDataset.THRESHOLD) {