/**
 * Creates an empty file at {@code resourcePath} on the local file system and returns a {@link CopyableFile}
 * whose origin is a synthetic {@link FileStatus} pointing at that path and whose destination is the value
 * returned by {@code getRandomPath()}.
 *
 * @param resourcePath path of the file to create; also used as the origin path of the returned file
 * @return a {@link CopyableFile} built with empty preserve attributes, empty file set and no owner/checksum data
 * @throws IOException if the local file system cannot be obtained or the file cannot be created
 */
public static CopyableFile createTestCopyableFile(String resourcePath) throws IOException {
  FileSystem fs = FileSystem.getLocal(new Configuration());
  Path resource = new Path(resourcePath);
  // FileSystem#create hands back an open output stream; close it right away so the handle is not leaked.
  fs.create(resource).close();

  FileStatus status = new FileStatus(0L, false, 0, 0L, 0L, resource);
  return new CopyableFile(status, new Path(getRandomPath()), null, null, null,
      PreserveAttributes.fromMnemonicString(""), "", 0, 0, Maps.<String, String>newHashMap(), "");
}
Path thisTargetPath = new Path(configuration.getPublishDir(), filePathRelativeToSearchPath); CopyableFile copyableFile = CopyableFile.fromOriginAndDestination(this.fs, file, thisTargetPath, configuration).fileSet(datasetURN()) .datasetOutputPath(thisTargetPath.toString()).ancestorsOwnerAndPermission(CopyableFile .resolveReplicatedOwnerAndPermissionsRecursively(this.fs, file.getPath().getParent(), nonGlobSearchPath, configuration)).build(); copyableFile.setFsDatasets(this.fs, targetFs); copyableFiles.add(copyableFile);
long len = file.getFileStatus().getLen(); long blockSize = ArithmeticUtils.lcm(file.getFileStatus().getBlockSize(), file.getBlockSize(targetFs)); long maxSplitSize = workUnit.getPropAsLong(MAX_SPLIT_SIZE_KEY, DEFAULT_MAX_SPLIT_SIZE); String.format("%s.__PART%d__", file.getDestination().getName(), i)); String serializedSplit = GSON.toJson(split);
/**
 * Records the lineage source of a {@link CopyableFile} on the given work unit.
 *
 * Nothing is done when the entity is not a {@link CopyableFile}, when {@code lineageInfo} is absent, or when
 * either the source or the destination descriptor of the file is {@code null}.
 *
 * @param copyEntity entity being turned into a work unit
 * @param workUnit work unit that receives the lineage source
 */
private void addLineageInfo(CopyEntity copyEntity, WorkUnit workUnit) {
  if (!(copyEntity instanceof CopyableFile)) {
    return;
  }
  CopyableFile file = (CopyableFile) copyEntity;

  /*
   * In Gobblin Distcp, the source and target path info of a CopyableFile are determined by its dataset found by
   * a DatasetFinder. Consequently, the source and destination dataset for the CopyableFile lineage are expected
   * to be set by the same logic
   */
  if (!lineageInfo.isPresent()) {
    return;
  }
  if (file.getSourceData() == null || file.getDestinationData() == null) {
    return;
  }
  lineageInfo.get().setSource(file.getSourceData(), workUnit);
}
}
/**
 * Rewrites the destination of {@code file} with the configured extensions removed.
 * The file is left untouched when {@code extensionsToRemove()} is empty.
 *
 * @param file copyable file whose destination path is updated in place
 */
private void modifyExtensionAtDestination(CopyableFile file) {
  if (extensionsToRemove().size() <= 0) {
    return;
  }
  Path currentDestination = file.getDestination();
  String[] extensions = extensionsToRemove().toArray(new String[0]);
  file.setDestination(PathUtils.removeExtension(currentDestination, extensions));
}
}
/**
 * Determines the block size to use for the destination file.
 *
 * @param targetFs file system queried for its default block size at the destination path when the
 *        block size is not being preserved
 * @return the origin's block size if {@link PreserveAttributes.Option#BLOCK_SIZE} is preserved, otherwise
 *         the default block size of {@code targetFs} for the destination path
 */
public long getBlockSize(FileSystem targetFs) {
  if (getPreserve().preserve(PreserveAttributes.Option.BLOCK_SIZE)) {
    return getOrigin().getBlockSize();
  }
  return targetFs.getDefaultBlockSize(this.destination);
}
CopyConfiguration.builder(FileSystem.getLocal(new Configuration()), properties).preserve(preserveAttributes).build(); CopyableFile copyableFile = CopyableFile.builder(originFS, origin, datasetRoot, copyConfiguration) .destination(targetPath) Assert.assertEquals(copyableFile.getPreserve().toMnemonicString(), preserveAttributes.toMnemonicString()); Assert.assertEquals(copyableFile.getFileSet(), ""); Assert.assertEquals(copyableFile.getOrigin(), origin); Assert.assertEquals(copyableFile.getChecksum().length, 0); Assert.assertEquals(copyableFile.getDestination().toString(), targetPath.toString()); Assert.assertEquals(copyableFile.getDestinationOwnerAndPermission().getGroup(), origin.getGroup()); Assert.assertEquals(copyableFile.getDestinationOwnerAndPermission().getOwner(), origin.getOwner()); Assert.assertEquals(copyableFile.getDestinationOwnerAndPermission().getFsPermission(), origin.getPermission()); Assert.assertEquals(copyableFile.getOriginTimestamp(), origin.getModificationTime()); Assert.assertEquals(copyableFile.getUpstreamTimestamp(), origin.getModificationTime());
log.info(String.format("Merging split file %s.", file.getDestination())); file.getDatasetAndPartition(CopySource.deserializeCopyableDataset(oldWorkUnit)); Path parentPath = FileAwareInputStreamDataWriter.getOutputFilePath(file, outputDir, datasetAndPartition) .getParent();
/**
 * Submits an sla event for a published {@link org.apache.gobblin.data.management.copy.CopyableFile}.
 * The dataset urn, partition, completeness percentage, record count, previous publish timestamp and dedupe
 * status are read from <code>workUnitState</code> using the corresponding {@link SlaEventKeys}; timestamps,
 * target path, source path and size are taken from the file itself.
 *
 * @see SlaEventSubmitter#submit()
 *
 * @param eventSubmitter submitter the event is sent through
 * @param cf the file that was published
 * @param workUnitState state carrying the {@link SlaEventKeys} properties for the event
 */
static void submitSuccessfulFilePublish(EventSubmitter eventSubmitter, CopyableFile cf,
    WorkUnitState workUnitState) {
  SlaEventSubmitter.builder()
      .eventSubmitter(eventSubmitter)
      .eventName(FILE_PUBLISHED_EVENT_NAME)
      .datasetUrn(workUnitState.getProp(SlaEventKeys.DATASET_URN_KEY))
      .partition(workUnitState.getProp(SlaEventKeys.PARTITION_KEY))
      .originTimestamp(Long.toString(cf.getOriginTimestamp()))
      .upstreamTimestamp(Long.toString(cf.getUpstreamTimestamp()))
      .completenessPercentage(workUnitState.getProp(SlaEventKeys.COMPLETENESS_PERCENTAGE_KEY))
      .recordCount(workUnitState.getProp(SlaEventKeys.RECORD_COUNT_KEY))
      .previousPublishTimestamp(workUnitState.getProp(SlaEventKeys.PREVIOUS_PUBLISH_TS_IN_MILLI_SECS_KEY))
      .dedupeStatus(workUnitState.getProp(SlaEventKeys.DEDUPE_STATUS_KEY))
      .additionalMetadata(TARGET_PATH, cf.getDestination().toString())
      .additionalMetadata(SOURCE_PATH, cf.getOrigin().getPath().toString())
      .additionalMetadata(SIZE_IN_BYTES, Long.toString(cf.getOrigin().getLen()))
      .build()
      .submit();
}
}
CopyableFile copyableFile = new CopyableFile(origin, new Path(destinationPath), null, null, null, PreserveAttributes.fromMnemonicString(""), "", 0, 0, Maps.<String, String>newHashMap(), ""); copyableFile.setFsDatasets(originFs, targetFs); DatasetDescriptor source = (DatasetDescriptor) copyableFile.getSourceData(); Assert.assertEquals(source.getName(), "/data/databases/source"); Assert.assertEquals(source.getPlatform(), "hdfs"); Assert.assertEquals(source.getMetadata().get("fsUri"), originFsUri); DatasetDescriptor destination = (DatasetDescriptor) copyableFile.getDestinationData(); Assert.assertEquals(destination.getName(), "/data/databases/destination"); Assert.assertEquals(destination.getPlatform(), "file"); destinationPath = targetFsUri + destinationPath; origin = new FileStatus(0l, true, 0, 0l, 0l, new Path(originPath)); copyableFile = new CopyableFile(origin, new Path(destinationPath), null, null, null, PreserveAttributes.fromMnemonicString(""), "", 0, 0, Maps.<String, String>newHashMap(), ""); copyableFile.setFsDatasets(originFs, targetFs); source = (DatasetDescriptor) copyableFile.getSourceData(); Assert.assertEquals(source.getName(), "/data/databases/source/profile"); Assert.assertEquals(source.getPlatform(), "hdfs"); Assert.assertEquals(source.getMetadata().get("fsUri"), originFsUri); destination = (DatasetDescriptor) copyableFile.getDestinationData(); Assert.assertEquals(destination.getName(), "/data/databases/destination/profile"); Assert.assertEquals(destination.getPlatform(), "file");
if (!fileSetRoot.isPresent() && copyableFile.getDatasetOutputPath() != null) { fileSetRoot = Optional.of(copyableFile.getDatasetOutputPath()); lineageInfo.get().putDestination(copyableFile.getDestinationData(), 0, wus); if (datasetOriginTimestamp > copyableFile.getOriginTimestamp()) { datasetOriginTimestamp = copyableFile.getOriginTimestamp(); if (datasetUpstreamTimestamp > copyableFile.getUpstreamTimestamp()) { datasetUpstreamTimestamp = copyableFile.getUpstreamTimestamp();
FileAwareInputStream record) throws IOException { final short replication = copyableFile.getReplication(this.fs); final long blockSize = copyableFile.getBlockSize(this.fs); final long fileSize = copyableFile.getFileStatus().getLen(); } else { if (copyableFile.getFileStatus().isDirectory()) { this.fs.mkdirs(writeAt); return; this.taskBroker.getSharedResource(new StreamThrottler.Factory<GobblinScopeTypes>(), new EmptyKey()); ThrottledInputStream throttledInputStream = throttler.throttleInputStream().inputStream(inputStream) .sourceURI(copyableFile.getOrigin().getPath().makeQualified(defaultFS.getUri(), defaultFS.getWorkingDirectory()).toUri()) .targetURI(this.fs.makeQualified(writeAt).toUri()).build(); StreamCopier copier = new StreamCopier(throttledInputStream, os, maxBytes).withBufferSize(this.bufferSize); log.info("File {}: Starting copy", copyableFile.getOrigin().getPath()); log.info("File {}: copied {} bytes, average rate: {} B/s", copyableFile.getOrigin().getPath(), this.copySpeedMeter.getCount(), this.copySpeedMeter.getMeanRate()); } else { log.info("File {} copied.", copyableFile.getOrigin().getPath());
/**
 * Refreshes the cached statistics from {@code generatedEntities}: the entity count, and the summed origin
 * length of every entity that is a {@link CopyableFile}.
 */
private void recomputeStats() {
  this.totalEntities = this.generatedEntities.size();
  this.totalSize = 0;
  for (CopyEntity entity : this.generatedEntities) {
    if (!(entity instanceof CopyableFile)) {
      // Only copyable files contribute to the total size.
      continue;
    }
    CopyableFile copyableFile = (CopyableFile) entity;
    this.totalSize += copyableFile.getOrigin().getLen();
  }
}
/**
 * Splits the given copy entities into paths to copy and paths to delete.
 *
 * Every {@link CopyableFile} contributes an origin-path to destination mapping. Every
 * {@link CommitStepCopyEntity} whose step is a {@link DeleteFileCommitStep} contributes the paths that
 * step would delete.
 *
 * @param copyEntities entities to classify
 * @return the collected copy mappings and delete paths
 */
private ClassifiedFiles classifyFiles(Collection<? extends CopyEntity> copyEntities) {
  Map<Path, Path> originToDestination = Maps.newHashMap();
  Set<Path> toDelete = Sets.newHashSet();

  for (CopyEntity entity : copyEntities) {
    if (entity instanceof CopyableFile) {
      CopyableFile copyableFile = (CopyableFile) entity;
      originToDestination.put(copyableFile.getOrigin().getPath(), copyableFile.getDestination());
    }

    // Checked independently of the branch above, exactly as two separate conditions.
    if (!(entity instanceof CommitStepCopyEntity)) {
      continue;
    }
    CommitStep commitStep = ((CommitStepCopyEntity) entity).getStep();
    if (!(commitStep instanceof DeleteFileCommitStep)) {
      continue;
    }
    for (FileStatus doomed : ((DeleteFileCommitStep) commitStep).getPathsToDelete()) {
      toDelete.add(doomed.getPath());
    }
  }

  return new ClassifiedFiles(originToDestination, toDelete);
}
/**
 * Runs {@link CopySource} against {@link TestCopyableDatasetFinder} and checks that one work unit is produced
 * per test file, that each deserialized file has the expected origin prefix and owner/permission, and that
 * all work units share the same extract.
 */
@Test
public void testCopySource() throws Exception {
  SourceState sourceState = new SourceState();
  sourceState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, "file:///");
  sourceState.setProp(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, "file:///");
  sourceState.setProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/target/dir");
  sourceState.setProp(DatasetUtils.DATASET_PROFILE_CLASS_KEY, TestCopyableDatasetFinder.class.getName());

  CopySource copySource = new CopySource();
  List<WorkUnit> flattened = JobLauncherUtils.flattenWorkUnits(copySource.getWorkunits(sourceState));
  Assert.assertEquals(flattened.size(), TestCopyableDataset.FILE_COUNT);

  // Every work unit is expected to carry the same extract as the first one.
  Extract sharedExtract = flattened.get(0).getExtract();
  for (WorkUnit unit : flattened) {
    CopyableFile copyableFile = (CopyableFile) CopySource.deserializeCopyEntity(unit);
    String originPath = copyableFile.getOrigin().getPath().toString();
    Assert.assertTrue(originPath.startsWith(TestCopyableDataset.ORIGIN_PREFIX));
    Assert.assertEquals(copyableFile.getDestinationOwnerAndPermission(), TestCopyableDataset.OWNER_AND_PERMISSION);
    Assert.assertEquals(unit.getExtract(), sharedExtract);
  }
}
/**
 * Produces the single record of this extractor: a {@link FileAwareInputStream} wrapping the file to copy.
 *
 * The first successful call returns the record; later calls return {@code null}. For a directory the record
 * carries {@link EmptyInputStream#instance}. For a split work unit the split is attached to the record and,
 * when present, the stream is positioned at the split's low position before being wrapped.
 *
 * @param reuse unused
 * @return the record on the first call, {@code null} afterwards
 */
@Override
public FileAwareInputStream readRecord(@Deprecated FileAwareInputStream reuse)
    throws DataRecordException, IOException {
  if (this.recordRead) {
    return null;
  }

  Configuration conf;
  if (this.state == null) {
    conf = HadoopUtils.newConfiguration();
  } else {
    conf = HadoopUtils.getConfFromState(this.state);
  }
  FileSystem originFs = this.file.getOrigin().getPath().getFileSystem(conf);

  // Marked as read only after the file system has been resolved.
  this.recordRead = true;

  FileAwareInputStream.FileAwareInputStreamBuilder recordBuilder = FileAwareInputStream.builder().file(this.file);
  if (this.file.getFileStatus().isDirectory()) {
    return recordBuilder.inputStream(EmptyInputStream.instance).build();
  }

  FSDataInputStream originStream = originFs.open(this.file.getFileStatus().getPath());
  boolean isSplit = this.state != null && DistcpFileSplitter.isSplitWorkUnit(this.state);
  if (isSplit) {
    Optional<DistcpFileSplitter.Split> maybeSplit = DistcpFileSplitter.getSplit(this.state);
    recordBuilder.split(maybeSplit);
    if (maybeSplit.isPresent()) {
      originStream.seek(maybeSplit.get().getLowPosition());
    }
  }

  recordBuilder.inputStream(MeteredInputStream.builder().in(originStream).build());
  return recordBuilder.build();
}
.fromOriginAndDestination(copyFromFs, originFileStatus, copyToFs.makeQualified(newPath), copyConfiguration) .fileSet(PathUtils.getPathWithoutSchemeAndAuthority(copyTo.getDatasetPath()).toString()).build(); copyableFile.setFsDatasets(copyFromFs, copyToFs); copyableFiles.add(copyableFile);
/**
 * Round-trips a fully populated {@link CopyableFile}, including a partition-level destination descriptor,
 * through {@link CopyEntity#serialize} and {@link CopyEntity#deserialize} and checks the result equals the
 * original.
 */
@Test
public void testSerializeDeserialze() throws Exception {
  FileStatus originStatus = new FileStatus(10, false, 12, 100, 12345, new Path("/path"));
  Path destinationPath = new Path("/destination");
  OwnerAndPermission destinationOwner = new OwnerAndPermission("owner", "group", FsPermission.getDefault());
  OwnerAndPermission ancestorOwner = new OwnerAndPermission("owner2", "group2", FsPermission.getDefault());

  CopyableFile original = new CopyableFile(originStatus, destinationPath, destinationOwner,
      Lists.newArrayList(ancestorOwner), "checksum".getBytes(), PreserveAttributes.fromMnemonicString(""), "", 0, 0,
      Maps.<String, String>newHashMap(), "");

  DatasetDescriptor hiveTable = new DatasetDescriptor("hive", "db.table");
  original.setDestinationData(new PartitionDescriptor("datepartition=2018/09/05", hiveTable));

  CopyEntity roundTripped = CopyEntity.deserialize(CopyEntity.serialize(original));
  Assert.assertEquals(roundTripped, original);
}
/**
 * Resolves where {@code file} is written inside the staging directory.
 *
 * @param file the file being written
 * @return for a split work unit, the staging directory joined with the split's part name; otherwise the
 *         staging directory joined with the name of the file's destination
 */
protected Path getStagingFilePath(CopyableFile file) {
  if (!DistcpFileSplitter.isSplitWorkUnit(this.state)) {
    return new Path(this.stagingDir, file.getDestination().getName());
  }
  // Split work units stage each part under its own part name.
  return new Path(this.stagingDir, DistcpFileSplitter.getSplit(this.state).get().getPartName());
}
properties.setProperty(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/publisher"); CopyableFile cf = CopyableFile.fromOriginAndDestination(this.fs, status, destination, CopyConfiguration.builder(FileSystem.getLocal(new Configuration()), properties).publishDir(new Path("/target")) .preserve(PreserveAttributes.fromMnemonicString("")).build()) Path outputRoot = FileAwareInputStreamDataWriter.getPartitionOutputRoot(outputDir, cf.getDatasetAndPartition(metadata)); Path existingOutputPath = new Path(outputRoot, destinationExistingToken); this.fs.mkdirs(existingOutputPath);