protected Path getStagingFilePath(CopyableFile file) { if (DistcpFileSplitter.isSplitWorkUnit(this.state)) { return new Path(this.stagingDir, DistcpFileSplitter.getSplit(this.state).get().getPartName()); } return new Path(this.stagingDir, file.getDestination().getName()); }
Assert.assertEquals(copyableFile.getDestination().toString(), targetPath.toString()); Assert.assertEquals(copyableFile.getDestinationOwnerAndPermission().getGroup(), origin.getGroup()); Assert.assertEquals(copyableFile.getDestinationOwnerAndPermission().getOwner(), origin.getOwner());
public static Path getOutputFilePath(CopyableFile file, Path outputDir, CopyEntity.DatasetAndPartition datasetAndPartition) { Path destinationWithoutSchemeAndAuthority = PathUtils.getPathWithoutSchemeAndAuthority(file.getDestination()); return new Path(getPartitionOutputRoot(outputDir, datasetAndPartition), PathUtils.withoutLeadingSeparator(destinationWithoutSchemeAndAuthority)); }
/** * Merges all the splits for a given file. * Should be called on the target/destination file system (after blocks have been copied to targetFs). * @param fs {@link FileSystem} where file parts exist. * @param file {@link CopyableFile} to merge. * @param workUnits {@link WorkUnitState}s for all parts of this file. * @param parentPath {@link Path} where the parts of the file are located. * @return a {@link WorkUnit} equivalent to the distcp work unit if the file had not been split. * @throws IOException */ private static WorkUnitState mergeSplits(FileSystem fs, CopyableFile file, Collection<WorkUnitState> workUnits, Path parentPath) throws IOException { log.info(String.format("File %s was written in %d parts. Merging.", file.getDestination(), workUnits.size())); Path[] parts = new Path[workUnits.size()]; for (WorkUnitState workUnit : workUnits) { if (!isSplitWorkUnit(workUnit)) { throw new IOException("Not a split work unit."); } Split split = getSplit(workUnit).get(); parts[split.getSplitNumber()] = new Path(parentPath, split.getPartName()); } Path target = new Path(parentPath, file.getDestination().getName()); fs.rename(parts[0], target); fs.concat(target, Arrays.copyOfRange(parts, 1, parts.length)); WorkUnitState finalWorkUnit = workUnits.iterator().next(); finalWorkUnit.removeProp(SPLIT_KEY); return finalWorkUnit; }
private void modifyExtensionAtDestination(CopyableFile file) { if (extensionsToRemove().size() > 0) { file.setDestination(PathUtils.removeExtension(file.getDestination(), extensionsToRemove().toArray(new String[0]))); } } }
@Override public final void writeImpl(FileAwareInputStream fileAwareInputStream) throws IOException { CopyableFile copyableFile = fileAwareInputStream.getFile(); if (encryptionConfig != null) { copyableFile.setDestination(PathUtils.addExtension(copyableFile.getDestination(), "." + EncryptionConfigParser.getEncryptionType(encryptionConfig))); } Path stagingFile = getStagingFilePath(copyableFile); if (this.actualProcessedCopyableFile.isPresent()) { throw new IOException(this.getClass().getCanonicalName() + " can only process one file."); } this.actualProcessedCopyableFile = Optional.of(copyableFile); this.fs.mkdirs(stagingFile.getParent()); writeImpl(fileAwareInputStream.getInputStream(), stagingFile, copyableFile, fileAwareInputStream); this.filesWritten.incrementAndGet(); }
log.info(String.format("Merging split file %s.", file.getDestination()));
String.format("%s.__PART%d__", file.getDestination().getName(), i)); String serializedSplit = GSON.toJson(split);
private ClassifiedFiles classifyFiles(Collection<? extends CopyEntity> copyEntities) { Map<Path, Path> pathsToCopy = Maps.newHashMap(); Set<Path> pathsToDelete = Sets.newHashSet(); for (CopyEntity ce : copyEntities) { if (ce instanceof CopyableFile) { pathsToCopy.put(((CopyableFile) ce).getOrigin().getPath(), ((CopyableFile) ce).getDestination()); } if (ce instanceof CommitStepCopyEntity) { CommitStep step = ((CommitStepCopyEntity) ce).getStep(); if (step instanceof DeleteFileCommitStep) { for (FileStatus status : ((DeleteFileCommitStep) step).getPathsToDelete()) { pathsToDelete.add(status.getPath()); } } } } return new ClassifiedFiles(pathsToCopy, pathsToDelete); }
@Test public void testExtensionStripping() throws DataConversionException, IOException { List<String> helloWorldFiles = ImmutableList.of("helloworld.txt.gzip", "helloworld.txt.gz"); UnGzipConverter converter = new UnGzipConverter(); FileSystem fs = FileSystem.getLocal(new Configuration()); for (String fileName: helloWorldFiles) { String filePath = "unGzipConverterTest/" + fileName; String fullPath = getClass().getClassLoader().getResource(filePath).getFile(); FileAwareInputStream fileAwareInputStream = FileAwareInputStream.builder() .file(CopyableFileUtils.getTestCopyableFile(filePath, "/tmp/" + fileName, null, null)) .inputStream(fs.open(new Path(fullPath))).build(); Iterable<FileAwareInputStream> iterable = converter.convertRecord("outputSchema", fileAwareInputStream, new WorkUnitState()); FileAwareInputStream out = iterable.iterator().next(); Assert.assertEquals(out.getFile().getDestination().getName(), "helloworld.txt"); String contents = IOUtils.toString(out.getInputStream(), StandardCharsets.UTF_8); Assert.assertEquals(contents, "helloworld\n"); } }
PathUtils.getPathWithoutSchemeAndAuthority(new Path(sourceDir))); Path targetRelativePath = PathUtils.relativizePath(PathUtils.getPathWithoutSchemeAndAuthority(file.getDestination()), PathUtils.getPathWithoutSchemeAndAuthority(new Path(destinationDir)));
private Collection<WorkUnit> createMockSplitWorkUnits(FileSystem fs, long fileLen, long blockSize, long maxSplitSize) throws Exception { FileStatus file = mock(FileStatus.class); when(file.getLen()).thenReturn(fileLen); when(file.getBlockSize()).thenReturn(blockSize); URI uri = new URI("hdfs", "dummyhost", "/test", "test"); Path path = new Path(uri); when(fs.getUri()).thenReturn(uri); CopyableDatasetMetadata cdm = new CopyableDatasetMetadata(new TestCopyableDataset(path)); CopyableFile cf = CopyableFileUtils.getTestCopyableFile(); CopyableFile spy = spy(cf); doReturn(file).when(spy).getFileStatus(); doReturn(blockSize).when(spy).getBlockSize(any(FileSystem.class)); doReturn(path).when(spy).getDestination(); WorkUnit wu = WorkUnit.createEmpty(); wu.setProp(DistcpFileSplitter.MAX_SPLIT_SIZE_KEY, maxSplitSize); wu.setProp(ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_OUTPUT_DIR, 1, 0), path.toString()); CopySource.setWorkUnitGuid(wu, Guid.fromStrings(wu.toString())); CopySource.serializeCopyEntity(wu, cf); CopySource.serializeCopyableDataset(wu, cdm); return DistcpFileSplitter.splitFile(spy, wu, fs); }
@Test (enabled=false) public void testConvertDifferentEncryption() throws IOException, DataConversionException { final String expectedFileContents = "2345678"; WorkUnitState workUnitState = new WorkUnitState(); workUnitState.getJobState() .setProp("converter.encrypt." + EncryptionConfigParser.ENCRYPTION_ALGORITHM_KEY, "insecure_shift"); try (DecryptConverter converter = new DecryptConverter()) { converter.init(workUnitState); FileSystem fs = FileSystem.getLocal(new Configuration()); URL url = getClass().getClassLoader().getResource("decryptConverterTest/decrypt-test.txt.insecure_shift"); Assert.assertNotNull(url); String testFilePath = url.getFile(); try (FSDataInputStream testFileInput = fs.open(new Path(testFilePath))) { FileAwareInputStream fileAwareInputStream = FileAwareInputStream.builder() .file(CopyableFileUtils.getTestCopyableFile()).inputStream(testFileInput).build(); fileAwareInputStream.getFile().setDestination(new Path("file:///tmp/decrypt-test.txt.insecure_shift")); Iterable<FileAwareInputStream> iterable = converter.convertRecord("outputSchema", fileAwareInputStream, workUnitState); FileAwareInputStream decryptedStream = Iterables.getFirst(iterable, null); Assert.assertNotNull(decryptedStream); String actual = IOUtils.toString(decryptedStream.getInputStream(), Charsets.UTF_8); Assert.assertEquals(actual, expectedFileContents); Assert.assertEquals(decryptedStream.getFile().getDestination().getName(), "decrypt-test.txt"); } } }
/** * Submit an sla event when a {@link org.apache.gobblin.data.management.copy.CopyableFile} is published. The <code>workUnitState</code> passed should have the * required {@link SlaEventKeys} set. * * @see SlaEventSubmitter#submit() * * @param eventSubmitter * @param workUnitState */ static void submitSuccessfulFilePublish(EventSubmitter eventSubmitter, CopyableFile cf, WorkUnitState workUnitState) { String datasetUrn = workUnitState.getProp(SlaEventKeys.DATASET_URN_KEY); String partition = workUnitState.getProp(SlaEventKeys.PARTITION_KEY); String completenessPercentage = workUnitState.getProp(SlaEventKeys.COMPLETENESS_PERCENTAGE_KEY); String recordCount = workUnitState.getProp(SlaEventKeys.RECORD_COUNT_KEY); String previousPublishTimestamp = workUnitState.getProp(SlaEventKeys.PREVIOUS_PUBLISH_TS_IN_MILLI_SECS_KEY); String dedupeStatus = workUnitState.getProp(SlaEventKeys.DEDUPE_STATUS_KEY); SlaEventSubmitter.builder().eventSubmitter(eventSubmitter).eventName(FILE_PUBLISHED_EVENT_NAME) .datasetUrn(datasetUrn).partition(partition).originTimestamp(Long.toString(cf.getOriginTimestamp())) .upstreamTimestamp(Long.toString(cf.getUpstreamTimestamp())).completenessPercentage(completenessPercentage) .recordCount(recordCount).previousPublishTimestamp(previousPublishTimestamp).dedupeStatus(dedupeStatus) .additionalMetadata(TARGET_PATH, cf.getDestination().toString()) .additionalMetadata(SOURCE_PATH, cf.getOrigin().getPath().toString()) .additionalMetadata(SIZE_IN_BYTES, Long.toString(cf.getOrigin().getLen())).build().submit(); } }
@Test(dataProvider = "testFileDataProvider") public void testWrite(final String filePath, final String newFileName, final String expectedText) throws Exception { String expectedFileContents = "text"; String fileNameInArchive = "text.txt"; WorkUnitState state = TestUtils.createTestWorkUnitState(); state.setProp(ConfigurationKeys.WRITER_STAGING_DIR, new Path(testTempPath, "staging").toString()); state.setProp(ConfigurationKeys.WRITER_OUTPUT_DIR, new Path(testTempPath, "output").toString()); state.setProp(ConfigurationKeys.WRITER_FILE_PATH, "writer_file_path_" + RandomStringUtils.randomAlphabetic(5)); CopyableDatasetMetadata metadata = new CopyableDatasetMetadata(new TestCopyableDataset(new Path("/source"))); CopySource.serializeCopyableDataset(state, metadata); FileAwareInputStream fileAwareInputStream = getCompressedInputStream(filePath, newFileName); CopySource.serializeCopyEntity(state, fileAwareInputStream.getFile()); TarArchiveInputStreamDataWriter dataWriter = new TarArchiveInputStreamDataWriter(state, 1, 0); dataWriter.write(fileAwareInputStream); dataWriter.commit(); // the archive file contains file test.txt Path unArchivedFilePath = new Path(fileAwareInputStream.getFile().getDestination(), fileNameInArchive); // Path at which the writer writes text.txt Path taskOutputFilePath = new Path(new Path(state.getProp(ConfigurationKeys.WRITER_OUTPUT_DIR), fileAwareInputStream.getFile().getDatasetAndPartition(metadata).identifier()), PathUtils.withoutLeadingSeparator(unArchivedFilePath)); Assert.assertEquals(IOUtils.toString(new FileInputStream(taskOutputFilePath.toString())).trim(), expectedFileContents); }
@Test public void testWrite() throws Exception { String streamString = "testContents"; FileStatus status = fs.getFileStatus(testTempPath); OwnerAndPermission ownerAndPermission = new OwnerAndPermission(status.getOwner(), status.getGroup(), new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL)); CopyableFile cf = CopyableFileUtils.getTestCopyableFile(ownerAndPermission); CopyableDatasetMetadata metadata = new CopyableDatasetMetadata(new TestCopyableDataset(new Path("/source"))); WorkUnitState state = TestUtils.createTestWorkUnitState(); state.setProp(ConfigurationKeys.WRITER_STAGING_DIR, new Path(testTempPath, "staging").toString()); state.setProp(ConfigurationKeys.WRITER_OUTPUT_DIR, new Path(testTempPath, "output").toString()); state.setProp(ConfigurationKeys.WRITER_FILE_PATH, RandomStringUtils.randomAlphabetic(5)); CopySource.serializeCopyEntity(state, cf); CopySource.serializeCopyableDataset(state, metadata); FileAwareInputStreamDataWriter dataWriter = new FileAwareInputStreamDataWriter(state, 1, 0); FileAwareInputStream fileAwareInputStream = FileAwareInputStream.builder().file(cf) .inputStream(StreamUtils.convertStream(IOUtils.toInputStream(streamString))).build(); dataWriter.write(fileAwareInputStream); dataWriter.commit(); Path writtenFilePath = new Path(new Path(state.getProp(ConfigurationKeys.WRITER_OUTPUT_DIR), cf.getDatasetAndPartition(metadata).identifier()), cf.getDestination()); Assert.assertEquals(IOUtils.toString(new FileInputStream(writtenFilePath.toString())), streamString); }
@Test public void testWriteWithEncryption() throws Exception { byte[] streamString = "testEncryptedContents".getBytes("UTF-8"); byte[] expectedContents = new byte[streamString.length]; for (int i = 0; i < streamString.length; i++) { expectedContents[i] = (byte)((streamString[i] + 1) % 256); } FileStatus status = fs.getFileStatus(testTempPath); OwnerAndPermission ownerAndPermission = new OwnerAndPermission(status.getOwner(), status.getGroup(), new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL)); CopyableFile cf = CopyableFileUtils.getTestCopyableFile(ownerAndPermission); CopyableDatasetMetadata metadata = new CopyableDatasetMetadata(new TestCopyableDataset(new Path("/source"))); WorkUnitState state = TestUtils.createTestWorkUnitState(); state.setProp(ConfigurationKeys.WRITER_STAGING_DIR, new Path(testTempPath, "staging").toString()); state.setProp(ConfigurationKeys.WRITER_OUTPUT_DIR, new Path(testTempPath, "output").toString()); state.setProp(ConfigurationKeys.WRITER_FILE_PATH, RandomStringUtils.randomAlphabetic(5)); state.setProp("writer.encrypt." + EncryptionConfigParser.ENCRYPTION_ALGORITHM_KEY, "insecure_shift"); CopySource.serializeCopyEntity(state, cf); CopySource.serializeCopyableDataset(state, metadata); FileAwareInputStreamDataWriter dataWriter = new FileAwareInputStreamDataWriter(state, 1, 0); FileAwareInputStream fileAwareInputStream = FileAwareInputStream.builder().file(cf) .inputStream(StreamUtils.convertStream(new ByteArrayInputStream(streamString))).build(); dataWriter.write(fileAwareInputStream); dataWriter.commit(); Path writtenFilePath = new Path(new Path(state.getProp(ConfigurationKeys.WRITER_OUTPUT_DIR), cf.getDatasetAndPartition(metadata).identifier()), cf.getDestination()); Assert.assertTrue(writtenFilePath.getName().endsWith("insecure_shift"), "Expected encryption name to be appended to destination"); Assert.assertEquals(IOUtils.toByteArray(new FileInputStream(writtenFilePath.toString())), expectedContents); }
int splits = (int) (streamString.length() / splitLen + 1); DistcpFileSplitter.Split split = new DistcpFileSplitter.Split(0, splitLen, 0, splits, String.format("%s.__PART%d__", cf.getDestination().getName(), 0)); FSDataInputStream dataInputStream = StreamUtils.convertStream(IOUtils.toInputStream(streamString)); dataInputStream.seek(split.getLowPosition()); dataWriter.commit(); Path writtenFilePath = new Path(new Path(state.getProp(ConfigurationKeys.WRITER_OUTPUT_DIR), cf.getDatasetAndPartition(metadata).identifier()), cf.getDestination()); Assert.assertEquals(IOUtils.toString(new FileInputStream(writtenFilePath.toString())), streamString.substring(0, (int) splitLen));
cf.getDatasetAndPartition(metadata).identifier()), cf.getDestination()); Assert.assertTrue(writtenFilePath.getName().endsWith("gpg"), "Expected encryption name to be appended to destination");
cf.getDatasetAndPartition(metadata).identifier()), cf.getDestination()); Assert.assertTrue(writtenFilePath.getName().endsWith("gpg"), "Expected encryption name to be appended to destination");