/**
 * Builds the source for this read: a {@code TFRecordSource} over the configured file pattern,
 * wrapped in a {@code CompressedSource} that uses the configured compression.
 *
 * @return the compression-aware source producing {@code byte[]} records
 */
protected FileBasedSource<byte[]> getSource() {
  TFRecordSource rawRecords = new TFRecordSource(getFilepattern());
  return CompressedSource.from(rawRecords).withCompression(getCompression());
}
/**
 * Builds the source for this read: an {@code XmlSource} over the configured file or pattern
 * spec, wrapped in a {@code CompressedSource} that uses the configured compression.
 *
 * @return the compression-aware bounded source
 */
@VisibleForTesting
BoundedSource<T> createSource() {
  XmlSource<T> xmlSource =
      new XmlSource<>(StaticValueProvider.of(getFileOrPatternSpec()), getConfiguration(), 1L);
  return CompressedSource.from(xmlSource).withCompression(getCompression());
}
/**
 * Builds the source for this read: a {@code TextSource} configured with the file pattern, the
 * empty-match treatment from the match configuration, and the delimiter, wrapped in a
 * {@code CompressedSource} that uses the configured compression.
 *
 * @return the compression-aware source producing {@code String} records
 */
protected FileBasedSource<String> getSource() {
  TextSource textSource =
      new TextSource(
          getFilepattern(), getMatchConfiguration().getEmptyMatchTreatment(), getDelimiter());
  return CompressedSource.from(textSource).withCompression(getCompression());
}
/** Test splittability of files in AUTO mode. */ @Test public void testAutoSplittable() throws Exception { CompressedSource<Byte> source; // GZip files are not splittable source = CompressedSource.from(new ByteSource("input.gz", 1)); assertFalse(source.isSplittable()); source = CompressedSource.from(new ByteSource("input.GZ", 1)); assertFalse(source.isSplittable()); // BZ2 files are not splittable source = CompressedSource.from(new ByteSource("input.bz2", 1)); assertFalse(source.isSplittable()); source = CompressedSource.from(new ByteSource("input.BZ2", 1)); assertFalse(source.isSplittable()); // ZIP files are not splittable source = CompressedSource.from(new ByteSource("input.zip", 1)); assertFalse(source.isSplittable()); source = CompressedSource.from(new ByteSource("input.ZIP", 1)); assertFalse(source.isSplittable()); // DEFLATE files are not splittable source = CompressedSource.from(new ByteSource("input.deflate", 1)); assertFalse(source.isSplittable()); source = CompressedSource.from(new ByteSource("input.DEFLATE", 1)); assertFalse(source.isSplittable()); // Other extensions are assumed to be splittable. source = CompressedSource.from(new ByteSource("input.txt", 1)); assertTrue(source.isSplittable()); source = CompressedSource.from(new ByteSource("input.csv", 1)); assertTrue(source.isSplittable()); }
/** A real gzip-compressed {@code .gz} file on disk must be reported as not splittable. */
@Test
public void testGzipFileIsNotSplittable() throws Exception {
  File gzFile = tmpFolder.newFile("test-input" + ".gz");
  writeFile(gzFile, generateInput(10), CompressionMode.GZIP);

  ByteSource bytesOfFile = new ByteSource(gzFile.getPath(), 1);
  assertFalse(CompressedSource.from(bytesOfFile).isSplittable());
}
/** A real bzip2-compressed {@code .bz2} file on disk must be reported as not splittable. */
@Test
public void testBzip2FileIsNotSplittable() throws Exception {
  File bz2File = tmpFolder.newFile("test-input" + ".bz2");
  writeFile(bz2File, generateInput(10), CompressionMode.BZIP2);

  ByteSource bytesOfFile = new ByteSource(bz2File.getPath(), 1);
  assertFalse(CompressedSource.from(bytesOfFile).isSplittable());
}
/**
 * Test reading multiple files: several gzip-compressed files are written, then read back
 * through a single glob pattern with GZIP decompression set explicitly. Contents are compared
 * as multisets, so the order in which bytes are read does not matter.
 */
@Test
public void testCompressedReadMultipleFiles() throws Exception {
  final int numFiles = 3;
  final String baseName = "test_input-";

  List<Byte> expected = new ArrayList<>();
  for (int fileIndex = 0; fileIndex < numFiles; fileIndex++) {
    byte[] contents = generateInput(100);
    File gzFile = tmpFolder.newFile(baseName + fileIndex);
    writeFile(gzFile, contents, CompressionMode.GZIP);
    expected.addAll(Bytes.asList(contents));
  }

  // Glob matching every file written above.
  String filePattern = new File(tmpFolder.getRoot().toString(), baseName + "*").toString();
  ByteSource matchedFiles = new ByteSource(filePattern, 1);
  CompressedSource<Byte> source =
      CompressedSource.from(matchedFiles).withDecompression(CompressionMode.GZIP);

  List<Byte> actual = SourceTestUtils.readFromSource(source, PipelineOptionsFactory.create());
  assertEquals(HashMultiset.create(expected), HashMultiset.create(actual));
}
/**
 * Test a concatenation of gzip files is correctly decompressed.
 *
 * <p>Two byte sequences are gzip-compressed separately and written back-to-back into a single
 * file. Such a concatenation is itself a valid gzip file, and reading it must yield the
 * concatenation of the two uncompressed sequences.
 */
@Test
public void testReadConcatenatedGzip() throws IOException {
  byte[] header = "a,b,c\n".getBytes(StandardCharsets.UTF_8);
  byte[] body = "1,2,3\n4,5,6\n7,8,9\n".getBytes(StandardCharsets.UTF_8);
  byte[] uncompressed = concat(header, body);
  byte[] concatenatedGz = concat(compressGzip(header), compressGzip(body));

  File gzFile = tmpFolder.newFile();
  try (FileOutputStream out = new FileOutputStream(gzFile)) {
    out.write(concatenatedGz);
  }

  ByteSource fileBytes = new ByteSource(gzFile.getAbsolutePath(), 1);
  CompressedSource<Byte> source =
      CompressedSource.from(fileBytes).withDecompression(CompressionMode.GZIP);
  List<Byte> actual = SourceTestUtils.readFromSource(source, PipelineOptionsFactory.create());
  assertEquals(Bytes.asList(uncompressed), actual);
}
/** Test splittability of files in GZIP mode -- none should be splittable. */ @Test public void testGzipSplittable() throws Exception { CompressedSource<Byte> source; // GZip files are not splittable source = CompressedSource.from(new ByteSource("input.gz", 1)) .withDecompression(CompressionMode.GZIP); assertFalse(source.isSplittable()); source = CompressedSource.from(new ByteSource("input.GZ", 1)) .withDecompression(CompressionMode.GZIP); assertFalse(source.isSplittable()); // Other extensions are also not splittable. source = CompressedSource.from(new ByteSource("input.txt", 1)) .withDecompression(CompressionMode.GZIP); assertFalse(source.isSplittable()); source = CompressedSource.from(new ByteSource("input.csv", 1)) .withDecompression(CompressionMode.GZIP); assertFalse(source.isSplittable()); }
/**
 * Reads {@code inputFile} through a {@code CompressedSource} and asserts that the elements read
 * are exactly the bytes of {@code expected}, in order, and that the timestamp (in millis) of
 * each element equals its index in {@code expected}.
 *
 * @param expected the bytes the file should decode to
 * @param inputFile the file to read
 * @param decompressionFactory decompression to apply, or {@code null} to leave the source's
 *     decompression unset
 * @throws IOException if creating or advancing the reader fails
 */
private void verifyReadContents(
    byte[] expected, File inputFile, @Nullable DecompressingChannelFactory decompressionFactory)
    throws IOException {
  // Each expected element is paired with its position, to be matched against the timestamp.
  List<KV<Long, Byte>> expectedOutput = Lists.newArrayList();
  for (int index = 0; index < expected.length; index++) {
    expectedOutput.add(KV.of((long) index, expected[index]));
  }

  CompressedSource<Byte> undecorated =
      CompressedSource.from(new ByteSource(inputFile.toPath().toString(), 1));
  CompressedSource<Byte> source =
      decompressionFactory == null
          ? undecorated
          : undecorated.withDecompression(decompressionFactory);

  List<KV<Long, Byte>> actualOutput = Lists.newArrayList();
  try (BoundedReader<Byte> reader = source.createReader(PipelineOptionsFactory.create())) {
    boolean hasElement = reader.start();
    while (hasElement) {
      actualOutput.add(KV.of(reader.getCurrentTimestamp().getMillis(), reader.getCurrent()));
      hasElement = reader.advance();
    }
  }

  assertEquals(expectedOutput, actualOutput);
}
/**
 * An uncompressed {@code .bin} file read with no explicit decompression mode must be
 * splittable, and must pass the exhaustive split-at-fraction check.
 */
@Test
public void testUncompressedFileWithAutoIsSplittable() throws Exception {
  File plainFile = tmpFolder.newFile("test-input" + ".bin");
  Files.write(generateInput(10), plainFile);

  ByteSource fileBytes = new ByteSource(plainFile.getPath(), 1);
  CompressedSource<Byte> source = CompressedSource.from(fileBytes);

  assertTrue(source.isSplittable());
  SourceTestUtils.assertSplitAtFractionExhaustive(source, PipelineOptionsFactory.create());
}
/**
 * An uncompressed {@code .bin} file read with the decompression mode explicitly set to
 * {@code UNCOMPRESSED} must be splittable, and must pass the exhaustive split-at-fraction check.
 */
@Test
public void testUncompressedFileWithUncompressedIsSplittable() throws Exception {
  File plainFile = tmpFolder.newFile("test-input" + ".bin");
  Files.write(generateInput(10), plainFile);

  ByteSource fileBytes = new ByteSource(plainFile.getPath(), 1);
  CompressedSource<Byte> source =
      CompressedSource.from(fileBytes).withDecompression(CompressionMode.UNCOMPRESSED);

  assertTrue(source.isSplittable());
  SourceTestUtils.assertSplitAtFractionExhaustive(source, PipelineOptionsFactory.create());
}
/** Test reading multiple files with different compression. */ @Test public void testHeterogeneousCompression() throws Exception { String baseName = "test-input"; // Expected data byte[] generated = generateInput(1000); List<Byte> expected = new ArrayList<>(); // Every sort of compression File uncompressedFile = tmpFolder.newFile(baseName + ".bin"); generated = generateInput(1000, 1); Files.write(generated, uncompressedFile); expected.addAll(Bytes.asList(generated)); File gzipFile = tmpFolder.newFile(baseName + ".gz"); generated = generateInput(1000, 2); writeFile(gzipFile, generated, CompressionMode.GZIP); expected.addAll(Bytes.asList(generated)); File bzip2File = tmpFolder.newFile(baseName + ".bz2"); generated = generateInput(1000, 3); writeFile(bzip2File, generated, CompressionMode.BZIP2); expected.addAll(Bytes.asList(generated)); String filePattern = new File(tmpFolder.getRoot().toString(), baseName + ".*").toString(); CompressedSource<Byte> source = CompressedSource.from(new ByteSource(filePattern, 1)); List<Byte> actual = SourceTestUtils.readFromSource(source, PipelineOptionsFactory.create()); assertEquals(HashMultiset.create(actual), HashMultiset.create(expected)); }
@Test public void testUnsplittable() throws IOException { String baseName = "test-input"; File compressedFile = tmpFolder.newFile(baseName + ".gz"); byte[] input = generateInput(10000); writeFile(compressedFile, input, CompressionMode.GZIP); CompressedSource<Byte> source = CompressedSource.from(new ByteSource(compressedFile.getPath(), 1)); List<Byte> expected = Lists.newArrayList(); for (byte i : input) { expected.add(i); } PipelineOptions options = PipelineOptionsFactory.create(); BoundedReader<Byte> reader = source.createReader(options); List<Byte> actual = Lists.newArrayList(); for (boolean hasNext = reader.start(); hasNext; hasNext = reader.advance()) { actual.add(reader.getCurrent()); // checkpoint every 9 elements if (actual.size() % 9 == 0) { Double fractionConsumed = reader.getFractionConsumed(); assertNotNull(fractionConsumed); assertNull(reader.splitAtFraction(fractionConsumed)); } } assertEquals(expected.size(), actual.size()); assertEquals(Sets.newHashSet(expected), Sets.newHashSet(actual)); }
/**
 * Checks the display data of a {@code CompressedSource}: it exposes a {@code compressionMode}
 * item (with the GZIP mode's string form once GZIP decompression is set), a {@code source} item
 * holding the wrapped source's class, and the wrapped source's own display data under
 * {@code source}.
 */
@Test
public void testDisplayData() {
  // Wrapped source that contributes a recognizable display item of its own.
  ByteSource wrapped =
      new ByteSource("foobar.txt", 1) {
        @Override
        public void populateDisplayData(DisplayData.Builder builder) {
          builder.add(DisplayData.item("foo", "bar"));
        }
      };

  CompressedSource<?> autoSource = CompressedSource.from(wrapped);
  CompressedSource<?> gzipSource = autoSource.withDecompression(CompressionMode.GZIP);

  DisplayData autoDisplayData = DisplayData.from(autoSource);
  DisplayData gzipDisplayData = DisplayData.from(gzipSource);

  assertThat(autoDisplayData, hasDisplayItem("compressionMode"));
  assertThat(gzipDisplayData, hasDisplayItem("compressionMode", CompressionMode.GZIP.toString()));
  assertThat(autoDisplayData, hasDisplayItem("source", wrapped.getClass()));
  assertThat(autoDisplayData, includesDisplayDataFor("source", wrapped));
}
/**
 * Reads one (file, offset range) element: builds a compression-aware source for the file using
 * the file's own compression, restricts it to the given offset range, and outputs every record
 * the resulting reader produces.
 *
 * @param c the process context supplying the element, the pipeline options and the output
 * @throws IOException if reading the file fails
 */
@ProcessElement
public void process(ProcessContext c) throws IOException {
  ReadableFile file = c.element().getKey();
  OffsetRange range = c.element().getValue();

  String resourceName = file.getMetadata().resourceId().toString();
  FileBasedSource<T> source =
      CompressedSource.from(createSource.apply(resourceName))
          .withCompression(file.getCompression());

  try (BoundedSource.BoundedReader<T> reader =
      source
          .createForSubrangeOfFile(file.getMetadata(), range.getFrom(), range.getTo())
          .createReader(c.getPipelineOptions())) {
    if (reader.start()) {
      do {
        c.output(reader.getCurrent());
      } while (reader.advance());
    }
  }
}
} // closes the enclosing class, whose header is outside this view
@Test public void testEmptyGzipProgress() throws IOException { File tmpFile = tmpFolder.newFile("empty.gz"); String filename = tmpFile.toPath().toString(); writeFile(tmpFile, new byte[0], CompressionMode.GZIP); PipelineOptions options = PipelineOptionsFactory.create(); CompressedSource<Byte> source = CompressedSource.from(new ByteSource(filename, 1)); try (BoundedReader<Byte> readerOrig = source.createReader(options)) { assertThat(readerOrig, instanceOf(CompressedReader.class)); CompressedReader<Byte> reader = (CompressedReader<Byte>) readerOrig; // before starting assertEquals(0.0, reader.getFractionConsumed(), 1e-6); assertEquals(0, reader.getSplitPointsConsumed()); assertEquals(1, reader.getSplitPointsRemaining()); // confirm empty assertFalse(reader.start()); // after reading empty source assertEquals(1.0, reader.getFractionConsumed(), 1e-6); assertEquals(0, reader.getSplitPointsConsumed()); assertEquals(0, reader.getSplitPointsRemaining()); } }
CompressedSource<Byte> source = CompressedSource.from(new ByteSource(filename, 1)); try (BoundedReader<Byte> readerOrig = source.createReader(options)) { assertThat(readerOrig, not(instanceOf(CompressedReader.class)));
CompressedSource<Byte> source = CompressedSource.from(new ByteSource(filename, 1)); try (BoundedReader<Byte> readerOrig = source.createReader(options)) { assertThat(readerOrig, instanceOf(CompressedReader.class));