/**
 * Variant of {@link #withDecompression} that accepts a canonical {@link Compression}.
 *
 * <p>The given value is translated with {@code CompressionMode.fromCanonical} and the resulting
 * mode is passed on to {@link #withDecompression}.
 */
public CompressedSource<T> withCompression(Compression compression) {
  CompressionMode canonicalMode = CompressionMode.fromCanonical(compression);
  return withDecompression(canonicalMode);
}
/**
 * Reads several GZIP-compressed files through a single file pattern and checks that every
 * generated byte comes back.
 */
@Test
public void testCompressedReadMultipleFiles() throws Exception {
  final int fileCount = 3;
  final String prefix = "test_input-";
  File patternFile = new File(tmpFolder.getRoot().toString(), prefix + "*");
  String glob = patternFile.toString();

  // Write each file GZIP-compressed while remembering the uncompressed payload.
  List<Byte> expectedBytes = new ArrayList<>();
  for (int fileIndex = 0; fileIndex != fileCount; fileIndex++) {
    byte[] payload = generateInput(100);
    File compressedFile = tmpFolder.newFile(prefix + fileIndex);
    writeFile(compressedFile, payload, CompressionMode.GZIP);
    expectedBytes.addAll(Bytes.asList(payload));
  }

  ByteSource patternSource = new ByteSource(glob, 1);
  CompressedSource<Byte> gzipSource =
      CompressedSource.from(patternSource).withDecompression(CompressionMode.GZIP);
  List<Byte> readBytes =
      SourceTestUtils.readFromSource(gzipSource, PipelineOptionsFactory.create());

  // Compared as multisets, so the check ignores the order in which bytes are returned.
  assertEquals(HashMultiset.create(expectedBytes), HashMultiset.create(readBytes));
}
/**
 * Checks that a file made of two gzip streams written back to back is decompressed correctly.
 *
 * <p>Concatenating gzip files yields a valid gzip file, and decompressing it should produce the
 * concatenation of the individual uncompressed contents.
 */
@Test
public void testReadConcatenatedGzip() throws IOException {
  byte[] headerBytes = "a,b,c\n".getBytes(StandardCharsets.UTF_8);
  byte[] bodyBytes = "1,2,3\n4,5,6\n7,8,9\n".getBytes(StandardCharsets.UTF_8);
  byte[] uncompressed = concat(headerBytes, bodyBytes);

  // Compress each part on its own, then join the two compressed streams.
  byte[] gzippedHeader = compressGzip(headerBytes);
  byte[] gzippedBody = compressGzip(bodyBytes);
  byte[] joinedGzip = concat(gzippedHeader, gzippedBody);

  File gzFile = tmpFolder.newFile();
  try (FileOutputStream out = new FileOutputStream(gzFile)) {
    out.write(joinedGzip);
  }

  ByteSource fileSource = new ByteSource(gzFile.getAbsolutePath(), 1);
  CompressedSource<Byte> gzipSource =
      CompressedSource.from(fileSource).withDecompression(CompressionMode.GZIP);
  List<Byte> decoded =
      SourceTestUtils.readFromSource(gzipSource, PipelineOptionsFactory.create());

  assertEquals(Bytes.asList(uncompressed), decoded);
}
/** Test splittability of files in GZIP mode -- none should be splittable. */ @Test public void testGzipSplittable() throws Exception { CompressedSource<Byte> source; // GZip files are not splittable source = CompressedSource.from(new ByteSource("input.gz", 1)) .withDecompression(CompressionMode.GZIP); assertFalse(source.isSplittable()); source = CompressedSource.from(new ByteSource("input.GZ", 1)) .withDecompression(CompressionMode.GZIP); assertFalse(source.isSplittable()); // Other extensions are also not splittable. source = CompressedSource.from(new ByteSource("input.txt", 1)) .withDecompression(CompressionMode.GZIP); assertFalse(source.isSplittable()); source = CompressedSource.from(new ByteSource("input.csv", 1)) .withDecompression(CompressionMode.GZIP); assertFalse(source.isSplittable()); }
/**
 * Reads {@code inputFile} through a {@link CompressedSource} and asserts that the records match
 * {@code expected}.
 *
 * <p>Every record is compared together with the millisecond value of the reader's current
 * timestamp; the record at index {@code i} is expected to be paired with the value {@code i}.
 *
 * @param expected the bytes the reader should produce, in order
 * @param inputFile the file to read
 * @param decompressionFactory applied through {@code withDecompression} when non-null; when null
 *     the source returned by {@code CompressedSource.from} is used as is
 * @throws IOException if creating or driving the reader fails
 */
private void verifyReadContents(
    byte[] expected, File inputFile, @Nullable DecompressingChannelFactory decompressionFactory)
    throws IOException {
  CompressedSource<Byte> baseSource =
      CompressedSource.from(new ByteSource(inputFile.toPath().toString(), 1));
  CompressedSource<Byte> source =
      decompressionFactory == null
          ? baseSource
          : baseSource.withDecompression(decompressionFactory);

  // Expected records: each byte paired with its index in the input array.
  List<KV<Long, Byte>> expectedOutput = Lists.newArrayList();
  long position = 0;
  for (byte value : expected) {
    expectedOutput.add(KV.of(position, value));
    position++;
  }

  // Actual records: each value paired with the reader's current timestamp in milliseconds.
  List<KV<Long, Byte>> actualOutput = Lists.newArrayList();
  try (BoundedReader<Byte> reader = source.createReader(PipelineOptionsFactory.create())) {
    if (reader.start()) {
      do {
        long timestampMillis = reader.getCurrentTimestamp().getMillis();
        actualOutput.add(KV.of(timestampMillis, reader.getCurrent()));
      } while (reader.advance());
    }
  }

  assertEquals(expectedOutput, actualOutput);
}
/**
 * Checks that a file read in {@code UNCOMPRESSED} mode is reported as splittable, then runs the
 * exhaustive split-at-fraction check on it.
 */
@Test
public void testUncompressedFileWithUncompressedIsSplittable() throws Exception {
  final String stem = "test-input";
  File rawFile = tmpFolder.newFile(stem + ".bin");
  byte[] contents = generateInput(10);
  Files.write(contents, rawFile);

  ByteSource rawSource = new ByteSource(rawFile.getPath(), 1);
  CompressedSource<Byte> uncompressedSource =
      CompressedSource.from(rawSource).withDecompression(CompressionMode.UNCOMPRESSED);

  assertTrue(uncompressedSource.isSplittable());
  SourceTestUtils.assertSplitAtFractionExhaustive(
      uncompressedSource, PipelineOptionsFactory.create());
}
/**
 * Checks the display data of a {@link CompressedSource} and of its GZIP-configured variant.
 *
 * <p>The wrapped source adds its own {@code "foo"} item so that the test can check its display
 * data is included under the {@code "source"} key.
 */
@Test
public void testDisplayData() {
  ByteSource delegate =
      new ByteSource("foobar.txt", 1) {
        @Override
        public void populateDisplayData(DisplayData.Builder builder) {
          builder.add(DisplayData.item("foo", "bar"));
        }
      };

  CompressedSource<?> wrapped = CompressedSource.from(delegate);
  DisplayData wrappedData = DisplayData.from(wrapped);
  DisplayData gzipData = DisplayData.from(wrapped.withDecompression(CompressionMode.GZIP));

  // The compression mode is shown in both cases; for GZIP the exact value is checked.
  assertThat(wrappedData, hasDisplayItem("compressionMode"));
  assertThat(gzipData, hasDisplayItem("compressionMode", CompressionMode.GZIP.toString()));

  // The wrapped source is reported by class, and its own display data is included.
  assertThat(wrappedData, hasDisplayItem("source", delegate.getClass()));
  assertThat(wrappedData, includesDisplayDataFor("source", delegate));
}