/** Returns {@code true} if the remote {@link URI} resolves to an existing file. */
public boolean remoteExists(URI uri) throws IOException {
  try {
    FileSystems.matchSingleFileSpec(uri.toString());
  } catch (FileNotFoundException e) {
    // No match means the remote file is absent, not an I/O failure.
    return false;
  }
  return true;
}
/** Looks up the {@link Metadata} of the single file identified by {@code src}. */
private static Metadata getMetadata(URI src) throws IOException {
  String spec = src.toString();
  return FileSystems.matchSingleFileSpec(spec);
}
/** Constructs an {@link ExplicitShardedFile} for the given files. */
public ExplicitShardedFile(Collection<String> files) throws IOException {
  // Resolve each spec eagerly so construction fails fast on a missing file.
  this.files = new ArrayList<>(files.size());
  for (String spec : files) {
    this.files.add(FileSystems.matchSingleFileSpec(spec));
  }
}
/**
 * Used by the Dataflow worker. Do not introduce new usages. Do not delete without confirming that
 * Dataflow ValidatesRunner tests pass.
 *
 * @deprecated Used by Dataflow worker
 */
@Deprecated
public BlockBasedSource<T> createForSubrangeOfFile(String fileName, long start, long end)
    throws IOException {
  // Resolve the file name to metadata, then delegate to the metadata-based overload.
  Metadata metadata = FileSystems.matchSingleFileSpec(fileName);
  return createForSubrangeOfFile(metadata, start, end);
}
/**
 * Reads the entire contents of {@code passwordFile} as a UTF-8 string.
 *
 * @param passwordFile file spec resolvable by {@link FileSystems#matchSingleFileSpec}
 * @return the file contents decoded as UTF-8
 * @throws IOException if the file cannot be matched or read
 */
String readFromFile(String passwordFile) throws IOException {
  MatchResult.Metadata m = FileSystems.matchSingleFileSpec(passwordFile);
  // Log only the resolved path, never the password itself.
  LOGGER.info("Reading password from file: {}", m.resourceId().toString());
  // try-with-resources: the original leaked the channel-backed stream.
  try (InputStream inputStream = Channels.newInputStream(FileSystems.open(m.resourceId()))) {
    return CharStreams.toString(new InputStreamReader(inputStream, Charsets.UTF_8));
  }
}
private boolean alreadyStaged(PackageAttributes attributes) throws IOException { try { long remoteLength = FileSystems.matchSingleFileSpec(attributes.getDestination().getLocation()).sizeBytes(); return remoteLength == attributes.getSize(); } catch (FileNotFoundException expected) { // If the file doesn't exist, it means we need to upload it. return false; } }
@Test
public void testToStringFile() throws Exception {
  // An empty data file is enough: toString only reflects the path and range.
  File f = createFileWithData("foo", Collections.emptyList());
  Metadata metadata = FileSystems.matchSingleFileSpec(f.getPath());
  TestFileBasedSource source = new TestFileBasedSource(metadata, 1, 0, 10, null);
  String expected = String.format("%s range [0, 10)", f.getAbsolutePath());
  assertEquals(expected, source.toString());
}
}
@Test public void testReadRangeFromFileWithSplitsFromStart() throws IOException { PipelineOptions options = PipelineOptionsFactory.create(); String header = "<h>"; List<String> data = new ArrayList<>(); for (int i = 0; i < 10; i++) { data.add(header); data.addAll(createStringDataset(3, 9)); } String fileName = "file"; File file = createFileWithData(fileName, data); Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath()); TestFileBasedSource source1 = new TestFileBasedSource(metadata, 64, 0, 60, header); TestFileBasedSource source2 = new TestFileBasedSource(metadata, 64, 60, Long.MAX_VALUE, header); List<String> expectedResults = new ArrayList<>(); expectedResults.addAll(data); // Remove all occurrences of header from expected results. expectedResults.removeAll(Arrays.asList(header)); List<String> results = new ArrayList<>(); results.addAll(readFromSource(source1, options)); results.addAll(readFromSource(source2, options)); assertThat(expectedResults, containsInAnyOrder(results.toArray())); }
/**
 * Initializes {@code tikaConfig} from the configured config path, if any.
 *
 * <p>Closes the config input stream after {@link TikaConfig} has consumed it — the original
 * leaked the channel-backed stream.
 */
@Setup
public void setup() throws Exception {
  if (spec.getTikaConfigPath() != null) {
    ResourceId configResource =
        FileSystems.matchSingleFileSpec(spec.getTikaConfigPath().get()).resourceId();
    // Fully-qualified type avoids relying on a java.io.InputStream import in this file.
    try (java.io.InputStream configStream =
        Channels.newInputStream(FileSystems.open(configResource))) {
      tikaConfig = new TikaConfig(configStream);
    }
  }
}
@Test public void testSplitAtFractionExhaustive() throws Exception { PipelineOptions options = PipelineOptionsFactory.create(); // Smaller file for exhaustive testing. File file = createFileWithData("file", createStringDataset(3, 20)); Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath()); TestFileBasedSource source = new TestFileBasedSource(metadata, 1, 0, file.length(), null); assertSplitAtFractionExhaustive(source, options); }
@Test public void testReadRangeFromFileWithSplitsFromMiddle() throws IOException { PipelineOptions options = PipelineOptionsFactory.create(); String header = "<h>"; List<String> data = new ArrayList<>(); for (int i = 0; i < 10; i++) { data.add(header); data.addAll(createStringDataset(3, 9)); } String fileName = "file"; File file = createFileWithData(fileName, data); Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath()); TestFileBasedSource source1 = new TestFileBasedSource(metadata, 64, 0, 42, header); TestFileBasedSource source2 = new TestFileBasedSource(metadata, 64, 42, 112, header); TestFileBasedSource source3 = new TestFileBasedSource(metadata, 64, 112, Long.MAX_VALUE, header); List<String> expectedResults = new ArrayList<>(); expectedResults.addAll(data); // Remove all occurrences of header from expected results. expectedResults.removeAll(Collections.singletonList(header)); List<String> results = new ArrayList<>(); results.addAll(readFromSource(source1, options)); results.addAll(readFromSource(source2, options)); results.addAll(readFromSource(source3, options)); assertThat(expectedResults, containsInAnyOrder(results.toArray())); }
@Test
public void testReadRangeAtStart() throws IOException {
  PipelineOptions options = PipelineOptionsFactory.create();
  List<String> data = createStringDataset(3, 50);
  File file = createFileWithData("file", data);
  Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath());

  // Split close to the start of the file; the two sources jointly cover it all.
  TestFileBasedSource source1 = new TestFileBasedSource(metadata, 64, 0, 25, null);
  TestFileBasedSource source2 = new TestFileBasedSource(metadata, 64, 25, Long.MAX_VALUE, null);

  List<String> results = new ArrayList<>();
  results.addAll(readFromSource(source1, options));
  results.addAll(readFromSource(source2, options));

  assertThat(data, containsInAnyOrder(results.toArray()));
}
@Test public void testReadRangeFromFileWithSplitsFromMiddleOfHeader() throws IOException { PipelineOptions options = PipelineOptionsFactory.create(); String header = "<h>"; List<String> data = new ArrayList<>(); for (int i = 0; i < 10; i++) { data.add(header); data.addAll(createStringDataset(3, 9)); } String fileName = "file"; File file = createFileWithData(fileName, data); List<String> expectedResults = new ArrayList<>(); expectedResults.addAll(data.subList(10, data.size())); // Remove all occurrences of header from expected results. expectedResults.removeAll(Collections.singletonList(header)); Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath()); // Split starts after "<" of the header TestFileBasedSource source = new TestFileBasedSource(metadata, 64, 1, Long.MAX_VALUE, header); assertThat(expectedResults, containsInAnyOrder(readFromSource(source, options).toArray())); // Split starts after "<h" of the header source = new TestFileBasedSource(metadata, 64, 2, Long.MAX_VALUE, header); assertThat(expectedResults, containsInAnyOrder(readFromSource(source, options).toArray())); // Split starts after "<h>" of the header source = new TestFileBasedSource(metadata, 64, 3, Long.MAX_VALUE, header); assertThat(expectedResults, containsInAnyOrder(readFromSource(source, options).toArray())); }
@Test public void testReadSchemaString() throws Exception { List<Bird> expected = createRandomRecords(DEFAULT_RECORD_COUNT); String codec = DataFileConstants.NULL_CODEC; String filename = generateTestFile( codec, expected, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), codec); Metadata fileMeta = FileSystems.matchSingleFileSpec(filename); AvroMetadata metadata = AvroSource.readMetadataFromFile(fileMeta.resourceId()); // By default, parse validates the schema, which is what we want. Schema schema = new Schema.Parser().parse(metadata.getSchemaString()); assertEquals(4, schema.getFields().size()); }
@Test public void testReadFileWithSplitsWithEmptyRange() throws IOException { PipelineOptions options = PipelineOptionsFactory.create(); String header = "<h>"; List<String> data = new ArrayList<>(); for (int i = 0; i < 5; i++) { data.add(header); data.addAll(createStringDataset(3, 9)); } String fileName = "file"; File file = createFileWithData(fileName, data); Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath()); TestFileBasedSource source1 = new TestFileBasedSource(metadata, 64, 0, 42, header); TestFileBasedSource source2 = new TestFileBasedSource(metadata, 64, 42, 62, header); TestFileBasedSource source3 = new TestFileBasedSource(metadata, 64, 62, Long.MAX_VALUE, header); List<String> expectedResults = new ArrayList<>(); expectedResults.addAll(data); // Remove all occurrences of header from expected results. expectedResults.removeAll(Collections.singletonList(header)); List<String> results = new ArrayList<>(); results.addAll(readFromSource(source1, options)); results.addAll(readFromSource(source2, options)); results.addAll(readFromSource(source3, options)); assertThat(expectedResults, containsInAnyOrder(results.toArray())); }
@Test
public void testReadRangeAtEnd() throws IOException {
  PipelineOptions options = PipelineOptionsFactory.create();
  List<String> data = createStringDataset(3, 50);
  File file = createFileWithData("file", data);
  Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath());

  // Split near the end of the file; note the second source uses a larger buffer size.
  TestFileBasedSource source1 = new TestFileBasedSource(metadata, 64, 0, 162, null);
  TestFileBasedSource source2 = new TestFileBasedSource(metadata, 1024, 162, Long.MAX_VALUE, null);

  List<String> results = new ArrayList<>();
  results.addAll(readFromSource(source1, options));
  results.addAll(readFromSource(source2, options));

  assertThat(data, containsInAnyOrder(results.toArray()));
}
@Test
public void testReadRangeAtMiddle() throws IOException {
  PipelineOptions options = PipelineOptionsFactory.create();
  List<String> data = createStringDataset(3, 50);
  File file = createFileWithData("file", data);
  Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath());

  // Two split points in the middle of the file; three sources cover it completely.
  TestFileBasedSource source1 = new TestFileBasedSource(metadata, 64, 0, 52, null);
  TestFileBasedSource source2 = new TestFileBasedSource(metadata, 64, 52, 72, null);
  TestFileBasedSource source3 = new TestFileBasedSource(metadata, 64, 72, Long.MAX_VALUE, null);

  List<String> results = new ArrayList<>();
  results.addAll(readFromSource(source1, options));
  results.addAll(readFromSource(source2, options));
  results.addAll(readFromSource(source3, options));

  assertThat(data, containsInAnyOrder(results.toArray()));
}
@Test public void testSplitAtFraction() throws Exception { PipelineOptions options = PipelineOptionsFactory.create(); File file = createFileWithData("file", createStringDataset(3, 100)); Metadata metadata = FileSystems.matchSingleFileSpec(file.getPath()); TestFileBasedSource source = new TestFileBasedSource(metadata, 1, 0, file.length(), null); // Shouldn't be able to split while unstarted. assertSplitAtFractionFails(source, 0, 0.7, options); assertSplitAtFractionSucceedsAndConsistent(source, 1, 0.7, options); assertSplitAtFractionSucceedsAndConsistent(source, 30, 0.7, options); assertSplitAtFractionFails(source, 0, 0.0, options); assertSplitAtFractionFails(source, 70, 0.3, options); assertSplitAtFractionFails(source, 100, 1.0, options); assertSplitAtFractionFails(source, 100, 0.99, options); assertSplitAtFractionSucceedsAndConsistent(source, 100, 0.995, options); }
@Test public void testReadMetadataWithCodecs() throws Exception { // Test reading files generated using all codecs. String[] codecs = { DataFileConstants.NULL_CODEC, DataFileConstants.BZIP2_CODEC, DataFileConstants.DEFLATE_CODEC, DataFileConstants.SNAPPY_CODEC, DataFileConstants.XZ_CODEC }; List<Bird> expected = createRandomRecords(DEFAULT_RECORD_COUNT); for (String codec : codecs) { String filename = generateTestFile( codec, expected, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), codec); Metadata fileMeta = FileSystems.matchSingleFileSpec(filename); AvroMetadata metadata = AvroSource.readMetadataFromFile(fileMeta.resourceId()); assertEquals(codec, metadata.getCodec()); } }
@Test public void testSchemaStringIsInterned() throws Exception { List<Bird> birds = createRandomRecords(100); String filename = generateTestFile( "tmp.avro", birds, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), DataFileConstants.NULL_CODEC); Metadata fileMetadata = FileSystems.matchSingleFileSpec(filename); String schema = AvroSource.readMetadataFromFile(fileMetadata.resourceId()).getSchemaString(); // Add "" to the schema to make sure it is not interned. AvroSource<GenericRecord> sourceA = AvroSource.from(filename).withSchema("" + schema); AvroSource<GenericRecord> sourceB = AvroSource.from(filename).withSchema("" + schema); assertSame(sourceA.getReaderSchemaString(), sourceB.getReaderSchemaString()); // Ensure that deserialization still goes through interning AvroSource<GenericRecord> sourceC = SerializableUtils.clone(sourceB); assertSame(sourceA.getReaderSchemaString(), sourceC.getReaderSchemaString()); }