/**
 * Reports how many invalid and comment lines were skipped while reading the current
 * split, then delegates to the superclass to close the format.
 *
 * <p>The invalid-line summary is logged at WARN level and the comment-line summary at
 * INFO level; each message is only built when its counter is positive and the
 * corresponding log level is enabled.
 *
 * @throws IOException if thrown by {@code super.close()}
 */
@Override
public void close() throws IOException {
    if (this.invalidLineCount > 0 && LOG.isWarnEnabled()) {
        final String invalidLinesMessage = "In file \"" + currentSplit.getPath()
                + "\" (split start: " + this.splitStart + ") "
                + this.invalidLineCount + " invalid line(s) were skipped.";
        LOG.warn(invalidLinesMessage);
    }

    if (this.commentCount > 0 && LOG.isInfoEnabled()) {
        final String commentLinesMessage = "In file \"" + currentSplit.getPath()
                + "\" (split start: " + this.splitStart + ") "
                + this.commentCount + " comment line(s) were skipped.";
        LOG.info(commentLinesMessage);
    }

    super.close();
}
@Override public void run() { try { final FileSystem fs = FileSystem.get(this.split.getPath().toUri()); this.fdis = fs.open(this.split.getPath()); // check for canceling and close the stream in that case, because no one will obtain it if (this.aborted) { final FSDataInputStream f = this.fdis; this.fdis = null; f.close(); } } catch (Throwable t) { this.error = t; } }
/**
 * Builds the input splits that are handed to the downstream tasks of the
 * {@link ContinuousFileReaderOperator}, grouped by the modification time of the
 * file each split belongs to.
 *
 * <p>The result is a {@link TreeMap}, so iterating over it yields the groups in
 * ascending order of modification time. Splits whose path is not a key of
 * {@code eligibleFiles} are dropped.
 *
 * @param eligibleFiles The files to process, keyed by path.
 * @return The splits of the eligible files, keyed by modification time; empty if
 *         {@code eligibleFiles} is empty.
 * @throws IOException if creating the input splits fails
 */
private Map<Long, List<TimestampedFileInputSplit>> getInputSplitsSortedByModTime(
        Map<Path, FileStatus> eligibleFiles) throws IOException {

    final Map<Long, List<TimestampedFileInputSplit>> byModTime = new TreeMap<>();
    if (eligibleFiles.isEmpty()) {
        return byModTime;
    }

    for (FileInputSplit candidate : format.createInputSplits(readerParallelism)) {
        final FileStatus status = eligibleFiles.get(candidate.getPath());
        if (status == null) {
            // split belongs to a file that is not eligible
            continue;
        }

        final long modTime = status.getModificationTime();

        List<TimestampedFileInputSplit> bucket = byModTime.get(modTime);
        if (bucket == null) {
            bucket = new ArrayList<>();
            byModTime.put(modTime, bucket);
        }

        bucket.add(new TimestampedFileInputSplit(
                modTime,
                candidate.getSplitNumber(),
                candidate.getPath(),
                candidate.getStart(),
                candidate.getLength(),
                candidate.getHostnames()));
    }

    return byModTime;
}
/**
 * Creates the Avro file reader for the given split and resets the per-split read state.
 *
 * <p>The datum reader is chosen from {@code avroValueType}: a generic reader for
 * {@code GenericRecord}, a specific reader for subclasses of {@code SpecificRecordBase},
 * and a reflect reader for everything else. As a side effect, {@code end} is set to the
 * first position after the split and {@code recordsReadSinceLastSync} is reset to zero.
 *
 * @param split the split to be read
 * @return the reader opened on the already opened {@code stream}
 * @throws IOException if the file status cannot be obtained or the reader cannot be opened
 */
private DataFileReader<E> initReader(FileInputSplit split) throws IOException {
    final DatumReader<E> datumReader;
    if (org.apache.avro.generic.GenericRecord.class == avroValueType) {
        datumReader = new GenericDatumReader<E>();
    } else if (org.apache.avro.specific.SpecificRecordBase.class.isAssignableFrom(avroValueType)) {
        datumReader = new SpecificDatumReader<E>(avroValueType);
    } else {
        datumReader = new ReflectDatumReader<E>(avroValueType);
    }

    if (LOG.isInfoEnabled()) {
        LOG.info("Opening split {}", split);
    }

    // the wrapper needs the total length of the file, not just the length of the split
    final long fileLength =
            split.getPath().getFileSystem().getFileStatus(split.getPath()).getLen();
    final SeekableInput in = new FSDataInputStreamWrapper(stream, fileLength);

    final DataFileReader<E> dataFileReader =
            (DataFileReader<E>) DataFileReader.openReader(in, datumReader);

    if (LOG.isDebugEnabled()) {
        LOG.debug("Loaded SCHEMA: {}", dataFileReader.getSchema());
    }

    end = split.getStart() + split.getLength();
    recordsReadSinceLastSync = 0;

    return dataFileReader;
}
/** * This method allows to wrap/decorate the raw {@link FSDataInputStream} for a certain file split, e.g., for decoding. * When overriding this method, also consider adapting {@link FileInputFormat#testForUnsplittable} if your * stream decoration renders the input file unsplittable. Also consider calling existing superclass implementations. * * @param inputStream is the input stream to decorated * @param fileSplit is the file split for which the input stream shall be decorated * @return the decorated input stream * @throws Throwable if the decoration fails * @see org.apache.flink.api.common.io.InputStreamFSInputWrapper */ protected FSDataInputStream decorateInputStream(FSDataInputStream inputStream, FileInputSplit fileSplit) throws Throwable { // Wrap stream in a extracting (decompressing) stream if file ends with a known compression file extension. InflaterInputStreamFactory<?> inflaterInputStreamFactory = getInflaterInputStreamFactory(fileSplit.getPath()); if (inflaterInputStreamFactory != null) { return new InputStreamFSInputWrapper(inflaterInputStreamFactory.create(stream)); } return inputStream; }
+ "ParserError " + parser.getErrorState() + " \n" + "Expect field types: "+fieldTypesToString() + " \n" + "in file: " + currentSplit.getPath()); throw new ParseException("Line could not be parsed: '" + lineAsString+"'\n" + "Expect field types: "+fieldTypesToString()+" \n" + "in file: " + currentSplit.getPath()); } else { return false;
/**
 * Checks that the input format creates the expected splits: after sorting, the splits of
 * each file must match the expected block count, start at multiples of the block size and,
 * except for the last one, be exactly one block long. The last split of each file must have
 * the expected remainder length.
 */
@Test
public void checkInputSplits() throws IOException {
    final FileInputSplit[] allSplits = this.createInputFormat().createInputSplits(0);
    Arrays.sort(allSplits, new InputSplitSorter());

    int cursor = 0;
    for (int fileIndex = 0; fileIndex < this.parallelism; fileIndex++) {
        // gather the consecutive run of splits that share the path at the cursor
        final List<FileInputSplit> fileSplits = new ArrayList<FileInputSplit>();
        final Path currentPath = allSplits[cursor].getPath();
        while (cursor < allSplits.length && allSplits[cursor].getPath().equals(currentPath)) {
            fileSplits.add(allSplits[cursor]);
            cursor++;
        }

        Assert.assertEquals(this.getExpectedBlockCount(fileIndex), fileSplits.size());

        long lastBlockLength =
                this.rawDataSizes[fileIndex] % (this.blockSize - getInfoSize()) + getInfoSize();

        final int lastPosition = fileSplits.size() - 1;
        for (int position = 0; position <= lastPosition; position++) {
            final FileInputSplit current = fileSplits.get(position);
            Assert.assertEquals(this.blockSize * position, current.getStart());
            if (position != lastPosition) {
                // every split but the last one covers a full block
                Assert.assertEquals(this.blockSize, current.getLength());
            }
        }

        Assert.assertEquals(lastBlockLength, fileSplits.get(lastPosition).getLength());
    }
}
paths.add(split.getPath().toString());
@Test public void testExcludeFiles() { try { final String contents = "CONTENTS"; // create some accepted, some ignored files File child1 = temporaryFolder.newFile("dataFile1.txt"); File child2 = temporaryFolder.newFile("another_file.bin"); File[] files = { child1, child2 }; createTempFiles(contents.getBytes(ConfigConstants.DEFAULT_CHARSET), files); // test that only the valid files are accepted Configuration configuration = new Configuration(); final DummyFileInputFormat format = new DummyFileInputFormat(); format.setFilePath(temporaryFolder.getRoot().toURI().toString()); format.configure(configuration); format.setFilesFilter(new GlobFilePathFilter( Collections.singletonList("**"), Collections.singletonList("**/another_file.bin"))); FileInputSplit[] splits = format.createInputSplits(1); Assert.assertEquals(1, splits.length); final URI uri1 = splits[0].getPath().toUri(); final URI childUri1 = child1.toURI(); Assert.assertEquals(uri1, childUri1); } catch (Exception e) { System.err.println(e.getMessage()); e.printStackTrace(); Assert.fail(e.getMessage()); } }
@Override public void open(FileInputSplit fileSplit) throws IOException { LOG.debug("Opening ORC file {}", fileSplit.getPath()); org.apache.hadoop.fs.Path hPath = new org.apache.hadoop.fs.Path(fileSplit.getPath().getPath()); Reader orcReader = OrcFile.createReader(hPath, OrcFile.readerOptions(conf));
for (int i = 0; i < inputSplits.length; i++) { Assert.assertEquals(String.format("%d. split has block size length.", i), blockSize, inputSplits[i].getLength()); if (inputSplits[i].getPath().toString().equals(pathFile1)) { numSplitsFile1++; } else if (inputSplits[i].getPath().toString().equals(pathFile2)) { numSplitsFile2++; } else {
/**
 * Checks that requesting two splits for a single small file yields exactly two splits,
 * both of which point at that file.
 */
@Test
public void testCreateInputSplitSingleFile() throws IOException {
    final String tempFile = TestFileUtils.createTempFile("Hello World");

    final DummyFileInputFormat format = new DummyFileInputFormat();
    format.setFilePath(tempFile);
    format.configure(new Configuration());

    final FileInputSplit[] splits = format.createInputSplits(2);

    Assert.assertEquals(2, splits.length);
    for (FileInputSplit split : splits) {
        Assert.assertEquals(tempFile, split.getPath().toString());
    }
}
/**
 * Copies the given split into a {@code TimestampedFileInputSplit} that additionally carries
 * the supplied modification time.
 *
 * @param modTime the modification time to attach to the split
 * @param split the split to copy; must not be {@code null}
 * @return a timestamped split with the same number, path, start, length and hostnames
 */
private TimestampedFileInputSplit getTimestampedSplit(long modTime, FileInputSplit split) {
    final FileInputSplit source = Preconditions.checkNotNull(split);

    return new TimestampedFileInputSplit(
            modTime,
            source.getSplitNumber(),
            source.getPath(),
            source.getStart(),
            source.getLength(),
            source.getHostnames());
}
for (FileInputSplit fis : splits) { Assert.assertEquals(0, fis.getStart()); if (fis.getPath().toString().equals(tempFile1)) { numSplitsFile1++; Assert.assertEquals(21, fis.getLength()); } else if (fis.getPath().toString().equals(tempFile2)) { numSplitsFile2++; Assert.assertEquals(22, fis.getLength()); } else if (fis.getPath().toString().equals(tempFile3)) { numSplitsFile3++; Assert.assertEquals(23, fis.getLength());
Assert.assertEquals(5, splitsMixed.length); for(FileInputSplit split : splitsMixed) { if(split.getPath().getName().endsWith(".deflate")) {
/**
 * Tests that records are read correctly when a split boundary falls in the middle of a
 * record: the record that straddles the boundary must be returned by the first split, and
 * the second split must only return the record that follows it.
 */
@Test
public void testReadOverSplitBoundariesUnaligned() throws IOException {
    final String myString = "value1\nvalue2\nvalue3";

    final FileInputSplit wholeFile = createTempFile(myString);

    // first split: the first half of the file, which ends inside the second record
    final FileInputSplit firstPart = new FileInputSplit(
            0, wholeFile.getPath(), 0, wholeFile.getLength() / 2, wholeFile.getHostnames());

    // second split: starts where the first one ends
    // NOTE(review): the length passed here is the length of the whole file, so
    // start + length extends past the end of the file -- confirm this is intentional.
    final FileInputSplit secondPart = new FileInputSplit(
            1, wholeFile.getPath(), firstPart.getLength(), wholeFile.getLength(), wholeFile.getHostnames());

    format.configure(new Configuration());

    format.open(firstPart);
    assertEquals("value1", format.nextRecord(null));
    assertEquals("value2", format.nextRecord(null));
    assertNull(format.nextRecord(null));
    assertTrue(format.reachedEnd());
    format.close();

    format.open(secondPart);
    assertEquals("value3", format.nextRecord(null));
    assertNull(format.nextRecord(null));
    assertTrue(format.reachedEnd());
    format.close();
}