/**
 * Builds the input splits that are handed to the downstream tasks of the
 * {@link ContinuousFileReaderOperator}, grouped by the modification time of the file
 * each split belongs to. The returned map is a {@link TreeMap}, so iterating it yields
 * the groups in ascending modification-time order. Splits whose path is not a key of
 * {@code eligibleFiles} are dropped.
 *
 * @param eligibleFiles The files to process, keyed by path.
 * @return the splits of the eligible files, keyed by file modification time; empty if
 *         {@code eligibleFiles} is empty.
 * @throws IOException propagated from the input format while creating the splits.
 */
private Map<Long, List<TimestampedFileInputSplit>> getInputSplitsSortedByModTime(
		Map<Path, FileStatus> eligibleFiles) throws IOException {

	final Map<Long, List<TimestampedFileInputSplit>> grouped = new TreeMap<>();
	if (eligibleFiles.isEmpty()) {
		// nothing to process, so do not even ask the format for splits
		return grouped;
	}

	for (FileInputSplit candidate : format.createInputSplits(readerParallelism)) {
		final FileStatus status = eligibleFiles.get(candidate.getPath());
		if (status == null) {
			// split belongs to a file that is not eligible
			continue;
		}

		final long modificationTime = status.getModificationTime();
		List<TimestampedFileInputSplit> bucket = grouped.get(modificationTime);
		if (bucket == null) {
			bucket = new ArrayList<>();
			grouped.put(modificationTime, bucket);
		}

		final TimestampedFileInputSplit stamped = new TimestampedFileInputSplit(
			modificationTime,
			candidate.getSplitNumber(),
			candidate.getPath(),
			candidate.getStart(),
			candidate.getLength(),
			candidate.getHostnames());
		bucket.add(stamped);
	}
	return grouped;
}
// Write the test content to a temp file; the returned split is used below as the
// source of the path, total length and host names.
final FileInputSplit split = createTempFile(myString);
// First sub-split: split number 0, third argument 0, fourth argument half of the total length
// (presumably start offset and length -- confirm against the FileInputSplit constructor).
FileInputSplit split1 = new FileInputSplit(0, split.getPath(), 0, split.getLength() / 2, split.getHostnames());
// Second sub-split: split number 1, third argument is split1's length.
// NOTE(review): the fourth argument is the FULL length of the original split, not the
// remaining half -- presumably intentional for this test; confirm.
FileInputSplit split2 = new FileInputSplit(1, split.getPath(), split1.getLength(), split.getLength(), split.getHostnames());
// Buffer size is set to twice the length of the first sub-split.
format.setBufferSize(2 * ((int) split1.getLength()));
format.configure(parameters);
// Cache the start and length of the split being opened.
this.splitStart = fileSplit.getStart();
this.splitLength = fileSplit.getLength();
// The bracketed pair in both messages below is [start,length], not [start,end].
LOG.debug("Opening input split " + fileSplit.getPath() + " [" + this.splitStart + "," + this.splitLength + "]");
// Wraps the failure in an IOException that names the split and keeps the original as cause.
// NOTE(review): `t` is not declared in the visible excerpt -- presumably a Throwable caught
// by an enclosing catch clause that is not shown here; confirm.
throw new IOException("Error opening the Input Split " + fileSplit.getPath() + " [" + splitStart + "," + splitLength + "]: " + t.getMessage(), t);
Preconditions.checkNotNull(split, "reopen() cannot be called on a null split."); Preconditions.checkNotNull(state, "reopen() cannot be called with a null initial state."); Preconditions.checkArgument(state == -1 || state >= split.getStart(), " Illegal offset "+ state +", smaller than the splits start=" + split.getStart()); if (state > this.splitStart + split.getLength()) { this.end = true; } else if (state > split.getStart()) { initBuffers(); if (split.getLength() == -1) { this.splitLength = this.splitStart + split.getLength() - this.offset; if (splitLength <= 0) { this.end = true;
/**
 * Reports how many invalid lines (at WARN level) and comment lines (at INFO level) were
 * skipped for the current split, if any, and then delegates to the superclass.
 *
 * @throws IOException propagated from {@code super.close()}.
 */
@Override
public void close() throws IOException {
	// Each message is only built when there is something to report and the level is enabled.
	final boolean reportInvalidLines = this.invalidLineCount > 0 && LOG.isWarnEnabled();
	if (reportInvalidLines) {
		LOG.warn("In file \"" + currentSplit.getPath() + "\" (split start: " + this.splitStart + ") "
			+ this.invalidLineCount + " invalid line(s) were skipped.");
	}

	final boolean reportComments = this.commentCount > 0 && LOG.isInfoEnabled();
	if (reportComments) {
		LOG.info("In file \"" + currentSplit.getPath() + "\" (split start: " + this.splitStart + ") "
			+ this.commentCount + " comment line(s) were skipped.");
	}

	super.close();
}
/**
 * Creates exactly {@code minNumSplits} splits over the first configured file path.
 * Split {@code i} gets split number {@code i}, start {@code i * linesPerSplit + 1},
 * length {@code linesPerSplit} and no host names.
 *
 * @param minNumSplits the number of splits to create.
 * @return the created splits, in split-number order.
 * @throws IOException declared by the overridden method.
 */
@Override
public FileInputSplit[] createInputSplits(int minNumSplits) throws IOException {
	final FileInputSplit[] result = new FileInputSplit[minNumSplits];
	int index = 0;
	while (index < result.length) {
		result[index] = new FileInputSplit(
			index,
			getFilePaths()[0],
			index * linesPerSplit + 1,
			linesPerSplit,
			null);
		index++;
	}
	return result;
}
/**
 * Opens the split through the superclass, creates the data file reader for it, syncs the
 * reader to the split's start position and remembers the reader's previous sync position
 * in {@code lastSync}.
 *
 * @param split the split to open.
 * @throws IOException if opening the split or initializing/syncing the reader fails.
 */
@Override
public void open(FileInputSplit split) throws IOException {
	super.open(split);

	this.dataFileReader = initReader(split);

	final long splitStart = split.getStart();
	this.dataFileReader.sync(splitStart);
	this.lastSync = this.dataFileReader.previousSync();
}
@Test public void testCreateInputSplitsWithOneFile() throws IOException { // create temporary file with 3 blocks final File tempFile = File.createTempFile("binary_input_format_test", "tmp"); tempFile.deleteOnExit(); final int blockInfoSize = new BlockInfo().getInfoSize(); final int blockSize = blockInfoSize + 8; final int numBlocks = 3; FileOutputStream fileOutputStream = new FileOutputStream(tempFile); for(int i = 0; i < blockSize * numBlocks; i++) { fileOutputStream.write(new byte[]{1}); } fileOutputStream.close(); final Configuration config = new Configuration(); config.setLong("input.block_size", blockSize + 10); final BinaryInputFormat<Record> inputFormat = new MyBinaryInputFormat(); inputFormat.setFilePath(tempFile.toURI().toString()); inputFormat.setBlockSize(blockSize); inputFormat.configure(config); FileInputSplit[] inputSplits = inputFormat.createInputSplits(numBlocks); Assert.assertEquals("Returns requested numbers of splits.", numBlocks, inputSplits.length); Assert.assertEquals("1. split has block size length.", blockSize, inputSplits[0].getLength()); Assert.assertEquals("2. split has block size length.", blockSize, inputSplits[1].getLength()); Assert.assertEquals("3. split has block size length.", blockSize, inputSplits[2].getLength()); }
/**
 * Renders this split as {@code [<split number>] <file>:<start>+<length>}.
 */
@Override
public String toString() {
	final String numberPart = "[" + getSplitNumber() + "] ";
	final String rangePart = start + "+" + length;
	return numberPart + file + ":" + rangePart;
}
}
/**
 * Creates the data file reader used to read the given split.
 *
 * <p>The datum reader is chosen from {@code avroValueType}: a generic reader when the type
 * is exactly {@code GenericRecord}, a specific reader when it is assignable to
 * {@code SpecificRecordBase}, and a reflect reader otherwise. As side effects,
 * {@code end} is set to the split's start plus its length and
 * {@code recordsReadSinceLastSync} is reset to zero.
 *
 * @param split the split about to be read.
 * @return the reader opened on the current {@code stream}.
 * @throws IOException if the file status cannot be obtained or the reader cannot be opened.
 */
private DataFileReader<E> initReader(FileInputSplit split) throws IOException {
	final DatumReader<E> recordReader;
	if (org.apache.avro.generic.GenericRecord.class == avroValueType) {
		recordReader = new GenericDatumReader<E>();
	} else if (org.apache.avro.specific.SpecificRecordBase.class.isAssignableFrom(avroValueType)) {
		recordReader = new SpecificDatumReader<E>(avroValueType);
	} else {
		recordReader = new ReflectDatumReader<E>(avroValueType);
	}

	if (LOG.isInfoEnabled()) {
		LOG.info("Opening split {}", split);
	}

	// the seekable wrapper is given the length of the whole file, not of the split
	final long fileLength = split.getPath().getFileSystem().getFileStatus(split.getPath()).getLen();
	final SeekableInput seekable = new FSDataInputStreamWrapper(stream, fileLength);
	final DataFileReader<E> reader = (DataFileReader<E>) DataFileReader.openReader(seekable, recordReader);

	if (LOG.isDebugEnabled()) {
		LOG.debug("Loaded SCHEMA: {}", reader.getSchema());
	}

	end = split.getStart() + split.getLength();
	recordsReadSinceLastSync = 0;
	return reader;
}
private Tuple2<Long, Long> getOffsetAndLengthForSplit(FileInputSplit split, List<StripeInformation> stripes) { long splitStart = split.getStart(); long splitEnd = splitStart + split.getLength(); long readStart = Long.MAX_VALUE; long readEnd = Long.MIN_VALUE; for (StripeInformation s : stripes) { if (splitStart <= s.getOffset() && s.getOffset() < splitEnd) { // stripe starts in split, so it is included readStart = Math.min(readStart, s.getOffset()); readEnd = Math.max(readEnd, s.getOffset() + s.getLength()); } } if (readStart < Long.MAX_VALUE) { // at least one split is included return Tuple2.of(readStart, readEnd - readStart); } else { return Tuple2.of(0L, 0L); } }
@Override public void run() { try { final FileSystem fs = FileSystem.get(this.split.getPath().toUri()); this.fdis = fs.open(this.split.getPath()); // check for canceling and close the stream in that case, because no one will obtain it if (this.aborted) { final FSDataInputStream f = this.fdis; this.fdis = null; f.close(); } } catch (Throwable t) { this.error = t; } }
/**
 * Writes {@code content} deflate-compressed into a fresh temporary file (deleted on JVM
 * exit) and returns a single split covering the whole compressed file.
 *
 * @param content the text to write; written with {@link DataOutputStream#writeBytes(String)},
 *        i.e. only the low-order byte of each character is kept.
 * @return a split with number 0, start 0, the compressed file's length, and host "localhost".
 * @throws IOException if the temporary file cannot be created or written.
 */
private FileInputSplit createTempDeflateFile(String content) throws IOException {
	File tempFile = File.createTempFile("test_contents", "tmp.deflate");
	tempFile.deleteOnExit();

	// try-with-resources closes the file even when a write fails; closing also finishes
	// the deflater, which must happen before the file length is read below
	try (FileOutputStream fos = new FileOutputStream(tempFile);
			DataOutputStream dos = new DataOutputStream(new DeflaterOutputStream(fos))) {
		dos.writeBytes(content);
	}

	return new FileInputSplit(0, new Path(tempFile.toURI().toString()), 0, tempFile.length(), new String[] {"localhost"});
}
/**
 * Opens the split through the superclass, creates the data file reader for it, syncs the
 * reader to the split's start position and remembers the reader's previous sync position
 * in {@code lastSync}.
 *
 * @param split the split to open.
 * @throws IOException if opening the split or initializing/syncing the reader fails.
 */
@Override
public void open(FileInputSplit split) throws IOException {
	super.open(split);

	this.dataFileReader = initReader(split);

	final long splitStart = split.getStart();
	this.dataFileReader.sync(splitStart);
	this.lastSync = this.dataFileReader.previousSync();
}
/**
 * Reads a file of four 7-character lines with the buffer size set to exactly the split
 * length and checks that all four records come back, each 7 characters long, and that
 * the format reports the end afterwards.
 */
@Test
public void testReadExactlyBufferSize() throws IOException {
	final String content = "aaaaaaa\nbbbbbbb\nccccccc\nddddddd\n";
	final FileInputSplit inputSplit = createTempFile(content);

	final Configuration config = new Configuration();
	format.setBufferSize((int) inputSplit.getLength());
	format.configure(config);
	format.open(inputSplit);

	int recordCount = 0;
	for (String record = format.nextRecord(null); record != null; record = format.nextRecord(null)) {
		assertEquals(7, record.length());
		recordCount++;
	}

	// once exhausted, the format must keep returning null and report the end
	assertNull(format.nextRecord(null));
	assertTrue(format.reachedEnd());

	format.close();

	assertEquals(4, recordCount);
}
/**
 * Combines the split number with the hash of the file (0 when the file is {@code null})
 * using XOR.
 */
@Override
public int hashCode() {
	final int numberPart = getSplitNumber();
	int filePart = 0;
	if (file != null) {
		filePart = file.hashCode();
	}
	return numberPart ^ filePart;
}
/**
 * Copies the given split into a {@link TimestampedFileInputSplit} carrying the supplied
 * modification time.
 *
 * @param modTime the modification time to attach.
 * @param split the split to copy; must not be {@code null}.
 * @return a timestamped split with the same number, path, start, length and host names.
 */
private TimestampedFileInputSplit getTimestampedSplit(long modTime, FileInputSplit split) {
	final FileInputSplit source = Preconditions.checkNotNull(split);

	final int number = source.getSplitNumber();
	final long start = source.getStart();
	final long length = source.getLength();

	return new TimestampedFileInputSplit(
		modTime, number, source.getPath(), start, length, source.getHostnames());
}
/**
 * Verifies the splits created by the input format: after sorting, the splits are walked
 * file by file, and for every file the number of splits, the start offset of each split
 * and the split lengths (full blocks, then the shorter last block) are checked.
 */
@Test
public void checkInputSplits() throws IOException {
	final FileInputSplit[] sortedSplits = this.createInputFormat().createInputSplits(0);
	Arrays.sort(sortedSplits, new InputSplitSorter());

	int cursor = 0;
	for (int fileIndex = 0; fileIndex < this.parallelism; fileIndex++) {
		// collect the consecutive run of splits that share the path at the cursor
		final Path currentPath = sortedSplits[cursor].getPath();
		final List<FileInputSplit> splitsOfFile = new ArrayList<FileInputSplit>();
		while (cursor < sortedSplits.length && sortedSplits[cursor].getPath().equals(currentPath)) {
			splitsOfFile.add(sortedSplits[cursor]);
			cursor++;
		}

		final int splitCount = splitsOfFile.size();
		Assert.assertEquals(this.getExpectedBlockCount(fileIndex), splitCount);

		final long lastBlockLength =
			this.rawDataSizes[fileIndex] % (this.blockSize - getInfoSize()) + getInfoSize();

		for (int position = 0; position < splitCount; position++) {
			final FileInputSplit current = splitsOfFile.get(position);
			Assert.assertEquals(this.blockSize * position, current.getStart());
			// every split except the last one must span a full block
			if (position < splitCount - 1) {
				Assert.assertEquals(this.blockSize, current.getLength());
			}
		}

		Assert.assertEquals(lastBlockLength, splitsOfFile.get(splitCount - 1).getLength());
	}
}