/**
 * Builds the input splits that are handed to the downstream tasks of the
 * {@link ContinuousFileReaderOperator}, grouped by the modification time of the file
 * each split belongs to. The returned map iterates in <b>ascending modification time</b>
 * order, and splits whose file is not contained in {@code eligibleFiles} are dropped.
 *
 * @param eligibleFiles The files to process, keyed by their path.
 * @return The splits of the eligible files, keyed by modification time; empty if
 *         {@code eligibleFiles} is empty.
 */
private Map<Long, List<TimestampedFileInputSplit>> getInputSplitsSortedByModTime(
        Map<Path, FileStatus> eligibleFiles) throws IOException {

    // a TreeMap keeps the modification times in their natural (ascending) order
    final Map<Long, List<TimestampedFileInputSplit>> result = new TreeMap<>();
    if (eligibleFiles.isEmpty()) {
        return result;
    }

    for (FileInputSplit candidate : format.createInputSplits(readerParallelism)) {
        final FileStatus status = eligibleFiles.get(candidate.getPath());
        if (status == null) {
            // the split belongs to a file that is not eligible
            continue;
        }

        final Long fileModTime = status.getModificationTime();
        List<TimestampedFileInputSplit> bucket = result.get(fileModTime);
        if (bucket == null) {
            bucket = new ArrayList<>();
            result.put(fileModTime, bucket);
        }

        bucket.add(new TimestampedFileInputSplit(
                fileModTime,
                candidate.getSplitNumber(),
                candidate.getPath(),
                candidate.getStart(),
                candidate.getLength(),
                candidate.getHostnames()));
    }

    return result;
}
/**
 * Creates the Avro {@link DataFileReader} used to read the file of the given split.
 *
 * <p>The datum reader is chosen from {@code avroValueType}: a generic reader for
 * {@code GenericRecord}, a specific reader for subclasses of {@code SpecificRecordBase},
 * and a reflection-based reader for everything else.
 *
 * <p>As a side effect, {@code end} is set to the first byte position after the split and
 * {@code recordsReadSinceLastSync} is reset to zero.
 *
 * @param split The split whose file is opened.
 * @return The reader opened on top of the already available {@code stream}.
 * @throws IOException If the file status cannot be fetched or the reader cannot be opened.
 */
private DataFileReader<E> initReader(FileInputSplit split) throws IOException {
    final DatumReader<E> datumReader;
    if (org.apache.avro.generic.GenericRecord.class == avroValueType) {
        datumReader = new GenericDatumReader<E>();
    } else if (org.apache.avro.specific.SpecificRecordBase.class.isAssignableFrom(avroValueType)) {
        datumReader = new SpecificDatumReader<E>(avroValueType);
    } else {
        datumReader = new ReflectDatumReader<E>(avroValueType);
    }

    // parameterized logging already defers formatting, so no level guard is needed here
    LOG.info("Opening split {}", split);

    // the wrapper needs the length of the whole file, not only of this split
    final long fileLength = split.getPath().getFileSystem().getFileStatus(split.getPath()).getLen();
    final SeekableInput in = new FSDataInputStreamWrapper(stream, fileLength);

    // openReader is declared to return the FileReader interface; narrow it with a
    // parameterized cast instead of a raw type to avoid an unchecked conversion
    final DataFileReader<E> dataFileReader =
            (DataFileReader<E>) DataFileReader.openReader(in, datumReader);

    if (LOG.isDebugEnabled()) {
        LOG.debug("Loaded SCHEMA: {}", dataFileReader.getSchema());
    }

    end = split.getStart() + split.getLength();
    recordsReadSinceLastSync = 0;
    return dataFileReader;
}
this.splitLength = fileSplit.getLength();
private Tuple2<Long, Long> getOffsetAndLengthForSplit(FileInputSplit split, List<StripeInformation> stripes) { long splitStart = split.getStart(); long splitEnd = splitStart + split.getLength(); long readStart = Long.MAX_VALUE; long readEnd = Long.MIN_VALUE; for (StripeInformation s : stripes) { if (splitStart <= s.getOffset() && s.getOffset() < splitEnd) { // stripe starts in split, so it is included readStart = Math.min(readStart, s.getOffset()); readEnd = Math.max(readEnd, s.getOffset() + s.getLength()); } } if (readStart < Long.MAX_VALUE) { // at least one split is included return Tuple2.of(readStart, readEnd - readStart); } else { return Tuple2.of(0L, 0L); } }
@Test public void testCreateInputSplitsWithOneFile() throws IOException { // create temporary file with 3 blocks final File tempFile = File.createTempFile("binary_input_format_test", "tmp"); tempFile.deleteOnExit(); final int blockInfoSize = new BlockInfo().getInfoSize(); final int blockSize = blockInfoSize + 8; final int numBlocks = 3; FileOutputStream fileOutputStream = new FileOutputStream(tempFile); for(int i = 0; i < blockSize * numBlocks; i++) { fileOutputStream.write(new byte[]{1}); } fileOutputStream.close(); final Configuration config = new Configuration(); config.setLong("input.block_size", blockSize + 10); final BinaryInputFormat<Record> inputFormat = new MyBinaryInputFormat(); inputFormat.setFilePath(tempFile.toURI().toString()); inputFormat.setBlockSize(blockSize); inputFormat.configure(config); FileInputSplit[] inputSplits = inputFormat.createInputSplits(numBlocks); Assert.assertEquals("Returns requested numbers of splits.", numBlocks, inputSplits.length); Assert.assertEquals("1. split has block size length.", blockSize, inputSplits[0].getLength()); Assert.assertEquals("2. split has block size length.", blockSize, inputSplits[1].getLength()); Assert.assertEquals("3. split has block size length.", blockSize, inputSplits[2].getLength()); }
/**
 * Verifies the splits created by the input format: after sorting, the splits of each file
 * must match the expected block count, start at multiples of the block size, have the block
 * size as length (all but the last one), and end with a last split of the remaining length.
 */
@Test
public void checkInputSplits() throws IOException {
    final FileInputSplit[] splits = this.createInputFormat().createInputSplits(0);
    Arrays.sort(splits, new InputSplitSorter());

    int cursor = 0;
    for (int fileIndex = 0; fileIndex < this.parallelism; fileIndex++) {
        // gather the consecutive run of splits that share the path found at the cursor
        final List<FileInputSplit> splitsOfFile = new ArrayList<FileInputSplit>();
        final Path currentPath = splits[cursor].getPath();
        while (cursor < splits.length && splits[cursor].getPath().equals(currentPath)) {
            splitsOfFile.add(splits[cursor]);
            cursor++;
        }

        Assert.assertEquals(this.getExpectedBlockCount(fileIndex), splitsOfFile.size());

        long lastBlockLength =
                this.rawDataSizes[fileIndex] % (this.blockSize - getInfoSize()) + getInfoSize();

        final int lastIndex = splitsOfFile.size() - 1;
        for (int index = 0; index <= lastIndex; index++) {
            final FileInputSplit current = splitsOfFile.get(index);
            Assert.assertEquals(this.blockSize * index, current.getStart());
            // only the last split of a file may deviate from the block size
            if (index < lastIndex) {
                Assert.assertEquals(this.blockSize, current.getLength());
            }
        }
        Assert.assertEquals(lastBlockLength, splitsOfFile.get(lastIndex).getLength());
    }
}
/**
 * Copies the given split into a {@link TimestampedFileInputSplit} that additionally
 * carries the given modification time.
 *
 * @param modTime The modification time to attach to the split.
 * @param split The split to copy; must not be {@code null}.
 * @return A timestamped split with the same split number, path, start, length and
 *         host names as {@code split}.
 */
private TimestampedFileInputSplit getTimestampedSplit(long modTime, FileInputSplit split) {
    Preconditions.checkNotNull(split);

    final TimestampedFileInputSplit timestamped = new TimestampedFileInputSplit(
            modTime,
            split.getSplitNumber(),
            split.getPath(),
            split.getStart(),
            split.getLength(),
            split.getHostnames());
    return timestamped;
}
Assert.assertEquals(String.format("%d. split has block size length.", i), blockSize, inputSplits[i].getLength()); if (inputSplits[i].getPath().toString().equals(pathFile1)) { numSplitsFile1++;
final FileInputSplit split = createTempFile(myString); FileInputSplit split1 = new FileInputSplit(0, split.getPath(), 0, split.getLength() / 2, split.getHostnames()); FileInputSplit split2 = new FileInputSplit(1, split.getPath(), split1.getLength(), split.getLength(), split.getHostnames()); format.setBufferSize(2 * ((int) split1.getLength())); format.configure(parameters);
if (fis.getPath().toString().equals(tempFile1)) { numSplitsFile1++; Assert.assertEquals(21, fis.getLength()); } else if (fis.getPath().toString().equals(tempFile2)) { numSplitsFile2++; Assert.assertEquals(22, fis.getLength()); } else if (fis.getPath().toString().equals(tempFile3)) { numSplitsFile3++; Assert.assertEquals(23, fis.getLength()); } else { Assert.fail("Got split for unknown file.");
Assert.assertEquals(4, splits.length); for(FileInputSplit split : splits) { Assert.assertEquals(-1L, split.getLength()); // unsplittable deflate files have this size as a flag for "read whole file" for(FileInputSplit split : splitsMixed) { if(split.getPath().getName().endsWith(".deflate")) { Assert.assertEquals(-1L, split.getLength()); // unsplittable deflate files have this size as a flag for "read whole file" Assert.assertTrue("split size not correct", split.getLength() > 0);
/**
 * Reads a file of four 7-character lines with a buffer that is exactly as large as the
 * split and checks that all four records are returned before the end is reached.
 */
@Test
public void testReadExactlyBufferSize() throws IOException {
    final String content = "aaaaaaa\nbbbbbbb\nccccccc\nddddddd\n";
    final FileInputSplit split = createTempFile(content);

    final Configuration config = new Configuration();

    // the buffer holds exactly the bytes of the split
    format.setBufferSize((int) split.getLength());
    format.configure(config);
    format.open(split);

    int recordCount = 0;
    for (String record = format.nextRecord(null); record != null; record = format.nextRecord(null)) {
        assertEquals(7, record.length());
        recordCount++;
    }

    assertNull(format.nextRecord(null));
    assertTrue(format.reachedEnd());

    format.close();

    assertEquals(4, recordCount);
}
/**
 * Checks that records are read correctly when the boundary between two splits falls
 * into the middle of a record.
 */
@Test
public void testReadOverSplitBoundariesUnaligned() throws IOException {
    final String content = "value1\nvalue2\nvalue3";
    final FileInputSplit wholeFile = createTempFile(content);

    // the first split covers the first half of the file
    final FileInputSplit firstPart = new FileInputSplit(
            0, wholeFile.getPath(), 0, wholeFile.getLength() / 2, wholeFile.getHostnames());
    // the second split starts where the first one ends
    final FileInputSplit secondPart = new FileInputSplit(
            1, wholeFile.getPath(), firstPart.getLength(), wholeFile.getLength(), wholeFile.getHostnames());

    final Configuration config = new Configuration();
    format.configure(config);

    // first split: expected to yield the first two records
    format.open(firstPart);
    assertEquals("value1", format.nextRecord(null));
    assertEquals("value2", format.nextRecord(null));
    assertNull(format.nextRecord(null));
    assertTrue(format.reachedEnd());
    format.close();

    // second split: expected to yield only the last record
    format.open(secondPart);
    assertEquals("value3", format.nextRecord(null));
    assertNull(format.nextRecord(null));
    assertTrue(format.reachedEnd());
    format.close();
}
assertEquals(inputSplit.getStart() + inputSplit.getLength(), offsetAtEndOfSplit[splitCounter]); splitCounter++;
/**
 * Reads the next Bitcoin block of the current split.
 *
 * <p>A block is only read if the split length is negative or the stream position has not
 * passed the end of the split. A {@code BitcoinBlockReadException} is logged and then handled
 * like a missing block, i.e. the end of the input is flagged.
 *
 * @param reuse Not used by this method.
 * @return The block that was read, or {@code null} if no block was read.
 */
@Override
public BitcoinBlock nextRecord(BitcoinBlock reuse) throws IOException {
    final boolean mayRead = this.currentSplit.getLength() < 0
            || this.stream.getPos() <= this.currentSplit.getStart() + this.currentSplit.getLength();
    if (!mayRead) {
        this.isEndReached = true;
        return null;
    }

    BitcoinBlock block = null;
    try {
        block = this.getBbr().readBlock();
    } catch (BitcoinBlockReadException e) {
        LOG.error(e);
    }

    if (block == null) {
        this.isEndReached = true;
    }
    return block;
}
/**
 * Reads the next Ethereum block of the current split.
 *
 * <p>A block is only read if the split length is negative or the stream position has not
 * passed the end of the split; otherwise the end of the input is flagged.
 *
 * @param reuse Not used by this method.
 * @return The block that was read, or {@code null} if no block was read.
 * @throws IOException Declared by the overridden method.
 * @throws RuntimeException If the block reader fails with an {@code EthereumBlockReadException};
 *         the original exception is attached as the cause.
 */
@Override
public EthereumBlock nextRecord(EthereumBlock reuse) throws IOException {
    EthereumBlock dataBlock = null;
    if ((this.currentSplit.getLength() < 0)
            || (this.stream.getPos() <= this.currentSplit.getStart() + this.currentSplit.getLength())) {
        try {
            dataBlock = this.getEbr().readBlock();
        } catch (EthereumBlockReadException e) {
            LOG.error(e);
            // keep the original message, but chain the cause so the stack trace is not lost
            throw new RuntimeException(e.toString(), e);
        }
        if (dataBlock == null) {
            this.isEndReached = true;
        }
    } else {
        this.isEndReached = true;
    }
    return dataBlock;
}
protected RecordReader createReader( FileInputSplit fileSplit, TaskAttemptContext taskAttemptContext) throws IOException { // by default, we use org.apache.orc.mapreduce.OrcMapreduceRecordReader Configuration hadoopConf = taskAttemptContext.getConfiguration(); org.apache.hadoop.fs.Path filePath = new org.apache.hadoop.fs.Path(fileSplit.getPath().toUri()); Reader file = OrcFile.createReader(filePath, OrcFile.readerOptions(hadoopConf).maxLength(OrcConf.MAX_FILE_LENGTH.getLong(hadoopConf))); return new OrcMapreduceRecordReader<>(file, org.apache.orc.mapred.OrcInputFormat.buildOptions(hadoopConf, file, fileSplit.getStart(), fileSplit.getLength())); } }
/**
 * Copies the given split into a {@link TimestampedFileInputSplit} with the given
 * modification time and, if present, the given split state.
 *
 * @param split The split to copy.
 * @param modificationTime The modification time to attach to the split.
 * @param state The state to set on the new split; skipped if {@code null}.
 * @return The newly created timestamped split.
 */
private TimestampedFileInputSplit createTimestampedFileSplit(FileInputSplit split, long modificationTime, Serializable state) {
    final TimestampedFileInputSplit result = new TimestampedFileInputSplit(
            modificationTime,
            split.getSplitNumber(),
            split.getPath(),
            split.getStart(),
            split.getLength(),
            split.getHostnames());

    if (state == null) {
        return result;
    }

    result.setSplitState(state);
    return result;
}
}