/** {@inheritDoc} */ @Override public float getProgress() throws IOException, InterruptedException { assert null != mAvroFileReader; if (mEndPosition == mStartPosition) { // Trivial empty input split. return 0.0f; } long bytesRead = mAvroFileReader.previousSync() - mStartPosition; long bytesTotal = mEndPosition - mStartPosition; LOG.debug("Progress: bytesRead=" + bytesRead + ", bytesTotal=" + bytesTotal); return Math.min(1.0f, (float) bytesRead / (float) bytesTotal); }
/**
 * Marks the stream at the most recent sync boundary, rewound by one sync-marker
 * length so the marker itself is re-read on reset; clamped at file start.
 */
@Override
public void mark() throws IOException {
  final long markPos = Math.max(0L, fileReader.previousSync() - DataFileConstants.SYNC_SIZE);
  ((RemoteMarkable) ris).markPosition(markPos);
}
@Override public E nextRecord(E reuseValue) throws IOException { if (reachedEnd()) { return null; } // if we start a new block, then register the event, and // restart the counter. if (dataFileReader.previousSync() != lastSync) { lastSync = dataFileReader.previousSync(); recordsReadSinceLastSync = 0; } recordsReadSinceLastSync++; if (reuseAvroValue) { return dataFileReader.next(reuseValue); } else { if (GenericRecord.class == avroValueType) { return dataFileReader.next(); } else { return dataFileReader.next(InstantiationUtil.instantiate(avroValueType, Object.class)); } } }
/**
 * Opens the split and positions the Avro reader at the first block boundary at or
 * after the split start; that boundary is remembered in {@code lastSync}.
 */
@Override
public void open(FileInputSplit split) throws IOException {
  super.open(split);
  dataFileReader = initReader(split);
  final long splitStart = split.getStart();
  dataFileReader.sync(splitStart);
  lastSync = dataFileReader.previousSync();
}
/**
 * Opens the Avro container held in {@code bits}, reads its first record, and hands
 * the container-header bytes plus that record to {@code processor}.
 *
 * @param bits complete bytes of an Avro container file
 * @param processor callback receiving the header, first record, block count and block size
 * @return whatever the processor produces
 * @throws IOException if the Avro container cannot be read or closed
 * @throws RuntimeException if the file contains no records
 */
static <T> T runOnPreview(byte[] bits, AvroPreviewProcessor<T> processor) throws IOException {
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>();
  SeekableByteArrayInput sbai = new SeekableByteArrayInput(bits);
  // try-with-resources replaces the original null-checked finally block, which
  // silently swallowed any IOException thrown by close().
  try (DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(sbai, datumReader)) {
    // Right after open, previousSync() is the length of the container header.
    int headerLen = (int) dataFileReader.previousSync();
    byte[] header = Arrays.copyOf(bits, headerLen);
    if (!dataFileReader.hasNext()) {
      throw new RuntimeException("Empty Avro file - cannot run preview! ");
    }
    GenericRecord gr = dataFileReader.next();
    return processor.process(header, gr, dataFileReader.getBlockCount(), dataFileReader.getBlockSize());
  }
}
// NOTE(review): this span is a garbled merge/diff fragment, not compilable Java — an
// unreachable `return 0;` sits mid-flow, the stray `+ (badRecordsInBlock));` has no enclosing
// expression, and the braces do not balance. It appears to be the remains of an Avro
// block-recovery loop tracking previousSync()/getBlockCount()/getBlockSize(); recover the
// original method body from version control rather than editing this in place.
int numCorruptRecords = 0; int recordsWritten = 0; long position = fileReader.previousSync(); long blockSize = 0; long blockCount = 0; return 0; position = fileReader.previousSync(); blockCount = fileReader.getBlockCount(); blockSize = fileReader.getBlockSize(); + (badRecordsInBlock)); position = fileReader.previousSync(); } catch (Exception e) { err.println("Failed to read block " + numBlocks + ". Unknown record "
// NOTE(review): incomplete fragment — the `while` loop is never closed, and because the block
// was collapsed onto one line the inline `//` comment now swallows the code after it. The
// intent (read records while previousSync() still equals the sync point captured before the
// loop, i.e. stay within the first chunk) must be confirmed against the full source file.
GenericRecord gr = new GenericData.Record(schema); Schema.Field[] flatSchema = flatSchema(schema); long sync = dataFileReader.previousSync(); if (sbai.chunkCnt == 0) { // Find data in first chunk while (dataFileReader.hasNext() && dataFileReader.previousSync() == sync) { gr = dataFileReader.next(gr);
/** {@inheritDoc} */ @Override public void initialize(InputSplit inputSplit, TaskAttemptContext context) throws IOException, InterruptedException { if (!(inputSplit instanceof FileSplit)) { throw new IllegalArgumentException("Only compatible with FileSplits."); } FileSplit fileSplit = (FileSplit) inputSplit; // Open a seekable input stream to the Avro container file. SeekableInput seekableFileInput = createSeekableInput(context.getConfiguration(), fileSplit.getPath()); // Wrap the seekable input stream in an Avro DataFileReader. Configuration conf = context.getConfiguration(); GenericData dataModel = AvroSerialization.createDataModel(conf); DatumReader<T> datumReader = dataModel.createDatumReader(mReaderSchema); mAvroFileReader = createAvroFileReader(seekableFileInput, datumReader); // Initialize the start and end offsets into the file based on the boundaries of the // input split we're responsible for. We will read the first block that begins // after the input split start boundary. We will read up to but not including the // first block that starts after input split end boundary. // Sync to the closest block/record boundary just after beginning of our input split. mAvroFileReader.sync(fileSplit.getStart()); // Initialize the start position to the beginning of the first block of the input split. mStartPosition = mAvroFileReader.previousSync(); // Initialize the end position to the end of the input split (this isn't necessarily // on a block boundary so using this for reporting progress will be approximate. mEndPosition = fileSplit.getStart() + fileSplit.getLength(); }
public void testSyncDiscovery() throws IOException { File file = makeFile(); DataFileReader<Object> reader = new DataFileReader<>(file, new GenericDatumReader<>()); try { // discover the sync points ArrayList<Long> syncs = new ArrayList<>(); long previousSync = -1; while (reader.hasNext()) { if (reader.previousSync() != previousSync) { previousSync = reader.previousSync(); syncs.add(previousSync); } reader.next(); } // confirm that the first point is the one reached by sync(0) reader.sync(0); assertEquals((long)reader.previousSync(), (long)syncs.get(0)); // and confirm that all points are reachable for (Long sync : syncs) { reader.seek(sync); assertNotNull(reader.next()); } } finally { reader.close(); } }
public void testReadWithHeader() throws IOException { File file = makeFile(); DataFileReader<Object> reader = new DataFileReader<>(file, new GenericDatumReader<>()); // get a header for this file DataFileStream.Header header = reader.getHeader(); // re-open to an arbitrary position near the middle, with sync == true SeekableFileInput sin = new SeekableFileInput(file); sin.seek(sin.length() / 2); reader = DataFileReader.openReader(sin, new GenericDatumReader<>(), header, true); assertNotNull("Should be able to reopen from arbitrary point", reader.next()); long validPos = reader.previousSync(); // post sync, we know of a valid sync point: re-open with seek (sync == false) sin.seek(validPos); reader = DataFileReader.openReader(sin, new GenericDatumReader<>(), header, false); assertEquals("Should not move from sync point on reopen", validPos, sin.tell()); assertNotNull("Should be able to reopen at sync point", reader.next()); }
// NOTE(review): incomplete fragment — the `try` block opened here is never closed, so the
// surrounding test method (presumably asserting behavior after the last record, e.g. that a
// further next() fails or that prevSync remains usable) is cut off. Consult the full test
// file before editing.
assertEquals("apple", r.next().toString()); assertEquals("banana", r.next().toString()); long prevSync = r.previousSync(); try { r.next();
// Reports split progress as (last sync point passed - split start) / (split length),
// clamped to 1.0 because the final block read may extend past the split end. Returns 0
// for a trivially empty split to avoid dividing by zero.
/** {@inheritDoc} */ @Override public float getProgress() throws IOException, InterruptedException { assert null != mAvroFileReader; if (mEndPosition == mStartPosition) { // Trivial empty input split. return 0.0f; } long bytesRead = mAvroFileReader.previousSync() - mStartPosition; long bytesTotal = mEndPosition - mStartPosition; LOG.debug("Progress: bytesRead=" + bytesRead + ", bytesTotal=" + bytesTotal); return Math.min(1.0f, (float) bytesRead / (float) bytesTotal); }
// Marks the stream at the most recent sync boundary rewound by one sync-marker length
// (so the marker itself is re-read on reset), clamped to 0 near the start of the file.
@Override public void mark() throws IOException { long pos = fileReader.previousSync() - DataFileConstants.SYNC_SIZE; if (pos < 0) pos = 0; ((RemoteMarkable) ris).markPosition(pos); }
@Override public IncomingMessageEnvelope readNext() { // get checkpoint for THIS record String checkpoint = nextOffset(); GenericRecord record = fileReader.next(); if (fileReader.previousSync() != curBlockStart) { curBlockStart = fileReader.previousSync(); curRecordOffset = 0; } else { curRecordOffset++; } // avro schema doesn't necessarily have key field return new IncomingMessageEnvelope(systemStreamPartition, checkpoint, null, record); }
// Opens the split and positions the Avro reader at the first block boundary at or after
// the split start; lastSync records that boundary so later reads can detect block changes.
@Override public void open(FileInputSplit split) throws IOException { super.open(split); dataFileReader = initReader(split); dataFileReader.sync(split.getStart()); lastSync = dataFileReader.previousSync(); }
// Opens the split and syncs the Avro reader to the first block boundary at or after the
// split start, remembering that boundary in lastSync for block-transition tracking.
@Override public void open(FileInputSplit split) throws IOException { super.open(split); dataFileReader = initReader(split); dataFileReader.sync(split.getStart()); lastSync = dataFileReader.previousSync(); }
// Opens the split and syncs the Avro reader to the first block boundary at or after the
// split start, remembering that boundary in lastSync for block-transition tracking.
@Override public void open(FileInputSplit split) throws IOException { super.open(split); dataFileReader = initReader(split); dataFileReader.sync(split.getStart()); lastSync = dataFileReader.previousSync(); }
// Opens the split and syncs the Avro reader to the first block boundary at or after the
// split start, remembering that boundary in lastSync for block-transition tracking.
@Override public void open(FileInputSplit split) throws IOException { super.open(split); dataFileReader = initReader(split); dataFileReader.sync(split.getStart()); lastSync = dataFileReader.previousSync(); }
// Syncs the Avro reader to the requested record offset and stores the resulting sync
// point, rewound by 15 bytes, as the current offset.
// NOTE(review): 15 is a magic number — Avro's sync marker is DataFileConstants.SYNC_SIZE
// (16) bytes. Confirm whether 15 is intentional (e.g. to land strictly inside the previous
// marker so a later sync() re-finds it) or an off-by-one; if intentional, replace with a
// named constant such as SYNC_SIZE - 1 and document why.
@Override public void seek(Offset offset) { try { reader.sync(offset.getRecordOffset()); this.offset.setOffset(reader.previousSync() - 15); } catch (IOException ioe) { throw new ConnectException("Error seeking file " + getFilePath(), ioe); } }
@Override public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { FileSplit fileSplit = (FileSplit) split; reader = new DataFileReader<Object>(new FsInput(fileSplit.getPath(), context.getConfiguration()), new GenericStrDatumReader<Object>()); reader.sync(fileSplit.getStart()); // sync to start this.start = reader.previousSync(); this.end = fileSplit.getStart() + split.getLength(); }