/**
 * Repositions the Avro file reader using the mark position reported by the
 * underlying stream.
 *
 * @throws IOException if the file reader cannot be synced to that position
 */
@Override
public void reset() throws IOException {
  // The stream is expected to be a RemoteMarkable; the cast fails otherwise.
  final RemoteMarkable markable = (RemoteMarkable) ris;
  final long markedPosition = markable.getMarkPosition();
  fileReader.sync(markedPosition);
}
/**
 * Read the last record in the file.
 *
 * <p>Syncs the reader at most 256 bytes before the end of the tracker file and
 * then reads every remaining record, passing {@code metaCache} as the reuse
 * object on each read.
 *
 * @throws IOException if syncing or reading fails
 */
private void initReader() throws IOException {
  // Start at most 256 bytes before the end, but never before offset 0.
  final long tailStart = Math.max(0L, trackerFile.length() - 256L);
  reader.sync(tailStart);
  // Drain everything after the sync point; the final read is the last record.
  for (; reader.hasNext(); ) {
    reader.next(metaCache);
  }
}
/**
 * Creates the Avro file reader over the resettable input stream and prepares
 * the writer, encoder and schema fingerprint derived from the file's schema.
 *
 * @throws IOException if the input cannot be positioned or read
 * @throws NoSuchAlgorithmException if the "CRC-64-AVRO" fingerprint algorithm
 *         is unavailable
 */
private void initialize() throws IOException, NoSuchAlgorithmException {
  final SeekableResettableInputBridge bridge = new SeekableResettableInputBridge(ris);

  // Remember the current offset, rewind to 0 to construct the reader, then
  // sync the reader back at the remembered offset.
  final long resumeAt = bridge.tell();
  bridge.seek(0L);
  fileReader = new DataFileReader<GenericRecord>(bridge,
      new GenericDatumReader<GenericRecord>());
  fileReader.sync(resumeAt);

  // Everything below is derived from the schema stored in the file.
  schema = fileReader.getSchema();
  datumWriter = new GenericDatumWriter(schema);
  out = new ByteArrayOutputStream();
  // The previous encoder (if any) is handed back to the factory for reuse.
  encoder = EncoderFactory.get().binaryEncoder(out, encoder);

  schemaHash = SchemaNormalization.parsingFingerprint("CRC-64-AVRO", schema);
  schemaHashString = Hex.encodeHexString(schemaHash);
}
/**
 * Opens the given split, creates the Avro reader for it, syncs the reader at
 * the split's start offset and records the resulting sync position.
 *
 * @param split the file split to read
 * @throws IOException if the split cannot be opened or the reader cannot sync
 */
@Override
public void open(FileInputSplit split) throws IOException {
  super.open(split);

  dataFileReader = initReader(split);

  final long splitStart = split.getStart();
  dataFileReader.sync(splitStart);

  // Keep the sync position the reader reports after syncing.
  lastSync = dataFileReader.previousSync();
}
public TetherRecordReader(JobConf job, FileSplit split) throws IOException { this.in = new FsInput(split.getPath(), job); this.reader = new DataFileReader<>(in, new GenericDatumReader<>()); reader.sync(split.getStart()); // sync to start this.start = in.tell(); this.end = split.getStart() + split.getLength(); job.set(AvroJob.INPUT_SCHEMA, reader.getSchema().toString()); }
/**
 * Opens the {@code syncInMeta.avro} test file, syncs at offset 0 and checks
 * that every datum read afterwards is non-null.
 *
 * <p>The reader is opened in a try-with-resources block so the underlying
 * file handle is released even when an assertion or read fails.
 *
 * @throws IOException if the test file cannot be opened or read
 */
@Test
public void testSyncInHeader() throws IOException {
  try (DataFileReader<Object> reader = new DataFileReader<>(
      new File("../../../share/test/data/syncInMeta.avro"), new GenericDatumReader<>())) {
    reader.sync(0);
    for (Object datum : reader) {
      assertNotNull(datum);
    }
  }
}
/** * Construct a reader for a file at the current position of the input, * without reading the header. * @param sync True to read forward to the next sync point after opening, * false to assume that the input is already at a valid sync * point. */ public static <D> DataFileReader<D> openReader(SeekableInput in, DatumReader<D> reader, Header header, boolean sync) throws IOException { DataFileReader<D> dreader = new DataFileReader<>(in, reader, header); // seek/sync to an (assumed) valid position if (sync) dreader.sync(in.tell()); else dreader.seek(in.tell()); return dreader; }
/** * Construct a reader for a file at the current position of the input, * without reading the header. * @param sync True to read forward to the next sync point after opening, * false to assume that the input is already at a valid sync * point. */ public static <D> DataFileReader<D> openReader(SeekableInput in, DatumReader<D> reader, Header header, boolean sync) throws IOException { DataFileReader<D> dreader = new DataFileReader<D>(in, reader, header); // seek/sync to an (assumed) valid position if (sync) dreader.sync(in.tell()); else dreader.seek(in.tell()); return dreader; }
/** {@inheritDoc} */ @Override public void initialize(InputSplit inputSplit, TaskAttemptContext context) throws IOException, InterruptedException { if (!(inputSplit instanceof FileSplit)) { throw new IllegalArgumentException("Only compatible with FileSplits."); } FileSplit fileSplit = (FileSplit) inputSplit; // Open a seekable input stream to the Avro container file. SeekableInput seekableFileInput = createSeekableInput(context.getConfiguration(), fileSplit.getPath()); // Wrap the seekable input stream in an Avro DataFileReader. Configuration conf = context.getConfiguration(); GenericData dataModel = AvroSerialization.createDataModel(conf); DatumReader<T> datumReader = dataModel.createDatumReader(mReaderSchema); mAvroFileReader = createAvroFileReader(seekableFileInput, datumReader); // Initialize the start and end offsets into the file based on the boundaries of the // input split we're responsible for. We will read the first block that begins // after the input split start boundary. We will read up to but not including the // first block that starts after input split end boundary. // Sync to the closest block/record boundary just after beginning of our input split. mAvroFileReader.sync(fileSplit.getStart()); // Initialize the start position to the beginning of the first block of the input split. mStartPosition = mAvroFileReader.previousSync(); // Initialize the end position to the end of the input split (this isn't necessarily // on a block boundary so using this for reporting progress will be approximate. mEndPosition = fileSplit.getStart() + fileSplit.getLength(); }
public void testSplits() throws IOException { File file = makeFile(); DataFileReader<Object> reader = new DataFileReader<>(file, new GenericDatumReader<>()); Random rand = new Random(SEED); try { int splits = 10; // number of splits int length = (int)file.length(); // length of file int end = length; // end of split int remaining = end; // bytes remaining int count = 0; // count of entries while (remaining > 0) { int start = Math.max(0, end - rand.nextInt(2*length/splits)); reader.sync(start); // count entries in split while (!reader.pastSync(end)) { reader.next(); count++; } remaining -= end-start; end = start; } assertEquals(COUNT, count); } finally { reader.close(); } }
public void testSyncDiscovery() throws IOException { File file = makeFile(); DataFileReader<Object> reader = new DataFileReader<>(file, new GenericDatumReader<>()); try { // discover the sync points ArrayList<Long> syncs = new ArrayList<>(); long previousSync = -1; while (reader.hasNext()) { if (reader.previousSync() != previousSync) { previousSync = reader.previousSync(); syncs.add(previousSync); } reader.next(); } // confirm that the first point is the one reached by sync(0) reader.sync(0); assertEquals((long)reader.previousSync(), (long)syncs.get(0)); // and confirm that all points are reachable for (Long sync : syncs) { reader.seek(sync); assertNotNull(reader.next()); } } finally { reader.close(); } }
assertEquals("Invalid sync!", e.getCause().getMessage()); r.sync(prevSync); // go to sync point after previous successful one assertEquals("endive", r.next().toString()); assertEquals("fig", r.next().toString());
/**
 * Syncs the Avro file reader at the mark position obtained from the
 * underlying stream, which must be a {@code RemoteMarkable}.
 *
 * @throws IOException if syncing the file reader fails
 */
@Override
public void reset() throws IOException {
  fileReader.sync(((RemoteMarkable) ris).getMarkPosition());
}
/**
 * Syncs the reader at offset 0 and then closes it.
 *
 * <p>The close is performed in a {@code finally} block so the reader is
 * released even when the sync fails; previously a failing sync skipped the
 * close and leaked the reader.
 *
 * @throws IOException if syncing or closing the reader fails
 */
@Override
public void close() throws IOException {
  try {
    // NOTE(review): the reason for syncing to 0 before closing is not visible
    // here; it is kept to preserve existing behaviour.
    reader.sync(0);
  } finally {
    reader.close();
  }
}
/**
 * Creates the vector writer for {@code output}, opens an Avro reader over the
 * configured file and syncs it at this reader's start position.
 *
 * @param output the mutator the vector container writer is built on
 * @throws ExecutionSetupException wrapping any {@link IOException} raised
 *         while opening or syncing the file
 */
@Override
public void setup(final OutputMutator output) throws ExecutionSetupException {
  writer = new VectorContainerWriter(output);

  try {
    final FsInput fileInput = new FsInput(hadoop, fsConf);
    final GenericDatumReader<GenericContainer> datumReader =
        new GenericDatumReader<GenericContainer>();
    reader = new DataFileReader<>(fileInput, datumReader);

    logger.debug("Processing file : {}, start position : {}, end position : {} ", hadoop, start, end);

    reader.sync(this.start);
  } catch (IOException ioFailure) {
    throw new ExecutionSetupException(ioFailure);
  }
}
public TetherRecordReader(JobConf job, FileSplit split) throws IOException { this.in = new FsInput(split.getPath(), job); this.reader = new DataFileReader<Object>(in, new GenericDatumReader<Object>()); reader.sync(split.getStart()); // sync to start this.start = in.tell(); this.end = split.getStart() + split.getLength(); job.set(AvroJob.INPUT_SCHEMA, reader.getSchema().toString()); }
/**
 * Syncs the reader at the record offset carried by {@code offset} and updates
 * this reader's own offset from the resulting sync position.
 *
 * @param offset the offset to seek to; note that it shadows the field of the
 *          same name, which is the one being updated
 * @throws ConnectException wrapping any {@link IOException} raised by the sync
 */
@Override
public void seek(Offset offset) {
  try {
    final long requested = offset.getRecordOffset();
    reader.sync(requested);

    // NOTE(review): the meaning of the constant 15 is not visible from this
    // code; confirm what it compensates for.
    final long resolved = reader.previousSync() - 15;
    this.offset.setOffset(resolved);
  } catch (IOException ioe) {
    throw new ConnectException("Error seeking file " + getFilePath(), ioe);
  }
}
@Override public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { FileSplit fileSplit = (FileSplit) split; Configuration config = context.getConfiguration(); Path path = fileSplit.getPath(); this.in = new FsInput(path, config); DatumReader<T> datumReader = getDatumReader(config); this.reader = new DataFileReader<T>(in, datumReader); reader.sync(fileSplit.getStart()); // sync to start this.start = in.tell(); this.end = fileSplit.getStart() + split.getLength(); }
/**
 * Opens the file behind {@code split} with a specific-record datum reader
 * built from the job's input schema, syncs at the split start and records the
 * start/end offsets.
 *
 * @param job the job configuration supplying the input schema
 * @param split the file split to read
 * @throws IOException if the file cannot be opened or synced
 */
public AvroRecordReader(JobConf job, FileSplit split) throws IOException {
  this.in = new FsInput(split.getPath(), job);

  final Schema inputSchema = AvroJob.getInputSchema(job);
  final SpecificDatumReader<T> datumReader = new SpecificDatumReader<T>(inputSchema);
  this.reader = new DataFileReader<T>(in, datumReader);

  // sync to start
  reader.sync(split.getStart());

  // Start is the input position after syncing; end is the split boundary.
  this.start = in.tell();
  this.end = split.getStart() + split.getLength();
}