Refine search
/**
 * Reads and returns the first record from the Avro data file at {@code path},
 * or {@code null} if the file contains no records.
 *
 * @param path filesystem path of the Avro data file
 * @return the first {@link GenericRecord} in the file, or {@code null} when the file is empty
 * @throws IOException if the file cannot be opened or read
 */
private GenericRecord getRecordFromFile(String path) throws IOException {
  DatumReader<GenericRecord> reader = new GenericDatumReader<>();
  // try-with-resources: the original leaked the DataFileReader (and its file handle).
  try (DataFileReader<GenericRecord> dataFileReader =
           new DataFileReader<>(new File(path), reader)) {
    // The original while-loop only ever executed once; a conditional is clearer.
    return dataFileReader.hasNext() ? dataFileReader.next() : null;
  }
}
/**
 * Reads and returns the first datum in a data file.
 *
 * @param schema reader schema used to decode the datum
 * @param file   path of the Avro data file
 * @return the first datum stored in the file
 * @throws IOException if the file cannot be opened or read
 */
static Object datumFromFile(Schema schema, String file) throws IOException {
  DataFileReader<Object> reader =
      new DataFileReader<>(new File(file), new GenericDatumReader<>(schema));
  try {
    // Deliberately no hasNext() guard: an empty file surfaces as an exception.
    return reader.next();
  } finally {
    reader.close();
  }
}
// NOTE(review): partial excerpt — the enclosing method's signature and the loop's
// closing braces are outside this view; left byte-identical.
// Rebuilds a DataFileReader.Header from a stored header byte[] so that a chunk of
// the file (sbai) can be opened mid-stream via DataFileReader.openReader(..., true).
// For the first chunk (sbai.chunkCnt == 0) it reads records until previousSync()
// moves past the initial sync marker, i.e. until the first block is exhausted.
// `cidx`, `cnt`, `din`, and `flatSchema(...)` are defined elsewhere — verify against the full file.
fakeHeader = new DataFileReader<>(new SeekableByteArrayInput(this.header), datumReader).getHeader(); dataFileReader = DataFileReader.openReader(sbai, datumReader, fakeHeader, true); Schema schema = dataFileReader.getSchema(); GenericRecord gr = new GenericData.Record(schema); Schema.Field[] flatSchema = flatSchema(schema); long sync = dataFileReader.previousSync(); if (sbai.chunkCnt == 0) { // Find data in first chunk while (dataFileReader.hasNext() && dataFileReader.previousSync() == sync) { gr = dataFileReader.next(gr); Log.trace(String.format("Avro: ChunkIdx: %d read %d records, start at %d off, block count: %d, block size: %d", cidx, cnt, din.getChunkDataStart(cidx), dataFileReader.getBlockCount(), dataFileReader.getBlockSize()));
/**
 * Opens the split's file, syncs the Avro reader to the first sync marker at or
 * after the split start, records the byte range this reader covers, and
 * publishes the file's schema into the job configuration under
 * {@code AvroJob.INPUT_SCHEMA}.
 *
 * Note: {@code start} is taken from {@code in.tell()} AFTER the sync, so it is
 * the actual read position, not the raw split offset — statement order matters.
 *
 * @param job   job configuration supplying the filesystem and receiving the schema
 * @param split the file split to read
 * @throws IOException if the file cannot be opened or synced
 */
public TetherRecordReader(JobConf job, FileSplit split) throws IOException { this.in = new FsInput(split.getPath(), job); this.reader = new DataFileReader<>(in, new GenericDatumReader<>()); reader.sync(split.getStart()); // sync to start this.start = in.tell(); this.end = split.getStart() + split.getLength(); job.set(AvroJob.INPUT_SCHEMA, reader.getSchema().toString()); }
/**
 * Opens the given Avro bytes, captures the raw file header and the first record,
 * and hands them to {@code processor} along with the reader's current block
 * count and block size.
 *
 * @param bits      complete Avro file contents
 * @param processor callback receiving header bytes, first record, and block stats
 * @return whatever {@code processor} produces
 * @throws IOException      if the bytes cannot be parsed as an Avro data file
 * @throws RuntimeException if the file contains no records
 */
static <T> T runOnPreview(byte[] bits, AvroPreviewProcessor<T> processor) throws IOException {
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>();
  SeekableByteArrayInput sbai = new SeekableByteArrayInput(bits);
  DataFileReader<GenericRecord> dataFileReader = null;
  try {
    dataFileReader = new DataFileReader<>(sbai, datumReader);
    // Everything before the first sync marker is the file header.
    int headerLen = (int) dataFileReader.previousSync();
    byte[] header = Arrays.copyOf(bits, headerLen);
    if (!dataFileReader.hasNext()) {
      throw new RuntimeException("Empty Avro file - cannot run preview! ");
    }
    GenericRecord firstRecord = dataFileReader.next();
    return processor.process(header, firstRecord,
        dataFileReader.getBlockCount(), dataFileReader.getBlockSize());
  } finally {
    if (dataFileReader != null) {
      try {
        dataFileReader.close();
      } catch (IOException ignored) {
        // Best-effort close; a failure here must not mask the primary result/exception.
      }
    }
  }
}
/**
 * Test that non-string map-keys are readable through ReflectDatumReader.
 * This method should form the original map and should not return any
 * array of {key, value} as done by {@link #testGenericDatumRead()}.
 *
 * @param testType   label of the test scenario (kept for signature compatibility)
 * @param bytes      serialized Avro file contents to read back
 * @param entityObjs expected entities (kept for signature compatibility)
 * @return all records decoded from {@code bytes}
 * @throws IOException if the bytes cannot be parsed as an Avro data file
 */
private <T> List<T> testReflectDatumRead(String testType, byte[] bytes, T... entityObjs)
    throws IOException {
  ReflectDatumReader<T> datumReader = new ReflectDatumReader<>();
  // try-with-resources: the original never closed the file reader (leak).
  // Also dropped the unused `Schema schema = fileReader.getSchema()` local.
  try (DataFileReader<T> fileReader =
           new DataFileReader<>(new SeekableByteArrayInput(bytes), datumReader)) {
    List<T> records = new ArrayList<>();
    T record = null; // reused as the reuse-target for next(record)
    while (fileReader.hasNext()) {
      records.add(fileReader.next(record));
    }
    return records;
  }
}
public void testSyncDiscovery() throws IOException { File file = makeFile(); DataFileReader<Object> reader = new DataFileReader<>(file, new GenericDatumReader<>()); try { // discover the sync points ArrayList<Long> syncs = new ArrayList<>(); long previousSync = -1; while (reader.hasNext()) { if (reader.previousSync() != previousSync) { previousSync = reader.previousSync(); syncs.add(previousSync); } reader.next(); } // confirm that the first point is the one reached by sync(0) reader.sync(0); assertEquals((long)reader.previousSync(), (long)syncs.get(0)); // and confirm that all points are reachable for (Long sync : syncs) { reader.seek(sync); assertNotNull(reader.next()); } } finally { reader.close(); } }
// NOTE(review): partial test excerpt — setup (positionOne/positionTwo, conf, the
// writer that produced target/temp.avro) is outside this view; left byte-identical.
// Seeks to two previously captured sync positions and reads one record at each,
// verifying seek() works in both directions.
// NOTE(review): assertEquals(2, dataFileReader.next()) compares a boxed int with
// whatever next() returns despite the GenericData.Record type parameter — this
// presumably relies on the data actually being integers; confirm against the full test.
conf.set("fs.default.name", "file:///"); Path avroFile = new Path("target/temp.avro"); DataFileReader<GenericData.Record> dataFileReader = new DataFileReader<>(new FsInput(avroFile, conf), new SpecificDatumReader<>()); dataFileReader.seek(positionTwo); assertTrue(dataFileReader.hasNext()); // Record 2. assertEquals(2, dataFileReader.next()); dataFileReader.seek(positionOne); assertTrue(dataFileReader.hasNext()); // Record 1. assertEquals(1, dataFileReader.next()); dataFileReader.close();
// NOTE(review): garbled excerpt — the statement
// "AvroKeyValue.getSchema(options.getKeySchema(), options.getValueSchema()));" is
// dangling: the declaration of `dataReader` it belonged to appears to have been
// lost in extraction, and the try block's opening is not visible. Left byte-identical;
// verify against the original file before reuse.
// Intent (from what is visible): open the index file, then seek the data file to
// each index-recorded position and assert the apple/carrot/durian key-value pairs.
DatumReader<GenericRecord> indexReader = new GenericDatumReader<>( AvroKeyValue.getSchema(options.getKeySchema(), Schema.create(Schema.Type.LONG))); FileReader<GenericRecord> indexFileReader = DataFileReader.openReader(indexFile, indexReader); AvroKeyValue.getSchema(options.getKeySchema(), options.getValueSchema())); DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(dataFile, dataReader); dataFileReader.seek(indexRecords.get(0).getValue()); assertTrue(dataFileReader.hasNext()); AvroKeyValue<CharSequence, CharSequence> appleRecord = new AvroKeyValue<>(dataFileReader.next()); assertEquals("apple", appleRecord.getKey().toString()); assertEquals("Apple", appleRecord.getValue().toString()); dataFileReader.seek(indexRecords.get(1).getValue()); assertTrue(dataFileReader.hasNext()); AvroKeyValue<CharSequence, CharSequence> carrotRecord = new AvroKeyValue<>(dataFileReader.next()); assertEquals("carrot", carrotRecord.getKey().toString()); assertEquals("Carrot", carrotRecord.getValue().toString()); assertTrue(dataFileReader.hasNext()); AvroKeyValue<CharSequence, CharSequence> durianRecord = new AvroKeyValue<>(dataFileReader.next()); assertEquals("durian", durianRecord.getKey().toString()); assertEquals("Durian", durianRecord.getValue().toString()); } finally { dataFileReader.close();
// NOTE(review): partial excerpt — `record` is referenced by counts.put(...) but no
// loop reading it from the reader is visible, so the iteration over the file's
// records was evidently dropped in extraction. Left byte-identical; verify against
// the full test. Intent: assert exactly one part-* output file exists, read its
// TextStats records, and tally name -> count.
FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/part-*")); Assert.assertEquals(1, outputFiles.length); DataFileReader<TextStats> reader = new DataFileReader<>( new FsInput(outputFiles[0].getPath(), job.getConfiguration()), new SpecificDatumReader<>()); Map<String, Integer> counts = new HashMap<>(); counts.put(record.name.toString(), record.count); reader.close();
/**
 * Get Avro schema from an Avro data file.
 *
 * @param dataFile path of the Avro data file to inspect
 * @param fs       filesystem used to open the file
 * @return the writer schema embedded in the file
 * @throws IOException if the file cannot be opened or read
 */
public static Schema getSchemaFromDataFile(Path dataFile, FileSystem fs) throws IOException {
  // Both the seekable input and the reader are closed automatically, in reverse order.
  try (SeekableInput input = new FsInput(dataFile, fs.getConf());
       DataFileReader<GenericRecord> dataReader =
           new DataFileReader<>(input, new GenericDatumReader<GenericRecord>())) {
    return dataReader.getSchema();
  }
}
/**
 * Returns an {@link DataFileReader} to the specified avro file.
 * <p>
 * Note: It is the caller's responsibility to close the returned {@link DataFileReader}.
 * </p>
 *
 * @param file The path to the avro file to open.
 * @return A {@link DataFileReader} for the specified avro file, or {@code null} if the file does not exist.
 * @throws FileBasedHelperException if there is a problem opening the {@link InputStream} for the specified file.
 */
public DataFileReader<GenericRecord> getAvroFile(String file) throws FileBasedHelperException {
  try {
    Path filePath = new Path(file);
    if (!this.getFileSystem().exists(filePath)) {
      LOGGER.warn(file + " does not exist.");
      return null;
    }
    // When proxying as another user, the input stream must go through ProxyFsInput.
    boolean proxyAsUser = this.getState().getPropAsBoolean(
        ConfigurationKeys.SHOULD_FS_PROXY_AS_USER,
        ConfigurationKeys.DEFAULT_SHOULD_FS_PROXY_AS_USER);
    if (proxyAsUser) {
      return new DataFileReader<>(new ProxyFsInput(filePath, this.getFileSystem()),
          new GenericDatumReader<GenericRecord>());
    }
    return new DataFileReader<>(new FsInput(filePath, this.getFileSystem().getConf()),
        new GenericDatumReader<GenericRecord>());
  } catch (IOException e) {
    throw new FileBasedHelperException("Failed to open avro file " + file + " due to error " + e.getMessage(), e);
  }
}
/**
 * Verifies that a SortedKeyValueFile written with each named codec ("null",
 * "deflate", "snappy", "bzip2") records that codec name in its data file's
 * "avro.codec" metadata.
 */
@Test
public void testNamedCodecs() throws IOException {
  Configuration conf = new Configuration();
  Path myfile = new Path(mTempDir.getRoot().getPath(), "myfile");
  Schema key = Schema.create(Schema.Type.STRING);
  Schema value = Schema.create(Schema.Type.STRING);
  Schema recordSchema = AvroKeyValue.getSchema(key, value);
  DatumReader<GenericRecord> datumReader = SpecificData.get().createDatumReader(recordSchema);
  DataFileReader<GenericRecord> reader;
  SortedKeyValueFile.Writer.Options options = new SortedKeyValueFile.Writer.Options()
      .withKeySchema(key)
      .withValueSchema(value)
      .withConfiguration(conf)
      .withPath(myfile);
  SortedKeyValueFile.Writer<CharSequence, CharSequence> writer;
  for (String codec : new String[] { "null", "deflate", "snappy", "bzip2" }) {
    // Fixed log message: was "Using " + codec + "codec ..." (missing space).
    LOG.debug("Using " + codec + " codec for a SortedKeyValueFile...");
    options.withCodec(codec);
    writer = new SortedKeyValueFile.Writer<>(options);
    writer.close();
    reader = new DataFileReader<>(
        new FsInput(new Path(myfile, SortedKeyValueFile.DATA_FILENAME), conf), datumReader);
    assertEquals(codec, reader.getMetaString("avro.codec"));
    reader.close();
  }
}
public void testSplits() throws IOException { File file = makeFile(); DataFileReader<Object> reader = new DataFileReader<>(file, new GenericDatumReader<>()); Random rand = new Random(SEED); try { int splits = 10; // number of splits int length = (int)file.length(); // length of file int end = length; // end of split int remaining = end; // bytes remaining int count = 0; // count of entries while (remaining > 0) { int start = Math.max(0, end - rand.nextInt(2*length/splits)); reader.sync(start); // count entries in split while (!reader.pastSync(end)) { reader.next(); count++; } remaining -= end-start; end = start; } assertEquals(COUNT, count); } finally { reader.close(); } }
// NOTE(review): partial excerpt — the enclosing method and the catch block's
// closing braces are outside this view; left byte-identical. Also note the raw
// `DataFileReader r` (should be parameterized); not changed here because the
// surrounding context is not visible.
// Intent: read past a deliberately corrupted block, expect AvroRuntimeException
// with cause message "Invalid sync!", then recover by sync()ing back to the last
// good sync point and continue reading the remaining records.
DataFileReader r = new DataFileReader<>(file, new GenericDatumReader<>(schema)); assertEquals("apple", r.next().toString()); assertEquals("banana", r.next().toString()); long prevSync = r.previousSync(); try { r.next(); fail("Corrupt block should throw exception"); } catch (AvroRuntimeException e) { assertEquals("Invalid sync!", e.getCause().getMessage()); r.sync(prevSync); // go to sync point after previous successful one assertEquals("endive", r.next().toString()); assertEquals("fig", r.next().toString()); assertFalse(r.hasNext());
/**
 * Reads and returns the writer schema embedded in the given avro file, closing
 * the reader before returning (close failures are logged, not thrown).
 *
 * @param file path to the avro file
 * @return the schema stored in the file
 * @throws FileBasedHelperException if the file cannot be opened or read
 */
public Schema getAvroSchema(String file) throws FileBasedHelperException {
  DataFileReader<GenericRecord> reader = null;
  try {
    Path filePath = new Path(file);
    // When proxying as another user, the input stream must go through ProxyFsInput.
    if (this.getState().getPropAsBoolean(ConfigurationKeys.SHOULD_FS_PROXY_AS_USER,
        ConfigurationKeys.DEFAULT_SHOULD_FS_PROXY_AS_USER)) {
      reader = new DataFileReader<>(new ProxyFsInput(filePath, this.getFileSystem()),
          new GenericDatumReader<GenericRecord>());
    } else {
      reader = new DataFileReader<>(new FsInput(filePath, this.getFileSystem().getConf()),
          new GenericDatumReader<GenericRecord>());
    }
    return reader.getSchema();
  } catch (IOException e) {
    throw new FileBasedHelperException("Failed to open avro file " + file + " due to error " + e.getMessage(), e);
  } finally {
    // Deliberately log-and-continue: a close failure must not mask the schema result.
    if (reader != null) {
      try {
        reader.close();
      } catch (IOException e) {
        LOGGER.error("Failed to close avro file " + file, e);
      }
    }
  }
}
public void testReadWithHeader() throws IOException { File file = makeFile(); DataFileReader<Object> reader = new DataFileReader<>(file, new GenericDatumReader<>()); // get a header for this file DataFileStream.Header header = reader.getHeader(); // re-open to an arbitrary position near the middle, with sync == true SeekableFileInput sin = new SeekableFileInput(file); sin.seek(sin.length() / 2); reader = DataFileReader.openReader(sin, new GenericDatumReader<>(), header, true); assertNotNull("Should be able to reopen from arbitrary point", reader.next()); long validPos = reader.previousSync(); // post sync, we know of a valid sync point: re-open with seek (sync == false) sin.seek(validPos); reader = DataFileReader.openReader(sin, new GenericDatumReader<>(), header, false); assertEquals("Should not move from sync point on reopen", validPos, sin.tell()); assertNotNull("Should be able to reopen at sync point", reader.next()); }