/**
 * Creates an Avro container file reader from a seekable input stream.
 *
 * @param input The input containing the Avro container file.
 * @param datumReader The reader to use for the individual records in the Avro container file.
 * @return a {@link DataFileReader} positioned at the start of the container file.
 * @throws IOException If there is an error reading from the input stream.
 */
protected DataFileReader<T> createAvroFileReader(SeekableInput input, DatumReader<T> datumReader)
    throws IOException {
  DataFileReader<T> fileReader = new DataFileReader<>(input, datumReader);
  return fileReader;
}
}
/**
 * Extracts the writer schema from an Avro container file stored on a Hadoop file system.
 *
 * @param dataFile path of the Avro data file.
 * @param fs file system the path lives on; its configuration is used to open the file.
 * @return the schema recorded in the file's header.
 * @throws IOException if the file cannot be opened or its header cannot be read.
 */
public static Schema getSchemaFromDataFile(Path dataFile, FileSystem fs) throws IOException {
  // Both the seekable input and the reader are AutoCloseable; the
  // try-with-resources closes them in reverse order of declaration.
  try (SeekableInput seekableInput = new FsInput(dataFile, fs.getConf());
      DataFileReader<GenericRecord> dataFileReader =
          new DataFileReader<>(seekableInput, new GenericDatumReader<GenericRecord>())) {
    Schema writerSchema = dataFileReader.getSchema();
    return writerSchema;
  }
}
/**
 * Reads and returns the first datum in a data file.
 *
 * @param schema the reader schema to decode records with.
 * @param file path of the Avro container file.
 * @return the first datum in the file.
 * @throws IOException if the file cannot be opened or read.
 */
static Object datumFromFile(Schema schema, String file) throws IOException {
  // try-with-resources replaces the manual try/finally close and also closes
  // the reader if the constructor's header parsing succeeds but next() throws.
  try (DataFileReader<Object> in =
      new DataFileReader<>(new File(file), new GenericDatumReader<>(schema))) {
    return in.next();
  }
}
/**
 * Counts the records in an Avro container file.
 *
 * @param outFile the Avro file to scan.
 * @return the number of records in the file.
 * @throws IOException if the file cannot be opened or read.
 */
private int countRecords(File outFile) throws IOException {
  GenericDatumReader<Object> datumReader = new GenericDatumReader<>();
  try (DataFileReader<Object> avroReader = new DataFileReader<>(outFile, datumReader)) {
    int count = 0;
    // Each iteration decodes one record; only the count is of interest.
    for (Object ignored : avroReader) {
      count++;
    }
    return count;
  }
}
/**
 * Extracts the Avro schema for this table from the first data file under its HDFS location.
 *
 * @return the writer schema recorded in the data file's header.
 * @throws IOException if the data file cannot be located, opened, or read.
 */
private Schema getSchemaFromAvroDataFile() throws IOException {
  String firstDataFilePath = HdfsReader.getFirstDataFilePathInDir(this.dataLocationInHdfs);
  LOG.info("Extracting schema for table " + this.name + " from avro data file " + firstDataFilePath);
  // Manage the SeekableInput inside the try-with-resources as well: previously it
  // was created outside, so it leaked if the DataFileReader constructor threw
  // (e.g. on a corrupt header). Resources are closed in reverse declaration order.
  try (SeekableInput sin = new HdfsReader(firstDataFilePath).getFsInput();
      DataFileReader<Void> dfr = new DataFileReader<>(sin, new GenericDatumReader<Void>())) {
    return dfr.getSchema();
  }
}
/**
 * Initializes the reader state: opens an Avro container reader over the underlying
 * stream, captures the writer schema, and prepares the re-serialization machinery
 * (datum writer, binary encoder, and the CRC-64-AVRO schema fingerprint).
 *
 * @throws IOException if the container header cannot be read.
 * @throws NoSuchAlgorithmException if the CRC-64-AVRO fingerprint algorithm is unavailable.
 */
private void initialize() throws IOException, NoSuchAlgorithmException {
  SeekableResettableInputBridge in = new SeekableResettableInputBridge(ris);
  // Remember where the stream currently points, then rewind to the start so the
  // DataFileReader can parse the container header (magic, metadata, schema).
  long pos = in.tell();
  in.seek(0L);
  fileReader = new DataFileReader<GenericRecord>(in, new GenericDatumReader<GenericRecord>());
  // Re-position the reader at the first sync marker at or after the original offset,
  // so reading resumes where the stream was before the header parse.
  fileReader.sync(pos);
  schema = fileReader.getSchema();
  datumWriter = new GenericDatumWriter(schema);
  out = new ByteArrayOutputStream();
  // Passing the previous encoder lets the factory reuse its buffer if compatible.
  encoder = EncoderFactory.get().binaryEncoder(out, encoder);
  // Fingerprint identifies the schema compactly (e.g. for event headers downstream).
  schemaHash = SchemaNormalization.parsingFingerprint("CRC-64-AVRO", schema);
  schemaHashString = Hex.encodeHexString(schemaHash);
}
/**
 * Opens an Avro container file for the given log file path, using the schema
 * registered for the file's topic.
 *
 * @param logFilePath location (and offset) of the log file to read.
 * @param codec compression codec (unused here; kept for interface compatibility).
 * @throws IOException if the file cannot be opened as an Avro container file.
 */
public AvroFileReader(LogFilePath logFilePath, CompressionCodec codec) throws IOException {
  file = new File(logFilePath.getLogFilePath());
  file.getParentFile().mkdirs();
  String topic = logFilePath.getTopic();
  Schema schema = schemaRegistryClient.getSchema(topic);
  DatumReader datumReader = new SpecificDatumReader(schema);
  // The constructor already declares IOException; the previous code caught it and
  // re-wrapped it in an unchecked RuntimeException, hiding the declared contract
  // from callers. Let it propagate instead.
  reader = new DataFileReader(file, datumReader);
  writer = new SpecificDatumWriter(schema);
  offset = logFilePath.getOffset();
}
/**
 * Reads every datum from an Avro container file into a list.
 *
 * @param schema the reader schema used to construct the datum reader.
 * @param file the Avro file to read.
 * @return all records from the file, in file order.
 * @throws IOException if the file cannot be opened or read.
 */
private <D> List<D> read(Schema schema, File file) throws IOException {
  List<D> records = new ArrayList<>();
  try (FileReader<D> avroReader = new DataFileReader<>(file, newReader(schema))) {
    while (avroReader.hasNext()) {
      records.add(avroReader.next());
    }
  }
  return records;
}
/**
 * Asserts that the given Avro output file contains exactly three records whose
 * {@code name} fields are "Alyssa", "Ben", and "Charlie", in that order.
 *
 * @param outputAvroFile the file to verify.
 * @param schema the reader schema to decode records with.
 * @throws IOException if the file cannot be opened or read.
 */
public static void assertGenericRecords(File outputAvroFile, Schema schema) throws IOException {
  try (DataFileReader<GenericRecord> reader =
      new DataFileReader<>(outputAvroFile, new GenericDatumReader<GenericRecord>(schema))) {
    Iterator<GenericRecord> iterator = reader.iterator();
    // Expected names in file order; each next() decodes one record.
    for (String expectedName : new String[] {"Alyssa", "Ben", "Charlie"}) {
      Assert.assertEquals(iterator.next().get("name").toString(), expectedName);
    }
    Assert.assertFalse(iterator.hasNext());
  }
}
/**
 * Creates a record reader for one file split of an Avro container file, positioned
 * at the first record boundary at or after the split's start offset.
 *
 * @param job the job configuration; receives the input schema as a property.
 * @param split the file split to read.
 * @throws IOException if the file cannot be opened or the header cannot be read.
 */
public TetherRecordReader(JobConf job, FileSplit split) throws IOException {
  this.in = new FsInput(split.getPath(), job);
  this.reader = new DataFileReader<>(in, new GenericDatumReader<>());
  reader.sync(split.getStart()); // sync to start
  // After sync(), tell() gives the exact byte offset of the first block we own.
  this.start = in.tell();
  this.end = split.getStart() + split.getLength();
  // Publish the writer schema so downstream tasks can decode the records.
  job.set(AvroJob.INPUT_SCHEMA, reader.getSchema().toString());
}
/**
 * Reads and returns the first record of an Avro container file, or {@code null}
 * if the file contains no records.
 *
 * @param path filesystem path of the Avro file.
 * @return the first record, or {@code null} for an empty file.
 * @throws IOException if the file cannot be opened or read.
 */
private GenericRecord getRecordFromFile(String path) throws IOException {
  DatumReader<GenericRecord> reader = new GenericDatumReader<>();
  // Fixes two issues in the original: the DataFileReader was never closed
  // (file-handle leak), and a while-loop was used where a simple conditional
  // suffices (the loop always returned on its first iteration).
  try (DataFileReader<GenericRecord> dataFileReader =
      new DataFileReader<>(new File(path), reader)) {
    return dataFileReader.hasNext() ? dataFileReader.next() : null;
  }
}
/**
 * Reads and returns the first record of an Avro container file, or {@code null}
 * if the file contains no records.
 *
 * @param path filesystem path of the Avro file.
 * @return the first record, or {@code null} for an empty file.
 * @throws IOException if the file cannot be opened or read.
 */
private GenericRecord getRecordFromFile(String path) throws IOException {
  DatumReader<GenericRecord> reader = new GenericDatumReader<>();
  // try-with-resources: the original never closed the DataFileReader, leaking
  // the underlying file handle on every call.
  try (DataFileReader<GenericRecord> dataFileReader =
      new DataFileReader<>(new File(path), reader)) {
    if (dataFileReader.hasNext()) {
      return dataFileReader.next();
    }
    return null;
  }
}
/**
 * Loads the schema and the first record from classpath test resources and
 * refreshes {@code recordSchema}, {@code record}, and {@code accessor}.
 *
 * @param resourceName base name of the ".avsc" schema resource (and, by default,
 *     of the ".avro" data resource).
 * @param avroFileName explicit data-file resource name, or {@code null} to derive
 *     it as {@code resourceName + ".avro"}.
 * @throws IOException if either resource cannot be read.
 */
private void updateRecordFromTestResource(String resourceName, String avroFileName) throws IOException {
  if (avroFileName == null) {
    avroFileName = resourceName + ".avro";
  }
  recordSchema = new Schema.Parser().parse(
      getClass().getClassLoader().getResourceAsStream(resourceName + ".avsc"));
  DatumReader<GenericRecord> reader = new GenericDatumReader<>(recordSchema);
  // try-with-resources: the original never closed the DataFileReader,
  // leaking a file handle for every test resource loaded.
  try (DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(
      new File(getClass().getClassLoader().getResource(avroFileName).getPath()), reader)) {
    Assert.assertTrue(dataFileReader.hasNext());
    // Reuse the existing record instance as a decode target, if any.
    record = dataFileReader.next(record);
  }
  accessor = new AvroGenericRecordAccessor(record);
}
/**
 * Opens an in-memory Avro container file, extracts its header bytes and first
 * record, and hands them to the supplied preview processor.
 *
 * @param bits the raw bytes of the Avro container file.
 * @param processor callback that turns the header, first record, and block stats
 *     into a preview result.
 * @return whatever the processor produces.
 * @throws IOException if the bytes cannot be parsed as an Avro container file.
 * @throws RuntimeException if the file contains no records.
 */
static <T> T runOnPreview(byte[] bits, AvroPreviewProcessor<T> processor) throws IOException {
  DataFileReader<GenericRecord> reader = null;
  try {
    reader = new DataFileReader<>(
        new SeekableByteArrayInput(bits), new GenericDatumReader<GenericRecord>());
    // Everything before the first sync marker is the container-file header.
    int headerLen = (int) reader.previousSync();
    byte[] header = Arrays.copyOf(bits, headerLen);
    if (!reader.hasNext()) {
      throw new RuntimeException("Empty Avro file - cannot run preview! ");
    }
    GenericRecord firstRecord = reader.next();
    return processor.process(header, firstRecord, reader.getBlockCount(), reader.getBlockSize());
  } finally {
    if (reader != null) {
      try {
        reader.close();
      } catch (IOException ignored) {
        // Best-effort close over an in-memory buffer; deliberately swallowed so a
        // close failure never masks the preview result or the original exception.
      }
    }
  }
}
private Map<String, GenericRecord> getGenericRecordMap(byte[] data, Schema schema, String key) throws IOException { // create a reader for the merged contet DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(schema); SeekableByteArrayInput input = new SeekableByteArrayInput(data); DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(input, datumReader); // read all the records into a map to verify all the records are there Map<String,GenericRecord> records = new HashMap<>(); while (dataFileReader.hasNext()) { GenericRecord user = dataFileReader.next(); records.put(user.get(key).toString(), user); } return records; }
/**
 * Verifies that a file whose metadata happens to contain the sync-marker byte
 * sequence can still be read from offset 0 without misinterpreting the header.
 */
@Test
public void testSyncInHeader() throws IOException {
  // try-with-resources: the original never closed the reader, leaking a file handle.
  try (DataFileReader<Object> reader = new DataFileReader<>(
      new File("../../../share/test/data/syncInMeta.avro"), new GenericDatumReader<>())) {
    reader.sync(0);
    for (Object datum : reader) {
      assertNotNull(datum);
    }
  }
}
/**
 * Checks that the file at the given path is a readable Avro container file by
 * copying it to a local temp file and iterating over all of its records.
 *
 * @param path the (possibly remote) file to verify.
 * @throws IOException if the copy fails or the content is not valid Avro.
 */
private void fileIsGoodAvro(Path path) throws IOException {
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  // Copy the file to a local scratch location so DataFileReader can open it.
  try (FSDataInputStream in = fs.open(path, 0);
      FileOutputStream out = new FileOutputStream("target/FOO.avro")) {
    byte[] buffer = new byte[100];
    int bytesRead;
    // -1 is the only EOF signal; the original tested "> 0", which would silently
    // stop (and truncate the copy) on a legal zero-byte read.
    while ((bytesRead = in.read(buffer)) != -1) {
      out.write(buffer, 0, bytesRead);
    }
  }
  java.io.File file = new File("target/FOO.avro");
  // Decode every record; any corruption surfaces as an IOException/AvroRuntimeException.
  try (DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(file, datumReader)) {
    GenericRecord user = null;
    while (dataFileReader.hasNext()) {
      user = dataFileReader.next(user);
    }
  }
  file.delete();
}
}
public TestExtractor(WorkUnitState workUnitState) { //super(workUnitState); Schema schema = new Schema.Parser().parse(AVRO_SCHEMA); Path sourceFile = new Path(workUnitState.getWorkunit().getProp(TestSource.SOURCE_FILE_KEY)); LOG.info("Reading from source file " + sourceFile); DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>(schema); try { FileSystem fs = FileSystem .get(URI.create(workUnitState.getProp(ConfigurationKeys.FS_URI_KEY, ConfigurationKeys.LOCAL_FS_URI)), new Configuration()); fs.makeQualified(sourceFile); this.dataFileReader = new DataFileReader<GenericRecord>(new FsInput(sourceFile, new Configuration()), datumReader); } catch (IOException ioe) { LOG.error("Failed to read the source file " + sourceFile, ioe); } }
/**
 * Asserts that the repaired Avro file contains exactly the given records, in
 * order, compared via their {@code toString()} form.
 *
 * @param repairedFile the file to verify.
 * @param lines expected string renderings of the records, in file order.
 * @throws IOException if the file cannot be opened or read.
 */
private void checkFileContains(File repairedFile, String... lines) throws IOException {
  // Parameterized (was a raw type) and closed via try-with-resources
  // (was never closed, leaking a file handle per call).
  try (DataFileReader<Object> r =
      new DataFileReader<Object>(repairedFile, new GenericDatumReader<>(SCHEMA))) {
    for (String line : lines) {
      assertEquals(line, r.next().toString());
    }
    assertFalse(r.hasNext());
  }
}
@Test public void testGenericRecord() throws IOException { final Path outputPath = new Path(File.createTempFile("avro-output-file", "generic.avro").getAbsolutePath()); final AvroOutputFormat<GenericRecord> outputFormat = new AvroOutputFormat<>(outputPath, GenericRecord.class); Schema schema = new Schema.Parser().parse("{\"type\":\"record\", \"name\":\"user\", \"fields\": [{\"name\":\"user_name\", \"type\":\"string\"}, {\"name\":\"favorite_number\", \"type\":\"int\"}, {\"name\":\"favorite_color\", \"type\":\"string\"}]}"); outputFormat.setWriteMode(FileSystem.WriteMode.OVERWRITE); outputFormat.setSchema(schema); output(outputFormat, schema); GenericDatumReader<GenericRecord> reader = new GenericDatumReader<>(schema); DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(new File(outputPath.getPath()), reader); while (dataFileReader.hasNext()) { GenericRecord record = dataFileReader.next(); assertEquals(record.get("user_name").toString(), "testUser"); assertEquals(record.get("favorite_number"), 1); assertEquals(record.get("favorite_color").toString(), "blue"); } //cleanup FileSystem fs = FileSystem.getLocalFileSystem(); fs.delete(outputPath, false); }