public Schema getSchema() { return reader.getSchema(); }
/** Get Avro schema from an Avro data file. */
public static Schema getSchemaFromDataFile(Path dataFile, FileSystem fs) throws IOException {
  try (SeekableInput sin = new FsInput(dataFile, fs.getConf());
      DataFileReader<GenericRecord> reader =
          new DataFileReader<>(sin, new GenericDatumReader<GenericRecord>())) {
    return reader.getSchema();
  }
}
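For a quick check outside HDFS, the same header read works against a local file through DataFileReader's File constructor. A minimal, self-contained sketch; the file name local.avro and the class name are illustrative assumptions, not taken from the snippets above:

import java.io.File;
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class PrintLocalAvroSchema {
  public static void main(String[] args) throws IOException {
    // Hypothetical local Avro container file.
    File avroFile = new File("local.avro");
    try (DataFileReader<GenericRecord> reader =
        new DataFileReader<>(avroFile, new GenericDatumReader<GenericRecord>())) {
      // The schema lives in the file header, so no records need to be read.
      Schema schema = reader.getSchema();
      System.out.println(schema.toString(true));
    }
  }
}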
@Override
public int run(InputStream stdin, PrintStream out, PrintStream err, List<String> args) throws Exception {
  if (args.size() != 1) {
    err.println("Expected 1 argument: input_file");
    return 1;
  }
  DataFileReader<Void> reader =
      new DataFileReader<>(Util.openSeekableFromFS(args.get(0)), new GenericDatumReader<>());
  out.println(reader.getSchema().toString(true));
  return 0;
}
}
/**
 * Get the latest avro schema for a directory
 * @param directory the input dir that contains avro files
 * @param fs the {@link FileSystem} for the given directory.
 * @param latest true to return latest schema, false to return oldest schema
 * @return the latest/oldest schema in the directory
 * @throws IOException
 */
public static Schema getDirectorySchema(Path directory, FileSystem fs, boolean latest) throws IOException {
  Schema schema = null;
  try (Closer closer = Closer.create()) {
    List<FileStatus> files = getDirectorySchemaHelper(directory, fs);
    if (files == null || files.size() == 0) {
      LOG.warn("There is no previous avro file in the directory: " + directory);
    } else {
      FileStatus file = latest ? files.get(0) : files.get(files.size() - 1);
      LOG.debug("Path to get the avro schema: " + file);
      FsInput fi = new FsInput(file.getPath(), fs.getConf());
      GenericDatumReader<GenericRecord> genReader = new GenericDatumReader<>();
      schema = closer.register(new DataFileReader<>(fi, genReader)).getSchema();
    }
  } catch (IOException ioe) {
    throw new IOException("Cannot get the schema for directory " + directory, ioe);
  }
  return schema;
}
private Schema getSchemaFromAvroDataFile() throws IOException {
  String firstDataFilePath = HdfsReader.getFirstDataFilePathInDir(this.dataLocationInHdfs);
  LOG.info("Extracting schema for table " + this.name + " from avro data file " + firstDataFilePath);
  SeekableInput sin = new HdfsReader(firstDataFilePath).getFsInput();
  try (DataFileReader<Void> dfr = new DataFileReader<>(sin, new GenericDatumReader<Void>())) {
    Schema schema = dfr.getSchema();
    return schema;
  }
}
private void initialize() throws IOException, NoSuchAlgorithmException {
  SeekableResettableInputBridge in = new SeekableResettableInputBridge(ris);
  // Remember the current position, rewind to read the container header, then resync.
  long pos = in.tell();
  in.seek(0L);
  fileReader = new DataFileReader<GenericRecord>(in, new GenericDatumReader<GenericRecord>());
  fileReader.sync(pos);
  schema = fileReader.getSchema();
  datumWriter = new GenericDatumWriter(schema);
  out = new ByteArrayOutputStream();
  encoder = EncoderFactory.get().binaryEncoder(out, encoder);
  // 64-bit CRC fingerprint of the schema, hex-encoded below.
  schemaHash = SchemaNormalization.parsingFingerprint("CRC-64-AVRO", schema);
  schemaHashString = Hex.encodeHexString(schemaHash);
}
private DataFileReader<E> initReader(FileInputSplit split) throws IOException {
  DatumReader<E> datumReader;
  if (org.apache.avro.generic.GenericRecord.class == avroValueType) {
    datumReader = new GenericDatumReader<E>();
  } else {
    datumReader = org.apache.avro.specific.SpecificRecordBase.class.isAssignableFrom(avroValueType)
        ? new SpecificDatumReader<E>(avroValueType)
        : new ReflectDatumReader<E>(avroValueType);
  }
  if (LOG.isInfoEnabled()) {
    LOG.info("Opening split {}", split);
  }
  SeekableInput in = new FSDataInputStreamWrapper(stream,
      split.getPath().getFileSystem().getFileStatus(split.getPath()).getLen());
  DataFileReader<E> dataFileReader = (DataFileReader) DataFileReader.openReader(in, datumReader);
  if (LOG.isDebugEnabled()) {
    LOG.debug("Loaded SCHEMA: {}", dataFileReader.getSchema());
  }
  end = split.getStart() + split.getLength();
  recordsReadSinceLastSync = 0;
  return dataFileReader;
}
public TetherRecordReader(JobConf job, FileSplit split) throws IOException {
  this.in = new FsInput(split.getPath(), job);
  this.reader = new DataFileReader<>(in, new GenericDatumReader<>());
  reader.sync(split.getStart()); // sync to start
  this.start = in.tell();
  this.end = split.getStart() + split.getLength();
  job.set(AvroJob.INPUT_SCHEMA, reader.getSchema().toString());
}
public Schema getAvroSchema(String file) throws FileBasedHelperException {
  DataFileReader<GenericRecord> dfr = null;
  try {
    if (this.getState().getPropAsBoolean(ConfigurationKeys.SHOULD_FS_PROXY_AS_USER,
        ConfigurationKeys.DEFAULT_SHOULD_FS_PROXY_AS_USER)) {
      dfr = new DataFileReader<>(new ProxyFsInput(new Path(file), this.getFileSystem()),
          new GenericDatumReader<GenericRecord>());
    } else {
      dfr = new DataFileReader<>(new FsInput(new Path(file), this.getFileSystem().getConf()),
          new GenericDatumReader<GenericRecord>());
    }
    return dfr.getSchema();
  } catch (IOException e) {
    throw new FileBasedHelperException("Failed to open avro file " + file + " due to error " + e.getMessage(), e);
  } finally {
    if (dfr != null) {
      try {
        dfr.close();
      } catch (IOException e) {
        LOGGER.error("Failed to close avro file " + file, e);
      }
    }
  }
}
/**
 * Open a writer appending to an existing file.
 * <strong>Since 1.9.0 this method does not close in.</strong>
 * @param in reading the existing file.
 * @param out positioned at the end of the existing file.
 */
public DataFileWriter<D> appendTo(SeekableInput in, OutputStream out) throws IOException {
  assertNotOpen();
  DataFileReader<D> reader = new DataFileReader<>(in, new GenericDatumReader<>());
  this.schema = reader.getSchema();
  this.sync = reader.getHeader().sync;
  this.meta.putAll(reader.getHeader().meta);
  byte[] codecBytes = this.meta.get(DataFileConstants.CODEC);
  if (codecBytes != null) {
    String strCodec = new String(codecBytes, "UTF-8");
    this.codec = CodecFactory.fromString(strCodec).createInstance();
  } else {
    this.codec = CodecFactory.nullCodec().createInstance();
  }
  init(out);
  return this;
}
    reader);
try {
  Schema schema = fileReader.getSchema();
  String codecStr = fileReader.getMetaString(DataFileConstants.CODEC);
  CodecFactory codecFactory = CodecFactory.fromString("" + codecStr);
/**
 * Open a writer appending to an existing file.
 * @param in reading the existing file.
 * @param out positioned at the end of the existing file.
 */
public DataFileWriter<D> appendTo(SeekableInput in, OutputStream out) throws IOException {
  assertNotOpen();
  DataFileReader<D> reader = new DataFileReader<D>(in, new GenericDatumReader<D>());
  this.schema = reader.getSchema();
  this.sync = reader.getHeader().sync;
  this.meta.putAll(reader.getHeader().meta);
  byte[] codecBytes = this.meta.get(DataFileConstants.CODEC);
  if (codecBytes != null) {
    String strCodec = new String(codecBytes, "UTF-8");
    this.codec = CodecFactory.fromString(strCodec).createInstance();
  } else {
    this.codec = CodecFactory.nullCodec().createInstance();
  }
  reader.close();
  init(out);
  return this;
}
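The two appendTo variants above are the writer-internal half of append support: they read the existing header (schema, sync marker, codec) through DataFileReader before any new blocks are written. From the caller's side the usual entry point is DataFileWriter.appendTo. A minimal sketch, assuming an existing, valid container file named existing.avro whose schema matches the records being appended; the file name and class name are assumptions:

import java.io.File;
import java.io.IOException;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

public class AppendToExisting {
  public static void main(String[] args) throws IOException {
    File target = new File("existing.avro"); // hypothetical pre-existing Avro container file
    try (DataFileWriter<GenericRecord> writer =
        new DataFileWriter<>(new GenericDatumWriter<GenericRecord>())) {
      // appendTo reads the header (schema, sync marker, codec) from the existing file
      // and positions the writer at its end, so the schema never has to be restated.
      writer.appendTo(target);
      // writer.append(record); // appended records must conform to the schema read from the header
    }
  }
}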
GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
try (DataFileReader<GenericRecord> fileReader = new DataFileReader<>(schemaAvroFile, datumReader)) {
  Schema schema = fileReader.getSchema();
  for (Schema.Field field : schema.getFields()) {
    String fieldName = field.name();
/**
 * Test that non-string map-keys are readable through GenericDatumReader.
 * This method should read as array of {key, value} and not as a map.
 */
private <T> List<GenericRecord> testGenericDatumRead(String testType, byte[] bytes, T... entityObjs)
    throws IOException {
  GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  SeekableByteArrayInput avroInputStream = new SeekableByteArrayInput(bytes);
  DataFileReader<GenericRecord> fileReader = new DataFileReader<>(avroInputStream, datumReader);
  Schema schema = fileReader.getSchema();
  assertNotNull("Unable to get schema for " + testType, schema);
  GenericRecord record = null;
  List<GenericRecord> records = new ArrayList<>();
  while (fileReader.hasNext()) {
    records.add(fileReader.next(record));
  }
  return records;
}
fakeHeader = new DataFileReader<>(new SeekableByteArrayInput(this.header), datumReader).getHeader();
dataFileReader = DataFileReader.openReader(sbai, datumReader, fakeHeader, true);
Schema schema = dataFileReader.getSchema();
GenericRecord gr = new GenericData.Record(schema);
Schema.Field[] flatSchema = flatSchema(schema);
/**
 * Test that non-string map-keys are readable through ReflectDatumReader.
 * This method should form the original map and should not return any
 * array of {key, value} as done by {@link #testGenericDatumRead()}.
 */
private <T> List<T> testReflectDatumRead(String testType, byte[] bytes, T... entityObjs) throws IOException {
  ReflectDatumReader<T> datumReader = new ReflectDatumReader<>();
  SeekableByteArrayInput avroInputStream = new SeekableByteArrayInput(bytes);
  DataFileReader<T> fileReader = new DataFileReader<>(avroInputStream, datumReader);
  Schema schema = fileReader.getSchema();
  T record = null;
  List<T> records = new ArrayList<>();
  while (fileReader.hasNext()) {
    records.add(fileReader.next(record));
  }
  return records;
}
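The two read helpers above consume a byte[] produced elsewhere in the test. A plausible way to build such a fixture is a reflect-based write round trip to an in-memory container file. A minimal sketch, where the Company class, its Integer-keyed map, and the method name are illustrative assumptions rather than the original fixture:

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.reflect.ReflectData;
import org.apache.avro.reflect.ReflectDatumWriter;

public class NonStringMapKeyFixture {
  /** Hypothetical entity with a non-string map key. */
  public static class Company {
    Map<Integer, String> employees = new HashMap<>();
  }

  static byte[] writeCompany() throws IOException {
    Company company = new Company();
    company.employees.put(1, "alice");

    // ReflectData is expected to model the Integer-keyed map as an array of {key, value} records.
    Schema schema = ReflectData.get().getSchema(Company.class);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    try (DataFileWriter<Company> writer = new DataFileWriter<>(new ReflectDatumWriter<Company>(schema))) {
      writer.create(schema, out);
      writer.append(company);
    }
    // These bytes can then be handed to testGenericDatumRead / testReflectDatumRead.
    return out.toByteArray();
  }
}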
assertEquals(schema, fileReader.getSchema());
String codecStr = fileReader.getMetaString("avro.codec");
if (null == codecStr) {
private Schema getNewestSchemaFromSource(Path sourceDir, FileSystem fs) throws IOException {
  FileStatus[] files = fs.listStatus(sourceDir);
  Arrays.sort(files, new ReverseLastModifiedComparitor());
  for (FileStatus f : files) {
    if (f.isDir()) {
      Schema schema = getNewestSchemaFromSource(f.getPath(), fs);
      if (schema != null) {
        return schema;
      }
    } else if (f.getPath().getName().endsWith(".avro")) {
      FsInput fi = new FsInput(f.getPath(), fs.getConf());
      GenericDatumReader<GenericRecord> genReader = new GenericDatumReader<GenericRecord>();
      // Close the reader once the schema has been read instead of leaking it.
      try (DataFileReader<GenericRecord> reader = new DataFileReader<GenericRecord>(fi, genReader)) {
        return reader.getSchema();
      }
    }
  }
  return null;
}
public TetherRecordReader(JobConf job, FileSplit split) throws IOException {
  this.in = new FsInput(split.getPath(), job);
  this.reader = new DataFileReader<Object>(in, new GenericDatumReader<Object>());
  reader.sync(split.getStart()); // sync to start
  this.start = in.tell();
  this.end = split.getStart() + split.getLength();
  job.set(AvroJob.INPUT_SCHEMA, reader.getSchema().toString());
}