/**
 * Returns a {@link DataFileReader} for the specified avro file.
 * <p>
 * Note: It is the caller's responsibility to close the returned {@link DataFileReader}.
 * </p>
 *
 * @param file The path to the avro file to open.
 * @return A {@link DataFileReader} for the specified avro file.
 * @throws FileBasedHelperException if there is a problem opening the {@link InputStream} for the specified file.
 */
public DataFileReader<GenericRecord> getAvroFile(String file) throws FileBasedHelperException {
  try {
    if (!this.getFileSystem().exists(new Path(file))) {
      LOGGER.warn(file + " does not exist.");
      return null;
    }
    if (this.getState().getPropAsBoolean(ConfigurationKeys.SHOULD_FS_PROXY_AS_USER,
        ConfigurationKeys.DEFAULT_SHOULD_FS_PROXY_AS_USER)) {
      return new DataFileReader<>(new ProxyFsInput(new Path(file), this.getFileSystem()),
          new GenericDatumReader<GenericRecord>());
    }
    return new DataFileReader<>(new FsInput(new Path(file), this.getFileSystem().getConf()),
        new GenericDatumReader<GenericRecord>());
  } catch (IOException e) {
    throw new FileBasedHelperException("Failed to open avro file " + file + " due to error " + e.getMessage(), e);
  }
}
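Since the contract above leaves closing to the caller, a call site would typically wrap the returned reader in try-with-resources and guard against the null returned for a missing file. A minimal sketch; the fsHelper variable and the file path are assumptions for illustration:

// Hypothetical call site; fsHelper and the path are not part of the snippet above.
try (DataFileReader<GenericRecord> reader = fsHelper.getAvroFile("/data/input/part-0.avro")) {
  // The helper returns null when the file does not exist.
  if (reader != null) {
    while (reader.hasNext()) {
      GenericRecord record = reader.next();
      // process the record
    }
  }
}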
public TetherRecordReader(JobConf job, FileSplit split) throws IOException {
  this.in = new FsInput(split.getPath(), job);
  this.reader = new DataFileReader<>(in, new GenericDatumReader<>());
  reader.sync(split.getStart()); // sync to the first block at or after the split start
  this.start = in.tell();
  this.end = split.getStart() + split.getLength();
  job.set(AvroJob.INPUT_SCHEMA, reader.getSchema().toString());
}
@After
public void tearDown() throws Exception {
  if (fsInput != null) {
    fsInput.close();
  }
}
@Test
public void testConfigurationConstructor() throws Exception {
  try (FsInput in = new FsInput(new Path(file.getPath()), conf)) {
    int expectedByteCount = 1;
    byte[] readBytes = new byte[expectedByteCount];
    int actualByteCount = in.read(readBytes, 0, expectedByteCount);
    assertThat(actualByteCount, is(equalTo(expectedByteCount)));
  }
}
DatumReader<GenericRecord> datumReader = model.createDatumReader(
    AvroKeyValue.getSchema(keySchema, Schema.create(Schema.Type.LONG)));
DataFileReader<GenericRecord> fileReader = new DataFileReader<>(
    new FsInput(path, conf), datumReader);
fileReader.close();
/**
 * Get Avro schema from an Avro data file.
 */
public static Schema getSchemaFromDataFile(Path dataFile, FileSystem fs) throws IOException {
  try (SeekableInput sin = new FsInput(dataFile, fs.getConf());
      DataFileReader<GenericRecord> reader =
          new DataFileReader<>(sin, new GenericDatumReader<GenericRecord>())) {
    return reader.getSchema();
  }
}
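A hedged call sketch for the schema helper above; the enclosing class name SchemaUtils, the Configuration, and the path are assumptions for illustration:

// Hypothetical caller; SchemaUtils is an assumed name for the class holding getSchemaFromDataFile.
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
Schema schema = SchemaUtils.getSchemaFromDataFile(new Path("/data/input/part-0.avro"), fs);
System.out.println(schema.toString(true));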
conf.set("fs.default.name", "file:///"); Path avroFile = new Path("target/temp.avro"); DataFileReader<GenericData.Record> dataFileReader = new DataFileReader<>(new FsInput(avroFile, conf), new SpecificDatumReader<>()); dataFileReader.seek(positionTwo); assertTrue(dataFileReader.hasNext()); // Record 2. assertEquals(2, dataFileReader.next());
public AvroAsTextRecordReader(JobConf job, FileSplit split) throws IOException {
  this(DataFileReader.openReader(new FsInput(split.getPath(), job), new GenericDatumReader<>()), split);
}
public Schema getAvroSchema(String file) throws FileBasedHelperException {
  DataFileReader<GenericRecord> dfr = null;
  try {
    if (this.getState().getPropAsBoolean(ConfigurationKeys.SHOULD_FS_PROXY_AS_USER,
        ConfigurationKeys.DEFAULT_SHOULD_FS_PROXY_AS_USER)) {
      dfr = new DataFileReader<>(new ProxyFsInput(new Path(file), this.getFileSystem()),
          new GenericDatumReader<GenericRecord>());
    } else {
      dfr = new DataFileReader<>(new FsInput(new Path(file), this.getFileSystem().getConf()),
          new GenericDatumReader<GenericRecord>());
    }
    return dfr.getSchema();
  } catch (IOException e) {
    throw new FileBasedHelperException("Failed to open avro file " + file + " due to error " + e.getMessage(), e);
  } finally {
    if (dfr != null) {
      try {
        dfr.close();
      } catch (IOException e) {
        LOGGER.error("Failed to close avro file " + file, e);
      }
    }
  }
}
@Test
public void testNamedCodecs() throws IOException {
  Configuration conf = new Configuration();
  Path myfile = new Path(mTempDir.getRoot().getPath(), "myfile");
  Schema key = Schema.create(Schema.Type.STRING);
  Schema value = Schema.create(Schema.Type.STRING);
  Schema recordSchema = AvroKeyValue.getSchema(key, value);
  DatumReader<GenericRecord> datumReader = SpecificData.get().createDatumReader(recordSchema);
  DataFileReader<GenericRecord> reader;

  SortedKeyValueFile.Writer.Options options = new SortedKeyValueFile.Writer.Options()
      .withKeySchema(key)
      .withValueSchema(value)
      .withConfiguration(conf)
      .withPath(myfile);

  SortedKeyValueFile.Writer<CharSequence, CharSequence> writer;

  for (String codec : new String[] { "null", "deflate", "snappy", "bzip2" }) {
    LOG.debug("Using " + codec + " codec for a SortedKeyValueFile...");
    options.withCodec(codec);

    writer = new SortedKeyValueFile.Writer<>(options);
    writer.close();

    reader = new DataFileReader<>(
        new FsInput(new Path(myfile, SortedKeyValueFile.DATA_FILENAME), conf), datumReader);
    assertEquals(codec, reader.getMetaString("avro.codec"));
    reader.close();
  }
}
@Override
public void setup(final OutputMutator output) throws ExecutionSetupException {
  writer = new VectorContainerWriter(output);
  try {
    reader = new DataFileReader<>(new FsInput(hadoop, fsConf), new GenericDatumReader<GenericContainer>());
    logger.debug("Processing file : {}, start position : {}, end position : {} ", hadoop, start, end);
    reader.sync(this.start);
  } catch (IOException e) {
    throw new ExecutionSetupException(e);
  }
}
/**
 * Creates a seekable input stream to an Avro container file.
 *
 * @param conf The hadoop configuration.
 * @param path The path to the avro container file.
 * @throws IOException If there is an error reading from the path.
 */
protected SeekableInput createSeekableInput(Configuration conf, Path path) throws IOException {
  return new FsInput(path, conf);
}
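The SeekableInput produced above is what DataFileReader ultimately consumes. A minimal sketch of that hand-off, assuming a caller with access to createSeekableInput and an illustrative path:

// Hypothetical caller; the Configuration and path are assumptions for illustration.
SeekableInput input = createSeekableInput(new Configuration(), new Path("/data/input.avro"));
try (FileReader<GenericRecord> fileReader =
    DataFileReader.openReader(input, new GenericDatumReader<GenericRecord>())) {
  for (GenericRecord record : fileReader) {
    // process each record in the container file
  }
}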
public long getPos() throws IOException { return in.tell(); }
FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/part-*"));
Assert.assertEquals(1, outputFiles.length);
DataFileReader<TextStats> reader = new DataFileReader<>(
    new FsInput(outputFiles[0].getPath(), job.getConfiguration()),
    new SpecificDatumReader<>());
Map<String, Integer> counts = new HashMap<>();
for (TextStats record : reader) {
  counts.put(record.name.toString(), record.count);
}
reader.close();
/**
 * Get the latest avro schema for a directory.
 *
 * @param directory the input dir that contains avro files
 * @param fs the {@link FileSystem} for the given directory
 * @param latest true to return the latest schema, false to return the oldest schema
 * @return the latest/oldest schema in the directory
 * @throws IOException
 */
public static Schema getDirectorySchema(Path directory, FileSystem fs, boolean latest) throws IOException {
  Schema schema = null;
  try (Closer closer = Closer.create()) {
    List<FileStatus> files = getDirectorySchemaHelper(directory, fs);
    if (files == null || files.size() == 0) {
      LOG.warn("There is no previous avro file in the directory: " + directory);
    } else {
      FileStatus file = latest ? files.get(0) : files.get(files.size() - 1);
      LOG.debug("Path to get the avro schema: " + file);
      FsInput fi = new FsInput(file.getPath(), fs.getConf());
      GenericDatumReader<GenericRecord> genReader = new GenericDatumReader<>();
      schema = closer.register(new DataFileReader<>(fi, genReader)).getSchema();
    }
  } catch (IOException ioe) {
    throw new IOException("Cannot get the schema for directory " + directory, ioe);
  }
  return schema;
}
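A hedged call sketch for the directory helper above; the utility class name AvroUtils and the directory path are assumptions for illustration:

// Hypothetical caller; AvroUtils is an assumed name for the class holding getDirectorySchema.
FileSystem fs = FileSystem.get(new Configuration());
Schema latestSchema = AvroUtils.getDirectorySchema(new Path("/data/events/hourly"), fs, true);
if (latestSchema != null) {
  System.out.println("Latest schema in directory: " + latestSchema.getFullName());
}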
conf.set("fs.default.name", "file:///"); Path avroFile = new Path("target/temp.avro"); DataFileReader<GenericData.Record> avroFileReader = new DataFileReader<>(new FsInput(avroFile, conf), new SpecificDatumReader<>()); avroFileReader.seek(pointTwo); assertTrue(avroFileReader.hasNext()); AvroKeyValue<CharSequence, TextStats> secondRecord = new AvroKeyValue<>(avroFileReader.next());
public AvroRecordReader(JobConf job, FileSplit split) throws IOException {
  this(DataFileReader.openReader(new FsInput(split.getPath(), job),
      AvroJob.createInputDataModel(job).createDatumReader(AvroJob.getInputSchema(job))), split);
}
DataFileReader<GenericRecord> reader = null;
try {
  reader = new DataFileReader<>(new FsInput(path, conf), new GenericDatumReader<GenericRecord>());
  return reader.getSchema();
} catch (IOException e) {
  throw new RuntimeException("Error reading schema from path: " + path, e);
} finally {
  if (reader != null) {
    try {
      reader.close();
    } catch (IOException e) {
      // ignore failures while closing the reader
    }
  }
}
@Test
public void testDeflateClassCodec() throws IOException {
  Configuration conf = new Configuration();
  Path myfile = new Path(mTempDir.getRoot().getPath(), "myfile");
  Schema key = Schema.create(Schema.Type.STRING);
  Schema value = Schema.create(Schema.Type.STRING);
  Schema recordSchema = AvroKeyValue.getSchema(key, value);
  DatumReader<GenericRecord> datumReader = SpecificData.get().createDatumReader(recordSchema);
  DataFileReader<GenericRecord> reader;

  LOG.debug("Using CodecFactory.deflateCodec() for a SortedKeyValueFile...");
  SortedKeyValueFile.Writer.Options options = new SortedKeyValueFile.Writer.Options()
      .withKeySchema(key)
      .withValueSchema(value)
      .withConfiguration(conf)
      .withPath(myfile)
      .withCodec(CodecFactory.deflateCodec(9));

  SortedKeyValueFile.Writer<CharSequence, CharSequence> writer = new SortedKeyValueFile.Writer<>(options);
  writer.close();

  reader = new DataFileReader<>(
      new FsInput(new Path(myfile, SortedKeyValueFile.DATA_FILENAME), conf), datumReader);
  assertEquals("deflate", reader.getMetaString("avro.codec"));
  reader.close();
}