@Override
public void init(final DataFileStream<GenericRecord> reader, final String codec, final OutputStream out) throws IOException {
    // Raw binary output: records are re-encoded without an Avro container header,
    // so only a binary encoder and a datum writer for the source schema are needed.
    // (The codec argument is irrelevant for bare binary encoding.)
    encoder = EncoderFactory.get().binaryEncoder(out, null);
    writer = new GenericDatumWriter<>(reader.getSchema());
}
@Override
public RecordSchema getSchema(Map<String, String> variables, final InputStream contentStream, final RecordSchema readSchema)
        throws SchemaNotFoundException, IOException {
    // Only the container-file header is consumed here; the writer schema embedded in it
    // is converted to a RecordSchema.
    // NOTE(review): the DataFileStream is intentionally not closed — closing it would
    // also close the caller-supplied contentStream; confirm callers rely on this.
    final DataFileStream<GenericRecord> avroStream =
            new DataFileStream<>(contentStream, new GenericDatumReader<GenericRecord>());
    return AvroTypeUtil.createSchema(avroStream.getSchema());
}
private void nextInput() throws IOException{ currentInput++; Path path = inFiles.get(currentInput); FSDataInputStream input = new FSDataInputStream(Util.openFromFS(path)); reader = new DataFileStream<>(input, new GenericDatumReader<>()); if (schema == null) { // if this is the first file, the schema gets saved schema = reader.getSchema(); } else if (!schema.equals(reader.getSchema())) { // subsequent files have to have equal schemas throw new IOException("schemas dont match"); } }
/**
 * Given an Avro data file, map from column to field type and time unit, return the
 * equivalent Pinot schema.
 *
 * @param avroDataFile Avro data file to inspect
 * @param fieldTypeMap map from column name to field type, may be null
 * @param timeUnit time unit, may be null
 * @return the Pinot schema derived from the file's embedded Avro schema
 * @throws IOException if the Avro file cannot be opened or read
 */
public static Schema getPinotSchemaFromAvroDataFile(@Nonnull File avroDataFile,
        @Nullable Map<String, FieldSpec.FieldType> fieldTypeMap, @Nullable TimeUnit timeUnit) throws IOException {
    // The reader is only needed long enough to pull out the embedded writer schema.
    try (DataFileStream<GenericRecord> reader = getAvroReader(avroDataFile)) {
        return getPinotSchemaFromAvroSchema(reader.getSchema(), fieldTypeMap, timeUnit);
    }
}
/** * Retrieve schema from the given bytes * * @return the retrieved {@link Schema schema} * */ private Schema retrieveSchemaFromBytes(byte[] data) { ByteArrayInputStream bais = new ByteArrayInputStream(data); DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(); Schema schema = null; try { // dfs is AutoCloseable @SuppressWarnings("resource") DataFileStream<GenericRecord> dfs = new DataFileStream<GenericRecord>(bais, reader); schema = dfs.getSchema(); } catch (IOException ioe) { throw new AvroObjectInspectorException("An error occurred retrieving schema from bytes", ioe); } return schema; }
public void init() throws FileNotFoundException, IOException {
    // Open the Avro container file and capture its embedded writer schema.
    // NOTE(review): dataStream (and the underlying FileInputStream) is kept open
    // for later record reads — presumably closed elsewhere; confirm.
    final FileInputStream fileInput = new FileInputStream(avroFile);
    dataStream = new DataFileStream<GenericRecord>(fileInput, new GenericDatumReader<GenericRecord>());
    schema = dataStream.getSchema();
}
/**
 * Returns the top-level column (field) names declared in the given Avro file's schema.
 *
 * @param avro Avro container file to inspect
 * @return field names in schema declaration order
 * @throws IOException if the file cannot be opened or is not a valid Avro file
 */
public static List<String> getColumnNamesFromAvro(File avro) throws IOException {
    List<String> ret = new ArrayList<String>();
    // try-with-resources: the original leaked the FileInputStream/DataFileStream.
    try (DataFileStream<GenericRecord> dataStream = new DataFileStream<GenericRecord>(
            new FileInputStream(avro), new GenericDatumReader<GenericRecord>())) {
        for (final Field field : dataStream.getSchema().getFields()) {
            ret.add(field.name());
        }
    }
    return ret;
}
// Writes a complete Avro container-file header (codec + carried-over metadata + schema)
// into the FlowFile via session.append, and returns the header bytes so that record
// blocks appended to the same FlowFile later still form a valid Avro file.
// NOTE(review): writer.close() inside the append callback only finalizes the header
// stream; the caller is expected to manage the writer's overall lifecycle — confirm.
private byte[] initAvroWriter(ProcessSession session, String codec, DataFileStream<GenericRecord> reader,
        DataFileWriter<GenericRecord> writer, AtomicReference<FlowFile> flowFileRef) {
    writer.setCodec(CodecFactory.fromString(codec));
    // Transfer metadata (this is a subset of the incoming file)
    for (String metaKey : reader.getMetaKeys()) {
        if (!RESERVED_METADATA.contains(metaKey)) {
            writer.setMeta(metaKey, reader.getMeta(metaKey));
        }
    }
    final ByteArrayOutputStream avroHeader = new ByteArrayOutputStream();
    flowFileRef.set(session.append(flowFileRef.get(), (out) -> {
        // Create writer so that records can be appended later.
        writer.create(reader.getSchema(), avroHeader);
        writer.close();
        final byte[] header = avroHeader.toByteArray();
        out.write(header);
    }));
    // Capture the Avro header byte array that is just written to the FlowFile.
    // This is needed when Avro records are appended to the same FlowFile.
    return avroHeader.toByteArray();
}
@Override
public void init(final DataFileStream<GenericRecord> reader, final String codec, final OutputStream out) throws IOException {
    writer = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>());
    writer.setCodec(CodecFactory.fromString(codec));
    if (transferMetadata) {
        // Carry over user metadata from the source file, skipping Avro's reserved keys.
        for (final String metaKey : reader.getMetaKeys()) {
            if (RESERVED_METADATA.contains(metaKey)) {
                continue;
            }
            writer.setMeta(metaKey, reader.getMeta(metaKey));
        }
    }
    // Start a new container file on the output stream using the source file's schema.
    writer.create(reader.getSchema(), out);
}
public AvroReaderWithEmbeddedSchema(final InputStream in) throws IOException {
    // The writer schema travels in the container-file header; lift it out once
    // and convert it to the record-schema representation.
    this.in = in;
    this.dataFileStream = new DataFileStream<>(in, new NonCachingDatumReader<>());
    this.avroSchema = this.dataFileStream.getSchema();
    this.recordSchema = AvroTypeUtil.createSchema(this.avroSchema);
}
/**
 * Checks the Pinot schema against the Avro schema of the open reader.
 * A field missing from the Avro schema or a data-type mismatch is only logged;
 * a single-value vs multi-value mismatch throws {@link IllegalStateException}.
 */
private void validateSchema() {
    org.apache.avro.Schema avroSchema = _avroReader.getSchema();
    for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
        String fieldName = fieldSpec.getName();
        Field avroField = avroSchema.getField(fieldName);
        if (avroField == null) {
            // Field absent from the Avro data: warn and move on.
            LOGGER.warn("Pinot field: {} does not exist in Avro Schema", fieldName);
            continue;
        }
        boolean isPinotFieldSingleValue = fieldSpec.isSingleValueField();
        if (isPinotFieldSingleValue != AvroUtils.isSingleValueField(avroField)) {
            // Cardinality disagreement is fatal.
            String errorMessage = "Pinot field: " + fieldName + " is "
                    + (isPinotFieldSingleValue ? "Single" : "Multi")
                    + "-valued in Pinot schema but not in Avro schema";
            LOGGER.error(errorMessage);
            throw new IllegalStateException(errorMessage);
        }
        DataType pinotFieldDataType = fieldSpec.getDataType();
        DataType avroFieldDataType = AvroUtils.extractFieldDataType(avroField);
        if (pinotFieldDataType != avroFieldDataType) {
            // Type disagreement is tolerated but logged.
            LOGGER.warn("Pinot field: {} of type: {} mismatches with corresponding field in Avro Schema of type: {}",
                    fieldName, pinotFieldDataType, avroFieldDataType);
        }
    }
}
// Streams records out of an Avro file and writes them into the target dataset,
// after verifying the file's writer schema is readable with the expected schema.
// NOTE(review): the trailing "});" closes an anonymous class / call begun before
// this view; kept in place.
@Override
public void process(InputStream in) throws IOException {
    try (DataFileStream<Record> stream = new DataFileStream<>(
            in, AvroUtil.newDatumReader(schema, Record.class))) {
        // Fail fast on schema incompatibility before writing anything.
        IncompatibleSchemaException.check(
                SchemaValidationUtil.canRead(stream.getSchema(), schema),
                "Incompatible file schema %s, expected %s",
                stream.getSchema(), schema);
        long written = 0L;
        try (DatasetWriter<Record> writer = target.newWriter()) {
            for (Record record : stream) {
                writer.write(record);
                written += 1;
            }
        } finally {
            // Counter is adjusted even on failure: already-written records cannot be rolled back.
            session.adjustCounter("Stored records", written, true /* cannot roll back the write */);
        }
    }
}
});
// Fragment: the receiver of this DataFileStream ("fileReader") is assigned on a line
// before this view — confirm upstream. Rejects input files whose embedded schema is
// not the expected generic text schema, printing usage help on mismatch.
new DataFileStream<>(inStream, reader);
if (!fileReader.getSchema().equals(new Schema.Parser().parse(TEXT_FILE_SCHEMA))) {
    err.println("Avro file is not generic text schema");
    p.printHelpOn(err);
@Override public void init(Map<String, String> props, Schema indexingSchema, String topicName) throws Exception { // Load Avro schema DataFileStream<GenericRecord> reader = AvroUtils.getAvroReader(avroFile); _avroSchema = reader.getSchema(); reader.close(); _rowGenerator = new AvroRecordToPinotRowGenerator(indexingSchema); _reader = new GenericDatumReader<>(_avroSchema); }
@BeforeClass public static void before() throws Exception { final String filePath = TestUtils.getFileFromResourceUrl(DictionariesTest.class.getClassLoader().getResource(AVRO_DATA)); if (INDEX_DIR.exists()) { FileUtils.deleteQuietly(INDEX_DIR); } // System.out.println(INDEX_DIR.getAbsolutePath()); final SegmentIndexCreationDriver driver = SegmentCreationDriverFactory.get(null); final SegmentGeneratorConfig config = SegmentTestUtils .getSegmentGenSpecWithSchemAndProjectedColumns(new File(filePath), INDEX_DIR, "weeksSinceEpochSunday", TimeUnit.DAYS, "test"); config.setTimeColumnName("weeksSinceEpochSunday"); driver.init(config); driver.build(); final DataFileStream<GenericRecord> avroReader = AvroUtils.getAvroReader(new File(filePath)); final org.apache.avro.Schema avroSchema = avroReader.getSchema(); final String[] columns = new String[avroSchema.getFields().size()]; int i = 0; for (final Field f : avroSchema.getFields()) { columns[i] = f.name(); i++; } }
// Test fixture: builds a segment from the Avro resource file using the day-granularity
// time column, then collects the Avro column names. NOTE(review): the DataFileStream is
// never closed and the columns array is unused — left byte-identical because the final
// brace closes the enclosing test class, which starts outside this view.
@BeforeClass
public static void before() throws Exception {
    final String filePath = TestUtils.getFileFromResourceUrl(BlocksTest.class.getClassLoader().getResource(AVRO_DATA));
    if (INDEX_DIR.exists()) {
        FileUtils.deleteQuietly(INDEX_DIR);
    }
    // System.out.println(INDEX_DIR.getAbsolutePath());
    final SegmentIndexCreationDriver driver = SegmentCreationDriverFactory.get(null);
    final SegmentGeneratorConfig config = SegmentTestUtils
            .getSegmentGenSpecWithSchemAndProjectedColumns(new File(filePath), INDEX_DIR, "daysSinceEpoch",
                    TimeUnit.DAYS, "test");
    config.setTimeColumnName("daysSinceEpoch");
    driver.init(config);
    driver.build();
    final DataFileStream<GenericRecord> avroReader = AvroUtils.getAvroReader(new File(filePath));
    final org.apache.avro.Schema avroSchema = avroReader.getSchema();
    final String[] columns = new String[avroSchema.getFields().size()];
    int i = 0;
    for (final Field f : avroSchema.getFields()) {
        columns[i] = f.name();
        i++;
    }
}
}
// Fragment of a CLI tool method (start/end outside this view): opens the output target
// (second argument, or stdout) and creates a columnar writer using the input reader's
// schema and the codec selected on the command line.
OutputStream outs = Util.fileOrStdout(args.get(1), out);
AvroColumnWriter<Object> writer = new AvroColumnWriter<>(reader.getSchema(),
        new ColumnFileMetaData().setCodec(codec.value(opts)));
/**
 * Builds a Pinot schema (without any time column handling) from the fields of the
 * given Avro file. Fields whose type cannot be converted are skipped with a warning;
 * fields tagged with the "pinotType" property value "METRIC" become metric specs,
 * all others become dimension specs.
 *
 * @param avroFile Avro container file to inspect
 * @return the derived Pinot schema
 * @throws IOException if the file cannot be opened or read
 */
public static Schema extractSchemaFromAvroWithoutTime(File avroFile) throws IOException {
    // try-with-resources: the original only closed the stream on the success path.
    try (DataFileStream<GenericRecord> dataStream = new DataFileStream<GenericRecord>(
            new FileInputStream(avroFile), new GenericDatumReader<GenericRecord>())) {
        Schema schema = new Schema();
        for (final Field field : dataStream.getSchema().getFields()) {
            try {
                // Probe convertibility; unconvertible fields are skipped entirely.
                getColumnType(field);
            } catch (Exception e) {
                LOGGER.warn("Caught exception while converting Avro field {} of type {}, field will not be in schema.",
                        field.name(), field.schema().getType());
                continue;
            }
            final String columnName = field.name();
            final String pinotType = field.getProp("pinotType");
            final FieldSpec fieldSpec;
            if ("METRIC".equals(pinotType)) {
                fieldSpec = new MetricFieldSpec();
            } else {
                fieldSpec = new DimensionFieldSpec();
            }
            fieldSpec.setName(columnName);
            // Use the field already in hand: the original re-looked it up by name
            // through dataStream.getSchema() for both calls below.
            fieldSpec.setDataType(getColumnType(field));
            fieldSpec.setSingleValueField(isSingleValueField(field));
            schema.addField(fieldSpec);
        }
        return schema;
    }
}
// Fragment (enclosing method outside this view): reads the Avro schema from an
// already-opened reader and sizes a column-name array from its field count;
// population of the array presumably follows below this view.
final org.apache.avro.Schema avroSchema = avroReader.getSchema();
final String[] columns = new String[avroSchema.getFields().size()];
int i = 0;
// Fragment (enclosing method outside this view): begins building a Pinot Schema by
// walking the Avro fields of an already-opened data stream and looking up each
// column's configured field type; the loop body continues past this view.
Schema schema = new Schema();
for (final Field field : dataStream.getSchema().getFields()) {
    final String columnName = field.name();
    FieldType fieldType = fieldTypeMap.get(columnName);