@Override
public void init(final DataFileStream<GenericRecord> reader, final String codec, final OutputStream out) throws IOException {
    // Raw binary output: records are re-encoded without an Avro container header,
    // so only a binary encoder and a datum writer for the source schema are needed.
    // (The codec argument is irrelevant for bare binary encoding.)
    encoder = EncoderFactory.get().binaryEncoder(out, null);
    writer = new GenericDatumWriter<>(reader.getSchema());
}
@Override
public RecordSchema getSchema(Map<String, String> variables, final InputStream contentStream, final RecordSchema readSchema)
        throws SchemaNotFoundException, IOException {
    // Only the container-file header is consumed here; the writer schema embedded in it
    // is converted to a RecordSchema.
    // NOTE(review): the DataFileStream is intentionally not closed — closing it would
    // also close the caller-supplied contentStream; confirm callers rely on this.
    final DataFileStream<GenericRecord> avroStream =
            new DataFileStream<>(contentStream, new GenericDatumReader<GenericRecord>());
    return AvroTypeUtil.createSchema(avroStream.getSchema());
}
private void nextInput() throws IOException{ currentInput++; Path path = inFiles.get(currentInput); FSDataInputStream input = new FSDataInputStream(Util.openFromFS(path)); reader = new DataFileStream<>(input, new GenericDatumReader<>()); if (schema == null) { // if this is the first file, the schema gets saved schema = reader.getSchema(); } else if (!schema.equals(reader.getSchema())) { // subsequent files have to have equal schemas throw new IOException("schemas dont match"); } }
/**
 * Given an Avro data file, map from column to field type and time unit, return the
 * equivalent Pinot schema.
 *
 * @param avroDataFile Avro data file to inspect
 * @param fieldTypeMap map from column name to field type, may be null
 * @param timeUnit time unit, may be null
 * @return the Pinot schema derived from the file's embedded Avro schema
 * @throws IOException if the Avro file cannot be opened or read
 */
public static Schema getPinotSchemaFromAvroDataFile(@Nonnull File avroDataFile,
        @Nullable Map<String, FieldSpec.FieldType> fieldTypeMap, @Nullable TimeUnit timeUnit) throws IOException {
    // The reader is only needed long enough to pull out the embedded writer schema.
    try (DataFileStream<GenericRecord> reader = getAvroReader(avroDataFile)) {
        return getPinotSchemaFromAvroSchema(reader.getSchema(), fieldTypeMap, timeUnit);
    }
}
/** * Retrieve schema from the given bytes * * @return the retrieved {@link Schema schema} * */ private Schema retrieveSchemaFromBytes(byte[] data) { ByteArrayInputStream bais = new ByteArrayInputStream(data); DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(); Schema schema = null; try { // dfs is AutoCloseable @SuppressWarnings("resource") DataFileStream<GenericRecord> dfs = new DataFileStream<GenericRecord>(bais, reader); schema = dfs.getSchema(); } catch (IOException ioe) { throw new AvroObjectInspectorException("An error occurred retrieving schema from bytes", ioe); } return schema; }
public void init() throws FileNotFoundException, IOException {
    // Open the Avro container file and capture its embedded writer schema.
    // NOTE(review): dataStream (and the underlying FileInputStream) is kept open
    // for later record reads — presumably closed elsewhere; confirm.
    final FileInputStream fileInput = new FileInputStream(avroFile);
    dataStream = new DataFileStream<GenericRecord>(fileInput, new GenericDatumReader<GenericRecord>());
    schema = dataStream.getSchema();
}
/**
 * Returns the top-level column (field) names declared in the given Avro file's schema.
 *
 * @param avro Avro container file to inspect
 * @return field names in schema declaration order
 * @throws IOException if the file cannot be opened or is not a valid Avro file
 */
public static List<String> getColumnNamesFromAvro(File avro) throws IOException {
    List<String> ret = new ArrayList<String>();
    // try-with-resources: the original leaked the FileInputStream/DataFileStream.
    try (DataFileStream<GenericRecord> dataStream = new DataFileStream<GenericRecord>(
            new FileInputStream(avro), new GenericDatumReader<GenericRecord>())) {
        for (final Field field : dataStream.getSchema().getFields()) {
            ret.add(field.name());
        }
    }
    return ret;
}
// Writes a complete Avro container-file header (codec + carried-over metadata + schema)
// into the FlowFile via session.append, and returns the header bytes so that record
// blocks appended to the same FlowFile later still form a valid Avro file.
// NOTE(review): writer.close() inside the append callback only finalizes the header
// stream; the caller is expected to manage the writer's overall lifecycle — confirm.
private byte[] initAvroWriter(ProcessSession session, String codec, DataFileStream<GenericRecord> reader,
        DataFileWriter<GenericRecord> writer, AtomicReference<FlowFile> flowFileRef) {
    writer.setCodec(CodecFactory.fromString(codec));
    // Transfer metadata (this is a subset of the incoming file)
    for (String metaKey : reader.getMetaKeys()) {
        if (!RESERVED_METADATA.contains(metaKey)) {
            writer.setMeta(metaKey, reader.getMeta(metaKey));
        }
    }
    final ByteArrayOutputStream avroHeader = new ByteArrayOutputStream();
    flowFileRef.set(session.append(flowFileRef.get(), (out) -> {
        // Create writer so that records can be appended later.
        writer.create(reader.getSchema(), avroHeader);
        writer.close();
        final byte[] header = avroHeader.toByteArray();
        out.write(header);
    }));
    // Capture the Avro header byte array that is just written to the FlowFile.
    // This is needed when Avro records are appended to the same FlowFile.
    return avroHeader.toByteArray();
}
@Override
public void init(final DataFileStream<GenericRecord> reader, final String codec, final OutputStream out) throws IOException {
    writer = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>());
    writer.setCodec(CodecFactory.fromString(codec));
    if (transferMetadata) {
        // Carry over user metadata from the source file, skipping Avro's reserved keys.
        for (final String metaKey : reader.getMetaKeys()) {
            if (RESERVED_METADATA.contains(metaKey)) {
                continue;
            }
            writer.setMeta(metaKey, reader.getMeta(metaKey));
        }
    }
    // Start a new container file on the output stream using the source file's schema.
    writer.create(reader.getSchema(), out);
}
public AvroReaderWithEmbeddedSchema(final InputStream in) throws IOException {
    // The writer schema travels in the container-file header; lift it out once
    // and convert it to the record-schema representation.
    this.in = in;
    this.dataFileStream = new DataFileStream<>(in, new NonCachingDatumReader<>());
    this.avroSchema = this.dataFileStream.getSchema();
    this.recordSchema = AvroTypeUtil.createSchema(this.avroSchema);
}
/**
 * Checks the Pinot schema against the Avro schema of the open reader.
 * A field missing from the Avro schema or a data-type mismatch is only logged;
 * a single-value vs multi-value mismatch throws {@link IllegalStateException}.
 */
private void validateSchema() {
    org.apache.avro.Schema avroSchema = _avroReader.getSchema();
    for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
        String fieldName = fieldSpec.getName();
        Field avroField = avroSchema.getField(fieldName);
        if (avroField == null) {
            // Field absent from the Avro data: warn and move on.
            LOGGER.warn("Pinot field: {} does not exist in Avro Schema", fieldName);
            continue;
        }
        boolean isPinotFieldSingleValue = fieldSpec.isSingleValueField();
        if (isPinotFieldSingleValue != AvroUtils.isSingleValueField(avroField)) {
            // Cardinality disagreement is fatal.
            String errorMessage = "Pinot field: " + fieldName + " is "
                    + (isPinotFieldSingleValue ? "Single" : "Multi")
                    + "-valued in Pinot schema but not in Avro schema";
            LOGGER.error(errorMessage);
            throw new IllegalStateException(errorMessage);
        }
        DataType pinotFieldDataType = fieldSpec.getDataType();
        DataType avroFieldDataType = AvroUtils.extractFieldDataType(avroField);
        if (pinotFieldDataType != avroFieldDataType) {
            // Type disagreement is tolerated but logged.
            LOGGER.warn("Pinot field: {} of type: {} mismatches with corresponding field in Avro Schema of type: {}",
                    fieldName, pinotFieldDataType, avroFieldDataType);
        }
    }
}
// Streams records out of an Avro file and writes them into the target dataset,
// after verifying the file's writer schema is readable with the expected schema.
// NOTE(review): the trailing "});" closes an anonymous class / call begun before
// this view; kept in place.
@Override
public void process(InputStream in) throws IOException {
    try (DataFileStream<Record> stream = new DataFileStream<>(
            in, AvroUtil.newDatumReader(schema, Record.class))) {
        // Fail fast on schema incompatibility before writing anything.
        IncompatibleSchemaException.check(
                SchemaValidationUtil.canRead(stream.getSchema(), schema),
                "Incompatible file schema %s, expected %s",
                stream.getSchema(), schema);
        long written = 0L;
        try (DatasetWriter<Record> writer = target.newWriter()) {
            for (Record record : stream) {
                writer.write(record);
                written += 1;
            }
        } finally {
            // Counter is adjusted even on failure: already-written records cannot be rolled back.
            session.adjustCounter("Stored records", written, true /* cannot roll back the write */);
        }
    }
}
});
// Fragment: the receiver of this DataFileStream ("fileReader") is assigned on a line
// before this view — confirm upstream. Rejects input files whose embedded schema is
// not the expected generic text schema, printing usage help on mismatch.
new DataFileStream<>(inStream, reader);
if (!fileReader.getSchema().equals(new Schema.Parser().parse(TEXT_FILE_SCHEMA))) {
    err.println("Avro file is not generic text schema");
    p.printHelpOn(err);
@Override public void init(Map<String, String> props, Schema indexingSchema, String topicName) throws Exception { // Load Avro schema DataFileStream<GenericRecord> reader = AvroUtils.getAvroReader(avroFile); _avroSchema = reader.getSchema(); reader.close(); _rowGenerator = new AvroRecordToPinotRowGenerator(indexingSchema); _reader = new GenericDatumReader<>(_avroSchema); }
@BeforeClass public static void before() throws Exception { final String filePath = TestUtils.getFileFromResourceUrl(DictionariesTest.class.getClassLoader().getResource(AVRO_DATA)); if (INDEX_DIR.exists()) { FileUtils.deleteQuietly(INDEX_DIR); } // System.out.println(INDEX_DIR.getAbsolutePath()); final SegmentIndexCreationDriver driver = SegmentCreationDriverFactory.get(null); final SegmentGeneratorConfig config = SegmentTestUtils .getSegmentGenSpecWithSchemAndProjectedColumns(new File(filePath), INDEX_DIR, "weeksSinceEpochSunday", TimeUnit.DAYS, "test"); config.setTimeColumnName("weeksSinceEpochSunday"); driver.init(config); driver.build(); final DataFileStream<GenericRecord> avroReader = AvroUtils.getAvroReader(new File(filePath)); final org.apache.avro.Schema avroSchema = avroReader.getSchema(); final String[] columns = new String[avroSchema.getFields().size()]; int i = 0; for (final Field f : avroSchema.getFields()) { columns[i] = f.name(); i++; } }
// Test fixture: builds a segment from the Avro resource file using the day-granularity
// time column, then collects the Avro column names. NOTE(review): the DataFileStream is
// never closed and the columns array is unused — left byte-identical because the final
// brace closes the enclosing test class, which starts outside this view.
@BeforeClass
public static void before() throws Exception {
    final String filePath = TestUtils.getFileFromResourceUrl(BlocksTest.class.getClassLoader().getResource(AVRO_DATA));
    if (INDEX_DIR.exists()) {
        FileUtils.deleteQuietly(INDEX_DIR);
    }
    // System.out.println(INDEX_DIR.getAbsolutePath());
    final SegmentIndexCreationDriver driver = SegmentCreationDriverFactory.get(null);
    final SegmentGeneratorConfig config = SegmentTestUtils
            .getSegmentGenSpecWithSchemAndProjectedColumns(new File(filePath), INDEX_DIR, "daysSinceEpoch",
                    TimeUnit.DAYS, "test");
    config.setTimeColumnName("daysSinceEpoch");
    driver.init(config);
    driver.build();
    final DataFileStream<GenericRecord> avroReader = AvroUtils.getAvroReader(new File(filePath));
    final org.apache.avro.Schema avroSchema = avroReader.getSchema();
    final String[] columns = new String[avroSchema.getFields().size()];
    int i = 0;
    for (final Field f : avroSchema.getFields()) {
        columns[i] = f.name();
        i++;
    }
}
}
// Fragment of a CLI tool method (start/end outside this view): opens the output target
// (second argument, or stdout) and creates a columnar writer using the input reader's
// schema and the codec selected on the command line.
OutputStream outs = Util.fileOrStdout(args.get(1), out);
AvroColumnWriter<Object> writer = new AvroColumnWriter<>(reader.getSchema(),
        new ColumnFileMetaData().setCodec(codec.value(opts)));
/**
 * Builds a Pinot schema (without any time column handling) from the fields of the
 * given Avro file. Fields whose type cannot be converted are skipped with a warning;
 * fields tagged with the "pinotType" property value "METRIC" become metric specs,
 * all others become dimension specs.
 *
 * @param avroFile Avro container file to inspect
 * @return the derived Pinot schema
 * @throws IOException if the file cannot be opened or read
 */
public static Schema extractSchemaFromAvroWithoutTime(File avroFile) throws IOException {
    // try-with-resources: the original only closed the stream on the success path.
    try (DataFileStream<GenericRecord> dataStream = new DataFileStream<GenericRecord>(
            new FileInputStream(avroFile), new GenericDatumReader<GenericRecord>())) {
        Schema schema = new Schema();
        for (final Field field : dataStream.getSchema().getFields()) {
            try {
                // Probe convertibility; unconvertible fields are skipped entirely.
                getColumnType(field);
            } catch (Exception e) {
                LOGGER.warn("Caught exception while converting Avro field {} of type {}, field will not be in schema.",
                        field.name(), field.schema().getType());
                continue;
            }
            final String columnName = field.name();
            final String pinotType = field.getProp("pinotType");
            final FieldSpec fieldSpec;
            if ("METRIC".equals(pinotType)) {
                fieldSpec = new MetricFieldSpec();
            } else {
                fieldSpec = new DimensionFieldSpec();
            }
            fieldSpec.setName(columnName);
            // Use the field already in hand: the original re-looked it up by name
            // through dataStream.getSchema() for both calls below.
            fieldSpec.setDataType(getColumnType(field));
            fieldSpec.setSingleValueField(isSingleValueField(field));
            schema.addField(fieldSpec);
        }
        return schema;
    }
}
// Fragment (enclosing method outside this view): reads the Avro schema from an
// already-opened reader and sizes a column-name array from its field count;
// population of the array presumably follows below this view.
final org.apache.avro.Schema avroSchema = avroReader.getSchema();
final String[] columns = new String[avroSchema.getFields().size()];
int i = 0;
// Fragment (enclosing method outside this view): begins building a Pinot Schema by
// walking the Avro fields of an already-opened data stream and looking up each
// column's configured field type; the loop body continues past this view.
Schema schema = new Schema();
for (final Field field : dataStream.getSchema().getFields()) {
    final String columnName = field.name();
    FieldType fieldType = fieldTypeMap.get(columnName);