private static <T> ParquetWriter<T> createAvroParquetWriter(
    String schemaString, GenericData dataModel, OutputFile out) throws IOException {
  final Schema schema = new Schema.Parser().parse(schemaString);

  return AvroParquetWriter.<T>builder(out)
      .withSchema(schema)
      .withDataModel(dataModel)
      .build();
}
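A minimal usage sketch for this helper, assuming parquet-hadoop's stock HadoopOutputFile implementation (the schema and output path here are illustrative):

// Hypothetical caller: builds a two-field schema with Avro's SchemaBuilder
// and writes a single record through the helper above.
Schema schema = SchemaBuilder.record("Event").fields()
    .requiredLong("id")
    .requiredString("msg")
    .endRecord();
OutputFile out = HadoopOutputFile.fromPath(
    new Path("/tmp/events.parquet"), new Configuration());
try (ParquetWriter<GenericRecord> writer =
         createAvroParquetWriter(schema.toString(), GenericData.get(), out)) {
  GenericRecord record = new GenericData.Record(schema);
  record.put("id", 1L);
  record.put("msg", "hello");
  writer.write(record); // buffered in memory; flushed as a row group on close()
}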
/**
 * Create a data file that gets exported to the database.
 * @param numRecords how many records to write to the file.
 */
protected void createParquetFile(int numRecords,
    ColumnGenerator... extraCols) throws IOException {
  Schema schema = buildSchema(extraCols);
  String fileName = UUID.randomUUID().toString() + ".parquet";
  Path filePath = new Path(getTablePath(), fileName);
  try (AvroParquetWriter<GenericRecord> parquetWriter =
           new AvroParquetWriter<>(filePath, schema, SNAPPY,
               DEFAULT_BLOCK_SIZE, DEFAULT_PAGE_SIZE)) {
    for (int i = 0; i < numRecords; i++) {
      GenericRecord record = new GenericData.Record(schema);
      record.put("id", i);
      record.put("msg", getMsgPrefix() + i);
      addExtraColumns(record, i, extraCols);
      parquetWriter.write(record);
    }
  }
}
private void rotate() throws WriterException {
  if (parquetWriter != null) {
    try {
      parquetWriter.close();
      // Strip the ".chukwa" suffix (7 chars) before renaming to ".done".
      String newFileName = previousFileName.substring(0, previousFileName.length() - 7);
      fs.rename(previousPath, new Path(newFileName + ".done"));
    } catch (IOException e) {
      LOG.warn("Failed to close Chukwa write ahead log.");
    }
  }
  startTime = System.currentTimeMillis();
  calendar.setTimeInMillis(startTime);
  String newName = new java.text.SimpleDateFormat("yyyyMMddHHmmssSSS")
      .format(calendar.getTime());
  newName += localHostAddr + new java.rmi.server.UID().toString();
  newName = newName.replace("-", "");
  newName = newName.replace(":", "");
  newName = newName.replace(".", "");
  newName = outputDir + "/" + newName.trim() + ".chukwa";
  LOG.info("writing: " + newName);
  Path path = new Path(newName);
  try {
    parquetWriter = new AvroParquetWriter<GenericRecord>(path, avroSchema,
        CompressionCodecName.SNAPPY, blockSize, pageSize);
    previousPath = path;
    previousFileName = newName;
  } catch (IOException e) {
    throw new WriterException(e);
  }
}
@Override
public void open() throws IOException {
  CompressionCodecName codecName = CompressionCodecName.UNCOMPRESSED;
  if (enableCompression) {
    codecName = getCompressionCodecName();
  }
  avroParquetWriter = new AvroParquetWriter<E>(fileSystem.makeQualified(path),
      schema, codecName, DEFAULT_ROW_GROUP_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE,
      ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, conf);
}
@Override
public void append(E entity) throws IOException {
  avroParquetWriter.write(entity);
}
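Records appended this way can be read back with AvroParquetReader for a round-trip check; a sketch assuming parquet-mr 1.11+ and an illustrative file name:

try (ParquetReader<GenericRecord> reader = AvroParquetReader
         .<GenericRecord>builder(HadoopInputFile.fromPath(
             new Path("/tmp/events.parquet"), new Configuration()))
         .build()) {
  GenericRecord rec;
  while ((rec = reader.read()) != null) { // read() returns null at end of file
    System.out.println(rec);
  }
}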
/** Create a new {@link AvroParquetWriter}.
 *
 * @param file a file path
 * @param avroSchema a schema for the write
 * @param compressionCodecName compression codec
 * @param blockSize target block size
 * @param pageSize target page size
 * @throws IOException if there is an error while writing
 */
@Deprecated
public AvroParquetWriter(Path file, Schema avroSchema,
    CompressionCodecName compressionCodecName, int blockSize,
    int pageSize) throws IOException {
  super(file, AvroParquetWriter.<T>writeSupport(avroSchema, SpecificData.get()),
      compressionCodecName, blockSize, pageSize);
}
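Since this constructor is deprecated, the supported path is the builder; a sketch of the equivalent configuration (parameter names are the constructor's):

// Equivalent builder call; note that builder(Path) is itself deprecated in
// newer releases in favor of builder(OutputFile).
ParquetWriter<T> writer = AvroParquetWriter.<T>builder(file)
    .withSchema(avroSchema)
    .withDataModel(SpecificData.get()) // mirrors writeSupport(avroSchema, SpecificData.get())
    .withCompressionCodec(compressionCodecName)
    .withRowGroupSize(blockSize)       // "blockSize" here is the row-group size
    .withPageSize(pageSize)
    .build();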
@Override
public void close() throws WriterException {
  try {
    parquetWriter.close();
    fs.rename(previousPath, new Path(previousFileName + ".done"));
  } catch (IOException e) {
    throw new WriterException(e);
  }
}
@Override
public RecordWriter<SinkRecord> getRecordWriter(
    Configuration conf, final String fileName, SinkRecord record,
    final AvroData avroData) throws IOException {
  final Schema avroSchema = avroData.fromConnectSchema(record.valueSchema());
  CompressionCodecName compressionCodecName = CompressionCodecName.SNAPPY;
  int blockSize = 256 * 1024 * 1024;
  int pageSize = 64 * 1024;
  Path path = new Path(fileName);
  final ParquetWriter<GenericRecord> writer =
      new AvroParquetWriter<>(path, avroSchema, compressionCodecName,
          blockSize, pageSize, true, conf);
  return new RecordWriter<SinkRecord>() {
    @Override
    public void write(SinkRecord record) throws IOException {
      Object value = avroData.fromConnectData(record.valueSchema(), record.value());
      writer.write((GenericRecord) value);
    }

    @Override
    public void close() throws IOException {
      writer.close();
    }
  };
}
@Override
public CommitStatus add(List<Chunk> chunks) throws WriterException {
  long elapsedTime = 0;
  CommitStatus rv = ChukwaWriter.COMMIT_OK;
  for (Chunk chunk : chunks) {
    try {
      GenericRecord record = new GenericData.Record(avroSchema);
      record.put("dataType", chunk.getDataType());
      record.put("data", ByteBuffer.wrap(chunk.getData()));
      record.put("tags", chunk.getTags());
      record.put("seqId", chunk.getSeqID());
      record.put("source", chunk.getSource());
      record.put("stream", chunk.getStreamName());
      parquetWriter.write(record);
      elapsedTime = System.currentTimeMillis() - startTime;
      if (elapsedTime > rotateInterval) {
        rotate();
      }
    } catch (IOException e) {
      LOG.warn("Failed to store data to HDFS.");
      LOG.warn(ExceptionUtil.getStackTrace(e));
    }
  }
  if (next != null) {
    rv = next.add(chunks); // pass data through
  }
  return rv;
}
/** Create a new {@link AvroParquetWriter}.
 *
 * @param file The file name to write to.
 * @param avroSchema The schema to write with.
 * @param compressionCodecName Compression codec to use, or CompressionCodecName.UNCOMPRESSED
 * @param blockSize the block size threshold.
 * @param pageSize the page size; blocks are subdivided into pages for alignment and other purposes.
 * @param enableDictionary Whether to use a dictionary to compress columns.
 * @throws IOException if there is an error while writing
 */
@Deprecated
public AvroParquetWriter(Path file, Schema avroSchema,
    CompressionCodecName compressionCodecName, int blockSize,
    int pageSize, boolean enableDictionary) throws IOException {
  super(file, AvroParquetWriter.<T>writeSupport(avroSchema, SpecificData.get()),
      compressionCodecName, blockSize, pageSize, enableDictionary,
      DEFAULT_IS_VALIDATING_ENABLED);
}
public void close() {
  synchronized (lock) {
    if (rotateTimer != null) {
      rotateTimer.cancel();
    }
    if (statTimer != null) {
      statTimer.cancel();
    }
    try {
      if (parquetWriter != null) {
        parquetWriter.close();
      }
      if (localToRemoteHdfsMover != null) {
        localToRemoteHdfsMover.shutdown();
      }
      fs.rename(currentPath, new Path(currentFileName + ".done"));
    } catch (IOException e) {
      log.error("failed to close and rename stream", e);
    }
  }
}
private ParquetWriter<GenericRecord> createParquetWriter(final ProcessContext context,
    final FlowFile flowFile, final OutputStream out, final Schema schema)
    throws IOException {
  NifiParquetOutputFile nifiParquetOutputFile = new NifiParquetOutputFile(out);

  final AvroParquetWriter.Builder<GenericRecord> parquetWriter = AvroParquetWriter
      .<GenericRecord>builder(nifiParquetOutputFile)
      .withSchema(schema);

  Configuration conf = new Configuration();
  conf.setBoolean(AvroReadSupport.AVRO_COMPATIBILITY, true);
  conf.setBoolean("parquet.avro.add-list-element-records", false);
  conf.setBoolean("parquet.avro.write-old-list-structure", false);

  ParquetUtils.applyCommonConfig(parquetWriter, context, flowFile, conf, this);

  return parquetWriter.build();
}
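NifiParquetOutputFile adapts a plain OutputStream to Parquet's OutputFile interface so the builder can target something other than a Hadoop Path. A minimal sketch of such an adapter (illustrative, not NiFi's actual implementation):

import java.io.IOException;
import java.io.OutputStream;
import org.apache.parquet.io.OutputFile;
import org.apache.parquet.io.PositionOutputStream;

public class StreamOutputFile implements OutputFile {
  private final OutputStream out;

  public StreamOutputFile(OutputStream out) {
    this.out = out;
  }

  @Override
  public PositionOutputStream create(long blockSizeHint) {
    return new PositionOutputStream() {
      private long pos = 0; // Parquet needs byte offsets for its footer metadata

      @Override
      public long getPos() {
        return pos;
      }

      @Override
      public void write(int b) throws IOException {
        out.write(b);
        pos++;
      }

      @Override
      public void write(byte[] b, int off, int len) throws IOException {
        out.write(b, off, len);
        pos += len;
      }

      @Override
      public void close() throws IOException {
        out.close();
      }
    };
  }

  @Override
  public PositionOutputStream createOrOverwrite(long blockSizeHint) {
    return create(blockSizeHint);
  }

  @Override
  public boolean supportsBlockSize() {
    return false; // no HDFS-style block size for an arbitrary stream
  }

  @Override
  public long defaultBlockSize() {
    return 0;
  }
}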
currentFileName = newName;
chunksWrittenThisRotate = false;
parquetWriter = new AvroParquetWriter<GenericRecord>(newOutputPath, avroSchema,
    CompressionCodecName.SNAPPY, blockSize, pageSize);
record.put("source", chunk.getSource()); record.put("stream", chunk.getStreamName()); parquetWriter.write(record);
/** Create a new {@link AvroParquetWriter}.
 *
 * @param file The file name to write to.
 * @param avroSchema The schema to write with.
 * @param compressionCodecName Compression codec to use, or CompressionCodecName.UNCOMPRESSED
 * @param blockSize the block size threshold.
 * @param pageSize the page size; blocks are subdivided into pages for alignment and other purposes.
 * @param enableDictionary Whether to use a dictionary to compress columns.
 * @param conf The Configuration to use.
 * @throws IOException if there is an error while writing
 */
@Deprecated
public AvroParquetWriter(Path file, Schema avroSchema,
    CompressionCodecName compressionCodecName, int blockSize,
    int pageSize, boolean enableDictionary, Configuration conf)
    throws IOException {
  this(file, AvroParquetWriter.<T>writeSupport(conf, avroSchema, SpecificData.get()),
      compressionCodecName, blockSize, pageSize, enableDictionary,
      DEFAULT_IS_VALIDATING_ENABLED, DEFAULT_WRITER_VERSION, conf);
}
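Each extra parameter on this overload has a builder counterpart; a sketch of the same configuration expressed through the builder (the trailing defaults are the constants referenced above):

ParquetWriter<T> writer = AvroParquetWriter.<T>builder(file)
    .withSchema(avroSchema)
    .withDataModel(SpecificData.get())
    .withCompressionCodec(compressionCodecName)
    .withRowGroupSize(blockSize)
    .withPageSize(pageSize)
    .withDictionaryEncoding(enableDictionary)
    .withValidation(false) // DEFAULT_IS_VALIDATING_ENABLED
    .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0) // DEFAULT_WRITER_VERSION
    .withConf(conf)
    .build();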
public AvroParquetFileWriter(LogFilePath logFilePath, CompressionCodec codec)
    throws IOException {
  Path path = new Path(logFilePath.getLogFilePath());
  LOG.debug("Creating brand new writer for path {}", path);
  CompressionCodecName codecName = CompressionCodecName
      .fromCompressionCodec(codec != null ? codec.getClass() : null);
  topic = logFilePath.getTopic();

  // Not setting blockSize, pageSize, enableDictionary, and validating
  writer = AvroParquetWriter.builder(path)
      .withSchema(schemaRegistryClient.getSchema(topic))
      .withCompressionCodec(codecName)
      .build();
}
@Override
public HDFSRecordWriter createHDFSRecordWriter(final ProcessContext context,
    final FlowFile flowFile, final Configuration conf, final Path path,
    final RecordSchema schema) throws IOException, SchemaNotFoundException {
  final Schema avroSchema = AvroTypeUtil.extractAvroSchema(schema);

  final AvroParquetWriter.Builder<GenericRecord> parquetWriter = AvroParquetWriter
      .<GenericRecord>builder(path)
      .withSchema(avroSchema);

  ParquetUtils.applyCommonConfig(parquetWriter, context, flowFile, conf, this);
  return new AvroParquetHDFSRecordWriter(parquetWriter.build(), avroSchema);
}
writer.set(AvroParquetWriter.<GenericRecord>builder(toHadoopPath(javaPathOnDisk))
    .withSchema(m.getSchema())
    .build());
@Override
public HDFSRecordWriter createHDFSRecordWriter(final ProcessContext context,
    final FlowFile flowFile, final Configuration conf, final Path path,
    final RecordSchema schema) throws IOException, SchemaNotFoundException {
  final Schema avroSchema = AvroTypeUtil.extractAvroSchema(schema);

  final AvroParquetWriter.Builder<GenericRecord> parquetWriter = AvroParquetWriter
      .<GenericRecord>builder(path)
      .withSchema(avroSchema);

  applyCommonConfig(parquetWriter, context, flowFile, conf);
  return new AvroParquetHDFSRecordWriter(parquetWriter.build(), avroSchema);
}
@Override
protected IGenericRecordConsumer prepareRecordConsumer(Schema schema, URI uri)
    throws IOException {
  ParquetWriter<GenericRecord> writer = AvroParquetWriter
      .<GenericRecord>builder(toHadoopPath(uri))
      .withSchema(schema)
      .withConf(getConfiguration())
      .build();

  return new IGenericRecordConsumer() {
    @Override
    public void accept(GenericRecord t) {
      try {
        writer.write(t);
      } catch (IOException e) {
        throw new UncheckedIOException(e);
      }
    }

    @Override
    public void close() throws IOException {
      writer.close();
    }
  };
}
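A hypothetical caller for the consumer returned above (IGenericRecordConsumer is this project's own interface; the URI and records collection are illustrative):

IGenericRecordConsumer consumer =
    prepareRecordConsumer(schema, URI.create("hdfs:///tmp/out.parquet"));
try {
  for (GenericRecord rec : records) { // records: any Iterable<GenericRecord>
    consumer.accept(rec);
  }
} finally {
  consumer.close(); // writes the Parquet footer; the file is not readable until then
}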