private static <T> ParquetWriter<T> createAvroParquetWriter(
    String schemaString, GenericData dataModel, OutputFile out) throws IOException {
  final Schema schema = new Schema.Parser().parse(schemaString);

  return AvroParquetWriter.<T>builder(out)
      .withSchema(schema)
      .withDataModel(dataModel)
      .build();
}
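A minimal usage sketch for this helper, assuming parquet-hadoop's stock HadoopOutputFile implementation (the schema and output path here are illustrative):

// Hypothetical caller: builds a two-field schema with Avro's SchemaBuilder
// and writes a single record through the helper above.
Schema schema = SchemaBuilder.record("Event").fields()
    .requiredLong("id")
    .requiredString("msg")
    .endRecord();
OutputFile out = HadoopOutputFile.fromPath(
    new Path("/tmp/events.parquet"), new Configuration());
try (ParquetWriter<GenericRecord> writer =
         createAvroParquetWriter(schema.toString(), GenericData.get(), out)) {
  GenericRecord record = new GenericData.Record(schema);
  record.put("id", 1L);
  record.put("msg", "hello");
  writer.write(record); // buffered in memory; flushed as a row group on close()
}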
/**
 * Create a data file that gets exported to the database.
 * @param numRecords how many records to write to the file.
 */
protected void createParquetFile(int numRecords,
    ColumnGenerator... extraCols) throws IOException {
  Schema schema = buildSchema(extraCols);
  String fileName = UUID.randomUUID().toString() + ".parquet";
  Path filePath = new Path(getTablePath(), fileName);
  try (AvroParquetWriter<GenericRecord> parquetWriter =
           new AvroParquetWriter<>(filePath, schema, SNAPPY,
               DEFAULT_BLOCK_SIZE, DEFAULT_PAGE_SIZE)) {
    for (int i = 0; i < numRecords; i++) {
      GenericRecord record = new GenericData.Record(schema);
      record.put("id", i);
      record.put("msg", getMsgPrefix() + i);
      addExtraColumns(record, i, extraCols);
      parquetWriter.write(record);
    }
  }
}
private void rotate() throws WriterException {
  if (parquetWriter != null) {
    try {
      parquetWriter.close();
      // Strip the ".chukwa" suffix (7 chars) before renaming to ".done".
      String newFileName = previousFileName.substring(0, previousFileName.length() - 7);
      fs.rename(previousPath, new Path(newFileName + ".done"));
    } catch (IOException e) {
      LOG.warn("Failed to close Chukwa write ahead log.");
    }
  }
  startTime = System.currentTimeMillis();
  calendar.setTimeInMillis(startTime);
  String newName = new java.text.SimpleDateFormat("yyyyMMddHHmmssSSS")
      .format(calendar.getTime());
  newName += localHostAddr + new java.rmi.server.UID().toString();
  newName = newName.replace("-", "");
  newName = newName.replace(":", "");
  newName = newName.replace(".", "");
  newName = outputDir + "/" + newName.trim() + ".chukwa";
  LOG.info("writing: " + newName);
  Path path = new Path(newName);
  try {
    parquetWriter = new AvroParquetWriter<GenericRecord>(path, avroSchema,
        CompressionCodecName.SNAPPY, blockSize, pageSize);
    previousPath = path;
    previousFileName = newName;
  } catch (IOException e) {
    throw new WriterException(e);
  }
}
@Override
public void open() throws IOException {
  CompressionCodecName codecName = CompressionCodecName.UNCOMPRESSED;
  if (enableCompression) {
    codecName = getCompressionCodecName();
  }
  avroParquetWriter = new AvroParquetWriter<E>(fileSystem.makeQualified(path),
      schema, codecName, DEFAULT_ROW_GROUP_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE,
      ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, conf);
}
@Override
public void append(E entity) throws IOException {
  avroParquetWriter.write(entity);
}
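Records appended this way can be read back with AvroParquetReader for a round-trip check; a sketch assuming parquet-mr 1.11+ and an illustrative file name:

try (ParquetReader<GenericRecord> reader = AvroParquetReader
         .<GenericRecord>builder(HadoopInputFile.fromPath(
             new Path("/tmp/events.parquet"), new Configuration()))
         .build()) {
  GenericRecord rec;
  while ((rec = reader.read()) != null) { // read() returns null at end of file
    System.out.println(rec);
  }
}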
/** Create a new {@link AvroParquetWriter}.
 *
 * @param file a file path
 * @param avroSchema a schema for the write
 * @param compressionCodecName compression codec
 * @param blockSize target block size
 * @param pageSize target page size
 * @throws IOException if there is an error while writing
 */
@Deprecated
public AvroParquetWriter(Path file, Schema avroSchema,
    CompressionCodecName compressionCodecName, int blockSize,
    int pageSize) throws IOException {
  super(file, AvroParquetWriter.<T>writeSupport(avroSchema, SpecificData.get()),
      compressionCodecName, blockSize, pageSize);
}
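Since this constructor is deprecated, the supported path is the builder; a sketch of the equivalent configuration (parameter names are the constructor's):

// Equivalent builder call; note that builder(Path) is itself deprecated in
// newer releases in favor of builder(OutputFile).
ParquetWriter<T> writer = AvroParquetWriter.<T>builder(file)
    .withSchema(avroSchema)
    .withDataModel(SpecificData.get()) // mirrors writeSupport(avroSchema, SpecificData.get())
    .withCompressionCodec(compressionCodecName)
    .withRowGroupSize(blockSize)       // "blockSize" here is the row-group size
    .withPageSize(pageSize)
    .build();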
@Override
public void close() throws WriterException {
  try {
    parquetWriter.close();
    fs.rename(previousPath, new Path(previousFileName + ".done"));
  } catch (IOException e) {
    throw new WriterException(e);
  }
}
@Override
public RecordWriter<SinkRecord> getRecordWriter(
    Configuration conf, final String fileName, SinkRecord record,
    final AvroData avroData) throws IOException {
  final Schema avroSchema = avroData.fromConnectSchema(record.valueSchema());
  CompressionCodecName compressionCodecName = CompressionCodecName.SNAPPY;
  int blockSize = 256 * 1024 * 1024;
  int pageSize = 64 * 1024;
  Path path = new Path(fileName);
  final ParquetWriter<GenericRecord> writer =
      new AvroParquetWriter<>(path, avroSchema, compressionCodecName,
          blockSize, pageSize, true, conf);
  return new RecordWriter<SinkRecord>() {
    @Override
    public void write(SinkRecord record) throws IOException {
      Object value = avroData.fromConnectData(record.valueSchema(), record.value());
      writer.write((GenericRecord) value);
    }

    @Override
    public void close() throws IOException {
      writer.close();
    }
  };
}
@Override
public CommitStatus add(List<Chunk> chunks) throws WriterException {
  long elapsedTime = 0;
  CommitStatus rv = ChukwaWriter.COMMIT_OK;
  for (Chunk chunk : chunks) {
    try {
      GenericRecord record = new GenericData.Record(avroSchema);
      record.put("dataType", chunk.getDataType());
      record.put("data", ByteBuffer.wrap(chunk.getData()));
      record.put("tags", chunk.getTags());
      record.put("seqId", chunk.getSeqID());
      record.put("source", chunk.getSource());
      record.put("stream", chunk.getStreamName());
      parquetWriter.write(record);
      elapsedTime = System.currentTimeMillis() - startTime;
      if (elapsedTime > rotateInterval) {
        rotate();
      }
    } catch (IOException e) {
      LOG.warn("Failed to store data to HDFS.");
      LOG.warn(ExceptionUtil.getStackTrace(e));
    }
  }
  if (next != null) {
    rv = next.add(chunks); // pass data through
  }
  return rv;
}
/** Create a new {@link AvroParquetWriter}.
 *
 * @param file The file name to write to.
 * @param avroSchema The schema to write with.
 * @param compressionCodecName Compression codec to use, or CompressionCodecName.UNCOMPRESSED
 * @param blockSize the block size threshold.
 * @param pageSize the page size; blocks are subdivided into pages for alignment and other purposes.
 * @param enableDictionary Whether to use a dictionary to compress columns.
 * @throws IOException if there is an error while writing
 */
@Deprecated
public AvroParquetWriter(Path file, Schema avroSchema,
    CompressionCodecName compressionCodecName, int blockSize,
    int pageSize, boolean enableDictionary) throws IOException {
  super(file, AvroParquetWriter.<T>writeSupport(avroSchema, SpecificData.get()),
      compressionCodecName, blockSize, pageSize, enableDictionary,
      DEFAULT_IS_VALIDATING_ENABLED);
}
public void close() {
  synchronized (lock) {
    if (rotateTimer != null) {
      rotateTimer.cancel();
    }
    if (statTimer != null) {
      statTimer.cancel();
    }
    try {
      if (parquetWriter != null) {
        parquetWriter.close();
      }
      if (localToRemoteHdfsMover != null) {
        localToRemoteHdfsMover.shutdown();
      }
      fs.rename(currentPath, new Path(currentFileName + ".done"));
    } catch (IOException e) {
      log.error("failed to close and rename stream", e);
    }
  }
}
private ParquetWriter<GenericRecord> createParquetWriter(final ProcessContext context,
    final FlowFile flowFile, final OutputStream out, final Schema schema)
    throws IOException {
  NifiParquetOutputFile nifiParquetOutputFile = new NifiParquetOutputFile(out);

  final AvroParquetWriter.Builder<GenericRecord> parquetWriter = AvroParquetWriter
      .<GenericRecord>builder(nifiParquetOutputFile)
      .withSchema(schema);

  Configuration conf = new Configuration();
  conf.setBoolean(AvroReadSupport.AVRO_COMPATIBILITY, true);
  conf.setBoolean("parquet.avro.add-list-element-records", false);
  conf.setBoolean("parquet.avro.write-old-list-structure", false);

  ParquetUtils.applyCommonConfig(parquetWriter, context, flowFile, conf, this);

  return parquetWriter.build();
}
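NifiParquetOutputFile adapts a plain OutputStream to Parquet's OutputFile interface so the builder can target something other than a Hadoop Path. A minimal sketch of such an adapter (illustrative, not NiFi's actual implementation):

import java.io.IOException;
import java.io.OutputStream;
import org.apache.parquet.io.OutputFile;
import org.apache.parquet.io.PositionOutputStream;

public class StreamOutputFile implements OutputFile {
  private final OutputStream out;

  public StreamOutputFile(OutputStream out) {
    this.out = out;
  }

  @Override
  public PositionOutputStream create(long blockSizeHint) {
    return new PositionOutputStream() {
      private long pos = 0; // Parquet needs byte offsets for its footer metadata

      @Override
      public long getPos() {
        return pos;
      }

      @Override
      public void write(int b) throws IOException {
        out.write(b);
        pos++;
      }

      @Override
      public void write(byte[] b, int off, int len) throws IOException {
        out.write(b, off, len);
        pos += len;
      }

      @Override
      public void close() throws IOException {
        out.close();
      }
    };
  }

  @Override
  public PositionOutputStream createOrOverwrite(long blockSizeHint) {
    return create(blockSizeHint);
  }

  @Override
  public boolean supportsBlockSize() {
    return false; // no HDFS-style block size for an arbitrary stream
  }

  @Override
  public long defaultBlockSize() {
    return 0;
  }
}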
currentFileName = newName;
chunksWrittenThisRotate = false;
parquetWriter = new AvroParquetWriter<GenericRecord>(newOutputPath, avroSchema,
    CompressionCodecName.SNAPPY, blockSize, pageSize);
record.put("source", chunk.getSource()); record.put("stream", chunk.getStreamName()); parquetWriter.write(record);
/** Create a new {@link AvroParquetWriter}.
 *
 * @param file The file name to write to.
 * @param avroSchema The schema to write with.
 * @param compressionCodecName Compression codec to use, or CompressionCodecName.UNCOMPRESSED
 * @param blockSize the block size threshold.
 * @param pageSize the page size; blocks are subdivided into pages for alignment and other purposes.
 * @param enableDictionary Whether to use a dictionary to compress columns.
 * @param conf The Configuration to use.
 * @throws IOException if there is an error while writing
 */
@Deprecated
public AvroParquetWriter(Path file, Schema avroSchema,
    CompressionCodecName compressionCodecName, int blockSize,
    int pageSize, boolean enableDictionary, Configuration conf)
    throws IOException {
  this(file, AvroParquetWriter.<T>writeSupport(conf, avroSchema, SpecificData.get()),
      compressionCodecName, blockSize, pageSize, enableDictionary,
      DEFAULT_IS_VALIDATING_ENABLED, DEFAULT_WRITER_VERSION, conf);
}
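Each extra parameter on this overload has a builder counterpart; a sketch of the same configuration expressed through the builder (the trailing defaults are the constants referenced above):

ParquetWriter<T> writer = AvroParquetWriter.<T>builder(file)
    .withSchema(avroSchema)
    .withDataModel(SpecificData.get())
    .withCompressionCodec(compressionCodecName)
    .withRowGroupSize(blockSize)
    .withPageSize(pageSize)
    .withDictionaryEncoding(enableDictionary)
    .withValidation(false) // DEFAULT_IS_VALIDATING_ENABLED
    .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0) // DEFAULT_WRITER_VERSION
    .withConf(conf)
    .build();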
public AvroParquetFileWriter(LogFilePath logFilePath, CompressionCodec codec)
    throws IOException {
  Path path = new Path(logFilePath.getLogFilePath());
  LOG.debug("Creating brand new writer for path {}", path);
  CompressionCodecName codecName = CompressionCodecName
      .fromCompressionCodec(codec != null ? codec.getClass() : null);
  topic = logFilePath.getTopic();

  // Not setting blockSize, pageSize, enableDictionary, and validating
  writer = AvroParquetWriter.builder(path)
      .withSchema(schemaRegistryClient.getSchema(topic))
      .withCompressionCodec(codecName)
      .build();
}
@Override
public HDFSRecordWriter createHDFSRecordWriter(final ProcessContext context,
    final FlowFile flowFile, final Configuration conf, final Path path,
    final RecordSchema schema) throws IOException, SchemaNotFoundException {
  final Schema avroSchema = AvroTypeUtil.extractAvroSchema(schema);

  final AvroParquetWriter.Builder<GenericRecord> parquetWriter = AvroParquetWriter
      .<GenericRecord>builder(path)
      .withSchema(avroSchema);

  ParquetUtils.applyCommonConfig(parquetWriter, context, flowFile, conf, this);
  return new AvroParquetHDFSRecordWriter(parquetWriter.build(), avroSchema);
}
writer.set(AvroParquetWriter.<GenericRecord>builder(toHadoopPath(javaPathOnDisk))
    .withSchema(m.getSchema())
    .build());
@Override
public HDFSRecordWriter createHDFSRecordWriter(final ProcessContext context,
    final FlowFile flowFile, final Configuration conf, final Path path,
    final RecordSchema schema) throws IOException, SchemaNotFoundException {
  final Schema avroSchema = AvroTypeUtil.extractAvroSchema(schema);

  final AvroParquetWriter.Builder<GenericRecord> parquetWriter = AvroParquetWriter
      .<GenericRecord>builder(path)
      .withSchema(avroSchema);

  applyCommonConfig(parquetWriter, context, flowFile, conf);
  return new AvroParquetHDFSRecordWriter(parquetWriter.build(), avroSchema);
}
@Override
protected IGenericRecordConsumer prepareRecordConsumer(Schema schema, URI uri)
    throws IOException {
  ParquetWriter<GenericRecord> writer = AvroParquetWriter
      .<GenericRecord>builder(toHadoopPath(uri))
      .withSchema(schema)
      .withConf(getConfiguration())
      .build();

  return new IGenericRecordConsumer() {
    @Override
    public void accept(GenericRecord t) {
      try {
        writer.write(t);
      } catch (IOException e) {
        throw new UncheckedIOException(e);
      }
    }

    @Override
    public void close() throws IOException {
      writer.close();
    }
  };
}
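A hypothetical caller for the consumer returned above (IGenericRecordConsumer is this project's own interface; the URI and records collection are illustrative):

IGenericRecordConsumer consumer =
    prepareRecordConsumer(schema, URI.create("hdfs:///tmp/out.parquet"));
try {
  for (GenericRecord rec : records) { // records: any Iterable<GenericRecord>
    consumer.accept(rec);
  }
} finally {
  consumer.close(); // writes the Parquet footer; the file is not readable until then
}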