@Override
public ParquetFileWriter run() throws Exception {
  final ParquetFileWriter parquetFileWriter = new ParquetFileWriter(
      checkNotNull(conf),
      checkNotNull(schema),
      path,
      ParquetFileWriter.Mode.CREATE,
      DEFAULT_BLOCK_SIZE,
      MAX_PADDING_SIZE_DEFAULT,
      true);
  parquetFileWriter.start();
  return parquetFileWriter;
}
});
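For orientation, here is a minimal, self-contained sketch of the writer lifecycle this snippet participates in. It assumes parquet-hadoop is on the classpath; the schema string, output path, and class name are illustrative placeholders, not part of the original code.

import java.util.HashMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class WriterLifecycleSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Placeholder schema: a single required int32 column.
    MessageType schema = MessageTypeParser.parseMessageType(
        "message example { required int32 id; }");
    // The three-argument constructor defaults to Mode.CREATE,
    // so it fails if the target file already exists.
    ParquetFileWriter w = new ParquetFileWriter(
        conf, schema, new Path("/tmp/example.parquet"));
    w.start();
    // ... startBlock()/startColumn()/endColumn()/endBlock() per row group ...
    w.end(new HashMap<String, String>()); // writes the footer and closes the file
  }
}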
@Override
protected void mergeOutput(FileSystem fs, String sourceFolder, String targetFile) throws IOException {
  FileStatus[] sourceStatuses = FileSystemUtil.listSubFiles(fs, sourceFolder);
  List<Path> sourceFiles = new ArrayList<>();
  for (FileStatus sourceStatus : sourceStatuses) {
    sourceFiles.add(sourceStatus.getPath());
  }
  FileMetaData mergedMeta = ParquetFileWriter.mergeMetadataFiles(sourceFiles, fs.getConf()).getFileMetaData();
  ParquetFileWriter writer = new ParquetFileWriter(
      fs.getConf(), mergedMeta.getSchema(), new Path(targetFile), ParquetFileWriter.Mode.CREATE);
  writer.start();
  for (Path input : sourceFiles) {
    writer.appendFile(fs.getConf(), input);
  }
  writer.end(mergedMeta.getKeyValueMetaData());
}
}
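A note on why this merge is cheap: mergeMetadataFiles combines the inputs' footers first (and fails if their schemas cannot be reconciled), and appendFile then copies each input's row groups into the target as raw bytes, without decoding or re-encoding any pages. The trade-off is that the output keeps the inputs' row-group boundaries, so merging many tiny files produces many tiny row groups.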
public void writeToFileWriter(ParquetFileWriter writer) throws IOException {
  writer.startColumn(path, totalValueCount, compressor.getCodecName());
  if (dictionaryPage != null) {
    writer.writeDictionaryPage(dictionaryPage); // tracking the dictionary encoding is handled in writeDictionaryPage
  }
  writer.writeDataPages(buf, uncompressedLength, compressedLength, totalStatistics,
      rlEncodings, dlEncodings, dataEncodings);
  writer.endColumn();
  if (LOG.isDebugEnabled()) {
    LOG.debug(
        String.format(
            "written %,dB for %s: %,d values, %,dB raw, %,dB comp, %d pages, encodings: %s",
            buf.size(), path, totalValueCount, uncompressedLength, compressedLength, pageCount,
            new HashSet<Encoding>(dataEncodings))
            + (dictionaryPage != null ? String.format(
                ", dic { %,d entries, %,dB raw, %,dB comp}",
                dictionaryPage.getDictionarySize(),
                dictionaryPage.getUncompressedSize(),
                dictionaryPage.getCompressedSize()) // was getDictionarySize(), which logged the entry count as the compressed size
            : ""));
  }
  rlEncodings.clear();
  dlEncodings.clear();
  dataEncodings.clear();
  pageCount = 0;
}
ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
w.start();
HashMap<String, Integer> columnValuesWritten = new HashMap<>();
int valsWritten;
for (int k = 0; k < props.numberRowGroups; k++) {
  w.startBlock(props.recordsPerRowGroup);
  currentBooleanByte = 0;
  booleanBitCounter.reset();
  ColumnDescriptor c1 = schema.getColumnDescription(path1);
  w.startColumn(c1, props.recordsPerRowGroup, codec);
  final int valsPerPage = (int) Math.ceil(props.recordsPerRowGroup / (float) fieldInfo.numberOfPages);
  final int PAGE_SIZE = 1024 * 1024; // 1 MB
  // ... value bytes are generated into fullPage here (elided in this excerpt),
  // followed by the repetition and definition level bytes ...
  System.arraycopy(repLevelBytes, 0, fullPage, bytes.length, repLevelBytes.length);
  System.arraycopy(defLevelBytes, 0, fullPage, bytes.length + repLevelBytes.length, defLevelBytes.length);
  w.writeDataPage(
      (props.recordsPerRowGroup / fieldInfo.numberOfPages),
      fullPage.length,
      BytesInput.from(fullPage),
      RLE,
      RLE,
      PLAIN);
  currentBooleanByte = 0;
  w.endColumn();
  columnValuesWritten.remove(fieldInfo.name);
  columnValuesWritten.put(fieldInfo.name, valsWritten);
  w.endBlock();
}
// end() must come after the last endBlock(); the excerpt had it inside the loop
w.end(new HashMap<String, String>());
logger.debug("Finished generating parquet file.");
public int merge(List<InputFile> inputFiles, CodecFactory.BytesCompressor compressor,
    String createdBy, long maxBlockSize) throws IOException {
  List<ParquetFileReader> readers = getReaders(inputFiles);
  try {
    ByteBufferAllocator allocator = new HeapByteBufferAllocator();
    ColumnReadStoreImpl columnReadStore = new ColumnReadStoreImpl(
        null, new DummyRecordConverter(schema).getRootConverter(), schema, createdBy);
    this.start();
    List<BlocksCombiner.SmallBlocksUnion> largeBlocks = BlocksCombiner.combineLargeBlocks(readers, maxBlockSize);
    for (BlocksCombiner.SmallBlocksUnion smallBlocks : largeBlocks) {
      this.startBlock(smallBlocks.getRowCount());
      // ... per-column setup elided in this excerpt: resolve the column path
      // and its levels in the input schema, then open a page reader ...
      String[] parentPath = getExisingParentPath(path, inputFileSchema);
      int def = parquetFileReader.getFileMetaData().getSchema().getMaxDefinitionLevel(parentPath);
      int rep = parquetFileReader.getFileMetaData().getSchema().getMaxRepetitionLevel(parentPath);
      ColumnReader columnReader = columnReadStore.newMemColumnReader(path, columnChunkPageReader.get());
      for (int i = 0; i < columnReader.getTotalValueCount(); i++) {
        consumeTriplet(columnWriteStoreV1, columnWriter, columnReader);
      }
      this.endBlock();
    }
    this.end(Collections.emptyMap());
    // ... return value elided in this excerpt ...
  } finally {
    BlocksCombiner.closeReaders(readers);
  }
}
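Across all of these examples the writer expects a strict call order: start(), then for each row group startBlock(rowCount), one startColumn()/endColumn() pair per column in schema order, endBlock(), and finally end(metadata) exactly once. ParquetFileWriter enforces this with an internal state machine and throws if a call arrives out of order, which is why the startBlock/endBlock placement in the excerpt above had to be untangled from the column loop.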
private void flushAndClose() throws IOException {
  if (parquetFileWriter == null) {
    return;
  }
  if (recordCount > 0) {
    long memSize = store.getBufferedSize();
    parquetFileWriter.startBlock(recordCount);
    consumer.flush();
    store.flush();
    ColumnChunkPageWriteStoreExposer.flushPageStore(pageStore, parquetFileWriter);
    parquetFileWriter.endBlock();
    long recordsWritten = recordCount; // we are writing one single block per file
    parquetFileWriter.end(extraMetaData);
    byte[] metadata = this.trackingConverter == null ? null : trackingConverter.getMetadata();
    final long fileSize = parquetFileWriter.getPos();
    listener.recordsWritten(recordsWritten, fileSize, path.toString(),
        metadata /** TODO: add parquet footer **/, partition.getBucketNumber());
    parquetFileWriter = null;
    updateStats(memSize, recordCount);
    recordCount = 0;
  }
  if (store != null) {
    store.close();
  }
  store = null;
  pageStore = null;
  index++;
}
private void flush() throws IOException {
  try {
    if (recordCount > 0) {
      parquetFileWriter.startBlock(recordCount);
      consumer.flush();
      store.flush();
      pageStore.flushToFileWriter(parquetFileWriter);
      recordCount = 0;
      parquetFileWriter.endBlock();
      // we are writing one single block per file
      parquetFileWriter.end(extraMetaData);
      parquetFileWriter = null;
    }
  } finally {
    store.close();
    pageStore.close();
    store = null;
    pageStore = null;
    index++;
  }
}
private void flushRowGroupToStore() throws IOException {
  recordConsumer.flush();
  LOG.debug("Flushing mem columnStore to file. allocated memory: {}", columnStore.getAllocatedSize());
  if (columnStore.getAllocatedSize() > (3 * rowGroupSizeThreshold)) {
    LOG.warn("Too much memory used: {}", columnStore.memUsageString());
  }
  if (recordCount > 0) {
    parquetFileWriter.startBlock(recordCount);
    columnStore.flush();
    pageStore.flushToFileWriter(parquetFileWriter);
    recordCount = 0;
    parquetFileWriter.endBlock();
    this.nextRowGroupSize = Math.min(
        parquetFileWriter.getNextRowGroupSize(),
        rowGroupSizeThreshold);
  }
  columnStore = null;
  pageStore = null;
}
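The Math.min at the end is worth a comment: in parquet-mr, getNextRowGroupSize() reflects the writer's padding/alignment strategy, reporting roughly how much room remains before the next file-system block boundary. Capping it with the configured threshold keeps the next row group from straddling an HDFS block while still honoring the user's size target.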
public void appendRowGroup(SeekableInputStream from, BlockMetaData rowGroup, boolean dropColumns) throws IOException {
  startBlock(rowGroup.getRowCount());
  // ... excerpt: the method walks the row group's column chunks in schema order,
  // growing a byte range [start, start + length) while the chunks are contiguous ...
  if ((i + 1) == columnsInOrder.size()
      || columnsInOrder.get(i + 1).getStartingPos() != (start + length)) {
    // the next chunk does not start where the current run ends: copy the run now
    copy(from, out, start, length);
    // ... start/length are reset for the next run ...
  }
  // ...
  endBlock();
}
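The reconstructed condition above is the heart of the fast path: a run of column chunks keeps growing as long as each next chunk starts exactly where the run ends, and the whole run is then moved with a single copy() call rather than one copy per chunk. Chunks that are contiguous on disk therefore transfer at raw I/O speed, with no page decoding at all.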
private void flushRowGroup(boolean finished) {
  try {
    if (recordCount > 0) {
      writer.startBlock(recordCount);
      writeStore.flush();
      flushPageStoreToWriter.invoke(writer);
      writer.endBlock();
      if (!finished) {
        startRowGroup();
      }
    }
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to flush row group");
  }
}
// Excerpt from a writeColumnChunk-style helper; the leading parameters
// (page bytes, sizes, statistics, index builders) are elided in this excerpt.
void writeColumnChunk(ColumnDescriptor descriptor, long valueCount, CompressionCodecName compressionCodecName,
    DictionaryPage dictionaryPage, /* ... */ OffsetIndexBuilder offsetIndexBuilder,
    Set<Encoding> rlEncodings, Set<Encoding> dlEncodings, List<Encoding> dataEncodings) throws IOException {
  startColumn(descriptor, valueCount, compressionCodecName);
  if (dictionaryPage != null) { // guard restored; the excerpt called this unconditionally
    writeDictionaryPage(dictionaryPage);
  }
  // ... buffered data pages are written here ...
  this.offsetIndexBuilder = offsetIndexBuilder;
  // ...
  endColumn();
}
@Override
public void close() throws IOException {
  flushRowGroup(true);
  writeStore.close();
  writer.end(metadata);
}
}
@Override
public void write(Writable value) throws IOException {
  recordWriter.write(value);
  length = fileWriter.getPos();
}
public ParquetMetadata getFooter() {
  return parquetFileWriter.getFooter();
}
private void startRowGroup() {
  try {
    this.nextRowGroupSize = min(writer.getNextRowGroupSize(), targetRowGroupSize);
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }
  this.nextCheckRecordCount = min(max(recordCount / 2, 100), 10000);
  this.recordCount = 0;
  PageWriteStore pageStore = pageStoreCtor.newInstance(compressor, parquetSchema, props.getAllocator());
  this.flushPageStoreToWriter = flushToWriter.bind(pageStore);
  this.writeStore = props.newColumnWriteStore(parquetSchema, pageStore);
  model.setColumnStore(writeStore);
}
public void close() throws IOException, InterruptedException {
  if (!closed) {
    flushRowGroupToStore();
    FinalizedWriteContext finalWriteContext = writeSupport.finalizeWrite();
    Map<String, String> finalMetadata = new HashMap<String, String>(extraMetaData);
    String modelName = writeSupport.getName();
    if (modelName != null) {
      finalMetadata.put(ParquetWriter.OBJECT_MODEL_NAME_PROP, modelName);
    }
    finalMetadata.putAll(finalWriteContext.getExtraMetaData());
    parquetFileWriter.end(finalMetadata);
    closed = true;
  }
}
private void checkBlockSizeReached() throws IOException {
  if (recordCount >= recordCountForNextMemCheck) {
    // checking the memory size is relatively expensive, so let's not do it for every record.
    long memSize = columnStore.getBufferedSize();
    long recordSize = memSize / recordCount;
    // flush the row group if it is within ~2 records of the limit
    // it is much better to be slightly under size than to be over at all
    if (memSize > (nextRowGroupSize - 2 * recordSize)) {
      LOG.info("mem size {} > {}: flushing {} records to disk.", memSize, nextRowGroupSize, recordCount);
      flushRowGroupToStore();
      initStore();
      recordCountForNextMemCheck = min(max(MINIMUM_RECORD_COUNT_FOR_CHECK, recordCount / 2), MAXIMUM_RECORD_COUNT_FOR_CHECK);
      this.lastRowGroupEndPos = parquetFileWriter.getPos();
    } else {
      recordCountForNextMemCheck = min(
          max(MINIMUM_RECORD_COUNT_FOR_CHECK, (recordCount + (long) (nextRowGroupSize / ((float) recordSize))) / 2), // will check halfway
          recordCount + MAXIMUM_RECORD_COUNT_FOR_CHECK // will not look more than max records ahead
      );
      LOG.debug("Checked mem at {} will check again at: {}", recordCount, recordCountForNextMemCheck);
    }
  }
}
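To make the heuristic concrete with illustrative numbers: suppose 10,000 records have been written and getBufferedSize() reports 64 MB, so recordSize is about 6.7 kB. With nextRowGroupSize = 128 MB the flush test fails (64 MB is well under the limit minus two records' worth), and nextRowGroupSize / recordSize estimates that the group would be full at roughly 20,000 records; averaging with the current count schedules the next memory check at record 15,000, i.e. halfway to the projected limit, so the per-record cost of sizing stays negligible.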