@Override
public ParquetFileWriter run() throws Exception {
  final ParquetFileWriter parquetFileWriter = new ParquetFileWriter(
      checkNotNull(conf),
      checkNotNull(schema),
      path,
      ParquetFileWriter.Mode.CREATE,
      DEFAULT_BLOCK_SIZE,
      MAX_PADDING_SIZE_DEFAULT,
      true);
  parquetFileWriter.start();
  return parquetFileWriter;
}
});
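For orientation, here is a minimal, self-contained sketch of the writer lifecycle this snippet participates in. It assumes parquet-hadoop is on the classpath; the schema string, output path, and class name are illustrative placeholders, not part of the original code.

import java.util.HashMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class WriterLifecycleSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Placeholder schema: a single required int32 column.
    MessageType schema = MessageTypeParser.parseMessageType(
        "message example { required int32 id; }");
    // The three-argument constructor defaults to Mode.CREATE,
    // so it fails if the target file already exists.
    ParquetFileWriter w = new ParquetFileWriter(
        conf, schema, new Path("/tmp/example.parquet"));
    w.start();
    // ... startBlock()/startColumn()/endColumn()/endBlock() per row group ...
    w.end(new HashMap<String, String>()); // writes the footer and closes the file
  }
}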
@Override
protected void mergeOutput(FileSystem fs, String sourceFolder, String targetFile) throws IOException {
  FileStatus[] sourceStatuses = FileSystemUtil.listSubFiles(fs, sourceFolder);
  List<Path> sourceFiles = new ArrayList<>();
  for (FileStatus sourceStatus : sourceStatuses) {
    sourceFiles.add(sourceStatus.getPath());
  }
  FileMetaData mergedMeta = ParquetFileWriter.mergeMetadataFiles(sourceFiles, fs.getConf()).getFileMetaData();
  ParquetFileWriter writer = new ParquetFileWriter(
      fs.getConf(), mergedMeta.getSchema(), new Path(targetFile), ParquetFileWriter.Mode.CREATE);
  writer.start();
  for (Path input : sourceFiles) {
    writer.appendFile(fs.getConf(), input);
  }
  writer.end(mergedMeta.getKeyValueMetaData());
}
}
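A note on why this merge is cheap: mergeMetadataFiles combines the inputs' footers first (and fails if their schemas cannot be reconciled), and appendFile then copies each input's row groups into the target as raw bytes, without decoding or re-encoding any pages. The trade-off is that the output keeps the inputs' row-group boundaries, so merging many tiny files produces many tiny row groups.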
public void writeToFileWriter(ParquetFileWriter writer) throws IOException {
  writer.startColumn(path, totalValueCount, compressor.getCodecName());
  if (dictionaryPage != null) {
    writer.writeDictionaryPage(dictionaryPage); // tracking the dictionary encoding is handled in writeDictionaryPage
  }
  writer.writeDataPages(buf, uncompressedLength, compressedLength, totalStatistics,
      rlEncodings, dlEncodings, dataEncodings);
  writer.endColumn();
  if (LOG.isDebugEnabled()) {
    LOG.debug(
        String.format(
            "written %,dB for %s: %,d values, %,dB raw, %,dB comp, %d pages, encodings: %s",
            buf.size(), path, totalValueCount, uncompressedLength, compressedLength, pageCount,
            new HashSet<Encoding>(dataEncodings))
            + (dictionaryPage != null ? String.format(
                ", dic { %,d entries, %,dB raw, %,dB comp}",
                dictionaryPage.getDictionarySize(),
                dictionaryPage.getUncompressedSize(),
                dictionaryPage.getCompressedSize()) // was getDictionarySize(), which logged the entry count as the compressed size
            : ""));
  }
  rlEncodings.clear();
  dlEncodings.clear();
  dataEncodings.clear();
  pageCount = 0;
}
ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
w.start();
HashMap<String, Integer> columnValuesWritten = new HashMap<>();
int valsWritten;
for (int k = 0; k < props.numberRowGroups; k++) {
  w.startBlock(props.recordsPerRowGroup);
  currentBooleanByte = 0;
  booleanBitCounter.reset();
  ColumnDescriptor c1 = schema.getColumnDescription(path1);
  w.startColumn(c1, props.recordsPerRowGroup, codec);
  final int valsPerPage = (int) Math.ceil(props.recordsPerRowGroup / (float) fieldInfo.numberOfPages);
  final int PAGE_SIZE = 1024 * 1024; // 1 MB
  // ... value bytes are generated into fullPage here (elided in this excerpt),
  // followed by the repetition and definition level bytes ...
  System.arraycopy(repLevelBytes, 0, fullPage, bytes.length, repLevelBytes.length);
  System.arraycopy(defLevelBytes, 0, fullPage, bytes.length + repLevelBytes.length, defLevelBytes.length);
  w.writeDataPage(
      (props.recordsPerRowGroup / fieldInfo.numberOfPages),
      fullPage.length,
      BytesInput.from(fullPage),
      RLE,
      RLE,
      PLAIN);
  currentBooleanByte = 0;
  w.endColumn();
  columnValuesWritten.remove(fieldInfo.name);
  columnValuesWritten.put(fieldInfo.name, valsWritten);
  w.endBlock();
}
// end() must come after the last endBlock(); the excerpt had it inside the loop
w.end(new HashMap<String, String>());
logger.debug("Finished generating parquet file.");
public int merge(List<InputFile> inputFiles, CodecFactory.BytesCompressor compressor,
    String createdBy, long maxBlockSize) throws IOException {
  List<ParquetFileReader> readers = getReaders(inputFiles);
  try {
    ByteBufferAllocator allocator = new HeapByteBufferAllocator();
    ColumnReadStoreImpl columnReadStore = new ColumnReadStoreImpl(
        null, new DummyRecordConverter(schema).getRootConverter(), schema, createdBy);
    this.start();
    List<BlocksCombiner.SmallBlocksUnion> largeBlocks = BlocksCombiner.combineLargeBlocks(readers, maxBlockSize);
    for (BlocksCombiner.SmallBlocksUnion smallBlocks : largeBlocks) {
      this.startBlock(smallBlocks.getRowCount());
      // ... per-column setup elided in this excerpt: resolve the column path
      // and its levels in the input schema, then open a page reader ...
      String[] parentPath = getExisingParentPath(path, inputFileSchema);
      int def = parquetFileReader.getFileMetaData().getSchema().getMaxDefinitionLevel(parentPath);
      int rep = parquetFileReader.getFileMetaData().getSchema().getMaxRepetitionLevel(parentPath);
      ColumnReader columnReader = columnReadStore.newMemColumnReader(path, columnChunkPageReader.get());
      for (int i = 0; i < columnReader.getTotalValueCount(); i++) {
        consumeTriplet(columnWriteStoreV1, columnWriter, columnReader);
      }
      this.endBlock();
    }
    this.end(Collections.emptyMap());
    // ... return value elided in this excerpt ...
  } finally {
    BlocksCombiner.closeReaders(readers);
  }
}
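Across all of these examples the writer expects a strict call order: start(), then for each row group startBlock(rowCount), one startColumn()/endColumn() pair per column in schema order, endBlock(), and finally end(metadata) exactly once. ParquetFileWriter enforces this with an internal state machine and throws if a call arrives out of order, which is why the startBlock/endBlock placement in the excerpt above had to be untangled from the column loop.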
private void flushAndClose() throws IOException {
  if (parquetFileWriter == null) {
    return;
  }
  if (recordCount > 0) {
    long memSize = store.getBufferedSize();
    parquetFileWriter.startBlock(recordCount);
    consumer.flush();
    store.flush();
    ColumnChunkPageWriteStoreExposer.flushPageStore(pageStore, parquetFileWriter);
    parquetFileWriter.endBlock();
    long recordsWritten = recordCount; // we are writing one single block per file
    parquetFileWriter.end(extraMetaData);
    byte[] metadata = this.trackingConverter == null ? null : trackingConverter.getMetadata();
    final long fileSize = parquetFileWriter.getPos();
    listener.recordsWritten(recordsWritten, fileSize, path.toString(),
        metadata /** TODO: add parquet footer **/, partition.getBucketNumber());
    parquetFileWriter = null;
    updateStats(memSize, recordCount);
    recordCount = 0;
  }
  if (store != null) {
    store.close();
  }
  store = null;
  pageStore = null;
  index++;
}
private void flush() throws IOException {
  try {
    if (recordCount > 0) {
      parquetFileWriter.startBlock(recordCount);
      consumer.flush();
      store.flush();
      pageStore.flushToFileWriter(parquetFileWriter);
      recordCount = 0;
      parquetFileWriter.endBlock();
      // we are writing one single block per file
      parquetFileWriter.end(extraMetaData);
      parquetFileWriter = null;
    }
  } finally {
    store.close();
    pageStore.close();
    store = null;
    pageStore = null;
    index++;
  }
}
private void flushRowGroupToStore() throws IOException {
  recordConsumer.flush();
  LOG.debug("Flushing mem columnStore to file. allocated memory: {}", columnStore.getAllocatedSize());
  if (columnStore.getAllocatedSize() > (3 * rowGroupSizeThreshold)) {
    LOG.warn("Too much memory used: {}", columnStore.memUsageString());
  }
  if (recordCount > 0) {
    parquetFileWriter.startBlock(recordCount);
    columnStore.flush();
    pageStore.flushToFileWriter(parquetFileWriter);
    recordCount = 0;
    parquetFileWriter.endBlock();
    this.nextRowGroupSize = Math.min(
        parquetFileWriter.getNextRowGroupSize(),
        rowGroupSizeThreshold);
  }
  columnStore = null;
  pageStore = null;
}
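The Math.min at the end is worth a comment: in parquet-mr, getNextRowGroupSize() reflects the writer's padding/alignment strategy, reporting roughly how much room remains before the next file-system block boundary. Capping it with the configured threshold keeps the next row group from straddling an HDFS block while still honoring the user's size target.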
public void appendRowGroup(SeekableInputStream from, BlockMetaData rowGroup, boolean dropColumns) throws IOException {
  startBlock(rowGroup.getRowCount());
  // ... excerpt: the method walks the row group's column chunks in schema order,
  // growing a byte range [start, start + length) while the chunks are contiguous ...
  if ((i + 1) == columnsInOrder.size()
      || columnsInOrder.get(i + 1).getStartingPos() != (start + length)) {
    // the next chunk does not start where the current run ends: copy the run now
    copy(from, out, start, length);
    // ... start/length are reset for the next run ...
  }
  // ...
  endBlock();
}
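The reconstructed condition above is the heart of the fast path: a run of column chunks keeps growing as long as each next chunk starts exactly where the run ends, and the whole run is then moved with a single copy() call rather than one copy per chunk. Chunks that are contiguous on disk therefore transfer at raw I/O speed, with no page decoding at all.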
private void flushRowGroup(boolean finished) {
  try {
    if (recordCount > 0) {
      writer.startBlock(recordCount);
      writeStore.flush();
      flushPageStoreToWriter.invoke(writer);
      writer.endBlock();
      if (!finished) {
        startRowGroup();
      }
    }
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to flush row group");
  }
}
// Excerpt from a writeColumnChunk-style helper; the leading parameters
// (page bytes, sizes, statistics, index builders) are elided in this excerpt.
void writeColumnChunk(ColumnDescriptor descriptor, long valueCount, CompressionCodecName compressionCodecName,
    DictionaryPage dictionaryPage, /* ... */ OffsetIndexBuilder offsetIndexBuilder,
    Set<Encoding> rlEncodings, Set<Encoding> dlEncodings, List<Encoding> dataEncodings) throws IOException {
  startColumn(descriptor, valueCount, compressionCodecName);
  if (dictionaryPage != null) { // guard restored; the excerpt called this unconditionally
    writeDictionaryPage(dictionaryPage);
  }
  // ... buffered data pages are written here ...
  this.offsetIndexBuilder = offsetIndexBuilder;
  // ...
  endColumn();
}
@Override
public void close() throws IOException {
  flushRowGroup(true);
  writeStore.close();
  writer.end(metadata);
}
}
@Override
public void write(Writable value) throws IOException {
  recordWriter.write(value);
  length = fileWriter.getPos();
}
public ParquetMetadata getFooter() {
  return parquetFileWriter.getFooter();
}
private void startRowGroup() {
  try {
    this.nextRowGroupSize = min(writer.getNextRowGroupSize(), targetRowGroupSize);
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }
  this.nextCheckRecordCount = min(max(recordCount / 2, 100), 10000);
  this.recordCount = 0;
  PageWriteStore pageStore = pageStoreCtor.newInstance(compressor, parquetSchema, props.getAllocator());
  this.flushPageStoreToWriter = flushToWriter.bind(pageStore);
  this.writeStore = props.newColumnWriteStore(parquetSchema, pageStore);
  model.setColumnStore(writeStore);
}
public void close() throws IOException, InterruptedException {
  if (!closed) {
    flushRowGroupToStore();
    FinalizedWriteContext finalWriteContext = writeSupport.finalizeWrite();
    Map<String, String> finalMetadata = new HashMap<String, String>(extraMetaData);
    String modelName = writeSupport.getName();
    if (modelName != null) {
      finalMetadata.put(ParquetWriter.OBJECT_MODEL_NAME_PROP, modelName);
    }
    finalMetadata.putAll(finalWriteContext.getExtraMetaData());
    parquetFileWriter.end(finalMetadata);
    closed = true;
  }
}
private void checkBlockSizeReached() throws IOException {
  if (recordCount >= recordCountForNextMemCheck) {
    // checking the memory size is relatively expensive, so let's not do it for every record.
    long memSize = columnStore.getBufferedSize();
    long recordSize = memSize / recordCount;
    // flush the row group if it is within ~2 records of the limit
    // it is much better to be slightly under size than to be over at all
    if (memSize > (nextRowGroupSize - 2 * recordSize)) {
      LOG.info("mem size {} > {}: flushing {} records to disk.", memSize, nextRowGroupSize, recordCount);
      flushRowGroupToStore();
      initStore();
      recordCountForNextMemCheck = min(max(MINIMUM_RECORD_COUNT_FOR_CHECK, recordCount / 2), MAXIMUM_RECORD_COUNT_FOR_CHECK);
      this.lastRowGroupEndPos = parquetFileWriter.getPos();
    } else {
      recordCountForNextMemCheck = min(
          max(MINIMUM_RECORD_COUNT_FOR_CHECK, (recordCount + (long) (nextRowGroupSize / ((float) recordSize))) / 2), // will check halfway
          recordCount + MAXIMUM_RECORD_COUNT_FOR_CHECK // will not look more than max records ahead
      );
      LOG.debug("Checked mem at {} will check again at: {}", recordCount, recordCountForNextMemCheck);
    }
  }
}
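To make the heuristic concrete with illustrative numbers: suppose 10,000 records have been written and getBufferedSize() reports 64 MB, so recordSize is about 6.7 kB. With nextRowGroupSize = 128 MB the flush test fails (64 MB is well under the limit minus two records' worth), and nextRowGroupSize / recordSize estimates that the group would be full at roughly 20,000 records; averaging with the current count schedules the next memory check at record 15,000, i.e. halfway to the projected limit, so the per-record cost of sizing stays negligible.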