/**
 * Builds the column IO structure for a requested schema against a file schema
 * using a freshly created {@link ColumnIOFactory}.
 *
 * <p>Note the argument order: this method takes the file schema first, while the
 * factory call it delegates to takes the requested schema first.
 *
 * @param fileSchema the schema of the file
 * @param requestedSchema the schema to build the column IO for
 * @return the column IO returned by the factory
 */
public static MessageColumnIO getColumnIO(MessageType fileSchema, MessageType requestedSchema) {
  final ColumnIOFactory factory = new ColumnIOFactory();
  // NOTE(review): the trailing 'true' looks like a strict type checking flag - confirm
  // against the three-argument overload.
  return factory.getColumnIO(requestedSchema, fileSchema, true);
}
/**
 * Delegates to the three-argument overload, passing {@code true} as its last argument.
 *
 * @param requestedSchema the schema we want to read/write
 * @param fileSchema the file schema (when reading it can be different from the requested schema)
 * @return the corresponding serializing/deserializing structure
 */
public MessageColumnIO getColumnIO(MessageType requestedSchema, MessageType fileSchema) {
  final MessageColumnIO columnIO = this.getColumnIO(requestedSchema, fileSchema, true);
  return columnIO;
}
/**
 * Convenience overload for the case where the requested schema and the file schema
 * are the same: the given schema is passed for both.
 *
 * @param schema the schema we want to read/write
 * @return the corresponding serializing/deserializing structure
 */
public MessageColumnIO getColumnIO(MessageType schema) {
  final MessageType requested = schema;
  final MessageType file = schema;
  return getColumnIO(requested, file);
}
/**
 * Returns the leaves of the column IO built for a requested schema against a file schema.
 *
 * <p>A new {@link ColumnIOFactory} is created for every call. The factory is invoked with
 * the requested schema first and the file schema second.
 *
 * @param fileSchema the schema of the file
 * @param requestedSchema the schema to build the column IO for
 * @return the result of {@code getLeaves()} on the resulting column IO
 */
public static List<PrimitiveColumnIO> getColumns(MessageType fileSchema, MessageType requestedSchema) {
  final ColumnIOFactory factory = new ColumnIOFactory();
  final MessageColumnIO columnIO = factory.getColumnIO(requestedSchema, fileSchema, true);
  return columnIO.getLeaves();
}
/**
 * Loads the next row group once every record loaded so far has been consumed.
 *
 * <p>Does nothing while {@code current} differs from {@code totalCountLoadedSoFar}.
 * Otherwise it logs timing statistics for the block just finished (skipped for the very
 * first block), reads the next row group, rebuilds {@code recordReader} for it and
 * updates the running counters.
 *
 * @throws IOException if the underlying reader has no further row group to return
 */
private void checkRead() throws IOException {
  if (current != totalCountLoadedSoFar) {
    return;
  }

  if (current != 0) {
    long timeAssembling = System.currentTimeMillis() - startedAssemblingCurrentBlockAt;
    totalTimeSpentProcessingRecords += timeAssembling;
    LOG.info("Assembled and processed " + totalCountLoadedSoFar + " records from " + columnCount + " columns in " + totalTimeSpentProcessingRecords + " ms: "+((float)totalCountLoadedSoFar / totalTimeSpentProcessingRecords) + " rec/ms, " + ((float)totalCountLoadedSoFar * columnCount / totalTimeSpentProcessingRecords) + " cell/ms");
    long totalTime = totalTimeSpentProcessingRecords + totalTimeSpentReadingBytes;
    // Both timers have millisecond resolution, so their sum can be 0 for small or fast
    // blocks. Integer division by zero would throw ArithmeticException, so the
    // percentage breakdown is only logged when there is a non-zero total.
    if (totalTime != 0) {
      long percentReading = 100 * totalTimeSpentReadingBytes / totalTime;
      long percentProcessing = 100 * totalTimeSpentProcessingRecords / totalTime;
      LOG.info("time spent so far " + percentReading + "% reading ("+totalTimeSpentReadingBytes+" ms) and " + percentProcessing + "% processing ("+totalTimeSpentProcessingRecords+" ms)");
    }
  }

  LOG.info("at row " + current + ". reading next block");
  long t0 = System.currentTimeMillis();
  PageReadStore pages = reader.readNextRowGroup();
  if (pages == null) {
    throw new IOException("expecting more rows but reached last block. Read " + current + " out of " + total);
  }
  long timeSpentReading = System.currentTimeMillis() - t0;
  totalTimeSpentReadingBytes += timeSpentReading;
  BenchmarkCounter.incrementTime(timeSpentReading);
  LOG.info("block read in memory in " + timeSpentReading + " ms. row count = " + pages.getRowCount());

  if (Log.DEBUG) LOG.debug("initializing Record assembly with requested schema " + requestedSchema);
  MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema);
  recordReader = columnIO.getRecordReader(pages, recordConverter, recordFilter);

  // Start timing record assembly for the block that was just loaded.
  startedAssemblingCurrentBlockAt = System.currentTimeMillis();
  totalCountLoadedSoFar += pages.getRowCount();
  ++ currentBlock;
}
/**
 * Builds the column IO for {@code schema} and creates the {@code ParquetWriteProtocol}
 * from the given consumer, that column IO and {@code thriftStruct}.
 *
 * @param recordConsumer the consumer handed to the write protocol
 */
@Override
public void prepareForWrite(RecordConsumer recordConsumer) {
  final ColumnIOFactory factory = new ColumnIOFactory();
  final MessageColumnIO messageColumnIO = factory.getColumnIO(schema);
  this.parquetWriteProtocol = new ParquetWriteProtocol(recordConsumer, messageColumnIO, thriftStruct);
}
if (Log.INFO) LOG.info("block read in memory in " + timeSpentReading + " ms. row count = " + pages.getRowCount()); if (Log.DEBUG) LOG.debug("initializing Record assembly with requested schema " + requestedSchema); MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema, strictTypeChecking); recordReader = columnIO.getRecordReader(pages, recordConverter, filter); startedAssemblingCurrentBlockAt = System.currentTimeMillis();
/**
 * Creates the column IO for the given schema using a new default {@link ColumnIOFactory}.
 *
 * @param schema the schema to build the column IO for
 * @return the column IO produced by the factory
 */
private static MessageColumnIO newColumnFactory(MessageType schema) {
  final ColumnIOFactory factory = new ColumnIOFactory();
  return factory.getColumnIO(schema);
}

private static void read(RecordReader<Object> recordReader, int count, MessageType schema) {
/**
 * Builds the column IO for {@code schema}, creates the {@code ParquetWriteProtocol}
 * from the given consumer, that column IO and {@code thriftStruct}, and then forwards
 * the same consumer to {@code thriftWriteSupport}.
 *
 * @param recordConsumer the consumer handed to both the write protocol and the
 *        wrapped write support
 */
@Override
public void prepareForWrite(RecordConsumer recordConsumer) {
  final ColumnIOFactory factory = new ColumnIOFactory();
  final MessageColumnIO messageColumnIO = factory.getColumnIO(schema);
  this.parquetWriteProtocol = new ParquetWriteProtocol(recordConsumer, messageColumnIO, thriftStruct);
  thriftWriteSupport.prepareForWrite(recordConsumer);
}
private void initStore() { // we don't want this number to be too small // ideally we divide the block equally across the columns // it is unlikely all columns are going to be the same size. int initialBlockBufferSize = max(MINIMUM_BUFFER_SIZE, blockSize / schema.getColumns().size() / 5); pageStore = new ColumnChunkPageWriteStore(compressor, schema, initialBlockBufferSize); // we don't want this number to be too small either // ideally, slightly bigger than the page size, but not bigger than the block buffer int initialPageBufferSize = max(MINIMUM_BUFFER_SIZE, min(pageSize + pageSize / 10, initialBlockBufferSize)); store = new ColumnWriteStoreImpl(pageStore, pageSize, initialPageBufferSize, dictionaryPageSize, enableDictionary, writerVersion); MessageColumnIO columnIO = new ColumnIOFactory(validating).getColumnIO(schema); writeSupport.prepareForWrite(columnIO.getRecordWriter(store)); }
/**
 * Creates the page store, obtains the column write store from {@code parquetProperties},
 * and hands a record writer backed by that column store to {@code writeSupport}.
 */
private void initStore() {
  pageStore = new ColumnChunkPageWriteStore(compressor, schema, pageSize);
  columnStore = parquetProperties.newColumnWriteStore(schema, pageStore, pageSize);

  final ColumnIOFactory ioFactory = new ColumnIOFactory(validating);
  final MessageColumnIO messageColumnIO = ioFactory.getColumnIO(schema);
  writeSupport.prepareForWrite(messageColumnIO.getRecordWriter(columnStore));
}
footer.getFileMetaData().getSchema(), mappingConfiguration); this.columnIo = new ColumnIOFactory().getColumnIO( materializer.getMaterializeSchema(), footer.getFileMetaData().getSchema());
/**
 * Writes {@code r1} into an in-memory page store, then reads it back through a
 * converter constructed from the queue of expected events for that record.
 */
@Test
public void testPushParser() {
  final MemPageStore pageStore = new MemPageStore();
  final MemColumnWriteStore writeStore = new MemColumnWriteStore(pageStore, 800);
  final MessageColumnIO messageColumnIO = new ColumnIOFactory().getColumnIO(schema);

  // Write the record and flush it to the page store.
  final GroupWriter writer = new GroupWriter(messageColumnIO.getRecordWriter(writeStore), schema);
  writer.write(r1);
  writeStore.flush();

  // Queue the expected events in order.
  final Deque<String> expectations = new ArrayDeque<String>();
  for (String expectedEvent : expectedEventsForR1) {
    expectations.add(expectedEvent);
  }

  final ExpectationValidatingConverter converter = new ExpectationValidatingConverter(expectations, schema);
  final RecordReader<Void> recordReader = messageColumnIO.getRecordReader(pageStore, converter);
  recordReader.read();
}
MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema); GroupWriter groupWriter = new GroupWriter(columnIO.getRecordWriter(columns), schema); groupWriter.write(r1);
MemColumnWriteStore columns = new MemColumnWriteStore(memPageStore, 800); ColumnIOFactory columnIOFactory = new ColumnIOFactory(true); MessageColumnIO columnIO = columnIOFactory.getColumnIO(schema); log(columnIO); GroupWriter groupWriter = new GroupWriter(columnIO.getRecordWriter(columns), schema); MessageColumnIO columnIO2 = columnIOFactory.getColumnIO(schema2);