public void flushToFileWriter(ParquetFileWriter writer) throws IOException {
    for (ColumnDescriptor path : schema.getColumns()) {
        ColumnChunkPageWriter pageWriter = writers.get(path);
        pageWriter.writeToFileWriter(writer);
    }
}
public ColumnChunkPageWriteStore(BytesCompressor compressor, MessageType schema, int pageSize) {
    this.schema = schema;
    for (ColumnDescriptor path : schema.getColumns()) {
        writers.put(path, new ColumnChunkPageWriter(path, compressor, pageSize));
    }
}
private void initializeColumnReaders() {
    for (ColumnDescriptor column : requestedSchema.getColumns()) {
        columnReadersMap.put(column, ParquetColumnReader.createReader(column));
    }
}
public void flushToFileWriter(ParquetFileWriter writer) throws IOException {
    List<ColumnDescriptor> columns = schema.getColumns();
    for (ColumnDescriptor columnDescriptor : columns) {
        ColumnChunkPageWriter pageWriter = writers.get(columnDescriptor);
        pageWriter.writeToFileWriter(writer);
    }
}
private static int lookupParquetColumn(HiveColumnHandle column, MessageType fileSchema) {
    // A map column occupies more than one primitive column in the Parquet file,
    // so a column's ordinal number does not always equal its Hive column index;
    // look the column up by name in the Parquet file schema instead.
    int parquetFieldIndex = 0;
    for (; parquetFieldIndex < fileSchema.getColumns().size(); parquetFieldIndex++) {
        String[] path = fileSchema.getColumns().get(parquetFieldIndex).getPath();
        String columnName = path[path.length - 1];
        if (column.getName().equals(columnName)) {
            break;
        }
    }
    return parquetFieldIndex;
}
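// The lookup above depends on a Presto-internal type (HiveColumnHandle). Below is a
// minimal, self-contained sketch of the same name-based lookup, with a plain String
// standing in for the handle; the class name, method name, and schema literal are
// illustrative only, and packages assume org.apache.parquet (older releases use the
// parquet.* prefix):
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ColumnLookupSketch {
    // returns getColumns().size() when no leaf matches, mirroring the loop above
    static int lookupByLeafName(String name, MessageType fileSchema) {
        int index = 0;
        for (; index < fileSchema.getColumns().size(); index++) {
            String[] path = fileSchema.getColumns().get(index).getPath();
            if (name.equals(path[path.length - 1])) {
                break; // matched the last path element (the primitive leaf name)
            }
        }
        return index;
    }

    public static void main(String[] args) {
        // a MAP column flattens to two primitive columns (key and value)
        MessageType schema = MessageTypeParser.parseMessageType(
                "message m {"
                + "  required int32 id;"
                + "  optional group tags (MAP) {"
                + "    repeated group key_value {"
                + "      required binary key (UTF8);"
                + "      optional binary value (UTF8);"
                + "    }"
                + "  }"
                + "}");
        System.out.println(lookupByLeafName("key", schema)); // prints 1
    }
}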
public ColumnWriteStoreV2(
        MessageType schema,
        PageWriteStore pageWriteStore,
        int pageSizeThreshold,
        ParquetProperties parquetProps) {
    super();
    this.pageSizeThreshold = pageSizeThreshold;
    this.thresholdTolerance = (long) (pageSizeThreshold * THRESHOLD_TOLERANCE_RATIO);
    Map<ColumnDescriptor, ColumnWriterV2> mcolumns = new TreeMap<ColumnDescriptor, ColumnWriterV2>();
    for (ColumnDescriptor path : schema.getColumns()) {
        PageWriter pageWriter = pageWriteStore.getPageWriter(path);
        mcolumns.put(path, new ColumnWriterV2(path, pageWriter, parquetProps, pageSizeThreshold));
    }
    this.columns = unmodifiableMap(mcolumns);
    this.writers = this.columns.values();
}
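// ColumnWriteStoreV2 and the page write stores above are parquet-mr internals;
// applications normally reach them through ParquetWriter. A minimal write-path
// sketch using the example Group API (the file path and schema literal are
// made-up values, not from the source):
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class SimpleParquetWrite {
    public static void main(String[] args) throws Exception {
        MessageType schema = MessageTypeParser.parseMessageType(
                "message m { required int64 id; required binary name (UTF8); }");
        try (ParquetWriter<Group> writer = ExampleParquetWriter
                .builder(new Path("/tmp/example.parquet"))
                .withType(schema)
                .build()) {
            Group record = new SimpleGroupFactory(schema).newGroup()
                    .append("id", 1L)
                    .append("name", "alice");
            writer.write(record); // one column writer per leaf is created internally
        }
    }
}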
public int nextBatch() throws IOException, InterruptedException {
    if (nextRowInGroup >= currentGroupRowCount) {
        if (!advanceToNextRowGroup()) {
            return -1;
        }
    }
    int batchSize = Ints.checkedCast(Math.min(MAX_VECTOR_LENGTH, currentGroupRowCount - nextRowInGroup));
    nextRowInGroup += batchSize;
    currentPosition += batchSize;
    for (ColumnDescriptor column : requestedSchema.getColumns()) {
        ParquetColumnReader columnReader = columnReadersMap.get(column);
        columnReader.prepareNextRead(batchSize);
    }
    return batchSize;
}
private ParquetFileReader createFileReader(ParquetMetadata meta, List<BlockMetaData> blocks) throws IOException {
    FileMetaData fileMetaData = meta.getFileMetaData();
    if (FILE_READER_NEWER_CTOR != null) {
        try {
            return FILE_READER_NEWER_CTOR.newInstance(
                    hadoopConfiguration, fileMetaData, path, blocks, fileMetaData.getSchema().getColumns());
        } catch (ReflectiveOperationException | IllegalArgumentException | SecurityException e) {
            LOG.debug("failed ParquetFileReader.<init>", e);
        }
    }
    return new ParquetFileReader(
            hadoopConfiguration, path, blocks, fileMetaData.getSchema().getColumns());
}
maxColCount = Math.max(w.getSchema().getColumns().size(), maxColCount);
private void initStore() {
    // We don't want this number to be too small: ideally we would divide the block
    // equally across the columns, but it is unlikely they will all be the same size.
    int initialBlockBufferSize = max(MINIMUM_BUFFER_SIZE, blockSize / schema.getColumns().size() / 5);
    pageStore = new ColumnChunkPageWriteStore(compressor, schema, initialBlockBufferSize);
    // We don't want this number to be too small either: ideally slightly bigger than
    // the page size, but not bigger than the block buffer.
    int initialPageBufferSize = max(MINIMUM_BUFFER_SIZE, min(pageSize + pageSize / 10, initialBlockBufferSize));
    store = new ColumnWriteStoreImpl(pageStore, pageSize, initialPageBufferSize, dictionaryPageSize,
            enableDictionary, writerVersion);
    MessageColumnIO columnIO = new ColumnIOFactory(validating).getColumnIO(schema);
    writeSupport.prepareForWrite(columnIO.getRecordWriter(store));
}
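// A worked example of the sizing heuristic above, under assumed values (128 MB block,
// 1 MB page, 10 columns, 64 KB MINIMUM_BUFFER_SIZE -- none of these are from the source):
//   initialBlockBufferSize = max(64 KB, 128 MB / 10 / 5)                 = 2,684,354 B (~2.6 MB)
//   initialPageBufferSize  = max(64 KB, min(1 MB + 1 MB / 10, ~2.6 MB))  = 1,153,433 B (~1.1 MB)
// Each column chunk buffer starts at a fifth of its "fair share" of the block, and each
// page buffer starts slightly above one page, so buffers grow on demand rather than
// over-allocating up front.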
private static List<Mapping> computeMappingByPosition(
        DataModelDescriptor target, MessageType source) {
    if (LOG.isDebugEnabled()) {
        LOG.debug(MessageFormat.format(
                "Mapping columns by their position: model={0}", //$NON-NLS-1$
                target.getDataModelClass().getName()));
    }
    List<ColumnDescriptor> sources = source.getColumns();
    List<? extends PropertyDescriptor> targets = target.getPropertyDescriptors();
    List<Mapping> mappings = new ArrayList<>();
    int limit = Math.min(sources.size(), targets.size());
    // pair columns with properties positionally while both sides have entries
    for (int i = 0; i < limit; i++) {
        ColumnDescriptor s = sources.get(i);
        Type sType = source.getType(s.getPath());
        PropertyDescriptor t = targets.get(i);
        mappings.add(new Mapping(s, sType, t));
    }
    // surplus source columns map to no property
    for (int i = limit, n = sources.size(); i < n; i++) {
        ColumnDescriptor s = sources.get(i);
        Type sType = source.getType(s.getPath());
        mappings.add(new Mapping(s, sType, null));
    }
    // surplus properties map to no column
    for (int i = limit, n = targets.size(); i < n; i++) {
        mappings.add(new Mapping(null, null, targets.get(i)));
    }
    return mappings;
}
private SchemaCompatibilityValidator(MessageType schema) {
    for (ColumnDescriptor cd : schema.getColumns()) {
        ColumnPath columnPath = ColumnPath.get(cd.getPath());
        columnsAccordingToSchema.put(columnPath, cd);
        OriginalType ot = schema.getType(cd.getPath()).getOriginalType();
        if (ot != null) {
            originalTypes.put(columnPath, ot);
        }
    }
}
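// A runnable sketch of the same path/original-type bookkeeping, using only public
// schema APIs (the class name and schema literal are illustrative, not from the source):
import java.util.HashMap;
import java.util.Map;

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import org.apache.parquet.schema.OriginalType;

public class OriginalTypeLookup {
    public static void main(String[] args) {
        MessageType schema = MessageTypeParser.parseMessageType(
                "message m { required binary name (UTF8); required int32 age; }");
        Map<ColumnPath, OriginalType> originalTypes = new HashMap<>();
        for (ColumnDescriptor cd : schema.getColumns()) {
            OriginalType ot = schema.getType(cd.getPath()).getOriginalType();
            if (ot != null) { // leaves without a logical annotation have no OriginalType
                originalTypes.put(ColumnPath.get(cd.getPath()), ot);
            }
        }
        System.out.println(originalTypes); // only [name] carries UTF8; [age] has none
    }
}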
private static List<Mapping> computeMappingByName(
        DataModelDescriptor target, MessageType source) {
    if (LOG.isDebugEnabled()) {
        LOG.debug(MessageFormat.format(
                "Mapping columns by their name: model={0}", //$NON-NLS-1$
                target.getDataModelClass().getName()));
    }
    Set<PropertyDescriptor> rest = new LinkedHashSet<>(target.getPropertyDescriptors());
    List<Mapping> mappings = new ArrayList<>();
    for (ColumnDescriptor s : source.getColumns()) {
        String name = s.getPath()[0];
        Type sType = source.getType(s.getPath());
        PropertyDescriptor t = target.findPropertyDescriptor(name);
        if (t != null) {
            mappings.add(new Mapping(s, sType, t));
            rest.remove(t);
        } else {
            // column present in the file but absent from the model
            mappings.add(new Mapping(s, sType, null));
        }
    }
    // properties that never matched any column
    for (PropertyDescriptor t : rest) {
        mappings.add(new Mapping(null, null, t));
    }
    return mappings;
}
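// Both mapping strategies above key off ColumnDescriptor.getPath(): element [0] is the
// top-level field name, and MessageType.getType(path) resolves the leaf type. A small
// self-contained illustration (class name and schema literal are made up; exact output
// formatting varies by Parquet version):
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class PathInspection {
    public static void main(String[] args) {
        MessageType schema = MessageTypeParser.parseMessageType(
                "message m { required int64 id; optional group address { optional binary city (UTF8); } }");
        for (ColumnDescriptor cd : schema.getColumns()) {
            System.out.println(String.join(".", cd.getPath())
                    + " -> top-level field '" + cd.getPath()[0]
                    + "', leaf " + schema.getType(cd.getPath()));
        }
        // id           -> top-level field 'id',      leaf required int64 id
        // address.city -> top-level field 'address', leaf optional binary city (UTF8)
    }
}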
public void initialize(MessageType requestedSchema, MessageType fileSchema,
        Map<String, String> extraMetadata, Map<String, String> readSupportMetadata,
        Path file, List<BlockMetaData> blocks, Configuration configuration)
        throws IOException {
    this.requestedSchema = requestedSchema;
    this.fileSchema = fileSchema;
    this.file = file;
    this.columnCount = this.requestedSchema.getPaths().size();
    this.recordConverter = readSupport.prepareForRead(
            configuration, extraMetadata, fileSchema,
            new ReadSupport.ReadContext(requestedSchema, readSupportMetadata));
    List<ColumnDescriptor> columns = requestedSchema.getColumns();
    reader = new ParquetFileReader(configuration, file, blocks, columns);
    for (BlockMetaData block : blocks) {
        total += block.getRowCount();
    }
    LOG.info("RecordReader initialized; will read a total of " + total + " records.");
}
ColumnDescriptor columnDescriptor = requestedSchema.getColumns().get(fieldIndex);
blocks[fieldId] = new LazyBlock(batchSize, new ParquetBlockLoader(columnDescriptor, type));
ColumnChunkMetaData columnChunkMetaData = blockMetadata.getColumns().get(ordinal);
for (int i = 0; i < requestedSchema.getColumns().size(); i++) {
    ColumnDescriptor columnDescriptor = requestedSchema.getColumns().get(i);
    if (isColumnPredicate(columnDescriptor, effectivePredicate)
            && columnChunkMetaData.getPath().equals(ColumnPath.get(columnDescriptor.getPath()))
            &&
public void initialize(MessageType fileSchema, Map<String, String> fileMetadata,
        Path file, List<BlockMetaData> blocks, Configuration configuration)
        throws IOException {
    // initialize a ReadContext for this file
    ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
            configuration, toSetMultiMap(fileMetadata), fileSchema));
    this.requestedSchema = readContext.getRequestedSchema();
    this.fileSchema = fileSchema;
    this.file = file;
    this.columnCount = requestedSchema.getPaths().size();
    this.recordConverter = readSupport.prepareForRead(
            configuration, fileMetadata, fileSchema, readContext);
    this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
    List<ColumnDescriptor> columns = requestedSchema.getColumns();
    reader = new ParquetFileReader(configuration, file, blocks, columns);
    for (BlockMetaData block : blocks) {
        total += block.getRowCount();
    }
    LOG.info("RecordReader initialized; will read a total of " + total + " records.");
}
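// The two initialize(...) variants above wire up a ParquetFileReader by hand inside a
// record reader. For comparison, a minimal sketch of the higher-level parquet-mr read
// path, which performs the same ReadSupport.init / prepareForRead handshake internally
// (the file path is hypothetical; the builder(ReadSupport, Path) overload is deprecated
// in newer releases but still present):
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;

public class SimpleParquetRead {
    public static void main(String[] args) throws Exception {
        try (ParquetReader<Group> reader =
                ParquetReader.builder(new GroupReadSupport(), new Path("/tmp/example.parquet")).build()) {
            for (Group record; (record = reader.read()) != null; ) {
                System.out.println(record); // read() returns null once all row groups are consumed
            }
        }
    }
}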
Iterator<?> iterator = expectedValues.iterator();
for (int batchSize = parquetReader.nextBatch();
        batchSize >= 0;
        batchSize = parquetReader.nextBatch()) {
    ColumnDescriptor columnDescriptor = fileSchema.getColumns().get(0);
    Block block = parquetReader.readBlock(columnDescriptor, type);
    for (int i = 0; i < batchSize; i++) {
@Test
public void testMultiPartitionKeySync()
        throws IOException, InitializationError, URISyntaxException, TException, InterruptedException {
    String commitTime = "100";
    TestUtil.createCOWDataset(commitTime, 5);
    HiveSyncConfig hiveSyncConfig = HiveSyncConfig.copy(TestUtil.hiveSyncConfig);
    hiveSyncConfig.partitionValueExtractorClass = MultiPartKeysValueExtractor.class.getCanonicalName();
    hiveSyncConfig.tableName = "multi_part_key";
    hiveSyncConfig.partitionFields = Lists.newArrayList("year", "month", "day");
    TestUtil.getCreatedTablesSet().add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
    HoodieHiveClient hiveClient =
            new HoodieHiveClient(hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
    assertFalse("Table " + hiveSyncConfig.tableName + " should not exist initially",
            hiveClient.doesTableExist());
    // Let's do the sync
    HiveSyncTool tool = new HiveSyncTool(hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
    tool.syncHoodieTable();
    assertTrue("Table " + hiveSyncConfig.tableName + " should exist after sync completes",
            hiveClient.doesTableExist());
    // the three partition fields (year, month, day) are appended to the data schema
    assertEquals("Hive Schema should match the dataset schema + partition fields",
            hiveClient.getTableSchema().size(), hiveClient.getDataSchema().getColumns().size() + 3);
    assertEquals("Table partitions should match the number of partitions we wrote", 5,
            hiveClient.scanTablePartitions().size());
    assertEquals("The last commit that was synced should be updated in the TBLPROPERTIES", commitTime,
            hiveClient.getLastCommitTimeSynced().get());
}
@Test
public void testBasicSync()
        throws IOException, InitializationError, URISyntaxException, TException, InterruptedException {
    String commitTime = "100";
    TestUtil.createCOWDataset(commitTime, 5);
    HoodieHiveClient hiveClient =
            new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
    assertFalse("Table " + TestUtil.hiveSyncConfig.tableName + " should not exist initially",
            hiveClient.doesTableExist());
    // Let's do the sync
    HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
    tool.syncHoodieTable();
    assertTrue("Table " + TestUtil.hiveSyncConfig.tableName + " should exist after sync completes",
            hiveClient.doesTableExist());
    // a single partition field is appended to the data schema
    assertEquals("Hive Schema should match the dataset schema + partition field",
            hiveClient.getTableSchema().size(), hiveClient.getDataSchema().getColumns().size() + 1);
    assertEquals("Table partitions should match the number of partitions we wrote", 5,
            hiveClient.scanTablePartitions().size());
    assertEquals("The last commit that was synced should be updated in the TBLPROPERTIES", commitTime,
            hiveClient.getLastCommitTimeSynced().get());
}