public void flushToFileWriter(ParquetFileWriter writer) throws IOException {
    for (ColumnDescriptor path : schema.getColumns()) {
        ColumnChunkPageWriter pageWriter = writers.get(path);
        pageWriter.writeToFileWriter(writer);
    }
}
public ColumnChunkPageWriteStore(BytesCompressor compressor, MessageType schema, int pageSize) {
    this.schema = schema;
    for (ColumnDescriptor path : schema.getColumns()) {
        writers.put(path, new ColumnChunkPageWriter(path, compressor, pageSize));
    }
}
private void initializeColumnReaders() {
    for (ColumnDescriptor column : requestedSchema.getColumns()) {
        columnReadersMap.put(column, ParquetColumnReader.createReader(column));
    }
}
public void flushToFileWriter(ParquetFileWriter writer) throws IOException {
    List<ColumnDescriptor> columns = schema.getColumns();
    for (ColumnDescriptor columnDescriptor : columns) {
        ColumnChunkPageWriter pageWriter = writers.get(columnDescriptor);
        pageWriter.writeToFileWriter(writer);
    }
}
private static int lookupParquetColumn(HiveColumnHandle column, MessageType fileSchema) {
    // A map column occupies more than one primitive column in the Parquet file,
    // so a column's ordinal number does not always equal its Hive column index;
    // look the column up by name in the Parquet file schema instead.
    int parquetFieldIndex = 0;
    for (; parquetFieldIndex < fileSchema.getColumns().size(); parquetFieldIndex++) {
        String[] path = fileSchema.getColumns().get(parquetFieldIndex).getPath();
        String columnName = path[path.length - 1];
        if (column.getName().equals(columnName)) {
            break;
        }
    }
    return parquetFieldIndex;
}
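// The lookup above depends on a Presto-internal type (HiveColumnHandle). Below is a
// minimal, self-contained sketch of the same name-based lookup, with a plain String
// standing in for the handle; the class name, method name, and schema literal are
// illustrative only, and packages assume org.apache.parquet (older releases use the
// parquet.* prefix):
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ColumnLookupSketch {
    // returns getColumns().size() when no leaf matches, mirroring the loop above
    static int lookupByLeafName(String name, MessageType fileSchema) {
        int index = 0;
        for (; index < fileSchema.getColumns().size(); index++) {
            String[] path = fileSchema.getColumns().get(index).getPath();
            if (name.equals(path[path.length - 1])) {
                break; // matched the last path element (the primitive leaf name)
            }
        }
        return index;
    }

    public static void main(String[] args) {
        // a MAP column flattens to two primitive columns (key and value)
        MessageType schema = MessageTypeParser.parseMessageType(
                "message m {"
                + "  required int32 id;"
                + "  optional group tags (MAP) {"
                + "    repeated group key_value {"
                + "      required binary key (UTF8);"
                + "      optional binary value (UTF8);"
                + "    }"
                + "  }"
                + "}");
        System.out.println(lookupByLeafName("key", schema)); // prints 1
    }
}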
public ColumnWriteStoreV2(
        MessageType schema,
        PageWriteStore pageWriteStore,
        int pageSizeThreshold,
        ParquetProperties parquetProps) {
    super();
    this.pageSizeThreshold = pageSizeThreshold;
    this.thresholdTolerance = (long) (pageSizeThreshold * THRESHOLD_TOLERANCE_RATIO);
    Map<ColumnDescriptor, ColumnWriterV2> mcolumns = new TreeMap<ColumnDescriptor, ColumnWriterV2>();
    for (ColumnDescriptor path : schema.getColumns()) {
        PageWriter pageWriter = pageWriteStore.getPageWriter(path);
        mcolumns.put(path, new ColumnWriterV2(path, pageWriter, parquetProps, pageSizeThreshold));
    }
    this.columns = unmodifiableMap(mcolumns);
    this.writers = this.columns.values();
}
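// ColumnWriteStoreV2 and the page write stores above are parquet-mr internals;
// applications normally reach them through ParquetWriter. A minimal write-path
// sketch using the example Group API (the file path and schema literal are
// made-up values, not from the source):
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class SimpleParquetWrite {
    public static void main(String[] args) throws Exception {
        MessageType schema = MessageTypeParser.parseMessageType(
                "message m { required int64 id; required binary name (UTF8); }");
        try (ParquetWriter<Group> writer = ExampleParquetWriter
                .builder(new Path("/tmp/example.parquet"))
                .withType(schema)
                .build()) {
            Group record = new SimpleGroupFactory(schema).newGroup()
                    .append("id", 1L)
                    .append("name", "alice");
            writer.write(record); // one column writer per leaf is created internally
        }
    }
}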
public int nextBatch() throws IOException, InterruptedException {
    if (nextRowInGroup >= currentGroupRowCount) {
        if (!advanceToNextRowGroup()) {
            return -1;
        }
    }
    int batchSize = Ints.checkedCast(Math.min(MAX_VECTOR_LENGTH, currentGroupRowCount - nextRowInGroup));
    nextRowInGroup += batchSize;
    currentPosition += batchSize;
    for (ColumnDescriptor column : requestedSchema.getColumns()) {
        ParquetColumnReader columnReader = columnReadersMap.get(column);
        columnReader.prepareNextRead(batchSize);
    }
    return batchSize;
}
private ParquetFileReader createFileReader(ParquetMetadata meta, List<BlockMetaData> blocks) throws IOException {
    FileMetaData fileMetaData = meta.getFileMetaData();
    if (FILE_READER_NEWER_CTOR != null) {
        try {
            return FILE_READER_NEWER_CTOR.newInstance(
                    hadoopConfiguration, fileMetaData, path, blocks, fileMetaData.getSchema().getColumns());
        } catch (ReflectiveOperationException | IllegalArgumentException | SecurityException e) {
            LOG.debug("failed ParquetFileReader.<init>", e);
        }
    }
    return new ParquetFileReader(
            hadoopConfiguration, path, blocks, fileMetaData.getSchema().getColumns());
}
maxColCount = Math.max(w.getSchema().getColumns().size(), maxColCount);
private void initStore() {
    // We don't want this number to be too small: ideally we would divide the block
    // equally across the columns, but it is unlikely they will all be the same size.
    int initialBlockBufferSize = max(MINIMUM_BUFFER_SIZE, blockSize / schema.getColumns().size() / 5);
    pageStore = new ColumnChunkPageWriteStore(compressor, schema, initialBlockBufferSize);
    // We don't want this number to be too small either: ideally slightly bigger than
    // the page size, but not bigger than the block buffer.
    int initialPageBufferSize = max(MINIMUM_BUFFER_SIZE, min(pageSize + pageSize / 10, initialBlockBufferSize));
    store = new ColumnWriteStoreImpl(pageStore, pageSize, initialPageBufferSize, dictionaryPageSize,
            enableDictionary, writerVersion);
    MessageColumnIO columnIO = new ColumnIOFactory(validating).getColumnIO(schema);
    writeSupport.prepareForWrite(columnIO.getRecordWriter(store));
}
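// A worked example of the sizing heuristic above, under assumed values (128 MB block,
// 1 MB page, 10 columns, 64 KB MINIMUM_BUFFER_SIZE -- none of these are from the source):
//   initialBlockBufferSize = max(64 KB, 128 MB / 10 / 5)                 = 2,684,354 B (~2.6 MB)
//   initialPageBufferSize  = max(64 KB, min(1 MB + 1 MB / 10, ~2.6 MB))  = 1,153,433 B (~1.1 MB)
// Each column chunk buffer starts at a fifth of its "fair share" of the block, and each
// page buffer starts slightly above one page, so buffers grow on demand rather than
// over-allocating up front.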
private static List<Mapping> computeMappingByPosition(
        DataModelDescriptor target, MessageType source) {
    if (LOG.isDebugEnabled()) {
        LOG.debug(MessageFormat.format(
                "Mapping columns by their position: model={0}", //$NON-NLS-1$
                target.getDataModelClass().getName()));
    }
    List<ColumnDescriptor> sources = source.getColumns();
    List<? extends PropertyDescriptor> targets = target.getPropertyDescriptors();
    List<Mapping> mappings = new ArrayList<>();
    int limit = Math.min(sources.size(), targets.size());
    // pair columns with properties positionally while both sides have entries
    for (int i = 0; i < limit; i++) {
        ColumnDescriptor s = sources.get(i);
        Type sType = source.getType(s.getPath());
        PropertyDescriptor t = targets.get(i);
        mappings.add(new Mapping(s, sType, t));
    }
    // surplus source columns map to no property
    for (int i = limit, n = sources.size(); i < n; i++) {
        ColumnDescriptor s = sources.get(i);
        Type sType = source.getType(s.getPath());
        mappings.add(new Mapping(s, sType, null));
    }
    // surplus properties map to no column
    for (int i = limit, n = targets.size(); i < n; i++) {
        mappings.add(new Mapping(null, null, targets.get(i)));
    }
    return mappings;
}
private SchemaCompatibilityValidator(MessageType schema) {
    for (ColumnDescriptor cd : schema.getColumns()) {
        ColumnPath columnPath = ColumnPath.get(cd.getPath());
        columnsAccordingToSchema.put(columnPath, cd);
        OriginalType ot = schema.getType(cd.getPath()).getOriginalType();
        if (ot != null) {
            originalTypes.put(columnPath, ot);
        }
    }
}
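// A runnable sketch of the same path/original-type bookkeeping, using only public
// schema APIs (the class name and schema literal are illustrative, not from the source):
import java.util.HashMap;
import java.util.Map;

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import org.apache.parquet.schema.OriginalType;

public class OriginalTypeLookup {
    public static void main(String[] args) {
        MessageType schema = MessageTypeParser.parseMessageType(
                "message m { required binary name (UTF8); required int32 age; }");
        Map<ColumnPath, OriginalType> originalTypes = new HashMap<>();
        for (ColumnDescriptor cd : schema.getColumns()) {
            OriginalType ot = schema.getType(cd.getPath()).getOriginalType();
            if (ot != null) { // leaves without a logical annotation have no OriginalType
                originalTypes.put(ColumnPath.get(cd.getPath()), ot);
            }
        }
        System.out.println(originalTypes); // only [name] carries UTF8; [age] has none
    }
}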
private static List<Mapping> computeMappingByName(
        DataModelDescriptor target, MessageType source) {
    if (LOG.isDebugEnabled()) {
        LOG.debug(MessageFormat.format(
                "Mapping columns by their name: model={0}", //$NON-NLS-1$
                target.getDataModelClass().getName()));
    }
    Set<PropertyDescriptor> rest = new LinkedHashSet<>(target.getPropertyDescriptors());
    List<Mapping> mappings = new ArrayList<>();
    for (ColumnDescriptor s : source.getColumns()) {
        String name = s.getPath()[0];
        Type sType = source.getType(s.getPath());
        PropertyDescriptor t = target.findPropertyDescriptor(name);
        if (t != null) {
            mappings.add(new Mapping(s, sType, t));
            rest.remove(t);
        } else {
            // column present in the file but absent from the model
            mappings.add(new Mapping(s, sType, null));
        }
    }
    // properties that never matched any column
    for (PropertyDescriptor t : rest) {
        mappings.add(new Mapping(null, null, t));
    }
    return mappings;
}
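// Both mapping strategies above key off ColumnDescriptor.getPath(): element [0] is the
// top-level field name, and MessageType.getType(path) resolves the leaf type. A small
// self-contained illustration (class name and schema literal are made up; exact output
// formatting varies by Parquet version):
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class PathInspection {
    public static void main(String[] args) {
        MessageType schema = MessageTypeParser.parseMessageType(
                "message m { required int64 id; optional group address { optional binary city (UTF8); } }");
        for (ColumnDescriptor cd : schema.getColumns()) {
            System.out.println(String.join(".", cd.getPath())
                    + " -> top-level field '" + cd.getPath()[0]
                    + "', leaf " + schema.getType(cd.getPath()));
        }
        // id           -> top-level field 'id',      leaf required int64 id
        // address.city -> top-level field 'address', leaf optional binary city (UTF8)
    }
}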
public void initialize(MessageType requestedSchema, MessageType fileSchema,
        Map<String, String> extraMetadata, Map<String, String> readSupportMetadata,
        Path file, List<BlockMetaData> blocks, Configuration configuration)
        throws IOException {
    this.requestedSchema = requestedSchema;
    this.fileSchema = fileSchema;
    this.file = file;
    this.columnCount = this.requestedSchema.getPaths().size();
    this.recordConverter = readSupport.prepareForRead(
            configuration, extraMetadata, fileSchema,
            new ReadSupport.ReadContext(requestedSchema, readSupportMetadata));
    List<ColumnDescriptor> columns = requestedSchema.getColumns();
    reader = new ParquetFileReader(configuration, file, blocks, columns);
    for (BlockMetaData block : blocks) {
        total += block.getRowCount();
    }
    LOG.info("RecordReader initialized; will read a total of " + total + " records.");
}
ColumnDescriptor columnDescriptor = requestedSchema.getColumns().get(fieldIndex);
blocks[fieldId] = new LazyBlock(batchSize, new ParquetBlockLoader(columnDescriptor, type));
ColumnChunkMetaData columnChunkMetaData = blockMetadata.getColumns().get(ordinal);
for (int i = 0; i < requestedSchema.getColumns().size(); i++) {
    ColumnDescriptor columnDescriptor = requestedSchema.getColumns().get(i);
    if (isColumnPredicate(columnDescriptor, effectivePredicate)
            && columnChunkMetaData.getPath().equals(ColumnPath.get(columnDescriptor.getPath()))
            &&
public void initialize(MessageType fileSchema, Map<String, String> fileMetadata,
        Path file, List<BlockMetaData> blocks, Configuration configuration)
        throws IOException {
    // initialize a ReadContext for this file
    ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
            configuration, toSetMultiMap(fileMetadata), fileSchema));
    this.requestedSchema = readContext.getRequestedSchema();
    this.fileSchema = fileSchema;
    this.file = file;
    this.columnCount = requestedSchema.getPaths().size();
    this.recordConverter = readSupport.prepareForRead(
            configuration, fileMetadata, fileSchema, readContext);
    this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
    List<ColumnDescriptor> columns = requestedSchema.getColumns();
    reader = new ParquetFileReader(configuration, file, blocks, columns);
    for (BlockMetaData block : blocks) {
        total += block.getRowCount();
    }
    LOG.info("RecordReader initialized; will read a total of " + total + " records.");
}
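// The two initialize(...) variants above wire up a ParquetFileReader by hand inside a
// record reader. For comparison, a minimal sketch of the higher-level parquet-mr read
// path, which performs the same ReadSupport.init / prepareForRead handshake internally
// (the file path is hypothetical; the builder(ReadSupport, Path) overload is deprecated
// in newer releases but still present):
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;

public class SimpleParquetRead {
    public static void main(String[] args) throws Exception {
        try (ParquetReader<Group> reader =
                ParquetReader.builder(new GroupReadSupport(), new Path("/tmp/example.parquet")).build()) {
            for (Group record; (record = reader.read()) != null; ) {
                System.out.println(record); // read() returns null once all row groups are consumed
            }
        }
    }
}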
Iterator<?> iterator = expectedValues.iterator();
for (int batchSize = parquetReader.nextBatch();
        batchSize >= 0;
        batchSize = parquetReader.nextBatch()) {
    ColumnDescriptor columnDescriptor = fileSchema.getColumns().get(0);
    Block block = parquetReader.readBlock(columnDescriptor, type);
    for (int i = 0; i < batchSize; i++) {
@Test
public void testMultiPartitionKeySync()
        throws IOException, InitializationError, URISyntaxException, TException, InterruptedException {
    String commitTime = "100";
    TestUtil.createCOWDataset(commitTime, 5);
    HiveSyncConfig hiveSyncConfig = HiveSyncConfig.copy(TestUtil.hiveSyncConfig);
    hiveSyncConfig.partitionValueExtractorClass = MultiPartKeysValueExtractor.class.getCanonicalName();
    hiveSyncConfig.tableName = "multi_part_key";
    hiveSyncConfig.partitionFields = Lists.newArrayList("year", "month", "day");
    TestUtil.getCreatedTablesSet().add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
    HoodieHiveClient hiveClient =
            new HoodieHiveClient(hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
    assertFalse("Table " + hiveSyncConfig.tableName + " should not exist initially",
            hiveClient.doesTableExist());
    // Let's do the sync
    HiveSyncTool tool = new HiveSyncTool(hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
    tool.syncHoodieTable();
    assertTrue("Table " + hiveSyncConfig.tableName + " should exist after sync completes",
            hiveClient.doesTableExist());
    // the three partition fields (year, month, day) are appended to the data schema
    assertEquals("Hive Schema should match the dataset schema + partition fields",
            hiveClient.getTableSchema().size(), hiveClient.getDataSchema().getColumns().size() + 3);
    assertEquals("Table partitions should match the number of partitions we wrote", 5,
            hiveClient.scanTablePartitions().size());
    assertEquals("The last commit that was synced should be updated in the TBLPROPERTIES", commitTime,
            hiveClient.getLastCommitTimeSynced().get());
}
@Test
public void testBasicSync()
        throws IOException, InitializationError, URISyntaxException, TException, InterruptedException {
    String commitTime = "100";
    TestUtil.createCOWDataset(commitTime, 5);
    HoodieHiveClient hiveClient =
            new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
    assertFalse("Table " + TestUtil.hiveSyncConfig.tableName + " should not exist initially",
            hiveClient.doesTableExist());
    // Let's do the sync
    HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
    tool.syncHoodieTable();
    assertTrue("Table " + TestUtil.hiveSyncConfig.tableName + " should exist after sync completes",
            hiveClient.doesTableExist());
    // a single partition field is appended to the data schema
    assertEquals("Hive Schema should match the dataset schema + partition field",
            hiveClient.getTableSchema().size(), hiveClient.getDataSchema().getColumns().size() + 1);
    assertEquals("Table partitions should match the number of partitions we wrote", 5,
            hiveClient.scanTablePartitions().size());
    assertEquals("The last commit that was synced should be updated in the TBLPROPERTIES", commitTime,
            hiveClient.getLastCommitTimeSynced().get());
}