private void initializeInternal() throws IOException, UnsupportedOperationException {
  // Check that the requested schema is supported.
  missingColumns = new boolean[requestedSchema.getFieldCount()];
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  List<String[]> paths = requestedSchema.getPaths();
  for (int i = 0; i < requestedSchema.getFieldCount(); ++i) {
    Type t = requestedSchema.getFields().get(i);
    if (!t.isPrimitive() || t.isRepetition(Type.Repetition.REPEATED)) {
      throw new UnsupportedOperationException("Complex types not supported.");
    }
    String[] colPath = paths.get(i);
    if (fileSchema.containsPath(colPath)) {
      ColumnDescriptor fd = fileSchema.getColumnDescription(colPath);
      if (!fd.equals(columns.get(i))) {
        throw new UnsupportedOperationException("Schema evolution not supported.");
      }
      missingColumns[i] = false;
    } else {
      if (columns.get(i).getMaxDefinitionLevel() == 0) {
        // Column is missing in data but the required data is non-nullable. This file is invalid.
        throw new IOException("Required column is missing in data file. Col: " +
          Arrays.toString(colPath));
      }
      missingColumns[i] = true;
    }
  }
}
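// A minimal, hypothetical sketch (demo class and schemas invented for illustration) of the
// missing-column check above: a requested path absent from the file schema is detected with
// MessageType.containsPath, and only nullable columns (max definition level > 0) may be missing.
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class MissingColumnDemo {
  public static void main(String[] args) {
    // The file lacks the requested optional "age" column.
    MessageType fileSchema = MessageTypeParser.parseMessageType(
      "message file { required int32 id; }");
    MessageType requestedSchema = MessageTypeParser.parseMessageType(
      "message requested { required int32 id; optional int32 age; }");
    for (String[] path : requestedSchema.getPaths()) {
      // Prints: id missing=false, age missing=true
      System.out.println(String.join(".", path) + " missing=" + !fileSchema.containsPath(path));
    }
  }
}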
public List<String[]> getPaths() {
  return this.getPaths(0);
}
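// A minimal usage sketch (hypothetical demo class) of what getPaths() returns: one
// String[] per leaf column, in schema order, with nested leaves carrying their full path.
import java.util.Arrays;

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class PathsDemo {
  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType(
      "message demo { required int32 id; optional group user { optional binary name (UTF8); } }");
    // Prints [id] and [user, name].
    for (String[] path : schema.getPaths()) {
      System.out.println(Arrays.toString(path));
    }
  }
}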
public static Predicate buildPredicate(
    MessageType requestedSchema,
    TupleDomain<ColumnDescriptor> parquetTupleDomain,
    Map<List<String>, RichColumnDescriptor> descriptorsByPath) {
  ImmutableList.Builder<RichColumnDescriptor> columnReferences = ImmutableList.builder();
  for (String[] paths : requestedSchema.getPaths()) {
    RichColumnDescriptor descriptor = descriptorsByPath.get(Arrays.asList(paths));
    if (descriptor != null) {
      columnReferences.add(descriptor);
    }
  }
  return new TupleDomainParquetPredicate(parquetTupleDomain, columnReferences.build());
}
public static Map<List<String>, RichColumnDescriptor> getDescriptors(MessageType fileSchema, MessageType requestedSchema) {
  Map<List<String>, RichColumnDescriptor> descriptorsByPath = new HashMap<>();
  List<PrimitiveColumnIO> columns = getColumns(fileSchema, requestedSchema);
  for (String[] paths : fileSchema.getPaths()) {
    List<String> columnPath = Arrays.asList(paths);
    getDescriptor(columns, columnPath)
        .ifPresent(richColumnDescriptor -> descriptorsByPath.put(columnPath, richColumnDescriptor));
  }
  return descriptorsByPath;
}
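// RichColumnDescriptor, getColumns, and getDescriptor above are Presto-specific helpers.
// A hypothetical sketch of the same path-keyed lookup using only plain parquet-mr types:
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class DescriptorLookupDemo {
  public static void main(String[] args) {
    MessageType fileSchema = MessageTypeParser.parseMessageType(
      "message demo { required int32 id; optional binary name (UTF8); }");
    // Arrays.asList supplies a List<String> key; raw String[] arrays don't hash by content.
    Map<List<String>, ColumnDescriptor> descriptorsByPath = new HashMap<>();
    for (String[] path : fileSchema.getPaths()) {
      descriptorsByPath.put(Arrays.asList(path), fileSchema.getColumnDescription(path));
    }
    System.out.println(descriptorsByPath.keySet());
  }
}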
private List<ColumnOrder> getColumnOrders(MessageType schema) {
  List<ColumnOrder> columnOrders = new ArrayList<>();
  // Currently, only TypeDefinedOrder is supported, so we create a column order for each column with
  // TypeDefinedOrder even if some types (e.g. INT96) have undefined column orders.
  for (int i = 0, n = schema.getPaths().size(); i < n; ++i) {
    ColumnOrder columnOrder = new ColumnOrder();
    columnOrder.setTYPE_ORDER(TYPE_DEFINED_ORDER);
    columnOrders.add(columnOrder);
  }
  return columnOrders;
}
public List<ColumnDescriptor> getColumns() {
  List<String[]> paths = this.getPaths(0);
  List<ColumnDescriptor> columns = new ArrayList<>(paths.size());
  for (String[] path : paths) {
    // TODO: optimize this
    PrimitiveType primitiveType = getType(path).asPrimitiveType();
    columns.add(new ColumnDescriptor(
        path,
        primitiveType,
        getMaxRepetitionLevel(path),
        getMaxDefinitionLevel(path)));
  }
  return columns;
}
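// A hypothetical sketch of getColumns() output: each leaf becomes a ColumnDescriptor whose
// max repetition/definition levels are derived from its optional and repeated ancestors.
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ColumnsDemo {
  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType(
      "message demo { required int32 id; optional group user { optional binary name (UTF8); } }");
    for (ColumnDescriptor column : schema.getColumns()) {
      // user.name sits under an optional group and is itself optional, so its max definition level is 2.
      System.out.println(column + " maxRep=" + column.getMaxRepetitionLevel()
        + " maxDef=" + column.getMaxDefinitionLevel());
    }
  }
}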
private static byte[] roughGuessTypes(MessageType messageType) {
  byte[] types = new byte[messageType.getPaths().size()];
  for (int i = 0; i < types.length; i++) {
    Type parquetType = messageType.getType(i);
    assert parquetType.isPrimitive();
    switch (parquetType.asPrimitiveType().getPrimitiveTypeName()) {
      case INT32:
      case BOOLEAN:
      case FLOAT:
      case DOUBLE:
        types[i] = Vec.T_NUM;
        break;
      case INT96:
        types[i] = Vec.T_TIME;
        break;
      case INT64:
        types[i] = OriginalType.TIMESTAMP_MILLIS.equals(parquetType.getOriginalType()) ? Vec.T_TIME : Vec.T_NUM;
        break;
      default:
        types[i] = Vec.T_BAD;
    }
  }
  return types;
}
public void initialize(ParquetFileReader reader, Configuration configuration) throws IOException {
  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true);
  reader.setRequestedSchema(requestedSchema);
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
public void initialize(ParquetFileReader reader, Configuration configuration) throws IOException {
  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  // Variant of the method above: uses the raw record count rather than the filtered one.
  this.total = reader.getRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true);
  reader.setRequestedSchema(requestedSchema);
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
private static void checkCompatibility(ParquetMetadata metadata) {
  // make sure we can map Parquet blocks to Chunks
  for (BlockMetaData block : metadata.getBlocks()) {
    if (block.getRowCount() > Integer.MAX_VALUE) {
      IcedHashMapGeneric.IcedHashMapStringObject dbg = new IcedHashMapGeneric.IcedHashMapStringObject();
      dbg.put("startingPos", block.getStartingPos());
      dbg.put("rowCount", block.getRowCount());
      // we map each block to a single H2O Chunk
      throw new H2OUnsupportedDataFileException("Unsupported Parquet file (technical limitation).",
          "Current implementation doesn't support Parquet files with blocks larger than " +
              Integer.MAX_VALUE + " rows.", dbg);
    }
  }
  // check that the file doesn't have nested structures
  MessageType schema = metadata.getFileMetaData().getSchema();
  for (String[] path : schema.getPaths()) {
    if (path.length != 1) {
      throw new H2OUnsupportedDataFileException("Parquet files with nested structures are not supported.",
          "Detected a column with a nested structure " + Arrays.asList(path));
    }
  }
}
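// A hypothetical sketch of the nested-structure test above: a leaf inside a group has a
// path longer than one element, which is exactly what the path.length != 1 check catches.
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class NestedCheckDemo {
  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType(
      "message demo { required int32 id; optional group user { optional binary name (UTF8); } }");
    // Prints: id nested=false, user.name nested=true
    for (String[] path : schema.getPaths()) {
      System.out.println(String.join(".", path) + " nested=" + (path.length != 1));
    }
  }
}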
public void initialize(FileMetaData parquetFileMetadata, Path file, List<BlockMetaData> blocks,
                       Configuration configuration) throws IOException {
  // initialize a ReadContext for this file
  // The file schema must be assigned before it is handed to ReadSupport.init below,
  // otherwise the InitContext would see a null schema.
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.file = file;
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  LOG.info("RecordReader initialized will read a total of " + total + " records.");
}