private Path wrapPathForCache(Path path, Object fileKey, JobConf configuration,
    List<BlockMetaData> blocks, String tag) throws IOException {
  if (fileKey == null || cache == null) {
    return path;
  }
  HashSet<ColumnPath> includedCols = new HashSet<>();
  for (ColumnDescriptor col : requestedSchema.getColumns()) {
    includedCols.add(ColumnPath.get(col.getPath()));
  }
  // We could make some assumptions given how the reader currently does the work (consecutive
  // chunks, etc.; blocks and columns stored in offset order in the lists), but we won't -
  // just save all the chunk boundaries and lengths for now.
  TreeMap<Long, Long> chunkIndex = new TreeMap<>();
  for (BlockMetaData block : blocks) {
    for (ColumnChunkMetaData mc : block.getColumns()) {
      if (!includedCols.contains(mc.getPath())) continue;
      chunkIndex.put(mc.getStartingPos(), mc.getStartingPos() + mc.getTotalSize());
    }
  }
  // Register the cache-aware path so that Parquet reader would go thru it.
  configuration.set("fs." + LlapCacheAwareFs.SCHEME + ".impl",
      LlapCacheAwareFs.class.getCanonicalName());
  path = LlapCacheAwareFs.registerFile(cache, path, fileKey, chunkIndex, configuration, tag);
  this.cacheFsPath = path;
  return path;
}
private void readTimestamp(int total, TimestampColumnVector c, int rowId) throws IOException {
  int left = total;
  while (left > 0) {
    readRepetitionAndDefinitionLevels();
    if (definitionLevel >= maxDefLevel) {
      switch (descriptor.getType()) {
      // INT64 is not yet supported
      case INT96:
        c.set(rowId, dataColumn.readTimestamp().toSqlTimestamp());
        break;
      default:
        throw new IOException(
            "Unsupported parquet logical type: " + type.getOriginalType() + " for timestamp");
      }
      c.isNull[rowId] = false;
      c.isRepeating =
          c.isRepeating && ((c.time[0] == c.time[rowId]) && (c.nanos[0] == c.nanos[rowId]));
    } else {
      setNullValue(c, rowId);
    }
    rowId++;
    left--;
  }
}
private void initializeInternal() throws IOException, UnsupportedOperationException {
  // Check that the requested schema is supported.
  missingColumns = new boolean[requestedSchema.getFieldCount()];
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  List<String[]> paths = requestedSchema.getPaths();
  for (int i = 0; i < requestedSchema.getFieldCount(); ++i) {
    Type t = requestedSchema.getFields().get(i);
    if (!t.isPrimitive() || t.isRepetition(Type.Repetition.REPEATED)) {
      throw new UnsupportedOperationException("Complex types not supported.");
    }
    String[] colPath = paths.get(i);
    if (fileSchema.containsPath(colPath)) {
      ColumnDescriptor fd = fileSchema.getColumnDescription(colPath);
      if (!fd.equals(columns.get(i))) {
        throw new UnsupportedOperationException("Schema evolution not supported.");
      }
      missingColumns[i] = false;
    } else {
      if (columns.get(i).getMaxDefinitionLevel() == 0) {
        // Column is missing in data but the required data is non-nullable. This file is invalid.
        throw new IOException("Required column is missing in data file. Col: "
            + Arrays.toString(colPath));
      }
      missingColumns[i] = true;
    }
  }
}
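The reader above rejects a file when a requested non-nullable column is absent and marks nullable absentees as missing. A minimal, self-contained sketch of that same check, assuming flat primitive schemas; the class name, schema strings, and column names are illustrative only and not taken from the reader above:

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class MissingColumnCheck {
  public static void main(String[] args) {
    // Hypothetical file and requested schemas; only flat primitive fields for simplicity.
    MessageType fileSchema = MessageTypeParser.parseMessageType(
        "message file { required int32 id; }");
    MessageType requestedSchema = MessageTypeParser.parseMessageType(
        "message requested { required int32 id; optional binary name (UTF8); }");
    for (int i = 0; i < requestedSchema.getFieldCount(); ++i) {
      String[] colPath = requestedSchema.getPaths().get(i);
      ColumnDescriptor requested = requestedSchema.getColumns().get(i);
      String name = String.join(".", colPath);
      if (fileSchema.containsPath(colPath)) {
        System.out.println(name + ": present in the file");
      } else if (requested.getMaxDefinitionLevel() == 0) {
        // Required (max definition level 0) but absent: the file cannot satisfy the request.
        System.out.println(name + ": required but missing -> invalid file");
      } else {
        // Optional and absent: can be materialized as all-null.
        System.out.println(name + ": missing, read as nulls");
      }
    }
  }
}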
private void readPageV2(DataPageV2 page) {
  this.pageValueCount = page.getValueCount();
  this.repetitionLevelColumn =
      newRLEIterator(descriptor.getMaxRepetitionLevel(), page.getRepetitionLevels());
  this.definitionLevelColumn =
      newRLEIterator(descriptor.getMaxDefinitionLevel(), page.getDefinitionLevels());
  try {
    LOG.debug("page data size " + page.getData().size() + " bytes and " + pageValueCount
        + " records");
    initDataReader(page.getDataEncoding(), page.getData().toInputStream(), page.getValueCount());
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read page " + page + " in col " + descriptor, e);
  }
}
/**
 * Helper function to construct exception for parquet schema mismatch.
 */
private SchemaColumnConvertNotSupportedException constructConvertNotSupportedException(
    ColumnDescriptor descriptor, WritableColumnVector column) {
  return new SchemaColumnConvertNotSupportedException(
      Arrays.toString(descriptor.getPath()),
      descriptor.getPrimitiveType().getPrimitiveTypeName().toString(),
      column.dataType().catalogString());
}
public ColumnDescriptor getColumnDescription(String[] path) {
  int maxRep = getMaxRepetitionLevel(path);
  int maxDef = getMaxDefinitionLevel(path);
  PrimitiveType type = getType(path).asPrimitiveType();
  return new ColumnDescriptor(path, type, maxRep, maxDef);
}
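A minimal usage sketch of getColumnDescription as exposed on a parquet-mr MessageType; the schema string and path below are illustrative assumptions:

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ColumnDescriptionExample {
  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message example { optional group address { optional binary city (UTF8); } }");
    // Resolve the nested leaf by its path segments.
    ColumnDescriptor city = schema.getColumnDescription(new String[] {"address", "city"});
    // Two optional levels on the path give max definition level 2;
    // there are no repeated fields, so max repetition level stays 0.
    System.out.println(city.getMaxDefinitionLevel());  // 2
    System.out.println(city.getMaxRepetitionLevel());  // 0
  }
}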
@Override
public Dictionary initDictionary(ColumnDescriptor descriptor, DictionaryPage dictionaryPage)
    throws IOException {
  switch (descriptor.getType()) {
  case BINARY:
    return new PlainBinaryDictionary(dictionaryPage);
  case FIXED_LEN_BYTE_ARRAY:
    return new PlainBinaryDictionary(dictionaryPage, descriptor.getTypeLength());
  case INT96:
    return new PlainBinaryDictionary(dictionaryPage, 12);
  case INT64:
    return new PlainLongDictionary(dictionaryPage);
  case DOUBLE:
    return new PlainDoubleDictionary(dictionaryPage);
  case INT32:
    return new PlainIntegerDictionary(dictionaryPage);
  case FLOAT:
    return new PlainFloatDictionary(dictionaryPage);
  default:
    throw new ParquetDecodingException(
        "Dictionary encoding not supported for type: " + descriptor.getType());
  }
}
},
@Override
public ValuesReader getValuesReader(ColumnDescriptor descriptor, ValuesType valuesType) {
  if (descriptor.getType() != BINARY && descriptor.getType() != FIXED_LEN_BYTE_ARRAY) {
    throw new ParquetDecodingException(
        "Encoding DELTA_BYTE_ARRAY is only supported for type BINARY and FIXED_LEN_BYTE_ARRAY");
  }
  return new DeltaByteArrayReader();
}
},
static int getMaxLevel(ColumnDescriptor descriptor, ValuesType valuesType) {
  switch (valuesType) {
  case REPETITION_LEVEL:
    return descriptor.getMaxRepetitionLevel();
  case DEFINITION_LEVEL:
    return descriptor.getMaxDefinitionLevel();
  case VALUES:
    if (descriptor.getType() == BOOLEAN) {
      return 1;
    }
    // fall through: non-boolean values are not supported here
  default:
    throw new ParquetDecodingException("Unsupported values type: " + valuesType);
  }
}
public RichColumnDescriptor(ColumnDescriptor descriptor, PrimitiveType primitiveType) {
  super(descriptor.getPath(),
      primitiveType.getPrimitiveTypeName(),
      primitiveType.getTypeLength(),
      descriptor.getMaxRepetitionLevel(),
      descriptor.getMaxDefinitionLevel());
  this.primitiveType = primitiveType;
  this.required = primitiveType.getRepetition() != OPTIONAL;
}
private void readPageV2(DataPageV2 page) throws IOException {
  this.pageValueCount = page.getValueCount();
  this.repetitionLevelColumn = createRLEIterator(descriptor.getMaxRepetitionLevel(),
      page.getRepetitionLevels(), descriptor);
  int bitWidth = BytesUtils.getWidthFromMaxInt(descriptor.getMaxDefinitionLevel());
  this.defColumn = new VectorizedRleValuesReader(bitWidth);
  this.definitionLevelColumn = new ValuesReaderIntIterator(this.defColumn);
  this.defColumn.initFromBuffer(this.pageValueCount, page.getDefinitionLevels().toByteArray());
  try {
    initDataReader(page.getDataEncoding(), page.getData().toByteArray(), 0);
  } catch (IOException e) {
    throw new IOException("could not read page " + page + " in col " + descriptor, e);
  }
}
(descriptor.getType() == PrimitiveType.PrimitiveTypeName.INT32 ||
    (descriptor.getType() == PrimitiveType.PrimitiveTypeName.INT64 &&
        column.dataType() != DataTypes.TimestampType) ||
    descriptor.getType() == PrimitiveType.PrimitiveTypeName.FLOAT ||
    descriptor.getType() == PrimitiveType.PrimitiveTypeName.DOUBLE ||
    descriptor.getType() == PrimitiveType.PrimitiveTypeName.BINARY))) {
  switch (descriptor.getType()) {
  case BOOLEAN:
    readBooleanBatch(rowId, num, column);
    break;
  case FIXED_LEN_BYTE_ARRAY:
    readFixedLenByteArrayBatch(rowId, num, column, descriptor.getTypeLength());
    break;
  default:
    throw new IOException("Unsupported type: " + descriptor.getType());
public void resolveDrillType(Map<String, SchemaElement> schemaElements, OptionManager options) {
  se = schemaElements.get(ParquetReaderUtility.getFullColumnPath(column));
  type = ParquetToDrillTypeConverter.toMajorType(column.getType(), column.getTypeLength(),
      getDataMode(column), se, options);
  field = MaterializedField.create(
      toFieldName(column.getPath()).getLastSegment().getNameSegment().getPath(), type);
  length = getDataTypeLength();
}
@Override
public void writeLine(Row row) {
  Group group = groupFactory.newGroup();
  List<ColumnDescriptor> columns = schema.getColumns();
  for (int i = 0; i < row.size(); i++) {
    Object value = row.getAs(i);
    // Pass i unchanged; the loop header already increments it, so i++ here would skip columns.
    addValueToGroup(columns.get(i).getType().javaType, group, i, value);
  }
  try {
    writeGroup(group);
  } catch (IOException e) {
    logger.error("", e);
  }
}
public BaseVectorizedColumnReader(
    ColumnDescriptor descriptor,
    PageReader pageReader,
    boolean skipTimestampConversion,
    Type parquetType,
    TypeInfo hiveType) throws IOException {
  this.descriptor = descriptor;
  this.type = parquetType;
  this.pageReader = pageReader;
  this.maxDefLevel = descriptor.getMaxDefinitionLevel();
  this.skipTimestampConversion = skipTimestampConversion;
  this.hiveType = hiveType;

  DictionaryPage dictionaryPage = pageReader.readDictionaryPage();
  if (dictionaryPage != null) {
    try {
      this.dictionary = ParquetDataColumnReaderFactory
          .getDataColumnReaderByTypeOnDictionary(parquetType.asPrimitiveType(), hiveType,
              dictionaryPage.getEncoding().initDictionary(descriptor, dictionaryPage),
              skipTimestampConversion);
      this.isCurrentPageDictionaryEncoded = true;
    } catch (IOException e) {
      throw new IOException("could not decode the dictionary for " + descriptor, e);
    }
  } else {
    this.dictionary = null;
    this.isCurrentPageDictionaryEncoded = false;
  }
}
private List<ColumnDescriptor> getAllColumnDescriptorByType(
    int depth, Type type, List<ColumnDescriptor> columns) throws ParquetRuntimeException {
  List<ColumnDescriptor> res = new ArrayList<>();
  for (ColumnDescriptor descriptor : columns) {
    if (depth >= descriptor.getPath().length) {
      throw new InvalidSchemaException("Corrupted Parquet schema");
    }
    if (type.getName().equals(descriptor.getPath()[depth])) {
      res.add(descriptor);
    }
  }
  return res;
}
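A standalone sketch of the same filtering idea, shown here with a hypothetical helper and schema (the method name, class name, and schema string are assumptions for illustration): keep the leaf ColumnDescriptors whose path segment at a given depth matches a field name.

import java.util.ArrayList;
import java.util.List;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ColumnsUnderField {
  // Keep the descriptors whose path has the given name at the given depth.
  static List<ColumnDescriptor> columnsUnderField(MessageType schema, String fieldName, int depth) {
    List<ColumnDescriptor> res = new ArrayList<>();
    for (ColumnDescriptor descriptor : schema.getColumns()) {
      if (depth < descriptor.getPath().length && fieldName.equals(descriptor.getPath()[depth])) {
        res.add(descriptor);
      }
    }
    return res;
  }

  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message m { optional group address { optional binary city (UTF8); optional binary zip (UTF8); } optional int32 id; }");
    // Both leaves under "address" are returned; "id" is filtered out.
    System.out.println(columnsUnderField(schema, "address", 0).size());  // 2
  }
}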
private String[] getExisingParentPath(ColumnDescriptor path, MessageType inputFileSchema) {
  List<String> parentPath = Arrays.asList(path.getPath());
  while (parentPath.size() > 0
      && !inputFileSchema.containsPath(parentPath.toArray(new String[parentPath.size()]))) {
    parentPath = parentPath.subList(0, parentPath.size() - 1);
  }
  return parentPath.toArray(new String[parentPath.size()]);
}
private void readPageV2(DataPageV2 page) {
  this.pageValueCount = page.getValueCount();
  int bitWidth = BytesUtils.getWidthFromMaxInt(descriptor.getMaxDefinitionLevel());
  defColumn = new VectorizedDefValuesReader(bitWidth);
  try {
    defColumn.initFromBuffer(this.pageValueCount, page.getDefinitionLevels().toByteArray());
    initDataReader(page.getDataEncoding(), page.getData().toByteArray(), 0);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read page " + page + " in col " + descriptor, e);
  }
}
descriptor.getPrimitiveType().getPrimitiveTypeName();
if (isCurrentPageDictionaryEncoded) {
case FIXED_LEN_BYTE_ARRAY:
  readFixedLenByteArrayBatch(
      rowId, num, column, descriptor.getPrimitiveType().getTypeLength());
  break;
default:
public ValuesWriter newRepetitionLevelWriter(ColumnDescriptor path) {
  return newColumnDescriptorValuesWriter(path.getMaxRepetitionLevel());
}