private Path wrapPathForCache(Path path, Object fileKey, JobConf configuration,
    List<BlockMetaData> blocks, String tag) throws IOException {
  if (fileKey == null || cache == null) {
    return path;
  }
  HashSet<ColumnPath> includedCols = new HashSet<>();
  for (ColumnDescriptor col : requestedSchema.getColumns()) {
    includedCols.add(ColumnPath.get(col.getPath()));
  }
  // We could make some assumptions given how the reader currently does the work (consecutive
  // chunks, etc.; blocks and columns stored in offset order in the lists), but we won't -
  // just save all the chunk boundaries and lengths for now.
  TreeMap<Long, Long> chunkIndex = new TreeMap<>();
  for (BlockMetaData block : blocks) {
    for (ColumnChunkMetaData mc : block.getColumns()) {
      if (!includedCols.contains(mc.getPath())) continue;
      chunkIndex.put(mc.getStartingPos(), mc.getStartingPos() + mc.getTotalSize());
    }
  }
  // Register the cache-aware path so that Parquet reader would go thru it.
  configuration.set("fs." + LlapCacheAwareFs.SCHEME + ".impl",
      LlapCacheAwareFs.class.getCanonicalName());
  path = LlapCacheAwareFs.registerFile(cache, path, fileKey, chunkIndex, configuration, tag);
  this.cacheFsPath = path;
  return path;
}
private void readTimestamp(int total, TimestampColumnVector c, int rowId) throws IOException {
  int left = total;
  while (left > 0) {
    readRepetitionAndDefinitionLevels();
    if (definitionLevel >= maxDefLevel) {
      switch (descriptor.getType()) {
      // INT64 is not yet supported
      case INT96:
        c.set(rowId, dataColumn.readTimestamp().toSqlTimestamp());
        break;
      default:
        throw new IOException(
            "Unsupported parquet logical type: " + type.getOriginalType() + " for timestamp");
      }
      c.isNull[rowId] = false;
      c.isRepeating =
          c.isRepeating && ((c.time[0] == c.time[rowId]) && (c.nanos[0] == c.nanos[rowId]));
    } else {
      setNullValue(c, rowId);
    }
    rowId++;
    left--;
  }
}
private void initializeInternal() throws IOException, UnsupportedOperationException {
  // Check that the requested schema is supported.
  missingColumns = new boolean[requestedSchema.getFieldCount()];
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  List<String[]> paths = requestedSchema.getPaths();
  for (int i = 0; i < requestedSchema.getFieldCount(); ++i) {
    Type t = requestedSchema.getFields().get(i);
    if (!t.isPrimitive() || t.isRepetition(Type.Repetition.REPEATED)) {
      throw new UnsupportedOperationException("Complex types not supported.");
    }
    String[] colPath = paths.get(i);
    if (fileSchema.containsPath(colPath)) {
      ColumnDescriptor fd = fileSchema.getColumnDescription(colPath);
      if (!fd.equals(columns.get(i))) {
        throw new UnsupportedOperationException("Schema evolution not supported.");
      }
      missingColumns[i] = false;
    } else {
      if (columns.get(i).getMaxDefinitionLevel() == 0) {
        // Column is missing in data but the required data is non-nullable. This file is invalid.
        throw new IOException("Required column is missing in data file. Col: "
            + Arrays.toString(colPath));
      }
      missingColumns[i] = true;
    }
  }
}
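The reader above rejects a file when a requested non-nullable column is absent and marks nullable absentees as missing. A minimal, self-contained sketch of that same check, assuming flat primitive schemas; the class name, schema strings, and column names are illustrative only and not taken from the reader above:

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class MissingColumnCheck {
  public static void main(String[] args) {
    // Hypothetical file and requested schemas; only flat primitive fields for simplicity.
    MessageType fileSchema = MessageTypeParser.parseMessageType(
        "message file { required int32 id; }");
    MessageType requestedSchema = MessageTypeParser.parseMessageType(
        "message requested { required int32 id; optional binary name (UTF8); }");
    for (int i = 0; i < requestedSchema.getFieldCount(); ++i) {
      String[] colPath = requestedSchema.getPaths().get(i);
      ColumnDescriptor requested = requestedSchema.getColumns().get(i);
      String name = String.join(".", colPath);
      if (fileSchema.containsPath(colPath)) {
        System.out.println(name + ": present in the file");
      } else if (requested.getMaxDefinitionLevel() == 0) {
        // Required (max definition level 0) but absent: the file cannot satisfy the request.
        System.out.println(name + ": required but missing -> invalid file");
      } else {
        // Optional and absent: can be materialized as all-null.
        System.out.println(name + ": missing, read as nulls");
      }
    }
  }
}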
private void readPageV2(DataPageV2 page) {
  this.pageValueCount = page.getValueCount();
  this.repetitionLevelColumn =
      newRLEIterator(descriptor.getMaxRepetitionLevel(), page.getRepetitionLevels());
  this.definitionLevelColumn =
      newRLEIterator(descriptor.getMaxDefinitionLevel(), page.getDefinitionLevels());
  try {
    LOG.debug("page data size " + page.getData().size() + " bytes and " + pageValueCount
        + " records");
    initDataReader(page.getDataEncoding(), page.getData().toInputStream(), page.getValueCount());
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read page " + page + " in col " + descriptor, e);
  }
}
/**
 * Helper function to construct exception for parquet schema mismatch.
 */
private SchemaColumnConvertNotSupportedException constructConvertNotSupportedException(
    ColumnDescriptor descriptor, WritableColumnVector column) {
  return new SchemaColumnConvertNotSupportedException(
      Arrays.toString(descriptor.getPath()),
      descriptor.getPrimitiveType().getPrimitiveTypeName().toString(),
      column.dataType().catalogString());
}
public ColumnDescriptor getColumnDescription(String[] path) {
  int maxRep = getMaxRepetitionLevel(path);
  int maxDef = getMaxDefinitionLevel(path);
  PrimitiveType type = getType(path).asPrimitiveType();
  return new ColumnDescriptor(path, type, maxRep, maxDef);
}
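A minimal usage sketch of getColumnDescription as exposed on a parquet-mr MessageType; the schema string and path below are illustrative assumptions:

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ColumnDescriptionExample {
  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message example { optional group address { optional binary city (UTF8); } }");
    // Resolve the nested leaf by its path segments.
    ColumnDescriptor city = schema.getColumnDescription(new String[] {"address", "city"});
    // Two optional levels on the path give max definition level 2;
    // there are no repeated fields, so max repetition level stays 0.
    System.out.println(city.getMaxDefinitionLevel());  // 2
    System.out.println(city.getMaxRepetitionLevel());  // 0
  }
}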
@Override
public Dictionary initDictionary(ColumnDescriptor descriptor, DictionaryPage dictionaryPage)
    throws IOException {
  switch (descriptor.getType()) {
  case BINARY:
    return new PlainBinaryDictionary(dictionaryPage);
  case FIXED_LEN_BYTE_ARRAY:
    return new PlainBinaryDictionary(dictionaryPage, descriptor.getTypeLength());
  case INT96:
    return new PlainBinaryDictionary(dictionaryPage, 12);
  case INT64:
    return new PlainLongDictionary(dictionaryPage);
  case DOUBLE:
    return new PlainDoubleDictionary(dictionaryPage);
  case INT32:
    return new PlainIntegerDictionary(dictionaryPage);
  case FLOAT:
    return new PlainFloatDictionary(dictionaryPage);
  default:
    throw new ParquetDecodingException(
        "Dictionary encoding not supported for type: " + descriptor.getType());
  }
}
},
@Override
public ValuesReader getValuesReader(ColumnDescriptor descriptor, ValuesType valuesType) {
  if (descriptor.getType() != BINARY && descriptor.getType() != FIXED_LEN_BYTE_ARRAY) {
    throw new ParquetDecodingException(
        "Encoding DELTA_BYTE_ARRAY is only supported for type BINARY and FIXED_LEN_BYTE_ARRAY");
  }
  return new DeltaByteArrayReader();
}
},
static int getMaxLevel(ColumnDescriptor descriptor, ValuesType valuesType) {
  switch (valuesType) {
  case REPETITION_LEVEL:
    return descriptor.getMaxRepetitionLevel();
  case DEFINITION_LEVEL:
    return descriptor.getMaxDefinitionLevel();
  case VALUES:
    if (descriptor.getType() == BOOLEAN) {
      return 1;
    }
    // fall through: non-boolean values are not supported here
  default:
    throw new ParquetDecodingException("Unsupported values type: " + valuesType);
  }
}
public RichColumnDescriptor(ColumnDescriptor descriptor, PrimitiveType primitiveType) {
  super(descriptor.getPath(),
      primitiveType.getPrimitiveTypeName(),
      primitiveType.getTypeLength(),
      descriptor.getMaxRepetitionLevel(),
      descriptor.getMaxDefinitionLevel());
  this.primitiveType = primitiveType;
  this.required = primitiveType.getRepetition() != OPTIONAL;
}
private void readPageV2(DataPageV2 page) throws IOException {
  this.pageValueCount = page.getValueCount();
  this.repetitionLevelColumn = createRLEIterator(descriptor.getMaxRepetitionLevel(),
      page.getRepetitionLevels(), descriptor);
  int bitWidth = BytesUtils.getWidthFromMaxInt(descriptor.getMaxDefinitionLevel());
  this.defColumn = new VectorizedRleValuesReader(bitWidth);
  this.definitionLevelColumn = new ValuesReaderIntIterator(this.defColumn);
  this.defColumn.initFromBuffer(this.pageValueCount, page.getDefinitionLevels().toByteArray());
  try {
    initDataReader(page.getDataEncoding(), page.getData().toByteArray(), 0);
  } catch (IOException e) {
    throw new IOException("could not read page " + page + " in col " + descriptor, e);
  }
}
(descriptor.getType() == PrimitiveType.PrimitiveTypeName.INT32 ||
    (descriptor.getType() == PrimitiveType.PrimitiveTypeName.INT64 &&
        column.dataType() != DataTypes.TimestampType) ||
    descriptor.getType() == PrimitiveType.PrimitiveTypeName.FLOAT ||
    descriptor.getType() == PrimitiveType.PrimitiveTypeName.DOUBLE ||
    descriptor.getType() == PrimitiveType.PrimitiveTypeName.BINARY))) {
  switch (descriptor.getType()) {
  case BOOLEAN:
    readBooleanBatch(rowId, num, column);
    break;
  case FIXED_LEN_BYTE_ARRAY:
    readFixedLenByteArrayBatch(rowId, num, column, descriptor.getTypeLength());
    break;
  default:
    throw new IOException("Unsupported type: " + descriptor.getType());
public void resolveDrillType(Map<String, SchemaElement> schemaElements, OptionManager options) {
  se = schemaElements.get(ParquetReaderUtility.getFullColumnPath(column));
  type = ParquetToDrillTypeConverter.toMajorType(column.getType(), column.getTypeLength(),
      getDataMode(column), se, options);
  field = MaterializedField.create(
      toFieldName(column.getPath()).getLastSegment().getNameSegment().getPath(), type);
  length = getDataTypeLength();
}
@Override
public void writeLine(Row row) {
  Group group = groupFactory.newGroup();
  List<ColumnDescriptor> columns = schema.getColumns();
  for (int i = 0; i < row.size(); i++) {
    Object value = row.getAs(i);
    // Pass i unchanged; the loop header already increments it, so i++ here would skip columns.
    addValueToGroup(columns.get(i).getType().javaType, group, i, value);
  }
  try {
    writeGroup(group);
  } catch (IOException e) {
    logger.error("", e);
  }
}
public BaseVectorizedColumnReader(
    ColumnDescriptor descriptor,
    PageReader pageReader,
    boolean skipTimestampConversion,
    Type parquetType,
    TypeInfo hiveType) throws IOException {
  this.descriptor = descriptor;
  this.type = parquetType;
  this.pageReader = pageReader;
  this.maxDefLevel = descriptor.getMaxDefinitionLevel();
  this.skipTimestampConversion = skipTimestampConversion;
  this.hiveType = hiveType;

  DictionaryPage dictionaryPage = pageReader.readDictionaryPage();
  if (dictionaryPage != null) {
    try {
      this.dictionary = ParquetDataColumnReaderFactory
          .getDataColumnReaderByTypeOnDictionary(parquetType.asPrimitiveType(), hiveType,
              dictionaryPage.getEncoding().initDictionary(descriptor, dictionaryPage),
              skipTimestampConversion);
      this.isCurrentPageDictionaryEncoded = true;
    } catch (IOException e) {
      throw new IOException("could not decode the dictionary for " + descriptor, e);
    }
  } else {
    this.dictionary = null;
    this.isCurrentPageDictionaryEncoded = false;
  }
}
private List<ColumnDescriptor> getAllColumnDescriptorByType(
    int depth, Type type, List<ColumnDescriptor> columns) throws ParquetRuntimeException {
  List<ColumnDescriptor> res = new ArrayList<>();
  for (ColumnDescriptor descriptor : columns) {
    if (depth >= descriptor.getPath().length) {
      throw new InvalidSchemaException("Corrupted Parquet schema");
    }
    if (type.getName().equals(descriptor.getPath()[depth])) {
      res.add(descriptor);
    }
  }
  return res;
}
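A standalone sketch of the same filtering idea, shown here with a hypothetical helper and schema (the method name, class name, and schema string are assumptions for illustration): keep the leaf ColumnDescriptors whose path segment at a given depth matches a field name.

import java.util.ArrayList;
import java.util.List;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ColumnsUnderField {
  // Keep the descriptors whose path has the given name at the given depth.
  static List<ColumnDescriptor> columnsUnderField(MessageType schema, String fieldName, int depth) {
    List<ColumnDescriptor> res = new ArrayList<>();
    for (ColumnDescriptor descriptor : schema.getColumns()) {
      if (depth < descriptor.getPath().length && fieldName.equals(descriptor.getPath()[depth])) {
        res.add(descriptor);
      }
    }
    return res;
  }

  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message m { optional group address { optional binary city (UTF8); optional binary zip (UTF8); } optional int32 id; }");
    // Both leaves under "address" are returned; "id" is filtered out.
    System.out.println(columnsUnderField(schema, "address", 0).size());  // 2
  }
}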
private String[] getExisingParentPath(ColumnDescriptor path, MessageType inputFileSchema) {
  List<String> parentPath = Arrays.asList(path.getPath());
  while (parentPath.size() > 0
      && !inputFileSchema.containsPath(parentPath.toArray(new String[parentPath.size()]))) {
    parentPath = parentPath.subList(0, parentPath.size() - 1);
  }
  return parentPath.toArray(new String[parentPath.size()]);
}
private void readPageV2(DataPageV2 page) {
  this.pageValueCount = page.getValueCount();
  int bitWidth = BytesUtils.getWidthFromMaxInt(descriptor.getMaxDefinitionLevel());
  defColumn = new VectorizedDefValuesReader(bitWidth);
  try {
    defColumn.initFromBuffer(this.pageValueCount, page.getDefinitionLevels().toByteArray());
    initDataReader(page.getDataEncoding(), page.getData().toByteArray(), 0);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read page " + page + " in col " + descriptor, e);
  }
}
descriptor.getPrimitiveType().getPrimitiveTypeName();
if (isCurrentPageDictionaryEncoded) {
case FIXED_LEN_BYTE_ARRAY:
  readFixedLenByteArrayBatch(
      rowId, num, column, descriptor.getPrimitiveType().getTypeLength());
  break;
default:
public ValuesWriter newRepetitionLevelWriter(ColumnDescriptor path) {
  return newColumnDescriptorValuesWriter(path.getMaxRepetitionLevel());
}