// Expands the requested top-level column ordinals into the set of all ORC type IDs
// they cover, including every nested type (see includeOrcColumnsRecursive below)
private static Set<Integer> getIncludedOrcColumns(List<OrcType> types, Set<Integer> includedColumns)
{
    Set<Integer> includes = new LinkedHashSet<>();
    OrcType root = types.get(0);
    for (int includedColumn : includedColumns) {
        includeOrcColumnsRecursive(types, includes, root.getFieldTypeIndex(includedColumn));
    }
    return includes;
}
private static Map<Integer, ColumnStatistics> getStatisticsByColumnOrdinal(OrcType rootStructType, List<ColumnStatistics> fileStats)
{
    requireNonNull(rootStructType, "rootStructType is null");
    checkArgument(rootStructType.getOrcTypeKind() == OrcTypeKind.STRUCT);
    requireNonNull(fileStats, "fileStats is null");

    ImmutableMap.Builder<Integer, ColumnStatistics> statistics = ImmutableMap.builder();
    for (int ordinal = 0; ordinal < rootStructType.getFieldCount(); ordinal++) {
        int statisticsIndex = rootStructType.getFieldTypeIndex(ordinal);
        // Files may carry fewer statistics entries than types; bound-check the index
        // actually used for the lookup, not the ordinal
        if (fileStats.size() > statisticsIndex) {
            ColumnStatistics element = fileStats.get(statisticsIndex);
            if (element != null) {
                statistics.put(ordinal, element);
            }
        }
    }
    return statistics.build();
}
private static void includeOrcColumnsRecursive(List<OrcType> types, Set<Integer> result, int typeId)
{
    result.add(typeId);
    OrcType type = types.get(typeId);
    int children = type.getFieldCount();
    for (int i = 0; i < children; ++i) {
        includeOrcColumnsRecursive(types, result, type.getFieldTypeIndex(i));
    }
}
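ORC stores a schema as a flattened preorder list of types: the root struct is element 0, and every composite type refers to its children by their positions in that list, which is what getFieldTypeIndex returns. The following is a minimal, self-contained sketch of the expansion performed by the two methods above; MiniOrcType, IncludedColumnsDemo, and the example schema are hypothetical stand-ins, not part of the real OrcType API.

import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

public class IncludedColumnsDemo
{
    // Hypothetical stand-in for OrcType: a node only knows the type IDs of its children
    record MiniOrcType(List<Integer> childTypeIds)
    {
        int getFieldCount()
        {
            return childTypeIds.size();
        }

        int getFieldTypeIndex(int field)
        {
            return childTypeIds.get(field);
        }
    }

    // Same traversal as includeOrcColumnsRecursive: add the node, then recurse into children
    static void includeRecursive(List<MiniOrcType> types, Set<Integer> result, int typeId)
    {
        result.add(typeId);
        MiniOrcType type = types.get(typeId);
        for (int i = 0; i < type.getFieldCount(); i++) {
            includeRecursive(types, result, type.getFieldTypeIndex(i));
        }
    }

    public static void main(String[] args)
    {
        // Flattened form of struct<a:int, b:struct<c:int>>:
        // 0: root struct (children 1, 2), 1: int a, 2: struct b (child 3), 3: int c
        List<MiniOrcType> types = List.of(
                new MiniOrcType(List.of(1, 2)),
                new MiniOrcType(List.of()),
                new MiniOrcType(List.of(3)),
                new MiniOrcType(List.of()));

        Set<Integer> includes = new LinkedHashSet<>();
        MiniOrcType root = types.get(0);
        // Select top-level column b (ordinal 1); the nested field c is pulled in transitively
        includeRecursive(types, includes, root.getFieldTypeIndex(1));
        System.out.println(includes); // [2, 3]
    }
}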
private static Map<Integer, ColumnStatistics> getRowGroupStatistics(OrcType rootStructType, Map<StreamId, List<RowGroupIndex>> columnIndexes, int rowGroup)
{
    requireNonNull(rootStructType, "rootStructType is null");
    checkArgument(rootStructType.getOrcTypeKind() == OrcTypeKind.STRUCT);
    requireNonNull(columnIndexes, "columnIndexes is null");
    checkArgument(rowGroup >= 0, "rowGroup is negative");

    // A column can be backed by several streams, so first group this row group's
    // statistics by column ordinal
    Map<Integer, List<ColumnStatistics>> groupedColumnStatistics = new HashMap<>();
    for (Entry<StreamId, List<RowGroupIndex>> entry : columnIndexes.entrySet()) {
        groupedColumnStatistics.computeIfAbsent(entry.getKey().getColumn(), key -> new ArrayList<>())
                .add(entry.getValue().get(rowGroup).getColumnStatistics());
    }

    ImmutableMap.Builder<Integer, ColumnStatistics> statistics = ImmutableMap.builder();
    for (int ordinal = 0; ordinal < rootStructType.getFieldCount(); ordinal++) {
        List<ColumnStatistics> columnStatistics = groupedColumnStatistics.get(rootStructType.getFieldTypeIndex(ordinal));
        if (columnStatistics != null) {
            if (columnStatistics.size() == 1) {
                statistics.put(ordinal, getOnlyElement(columnStatistics));
            }
            else {
                // Merge statistics from different streams
                // This can happen if a map is represented as a struct (DWRF only)
                statistics.put(ordinal, mergeColumnStatistics(columnStatistics));
            }
        }
    }
    return statistics.build();
}
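The grouping step builds a multimap in place with Map.computeIfAbsent. A minimal illustration of the same idiom, with plain integers and strings standing in for column ordinals and per-stream statistics (GroupingDemo is hypothetical):

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class GroupingDemo
{
    public static void main(String[] args)
    {
        // Two streams feed column 0, one feeds column 1
        List<Map.Entry<Integer, String>> streams = List.of(
                Map.entry(0, "data stream stats"),
                Map.entry(0, "length stream stats"),
                Map.entry(1, "data stream stats"));

        Map<Integer, List<String>> grouped = new HashMap<>();
        for (Map.Entry<Integer, String> entry : streams) {
            // Creates the list the first time a key is seen, then keeps appending to it
            grouped.computeIfAbsent(entry.getKey(), key -> new ArrayList<>()).add(entry.getValue());
        }
        System.out.println(grouped); // {0=[data stream stats, length stream stats], 1=[data stream stats]}
    }
}

Columns that end up with a single entry use their statistics directly; anything larger takes the merge path shown above.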
private static StreamDescriptor createStreamDescriptor(String parentStreamName, String fieldName, int typeId, List<OrcType> types, OrcDataSource dataSource)
{
    OrcType type = types.get(typeId);

    if (!fieldName.isEmpty()) {
        parentStreamName += "." + fieldName;
    }

    ImmutableList.Builder<StreamDescriptor> nestedStreams = ImmutableList.builder();
    if (type.getOrcTypeKind() == OrcTypeKind.STRUCT) {
        for (int i = 0; i < type.getFieldCount(); ++i) {
            nestedStreams.add(createStreamDescriptor(parentStreamName, type.getFieldName(i), type.getFieldTypeIndex(i), types, dataSource));
        }
    }
    else if (type.getOrcTypeKind() == OrcTypeKind.LIST) {
        nestedStreams.add(createStreamDescriptor(parentStreamName, "item", type.getFieldTypeIndex(0), types, dataSource));
    }
    else if (type.getOrcTypeKind() == OrcTypeKind.MAP) {
        nestedStreams.add(createStreamDescriptor(parentStreamName, "key", type.getFieldTypeIndex(0), types, dataSource));
        nestedStreams.add(createStreamDescriptor(parentStreamName, "value", type.getFieldTypeIndex(1), types, dataSource));
    }
    return new StreamDescriptor(parentStreamName, typeId, fieldName, type.getOrcTypeKind(), dataSource, nestedStreams.build());
}
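Stream names compose as the recursion descends: struct fields append their own names, while list and map children always get the fixed labels "item", "key", and "value". A minimal sketch of just that naming rule; Node, StreamNameDemo, and the example column are hypothetical stand-ins for the real type model.

import java.util.ArrayList;
import java.util.List;

public class StreamNameDemo
{
    // Hypothetical stand-in for the ORC type tree: a kind plus named children
    record Node(String kind, List<String> childNames, List<Node> children) {}

    // Mirrors the name-building part of createStreamDescriptor
    static void collectNames(String parentName, String fieldName, Node node, List<String> out)
    {
        if (!fieldName.isEmpty()) {
            parentName += "." + fieldName;
        }
        out.add(parentName);
        for (int i = 0; i < node.children().size(); i++) {
            collectNames(parentName, node.childNames().get(i), node.children().get(i), out);
        }
    }

    public static void main(String[] args)
    {
        // Column "orders" of type map<varchar, array<bigint>>
        Node bigint = new Node("LONG", List.of(), List.of());
        Node array = new Node("LIST", List.of("item"), List.of(bigint));
        Node varchar = new Node("STRING", List.of(), List.of());
        Node map = new Node("MAP", List.of("key", "value"), List.of(varchar, array));

        List<String> names = new ArrayList<>();
        collectNames("orders", "", map, names);
        names.forEach(System.out::println);
        // orders
        // orders.key
        // orders.value
        // orders.value.item
    }
}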
ImmutableSet.Builder<SliceDictionaryColumnWriter> sliceColumnWriters = ImmutableSet.builder();
ImmutableList.Builder<ColumnWriter> columnWriters = ImmutableList.builder();
for (int fieldId = 0; fieldId < types.size(); fieldId++) {
    int fieldColumnIndex = rootType.getFieldTypeIndex(fieldId);
    Type fieldType = types.get(fieldId);
    ColumnWriter columnWriter = createColumnWriter(fieldColumnIndex, orcTypes, fieldType, compression, maxCompressionBufferSize, orcEncoding, hiveStorageTimeZone, options.getMaxStringStatisticsLimit());
    // Loop body completed by assumption: collect every writer, and track dictionary-encoded
    // writers separately so their dictionary memory can be managed
    columnWriters.add(columnWriter);
    if (columnWriter instanceof SliceDictionaryColumnWriter) {
        sliceColumnWriters.add((SliceDictionaryColumnWriter) columnWriter);
    }
}
// Reconstructed from a truncated snippet: the LIST, MAP, and STRUCT cases of
// createColumnWriter; the switch/case framing is an assumption, and the final
// writer constructors are elided in the source
switch (orcType.getOrcTypeKind()) {
    case LIST: {
        int fieldColumnIndex = orcType.getFieldTypeIndex(0);
        Type fieldType = type.getTypeParameters().get(0);
        ColumnWriter elementWriter = createColumnWriter(fieldColumnIndex, orcTypes, fieldType, compression, bufferSize, orcEncoding, hiveStorageTimeZone, stringStatisticsLimit);
        // construct and return the list writer around elementWriter (elided in the source)
    }
    case MAP: {
        // The key writer's trailing arguments are filled in by assumption, mirroring
        // the element writer call above
        ColumnWriter keyWriter = createColumnWriter(
                orcType.getFieldTypeIndex(0),
                orcTypes,
                type.getTypeParameters().get(0),
                compression,
                bufferSize,
                orcEncoding,
                hiveStorageTimeZone,
                stringStatisticsLimit);
        ColumnWriter valueWriter = createColumnWriter(
                orcType.getFieldTypeIndex(1),
                orcTypes,
                type.getTypeParameters().get(1),
                compression,
                bufferSize,
                orcEncoding,
                hiveStorageTimeZone,
                stringStatisticsLimit);
        // construct and return the map writer around keyWriter and valueWriter (elided in the source)
    }
    case STRUCT: {
        ImmutableList.Builder<ColumnWriter> fieldWriters = ImmutableList.builder();
        for (int fieldId = 0; fieldId < orcType.getFieldCount(); fieldId++) {
            int fieldColumnIndex = orcType.getFieldTypeIndex(fieldId);
            Type fieldType = type.getTypeParameters().get(fieldId);
            fieldWriters.add(createColumnWriter(fieldColumnIndex, orcTypes, fieldType, compression, bufferSize, orcEncoding, hiveStorageTimeZone, stringStatisticsLimit));
        }
        // construct and return the struct writer around fieldWriters.build() (elided in the source)
    }
}
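Taken together with the top-level loop above, writer construction mirrors the flattened type list: each composite writer resolves its children through getFieldTypeIndex, so the resulting writer tree has the same shape as the ORC type tree, with one writer per type ID reachable from the selected root fields.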
// Variant for indexes keyed directly by column: each column has exactly one list of
// row-group indexes, so no per-stream grouping or merging is needed
private static Map<Integer, ColumnStatistics> getRowGroupStatistics(OrcType rootStructType, Map<Integer, List<RowGroupIndex>> columnIndexes, int rowGroup)
{
    requireNonNull(rootStructType, "rootStructType is null");
    checkArgument(rootStructType.getOrcTypeKind() == OrcTypeKind.STRUCT);
    requireNonNull(columnIndexes, "columnIndexes is null");
    checkArgument(rowGroup >= 0, "rowGroup is negative");

    ImmutableMap.Builder<Integer, ColumnStatistics> statistics = ImmutableMap.builder();
    for (int ordinal = 0; ordinal < rootStructType.getFieldCount(); ordinal++) {
        List<RowGroupIndex> rowGroupIndexes = columnIndexes.get(rootStructType.getFieldTypeIndex(ordinal));
        if (rowGroupIndexes != null) {
            statistics.put(ordinal, rowGroupIndexes.get(rowGroup).getColumnStatistics());
        }
    }
    return statistics.build();
}
// Reconstructed switch cases from a type-mapping helper (getType); the DECIMAL label,
// the STRUCT loop framing, and the row-type return are assumptions completing the
// truncated source
case DECIMAL:
    return DecimalType.createDecimalType(type.getPrecision().get(), type.getScale().get());
case LIST: {
    // e.g. ORC list<bigint> maps to the engine type array(bigint)
    TypeSignature elementType = getType(types, type.getFieldTypeIndex(0)).getTypeSignature();
    return typeManager.getParameterizedType(StandardTypes.ARRAY, ImmutableList.of(TypeSignatureParameter.of(elementType)));
}
case MAP: {
    TypeSignature keyType = getType(types, type.getFieldTypeIndex(0)).getTypeSignature();
    TypeSignature valueType = getType(types, type.getFieldTypeIndex(1)).getTypeSignature();
    return typeManager.getParameterizedType(StandardTypes.MAP, ImmutableList.of(TypeSignatureParameter.of(keyType), TypeSignatureParameter.of(valueType)));
}
case STRUCT: {
    ImmutableList.Builder<TypeSignatureParameter> fieldTypes = ImmutableList.builder();
    List<String> fieldNames = type.getFieldNames();
    for (int i = 0; i < type.getFieldCount(); i++) {
        fieldTypes.add(TypeSignatureParameter.of(new NamedTypeSignature(
                Optional.of(new RowFieldName(fieldNames.get(i), false)),
                getType(types, type.getFieldTypeIndex(i)).getTypeSignature())));
    }
    return typeManager.getParameterizedType(StandardTypes.ROW, fieldTypes.build());
}