/**
 * Translate the search argument to the filter predicate parquet uses. It includes
 * only the columns from the passed schema.
 * @return a filter predicate translated from the search argument; null is returned
 *         if the conversion fails.
 */
public static FilterPredicate toFilterPredicate(SearchArgument sarg, MessageType schema) {
  Set<String> columns = null;
  if (schema != null) {
    columns = new HashSet<String>();
    for (Type field : schema.getFields()) {
      columns.add(field.getName());
    }
  }
  try {
    return translate(sarg.getExpression(), sarg.getLeaves(), columns, schema);
  } catch (Exception e) {
    return null;
  }
}
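A minimal usage sketch of the helper above. The schema text is illustrative, and the SearchArgument `sarg` is assumed to have been built elsewhere (for example with Hive's SearchArgumentFactory), so treat the setup as hypothetical:

// Hypothetical caller: push down a Hive SearchArgument if translation succeeds.
static FilterPredicate pushDownOrNull(SearchArgument sarg) {
  // Illustrative projected schema; only these columns may appear in the predicate.
  MessageType projected = MessageTypeParser.parseMessageType(
      "message hive_schema { optional int32 id; optional binary name (UTF8); }");
  FilterPredicate predicate = toFilterPredicate(sarg, projected);
  // null means the sarg could not be translated; callers fall back to a plain scan.
  return predicate;
}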
List<Type> types = requestedSchema.getFields();
columnReaders = new VectorizedColumnReader[columns.size()];
for (Type type : fullSchema.getFields()) {
  if (tsField.equals(type.getName()) || metricsFields.contains(type.getName())
public static void testConversion(
    final String columnNamesStr,
    final String columnsTypeStr,
    final String actualSchema) throws Exception {
  final List<String> columnNames = createHiveColumnsFrom(columnNamesStr);
  final List<TypeInfo> columnTypes = createHiveTypeInfoFrom(columnsTypeStr);
  final MessageType messageTypeFound = HiveSchemaConverter.convert(columnNames, columnTypes);
  final MessageType expectedMT = MessageTypeParser.parseMessageType(actualSchema);
  assertEquals("converting " + columnNamesStr + ": " + columnsTypeStr + " to " + actualSchema,
      expectedMT, messageTypeFound);

  // The original types must be checked manually, as PrimitiveType.equals does not compare them.
  List<Type> expectedFields = expectedMT.getFields();
  List<Type> actualFields = messageTypeFound.getFields();
  for (int i = 0, n = expectedFields.size(); i < n; ++i) {
    OriginalType exp = expectedFields.get(i).getOriginalType();
    OriginalType act = actualFields.get(i).getOriginalType();
    assertEquals("Original types of the field do not match", exp, act);
  }
}
@Test
public void testMapOriginalType() throws Exception {
  final String hiveColumnTypes = "map<string,string>";
  final String hiveColumnNames = "mapCol";
  final List<String> columnNames = createHiveColumnsFrom(hiveColumnNames);
  final List<TypeInfo> columnTypes = createHiveTypeInfoFrom(hiveColumnTypes);
  final MessageType messageTypeFound = HiveSchemaConverter.convert(columnNames, columnTypes);

  // The message type has a single optional field named mapCol whose original type is MAP.
  assertEquals(1, messageTypeFound.getFieldCount());
  org.apache.parquet.schema.Type topLevel = messageTypeFound.getFields().get(0);
  assertEquals("mapCol", topLevel.getName());
  assertEquals(OriginalType.MAP, topLevel.getOriginalType());
  assertEquals(Repetition.OPTIONAL, topLevel.getRepetition());

  assertEquals(1, topLevel.asGroupType().getFieldCount());
  org.apache.parquet.schema.Type secondLevel = topLevel.asGroupType().getFields().get(0);
  // mapCol contains one repeated field named "map" whose original type is MAP_KEY_VALUE.
  assertEquals("map", secondLevel.getName());
  assertEquals(OriginalType.MAP_KEY_VALUE, secondLevel.getOriginalType());
  assertEquals(Repetition.REPEATED, secondLevel.getRepetition());
}
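For reference, the structure asserted above corresponds roughly to the following Parquet schema text. This is a sketch: the key/value annotations are assumptions based on map<string,string> and Parquet's legacy MAP_KEY_VALUE layout, which writes a required key and an optional value:

message hive_schema {
  optional group mapCol (MAP) {
    repeated group map (MAP_KEY_VALUE) {
      required binary key (UTF8);
      optional binary value (UTF8);
    }
  }
}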
private void initializeInternal() throws IOException, UnsupportedOperationException {
  // Check that the requested schema is supported.
  missingColumns = new boolean[requestedSchema.getFieldCount()];
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  List<String[]> paths = requestedSchema.getPaths();
  for (int i = 0; i < requestedSchema.getFieldCount(); ++i) {
    Type t = requestedSchema.getFields().get(i);
    if (!t.isPrimitive() || t.isRepetition(Type.Repetition.REPEATED)) {
      throw new UnsupportedOperationException("Complex types not supported.");
    }

    String[] colPath = paths.get(i);
    if (fileSchema.containsPath(colPath)) {
      ColumnDescriptor fd = fileSchema.getColumnDescription(colPath);
      if (!fd.equals(columns.get(i))) {
        throw new UnsupportedOperationException("Schema evolution not supported.");
      }
      missingColumns[i] = false;
    } else {
      if (columns.get(i).getMaxDefinitionLevel() == 0) {
        // Column is missing in data but the required data is non-nullable. This file is invalid.
        throw new IOException("Required column is missing in data file. Col: "
            + Arrays.toString(colPath));
      }
      missingColumns[i] = true;
    }
  }
}
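The getMaxDefinitionLevel() == 0 test works because only a column that is required at every level of its path has a max definition level of 0; any optional ancestor raises it. A minimal sketch of that relationship against the Parquet schema API directly (schema text is illustrative):

static void definitionLevelDemo() {
  MessageType s = MessageTypeParser.parseMessageType(
      "message m { required int32 id; optional binary name (UTF8); }");
  // A required column can never be null, so its max definition level is 0.
  int idLevel = s.getColumnDescription(new String[] {"id"}).getMaxDefinitionLevel();     // 0
  // An optional column may be null, so its max definition level is 1.
  int nameLevel = s.getColumnDescription(new String[] {"name"}).getMaxDefinitionLevel(); // 1
}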
/**
 * Converts a Parquet schema to a Tajo schema.
 *
 * @param parquetSchema The Parquet schema to convert.
 * @return The resulting Tajo schema.
 */
public Schema convert(MessageType parquetSchema) {
  return convertFields(parquetSchema.getFields());
}
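A minimal usage sketch, assuming this method lives on Tajo's Parquet-to-Tajo schema converter (instantiated here as `converter`, a hypothetical variable; construction details omitted). The schema text is illustrative:

MessageType parquetSchema = MessageTypeParser.parseMessageType(
    "message table1 { required int32 id; optional binary name (UTF8); }");
// Produces one Tajo column per top-level Parquet field.
Schema tajoSchema = converter.convert(parquetSchema);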
public static List<IParquetInputField> buildInputFields( MessageType schema ) {
  List<IParquetInputField> inputFields = new ArrayList<>();
  for ( Type type : schema.getFields() ) {
    if ( type.isPrimitive() ) {
      inputFields.add( convertField( type ) );
    }
  }
  return inputFields;
}
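A small sketch of the filtering behaviour: because only type.isPrimitive() fields are converted, a nested group at the top level is silently skipped (schema text is illustrative):

MessageType schema = MessageTypeParser.parseMessageType(
    "message m {"
        + " required int32 id;"
        + " optional binary name (UTF8);"
        + " optional group address { optional binary city (UTF8); }"
        + " }");
// Produces input fields for id and name only; the address group is ignored.
List<IParquetInputField> fields = buildInputFields( schema );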
public Schema convert(MessageType parquetSchema) {
  return convertFields(parquetSchema.getName(), parquetSchema.getFields());
}
public static int getFieldIndex(MessageType fileSchema, String name) {
  try {
    return fileSchema.getFieldIndex(name.toLowerCase(Locale.ENGLISH));
  } catch (InvalidRecordException e) {
    for (org.apache.parquet.schema.Type type : fileSchema.getFields()) {
      if (type.getName().equalsIgnoreCase(name)) {
        return fileSchema.getFieldIndex(type.getName());
      }
    }
    return -1;
  }
}
public static org.apache.parquet.schema.Type getParquetTypeByName(String columnName, MessageType messageType) {
  if (messageType.containsField(columnName)) {
    return messageType.getType(columnName);
  }
  // Parquet is case-sensitive, but Hive is not: all Hive columns get converted to lowercase.
  // The direct match above covers the common case; if it finds nothing, fall back to a
  // case-insensitive match.
  for (org.apache.parquet.schema.Type type : messageType.getFields()) {
    if (type.getName().equalsIgnoreCase(columnName)) {
      return type;
    }
  }
  return null;
}
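A short sketch of the fallback behaviour (schema text and column names are illustrative):

MessageType schema = MessageTypeParser.parseMessageType(
    "message m { optional int64 EventTime; }");
// Hive hands over the lower-cased name; containsField fails, so the loop matches case-insensitively.
getParquetTypeByName("eventtime", schema); // returns the EventTime field
getParquetTypeByName("missing", schema);   // returns null: no direct or case-insensitive match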
/**
 * Maps an Arrow schema to a Parquet schema.
 * For now this does not validate primitive type compatibility.
 * @param arrowSchema an Arrow schema
 * @param parquetSchema a Parquet message type
 * @return the mapping between the two
 */
public SchemaMapping map(Schema arrowSchema, MessageType parquetSchema) {
  List<TypeMapping> children = map(arrowSchema.getFields(), parquetSchema.getFields());
  return new SchemaMapping(arrowSchema, parquetSchema, children);
}
private Map<Integer, Converter> buildFieldToConverter(final MessageType schema) {
  final Map<Integer, Converter> fieldToConverter = new HashMap<>(fieldCount);
  int i = 0;
  for (final Type field : schema.getFields()) {
    if (field.isPrimitive()) {
      fieldToConverter.put(i, new PrimitiveConverter(parquetColumnToObject,
          field.asPrimitiveType().getPrimitiveTypeName().javaType.getSimpleName(),
          new String[]{field.getName()}, field.getOriginalType()));
    } else {
      fieldToConverter.put(i, new BypassGroupConverter(parquetColumnToObject,
          field.asGroupType(), new String[]{field.getName()}));
    }
    i++;
  }
  return fieldToConverter;
}
public static MessageType addFallbackIds(MessageType fileSchema) {
  MessageTypeBuilder builder = org.apache.parquet.schema.Types.buildMessage();
  int ordinal = 1; // ids are assigned starting at 1
  for (Type type : fileSchema.getFields()) {
    builder.addField(type.withId(ordinal));
    ordinal += 1;
  }
  return builder.named(fileSchema.getName());
}
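A minimal sketch of the resulting ids (schema text is illustrative):

MessageType fileSchema = MessageTypeParser.parseMessageType(
    "message m { required int32 a; optional binary b (UTF8); }");
MessageType withIds = addFallbackIds(fileSchema);
// Ids follow field order, starting at 1.
int firstId = withIds.getFields().get(0).getId().intValue();  // 1
int secondId = withIds.getFields().get(1).getId().intValue(); // 2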
/**
 * Creates an Arrow schema from a Parquet one and returns the mapping.
 * @param parquetSchema the provided Parquet schema
 * @return the mapping between the two
 */
public SchemaMapping fromParquet(MessageType parquetSchema) {
  List<Type> fields = parquetSchema.getFields();
  List<TypeMapping> mappings = fromParquet(fields);
  List<Field> arrowFields = fields(mappings);
  return new SchemaMapping(new Schema(arrowFields), parquetSchema, mappings);
}
private void verifyParquetSchema() {
  ParquetReader reader = new ParquetReader(tableDirPath);
  MessageType parquetSchema = reader.readParquetSchema();
  String[] types = configuration.getTypes();
  for (int i = 0; i < types.length; i++) {
    String type = types[i];
    if (isNumericSqlType(type)) {
      OriginalType parquetFieldType = parquetSchema.getFields().get(i).getOriginalType();
      assertEquals(OriginalType.DECIMAL, parquetFieldType);
    }
  }
}