/**
 * Projects the requested columns out of a Parquet message schema, looking the columns up by
 * name, and wraps the projected fields in a new message type.
 *
 * @param schema Message type schema in which to search for the column names.
 * @param colNames List of column names to project.
 * @param colTypes List of column types, passed on together with {@code colNames}.
 * @return A MessageType carrying the name of {@code schema} and the projected fields.
 */
public static MessageType getSchemaByName(MessageType schema, List<String> colNames,
    List<TypeInfo> colTypes) {
  final Type[] projected =
      getProjectedGroupFields(schema, colNames, colTypes).toArray(new Type[0]);
  return Types.buildMessage().addFields(projected).named(schema.getName());
}
if (typeInfo.getCategory().equals(Category.PRIMITIVE)) { if (typeInfo.equals(TypeInfoFactory.stringTypeInfo)) { return Types.primitive(PrimitiveTypeName.BINARY, repetition).as(OriginalType.UTF8) .named(name); } else if (typeInfo.equals(TypeInfoFactory.intTypeInfo)) { return Types.primitive(PrimitiveTypeName.INT32, repetition).named(name); } else if (typeInfo.equals(TypeInfoFactory.shortTypeInfo)) { return Types.primitive(PrimitiveTypeName.INT32, repetition) .as(OriginalType.INT_16).named(name); } else if (typeInfo.equals(TypeInfoFactory.byteTypeInfo)) { return Types.primitive(PrimitiveTypeName.INT32, repetition) .as(OriginalType.INT_8).named(name); } else if (typeInfo.equals(TypeInfoFactory.longTypeInfo)) { return Types.primitive(PrimitiveTypeName.INT64, repetition).named(name); } else if (typeInfo.equals(TypeInfoFactory.doubleTypeInfo)) { return Types.primitive(PrimitiveTypeName.DOUBLE, repetition).named(name); } else if (typeInfo.equals(TypeInfoFactory.floatTypeInfo)) { return Types.primitive(PrimitiveTypeName.FLOAT, repetition).named(name); } else if (typeInfo.equals(TypeInfoFactory.booleanTypeInfo)) { return Types.primitive(PrimitiveTypeName.BOOLEAN, repetition).named(name); } else if (typeInfo.equals(TypeInfoFactory.binaryTypeInfo)) { return Types.primitive(PrimitiveTypeName.BINARY, repetition).named(name); } else if (typeInfo.equals(TypeInfoFactory.timestampTypeInfo)) { return Types.primitive(PrimitiveTypeName.INT96, repetition).named(name); } else if (typeInfo.equals(TypeInfoFactory.voidTypeInfo)) { throw new UnsupportedOperationException("Void type not implemented"); } else if (typeInfo.getTypeName().toLowerCase().startsWith( serdeConstants.CHAR_TYPE_NAME)) { return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8)
return Types.buildGroup(fieldType.getRepetition()) .addFields(typesArray) .named(fieldType.getName()); subFieldType = getProjectedType(elemType, subFieldType); return Types.buildGroup(Repetition.OPTIONAL).as(OriginalType.LIST).addFields( subFieldType).named(fieldType.getName());
return Types.primitive(PrimitiveTypeName.BINARY, repetition).as(OriginalType.UTF8) .named(name); typeInfo.equals(TypeInfoFactory.shortTypeInfo) || typeInfo.equals(TypeInfoFactory.byteTypeInfo)) { return Types.primitive(PrimitiveTypeName.INT32, repetition).named(name); return Types.primitive(PrimitiveTypeName.INT64, repetition).named(name); return Types.primitive(PrimitiveTypeName.DOUBLE, repetition).named(name); return Types.primitive(PrimitiveTypeName.FLOAT, repetition).named(name); return Types.primitive(PrimitiveTypeName.BOOLEAN, repetition).named(name); return Types.primitive(PrimitiveTypeName.BINARY, repetition).named(name); return Types.primitive(PrimitiveTypeName.INT96, repetition).named(name); serdeConstants.CHAR_TYPE_NAME)) { if (repetition == Repetition.OPTIONAL) { return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); return Types.repeated(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); serdeConstants.VARCHAR_TYPE_NAME)) { if (repetition == Repetition.OPTIONAL) { return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name);
/**
 * Searches a Parquet group schema for each requested column (lookup is done through
 * {@code getFieldTypeIgnoreCase}) and collects the corresponding Parquet schema types.
 *
 * @param schema Group schema in which to search for the column names.
 * @param colNames List of column names.
 * @param colTypes List of column types; the entry at position i is used for the column name at
 *                 position i.
 * @return One Type per entry of {@code colNames}, in the same order.
 */
private static List<Type> getProjectedGroupFields(GroupType schema, List<String> colNames,
    List<TypeInfo> colTypes) {
  List<Type> projected = new ArrayList<Type>();

  int position = 0;
  for (String colName : colNames) {
    TypeInfo colType = colTypes.get(position++);
    Type fieldType = getFieldTypeIgnoreCase(schema, colName);

    if (fieldType == null) {
      // The column was not found in the schema: stand in an optional BINARY field of that name.
      projected.add(Types.optional(PrimitiveTypeName.BINARY).named(colName));
      continue;
    }
    projected.add(getProjectedType(colType, fieldType));
  }

  return projected;
}
/**
 * Builds the Parquet primitive type described by the arguments and reports whether the
 * projection filter keeps or drops the field at {@code state.path}.
 *
 * @param type primitive type name of the field
 * @param orig original-type annotation to apply, or {@code null} to leave the type unannotated
 * @param state supplies the repetition, path and name of the field being visited
 * @return a {@code Keep} holding the named primitive type when the filter keeps the path,
 *         otherwise a {@code Drop} for that path
 */
private ConvertedField visitPrimitiveType(PrimitiveTypeName type, OriginalType orig, State state) {
  final PrimitiveBuilder<PrimitiveType> plain = primitive(type, state.repetition);
  final PrimitiveBuilder<PrimitiveType> builder = (orig == null) ? plain : plain.as(orig);

  if (!fieldProjectionFilter.keep(state.path)) {
    return new Drop(state.path);
  }
  return new Keep(state.path, builder.named(state.name));
}
case DECIMAL: if ( f.getAllowNull() ) { return Types.optional( PrimitiveType.PrimitiveTypeName.BINARY ).as( OriginalType.DECIMAL ).precision( f.getPrecision() ).scale( f.getScale() ).named( formatFieldName ); } else { return Types.required( PrimitiveType.PrimitiveTypeName.BINARY ).as( OriginalType.DECIMAL ).precision( f.getPrecision() ).scale( f.getScale() ).named( formatFieldName ); return Types.optional( PrimitiveType.PrimitiveTypeName.INT32 ).as( OriginalType.DECIMAL ).precision( f.getPrecision() ).scale( f.getScale() ).named( formatFieldName ); } else { return Types.required( PrimitiveType.PrimitiveTypeName.INT32 ).as( OriginalType.DECIMAL ).precision( f.getPrecision() ).scale( f.getScale() ).named( formatFieldName ); return Types.optional( PrimitiveType.PrimitiveTypeName.INT64 ).as( OriginalType.DECIMAL ).precision( f.getPrecision() ).scale( f.getScale() ).named( formatFieldName ); } else { return Types.required( PrimitiveType.PrimitiveTypeName.INT64 ).as( OriginalType.DECIMAL ).precision( f.getPrecision() ).scale( f.getScale() ).named( formatFieldName );
/**
 * Starts a map builder whose repetition is {@code Type.Repetition.OPTIONAL}.
 *
 * @return the builder returned by {@code map(Type.Repetition.OPTIONAL)}
 */
public static MapBuilder<GroupType> optionalMap() {
  final Type.Repetition repetition = Type.Repetition.OPTIONAL;
  return map(repetition);
}
/**
 * Starts a list builder whose repetition is {@code Type.Repetition.REQUIRED}.
 *
 * @return the builder returned by {@code list(Type.Repetition.REQUIRED)}
 */
public static ListBuilder<GroupType> requiredList() {
  final Type.Repetition repetition = Type.Repetition.REQUIRED;
  return list(repetition);
}
return Types.primitive(PrimitiveTypeName.BINARY, repetition).as(OriginalType.UTF8) .named(name); typeInfo.equals(TypeInfoFactory.shortTypeInfo) || typeInfo.equals(TypeInfoFactory.byteTypeInfo)) { return Types.primitive(PrimitiveTypeName.INT32, repetition).named(name); return Types.primitive(PrimitiveTypeName.INT64, repetition).named(name); return Types.primitive(PrimitiveTypeName.DOUBLE, repetition).named(name); return Types.primitive(PrimitiveTypeName.FLOAT, repetition).named(name); return Types.primitive(PrimitiveTypeName.BOOLEAN, repetition).named(name); return Types.primitive(PrimitiveTypeName.BINARY, repetition).named(name); return Types.primitive(PrimitiveTypeName.INT96, repetition).named(name); serdeConstants.CHAR_TYPE_NAME)) { if (repetition == Repetition.OPTIONAL) { return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); return Types.repeated(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); serdeConstants.VARCHAR_TYPE_NAME)) { if (repetition == Repetition.OPTIONAL) { return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name);
/** * Searches column names by indexes on a given Parquet file schema, and returns its corresponded * Parquet schema types. * * @param schema Message schema where to search for column names. * @param colNames List of column names. * @param colIndexes List of column indexes. * @return A MessageType object of the column names found. */ public static MessageType getSchemaByIndex(MessageType schema, List<String> colNames, List<Integer> colIndexes) { List<Type> schemaTypes = new ArrayList<Type>(); for (Integer i : colIndexes) { if (i < colNames.size()) { if (i < schema.getFieldCount()) { schemaTypes.add(schema.getType(i)); } else { //prefixing with '_mask_' to ensure no conflict with named //columns in the file schema schemaTypes.add( Types.optional(PrimitiveTypeName.BINARY).named("_mask_" + colNames.get(i))); } } } return new MessageType(schema.getName(), schemaTypes); }
private static Type convertType( final String name, final InternalType type, final Type.Repetition repetition) { if (DataTypes.INT.equals(type)) { return Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, repetition).named(name); } else if (DataTypes.SHORT.equals(type)) { return Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, repetition) .as(OriginalType.INT_16) .named(name); } else if (DataTypes.BOOLEAN.equals(type)) { return Types.primitive(PrimitiveType.PrimitiveTypeName.BOOLEAN, repetition).named(name); } else if (DataTypes.BYTE.equals(type)) { return Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, repetition) .as(OriginalType.INT_8) .named(name); } else if (DataTypes.DOUBLE.equals(type)) { return Types.primitive(PrimitiveType.PrimitiveTypeName.DOUBLE, repetition).named(name); } else if (DataTypes.FLOAT.equals(type)) { return Types.primitive(PrimitiveType.PrimitiveTypeName.FLOAT, repetition).named(name); } else if (DataTypes.LONG.equals(type)) { return Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, repetition).named(name); } else if (DataTypes.STRING.equals(type)) { return Types.primitive(PrimitiveType.PrimitiveTypeName.BINARY, repetition).as(OriginalType.UTF8) .named(name); } else if (DataTypes.DATE.equals(type)) { return Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, repetition).as(OriginalType.DATE).named(name); } else if (DataTypes.TIME.equals(type)) { return Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, repetition) .as(OriginalType.TIME_MILLIS) .named(name); } else if (DataTypes.TIMESTAMP.equals(type)) {
/**
 * Starts a map builder whose repetition is {@code Type.Repetition.OPTIONAL}.
 *
 * @return the builder returned by {@code map(Type.Repetition.OPTIONAL)}
 */
public static MapBuilder<GroupType> optionalMap() {
  final Type.Repetition repetition = Type.Repetition.OPTIONAL;
  return map(repetition);
}
/**
 * Starts a list builder whose repetition is {@code Type.Repetition.OPTIONAL}.
 *
 * @return the builder returned by {@code list(Type.Repetition.OPTIONAL)}
 */
public static ListBuilder<GroupType> optionalList() {
  final Type.Repetition repetition = Type.Repetition.OPTIONAL;
  return list(repetition);
}
/**
 * Writes a single record whose repeated INT32 field "list_of_ints" carries the values 34, 35
 * and 36 (no list annotation on the field), then checks that reading the file back yields
 * exactly one record equal to the expected list of three IntWritables.
 */
@Test
public void testUnannotatedListOfPrimitives() throws Exception {
  final String schemaName = "UnannotatedListOfPrimitives";
  final String fieldName = "list_of_ints";

  MessageType fileSchema = Types.buildMessage()
      .repeated(INT32).named(fieldName)
      .named(schemaName);

  DirectWriter writer = new DirectWriter() {
    @Override
    public void write(RecordConsumer rc) {
      rc.startMessage();
      rc.startField(fieldName, 0);
      for (int value = 34; value <= 36; value++) {
        rc.addInteger(value);
      }
      rc.endField(fieldName, 0);
      rc.endMessage();
    }
  };
  Path test = writeDirect(schemaName, fileSchema, writer);

  ArrayWritable expected = list(
      new IntWritable(34),
      new IntWritable(35),
      new IntWritable(36));

  List<ArrayWritable> records = read(test);

  Assert.assertEquals("Should have only one record", 1, records.size());
  assertEquals("Should match expected record", expected, records.get(0));
}
return Types.primitive(PrimitiveTypeName.BINARY, repetition).as(OriginalType.UTF8) .named(name); typeInfo.equals(TypeInfoFactory.shortTypeInfo) || typeInfo.equals(TypeInfoFactory.byteTypeInfo)) { return Types.primitive(PrimitiveTypeName.INT32, repetition).named(name); return Types.primitive(PrimitiveTypeName.INT64, repetition).named(name); return Types.primitive(PrimitiveTypeName.DOUBLE, repetition).named(name); return Types.primitive(PrimitiveTypeName.FLOAT, repetition).named(name); return Types.primitive(PrimitiveTypeName.BOOLEAN, repetition).named(name); return Types.primitive(PrimitiveTypeName.BINARY, repetition).named(name); return Types.primitive(PrimitiveTypeName.INT96, repetition).named(name); serdeConstants.CHAR_TYPE_NAME)) { if (repetition == Repetition.OPTIONAL) { return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); return Types.repeated(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); serdeConstants.VARCHAR_TYPE_NAME)) { if (repetition == Repetition.OPTIONAL) { return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name);
return Types.primitive(PrimitiveTypeName.BINARY, repetition).as(OriginalType.UTF8) .named(name); typeInfo.equals(TypeInfoFactory.shortTypeInfo) || typeInfo.equals(TypeInfoFactory.byteTypeInfo)) { return Types.primitive(PrimitiveTypeName.INT32, repetition).named(name); return Types.primitive(PrimitiveTypeName.INT64, repetition).named(name); return Types.primitive(PrimitiveTypeName.DOUBLE, repetition).named(name); return Types.primitive(PrimitiveTypeName.FLOAT, repetition).named(name); return Types.primitive(PrimitiveTypeName.BOOLEAN, repetition).named(name); return Types.primitive(PrimitiveTypeName.BINARY, repetition).named(name); return Types.primitive(PrimitiveTypeName.INT96, repetition).named(name); return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); int scale = decimalTypeInfo.scale(); int bytes = ParquetHiveSerDe.PRECISION_TO_BYTE_COUNT[prec - 1]; return Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(bytes).as(OriginalType.DECIMAL).scale(scale).precision(prec).named(name); return Types.primitive(PrimitiveTypeName.INT32, repetition).as(OriginalType.DATE).named(name);
// Adds an optional BINARY field named "_mask_" + the column name at index i.
// NOTE(review): the surrounding method is not visible here; the "_mask_" prefix presumably
// keeps this placeholder from clashing with real column names in the file schema - confirm.
schemaTypes.add(Types.optional(PrimitiveTypeName.BINARY).named("_mask_" + colNames.get(i)));
Schema.Type type = schema.getType(); if (type.equals(Schema.Type.BOOLEAN)) { builder = Types.primitive(BOOLEAN, repetition); } else if (type.equals(Schema.Type.INT)) { builder = Types.primitive(INT32, repetition); } else if (type.equals(Schema.Type.LONG)) { builder = Types.primitive(INT64, repetition); } else if (type.equals(Schema.Type.FLOAT)) { builder = Types.primitive(FLOAT, repetition); } else if (type.equals(Schema.Type.DOUBLE)) { builder = Types.primitive(DOUBLE, repetition); } else if (type.equals(Schema.Type.BYTES)) { builder = Types.primitive(BINARY, repetition); } else if (type.equals(Schema.Type.STRING)) { builder = Types.primitive(BINARY, repetition).as(UTF8); } else if (type.equals(Schema.Type.RECORD)) { return new GroupType(repetition, fieldName, convertFields(schema.getFields())); } else if (type.equals(Schema.Type.ENUM)) { builder = Types.primitive(BINARY, repetition).as(ENUM); } else if (type.equals(Schema.Type.ARRAY)) { if (writeOldListStructure) { builder = Types.primitive(FIXED_LEN_BYTE_ARRAY, repetition) .length(schema.getFixedSize()); } else if (type.equals(Schema.Type.UNION)) {
/**
 * Handles a struct type: converts {@code children} with {@code fromArrow}, adds the resulting
 * mappings to an OPTIONAL group named {@code fieldName}, and wraps the group together with the
 * child mappings in a {@code StructTypeMapping} for {@code field}.
 */
@Override
public TypeMapping visit(Struct_ type) {
  final List<TypeMapping> childMappings = fromArrow(children);
  return new StructTypeMapping(
      field,
      addToBuilder(childMappings, Types.buildGroup(OPTIONAL)).named(fieldName),
      childMappings);
}