/**
 * Selects columns from a Parquet file schema that carries no field ids.
 * <p>
 * Such files are read under the assumption that schema evolution kept the column order and never
 * deleted a column, so a column's 1-based position in the file stands in for its field id.
 * <p>
 * A top-level file column at position {@code n} is kept only when the expected schema has a
 * top-level column whose field id is {@code n}; each kept column is tagged with that position as
 * its id. Kept columns appear in the same order as in the Parquet file.
 *
 * @param fileSchema schema from a Parquet file that does not have field ids
 * @param expectedSchema expected schema
 * @return a Parquet schema, named like {@code fileSchema}, pruned using the expected schema
 */
public static MessageType pruneColumnsFallback(MessageType fileSchema, Schema expectedSchema) {
  Set<Integer> wantedIds = Sets.newHashSet();
  for (Types.NestedField column : expectedSchema.columns()) {
    wantedIds.add(column.fieldId());
  }

  MessageTypeBuilder pruned = org.apache.parquet.schema.Types.buildMessage();

  // Positions are 1-based: the counter is advanced before each column is examined.
  int position = 0;
  for (Type fileField : fileSchema.getFields()) {
    position += 1;
    if (wantedIds.contains(position)) {
      pruned.addField(fileField.withId(position));
    }
  }

  return pruned.named(fileSchema.getName());
}
/**
 * Rebuilds the message from its top-level fields.
 * <p>
 * For each position, the original field is kept as-is when its id is in {@code selectedIds};
 * otherwise the corresponding entry of {@code fields} is used when it is non-null, and the
 * position is dropped when that entry is null.
 *
 * @param message the message being visited
 * @param fields one entry per top-level field of {@code message}; entries may be null
 *     (presumably the visitor's result for each child -- confirm against the visitor contract)
 * @return {@code message} itself when every field was kept unchanged, otherwise a new message
 *     with the same name holding the retained fields
 */
@Override
public Type message(MessageType message, List<Type> fields) {
  Types.MessageTypeBuilder builder = Types.buildMessage();
  boolean replacedAny = false;
  int kept = 0;

  for (int pos = 0; pos < fields.size(); pos++) {
    Type original = message.getType(pos);
    Type visited = fields.get(pos);
    boolean selected = selectedIds.contains(getId(original));

    if (!selected && visited == null) {
      // neither selected directly nor retained by the visit: drop this field
      continue;
    }

    if (selected) {
      builder.addField(original);
    } else {
      builder.addField(visited);
      replacedAny = true;
    }
    kept++;
  }

  // Reuse the incoming instance only when nothing was substituted and nothing was dropped.
  if (!replacedAny && message.getFieldCount() == kept) {
    return message;
  }
  return builder.named(message.getName());
}
/**
 * Clips {@code parquetSchema} according to {@code fieldNames}.
 * <p>
 * The resulting message contains the named fields in the order given by {@code fieldNames}
 * and is always named {@code "flink-parquet"}.
 *
 * @param parquetSchema the group whose fields are selected
 * @param fieldNames names of the fields to keep, in output order
 * @return a message type holding exactly the requested fields
 * @throws IllegalArgumentException if a requested field is not present in {@code parquetSchema}
 */
private MessageType clipParquetSchema(GroupType parquetSchema, String[] fieldNames) {
  Type[] types = new Type[fieldNames.length];
  for (int i = 0; i < fieldNames.length; ++i) {
    String fieldName = fieldNames[i];
    // GroupType#getFieldIndex throws for an unknown name instead of returning a negative
    // index, so a "< 0" test can never fire; check membership with containsField instead.
    if (!parquetSchema.containsField(fieldName)) {
      throw new IllegalArgumentException(fieldName + " does not exist");
    }
    types[i] = parquetSchema.getType(fieldName);
  }
  return Types.buildMessage().addFields(types).named("flink-parquet");
}
}
/**
 * Clips {@code parquetSchema} according to {@code fieldNames}.
 * <p>
 * The resulting message contains the named fields in the order given by {@code fieldNames}
 * and is always named {@code "flink-parquet"}.
 *
 * @param parquetSchema the group whose fields are selected
 * @param fieldNames names of the fields to keep, in output order
 * @return a message type holding exactly the requested fields
 * @throws IllegalArgumentException if a requested field is not present in {@code parquetSchema}
 */
private MessageType clipParquetSchema(GroupType parquetSchema, String[] fieldNames) {
  Type[] types = new Type[fieldNames.length];
  for (int i = 0; i < fieldNames.length; ++i) {
    String fieldName = fieldNames[i];
    // GroupType#getFieldIndex throws for an unknown name instead of returning a negative
    // index, so a "< 0" test can never fire; check membership with containsField instead.
    if (!parquetSchema.containsField(fieldName)) {
      throw new IllegalArgumentException(fieldName + " does not exist");
    }
    types[i] = parquetSchema.getType(fieldName);
  }
  return Types.buildMessage().addFields(types).named("flink-parquet");
}
/**
 * Parses the textual form of a message schema into a {@link MessageType}.
 * <p>
 * The input is tokenized on spaces, semicolons, braces, parentheses, newlines and tabs. The first
 * token is passed to {@code check} together with the expected keyword {@code message}, the second
 * token is taken as the message name, and the remaining tokens are handed to
 * {@code addGroupTypeFields} to populate the builder.
 *
 * @param schemaString the schema text
 * @return the message type named by the second token
 */
private static MessageType parse(String schemaString) {
  Tokenizer tokens = new Tokenizer(schemaString, " ;{}()\n\t");
  Types.MessageTypeBuilder messageBuilder = Types.buildMessage();

  String keyword = tokens.nextToken();
  check(keyword, "message", "start with 'message'", tokens);

  String messageName = tokens.nextToken();
  String firstBodyToken = tokens.nextToken();
  addGroupTypeFields(firstBodyToken, tokens, messageBuilder);

  return messageBuilder.named(messageName);
}
/**
 * Entry point for assembling a {@link MessageType}.
 *
 * @return a new {@link MessageTypeBuilder}
 */
public static MessageTypeBuilder buildMessage() {
  return new MessageTypeBuilder();
}
/**
 * Creates a builder whose result type is {@link MessageType} and whose repetition is set to
 * {@link Type.Repetition#REQUIRED}.
 */
private MessageTypeBuilder() {
  super(MessageType.class);
  this.repetition(Type.Repetition.REQUIRED);
}
/**
 * Entry point for assembling a {@link MessageType}.
 *
 * @return a new {@link MessageTypeBuilder}
 */
public static MessageTypeBuilder buildMessage() {
  return new MessageTypeBuilder();
}
/**
 * Converts a flattened list of schema elements into a {@link MessageType}.
 * <p>
 * The first element is treated as the root: its field id (when set) and its name are applied to
 * the message, and its declared child count is passed to {@code buildChildren}, which consumes
 * the rest of the iterator.
 *
 * @param schema schema elements, root first; must contain at least one element
 * @param columnOrders column orders forwarded unchanged to {@code buildChildren}
 * @return the message type named after the root element
 */
MessageType fromParquetSchema(List<SchemaElement> schema, List<ColumnOrder> columnOrders) {
  Iterator<SchemaElement> elements = schema.iterator();
  SchemaElement rootElement = elements.next();

  Types.MessageTypeBuilder messageBuilder = Types.buildMessage();
  if (rootElement.isSetField_id()) {
    messageBuilder.id(rootElement.field_id);
  }

  int childCount = rootElement.getNum_children();
  // NOTE(review): the trailing 0 looks like a starting column index -- confirm in buildChildren
  buildChildren(messageBuilder, elements, childCount, columnOrders, 0);

  return messageBuilder.named(rootElement.name);
}
/**
 * Converts a flattened list of schema elements into a {@link MessageType}.
 * <p>
 * The first element is treated as the root: its field id (when set) and its name are applied to
 * the message, and its declared child count is passed to {@code buildChildren}, which consumes
 * the rest of the iterator.
 *
 * @param schema schema elements, root first; must contain at least one element
 * @param columnOrders column orders forwarded unchanged to {@code buildChildren}
 * @return the message type named after the root element
 */
MessageType fromParquetSchema(List<SchemaElement> schema, List<ColumnOrder> columnOrders) {
  Iterator<SchemaElement> elements = schema.iterator();
  SchemaElement rootElement = elements.next();

  Types.MessageTypeBuilder messageBuilder = Types.buildMessage();
  if (rootElement.isSetField_id()) {
    messageBuilder.id(rootElement.field_id);
  }

  int childCount = rootElement.getNum_children();
  // NOTE(review): the trailing 0 looks like a starting column index -- confirm in buildChildren
  buildChildren(messageBuilder, elements, childCount, columnOrders, 0);

  return messageBuilder.named(rootElement.name);
}
/**
 * Builds a {@link MessageType} from a flattened list of schema elements.
 * <p>
 * The first element is the root; its child count is passed to {@code readTypeSchema}, which
 * reads the remaining elements from the iterator into the builder, and its name becomes the
 * message name.
 *
 * @param schema schema elements, root first; must contain at least one element
 * @return the message type named after the root element
 */
private static MessageType readParquetSchema(List<SchemaElement> schema) {
  Iterator<SchemaElement> elements = schema.iterator();
  SchemaElement root = elements.next();

  Types.MessageTypeBuilder messageBuilder = Types.buildMessage();
  int childCount = root.getNum_children();
  readTypeSchema(messageBuilder, elements, childCount);

  return messageBuilder.named(root.name);
}
/**
 * Builds a {@link MessageType} from a flattened list of schema elements.
 * <p>
 * The first element is the root; its child count is passed to {@code readTypeSchema}, which
 * reads the remaining elements from the iterator into the builder, and its name becomes the
 * message name.
 *
 * @param schema schema elements, root first; must contain at least one element
 * @return the message type named after the root element
 */
private static MessageType readParquetSchema(List<SchemaElement> schema) {
  Iterator<SchemaElement> elements = schema.iterator();
  SchemaElement root = elements.next();

  Types.MessageTypeBuilder messageBuilder = Types.buildMessage();
  int childCount = root.getNum_children();
  readTypeSchema(messageBuilder, elements, childCount);

  return messageBuilder.named(root.name);
}
/**
 * Parses the textual form of a message schema into a {@link MessageType}.
 * <p>
 * The input is tokenized on spaces, semicolons, braces, parentheses, newlines and tabs. The first
 * token is passed to {@code check} together with the expected keyword {@code message}, the second
 * token is taken as the message name, and the remaining tokens are handed to
 * {@code addGroupTypeFields} to populate the builder.
 *
 * @param schemaString the schema text
 * @return the message type named by the second token
 */
private static MessageType parse(String schemaString) {
  Tokenizer tokens = new Tokenizer(schemaString, " ;{}()\n\t");
  Types.MessageTypeBuilder messageBuilder = Types.buildMessage();

  String keyword = tokens.nextToken();
  check(keyword, "message", "start with 'message'", tokens);

  String messageName = tokens.nextToken();
  String firstBodyToken = tokens.nextToken();
  addGroupTypeFields(firstBodyToken, tokens, messageBuilder);

  return messageBuilder.named(messageName);
}
/**
 * Creates a builder whose result type is {@link MessageType} and whose repetition is set to
 * {@link Type.Repetition#REQUIRED}.
 */
private MessageTypeBuilder() {
  super(MessageType.class);
  this.repetition(Type.Repetition.REQUIRED);
}
public static MessageType addFallbackIds(MessageType fileSchema) { MessageTypeBuilder builder = org.apache.parquet.schema.Types.buildMessage(); int ordinal = 1; // ids are assigned starting at 1 for (Type type : fileSchema.getFields()) { builder.addField(type.withId(ordinal)); ordinal += 1; } return builder.named(fileSchema.getName()); } }