// Supplier entry point: dispatches the captured visitor over the captured type.
// NOTE(review): `type`, `visitor`, and `visit` are declared outside this view; the
// trailing brace closes the enclosing class — confirm against the full file.
@Override public T get() { return visit(type, visitor); } }
/**
 * Creates a projection schema containing every column of the schema except the given field ids.
 *
 * @param schema a Schema to project
 * @param fieldIds a Set of field ids to exclude from the projection
 * @return a projection of schema without the columns identified by fieldIds
 */
public static Schema selectNot(Schema schema, Set<Integer> fieldIds) {
  // fix: copy before mutating — getProjectedIds may return an unmodifiable or shared set,
  // in which case removeAll would throw UnsupportedOperationException or corrupt shared state
  Set<Integer> projectedIds = new java.util.HashSet<>(getProjectedIds(schema));
  projectedIds.removeAll(fieldIds);
  return select(schema, projectedIds);
}
// Writer for decimal values backed by a fixed-length byte array column.
// `length` is the byte width derived from the decimal precision; the per-thread
// scratch buffer is reused across writes so no array is allocated per value.
// NOTE(review): `decimalRequriedBytes` is the (misspelled) name of the external
// TypeUtil helper — do not "fix" the call without renaming the helper too.
private FixedDecimalWriter(ColumnDescriptor desc, int precision, int scale) { super(desc); this.precision = precision; this.scale = scale; this.length = TypeUtil.decimalRequriedBytes(precision); this.bytes = ThreadLocal.withInitial(() -> new byte[length]); }
// Split the required schema into file data columns (readSchema) and identity
// partition columns (partitionSchema), then join each read row with its partition
// values: joined rows carry the file data on the left.
// NOTE(review): fragment of a larger method — iterSchema/iter appear to be
// enclosing-scope fields, and convertToRow is presumably used below this view; confirm.
Schema readSchema = TypeUtil.selectNot(requiredSchema, idColumns); Schema partitionSchema = TypeUtil.select(requiredSchema, idColumns); PartitionRowConverter convertToRow = new PartitionRowConverter(partitionSchema, spec); JoinedRow joined = new JoinedRow(); iterSchema = TypeUtil.join(readSchema, partitionSchema); iter = transform(open(task, readSchema, conf), joined::withLeft);
public static MessageType pruneColumns(MessageType fileSchema, Schema expectedSchema) { // column order must match the incoming type, so it doesn't matter that the ids are unordered Set<Integer> selectedIds = TypeUtil.getProjectedIds(expectedSchema); return (MessageType) ParquetTypeVisitor.visit(fileSchema, new PruneColumns(selectedIds)); }
/**
 * Creates a projection schema for a subset of columns, selected by name.
 * <p>
 * Names that identify nested fields will select part or all of the field's top-level column.
 *
 * @param names a List of String names for selected columns
 * @return a projection schema from this schema, by name
 */
public Schema select(Collection<String> names) {
  // the wildcard short-circuits: the full schema is its own projection
  if (names.contains(ALL_COLUMNS)) {
    return this;
  }

  Set<Integer> selectedIds = Sets.newHashSet();
  for (String name : names) {
    Integer fieldId = lazyNameToId().get(name);
    if (fieldId == null) {
      continue; // unknown names are silently ignored
    }
    selectedIds.add(fieldId);
  }

  return TypeUtil.select(this, selectedIds);
}
// Compatibility check for reading the current write type as the given primitive.
// Returns NO_ERRORS when the types are identical or the promotion is allowed;
// otherwise returns a single formatted error message. The leading ": " in each
// message is presumably joined with a field path by the caller — confirm.
// The trailing brace closes the enclosing visitor class.
@Override public List<String> primitive(Type.PrimitiveType readPrimitive) { if (currentType.equals(readPrimitive)) { return NO_ERRORS; } if (!currentType.isPrimitiveType()) { return ImmutableList.of(String.format(": %s cannot be read as a %s", currentType.typeId().toString().toLowerCase(Locale.ENGLISH), readPrimitive)); } if (!isPromotionAllowed(currentType.asPrimitiveType(), readPrimitive)) { return ImmutableList.of(String.format(": %s cannot be promoted to %s", currentType, readPrimitive)); } // both are primitives and promotion is allowed to the read type return NO_ERRORS; } }
/** Reassigns every column id in the schema from a fresh counter starting at 1. */
private static Schema assignFreshIds(Schema schema) {
  AtomicInteger nextColumnId = new AtomicInteger(0);
  return TypeUtil.assignFreshIds(schema, nextColumnId::incrementAndGet);
}
}
/**
 * Initializes this reader for a new file schema: prunes the file schema to the
 * projected field ids, builds an Avro read schema matching the expected schema
 * (applying any pending field renames), and recreates the wrapped datum reader.
 */
@Override
public void setSchema(Schema fileSchema) {
  this.fileSchema = fileSchema;

  Set<Integer> ids = getProjectedIds(expectedSchema);
  Schema pruned = AvroSchemaUtil.pruneColumns(fileSchema, ids);
  this.readSchema = AvroSchemaUtil.buildAvroProjection(pruned, expectedSchema, renames);

  this.wrapped = newDatumReader();
}
@Override public UpdateSchema updateColumn(String name, Type.PrimitiveType newType) { Types.NestedField field = schema.findField(name); Preconditions.checkArgument(field != null, "Cannot update missing column: %s", name); Preconditions.checkArgument(!deletes.contains(field.fieldId()), "Cannot update a column that will be deleted: %s", field.name()); Preconditions.checkArgument(TypeUtil.isPromotionAllowed(field.type(), newType), "Cannot change column type: %s: %s -> %s", name, field.type(), newType); // merge with a rename, if present int fieldId = field.fieldId(); Types.NestedField rename = updates.get(fieldId); if (rename != null) { updates.put(fieldId, Types.NestedField.required(fieldId, rename.name(), newType)); } else { updates.put(fieldId, Types.NestedField.required(fieldId, field.name(), newType)); } return this; }
// Normalizes a schema by assigning new, sequential field ids starting at 1, so
// schemas built independently can be compared structurally. The trailing brace
// closes the enclosing class.
private static Schema assignFreshIds(Schema schema) { AtomicInteger lastColumnId = new AtomicInteger(0); return TypeUtil.assignFreshIds(schema, lastColumnId::incrementAndGet); } }
/**
 * Convert a {@link Type} to a {@link DataType Spark type}.
 *
 * @param type a Type
 * @return the equivalent Spark type
 * @throws IllegalArgumentException if the type cannot be converted to Spark
 */
public static DataType convert(Type type) {
  TypeToSparkType converter = new TypeToSparkType();
  return visit(type, converter);
}
@Override public TableScan select(Collection<String> columns) { Set<Integer> requiredFieldIds = Sets.newHashSet(); // all of the filter columns are required requiredFieldIds.addAll( Binder.boundReferences(table.schema().asStruct(), Collections.singletonList(rowFilter))); // all of the projection columns are required requiredFieldIds.addAll(TypeUtil.getProjectedIds(table.schema().select(columns))); Schema projection = TypeUtil.select(table.schema(), requiredFieldIds); return new BaseTableScan(ops, table, snapshotId, projection, rowFilter); }
// Writer for decimal values. `length` is the byte width implied by the precision
// (via the external TypeUtil helper, whose name carries a historical typo). The
// per-thread scratch buffer is reused across writes to avoid per-value allocation.
private DecimalWriter(int precision, int scale) { this.precision = precision; this.scale = scale; this.length = TypeUtil.decimalRequriedBytes(precision); this.bytes = ThreadLocal.withInitial(() -> new byte[length]); }
new Schema(required(1, "to_field", to)), fromSchema); if (TypeUtil.isPromotionAllowed(from, to)) { Assert.assertEquals("Should produce 0 error messages", 0, errors.size()); } else {
// Builds replacement metadata for this table: all column ids in the incoming schema
// are reassigned from a fresh counter so the new schema's ids are self-consistent.
// NOTE(review): the method body continues past this view; the remainder is not shown.
public TableMetadata buildReplacement(Schema schema, PartitionSpec partitionSpec, Map<String, String> properties) { AtomicInteger lastColumnId = new AtomicInteger(0); Schema freshSchema = TypeUtil.assignFreshIds(schema, lastColumnId::incrementAndGet);
/**
 * Assigns fresh ids from the {@link NextID nextId function} for all fields in a type.
 *
 * @param type a type
 * @param nextId an id assignment function
 * @return a structurally identical type with new ids assigned by the nextId function
 */
public static Type assignFreshIds(Type type, NextID nextId) { return TypeUtil.visit(type, new AssignFreshIds(nextId)); }
@Test public void testDeleteFields() { // use schema projection to test column deletes Set<Integer> ALL_IDS = ImmutableSet.copyOf(TypeUtil.getProjectedIds(SCHEMA)); List<String> columns = Lists.newArrayList("id", "data", "preferences", "preferences.feature1", "preferences.feature2", "locations", "locations.lat", "locations.long", "points", "points.x", "points.y", "doubles", "properties"); for (String name : columns) { Set<Integer> selected = Sets.newHashSet(ALL_IDS); // remove the id and any nested fields from the projection Types.NestedField nested = SCHEMA.findField(name); selected.remove(nested.fieldId()); selected.removeAll(TypeUtil.getProjectedIds(nested.type())); Schema del = new SchemaUpdate(SCHEMA, 19).deleteColumn(name).apply(); Assert.assertEquals("Should match projection with '" + name + "' removed", TypeUtil.select(SCHEMA, selected).asStruct(), del.asStruct()); } }
// Writer for decimal values stored at a fixed byte width derived from the precision.
// A ThreadLocal scratch buffer keeps writes allocation-free and safe across threads.
// NOTE(review): `decimalRequriedBytes` mirrors the external helper's misspelled name.
private DecimalWriter(int precision, int scale) { this.precision = precision; this.scale = scale; this.length = TypeUtil.decimalRequriedBytes(precision); this.bytes = ThreadLocal.withInitial(() -> new byte[length]); }
public static TableMetadata newTableMetadata(TableOperations ops, Schema schema, PartitionSpec spec, String location, Map<String, String> properties) { // reassign all column ids to ensure consistency AtomicInteger lastColumnId = new AtomicInteger(0); Schema freshSchema = TypeUtil.assignFreshIds(schema, lastColumnId::incrementAndGet); // rebuild the partition spec using the new column ids PartitionSpec.Builder specBuilder = PartitionSpec.builderFor(freshSchema) .withSpecId(INITIAL_SPEC_ID); for (PartitionField field : spec.fields()) { // look up the name of the source field in the old schema to get the new schema's id String sourceName = schema.findColumnName(field.sourceId()); specBuilder.add( freshSchema.findField(sourceName).fieldId(), field.name(), field.transform().toString()); } PartitionSpec freshSpec = specBuilder.build(); return new TableMetadata(ops, null, location, System.currentTimeMillis(), lastColumnId.get(), freshSchema, INITIAL_SPEC_ID, ImmutableList.of(freshSpec), ImmutableMap.copyOf(properties), -1, ImmutableList.of(), ImmutableList.of()); }