@Override
public InternalRow get() {
  // Builds one output row for the projected schema; only columns named "i"
  // and "j" are populated, any other projected column is left null.
  Object[] values = new Object[requiredSchema.size()];
  for (int i = 0; i < values.length; i++) {
    // Hoisted so requiredSchema.apply(i) is resolved once per column instead
    // of twice (the original repeated the lookup in each branch condition).
    String fieldName = requiredSchema.apply(i).name();
    if ("i".equals(fieldName)) {
      values[i] = start;   // column "i" carries the current value of `start`
    } else if ("j".equals(fieldName)) {
      values[i] = -start;  // column "j" is defined as the negation of "i"
    }
  }
  return new GenericInternalRow(values);
}
@Override
public InternalRow get() {
  // Materialize the next row: fill in the projected "i" / "j" columns,
  // leaving every other projected column as null.
  final int numFields = requiredSchema.size();
  final Object[] row = new Object[numFields];
  for (int idx = 0; idx < numFields; idx++) {
    final String colName = requiredSchema.apply(idx).name();
    if ("i".equals(colName)) {
      row[idx] = start;
    } else if ("j".equals(colName)) {
      row[idx] = -start;
    }
  }
  return new GenericInternalRow(row);
}
void validateDataFrameWithBeans(Bean bean, Dataset<Row> df) { StructType schema = df.schema(); Assert.assertEquals(new StructField("a", DoubleType$.MODULE$, false, Metadata.empty()), schema.apply("a")); Assert.assertEquals( new StructField("b", new ArrayType(IntegerType$.MODULE$, true), true, Metadata.empty()), schema.apply("b")); ArrayType valueType = new ArrayType(DataTypes.IntegerType, false); MapType mapType = new MapType(DataTypes.StringType, valueType, true); Assert.assertEquals( new StructField("c", mapType, true, Metadata.empty()), schema.apply("c")); Assert.assertEquals( new StructField("d", new ArrayType(DataTypes.StringType, true), true, Metadata.empty()), schema.apply("d")); Assert.assertEquals(new StructField("e", DataTypes.createDecimalType(38,0), true, Metadata.empty()), schema.apply("e")); Row first = df.select("a", "b", "c", "d", "e").first(); Assert.assertEquals(bean.getA(), first.getDouble(0), 0.0);
void validateDataFrameWithBeans(Bean bean, Dataset<Row> df) { StructType schema = df.schema(); Assert.assertEquals(new StructField("a", DoubleType$.MODULE$, false, Metadata.empty()), schema.apply("a")); Assert.assertEquals( new StructField("b", new ArrayType(IntegerType$.MODULE$, true), true, Metadata.empty()), schema.apply("b")); ArrayType valueType = new ArrayType(DataTypes.IntegerType, false); MapType mapType = new MapType(DataTypes.StringType, valueType, true); Assert.assertEquals( new StructField("c", mapType, true, Metadata.empty()), schema.apply("c")); Assert.assertEquals( new StructField("d", new ArrayType(DataTypes.StringType, true), true, Metadata.empty()), schema.apply("d")); Assert.assertEquals(new StructField("e", DataTypes.createDecimalType(38,0), true, Metadata.empty()), schema.apply("e")); Row first = df.select("a", "b", "c", "d", "e").first(); Assert.assertEquals(bean.getA(), first.getDouble(0), 0.0);
void validateDataFrameWithBeans(Bean bean, Dataset<Row> df) { StructType schema = df.schema(); Assert.assertEquals(new StructField("a", DoubleType$.MODULE$, false, Metadata.empty()), schema.apply("a")); Assert.assertEquals( new StructField("b", new ArrayType(IntegerType$.MODULE$, true), true, Metadata.empty()), schema.apply("b")); ArrayType valueType = new ArrayType(DataTypes.IntegerType, false); MapType mapType = new MapType(DataTypes.StringType, valueType, true); Assert.assertEquals( new StructField("c", mapType, true, Metadata.empty()), schema.apply("c")); Assert.assertEquals( new StructField("d", new ArrayType(DataTypes.StringType, true), true, Metadata.empty()), schema.apply("d")); Assert.assertEquals(new StructField("e", DataTypes.createDecimalType(38,0), true, Metadata.empty()), schema.apply("e")); Row first = df.select("a", "b", "c", "d", "e").first(); Assert.assertEquals(bean.getA(), first.getDouble(0), 0.0);
/**
 * Returns the nested schema of the configured struct column.
 */
private StructType getStructSchema(StructType schema) {
  // Resolve the struct column by name, then unwrap its nested StructType.
  final StructField nestedField = schema.apply(getStructCol());
  return (StructType) nestedField.dataType();
}
() -> new KVComparator(ordering, keySchema.length()); boolean canUseRadixSort = keySchema.length() == 1 && SortPrefixUtils.canSortFullyWithPrefix(keySchema.apply(0));
final ArrayList<Object> records = new ArrayList<>(); if (paths[0].contains(".")) { // it is a nested structure recusivelyGenerateSparkObjects(parquetObjects, sparkSchema.apply(column).dataType(), records); } else { for (final String path : paths) { recusivelyGenerateSparkObjects(parquetObjects, sparkSchema.apply(path).dataType(), records);
() -> new KVComparator(ordering, keySchema.length()); boolean canUseRadixSort = keySchema.length() == 1 && SortPrefixUtils.canSortFullyWithPrefix(keySchema.apply(0));
private void recusivelyGenerateSparkObjects(final Iterator<Object> parquetObjects, final DataType fieldType, final ArrayList<Object> recordBuilder) throws SerialisationException { if (fieldType instanceof StructType) { final ArrayList<Object> nestedRecordBuilder = new ArrayList<>(); for (final String field : ((StructType) fieldType).fieldNames()) { final DataType innerDataType = ((StructType) fieldType).apply(field).dataType(); recusivelyGenerateSparkObjects(parquetObjects, innerDataType, nestedRecordBuilder); } final Object[] rowObjects = new Object[nestedRecordBuilder.size()]; nestedRecordBuilder.toArray(rowObjects); recordBuilder.add(new GenericRowWithSchema(rowObjects, (StructType) fieldType)); } else { // must be a primitive type final Object parquetObject = parquetObjects.next(); if (parquetObject instanceof Map) { recordBuilder.add(scala.collection.JavaConversions.mapAsScalaMap((Map<Object, Object>) parquetObject)); } else { recordBuilder.add(parquetObject); } } }
KVComparator recordComparator = new KVComparator(ordering, keySchema.length()); boolean canUseRadixSort = keySchema.length() == 1 && SortPrefixUtils.canSortFullyWithPrefix(keySchema.apply(0));
/**
 * Converts a SimpleFeature's attributes into a Spark Row matching {@code schema}.
 * Null attributes stay null; timestamp columns are converted from
 * {@link Date} to {@link Timestamp}; everything else is passed through as-is.
 */
@Override
public Row call(SimpleFeature feature) throws Exception {
  // BUG FIX: the array was declared Object[] but allocated as Serializable[];
  // storing a non-Serializable attribute (e.g. some geometry implementations)
  // would have thrown ArrayStoreException at runtime. Allocate Object[].
  Object[] fields = new Object[schema.size()];
  for (int i = 0; i < schema.size(); i++) {
    Object fieldObj = feature.getAttribute(i);
    if (fieldObj != null) {
      StructField structField = schema.apply(i);
      if (structField.name().equals("geom")) {
        // Geometry column is passed through untouched.
        fields[i] = fieldObj;
      } else if (structField.dataType() == DataTypes.TimestampType) {
        // Spark rows expect java.sql.Timestamp; the feature stores java.util.Date.
        fields[i] = new Timestamp(((Date) fieldObj).getTime());
      } else if (structField.dataType() != null) {
        fields[i] = fieldObj;
      } else {
        // NOTE(review): StructField.dataType() should never be null, so this
        // branch looks unreachable in practice — confirm before removing.
        LOGGER.error("Unexpected attribute in field(" + structField.name() + "): " + fieldObj);
      }
    }
  }
  return new GenericRowWithSchema(fields, schema);
}
}
@Override public OneHotEncoderModelInfo getModelInfo(final OneHotEncoder from, DataFrame df) { OneHotEncoderModelInfo modelInfo = new OneHotEncoderModelInfo(); String inputColumn = from.getInputCol(); //Ugly but the only way to deal with spark here int numTypes = -1; Attribute attribute = Attribute.fromStructField(df.schema().apply(inputColumn)); if (attribute.attrType() == AttributeType.Nominal()) { numTypes = ((NominalAttribute) Attribute.fromStructField(df.schema().apply(inputColumn))).values().get().length; } else if (attribute.attrType() == AttributeType.Binary()) { numTypes = ((BinaryAttribute) Attribute.fromStructField(df.schema().apply(inputColumn))).values().get().length; } //TODO: Since dropLast is not accesible here, We are deliberately setting numTypes. This is the reason, we should use CustomOneHotEncoder modelInfo.setNumTypes(numTypes - 1); Set<String> inputKeys = new LinkedHashSet<String>(); inputKeys.add(from.getInputCol()); modelInfo.setInputKeys(inputKeys); Set<String> outputKeys = new LinkedHashSet<String>(); outputKeys.add(from.getOutputCol()); modelInfo.setOutputKeys(outputKeys); return modelInfo; }
public static RowBasedKeyValueBatch allocate(StructType keySchema, StructType valueSchema, TaskMemoryManager manager, int maxRows) { boolean allFixedLength = true; // checking if there is any variable length fields // there is probably a more succinct impl of this for (String name : keySchema.fieldNames()) { allFixedLength = allFixedLength && UnsafeRow.isFixedLength(keySchema.apply(name).dataType()); } for (String name : valueSchema.fieldNames()) { allFixedLength = allFixedLength && UnsafeRow.isFixedLength(valueSchema.apply(name).dataType()); } if (allFixedLength) { return new FixedLengthRowBasedKeyValueBatch(keySchema, valueSchema, maxRows, manager); } else { return new VariableLengthRowBasedKeyValueBatch(keySchema, valueSchema, maxRows, manager); } }
public static RowBasedKeyValueBatch allocate(StructType keySchema, StructType valueSchema, TaskMemoryManager manager, int maxRows) { boolean allFixedLength = true; // checking if there is any variable length fields // there is probably a more succinct impl of this for (String name : keySchema.fieldNames()) { allFixedLength = allFixedLength && UnsafeRow.isFixedLength(keySchema.apply(name).dataType()); } for (String name : valueSchema.fieldNames()) { allFixedLength = allFixedLength && UnsafeRow.isFixedLength(valueSchema.apply(name).dataType()); } if (allFixedLength) { return new FixedLengthRowBasedKeyValueBatch(keySchema, valueSchema, maxRows, manager); } else { return new VariableLengthRowBasedKeyValueBatch(keySchema, valueSchema, maxRows, manager); } }
/**
 * Allocates a RowBasedKeyValueBatch using the given memory manager.
 * Picks the fixed-length implementation only when every field of both the
 * key and value schemas has a fixed-length UnsafeRow encoding; any
 * variable-length field forces the variable-length implementation.
 */
public static RowBasedKeyValueBatch allocate(StructType keySchema, StructType valueSchema,
    TaskMemoryManager manager, int maxRows) {
  boolean allFixedLength = true;
  // checking if there is any variable length fields
  // there is probably a more succinct impl of this
  for (String name : keySchema.fieldNames()) {
    allFixedLength = allFixedLength && UnsafeRow.isFixedLength(keySchema.apply(name).dataType());
  }
  for (String name : valueSchema.fieldNames()) {
    allFixedLength = allFixedLength && UnsafeRow.isFixedLength(valueSchema.apply(name).dataType());
  }
  if (allFixedLength) {
    // Every field is fixed-width, so each row occupies a constant byte size.
    return new FixedLengthRowBasedKeyValueBatch(keySchema, valueSchema, maxRows, manager);
  } else {
    return new VariableLengthRowBasedKeyValueBatch(keySchema, valueSchema, maxRows, manager);
  }
}
/**
 * Verifies the dataset's row count against the expected value, which comes
 * either from a configured literal or from a single-cell long step dependency.
 */
@Override
public Dataset<Row> check(Dataset<Row> dataset, Map<String, Dataset<Row>> stepDependencies) {
  if (isDependency()) {
    // The expected count is supplied by another step: it must be exactly one
    // row with exactly one long-typed column.
    Dataset<Row> dep = stepDependencies.get(dependency);
    boolean singleLongCell = dep.count() == 1
        && dep.schema().fields().length == 1
        && dep.schema().apply(0).dataType() == DataTypes.LongType;
    if (!singleLongCell) {
      throw new RuntimeException("Step dependency for count rule must have one row with a single field of long type");
    }
    expected = dep.collectAsList().get(0).getLong(0);
  }
  if (expected < 0) {
    throw new RuntimeException("Failed to determine expected count: must be specified either as literal or step dependency");
  }
  return dataset.groupBy().count().map(new CheckCount(expected, name), RowEncoder.apply(SCHEMA));
}
/**
 * Extracts and converts the Protobuf fields of a Message into row values.
 * <p>
 * Fields are visited in the order declared by the {@code Descriptor}. A field
 * that is present on the message (set for singular fields, non-empty for
 * repeated fields) is converted via
 * {@link #getFieldValue(Descriptors.FieldDescriptor, Object, DataType)};
 * an absent field contributes {@code null}.
 *
 * @param dsc    the Protobuf Descriptor listing all fields
 * @param msg    the Message holding the current field values
 * @param schema the Dataset schema derived from the Descriptor
 * @return the converted values, one entry per descriptor field
 */
public static List<Object> buildRowValues(Descriptors.Descriptor dsc, Message msg, StructType schema) {
  List<Object> values = new ArrayList<>();
  for (Descriptors.FieldDescriptor fd : dsc.getFields()) {
    // Presence differs by cardinality: repeated fields are "present" when
    // non-empty, singular fields when explicitly set.
    boolean present = fd.isRepeated()
        ? msg.getRepeatedFieldCount(fd) > 0
        : msg.hasField(fd);
    if (present) {
      values.add(getFieldValue(fd, msg.getField(fd), schema.apply(fd.getName()).dataType()));
    } else {
      LOG.trace("FieldDescriptor[{}] => not found", fd.getFullName());
      values.add(null);
    }
  }
  return values;
}
/**
 * Creates a PMML DataField for the named column by mapping its Spark SQL
 * type: strings and booleans become categorical, integral and double types
 * become continuous. Any other Spark type is rejected.
 */
public DataField createDataField(FieldName name) {
  StructField field = getSchema().apply(name.getValue());
  org.apache.spark.sql.types.DataType sparkDataType = field.dataType();
  if (sparkDataType instanceof StringType) {
    return createDataField(name, OpType.CATEGORICAL, DataType.STRING);
  }
  if (sparkDataType instanceof IntegralType) {
    return createDataField(name, OpType.CONTINUOUS, DataType.INTEGER);
  }
  if (sparkDataType instanceof DoubleType) {
    return createDataField(name, OpType.CONTINUOUS, DataType.DOUBLE);
  }
  if (sparkDataType instanceof BooleanType) {
    return createDataField(name, OpType.CATEGORICAL, DataType.BOOLEAN);
  }
  throw new IllegalArgumentException("Expected string, integral, double or boolean type, got " + sparkDataType.typeName() + " type");
}
final StructField field = schema.apply(i); final Object rowObj = row.apply(i); if (rowObj != null) {