@Override
public InternalRow get() {
  // Builds one output row for the projected schema; only columns named "i"
  // and "j" are populated, any other projected column is left null.
  Object[] values = new Object[requiredSchema.size()];
  for (int i = 0; i < values.length; i++) {
    // Hoisted so requiredSchema.apply(i) is resolved once per column instead
    // of twice (the original repeated the lookup in each branch condition).
    String fieldName = requiredSchema.apply(i).name();
    if ("i".equals(fieldName)) {
      values[i] = start;   // column "i" carries the current value of `start`
    } else if ("j".equals(fieldName)) {
      values[i] = -start;  // column "j" is defined as the negation of "i"
    }
  }
  return new GenericInternalRow(values);
}
@Override
public InternalRow get() {
  // Materialize the next row: fill in the projected "i" / "j" columns,
  // leaving every other projected column as null.
  final int numFields = requiredSchema.size();
  final Object[] row = new Object[numFields];
  for (int idx = 0; idx < numFields; idx++) {
    final String colName = requiredSchema.apply(idx).name();
    if ("i".equals(colName)) {
      row[idx] = start;
    } else if ("j".equals(colName)) {
      row[idx] = -start;
    }
  }
  return new GenericInternalRow(row);
}
void validateDataFrameWithBeans(Bean bean, Dataset<Row> df) { StructType schema = df.schema(); Assert.assertEquals(new StructField("a", DoubleType$.MODULE$, false, Metadata.empty()), schema.apply("a")); Assert.assertEquals( new StructField("b", new ArrayType(IntegerType$.MODULE$, true), true, Metadata.empty()), schema.apply("b")); ArrayType valueType = new ArrayType(DataTypes.IntegerType, false); MapType mapType = new MapType(DataTypes.StringType, valueType, true); Assert.assertEquals( new StructField("c", mapType, true, Metadata.empty()), schema.apply("c")); Assert.assertEquals( new StructField("d", new ArrayType(DataTypes.StringType, true), true, Metadata.empty()), schema.apply("d")); Assert.assertEquals(new StructField("e", DataTypes.createDecimalType(38,0), true, Metadata.empty()), schema.apply("e")); Row first = df.select("a", "b", "c", "d", "e").first(); Assert.assertEquals(bean.getA(), first.getDouble(0), 0.0);
void validateDataFrameWithBeans(Bean bean, Dataset<Row> df) { StructType schema = df.schema(); Assert.assertEquals(new StructField("a", DoubleType$.MODULE$, false, Metadata.empty()), schema.apply("a")); Assert.assertEquals( new StructField("b", new ArrayType(IntegerType$.MODULE$, true), true, Metadata.empty()), schema.apply("b")); ArrayType valueType = new ArrayType(DataTypes.IntegerType, false); MapType mapType = new MapType(DataTypes.StringType, valueType, true); Assert.assertEquals( new StructField("c", mapType, true, Metadata.empty()), schema.apply("c")); Assert.assertEquals( new StructField("d", new ArrayType(DataTypes.StringType, true), true, Metadata.empty()), schema.apply("d")); Assert.assertEquals(new StructField("e", DataTypes.createDecimalType(38,0), true, Metadata.empty()), schema.apply("e")); Row first = df.select("a", "b", "c", "d", "e").first(); Assert.assertEquals(bean.getA(), first.getDouble(0), 0.0);
void validateDataFrameWithBeans(Bean bean, Dataset<Row> df) { StructType schema = df.schema(); Assert.assertEquals(new StructField("a", DoubleType$.MODULE$, false, Metadata.empty()), schema.apply("a")); Assert.assertEquals( new StructField("b", new ArrayType(IntegerType$.MODULE$, true), true, Metadata.empty()), schema.apply("b")); ArrayType valueType = new ArrayType(DataTypes.IntegerType, false); MapType mapType = new MapType(DataTypes.StringType, valueType, true); Assert.assertEquals( new StructField("c", mapType, true, Metadata.empty()), schema.apply("c")); Assert.assertEquals( new StructField("d", new ArrayType(DataTypes.StringType, true), true, Metadata.empty()), schema.apply("d")); Assert.assertEquals(new StructField("e", DataTypes.createDecimalType(38,0), true, Metadata.empty()), schema.apply("e")); Row first = df.select("a", "b", "c", "d", "e").first(); Assert.assertEquals(bean.getA(), first.getDouble(0), 0.0);
/**
 * Returns the nested schema of the configured struct column.
 */
private StructType getStructSchema(StructType schema) {
  // Resolve the struct column by name, then unwrap its nested StructType.
  final StructField nestedField = schema.apply(getStructCol());
  return (StructType) nestedField.dataType();
}
() -> new KVComparator(ordering, keySchema.length()); boolean canUseRadixSort = keySchema.length() == 1 && SortPrefixUtils.canSortFullyWithPrefix(keySchema.apply(0));
final ArrayList<Object> records = new ArrayList<>(); if (paths[0].contains(".")) { // it is a nested structure recusivelyGenerateSparkObjects(parquetObjects, sparkSchema.apply(column).dataType(), records); } else { for (final String path : paths) { recusivelyGenerateSparkObjects(parquetObjects, sparkSchema.apply(path).dataType(), records);
() -> new KVComparator(ordering, keySchema.length()); boolean canUseRadixSort = keySchema.length() == 1 && SortPrefixUtils.canSortFullyWithPrefix(keySchema.apply(0));
private void recusivelyGenerateSparkObjects(final Iterator<Object> parquetObjects, final DataType fieldType, final ArrayList<Object> recordBuilder) throws SerialisationException { if (fieldType instanceof StructType) { final ArrayList<Object> nestedRecordBuilder = new ArrayList<>(); for (final String field : ((StructType) fieldType).fieldNames()) { final DataType innerDataType = ((StructType) fieldType).apply(field).dataType(); recusivelyGenerateSparkObjects(parquetObjects, innerDataType, nestedRecordBuilder); } final Object[] rowObjects = new Object[nestedRecordBuilder.size()]; nestedRecordBuilder.toArray(rowObjects); recordBuilder.add(new GenericRowWithSchema(rowObjects, (StructType) fieldType)); } else { // must be a primitive type final Object parquetObject = parquetObjects.next(); if (parquetObject instanceof Map) { recordBuilder.add(scala.collection.JavaConversions.mapAsScalaMap((Map<Object, Object>) parquetObject)); } else { recordBuilder.add(parquetObject); } } }
KVComparator recordComparator = new KVComparator(ordering, keySchema.length()); boolean canUseRadixSort = keySchema.length() == 1 && SortPrefixUtils.canSortFullyWithPrefix(keySchema.apply(0));
/**
 * Converts a SimpleFeature's attributes into a Spark Row matching {@code schema}.
 * Null attributes stay null; timestamp columns are converted from
 * {@link Date} to {@link Timestamp}; everything else is passed through as-is.
 */
@Override
public Row call(SimpleFeature feature) throws Exception {
  // BUG FIX: the array was declared Object[] but allocated as Serializable[];
  // storing a non-Serializable attribute (e.g. some geometry implementations)
  // would have thrown ArrayStoreException at runtime. Allocate Object[].
  Object[] fields = new Object[schema.size()];
  for (int i = 0; i < schema.size(); i++) {
    Object fieldObj = feature.getAttribute(i);
    if (fieldObj != null) {
      StructField structField = schema.apply(i);
      if (structField.name().equals("geom")) {
        // Geometry column is passed through untouched.
        fields[i] = fieldObj;
      } else if (structField.dataType() == DataTypes.TimestampType) {
        // Spark rows expect java.sql.Timestamp; the feature stores java.util.Date.
        fields[i] = new Timestamp(((Date) fieldObj).getTime());
      } else if (structField.dataType() != null) {
        fields[i] = fieldObj;
      } else {
        // NOTE(review): StructField.dataType() should never be null, so this
        // branch looks unreachable in practice — confirm before removing.
        LOGGER.error("Unexpected attribute in field(" + structField.name() + "): " + fieldObj);
      }
    }
  }
  return new GenericRowWithSchema(fields, schema);
}
}
@Override public OneHotEncoderModelInfo getModelInfo(final OneHotEncoder from, DataFrame df) { OneHotEncoderModelInfo modelInfo = new OneHotEncoderModelInfo(); String inputColumn = from.getInputCol(); //Ugly but the only way to deal with spark here int numTypes = -1; Attribute attribute = Attribute.fromStructField(df.schema().apply(inputColumn)); if (attribute.attrType() == AttributeType.Nominal()) { numTypes = ((NominalAttribute) Attribute.fromStructField(df.schema().apply(inputColumn))).values().get().length; } else if (attribute.attrType() == AttributeType.Binary()) { numTypes = ((BinaryAttribute) Attribute.fromStructField(df.schema().apply(inputColumn))).values().get().length; } //TODO: Since dropLast is not accesible here, We are deliberately setting numTypes. This is the reason, we should use CustomOneHotEncoder modelInfo.setNumTypes(numTypes - 1); Set<String> inputKeys = new LinkedHashSet<String>(); inputKeys.add(from.getInputCol()); modelInfo.setInputKeys(inputKeys); Set<String> outputKeys = new LinkedHashSet<String>(); outputKeys.add(from.getOutputCol()); modelInfo.setOutputKeys(outputKeys); return modelInfo; }
public static RowBasedKeyValueBatch allocate(StructType keySchema, StructType valueSchema, TaskMemoryManager manager, int maxRows) { boolean allFixedLength = true; // checking if there is any variable length fields // there is probably a more succinct impl of this for (String name : keySchema.fieldNames()) { allFixedLength = allFixedLength && UnsafeRow.isFixedLength(keySchema.apply(name).dataType()); } for (String name : valueSchema.fieldNames()) { allFixedLength = allFixedLength && UnsafeRow.isFixedLength(valueSchema.apply(name).dataType()); } if (allFixedLength) { return new FixedLengthRowBasedKeyValueBatch(keySchema, valueSchema, maxRows, manager); } else { return new VariableLengthRowBasedKeyValueBatch(keySchema, valueSchema, maxRows, manager); } }
public static RowBasedKeyValueBatch allocate(StructType keySchema, StructType valueSchema, TaskMemoryManager manager, int maxRows) { boolean allFixedLength = true; // checking if there is any variable length fields // there is probably a more succinct impl of this for (String name : keySchema.fieldNames()) { allFixedLength = allFixedLength && UnsafeRow.isFixedLength(keySchema.apply(name).dataType()); } for (String name : valueSchema.fieldNames()) { allFixedLength = allFixedLength && UnsafeRow.isFixedLength(valueSchema.apply(name).dataType()); } if (allFixedLength) { return new FixedLengthRowBasedKeyValueBatch(keySchema, valueSchema, maxRows, manager); } else { return new VariableLengthRowBasedKeyValueBatch(keySchema, valueSchema, maxRows, manager); } }
/**
 * Allocates a RowBasedKeyValueBatch using the given memory manager.
 * Picks the fixed-length implementation only when every field of both the
 * key and value schemas has a fixed-length UnsafeRow encoding; any
 * variable-length field forces the variable-length implementation.
 */
public static RowBasedKeyValueBatch allocate(StructType keySchema, StructType valueSchema,
    TaskMemoryManager manager, int maxRows) {
  boolean allFixedLength = true;
  // checking if there is any variable length fields
  // there is probably a more succinct impl of this
  for (String name : keySchema.fieldNames()) {
    allFixedLength = allFixedLength && UnsafeRow.isFixedLength(keySchema.apply(name).dataType());
  }
  for (String name : valueSchema.fieldNames()) {
    allFixedLength = allFixedLength && UnsafeRow.isFixedLength(valueSchema.apply(name).dataType());
  }
  if (allFixedLength) {
    // Every field is fixed-width, so each row occupies a constant byte size.
    return new FixedLengthRowBasedKeyValueBatch(keySchema, valueSchema, maxRows, manager);
  } else {
    return new VariableLengthRowBasedKeyValueBatch(keySchema, valueSchema, maxRows, manager);
  }
}
/**
 * Verifies the dataset's row count against the expected value, which comes
 * either from a configured literal or from a single-cell long step dependency.
 */
@Override
public Dataset<Row> check(Dataset<Row> dataset, Map<String, Dataset<Row>> stepDependencies) {
  if (isDependency()) {
    // The expected count is supplied by another step: it must be exactly one
    // row with exactly one long-typed column.
    Dataset<Row> dep = stepDependencies.get(dependency);
    boolean singleLongCell = dep.count() == 1
        && dep.schema().fields().length == 1
        && dep.schema().apply(0).dataType() == DataTypes.LongType;
    if (!singleLongCell) {
      throw new RuntimeException("Step dependency for count rule must have one row with a single field of long type");
    }
    expected = dep.collectAsList().get(0).getLong(0);
  }
  if (expected < 0) {
    throw new RuntimeException("Failed to determine expected count: must be specified either as literal or step dependency");
  }
  return dataset.groupBy().count().map(new CheckCount(expected, name), RowEncoder.apply(SCHEMA));
}
/**
 * Extracts and converts the Protobuf fields of a Message into row values.
 * <p>
 * Fields are visited in the order declared by the {@code Descriptor}. A field
 * that is present on the message (set for singular fields, non-empty for
 * repeated fields) is converted via
 * {@link #getFieldValue(Descriptors.FieldDescriptor, Object, DataType)};
 * an absent field contributes {@code null}.
 *
 * @param dsc    the Protobuf Descriptor listing all fields
 * @param msg    the Message holding the current field values
 * @param schema the Dataset schema derived from the Descriptor
 * @return the converted values, one entry per descriptor field
 */
public static List<Object> buildRowValues(Descriptors.Descriptor dsc, Message msg, StructType schema) {
  List<Object> values = new ArrayList<>();
  for (Descriptors.FieldDescriptor fd : dsc.getFields()) {
    // Presence differs by cardinality: repeated fields are "present" when
    // non-empty, singular fields when explicitly set.
    boolean present = fd.isRepeated()
        ? msg.getRepeatedFieldCount(fd) > 0
        : msg.hasField(fd);
    if (present) {
      values.add(getFieldValue(fd, msg.getField(fd), schema.apply(fd.getName()).dataType()));
    } else {
      LOG.trace("FieldDescriptor[{}] => not found", fd.getFullName());
      values.add(null);
    }
  }
  return values;
}
/**
 * Creates a PMML DataField for the named column by mapping its Spark SQL
 * type: strings and booleans become categorical, integral and double types
 * become continuous. Any other Spark type is rejected.
 */
public DataField createDataField(FieldName name) {
  StructField field = getSchema().apply(name.getValue());
  org.apache.spark.sql.types.DataType sparkDataType = field.dataType();
  if (sparkDataType instanceof StringType) {
    return createDataField(name, OpType.CATEGORICAL, DataType.STRING);
  }
  if (sparkDataType instanceof IntegralType) {
    return createDataField(name, OpType.CONTINUOUS, DataType.INTEGER);
  }
  if (sparkDataType instanceof DoubleType) {
    return createDataField(name, OpType.CONTINUOUS, DataType.DOUBLE);
  }
  if (sparkDataType instanceof BooleanType) {
    return createDataField(name, OpType.CATEGORICAL, DataType.BOOLEAN);
  }
  throw new IllegalArgumentException("Expected string, integral, double or boolean type, got " + sparkDataType.typeName() + " type");
}
final StructField field = schema.apply(i); final Object rowObj = row.apply(i); if (rowObj != null) {