private static JavaRDD<String[]> getOtherFormatHiveInput(JavaSparkContext sc, String hiveTable) {
  SparkSession sparkSession = SparkSession.builder().config(sc.getConf()).enableHiveSupport().getOrCreate();
  final Dataset<Row> intermediateTable = sparkSession.table(hiveTable);
  return intermediateTable.javaRDD().map(new Function<Row, String[]>() {
    @Override
    public String[] call(Row row) throws Exception {
      // Convert every column to its string representation, preserving nulls
      String[] result = new String[row.size()];
      for (int i = 0; i < row.size(); i++) {
        final Object o = row.get(i);
        result[i] = (o != null) ? o.toString() : null;
      }
      return result;
    }
  });
}
JavaRDD<String> numbers = spark.read().textFile(inputPath).javaRDD();
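A possible follow-up step, assuming each line of the input file holds a single integer (an assumption based on the variable name numbers, not stated in the snippet); with Java 8 lambdas:

// Assumption: one integer per line; invalid lines would throw NumberFormatException here.
JavaRDD<Integer> parsed = numbers.map(line -> Integer.parseInt(line.trim()));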
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
  if (!dependencies.containsKey(intoDependency)) {
    throw new RuntimeException("Nest deriver points to non-existent nest-into dependency");
  }
  Dataset<Row> into = dependencies.get(intoDependency);

  if (!dependencies.containsKey(fromDependency)) {
    throw new RuntimeException("Nest deriver points to non-existent nest-from dependency");
  }
  Dataset<Row> from = dependencies.get(fromDependency);

  ExtractFieldsFunction extractFieldsFunction = new ExtractFieldsFunction(keyFieldNames);
  JavaPairRDD<List<Object>, Row> keyedIntoRDD = into.javaRDD().keyBy(extractFieldsFunction);
  JavaPairRDD<List<Object>, Row> keyedFromRDD = from.javaRDD().keyBy(extractFieldsFunction);

  NestFunction nestFunction = new NestFunction();
  JavaRDD<Row> nestedRDD = keyedIntoRDD.cogroup(keyedFromRDD).values().map(nestFunction);

  StructType nestedSchema = into.schema().add(nestedFieldName, DataTypes.createArrayType(from.schema()));
  Dataset<Row> nested = into.sqlContext().createDataFrame(nestedRDD, nestedSchema);

  return nested;
}
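The ExtractFieldsFunction used for keyBy above is not shown in this snippet. A minimal sketch of what such a key extractor might look like, assuming the key is simply the list of values of the configured key columns (the original implementation may differ):

import java.util.ArrayList;
import java.util.List;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Row;

// Hypothetical reconstruction of ExtractFieldsFunction: builds the cogroup key
// as the list of values for the configured key field names.
public class ExtractFieldsFunction implements Function<Row, List<Object>> {
  private final List<String> fieldNames;

  public ExtractFieldsFunction(List<String> fieldNames) {
    this.fieldNames = fieldNames;
  }

  @Override
  public List<Object> call(Row row) throws Exception {
    List<Object> values = new ArrayList<>();
    for (String fieldName : fieldNames) {
      values.add(row.getAs(fieldName)); // look up each key column by name
    }
    return values;
  }
}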
private List<Object> getValuesFromStep(Set<Step> steps) {
  String stepName = config.getString(STEP_PROPERTY);

  Optional<Step> optionalStep = StepUtils.getStepForName(stepName, steps);
  if (!optionalStep.isPresent()) {
    throw new RuntimeException("Step source for loop step '" + getName() + "' does not exist.");
  }

  Step step = optionalStep.get();
  if (!(step instanceof DataStep)) {
    throw new RuntimeException("Step source for loop step '" + getName() + "' is not a data step.");
  }

  Dataset<Row> stepRows = ((DataStep) step).getData();
  if (stepRows.count() > 1000) {
    throw new RuntimeException("Step source for loop step '" + getName() + "' can not provide more than 1000 values to loop over");
  }
  if (stepRows.schema().fields().length != 1) {
    throw new RuntimeException("Step source for loop step '" + getName() + "' can only provide a single field");
  }

  List<Object> stepValues = stepRows.javaRDD().map(new FirstFieldFunction()).collect();

  return stepValues;
}
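FirstFieldFunction is referenced but not defined here. A plausible minimal version, assuming the single output column sits at index 0 of each row:

import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Row;

// Hypothetical FirstFieldFunction: returns the value of the row's only field.
public class FirstFieldFunction implements Function<Row, Object> {
  @Override
  public Object call(Row row) throws Exception {
    return row.get(0);
  }
}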
@Override
public void applyBulkMutations(List<Tuple2<MutationType, Dataset<Row>>> planned) {
  for (Tuple2<MutationType, Dataset<Row>> mutation : planned) {
    MutationType mutationType = mutation._1();
    Dataset<Row> mutationDF = mutation._2();

    if (mutationType.equals(MutationType.INSERT)) {
      mutationDF.javaRDD().foreach(new SendRowToLogFunction(delimiter, logLevel));
    }
  }
}
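SendRowToLogFunction is not shown in this snippet. A minimal sketch that joins a row's fields with the configured delimiter and writes the result to the log; the delimiter and log-level handling here are illustrative assumptions, not the original code:

import java.util.ArrayList;
import java.util.List;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.Row;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Hypothetical SendRowToLogFunction: concatenates the row's fields with the
// delimiter and logs the line (per-level dispatch omitted for brevity).
public class SendRowToLogFunction implements VoidFunction<Row> {
  private static final Logger LOG = LoggerFactory.getLogger(SendRowToLogFunction.class);
  private final String delimiter;

  public SendRowToLogFunction(String delimiter, String logLevel) {
    this.delimiter = delimiter;
  }

  @Override
  public void call(Row row) throws Exception {
    List<String> fields = new ArrayList<>();
    for (int i = 0; i < row.size(); i++) {
      fields.add(String.valueOf(row.get(i)));
    }
    LOG.info(String.join(delimiter, fields));
  }
}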
@Override
public void applyBulkMutations(List<Tuple2<MutationType, Dataset<Row>>> planned) {
  for (Tuple2<MutationType, Dataset<Row>> mutation : planned) {
    MutationType mutationType = mutation._1();
    Dataset<Row> mutationDF = mutation._2();

    if (mutationType.equals(MutationType.INSERT)) {
      mutationDF.javaRDD().foreachPartition(new SendRowToKafkaFunction(config));
    }
  }
}
@Override
public Dataset<Row> read() throws Exception {
  JavaRDD<Long> baseRDD = Contexts.getSparkSession().range(tasks).javaRDD().repartition(tasks);

  JavaRDD<Row> fixRDD = baseRDD.flatMap(new GenerateFIXMessages(ordersPerTask));

  StructType schema = DataTypes.createStructType(Lists.newArrayList(
      DataTypes.createStructField("fix", DataTypes.StringType, false)));

  Dataset<Row> fixDF = Contexts.getSparkSession().createDataFrame(fixRDD, schema);

  return fixDF;
}
@Test
public void testInputRepartitionColumnsAndPartitionCount() throws Exception {
  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put(BatchStep.REPARTITION_COLUMNS_PROPERTY, Lists.newArrayList("modulo"));
  configMap.put(BatchStep.REPARTITION_NUM_PARTITIONS_PROPERTY, 5);
  configMap.put(DataStep.INPUT_TYPE + "." + InputFactory.TYPE_CONFIG_NAME, DummyInput.class.getName());
  configMap.put(DataStep.INPUT_TYPE + "." + "starting.partitions", 10);
  Config config = ConfigFactory.parseMap(configMap);

  BatchStep batchStep = new BatchStep("test");
  batchStep.configure(config);
  batchStep.submit(Sets.<Step>newHashSet());

  Dataset<Row> df = batchStep.getData();
  int numPartitions = df.javaRDD().getNumPartitions();

  assertEquals(5, numPartitions);
}
@Test
public void testInputRepartition() throws Exception {
  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put(DataStep.INPUT_TYPE + "." + InputFactory.TYPE_CONFIG_NAME, DummyInput.class.getName());
  configMap.put(DataStep.INPUT_TYPE + "." + "starting.partitions", 5);
  configMap.put(BatchStep.REPARTITION_NUM_PARTITIONS_PROPERTY, 10);
  Config config = ConfigFactory.parseMap(configMap);

  BatchStep batchStep = new BatchStep("test");
  batchStep.configure(config);
  batchStep.submit(Sets.<Step>newHashSet());

  Dataset<Row> df = batchStep.getData();
  int numPartitions = df.javaRDD().getNumPartitions();

  // assertEquals takes the expected value first
  assertEquals(10, numPartitions);
}
@Test
public void testInputCoalesce() throws Exception {
  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put(DataStep.INPUT_TYPE + "." + InputFactory.TYPE_CONFIG_NAME, DummyInput.class.getName());
  configMap.put(DataStep.INPUT_TYPE + "." + "starting.partitions", 10);
  configMap.put(BatchStep.COALESCE_NUM_PARTITIONS_PROPERTY, 5);
  Config config = ConfigFactory.parseMap(configMap);

  BatchStep batchStep = new BatchStep("test");
  batchStep.configure(config);
  batchStep.submit(Sets.<Step>newHashSet());

  Dataset<Row> df = batchStep.getData();
  int numPartitions = df.javaRDD().getNumPartitions();

  // assertEquals takes the expected value first
  assertEquals(5, numPartitions);
}
/**
 * Create a compatible schema and RDD for DataVec
 *
 * @param dataFrame the dataframe to convert
 * @return the converted schema and RDD of writables
 */
public static Pair<Schema, JavaRDD<List<Writable>>> toRecords(DataRowsFacade dataFrame) {
  Schema schema = fromStructType(dataFrame.get().schema());
  return new Pair<>(schema, dataFrame.get().javaRDD().map(new ToRecord(schema)));
}
@Test
public void testInputRepartitionColumns() throws Exception {
  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put(DataStep.INPUT_TYPE + "." + InputFactory.TYPE_CONFIG_NAME, DummyInput.class.getName());
  configMap.put(DataStep.INPUT_TYPE + "." + "starting.partitions", 10);
  configMap.put(BatchStep.REPARTITION_COLUMNS_PROPERTY, Lists.newArrayList("modulo"));
  Config config = ConfigFactory.parseMap(configMap);

  BatchStep batchStep = new BatchStep("test");
  batchStep.configure(config);
  batchStep.submit(Sets.<Step>newHashSet());

  Dataset<Row> df = batchStep.getData();
  int numPartitions = df.javaRDD().getNumPartitions();

  assertEquals(Contexts.getSparkSession().sqlContext().getConf("spark.sql.shuffle.partitions"),
      Integer.toString(numPartitions));
}
private JavaRDD<Row> planMutationsByKey(Dataset<Row> arriving, List<String> keyFieldNames,
    Config plannerConfig, Config outputConfig) {
  JavaPairRDD<Row, Row> keyedArriving =
      arriving.javaRDD().keyBy(new ExtractKeyFunction(keyFieldNames, accumulators));

  JavaPairRDD<Row, Iterable<Row>> arrivingByKey =
      keyedArriving.groupByKey(getPartitioner(keyedArriving));

  JavaPairRDD<Row, Tuple2<Iterable<Row>, Iterable<Row>>> arrivingAndExistingByKey =
      arrivingByKey.mapPartitionsToPair(new JoinExistingForKeysFunction(outputConfig, keyFieldNames, accumulators));

  JavaRDD<Row> planned =
      arrivingAndExistingByKey.flatMap(new PlanForKeyFunction(plannerConfig, accumulators));

  return planned;
}
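ExtractKeyFunction and the accumulators it receives are defined elsewhere. Ignoring the accumulator bookkeeping, a minimal sketch of a key extractor that projects the key columns of an arriving row into their own Row might look like this (hypothetical reconstruction, not the original code):

import java.util.List;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;

// Hypothetical ExtractKeyFunction (accumulator handling omitted):
// projects the configured key columns of an arriving row into a key Row.
public class ExtractKeyFunction implements Function<Row, Row> {
  private final List<String> keyFieldNames;

  public ExtractKeyFunction(List<String> keyFieldNames) {
    this.keyFieldNames = keyFieldNames;
  }

  @Override
  public Row call(Row arriving) throws Exception {
    Object[] keyValues = new Object[keyFieldNames.size()];
    for (int i = 0; i < keyFieldNames.size(); i++) {
      keyValues[i] = arriving.getAs(keyFieldNames.get(i));
    }
    return RowFactory.create(keyValues);
  }
}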
private JavaPairRDD<Row, Row> getDummyRDD(int numPartitions) {
  return Contexts.getSparkSession().range(numPartitions).javaRDD()
      .map(new LongToRowFunction())
      .keyBy(new ItselfFunction<Row>())
      .repartition(numPartitions);
}
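LongToRowFunction and ItselfFunction are helpers not shown here. Plausible minimal versions, assuming the dummy row only needs to carry the generated long and the key is the row itself (the originals may carry more columns):

import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;

// Hypothetical LongToRowFunction: wraps the generated long in a single-column Row.
public class LongToRowFunction implements Function<Long, Row> {
  @Override
  public Row call(Long value) throws Exception {
    return RowFactory.create(value);
  }
}

// Hypothetical ItselfFunction: keys an element by itself.
public class ItselfFunction<T> implements Function<T, T> {
  @Override
  public T call(T value) throws Exception {
    return value;
  }
}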