private static JavaRDD<String[]> getOtherFormatHiveInput(JavaSparkContext sc, String hiveTable) {
  SparkSession sparkSession = SparkSession.builder().config(sc.getConf()).enableHiveSupport().getOrCreate();
  final Dataset<Row> intermediateTable = sparkSession.table(hiveTable);
  return intermediateTable.javaRDD().map(new Function<Row, String[]>() {
    @Override
    public String[] call(Row row) throws Exception {
      // Convert every column to its string representation, preserving nulls
      String[] result = new String[row.size()];
      for (int i = 0; i < row.size(); i++) {
        final Object o = row.get(i);
        result[i] = (o != null) ? o.toString() : null;
      }
      return result;
    }
  });
}
JavaRDD<String> numbers = spark.read().textFile(inputPath).javaRDD();
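A possible follow-up step, assuming each line of the input file holds a single integer (an assumption based on the variable name numbers, not stated in the snippet); with Java 8 lambdas:

// Assumption: one integer per line; invalid lines would throw NumberFormatException here.
JavaRDD<Integer> parsed = numbers.map(line -> Integer.parseInt(line.trim()));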
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
  if (!dependencies.containsKey(intoDependency)) {
    throw new RuntimeException("Nest deriver points to non-existent nest-into dependency");
  }
  Dataset<Row> into = dependencies.get(intoDependency);

  if (!dependencies.containsKey(fromDependency)) {
    throw new RuntimeException("Nest deriver points to non-existent nest-from dependency");
  }
  Dataset<Row> from = dependencies.get(fromDependency);

  ExtractFieldsFunction extractFieldsFunction = new ExtractFieldsFunction(keyFieldNames);
  JavaPairRDD<List<Object>, Row> keyedIntoRDD = into.javaRDD().keyBy(extractFieldsFunction);
  JavaPairRDD<List<Object>, Row> keyedFromRDD = from.javaRDD().keyBy(extractFieldsFunction);

  NestFunction nestFunction = new NestFunction();
  JavaRDD<Row> nestedRDD = keyedIntoRDD.cogroup(keyedFromRDD).values().map(nestFunction);

  StructType nestedSchema = into.schema().add(nestedFieldName, DataTypes.createArrayType(from.schema()));
  Dataset<Row> nested = into.sqlContext().createDataFrame(nestedRDD, nestedSchema);

  return nested;
}
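The ExtractFieldsFunction used for keyBy above is not shown in this snippet. A minimal sketch of what such a key extractor might look like, assuming the key is simply the list of values of the configured key columns (the original implementation may differ):

import java.util.ArrayList;
import java.util.List;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Row;

// Hypothetical reconstruction of ExtractFieldsFunction: builds the cogroup key
// as the list of values for the configured key field names.
public class ExtractFieldsFunction implements Function<Row, List<Object>> {
  private final List<String> fieldNames;

  public ExtractFieldsFunction(List<String> fieldNames) {
    this.fieldNames = fieldNames;
  }

  @Override
  public List<Object> call(Row row) throws Exception {
    List<Object> values = new ArrayList<>();
    for (String fieldName : fieldNames) {
      values.add(row.getAs(fieldName)); // look up each key column by name
    }
    return values;
  }
}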
private List<Object> getValuesFromStep(Set<Step> steps) {
  String stepName = config.getString(STEP_PROPERTY);

  Optional<Step> optionalStep = StepUtils.getStepForName(stepName, steps);
  if (!optionalStep.isPresent()) {
    throw new RuntimeException("Step source for loop step '" + getName() + "' does not exist.");
  }

  Step step = optionalStep.get();
  if (!(step instanceof DataStep)) {
    throw new RuntimeException("Step source for loop step '" + getName() + "' is not a data step.");
  }

  Dataset<Row> stepRows = ((DataStep) step).getData();
  if (stepRows.count() > 1000) {
    throw new RuntimeException("Step source for loop step '" + getName() + "' can not provide more than 1000 values to loop over");
  }
  if (stepRows.schema().fields().length != 1) {
    throw new RuntimeException("Step source for loop step '" + getName() + "' can only provide a single field");
  }

  List<Object> stepValues = stepRows.javaRDD().map(new FirstFieldFunction()).collect();

  return stepValues;
}
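FirstFieldFunction is referenced but not defined here. A plausible minimal version, assuming the single output column sits at index 0 of each row:

import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Row;

// Hypothetical FirstFieldFunction: returns the value of the row's only field.
public class FirstFieldFunction implements Function<Row, Object> {
  @Override
  public Object call(Row row) throws Exception {
    return row.get(0);
  }
}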
@Override
public void applyBulkMutations(List<Tuple2<MutationType, Dataset<Row>>> planned) {
  for (Tuple2<MutationType, Dataset<Row>> mutation : planned) {
    MutationType mutationType = mutation._1();
    Dataset<Row> mutationDF = mutation._2();

    if (mutationType.equals(MutationType.INSERT)) {
      mutationDF.javaRDD().foreach(new SendRowToLogFunction(delimiter, logLevel));
    }
  }
}
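SendRowToLogFunction is not shown in this snippet. A minimal sketch that joins a row's fields with the configured delimiter and writes the result to the log; the delimiter and log-level handling here are illustrative assumptions, not the original code:

import java.util.ArrayList;
import java.util.List;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.Row;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Hypothetical SendRowToLogFunction: concatenates the row's fields with the
// delimiter and logs the line (per-level dispatch omitted for brevity).
public class SendRowToLogFunction implements VoidFunction<Row> {
  private static final Logger LOG = LoggerFactory.getLogger(SendRowToLogFunction.class);
  private final String delimiter;

  public SendRowToLogFunction(String delimiter, String logLevel) {
    this.delimiter = delimiter;
  }

  @Override
  public void call(Row row) throws Exception {
    List<String> fields = new ArrayList<>();
    for (int i = 0; i < row.size(); i++) {
      fields.add(String.valueOf(row.get(i)));
    }
    LOG.info(String.join(delimiter, fields));
  }
}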
@Override
public void applyBulkMutations(List<Tuple2<MutationType, Dataset<Row>>> planned) {
  for (Tuple2<MutationType, Dataset<Row>> mutation : planned) {
    MutationType mutationType = mutation._1();
    Dataset<Row> mutationDF = mutation._2();

    if (mutationType.equals(MutationType.INSERT)) {
      mutationDF.javaRDD().foreachPartition(new SendRowToKafkaFunction(config));
    }
  }
}
@Override
public Dataset<Row> read() throws Exception {
  JavaRDD<Long> baseRDD = Contexts.getSparkSession().range(tasks).javaRDD().repartition(tasks);

  JavaRDD<Row> fixRDD = baseRDD.flatMap(new GenerateFIXMessages(ordersPerTask));

  StructType schema = DataTypes.createStructType(Lists.newArrayList(
      DataTypes.createStructField("fix", DataTypes.StringType, false)));

  Dataset<Row> fixDF = Contexts.getSparkSession().createDataFrame(fixRDD, schema);

  return fixDF;
}
@Test
public void testInputRepartitionColumnsAndPartitionCount() throws Exception {
  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put(BatchStep.REPARTITION_COLUMNS_PROPERTY, Lists.newArrayList("modulo"));
  configMap.put(BatchStep.REPARTITION_NUM_PARTITIONS_PROPERTY, 5);
  configMap.put(DataStep.INPUT_TYPE + "." + InputFactory.TYPE_CONFIG_NAME, DummyInput.class.getName());
  configMap.put(DataStep.INPUT_TYPE + "." + "starting.partitions", 10);
  Config config = ConfigFactory.parseMap(configMap);

  BatchStep batchStep = new BatchStep("test");
  batchStep.configure(config);
  batchStep.submit(Sets.<Step>newHashSet());

  Dataset<Row> df = batchStep.getData();
  int numPartitions = df.javaRDD().getNumPartitions();

  assertEquals(5, numPartitions);
}
@Test
public void testInputRepartition() throws Exception {
  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put(DataStep.INPUT_TYPE + "." + InputFactory.TYPE_CONFIG_NAME, DummyInput.class.getName());
  configMap.put(DataStep.INPUT_TYPE + "." + "starting.partitions", 5);
  configMap.put(BatchStep.REPARTITION_NUM_PARTITIONS_PROPERTY, 10);
  Config config = ConfigFactory.parseMap(configMap);

  BatchStep batchStep = new BatchStep("test");
  batchStep.configure(config);
  batchStep.submit(Sets.<Step>newHashSet());

  Dataset<Row> df = batchStep.getData();
  int numPartitions = df.javaRDD().getNumPartitions();

  // assertEquals takes the expected value first
  assertEquals(10, numPartitions);
}
@Test
public void testInputCoalesce() throws Exception {
  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put(DataStep.INPUT_TYPE + "." + InputFactory.TYPE_CONFIG_NAME, DummyInput.class.getName());
  configMap.put(DataStep.INPUT_TYPE + "." + "starting.partitions", 10);
  configMap.put(BatchStep.COALESCE_NUM_PARTITIONS_PROPERTY, 5);
  Config config = ConfigFactory.parseMap(configMap);

  BatchStep batchStep = new BatchStep("test");
  batchStep.configure(config);
  batchStep.submit(Sets.<Step>newHashSet());

  Dataset<Row> df = batchStep.getData();
  int numPartitions = df.javaRDD().getNumPartitions();

  // assertEquals takes the expected value first
  assertEquals(5, numPartitions);
}
/**
 * Create a compatible schema and RDD for DataVec
 *
 * @param dataFrame the dataframe to convert
 * @return the converted schema and RDD of writables
 */
public static Pair<Schema, JavaRDD<List<Writable>>> toRecords(DataRowsFacade dataFrame) {
  Schema schema = fromStructType(dataFrame.get().schema());
  return new Pair<>(schema, dataFrame.get().javaRDD().map(new ToRecord(schema)));
}
@Test
public void testInputRepartitionColumns() throws Exception {
  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put(DataStep.INPUT_TYPE + "." + InputFactory.TYPE_CONFIG_NAME, DummyInput.class.getName());
  configMap.put(DataStep.INPUT_TYPE + "." + "starting.partitions", 10);
  configMap.put(BatchStep.REPARTITION_COLUMNS_PROPERTY, Lists.newArrayList("modulo"));
  Config config = ConfigFactory.parseMap(configMap);

  BatchStep batchStep = new BatchStep("test");
  batchStep.configure(config);
  batchStep.submit(Sets.<Step>newHashSet());

  Dataset<Row> df = batchStep.getData();
  int numPartitions = df.javaRDD().getNumPartitions();

  assertEquals(Contexts.getSparkSession().sqlContext().getConf("spark.sql.shuffle.partitions"),
      Integer.toString(numPartitions));
}
private JavaRDD<Row> planMutationsByKey(Dataset<Row> arriving, List<String> keyFieldNames,
    Config plannerConfig, Config outputConfig) {
  JavaPairRDD<Row, Row> keyedArriving =
      arriving.javaRDD().keyBy(new ExtractKeyFunction(keyFieldNames, accumulators));

  JavaPairRDD<Row, Iterable<Row>> arrivingByKey =
      keyedArriving.groupByKey(getPartitioner(keyedArriving));

  JavaPairRDD<Row, Tuple2<Iterable<Row>, Iterable<Row>>> arrivingAndExistingByKey =
      arrivingByKey.mapPartitionsToPair(new JoinExistingForKeysFunction(outputConfig, keyFieldNames, accumulators));

  JavaRDD<Row> planned =
      arrivingAndExistingByKey.flatMap(new PlanForKeyFunction(plannerConfig, accumulators));

  return planned;
}
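ExtractKeyFunction and the accumulators it receives are defined elsewhere. Ignoring the accumulator bookkeeping, a minimal sketch of a key extractor that projects the key columns of an arriving row into their own Row might look like this (hypothetical reconstruction, not the original code):

import java.util.List;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;

// Hypothetical ExtractKeyFunction (accumulator handling omitted):
// projects the configured key columns of an arriving row into a key Row.
public class ExtractKeyFunction implements Function<Row, Row> {
  private final List<String> keyFieldNames;

  public ExtractKeyFunction(List<String> keyFieldNames) {
    this.keyFieldNames = keyFieldNames;
  }

  @Override
  public Row call(Row arriving) throws Exception {
    Object[] keyValues = new Object[keyFieldNames.size()];
    for (int i = 0; i < keyFieldNames.size(); i++) {
      keyValues[i] = arriving.getAs(keyFieldNames.get(i));
    }
    return RowFactory.create(keyValues);
  }
}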
private JavaPairRDD<Row, Row> getDummyRDD(int numPartitions) {
  return Contexts.getSparkSession().range(numPartitions).javaRDD()
      .map(new LongToRowFunction())
      .keyBy(new ItselfFunction<Row>())
      .repartition(numPartitions);
}
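LongToRowFunction and ItselfFunction are helpers not shown here. Plausible minimal versions, assuming the dummy row only needs to carry the generated long and the key is the row itself (the originals may carry more columns):

import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;

// Hypothetical LongToRowFunction: wraps the generated long in a single-column Row.
public class LongToRowFunction implements Function<Long, Row> {
  @Override
  public Row call(Long value) throws Exception {
    return RowFactory.create(value);
  }
}

// Hypothetical ItselfFunction: keys an element by itself.
public class ItselfFunction<T> implements Function<T, T> {
  @Override
  public T call(T value) throws Exception {
    return value;
  }
}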