private Map<Integer,Collection<String>> getDistinctValues(JavaRDD<String[]> parsedRDD) {
  int[] categoricalIndices = IntStream.range(0, inputSchema.getNumFeatures()).
      filter(inputSchema::isCategorical).toArray();
  return parsedRDD.mapPartitions(data -> {
    Map<Integer,Collection<String>> categoryValues = new HashMap<>();
    for (int i : categoricalIndices) {
      categoryValues.put(i, new HashSet<>());
    }
    data.forEachRemaining(datum ->
        categoryValues.forEach((category, values) -> values.add(datum[category]))
    );
    return Collections.singleton(categoryValues).iterator();
  }).reduce((v1, v2) -> {
    // Assumes both have the same key set
    v1.forEach((category, values) -> values.addAll(v2.get(category)));
    return v1;
  });
}
AvgCount result = rdd.mapPartitions(setup).reduce(combine);
System.out.println(result.avg());
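The snippet above relies on an AvgCount class and setup/combine functions that are not shown. A minimal sketch of what they might look like, assuming rdd is a JavaRDD<Integer> and a Spark 2.x FlatMapFunction (whose call returns an Iterator):

class AvgCount implements java.io.Serializable {
  int total_;
  int num_;
  AvgCount(int total, int num) { total_ = total; num_ = num; }
  double avg() { return total_ / (double) num_; }
}

// One AvgCount per partition: sum and count the partition's elements locally.
FlatMapFunction<Iterator<Integer>, AvgCount> setup = input -> {
  AvgCount a = new AvgCount(0, 0);
  while (input.hasNext()) {
    a.total_ += input.next();
    a.num_ += 1;
  }
  return Collections.singletonList(a).iterator();
};

// Combine the per-partition accumulators into one global AvgCount.
Function2<AvgCount, AvgCount, AvgCount> combine =
    (a, b) -> new AvgCount(a.total_ + b.total_, a.num_ + b.num_);

The point of the pattern is that object creation happens once per partition rather than once per element, which is the usual reason to prefer mapPartitions over map here.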
// Fragment of a larger mapPartitions(...).reduce(...) pipeline; the enclosing
// method and the merge inside the loop are not shown in the original.
    return Collections.singleton(treeNodeIDCounts).iterator();
  }).reduce((a, b) -> {
    // Assumes both lists have one entry per tree
    Preconditions.checkArgument(a.size() == b.size());
    for (int i = 0; i < a.size(); i++) {
      // ... per-tree merge elided in the original fragment,
      // plausibly merging b.get(i) into a.get(i)
    }
    return a; // assumed: the reducer must return the merged accumulator
  });
/**
 * @param trainPointData data to run down trees
 * @param model random decision forest model to count on
 * @return map of predictor index to the number of training examples that reached a
 *  node whose decision is based on that feature. The index is among predictors, not all
 *  features, since there are fewer predictors than features. That is, the index will
 *  match the one used in the {@link RandomForestModel}.
 */
private static IntLongHashMap predictorExampleCounts(JavaRDD<? extends LabeledPoint> trainPointData,
                                                     RandomForestModel model) {
  return trainPointData.mapPartitions(data -> {
    IntLongHashMap featureIndexCount = new IntLongHashMap();
    data.forEachRemaining(datum -> {
      double[] featureVector = datum.features().toArray();
      for (DecisionTreeModel tree : model.trees()) {
        org.apache.spark.mllib.tree.model.Node node = tree.topNode();
        // This logic cloned from Node.predict:
        while (!node.isLeaf()) {
          Split split = node.split().get();
          int featureIndex = split.feature();
          // Count feature
          featureIndexCount.addToValue(featureIndex, 1);
          node = nextNode(featureVector, node, split, featureIndex);
        }
      }
    });
    return Collections.singleton(featureIndexCount).iterator();
  }).reduce(RDFUpdate::merge);
}
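The method above calls two helpers, RDFUpdate::merge and nextNode, that are not shown. A plausible sketch of each, assuming Eclipse Collections' IntLongHashMap and Spark MLlib's Split/Node API; the categorical branch of nextNode in particular is an assumption:

// Sketch: merge the smaller count map into the larger one and return it.
private static IntLongHashMap merge(IntLongHashMap a, IntLongHashMap b) {
  if (b.size() > a.size()) {
    return merge(b, a);
  }
  b.forEachKeyValue(a::addToValue); // addToValue(key, delta) accumulates counts
  return a;
}

// Sketch: follow one split down the tree, mirroring Node.predict.
private static Node nextNode(double[] featureVector, Node node, Split split, int featureIndex) {
  double featureValue = featureVector[featureIndex];
  if (split.featureType().equals(FeatureType.Continuous())) {
    // Continuous split: go left iff the value is at or below the threshold
    return featureValue <= split.threshold() ? node.leftNode().get() : node.rightNode().get();
  }
  // Categorical split: go left iff the (boxed) value is among the split's categories
  return split.categories().contains(featureValue) ? node.leftNode().get() : node.rightNode().get();
}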
@Test
public void reduce() {
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
  int sum = rdd.reduce(new AddInts());
  assertEquals(10, sum);
}
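AddInts is not defined in the snippet. A minimal sketch, assuming Spark's org.apache.spark.api.java.function.Function2 (a named class rather than a lambda is sometimes preferred in tests for serialization clarity):

// Sketch of the reducer assumed by the test above.
static class AddInts implements Function2<Integer, Integer, Integer> {
  @Override
  public Integer call(Integer a, Integer b) {
    return a + b;
  }
}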
@Test
public void testJavaJdbcRDD() throws Exception {
  JavaRDD<Integer> rdd = JdbcRDD.create(
      sc,
      () -> DriverManager.getConnection("jdbc:derby:target/JavaJdbcRDDSuiteDb"),
      "SELECT DATA FROM FOO WHERE ? <= ID AND ID <= ?",
      1, 100, 1,
      r -> r.getInt(1)
  ).cache();

  Assert.assertEquals(100, rdd.count());
  Assert.assertEquals(Integer.valueOf(10100), rdd.reduce((i1, i2) -> i1 + i2));
}
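The assertions imply a FOO table with 100 rows whose DATA column sums to 10100, e.g. DATA = 2*ID for ID 1..100, since 2*(1+...+100) = 10100. A plausible fixture consistent with that, assuming embedded Derby:

// Sketch of a @Before fixture matching the assertions above.
Class.forName("org.apache.derby.jdbc.EmbeddedDriver");
try (Connection connection =
         DriverManager.getConnection("jdbc:derby:target/JavaJdbcRDDSuiteDb;create=true")) {
  try (Statement create = connection.createStatement()) {
    create.execute("CREATE TABLE FOO (" +
        "ID INTEGER NOT NULL GENERATED ALWAYS AS IDENTITY (START WITH 1, INCREMENT BY 1), " +
        "DATA INTEGER)");
  }
  try (PreparedStatement insert =
           connection.prepareStatement("INSERT INTO FOO(DATA) VALUES(?)")) {
    for (int i = 1; i <= 100; i++) {
      insert.setInt(1, i * 2); // DATA = 2*ID, so the 100 rows sum to 10100
      insert.executeUpdate();
    }
  }
}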
@Test
public void foldReduce() {
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 1, 2, 3, 5, 8, 13));
  Function2<Integer, Integer, Integer> add = (a, b) -> a + b;
  int sum = rdd.fold(0, add);
  assertEquals(33, sum);
  sum = rdd.reduce(add);
  assertEquals(33, sum);
}
@Override
public Optional<T> reduce(@NonNull SerializableBinaryOperator<T> reducer) {
  return Optional.of(rdd.reduce((t, u) -> {
    // Re-apply the broadcast configuration on the executor before reducing
    Configurator.INSTANCE.configure(configBroadcast.value());
    return reducer.apply(t, u);
  }));
}
public static <FieldT extends AbstractFieldElementExpanded<FieldT>> FieldT distributedVariableBaseMSM(
    final JavaPairRDD<Long, FieldT> scalars,
    final JavaPairRDD<Long, FieldT> bases) {
  // Join scalars and bases on their index, multiply pairwise, and sum the products
  return scalars.join(bases).map(pair -> pair._2._1.mul(pair._2._2)).reduce(FieldT::add);
}
@SuppressWarnings("unchecked")
@Override
public Summary[] getSummaryImpl() throws DDFException {
  RDD<Object[]> rdd = (RDD<Object[]>) this.getDDF().getRepresentationHandler()
      .get(RDD.class, Object[].class);
  JavaRDD<Object[]> data = rdd.toJavaRDD();
  Summary[] stats = data.map(new GetSummaryMapper()).reduce(new GetSummaryReducer());
  return stats;
}