Refine search
JavaInputDStream<ConsumerRecord<K,M>> kafkaDStream = buildInputDStream(streamingContext); JavaPairDStream<K,M> pairDStream = kafkaDStream.mapToPair(mAndM -> new Tuple2<>(mAndM.key(), mAndM.value())); pairDStream.foreachRDD( new BatchUpdateFunction<>(getConfig(), keyClass, pairDStream.foreachRDD(new SaveToHDFSFunction<>( dataDirString + "/oryx", "data", pairDStream.foreachRDD(new DeleteOldDataFn<>(hadoopConf, dataDirString, Pattern.compile("-(\\d+)\\."), pairDStream.foreachRDD(new DeleteOldDataFn<>(hadoopConf, modelDirString, Pattern.compile("(\\d+)"),
JavaPairDStream<Integer, Long> responseCodeCountDStream = accessLogsDStream.transformToPair( new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<Integer, Long>>() { public JavaPairRDD<Integer, Long> call(JavaRDD<ApacheAccessLog> rdd) { return Functions.responseCodeCount(rdd); }}) .updateStateByKey(new Functions.ComputeRunningSum()); responseCodeCountDStream.foreachRDD(new Function<JavaPairRDD<Integer, Long>, Void>() { public Void call(JavaPairRDD<Integer, Long> rdd) { currentResponseCodeCounts = rdd.take(100); JavaPairDStream<String, Long> ipCumDStream = ipRawDStream.updateStateByKey( new Functions.ComputeRunningSum()); JavaPairDStream<String, Long> ipCountsDStream = ipDStream.reduceByKey(new Functions.LongSumReducer()); JavaPairDStream<String, Long> ipBytesSumDStream = ipBytesDStream.reduceByKey(new Functions.LongSumReducer()); JavaPairDStream<String, Tuple2<Long, Long>> ipBytesRequestCountDStream = ipBytesSumDStream.join(ipCountsDStream); JavaPairDStream<Text, LongWritable> writableDStream = ipDStream.mapToPair( new PairFunction<Tuple2<String, Long>, Text, LongWritable>() { public Tuple2<Text, LongWritable> call(Tuple2<String, Long> e) { class OutFormat extends SequenceFileOutputFormat<Text, LongWritable> { }; writableDStream.saveAsHadoopFiles(outDir, "pandas", Text.class, LongWritable.class, OutFormat.class); JavaDStream<String> ipAddressDStream = ipCumDStream.transform(
public void processAccessLogs(String outDir, JavaDStream<ApacheAccessLog> accessLogsDStream) { JavaDStream<ApacheAccessLog> windowDStream = accessLogsDStream.window( Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval()); JavaDStream<String> ip = accessLogsDStream.map( new Function<ApacheAccessLog, String>() { public String call(ApacheAccessLog entry) { }}); JavaDStream<Long> requestCountRBW = accessLogsDStream.map(new Function<ApacheAccessLog, Long>() { public Long call(ApacheAccessLog entry) { return 1L; return new Tuple2(entry.getIpAddress(), 1L); }}); JavaPairDStream<String, Long> ipCountDStream = ipAddressPairDStream.reduceByKeyAndWindow( Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval()); ipCountDStream.print(); JavaDStream<Long> requestCount = accessLogsDStream.countByWindow( Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval()); requestCount.print(); ipAddressRequestCount.print();
/**
 * Verifies that {@code transform} on a pair stream can produce a plain stream:
 * every tuple is mapped to its first element and the two resulting batches are
 * compared against the expected key sequences.
 */
@Test
public void testPairToNormalRDDTransform() {
  // Two input batches; every tuple carries the same second element (5).
  List<Tuple2<Integer, Integer>> firstBatch = Arrays.asList(
      new Tuple2<>(3, 5),
      new Tuple2<>(1, 5),
      new Tuple2<>(4, 5),
      new Tuple2<>(2, 5));
  List<Tuple2<Integer, Integer>> secondBatch = Arrays.asList(
      new Tuple2<>(2, 5),
      new Tuple2<>(3, 5),
      new Tuple2<>(4, 5),
      new Tuple2<>(1, 5));
  List<List<Tuple2<Integer, Integer>>> inputData = Arrays.asList(firstBatch, secondBatch);

  // Expected output: the first elements only, in the same order as the input.
  List<List<Integer>> expected = Arrays.asList(
      Arrays.asList(3, 1, 4, 2),
      Arrays.asList(2, 3, 4, 1));

  JavaDStream<Tuple2<Integer, Integer>> tupleStream =
      JavaTestUtils.attachTestInputStream(ssc, inputData, 1);
  JavaPairDStream<Integer, Integer> keyedStream = JavaPairDStream.fromJavaDStream(tupleStream);

  JavaDStream<Integer> keysOnly =
      keyedStream.transform(rdd -> rdd.map(tuple -> tuple._1()));
  JavaTestUtils.attachTestOutputStream(keysOnly);

  List<List<Integer>> actual = JavaTestUtils.runStreams(ssc, 2, 2);
  Assert.assertEquals(expected, actual);
}
new Tuple2<>("california", Arrays.asList("dodgers", "giants")), new Tuple2<>("new york", Arrays.asList("yankees", "mets"))), Arrays.asList( new Tuple2<>("california", Arrays.asList("sharks", "ducks")), new Tuple2<>("new york", Arrays.asList("rangers", "islanders")))); JavaPairDStream<String, String> pairStream = JavaPairDStream.fromJavaDStream(stream); JavaPairDStream<String, Iterable<String>> grouped = pairStream.groupByKey(); JavaTestUtils.attachTestOutputStream(grouped); List<List<Tuple2<String, Iterable<String>>>> result = JavaTestUtils.runStreams(ssc, 2, 2);
/**
 * Runs {@code reduceByKeyAndWindow} with both a reduce function (addition) and
 * its inverse (subtraction) over {@code stringIntKVStream}, using a 2000 ms
 * window that slides every 1000 ms, and checks the three output batches.
 */
@Test
public void testReduceByKeyAndWindowWithInverse() {
  List<List<Tuple2<String, Integer>>> inputData = stringIntKVStream;

  // One expected batch per slide of the window.
  List<Tuple2<String, Integer>> expectedFirst = Arrays.asList(
      new Tuple2<>("california", 4),
      new Tuple2<>("new york", 5));
  List<Tuple2<String, Integer>> expectedSecond = Arrays.asList(
      new Tuple2<>("california", 14),
      new Tuple2<>("new york", 9));
  List<Tuple2<String, Integer>> expectedThird = Arrays.asList(
      new Tuple2<>("california", 10),
      new Tuple2<>("new york", 4));
  List<List<Tuple2<String, Integer>>> expected =
      Arrays.asList(expectedFirst, expectedSecond, expectedThird);

  JavaDStream<Tuple2<String, Integer>> tupleStream =
      JavaTestUtils.attachTestInputStream(ssc, inputData, 1);
  JavaPairDStream<String, Integer> keyedStream = JavaPairDStream.fromJavaDStream(tupleStream);

  Duration windowLength = new Duration(2000);
  Duration slideInterval = new Duration(1000);
  JavaPairDStream<String, Integer> windowedSums = keyedStream.reduceByKeyAndWindow(
      (left, right) -> left + right,   // applied to values entering the window
      (total, old) -> total - old,     // inverse, applied to values leaving the window
      windowLength,
      slideInterval);
  JavaTestUtils.attachTestOutputStream(windowedSums);

  List<List<Tuple2<String, Integer>>> actual = JavaTestUtils.runStreams(ssc, 3, 3);
  Assert.assertEquals(expected, actual);
}
/**
 * Smoke test of the Java-facing StreamingKMeans API: trains on a stream of
 * vectors, predicts on a keyed stream of vectors, and runs two batches.
 * Nothing is asserted on the predictions; the test only needs the calls to
 * type-check and the streams to run.
 */
@Test
@SuppressWarnings("unchecked")
public void javaAPI() {
  // Training input: the same two-vector batch, supplied twice.
  List<Vector> trainingBatch = Arrays.asList(Vectors.dense(1.0), Vectors.dense(0.0));
  List<List<Vector>> trainingBatches = Arrays.asList(trainingBatch, trainingBatch);
  JavaDStream<Vector> training = attachTestInputStream(ssc, trainingBatches, 2);

  // Prediction input: keyed vectors, again the same batch twice.
  List<Tuple2<Integer, Vector>> testBatch = Arrays.asList(
      new Tuple2<>(10, Vectors.dense(1.0)),
      new Tuple2<>(11, Vectors.dense(0.0)));
  List<List<Tuple2<Integer, Vector>>> testBatches = Arrays.asList(testBatch, testBatch);
  JavaDStream<Tuple2<Integer, Vector>> testInput = attachTestInputStream(ssc, testBatches, 2);
  JavaPairDStream<Integer, Vector> test = JavaPairDStream.fromJavaDStream(testInput);

  StreamingKMeans skmeans = new StreamingKMeans()
      .setK(1)
      .setDecayFactor(1.0)
      .setInitialCenters(new Vector[]{Vectors.dense(1.0)}, new double[]{0.0});
  skmeans.trainOn(training);

  JavaPairDStream<Integer, Integer> prediction = skmeans.predictOnValues(test);
  attachTestOutputStream(prediction.count());
  runStreams(ssc, 2, 2);
}
// Closes the enclosing class, whose header lies outside this excerpt.
}
Arrays.asList(new Tuple2<>(1, "x")), Arrays.asList(new Tuple2<>(2, "y")) ); Arrays.asList(new Tuple2<>(1, new Tuple2<>(1, "x"))), Arrays.asList(new Tuple2<>(2, new Tuple2<>(2, "y"))) ); JavaPairDStream<Integer, String> pairStream1 = JavaPairDStream.fromJavaDStream( JavaTestUtils.attachTestInputStream(ssc, pairStream1input, 1)); Arrays.asList(stream1, stream2, pairStream1.toJavaDStream()); JavaRDD<Integer> rdd2 = (JavaRDD<Integer>) listOfRDDs.get(1); JavaRDD<Tuple2<Integer, String>> rdd3 = (JavaRDD<Tuple2<Integer, String>>) listOfRDDs.get(2); JavaPairRDD<Integer, String> prdd3 = JavaPairRDD.fromJavaRDD(rdd3); PairFunction<Integer, Integer, Integer> mapToTuple = (Integer i) -> new Tuple2<>(i, i); return rdd1.union(rdd2).mapToPair(mapToTuple).join(prdd3); }); JavaTestUtils.attachTestOutputStream(transformed2);
List<List<Tuple2<Integer, Integer>>> inputData = Arrays.asList( Arrays.asList( new Tuple2<>(3, 5), new Tuple2<>(1, 5), new Tuple2<>(4, 5), new Tuple2<>(2, 5)), Arrays.asList( JavaPairDStream<Integer, Integer> pairStream = JavaPairDStream.fromJavaDStream(stream); JavaPairDStream<Integer, Integer> sorted = pairStream.transformToPair(in -> in.sortByKey());
List<List<Tuple2<String, String>>> stringStringKVStream1 = Arrays.asList( Arrays.asList( new Tuple2<>("california", "dodgers"), new Tuple2<>("new york", "yankees")), Arrays.asList( new Tuple2<>("california", "sharks"), new Tuple2<>("new york", "rangers"))); JavaPairDStream<String, String> pairStream1 = JavaPairDStream.fromJavaDStream(stream1); JavaPairDStream<String, String> pairStream2 = JavaPairDStream.fromJavaDStream(stream2); pairStream1.transformWithToPair(pairStream2,(x, y, z) -> x.join(y));
@Test public void testVariousTransform() { // tests whether all variations of transform can be called from Java List<List<Integer>> inputData = Arrays.asList(Arrays.asList(1)); JavaDStream<Integer> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1); List<List<Tuple2<String, Integer>>> pairInputData = Arrays.asList(Arrays.asList(new Tuple2<>("x", 1))); JavaPairDStream<String, Integer> pairStream = JavaPairDStream.fromJavaDStream( JavaTestUtils.attachTestInputStream(ssc, pairInputData, 1)); JavaDStream<Integer> transformed1 = stream.transform(in -> null); JavaDStream<Integer> transformed2 = stream.transform((x, time) -> null); JavaPairDStream<String, Integer> transformed3 = stream.transformToPair(x -> null); JavaPairDStream<String, Integer> transformed4 = stream.transformToPair((x, time) -> null); JavaDStream<Integer> pairTransformed1 = pairStream.transform(x -> null); JavaDStream<Integer> pairTransformed2 = pairStream.transform((x, time) -> null); JavaPairDStream<String, String> pairTransformed3 = pairStream.transformToPair(x -> null); JavaPairDStream<String, String> pairTransformed4 = pairStream.transformToPair((x, time) -> null); }
JavaDStream<IoTData> nonFilteredIotDataStream = directKafkaStream.map(tuple -> tuple._2()); JavaPairDStream<String,IoTData> iotDataPairStream = nonFilteredIotDataStream.mapToPair(iot -> new Tuple2<String,IoTData>(iot.getVehicleId(),iot)).reduceByKey((a, b) -> a ); .mapWithState(StateSpec.function(processedVehicleFunc).timeout(Durations.seconds(3600)));//maintain state for one hour .filter(tuple -> tuple._2.equals(Boolean.FALSE)); JavaDStream<IoTData> filteredIotDataStream = filteredIotDStreams.map(tuple -> tuple._1);
/**
 * Sums the values of {@code stringIntKVStream} per key with {@code reduceByKey}
 * and checks the two resulting batches.
 */
@Test
public void testPairReduceByKey() {
  List<List<Tuple2<String, Integer>>> inputData = stringIntKVStream;

  // Expected per-key totals for each of the two batches.
  List<Tuple2<String, Integer>> expectedFirst = Arrays.asList(
      new Tuple2<>("california", 4),
      new Tuple2<>("new york", 5));
  List<Tuple2<String, Integer>> expectedSecond = Arrays.asList(
      new Tuple2<>("california", 10),
      new Tuple2<>("new york", 4));
  List<List<Tuple2<String, Integer>>> expected = Arrays.asList(expectedFirst, expectedSecond);

  JavaDStream<Tuple2<String, Integer>> tupleStream =
      JavaTestUtils.attachTestInputStream(ssc, inputData, 1);
  JavaPairDStream<String, Integer> keyedStream = JavaPairDStream.fromJavaDStream(tupleStream);

  JavaPairDStream<String, Integer> sums = keyedStream.reduceByKey((left, right) -> left + right);
  JavaTestUtils.attachTestOutputStream(sums);

  List<List<Tuple2<String, Integer>>> actual = JavaTestUtils.runStreams(ssc, 2, 2);
  Assert.assertEquals(expected, actual);
}
@SuppressWarnings("unchecked") @Test public void testVariousTransformWith() { // tests whether all variations of transformWith can be called from Java List<List<Integer>> inputData1 = Arrays.asList(Arrays.asList(1)); List<List<String>> inputData2 = Arrays.asList(Arrays.asList("x")); JavaDStream<Integer> stream1 = JavaTestUtils.attachTestInputStream(ssc, inputData1, 1); JavaDStream<String> stream2 = JavaTestUtils.attachTestInputStream(ssc, inputData2, 1); List<List<Tuple2<String, Integer>>> pairInputData1 = Arrays.asList(Arrays.asList(new Tuple2<>("x", 1))); List<List<Tuple2<Double, Character>>> pairInputData2 = Arrays.asList(Arrays.asList(new Tuple2<>(1.0, 'x'))); JavaPairDStream<String, Integer> pairStream1 = JavaPairDStream.fromJavaDStream( JavaTestUtils.attachTestInputStream(ssc, pairInputData1, 1)); JavaPairDStream<Double, Character> pairStream2 = JavaPairDStream.fromJavaDStream( JavaTestUtils.attachTestInputStream(ssc, pairInputData2, 1)); stream1.transformWith(stream2, (rdd1, rdd2, time) -> null); stream1.transformWith(pairStream1, (rdd1, rdd2, time) -> null); stream1.transformWithToPair(stream2, (rdd1, rdd2, time) -> null); stream1.transformWithToPair(pairStream1, (rdd1, rdd2, time) -> null); pairStream1.transformWith(stream2, (rdd1, rdd2, time) -> null); pairStream1.transformWith(pairStream1, (rdd1, rdd2, time) -> null); pairStream1.transformWithToPair(stream2, (rdd1, rdd2, time) -> null); pairStream1.transformWithToPair(pairStream2, (rdd1, rdd2, time) -> null); }
/**
 * Exercises {@code combineByKey} on {@code stringIntKVStream} with an identity
 * combiner factory, addition for both merge steps and a two-partition
 * {@code HashPartitioner}, then checks the two output batches.
 */
@Test
public void testCombineByKey() {
  List<List<Tuple2<String, Integer>>> inputData = stringIntKVStream;

  // Expected per-key totals for each of the two batches.
  List<Tuple2<String, Integer>> expectedFirst = Arrays.asList(
      new Tuple2<>("california", 4),
      new Tuple2<>("new york", 5));
  List<Tuple2<String, Integer>> expectedSecond = Arrays.asList(
      new Tuple2<>("california", 10),
      new Tuple2<>("new york", 4));
  List<List<Tuple2<String, Integer>>> expected = Arrays.asList(expectedFirst, expectedSecond);

  JavaDStream<Tuple2<String, Integer>> tupleStream =
      JavaTestUtils.attachTestInputStream(ssc, inputData, 1);
  JavaPairDStream<String, Integer> keyedStream = JavaPairDStream.fromJavaDStream(tupleStream);

  JavaPairDStream<String, Integer> combined = keyedStream.combineByKey(
      value -> value,                      // create combiner: the value itself
      (acc, value) -> acc + value,         // merge a value into a combiner
      (accLeft, accRight) -> accLeft + accRight,  // merge two combiners
      new HashPartitioner(2));
  JavaTestUtils.attachTestOutputStream(combined);

  List<List<Tuple2<String, Integer>>> actual = JavaTestUtils.runStreams(ssc, 2, 2);
  Assert.assertEquals(expected, actual);
}
/**
 * Drives a {@code mapWithState} pipeline over the given input and compares both
 * the emitted records and the state snapshots, batch by batch, with the
 * expected values.
 *
 * @param input one list of keys per batch; each key is paired with the value 1
 * @param mapWithStateSpec the state specification under test
 * @param expectedOutputs expected emitted records per batch; its size determines
 *     how many batches are run
 * @param expectedStateSnapshots expected (key, state) pairs per batch
 */
private <K, S, T> void testOperation(
    List<List<K>> input,
    StateSpec<K, Integer, S, T> mapWithStateSpec,
    List<Set<T>> expectedOutputs,
    List<Set<Tuple2<K, S>>> expectedStateSnapshots) {
  int numBatches = expectedOutputs.size();

  JavaDStream<K> keys = JavaTestUtils.attachTestInputStream(ssc, input, 2);
  JavaMapWithStateDStream<K, Integer, S, T> statefulStream =
      JavaPairDStream.fromJavaDStream(keys.map(key -> new Tuple2<>(key, 1)))
          .mapWithState(mapWithStateSpec);

  // Per-batch results are gathered into synchronized lists by the foreachRDD callbacks.
  List<Set<T>> collectedOutputs = Collections.synchronizedList(new ArrayList<>());
  statefulStream.foreachRDD(
      batch -> collectedOutputs.add(Sets.newHashSet(batch.collect())));

  List<Set<Tuple2<K, S>>> collectedStateSnapshots =
      Collections.synchronizedList(new ArrayList<>());
  statefulStream.stateSnapshots().foreachRDD(
      batch -> collectedStateSnapshots.add(Sets.newHashSet(batch.collect())));

  // The counter is created before the context starts.
  BatchCounter batchCounter = new BatchCounter(ssc.ssc());
  ssc.start();

  // Advance the manual clock by numBatches batch durations plus one, then wait for the batches.
  ManualClock clock = (ManualClock) ssc.ssc().scheduler().clock();
  clock.advance(ssc.ssc().progressListener().batchDuration() * numBatches + 1);
  batchCounter.waitUntilBatchesCompleted(numBatches, 10000);

  Assert.assertEquals(expectedOutputs, collectedOutputs);
  Assert.assertEquals(expectedStateSnapshots, collectedStateSnapshots);
}
// Closes the enclosing class, whose header lies outside this excerpt.
}
JavaStreamingContext sc = new JavaStreamingContext(sparkcontext, new Duration(5000)); JavaDStream<String> words = statuses.flatMap(l -> Arrays.asList(l.split(" ")).iterator()); JavaDStream<String> hashTags = words.filter((Function<String, Boolean>) word -> word.startsWith("#")); JavaPairDStream<String, Integer> tuples = hashTags.mapToPair(l -> new Tuple2<>(l.substring(1).toLowerCase(), 1)); JavaPairDStream<String, Integer> counts = tuples.reduceByKeyAndWindow( (Function2<Integer, Integer, Integer>) (i1, i2) -> i1 + i2, (Function2<Integer, Integer, Integer>) (i1, i2) -> i1 - i2, new Duration(60 * 5 * 1000), /* Window Length */ new Duration(60 * 5 * 1000) /* Sliding Interval */ ); JavaPairDStream<Integer, String> swappedCounts = counts.mapToPair( (PairFunction<Tuple2<String, Integer>, Integer, String>) in -> in.swap() ); JavaPairDStream<Integer, String> sortedCounts = swappedCounts.transformToPair( (Function<JavaPairRDD<Integer, String>, JavaPairRDD<Integer, String>>) in -> in.sortByKey(false) ); sortedCounts.foreachRDD( rdd -> { for (Tuple2<Integer, String> t: rdd.take(25)) out.append(t.toString()).append("\n");
/**
 * Groups the values of {@code stringIntKVStream} per key over a 2000 ms window
 * sliding every 1000 ms and checks the three output batches. Each batch is
 * passed through {@code convert} on both sides before comparison.
 */
@SuppressWarnings("unchecked")
@Test
public void testGroupByKeyAndWindow() {
  List<List<Tuple2<String, Integer>>> inputData = stringIntKVStream;

  // One expected batch per slide of the window.
  List<Tuple2<String, List<Integer>>> expectedFirst = Arrays.asList(
      new Tuple2<>("california", Arrays.asList(1, 3)),
      new Tuple2<>("new york", Arrays.asList(1, 4)));
  List<Tuple2<String, List<Integer>>> expectedSecond = Arrays.asList(
      new Tuple2<>("california", Arrays.asList(1, 3, 5, 5)),
      new Tuple2<>("new york", Arrays.asList(1, 1, 3, 4)));
  List<Tuple2<String, List<Integer>>> expectedThird = Arrays.asList(
      new Tuple2<>("california", Arrays.asList(5, 5)),
      new Tuple2<>("new york", Arrays.asList(1, 3)));
  List<List<Tuple2<String, List<Integer>>>> expected =
      Arrays.asList(expectedFirst, expectedSecond, expectedThird);

  JavaDStream<Tuple2<String, Integer>> tupleStream =
      JavaTestUtils.attachTestInputStream(ssc, inputData, 1);
  JavaPairDStream<String, Integer> keyedStream = JavaPairDStream.fromJavaDStream(tupleStream);

  Duration windowLength = new Duration(2000);
  Duration slideInterval = new Duration(1000);
  JavaPairDStream<String, Iterable<Integer>> groupWindowed =
      keyedStream.groupByKeyAndWindow(windowLength, slideInterval);
  JavaTestUtils.attachTestOutputStream(groupWindowed);

  List<List<Tuple2<String, List<Integer>>>> actual = JavaTestUtils.runStreams(ssc, 3, 3);

  // Compare the batch count first, then each batch after conversion.
  Assert.assertEquals(expected.size(), actual.size());
  for (int batch = 0; batch < actual.size(); batch++) {
    Assert.assertEquals(convert(expected.get(batch)), convert(actual.get(batch)));
  }
}
JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(1000)); JavaDStream<StreamingItem> streamOfItems = streamOfRecords.map(s -> new StreamingItem(s)); streamOfItems.mapToPair(si -> new Tuple2<>(si.getCategory(), si)); streamOfPairs.combineByKey(createCombinerFunction, mergeValueFunction, mergeCombinersFunction, new HashPartitioner(4)); streamOfCategoryCounts.foreachRDD(rdd -> { System.out.println("Batch size: " + rdd.count()); rdd.foreach(e -> System.out.println(e)); });
new Tuple2<>("california", 1), new Tuple2<>("new york", 2)); JavaPairRDD<String, Integer> initialRDD = JavaPairRDD.fromJavaRDD(tmpRDD); Arrays.asList(new Tuple2<>("california", 5), new Tuple2<>("new york", 7)), Arrays.asList(new Tuple2<>("california", 15), JavaPairDStream<String, Integer> pairStream = JavaPairDStream.fromJavaDStream(stream); JavaPairDStream<String, Integer> updated = pairStream.updateStateByKey((values, state) -> { int out = 0; if (state.isPresent()) {