protected final JavaStreamingContext buildStreamingContext() {
  log.info("Starting SparkContext with interval {} seconds", generationIntervalSec);

  SparkConf sparkConf = new SparkConf();

  // Only for tests, really
  if (sparkConf.getOption("spark.master").isEmpty()) {
    log.info("Overriding master to {} for tests", streamingMaster);
    sparkConf.setMaster(streamingMaster);
  }
  // Only for tests, really
  if (sparkConf.getOption("spark.app.name").isEmpty()) {
    String appName = "Oryx" + getLayerName();
    if (id != null) {
      appName = appName + "-" + id;
    }
    log.info("Overriding app name to {} for tests", appName);
    sparkConf.setAppName(appName);
  }
  extraSparkConfig.forEach((key, value) -> sparkConf.setIfMissing(key, value.toString()));

  // Turn this down to prevent long blocking at shutdown
  sparkConf.setIfMissing(
      "spark.streaming.gracefulStopTimeout",
      Long.toString(TimeUnit.MILLISECONDS.convert(generationIntervalSec, TimeUnit.SECONDS)));
  sparkConf.setIfMissing("spark.cleaner.ttl", Integer.toString(20 * generationIntervalSec));

  long generationIntervalMS =
      TimeUnit.MILLISECONDS.convert(generationIntervalSec, TimeUnit.SECONDS);

  JavaSparkContext jsc = JavaSparkContext.fromSparkContext(SparkContext.getOrCreate(sparkConf));
  return new JavaStreamingContext(jsc, new Duration(generationIntervalMS));
}
@Override
public void setup() {
  // Create a StreamingContext with a SparkConf configuration
  SparkConf sparkConf = new SparkConf(false)
      .setAppName("JaiSpark")
      .setSparkHome("target/sparkhome")
      .setMaster("local")
      .set("spark.executor.memory", "128m")
      .set("spark.local.dir", new File("target/sparkhome/tmp").getAbsolutePath())
      .set("spark.cores.max", "2")
      .set("spark.akka.threads", "2")
      .set("spark.akka.timeout", "60")
      .set("spark.logConf", "true")
      .set("spark.cleaner.delay", "3700")
      .set("spark.cleaner.ttl", "86400")
      .set("spark.shuffle.spill", "false")
      .set("spark.driver.host", "localhost")
      .set("spark.driver.port", "43214");
  jssc = new JavaStreamingContext(sparkConf, new Duration(5000));

  String checkpointDir = hadoopClusterService.getHDFSUri() + "/sparkcheckpoint";
  jssc.checkpoint(checkpointDir);

  startFlumeStream();
}
@Before
public void setUp() {
  kafkaTestUtils = new KafkaTestUtils();
  kafkaTestUtils.setup();
  SparkConf sparkConf = new SparkConf()
      .setMaster("local[4]")
      .setAppName(this.getClass().getSimpleName());
  ssc = new JavaStreamingContext(sparkConf, new Duration(500));
}
public static void main(String[] args) {
  SparkConf conf = new SparkConf()
      .setAppName("kafka-sandbox")
      .setMaster("local[*]");
  JavaSparkContext sc = new JavaSparkContext(conf);
  JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(2000));

  Set<String> topics = Collections.singleton("mytopic");
  Map<String, String> kafkaParams = new HashMap<>();
  kafkaParams.put("metadata.broker.list", "localhost:9092");

  JavaPairInputDStream<String, byte[]> directKafkaStream = KafkaUtils.createDirectStream(ssc,
      String.class, byte[].class, StringDecoder.class, DefaultDecoder.class, kafkaParams, topics);

  directKafkaStream
      .map(message -> recordInjection.invert(message._2).get())
      .foreachRDD(rdd -> {
        rdd.foreach(record -> {
          System.out.println("str1= " + record.get("str1")
              + ", str2= " + record.get("str2")
              + ", int1=" + record.get("int1"));
        });
      });

  ssc.start();
  ssc.awaitTermination();
}
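// The example above relies on a 'recordInjection' field that is not shown. A minimal sketch of how
// such an Avro injection is commonly declared with Twitter's bijection-avro library; the schema
// fields (str1, str2, int1) mirror the print statement above, everything else is an assumption.
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import com.twitter.bijection.Injection;
import com.twitter.bijection.avro.GenericAvroCodecs;

private static final String USER_SCHEMA = "{"
    + "\"type\":\"record\","
    + "\"name\":\"myrecord\","
    + "\"fields\":["
    + "  { \"name\":\"str1\", \"type\":\"string\" },"
    + "  { \"name\":\"str2\", \"type\":\"string\" },"
    + "  { \"name\":\"int1\", \"type\":\"int\" }"
    + "]}";

// Converts between GenericRecord and the byte[] payload read from Kafka.
private static final Injection<GenericRecord, byte[]> recordInjection =
    GenericAvroCodecs.toBinary(new Schema.Parser().parse(USER_SCHEMA));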
@SuppressWarnings("unchecked")
@Test
public void testQueueStream() {
  ssc.stop();
  // Create a new JavaStreamingContext without checkpointing
  SparkConf conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("test")
      .set("spark.streaming.clock", "org.apache.spark.util.ManualClock");
  ssc = new JavaStreamingContext(conf, new Duration(1000));

  List<List<Integer>> expected = Arrays.asList(
      Arrays.asList(1, 2, 3),
      Arrays.asList(4, 5, 6),
      Arrays.asList(7, 8, 9));

  JavaSparkContext jsc = new JavaSparkContext(ssc.ssc().sc());
  JavaRDD<Integer> rdd1 = jsc.parallelize(Arrays.asList(1, 2, 3));
  JavaRDD<Integer> rdd2 = jsc.parallelize(Arrays.asList(4, 5, 6));
  JavaRDD<Integer> rdd3 = jsc.parallelize(Arrays.asList(7, 8, 9));

  Queue<JavaRDD<Integer>> rdds = new LinkedList<>();
  rdds.add(rdd1);
  rdds.add(rdd2);
  rdds.add(rdd3);

  JavaDStream<Integer> stream = ssc.queueStream(rdds);
  JavaTestUtils.attachTestOutputStream(stream);
  List<List<Integer>> result = JavaTestUtils.runStreams(ssc, 3, 3);
  Assert.assertEquals(expected, result);
}
public static void main(String[] args) throws Exception {
  String zkQuorum = args[0];
  String group = args[1];
  SparkConf conf = new SparkConf().setAppName("KafkaInput");
  // Create a StreamingContext with a 1 second batch size
  JavaStreamingContext jssc = new JavaStreamingContext(conf, new Duration(1000));

  Map<String, Integer> topics = new HashMap<String, Integer>();
  topics.put("pandas", 1);
  JavaPairDStream<String, String> input = KafkaUtils.createStream(jssc, zkQuorum, group, topics);
  input.print();

  // start our streaming context and wait for it to "finish"
  jssc.start();
  // Wait for 10 seconds then exit. To run forever call without a timeout
  jssc.awaitTermination(10000);
  // Stop the streaming context
  jssc.stop();
}
@Before
public void setUp() {
  SparkConf conf = new SparkConf()
      .set("spark.streaming.clock", "org.apache.spark.util.ManualClock");
  spark = SparkSession.builder()
      .master("local[2]")
      .appName("JavaStatistics")
      .config(conf)
      .getOrCreate();
  jsc = new JavaSparkContext(spark.sparkContext());
  ssc = new JavaStreamingContext(jsc, new Duration(1000));
  ssc.checkpoint("checkpoint");
}
SparkConf sparkConf = new SparkConf().setAppName("Stream Tweets").setMaster("local[2]"); JavaStreamingContext sc = new JavaStreamingContext(sparkcontext, new Duration(5000)); JavaDStream<String> words = statuses.flatMap(l -> Arrays.asList(l.split(" ")).iterator()); JavaDStream<String> hashTags = words.filter((Function<String, Boolean>) word -> word.startsWith("#")); JavaPairDStream<String, Integer> tuples = hashTags.mapToPair(l -> new Tuple2<>(l.substring(1).toLowerCase(), 1)); JavaPairDStream<String, Integer> counts = tuples.reduceByKeyAndWindow( (Function2<Integer, Integer, Integer>) (i1, i2) -> i1 + i2, (Function2<Integer, Integer, Integer>) (i1, i2) -> i1 - i2, new Duration(60 * 5 * 1000), /* Window Length */ new Duration(60 * 5 * 1000) /* Sliding Interval */ ); JavaPairDStream<Integer, String> swappedCounts = counts.mapToPair( (PairFunction<Tuple2<String, Integer>, Integer, String>) in -> in.swap() ); JavaPairDStream<Integer, String> sortedCounts = swappedCounts.transformToPair( (Function<JavaPairRDD<Integer, String>, JavaPairRDD<Integer, String>>) in -> in.sortByKey(false) ); });
private JavaStreamingContext create(String streamingContextName, int port, long streamingBatchTime,
    String sparkHost) {
  SparkConf conf = new SparkConf();
  conf.set("spark.ui.port", String.valueOf(port));
  conf.setAppName(streamingContextName);
  conf.setJars(JavaStreamingContext.jarOfClass(StreamingEngine.class));
  conf.setMaster(sparkHost);

  conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
  conf.registerKryoClasses(new Class[] { StratioStreamingMessage.class, InsertMessage.class,
      ColumnType.class, Action.class });

  HashMap<String, String> tuningProperties = configurationContext.getSparkTunningProperties();
  if (tuningProperties != null && tuningProperties.size() > 0) {
    tuningProperties.forEach((key, value) -> conf.set(key, value));
  }

  JavaStreamingContext streamingContext =
      new JavaStreamingContext(conf, new Duration(streamingBatchTime));
  return streamingContext;
}
@Memoized
JavaStreamingContext jsc() {
  SparkConf conf = new SparkConf(true)
      .setMaster(master())
      .setAppName(getClass().getName());
  if (!jars().isEmpty()) {
    conf.setJars(jars().toArray(new String[0]));
  }
  for (Map.Entry<String, String> entry : conf().entrySet()) {
    conf.set(entry.getKey(), entry.getValue());
  }
  return new JavaStreamingContext(conf, new Duration(batchDuration()));
}
public static void main(String[] args) throws DataIngestException {
  CmdLineParser cmdLineParser = new CmdLineParser();
  final AppArgs appArgs = cmdLineParser.validateArgs(args);
  System.setProperty("HADOOP_USER_NAME", appArgs.getProperty(DiPConfiguration.HADOOP_USER_NAME));

  SparkConf conf = new SparkConf().setAppName("SparkTwitterStreaming").setMaster("local[*]");
  try (JavaStreamingContext jsc =
      new JavaStreamingContext(new JavaSparkContext(conf), new Duration(1000))) {

    JavaPairReceiverInputDStream<String, String> stream = KafkaUtils.createStream(jsc,
        appArgs.getProperty(DiPConfiguration.ZK_HOST) + ":" + appArgs.getProperty(DiPConfiguration.ZK_PORT),
        "spark-stream", getKafkaTopics(appArgs));

    JavaDStream<Object[]> twitterStreams =
        stream.map(tuple -> FlatJsonConverter.convertToValuesArray(tuple._2)).cache();

    SparkHdfsWriter.write(twitterStreams, appArgs);
    new SparkHBaseWriter(jsc.sparkContext(), appArgs).write(twitterStreams);

    SparkJdbcSourceWriter jdbcSourceWriter =
        new SparkJdbcSourceWriter(new SQLContext(jsc.sparkContext()), appArgs);
    new TopNLocationByTweets(jdbcSourceWriter, Integer.valueOf(appArgs.getProperty("topN")))
        .compute(twitterStreams);
    new TopNUsersWithMaxFollowers(jdbcSourceWriter, Integer.valueOf(appArgs.getProperty("topN")))
        .compute(twitterStreams);

    jsc.start();
    jsc.awaitTermination();
  }
}
// Assumes 'streamsList', a List<JavaDStream<byte[]>> of Kinesis shard streams, was created earlier.
Duration batchInterval = new Duration(2000);
SparkConf sparkConfig = new SparkConf().setAppName("JavaKinesisWordCountASL");
JavaStreamingContext jssc = new JavaStreamingContext(sparkConfig, batchInterval);

// Union all the shard streams into a single DStream.
JavaDStream<byte[]> unionStreams;
if (streamsList.size() > 1) {
  unionStreams = jssc.union(streamsList.get(0), streamsList.subList(1, streamsList.size()));
} else {
  unionStreams = streamsList.get(0);
}

JavaDStream<String> words = unionStreams.flatMap(new FlatMapFunction<byte[], String>() {
  @Override
  public Iterator<String> call(byte[] line) {
    String s = new String(line, StandardCharsets.UTF_8);
    return Arrays.asList(s.split(" ")).iterator();
  }
});

JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
    new PairFunction<String, String, Integer>() {
      @Override
      public Tuple2<String, Integer> call(String s) {
        return new Tuple2<>(s, 1);
      }
    }
).reduceByKey(
    new Function2<Integer, Integer, Integer>() {
      @Override
      public Integer call(Integer i1, Integer i2) {
        return i1 + i2;
      }
    }
);

wordCounts.print();
jssc.start();
jssc.awaitTermination();
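// The fragment above assumes a 'streamsList' of byte-array DStreams that is never shown. A sketch
// of how it is typically built with the spark-streaming-kinesis-asl connector, one receiver per
// shard; the stream name, endpoint, region, and shard count are placeholders, not from the original.
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.storage.StorageLevel;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.kinesis.KinesisUtils;
import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream;

int numStreams = 2; // usually one receiver per Kinesis shard
List<JavaDStream<byte[]>> streamsList = new ArrayList<>(numStreams);
for (int i = 0; i < numStreams; i++) {
  streamsList.add(KinesisUtils.createStream(
      jssc,
      "JavaKinesisWordCountASL",                  // Kinesis application (checkpoint table) name
      "myKinesisStream",                          // placeholder stream name
      "https://kinesis.us-east-1.amazonaws.com",  // placeholder endpoint URL
      "us-east-1",                                // placeholder region
      InitialPositionInStream.LATEST,
      batchInterval,                              // checkpoint interval, reusing the batch interval
      StorageLevel.MEMORY_AND_DISK_2()));
}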
SparkConf sparkConf = new SparkConf().setAppName("StreamingAvg"); JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, new Duration(10000)); JavaDStream<Tuple2<Integer,Integer>> countAndSum = nums.reduce(new Function2<Tuple2<Integer,Integer>, Tuple2<Integer,Integer>, Tuple2<Integer,Integer>>() { @Override public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> a, Tuple2<Integer, Integer> b) { countAndSum.foreachRDD(new Function<JavaRDD<Tuple2<Integer, Integer>>, Void>() { @Override public Void call(JavaRDD<Tuple2<Integer, Integer>> tuple2JavaRDD) throws Exception { ssc.start(); ssc.awaitTermination();
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("Usage: JavaFlumeEventCount <host> <port>");
    System.exit(1);
  }

  String host = args[0];
  int port = Integer.parseInt(args[1]);

  Duration batchInterval = new Duration(2000);
  SparkConf sparkConf = new SparkConf().setAppName("JavaFlumeEventCount");
  JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, batchInterval);
  JavaReceiverInputDStream<SparkFlumeEvent> flumeStream = FlumeUtils.createStream(ssc, host, port);

  flumeStream.count();
  flumeStream.count().map(in -> "Received " + in + " flume events.").print();

  ssc.start();
  ssc.awaitTermination();
}
System.out.println("columnFamily:" + columnFamily); SparkConf sparkConf = new SparkConf(); sparkConf.set("spark.cleaner.ttl", "120000"); jsc.addJar("SparkHBase.jar"); JavaStreamingContext jssc = new JavaStreamingContext(jsc, new Duration(1000)); JavaReceiverInputDStream<String> javaDstream = jssc.socketTextStream(host, Integer.parseInt(port));
JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(1000));

JavaDStream<String> streamOfRecords = ssc.textFileStream(fm.getDestination().getAbsolutePath());

// 3-second window, recomputed every batch interval (1 second).
JavaDStream<String> threeSecondsEverySecond = streamOfRecords.window(new Duration(3000));
// 5-second window, sliding every 2 seconds.
streamOfRecords.window(new Duration(5000), new Duration(2000));

streamOfRecords.foreachRDD((rdd, timeStamp) ->
    System.out.println("[original] TS: " + timeStamp + " Item count = " + rdd.count()));

ssc.start();
public static void main(String[] args) throws Exception {
  String master = args[0];
  JavaSparkContext sc = new JavaSparkContext(master, "StreamingLogInput");
  // Create a StreamingContext with a 1 second batch size
  JavaStreamingContext jssc = new JavaStreamingContext(sc, new Duration(1000));
  // Create a DStream from all the input on port 7777
  JavaDStream<String> lines = jssc.socketTextStream("localhost", 7777);
  // Filter our DStream for lines with "error"
  JavaDStream<String> errorLines = lines.filter(new Function<String, Boolean>() {
    public Boolean call(String line) {
      return line.contains("error");
    }
  });
  // Print out the lines with errors, which causes this DStream to be evaluated
  errorLines.print();

  // start our streaming context and wait for it to "finish"
  jssc.start();
  // Wait for 10 seconds then exit. To run forever call without a timeout
  jssc.awaitTermination(10000);
  // Stop the streaming context
  jssc.stop();
}
JavaStreamingContext ssc = new JavaStreamingContext("local[2]", "test", new Duration(200)); JavaReceiverInputDStream<String> input = ssc.receiverStream(new JavaSocketReceiver("localhost", server.port())); JavaDStream<String> mapped = input.map((Function<String, String>) v1 -> v1 + "."); mapped.foreachRDD((VoidFunction<JavaRDD<String>>) rdd -> { long count = rdd.count(); dataCounter.addAndGet(count); }); ssc.start(); long startTime = System.currentTimeMillis(); long timeout = 10000;
JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(1000));

checkpointDir.mkdir();
checkpointDir.deleteOnExit();
ssc.checkpoint(checkpointPath);

JavaDStream<String> streamOfRecords = ssc.textFileStream(fm.getDestination().getAbsolutePath());
streamOfRecords.foreachRDD(rdd -> {
  long records = rdd.count();
  System.out.println("[1] Records in this RDD: " + records);
});
JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(1000));

checkpointDir.mkdir();
checkpointDir.deleteOnExit();
ssc.checkpoint(checkpointPath);

JavaDStream<String> streamOfRecords = ssc.textFileStream(fm.getDestination().getAbsolutePath());
JavaDStream<StreamingItem> streamOfItems = streamOfRecords.map(s -> new StreamingItem(s));

// Key each item by its category so state can be tracked per category.
JavaPairDStream<String, StreamingItem> streamOfPairs =
    streamOfItems.mapToPair(si -> new Tuple2<>(si.getCategory(), si));

// 'mappingFunction' is assumed to be defined elsewhere.
streamOfPairs.mapWithState(StateSpec.function(mappingFunction));
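// The last snippet leaves 'mappingFunction' undefined. A minimal sketch of what such a function
// could look like, assuming the goal is a running count of StreamingItem objects per category;
// the counting logic is an assumption, not taken from the original.
import org.apache.spark.api.java.Optional;
import org.apache.spark.api.java.function.Function3;
import org.apache.spark.streaming.State;

// Emits (category, countSoFar) and keeps the running count in Spark-managed state.
Function3<String, Optional<StreamingItem>, State<Long>, Tuple2<String, Long>> mappingFunction =
    (category, item, state) -> {
      long count = (state.exists() ? state.get() : 0L) + (item.isPresent() ? 1L : 0L);
      state.update(count);
      return new Tuple2<>(category, count);
    };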