Class.forName("scala.collection.mutable.WrappedArray$ofRef") }; SparkConf conf = new SparkConf().setAppName("Merge dictionary for cube:" + cubeName + ", segment " + segmentId); conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator"); conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray); try (JavaSparkContext sc = new JavaSparkContext(conf)) { KylinSparkJobListener jobListener = new KylinSparkJobListener(); sc.sc().addSparkListener(jobListener); HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(dictOutputPath)); final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration()); JavaRDD<Integer> indexRDD = sc.parallelize(indexs, columnLength + 1); JavaPairRDD<Text, Text> colToDictPathRDD = indexRDD.mapToPair(new MergeDictAndStatsFunction(cubeName, metaUrl, segmentId, StringUtil.splitByComma(segmentIds), statOutputPath, tblColRefs, sConf));
SparkConf conf = new SparkConf().setMaster(master).setAppName("basicavgwithkyro");
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
conf.set("spark.kryo.registrator", AvgRegistrator.class.getName());
JavaSparkContext sc = new JavaSparkContext(conf);

JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));

// The bodies of the two aggregation functions and the zero value were missing;
// this completion assumes the AvgCount sketch shown after this snippet.
Function2<AvgCount, Integer, AvgCount> addAndCount = new Function2<AvgCount, Integer, AvgCount>() {
    @Override
    public AvgCount call(AvgCount a, Integer x) {
        a.total += x;
        a.num += 1;
        return a;
    }
};
Function2<AvgCount, AvgCount, AvgCount> combine = new Function2<AvgCount, AvgCount, AvgCount>() {
    @Override
    public AvgCount call(AvgCount a, AvgCount b) {
        a.total += b.total;
        a.num += b.num;
        return a;
    }
};
AvgCount initial = new AvgCount(0, 0);

AvgCount result = rdd.aggregate(initial, addAndCount, combine);
System.out.println(result.avg());
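The snippet refers to an AvgCount value class and an AvgRegistrator Kryo registrator that are defined elsewhere; a minimal sketch of what they could look like (field names and layout are assumptions, only the class names come from the snippet):

// Illustrative helpers assumed by the snippet above.
public static class AvgCount implements java.io.Serializable {
    public int total;
    public int num;
    public AvgCount(int total, int num) { this.total = total; this.num = num; }
    public double avg() { return total / (double) num; }
}

public static class AvgRegistrator implements org.apache.spark.serializer.KryoRegistrator {
    @Override
    public void registerClasses(com.esotericsoftware.kryo.Kryo kryo) {
        // Register the class shipped through the serializer, as required when
        // spark.kryo.registrationRequired is enabled.
        kryo.register(AvgCount.class);
    }
}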
private SparkConf createSparkConf(List<SparkConfiguration.Configuration> configurations, SparkConf old) {
    SparkConf sparkConf = new SparkConf();
    sparkConf.set(SPARK_EXTRA_LISTENERS, old.get(SPARK_EXTRA_LISTENERS));
    sparkConf.set(BEAKERX_ID, old.get(BEAKERX_ID));
    if (old.contains(SPARK_APP_NAME)) {
        sparkConf.set(SPARK_APP_NAME, old.get(SPARK_APP_NAME));
    }
    configurations.forEach(x -> {
        if (x.getName() != null) {
            sparkConf.set(x.getName(), (x.getValue() != null) ? x.getValue() : "");
        }
    });
    return sparkConf;
}
public static JavaSparkContext createConf() {
    SparkConf sparkConf = new SparkConf();
    sparkConf.setAppName("animalClass");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    return sc;
}
@Before
public void setUp() {
    SparkConf conf = new SparkConf()
        .setMaster("local[2]")
        .setAppName("test")
        .set("spark.streaming.clock", "org.apache.spark.util.ManualClock");
    ssc = new JavaStreamingContext(conf, new Duration(1000));
    ssc.checkpoint("checkpoint");
}
SparkConf conf = new SparkConf();
conf.setMaster("local[*]");
conf.setAppName("DataVec Join Example");
JavaSparkContext sc = new JavaSparkContext(conf);

JavaRDD<List<Writable>> customerInfo = sc.textFile(customerInfoPath).map(new StringToWritablesFunction(rr));
JavaRDD<List<Writable>> purchaseInfo = sc.textFile(purchaseInfoPath).map(new StringToWritablesFunction(rr));

List<List<Writable>> customerInfoList = customerInfo.collect();
List<List<Writable>> purchaseInfoList = purchaseInfo.collect();
SparkConf conf = new SparkConf();
conf.setMaster("local[*]");
conf.setAppName("DataVec Example");
JavaSparkContext sc = new JavaSparkContext(conf);

JavaRDD<String> stringData = sc.textFile(filename);
JavaRDD<List<Writable>> parsedInputData = stringData.filter((x) -> !x.isEmpty()).map(new StringToWritablesFunction(rr));

List<String> inputDataCollected = stringData.collect();
System.out.println("\n\n---- Original Data ----");
for (String s : inputDataCollected) System.out.println("'" + s + "'");
@Test
public void javaSparkContext() {
    String[] jars = new String[] {};
    java.util.Map<String, String> environment = new java.util.HashMap<>();

    new JavaSparkContext(new SparkConf().setMaster("local").setAppName("name")).stop();
    new JavaSparkContext("local", "name", new SparkConf()).stop();
    new JavaSparkContext("local", "name").stop();
    new JavaSparkContext("local", "name", "sparkHome", "jarFile").stop();
    new JavaSparkContext("local", "name", "sparkHome", jars).stop();
    new JavaSparkContext("local", "name", "sparkHome", jars, environment).stop();
}
public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("JavaBookExample");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    JavaRDD<String> spam = sc.textFile("files/spam.txt");
    JavaRDD<String> ham = sc.textFile("files/ham.txt");

    JavaRDD<LabeledPoint> positiveExamples = spam.map(new Function<String, LabeledPoint>() {
        @Override
        public LabeledPoint call(String email) {
            return new LabeledPoint(1, tf.transform(Arrays.asList(email.split(" "))));
        }
    });
    JavaRDD<LabeledPoint> negativeExamples = ham.map(new Function<String, LabeledPoint>() {
        @Override
        public LabeledPoint call(String email) {
            return new LabeledPoint(0, tf.transform(Arrays.asList(email.split(" "))));
        }
    });
    JavaRDD<LabeledPoint> trainingData = positiveExamples.union(negativeExamples);
    trainingData.cache(); // Cache data since Logistic Regression is an iterative algorithm.
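    // The snippet uses a HashingTF named `tf` defined elsewhere and stops just
    // before training. A hedged sketch of those missing pieces using the classic
    // MLlib API; the 100-feature size and the sample message are assumptions.
    final HashingTF tf = new HashingTF(100); // org.apache.spark.mllib.feature.HashingTF

    // Train a logistic-regression model on the cached training data.
    LogisticRegressionModel model = new LogisticRegressionWithSGD().run(trainingData.rdd());

    // Score a new message with the same feature transform (made-up text).
    Vector features = tf.transform(Arrays.asList("Dear friend please send money".split(" ")));
    System.out.println("Prediction: " + model.predict(features));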
Class.forName("org.apache.kylin.engine.mr.steps.SelfDefineSortableKey") }; SparkConf conf = new SparkConf().setAppName("Fact distinct columns for:" + cubeName + " segment " + segmentId); conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator"); conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray); try (JavaSparkContext sc = new JavaSparkContext(conf)) { sc.sc().addSparkListener(jobListener); HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath)); final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration()); final LongAccumulator bytesWritten = sc.sc().longAccumulator(); JavaPairRDD<SelfDefineSortableKey, Text> flatOutputRDD = recordRDD.mapPartitionsToPair( new FlatOutputFucntion(cubeName, segmentId, metaUrl, sConf, samplingPercent, bytesWritten)); long recordCount = recordRDD.count(); logger.info("Map input records={}", recordCount); logger.info("HDFS Read: {} HDFS Write", bytesWritten.value());
        .build();   // end of a record-reader/schema builder whose start is not shown

SparkConf conf = new SparkConf();
conf.setMaster("local[*]");
conf.setAppName("DataVec Example");
JavaSparkContext sc = new JavaSparkContext(conf);

JavaRDD<String> stringData = sc.textFile(directory);
JavaRDD<List<Writable>> parsedInputData = stringData.map(new StringToWritablesFunction(rr));
private SparkConf initializeSparkConf(String pluginNames) {
    return new SparkConf()
        .setMaster("local")
        .setAppName("test")
        .set(EXECUTOR_PLUGIN_CONF_NAME, pluginNames);
}
String mongodbUri = MONGODB_HOST + args[3];

SparkConf conf = new SparkConf().setAppName("SparkRecommender");
JavaSparkContext sc = new JavaSparkContext(conf);
Logger log = sc.sc().log();

predictionsConfig.set("mongo.output.uri", mongodbUri);

JavaPairRDD<Object, BSONObject> bsonRatingsData = sc.newAPIHadoopFile(
        ratingsUri, BSONFileInputFormat.class, Object.class, BSONObject.class, bsonDataConfig);
log.warn("ratings = " + ratingsData.count());
log.warn("users = " + userData.count());

JavaRDD<Object> movieData = sc.newAPIHadoopFile(moviesUri,   // remaining arguments missing in the original
log.warn("movies = " + movieData.count());

JavaRDD<Rating> predictions = model.predict(usersMovies.rdd()).toJavaRDD();

// Tail of a save call whose opening line is missing from the original:
        Object.class, Object.class, MongoOutputFormat.class, predictionsConfig);
sc.sc().log().info("predictionsOutput.splits() = " + predictionsOutput.splits().size());
protected final JavaStreamingContext buildStreamingContext() {
    log.info("Starting SparkContext with interval {} seconds", generationIntervalSec);

    SparkConf sparkConf = new SparkConf();

    // Only for tests, really
    if (sparkConf.getOption("spark.master").isEmpty()) {
        log.info("Overriding master to {} for tests", streamingMaster);
        sparkConf.setMaster(streamingMaster);
    }
    // Only for tests, really
    if (sparkConf.getOption("spark.app.name").isEmpty()) {
        String appName = "Oryx" + getLayerName();
        if (id != null) {
            appName = appName + "-" + id;
        }
        log.info("Overriding app name to {} for tests", appName);
        sparkConf.setAppName(appName);
    }
    extraSparkConfig.forEach((key, value) -> sparkConf.setIfMissing(key, value.toString()));

    // Turn this down to prevent long blocking at shutdown
    sparkConf.setIfMissing(
        "spark.streaming.gracefulStopTimeout",
        Long.toString(TimeUnit.MILLISECONDS.convert(generationIntervalSec, TimeUnit.SECONDS)));
    sparkConf.setIfMissing("spark.cleaner.ttl", Integer.toString(20 * generationIntervalSec));

    long generationIntervalMS = TimeUnit.MILLISECONDS.convert(generationIntervalSec, TimeUnit.SECONDS);

    JavaSparkContext jsc = JavaSparkContext.fromSparkContext(SparkContext.getOrCreate(sparkConf));
    return new JavaStreamingContext(jsc, new Duration(generationIntervalMS));
}
public static void main(String[] args) throws Exception {
    String zkQuorum = args[0];
    String group = args[1];
    SparkConf conf = new SparkConf().setAppName("KafkaInput");
    // Create a StreamingContext with a 1 second batch size
    JavaStreamingContext jssc = new JavaStreamingContext(conf, new Duration(1000));

    Map<String, Integer> topics = new HashMap<String, Integer>();
    topics.put("pandas", 1);
    JavaPairDStream<String, String> input = KafkaUtils.createStream(jssc, zkQuorum, group, topics);
    input.print();

    // Start our streaming context and wait for it to "finish"
    jssc.start();
    // Wait for 10 seconds then exit. To run forever call without a timeout
    jssc.awaitTermination(10000);
    // Stop the streaming context
    jssc.stop();
}
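The example only prints the raw stream; as a hedged sketch (assuming the same Spark 1.x streaming API and Java 8 lambdas), the Kafka message values could instead be turned into a per-batch word count before the context is started:

// Sketch only: take each message value, split into words, and count per batch.
JavaDStream<String> lines = input.map(record -> record._2());
JavaPairDStream<String, Integer> wordCounts = lines
        .flatMap(line -> Arrays.asList(line.split(" ")))
        .mapToPair(word -> new Tuple2<>(word, 1))
        .reduceByKey((a, b) -> a + b);
wordCounts.print();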
public static void main(String[] args) {
    String inputFile = args[0];
    SparkConf conf = new SparkConf();
    JavaSparkContext sc = new JavaSparkContext(conf);
    SQLContext sqlCtx = new SQLContext(sc);

    DataFrame input = sqlCtx.jsonFile(inputFile);
    // The query producing topTweets and the loop binding `row` are missing from the original.
    System.out.println(row.get(0));

    JavaRDD<String> topTweetText = topTweets.toJavaRDD().map(new Function<Row, String>() {
        public String call(Row row) {
            return row.getString(0);
        }
    });
    System.out.println(topTweetText.collect());

    JavaRDD<HappyPerson> happyPeopleRDD = sc.parallelize(peopleList);
    DataFrame happyPeopleSchemaRDD = sqlCtx.applySchema(happyPeopleRDD, HappyPerson.class);
    happyPeopleSchemaRDD.registerTempTable("happy_people");
    // A query over the happy_people table and its row loop are likewise missing.
    System.out.println(row.get(0));

    sc.stop();
}
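applySchema infers the schema from a JavaBean, so the HappyPerson class referenced above needs bean-style getters and setters; a minimal sketch (only the class name comes from the snippet, the fields are assumptions):

// Illustrative JavaBean for applySchema; field names are guesses.
public static class HappyPerson implements java.io.Serializable {
    private String name;
    private String favouriteBeverage;

    public HappyPerson() {}
    public HappyPerson(String name, String favouriteBeverage) {
        this.name = name;
        this.favouriteBeverage = favouriteBeverage;
    }
    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
    public String getFavouriteBeverage() { return favouriteBeverage; }
    public void setFavouriteBeverage(String favouriteBeverage) { this.favouriteBeverage = favouriteBeverage; }
}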
@Test
public void scalaSparkContext() {
    List<String> jars = List$.MODULE$.empty();
    Map<String, String> environment = Map$.MODULE$.empty();

    new SparkContext(new SparkConf().setMaster("local").setAppName("name")).stop();
    new SparkContext("local", "name", new SparkConf()).stop();
    new SparkContext("local", "name").stop();
    new SparkContext("local", "name", "sparkHome").stop();
    new SparkContext("local", "name", "sparkHome", jars).stop();
    new SparkContext("local", "name", "sparkHome", jars, environment).stop();
}
SparkConf sparkConf = new SparkConf();
sparkConf.setMaster("local");
sparkConf.setAppName("TestSparkPlan-app");
sc = new JavaSparkContext(sparkConf);

RDD<Tuple2<HiveKey, BytesWritable>> reducerRdd = sparkPlan.generateGraph().rdd();
Assert.assertTrue(reducerRdd.name().contains("Reducer 2"));
Assert.assertTrue(reducerRdd instanceof MapPartitionsRDD);
Assert.assertTrue(reducerRdd.creationSite().shortForm().contains("Reducer 2"));
Assert.assertTrue(reducerRdd.creationSite().longForm().contains("Explain Plan"));
Assert.assertTrue(reducerRdd.creationSite().longForm().contains("Reducer 2"));

sc.close();