JavaSparkContext sc; JavaRDD<String> data = sc.textFile("path/input.csv"); JavaSQLContext sqlContext = new JavaSQLContext(sc); JavaRDD<Record> rdd_records = sc.textFile(data).map( new Function<String, Record>() { public Record call(String line) throws Exception { // Here you can use JSON // Gson gson = new Gson(); // gson.fromJson(line, Record.class); String[] fields = line.split(","); Record sd = new Record(fields[0], fields[1], fields[2].trim(), fields[3]); return sd; } });
/*** SimpleApp.java ***/ import org.apache.spark.api.java.*; import org.apache.spark.api.java.function.Function; public class SimpleApp { public static void main(String[] args) { String logFile = "$YOUR_SPARK_HOME/README.md"; // Should be some file on your system JavaSparkContext sc = new JavaSparkContext("local", "Simple App", "$YOUR_SPARK_HOME", new String[]{"target/simple-project-1.0.jar"}); JavaRDD<String> logData = sc.textFile(logFile).cache(); long numAs = logData.filter(new Function<String, Boolean>() { public Boolean call(String s) { return s.contains("a"); } }).count(); long numBs = logData.filter(new Function<String, Boolean>() { public Boolean call(String s) { return s.contains("b"); } }).count(); System.out.println("Lines with a: " + numAs + ", lines with b: " + numBs); } }
/**
 * Loads a feature-vector RDD from the text files under {@code path}.
 * Each line is expected to be a two-element JSON array whose first element is
 * the key and whose second element is the float[] feature vector.
 */
private static JavaPairRDD<String,float[]> readFeaturesRDD(JavaSparkContext sparkContext, Path path) {
  log.info("Loading features RDD from {}", path);
  JavaRDD<String> featureLines = sparkContext.textFile(path.toString());
  return featureLines.mapToPair(line -> {
    // [key, vector] pair parsed from one JSON line.
    List<?> keyAndVector = TextUtils.readJSON(line, List.class);
    String id = keyAndVector.get(0).toString();
    float[] features = TextUtils.convertViaJSON(keyAndVector.get(1), float[].class);
    return new Tuple2<>(id, features);
  });
}
/**
 * Splits every line of the input file on spaces and prints each word with its
 * occurrence count. Expects exactly two arguments: the Spark master and the
 * input file path.
 */
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    throw new Exception("Usage BasicFlatMap sparkMaster inputFile");
  }
  JavaSparkContext sc = new JavaSparkContext(
      args[0], "basicflatmap", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  JavaRDD<String> lines = sc.textFile(args[1]);
  // Flatten each line into its space-separated tokens.
  JavaRDD<String> words = lines.flatMap(
      new FlatMapFunction<String, String>() {
        public Iterable<String> call(String line) {
          return Arrays.asList(line.split(" "));
        }
      });
  // countByValue brings the per-word totals back to the driver.
  Map<String, Long> wordCounts = words.countByValue();
  for (Entry<String, Long> e : wordCounts.entrySet()) {
    System.out.println(e.getKey() + ":" + e.getValue());
  }
}
}
/**
 * Parses two CSV files into (key, fields) pairs and joins them on the key.
 *
 * @param master Spark master URL
 * @param csv1   path to the first CSV file
 * @param csv2   path to the second CSV file
 */
public void run(String master, String csv1, String csv2) throws Exception {
  JavaSparkContext sc = new JavaSparkContext(
      master, "basicjoincsv", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  JavaRDD<String> csvFile1 = sc.textFile(csv1);
  JavaRDD<String> csvFile2 = sc.textFile(csv2);
  JavaPairRDD<Integer, String[]> keyedRDD1 = csvFile1.mapToPair(new ParseLine());
  // BUG FIX: the original parsed csvFile1 twice, so the join compared the first
  // file against itself and csv2 was never used.
  JavaPairRDD<Integer, String[]> keyedRDD2 = csvFile2.mapToPair(new ParseLine());
  JavaPairRDD<Integer, Tuple2<String[], String[]>> result = keyedRDD1.join(keyedRDD2);
  // Materialize the join result on the driver.
  List<Tuple2<Integer, Tuple2<String[], String[]>>> resultCollection = result.collect();
}
}
/**
 * Keys each input line by its first space-separated word, drops entries whose
 * full line is 20 characters or longer, and prints the surviving key/value
 * pairs. Expects exactly two arguments: the Spark master and the input file.
 */
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    throw new Exception("Usage KeyValueMapFilter sparkMaster inputFile");
  }
  String master = args[0];
  String inputFile = args[1];
  JavaSparkContext sc = new JavaSparkContext(
      master, "KeyValueMapFilter", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  JavaRDD<String> input = sc.textFile(inputFile);
  // Key each line by its first word; the value is the whole line.
  PairFunction<String, String, String> keyData =
      new PairFunction<String, String, String>() {
        @Override
        public Tuple2<String, String> call(String x) {
          // FIX: use the diamond operator instead of a raw Tuple2, which
          // defeated the generic type check.
          return new Tuple2<>(x.split(" ")[0], x);
        }
      };
  // Keep only pairs whose line (the value) is shorter than 20 characters.
  Function<Tuple2<String, String>, Boolean> longWordFilter =
      new Function<Tuple2<String, String>, Boolean>() {
        @Override
        public Boolean call(Tuple2<String, String> input) {
          return (input._2().length() < 20);
        }
      };
  JavaPairRDD<String, String> rdd = input.mapToPair(keyData);
  JavaPairRDD<String, String> result = rdd.filter(longWordFilter);
  Map<String, String> resultMap = result.collectAsMap();
  for (Entry<String, String> entry : resultMap.entrySet()) {
    System.out.println(entry.getKey() + ":" + entry.getValue());
  }
}
}
/**
 * Classic word count: tokenizes the input file on spaces, counts each word,
 * and saves the (word, count) pairs as text. Expects three arguments:
 * sparkMaster, inputFile, outputFile.
 */
public static void main(String[] args) throws Exception {
  // FIX: validate arity before indexing args — the original dereferenced
  // args[0..2] unchecked, unlike the sibling examples in this file.
  if (args.length != 3) {
    throw new Exception("Usage wordcount sparkMaster inputFile outputFile");
  }
  String master = args[0];
  JavaSparkContext sc = new JavaSparkContext(
      master, "wordcount", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  JavaRDD<String> rdd = sc.textFile(args[1]);
  JavaPairRDD<String, Integer> counts = rdd.flatMap(
      new FlatMapFunction<String, String>() {
        public Iterable<String> call(String x) {
          // Split each line into its space-separated words.
          return Arrays.asList(x.split(" "));
        }
      }).mapToPair(new PairFunction<String, String, Integer>() {
        public Tuple2<String, Integer> call(String x) {
          // FIX: diamond operator instead of a raw Tuple2.
          return new Tuple2<>(x, 1);
        }
      }).reduceByKey(new Function2<Integer, Integer, Integer>() {
        public Integer call(Integer x, Integer y) {
          return x + y;
        }
      });
  counts.saveAsTextFile(args[2]);
}
}
/**
 * Loads newline-delimited JSON people records, keeps only those who like
 * pandas, and writes the survivors back out as JSON. Expects three arguments:
 * sparkMaster, jsoninput, jsonoutput.
 */
public static void main(String[] args) throws Exception {
  if (args.length != 3) {
    throw new Exception("Usage BasicLoadJson [sparkMaster] [jsoninput] [jsonoutput]");
  }
  String master = args[0];
  String fileName = args[1];
  String outfile = args[2];
  JavaSparkContext sc = new JavaSparkContext(
      master, "basicloadjson", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  JavaRDD<String> input = sc.textFile(fileName);
  // Parse each partition's JSON lines into Person objects, then filter.
  JavaRDD<Person> pandaFans = input.mapPartitions(new ParseJson()).filter(new LikesPandas());
  // Serialize the surviving records back to JSON text, one per line.
  JavaRDD<String> serialized = pandaFans.mapPartitions(new WriteJson());
  serialized.saveAsTextFile(outfile);
}
}
@Test public void textFilesCompressed() throws IOException { String outputDir = new File(tempDir, "output").getAbsolutePath(); JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4)); rdd.saveAsTextFile(outputDir, DefaultCodec.class); // Try reading it in as a text file RDD List<String> expected = Arrays.asList("1", "2", "3", "4"); JavaRDD<String> readRDD = sc.textFile(outputDir); assertEquals(expected, readRDD.collect()); }
@Test public void textFilesCompressed() throws IOException { String outputDir = new File(tempDir, "output").getAbsolutePath(); JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4)); rdd.saveAsTextFile(outputDir, DefaultCodec.class); // Try reading it in as a text file RDD List<String> expected = Arrays.asList("1", "2", "3", "4"); JavaRDD<String> readRDD = sc.textFile(outputDir); assertEquals(expected, readRDD.collect()); }
@Test public void textFilesCompressed() throws IOException { String outputDir = new File(tempDir, "output").getAbsolutePath(); JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4)); rdd.saveAsTextFile(outputDir, DefaultCodec.class); // Try reading it in as a text file RDD List<String> expected = Arrays.asList("1", "2", "3", "4"); JavaRDD<String> readRDD = sc.textFile(outputDir); assertEquals(expected, readRDD.collect()); }
@Test public void textFiles() throws IOException { String outputDir = new File(tempDir, "output").getAbsolutePath(); JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4)); rdd.saveAsTextFile(outputDir); // Read the plain text file and check it's OK File outputFile = new File(outputDir, "part-00000"); String content = Files.toString(outputFile, StandardCharsets.UTF_8); assertEquals("1\n2\n3\n4\n", content); // Also try reading it in as a text file RDD List<String> expected = Arrays.asList("1", "2", "3", "4"); JavaRDD<String> readRDD = sc.textFile(outputDir); assertEquals(expected, readRDD.collect()); }
@Test public void textFiles() throws IOException { String outputDir = new File(tempDir, "output").getAbsolutePath(); JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4)); rdd.saveAsTextFile(outputDir); // Read the plain text file and check it's OK File outputFile = new File(outputDir, "part-00000"); String content = Files.toString(outputFile, StandardCharsets.UTF_8); assertEquals("1\n2\n3\n4\n", content); // Also try reading it in as a text file RDD List<String> expected = Arrays.asList("1", "2", "3", "4"); JavaRDD<String> readRDD = sc.textFile(outputDir); assertEquals(expected, readRDD.collect()); }
@Test public void textFiles() throws IOException { String outputDir = new File(tempDir, "output").getAbsolutePath(); JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4)); rdd.saveAsTextFile(outputDir); // Read the plain text file and check it's OK File outputFile = new File(outputDir, "part-00000"); String content = Files.toString(outputFile, StandardCharsets.UTF_8); assertEquals("1\n2\n3\n4\n", content); // Also try reading it in as a text file RDD List<String> expected = Arrays.asList("1", "2", "3", "4"); JavaRDD<String> readRDD = sc.textFile(outputDir); assertEquals(expected, readRDD.collect()); }
// Load the file at `filename` as an RDD with one String element per line.
JavaRDD<String> stringData = sc.textFile(filename);
// Load the file at `path` as an RDD with one String element per line.
JavaRDD<String> stringData = sc.textFile(path);
// Load the text files under `directory` as a single RDD of lines.
JavaRDD<String> stringData = sc.textFile(directory);
// Load the extracted log data at EXTRACTED_PATH, one String per line.
JavaRDD<String> logLines = sc.textFile(EXTRACTED_PATH);