peopleFromJsonFile.printSchema();
peopleFromJsonRDD.printSchema();
  private void printSchema() {
    System.out.println("Schema for step " + getName() + ":");
    data.printSchema();
  }
  private void start() {
    SparkSession spark = SparkSession.builder()
        .appName("Complex JSON array to Dataset")
        .master("local")
        .getOrCreate();

    String filename = "data/array-complex.json";
    long start = System.currentTimeMillis();
    Dataset<Row> df = spark.read().json(filename);
    long stop = System.currentTimeMillis();
    System.out.println("Processing took " + (stop - start) + " ms");

    df.show();
    df.printSchema();
  }
}
private void start() { SparkSession spark = SparkSession.builder().appName("JSON map to Dataset") .master("local").getOrCreate(); String filename = "data/map.json"; long start = System.currentTimeMillis(); Dataset<Row> df = spark.read().json(filename); long stop = System.currentTimeMillis(); System.out.println("Processing took " + (stop - start) + " ms"); df.show(); df.printSchema(); } }
private void start() { SparkSession spark = SparkSession.builder().appName("XML to Dataset") .master("local").getOrCreate(); String filename = "data/budget-2017.xml"; long start = System.currentTimeMillis(); Dataset<Row> df = spark.read().format("xml").option("rowTag", "item").load( filename); long stop = System.currentTimeMillis(); System.out.println("Processing took " + (stop - start) + " ms"); df.show(); df.printSchema(); } }
private void start() { SparkSession spark = SparkSession.builder().appName("JSON array to Dataset") .master("local").getOrCreate(); String filename = "data/array.json"; long start = System.currentTimeMillis(); Dataset<Row> df = spark.read().json(filename); long stop = System.currentTimeMillis(); System.out.println("Processing took " + (stop - start) + " ms"); df.show(); df.printSchema(); // Turns the "one liner" into a real column df = df.select(explode(df.col("valsInArrays"))).toDF("vals"); df.show(); df.printSchema(); } }
  private void start() {
    SparkSession spark = SparkSession.builder()
        .master("local")
        .getOrCreate();

    List<Integer> data = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
    Dataset<Integer> df = spark.createDataset(data, Encoders.INT());
    df.show();
    df.printSchema();

    Integer sumByReduce = df.reduce(new SumByReduce());
    System.out.println("Sum should be 55 and it is... " + sumByReduce);
  }
}
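SumByReduce is referenced above but not defined in this listing. A minimal sketch, assuming it is a ReduceFunction<Integer> that adds its two arguments (the class in the repo may differ in detail):

import org.apache.spark.api.java.function.ReduceFunction;

public class SumByReduce implements ReduceFunction<Integer> {
  private static final long serialVersionUID = 1L;

  @Override
  public Integer call(Integer v1, Integer v2) throws Exception {
    // reduce() folds the Dataset pairwise, so adding each pair
    // yields the total: 1 + 2 + ... + 10 = 55
    return v1 + v2;
  }
}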
private void start() { SparkSession spark = SparkSession.builder().appName("JSON to Dataset") .master("local").getOrCreate(); String filename = "data/north-carolina-school-performance-data.json"; long start = System.currentTimeMillis(); Dataset<Row> df = spark.read().json(filename); long stop = System.currentTimeMillis(); System.out.println("Processing took " + (stop - start) + " ms"); df.show(); df.printSchema(); // Flatenization df = df.withColumn("district", df.col("fields.district")); df = df.drop(df.col("fields.district")); // this does not work as the column // stays here (Spark 2.0.0) df.show(); df.printSchema(); } }
  private void start() {
    SparkSession spark = SparkSession.builder()
        .appName("Custom data set to Dataset")
        .master("local")
        .getOrCreate();

    String filename = "data/array-complex.json";
    long start = System.currentTimeMillis();
    Dataset<Row> df = spark.read()
        .format("net.jgp.labs.spark.x.datasource.SubStringCounterDataSource")
        .option(K.COUNT + "0", "a") // count the number of 'a'
        .option(K.COUNT + "1", "b") // count the number of 'b'
        .option(K.COUNT + "2", "color") // count the number of 'color'
        .load(filename); // local file
    long stop = System.currentTimeMillis();
    log.info("Processing took {} ms", stop - start);

    df.printSchema();
    df.show();
  }
}
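K is referenced above but not shown. A hypothetical sketch, assuming it only carries the option-key prefix that SubStringCounterDataSource parses; the actual constant name and value in the repo may differ:

// Hypothetical constants holder for the data source's option keys;
// the prefix value is an assumption.
public interface K {
  String COUNT = "count";
}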
  private void start() {
    SparkSession spark = SparkSession.builder()
        .appName("Build a DataFrame from Scratch")
        .master("local[*]")
        .getOrCreate();

    List<String> stringAsList = new ArrayList<>();
    stringAsList.add("bar");

    JavaSparkContext sparkContext =
        new JavaSparkContext(spark.sparkContext());
    JavaRDD<Row> rowRDD = sparkContext.parallelize(stringAsList)
        .map((String row) -> RowFactory.create(row));

    // Creates the schema: a single non-nullable string column named "foe"
    StructType schema = DataTypes.createStructType(
        new StructField[] {
            DataTypes.createStructField("foe", DataTypes.StringType, false) });

    Dataset<Row> df = spark.sqlContext().createDataFrame(rowRDD, schema).toDF();

    log.debug("** Schema: ");
    df.printSchema();
    log.debug("** Data: ");
    df.show();

    sparkContext.close();
  }
}
private void start() { SparkSession spark = SparkSession.builder().appName("CSV to Dataset") .master("local").getOrCreate(); String filename = "data/csv-quoted.txt"; Dataset<Row> df = spark.read().option("inferSchema", "true").option( "header", "true").csv(filename); df.show(); df.printSchema(); } }
private void start() { SparkSession spark = SparkSession.builder().appName("CSV to Dataset") .master("local").getOrCreate(); String filename = "data/csv-q.txt"; Dataset<Row> df = spark.read().option("inferSchema", "true").option( "header", "true").csv(filename); df.show(); df.printSchema(); } }
private void start() { SparkSession spark = SparkSession.builder().appName("CSV to Dataset<Book>") .master("local").getOrCreate(); String filename = "data/books.csv"; Dataset<Row> df = spark.read().format("csv") .option("inferSchema", "true") .option("header", "true") .load(filename); df.show(); Dataset<Book> bookDs = df.map(new BookMapper(), Encoders.bean(Book.class)); bookDs.show(); bookDs.printSchema(); Dataset<Row> df2 = bookDs.toDF(); df2.show(); df2.printSchema(); } }
  private void start() {
    log.debug("-> start()");
    SparkSession spark = SparkSession.builder()
        .appName("Read lines over a file stream")
        .master("local")
        .getOrCreate();

    Dataset<Row> df = spark.readStream()
        .format("text")
        .load(StreamingUtils.getInputDirectory());

    // The text source yields a single string column. Note that show()
    // cannot be called on a streaming Dataset; printSchema() can, so
    // inspect the schema before starting the query.
    df.printSchema();

    StreamingQuery query = df.writeStream()
        .outputMode(OutputMode.Update())
        .format("console")
        .start();
    try {
      query.awaitTermination();
    } catch (StreamingQueryException e) {
      log.error("Exception while waiting for query to end {}.",
          e.getMessage(), e);
    }
  }
}
private void start() { SparkSession spark = SparkSession.builder().appName("Book CSV to Dataset") .master("local").getOrCreate(); String filename = "data/books.csv"; // @formatter:off Dataset<Row> df = spark .read() .format("csv") .option("inferSchema", "false") // We are not inferring the schema for now .option("header", "true") .load(filename); // @formatter:on df.show(); // In this case everything is a string df.printSchema(); } }
private void start() { SparkSession spark = SparkSession.builder().appName("Book URL Builder") .master("local").getOrCreate(); String filename = "data/books.csv"; Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true") .option("header", "true") .load(filename); df.show(); Dataset<String> ds = df.map(new BookUrlBuilder(), Encoders.STRING()); ds.printSchema(); ds.show(20, 80); } }
private void start() { SparkSession spark = SparkSession.builder().appName("CSV to Dataset<Book>") .master("local").getOrCreate(); String filename = "data/books.csv"; Dataset<Row> df = spark.read().format("csv") .option("inferSchema", "true") .option("header", "true") .load(filename); df.show(); Dataset<Book> bookDs = df.map(new BookMapper(), Encoders.bean(Book.class)); bookDs.show(); bookDs.printSchema(); } }
private void start() { SparkSession spark = SparkSession.builder().appName("CSV to Dataset") .master("local").getOrCreate(); String filename = "data/csv-double-header.txt"; StructType schema = buildSchemaFromCsvDefinition("1st line of file", "2nd line of file"); // TODO // I use a dirty comment trick to avoid manipulating the data file, but // one could build the method... Dataset<Row> df = spark.read().schema(schema).option("inferSchema", "false") .option("comment", "#").option("header", "true").option("mode", "DROPMALFORMED") .csv(filename); df.show(); df.printSchema(); }