@Test
public void saveAndLoad() {
  Map<String, String> options = new HashMap<>();
  options.put("path", path.toString());
  df.write().mode(SaveMode.ErrorIfExists).format("json").options(options).save();
  Dataset<Row> loadedDF = spark.read().format("json").options(options).load();
  checkAnswer(loadedDF, df.collectAsList());
}
@Test
public void testOptionsAPI() {
  HashMap<String, String> map = new HashMap<String, String>();
  map.put("e", "1");
  spark
      .read()
      .option("a", "1")
      .option("b", 1)
      .option("c", 1.0)
      .option("d", true)
      .options(map)
      .text()
      .write()
      .option("a", "1")
      .option("b", 1)
      .option("c", 1.0)
      .option("d", true)
      .options(map)
      .format("org.apache.spark.sql.test")
      .save();
}
SQLContext sqlContext = new SQLContext(ctx);
Dataset<Row> schemaPeople = sqlContext.createDataFrame(people, Person.class);
schemaPeople.registerTempTable("people");

Dataset<Row> teenagers = sqlContext.sql(
    "SELECT name FROM people WHERE country = 'USA' AND age >= 13 AND age <= 19");
List<String> teenagerNames = teenagers.toJavaRDD()
    .map((Row row) -> "Name: " + row.getString(0)).collect();

schemaPeople.write().parquet("people.parquet");
Dataset<Row> parquetFile = sqlContext.read().parquet("people.parquet");
parquetFile.registerTempTable("parquetFile");
Dataset<Row> teenagers2 = sqlContext.sql(
    "SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19");
teenagerNames = teenagers2.toJavaRDD()
    .map((Row row) -> "Name: " + row.getString(0)).collect();

Dataset<Row> peopleFromJsonFile = sqlContext.read().json(path);
peopleFromJsonFile.registerTempTable("peopleFromJsonFile");
Dataset<Row> teenagers3 = sqlContext.sql(
    "SELECT name FROM peopleFromJsonFile WHERE age >= 13 AND age <= 19");
teenagerNames = teenagers3.toJavaRDD()
    .map((Row row) -> "Name: " + row.getString(0)).collect();

JavaRDD<String> anotherPeopleRDD = ctx.parallelize(jsonData);
Dataset<Row> peopleFromJsonRDD = sqlContext.read().json(anotherPeopleRDD.rdd());
@Override
public Dataset<String> read(SparkSession spark, Properties profilerProps, Properties readerProps) {
  String inputPath = TELEMETRY_INPUT_PATH.get(profilerProps, String.class);
  if (inputFormat == null) {
    inputFormat = TELEMETRY_INPUT_FORMAT.get(profilerProps, String.class);
  }
  LOG.debug("Loading telemetry; inputPath={}, inputFormat={}", inputPath, inputFormat);

  return spark
      .read()
      .options(Maps.fromProperties(readerProps))
      .format(inputFormat)
      .load(inputPath)
      .toJSON();
}
}
/**
 * This only tests whether the API compiles; it does not run it, as orc()
 * cannot be run without Hive classes.
 */
public void testOrcAPI() {
  spark.read().schema(schema).orc();
  spark.read().schema(schema).orc(input);
  spark.read().schema(schema).orc(input, input, input);
  spark.read().schema(schema).orc(new String[]{input, input})
      .write().orc(output);
}
}
conn.commit();

SQLContext sqlContext = SparkUtil.getSparkSession().sqlContext();
Dataset<Row> phoenixDataSet = SparkUtil.getSparkSession().read().format("phoenix")
    .option(DataSourceOptions.TABLE_KEY, tableName)
    .option(PhoenixDataSource.ZOOKEEPER_URL, getUrl()).load();
phoenixDataSet.createOrReplaceTempView(tableName);

Dataset<Row> dataset = sqlContext.sql("SELECT col1+col2, col4, a_string FROM " + tableName
    + " ORDER BY col1+col2, col4");
List<Row> rows = dataset.collectAsList();
ResultSet rs = new SparkResultSet(rows, dataset.columns());
assertTrue(rs.next());
assertEquals("a", rs.getString(3));
@Test
public void applySchemaToJSON() {
  Dataset<String> jsonDS = spark.createDataset(Arrays.asList(
      "{\"string\":\"this is a simple string.\", \"integer\":10, \"long\":21474836470, " +
        "\"bigInteger\":92233720368547758070, \"double\":1.7976931348623157E308}",
      "{\"string\":\"this is another simple string.\"}"),
      Encoders.STRING());

  Dataset<Row> df1 = spark.read().json(jsonDS);
  StructType actualSchema1 = df1.schema();
  Assert.assertEquals(expectedSchema, actualSchema1);
  df1.createOrReplaceTempView("jsonTable1");
  List<Row> actual1 = spark.sql("select * from jsonTable1").collectAsList();
  Assert.assertEquals(expectedResult, actual1);

  Dataset<Row> df2 = spark.read().schema(expectedSchema).json(jsonDS);
  StructType actualSchema2 = df2.schema();
  Assert.assertEquals(expectedSchema, actualSchema2);
@Test
public void verifyLibSVMDF() {
  Dataset<Row> dataset = spark.read().format("libsvm").option("vectorType", "dense")
      .load(path);
  Assert.assertEquals("label", dataset.columns()[0]);
  Assert.assertEquals("features", dataset.columns()[1]);
  Row r = dataset.first();
  Assert.assertEquals(1.0, r.getDouble(0), 1e-15);
  DenseVector v = r.getAs(1);
  Assert.assertEquals(Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0), v);
}
}
/**
 * Reads a SNOMED relationship file and converts it to a {@link HierarchicalElement} dataset.
 *
 * @param spark the Spark session
 * @param snomedRelationshipPath path to the SNOMED relationship file
 * @return a dataset of {@link HierarchicalElement} representing the hierarchical relationships
 */
public static Dataset<HierarchicalElement> readRelationshipFile(SparkSession spark,
    String snomedRelationshipPath) {

  return spark.read()
      .option("header", true)
      .option("delimiter", "\t")
      .csv(snomedRelationshipPath)
      .where(col("typeId").equalTo(lit(SNOMED_ISA_RELATIONSHIP_ID)))
      .where(col("active").equalTo(lit("1")))
      .select(col("destinationId"), col("sourceId"))
      .where(col("destinationId").isNotNull()
          .and(col("destinationId").notEqual(lit(""))))
      .where(col("sourceId").isNotNull()
          .and(col("sourceId").notEqual(lit(""))))
      .map((MapFunction<Row, HierarchicalElement>) row -> {
        HierarchicalElement element = new HierarchicalElement();
        element.setAncestorSystem(SNOMED_CODE_SYSTEM_URI);
        element.setAncestorValue(row.getString(0));
        element.setDescendantSystem(SNOMED_CODE_SYSTEM_URI);
        element.setDescendantValue(row.getString(1));
        return element;
      }, Hierarchies.getHierarchicalElementEncoder());
}
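A brief usage sketch for the method above; the session settings and the relationship file path are assumptions, not taken from the snippet.

// Illustrative only: app name, master, and file path are placeholders.
SparkSession spark = SparkSession.builder()
    .appName("snomed-hierarchy")
    .master("local[*]")
    .getOrCreate();

Dataset<HierarchicalElement> relationships =
    readRelationshipFile(spark, "/tmp/snomed/sct2_Relationship_Snapshot.txt");
relationships.show(10, false);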
private void start() { SparkSession spark = SparkSession.builder().appName("CSV to Dataset") .master("local").getOrCreate(); String filename = "data/csv-double-header.txt"; StructType schema = buildSchemaFromCsvDefinition("1st line of file", "2nd line of file"); // TODO // I use a dirty comment trick to avoid manipulating the data file, but // one could build the method... Dataset<Row> df = spark.read().schema(schema).option("inferSchema", "false") .option("comment", "#").option("header", "true").option("mode", "DROPMALFORMED") .csv(filename); df.show(); df.printSchema(); }
private void start() { SparkSession spark = SparkSession.builder().appName("For Each Claim") .master("local").getOrCreate(); String filename = "data/claims.csv"; Dataset<Row> claimsDf = spark.read().format("csv").option("inferSchema", "true").option("header", "true") .load(filename); claimsDf.show(); claimsDf.foreach(new ClaimPrepAndProcess()); } }
public void compact(String inputPath, String outputPath) throws IOException {
  this.setCompressionAndSerializationOptions(inputPath, outputPath);
  this.outputCompressionProperties(this.outputCompression);

  // Define the Spark context with a generic Spark configuration.
  SparkConf sparkConf = new SparkConf().setAppName("Spark Compaction");
  JavaSparkContext sc = new JavaSparkContext(sparkConf);

  if (this.outputSerialization.equals(TEXT)) {
    JavaRDD<String> textFile = sc.textFile(this.concatInputPath(inputPath));
    textFile.coalesce(this.splitSize).saveAsTextFile(outputPath);
  } else if (this.outputSerialization.equals(PARQUET)) {
    SQLContext sqlContext = new SQLContext(sc);
    DataFrame parquetFile = sqlContext.read().parquet(this.concatInputPath(inputPath));
    parquetFile.coalesce(this.splitSize).write().parquet(outputPath);
  } else if (this.outputSerialization.equals(AVRO)) {
    // For this to work, the files must end in .avro.
    // Another issue: when using compression, the compression codec extension is not added to the file name.
    SQLContext sqlContext = new SQLContext(sc);
    DataFrame avroFile = sqlContext.read().format("com.databricks.spark.avro")
        .load(this.concatInputPath(inputPath));
    avroFile.coalesce(this.splitSize).write().format("com.databricks.spark.avro").save(outputPath);
  } else {
    System.out.println("Did not match any serialization type: text, parquet, or avro. Received: "
        + this.outputSerialization);
  }
}
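The method above coalesces to this.splitSize without showing how that count is derived. A hedged sketch of one way to size it so output files land near one HDFS block each; the 128 MB block size, the helper name, and the use of getContentSummary are assumptions, not taken from the snippet.

// Illustrative only: picks a partition count targeting roughly block-sized output files.
// Assumes imports of org.apache.hadoop.fs.{FileSystem, Path} and java.io.IOException.
private int computeSplitSize(FileSystem fs, Path inputDir) throws IOException {
  long blockSize = 128L * 1024 * 1024; // assumed HDFS block size
  long totalBytes = fs.getContentSummary(inputDir).getLength();
  return (int) Math.max(1L, (totalBytes + blockSize - 1) / blockSize);
}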
@Before
public void setUp() throws IOException {
  spark = SparkSession.builder()
      .master("local[*]")
      .appName("testing")
      .getOrCreate();
  path = Utils.createTempDir(System.getProperty("java.io.tmpdir"), "datasource").getCanonicalFile();
  if (path.exists()) {
    path.delete();
  }
  List<String> jsonObjects = new ArrayList<>(10);
  for (int i = 0; i < 10; i++) {
    jsonObjects.add("{\"a\":" + i + ", \"b\":\"str" + i + "\"}");
  }
  Dataset<String> ds = spark.createDataset(jsonObjects, Encoders.STRING());
  df = spark.read().json(ds);
  df.createOrReplaceTempView("jsonTable");
}
@Test
public void testBeanWithArrayFieldDeserialization() {
  Encoder<Record> encoder = Encoders.bean(Record.class);

  Dataset<Record> dataset = spark
      .read()
      .format("json")
      .schema("id int, intervals array<struct<startTime: bigint, endTime: bigint>>")
      .load("src/test/resources/test-data/with-array-fields.json")
      .as(encoder);

  List<Record> records = dataset.collectAsList();
  Assert.assertEquals(records, RECORDS);
}
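A hedged sketch of the bean shapes that schema string implies; only Record is named in the snippet, so the Interval class and its accessors below are assumptions.

// Illustrative beans matching "id int, intervals array<struct<startTime: bigint, endTime: bigint>>".
public static class Interval implements java.io.Serializable {
  private long startTime;
  private long endTime;
  public long getStartTime() { return startTime; }
  public void setStartTime(long startTime) { this.startTime = startTime; }
  public long getEndTime() { return endTime; }
  public void setEndTime(long endTime) { this.endTime = endTime; }
}

public static class Record implements java.io.Serializable {
  private int id;
  private List<Interval> intervals;
  public int getId() { return id; }
  public void setId(int id) { this.id = id; }
  public List<Interval> getIntervals() { return intervals; }
  public void setIntervals(List<Interval> intervals) { this.intervals = intervals; }
}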
/**
 * Given a bunch of hoodie keys, fetches all the individual records out as a data frame.
 *
 * @return a dataframe
 */
public Dataset<Row> read(JavaRDD<HoodieKey> hoodieKeys, int parallelism) throws Exception {
  assertSqlContext();
  JavaPairRDD<HoodieKey, Optional<String>> keyToFileRDD = index
      .fetchRecordLocation(hoodieKeys, jsc, hoodieTable);
  List<String> paths = keyToFileRDD.filter(keyFileTuple -> keyFileTuple._2().isPresent())
      .map(keyFileTuple -> keyFileTuple._2().get()).collect();

  // record locations might be the same for multiple keys, so we need a unique list
  Set<String> uniquePaths = new HashSet<>(paths);
  Dataset<Row> originalDF = sqlContextOpt.get().read()
      .parquet(uniquePaths.toArray(new String[uniquePaths.size()]));
  StructType schema = originalDF.schema();
  JavaPairRDD<HoodieKey, Row> keyRowRDD = originalDF.javaRDD().mapToPair(row -> {
    HoodieKey key = new HoodieKey(row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD),
        row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD));
    return new Tuple2<>(key, row);
  });

  // Now filter down to only the rows that match the supplied hoodie keys
  JavaRDD<Row> rowRDD = keyRowRDD.join(keyToFileRDD, parallelism).map(tuple -> tuple._2()._1());
  return sqlContextOpt.get().createDataFrame(rowRDD, schema);
}
@Test
public void testJsonRDDToDataFrame() {
  // This is a test for the deprecated API in SPARK-15615.
  JavaRDD<String> rdd = jsc.parallelize(Arrays.asList("{\"a\": 2}"));
  Dataset<Row> df = spark.read().json(rdd);

  Assert.assertEquals(1L, df.count());
  Assert.assertEquals(2L, df.collectAsList().get(0).getLong(0));
}
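For contrast, a hedged sketch of the non-deprecated path, feeding the same JSON through a Dataset<String> instead of a JavaRDD<String>; the variable names are illustrative.

Dataset<String> jsonDS = spark.createDataset(
    Arrays.asList("{\"a\": 2}"), Encoders.STRING());
Dataset<Row> df2 = spark.read().json(jsonDS);
Assert.assertEquals(1L, df2.count());
Assert.assertEquals(2L, df2.collectAsList().get(0).getLong(0));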
PipelineModel pipelineModel = PipelineModel.load(pipelinePath);

String dataPath = new Path(path, "data").toString();
DataFrame df = sqlContext().read().format("parquet").load(dataPath);
Row row = df.select("markovOrder", "weights", "tagDictionary").head();

MarkovOrder order = MarkovOrder.values()[row.getInt(0) - 1];
Vector w = row.getAs(1);
scala.collection.immutable.HashMap<String, WrappedArray<Integer>> td =
    (scala.collection.immutable.HashMap<String, WrappedArray<Integer>>) row.get(2);
Map<String, Set<Integer>> tagDict = new HashMap<String, Set<Integer>>();
Iterator<Tuple2<String, WrappedArray<Integer>>> iterator = td.iterator();