@Test
public void saveAndLoad() {
  Map<String, String> options = new HashMap<>();
  options.put("path", path.toString());
  df.write().mode(SaveMode.ErrorIfExists).format("json").options(options).save();
  Dataset<Row> loadedDF = spark.read().format("json").options(options).load();
  checkAnswer(loadedDF, df.collectAsList());
}
@Test
public void testOptionsAPI() {
  HashMap<String, String> map = new HashMap<String, String>();
  map.put("e", "1");
  spark
      .read()
      .option("a", "1")
      .option("b", 1)
      .option("c", 1.0)
      .option("d", true)
      .options(map)
      .text()
      .write()
      .option("a", "1")
      .option("b", 1)
      .option("c", 1.0)
      .option("d", true)
      .options(map)
      .format("org.apache.spark.sql.test")
      .save();
}
SQLContext sqlContext = new SQLContext(ctx);
Dataset<Row> schemaPeople = sqlContext.createDataFrame(people, Person.class);
schemaPeople.registerTempTable("people");

Dataset<Row> teenagers = sqlContext.sql(
    "SELECT name FROM people WHERE country = 'USA' AND age >= 13 AND age <= 19");
List<String> teenagerNames = teenagers.toJavaRDD()
    .map((Row row) -> "Name: " + row.getString(0)).collect();

schemaPeople.write().parquet("people.parquet");
Dataset<Row> parquetFile = sqlContext.read().parquet("people.parquet");
parquetFile.registerTempTable("parquetFile");
Dataset<Row> teenagers2 = sqlContext.sql(
    "SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19");
teenagerNames = teenagers2.toJavaRDD()
    .map((Row row) -> "Name: " + row.getString(0)).collect();

Dataset<Row> peopleFromJsonFile = sqlContext.read().json(path);
peopleFromJsonFile.registerTempTable("peopleFromJsonFile");
Dataset<Row> teenagers3 = sqlContext.sql(
    "SELECT name FROM peopleFromJsonFile WHERE age >= 13 AND age <= 19");
teenagerNames = teenagers3.toJavaRDD()
    .map((Row row) -> "Name: " + row.getString(0)).collect();

JavaRDD<String> anotherPeopleRDD = ctx.parallelize(jsonData);
Dataset<Row> peopleFromJsonRDD = sqlContext.read().json(anotherPeopleRDD.rdd());
@Override
public Dataset<String> read(SparkSession spark, Properties profilerProps, Properties readerProps) {
  String inputPath = TELEMETRY_INPUT_PATH.get(profilerProps, String.class);
  if (inputFormat == null) {
    inputFormat = TELEMETRY_INPUT_FORMAT.get(profilerProps, String.class);
  }
  LOG.debug("Loading telemetry; inputPath={}, inputFormat={}", inputPath, inputFormat);

  return spark
      .read()
      .options(Maps.fromProperties(readerProps))
      .format(inputFormat)
      .load(inputPath)
      .toJSON();
}
}
/**
 * This only tests whether the API compiles; it does not run it, as orc()
 * cannot be run without Hive classes.
 */
public void testOrcAPI() {
  spark.read().schema(schema).orc();
  spark.read().schema(schema).orc(input);
  spark.read().schema(schema).orc(input, input, input);
  spark.read().schema(schema).orc(new String[]{input, input})
      .write().orc(output);
}
}
conn.commit();

SQLContext sqlContext = SparkUtil.getSparkSession().sqlContext();
Dataset<Row> phoenixDataSet = SparkUtil.getSparkSession().read().format("phoenix")
    .option(DataSourceOptions.TABLE_KEY, tableName)
    .option(PhoenixDataSource.ZOOKEEPER_URL, getUrl()).load();
phoenixDataSet.createOrReplaceTempView(tableName);

Dataset<Row> dataset = sqlContext.sql("SELECT col1+col2, col4, a_string FROM " + tableName
    + " ORDER BY col1+col2, col4");
List<Row> rows = dataset.collectAsList();
ResultSet rs = new SparkResultSet(rows, dataset.columns());
assertTrue(rs.next());
assertEquals("a", rs.getString(3));
@Test
public void applySchemaToJSON() {
  Dataset<String> jsonDS = spark.createDataset(Arrays.asList(
      "{\"string\":\"this is a simple string.\", \"integer\":10, \"long\":21474836470, " +
        "\"bigInteger\":92233720368547758070, \"double\":1.7976931348623157E308}",
      "{\"string\":\"this is another simple string.\"}"),
      Encoders.STRING());

  Dataset<Row> df1 = spark.read().json(jsonDS);
  StructType actualSchema1 = df1.schema();
  Assert.assertEquals(expectedSchema, actualSchema1);
  df1.createOrReplaceTempView("jsonTable1");
  List<Row> actual1 = spark.sql("select * from jsonTable1").collectAsList();
  Assert.assertEquals(expectedResult, actual1);

  Dataset<Row> df2 = spark.read().schema(expectedSchema).json(jsonDS);
  StructType actualSchema2 = df2.schema();
  Assert.assertEquals(expectedSchema, actualSchema2);
@Test
public void verifyLibSVMDF() {
  Dataset<Row> dataset = spark.read().format("libsvm").option("vectorType", "dense")
      .load(path);
  Assert.assertEquals("label", dataset.columns()[0]);
  Assert.assertEquals("features", dataset.columns()[1]);
  Row r = dataset.first();
  Assert.assertEquals(1.0, r.getDouble(0), 1e-15);
  DenseVector v = r.getAs(1);
  Assert.assertEquals(Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0), v);
}
}
/**
 * Reads a SNOMED relationship file and converts it to a {@link HierarchicalElement} dataset.
 *
 * @param spark the Spark session
 * @param snomedRelationshipPath path to the SNOMED relationship file
 * @return a dataset of {@link HierarchicalElement} representing the hierarchical relationships
 */
public static Dataset<HierarchicalElement> readRelationshipFile(SparkSession spark,
    String snomedRelationshipPath) {

  return spark.read()
      .option("header", true)
      .option("delimiter", "\t")
      .csv(snomedRelationshipPath)
      .where(col("typeId").equalTo(lit(SNOMED_ISA_RELATIONSHIP_ID)))
      .where(col("active").equalTo(lit("1")))
      .select(col("destinationId"), col("sourceId"))
      .where(col("destinationId").isNotNull()
          .and(col("destinationId").notEqual(lit(""))))
      .where(col("sourceId").isNotNull()
          .and(col("sourceId").notEqual(lit(""))))
      .map((MapFunction<Row, HierarchicalElement>) row -> {
        HierarchicalElement element = new HierarchicalElement();
        element.setAncestorSystem(SNOMED_CODE_SYSTEM_URI);
        element.setAncestorValue(row.getString(0));
        element.setDescendantSystem(SNOMED_CODE_SYSTEM_URI);
        element.setDescendantValue(row.getString(1));
        return element;
      }, Hierarchies.getHierarchicalElementEncoder());
}
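A brief usage sketch for the method above; the session settings and the relationship file path are assumptions, not taken from the snippet.

// Illustrative only: app name, master, and file path are placeholders.
SparkSession spark = SparkSession.builder()
    .appName("snomed-hierarchy")
    .master("local[*]")
    .getOrCreate();

Dataset<HierarchicalElement> relationships =
    readRelationshipFile(spark, "/tmp/snomed/sct2_Relationship_Snapshot.txt");
relationships.show(10, false);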
private void start() { SparkSession spark = SparkSession.builder().appName("CSV to Dataset") .master("local").getOrCreate(); String filename = "data/csv-double-header.txt"; StructType schema = buildSchemaFromCsvDefinition("1st line of file", "2nd line of file"); // TODO // I use a dirty comment trick to avoid manipulating the data file, but // one could build the method... Dataset<Row> df = spark.read().schema(schema).option("inferSchema", "false") .option("comment", "#").option("header", "true").option("mode", "DROPMALFORMED") .csv(filename); df.show(); df.printSchema(); }
private void start() { SparkSession spark = SparkSession.builder().appName("For Each Claim") .master("local").getOrCreate(); String filename = "data/claims.csv"; Dataset<Row> claimsDf = spark.read().format("csv").option("inferSchema", "true").option("header", "true") .load(filename); claimsDf.show(); claimsDf.foreach(new ClaimPrepAndProcess()); } }
public void compact(String inputPath, String outputPath) throws IOException {
  this.setCompressionAndSerializationOptions(inputPath, outputPath);
  this.outputCompressionProperties(this.outputCompression);

  // Define the Spark context with a generic Spark configuration.
  SparkConf sparkConf = new SparkConf().setAppName("Spark Compaction");
  JavaSparkContext sc = new JavaSparkContext(sparkConf);

  if (this.outputSerialization.equals(TEXT)) {
    JavaRDD<String> textFile = sc.textFile(this.concatInputPath(inputPath));
    textFile.coalesce(this.splitSize).saveAsTextFile(outputPath);
  } else if (this.outputSerialization.equals(PARQUET)) {
    SQLContext sqlContext = new SQLContext(sc);
    DataFrame parquetFile = sqlContext.read().parquet(this.concatInputPath(inputPath));
    parquetFile.coalesce(this.splitSize).write().parquet(outputPath);
  } else if (this.outputSerialization.equals(AVRO)) {
    // For this to work, the files must end in .avro.
    // Another issue: when using compression, the compression codec extension is not added to the file name.
    SQLContext sqlContext = new SQLContext(sc);
    DataFrame avroFile = sqlContext.read().format("com.databricks.spark.avro")
        .load(this.concatInputPath(inputPath));
    avroFile.coalesce(this.splitSize).write().format("com.databricks.spark.avro").save(outputPath);
  } else {
    System.out.println("Did not match any serialization type: text, parquet, or avro. Received: "
        + this.outputSerialization);
  }
}
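The method above coalesces to this.splitSize without showing how that count is derived. A hedged sketch of one way to size it so output files land near one HDFS block each; the 128 MB block size, the helper name, and the use of getContentSummary are assumptions, not taken from the snippet.

// Illustrative only: picks a partition count targeting roughly block-sized output files.
// Assumes imports of org.apache.hadoop.fs.{FileSystem, Path} and java.io.IOException.
private int computeSplitSize(FileSystem fs, Path inputDir) throws IOException {
  long blockSize = 128L * 1024 * 1024; // assumed HDFS block size
  long totalBytes = fs.getContentSummary(inputDir).getLength();
  return (int) Math.max(1L, (totalBytes + blockSize - 1) / blockSize);
}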
@Before
public void setUp() throws IOException {
  spark = SparkSession.builder()
      .master("local[*]")
      .appName("testing")
      .getOrCreate();
  path = Utils.createTempDir(System.getProperty("java.io.tmpdir"), "datasource").getCanonicalFile();
  if (path.exists()) {
    path.delete();
  }
  List<String> jsonObjects = new ArrayList<>(10);
  for (int i = 0; i < 10; i++) {
    jsonObjects.add("{\"a\":" + i + ", \"b\":\"str" + i + "\"}");
  }
  Dataset<String> ds = spark.createDataset(jsonObjects, Encoders.STRING());
  df = spark.read().json(ds);
  df.createOrReplaceTempView("jsonTable");
}
@Test
public void testBeanWithArrayFieldDeserialization() {
  Encoder<Record> encoder = Encoders.bean(Record.class);

  Dataset<Record> dataset = spark
      .read()
      .format("json")
      .schema("id int, intervals array<struct<startTime: bigint, endTime: bigint>>")
      .load("src/test/resources/test-data/with-array-fields.json")
      .as(encoder);

  List<Record> records = dataset.collectAsList();
  Assert.assertEquals(records, RECORDS);
}
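A hedged sketch of the bean shapes that schema string implies; only Record is named in the snippet, so the Interval class and its accessors below are assumptions.

// Illustrative beans matching "id int, intervals array<struct<startTime: bigint, endTime: bigint>>".
public static class Interval implements java.io.Serializable {
  private long startTime;
  private long endTime;
  public long getStartTime() { return startTime; }
  public void setStartTime(long startTime) { this.startTime = startTime; }
  public long getEndTime() { return endTime; }
  public void setEndTime(long endTime) { this.endTime = endTime; }
}

public static class Record implements java.io.Serializable {
  private int id;
  private List<Interval> intervals;
  public int getId() { return id; }
  public void setId(int id) { this.id = id; }
  public List<Interval> getIntervals() { return intervals; }
  public void setIntervals(List<Interval> intervals) { this.intervals = intervals; }
}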
/**
 * Given a bunch of hoodie keys, fetches all the individual records out as a data frame.
 *
 * @return a dataframe
 */
public Dataset<Row> read(JavaRDD<HoodieKey> hoodieKeys, int parallelism) throws Exception {
  assertSqlContext();
  JavaPairRDD<HoodieKey, Optional<String>> keyToFileRDD = index
      .fetchRecordLocation(hoodieKeys, jsc, hoodieTable);
  List<String> paths = keyToFileRDD.filter(keyFileTuple -> keyFileTuple._2().isPresent())
      .map(keyFileTuple -> keyFileTuple._2().get()).collect();

  // record locations might be the same for multiple keys, so we need a unique list
  Set<String> uniquePaths = new HashSet<>(paths);
  Dataset<Row> originalDF = sqlContextOpt.get().read()
      .parquet(uniquePaths.toArray(new String[uniquePaths.size()]));
  StructType schema = originalDF.schema();
  JavaPairRDD<HoodieKey, Row> keyRowRDD = originalDF.javaRDD().mapToPair(row -> {
    HoodieKey key = new HoodieKey(row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD),
        row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD));
    return new Tuple2<>(key, row);
  });

  // Now filter down to only the rows that match the supplied hoodie keys
  JavaRDD<Row> rowRDD = keyRowRDD.join(keyToFileRDD, parallelism).map(tuple -> tuple._2()._1());
  return sqlContextOpt.get().createDataFrame(rowRDD, schema);
}
@Test
public void testJsonRDDToDataFrame() {
  // This is a test for the deprecated API in SPARK-15615.
  JavaRDD<String> rdd = jsc.parallelize(Arrays.asList("{\"a\": 2}"));
  Dataset<Row> df = spark.read().json(rdd);

  Assert.assertEquals(1L, df.count());
  Assert.assertEquals(2L, df.collectAsList().get(0).getLong(0));
}
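For contrast, a hedged sketch of the non-deprecated path, feeding the same JSON through a Dataset<String> instead of a JavaRDD<String>; the variable names are illustrative.

Dataset<String> jsonDS = spark.createDataset(
    Arrays.asList("{\"a\": 2}"), Encoders.STRING());
Dataset<Row> df2 = spark.read().json(jsonDS);
Assert.assertEquals(1L, df2.count());
Assert.assertEquals(2L, df2.collectAsList().get(0).getLong(0));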
PipelineModel pipelineModel = PipelineModel.load(pipelinePath);

String dataPath = new Path(path, "data").toString();
DataFrame df = sqlContext().read().format("parquet").load(dataPath);
Row row = df.select("markovOrder", "weights", "tagDictionary").head();

MarkovOrder order = MarkovOrder.values()[row.getInt(0) - 1];
Vector w = row.getAs(1);
scala.collection.immutable.HashMap<String, WrappedArray<Integer>> td =
    (scala.collection.immutable.HashMap<String, WrappedArray<Integer>>) row.get(2);
Map<String, Set<Integer>> tagDict = new HashMap<String, Set<Integer>>();
Iterator<Tuple2<String, WrappedArray<Integer>>> iterator = td.iterator();