/**
 * Loads a file into a {@link Dataset} using the Spark reader matching {@code inputFormat}.
 *
 * <p>Supported formats (case-insensitive): {@code text} (also the default when the format is
 * null or empty), {@code parquet}, {@code csv} (no header row), {@code csv_with_header}, and
 * {@code json}.
 *
 * @param inputFormat the input format name; null or empty falls back to plain text
 * @param inputPath path to the input file(s)
 * @param spark the Spark session used to read the data
 * @return the loaded dataset
 * @throws IllegalArgumentException if {@code inputFormat} is not one of the supported values
 */
public static Dataset<Row> loadFile(String inputFormat, String inputPath, SparkSession spark) {
  // Null/empty format defaults to plain-text input.
  if (inputFormat == null || inputFormat.isEmpty() || inputFormat.equalsIgnoreCase("text")) {
    return spark.read().text(inputPath);
  } else if (inputFormat.equalsIgnoreCase("parquet")) {
    return spark.read().parquet(inputPath);
  } else if (inputFormat.equalsIgnoreCase("csv")) {
    return spark.read().option("header", "false").csv(inputPath);
  } else if (inputFormat.equalsIgnoreCase("csv_with_header")) {
    return spark.read().option("header", "true").csv(inputPath);
  } else if (inputFormat.equalsIgnoreCase("json")) {
    return spark.read().json(inputPath);
  } else {
    // IllegalArgumentException is the conventional type for a bad argument; it is a
    // RuntimeException subclass, so existing catch blocks continue to work.
    throw new IllegalArgumentException(
        String.format("Unsupported inputFormat: %s, %s", inputFormat, inputPath));
  }
}
/**
 * Reads a CSV file from the given path, wrapping the result of the parent reader in this
 * project's {@link Dataset}. Captures the flag returned by {@code initializeFunction} before
 * the read and restores it via {@code setIsUserTriggered} afterwards.
 */
@Override
public Dataset<Row> csv(final String path) {
  final boolean wasUserTriggered = initializeFunction(path);
  final Dataset<Row> wrapped = Dataset.from(super.csv(path));
  this.setIsUserTriggered(wasUserTriggered);
  return wrapped;
}
/**
 * Parses an in-memory dataset of CSV strings, wrapping the result of the parent reader in this
 * project's {@link Dataset}. Captures the flag returned by {@code initializeFunction} before
 * the read and restores it via {@code setIsUserTriggered} afterwards.
 */
@Override
public Dataset<Row> csv(final org.apache.spark.sql.Dataset<String> csvDataset) {
  final boolean wasUserTriggered = initializeFunction(csvDataset);
  final Dataset<Row> wrapped = Dataset.from(super.csv(csvDataset));
  this.setIsUserTriggered(wasUserTriggered);
  return wrapped;
}
/**
 * Reads CSV files from the given paths, wrapping the result of the parent reader in this
 * project's {@link Dataset}. Captures the flag returned by {@code initializeFunction} before
 * the read and restores it via {@code setIsUserTriggered} afterwards.
 */
@Override
public Dataset<Row> csv(final String... paths) {
  final boolean wasUserTriggered = initializeFunction(paths);
  final Dataset<Row> wrapped = Dataset.from(super.csv(paths));
  this.setIsUserTriggered(wasUserTriggered);
  return wrapped;
}
/**
 * Reads CSV files from a Scala sequence of paths, wrapping the result of the parent reader in
 * this project's {@link Dataset}. Captures the flag returned by {@code initializeFunction}
 * before the read and restores it via {@code setIsUserTriggered} afterwards.
 */
@Override
public Dataset<Row> csv(final scala.collection.Seq<String> paths) {
  final boolean wasUserTriggered = initializeFunction(paths);
  final Dataset<Row> wrapped = Dataset.from(super.csv(paths));
  this.setIsUserTriggered(wasUserTriggered);
  return wrapped;
}
/**
 * Builds a local Spark session, loads {@code data/csv-q.txt} as CSV with an inferred schema
 * and a header row, then prints the dataset contents and its schema.
 */
private void start() {
  SparkSession session = SparkSession.builder()
      .appName("CSV to Dataset")
      .master("local")
      .getOrCreate();

  String inputFile = "data/csv-q.txt";
  Dataset<Row> frame = session.read()
      .option("inferSchema", "true")
      .option("header", "true")
      .csv(inputFile);

  frame.show();
  frame.printSchema();
}
}
/**
 * Builds a local Spark session, loads {@code data/csv-quoted.txt} as CSV with an inferred
 * schema and a header row, then prints the dataset contents and its schema.
 */
private void start() {
  SparkSession session = SparkSession.builder()
      .appName("CSV to Dataset")
      .master("local")
      .getOrCreate();

  String inputFile = "data/csv-quoted.txt";
  Dataset<Row> frame = session.read()
      .option("inferSchema", "true")
      .option("header", "true")
      .csv(inputFile);

  frame.show();
  frame.printSchema();
}
}
private void start() { SparkSession spark = SparkSession.builder().appName("CSV to Dataset") .master("local").getOrCreate(); String filename = "data/csv-double-header.txt"; StructType schema = buildSchemaFromCsvDefinition("1st line of file", "2nd line of file"); // TODO // I use a dirty comment trick to avoid manipulating the data file, but // one could build the method... Dataset<Row> df = spark.read().schema(schema).option("inferSchema", "false") .option("comment", "#").option("header", "true").option("mode", "DROPMALFORMED") .csv(filename); df.show(); df.printSchema(); }
/**
 * Reads the LOINC multiaxial hierarchy file and converts it to a {@link HierarchicalElement}
 * dataset.
 *
 * @param spark the Spark session
 * @param loincHierarchyPath path to the multiaxial hierarchy CSV
 * @return a dataset of {@link HierarchicalElement} representing the hierarchical relationship.
 */
public static Dataset<HierarchicalElement> readMultiaxialHierarchyFile(SparkSession spark,
    String loincHierarchyPath) {

  // Rows where either column is null or blank are filtered out before mapping.
  return spark.read()
      .option("header", true)
      .csv(loincHierarchyPath)
      .select(col("IMMEDIATE_PARENT"), col("CODE"))
      .where(col("IMMEDIATE_PARENT").isNotNull()
          .and(col("IMMEDIATE_PARENT").notEqual(lit("")))
          .and(col("CODE").isNotNull())
          .and(col("CODE").notEqual(lit(""))))
      .map((MapFunction<Row, HierarchicalElement>) row -> {
        HierarchicalElement link = new HierarchicalElement();
        link.setAncestorSystem(LOINC_CODE_SYSTEM_URI);
        link.setAncestorValue(row.getString(0));
        link.setDescendantSystem(LOINC_CODE_SYSTEM_URI);
        link.setDescendantValue(row.getString(1));
        return link;
      }, Hierarchies.getHierarchicalElementEncoder());
}
/**
 * Reads the LOINC multiaxial hierarchy file and converts it to a {@link HierarchicalElement}
 * dataset.
 *
 * @param spark the Spark session
 * @param loincHierarchyPath path to the multiaxial hierarchy CSV
 * @return a dataset of {@link HierarchicalElement} representing the hierarchical relationship.
 */
public static Dataset<HierarchicalElement> readMultiaxialHierarchyFile(SparkSession spark,
    String loincHierarchyPath) {

  // Rows where either column is null or blank are filtered out before mapping.
  return spark.read()
      .option("header", true)
      .csv(loincHierarchyPath)
      .select(col("IMMEDIATE_PARENT"), col("CODE"))
      .where(col("IMMEDIATE_PARENT").isNotNull()
          .and(col("IMMEDIATE_PARENT").notEqual(lit("")))
          .and(col("CODE").isNotNull())
          .and(col("CODE").notEqual(lit(""))))
      .map((MapFunction<Row, HierarchicalElement>) row -> {
        HierarchicalElement link = new HierarchicalElement();
        link.setAncestorSystem(LOINC_CODE_SYSTEM_URI);
        link.setAncestorValue(row.getString(0));
        link.setDescendantSystem(LOINC_CODE_SYSTEM_URI);
        link.setDescendantValue(row.getString(1));
        return link;
      }, Hierarchies.getHierarchicalElementEncoder());
}
/**
 * Reads a SNOMED relationship file and converts it to a {@link HierarchicalElement} dataset.
 *
 * @param spark the Spark session
 * @param snomedRelationshipPath path to the SNOMED relationship file
 * @return a dataset of {@link HierarchicalElement} representing the hierarchical relationship.
 */
public static Dataset<HierarchicalElement> readRelationshipFile(SparkSession spark,
    String snomedRelationshipPath) {

  // Keep only active is-a relationships, then drop rows missing either endpoint.
  return spark.read()
      .option("header", true)
      .option("delimiter", "\t")
      .csv(snomedRelationshipPath)
      .where(col("typeId").equalTo(lit(SNOMED_ISA_RELATIONSHIP_ID))
          .and(col("active").equalTo(lit("1"))))
      .select(col("destinationId"), col("sourceId"))
      .where(col("destinationId").isNotNull()
          .and(col("destinationId").notEqual(lit("")))
          .and(col("sourceId").isNotNull())
          .and(col("sourceId").notEqual(lit(""))))
      .map((MapFunction<Row, HierarchicalElement>) row -> {
        HierarchicalElement link = new HierarchicalElement();
        link.setAncestorSystem(SNOMED_CODE_SYSTEM_URI);
        link.setAncestorValue(row.getString(0));
        link.setDescendantSystem(SNOMED_CODE_SYSTEM_URI);
        link.setDescendantValue(row.getString(1));
        return link;
      }, Hierarchies.getHierarchicalElementEncoder());
}
/**
 * Reads a SNOMED relationship file and converts it to a {@link HierarchicalElement} dataset.
 *
 * @param spark the Spark session
 * @param snomedRelationshipPath path to the SNOMED relationship file
 * @return a dataset of {@link HierarchicalElement} representing the hierarchical relationship.
 */
public static Dataset<HierarchicalElement> readRelationshipFile(SparkSession spark,
    String snomedRelationshipPath) {

  // Keep only active is-a relationships, then drop rows missing either endpoint.
  return spark.read()
      .option("header", true)
      .option("delimiter", "\t")
      .csv(snomedRelationshipPath)
      .where(col("typeId").equalTo(lit(SNOMED_ISA_RELATIONSHIP_ID))
          .and(col("active").equalTo(lit("1"))))
      .select(col("destinationId"), col("sourceId"))
      .where(col("destinationId").isNotNull()
          .and(col("destinationId").notEqual(lit("")))
          .and(col("sourceId").isNotNull())
          .and(col("sourceId").notEqual(lit(""))))
      .map((MapFunction<Row, HierarchicalElement>) row -> {
        HierarchicalElement link = new HierarchicalElement();
        link.setAncestorSystem(SNOMED_CODE_SYSTEM_URI);
        link.setAncestorValue(row.getString(0));
        link.setDescendantSystem(SNOMED_CODE_SYSTEM_URI);
        link.setDescendantValue(row.getString(1));
        return link;
      }, Hierarchies.getHierarchicalElementEncoder());
}