/**
 * Loads a file into a {@link Dataset} using the Spark reader matching {@code inputFormat}.
 *
 * <p>Supported formats (case-insensitive): {@code text} (also the default when the format is
 * null or empty), {@code parquet}, {@code csv} (no header row), {@code csv_with_header}, and
 * {@code json}.
 *
 * @param inputFormat the input format name; null or empty falls back to plain text
 * @param inputPath path to the input file(s)
 * @param spark the Spark session used to read the data
 * @return the loaded dataset
 * @throws IllegalArgumentException if {@code inputFormat} is not one of the supported values
 */
public static Dataset<Row> loadFile(String inputFormat, String inputPath, SparkSession spark) {
  // Null/empty format defaults to plain-text input.
  if (inputFormat == null || inputFormat.isEmpty() || inputFormat.equalsIgnoreCase("text")) {
    return spark.read().text(inputPath);
  } else if (inputFormat.equalsIgnoreCase("parquet")) {
    return spark.read().parquet(inputPath);
  } else if (inputFormat.equalsIgnoreCase("csv")) {
    return spark.read().option("header", "false").csv(inputPath);
  } else if (inputFormat.equalsIgnoreCase("csv_with_header")) {
    return spark.read().option("header", "true").csv(inputPath);
  } else if (inputFormat.equalsIgnoreCase("json")) {
    return spark.read().json(inputPath);
  } else {
    // IllegalArgumentException is the conventional type for a bad argument; it is a
    // RuntimeException subclass, so existing catch blocks continue to work.
    throw new IllegalArgumentException(
        String.format("Unsupported inputFormat: %s, %s", inputFormat, inputPath));
  }
}
/**
 * Reads a CSV file from the given path, wrapping the result of the parent reader in this
 * project's {@link Dataset}. Captures the flag returned by {@code initializeFunction} before
 * the read and restores it via {@code setIsUserTriggered} afterwards.
 */
@Override
public Dataset<Row> csv(final String path) {
  final boolean wasUserTriggered = initializeFunction(path);
  final Dataset<Row> wrapped = Dataset.from(super.csv(path));
  this.setIsUserTriggered(wasUserTriggered);
  return wrapped;
}
/**
 * Parses an in-memory dataset of CSV strings, wrapping the result of the parent reader in this
 * project's {@link Dataset}. Captures the flag returned by {@code initializeFunction} before
 * the read and restores it via {@code setIsUserTriggered} afterwards.
 */
@Override
public Dataset<Row> csv(final org.apache.spark.sql.Dataset<String> csvDataset) {
  final boolean wasUserTriggered = initializeFunction(csvDataset);
  final Dataset<Row> wrapped = Dataset.from(super.csv(csvDataset));
  this.setIsUserTriggered(wasUserTriggered);
  return wrapped;
}
/**
 * Reads CSV files from the given paths, wrapping the result of the parent reader in this
 * project's {@link Dataset}. Captures the flag returned by {@code initializeFunction} before
 * the read and restores it via {@code setIsUserTriggered} afterwards.
 */
@Override
public Dataset<Row> csv(final String... paths) {
  final boolean wasUserTriggered = initializeFunction(paths);
  final Dataset<Row> wrapped = Dataset.from(super.csv(paths));
  this.setIsUserTriggered(wasUserTriggered);
  return wrapped;
}
/**
 * Reads CSV files from a Scala sequence of paths, wrapping the result of the parent reader in
 * this project's {@link Dataset}. Captures the flag returned by {@code initializeFunction}
 * before the read and restores it via {@code setIsUserTriggered} afterwards.
 */
@Override
public Dataset<Row> csv(final scala.collection.Seq<String> paths) {
  final boolean wasUserTriggered = initializeFunction(paths);
  final Dataset<Row> wrapped = Dataset.from(super.csv(paths));
  this.setIsUserTriggered(wasUserTriggered);
  return wrapped;
}
/**
 * Builds a local Spark session, loads {@code data/csv-q.txt} as CSV with an inferred schema
 * and a header row, then prints the dataset contents and its schema.
 */
private void start() {
  SparkSession session = SparkSession.builder()
      .appName("CSV to Dataset")
      .master("local")
      .getOrCreate();

  String inputFile = "data/csv-q.txt";
  Dataset<Row> frame = session.read()
      .option("inferSchema", "true")
      .option("header", "true")
      .csv(inputFile);

  frame.show();
  frame.printSchema();
}
}
/**
 * Builds a local Spark session, loads {@code data/csv-quoted.txt} as CSV with an inferred
 * schema and a header row, then prints the dataset contents and its schema.
 */
private void start() {
  SparkSession session = SparkSession.builder()
      .appName("CSV to Dataset")
      .master("local")
      .getOrCreate();

  String inputFile = "data/csv-quoted.txt";
  Dataset<Row> frame = session.read()
      .option("inferSchema", "true")
      .option("header", "true")
      .csv(inputFile);

  frame.show();
  frame.printSchema();
}
}
private void start() { SparkSession spark = SparkSession.builder().appName("CSV to Dataset") .master("local").getOrCreate(); String filename = "data/csv-double-header.txt"; StructType schema = buildSchemaFromCsvDefinition("1st line of file", "2nd line of file"); // TODO // I use a dirty comment trick to avoid manipulating the data file, but // one could build the method... Dataset<Row> df = spark.read().schema(schema).option("inferSchema", "false") .option("comment", "#").option("header", "true").option("mode", "DROPMALFORMED") .csv(filename); df.show(); df.printSchema(); }
/**
 * Reads the LOINC multiaxial hierarchy file and converts it to a {@link HierarchicalElement}
 * dataset.
 *
 * @param spark the Spark session
 * @param loincHierarchyPath path to the multiaxial hierarchy CSV
 * @return a dataset of {@link HierarchicalElement} representing the hierarchical relationship.
 */
public static Dataset<HierarchicalElement> readMultiaxialHierarchyFile(SparkSession spark,
    String loincHierarchyPath) {

  // Rows where either column is null or blank are filtered out before mapping.
  return spark.read()
      .option("header", true)
      .csv(loincHierarchyPath)
      .select(col("IMMEDIATE_PARENT"), col("CODE"))
      .where(col("IMMEDIATE_PARENT").isNotNull()
          .and(col("IMMEDIATE_PARENT").notEqual(lit("")))
          .and(col("CODE").isNotNull())
          .and(col("CODE").notEqual(lit(""))))
      .map((MapFunction<Row, HierarchicalElement>) row -> {
        HierarchicalElement link = new HierarchicalElement();
        link.setAncestorSystem(LOINC_CODE_SYSTEM_URI);
        link.setAncestorValue(row.getString(0));
        link.setDescendantSystem(LOINC_CODE_SYSTEM_URI);
        link.setDescendantValue(row.getString(1));
        return link;
      }, Hierarchies.getHierarchicalElementEncoder());
}
/**
 * Reads the LOINC multiaxial hierarchy file and converts it to a {@link HierarchicalElement}
 * dataset.
 *
 * @param spark the Spark session
 * @param loincHierarchyPath path to the multiaxial hierarchy CSV
 * @return a dataset of {@link HierarchicalElement} representing the hierarchical relationship.
 */
public static Dataset<HierarchicalElement> readMultiaxialHierarchyFile(SparkSession spark,
    String loincHierarchyPath) {

  // Rows where either column is null or blank are filtered out before mapping.
  return spark.read()
      .option("header", true)
      .csv(loincHierarchyPath)
      .select(col("IMMEDIATE_PARENT"), col("CODE"))
      .where(col("IMMEDIATE_PARENT").isNotNull()
          .and(col("IMMEDIATE_PARENT").notEqual(lit("")))
          .and(col("CODE").isNotNull())
          .and(col("CODE").notEqual(lit(""))))
      .map((MapFunction<Row, HierarchicalElement>) row -> {
        HierarchicalElement link = new HierarchicalElement();
        link.setAncestorSystem(LOINC_CODE_SYSTEM_URI);
        link.setAncestorValue(row.getString(0));
        link.setDescendantSystem(LOINC_CODE_SYSTEM_URI);
        link.setDescendantValue(row.getString(1));
        return link;
      }, Hierarchies.getHierarchicalElementEncoder());
}
/**
 * Reads a SNOMED relationship file and converts it to a {@link HierarchicalElement} dataset.
 *
 * @param spark the Spark session
 * @param snomedRelationshipPath path to the SNOMED relationship file
 * @return a dataset of {@link HierarchicalElement} representing the hierarchical relationship.
 */
public static Dataset<HierarchicalElement> readRelationshipFile(SparkSession spark,
    String snomedRelationshipPath) {

  // Keep only active is-a relationships, then drop rows missing either endpoint.
  return spark.read()
      .option("header", true)
      .option("delimiter", "\t")
      .csv(snomedRelationshipPath)
      .where(col("typeId").equalTo(lit(SNOMED_ISA_RELATIONSHIP_ID))
          .and(col("active").equalTo(lit("1"))))
      .select(col("destinationId"), col("sourceId"))
      .where(col("destinationId").isNotNull()
          .and(col("destinationId").notEqual(lit("")))
          .and(col("sourceId").isNotNull())
          .and(col("sourceId").notEqual(lit(""))))
      .map((MapFunction<Row, HierarchicalElement>) row -> {
        HierarchicalElement link = new HierarchicalElement();
        link.setAncestorSystem(SNOMED_CODE_SYSTEM_URI);
        link.setAncestorValue(row.getString(0));
        link.setDescendantSystem(SNOMED_CODE_SYSTEM_URI);
        link.setDescendantValue(row.getString(1));
        return link;
      }, Hierarchies.getHierarchicalElementEncoder());
}
/**
 * Reads a SNOMED relationship file and converts it to a {@link HierarchicalElement} dataset.
 *
 * @param spark the Spark session
 * @param snomedRelationshipPath path to the SNOMED relationship file
 * @return a dataset of {@link HierarchicalElement} representing the hierarchical relationship.
 */
public static Dataset<HierarchicalElement> readRelationshipFile(SparkSession spark,
    String snomedRelationshipPath) {

  // Keep only active is-a relationships, then drop rows missing either endpoint.
  return spark.read()
      .option("header", true)
      .option("delimiter", "\t")
      .csv(snomedRelationshipPath)
      .where(col("typeId").equalTo(lit(SNOMED_ISA_RELATIONSHIP_ID))
          .and(col("active").equalTo(lit("1"))))
      .select(col("destinationId"), col("sourceId"))
      .where(col("destinationId").isNotNull()
          .and(col("destinationId").notEqual(lit("")))
          .and(col("sourceId").isNotNull())
          .and(col("sourceId").notEqual(lit(""))))
      .map((MapFunction<Row, HierarchicalElement>) row -> {
        HierarchicalElement link = new HierarchicalElement();
        link.setAncestorSystem(SNOMED_CODE_SYSTEM_URI);
        link.setAncestorValue(row.getString(0));
        link.setDescendantSystem(SNOMED_CODE_SYSTEM_URI);
        link.setDescendantValue(row.getString(1));
        return link;
      }, Hierarchies.getHierarchicalElementEncoder());
}