@Test
public void testSaveModeAPI() {
  spark
    .range(10)
    .write()
    .format("org.apache.spark.sql.test")
    .mode(SaveMode.ErrorIfExists)
    .save();
}
@Test
public void saveAndLoad() {
  Map<String, String> options = new HashMap<>();
  options.put("path", path.toString());
  df.write().mode(SaveMode.ErrorIfExists).format("json").options(options).save();
  Dataset<Row> loadedDF = spark.read().format("json").options(options).load();
  checkAnswer(loadedDF, df.collectAsList());
}
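The output path does not have to go through the options map; it can also be handed directly to save() and load(). A minimal sketch of the same round trip, reusing the df, spark, and path variables from the test above:

// Same JSON round trip, passing the path as an argument instead of a "path" option.
df.write().mode(SaveMode.ErrorIfExists).format("json").save(path.toString());
Dataset<Row> reloaded = spark.read().format("json").load(path.toString());
checkAnswer(reloaded, df.collectAsList());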
@Test
public void saveAndLoadWithSchema() {
  Map<String, String> options = new HashMap<>();
  options.put("path", path.toString());
  df.write().format("json").mode(SaveMode.ErrorIfExists).options(options).save();

  List<StructField> fields = new ArrayList<>();
  fields.add(DataTypes.createStructField("b", DataTypes.StringType, true));
  StructType schema = DataTypes.createStructType(fields);
  Dataset<Row> loadedDF = spark.read().format("json").schema(schema).options(options).load();
  checkAnswer(loadedDF, spark.sql("SELECT b FROM jsonTable").collectAsList());
}
public static void writeParquet(Dataset<Row> df, String outputPath, SaveMode saveMode, int numPartitions) {
  logger.info(String.format("Saving parquet file %s, saveMode: %s, numPartitions: %s",
      outputPath, saveMode, numPartitions));
  // Strip the hdfs:// scheme prefix, if present, before handing the path to Spark
  String hdfsOutputPath = outputPath;
  if (hdfsOutputPath.toLowerCase().startsWith(HDFS_PREFIX_LOWERCASE)) {
    hdfsOutputPath = hdfsOutputPath.substring(HDFS_PREFIX_LOWERCASE.length());
  }
  df.coalesce(numPartitions).write().mode(saveMode).parquet(hdfsOutputPath);
  logger.info(String.format("Saved parquet file %s, saveMode: %s, numPartitions: %s",
      outputPath, saveMode, numPartitions));
}
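A hedged usage sketch of this helper, assuming a DataFrame named df and a SparkSession named spark are in scope; the path is illustrative, not taken from the source:

// Write the DataFrame as a single parquet partition, then read it back to verify.
writeParquet(df, "/tmp/values.parquet", SaveMode.Overwrite, 1);
Dataset<Row> readBack = spark.read().parquet("/tmp/values.parquet");
readBack.show();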
public void write(List<Row> rows, StructType schema, String tableName) {
  if (CollectionUtils.isNotEmpty(rows)) {
    sqlContext.createDataFrame(rows, schema)
        .write()
        .mode(SaveMode.Overwrite)
        .jdbc(props.getProperty("url"), tableName, props);
  }
}
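The props object above is assumed to carry both the JDBC URL and the connection properties passed to jdbc(). An illustrative sketch of how such a Properties object might be populated; the URL, credentials, and driver are placeholders, not values from the source:

Properties props = new Properties();
props.setProperty("url", "jdbc:postgresql://localhost:5432/mydb");  // read back via props.getProperty("url")
props.setProperty("user", "writer");
props.setProperty("password", "secret");
props.setProperty("driver", "org.postgresql.Driver");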
public void writeParquet() throws IOException {
  // TODO: Consider having a configuration to limit the number of records written out
  this.dataset.write().mode(SaveMode.Append).parquet(getDestWritePath().toString());
}
@Override
public Object execute(SparkSession sparkSession, ActionStatement actionStatement,
                      CredentialProvider credentialManager) {
  String filePath = actionStatement.getParamValues().get(0).getValue().toString();
  String saveModeStr = actionStatement.getParamValues().get(1).getValue().toString();
  String dfTableName = actionStatement.getParamValues().get(2).getValue().toString();

  SaveMode saveMode = SaveMode.valueOf(saveModeStr);

  String sql = String.format("select * from %s", dfTableName);
  logger.info(String.format("Running sql [%s] to get data and then save it", sql));
  Dataset<Row> df = sparkSession.sql(sql);

  logger.info(String.format("Saving to parquet %s, saveMode: %s", filePath, saveMode));
  df.coalesce(1).write().mode(saveMode).parquet(filePath);
  logger.info(String.format("Saved to parquet %s, saveMode: %s", filePath, saveMode));
  return null;
}
@Override
public Object execute(SparkSession sparkSession, ActionStatement actionStatement,
                      CredentialProvider credentialManager) {
  String filePath = actionStatement.getParamValues().get(0).getValue().toString();
  String saveModeStr = actionStatement.getParamValues().get(1).getValue().toString();
  String dfTableName = actionStatement.getParamValues().get(2).getValue().toString();

  SaveMode saveMode = SaveMode.valueOf(saveModeStr);

  String sql = String.format("select * from %s", dfTableName);
  logger.info(String.format("Running sql [%s] to get data and then save it", sql));
  Dataset<Row> df = sparkSession.sql(sql);

  logger.info(String.format("Saving to json %s, saveMode: %s", filePath, saveMode));
  df.coalesce(1).write().mode(saveMode).json(filePath);
  logger.info(String.format("Saved to json %s, saveMode: %s", filePath, saveMode));
  return null;
}
@Override
public Object execute(SparkSession sparkSession, ActionStatement actionStatement,
                      CredentialProvider credentialManager) {
  String filePath = actionStatement.getParamValues().get(0).getValue().toString();
  String saveModeStr = actionStatement.getParamValues().get(1).getValue().toString();
  String dfTableName = actionStatement.getParamValues().get(2).getValue().toString();

  SaveMode saveMode = SaveMode.valueOf(saveModeStr);

  String sql = String.format("select * from %s", dfTableName);
  logger.info(String.format("Running sql [%s] to get data and then save it", sql));
  Dataset<Row> df = sparkSession.sql(sql);

  logger.info(String.format("Saving to csv %s, saveMode: %s", filePath, saveMode));
  df.coalesce(1).write().mode(saveMode).option("header", "false").csv(filePath);
  logger.info(String.format("Saved to csv %s, saveMode: %s", filePath, saveMode));
  return null;
}
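The three executors above parse the user-supplied mode string with SaveMode.valueOf, which accepts exactly the enum constant names and throws IllegalArgumentException for anything else. A short sketch of the four constants and what each means for the write:

SaveMode append        = SaveMode.valueOf("Append");         // add rows to any existing output
SaveMode overwrite     = SaveMode.valueOf("Overwrite");      // replace existing output
SaveMode errorIfExists = SaveMode.valueOf("ErrorIfExists");  // fail if the output already exists
SaveMode ignore        = SaveMode.valueOf("Ignore");         // silently skip the write if output exists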
/**
 * Writes value records to a table. This class ensures the columns and partitions are mapped
 * properly, and is a workaround similar to the problem described
 * <a href="http://stackoverflow.com/questions/35313077/pyspark-order-of-column-on-write-to-mysql-with-jdbc">here</a>.
 *
 * @param values a dataset of value records
 * @param tableName the table to write them to
 */
private static void writeValuesToTable(Dataset<Value> values, String tableName) {
  // Note the last two columns here must be the partitioned-by columns in order and in lower case
  // for Spark to properly match them to the partitions
  Dataset<Row> orderColumnDataset = values.select("system",
      "version",
      "value",
      "valueseturi",
      "valuesetversion");
  orderColumnDataset.write()
      .mode(SaveMode.ErrorIfExists)
      .insertInto(tableName);
}
public static void writeDataToFolder(DataSpark data, String path, SQLContext sqlContext, String formatFile)
    throws Exception {
  data.getDataFrame(sqlContext).write().mode(SaveMode.Overwrite).format(formatFile).save(path);
}
@Test
public void saveTableAndQueryIt() {
  Map<String, String> options = new HashMap<>();
  df.write()
      .format("org.apache.spark.sql.json")
      .mode(SaveMode.Append)
      .options(options)
      .saveAsTable("javaSavedTable");

  checkAnswer(
      sqlContext.sql("SELECT * FROM javaSavedTable"),
      df.collectAsList());
}
@Test
public void saveExternalTableAndQueryIt() {
  Map<String, String> options = new HashMap<>();
  options.put("path", path.toString());
  df.write()
      .format("org.apache.spark.sql.json")
      .mode(SaveMode.Append)
      .options(options)
      .saveAsTable("javaSavedTable");

  checkAnswer(
      sqlContext.sql("SELECT * FROM javaSavedTable"),
      df.collectAsList());

  Dataset<Row> loadedDF =
      sqlContext.createExternalTable("externalTable", "org.apache.spark.sql.json", options);

  checkAnswer(loadedDF, df.collectAsList());
  checkAnswer(
      sqlContext.sql("SELECT * FROM externalTable"),
      df.collectAsList());
}
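createExternalTable on SQLContext is deprecated in newer Spark releases in favor of the Catalog API. A minimal sketch of the equivalent call, assuming Spark 2.2+ and a SparkSession named spark, reusing the options map (which carries the path) from the test above:

// Register the already-written JSON data as an external table via the Catalog API.
Dataset<Row> externalDF = spark.catalog().createTable("externalTable", "json", options);
checkAnswer(externalDF, df.collectAsList());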