public static void writeParquet(Dataset<Row> df, String outputPath, SaveMode saveMode, int numPartitions) {
    logger.info(String.format("Saving parquet file %s, saveMode: %s, numPartitions: %s", outputPath, saveMode, numPartitions));
    String hdfsOutputPath = outputPath;
    // Strip the hdfs:// scheme prefix, if present, before writing
    if (hdfsOutputPath.toLowerCase().startsWith(HDFS_PREFIX_LOWERCASE)) {
        hdfsOutputPath = hdfsOutputPath.substring(HDFS_PREFIX_LOWERCASE.length());
    }
    df.coalesce(numPartitions).write().mode(saveMode).parquet(hdfsOutputPath);
    logger.info(String.format("Saved parquet file %s, saveMode: %s, numPartitions: %s", outputPath, saveMode, numPartitions));
}
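A minimal caller sketch, assuming a live SparkSession named spark, an existing temp view, and that HDFS_PREFIX_LOWERCASE holds the lowercase "hdfs://" scheme prefix; the view name and output path below are illustrative only.

// Hypothetical usage of the helper above; names and paths are illustrative.
Dataset<Row> events = spark.sql("select * from events_view");
writeParquet(events, "hdfs://warehouse/tmp/events_out", SaveMode.Overwrite, 4);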
@Override public Object execute(SparkSession sparkSession, ActionStatement actionStatement, CredentialProvider credentialManager) { String filePath = actionStatement.getParamValues().get(0).getValue().toString(); String saveModeStr = actionStatement.getParamValues().get(1).getValue().toString(); String dfTableName = actionStatement.getParamValues().get(2).getValue().toString(); SaveMode saveMode = SaveMode.valueOf(saveModeStr); String sql = String.format("select * from %s", dfTableName); logger.info(String.format("Running sql [%s] to get data and then save it", sql)); Dataset<Row> df = sparkSession.sql(sql); logger.info(String.format("Saving to parquet %s, saveMode: %s", filePath, saveMode)); df.coalesce(1).write().mode(saveMode).parquet(filePath); logger.info(String.format("Saved to parquet %s, saveMode: %s", filePath, saveMode)); return null; } }
@Override public Object execute(SparkSession sparkSession, ActionStatement actionStatement, CredentialProvider credentialManager) { String filePath = actionStatement.getParamValues().get(0).getValue().toString(); String saveModeStr = actionStatement.getParamValues().get(1).getValue().toString(); String dfTableName = actionStatement.getParamValues().get(2).getValue().toString(); SaveMode saveMode = SaveMode.valueOf(saveModeStr); String sql = String.format("select * from %s", dfTableName); logger.info(String.format("Running sql [%s] to get data and then save it", sql)); Dataset<Row> df = sparkSession.sql(sql); logger.info(String.format("Saving to csv %s, saveMode: %s", filePath, saveMode)); df.coalesce(1).write().mode(saveMode).option("header", "false").csv(filePath); logger.info(String.format("Saved to csv %s, saveMode: %s", filePath, saveMode)); return null; } }
@Override public Object execute(SparkSession sparkSession, ActionStatement actionStatement, CredentialProvider credentialManager) { String filePath = actionStatement.getParamValues().get(0).getValue().toString(); String saveModeStr = actionStatement.getParamValues().get(1).getValue().toString(); String dfTableName = actionStatement.getParamValues().get(2).getValue().toString(); SaveMode saveMode = SaveMode.valueOf(saveModeStr); String sql = String.format("select * from %s", dfTableName); logger.info(String.format("Running sql [%s] to get data and then save it", sql)); Dataset<Row> df = sparkSession.sql(sql); logger.info(String.format("Saving to json %s, saveMode: %s", filePath, saveMode)); df.coalesce(1).write().mode(saveMode).json(filePath); logger.info(String.format("Saved to json %s, saveMode: %s", filePath, saveMode)); return null; } }
@Override
public Dataset<T> coalesce(final int numPartitions) {
    final boolean userTriggered = initializeFunction(numPartitions);
    final Dataset<T> result = from(super.coalesce(numPartitions));
    this.setIsUserTriggered(userTriggered);
    return result;
}
private Dataset<Row> repartition(Dataset<Row> data) {
    int numPartitions = 0;
    List<String> colPartitions = null;
    if (config.hasPath(REPARTITION_NUM_PARTITIONS_PROPERTY)) {
        numPartitions = config.getInt(REPARTITION_NUM_PARTITIONS_PROPERTY);
    }
    if (config.hasPath(REPARTITION_COLUMNS_PROPERTY)) {
        colPartitions = config.getStringList(REPARTITION_COLUMNS_PROPERTY);
    }
    if (numPartitions > 0 && null != colPartitions) {
        data = data.repartition(numPartitions, RowUtils.toColumnArray(colPartitions));
    } else if (numPartitions > 0) {
        data = data.repartition(numPartitions);
    } else if (null != colPartitions) {
        data = data.repartition(RowUtils.toColumnArray(colPartitions));
    }
    if (config.hasPath(COALESCE_NUM_PARTITIONS_PROPERTY)) {
        numPartitions = config.getInt(COALESCE_NUM_PARTITIONS_PROPERTY);
        data = data.coalesce(numPartitions);
    }
    return data;
}
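For reference, a minimal sketch of the direct Spark calls the config-driven branches above resolve to; the partition counts and column name are illustrative only, and org.apache.spark.sql.functions.col stands in for RowUtils.toColumnArray.

// Illustrative equivalents of the branches above (values are examples only).
Dataset<Row> byColsAndCount = data.repartition(8, org.apache.spark.sql.functions.col("country"));
Dataset<Row> byCountOnly = data.repartition(8);
Dataset<Row> fewerFiles = byColsAndCount.coalesce(2);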
Dataset<Row> df = sparkSession.sql(sql).coalesce(dfPartitionCount);
SparkUtils.writeJdbc(df, connectionString, outputTableName, primaryKeys, indexColumns, textColumns, saveMode, postWriteSql, writesPerSecond);
logger.info(String.format("Saved data [%s] to %s, %s", sql, outputTableName, saveMode));
@Override
public JavaRDD<AvroPayload> getData(@NonNull final ParquetWorkUnitCalculatorResult workUnitCalcResult) {
    Preconditions.checkState(workUnitCalcResult.hasWorkUnits(),
        "No work to process for: " + hiveConf.getDataPath());
    /**
     * Current implementation of HiveSource assumes that only a single work unit exists which
     * corresponds to the single partition that is processed per job.
     */
    final List<String> workUnits = workUnitCalcResult.getWorkUnits();
    final String hdfsPath = new Path(this.hiveConf.getDataPath(), workUnits.get(0)).toString();
    log.info("Reading data from path: {}", hdfsPath);
    final Dataset<Row> data = this.sqlContext.read().parquet(hdfsPath);
    final int numPartitions = calculateHiveNumPartitions(data);
    log.info("Using {} partitions", numPartitions);
    final JavaRDD<AvroPayload> hiveRawData = data
        .coalesce(numPartitions)
        .javaRDD()
        .flatMap(row -> {
            final List<AvroPayload> payloads = new ArrayList<>();
            this.converter.convert(row).forEach(d -> payloads.add(d.getSuccessData().get().getData()));
            return payloads.iterator();
        });
    return hiveRawData;
}
.parquet(inputDir)
.sort(firstSortColumn, groupBySeq.result())
.coalesce(numberOfOutputFiles)
.write()
.option("compression", "gzip")
.parquet(inputDir)
.sort(firstSortColumn, groupBySeq.result())
.coalesce(1)
.write()
.option("compression", "gzip")
private File buildPartitionedTable(String desc, PartitionSpec spec, String udf, String partitionColumn) {
    File location = new File(parent, desc);
    Table byId = TABLES.create(SCHEMA, spec, location.toString());

    // do not combine splits because the tests expect a split per partition
    byId.updateProperties().set("read.split.target-size", "1").commit();

    // copy the unpartitioned table into the partitioned table to produce the partitioned data
    Dataset<Row> allRows = spark.read()
        .format("iceberg")
        .load(unpartitioned.toString());

    allRows
        .coalesce(1) // ensure only 1 file per partition is written
        .withColumn("part", callUDF(udf, column(partitionColumn)))
        .sortWithinPartitions("part")
        .drop("part")
        .write()
        .format("iceberg")
        .mode("append")
        .save(byId.location());

    return location;
}