Dataset<Row> schemaPeople = sqlContext.createDataFrame(people, Person.class);
schemaPeople.registerTempTable("people");
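A quick way to exercise the registered table, assuming the same `sqlContext` and that `Person` exposes `name` and `age` as in the standard Spark example (the query itself is illustrative):

    Dataset<Row> teenagers = sqlContext.sql(
            "SELECT name FROM people WHERE age BETWEEN 13 AND 19");
    teenagers.show();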
SQLContext sqlContext = new SQLContext(context);
DataFrame outDataFrame = sqlContext.createDataFrame(finalOutPutRDD, WebHttpOutPutVO.class);

Properties prop = new java.util.Properties();
// The Microsoft JDBC driver expects "databaseName", not "database".
prop.setProperty("databaseName", "Web_Session");
prop.setProperty("user", "user");
prop.setProperty("password", "pwd@123");
prop.setProperty("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver");

outDataFrame.write()
        .mode(org.apache.spark.sql.SaveMode.Append)
        .jdbc("jdbc:sqlserver://<Host>:1433", "test_table", prop);
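To sanity-check the write, the same URL and properties can drive a read back; a minimal sketch using Spark's standard JDBC reader:

    // Read the freshly written table back into a DataFrame.
    DataFrame readBack = sqlContext.read()
            .jdbc("jdbc:sqlserver://<Host>:1433", "test_table", prop);
    readBack.show();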
public static DataFrame DataMapToDataFrame(SQLContext sql, DataMap map, Map<String, Double> labels) {
    return sql.createDataFrame(transformDataMap(map, labels), LabeledPoint.class);
}
/**
 * Converts a list of dependency graphs to a data frame.
 *
 * @param jsc
 * @param graphs
 * @param featureFrame
 * @return a data frame
 */
private DataFrame toDataFrame(JavaSparkContext jsc, List<DependencyGraph> graphs, FeatureFrame featureFrame) {
    List<ParsingContext> list = new ArrayList<ParsingContext>();
    for (DependencyGraph graph : graphs) {
        List<ParsingContext> xy = TransitionDecoder.decode(graph, featureFrame);
        list.addAll(xy);
    }
    JavaRDD<ParsingContext> javaRDD = jsc.parallelize(list);
    return sqlContext.createDataFrame(javaRDD, ParsingContext.class);
}
DataFrame df;
SQLContext sqlContext;
Long start;
Long end;

JavaPairRDD<Row, Long> indexedRDD = df.toJavaRDD().zipWithIndex();
// Keep only the rows whose index falls in [start, end), then drop the index
// so the RDD element type matches the original schema.
JavaRDD<Row> filteredRDD = indexedRDD
        .filter((Tuple2<Row, Long> v1) -> v1._2 >= start && v1._2 < end)
        .map(v1 -> v1._1);
DataFrame filteredDataFrame = sqlContext.createDataFrame(filteredRDD, df.schema());
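Wrapped as a reusable helper, the same pattern reads as below; a sketch only, and the method name `slice` is mine, not from the snippet:

    public static DataFrame slice(DataFrame df, long start, long end) {
        // zipWithIndex assigns a stable ordinal per row; filter to the window.
        JavaRDD<Row> rows = df.toJavaRDD().zipWithIndex()
                .filter(t -> t._2 >= start && t._2 < end)
                .map(t -> t._1);
        return df.sqlContext().createDataFrame(rows, df.schema());
    }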
public void write(List<Row> rows, StructType schema, String tableName) {
    if (CollectionUtils.isNotEmpty(rows)) {
        sqlContext.createDataFrame(rows, schema)
                .write()
                .mode(SaveMode.Overwrite)
                .jdbc(props.getProperty("url"), tableName, props);
    }
}
/**
 * Converts a dependency graph to a data frame.
 *
 * @param jsc
 * @param graph
 * @param featureFrame
 * @return a data frame
 */
public DataFrame toDataFrame(JavaSparkContext jsc, DependencyGraph graph, FeatureFrame featureFrame) {
    List<ParsingContext> list = TransitionDecoder.decode(graph, featureFrame);
    JavaRDD<ParsingContext> javaRDD = jsc.parallelize(list);
    return sqlContext.createDataFrame(javaRDD, ParsingContext.class);
}
@Override
public DataFrame getDataFrame(SQLContext sql) {
    // Obtain the schema
    StructType schema = SchemaConverter.getSchema(attributes);
    // Transform the RDD
    JavaRDD<Row> rowRDD = DataFrameOps.toRowRDD(amidstRDD, attributes);
    // Create the DataFrame
    return sql.createDataFrame(rowRDD, schema);
}
JavaRDD<Row> jrdd = preprocessedDocuments.map(f -> RowFactory.create(f.getLabel(), f.getText()));
StructType schema = new StructType(new StructField[]{
        new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
});
SQLContext sqlContext = new SQLContext(sc);
DataFrame sentenceData = sqlContext.createDataFrame(jrdd, schema);

Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
DataFrame wordsData = tokenizer.transform(sentenceData);

int numFeatures = 20;
HashingTF hashingTF = new HashingTF()
        .setInputCol("words")
        .setOutputCol("rawFeatures")
        .setNumFeatures(numFeatures);
DataFrame featurizedData = hashingTF.transform(wordsData);

// The original snippet used idfModel without defining it; it must be fit first.
IDFModel idfModel = new IDF().setInputCol("rawFeatures").setOutputCol("features").fit(featurizedData);
DataFrame rescaledData = idfModel.transform(featurizedData);

JavaRDD<Row> rows = rescaledData.javaRDD();
// Column 0 is the label; column 4 is the "features" vector produced by the IDF stage.
JavaRDD<LabeledPoint> data = rows.map(f -> new LabeledPoint(f.getDouble(0), f.getAs(4)));
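One way to consume the resulting `LabeledPoint` RDD, assuming the old MLlib API that matches the `LabeledPoint` type used here; the NaiveBayes choice and smoothing value are illustrative, not from the snippet:

    // Illustrative only: train an MLlib NaiveBayes model on the labeled points.
    NaiveBayesModel model = NaiveBayes.train(data.rdd(), 1.0);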
@Override
public Dataset<Row> getBatch(Option<Offset> start, Offset end) {
    return sqlContext.createDataFrame(
            sharedSourceTaskContext.read(start.isDefined() ? Optional.of(start.get()) : Optional.empty(), end)
                    .stream()
                    .map(record -> new GenericRow(new Object[]{
                            record.topic(),
                            record.kafkaPartition(),
                            keyConverter.fromConnectData(record.topic(), record.keySchema(), record.key()),
                            valueConverter.fromConnectData(record.topic(), record.valueSchema(), record.value())
                    }))
                    .collect(Collectors.toList()),
            DATA_SCHEMA);
}
/**
 * Tags a list of sentences and writes the result to an output file with a
 * desired output format.
 *
 * @param sentences sentences to tag
 * @param outputFileName path of the output file
 * @param outputFormat desired output format
 */
public void tag(List<String> sentences, String outputFileName, OutputFormat outputFormat) {
    List<Row> rows = new LinkedList<Row>();
    for (String sentence : sentences) {
        rows.add(RowFactory.create(sentence));
    }
    StructType schema = new StructType(new StructField[]{
            new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
    });
    SQLContext sqlContext = new SQLContext(jsc);
    DataFrame input = sqlContext.createDataFrame(rows, schema);
    tag(input, outputFileName, outputFormat);
}
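A hypothetical call site, assuming a `tagger` instance of the class above; the `OutputFormat.TEXT` constant and output path are illustrative, not taken from the snippet:

    // Hypothetical usage of the tag(List, String, OutputFormat) overload.
    List<String> sentences = Arrays.asList(
            "John loves Mary .",
            "Mary loves John .");
    tagger.tag(sentences, "/tmp/tagged.txt", OutputFormat.TEXT);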
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
    if (!dependencies.containsKey(intoDependency)) {
        throw new RuntimeException("Nest deriver points to non-existent nest-into dependency");
    }
    Dataset<Row> into = dependencies.get(intoDependency);

    if (!dependencies.containsKey(fromDependency)) {
        throw new RuntimeException("Nest deriver points to non-existent nest-from dependency");
    }
    Dataset<Row> from = dependencies.get(fromDependency);

    ExtractFieldsFunction extractFieldsFunction = new ExtractFieldsFunction(keyFieldNames);
    JavaPairRDD<List<Object>, Row> keyedIntoRDD = into.javaRDD().keyBy(extractFieldsFunction);
    JavaPairRDD<List<Object>, Row> keyedFromRDD = from.javaRDD().keyBy(extractFieldsFunction);

    NestFunction nestFunction = new NestFunction();
    JavaRDD<Row> nestedRDD = keyedIntoRDD.cogroup(keyedFromRDD).values().map(nestFunction);

    StructType nestedSchema = into.schema().add(nestedFieldName, DataTypes.createArrayType(from.schema()));
    Dataset<Row> nested = into.sqlContext().createDataFrame(nestedRDD, nestedSchema);
    return nested;
}
/**
 * Tags a distributed list of sentences and writes the result to an output file with
 * a desired output format.
 *
 * @param sentences sentences to tag, one per row
 * @param outputFileName path of the output file
 * @param outputFormat desired output format
 */
public void tag(JavaRDD<Row> sentences, String outputFileName, OutputFormat outputFormat) {
    StructType schema = new StructType(new StructField[]{
            new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
    });
    SQLContext sqlContext = new SQLContext(jsc);
    DataFrame input = sqlContext.createDataFrame(sentences, schema);
    tag(input, outputFileName, outputFormat);
}
/**
 * Creates a data frame from an RDD of writables, given a schema.
 *
 * @param schema the schema to use
 * @param data the data to convert
 * @return the dataframe object
 */
public static DataRowsFacade toDataFrame(Schema schema, JavaRDD<List<Writable>> data) {
    JavaSparkContext sc = new JavaSparkContext(data.context());
    SQLContext sqlContext = new SQLContext(sc);
    JavaRDD<Row> rows = data.map(new ToRow(schema));
    return dataRows(sqlContext.createDataFrame(rows, fromSchema(schema)));
}
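A minimal sketch of building the inputs, assuming the DataVec record API; the column names, values, and the `sc` context are illustrative:

    // Two-column schema: one double feature, one integer label (names are mine).
    Schema schema = new Schema.Builder()
            .addColumnDouble("feature")
            .addColumnInteger("label")
            .build();
    // A single example record; sc is an existing JavaSparkContext.
    JavaRDD<List<Writable>> data = sc.parallelize(Arrays.asList(
            Arrays.<Writable>asList(new DoubleWritable(0.5), new IntWritable(1))));
    DataRowsFacade frame = toDataFrame(schema, data);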
private void start() {
    SparkSession spark = SparkSession.builder()
            .appName("Build a DataFrame from Scratch")
            .master("local[*]")
            .getOrCreate();

    List<String> stringAsList = new ArrayList<>();
    stringAsList.add("bar");

    JavaSparkContext sparkContext = new JavaSparkContext(spark.sparkContext());
    JavaRDD<Row> rowRDD = sparkContext.parallelize(stringAsList).map((String row) -> RowFactory.create(row));

    // Creates schema
    StructType schema = DataTypes.createStructType(
            new StructField[]{ DataTypes.createStructField("foe", DataTypes.StringType, false) });

    Dataset<Row> df = spark.sqlContext().createDataFrame(rowRDD, schema).toDF();

    log.debug("** Schema: ");
    df.printSchema();
    log.debug("** Data: ");
    df.show();

    sparkContext.close();
}
/**
 * Convert the given sequence data set to a DataFrame.<br>
 * <b>Note</b>: The resulting DataFrame has two additional columns added to it:<br>
 * - Column 0: Sequence UUID (name: {@link #SEQUENCE_UUID_COLUMN}) - a UUID for the original sequence<br>
 * - Column 1: Sequence index (name: {@link #SEQUENCE_INDEX_COLUMN}) - an index (integer, starting at 0) for the position
 *   of this record in the original time series.<br>
 * These two columns are required if the data is to be converted back into a sequence at a later point, for example
 * using {@link #toRecordsSequence(DataRowsFacade)}
 *
 * @param schema Schema for the data
 * @param data Sequence data to convert to a DataFrame
 * @return The dataframe object
 */
public static DataRowsFacade toDataFrameSequence(Schema schema, JavaRDD<List<List<Writable>>> data) {
    JavaSparkContext sc = new JavaSparkContext(data.context());
    SQLContext sqlContext = new SQLContext(sc);
    JavaRDD<Row> rows = data.flatMap(new SequenceToRows(schema));
    return dataRows(sqlContext.createDataFrame(rows, fromSchemaSequence(schema)));
}
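The javadoc above points at a round trip via the UUID and index columns; a sketch of it, assuming `toRecordsSequence` exists as referenced and `sequences` is an existing `JavaRDD<List<List<Writable>>>`:

    // Hypothetical round trip: sequences -> DataFrame -> sequences.
    DataRowsFacade frame = toDataFrameSequence(schema, sequences);
    JavaRDD<List<List<Writable>>> restored = toRecordsSequence(frame);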
@Override
public DataFrame transform(DataFrame dataset) {
    JavaRDD<Row> output = dataset.javaRDD().map(new DecodeFunction());
    StructType schema = new StructType(new StructField[]{
            new StructField("sentence", DataTypes.StringType, false, Metadata.empty()),
            new StructField("prediction", DataTypes.StringType, false, Metadata.empty())
    });
    return dataset.sqlContext().createDataFrame(output, schema);
}