@Test public void standardScaler() { // The tests are to check Java compatibility. List<VectorIndexerSuite.FeatureData> points = Arrays.asList( new VectorIndexerSuite.FeatureData(Vectors.dense(0.0, -2.0)), new VectorIndexerSuite.FeatureData(Vectors.dense(1.0, 3.0)), new VectorIndexerSuite.FeatureData(Vectors.dense(1.0, 4.0)) ); Dataset<Row> dataFrame = spark.createDataFrame(jsc.parallelize(points, 2), VectorIndexerSuite.FeatureData.class); StandardScaler scaler = new StandardScaler() .setInputCol("features") .setOutputCol("scaledFeatures") .setWithStd(true) .setWithMean(false); // Compute summary statistics by fitting the StandardScaler StandardScalerModel scalerModel = scaler.fit(dataFrame); // Normalize each feature to have unit standard deviation. Dataset<Row> scaledData = scalerModel.transform(dataFrame); scaledData.count(); } }
@Test public void pipeline() { StandardScaler scaler = new StandardScaler() .setInputCol("features") .setOutputCol("scaledFeatures"); LogisticRegression lr = new LogisticRegression() .setFeaturesCol("scaledFeatures"); Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[]{scaler, lr}); PipelineModel model = pipeline.fit(dataset); model.transform(dataset).createOrReplaceTempView("prediction"); Dataset<Row> predictions = spark.sql("SELECT label, probability, prediction FROM prediction"); predictions.collectAsList(); } }
@Test public void pipeline() { StandardScaler scaler = new StandardScaler() .setInputCol("features") .setOutputCol("scaledFeatures"); LogisticRegression lr = new LogisticRegression() .setFeaturesCol("scaledFeatures"); Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[]{scaler, lr}); PipelineModel model = pipeline.fit(dataset); model.transform(dataset).createOrReplaceTempView("prediction"); Dataset<Row> predictions = spark.sql("SELECT label, probability, prediction FROM prediction"); predictions.collectAsList(); } }
@Test public void standardScaler() { // The tests are to check Java compatibility. List<VectorIndexerSuite.FeatureData> points = Arrays.asList( new VectorIndexerSuite.FeatureData(Vectors.dense(0.0, -2.0)), new VectorIndexerSuite.FeatureData(Vectors.dense(1.0, 3.0)), new VectorIndexerSuite.FeatureData(Vectors.dense(1.0, 4.0)) ); Dataset<Row> dataFrame = spark.createDataFrame(jsc.parallelize(points, 2), VectorIndexerSuite.FeatureData.class); StandardScaler scaler = new StandardScaler() .setInputCol("features") .setOutputCol("scaledFeatures") .setWithStd(true) .setWithMean(false); // Compute summary statistics by fitting the StandardScaler StandardScalerModel scalerModel = scaler.fit(dataFrame); // Normalize each feature to have unit standard deviation. Dataset<Row> scaledData = scalerModel.transform(dataFrame); scaledData.count(); } }
@Test public void pipeline() { StandardScaler scaler = new StandardScaler() .setInputCol("features") .setOutputCol("scaledFeatures"); LogisticRegression lr = new LogisticRegression() .setFeaturesCol("scaledFeatures"); Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[]{scaler, lr}); PipelineModel model = pipeline.fit(dataset); model.transform(dataset).createOrReplaceTempView("prediction"); Dataset<Row> predictions = spark.sql("SELECT label, probability, prediction FROM prediction"); predictions.collectAsList(); } }
@Test public void standardScaler() { // The tests are to check Java compatibility. List<VectorIndexerSuite.FeatureData> points = Arrays.asList( new VectorIndexerSuite.FeatureData(Vectors.dense(0.0, -2.0)), new VectorIndexerSuite.FeatureData(Vectors.dense(1.0, 3.0)), new VectorIndexerSuite.FeatureData(Vectors.dense(1.0, 4.0)) ); Dataset<Row> dataFrame = spark.createDataFrame(jsc.parallelize(points, 2), VectorIndexerSuite.FeatureData.class); StandardScaler scaler = new StandardScaler() .setInputCol("features") .setOutputCol("scaledFeatures") .setWithStd(true) .setWithMean(false); // Compute summary statistics by fitting the StandardScaler StandardScalerModel scalerModel = scaler.fit(dataFrame); // Normalize each feature to have unit standard deviation. Dataset<Row> scaledData = scalerModel.transform(dataFrame); scaledData.count(); } }