/**
 * Extracts a framework-agnostic {@code LogisticRegressionModelInfo} snapshot from a trained
 * Spark {@link LogisticRegressionModel}: coefficient weights, intercept, class/feature counts,
 * decision threshold, and the input/output column keys.
 *
 * @param sparkLRModel trained Spark logistic-regression model to export
 * @return populated model-info holder for serving outside Spark
 */
@Override
public LogisticRegressionModelInfo getModelInfo(final LogisticRegressionModel sparkLRModel) {
    final LogisticRegressionModelInfo logisticRegressionModelInfo = new LogisticRegressionModelInfo();
    logisticRegressionModelInfo.setWeights(sparkLRModel.coefficients().toArray());
    logisticRegressionModelInfo.setIntercept(sparkLRModel.intercept());
    logisticRegressionModelInfo.setNumClasses(sparkLRModel.numClasses());
    logisticRegressionModelInfo.setNumFeatures(sparkLRModel.numFeatures());
    logisticRegressionModelInfo.setThreshold(sparkLRModel.getThreshold());
    // LinkedHashSet keeps insertion order, so key iteration order stays stable.
    final Set<String> inputKeys = new LinkedHashSet<>();
    inputKeys.add(sparkLRModel.getFeaturesCol());
    logisticRegressionModelInfo.setInputKeys(inputKeys);
    final Set<String> outputKeys = new LinkedHashSet<>();
    outputKeys.add(sparkLRModel.getPredictionCol());
    outputKeys.add(sparkLRModel.getProbabilityCol());
    logisticRegressionModelInfo.setOutputKeys(outputKeys);
    return logisticRegressionModelInfo;
}
/**
 * Dumps a human-readable summary of the fitted logistic-regression stage
 * (intercept, feature count, regularization, and all param explanations) to stdout.
 */
public void printModel() {
    // The logistic-regression stage sits at index 2 of the fitted pipeline.
    final LogisticRegressionModel fittedStage = (LogisticRegressionModel) model.stages()[2];
    System.out.println("intercept = " + fittedStage.intercept());
    System.out.println("number of features = " + fittedStage.numFeatures());
    System.out.println("regularization parameter = " + fittedStage.getRegParam());
    System.out.println(fittedStage.explainParams());
}
// NOTE(review): truncated fragment — the method name and preceding parameters are outside this
// view, and the body is cut off; left byte-identical. It appears to derive the per-class weight
// block size for a multinomial model and whether a bias term is folded into the weights
// (weights.size() / (numClasses - 1), then size+1 comparison) — confirm against the full method.
final LogisticRegressionModel lrModel, final Vector vector) { Vector weights = lrModel.weights(); int numClasses = lrModel.numClasses(); int dataWithBiasSize = weights.size() / (numClasses - 1); boolean withBias = (vector.size() + 1) == dataWithBiasSize;
// NOTE(review): truncated test fragment — starts mid builder chain, braces are unbalanced, and
// `model2` is declared outside this view; left byte-identical. It exercises threshold behavior:
// threshold=1.0 should force all-zero predictions, and transform-time param overrides
// (threshold().w(0.0), probabilityCol().w("myProb")) should take effect without mutating the model.
.setProbabilityCol("myProbability"); LogisticRegressionModel model = lr.fit(dataset); LogisticRegression parent = (LogisticRegression) model.parent(); Assert.assertEquals(10, parent.getMaxIter()); Assert.assertEquals(1.0, parent.getRegParam(), eps); Assert.assertEquals(0.6, parent.getThresholds()[1], eps); Assert.assertEquals(0.6, parent.getThreshold(), eps); Assert.assertEquals(0.6, model.getThreshold(), eps); model.setThreshold(1.0); model.transform(dataset).createOrReplaceTempView("predAllZero"); Dataset<Row> predAllZero = spark.sql("SELECT prediction, myProbability FROM predAllZero"); for (Row r : predAllZero.collectAsList()) { model.transform(dataset, model.threshold().w(0.0), model.probabilityCol().w("myProb")) .createOrReplaceTempView("predNotAllZero"); Dataset<Row> predNotAllZero = spark.sql("SELECT prediction, myProb FROM predNotAllZero"); LogisticRegression parent2 = (LogisticRegression) model2.parent(); Assert.assertEquals(5, parent2.getMaxIter()); Assert.assertEquals(0.1, parent2.getRegParam(), eps); Assert.assertEquals(0.4, parent2.getThreshold(), eps); Assert.assertEquals(0.4, model2.getThreshold(), eps); Assert.assertEquals("theProb", model2.getProbabilityCol());
@Test public void logisticRegressionDefaultParams() { LogisticRegression lr = new LogisticRegression(); Assert.assertEquals(lr.getLabelCol(), "label"); LogisticRegressionModel model = lr.fit(dataset); model.transform(dataset).createOrReplaceTempView("prediction"); Dataset<Row> predictions = spark.sql("SELECT label, probability, prediction FROM prediction"); predictions.collectAsList(); // Check defaults Assert.assertEquals(0.5, model.getThreshold(), eps); Assert.assertEquals("features", model.getFeaturesCol()); Assert.assertEquals("prediction", model.getPredictionCol()); Assert.assertEquals("probability", model.getProbabilityCol()); }
// NOTE(review): truncated fragment — begins mid string-concatenation and `fMeasure`/`maxFMeasure`
// are defined outside this view; left byte-identical. It prints training-summary diagnostics and
// then sets the model threshold to the one maximizing F-measure — confirm against the full example.
+ lrModel.coefficients() + " Intercept: " + lrModel.intercept()); LogisticRegressionTrainingSummary trainingSummary = lrModel.summary(); System.out.println("numIterations: " + trainingSummary.totalIterations()); System.out.println("objectiveHistory: " + Vectors.dense(trainingSummary.objectiveHistory())); double bestThreshold = fMeasure.where(fMeasure.col("F-Measure").equalTo(maxFMeasure)) .select("threshold").head().getDouble(0); lrModel.setThreshold(bestThreshold);
/**
 * Checks the Predictor/Classifier contract of a binary LogisticRegressionModel:
 * rawPrediction and probability vectors have length 2, probability is the logistic
 * transform of the raw margin, and the predicted class carries the max probability.
 */
@Test
public void logisticRegressionPredictorClassifierMethods() {
    // Dropped @SuppressWarnings("unchecked"): no unchecked operations remain in this body.
    LogisticRegression lr = new LogisticRegression();
    LogisticRegressionModel model = lr.fit(dataset);
    Assert.assertEquals(2, model.numClasses());
    model.transform(dataset).createOrReplaceTempView("transformed");
    Dataset<Row> trans1 = spark.sql("SELECT rawPrediction, probability FROM transformed");
    for (Row row : trans1.collectAsList()) {
        Vector raw = (Vector) row.get(0);
        Vector prob = (Vector) row.get(1);
        // Fixed: JUnit's assertEquals takes the EXPECTED value first; these two were swapped.
        Assert.assertEquals(2, raw.size());
        Assert.assertEquals(2, prob.size());
        // probability(1) must equal sigmoid(rawPrediction(1)).
        double probFromRaw1 = 1.0 / (1.0 + Math.exp(-raw.apply(1)));
        Assert.assertEquals(0, Math.abs(prob.apply(1) - probFromRaw1), eps);
        Assert.assertEquals(0, Math.abs(prob.apply(0) - (1.0 - probFromRaw1)), eps);
    }
    Dataset<Row> trans2 = spark.sql("SELECT prediction, probability FROM transformed");
    for (Row row : trans2.collectAsList()) {
        double pred = row.getDouble(0);
        Vector prob = (Vector) row.get(1);
        // The predicted class must be an argmax of the probability vector.
        double probOfPred = prob.apply((int) pred);
        for (int i = 0; i < prob.size(); ++i) {
            Assert.assertTrue(probOfPred >= prob.apply(i));
        }
    }
}
// NOTE(review): truncated fragment — `features`, `schema`, and the enclosing method are outside
// this view; left byte-identical. It builds a JPMML binary logistic RegressionModel (LOGIT
// normalization) from the Spark model's coefficients and intercept, then reads the multinomial
// coefficient matrix/intercept vector — confirm against the full converter.
List<Double> coefficients = new ArrayList<>(VectorUtil.toList(model.coefficients())); RegressionModel regressionModel = RegressionModelUtil.createBinaryLogisticClassification(features, coefficients, model.intercept(), RegressionModel.NormalizationMethod.LOGIT, true, schema) .setOutput(null); Matrix coefficientMatrix = model.coefficientMatrix(); Vector interceptVector = model.interceptVector();
// NOTE(review): fragment of a larger method — `lrModel` is declared outside this view; left
// byte-identical. Fetches the training summary and prints the number of optimizer iterations
// via the objective-history length.
LogisticRegressionTrainingSummary trainingSummary = lrModel.summary(); double[] objectiveHistory = trainingSummary.objectiveHistory(); System.out.println("#(iterations) = " + objectiveHistory.length);
// NOTE(review): truncated fragment — `model1`, `model2`, `test` are defined outside this view and
// the final println is cut off mid-expression; left byte-identical. Prints each model's fitting
// params and the per-row (features, label) -> probability/prediction results of model2 on the
// test DataFrame (old pre-Dataset Spark API: DataFrame + collect()).
System.out.println("Model 1 was fit using parameters: " + model1.parent().extractParamMap()); System.out.println("Model 2 was fit using parameters: " + model2.parent().extractParamMap()); DataFrame results = model2.transform(test); for (Row r: results.select("features", "label", "myProbability", "prediction").collect()) { System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2)
// NOTE(review): truncated test fragment (duplicate snippet) — starts mid builder chain, braces
// are unbalanced, and `model2` is declared outside this view; left byte-identical. Exercises
// threshold=1.0 forcing all-zero predictions and transform-time param overrides.
.setProbabilityCol("myProbability"); LogisticRegressionModel model = lr.fit(dataset); LogisticRegression parent = (LogisticRegression) model.parent(); Assert.assertEquals(10, parent.getMaxIter()); Assert.assertEquals(1.0, parent.getRegParam(), eps); Assert.assertEquals(0.6, parent.getThresholds()[1], eps); Assert.assertEquals(0.6, parent.getThreshold(), eps); Assert.assertEquals(0.6, model.getThreshold(), eps); model.setThreshold(1.0); model.transform(dataset).createOrReplaceTempView("predAllZero"); Dataset<Row> predAllZero = spark.sql("SELECT prediction, myProbability FROM predAllZero"); for (Row r : predAllZero.collectAsList()) { model.transform(dataset, model.threshold().w(0.0), model.probabilityCol().w("myProb")) .createOrReplaceTempView("predNotAllZero"); Dataset<Row> predNotAllZero = spark.sql("SELECT prediction, myProb FROM predNotAllZero"); LogisticRegression parent2 = (LogisticRegression) model2.parent(); Assert.assertEquals(5, parent2.getMaxIter()); Assert.assertEquals(0.1, parent2.getRegParam(), eps); Assert.assertEquals(0.4, parent2.getThreshold(), eps); Assert.assertEquals(0.4, model2.getThreshold(), eps); Assert.assertEquals("theProb", model2.getProbabilityCol());
@Test public void logisticRegressionDefaultParams() { LogisticRegression lr = new LogisticRegression(); Assert.assertEquals(lr.getLabelCol(), "label"); LogisticRegressionModel model = lr.fit(dataset); model.transform(dataset).createOrReplaceTempView("prediction"); Dataset<Row> predictions = spark.sql("SELECT label, probability, prediction FROM prediction"); predictions.collectAsList(); // Check defaults Assert.assertEquals(0.5, model.getThreshold(), eps); Assert.assertEquals("features", model.getFeaturesCol()); Assert.assertEquals("prediction", model.getPredictionCol()); Assert.assertEquals("probability", model.getProbabilityCol()); }
/**
 * Checks the Predictor/Classifier contract of a binary LogisticRegressionModel:
 * rawPrediction and probability vectors have length 2, probability is the logistic
 * transform of the raw margin, and the predicted class carries the max probability.
 */
@Test
public void logisticRegressionPredictorClassifierMethods() {
    // Dropped @SuppressWarnings("unchecked"): no unchecked operations remain in this body.
    LogisticRegression lr = new LogisticRegression();
    LogisticRegressionModel model = lr.fit(dataset);
    Assert.assertEquals(2, model.numClasses());
    model.transform(dataset).createOrReplaceTempView("transformed");
    Dataset<Row> trans1 = spark.sql("SELECT rawPrediction, probability FROM transformed");
    for (Row row : trans1.collectAsList()) {
        Vector raw = (Vector) row.get(0);
        Vector prob = (Vector) row.get(1);
        // Fixed: JUnit's assertEquals takes the EXPECTED value first; these two were swapped.
        Assert.assertEquals(2, raw.size());
        Assert.assertEquals(2, prob.size());
        // probability(1) must equal sigmoid(rawPrediction(1)).
        double probFromRaw1 = 1.0 / (1.0 + Math.exp(-raw.apply(1)));
        Assert.assertEquals(0, Math.abs(prob.apply(1) - probFromRaw1), eps);
        Assert.assertEquals(0, Math.abs(prob.apply(0) - (1.0 - probFromRaw1)), eps);
    }
    Dataset<Row> trans2 = spark.sql("SELECT prediction, probability FROM transformed");
    for (Row row : trans2.collectAsList()) {
        double pred = row.getDouble(0);
        Vector prob = (Vector) row.get(1);
        // The predicted class must be an argmax of the probability vector.
        double probOfPred = prob.apply((int) pred);
        for (int i = 0; i < prob.size(); ++i) {
            Assert.assertTrue(probOfPred >= prob.apply(i));
        }
    }
}
/**
 * Ensures the training summary is self-consistent: the reported iteration count
 * equals the length of the recorded objective history.
 */
@Test
public void logisticRegressionTrainingSummary() {
    final LogisticRegression estimator = new LogisticRegression();
    final LogisticRegressionModel fitted = estimator.fit(dataset);
    final LogisticRegressionTrainingSummary trainingSummary = fitted.summary();
    Assert.assertEquals(trainingSummary.totalIterations(), trainingSummary.objectiveHistory().length);
}
}
/**
 * Extracts a framework-agnostic {@code LogisticRegressionModelInfo} snapshot from a trained
 * Spark {@link LogisticRegressionModel}: coefficient weights, intercept, class/feature counts,
 * decision threshold, probability key, and the input/output column keys.
 *
 * @param sparkLRModel trained Spark logistic-regression model to export
 * @param df           unused here; kept to satisfy the exporter interface
 * @return populated model-info holder for serving outside Spark
 */
@Override
public LogisticRegressionModelInfo getModelInfo(final LogisticRegressionModel sparkLRModel, DataFrame df) {
    final LogisticRegressionModelInfo logisticRegressionModelInfo = new LogisticRegressionModelInfo();
    logisticRegressionModelInfo.setWeights(sparkLRModel.coefficients().toArray());
    logisticRegressionModelInfo.setIntercept(sparkLRModel.intercept());
    logisticRegressionModelInfo.setNumClasses(sparkLRModel.numClasses());
    logisticRegressionModelInfo.setNumFeatures(sparkLRModel.numFeatures());
    logisticRegressionModelInfo.setThreshold(sparkLRModel.getThreshold());
    logisticRegressionModelInfo.setProbabilityKey(sparkLRModel.getProbabilityCol());
    // LinkedHashSet keeps insertion order, so key iteration order stays stable.
    final Set<String> inputKeys = new LinkedHashSet<>();
    inputKeys.add(sparkLRModel.getFeaturesCol());
    logisticRegressionModelInfo.setInputKeys(inputKeys);
    final Set<String> outputKeys = new LinkedHashSet<>();
    outputKeys.add(sparkLRModel.getPredictionCol());
    outputKeys.add(sparkLRModel.getProbabilityCol());
    logisticRegressionModelInfo.setOutputKeys(outputKeys);
    return logisticRegressionModelInfo;
}
// NOTE(review): truncated test fragment (duplicate snippet) — starts mid builder chain, braces
// are unbalanced, and `model2` is declared outside this view; left byte-identical. Exercises
// threshold=1.0 forcing all-zero predictions and transform-time param overrides.
.setProbabilityCol("myProbability"); LogisticRegressionModel model = lr.fit(dataset); LogisticRegression parent = (LogisticRegression) model.parent(); Assert.assertEquals(10, parent.getMaxIter()); Assert.assertEquals(1.0, parent.getRegParam(), eps); Assert.assertEquals(0.6, parent.getThresholds()[1], eps); Assert.assertEquals(0.6, parent.getThreshold(), eps); Assert.assertEquals(0.6, model.getThreshold(), eps); model.setThreshold(1.0); model.transform(dataset).createOrReplaceTempView("predAllZero"); Dataset<Row> predAllZero = spark.sql("SELECT prediction, myProbability FROM predAllZero"); for (Row r : predAllZero.collectAsList()) { model.transform(dataset, model.threshold().w(0.0), model.probabilityCol().w("myProb")) .createOrReplaceTempView("predNotAllZero"); Dataset<Row> predNotAllZero = spark.sql("SELECT prediction, myProb FROM predNotAllZero"); LogisticRegression parent2 = (LogisticRegression) model2.parent(); Assert.assertEquals(5, parent2.getMaxIter()); Assert.assertEquals(0.1, parent2.getRegParam(), eps); Assert.assertEquals(0.4, parent2.getThreshold(), eps); Assert.assertEquals(0.4, model2.getThreshold(), eps); Assert.assertEquals("theProb", model2.getProbabilityCol());
@Test public void logisticRegressionDefaultParams() { LogisticRegression lr = new LogisticRegression(); Assert.assertEquals(lr.getLabelCol(), "label"); LogisticRegressionModel model = lr.fit(dataset); model.transform(dataset).createOrReplaceTempView("prediction"); Dataset<Row> predictions = spark.sql("SELECT label, probability, prediction FROM prediction"); predictions.collectAsList(); // Check defaults Assert.assertEquals(0.5, model.getThreshold(), eps); Assert.assertEquals("features", model.getFeaturesCol()); Assert.assertEquals("prediction", model.getPredictionCol()); Assert.assertEquals("probability", model.getProbabilityCol()); }
/**
 * Checks the Predictor/Classifier contract of a binary LogisticRegressionModel:
 * rawPrediction and probability vectors have length 2, probability is the logistic
 * transform of the raw margin, and the predicted class carries the max probability.
 */
@Test
public void logisticRegressionPredictorClassifierMethods() {
    // Dropped @SuppressWarnings("unchecked"): no unchecked operations remain in this body.
    LogisticRegression lr = new LogisticRegression();
    LogisticRegressionModel model = lr.fit(dataset);
    Assert.assertEquals(2, model.numClasses());
    model.transform(dataset).createOrReplaceTempView("transformed");
    Dataset<Row> trans1 = spark.sql("SELECT rawPrediction, probability FROM transformed");
    for (Row row : trans1.collectAsList()) {
        Vector raw = (Vector) row.get(0);
        Vector prob = (Vector) row.get(1);
        // Fixed: JUnit's assertEquals takes the EXPECTED value first; these two were swapped.
        Assert.assertEquals(2, raw.size());
        Assert.assertEquals(2, prob.size());
        // probability(1) must equal sigmoid(rawPrediction(1)).
        double probFromRaw1 = 1.0 / (1.0 + Math.exp(-raw.apply(1)));
        Assert.assertEquals(0, Math.abs(prob.apply(1) - probFromRaw1), eps);
        Assert.assertEquals(0, Math.abs(prob.apply(0) - (1.0 - probFromRaw1)), eps);
    }
    Dataset<Row> trans2 = spark.sql("SELECT prediction, probability FROM transformed");
    for (Row row : trans2.collectAsList()) {
        double pred = row.getDouble(0);
        Vector prob = (Vector) row.get(1);
        // The predicted class must be an argmax of the probability vector.
        double probOfPred = prob.apply((int) pred);
        for (int i = 0; i < prob.size(); ++i) {
            Assert.assertTrue(probOfPred >= prob.apply(i));
        }
    }
}
/**
 * Ensures the training summary is self-consistent: the reported iteration count
 * equals the length of the recorded objective history.
 */
@Test
public void logisticRegressionTrainingSummary() {
    final LogisticRegression estimator = new LogisticRegression();
    final LogisticRegressionModel fitted = estimator.fit(dataset);
    final LogisticRegressionTrainingSummary trainingSummary = fitted.summary();
    Assert.assertEquals(trainingSummary.totalIterations(), trainingSummary.objectiveHistory().length);
}
}
/**
 * Ensures the training summary is self-consistent: the reported iteration count
 * equals the length of the recorded objective history.
 */
@Test
public void logisticRegressionTrainingSummary() {
    final LogisticRegression estimator = new LogisticRegression();
    final LogisticRegressionModel fitted = estimator.fit(dataset);
    final LogisticRegressionTrainingSummary trainingSummary = fitted.summary();
    Assert.assertEquals(trainingSummary.totalIterations(), trainingSummary.objectiveHistory().length);
}
}