/**
 * Loads a pipeline model from an external file.
 * @param pipelineModelFileName the path to the saved pipeline model
 * @return a pipeline model.
 */
public PipelineModel load(String pipelineModelFileName) {
    model = PipelineModel.load(pipelineModelFileName);
    return model;
}
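// Hedged usage sketch, not part of the original source: assumes this loader class is
// instantiated as "loader", that the model path is valid, and that a Dataset<Row> named
// "testData" with the expected input columns is available. PipelineModel.transform,
// select and show are standard Spark ML / Spark SQL calls.
PipelineModel model = loader.load("/path/to/pipelineModel");
Dataset<Row> scored = model.transform(testData);
scored.select("prediction").show(5);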
public void printModel() {
    LogisticRegressionModel lrModel = (LogisticRegressionModel) model.stages()[2];
    System.out.println("intercept = " + lrModel.intercept());
    System.out.println("number of features = " + lrModel.numFeatures());
    System.out.println("regularization parameter = " + lrModel.getRegParam());
    System.out.println(lrModel.explainParams());
}
@Override
public List<Transformer> apply(Transformer transformer) {
    if (transformer instanceof PipelineModel) {
        PipelineModel pipelineModel = (PipelineModel) transformer;
        return Arrays.asList(pipelineModel.stages());
    } else if (transformer instanceof CrossValidatorModel) {
        CrossValidatorModel crossValidatorModel = (CrossValidatorModel) transformer;
        return Collections.singletonList(crossValidatorModel.bestModel());
    } else if (transformer instanceof TrainValidationSplitModel) {
        TrainValidationSplitModel trainValidationSplitModel = (TrainValidationSplitModel) transformer;
        return Collections.singletonList(trainValidationSplitModel.bestModel());
    }
    return null;
}
};
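// Hedged illustration, not in the original source: assuming the anonymous flattener above
// implements java.util.function.Function<Transformer, List<Transformer>> and is bound to a
// variable named "childrenOf", nested composite models (pipelines inside pipelines, tuned
// models) can be walked recursively to collect the leaf transformers. The helper name and
// variable names are hypothetical.
private static void collectLeaves(Transformer transformer,
                                  Function<Transformer, List<Transformer>> childrenOf,
                                  List<Transformer> leaves) {
    List<Transformer> children = childrenOf.apply(transformer);
    if (children == null) {
        // not a composite model: keep it as a leaf
        leaves.add(transformer);
        return;
    }
    for (Transformer child : children) {
        collectLeaves(child, childrenOf, leaves);
    }
}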
@Test
public void crossValidationWithLogisticRegression() {
    LogisticRegression lr = new LogisticRegression();
    ParamMap[] lrParamMaps = new ParamGridBuilder()
        .addGrid(lr.regParam(), new double[]{0.001, 1000.0})
        .addGrid(lr.maxIter(), new int[]{0, 10})
        .build();
    BinaryClassificationEvaluator eval = new BinaryClassificationEvaluator();
    CrossValidator cv = new CrossValidator()
        .setEstimator(lr)
        .setEstimatorParamMaps(lrParamMaps)
        .setEvaluator(eval)
        .setNumFolds(3);
    CrossValidatorModel cvModel = cv.fit(dataset);
    LogisticRegression parent = (LogisticRegression) cvModel.bestModel().parent();
    Assert.assertEquals(0.001, parent.getRegParam(), 0.0);
    Assert.assertEquals(10, parent.getMaxIter());
}
}
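// Hedged sketch, not part of the original tests: the same grid search can be run with
// TrainValidationSplit (a single train/validation split instead of k folds). "dataset"
// is assumed to be the same Dataset<Row> used by the test above; everything else is
// standard Spark ML tuning API.
LogisticRegression lr = new LogisticRegression();
ParamMap[] lrParamMaps = new ParamGridBuilder()
    .addGrid(lr.regParam(), new double[]{0.001, 1000.0})
    .build();
TrainValidationSplit tvs = new TrainValidationSplit()
    .setEstimator(lr)
    .setEstimatorParamMaps(lrParamMaps)
    .setEvaluator(new BinaryClassificationEvaluator())
    .setTrainRatio(0.8);
TrainValidationSplitModel tvsModel = tvs.fit(dataset);
LogisticRegression best = (LogisticRegression) tvsModel.bestModel().parent();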
@Test
public void pipeline() {
    StandardScaler scaler = new StandardScaler()
        .setInputCol("features")
        .setOutputCol("scaledFeatures");
    LogisticRegression lr = new LogisticRegression()
        .setFeaturesCol("scaledFeatures");
    Pipeline pipeline = new Pipeline()
        .setStages(new PipelineStage[]{scaler, lr});
    PipelineModel model = pipeline.fit(dataset);
    model.transform(dataset).createOrReplaceTempView("prediction");
    Dataset<Row> predictions = spark.sql("SELECT label, probability, prediction FROM prediction");
    predictions.collectAsList();
}
}
/**
 * Creates a processing pipeline.
 * @return a pipeline
 */
protected Pipeline createPipeline() {
    Tokenizer tokenizer = new Tokenizer()
        .setInputCol("text")
        .setOutputCol("tokens");
    CountVectorizer countVectorizer = new CountVectorizer()
        .setInputCol("tokens")
        .setOutputCol("features")
        .setMinDF((Double) params.getOrDefault(params.getMinFF()))
        .setVocabSize((Integer) params.getOrDefault(params.getNumFeatures()));
    StringIndexer transitionIndexer = new StringIndexer()
        .setInputCol("transition")
        .setOutputCol("label");
    Pipeline pipeline = new Pipeline().setStages(new PipelineStage[]{tokenizer, countVectorizer, transitionIndexer});
    return pipeline;
}
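// Hedged sketch, not part of the original source: assumes a Dataset<Row> named
// "trainingData" containing the "text" and "transition" columns expected by the
// pipeline above, and a hypothetical output path. Pipeline.fit and the
// write().overwrite().save(...) chain are standard Spark ML persistence calls.
Pipeline pipeline = createPipeline();
PipelineModel pipelineModel = pipeline.fit(trainingData);
pipelineModel.write().overwrite().save("/tmp/parser/pipelineModel");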
/**
 * Creates a transition-based parser using an MLP transition classifier.
 * @param jsc a Java Spark context
 * @param classifierFileName the base path of the saved transition classifier
 * @param featureFrame a feature frame
 */
public TransitionBasedParserMLP(JavaSparkContext jsc, String classifierFileName, FeatureFrame featureFrame) {
    this.featureFrame = featureFrame;
    this.classifier = TransitionClassifier.load(jsc, new Path(classifierFileName, "data").toString());
    this.pipelineModel = PipelineModel.load(new Path(classifierFileName, "pipelineModel").toString());
    this.transitionName = ((StringIndexerModel) pipelineModel.stages()[2]).labels();
    String[] features = ((CountVectorizerModel) pipelineModel.stages()[1]).vocabulary();
    this.featureMap = new HashMap<String, Integer>();
    for (int j = 0; j < features.length; j++) {
        this.featureMap.put(features[j], j);
    }
}
/**
 * Creates a conditional Markov model.
 * @param pipelineModel a fitted pipeline model
 * @param weights a weight vector
 * @param markovOrder a Markov order
 * @param tagDictionary a tag dictionary mapping words to sets of tag indices
 */
public CMMModel(PipelineModel pipelineModel, Vector weights, MarkovOrder markovOrder, Map<String, Set<Integer>> tagDictionary) {
    this.pipelineModel = pipelineModel;
    this.contextExtractor = new ContextExtractor(markovOrder, Constants.REGEXP_FILE);
    this.weights = weights;
    this.tags = ((StringIndexerModel) pipelineModel.stages()[2]).labels();
    String[] features = ((CountVectorizerModel) pipelineModel.stages()[1]).vocabulary();
    featureMap = new HashMap<String, Integer>();
    for (int j = 0; j < features.length; j++) {
        featureMap.put(features[j], j);
    }
    this.tagDictionary = tagDictionary;
}
@Override
public PipelineModelInfo getModelInfo(final PipelineModel from, final DataFrame df) {
    final PipelineModelInfo modelInfo = new PipelineModelInfo();
    final ModelInfo[] stages = new ModelInfo[from.stages().length];
    for (int i = 0; i < from.stages().length; i++) {
        Transformer sparkModel = from.stages()[i];
        stages[i] = ModelInfoAdapterFactory.getAdapter(sparkModel.getClass()).adapt(sparkModel, df);
    }
    modelInfo.setStages(stages);
    return modelInfo;
}
public PMMLBuilder verify(Dataset<Row> dataset, double precision, double zeroThreshold) {
    PipelineModel pipelineModel = getPipelineModel();
    Dataset<Row> transformedDataset = pipelineModel.transform(dataset);
    Verification verification = new Verification(dataset, transformedDataset)
        .setPrecision(precision)
        .setZeroThreshold(zeroThreshold);
    return setVerification(verification);
}
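// Hedged usage sketch, not in the original source: assumes a fitted PipelineModel, the
// schema of the training Dataset, and a small verification sample. The PMMLBuilder
// constructor and buildFile(File) are taken from the JPMML-SparkML library API; the
// tolerances and output path are illustrative only.
PMMLBuilder pmmlBuilder = new PMMLBuilder(trainingData.schema(), pipelineModel)
    .verify(trainingData.limit(10), 1e-14, 1e-14);
File pmmlFile = pmmlBuilder.buildFile(new File("/tmp/model.pmml"));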
@Override
PopularWordsEstimatorModelInfo getModelInfo(PopularWordsModel from) {
    PopularWordsEstimatorModelInfo modelInfo = new PopularWordsEstimatorModelInfo();
    modelInfo.setPopularWords(new HashSet<>(Arrays.asList(from.popularWords())));

    Set<String> inputKeys = new LinkedHashSet<>();
    inputKeys.add(from.getInputCol());
    modelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<>();
    outputKeys.add(from.getOutputCol());
    modelInfo.setOutputKeys(outputKeys);

    return modelInfo;
}
public Transformer build() {
    Evaluator evaluator = getEvaluator();
    PMMLTransformer pmmlTransformer = new PMMLTransformer(evaluator, this.columnProducers);
    if (this.exploded) {
        ColumnExploder columnExploder = new ColumnExploder(pmmlTransformer.getOutputCol());
        ColumnPruner columnPruner = new ColumnPruner(new Set.Set1<>(pmmlTransformer.getOutputCol()));
        PipelineModel pipelineModel = new PipelineModel(null, new Transformer[]{pmmlTransformer, columnExploder, columnPruner});
        return pipelineModel;
    }
    return pmmlTransformer;
}
public PMMLBuilder putOptions(PipelineStage pipelineStage, Map<String, ?> map) {
    return putOptions(Pattern.compile(pipelineStage.uid(), Pattern.LITERAL), map);
}
@Override
public List<OutputField> registerOutputFields(Label label, SparkMLEncoder encoder) {
    T model = getTransformer();
    String predictionCol = model.getPredictionCol();
    OutputField predictedField = ModelUtil.createPredictedField(FieldName.create(predictionCol), label.getDataType(), OpType.CONTINUOUS);
    encoder.putOnlyFeature(predictionCol, new ContinuousFeature(encoder, predictedField));
    return Collections.singletonList(predictedField);
}
}
/**
 * Creates a processing pipeline.
 * @return a pipeline
 */
private Pipeline createPipeline() {
    Tokenizer tokenizer = new Tokenizer()
        .setInputCol("featureStrings")
        .setOutputCol("tokens");
    CountVectorizer countVectorizer = new CountVectorizer()
        .setInputCol("tokens")
        .setOutputCol("features")
        .setMinDF((Double) params.getOrDefault(params.getMinFF()))
        .setVocabSize((Integer) params.getOrDefault(params.getNumFeatures()));
    StringIndexer tagIndexer = new StringIndexer()
        .setInputCol("tag")
        .setOutputCol("label");
    Pipeline pipeline = new Pipeline().setStages(new PipelineStage[]{tokenizer, countVectorizer, tagIndexer});
    return pipeline;
}
@Override
public PipelineModelInfo getModelInfo(final PipelineModel from) {
    final PipelineModelInfo modelInfo = new PipelineModelInfo();
    final ModelInfo[] stages = new ModelInfo[from.stages().length];
    for (int i = 0; i < from.stages().length; i++) {
        Transformer sparkModel = from.stages()[i];
        stages[i] = ModelInfoAdapterFactory.getAdapter(sparkModel.getClass()).adapt(sparkModel);
    }
    modelInfo.setStages(stages);
    return modelInfo;
}