df.groupBy("label").count().show(); org.apache.spark.ml.feature.Tokenizer tokenizer = new Tokenizer() .setInputCol("context").setOutputCol("words"); HashingTF hashingTF = new HashingTF().setNumFeatures(numFeatures) .setInputCol(tokenizer.getOutputCol()).setOutputCol("features"); LogisticRegression lr = new LogisticRegression().setMaxIter(100) .setRegParam(0.01);
@Override
public List<Feature> encodeFeatures(SparkMLEncoder encoder) {
    Tokenizer transformer = getTransformer();

    // Resolve the Tokenizer's single input column to a PMML feature.
    Feature feature = encoder.getOnlyFeature(transformer.getInputCol());

    // Lower-case the input first, mirroring what Spark's Tokenizer does.
    Apply apply = PMMLUtil.createApply("lowercase", feature.ref());

    DerivedField derivedField = encoder.createDerivedField(FeatureUtil.createName("lowercase", feature), OpType.CATEGORICAL, DataType.STRING, apply);

    // Tokenize the lower-cased string by splitting on whitespace.
    return Collections.singletonList(new DocumentFeature(encoder, derivedField, "\\s+"));
}
}
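For context, a hedged sketch of how a converter like this is typically exercised: in recent JPMML-SparkML versions, PMMLBuilder walks a fitted PipelineModel and invokes the registered converters for each stage. The exact entry point varies by library version, so treat this as an assumption rather than the documented API:

import org.apache.spark.ml.PipelineModel;
import org.apache.spark.sql.types.StructType;
import org.dmg.pmml.PMML;
import org.jpmml.sparkml.PMMLBuilder;

// schema: schema of the training Dataset; pipelineModel: a fitted PipelineModel.
PMML pmml = new PMMLBuilder(schema, pipelineModel).build();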
/**
 * Creates a processing pipeline.
 *
 * @return a pipeline
 */
private Pipeline createPipeline() {
    Tokenizer tokenizer = new Tokenizer()
        .setInputCol("featureStrings")
        .setOutputCol("tokens");

    CountVectorizer countVectorizer = new CountVectorizer()
        .setInputCol("tokens")
        .setOutputCol("features")
        .setMinDF((Double) params.getOrDefault(params.getMinFF()))
        .setVocabSize((Integer) params.getOrDefault(params.getNumFeatures()));

    StringIndexer tagIndexer = new StringIndexer()
        .setInputCol("tag")
        .setOutputCol("label");

    return new Pipeline().setStages(new PipelineStage[]{tokenizer, countVectorizer, tagIndexer});
}
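A hedged usage sketch for the method above: fit the returned pipeline on a Dataset&lt;Row&gt; that carries the "featureStrings" and "tag" columns it expects. The trainingData variable is an assumption, not part of the original code:

// trainingData: Dataset<Row> with "featureStrings" (space-separated tokens) and "tag" columns.
Pipeline pipeline = createPipeline();
PipelineModel model = pipeline.fit(trainingData);

// transform() adds the "tokens", "features" and numeric "label" columns.
Dataset<Row> indexed = model.transform(trainingData);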
Tokenizer tokenizer = new Tokenizer()
    .setInputCol("sentence")
    .setOutputCol("words");
Dataset<Row> wordsData = tokenizer.transform(sentenceData);

int numFeatures = 20;
HashingTF hashingTF = new HashingTF()
    .setInputCol("words")
    .setOutputCol("rawFeatures")
    .setNumFeatures(numFeatures);
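The snippet stops at the term-frequency step; a sketch of the IDF rescaling that follows in the standard Spark TF-IDF recipe. The featurizedData and rescaledData names, and the "features" output column, are assumptions consistent with that recipe:

import org.apache.spark.ml.feature.IDF;
import org.apache.spark.ml.feature.IDFModel;

// Hash the tokens into sparse term-frequency vectors.
Dataset<Row> featurizedData = hashingTF.transform(wordsData);

// Fit IDF weights on the corpus, then rescale the TF vectors.
IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
IDFModel idfModel = idf.fit(featurizedData);
Dataset<Row> rescaledData = idfModel.transform(featurizedData);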
/**
 * Creates a processing pipeline.
 *
 * @return a pipeline
 */
protected Pipeline createPipeline() {
    Tokenizer tokenizer = new Tokenizer()
        .setInputCol("text")
        .setOutputCol("tokens");

    CountVectorizer countVectorizer = new CountVectorizer()
        .setInputCol("tokens")
        .setOutputCol("features")
        .setMinDF((Double) params.getOrDefault(params.getMinFF()))
        .setVocabSize((Integer) params.getOrDefault(params.getNumFeatures()));

    StringIndexer transitionIndexer = new StringIndexer()
        .setInputCol("transition")
        .setOutputCol("label");

    return new Pipeline().setStages(new PipelineStage[]{tokenizer, countVectorizer, transitionIndexer});
}
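This pipeline ends at a "features"/"label" pair, so a natural follow-on is to append a learner as a fourth stage. A hedged sketch, written as if inside the same method scope; LogisticRegression is an assumed choice here, not something the original code specifies:

// Append a classifier that consumes the columns the earlier stages produce.
LogisticRegression lr = new LogisticRegression()
    .setFeaturesCol("features")
    .setLabelCol("label");

Pipeline trainingPipeline = new Pipeline()
    .setStages(new PipelineStage[]{tokenizer, countVectorizer, transitionIndexer, lr});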