/**
 * Extracts a serializable {@link CountVectorizerModelInfo} snapshot from a fitted
 * Spark {@code CountVectorizerModel}: its minTF setting, vocabulary, and the
 * input/output column names (wrapped as single-element ordered sets).
 *
 * @param from fitted Spark model to export
 * @param df   accompanying DataFrame; not consulted by this export (kept for interface compatibility)
 * @return populated model-info transfer object
 */
@Override public CountVectorizerModelInfo getModelInfo(final CountVectorizerModel from, final DataFrame df) {
    final CountVectorizerModelInfo info = new CountVectorizerModelInfo();
    info.setMinTF(from.getMinTF());
    info.setVocabulary(from.vocabulary());

    // LinkedHashSet preserves insertion order for the (single) column names.
    final Set<String> inputs = new LinkedHashSet<String>();
    inputs.add(from.getInputCol());
    info.setInputKeys(inputs);

    final Set<String> outputs = new LinkedHashSet<String>();
    outputs.add(from.getOutputCol());
    info.setOutputKeys(outputs);

    return info;
}
// Rebuild a CountVectorizerModel from the persisted vocabulary, reading documents from the
// "text" column and writing term-count vectors to "feature", then transform df with it.
// NOTE(review): fragment — `model` and `df` are declared outside this view; confirm their
// types (expected: a vocabulary-bearing model wrapper and a Dataset/DataFrame) in the full file.
CountVectorizerModel cvm = new CountVectorizerModel(model.getVocabulary()).setInputCol("text") .setOutputCol("feature"); Dataset<Row> eventDF = cvm.transform(df);
// NOTE(review): this fragment is syntactically broken as seen here — the builder chain
// starting at `.setTokenize(Boolean.TRUE)` follows a `;`-terminated statement, so its
// receiver (presumably a PMML TextIndex builder) lies outside this view, and the for-loop
// over the vocabulary is cut off mid-body. Verify against the full file before editing.
// Visible intent: look up the DocumentFeature for the transformer's input column, configure
// tokenization/term-weighting from it (BINARY local weights when transformer.getBinary()),
// then iterate the fitted vocabulary terms by index.
CountVectorizerModel transformer = getTransformer(); DocumentFeature documentFeature = (DocumentFeature)encoder.getOnlyFeature(transformer.getInputCol()); .setTokenize(Boolean.TRUE) .setWordSeparatorCharacterRE(documentFeature.getWordSeparatorRE()) .setLocalTermWeights(transformer.getBinary() ? TextIndex.LocalTermWeights.BINARY : null) .setExpression(new FieldRef(termField.getName())); String[] vocabulary = transformer.vocabulary(); for(int i = 0; i < vocabulary.length; i++){ String term = vocabulary[i];
/**
 * Creates a transition-based parser using a MLP transition classifier.
 * Loads the classifier and its preprocessing pipeline from {@code classifierFileName},
 * reads the transition labels from the pipeline's StringIndexerModel (stage 2), and
 * builds a feature-name-to-index map from its CountVectorizerModel vocabulary (stage 1).
 * @param jsc
 * @param classifierFileName
 * @param featureFrame
 */
public TransitionBasedParserMLP(JavaSparkContext jsc, String classifierFileName, FeatureFrame featureFrame) {
    this.featureFrame = featureFrame;
    this.classifier = TransitionClassifier.load(jsc, new Path(classifierFileName, "data").toString());
    this.pipelineModel = PipelineModel.load(new Path(classifierFileName, "pipelineModel").toString());
    this.transitionName = ((StringIndexerModel) pipelineModel.stages()[2]).labels();
    // Map each vocabulary term to its position so feature vectors can be indexed by name.
    String[] vocab = ((CountVectorizerModel) pipelineModel.stages()[1]).vocabulary();
    this.featureMap = new HashMap<String, Integer>();
    for (int idx = 0; idx < vocab.length; idx++) {
        this.featureMap.put(vocab[idx], idx);
    }
}
// NOTE(review): fragment — the leading `.setP(1.0);` continues a builder chain (likely a
// Normalizer) whose receiver is outside this view; `cvm`, `df`, `languageCode`, `model`,
// and `wrapper` are also declared elsewhere. Confirm against the full file.
// Visible intent: vectorize df, keep only ("value", "feature"), and package the trained
// model together with the CountVectorizer vocabulary into an AgePredictModel.
.setP(1.0); Dataset<Row> eventDF = cvm.transform(df).select("value", "feature"); return new AgePredictModel(languageCode, model, cvm.vocabulary(), wrapper);
// Rebuild a CountVectorizerModel from the persisted vocabulary ("text" -> "feature")
// and transform df with it.
// NOTE(review): fragment — `model` and `df` come from the enclosing method, outside this
// view; this snippet also duplicates an identical one elsewhere in the project — consider
// extracting a shared helper.
CountVectorizerModel cvm = new CountVectorizerModel(model.getVocabulary()).setInputCol("text") .setOutputCol("feature"); Dataset<Row> eventDF = cvm.transform(df);
/**
 * Creates a conditional Markov model.
 * Wires up the context extractor for the given Markov order, pulls the tag labels from
 * the pipeline's StringIndexerModel (stage 2), and builds a feature-to-index map from
 * its CountVectorizerModel vocabulary (stage 1).
 * @param pipelineModel
 * @param weights
 * @param markovOrder
 */
public CMMModel(PipelineModel pipelineModel, Vector weights, MarkovOrder markovOrder, Map<String, Set<Integer>> tagDictionary) {
    this.pipelineModel = pipelineModel;
    this.contextExtractor = new ContextExtractor(markovOrder, Constants.REGEXP_FILE);
    this.weights = weights;
    this.tags = ((StringIndexerModel) (pipelineModel.stages()[2])).labels();
    // Index every vocabulary term by position for fast feature lookup during decoding.
    String[] vocab = ((CountVectorizerModel) (pipelineModel.stages()[1])).vocabulary();
    featureMap = new HashMap<String, Integer>();
    for (int idx = 0; idx < vocab.length; idx++) {
        featureMap.put(vocab[idx], idx);
    }
    this.tagDictionary = tagDictionary;
}
/**
 * Exports a fitted Spark {@code CountVectorizerModel} into a serializable
 * {@link CountVectorizerModelInfo}: minTF, vocabulary, and the input/output
 * column names as single-element insertion-ordered sets.
 *
 * @param from fitted Spark model to export
 * @return populated model-info transfer object
 */
@Override public CountVectorizerModelInfo getModelInfo(final CountVectorizerModel from) {
    final CountVectorizerModelInfo info = new CountVectorizerModelInfo();
    info.setMinTF(from.getMinTF());
    info.setVocabulary(from.vocabulary());

    // LinkedHashSet keeps a stable iteration order for the column-name sets.
    final Set<String> inputs = new LinkedHashSet<String>();
    inputs.add(from.getInputCol());
    info.setInputKeys(inputs);

    final Set<String> outputs = new LinkedHashSet<String>();
    outputs.add(from.getOutputCol());
    info.setOutputKeys(outputs);

    return info;
}
// NOTE(review): syntactically broken as seen here — `.setP(1.0);` appears AFTER the `;`
// that terminates the CountVectorizerModel builder chain, so it has no receiver; it likely
// belongs to a different builder (e.g. a Normalizer) elsewhere in the original file.
// Confirm before editing.
// Visible intent: rebuild a CountVectorizerModel over the "context" column, write vectors
// to "feature", transform df, and keep only ("value", "feature").
CountVectorizerModel cvm = new CountVectorizerModel(model.getVocabulary()) .setInputCol("context") .setOutputCol("feature"); .setP(1.0); Dataset<Row> eventDF= cvm.transform(df).select("value", "feature");
// Cap numFeatures at the size of the fitted vocabulary so downstream feature indexing
// cannot run past the vectorizer's term list.
// NOTE(review): fragment — `cvm` and `numFeatures` are declared outside this view.
int vocabSize = cvm.vocabulary().length; numFeatures = Math.min(numFeatures, vocabSize);