/**
 * Builds the exportable description of a {@code StringIndexerModel}.
 *
 * <p>Every entry of {@code from.labels()} is mapped to its array position (stored as a
 * {@code Double}), and the model's input and output column names are recorded as the only
 * input key and output key respectively.
 *
 * @param from the string indexer model to describe
 * @return a new {@code StringIndexerModelInfo} carrying the label-to-index map and column keys
 */
@Override
public StringIndexerModelInfo getModelInfo(final StringIndexerModel from) {
    // Walk the labels in array order so each label receives its position as index.
    final Map<String, Double> indexByLabel = new HashMap<String, Double>();
    int position = 0;
    for (final String label : from.labels()) {
        indexByLabel.put(label, (double) position);
        position++;
    }

    final Set<String> inputColumns = new LinkedHashSet<String>();
    inputColumns.add(from.getInputCol());

    final Set<String> outputColumns = new LinkedHashSet<String>();
    outputColumns.add(from.getOutputCol());

    final StringIndexerModelInfo info = new StringIndexerModelInfo();
    info.setLabelToIndex(indexByLabel);
    info.setInputKeys(inputColumns);
    info.setOutputKeys(outputColumns);
    return info;
}
// Fragment of a larger method body: fetches the StringIndexerModel via getTransformer(), asks
// the encoder for the Feature associated with the model's input column, appends every label of
// the model to `categories`, and reads the model's handleInvalid setting into a local.
// NOTE(review): `encoder` and `categories` are declared outside this fragment -- confirm their
// types and that `categories` is a mutable collection of String before relying on addAll here.
StringIndexerModel transformer = getTransformer(); Feature feature = encoder.getOnlyFeature(transformer.getInputCol()); categories.addAll(Arrays.asList(transformer.labels())); String handleInvalid = transformer.getHandleInvalid();
/**
 * Creates a transition-based parser backed by an MLP transition classifier.
 *
 * <p>The classifier is loaded from the {@code data} child of {@code classifierFileName} and the
 * pipeline model from its {@code pipelineModel} child. Transition names are taken from the
 * labels of pipeline stage 2 (cast to {@code StringIndexerModel}); the feature map assigns each
 * vocabulary entry of pipeline stage 1 (cast to {@code CountVectorizerModel}) its position in
 * the vocabulary array.
 *
 * @param jsc the Spark context passed to {@code TransitionClassifier.load}
 * @param classifierFileName parent path that contains the {@code data} and
 *        {@code pipelineModel} children
 * @param featureFrame the feature frame stored on this parser
 */
public TransitionBasedParserMLP(JavaSparkContext jsc, String classifierFileName, FeatureFrame featureFrame) {
    this.featureFrame = featureFrame;

    // Load order is kept as classifier first, pipeline second.
    String classifierDataPath = new Path(classifierFileName, "data").toString();
    this.classifier = TransitionClassifier.load(jsc, classifierDataPath);

    String pipelinePath = new Path(classifierFileName, "pipelineModel").toString();
    this.pipelineModel = PipelineModel.load(pipelinePath);

    StringIndexerModel labelIndexer = (StringIndexerModel) pipelineModel.stages()[2];
    this.transitionName = labelIndexer.labels();

    CountVectorizerModel vectorizer = (CountVectorizerModel) pipelineModel.stages()[1];
    String[] vocabulary = vectorizer.vocabulary();

    // Map every vocabulary term to its position in the vocabulary array.
    HashMap<String, Integer> indexByFeature = new HashMap<String, Integer>();
    int position = 0;
    for (String term : vocabulary) {
        indexByFeature.put(term, position);
        position++;
    }
    this.featureMap = indexByFeature;
}
/**
 * Fits a {@code StringIndexer} on a small string column and verifies the produced indices.
 *
 * <p>The input contains "a" three times, "c" twice and "b" once; the expected output assigns
 * 0.0 to "a", 1.0 to "c" and 2.0 to "b".
 */
@Test
public void testStringIndexer() {
  StructField[] fields = new StructField[]{
    createStructField("id", IntegerType, false),
    createStructField("label", StringType, false)
  };
  StructType rowSchema = createStructType(fields);

  List<Row> rows = Arrays.asList(
    cr(0, "a"), cr(1, "b"), cr(2, "c"), cr(3, "a"), cr(4, "a"), cr(5, "c"));
  Dataset<Row> input = spark.createDataFrame(rows, rowSchema);

  StringIndexer indexer = new StringIndexer();
  indexer.setInputCol("label").setOutputCol("labelIndex");
  Dataset<Row> indexed = indexer.fit(input).transform(input);

  List<Row> expected = Arrays.asList(
    cr(0, 0.0), cr(1, 2.0), cr(2, 1.0), cr(3, 0.0), cr(4, 0.0), cr(5, 1.0));
  // Sort by id so the comparison does not depend on the collection order.
  List<Row> actual = indexed.orderBy("id").select("id", "labelIndex").collectAsList();
  Assert.assertEquals(expected, actual);
}
/**
 * Creates a conditional Markov model.
 *
 * <p>Tags are read from the labels of pipeline stage 2 (cast to {@code StringIndexerModel}),
 * and the feature map assigns each vocabulary entry of pipeline stage 1 (cast to
 * {@code CountVectorizerModel}) its position in the vocabulary array. The context extractor is
 * built from {@code markovOrder} and {@code Constants.REGEXP_FILE}.
 *
 * @param pipelineModel the pipeline whose stages 1 and 2 supply the vocabulary and the tags
 * @param weights the weight vector stored on this model
 * @param markovOrder the Markov order handed to the context extractor
 * @param tagDictionary the tag dictionary stored on this model
 */
public CMMModel(PipelineModel pipelineModel, Vector weights, MarkovOrder markovOrder, Map<String, Set<Integer>> tagDictionary) {
    this.pipelineModel = pipelineModel;
    this.contextExtractor = new ContextExtractor(markovOrder, Constants.REGEXP_FILE);
    this.weights = weights;

    StringIndexerModel tagIndexer = (StringIndexerModel) pipelineModel.stages()[2];
    this.tags = tagIndexer.labels();

    CountVectorizerModel vectorizer = (CountVectorizerModel) pipelineModel.stages()[1];
    String[] vocabulary = vectorizer.vocabulary();

    // Map every vocabulary term to its position in the vocabulary array.
    HashMap<String, Integer> indexByFeature = new HashMap<String, Integer>();
    int position = 0;
    for (String term : vocabulary) {
        indexByFeature.put(term, position);
        position++;
    }
    this.featureMap = indexByFeature;

    this.tagDictionary = tagDictionary;
}
/**
 * Checks that {@code StringIndexer} turns the "label" column into the expected "labelIndex"
 * values.
 *
 * <p>Six rows are indexed ("a" x3, "c" x2, "b" x1) and the result, ordered by id, must equal
 * the rows listed in {@code expected}: "a" becomes 0.0, "c" becomes 1.0 and "b" becomes 2.0.
 */
@Test
public void testStringIndexer() {
  StructField idField = createStructField("id", IntegerType, false);
  StructField labelField = createStructField("label", StringType, false);
  StructType schema = createStructType(new StructField[]{idField, labelField});

  List<Row> inputRows = Arrays.asList(
    cr(0, "a"),
    cr(1, "b"),
    cr(2, "c"),
    cr(3, "a"),
    cr(4, "a"),
    cr(5, "c"));
  Dataset<Row> frame = spark.createDataFrame(inputRows, schema);

  StringIndexer indexer = new StringIndexer()
    .setInputCol("label")
    .setOutputCol("labelIndex");
  Dataset<Row> transformed = indexer.fit(frame).transform(frame);

  List<Row> expected = Arrays.asList(
    cr(0, 0.0),
    cr(1, 2.0),
    cr(2, 1.0),
    cr(3, 0.0),
    cr(4, 0.0),
    cr(5, 1.0));
  Dataset<Row> projected = transformed.orderBy("id").select("id", "labelIndex");
  Assert.assertEquals(expected, projected.collectAsList());
}
/**
 * Builds the exportable description of a {@code StringIndexerModel}.
 *
 * <p>Every entry of {@code from.labels()} is mapped to its array position (stored as a
 * {@code Double}), and the model's input and output column names are recorded as the only
 * input key and output key respectively.
 *
 * @param from the string indexer model to describe
 * @param df part of the overridden signature; this implementation does not read it
 * @return a new {@code StringIndexerModelInfo} carrying the label-to-index map and column keys
 */
@Override
public StringIndexerModelInfo getModelInfo(final StringIndexerModel from, DataFrame df) {
    final StringIndexerModelInfo info = new StringIndexerModelInfo();

    // Walk the labels in array order so each label receives its position as index.
    final Map<String, Double> indexByLabel = new HashMap<String, Double>();
    int position = 0;
    for (final String label : from.labels()) {
        indexByLabel.put(label, (double) position);
        position++;
    }
    info.setLabelToIndex(indexByLabel);

    final Set<String> inputColumns = new LinkedHashSet<String>();
    inputColumns.add(from.getInputCol());
    info.setInputKeys(inputColumns);

    final Set<String> outputColumns = new LinkedHashSet<String>();
    outputColumns.add(from.getOutputCol());
    info.setOutputKeys(outputColumns);

    return info;
}
/**
 * End-to-end check of {@code StringIndexer}: fit on a six-row dataset, transform the same
 * dataset, and compare the (id, labelIndex) pairs against the expected rows.
 *
 * <p>With "a" occurring three times, "c" twice and "b" once, the test expects the indices
 * 0.0, 1.0 and 2.0 for "a", "c" and "b".
 */
@Test
public void testStringIndexer() {
  StructType schema = createStructType(new StructField[]{
    createStructField("id", IntegerType, false),
    createStructField("label", StringType, false)
  });
  List<Row> source = Arrays.asList(
    cr(0, "a"), cr(1, "b"), cr(2, "c"),
    cr(3, "a"), cr(4, "a"), cr(5, "c"));
  Dataset<Row> dataset = spark.createDataFrame(source, schema);

  StringIndexer labelIndexer = new StringIndexer();
  labelIndexer.setInputCol("label");
  labelIndexer.setOutputCol("labelIndex");

  Dataset<Row> result = labelIndexer.fit(dataset).transform(dataset);

  List<Row> expectedRows = Arrays.asList(
    cr(0, 0.0), cr(1, 2.0), cr(2, 1.0),
    cr(3, 0.0), cr(4, 0.0), cr(5, 1.0));
  // Ordering by id makes the collected rows line up with expectedRows.
  List<Row> actualRows = result.orderBy("id").select("id", "labelIndex").collectAsList();
  Assert.assertEquals(expectedRows, actualRows);
}