org.apache.spark.ml.feature.StringIndexerModel java code examples

/**
 * Builds a {@code StringIndexerModelInfo} describing the given fitted model.
 *
 * <p>Every entry of {@code from.labels()} is mapped to its array position (stored as a
 * {@code Double}); the model's input and output column names are recorded as the
 * single-element input and output key sets.
 *
 * @param from the fitted string indexer model to describe
 * @return a newly created, populated model info object
 */
@Override
public StringIndexerModelInfo getModelInfo(final StringIndexerModel from) {
  // Label -> position in the model's label array.
  final Map<String, Double> indexByLabel = new HashMap<String, Double>();
  int position = 0;
  for (final String label : from.labels()) {
    indexByLabel.put(label, Double.valueOf(position));
    position++;
  }

  final StringIndexerModelInfo info = new StringIndexerModelInfo();
  info.setLabelToIndex(indexByLabel);

  final Set<String> inputs = new LinkedHashSet<String>();
  inputs.add(from.getInputCol());
  info.setInputKeys(inputs);

  final Set<String> outputs = new LinkedHashSet<String>();
  outputs.add(from.getOutputCol());
  info.setOutputKeys(outputs);

  return info;
}

// Fitted string indexer model returned by getTransformer().
StringIndexerModel transformer = getTransformer();
// Look up the feature associated with the transformer's input column.
// NOTE(review): getOnlyFeature presumably requires exactly one feature for that column — confirm.
Feature feature = encoder.getOnlyFeature(transformer.getInputCol());
// Append every label of the model to `categories` (declared outside this excerpt).
categories.addAll(Arrays.asList(transformer.labels()));
// Invalid-value handling setting read from the transformer; not used within this excerpt.
String handleInvalid = transformer.getHandleInvalid();

/**
 * Creates a transition-based parser backed by an MLP transition classifier.
 *
 * <p>The classifier is loaded from the {@code "data"} child of {@code classifierFileName}
 * and the pipeline model from its {@code "pipelineModel"} child. Transition names are the
 * labels of pipeline stage 2 (cast to {@code StringIndexerModel}); the feature map assigns
 * each vocabulary entry of pipeline stage 1 (cast to {@code CountVectorizerModel}) its
 * position in the vocabulary.
 *
 * @param jsc the Spark context passed to {@code TransitionClassifier.load}
 * @param classifierFileName parent path holding the {@code "data"} and
 *     {@code "pipelineModel"} children
 * @param featureFrame the feature frame stored on this parser
 */
public TransitionBasedParserMLP(JavaSparkContext jsc, String classifierFileName, FeatureFrame featureFrame) {
  this.featureFrame = featureFrame;

  final String classifierPath = new Path(classifierFileName, "data").toString();
  this.classifier = TransitionClassifier.load(jsc, classifierPath);

  final String pipelinePath = new Path(classifierFileName, "pipelineModel").toString();
  this.pipelineModel = PipelineModel.load(pipelinePath);

  // Stage 2 holds the label indexer, stage 1 the count vectorizer.
  final StringIndexerModel labelIndexer = (StringIndexerModel) this.pipelineModel.stages()[2];
  this.transitionName = labelIndexer.labels();

  final CountVectorizerModel vectorizer = (CountVectorizerModel) this.pipelineModel.stages()[1];
  final String[] vocabulary = vectorizer.vocabulary();

  // Feature string -> position in the vocabulary.
  this.featureMap = new HashMap<String, Integer>();
  int position = 0;
  for (final String term : vocabulary) {
    this.featureMap.put(term, position);
    position++;
  }
}

/**
 * Fits a StringIndexer on a six-row "label" column and verifies the generated
 * "labelIndex" values: "a" (three rows) maps to 0.0, "c" (two rows) to 1.0 and
 * "b" (one row) to 2.0.
 */
@Test
public void testStringIndexer() {
 StructField idField = createStructField("id", IntegerType, false);
 StructField labelField = createStructField("label", StringType, false);
 StructType schema = createStructType(new StructField[]{idField, labelField});

 List<Row> rows = Arrays.asList(
  cr(0, "a"), cr(1, "b"), cr(2, "c"), cr(3, "a"), cr(4, "a"), cr(5, "c"));
 Dataset<Row> input = spark.createDataFrame(rows, schema);

 StringIndexer indexer = new StringIndexer()
  .setInputCol("label")
  .setOutputCol("labelIndex");
 Dataset<Row> indexed = indexer.fit(input).transform(input);

 // Rows are compared in "id" order so the result does not depend on partition ordering.
 List<Row> expected = Arrays.asList(
  cr(0, 0.0), cr(1, 2.0), cr(2, 1.0), cr(3, 0.0), cr(4, 0.0), cr(5, 1.0));
 List<Row> actual = indexed.orderBy("id").select("id", "labelIndex").collectAsList();
 Assert.assertEquals(expected, actual);
}

/**
 * Creates a conditional Markov model.
 *
 * <p>Tags are the labels of pipeline stage 2 (cast to {@code StringIndexerModel}); the
 * feature map assigns each vocabulary entry of pipeline stage 1 (cast to
 * {@code CountVectorizerModel}) its position in the vocabulary. A context extractor is
 * created from {@code markovOrder} and {@code Constants.REGEXP_FILE}.
 *
 * @param pipelineModel the pipeline model whose stages supply the tags and the vocabulary
 * @param weights the weight vector stored on this model
 * @param markovOrder the Markov order handed to the context extractor
 * @param tagDictionary the tag dictionary stored on this model
 */
public CMMModel(PipelineModel pipelineModel, Vector weights, MarkovOrder markovOrder, Map<String, Set<Integer>> tagDictionary) {
  this.pipelineModel = pipelineModel;
  this.weights = weights;
  this.tagDictionary = tagDictionary;
  this.contextExtractor = new ContextExtractor(markovOrder, Constants.REGEXP_FILE);

  // Stage 2 holds the label indexer, stage 1 the count vectorizer.
  final StringIndexerModel tagIndexer = (StringIndexerModel) pipelineModel.stages()[2];
  this.tags = tagIndexer.labels();

  final CountVectorizerModel vectorizer = (CountVectorizerModel) pipelineModel.stages()[1];
  final String[] vocabulary = vectorizer.vocabulary();

  // Feature string -> position in the vocabulary.
  this.featureMap = new HashMap<String, Integer>();
  int position = 0;
  for (final String term : vocabulary) {
    this.featureMap.put(term, position);
    position++;
  }
}

/**
 * Fits a StringIndexer on a six-row "label" column and verifies the generated
 * "labelIndex" values: "a" (three rows) maps to 0.0, "c" (two rows) to 1.0 and
 * "b" (one row) to 2.0.
 */
@Test
public void testStringIndexer() {
 StructField idField = createStructField("id", IntegerType, false);
 StructField labelField = createStructField("label", StringType, false);
 StructType schema = createStructType(new StructField[]{idField, labelField});

 List<Row> rows = Arrays.asList(
  cr(0, "a"), cr(1, "b"), cr(2, "c"), cr(3, "a"), cr(4, "a"), cr(5, "c"));
 Dataset<Row> input = spark.createDataFrame(rows, schema);

 StringIndexer indexer = new StringIndexer()
  .setInputCol("label")
  .setOutputCol("labelIndex");
 Dataset<Row> indexed = indexer.fit(input).transform(input);

 // Rows are compared in "id" order so the result does not depend on partition ordering.
 List<Row> expected = Arrays.asList(
  cr(0, 0.0), cr(1, 2.0), cr(2, 1.0), cr(3, 0.0), cr(4, 0.0), cr(5, 1.0));
 List<Row> actual = indexed.orderBy("id").select("id", "labelIndex").collectAsList();
 Assert.assertEquals(expected, actual);
}

/**
 * Builds a {@code StringIndexerModelInfo} describing the given fitted model.
 *
 * <p>Every entry of {@code from.labels()} is mapped to its array position (stored as a
 * {@code Double}); the model's input and output column names are recorded as the
 * single-element input and output key sets.
 *
 * @param from the fitted string indexer model to describe
 * @param df a data frame accepted by the signature; this implementation does not read it
 * @return a newly created, populated model info object
 */
@Override
public StringIndexerModelInfo getModelInfo(final StringIndexerModel from, DataFrame df) {
  // Label -> position in the model's label array.
  final Map<String, Double> indexByLabel = new HashMap<String, Double>();
  int position = 0;
  for (final String label : from.labels()) {
    indexByLabel.put(label, Double.valueOf(position));
    position++;
  }

  final StringIndexerModelInfo info = new StringIndexerModelInfo();
  info.setLabelToIndex(indexByLabel);

  final Set<String> inputs = new LinkedHashSet<String>();
  inputs.add(from.getInputCol());
  info.setInputKeys(inputs);

  final Set<String> outputs = new LinkedHashSet<String>();
  outputs.add(from.getOutputCol());
  info.setOutputKeys(outputs);

  return info;
}

/**
 * Fits a StringIndexer on a six-row "label" column and verifies the generated
 * "labelIndex" values: "a" (three rows) maps to 0.0, "c" (two rows) to 1.0 and
 * "b" (one row) to 2.0.
 */
@Test
public void testStringIndexer() {
 StructField idField = createStructField("id", IntegerType, false);
 StructField labelField = createStructField("label", StringType, false);
 StructType schema = createStructType(new StructField[]{idField, labelField});

 List<Row> rows = Arrays.asList(
  cr(0, "a"), cr(1, "b"), cr(2, "c"), cr(3, "a"), cr(4, "a"), cr(5, "c"));
 Dataset<Row> input = spark.createDataFrame(rows, schema);

 StringIndexer indexer = new StringIndexer()
  .setInputCol("label")
  .setOutputCol("labelIndex");
 Dataset<Row> indexed = indexer.fit(input).transform(input);

 // Rows are compared in "id" order so the result does not depend on partition ordering.
 List<Row> expected = Arrays.asList(
  cr(0, 0.0), cr(1, 2.0), cr(2, 1.0), cr(3, 0.0), cr(4, 0.0), cr(5, 1.0));
 List<Row> actual = indexed.orderBy("id").select("id", "labelIndex").collectAsList();
 Assert.assertEquals(expected, actual);
}

Most used methods

Popular in Java

Reading from database using SQL prepared statement
getSupportFragmentManager (FragmentActivity)
getContentResolver (Context)
compareTo (BigDecimal)
InputStream (java.io)
A readable source of bytes. Most clients will use input streams that read data from the file system (
SimpleDateFormat (java.text)
Formats and parses dates in a locale-sensitive manner. Formatting turns a Date into a String, and pa
Locale (java.util)
Locale represents a language/country/variant combination. Locales are used to alter the presentatio
Scanner (java.util)
A parser that parses a text string of primitive types and strings with the help of regular expressio
Handler (java.util.logging)
A Handler object accepts a logging request and exports the desired messages to a target, for example
Component (java.awt)
A component is an object having a graphical representation that can be displayed on the screen and t
GitHub Copilot alternatives

How to use StringIndexerModel in org.apache.spark.ml.feature

Best Java code snippets using org.apache.spark.ml.feature.StringIndexerModel (Showing top 8 results out of 315)

How to use
StringIndexerModel
in
org.apache.spark.ml.feature