public static Normalizer normalizeValFunc(final int min, final int max, final int floor, final int ceiling) {
    return new Normalizer() {
        @Override
        public int normalize(int value) {
            // One plausible implementation of the placeholder body: linearly
            // rescale value from [min, max] onto [floor, ceiling].
            // Assumes max > min; callers must guarantee this to avoid division by zero.
            return floor + (int) (((long) (value - min) * (ceiling - floor)) / (max - min));
        }
    };
}
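A minimal usage sketch, assuming the linear implementation filled in above (the latency-to-score mapping is purely illustrative):

// Hypothetical usage: map raw values in [0, 1000] onto a 0-100 score.
Normalizer scorer = normalizeValFunc(0, 1000, 0, 100);
int score = scorer.normalize(250); // 25 under the linear mapping above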
StringIndexer indexer = new StringIndexer()
        .setInputCol("userId")
        .setOutputCol("userIdMapped");
Dataset<Row> userJoinedDataSet = indexer.fit(feedbackDS).transform(feedbackDS);
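If the original userId values are needed downstream, the fitted StringIndexerModel can be kept and the mapping reversed with IndexToString; a minimal sketch, with the output column name "userIdOriginal" assumed:

StringIndexerModel model = indexer.fit(feedbackDS);
Dataset<Row> indexed = model.transform(feedbackDS);

// Recover the original userId strings from the numeric indices,
// using the labels learned by the fitted model.
IndexToString converter = new IndexToString()
        .setInputCol("userIdMapped")
        .setOutputCol("userIdOriginal")
        .setLabels(model.labels());
Dataset<Row> restored = converter.transform(indexed);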
@Override
public void execute() {
    // Prepare the input dataset. Left as a placeholder here: it must be a
    // Dataset with a Vector column named by featureName before PCA can be fit.
    Dataset<Row> dataset = null;

    // Fit PCA and project the features onto the top kValue principal components.
    PCAModel pca = new PCA()
            .setInputCol(featureName)
            .setOutputCol("pca")
            .setK(kValue)
            .fit(dataset);

    Dataset<Row> result = pca.transform(dataset).select("pca");
    result.show(false);
}
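One way the placeholder dataset could be prepared is with a VectorAssembler; a sketch under the assumption that the raw data has numeric columns (the column names and the rawDataset variable below are hypothetical):

// Assemble numeric columns into the single Vector column that PCA expects.
VectorAssembler assembler = new VectorAssembler()
        .setInputCols(new String[]{"x1", "x2", "x3"}) // hypothetical column names
        .setOutputCol(featureName);
Dataset<Row> dataset = assembler.transform(rawDataset);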
/**
 * Creates a processing pipeline.
 * @return a pipeline
 */
protected Pipeline createPipeline() {
    Tokenizer tokenizer = new Tokenizer()
            .setInputCol("text")
            .setOutputCol("tokens");
    CountVectorizer countVectorizer = new CountVectorizer()
            .setInputCol("tokens")
            .setOutputCol("features")
            .setMinDF((Double) params.getOrDefault(params.getMinFF()))
            .setVocabSize((Integer) params.getOrDefault(params.getNumFeatures()));
    StringIndexer transitionIndexer = new StringIndexer()
            .setInputCol("transition")
            .setOutputCol("label");
    Pipeline pipeline = new Pipeline().setStages(
            new PipelineStage[]{tokenizer, countVectorizer, transitionIndexer});
    return pipeline;
}
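A usage sketch for the pipeline above, assuming Datasets with "text" and "transition" columns are available (trainingData and testData are hypothetical variable names):

// Fit all three stages on the training data, then apply the fitted
// pipeline to held-out data in a single call.
Pipeline pipeline = createPipeline();
PipelineModel model = pipeline.fit(trainingData);
Dataset<Row> transformed = model.transform(testData);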
@Test
public void standardScaler() {
    // The tests are to check Java compatibility.
    List<VectorIndexerSuite.FeatureData> points = Arrays.asList(
        new VectorIndexerSuite.FeatureData(Vectors.dense(0.0, -2.0)),
        new VectorIndexerSuite.FeatureData(Vectors.dense(1.0, 3.0)),
        new VectorIndexerSuite.FeatureData(Vectors.dense(1.0, 4.0))
    );
    Dataset<Row> dataFrame = spark.createDataFrame(
        jsc.parallelize(points, 2), VectorIndexerSuite.FeatureData.class);
    StandardScaler scaler = new StandardScaler()
        .setInputCol("features")
        .setOutputCol("scaledFeatures")
        .setWithStd(true)
        .setWithMean(false);
    // Compute summary statistics by fitting the StandardScaler.
    StandardScalerModel scalerModel = scaler.fit(dataFrame);
    // Normalize each feature to have unit standard deviation.
    Dataset<Row> scaledData = scalerModel.transform(dataFrame);
    scaledData.count();
}
@Test
public void testStringIndexer() {
    StructType schema = createStructType(new StructField[]{
        createStructField("id", IntegerType, false),
        createStructField("label", StringType, false)
    });
    List<Row> data = Arrays.asList(
        cr(0, "a"), cr(1, "b"), cr(2, "c"), cr(3, "a"), cr(4, "a"), cr(5, "c"));
    Dataset<Row> dataset = spark.createDataFrame(data, schema);

    StringIndexer indexer = new StringIndexer()
        .setInputCol("label")
        .setOutputCol("labelIndex");
    Dataset<Row> output = indexer.fit(dataset).transform(dataset);

    // Indices are assigned by descending label frequency:
    // "a" (3 occurrences) -> 0.0, "c" (2) -> 1.0, "b" (1) -> 2.0.
    Assert.assertEquals(
        Arrays.asList(cr(0, 0.0), cr(1, 2.0), cr(2, 1.0), cr(3, 0.0), cr(4, 0.0), cr(5, 1.0)),
        output.orderBy("id").select("id", "labelIndex").collectAsList());
}
@Override
public BucketizerModelInfo getModelInfo(final Bucketizer from, final DataFrame df) {
    final BucketizerModelInfo modelInfo = new BucketizerModelInfo();
    modelInfo.setSplits(from.getSplits());

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add(from.getInputCol());
    modelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    modelInfo.setOutputKeys(outputKeys);

    return modelInfo;
}
@Override
public ChiSqSelectorModelInfo getModelInfo(final ChiSqSelectorModel from, DataFrame df) {
    ChiSqSelectorModelInfo modelInfo = new ChiSqSelectorModelInfo();
    modelInfo.setSelectedFeatures(from.selectedFeatures());

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add(from.getFeaturesCol());
    modelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    modelInfo.setOutputKeys(outputKeys);

    return modelInfo;
}
/**
 * Creates a conditional Markov model.
 * @param pipelineModel a fitted preprocessing pipeline (tokenizer, count vectorizer, string indexer)
 * @param weights the learned feature weights
 * @param markovOrder the order of the Markov dependency
 * @param tagDictionary a map from words to the set of tag indices they may take
 */
public CMMModel(PipelineModel pipelineModel, Vector weights, MarkovOrder markovOrder,
        Map<String, Set<Integer>> tagDictionary) {
    this.pipelineModel = pipelineModel;
    this.contextExtractor = new ContextExtractor(markovOrder, Constants.REGEXP_FILE);
    this.weights = weights;
    // Stage 2 of the pipeline is the StringIndexerModel holding the tag labels.
    this.tags = ((StringIndexerModel) pipelineModel.stages()[2]).labels();
    // Stage 1 is the CountVectorizerModel; build a feature-name -> index lookup
    // from its vocabulary.
    String[] features = ((CountVectorizerModel) pipelineModel.stages()[1]).vocabulary();
    featureMap = new HashMap<String, Integer>();
    for (int j = 0; j < features.length; j++) {
        featureMap.put(features[j], j);
    }
    this.tagDictionary = tagDictionary;
}
// MLlib's Word2Vec is trained on an RDD of word sequences, so read the
// input file and split each line on spaces.
JavaRDD<String> lines = spark.read().textFile(Input_file_path).toJavaRDD();
JavaRDD<Iterable<String>> words_iterable = lines.map(new Function<String, Iterable<String>>() {
    public Iterable<String> call(String s) throws Exception {
        String[] words = s.split(" ");
        return Arrays.asList(words);
    }
});

Word2Vec word2vec = new Word2Vec();
Word2VecModel word2vecmodel = word2vec.fit(words_iterable);
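Once fit, the mllib Word2VecModel can be queried for nearest neighbors in the learned vector space; a small follow-on sketch (the query word "example" is a placeholder and must occur in the training vocabulary; requires scala.Tuple2):

// Print the 5 words closest to the query word, with their similarity scores.
Tuple2<String, Object>[] synonyms = word2vecmodel.findSynonyms("example", 5);
for (Tuple2<String, Object> synonym : synonyms) {
    System.out.println(synonym._1() + " : " + synonym._2());
}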
@Override
public HashingTFModelInfo getModelInfo(final HashingTF from) {
    final HashingTFModelInfo modelInfo = new HashingTFModelInfo();
    modelInfo.setNumFeatures(from.getNumFeatures());

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add(from.getInputCol());
    modelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    modelInfo.setOutputKeys(outputKeys);

    return modelInfo;
}
@Override
public List<Feature> encodeFeatures(SparkMLEncoder encoder) {
    VectorSlicer transformer = getTransformer();

    // Only index-based slicing is supported; reject slicers configured by name.
    String[] names = transformer.getNames();
    if (names != null && names.length > 0) {
        throw new IllegalArgumentException("Expected index mode, got name mode");
    }

    return encoder.getFeatures(transformer.getInputCol(), transformer.getIndices());
}
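For reference, a slicer that passes the index-mode check above would be configured with setIndices rather than setNames; a minimal sketch with assumed column names:

// Select elements 0 and 2 of the input vector column.
VectorSlicer slicer = new VectorSlicer()
        .setInputCol("features")
        .setOutputCol("selectedFeatures")
        .setIndices(new int[]{0, 2});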
@Override
public List<Feature> encodeFeatures(SparkMLEncoder encoder) {
    NGram transformer = getTransformer();

    DocumentFeature documentFeature =
        (DocumentFeature) encoder.getOnlyFeature(transformer.getInputCol());

    return Collections.singletonList(documentFeature);
}
/**
 * Creates a processing pipeline.
 * @return a pipeline
 */
private Pipeline createPipeline() {
    Tokenizer tokenizer = new Tokenizer()
            .setInputCol("featureStrings")
            .setOutputCol("tokens");
    CountVectorizer countVectorizer = new CountVectorizer()
            .setInputCol("tokens")
            .setOutputCol("features")
            .setMinDF((Double) params.getOrDefault(params.getMinFF()))
            .setVocabSize((Integer) params.getOrDefault(params.getNumFeatures()));
    StringIndexer tagIndexer = new StringIndexer()
            .setInputCol("tag")
            .setOutputCol("label");
    Pipeline pipeline = new Pipeline().setStages(
            new PipelineStage[]{tokenizer, countVectorizer, tagIndexer});
    return pipeline;
}
@Override
public BucketizerModelInfo getModelInfo(final Bucketizer from) {
    final BucketizerModelInfo modelInfo = new BucketizerModelInfo();
    modelInfo.setSplits(from.getSplits());

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add(from.getInputCol());
    modelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    modelInfo.setOutputKeys(outputKeys);

    return modelInfo;
}
@Override
public ChiSqSelectorModelInfo getModelInfo(final ChiSqSelectorModel from) {
    ChiSqSelectorModelInfo modelInfo = new ChiSqSelectorModelInfo();
    modelInfo.setSelectedFeatures(from.selectedFeatures());

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add(from.getFeaturesCol());
    modelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    modelInfo.setOutputKeys(outputKeys);

    return modelInfo;
}
@Override
public HashingTFModelInfo getModelInfo(final HashingTF from, DataFrame df) {
    final HashingTFModelInfo modelInfo = new HashingTFModelInfo();
    modelInfo.setNumFeatures(from.getNumFeatures());

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add(from.getInputCol());
    modelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    modelInfo.setOutputKeys(outputKeys);

    return modelInfo;
}