StringIndexer indexer = new StringIndexer() .setInputCol("userId") .setOutputCol("userIdMapped"); Dataset<Row> userJoinedDataSet = indexer.fit(feedbackDS).transform(feedbackDS);
/**
 * Fits a {@link StringIndexer} on the "label" column of a six-row dataset and
 * checks the "labelIndex" value produced for every row.
 *
 * <p>The fixture holds "a" three times, "c" twice and "b" once; the asserted
 * indices are a = 0.0, c = 1.0 and b = 2.0.
 */
@Test
public void testStringIndexer() {
  // Two non-nullable columns: an integer id and a string label.
  StructField[] columns = new StructField[]{
    createStructField("id", IntegerType, false),
    createStructField("label", StringType, false)
  };
  StructType rowSchema = createStructType(columns);

  List<Row> inputRows = Arrays.asList(
    cr(0, "a"),
    cr(1, "b"),
    cr(2, "c"),
    cr(3, "a"),
    cr(4, "a"),
    cr(5, "c"));
  Dataset<Row> input = spark.createDataFrame(inputRows, rowSchema);

  StringIndexer labelIndexer = new StringIndexer();
  labelIndexer.setInputCol("label");
  labelIndexer.setOutputCol("labelIndex");

  Dataset<Row> indexed = labelIndexer.fit(input).transform(input);

  List<Row> expectedRows = Arrays.asList(
    cr(0, 0.0),
    cr(1, 2.0),
    cr(2, 1.0),
    cr(3, 0.0),
    cr(4, 0.0),
    cr(5, 1.0));
  // Sort by id so the comparison does not depend on the order rows come back in.
  List<Row> actualRows = indexed
    .orderBy("id")
    .select("id", "labelIndex")
    .collectAsList();

  Assert.assertEquals(expectedRows, actualRows);
}
/**
 * Fits a {@link StringIndexer} on the "label" column of a six-row dataset and
 * checks the "labelIndex" value produced for every row.
 *
 * <p>The fixture holds "a" three times, "c" twice and "b" once; the asserted
 * indices are a = 0.0, c = 1.0 and b = 2.0.
 */
@Test
public void testStringIndexer() {
  // Two non-nullable columns: an integer id and a string label.
  StructField[] columns = new StructField[]{
    createStructField("id", IntegerType, false),
    createStructField("label", StringType, false)
  };
  StructType rowSchema = createStructType(columns);

  List<Row> inputRows = Arrays.asList(
    cr(0, "a"),
    cr(1, "b"),
    cr(2, "c"),
    cr(3, "a"),
    cr(4, "a"),
    cr(5, "c"));
  Dataset<Row> input = spark.createDataFrame(inputRows, rowSchema);

  StringIndexer labelIndexer = new StringIndexer();
  labelIndexer.setInputCol("label");
  labelIndexer.setOutputCol("labelIndex");

  Dataset<Row> indexed = labelIndexer.fit(input).transform(input);

  List<Row> expectedRows = Arrays.asList(
    cr(0, 0.0),
    cr(1, 2.0),
    cr(2, 1.0),
    cr(3, 0.0),
    cr(4, 0.0),
    cr(5, 1.0));
  // Sort by id so the comparison does not depend on the order rows come back in.
  List<Row> actualRows = indexed
    .orderBy("id")
    .select("id", "labelIndex")
    .collectAsList();

  Assert.assertEquals(expectedRows, actualRows);
}
/**
 * Fits a {@link StringIndexer} on the "label" column of a six-row dataset and
 * checks the "labelIndex" value produced for every row.
 *
 * <p>The fixture holds "a" three times, "c" twice and "b" once; the asserted
 * indices are a = 0.0, c = 1.0 and b = 2.0.
 */
@Test
public void testStringIndexer() {
  // Two non-nullable columns: an integer id and a string label.
  StructField[] columns = new StructField[]{
    createStructField("id", IntegerType, false),
    createStructField("label", StringType, false)
  };
  StructType rowSchema = createStructType(columns);

  List<Row> inputRows = Arrays.asList(
    cr(0, "a"),
    cr(1, "b"),
    cr(2, "c"),
    cr(3, "a"),
    cr(4, "a"),
    cr(5, "c"));
  Dataset<Row> input = spark.createDataFrame(inputRows, rowSchema);

  StringIndexer labelIndexer = new StringIndexer();
  labelIndexer.setInputCol("label");
  labelIndexer.setOutputCol("labelIndex");

  Dataset<Row> indexed = labelIndexer.fit(input).transform(input);

  List<Row> expectedRows = Arrays.asList(
    cr(0, 0.0),
    cr(1, 2.0),
    cr(2, 1.0),
    cr(3, 0.0),
    cr(4, 0.0),
    cr(5, 1.0));
  // Sort by id so the comparison does not depend on the order rows come back in.
  List<Row> actualRows = indexed
    .orderBy("id")
    .select("id", "labelIndex")
    .collectAsList();

  Assert.assertEquals(expectedRows, actualRows);
}