/**
 * Turns each tokenized record into a dense feature {@link Vector}.
 *
 * <p>Feature values come from {@code KMeansUtils.featuresFromTokens}, called with the
 * record's tokens and {@code inputSchema}. If that conversion raises a
 * {@link NumberFormatException} or an {@link ArrayIndexOutOfBoundsException}, the
 * offending tokens are logged at WARN level and the same exception is rethrown.
 *
 * @param parsedRDD tokenized input records, one {@code String[]} per record
 * @return the result of mapping every record to its dense vector
 */
private JavaRDD<Vector> parsedToVectorRDD(JavaRDD<String[]> parsedRDD) {
  return parsedRDD.map(tokens -> {
    try {
      return Vectors.dense(KMeansUtils.featuresFromTokens(tokens, inputSchema));
    } catch (NumberFormatException | ArrayIndexOutOfBoundsException badRecord) {
      // Record which input was malformed, then let the failure propagate as-is.
      log.warn("Bad input: {}", Arrays.toString(tokens));
      throw badRecord;
    }
  });
}
return new LabeledPoint(target, Vectors.dense(features)); } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) { log.warn("Bad input: {}", Arrays.toString(data));
@Override public LabeledPoint call(String record) { // record: <Price><,><Age><,><KM><,><FuelType1><,><FuelType2><,><HP><,><MetColor><,><Automatic><,><CC><,><Doors><,><Weight> // tokens[0] = <Price> String[] tokens = StringUtils.split(record, ","); double[] features = new double[tokens.length - 1]; for (int i = 0; i < features.length; i++) { features[i] = Double.parseDouble(tokens[i+1]); } // double price = Double.parseDouble(tokens[0]); return new LabeledPoint(price, Vectors.dense(features)); } });
@Override public Tuple2<String, Double> call(String record) { // each record has this format: // <Age><,><KM><,><FuelType1><,><FuelType2><,><HP><,><MetColor><,><Automatic><,><CC><,><Doors><,><Weight> String[] tokens = StringUtils.split(record, ","); double[] features = new double[tokens.length]; for (int i = 0; i < features.length; i++) { features[i] = Double.parseDouble(tokens[i]); } // double carPricePrediction = model.predict(Vectors.dense(features)); // return new Tuple2<String, Double>(record, carPricePrediction); } });
/**
 * Builds a dense {@link Vector} whose i-th component is {@code vals[i].at(0)}.
 *
 * @param vals source vectors; only the value at index 0 of each one is read
 * @return a dense vector with {@code vals.length} components
 */
private Vector vec2vec(Vec[] vals) {
  double[] components = new double[vals.length];
  int slot = 0;
  for (Vec vec : vals) {
    components[slot++] = vec.at(0);
  }
  return Vectors.dense(components);
}
}
/**
 * Extracts the feature columns of one HCatalog record as a dense {@link Vector}.
 *
 * <p>Column {@code featurePositions[i]} of the record supplies component {@code i};
 * a {@code null} column value becomes {@code 0.0}. Non-null values are cast to
 * {@link Double}.
 *
 * @param v1 key/record pair; only the record (second element) is used
 * @return a dense vector with {@code NUM_FEATURES} components
 */
@Override
public Vector call(Tuple2<WritableComparable, HCatRecord> v1) throws Exception {
  HCatRecord record = v1._2();

  // A fresh double[] is zero-filled, so null cells simply keep their 0.0 default.
  double[] features = new double[NUM_FEATURES];
  for (int f = 0; f < NUM_FEATURES; f++) {
    Object cell = record.get(featurePositions[f]);
    if (cell != null) {
      features[f] = (Double) cell;
    }
  }
  return Vectors.dense(features);
}
});
/**
 * Smoke test: builds a three-row {@link RowMatrix} and calls {@code tallSkinnyQR(true)}
 * on it. The decomposition is not inspected; the test only requires that the call
 * completes.
 */
@Test
public void rowMatrixQRDecomposition() {
  Vector first = Vectors.dense(1.0, 10.0, 100.0);
  Vector second = Vectors.dense(2.0, 20.0, 200.0);
  Vector third = Vectors.dense(3.0, 30.0, 300.0);

  // All three rows go into a single partition.
  RowMatrix mat = new RowMatrix(jsc.parallelize(Arrays.asList(first, second, third), 1).rdd());
  QRDecomposition<RowMatrix, Matrix> result = mat.tallSkinnyQR(true);
}
}
/**
 * Smoke test: builds a three-row {@link RowMatrix} and calls {@code tallSkinnyQR(true)}
 * on it. The decomposition is not inspected; the test only requires that the call
 * completes.
 */
@Test
public void rowMatrixQRDecomposition() {
  Vector first = Vectors.dense(1.0, 10.0, 100.0);
  Vector second = Vectors.dense(2.0, 20.0, 200.0);
  Vector third = Vectors.dense(3.0, 30.0, 300.0);

  // All three rows go into a single partition.
  RowMatrix mat = new RowMatrix(jsc.parallelize(Arrays.asList(first, second, third), 1).rdd());
  QRDecomposition<RowMatrix, Matrix> result = mat.tallSkinnyQR(true);
}
}
/**
 * Smoke test: builds a three-row {@link RowMatrix} and calls {@code tallSkinnyQR(true)}
 * on it. The decomposition is not inspected; the test only requires that the call
 * completes.
 */
@Test
public void rowMatrixQRDecomposition() {
  Vector first = Vectors.dense(1.0, 10.0, 100.0);
  Vector second = Vectors.dense(2.0, 20.0, 200.0);
  Vector third = Vectors.dense(3.0, 30.0, 300.0);

  // All three rows go into a single partition.
  RowMatrix mat = new RowMatrix(jsc.parallelize(Arrays.asList(first, second, third), 1).rdd());
  QRDecomposition<RowMatrix, Matrix> result = mat.tallSkinnyQR(true);
}
}
@Test public void testPredictJavaRDD() { List<Vector> points = Arrays.asList( Vectors.dense(1.0, 2.0, 6.0), Vectors.dense(1.0, 3.0, 0.0), Vectors.dense(1.0, 4.0, 6.0) ); JavaRDD<Vector> data = jsc.parallelize(points, 2); KMeansModel model = new KMeans().setK(1).setMaxIterations(5).run(data.rdd()); JavaRDD<Integer> predictions = model.predict(data); // Should be able to get the first prediction. predictions.first(); } }
/**
 * Checks that a dense vector built from individual values returns exactly those
 * values, in order, from {@code toArray()}.
 */
@Test
public void denseArrayConstruction() {
  double[] expected = {1.0, 2.0, 3.0};
  double[] actual = Vectors.dense(1.0, 2.0, 3.0).toArray();
  // Zero tolerance: the values must round-trip exactly.
  assertArrayEquals(expected, actual, 0.0);
}
/**
 * Checks that a dense vector built from individual values returns exactly those
 * values, in order, from {@code toArray()}.
 */
@Test
public void denseArrayConstruction() {
  double[] expected = {1.0, 2.0, 3.0};
  double[] actual = Vectors.dense(1.0, 2.0, 3.0).toArray();
  // Zero tolerance: the values must round-trip exactly.
  assertArrayEquals(expected, actual, 0.0);
}
@Test public void testPredictJavaRDD() { List<Vector> points = Arrays.asList( Vectors.dense(1.0, 2.0, 6.0), Vectors.dense(1.0, 3.0, 0.0), Vectors.dense(1.0, 4.0, 6.0) ); JavaRDD<Vector> data = jsc.parallelize(points, 2); KMeansModel model = new KMeans().setK(1).setMaxIterations(5).run(data.rdd()); JavaRDD<Integer> predictions = model.predict(data); // Should be able to get the first prediction. predictions.first(); } }
/**
 * Checks that a dense vector built from individual values returns exactly those
 * values, in order, from {@code toArray()}.
 */
@Test
public void denseArrayConstruction() {
  double[] expected = {1.0, 2.0, 3.0};
  double[] actual = Vectors.dense(1.0, 2.0, 3.0).toArray();
  // Zero tolerance: the values must round-trip exactly.
  assertArrayEquals(expected, actual, 0.0);
}
@Test public void localLdaMethods() { JavaRDD<Tuple2<Long, Vector>> docs = jsc.parallelize(toyData, 2); JavaPairRDD<Long, Vector> pairedDocs = JavaPairRDD.fromJavaRDD(docs); // check: topicDistributions assertEquals(toyModel.topicDistributions(pairedDocs).count(), pairedDocs.count()); // check: logPerplexity double logPerplexity = toyModel.logPerplexity(pairedDocs); // check: logLikelihood. List<Tuple2<Long, Vector>> docsSingleWord = new ArrayList<>(); docsSingleWord.add(new Tuple2<>(0L, Vectors.dense(1.0, 0.0, 0.0))); JavaPairRDD<Long, Vector> single = JavaPairRDD.fromJavaRDD(jsc.parallelize(docsSingleWord)); double logLikelihood = toyModel.logLikelihood(single); }
@Test public void localLdaMethods() { JavaRDD<Tuple2<Long, Vector>> docs = jsc.parallelize(toyData, 2); JavaPairRDD<Long, Vector> pairedDocs = JavaPairRDD.fromJavaRDD(docs); // check: topicDistributions assertEquals(toyModel.topicDistributions(pairedDocs).count(), pairedDocs.count()); // check: logPerplexity double logPerplexity = toyModel.logPerplexity(pairedDocs); // check: logLikelihood. List<Tuple2<Long, Vector>> docsSingleWord = new ArrayList<>(); docsSingleWord.add(new Tuple2<>(0L, Vectors.dense(1.0, 0.0, 0.0))); JavaPairRDD<Long, Vector> single = JavaPairRDD.fromJavaRDD(jsc.parallelize(docsSingleWord)); double logLikelihood = toyModel.logLikelihood(single); }
/**
 * Round-trips a one-row {@code (label, features)} dataset through
 * {@code MLUtils.convertVectorColumnsToML} and {@code convertVectorColumnsFromML}.
 *
 * <p>Verifies that converting every column and converting only the named
 * {@code "features"} column give the same row, and that converting back restores the
 * original vector.
 */
@Test
public void testConvertVectorColumnsToAndFromML() {
  Vector mllibVector = Vectors.dense(2.0);
  LabeledPoint point = new LabeledPoint(1.0, mllibVector);
  Dataset<Row> dataset = spark
      .createDataFrame(Collections.singletonList(point), LabeledPoint.class)
      .select("label", "features");

  // Convert without naming columns.
  Dataset<Row> converted = MLUtils.convertVectorColumnsToML(dataset);
  Row convertedRow = converted.first();
  Assert.assertEquals(RowFactory.create(1.0, mllibVector.asML()), convertedRow);

  // Naming the column explicitly must give the same row.
  Row convertedByName = MLUtils.convertVectorColumnsToML(dataset, "features").first();
  Assert.assertEquals(convertedRow, convertedByName);

  // Converting back must restore the original vector.
  Row restoredRow = MLUtils.convertVectorColumnsFromML(converted).first();
  Assert.assertEquals(RowFactory.create(1.0, mllibVector), restoredRow);
}