Refine search
/**
 * Maps each tokenized input row to a dense feature vector.
 * A row that fails to parse is logged with its full contents and the parse
 * exception is rethrown, so bad data fails the job instead of vanishing.
 */
private JavaRDD<Vector> parsedToVectorRDD(JavaRDD<String[]> parsedRDD) {
    return parsedRDD.map(tokens -> {
        try {
            double[] features = KMeansUtils.featuresFromTokens(tokens, inputSchema);
            return Vectors.dense(features);
        } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
            // Surface the offending row before propagating the failure.
            log.warn("Bad input: {}", Arrays.toString(tokens));
            throw e;
        }
    });
}
.setMaster("local"); final SparkContext sparkContext = new SparkContext(conf); final JavaSparkContext sc = new JavaSparkContext(sparkContext); final JavaRDD<String> textFile = sc.textFile(path, 1); System.out.println("Data Count: " + textFile.count()); final JavaRDD<LabeledPoint> data = textFile.map(line -> { final String[] split = line.split(","); final double label = normalizeLabel(split[split.length - 1]); doubles[i] = normalizeFeature(split[i]); final Vector features = Vectors.dense(doubles); return new LabeledPoint(label, features); }); data.take(10).forEach(System.out::println); sc.stop();
JavaRDD<String> data = context.textFile("./data/gene/data.csv.gz"); LOGGER.info("count = " + data.count()); final String header = data.first(); data = data.filter(row -> !row.equalsIgnoreCase(header)); LOGGER.info("count = " + data.count()); values[i - 1] = Double.parseDouble(sarray[i]); return Vectors.dense(values); }); parsedData.cache(); clusters.save(context.sc(), "./hdfs/KMeansModel"); KMeansModel sameModel = KMeansModel.load(context.sc(), "./hdfs/KMeansModel");
/**
 * QR decomposition of a 3x3 RowMatrix built from three dense rows.
 * The original test computed the decomposition but asserted nothing;
 * now verify that R has the expected shape and is upper triangular.
 */
@Test
public void rowMatrixQRDecomposition() {
    Vector v1 = Vectors.dense(1.0, 10.0, 100.0);
    Vector v2 = Vectors.dense(2.0, 20.0, 200.0);
    Vector v3 = Vectors.dense(3.0, 30.0, 300.0);
    JavaRDD<Vector> rows = jsc.parallelize(Arrays.asList(v1, v2, v3), 1);
    RowMatrix mat = new RowMatrix(rows.rdd());
    QRDecomposition<RowMatrix, Matrix> result = mat.tallSkinnyQR(true);
    // R of a 3-column tall-skinny QR is 3x3 and upper triangular by construction.
    Matrix r = result.R();
    assert r.numRows() == 3 && r.numCols() == 3;
    assert Math.abs(r.apply(1, 0)) < 1e-9;
    assert Math.abs(r.apply(2, 0)) < 1e-9;
    assert Math.abs(r.apply(2, 1)) < 1e-9;
}
}
JavaPairRDD<String, Set<Integer>> tagDictionary = wt.mapToPair(new PairFunction<Row, String, Set<Integer>>(){ private static final long serialVersionUID = 5865372074294028547L; @Override JavaRDD<Tuple2<Object, Vector>> trainingData = rows.map(new Function<Row, Tuple2<Object, Vector>>(){ private static final long serialVersionUID = -8579021851841129697L; @Override trainingData.cache(); numFeatures = Math.min(numFeatures, vocabSize); Vector initialWeights = Vectors.dense(new double[(numLabels-1)*(numFeatures+1)]);
@Test public void testPredictJavaRDD() { List<Vector> points = Arrays.asList( Vectors.dense(1.0, 2.0, 6.0), Vectors.dense(1.0, 3.0, 0.0), Vectors.dense(1.0, 4.0, 6.0) ); JavaRDD<Vector> data = jsc.parallelize(points, 2); KMeansModel model = new KMeans().setK(1).setMaxIterations(5).run(data.rdd()); JavaRDD<Integer> predictions = model.predict(data); // Should be able to get the first prediction. predictions.first(); } }
@Override public CompanyPrediction predict(CompanyInfo companyInfo) { final JavaRDD<Vector> normalizedCompanyInfo = javaSparkContext .parallelize(Collections.singletonList(companyInfo)) .map(info -> Vectors.dense( // Order is important! normalizeFeature(companyInfo.getIndustrialRisk()), normalizeFeature(companyInfo.getManagementRisk()), normalizeFeature(companyInfo.getFinancialFlexibility()), normalizeFeature(companyInfo.getCredibility()), normalizeFeature(companyInfo.getCompetitiveness()), normalizeFeature(companyInfo.getOperatingRisk()) )); final double prediction = logisticRegressionModel.predict(normalizedCompanyInfo).first(); return deNormalizeResult(prediction); }
/**
 * Runs a chi-squared independence test over labeled points.
 * The original test discarded the results without asserting anything;
 * chiSqTest returns one result per feature, and these points have two
 * features, so expect exactly two results.
 */
@Test
public void chiSqTest() {
    JavaRDD<LabeledPoint> data = jsc.parallelize(Arrays.asList(
        new LabeledPoint(0.0, Vectors.dense(0.1, 2.3)),
        new LabeledPoint(1.0, Vectors.dense(1.5, 5.1)),
        new LabeledPoint(0.0, Vectors.dense(2.4, 8.1))));
    ChiSqTestResult[] testResults = Statistics.chiSqTest(data);
    assert testResults.length == 2;
}
/** Smoke test: fit a 2-component Gaussian mixture and predict cluster labels. */
@Test
public void runGaussianMixture() {
    JavaRDD<Vector> data = jsc.parallelize(
        Arrays.asList(
            Vectors.dense(1.0, 2.0, 6.0),
            Vectors.dense(1.0, 3.0, 0.0),
            Vectors.dense(1.0, 4.0, 6.0)),
        2);
    GaussianMixture gm = new GaussianMixture();
    gm.setK(2);
    gm.setMaxIterations(1);
    gm.setSeed(1234);
    GaussianMixtureModel model = gm.run(data);
    assertEquals(model.gaussians().length, 2);
    // Prediction on the training data must at least yield a first element.
    JavaRDD<Integer> predictions = model.predict(data);
    predictions.first();
}
}
/**
 * Bisecting k-means on three 2-D points. Although k = 4 is requested, only
 * three points exist, so the tree can have at most three leaf clusters.
 */
@Test
public void twoDimensionalData() {
    List<Vector> input = Arrays.asList(
        Vectors.dense(4, -1),
        Vectors.dense(4, 1),
        Vectors.sparse(2, new int[]{0}, new double[]{1.0}));
    JavaRDD<Vector> points = jsc.parallelize(input, 2);

    BisectingKMeansModel model =
        new BisectingKMeans().setK(4).setMaxIterations(2).setSeed(1L).run(points);

    Assert.assertEquals(3, model.k());
    Assert.assertArrayEquals(
        new double[]{3.0, 0.0}, model.root().center().toArray(), 1e-12);
    for (ClusteringTreeNode node : model.root().children()) {
        double[] c = node.center().toArray();
        if (c[0] > 2) {
            // Right branch: the two points at x = 4 averaged to (4, 0).
            Assert.assertEquals(2, node.size());
            Assert.assertArrayEquals(new double[]{4.0, 0.0}, c, 1e-12);
        } else {
            // Left branch: the lone sparse point (1, 0).
            Assert.assertEquals(1, node.size());
            Assert.assertArrayEquals(new double[]{1.0, 0.0}, c, 1e-12);
        }
    }
}
}
@Override public LabeledPoint call(String record) { // record: <Price><,><Age><,><KM><,><FuelType1><,><FuelType2><,><HP><,><MetColor><,><Automatic><,><CC><,><Doors><,><Weight> // tokens[0] = <Price> String[] tokens = StringUtils.split(record, ","); double[] features = new double[tokens.length - 1]; for (int i = 0; i < features.length; i++) { features[i] = Double.parseDouble(tokens[i+1]); } // double price = Double.parseDouble(tokens[0]); return new LabeledPoint(price, Vectors.dense(features)); } });
/**
 * Verifies that diag() builds the same diagonal matrix whether the input
 * vector is dense or sparse, across the Matrices, DenseMatrix and
 * SparseMatrix factories, and that the sparse form stores only non-zeros.
 */
@Test
public void diagonalMatrixConstruction() {
    Vector dense = Vectors.dense(1.0, 0.0, 2.0);
    Vector sparse = Vectors.sparse(3, new int[]{0, 2}, new double[]{1.0, 2.0});

    Matrix m = Matrices.diag(dense);
    Matrix sm = Matrices.diag(sparse);
    DenseMatrix d = DenseMatrix.diag(dense);
    DenseMatrix sd = DenseMatrix.diag(sparse);
    SparseMatrix s = SparseMatrix.spdiag(dense);
    SparseMatrix ss = SparseMatrix.spdiag(sparse);

    // All six constructions must agree element-wise.
    assertArrayEquals(m.toArray(), sm.toArray(), 0.0);
    assertArrayEquals(d.toArray(), sm.toArray(), 0.0);
    assertArrayEquals(d.toArray(), sd.toArray(), 0.0);
    assertArrayEquals(sd.toArray(), s.toArray(), 0.0);
    assertArrayEquals(s.toArray(), ss.toArray(), 0.0);

    // The sparse results store exactly the two non-zero diagonal entries,
    // with one column pointer per column plus one terminator (3 + 1).
    assertArrayEquals(s.values(), ss.values(), 0.0);
    assertEquals(2, s.values().length);
    assertEquals(2, ss.values().length);
    assertEquals(4, s.colPtrs().length);
    assertEquals(4, ss.colPtrs().length);
}
corpus.filter(tuple2 -> Vectors.norm(tuple2._2(), 1.0) != 0.0); assertEquals(topicDistributions.count(), nonEmptyCorpus.count()); Tuple3<Long, int[], double[]> topTopics = model.javaTopTopicsPerDocument(3).first(); Tuple3<Long, int[], int[]> topicAssignment = model.javaTopicAssignments().first(); Long docId2 = topicAssignment._1(); int[] termIndices2 = topicAssignment._2();
/**
 * Converts a document's feature map into a sparse MLlib vector for use in a
 * LabeledPoint.
 *
 * Feature indices are sorted in ascending order before building the vector:
 * SparseVector assumes ordered indices, and a HashMap's iteration order is
 * arbitrary — the original emitted indices in whatever order keySet()
 * produced. Sorting also makes the output deterministic. The per-key
 * docdata.get() double-lookup is avoided by resolving each value once.
 *
 * @param docdata     feature index -> feature data for one document
 * @param numfeatures total feature-space size (the vector's dimension)
 * @return sparse vector of the document's feature values
 */
private static Vector getVector(Map<Integer, FeatureData> docdata, int numfeatures) {
    List<Integer> keys = new ArrayList<Integer>(docdata.keySet());
    java.util.Collections.sort(keys); // ascending index order for SparseVector
    int[] indices = new int[keys.size()];
    double[] values = new double[keys.size()];
    for (int j = 0; j < keys.size(); j++) {
        Integer key = keys.get(j);
        indices[j] = key;
        values[j] = docdata.get(key).getValue();
    }
    return Vectors.sparse(numfeatures, indices, values);
}
}
/**
 * QR decomposition of a 3x3 RowMatrix built from three dense rows.
 * The original test computed the decomposition but asserted nothing;
 * now verify that R has the expected shape and is upper triangular.
 */
@Test
public void rowMatrixQRDecomposition() {
    Vector v1 = Vectors.dense(1.0, 10.0, 100.0);
    Vector v2 = Vectors.dense(2.0, 20.0, 200.0);
    Vector v3 = Vectors.dense(3.0, 30.0, 300.0);
    JavaRDD<Vector> rows = jsc.parallelize(Arrays.asList(v1, v2, v3), 1);
    RowMatrix mat = new RowMatrix(rows.rdd());
    QRDecomposition<RowMatrix, Matrix> result = mat.tallSkinnyQR(true);
    // R of a 3-column tall-skinny QR is 3x3 and upper triangular by construction.
    Matrix r = result.R();
    assert r.numRows() == 3 && r.numCols() == 3;
    assert Math.abs(r.apply(1, 0)) < 1e-9;
    assert Math.abs(r.apply(2, 0)) < 1e-9;
    assert Math.abs(r.apply(2, 1)) < 1e-9;
}
}
@Test public void testPredictJavaRDD() { List<Vector> points = Arrays.asList( Vectors.dense(1.0, 2.0, 6.0), Vectors.dense(1.0, 3.0, 0.0), Vectors.dense(1.0, 4.0, 6.0) ); JavaRDD<Vector> data = jsc.parallelize(points, 2); KMeansModel model = new KMeans().setK(1).setMaxIterations(5).run(data.rdd()); JavaRDD<Integer> predictions = model.predict(data); // Should be able to get the first prediction. predictions.first(); } }
/**
 * Runs a chi-squared independence test over labeled points.
 * The original test discarded the results without asserting anything;
 * chiSqTest returns one result per feature, and these points have two
 * features, so expect exactly two results.
 */
@Test
public void chiSqTest() {
    JavaRDD<LabeledPoint> data = jsc.parallelize(Arrays.asList(
        new LabeledPoint(0.0, Vectors.dense(0.1, 2.3)),
        new LabeledPoint(1.0, Vectors.dense(1.5, 5.1)),
        new LabeledPoint(0.0, Vectors.dense(2.4, 8.1))));
    ChiSqTestResult[] testResults = Statistics.chiSqTest(data);
    assert testResults.length == 2;
}
/** Smoke test: fit a 2-component Gaussian mixture and predict cluster labels. */
@Test
public void runGaussianMixture() {
    JavaRDD<Vector> data = jsc.parallelize(
        Arrays.asList(
            Vectors.dense(1.0, 2.0, 6.0),
            Vectors.dense(1.0, 3.0, 0.0),
            Vectors.dense(1.0, 4.0, 6.0)),
        2);
    GaussianMixture gm = new GaussianMixture();
    gm.setK(2);
    gm.setMaxIterations(1);
    gm.setSeed(1234);
    GaussianMixtureModel model = gm.run(data);
    assertEquals(model.gaussians().length, 2);
    // Prediction on the training data must at least yield a first element.
    JavaRDD<Integer> predictions = model.predict(data);
    predictions.first();
}
}
/**
 * Bisecting k-means on three 2-D points. Although k = 4 is requested, only
 * three points exist, so the tree can have at most three leaf clusters.
 */
@Test
public void twoDimensionalData() {
    List<Vector> input = Arrays.asList(
        Vectors.dense(4, -1),
        Vectors.dense(4, 1),
        Vectors.sparse(2, new int[]{0}, new double[]{1.0}));
    JavaRDD<Vector> points = jsc.parallelize(input, 2);

    BisectingKMeansModel model =
        new BisectingKMeans().setK(4).setMaxIterations(2).setSeed(1L).run(points);

    Assert.assertEquals(3, model.k());
    Assert.assertArrayEquals(
        new double[]{3.0, 0.0}, model.root().center().toArray(), 1e-12);
    for (ClusteringTreeNode node : model.root().children()) {
        double[] c = node.center().toArray();
        if (c[0] > 2) {
            // Right branch: the two points at x = 4 averaged to (4, 0).
            Assert.assertEquals(2, node.size());
            Assert.assertArrayEquals(new double[]{4.0, 0.0}, c, 1e-12);
        } else {
            // Left branch: the lone sparse point (1, 0).
            Assert.assertEquals(1, node.size());
            Assert.assertArrayEquals(new double[]{1.0, 0.0}, c, 1e-12);
        }
    }
}
}