Refine search
/**
 * Maps each tokenized input row to a dense feature vector.
 * A row that fails to parse is logged with its full contents and the parse
 * exception is rethrown, so bad data fails the job instead of vanishing.
 */
private JavaRDD<Vector> parsedToVectorRDD(JavaRDD<String[]> parsedRDD) {
    return parsedRDD.map(tokens -> {
        try {
            double[] features = KMeansUtils.featuresFromTokens(tokens, inputSchema);
            return Vectors.dense(features);
        } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
            // Surface the offending row before propagating the failure.
            log.warn("Bad input: {}", Arrays.toString(tokens));
            throw e;
        }
    });
}
.setMaster("local"); final SparkContext sparkContext = new SparkContext(conf); final JavaSparkContext sc = new JavaSparkContext(sparkContext); final JavaRDD<String> textFile = sc.textFile(path, 1); System.out.println("Data Count: " + textFile.count()); final JavaRDD<LabeledPoint> data = textFile.map(line -> { final String[] split = line.split(","); final double label = normalizeLabel(split[split.length - 1]); doubles[i] = normalizeFeature(split[i]); final Vector features = Vectors.dense(doubles); return new LabeledPoint(label, features); }); data.take(10).forEach(System.out::println); sc.stop();
JavaRDD<String> data = context.textFile("./data/gene/data.csv.gz"); LOGGER.info("count = " + data.count()); final String header = data.first(); data = data.filter(row -> !row.equalsIgnoreCase(header)); LOGGER.info("count = " + data.count()); values[i - 1] = Double.parseDouble(sarray[i]); return Vectors.dense(values); }); parsedData.cache(); clusters.save(context.sc(), "./hdfs/KMeansModel"); KMeansModel sameModel = KMeansModel.load(context.sc(), "./hdfs/KMeansModel");
/**
 * QR decomposition of a 3x3 RowMatrix built from three dense rows.
 * The original test computed the decomposition but asserted nothing;
 * now verify that R has the expected shape and is upper triangular.
 */
@Test
public void rowMatrixQRDecomposition() {
    Vector v1 = Vectors.dense(1.0, 10.0, 100.0);
    Vector v2 = Vectors.dense(2.0, 20.0, 200.0);
    Vector v3 = Vectors.dense(3.0, 30.0, 300.0);
    JavaRDD<Vector> rows = jsc.parallelize(Arrays.asList(v1, v2, v3), 1);
    RowMatrix mat = new RowMatrix(rows.rdd());
    QRDecomposition<RowMatrix, Matrix> result = mat.tallSkinnyQR(true);
    // R of a 3-column tall-skinny QR is 3x3 and upper triangular by construction.
    Matrix r = result.R();
    assert r.numRows() == 3 && r.numCols() == 3;
    assert Math.abs(r.apply(1, 0)) < 1e-9;
    assert Math.abs(r.apply(2, 0)) < 1e-9;
    assert Math.abs(r.apply(2, 1)) < 1e-9;
}
}
JavaPairRDD<String, Set<Integer>> tagDictionary = wt.mapToPair(new PairFunction<Row, String, Set<Integer>>(){ private static final long serialVersionUID = 5865372074294028547L; @Override JavaRDD<Tuple2<Object, Vector>> trainingData = rows.map(new Function<Row, Tuple2<Object, Vector>>(){ private static final long serialVersionUID = -8579021851841129697L; @Override trainingData.cache(); numFeatures = Math.min(numFeatures, vocabSize); Vector initialWeights = Vectors.dense(new double[(numLabels-1)*(numFeatures+1)]);
@Test public void testPredictJavaRDD() { List<Vector> points = Arrays.asList( Vectors.dense(1.0, 2.0, 6.0), Vectors.dense(1.0, 3.0, 0.0), Vectors.dense(1.0, 4.0, 6.0) ); JavaRDD<Vector> data = jsc.parallelize(points, 2); KMeansModel model = new KMeans().setK(1).setMaxIterations(5).run(data.rdd()); JavaRDD<Integer> predictions = model.predict(data); // Should be able to get the first prediction. predictions.first(); } }
@Override public CompanyPrediction predict(CompanyInfo companyInfo) { final JavaRDD<Vector> normalizedCompanyInfo = javaSparkContext .parallelize(Collections.singletonList(companyInfo)) .map(info -> Vectors.dense( // Order is important! normalizeFeature(companyInfo.getIndustrialRisk()), normalizeFeature(companyInfo.getManagementRisk()), normalizeFeature(companyInfo.getFinancialFlexibility()), normalizeFeature(companyInfo.getCredibility()), normalizeFeature(companyInfo.getCompetitiveness()), normalizeFeature(companyInfo.getOperatingRisk()) )); final double prediction = logisticRegressionModel.predict(normalizedCompanyInfo).first(); return deNormalizeResult(prediction); }
/**
 * Runs a chi-squared independence test over labeled points.
 * The original test discarded the results without asserting anything;
 * chiSqTest returns one result per feature, and these points have two
 * features, so expect exactly two results.
 */
@Test
public void chiSqTest() {
    JavaRDD<LabeledPoint> data = jsc.parallelize(Arrays.asList(
        new LabeledPoint(0.0, Vectors.dense(0.1, 2.3)),
        new LabeledPoint(1.0, Vectors.dense(1.5, 5.1)),
        new LabeledPoint(0.0, Vectors.dense(2.4, 8.1))));
    ChiSqTestResult[] testResults = Statistics.chiSqTest(data);
    assert testResults.length == 2;
}
/** Smoke test: fit a 2-component Gaussian mixture and predict cluster labels. */
@Test
public void runGaussianMixture() {
    JavaRDD<Vector> data = jsc.parallelize(
        Arrays.asList(
            Vectors.dense(1.0, 2.0, 6.0),
            Vectors.dense(1.0, 3.0, 0.0),
            Vectors.dense(1.0, 4.0, 6.0)),
        2);
    GaussianMixture gm = new GaussianMixture();
    gm.setK(2);
    gm.setMaxIterations(1);
    gm.setSeed(1234);
    GaussianMixtureModel model = gm.run(data);
    assertEquals(model.gaussians().length, 2);
    // Prediction on the training data must at least yield a first element.
    JavaRDD<Integer> predictions = model.predict(data);
    predictions.first();
}
}
/**
 * Bisecting k-means on three 2-D points. Although k = 4 is requested, only
 * three points exist, so the tree can have at most three leaf clusters.
 */
@Test
public void twoDimensionalData() {
    List<Vector> input = Arrays.asList(
        Vectors.dense(4, -1),
        Vectors.dense(4, 1),
        Vectors.sparse(2, new int[]{0}, new double[]{1.0}));
    JavaRDD<Vector> points = jsc.parallelize(input, 2);

    BisectingKMeansModel model =
        new BisectingKMeans().setK(4).setMaxIterations(2).setSeed(1L).run(points);

    Assert.assertEquals(3, model.k());
    Assert.assertArrayEquals(
        new double[]{3.0, 0.0}, model.root().center().toArray(), 1e-12);
    for (ClusteringTreeNode node : model.root().children()) {
        double[] c = node.center().toArray();
        if (c[0] > 2) {
            // Right branch: the two points at x = 4 averaged to (4, 0).
            Assert.assertEquals(2, node.size());
            Assert.assertArrayEquals(new double[]{4.0, 0.0}, c, 1e-12);
        } else {
            // Left branch: the lone sparse point (1, 0).
            Assert.assertEquals(1, node.size());
            Assert.assertArrayEquals(new double[]{1.0, 0.0}, c, 1e-12);
        }
    }
}
}
@Override public LabeledPoint call(String record) { // record: <Price><,><Age><,><KM><,><FuelType1><,><FuelType2><,><HP><,><MetColor><,><Automatic><,><CC><,><Doors><,><Weight> // tokens[0] = <Price> String[] tokens = StringUtils.split(record, ","); double[] features = new double[tokens.length - 1]; for (int i = 0; i < features.length; i++) { features[i] = Double.parseDouble(tokens[i+1]); } // double price = Double.parseDouble(tokens[0]); return new LabeledPoint(price, Vectors.dense(features)); } });
/**
 * Verifies that diag() builds the same diagonal matrix whether the input
 * vector is dense or sparse, across the Matrices, DenseMatrix and
 * SparseMatrix factories, and that the sparse form stores only non-zeros.
 */
@Test
public void diagonalMatrixConstruction() {
    Vector dense = Vectors.dense(1.0, 0.0, 2.0);
    Vector sparse = Vectors.sparse(3, new int[]{0, 2}, new double[]{1.0, 2.0});

    Matrix m = Matrices.diag(dense);
    Matrix sm = Matrices.diag(sparse);
    DenseMatrix d = DenseMatrix.diag(dense);
    DenseMatrix sd = DenseMatrix.diag(sparse);
    SparseMatrix s = SparseMatrix.spdiag(dense);
    SparseMatrix ss = SparseMatrix.spdiag(sparse);

    // All six constructions must agree element-wise.
    assertArrayEquals(m.toArray(), sm.toArray(), 0.0);
    assertArrayEquals(d.toArray(), sm.toArray(), 0.0);
    assertArrayEquals(d.toArray(), sd.toArray(), 0.0);
    assertArrayEquals(sd.toArray(), s.toArray(), 0.0);
    assertArrayEquals(s.toArray(), ss.toArray(), 0.0);

    // The sparse results store exactly the two non-zero diagonal entries,
    // with one column pointer per column plus one terminator (3 + 1).
    assertArrayEquals(s.values(), ss.values(), 0.0);
    assertEquals(2, s.values().length);
    assertEquals(2, ss.values().length);
    assertEquals(4, s.colPtrs().length);
    assertEquals(4, ss.colPtrs().length);
}
corpus.filter(tuple2 -> Vectors.norm(tuple2._2(), 1.0) != 0.0); assertEquals(topicDistributions.count(), nonEmptyCorpus.count()); Tuple3<Long, int[], double[]> topTopics = model.javaTopTopicsPerDocument(3).first(); Tuple3<Long, int[], int[]> topicAssignment = model.javaTopicAssignments().first(); Long docId2 = topicAssignment._1(); int[] termIndices2 = topicAssignment._2();
/**
 * Converts a document's feature map into a sparse MLlib vector for use in a
 * LabeledPoint.
 *
 * Feature indices are sorted in ascending order before building the vector:
 * SparseVector assumes ordered indices, and a HashMap's iteration order is
 * arbitrary — the original emitted indices in whatever order keySet()
 * produced. Sorting also makes the output deterministic. The per-key
 * docdata.get() double-lookup is avoided by resolving each value once.
 *
 * @param docdata     feature index -> feature data for one document
 * @param numfeatures total feature-space size (the vector's dimension)
 * @return sparse vector of the document's feature values
 */
private static Vector getVector(Map<Integer, FeatureData> docdata, int numfeatures) {
    List<Integer> keys = new ArrayList<Integer>(docdata.keySet());
    java.util.Collections.sort(keys); // ascending index order for SparseVector
    int[] indices = new int[keys.size()];
    double[] values = new double[keys.size()];
    for (int j = 0; j < keys.size(); j++) {
        Integer key = keys.get(j);
        indices[j] = key;
        values[j] = docdata.get(key).getValue();
    }
    return Vectors.sparse(numfeatures, indices, values);
}
}
/**
 * QR decomposition of a 3x3 RowMatrix built from three dense rows.
 * The original test computed the decomposition but asserted nothing;
 * now verify that R has the expected shape and is upper triangular.
 */
@Test
public void rowMatrixQRDecomposition() {
    Vector v1 = Vectors.dense(1.0, 10.0, 100.0);
    Vector v2 = Vectors.dense(2.0, 20.0, 200.0);
    Vector v3 = Vectors.dense(3.0, 30.0, 300.0);
    JavaRDD<Vector> rows = jsc.parallelize(Arrays.asList(v1, v2, v3), 1);
    RowMatrix mat = new RowMatrix(rows.rdd());
    QRDecomposition<RowMatrix, Matrix> result = mat.tallSkinnyQR(true);
    // R of a 3-column tall-skinny QR is 3x3 and upper triangular by construction.
    Matrix r = result.R();
    assert r.numRows() == 3 && r.numCols() == 3;
    assert Math.abs(r.apply(1, 0)) < 1e-9;
    assert Math.abs(r.apply(2, 0)) < 1e-9;
    assert Math.abs(r.apply(2, 1)) < 1e-9;
}
}
@Test public void testPredictJavaRDD() { List<Vector> points = Arrays.asList( Vectors.dense(1.0, 2.0, 6.0), Vectors.dense(1.0, 3.0, 0.0), Vectors.dense(1.0, 4.0, 6.0) ); JavaRDD<Vector> data = jsc.parallelize(points, 2); KMeansModel model = new KMeans().setK(1).setMaxIterations(5).run(data.rdd()); JavaRDD<Integer> predictions = model.predict(data); // Should be able to get the first prediction. predictions.first(); } }
/**
 * Runs a chi-squared independence test over labeled points.
 * The original test discarded the results without asserting anything;
 * chiSqTest returns one result per feature, and these points have two
 * features, so expect exactly two results.
 */
@Test
public void chiSqTest() {
    JavaRDD<LabeledPoint> data = jsc.parallelize(Arrays.asList(
        new LabeledPoint(0.0, Vectors.dense(0.1, 2.3)),
        new LabeledPoint(1.0, Vectors.dense(1.5, 5.1)),
        new LabeledPoint(0.0, Vectors.dense(2.4, 8.1))));
    ChiSqTestResult[] testResults = Statistics.chiSqTest(data);
    assert testResults.length == 2;
}
/** Smoke test: fit a 2-component Gaussian mixture and predict cluster labels. */
@Test
public void runGaussianMixture() {
    JavaRDD<Vector> data = jsc.parallelize(
        Arrays.asList(
            Vectors.dense(1.0, 2.0, 6.0),
            Vectors.dense(1.0, 3.0, 0.0),
            Vectors.dense(1.0, 4.0, 6.0)),
        2);
    GaussianMixture gm = new GaussianMixture();
    gm.setK(2);
    gm.setMaxIterations(1);
    gm.setSeed(1234);
    GaussianMixtureModel model = gm.run(data);
    assertEquals(model.gaussians().length, 2);
    // Prediction on the training data must at least yield a first element.
    JavaRDD<Integer> predictions = model.predict(data);
    predictions.first();
}
}
/**
 * Bisecting k-means on three 2-D points. Although k = 4 is requested, only
 * three points exist, so the tree can have at most three leaf clusters.
 */
@Test
public void twoDimensionalData() {
    List<Vector> input = Arrays.asList(
        Vectors.dense(4, -1),
        Vectors.dense(4, 1),
        Vectors.sparse(2, new int[]{0}, new double[]{1.0}));
    JavaRDD<Vector> points = jsc.parallelize(input, 2);

    BisectingKMeansModel model =
        new BisectingKMeans().setK(4).setMaxIterations(2).setSeed(1L).run(points);

    Assert.assertEquals(3, model.k());
    Assert.assertArrayEquals(
        new double[]{3.0, 0.0}, model.root().center().toArray(), 1e-12);
    for (ClusteringTreeNode node : model.root().children()) {
        double[] c = node.center().toArray();
        if (c[0] > 2) {
            // Right branch: the two points at x = 4 averaged to (4, 0).
            Assert.assertEquals(2, node.size());
            Assert.assertArrayEquals(new double[]{4.0, 0.0}, c, 1e-12);
        } else {
            // Left branch: the lone sparse point (1, 0).
            Assert.assertEquals(1, node.size());
            Assert.assertArrayEquals(new double[]{1.0, 0.0}, c, 1e-12);
        }
    }
}
}