/**
 * Wraps a single instance in a freshly created cluster.
 *
 * @param instance pair whose second element is added to the new cluster;
 *                 the first element is not read here
 * @return a new mutable map with exactly one entry: the new cluster, keyed by its own id
 * @throws Exception declared by the overridden signature
 */
@Override
public Map<String, Instance> call(Tuple2<String, Instance> instance) throws Exception {
    final Cluster seeded = clusterFactory.create();
    seeded.add(instance._2);

    final Map<String, Instance> singleton = new HashMap<String, Instance>();
    // The id is read after the instance was added, as in the original statement order.
    singleton.put(seeded.getId(), seeded);
    return singleton;
}
/**
 * Builds the initial set of clusters from a random sample of the data set.
 *
 * <p>Each sampled element becomes the sole member of a new cluster obtained from
 * {@code createCluster()}; the clusters are returned keyed by their ids.
 *
 * <p>NOTE(review): the arguments to {@code takeSample} are presumably Spark's
 * (withReplacement, num, seed), i.e. up to {@code k} distinct elements — confirm
 * against the RDD type returned by {@code ds.getRDD()}.
 *
 * @param ds data set whose RDD is sampled
 * @return a new mutable map from cluster id to cluster, one entry per sampled element
 *         (fewer if two clusters report the same id)
 */
private Map<String, Instance> initKMeans(SparkDataSet ds) {
    // Allocated first so that an invalid k fails here, before any sampling is attempted.
    final Map<String, Instance> seeds = new HashMap<String, Instance>(k);

    // A fresh, unseeded Random gives a different sample on every invocation.
    final int sampleSeed = new Random().nextInt();
    final List<Tuple2<String, Instance>> sampled = ds.getRDD().takeSample(false, k, sampleSeed);

    for (Tuple2<String, Instance> sample : sampled) {
        final Cluster seedCluster = this.createCluster();
        seedCluster.add(sample._2);
        seeds.put(seedCluster.getId(), seedCluster);
    }
    return seeds;
}
/**
 * Greedily groups the supplied instances into clusters.
 *
 * <p>For every instance, a {@code BestClusterFunction} built over the clusters created
 * so far is asked for a match. When the first element of its result is {@code null},
 * a new cluster is created and registered; otherwise the cluster registered under the
 * returned id is reused. In both cases the instance is then added to that cluster.
 *
 * @param instances iterator over (key, instance) pairs; it is consumed completely
 * @return the value view of the clusters built during this call
 * @throws Exception propagated from {@code BestClusterFunction.call}
 */
@Override
public Iterable<Instance> call(Iterator<Tuple2<String, Instance>> instances) throws Exception {
    final Map<String, Instance> localClusters = new HashMap<String, Instance>();
    // The matcher is handed the same map instance that this loop keeps filling.
    final BestClusterFunction matcher = new BestClusterFunction(distFunc, localClusters, threshold);

    while (instances.hasNext()) {
        final Tuple2<String, Instance> current = instances.next();
        final Tuple2<String, Instance> match = matcher.call(current);

        final Cluster target;
        if (match._1 != null) {
            target = (Cluster) localClusters.get(match._1);
        } else {
            // No match reported: start a new cluster and register it before it receives the instance.
            target = clusterFactory.create();
            localClusters.put(target.getId(), target);
        }
        target.add(current._2);
    }
    return localClusters.values();
}
/**
 * Selects the cluster closest to the given instance, or creates a new one.
 *
 * <p>Every cluster in {@code clusters} is compared with the instance using
 * {@code distFunc}. A cluster qualifies only if its distance is strictly below
 * {@code threshold}; among qualifying clusters the smallest distance wins, and on
 * equal distances the one encountered first in the map's iteration order is kept.
 *
 * <p>When a cluster qualifies it is returned as is; the instance is not added to it
 * here. When none qualifies, a new cluster is created from {@code clusterFactory},
 * the instance is added to it, and that cluster is returned. The new cluster is not
 * inserted into {@code clusters} by this method.
 *
 * @param inst pair whose second element is the instance to place; the first element is not read
 * @return a new mutable map with exactly one entry: the chosen or newly created cluster,
 *         keyed by its id
 * @throws Exception declared by the overridden signature
 */
@Override
public Map<String, Instance> call(Tuple2<String, Instance> inst) throws Exception {
    Instance nearest = null;
    double nearestDistance = Double.MAX_VALUE;

    // Iterate the values directly instead of keySet() + get(key): same visiting
    // order, without one redundant map lookup per cluster.
    for (Instance candidate : clusters.values()) {
        final double d = distFunc.distance(inst._2, candidate);
        // Strict comparisons: a NaN distance never qualifies, and ties keep the earlier cluster.
        if (d < nearestDistance && d < threshold) {
            nearestDistance = d;
            nearest = candidate;
        }
    }

    final Map<String, Instance> result = new HashMap<String, Instance>();
    if (nearest == null) {
        final Cluster created = clusterFactory.create();
        created.add(inst._2);
        result.put(created.getId(), created);
    } else {
        result.put(nearest.getId(), nearest);
    }
    return result;
}
}