/**
 * Swaps {@code oldValue} for {@code newValue}.
 *
 * <p>{@code newValue} is added only when removing {@code oldValue} reported a change.
 *
 * @param oldValue the instance to remove
 * @param newValue the instance to add in its place
 * @return the result of removing {@code oldValue}; {@code false} means nothing was added
 */
public boolean replace(Instance oldValue, Instance newValue) {
    if (!remove(oldValue)) {
        // nothing was removed, so there is nothing to replace
        return false;
    }
    add(newValue);
    return true;
}
/**
 * Wraps a single instance in a freshly created cluster.
 *
 * @param instance pair whose second element is placed into the new cluster
 * @return a new map holding only the new cluster, keyed by the cluster's id
 */
@Override
public Map<String, Instance> call(Tuple2<String, Instance> instance) throws Exception {
    final Cluster singleton = clusterFactory.create();
    singleton.add(instance._2);

    final Map<String, Instance> wrapped = new HashMap<String, Instance>();
    wrapped.put(singleton.getId(), singleton);
    return wrapped;
}
/**
 * Builds the initial clusters from a sample of the data set.
 *
 * <p>Takes a sample of {@code k} elements (without replacement, freshly seeded) from the
 * data set's RDD and puts each sampled instance into its own new cluster.
 *
 * @param ds the data set to sample from
 * @return map from cluster id to the cluster created for each sampled instance
 */
private Map<String, Instance> initKMeans(SparkDataSet ds) {
    // a new seed per call, so repeated runs draw different samples
    final int seed = new Random().nextInt();
    final List<Tuple2<String, Instance>> sampled = ds.getRDD().takeSample(false, k, seed);

    final Map<String, Instance> seeds = new HashMap<String, Instance>(k);
    for (Tuple2<String, Instance> sample : sampled) {
        final Cluster cluster = this.createCluster();
        cluster.add(sample._2);
        seeds.put(cluster.getId(), cluster);
    }
    return seeds;
}
@Override public Map<String, Instance> call(Map<String, Instance> clusters1, Map<String, Instance> cluster2) throws Exception { BestClusterFunction bestClusterFunc = new BestClusterFunction( distFunc, clusters1, threshold ); for (String key : cluster2.keySet()) { Instance instance = cluster2.get(key); Tuple2<String, Instance> result = bestClusterFunc.call(new Tuple2<String, Instance>(instance.getId(), instance)); if (result == null) { clusters1.put(instance.getId(), instance); } else { Cluster cluster = (Cluster)clusters1.get(result._1); cluster.add(instance); // revise the cluster centroid cluster.getMembers().clear(); // no need to retain the member list } } return clusters1; } }
/**
 * Merges {@code clusterList2} into {@code clusterList1}.
 *
 * <p>If {@code clusterList1} is empty it simply receives every entry of
 * {@code clusterList2}. Otherwise each value of {@code clusterList2} is passed to a
 * {@code BestClusterFunction} built over {@code clusterList1}. If the first element of the
 * result is non-null, the value is added to the cluster stored under that id. If it is
 * null, the value is stored under its own id.
 *
 * @param clusterList1 the accumulator; it is modified in place and returned
 * @param clusterList2 the clusters to merge in
 * @return {@code clusterList1}, after merging
 */
@Override
public Map<String, Instance> call(Map<String, Instance> clusterList1, Map<String, Instance> clusterList2) throws Exception {
    // nothing accumulated yet: adopt the second list wholesale
    if (clusterList1.isEmpty()) {
        clusterList1.putAll(clusterList2);
        return clusterList1;
    }

    for (Instance incoming : clusterList2.values()) {
        final BestClusterFunction matcher = new BestClusterFunction(distFunc, clusterList1, threshold);
        final Tuple2<String, Instance> candidate = new Tuple2<String, Instance>(incoming.getId(), incoming);
        final Tuple2<String, Instance> match = matcher.call(candidate);

        if (match._1 != null) {
            ((Cluster) clusterList1.get(match._1)).add(incoming);
        } else {
            clusterList1.put(incoming.getId(), incoming);
        }
    }
    return clusterList1;
}
}
/**
 * Groups the instances of one iterator into clusters.
 *
 * <p>Each instance is passed to a {@code BestClusterFunction} built over the clusters
 * collected so far. If the first element of the result is non-null, the instance joins the
 * cluster stored under that id. If it is null, a new cluster is created, registered under
 * its id, and the instance is added to it.
 *
 * @param instances the instances to cluster
 * @return the values view of the cluster map built during the pass
 */
@Override
public Iterable<Instance> call(Iterator<Tuple2<String, Instance>> instances) throws Exception {
    final Map<String, Instance> clusters = new HashMap<String, Instance>();
    final BestClusterFunction matcher = new BestClusterFunction(distFunc, clusters, threshold);

    while (instances.hasNext()) {
        final Tuple2<String, Instance> current = instances.next();
        final Tuple2<String, Instance> match = matcher.call(current);

        final Cluster target;
        if (match._1 != null) {
            target = (Cluster) clusters.get(match._1);
        } else {
            // no match: open a new cluster and register it before adding the instance
            target = clusterFactory.create();
            clusters.put(target.getId(), target);
        }
        target.add(current._2);
    }
    return clusters.values();
}
private List<Cluster> initKMeans(DataSet ds) { List<Cluster> kmeans = new LinkedList<Cluster>(); int ki = (ds.size() < k) ? ds.size(): k; // randomly pick k instances as the initial k means ArrayList<String> indexes = new ArrayList<String>(ds.size()); ArrayList<String> keys = new ArrayList<String>(ds.getKeys()); for (int i = 0; i < keys.size(); i++) { indexes.add( keys.get(i) ); } Collections.shuffle(indexes); for (int i = 0; i < ki; i++) { Cluster c = this.createCluster(); c.add( ds.get(indexes.get(i)) ); c.updateCentroid(); kmeans.add(c); } return kmeans; }
/**
 * Finds the closest known cluster for an instance, or starts a new one.
 *
 * <p>Every entry of {@code clusters} is scored with {@code distFunc}. The one with the
 * smallest distance that is also strictly below {@code threshold} is selected. If none
 * qualifies, a new cluster is created and the instance is added to it.
 *
 * @param inst pair whose second element is the instance to place
 * @return a new single-entry map: the selected cluster, or the newly created one, keyed by
 *         its id
 */
@Override
public Map<String, Instance> call(Tuple2<String, Instance> inst) throws Exception {
    Instance nearest = null;
    double nearestDistance = Double.MAX_VALUE;

    for (String clusterId : clusters.keySet()) {
        final Instance candidate = clusters.get(clusterId);
        final double distance = distFunc.distance(inst._2, candidate);
        // must beat the best so far and stay under the threshold
        final boolean accepted = distance < nearestDistance && distance < threshold;
        if (accepted) {
            nearest = candidate;
            nearestDistance = distance;
        }
    }

    final Map<String, Instance> result = new HashMap<String, Instance>();
    if (nearest != null) {
        result.put(nearest.getId(), nearest);
        return result;
    }

    // nothing close enough: the instance starts its own cluster
    final Cluster fresh = clusterFactory.create();
    fresh.add(inst._2);
    result.put(fresh.getId(), fresh);
    return result;
}
}
@SuppressWarnings("unchecked") private void updateCluster(Instance inst, Cluster cluster) { if (inst instanceof Cluster) { // merge the clusters Cluster c = (Cluster)inst; for (String key : c.getCentroids().keySet()) { Centroid<Feature> update = c.getCentroids().get(key); Centroid<Feature> centroid = cluster.getCentroids().get(key); // get all the aggregate feature values associated with update for (Feature f : update.getAggregatableCentroid()) { centroid.add(f); } } // after merging cluster we should manually update the resulting centroid cluster.updateCentroid(); // TODO should handle merging the cluster members } else { // simply add the instance to the cluster cluster.add(inst); } }
if ((pair._2 instanceof Cluster) == false) { Cluster c = this.createCluster(); c.add(pair._2); curKmeans.put(pair._1, c);