public int run(String[] args) throws Exception { addInputOption(); addOutputOption(); addOption(DefaultOptionCreator.methodOption().create()); addOption(DefaultOptionCreator.clustersInOption() .withDescription("The input centroids, as Vectors. Must be a SequenceFile of Writable, Cluster/Canopy.") .create()); if (parseArguments(args) == null) { return -1; Path input = getInputPath(); Path output = getOutputPath(); if (getConf() == null) { setConf(new Configuration()); Path clustersIn = new Path(getOption(DefaultOptionCreator.CLUSTERS_IN_OPTION)); boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase( DefaultOptionCreator.SEQUENTIAL_METHOD); if (hasOption(DefaultOptionCreator.OUTLIER_THRESHOLD)) { clusterClassificationThreshold = Double.parseDouble(getOption(DefaultOptionCreator.OUTLIER_THRESHOLD)); run(getConf(), input, clustersIn, output, clusterClassificationThreshold, true, runSequential);
/**
 * Program entry point. Hands control to Hadoop's {@code ToolRunner}, which
 * strips generic options (-D, -fs, ...) before invoking {@code run(String[])}.
 *
 * @param args command-line arguments forwarded to the driver
 * @throws Exception propagated from the underlying tool
 */
public static void main(String[] args) throws Exception {
  ClusterClassificationDriver driver = new ClusterClassificationDriver();
  ToolRunner.run(new Configuration(), driver, args);
}
/**
 * Classifies the input vectors against previously computed clusters, either
 * in-process (sequential) or as a MapReduce job.
 *
 * @param conf Hadoop configuration
 * @param input path of the vectors to classify
 * @param clusteringOutputPath output path of the prior clustering run
 * @param output where classified vectors are written
 * @param clusterClassificationThreshold outlier-removal threshold; vectors below it are not classified
 * @param emitMostLikely whether to emit only the single most likely cluster
 * @param runSequential true for sequential execution, false for MapReduce
 * @throws IOException on I/O failure
 * @throws InterruptedException if the MR job is interrupted
 * @throws ClassNotFoundException if a job class cannot be loaded
 */
public static void run(Configuration conf, Path input, Path clusteringOutputPath, Path output,
    double clusterClassificationThreshold, boolean emitMostLikely, boolean runSequential)
    throws IOException, InterruptedException, ClassNotFoundException {
  if (!runSequential) {
    classifyClusterMR(conf, input, clusteringOutputPath, output, clusterClassificationThreshold, emitMostLikely);
    return;
  }
  classifyClusterSeq(conf, input, clusteringOutputPath, output, clusterClassificationThreshold, emitMostLikely);
}
/**
 * Sequentially classifies input vectors: loads the final cluster models and the
 * stored clustering policy, then selects a cluster for each input vector.
 *
 * Fix: the threshold parameter was boxed {@code Double} while every sibling
 * method uses primitive {@code double}; the boxed form invites NPEs and
 * needless autoboxing, so it is now primitive (all callers pass primitives).
 *
 * @param conf Hadoop configuration
 * @param input path of the vectors to classify
 * @param clusters output path of the prior clustering run
 * @param output where classified vectors are written
 * @param clusterClassificationThreshold outlier-removal threshold
 * @param emitMostLikely whether to emit only the single most likely cluster
 * @throws IOException on I/O failure
 */
private static void classifyClusterSeq(Configuration conf, Path input, Path clusters, Path output,
    double clusterClassificationThreshold, boolean emitMostLikely) throws IOException {
  List<Cluster> clusterModels = populateClusterModels(clusters, conf);
  ClusteringPolicy policy = ClusterClassifier.readPolicy(finalClustersPath(conf, clusters));
  ClusterClassifier clusterClassifier = new ClusterClassifier(clusterModels, policy);
  selectCluster(input, clusterModels, clusterClassifier, output, clusterClassificationThreshold, emitMostLikely);
}
/**
 * Persists a canopy clustering policy alongside the canopies, then classifies
 * the given points against them, writing results under the clustered-points
 * subdirectory of {@code output}.
 *
 * @param conf Hadoop configuration
 * @param points path of the vectors to classify
 * @param canopies path of the canopy clusters (policy is written here)
 * @param output clustering output root
 * @param clusterClassificationThreshold outlier-removal threshold
 * @param runSequential true for sequential execution, false for MapReduce
 * @throws IOException on I/O failure
 * @throws InterruptedException if the MR job is interrupted
 * @throws ClassNotFoundException if a job class cannot be loaded
 */
private static void clusterData(Configuration conf, Path points, Path canopies, Path output,
    double clusterClassificationThreshold, boolean runSequential)
    throws IOException, InterruptedException, ClassNotFoundException {
  // Store the policy first so the classification step can read it back.
  ClusterClassifier.writePolicy(new CanopyClusteringPolicy(), canopies);
  Path clusteredPoints = new Path(output, PathDirectory.CLUSTERED_POINTS_DIRECTORY);
  ClusterClassificationDriver.run(conf, points, output, clusteredPoints,
      clusterClassificationThreshold, true, runSequential);
}
// NOTE(review): fragment of the per-vector classification loop; the enclosing
// method and the closing braces are outside this chunk. The vector is written
// only when shouldClassify() judges its cluster pdf to clear the outlier
// threshold — presumably vectors failing the check are skipped; confirm
// against the full method body.
if (shouldClassify(pdfPerCluster, clusterClassificationThreshold)) { classifyAndWrite(clusterModels, clusterClassificationThreshold, emitMostLikely, writer, new VectorWritable(vector), pdfPerCluster);
/**
 * Populates a list with the clusters found in the clusters-*-final directory.
 *
 * @param clusterOutputPath output path of the clustering run
 * @param conf the Hadoop configuration
 * @return the clusters discovered under the final-clusters directory
 * @throws IOException on read failure
 */
private static List<Cluster> populateClusterModels(Path clusterOutputPath, Configuration conf) throws IOException {
  Path finalClusters = finalClustersPath(conf, clusterOutputPath);
  List<Cluster> models = new ArrayList<>();
  // Walk every value in the directory's part files; ordering is not significant.
  for (Iterator<?> values = new SequenceFileDirValueIterator<>(finalClusters, PathType.LIST,
      PathFilters.partFilter(), null, false, conf); values.hasNext();) {
    Cluster cluster = ((ClusterWritable) values.next()).getValue();
    cluster.configure(conf);
    models.add(cluster);
  }
  return models;
}
/**
 * Sequentially classifies input vectors: loads the final cluster models and the
 * stored clustering policy, then selects a cluster for each input vector.
 *
 * Fix: the threshold parameter was boxed {@code Double} while every sibling
 * method uses primitive {@code double}; the boxed form invites NPEs and
 * needless autoboxing, so it is now primitive (all callers pass primitives).
 *
 * @param conf Hadoop configuration
 * @param input path of the vectors to classify
 * @param clusters output path of the prior clustering run
 * @param output where classified vectors are written
 * @param clusterClassificationThreshold outlier-removal threshold
 * @param emitMostLikely whether to emit only the single most likely cluster
 * @throws IOException on I/O failure
 */
private static void classifyClusterSeq(Configuration conf, Path input, Path clusters, Path output,
    double clusterClassificationThreshold, boolean emitMostLikely) throws IOException {
  List<Cluster> clusterModels = populateClusterModels(clusters, conf);
  ClusteringPolicy policy = ClusterClassifier.readPolicy(finalClustersPath(conf, clusters));
  ClusterClassifier clusterClassifier = new ClusterClassifier(clusterModels, policy);
  selectCluster(input, clusterModels, clusterClassifier, output, clusterClassificationThreshold, emitMostLikely);
}
/**
 * Persists a canopy clustering policy alongside the canopies, then classifies
 * the given points against them, writing results under the clustered-points
 * subdirectory of {@code output}.
 *
 * @param conf Hadoop configuration
 * @param points path of the vectors to classify
 * @param canopies path of the canopy clusters (policy is written here)
 * @param output clustering output root
 * @param clusterClassificationThreshold outlier-removal threshold
 * @param runSequential true for sequential execution, false for MapReduce
 * @throws IOException on I/O failure
 * @throws InterruptedException if the MR job is interrupted
 * @throws ClassNotFoundException if a job class cannot be loaded
 */
private static void clusterData(Configuration conf, Path points, Path canopies, Path output,
    double clusterClassificationThreshold, boolean runSequential)
    throws IOException, InterruptedException, ClassNotFoundException {
  // Store the policy first so the classification step can read it back.
  ClusterClassifier.writePolicy(new CanopyClusteringPolicy(), canopies);
  Path clusteredPoints = new Path(output, PathDirectory.CLUSTERED_POINTS_DIRECTORY);
  ClusterClassificationDriver.run(conf, points, output, clusteredPoints,
      clusterClassificationThreshold, true, runSequential);
}
// NOTE(review): fragment of the per-vector classification loop; the enclosing
// method and the closing braces are outside this chunk. The vector is written
// only when shouldClassify() judges its cluster pdf to clear the outlier
// threshold — presumably vectors failing the check are skipped; confirm
// against the full method body.
if (shouldClassify(pdfPerCluster, clusterClassificationThreshold)) { classifyAndWrite(clusterModels, clusterClassificationThreshold, emitMostLikely, writer, new VectorWritable(vector), pdfPerCluster);
/**
 * Populates a list with the clusters found in the clusters-*-final directory.
 *
 * Fix: replaced Guava {@code Lists.newArrayList()} and the explicit
 * {@code <Writable>} type argument with {@code new ArrayList<>()} and the
 * diamond operator, matching the style this file's other copy of the method
 * already uses — no behavior change.
 *
 * @param clusterOutputPath output path of the clustering run
 * @param conf the Hadoop configuration
 * @return the clusters discovered under the final-clusters directory
 * @throws IOException on read failure
 */
private static List<Cluster> populateClusterModels(Path clusterOutputPath, Configuration conf) throws IOException {
  List<Cluster> clusterModels = new ArrayList<>();
  Path finalClustersPath = finalClustersPath(conf, clusterOutputPath);
  Iterator<?> it = new SequenceFileDirValueIterator<>(finalClustersPath, PathType.LIST,
      PathFilters.partFilter(), null, false, conf);
  while (it.hasNext()) {
    ClusterWritable next = (ClusterWritable) it.next();
    Cluster cluster = next.getValue();
    cluster.configure(conf);
    clusterModels.add(cluster);
  }
  return clusterModels;
}
public int run(String[] args) throws Exception { addInputOption(); addOutputOption(); addOption(DefaultOptionCreator.methodOption().create()); addOption(DefaultOptionCreator.clustersInOption() .withDescription("The input centroids, as Vectors. Must be a SequenceFile of Writable, Cluster/Canopy.") .create()); if (parseArguments(args) == null) { return -1; Path input = getInputPath(); Path output = getOutputPath(); if (getConf() == null) { setConf(new Configuration()); Path clustersIn = new Path(getOption(DefaultOptionCreator.CLUSTERS_IN_OPTION)); boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase( DefaultOptionCreator.SEQUENTIAL_METHOD); if (hasOption(DefaultOptionCreator.OUTLIER_THRESHOLD)) { clusterClassificationThreshold = Double.parseDouble(getOption(DefaultOptionCreator.OUTLIER_THRESHOLD)); run(getConf(), input, clustersIn, output, clusterClassificationThreshold, true, runSequential);
/**
 * Sequentially classifies input vectors: loads the final cluster models and the
 * stored clustering policy, then selects a cluster for each input vector.
 *
 * Fix: the threshold parameter was boxed {@code Double} while every sibling
 * method uses primitive {@code double}; the boxed form invites NPEs and
 * needless autoboxing, so it is now primitive (all callers pass primitives).
 *
 * @param conf Hadoop configuration
 * @param input path of the vectors to classify
 * @param clusters output path of the prior clustering run
 * @param output where classified vectors are written
 * @param clusterClassificationThreshold outlier-removal threshold
 * @param emitMostLikely whether to emit only the single most likely cluster
 * @throws IOException on I/O failure
 */
private static void classifyClusterSeq(Configuration conf, Path input, Path clusters, Path output,
    double clusterClassificationThreshold, boolean emitMostLikely) throws IOException {
  List<Cluster> clusterModels = populateClusterModels(clusters, conf);
  ClusteringPolicy policy = ClusterClassifier.readPolicy(finalClustersPath(conf, clusters));
  ClusterClassifier clusterClassifier = new ClusterClassifier(clusterModels, policy);
  selectCluster(input, clusterModels, clusterClassifier, output, clusterClassificationThreshold, emitMostLikely);
}
/**
 * Classifies the input vectors against previously computed clusters, either
 * in-process (sequential) or as a MapReduce job.
 *
 * @param conf Hadoop configuration
 * @param input path of the vectors to classify
 * @param clusteringOutputPath output path of the prior clustering run
 * @param output where classified vectors are written
 * @param clusterClassificationThreshold outlier-removal threshold; vectors below it are not classified
 * @param emitMostLikely whether to emit only the single most likely cluster
 * @param runSequential true for sequential execution, false for MapReduce
 * @throws IOException on I/O failure
 * @throws InterruptedException if the MR job is interrupted
 * @throws ClassNotFoundException if a job class cannot be loaded
 */
public static void run(Configuration conf, Path input, Path clusteringOutputPath, Path output,
    double clusterClassificationThreshold, boolean emitMostLikely, boolean runSequential)
    throws IOException, InterruptedException, ClassNotFoundException {
  if (!runSequential) {
    classifyClusterMR(conf, input, clusteringOutputPath, output, clusterClassificationThreshold, emitMostLikely);
    return;
  }
  classifyClusterSeq(conf, input, clusteringOutputPath, output, clusterClassificationThreshold, emitMostLikely);
}
/**
 * Persists a canopy clustering policy alongside the canopies, then classifies
 * the given points against them, writing results under the clustered-points
 * subdirectory of {@code output}.
 *
 * @param conf Hadoop configuration
 * @param points path of the vectors to classify
 * @param canopies path of the canopy clusters (policy is written here)
 * @param output clustering output root
 * @param clusterClassificationThreshold outlier-removal threshold
 * @param runSequential true for sequential execution, false for MapReduce
 * @throws IOException on I/O failure
 * @throws InterruptedException if the MR job is interrupted
 * @throws ClassNotFoundException if a job class cannot be loaded
 */
private static void clusterData(Configuration conf, Path points, Path canopies, Path output,
    double clusterClassificationThreshold, boolean runSequential)
    throws IOException, InterruptedException, ClassNotFoundException {
  // Store the policy first so the classification step can read it back.
  ClusterClassifier.writePolicy(new CanopyClusteringPolicy(), canopies);
  Path clusteredPoints = new Path(output, PathDirectory.CLUSTERED_POINTS_DIRECTORY);
  ClusterClassificationDriver.run(conf, points, output, clusteredPoints,
      clusterClassificationThreshold, true, runSequential);
}
// NOTE(review): fragment of the per-vector classification loop; the enclosing
// method and the closing braces are outside this chunk. The vector is written
// only when shouldClassify() judges its cluster pdf to clear the outlier
// threshold — presumably vectors failing the check are skipped; confirm
// against the full method body.
if (shouldClassify(pdfPerCluster, clusterClassificationThreshold)) { classifyAndWrite(clusterModels, clusterClassificationThreshold, emitMostLikely, writer, new VectorWritable(vector), pdfPerCluster);
/**
 * Program entry point. Hands control to Hadoop's {@code ToolRunner}, which
 * strips generic options (-D, -fs, ...) before invoking {@code run(String[])}.
 *
 * @param args command-line arguments forwarded to the driver
 * @throws Exception propagated from the underlying tool
 */
public static void main(String[] args) throws Exception {
  ClusterClassificationDriver driver = new ClusterClassificationDriver();
  ToolRunner.run(new Configuration(), driver, args);
}
/**
 * Populates a list with the clusters found in the clusters-*-final directory.
 *
 * Fix: replaced Guava {@code Lists.newArrayList()} and the explicit
 * {@code <Writable>} type argument with {@code new ArrayList<>()} and the
 * diamond operator, matching the style this file's other copy of the method
 * already uses — no behavior change.
 *
 * @param clusterOutputPath output path of the clustering run
 * @param conf the Hadoop configuration
 * @return the clusters discovered under the final-clusters directory
 * @throws IOException on read failure
 */
private static List<Cluster> populateClusterModels(Path clusterOutputPath, Configuration conf) throws IOException {
  List<Cluster> clusterModels = new ArrayList<>();
  Path finalClustersPath = finalClustersPath(conf, clusterOutputPath);
  Iterator<?> it = new SequenceFileDirValueIterator<>(finalClustersPath, PathType.LIST,
      PathFilters.partFilter(), null, false, conf);
  while (it.hasNext()) {
    ClusterWritable next = (ClusterWritable) it.next();
    Cluster cluster = next.getValue();
    cluster.configure(conf);
    clusterModels.add(cluster);
  }
  return clusterModels;
}
public int run(String[] args) throws Exception { addInputOption(); addOutputOption(); addOption(DefaultOptionCreator.methodOption().create()); addOption(DefaultOptionCreator.clustersInOption() .withDescription("The input centroids, as Vectors. Must be a SequenceFile of Writable, Cluster/Canopy.") .create()); if (parseArguments(args) == null) { return -1; Path input = getInputPath(); Path output = getOutputPath(); if (getConf() == null) { setConf(new Configuration()); Path clustersIn = new Path(getOption(DefaultOptionCreator.CLUSTERS_IN_OPTION)); boolean runSequential = getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase( DefaultOptionCreator.SEQUENTIAL_METHOD); if (hasOption(DefaultOptionCreator.OUTLIER_THRESHOLD)) { clusterClassificationThreshold = Double.parseDouble(getOption(DefaultOptionCreator.OUTLIER_THRESHOLD)); run(getConf(), input, clustersIn, output, clusterClassificationThreshold, true, runSequential);
/**
 * Classifies the input vectors against previously computed clusters, either
 * in-process (sequential) or as a MapReduce job.
 *
 * @param conf Hadoop configuration
 * @param input path of the vectors to classify
 * @param clusteringOutputPath output path of the prior clustering run
 * @param output where classified vectors are written
 * @param clusterClassificationThreshold outlier-removal threshold; vectors below it are not classified
 * @param emitMostLikely whether to emit only the single most likely cluster
 * @param runSequential true for sequential execution, false for MapReduce
 * @throws IOException on I/O failure
 * @throws InterruptedException if the MR job is interrupted
 * @throws ClassNotFoundException if a job class cannot be loaded
 */
public static void run(Configuration conf, Path input, Path clusteringOutputPath, Path output,
    double clusterClassificationThreshold, boolean emitMostLikely, boolean runSequential)
    throws IOException, InterruptedException, ClassNotFoundException {
  if (!runSequential) {
    classifyClusterMR(conf, input, clusteringOutputPath, output, clusterClassificationThreshold, emitMostLikely);
    return;
  }
  classifyClusterSeq(conf, input, clusteringOutputPath, output, clusterClassificationThreshold, emitMostLikely);
}