/**
 * Initializes this task from the job configuration.
 *
 * <p>Instantiates the {@link VectorSimilarityMeasure} whose class name is stored under
 * {@code SIMILARITY_CLASSNAME}, loads the non-zero-entry counts and the max-values vector from the
 * paths stored under {@code NUM_NON_ZERO_ENTRIES_PATH} and {@code MAXVALUES_PATH}, and parses the
 * value stored under {@code THRESHOLD} as a double.
 *
 * @param ctx task context that supplies the configuration
 * @throws IOException declared by the overridden method; may come from reading the side files
 * @throws InterruptedException declared by the overridden method
 */
@Override
protected void setup(Context ctx) throws IOException, InterruptedException {
  String similarityClassName = ctx.getConfiguration().get(SIMILARITY_CLASSNAME);
  similarity = ClassUtils.instantiateAs(similarityClassName, VectorSimilarityMeasure.class);

  Path numNonZeroEntriesFile = new Path(ctx.getConfiguration().get(NUM_NON_ZERO_ENTRIES_PATH));
  numNonZeroEntries = Vectors.readAsIntMap(numNonZeroEntriesFile, ctx.getConfiguration());

  Path maxValuesFile = new Path(ctx.getConfiguration().get(MAXVALUES_PATH));
  maxValues = Vectors.read(maxValuesFile, ctx.getConfiguration());

  String thresholdText = ctx.getConfiguration().get(THRESHOLD);
  threshold = Double.parseDouble(thresholdText);
}
/**
 * Merges all partial similarity vectors of a row, reduces the merged vector with
 * {@code Vectors.topKElements} using {@code maxSimilaritiesPerRow}, and emits the result under
 * the same row key.
 *
 * @param row key of the row being reduced
 * @param partials partial vectors collected for that row
 * @param ctx context the reduced vector is written to
 */
@Override
protected void reduce(IntWritable row, Iterable<VectorWritable> partials, Context ctx)
    throws IOException, InterruptedException {
  Vector merged = Vectors.merge(partials);
  // presumably keeps at most maxSimilaritiesPerRow entries; semantics live in Vectors.topKElements
  VectorWritable topK = new VectorWritable(Vectors.topKElements(maxSimilaritiesPerRow, merged));
  ctx.write(row, topK);
}
}
/**
 * Sums all partial vectors and writes the sum to the path configured under
 * {@code OBSERVATIONS_PER_COLUMN_PATH}. Nothing is emitted through the context.
 *
 * @param nullWritable key of the group; not used
 * @param partialVectors vectors to add up
 * @param ctx context that supplies the configuration
 */
@Override
protected void reduce(NullWritable nullWritable, Iterable<VectorWritable> partialVectors, Context ctx)
    throws IOException, InterruptedException {
  Vector columnCounts = Vectors.sum(partialVectors.iterator());
  Path target = new Path(ctx.getConfiguration().get(OBSERVATIONS_PER_COLUMN_PATH));
  Vectors.write(columnCounts, target, ctx.getConfiguration());
}
}
/**
 * Opens {@code path} on the file system resolved from its URI and reads its content into an
 * {@link OpenIntIntHashMap} via the stream-based {@code readAsIntMap} overload.
 *
 * @param path location of the serialized data
 * @param conf configuration used to resolve the file system
 * @return the map read from the stream
 * @throws IOException if the file cannot be opened or read
 */
public static OpenIntIntHashMap readAsIntMap(Path path, Configuration conf) throws IOException {
  FSDataInputStream stream = FileSystem.get(path.toUri(), conf).open(path);
  try {
    return readAsIntMap(stream);
  } finally {
    // second argument true: an IOException raised while closing is swallowed, not propagated
    Closeables.close(stream, true);
  }
}
/**
 * Merges the partial vectors of a row and routes the result by row id.
 *
 * <p>Rows whose id equals one of the marker constants are written to a side file
 * ({@code normsPath}, {@code maxValuesPath} or {@code numNonZeroEntriesPath}) instead of being
 * emitted; every other row is emitted through the context.
 *
 * @param row row id, possibly one of the marker constants
 * @param partialVectors partial vectors collected for that row
 * @param ctx context used for output and configuration
 */
@Override
protected void reduce(IntWritable row, Iterable<VectorWritable> partialVectors, Context ctx)
    throws IOException, InterruptedException {
  Vector merged = Vectors.merge(partialVectors);
  int rowId = row.get();

  if (rowId == NORM_VECTOR_MARKER) {
    Vectors.write(merged, normsPath, ctx.getConfiguration());
    return;
  }
  if (rowId == MAXVALUE_VECTOR_MARKER) {
    Vectors.write(merged, maxValuesPath, ctx.getConfiguration());
    return;
  }
  if (rowId == NUM_NON_ZERO_ENTRIES_VECTOR_MARKER) {
    // only this side file is written with the extra boolean flag set to true
    Vectors.write(merged, numNonZeroEntriesPath, ctx.getConfiguration(), true);
    return;
  }

  // regular row: pass the merged vector downstream
  ctx.write(row, new VectorWritable(merged));
}
}
/**
 * Sums all vectors of a key, stores the sum in {@code result} (declared outside this method)
 * and emits {@code result} under the unchanged key.
 *
 * @param key key of the group
 * @param values vectors to add up
 * @param ctx context the sum is written to
 */
@Override
protected void reduce(WritableComparable<?> key, Iterable<VectorWritable> values, Context ctx)
    throws IOException, InterruptedException {
  // the same writable instance is refilled on every call rather than allocating a new one
  result.set(Vectors.sum(values.iterator()));
  ctx.write(key, result);
}
}
/**
 * Merges all partial vectors of a row with {@code Vectors.merge} and emits the merged vector
 * under the same row key.
 *
 * @param row key of the row being reduced
 * @param partialVectors partial vectors collected for that row
 * @param ctx context the merged vector is written to
 */
@Override
protected void reduce(IntWritable row, Iterable<VectorWritable> partialVectors, Context ctx)
    throws IOException, InterruptedException {
  VectorWritable merged = new VectorWritable(Vectors.merge(partialVectors));
  ctx.write(row, merged);
}
}
/**
 * Initializes this task from the job configuration.
 *
 * <p>Instantiates the {@link VectorSimilarityMeasure} named under {@code SIMILARITY_CLASSNAME},
 * reads {@code NUMBER_OF_COLUMNS} (which must be positive) and {@code EXCLUDE_SELF_SIMILARITY}
 * (default {@code false}), loads the norms vector from the path stored under {@code NORMS_PATH},
 * and parses the value stored under {@code THRESHOLD} as a double.
 *
 * @param ctx task context that supplies the configuration
 * @throws IllegalArgumentException if the configured number of columns is missing or not positive
 * @throws IOException declared by the overridden method; may come from reading the norms file
 * @throws InterruptedException declared by the overridden method
 */
@Override
protected void setup(Context ctx) throws IOException, InterruptedException {
  String similarityClassName = ctx.getConfiguration().get(SIMILARITY_CLASSNAME);
  similarity = ClassUtils.instantiateAs(similarityClassName, VectorSimilarityMeasure.class);

  // -1 is the fallback for a missing entry, so the check below also rejects an unset value
  numberOfColumns = ctx.getConfiguration().getInt(NUMBER_OF_COLUMNS, -1);
  Preconditions.checkArgument(numberOfColumns > 0,
      "Number of columns must be greater then 0! But numberOfColumns = " + numberOfColumns);

  excludeSelfSimilarity = ctx.getConfiguration().getBoolean(EXCLUDE_SELF_SIMILARITY, false);

  Path normsFile = new Path(ctx.getConfiguration().get(NORMS_PATH));
  norms = Vectors.read(normsFile, ctx.getConfiguration());

  String thresholdText = ctx.getConfiguration().get(THRESHOLD);
  treshold = Double.parseDouble(thresholdText);
}
/**
 * Writes {@code vector} to {@code path} by delegating to the four-argument {@code write}
 * overload with its boolean flag set to {@code false}.
 *
 * @param vector vector to persist
 * @param path destination
 * @param conf configuration handed on to the overload
 * @throws IOException if the delegate fails to write
 */
public static void write(Vector vector, Path path, Configuration conf) throws IOException {
  write(vector, path, conf, false);
}
/**
 * Emits, for every occurrence in a column, a sparse vector of pairwise aggregates with the
 * occurrences at the same or a later position.
 *
 * <p>The occurrences are sorted with {@code BY_INDEX}. For each pair {@code (n, m)} with
 * {@code m >= n}, {@code similarity.aggregate} is evaluated and stored at the index of the second
 * occurrence, unless a threshold is set and {@code consider} rejects the pair. One vector is
 * written per occurrence, keyed by that occurrence's index, even if every pair was pruned.
 *
 * <p>The numbers of kept and pruned pairs are reported through the {@code COOCCURRENCES} and
 * {@code PRUNED_COOCCURRENCES} counters.
 *
 * @param column key of the input column; not used
 * @param occurrenceVector entries of that column
 * @param ctx context used for output and counters
 */
@Override
protected void map(IntWritable column, VectorWritable occurrenceVector, Context ctx)
    throws IOException, InterruptedException {
  Vector.Element[] occurrences = Vectors.toArray(occurrenceVector);
  Arrays.sort(occurrences, BY_INDEX);

  // long rather than int: the loops visit n * (n + 1) / 2 pairs, which exceeds
  // Integer.MAX_VALUE once a column holds 65,536 or more occurrences
  long cooccurrences = 0;
  long prunedCooccurrences = 0;

  for (int n = 0; n < occurrences.length; n++) {
    Vector.Element occurrenceA = occurrences[n];
    Vector dots = new RandomAccessSparseVector(Integer.MAX_VALUE);
    // m starts at n, so each occurrence is also paired with itself
    for (int m = n; m < occurrences.length; m++) {
      Vector.Element occurrenceB = occurrences[m];
      if (threshold == NO_THRESHOLD || consider(occurrenceA, occurrenceB)) {
        dots.setQuick(occurrenceB.index(), similarity.aggregate(occurrenceA.get(), occurrenceB.get()));
        cooccurrences++;
      } else {
        prunedCooccurrences++;
      }
    }
    ctx.write(new IntWritable(occurrenceA.index()), new VectorWritable(dots));
  }

  ctx.getCounter(Counters.COOCCURRENCES).increment(cooccurrences);
  ctx.getCounter(Counters.PRUNED_COOCCURRENCES).increment(prunedCooccurrences);
}
}
/**
 * Reads the file at {@code path} into an {@link OpenIntIntHashMap}.
 *
 * <p>The file system is resolved from the path's URI and {@code conf}; the opened stream is
 * handed to the stream-based {@code readAsIntMap} overload and closed afterwards.
 *
 * @param path file to read
 * @param conf configuration used to look up the file system
 * @return the map produced by the stream-based overload
 * @throws IOException if opening or reading the file fails
 */
public static OpenIntIntHashMap readAsIntMap(Path path, Configuration conf) throws IOException {
  FileSystem fileSystem = FileSystem.get(path.toUri(), conf);
  FSDataInputStream input = fileSystem.open(path);
  try {
    OpenIntIntHashMap map = readAsIntMap(input);
    return map;
  } finally {
    // close failures are swallowed (second argument true) so they cannot mask a read error
    Closeables.close(input, true);
  }
}
/**
 * Merges a row's partial vectors and either persists or emits the merged vector.
 *
 * <p>A row id equal to {@code NORM_VECTOR_MARKER}, {@code MAXVALUE_VECTOR_MARKER} or
 * {@code NUM_NON_ZERO_ENTRIES_VECTOR_MARKER} sends the vector to {@code normsPath},
 * {@code maxValuesPath} or {@code numNonZeroEntriesPath} respectively. Any other row is written
 * to the context under its own key.
 *
 * @param row row id, possibly one of the marker constants
 * @param partialVectors partial vectors collected for that row
 * @param ctx context used for output and configuration
 */
@Override
protected void reduce(IntWritable row, Iterable<VectorWritable> partialVectors, Context ctx)
    throws IOException, InterruptedException {
  Vector combined = Vectors.merge(partialVectors);
  int id = row.get();

  boolean isNorms = id == NORM_VECTOR_MARKER;
  if (isNorms) {
    Vectors.write(combined, normsPath, ctx.getConfiguration());
    return;
  }

  boolean isMaxValues = id == MAXVALUE_VECTOR_MARKER;
  if (isMaxValues) {
    Vectors.write(combined, maxValuesPath, ctx.getConfiguration());
    return;
  }

  boolean isNonZeroCounts = id == NUM_NON_ZERO_ENTRIES_VECTOR_MARKER;
  if (isNonZeroCounts) {
    // this branch alone passes true for the trailing boolean flag
    Vectors.write(combined, numNonZeroEntriesPath, ctx.getConfiguration(), true);
    return;
  }

  ctx.write(row, new VectorWritable(combined));
}
}
/**
 * Sums all vectors of a key with {@code Vectors.sum} and emits the sum, wrapped in a new
 * {@link VectorWritable}, under the unchanged key.
 *
 * @param key key of the group
 * @param values vectors to add up
 * @param ctx context the sum is written to
 */
@Override
protected void reduce(WritableComparable<?> key, Iterable<VectorWritable> values, Context ctx)
    throws IOException, InterruptedException {
  VectorWritable summed = new VectorWritable(Vectors.sum(values.iterator()));
  ctx.write(key, summed);
}
}
/**
 * Combines a row's partial vectors into one vector via {@code Vectors.merge} and writes it to
 * the context under the same row key.
 *
 * @param row key of the row being reduced
 * @param partialVectors partial vectors collected for that row
 * @param ctx context the combined vector is written to
 */
@Override
protected void reduce(IntWritable row, Iterable<VectorWritable> partialVectors, Context ctx)
    throws IOException, InterruptedException {
  VectorWritable combined = new VectorWritable(Vectors.merge(partialVectors));
  ctx.write(row, combined);
}
}
/**
 * Loads everything this task needs from the job configuration.
 *
 * <ul>
 *   <li>the {@link VectorSimilarityMeasure} implementation named under {@code SIMILARITY_CLASSNAME}</li>
 *   <li>{@code NUMBER_OF_COLUMNS}, which must be greater than zero</li>
 *   <li>{@code EXCLUDE_SELF_SIMILARITY}, defaulting to {@code false}</li>
 *   <li>the norms vector stored at the path under {@code NORMS_PATH}</li>
 *   <li>the value under {@code THRESHOLD}, parsed as a double</li>
 * </ul>
 *
 * @param ctx task context that supplies the configuration
 * @throws IllegalArgumentException if the configured number of columns is missing or not positive
 * @throws IOException declared by the overridden method; may come from reading the norms file
 * @throws InterruptedException declared by the overridden method
 */
@Override
protected void setup(Context ctx) throws IOException, InterruptedException {
  String measureClass = ctx.getConfiguration().get(SIMILARITY_CLASSNAME);
  similarity = ClassUtils.instantiateAs(measureClass, VectorSimilarityMeasure.class);

  // an absent entry yields -1 and therefore fails the precondition
  numberOfColumns = ctx.getConfiguration().getInt(NUMBER_OF_COLUMNS, -1);
  Preconditions.checkArgument(numberOfColumns > 0,
      "Number of columns must be greater then 0! But numberOfColumns = " + numberOfColumns);

  excludeSelfSimilarity = ctx.getConfiguration().getBoolean(EXCLUDE_SELF_SIMILARITY, false);

  Path normsLocation = new Path(ctx.getConfiguration().get(NORMS_PATH));
  norms = Vectors.read(normsLocation, ctx.getConfiguration());

  String rawThreshold = ctx.getConfiguration().get(THRESHOLD);
  treshold = Double.parseDouble(rawThreshold);
}
/**
 * Convenience overload: forwards to the four-argument {@code write} and passes {@code false}
 * for its trailing boolean flag.
 *
 * @param vector vector to persist
 * @param path destination
 * @param conf configuration handed on to the overload
 * @throws IOException if the delegate fails to write
 */
public static void write(Vector vector, Path path, Configuration conf) throws IOException {
  write(vector, path, conf, false);
}
/**
 * Computes pairwise aggregates between the occurrences of one column.
 *
 * <p>After sorting the occurrences with {@code BY_INDEX}, every occurrence is paired with itself
 * and with each occurrence that follows it. A pair is aggregated through
 * {@code similarity.aggregate} when no threshold is configured or {@code consider} accepts it;
 * otherwise it is counted as pruned. For each occurrence one sparse vector, keyed by the
 * occurrence's index, is written to the context, even when it holds no entries.
 *
 * <p>The totals of aggregated and pruned pairs are added to the {@code COOCCURRENCES} and
 * {@code PRUNED_COOCCURRENCES} counters.
 *
 * @param column key of the input column; not used
 * @param occurrenceVector entries of that column
 * @param ctx context used for output and counters
 */
@Override
protected void map(IntWritable column, VectorWritable occurrenceVector, Context ctx)
    throws IOException, InterruptedException {
  Vector.Element[] occurrences = Vectors.toArray(occurrenceVector);
  Arrays.sort(occurrences, BY_INDEX);

  // Pair counts grow quadratically (n * (n + 1) / 2); an int would wrap around for columns
  // with 65,536 or more occurrences, so the tallies are kept as long.
  long cooccurrences = 0;
  long prunedCooccurrences = 0;

  for (int n = 0; n < occurrences.length; n++) {
    Vector.Element occurrenceA = occurrences[n];
    Vector dots = new RandomAccessSparseVector(Integer.MAX_VALUE);
    // inner loop begins at n: the self pair (n, n) is included
    for (int m = n; m < occurrences.length; m++) {
      Vector.Element occurrenceB = occurrences[m];
      if (threshold == NO_THRESHOLD || consider(occurrenceA, occurrenceB)) {
        dots.setQuick(occurrenceB.index(), similarity.aggregate(occurrenceA.get(), occurrenceB.get()));
        cooccurrences++;
      } else {
        prunedCooccurrences++;
      }
    }
    ctx.write(new IntWritable(occurrenceA.index()), new VectorWritable(dots));
  }

  ctx.getCounter(Counters.COOCCURRENCES).increment(cooccurrences);
  ctx.getCounter(Counters.PRUNED_COOCCURRENCES).increment(prunedCooccurrences);
}
}
/**
 * Loads the similarity measure, the side data and the threshold from the job configuration.
 *
 * <p>The measure class name is read from {@code SIMILARITY_CLASSNAME}; the non-zero-entry counts
 * and the max-values vector are read from the files whose paths are stored under
 * {@code NUM_NON_ZERO_ENTRIES_PATH} and {@code MAXVALUES_PATH}; the value under {@code THRESHOLD}
 * is parsed as a double.
 *
 * @param ctx task context that supplies the configuration
 * @throws IOException declared by the overridden method; may come from reading the side files
 * @throws InterruptedException declared by the overridden method
 */
@Override
protected void setup(Context ctx) throws IOException, InterruptedException {
  String measureClass = ctx.getConfiguration().get(SIMILARITY_CLASSNAME);
  similarity = ClassUtils.instantiateAs(measureClass, VectorSimilarityMeasure.class);

  Path nonZeroCountsLocation = new Path(ctx.getConfiguration().get(NUM_NON_ZERO_ENTRIES_PATH));
  numNonZeroEntries = Vectors.readAsIntMap(nonZeroCountsLocation, ctx.getConfiguration());

  Path maxValuesLocation = new Path(ctx.getConfiguration().get(MAXVALUES_PATH));
  maxValues = Vectors.read(maxValuesLocation, ctx.getConfiguration());

  String rawThreshold = ctx.getConfiguration().get(THRESHOLD);
  threshold = Double.parseDouble(rawThreshold);
}
/**
 * Loads an {@link OpenIntIntHashMap} from the file at {@code path}.
 *
 * <p>Resolves the file system from the path's URI, opens the file and delegates the decoding to
 * the stream-based {@code readAsIntMap} overload. The stream is closed in all cases.
 *
 * @param path file to read
 * @param conf configuration used to resolve the file system
 * @return the decoded map
 * @throws IOException if opening or reading the file fails
 */
public static OpenIntIntHashMap readAsIntMap(Path path, Configuration conf) throws IOException {
  FSDataInputStream source = FileSystem.get(path.toUri(), conf).open(path);
  try {
    return readAsIntMap(source);
  } finally {
    // true = swallow an IOException thrown by close()
    Closeables.close(source, true);
  }
}
/**
 * Merges the partial similarity vectors of a row, passes the merged vector through
 * {@code Vectors.topKElements} with {@code maxSimilaritiesPerRow}, and writes the outcome to the
 * context under the same row key.
 *
 * @param row key of the row being reduced
 * @param partials partial vectors collected for that row
 * @param ctx context the reduced vector is written to
 */
@Override
protected void reduce(IntWritable row, Iterable<VectorWritable> partials, Context ctx)
    throws IOException, InterruptedException {
  Vector rowSimilarities = Vectors.merge(partials);
  // selection semantics are defined by Vectors.topKElements; presumably a top-k cut per row
  Vector retained = Vectors.topKElements(maxSimilaritiesPerRow, rowSimilarities);
  VectorWritable output = new VectorWritable(retained);
  ctx.write(row, output);
}
}