ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxNGramSize); sf.reset(); try { for (String term : value.getEntries()) { if (!term.isEmpty() && dictionary.containsKey(term)) { // unigram int termId = dictionary.get(term);
// Adds all entries of the next element obtained from `it` to `value`.
value.addAll(it.next().getEntries());
ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxNGramSize); sf.reset(); try { for (String term : value.getEntries()) { if (!term.isEmpty() && dictionary.containsKey(term)) { // unigram int termId = dictionary.get(term);
protected void map(Text key, StringTuple value, final Context context) throws IOException, InterruptedException { ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxShingleSize); sf.reset(); try { new OpenObjectIntHashMap<String>(value.getEntries().size() * (maxShingleSize - 1)); OpenObjectIntHashMap<String> unigrams = new OpenObjectIntHashMap<String>(value.getEntries().size());
protected void map(Text key, StringTuple value, final Context context) throws IOException, InterruptedException { ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxShingleSize); sf.reset(); try { new OpenObjectIntHashMap<String>(value.getEntries().size() * (maxShingleSize - 1)); OpenObjectIntHashMap<String> unigrams = new OpenObjectIntHashMap<String>(value.getEntries().size());
/**
 * Counts how often each entry of {@code value} occurs and writes one
 * {@code (entry, count)} pair per distinct entry to {@code context}.
 *
 * <p>Write failures are not rethrown: each one increments a counter in the
 * {@code "Exception"} group and the remaining pairs are still attempted.
 * When a write is interrupted, the thread's interrupt status is restored so
 * that code further up the stack can still observe the interruption.
 *
 * @param key the input key; not read by this method
 * @param value the tuple whose entries are counted
 * @param context receives the output pairs and the failure counters
 * @throws IOException declared by the signature; write failures are caught and counted
 * @throws InterruptedException declared by the signature; write interruptions are caught and counted
 */
@Override
protected void map(Text key, StringTuple value, final Context context)
    throws IOException, InterruptedException {
  // Tally occurrences per distinct entry before emitting anything.
  OpenObjectLongHashMap<String> wordCount = new OpenObjectLongHashMap<>();
  for (String word : value.getEntries()) {
    if (wordCount.containsKey(word)) {
      wordCount.put(word, wordCount.get(word) + 1);
    } else {
      wordCount.put(word, 1);
    }
  }

  wordCount.forEachPair(new ObjectLongProcedure<String>() {
    @Override
    public boolean apply(String first, long second) {
      try {
        context.write(new Text(first), new LongWritable(second));
      } catch (IOException e) {
        // Best effort: record the failure and keep going with the other pairs.
        context.getCounter("Exception", "Output IO Exception").increment(1);
      } catch (InterruptedException e) {
        context.getCounter("Exception", "Interrupted Exception").increment(1);
        // apply() cannot rethrow a checked exception, so put the interrupt
        // status back instead of silently discarding the cancellation signal.
        Thread.currentThread().interrupt();
      }
      // Always continue with the next pair.
      return true;
    }
  });
}
} // closes the enclosing class, whose header is outside this view
/**
 * Counts how often each entry of {@code value} occurs and writes one
 * {@code (entry, count)} pair per distinct entry to {@code context}.
 *
 * <p>Write failures are not rethrown: each one increments a counter in the
 * {@code "Exception"} group and the remaining pairs are still attempted.
 * When a write is interrupted, the thread's interrupt status is restored so
 * that code further up the stack can still observe the interruption.
 *
 * @param key the input key; not read by this method
 * @param value the tuple whose entries are counted
 * @param context receives the output pairs and the failure counters
 * @throws IOException declared by the signature; write failures are caught and counted
 * @throws InterruptedException declared by the signature; write interruptions are caught and counted
 */
@Override
protected void map(Text key, StringTuple value, final Context context)
    throws IOException, InterruptedException {
  // Tally occurrences per distinct entry before emitting anything.
  OpenObjectLongHashMap<String> wordCount = new OpenObjectLongHashMap<String>();
  for (String word : value.getEntries()) {
    if (wordCount.containsKey(word)) {
      wordCount.put(word, wordCount.get(word) + 1);
    } else {
      wordCount.put(word, 1);
    }
  }

  wordCount.forEachPair(new ObjectLongProcedure<String>() {
    @Override
    public boolean apply(String first, long second) {
      try {
        context.write(new Text(first), new LongWritable(second));
      } catch (IOException e) {
        // Best effort: record the failure and keep going with the other pairs.
        context.getCounter("Exception", "Output IO Exception").increment(1);
      } catch (InterruptedException e) {
        context.getCounter("Exception", "Interrupted Exception").increment(1);
        // apply() cannot rethrow a checked exception, so put the interrupt
        // status back instead of silently discarding the cancellation signal.
        Thread.currentThread().interrupt();
      }
      // Always continue with the next pair.
      return true;
    }
  });
}
} // closes the enclosing class, whose header is outside this view
protected void map(Text key, StringTuple value, final Context context) throws IOException, InterruptedException { try (ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxShingleSize)){ sf.reset(); int count = 0; // ngram count new OpenObjectIntHashMap<>(value.getEntries().size() * (maxShingleSize - 1)); OpenObjectIntHashMap<String> unigrams = new OpenObjectIntHashMap<>(value.getEntries().size());
/**
 * Counts how often each entry of {@code value} occurs and writes one
 * {@code (entry, count)} pair per distinct entry to {@code context}.
 *
 * <p>Write failures are not rethrown: each one increments a counter in the
 * {@code "Exception"} group and the remaining pairs are still attempted.
 * When a write is interrupted, the thread's interrupt status is restored so
 * that code further up the stack can still observe the interruption.
 *
 * @param key the input key; not read by this method
 * @param value the tuple whose entries are counted
 * @param context receives the output pairs and the failure counters
 * @throws IOException declared by the signature; write failures are caught and counted
 * @throws InterruptedException declared by the signature; write interruptions are caught and counted
 */
@Override
protected void map(Text key, StringTuple value, final Context context)
    throws IOException, InterruptedException {
  // Tally occurrences per distinct entry before emitting anything.
  OpenObjectLongHashMap<String> wordCount = new OpenObjectLongHashMap<String>();
  for (String word : value.getEntries()) {
    if (wordCount.containsKey(word)) {
      wordCount.put(word, wordCount.get(word) + 1);
    } else {
      wordCount.put(word, 1);
    }
  }

  wordCount.forEachPair(new ObjectLongProcedure<String>() {
    @Override
    public boolean apply(String first, long second) {
      try {
        context.write(new Text(first), new LongWritable(second));
      } catch (IOException e) {
        // Best effort: record the failure and keep going with the other pairs.
        context.getCounter("Exception", "Output IO Exception").increment(1);
      } catch (InterruptedException e) {
        context.getCounter("Exception", "Interrupted Exception").increment(1);
        // apply() cannot rethrow a checked exception, so put the interrupt
        // status back instead of silently discarding the cancellation signal.
        Thread.currentThread().interrupt();
      }
      // Always continue with the next pair.
      return true;
    }
  });
}
} // closes the enclosing class, whose header is outside this view
// Reads two consecutive records from `reader` into the reused `key`/`value` holders and checks each one:
// the first must have key documentId1 with entries ["test", "document", "processor"],
// the second must have key documentId2 with entries ["another", "one"].
// NOTE(review): the result of reader.next(...) is ignored here; if it signals "no more records",
// a short input would only show up as a failed equality assertion. Confirm against the reader's API.
reader.next(key, value); assertEquals(documentId1, key.toString()); assertEquals(Arrays.asList("test", "document", "processor"), value.getEntries()); reader.next(key, value); assertEquals(documentId2, key.toString()); assertEquals(Arrays.asList("another", "one"), value.getEntries());