/**
 * {@inheritDoc}
 * Aggregates the document frequency over every non-null child entry.
 */
public int getDocumentFrequency() {
    int total = 0;
    for (LexiconEntry child : children) {
        if (child != null) {
            total += child.getDocumentFrequency();
        }
    }
    return total;
}
/**
 * Removes from this lexicon every term whose document frequency is
 * strictly below the given cutoff.
 *
 * @param cutoff minimum document frequency a term must have to survive
 * @return the number of terms removed
 */
public int trimLexicon(int cutoff) {
    synchronized (modificationLock) {
        int removedCount = 0;
        // snapshot the key set so removals do not disturb the iteration
        for (Object key : super.map.keySet().toArray()) {
            final Text term = (Text) key;
            if (super.map.get(term).getDocumentFrequency() < cutoff) {
                super.map.remove(term);
                removedCount++;
            }
        }
        return removedCount;
    }
}
/**
 * {@inheritDoc}
 * Folds one lexicon entry into the running collection statistics:
 * term frequency into the token total, document frequency into the
 * pointer total, and one more distinct term.
 */
public void count(LexiconEntry value) {
    numberOfTerms++;
    numberOfPointers += value.getDocumentFrequency();
    numberOfTokens += value.getFrequency();
}
/**
/**
 * Allocates the per-term posting accumulators: {@code 2 + fieldCount}
 * int lists, each pre-sized to the term's document frequency so they
 * do not grow while postings are appended.
 *
 * @param le the lexicon entry whose document frequency sizes the lists
 * @return the freshly allocated, empty lists
 */
protected TIntArrayList[] createPointerForTerm(LexiconEntry le) {
    final int listCount = fieldCount + 2;
    final int expectedPostings = le.getDocumentFrequency();
    final TIntArrayList[] lists = new TIntArrayList[listCount];
    for (int i = 0; i < listCount; i++) {
        lists[i] = new TIntArrayList(expectedPostings);
    }
    return lists;
}
/**
 * Groups the terms of this lexicon into bins keyed by the (rounded-up)
 * natural log of their document frequency, integer-divided by
 * {@code binsize}. Terms with a document frequency below {@code minDf}
 * are skipped.
 * NOTE(review): assumes {@code binsize > 0} and {@code minDf >= 1}
 * (df of 0 would make Math.log diverge) — confirm against callers.
 *
 * @param binsize divisor applied to the log-scaled document frequency
 * @param minDf   minimum document frequency for a term to be binned
 * @return map from bin index to the (term, entry) pairs falling in it
 */
public TIntObjectHashMap<ArrayList<Map.Entry<String,LexiconEntry>>> getTopTermBins(int binsize, int minDf) {
    synchronized (modificationLock) {
        final TIntObjectHashMap<ArrayList<Map.Entry<String,LexiconEntry>>> bins =
            new TIntObjectHashMap<ArrayList<Map.Entry<String,LexiconEntry>>>();
        for (Object key : super.map.keySet().toArray()) {
            final Text term = (Text) key;
            final LexiconEntry le = super.map.get(term);
            final int df = le.getDocumentFrequency();
            if (df < minDf)
                continue;
            final int bin = ((int) Math.ceil(Math.log(df))) / binsize;
            ArrayList<Map.Entry<String,LexiconEntry>> bucket = bins.get(bin);
            if (bucket == null) {
                bucket = new ArrayList<Map.Entry<String,LexiconEntry>>();
                bins.put(bin, bucket);
            }
            bucket.add(new AbstractMap.SimpleEntry<String,LexiconEntry>(term.toString(), le));
        }
        return bins;
    }
}
/**
 * {@inheritDoc}
 * Renders this entry as {@code (df,tf)} followed by the pointer's
 * string form.
 * Fix: the original {@code '(' + getDocumentFrequency()} added the
 * char code of '(' (40) to the document frequency (char/int binary
 * numeric promotion) instead of concatenating, producing e.g.
 * {@code "45,5)..."} rather than {@code "(5,5)..."}.
 */
public String toString() {
    return "(" + getDocumentFrequency() + "," + getFrequency() + ')' + pointerToString();
}
/**
/**
 * {@inheritDoc}
 * Besides the global token/pointer/term totals, accumulates the
 * per-field term frequencies of the given field-aware entry into
 * {@code tokensF}.
 */
public void count(LexiconEntry value) {
    numberOfTerms++;
    numberOfTokens += value.getFrequency();
    numberOfPointers += value.getDocumentFrequency();
    final int[] perFieldFreqs = ((FieldLexiconEntry) value).getFieldFrequencies();
    for (int f = 0; f < numFields; f++) {
        tokensF[f] += (long) perFieldFreqs[f];
    }
}
/**
 * {@inheritDoc}
 * Renders this entry as {@code (df,tf)} followed by the pointer's
 * string form.
 * Fix: the original {@code '(' + getDocumentFrequency()} added the
 * char code of '(' (40) to the document frequency (char/int binary
 * numeric promotion) instead of concatenating, producing e.g.
 * {@code "45,5)..."} rather than {@code "(5,5)..."}.
 */
public String toString() {
    return "(" + getDocumentFrequency() + "," + getFrequency() + ')' + pointerToString();
}
/**
/**
 * Allocates the per-term posting accumulators for the block/field
 * layout: {@code 4 + fieldCount} int lists in total. The first
 * {@code fieldCount + 3} are pre-sized to the term's document
 * frequency; the last is pre-sized to the term's total frequency.
 *
 * @param le the lexicon entry whose statistics size the lists
 * @return the freshly allocated, empty lists
 */
protected TIntArrayList[] createPointerForTerm(LexiconEntry le) {
    final int df = le.getDocumentFrequency();
    final TIntArrayList[] lists = new TIntArrayList[fieldCount + 4];
    // one slot per posting for the first fieldCount + 3 lists
    for (int i = 0; i <= fieldCount + 2; i++) {
        lists[i] = new TIntArrayList(df);
    }
    // the final list grows with the term's total frequency instead
    lists[fieldCount + 3] = new TIntArrayList(le.getFrequency());
    return lists;
}
// NOTE(review): fragment of a larger loop body — the enclosing method is not visible here.
// Tallies the postings this term contributes to the current iteration, then allocates
// its per-term accumulator lists and queues them for filling.
numberOfPointersThisIteration += le.getDocumentFrequency(); tmpStorageStorage.add(createPointerForTerm(le));
// NOTE(review): fragment — the loop body continues beyond this view.
// Pre-sizes each of the 2 + fieldCount lists to the term's document frequency.
final int tmpNT = le.getDocumentFrequency(); for (int i=0;i<2+fieldCount;i++)
/**
 * Adds this posting's statistics onto the given LexiconEntry:
 * accumulates document frequency and term frequency, and raises the
 * max-in-document frequency when this posting exceeds it.
 *
 * @param le the lexicon entry to update in place
 */
public void addToLexiconEntry(LexiconEntry le) {
    final int updatedDf = le.getDocumentFrequency() + termDf;
    final int updatedTf = le.getFrequency() + termTF;
    le.setStatistics(updatedDf, updatedTf);
    if (le.getMaxFrequencyInDocuments() < maxtf) {
        le.setMaxFrequencyInDocuments(maxtf);
    }
}
/** Iterating the lexicon must visit entries in the same order, with the
 * same key/termid/df/tf, as positional lookup via getLexiconEntry(int). */
@Test
public void test_iterator() throws Exception {
    final MemoryLexicon lexicon = new MemoryLexicon();
    assertNotNull(lexicon);
    for (int i = 0; i < 10; i++) {
        lexicon.term(terms[i].toString(), entries[i]);
    }
    final Iterator<Entry<String, LexiconEntry>> it = lexicon.iterator();
    assertNotNull(it);
    int position = 0;
    while (it.hasNext()) {
        final Entry<String, LexiconEntry> fromIterator = it.next();
        final Entry<String, LexiconEntry> fromLookup = lexicon.getLexiconEntry(position++);
        assertEquals(fromIterator.getKey(), fromLookup.getKey());
        assertEquals(fromIterator.getValue().getTermId(), fromLookup.getValue().getTermId());
        assertEquals(fromIterator.getValue().getDocumentFrequency(), fromLookup.getValue().getDocumentFrequency());
        assertEquals(fromIterator.getValue().getFrequency(), fromLookup.getValue().getFrequency());
    }
}
/** Positional lookup must return each indexed term with its expected
 * key, term id, document frequency and frequency. */
@Test
public void test_getLexiconEntry2() throws Exception {
    final MemoryLexicon lexicon = new MemoryLexicon();
    assertNotNull(lexicon);
    for (int i = 0; i < 10; i++) {
        lexicon.term(terms[i].toString(), entries[i]);
    }
    for (int i = 0; i < 10; i++) {
        final Entry<String, LexiconEntry> entry = lexicon.getLexiconEntry(i);
        assertEquals("t" + i, entry.getKey());
        assertEquals(i, entry.getValue().getTermId());
        assertEquals(i + 1, entry.getValue().getDocumentFrequency());
        assertEquals(i + 1, entry.getValue().getFrequency());
    }
}
/** getIthLexiconEntry must return each indexed term with its expected
 * key, term id, document frequency and frequency. */
@Test
public void test_getIthLexiconEntry() throws Exception {
    final MemoryLexicon lexicon = new MemoryLexicon();
    assertNotNull(lexicon);
    for (int i = 0; i < 10; i++) {
        lexicon.term(terms[i].toString(), entries[i]);
    }
    for (int i = 0; i < 10; i++) {
        final Entry<String, LexiconEntry> entry = lexicon.getIthLexiconEntry(i);
        assertEquals("t" + i, entry.getKey());
        assertEquals(i, entry.getValue().getTermId());
        assertEquals(i + 1, entry.getValue().getDocumentFrequency());
        assertEquals(i + 1, entry.getValue().getFrequency());
    }
}
/** Indexing the same batch of terms twice must not create new entries,
 * but must double every term's document frequency and frequency. */
@Test
public void test_incrementTerm2() throws Exception {
    final MemoryLexicon lexicon = new MemoryLexicon();
    assertNotNull(lexicon);
    for (int pass = 0; pass < 2; pass++) {
        for (int i = 0; i < 10; i++) {
            lexicon.term(terms[i].toString(), entries[i]);
        }
    }
    assertEquals(10, lexicon.numberOfEntries());
    for (int i = 0; i < 10; i++) {
        final Entry<String, LexiconEntry> entry = lexicon.getLexiconEntry(i);
        assertEquals("t" + i, entry.getKey());
        assertEquals(i, entry.getValue().getTermId());
        assertEquals(2 * (i + 1), entry.getValue().getDocumentFrequency());
        assertEquals(2 * (i + 1), entry.getValue().getFrequency());
    }
}
/** Lookup by term string must return each entry with its expected
 * term id, document frequency and frequency. */
@Test
public void test_getLexiconEntry1() throws Exception {
    final MemoryLexicon lexicon = new MemoryLexicon();
    assertNotNull(lexicon);
    for (int i = 0; i < 10; i++) {
        lexicon.term(terms[i].toString(), entries[i]);
    }
    for (int i = 0; i < 10; i++) {
        final LexiconEntry entry = lexicon.getLexiconEntry(terms[i].toString());
        assertEquals(i, entry.getTermId());
        assertEquals(i + 1, entry.getDocumentFrequency());
        assertEquals(i + 1, entry.getFrequency());
    }
}
/**
 * Asserts that a fixed set of probe terms exists in the index's
 * lexicon with the expected term frequency and document frequency.
 *
 * @param index the index whose lexicon is checked
 */
protected void checkFrequencies(Index index) {
    final String[] probeTerms = { "dramatis", "personae", "isnae" };
    final int[] expectedTf = { 6, 6, 1 };
    final int[] expectedDf = { 5, 5, 1 };
    for (int i = 0; i < probeTerms.length; i++) {
        final LexiconEntry le = index.getLexicon().getLexiconEntry(probeTerms[i]);
        assertNotNull(le);
        assertEquals(expectedTf[i], le.getFrequency());
        assertEquals(expectedDf[i], le.getDocumentFrequency());
    }
}
/**
 * Asserts that a term's lexicon statistics and full inverted-index
 * posting list match the expected values.
 *
 * @param index   index under test
 * @param term    term whose postings are checked
 * @param freq    expected total term frequency
 * @param docids  expected posting docids, in order
 * @param freqs   expected within-document frequencies, parallel to docids
 * @param docLens expected document lengths, parallel to docids
 */
@SuppressWarnings("unchecked")
public static void checkContents(Index index, String term, int freq, int[] docids, int[] freqs, int[] docLens) throws Exception {
    final Lexicon<String> lexicon = index.getLexicon();
    assertNotNull(lexicon);
    final LexiconEntry le = lexicon.getLexiconEntry(term);
    assertNotNull(le);
    assertEquals(freq, le.getFrequency());
    assertEquals(docids.length, le.getDocumentFrequency());
    final PostingIndex<Pointer> inverted = (PostingIndex<Pointer>) index.getInvertedIndex();
    assertNotNull(inverted);
    final IterablePosting posting = inverted.getPostings(le);
    assertNotNull(posting);
    int i = 0;
    for (final int expectedId : docids) {
        assertEquals(expectedId, posting.next());
        assertEquals(expectedId, posting.getId());
        assertEquals(freqs[i], posting.getFrequency());
        assertEquals(docLens[i], posting.getDocumentLength());
        i++;
    }
    // posting list must be exhausted exactly at the expected length
    assertEquals(IterablePosting.EOL, posting.next());
}
// NOTE(review): fragment of a larger comparison loop — enclosing method not visible.
// Every entry of the first lexicon must exist in lexicon2 with matching df and tf.
LexiconEntry kv2 = lexicon2.getLexiconEntry(kv1.getKey()); assertNotNull(kv2); assertEquals(kv1.getValue().getDocumentFrequency(), kv2.getDocumentFrequency()); assertEquals(kv1.getValue().getFrequency(), kv2.getFrequency());