/** Add statistics for this posting onto the given LexiconEntry */
public void addToLexiconEntry(LexiconEntry le) {
    le.setStatistics(le.getDocumentFrequency() + termDf, le.getFrequency() + termTF);
    if (maxtf > le.getMaxFrequencyInDocuments())
        le.setMaxFrequencyInDocuments(maxtf);
}
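/*
 * For illustration only: a minimal, self-contained sketch of the same
 * statistics-folding pattern. SimpleEntry and its fields are hypothetical
 * stand-ins, not the real LexiconEntry API.
 */
class SimpleEntry {
    int df, tf, maxTf;

    void addPostingStats(int termDf, int termTf, int postingMaxTf) {
        df += termDf;               // document frequency accumulates
        tf += termTf;               // term frequency accumulates
        if (postingMaxTf > maxTf)
            maxTf = postingMaxTf;   // maximum within-document frequency is a running maximum
    }

    public static void main(String[] args) {
        SimpleEntry e = new SimpleEntry();
        e.addPostingStats(1, 3, 3);
        e.addPostingStats(1, 5, 5);
        System.out.println(e.df + " " + e.tf + " " + e.maxTf); // prints: 2 8 5
    }
}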
/** Insert or merge a term: if the term is already present, merge the given
 *  statistics into the existing entry and return its term id; otherwise store
 *  a new entry under the proposed term id. */
public int term(String term, EntryStatistics es, int termid) {
    synchronized (modificationLock) {
        LexiconEntry le = this.getLexiconEntry(term);
        if (le != null) {
            le.add(es);
            return le.getTermId();
        }
        Text key = keyFactory.newInstance();
        key.set(term);
        ((LexiconEntry) es).setTermId(termid);
        super.map.put(key, (LexiconEntry) es);
        return termid;
    }
}
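/*
 * Illustrative sketch of the insert-or-merge pattern used by term(...) above,
 * guarded by a single lock. TinyLexicon and its fields are hypothetical
 * stand-ins, not the actual lexicon classes.
 */
class TinyLexicon {
    private final Object modificationLock = new Object();
    private final java.util.Map<String, int[]> map = new java.util.HashMap<>(); // term -> {termid, tf}

    /** Returns the term id actually assigned to the term. */
    int term(String term, int tf, int proposedTermId) {
        synchronized (modificationLock) {
            int[] existing = map.get(term);
            if (existing != null) {      // term already known: merge statistics,
                existing[1] += tf;       // keep the original term id
                return existing[0];
            }
            map.put(term, new int[]{proposedTermId, tf});
            return proposedTermId;       // new term: the proposed id is accepted
        }
    }
}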
/** {@inheritDoc} */
public void count(LexiconEntry value) {
    numberOfTokens += value.getFrequency();
    numberOfPointers += value.getDocumentFrequency();
    numberOfTerms++;
}
/** {@inheritDoc} */
public String toString() {
    return "(" + getDocumentFrequency() + "," + getFrequency() + ")" + pointerToString();
}
@Test
public void test_iterator() throws Exception {
    MemoryLexicon lexicon = new MemoryLexicon();
    assertNotNull(lexicon);
    for (int i = 0; i < 10; i++)
        lexicon.term(terms[i].toString(), entries[i]);
    Iterator<Entry<String, LexiconEntry>> it = lexicon.iterator();
    assertNotNull(it);
    int i = 0;
    while (it.hasNext()) {
        Entry<String, LexiconEntry> kv1 = it.next();
        Entry<String, LexiconEntry> kv2 = lexicon.getLexiconEntry(i++);
        assertEquals(kv1.getKey(), kv2.getKey());
        assertEquals(kv1.getValue().getTermId(), kv2.getValue().getTermId());
        assertEquals(kv1.getValue().getDocumentFrequency(), kv2.getValue().getDocumentFrequency());
        assertEquals(kv1.getValue().getFrequency(), kv2.getValue().getFrequency());
    }
}
// write the entry from the first lexicon, assigning a fresh term id unless
// the existing term ids are being preserved
lee1.getValue().setPointer(newPointer);
numberOfPointers += newPointer.getNumberOfEntries();
if (! keepTermCodeMap)
    lee1.getValue().setTermId(newCodes++);
lexOutStream.writeNextEntry(term1, lee1.getValue());
hasMore1 = lexInStream1.hasNext();

// write the entry from the second lexicon, recording how its term id was remapped
lee2.getValue().setPointer(newPointer);
numberOfPointers += newPointer.getNumberOfEntries();
termcodeHashmap.put(lee2.getValue().getTermId(), newCode);
lee2.getValue().setTermId(newCode);
lexOutStream.writeNextEntry(term2, lee2.getValue());
hasMore2 = lexInStream2.hasNext();

// term present in both lexicons: merge the second entry's statistics into the
// first and write the combined entry
lee1.getValue().setPointer(newPointer1);
if (keepTermCodeMap)
    termcodeHashmap.put(lee2.getValue().getTermId(), lee1.getValue().getTermId());
else
    lee1.getValue().setTermId(newCodes++);
lee1.getValue().add(lee2.getValue());
lexOutStream.writeNextEntry(term1, lee1.getValue());

// write the current entry from the first lexicon stream under its own key
lee1.getValue().setPointer(newPointer);
if (! keepTermCodeMap)
    lee1.getValue().setTermId(newCodes++);
numberOfPointers += newPointer.getNumberOfEntries();
lexOutStream.writeNextEntry(lee1.getKey(), lee1.getValue());
numberOfPointersThisIteration += le.getDocumentFrequency();
tmpStorageStorage.add(createPointerForTerm(le));
codesHashMap.put(le.getTermId(), j + 1);
// a maxtf of Integer.MAX_VALUE means the value has not yet been set; fill it
// in from the "maxtf" index structure if that structure is available
if (t.getMaxFrequencyInDocuments() == Integer.MAX_VALUE && index.hasIndexStructure("maxtf"))
    if (maxTFStructure != null)
        t.setMaxFrequencyInDocuments(maxTFStructure.get(t.getTermId()));
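/*
 * Sketch of the sentinel pattern used above, assuming Integer.MAX_VALUE marks
 * "maxtf not yet known". MaxTfResolver and its parameters are hypothetical;
 * the real lookup goes through the index's "maxtf" structure.
 */
class MaxTfResolver {
    static int resolve(int storedMaxTf, int termId, java.util.List<Integer> maxTfStructure) {
        if (storedMaxTf != Integer.MAX_VALUE)
            return storedMaxTf;                // already known, no lookup needed
        if (maxTfStructure != null)
            return maxTfStructure.get(termId); // lazily fetch by term id
        return Integer.MAX_VALUE;              // still unknown
    }
}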
le = lexicon.getLexiconEntry("cats"); assertNotNull(le); assertEquals(2, le.getFrequency()); assertEquals(2, le.getNumberOfEntries()); assertEquals(1, le.getMaxFrequencyInDocuments()); if (fieldsExpected) assertEquals("cats", lexicon.getLexiconEntry(le.getTermId()).getKey()); assertEquals(3, le.getFrequency()); assertEquals(3, le.getMaxFrequencyInDocuments()); assertEquals(1, le.getNumberOfEntries()); if (fieldsExpected) assertEquals("chicken", lexicon.getLexiconEntry(le.getTermId()).getKey()); assertEquals(1, le.getFrequency()); assertEquals(1, le.getNumberOfEntries()); if (fieldsExpected) assertEquals(t, lexicon.getLexiconEntry(le.getTermId()).getKey());
/** {@inheritDoc} */
public int getFrequency() {
    int freq = 0;
    for (LexiconEntry le : children)
        if (le != null)
            freq += le.getFrequency();
    return freq;
}
int lexicographicalCompare = term1.compareTo(term2);
if (lexicographicalCompare < 0) {
    // term1 sorts first: write the entry from the first lexicon
    lee1.getValue().setTermId(termId);
    lee1.getValue().setPointer(p);
    lexOutStream.writeNextEntry(term1, lee1.getValue());
    termId++;
    if (hasMore1 = lexInStream1.hasNext())
        lee1 = lexInStream1.next();
} else if (lexicographicalCompare > 0) {
    // term2 sorts first: write the entry from the second lexicon
    lee2.getValue().setTermId(termId);
    lee2.getValue().setPointer(p);
    lexOutStream.writeNextEntry(term2, lee2.getValue());
    termId++;
    if (hasMore2 = lexInStream2.hasNext())
        lee2 = lexInStream2.next();
} else {
    // same term in both lexicons: merge the statistics and write one entry
    lee1.getValue().setTermId(termId);
    lee1.getValue().setPointer(p);
    lee1.getValue().add(lee2.getValue());
    lexOutStream.writeNextEntry(term1, lee1.getValue());
    termId++;
    if (hasMore1 = lexInStream1.hasNext())
        lee1 = lexInStream1.next();
    if (hasMore2 = lexInStream2.hasNext())
        lee2 = lexInStream2.next();
}

// drain any entries remaining in the first lexicon stream
lee1.getValue().setTermId(termId);
lee1.getValue().setPointer(p);
lexOutStream.writeNextEntry(lee1.getKey(), lee1.getValue());
if (hasMore1 = lexInStream1.hasNext())
    lee1 = lexInStream1.next();

// drain any entries remaining in the second lexicon stream
lee2.getValue().setTermId(termId);
lee2.getValue().setPointer(p);
lexOutStream.writeNextEntry(lee2.getKey(), lee2.getValue());
if (hasMore2 = lexInStream2.hasNext())
    lee2 = lexInStream2.next();
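/*
 * Illustrative two-way merge of sorted (term, frequency) streams, mirroring the
 * compareTo-driven branches above. All names here are hypothetical stand-ins;
 * the real merger also rewrites pointers and term ids as it goes.
 */
class TwoWayMerge {
    static java.util.TreeMap<String, Integer> merge(java.util.Iterator<java.util.Map.Entry<String, Integer>> a,
                                                    java.util.Iterator<java.util.Map.Entry<String, Integer>> b) {
        java.util.TreeMap<String, Integer> out = new java.util.TreeMap<>();
        java.util.Map.Entry<String, Integer> ea = a.hasNext() ? a.next() : null;
        java.util.Map.Entry<String, Integer> eb = b.hasNext() ? b.next() : null;
        while (ea != null || eb != null) {
            int cmp = ea == null ? 1 : eb == null ? -1 : ea.getKey().compareTo(eb.getKey());
            if (cmp < 0) {                       // term only in stream a
                out.put(ea.getKey(), ea.getValue());
                ea = a.hasNext() ? a.next() : null;
            } else if (cmp > 0) {                // term only in stream b
                out.put(eb.getKey(), eb.getValue());
                eb = b.hasNext() ? b.next() : null;
            } else {                             // same term in both: merge statistics
                out.put(ea.getKey(), ea.getValue() + eb.getValue());
                ea = a.hasNext() ? a.next() : null;
                eb = b.hasNext() ? b.next() : null;
            }
        }
        return out;
    }
}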
// merge the i-th lexicon stream's entry for the current target term
if (nextEntryToWrite == null)
    nextEntryToWrite = newLexiconEntry(targetTermId = currentEntries[i].getValue().getTermId());
else if (targetTermId != currentEntries[i].getValue().getTermId())
    logger.error("Term " + targetTerm + " had two termids ("
        + targetTermId + "," + currentEntries[i].getValue().getTermId() + ")");
nextEntryToWrite.add(currentEntries[i].getValue());
hasMore[i] = lis[i].hasNext();
/** {@inheritDoc} */
public int getDocumentFrequency() {
    int docFreq = 0;
    for (LexiconEntry le : children)
        if (le != null)
            docFreq += le.getDocumentFrequency();
    return docFreq;
}
// record the term frequency of the next lexicon entry
TF[i] = le.next().getValue().getFrequency();
i++;

// assign the renumbered term id and write the entry to the output lexicon
lee.getValue().setTermId(newTermId[i]);
leOut.writeNextEntry(lee.getKey(), lee.getValue());
i++;
/** Create a new LexiconEntry carrying the given term id. */
protected LexiconEntry newLexiconEntry(int termid) {
    LexiconEntry rtr = valueFactory.newInstance();
    rtr.setTermId(termid);
    return rtr;
}
lee.getValue().setPointer(bitPointer);
@Override
public int getMaxFrequencyInDocuments() {
    // Integer.MAX_VALUE means maxtf has not been set explicitly; otherwise
    // take the maximum over the child entries
    if (this.maxtf != Integer.MAX_VALUE)
        return this.maxtf;
    int max = Integer.MIN_VALUE;
    for (LexiconEntry c : children)
        if (c != null && c.getMaxFrequencyInDocuments() > max)
            max = c.getMaxFrequencyInDocuments();
    return max;
}
/** {@inheritDoc} */
public int getNumberOfEntries() {
    int entries = 0;
    for (LexiconEntry le : children)
        if (le != null)
            entries += le.getNumberOfEntries();
    return entries;
}
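/*
 * Illustrative aggregation over per-child entries, as in the getters above.
 * ChildStats is a hypothetical stand-in for the child LexiconEntry type: sums
 * skip absent children, and the maximum is taken across those present.
 */
class ChildStats {
    int tf, df, entries, maxTf;
    ChildStats(int tf, int df, int entries, int maxTf) {
        this.tf = tf; this.df = df; this.entries = entries; this.maxTf = maxTf;
    }
}

class AggregateStats {
    static int totalFrequency(ChildStats[] children) {
        int freq = 0;
        for (ChildStats c : children)
            if (c != null)
                freq += c.tf;       // sums skip children with no entry for the term
        return freq;
    }

    static int maxFrequency(ChildStats[] children) {
        int max = Integer.MIN_VALUE;
        for (ChildStats c : children)
            if (c != null && c.maxTf > max)
                max = c.maxTf;      // running maximum across the children present
        return max;
    }
}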