/**
 * Returns {@link CollectionStatistics} for a field.
 *
 * This can be overridden for example, to return a field's statistics
 * across a distributed collection.
 * @lucene.experimental
 */
public CollectionStatistics collectionStatistics(String field) throws IOException {
  assert field != null;
  final Terms fieldTerms = MultiFields.getTerms(reader, field);
  final int docCount;
  final long sumTotalTermFreq;
  final long sumDocFreq;
  if (fieldTerms != null) {
    docCount = fieldTerms.getDocCount();
    sumTotalTermFreq = fieldTerms.getSumTotalTermFreq();
    sumDocFreq = fieldTerms.getSumDocFreq();
  } else {
    // No terms indexed for this field: report empty statistics.
    docCount = 0;
    sumTotalTermFreq = 0;
    sumDocFreq = 0;
  }
  return new CollectionStatistics(field, reader.maxDoc(), docCount, sumTotalTermFreq, sumDocFreq);
}
}
/**
 * Builds the per-term weight, pulling real index statistics only when
 * scoring is required; otherwise cheap placeholder stats are substituted.
 */
public TermWeight(IndexSearcher searcher, boolean needsScores, float boost, TermContext termStates)
    throws IOException {
  super(TermQuery.this);
  if (needsScores && termStates == null) {
    throw new IllegalStateException("termStates are required when scores are needed");
  }
  this.needsScores = needsScores;
  this.termStates = termStates;
  this.similarity = searcher.getSimilarity(needsScores);

  final CollectionStatistics collectionStats;
  final TermStatistics termStats;
  if (needsScores == false) {
    // we do not need the actual stats, use fake stats with docFreq=maxDoc and ttf=-1
    final int maxDoc = searcher.getIndexReader().maxDoc();
    collectionStats = new CollectionStatistics(term.field(), maxDoc, -1, -1, -1);
    termStats = new TermStatistics(term.bytes(), maxDoc, -1);
  } else {
    collectionStats = searcher.collectionStatistics(term.field());
    termStats = searcher.termStatistics(term, termStates);
  }
  this.stats = similarity.computeWeight(boost, collectionStats, termStats);
}
/**
 * Deserializes per-field {@link CollectionStatistics} from the stream,
 * allocating a map when the caller did not supply one.
 */
public static ObjectObjectHashMap<String, CollectionStatistics> readFieldStats(
    StreamInput in, ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics)
    throws IOException {
  final int count = in.readVInt();
  ObjectObjectHashMap<String, CollectionStatistics> result =
      fieldStatistics == null ? HppcMaps.newNoNullKeysMap(count) : fieldStatistics;
  for (int idx = 0; idx < count; idx++) {
    final String fieldName = in.readString();
    assert fieldName != null;
    final long maxDoc = in.readVLong();
    // subOne decodes the remaining values — presumably reversing a +1 applied
    // at write time so that -1 ("unknown") survives vLong encoding.
    final long docCount = subOne(in.readVLong());
    final long sumTotalTermFreq = subOne(in.readVLong());
    final long sumDocFreq = subOne(in.readVLong());
    result.put(fieldName,
        new CollectionStatistics(fieldName, maxDoc, docCount, sumTotalTermFreq, sumDocFreq));
  }
  return result;
}
private static void validateScoresArePositive(Version indexCreatedVersion, Similarity similarity) throws IOException { CollectionStatistics collectionStats = new CollectionStatistics("some_field", 1200, 1100, 3000, 2000); TermStatistics termStats = new TermStatistics(new BytesRef("some_value"), 100, 130); SimWeight simWeight = similarity.computeWeight(2f, collectionStats, termStats); FieldInvertState state = new FieldInvertState(indexCreatedVersion.luceneVersion.major, "some_field", 20, 20, 0, 50); // length = 20, no overlap final long norm = similarity.computeNorm(state); LeafReader reader = new SingleNormLeafReader(norm); SimScorer scorer = similarity.simScorer(simWeight, reader.getContext()); for (int freq = 1; freq <= 10; ++freq) { float score = scorer.score(0, freq); if (score < 0) { DEPRECATION_LOGGER.deprecated("Similarities should not return negative scores:\n" + scorer.explain(0, Explanation.match(freq, "term freq"))); break; } } }
private static void validateScoresDoNotDecreaseWithFreq(Version indexCreatedVersion, Similarity similarity) throws IOException { CollectionStatistics collectionStats = new CollectionStatistics("some_field", 1200, 1100, 3000, 2000); TermStatistics termStats = new TermStatistics(new BytesRef("some_value"), 100, 130); SimWeight simWeight = similarity.computeWeight(2f, collectionStats, termStats); FieldInvertState state = new FieldInvertState(indexCreatedVersion.luceneVersion.major, "some_field", 20, 20, 0, 50); // length = 20, no overlap final long norm = similarity.computeNorm(state); LeafReader reader = new SingleNormLeafReader(norm); SimScorer scorer = similarity.simScorer(simWeight, reader.getContext()); float previousScore = Float.NEGATIVE_INFINITY; for (int freq = 1; freq <= 10; ++freq) { float score = scorer.score(0, freq); if (score < previousScore) { DEPRECATION_LOGGER.deprecated("Similarity scores should not decrease when term frequency increases:\n" + scorer.explain(0, Explanation.match(freq - 1, "term freq")) + "\n" + scorer.explain(0, Explanation.match(freq, "term freq"))); break; } previousScore = score; } }
/** Logs a deprecation warning if the score grows while the norm (field length) grows. */
private static void validateScoresDoNotIncreaseWithNorm(Version indexCreatedVersion, Similarity similarity) throws IOException {
    CollectionStatistics collectionStats = new CollectionStatistics("some_field", 1200, 1100, 3000, 2000);
    TermStatistics termStats = new TermStatistics(new BytesRef("some_value"), 100, 130);
    SimWeight simWeight = similarity.computeWeight(2f, collectionStats, termStats);
    SimScorer previousScorer = null;
    long previousNorm = 0;
    // +inf sentinel: the first iteration's comparison is always false, so
    // previousScorer is never dereferenced while still null.
    float previousScore = Float.POSITIVE_INFINITY;
    for (int length = 1; length <= 10; ++length) {
        // Field length grows each iteration, no position overlap.
        FieldInvertState state = new FieldInvertState(indexCreatedVersion.luceneVersion.major, "some_field", length, length, 0, 50);
        final long norm = similarity.computeNorm(state);
        // Norms are compared as unsigned longs; a non-monotonic encoding
        // would make the comparison meaningless.
        if (Long.compareUnsigned(previousNorm, norm) > 0) {
            // esoteric similarity, skip this check
            break;
        }
        LeafReader reader = new SingleNormLeafReader(norm);
        SimScorer scorer = similarity.simScorer(simWeight, reader.getContext());
        float score = scorer.score(0, 1);
        if (score > previousScore) {
            DEPRECATION_LOGGER.deprecated("Similarity scores should not increase when norm increases:\n" + previousScorer.explain(0, Explanation.match(1, "term freq")) + "\n" + scorer.explain(0, Explanation.match(1, "term freq")));
            break;
        }
        previousScorer = scorer;
        previousScore = score;
        previousNorm = norm;
    }
}
CollectionStatistics existing = fieldStatistics.get(key); if (existing != null) { CollectionStatistics merged = new CollectionStatistics( key, existing.maxDoc() + value.maxDoc(), optionalSum(existing.docCount(), value.docCount()),
@Override public FunctionValues getValues(Map context, LeafReaderContext readerContext) throws IOException { IndexSearcher searcher = (IndexSearcher)context.get("searcher"); final TFIDFSimilarity similarity = IDFValueSource.asTFIDF(searcher.getSimilarity(true), field); if (similarity == null) { throw new UnsupportedOperationException("requires a TFIDFSimilarity (such as ClassicSimilarity)"); } // Only works if the contribution of the tf is 1 when the freq is 1 and contribution of the idf // is 1 when docCount == docFreq == 1 final SimWeight simWeight = similarity.computeWeight(1f, new CollectionStatistics(field, 1, 1, 1, 1), new TermStatistics(new BytesRef("bogus"), 1, 1)); final SimScorer simScorer = similarity.simScorer(simWeight, readerContext); return new FloatDocValues(this) { int lastDocID = -1; @Override public float floatVal(int docID) throws IOException { if (docID < lastDocID) { throw new AssertionError("docs out of order: lastDocID=" + lastDocID + " docID=" + docID); } lastDocID = docID; return simScorer.score(docID, 1f); } }; }
/**
 * Returns {@link CollectionStatistics} for a field.
 *
 * This can be overridden for example, to return a field's statistics
 * across a distributed collection.
 * @lucene.experimental
 */
public CollectionStatistics collectionStatistics(String field) throws IOException {
  assert field != null;
  final Terms t = MultiFields.getTerms(reader, field);
  // Absent field => all-zero statistics.
  final int docCount = t == null ? 0 : t.getDocCount();
  final long sumTotalTermFreq = t == null ? 0 : t.getSumTotalTermFreq();
  final long sumDocFreq = t == null ? 0 : t.getSumDocFreq();
  return new CollectionStatistics(field, reader.maxDoc(), docCount, sumTotalTermFreq, sumDocFreq);
}
}
/**
 * Returns {@link CollectionStatistics} for a field.
 *
 * This can be overridden for example, to return a field's statistics
 * across a distributed collection.
 * @lucene.experimental
 */
public CollectionStatistics collectionStatistics(String field) throws IOException {
  assert field != null;
  final Terms fieldTerms = MultiFields.getTerms(reader, field);
  final int docCount;
  final long sumTotalTermFreq;
  final long sumDocFreq;
  if (fieldTerms == null) {
    // Field has no indexed terms: report empty statistics.
    docCount = 0;
    sumTotalTermFreq = 0;
    sumDocFreq = 0;
  } else {
    docCount = fieldTerms.getDocCount();
    sumTotalTermFreq = fieldTerms.getSumTotalTermFreq();
    sumDocFreq = fieldTerms.getSumDocFreq();
  }
  return new CollectionStatistics(
      field, reader.maxDoc(), docCount, sumTotalTermFreq, sumDocFreq);
}
}
/**
 * Constructs the weight for one term; real statistics are fetched only
 * when scores are needed, otherwise placeholders are used.
 */
public TermWeight(IndexSearcher searcher, boolean needsScores, float boost, TermContext termStates)
    throws IOException {
  super(TermQuery.this);
  if (needsScores && termStates == null) {
    throw new IllegalStateException("termStates are required when scores are needed");
  }
  this.needsScores = needsScores;
  this.termStates = termStates;
  this.similarity = searcher.getSimilarity(needsScores);

  final CollectionStatistics collectionStats;
  final TermStatistics termStats;
  if (needsScores) {
    collectionStats = searcher.collectionStatistics(term.field());
    termStats = searcher.termStatistics(term, termStates);
  } else {
    // we do not need the actual stats, use fake stats with docFreq=maxDoc and ttf=-1
    final int numDocs = searcher.getIndexReader().maxDoc();
    collectionStats = new CollectionStatistics(term.field(), numDocs, -1, -1, -1);
    termStats = new TermStatistics(term.bytes(), numDocs, -1);
  }
  this.stats = similarity.computeWeight(boost, collectionStats, termStats);
}
public TermWeight(final IndexSearcher searcher, final boolean needsScores, final float boost, final TermContext termStates) throws IOException { super(DependentTermQuery.this); if (needsScores && termStates == null) { throw new IllegalStateException("termStates are required when scores are needed"); } final Term term = getTerm(); this.needsScores = needsScores; this.termStates = termStates; this.similarity = searcher.getSimilarity(needsScores); final int maxDoc = searcher.getIndexReader().maxDoc(); final CollectionStatistics collectionStats = new CollectionStatistics(term.field(), maxDoc, -1, -1, -1); final TermStatistics termStats; if (needsScores) { termStats = searcher.termStatistics(term, termStates); } else { // we do not need the actual stats, use fake stats with docFreq=maxDoc and ttf=-1 termStats = new TermStatistics(term.bytes(), maxDoc, -1); } fieldBoostFactor = fieldBoost.getBoost(getTerm().field(), searcher.getIndexReader()); this.stats = similarity.computeWeight(boost * fieldBoostFactor, collectionStats, termStats); }
/**
 * Reads serialized per-field {@link CollectionStatistics}; lazily creates
 * the destination map if the caller passed {@code null}.
 */
public static ObjectObjectHashMap<String, CollectionStatistics> readFieldStats(
    StreamInput in, ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics)
    throws IOException {
  final int numFieldStatistics = in.readVInt();
  ObjectObjectHashMap<String, CollectionStatistics> target = fieldStatistics;
  if (target == null) {
    target = HppcMaps.newNoNullKeysMap(numFieldStatistics);
  }
  int remaining = numFieldStatistics;
  while (remaining-- > 0) {
    final String fieldName = in.readString();
    assert fieldName != null;
    final long maxDoc = in.readVLong();
    // subOne decodes each value — presumably undoing a +1 applied when
    // writing so that -1 ("unknown") round-trips through a vLong.
    final long docCount = subOne(in.readVLong());
    final long sumTotalTermFreq = subOne(in.readVLong());
    final long sumDocFreq = subOne(in.readVLong());
    target.put(fieldName,
        new CollectionStatistics(fieldName, maxDoc, docCount, sumTotalTermFreq, sumDocFreq));
  }
  return target;
}
/**
 * Deserializes field statistics from the stream into the given map,
 * allocating one when none is provided.
 */
public static ObjectObjectHashMap<String, CollectionStatistics> readFieldStats(
    StreamInput in, ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics)
    throws IOException {
  final int entries = in.readVInt();
  final ObjectObjectHashMap<String, CollectionStatistics> map =
      fieldStatistics != null ? fieldStatistics : HppcMaps.newNoNullKeysMap(entries);
  for (int n = 0; n < entries; n++) {
    final String name = in.readString();
    assert name != null;
    final long maxDoc = in.readVLong();
    // Remaining values pass through subOne — NOTE(review): presumably the
    // writer added one so -1 fits in a vLong; verify against the writer.
    final long docCount = subOne(in.readVLong());
    final long sumTotalTermFreq = subOne(in.readVLong());
    final long sumDocFreq = subOne(in.readVLong());
    map.put(name, new CollectionStatistics(name, maxDoc, docCount, sumTotalTermFreq, sumDocFreq));
  }
  return map;
}
/**
 * Reads {@code numFieldStatistics} field-statistics entries from the
 * stream, creating the result map when the caller supplies {@code null}.
 */
public static ObjectObjectHashMap<String, CollectionStatistics> readFieldStats(
    StreamInput in, ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics)
    throws IOException {
  final int total = in.readVInt();
  ObjectObjectHashMap<String, CollectionStatistics> out = fieldStatistics;
  if (out == null) {
    out = HppcMaps.newNoNullKeysMap(total);
  }
  for (int i = 0; i < total; ++i) {
    final String field = in.readString();
    assert field != null;
    final long maxDoc = in.readVLong();
    // subOne decodes the serialized values (presumably written with +1 so
    // that -1 survives vLong encoding).
    final long docCount = subOne(in.readVLong());
    final long sumTotalTermFreq = subOne(in.readVLong());
    final long sumDocFreq = subOne(in.readVLong());
    final CollectionStatistics stats =
        new CollectionStatistics(field, maxDoc, docCount, sumTotalTermFreq, sumDocFreq);
    out.put(field, stats);
  }
  return out;
}
/**
 * Decodes field-level collection statistics from the wire format into a
 * map, building the map on demand if needed.
 */
public static ObjectObjectHashMap<String, CollectionStatistics> readFieldStats(
    StreamInput in, ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics)
    throws IOException {
  final int statCount = in.readVInt();
  ObjectObjectHashMap<String, CollectionStatistics> dest =
      fieldStatistics == null ? HppcMaps.newNoNullKeysMap(statCount) : fieldStatistics;
  for (int i = 0; i < statCount; i++) {
    final String fieldName = in.readString();
    assert fieldName != null;
    // Read order matters: maxDoc, docCount, sumTotalTermFreq, sumDocFreq.
    final long maxDoc = in.readVLong();
    // subOne presumably reverses an addOne at write time (vLongs cannot encode -1).
    final long docCount = subOne(in.readVLong());
    final long sumTotalTermFreq = subOne(in.readVLong());
    final long sumDocFreq = subOne(in.readVLong());
    dest.put(fieldName,
        new CollectionStatistics(fieldName, maxDoc, docCount, sumTotalTermFreq, sumDocFreq));
  }
  return dest;
}
private static void validateScoresArePositive(Version indexCreatedVersion, Similarity similarity) throws IOException { CollectionStatistics collectionStats = new CollectionStatistics("some_field", 1200, 1100, 3000, 2000); TermStatistics termStats = new TermStatistics(new BytesRef("some_value"), 100, 130); SimWeight simWeight = similarity.computeWeight(2f, collectionStats, termStats); FieldInvertState state = new FieldInvertState(indexCreatedVersion.luceneVersion.major, "some_field", 20, 20, 0, 50); // length = 20, no overlap final long norm = similarity.computeNorm(state); LeafReader reader = new SingleNormLeafReader(norm); SimScorer scorer = similarity.simScorer(simWeight, reader.getContext()); for (int freq = 1; freq <= 10; ++freq) { float score = scorer.score(0, freq); if (score < 0) { DEPRECATION_LOGGER.deprecated("Similarities should not return negative scores:\n" + scorer.explain(0, Explanation.match(freq, "term freq"))); break; } } }
public TermWeight(IndexSearcher searcher, boolean needsScores, TermContext termStates) throws IOException { super(TermQuery.this); this.needsScores = needsScores; assert termStates != null : "TermContext must not be null"; // checked with a real exception in TermQuery constructor assert termStates.hasOnlyRealTerms(); this.termStates = termStates; this.similarity = searcher.getSimilarity(needsScores); final CollectionStatistics collectionStats; final TermStatistics termStats; if (needsScores) { collectionStats = searcher.collectionStatistics(term.field()); termStats = searcher.termStatistics(term, termStates); } else { // do not bother computing actual stats, scores are not needed final int maxDoc = searcher.getIndexReader().maxDoc(); final int docFreq = termStates.docFreq(); final long totalTermFreq = termStates.totalTermFreq(); collectionStats = new CollectionStatistics(term.field(), maxDoc, -1, -1, -1); termStats = new TermStatistics(term.bytes(), docFreq, totalTermFreq); } this.stats = similarity.computeWeight(collectionStats, termStats); }
public TermWeight(IndexSearcher searcher, boolean needsScores, TermContext termStates) throws IOException { super(TermQuery.this); this.needsScores = needsScores; assert termStates != null : "TermContext must not be null"; // checked with a real exception in TermQuery constructor assert termStates.hasOnlyRealTerms(); this.termStates = termStates; this.similarity = searcher.getSimilarity(needsScores); final CollectionStatistics collectionStats; final TermStatistics termStats; if (needsScores) { collectionStats = searcher.collectionStatistics(term.field()); termStats = searcher.termStatistics(term, termStates); } else { // do not bother computing actual stats, scores are not needed final int maxDoc = searcher.getIndexReader().maxDoc(); final int docFreq = termStates.docFreq(); final long totalTermFreq = termStates.totalTermFreq(); collectionStats = new CollectionStatistics(term.field(), maxDoc, -1, -1, -1); termStats = new TermStatistics(term.bytes(), docFreq, totalTermFreq); } this.stats = similarity.computeWeight(collectionStats, termStats); }
private static void validateScoresDoNotDecreaseWithFreq(Version indexCreatedVersion, Similarity similarity) throws IOException { CollectionStatistics collectionStats = new CollectionStatistics("some_field", 1200, 1100, 3000, 2000); TermStatistics termStats = new TermStatistics(new BytesRef("some_value"), 100, 130); SimWeight simWeight = similarity.computeWeight(2f, collectionStats, termStats); FieldInvertState state = new FieldInvertState(indexCreatedVersion.luceneVersion.major, "some_field", 20, 20, 0, 50); // length = 20, no overlap final long norm = similarity.computeNorm(state); LeafReader reader = new SingleNormLeafReader(norm); SimScorer scorer = similarity.simScorer(simWeight, reader.getContext()); float previousScore = Float.NEGATIVE_INFINITY; for (int freq = 1; freq <= 10; ++freq) { float score = scorer.score(0, freq); if (score < previousScore) { DEPRECATION_LOGGER.deprecated("Similarity scores should not decrease when term frequency increases:\n" + scorer.explain(0, Explanation.match(freq - 1, "term freq")) + "\n" + scorer.explain(0, Explanation.match(freq, "term freq"))); break; } previousScore = score; } }