/**
 * Sets this builder's query to a MoreLikeThis query for the given Lucene
 * document, searching over the text field elements configured in the
 * searcher's options.
 *
 * @param luceneId Lucene document id to use as the example document
 * @return whatever the delegated two-argument overload returns
 * @throws DaoException propagated from the delegated overload
 */
public QueryBuilder setMoreLikeThisQuery(int luceneId) throws DaoException {
    return setMoreLikeThisQuery(searcher.getOptions().elements, luceneId);
}
/**
 * Runs a Lucene query in one language with an explicit hit count and filter.
 * Delegates to the five-argument overload with its final flag set to
 * {@code true}.
 *
 * @param query    the Lucene query to run
 * @param language the language whose index is searched
 * @param hitCount the maximum number of hits requested
 * @param filter   the filter handed to the underlying search
 * @return the scored documents produced by the delegated overload
 */
public WikiBrainScoreDoc[] search(Query query, Language language, int hitCount, Filter filter) {
    final boolean resolveWpIds = true;
    return search(query, language, hitCount, filter, resolveWpIds);
}
/**
 * Resolves a phrase to local pages by running a phrase query and
 * normalising the hit scores so that they sum to one.
 *
 * If the phrase query finds nothing and the phrase contains no space, a
 * fallback query is built from prefixes and suffixes of the word (each
 * longer than two characters) and run instead.
 *
 * NOTE(review): {@code maxPages} is not consulted; both queries request a
 * fixed 10 hits. Confirm whether callers expect it to cap the result.
 *
 * @param language the language to search in
 * @param phrase   the phrase to resolve
 * @param maxPages currently unused (see note above)
 * @return local ids mapped to their share of the total score, in hit order
 * @throws DaoException declared by the overridden method
 */
@Override
public LinkedHashMap<LocalId, Float> resolve(Language language, String phrase, int maxPages) throws DaoException {
    LinkedHashMap<LocalId, Float> result = new LinkedHashMap<LocalId, Float>();

    WikiBrainScoreDoc[] wikibrainScoreDocs = searcher.getQueryBuilderByLanguage(language)
            .setPhraseQuery(phrase)
            .setNumHits(10)
            .search();

    if (wikibrainScoreDocs.length == 0 && phrase.indexOf(" ") < 0) {
        // Split the single word at every position and keep each side that
        // is longer than two characters, separated by spaces.
        StringBuilder phraseMultiVersion = new StringBuilder();
        for (int i = 1; i < phrase.length(); i++) {
            if (i > 2) {
                phraseMultiVersion.append(phrase, 0, i).append(' ');
            }
            if (phrase.length() - i > 2) {
                phraseMultiVersion.append(phrase, i, phrase.length()).append(' ');
            }
        }
        // Words of three characters or fewer yield no fragments; skip the
        // fallback rather than submit an empty query string.
        if (phraseMultiVersion.length() > 0) {
            wikibrainScoreDocs = searcher.getQueryBuilderByLanguage(language)
                    .setPhraseQuery(phraseMultiVersion.toString())
                    .setNumHits(10)
                    .search();
        }
    }

    float totalScore = 0;
    for (WikiBrainScoreDoc wikibrainScoreDoc : wikibrainScoreDocs) {
        totalScore += wikibrainScoreDoc.score;
    }

    for (WikiBrainScoreDoc wikibrainScoreDoc : wikibrainScoreDocs) {
        int localPageId = searcher.getLocalIdFromDocId(wikibrainScoreDoc.luceneId, language);
        LocalId localId = new LocalId(language, localPageId);
        result.put(localId, wikibrainScoreDoc.score / totalScore);
    }
    return result;
}
/**
 * Runs a Lucene query against the index for one language and converts the
 * hits into {@code WikiBrainScoreDoc}s.
 *
 * Each returned document always carries the Lucene doc id and score. Its
 * Wikipedia id is looked up only when {@code resolveWpIds} is true and is
 * -1 otherwise. The requested hit count is also stored in this object's
 * {@code hitCount} field.
 *
 * @param query        the Lucene query to run
 * @param language     the language whose index is searched
 * @param hitCount     the maximum number of hits requested
 * @param filter       the filter handed to the underlying search
 * @param resolveWpIds whether to look up the Wikipedia id for each hit
 * @return one scored document per hit, in the order Lucene returned them
 * @throws IllegalArgumentException if no searcher is registered for the language
 * @throws RuntimeException wrapping any IOException raised while searching
 */
public WikiBrainScoreDoc[] search(Query query, Language language, int hitCount, Filter filter, boolean resolveWpIds) {
    if (!searchers.containsKey(language)) {
        throw new IllegalArgumentException("Unknown language: " + language);
    }
    try {
        this.hitCount = hitCount;
        ScoreDoc[] hits = searchers.get(language).search(query, filter, hitCount).scoreDocs;
        WikiBrainScoreDoc[] converted = new WikiBrainScoreDoc[hits.length];
        int next = 0;
        for (ScoreDoc hit : hits) {
            int wpId = -1;
            if (resolveWpIds) {
                wpId = getLocalIdFromDocId(hit.doc, language);
            }
            converted[next++] = new WikiBrainScoreDoc(hit.doc, wpId, hit.score);
        }
        return converted;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
/**
 * Builds a {@code LuceneSearcher} from the configurator's
 * {@code LanguageSet} and the {@code LuceneOptions} named by the
 * "options" entry of {@code config}.
 *
 * @param name          not read by this method
 * @param config        configuration block; its "options" string is read
 * @param runtimeParams not read by this method
 * @return a newly constructed searcher
 * @throws ConfigurationException propagated from the configurator lookups
 */
@Override
public LuceneSearcher get(String name, Config config, Map<String, String> runtimeParams) throws ConfigurationException {
    LanguageSet languages = getConfigurator().get(LanguageSet.class);
    LuceneOptions options = getConfigurator().get(LuceneOptions.class, config.getString("options"));
    return new LuceneSearcher(languages, options);
}
}
/**
 * Obtains a query builder for this object's language with Wikipedia id
 * resolution switched off, adding the concept filter when one is set.
 *
 * @return the configured query builder
 */
private QueryBuilder getQueryBuilder() {
    QueryBuilder qb = searcher.getQueryBuilderByLanguage(language);
    qb.setResolveWikipediaIds(false);
    if (conceptFilter == null) {
        return qb;
    }
    qb.addFilter(conceptFilter);
    return qb;
}
/** * Builds a phrase query over the specified field. * * @param fieldName the name of the field on which to search * @param searchString * @return */ public QueryBuilder setPhraseQuery(String fieldName, String searchString) { QueryParser parser = new QueryParser( searcher.getOptions().matchVersion, fieldName, searcher.getAnalyzerByLanguage(language)); try { searchString = QueryParserUtil.escape(searchString); // Lucene doesn't escape forward slash, but it needs to searchString = StringUtils.replace(searchString, "/", "\\/"); query = parser.parse(searchString); return this; } catch (ParseException e) { throw new RuntimeException(e); // should never happen after escaping } }
/**
 * Sets this builder's query to a MoreLikeThis query for the given Lucene
 * document over a single named text field.
 *
 * Building the query is best-effort: if Lucene raises an IOException the
 * failure is logged with its cause and this builder's query is left as it
 * was before the call.
 *
 * @param fieldName the field whose terms are used to find similar documents
 * @param luceneId  Lucene document id of the example document; must be non-negative
 * @return this builder
 * @throws IllegalArgumentException if {@code luceneId} is negative
 * @throws DaoException declared for callers; not thrown by the code in this method
 */
public QueryBuilder setMoreLikeThisQuery(String fieldName, int luceneId) throws DaoException {
    if (luceneId < 0) {
        throw new IllegalArgumentException("Illegal Lucene ID: " + luceneId);
    }
    try {
        MoreLikeThis mlt = new MoreLikeThis(searcher.getReaderByLanguage(language));
        mlt.setMaxDocFreqPct(maxPercentage);
        mlt.setMaxQueryTerms(maxQueryTerms);
        mlt.setMinDocFreq(minDocFreq);
        mlt.setMinTermFreq(minTermFreq);
        mlt.setAnalyzer(searcher.getAnalyzerByLanguage(language));
        mlt.setFieldNames(new String[]{ fieldName });
        query = mlt.like(luceneId);
    } catch (IOException e) {
        // Keep the original best-effort behaviour, but record the cause so
        // the failure can be diagnosed from the log.
        LOG.warn("Can't more like this query for luceneId: " + luceneId, e);
    }
    return this;
}
/**
 * Builds the normalised feature vector for a page from the results of a
 * MoreLikeThis search on its Lucene document.
 *
 * When the page has no Lucene document (negative doc id) a warning is
 * logged and an empty vector is returned.
 *
 * @param pageId local page id
 * @return the normalised vector, or an empty map for unindexed pages
 * @throws DaoException propagated from the query builder
 */
@Override
public TIntFloatMap getVector(int pageId) throws DaoException {
    int luceneId = searcher.getDocIdFromLocalId(pageId, language);
    if (luceneId >= 0) {
        WikiBrainScoreDoc[] similar = pruneSimilar(
                getQueryBuilder()
                        .setMoreLikeThisQuery(luceneId)
                        .search());
        return SimUtils.normalizeVector(expandScores(similar));
    }
    LOG.warn("Unindexed document " + pageId + " in " + language.getEnLangName());
    return new TIntFloatHashMap();
}
@Override public List<Explanation> getExplanations(String phrase1, String phrase2, TIntFloatMap vector1, TIntFloatMap vector2, SRResult result) throws DaoException { Leaderboard lb = new Leaderboard(5); // TODO: make 5 configurable for (int id : vector1.keys()) { if (vector2.containsKey(id)) { lb.tallyScore(id, vector1.get(id) * vector2.get(id)); } } SRResultList top = lb.getTop(); if (top.numDocs() == 0) { return Arrays.asList(new Explanation("? and ? share no tags", phrase1, phrase2)); } List<Explanation> explanations = new ArrayList<Explanation>(); for (int i = 0; i < top.numDocs(); i++) { LocalPage p = pageDao.getById(language, searcher.getLocalIdFromDocId(top.getId(i), language)); if (p != null) { explanations.add(new Explanation("Both ? and ? have similar text to ?", phrase1, phrase2, p)); } } return explanations; }
/**
 * Builds a {@code LucenePhraseAnalyzer} when the configuration's "type"
 * is "olucene"; returns null for any other type.
 *
 * The searcher is created for a {@code LanguageSet} built from "simple"
 * with the configurator's default {@code LuceneOptions}.
 *
 * @param name          not read by this method
 * @param config        configuration block; "type" and "localPageDao" are read
 * @param runtimeParams not read by this method
 * @return the analyzer, or null if the type does not match
 * @throws ConfigurationException propagated from the configurator lookups
 */
@Override
public PhraseAnalyzer get(String name, Config config, Map<String, String> runtimeParams) throws ConfigurationException {
    String type = config.getString("type");
    if (!type.equals("olucene")) {
        return null;
    }
    LocalPageDao localPageDao = getConfigurator().get(LocalPageDao.class, config.getString("localPageDao"));
    LanguageSet languages = new LanguageSet("simple");
    LuceneOptions options = getConfigurator().get(LuceneOptions.class);
    return new LucenePhraseAnalyzer(localPageDao, new LuceneSearcher(languages, options));
}
/**
 * Resolves a phrase against the plain-text field and returns up to
 * {@code n} pages with scores expressed as a fraction of the total score
 * of all retrieved hits.
 *
 * Twice as many hits as requested are retrieved; the total is summed over
 * all of them, but only the first {@code n} are returned.
 *
 * @param phrase the phrase to search for
 * @param n      the maximum number of pages to return; 0 yields an empty map
 * @return local ids mapped to their normalised scores, in hit order
 */
private LinkedHashMap<LocalId, Float> resolveTextual(String phrase, int n) {
    LinkedHashMap<LocalId, Float> expanded = new LinkedHashMap<LocalId, Float>();
    if (n == 0) {
        return expanded;
    }

    WikiBrainScoreDoc[] hits = searcher.getQueryBuilderByLanguage(language)
            .setPhraseQuery(new TextFieldElements().addPlainText(), phrase)
            .setNumHits(n*2)
            .search();

    double scoreSum = 0.0;
    for (int i = 0; i < hits.length; i++) {
        scoreSum += hits[i].score;
    }

    int limit = Math.min(n, hits.length);
    for (int i = 0; i < limit; i++) {
        WikiBrainScoreDoc hit = hits[i];
        expanded.put(new LocalId(language, hit.wpId), (float)(hit.score / scoreSum));
    }
    return expanded;
}
/**
 * Builds a {@code LucenePhraseAnalyzer} when the configuration's "type"
 * is "lucene"; returns null for any other type.
 *
 * The searcher is created from the configurator's {@code LanguageSet}
 * and default {@code LuceneOptions}.
 *
 * @param name          not read by this method
 * @param config        configuration block; "type" and "localPageDao" are read
 * @param runtimeParams not read by this method
 * @return the analyzer, or null if the type does not match
 * @throws ConfigurationException propagated from the configurator lookups
 */
@Override
public PhraseAnalyzer get(String name, Config config, Map<String, String> runtimeParams) throws ConfigurationException {
    String type = config.getString("type");
    if (!type.equals("lucene")) {
        return null;
    }
    LocalPageDao localPageDao = getConfigurator().get(LocalPageDao.class, config.getString("localPageDao"));
    LanguageSet langs = getConfigurator().get(LanguageSet.class);
    LuceneOptions options = getConfigurator().get(LuceneOptions.class);
    return new LucenePhraseAnalyzer(localPageDao, new LuceneSearcher(langs, options));
}
}
WikiBrainScoreDoc[] wikibrainScoreDocs = searcher.getQueryBuilderByLanguage(language) .setPhraseQuery(new TextFieldElements().addTitle(), phrase) .setNumHits(maxPages * DOC_MULTIPLIER) wikibrainScoreDocs = searcher.getQueryBuilderByLanguage(language) .setPhraseQuery(new TextFieldElements().addPlainText(), phrase) .setNumHits(maxPages * DOC_MULTIPLIER) phraseMultiVersion += (phrase.length() - i > 2 ? phrase.substring(i, phrase.length()) + " " : ""); wikibrainScoreDocs = searcher.getQueryBuilderByLanguage(language) .setPhraseQuery(phraseMultiVersion) .setNumHits(10)
/**
 * Runs a Lucene query in one language using the hit count currently
 * stored in this object and no filter.
 *
 * @param query    the Lucene query to run
 * @param language the language whose index is searched
 * @return the scored documents produced by the delegated overload
 */
public WikiBrainScoreDoc[] search(Query query, Language language) {
    final Filter noFilter = null;
    return search(query, language, this.hitCount, noFilter);
}
/**
 * Sets this builder's query to a phrase query over the text field
 * elements configured in the searcher's options.
 *
 * @param searchString the text to search for
 * @return whatever the delegated two-argument overload returns
 */
public QueryBuilder setPhraseQuery(String searchString) {
    return setPhraseQuery(
            searcher.getOptions().elements,
            searchString);
}
/**
 * Runs a Lucene query in one language with an explicit hit count and no
 * filter.
 *
 * @param query    the Lucene query to run
 * @param language the language whose index is searched
 * @param hitCount the maximum number of hits requested
 * @return the scored documents produced by the delegated overload
 */
public WikiBrainScoreDoc[] search(Query query, Language language, int hitCount) {
    final Filter noFilter = null;
    return search(query, language, hitCount, noFilter);
}
/**
 * Executes the query configured on this builder, passing along its
 * language, hit count, filters and Wikipedia-id resolution flag.
 *
 * @return the scored documents returned by the searcher
 * @throws IllegalArgumentException if no query has been set on this builder
 */
public WikiBrainScoreDoc[] search() {
    if (hasQuery()) {
        return searcher.search(query, language, numHits, getFilters(), resolveWikipediaIds);
    }
    throw new IllegalArgumentException("no query specified. call one of the QueryBuilder.set* methods to specify a query");
}