/**
 * Persists a raw page by delegating to insert(...), flattening the page
 * into its column values. A null body is stored as the empty string.
 *
 * @param page the raw page to store
 * @throws DaoException if the underlying insert fails
 */
@Override
public void save(RawPage page) throws DaoException {
    String body = (page.getBody() == null) ? "" : page.getBody();
    insert(
            page.getLanguage().getId(),
            page.getLocalId(),
            page.getRevisionId(),
            body,
            page.getTitle().getCanonicalTitle(),
            page.getLastEdit(),
            page.getNamespace().getArbitraryId(),
            page.isRedirect(),
            page.isDisambig(),
            page.getRedirectTitle()
    );
}
/**
 * Advances the underlying iterator until a page with non-blank plain text
 * is found, storing it in {@code buffer}. If the iterator is exhausted
 * without finding one, {@code buffer} remains null.
 */
private void fillBuffer() {
    while (buffer == null && iter.hasNext()) {
        RawPage rp = iter.next();
        if (rp == null) {
            continue;
        }
        try {
            String text = rp.getPlainText(false);
            if (text != null && text.trim().length() > 0) {
                buffer = new IdAndText(rp.getLocalId(), text.trim());
            }
        } catch (Exception e) {
            // FIX: pass the exception to the logger (as done elsewhere in
            // this codebase) so the stack trace is not silently dropped.
            LOG.warn("Error when extracting text from: " + rp.getTitle(), e);
        }
    }
}
}
/**
 * Decides whether a raw page should be processed. A page qualifies when it
 * is non-null, has a namespace contained in {@code nss}, and — if an id
 * whitelist ({@code validIds}) is configured — its local id is whitelisted.
 *
 * @param rp candidate page, may be null
 * @return true when the page passes all filters
 */
private boolean isInteresting(RawPage rp) {
    if (rp == null || rp.getNamespace() == null) {
        return false;
    }
    boolean idAllowed = (validIds == null) || validIds.contains(rp.getLocalId());
    return idAllowed && nss.contains(rp.getNamespace());
}
/**
 * Persists a raw page and its derived LocalPage, recording success/error
 * counts in metaDao. The two saves are deliberately independent best-effort
 * steps: a failure in the raw-page save does not prevent the LocalPage save.
 *
 * @param file source dump file, used only in error messages
 * @param rp   the parsed page to store
 */
private void save(File file, RawPage rp) {
    try {
        rawPageDao.save(rp);
        metaDao.incrementRecords(rp.getClass(), rp.getLanguage());
    } catch (Exception e) {
        LOG.warn("parsing of " + file + " failed:", e);
        metaDao.incrementErrorsQuietly(rp.getClass(), rp.getLanguage());
    }
    try {
        // Build the structured LocalPage view of the same page.
        LocalPage lp = new LocalPage(
                rp.getLanguage(),
                rp.getLocalId(),
                rp.getTitle(),
                rp.getNamespace(),
                rp.isRedirect(),
                rp.isDisambig()
        );
        localPageDao.save(lp);
        metaDao.incrementRecords(lp.getClass(), lp.getLanguage());
    } catch (Exception e) {
        LOG.warn("parsing of " + file + " failed:", e);
        // NOTE(review): uses LocalPage.class here but rp.getClass() in the
        // first catch — confirm the asymmetry is intentional.
        metaDao.incrementErrorsQuietly(LocalPage.class, rp.getLanguage());
    }
}
/**
 * Worker loop: drains RawPages from the shared queue and indexes them.
 * The POISON_PILL sentinel is re-queued (so sibling workers also see it)
 * and terminates this worker. Per-page failures are counted via metaDao
 * and logged, but do not stop the loop.
 */
@Override
public void run() {
    boolean finished = false;
    while (!finished) {
        RawPage rp = null;
        Language lang = null;
        try {
            rp = queue.poll(100, TimeUnit.MILLISECONDS);
            if (rp == POISON_PILL) {
                // Propagate the shutdown signal to the other workers.
                queue.put(rp);
                finished = true;
            } else if (rp != null) {
                lang = rp.getLanguage();
                luceneIndexer.indexPage(rp);
                metaDao.incrementRecords(LuceneSearcher.class, lang);
            }
        } catch (InterruptedException e) {
            LOG.warn("LuceneLoader.Worker received interrupt.");
            // FIX: restore the interrupt flag so callers/executors can
            // observe that this thread was interrupted.
            Thread.currentThread().interrupt();
            return;
        } catch (Exception e) {
            metaDao.incrementErrorsQuietly(LuceneSearcher.class, lang);
            String title = "unknown";
            if (rp != null) title = rp.getTitle().toString();
            LOG.warn("exception while parsing " + title, e);
        }
    }
}
}
/**
 * Indexes a specific RawPage: builds a Lucene document containing the local
 * id, language id, and canonical-title field, plus one text field per
 * configured option for non-redirect pages, and adds it to the writer.
 *
 * @param page the page to index; must match this indexer's language
 * @throws DaoException if writing the document to the index fails
 * @throws IllegalStateException if the indexer is closed or the page's
 *         language does not match this indexer's language
 */
public void indexPage(RawPage page) throws DaoException {
    if (closed) {
        throw new IllegalStateException("Indexer has already been closed!");
    }
    if (!language.equals(page.getLanguage())) {
        throw new IllegalStateException("Language mismatch!");
    }
    try {
        Document document = new Document();
        Field localIdField = new IntField(LuceneOptions.LOCAL_ID_FIELD_NAME,
                page.getLocalId(), Field.Store.YES);
        Field langIdField = new IntField(LuceneOptions.LANG_ID_FIELD_NAME,
                page.getLanguage().getId(), Field.Store.YES);
        Field canonicalTitleField = builder.buildTextField(
                page, new TextFieldElements().addTitle());
        document.add(localIdField);
        document.add(langIdField);
        document.add(canonicalTitleField);
        if (!page.isRedirect()) {
            // Redirects only get the id/title fields; real pages also get
            // the full text field for each configured option.
            for (LuceneOptions option : options) {
                document.add(builder.buildTextField(page, option.elements));
            }
        }
        writer.addDocument(document);
    } catch (IOException e) {
        // FIX: surface I/O failures through the declared DaoException
        // instead of an undeclared RuntimeException.
        throw new DaoException(e);
    }
}
/**
 * Smoke test: wikifies the "Barack Obama" article and prints every
 * detected link together with its destination page title.
 *
 * @throws DaoException on DAO failure
 */
public void testWikify() throws DaoException {
    int barackId = lpd.getIdByTitle("Barack Obama", language, NameSpace.ARTICLE);
    RawPage rp = rpd.getById(language, barackId);
    for (int pass = 0; pass < 1; pass++) {
        List<LocalLink> detected = wikify(rp.getLocalId());
        System.out.println("Links detected for " + rp.getTitle() + " (" + pass + ")");
        for (LocalLink link : detected) {
            LocalPage dest = lpd.getById(language, link.getDestId());
            System.out.println("\t" + link + " page " + dest.getTitle());
        }
    }
}
/**
 * <p>
 * Returns the title of the revised page including namespace prefix and
 * subpages, formatted as displayed on an HTML page (spaces, not the
 * underscores used in MediaWiki URLs). Pages in the main (WIKIPEDIA)
 * namespace carry no prefix.
 * </p>
 * <p>
 * The prefixed title is only a momentary key: titles and namespaces change
 * when pages are moved, so the page id is the stable identifier across
 * history.
 * </p>
 *
 * @return title string
 */
@Override
public String getPrefixedTitle() {
    Title title = raw.getTitle();
    String bare = title.getTitleStringWithoutNamespace();
    return (raw.getNamespace() == NameSpace.WIKIPEDIA)
            ? bare
            : title.getNamespaceString() + ":" + bare;
}
/**
 * Builds a Lucene text field for a raw page based on the specified text
 * field elements, first resolving the corresponding LocalPage by id.
 *
 * @param page the raw page providing language and local id
 * @param elements the elements to include in the field
 * @return the built text field
 * @throws DaoException if the LocalPage lookup or field build fails
 */
public TextField buildTextField(RawPage page, TextFieldElements elements) throws DaoException {
    LocalPage localPage = localPageDao.getById(page.getLanguage(), page.getLocalId());
    return buildTextField(localPage, page, elements);
}
// NOTE(review): fragment of a larger dispatch method — the try block and
// several braces are closed beyond this excerpt.
if (xml.isRedirect()) {
    ParsedRedirect pr = new ParsedRedirect();
    pr.location = new ParsedLocation(xml, -1, -1, -1);
    // NOTE(review): pr is populated but not visibly used here — confirm a
    // visitor/handler consumes it later or this is dead code.
} else {
    try {
        ParsedPage pp = jwpl.parse(xml.getBody());
        if (pp == null) {
            LOG.debug("invalid page: " + xml.getBody());
            // NOTE(review): as excerpted, this namespace dispatch sits inside
            // the pp == null branch and would call parseCategory/parseArticle
            // with a null ParsedPage — verify brace placement in the full file.
            if (xml.getNamespace() == NameSpace.CATEGORY) {
                parseCategory(xml, pp);
            } else if (xml.getNamespace() == NameSpace.ARTICLE) {
                parseArticle(xml, pp);
/**
 * Scores candidate link mentions for the page's plain text, keeps only the
 * top fraction (desiredWikifiedFraction of the page's word count), and
 * appends that list to the shared results collection.
 *
 * @param page the page to wikify
 * @throws Exception on scoring or tokenization failure
 */
@Override
public void call(RawPage page) throws Exception {
    String plainText = page.getPlainText(false);
    List<LinkInfo> scored = scoreMentions(page.getLocalId(), plainText);
    Collections.sort(scored);
    int wordCount = new StringTokenizer().getWords(language, plainText).size();
    int keep = (int) (wordCount * desiredWikifiedFraction);
    List<LinkInfo> top = (scored.size() > keep) ? scored.subList(0, keep) : scored;
    synchronized (results) {
        results.add(top);
    }
}
});
/**
 * Returns the numeric page id of the revised page. Page ids uniquely
 * identify pages on a given MediaWiki site and are preserved across title
 * changes (moves) whenever possible.
 *
 * @return integer page id
 */
@Override
public int getPageId() {
    return this.raw.getLocalId();
}
/**
 * Human-readable summary in the form "title / localId (langCode)".
 *
 * @return formatted description of this page
 */
public String toString() {
    String langCode = lang.getLangCode();
    return String.format("%s / %s (%s)", getTitle(), this.localId, langCode);
}
}
/**
 * Loads the redirect-id -&gt; target-page-id mapping for one language into
 * the in-memory TIntIntHashMap (missing-key sentinel: -1), logging progress
 * every 100,000 redirects.
 *
 * @param language the language whose redirects are loaded
 * @throws DaoException on DAO failure
 */
private void loadRedirectIdsIntoMemory(Language language) throws DaoException {
    redirectIdsToPageIds = new TIntIntHashMap(
            Constants.DEFAULT_CAPACITY, Constants.DEFAULT_LOAD_FACTOR, -1, -1);
    // FIX: hoist the loop-invariant LanguageInfo lookup out of the loop.
    LanguageInfo langInfo = LanguageInfo.getByLanguage(language);
    int i = 0;
    LOG.info("Begin loading redirects into memory: ");
    for (RawPage p : rawPages.get(new DaoFilter().setLanguages(language).setRedirect(true))) {
        Title pTitle = new Title(p.getRedirectTitle(), langInfo);
        redirectIdsToPageIds.put(
                p.getLocalId(),
                localPages.getIdByTitle(pTitle.getCanonicalTitle(), language, pTitle.getNamespace()));
        if (i % 100000 == 0) {
            LOG.info("loading redirect # " + i);
        }
        i++;
    }
    LOG.info("End loading redirects into memory.");
}
/**
 * Concatenates the requested text elements — repeated title, redirect
 * titles, and plain text — into a single space-separated Lucene TextField.
 *
 * @param localPage structured page (used for redirect lookups)
 * @param rawPage   raw page providing title and plain text
 * @param elements  which elements to include
 * @return the assembled, stored text field
 * @throws DaoException on DAO failure
 */
private TextField buildTextField(LocalPage localPage, RawPage rawPage, TextFieldElements elements) throws DaoException {
    StringBuilder sb = new StringBuilder();
    String title = rawPage.getTitle().getCanonicalTitle();
    // usesTitle() is a repetition count: append the title that many times.
    for (int i=0; i<elements.usesTitle(); i++) {
        sb.append(title);
        sb.append(" ");
    }
    if (elements.usesRedirects()) {
        // Append the canonical title of every page that redirects here.
        TIntIterator iterator = redirectDao.getRedirects(localPage).iterator();
        while (iterator.hasNext()) {
            // NOTE(review): getById may return null for a dangling redirect
            // id, which would NPE here — confirm upstream guarantees.
            sb.append(localPageDao
                    .getById(localPage.getLanguage(), iterator.next())
                    .getTitle()
                    .getCanonicalTitle());
            sb.append(" ");
        }
    }
    if (elements.usesPlainText()) {
        String plainText = rawPage.getPlainText();
        sb.append(plainText);
    }
    return new TextField(elements.getTextFieldName(), sb.toString().trim(), Field.Store.YES);
}
/**
 * Tokenizes the page's plain text into sentences and then words,
 * accumulating results into the supplied counts map.
 * NOTE(review): this method is truncated in the excerpt — the body of the
 * inner word loop and the closing braces are not visible.
 *
 * @param counts accumulator map (long key -> int count)
 * @param page   the page whose text is processed
 */
private void processPage(TLongIntMap counts, RawPage page) {
    Language lang = page.getLanguage();
    StringTokenizer tokenizer = new StringTokenizer();
    StringBuilder buffer = new StringBuilder();
    for (Token sentence : tokenizer.getSentenceTokens(lang, page.getPlainText())) {
        List<Token> words = tokenizer.getWordTokens(lang, sentence);
        for (int i = 0; i < words.size(); i++) {
/**
 * Returns the text content of the current revision. This is traditionally
 * wiki markup edited by users, but extensions (e.g. Wikibase) may store
 * other formats such as JSON; consult {@link #getFormat()} for the format
 * and {@link #getModel()} for the content model when interpreting it.
 *
 * @return text content of the revision
 */
@Override
public String getText() {
    return this.raw.getBody();
}
/**
 * Records a parse failure for category-member extraction. The error is
 * accounted for quietly via the meta DAO rather than rethrown or logged.
 *
 * @param rp the page whose parse failed
 * @param e  the failure cause (recorded only as a count here)
 */
@Override
public void parseError(RawPage rp, Exception e) {
    Language lang = rp.getLanguage();
    metaDao.incrementErrorsQuietly(LocalCategoryMember.class, lang);
}
/**
 * Returns a plain text rendering of the body of this RawPage, delegating
 * to {@link #getPlainText(boolean)} with the flag set to false.
 *
 * @return plain text of the page body
 */
public String getPlainText() {
    return this.getPlainText(false);
}

/**
/**
 * <p>
 * Returns the id of the MediaWiki namespace of the revised page. Its
 * meaning is site-configuration dependent: 0 is usually the main namespace,
 * even ids usually denote content namespaces, and each odd successor is the
 * corresponding talk namespace.
 * </p>
 * <p>
 * Title plus namespace is only a momentary key — pages can be moved — so
 * the page id remains the stable identifier across history.
 * </p>
 *
 * @return integer namespace id
 */
@Override
public int getNamespace() {
    return this.raw.getNamespace().getValue();
}