/**
 * Returns the numeric page id of the revised page. For any given MediaWiki
 * site, pages are uniquely identified by their page id, and MediaWiki tries
 * to keep the id stable even across title changes (moves).
 *
 * @return integer page id
 */
@Override
public int getPageId() {
    return raw.getLocalId();
}
/**
 * Returns the numeric page id of the revised page. For any given MediaWiki
 * site, pages are uniquely identified by their page id. MediaWiki will try
 * to preserve the page id even across title changes (moves).
 *
 * @return integer page id
 */
@Override
public int getPageId() {
    // Delegates to the wrapped raw page; the "local" id is the per-wiki page id.
    return raw.getLocalId();
}
/**
 * Scores candidate link mentions for one page and appends the surviving
 * candidates to the shared results list, capping the number of links at a
 * fraction of the page's word count.
 *
 * @param page the raw wiki page to process
 * @throws Exception if text extraction or scoring fails
 */
@Override
public void call(RawPage page) throws Exception {
    String text = page.getPlainText(false);
    // Score every candidate mention found in the page text.
    List<LinkInfo> candidates = scoreMentions(page.getLocalId(), text);
    // Sort by LinkInfo's natural order — presumably best candidates first,
    // so the subList below keeps the strongest ones. TODO confirm compareTo.
    Collections.sort(candidates);
    // Cap candidates at desiredWikifiedFraction of the page's word count.
    List<String> words = new StringTokenizer().getWords(language, text);
    int target = (int) (words.size() * desiredWikifiedFraction);
    if (candidates.size() > target)
        candidates = candidates.subList(0, target);
    // results appears to be shared across concurrent callers (hence the lock).
    synchronized (results) {
        results.add(candidates);
    }
}
});
/**
 * Advances the underlying iterator until a page with non-empty plain text
 * is found, storing it in {@code buffer}; if the iterator is exhausted
 * first, {@code buffer} is left null. Pages whose text extraction throws
 * are logged and skipped.
 */
private void fillBuffer() {
    while (buffer == null && iter.hasNext()) {
        RawPage rp = iter.next();
        if (rp != null) {
            try {
                String text = rp.getPlainText(false);
                if (text != null && text.trim().length() > 0) {
                    buffer = new IdAndText(rp.getLocalId(), text.trim());
                }
            } catch (Exception e) {
                // Fix: include the exception as the log cause so the stack
                // trace is not silently discarded.
                LOG.warn("Error when extracting text from: " + rp.getTitle(), e);
            }
        }
    }
}
}
/**
 * Decides whether a raw page should be processed: it must be non-null,
 * carry a namespace, pass the optional validIds filter, and belong to one
 * of the configured namespaces.
 *
 * @param rp the raw page to check (may be null)
 * @return true if the page passes all filters
 */
private boolean isInteresting(RawPage rp) {
    // Guard clauses: reject null pages and pages without a namespace.
    if (rp == null || rp.getNamespace() == null) {
        return false;
    }
    // validIds == null means "no id filter".
    if (validIds != null && !validIds.contains(rp.getLocalId())) {
        return false;
    }
    return nss.contains(rp.getNamespace());
}
/**
 * Returns whether the given raw page is of interest: non-null, with a
 * namespace contained in nss, and (when an id filter is configured) with a
 * local id contained in validIds.
 *
 * @param rp the raw page to test (may be null)
 * @return true if the page passes all filters
 */
private boolean isInteresting(RawPage rp) {
    boolean usable = rp != null && rp.getNamespace() != null;
    if (!usable) {
        return false;
    }
    // A null validIds set means every id is acceptable.
    boolean idAllowed = validIds == null || validIds.contains(rp.getLocalId());
    return idAllowed && nss.contains(rp.getNamespace());
}
/**
 * Builds a Lucene text field for the page based on the specified text field
 * elements, first resolving the page's corresponding LocalPage.
 *
 * @param page the raw page to build the field for
 * @param elements the elements that make up the text field
 * @return the constructed Lucene text field
 * @throws DaoException if the local page lookup fails
 */
public TextField buildTextField(RawPage page, TextFieldElements elements) throws DaoException {
    // Resolve the LocalPage up front, then delegate to the main overload.
    LocalPage localPage = localPageDao.getById(page.getLanguage(), page.getLocalId());
    return buildTextField(localPage, page, elements);
}
/**
 * Smoke test: wikifies the "Barack Obama" article and prints every detected
 * link together with the title of its destination page.
 *
 * @throws DaoException if any dao lookup fails
 */
public void testWikify() throws DaoException {
    int barackId = lpd.getIdByTitle("Barack Obama", language, NameSpace.ARTICLE);
    RawPage rp = rpd.getById(language, barackId);
    // Loop bound of 1 kept as-is; raise it to repeat the run.
    for (int iteration = 0; iteration < 1; iteration++) {
        List<LocalLink> detected = wikify(rp.getLocalId());
        System.out.println("Links detected for " + rp.getTitle() + " (" + iteration + ")");
        for (LocalLink link : detected) {
            LocalPage dest = lpd.getById(language, link.getDestId());
            System.out.println("\t" + link + " page " + dest.getTitle());
        }
    }
}
/**
 * Indexes a specific RawPage into the Lucene index.
 *
 * @param page the page to index
 * @throws DaoException if building a text field fails
 * @throws IllegalStateException if the indexer has been closed, or the
 *         page's language does not match this indexer's language
 */
public void indexPage(RawPage page) throws DaoException {
    if (closed) {
        throw new IllegalStateException("Indexer has already been closed!");
    }
    if (!language.equals(page.getLanguage())) {
        throw new IllegalStateException("Language mismatch!");
    }
    try {
        Document document = new Document();
        // Stored id fields let search hits be mapped back to a page & language.
        Field localIdField = new IntField(LuceneOptions.LOCAL_ID_FIELD_NAME, page.getLocalId(), Field.Store.YES);
        Field langIdField = new IntField(LuceneOptions.LANG_ID_FIELD_NAME, page.getLanguage().getId(), Field.Store.YES);
        Field canonicalTitleField = builder.buildTextField(page, new TextFieldElements().addTitle());
        document.add(localIdField);
        document.add(langIdField);
        document.add(canonicalTitleField);
        // Redirects get only id/title fields; per-option text fields are skipped.
        if (!page.isRedirect()) {
            for (LuceneOptions option : options) {
                document.add(builder.buildTextField(page, option.elements));
            }
        }
        writer.addDocument(document);
    } catch (IOException e) {
        // NOTE(review): writer failures surface as unchecked RuntimeException
        // rather than the declared DaoException — consider wrapping instead.
        throw new RuntimeException(e);
    }
}
/**
 * Loads the redirect-id -> destination-page-id mapping for the given
 * language into the in-memory redirectIdsToPageIds map. Missing entries
 * resolve to the map's no-entry value (-1).
 *
 * @param language the language whose redirects are loaded
 * @throws DaoException if reading redirects or resolving titles fails
 */
private void loadRedirectIdsIntoMemory(Language language) throws DaoException {
    redirectIdsToPageIds = new TIntIntHashMap(Constants.DEFAULT_CAPACITY, Constants.DEFAULT_LOAD_FACTOR, -1, -1);
    LOG.info("Begin loading redirects into memory: ");
    DaoFilter redirectFilter = new DaoFilter().setLanguages(language).setRedirect(true);
    int count = 0;
    for (RawPage p : rawPages.get(redirectFilter)) {
        Title pTitle = new Title(p.getRedirectTitle(), LanguageInfo.getByLanguage(language));
        int destId = localPages.getIdByTitle(pTitle.getCanonicalTitle(), language, pTitle.getNamespace());
        redirectIdsToPageIds.put(p.getLocalId(), destId);
        // Progress marker every 100k redirects.
        if (count % 100000 == 0) {
            LOG.info("loading redirect # " + count);
        }
        count++;
    }
    LOG.info("End loading redirects into memory.");
}
/**
 * Loads the redirect-id -> destination-page-id mapping for the given
 * language into the in-memory redirectIdsToPageIds map. Missing entries
 * resolve to the map's no-entry value (-1).
 *
 * @param language the language whose redirects are loaded
 * @throws DaoException if reading redirects or resolving titles fails
 */
private void loadRedirectIdsIntoMemory(Language language) throws DaoException {
    redirectIdsToPageIds = new TIntIntHashMap(Constants.DEFAULT_CAPACITY, Constants.DEFAULT_LOAD_FACTOR, -1, -1);
    // Hoisted out of the loop: the LanguageInfo lookup is loop-invariant.
    LanguageInfo langInfo = LanguageInfo.getByLanguage(language);
    int i = 0;
    LOG.info("Begin loading redirects into memory: ");
    for (RawPage p : rawPages.get(new DaoFilter().setLanguages(language).setRedirect(true))) {
        Title pTitle = new Title(p.getRedirectTitle(), langInfo);
        redirectIdsToPageIds.put(p.getLocalId(),
                localPages.getIdByTitle(pTitle.getCanonicalTitle(), language, pTitle.getNamespace()));
        // Progress marker every 100k redirects.
        if (i % 100000 == 0) {
            LOG.info("loading redirect # " + i);
        }
        i++;
    }
    LOG.info("End loading redirects into memory.");
}
/**
 * Persists a parsed raw page and its derived LocalPage, recording success
 * and failure counts in the meta dao. A failure in either save is logged
 * and counted but does not abort the other save.
 *
 * @param file the dump file the page came from (used in error messages)
 * @param rp the parsed raw page to save
 */
private void save(File file, RawPage rp) {
    // First: persist the raw page itself.
    try {
        rawPageDao.save(rp);
        metaDao.incrementRecords(rp.getClass(), rp.getLanguage());
    } catch (Exception e) {
        LOG.warn("parsing of " + file + " failed:", e);
        metaDao.incrementErrorsQuietly(rp.getClass(), rp.getLanguage());
    }
    // Second: derive and persist the LocalPage view of the same page.
    try {
        LocalPage localPage = new LocalPage(
                rp.getLanguage(), rp.getLocalId(), rp.getTitle(),
                rp.getNamespace(), rp.isRedirect(), rp.isDisambig());
        localPageDao.save(localPage);
        metaDao.incrementRecords(localPage.getClass(), localPage.getLanguage());
    } catch (Exception e) {
        LOG.warn("parsing of " + file + " failed:", e);
        metaDao.incrementErrorsQuietly(LocalPage.class, rp.getLanguage());
    }
}
/**
 * Persists a parsed raw page and its derived LocalPage, recording success
 * and failure counts in the meta dao. A failure in either save is logged
 * and counted but does not abort the other save.
 *
 * @param file the dump file the page came from (used in error messages)
 * @param rp the parsed raw page to save
 */
private void save(File file, RawPage rp) {
    try {
        rawPageDao.save(rp);
        metaDao.incrementRecords(rp.getClass(), rp.getLanguage());
    } catch (Exception e) {
        LOG.warn("parsing of " + file + " failed:", e);
        metaDao.incrementErrorsQuietly(rp.getClass(), rp.getLanguage());
    }
    try {
        // Build the LocalPage projection of the raw page and persist it too.
        LocalPage lp = new LocalPage(
                rp.getLanguage(), rp.getLocalId(), rp.getTitle(),
                rp.getNamespace(), rp.isRedirect(), rp.isDisambig()
        );
        localPageDao.save(lp);
        metaDao.incrementRecords(lp.getClass(), lp.getLanguage());
    } catch (Exception e) {
        LOG.warn("parsing of " + file + " failed:", e);
        metaDao.incrementErrorsQuietly(LocalPage.class, rp.getLanguage());
    }
}
lang, link.text, link.location.getXml().getLocalId(), destId, true,
/**
 * Records a category membership: the page at {@code cat.location} is saved
 * as a member of {@code cat.category}. Errors are counted in the meta dao
 * and rethrown as WikiBrainException.
 *
 * @param cat the parsed category link to record
 * @throws WikiBrainException if the title is not actually a category or a
 *         dao operation fails
 */
@Override
public void category(ParsedCategory cat) throws WikiBrainException {
    Language lang = cat.category.getLanguage();
    try {
        LanguageInfo langInfo = LanguageInfo.getByLanguage(lang);
        int c = counter.getAndIncrement();
        // Progress marker every 100k categories visited.
        if (c % 100000 == 0) LOG.info("Visited category #" + c);
        String catText = cat.category.getCanonicalTitle().split("\\|")[0]; // piped cat link
        catText = catText.split("#")[0]; // cat subsection
        Title catTitle = new Title(catText, langInfo);
        // NOTE(review): throws only when BOTH checks fail (&&); if either
        // check alone should suffice, this may have been intended as ||.
        if (!isCategory(catText, langInfo) && !catTitle.getNamespace().equals(NameSpace.CATEGORY)) {
            throw new WikiBrainException("Thought it was a category, was not a category.");
        }
        int catId = pageDao.getIdByTitle(catTitle.getCanonicalTitle(), lang, NameSpace.CATEGORY);
        catMemDao.save(
                new LocalCategoryMember(
                        catId,
                        cat.location.getXml().getLocalId(),
                        lang
                ));
        metaDao.incrementRecords(LocalCategoryMember.class, lang);
    } catch (DaoException e) {
        metaDao.incrementErrorsQuietly(LocalCategoryMember.class, lang);
        throw new WikiBrainException(e);
    }
}
/**
 * Saves a raw page by inserting one row with all of its fields.
 *
 * @param page the raw page to persist
 * @throws DaoException if the insert fails
 */
@Override
public void save(RawPage page) throws DaoException {
    insert(
            page.getLanguage().getId(),
            page.getLocalId(),
            page.getRevisionId(),
            // A null body is stored as an empty string.
            page.getBody() == null ? "" : page.getBody(),
            page.getTitle().getCanonicalTitle(),
            page.getLastEdit(),
            page.getNamespace().getArbitraryId(),
            page.isRedirect(),
            page.isDisambig(),
            page.getRedirectTitle()
    );
}