/**
 * Persists a raw page by delegating to insert(...), flattening the page
 * into its column values. A null body is stored as the empty string.
 *
 * @param page the raw page to store
 * @throws DaoException if the underlying insert fails
 */
@Override
public void save(RawPage page) throws DaoException {
    String body = (page.getBody() == null) ? "" : page.getBody();
    insert(
            page.getLanguage().getId(),
            page.getLocalId(),
            page.getRevisionId(),
            body,
            page.getTitle().getCanonicalTitle(),
            page.getLastEdit(),
            page.getNamespace().getArbitraryId(),
            page.isRedirect(),
            page.isDisambig(),
            page.getRedirectTitle()
    );
}
/**
 * Advances the underlying iterator until a page with non-blank plain text
 * is found, storing it in {@code buffer}. If the iterator is exhausted
 * without finding one, {@code buffer} remains null.
 */
private void fillBuffer() {
    while (buffer == null && iter.hasNext()) {
        RawPage rp = iter.next();
        if (rp == null) {
            continue;
        }
        try {
            String text = rp.getPlainText(false);
            if (text != null && text.trim().length() > 0) {
                buffer = new IdAndText(rp.getLocalId(), text.trim());
            }
        } catch (Exception e) {
            // FIX: pass the exception to the logger (as done elsewhere in
            // this codebase) so the stack trace is not silently dropped.
            LOG.warn("Error when extracting text from: " + rp.getTitle(), e);
        }
    }
}
}
/**
 * Decides whether a raw page should be processed. A page qualifies when it
 * is non-null, has a namespace contained in {@code nss}, and — if an id
 * whitelist ({@code validIds}) is configured — its local id is whitelisted.
 *
 * @param rp candidate page, may be null
 * @return true when the page passes all filters
 */
private boolean isInteresting(RawPage rp) {
    if (rp == null || rp.getNamespace() == null) {
        return false;
    }
    boolean idAllowed = (validIds == null) || validIds.contains(rp.getLocalId());
    return idAllowed && nss.contains(rp.getNamespace());
}
/**
 * Persists a raw page and its derived LocalPage, recording success/error
 * counts in metaDao. The two saves are deliberately independent best-effort
 * steps: a failure in the raw-page save does not prevent the LocalPage save.
 *
 * @param file source dump file, used only in error messages
 * @param rp   the parsed page to store
 */
private void save(File file, RawPage rp) {
    try {
        rawPageDao.save(rp);
        metaDao.incrementRecords(rp.getClass(), rp.getLanguage());
    } catch (Exception e) {
        LOG.warn("parsing of " + file + " failed:", e);
        metaDao.incrementErrorsQuietly(rp.getClass(), rp.getLanguage());
    }
    try {
        // Build the structured LocalPage view of the same page.
        LocalPage lp = new LocalPage(
                rp.getLanguage(),
                rp.getLocalId(),
                rp.getTitle(),
                rp.getNamespace(),
                rp.isRedirect(),
                rp.isDisambig()
        );
        localPageDao.save(lp);
        metaDao.incrementRecords(lp.getClass(), lp.getLanguage());
    } catch (Exception e) {
        LOG.warn("parsing of " + file + " failed:", e);
        // NOTE(review): uses LocalPage.class here but rp.getClass() in the
        // first catch — confirm the asymmetry is intentional.
        metaDao.incrementErrorsQuietly(LocalPage.class, rp.getLanguage());
    }
}
/**
 * Worker loop: drains RawPages from the shared queue and indexes them.
 * The POISON_PILL sentinel is re-queued (so sibling workers also see it)
 * and terminates this worker. Per-page failures are counted via metaDao
 * and logged, but do not stop the loop.
 */
@Override
public void run() {
    boolean finished = false;
    while (!finished) {
        RawPage rp = null;
        Language lang = null;
        try {
            rp = queue.poll(100, TimeUnit.MILLISECONDS);
            if (rp == POISON_PILL) {
                // Propagate the shutdown signal to the other workers.
                queue.put(rp);
                finished = true;
            } else if (rp != null) {
                lang = rp.getLanguage();
                luceneIndexer.indexPage(rp);
                metaDao.incrementRecords(LuceneSearcher.class, lang);
            }
        } catch (InterruptedException e) {
            LOG.warn("LuceneLoader.Worker received interrupt.");
            // FIX: restore the interrupt flag so callers/executors can
            // observe that this thread was interrupted.
            Thread.currentThread().interrupt();
            return;
        } catch (Exception e) {
            metaDao.incrementErrorsQuietly(LuceneSearcher.class, lang);
            String title = "unknown";
            if (rp != null) title = rp.getTitle().toString();
            LOG.warn("exception while parsing " + title, e);
        }
    }
}
}
/**
 * Indexes a specific RawPage: builds a Lucene document containing the local
 * id, language id, and canonical-title field, plus one text field per
 * configured option for non-redirect pages, and adds it to the writer.
 *
 * @param page the page to index; must match this indexer's language
 * @throws DaoException if writing the document to the index fails
 * @throws IllegalStateException if the indexer is closed or the page's
 *         language does not match this indexer's language
 */
public void indexPage(RawPage page) throws DaoException {
    if (closed) {
        throw new IllegalStateException("Indexer has already been closed!");
    }
    if (!language.equals(page.getLanguage())) {
        throw new IllegalStateException("Language mismatch!");
    }
    try {
        Document document = new Document();
        Field localIdField = new IntField(LuceneOptions.LOCAL_ID_FIELD_NAME,
                page.getLocalId(), Field.Store.YES);
        Field langIdField = new IntField(LuceneOptions.LANG_ID_FIELD_NAME,
                page.getLanguage().getId(), Field.Store.YES);
        Field canonicalTitleField = builder.buildTextField(
                page, new TextFieldElements().addTitle());
        document.add(localIdField);
        document.add(langIdField);
        document.add(canonicalTitleField);
        if (!page.isRedirect()) {
            // Redirects only get the id/title fields; real pages also get
            // the full text field for each configured option.
            for (LuceneOptions option : options) {
                document.add(builder.buildTextField(page, option.elements));
            }
        }
        writer.addDocument(document);
    } catch (IOException e) {
        // FIX: surface I/O failures through the declared DaoException
        // instead of an undeclared RuntimeException.
        throw new DaoException(e);
    }
}
/**
 * Smoke test: wikifies the "Barack Obama" article and prints every
 * detected link together with its destination page title.
 *
 * @throws DaoException on DAO failure
 */
public void testWikify() throws DaoException {
    int barackId = lpd.getIdByTitle("Barack Obama", language, NameSpace.ARTICLE);
    RawPage rp = rpd.getById(language, barackId);
    for (int pass = 0; pass < 1; pass++) {
        List<LocalLink> detected = wikify(rp.getLocalId());
        System.out.println("Links detected for " + rp.getTitle() + " (" + pass + ")");
        for (LocalLink link : detected) {
            LocalPage dest = lpd.getById(language, link.getDestId());
            System.out.println("\t" + link + " page " + dest.getTitle());
        }
    }
}
/**
 * <p>
 * Returns the title of the revised page including namespace prefix and
 * subpages, formatted as displayed on an HTML page (spaces, not the
 * underscores used in MediaWiki URLs). Pages in the main (WIKIPEDIA)
 * namespace carry no prefix.
 * </p>
 * <p>
 * The prefixed title is only a momentary key: titles and namespaces change
 * when pages are moved, so the page id is the stable identifier across
 * history.
 * </p>
 *
 * @return title string
 */
@Override
public String getPrefixedTitle() {
    Title title = raw.getTitle();
    String bare = title.getTitleStringWithoutNamespace();
    return (raw.getNamespace() == NameSpace.WIKIPEDIA)
            ? bare
            : title.getNamespaceString() + ":" + bare;
}
/**
 * Builds a Lucene text field for a raw page based on the specified text
 * field elements, first resolving the corresponding LocalPage by id.
 *
 * @param page the raw page providing language and local id
 * @param elements the elements to include in the field
 * @return the built text field
 * @throws DaoException if the LocalPage lookup or field build fails
 */
public TextField buildTextField(RawPage page, TextFieldElements elements) throws DaoException {
    LocalPage localPage = localPageDao.getById(page.getLanguage(), page.getLocalId());
    return buildTextField(localPage, page, elements);
}
// NOTE(review): fragment of a larger dispatch method — the try block and
// several braces are closed beyond this excerpt.
if (xml.isRedirect()) {
    ParsedRedirect pr = new ParsedRedirect();
    pr.location = new ParsedLocation(xml, -1, -1, -1);
    // NOTE(review): pr is populated but not visibly used here — confirm a
    // visitor/handler consumes it later or this is dead code.
} else {
    try {
        ParsedPage pp = jwpl.parse(xml.getBody());
        if (pp == null) {
            LOG.debug("invalid page: " + xml.getBody());
            // NOTE(review): as excerpted, this namespace dispatch sits inside
            // the pp == null branch and would call parseCategory/parseArticle
            // with a null ParsedPage — verify brace placement in the full file.
            if (xml.getNamespace() == NameSpace.CATEGORY) {
                parseCategory(xml, pp);
            } else if (xml.getNamespace() == NameSpace.ARTICLE) {
                parseArticle(xml, pp);
/**
 * Scores candidate link mentions for the page's plain text, keeps only the
 * top fraction (desiredWikifiedFraction of the page's word count), and
 * appends that list to the shared results collection.
 *
 * @param page the page to wikify
 * @throws Exception on scoring or tokenization failure
 */
@Override
public void call(RawPage page) throws Exception {
    String plainText = page.getPlainText(false);
    List<LinkInfo> scored = scoreMentions(page.getLocalId(), plainText);
    Collections.sort(scored);
    int wordCount = new StringTokenizer().getWords(language, plainText).size();
    int keep = (int) (wordCount * desiredWikifiedFraction);
    List<LinkInfo> top = (scored.size() > keep) ? scored.subList(0, keep) : scored;
    synchronized (results) {
        results.add(top);
    }
}
});
/**
 * Returns the numeric page id of the revised page. Page ids uniquely
 * identify pages on a given MediaWiki site and are preserved across title
 * changes (moves) whenever possible.
 *
 * @return integer page id
 */
@Override
public int getPageId() {
    return this.raw.getLocalId();
}
/**
 * Human-readable summary in the form "title / localId (langCode)".
 *
 * @return formatted description of this page
 */
public String toString() {
    String langCode = lang.getLangCode();
    return String.format("%s / %s (%s)", getTitle(), this.localId, langCode);
}
}
/**
 * Loads the redirect-id -&gt; target-page-id mapping for one language into
 * the in-memory TIntIntHashMap (missing-key sentinel: -1), logging progress
 * every 100,000 redirects.
 *
 * @param language the language whose redirects are loaded
 * @throws DaoException on DAO failure
 */
private void loadRedirectIdsIntoMemory(Language language) throws DaoException {
    redirectIdsToPageIds = new TIntIntHashMap(
            Constants.DEFAULT_CAPACITY, Constants.DEFAULT_LOAD_FACTOR, -1, -1);
    // FIX: hoist the loop-invariant LanguageInfo lookup out of the loop.
    LanguageInfo langInfo = LanguageInfo.getByLanguage(language);
    int i = 0;
    LOG.info("Begin loading redirects into memory: ");
    for (RawPage p : rawPages.get(new DaoFilter().setLanguages(language).setRedirect(true))) {
        Title pTitle = new Title(p.getRedirectTitle(), langInfo);
        redirectIdsToPageIds.put(
                p.getLocalId(),
                localPages.getIdByTitle(pTitle.getCanonicalTitle(), language, pTitle.getNamespace()));
        if (i % 100000 == 0) {
            LOG.info("loading redirect # " + i);
        }
        i++;
    }
    LOG.info("End loading redirects into memory.");
}
/**
 * Concatenates the requested text elements — repeated title, redirect
 * titles, and plain text — into a single space-separated Lucene TextField.
 *
 * @param localPage structured page (used for redirect lookups)
 * @param rawPage   raw page providing title and plain text
 * @param elements  which elements to include
 * @return the assembled, stored text field
 * @throws DaoException on DAO failure
 */
private TextField buildTextField(LocalPage localPage, RawPage rawPage, TextFieldElements elements) throws DaoException {
    StringBuilder sb = new StringBuilder();
    String title = rawPage.getTitle().getCanonicalTitle();
    // usesTitle() is a repetition count: append the title that many times.
    for (int i=0; i<elements.usesTitle(); i++) {
        sb.append(title);
        sb.append(" ");
    }
    if (elements.usesRedirects()) {
        // Append the canonical title of every page that redirects here.
        TIntIterator iterator = redirectDao.getRedirects(localPage).iterator();
        while (iterator.hasNext()) {
            // NOTE(review): getById may return null for a dangling redirect
            // id, which would NPE here — confirm upstream guarantees.
            sb.append(localPageDao
                    .getById(localPage.getLanguage(), iterator.next())
                    .getTitle()
                    .getCanonicalTitle());
            sb.append(" ");
        }
    }
    if (elements.usesPlainText()) {
        String plainText = rawPage.getPlainText();
        sb.append(plainText);
    }
    return new TextField(elements.getTextFieldName(), sb.toString().trim(), Field.Store.YES);
}
/**
 * Tokenizes the page's plain text into sentences and then words,
 * accumulating results into the supplied counts map.
 * NOTE(review): this method is truncated in the excerpt — the body of the
 * inner word loop and the closing braces are not visible.
 *
 * @param counts accumulator map (long key -> int count)
 * @param page   the page whose text is processed
 */
private void processPage(TLongIntMap counts, RawPage page) {
    Language lang = page.getLanguage();
    StringTokenizer tokenizer = new StringTokenizer();
    StringBuilder buffer = new StringBuilder();
    for (Token sentence : tokenizer.getSentenceTokens(lang, page.getPlainText())) {
        List<Token> words = tokenizer.getWordTokens(lang, sentence);
        for (int i = 0; i < words.size(); i++) {
/**
 * Returns the text content of the current revision. This is traditionally
 * wiki markup edited by users, but extensions (e.g. Wikibase) may store
 * other formats such as JSON; consult {@link #getFormat()} for the format
 * and {@link #getModel()} for the content model when interpreting it.
 *
 * @return text content of the revision
 */
@Override
public String getText() {
    return this.raw.getBody();
}
/**
 * Records a parse failure for category-member extraction. The error is
 * accounted for quietly via the meta DAO rather than rethrown or logged.
 *
 * @param rp the page whose parse failed
 * @param e  the failure cause (recorded only as a count here)
 */
@Override
public void parseError(RawPage rp, Exception e) {
    Language lang = rp.getLanguage();
    metaDao.incrementErrorsQuietly(LocalCategoryMember.class, lang);
}
/**
 * Returns a plain text rendering of the body of this RawPage, delegating
 * to {@link #getPlainText(boolean)} with the flag set to false.
 *
 * @return plain text of the page body
 */
public String getPlainText() {
    return this.getPlainText(false);
}

/**
/**
 * <p>
 * Returns the id of the MediaWiki namespace of the revised page. Its
 * meaning is site-configuration dependent: 0 is usually the main namespace,
 * even ids usually denote content namespaces, and each odd successor is the
 * corresponding talk namespace.
 * </p>
 * <p>
 * Title plus namespace is only a momentary key — pages can be moved — so
 * the page id remains the stable identifier across history.
 * </p>
 *
 * @return integer namespace id
 */
@Override
public int getNamespace() {
    return this.raw.getNamespace().getValue();
}