/**
 * Gathers the links of a parsed page and stores them on the article.
 * <p>
 * Two lists are handed to {@code setLinksInParagraphs}, {@code setLinksInTables}
 * and {@code setLinksInLists} (in that order) to be filled; afterwards the first
 * list is stored via {@link Article#setLinks} and the second via
 * {@link Article#setExternalLinks}.
 *
 * @param article the article that receives both link lists
 * @param page    the parsed page the links are taken from
 */
private void setLinks(Article article, ParsedPage page) {
    // Both lists are pre-sized to the total number of links on the page.
    final int expectedSize = page.getLinks().size();
    final List<Link> collectedLinks = new ArrayList<Link>(expectedSize);
    final List<Link> collectedExternalLinks = new ArrayList<Link>(expectedSize);

    setLinksInParagraphs(collectedLinks, collectedExternalLinks, page);
    setLinksInTables(collectedLinks, collectedExternalLinks, page);
    setLinksInLists(collectedLinks, collectedExternalLinks, page);

    article.setLinks(collectedLinks);
    article.setExternalLinks(collectedExternalLinks);
}
public static void main(String[] args){ // load a sample document (the contents are equal to "DarmstadtWikipediaArticle.txt") String documentText = TestFile.getFileText(); // get a ParsedPage object MediaWikiParserFactory pf = new MediaWikiParserFactory(); MediaWikiParser parser = pf.createParser(); ParsedPage pp = parser.parse(documentText); // Link Context (return 1 token left, 2 token right of the link) for (Link link : pp.getLinks()) { System.out.println( link.getContext(1, 0) + "<" + link.getText().toString().toUpperCase() + ">" + link.getContext(0, 2) ); } } }
/**
 * Collects the anchor texts that pages linking to the given page use for those links.
 * <p>
 * Note that this method only returns the anchors that are not equal to the page's title.
 * Anchors might contain references to sections in an article in the form of "Page#Section".
 * If you need the plain title, e.g. for checking whether the page exists in Wikipedia,
 * the Title object can be used.
 * <p>
 * Inlinking pages whose text cannot be parsed (the parser returns {@code null}) are
 * skipped; the remaining inlinking pages are still processed.
 *
 * @param page The page whose incoming link anchors are collected.
 * @return A set of strings used as anchor texts in links pointing to that page.
 * @throws WikiTitleParsingException Presumably raised by {@code page.getTitle()} when the
 *         title of {@code page} cannot be parsed.
 */
public Set<String> getInlinkAnchors(Page page) throws WikiTitleParsingException {
    Set<String> inAnchors = new HashSet<String>();
    for (Page p : page.getInlinks()) {
        ParsedPage pp = parser.parse(p.getText());
        if (pp == null) {
            // One unparsable inlinking page must not discard the anchors of all
            // pages that follow it, so skip it instead of returning early.
            continue;
        }
        for (Link l : pp.getLinks()) {
            // Looked up here rather than before the loops, so the title is only
            // resolved when there is at least one link to compare against.
            String pageTitle = page.getTitle().getPlainTitle();
            String anchorText = l.getText();
            // Keep only links that target this page and whose anchor differs from its title.
            if (l.getTarget().equals(pageTitle) && !anchorText.equals(pageTitle)) {
                inAnchors.add(anchorText);
            }
        }
    }
    return inAnchors;
}
handleLinks( pp.getLinks(), !pageHandling.get( CIT.TEXT ), sb );
return outAnchors; for (Link l : pp.getLinks()) { if (l.getTarget().length() == 0) { continue;
int begin = 0; int end = 0; for (Link link : pp.getLinks()) { if (allowedLinkTypeList.contains(link.getType().name())) {
templateText = templateText.replaceAll("\\[\\[\\]\\]", ""); ParsedPage parsedTemplate = jwpl.parse(templateText); for (Link templateLink : parsedTemplate.getLinks()){ Title destTitle = link2Title(templateLink); if (destTitle == null) { continue; }