levelModifier = pp.getSection(0).getLevel()-1; handleContent( pp.getFirstParagraph(), firstParagraphHandling, sb ); deleteParagraph( pp.getFirstParagraphNr(), pp.getSections() ); for( Section s: pp.getSections() ) handleSection( s, sb ); sb.append( pp.getText() ); handleSpans( pp.getFormatSpans( FormatType.BOLD ), pp.getText(), sb ); handleSpans( pp.getFormatSpans( FormatType.ITALIC ), pp.getText(), sb ); handleLinks( pp.getLinks(), !pageHandling.get( CIT.TEXT ), sb );
protected void setEnWikiTitle(Article article, ParsedPage page) { if (article.isLang(Language.EN)) { return; } try { if (page.getLanguages() == null) { article.setEnWikiTitle(""); return; } } catch (final NullPointerException e) { // FIXME title is always null! logger.warn("no languages for page {} ", article.getTitle()); return; } for (final de.tudarmstadt.ukp.wikipedia.parser.Link l : page.getLanguages()) { if (l.getText().startsWith("en:")) { article.setEnWikiTitle(l.getTarget().substring(3)); break; } } }
"<table class=\"ParsedPage\">\n"+ "<tr><th class=\"ParsedPage\">ParsedPage: \n" + pp.getName()+ "</th></tr>\n"); for( Section s: pp.getSections() ) { result.append( sectionToHtml( s )); if( pp.getCategoryElement()!= null ){ result.append("<tr><td class=\"ParsedPage\">\n"); result.append("Categories:\n" + contentElementToHtml( pp.getCategoryElement() )); result.append("</td></tr>\n"); if( pp.getLanguagesElement()!= null ){ result.append("<tr><td class=\"ParsedPage\">\n"); result.append("Languages:\n" + contentElementToHtml( pp.getLanguagesElement() )); result.append("</td></tr>\n");
/**
 * Parses the markup of {@code page} and returns its text.
 *
 * <p>If the {@code onlyFirstParagraph} flag is set, only the text of paragraph 0 is
 * returned. An empty string is returned when the parser yields {@code null}, or when
 * paragraph 0 is missing in first-paragraph mode.
 *
 * @param page the page whose text is parsed
 * @return the extracted text, or {@code ""} as described above
 */
private String getPlainDocumentText(Page page) {
    final ParsedPage parsed = parser.parse(page.getText());
    if (parsed == null) {
        return "";
    }
    if (!onlyFirstParagraph) {
        return parsed.getText();
    }
    return parsed.getParagraph(0) != null ? parsed.getParagraph(0).getText() : "";
}
/**
 * Collects the texts of all definition lists and nested lists of {@code page} and stores
 * them on {@code article}.
 *
 * <p>Each definition list and each nested-list container contributes one inner list of
 * strings; definition lists come first, nested-list containers second.
 *
 * @param article the article receiving the collected lists
 * @param page    the parsed page the lists are read from
 */
private void setLists(Article article, ParsedPage page) {
    final List<List<String>> collected = new ArrayList<List<String>>();

    // One inner list per definition list, holding the text of every definition.
    for (DefinitionList definitionList : page.getDefinitionLists()) {
        final List<String> definitionTexts = new ArrayList<String>();
        for (ContentElement definition : definitionList.getDefinitions()) {
            definitionTexts.add(definition.getText());
        }
        collected.add(definitionTexts);
    }

    // One inner list per nested-list container, holding the text of every direct child.
    for (NestedListContainer container : page.getNestedLists()) {
        final List<String> itemTexts = new ArrayList<String>();
        for (NestedList item : container.getNestedLists()) {
            itemTexts.add(item.getText());
        }
        collected.add(itemTexts);
    }

    article.setLists(collected);
}
/**
 * Runs {@code checkRange} on every section of the given page.
 *
 * <p>A section whose runtime class is exactly {@code SectionContent} is dispatched to the
 * {@code SectionContent} overload; every other section is cast to {@code SectionContainer}.
 *
 * @param pp the parsed page whose sections are checked
 */
public static void checkRange(ParsedPage pp) {
    for (Section section : pp.getSections()) {
        // Exact class comparison on purpose: subclasses take the container branch.
        final boolean isPlainContent = section.getClass() == SectionContent.class;
        if (isPlainContent) {
            checkRange((SectionContent) section);
        } else {
            checkRange((SectionContainer) section);
        }
    }
}
pp.setName(name); len_allPages += len_page; if( pp.nrOfDefinitionLists() != 0 )nrOfPagesWithDl++; if( pp.nrOfNestedLists() != 0 )nrOfPagesWithNl++; if( pp.nrOfTables() != 0 ) nrOfPagesWithTables++; if( pp.getTemplates().size() != 0 ) nrOfPagesWithTemplates++; if( pp.getSections().size()>1 )nrOfPagesWithSubSections++; for( FormatType ft: pp.getFormats() ){ if( ft == FormatType.BOLD ) nrOfPagesWithBold++; if( ft == FormatType.ITALIC ) nrOfPagesWithItalic++;
int paraNum = -pp.getFirstParagraphNr(); for (Section curSection: pp.getSections()){ try{ ParsedLink.SubarticleType secSubType = subarticleParser.isSeeAlsoHeader(lang, curSection.getTitle()); templateText = templateText.replaceAll("\\[\\[\\]\\]", ""); ParsedPage parsedTemplate = jwpl.parse(templateText); for (Link templateLink : parsedTemplate.getLinks()){ Title destTitle = link2Title(templateLink); if (destTitle == null) { continue; } for (Link cat : pp.getCategories()){ String linkText = cat.getText(); if (linkText.contains(Pattern.quote("|"))){
/**
 * Parses the markup of {@code page} and returns the text of the whole parsed page.
 *
 * @param page the page whose text is parsed
 * @return the parsed page's text, or {@code ""} when the parser yields {@code null}
 */
@Override
protected String getPlainDocumentText(Page page) {
    final ParsedPage parsed = parser.parse(page.getText());
    return parsed != null ? parsed.getText() : "";
}
/**
 * Gathers the links of {@code page} from paragraphs, tables and lists (in that order) and
 * stores them on {@code article}.
 *
 * <p>Two lists are filled by the helper methods: one is passed to
 * {@code article.setLinks}, the other to {@code article.setExternalLinks}. Both are
 * pre-sized to the number of links the page reports.
 *
 * @param article the article receiving the links
 * @param page    the parsed page the links are read from
 */
private void setLinks(Article article, ParsedPage page) {
    final int expectedSize = page.getLinks().size();
    final List<Link> pageLinks = new ArrayList<Link>(expectedSize);
    final List<Link> externalLinks = new ArrayList<Link>(expectedSize);

    setLinksInParagraphs(pageLinks, externalLinks, page);
    setLinksInTables(pageLinks, externalLinks, page);
    setLinksInLists(pageLinks, externalLinks, page);

    article.setLinks(pageLinks);
    article.setExternalLinks(externalLinks);
}
/** * Prints the targets of the internal links found in the page <i>Germany</i>. * @param args * @throws WikiApiException */ public static void main(String[] args) throws WikiApiException { // load a sample document (the contents are equal to "DarmstadtWikipediaArticle.txt") String documentText = TestFile.getFileText(); // get a ParsedPage object MediaWikiParserFactory pf = new MediaWikiParserFactory(); MediaWikiParser parser = pf.createParser(); ParsedPage pp = parser.parse(documentText); // only the links to other Wikipedia language editions for (Link language : pp.getLanguages()) { System.out.println(language.getTarget()); } //get the internal links of each section for (Section section : pp.getSections()){ System.out.println("Section: " + section.getTitle()); for (Link link : section.getLinks(Link.type.INTERNAL)) { System.out.println(" " + link.getTarget()); } } } }
if (ppage.getLanguagesElement()!=null) List<Link> languageLinks = ppage.getLanguages(); for(Link link : languageLinks) if(ppage.getFirstParagraph() == null) { continue; String text = ppage.getFirstParagraph().getText();
pp.setName(name); for( Template t: pp.getTemplates()){ nrOfTemplates++; String templateName = t.getName().toLowerCase(); templateNrOfOccurence.add(1); templateNames.add( templateName ); templateNameOfFirstOccurence.add( pp.getName() ); List<String> temp = new ArrayList<String>(); temp.add( pp.getName() ); if( pp.nrOfTables()!=0 ) nrOfTables++; boolean b = true; for( Table t: pp.getTables() ){ if( b )for( int i=0; i<t.nrOfTableElements(); i++ ){ TableElement te = t.getTableElement(i); if( te.nrOfSections() > 1 || te.getSection(0).getClass()==SectionContainer.class ){ pagesWithTableSections.add( pp.getName() ); b = false; break;
private void parseIlls(RawPage xml, ParsedPage pp) { if (pp.getLanguagesElement() != null){ for (Link ill : pp.getLanguages()){ try{ Matcher m = illPattern.matcher(ill.getTarget());
/** * * Returns the set of names of all templates that are contained in the given * article (without duplicates).<br> * * Note: The names are SQL escaped using {@link StringUtils#sqlEscape(String)}. * * @param pageText * the page to get the templates from * @return a set of template names (without duplicates) */ private Set<String> getTemplateNames(String pageText) { Set<String> names = new HashSet<String>(); if (!pageText.isEmpty()) { try { ParsedPage pp = parser.parse(pageText); List<Template> templates = pp.getTemplates(); for (Template t : templates) { names.add(StringUtils.sqlEscape(t.getName().toLowerCase())); } } catch (Exception e) { // Most likely parsing problems logger.error("Problems parsing page!", e); } } return names; }
/**
 * Returns the text of the first paragraph of the page that belongs to {@code entity}.
 *
 * @param entity the entity to look up
 * @return {@code null} if this resource does not contain the entity; {@code ""} if the
 *         page has no parsed representation or no first paragraph; otherwise the first
 *         paragraph's text
 * @throws LexicalSemanticResourceException declared by the original signature
 */
public String getGloss(Entity entity) throws LexicalSemanticResourceException {
    if (this.containsEntity(entity)) {
        final ParsedPage parsed =
                WikipediaArticleUtils.entityToPage(wiki, entity, isCaseSensitive).getParsedPage();
        final Paragraph first = (parsed == null) ? null : parsed.getFirstParagraph();
        return (first == null) ? "" : first.getText();
    }
    return null;
}
/**
 * Converts every category link of {@code page} into a {@code Link} of type
 * {@code Link.Type.CATEGORY} and stores the resulting list on {@code article}.
 *
 * <p>Each converted link carries the parser link's target, text and the start/end of its
 * position.
 *
 * @param article the article receiving the categories
 * @param page    the parsed page the category links are read from
 */
private void setCategories(Article article, ParsedPage page) {
    final ArrayList<Link> converted = new ArrayList<Link>(10);
    for (final de.tudarmstadt.ukp.wikipedia.parser.Link parsedCategory : page.getCategories()) {
        final Link category = new Link(
                parsedCategory.getTarget(),
                parsedCategory.getText(),
                parsedCategory.getPos().getStart(),
                parsedCategory.getPos().getEnd(),
                Link.Type.CATEGORY);
        converted.add(category);
    }
    article.setCategories(converted);
}
/**
 * Registers the links found in the nested lists of {@code page}.
 *
 * <p>Every link is handed to {@code addLink} with type {@code Link.Type.LIST}. When
 * {@code addLink} returns a non-null link, it is tagged with the zero-based index of its
 * list container and the zero-based index of the item within that container.
 *
 * @param links         list passed through to {@code addLink}
 * @param externalLinks list passed through to {@code addLink}
 * @param page          the parsed page whose nested lists are scanned
 */
private void setLinksInLists(final List<Link> links, final List<Link> externalLinks, ParsedPage page) {
    int containerIndex = 0;
    for (NestedListContainer container : page.getNestedLists()) {
        int itemIndex = 0;
        for (NestedList listItem : container.getNestedLists()) {
            for (de.tudarmstadt.ukp.wikipedia.parser.Link parsedLink : listItem.getLinks()) {
                final Link registered = addLink(links, externalLinks, parsedLink, Link.Type.LIST);
                if (registered == null) {
                    continue;
                }
                registered.setListId(containerIndex);
                registered.setListItem(itemIndex);
            }
            itemIndex++;
        }
        containerIndex++;
    }
}
/**
 * Parses the raw page text and collects the data of each of its sections.
 *
 * <p>Every section of the parsed page is passed to {@code appendSection} together with the
 * page title, a {@code null} third argument and the result list, which that helper is
 * expected to fill.
 *
 * @param text  raw page input handed to the markup parser
 * @param title page title; logged at trace level and forwarded to {@code appendSection}
 * @return the list populated by {@code appendSection}
 * @throws IOException declared by the original signature
 */
public ArrayList<String> sectionData(String text, String title) throws IOException {
    logger.trace("Processing page: " + title);

    final ArrayList<String> collected = new ArrayList<>();
    final ParsedPage parsed = WikiMarkupParser.getInstance().parsePage(text);
    for (Section current : parsed.getSections()) {
        appendSection(title, current, null, collected);
    }
    return collected;
}
/**
 * Parses the markup of {@code page} and returns the text of the whole parsed page.
 *
 * @param page the page whose text is parsed
 * @return the parsed page's text, or {@code ""} when the parser yields {@code null}
 */
@Override
protected String getPlainDocumentText(Page page) {
    final ParsedPage parsed = parser.parse(page.getText());
    if (parsed == null) {
        return "";
    }
    return parsed.getText();
}