Paragraph firstPar = new Paragraph(); if (p.getText().trim().length() == 0) { continue; List<Span> spans = p.getFormatSpans(Content.FormatType.ITALIC); for (Span s : spans) { if (s.getStart() == 0) { if (p.getText().trim().startsWith(":")) { continue; List<Link> links = firstPar.getLinks(); for (Link l : links) { String target = l.getTarget();
/**
 * Gathers the text of every bold and italic format span found in the
 * paragraphs of {@code page} and stores the resulting list on {@code article}.
 * <p>
 * Within each paragraph all bold spans are added before the italic ones, and
 * paragraphs are visited in the order returned by {@code page.getParagraphs()}.
 *
 * @param article receives the collected highlight strings via {@code setHighlights}
 * @param page    parsed page whose paragraphs are scanned
 */
private void setHighlights(Article article, ParsedPage page) {
    // Format types that count as a "highlight", in the order they are emitted.
    final Content.FormatType[] emphasisTypes = {
        Content.FormatType.BOLD,
        Content.FormatType.ITALIC
    };

    final List<String> collected = new ArrayList<String>(20);
    for (final Paragraph paragraph : page.getParagraphs()) {
        for (final Content.FormatType emphasis : emphasisTypes) {
            for (final Span span : paragraph.getFormatSpans(emphasis)) {
                // A span only carries offsets; resolve it against the paragraph text.
                collected.add(span.getText(paragraph.getText()));
            }
        }
    }
    article.setHighlights(collected);
}
Paragraph result = new Paragraph(); Span s = lineSpans.removeFirst(); paragraphSpans.add(s); result.setType(Paragraph.type.NORMAL); while (!lineSpans.isEmpty()) result.setType(Paragraph.type.BOXED); while (!lineSpans.isEmpty()) result.setType(Paragraph.type.INDENTED); s.trim(sm.setCharAt(s.getStart(), ' ')); break;
SpanManager ptext = new SpanManager(p.getText()); List<Span> delete = new ArrayList<Span>(); ptext.manageList(delete); List<Template> tl = p.getTemplates(); for (int j = tl.size() - 1; j >= 0; j--) List<Span> sl = p.getFormatSpans(FormatType.TAG); for (int j = sl.size() - 1; j >= 0; j--) List<Link> ll = p.getLinks(Link.type.IMAGE); for (int j = ll.size() - 1; j >= 0; j--)
private void setParagraphs(Article article, ParsedPage page) { final List<String> paragraphs = new ArrayList<String>(page.nrOfParagraphs()); int paragraphId = 0; for (final Paragraph p : page.getParagraphs()) { String text = p.getText(); // text = removeTemplates(text); text = text.replace("\n", " ").trim(); if (!text.isEmpty()){ paragraphs.add(text); } paragraphId++; } article.setParagraphs(paragraphs); }
/**
 * Passes every link of every paragraph of {@code page} to {@code addLink}
 * with type {@code Link.Type.BODY} and, when a link object comes back, tags
 * it with the zero-based position of the paragraph it was found in.
 *
 * @param links         list handed through to {@code addLink}
 * @param externalLinks list handed through to {@code addLink}
 * @param page          parsed page whose paragraphs are scanned
 */
private void setLinksInParagraphs(final List<Link> links, final List<Link> externalLinks, ParsedPage page) {
    int paragraphIndex = 0;
    for (Paragraph paragraph : page.getParagraphs()) {
        for (de.tudarmstadt.ukp.wikipedia.parser.Link parsedLink : paragraph.getLinks()) {
            Link registered = addLink(links, externalLinks, parsedLink, Link.Type.BODY);
            if (registered == null) {
                // addLink returned nothing for this link, so there is nothing to tag.
                continue;
            }
            registered.setParagraphId(paragraphIndex);
        }
        // The index advances for every paragraph, including those without links.
        paragraphIndex++;
    }
}
/**
 * Runs {@code checkRange} on every child of the given section content
 * (title element if present, then paragraphs, definition lists, nested lists
 * and tables, in that order), collects the children's source spans, and
 * finally replaces the section's own source span with the result of
 * {@code getEvalInfo(ownSpan, childSpans)}.
 *
 * @param s section content whose children are checked and whose span is updated
 */
private static void checkRange( SectionContent s ){
	List<SrcSpan> childSpans = new ArrayList<SrcSpan>();

	// The title is optional; when present it is handled before everything else.
	if( s.getTitleElement() != null ){
		checkRange( s.getTitleElement() );
		childSpans.add( s.getTitleElement().getSrcSpan() );
	}

	for( Paragraph paragraph : s.getParagraphs() ){
		checkRange( paragraph );
		childSpans.add( paragraph.getSrcSpan() );
	}

	for( DefinitionList definitionList : s.getDefinitionLists() ){
		checkRange( definitionList );
		childSpans.add( definitionList.getSrcSpan() );
	}

	for( NestedListContainer nestedList : s.getNestedLists() ){
		checkRange( nestedList );
		childSpans.add( nestedList.getSrcSpan() );
	}

	for( Table table : s.getTables() ){
		checkRange( table );
		childSpans.add( table.getSrcSpan() );
	}

	// Each child is checked before its span is read, so the spans collected
	// here are the ones left behind by the child-level checkRange calls.
	s.setSrcSpan( getEvalInfo( s.getSrcSpan(), childSpans ) );
}
/**
 * Removes all empty structures from a SectionContent and from its
 * substructures.
 * <p>
 * Paragraphs are removed when {@code empty()} reports true. Definition lists,
 * nested lists and tables are first cleaned by their own
 * {@code eliminateEmptyStructures} overload and removed afterwards if they
 * are empty. Every list is walked from its last index down to zero while
 * entries are being removed.
 *
 * @param sc the section content to clean in place
 * @return the same {@code sc} instance that was passed in
 */
public static SectionContent eliminateEmptyStructures( SectionContent sc ){
	int index = sc.nrOfParagraphs();
	while( --index >= 0 ){
		Paragraph paragraph = sc.getParagraph( index );
		if( paragraph.empty() ){
			sc.removeParagraph( paragraph );
		}
	}

	index = sc.nrOfDefinitionLists();
	while( --index >= 0 ){
		DefinitionList definitionList = sc.getDefinitionList( index );
		eliminateEmptyStructures( definitionList );
		if( definitionList.empty() ){
			sc.removeDefinitionList( definitionList );
		}
	}

	index = sc.nrOfNestedLists();
	while( --index >= 0 ){
		NestedListContainer nestedList = sc.getNestedList( index );
		eliminateEmptyStructures( nestedList );
		if( nestedList.empty() ){
			sc.removeNestedList( nestedList );
		}
	}

	index = sc.nrOfTables();
	while( --index >= 0 ){
		Table table = sc.getTable( index );
		eliminateEmptyStructures( table );
		if( table.empty() ){
			sc.removeTable( table );
		}
	}

	return sc;
}
/**
 * Returns the text of the first paragraph of the page that corresponds to
 * the given entity.
 *
 * @param entity the entity to look up
 * @return {@code null} if {@code containsEntity(entity)} is false; the empty
 *         string if the page has no parsed form or no first paragraph;
 *         otherwise the first paragraph's text
 * @throws LexicalSemanticResourceException declared by this method; which
 *         of the called operations raises it is not visible in this block
 */
public String getGloss(Entity entity) throws LexicalSemanticResourceException {
    if (!this.containsEntity(entity)) {
        return null;
    }

    ParsedPage parsed = WikipediaArticleUtils
            .entityToPage(wiki, entity, isCaseSensitive)
            .getParsedPage();

    // A missing parse and a missing first paragraph both give an empty gloss.
    Paragraph opening = (parsed == null) ? null : parsed.getFirstParagraph();
    return (opening == null) ? "" : opening.getText();
}
/**
 * Parses the raw text of {@code page} and returns its plain text.
 * <p>
 * When {@code onlyFirstParagraph} is set, only the text of paragraph 0 is
 * returned; otherwise the text of the whole parsed page is returned.
 *
 * @param page page whose text is parsed
 * @return the extracted text, or the empty string if the parser yields
 *         {@code null} or (in first-paragraph mode) there is no paragraph 0
 */
private String getPlainDocumentText(Page page) {
    ParsedPage parsed = parser.parse(page.getText());
    if (parsed == null) {
        return "";
    }
    if (!onlyFirstParagraph) {
        return parsed.getText();
    }
    if (parsed.getParagraph(0) == null) {
        return "";
    }
    return parsed.getParagraph(0).getText();
}
/**
 * Parses the raw text of {@code page} and returns its plain text.
 * <p>
 * With {@code onlyFirstParagraph} enabled the result is the text of
 * paragraph 0; otherwise it is the text of the entire parsed page.
 *
 * @param page page whose text is parsed
 * @return the extracted text, or the empty string when parsing yields
 *         {@code null} or (in first-paragraph mode) paragraph 0 is absent
 */
@Override
protected String getPlainDocumentText(Page page) {
    ParsedPage parsed = parser.parse(page.getText());
    if (parsed == null) {
        return "";
    }
    if (onlyFirstParagraph) {
        return (parsed.getParagraph(0) != null)
                ? parsed.getParagraph(0).getText()
                : "";
    }
    return parsed.getText();
}
// Reads the text of the parsed page's first paragraph.
// NOTE(review): the result of getFirstParagraph() is dereferenced without a
// null check, whereas getGloss in this listing guards the same call against
// null - confirm whether a guard is needed here too.
String text = ppage.getFirstParagraph().getText();