/**
 * Appends the parts of a content element selected by {@code hp} to {@code sb}.
 *
 * <p>When the {@code CIT.TEXT} flag is set, the element text followed by a single
 * space is appended. Otherwise the bold and/or italic spans are passed to
 * {@code handleSpans}, depending on {@code CIT.BOLD} and {@code CIT.ITALIC}.
 * Independently of that, {@code CIT.LINK} triggers {@code handleLinks}; its boolean
 * argument is the negation of the {@code CIT.TEXT} flag.
 *
 * <p>A flag that has no mapping in {@code hp} (or is mapped to {@code null}) is
 * treated as {@code false}; previously such a map caused a
 * {@code NullPointerException} during auto-unboxing.
 *
 * @param c  the content element to read from
 * @param hp the selection flags; when {@code null} nothing is appended
 * @param sb the builder receiving the output
 */
private static void handleContent( Content c, EnumMap<CIT, Boolean> hp, StringBuilder sb ){
	if( hp == null ){
		return;
	}

	// EnumMap.get returns null for absent keys, so never unbox the result directly.
	final boolean wantText = Boolean.TRUE.equals( hp.get( CIT.TEXT ) );

	if( wantText ){
		sb.append( c.getText() ).append( " " );
	}
	else{
		if( Boolean.TRUE.equals( hp.get( CIT.BOLD ) ) ){
			handleSpans( c.getFormatSpans( FormatType.BOLD ), c.getText(), sb );
		}
		if( Boolean.TRUE.equals( hp.get( CIT.ITALIC ) ) ){
			handleSpans( c.getFormatSpans( FormatType.ITALIC ), c.getText(), sb );
		}
	}

	if( Boolean.TRUE.equals( hp.get( CIT.LINK ) ) ){
		handleLinks( c.getLinks(), !wantText, sb );
	}
}
/**
 * Gathers the templates of every element held in {@code ccl}.
 *
 * @return a new list containing each element's templates, in iteration order
 */
public List<Template> getTemplates(){
	final List<Template> collected = new ArrayList<Template>();
	for( Content element : ccl ){
		collected.addAll( element.getTemplates() );
	}
	return collected;
}
/**
 * Collects the spans of the given format type from every element in {@code ccl}.
 *
 * <p>Each span is cloned and adjusted by a running offset that grows by the
 * element's length plus one after every element, so the element's own spans
 * are left untouched.
 *
 * @param t the format type to look up
 * @return a new list with the adjusted copies
 */
public List<Span> getFormatSpans(FormatType t){
	final List<Span> shifted = new ArrayList<Span>();
	int base = 0;
	for( Content element : ccl ){
		for( Span local : element.getFormatSpans( t ) ){
			Span copy = local.clone();
			shifted.add( copy.adjust( base ) );
		}
		// one extra position per element in addition to its length
		base = base + element.length() + 1;
	}
	return shifted;
}
/**
 * Concatenates the text of all non-null elements in {@code ccl}, each followed
 * by one space, and then drops the last character of the result (if any).
 *
 * @return the joined text; an empty string when nothing was appended
 */
public String getText(){
	final StringBuilder joined = new StringBuilder();
	for( Content element : ccl ){
		if( element == null ){
			continue;
		}
		joined.append( element.getText() ).append( " " );
	}

	// remove the trailing separator written after the last element
	final int last = joined.length() - 1;
	if( last >= 0 ){
		joined.setLength( last );
	}
	return joined.toString();
}
/**
 * Collects the templates of those elements in {@code ccl} whose position
 * window is hit by {@code s}.
 *
 * <p>Each element gets a window starting one past the end of the previous
 * window and extending by the element's length. For a matching element the
 * query is cloned and adjusted by the negated window start before it is
 * passed on.
 *
 * @param s the span to query
 * @return a new list with the templates reported by the matching elements
 */
public List<Template> getTemplates(Span s){
	final List<Template> found = new ArrayList<Template>();
	Span window = new Span( -1, -1 );
	for( Content element : ccl ){
		final int start = window.getEnd() + 1;
		window = new Span( start, start + element.length() );
		if( !window.hits( s ) ){
			continue;
		}
		Span local = s.clone().adjust( -start );
		found.addAll( element.getTemplates( local ) );
	}
	return found;
}
} // closes the enclosing scope opened outside this view
/**
 * Collects the links of the given type from those elements in {@code ccl}
 * whose position window is hit by {@code s}.
 *
 * <p>Each element gets a window starting one past the end of the previous
 * window and extending by the element's length. For a matching element the
 * query is cloned and adjusted by the negated window start before it is
 * passed on.
 *
 * @param linkType the link type to look up
 * @param s        the span to query
 * @return a new list with the links reported by the matching elements
 */
public List<Link> getLinks( Link.type linkType, Span s){
	final List<Link> found = new ArrayList<Link>();
	Span window = new Span( -1, -1 );
	for( Content element : ccl ){
		final int start = window.getEnd() + 1;
		window = new Span( start, start + element.length() );
		if( !window.hits( s ) ){
			continue;
		}
		Span local = s.clone().adjust( -start );
		found.addAll( element.getLinks( linkType, local ) );
	}
	return found;
}
for (Content curContent : curSection.getContentList()){ for (Link curLink : curContent.getLinks()){ if (curLink.getTarget().isEmpty()){ LOG.debug("Found link with empty target: \t" + xml + "\t text=" + curLink.getText()); for (Template t : curContent.getTemplates()){ boolean errorWithSrcLocation = t.getSrcSpan().getEnd() < 0; // this checks for what seems to be when parsing fails in JWPL String templateTextOrig;
a = new Span( offset, offset+ c.length() ); for( FormatType t: c.getFormats( s.clone().adjust( -offset ) ) ) switch(t){ case BOLD:
/**
 * Gathers the links of the given type from every element held in {@code ccl}.
 *
 * @param linkType the link type to look up
 * @return a new list containing each element's matching links, in iteration order
 */
public List<Link> getLinks( Link.type linkType ){
	final List<Link> collected = new ArrayList<Link>();
	for( Content element : ccl ){
		collected.addAll( element.getLinks( linkType ) );
	}
	return collected;
}
/**
 * Computes the combined length of all non-null elements in {@code ccl}.
 *
 * <p>Every non-null element contributes its own length plus one; if the
 * resulting sum is positive it is reduced by one before being returned.
 *
 * @return the combined length, or the unreduced sum when it is not positive
 */
public int length(){
	int total = 0;
	for( Content element : ccl ){
		if( element == null ){
			continue;
		}
		total += 1 + element.length();
	}
	return ( total > 0 ) ? total - 1 : total;
}
for( FormatType t: c.getFormats()) switch(t){ case BOLD:
/**
 * Returns the link text or link caption.
 *
 * @return the result of {@code pos.getText(...)} applied to the home element's
 *         text, or {@code null} when {@code home_cc} is {@code null}
 */
public String getText(){
	return ( home_cc == null ) ? null : pos.getText( home_cc.getText() );
}
/**
 * Gathers the links of every element held in {@code ccl}.
 *
 * @return a new list containing each element's links, in iteration order
 */
public List<Link> getLinks(){
	final List<Link> collected = new ArrayList<Link>();
	for( Content element : ccl ){
		collected.addAll( element.getLinks() );
	}
	return collected;
}
/**
 * Builds a single-line, tokenized rendering of a parsed page.
 *
 * <p>The line starts with {@code title} as given, followed by a space and the
 * tokenized form of the title in which {@code CharacterTable.LOW_LINE} has been
 * replaced by {@code CharacterTable.SPACE}. After that, the text of every
 * content element of every section is tokenized and appended, each preceded by
 * a space; elements whose raw or tokenized text is empty are skipped.
 *
 * @param parsedPage the page whose sections are read
 * @param title      the page title, placed first in the output
 * @return the assembled line
 * @throws IOException declared by this method's signature
 */
private String tokenizedText(ParsedPage parsedPage, String title) throws IOException {
	final StringBuilder line = new StringBuilder();
	line.append(title).append(CharacterTable.SPACE);

	final Tokenizer tokenizer = HardTokenizer.getInstance();
	final String spacedTitle = title.replace(CharacterTable.LOW_LINE, CharacterTable.SPACE);
	line.append(tokenizer.tokenizedString(spacedTitle));

	for (Section section : parsedPage.getSections()) {
		final List<Content> contents = section.getContentList();
		for (Content content : contents) {
			final String raw = content.getText();
			if (raw.isEmpty()) {
				continue;
			}
			final String tokens = tokenizer.tokenizedString(raw);
			if (tokens.isEmpty()) {
				continue;
			}
			line.append(CharacterTable.SPACE).append(tokens);
		}
	}
	return line.toString();
}
/**
 * Collects the spans of the given format type from those elements in
 * {@code ccl} whose position window is hit by {@code s}.
 *
 * <p>Each element gets a window starting one past the end of the previous
 * window and extending by the element's length. For a matching element the
 * query is cloned and adjusted by the negated window start; every span it
 * reports is cloned and adjusted back by the window start before being added.
 *
 * @param t the format type to look up
 * @param s the span to query
 * @return a new list with the adjusted copies
 */
public List<Span> getFormatSpans(FormatType t, Span s){
	final List<Span> found = new ArrayList<Span>();
	Span window = new Span( -1, -1 );
	for( Content element : ccl ){
		final int start = window.getEnd() + 1;
		window = new Span( start, start + element.length() );
		if( !window.hits( s ) ){
			continue;
		}
		Span local = s.clone().adjust( -start );
		for( Span hit : element.getFormatSpans( t, local ) ){
			found.add( hit.clone().adjust( start ) );
		}
	}
	return found;
}
/**
 * Builds a single-line, tokenized rendering of a parsed page.
 *
 * <p>The line starts with {@code title} as given, followed by a space and the
 * tokenized form of the title in which {@code CharacterTable.LOW_LINE} has been
 * replaced by {@code CharacterTable.SPACE}. After that, the text of every
 * content element of every section is tokenized and appended, each preceded by
 * a space; elements whose raw or tokenized text is empty are skipped.
 *
 * @param parsedPage the page whose sections are read
 * @param title      the page title, placed first in the output
 * @return the assembled line
 * @throws IOException declared by this method's signature
 */
private String tokenizedText(ParsedPage parsedPage, String title) throws IOException {
	final StringBuilder line = new StringBuilder();
	line.append(title).append(CharacterTable.SPACE);

	final Tokenizer tokenizer = HardTokenizer.getInstance();
	final String spacedTitle = title.replace(CharacterTable.LOW_LINE, CharacterTable.SPACE);
	line.append(tokenizer.tokenizedString(spacedTitle));

	for (Section section : parsedPage.getSections()) {
		final List<Content> contents = section.getContentList();
		for (Content content : contents) {
			final String raw = content.getText();
			if (raw.isEmpty()) {
				continue;
			}
			final String tokens = tokenizer.tokenizedString(raw);
			if (tokens.isEmpty()) {
				continue;
			}
			line.append(CharacterTable.SPACE).append(tokens);
		}
	}
	return line.toString();
}
list = section.getContentList(); for (int i = 0; i < list.size(); i++) { rawContent = list.get(i).getText();
final String text = home_cc.getText(); int temp;
public ParsedPageLink(Link link) { if (link.getTarget() != null) { page = normalizePageName(link.getTarget().trim()); } else { page = StringTable.EMPTY_STRING; } if (link.getText() != null) { form = removeSuffix(removeQuotes(link.getText().trim())); } String context = link.getHomeElement().getText(); if (context != null) { Span span = link.getPos(); leftContext = context.substring(0, span.getStart()); rightContext = context.substring(span.getEnd(), context.length()); } }