de.tudarmstadt.ukp.wikipedia.parser.Content java code examples

/**
 * Appends the parts of a Content element selected by {@code hp} to {@code sb}.
 *
 * <p>When {@code CIT.TEXT} is selected the element's text followed by a single
 * space is appended; otherwise the BOLD and/or ITALIC format spans are handed
 * to {@code handleSpans}. Links are handed to {@code handleLinks} when
 * {@code CIT.LINK} is selected, together with a flag that is true when TEXT
 * was not selected.
 *
 * @param c  the content element to read from
 * @param hp the selection flags; when {@code null} nothing is appended. A key
 *           that is absent or mapped to {@code null} is treated as not selected.
 * @param sb the builder that receives the output
 */
private static void handleContent( Content c, EnumMap<CIT, Boolean> hp, StringBuilder sb ){
  if( hp == null ){
    return;
  }

  // EnumMap.get returns null for an absent key; Boolean.TRUE.equals avoids the
  // NullPointerException that auto-unboxing such a null would throw.
  final boolean wantText = Boolean.TRUE.equals( hp.get( CIT.TEXT ) );

  if( wantText ){
    sb.append( c.getText() ).append( ' ' );
  }
  else{
    if( Boolean.TRUE.equals( hp.get( CIT.BOLD ) ) ){
      handleSpans( c.getFormatSpans( FormatType.BOLD), c.getText(), sb );
    }
    if( Boolean.TRUE.equals( hp.get( CIT.ITALIC ) ) ){
      handleSpans( c.getFormatSpans( FormatType.ITALIC), c.getText(), sb );
    }
  }

  if( Boolean.TRUE.equals( hp.get( CIT.LINK ) ) ){
    handleLinks( c.getLinks(), !wantText, sb );
  }
}

/**
 * Collects the templates of every element in {@code ccl}, in iteration order.
 *
 * @return a new list holding the templates reported by each element
 */
public List<Template> getTemplates(){
  final List<Template> collected = new ArrayList<Template>();
  for( final Content child : ccl ){
    collected.addAll( child.getTemplates() );
  }
  return collected;
}

/**
 * Gathers the format spans of type {@code t} from every element in {@code ccl}.
 *
 * <p>Each span reported by an element is cloned and adjusted by that element's
 * running offset. The offset starts at 0 and grows by the element's length
 * plus one after each element.
 *
 * @param t the format type to look for
 * @return a new list with the adjusted span copies
 */
public List<Span> getFormatSpans(FormatType t){
  final List<Span> shifted = new ArrayList<Span>();
  int base = 0;
  for( final Content child : ccl ){
    for( final Span childSpan : child.getFormatSpans( t ) ){
      shifted.add( childSpan.clone().adjust( base ) );
    }
    // Advance past this element plus one extra position.
    base = base + child.length() + 1;
  }
  return shifted;
}

 // Fetch the document's content and extract a string from position 0 onwards.
 // NOTE(review): the end argument is length() - 1; whether getString treats it
 // as inclusive or exclusive is not visible here — confirm the last character
 // is not dropped unintentionally.
 Content content = document.getContent();
String str = content.getString(0, content.length() - 1);

/**
 * Concatenates the text of every non-null element in {@code ccl}, separated
 * by single spaces.
 *
 * @return the joined text; an empty string when nothing was appended
 */
public String getText(){
  final StringBuilder joined = new StringBuilder();
  for( final Content child : ccl ){
    if( child == null ){
      continue;
    }
    joined.append( child.getText() ).append( ' ' );
  }

  // Every appended element left one trailing separator; drop the last one.
  if( joined.length() > 0 ){
    joined.setLength( joined.length() - 1 );
  }
  return joined.toString();
}

  /**
   * Collects the templates of those elements in {@code ccl} whose range hits
   * {@code s}.
   *
   * <p>Each element's range starts one position after the end of the previous
   * element's range (the first starts at 0) and extends by the element's
   * length. A matching element is queried with a copy of {@code s} adjusted
   * by the negated start of its range.
   *
   * @param s the span to restrict the search to
   * @return a new list holding the templates found
   */
  public List<Template> getTemplates(Span s){
    final List<Template> matches = new ArrayList<Template>();

    // Seeded with (-1, -1) so the first element's range starts at 0.
    Span window = new Span( -1, -1 );
    for( final Content child : ccl ){
      final int start = window.getEnd() + 1;
      window = new Span( start, start + child.length() );
      if( !window.hits( s ) ){
        continue;
      }
      final Span relative = s.clone().adjust( -start );
      matches.addAll( child.getTemplates( relative ) );
    }
    return matches;
  }
}

/**
 * Collects the links of type {@code linkType} from those elements in
 * {@code ccl} whose range hits {@code s}.
 *
 * <p>Each element's range starts one position after the end of the previous
 * element's range (the first starts at 0) and extends by the element's length.
 * A matching element is queried with a copy of {@code s} adjusted by the
 * negated start of its range.
 *
 * @param linkType the kind of link to return
 * @param s        the span to restrict the search to
 * @return a new list holding the links found
 */
public List<Link> getLinks( Link.type linkType, Span s){
  final List<Link> matches = new ArrayList<Link>();

  // Seeded with (-1, -1) so the first element's range starts at 0.
  Span window = new Span( -1, -1 );
  for( final Content child : ccl ){
    final int start = window.getEnd() + 1;
    window = new Span( start, start + child.length() );
    if( !window.hits( s ) ){
      continue;
    }
    final Span relative = s.clone().adjust( -start );
    matches.addAll( child.getLinks( linkType, relative ) );
  }
  return matches;
}

// Walk every Content element of the current section.
for (Content curContent : curSection.getContentList()){
  // Report links whose target string is empty.
  for (Link curLink : curContent.getLinks()){
    if (curLink.getTarget().isEmpty()){
      LOG.debug("Found link with empty target: \t" + xml + "\t text=" + curLink.getText());
  // NOTE(review): the excerpt is truncated — the closing braces of the link
  // loop above are not visible here.
  for (Template t : curContent.getTemplates()){
    boolean errorWithSrcLocation = t.getSrcSpan().getEnd() < 0; // this checks for what seems to be when parsing fails in JWPL
    String templateTextOrig;

// Range of the current element: starts at offset and extends by its length.
a = new Span( offset, offset+ c.length() );
  // Query the element's formats with a copy of s adjusted by -offset and
  // dispatch on each format type (excerpt is truncated after the first case).
  for( FormatType t: c.getFormats( s.clone().adjust( -offset ) ) )
    switch(t){
      case BOLD:

/**
 * Collects the links of type {@code linkType} from every element in
 * {@code ccl}, in iteration order.
 *
 * @param linkType the kind of link to return
 * @return a new list holding the links found
 */
public List<Link> getLinks( Link.type linkType ){
  final List<Link> collected = new ArrayList<Link>();
  for( final Content child : ccl ){
    collected.addAll( child.getLinks( linkType ) );
  }
  return collected;
}

/**
 * Computes the combined length of the non-null elements in {@code ccl},
 * counting one extra position between consecutive elements.
 *
 * @return the summed element lengths plus one per element, minus one when
 *         that sum is positive
 */
public int length(){
  int total = 0;
  for( final Content child : ccl ){
    if( child != null ){
      // One extra position is counted after every element.
      total += child.length() + 1;
    }
  }
  // Remove the extra position counted after the last element.
  return ( total > 0 ) ? total - 1 : total;
}

// Dispatch on each format type reported by c (excerpt is truncated after the
// first case label).
for( FormatType t: c.getFormats())
  switch(t){
  case BOLD:

/**
 * Returns the link text or link caption.
 *
 * @return {@code null} when {@code home_cc} is {@code null}; otherwise the
 *         result of {@code pos.getText} applied to the text of {@code home_cc}
 */
public String getText(){
  return ( home_cc == null ) ? null : pos.getText( home_cc.getText() );
}

/**
 * Collects the links of every element in {@code ccl}, in iteration order.
 *
 * @return a new list holding the links reported by each element
 */
public List<Link> getLinks(){
  final List<Link> collected = new ArrayList<Link>();
  for( final Content child : ccl ){
    collected.addAll( child.getLinks() );
  }
  return collected;
}

/**
 * Builds a single-line, tokenized rendering of a parsed page.
 *
 * <p>The line starts with {@code title} exactly as given, followed by the
 * tokenized title (with {@code CharacterTable.LOW_LINE} replaced by
 * {@code CharacterTable.SPACE} before tokenizing), followed by the tokenized
 * text of every content element of every section. Content whose raw or
 * tokenized text is empty is skipped.
 *
 * @param parsedPage the page whose sections are rendered
 * @param title      the page title, emitted verbatim as the first token
 * @return the assembled line
 * @throws IOException declared by this method; which call raises it is not
 *                     visible in this block
 */
private String tokenizedText(ParsedPage parsedPage, String title) throws IOException {
  StringBuilder line = new StringBuilder();
  line.append(title).append(CharacterTable.SPACE);

  Tokenizer tokenizer = HardTokenizer.getInstance();
  line.append(tokenizer.tokenizedString(title.replace(CharacterTable.LOW_LINE, CharacterTable.SPACE)));

  for (Section section : parsedPage.getSections()) {
    for (Content content : section.getContentList()) {
      String raw = content.getText();
      if (raw.length() == 0) {
        continue;
      }
      String tokens = tokenizer.tokenizedString(raw);
      if (tokens.length() == 0) {
        continue;
      }
      line.append(CharacterTable.SPACE).append(tokens);
    }
  }
  return line.toString();
}

/**
 * Gathers the format spans of type {@code t} from those elements in
 * {@code ccl} whose range hits {@code s}.
 *
 * <p>Each element's range starts one position after the end of the previous
 * element's range (the first starts at 0) and extends by the element's length.
 * A matching element is queried with a copy of {@code s} adjusted by the
 * negated start of its range, and every span it reports is cloned and
 * adjusted back by that start.
 *
 * @param t the format type to look for
 * @param s the span to restrict the search to
 * @return a new list with the adjusted span copies
 */
public List<Span> getFormatSpans(FormatType t, Span s){
  final List<Span> collected = new ArrayList<Span>();

  // Seeded with (-1, -1) so the first element's range starts at 0.
  Span window = new Span( -1, -1 );
  for( final Content child : ccl ){
    final int start = window.getEnd() + 1;
    window = new Span( start, start + child.length() );
    if( !window.hits( s ) ){
      continue;
    }
    final Span relative = s.clone().adjust( -start );
    for( final Span found : child.getFormatSpans( t, relative ) ){
      collected.add( found.clone().adjust( start ) );
    }
  }
  return collected;
}

/**
 * Renders the whole page as one tokenized line.
 *
 * <p>Output order: {@code title} as passed in, {@code CharacterTable.SPACE},
 * the tokenized title (after replacing {@code CharacterTable.LOW_LINE} with
 * {@code CharacterTable.SPACE}), then, for each content element of each
 * section, a separator and its tokenized text. Elements whose raw text or
 * tokenized text is empty contribute nothing.
 *
 * @param parsedPage the page whose sections are rendered
 * @param title      the page title, emitted verbatim as the first token
 * @return the assembled line
 * @throws IOException declared by this method; which call raises it is not
 *                     visible in this block
 */
private String tokenizedText(ParsedPage parsedPage, String title) throws IOException {
  StringBuilder out = new StringBuilder();
  out.append(title).append(CharacterTable.SPACE);

  Tokenizer tokenizer = HardTokenizer.getInstance();
  String spacedTitle = title.replace(CharacterTable.LOW_LINE, CharacterTable.SPACE);
  out.append(tokenizer.tokenizedString(spacedTitle));

  for (Section section : parsedPage.getSections()) {
    for (Content element : section.getContentList()) {
      String rawText = element.getText();
      if (rawText.length() > 0) {
        String tokenizedElement = tokenizer.tokenizedString(rawText);
        if (tokenizedElement.length() > 0) {
          out.append(CharacterTable.SPACE).append(tokenizedElement);
        }
      }
    }
  }
  return out.toString();
}

// Read the raw text of each content element of the section by index
// (excerpt is truncated inside the loop body).
list = section.getContentList();
for (int i = 0; i < list.size(); i++) {
  rawContent = list.get(i).getText();

// Text of home_cc, captured once for the code that follows (not visible here).
final String text = home_cc.getText();
// Scratch variable; its use lies outside this excerpt.
int temp;

public ParsedPageLink(Link link) {
  if (link.getTarget() != null) {
    page = normalizePageName(link.getTarget().trim());
  }
  else {
    page = StringTable.EMPTY_STRING;
  }
  if (link.getText() != null) {
    form = removeSuffix(removeQuotes(link.getText().trim()));
  }
  String context = link.getHomeElement().getText();
  if (context != null) {
    Span span = link.getPos();
    leftContext = context.substring(0, span.getStart());
    rightContext = context.substring(span.getEnd(), context.length());
  }
}

Javadoc

This is a main Interface used by nearly all classes of this package.

Be aware that all returned Spans refer to the String returned by getText();
this is true for any implementing class!

Most used methods

getLinks
returns all Links of this element of the specified type, in the Range of s
getTemplates
returns all Templates, in the Range of s.
getText
length
Content.getText().length() == Content.length()
getFormatSpans
returns the Format Spans of the Specified Type, in the Range of s.
getFormats
returns the Formats used in this element, in the Range of s.
getString

Popular in Java

Reading from database using SQL prepared statement
requestLocationUpdates (LocationManager)
addToBackStack (FragmentTransaction)
startActivity (Activity)
Pointer (com.sun.jna)
An abstraction for a native pointer data type. A Pointer instance represents, on the Java side, a na
Connection (java.sql)
A connection represents a link from a Java application to a database. All SQL statements and results
DecimalFormat (java.text)
A concrete subclass of NumberFormat that formats decimal numbers. It has a variety of features desig
Comparator (java.util)
A Comparator is used to compare two objects to determine their ordering with respect to each other.
ReentrantLock (java.util.concurrent.locks)
A reentrant mutual exclusion Lock with the same basic behavior and semantics as the implicit monitor
Loader (org.hibernate.loader)
Abstract superclass of object loading (and querying) strategies. This class implements useful common
Top Vim plugins

How to use Content in de.tudarmstadt.ukp.wikipedia.parser

Best Java code snippets using de.tudarmstadt.ukp.wikipedia.parser.Content (Showing top 20 results out of 315)

How to use
Content
in
de.tudarmstadt.ukp.wikipedia.parser