/**
 * The default way to create a {@link TextAnnotation} from pre-tokenized text.
 *
 * @param corpusId identifier of the corpus this text belongs to
 * @param textId identifier of this text within the corpus
 * @param tokenizedSentences A list of sentences, each one being a list of tokens
 * @return A {@link TextAnnotation} containing the SENTENCE and TOKENS views.
 */
public static TextAnnotation createTextAnnotationFromTokens(String corpusId, String textId,
        List<String[]> tokenizedSentences) {
    Tokenization tokenization = tokenizeTextSpan(tokenizedSentences);

    // Reconstruct the raw text: tokens joined by single spaces, one sentence
    // per line (each sentence terminated by a line separator, including the last).
    // NOTE(review): System.lineSeparator() is platform-dependent ("\r\n" on
    // Windows) — confirm tokenizeTextSpan computes character offsets with the
    // same separator length.
    StringBuilder rawText = new StringBuilder();
    for (int i = 0; i < tokenizedSentences.size(); i++) {
        rawText.append(StringUtils.join(tokenizedSentences.get(i), ' '));
        rawText.append(System.lineSeparator());
    }

    return new TextAnnotation(corpusId, textId, rawText.toString(),
            tokenization.getCharacterOffsets(), tokenization.getTokens(),
            tokenization.getSentenceEndTokenIndexes());
}
/**
 * The default way to create a {@link TextAnnotation} from pre-tokenized text.
 *
 * @param corpusId identifier of the corpus this text belongs to
 * @param textId identifier of this text within the corpus
 * @param tokenizedSentences A list of sentences, each one being a list of tokens
 * @return A {@link TextAnnotation} containing the SENTENCE and TOKENS views.
 */
public static TextAnnotation createTextAnnotationFromTokens(String corpusId, String textId,
        List<String[]> tokenizedSentences) {
    Tokenization tokenization = tokenizeTextSpan(tokenizedSentences);

    // Reconstruct the raw text: tokens joined by single spaces, one sentence
    // per line (each sentence terminated by a line separator, including the last).
    // NOTE(review): System.lineSeparator() is platform-dependent ("\r\n" on
    // Windows) — confirm tokenizeTextSpan computes character offsets with the
    // same separator length.
    StringBuilder rawText = new StringBuilder();
    for (int i = 0; i < tokenizedSentences.size(); i++) {
        rawText.append(StringUtils.join(tokenizedSentences.get(i), ' '));
        rawText.append(System.lineSeparator());
    }

    return new TextAnnotation(corpusId, textId, rawText.toString(),
            tokenization.getCharacterOffsets(), tokenization.getTokens(),
            tokenization.getSentenceEndTokenIndexes());
}
// Tokenize the raw text using the caller-supplied token and sentence arrays,
// then build a TextAnnotation from the resulting character offsets, token
// strings, and sentence-end token indexes.
// NOTE(review): fragment — enclosing method signature not visible here.
Tokenizer.Tokenization tokenization = tokenizer.tokenize(rawText, tokens, sentences); TextAnnotation ta = new TextAnnotation(corpusName, textId, rawText, tokenization.getCharacterOffsets(), tokenization.getTokens(), tokenization.getSentenceEndTokenIndexes());
// Tokenize the raw text using the caller-supplied token and sentence arrays,
// then build a TextAnnotation from the resulting character offsets, token
// strings, and sentence-end token indexes.
// NOTE(review): fragment — enclosing method signature not visible here.
Tokenizer.Tokenization tokenization = tokenizer.tokenize(rawText, tokens, sentences); TextAnnotation ta = new TextAnnotation(corpusName, textId, rawText, tokenization.getCharacterOffsets(), tokenization.getTokens(), tokenization.getSentenceEndTokenIndexes());
// NOTE(review): fragment — the method header precedes this view and the
// SpanLabelView assignment is cut off at the end. The visible body tokenizes
// the whole text span and builds a TextAnnotation from the tokenizer output.
throws IllegalArgumentException { Tokenizer.Tokenization tokenization = tokenizer.tokenizeTextSpan(text); TextAnnotation ta = new TextAnnotation(corpusId, textId, text, tokenization.getCharacterOffsets(), tokenization.getTokens(), tokenization.getSentenceEndTokenIndexes()); SpanLabelView view =
// NOTE(review): fragment — the method header precedes this view and the
// SpanLabelView assignment is cut off at the end. The visible body tokenizes
// the whole text span and builds a TextAnnotation from the tokenizer output.
throws IllegalArgumentException { Tokenizer.Tokenization tokenization = tokenizer.tokenizeTextSpan(text); TextAnnotation ta = new TextAnnotation(corpusId, textId, text, tokenization.getCharacterOffsets(), tokenization.getTokens(), tokenization.getSentenceEndTokenIndexes()); SpanLabelView view =
/** * @param args */ public static void main(String[] args) { JapaneseTokenizer jt = new JapaneseTokenizer(); String text = "\"ペンシルベニアドイツ語\",\"text\":\"ペンシルベニアドイツ語(標準ドイ" + "ツ語:Pennsylvania-Dutch, Pennsilfaani-Deitsch、アレマン語:Pennsylvania-Ditsch、英語:Pennsylvania-German)" + "は、北アメリカのカナダおよびアメリカ中西部でおよそ15万から25万人の人びとに話されているドイツ語の系統である。高地ドイツ語の" + "うち上部ドイツ語の一派アレマン語の一方言である。ペンシルベニアアレマン語(Pennsilfaani-Alemanisch, Pennsylvania-Alemannic)" + "とも呼ばれる。"; Tokenization tokenized = jt.tokenizeTextSpan(text); // DEBUG print the results int tokIdx = 0; int[] sentenceEnds = tokenized.getSentenceEndTokenIndexes(); String[] toks = tokenized.getTokens(); IntPair [] charOffsetArray = tokenized.getCharacterOffsets(); System.out.println("HOLY CRAP"); for (int sentIdx = 0; sentIdx < sentenceEnds.length; sentIdx++) { // print the sentence. for (int tokoff = tokIdx; tokoff < sentenceEnds[sentIdx]; tokoff++) System.out.print(toks[tokoff]); System.out.println(); for (; tokIdx < sentenceEnds[sentIdx]; tokIdx++) System.out.println(toks[tokIdx]+" = "+text.substring(charOffsetArray[tokIdx].getFirst(),charOffsetArray[tokIdx].getSecond())); } }
/**
 * Builds a {@link TextAnnotation} directly from a caller-supplied tokenization
 * of {@code text}.
 *
 * @param corpusId identifier of the corpus the text belongs to
 * @param textId identifier of the text within the corpus
 * @param text the raw text being annotated
 * @param tokenization character offsets, token strings, and sentence-end token
 *        indexes for {@code text}
 * @return a {@link TextAnnotation} over {@code text} with the given tokens
 * @throws IllegalArgumentException if the tokenization is inconsistent with the text
 */
@Override
public TextAnnotation createTextAnnotation(String corpusId, String textId, String text,
        Tokenization tokenization) throws IllegalArgumentException {
    TextAnnotation annotation = new TextAnnotation(corpusId, textId, text,
            tokenization.getCharacterOffsets(), tokenization.getTokens(),
            tokenization.getSentenceEndTokenIndexes());
    return annotation;
}
/**
 * Builds a {@link TextAnnotation} directly from a caller-supplied tokenization
 * of {@code text}.
 *
 * @param corpusId identifier of the corpus the text belongs to
 * @param textId identifier of the text within the corpus
 * @param text the raw text being annotated
 * @param tokenization character offsets, token strings, and sentence-end token
 *        indexes for {@code text}
 * @return a {@link TextAnnotation} over {@code text} with the given tokens
 * @throws IllegalArgumentException if the tokenization is inconsistent with the text
 */
@Override
public TextAnnotation createTextAnnotation(String corpusId, String textId, String text,
        Tokenizer.Tokenization tokenization) throws IllegalArgumentException {
    TextAnnotation annotation = new TextAnnotation(corpusId, textId, text,
            tokenization.getCharacterOffsets(), tokenization.getTokens(),
            tokenization.getSentenceEndTokenIndexes());
    return annotation;
}
/**
 * Builds a {@link TextAnnotation} directly from a caller-supplied tokenization
 * of {@code text}.
 *
 * @param corpusId identifier of the corpus the text belongs to
 * @param textId identifier of the text within the corpus
 * @param text the raw text being annotated
 * @param tokenization character offsets, token strings, and sentence-end token
 *        indexes for {@code text}
 * @return a {@link TextAnnotation} over {@code text} with the given tokens
 * @throws IllegalArgumentException if the tokenization is inconsistent with the text
 */
@Override
public TextAnnotation createTextAnnotation(String corpusId, String textId, String text,
        Tokenization tokenization) throws IllegalArgumentException {
    TextAnnotation annotation = new TextAnnotation(corpusId, textId, text,
            tokenization.getCharacterOffsets(), tokenization.getTokens(),
            tokenization.getSentenceEndTokenIndexes());
    return annotation;
}
/**
 * Builds a {@link TextAnnotation} directly from a caller-supplied tokenization
 * of {@code text}.
 *
 * @param corpusId identifier of the corpus the text belongs to
 * @param textId identifier of the text within the corpus
 * @param text the raw text being annotated
 * @param tokenization character offsets, token strings, and sentence-end token
 *        indexes for {@code text}
 * @return a {@link TextAnnotation} over {@code text} with the given tokens
 * @throws IllegalArgumentException if the tokenization is inconsistent with the text
 */
@Override
public TextAnnotation createTextAnnotation(String corpusId, String textId, String text,
        Tokenizer.Tokenization tokenization) throws IllegalArgumentException {
    TextAnnotation annotation = new TextAnnotation(corpusId, textId, text,
            tokenization.getCharacterOffsets(), tokenization.getTokens(),
            tokenization.getSentenceEndTokenIndexes());
    return annotation;
}