/**
 * The default way to create a {@link TextAnnotation} from pre-tokenized text.
 *
 * @param corpusId identifier of the corpus this text belongs to
 * @param textId identifier of this text within the corpus
 * @param tokenizedSentences A list of sentences, each one being a list of tokens
 * @return A {@link TextAnnotation} containing the SENTENCE and TOKENS views.
 */
public static TextAnnotation createTextAnnotationFromTokens(String corpusId, String textId,
        List<String[]> tokenizedSentences) {
    Tokenization tokenization = tokenizeTextSpan(tokenizedSentences);

    // Reconstruct the raw text: tokens joined by single spaces, one sentence
    // per line (each sentence terminated by a line separator, including the last).
    // NOTE(review): System.lineSeparator() is platform-dependent ("\r\n" on
    // Windows) — confirm tokenizeTextSpan computes character offsets with the
    // same separator length.
    StringBuilder rawText = new StringBuilder();
    for (int i = 0; i < tokenizedSentences.size(); i++) {
        rawText.append(StringUtils.join(tokenizedSentences.get(i), ' '));
        rawText.append(System.lineSeparator());
    }

    return new TextAnnotation(corpusId, textId, rawText.toString(),
            tokenization.getCharacterOffsets(), tokenization.getTokens(),
            tokenization.getSentenceEndTokenIndexes());
}
/**
 * The default way to create a {@link TextAnnotation} from pre-tokenized text.
 *
 * @param corpusId identifier of the corpus this text belongs to
 * @param textId identifier of this text within the corpus
 * @param tokenizedSentences A list of sentences, each one being a list of tokens
 * @return A {@link TextAnnotation} containing the SENTENCE and TOKENS views.
 */
public static TextAnnotation createTextAnnotationFromTokens(String corpusId, String textId,
        List<String[]> tokenizedSentences) {
    Tokenization tokenization = tokenizeTextSpan(tokenizedSentences);

    // Reconstruct the raw text: tokens joined by single spaces, one sentence
    // per line (each sentence terminated by a line separator, including the last).
    // NOTE(review): System.lineSeparator() is platform-dependent ("\r\n" on
    // Windows) — confirm tokenizeTextSpan computes character offsets with the
    // same separator length.
    StringBuilder rawText = new StringBuilder();
    for (int i = 0; i < tokenizedSentences.size(); i++) {
        rawText.append(StringUtils.join(tokenizedSentences.get(i), ' '));
        rawText.append(System.lineSeparator());
    }

    return new TextAnnotation(corpusId, textId, rawText.toString(),
            tokenization.getCharacterOffsets(), tokenization.getTokens(),
            tokenization.getSentenceEndTokenIndexes());
}
// Tokenize the raw text using the caller-supplied token and sentence arrays,
// then build a TextAnnotation from the resulting character offsets, token
// strings, and sentence-end token indexes.
// NOTE(review): fragment — enclosing method signature not visible here.
Tokenizer.Tokenization tokenization = tokenizer.tokenize(rawText, tokens, sentences); TextAnnotation ta = new TextAnnotation(corpusName, textId, rawText, tokenization.getCharacterOffsets(), tokenization.getTokens(), tokenization.getSentenceEndTokenIndexes());
// Tokenize the raw text using the caller-supplied token and sentence arrays,
// then build a TextAnnotation from the resulting character offsets, token
// strings, and sentence-end token indexes.
// NOTE(review): fragment — enclosing method signature not visible here.
Tokenizer.Tokenization tokenization = tokenizer.tokenize(rawText, tokens, sentences); TextAnnotation ta = new TextAnnotation(corpusName, textId, rawText, tokenization.getCharacterOffsets(), tokenization.getTokens(), tokenization.getSentenceEndTokenIndexes());
// NOTE(review): fragment — the method header precedes this view and the
// SpanLabelView assignment is cut off at the end. The visible body tokenizes
// the whole text span and builds a TextAnnotation from the tokenizer output.
throws IllegalArgumentException { Tokenizer.Tokenization tokenization = tokenizer.tokenizeTextSpan(text); TextAnnotation ta = new TextAnnotation(corpusId, textId, text, tokenization.getCharacterOffsets(), tokenization.getTokens(), tokenization.getSentenceEndTokenIndexes()); SpanLabelView view =
// NOTE(review): fragment — the method header precedes this view and the
// SpanLabelView assignment is cut off at the end. The visible body tokenizes
// the whole text span and builds a TextAnnotation from the tokenizer output.
throws IllegalArgumentException { Tokenizer.Tokenization tokenization = tokenizer.tokenizeTextSpan(text); TextAnnotation ta = new TextAnnotation(corpusId, textId, text, tokenization.getCharacterOffsets(), tokenization.getTokens(), tokenization.getSentenceEndTokenIndexes()); SpanLabelView view =
/** * @param args */ public static void main(String[] args) { JapaneseTokenizer jt = new JapaneseTokenizer(); String text = "\"ペンシルベニアドイツ語\",\"text\":\"ペンシルベニアドイツ語(標準ドイ" + "ツ語:Pennsylvania-Dutch, Pennsilfaani-Deitsch、アレマン語:Pennsylvania-Ditsch、英語:Pennsylvania-German)" + "は、北アメリカのカナダおよびアメリカ中西部でおよそ15万から25万人の人びとに話されているドイツ語の系統である。高地ドイツ語の" + "うち上部ドイツ語の一派アレマン語の一方言である。ペンシルベニアアレマン語(Pennsilfaani-Alemanisch, Pennsylvania-Alemannic)" + "とも呼ばれる。"; Tokenization tokenized = jt.tokenizeTextSpan(text); // DEBUG print the results int tokIdx = 0; int[] sentenceEnds = tokenized.getSentenceEndTokenIndexes(); String[] toks = tokenized.getTokens(); IntPair [] charOffsetArray = tokenized.getCharacterOffsets(); System.out.println("HOLY CRAP"); for (int sentIdx = 0; sentIdx < sentenceEnds.length; sentIdx++) { // print the sentence. for (int tokoff = tokIdx; tokoff < sentenceEnds[sentIdx]; tokoff++) System.out.print(toks[tokoff]); System.out.println(); for (; tokIdx < sentenceEnds[sentIdx]; tokIdx++) System.out.println(toks[tokIdx]+" = "+text.substring(charOffsetArray[tokIdx].getFirst(),charOffsetArray[tokIdx].getSecond())); } }
/**
 * Builds a {@link TextAnnotation} directly from a caller-supplied tokenization
 * of {@code text}.
 *
 * @param corpusId identifier of the corpus the text belongs to
 * @param textId identifier of the text within the corpus
 * @param text the raw text being annotated
 * @param tokenization character offsets, token strings, and sentence-end token
 *        indexes for {@code text}
 * @return a {@link TextAnnotation} over {@code text} with the given tokens
 * @throws IllegalArgumentException if the tokenization is inconsistent with the text
 */
@Override
public TextAnnotation createTextAnnotation(String corpusId, String textId, String text,
        Tokenization tokenization) throws IllegalArgumentException {
    TextAnnotation annotation = new TextAnnotation(corpusId, textId, text,
            tokenization.getCharacterOffsets(), tokenization.getTokens(),
            tokenization.getSentenceEndTokenIndexes());
    return annotation;
}
/**
 * Builds a {@link TextAnnotation} directly from a caller-supplied tokenization
 * of {@code text}.
 *
 * @param corpusId identifier of the corpus the text belongs to
 * @param textId identifier of the text within the corpus
 * @param text the raw text being annotated
 * @param tokenization character offsets, token strings, and sentence-end token
 *        indexes for {@code text}
 * @return a {@link TextAnnotation} over {@code text} with the given tokens
 * @throws IllegalArgumentException if the tokenization is inconsistent with the text
 */
@Override
public TextAnnotation createTextAnnotation(String corpusId, String textId, String text,
        Tokenizer.Tokenization tokenization) throws IllegalArgumentException {
    TextAnnotation annotation = new TextAnnotation(corpusId, textId, text,
            tokenization.getCharacterOffsets(), tokenization.getTokens(),
            tokenization.getSentenceEndTokenIndexes());
    return annotation;
}
/**
 * Builds a {@link TextAnnotation} directly from a caller-supplied tokenization
 * of {@code text}.
 *
 * @param corpusId identifier of the corpus the text belongs to
 * @param textId identifier of the text within the corpus
 * @param text the raw text being annotated
 * @param tokenization character offsets, token strings, and sentence-end token
 *        indexes for {@code text}
 * @return a {@link TextAnnotation} over {@code text} with the given tokens
 * @throws IllegalArgumentException if the tokenization is inconsistent with the text
 */
@Override
public TextAnnotation createTextAnnotation(String corpusId, String textId, String text,
        Tokenization tokenization) throws IllegalArgumentException {
    TextAnnotation annotation = new TextAnnotation(corpusId, textId, text,
            tokenization.getCharacterOffsets(), tokenization.getTokens(),
            tokenization.getSentenceEndTokenIndexes());
    return annotation;
}
/**
 * Builds a {@link TextAnnotation} directly from a caller-supplied tokenization
 * of {@code text}.
 *
 * @param corpusId identifier of the corpus the text belongs to
 * @param textId identifier of the text within the corpus
 * @param text the raw text being annotated
 * @param tokenization character offsets, token strings, and sentence-end token
 *        indexes for {@code text}
 * @return a {@link TextAnnotation} over {@code text} with the given tokens
 * @throws IllegalArgumentException if the tokenization is inconsistent with the text
 */
@Override
public TextAnnotation createTextAnnotation(String corpusId, String textId, String text,
        Tokenizer.Tokenization tokenization) throws IllegalArgumentException {
    TextAnnotation annotation = new TextAnnotation(corpusId, textId, text,
            tokenization.getCharacterOffsets(), tokenization.getTokens(),
            tokenization.getSentenceEndTokenIndexes());
    return annotation;
}