/**
 * Stores the original text by forwarding it to the wrapped {@code label}.
 *
 * @param originalText the original text to record on the underlying label
 */
@Override
public void setOriginalText(String originalText) {
  label.setOriginalText(originalText);
}
/**
 * Convenience factory that wraps a plain String in a fresh CoreLabel.
 * Handy in fixup or test code. The Text, OriginalText and Value annotations
 * are all populated with the same string.
 *
 * @param word The word string to make a CoreLabel for
 * @return A CoreLabel for this word string
 */
public static CoreLabel wordFromString(String word) {
  final CoreLabel wordLabel = new CoreLabel();
  wordLabel.setWord(word);
  wordLabel.setOriginalText(word);
  wordLabel.setValue(word);
  return wordLabel;
}
private CoreLabel makeXmlToken(String tokenText, boolean doNormalization, int charOffsetBegin, int charOffsetEnd) { CoreLabel token = new CoreLabel(); token.setOriginalText(tokenText); if (separatorPattern.matcher(tokenText).matches()) { // Map to CoreNLP newline token tokenText = AbstractTokenizer.NEWLINE_TOKEN; } else if (doNormalization && normalizeSpace) { tokenText = tokenText.replace(' ', '\u00A0'); // change space to non-breaking space } token.setWord(tokenText); token.setValue(tokenText); token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, charOffsetBegin); token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, charOffsetEnd); if (VERBOSE) { log.info("Adding token " + token.toShorterString()); } return token; }
// View the new chunk as a CoreLabel and copy its current word into both its value and its original text.
CoreLabel cl = (CoreLabel) newChunk; cl.setValue(cl.word()); cl.setOriginalText(cl.word());
((CoreLabel) t.label()).setValue(toks[0].trim().intern()); ((CoreLabel) t.label()).setWord(toks[0].trim().intern()); ((CoreLabel) t.label()).setOriginalText(toks[1].trim().intern()); } else { System.err.printf("%s: Cannot store morph analysis in non-CoreLabel: %s%n",this.getClass().getName(),t.label().getClass().getName());
/** * Handles verbs with attached suffixes, marked by the lexer: * * Escribamosela => Escribamo + se + la => escribamos + se + la * Sentaos => senta + os => sentad + os * Damelo => da + me + lo * */ private CoreLabel processVerb(CoreLabel cl) { cl.remove(ParentAnnotation.class); SpanishVerbStripper.StrippedVerb stripped = verbStripper.separatePronouns(cl.word()); if (stripped == null) { return cl; } // Split the CoreLabel into separate labels, tracking changing begin + end // positions. int stemEnd = cl.beginPosition() + stripped.getOriginalStem().length(); int lengthRemoved = 0; for (String pronoun : stripped.getPronouns()) { int beginOffset = stemEnd + lengthRemoved; compoundBuffer.add(copyCoreLabel(cl, pronoun, beginOffset)); lengthRemoved += pronoun.length(); } CoreLabel stem = copyCoreLabel(cl, stripped.getStem(), cl.beginPosition(), stemEnd); stem.setOriginalText(stripped.getOriginalStem()); return stem; }
/**
 * Builds a CoreLabel for the case where the original text differs from the
 * token text. The value and word are set to {@code tokenText} and the original
 * text to {@code originalText}. When {@code addIndices} is enabled, the
 * character offsets {@code begin} and {@code begin + length} are recorded too.
 * (Does not take substring.)
 */
public CoreLabel makeToken(String tokenText, String originalText, int begin, int length) {
  final CoreLabel token;
  if (addIndices) {
    token = new CoreLabel(5);
  } else {
    token = new CoreLabel();
  }

  token.setValue(tokenText);
  token.setWord(tokenText);
  token.setOriginalText(originalText);

  if (!addIndices) {
    return token;
  }
  token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin);
  token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, begin + length);
  return token;
}
/**
 * Segments {@code line} into an IOB-labeled sequence and groups that sequence
 * into one token per span reported by {@code IOBUtils.TokenSpansForIOB}.
 *
 * Each token gets its word, value and TextAnnotation from
 * {@code IOBUtils.IOBToString}, an ArabicSegAnnotation of "1", character
 * offsets taken from the first and last labels of its span, and the matching
 * substring of {@code line} as its original text.
 *
 * @param line the text to segment
 * @return the tokens, in span order
 */
public List<CoreLabel> segmentStringToTokenList(String line) {
  List<CoreLabel> tokens = CollectionUtils.makeList();
  List<CoreLabel> iobLabels = segmentStringToIOB(line);

  for (IntPair span : IOBUtils.TokenSpansForIOB(iobLabels)) {
    final int first = span.getSource();
    final int limit = span.getTarget();  // used as an exclusive bound: the last label is at limit - 1

    CoreLabel current = new CoreLabel();
    String surface = IOBUtils.IOBToString(iobLabels, prefixMarker, suffixMarker, first, limit);
    current.setWord(surface);
    current.setValue(surface);
    current.set(CoreAnnotations.TextAnnotation.class, surface);
    current.set(CoreAnnotations.ArabicSegAnnotation.class, "1");

    int charBegin = iobLabels.get(first).beginPosition();
    int charEnd = iobLabels.get(limit - 1).endPosition();
    current.setOriginalText(line.substring(charBegin, charEnd));
    current.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, charBegin);
    current.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, charEnd);

    tokens.add(current);
  }
  return tokens;
}
String morphAnalysis = lemmaMorph.second(); if (lemma.equals(toks[0])) { cl.setOriginalText(toks[1].trim().intern()); } else { cl.setOriginalText(newMorphAnalysis.intern());
// Copy each optional proto field onto the word only when the proto reports it as present: before, after, original text, NER tag, and coarse NER tag.
if (proto.hasBefore()) { word.setBefore(proto.getBefore()); } if (proto.hasAfter()) { word.setAfter(proto.getAfter()); } if (proto.hasOriginalText()) { word.setOriginalText(proto.getOriginalText()); } if (proto.hasNer()) { word.setNER(proto.getNer()); } if (proto.hasCoarseNER()) { word.set(CoreAnnotations.CoarseNamedEntityTagAnnotation.class, proto.getCoarseNER()); }
/**
 * Delegates to the backing {@code label}, which holds the original text.
 *
 * @param originalText the original text to set on the backing label
 */
@Override
public void setOriginalText(String originalText) {
  label.setOriginalText(originalText);
}
/**
 * Passes the given original text straight through to {@code label}.
 *
 * @param originalText the original text handed on to {@code label}
 */
@Override
public void setOriginalText(String originalText) {
  label.setOriginalText(originalText);
}
/**
 * Creates a CoreLabel for a single word given as a String.
 * Intended as a shortcut for fixup and test code: the Text, OriginalText
 * and Value annotations all receive the given string.
 *
 * @param word The word string to make a CoreLabel for
 * @return A CoreLabel for this word string
 */
public static CoreLabel wordFromString(String word) {
  CoreLabel result = new CoreLabel();
  result.setWord(word);
  result.setOriginalText(word);
  result.setValue(word);
  return result;
}
/**
 * Simple way to turn a String into a CoreLabel for that word.
 * Often useful in fixup or test code. The same string is stored as the
 * Text, the OriginalText and the Value annotation.
 *
 * @param word The word string to make a CoreLabel for
 * @return A CoreLabel for this word string
 */
public static CoreLabel wordFromString(String word) {
  final CoreLabel fresh = new CoreLabel();
  fresh.setWord(word);
  fresh.setOriginalText(word);
  fresh.setValue(word);
  return fresh;
}
public static List<CoreLabel> stanfordTokenize(String str) { TokenizerFactory<? extends HasWord> tf = PTBTokenizer.coreLabelFactory(); // ptb3Escaping=false -> '(' not converted as '-LRB-', Dont use it, it will cause Dependency resolution err. Tokenizer<? extends HasWord> originalWordTokenizer = tf.getTokenizer(new StringReader(str), "ptb3Escaping=false"); Tokenizer<? extends HasWord> tokenizer = tf.getTokenizer(new StringReader(str)); List<? extends HasWord> originalTokens = originalWordTokenizer.tokenize(); List<? extends HasWord> tokens = tokenizer.tokenize(); // Curse you Stanford! List<CoreLabel> coreLabels = new ArrayList<>(tokens.size()); for (int i = 0; i < tokens.size(); i++) { CoreLabel coreLabel = new CoreLabel(); coreLabel.setWord(tokens.get(i).word()); coreLabel.setOriginalText(originalTokens.get(i).word()); coreLabel.setValue(tokens.get(i).word()); coreLabel.setBeginPosition(((CoreLabel) tokens.get(i)).beginPosition()); coreLabel.setEndPosition(((CoreLabel) tokens.get(i)).endPosition()); coreLabels.add(coreLabel); } return coreLabels; }
private CoreLabel makeXmlToken(String tokenText, boolean doNormalization, int charOffsetBegin, int charOffsetEnd) { CoreLabel token = new CoreLabel(); token.setOriginalText(tokenText); if (separatorPattern.matcher(tokenText).matches()) { // Map to CoreNLP newline token tokenText = AbstractTokenizer.NEWLINE_TOKEN; } else if (doNormalization && normalizeSpace) { tokenText = tokenText.replace(' ', '\u00A0'); // change space to non-breaking space } token.setWord(tokenText); token.setValue(tokenText); token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, charOffsetBegin); token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, charOffsetEnd); if (VERBOSE) { log.info("Adding token " + token.toShorterString()); } return token; }
/** * Constructs a CoreLabel as a String with a corresponding BEGIN and END position. * (Does not take substring). */ public CoreLabel makeToken(String str, int begin, int length) { CoreLabel cl; if (addIndices) { cl = new CoreLabel(8); // Save a reallocation, as there will be at least 5 keys } else { cl = new CoreLabel(); } cl.setWord(str); cl.setOriginalText(str); if (addIndices) { cl.set(CharacterOffsetBeginAnnotation.class, begin); cl.set(CharacterOffsetEndAnnotation.class, begin+length); } return cl; }
/**
 * Creates a CoreLabel whose original text is supplied separately from its
 * token text. {@code tokenText} becomes the value and the word;
 * {@code originalText} becomes the original text. Begin and end character
 * offsets ({@code begin}, {@code begin + length}) are stored only when
 * {@code addIndices} is set. (Does not take substring.)
 */
public CoreLabel makeToken(String tokenText, String originalText, int begin, int length) {
  CoreLabel made;
  if (addIndices) {
    made = new CoreLabel(5);
  } else {
    made = new CoreLabel();
  }

  made.setValue(tokenText);
  made.setWord(tokenText);
  made.setOriginalText(originalText);

  if (addIndices) {
    final int end = begin + length;
    made.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin);
    made.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end);
  }
  return made;
}
/**
 * Constructs a CoreLabel with BEGIN and END positions for a token whose
 * original text is not the same as its token text. The value and word come
 * from {@code tokenText}, the original text from {@code originalText}.
 * Offsets are only stored when {@code addIndices} is true.
 * (Does not take substring.)
 */
public CoreLabel makeToken(String tokenText, String originalText, int begin, int length) {
  final CoreLabel coreLabel;
  if (!addIndices) {
    coreLabel = new CoreLabel();
  } else {
    coreLabel = new CoreLabel(5);
  }

  coreLabel.setValue(tokenText);
  coreLabel.setWord(tokenText);
  coreLabel.setOriginalText(originalText);

  if (!addIndices) {
    return coreLabel;
  }
  coreLabel.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin);
  coreLabel.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, begin + length);
  return coreLabel;
}
/**
 * Makes a CoreLabel from separate token text and original text.
 * The value and word are set from {@code tokenText} and the original text
 * from {@code originalText}. If {@code addIndices} is on, the character begin
 * offset is {@code begin} and the end offset is {@code begin + length}.
 * (Does not take substring.)
 */
public CoreLabel makeToken(String tokenText, String originalText, int begin, int length) {
  CoreLabel built;
  if (addIndices) {
    built = new CoreLabel(5);
  } else {
    built = new CoreLabel();
  }

  built.setValue(tokenText);
  built.setWord(tokenText);
  built.setOriginalText(originalText);

  if (addIndices) {
    built.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin);
    built.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, begin + length);
  }
  return built;
}