// Refine search
/**
 * Build a Document from parsed input plus externally supplied predicted mentions.
 *
 * @param input parsed input document carrying the annotation, gold mentions,
 *              document info, and (for CoNLL input) the CoNLL document
 * @param mentions predicted mentions, one list per sentence
 */
public Document(InputDoc input, List<List<Mention>> mentions) {
  this();
  this.annotation = input.annotation;
  this.numSentences = input.annotation.get(SentencesAnnotation.class).size();
  this.predictedMentions = mentions;
  this.goldMentions = input.goldMentions;
  this.docInfo = input.docInfo;
  // Stays null when the input did not come from a CoNLL file.
  this.conllDoc = input.conllDoc;
}
/**
 * Collect the indices, within the given token span, of tokens whose word form
 * appears in the animacy word set.
 *
 * @param span inclusive (begin, end) token indices to scan; clipped to the
 *             document's token count
 * @return indices of animate tokens found inside the span (possibly empty)
 */
public List<Integer> scanForAnimates(Pair<Integer, Integer> span) {
  List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class);
  List<Integer> animateIndices = new ArrayList<>();
  int last = Math.min(span.second, tokens.size() - 1);
  for (int idx = span.first; idx <= last; idx++) {
    if (animacySet.contains(tokens.get(idx).word())) {
      animateIndices.add(idx);
    }
  }
  return animateIndices;
}
/**
 * Parse a string into a single Temporal expression, or return null when the
 * text does not yield exactly one time expression.
 *
 * @param str raw text to annotate with the time pipeline
 * @return the parsed Temporal, or null when annotation produces no sentences,
 *         zero or multiple timex annotations, or no TimeExpression on the timex
 */
public static Temporal parseOrNull(String str) {
  Annotation doc = new Annotation(str);
  pipeline.annotate(doc);
  // Cache repeated annotation lookups instead of re-fetching each time.
  List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
  if (sentences == null || sentences.isEmpty()) {
    return null;
  }
  List<CoreMap> timexAnnotations = doc.get(TimeAnnotations.TimexAnnotations.class);
  // Guard against a missing timex annotation (previously an NPE on .size())
  // and require exactly one time expression.
  if (timexAnnotations == null || timexAnnotations.size() != 1) {
    return null;
  }
  TimeExpression expression = timexAnnotations.get(0).get(TimeExpression.Annotation.class);
  return expression == null ? null : expression.getTemporal();
}
/**
 * Find the mention closest to the quote, searching both backward (tokens
 * before the quote) and forward (tokens after it).
 *
 * Robustness fix: the span searches can find nothing on one side; previously
 * that produced a NullPointerException on field access. We now fall back to
 * whichever side produced a mention.
 *
 * @param quote quote CoreMap with TokenBegin/TokenEnd annotations set
 * @return the nearest MentionData, or null when neither side has a mention
 */
public MentionData getClosestMention(CoreMap quote) {
  int quoteBegin = quote.get(CoreAnnotations.TokenBeginAnnotation.class);
  int quoteEnd = quote.get(CoreAnnotations.TokenEndAnnotation.class);
  int numTokens = doc.get(CoreAnnotations.TokensAnnotation.class).size();
  MentionData closestBackward =
      findClosestMentionInSpanBackward(new Pair<>(0, quoteBegin - 1));
  MentionData closestForward =
      findClosestMentionInSpanForward(new Pair<>(quoteEnd, numTokens - 1));
  // NOTE(review): assumes findClosestMentionInSpan{Backward,Forward} return
  // null when the span contains no mention — confirm against their code.
  if (closestBackward == null) {
    return closestForward;
  }
  if (closestForward == null) {
    return closestBackward;
  }
  int backDistance = quoteBegin - closestBackward.end;
  int forwardDistance = closestForward.begin - quoteEnd + 1;
  return backDistance < forwardDistance ? closestBackward : closestForward;
}
/**
 * Assign a document-wide running index to every token, stored under
 * TokenBeginAnnotation, walking sentences and their tokens in order.
 *
 * @param doc document whose annotation supplies the sentences to index
 */
private static void setTokenIndices(Document doc) {
  int nextIndex = 0;
  for (CoreMap sentence : doc.annotation.get(SentencesAnnotation.class)) {
    for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
      token.set(TokenBeginAnnotation.class, nextIndex);
      nextIndex++;
    }
  }
}
/**
 * Log every gold mention as a recall error: for each sentence, log the
 * sentence, the tokens covered by each gold mention span, and the parse tree.
 *
 * @param goldMentions gold mentions, one list per sentence
 * @param predictedMentions predicted mentions per sentence (not used here)
 * @param doc annotated document supplying sentences, tokens, and parse trees
 * @throws IOException declared for caller compatibility
 */
private static void recallErrors(List<List<Mention>> goldMentions,
    List<List<Mention>> predictedMentions, Annotation doc) throws IOException {
  List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
  for (int sentIdx = 0; sentIdx < goldMentions.size(); sentIdx++) {
    CoreMap sentence = sentences.get(sentIdx);
    List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    Tree parseTree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
    for (Pair<Integer, Integer> span : extractSpans(goldMentions.get(sentIdx))) {
      logger.finer("RECALL ERROR\n");
      logger.finer(sentence + "\n");
      // Span end is exclusive here, matching the original loop bound.
      for (int tok = span.first; tok < span.second; tok++) {
        logger.finer(tokens.get(tok).value() + " ");
      }
      logger.finer("\n" + parseTree + "\n");
    }
  }
}
/**
 * Reorder each sentence's predicted mentions by span, replacing the
 * per-sentence lists in place on the document.
 *
 * @param doc document whose predictedMentions are reordered
 * @param headFinder head finder (unused in this method)
 * @throws Exception propagated for interface compatibility
 */
private static void mentionReordering(Document doc, HeadFinder headFinder) throws Exception {
  List<List<Mention>> mentionsPerSentence = doc.predictedMentions;
  int sentenceCount = doc.annotation.get(SentencesAnnotation.class).size();
  for (int sentIdx = 0; sentIdx < sentenceCount; sentIdx++) {
    List<Mention> sentenceMentions = mentionsPerSentence.get(sentIdx);
    mentionsPerSentence.set(sentIdx, mentionReorderingBySpan(sentenceMentions));
  }
}
public void oneNameSentence(Annotation doc) { List<CoreMap> quotes = doc.get(CoreAnnotations.QuotationsAnnotation.class); for(CoreMap quote : quotes) { if (quote.get(QuoteAttributionAnnotator.MentionAnnotation.class) != null) { continue; } Pair<Integer, Integer> range = QuoteAttributionUtils.getRemainderInSentence(doc, quote); if(range == null) { continue; } Pair<ArrayList<String>, ArrayList<Pair<Integer, Integer>>> namesAndNameIndices = scanForNames(range); ArrayList<String> names = namesAndNameIndices.first; ArrayList<Pair<Integer, Integer>> nameIndices = namesAndNameIndices.second; ArrayList<Integer> pronounsIndices = scanForPronouns(range); if (names.size() == 1) { List<Person> p = characterMap.get(names.get(0)); //guess if exactly one name if (p.size() == 1 && pronounsIndices.size() == 0) { fillInMention(quote, tokenRangeToString(nameIndices.get(0)), nameIndices.get(0).first, nameIndices.get(0).second, sieveName, NAME); } } } } }
/**
 * Test whether a character offset falls within the character extent spanned
 * by a token range (inclusive at both ends).
 *
 * @param tokenRange (first, last) token indices into the document token list
 * @param charIndex character offset to test
 * @return true when charIndex lies between the first token's begin position
 *         and the last token's end position, inclusive
 */
public boolean rangeContainsCharIndex(Pair<Integer, Integer> tokenRange, int charIndex) {
  List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class);
  int rangeCharStart = tokens.get(tokenRange.first()).beginPosition();
  int rangeCharEnd = tokens.get(tokenRange.second()).endPosition();
  return rangeCharStart <= charIndex && charIndex <= rangeCharEnd;
}
/**
 * Number every token in the document with a running index, written into
 * TokenBeginAnnotation, in sentence order.
 *
 * @param doc document whose annotation holds the sentences to index
 */
private static void setTokenIndices(Document doc) {
  int runningIndex = 0;
  for (CoreMap sent : doc.annotation.get(SentencesAnnotation.class)) {
    for (CoreLabel tok : sent.get(TokensAnnotation.class)) {
      tok.set(TokenBeginAnnotation.class, runningIndex);
      runningIndex += 1;
    }
  }
}
public static void addEnhancedSentences(Annotation doc) { //for every sentence that begins a paragraph: append this sentence and the previous one and see if sentence splitter would make a single sentence out of it. If so, add as extra sentence. //for each sieve that potentially uses augmentedSentences in original: List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class); WordToSentenceProcessor wsp = new WordToSentenceProcessor(WordToSentenceProcessor.NewlineIsSentenceBreak.NEVER); //create SentenceSplitter that never splits on newline int prevParagraph = 0; for(int i = 1; i < sentences.size(); i++) { CoreMap sentence = sentences.get(i); CoreMap prevSentence = sentences.get(i-1); List<CoreLabel> tokensConcat = new ArrayList<>(); tokensConcat.addAll(prevSentence.get(CoreAnnotations.TokensAnnotation.class)); tokensConcat.addAll(sentence.get(CoreAnnotations.TokensAnnotation.class)); List<List<CoreLabel>> sentenceTokens = wsp.process(tokensConcat); if(sentenceTokens.size() == 1) { //wsp would have put them into a single sentence --> add enhanced sentence. sentence.set(EnhancedSentenceAnnotation.class, constructSentence(sentenceTokens.get(0), prevSentence, sentence)); } } }
/**
 * Convert a CoreNLP Annotation object to a Document.
 *
 * @param props properties used to (re)build the default annotator pool
 * @param ann the CoreNLP Annotation object to wrap
 */
@SuppressWarnings("Convert2streamapi")
public Document(Properties props, Annotation ann) {
  this.defaultProps = props;
  // Warm/cache the annotator pool for these properties; the returned pool is
  // intentionally discarded (side effect only).
  StanfordCoreNLP.getDefaultAnnotatorPool(props, new AnnotatorImplementations());
  // Serialize the whole annotation into the proto builder held by this document.
  this.impl = new ProtobufAnnotationSerializer(false).toProtoBuilder(ann);
  List<CoreMap> sentences = ann.get(CoreAnnotations.SentencesAnnotation.class);
  this.sentences = new ArrayList<>(sentences.size());
  for (CoreMap sentence : sentences) {
    // NOTE(review): this uses the instance field `serializer` while the line
    // above builds a fresh ProtobufAnnotationSerializer(false) — confirm the
    // two are configured identically.
    this.sentences.add(new Sentence(this, this.serializer.toProtoBuilder(sentence),
        sentence.get(CoreAnnotations.TextAnnotation.class), this.defaultProps));
  }
}
/**
 * Render the raw document for analysis: a START marker, the document ID, a
 * gold/predicted header, one line per sentence (with mentions), and an END
 * marker.
 *
 * @param document document to render
 * @param gold whether to show gold (true) or predicted (false) mentions
 * @param printClusterID whether to include cluster IDs in the output
 * @return the formatted dump as a single string
 * @throws FileNotFoundException declared for caller compatibility
 */
public static String printRawDoc(Document document, boolean gold, boolean printClusterID)
    throws FileNotFoundException {
  List<CoreMap> sentences = document.annotation.get(CoreAnnotations.SentencesAnnotation.class);
  StringBuilder body = new StringBuilder();
  for (int sentIdx = 0; sentIdx < sentences.size(); sentIdx++) {
    body.append(sentenceStringWithMention(sentIdx, document, gold, printClusterID)).append("\n");
  }
  StringBuilder out = new StringBuilder();
  out.append("PRINT RAW DOC START\n");
  out.append(document.annotation.get(CoreAnnotations.DocIDAnnotation.class)).append("\n");
  out.append(gold
      ? "New DOC: (GOLD MENTIONS) ==================================================\n"
      : "New DOC: (Predicted Mentions) ==================================================\n");
  out.append(body.toString()).append("\n");
  out.append("PRINT RAW DOC END").append("\n");
  return out.toString();
}
private static CoreMap constructCoreMap(Annotation doc, Pair<Integer, Integer> run) { List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class); // check if the second part of the run is a *NL* token, adjust accordingly int endTokenIndex = run.second; while (endTokenIndex > 0 && tokens.get(endTokenIndex).get(CoreAnnotations.IsNewlineAnnotation.class)) { endTokenIndex--; } // get the sentence text from the first and last character offsets int begin = tokens.get(run.first).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class); int end = tokens.get(endTokenIndex).get(CoreAnnotations.CharacterOffsetEndAnnotation.class); String sentenceText = doc.get(CoreAnnotations.TextAnnotation.class).substring(begin, end); List<CoreLabel> sentenceTokens = tokens.subList(run.first, endTokenIndex+1); // create a sentence annotation with text and token offsets CoreMap sentence = new Annotation(sentenceText); sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin); sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end); sentence.set(CoreAnnotations.TokensAnnotation.class, sentenceTokens); return sentence; }