private Occurrence(Section sentence, Token start, Token end) {
    this.start = start.getStart();
    this.end = end.getEnd();
    String context = sentence.getSpan();
    if (context.length() > MAX_CONTEXT_LENGTH) {
        // clip the context to a window around the occurrence, clamped to
        // the bounds of the enclosing section
        context = start.getContext().getSpan().substring(
            Math.max(0, this.start - CONTEXT_TOKEN_COUNT),
            Math.min(this.end + CONTEXT_TOKEN_COUNT, start.getContext().getEnd()) - 1);
    }
    this.context = context;
}
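// Hedged illustration (not part of the engine): the clamping arithmetic used
// above, with made-up offsets. Math.max keeps the window from starting before
// the section; Math.min keeps it from running past the section's end.
int sectionStart = 0, sectionEnd = 120;  // assumed bounds of the enclosing section
int occStart = 40, occEnd = 55;          // assumed occurrence offsets
int pad = 30;                            // assumed CONTEXT_TOKEN_COUNT-style padding
int ctxStart = Math.max(sectionStart, occStart - pad);  // -> 10
int ctxEnd = Math.min(occEnd + pad, sectionEnd);        // -> 85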
@Override
public void reset() throws IOException {
    super.reset();
    Iterator<Sentence> sentences = at.getSentences();
    // if no sentences are annotated, process the whole AnalysedText as a
    // single section
    this.sections = sentences.hasNext() ? sentences
            : Collections.singleton(at).iterator();
    sectionData = null;
    tokenIt = null;
    incrementCount = 0;
    lookupCount = 0;
}
private void logAnnotations(AnalysedText at) {
    Iterator<Span> it = at.getEnclosed(EnumSet.of(SpanTypeEnum.Sentence, SpanTypeEnum.Token));
    while (it.hasNext()) {
        Span span = it.next();
        log.trace(" > {}", span);
        for (Value<PosTag> value : span.getAnnotations(POS_ANNOTATION)) {
            log.trace(" - {}", value);
        }
    }
}
protected AnalyzedTextSentenceTokenizer(AnalysedText at) {
    super(new StringReader(at.getText().toString()));
    this.at = at;
    sentences = at.getSentences();
}
private List<Section> detectSentences(AnalysedText at, String language) {
    SentenceDetector sentenceDetector = getSentenceDetector(language);
    List<Section> sentences;
    if (sentenceDetector != null) {
        sentences = new ArrayList<Section>();
        for (opennlp.tools.util.Span sentSpan : sentenceDetector.sentPosDetect(at.getSpan())) {
            Sentence sentence = at.addSentence(sentSpan.getStart(), sentSpan.getEnd());
            log.trace(" > add {}", sentence);
            sentences.add(sentence);
        }
    } else {
        sentences = null;
    }
    return sentences;
}
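// Hedged sketch: one way getSentenceDetector(language) could be backed by
// OpenNLP. The model path is an assumption for illustration, not taken from
// this codebase. Requires opennlp.tools.sentdetect.{SentenceDetector,
// SentenceDetectorME, SentenceModel}.
SentenceDetector loadSentenceDetector(String modelPath) throws IOException {
    try (InputStream in = new FileInputStream(modelPath)) {
        // SentenceModel parses the serialized model; SentenceDetectorME is
        // the maxent-based detector whose sentPosDetect(..) is used above
        return new SentenceDetectorME(new SentenceModel(in));
    }
}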
TokenStream tokenStream = tokenizerFactory.create(new CharSequenceReader(at.getText()));
// ...
Token token = at.addToken(offset.startOffset(), offset.endOffset());
// ...
Sentence sent = at.addSentence(sentStartOffset, offset.startOffset());
// ...
// start a new Chunk when the NER tag type changes
if (ner != null && (nerTag == null || !ner.tag.getType().equals(nerTag.getType()))) {
    Chunk chunk = at.addChunk(ner.start, ner.end);
    chunk.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(ner.tag));
    // ...
    lastSent = at.addSentence(sentStartOffset, offset.endOffset());
    nerList.get(nerSentIndex).context = lastSent.getSpan();
} else { // no sentence detected
    nerList.get(nerSentIndex).context = at.getSpan();
}
// ...
IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(
        at.getSpan().substring(nerData.start, nerData.end), lang)));
metadata.add(new TripleImpl(ta, DC_TYPE, nerData.tag.getType()));
metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(nerData.start)));
List<Section> sentences = new ArrayList<Section>();
AnalysedTextUtils.appandToList(at.getSentences(), sentences);
if (sentences.isEmpty()) { // no sentence annotations
    // ...
}
// ...
contextElements.add(sentence);
// the context covers the previous, the current and the following sentence
String context = at.getSpan().substring(
        sentences.get(Math.max(0, i - 1)).getStart(),
        sentences.get(Math.min(sentences.size() - 1, i + 1)).getEnd());
// ...
String name = at.getSpan().substring(
        tokens.get(nameSpans[j].getStart()).getStart(),
        tokens.get(nameSpans[j].getEnd() - 1).getEnd());
Double confidence = 1.0;
// ...
nameOccurrences.put(name, occurrences);
// ...
Chunk chunk = at.addChunk(start, end);
+ "JIRA issue about this."); if(!at.getSentences().hasNext()) { //no sentences ... use this engine to detect TokenStream sentences = new SentenceTokenizer(new CharSequenceReader(at.getText())); try { while(sentences.incrementToken()){ OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class); Sentence s = at.addSentence(offset.startOffset(), offset.endOffset()); if(log.isTraceEnabled()) { log.trace("detected {}:{}",s,s.getSpan()); while(tokens.incrementToken()){ OffsetAttribute offset = tokens.addAttribute(OffsetAttribute.class); Token t = at.addToken(offset.startOffset(), offset.endOffset()); log.trace("detected {}",t);
new Object[]{ci.getUri().getUnicodeString(), language,
        StringUtils.abbreviate(at.getSpan(), 100)});
final Blob blob = at.getBlob();
int sentimentCount = 0;
Iterator<Span> spans = at.getEnclosed(EnumSet.of(SpanTypeEnum.Sentence, SpanTypeEnum.Chunk));
Sentence context = null;
Graph metadata = ci.getMetadata();
// ...
// use the enclosing sentence as selection context; fall back to a default
// context window when no sentence is available
metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(
        context == null
                ? getDefaultSelectionContext(at.getSpan(), span.getSpan(), span.getStart())
                : context.getSpan(),
        lang)));
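// Hedged sketch of what a getDefaultSelectionContext(content, selected, pos)
// helper may do when no Sentence is available: cut a fixed-size window around
// the selection and expand it to word boundaries. The window size is an
// assumption for illustration.
String defaultSelectionContext(String content, String selected, int pos) {
    final int window = 50; // assumed padding on each side of the selection
    int begin = Math.max(0, pos - window);
    int end = Math.min(content.length(), pos + selected.length() + window);
    // expand to whitespace so the context does not start or end mid-word
    while (begin > 0 && !Character.isWhitespace(content.charAt(begin - 1))) begin--;
    while (end < content.length() && !Character.isWhitespace(content.charAt(end))) end++;
    return content.substring(begin, end);
}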
    return;
case Sentence:
    span = at.addSentence(spanPos[0], spanPos[1]);
    break;
case Chunk:
    span = at.addChunk(spanPos[0], spanPos[1]);
    break;
case Token:
    span = at.addToken(spanPos[0], spanPos[1]);
    break;
default:
Sentence sentence = sentPhrase.getSentence();
if (log.isDebugEnabled()) { // debug sentiment info
    CharSequence phraseText = at.getText().subSequence(
            sentPhrase.getStartIndex(), sentPhrase.getEndIndex());
    log.debug("Write SentimentPhrase for {} (sentence: {})", phraseText,
            sentence == null ? "none"
                    : sentence.getSpan().length() > 17
                            ? (sentence.getSpan().subSequence(0, 17) + "...")
                            : sentence.getSpan());
}
// ...
String phraseText = at.getSpan().substring(sentPhrase.getStartIndex(), sentPhrase.getEndIndex());
metadata.add(new TripleImpl(enh, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(phraseText, lang)));
metadata.add(new TripleImpl(enh, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(getSelectionContext(
        at.getSpan(), phraseText, sentPhrase.getStartIndex()), lang)));
} else {
    metadata.add(new TripleImpl(enh, ENHANCER_SELECTION_CONTEXT,
Iterator<? extends Section> sentences = at.getSentences();
if (!sentences.hasNext()) { // no sentences ... iterate over the whole text
    sentences = Collections.singleton(at).iterator();
}
// ...
Chunk chunk = at.addChunk(
        tokenList.get(i - chunkTokenCount).getStart(),
        tokenList.get(i - 1).getEnd());
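// Hedged note on why this fallback (also used in reset() above) typechecks:
// in the Stanbol NLP model AnalysedText is itself a Section, so the whole
// text can stand in for a single sentence-like section. An explicit type
// witness makes that visible:
Iterator<? extends Section> single = Collections.<Section>singleton(at).iterator();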
TokenStream sentences = new SentenceTokenizer(new CharSequenceReader(at.getText()));
try {
    sentences.reset();
    OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class);
    while (sentences.incrementToken()) {
        Sentence s = at.addSentence(offset.startOffset(), offset.endOffset());
        if (log.isTraceEnabled()) {
            log.trace("detected {}:{}", s, s.getSpan());
        }
    }
@Override
public int canEnhance(ContentItem ci) throws EngineException {
    String language = getLanguage(this, ci, false);
    if (language == null || textProcessingConfig.getConfiguration(language) == null) {
        log.debug("Engine {} ignores ContentItem {} because language {} is not configured.",
                new Object[]{getName(), ci.getUri(), language});
        return CANNOT_ENHANCE;
    }
    // we need a detected language and the AnalyzedText contentPart with Tokens
    AnalysedText at = getAnalysedText(this, ci, false);
    return at != null && at.getTokens().hasNext() ? ENHANCE_ASYNC : CANNOT_ENHANCE;
}
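// Hedged usage sketch of the Stanbol EnhancementEngine contract as the
// override above uses it: CANNOT_ENHANCE skips the item, ENHANCE_ASYNC
// signals that computeEnhancements(..) may run on a separate thread.
// 'engine' and 'ci' are assumed to be in scope for the example.
if (engine.canEnhance(ci) != EnhancementEngine.CANNOT_ENHANCE) {
    engine.computeEnhancements(ci);
}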
CharSequence tagSequence = at.getText().subSequence(start, end);
log.trace(" > reduce tag {} - no overlap with linkable token", tagSequence);
// ...
CharSequence text = at.getText();
log.trace(" - matchable Span {}{} for Tag {}[{},{}]", new Object[]{
        text.subSequence(mSpan[0], mSpan[1]),
// ...
CharSequence text = at.getText();
log.trace(" - reduce tag {}[{},{}] - matches only {} of {} of matchable Chunk {}[{},{}]",
// ...
CharSequence text = at.getText();
log.trace(" + keep tag {}[{},{}] - matches {} of {} matchable Tokens for matchable Chunk {}[{},{}]",
// ...
CharSequence text = at.getText();
log.trace(" + keep tag {}[{},{}] - matches whole Chunk {}[{},{}]",
        new Object[]{text.subSequence(start, end), start, end,
// ...
CharSequence tagSequence = at.getText().subSequence(start, end);
log.trace(" + keep tag {} - not in processable chunk", tagSequence);