/**
 * Creates an {@link Occurrence} for the span from the first to the last
 * {@link Token}, using the parsed sentence as context — or, when the sentence
 * exceeds {@code MAX_CONTEXT_LENGTH}, a window cut around the tokens.
 * @param sentence the sentence (section) the occurrence was found in
 * @param start the first token of the occurrence
 * @param end the last token of the occurrence
 */
private Occurrence(Section sentence,Token start,Token end){
    this.start = start.getStart();
    this.end = end.getEnd();
    //by default the whole sentence is used as context
    String context = sentence.getSpan();
    if(context.length() > MAX_CONTEXT_LENGTH){
        //NOTE(review): CONTEXT_TOKEN_COUNT is added to/subtracted from
        //character offsets (this.start / this.end are char indexes), so
        //despite its name the window is measured in characters — confirm
        //the intended unit.
        //NOTE(review): this.start/this.end are offsets in the whole analysed
        //text, but substring(..) is called on the span returned by
        //start.getContext().getSpan(). That is only correct if the context
        //span starts at offset 0 of the text — verify against Token/Section.
        context = start.getContext().getSpan().substring(
            Math.max(0, this.start-CONTEXT_TOKEN_COUNT),
            //NOTE(review): the trailing "-1" is applied to the whole min(..),
            //so it also drops the last character when end+CONTEXT_TOKEN_COUNT
            //is the limiting value — looks like an off-by-one; confirm.
            Math.min(this.end+CONTEXT_TOKEN_COUNT, start.getContext().getEnd())-1);
    }
    this.context = context;
} /**
/**
 * Creates an {@link Occurrence} for the span from the first to the last
 * {@link Token}, using the parsed sentence as context — or, when the sentence
 * exceeds {@code MAX_CONTEXT_LENGTH}, a window cut around the tokens.
 * @param sentence the sentence (section) the occurrence was found in
 * @param start the first token of the occurrence
 * @param end the last token of the occurrence
 */
private Occurrence(Section sentence,Token start,Token end){
    this.start = start.getStart();
    this.end = end.getEnd();
    //by default the whole sentence is used as context
    String context = sentence.getSpan();
    if(context.length() > MAX_CONTEXT_LENGTH){
        //NOTE(review): CONTEXT_TOKEN_COUNT is added to/subtracted from
        //character offsets (this.start / this.end are char indexes), so
        //despite its name the window is measured in characters — confirm
        //the intended unit.
        //NOTE(review): this.start/this.end are offsets in the whole analysed
        //text, but substring(..) is called on the span returned by
        //start.getContext().getSpan(). That is only correct if the context
        //span starts at offset 0 of the text — verify against Token/Section.
        context = start.getContext().getSpan().substring(
            Math.max(0, this.start-CONTEXT_TOKEN_COUNT),
            //NOTE(review): the trailing "-1" is applied to the whole min(..),
            //so it also drops the last character when end+CONTEXT_TOKEN_COUNT
            //is the limiting value — looks like an off-by-one; confirm.
            Math.min(this.end+CONTEXT_TOKEN_COUNT, start.getContext().getEnd())-1);
    }
    this.context = context;
} /**
/**
 * Runs OpenNLP sentence detection over the text of the parsed
 * {@link AnalysedText} and registers every detected sentence with it.
 * @param at the analysed text to segment into sentences
 * @param language the language used to look up a sentence detector
 * @return the sentences added to {@code at}, or {@code null} when no
 * sentence detector is available for the given language (note: {@code null},
 * not an empty list — callers distinguish "no detector" from "no sentences")
 */
private List<Section> detectSentences(AnalysedText at, String language) {
    SentenceDetector detector = getSentenceDetector(language);
    if(detector == null){
        //no detector for this language -> signal with null
        return null;
    }
    List<Section> detected = new ArrayList<Section>();
    for(opennlp.tools.util.Span span : detector.sentPosDetect(at.getSpan())){
        Sentence sent = at.addSentence(span.getStart(), span.getEnd());
        log.trace(" > add {}",sent);
        detected.add(sent);
    }
    return detected;
}
contextElements.add(sentence); String context = at.getSpan().substring( sentences.get(Math.max(0, i-1)).getStart(), sentences.get(Math.min(sentences.size()-1, i+1)).getEnd()); String name = at.getSpan().substring(tokens.get(nameSpans[j].getStart()).getStart(), tokens.get(nameSpans[j].getEnd()-1).getEnd()); Double confidence = 1.0;
if(log.isDebugEnabled()){ log.debug("computeEnhancements for ContentItem {} language {} text={}", new Object []{ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(at.getSpan(), 100)});
new Object []{ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(at.getSpan(), 100)});
if(log.isDebugEnabled()){ log.debug("computeEnhancements for ContentItem {} language {} text={}", new Object []{ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(at.getSpan(), 100)});
SentenceDetector sentenceDetector = getSentenceDetector(language); if(sentenceDetector != null){ for(opennlp.tools.util.Span sentSpan : sentenceDetector.sentPosDetect(at.getSpan())) {
if(log.isDebugEnabled()){ log.debug("computeEnhancements from AnalysedText ContentPart of ContentItem {}: text={}", ci.getUri().getUnicodeString(), StringUtils.abbreviate(at.getSpan(), 100));
content = at.getSpan(); } else { //no analyzed text ... read is from the text/plain blob try {
log.debug("findNamedEntities model={}, language={}, text=", new Object[]{ nameFinderModel, language, StringUtils.abbreviate(at != null ? at.getSpan() : text, 100) });
List<LexicalEntry> terms; try { terms = this.client.performMorfologicalAnalysis(at.getSpan(), language); } catch (IOException e) { throw new EngineException("Error while calling the CELI Lemmatizer"
String phraseText = at.getSpan().substring(sentPhrase.getStartIndex(), sentPhrase.getEndIndex()); metadata.add(new TripleImpl(enh, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(phraseText, lang))); metadata.add(new TripleImpl(enh, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(getSelectionContext( at.getSpan(), phraseText, sentPhrase.getStartIndex()),lang))); } else { metadata.add(new TripleImpl(enh, ENHANCER_SELECTION_CONTEXT,
nerList.get(nerSentIndex).context = lastSent.getSpan(); } else { //no sentence detected nerList.get(nerSentIndex).context = at.getSpan(); IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this); metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl( at.getSpan().substring(nerData.start, nerData.end),lang))); metadata.add(new TripleImpl(ta, DC_TYPE, nerData.tag.getType())); metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(nerData.start)));
metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(context == null ? getDefaultSelectionContext(at.getSpan(), span.getSpan(), span.getStart()) : context.getSpan(), lang)));
graph.add(new TripleImpl(segment, Nif20.endIndex.getUri(), lf.createTypedLiteral(span.getEnd()))); String content = text.getSpan(); if(span.getType() != SpanTypeEnum.Text){