private Occurrence(Section sentence, Token start, Token end) {
    this.start = start.getStart();
    this.end = end.getEnd();
    String context = sentence.getSpan();
    if (context.length() > MAX_CONTEXT_LENGTH) {
        // clip the context to a window around the occurrence, clamped to
        // the bounds of the enclosing section
        context = start.getContext().getSpan().substring(
            Math.max(0, this.start - CONTEXT_TOKEN_COUNT),
            Math.min(this.end + CONTEXT_TOKEN_COUNT, start.getContext().getEnd()) - 1);
    }
    this.context = context;
}
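// Hedged illustration (not part of the engine): the clamping arithmetic used
// above, with made-up offsets. Math.max keeps the window from starting before
// the section; Math.min keeps it from running past the section's end.
int sectionStart = 0, sectionEnd = 120;  // assumed bounds of the enclosing section
int occStart = 40, occEnd = 55;          // assumed occurrence offsets
int pad = 30;                            // assumed CONTEXT_TOKEN_COUNT-style padding
int ctxStart = Math.max(sectionStart, occStart - pad);  // -> 10
int ctxEnd = Math.min(occEnd + pad, sectionEnd);        // -> 85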
@Override
public void reset() throws IOException {
    super.reset();
    Iterator<Sentence> sentences = at.getSentences();
    // if no sentences are annotated, process the whole AnalysedText as a
    // single section
    this.sections = sentences.hasNext() ? sentences
            : Collections.singleton(at).iterator();
    sectionData = null;
    tokenIt = null;
    incrementCount = 0;
    lookupCount = 0;
}
private void logAnnotations(AnalysedText at) {
    Iterator<Span> it = at.getEnclosed(EnumSet.of(SpanTypeEnum.Sentence, SpanTypeEnum.Token));
    while (it.hasNext()) {
        Span span = it.next();
        log.trace(" > {}", span);
        for (Value<PosTag> value : span.getAnnotations(POS_ANNOTATION)) {
            log.trace(" - {}", value);
        }
    }
}
protected AnalyzedTextSentenceTokenizer(AnalysedText at) {
    super(new StringReader(at.getText().toString()));
    this.at = at;
    sentences = at.getSentences();
}
private List<Section> detectSentences(AnalysedText at, String language) {
    SentenceDetector sentenceDetector = getSentenceDetector(language);
    List<Section> sentences;
    if (sentenceDetector != null) {
        sentences = new ArrayList<Section>();
        for (opennlp.tools.util.Span sentSpan : sentenceDetector.sentPosDetect(at.getSpan())) {
            Sentence sentence = at.addSentence(sentSpan.getStart(), sentSpan.getEnd());
            log.trace(" > add {}", sentence);
            sentences.add(sentence);
        }
    } else {
        sentences = null;
    }
    return sentences;
}
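// Hedged sketch: one way getSentenceDetector(language) could be backed by
// OpenNLP. The model path is an assumption for illustration, not taken from
// this codebase. Requires opennlp.tools.sentdetect.{SentenceDetector,
// SentenceDetectorME, SentenceModel}.
SentenceDetector loadSentenceDetector(String modelPath) throws IOException {
    try (InputStream in = new FileInputStream(modelPath)) {
        // SentenceModel parses the serialized model; SentenceDetectorME is
        // the maxent-based detector whose sentPosDetect(..) is used above
        return new SentenceDetectorME(new SentenceModel(in));
    }
}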
TokenStream tokenStream = tokenizerFactory.create(new CharSequenceReader(at.getText()));
// ...
Token token = at.addToken(offset.startOffset(), offset.endOffset());
// ...
Sentence sent = at.addSentence(sentStartOffset, offset.startOffset());
// ...
// start a new Chunk when the NER tag type changes
if (ner != null && (nerTag == null || !ner.tag.getType().equals(nerTag.getType()))) {
    Chunk chunk = at.addChunk(ner.start, ner.end);
    chunk.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(ner.tag));
    // ...
    lastSent = at.addSentence(sentStartOffset, offset.endOffset());
    nerList.get(nerSentIndex).context = lastSent.getSpan();
} else { // no sentence detected
    nerList.get(nerSentIndex).context = at.getSpan();
}
// ...
IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(
        at.getSpan().substring(nerData.start, nerData.end), lang)));
metadata.add(new TripleImpl(ta, DC_TYPE, nerData.tag.getType()));
metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(nerData.start)));
List<Section> sentences = new ArrayList<Section>();
AnalysedTextUtils.appandToList(at.getSentences(), sentences);
if (sentences.isEmpty()) { // no sentence annotations
    // ...
}
// ...
contextElements.add(sentence);
// the context covers the previous, the current and the following sentence
String context = at.getSpan().substring(
        sentences.get(Math.max(0, i - 1)).getStart(),
        sentences.get(Math.min(sentences.size() - 1, i + 1)).getEnd());
// ...
String name = at.getSpan().substring(
        tokens.get(nameSpans[j].getStart()).getStart(),
        tokens.get(nameSpans[j].getEnd() - 1).getEnd());
Double confidence = 1.0;
// ...
nameOccurrences.put(name, occurrences);
// ...
Chunk chunk = at.addChunk(start, end);
+ "JIRA issue about this."); if(!at.getSentences().hasNext()) { //no sentences ... use this engine to detect TokenStream sentences = new SentenceTokenizer(new CharSequenceReader(at.getText())); try { while(sentences.incrementToken()){ OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class); Sentence s = at.addSentence(offset.startOffset(), offset.endOffset()); if(log.isTraceEnabled()) { log.trace("detected {}:{}",s,s.getSpan()); while(tokens.incrementToken()){ OffsetAttribute offset = tokens.addAttribute(OffsetAttribute.class); Token t = at.addToken(offset.startOffset(), offset.endOffset()); log.trace("detected {}",t);
new Object[]{ci.getUri().getUnicodeString(), language,
        StringUtils.abbreviate(at.getSpan(), 100)});
final Blob blob = at.getBlob();
int sentimentCount = 0;
Iterator<Span> spans = at.getEnclosed(EnumSet.of(SpanTypeEnum.Sentence, SpanTypeEnum.Chunk));
Sentence context = null;
Graph metadata = ci.getMetadata();
// ...
// use the enclosing sentence as selection context; fall back to a default
// context window when no sentence is available
metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(
        context == null
                ? getDefaultSelectionContext(at.getSpan(), span.getSpan(), span.getStart())
                : context.getSpan(),
        lang)));
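// Hedged sketch of what a getDefaultSelectionContext(content, selected, pos)
// helper may do when no Sentence is available: cut a fixed-size window around
// the selection and expand it to word boundaries. The window size is an
// assumption for illustration.
String defaultSelectionContext(String content, String selected, int pos) {
    final int window = 50; // assumed padding on each side of the selection
    int begin = Math.max(0, pos - window);
    int end = Math.min(content.length(), pos + selected.length() + window);
    // expand to whitespace so the context does not start or end mid-word
    while (begin > 0 && !Character.isWhitespace(content.charAt(begin - 1))) begin--;
    while (end < content.length() && !Character.isWhitespace(content.charAt(end))) end++;
    return content.substring(begin, end);
}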
    return;
case Sentence:
    span = at.addSentence(spanPos[0], spanPos[1]);
    break;
case Chunk:
    span = at.addChunk(spanPos[0], spanPos[1]);
    break;
case Token:
    span = at.addToken(spanPos[0], spanPos[1]);
    break;
default:
Sentence sentence = sentPhrase.getSentence();
if (log.isDebugEnabled()) { // debug sentiment info
    CharSequence phraseText = at.getText().subSequence(
            sentPhrase.getStartIndex(), sentPhrase.getEndIndex());
    log.debug("Write SentimentPhrase for {} (sentence: {})", phraseText,
            sentence == null ? "none"
                    : sentence.getSpan().length() > 17
                            ? (sentence.getSpan().subSequence(0, 17) + "...")
                            : sentence.getSpan());
}
// ...
String phraseText = at.getSpan().substring(sentPhrase.getStartIndex(), sentPhrase.getEndIndex());
metadata.add(new TripleImpl(enh, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(phraseText, lang)));
metadata.add(new TripleImpl(enh, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(getSelectionContext(
        at.getSpan(), phraseText, sentPhrase.getStartIndex()), lang)));
} else {
    metadata.add(new TripleImpl(enh, ENHANCER_SELECTION_CONTEXT,
Iterator<? extends Section> sentences = at.getSentences();
if (!sentences.hasNext()) { // no sentences ... iterate over the whole text
    sentences = Collections.singleton(at).iterator();
}
// ...
Chunk chunk = at.addChunk(
        tokenList.get(i - chunkTokenCount).getStart(),
        tokenList.get(i - 1).getEnd());
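// Hedged note on why this fallback (also used in reset() above) typechecks:
// in the Stanbol NLP model AnalysedText is itself a Section, so the whole
// text can stand in for a single sentence-like section. An explicit type
// witness makes that visible:
Iterator<? extends Section> single = Collections.<Section>singleton(at).iterator();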
TokenStream sentences = new SentenceTokenizer(new CharSequenceReader(at.getText()));
try {
    sentences.reset();
    OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class);
    while (sentences.incrementToken()) {
        Sentence s = at.addSentence(offset.startOffset(), offset.endOffset());
        if (log.isTraceEnabled()) {
            log.trace("detected {}:{}", s, s.getSpan());
        }
    }
@Override
public int canEnhance(ContentItem ci) throws EngineException {
    String language = getLanguage(this, ci, false);
    if (language == null || textProcessingConfig.getConfiguration(language) == null) {
        log.debug("Engine {} ignores ContentItem {} because language {} is not configured.",
                new Object[]{getName(), ci.getUri(), language});
        return CANNOT_ENHANCE;
    }
    // we need a detected language and the AnalyzedText contentPart with Tokens
    AnalysedText at = getAnalysedText(this, ci, false);
    return at != null && at.getTokens().hasNext() ? ENHANCE_ASYNC : CANNOT_ENHANCE;
}
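// Hedged usage sketch of the Stanbol EnhancementEngine contract as the
// override above uses it: CANNOT_ENHANCE skips the item, ENHANCE_ASYNC
// signals that computeEnhancements(..) may run on a separate thread.
// 'engine' and 'ci' are assumed to be in scope for the example.
if (engine.canEnhance(ci) != EnhancementEngine.CANNOT_ENHANCE) {
    engine.computeEnhancements(ci);
}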
CharSequence tagSequence = at.getText().subSequence(start, end);
log.trace(" > reduce tag {} - no overlap with linkable token", tagSequence);
// ...
CharSequence text = at.getText();
log.trace(" - matchable Span {}{} for Tag {}[{},{}]", new Object[]{
        text.subSequence(mSpan[0], mSpan[1]),
// ...
CharSequence text = at.getText();
log.trace(" - reduce tag {}[{},{}] - matches only {} of {} of matchable Chunk {}[{},{}]",
// ...
CharSequence text = at.getText();
log.trace(" + keep tag {}[{},{}] - matches {} of {} matchable Tokens for matchable Chunk {}[{},{}]",
// ...
CharSequence text = at.getText();
log.trace(" + keep tag {}[{},{}] - matches whole Chunk {}[{},{}]",
        new Object[]{text.subSequence(start, end), start, end,
// ...
CharSequence tagSequence = at.getText().subSequence(start, end);
log.trace(" + keep tag {} - not in processable chunk", tagSequence);