/**
 * Checks the {@link #start} and {@link #end} values against the span covered
 * by the parsed token and widens them if necessary.<p>
 * This method is called by all methods that add tokens.
 * @param token the added token
 */
private void checkSpan(Token token) {
    if(start > token.getStart()){
        start = token.getStart();
    }
    if(end < token.getEnd()){
        end = token.getEnd();
    }
}
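// For illustration, a minimal self-contained sketch of the same span-widening
// pattern using plain int offsets. SpanTracker and its members are hypothetical
// names invented for this sketch, not part of the actual API.
class SpanTracker {
    private int start = Integer.MAX_VALUE;
    private int end = Integer.MIN_VALUE;

    /** Widens the tracked span so it encloses [tokenStart, tokenEnd). */
    void add(int tokenStart, int tokenEnd) {
        if (start > tokenStart) { start = tokenStart; }
        if (end < tokenEnd) { end = tokenEnd; }
    }

    public static void main(String[] args) {
        SpanTracker t = new SpanTracker();
        t.add(10, 14); // span is now [10,14)
        t.add(4, 8);   // widened to [4,14)
        t.add(12, 20); // widened to [4,20)
        System.out.println(t.start + ".." + t.end); // prints "4..20"
    }
}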
@Override
public Chunk createChunk(Token start, Token end) {
    if(start == null || end == null){
        throw new IllegalArgumentException("Parsed start Token '" + start
            + "' and end Token '" + end + "' MUST NOT be NULL!");
    }
    lock.writeLock().lock();
    try {
        return at.addChunk(start.getStart(), end.getEnd());
    } finally {
        lock.writeLock().unlock();
    }
}
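// createChunk uses the standard lock-in-try/finally idiom of
// java.util.concurrent.locks.ReadWriteLock. Below is a self-contained sketch of
// that pattern; ChunkStore and its int[]-based chunks are illustrative
// stand-ins, not the actual AnalysedText internals.
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;

class ChunkStore {
    private final ReadWriteLock lock = new ReentrantReadWriteLock();
    private final List<int[]> chunks = new ArrayList<>();

    int[] addChunk(int start, int end) {
        lock.writeLock().lock(); //exclusive access while mutating
        try {
            int[] chunk = new int[]{start, end};
            chunks.add(chunk);
            return chunk;
        } finally {
            lock.writeLock().unlock(); //always released, even on exception
        }
    }

    int size() {
        lock.readLock().lock(); //shared access is enough for readers
        try {
            return chunks.size();
        } finally {
            lock.readLock().unlock();
        }
    }
}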
/**
 * Every time the entityLinker starts to process a token we need to check
 * if we need to add additional contextual information from the {@link ContentItem}
 * to the {@link InMemoryEntityIndex}.
 */
@Override
public void startToken(Token token) {
    log.debug(" > start token: {}", token);
    final Integer actIndex = token.getStart();
    if(actIndex > lastIndex){
        //add all mentions starting in [lastIndex, actIndex) to the index
        for(Collection<EntityMention> mentions : mentionIndex.subMap(lastIndex, actIndex).values()){
            for(EntityMention mention : mentions){
                addEntity(mention);
            }
        }
        lastIndex = actIndex;
    } else if(lastIndex > actIndex){
        log.warn("Token {} has an earlier start index than the last one {}!",
            token, lastIndex);
    } // else the same index ... ignore
}
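// The subMap(..) call is the interesting part: mentions are indexed by start
// offset in a sorted map, so all mentions between the last processed offset
// (inclusive) and the current one (exclusive) can be drained via a single
// range view. A minimal sketch, assuming mentions reduced to plain Strings;
// MentionIndexSketch and its members are hypothetical.
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.TreeMap;

class MentionIndexSketch {
    private final TreeMap<Integer, Collection<String>> mentionIndex = new TreeMap<>();
    private int lastIndex = 0;

    void index(int startOffset, String mention) {
        mentionIndex.computeIfAbsent(startOffset, k -> new ArrayList<>()).add(mention);
    }

    /** Returns every mention starting in [lastIndex, actIndex). */
    List<String> startToken(int actIndex) {
        List<String> added = new ArrayList<>();
        if (actIndex > lastIndex) {
            for (Collection<String> mentions : mentionIndex.subMap(lastIndex, actIndex).values()) {
                added.addAll(mentions);
            }
            lastIndex = actIndex;
        }
        return added;
    }
}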
/**
 * Creates a new Linked Entity including the first {@link Occurrence}.
 * @param section the sentence (context) for the occurrence.
 * @param startToken the first token of the span
 * @param endToken the last token of the span
 * @param suggestions the entity suggestions
 * @param types the types of the linked entity.
 */
protected LinkedEntity(Section section, Token startToken, Token endToken,
        List<Suggestion> suggestions, Set<IRI> types) {
    this(startToken.getSpan().substring(startToken.getStart(), endToken.getEnd()),
        suggestions, types);
    addOccurrence(section, startToken, endToken);
}
/**
 * Getter for the text covered by <code>tokenCount</code> tokens starting at
 * the token with index <code>start</code>.
 * Given the tokens
 * <pre>
 *     [This, is, an, Example]
 * </pre>
 * the parameters <code>start=0</code> and <code>tokenCount=3</code> will return
 * <pre>
 *     This is an
 * </pre>
 * @param start the index of the first token to be included
 * @param tokenCount the number of tokens to be included
 * @return the text covered by the span from the start of the token at index
 * <code>start</code> to the end of the token at index
 * <code>start + tokenCount - 1</code>.
 */
public String getTokenText(int start, int tokenCount){
    int offset = section.getStart(); //token offsets are absolute, the section span is not
    return section.getSpan().substring(
        tokens.get(start).token.getStart() - offset,
        tokens.get(start + (tokenCount - 1)).token.getEnd() - offset);
}
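// A minimal, runnable sketch of the same offset arithmetic, using int arrays
// instead of the actual Section/Token types (all names and offsets here are
// illustrative assumptions):
class TokenTextSketch {
    public static void main(String[] args) {
        String document = "Intro. This is an Example.";
        int sectionStart = 7; //the section "This is an Example." starts here
        String sectionSpan = document.substring(sectionStart);
        //absolute [start,end) character offsets of the four tokens
        int[][] tokens = {{7, 11}, {12, 14}, {15, 17}, {18, 25}};

        int start = 0, tokenCount = 3;
        //token offsets are absolute, so subtract the section start
        String text = sectionSpan.substring(
            tokens[start][0] - sectionStart,
            tokens[start + tokenCount - 1][1] - sectionStart);
        System.out.println(text); // prints "This is an"
    }
}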
private Occurrence(Section sentence, Token start, Token end){
    this.start = start.getStart();
    this.end = end.getEnd();
    String context = sentence.getSpan();
    if(context.length() > MAX_CONTEXT_LENGTH){
        //sentence too long: fall back to a window around the occurrence
        context = start.getContext().getSpan().substring(
            Math.max(0, this.start - CONTEXT_TOKEN_COUNT),
            Math.min(this.end + CONTEXT_TOKEN_COUNT,
                start.getContext().getEnd()) - 1);
    }
    this.context = context;
}
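// The constructor caps overlong sentence contexts by falling back to a window
// around the occurrence. A sketch of that truncation pattern with plain
// Strings; ContextWindowSketch, CONTEXT_PAD and the constant values are
// assumptions made for this sketch.
class ContextWindowSketch {
    static final int MAX_CONTEXT_LENGTH = 20;
    static final int CONTEXT_PAD = 5; //plays the role of CONTEXT_TOKEN_COUNT

    static String context(String fullText, int start, int end) {
        String context = fullText;
        if (context.length() > MAX_CONTEXT_LENGTH) {
            context = fullText.substring(
                Math.max(0, start - CONTEXT_PAD),               //never before the text
                Math.min(end + CONTEXT_PAD, fullText.length())); //never past its end
        }
        return context;
    }

    public static void main(String[] args) {
        String text = "The quick brown fox jumps over the lazy dog";
        System.out.println(context(text, 16, 19)); // prints "rown fox jump"
    }
}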
if(cursorToken.token.getStart() < endOffset){
phraseType.getPhraseType().name(), current.get(0).getStart(), current.get(current.size()-1).getEnd()});
//nameSpans address token indices; map them back to character offsets
String name = at.getSpan().substring(
    tokens.get(nameSpans[j].getStart()).getStart(),
    tokens.get(nameSpans[j].getEnd() - 1).getEnd());
Double confidence = 1.0;
confidence *= probs[k];
int start = tokens.get(nameSpans[j].getStart()).getStart();
int end = start + name.length();
NerTag nerTag = config.getNerTag(nameSpans[j].getType());
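// OpenNLP name spans address token indices, not characters: getStart()/getEnd()
// select tokens, which must be mapped back to character offsets to extract the
// surface form. A sketch of that conversion; the text and token offsets are
// hard-coded assumptions, only opennlp.tools.util.Span is the real API.
import opennlp.tools.util.Span;

class NameSpanSketch {
    public static void main(String[] args) {
        String text = "Barack Obama visited Berlin";
        //absolute [start,end) character offsets of the four tokens
        int[][] tokenOffsets = {{0, 6}, {7, 12}, {13, 20}, {21, 27}};
        Span nameSpan = new Span(0, 2, "person"); //tokens 0..1 -> "Barack Obama"

        int start = tokenOffsets[nameSpan.getStart()][0];
        int end = tokenOffsets[nameSpan.getEnd() - 1][1];
        String name = text.substring(start, end);
        System.out.println(name + " [" + nameSpan.getType() + "]"); // Barack Obama [person]
    }
}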
linkableTokenContext = linkableTokens.isEmpty() ? null : linkableTokens.get(0);
if(linkableTokenContext == null
        || linkableTokenContext.linkableToken.token.getStart() >= end){
    // ...
}
// ...
//clip the matched span to the borders of the linkable token
int[] mSpan = new int[]{
    Math.max(start, linkableToken.token.getStart()),
    Math.min(end, linkableToken.token.getEnd())};
if(mSpan[0] > start){
    //search backwards for tokens before the match
    for(int i = linkableToken.index - 1; i >= 0; i--){
        TokenData token = tokens.get(i);
        int tStart = token.token.getStart();
        if(tStart < start){
            break;
        }
        // ...
    }
}
// ...
if(td.isMatchable){
    num++;
    if(match < 1 && td.token.getStart() >= start
            || match > 0 && td.token.getEnd() <= end){
        match++;
    }
}
while((token = nextToken(first)) != null){
    log.trace(" < [{},{}]:{} (link {}, match: {})", new Object[]{
        token.token.getStart(), token.token.getEnd(), token.getTokenText(),
        token.isLinkable, token.isMatchable});
    first = false;
    // ...
}
// ...
log.trace("lookup: token [{},{}]: {} | word [{},{}]:{}", new Object[]{
    offset.startOffset(), offset.endOffset(), termAtt,
    t.token.getStart(), t.token.getEnd(), t.getTokenText()});
activeChunk.matchableStartCharIndex = tokenData.token.getStart();
this.value = value;
this.sentence = sentence;
this.start = token.getStart();
this.end = token.getEnd();
List<Value<PosTag>> tags = token.getAnnotations(NlpAnnotations.POS_ANNOTATION);
upperCase = token.getEnd() > token.getStart() //not an empty token
    && Character.isUpperCase(token.getSpan().codePointAt(0)); //and upper case
boolean isLinkablePos = false;
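// Using codePointAt(0) instead of charAt(0) keeps the check correct for
// letters outside the Basic Multilingual Plane. A small sketch of the same
// test (UpperCaseSketch is a hypothetical name):
class UpperCaseSketch {
    static boolean startsUpperCase(String span) {
        return !span.isEmpty() && Character.isUpperCase(span.codePointAt(0));
    }

    public static void main(String[] args) {
        System.out.println(startsUpperCase("Berlin"));     // true
        System.out.println(startsUpperCase("berlin"));     // false
        System.out.println(startsUpperCase("Österreich")); // true, non-ASCII upper case
    }
}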