/**
 * Builds a short, human readable description of this element in the form
 * <code>[index,token] chunk: chunk| sentence: text</code>.
 * <code>none</code> is printed in place of a missing chunk or a missing
 * section. Section texts longer than 45 characters are cut after the
 * 45th character and marked with a trailing <code> ...</code>.
 * @return the description of this element
 */
@Override
public String toString() {
    final StringBuilder text = new StringBuilder();
    text.append('[').append(token.index).append(',').append(token.token).append("] chunk: ");
    if (token.inChunk != null) {
        text.append(token.inChunk.chunk);
    } else {
        text.append("none");
    }
    text.append("| sentence: ");
    if (section == null) {
        return text.append("none").toString();
    }
    if (section.getSpan().length() <= 45) {
        // short enough to be printed completely
        return text.append(section.getSpan()).toString();
    }
    // only print the first 45 characters of long sections
    return text.append(section.getSpan().substring(0, 45)).append(" ...").toString();
}
/**
 * Builds a short, human readable description of this element in the form
 * <code>[index,token] chunk: chunk| sentence: text</code>.
 * <code>none</code> is printed in place of a missing chunk or a missing
 * section. Section texts longer than 45 characters are cut after the
 * 45th character and marked with a trailing <code> ...</code>.
 * @return the description of this element
 */
@Override
public String toString() {
    final StringBuilder text = new StringBuilder();
    text.append('[').append(token.index).append(',').append(token.token).append("] chunk: ");
    if (token.inChunk != null) {
        text.append(token.inChunk.chunk);
    } else {
        text.append("none");
    }
    text.append("| sentence: ");
    if (section == null) {
        return text.append("none").toString();
    }
    if (section.getSpan().length() <= 45) {
        // short enough to be printed completely
        return text.append(section.getSpan()).toString();
    }
    // only print the first 45 characters of long sections
    return text.append(section.getSpan().substring(0, 45)).append(" ...").toString();
}
private List<Token> tokenize(Section section,String langauge) { Tokenizer tokenizer = getTokenizer(langauge); String text = section.getSpan(); List<Token> tokens = new ArrayList<Token>(text.length()/5); //assume avr. token length is 5 opennlp.tools.util.Span[] tokenSpans = tokenizer.tokenizePos(section.getSpan()); for(int i=0;i<tokenSpans.length;i++){ Token token = section.addToken(tokenSpans[i].getStart(), tokenSpans[i].getEnd()); log.trace(" > add {}",token); tokens.add(token); } return tokens; }
/**
 * Getter for the text covered by <code>tokenCount</code> consecutive
 * tokens beginning with the token at index <code>start</code>.
 * Given the Tokens
 * <pre>
 *    [This, is, an, Example]
 * </pre>
 * and the parameters <code>0</code> and <code>3</code> this method will
 * return
 * <pre>
 *     This is an
 * </pre>
 * The text is cut out of the span of the section. Token offsets are
 * converted to positions within that span by subtracting the start of the
 * section (assumes token and section offsets share the same base -
 * TODO confirm).
 * @param start the index of the first token to include
 * @param tokenCount the number of tokens to include, counted from
 * <code>start</code>
 * @return the text from the start of the token at <code>start</code> to
 * the end of the token at <code>start+tokenCount-1</code>
 */
public String getTokenText(int start, int tokenCount){
    final int sectionStart = section.getStart();
    final String sectionText = section.getSpan();
    final int from = tokens.get(start).token.getStart() - sectionStart;
    final int to = tokens.get(start + tokenCount - 1).token.getEnd() - sectionStart;
    return sectionText.substring(from, to);
}
/**
 * Getter for the text covered by <code>tokenCount</code> consecutive
 * tokens beginning with the token at index <code>start</code>.
 * Given the Tokens
 * <pre>
 *    [This, is, an, Example]
 * </pre>
 * and the parameters <code>0</code> and <code>3</code> this method will
 * return
 * <pre>
 *     This is an
 * </pre>
 * The text is cut out of the span of the section. Token offsets are
 * converted to positions within that span by subtracting the start of the
 * section (assumes token and section offsets share the same base -
 * TODO confirm).
 * @param start the index of the first token to include
 * @param tokenCount the number of tokens to include, counted from
 * <code>start</code>
 * @return the text from the start of the token at <code>start</code> to
 * the end of the token at <code>start+tokenCount-1</code>
 */
public String getTokenText(int start, int tokenCount){
    final int sectionStart = section.getStart();
    final String sectionText = section.getSpan();
    final int from = tokens.get(start).token.getStart() - sectionStart;
    final int to = tokens.get(start + tokenCount - 1).token.getEnd() - sectionStart;
    return sectionText.substring(from, to);
}
/**
 * Creates an occurrence spanning from the start of the first token to the
 * end of the last token. The span of the parsed sentence is used as
 * context unless it is longer than <code>MAX_CONTEXT_LENGTH</code>. In
 * that case a window of <code>CONTEXT_TOKEN_COUNT</code> characters
 * around the occurrence is cut out of the span of the context of the
 * start token.
 * @param sentence the sentence providing the default context
 * @param start the first token of the occurrence
 * @param end the last token of the occurrence
 */
private Occurrence(Section sentence,Token start,Token end){
    this.start = start.getStart();
    this.end = end.getEnd();
    String excerpt = sentence.getSpan();
    if (excerpt.length() > MAX_CONTEXT_LENGTH) {
        final String contextSpan = start.getContext().getSpan();
        // NOTE(review): the token offsets are used directly as indexes into
        // the context span - assumes both share the same base; confirm
        final int from = Math.max(0, this.start - CONTEXT_TOKEN_COUNT);
        // NOTE(review): the upper bound is reduced by one after clamping it
        // to the end of the context - intent not visible here; confirm
        final int to = Math.min(this.end + CONTEXT_TOKEN_COUNT, start.getContext().getEnd()) - 1;
        excerpt = contextSpan.substring(from, to);
    }
    this.context = excerpt;
}
/**
/**
 * Creates an occurrence spanning from the start of the first token to the
 * end of the last token. The span of the parsed sentence is used as
 * context unless it is longer than <code>MAX_CONTEXT_LENGTH</code>. In
 * that case a window of <code>CONTEXT_TOKEN_COUNT</code> characters
 * around the occurrence is cut out of the span of the context of the
 * start token.
 * @param sentence the sentence providing the default context
 * @param start the first token of the occurrence
 * @param end the last token of the occurrence
 */
private Occurrence(Section sentence,Token start,Token end){
    this.start = start.getStart();
    this.end = end.getEnd();
    String excerpt = sentence.getSpan();
    if (excerpt.length() > MAX_CONTEXT_LENGTH) {
        final String contextSpan = start.getContext().getSpan();
        // NOTE(review): the token offsets are used directly as indexes into
        // the context span - assumes both share the same base; confirm
        final int from = Math.max(0, this.start - CONTEXT_TOKEN_COUNT);
        // NOTE(review): the upper bound is reduced by one after clamping it
        // to the end of the context - intent not visible here; confirm
        final int to = Math.min(this.end + CONTEXT_TOKEN_COUNT, start.getContext().getEnd()) - 1;
        excerpt = contextSpan.substring(from, to);
    }
    this.context = excerpt;
}
/**
Section section = sections.next(); opennlp.tools.util.Span[] tokenSpans = tokenizer.tokenizePos(section.getSpan()); for(int i=0;i<tokenSpans.length;i++){ Token token = section.addToken(tokenSpans[i].getStart(), tokenSpans[i].getEnd());
String sentence = sentences.get(i).getSpan();
throw new EngineException("Missing POS value for Token '" + token.getSpan()+"' of ContentItem "+ci.getUri() + "(Sentence: '"+sentence.getSpan()+"'). This may " + "indicate that a POS tagging Engine is missing in " + "the EnhancementChain or that the used POS tagging "
if(span.getStart() >= span.getEnd()){ //save guard against empty spans log.warn("Detected Empty Span {} in section {}: '{}'", new Object[]{span,section, section.getSpan()});
if(span.getStart() >= span.getEnd()){ //save guard against empty spans log.warn("Detected Empty Span {} in section {}: '{}'", new Object[]{span,section, section.getSpan()});