/**
 * Creates a span in the given span list and tags it with a token type annotation.
 *
 * @param from      start position passed to {@code spanList.span}
 * @param length    length passed to {@code spanList.span}
 * @param tokenType the token type whose {@code getValue()} code is stored in the annotation
 * @param spanList  the list asked to create the span
 * @return the node returned by the annotate call, cast to {@link Span}
 */
private Span typedSpan(int from, int length, TokenType tokenType, SpanList spanList) {
    // The span is created first, then the token type code is read - same order as a single chained call.
    final Object annotatedNode = spanList.span(from, length)
                                         .annotate(AnnotationTypes.TOKEN_TYPE, tokenType.getValue());
    return (Span) annotatedNode;
}
/**
 * Tells whether this should be indexed.
 *
 * @return true only if the type reports itself as indexable and the original text is non-empty
 */
@Override
public boolean isIndexable() {
    // Non-indexable types are rejected before the original text is even looked at.
    if (!getType().isIndexable()) {
        return false;
    }
    return getOrig().length() > 0;
}
/**
 * Looks up the token type matching an int code, i.e. the inverse of {@link #getValue}.
 *
 * @param value the int code representation of a token type
 * @return the first type whose code equals {@code value}, or {@code UNKNOWN} if none matches
 */
public static TokenType valueOf(int value) {
    final TokenType[] candidates = values();
    for (int i = 0; i < candidates.length; i++) {
        final TokenType candidate = candidates[i];
        if (value == candidate.value) {
            return candidate;
        }
    }
    // No declared type carries this code.
    return UNKNOWN;
}
if (!getType().equals(rhs.getType())) { return false;
/**
 * Splits the input into tokens based on the type of each code point.
 *
 * A run of consecutive code points whose types are indexable becomes a single token, while every
 * code point with a non-indexable type becomes a token of its own. Each token records the original
 * substring, its char offset into {@code input}, the type of its first code point and the string
 * produced by {@code processToken}.
 *
 * @param input         the text to tokenize
 * @param language      passed on to {@code processToken}
 * @param stemMode      passed on to {@code processToken}
 * @param removeAccents passed on to {@code processToken}
 * @return the tokens in input order; an empty list for empty input
 */
@Override
public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
    if (input.isEmpty()) {
        return Collections.emptyList();
    }

    List<Token> result = new ArrayList<>();
    int codePoint = input.codePointAt(0);
    TokenType currentType = SimpleTokenType.valueOf(codePoint);
    int tokenStart = 0;
    int cursor = Character.charCount(codePoint);

    while (cursor <= input.length()) {
        // One step past the end we pretend to see SPACE_CODE (presumably a non-indexable
        // code point) so that the last token is emitted as well.
        codePoint = cursor < input.length() ? input.codePointAt(cursor) : SPACE_CODE;
        TokenType upcomingType = SimpleTokenType.valueOf(codePoint);

        boolean withinIndexableRun = currentType.isIndexable() && upcomingType.isIndexable();
        if (!withinIndexableRun) {
            String original = input.substring(tokenStart, cursor);
            String processed = processToken(original, language, stemMode, removeAccents);
            Token emitted = new SimpleToken(original).setOffset(tokenStart)
                                                     .setType(currentType)
                                                     .setTokenString(processed);
            result.add(emitted);
            tokenStart = cursor;
            currentType = upcomingType;
        }

        // Advance by whole code points so surrogate pairs are never split.
        cursor += Character.charCount(codePoint);
    }
    return result;
}
/**
 * Attaches a linguistics span tree carrying the lowercased form of the current string value.
 *
 * The value in the context is replaced by a clone of the input. The clone gets a
 * {@code SpanTrees.LINGUISTICS} tree with a single span covering the whole string, annotated with
 * a TERM annotation (holding the lowercased string, or a null value when lowercasing changes
 * nothing) and a TOKEN_TYPE annotation holding the ALPHABETIC type code. Empty strings are left
 * untouched.
 *
 * @param ctx the execution context whose current value is read and replaced
 */
@Override
protected void doExecute(ExecutionContext ctx) {
    StringFieldValue input = (StringFieldValue)ctx.getValue();
    if (input.getString().isEmpty()) {
        return;
    }

    StringFieldValue output = input.clone();
    ctx.setValue(output);

    String original = output.getString();
    String lowered = toLowerCase(original);

    SpanList root = new SpanList();
    SpanTree tree = new SpanTree(SpanTrees.LINGUISTICS, root);
    SpanNode wholeString = new Span(0, original.length());

    // The term only carries a value when lowercasing actually changed the text.
    StringFieldValue termValue = null;
    if (!lowered.equals(original)) {
        termValue = new StringFieldValue(lowered);
    }
    tree.annotate(wholeString, new Annotation(AnnotationTypes.TERM, termValue));

    IntegerFieldValue typeValue = new IntegerFieldValue(TokenType.ALPHABETIC.getValue());
    tree.annotate(wholeString, new Annotation(AnnotationTypes.TOKEN_TYPE, typeValue));

    root.add(wholeString);
    output.setSpanTree(tree);
}
/**
 * Splits the input into tokens and processes each one with the stemmer for the given language.
 *
 * When {@code getStemmerForLanguage} yields no stemmer, the work is delegated to
 * {@code simpleTokenizer}. Otherwise a run of consecutive code points whose types are indexable
 * becomes a single token, while every code point with a non-indexable type becomes a token of its
 * own. Each token records the original substring, its char offset into {@code input}, the type of
 * its first code point and the string produced by {@code processToken}.
 *
 * @param input         the text to tokenize
 * @param language      used to select the stemmer and passed on to {@code processToken}
 * @param stemMode      used to select the stemmer and passed on to {@code processToken}
 * @param removeAccents passed on to {@code processToken}
 * @return the tokens in input order; an empty list for empty input
 */
@Override
public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) {
    if (input.isEmpty()) {
        return Collections.emptyList();
    }

    Stemmer stemmer = getStemmerForLanguage(language, stemMode);
    if (stemmer == null) {
        // No stemmer for this language/mode: let the simple tokenizer handle everything.
        return simpleTokenizer.tokenize(input, language, stemMode, removeAccents);
    }

    List<Token> result = new ArrayList<>();
    int codePoint = input.codePointAt(0);
    TokenType currentType = SimpleTokenType.valueOf(codePoint);
    int tokenStart = 0;
    int cursor = Character.charCount(codePoint);

    while (cursor <= input.length()) {
        // One step past the end we pretend to see SPACE_CODE (presumably a non-indexable
        // code point) so that the last token is emitted as well.
        codePoint = cursor < input.length() ? input.codePointAt(cursor) : SPACE_CODE;
        TokenType upcomingType = SimpleTokenType.valueOf(codePoint);

        boolean withinIndexableRun = currentType.isIndexable() && upcomingType.isIndexable();
        if (!withinIndexableRun) {
            String original = input.substring(tokenStart, cursor);
            String processed = processToken(original, language, stemMode, removeAccents, stemmer);
            Token emitted = new SimpleToken(original).setOffset(tokenStart)
                                                     .setType(currentType)
                                                     .setTokenString(processed);
            result.add(emitted);
            tokenStart = cursor;
            currentType = upcomingType;
        }

        // Advance by whole code points so surrogate pairs are never split.
        cursor += Character.charCount(codePoint);
    }
    return result;
}