/**
 * Reports whether another gram is available.
 *
 * If no gram is currently buffered in {@code nextGram}, one is computed with
 * {@code findNext()} and kept there so a following {@code next()} can hand it out.
 *
 * @return true if a gram is buffered after this call, false if {@code findNext()} returned null
 */
@Override
public boolean hasNext() {
    if (nextGram == null) {
        // Nothing buffered yet: look ahead once and remember the result
        nextGram = findNext();
    }
    return nextGram != null;
}
/**
 * Looks up the token type whose int code, as returned from {@link #getValue}, equals the given value.
 *
 * @param value the int code to translate
 * @return the first declared type carrying that code, or {@code UNKNOWN} if no type matches
 */
public static TokenType valueOf(int value) {
    TokenType[] candidates = values();
    for (int idx = 0; idx < candidates.length; idx++) {
        TokenType candidate = candidates[idx];
        if (candidate.value == value) {
            return candidate;
        }
    }
    return UNKNOWN;
}
private void findStems(Token token, List<StemList> out) { int len; if (token.isSpecialToken() || (len = token.getNumComponents()) == 0) { if (token.isIndexable()) { StemList word = new StemList(); word.add(token.getTokenString()); // takes care of getStem(0) for (int i = 1; i < token.getNumStems(); i++) { word.add(token.getStem(i)); } out.add(word); } } else { for (int i = 0; i < len; ++i) { findStems(token.getComponent(i), out); } } } }
/**
 * Walks the token tree depth-first and appends the original text of every indexable leaf to {@code out}.
 *
 * A token is handled as a leaf when it is a special token or reports zero components;
 * otherwise each of its components is visited recursively, in order.
 *
 * @param token the token (or component) to inspect
 * @param out   the list receiving the segment strings
 */
private void findSegments(Token token, List<String> out) {
    // Special tokens are never descended into, so their component count is not even queried
    int componentCount = token.isSpecialToken() ? 0 : token.getNumComponents();
    if (componentCount == 0) {
        if (token.isIndexable()) {
            out.add(token.getOrig());
        }
        return;
    }
    for (int c = 0; c < componentCount; ++c) {
        findSegments(token.getComponent(c), out);
    }
}
/**
 * Runs a raw token through the processing pipeline: normalization, lowercasing,
 * optional accent dropping and optional stemming, in that order.
 *
 * @param token         the raw token text
 * @param language      the language passed to accent dropping
 * @param stemMode      stemming is applied for every mode except {@code StemMode.NONE}
 * @param removeAccents whether accents are dropped
 * @return the processed token
 */
private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents) {
    String normalized = normalizer.normalize(token);
    String result = LinguisticsCase.toLowerCase(normalized);
    if (removeAccents) {
        result = transformer.accentDrop(result, language);
    }
    if (stemMode != StemMode.NONE) {
        result = stemmer.stem(result);
    }
    return result;
}
/**
 * Convenience method which drains the remaining items of this iterator and
 * extracts each gram's string from the input.
 *
 * @return an unmodifiable list of the extracted gram strings, in iteration order
 */
public List<String> toExtractedList() {
    List<String> extracted = new ArrayList<>();
    while (hasNext()) {
        String gramText = next().extractFrom(input);
        extracted.add(gramText);
    }
    return Collections.unmodifiableList(extracted);
}
}
@Override public List<String> segment(String input, Language language) { List<String> segments = new ArrayList<>(); for (Token token : tokenizer.tokenize(input, language, StemMode.NONE, false)) { findSegments(token, segments); } if (segments.isEmpty()) { segments.add(input); // no segments, return original string } return segments; }
/**
 * Finds the first position in the string holding a character that is not a letter or digit
 * according to {@code characterClasses}.
 *
 * The string is scanned one Unicode code point at a time. A supplementary character occupies
 * two UTF-16 units, so the scan steps over both of them; examining the trailing surrogate on
 * its own would misreport the second half of a valid letter as a non-word character.
 *
 * @param s the string to scan
 * @return the UTF-16 index at which the first non-word code point starts, or -1 if there is none
 */
private int indexOfNonWordChar(String s) {
    int i = 0;
    while (i < s.length()) {
        int codePoint = s.codePointAt(i);
        if (!characterClasses.isLetterOrDigit(codePoint)) {
            return i;
        }
        // Advance by 1 for BMP characters and by 2 for surrogate pairs
        i += Character.charCount(codePoint);
    }
    return -1;
}
/**
 * Sets the stem mode from its name, resolved through {@code StemMode.valueOf(name)}.
 *
 * @param name the name of the stem mode to use
 * @return this, for chaining
 */
public AnnotatorConfig setStemMode(String name) {
    StemMode resolved = StemMode.valueOf(name);
    this.stemMode = resolved;
    return this;
}
/**
 * Creates an instance wired with the simple implementations of each component:
 * a normalizer, a transformer, a detector, character classes, and a gram splitter
 * built on those character classes.
 */
@Inject
@SuppressWarnings("deprecation")
public SimpleLinguistics() {
    normalizer = new SimpleNormalizer();
    transformer = new SimpleTransformer();
    detector = new SimpleDetector();
    characterClasses = new CharacterClasses();
    // Must come after characterClasses is assigned, since the splitter is built from it
    gramSplitter = new GramSplitter(characterClasses);
}
/**
 * Returns true if this is a latin digit
 * (other digits are not consistently parsed into numbers by Java).
 *
 * @param c the code point to test
 * @return true only when {@code c} is a digit and {@code isLatin(c)} holds
 */
public boolean isLatinDigit(int c) {
    if (!Character.isDigit(c)) {
        return false;
    }
    return isLatin(c);
}
public StemList(String... stems) { super(); this.stems = new ArrayList<>(Math.max(stems.length, 3)); for (String word : stems) { add(word); } }
/**
 * Creates a segmenter backed by the tokenizer obtained from {@code getTokenizer()}.
 *
 * @return a new {@code SegmenterImpl} on every call
 */
@Override
public Segmenter getSegmenter() {
    return new SegmenterImpl(
            getTokenizer()
    );
}
/**
 * Runs a raw token through the processing pipeline: normalization, lowercasing,
 * optional accent dropping and optional stemming with the supplied stemmer, in that order.
 *
 * @param token         the raw token text
 * @param language      the language passed to accent dropping
 * @param stemMode      stemming is applied for every mode except {@code StemMode.NONE}
 * @param removeAccents whether accents are dropped
 * @param stemmer       the stemmer handed to {@code doStemming}
 * @return the processed token
 */
private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents,
                            Stemmer stemmer) {
    String result = LinguisticsCase.toLowerCase(normalizer.normalize(token));
    if (removeAccents) {
        result = transformer.accentDrop(result, language);
    }
    boolean stemmingRequested = stemMode != StemMode.NONE;
    if (stemmingRequested) {
        result = doStemming(result, stemmer);
    }
    return result;
}
/**
 * Checks whether the last {@code count} characters of the buffer are all letters or digits
 * according to {@code characterClasses}.
 *
 * @param count the number of trailing characters to check
 * @param b     the buffer to inspect, from its end backwards
 * @return false if the buffer holds fewer than {@code count} characters or any checked
 *         character is not a letter or digit; true otherwise (including when count is not positive)
 */
private boolean areWordCharactersBackwards(int count, StringBuilder b) {
    for (int back = 0; back < count; back++) {
        int position = b.length() - back - 1;
        boolean beforeStart = position < 0;
        if (beforeStart || !characterClasses.isLetterOrDigit(b.charAt(position))) {
            return false;
        }
    }
    return true;
}
/**
 * Sets the stem mode from its name, resolved through {@code StemMode.valueOf(name)}.
 *
 * @param name the name of the stem mode to use
 */
public void setStemMode(String name) {
    StemMode resolved = StemMode.valueOf(name);
    this.stemMode = resolved;
}
/**
 * Returns the next gram and clears the lookahead buffer.
 *
 * Uses the gram buffered in {@code nextGram} when there is one, otherwise computes
 * one with {@code findNext()}.
 *
 * @return the next gram
 * @throws NoSuchElementException if no further gram can be found
 */
@Override
public Gram next() {
    Gram result = (nextGram != null) ? nextGram : findNext();
    if (result == null) {
        throw new NoSuchElementException("No next gram at position " + i);
    }
    // The buffered gram, if any, has now been consumed
    nextGram = null;
    return result;
}
/**
 * Tells whether the value returned by {@code characters.peek()} is a letter or digit
 * according to {@code characterClasses}.
 *
 * @param characters the iterator to peek into
 * @return true if the peeked character is a letter or digit
 */
private boolean nextIsLetterOrDigit(MatchTokenStrippingCharacterIterator characters) {
    return characterClasses.isLetterOrDigit(
            characters.peek()
    );
}