/**
 * Create a new span that starts in the given tokenizer state.
 *
 * <p>Only the ordinal of {@code s} is recorded; callers compare spans to
 * tokenizer states by ordinal value (see {@code stateIndex()} usage).
 *
 * @param s the state at the start.
 */
State(TokenizerState s) { this.stateindex = s.ordinal(); }
/**
 * Create a new span that starts in the given tokenizer state.
 *
 * <p>Only the ordinal of {@code s} is recorded; callers compare spans to
 * tokenizer states by ordinal value (see {@code stateIndex()} usage).
 *
 * @param s the state at the start.
 */
State(TokenizerState s) { this.stateindex = s.ordinal(); }
/**
 * Create a new span that starts in the given tokenizer state.
 *
 * <p>Only the ordinal of {@code s} is recorded; callers compare spans to
 * tokenizer states by ordinal value (see {@code stateIndex()} usage).
 *
 * @param s the state at the start.
 */
State(TokenizerState s) { this.stateindex = s.ordinal(); }
@Override public Pair<String[], IntPair[]> tokenizeSentence(String sentence) { // parse the test TokenizerStateMachine tsm = new TokenizerStateMachine(splitOnDash, splitOnSecondNewline); tsm.parseText(sentence); // construct the data needed for the tokenization. int words = 0; for (State s : tsm.completed) { int idx = s.stateIndex(); if (idx != TokenizerState.IN_SENTENCE.ordinal()) words++; } IntPair[] wordOffsets = new IntPair[words]; String[] tokens = new String[words]; int wordIndex = 0; for (State s : tsm.completed) { State ms = (State) s; if (s.stateIndex() != TokenizerState.IN_SENTENCE.ordinal()) { tokens[wordIndex] = new String(tsm.text, ms.start, ms.end - ms.start); wordOffsets[wordIndex++] = new IntPair(ms.start, ms.end); } } return new Pair<>(tokens, wordOffsets); }
@Override public Pair<String[], IntPair[]> tokenizeSentence(String sentence) { // parse the test TokenizerStateMachine tsm = new TokenizerStateMachine(splitOnDash, splitOnSecondNewline); tsm.parseText(sentence); // construct the data needed for the tokenization. int words = 0; for (State s : tsm.completed) { int idx = s.stateIndex(); if (idx != TokenizerState.IN_SENTENCE.ordinal()) words++; } IntPair[] wordOffsets = new IntPair[words]; String[] tokens = new String[words]; int wordIndex = 0; for (State s : tsm.completed) { State ms = (State) s; if (s.stateIndex() != TokenizerState.IN_SENTENCE.ordinal()) { tokens[wordIndex] = new String(tsm.text, ms.start, ms.end - ms.start); wordOffsets[wordIndex++] = new IntPair(ms.start, ms.end); } } return new Pair<>(tokens, wordOffsets); }