/**
 * Breaks a string into tokens and strips punctuation from the ends of each token.
 *
 * <p>The input is split with the pattern supplied by {@code getPattern()}. Every
 * non-empty piece is passed through {@code TermUtil.trimPunctuation}; pieces that are
 * empty before or after trimming are discarded.</p>
 *
 * @param string The text to tokenize.
 * @return {@code Collections.emptyList()} when the input is the empty string; otherwise
 *         an {@code ImmutableList} of the surviving tokens in their original order
 *         (possibly empty).
 */
public List<String> tokenize(String string){
	Pattern pattern = getPattern();

	if(("").equals(string)){
		return Collections.emptyList();
	}

	String[] parts = pattern.split(string, -1);

	// Compact the surviving tokens toward the front of the same array.
	// The write index never runs ahead of the element currently being read.
	int kept = 0;
	for(String part : parts){

		if(part.isEmpty()){
			continue;
		}

		String trimmed = TermUtil.trimPunctuation(part);
		if(trimmed.isEmpty()){
			continue;
		}

		parts[kept++] = trimmed;
	}

	// Every piece survived: the array can be copied as-is
	if(kept == parts.length){
		return ImmutableList.copyOf(parts);
	}

	// Otherwise copy only the compacted prefix
	String[] compacted = new String[kept];
	System.arraycopy(parts, 0, compacted, 0, kept);

	return ImmutableList.copyOf(compacted);
}
static public String trimPunctuation(String string){ int begin = 0; int end = string.length(); // Trim leading punctuation while(begin < end){ char c = string.charAt(begin); if(!isPunctuation(c)){ break; } begin++; } // Trim trailing punctuation while(end > begin){ char c = string.charAt(end - 1); if(!isPunctuation(c)){ break; } end--; } if(begin > 0 || end < string.length()){ string = string.substring(begin, end); } return string; }
/**
 * Checks {@code TermUtil.trimPunctuation} against empty input, punctuation-only input,
 * input without punctuation, and input wrapped in leading and/or trailing punctuation.
 */
@Test
public void trimPunctuation(){
	// Each row is {expected result, input}
	String[][] cases = {
		{"", ""},
		{"", "?"},
		{"", "\u00BF?"},
		{"one", "one"},
		{"one", "one?"},
		{"one", "\u00BFone?"}
	};

	for(String[] testCase : cases){
		String expected = testCase[0];
		String input = testCase[1];

		assertEquals(expected, TermUtil.trimPunctuation(input));
	}
}
}
static public String trimPunctuation(String string){ int begin = 0; int end = string.length(); // Trim leading punctuation while(begin < end){ char c = string.charAt(begin); if(!isPunctuation(c)){ break; } begin++; } // Trim trailing punctuation while(end > begin){ char c = string.charAt(end - 1); if(!isPunctuation(c)){ break; } end--; } if(begin > 0 || end < string.length()){ string = string.substring(begin, end); } return string; }