/**
 * Returns the final entry of {@code words}.
 *
 * @return the element at index {@code words.length - 1}, or an empty string when
 *     {@code isEmpty()} reports true.
 */
public String last() {
  return isEmpty() ? "" : words[words.length - 1];
}
/** Increments {@code cursor} by one; does nothing once {@code finished()} returns true. */
public void advance() {
  if (finished()) {
    return;
  }
  cursor++;
}
/**
 * Convenience overload that forwards to the two-argument {@code loadLinesFromResource},
 * passing {@code null} as the second argument.
 *
 * @param resourcePath resource path handed on to the two-argument overload.
 * @return the list produced by the two-argument overload.
 * @throws IOException propagated from the two-argument overload.
 */
public static List<String> loadLinesFromResource(String resourcePath) throws IOException {
  final List<String> lines = loadLinesFromResource(resourcePath, null);
  return lines;
}
/**
 * Builds a reduced HTML document from the input.
 *
 * <p>The result is {@code HTML_START}, followed by a head that holds every match of
 * {@code HTML_META_CONTENT_TAG} found in the input (joined with single spaces), followed by
 * the body returned by {@code getHtmlBody} after it has been passed through
 * {@code cleanScripts}.
 *
 * @param htmlToReduce html content to reduce.
 * @return the reduced html, or {@code htmlToReduce} unchanged (after logging a warning) when
 *     {@code getHtmlBody} returns {@code null}.
 */
public static String reduceHtml(String htmlToReduce) {
  final String body = getHtmlBody(htmlToReduce);
  if (body == null) {
    Log.warn("Cannot get html body. ");
    return htmlToReduce;
  }
  final List<String> metaTags = Regexps.allMatches(HTML_META_CONTENT_TAG, htmlToReduce);
  // The head is assembled first so the join still happens before cleanScripts is invoked.
  final String head = "<html><head>" + Joiner.on(" ").join(metaTags) + "</head>\n";
  return HTML_START + head + cleanScripts(body) + "</html>";
}
/**
 * Applies three clean-up steps in order: {@code convertAmpersandStrings}, then
 * {@code removeAmpresandStrings}, then {@code cleanHtmlTagsAndComments}.
 *
 * @param input text to clean.
 * @return the result of the last step.
 */
public static String cleanAllHtmlRelated(String input) {
  final String converted = convertAmpersandStrings(input);
  return cleanHtmlTagsAndComments(removeAmpresandStrings(converted));
}
/**
 * Extracts group 2 of the first match of {@code pattern} in {@code content}, replaces double
 * quotes with spaces, trims the result and passes it through
 * {@code TextUtil.convertAmpersandStrings}.
 *
 * @param pattern pattern handed to {@code Regexps.firstMatch}.
 * @param content text to search.
 * @return the converted value; when there is no match an empty string is converted instead.
 */
private static String getAttribute(Pattern pattern, String content) {
  final String matched = Regexps.firstMatch(pattern, content, 2);
  final String value;
  if (matched == null) {
    value = "";
  } else {
    value = matched.replace('\"', ' ').trim();
  }
  return TextUtil.convertAmpersandStrings(value);
}
/**
 * Splits the input into digit and non-digit tokens. For example:
 * <pre>
 * A12 -> "A" "12"
 * 1A12'ye -> "1" "A" "12" "'ye"
 * </pre>
 *
 * <p>The tokens are all matches of {@code NUMBER_SEPARATION} in the input.
 *
 * @param s input.
 * @return separated list of numerical and non numerical tokens.
 */
public static List<String> separateNumbers(String s) {
  final List<String> tokens = Regexps.allMatches(NUMBER_SEPARATION, s);
  return tokens;
}
/**
 * Generates an HTML document that contains only a bare head with a utf-8 charset meta tag and
 * the body content of the input. Script tags are eliminated from the body.
 *
 * <p>When no body can be extracted ({@code getHtmlBody} returns {@code null}), the input is
 * returned unchanged. This is the same fallback {@code reduceHtml} uses, and it avoids handing
 * {@code null} to {@code cleanScripts}.
 *
 * @param htmlToReduce html file to reduce.
 * @return reduced html file with charset set to utf-8, or {@code htmlToReduce} itself when no
 *     body could be extracted.
 */
public static String reduceHtmlFixedUTF8Charset(String htmlToReduce) {
  String htmlBody = getHtmlBody(htmlToReduce);
  if (htmlBody == null) {
    // Without a body there is nothing to reduce; fall back to the input like reduceHtml does.
    return htmlToReduce;
  }
  return HTML_START + "<html><head>" + META_CHARSET_UTF8 + "</head>\n"
      + cleanScripts(htmlBody) + "</html>";
}
/**
 * Creates a {@code _SingleLoader} for the given path and block size and returns the iterator
 * obtained from its {@code iteratorFromCharIndex(charIndex)}.
 *
 * @param path path handed to the loader.
 * @param blockSize block size handed to the loader.
 * @param charIndex index forwarded to the loader's {@code iteratorFromCharIndex}.
 * @return iterator over {@code TextChunk} instances supplied by the loader.
 */
public static Iterator<TextChunk> iteratorFromCharIndex(
    Path path, int blockSize, long charIndex) {
  final _SingleLoader loader = new _SingleLoader(path, blockSize);
  return loader.iteratorFromCharIndex(charIndex);
}
/**
 * Creates a {@code _SingleLoader} for the given path and block size and returns its iterator.
 *
 * @param path path handed to the loader.
 * @param blockSize block size handed to the loader.
 * @return iterator over {@code TextChunk} instances supplied by the loader.
 */
public static Iterator<TextChunk> singlePathIterator(Path path, int blockSize) {
  final _SingleLoader loader = new _SingleLoader(path, blockSize);
  return loader.iterator();
}
/**
 * Static factory for a {@code BlockTextLoader} over the given paths with an explicit block
 * size.
 *
 * @param corpora paths handed to the loader.
 * @param blockSize block size handed to the loader.
 * @return a new {@code BlockTextLoader}.
 */
public static BlockTextLoader fromPaths(List<Path> corpora, int blockSize) {
  final BlockTextLoader loader = new BlockTextLoader(corpora, blockSize);
  return loader;
}
/**
 * Returns a map with the attributes of an xml line. For example, if the content is
 * {@code <Foo a="one" b="two">} it returns the map [a:one b:two]. It only checks the first
 * match in the content.
 *
 * <p>Forwards to the two-argument {@code getAttributes} with an empty string as the second
 * argument.
 *
 * @param content xml line to read attributes from.
 * @return attribute map produced by the two-argument overload.
 */
public static Map<String, String> getAttributes(String content) {
  final Map<String, String> attributes = getAttributes(content, "");
  return attributes;
}
/**
 * Creates a new {@code TokenSequence} from the current {@code tokens}.
 *
 * @return the newly constructed sequence.
 */
public TokenSequence build() {
  final TokenSequence sequence = new TokenSequence(tokens);
  return sequence;
}
}
/** Returns the same text as {@code asString()}. */
@Override
public String toString() {
  final String text = asString();
  return text;
}
/**
 * Reports the size of the given source sequence.
 *
 * @param sourceSequence sequence to measure.
 * @return the value of {@code sourceSequence.size()}.
 */
@Override
public int sourceSize(TokenSequence sourceSequence) {
  final int size = sourceSequence.size();
  return size;
}
/**
 * Static factory that wraps the given words in a {@code WordSetSegmenter}.
 *
 * @param words words handed to the {@code WordSetSegmenter} constructor.
 * @return the new segmenter, typed as {@code TextSegmenter}.
 */
public static TextSegmenter getWordSetSegmenter(Collection<String> words) {
  final TextSegmenter segmenter = new WordSetSegmenter(words);
  return segmenter;
}
/**
 * Static factory for a {@code Builder}.
 *
 * @return a newly constructed {@code Builder}.
 */
public static Builder builder() {
  final Builder fresh = new Builder();
  return fresh;
}
/**
 * Returns the first entry of {@code words}.
 *
 * @return the element at index 0, or an empty string when {@code isEmpty()} reports true.
 */
public String first() {
  if (!isEmpty()) {
    return words[0];
  } else {
    return "";
  }
}
/**
 * Static factory for a {@code BlockTextLoader} over the given paths, using
 * {@code _SingleLoader.DEFAULT_BLOCK_SIZE} as the block size.
 *
 * @param corpora paths handed to the loader.
 * @return a new {@code BlockTextLoader}.
 */
public static BlockTextLoader fromPaths(List<Path> corpora) {
  final BlockTextLoader loader =
      new BlockTextLoader(corpora, _SingleLoader.DEFAULT_BLOCK_SIZE);
  return loader;
}