zemberek.core.text Java code examples

/**
 * Returns the final element of {@code words}.
 *
 * @return the last word, or an empty string when {@link #isEmpty()} reports true.
 */
public String last() {
 return isEmpty() ? "" : words[words.length - 1];
}

/**
 * Reads the value returned by {@code current()} and then calls {@code advance()}.
 *
 * @return the value {@code current()} returned before the call to {@code advance()}.
 */
public String getAndAdvance() {
 final String beforeAdvance = current();
 advance();
 return beforeAdvance;
}

/**
 * Increments {@code cursor} by one. Does nothing once {@code finished()} returns true.
 */
public void advance() {
 if (finished()) {
  // Already finished; leave the cursor where it is.
  return;
 }
 cursor++;
}

/**
 * Convenience overload that delegates to the two-argument
 * {@code loadLinesFromResource}, passing {@code null} as the second argument.
 *
 * @param resourcePath resource path handed to the two-argument overload.
 * @return whatever the two-argument overload returns.
 * @throws IOException propagated from the two-argument overload.
 */
public static List<String> loadLinesFromResource(String resourcePath) throws IOException {
 final List<String> lines = loadLinesFromResource(resourcePath, null);
 return lines;
}

/**
 * Builds a reduced html document from the input. The head holds every match of
 * {@code HTML_META_CONTENT_TAG} found in the input, joined with single spaces. The body part
 * is the result of {@code getHtmlBody} passed through {@code cleanScripts}.
 *
 * @param htmlToReduce html content to reduce.
 * @return the reduced html, or the unchanged input (after a warning is logged) when
 * {@code getHtmlBody} returns null.
 */
public static String reduceHtml(String htmlToReduce) {
 final String body = getHtmlBody(htmlToReduce);
 if (body == null) {
  Log.warn("Cannot get html body. ");
  return htmlToReduce;
 }
 // Meta tags are collected from the whole document, not only from the extracted body.
 final List<String> metaTags = Regexps.allMatches(HTML_META_CONTENT_TAG, htmlToReduce);
 final String head = "<html><head>" + Joiner.on(" ").join(metaTags) + "</head>\n";
 final String scriptFreeBody = cleanScripts(body);
 return HTML_START + head + scriptFreeBody + "</html>";
}

/**
 * Runs the input through three cleaning steps in order: {@code convertAmpersandStrings},
 * then {@code removeAmpresandStrings}, then {@code cleanHtmlTagsAndComments}.
 *
 * @param input text to clean.
 * @return the result of the last cleaning step.
 */
public static String cleanAllHtmlRelated(String input) {
 final String converted = convertAmpersandStrings(input);
 final String withoutAmpersands = removeAmpresandStrings(converted);
 return cleanHtmlTagsAndComments(withoutAmpersands);
}

/**
 * Looks up a value in the content with {@code Regexps.firstMatch(pattern, content, 2)}
 * (presumably capture group 2 of the first match; confirm against Regexps). Double quotes in
 * the value are replaced with spaces and the value is trimmed. A missing match is treated as
 * an empty string. The outcome is passed through {@code TextUtil.convertAmpersandStrings}.
 *
 * @param pattern pattern handed to {@code Regexps.firstMatch}.
 * @param content text to search.
 * @return the cleaned and converted value.
 */
private static String getAttribute(Pattern pattern, String content) {
 final String matched = Regexps.firstMatch(pattern, content, 2);
 final String cleaned;
 if (matched == null) {
  cleaned = "";
 } else {
  cleaned = matched.replace('"', ' ').trim();
 }
 return TextUtil.convertAmpersandStrings(cleaned);
}

/**
 * Splits the input into digit and non digit pieces, returned as Strings. For example:
 * <pre>
 * A12 -> "A" "12"
 * 1A12'ye -> "1" "A" "12" "'ye"
 * </pre>
 * The pieces are all matches of {@code NUMBER_SEPARATION} in the input.
 *
 * @param s input.
 * @return list of numerical and non numerical tokens.
 */
public static List<String> separateNumbers(String s) {
 final List<String> tokens = Regexps.allMatches(NUMBER_SEPARATION, s);
 return tokens;
}

/**
 * Generates an HTML document that contains only a bare head with a utf-8 charset meta tag and
 * the body content of the input. The body is passed through {@code cleanScripts}.
 *
 * <p>When {@code getHtmlBody} cannot extract a body (returns null), a warning is logged and
 * the input is returned unchanged, matching the behavior of {@code reduceHtml}.
 *
 * @param htmlToReduce html file to reduce.
 * @return reduced html file with charset set to utf-8, or the unchanged input when no body
 * could be extracted.
 */
public static String reduceHtmlFixedUTF8Charset(String htmlToReduce) {
 String htmlBody = getHtmlBody(htmlToReduce);
 if (htmlBody == null) {
  // Guard before cleanScripts: a null body must not be passed on or concatenated.
  Log.warn("Cannot get html body. ");
  return htmlToReduce;
 }
 return HTML_START + "<html><head>" + META_CHARSET_UTF8 + "</head>\n" +
   cleanScripts(htmlBody) + "</html>";
}

/**
 * Creates a {@code _SingleLoader} for the path and block size, and asks it for an iterator
 * starting from the given character index.
 *
 * @param path path handed to the loader.
 * @param blockSize block size handed to the loader.
 * @param charIndex character index handed to the loader's {@code iteratorFromCharIndex}.
 * @return the iterator produced by the loader.
 */
public static Iterator<TextChunk> iteratorFromCharIndex(
  Path path,
  int blockSize,
  long charIndex) {
 final _SingleLoader loader = new _SingleLoader(path, blockSize);
 return loader.iteratorFromCharIndex(charIndex);
}

/**
 * Creates a {@code _SingleLoader} for the path and block size, and returns its iterator.
 *
 * @param path path handed to the loader.
 * @param blockSize block size handed to the loader.
 * @return the iterator produced by the loader.
 */
public static Iterator<TextChunk> singlePathIterator(Path path, int blockSize) {
 final _SingleLoader loader = new _SingleLoader(path, blockSize);
 return loader.iterator();
}

/**
 * Factory method that constructs a {@code BlockTextLoader} from the given paths and block
 * size.
 *
 * @param corpora paths handed to the loader constructor.
 * @param blockSize block size handed to the loader constructor.
 * @return a new loader instance.
 */
public static BlockTextLoader fromPaths(List<Path> corpora, int blockSize) {
 final BlockTextLoader loader = new BlockTextLoader(corpora, blockSize);
 return loader;
}

/**
 * Returns a map holding the attributes of an xml line. For example, if [content] is
 * {@code <Foo a="one" b="two">} the result is the Map [a:one b:two]. Only the first match in
 * the content is checked. Delegates to the two-argument {@code getAttributes}, passing an
 * empty string as the second argument.
 *
 * @param content xml line to read attributes from.
 * @return attribute map produced by the two-argument overload.
 */
public static Map<String, String> getAttributes(String content) {
 final Map<String, String> attributes = getAttributes(content, "");
 return attributes;
}

 /**
  * Creates a {@code TokenSequence} from the {@code tokens} held by this builder.
  *
  * @return a new sequence instance.
  */
 public TokenSequence build() {
  final TokenSequence sequence = new TokenSequence(tokens);
  return sequence;
 }
}

/**
 * String form of this object; identical to the value of {@code asString()}.
 *
 * @return the result of {@code asString()}.
 */
@Override
public String toString() {
 final String text = asString();
 return text;
}

/**
 * Reports the size of the given source sequence.
 *
 * @param sourceSequence sequence whose {@code size()} is returned.
 * @return the value of {@code sourceSequence.size()}.
 */
@Override
public int sourceSize(TokenSequence sourceSequence) {
 final int size = sourceSequence.size();
 return size;
}

/**
 * Factory method that wraps the given words in a {@code WordSetSegmenter}.
 *
 * @param words words handed to the {@code WordSetSegmenter} constructor.
 * @return a new {@code WordSetSegmenter}, exposed as a {@code TextSegmenter}.
 */
public static TextSegmenter getWordSetSegmenter(Collection<String> words) {
 final TextSegmenter segmenter = new WordSetSegmenter(words);
 return segmenter;
}

/**
 * Entry point for building; creates a fresh {@code Builder}.
 *
 * @return a new {@code Builder} instance.
 */
public static Builder builder() {
 final Builder fresh = new Builder();
 return fresh;
}

/**
 * Returns the first element of {@code words}.
 *
 * @return the first word, or an empty string when {@link #isEmpty()} reports true.
 */
public String first() {
 return isEmpty() ? "" : words[0];
}

/**
 * Factory method that constructs a {@code BlockTextLoader} from the given paths, using
 * {@code _SingleLoader.DEFAULT_BLOCK_SIZE} as the block size.
 *
 * @param corpora paths handed to the loader constructor.
 * @return a new loader instance.
 */
public static BlockTextLoader fromPaths(List<Path> corpora) {
 final int blockSize = _SingleLoader.DEFAULT_BLOCK_SIZE;
 return new BlockTextLoader(corpora, blockSize);
}

How to use zemberek.core.text

Best Java code snippets using zemberek.core.text (Showing top 20 results out of 315)