opennlp.tools.util.StringUtil java code examples

/**
 * Skips over whitespace in {@code s}, beginning at index {@code pos}.
 *
 * @param s the text to scan
 * @param pos the index at which scanning starts
 * @return the index of the first character at or after {@code pos} that
 *     {@code StringUtil.isWhitespace} rejects; {@code s.length()} when only
 *     whitespace remains, or {@code pos} itself when it is already at or
 *     past the end of {@code s}
 */
private int getFirstNonWS(String s, int pos) {
 int cursor = pos;
 for (; cursor < s.length(); cursor++) {
  if (!StringUtil.isWhitespace(s.charAt(cursor))) {
   break;
  }
 }
 return cursor;
}

/**
 * Stores {@code tags} under {@code word} in the backing dictionary. When this
 * instance is not case sensitive the word is lower-cased with
 * {@code StringUtil.toLowerCase} before it is used as the key.
 *
 * @param word the word to register
 * @param tags the tags to associate with the word
 * @return whatever the backing dictionary's {@code put} returns
 */
public String[] put(String word, String... tags) {
 String key = this.caseSensitive ? word : StringUtil.toLowerCase(word);
 return dictionary.put(key, tags);
}

/**
 * Reads one line from the underlying line stream and turns the text before
 * the first space into a mixed-case name.
 *
 * @return a single-element {@code StringList} holding the re-cased name, or
 *     {@code null} when the line is {@code null}, empty, or contains no space
 * @throws IOException if reading from the underlying stream fails
 */
public StringList read() throws IOException {
 String line = lineStream.read();
 if (line == null || StringUtil.isEmpty(line)) {
  return null;
 }

 // The name is everything in front of the first space on the line.
 int separator = line.indexOf(' ');
 if (separator == -1) {
  return null;
 }
 String raw = line.substring(0, separator);

 // The data is in ALL CAPS, so rebuild conventional mixed case.
 // NOTE(review): a line that starts with a space leaves raw empty and
 // substring(0, 1) throws StringIndexOutOfBoundsException - confirm the
 // input never contains such lines.
 String initial = raw.substring(0, 1).toUpperCase(locale);
 String mixedCase;
 if (raw.length() > 2 && raw.startsWith("MC")) {
  // "MCxyz" becomes "McXyz": the letter after the prefix is capitalised too.
  mixedCase = initial
    + raw.substring(1, 2).toLowerCase(locale)
    + raw.substring(2, 3).toUpperCase(locale)
    + raw.substring(3).toLowerCase(locale);
 } else {
  mixedCase = initial + raw.substring(1).toLowerCase(locale);
 }
 return new StringList(new String[]{mixedCase});
}

/**
 * Builds the shortest edit script (SES) that turns a word form into its lemma.
 * Both inputs are lower-cased and reversed before they are compared, and the
 * script is computed on those reversed strings.
 *
 * @param wordForm the word
 * @param lemma the lemma
 * @return the shortest edit script, or {@code "O"} when the lower-cased word
 *     form and lemma are identical
 */
public static String getShortestEditScript(String wordForm, String lemma) {
 String wordBackwards = new StringBuffer(wordForm.toLowerCase()).reverse().toString();
 String lemmaBackwards = new StringBuffer(lemma.toLowerCase()).reverse().toString();

 // Identical strings need no edits; "O" marks that case.
 if (wordBackwards.equals(lemmaBackwards)) {
  return "O";
 }

 StringBuffer script = new StringBuffer();
 int[][] distances = StringUtil.levenshteinDistance(wordBackwards, lemmaBackwards);
 StringUtil.computeShortestEditScript(wordBackwards, lemmaBackwards, distances, script);
 return script.toString();
}

// Lower-case the last element of parts; it forms the left half of the lookup key.
String leftLower = StringUtil.toLowerCase(parts[parts.length - 1]);
// The lookup key has the form "<left>+<right>".
key = leftLower + "+" + right;
if (CONTRACTIONS.containsKey(key)) {
 String r = CONTRACTIONS.get(key);
 // Upper-case only the first character of the mapped value; the rest is kept as is.
 // NOTE(review): substring(0, 1) throws if the mapped value is empty - confirm
 // CONTRACTIONS never maps a key to "".
 String firstChar = r.substring(0, 1);
 r = StringUtil.toUpperCase(firstChar) + r.substring(1);
 // Append the capitalised value and return the accumulated text immediately.
 sb.append(r);
 return sb.toString();

/**
 * Derives the chunk tag for a node from its syntactic tag. The text after the
 * last {@code ':'} is taken, trailing {@code '-'} characters are removed, and
 * the result is upper-cased when it is one of the recognised phrase types.
 *
 * @param node the node whose syntactic tag is inspected
 * @return the upper-cased phrase tag for np, vp, pp, ap, advp and adjp;
 *     {@code OTHER} for anything else
 */
protected String getChunkTag(Node node) {
 String syntacticTag = node.getSyntacticTag();
 String candidate = syntacticTag.substring(syntacticTag.lastIndexOf(":") + 1);

 // Drop every trailing '-' in one pass.
 int end = candidate.length();
 while (end > 0 && candidate.charAt(end - 1) == '-') {
  end--;
 }
 candidate = candidate.substring(0, end);

 // maybe we should use only np, vp and pp, but will keep ap and advp.
 switch (candidate) {
  case "np":
  case "vp":
  case "pp":
  case "ap":
  case "advp":
  case "adjp":
   return StringUtil.toUpperCase(candidate);
  default:
   return OTHER;
 }
}

/**
 * Encodes each token/lemma pair as its shortest edit script.
 *
 * @param toks the array of tokens
 * @param lemmas the lemmas, read at the same indices as {@code toks}
 * @return one edit script per token; an empty script is replaced by {@code "_"}
 */
public static String[] encodeLemmas(String[] toks, String[] lemmas) {
 String[] encoded = new String[toks.length];
 for (int idx = 0; idx < encoded.length; idx++) {
  String script = StringUtil.getShortestEditScript(toks[idx], lemmas[idx]);
  encoded[idx] = script.isEmpty() ? "_" : script;
 }
 return encoded;
}

/**
 * Rebuilds the lemma of every token from its predicted lemma class. Each token
 * is lower-cased before the predicted edit script is applied to it.
 *
 * @param toks the array of tokens
 * @param preds the predicted lemma classes, read at the same indices as {@code toks}
 * @return the array of decoded lemmas; an empty lemma is replaced by {@code "_"}
 */
public static String[] decodeLemmas(String[] toks, String[] preds) {
 String[] decoded = new String[toks.length];
 for (int idx = 0; idx < decoded.length; idx++) {
  String candidate = StringUtil.decodeShortestEditScript(toks[idx].toLowerCase(), preds[idx]);
  decoded[idx] = candidate.isEmpty() ? "_" : candidate;
 }
 return decoded;
}

// The left half of the key is the last element of parts, lower-cased.
String leftLower = StringUtil.toLowerCase(parts[parts.length - 1]);
// Keys in CONTRACTIONS are looked up as "<left>+<right>".
key = leftLower + "+" + right;
if (CONTRACTIONS.containsKey(key)) {
 String r = CONTRACTIONS.get(key);
 // Capitalise the first character of the mapped value and keep the remainder unchanged.
 // NOTE(review): an empty mapped value would make substring(0, 1) throw - verify
 // against the CONTRACTIONS table.
 String firstChar = r.substring(0, 1);
 r = StringUtil.toUpperCase(firstChar) + r.substring(1);
 // The capitalised value is appended and the buffer content returned right away.
 sb.append(r);
 return sb.toString();

/**
 * Returns the shortest edit script (SES) needed to go from a word to a lemma.
 * The comparison and the script computation both work on the lower-cased,
 * reversed forms of the two inputs.
 *
 * @param wordForm the word
 * @param lemma the lemma
 * @return {@code "O"} if the lower-cased inputs are equal, otherwise the
 *     computed shortest edit script
 */
public static String getShortestEditScript(String wordForm, String lemma) {
 final String mirroredWord = new StringBuffer(wordForm.toLowerCase()).reverse().toString();
 final String mirroredLemma = new StringBuffer(lemma.toLowerCase()).reverse().toString();
 final boolean sameText = mirroredWord.equals(mirroredLemma);

 if (sameText) {
  // Nothing to edit.
  return "O";
 }

 final StringBuffer edits = new StringBuffer();
 final int[][] matrix = StringUtil.levenshteinDistance(mirroredWord, mirroredLemma);
 StringUtil.computeShortestEditScript(mirroredWord, mirroredLemma, matrix, edits);
 return edits.toString();
}

/**
 * Checks that {@code StringUtil.toUpperCase} upper-cases plain lower-case words.
 */
@Test
public void testToUpperCase() {
 // Each row is {input, expected}; rows are checked in order.
 String[][] samples = {
   {"test", "TEST"},
   {"simple", "SIMPLE"}
 };
 for (String[] sample : samples) {
  Assert.assertEquals(sample[1], StringUtil.toUpperCase(sample[0]));
 }
}

/**
 * Computes the shortest edit script for every token and its lemma.
 *
 * @param toks the array of tokens
 * @param lemmas the lemmas, indexed in parallel with {@code toks}
 * @return the edit scripts in token order, with {@code "_"} standing in for
 *     any empty script
 */
public static String[] encodeLemmas(String[] toks, String[] lemmas) {
 List<String> scripts = new ArrayList<>();
 int position = 0;
 while (position < toks.length) {
  String script = StringUtil.getShortestEditScript(toks[position], lemmas[position]);
  scripts.add(script.length() == 0 ? "_" : script);
  position++;
 }
 return scripts.toArray(new String[scripts.size()]);
}

/**
 * Decodes the lemma from the word and the induced lemma class. The word is
 * lower-cased before the predicted class is applied to it.
 *
 * @param toks the array of tokens
 * @param preds the predicted lemma classes, indexed in parallel with {@code toks}
 * @return the decoded lemmas in token order, with {@code "_"} standing in for
 *     any empty lemma
 */
public static String[] decodeLemmas(String[] toks, String[] preds) {
 List<String> decoded = new ArrayList<>();
 int position = 0;
 while (position < toks.length) {
  String lemma = StringUtil.decodeShortestEditScript(toks[position].toLowerCase(), preds[position]);
  decoded.add(lemma.length() == 0 ? "_" : lemma);
  position++;
 }
 return decoded.toArray(new String[decoded.size()]);
}

/**
 * Skips over non-whitespace in {@code s}, beginning at index {@code pos}.
 *
 * @param s the text to scan
 * @param pos the index at which scanning starts
 * @return the index of the first character at or after {@code pos} that
 *     {@code StringUtil.isWhitespace} accepts; {@code s.length()} when no
 *     whitespace follows, or {@code pos} itself when it is already at or
 *     past the end of {@code s}
 */
private int getFirstWS(String s, int pos) {
 int cursor = pos;
 for (; cursor < s.length(); cursor++) {
  if (StringUtil.isWhitespace(s.charAt(cursor))) {
   break;
  }
 }
 return cursor;
}

 /**
  * Adds a single word feature of the form {@code WORD_PREFIX + "=" + token}
  * for the token at {@code index}. The token is lower-cased with
  * {@code StringUtil.toLowerCase} when the {@code lowercase} flag is set.
  *
  * @param features the list the feature is appended to
  * @param tokens the tokens of the current sequence
  * @param index the position of the token to describe
  * @param preds not used by this method
  */
 public void createFeatures(List<String> features, String[] tokens, int index, String[] preds) {
  String word = lowercase ? StringUtil.toLowerCase(tokens[index]) : tokens[index];
  features.add(WORD_PREFIX + "=" + word);
 }
}

/**
 * Resolves the location named by the {@code OPENNLP_DATA_DIR} system property.
 *
 * @return a {@code File} for the path held in the property; the path is
 *     guaranteed to exist at the time of the check
 * @throws IllegalArgumentException if the property is unset or empty
 * @throws FileNotFoundException if the path named by the property does not exist
 */
public static File getOpennlpDataDir() throws FileNotFoundException {
 // Read the property exactly once so that the validated value, the File and
 // the error message always refer to the same path.
 final String dataDirectory = System.getProperty("OPENNLP_DATA_DIR");
 if (StringUtil.isEmpty(dataDirectory)) {
  throw new IllegalArgumentException("The OPENNLP_DATA_DIR is not set.");
 }
 final File file = new File(dataDirectory);
 if (!file.exists()) {
  throw new FileNotFoundException("The OPENNLP_DATA_DIR path of " + dataDirectory + " was not found.");
 }
 return file;
}

// Take the last element of parts and lower-case it for use in the lookup key.
String leftLower = StringUtil.toLowerCase(parts[parts.length - 1]);
// Join the two halves with '+' to form the CONTRACTIONS key.
key = leftLower + "+" + right;
if (CONTRACTIONS.containsKey(key)) {
 String r = CONTRACTIONS.get(key);
 // Only the first character of the mapped value is upper-cased; the tail is untouched.
 // NOTE(review): substring(0, 1) fails on an empty mapped value - confirm that
 // CONTRACTIONS holds no empty strings.
 String firstChar = r.substring(0, 1);
 r = StringUtil.toUpperCase(firstChar) + r.substring(1);
 // Emit the capitalised value and return what has been built so far.
 sb.append(r);
 return sb.toString();

/**
 * Get the SES (shortest edit script) required to go from a word to a lemma.
 * Word and lemma are lower-cased and reversed first; the script is derived
 * from the Levenshtein distance matrix of those reversed strings.
 *
 * @param wordForm the word
 * @param lemma the lemma
 * @return the shortest edit script, or {@code "O"} when word and lemma are
 *     equal after lower-casing
 */
public static String getShortestEditScript(String wordForm, String lemma) {
 String flippedWord = new StringBuffer(wordForm.toLowerCase()).reverse().toString();
 String flippedLemma = new StringBuffer(lemma.toLowerCase()).reverse().toString();

 if (flippedWord.equals(flippedLemma)) {
  // Equal inputs are encoded with the fixed marker "O".
  return "O";
 }

 StringBuffer operations = new StringBuffer();
 StringUtil.computeShortestEditScript(
   flippedWord,
   flippedLemma,
   StringUtil.levenshteinDistance(flippedWord, flippedLemma),
   operations);
 return operations.toString();
}

/**
 * Maps a node's syntactic tag to a chunk tag. Only the part after the last
 * {@code ':'} is considered, and trailing {@code '-'} characters are stripped
 * from it before it is matched.
 *
 * @param node the node whose syntactic tag is inspected
 * @return the phrase tag in upper case when it is np, vp, pp, ap, advp or
 *     adjp; otherwise {@code OTHER}
 */
protected String getChunkTag(Node node) {
 String fullTag = node.getSyntacticTag();
 String phrase = fullTag.substring(fullTag.lastIndexOf(":") + 1);

 // Find where the run of trailing dashes begins, then cut it off.
 int keep = phrase.length();
 while (keep > 0 && phrase.charAt(keep - 1) == '-') {
  keep--;
 }
 phrase = phrase.substring(0, keep);

 // maybe we should use only np, vp and pp, but will keep ap and advp.
 boolean recognised = phrase.equals("np") || phrase.equals("vp")
   || phrase.equals("pp") || phrase.equals("ap")
   || phrase.equals("advp") || phrase.equals("adjp");
 if (!recognised) {
  return OTHER;
 }
 return StringUtil.toUpperCase(phrase);
}

/**
 * Produces the lemma classes for a token sequence by computing the shortest
 * edit script between each token and its lemma.
 *
 * @param toks the array of tokens
 * @param lemmas the lemmas, aligned index by index with {@code toks}
 * @return the edit scripts in token order; {@code "_"} replaces an empty script
 */
public static String[] encodeLemmas(String[] toks, String[] lemmas) {
 final int count = toks.length;
 String[] classes = new String[count];
 for (int k = 0; k < count; k++) {
  String script = StringUtil.getShortestEditScript(toks[k], lemmas[k]);
  if (script.isEmpty()) {
   script = "_";
  }
  classes[k] = script;
 }
 return classes;
}

Most used methods

isWhitespace
Determines if the specified character is a whitespace. A character is considered a whitespace when o
toLowerCase
Converts to lower case independent of the current locale via Character#toLowerCase(char) which uses
isEmpty
Returns true if the CharSequence is null or its CharSequence#length() is 0.
toUpperCase
Converts to upper case independent of the current locale via Character#toUpperCase(char) which uses
computeShortestEditScript
Computes the Shortest Edit Script (SES) to convert a word into its lemma. This is based on Chrupala'
decodeShortestEditScript
Read predicted SES by the lemmatizer model and apply the permutations to obtain the lemma from the w
getShortestEditScript
Get the SES required to go from a word to a lemma.
levenshteinDistance
Computes the Levenshtein distance of two strings in a matrix. Based on pseudo-code provided here: ht
minimum
Get the minimum of three values.

Popular in Java

Running tasks concurrently on multiple threads
compareTo (BigDecimal)
getResourceAsStream (ClassLoader)
getExternalFilesDir (Context)
Runnable (java.lang)
Represents a command that can be executed. Often used to run code in a different Thread.
ZipFile (java.util.zip)
This class provides random read access to a zip file. You pay more to read the zip file's central di
FileUtils (org.apache.commons.io)
General file manipulation utilities. Facilities are provided in the following areas: * writing to a
Component (java.awt)
A component is an object having a graphical representation that can be displayed on the screen and t
Reference (javax.naming)
Location (org.springframework.beans.factory.parsing)
Class that models an arbitrary location in a Resource. Typically used to track the location of proble
CodeWhisperer alternatives

How to use StringUtil in opennlp.tools.util

Best Java code snippets using opennlp.tools.util.StringUtil (Showing top 20 results out of 315)

How to use
StringUtil
in
opennlp.tools.util