/**
 * Scans forward from {@code pos} and returns the index of the first character
 * of {@code s} that {@link StringUtil#isWhitespace} does not accept.
 *
 * @param s the string to scan
 * @param pos the index at which scanning starts
 * @return the index of the first non-whitespace character at or after
 *         {@code pos}, or the scan end ({@code s.length()}, or {@code pos}
 *         itself when it is already past the end) if none is found
 */
private int getFirstNonWS(String s, int pos) {
  final int end = s.length();
  int cursor = pos;
  for (; cursor < end; cursor++) {
    if (!StringUtil.isWhitespace(s.charAt(cursor))) {
      break;
    }
  }
  return cursor;
}
/**
 * Stores {@code tags} for {@code word} in the backing dictionary.
 * When this instance is not case sensitive the word is lower-cased via
 * {@link StringUtil#toLowerCase} before being used as the key.
 *
 * @param word the word to associate the tags with
 * @param tags the tags for the word
 * @return whatever the backing dictionary's {@code put} returns
 */
public String[] put(String word, String... tags) {
  String key = this.caseSensitive ? word : StringUtil.toLowerCase(word);
  return dictionary.put(key, tags);
}
public StringList read() throws IOException { String line = lineStream.read(); StringList name = null; if ((line != null) && (!StringUtil.isEmpty(line))) { String name2; // find the location of the name separator in the line of data. int pos = line.indexOf(' '); if ((pos != -1)) { String parsed = line.substring(0, pos); // the data is in ALL CAPS ... so the easiest way is to convert // back to standard mixed case. if ((parsed.length() > 2) && (parsed.startsWith("MC"))) { name2 = parsed.substring(0,1).toUpperCase(locale) + parsed.substring(1,2).toLowerCase(locale) + parsed.substring(2,3).toUpperCase(locale) + parsed.substring(3).toLowerCase(locale); } else { name2 = parsed.substring(0,1).toUpperCase(locale) + parsed.substring(1).toLowerCase(locale); } name = new StringList(new String[]{name2}); } } return name; }
/**
 * Get the SES required to go from a word to a lemma.
 * Both inputs are lower-cased and reversed before the edit script is
 * computed; when the two lower-cased forms are equal the script is
 * the constant {@code "O"}.
 *
 * @param wordForm the word
 * @param lemma the lemma
 * @return the shortest edit script
 */
public static String getShortestEditScript(String wordForm, String lemma) {
  // NOTE(review): toLowerCase() uses the JVM default locale - confirm that is intended.
  String reversedWF = new StringBuilder(wordForm.toLowerCase()).reverse().toString();
  String reversedLemma = new StringBuilder(lemma.toLowerCase()).reverse().toString();

  if (reversedWF.equals(reversedLemma)) {
    return "O";
  }

  int[][] levenDistance = StringUtil.levenshteinDistance(reversedWF, reversedLemma);
  // The script is accumulated into this buffer by computeShortestEditScript.
  StringBuffer permutations = new StringBuffer();
  StringUtil.computeShortestEditScript(reversedWF, reversedLemma, levenDistance, permutations);
  return permutations.toString();
}
// Build the lookup key from the lower-cased last element of parts, a "+" and right.
// When CONTRACTIONS has that key, append the mapped value to sb with its first
// character upper-cased and return the buffer contents.
String leftLower = StringUtil.toLowerCase(parts[parts.length - 1]); key = leftLower + "+" + right; if (CONTRACTIONS.containsKey(key)) { String r = CONTRACTIONS.get(key); String firstChar = r.substring(0, 1); r = StringUtil.toUpperCase(firstChar) + r.substring(1); sb.append(r); return sb.toString();
protected String getChunkTag(Node node) { String tag = node.getSyntacticTag(); String phraseTag = tag.substring(tag.lastIndexOf(":") + 1); while (phraseTag.endsWith("-")) { phraseTag = phraseTag.substring(0, phraseTag.length() - 1); } // maybe we should use only np, vp and pp, but will keep ap and advp. if (phraseTag.equals("np") || phraseTag.equals("vp") || phraseTag.equals("pp") || phraseTag.equals("ap") || phraseTag.equals("advp") || phraseTag.equals("adjp")) { phraseTag = StringUtil.toUpperCase(phraseTag); } else { phraseTag = OTHER; } return phraseTag; }
/**
 * Encodes each token/lemma pair as an edit script obtained from
 * {@link StringUtil#getShortestEditScript}. An empty script is replaced
 * by {@code "_"}.
 *
 * @param toks the array of tokens
 * @param lemmas the array of lemmas, indexed in parallel with {@code toks}
 * @return one edit script per token
 */
public static String[] encodeLemmas(String[] toks, String[] lemmas) {
  String[] scripts = new String[toks.length];
  for (int idx = 0; idx < scripts.length; idx++) {
    String script = StringUtil.getShortestEditScript(toks[idx], lemmas[idx]);
    scripts[idx] = script.isEmpty() ? "_" : script;
  }
  return scripts;
}
/**
 * Decodes the lemma from the word and the induced lemma class.
 * Each token is lower-cased before {@link StringUtil#decodeShortestEditScript}
 * is applied; an empty result is replaced by {@code "_"}.
 *
 * @param toks the array of tokens
 * @param preds the predicted lemma classes
 * @return the array of decoded lemmas
 */
public static String[] decodeLemmas(String[] toks, String[] preds) {
  String[] decoded = new String[toks.length];
  for (int idx = 0; idx < decoded.length; idx++) {
    // NOTE(review): toLowerCase() uses the JVM default locale - confirm that is intended.
    String candidate = StringUtil.decodeShortestEditScript(toks[idx].toLowerCase(), preds[idx]);
    decoded[idx] = candidate.isEmpty() ? "_" : candidate;
  }
  return decoded;
}
// Build the lookup key from the lower-cased last element of parts, a "+" and right.
// When CONTRACTIONS has that key, append the mapped value to sb with its first
// character upper-cased and return the buffer contents.
String leftLower = StringUtil.toLowerCase(parts[parts.length - 1]); key = leftLower + "+" + right; if (CONTRACTIONS.containsKey(key)) { String r = CONTRACTIONS.get(key); String firstChar = r.substring(0, 1); r = StringUtil.toUpperCase(firstChar) + r.substring(1); sb.append(r); return sb.toString();
/**
 * Get the SES required to go from a word to a lemma.
 * Both inputs are lower-cased and reversed before the edit script is
 * computed; when the two lower-cased forms are equal the script is
 * the constant {@code "O"}.
 *
 * @param wordForm the word
 * @param lemma the lemma
 * @return the shortest edit script
 */
public static String getShortestEditScript(String wordForm, String lemma) {
  // NOTE(review): toLowerCase() uses the JVM default locale - confirm that is intended.
  String reversedWF = new StringBuilder(wordForm.toLowerCase()).reverse().toString();
  String reversedLemma = new StringBuilder(lemma.toLowerCase()).reverse().toString();

  if (reversedWF.equals(reversedLemma)) {
    return "O";
  }

  int[][] levenDistance = StringUtil.levenshteinDistance(reversedWF, reversedLemma);
  // The script is accumulated into this buffer by computeShortestEditScript.
  StringBuffer permutations = new StringBuffer();
  StringUtil.computeShortestEditScript(reversedWF, reversedLemma, levenDistance, permutations);
  return permutations.toString();
}
/** Checks that {@link StringUtil#toUpperCase} upper-cases plain lower-case words. */
@Test
public void testToUpperCase() {
  // Each row is {input, expected}.
  String[][] cases = {
      {"test", "TEST"},
      {"simple", "SIMPLE"},
  };
  for (String[] pair : cases) {
    Assert.assertEquals(pair[1], StringUtil.toUpperCase(pair[0]));
  }
}
/**
 * Encodes each token/lemma pair as an edit script obtained from
 * {@link StringUtil#getShortestEditScript}. An empty script is replaced
 * by {@code "_"}.
 *
 * @param toks the array of tokens
 * @param lemmas the array of lemmas, indexed in parallel with {@code toks}
 * @return one edit script per token
 */
public static String[] encodeLemmas(String[] toks, String[] lemmas) {
  String[] scripts = new String[toks.length];
  for (int idx = 0; idx < scripts.length; idx++) {
    String script = StringUtil.getShortestEditScript(toks[idx], lemmas[idx]);
    scripts[idx] = script.isEmpty() ? "_" : script;
  }
  return scripts;
}
/**
 * Decodes the lemma from the word and the induced lemma class.
 * Each token is lower-cased before {@link StringUtil#decodeShortestEditScript}
 * is applied; an empty result is replaced by {@code "_"}.
 *
 * @param toks the array of tokens
 * @param preds the predicted lemma classes
 * @return the array of decoded lemmas
 */
public static String[] decodeLemmas(String[] toks, String[] preds) {
  String[] decoded = new String[toks.length];
  for (int idx = 0; idx < decoded.length; idx++) {
    // NOTE(review): toLowerCase() uses the JVM default locale - confirm that is intended.
    String candidate = StringUtil.decodeShortestEditScript(toks[idx].toLowerCase(), preds[idx]);
    decoded[idx] = candidate.isEmpty() ? "_" : candidate;
  }
  return decoded;
}
/**
 * Scans forward from {@code pos} and returns the index of the first character
 * of {@code s} that {@link StringUtil#isWhitespace} accepts.
 *
 * @param s the string to scan
 * @param pos the index at which scanning starts
 * @return the index of the first whitespace character at or after
 *         {@code pos}, or the scan end ({@code s.length()}, or {@code pos}
 *         itself when it is already past the end) if none is found
 */
private int getFirstWS(String s, int pos) {
  final int end = s.length();
  int cursor = pos;
  for (; cursor < end; cursor++) {
    if (StringUtil.isWhitespace(s.charAt(cursor))) {
      break;
    }
  }
  return cursor;
}
/**
 * Adds a single feature of the form {@code WORD_PREFIX + "=" + token} for the
 * token at {@code index}. The token is lower-cased via
 * {@link StringUtil#toLowerCase} when the {@code lowercase} flag is set.
 *
 * @param features the list the feature is appended to
 * @param tokens the tokens of the sequence
 * @param index the position of the current token in {@code tokens}
 * @param preds not read by this method
 */
public void createFeatures(List<String> features, String[] tokens, int index, String[] preds) {
  String token = tokens[index];
  String value = lowercase ? StringUtil.toLowerCase(token) : token;
  features.add(WORD_PREFIX + "=" + value);
}
}
/**
 * Resolves the location named by the {@code OPENNLP_DATA_DIR} system property.
 *
 * @return a {@link File} for the configured path; the path existed when checked
 * @throws IllegalArgumentException if the property is unset or empty
 * @throws FileNotFoundException if the configured path does not exist
 */
public static File getOpennlpDataDir() throws FileNotFoundException {
  final String dataDirectory = System.getProperty("OPENNLP_DATA_DIR");

  // System.getProperty returns null for an unset property. Check that first so the
  // documented IllegalArgumentException is raised regardless of how
  // StringUtil.isEmpty treats null (its null handling is not visible here).
  if (dataDirectory == null || StringUtil.isEmpty(dataDirectory)) {
    throw new IllegalArgumentException("The OPENNLP_DATA_DIR is not set.");
  }

  // Reuse the value that was just validated instead of reading the property again,
  // so the path that was checked is the path that is returned.
  final File file = new File(dataDirectory);
  if (!file.exists()) {
    throw new FileNotFoundException("The OPENNLP_DATA_DIR path of " + dataDirectory + " was not found.");
  }

  return file;
}
// Build the lookup key from the lower-cased last element of parts, a "+" and right.
// When CONTRACTIONS has that key, append the mapped value to sb with its first
// character upper-cased and return the buffer contents.
String leftLower = StringUtil.toLowerCase(parts[parts.length - 1]); key = leftLower + "+" + right; if (CONTRACTIONS.containsKey(key)) { String r = CONTRACTIONS.get(key); String firstChar = r.substring(0, 1); r = StringUtil.toUpperCase(firstChar) + r.substring(1); sb.append(r); return sb.toString();
/**
 * Get the SES required to go from a word to a lemma.
 * Both inputs are lower-cased and reversed before the edit script is
 * computed; when the two lower-cased forms are equal the script is
 * the constant {@code "O"}.
 *
 * @param wordForm the word
 * @param lemma the lemma
 * @return the shortest edit script
 */
public static String getShortestEditScript(String wordForm, String lemma) {
  // NOTE(review): toLowerCase() uses the JVM default locale - confirm that is intended.
  String reversedWF = new StringBuilder(wordForm.toLowerCase()).reverse().toString();
  String reversedLemma = new StringBuilder(lemma.toLowerCase()).reverse().toString();

  if (reversedWF.equals(reversedLemma)) {
    return "O";
  }

  int[][] levenDistance = StringUtil.levenshteinDistance(reversedWF, reversedLemma);
  // The script is accumulated into this buffer by computeShortestEditScript.
  StringBuffer permutations = new StringBuffer();
  StringUtil.computeShortestEditScript(reversedWF, reversedLemma, levenDistance, permutations);
  return permutations.toString();
}
protected String getChunkTag(Node node) { String tag = node.getSyntacticTag(); String phraseTag = tag.substring(tag.lastIndexOf(":") + 1); while (phraseTag.endsWith("-")) { phraseTag = phraseTag.substring(0, phraseTag.length() - 1); } // maybe we should use only np, vp and pp, but will keep ap and advp. if (phraseTag.equals("np") || phraseTag.equals("vp") || phraseTag.equals("pp") || phraseTag.equals("ap") || phraseTag.equals("advp") || phraseTag.equals("adjp")) { phraseTag = StringUtil.toUpperCase(phraseTag); } else { phraseTag = OTHER; } return phraseTag; }
/**
 * Encodes each token/lemma pair as an edit script obtained from
 * {@link StringUtil#getShortestEditScript}. An empty script is replaced
 * by {@code "_"}.
 *
 * @param toks the array of tokens
 * @param lemmas the array of lemmas, indexed in parallel with {@code toks}
 * @return one edit script per token
 */
public static String[] encodeLemmas(String[] toks, String[] lemmas) {
  String[] scripts = new String[toks.length];
  for (int idx = 0; idx < scripts.length; idx++) {
    String script = StringUtil.getShortestEditScript(toks[idx], lemmas[idx]);
    scripts[idx] = script.isEmpty() ? "_" : script;
  }
  return scripts;
}