/**
 * Produces the normalized form of the main title of the given document.
 * <p>
 * The main title is passed, in this order, through
 * {@code StringTools.normalize}, {@code StringTools.replaceNumbersToDecimal}
 * and {@code StringTools.normalizePartQualifiers}.
 *
 * @param doc metadata of the document whose main title is processed
 * @return the result of the last normalization step
 */
private String getNormalizedTitle(DocumentProtos.DocumentMetadata doc) {
    final String mainTitle = DocumentWrapperUtils.getMainTitle(doc);
    final String normalized = StringTools.normalize(mainTitle);
    final String withDecimalNumbers = StringTools.replaceNumbersToDecimal(normalized);
    return StringTools.normalizePartQualifiers(withDecimalNumbers);
}
/**
 * Returns the main title of the given document after it has been passed
 * through {@code StringTools.normalize}.
 *
 * @param doc metadata of the document whose main title is processed
 * @return the normalized main title
 */
private static String compactTitle(DocumentProtos.DocumentMetadata doc) {
    return StringTools.normalize(DocumentWrapperUtils.getMainTitle(doc));
}
/**
 * Splits the given text into words, drops English stop words and returns the
 * stemmed form of every remaining word, in the order of appearance.
 * <p>
 * Before splitting, the text is lower-cased, passed through
 * {@code DiacriticsRemover.removeDiacritics}, underscores and line feeds are
 * replaced with {@code SPACE}, and every character other than {@code a-z},
 * digits, {@code -}, {@code _}, {@code /} and the space is removed.
 *
 * @param text the text to process; must not be {@code null}
 * @return the stemmed, non-stop-word tokens of the text (possibly empty)
 * @throws IOException declared by the signature; nothing in this body throws
 *         it explicitly - presumably raised by one of the helpers (TODO confirm)
 */
public List<String> getStemmedPairs(final String text) throws IOException {
    // Lower-case with a fixed locale: with the JVM default locale the result
    // depends on the machine (e.g. under Turkish 'I' becomes dotless 'ı'),
    // and such characters may then be stripped by the a-z filter below.
    String tmp = text.toLowerCase(java.util.Locale.ROOT);
    tmp = DiacriticsRemover.removeDiacritics(tmp);
    tmp = tmp.replaceAll("_", SPACE);
    tmp = tmp.replaceAll("\n", SPACE);
    tmp = tmp.replaceAll("[^a-z\\d-_/ ]", "");

    List<String> strings = new ArrayList<String>();
    // A single stemmer instance is reused for all words.
    PorterStemmer ps = new PorterStemmer();
    for (String s : StringUtils.split(tmp, SPACE)) {
        if (!StopWordsRemover.isAnEnglishStopWords(s)) {
            ps.add(s.toCharArray(), s.length());
            ps.stem();
            strings.add(ps.toString());
        }
    }
    return strings;
}
/**
 * Stems the word placed into the stemmer buffer through calls to add().
 * <p>
 * This method does not return a value. After it completes, the result can be
 * read with toString() (the original documentation also names
 * getResultLength()/getResultBuffer() as accessors).
 * <p>
 * The six stemming steps are executed, in order, only when {@code k > 1};
 * otherwise the buffer content is left as it is.
 */
public void stem() {
    // k is derived from the current value of i; presumably i is the number of
    // characters added so far, which makes k the last index - TODO confirm.
    k = i - 1;
    if (k > 1) {
        step1();
        step2();
        step3();
        step4();
        step5();
        step6();
    }
    // Record the end of the result (one past k) and reset i to 0.
    i_end = k + 1;
    i = 0;
}
/**
 * Generates the key for the given document.
 * <p>
 * The main title is normalized, stripped of stop words and of all whitespace;
 * then every second character is kept, starting with the first one, and the
 * result is cut to at most {@link #KEY_PART_LENGTH} characters.
 *
 * @param doc metadata of the document the key is generated for
 * @return the generated key, never longer than {@link #KEY_PART_LENGTH}
 */
@Override
public String generateKey(DocumentProtos.DocumentMetadata doc) {
    final String mainTitle = DocumentWrapperUtils.getMainTitle(doc);
    final String withoutStopWords = StringTools.removeStopWords(StringTools.normalize(mainTitle));
    final String condensed = withoutStopWords.replaceAll("\\s", "");

    // Keep the characters at indices 0, 2, 4, ...
    final StringBuilder sampled = new StringBuilder();
    int pos = 0;
    while (pos < condensed.length()) {
        sampled.append(condensed.charAt(pos));
        pos += 2;
    }

    final String key = sampled.toString();
    final int limit = KEY_PART_LENGTH;
    return (key.length() > limit) ? key.substring(0, limit) : key;
}
}
/**
 * Small manual check: prints whether "by" and "eclipse" are reported as
 * English stop words by {@code isAnEnglishStopWords}.
 *
 * @param args ignored
 * @throws IOException declared by the signature
 */
public static void main(String[] args) throws IOException {
    final boolean byIsStopWord = isAnEnglishStopWords("by");
    System.out.println("Is 'by' a stop word?: " + byIsStopWord);

    final boolean eclipseIsStopWord = isAnEnglishStopWords("eclipse");
    System.out.println("Is 'eclipse' a stop word?: " + eclipseIsStopWord);
}
}
/**
 * Replaces the character at index {@code k} of the buffer with 'i' when
 * {@code ends("y")} holds and the stem contains a vowel.
 */
private void step2() {
    // ends() is evaluated first, exactly as in a short-circuit conjunction.
    if (!ends("y")) {
        return;
    }
    if (!vowelinstem()) {
        return;
    }
    b[k] = 'i';
}
/**
 * Replaces a trailing roman number with its decimal representation.
 * <p>
 * The part of the value after its last space is examined; when
 * {@code isRomanNumber} accepts it, that part is replaced with the number
 * returned by {@code romanToDecimal}. In every other case, including a
 * {@code null} value or a value without any space, the value is returned
 * unchanged.
 *
 * @param value the text to process, may be {@code null}
 * @return the value with the trailing roman number converted, or the passed
 *         value itself when nothing was replaced
 */
public static String replaceLastRomanNumberToDecimal(String value) {
    if (value == null) {
        return value;
    }
    final int lastSpace = value.lastIndexOf(' ');
    if (lastSpace < 0) {
        return value;
    }

    final String candidate = value.substring(lastSpace).trim();
    if (!isRomanNumber(candidate)) {
        return value;
    }

    final int decimal = romanToDecimal(candidate);
    // Keep everything up to and including the last space, then append the number.
    return value.substring(0, lastSpace + 1) + decimal;
}
/**
 * Tells whether any position from 0 up to and including {@code j} is not a
 * consonant according to {@code cons}.
 *
 * @return {@code true} as soon as a non-consonant position is found,
 *         {@code false} when there is none
 */
private boolean vowelinstem() {
    int pos = 0;
    while (pos <= j) {
        if (!cons(pos)) {
            return true;
        }
        pos++;
    }
    return false;
}
/**
 * Compares the trailing integers of two strings, as extracted by
 * {@code StringTools.getTrailingInteger}.
 *
 * @param s1 the first string
 * @param s2 the second string
 * @return {@code 1.0f} when both strings have equal trailing integers or when
 *         neither has one; {@code 0.0f} otherwise
 */
@Override
protected float doCalculate(String s1, String s2) {
    final String first = StringTools.getTrailingInteger(s1);
    final String second = StringTools.getTrailingInteger(s2);

    // Null-safe equality: two missing integers count as a match,
    // exactly one missing integer counts as a mismatch.
    final boolean match = (first == null) ? (second == null) : first.equals(second);
    return match ? 1.0f : 0.0f;
}
}
/**
 * Creates a new {@code StreamSource} from the value returned by
 * {@code resourceToInputStream(obj, localization)}.
 *
 * @param obj passed unchanged to {@code resourceToInputStream}
 * @param localization passed unchanged to {@code resourceToInputStream};
 *        presumably the location of the resource - TODO confirm
 * @return a new {@code StreamSource} wrapping the helper's result
 */
public static StreamSource resourceToStreamSource(Object obj,String localization){
    return new StreamSource(resourceToInputStream(obj, localization));
}
/** * Returns the trailing integer from the given string or null if the string * does not end with number Example: Alice has got a cat 12 - will return 12 * (the position of '1') Alice has got a black cat - will return null (no * trailing number in the string) */ public static String getTrailingInteger(String str) { int positionOfTrailingInteger = getPositionOfTrailingInteger(str); if (positionOfTrailingInteger == -1) { // string does not end in digits return null; } return str.substring(positionOfTrailingInteger); }
/**
 * Finds the author whose position number equals the given position.
 *
 * @param documentWrapper the document whose authors are searched
 * @param authorPosition the position number to look for
 * @return the first author returned by {@code getAuthors} with a matching
 *         position number, or {@code null} if there is none
 */
public static Author getAuthor(DocumentWrapper documentWrapper, int authorPosition) {
    final List<Author> candidates = getAuthors(documentWrapper);
    for (Author candidate : candidates) {
        if (candidate.getPositionNumber() != authorPosition) {
            continue;
        }
        return candidate;
    }
    return null;
}
/**
 * Replaces a trailing English number word with the corresponding number.
 * <p>
 * The part of the value after its last space is upper-cased and checked with
 * {@code isEngWordNumber}; when it is accepted, that part is replaced with
 * the entry found for it in {@code wordToDecimal}. In every other case,
 * including a {@code null} value or a value without any space, the value is
 * returned unchanged.
 *
 * @param value the text to process, may be {@code null}
 * @return the value with the trailing number word converted, or the passed
 *         value itself when nothing was replaced
 */
public static String replaceLastWordNumberToDecimal(String value) {
    if (value == null || !value.contains(" ")) {
        return value;
    }
    final int lastSpace = value.lastIndexOf(' ');
    // Upper-case with a fixed locale: with the JVM default locale the result
    // depends on the machine (e.g. under Turkish "nine" becomes "NİNE"), which
    // would not match plain ASCII upper-case words.
    // NOTE(review): assumes the lookup uses ASCII upper-case words - confirm.
    String number = value.substring(lastSpace).trim().toUpperCase(java.util.Locale.ROOT);
    if (isEngWordNumber(number)) {
        // Keep everything up to and including the last space, then append the number.
        return value.substring(0, lastSpace + 1) + wordToDecimal.get(number);
    }
    return value;
}
/**
 * Tells whether the first field of the tuple is NOT an English stop word.
 *
 * @param input the tuple whose first field is read as a {@code String}
 * @return {@code false} for a {@code null} or empty tuple; otherwise the
 *         negation of {@code StopWordsRemover.isAnEnglishStopWords}
 * @throws IOException wrapping any exception raised while reading or
 *         checking the field
 */
@Override
public Boolean exec(Tuple input) throws IOException {
    final boolean nothingToCheck = (input == null) || (input.size() == 0);
    if (nothingToCheck) {
        return false;
    }

    try {
        final String candidate = (String) input.get(0);
        final boolean stopWord = StopWordsRemover.isAnEnglishStopWords(candidate);
        return !stopWord;
    } catch (Exception e) {
        throw new IOException("Caught exception processing input row ", e);
    }
}
}
/**
 * Tells whether positions {@code j - 1} and {@code j} of the buffer hold the
 * same character and position {@code j} is a consonant according to
 * {@code cons}.
 *
 * @param j index of the second character of the pair
 * @return {@code true} for a doubled consonant ending at {@code j};
 *         {@code false} otherwise, including when {@code j < 1}
 */
private boolean doublec(int j) {
    // Short-circuit order mirrors the checks: bounds, equality, consonant.
    return j >= 1 && b[j] == b[j - 1] && cons(j);
}
/**
 * Tells whether the buffer character at index {@code i} is a consonant.
 * <p>
 * 'a', 'e', 'i', 'o' and 'u' are never consonants. 'y' is a consonant at
 * index 0; elsewhere it is a consonant exactly when the preceding position is
 * not one. Every other character is a consonant.
 *
 * @param i index into the buffer
 * @return {@code true} if the position counts as a consonant
 */
private boolean cons(int i) {
    if ("aeiou".indexOf(b[i]) >= 0) {
        return false;
    }
    if (b[i] == 'y') {
        return i == 0 || !cons(i - 1);
    }
    return true;
}