/** * Returns the trailing integer from the given string or null if the string * does not end with number Example: Alice has got a cat 12 - will return 12 * (the position of '1') Alice has got a black cat - will return null (no * trailing number in the string) */ public static String getTrailingInteger(String str) { int positionOfTrailingInteger = getPositionOfTrailingInteger(str); if (positionOfTrailingInteger == -1) { // string does not end in digits return null; } return str.substring(positionOfTrailingInteger); }
/**
 * Compares two strings by their trailing integers only.
 *
 * @param s1 first string
 * @param s2 second string
 * @return {@code 1.0f} when both strings end with the same number or when
 *         neither ends with a number; {@code 0.0f} otherwise
 */
@Override
protected float doCalculate(String s1, String s2) {
    final String firstNumber = StringTools.getTrailingInteger(s1);
    final String secondNumber = StringTools.getTrailingInteger(s2);

    if (firstNumber == null) {
        // no trailing number on the first string: match only if the second has none either
        return (secondNumber == null) ? 1.0f : 0.0f;
    }
    if (secondNumber == null) {
        return 0.0f;
    }
    if (firstNumber.equals(secondNumber)) {
        return 1.0f;
    }
    return 0.0f;
}
}
/**
 * Rewrites every space-separated token of the value that is a roman number
 * or an English word number into its decimal form; all other tokens are
 * kept as they are.
 *
 * @param value the text to process, may be {@code null}
 * @return the text with number tokens replaced, re-joined with single
 *         spaces, or {@code null} if {@code value} is {@code null}
 */
public static String replaceNumbersToDecimal(String value) {
    if (value == null) {
        return value;
    }

    final String[] words = value.split(" ");
    final String[] converted = new String[words.length];
    int idx = 0;
    for (String word : words) {
        String replacement = word;
        if (isRomanNumber(word)) {
            replacement = String.valueOf(romanToDecimal(word));
        } else if (isEngWordNumber(word)) {
            // the lookup table is queried with the upper-cased word
            replacement = String.valueOf(wordToDecimal.get(word.toUpperCase()));
        }
        converted[idx++] = replacement;
    }
    return StringUtils.join(converted, " ");
}
/**
 * Produces the comparison form of a document's main title by running it
 * through {@code StringTools.normalize}, then
 * {@code StringTools.replaceNumbersToDecimal}, then
 * {@code StringTools.normalizePartQualifiers}, in that order.
 *
 * @param doc the document whose main title is read
 * @return the title after all three normalization steps
 */
private String getNormalizedTitle(DocumentProtos.DocumentMetadata doc) {
    final String rawTitle = DocumentWrapperUtils.getMainTitle(doc);
    final String normalized = StringTools.normalize(rawTitle);
    final String withDecimals = StringTools.replaceNumbersToDecimal(normalized);
    return StringTools.normalizePartQualifiers(withDecimals);
}
/**
 * Converts a roman number that forms the last space-separated word of the
 * value into its decimal form.
 *
 * @param value the text to process, may be {@code null}
 * @return the value with its last word replaced by a decimal number when
 *         that word is a roman number; otherwise the value unchanged
 *         (including when it is {@code null} or contains no space)
 */
public static String replaceLastRomanNumberToDecimal(String value) {
    if (value == null) {
        return value;
    }
    final int lastSpace = value.lastIndexOf(' ');
    if (lastSpace < 0) {
        // single word: nothing precedes the candidate number, leave as is
        return value;
    }

    final String lastWord = value.substring(lastSpace).trim();
    if (!isRomanNumber(lastWord)) {
        return value;
    }

    int decimal = romanToDecimal(lastWord);
    // keep everything up to and including the separating space
    return value.substring(0, lastSpace + 1) + decimal;
}
/**
 * Generates a key for the given document from its main title.
 * <p>
 * The title is normalized, stripped of stop words and of all whitespace;
 * then every second character is taken (indices 0, 2, 4, ...) and the
 * result is cut to at most {@link #KEY_PART_LENGTH} characters.
 *
 * @param doc the document to generate the key for
 * @return the generated key, never longer than {@link #KEY_PART_LENGTH}
 */
@Override
public String generateKey(DocumentProtos.DocumentMetadata doc) {
    final String title = DocumentWrapperUtils.getMainTitle(doc);
    final String withoutStopWords = StringTools.removeStopWords(StringTools.normalize(title));
    final String compact = withoutStopWords.replaceAll("\\s", "");

    // sample the characters sitting at even indices
    final StringBuilder sampled = new StringBuilder();
    int pos = 0;
    while (pos < compact.length()) {
        sampled.append(compact.charAt(pos));
        pos += 2;
    }

    final String key = sampled.toString();
    final int limit = KEY_PART_LENGTH;
    return key.substring(0, Math.min(key.length(), limit));
}
}
Preconditions.checkArgument(isRomanNumber(romanNumber)); decimal = processDecimal(1000, lastNumber, decimal); lastNumber = 1000; break; decimal = processDecimal(500, lastNumber, decimal); lastNumber = 500; break; decimal = processDecimal(100, lastNumber, decimal); lastNumber = 100; break; decimal = processDecimal(50, lastNumber, decimal); lastNumber = 50; break; decimal = processDecimal(10, lastNumber, decimal); lastNumber = 10; break; decimal = processDecimal(5, lastNumber, decimal); lastNumber = 5; break; decimal = processDecimal(1, lastNumber, decimal); lastNumber = 1; break;
result = removeStopWords(result); result = result.toLowerCase(); result = result.trim().replaceAll(" +", " ");
/**
 * Returns the document's main title passed through
 * {@code StringTools.normalize}.
 *
 * @param doc the document whose main title is read
 * @return the normalized main title
 */
private static String compactTitle(DocumentProtos.DocumentMetadata doc) {
    return StringTools.normalize(DocumentWrapperUtils.getMainTitle(doc));
}
/**
 * Converts an English word number (e.g. "two") that forms the last
 * space-separated word of the value into the number it denotes. The word
 * is upper-cased before it is checked and looked up.
 *
 * @param value the text to process, may be {@code null}
 * @return the value with its last word replaced by the mapped number when
 *         that word is recognised as a word number; otherwise the value
 *         unchanged (including when it is {@code null} or contains no
 *         space)
 */
public static String replaceLastWordNumberToDecimal(String value) {
    if (value == null) {
        return value;
    }
    final int lastSpace = value.lastIndexOf(' ');
    if (lastSpace < 0) {
        // single word: leave as is
        return value;
    }

    final String lastWord = value.substring(lastSpace).trim().toUpperCase();
    if (!isEngWordNumber(lastWord)) {
        return value;
    }

    // keep everything up to and including the separating space
    return value.substring(0, lastSpace + 1) + wordToDecimal.get(lastWord);
}
/**
 * Unifies the way "part" qualifiers are written in the given text.
 * <p>
 * The text is split on single spaces and processed token by token:
 * <ul>
 * <li>a token whose upper-cased form is in {@code partNames} is replaced
 * with {@code PART_NAME};</li>
 * <li>a decimal number that does not directly follow such a token gets
 * {@code PART_NAME} inserted in front of it;</li>
 * <li>every other token (including a decimal number directly following a
 * part name) is kept unchanged.</li>
 * </ul>
 *
 * @param str the text to process, may be {@code null}
 * @return the processed tokens joined with single spaces, or {@code null}
 *         if {@code str} is {@code null}
 */
public static String normalizePartQualifiers(String str) {
    if (str == null) {
        // behave like the other replace*/normalize helpers: null in, null out
        return null;
    }

    List<String> normalized = new ArrayList<String>();
    boolean previousWasPartName = false;
    for (String token : str.split(" ")) {
        if (partNames.contains(token.toUpperCase())) {
            normalized.add(PART_NAME);
            previousWasPartName = true;
            continue;
        }
        if (!previousWasPartName && isDecimalNumber(token)) {
            // a bare number is treated as a part number: prefix the qualifier
            normalized.add(PART_NAME);
        }
        normalized.add(token);
        previousWasPartName = false;
    }
    return StringUtils.join(normalized, " ");
}
@Override public Vote vote(DocumentProtos.DocumentMetadata doc1, DocumentProtos.DocumentMetadata doc2) { String issn1 = extractIssn(doc1); String issn2 = extractIssn(doc2); if (issn1 != null && !issn1.isEmpty() && issn1.equals(issn2)) { return new Vote(Vote.VoteStatus.PROBABILITY, 1.0f); } String journal1 = extractJournal(doc1); String journal2 = extractJournal(doc2); if (journal1 == null || journal2 == null) { return new Vote(Vote.VoteStatus.ABSTAIN); } journal1 = StringTools.normalize(journal1); journal2 = StringTools.normalize(journal2); //SimilarityCalculator calculator = new LCSSimilarity(); SimilarityCalculator calculator = new EditDistanceSimilarity(approveLevel, disapproveLevel); float similarity = calculator.calculateSimilarity(journal1, journal2); if (similarity > 0) { return new Vote(Vote.VoteStatus.PROBABILITY, similarity); } else { return new Vote(Vote.VoteStatus.NOT_EQUALS); } }
surname = StringTools.normalize(surname);