/**
 * Computes the cosine similarity between the character-frequency vectors of
 * the first two strings supplied.
 *
 * <p>Each string is turned into a vector whose components are the number of
 * times each {@code char} (UTF-16 code unit) occurs in it; the result is the
 * dot product of the two vectors divided by the product of their Euclidean
 * norms. No TF-IDF weighting is applied: raw occurrence counts are used.
 *
 * <p>Only {@code strings[0]} and {@code strings[1]} are compared; any further
 * arguments are ignored.
 *
 * @param strings the strings to compare; at least two are required for a
 *                meaningful result
 * @return the cosine similarity in the range {@code [0, 1]}, or {@code 0} when
 *         the array is {@code null}, holds fewer than two strings, either of
 *         the first two strings is {@code null}, or either of them is empty
 */
public static double stringSimilarity(String... strings) {
    // Fewer than two usable strings: nothing to compare, so report "no similarity"
    // instead of failing with an index or null-pointer error.
    if (strings == null || strings.length < 2 || strings[0] == null || strings[1] == null) {
        return 0;
    }
    String first = strings[0];
    String second = strings[1];

    // Per-character occurrence counts for each string.
    java.util.Map<Character, Integer> firstCounts = new java.util.HashMap<>();
    for (int i = 0; i < first.length(); i++) {
        Character c = first.charAt(i);
        Integer seen = firstCounts.get(c);
        firstCounts.put(c, seen == null ? 1 : seen + 1);
    }
    java.util.Map<Character, Integer> secondCounts = new java.util.HashMap<>();
    for (int i = 0; i < second.length(); i++) {
        Character c = second.charAt(i);
        Integer seen = secondCounts.get(c);
        secondCounts.put(c, seen == null ? 1 : seen + 1);
    }

    double scalar = 0;
    double norm1 = 0;
    double norm2 = 0;
    for (java.util.Map.Entry<Character, Integer> entry : firstCounts.entrySet()) {
        double a = entry.getValue();
        norm1 += a * a;
        // Only characters present in both strings contribute to the dot product.
        Integer b = secondCounts.get(entry.getKey());
        if (b != null) {
            scalar += a * b;
        }
    }
    for (Integer count : secondCounts.values()) {
        double b = count;
        norm2 += b * b;
    }

    // An empty string has a zero-length vector; dividing would yield NaN.
    if (norm1 == 0 || norm2 == 0) {
        return 0;
    }
    return scalar / Math.sqrt(norm1 * norm2);
}
/**
 * Returns the cosine similarity of the first two strings, comparing them by
 * how often each character occurs.
 *
 * <p>Every distinct {@code char} (UTF-16 code unit) is one dimension and its
 * occurrence count in a string is that string's coordinate. The similarity is
 * the dot product of the two count vectors divided by the product of their
 * Euclidean lengths. Raw counts are used; no TF-IDF weighting is performed.
 *
 * <p>Arguments after the second one are not examined.
 *
 * @param strings the strings to calculate similarity for; the first two are
 *                compared
 * @return a value in {@code [0, 1]}; {@code 0} is returned when the array is
 *         {@code null}, has fewer than two elements, one of the first two
 *         elements is {@code null}, or one of them is the empty string
 */
public static double stringSimilarity(String... strings) {
    // Guard against inputs that previously caused ArrayIndexOutOfBounds / NPE.
    if (strings == null || strings.length < 2) {
        return 0;
    }
    String left = strings[0];
    String right = strings[1];
    if (left == null || right == null) {
        return 0;
    }

    // Character -> number of occurrences, one table per string.
    java.util.Map<Character, Integer> leftCounts = new java.util.HashMap<Character, Integer>();
    java.util.Map<Character, Integer> rightCounts = new java.util.HashMap<Character, Integer>();
    for (int i = 0; i < left.length(); i++) {
        Character key = Character.valueOf(left.charAt(i));
        Integer previous = leftCounts.get(key);
        leftCounts.put(key, Integer.valueOf(previous == null ? 1 : previous.intValue() + 1));
    }
    for (int i = 0; i < right.length(); i++) {
        Character key = Character.valueOf(right.charAt(i));
        Integer previous = rightCounts.get(key);
        rightCounts.put(key, Integer.valueOf(previous == null ? 1 : previous.intValue() + 1));
    }

    double scalar = 0;
    double norm1 = 0;
    double norm2 = 0;
    for (java.util.Map.Entry<Character, Integer> entry : leftCounts.entrySet()) {
        double leftCount = entry.getValue().doubleValue();
        norm1 += leftCount * leftCount;
        // Characters missing from the other string contribute nothing to the dot product.
        Integer rightCount = rightCounts.get(entry.getKey());
        if (rightCount != null) {
            scalar += leftCount * rightCount.doubleValue();
        }
    }
    for (Integer value : rightCounts.values()) {
        double rightCount = value.doubleValue();
        norm2 += rightCount * rightCount;
    }

    // A zero norm means one string was empty; avoid returning NaN from 0/0.
    if (norm1 == 0 || norm2 == 0) {
        return 0;
    }
    return scalar / Math.sqrt(norm1 * norm2);
}