/** * Compares two Strings with respect to the base String, by Levenshtein distance. * <p/> * The input that is the closest match to the base String will sort before the other. * * @param a an input to compare relative to the base. * @param b an input to compare relative to the base. * * @return -1 if {@code a} is closer to the base than {@code b}; 1 if {@code b} is * closer to the base than {@code a}; 0 if both {@code a} and {@code b} are * equally close to the base. */ @Override public int compare(String a, String b) { // shortcuts if (a.equals(b)) { return 0; // comparing the same value; don't bother } else if (a.equals(base)) { return -1; // a is equal to the base, so it's always first } else if (b.equals(base)) { return 1; // b is equal to the base, so it's always first } // determine which of the two is closer to the base and order it first return Integer.compare(LEVENSHTEIN_DISTANCE.apply(a, base), LEVENSHTEIN_DISTANCE.apply(b, base)); }
/**
 * Tells whether {@code term} is within a length-dependent Levenshtein distance of
 * {@code value}. The allowed distance is picked from the length of the trimmed {@code term}:
 * <ul>
 *   <li>0 for fewer than three characters</li>
 *   <li>1 for three, four or five characters</li>
 *   <li>2 for more than five characters</li>
 * </ul>
 *
 * @param term the candidate text; leading and trailing whitespace is trimmed before comparing
 * @param value the text that {@code term} is measured against, used as given
 * @return true if {@code term} is similar to {@code value}
 */
private static boolean isFuzzy(String term, String value) {
    final String trimmed = term.trim();
    final int length = trimmed.length();
    final int maxDistance = length < 3 ? 0 : (length < 6 ? 1 : 2);
    return LevenshteinDistance.getDefaultInstance().apply(value, trimmed) <= maxDistance;
}
/**
 * Converts both raw inputs to Strings with {@code StringFunctionHelpers.toStringFromUTF8} and
 * stores the result of {@code d.apply} on those Strings in {@code out.value}.
 * <p>
 * NOTE(review): {@code rawInput1}, {@code rawInput2}, {@code out} and {@code d} are declared
 * outside this view; {@code d} is presumably a Levenshtein distance instance - confirm.
 */
@Override
public void eval() {
  // Build each String from the start/end/buffer triple of the corresponding raw input.
  String input1 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput1.start, rawInput1.end, rawInput1.buffer);
  String input2 = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(rawInput2.start, rawInput2.end, rawInput2.buffer);
  // Publish the computed value through the output holder.
  out.value = d.apply(input1, input2);
}
// Closes the enclosing declaration, whose header is outside this view.
}
/**
 * Computes the Levenshtein distance between this object's {@code text} and {@code other}.
 *
 * @param env the calling environment (not used by this method)
 * @param other the value whose string form is compared against {@code text}
 * @param threshold optional limit handed to the {@code LevenshteinDistance} constructor; a null
 *                  memory value means no limit is passed
 * @return whatever {@code LevenshteinDistance.apply} yields for the two strings
 */
@Signature
public Integer levenshteinDistance(Environment env, Memory other, @Optional("null") Memory threshold) {
    // Only translate the threshold when the caller actually supplied one.
    Integer limit = null;
    if (!threshold.isNull()) {
        limit = threshold.toInteger();
    }

    return new LevenshteinDistance(limit).apply(text, other.toString());
}
/**
 * Checks {@code UNLIMITED_DISTANCE} against a fixed table of input pairs and their expected
 * distances, including empty inputs and both argument orders for pairs of unequal length.
 */
@Test
public void testGetLevenshteinDistance_StringString() {
    final String[][] pairs = {
        {"", ""},
        {"", "a"},
        {"aaapppp", ""},
        {"frog", "fog"},
        {"fly", "ant"},
        {"elephant", "hippo"},
        {"hippo", "elephant"},
        {"hippo", "zzzzzzzz"},
        {"zzzzzzzz", "hippo"},
        {"hello", "hallo"}
    };
    // expected[i] is the distance for pairs[i]
    final int[] expected = {0, 1, 7, 1, 3, 7, 7, 8, 8, 1};

    for (int i = 0; i < pairs.length; i++) {
        assertThat(UNLIMITED_DISTANCE.apply(pairs[i][0], pairs[i][1])).isEqualTo(expected[i]);
    }
}
/**
 * Returns the Levenshtein distance between two strings as a {@code Long}.
 *
 * @param text1 the first string; may be null
 * @param text2 the second string; may be null
 * @return the distance computed by the {@code levenshteinDistance} field, or null when either
 *         argument is null
 */
@UserFunction
@Description("apoc.text.levenshteinDistance(text1, text2) - compare the given strings with the Levenshtein distance algorithm.")
public Long levenshteinDistance(final @Name("text1") String text1, @Name("text2")final String text2) {
    final boolean bothPresent = text1 != null && text2 != null;
    if (!bothPresent) {
        return null; // a missing input yields a missing result
    }

    final long distance = levenshteinDistance.apply(text1, text2);
    return distance;
}
/**
 * Verifies one row supplied by the {@code parameters} method source.
 *
 * @param threshold value passed to the {@code LevenshteinDistance} constructor
 * @param left first input
 * @param right second input
 * @param distance result expected from {@code apply(left, right)}
 */
@ParameterizedTest
@MethodSource("parameters")
public void test(final Integer threshold, final CharSequence left, final CharSequence right, final Integer distance) {
    final Integer actual = new LevenshteinDistance(threshold).apply(left, right);
    assertThat(actual).isEqualTo(distance);
}
// Each assertion builds a LevenshteinDistance with a threshold and expects either the distance
// itself (when it does not exceed the threshold) or -1 (when it does).

// Two empty inputs.
assertThat(new LevenshteinDistance(0).apply("", "")).isEqualTo(0);

// One empty input: thresholds above, equal to and below the expected distance of 7.
assertThat(new LevenshteinDistance(8).apply("aaapppp", "")).isEqualTo(7);
assertThat(new LevenshteinDistance(7).apply("aaapppp", "")).isEqualTo(7);
assertThat(new LevenshteinDistance(6).apply("aaapppp", "")).isEqualTo(-1);

// Different single characters with threshold 0, in both argument orders.
assertThat(new LevenshteinDistance(0).apply("b", "a")).isEqualTo(-1);
assertThat(new LevenshteinDistance(0).apply("a", "b")).isEqualTo(-1);

// Identical inputs.
assertThat(new LevenshteinDistance(0).apply("aa", "aa")).isEqualTo(0);
assertThat(new LevenshteinDistance(2).apply("aa", "aa")).isEqualTo(0);

// Same length, no character in common.
assertThat(new LevenshteinDistance(2).apply("aaa", "bbb")).isEqualTo(-1);
assertThat(new LevenshteinDistance(3).apply("aaa", "bbb")).isEqualTo(3);

// Threshold well above the expected distance.
assertThat(new LevenshteinDistance(10).apply("aaaaaa", "b")).isEqualTo(6);

// Inputs of different lengths with no character in common.
assertThat(new LevenshteinDistance(8).apply("aaapppp", "b")).isEqualTo(7);
assertThat(new LevenshteinDistance(4).apply("a", "bbb")).isEqualTo(3);
assertThat(new LevenshteinDistance(7).apply("aaapppp", "b")).isEqualTo(7);
assertThat(new LevenshteinDistance(3).apply("a", "bbb")).isEqualTo(3);
assertThat(new LevenshteinDistance(2).apply("a", "bbb")).isEqualTo(-1);
assertThat(new LevenshteinDistance(2).apply("bbb", "a")).isEqualTo(-1);
assertThat(new LevenshteinDistance(6).apply("aaapppp", "b")).isEqualTo(-1);
assertThat(new LevenshteinDistance(1).apply("a", "bbb")).isEqualTo(-1);
assertThat(new LevenshteinDistance(1).apply("bbb", "a")).isEqualTo(-1);
/**
 * Expects an {@code IllegalArgumentException} when the second argument is null.
 */
@Test
public void testGetLevenshteinDistance_StringNullInt() {
    final CharSequence missing = null;
    assertThatIllegalArgumentException()
        .isThrownBy(() -> UNLIMITED_DISTANCE.apply("a", missing));
}
/**
 * Expects an {@code IllegalArgumentException} when the first argument is null and the second
 * is a non-null String.
 */
@Test
public void testGetLevenshteinDistance_NullString() {
    // Argument order follows the method name (null, then String), matching the convention of
    // testGetLevenshteinDistance_NullStringInt; previously this passed ("a", null).
    assertThatIllegalArgumentException().isThrownBy(() -> UNLIMITED_DISTANCE.apply(null, "a"));
}
/**
 * Expects an {@code IllegalArgumentException} when the first argument is a non-null String
 * and the second is null.
 */
@Test
public void testGetLevenshteinDistance_StringNull() {
    // Argument order follows the method name (String, then null), matching the convention of
    // testGetLevenshteinDistance_StringNullInt; previously this passed (null, "a").
    assertThatIllegalArgumentException().isThrownBy(() -> UNLIMITED_DISTANCE.apply("a", null));
}
/**
 * Expects an {@code IllegalArgumentException} when the first argument is null.
 */
@Test
public void testGetLevenshteinDistance_NullStringInt() {
    final CharSequence missing = null;
    assertThatIllegalArgumentException()
        .isThrownBy(() -> UNLIMITED_DISTANCE.apply(missing, "a"));
}
/**
 * Expects an {@code IllegalArgumentException} when an instance created with threshold 0 is
 * applied to two null arguments.
 */
@Test
public void testApplyThrowsIllegalArgumentExceptionAndCreatesLevenshteinDistanceTakingInteger() {
    assertThatIllegalArgumentException().isThrownBy(() -> {
        // Construction stays inside the callable, exactly where the call under test runs.
        final LevenshteinDistance zeroThreshold = new LevenshteinDistance(0);
        zeroThreshold.apply(null, null);
    });
}
/**
 * Computes the Levenshtein distance between two character sequences.
 *
 * @param s the first sequence
 * @param t the second sequence
 * @return the distance reported by Commons Text, widened to a double
 */
@Override
public double calculate(CharSequence s, CharSequence t) {
    // Reuse the library's shared default (no-threshold) instance rather than allocating a new
    // stateless object through the no-arg constructor on every call.
    return org.apache.commons.text.similarity.LevenshteinDistance.getDefaultInstance().apply(s, t);
}
/** * Compares two Strings with respect to the base String, by Levenshtein distance. * <p/> * The input that is the closest match to the base String will sort before the other. * * @param a an input to compare relative to the base. * @param b an input to compare relative to the base. * * @return -1 if {@code a} is closer to the base than {@code b}; 1 if {@code b} is * closer to the base than {@code a}; 0 if both {@code a} and {@code b} are * equally close to the base. */ @Override public int compare(String a, String b) { // shortcuts if (a.equals(b)) { return 0; // comparing the same value; don't bother } else if (a.equals(base)) { return -1; // a is equal to the base, so it's always first } else if (b.equals(base)) { return 1; // b is equal to the base, so it's always first } // determine which of the two is closer to the base and order it first return Integer.compare(LEVENSHTEIN_DISTANCE.apply(a, base), LEVENSHTEIN_DISTANCE.apply(b, base)); }
/** * Compares two Strings with respect to the base String, by Levenshtein distance. * <p/> * The input that is the closest match to the base String will sort before the other. * * @param a an input to compare relative to the base. * @param b an input to compare relative to the base. * @return -1 if {@code a} is closer to the base than {@code b}; 1 if {@code b} is closer to * the base than {@code a}; 0 if both {@code a} and {@code b} are equally close to the * base. */ @Override public int compare(String a, String b) { // shortcuts if (a.equals(b)) { return 0; // comparing the same value; don't bother } else if (a.equals(base)) { return -1; // a is equal to the base, so it's always first } else if (b.equals(base)) { return 1; // b is equal to the base, so it's always first } // determine which of the two is closer to the base and order it first return Integer.compare(LEVENSHTEIN_DISTANCE.apply(a, base), LEVENSHTEIN_DISTANCE.apply(b, base)); }