public static final int noTimesSameOrder(final int[] term0Positions, final int[] term1Positions, final int windowSize, final int documentLengthInTokens) { //hack for following example: //noTimesSameOrder({1}, {7}, 8, 8) should be 1; if (term0Positions.length == 1 && term1Positions.length == 1 && term1Positions[0] > term0Positions[0] && term1Positions[0] - term0Positions[0] < windowSize) return 1; return noTimesSameOrder(term0Positions, 0, term0Positions.length, term1Positions, 0, term1Positions.length, windowSize, documentLengthInTokens); }
return countTrue(matchingWindows);
windowsForTerms(blocksForEachTerm[t], windowSize, numberOfNGrams, windows_for_terms[t]);
/** * how likely is it that these two postings have so many near-occurrences, * given the length of this document */ protected double scoreFDSD(boolean SD, int i, final Posting ip1, int j, final Posting ip2, final double _avgDocLen) { final int[] blocks1 = ((BlockPosting) ip1).getPositions(); final int[] blocks2 = ((BlockPosting) ip2).getPositions(); int docLength = ip1.getDocumentLength(); final int matchingNGrams = SD ? Distance.noTimesSameOrder(blocks1, blocks2, ngramLength, docLength) : Distance.noTimes(blocks1, blocks2, ngramLength, docLength); //System.err.println(this.getClass().getSimpleName() + " matchingNGrams="+matchingNGrams); final double s = scoreFDSD(matchingNGrams, docLength); if (Double.isNaN(s)) { System.err.println(this.getClass().getSimpleName() + " returned NaN for document " + ip1.getId() + " "+i+","+j+" pf="+matchingNGrams + " l="+ docLength); } return s; }
@Test public void testNoTimes_window2() { int[] x = new int[]{8,10,14,15}; int[] y = new int[]{1,4,6,12,17}; assertEquals(0, Distance.noTimes(new int[][]{x,y}, 2, 20)); x = new int[]{8}; y = new int[]{9}; //0:0-1 1:1-2 2:2-3 3:3-4 4:4-5 5:5-6 6:6-7 7:7-8 8:8-9 9:9-10 //10:10-11 11:11-12 12:12-13 13:13-14 14:14-15 15:15-16 16:16-17 17:18-18 18:18-19 //only 8-9 is a match assertEquals(1, Distance.noTimes(new int[][]{x,y}, 2, 20)); x = new int[]{8,10,14,15}; //8-9 and 9-10 are matches assertEquals(2, Distance.noTimes(new int[][]{x,y}, 2, 20)); y = new int[]{7}; //8-7 is a match assertEquals(1, Distance.noTimes(new int[][]{x,y}, 2, 20)); y = new int[]{7,9}; //(7,8), (8,9), (9,10) are matches assertEquals(3, Distance.noTimes(new int[][]{x,y}, 2, 20)); }
@Test public void testNoTimesNEW_2terms_window2() { for(int i=0;i<1000000;i++) { int[] x = new int[]{8,10,14,15}; int[] y = new int[]{1,4,6,12,17}; assertEquals(0, Distance.noTimesNEW(x,y, 2, 20)); x = new int[]{8}; y = new int[]{9}; //0:0-1 1:1-2 2:2-3 3:3-4 4:4-5 5:5-6 6:6-7 7:7-8 8:8-9 9:9-10 //10:10-11 11:11-12 12:12-13 13:13-14 14:14-15 15:15-16 16:16-17 17:18-18 18:18-19 //only 8-9 is a match assertEquals(1, Distance.noTimesNEW(x,y, 2, 20)); x = new int[]{8,10,14,15}; //8-9 and 9-10 are matches assertEquals(2, Distance.noTimesNEW(x,y, 2, 20)); y = new int[]{7}; //8-7 is a match assertEquals(1, Distance.noTimesNEW(x,y, 2, 20)); y = new int[]{7,9}; //(7,8), (8,9), (9,10) are matches assertEquals(3, Distance.noTimesNEW(x,y, 2, 20)); } }
/**
 * Checks that {@code Distance.findSmallest} yields 0 for the two unsorted
 * arrays below.
 */
@Test
public void testFindSmallest() {
    final int[] left = {8, 14, 10, 15};
    final int[] right = {4, 6, 10, 12, 17, 1};
    final int smallest = Distance.findSmallest(left, right);
    assertEquals(0, smallest);
}
} // closes an enclosing scope opened above this excerpt (presumably the test class)
/** * how likely is it that these two postings have so many near-occurrences, * given the length of this document */ protected double scoreFDSD(boolean SD, int i, final Posting ip1, int j, final Posting ip2, final double _avgDocLen) { final int[] blocks1 = ((BlockPosting) ip1).getPositions(); final int[] blocks2 = ((BlockPosting) ip2).getPositions(); int docLength = ip1.getDocumentLength(); final int matchingNGrams = SD ? Distance.noTimesSameOrder(blocks1, blocks2, ngramLength, docLength) : Distance.noTimes(blocks1, blocks2, ngramLength, docLength); //System.err.println(this.getClass().getSimpleName() + " matchingNGrams="+matchingNGrams); final double s = scoreFDSD(matchingNGrams, docLength); if (Double.isNaN(s)) { System.err.println(this.getClass().getSimpleName() + " returned NaN for document " + ip1.getId() + " "+i+","+j+" pf="+matchingNGrams + " l="+ docLength); } return s; }
@Test public void testNoTimes_2terms_window2() { for(int i=0;i<1000000;i++) { int[] x = new int[]{8,10,14,15}; int[] y = new int[]{1,4,6,12,17}; assertEquals(0, Distance.noTimes(x,y, 2, 20)); x = new int[]{8}; y = new int[]{9}; //0:0-1 1:1-2 2:2-3 3:3-4 4:4-5 5:5-6 6:6-7 7:7-8 8:8-9 9:9-10 //10:10-11 11:11-12 12:12-13 13:13-14 14:14-15 15:15-16 16:16-17 17:18-18 18:18-19 //only 8-9 is a match assertEquals(1, Distance.noTimes(x,y, 2, 20)); x = new int[]{8,10,14,15}; //8-9 and 9-10 are matches assertEquals(2, Distance.noTimes(x,y, 2, 20)); y = new int[]{7}; //8-7 is a match assertEquals(1, Distance.noTimes(x,y, 2, 20)); y = new int[]{7,9}; //(7,8), (8,9), (9,10) are matches assertEquals(3, Distance.noTimes(x,y, 2, 20)); } }
public static final int noTimesSameOrder(final int[] term0Positions, final int[] term1Positions, final int windowSize, final int documentLengthInTokens) { //hack for following example: //noTimesSameOrder({1}, {7}, 8, 8) should be 1; if (term0Positions.length == 1 && term1Positions.length == 1 && term1Positions[0] > term0Positions[0] && term1Positions[0] - term0Positions[0] < windowSize) return 1; return noTimesSameOrder(term0Positions, 0, term0Positions.length, term1Positions, 0, term1Positions.length, windowSize, documentLengthInTokens); }
final int[] windows_for_term1 = new int[numberOfNGrams]; final int[] windows_for_term2 = new int[numberOfNGrams]; windowsForTerms(blocksOfTerm1, windowSize, numberOfNGrams, windows_for_term1); windowsForTerms(blocksOfTerm2, windowSize, numberOfNGrams, windows_for_term2);
return countTrue(matchingWindows);
@Test public void testNoTimesSameOrder_window3() { int[] x = new int[]{8,10,14,15}; int[] y = new int[]{1,4,6,12,17}; assertEquals(0, Distance.noTimesSameOrder(x,y, 2, 20)); x = new int[]{8}; y = new int[]{9}; //0:0-2 1:1-3 2:2-4 3:3-5 4:4-6 5:5-7 6:6-8 7:7-9 8:8-10 9:9-11 //10:10-12 11:11-13 12:12-14 13:13-15 14:14-16 15:15-17 16:16-18 17:18-19 18:18-20 //only window 8-10 is a match assertEquals(1, Distance.noTimesSameOrder(x,y, 3, 20)); //(9,8) is not a match assertEquals(0, Distance.noTimesSameOrder(y,x, 3, 20)); x = new int[]{8,10,14,15}; //8-9 and 9-10 are matches assertEquals(1, Distance.noTimesSameOrder(x,y, 3, 20)); //8-7 is a NOT match, but 9-10 is a match assertEquals(1, Distance.noTimesSameOrder(x,y, 3, 20)); }
final int[] windows_for_term1 = new int[numberOfNGrams]; final int[] windows_for_term2 = new int[numberOfNGrams]; windowsForTerms(blocksOfTerm1, start1, end1, windowSize, numberOfNGrams, windows_for_term1); windowsForTerms(blocksOfTerm2, start2, end2, windowSize, numberOfNGrams, windows_for_term2);
@Test public void testNoTimesSameOrder_window2() { int[] x = new int[]{8,10,14,15}; int[] y = new int[]{1,4,6,12,17}; assertEquals(0, Distance.noTimesSameOrder(x,y, 2, 20)); x = new int[]{8}; y = new int[]{9}; //0:0-1 1:1-2 2:2-3 3:3-4 4:4-5 5:5-6 6:6-7 7:7-8 8:8-9 9:9-10 //10:10-11 11:11-12 12:12-13 13:13-14 14:14-15 15:15-16 16:16-17 17:18-18 18:18-19 //only 8-9 is a match assertEquals(1, Distance.noTimesSameOrder(x,y, 2, 20)); //9-8 is not a match assertEquals(0, Distance.noTimesSameOrder(y,x, 2, 20)); x = new int[]{8,10,14,15}; y = new int[]{9}; //8-9 is a match, 10-9 is not a match assertEquals(1, Distance.noTimesSameOrder(x,y, 2, 20)); //8-7 is a NOT match, but 9-10 is a match assertEquals(1, Distance.noTimesSameOrder(y,x, 2, 20)); }
final int[] windows_for_term1 = new int[numberOfNGrams]; final int[] windows_for_term2 = new int[numberOfNGrams]; windowsForTerms(blocksOfTerm1, windowSize, numberOfNGrams, windows_for_term1); windowsForTerms(blocksOfTerm2, windowSize, numberOfNGrams, windows_for_term2);
@Test public void testOverflow() { int[] x = new int[]{10,15}; int[] y = new int[]{1,6}; //assertEquals(0, Distance.noTimesSameOrder(x, 0, x.length -1, y, 0, y.length -1, 2, 872)); assertEquals(0, Distance.noTimesSameOrder(x, y, 2, 872)); }
final int[] windows_for_term1 = new int[numberOfNGrams]; final int[] windows_for_term2 = new int[numberOfNGrams]; windowsForTerms(blocksOfTerm1, start1, end1, windowSize, numberOfNGrams, windows_for_term1); windowsForTerms(blocksOfTerm2, start2, end2, windowSize, numberOfNGrams, windows_for_term2);
windowsForTerms(blocksForEachTerm[t], windowSize, numberOfNGrams, windows_for_terms[t]);