/** * Passes on all matches found. */ protected void compareCandidatesSimple(Record record, Collection<Record> candidates) { boolean found = false; for (Record candidate : candidates) { if (isSameAs(record, candidate)) continue; double prob = compare(record, candidate); if (prob > config.getThreshold()) { found = true; registerMatch(record, candidate, prob); } else if (config.getMaybeThreshold() != 0.0 && prob > config.getMaybeThreshold()) { found = true; // I guess? registerMatchPerhaps(record, candidate, prob); } } if (!found) registerNoMatchFor(record); }
/**
 * Loads config-empty.xml from the classpath and verifies that it has
 * no data sources (in any group), no properties, a threshold of 0.4
 * and a maybe-threshold of 0.0.
 */
@Test
public void testEmpty() throws IOException, SAXException {
  Configuration config = ConfigLoader.load("classpath:config-empty.xml");

  assertTrue(config.getDataSources().isEmpty());
  assertTrue(config.getDataSources(1).isEmpty());
  assertTrue(config.getDataSources(2).isEmpty());
  // expected value goes first so that failure messages read correctly
  assertEquals(0.4, config.getThreshold());
  assertEquals(0.0, config.getMaybeThreshold());
  assertTrue(config.getProperties().isEmpty());
}
/** * Passes on only the best match for each record. */ protected void compareCandidatesBest(Record record, Collection<Record> candidates) { double max = 0.0; Record best = null; // go through all candidates, and find the best for (Record candidate : candidates) { if (isSameAs(record, candidate)) continue; double prob = compare(record, candidate); if (prob > max) { max = prob; best = candidate; } } // pass on the best match, if any if (logger.isDebugEnabled()) { logger.debug("Best candidate at " + max + " is " + best); } if (max > config.getThreshold()) registerMatch(record, best, max); else if (config.getMaybeThreshold() != 0.0 && max > config.getMaybeThreshold()) registerMatchPerhaps(record, best, max); else registerNoMatchFor(record); }
/**
 * Loads a minimal configuration from an in-memory XML string and
 * verifies that it has no data sources (in any group), no properties,
 * a threshold of 0.4 and a maybe-threshold of 0.0.
 */
@Test
public void testString() throws IOException, SAXException {
  String cfg = "<duke>" +
               "<schema>" +
               "<threshold>0.4</threshold>" +
               "</schema>" +
               "</duke>";

  Configuration config = ConfigLoader.loadFromString(cfg);

  assertTrue(config.getDataSources().isEmpty());
  assertTrue(config.getDataSources(1).isEmpty());
  assertTrue(config.getDataSources(2).isEmpty());
  // expected value goes first so that failure messages read correctly
  assertEquals(0.4, config.getThreshold());
  assertEquals(0.0, config.getMaybeThreshold());
  assertTrue(config.getProperties().isEmpty());
}
/**
 * Loads config-empty.xml, verifies its contents, writes it back out
 * with ConfigWriter to a temporary file, reloads that file and
 * verifies that the reloaded configuration has the same contents.
 */
@Test
public void testEmpty() throws IOException, SAXException {
  Configuration config = ConfigLoader.load("classpath:config-empty.xml");

  assertTrue(config.getDataSources().isEmpty());
  assertTrue(config.getDataSources(1).isEmpty());
  assertTrue(config.getDataSources(2).isEmpty());
  // expected value goes first so that failure messages read correctly
  assertEquals(0.4, config.getThreshold());
  assertEquals(0.0, config.getMaybeThreshold());
  assertTrue(config.getProperties().isEmpty());

  File outfile = tmpdir.newFile("config.xml");
  FileOutputStream out = new FileOutputStream(outfile.getAbsolutePath());
  try {
    ConfigWriter writer = new ConfigWriter(out);
    writer.write(config);
  } finally {
    // make sure the file is closed before it is read back in below
    out.close();
  }

  config = ConfigLoader.load(outfile.getAbsolutePath());

  assertTrue(config.getDataSources().isEmpty());
  assertTrue(config.getDataSources(1).isEmpty());
  assertTrue(config.getDataSources(2).isEmpty());
  assertEquals(0.4, config.getThreshold());
  assertEquals(0.0, config.getMaybeThreshold());
  assertTrue(config.getProperties().isEmpty());
}
assertTrue(config.getDataSources(2).isEmpty());
// expected value first, consistent with the size assertion below
assertEquals(0.85, config.getThreshold());
assertEquals(0.7, config.getMaybeThreshold());
assertEquals(3, config.getProperties().size());
assertTrue(config.getDataSources(2).isEmpty());
// expected value first, consistent with the size assertion below
assertEquals(0.85, config.getThreshold());
assertEquals(0.7, config.getMaybeThreshold());
assertEquals(3, config.getProperties().size());
assertTrue(config.getDataSources(2).isEmpty());
// expected value first, consistent with the size assertion below
assertEquals(0.85, config.getThreshold());
assertEquals(0.7, config.getMaybeThreshold());
assertEquals(3, config.getProperties().size());
assertTrue(config.getDataSources(2).isEmpty());
// expected value first, consistent with the size assertion below
assertEquals(0.85, config.getThreshold());
assertEquals(0.7, config.getMaybeThreshold());
assertEquals(3, config.getProperties().size());
// the maybe-threshold element is only written when the value is non-zero
if (config.getMaybeThreshold() != 0.0) {
  writeElement("maybe-threshold", String.valueOf(config.getMaybeThreshold()));
}
/** * Passes on all matches found. */ protected void compareCandidatesSimple(Record record, Collection<Record> candidates) { boolean found = false; for (Record candidate : candidates) { if (isSameAs(record, candidate)) continue; double prob = compare(record, candidate); if (prob > config.getThreshold()) { found = true; registerMatch(record, candidate, prob); } else if (config.getMaybeThreshold() != 0.0 && prob > config.getMaybeThreshold()) { found = true; // I guess? registerMatchPerhaps(record, candidate, prob); } } if (!found) registerNoMatchFor(record); }
/** * Passes on only the best match for each record. */ protected void compareCandidatesBest(Record record, Collection<Record> candidates) { double max = 0.0; Record best = null; // go through all candidates, and find the best for (Record candidate : candidates) { if (isSameAs(record, candidate)) continue; double prob = compare(record, candidate); if (prob > max) { max = prob; best = candidate; } } // pass on the best match, if any if (max > config.getThreshold()) registerMatch(record, best, max); else if (config.getMaybeThreshold() != 0.0 && max > config.getMaybeThreshold()) registerMatchPerhaps(record, best, max); else registerNoMatchFor(record); }
/** * Writes the given configuration to the given file. */ public static void write(Configuration config, String file) throws IOException { FileOutputStream fos = new FileOutputStream(file); XMLPrettyPrinter pp = new XMLPrettyPrinter(fos); pp.startDocument(); pp.startElement("duke", null); // FIXME: here we should write the objects, but that's not // possible with the current API. we don't need that for the // genetic algorithm at the moment, but it will be needed in // future. pp.startElement("schema", null); writeElement(pp, "threshold", "" + config.getThreshold()); if (config.getMaybeThreshold() != 0.0) writeElement(pp, "maybe-threshold", "" + config.getMaybeThreshold()); for (Property p : config.getProperties()) writeProperty(pp, p); pp.endElement("schema"); if (config.isDeduplicationMode()) for (DataSource src : config.getDataSources()) writeDataSource(pp, src); pp.endElement("duke"); pp.endDocument(); fos.close(); }