/** * Creates a new processor, bound to the given database. */ public Processor(Configuration config, Database database) { this.config = config; this.database1 = database; // using this List implementation so that listeners can be removed // while Duke is running (see issue 117) this.listeners = new CopyOnWriteArrayList<MatchListener>(); this.logger = new DummyLogger(); this.threads = 1; // precomputing for later optimizations this.proporder = new ArrayList(); for (Property p : config.getProperties()) if (!p.isIdProperty()) proporder.add(p); Collections.sort(proporder, new PropertyComparator()); // still precomputing double prob = 0.5; accprob = new double[proporder.size()]; for (int ix = proporder.size() - 1; ix >= 0; ix--) { prob = Utils.computeBayes(prob, proporder.get(ix).getHighProbability()); accprob[ix] = prob; } }
/**
 * Looks up candidate matches for the record in the database selected
 * by {@code dbno} and compares the record against them.
 *
 * <p>Time spent in the lookup is added to {@code searching}, time spent
 * in the comparison to {@code comparing} (both in milliseconds).
 *
 * @param dbno number of the database to search, as understood by getDB
 * @param record the record to find matches for
 * @param matchall if true the candidates go to compareCandidatesSimple,
 *                 otherwise to compareCandidatesBest
 */
private void match(int dbno, Record record, boolean matchall) {
  // phase 1: candidate lookup
  long searchBegan = System.currentTimeMillis();
  Collection<Record> candidates = getDB(dbno).findCandidateMatches(record);
  searching += System.currentTimeMillis() - searchBegan;

  if (logger.isDebugEnabled()) {
    logger.debug("Matching record "
                 + PrintMatchListener.toString(record, config.getProperties())
                 + " found " + candidates.size() + " candidates");
  }

  // phase 2: comparison
  long compareBegan = System.currentTimeMillis();
  if (!matchall) {
    compareCandidatesBest(record, candidates);
  } else {
    compareCandidatesSimple(record, candidates);
  }
  comparing += System.currentTimeMillis() - compareBegan;
}
/**
 * Pretty-prints every record of every data source in the configuration
 * to standard output, each record followed by an empty line.
 *
 * <p>The sources visited are those returned by {@code getDataSources()},
 * {@code getDataSources(1)} and {@code getDataSources(2)}, in that order.
 *
 * @param config the configuration whose data sources are dumped
 */
private static void showdata(Configuration config) {
  List<Property> props = config.getProperties();

  List<DataSource> sources = new ArrayList<DataSource>();
  sources.addAll(config.getDataSources());
  sources.addAll(config.getDataSources(1));
  sources.addAll(config.getDataSources(2));

  for (DataSource src : sources) {
    RecordIterator it = src.getRecords();
    try {
      while (it.hasNext()) {
        Record r = it.next();
        PrintMatchListener.prettyPrint(r, props);
        System.out.println("");
      }
    } finally {
      // release the iterator even if reading or printing a record fails
      it.close();
    }
  }
}
for (Property prop : config.getProperties()) { if (!prop.isIdProperty()) { if (evolve_comparators)
private void init(LinkDatabase linkdb, Configuration config, boolean debug, Processor processor, boolean showmatches, boolean pretty) { this.golddb = linkdb; this.dukedb = new InMemoryLinkDatabase(); //((InMemoryLinkDatabase) this.dukedb).setDoInference(true); this.idprops = config.getIdentityProperties(); this.props = config.getProperties(); this.debug = debug; this.processor = processor; this.database = processor.getDatabase(); this.linkage = !config.isDeduplicationMode(); this.showmatches = showmatches; this.pretty = pretty; }
/**
 * Returns the brief summary used in the command-line output.
 *
 * <p>The summary starts with the threshold, lists each property in
 * brackets (id properties by name only, other properties with their
 * comparator and high/low probabilities), and ends with the mutation
 * and recombination rates.
 */
public String toString() {
  StringBuilder summary = new StringBuilder();
  summary.append("[GeneticConfiguration ")
         .append(shortnum(config.getThreshold()));

  for (Property prop : config.getProperties()) {
    summary.append(" [").append(prop.getName());
    if (!prop.isIdProperty()) {
      summary.append(" ").append(shortname(prop.getComparator()))
             .append(" ").append(shortnum(prop.getHighProbability()))
             .append(" ").append(shortnum(prop.getLowProbability()));
    }
    summary.append("]");
  }

  summary.append(" mr=").append(mutation_rate)
         .append(" rr=").append(shortnum(recombination_rate))
         .append("]");
  return summary.toString();
}
/**
 * Loads the empty configuration from the classpath and checks that it
 * has no data sources, no properties, a threshold of 0.4 and a maybe
 * threshold of 0.0.
 */
@Test
public void testEmpty() throws IOException, SAXException {
  Configuration config = ConfigLoader.load("classpath:config-empty.xml");

  assertTrue(config.getDataSources().isEmpty());
  assertTrue(config.getDataSources(1).isEmpty());
  assertTrue(config.getDataSources(2).isEmpty());

  // expected value first, so failure messages read correctly; a delta
  // of 0.0 keeps the comparison exact
  assertEquals(0.4, config.getThreshold(), 0.0);
  assertEquals(0.0, config.getMaybeThreshold(), 0.0);

  assertTrue(config.getProperties().isEmpty());
}
public void run(String[] argv) throws IOException, SAXException { Collection<CommandLineParser.Option> options = Collections.singleton((CommandLineParser.Option) new CommandLineParser.StringOption("maxhits", 'H')); argv = init(argv, 3, 3, options); int max_hits = 10000; if (parser.getOptionValue("maxhits") != null) max_hits = Integer.parseInt(parser.getOptionValue("maxhits")); // build record RecordImpl prototype = new RecordImpl(); prototype.addValue(argv[1], argv[2]); // search Collection<Record> records = database.findCandidateMatches(prototype); int hitno = 1; for (Record record : records) { PrintMatchListener.prettyPrint(record, config.getProperties()); System.out.println(); if (hitno++ == max_hits) break; } }
for (Property prop : config.getProperties()) { if (prop.isIdProperty()) continue;
/**
 * Loads a minimal configuration from an XML string and checks that it
 * has no data sources, no properties, a threshold of 0.4 and a maybe
 * threshold of 0.0.
 */
@Test
public void testString() throws IOException, SAXException {
  String cfg = "<duke>" +
               "<schema>" +
               "<threshold>0.4</threshold>" +
               "</schema>" +
               "</duke>";

  Configuration config = ConfigLoader.loadFromString(cfg);

  assertTrue(config.getDataSources().isEmpty());
  assertTrue(config.getDataSources(1).isEmpty());
  assertTrue(config.getDataSources(2).isEmpty());

  // expected value first, so failure messages read correctly; a delta
  // of 0.0 keeps the comparison exact
  assertEquals(0.4, config.getThreshold(), 0.0);
  assertEquals(0.0, config.getMaybeThreshold(), 0.0);

  assertTrue(config.getProperties().isEmpty());
}
/**
 * Round-trip test: loads the empty configuration, writes it to a
 * temporary file with ConfigWriter, reloads it from that file, and
 * checks that both the original and the reloaded configuration are
 * empty with the expected thresholds.
 */
@Test
public void testEmpty() throws IOException, SAXException {
  Configuration config = ConfigLoader.load("classpath:config-empty.xml");
  assertEmptyConfiguration(config);

  File outfile = tmpdir.newFile("config.xml");
  FileOutputStream out = new FileOutputStream(outfile.getAbsolutePath());
  try {
    ConfigWriter writer = new ConfigWriter(out);
    writer.write(config);
  } finally {
    // make sure the file handle is released before the file is re-read
    out.close();
  }

  config = ConfigLoader.load(outfile.getAbsolutePath());
  assertEmptyConfiguration(config);
}

/**
 * Asserts that the configuration has no data sources (ungrouped, group
 * 1, group 2), no properties, threshold 0.4 and maybe-threshold 0.0.
 */
private static void assertEmptyConfiguration(Configuration config) {
  assertTrue(config.getDataSources().isEmpty());
  assertTrue(config.getDataSources(1).isEmpty());
  assertTrue(config.getDataSources(2).isEmpty());

  // expected value first; a delta of 0.0 keeps the comparison exact
  assertEquals(0.4, config.getThreshold(), 0.0);
  assertEquals(0.0, config.getMaybeThreshold(), 0.0);

  assertTrue(config.getProperties().isEmpty());
}
/**
 * Walks through the pairs produced by a Filter over the tracker's
 * exemplars, shows each pair, asks the oracle what kind of link it is,
 * and records the answer as an asserted link in {@code testdb}.
 *
 * <p>Stops when the filter has no more pairs or when the number of
 * pairs handled in this call equals {@code questions}. The number of
 * pairs handled is added to {@code asked}.
 *
 * @param tracker supplies the exemplars to ask about
 */
private void askQuestions(ExemplarsTracker tracker) {
  int count = 0;
  Filter f = new Filter(tracker.getExemplars());

  while (true) {
    Pair pair = f.getNext();
    if (pair == null) {
      break;
    }

    // a record missing from the database is looked up in 'secondary';
    // this is done for both sides of the pair so that neither record is
    // passed on as null when the fallback could have supplied it
    Record r1 = database.findRecordById(pair.id1);
    if (r1 == null) {
      r1 = secondary.get(pair.id1);
    }
    Record r2 = database.findRecordById(pair.id2);
    if (r2 == null) {
      r2 = secondary.get(pair.id2);
    }

    System.out.println();
    PrintMatchListener.prettyCompare(r1, r2, (double) pair.counter,
                                     "Possible match",
                                     config.getProperties());

    LinkKind kind = oracle.getLinkKind(pair.id1, pair.id2);
    Link link = new Link(pair.id1, pair.id2, LinkStatus.ASSERTED, kind, 1.0);
    testdb.assertLink(link);

    count++;
    if (count == questions) {
      break;
    }
  }

  asked += count;
}
assertEquals(config.getThreshold(), 0.85); assertEquals(config.getMaybeThreshold(), 0.7); assertEquals(3, config.getProperties().size());
@Test public void testMakeRandomCopy() { GeneticConfiguration conf = new GeneticConfiguration(config1); GeneticConfiguration confrand = conf.makeRandomCopy(); Configuration rand = confrand.getConfiguration(); assertTrue("shouldn't have a parent", conf.getParent() == null); assertTrue("wrong parent", confrand.getParent() == conf); assertEquals("wrong number of properties", rand.getProperties().size(), 3); // same properties, but most aspects should now be different. // don't really want to a computation of how many aspects are different, // because with some degree of statistical likelihood, any limit is going // to be wrong some of the time. let's try this one and see how it works // out. let me know if this causes problems. int aspects = 1 + (3 * (config1.getProperties().size() - 1)); int differences = countDifferences(config1, rand); assertTrue("Not enough differences: " + differences, differences > 3); }
assertEquals(config.getThreshold(), 0.85); assertEquals(config.getMaybeThreshold(), 0.7); assertEquals(3, config.getProperties().size());
assertEquals(config.getThreshold(), 0.85); assertEquals(config.getMaybeThreshold(), 0.7); assertEquals(3, config.getProperties().size());
assertEquals(config.getThreshold(), 0.85); assertEquals(config.getMaybeThreshold(), 0.7); assertEquals(3, config.getProperties().size());
progress, !config.isDeduplicationMode(), config.getProperties(), pretty); processor.addMatchListener(listener);
writeElement("maybe-threshold", "" + config.getMaybeThreshold()); for (Property p : config.getProperties()) writeProperty(p);
public void run(String[] argv) throws IOException, SAXException { argv = init(argv, 3, 3); // build record RecordImpl prototype = new RecordImpl(); prototype.addValue(argv[1], argv[2]); // search Collection<Record> records = database.findCandidateMatches(prototype); for (Record record : records) { PrintMatchListener.prettyPrint(record, config.getProperties()); System.out.println(); } }