/**
 * Creates a link file writer on top of the given output.
 *
 * @param out the writer that is stored as this object's output
 * @param config configuration to take the identity properties from;
 *               may be null, in which case the identity properties
 *               are not set here
 */
public LinkFileWriter(Writer out, Configuration config) {
  this.out = out;

  // without a configuration there is nothing more to pick up
  if (config == null)
    return;

  this.idprops = config.getIdentityProperties();
}
/**
 * Replaces the comparator of this aspect's property (looked up by
 * name in the given configuration) with one picked uniformly at
 * random from the list of available comparators.
 */
public void setRandomly(GeneticConfiguration cfg) {
  Property target = cfg.getConfiguration().getPropertyByName(prop.getName());

  // Math.random() is in [0, 1), so the index is always < size()
  int pick = (int) (comparators.size() * Math.random());
  target.setComparator(comparators.get(pick));
}
/** * Checks to see if we need the spatial support, and if so creates * the necessary context objects. */ private void initSpatial() { // FIXME: for now, we only use geosearch if that's the only way to // find suitable records, since we don't know how to combine // geosearch ranking with normal search ranking. if (config.getLookupProperties().size() != 1) return; Property prop = config.getLookupProperties().iterator().next(); if (!(prop.getComparator() instanceof GeopositionComparator)) return; geoprop = new GeoProperty(prop); }
private void init(LinkDatabase linkdb, Configuration config, boolean debug, Processor processor, boolean showmatches, boolean pretty) { this.golddb = linkdb; this.dukedb = new InMemoryLinkDatabase(); //((InMemoryLinkDatabase) this.dukedb).setDoInference(true); this.idprops = config.getIdentityProperties(); this.props = config.getProperties(); this.debug = debug; this.processor = processor; this.database = processor.getDatabase(); this.linkage = !config.isDeduplicationMode(); this.showmatches = showmatches; this.pretty = pretty; }
/**
 * Moves the threshold of the given configuration to a new value
 * produced by drift() from the current threshold, passing 1.0 and
 * 0.0 as the limits (presumably upper and lower bound; confirm
 * against drift()).
 */
public void setRandomly(GeneticConfiguration cfg) {
  Configuration target = cfg.getConfiguration();
  target.setThreshold(drift(target.getThreshold(), 1.0, 0.0));
}
/**
 * Indexes all records from the configured data sources into the
 * given database, using a new Processor and DEFAULT_BATCH_SIZE.
 * In deduplication mode the single set of data sources is indexed;
 * otherwise group 1 is indexed first, then group 2.
 */
private static void reindex(Configuration config, Database database) {
  System.out.println("Reindexing all records...");
  Processor indexer = new Processor(config, database);

  if (!config.isDeduplicationMode()) {
    // record linkage: two separate groups of data sources
    indexer.index(config.getDataSources(1), DEFAULT_BATCH_SIZE);
    indexer.index(config.getDataSources(2), DEFAULT_BATCH_SIZE);
    return;
  }

  indexer.index(config.getDataSources(), DEFAULT_BATCH_SIZE);
}
}
/**
 * Loads the empty configuration file and verifies that it yields no
 * data sources (ungrouped, group 1 or group 2), no properties, a
 * threshold of 0.4 and a maybe-threshold of 0.0.
 */
@Test
public void testEmpty() throws IOException, SAXException {
  Configuration config = ConfigLoader.load("classpath:config-empty.xml");

  assertTrue(config.getDataSources().isEmpty());
  assertTrue(config.getDataSources(1).isEmpty());
  assertTrue(config.getDataSources(2).isEmpty());

  // expected value first, and an explicit tolerance since these are
  // floating-point comparisons
  assertEquals(0.4, config.getThreshold(), 1e-9);
  assertEquals(0.0, config.getMaybeThreshold(), 1e-9);

  assertTrue(config.getProperties().isEmpty());
}
config.validate(); for (Property p : config.getLookupProperties()) System.out.println(" " + p.getName()); System.out.println(); parser.getOptionState("showmaybe"), progress, !config.isDeduplicationMode(), config.getProperties(), pretty); processor.addMatchListener(listener); String fname = parser.getOptionValue("linkfile"); if (fname.endsWith(".ntriples")) linkfile = new NTriplesLinkFileListener(fname, config.getIdentityProperties()); else linkfile = new LinkFileListener(fname, config.getIdentityProperties(), interactive, parser.getOptionValue("testfile")); if (config.isDeduplicationMode()) throw new DukeConfigException("--singlematch only works in record linkage mode"); matchall = false; if (config.isDeduplicationMode()) processor.deduplicate(config.getDataSources(), batch_size); else {
/**
 * Moves the high probability of this aspect's property (looked up by
 * name in the given configuration) to a new value produced by
 * drift(), passing 1.0 and 0.5 as the limits (presumably upper and
 * lower bound; confirm against drift()).
 */
public void setRandomly(GeneticConfiguration cfg) {
  Configuration config = cfg.getConfiguration();
  Property p = config.getPropertyByName(prop.getName());

  // drift from the property's own current high probability; the
  // configuration threshold is a different quantity and must not be
  // used as the starting point here
  double new_value = drift(p.getHighProbability(), 1.0, 0.5);
  p.setHighProbability(new_value);
}
/** * Add the record to the index. */ public void index(Record record) { // FIXME: check if record is already indexed // allocate an ID for this record long id = store.makeNewRecordId(); store.registerRecord(id, record); // go through ID properties and register them for (Property p : config.getIdentityProperties()) for (String extid : record.getValues(p.getName())) store.registerId(id, extid); // go through lookup properties and register those for (Property p : config.getLookupProperties()) { String propname = p.getName(); for (String value : record.getValues(propname)) { String[] tokens = StringUtils.split(value); for (int ix = 0; ix < tokens.length; ix++) store.registerToken(id, propname, tokens[ix]); } } }
/** * Passes on all matches found. */ protected void compareCandidatesSimple(Record record, Collection<Record> candidates) { boolean found = false; for (Record candidate : candidates) { if (isSameAs(record, candidate)) continue; double prob = compare(record, candidate); if (prob > config.getThreshold()) { found = true; registerMatch(record, candidate, prob); } else if (config.getMaybeThreshold() != 0.0 && prob > config.getMaybeThreshold()) { found = true; // I guess? registerMatchPerhaps(record, candidate, prob); } } if (!found) registerNoMatchFor(record); }
if (config.isDeduplicationMode()) sources = config.getDataSources(); else sources = config.getDataSources(1); database = config.getDatabase(true); for (DataSource src : sources) { RecordIterator it = src.getRecords(); if (!config.isDeduplicationMode() && active) { for (DataSource src : config.getDataSources(2)) { RecordIterator it = src.getRecords(); while (it.hasNext()) {
/** * Creates a new processor, bound to the given database. */ public Processor(Configuration config, Database database) { this.config = config; this.database1 = database; // using this List implementation so that listeners can be removed // while Duke is running (see issue 117) this.listeners = new CopyOnWriteArrayList<MatchListener>(); this.logger = new DummyLogger(); this.threads = 1; // precomputing for later optimizations this.proporder = new ArrayList(); for (Property p : config.getProperties()) if (!p.isIdProperty()) proporder.add(p); Collections.sort(proporder, new PropertyComparator()); // still precomputing double prob = 0.5; accprob = new double[proporder.size()]; for (int ix = proporder.size() - 1; ix >= 0; ix--) { prob = Utils.computeBayes(prob, proporder.get(ix).getHighProbability()); accprob[ix] = prob; } }
/**
 * Prints every record from every configured data source (ungrouped
 * sources first, then group 1, then group 2) using
 * PrintMatchListener.prettyPrint, with an empty line after each
 * record.
 */
private static void showdata(Configuration config) {
  List<Property> props = config.getProperties();

  List<DataSource> sources = new ArrayList<DataSource>();
  sources.addAll(config.getDataSources());
  sources.addAll(config.getDataSources(1));
  sources.addAll(config.getDataSources(2));

  for (DataSource src : sources) {
    RecordIterator it = src.getRecords();
    try {
      while (it.hasNext()) {
        Record r = it.next();
        PrintMatchListener.prettyPrint(r, props);
        System.out.println("");
      }
    } finally {
      // release the iterator even if reading or printing fails
      it.close();
    }
  }
}
/**
 * Returns the brief summary used in the command-line output: the
 * threshold, then one bracketed entry per property (name only for
 * ID properties; name, comparator, high and low probability for the
 * rest), then the mutation and recombination rates.
 */
public String toString() {
  StringBuilder buf = new StringBuilder("[GeneticConfiguration ");
  buf.append(shortnum(config.getThreshold()));

  for (Property p : config.getProperties()) {
    if (p.isIdProperty()) {
      buf.append(" [").append(p.getName()).append("]");
    } else {
      buf.append(" [").append(p.getName())
         .append(" ").append(shortname(p.getComparator()))
         .append(" ").append(shortnum(p.getHighProbability()))
         .append(" ").append(shortnum(p.getLowProbability()))
         .append("]");
    }
  }

  buf.append(" mr=").append(mutation_rate)
     .append(" rr=").append(shortnum(recombination_rate))
     .append("]");
  return buf.toString();
}
/**
 * Links the records of data source group 1 against those of group 2
 * without linking records inside either group. Delegates to the
 * three-argument link() with DEFAULT_BATCH_SIZE.
 */
public void link() {
  this.link(this.config.getDataSources(1),
            this.config.getDataSources(2),
            DEFAULT_BATCH_SIZE);
}
/**
 * Creates a new processor using the two databases of the given
 * configuration: database 1 is handed to the (config, database)
 * constructor and database 2 is stored as the second database.
 *
 * @param config the configuration to run with
 * @param overwrite If true, make new Lucene index. If false, leave
 *                  existing data.
 */
public Processor(Configuration config, boolean overwrite) {
  this(config, config.getDatabase(1, overwrite));
  this.database2 = config.getDatabase(2, overwrite);
}
/**
 * Creates a child of the given configuration. The wrapped
 * Configuration is a copy of the parent's, the aspects list is the
 * very same object as the parent's, and the mutation and
 * recombination rates are taken over from the parent.
 *
 * @param config the genetic configuration to derive from; it is
 *               remembered as this object's parent
 */
public GeneticConfiguration(GeneticConfiguration config) {
  parent = config;

  // note: the parameter shadows the field of the same name
  this.config = config.getConfiguration().copy();

  aspects = config.aspects; // shared, not copied
  mutation_rate = config.getMutationRate();
  recombination_rate = config.getRecombinationRate();
}
comparators.addAll(config.getCustomComparators()); for (Property prop : config.getProperties()) { if (!prop.isIdProperty()) { if (evolve_comparators)
/**
 * Asks every configuration in the population whether it considers
 * the two records a match.
 *
 * <p>Each record is looked up by ID in the database first and, if
 * not found there, in the secondary collection. For each
 * configuration a new Processor is created on the database and the
 * pair is compared; the configuration counts as a believer when the
 * result is above its own threshold.
 *
 * @return one flag per configuration, in the order of
 *         population.getConfigs()
 */
private boolean[] whoThinksThisIsTrue(String id1, String id2) {
  Record first = database.findRecordById(id1);
  if (first == null)
    first = secondary.get(id1);

  Record second = database.findRecordById(id2);
  if (second == null)
    second = secondary.get(id2);

  List<GeneticConfiguration> members = population.getConfigs();
  boolean[] votes = new boolean[members.size()];

  for (int pos = 0; pos < votes.length; pos++) {
    Configuration candidate = members.get(pos).getConfiguration();
    Processor judge = new Processor(candidate, database);
    double score = judge.compare(first, second);
    votes[pos] = score > candidate.getThreshold();
  }

  return votes;
}
}