/**
 * Runs record linkage between the two configured groups of data
 * sources, using the default batch size. Records are only linked
 * across the groups; records within the same group are not linked
 * to each other.
 */
public void link() {
  this.link(
      this.config.getDataSources(1),
      this.config.getDataSources(2),
      DEFAULT_BATCH_SIZE);
}
/**
 * Deduplicates the records of all configured data sources using the
 * default batch size. Every available record is read and handled
 * batch by batch, and the listeners are notified throughout.
 */
public void deduplicate() {
  this.deduplicate(
      this.config.getDataSources(),
      DEFAULT_BATCH_SIZE);
}
/**
 * Deduplicates the records of all configured data sources. Every
 * available record is read and handled batch by batch, and the
 * listeners are notified throughout.
 *
 * @param batch_size the batch size handed on to the underlying
 *                   deduplication run
 */
public void deduplicate(int batch_size) {
  this.deduplicate(
      this.config.getDataSources(),
      batch_size);
}
/**
 * Announces the reindexing on standard output and then indexes the
 * configured data sources through a freshly created processor.
 * In deduplication mode the ungrouped data sources are indexed;
 * otherwise groups 1 and 2 are indexed one after the other.
 */
private static void reindex(Configuration config, Database database) {
  System.out.println("Reindexing all records...");
  Processor indexer = new Processor(config, database);

  if (!config.isDeduplicationMode()) {
    // two groups configured: index each of them in turn
    indexer.index(config.getDataSources(1), DEFAULT_BATCH_SIZE);
    indexer.index(config.getDataSources(2), DEFAULT_BATCH_SIZE);
    return;
  }

  indexer.index(config.getDataSources(), DEFAULT_BATCH_SIZE);
}
} // closes the enclosing scope, whose opening is outside this excerpt
/**
 * Pretty-prints every record of every configured data source to
 * standard output, each followed by an empty line. The ungrouped
 * data sources are printed first, then those of group 1 and group 2.
 *
 * @param config the configuration supplying the data sources and the
 *               properties used for printing
 */
private static void showdata(Configuration config) {
  List<Property> props = config.getProperties();

  // parameterized instead of a raw ArrayList, so no unchecked conversion
  List<DataSource> sources = new ArrayList<DataSource>();
  sources.addAll(config.getDataSources());
  sources.addAll(config.getDataSources(1));
  sources.addAll(config.getDataSources(2));

  for (DataSource src : sources) {
    RecordIterator it = src.getRecords();
    try {
      while (it.hasNext()) {
        Record r = it.next();
        PrintMatchListener.prettyPrint(r, props);
        System.out.println("");
      }
    } finally {
      // release the iterator even if reading or printing a record fails
      it.close();
    }
  }
}
/**
 * Loads config-empty.xml from the classpath and checks that it yields
 * no data sources (ungrouped or in either group), no properties, a
 * threshold of 0.4 and a maybe-threshold of 0.0.
 */
@Test
public void testEmpty() throws IOException, SAXException {
  Configuration config = ConfigLoader.load("classpath:config-empty.xml");

  assertTrue(config.getDataSources().isEmpty());
  assertTrue(config.getDataSources(1).isEmpty());
  assertTrue(config.getDataSources(2).isEmpty());

  // expected value first, with an explicit delta for the floating-point
  // comparison; a delta of 0.0 keeps the check exact
  assertEquals(0.4, config.getThreshold(), 0.0);
  assertEquals(0.0, config.getMaybeThreshold(), 0.0);

  assertTrue(config.getProperties().isEmpty());
}
/**
 * Loads a minimal configuration from an in-memory XML string that only
 * sets the threshold, and checks that it yields no data sources
 * (ungrouped or in either group), no properties, a threshold of 0.4
 * and a maybe-threshold of 0.0.
 */
@Test
public void testString() throws IOException, SAXException {
  String cfg = "<duke>" +
               "<schema>" +
               "<threshold>0.4</threshold>" +
               "</schema>" +
               "</duke>";

  Configuration config = ConfigLoader.loadFromString(cfg);

  assertTrue(config.getDataSources().isEmpty());
  assertTrue(config.getDataSources(1).isEmpty());
  assertTrue(config.getDataSources(2).isEmpty());

  // expected value first, with an explicit delta for the floating-point
  // comparison; a delta of 0.0 keeps the check exact
  assertEquals(0.4, config.getThreshold(), 0.0);
  assertEquals(0.0, config.getMaybeThreshold(), 0.0);

  assertTrue(config.getProperties().isEmpty());
}
/**
 * Round-trips the empty configuration: loads config-empty.xml, checks
 * it, writes it to a temporary file with ConfigWriter, loads that file
 * again and checks that the reloaded configuration is the same.
 */
@Test
public void testEmpty() throws IOException, SAXException {
  Configuration config = ConfigLoader.load("classpath:config-empty.xml");
  assertEmptyConfiguration(config);

  File outfile = tmpdir.newFile("config.xml");
  FileOutputStream out = new FileOutputStream(outfile.getAbsolutePath());
  try {
    ConfigWriter writer = new ConfigWriter(out);
    writer.write(config);
  } finally {
    // close the stream before reading the file back, and do not leak
    // the file handle if writing fails
    out.close();
  }

  config = ConfigLoader.load(outfile.getAbsolutePath());
  assertEmptyConfiguration(config);
}

/**
 * Asserts that the configuration has no data sources (ungrouped or in
 * either group), no properties, a threshold of 0.4 and a
 * maybe-threshold of 0.0.
 */
private static void assertEmptyConfiguration(Configuration config) {
  assertTrue(config.getDataSources().isEmpty());
  assertTrue(config.getDataSources(1).isEmpty());
  assertTrue(config.getDataSources(2).isEmpty());

  // expected value first, with an explicit delta for the floating-point
  // comparison; a delta of 0.0 keeps the check exact
  assertEquals(0.4, config.getThreshold(), 0.0);
  assertEquals(0.0, config.getMaybeThreshold(), 0.0);

  assertTrue(config.getProperties().isEmpty());
}
sources = config.getDataSources(); else sources = config.getDataSources(1); for (DataSource src : config.getDataSources(2)) { RecordIterator it = src.getRecords(); while (it.hasNext()) {
assertTrue(config.getDataSources().isEmpty()); assertTrue(config.getDataSources(1).isEmpty()); assertTrue(config.getDataSources(2).isEmpty()); assertEquals(config.getThreshold(), 0.85); assertEquals(config.getMaybeThreshold(), 0.7);
for (DataSource src : config.getDataSources()) writeDataSource(src); else { pp.startElement("group", null); for (DataSource src : config.getDataSources(1)) writeDataSource(src); pp.endElement("group"); for (DataSource src : config.getDataSources(2)) writeDataSource(src); pp.endElement("group");
assertEquals(1, config.getDataSources().size()); jndi = (JNDIDataSource) config.getDataSources().iterator().next(); assertEquals("select", jndi.getQuery()); assertEquals(3, jndi.getColumns().size()); assertTrue(config.getDataSources(1).isEmpty()); assertTrue(config.getDataSources(2).isEmpty()); assertEquals(config.getThreshold(), 0.85); assertEquals(config.getMaybeThreshold(), 0.7);
assertEquals(1, config.getDataSources().size()); csv = (CSVDataSource) config.getDataSources().iterator().next(); assertTrue(csv.getInputFile().endsWith("test.csv")); assertEquals(3, csv.getColumns().size()); assertTrue(config.getDataSources(1).isEmpty()); assertTrue(config.getDataSources(2).isEmpty()); assertEquals(config.getThreshold(), 0.85); assertEquals(config.getMaybeThreshold(), 0.7);
proc.addMatchListener(listener); if (cconfig.isDeduplicationMode()) proc.linkRecords(cconfig.getDataSources()); else proc.linkRecords(cconfig.getDataSources(2), false);
assertEquals(1, config.getDataSources().size()); jdbc = (JDBCDataSource) config.getDataSources().iterator().next(); assertEquals("klass", jdbc.getDriverClass()); assertEquals("konnection", jdbc.getConnectionString()); assertTrue(config.getDataSources(1).isEmpty()); assertTrue(config.getDataSources(2).isEmpty()); assertEquals(config.getThreshold(), 0.85); assertEquals(config.getMaybeThreshold(), 0.7);
processor.deduplicate(config.getDataSources(), batch_size); else { processor.linkRecords(config.getDataSources(2), matchall); } else processor.link(config.getDataSources(1), config.getDataSources(2), matchall, batch_size);
/**
 * Runs record linkage between the two configured groups of data
 * sources, using the default batch size. Records are only linked
 * across the groups; records within the same group are not linked
 * to each other.
 */
public void link() {
  this.link(
      this.config.getDataSources(1),
      this.config.getDataSources(2),
      DEFAULT_BATCH_SIZE);
}
/**
 * Deduplicates the records of all configured data sources using the
 * default batch size. Every available record is read and handled
 * batch by batch, and the listeners are notified throughout.
 */
public void deduplicate() {
  this.deduplicate(
      this.config.getDataSources(),
      DEFAULT_BATCH_SIZE);
}
/**
 * Deduplicates the records of all configured data sources. Every
 * available record is read and handled batch by batch, and the
 * listeners are notified throughout.
 *
 * @param batch_size the batch size handed on to the underlying
 *                   deduplication run
 */
public void deduplicate(int batch_size) {
  this.deduplicate(
      this.config.getDataSources(),
      batch_size);
}
/**
 * Announces the reindexing on standard output and then indexes the
 * configured data sources through a freshly created processor.
 * In deduplication mode the ungrouped data sources are indexed;
 * otherwise groups 1 and 2 are indexed one after the other.
 */
private static void reindex(Configuration config, Database database) {
  System.out.println("Reindexing all records...");
  Processor indexer = new Processor(config, database);

  if (!config.isDeduplicationMode()) {
    // two groups configured: index each of them in turn
    indexer.index(config.getDataSources(1), DEFAULT_BATCH_SIZE);
    indexer.index(config.getDataSources(2), DEFAULT_BATCH_SIZE);
    return;
  }

  indexer.index(config.getDataSources(), DEFAULT_BATCH_SIZE);
}
} // closes the enclosing scope, whose opening is outside this excerpt