/**
 * Deduplicates everything the configured data sources can deliver,
 * working through the records in batches and notifying the listeners
 * throughout.
 *
 * <p>Simply forwards to the two-argument overload, passing the data
 * sources obtained from {@code config.getDataSources()}.
 *
 * @param batch_size the batch size handed on to the two-argument overload
 */
public void deduplicate(int batch_size) {
  deduplicate(config.getDataSources(), batch_size);
}
/**
 * Deduplicates everything the configured data sources can deliver,
 * working through the records in batches and notifying the listeners
 * throughout.
 *
 * <p>Convenience form that forwards to the two-argument overload with
 * the data sources from {@code config.getDataSources()} and
 * {@code DEFAULT_BATCH_SIZE} as the batch size.
 */
public void deduplicate() {
  deduplicate(config.getDataSources(), DEFAULT_BATCH_SIZE);
}
/** * Runs the record linkage process. */ public void process() { // are we ready to process yet, or have we had an error, and are // waiting a bit longer in the hope that it will resolve itself? if (error_skips > 0) { error_skips--; return; } try { if (logger != null) logger.debug("Starting processing"); status = "Processing"; lastCheck = System.currentTimeMillis(); // FIXME: how to break off processing if we don't want to keep going? processor.deduplicate(batch_size); status = "Sleeping"; if (logger != null) logger.debug("Finished processing"); } catch (Throwable e) { status = "Thread blocked on error: " + e; if (logger != null) logger.error("Error in processing; waiting", e); error_skips = error_factor; } }
if (count % batch_size == 0) { srcread += (System.currentTimeMillis() - start); deduplicate(batch); it2.batchProcessed(); batch = new ArrayList(); deduplicate(batch); it2.batchProcessed();
/**
 * Feeds an empty record collection to the processor and expects the
 * listener to have seen neither matches nor records.
 */
@Test
public void testEmpty() throws IOException {
  // nothing goes in ...
  processor.deduplicate(new ArrayList());

  // ... so nothing may come out
  assertEquals(0, listener.getMatches().size());
  assertEquals(0, listener.getRecordCount());
}
/**
 * Two records that share the same NAME value ("A") but have different
 * IDs are processed; the listener is expected to report both records
 * and no matches.
 *
 * <p>NOTE(review): presumably the shared one-letter name is not enough
 * evidence to reach the match threshold - confirm against the test
 * configuration.
 */
@Test
public void testDoesNotMatchEnough() throws IOException {
  // parameterized instead of a raw ArrayList, to avoid an unchecked conversion
  Collection<Record> records = new ArrayList<Record>();
  records.add(TestUtils.makeRecord("ID", "1", "NAME", "A"));
  records.add(TestUtils.makeRecord("ID", "2", "NAME", "A"));

  processor.deduplicate(records);

  assertEquals(0, listener.getMatches().size());
  assertEquals(2, listener.getRecordCount());
}
/**
 * Two records with identical upper-case NAME and EMAIL values are
 * processed; the listener is expected to have seen two records and to
 * report two matches.
 *
 * <p>NOTE(review): the count of two presumably reflects one match per
 * direction (1-2 and 2-1) - confirm against the listener.
 */
@Test
public void testMatches2() throws IOException {
  // parameterized instead of a raw ArrayList, to avoid an unchecked conversion
  Collection<Record> records = new ArrayList<Record>();
  records.add(TestUtils.makeRecord("ID", "1", "NAME", "AAAAA", "EMAIL", "BBBBB"));
  records.add(TestUtils.makeRecord("ID", "2", "NAME", "AAAAA", "EMAIL", "BBBBB"));

  processor.deduplicate(records);

  assertEquals(2, listener.getRecordCount());
  Collection<TestUtils.Pair> matches = listener.getMatches();
  assertEquals(2, matches.size());
}
/**
 * Two records created without any property values are processed; the
 * listener is expected to have seen both records and to report no
 * matches.
 */
@Test
public void testNoProperties() throws IOException {
  // parameterized instead of a raw ArrayList, to avoid an unchecked conversion
  Collection<Record> records = new ArrayList<Record>();
  records.add(TestUtils.makeRecord());
  records.add(TestUtils.makeRecord());

  processor.deduplicate(records);

  assertEquals(0, listener.getMatches().size());
  assertEquals(2, listener.getRecordCount());
}
/**
 * Two records with different NAME values ("A" and "B") are processed;
 * the listener is expected to have seen both records and to report no
 * matches.
 */
@Test
public void testDoesNotMatch() throws IOException {
  // parameterized instead of a raw ArrayList, to avoid an unchecked conversion
  Collection<Record> records = new ArrayList<Record>();
  records.add(TestUtils.makeRecord("ID", "1", "NAME", "A"));
  records.add(TestUtils.makeRecord("ID", "2", "NAME", "B"));

  processor.deduplicate(records);

  assertEquals(0, listener.getMatches().size());
  assertEquals(2, listener.getRecordCount());
}
/**
 * Two records whose NAME values consist of two tokens and differ only
 * in the last character, with identical EMAIL values, are processed;
 * the listener is expected to have seen two records and to report two
 * matches.
 */
@Test
public void testMultiToken() throws IOException {
  // parameterized instead of a raw ArrayList, to avoid an unchecked conversion
  Collection<Record> records = new ArrayList<Record>();
  records.add(TestUtils.makeRecord("ID", "1", "NAME", "aaaaaaaaa aaaaa", "EMAIL", "bbbbb"));
  records.add(TestUtils.makeRecord("ID", "2", "NAME", "aaaaaaaaa aaaab", "EMAIL", "bbbbb"));

  processor.deduplicate(records);

  assertEquals(2, listener.getRecordCount());
  Collection<TestUtils.Pair> matches = listener.getMatches();
  assertEquals(2, matches.size());
}
@Test public void testIgnoreProperty() throws IOException { // make email an ignored property Property prop = config.getPropertyByName("EMAIL"); prop.setIgnoreProperty(true); // now run, and see that it doesn't match Collection<Record> records = new ArrayList(); records.add(TestUtils.makeRecord("ID", "1", "NAME", "aaaaa", "EMAIL", "BBBBB")); records.add(TestUtils.makeRecord("ID", "2", "NAME", "aaaaa", "EMAIL", "BBBBB")); processor.deduplicate(records); // no matches found assertEquals(0, listener.getMatches().size()); }
/**
 * Two records with identical NAME and EMAIL values are processed; the
 * listener is expected to have seen two records and to report two
 * matches.
 *
 * <p>NOTE(review): the count of two presumably reflects one match per
 * direction (1-2 and 2-1) - confirm against the listener.
 */
@Test
public void testMatches1() throws IOException {
  // parameterized instead of a raw ArrayList, to avoid an unchecked conversion
  Collection<Record> records = new ArrayList<Record>();
  records.add(TestUtils.makeRecord("ID", "1", "NAME", "aaaaa", "EMAIL", "BBBBB"));
  records.add(TestUtils.makeRecord("ID", "2", "NAME", "aaaaa", "EMAIL", "BBBBB"));

  processor.deduplicate(records);

  assertEquals(2, listener.getRecordCount());
  Collection<TestUtils.Pair> matches = listener.getMatches();
  assertEquals(2, matches.size());
}
/**
 * Two records whose NAME value is the literal string "AND", with
 * identical EMAIL values, are processed; the listener is expected to
 * have seen two records and to report two matches.
 *
 * <p>NOTE(review): judging by the test name, "AND" is chosen because it
 * is a Lucene query keyword and must not disturb the lookup - confirm
 * against the processor implementation.
 */
@Test
public void testLuceneKeyword() throws IOException {
  // parameterized instead of a raw ArrayList, to avoid an unchecked conversion
  Collection<Record> records = new ArrayList<Record>();
  records.add(TestUtils.makeRecord("ID", "1", "NAME", "AND", "EMAIL", "BBBBB"));
  records.add(TestUtils.makeRecord("ID", "2", "NAME", "AND", "EMAIL", "BBBBB"));

  processor.deduplicate(records);

  assertEquals(2, listener.getRecordCount());
  Collection<TestUtils.Pair> matches = listener.getMatches();
  assertEquals(2, matches.size());
}
@Test public void testNoComparator() throws IOException { // nulling out comparator config.getPropertyByName("EMAIL").setComparator(null); // now attempt to match Collection<Record> records = new ArrayList(); records.add(TestUtils.makeRecord("ID", "1", "NAME", "aaaaa", "EMAIL", "BBBBB")); records.add(TestUtils.makeRecord("ID", "2", "NAME", "aaaaa", "EMAIL", "BBBBB")); processor.deduplicate(records); // this shouldn't produce any matches, because we're not comparing email assertEquals(0, listener.getMatches().size()); assertEquals(2, listener.getRecordCount()); }
@Test public void testBatchRemainder() throws IOException { // used to have a bug where batch_size = 1000, and having >1000 records // but <2000 would leave the records >1000 unprocessed // set up data source Collection<Record> records = new ArrayList(); records.add(TestUtils.makeRecord("ID", "1", "NAME", "aaaaa", "EMAIL", "BBBBB")); records.add(TestUtils.makeRecord("ID", "2", "NAME", "aaaaa", "EMAIL", "BBBBB")); records.add(TestUtils.makeRecord("ID", "3", "NAME", "aaaaa", "EMAIL", "BBBBB")); TestDataSource source = new TestDataSource(records); config.addDataSource(0, source); // let's process! processor.deduplicate(2); // batch of 2, plus remaining one // so, what was the result? assertEquals("wrong number of matches", 4, listener.getMatches().size()); assertEquals("wrong number of records processed", 3, listener.getRecordCount()); assertEquals("wrong number of batches", 2, source.getBatchCount()); }
@Test public void testMaybe() throws IOException { // this corresponds to maybe-threshold not being set at all config.setMaybeThreshold(0.0); // now lets try some matching Collection<Record> records = new ArrayList(); records.add(TestUtils.makeRecord("ID", "1", "NAME", "aaaaaa", "EMAIL", "bbbbb")); records.add(TestUtils.makeRecord("ID", "2", "NAME", "bbbb", "EMAIL", "bbbbb")); processor.deduplicate(records); Collection<TestUtils.Pair> matches = listener.getMatches(); // for (TestUtils.Pair match : matches) // PrintMatchListener.show(match.r1, match.r2, match.conf, "MATCH"); assertEquals("wrong number of records processed", 2, listener.getRecordCount()); assertEquals("found matches, but shouldn't have", 0, matches.size()); assertEquals("found maybe matches, but shouldn't have", 0, listener.getMaybeCount()); assertEquals("wrong number of no-matches", 2, listener.getNoMatchCount()); }
processor.deduplicate(config.getDataSources(), batch_size); else {
/**
 * Deduplicates everything the configured data sources can deliver,
 * working through the records in batches and notifying the listeners
 * throughout.
 *
 * <p>Convenience form that forwards to the two-argument overload with
 * the data sources from {@code config.getDataSources()} and
 * {@code DEFAULT_BATCH_SIZE} as the batch size.
 */
public void deduplicate() {
  deduplicate(config.getDataSources(), DEFAULT_BATCH_SIZE);
}
/**
 * Deduplicates everything the configured data sources can deliver,
 * working through the records in batches and notifying the listeners
 * throughout.
 *
 * <p>Simply forwards to the two-argument overload, passing the data
 * sources obtained from {@code config.getDataSources()}.
 *
 * @param batch_size the batch size handed on to the two-argument overload
 */
public void deduplicate(int batch_size) {
  deduplicate(config.getDataSources(), batch_size);
}
/** * Runs the record linkage process. */ public void process() { // are we ready to process yet, or have we had an error, and are // waiting a bit longer in the hope that it will resolve itself? if (error_skips > 0) { error_skips--; return; } try { if (logger != null) logger.debug("Starting processing"); status = "Processing"; lastCheck = System.currentTimeMillis(); // FIXME: how to break off processing if we don't want to keep going? processor.deduplicate(batch_size); status = "Sleeping"; if (logger != null) logger.debug("Finished processing"); } catch (Throwable e) { status = "Thread blocked on error: " + e; if (logger != null) logger.error("Error in processing; waiting", e); error_skips = error_factor; } }