/**
 * Prepares the wrapped {@code processorBolt}, handing it an
 * {@link EventLoggingOutputCollector} instead of the raw collector.
 *
 * @param map configuration passed through unchanged to the wrapped bolt
 * @param topologyContext context passed both to the logging collector and to the wrapped bolt
 * @param outputCollector collector that the logging collector is built around
 */
@Override
public void prepare(Map map, TopologyContext topologyContext, OutputCollector outputCollector) {
    // The logging collector is built from the context, the original collector
    // and the event logger obtained for eventLogFilePath.
    processorBolt.prepare(
            map,
            topologyContext,
            new EventLoggingOutputCollector(
                    topologyContext,
                    outputCollector,
                    TestRunEventLogger.getEventLogger(eventLogFilePath)));
}
/**
 * Registers the parse filter configuration file in the given configuration
 * and prepares the bolt under test with it.
 *
 * @param configFile value stored under the {@code parsefilters.config.file} key
 * @param parserConfig configuration map; it is modified in place before being passed to the bolt
 */
protected void prepareParserBolt(String configFile, Map parserConfig) {
    final String parseFiltersKey = "parsefilters.config.file";
    parserConfig.put(parseFiltersKey, configFile);

    bolt.prepare(
            parserConfig,
            TestUtil.getMockedTopologyContext(),
            new OutputCollector(output));
}
/**
 * Parses {@code guardian.rss} with {@code feed.sniffContent} enabled and an
 * empty {@link Metadata} (no content type set), then validates the result
 * through {@code checkOutput()}.
 */
@Test
public void testFeedParsingDetextBytes() throws IOException {
    Map conf = new HashMap();
    conf.put("feed.sniffContent", Boolean.TRUE);
    conf.put("parsefilters.config.file", "test.parsefilters.json");

    bolt.prepare(
            conf,
            TestUtil.getMockedTopologyContext(),
            new OutputCollector(output));

    // no mime-type is provided in the metadata
    parse("http://www.guardian.com/feed.xml", "guardian.rss", new Metadata());

    checkOutput();
}
@Test /** * Checks that content in script is not included in the text representation **/ public void testNoScriptInText() throws IOException { bolt.prepare(new HashMap(), TestUtil.getMockedTopologyContext(), new OutputCollector(output)); parse("http://www.digitalpebble.com", "digitalpebble.com.html"); List<Object> parsedTuple = output.getEmitted().remove(0); // check in the metadata that the values match String text = (String) parsedTuple.get(3); Assert.assertFalse( "Text should not contain the content of script tags", text.contains("urchinTracker")); }
@Test public void testExecuteWithOutlinksLimitDisabled() throws IOException { stormConf.put("parser.emitOutlinks.max.per.page", -1); bolt.prepare(stormConf, TestUtil.getMockedTopologyContext(), new OutputCollector(output)); parse("http://www.digitalpebble.com", "digitalpebble.com.html"); List<List<Object>> statusTuples = output .getEmitted(Constants.StatusStreamName); // outlinks NOT being limited by property, since is disabled with -1 Assert.assertEquals(10, statusTuples.size()); }
@Test public void testExecuteWithOutlinksLimit() throws IOException { stormConf.put("parser.emitOutlinks.max.per.page", 5); bolt.prepare(stormConf, TestUtil.getMockedTopologyContext(), new OutputCollector(output)); parse("http://www.digitalpebble.com", "digitalpebble.com.html"); List<List<Object>> statusTuples = output .getEmitted(Constants.StatusStreamName); // outlinks being limited by property Assert.assertEquals(5, statusTuples.size()); }
@Test public void testSitemapSubdocuments() throws IOException { Map config = new HashMap(); // generate a dummy config file config.put("urlfilters.config.file", "basicurlnormalizer.json"); bolt.prepare(config, TestUtil.getMockedTopologyContext(), new OutputCollector(output)); Metadata metadata = new Metadata(); parse("http://www.digitalpebble.com/duplicates.html", "duplicateLinks.html", metadata); Assert.assertEquals(1, output.getEmitted(Constants.StatusStreamName) .size()); }
/**
 * Checks that individual links marked as rel="nofollow" are not followed:
 * parsing {@code digitalpebble.com.html} with an empty configuration is
 * expected to put 10 tuples on the status stream.
 */
@Test
public void testNoFollowOutlinks() throws IOException {
    bolt.prepare(
            new HashMap(),
            TestUtil.getMockedTopologyContext(),
            new OutputCollector(output));

    parse("http://www.digitalpebble.com", "digitalpebble.com.html");

    int emittedOnStatusStream = output.getEmitted(Constants.StatusStreamName).size();
    Assert.assertEquals(10, emittedOnStatusStream);
}
@Test public void testHTMLRedir() throws IOException { bolt.prepare(new HashMap(), TestUtil.getMockedTopologyContext(), new OutputCollector(output)); parse("http://www.somesite.com", "redir.html"); List<List<Object>> statusTuples = output .getEmitted(Constants.StatusStreamName); // one for the redir + one for the discovered Assert.assertEquals(2, statusTuples.size()); }
@Test public void testFeedParsingNoMT() throws IOException { Map parserConfig = new HashMap(); parserConfig.put("feed.sniffContent", true); parserConfig.put("parsefilters.config.file", "test.parsefilters.json"); bolt.prepare(parserConfig, TestUtil.getMockedTopologyContext(), new OutputCollector(output)); Metadata metadata = new Metadata(); // set mime-type metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/rss+xml"); parse("http://www.guardian.com/feed.xml", "guardian.rss", metadata); checkOutput(); }
@Test public void testDodgyURL() throws IOException { TestOutputCollector output = new TestOutputCollector(); Map config = new HashMap(); config.put("http.agent.name", "this is only a test"); bolt.prepare(config, TestUtil.getMockedTopologyContext(), new OutputCollector(output)); Tuple tuple = mock(Tuple.class); when(tuple.getSourceComponent()).thenReturn("source"); when(tuple.getStringByField("url")).thenReturn("ahahaha"); when(tuple.getValueByField("metadata")).thenReturn(null); bolt.execute(tuple); boolean acked = output.getAckedTuples().contains(tuple); boolean failed = output.getAckedTuples().contains(tuple); // should be acked or failed Assert.assertEquals(true, acked || failed); List<List<Object>> statusTuples = output .getEmitted(Constants.StatusStreamName); // we should get one tuple on the status stream // to notify that the URL is an error Assert.assertEquals(1, statusTuples.size()); }
@Test public void testSitemapParsingNoMT() throws IOException { Map parserConfig = new HashMap(); parserConfig.put("sitemap.sniffContent", true); parserConfig.put("parsefilters.config.file", "test.parsefilters.json"); bolt.prepare(parserConfig, TestUtil.getMockedTopologyContext(), new OutputCollector(output)); Metadata metadata = new Metadata(); // do not specify that it is a sitemap file // do not set the mimetype parse("http://www.digitalpebble.com/sitemap.xml", "digitalpebble.sitemap.xml", metadata); Assert.assertEquals(6, output.getEmitted(Constants.StatusStreamName) .size()); // TODO test that the new links have the right metadata List<Object> fields = output.getEmitted(Constants.StatusStreamName) .get(0); Assert.assertEquals(3, fields.size()); }
@Test public void testRobotsMetaProcessor() throws IOException { bolt.prepare(new HashMap(), TestUtil.getMockedTopologyContext(), new OutputCollector(output)); for (int i = 0; i < tests.length; i++) { byte[] bytes = tests[i].getBytes(); parse("http://www.digitalpebble.com", bytes, new Metadata()); Assert.assertEquals(1, output.getEmitted().size()); List<Object> parsedTuple = output.getEmitted().remove(0); // check in the metadata that the values match Metadata metadata = (Metadata) parsedTuple.get(2); Assert.assertNotNull(metadata); boolean isNoIndex = Boolean.parseBoolean(metadata .getFirstValue(RobotsTags.ROBOTS_NO_INDEX)); boolean isNoFollow = Boolean.parseBoolean(metadata .getFirstValue(RobotsTags.ROBOTS_NO_FOLLOW)); boolean isNoCache = Boolean.parseBoolean(metadata .getFirstValue(RobotsTags.ROBOTS_NO_CACHE)); Assert.assertEquals("incorrect noIndex value on doc " + i, answers[i][0], isNoIndex); Assert.assertEquals("incorrect noFollow value on doc " + i, answers[i][1], isNoFollow); Assert.assertEquals("incorrect noCache value on doc " + i, answers[i][2], isNoCache); } }
// Set the HTTP agent name in the configuration, then prepare the bolt with it, the context returned by TestUtil.getMockedTopologyContext() and a collector wrapping `output`.
config.put("http.agent.name", "this is only a test"); bolt.prepare(config, TestUtil.getMockedTopologyContext(), new OutputCollector(output));
@Test public void testHTTPRobots() throws IOException { bolt.prepare(new HashMap(), TestUtil.getMockedTopologyContext(), new OutputCollector(output)); Metadata metadata = new Metadata(); metadata.setValues("X-Robots-Tag", new String[] { "noindex", "nofollow" }); parse("http://www.digitalpebble.com", "digitalpebble.com.html", metadata); List<List<Object>> statusTuples = output .getEmitted(Constants.StatusStreamName); // no outlinks at all Assert.assertEquals(0, statusTuples.size()); Assert.assertEquals(1, output.getEmitted().size()); List<Object> parsedTuple = output.getEmitted().remove(0); // check in the metadata that the values match metadata = (Metadata) parsedTuple.get(2); Assert.assertNotNull(metadata); boolean isNoIndex = Boolean.parseBoolean(metadata .getFirstValue(RobotsTags.ROBOTS_NO_INDEX)); boolean isNoFollow = Boolean.parseBoolean(metadata .getFirstValue(RobotsTags.ROBOTS_NO_FOLLOW)); boolean isNoCache = Boolean.parseBoolean(metadata .getFirstValue(RobotsTags.ROBOTS_NO_CACHE)); Assert.assertEquals("incorrect noIndex", true, isNoIndex); Assert.assertEquals("incorrect noFollow", true, isNoFollow); Assert.assertEquals("incorrect noCache", false, isNoCache); }