public FetchedData fetch(Resource resource) throws Exception { LOG.info("DEFAULT FETCHER {}", resource.getUrl()); URLConnection urlConn = new URL(resource.getUrl()).openConnection(); if (httpHeaders != null){ httpHeaders.forEach(urlConn::setRequestProperty); urlConn.setReadTimeout(READ_TIMEOUT); int responseCode = ((HttpURLConnection)urlConn).getResponseCode(); LOG.debug("STATUS CODE : " + responseCode + " " + resource.getUrl()); boolean truncated = false; try (InputStream inStream = urlConn.getInputStream()) { if (bufferOutStream.size() >= CONTENT_LIMIT) { truncated = true; LOG.info("Content Truncated: {}, TotalSize={}, TruncatedSize={}", resource.getUrl(), urlConn.getContentLength(), bufferOutStream.size()); break;
@Override public FetchedData fetch(Resource resource) throws Exception { LOG.info("JBrowser FETCHER {}", resource.getUrl()); FetchedData fetchedData; if (!isWebPage(resource.getUrl())) { LOG.debug("{} not a html. Falling back to default fetcher.", resource.getUrl()); driver.get(resource.getUrl()); LOG.debug("Time taken to load {} - {} ", resource.getUrl(), (System.currentTimeMillis() - start)); LOG.info("{} Failed to fetch the page. Falling back to default fetcher.", resource.getUrl()); return super.fetch(resource);
/**
 * Fetches the given resource and converts any failure into an error result
 * instead of propagating the exception.
 *
 * <p>Delegates to {@code this.fetch(resource)}. If that call throws any
 * {@link Exception}, the failure is logged (URL at WARN, exception message
 * and stack trace at DEBUG), the resource's status is set to
 * {@code ResourceStatus.ERROR.toString()}, and a {@code FetchedData} built
 * from an empty byte array, an empty string and an error status code is
 * returned with the resource attached.
 *
 * <p>The status code is {@code 404} when the exception is a
 * {@link FileNotFoundException}; otherwise it is {@code DEFAULT_ERROR_CODE}.
 *
 * @param resource the resource to fetch; its status is mutated on failure
 * @return the result of {@code fetch(resource)}, or an empty error result
 *         carrying the resource when the fetch threw
 */
@Override
public FetchedData apply(Resource resource) {
    try {
        return this.fetch(resource);
    } catch (Exception failure) {
        // A missing target is reported as 404; every other failure uses the default code.
        final int errorStatus = (failure instanceof FileNotFoundException)
                ? 404
                : DEFAULT_ERROR_CODE;

        LOG.warn("FETCH-ERROR {}", resource.getUrl());
        LOG.debug(failure.getMessage(), failure);

        final FetchedData errorResult = new FetchedData(new byte[0], "", errorStatus);
        resource.setStatus(ResourceStatus.ERROR.toString());
        errorResult.setResource(resource);
        return errorResult;
    }
}
} // closes the enclosing class, whose header lies outside this excerpt
@Override public FetchedData fetch(Resource resource) throws Exception { LOG.info("HtmlUnit FETCHER {}", resource.getUrl()); FetchedData fetchedData; try { driver.addRequestHeader(USER_AGENT, userAgent); Page page = driver.getPage(resource.getUrl()); truncated = (contentLength > fetchedData.getContentLength()); if (truncated) { LOG.info("Content Truncated: {}, TotalSize={}", resource.getUrl(), contentLength);