// Fragment: walks every record in an ArchiveReader, converting WARC records to
// ARC-style output via 'writer'.
// NOTE(review): this excerpt is truncated — the braces opened here never close in
// the visible text, and the statements after 'continue' are unreachable as written
// (a closing '}' is presumably missing right after the 'continue').
for (final Iterator<ArchiveRecord> i = reader.iterator(); i.hasNext();) {
    WARCRecord r = (WARCRecord)i.next(); // assumes the reader yields only WARCRecords — TODO confirm
    if (!isARCType(r.getHeader().getMimetype())) {
        continue;
        if (r.getHeader().getContentBegin() <= 0) {
            // NOTE(review): return value discarded — looks like a truncated
            // assignment (the 'ip' used below is never set in this excerpt).
            getHeaderValue((WARCConstants.HEADER_KEY_IP));
            long length = r.getHeader().getLength();
            int offset = r.getHeader().getContentBegin();
            String mimetype = r.getHeader().getMimetype();
            // Strip '-', 'T', ':', 'Z' from the ISO-8601 WARC date to get a digits-only timestamp.
            String t = r.getHeader().getDate().replaceAll("[-T:Z]", "");
            long time = ArchiveUtils.getSecondsSinceEpoch(t).getTime();
            // Write the payload (total length minus content offset) to the ARC writer.
            writer.write(r.getHeader().getUrl(), mimetype, ip, time, (int)(length - offset), r);
// Fragment: builds an MDX summary document from an ArchiveRecordHeader.
// NOTE(review): excerpt is truncated mid-call ('mdx.setHash("" + header.getHeaderValue(')
// and several opened '{' never close in the visible text. The 'else' branch below
// appears to be missing its '}' — setUrl/setHash/put are presumably unconditional
// in the original source.
log.debug("Processing @" + header.getOffset() + "+" + record.available() + ","
        + header.getLength() + ": " + header.getUrl());
for (String h : header.getHeaderFields().keySet()) {
    log.debug( "ArchiveHeader: " + h + " -> " + header.getHeaderValue(h));
    MDX mdx = new MDX();
    // Parse the 14-digit/ISO date; returns null on failure (second arg is the default).
    Date crawl_date = ArchiveUtils .parse14DigitISODate(header.getDate(), null);
    if (crawl_date != null) {
        mdx.setTs(ArchiveUtils.get14DigitDate(crawl_date));
    } else {
        // Fall back to the raw header date string when parsing fails.
        mdx.setTs(header.getDate());
        mdx.setUrl(header.getUrl());
        mdx.setHash(header.getDigest());
        mdx.put("content-type", header.getMimetype());
        mdx.put("content-length", "" + header.getContentLength());
        mdx.put("length", "" + header.getLength());
        mdx.put("source-offset", "" + header.getOffset());
        mdx.put("record-identifier", header.getRecordIdentifier());
        // Copy every raw header field through under a "HEADER-" prefix.
        for (String k : header.getHeaderFieldKeys()) {
            mdx.put("HEADER-" + k, "" + header.getHeaderValue(k));
            if (record instanceof WARCRecord) {
                // WARC records get a "warc.<type>" record type and the payload digest as hash.
                mdx.setRecordType( "warc." + header.getHeaderValue(HEADER_KEY_TYPE));
                mdx.setHash("" + header.getHeaderValue(
// Fragment: Hadoop map-side statistics collector — emits (category, value) Text
// pairs describing each (W)ARC record (record type, content type, crawl year, host).
// NOTE(review): excerpt is truncated ('UsableURIFactory.getInstance(uri);' is the
// last visible statement) and the opened '{' never close in the visible text.
log.debug("Processing @"+header.getOffset()+ "+"+record.available()+","+header.getLength()+ ": "+header.getUrl());
for( String h : header.getHeaderFields().keySet()) {
    log.debug("ArchiveHeader: "+h+" -> "+header.getHeaderValue(h));
    // Emit the raw WARC record type for every record.
    output.collect( new Text("record-type"), new Text("WARC-RECORD-TYPE\t"+header.getHeaderValue( HEADER_KEY_TYPE )));
    if( record instanceof WARCRecord ) {
        output.collect( new Text("record-type"), new Text("RECORD-TYPE-WARC") );
        output.collect( new Text("content-types"), new Text("CONTENT-TYPE\t"+header.getMimetype()) );
        String date = header.getDate();
        // The first four characters of the (14-digit) date are the crawl year.
        if( date != null && date.length() > 4 ) {
            output.collect( new Text("content-types"), new Text("YEAR\t"+date.substring(0,4)) );
            String uri = header.getUrl();
            if( uri != null ){
                UsableURI uuri = UsableURIFactory.getInstance(uri);
// Fragment: populates a Solr document with record-level fields from a WARC header.
// NOTE(review): excerpt is truncated; the code after each 'return null;' must
// belong after closing '}' that are missing from the visible text.
if( !header.getHeaderFields().isEmpty() ) {
    if( header.getHeaderFieldKeys().contains( HEADER_KEY_TYPE ) ) {
        log.debug("Looking at " + header.getHeaderValue(HEADER_KEY_TYPE));
        // Skip record types this indexer is not configured to process.
        if( !checkRecordType( ( String ) header.getHeaderValue( HEADER_KEY_TYPE ) ) ) {
            return null;
            solr.setField(SolrFields.SOLR_RECORD_TYPE, (String) header.getHeaderValue(HEADER_KEY_TYPE));
            solr.setField(SolrFields.WARC_KEY_ID, (String) header.getHeaderValue(HEADER_KEY_ID));
            solr.setField(SolrFields.WARC_IP, (String) header.getHeaderValue(HEADER_KEY_IP));
            // Records without a target URL cannot be indexed.
            if( header.getUrl() == null ) return null;
            String targetUrl = Normalisation.sanitiseWARCHeaderValue(header.getUrl());
            solr.setField(SolrFields.SOURCE_FILE_OFFSET,"" + header.getOffset());
            String filePath = header.getReaderIdentifier();//Full path of file
            // NOTE(review): orphaned continuation — the receiver of this .digest(...)
            // call is outside the excerpt (presumably a MessageDigest builder chain).
            .digest(Normalisation.sanitiseWARCHeaderValue(header.getUrl()).getBytes("UTF-8"));
            solr.setField(SolrFields.SOLR_URL, Normalisation.sanitiseWARCHeaderValue(header.getUrl()));
            if (addNormalisedURL) {
                solr.setField( SolrFields.SOLR_URL_NORMALISED, Normalisation.canonicaliseURL(targetUrl) );
                long content_length = header.getLength();
// Fragment: builds a key of the form "<digits-only-date>/<url>" from the header.
// NOTE(review): excerpt truncated — the opened '{' never closes in the visible text.
if( !header.getHeaderFields().isEmpty() ) {
    // Strip all non-digits from the WARC date (ISO-8601 -> 14-digit form).
    newKey = (header.getDate().replaceAll("[^0-9]", "")) + "/" + header.getUrl();
protected String outputCdx(final String strippedFileName) throws IOException { // Read the whole record so we get out a hash. Should be safe calling // close on already closed Record. close(); ArchiveRecordHeader h = getHeader(); StringBuilder buffer = new StringBuilder(ArchiveFileConstants.CDX_LINE_BUFFER_SIZE); buffer.append(h.getDate()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getIp4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getUrl()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getMimetype4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getStatusCode4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getDigest4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getOffset()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getLength()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(strippedFileName != null? strippedFileName: '-'); return buffer.toString(); }
// Fragment: populates a capture/search result from a WARC record header.
// NOTE(review): excerpt truncated — the opened '{' never close in the visible text.
String file = transformWARCFilename(header.getReaderIdentifier());
long offset = header.getOffset();
result.setCaptureTimestamp(transformWARCDate(header.getDate()));
result.setFile(file);
result.setOffset(offset);
result.setDigest(transformWARCDigest(header.getHeaderValue( WARCRecord.HEADER_KEY_PAYLOAD_DIGEST)));
String origUrl = header.getUrl();
if(origUrl == null) {
    // Records with no target URI: a warcinfo record gets the synthetic
    // "filedesc:<filename>" URL, matching the legacy ARC convention.
    String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
    // FIX: 'type' is a String but was compared against the WARCRecordType enum
    // constant itself; String.equals(non-String) is always false, so the
    // warcinfo branch could never run. Compare with the enum's string form,
    // as done elsewhere in this code base ('WARCRecordType.response.toString()').
    if(type.equals(WARCConstants.WARCRecordType.warcinfo.toString())) {
        String filename = header.getHeaderValue( WARCConstants.HEADER_KEY_FILENAME).toString();
        result.setOriginalUrl("filedesc:"+filename);
public SolrRecord(int defaultMaxFieldLength, HashMap<String, Integer> maxFieldLengths, String filename, ArchiveRecordHeader header) { defaultMax = defaultMaxFieldLength; maxLengths = maxFieldLengths; setField(SolrFields.ID, "exception-at-" + filename + "@" + header.getOffset()); setField(SolrFields.SOURCE_FILE, filename); setField(SolrFields.SOURCE_FILE_OFFSET, "" + header.getOffset()); setField(SolrFields.SOLR_URL, Normalisation.sanitiseWARCHeaderValue(header.getUrl())); setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_UNKNOWN); }
// Fragment: filters WARC records down to indexable HTTP responses, skipping
// DNS captures and oversized records.
// NOTE(review): excerpt truncated — the statements after each 'continue' are
// unreachable as written (closing '}' missing from the visible text), and the
// LOG.error("Invalid URL"...) presumably belongs to a catch/else branch not shown.
if (!h.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).equals("response")) {
    continue;
    if (h.getUrl().startsWith("dns:")) { // DNS capture records carry no HTTP payload
        invalidUrls++;
        continue;
        Date d = iso8601.parse(h.getDate());
        String date = ArchiveUtils.get14DigitDate(d);
        String key = UrlUtils.urlToKey(h.getUrl());
        String type = WarcRecordUtils.getWarcResponseMimeType(content);
        LOG.error("Invalid URL: " + h.getUrl());
        invalidUrls++;
        continue;
        // NOTE(review): narrowing the long length to int can overflow for records >2GB.
        if ((int) h.getLength() > MAX_CONTENT_SIZE) {
            toolarge++;
        } else {
/**
 * Hadoop mapper: counts every record and emits {@code (url, VALUE)} for each
 * WARC "response" record whose URL is an http:// URL.
 *
 * @param key     input offset key (unused)
 * @param record  the wrapped WARC record
 * @param context Hadoop task context used for counters and output
 * @throws IOException          on write failure
 * @throws InterruptedException if the task is interrupted
 */
public void map(LongWritable key, WarcRecordWritable record, Context context)
        throws IOException, InterruptedException {
    context.getCounter(Records.TOTAL).increment(1);
    ArchiveRecordHeader header = record.getRecord().getHeader();
    // FIX: the original returned when the record WAS a "response", i.e. it
    // skipped exactly the records this mapper is meant to emit — the inverse
    // of the '!...equals("response")' guard used by the sibling mappers.
    // Constant-first equals also avoids an NPE when the header is absent.
    if (!"response".equals(header.getHeaderValue("WARC-Type"))) {
        return;
    }
    String url = header.getUrl();
    if ((url != null) && url.startsWith("http://")) {
        KEY.set(url);
        context.write(KEY, VALUE);
    }
}
}
// Fragment: map() body that runs the WARCIndexer over a record, counting null
// results and recording parse errors against the Solr document.
// NOTE(review): excerpt truncated — the LOG.debug/incrCounter/return sequence
// after extract() presumably sits inside an 'if (solr == null)' branch, and the
// first LOG.error inside a catch block, whose openings are not visible here.
if (!header.getHeaderFields().isEmpty()) {
    solr = indexer.extract(key.toString(), value.getRecord());
    LOG.debug("WARCIndexer returned NULL for: " + header.getUrl());
    reporter.incrCounter(MyCounters.NUM_NULLS, 1);
    return;
    LOG.error(e.getClass().getName() + ": " + e.getMessage() + "; " + header.getUrl() + "; " + header.getOffset());
    reporter.incrCounter(MyCounters.NUM_ERRORS, 1);
    solr.addParseException(e);
} catch (OutOfMemoryError e) {
    // Record OOM as an indexing error for this record rather than killing the task.
    LOG.error("OOME " + e.getClass().getName() + ": " + e.getMessage() + "; " + header.getUrl() + "; " + header.getOffset());
    reporter.incrCounter(MyCounters.NUM_ERRORS, 1);
    solr.addParseException(e);
// Fragment: builds a "<crawl-year>\t<host>" output key for WARC response records.
// NOTE(review): excerpt truncated — the matcher construction and several braces
// are outside the visible text; the code after 'return;' is unreachable as
// written (a '}' is presumably missing before it).
header = value.getRecord().getHeader();
// Only WARC 'response' records are counted when a type header is present.
if( header.getHeaderFieldKeys().contains( HEADER_KEY_TYPE ) && !header.getHeaderValue( HEADER_KEY_TYPE ).equals( WARCRecordType.response.toString() ) ) {
    return;
    resourceUrl = Normalisation.sanitiseWARCHeaderValue(value.getRecord().getHeader().getUrl());
    if( matcher.matches() ) {
        resourceHost = matcher.group( 2 ); // assumes group 2 of the URL regex is the host — TODO confirm
        // First four characters of the 14-digit date are the crawl year.
        year = value.getRecord().getHeader().getDate().substring( 0, 4 );
        outputKey = new Text( year + "\t" + resourceHost );
// Fragment: parses the HTTP status line and headers out of an ARC record body,
// accumulating structured errors instead of throwing.
// NOTE(review): excerpt truncated — the enclosing try block and several '}' are
// not visible; statement nesting below reflects the excerpt, not the original.
String url = getHeader().getUrl();
// Only http(s) records with enough bytes for a header are worth parsing.
if(!url.startsWith("http") || getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) {
    return null;
    this.httpStatus = new StatusLine(statusLine);
} catch(IOException e) {
    // Malformed status line: record the error and carry on.
    logger.warning(e.getMessage() + " at offset: " + h.getOffset());
    this.errors.add(ArcRecordErrors.HTTP_STATUS_LINE_EXCEPTION);
    if (getIn().available() == 0) {
        // Stream exhausted before the header terminator: mark truncated and end-of-record.
        httpHeaderBytesRead += statusBytes.length;
        logger.warning("HTTP header truncated at offset: " + h.getOffset());
        this.errors.add(ArcRecordErrors.HTTP_HEADER_TRUNCATED);
        this.setEor(true);
// Fragment: hashes the record payload with SHA-1 and compares it against the
// WARC-Payload-Digest header for response records.
// NOTE(review): excerpt truncated; the final log.error passes 'i' as the
// Throwable — presumably an exception variable declared outside the visible
// text. The digest-comparison code also appears to sit inside the
// NoSuchAlgorithmException catch block only because '}' are missing here.
url = Normalisation.sanitiseWARCHeaderValue(header.getUrl());
try {
    digest = MessageDigest.getInstance( MessageDigestAlgorithms.SHA_1);
} catch (NoSuchAlgorithmException e) {
    log.error( "Hashing: " + url + "@" + header.getOffset(), e );
    if( header.getHeaderFieldKeys().contains( HEADER_KEY_PAYLOAD_DIGEST ) ) {
        headerHash = ( String ) header.getHeaderValue( HEADER_KEY_PAYLOAD_DIGEST );
        // Only response records are expected to carry a comparable payload digest.
        if( header.getHeaderFieldKeys().contains( HEADER_KEY_TYPE ) && header.getHeaderValue( HEADER_KEY_TYPE ).equals(WARCConstants.WARCRecordType.response.toString()) ) {
            // Mismatch between the computed hash and the declared digest is logged as an error.
            if( ! headerHash.equals(hash)) {
                log.error( "Hashing: " + url + "@" + header.getOffset(), i );
// Fragment: round-trips a WARC record through a byte array to extract its
// payload content and response mimetype, then parses the record date.
// NOTE(review): excerpt truncated — the try/catch structure around the
// OutOfMemoryError handler is only partially visible.
recordBytes = WarcRecordUtils.toBytes(record);
content = WarcRecordUtils.getContent(WarcRecordUtils.fromBytes(recordBytes));
url = header.getUrl();
type = WarcRecordUtils.getWarcResponseMimeType(content);
} catch (java.lang.OutOfMemoryError e) {
Date date = null;
try {
    // WARC dates are ISO-8601; a parse failure leaves 'date' null.
    date = iso8601.parse(header.getDate());
} catch (java.text.ParseException e) {
    // NOTE(review): swallowed parse failure printed to stderr — consider
    // proper logging and explicitly skipping the record instead.
    e.printStackTrace();
// Fragment: keeps only WARC "response" records, then extracts content, mimetype,
// and a 14-digit crawl date.
// NOTE(review): excerpt truncated — the statements after 'return;' are
// unreachable as written (missing '}' in the visible text). Also note
// getHeaderValue("WARC-Type") would NPE if the header is absent; a
// constant-first '"response".equals(...)' would be safer.
if (!header.getHeaderValue("WARC-Type").equals("response")) {
    return;
    String url = header.getUrl();
    content = WarcRecordUtils.getContent(record.getRecord());
    type = WarcRecordUtils.getWarcResponseMimeType(content);
    d = ISO8601.parse(header.getDate());
    date = ArchiveUtils.get14DigitDate(d);
} catch (OutOfMemoryError e) {
/**
 * Reports whether this record's body is likely to begin with content headers
 * (e.g. an HTTP response's status line and headers, or an HTTP request's
 * request headers). Note these in-content headers are a different thing from
 * the {@link ArchiveRecordHeader} 'headers' of the record itself.
 *
 * @return true when the record has an http(s) URL and is long enough to
 *         plausibly contain an HTTP header block
 */
public boolean hasContentHeaders() {
    final String recordUrl = getHeader().getUrl();
    // Must have a URL, it must be an http(s) scheme, and the record must be
    // larger than the minimum size of an HTTP header block.
    return recordUrl != null
            && recordUrl.toLowerCase().startsWith("http")
            && getHeader().getLength() > MIN_HTTP_HEADER_LENGTH;
}
// Debug/demo output: print the record's URL, then a blank separator line.
System.out.println(r.getHeader().getUrl());
System.out.println();
/**
 * Tells whether this Resource is a {@code server-not-modified} revisit record.
 * (Formerly {@code AccessPoint#isWarcRevisitNotModified(Resource)}; kept off the
 * {@code Resource} interface because no other implementation needed it.)
 *
 * @return {@code true} if the WARC-Profile header marks a not-modified revisit
 */
public boolean isRevisitNotModified() {
    // The revisit profile is carried in the record's WARC-Profile header field.
    Map<String, Object> fields = getWarcHeaders().getHeaderFields();
    String profile = (String) fields.get("WARC-Profile");
    return PROFILE_REVISIT_SERVER_NOT_MODIFIED.equals(profile);
}
}
// Fragment: logs the record URL and remaining byte count, then selects HTTP
// response records by their WARC content type.
// NOTE(review): excerpt truncated — the 'if' body is not visible. The exact
// string match on "application/http; msgtype=response" will miss headers with
// different whitespace/parameter order, and getMimetype() returning null would NPE.
LOG.debug(r.getHeader().getUrl() + " -- " + r.available());
if (r.getHeader().getMimetype().equals("application/http; msgtype=response")) {