getHeaderValue((WARCConstants.HEADER_KEY_IP)); long length = r.getHeader().getLength(); int offset = r.getHeader().getContentBegin();
.getHeaderValue((ARCConstants.IP_HEADER_FIELD_KEY)); if (ip != null && ip.length() > 0) { ar.addLabelValue(WARCConstants.NAMED_FIELD_IP_LABEL, ip);
log.error( "ProtocolException [" + statusCode + "]: " + header.getHeaderValue( WARCConstants.HEADER_KEY_FILENAME) + "@" + header.getHeaderValue( WARCConstants.ABSOLUTE_OFFSET_KEY), p); + header.getHeaderValue(WARCConstants.HEADER_KEY_FILENAME) + "@" + header.getHeaderValue(WARCConstants.ABSOLUTE_OFFSET_KEY));
/**
 * Emits the URL of each {@code response} record whose URL starts with {@code "http://"}.
 *
 * <p>Every call increments the {@code Records.TOTAL} counter first. A record is then skipped
 * when its {@code WARC-Type} header is missing or is anything other than {@code "response"},
 * or when its URL is {@code null} or does not start with {@code "http://"}.
 *
 * @param key input key; not read by this method
 * @param record wrapper whose archive record header is inspected
 * @param context supplies the counter and receives the {@code (KEY, VALUE)} pair
 * @throws IOException propagated from {@code context.write}
 * @throws InterruptedException propagated from {@code context.write}
 */
public void map(LongWritable key, WarcRecordWritable record, Context context)
    throws IOException, InterruptedException {
  context.getCounter(Records.TOTAL).increment(1);

  ArchiveRecordHeader header = record.getRecord().getHeader();

  // Only "response" records are of interest. The constant-first equals() also means a record
  // with no WARC-Type header is skipped instead of raising a NullPointerException.
  if (!"response".equals(header.getHeaderValue("WARC-Type"))) {
    return;
  }

  String url = header.getUrl();
  if (url != null && url.startsWith("http://")) {
    KEY.set(url);
    context.write(KEY, VALUE);
  }
}
} // closes the enclosing class, whose header is outside this view
": "+header.getUrl()); for( String h : header.getHeaderFields().keySet()) { log.debug("ArchiveHeader: "+h+" -> "+header.getHeaderValue(h)); output.collect( new Text("record-type"), new Text("WARC-RECORD-TYPE\t"+header.getHeaderValue( HEADER_KEY_TYPE ))); if( record instanceof WARCRecord ) { output.collect( new Text("record-type"), new Text("RECORD-TYPE-WARC") );
!header.getHeaderValue( HEADER_KEY_TYPE ).equals( WARCRecordType.response.toString() ) ) { return;
": "+header.getUrl()); for( String h : header.getHeaderFields().keySet()) { log.debug("ArchiveHeader: "+h+" -> "+header.getHeaderValue(h)); output.collect( new Text("record-type"), new Text("WARC-RECORD-TYPE\t"+header.getHeaderValue( HEADER_KEY_TYPE ))); if( record instanceof WARCRecord ) { output.collect( new Text("record-type"), new Text("RECORD-TYPE-WARC") );
if (!header.getHeaderValue("WARC-Type").equals("response")) { return;
Object warcType = nextRecord.getHeader().getHeaderValue("WARC-Type"); if (nextRecord != null && warcType.equals("warcinfo")) {
if (!h.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).equals("response")) { continue;
String rectypeStr = (String)rec.getHeader().getHeaderValue("WARC-Type"); WARCRecordType rectype; try { status = 200; headers = new HashMap<String, String>(); String ct = (String)rec.getHeader().getHeaderValue("Content-Type"); if (ct != null) { headers.put("Content-Type", ct);
result.setFile(file); result.setOffset(offset); result.setDigest(transformWARCDigest(header.getHeaderValue( WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString(); if(type.equals(WARCConstants.WARCRecordType.warcinfo)) { String filename = header.getHeaderValue( WARCConstants.HEADER_KEY_FILENAME).toString(); result.setOriginalUrl("filedesc:"+filename);
result.setFile(file); result.setOffset(offset); result.setDigest(transformWARCDigest(header.getHeaderValue( WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString(); if(type.equals(WARCConstants.WARCRecordType.warcinfo)) { String filename = header.getHeaderValue( WARCConstants.HEADER_KEY_FILENAME).toString(); result.setOriginalUrl("filedesc:"+filename);
getHeaderValue((WARCConstants.HEADER_KEY_IP)); long length = r.getHeader().getLength(); int offset = r.getHeader().getContentBegin();
@Test public void shouldStoreAndIterageOverData() throws IOException { String folder = tempFolder.newFolder().toString(); Page target = new Page(new URL(url), html, responseHeaders); target.setTargetRelevance(TargetRelevance.RELEVANT); target.setFetchTime(System.currentTimeMillis()); WarcTargetRepository repository = new WarcTargetRepository(folder); // when repository.insert(target); repository.close(); File testFolder = new File(folder); if (testFolder.isDirectory()) { File[] allFiles = testFolder.listFiles(); assertTrue(allFiles[0].getName().startsWith("crawl_data")); } Iterator<WARCRecord> it = repository.iterator(); // then assertThat(it.hasNext(), is(true)); WARCRecord page = it.next(); assertThat(it.hasNext(), is(false)); assertThat(page.getHeader().getUrl(), is(url)); assertThat(page.getHeader().getHeaderValue("Content-Type"), is(WARCConstants.HTTP_RESPONSE_MIMETYPE)); assertThat(page.getHeader().getHeaderValue("ACHE-IsRelevant"), is(target.getTargetRelevance().isRelevant() + "")); assertThat(Double.valueOf(page.getHeader().getHeaderValue("ACHE-Relevance").toString()), is(Double.valueOf(target.getTargetRelevance().getRelevance()))); }
String typeStr = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString(); WARCRecordType type; try {
name = entry.getHeader().getHeaderValue(WARCRecord.HEADER_KEY_TYPE)+":"+name;
name = entry.getHeader().getHeaderValue(WARCRecord.HEADER_KEY_TYPE)+":"+name;
String typeStr = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString(); WARCRecordType type; try {
.getHeaderValue((ARCConstants.IP_HEADER_FIELD_KEY)); if (ip != null && ip.length() > 0) { ar.addLabelValue(WARCConstants.NAMED_FIELD_IP_LABEL, ip);