/**
 * Builds a writable wrapper whose {@code record} field holds a new {@code WarcRecord}
 * constructed from the given record.
 *
 * @param o the record handed to the {@code WarcRecord} constructor
 */
public WritableWarcRecord(WarcRecord o) {
    this.record = new WarcRecord(o);
}
/**
 * Looks up the {@code "WARC-Target-URI"} header item of the wrapped record.
 *
 * @return whatever {@code warcRecord.getHeaderMetadataItem} returns for that key
 */
public String getTargetURI() {
    final String targetUri = warcRecord.getHeaderMetadataItem("WARC-Target-URI");
    return targetUri;
}
/**
 * Delegates deserialization to the wrapped record.
 *
 * <p>Nothing is read from {@code in} when {@code record} is {@code null}.
 *
 * @param in the input to read the record's fields from
 * @throws IOException if the wrapped record fails to read from {@code in}
 */
public void readFields(DataInput in) throws IOException {
    if (record == null) {
        return;
    }
    record.readFields(in);
}
// Excerpt of readNextWarcRecord: obtains the record bytes from readNextRecord(in, recordHeader),
// then walks headerLines, splitting each line at the first ':' (split limit 2). A line without
// a colon is stored through addHeaderMetadata with an empty value. The keys "WARC-Date",
// "WARC-Record-ID" and "Content-Type" go to dedicated setters, any other key goes to
// addHeaderMetadata, and the content bytes are attached with setContent(recordContent).
// NOTE(review): this excerpt is not brace-balanced and uses recordHeader, headerLines, thisKey
// and thisValue without visible declarations; statements appear to have been elided (e.g. the
// call after `continue;`). Confirm against the full method before editing.
public static WarcRecord readNextWarcRecord(DataInputStream in) throws IOException { byte[] recordContent=readNextRecord(in, recordHeader); if (recordContent==null) { WarcRecord retRecord=new WarcRecord(); for (int i=0; i < headerLines.length; i++) { String[] pieces=headerLines[i].split(":", 2); if (pieces.length!=2) { retRecord.addHeaderMetadata(pieces[0], ""); continue; retRecord.setWarcRecordType(thisValue); } else if (thisKey.equals("WARC-Date")) { retRecord.setWarcDate(thisValue); } else if (thisKey.equals("WARC-Record-ID")) { retRecord.setWarcUUID(thisValue); } else if (thisKey.equals("Content-Type")) { retRecord.setWarcContentType(thisValue); } else { retRecord.addHeaderMetadata(thisKey, thisValue); retRecord.setContent(recordContent);
public boolean next(LongWritable key, WritableWarcRecord value) throws IOException { DataInputStream whichStream=null; if (compressionInput!=null) { whichStream=compressionInput; } else if (currentFile!=null) { whichStream=currentFile; } if (whichStream==null) { return false; } WarcRecord newRecord=WarcRecord.readNextWarcRecord(whichStream); if (newRecord==null) { // try advancing the file if (openNextFile()) { newRecord=WarcRecord.readNextWarcRecord(whichStream); } if (newRecord==null) { return false; } } totalNumBytesRead += (long)newRecord.getTotalRecordLength(); newRecord.setWarcFilePath(filePathList[currentFilePath].toString()); // now, set our output variables value.setRecord(newRecord); key.set(recordNumber); recordNumber++; return true; }
// Excerpt: returns early unless the record's header type equals "response", then reads the
// content bytes and the "WARC-Target-URI" and "WARC-IP-Address" header items from wr.
// NOTE(review): as excerpted, the second `return;` directly precedes the `String ip`
// statement, which would make that statement unreachable; a guarding condition was probably
// elided. Confirm against the full source.
if (wr.getHeaderRecordType().equals("response") == false) return; byte[] binarycontent = wr.getContent(); String uri = wr.getHeaderMetadataItem("WARC-Target-URI"); return; String ip = wr.getHeaderMetadataItem("WARC-IP-Address");
public static void main(String[] args) throws IOException { // use a callback class for handling WARC record data: IProcessWarcRecord processor = new SampleProcessWarcRecord(); String inputWarcFile="CC-MAIN-20140305125104-00002-ip-10-183-142-35.ec2.internal.warc.gz"; GZIPInputStream gzInputStream=new GZIPInputStream(new FileInputStream(inputWarcFile)); DataInputStream inStream=new DataInputStream(gzInputStream); WarcRecord thisWarcRecord; while ((thisWarcRecord=WarcRecord.readNextWarcRecord(inStream))!=null) { System.out.println("%% thisWarcRecord.getHeaderRecordType() = " + thisWarcRecord.getHeaderRecordType()); if (thisWarcRecord.getHeaderRecordType().equals("response")) { WarcHTMLResponseRecord htmlRecord=new WarcHTMLResponseRecord(thisWarcRecord); String thisTargetURI=htmlRecord.getTargetURI(); String thisContentUtf8 = htmlRecord.getRawRecord().getContentUTF8(); // handle WARC record content: processor.process(thisTargetURI, thisContentUtf8); } } inStream.close(); // done processing all WARC records: processor.done(); } }
// Excerpt: looks up the WARC_TARGET_URI header item of the record and wraps the value
// returned by record.getContent() in a new Text.
String url = record.getHeaderMetadataItem(WARC_TARGET_URI); Text metadata = new Text(record.getContent());
// setContent(String): converts the string to bytes and passes them to setContent.
// NOTE(review): String.getBytes() with no argument encodes with the platform default
// charset, so the stored bytes can differ between machines. Consider an explicit charset
// once the intended encoding is confirmed.
// The setContentLength declaration that follows is cut off in this excerpt.
public void setContent(String content) { setContent(content.getBytes()); } public void setContentLength(int len) {
// Excerpt: reads lines until foundMark is set or readLineFromInputStream returns null. On a
// line starting with WARC_VERSION it stores that line in WARC_VERSION_LINE, then iterates
// over the following trimmed lines while the line is non-empty or contentLength is negative.
// NOTE(review): the while condition null-checks readLineFromInputStream(in), but the for
// loop calls .trim() on its result directly. A null return there (presumably end of stream)
// would throw NullPointerException; confirm and guard if so.
while ((!foundMark) && ((line = readLineFromInputStream(in)) != null)) { if (line.startsWith(WARC_VERSION)) { WARC_VERSION_LINE = line; for (line = readLineFromInputStream(in).trim(); line.length() > 0 || contentLength < 0; line = readLineFromInputStream(in).trim()) {
// Fetch the record's content as a byte array.
byte[] contentBytes=warcRecord.getContent();
// Excerpt of a record-parsing routine: obtains the record bytes from
// readNextRecord(in, recordHeader), then walks headerLines, splitting each line at the first
// ':' (split limit 2). A line without a colon is stored through addHeaderMetadata with an
// empty value. The keys WARC_DATE, WARC_RECORD_ID and CONTENT_TYPE go to dedicated setters,
// any other key goes to addHeaderMetadata, and the content bytes are attached with
// setContent(recordContent).
// NOTE(review): this excerpt is not brace-balanced and uses recordHeader, headerLines, thisKey
// and thisValue without visible declarations; statements appear to have been elided (e.g. the
// call after `continue;`). Confirm against the full method before editing.
byte[] recordContent = readNextRecord(in, recordHeader); if (recordContent == null) { WarcRecord retRecord = new WarcRecord(); for (int i = 0; i < headerLines.length; i++) { String[] pieces = headerLines[i].split(":", 2); if (pieces.length != 2) { retRecord.addHeaderMetadata(pieces[0], ""); continue; retRecord.setWarcRecordType(thisValue); } else if (thisKey.equals(WARC_DATE)) { retRecord.setWarcDate(thisValue); } else if (thisKey.equals(WARC_RECORD_ID)) { retRecord.setWarcUUID(thisValue); } else if (thisKey.equals(CONTENT_TYPE)) { retRecord.setWarcContentType(thisValue); } else { retRecord.addHeaderMetadata(thisKey, thisValue); retRecord.setContent(recordContent);
public boolean next(LongWritable key, WritableWarcRecord value) throws IOException { DataInputStream whichStream=null; if (compressionInput!=null) { whichStream=compressionInput; } else if (currentFile!=null) { whichStream=currentFile; } if (whichStream==null) { return false; } WarcRecord newRecord=WarcRecord.readNextWarcRecord(whichStream); if (newRecord==null) { // try advancing the file if (openNextFile()) { newRecord=WarcRecord.readNextWarcRecord(whichStream); } if (newRecord==null) { return false; } } totalNumBytesRead += (long)newRecord.getTotalRecordLength(); newRecord.setWarcFilePath(filePathList[currentFilePath].toString()); // now, set our output variables value.setRecord(newRecord); key.set(recordNumber); recordNumber++; return true; }
// Excerpt: reads records from inStream until readNextWarcRecord returns null. For each record
// whose header type equals "response", it wraps the record in a WarcHTMLResponseRecord and
// extracts the target URI and the raw record's content via getContentUTF8().
// The loop body is cut off in this excerpt.
while ((thisWarcRecord = WarcRecord.readNextWarcRecord(inStream)) != null) { if (thisWarcRecord.getHeaderRecordType().equals("response")) { WarcHTMLResponseRecord htmlRecord = new WarcHTMLResponseRecord(thisWarcRecord); String thisTargetURI = htmlRecord.getTargetURI(); String thisContentUtf8 = htmlRecord.getRawRecord().getContentUTF8();
// setContent(String): converts the string to bytes and passes them to setContent.
// NOTE(review): String.getBytes() with no argument encodes with the platform default
// charset, so the stored bytes can differ between machines. Consider an explicit charset
// once the intended encoding is confirmed.
// The setContentLength declaration that follows is cut off in this excerpt.
public void setContent(String content) { setContent(content.getBytes()); } public void setContentLength(int len) {
// Excerpt: reads lines until foundMark is set or readLineFromInputStream returns null. On a
// line starting with WARC_VERSION it stores that line in WARC_VERSION_LINE, then iterates
// over the following trimmed lines while the line is non-empty or contentLength is negative.
// NOTE(review): the while condition null-checks readLineFromInputStream(in), but the for
// loop calls .trim() on its result directly. A null return there (presumably end of stream)
// would throw NullPointerException; confirm and guard if so.
while ((!foundMark) && ((line=readLineFromInputStream(in))!=null)) { if (line.startsWith(WARC_VERSION)) { WARC_VERSION_LINE = line; for (line = readLineFromInputStream(in).trim(); line.length() > 0 || contentLength < 0; line = readLineFromInputStream(in).trim()) {
// Fetch the record's content as a byte array.
byte[] contentBytes = warcRecord.getContent();
// Excerpt of readNextWarcRecord: obtains the record bytes from readNextRecord(in, recordHeader),
// then walks headerLines, splitting each line at the first ':' (split limit 2). A line without
// a colon is stored through addHeaderMetadata with an empty value. The keys "WARC-Date",
// "WARC-Record-ID" and "Content-Type" go to dedicated setters, any other key goes to
// addHeaderMetadata, and the content bytes are attached with setContent(recordContent).
// NOTE(review): this excerpt is not brace-balanced and uses recordHeader, headerLines, thisKey
// and thisValue without visible declarations; statements appear to have been elided (e.g. the
// call after `continue;`). Confirm against the full method before editing.
public static WarcRecord readNextWarcRecord(DataInputStream in) throws IOException { byte[] recordContent=readNextRecord(in, recordHeader); if (recordContent==null) { WarcRecord retRecord=new WarcRecord(); for (int i=0; i < headerLines.length; i++) { String[] pieces=headerLines[i].split(":", 2); if (pieces.length!=2) { retRecord.addHeaderMetadata(pieces[0], ""); continue; retRecord.setWarcRecordType(thisValue); } else if (thisKey.equals("WARC-Date")) { retRecord.setWarcDate(thisValue); } else if (thisKey.equals("WARC-Record-ID")) { retRecord.setWarcUUID(thisValue); } else if (thisKey.equals("Content-Type")) { retRecord.setWarcContentType(thisValue); } else { retRecord.addHeaderMetadata(thisKey, thisValue); retRecord.setContent(recordContent);
public boolean next(LongWritable key, WritableWarcRecord value) throws IOException { DataInputStream whichStream = null; if (compressionInput != null) { whichStream = compressionInput; } else if (currentFile != null) { whichStream = currentFile; } if (whichStream == null) { return false; } WarcRecord newRecord = WarcRecord.readNextWarcRecord(whichStream); if (newRecord == null) { // try advancing the file if (openNextFile()) { newRecord = WarcRecord.readNextWarcRecord(whichStream); } if (newRecord == null) { return false; } } totalNumBytesRead += (long) newRecord.getTotalRecordLength(); newRecord.setWarcFilePath(filePathList[currentFilePath].toString()); // now, set our output variables value.setRecord(newRecord); key.set(recordNumber); recordNumber++; return true; }
public WritableWarcRecord(WarcRecord o) { record = new WarcRecord(o); }