String t = r.getHeader().getDate().replaceAll("[-T:Z]", ""); long time = ArchiveUtils.getSecondsSinceEpoch(t).getTime(); writer.write(r.getHeader().getUrl(), mimetype, ip, time,
String arcDateString = r.getHeader().getDate(); String warcDateString = DateTimeFormat.forPattern("yyyyMMddHHmmss") .withZone(DateTimeZone.UTC)
protected String outputCdx(final String strippedFileName) throws IOException { // Read the whole record so we get out a hash. Should be safe calling // close on already closed Record. close(); ArchiveRecordHeader h = getHeader(); StringBuilder buffer = new StringBuilder(ArchiveFileConstants.CDX_LINE_BUFFER_SIZE); buffer.append(h.getDate()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getIp4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getUrl()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getMimetype4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getStatusCode4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getDigest4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getOffset()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getLength()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(strippedFileName != null? strippedFileName: '-'); return buffer.toString(); }
protected String outputCdx(final String strippedFileName) throws IOException { // Read the whole record so we get out a hash. Should be safe calling // close on already closed Record. close(); ArchiveRecordHeader h = getHeader(); StringBuilder buffer = new StringBuilder(ArchiveFileConstants.CDX_LINE_BUFFER_SIZE); buffer.append(h.getDate()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getIp4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getUrl()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getMimetype4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getStatusCode4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getDigest4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getOffset()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getLength()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(strippedFileName != null? strippedFileName: '-'); return buffer.toString(); }
protected String outputCdx(final String strippedFileName) throws IOException { // Read the whole record so we get out a hash. Should be safe calling // close on already closed Record. close(); ArchiveRecordHeader h = getHeader(); StringBuilder buffer = new StringBuilder(ArchiveFileConstants.CDX_LINE_BUFFER_SIZE); buffer.append(h.getDate()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getIp4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getUrl()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getMimetype4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getStatusCode4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getDigest4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getOffset()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getLength()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(strippedFileName != null? strippedFileName: '-'); return buffer.toString(); }
// Build the key as "<digits of the header date>/<record URL>": every
// non-digit character is stripped from the date string before joining.
newKey = (header.getDate().replaceAll("[^0-9]", "")) + "/" + header.getUrl();
// Build the key as "<digits of the header date>/<record URL>": every
// non-digit character is stripped from the date string before joining.
newKey = (header.getDate().replaceAll("[^0-9]", "")) + "/" + header.getUrl();
// Parse the record header's date string with Instant.parse (ISO-8601 instant
// text) and keep it as milliseconds since the epoch.
this.fetchTime = Instant.parse(warc.getHeader().getDate()).toEpochMilli();
if( matcher.matches() ) { resourceHost = matcher.group( 2 ); year = value.getRecord().getHeader().getDate().substring( 0, 4 ); outputKey = new Text( year + "\t" + resourceHost );
content = WarcRecordUtils.getContent(record.getRecord()); type = WarcRecordUtils.getWarcResponseMimeType(content); d = ISO8601.parse(header.getDate()); date = ArchiveUtils.get14DigitDate(d); } catch (OutOfMemoryError e) {
String date = header.getDate(); if( date != null && date.length() > 4 ) { output.collect( new Text("content-types"), new Text("YEAR\t"+date.substring(0,4)) );
String date = header.getDate(); if( date != null && date.length() > 4 ) { output.collect( new Text("content-types"), new Text("YEAR\t"+date.substring(0,4)) );
// Parse the header's date string with the iso8601 parser, then reformat the
// resulting Date through ArchiveUtils.get14DigitDate.
// NOTE(review): presumably this yields a 14-digit timestamp string, going by
// the helper's name - confirm against ArchiveUtils.
Date d = iso8601.parse(h.getDate()); String date = ArchiveUtils.get14DigitDate(d);
// Derive the year from the record header's date string via WARCIndexer.extractYear.
String year = WARCIndexer.extractYear(header.getDate());
String date = rec.getHeader().getDate(); if (date != null) { try {
String t = r.getHeader().getDate().replaceAll("[-T:Z]", ""); long time = ArchiveUtils.getSecondsSinceEpoch(t).getTime(); writer.write(r.getHeader().getUrl(), mimetype, ip, time,
// Populate the result from the record header: the capture timestamp is the
// header date passed through transformWARCDate, alongside the containing file
// and the offset reported by the header.
long offset = header.getOffset(); result.setCaptureTimestamp(transformWARCDate(header.getDate())); result.setFile(file); result.setOffset(offset);
// Populate the result from the record header: the capture timestamp is the
// header date passed through transformWARCDate, alongside the containing file
// and the offset reported by the header.
long offset = header.getOffset(); result.setCaptureTimestamp(transformWARCDate(header.getDate())); result.setFile(file); result.setOffset(offset);
String arcDateString = r.getHeader().getDate(); String warcDateString = DateTimeFormat.forPattern("yyyyMMddHHmmss") .withZone(DateTimeZone.UTC)
/**
 * Builds a test revisit record that points back at {@code revisited}.
 *
 * <p>The new record takes its payload length from the revisited capture's
 * {@code Content-Length} HTTP header ({@code -1} when that header is absent),
 * and its {@code WARC-Refers-To-Target-URI} / {@code WARC-Refers-To-Date}
 * headers and URL from the revisited capture's WARC headers.
 *
 * @param timestamp CDX-style 14-digit timestamp for the new record
 * @param revisited capture being revisited; it must be a {@link WarcResource},
 *        otherwise a {@code ClassCastException} is thrown
 * @param withHeader pass {@code true} unless emulating the old implementation
 *        in which revisit records carried no HTTP headers
 * @return the new Resource, with its headers already parsed
 * @throws IOException for unexpected I/O error building payload
 */
public static Resource createTestRevisitResource(String timestamp,
        Resource revisited, boolean withHeader) throws IOException {
    final String contentLength = revisited.getHttpHeaders().get("Content-Length");
    final int payloadLength;
    if (contentLength == null) {
        // No Content-Length on the revisited capture: use -1 as the length.
        payloadLength = -1;
    } else {
        payloadLength = Integer.parseInt(contentLength);
    }

    final TestWARCRecordInfo revisitInfo = TestWARCRecordInfo
            .createRevisitHttpResponse("text/html", payloadLength, withHeader);
    revisitInfo.setCreate14DigitDateFromDT14(timestamp);

    // Link the revisit back to the original capture through its WARC headers.
    final ArchiveRecordHeader originalHeader =
            ((WarcResource) revisited).getWarcHeaders();
    revisitInfo.addExtraHeader("WARC-Refers-To-Target-URI", originalHeader.getUrl());
    revisitInfo.addExtraHeader("WARC-Refers-To-Date", originalHeader.getDate());
    revisitInfo.setUrl(originalHeader.getUrl());

    final TestWARCReader reader = new TestWARCReader(revisitInfo);
    final WARCRecord firstRecord = reader.get(0);
    final WarcResource revisit = new WarcResource(firstRecord, reader);
    revisit.parseHeaders();
    return revisit;
}