/**
 * Returns the value of the {@code WARC-Refers-To-Target-URI} header of the held WARC record.
 *
 * @return the header value, or {@code null} when no WARC record is held or the record
 *         has no such header
 */
@Override
public String getRefersToTargetURI() {
    if (warcRecord == null) {
        return null;
    }
    final HeaderLine targetUriHeader = warcRecord.getHeader("WARC-Refers-To-Target-URI");
    return (targetUriHeader == null) ? null : targetUriHeader.value;
}
/**
 * Applies this module's digest and URI-validation settings to a WARC reader.
 * <p>
 * Digest computation flags are set first, then the block and payload digest algorithms,
 * then the digest encodings, and finally the URI profiles used for the WARC target URI
 * and for other URIs.
 *
 * @param reader WARC reader instance to configure
 * @throws JhoveException if the reader rejects the configured block or payload
 *         digest algorithm
 */
protected void setReaderOptions(WarcReader reader) throws JhoveException {
    reader.setBlockDigestEnabled(bComputeBlockDigest);
    reader.setPayloadDigestEnabled(bComputePayloadDigest);

    // The block algorithm is validated before the payload algorithm is even attempted.
    final boolean blockAlgorithmAccepted = reader.setBlockDigestAlgorithm(blockDigestAlgorithm);
    if (!blockAlgorithmAccepted) {
        throw new JhoveException(MessageConstants.ERR_BLOCK_DIGEST_INVALID + blockDigestAlgorithm);
    }
    final boolean payloadAlgorithmAccepted = reader.setPayloadDigestAlgorithm(payloadDigestAlgorithm);
    if (!payloadAlgorithmAccepted) {
        throw new JhoveException(MessageConstants.ERR_PAYLOAD_DIGEST_INVALID + payloadDigestAlgorithm);
    }

    reader.setBlockDigestEncoding(blockDigestEncoding);
    reader.setPayloadDigestEncoding(payloadDigestEncoding);

    // Strict validation selects the plain RFC 3986 profile; otherwise the lax profile is used.
    reader.setWarcTargetUriProfile(
            bStrictTargetUriValidation ? UriProfile.RFC3986 : UriProfile.RFC3986_ABS_16BIT_LAX);
    reader.setUriProfile(
            bStrictUriValidation ? UriProfile.RFC3986 : UriProfile.RFC3986_ABS_16BIT_LAX);
}
/**
 * Parses the given stream as a WARC file and records the outcome in {@code info}.
 * <p>
 * The reader's compliance flag is used for both validity and well-formedness. A
 * {@link JhoveException} raised while configuring the reader or parsing records is
 * reported as an error message on {@code info}, which is then marked neither valid
 * nor well-formed. The reader is closed in every case.
 *
 * @param stream     input stream to read the WARC data from
 * @param info       representation information to populate
 * @param parseIndex not used by this implementation
 * @return always {@code 0}
 * @throws IOException if an I/O error occurs while reading
 */
@Override
public int parse(InputStream stream, RepInfo info, int parseIndex) throws IOException {
    final WarcReader warcInput = WarcReaderFactory.getReader(new InputStreamNoSkip(stream), 8192);
    try {
        info.setFormat(_format[0]);
        info.setMimeType(_mimeType[0]);
        info.setModule(this);

        setReaderOptions(warcInput);
        parseRecords(warcInput);

        info.setValid(warcInput.isCompliant());
        info.setWellFormed(warcInput.isCompliant());
        reportResults(warcInput, info);

        // The signature match is only recorded for compliant input.
        if (warcInput.isCompliant()) {
            info.setSigMatch(_name);
        }
    } catch (JhoveException problem) {
        info.setMessage(new ErrorMessage(problem.getMessage()));
        info.setValid(false);
        info.setWellFormed(false);
    } finally {
        if (warcInput != null) {
            warcInput.close();
        }
    }
    return 0;
}
/** * {@inheritDoc} */ @Override protected WarcRecord getNextEntry() throws IOException { WarcRecord record = null; if (this.iterator.hasNext()) { record = this.iterator.next(); // skip all but responses, and only accept HTTP 200s while (record != null && (!"response".equals(record.header.warcTypeStr) || record.getHttpHeader() == null || (HTTP_ACCEPTED != record.getHttpHeader().statusCode))) { if (this.iterator.hasNext()) { record = this.iterator.next(); } else { record = null; } } } return record; }
/**
 * Closes every archive resource this object holds, in this order: WARC record, WARC reader,
 * ARC record, ARC reader, GZIP entry, GZIP reader, push-back input stream. Resources that
 * are {@code null} are skipped.
 * <p>
 * Each close is attempted even if an earlier one throws, so that one failing resource
 * does not leave the remaining ones open. If several closes fail, the exception from the
 * last failing close is the one that propagates.
 *
 * @throws IOException if closing any of the resources fails
 */
@Override
public void close() throws IOException {
    try {
        if (warcRecord != null) {
            warcRecord.close();
        }
    } finally {
        try {
            if (warcReader != null) {
                warcReader.close();
            }
        } finally {
            try {
                if (arcRecord != null) {
                    arcRecord.close();
                }
            } finally {
                try {
                    if (arcReader != null) {
                        arcReader.close();
                    }
                } finally {
                    try {
                        if (gzipEntry != null) {
                            gzipEntry.close();
                        }
                    } finally {
                        try {
                            if (gzipReader != null) {
                                gzipReader.close();
                            }
                        } finally {
                            if (pbin != null) {
                                pbin.close();
                            }
                        }
                    }
                }
            }
        }
    }
}
/** * Skips InputStream to next record past <code>start</code> or not at all if start is * exactly the start of a record */ private void skipToNextRecord(long start) throws IOException { // Skip record by record until (position in input stream) >= start while (fsin.getCount() < start && warcReader.getNextRecord() != null) { } ; lastRecordEnd = fsin.getCount(); } }
/**
 * Submits a request to droid for the payload of a WARC record.
 * <p>
 * The request metadata carries the record's content length (omitted when it is
 * {@code -1}), the record's WARC date in milliseconds (omitted when the date is
 * {@code null}) and the entry name.
 *
 * @param entry the WARC record to submit
 * @param entryName the name of the entry
 * @param parentName the name of the parent file
 * @param in the archive input stream; not read here, the payload stream is obtained
 *        from {@code entry.getPayloadContent()}
 * @param correlationId the correlation Id for the request
 * @param originatorNodeId the Id of the originator node
 * @throws IOException if the input stream could not be read
 */
final void submit(WarcRecord entry, String entryName, URI parentName, InputStream in,
        ResourceId correlationId, long originatorNodeId) throws IOException {
    WarcHeader recordHeader = entry.header;
    long declaredLength = recordHeader.contentLength;
    Date recordDate = recordHeader.warcDate;

    Long sizeOrNull = null;
    if (declaredLength != -1) {
        sizeOrNull = declaredLength;
    }

    Long timeOrNull = null;
    if (recordDate != null) {
        timeOrNull = recordDate.getTime();
    }

    RequestMetaData metaData = new RequestMetaData(sizeOrNull, timeOrNull, entryName);
    super.submit(WEB_ARCHIVE_TYPE, metaData, parentName, entry.getPayloadContent(),
            correlationId, originatorNodeId);
}
/**
 * Processes a single WARC record without characterizing its payload.
 * <p>
 * When the record's version format is valid, the per-version record counter in
 * {@code versions} is incremented. The record's properties are then wrapped in a
 * {@code "Record"} map property and appended to {@code recordProperties}, after which
 * the record is closed.
 *
 * @param record WARC record from WARC reader
 * @throws EOFException if EOF occurs prematurely
 * @throws IOException if an IO error occurs while processing
 * @throws JhoveException if a serious problem needs to be reported
 */
protected void processRecord(WarcRecord record) throws IOException, JhoveException {
    if (record.header.bValidVersionFormat) {
        // First occurrence of a version starts its counter at one.
        final Integer seenSoFar = versions.get(record.header.versionStr);
        versions.put(record.header.versionStr, (seenSoFar == null) ? 1 : seenSoFar + 1);
    }

    final WarcRecordProperties recordDetails = new WarcRecordProperties(record);
    recordProperties.add(new Property("Record", PropertyType.STRING, PropertyArity.MAP,
            recordDetails.getProperties()));

    record.close();
}
@Override public void checkSignatures (File file, InputStream stream, RepInfo info) throws IOException { info.setFormat (_format[0]); info.setMimeType (_mimeType[0]); info.setModule (this); ByteCountingPushBackInputStream pbin = new ByteCountingPushBackInputStream(stream, GzipReader.DEFAULT_INPUT_BUFFER_SIZE); // First try warc uncompressed boolean checkIsWarc = WarcReaderFactory.isWarcFile(pbin); if (checkIsWarc) { info.setSigMatch(_name); return; } // Then try warc compressed boolean checkIsGzip = GzipReader.isGzipped(pbin); if (checkIsGzip) { info.setSigMatch(_name); return; } // Not a warc or a gzip info.setWellFormed (false); }
/** * {@inheritDoc} */ @Override protected WarcRecord getNextEntry() throws IOException { WarcRecord record = null; if (this.iterator.hasNext()) { record = this.iterator.next(); // skip all but responses, and only accept HTTP 200s while (record != null && (!"response".equals(record.header.warcTypeStr) || record.getHttpHeader() == null || (HTTP_ACCEPTED != record.getHttpHeader().statusCode))) { if (this.iterator.hasNext()) { record = this.iterator.next(); } else { record = null; } } } return record; }
/**
 * Releases all held archive resources: WARC record, WARC reader, ARC record, ARC reader,
 * GZIP entry, GZIP reader and the push-back input stream, in that order. {@code null}
 * resources are skipped.
 * <p>
 * The closes are chained through {@code finally} blocks so that a failure while closing
 * one resource no longer prevents the later ones from being closed. When more than one
 * close fails, the exception thrown by the last failing close propagates.
 *
 * @throws IOException if closing any of the resources fails
 */
@Override
public void close() throws IOException {
    try {
        if (warcRecord != null) {
            warcRecord.close();
        }
    } finally {
        try {
            if (warcReader != null) {
                warcReader.close();
            }
        } finally {
            try {
                if (arcRecord != null) {
                    arcRecord.close();
                }
            } finally {
                try {
                    if (arcReader != null) {
                        arcReader.close();
                    }
                } finally {
                    try {
                        if (gzipEntry != null) {
                            gzipEntry.close();
                        }
                    } finally {
                        try {
                            if (gzipReader != null) {
                                gzipReader.close();
                            }
                        } finally {
                            if (pbin != null) {
                                pbin.close();
                            }
                        }
                    }
                }
            }
        }
    }
}
/**
 * Parses all WARC records available from the reader.
 * <p>
 * Every record is handed to {@code processRecord} and its diagnostics are then appended
 * to the reader's diagnostics.
 *
 * @param reader WARC reader used to parse records
 * @throws EOFException if EOF occurs prematurely
 * @throws IOException if an IO error occurs while processing
 * @throws JhoveException if {@code reader} is {@code null} or a serious problem needs
 *         to be reported
 */
protected void parseRecords(WarcReader reader) throws IOException, JhoveException {
    if (reader == null) {
        throw new JhoveException(MessageConstants.ERR_RECORD_NULL);
    }
    for (WarcRecord current = reader.getNextRecord();
            current != null;
            current = reader.getNextRecord()) {
        processRecord(current);
        reader.diagnostics.addAll(current.diagnostics);
    }
}
/**
 * Submits the payload of a WARC record to droid as a new request.
 * <p>
 * Size and time are passed as {@code null} in the request metadata when the record's
 * content length is {@code -1} or its WARC date is {@code null}, respectively.
 *
 * @param entry the WARC record to submit
 * @param entryName the name of the entry
 * @param parentName the name of the parent file
 * @param in the archive input stream; unused, the stream submitted is
 *        {@code entry.getPayloadContent()}
 * @param correlationId the correlation Id for the request
 * @param originatorNodeId the Id of the originator node
 * @throws IOException if the input stream could not be read
 */
final void submit(WarcRecord entry, String entryName, URI parentName, InputStream in,
        ResourceId correlationId, long originatorNodeId) throws IOException {
    WarcHeader warcHeader = entry.header;
    long contentSize = warcHeader.contentLength;
    Date created = warcHeader.warcDate;

    Long metaSize;
    if (contentSize == -1) {
        metaSize = null;
    } else {
        metaSize = contentSize;
    }

    Long metaTime;
    if (created == null) {
        metaTime = null;
    } else {
        metaTime = created.getTime();
    }

    RequestMetaData metaData = new RequestMetaData(metaSize, metaTime, entryName);
    super.submit(WEB_ARCHIVE_TYPE, metaData, parentName, entry.getPayloadContent(),
            correlationId, originatorNodeId);
}
/**
 * Looks up the {@code WARC-Refers-To-Target-URI} header on the held WARC record.
 *
 * @return the header's value; {@code null} if there is no WARC record or the header
 *         is absent
 */
@Override
public String getRefersToTargetURI() {
    String targetUri = null;
    if (warcRecord != null) {
        final HeaderLine headerLine = warcRecord.getHeader("WARC-Refers-To-Target-URI");
        if (headerLine != null) {
            targetUri = headerLine.value;
        }
    }
    return targetUri;
}
/**
 * Returns the {@code WARC-Refers-To-Date} header of the held WARC record, re-formatted
 * through {@code ArchiveUtils.get14DigitDate}.
 *
 * @return the formatted date, or {@code null} when no WARC record is held, the header
 *         is absent, or {@code ArchiveUtils.parse14DigitISODate} yields {@code null}
 */
@Override
public String getRefersToDate() {
    if (warcRecord == null) {
        return null;
    }
    final HeaderLine dateHeader = warcRecord.getHeader("WARC-Refers-To-Date");
    if (dateHeader == null) {
        return null;
    }
    // The second argument is the fallback returned by the parser; null means "no date".
    final Date parsed = ArchiveUtils.parse14DigitISODate(dateHeader.value, null);
    if (parsed == null) {
        return null;
    }
    return ArchiveUtils.get14DigitDate(parsed);
}
/**
 * Reads the {@code WARC-Refers-To-Date} header of the held WARC record, parses it with
 * {@code ArchiveUtils.parse14DigitISODate} and formats the result with
 * {@code ArchiveUtils.get14DigitDate}.
 *
 * @return the formatted date; {@code null} if there is no WARC record, no such header,
 *         or the parse result is {@code null}
 */
@Override
public String getRefersToDate() {
    String formatted = null;
    if (warcRecord != null) {
        final HeaderLine refersToDate = warcRecord.getHeader("WARC-Refers-To-Date");
        final Date when = (refersToDate == null)
                ? null
                : ArchiveUtils.parse14DigitISODate(refersToDate.value, null);
        if (when != null) {
            formatted = ArchiveUtils.get14DigitDate(when);
        }
    }
    return formatted;
}
/**
 * Maps the {@code HEADER_KEY_TYPE} header of a WARC record to a {@code WARCRecordType}.
 *
 * @param rec the record whose type header is read
 * @return the enum constant whose name equals the header value
 * @throws ResourceNotAvailableException if the header is missing or its value is not
 *         the name of a {@code WARCRecordType} constant
 */
private static WARCRecordType getWARCRecordType(WarcRecord rec)
        throws ResourceNotAvailableException {
    final HeaderLine typeHeader = rec.getHeader(HEADER_KEY_TYPE);
    if (typeHeader == null) {
        throw new ResourceNotAvailableException("WARC-Type header is missing");
    }
    WARCRecordType resolved;
    try {
        resolved = WARCRecordType.valueOf(typeHeader.value);
    } catch (IllegalArgumentException unknownName) {
        // valueOf rejects any name that is not an exact constant name.
        throw new ResourceNotAvailableException(
                "unrecognized WARC-Type \"" + typeHeader.value + "\"");
    }
    return resolved;
}
/**
 * Determines the {@code WARCRecordType} of a record from its {@code HEADER_KEY_TYPE}
 * header.
 *
 * @param rec the WARC record to inspect
 * @return the {@code WARCRecordType} constant named by the header value
 * @throws ResourceNotAvailableException when the header is absent, or when
 *         {@code WARCRecordType.valueOf} does not recognize the header value
 */
private static WARCRecordType getWARCRecordType(WarcRecord rec)
        throws ResourceNotAvailableException {
    HeaderLine recordTypeLine = rec.getHeader(HEADER_KEY_TYPE);
    if (recordTypeLine != null) {
        try {
            return WARCRecordType.valueOf(recordTypeLine.value);
        } catch (IllegalArgumentException notAConstant) {
            throw new ResourceNotAvailableException(
                    "unrecognized WARC-Type \"" + recordTypeLine.value + "\"");
        }
    }
    throw new ResourceNotAvailableException("WARC-Type header is missing");
}