/** * {@inheritDoc} */ @Override protected WarcRecord getNextEntry() throws IOException { WarcRecord record = null; if (this.iterator.hasNext()) { record = this.iterator.next(); // skip all but responses, and only accept HTTP 200s while (record != null && (!"response".equals(record.header.warcTypeStr) || record.getHttpHeader() == null || (HTTP_ACCEPTED != record.getHttpHeader().statusCode))) { if (this.iterator.hasNext()) { record = this.iterator.next(); } else { record = null; } } } return record; }
/**
 * Returns the value of the {@code WARC-Refers-To-Target-URI} header of the current
 * WARC record.
 *
 * @return the header value, or {@code null} when there is no WARC record or the
 *         record has no such header
 */
@Override
public String getRefersToTargetURI() {
    if (warcRecord == null) {
        return null;
    }
    final HeaderLine refersTo = warcRecord.getHeader("WARC-Refers-To-Target-URI");
    return (refersTo == null) ? null : refersTo.value;
}
WARCRecordType rectype = getWARCRecordType(r.warcRecord); if (rectype == WARCRecordType.response || rectype == WARCRecordType.revisit) { payload = r.warcRecord.getPayload(); if (payload != null) { httpHeader = r.warcRecord.getHttpHeader(); payload = r.warcRecord.getPayload(); r.payloadStream = payload.getInputStreamComplete(); r.length = payload.getTotalLength(); r.status = 200; HeaderLine ctHeader = r.warcRecord.getHeader("content-type"); if (ctHeader != null) { contentType = ctHeader.value; HeaderLine dateHeader = r.warcRecord.getHeader(HEADER_KEY_DATE); if (dateHeader != null) { try {
startOffset = record.getStartOffset(); consumed = record.getConsumed(); if (header.bValidVersionFormat) { this.warcVersionStr = header.versionStr; bIsNonCompliant = !record.isCompliant(); isValidBlockDigest = record.isValidBlockDigest; isValidPayloadDigest = record.isValidPayloadDigest; bHasPayload = record.hasPayload(); Payload payload = record.getPayload(); HeaderLine headerLine; if (payload != null) {
url = warcRecord.header.warcTargetUriStr; key = UrlUtils.urlToKey(url); Payload payload = warcRecord.getPayload(); HttpHeader httpHeader = null; InputStream payloadStream = null; continue; httpHeader = warcRecord.getHttpHeader(); if (httpHeader != null) { payloadStream = httpHeader.getPayloadInputStream();
&& record.getHttpHeader() != null && httpACCEPTED == record.getHttpHeader().statusCode) { IdentificationRequest warcRequest = factory.newRequest(metaData, identifier); ByteCountingPushBackInputStream in = (ByteCountingPushBackInputStream) record.getPayloadContent();
PayloadWithHeaderAbstract payloadHeader = arcRecord.getPayload() .getPayloadHeaderWrapped(); if (payloadHeader == null)
/**
 * Submits a request to droid for a single WARC record.
 *
 * <p>The request metadata is taken from the record header: the content length (left
 * {@code null} when it is {@code -1}), the WARC date in milliseconds (left {@code null}
 * when the header has no date) and the entry name. The stream passed on to the
 * superclass is the record's own payload content; the {@code in} argument is not read
 * by this method.
 *
 * @param entry the WARC record to submit
 * @param entryName the name of the entry
 * @param parentName the name of the parent file
 * @param in the archive input stream (not used here)
 * @param correlationId the correlation Id for the request
 * @param originatorNodeId the Id of the originator node
 * @throws IOException if the input stream could not be read
 */
final void submit(WarcRecord entry, String entryName, URI parentName, InputStream in,
        ResourceId correlationId, long originatorNodeId) throws IOException {
    final WarcHeader warcHeader = entry.header;
    final long contentLength = warcHeader.contentLength;
    final Date warcDate = warcHeader.warcDate;

    // -1 marks an unknown length; it is reported as "no size".
    Long sizeOrNull = null;
    if (contentLength != -1) {
        sizeOrNull = contentLength;
    }

    Long timeOrNull = null;
    if (warcDate != null) {
        timeOrNull = warcDate.getTime();
    }

    final RequestMetaData metaData = new RequestMetaData(sizeOrNull, timeOrNull, entryName);
    super.submit(WEB_ARCHIVE_TYPE, metaData, parentName, entry.getPayloadContent(),
            correlationId, originatorNodeId);
}
/**
 * Processes a single WARC record without characterizing its payload.
 *
 * <p>If the record header has a valid version format, the per-version counter in
 * {@code versions} is incremented. A {@code "Record"} property built from the record is
 * then appended to {@code recordProperties}, and finally the record is closed.
 *
 * @param record WARC record from WARC reader
 * @throws EOFException if EOF occurs prematurely
 * @throws IOException if an IO error occurs while processing
 * @throws JhoveException if a serious problem needs to be reported
 */
protected void processRecord(WarcRecord record) throws IOException, JhoveException {
    if (record.header.bValidVersionFormat) {
        // Tally how many records were seen for each WARC version string.
        final String version = record.header.versionStr;
        final Integer seenSoFar = versions.get(version);
        versions.put(version, (seenSoFar == null) ? 1 : seenSoFar + 1);
    }

    final WarcRecordProperties recordProps = new WarcRecordProperties(record);
    recordProperties.add(new Property("Record", PropertyType.STRING, PropertyArity.MAP,
            recordProps.getProperties()));

    record.close();
}
WARCRecordType rectype = getWARCRecordType(r.warcRecord); if (rectype == WARCRecordType.response || rectype == WARCRecordType.revisit) { payload = r.warcRecord.getPayload(); if (payload != null) { httpHeader = r.warcRecord.getHttpHeader(); payload = r.warcRecord.getPayload(); r.payloadStream = payload.getInputStreamComplete(); r.length = payload.getTotalLength(); r.status = 200; HeaderLine ctHeader = r.warcRecord.getHeader("content-type"); if (ctHeader != null) { contentType = ctHeader.value; HeaderLine dateHeader = r.warcRecord.getHeader(HEADER_KEY_DATE); if (dateHeader != null) { try {
/**
 * Submits one WARC record to droid.
 *
 * <p>Size and timestamp for the request metadata come from the record header. A content
 * length of {@code -1} and a missing WARC date are both passed on as {@code null}. The
 * payload handed to the superclass is obtained from the record itself; the {@code in}
 * argument is not consulted by this method.
 *
 * @param entry the WARC record to submit
 * @param entryName the name of the entry
 * @param parentName the name of the parent file
 * @param in the archive input stream (not used here)
 * @param correlationId the correlation Id for the request
 * @param originatorNodeId the Id of the originator node
 * @throws IOException if the input stream could not be read
 */
final void submit(WarcRecord entry, String entryName, URI parentName, InputStream in,
        ResourceId correlationId, long originatorNodeId) throws IOException {
    final WarcHeader recordHeader = entry.header;
    final long declaredLength = recordHeader.contentLength;
    final Date recordDate = recordHeader.warcDate;

    final Long size;
    if (declaredLength == -1) {
        // Unknown length is reported as absent rather than as -1.
        size = null;
    } else {
        size = declaredLength;
    }

    final Long timestamp;
    if (recordDate == null) {
        timestamp = null;
    } else {
        timestamp = recordDate.getTime();
    }

    final RequestMetaData metaData = new RequestMetaData(size, timestamp, entryName);
    super.submit(WEB_ARCHIVE_TYPE, metaData, parentName, entry.getPayloadContent(),
            correlationId, originatorNodeId);
}
/**
 * Closes every resource this object holds, skipping those that are {@code null}.
 *
 * <p>Resources are closed in the order {@code warcRecord}, {@code warcReader},
 * {@code arcRecord}, {@code arcReader}, {@code gzipEntry}, {@code gzipReader},
 * {@code pbin}. Each close is attempted even when an earlier one threw, so a single
 * failure does not leave the remaining resources open. If several closes fail, the
 * exception from the last failing close is the one that propagates.
 *
 * @throws IOException if closing any of the resources fails
 */
@Override
public void close() throws IOException {
    try {
        closeWarcResources();
    } finally {
        try {
            closeArcResources();
        } finally {
            try {
                closeGzipResources();
            } finally {
                if (pbin != null) {
                    pbin.close();
                }
            }
        }
    }
}

/** Closes the WARC record, then the WARC reader; the reader is attempted even if the record fails. */
private void closeWarcResources() throws IOException {
    try {
        if (warcRecord != null) {
            warcRecord.close();
        }
    } finally {
        if (warcReader != null) {
            warcReader.close();
        }
    }
}

/** Closes the ARC record, then the ARC reader; the reader is attempted even if the record fails. */
private void closeArcResources() throws IOException {
    try {
        if (arcRecord != null) {
            arcRecord.close();
        }
    } finally {
        if (arcReader != null) {
            arcReader.close();
        }
    }
}

/** Closes the GZIP entry, then the GZIP reader; the reader is attempted even if the entry fails. */
private void closeGzipResources() throws IOException {
    try {
        if (gzipEntry != null) {
            gzipEntry.close();
        }
    } finally {
        if (gzipReader != null) {
            gzipReader.close();
        }
    }
}
/** * {@inheritDoc} */ @Override protected WarcRecord getNextEntry() throws IOException { WarcRecord record = null; if (this.iterator.hasNext()) { record = this.iterator.next(); // skip all but responses, and only accept HTTP 200s while (record != null && (!"response".equals(record.header.warcTypeStr) || record.getHttpHeader() == null || (HTTP_ACCEPTED != record.getHttpHeader().statusCode))) { if (this.iterator.hasNext()) { record = this.iterator.next(); } else { record = null; } } } return record; }
byte[] buffer = IOUtils.toByteArray(record.getPayloadContent()); Class<?> encodingDetectorClass = conf.getClass("dkpro.input.encodingdetector", DummyEncodingDetector.class); try {
/**
 * Closes every resource this object holds, skipping those that are {@code null}.
 *
 * <p>Resources are closed in the order {@code warcRecord}, {@code warcReader},
 * {@code arcRecord}, {@code arcReader}, {@code gzipEntry}, {@code gzipReader},
 * {@code pbin}. Each close is attempted even when an earlier one threw, so a single
 * failure does not leave the remaining resources open. If several closes fail, the
 * exception from the last failing close is the one that propagates.
 *
 * @throws IOException if closing any of the resources fails
 */
@Override
public void close() throws IOException {
    try {
        closeWarcResources();
    } finally {
        try {
            closeArcResources();
        } finally {
            try {
                closeGzipResources();
            } finally {
                if (pbin != null) {
                    pbin.close();
                }
            }
        }
    }
}

/** Closes the WARC record, then the WARC reader; the reader is attempted even if the record fails. */
private void closeWarcResources() throws IOException {
    try {
        if (warcRecord != null) {
            warcRecord.close();
        }
    } finally {
        if (warcReader != null) {
            warcReader.close();
        }
    }
}

/** Closes the ARC record, then the ARC reader; the reader is attempted even if the record fails. */
private void closeArcResources() throws IOException {
    try {
        if (arcRecord != null) {
            arcRecord.close();
        }
    } finally {
        if (arcReader != null) {
            arcReader.close();
        }
    }
}

/** Closes the GZIP entry, then the GZIP reader; the reader is attempted even if the entry fails. */
private void closeGzipResources() throws IOException {
    try {
        if (gzipEntry != null) {
            gzipEntry.close();
        }
    } finally {
        if (gzipReader != null) {
            gzipReader.close();
        }
    }
}
/**
 * Looks up the {@code WARC-Refers-To-Target-URI} header on the current WARC record.
 *
 * @return the value of that header, or {@code null} if no WARC record is set or the
 *         header is absent
 */
@Override
public String getRefersToTargetURI() {
    String targetUri = null;
    if (warcRecord != null) {
        final HeaderLine header = warcRecord.getHeader("WARC-Refers-To-Target-URI");
        if (header != null) {
            targetUri = header.value;
        }
    }
    return targetUri;
}
/**
 * Returns the record's {@code WARC-Refers-To-Date} header, parsed with
 * {@code ArchiveUtils.parse14DigitISODate} and re-formatted with
 * {@code ArchiveUtils.get14DigitDate}.
 *
 * @return the re-formatted date, or {@code null} when there is no WARC record, the
 *         header is absent, or the parser yields {@code null}
 */
@Override
public String getRefersToDate() {
    if (warcRecord == null) {
        return null;
    }
    final HeaderLine refersTo = warcRecord.getHeader("WARC-Refers-To-Date");
    if (refersTo == null) {
        return null;
    }
    // A null default is passed so that a failed parse surfaces as null here.
    final Date parsed = ArchiveUtils.parse14DigitISODate(refersTo.value, null);
    return (parsed == null) ? null : ArchiveUtils.get14DigitDate(parsed);
}
/**
 * Reads the {@code WARC-Refers-To-Date} header of the current WARC record and converts
 * it via {@code ArchiveUtils.parse14DigitISODate} followed by
 * {@code ArchiveUtils.get14DigitDate}.
 *
 * @return the converted date string, or {@code null} if no WARC record is set, the
 *         header is missing, or parsing produced no date
 */
@Override
public String getRefersToDate() {
    String result = null;
    if (warcRecord != null) {
        final HeaderLine header = warcRecord.getHeader("WARC-Refers-To-Date");
        final Date date = (header == null)
                ? null
                : ArchiveUtils.parse14DigitISODate(header.value, null);
        if (date != null) {
            result = ArchiveUtils.get14DigitDate(date);
        }
    }
    return result;
}
/**
 * Determines the {@code WARCRecordType} of a record from its {@code HEADER_KEY_TYPE}
 * header.
 *
 * @param rec the WARC record to inspect
 * @return the enum constant whose name equals the header value
 * @throws ResourceNotAvailableException if the header is missing or its value does not
 *         name a {@code WARCRecordType} constant
 */
private static WARCRecordType getWARCRecordType(WarcRecord rec)
        throws ResourceNotAvailableException {
    final HeaderLine typeHeader = rec.getHeader(HEADER_KEY_TYPE);
    if (typeHeader == null) {
        throw new ResourceNotAvailableException("WARC-Type header is missing");
    }

    final WARCRecordType recordType;
    try {
        recordType = WARCRecordType.valueOf(typeHeader.value);
    } catch (IllegalArgumentException ex) {
        // valueOf rejects names that are not enum constants; report the offending value.
        final String message = "unrecognized WARC-Type \"" + typeHeader.value + "\"";
        throw new ResourceNotAvailableException(message);
    }
    return recordType;
}
/**
 * Maps the {@code HEADER_KEY_TYPE} header of a WARC record to a {@code WARCRecordType}.
 *
 * @param rec the WARC record whose type header is read
 * @return the matching {@code WARCRecordType} constant
 * @throws ResourceNotAvailableException if the record has no such header, or the header
 *         value is not the name of a {@code WARCRecordType} constant
 */
private static WARCRecordType getWARCRecordType(WarcRecord rec)
        throws ResourceNotAvailableException {
    final HeaderLine header = rec.getHeader(HEADER_KEY_TYPE);
    if (header != null) {
        try {
            return WARCRecordType.valueOf(header.value);
        } catch (IllegalArgumentException ex) {
            // The header is present but does not name a known record type.
            throw new ResourceNotAvailableException(
                    "unrecognized WARC-Type \"" + header.value + "\"");
        }
    }
    throw new ResourceNotAvailableException("WARC-Type header is missing");
}