@Override public void analyse(String source, ArchiveRecordHeader header, InputStream tikainput, SolrRecord solr) { final long firstBytesStart = System.nanoTime(); // Pull out the first few bytes, to hunt for new format by magic: try { byte[] ffb = new byte[this.firstBytesLength]; int read = tikainput.read(ffb); if (read >= 4) { String hexBytes = Hex.encodeHexString(ffb); solr.addField(SolrFields.CONTENT_FFB, hexBytes.substring(0, 2 * 4)); StringBuilder separatedHexBytes = new StringBuilder(); for (String hexByte : Splitter.fixedLength(2).split(hexBytes)) { separatedHexBytes.append(hexByte); separatedHexBytes.append(" "); } if (this.extractContentFirstBytes) { solr.addField(SolrFields.CONTENT_FIRST_BYTES, separatedHexBytes.toString().trim()); } } } catch (Exception i) { log.error(i + ": " + i.getMessage() + ";ffb; " + source + "@" + header.getOffset()); } Instrument.timeRel("WARCPayloadAnalyzers.analyze#total", "WARCPayloadAnalyzers.analyze#firstbytes", firstBytesStart); }
protected String outputCdx(final String strippedFileName) throws IOException { // Read the whole record so we get out a hash. Should be safe calling // close on already closed Record. close(); ArchiveRecordHeader h = getHeader(); StringBuilder buffer = new StringBuilder(ArchiveFileConstants.CDX_LINE_BUFFER_SIZE); buffer.append(h.getDate()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getIp4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getUrl()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getMimetype4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getStatusCode4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getDigest4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getOffset()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getLength()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(strippedFileName != null? strippedFileName: '-'); return buffer.toString(); }
protected String outputCdx(final String strippedFileName) throws IOException { // Read the whole record so we get out a hash. Should be safe calling // close on already closed Record. close(); ArchiveRecordHeader h = getHeader(); StringBuilder buffer = new StringBuilder(ArchiveFileConstants.CDX_LINE_BUFFER_SIZE); buffer.append(h.getDate()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getIp4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getUrl()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getMimetype4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getStatusCode4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getDigest4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getOffset()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getLength()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(strippedFileName != null? strippedFileName: '-'); return buffer.toString(); }
protected String outputCdx(final String strippedFileName) throws IOException { // Read the whole record so we get out a hash. Should be safe calling // close on already closed Record. close(); ArchiveRecordHeader h = getHeader(); StringBuilder buffer = new StringBuilder(ArchiveFileConstants.CDX_LINE_BUFFER_SIZE); buffer.append(h.getDate()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getIp4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getUrl()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getMimetype4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getStatusCode4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getDigest4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getOffset()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getLength()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(strippedFileName != null? strippedFileName: '-'); return buffer.toString(); }
public SolrRecord(int defaultMaxFieldLength, HashMap<String, Integer> maxFieldLengths, String filename, ArchiveRecordHeader header) { defaultMax = defaultMaxFieldLength; maxLengths = maxFieldLengths; setField(SolrFields.ID, "exception-at-" + filename + "@" + header.getOffset()); setField(SolrFields.SOURCE_FILE, filename); setField(SolrFields.SOURCE_FILE_OFFSET, "" + header.getOffset()); setField(SolrFields.SOLR_URL, Normalisation.sanitiseWARCHeaderValue(header.getUrl())); setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_UNKNOWN); }
@Override public void analyse(String source, ArchiveRecordHeader header, InputStream tikainput, SolrRecord solr) { final String url = Normalisation .sanitiseWARCHeaderValue(header.getUrl()); log.debug("Analysing " + url); final long start = System.nanoTime(); // Analyse with tika: try { if (passUriToFormatTools) { solr = this.extract(source, solr, tikainput, url); } else { solr = this.extract(source, solr, tikainput, null); } } catch (Exception i) { log.error(i + ": " + i.getMessage() + ";tika; " + url + "@" + header.getOffset()); } Instrument.timeRel("WARCPayloadAnalyzers.analyze#total", "WARCPayloadAnalyzers.analyze#tikasolrextract", start); } /**
public SolrRecord(int defaultMaxFieldLength, HashMap<String, Integer> maxFieldLengths, String filename, ArchiveRecordHeader header) { defaultMax = defaultMaxFieldLength; maxLengths = maxFieldLengths; setField(SolrFields.ID, "exception-at-" + filename + "@" + header.getOffset()); setField(SolrFields.SOURCE_FILE, filename); setField(SolrFields.SOURCE_FILE_OFFSET, "" + header.getOffset()); setField(SolrFields.SOLR_URL, Normalisation.sanitiseWARCHeaderValue(header.getUrl())); setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_UNKNOWN); }
// Trace the record being processed: byte offset in the source file, bytes
// still available to read, declared record length, and the record URL.
log.debug("Processing @"+header.getOffset()+ "+"+record.available()+","+header.getLength()+ ": "+header.getUrl());
// Debug trace of the current record: source-file offset, remaining readable
// bytes, declared length, and URL.
log.debug("Processing @"+header.getOffset()+ "+"+record.available()+","+header.getLength()+ ": "+header.getUrl());
// NOTE(review): this span appears to be corrupted — it mixes error-logging
// statements from separate catch blocks with what looks like diff/patch
// residue: the trailing "+ header.getOffset(), i);" is duplicated and one
// copy carries a stray leading '+'. The surrounding try/catch structure is
// also incomplete here. This needs to be reconciled against version control
// rather than edited in place.
log.error( i + ": " + i.getMessage() + ";tika; " + url + "@" + header.getOffset() ); log.error( i + ": " + i.getMessage() + ";ffb; " + url + "@" + header.getOffset() ); } catch( Exception i ) { log.error(i + ": " + i.getMessage() + ";dd; " + url + " @" + header.getOffset(), i); + header.getOffset(), i);
// Catch-all for the preceding (not fully visible) try block: log the
// failure with the ";x;" tag plus URL and record offset, and pass the
// exception as the final argument so the stack trace is preserved.
} catch (Exception i) { log.error(i + ": " + i.getMessage() + ";x; " + url + "@" + header.getOffset(), i);
// Error handling tail of a (not fully visible) try block: log the failure
// with URL and offset, bump the error counter, and attach the parse
// exception to the Solr record. The OutOfMemoryError branch mirrors the
// same handling with an "OOME" marker so memory exhaustion on oversized
// records is visible in the logs but does not kill the job.
LOG.error(e.getClass().getName() + ": " + e.getMessage() + "; " + header.getUrl() + "; " + header.getOffset()); reporter.incrCounter(MyCounters.NUM_ERRORS, 1); solr.addParseException(e); } catch (OutOfMemoryError e) { LOG.error("OOME " + e.getClass().getName() + ": " + e.getMessage() + "; " + header.getUrl() + "; " + header.getOffset()); reporter.incrCounter(MyCounters.NUM_ERRORS, 1); solr.addParseException(e);
// Continuation of a log.error(...) call started on a line outside this
// view: appends the record offset to the message and passes the exception
// 'i' so the stack trace is logged.
+ header.getOffset(), i);
// Byte offset of this record within the source (W)ARC file.
long offset = header.getOffset();
// Record's starting byte position in the source archive file.
long offset = header.getOffset();
// Trace the record, then copy core header values into the MDX map:
// content-length, total record length, source offset, and record ID.
// NOTE(review): this span is a fragment — both for-loops (over
// getHeaderFields().keySet() and getHeaderFieldKeys()) open here but their
// bodies/closing braces are outside this view; the mdx.put(...) calls sit
// inside the first loop yet do not use the loop variable 'h', which looks
// suspicious — confirm against the full file.
log.debug("Processing @" + header.getOffset() + "+" + record.available() + "," + header.getLength() + ": " + header.getUrl()); for (String h : header.getHeaderFields().keySet()) { mdx.put("content-length", "" + header.getContentLength()); mdx.put("length", "" + header.getLength()); mdx.put("source-offset", "" + header.getOffset()); mdx.put("record-identifier", header.getRecordIdentifier()); for (String k : header.getHeaderFieldKeys()) {