@Override public void analyse(String source, ArchiveRecordHeader header, InputStream tikainput, SolrRecord solr) { final long firstBytesStart = System.nanoTime(); // Pull out the first few bytes, to hunt for new format by magic: try { byte[] ffb = new byte[this.firstBytesLength]; int read = tikainput.read(ffb); if (read >= 4) { String hexBytes = Hex.encodeHexString(ffb); solr.addField(SolrFields.CONTENT_FFB, hexBytes.substring(0, 2 * 4)); StringBuilder separatedHexBytes = new StringBuilder(); for (String hexByte : Splitter.fixedLength(2).split(hexBytes)) { separatedHexBytes.append(hexByte); separatedHexBytes.append(" "); } if (this.extractContentFirstBytes) { solr.addField(SolrFields.CONTENT_FIRST_BYTES, separatedHexBytes.toString().trim()); } } } catch (Exception i) { log.error(i + ": " + i.getMessage() + ";ffb; " + source + "@" + header.getOffset()); } Instrument.timeRel("WARCPayloadAnalyzers.analyze#total", "WARCPayloadAnalyzers.analyze#firstbytes", firstBytesStart); }
protected String outputCdx(final String strippedFileName) throws IOException { // Read the whole record so we get out a hash. Should be safe calling // close on already closed Record. close(); ArchiveRecordHeader h = getHeader(); StringBuilder buffer = new StringBuilder(ArchiveFileConstants.CDX_LINE_BUFFER_SIZE); buffer.append(h.getDate()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getIp4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getUrl()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getMimetype4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getStatusCode4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getDigest4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getOffset()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getLength()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(strippedFileName != null? strippedFileName: '-'); return buffer.toString(); }
protected String outputCdx(final String strippedFileName) throws IOException { // Read the whole record so we get out a hash. Should be safe calling // close on already closed Record. close(); ArchiveRecordHeader h = getHeader(); StringBuilder buffer = new StringBuilder(ArchiveFileConstants.CDX_LINE_BUFFER_SIZE); buffer.append(h.getDate()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getIp4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getUrl()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getMimetype4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getStatusCode4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getDigest4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getOffset()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getLength()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(strippedFileName != null? strippedFileName: '-'); return buffer.toString(); }
protected String outputCdx(final String strippedFileName) throws IOException { // Read the whole record so we get out a hash. Should be safe calling // close on already closed Record. close(); ArchiveRecordHeader h = getHeader(); StringBuilder buffer = new StringBuilder(ArchiveFileConstants.CDX_LINE_BUFFER_SIZE); buffer.append(h.getDate()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getIp4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getUrl()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getMimetype4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getStatusCode4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(getDigest4Cdx(h)); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getOffset()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(h.getLength()); buffer.append(ArchiveFileConstants.SINGLE_SPACE); buffer.append(strippedFileName != null? strippedFileName: '-'); return buffer.toString(); }
public SolrRecord(int defaultMaxFieldLength, HashMap<String, Integer> maxFieldLengths, String filename, ArchiveRecordHeader header) { defaultMax = defaultMaxFieldLength; maxLengths = maxFieldLengths; setField(SolrFields.ID, "exception-at-" + filename + "@" + header.getOffset()); setField(SolrFields.SOURCE_FILE, filename); setField(SolrFields.SOURCE_FILE_OFFSET, "" + header.getOffset()); setField(SolrFields.SOLR_URL, Normalisation.sanitiseWARCHeaderValue(header.getUrl())); setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_UNKNOWN); }
@Override public void analyse(String source, ArchiveRecordHeader header, InputStream tikainput, SolrRecord solr) { final String url = Normalisation .sanitiseWARCHeaderValue(header.getUrl()); log.debug("Analysing " + url); final long start = System.nanoTime(); // Analyse with tika: try { if (passUriToFormatTools) { solr = this.extract(source, solr, tikainput, url); } else { solr = this.extract(source, solr, tikainput, null); } } catch (Exception i) { log.error(i + ": " + i.getMessage() + ";tika; " + url + "@" + header.getOffset()); } Instrument.timeRel("WARCPayloadAnalyzers.analyze#total", "WARCPayloadAnalyzers.analyze#tikasolrextract", start); } /**
public SolrRecord(int defaultMaxFieldLength, HashMap<String, Integer> maxFieldLengths, String filename, ArchiveRecordHeader header) { defaultMax = defaultMaxFieldLength; maxLengths = maxFieldLengths; setField(SolrFields.ID, "exception-at-" + filename + "@" + header.getOffset()); setField(SolrFields.SOURCE_FILE, filename); setField(SolrFields.SOURCE_FILE_OFFSET, "" + header.getOffset()); setField(SolrFields.SOLR_URL, Normalisation.sanitiseWARCHeaderValue(header.getUrl())); setField(SolrFields.SOLR_URL_TYPE, SolrFields.SOLR_URL_TYPE_UNKNOWN); }
// Trace the record being processed: byte offset in the source file, bytes
// still available to read, declared record length, and the record URL.
log.debug("Processing @"+header.getOffset()+ "+"+record.available()+","+header.getLength()+ ": "+header.getUrl());
// Debug trace of the current record: source-file offset, remaining readable
// bytes, declared length, and URL.
log.debug("Processing @"+header.getOffset()+ "+"+record.available()+","+header.getLength()+ ": "+header.getUrl());
// NOTE(review): this span appears to be corrupted — it mixes error-logging
// statements from separate catch blocks with what looks like diff/patch
// residue: the trailing "+ header.getOffset(), i);" is duplicated and one
// copy carries a stray leading '+'. The surrounding try/catch structure is
// also incomplete here. This needs to be reconciled against version control
// rather than edited in place.
log.error( i + ": " + i.getMessage() + ";tika; " + url + "@" + header.getOffset() ); log.error( i + ": " + i.getMessage() + ";ffb; " + url + "@" + header.getOffset() ); } catch( Exception i ) { log.error(i + ": " + i.getMessage() + ";dd; " + url + " @" + header.getOffset(), i); + header.getOffset(), i);
// Catch-all for the preceding (not fully visible) try block: log the
// failure with the ";x;" tag plus URL and record offset, and pass the
// exception as the final argument so the stack trace is preserved.
} catch (Exception i) { log.error(i + ": " + i.getMessage() + ";x; " + url + "@" + header.getOffset(), i);
// Error handling tail of a (not fully visible) try block: log the failure
// with URL and offset, bump the error counter, and attach the parse
// exception to the Solr record. The OutOfMemoryError branch mirrors the
// same handling with an "OOME" marker so memory exhaustion on oversized
// records is visible in the logs but does not kill the job.
LOG.error(e.getClass().getName() + ": " + e.getMessage() + "; " + header.getUrl() + "; " + header.getOffset()); reporter.incrCounter(MyCounters.NUM_ERRORS, 1); solr.addParseException(e); } catch (OutOfMemoryError e) { LOG.error("OOME " + e.getClass().getName() + ": " + e.getMessage() + "; " + header.getUrl() + "; " + header.getOffset()); reporter.incrCounter(MyCounters.NUM_ERRORS, 1); solr.addParseException(e);
// Continuation of a log.error(...) call started on a line outside this
// view: appends the record offset to the message and passes the exception
// 'i' so the stack trace is logged.
+ header.getOffset(), i);
// Byte offset of this record within the source (W)ARC file.
long offset = header.getOffset();
// Record's starting byte position in the source archive file.
long offset = header.getOffset();
// Trace the record, then copy core header values into the MDX map:
// content-length, total record length, source offset, and record ID.
// NOTE(review): this span is a fragment — both for-loops (over
// getHeaderFields().keySet() and getHeaderFieldKeys()) open here but their
// bodies/closing braces are outside this view; the mdx.put(...) calls sit
// inside the first loop yet do not use the loop variable 'h', which looks
// suspicious — confirm against the full file.
log.debug("Processing @" + header.getOffset() + "+" + record.available() + "," + header.getLength() + ": " + header.getUrl()); for (String h : header.getHeaderFields().keySet()) { mdx.put("content-length", "" + header.getContentLength()); mdx.put("length", "" + header.getLength()); mdx.put("source-offset", "" + header.getOffset()); mdx.put("record-identifier", header.getRecordIdentifier()); for (String k : header.getHeaderFieldKeys()) {