/**
 * Creates an entry iterator backed by the records of a WARC reader opened on {@code in}.
 *
 * <p>If the reader cannot be created, the {@link IOException} is logged and printed to
 * standard error, and this constructor leaves {@code iterator} unassigned.
 *
 * @param in stream to read WARC data from; it is also handed to the superclass constructor
 */
WarcArchiveEntryIterator(InputStream in) {
    // The superclass constructor has to be invoked, but it is only a placeholder:
    // most of the inherited methods are overridden by this class.
    super(in);
    try {
        this.iterator = WarcReaderFactory.getReader(in).iterator();
    } catch (IOException readerFailure) {
        log.error(readerFailure.getMessage(), readerFailure);
        System.err.println(readerFailure);
    }
}
/**
 * Scans every file directly inside {@code inputFolder} and, for each file whose path ends
 * with {@code .gz} (case-insensitively), probes the decompressed content to decide whether
 * it is an ARC or a WARC file, then delegates to {@code searchArcFile} or
 * {@code searchWarcFile} respectively. Files that are neither are skipped.
 *
 * <p>The directory is listed exactly once, and the stream used for probing is closed before
 * the file is handed to the search method, so no file handle outlives its iteration.
 *
 * @param inputFolder directory whose entries are examined
 * @param uri value passed through unchanged to the ARC/WARC search methods
 * @throws IOException if the folder cannot be listed (not a directory, or an I/O error), or
 *         if opening or probing one of the files fails
 */
private void searchFolder(File inputFolder, String uri) throws IOException {
    long startTime = System.currentTimeMillis();

    // listFiles() returns null when the path is not a directory or cannot be read;
    // take a single snapshot instead of re-listing the directory on every access.
    File[] inputFiles = inputFolder.listFiles();
    if (inputFiles == null) {
        throw new IOException("Unable to list files in folder: " + inputFolder);
    }

    for (int i = 0; i < inputFiles.length; i++) {
        File inputFile = inputFiles[i];
        LOG.info("processing file " + i + ": " + inputFile.getName());
        if (!inputFile.toString().toLowerCase().endsWith(".gz")) {
            continue;
        }

        boolean isArc = false;
        boolean isWarc = false;
        FileInputStream fileInputStream = new FileInputStream(inputFile);
        try {
            GZIPInputStream gzInputStream = new GZIPInputStream(fileInputStream);
            try {
                ByteCountingPushBackInputStream in =
                        new ByteCountingPushBackInputStream(gzInputStream, 32);
                isArc = ArcReaderFactory.isArcFile(in);
                if (!isArc) {
                    // Only probe for WARC when the ARC check failed, as before.
                    isWarc = WarcReaderFactory.isWarcFile(in);
                }
            } finally {
                gzInputStream.close();
            }
        } finally {
            // Also covers the case where the GZIPInputStream constructor itself throws.
            fileInputStream.close();
        }

        // The search methods take the File, not the probe stream, so the probe is
        // already closed at this point.
        if (isArc) {
            searchArcFile(inputFile, uri);
        } else if (isWarc) {
            searchWarcFile(inputFile, uri);
        }
    }

    long totalTime = System.currentTimeMillis() - startTime;
    LOG.info("Total time: " + totalTime + "ms");
}
} else if (WarcReaderFactory.isWarcRecord(in)) { r.warcReader = WarcReaderFactory.getReaderUncompressed(); r.warcReader .setWarcTargetUriProfile(UriProfile.RFC3986_ABS_16BIT_LAX);
// Layer the already-decompressed stream: an 8192-byte BufferedInputStream, then a
// ByteCountingPushBackInputStream constructed with 32 (presumably the push-back capacity
// needed for format sniffing - TODO confirm against the JWAT API).
// The data is read with the "uncompressed" WARC reader; NOTE(review): this looks like it is
// because gzInputStream has already performed the decompression - confirm where it is created.
ByteCountingPushBackInputStream pbin = new ByteCountingPushBackInputStream( new BufferedInputStream(gzInputStream, 8192), 32); WarcReader warcReader = WarcReaderFactory.getReaderUncompressed(pbin);
} else if (WarcReaderFactory.isWarcRecord(in)) { r.warcReader = WarcReaderFactory.getReaderUncompressed(); r.warcReader .setWarcTargetUriProfile(UriProfile.RFC3986_ABS_16BIT_LAX);
/**
 * Creates an entry iterator backed by the records of a WARC reader opened on {@code in}.
 *
 * <p>If the reader cannot be created, the {@link IOException} is logged and printed to
 * standard error, and this constructor leaves {@code iterator} unassigned.
 *
 * @param in stream to read WARC data from; it is also handed to the superclass constructor
 */
WarcArchiveEntryIterator(InputStream in) {
    // The superclass constructor has to be invoked, but it is only a placeholder:
    // most of the inherited methods are overridden by this class.
    super(in);
    try {
        this.iterator = WarcReaderFactory.getReader(in).iterator();
    } catch (IOException readerFailure) {
        LOGGER.error(readerFailure);
        System.err.println(readerFailure);
    }
}
@Override public void checkSignatures (File file, InputStream stream, RepInfo info) throws IOException { info.setFormat (_format[0]); info.setMimeType (_mimeType[0]); info.setModule (this); ByteCountingPushBackInputStream pbin = new ByteCountingPushBackInputStream(stream, GzipReader.DEFAULT_INPUT_BUFFER_SIZE); // First try warc uncompressed boolean checkIsWarc = WarcReaderFactory.isWarcFile(pbin); if (checkIsWarc) { info.setSigMatch(_name); return; } // Then try warc compressed boolean checkIsGzip = GzipReader.isGzipped(pbin); if (checkIsGzip) { info.setSigMatch(_name); return; } // Not a warc or a gzip info.setWellFormed (false); }
/** * Creates an ARCRecordReader for the specified <code>split</code> of the input stream that * will start at the first valid ARC record header after <code>split.getStart()</code> and * continue until a record is read that goes past * <code>split.getStart() + split.getLength()</code>. */ public WARCRecordReader(FileSplit split, JobConf jobConf) throws IOException { conf = jobConf; start = split.getStart(); end = start + split.getLength(); System.out.println("========== " + start + " " + end); configure(jobConf); // Open the file and seek to the start of the split Path file = split.getPath(); FileSystem fs = file.getFileSystem(jobConf); fsin = new CountingInputStream(new BufferedInputStream(fs.open(split.getPath()))); warcReader = WarcReaderFactory.getReader(fsin); // Start with the first valid record after offset "start" skipToNextRecord(start); }
try { InputStream warcIn = request.getSourceInputStream(); WarcReader warcReader = WarcReaderFactory.getReader(warcIn); Iterator<WarcRecord> iterator = warcReader.iterator(); try {
/**
 * Parses the stream as WARC data and records the outcome on {@code info}.
 *
 * <p>A WARC reader is created over the stream wrapped in an {@code InputStreamNoSkip}, with
 * 8192 passed as the second factory argument. After the records have been parsed, validity
 * and well-formedness are both set from the reader's compliance flag, the results are
 * reported, and the signature is marked as matching when the reader is compliant. A
 * {@code JhoveException} is turned into an error message on {@code info}, which is then
 * marked neither valid nor well-formed. The reader is closed in every case.
 *
 * @param stream the data to parse
 * @param info receives format details, validity flags and messages
 * @param parseIndex not used by this implementation
 * @return always 0
 * @throws IOException if creating, reading or closing the reader fails
 */
@Override
public int parse(InputStream stream, RepInfo info, int parseIndex) throws IOException {
    InputStream noSkipStream = new InputStreamNoSkip(stream);
    WarcReader warcReader = WarcReaderFactory.getReader(noSkipStream, 8192);
    try {
        info.setFormat(_format[0]);
        info.setMimeType(_mimeType[0]);
        info.setModule(this);

        setReaderOptions(warcReader);
        parseRecords(warcReader);

        info.setValid(warcReader.isCompliant());
        info.setWellFormed(warcReader.isCompliant());
        reportResults(warcReader, info);

        // Compliance is queried again after reporting, exactly as in the original sequence.
        if (warcReader.isCompliant()) {
            info.setSigMatch(_name);
        }
    } catch (JhoveException parseFailure) {
        info.setMessage(new ErrorMessage(parseFailure.getMessage()));
        info.setValid(false);
        info.setWellFormed(false);
    } finally {
        if (warcReader != null) {
            warcReader.close();
        }
    }
    return 0;
}