protected ArchiveReader getArchiveReader(final URL u) throws IOException { // If url represents a local file then return file it points to. if (u.getPath() != null) { // TODO: Add scheme check and host check. File f = new File(u.getPath()); if (f.exists()) { return get(f, 0); } } String scheme = u.getProtocol(); if (scheme.startsWith("http") || scheme.equals("s3")) { // Try streaming if http or s3 URLs rather than copying local // and then reading (Passing an offset will get us an Reader // that wraps a Stream). return get(u, 0); } return makeARCLocal(u.openConnection()); }
protected ArchiveReader getArchiveReader(final URL u) throws IOException { // If url represents a local file then return file it points to. if (u.getPath() != null) { // TODO: Add scheme check and host check. File f = new File(u.getPath()); if (f.exists()) { return get(f, 0); } } String scheme = u.getProtocol(); if (scheme.startsWith("http") || scheme.equals("s3")) { // Try streaming if http or s3 URLs rather than copying local // and then reading (Passing an offset will get us an Reader // that wraps a Stream). return get(u, 0); } return makeARCLocal(u.openConnection()); }
protected ArchiveReader getArchiveReader(final URL u) throws IOException { // If url represents a local file then return file it points to. if (u.getPath() != null) { // TODO: Add scheme check and host check. File f = new File(u.getPath()); if (f.exists()) { return get(f, 0); } } String scheme = u.getProtocol(); if (scheme.startsWith("http") || scheme.equals("s3")) { // Try streaming if http or s3 URLs rather than copying local // and then reading (Passing an offset will get us an Reader // that wraps a Stream). return get(u, 0); } return makeARCLocal(u.openConnection()); }
private boolean nextFile() throws IOException { currentPath++; if (currentPath >= paths.length) { return false; } // Output the archive filename, to help with debugging: log.info("Opening nextFile: " + paths[currentPath]); // Set up the ArchiveReader: this.status = this.filesystem.getFileStatus(paths[currentPath]); datainputstream = this.filesystem.open(paths[currentPath]); arcreader = (ArchiveReader) ArchiveReaderFactory .get(paths[currentPath].getName(), datainputstream, true); // Set to strict reading, in order to cope with malformed archive files // which cause an infinite loop otherwise. arcreader.setStrict(true); // Get the iterator: iterator = arcreader.iterator(); this.archiveName = paths[currentPath].getName(); return true; }
private boolean nextFile() throws IOException { currentPath++; if (currentPath >= paths.length) { return false; } // Output the archive filename, to help with debugging: log.info("Opening nextFile: " + paths[currentPath]); // Set up the ArchiveReader: this.status = this.filesystem.getFileStatus(paths[currentPath]); datainputstream = this.filesystem.open(paths[currentPath]); arcreader = (ArchiveReader) ArchiveReaderFactory.get( paths[currentPath].getName(), datainputstream, true); // Set to strict reading, in order to cope with malformed archive files // which cause an infinite loop otherwise. arcreader.setStrict(true); // Get the iterator: iterator = arcreader.iterator(); this.archiveName = paths[currentPath].getName(); return true; }
private boolean nextFile() throws IOException { currentPath++; if (currentPath >= paths.length) { return false; } // Output the archive filename, to help with debugging: log.info("Opening nextFile: " + paths[currentPath]); // Set up the ArchiveReader: this.status = this.filesystem.getFileStatus(paths[currentPath]); datainputstream = this.filesystem.open(paths[currentPath]); arcreader = (ArchiveReader) ArchiveReaderFactory .get(paths[currentPath].getName(), datainputstream, true); // Set to strict reading, in order to cope with malformed archive files // which cause an infinite loop otherwise. arcreader.setStrict(true); // Get the iterator: iterator = arcreader.iterator(); this.archiveName = paths[currentPath].getName(); return true; }
private boolean nextFile() throws IOException { currentPath++; if (currentPath >= paths.length) { return false; } // Output the archive filename, to help with debugging: log.info("Opening nextFile: " + paths[currentPath]); // Set up the ArchiveReader: this.status = this.filesystem.getFileStatus(paths[currentPath]); datainputstream = this.filesystem.open(paths[currentPath]); arcreader = (ArchiveReader) ArchiveReaderFactory.get( paths[currentPath].getName(), datainputstream, true); // Set to strict reading, in order to cope with malformed archive files // which cause an infinite loop otherwise. arcreader.setStrict(true); // Get the iterator: iterator = arcreader.iterator(); this.archiveName = paths[currentPath].getName(); return true; }
reader = get(localFile, 0); } catch (IOException e) { localFile.delete();
/**
 * Opens an archive reader over the supplied stream and wraps its record in the
 * matching {@code Resource} implementation.
 *
 * @param path name passed to {@code ArchiveReaderFactory.get} together with the stream
 * @param is   stream positioned at the record to read
 * @return an {@code ArcResource} when the factory yields an {@code ARCReader}, or a
 *         {@code WarcResource} when it yields a {@code WARCReader}
 * @throws IOException if the factory or record read fails, or the reader is of
 *         neither known type
 * @throws ResourceNotAvailableException declared for callers; not thrown directly here
 */
protected Resource loadResource(String path, InputStream is)
        throws IOException, ResourceNotAvailableException {
    final ArchiveReader reader = ArchiveReaderFactory.get(path, is, false);

    if (reader instanceof ARCReader) {
        final ARCRecord arcRecord = (ARCRecord) reader.get();
        return new ArcResource(arcRecord, reader);
    }
    if (reader instanceof WARCReader) {
        final WARCRecord warcRecord = (WARCRecord) reader.get();
        return new WarcResource(warcRecord, reader);
    }
    throw new IOException("Unknown ArchiveReader");
}
/**
 * Opens an archive reader over the supplied stream and wraps its record in the
 * matching {@code Resource} implementation.
 *
 * @param path name passed to {@code ArchiveReaderFactory.get} together with the stream
 * @param is   stream positioned at the record to read
 * @return an {@code ArcResource} when the factory yields an {@code ARCReader}, or a
 *         {@code WarcResource} when it yields a {@code WARCReader}
 * @throws IOException if the factory or record read fails, or the reader is of
 *         neither known type
 * @throws ResourceNotAvailableException declared for callers; not thrown directly here
 */
protected Resource loadResource(String path, InputStream is)
        throws IOException, ResourceNotAvailableException {
    final ArchiveReader reader = ArchiveReaderFactory.get(path, is, false);

    if (reader instanceof ARCReader) {
        final ARCRecord arcRecord = (ARCRecord) reader.get();
        return new ArcResource(arcRecord, reader);
    }
    if (reader instanceof WARCReader) {
        final WARCRecord warcRecord = (WARCRecord) reader.get();
        return new WarcResource(warcRecord, reader);
    }
    throw new IOException("Unknown ArchiveReader");
}
reader = get(localFile, 0); } catch (IOException e) { localFile.delete();
reader = get(localFile, 0); } catch (IOException e) { localFile.delete();
this.internal.getCurrentValue().toString()); datainputstream = this.filesystem.open(path); arcreader = ArchiveReaderFactory.get(path.getName(), datainputstream, true); arcreader.setStrict(false);
/**
 * Prepares this record reader for the given split: records the split's byte range,
 * opens the underlying file, and obtains an archive reader and record iterator.
 *
 * <p>{@code format} and {@code iter} are only assigned when the reader is an
 * {@code ARCReader} or a {@code WARCReader}; for any other reader type they are left
 * untouched.
 *
 * @param genericSplit the split to read; cast to {@code FileSplit}
 * @param context      task context supplying the job configuration
 * @throws IOException if the filesystem lookup, open, or reader creation fails
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException {
    final FileSplit fileSplit = (FileSplit) genericSplit;
    final Configuration conf = context.getConfiguration();

    // Byte range covered by this split.
    start = fileSplit.getStart();
    end = start + fileSplit.getLength();

    final Path file = fileSplit.getPath();
    final FileSystem fileSystem = file.getFileSystem(conf);
    final FSDataInputStream rawIn = fileSystem.open(file);
    final BufferedInputStream bufferedIn = new BufferedInputStream(rawIn);
    reader = ArchiveReaderFactory.get(file.toString(), bufferedIn, true);

    // Record which archive flavour we got and start iterating over it.
    if (reader instanceof ARCReader) {
        format = ArchiveFormat.ARC;
        iter = reader.iterator();
    }
    if (reader instanceof WARCReader) {
        format = ArchiveFormat.WARC;
        iter = reader.iterator();
    }

    this.pos = start;
}
this.internal.getCurrentValue().toString()); datainputstream = this.filesystem.open(path); arcreader = ArchiveReaderFactory.get(path.getName(), datainputstream, true); arcreader.setStrict(false);
ArchiveReader arcreader = ArchiveReaderFactory.get(inputFile); arcreader.setStrict(false); WarcIndexer warcIndexer = new WarcIndexer();
ArchiveReader arcreader = ArchiveReaderFactory.get(inputFile); arcreader.setStrict(false); WarcIndexer warcIndexer = new WarcIndexer();