/**
 * Resolves the configured command for the given stream by substituting
 * every occurrence of {@code INPUT_FILE_TOKEN} with the path of the file
 * that backs the stream.
 *
 * @param stream the document stream; it is cast to {@link TikaInputStream}
 * @return the command with the token replaced by the backing file's path,
 *         or the unmodified command when the token is absent or the
 *         substitution raised an exception (which is logged as a warning)
 */
public String processCommand(InputStream stream) {
    TikaInputStream tikaStream = (TikaInputStream) stream;
    String resolved = this.command;
    try {
        if (this.command.contains(INPUT_FILE_TOKEN)) {
            String inputPath = tikaStream.getFile().getPath();
            resolved = this.command.replace(INPUT_FILE_TOKEN, inputPath);
        }
    } catch (Exception e) {
        // Best effort: fall back to the command as configured
        LOG.warn("exception processing command", e);
    }
    return resolved;
}
/**
 * Opens the named file as a {@link TikaInputStream} and returns the
 * {@link File} that the stream reports as its backing file.
 *
 * @param filename path of the file to open
 * @return the file obtained from {@link TikaInputStream#getFile()}
 * @throws Exception if the stream cannot be opened, queried or closed
 */
public static File tikaInputStreamGetFile(String filename) throws Exception {
    try (InputStream source = TikaInputStream.get(new File(filename))) {
        // Re-wrap the plain InputStream reference to reach the TikaInputStream API
        TikaInputStream wrapped = TikaInputStream.get(source);
        return wrapped.getFile();
    }
}
/**
 * Reads the stream line by line (as UTF-8) and records one metadata entry
 * per line that has at least eight whitespace-separated tokens.
 * <p>
 * Tokens 0-4 are passed on as permissions, hard-link count, owner, group
 * and size; tokens 5-7 are joined with single spaces into the
 * last-modified date; all remaining tokens are joined with single spaces
 * into the file name. A line with exactly eight tokens yields an empty
 * file name instead of failing.
 *
 * @param is       the listing to parse
 * @param handler  not used by this method
 * @param metadata receives the extracted values via {@code addMetadata}
 * @param context  not used by this method
 * @throws IOException if the stream cannot be read
 */
public void parse(InputStream is, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    List<String> lines = FileUtils.readLines(TikaInputStream.get(is).getFile(), UTF_8);
    for (String line : lines) {
        String[] fileToks = line.split("\\s+");
        if (fileToks.length < 8) {
            // Not enough columns to be a listing entry
            continue;
        }

        String filePermissions = fileToks[0];
        String numHardLinks = fileToks[1];
        String fileOwner = fileToks[2];
        String fileOwnerGroup = fileToks[3];
        String fileSize = fileToks[4];
        String lastModDate = fileToks[5] + " " + fileToks[6] + " " + fileToks[7];

        // Names may contain whitespace, so re-join everything after the date.
        // The separator is written *between* tokens, so nothing has to be
        // trimmed afterwards and an entry without a name stays empty.
        StringBuilder fileName = new StringBuilder();
        for (int i = 8; i < fileToks.length; i++) {
            if (i > 8) {
                fileName.append(" ");
            }
            fileName.append(fileToks[i]);
        }

        this.addMetadata(metadata, filePermissions, numHardLinks, fileOwner,
                fileOwnerGroup, fileSize, lastModDate, fileName.toString());
    }
}
private static Set<String> getTopLevelNames(TikaInputStream stream) throws IOException { // Force the document stream to a (possibly temporary) file // so we don't modify the current position of the stream File file = stream.getFile(); try { POIFSFileSystem fs = new POIFSFileSystem(file, true); // Optimize a possible later parsing process by keeping // a reference to the already opened POI file system stream.setOpenContainer(fs); return getTopLevelNames(fs.getRoot()); } catch (IOException e) { // Parse error in POI, so we don't know the file type return Collections.emptySet(); } catch (RuntimeException e) { // Another problem in POI return Collections.emptySet(); } }
/**
 * Runs the document through the GROBID REST parser and then through the
 * regular {@link PDFParser}, both reading from a file copy of the stream.
 * <p>
 * The file stream opened for the PDF pass is closed when that pass ends,
 * and any temporary resources created for the file copy are disposed
 * before returning, whether or not parsing succeeded.
 *
 * @param stream   the document to parse
 * @param handler  receives the output of both parsers
 * @param metadata receives the metadata of both parsers
 * @param context  the parse context handed to both parsers
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    TemporaryResources tmp = new TemporaryResources();
    try {
        TikaInputStream tis = TikaInputStream.get(stream, tmp);
        File tmpFile = tis.getFile();

        GrobidRESTParser grobidParser = new GrobidRESTParser();
        grobidParser.parse(tmpFile.getAbsolutePath(), handler, metadata, context);

        PDFParser parser = new PDFParser();
        // Re-open the file for the second pass and make sure the handle is released
        try (InputStream pdfStream = new FileInputStream(tmpFile)) {
            parser.parse(pdfStream, handler, metadata, context);
        }
    } finally {
        tmp.dispose();
    }
}
}
zipFile = (ZipFile) container; } else if (tis.hasFile()) { zipFile = new ZipFile(tis.getFile()); } else { zipStream = new ZipInputStream(stream);
private static MediaType tryStreamingDetection(TikaInputStream stream) { Set<String> entryNames = new HashSet<>(); try (InputStream is = new FileInputStream(stream.getFile())) { ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream(is); ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
zipFile = (ZipFile) container; } else if (tis.hasFile()) { zipFile = new ZipFile(tis.getFile()); } else { zipStream = new ZipInputStream(stream);
/** * Ensures that the Stream will be able to be re-read, by buffering to * a temporary file if required. * Streams that are automatically OK include {@link TikaInputStream}s * created from Files or InputStreamFactories, and {@link RereadableInputStream}. */ public static InputStream ensureStreamReReadable(InputStream stream, TemporaryResources tmp) throws IOException { // If it's re-readable, we're done if (stream instanceof RereadableInputStream) return stream; // Make sure it's a TikaInputStream TikaInputStream tstream = TikaInputStream.cast(stream); if (tstream == null) { tstream = TikaInputStream.get(stream, tmp); } // If it's factory based, it's ok if (tstream.getInputStreamFactory() != null) return tstream; // Ensure it's file based tstream.getFile(); // Prepare for future re-reads tstream.mark(-1); return tstream; } /**
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { // Use the delegate parser to parse the contained document EmbeddedDocumentExtractor embeddedExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); metadata.set(Metadata.CONTENT_TYPE, MS_OUTLOOK_PST_MIMETYPE.toString()); XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); TikaInputStream in = TikaInputStream.get(stream); PSTFile pstFile = null; try { pstFile = new PSTFile(in.getFile().getPath()); metadata.set(Metadata.CONTENT_LENGTH, valueOf(pstFile.getFileHandle().length())); boolean isValid = pstFile.getFileHandle().getFD().valid(); metadata.set("isValid", valueOf(isValid)); if (isValid) { parseFolder(xhtml, pstFile.getRootFolder(), embeddedExtractor); } } catch (Exception e) { throw new TikaException(e.getMessage(), e); } finally { if (pstFile != null && pstFile.getFileHandle() != null) { try { pstFile.getFileHandle().close(); } catch (IOException e) { //swallow closing exception } } } xhtml.endDocument(); }
File tmpTxtOutput = null; try { File input = tikaInputStream.getFile(); long size = tikaInputStream.getLength();
/**
 * Extracts WebP image metadata from the stream and emits an empty XHTML
 * document.
 *
 * @param stream   the image to parse
 * @param handler  receives the (empty) XHTML document
 * @param metadata receives the extracted image metadata
 * @param context  not used by this method
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    TemporaryResources tempResources = new TemporaryResources();
    try {
        TikaInputStream tikaStream = TikaInputStream.get(stream, tempResources);
        ImageMetadataExtractor imageExtractor = new ImageMetadataExtractor(metadata);
        // The extractor reads from a file, not from the stream
        imageExtractor.parseWebP(tikaStream.getFile());
    } finally {
        tempResources.dispose();
    }

    // There is no textual content: emit just the document envelope
    XHTMLContentHandler emptyDocument = new XHTMLContentHandler(handler, metadata);
    emptyDocument.startDocument();
    emptyDocument.endDocument();
}
}
/**
 * Extracts TIFF image metadata from the stream, then runs the
 * {@link JempboxExtractor} over it, and emits an empty XHTML document.
 *
 * @param stream   the image to parse
 * @param handler  receives the (empty) XHTML document
 * @param metadata receives the extracted metadata
 * @param context  not used by this method
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    TemporaryResources tempResources = new TemporaryResources();
    try {
        TikaInputStream tikaStream = TikaInputStream.get(stream, tempResources);

        ImageMetadataExtractor imageExtractor = new ImageMetadataExtractor(metadata);
        // The image extractor reads from a file, not from the stream
        imageExtractor.parseTiff(tikaStream.getFile());

        JempboxExtractor jempboxExtractor = new JempboxExtractor(metadata);
        jempboxExtractor.parse(tikaStream);
    } finally {
        tempResources.dispose();
    }

    // There is no textual content: emit just the document envelope
    XHTMLContentHandler emptyDocument = new XHTMLContentHandler(handler, metadata);
    emptyDocument.startDocument();
    emptyDocument.endDocument();
}
/**
 * Extracts JPEG image metadata from the stream, then runs the
 * {@link JempboxExtractor} over it, and emits an empty XHTML document.
 *
 * @param stream   the image to parse
 * @param handler  receives the (empty) XHTML document
 * @param metadata receives the extracted metadata
 * @param context  not used by this method
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    TemporaryResources tempResources = new TemporaryResources();
    try {
        TikaInputStream tikaStream = TikaInputStream.get(stream, tempResources);

        ImageMetadataExtractor imageExtractor = new ImageMetadataExtractor(metadata);
        // The image extractor reads from a file, not from the stream
        imageExtractor.parseJpeg(tikaStream.getFile());

        JempboxExtractor jempboxExtractor = new JempboxExtractor(metadata);
        jempboxExtractor.parse(tikaStream);
    } finally {
        tempResources.dispose();
    }

    // There is no textual content: emit just the document envelope
    XHTMLContentHandler emptyDocument = new XHTMLContentHandler(handler, metadata);
    emptyDocument.startDocument();
    emptyDocument.endDocument();
}
public void parse( InputStream stream, ContentHandler ignored, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { TemporaryResources tmp = new TemporaryResources(); try { TikaInputStream tis = TikaInputStream.get(stream, tmp); // Figure out what we have to process String filename = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); MediaType type = detector.detect(tis, metadata); if (extractor == null) { // Let the handler process the embedded resource handler.handle(filename, type, tis); } else { // Use a temporary file to process the stream twice File file = tis.getFile(); // Let the handler process the embedded resource try (InputStream input = TikaInputStream.get(file)) { handler.handle(filename, type, input); } // Recurse extractor.extract(tis, extractor, handler); } } finally { tmp.dispose(); } }
@Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { StringsConfig stringsConfig = context.get(StringsConfig.class, DEFAULT_STRINGS_CONFIG); FileConfig fileConfig = context.get(FileConfig.class, DEFAULT_FILE_CONFIG); if (!hasStrings(stringsConfig)) { return; } TikaInputStream tis = TikaInputStream.get(stream); File input = tis.getFile(); // Metadata metadata.set("strings:min-len", "" + stringsConfig.getMinLength()); metadata.set("strings:encoding", stringsConfig.toString()); metadata.set("strings:file_output", doFile(input, fileConfig)); int totalBytes = 0; // Content XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); totalBytes = doStrings(input, stringsConfig, xhtml); xhtml.endDocument(); // Metadata metadata.set("strings:length", "" + totalBytes); }
/**
 * Opens the document as a geographic data store, copies its metadata and
 * passes that copy to {@code extract}.
 * <p>
 * The data store is closed as soon as extraction has finished (or
 * failed), and before any temporary resources are disposed, so the file
 * it reads from is released first. Temporary resources are only created,
 * and therefore only disposed, when the incoming stream is not already a
 * {@link TikaInputStream}.
 *
 * @param inputStream    the document to parse
 * @param contentHandler receives the XHTML output
 * @param metadata       receives the content type and the extracted values
 * @param parseContext   not used by this method
 * @throws TikaException if the storage format is unsupported or the data
 *                       store reports an error while being read or closed
 */
@Override
public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata,
                  ParseContext parseContext) throws IOException, SAXException, TikaException {
    metadata.set(Metadata.CONTENT_TYPE, geoInfoType);
    XHTMLContentHandler xhtmlContentHandler = new XHTMLContentHandler(contentHandler, metadata);

    TemporaryResources tmp =
            TikaInputStream.isTikaInputStream(inputStream) ? null : new TemporaryResources();
    try {
        TikaInputStream tikaInputStream = TikaInputStream.get(inputStream, tmp);
        File file = tikaInputStream.getFile();

        // try-with-resources releases the store even when extraction fails;
        // a failure while closing is reported through the catch blocks below
        try (DataStore dataStore = DataStores.open(file)) {
            DefaultMetadata defaultMetadata = new DefaultMetadata(dataStore.getMetadata());
            extract(xhtmlContentHandler, metadata, defaultMetadata);
        }
    } catch (UnsupportedStorageException e) {
        throw new TikaException("UnsupportedStorageException", e);
    } catch (DataStoreException e) {
        throw new TikaException("DataStoreException", e);
    } finally {
        if (tmp != null) {
            tmp.dispose();
        }
    }
}
try (TemporaryResources tmp = new TemporaryResources()) { TikaInputStream tis = TikaInputStream.get(stream, tmp); rar = new Archive(new FileVolumeManager(tis.getFile()));