@Override public void process(final InputStream stream) throws IOException { try (final InputStream in = new BufferedInputStream(stream)) { TikaInputStream tikaStream = TikaInputStream.get(in); Metadata metadata = new Metadata(); if (filename != null && context.getProperty(USE_FILENAME_IN_DETECTION).asBoolean()) { metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, filename); } // Get mime type MediaType mediatype = detector.detect(tikaStream, metadata); mimeTypeRef.set(mediatype.toString()); } } });
private Map<String, String> tika_parse(InputStream sourceStream, String prefix, Integer maxAttribs, Integer maxAttribLen) throws IOException, TikaException, SAXException { final Metadata metadata = new Metadata(); final TikaInputStream tikaInputStream = TikaInputStream.get(sourceStream); try { autoDetectParser.parse(tikaInputStream, new DefaultHandler(), metadata); } finally { tikaInputStream.close(); final Pattern metadataKeyFilter = metadataKeyFilterRef.get(); final StringBuilder dataBuilder = new StringBuilder(); for (final String key : metadata.names()) { if (metadataKeyFilter != null && !metadataKeyFilter.matcher(key).matches()) { continue; if (metadata.isMultiValued(key)) { for (String val : metadata.getValues(key)) { if (dataBuilder.length() > 1) {
/**
 * Returns the length of {@code stream} when the stream reports that it has
 * one, and the stream's current position otherwise.
 *
 * @return the stream length, or the current position as a fallback
 * @throws SAXException wrapping any {@link IOException} raised while
 *         querying the stream
 */
private long getByteCount() throws SAXException {
    try {
        return stream.hasLength() ? stream.getLength() : stream.getPosition();
    } catch (IOException ioFailure) {
        throw new SAXException("Unable to get stream length", ioFailure);
    }
}
ParseContext context = new ParseContext(); context.set(Parser.class, recursiveReportingParser); parser.init(context); Metadata metadata = new Metadata(); metadata.add( Metadata.RESOURCE_NAME_KEY, inputFile.toURI().toString()); InputStream stream = TikaInputStream.get(inputFile); MediaType type = parser.getDetector().detect(stream, metadata); System.out.println("Detector found: "+type); metadata.add( Metadata.CONTENT_TYPE, type.toString()); recursiveReportingParser.parse(stream, handler, metadata, context); } catch (Exception e ) { System.out.println("---- Exception: "+e);
getXmlContentHandler(xmlBuffer)); context.set(DocumentSelector.class, new ImageDocumentSelector()); input = TikaInputStream.get(new ProgressMonitorInputStream( this, "Parsing stream", input)); int mark = -1; if (input instanceof TikaInputStream) { if (((TikaInputStream)input).hasFile()) { mark = (int)((TikaInputStream)input).getLength(); parser.parse(input, handler, md, context); String name = md.get(TikaCoreProperties.RESOURCE_NAME_KEY); if (name != null && name.length() > 0) { setTitle("Apache Tika: " + name); -1); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser); wrapper.parse(input, recursiveParserWrapperHandler, new Metadata(), new ParseContext()); StringWriter jsonBuffer = new StringWriter(); JsonMetadataList.setPrettyPrinting(true);
/**
 * Parses a geographic-information document: sets the content type on
 * {@code metadata}, opens the input as a file-backed data store, and passes
 * the store's metadata to {@code extract}.
 *
 * @param inputStream    the document to parse; wrapped in a TikaInputStream
 *                       so that it can be accessed as a {@link File}
 * @param contentHandler receiver of the XHTML output
 * @param metadata       metadata object that is populated during the parse
 * @param parseContext   parse context (not read by this method)
 * @throws IOException   if the input cannot be read or spooled to a file
 * @throws SAXException  if the content handler fails
 * @throws TikaException if the data store is unsupported, cannot be read,
 *                       or cannot be closed
 */
@Override
public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException {
    metadata.set(Metadata.CONTENT_TYPE, geoInfoType);
    XHTMLContentHandler xhtmlContentHandler = new XHTMLContentHandler(contentHandler, metadata);

    // Only own (and later dispose of) temporary resources when the caller did not
    // already hand us a TikaInputStream.
    TemporaryResources tmp = TikaInputStream.isTikaInputStream(inputStream) ? null : new TemporaryResources();
    try {
        TikaInputStream tikaInputStream = TikaInputStream.get(inputStream, tmp);
        File file = tikaInputStream.getFile();
        // The data store is a closeable resource; try-with-resources closes it
        // on every path and before the temporary resources are disposed of below.
        try (DataStore dataStore = DataStores.open(file)) {
            DefaultMetadata defaultMetadata = new DefaultMetadata(dataStore.getMetadata());
            extract(xhtmlContentHandler, metadata, defaultMetadata);
        }
    } catch (UnsupportedStorageException e) {
        throw new TikaException("UnsupportedStorageException", e);
    } catch (DataStoreException e) {
        throw new TikaException("DataStoreException", e);
    } finally {
        if (tmp != null) {
            tmp.dispose();
        }
    }
}
try (TikaInputStream stream = TikaInputStream.get( new DocumentInputStream((DocumentEntry) ooxml))) { ZipContainerDetector detector = new ZipContainerDetector(); try { handleEmbeddedResource(stream, null, dir.getName(), dir.getStorageClsid(), type.toString(), xhtml, true); return; Metadata metadata = new Metadata(); metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, dir.getName()); embedded = TikaInputStream.get(data); } catch (Ole10NativeException ex) { byte[] contents = new byte[contentsEntry.getSize()]; inp.readFully(contents); embedded = TikaInputStream.get(contents); metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString()); embedded = TikaInputStream.get(new byte[0]); embedded.setOpenContainer(dir); } finally { if (embedded != null) { embedded.close();
TemporaryResources tmp = new TemporaryResources(); try { TikaInputStream tis = TikaInputStream.get(stream, tmp); MediaType type = detector.detect(tis, metadata); metadata.set(Metadata.CONTENT_TYPE, type.toString()); if (tis.getOpenContainer() == null) { tis.mark(1); if (tis.read() == -1) { throw new ZeroByteFileException("InputStream must have > 0 bytes"); tis.reset(); if (context.get(EmbeddedDocumentExtractor.class) == null) { Parser p = context.get(Parser.class); if (p == null) { context.set(Parser.class, this); context.set(EmbeddedDocumentExtractor.class,
/**
 * Parses one readable file with Tika and emits a single {@code ParseResult}.
 *
 * <p>A successful parse yields {@code ParseResult.success}; any exception
 * thrown by the parser yields {@code ParseResult.failure} carrying whatever
 * text and metadata had been collected up to that point.
 *
 * @param c the process context supplying the file and receiving the result
 * @throws Exception if the file cannot be opened or the result cannot be emitted
 */
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
    ReadableFile readable = c.element();
    InputStream rawStream = Channels.newInputStream(readable.open());
    try (InputStream tikaStream = TikaInputStream.get(rawStream)) {
        Parser parser;
        if (tikaConfig == null) {
            parser = new AutoDetectParser();
        } else {
            parser = new AutoDetectParser(tikaConfig);
        }

        ParseContext context = new ParseContext();
        context.set(Parser.class, parser);

        // NOTE(review): when the spec supplies input metadata, that same instance is
        // reused (and written to) for every element - confirm this sharing is intended.
        Metadata tikaMetadata = spec.getInputMetadata();
        if (tikaMetadata == null) {
            tikaMetadata = new Metadata();
        }
        String contentTypeHint = spec.getContentTypeHint();
        if (contentTypeHint != null) {
            tikaMetadata.set(Metadata.CONTENT_TYPE, contentTypeHint);
        }

        String location = readable.getMetadata().resourceId().toString();
        ContentHandler tikaHandler = new ToTextContentHandler();

        // The result is emitted outside the try/catch so that a failure while
        // emitting is not mistaken for a parse failure.
        ParseResult res;
        try {
            parser.parse(tikaStream, tikaHandler, tikaMetadata, context);
            res = ParseResult.success(location, tikaHandler.toString(), tikaMetadata);
        } catch (Exception parseFailure) {
            res = ParseResult.failure(location, tikaHandler.toString(), tikaMetadata, parseFailure);
        }
        c.output(res);
    }
}
}
private void handleInlineBodyPart(BodyContents part) throws MimeException, IOException { String contentType = part.metadata.get(Metadata.CONTENT_TYPE); Parser parser = null; if (MediaType.TEXT_HTML.toString().equalsIgnoreCase(contentType)) { parser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(HtmlParser.class, parseContext); parser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext); } else if (MediaType.TEXT_PLAIN.toString().equalsIgnoreCase(contentType)) { parser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(TXTParser.class, parseContext); try (TikaInputStream tis = TikaInputStream.get(part.bytes)) { handleEmbedded(tis, part.metadata); parser.parse( new ByteArrayInputStream(part.bytes), new EmbeddedContentHandler(new BodyContentHandler(handler)), new Metadata(), parseContext ); } catch (SAXException | TikaException e) {
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, filename); System.out.println("The MIME type (based on filename) is: [" + mimeRegistry.detect(null, metadata) + "]"); InputStream stream = TikaInputStream.get(new File(filename)); System.out.println("The MIME type (based on MAGIC) is: [" + mimeRegistry.detect(stream, metadata) + "]"); stream = TikaInputStream.get(new File(filename)); Detector detector = tikaConfig.getDetector(); System.out.println("The MIME type (based on the Detector interface) is: [" + detector.detect(stream, metadata) + "]"); MediaType type = detector.detect(stream, metadata); metadata.set(Metadata.CONTENT_TYPE, type.toString()); parser.parse(stream, handler, metadata, new ParseContext());
/**
 * Parses the given document bytes and returns the metadata collected during
 * the parse as a name-to-value map.
 *
 * <p>For each metadata name only the value returned by {@code md.get(name)}
 * is kept.
 *
 * @param file the raw document bytes
 * @return a map from metadata name to its value
 * @throws Exception if parsing fails or the stream cannot be closed
 */
public static Map<String, String> handleStreamMetaDate(byte[] file) throws Exception {
    Metadata md = new Metadata();
    StringWriter textBuffer = new StringWriter();
    ContentHandler handler = new TeeContentHandler(
            getTextContentHandler(textBuffer)
    );

    // try-with-resources: the TikaInputStream was previously never closed.
    try (TikaInputStream input = TikaInputStream.get(file, md)) {
        parser.parse(input, handler, md, context);
    }

    // No sorting of the names: the result is a HashMap, which does not retain order.
    Map<String, String> meta = new HashMap<>();
    for (String name : md.names()) {
        meta.put(name, md.get(name));
    }
    return meta;
}
/**
 * Finishes the current part: if any data was collected, it is passed to the
 * embedded document extractor and the buffer is then reset.
 *
 * @throws SAXException  if the extractor's content handling fails
 * @throws TikaException if an {@link IOException} occurs while handling the part
 */
@Override
public void endPart() throws SAXException, TikaException {
    if (!hasData()) {
        return;
    }

    final EmbeddedDocumentExtractor extractor =
            EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext);
    final Metadata embeddedMetadata = new Metadata();
    try (TikaInputStream partStream = TikaInputStream.get(getInputStream())) {
        extractor.parseEmbedded(partStream, handler, embeddedMetadata, false);
    } catch (IOException ioFailure) {
        throw new TikaException("error in finishing part", ioFailure);
    }
    // Only reached when the part was handled without error.
    buffer.setLength(0);
}
String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); if (name != null && name.length() > 0 && outputHtml) { handler.startElement(XHTML, "h1", "h1", new AttributesImpl()); final TikaInputStream newStream = TikaInputStream.get(new CloseShieldInputStream(stream), tmp); if (stream instanceof TikaInputStream) { final Object container = ((TikaInputStream) stream).getOpenContainer(); if (container != null) { newStream.setOpenContainer(container); DELEGATING_PARSER.parse( newStream, new EmbeddedContentHandler(new BodyContentHandler(handler)),
/**
 * Parses the named file with an auto-detecting parser, reading it through a
 * TikaInputStream that is closed when parsing ends.
 *
 * <p>The parse output goes to a {@link DefaultHandler} and the collected
 * metadata is not returned.
 *
 * @param filename path of the file to parse
 * @throws Exception if the file cannot be opened or parsing fails
 */
public static void parseTikaInputStream(String filename) throws Exception {
    final Parser autoParser = new AutoDetectParser();
    final ContentHandler sink = new DefaultHandler();
    final Metadata collected = new Metadata();
    final ParseContext parseContext = new ParseContext();

    final File source = new File(filename);
    try (InputStream fileStream = TikaInputStream.get(source)) {
        autoParser.parse(fileStream, sink, collected, parseContext);
    }
}
private boolean detectTextOrHtml(Metadata submd, byte[] bytes) { String mediaTypeString = submd.get(Metadata.CONTENT_TYPE); if (mediaTypeString != null) { if (mediaTypeString.startsWith("text")) { return true; } else { return false; } } try (TikaInputStream tis = TikaInputStream.get(bytes)) { MediaType mediaType = detector.detect(tis, submd); if (mediaType != null) { //detect only once submd.set(TikaCoreProperties.CONTENT_TYPE_OVERRIDE, mediaType.toString()); if (mediaType.toString().startsWith("text")) { return true; } } } catch (IOException e) { } return false; }
Metadata metadata = new Metadata(); metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, rel); stream = TikaInputStream.get( fs.createDocumentInputStream("CONTENTS")); } else if (root.hasEntry("Package")) { stream = TikaInputStream.get( fs.createDocumentInputStream("Package")); } else { Ole10Native.createFromEmbeddedOleObject(fs); if (ole.getLabel() != null) { metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, ole.getLabel()); stream = TikaInputStream.get(data); stream.close();
/**
 * Runs {@code iterations} rounds of detection. Each round picks a file at
 * random from {@code files}, detects its media type, and asserts that it
 * equals the expected value stored in {@code truth}.
 *
 * @return always {@code 1} when every round passes
 * @throws Exception if a file cannot be opened or detection fails
 */
@Override
public Integer call() throws Exception {
    int round = 0;
    while (round < iterations) {
        final Path candidate = files[random.nextInt(files.length)];
        final Metadata metadata = new Metadata();
        try (TikaInputStream tis = TikaInputStream.get(candidate, metadata)) {
            final MediaType detected = detector.detect(tis, metadata);
            assertEquals("failed on: " + candidate.getFileName(), truth.get(candidate), detected);
        }
        round++;
    }
    return 1;
}
byte[] bytes = (byte[])obj; handleEmbeddedResource( TikaInputStream.get(bytes), if (isRichText(c)) { BodyContentHandler h = new BodyContentHandler(); Metadata m = new Metadata(); m.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8"); try { htmlParser.parse(new ByteArrayInputStream(v.getBytes(UTF_8)), h, m, parseContext);