/**
 * Minimal demonstration of {@link AutoDetectParser}: parses an empty in-memory
 * stream with a no-op SAX handler, a fresh {@link Metadata} and a fresh
 * {@link ParseContext}.
 *
 * @throws Exception if the parser reports any failure
 */
public static void useAutoDetectParser() throws Exception {
    Parser autoDetect = new AutoDetectParser();
    InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
    ContentHandler noOpHandler = new DefaultHandler();
    autoDetect.parse(emptyInput, noOpHandler, new Metadata(), new ParseContext());
}
// Parses sourceStream (wrapped in a TikaInputStream) with autoDetectParser using a
// no-op DefaultHandler, so only the Metadata object is populated. The visible part
// then reads the key filter from metadataKeyFilterRef and iterates metadata.names(),
// skipping keys that do not match the filter when one is set.
// NOTE(review): this snippet is truncated and its braces do not balance (no '}' after
// the tikaInputStream.close() call or after 'continue;') - confirm against the full
// source which statements really belong to the finally block before editing.
private Map<String, String> tika_parse(InputStream sourceStream, String prefix, Integer maxAttribs, Integer maxAttribLen) throws IOException, TikaException, SAXException { final Metadata metadata = new Metadata(); final TikaInputStream tikaInputStream = TikaInputStream.get(sourceStream); try { autoDetectParser.parse(tikaInputStream, new DefaultHandler(), metadata); } finally { tikaInputStream.close(); final Pattern metadataKeyFilter = metadataKeyFilterRef.get(); final StringBuilder dataBuilder = new StringBuilder(); for (final String key : metadata.names()) { if (metadataKeyFilter != null && !metadataKeyFilter.matcher(key).matches()) { continue; if (metadata.isMultiValued(key)) { for (String val : metadata.getValues(key)) { if (dataBuilder.length() > 1) {
try (InputStream is = theInputStream; BufferedInputStream bis = new BufferedInputStream(is);) { AutoDetectParser parser = new AutoDetectParser(); Detector detector = parser.getDetector(); Metadata md = new Metadata(); md.add(Metadata.RESOURCE_NAME_KEY, theFileName); MediaType mediaType = detector.detect(bis, md); return mediaType.toString(); }
/**
 * Example of pulling the plain text out of the "test.doc" resource.
 * Only the "body" part of the document is returned.
 *
 * @return the text collected by the {@link BodyContentHandler}
 * @throws IOException   if the resource cannot be read
 * @throws SAXException  if the SAX handler reports an error
 * @throws TikaException if the document cannot be parsed
 */
public String parseToPlainText() throws IOException, SAXException, TikaException {
    BodyContentHandler bodyText = new BodyContentHandler();
    AutoDetectParser autoDetect = new AutoDetectParser();
    Metadata docMetadata = new Metadata();
    try (InputStream docStream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {
        autoDetect.parse(docStream, bodyText, docMetadata);
        return bodyText.toString();
    }
}
/**
 * Parses a file with an {@link AutoDetectParser} built from the supplied
 * configuration and returns the text gathered by a {@link BodyContentHandler}.
 *
 * @param filename   path of the file to parse
 * @param tikaConfig configuration passed to the {@code AutoDetectParser} constructor
 * @param metadata   metadata object handed to both {@code TikaInputStream.get} and the parser
 * @return the string content of the body handler after parsing
 * @throws Exception if the file cannot be opened or parsed
 */
public static String parseUsingAutoDetect(String filename, TikaConfig tikaConfig,
                                          Metadata metadata) throws Exception {
    System.out.println("Handling using AutoDetectParser: [" + filename + "]");

    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    ContentHandler handler = new BodyContentHandler();
    // The stream was previously never closed; try-with-resources releases it
    // whether parsing succeeds or throws.
    try (TikaInputStream stream = TikaInputStream.get(new File(filename), metadata)) {
        parser.parse(stream, handler, metadata, new ParseContext());
    }
    return handler.toString();
}
/**
 * Example of rendering the contents of the "test.doc" resource as markup,
 * returned as a string.
 *
 * @return the output accumulated by the {@link ToXMLContentHandler}
 * @throws IOException   if the resource cannot be read
 * @throws SAXException  if the SAX handler reports an error
 * @throws TikaException if the document cannot be parsed
 */
public String parseToHTML() throws IOException, SAXException, TikaException {
    ContentHandler markupSink = new ToXMLContentHandler();
    AutoDetectParser autoDetect = new AutoDetectParser();
    Metadata docMetadata = new Metadata();
    try (InputStream docStream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {
        autoDetect.parse(docStream, markupSink, docMetadata);
        return markupSink.toString();
    }
}
/**
 * Parses a hard-coded .msg file with {@link AutoDetectParser} and prints every
 * metadata entry, sorted by name, to standard output.
 *
 * @param args ignored
 * @throws Exception if the file cannot be opened or parsed
 */
public static void main(String[] args) throws Exception {
    File file = new File("/Users/jason/docstore/example_received_regular.msg");
    AutoDetectParser parser = new AutoDetectParser();
    // -1 is passed as the handler's write limit.
    BodyContentHandler handler = new BodyContentHandler(-1);
    Metadata tikaMetadata = new Metadata();

    // The stream was previously left open; close it once parsing is done,
    // including when parse() throws.
    try (InputStream input = TikaInputStream.get(file, tikaMetadata)) {
        parser.parse(input, handler, tikaMetadata, new ParseContext());
    }

    String[] names = tikaMetadata.names();
    Arrays.sort(names);
    for (String name : names) {
        System.out.println(name + ": " + tikaMetadata.get(name));
    }
}
/**
 * Runs a {@link RecursiveParserWrapper} through a {@link ForkParser} against
 * "embedded_with_npe.xml" and verifies that the container and the embedded
 * document both yield their creator and content, and that the embedded entry
 * records the exception text.
 */
@Test
public void testRPWWithEmbeddedNPE() throws Exception {
    Parser autoDetect = new AutoDetectParser();
    RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(autoDetect);
    BasicContentHandlerFactory textFactory = new BasicContentHandlerFactory(
            BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 20000);
    RecursiveParserWrapperHandler rpwHandler = new RecursiveParserWrapperHandler(textFactory);
    ForkParser forkParser =
            new ForkParser(ForkParserTest.class.getClassLoader(), recursiveParser);
    Metadata rootMetadata = new Metadata();
    ParseContext parseContext = new ParseContext();

    try (InputStream docStream =
                 getClass().getResourceAsStream("/test-documents/embedded_with_npe.xml")) {
        forkParser.parse(docStream, rpwHandler, rootMetadata, parseContext);
    } finally {
        forkParser.close();
    }

    List<Metadata> results = rpwHandler.getMetadataList();

    // Entry 0: the container document.
    Metadata container = results.get(0);
    String containerContent = container.get(RecursiveParserWrapperHandler.TIKA_CONTENT);
    assertEquals("Nikolai Lobachevsky", container.get(TikaCoreProperties.CREATOR));
    assertContains("main_content", containerContent);
    assertContains("embed1.xml", containerContent);

    // Entry 1: the embedded document.
    Metadata embedded = results.get(1);
    assertEquals("embeddedAuthor", embedded.get(TikaCoreProperties.CREATOR));
    assertContains("some_embedded_content",
            embedded.get(RecursiveParserWrapperHandler.TIKA_CONTENT));
    assertEquals("/embed1.xml",
            embedded.get(RecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH));
    assertContains("another null pointer exception",
            embedded.get(RecursiveParserWrapperHandler.EMBEDDED_EXCEPTION));
}
/**
 * Parses the given file with an {@link AutoDetectParser} and logs every
 * metadata key together with its value at INFO level.
 *
 * @param f the file whose metadata should be listed
 * @throws MapperException if opening, parsing or closing the file fails;
 *                         the original exception is attached as the cause
 */
public void listMetadata(File f) throws MapperException {
    // try-with-resources: the FileInputStream was previously never closed.
    try (InputStream stream = new FileInputStream(f)) {
        Metadata metadata = new Metadata();
        ContentHandler handler = new DefaultHandler();
        Parser parser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        parser.parse(stream, handler, metadata, context);
        for (String key : metadata.names()) {
            String val = metadata.get(key);
            LOG.info("Found metadata \'" + key + "\': " + val);
        }
    } catch (Exception e) {
        LOG.error(e.toString(), e);
        throw new MapperException("Extracting metadata failed, file not found: " + f.getAbsolutePath(), e);
    }
}
}
// Determine a file's content type using AutoDetectParser with an empty parser
// map, then print the CONTENT_TYPE value found in the metadata.
File file = new File("/pats/to/file.xls");
AutoDetectParser parser = new AutoDetectParser();
parser.setParsers(new HashMap<MediaType, Parser>());
Metadata metadata = new Metadata();
metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, file.getName());
// Close the stream even when parse() throws; previously close() was skipped
// on failure.
try {
    parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
} finally {
    stream.close();
}
String mimeType = metadata.get(HttpHeaders.CONTENT_TYPE);
System.out.println(mimeType);
@Override public Single<Map<String, String>> getMetadata(InputStream ins) { return Single.create(sub -> { Parser parser = new AutoDetectParser(); BodyContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); try { parser.parse(ins, handler, metadata, context); Map<String, String> map = new HashMap<>(); String[] metadataNames = metadata.names(); for (String name : metadataNames) { map.put(name, metadata.get(name)); } sub.onSuccess(map); } catch (Exception e) { sub.onError(e); } // ins.close(); }); }
/**
 * Parses one readable file with Tika and outputs a {@code ParseResult}.
 * A successful parse yields a success result carrying the extracted text and
 * metadata; any exception during parsing yields a failure result carrying the
 * text gathered so far, the metadata and the exception.
 *
 * @param c the process context supplying the input element and receiving the output
 * @throws Exception if the file cannot be opened or the stream cannot be closed
 */
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
    ReadableFile readable = c.element();
    InputStream rawStream = Channels.newInputStream(readable.open());

    try (InputStream tikaStream = TikaInputStream.get(rawStream)) {
        Parser tikaParser;
        if (tikaConfig == null) {
            tikaParser = new AutoDetectParser();
        } else {
            tikaParser = new AutoDetectParser(tikaConfig);
        }
        ParseContext parseContext = new ParseContext();
        parseContext.set(Parser.class, tikaParser);

        // Use the caller-supplied metadata when present, otherwise start empty.
        Metadata tikaMetadata = spec.getInputMetadata();
        if (tikaMetadata == null) {
            tikaMetadata = new Metadata();
        }
        String contentTypeHint = spec.getContentTypeHint();
        if (contentTypeHint != null) {
            tikaMetadata.set(Metadata.CONTENT_TYPE, contentTypeHint);
        }

        String location = file(readable);
        ContentHandler textHandler = new ToTextContentHandler();
        ParseResult outcome;
        try {
            tikaParser.parse(tikaStream, textHandler, tikaMetadata, parseContext);
            outcome = ParseResult.success(location, textHandler.toString(), tikaMetadata);
        } catch (Exception parseFailure) {
            outcome = ParseResult.failure(
                    location, textHandler.toString(), tikaMetadata, parseFailure);
        }
        c.output(outcome);
    }
}

/** Returns the string form of the file's resource id, used as the result location. */
private static String file(ReadableFile readable) {
    return readable.getMetadata().resourceId().toString();
}
}
/**
 * Extracts all metadata of the resource at {@code uri} and returns it as a JSON
 * object mapping each metadata name to the array of its values.
 *
 * @param uri         location of the resource, passed to {@code createInputStream}
 * @param contentType content type passed to {@code fillMetadata}
 * @return JSON produced by Gson from the name-to-values map
 * @throws Exception if the stream cannot be opened, parsed or closed
 */
public static String extractMeta(String uri, String contentType) throws Exception {
    final AutoDetectParser parser = createParser();
    final Metadata metadata = new Metadata();
    fillMetadata(parser, metadata, contentType, uri);

    // try-with-resources: previously the stream leaked whenever parse() threw.
    try (TikaInputStream inputStream = createInputStream(uri, metadata)) {
        parser.parse(inputStream, new DefaultHandler(), metadata);
    }

    // Typed map replaces the raw Map; the serialized JSON is unchanged.
    final Map<String, String[]> meta = new HashMap<>();
    for (String name : metadata.names()) {
        meta.put(name, metadata.getValues(name));
    }
    return new Gson().toJson(meta);
}
// Fragment: after the preceding construct is closed, a shared AutoDetectParser is
// created and its detector is asked for the media type of a file, with the resource
// name placed in the metadata as a hint.
// NOTE(review): the body of the for loop and the definitions of 'filename' and 'file'
// are not visible in this snippet - confirm against the full source.
// NOTE(review): the TikaInputStream passed to detect() is not closed in the visible
// code - verify whether the surrounding code handles that.
}; AutoDetectParser ap = new AutoDetectParser(); for (String fileBase : testFiles) Metadata metadata = new Metadata(); metadata.set(Metadata.RESOURCE_NAME_KEY, filename); MediaType mt = ap.getDetector().detect(TikaInputStream.get(file), metadata); String mimetype = mt.toString();
// Work out the image type only when it has not been set yet: first ask the JDK to
// guess from the leading bytes, then fall back to Tika's detector, and finally
// reduce the MIME string to the short name passed to ImageType.fromValue.
if (currentImageType == null) {
    ByteArrayInputStream imageStream = new ByteArrayInputStream(image);
    String detected = URLConnection.guessContentTypeFromStream(imageStream);

    if (detected == null) {
        // The JDK could not guess; use the detector of an AutoDetectParser.
        Detector tikaDetector = new AutoDetectParser().getDetector();
        Metadata emptyHints = new Metadata();
        detected = tikaDetector.detect(imageStream, emptyHints).toString();
        if (detected.contains("pdf")) {
            detected = "pdf";
        } else if (detected.contains("tif") || detected.contains("tiff")) {
            detected = "tif";
        }
    }

    if (detected.contains("png")) {
        detected = "png";
    } else if (detected.contains("jpg") || detected.contains("jpeg")) {
        detected = "jpg";
    } else if (detected.contains("pdf")) {
        detected = "pdf";
    } else if (detected.contains("tif") || detected.contains("tiff")) {
        detected = "tif";
    }

    currentImageType = ImageType.fromValue(detected);
}
/**
 * Parses standard input with an {@link AutoDetectParser}, feeding the content
 * to a {@code LanguageHandler}, and prints the language of the handler's result.
 *
 * @throws Exception if parsing standard input fails
 */
public static void languageDetectionWithHandler() throws Exception {
    LanguageHandler languageHandler = new LanguageHandler();
    AutoDetectParser autoDetect = new AutoDetectParser();
    autoDetect.parse(System.in, languageHandler, new Metadata(), new ParseContext());
    LanguageResult detected = languageHandler.getLanguage();
    String languageCode = detected.getLanguage();
    System.out.println(languageCode);
}
}
/**
 * Convenience overload: builds a new {@link ParseContext}, registers this
 * parser in it under {@code Parser.class}, and delegates to the four-argument
 * {@code parse} method.
 *
 * @param stream   the document stream to parse
 * @param handler  receiver of the SAX events
 * @param metadata document metadata
 * @throws IOException   propagated from the delegated parse call
 * @throws SAXException  propagated from the delegated parse call
 * @throws TikaException propagated from the delegated parse call
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata)
        throws IOException, SAXException, TikaException {
    final ParseContext freshContext = new ParseContext();
    freshContext.set(Parser.class, this);
    this.parse(stream, handler, metadata, freshContext);
}
/**
 * Builds a reader over the text content of a named binary stream.
 * Delegates to the four-argument constructor with a new {@link AutoDetectParser},
 * the metadata derived from {@code name}, and a new {@link ParseContext}; the
 * parser is then registered in the context under {@code Parser.class}.
 *
 * @param stream binary stream
 * @param name   document name
 * @throws IOException if the document can not be parsed
 */
public ParsingReader(InputStream stream, String name) throws IOException {
    this(
            new AutoDetectParser(),
            stream,
            getMetadata(name),
            new ParseContext());
    context.set(Parser.class, parser);
}
/** * * @param file office file * @return boolean success * @throws IOException a problem of file. refer to a message. * @throws SAXException * @throws TikaException throw this, if can not parse file. */ public static final String extract(File file) throws IOException, SAXException, TikaException { final ContentHandler handler = new BodyContentHandler(-1);//infinity -> -1, object will be gone soon final Metadata metadata = new Metadata(); // only 1-run 1-use, object will be gone soon try (InputStream input = new FileInputStream(file)) { TikaTextExtractor.parser.parse(input, handler, metadata); } return handler.toString(); } }
/**
 * Parses a test document recursively and returns the metadata collected for
 * the document and its embedded resources, with content rendered as XML.
 *
 * @param filePath resource name relative to "/test-documents/"
 * @param context  parse context forwarded to the wrapper
 * @return the metadata list gathered by the recursive handler
 * @throws Exception if the resource cannot be read or parsed
 */
protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context)
        throws Exception {
    Parser autoDetect = new AutoDetectParser();
    RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(autoDetect);
    BasicContentHandlerFactory xmlFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1);
    RecursiveParserWrapperHandler collector = new RecursiveParserWrapperHandler(xmlFactory);

    try (InputStream docStream = getResourceAsStream("/test-documents/" + filePath)) {
        recursiveParser.parse(docStream, collector, new Metadata(), context);
    }

    return collector.getMetadataList();
}