/**
 * Minimal end-to-end example of driving a Tika parse: an {@link AutoDetectParser} is run
 * over an empty in-memory stream with a no-op SAX handler, a fresh {@link Metadata} and a
 * fresh {@link ParseContext}.
 *
 * @throws Exception if the parser reports a failure
 */
public static void useAutoDetectParser() throws Exception {
    Parser autoDetect = new AutoDetectParser();
    ParseContext parseContext = new ParseContext();
    Metadata collected = new Metadata();
    ContentHandler discardingHandler = new DefaultHandler();
    InputStream emptyInput = new ByteArrayInputStream(new byte[0]);

    autoDetect.parse(emptyInput, discardingHandler, collected, parseContext);
}
/**
 * Tells whether the configured parser lists, among its supported types, the media type
 * that the configured detector assigns to {@code input}.
 *
 * @param input stream handed to the detector
 * @return {@code true} when the detected type is contained in the parser's supported types
 * @throws IOException if detection fails while reading the stream
 */
public boolean isSupported(TikaInputStream input) throws IOException {
    // Detection runs with empty metadata, so no name or content-type hint is supplied.
    MediaType detected = detector.detect(input, new Metadata());
    ParseContext emptyContext = new ParseContext();
    return parser.getSupportedTypes(emptyContext).contains(detected);
}
/**
 * Parses {@code in} with {@code parser}, discarding the content events, and then copies
 * every metadata entry found in {@code meta} into {@code ogmeta}.
 *
 * @param parser parser used to process the stream
 * @param in     document stream to parse
 * @param meta   metadata object passed to the parser and read back afterwards
 * @param ogmeta destination map; receives one entry per metadata name, using the value
 *               returned by {@code meta.get(name)}
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if SAX event handling fails
 * @throws TikaException if the parser reports a failure
 */
private static void parseMeta(Parser parser, InputStream in, Metadata meta, Map<String, String> ogmeta)
        throws IOException, SAXException, TikaException {
    parser.parse(in, new DefaultHandler(), meta, new ParseContext());

    for (String key : meta.names()) {
        ogmeta.put(key, meta.get(key));
    }
}
InputStream stream = new FileInputStream(file); try { Metadata metadata = new Metadata(); ContentHandler handler = new DefaultHandler(); Parser parser = new JpegParser(); ParseContext context = new ParseContext(); metadata.set(Metadata.CONTENT_TYPE, mimeType); parser.parse(stream, handler, metadata, context); String lat = metadata.get("geo:lat"); String lon = metadata.get("geo:long"); stream.close();
return null; final Metadata metadata = new Metadata(); metadata.set(HttpHeaders.CONTENT_TYPE, mediaType.toString()); } else { for (Parser p : parsers) { if (mediaType != null && !p.getSupportedTypes(context).contains(mediaType)) { continue; context = new ParseContext(); if (context.get(Parser.class) == null) { context.set(Parser.class, parser instanceof AutoDetectParser ? parser : new AutoDetectParser()); parser.parse(in, handler, metadata, context); } catch (Exception ex) { parser.parse(in, handler, metadata, context); } else { throw ex;
// Parses the same five-byte input through `p` three times, each time with a fresh Metadata
// and BodyContentHandler but the same ParseContext, and asserts that the extracted text is
// "Fell back!" every time. After the first parse it additionally asserts that exactly one
// "X-Parsed-By" value was recorded and that it is DummyParser's class name.
// NOTE(review): `p` and `usedParsers` are declared outside this excerpt.
ParseContext context = new ParseContext(); BodyContentHandler handler; Metadata metadata; metadata = new Metadata(); handler = new BodyContentHandler(); p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context); assertEquals("Fell back!", handler.toString()); usedParsers = metadata.getValues("X-Parsed-By"); assertEquals(1, usedParsers.length); assertEquals(DummyParser.class.getName(), usedParsers[0]); metadata = new Metadata(); handler = new BodyContentHandler(); p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context); assertEquals("Fell back!", handler.toString()); metadata = new Metadata(); handler = new BodyContentHandler(); p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context); assertEquals("Fell back!", handler.toString());
metadata.set(Metadata.CONTENT_TYPE, type); } else if (entry.getName().equals(META_NAME)) { meta.parse(zip, new DefaultHandler(), metadata, context); } else if (entry.getName().endsWith("content.xml")) { if (content instanceof OpenDocumentContentParser) { } else { content.parse(zip, handler, metadata, context); } else { content.parse(zip, handler, metadata, context); EmbeddedDocumentExtractor embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); Metadata embeddedMetadata = new Metadata(); embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, entry.getName());
xhtml.startDocument(); ContentHandler childHandler = new EmbeddedContentHandler( new BodyContentHandler(xhtml)); type = type.trim(); metadata.set(Metadata.CONTENT_TYPE, type); } else if (entry.getName().equals("metadata.xml")) { meta.parse(zip, new DefaultHandler(), metadata, context); } else if (entry.getName().endsWith(".opf")) { meta.parse(zip, new DefaultHandler(), metadata, context); } else if (entry.getName().endsWith(".htm") || entry.getName().endsWith(".html") || entry.getName().endsWith(".xhtml")) { content.parse(zip, childHandler, metadata, context);
@Override public Single<Map<String, String>> getMetadata(InputStream ins) { return Single.create(sub -> { Parser parser = new AutoDetectParser(); BodyContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); try { parser.parse(ins, handler, metadata, context); Map<String, String> map = new HashMap<>(); String[] metadataNames = metadata.names(); for (String name : metadataNames) { map.put(name, metadata.get(name)); } sub.onSuccess(map); } catch (Exception e) { sub.onError(e); } // ins.close(); }); }
/**
 * Parses one readable file with Tika and emits a {@code ParseResult} for it.
 *
 * <p>A successful parse yields {@code ParseResult.success} with the extracted text and
 * metadata. Any exception thrown by the parser yields {@code ParseResult.failure}
 * carrying whatever text and metadata had been collected up to that point.
 *
 * @param c process context supplying the file and receiving the result
 * @throws Exception if the file cannot be opened or the streams cannot be closed
 */
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
    ReadableFile file = c.element();
    // Both streams are managed by try-with-resources so the channel stream is released
    // even if wrapping it in a TikaInputStream fails.
    try (InputStream stream = Channels.newInputStream(file.open());
            InputStream tikaStream = TikaInputStream.get(stream)) {
        Parser parser = tikaConfig == null ? new AutoDetectParser() : new AutoDetectParser(tikaConfig);
        ParseContext context = new ParseContext();
        context.set(Parser.class, parser);

        // Start every element from a private copy of the configured input metadata.
        // Handing the spec's own Metadata instance to the parser would let values
        // accumulate from one file to the next and would make every emitted result
        // share the same mutable object.
        Metadata tikaMetadata = new Metadata();
        Metadata inputMetadata = spec.getInputMetadata();
        if (inputMetadata != null) {
            for (String name : inputMetadata.names()) {
                for (String value : inputMetadata.getValues(name)) {
                    tikaMetadata.add(name, value);
                }
            }
        }
        if (spec.getContentTypeHint() != null) {
            tikaMetadata.set(Metadata.CONTENT_TYPE, spec.getContentTypeHint());
        }

        String location = file.getMetadata().resourceId().toString();
        ParseResult res;
        ContentHandler tikaHandler = new ToTextContentHandler();
        try {
            parser.parse(tikaStream, tikaHandler, tikaMetadata, context);
            res = ParseResult.success(location, tikaHandler.toString(), tikaMetadata);
        } catch (Exception e) {
            // Report the failure together with any partial text and metadata.
            res = ParseResult.failure(location, tikaHandler.toString(), tikaMetadata, e);
        }
        c.output(res);
    }
}
}
MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN)); ParseContext context = new ParseContext(); BodyContentHandler handler; Metadata metadata; Set<MediaType> types = p.getSupportedTypes(context); assertEquals(2, types.size()); assertEquals(types.toString(), true, types.contains(MediaType.TEXT_PLAIN)); metadata = new Metadata(); handler = new BodyContentHandler(); p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context); assertEquals("Fell back!", handler.toString()); metadata = new Metadata(); handler = new BodyContentHandler(); p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context); assertEquals("", handler.toString());
private void parsePage(byte[] byteObject, Parser htmlParser, ContentHandler xhtml, ParseContext context) throws TikaException {// throws IOException InputStream stream = null; Metadata metadata = new Metadata(); ContentHandler handler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));// -1 try { stream = new ByteArrayInputStream(byteObject); htmlParser.parse(stream, handler, metadata, context); } catch (SAXException e) { throw new RuntimeException(e); } catch (IOException e) { // Pushback overflow from tagsoup } }
@Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException { TesseractOCRConfig config = parseContext.get(TesseractOCRConfig.class, defaultConfig); _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new DefaultHandler(), metadata, parseContext);
BodyContentHandler ch = new BodyContentHandler(woh); Metadata metadata = new Metadata(); metadata.add(Metadata.CONTENT_TYPE, blob.getMediaType()); if (blob.getName() != null) { metadata.add(Metadata.RESOURCE_NAME_KEY, blob.getName()); ParseContext parseContext = new ParseContext(); tikaParser.parse(is, ch, metadata, parseContext); } catch (Throwable t) { if (woh.isWriteLimitReached(t)) { String text = ch.toString(); if (text.length() > 0) { result.add(text);
String v = toString(obj, c.getType()); if (isRichText(c)) { BodyContentHandler h = new BodyContentHandler(); Metadata m = new Metadata(); m.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8"); try { htmlParser.parse(new ByteArrayInputStream(v.getBytes(UTF_8)), h, m, parseContext); handler.characters(h.toString()); } catch (SAXException e) {
/**
 * Loads the Spring configuration at {@code org/apache/tika/example/spring.xml}, obtains
 * the {@code "tika"} parser bean from it and parses the text {@code "Hello, World!"},
 * writing the extracted content to standard output.
 *
 * @param args ignored
 * @throws Exception if the context cannot be created or the parse fails
 */
public static void main(String[] args) throws Exception {
    String[] configLocations = {"org/apache/tika/example/spring.xml"};
    ApplicationContext context = new ClassPathXmlApplicationContext(configLocations);
    Parser tikaParser = context.getBean("tika", Parser.class);

    ByteArrayInputStream greeting = new ByteArrayInputStream("Hello, World!".getBytes(UTF_8));
    tikaParser.parse(greeting, new WriteOutContentHandler(System.out), new Metadata(), new ParseContext());
}
}
// Stores `filename` as the resource-name metadata and prints the MIME type that
// `mimeRegistry` detects when given only that metadata (null is passed as the first
// argument — presumably the stream; confirm against the detector's signature).
// Then records `type` as the content type and parses `stream`, collecting the body text
// in a BodyContentHandler.
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, filename); System.out.println("The MIME type (based on filename) is: [" + mimeRegistry.detect(null, metadata) + "]"); metadata.set(Metadata.CONTENT_TYPE, type.toString()); ContentHandler handler = new BodyContentHandler(); parser.parse(stream, handler, metadata, new ParseContext());
/**
 * Extracts text and metadata from a document stream.
 *
 * <p>Text is collected through a Boilerpipe handler configured with
 * {@code KeepEverythingExtractor} on top of a body handler limited to
 * {@code MAX_CHARACTERS}. Each metadata name is mapped to the value returned by
 * {@code Metadata.get(name)}.
 *
 * @param stream      document content
 * @param fileName    file name passed to {@code createMetadata}
 * @param contentType content type passed to {@code createMetadata}
 * @return the extracted text and metadata, or {@code null} if parsing failed (the failure
 *         is logged)
 */
public ParsedData parse(InputStream stream, String fileName, String contentType) {
    BodyContentHandler bodyText = new BodyContentHandler(MAX_CHARACTERS);
    BoilerpipeContentHandler boilerpipe =
            new BoilerpipeContentHandler(bodyText, KeepEverythingExtractor.INSTANCE);
    Metadata metadata = createMetadata(fileName, contentType);
    ParseContext parseContext = new ParseContext();

    try {
        parser.parse(stream, boilerpipe, metadata, parseContext);
    } catch (IOException | SAXException | TikaException e) {
        logger.error("Failed to extract metadata using Tika.", e);
        return null;
    }

    Map<String, String> properties = new HashMap<>();
    for (String name : metadata.names()) {
        properties.put(name, metadata.get(name));
    }
    return new ParsedData(bodyText.toString(), properties);
}
private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata, ParseContext context, EndDocumentShieldingContentHandler handler) throws IOException, SAXException, TikaException { if (entry == null) return; if (entry.getName().equals("mimetype")) { String type = IOUtils.toString(zip, UTF_8); metadata.set(Metadata.CONTENT_TYPE, type); } else if (entry.getName().equals(META_NAME)) { meta.parse(zip, new DefaultHandler(), metadata, context); } else if (entry.getName().endsWith("content.xml")) { if (content instanceof OpenDocumentContentParser) { ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context); } else { // Foreign content parser was set: content.parse(zip, handler, metadata, context); } } else if (entry.getName().endsWith("styles.xml")) { if (content instanceof OpenDocumentContentParser) { ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context); } else { // Foreign content parser was set: content.parse(zip, handler, metadata, context); } } } }
context.set(Parser.class, decorator); ContentHandler localHandler = parserState.recursiveParserWrapperHandler.getNewContentHandler(); long started = System.currentTimeMillis(); parserState.recursiveParserWrapperHandler.startDocument(); try { getWrappedParser().parse(stream, localHandler, metadata, context); } catch (SAXException e) { boolean wlr = isWriteLimitReached(e); throw e; metadata.set(RecursiveParserWrapperHandler.WRITE_LIMIT_REACHED, "true"); } catch (Throwable e) { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX+"runtime", stackTrace); throw e; } finally { long elapsedMillis = System.currentTimeMillis() - started; metadata.set(RecursiveParserWrapperHandler.PARSE_TIME_MILLIS, Long.toString(elapsedMillis)); parserState.recursiveParserWrapperHandler.endDocument(localHandler, metadata); parserState.recursiveParserWrapperHandler.endDocument();