/**
 * Runs an auto-detecting parse over an empty byte stream while an English
 * {@link Locale} is registered in the {@link ParseContext}.
 *
 * @throws Exception if the underlying parse call fails
 */
public static void testLocale() throws Exception {
    // Register the locale under its class key before handing the context to the parser.
    ParseContext localeContext = new ParseContext();
    localeContext.set(Locale.class, Locale.ENGLISH);

    Parser autoDetect = new AutoDetectParser();
    InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
    autoDetect.parse(emptyInput, new DefaultHandler(), new Metadata(), localeContext);
}
/**
 * Ensures the given context carries an {@link OfficeParserConfig}.
 * The context is queried with this parser's default config as the fallback,
 * and whatever the lookup returns is written back under the same key.
 *
 * @param parseContext context to inspect and populate
 */
public void configure(ParseContext parseContext) {
    parseContext.set(
            OfficeParserConfig.class,
            parseContext.get(OfficeParserConfig.class, defaultOfficeParserConfig));
}
public TikaHtmlParser(CrawlConfig config, TLDList tldList) throws InstantiationException, IllegalAccessException { this.config = config; this.tldList = tldList; htmlParser = new HtmlParser(); parseContext = new ParseContext(); parseContext.set(HtmlMapper.class, AllTagMapper.class.newInstance()); }
/**
 * Convenience overload that parses with a freshly created {@link ParseContext}
 * in which this parser is registered under {@code Parser.class}, then
 * delegates to the four-argument {@code parse} method.
 *
 * @param stream   document stream to parse
 * @param handler  receiver of the SAX events
 * @param metadata document metadata, passed through to the delegate
 * @throws IOException   propagated from the delegate
 * @throws SAXException  propagated from the delegate
 * @throws TikaException propagated from the delegate
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata)
        throws IOException, SAXException, TikaException {
    ParseContext selfContext = new ParseContext();
    selfContext.set(Parser.class, this);
    this.parse(stream, handler, metadata, selfContext);
}
// One parse context is shared by all three parse calls below; the handler
// and metadata are re-created before each call.
ParseContext context = new ParseContext();
BodyContentHandler handler;
Metadata metadata;

// First parse: expect the "Fell back!" text and exactly one X-Parsed-By
// entry naming DummyParser. (usedParsers is declared outside this excerpt.)
metadata = new Metadata();
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());
usedParsers = metadata.getValues("X-Parsed-By");
assertEquals(1, usedParsers.length);
assertEquals(DummyParser.class.getName(), usedParsers[0]);

// Second parse with the same bytes: the same text is expected again.
metadata = new Metadata();
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());

// Third parse with the same bytes: still the same text.
metadata = new Metadata();
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());
public void parseRawXMP(byte[] xmpData) throws IOException, SAXException, TikaException { XMPMetadata xmp = null; try (InputStream decoded = new ByteArrayInputStream(xmpData) ) { Document dom = new ParseContext().getDocumentBuilder().parse(decoded); if (dom != null) { xmp = new XMPMetadata(dom); } } catch (IOException|SAXException e) { // } if (xmp != null) { JempboxExtractor.extractDublinCore(xmp, metadata); JempboxExtractor.extractXMPMM(xmp, metadata); } }
/**
 * Demonstrates a {@link CompositeParser} configured with an explicit
 * media-type-to-parser map (HTML and XML) and a plain-text fallback, then
 * parses an empty stream whose metadata declares the type {@code text/html}.
 *
 * @throws Exception if the parse call fails
 */
public static void useCompositeParser() throws Exception {
    Map<MediaType, Parser> byType = new HashMap<>();
    byType.put(MediaType.parse("text/html"), new HtmlParser());
    byType.put(MediaType.parse("application/xml"), new XMLParser());

    CompositeParser composite = new CompositeParser();
    composite.setParsers(byType);
    composite.setFallback(new TXTParser());

    // The content type is supplied up front via the metadata.
    Metadata typeHint = new Metadata();
    typeHint.set(Metadata.CONTENT_TYPE, "text/html");

    InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
    composite.parse(emptyInput, new DefaultHandler(), typeHint, new ParseContext());
}
MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN));
// (The statement ending above starts outside this excerpt.)

ParseContext context = new ParseContext();
BodyContentHandler handler;
Metadata metadata;

// The parser under test must report exactly two supported types,
// one of which is text/plain.
Set<MediaType> types = p.getSupportedTypes(context);
assertEquals(2, types.size());
assertEquals(types.toString(), true, types.contains(MediaType.TEXT_PLAIN));

// First parse: the handler is expected to contain "Fell back!".
metadata = new Metadata();
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());

// Second parse with the same bytes: the handler is expected to stay empty.
// NOTE(review): the reason for the differing result is not visible in this excerpt.
metadata = new Metadata();
handler = new BodyContentHandler();
p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
assertEquals("", handler.toString());
// Fetch the raw bytes of the content object (declared outside this excerpt).
byte[] raw = content.getContent();
ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
Parser parser = new AutoDetectParser();
// Parse the bytes with an empty parse context; the handler collects the output.
parser.parse(new ByteArrayInputStream(raw), handler, metadata, new ParseContext());
// Log whatever the handler accumulated.
LOG.info("content: " + handler.toString());
/**
 * Parses the given stream with this object's parser, routing embedded
 * documents to a {@code MyEmbeddedDocumentExtractor} built for the given
 * output directory. The body handler is created with a limit of {@code -1}.
 *
 * @param is        stream of the container document
 * @param outputDir directory handed to the embedded-document extractor
 * @throws SAXException  propagated from the parser
 * @throws TikaException propagated from the parser
 * @throws IOException   propagated from the parser
 */
public void extract(InputStream is, Path outputDir)
        throws SAXException, TikaException, IOException {
    ParseContext ctx = new ParseContext();
    // The parser is registered before the extractor is built, because the
    // extractor receives this same context in its constructor.
    ctx.set(Parser.class, parser);
    ctx.set(EmbeddedDocumentExtractor.class, new MyEmbeddedDocumentExtractor(outputDir, ctx));

    parser.parse(is, new BodyContentHandler(-1), new Metadata(), ctx);
}
/**
 * Parses a file with an auto-detecting parser and prints every metadata
 * entry, sorted by name, as {@code name: value} lines on standard output.
 *
 * @param args optional; {@code args[0]} overrides the default input path
 * @throws Exception if the file cannot be opened or parsed
 */
public static void main(String[] args) throws Exception {
    // Keep the original hard-coded path as the default so existing invocations behave the same.
    String path = args.length > 0
            ? args[0]
            : "/Users/jason/docstore/example_received_regular.msg";
    File file = new File(path);

    AutoDetectParser parser = new AutoDetectParser();
    BodyContentHandler handler = new BodyContentHandler(-1);
    Metadata tikaMetadata = new Metadata();

    // try-with-resources: the stream was previously never closed.
    try (InputStream input = TikaInputStream.get(file, tikaMetadata)) {
        parser.parse(input, handler, tikaMetadata, new ParseContext());
    }

    String[] names = tikaMetadata.names();
    Arrays.sort(names);
    for (String name : names) {
        System.out.println(name + ": " + tikaMetadata.get(name));
    }
}
/**
 * Runs a {@link RecursiveParserWrapper} inside a {@link ForkParser} against
 * {@code basic_embedded.xml} and checks the metadata collected for the
 * container document and for its first embedded document.
 */
@Test
public void testRecursiveParserWrapper() throws Exception {
    RecursiveParserWrapper recursive = new RecursiveParserWrapper(new AutoDetectParser());
    RecursiveParserWrapperHandler collector = new RecursiveParserWrapperHandler(
            new BasicContentHandlerFactory(
                    BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 20000));
    ForkParser forked = new ForkParser(ForkParserTest.class.getClassLoader(), recursive);
    Metadata rootMetadata = new Metadata();
    ParseContext parseContext = new ParseContext();

    try (InputStream in = getClass().getResourceAsStream("/test-documents/basic_embedded.xml")) {
        forked.parse(in, collector, rootMetadata, parseContext);
    } finally {
        // Always shut the fork parser down, even when parsing fails.
        forked.close();
    }

    List<Metadata> collected = collector.getMetadataList();

    // Entry 0: the container document.
    Metadata container = collected.get(0);
    assertEquals("Nikolai Lobachevsky", container.get(TikaCoreProperties.CREATOR));
    assertContains("main_content",
            container.get(RecursiveParserWrapperHandler.TIKA_CONTENT));
    assertContains("embed1.xml",
            container.get(RecursiveParserWrapperHandler.TIKA_CONTENT));

    // Entry 1: the embedded document.
    Metadata embedded = collected.get(1);
    assertEquals("embeddedAuthor", embedded.get(TikaCoreProperties.CREATOR));
    assertContains("some_embedded_content",
            embedded.get(RecursiveParserWrapperHandler.TIKA_CONTENT));
    assertEquals("/embed1.xml",
            embedded.get(RecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH));
}
/**
 * Parses the given file with an auto-detecting parser and logs every
 * metadata entry that was found.
 *
 * @param f file to inspect
 * @throws MapperException if opening, parsing, or closing the file fails;
 *         the original exception is attached as the cause
 */
public void listMetadata(File f) throws MapperException {
    // try-with-resources: the file stream was previously never closed.
    try (InputStream stream = new FileInputStream(f)) {
        Metadata metadata = new Metadata();
        ContentHandler handler = new DefaultHandler();
        Parser parser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        parser.parse(stream, handler, metadata, context);
        for (String key : metadata.names()) {
            String val = metadata.get(key);
            LOG.info("Found metadata \'" + key + "\': " + val);
        }
    } catch (Exception e) {
        LOG.error(e.toString(), e);
        throw new MapperException("Extracting metadata failed, file not found: " + f.getAbsolutePath(), e);
    }
}
}
/**
 * Creates a reader over the text content of the given binary stream,
 * using a new {@link AutoDetectParser}, empty {@link Metadata} and a new
 * {@link ParseContext}. After delegating, the parser is registered in the
 * context under {@code Parser.class}.
 *
 * @param stream binary stream to read the document from
 * @throws IOException if the document can not be parsed
 */
public ParsingReader(InputStream stream) throws IOException {
    this(new AutoDetectParser(), stream, new Metadata(), new ParseContext());
    // Register the parser stored by the delegated constructor in its own context.
    this.context.set(Parser.class, this.parser);
}
File file = new File("/pats/to/file.xls");
AutoDetectParser parser = new AutoDetectParser();
// Replace the parser's type-to-parser map with an empty one.
// NOTE(review): presumably so that only type detection runs - confirm.
parser.setParsers(new HashMap<MediaType, Parser>());
Metadata metadata = new Metadata();
// Supply the file name as the resource name in the metadata.
metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, file.getName());
// `stream` is declared outside this excerpt.
parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
stream.close();
// Read back the content type recorded in the metadata and print it.
String mimeType = metadata.get(HttpHeaders.CONTENT_TYPE);
System.out.println(mimeType);
/**
 * Extracts text from a container document and from the documents embedded
 * in it. The important step is registering a {@link Parser} in the
 * {@link ParseContext} before parsing.
 *
 * @return content, including from embedded documents
 * @throws IOException   propagated from the parser or from closing the stream
 * @throws SAXException  propagated from the parser
 * @throws TikaException propagated from the parser
 */
public String parseEmbeddedExample() throws IOException, SAXException, TikaException {
    AutoDetectParser autoDetect = new AutoDetectParser();
    BodyContentHandler textCollector = new BodyContentHandler();
    Metadata docMetadata = new Metadata();

    ParseContext recursiveContext = new ParseContext();
    recursiveContext.set(Parser.class, autoDetect);

    try (InputStream docx =
                 ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) {
        autoDetect.parse(docx, textCollector, docMetadata, recursiveContext);
        return textCollector.toString();
    }
}
public static void main( String[] args ) { try { FileInputStream input = new FileInputStream( new File( "src/test/resources/jap_91055688_japredcross_ss_ue_fnl_12212011.pdf"));//simple-PDFA-1a.pdf" ) ); OutputStream output = System.out; //new FileOutputStream( new File( "Z:/part-00001.xml" ) ); Metadata metadata = new Metadata(); PDFParser parser = new PDFParser(); parser.parse(input, new DefaultHandler() , metadata, new ParseContext() ); input.close(); for( String key : metadata.names() ) { output.write( (key+" : "+metadata.get(key)+"\n").getBytes( "UTF-8" ) ); } output.close(); } catch( Exception e ) { e.printStackTrace(); } }}
@Override public Single<Map<String, String>> getMetadata(InputStream ins) { return Single.create(sub -> { Parser parser = new AutoDetectParser(); BodyContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); try { parser.parse(ins, handler, metadata, context); Map<String, String> map = new HashMap<>(); String[] metadataNames = metadata.names(); for (String name : metadataNames) { map.put(name, metadata.get(name)); } sub.onSuccess(map); } catch (Exception e) { sub.onError(e); } // ins.close(); }); }
boolean saveAll ) throws Exception { Metadata metadata = new Metadata(); ParseContext pc = new ParseContext(); ch = new BodyContentHandler(new RichTextContentHandler(new OutputStreamWriter(text, UTF_8))); } else { ch = new DefaultHandler(); MutableInt count = new MutableInt(); pc.set(EmbeddedDocumentExtractor.class, new MyEmbeddedDocumentExtractor(count, files)); TikaResource.parse(parser, LOG, info.getPath(), is, ch, metadata, pc);
/**
 * Parses the stream with the given parser, discarding content events, and
 * copies every resulting metadata entry into the supplied map.
 *
 * @param parser parser to run
 * @param in     stream of the document to parse
 * @param meta   metadata object passed to the parser and read afterwards
 * @param ogmeta destination map receiving each metadata name and its value
 * @throws IOException   propagated from the parser
 * @throws SAXException  propagated from the parser
 * @throws TikaException propagated from the parser
 */
private static void parseMeta(Parser parser, InputStream in, Metadata meta,
        Map<String, String> ogmeta) throws IOException, SAXException, TikaException {
    parser.parse(in, new DefaultHandler(), meta, new ParseContext());
    for (String name : meta.names()) {
        ogmeta.put(name, meta.get(name));
    }
}