// Parse myfile.html with HtmlParser and capture what the BodyContentHandler
// collected as a string.
ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
// try-with-resources releases the file handle even when parse() throws;
// the original opened the stream and never closed it.
try (InputStream input = new FileInputStream("myfile.html")) {
    new HtmlParser().parse(input, handler, metadata, new ParseContext());
}
String plainText = handler.toString();
/**
 * Loads the Spring context described by {@code org/apache/tika/example/spring.xml},
 * fetches the bean named {@code "tika"} as a {@link Parser}, and uses it to parse the
 * UTF-8 bytes of "Hello, World!" into a {@link WriteOutContentHandler} built on
 * {@code System.out}.
 *
 * @param args command-line arguments; not read
 * @throws Exception if loading the context or parsing fails
 */
public static void main(String[] args) throws Exception {
    String[] configLocations = {"org/apache/tika/example/spring.xml"};
    ApplicationContext springContext = new ClassPathXmlApplicationContext(configLocations);
    Parser tikaParser = springContext.getBean("tika", Parser.class);

    ByteArrayInputStream greeting = new ByteArrayInputStream("Hello, World!".getBytes(UTF_8));
    tikaParser.parse(
            greeting,
            new WriteOutContentHandler(System.out),
            new Metadata(),
            new ParseContext());
}
} // closes an enclosing scope whose opening lies outside this snippet
public static void process(Path path) throws Exception { Parser parser = new AutoDetectParser(); Metadata metadata = new Metadata(); // The PhoneExtractingContentHandler will examine any characters for phone numbers before passing them // to the underlying Handler. PhoneExtractingContentHandler handler = new PhoneExtractingContentHandler(new BodyContentHandler(), metadata); try (InputStream stream = new BufferedInputStream(Files.newInputStream(path))) { parser.parse(stream, handler, metadata, new ParseContext()); } String[] numbers = metadata.getValues("phonenumbers"); Collections.addAll(phoneNumbers, numbers); } }
/**
 * Parses an empty stream through a {@link ForkParser} that wraps a {@code ForkTestParser}
 * and checks that the trimmed output is "Hello, World!" and the reported content type is
 * "text/plain".
 */
@Test
public void testHelloWorld() throws Exception {
    ClassLoader loader = ForkParserTest.class.getClassLoader();
    try (ForkParser forked = new ForkParser(loader, new ForkTestParser())) {
        InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
        ContentHandler body = new BodyContentHandler();
        Metadata meta = new Metadata();

        forked.parse(emptyInput, body, meta, new ParseContext());

        String text = body.toString().trim();
        assertEquals("Hello, World!", text);
        assertEquals("text/plain", meta.get(Metadata.CONTENT_TYPE));
    }
}
/**
 * Registers an {@link AutoDetectParser} in a {@link ParseContext} under {@code Parser.class}
 * and checks that {@code EmbeddedDocumentUtil.tryToFindExistingLeafParser} returns a
 * non-null parser whose class is exactly {@code TXTParser}.
 */
@Test
public void testSimple() {
    ParseContext ctx = new ParseContext();
    ctx.set(Parser.class, new AutoDetectParser());

    Parser found = EmbeddedDocumentUtil.tryToFindExistingLeafParser(
            org.apache.tika.parser.txt.TXTParser.class, ctx);

    assertNotNull(found);
    assertEquals(org.apache.tika.parser.txt.TXTParser.class, found.getClass());
}
// Test fragment: the method's closing brace is not part of this snippet.
// Runs the parser `p` three times over the same five-byte input, each time with a fresh
// Metadata and BodyContentHandler but one shared ParseContext, and expects the handler
// text "Fell back!" every time. The first run also expects `usedParsers` to hold exactly
// one entry, DummyParser's class name. The second run expects the
// ParserUtils.EMBEDDED_EXCEPTION and EMBEDDED_PARSER metadata to be set, the latter to
// ErrorParser's class name.
// NOTE(review): `p` and `usedParsers` are defined outside this snippet and nothing visible
// reassigns them between runs; presumably set-up lines were dropped. Confirm against the
// full test.
@Test public void testFallback() throws Exception { ParseContext context = new ParseContext(); BodyContentHandler handler; Metadata metadata; metadata = new Metadata(); handler = new BodyContentHandler(); p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context); assertEquals("Fell back!", handler.toString()); assertEquals(1, usedParsers.length); assertEquals(DummyParser.class.getName(), usedParsers[0]); metadata = new Metadata(); handler = new BodyContentHandler(); p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context); assertEquals("Fell back!", handler.toString()); assertNotNull(metadata.get(ParserUtils.EMBEDDED_EXCEPTION)); assertNotNull(metadata.get(ParserUtils.EMBEDDED_PARSER)); assertEquals(ErrorParser.class.getName(), metadata.get(ParserUtils.EMBEDDED_PARSER)); metadata = new Metadata(); handler = new BodyContentHandler(); p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context); assertEquals("Fell back!", handler.toString());
// Test fragment: builds a ToFileHandler from an SBContentHandlerFactory and target.toFile(),
// creates a ForkParser from ForkParserTest's class loader and `wrapper`, and parses `is`
// with it using a fresh Metadata and ParseContext.
// NOTE(review): `target`, `wrapper`, `is` and `forkParser` are declared outside this snippet,
// and the trailing "} finally {" has no visible matching "try"; presumably lines were
// dropped. Confirm against the full test.
@Test public void testToFileHandler() throws Exception { ToFileHandler toFileHandler = new ToFileHandler(new SBContentHandlerFactory(), target.toFile()); forkParser = new ForkParser(ForkParserTest.class.getClassLoader(), wrapper); Metadata m = new Metadata(); ParseContext context = new ParseContext(); forkParser.parse(is, toFileHandler, m, context); } finally {
@Test public void testExecuteThread() throws Exception { ParseContext context = new ParseContext(); Future result = ConcurrentUtils.execute(context, new Runnable() { @Override public void run() { //Do nothing } }); assertNull(result.get()); }
/**
 * Opens the resource at {@code url}, parses it with a {@link PDFParser}, and returns the
 * {@link Metadata} object that was passed to the parser.
 *
 * @param url location of the document to parse
 * @return the metadata instance handed to {@code PDFParser.parse}
 * @throws IOException if the stream cannot be opened, read or closed
 * @throws SAXException if the parser reports a SAX error
 * @throws TikaException if the parser reports a Tika error
 */
public static Metadata getMet(URL url) throws IOException, SAXException, TikaException {
    Metadata met = new Metadata();
    PDFParser parser = new PDFParser();
    // The original passed url.openStream() straight to parse() and never closed it.
    // try-with-resources closes the stream on success and on failure. The type is
    // fully qualified so this block needs no additional import.
    try (java.io.InputStream stream = url.openStream()) {
        parser.parse(stream, new BodyContentHandler(), met, new ParseContext());
    }
    return met;
}
// Give the parser an empty parser map (presumably so that only type detection
// runs; confirm against the enclosing code).
parser.setParsers(new HashMap<MediaType, Parser>());
Metadata metadata = new Metadata();
// Record the file name in the metadata before parsing.
metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, file.getName());
try {
    parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
} finally {
    // Previously close() ran only after a successful parse, so an exception
    // from parse() left the stream open.
    stream.close();
}
String mimeType = metadata.get(HttpHeaders.CONTENT_TYPE);
System.out.println(mimeType);
// Fragment: the first expression completes a statement that begins outside this snippet
// (presumably the declaration of `handler`, given the calls on it below; confirm).
// Registers `parser` in a fresh ParseContext under Parser.class, then parses `stream` into
// a BodyContentHandler wrapping `handler`. On SAXException it consults
// handler.isWriteLimitReached(e); the body of that branch is cut off here.
new WriteOutContentHandler(maxStringLength); try { ParseContext context = new ParseContext(); context.set(Parser.class, parser); parser.parse( stream, new BodyContentHandler(handler), metadata, context); } catch (SAXException e) { if (!handler.isWriteLimitReached(e)) {
/**
 * Parses the named file with an {@link AutoDetectParser} built from {@code tikaConfig}
 * and returns the text collected by a {@link BodyContentHandler}. A progress line naming
 * the file is printed to {@code System.out} first.
 *
 * @param filename path of the file to parse
 * @param tikaConfig configuration used to construct the parser
 * @param metadata metadata object passed both to {@code TikaInputStream.get} and to the parser
 * @return the handler's string content after parsing
 * @throws Exception if the file cannot be opened or parsing fails
 */
public static String parseUsingAutoDetect(String filename, TikaConfig tikaConfig,
                                          Metadata metadata) throws Exception {
    System.out.println("Handling using AutoDetectParser: [" + filename + "]");

    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    ContentHandler handler = new BodyContentHandler();
    // The original never closed the TikaInputStream; try-with-resources closes it
    // whether or not parse() throws.
    try (TikaInputStream stream = TikaInputStream.get(new File(filename), metadata)) {
        parser.parse(stream, handler, metadata, new ParseContext());
    }
    return handler.toString();
}
/**
 * Same check as the plain hello-world fork test, but the forked parser is a
 * {@code ForkTestParser.ForkTestParserAccessingPackage}: an empty stream is parsed and the
 * trimmed output must be "Hello, World!" with content type "text/plain".
 */
@Test
public void testPackageCanBeAccessed() throws Exception {
    ClassLoader loader = ForkParserTest.class.getClassLoader();
    Parser packageAccessingParser = new ForkTestParser.ForkTestParserAccessingPackage();
    try (ForkParser forked = new ForkParser(loader, packageAccessingParser)) {
        ContentHandler body = new BodyContentHandler();
        Metadata meta = new Metadata();
        InputStream emptyInput = new ByteArrayInputStream(new byte[0]);

        forked.parse(emptyInput, body, meta, new ParseContext());

        assertEquals("Hello, World!", body.toString().trim());
        assertEquals("text/plain", meta.get(Metadata.CONTENT_TYPE));
    }
}
/**
 * Builds a {@link CompositeParser} from three {@code EmptyParser} subclasses, two
 * claiming {@code text/plain} and one claiming {@code application/octet-stream}, and
 * checks that {@code findDuplicateParsers} reports only {@code text/plain}, listing the
 * two text parsers in the order they were passed in.
 */
@Test
@SuppressWarnings("serial")
public void testFindDuplicateParsers() {
    Parser firstText = new EmptyParser() {
        @Override
        public Set<MediaType> getSupportedTypes(ParseContext context) {
            return Collections.singleton(MediaType.TEXT_PLAIN);
        }
    };
    Parser secondText = new EmptyParser() {
        @Override
        public Set<MediaType> getSupportedTypes(ParseContext context) {
            return Collections.singleton(MediaType.TEXT_PLAIN);
        }
    };
    Parser binary = new EmptyParser() {
        @Override
        public Set<MediaType> getSupportedTypes(ParseContext context) {
            return Collections.singleton(MediaType.OCTET_STREAM);
        }
    };

    CompositeParser composite = new CompositeParser(
            MediaTypeRegistry.getDefaultRegistry(), firstText, secondText, binary);
    Map<MediaType, List<Parser>> clashes =
            composite.findDuplicateParsers(new ParseContext());

    assertEquals(1, clashes.size());

    List<Parser> textParsers = clashes.get(MediaType.TEXT_PLAIN);
    assertNotNull(textParsers);
    assertEquals(2, textParsers.size());
    assertEquals(firstText, textParsers.get(0));
    assertEquals(secondText, textParsers.get(1));
}
// Test fragment: the method's closing brace is not part of this snippet.
// Runs the parser `p` twice over the same five-byte input with a shared ParseContext and a
// fresh Metadata/BodyContentHandler per run. The first run expects handler text
// "Fell back 1!" with metadata T1 and TBoth both "Test1"; the second expects
// "Fell back 1!Fell back 2!" with T1 = "Test1", T2 = "Test2" and TBoth = "Test1".
// NOTE(review): `p` is defined outside this snippet, and the two stray
// "pContent1, pContent2, pNothing);" tails look like the remains of statements whose
// beginnings were dropped (presumably constructing `p`). Confirm against the full test.
@Test public void testSupplemental() throws Exception { ParseContext context = new ParseContext(); BodyContentHandler handler; Metadata metadata; metadata = new Metadata(); handler = new BodyContentHandler(); p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context); assertEquals("Fell back 1!", handler.toString()); assertEquals("Test1", metadata.get("T1")); assertEquals("Test1", metadata.get("TBoth")); pContent1, pContent2, pNothing); metadata = new Metadata(); handler = new BodyContentHandler(); p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context); assertEquals("Fell back 1!Fell back 2!", handler.toString()); assertEquals("Test1", metadata.get("T1")); assertEquals("Test2", metadata.get("T2")); assertEquals("Test1", metadata.get("TBoth")); pContent1, pContent2, pNothing);
@Test public void testExecuteExecutor() throws Exception { TikaConfig config = TikaConfig.getDefaultConfig(); ParseContext context = new ParseContext(); context.set(ExecutorService.class, config.getExecutorService()); Future result = ConcurrentUtils.execute(context, new Runnable() { @Override public void run() { //Do nothing } }); assertNull(result.get()); }
/**
 * Parses an empty stream with an {@link AutoDetectParser}, feeding the SAX events to a
 * {@link TeeContentHandler} that combines a {@link BodyContentHandler} writing to the
 * named file with a {@link LinkContentHandler}.
 *
 * @param filename path of the output file the body handler writes to
 * @throws Exception if the file cannot be opened or parsing fails
 */
public static void testTeeContentHandler(String filename) throws Exception {
    Parser autoParser = new AutoDetectParser();
    LinkContentHandler links = new LinkContentHandler();

    try (OutputStream fileOut = new FileOutputStream(new File(filename))) {
        ContentHandler tee = new TeeContentHandler(new BodyContentHandler(fileOut), links);
        autoParser.parse(
                new ByteArrayInputStream(new byte[0]),
                tee,
                new Metadata(),
                new ParseContext());
    }
}
/**
 * Runs an {@link HtmlParser} over an empty stream, with a {@link DefaultHandler}
 * receiving the events and fresh {@link Metadata} and {@link ParseContext} objects.
 *
 * @throws Exception if parsing fails
 */
public static void useHtmlParser() throws Exception {
    Parser htmlParser = new HtmlParser();
    htmlParser.parse(
            new ByteArrayInputStream(new byte[0]),
            new DefaultHandler(),
            new Metadata(),
            new ParseContext());
}
/**
 * Demonstrates extracting content from a document together with all documents embedded
 * in it. What makes this work is registering a {@link Parser} in the
 * {@link ParseContext} before parsing.
 *
 * @return content, including from embedded documents
 * @throws IOException
 * @throws SAXException
 * @throws TikaException
 */
public String parseEmbeddedExample() throws IOException, SAXException, TikaException {
    AutoDetectParser autoParser = new AutoDetectParser();

    // Register the parser under Parser.class in the context passed to parse().
    ParseContext ctx = new ParseContext();
    ctx.set(Parser.class, autoParser);

    BodyContentHandler body = new BodyContentHandler();
    Metadata meta = new Metadata();
    try (InputStream docx =
                 ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx")) {
        autoParser.parse(docx, body, meta, ctx);
        String extracted = body.toString();
        return extracted;
    }
}
// Parse `input` with an AutoDetectParser, recording the file name in the metadata first.
Metadata metadata = new Metadata();
metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
ParseContext context = new ParseContext();
Parser parser = new AutoDetectParser();
// Pass the context declared above; the original created it and then handed
// parse() a second, separate ParseContext, leaving this one unused.
parser.parse(input, textHandler, metadata, context);
// Constant-first equals avoids a NullPointerException when metadata.get returns null.
if ("application/pdf".equals(metadata.get(CONTENT_TYPE))) {
    // Do something special with the PDF metadata here
}