public void setBinaryContent(byte[] data) throws TransformerConfigurationException, TikaException, SAXException, IOException { InputStream inputStream = new ByteArrayInputStream(data); ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); try { TransformerHandler handler = getTransformerHandler(outputStream, DEFAULT_OUTPUT_FORMAT, DEFAULT_ENCODING); AUTO_DETECT_PARSER.parse(inputStream, handler, new Metadata(), context); // Hacking the following line to remove Tika's inserted DocType this.html = new String(outputStream.toByteArray(), DEFAULT_ENCODING).replace( "http://www.w3.org/1999/xhtml", ""); } catch (TransformerConfigurationException | TikaException | SAXException | IOException | RuntimeException e) { throw e; } }
/**
 * Runs an {@code HtmlParser} over an empty byte stream, sending the events to a
 * plain {@code DefaultHandler} and using a fresh {@code Metadata} and
 * {@code ParseContext}.
 *
 * @throws Exception whatever the parser throws
 */
public static void useHtmlParser() throws Exception {
    byte[] noBytes = new byte[0];
    InputStream emptyInput = new ByteArrayInputStream(noBytes);

    ContentHandler sink = new DefaultHandler();
    Metadata meta = new Metadata();
    ParseContext parseContext = new ParseContext();

    Parser htmlParser = new HtmlParser();
    htmlParser.parse(emptyInput, sink, meta, parseContext);
}
private void parsePage(byte[] byteObject, Parser htmlParser, ContentHandler xhtml, ParseContext context) throws TikaException {// throws IOException InputStream stream = null; Metadata metadata = new Metadata(); ContentHandler handler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));// -1 try { stream = new ByteArrayInputStream(byteObject); htmlParser.parse(stream, handler, metadata, context); } catch (SAXException e) { throw new RuntimeException(e); } catch (IOException e) { // Pushback overflow from tagsoup } }
/**
 * Parses an empty byte stream with an {@code AutoDetectParser}, sending the
 * events through a {@code TeeContentHandler} to two handlers: a
 * {@code BodyContentHandler} writing to the named file and a
 * {@code LinkContentHandler}.
 *
 * @param filename path of the file opened for output
 * @throws Exception on any I/O or parse failure
 */
public static void testTeeContentHandler(String filename) throws Exception {
    InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
    Metadata meta = new Metadata();
    ParseContext parseContext = new ParseContext();
    Parser autoParser = new AutoDetectParser();
    LinkContentHandler links = new LinkContentHandler();

    File target = new File(filename);
    try (OutputStream fileSink = new FileOutputStream(target)) {
        BodyContentHandler bodyWriter = new BodyContentHandler(fileSink);
        ContentHandler tee = new TeeContentHandler(bodyWriter, links);
        autoParser.parse(emptyInput, tee, meta, parseContext);
    }
}
@Test public void testIgnore() throws Exception { Parser p = new MockParser(OVER_DEFAULT); ContentHandler handler = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1).getNewContentHandler(); assertTrue(handler instanceof DefaultHandler); p.parse(null, handler, null, null); //unfortunatley, the DefaultHandler does not return "", assertContains("org.xml.sax.helpers.DefaultHandler", handler.toString()); //tests that no write limit exception is thrown p = new MockParser(100); handler = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, 5).getNewContentHandler(); assertTrue(handler instanceof DefaultHandler); p.parse(null, handler, null, null); assertContains("org.xml.sax.helpers.DefaultHandler", handler.toString()); }
@Test public void testXML() throws Exception { Parser p = new MockParser(OVER_DEFAULT); p.parse(null, handler, new Metadata(), null); String extracted = handler.toString(); assertContains("<head><title>This is the title", extracted); handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, ENCODING); assertTrue(handler instanceof ToXMLContentHandler); p.parse(null, handler, null, null);
new WriteOutContentHandler(maxLength); try { ParseContext context = new ParseContext(); context.set(Parser.class, parser); parser.parse( stream, new BodyContentHandler(handler), metadata, context); } catch (SAXException e) { if (!handler.isWriteLimitReached(e)) {
/**
 * Parses the binary stream, sending the text content to the writer through a
 * {@code BodyContentHandler}, then closes the stream and the writer in that
 * order.
 *
 * <p>Nothing is thrown from here. A failure while parsing is stored in
 * {@code throwable}; a failure while closing either resource is stored only if
 * nothing has been stored yet. Both close attempts are always made.
 */
public void run() {
    try {
        parser.parse(stream, new BodyContentHandler(writer), metadata, context);
    } catch (Throwable parseFailure) {
        throwable = parseFailure;
    }

    try {
        stream.close();
    } catch (Throwable closeFailure) {
        recordIfFirst(closeFailure);
    }

    try {
        writer.close();
    } catch (Throwable closeFailure) {
        recordIfFirst(closeFailure);
    }
}

/** Stores {@code failure} in {@code throwable} unless one is already stored. */
private void recordIfFirst(Throwable failure) {
    if (throwable == null) {
        throwable = failure;
    }
}
/**
 * Parses {@code input} with {@code parser} into a {@code ToXMLContentHandler}
 * and returns the handler's string output together with the metadata.
 *
 * <p>The stream is closed when parsing finishes, whether it succeeded or not.
 * try-with-resources is used so that a failure in {@code close()} is attached
 * to the parse exception as a suppressed exception instead of replacing it. A
 * {@code null} input is passed to the parser as-is and is not closed.
 *
 * @param input    stream to parse; closed by this method
 * @param parser   parser to run
 * @param metadata metadata object handed to the parser and placed in the result
 * @param context  parse context; a new empty {@code ParseContext} is used when {@code null}
 * @return the handler's {@code toString()} output paired with {@code metadata}
 * @throws Exception on any parse or I/O failure
 */
protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata,
        ParseContext context) throws Exception {
    ParseContext effectiveContext = (context != null) ? context : new ParseContext();
    try (InputStream in = input) {
        ContentHandler handler = new ToXMLContentHandler();
        parser.parse(in, handler, metadata, effectiveContext);
        return new XMLResult(handler.toString(), metadata);
    }
}
public static void process(Path path) throws Exception { Parser parser = new AutoDetectParser(); Metadata metadata = new Metadata(); // The PhoneExtractingContentHandler will examine any characters for phone numbers before passing them // to the underlying Handler. PhoneExtractingContentHandler handler = new PhoneExtractingContentHandler(new BodyContentHandler(), metadata); try (InputStream stream = new BufferedInputStream(Files.newInputStream(path))) { parser.parse(stream, handler, metadata, new ParseContext()); } String[] numbers = metadata.getValues("phonenumbers"); Collections.addAll(phoneNumbers, numbers); } }
@Test public void testHTML() throws Exception { Parser p = new MockParser(OVER_DEFAULT); p.parse(null, handler, null, null); String extracted = handler.toString(); assertContains("<head><title>This is the title", extracted); handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, ENCODING); assertTrue(handler instanceof ToHTMLContentHandler); p.parse(null, handler, null, null); assertContains("This is the title", os.toByteArray()); assertContains("aaaaaaaaaa", os.toByteArray());
/**
 * Opens the named file and parses it with an {@code AutoDetectParser}, sending
 * the events to a plain {@code DefaultHandler}. The file stream is closed when
 * parsing ends.
 *
 * @param filename path of the file to parse
 * @throws Exception on any I/O or parse failure
 */
public static void parseFileInputStream(String filename) throws Exception {
    Parser autoParser = new AutoDetectParser();
    ContentHandler sink = new DefaultHandler();
    Metadata meta = new Metadata();
    ParseContext parseContext = new ParseContext();

    File source = new File(filename);
    try (InputStream in = new FileInputStream(source)) {
        autoParser.parse(in, sink, meta, parseContext);
    }
}
parser.parse( new ByteArrayInputStream(part.bytes), new EmbeddedContentHandler(new BodyContentHandler(handler)), new Metadata(), parseContext ); } catch (SAXException | TikaException e) {
new WriteOutContentHandler(maxStringLength); try { ParseContext context = new ParseContext(); context.set(Parser.class, parser); parser.parse( stream, new BodyContentHandler(handler), metadata, context); } catch (SAXException e) { if (!handler.isWriteLimitReached(e)) {
/**
 * Parses the given bytes and returns whatever the text content handler wrote
 * to its buffer.
 *
 * <p>The bytes are wrapped in a {@code TikaInputStream}, which is closed once
 * parsing ends, whether it succeeded or failed. The handler is built before the
 * stream is opened, so a failure while building it cannot leave a stream open.
 *
 * @param file raw document bytes
 * @return contents of the text buffer after parsing
 * @throws Exception on any parse or I/O failure, including a failure to close the stream
 */
public static String handleStreamContent(byte[] file) throws Exception {
    Metadata md = new Metadata();
    StringWriter textBuffer = new StringWriter();
    ContentHandler handler = new TeeContentHandler(getTextContentHandler(textBuffer));

    try (TikaInputStream input = TikaInputStream.get(file, md)) {
        parser.parse(input, handler, md, context);
    }
    return textBuffer.toString();
}
xhtml.startDocument(); ContentHandler childHandler = new EmbeddedContentHandler( new BodyContentHandler(xhtml)); meta.parse(zip, new DefaultHandler(), metadata, context); } else if (entry.getName().endsWith(".opf")) { meta.parse(zip, new DefaultHandler(), metadata, context); } else if (entry.getName().endsWith(".htm") || entry.getName().endsWith(".html") || entry.getName().endsWith(".xhtml")) { content.parse(zip, childHandler, metadata, context);
@Override public Void call() throws Exception { getParser().parse(stream, handler, metadata, new ParseContext()); return null; } });
/**
 * Parses {@code is} with this object's {@code parser} after registering two
 * entries in a fresh {@code ParseContext}: the parser itself under
 * {@code Parser.class}, and a {@code MyEmbeddedDocumentExtractor} built from
 * {@code outputDir} and that context under {@code EmbeddedDocumentExtractor.class}.
 *
 * @param is        stream to parse; not closed here
 * @param outputDir directory handed to the embedded-document extractor
 * @throws SAXException  propagated from the parser
 * @throws TikaException propagated from the parser
 * @throws IOException   propagated from the parser
 */
public void extract(InputStream is, Path outputDir)
        throws SAXException, TikaException, IOException {
    Metadata meta = new Metadata();
    ParseContext parseContext = new ParseContext();
    // NOTE(review): -1 presumably disables the handler's write limit -- confirm.
    ContentHandler textSink = new BodyContentHandler(-1);

    parseContext.set(Parser.class, parser);
    EmbeddedDocumentExtractor embeddedExtractor =
            new MyEmbeddedDocumentExtractor(outputDir, parseContext);
    parseContext.set(EmbeddedDocumentExtractor.class, embeddedExtractor);

    parser.parse(is, textSink, meta, parseContext);
}
@Test public void testBody() throws Exception { Parser p = new MockParser(OVER_DEFAULT); p.parse(null, handler, null, null); String extracted = handler.toString(); assertNotContains("title", extracted); handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, ENCODING); assertTrue(handler instanceof BodyContentHandler); p.parse(null, handler, null, null); assertNotContains("title", os.toByteArray()); assertContains("aaaaaaaaaa", os.toByteArray());
/**
 * Opens the given URL, decompresses the response as gzip, and parses it with an
 * {@code AutoDetectParser}, sending the events to a plain {@code DefaultHandler}.
 *
 * <p>The URL stream and the gzip wrapper are declared as separate resources.
 * That way the URL stream is still closed if the {@code GZIPInputStream}
 * constructor throws, for example because the content is not gzip data.
 *
 * @param address URL to fetch
 * @throws Exception on a malformed URL, an I/O failure, or a parse failure
 */
public static void parseURLStream(String address) throws Exception {
    Parser parser = new AutoDetectParser();
    ContentHandler handler = new DefaultHandler();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();
    try (InputStream raw = new URL(address).openStream();
            InputStream stream = new GZIPInputStream(raw)) {
        parser.parse(stream, handler, metadata, context);
    }
}