org.apache.tika.parser.Parser.parse java code examples

Refine search

/**
 * Parses the given bytes with {@code AUTO_DETECT_PARSER}, serializes the
 * resulting SAX events into an in-memory buffer through a transformer
 * handler, and stores the decoded text in {@code this.html}.
 *
 * <p>Any exception raised while building the handler, parsing, or decoding
 * the buffer propagates to the caller unchanged.
 *
 * @param data raw document bytes to parse
 * @throws TransformerConfigurationException propagated from the handler setup
 * @throws TikaException propagated from the parse call
 * @throws SAXException propagated from the parse call
 * @throws IOException propagated from the parse call or from decoding the output
 */
public void setBinaryContent(byte[] data)
      throws TransformerConfigurationException, TikaException, SAXException, IOException {
  InputStream inputStream = new ByteArrayInputStream(data);
  ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
  TransformerHandler handler =
    getTransformerHandler(outputStream, DEFAULT_OUTPUT_FORMAT, DEFAULT_ENCODING);
  AUTO_DETECT_PARSER.parse(inputStream, handler, new Metadata(), context);
  // Hack: delete every occurrence of the XHTML namespace URI from the
  // serialized output. The original note described this as removing Tika's
  // inserted DocType; what the replace literally strips is the URI string.
  this.html = new String(outputStream.toByteArray(), DEFAULT_ENCODING).replace(
    "http://www.w3.org/1999/xhtml", "");
}

/**
 * Minimal usage example: runs an {@code HtmlParser} over an empty in-memory
 * stream, passing a plain {@code DefaultHandler}, a fresh {@code Metadata}
 * and a fresh {@code ParseContext}.
 *
 * @throws Exception whatever the parse call raises
 */
public static void useHtmlParser() throws Exception {
  InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
  ContentHandler sink = new DefaultHandler();
  Metadata parsedMetadata = new Metadata();
  ParseContext parseContext = new ParseContext();

  Parser htmlParser = new HtmlParser();
  htmlParser.parse(emptyInput, sink, parsedMetadata, parseContext);
}

/**
 * Parses one page held in memory with the supplied HTML parser. Output is
 * sent to {@code xhtml} through a {@code BodyContentHandler} wrapped in an
 * {@code EmbeddedContentHandler}.
 *
 * <p>A {@code SAXException} is rethrown wrapped in a {@code RuntimeException};
 * an {@code IOException} is deliberately ignored.
 *
 * @param byteObject page bytes to parse
 * @param htmlParser parser to run over the bytes
 * @param xhtml      handler that receives the page output
 * @param context    parse context passed through to the parser
 * @throws TikaException propagated from the parser
 */
private void parsePage(byte[] byteObject, Parser htmlParser,
            ContentHandler xhtml, ParseContext context) throws TikaException {
  final Metadata pageMetadata = new Metadata();
  final ContentHandler bodyOnly = new BodyContentHandler(xhtml);
  final ContentHandler embedded = new EmbeddedContentHandler(bodyOnly);
  try {
    htmlParser.parse(new ByteArrayInputStream(byteObject), embedded, pageMetadata, context);
  } catch (SAXException saxFailure) {
    throw new RuntimeException(saxFailure);
  } catch (IOException ignored) {
    // Intentionally swallowed: per the original note this is a pushback
    // overflow coming from tagsoup, and the page is simply abandoned.
  }
}

/**
 * Usage example for {@code TeeContentHandler}: parses an empty in-memory
 * stream with an {@code AutoDetectParser}, handing the parser a tee built
 * from a {@code BodyContentHandler} over a file output stream and a
 * {@code LinkContentHandler}.
 *
 * @param filename path of the file the body handler writes to
 * @throws Exception whatever opening the file or parsing raises
 */
public static void testTeeContentHandler(String filename) throws Exception {
  InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
  Metadata parsedMetadata = new Metadata();
  ParseContext parseContext = new ParseContext();
  Parser autoDetect = new AutoDetectParser();
  LinkContentHandler links = new LinkContentHandler();

  // The output stream is closed automatically when the try block exits.
  try (OutputStream fileSink = new FileOutputStream(new File(filename))) {
    ContentHandler bodyWriter = new BodyContentHandler(fileSink);
    ContentHandler tee = new TeeContentHandler(bodyWriter, links);
    autoDetect.parse(emptyInput, tee, parsedMetadata, parseContext);
  }
}

/**
 * Checks that the IGNORE handler type yields a plain {@code DefaultHandler},
 * once with a write limit argument of -1 and once with a limit of 5.
 */
@Test
public void testIgnore() throws Exception {
  final String defaultHandlerName = "org.xml.sax.helpers.DefaultHandler";

  // First pass: limit argument -1.
  Parser mock = new MockParser(OVER_DEFAULT);
  BasicContentHandlerFactory firstFactory =
      new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1);
  ContentHandler ignoring = firstFactory.getNewContentHandler();
  assertTrue(ignoring instanceof DefaultHandler);
  mock.parse(null, ignoring, null, null);
  // Unfortunately DefaultHandler.toString() does not return "", so the
  // check is made against the class name instead.
  assertContains(defaultHandlerName, ignoring.toString());

  // Second pass: tests that no write limit exception is thrown.
  mock = new MockParser(100);
  BasicContentHandlerFactory limitedFactory =
      new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, 5);
  ignoring = limitedFactory.getNewContentHandler();
  assertTrue(ignoring instanceof DefaultHandler);
  mock.parse(null, ignoring, null, null);
  assertContains(defaultHandlerName, ignoring.toString());
}

@Test
public void testXML() throws Exception {
  Parser p = new MockParser(OVER_DEFAULT);
  p.parse(null, handler, new Metadata(), null);
  String extracted = handler.toString();
  assertContains("<head><title>This is the title", extracted);
  handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, ENCODING);
  assertTrue(handler instanceof ToXMLContentHandler);
  p.parse(null, handler, null, null);

  new WriteOutContentHandler(maxLength);
try {
  ParseContext context = new ParseContext();
  context.set(Parser.class, parser);
  parser.parse(
         stream, new BodyContentHandler(handler), metadata, context);
} catch (SAXException e) {
  if (!handler.isWriteLimitReached(e)) {

/**
 * Parses the binary stream and writes its text content to the write end
 * of the pipe, then closes the input stream and the writer. Nothing is
 * thrown from this method: a failure from parsing (including the one caused
 * when the read end is closed unexpectedly) is stored in {@code throwable},
 * and a failure from either close call is stored only when no earlier
 * failure has been recorded.
 */
public void run() {
  try {
    parser.parse(stream, new BodyContentHandler(writer), metadata, context);
  } catch (Throwable parseFailure) {
    // Stored unconditionally, exactly as the parse step always did.
    throwable = parseFailure;
  }
  try {
    stream.close();
  } catch (Throwable closeFailure) {
    keepFirstFailure(closeFailure);
  }
  try {
    writer.close();
  } catch (Throwable closeFailure) {
    keepFirstFailure(closeFailure);
  }
}

/** Stores the given failure in {@code throwable} unless one is already recorded. */
private void keepFirstFailure(Throwable failure) {
  if (throwable == null) {
    throwable = failure;
  }
}

/**
 * Parses {@code input} with the given parser into a {@code ToXMLContentHandler}
 * and returns the handler's string output together with the metadata.
 * The input stream is always closed before this method returns.
 *
 * <p>The stream is managed with try-with-resources, so a failure while
 * closing no longer replaces an exception thrown by the parse call (it is
 * attached as a suppressed exception instead), and a {@code null} input no
 * longer triggers a {@code NullPointerException} on close.
 *
 * @param input    stream to parse; closed by this method
 * @param parser   parser to run
 * @param metadata metadata object handed to the parser and returned in the result
 * @param context  parse context, or {@code null} to use a fresh {@code ParseContext}
 * @return the XML text produced by the handler plus the metadata
 * @throws Exception whatever parsing or closing the stream raises
 */
protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata, ParseContext context) throws Exception {
 ParseContext effectiveContext = (context == null) ? new ParseContext() : context;
 try (InputStream in = input) {
   ContentHandler handler = new ToXMLContentHandler();
   parser.parse(in, handler, metadata, effectiveContext);
   return new XMLResult(handler.toString(), metadata);
 }
}

  /**
   * Parses the file at {@code path} with an {@code AutoDetectParser} and adds
   * every value stored under the {@code "phonenumbers"} metadata key to
   * {@code phoneNumbers}.
   *
   * @param path file to parse
   * @throws Exception whatever opening the file or parsing raises
   */
  public static void process(Path path) throws Exception {
    Parser autoDetect = new AutoDetectParser();
    Metadata extracted = new Metadata();
    // The PhoneExtractingContentHandler examines characters for phone numbers
    // before passing them on to the handler it wraps.
    BodyContentHandler bodyText = new BodyContentHandler();
    PhoneExtractingContentHandler phoneScanner =
        new PhoneExtractingContentHandler(bodyText, extracted);
    try (InputStream in = new BufferedInputStream(Files.newInputStream(path))) {
      autoDetect.parse(in, phoneScanner, extracted, new ParseContext());
    }
    Collections.addAll(phoneNumbers, extracted.getValues("phonenumbers"));
  }
}

@Test
public void testHTML() throws Exception {
  Parser p = new MockParser(OVER_DEFAULT);
  p.parse(null, handler, null, null);
  String extracted = handler.toString();
  assertContains("<head><title>This is the title", extracted);
  handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, ENCODING);
  assertTrue(handler instanceof ToHTMLContentHandler);
  p.parse(null, handler, null, null);
  assertContains("This is the title", os.toByteArray());
  assertContains("aaaaaaaaaa", os.toByteArray());

/**
 * Usage example: opens the named file as a {@code FileInputStream} and
 * parses it with an {@code AutoDetectParser}, passing a plain
 * {@code DefaultHandler}. The stream is closed when parsing finishes or fails.
 *
 * @param filename path of the file to parse
 * @throws Exception whatever opening the file or parsing raises
 */
public static void parseFileInputStream(String filename) throws Exception {
  Parser autoDetect = new AutoDetectParser();
  ContentHandler sink = new DefaultHandler();
  Metadata parsedMetadata = new Metadata();
  ParseContext parseContext = new ParseContext();

  try (InputStream fileInput = new FileInputStream(new File(filename))) {
    autoDetect.parse(fileInput, sink, parsedMetadata, parseContext);
  }
}

  parser.parse(
      new ByteArrayInputStream(part.bytes),
      new EmbeddedContentHandler(new BodyContentHandler(handler)),
      new Metadata(), parseContext
  );
} catch (SAXException | TikaException e) {

  new WriteOutContentHandler(maxStringLength);
try {
  ParseContext context = new ParseContext();
  context.set(Parser.class, parser);
  parser.parse(
      stream, new BodyContentHandler(handler), metadata, context);
} catch (SAXException e) {
  if (!handler.isWriteLimitReached(e)) {

/**
 * Parses the given bytes and returns whatever the text content handler
 * wrote into an in-memory buffer.
 *
 * <p>The {@code TikaInputStream} is opened in a try-with-resources block so
 * it is closed whether parsing succeeds or fails; previously it was never
 * closed.
 *
 * @param file raw document bytes
 * @return the contents of the text buffer after parsing
 * @throws Exception whatever parsing or closing the stream raises
 */
public static String handleStreamContent(byte[] file)
    throws Exception {
  Metadata md = new Metadata();
  StringWriter textBuffer = new StringWriter();
  ContentHandler handler = new TeeContentHandler(
      getTextContentHandler(textBuffer)
  );
  try (TikaInputStream input = TikaInputStream.get(file, md)) {
    parser.parse(input, handler, md, context);
  }
  return textBuffer.toString();
}

xhtml.startDocument();
ContentHandler childHandler = new EmbeddedContentHandler(
   new BodyContentHandler(xhtml));
    meta.parse(zip, new DefaultHandler(), metadata, context);
  } else if (entry.getName().endsWith(".opf")) {
    meta.parse(zip, new DefaultHandler(), metadata, context);
  } else if (entry.getName().endsWith(".htm") || 
          entry.getName().endsWith(".html") || 
        entry.getName().endsWith(".xhtml")) {
    content.parse(zip, childHandler, metadata, context);

 /**
  * Runs the parser returned by {@code getParser()} over {@code stream} with a
  * fresh {@code ParseContext}.
  *
  * @return always {@code null}
  */
 @Override
 public Void call() throws Exception {
  final ParseContext freshContext = new ParseContext();
  getParser().parse(stream, handler, metadata, freshContext);
  return null;
 }
});

/**
 * Parses {@code is} with the {@code parser} field after registering, in a
 * fresh {@code ParseContext}, both that parser and a
 * {@code MyEmbeddedDocumentExtractor} built from {@code outputDir}.
 *
 * @param is        document stream to parse; not closed here
 * @param outputDir directory handed to the embedded document extractor
 * @throws SAXException  propagated from the parser
 * @throws TikaException propagated from the parser
 * @throws IOException   propagated from the parser
 */
public void extract(InputStream is, Path outputDir) throws SAXException, TikaException, IOException {
  Metadata parsedMetadata = new Metadata();
  ParseContext parseContext = new ParseContext();
  // NOTE(review): -1 presumably disables the handler's write limit — confirm.
  ContentHandler bodyText = new BodyContentHandler(-1);

  // Register the parser before building the extractor, which receives the context.
  parseContext.set(Parser.class, parser);
  EmbeddedDocumentExtractor embeddedExtractor =
      new MyEmbeddedDocumentExtractor(outputDir, parseContext);
  parseContext.set(EmbeddedDocumentExtractor.class, embeddedExtractor);

  parser.parse(is, bodyText, parsedMetadata, parseContext);
}

@Test
public void testBody() throws Exception {
  Parser p = new MockParser(OVER_DEFAULT);
  p.parse(null, handler, null, null);
  String extracted = handler.toString();
  assertNotContains("title", extracted);
  handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, ENCODING);
  assertTrue(handler instanceof BodyContentHandler);
  p.parse(null, handler, null, null);
  assertNotContains("title", os.toByteArray());
  assertContains("aaaaaaaaaa", os.toByteArray());

/**
 * Usage example: opens the given URL, treats the response as gzip-compressed
 * data, and parses the decompressed stream with an {@code AutoDetectParser},
 * passing a plain {@code DefaultHandler}.
 *
 * <p>The raw URL stream and the gzip wrapper are declared as separate
 * resources, so the URL stream is still closed when the
 * {@code GZIPInputStream} constructor fails (for example when the response
 * is not gzip data).
 *
 * @param address URL of the gzip-compressed document
 * @throws Exception whatever opening the URL, decompressing, or parsing raises
 */
public static void parseURLStream(String address) throws Exception {
  Parser parser = new AutoDetectParser();
  ContentHandler handler = new DefaultHandler();
  Metadata metadata = new Metadata();
  ParseContext context = new ParseContext();
  try (InputStream raw = new URL(address).openStream();
       InputStream stream = new GZIPInputStream(raw)) {
    parser.parse(stream, handler, metadata, context);
  }
}

Javadoc

Parses a document stream into a sequence of XHTML SAX events. Fills in related document metadata in the given metadata object.

The given document stream is consumed but not closed by this method. The responsibility to close the stream remains on the caller.

Information about the parsing context can be passed in the context parameter. See the parser implementations for the kinds of context information they expect.

Popular methods of Parser

getSupportedTypes
Returns the set of media types supported by this parser when used with the given parse context.

Popular in Java

Reading from database using SQL prepared statement
getApplicationContext (Context)
startActivity (Activity)
scheduleAtFixedRate (Timer)
InetAddress (java.net)
An Internet Protocol (IP) address. This can be either an IPv4 address or an IPv6 address, and in pra
Charset (java.nio.charset)
A charset is a named mapping between Unicode characters and byte sequences. Every Charset can decode
Dictionary (java.util)
Note: Do not use this class since it is obsolete. Please use the Map interface for new implementatio
Container (java.awt)
A generic Abstract Window Toolkit(AWT) container object is a component that can contain other AWT co
Font (java.awt)
The Font class represents fonts, which are used to render text in a visible way. A font provides the
Window (java.awt)
A Window object is a top-level window with no borders and no menubar. The default layout for a windo
Top Sublime Text plugins

How to use the parse method in org.apache.tika.parser.Parser

Best Java code snippets using org.apache.tika.parser.Parser.parse (Showing top 20 results out of 513)

Refine search

How to use
parse
method
in
org.apache.tika.parser.Parser