// Parse myfile.html with HtmlParser into a BodyContentHandler and read the
// handler's accumulated text back through toString().
ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
// try-with-resources guarantees the file handle is released even when
// parsing throws; previously the stream was never closed.
try (InputStream input = new FileInputStream("myfile.html")) {
    new HtmlParser().parse(input, handler, metadata, new ParseContext());
}
String plainText = handler.toString();
private Map<String, String> tika_parse(InputStream sourceStream, String prefix, Integer maxAttribs, Integer maxAttribLen) throws IOException, TikaException, SAXException { final Metadata metadata = new Metadata(); final TikaInputStream tikaInputStream = TikaInputStream.get(sourceStream); try { dataBuilder.append(metadata.get(key));
/**
 * Parses an empty byte stream through a ForkParser wrapping a ForkTestParser
 * and checks both the trimmed handler text and the content type recorded in
 * the metadata.
 */
@Test
public void testHelloWorld() throws Exception {
    try (ForkParser forkParser =
            new ForkParser(ForkParserTest.class.getClassLoader(), new ForkTestParser())) {
        Metadata meta = new Metadata();
        ContentHandler body = new BodyContentHandler();
        InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
        ParseContext parseContext = new ParseContext();

        forkParser.parse(emptyInput, body, meta, parseContext);

        String extractedText = body.toString().trim();
        assertEquals("Hello, World!", extractedText);

        String contentType = meta.get(Metadata.CONTENT_TYPE);
        assertEquals("text/plain", contentType);
    }
}
/**
 * Verifies that content enclosed in XML comment delimiters which is not
 * actually XML (javascript files, for example) is detected as plain text.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-426">TIKA-426</a>
 */
@Test
public void testNotXML() throws IOException {
    byte[] commentOnly = "<!-- test -->".getBytes(UTF_8);
    ByteArrayInputStream commentStream = new ByteArrayInputStream(commentOnly);

    assertEquals(MediaType.TEXT_PLAIN, mimeTypes.detect(commentStream, new Metadata()));
}
/**
 * Two Metadata instances that share the values of "key" but hold different
 * values for "key2" must not compare equal.
 */
@Test
public void testNotEquals() {
    Metadata first = new Metadata();
    first.add("key", "value1");
    first.add("key", "value2");
    first.add("key2", "value12");

    Metadata second = new Metadata();
    second.add("key", "value1");
    second.add("key", "value2");
    second.add("key2", "value22");

    boolean same = first.equals(second);
    assertFalse(same);
}
/**
 * Asserts that the detector under test reports {@code type} for the given bytes.
 * An IOException raised during detection fails the test.
 *
 * @param data bytes handed to the detector as a stream
 * @param type media type the detector is expected to return
 */
private void detect(byte[] data, MediaType type) {
    InputStream bytes = new ByteArrayInputStream(data);
    try {
        assertEquals(type, detector.detect(bytes, new Metadata()));
    } catch (IOException unexpected) {
        fail("Unexpected exception from ZeroSizeFileDetector");
    }
}
@Test public void testToFileHandler() throws Exception { ToFileHandler toFileHandler = new ToFileHandler(new SBContentHandlerFactory(), target.toFile()); forkParser = new ForkParser(ForkParserTest.class.getClassLoader(), wrapper); Metadata m = new Metadata(); ParseContext context = new ParseContext(); forkParser.parse(is, toFileHandler, m, context); } finally {
/**
 * Runs a {@code PDFParser} over the content at {@code url} and returns the
 * metadata the parser populated. The extracted text is sent to a throwaway
 * {@code BodyContentHandler} and is not returned.
 *
 * @param url location whose stream is opened and parsed
 * @return the Metadata instance passed to the parser
 * @throws IOException if the stream cannot be opened, read, or closed
 * @throws SAXException propagated from the parser
 * @throws TikaException propagated from the parser
 */
public static Metadata getMet(URL url) throws IOException, SAXException, TikaException {
    Metadata met = new Metadata();
    PDFParser parser = new PDFParser();
    // The stream is opened here, so it is closed here: previously the
    // connection from url.openStream() was leaked on every call.
    try (java.io.InputStream in = url.openStream()) {
        parser.parse(in, new BodyContentHandler(), met, new ParseContext());
    }
    return met;
}
// Parse an mp3 file with AutoDetectParser, then print the handler's text and
// two metadata date fields.
Parser parser = new AutoDetectParser();
BodyContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
// try-with-resources closes the file even if parsing fails; the original
// never closed it.
try (InputStream is = new FileInputStream("/home/rahul/Music/03 - I Like Your Music.mp3")) {
    parser.parse(is, handler, metadata, new ParseContext());
}
// The text gets its own name: declaring a second local called "handler"
// (as the original did) does not compile.
String handlerText = handler.toString();
System.out.println("Handler data: " + handlerText);
System.out.println(metadata.get(Metadata.CREATION_DATE));
System.out.println(metadata.get(Metadata.LAST_MODIFIED));
// Replace the parser's parser map with an empty one before parsing.
// NOTE(review): presumably this leaves only type detection to fill in
// CONTENT_TYPE — confirm against the parser's declaration.
parser.setParsers(new HashMap<MediaType, Parser>());
Metadata metadata = new Metadata();
// Pass the file name along with the stream via the resource-name metadata key.
metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, file.getName());
try {
    parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
} finally {
    // Close in finally so the stream is released even when parse() throws;
    // previously close() was skipped on failure.
    stream.close();
}
String mimeType = metadata.get(HttpHeaders.CONTENT_TYPE);
System.out.println(mimeType);
public static void process(Path path) throws Exception { Parser parser = new AutoDetectParser(); Metadata metadata = new Metadata(); // The PhoneExtractingContentHandler will examine any characters for phone numbers before passing them // to the underlying Handler. PhoneExtractingContentHandler handler = new PhoneExtractingContentHandler(new BodyContentHandler(), metadata); try (InputStream stream = new BufferedInputStream(Files.newInputStream(path))) { parser.parse(stream, handler, metadata, new ParseContext()); } String[] numbers = metadata.getValues("phonenumbers"); Collections.addAll(phoneNumbers, numbers); } }
/**
 * Feeds standard input to a DirListParser and prints how many "Filename"
 * values were recorded, followed by the "NumExecutables" metadata value.
 *
 * @param args unused
 */
public static void main(String[] args) throws IOException, SAXException, TikaException {
    DirListParser dirListParser = new DirListParser();
    Metadata listing = new Metadata();
    dirListParser.parse(System.in, new BodyContentHandler(), listing);

    int fileCount = listing.getValues("Filename").length;
    System.out.println("Num files: " + fileCount);

    String executableCount = listing.get("NumExecutables");
    System.out.println("Num executables: " + executableCount);
}
String name = md.get(TikaCoreProperties.RESOURCE_NAME_KEY); if (name != null && name.length() > 0) { setTitle("Apache Tika: " + name); -1); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser); wrapper.parse(input, recursiveParserWrapperHandler, new Metadata(), new ParseContext()); StringWriter jsonBuffer = new StringWriter(); JsonMetadataList.setPrettyPrinting(true);
private void parsePage(byte[] byteObject, Parser htmlParser, ContentHandler xhtml, ParseContext context) throws TikaException {// throws IOException InputStream stream = null; Metadata metadata = new Metadata(); ContentHandler handler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));// -1 try { stream = new ByteArrayInputStream(byteObject); htmlParser.parse(stream, handler, metadata, context); } catch (SAXException e) { throw new RuntimeException(e); } catch (IOException e) { // Pushback overflow from tagsoup } }
private void handleInlineBodyPart(BodyContents part) throws MimeException, IOException { String contentType = part.metadata.get(Metadata.CONTENT_TYPE); Parser parser = null; if (MediaType.TEXT_HTML.toString().equalsIgnoreCase(contentType)) { parser.parse( new ByteArrayInputStream(part.bytes), new EmbeddedContentHandler(new BodyContentHandler(handler)), new Metadata(), parseContext ); } catch (SAXException | TikaException e) {
/**
 * Extracts the text of {@code file} through the Tika facade and packages it,
 * together with the resource-name and creation-date metadata values, into a
 * TrecDocument.
 *
 * @param file document to read
 * @return a TrecDocument built from the resource name, the text, and the created date
 * @throws FileNotFoundException if the file cannot be opened
 * @throws IOException propagated from text extraction
 * @throws TikaException propagated from text extraction
 */
public TrecDocument summarize(File file) throws FileNotFoundException, IOException, TikaException {
    Tika facade = new Tika();
    Metadata extracted = new Metadata();
    String text = facade.parseToString(new FileInputStream(file), extracted);

    String resourceName = extracted.get(TikaCoreProperties.RESOURCE_NAME_KEY);
    return new TrecDocument(resourceName, text, extracted.getDate(TikaCoreProperties.CREATED));
}
/**
 * Opens and immediately closes a TikaInputStream for the test.txt resource,
 * then checks that the metadata carries the resource name and a content
 * length equal to the file's size on disk.
 */
@Test
public void testGetMetadata() throws Exception {
    URL resource = TikaInputStreamTest.class.getResource("test.txt");
    Metadata collected = new Metadata();
    TikaInputStream.get(resource, collected).close();

    assertEquals("test.txt", collected.get(TikaCoreProperties.RESOURCE_NAME_KEY));

    long sizeOnDisk = Files.size(Paths.get(resource.toURI()));
    assertEquals(Long.toString(sizeOnDisk), collected.get(Metadata.CONTENT_LENGTH));
}
/**
 * A null input stream must be detected as {@code MediaType.OCTET_STREAM}.
 */
@Test
public void testDetectNull() throws Exception {
    Metadata emptyMetadata = new Metadata();
    assertEquals(MediaType.OCTET_STREAM, detector.detect(null, emptyMetadata));
}