private void parsePage(byte[] byteObject, Parser htmlParser, ContentHandler xhtml, ParseContext context) throws TikaException {// throws IOException InputStream stream = null; Metadata metadata = new Metadata(); ContentHandler handler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));// -1 try { stream = new ByteArrayInputStream(byteObject); htmlParser.parse(stream, handler, metadata, context); } catch (SAXException e) { throw new RuntimeException(e); } catch (IOException e) { // Pushback overflow from tagsoup } }
ContentHandler childHandler = new EmbeddedContentHandler( new BodyContentHandler(xhtml));
private void handleDocumentRef(String docRef) throws SAXException { //docRef is a path to a FixedDocumentSequence document, // e.g. /Documents/1/FixedDoc.fdoc //relative root is /Documents/1 ..need this Pages... String relativeRoot = null; int i = docRef.lastIndexOf("/"); if (i > 0) { relativeRoot = docRef.substring(0, i); } else { relativeRoot = ""; } String zipPath = (docRef.startsWith("/") ? docRef.substring(1) : docRef); if (pkg instanceof ZipPackage) { try (InputStream stream = getZipStream(zipPath, pkg)) { XMLReaderUtils.parseSAX( new CloseShieldInputStream(stream), new OfflineContentHandler(new EmbeddedContentHandler( new PageContentPartHandler(relativeRoot, xhtml))), context); } catch (IOException | TikaException e) { throw new SAXException(new TikaException("IOException trying to read: " + docRef)); } } else { throw new SAXException(new TikaException("Package must be ZipPackage")); } }
/**
 * SAX-parses the package part targeted by {@code packageRelationship} with a
 * {@link FixedDocSeqHandler} that writes to {@code xhtml}.
 *
 * @param packageRelationship relationship whose target part is read from {@code pkg}
 * @param xhtml               handler passed to the {@link FixedDocSeqHandler}
 * @throws IOException   if the part's stream cannot be opened, read or closed
 * @throws SAXException  if the SAX parse reports an error
 * @throws TikaException if {@code XMLReaderUtils.parseSAX} throws one
 */
private void handleDocuments(PackageRelationship packageRelationship,
                             XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {
    try (InputStream partStream = pkg.getPart(packageRelationship).getInputStream()) {
        // The stream is shielded so the parser cannot close it; the try block does that.
        InputStream shielded = new CloseShieldInputStream(partStream);
        OfflineContentHandler sequenceHandler = new OfflineContentHandler(
                new EmbeddedContentHandler(new FixedDocSeqHandler(xhtml)));
        XMLReaderUtils.parseSAX(shielded, sequenceHandler, context);
    }
}
/**
 * Offers {@code data} to the embedded document extractor as an embedded resource with
 * empty metadata.
 *
 * <p>Nothing is parsed when the extractor declines the metadata. An {@link IOException}
 * raised while parsing or closing the stream is dropped.
 *
 * @param data                      bytes of the embedded resource
 * @param embeddedDocumentExtractor extractor that decides whether to parse and does the parsing
 * @param handler                   handler that receives the embedded content, wrapped in an
 *                                  {@link EmbeddedContentHandler}
 * @throws TikaException declared by the signature for callers
 * @throws SAXException  if the extractor reports a SAX error
 */
private static void handleEmbedded(byte[] data,
                                   EmbeddedDocumentExtractor embeddedDocumentExtractor,
                                   ContentHandler handler)
        throws TikaException, SAXException {
    try (InputStream embeddedStream = TikaInputStream.get(data)) {
        Metadata embeddedMetadata = new Metadata();
        if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
            return;
        }
        embeddedDocumentExtractor.parseEmbedded(
                embeddedStream,
                new EmbeddedContentHandler(handler),
                embeddedMetadata,
                false);
    } catch (IOException ignored) {
        // intentionally swallowed: an I/O failure here is not propagated to the caller
    }
}
} // closes the enclosing scope that was opened before this excerpt
new EmbeddedContentHandler(new BodyContentHandler(handler)), metadata, context); } catch (EncryptedDocumentException ede) {
new EmbeddedContentHandler(handler), embeddedMetadata, false);
@Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { //set OfficeParserConfig if the user hasn't specified one configure(context); final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); try { XMLReaderUtils.parseSAX( new CloseShieldInputStream(stream), new OfflineContentHandler(new EmbeddedContentHandler( new Word2006MLDocHandler(xhtml, metadata, context))), context); } catch (SAXException e) { throw new TikaException("XML parse error", e); } xhtml.endDocument(); } }
/**
 * Passes the text content of {@code action} to the embedded document extractor as an
 * embedded resource.
 *
 * <p>The resource name comes from the node's {@code filename} attribute and the content
 * type from its {@code content-type} attribute; each defaults to the empty string when the
 * attribute is absent. The content type is only recorded when it is non-empty. The text is
 * encoded as UTF-8 before it is handed over.
 *
 * @param action  DOM node whose attributes and text content describe the embedded resource
 * @param handler handler that receives the embedded content, wrapped in an
 *                {@link EmbeddedContentHandler}
 * @param context parse context used to look up the extractor
 * @throws TikaException declared by the signature for callers
 * @throws SAXException  if the extractor reports a SAX error
 * @throws IOException   if the extractor reports an I/O error
 */
private void handleEmbedded(Node action, XHTMLContentHandler handler, ParseContext context)
        throws TikaException, SAXException, IOException {
    String fileName = "";
    String contentType = "";
    NamedNodeMap attributes = action.getAttributes();
    if (attributes != null) {
        Node fileNameAttr = attributes.getNamedItem("filename");
        if (fileNameAttr != null) {
            fileName = fileNameAttr.getNodeValue();
        }
        Node contentTypeAttr = attributes.getNamedItem("content-type");
        if (contentTypeAttr != null) {
            contentType = contentTypeAttr.getNodeValue();
        }
    }

    String embeddedText = action.getTextContent();
    EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(context);

    Metadata embeddedMetadata = new Metadata();
    embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileName);
    boolean hasContentType = !"".equals(contentType);
    if (hasContentType) {
        embeddedMetadata.set(Metadata.CONTENT_TYPE, contentType);
    }

    extractor.parseEmbedded(
            new ByteArrayInputStream(embeddedText.getBytes(UTF_8)),
            new EmbeddedContentHandler(handler),
            embeddedMetadata,
            true);
}
try (InputStream stream = relatedPartPart.getInputStream()) { XMLReaderUtils.parseSAX(stream, new OfflineContentHandler(new EmbeddedContentHandler(contentHandler)), context);
embeddedDocumentExtractor.parseEmbedded( stream, new EmbeddedContentHandler(xhtml), embeddedMetadata, false);
/**
 * Offers the WMF payload of {@code comment} to the embedded document extractor, tagged with
 * the WMF media type.
 *
 * <p>Returns without opening the payload when the extractor declines the metadata.
 *
 * @param comment                   comment record that supplies the WMF input stream
 * @param contentHandler            handler that receives the embedded content, wrapped in an
 *                                  {@link EmbeddedContentHandler}
 * @param embeddedDocumentExtractor extractor that decides whether to parse and does the parsing
 * @throws IOException   if the payload stream cannot be read or closed
 * @throws SAXException  if the extractor reports a SAX error
 * @throws TikaException declared by the signature for callers
 */
private void handleWMF(HemfCommentPublic.WindowsMetafile comment,
                       ContentHandler contentHandler,
                       EmbeddedDocumentExtractor embeddedDocumentExtractor)
        throws IOException, SAXException, TikaException {
    Metadata wmfMetadata = new Metadata();
    wmfMetadata.set(Metadata.CONTENT_TYPE, WMF_MEDIA_TYPE.toString());

    if (!embeddedDocumentExtractor.shouldParseEmbedded(wmfMetadata)) {
        return;
    }

    try (InputStream wmfStream = TikaInputStream.get(comment.getWmfInputStream())) {
        embeddedDocumentExtractor.parseEmbedded(
                wmfStream,
                new EmbeddedContentHandler(contentHandler),
                wmfMetadata,
                false);
    }
}
/**
 * Offers {@code contents} to the embedded document extractor as an embedded resource.
 *
 * <p>The {@link TikaInputStream} wrapping {@code contents} is closed once the extractor
 * returns, whether normally or with an exception.
 *
 * @param name              resource name to record in the metadata; skipped when {@code null}
 * @param type              content type to record in the metadata; skipped when {@code null}
 * @param contents          bytes of the embedded resource
 * @param embeddedExtractor extractor that decides whether to parse and does the parsing
 * @param handler           handler that receives the embedded content, wrapped in an
 *                          {@link EmbeddedContentHandler}
 * @throws IOException   if the extractor or closing the stream reports an I/O error
 * @throws SAXException  if the extractor reports a SAX error
 * @throws TikaException declared by the signature for callers
 */
private void handleEmbedded(String name, String type, byte[] contents,
                            EmbeddedDocumentExtractor embeddedExtractor,
                            ContentHandler handler)
        throws IOException, SAXException, TikaException {
    Metadata metadata = new Metadata();
    if (name != null) {
        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
    }
    if (type != null) {
        metadata.set(Metadata.CONTENT_TYPE, type);
    }

    if (embeddedExtractor.shouldParseEmbedded(metadata)) {
        // try-with-resources so the stream is released even when parsing throws
        try (TikaInputStream embeddedStream = TikaInputStream.get(contents)) {
            embeddedExtractor.parseEmbedded(
                    embeddedStream,
                    new EmbeddedContentHandler(handler),
                    metadata,
                    false);
        }
    }
}
} // closes the enclosing scope that was opened before this excerpt
/**
 * Emits a placeholder for every thumbnail relationship of the package and offers each
 * thumbnail to the embedded document extractor.
 *
 * <p>For each thumbnail part an empty {@code div} with {@code class="embedded"} and
 * {@code id} set to the part name is written to {@code handler}. The part is then passed to
 * {@code embeddedExtractor} if it accepts the thumbnail's metadata. The part's stream is
 * closed even when processing fails.
 *
 * <p>This is best effort: any exception stops the remaining thumbnails from being handled
 * and is otherwise ignored.
 *
 * @param handler handler that receives the placeholder elements and the embedded content
 */
private void handleThumbnail(ContentHandler handler) {
    try {
        OPCPackage opcPackage = extractor.getPackage();
        for (PackageRelationship rel : opcPackage.getRelationshipsByType(
                PackageRelationshipTypes.THUMBNAIL)) {
            PackagePart tPart = opcPackage.getPart(rel);
            // try-with-resources so the part stream is released on every exit path
            try (InputStream tStream = tPart.getInputStream()) {
                String thumbName = tPart.getPartName().getName();

                Metadata thumbnailMetadata = new Metadata();
                thumbnailMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, thumbName);

                AttributesImpl attributes = new AttributesImpl();
                attributes.addAttribute(XHTML, "class", "class", "CDATA", "embedded");
                attributes.addAttribute(XHTML, "id", "id", "CDATA", thumbName);
                handler.startElement(XHTML, "div", "div", attributes);
                handler.endElement(XHTML, "div", "div");

                thumbnailMetadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, thumbName);
                thumbnailMetadata.set(Metadata.CONTENT_TYPE, tPart.getContentType());
                thumbnailMetadata.set(TikaCoreProperties.TITLE, thumbName);

                if (embeddedExtractor.shouldParseEmbedded(thumbnailMetadata)) {
                    try (InputStream thumbnailStream = TikaInputStream.get(tStream)) {
                        embeddedExtractor.parseEmbedded(
                                thumbnailStream,
                                new EmbeddedContentHandler(handler),
                                thumbnailMetadata,
                                false);
                    }
                }
            }
        }
    } catch (Exception ignored) {
        // best effort: a thumbnail that cannot be processed must not fail the caller
    }
}
@Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { // TODO Auto-generated method stub final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); xhtml.startElement("p"); TaggedContentHandler tagged = new TaggedContentHandler(handler); try { XMLReaderUtils.parseSAX( new CloseShieldInputStream(stream), new OfflineContentHandler(new EmbeddedContentHandler( getContentHandler(tagged, metadata, context))), context); } catch (SAXException e) { tagged.throwIfCauseOf(e); throw new TikaException("XML parse error", e); } finally { xhtml.endElement("p"); xhtml.endDocument(); } }
/**
 * Records the content type in {@code metadata}, then parses {@code stream} as XML and
 * writes the result to {@code handler} as an XHTML document.
 *
 * <p>The content handler used for the XML parse comes from {@code getContentHandler}, which
 * is given a {@link TaggedContentHandler} around the XHTML handler. The document is ended
 * in a {@code finally} block, so that happens on failure too.
 *
 * @param stream   input to parse; it is shielded from being closed by the XML parser
 * @param handler  receiver of the XHTML output
 * @param metadata metadata updated by {@code setContentType} and passed to the handlers
 * @param context  parse context
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if the failure originated in the tagged handler, or if starting or
 *                       ending the XHTML document fails
 * @throws TikaException for any other {@link SAXException} raised by the parse (attached as
 *                       the cause), or if {@code XMLReaderUtils.parseSAX} throws one itself
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
                  ParseContext context) throws IOException, SAXException, TikaException {
    setContentType(metadata);

    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    TaggedContentHandler tagged = new TaggedContentHandler(xhtml);
    try {
        InputStream shielded = new CloseShieldInputStream(stream);
        OfflineContentHandler xmlHandler = new OfflineContentHandler(
                new EmbeddedContentHandler(getContentHandler(tagged, metadata, context)));
        XMLReaderUtils.parseSAX(shielded, xmlHandler, context);
    } catch (SAXException parseFailure) {
        // rethrow untouched when the downstream handler caused it
        tagged.throwIfCauseOf(parseFailure);
        throw new TikaException("XML parse error", parseFailure);
    } finally {
        xhtml.endDocument();
    }
}
parser.parse( new ByteArrayInputStream(part.bytes), new EmbeddedContentHandler(new BodyContentHandler(handler)), new Metadata(), parseContext );
embeddedDocumentExtractor.parseEmbedded( embeddedIs, new EmbeddedContentHandler(xhtml), embeddedMetadata, false);
/** * Handles an embedded file in the document */ protected void handleEmbeddedFile(PackagePart part, ContentHandler handler, String rel) throws SAXException, IOException { Metadata metadata = new Metadata(); metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, rel); // Get the name String name = part.getPartName().getName(); metadata.set( TikaCoreProperties.RESOURCE_NAME_KEY, name.substring(name.lastIndexOf('/') + 1)); // Get the content type metadata.set( Metadata.CONTENT_TYPE, part.getContentType()); // Call the recursing handler if (embeddedExtractor.shouldParseEmbedded(metadata)) { try(TikaInputStream tis = TikaInputStream.get(part.getInputStream())) { embeddedExtractor.parseEmbedded( tis, new EmbeddedContentHandler(handler), metadata, false); } } }
/**
 * Offers {@code bytes} to the embedded document handling as an embedded object.
 *
 * <p>Does nothing when {@code bytes} is {@code null}. Otherwise the byte count is recorded
 * as the content length, and the object is parsed if {@code embeddedDocumentUtil} accepts
 * the metadata. When the metadata has no resource name, one is synthesised from a running
 * counter plus the extension reported by {@code embeddedDocumentUtil}: a
 * {@code thumbnail_} name (also flagged as a thumbnail) while inside an object in the PICT
 * state, a {@code file_} name otherwise.
 *
 * <p>An {@link IOException} raised by the embedded parse is recorded on the metadata rather
 * than propagated. The stream is closed on every exit path, including a failure while the
 * extension is being determined.
 *
 * @param bytes    raw bytes of the embedded object; may be {@code null}
 * @param handler  handler that receives the embedded content, wrapped in an
 *                 {@link EmbeddedContentHandler}
 * @param metadata metadata describing the embedded object; updated in place
 * @throws SAXException  if the embedded parse reports a SAX error
 * @throws IOException   if determining the extension or closing the stream fails
 * @throws TikaException declared by the signature for callers
 */
private void extractObj(byte[] bytes, ContentHandler handler, Metadata metadata)
        throws SAXException, IOException, TikaException {
    if (bytes == null) {
        return;
    }

    metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(bytes.length));

    if (!embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
        return;
    }

    // The stream's whole lifetime sits inside try-with-resources so that a failure in
    // getExtension no longer leaks it.
    try (TikaInputStream stream = TikaInputStream.get(bytes)) {
        if (metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY) == null) {
            String extension = embeddedDocumentUtil.getExtension(stream, metadata);
            if (inObject && state == EMB_STATE.PICT) {
                metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
                        "thumbnail_" + thumbCount++ + extension);
                metadata.set(RTFMetadata.THUMBNAIL, "true");
            } else {
                metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
                        "file_" + unknownFilenameCount.getAndIncrement() + extension);
            }
        }

        // Only I/O failures of the embedded parse itself are recorded; a failure while
        // closing the stream still propagates to the caller.
        try {
            embeddedDocumentUtil.parseEmbedded(
                    stream,
                    new EmbeddedContentHandler(handler),
                    metadata,
                    false);
        } catch (IOException e) {
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
        }
    }
}