Refine search
private void parseImpl(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) { Charset charset = reader.getCharset(); String previous = metadata.get(Metadata.CONTENT_TYPE); MediaType contentType = null; if (previous == null || previous.startsWith("text/html")) { contentType = new MediaType(MediaType.TEXT_HTML, charset); } else if (previous.startsWith("application/xhtml+xml")) { contentType = new MediaType(XHTML, charset); } else if (previous.startsWith("application/vnd.wap.xhtml+xml")) { contentType = new MediaType(WAP_XHTML, charset); } else if (previous.startsWith("application/x-asp")) { contentType = new MediaType(X_ASP, charset); metadata.set(Metadata.CONTENT_TYPE, contentType.toString()); metadata.set(Metadata.CONTENT_ENCODING, charset.name()); context.get(HtmlMapper.class, new HtmlParserMapper()); Schema schema = context.get(Schema.class, HTML_SCHEMA);
/**
 * Runs a SAX parse over {@code stream}, delivering the events to {@code handler}.
 *
 * <p>The SAX parser is obtained from {@code context}. The input is wrapped in a
 * {@link CloseShieldInputStream} and the handler in an {@link OfflineContentHandler}
 * before being handed to the parser. {@code metadata} is not touched by this method.
 *
 * @param stream   document bytes to parse
 * @param handler  receiver of the SAX events
 * @param metadata document metadata (unused here)
 * @param context  parse context supplying the SAX parser
 * @throws IOException   propagated from the underlying parse
 * @throws SAXException  propagated from the underlying parse
 * @throws TikaException declared by the parser contract; may come from obtaining the SAX parser
 */
public void parse(
        InputStream stream,
        ContentHandler handler,
        Metadata metadata,
        ParseContext context)
        throws IOException, SAXException, TikaException {
    // The parser is fetched first and the wrappers are created afterwards.
    context.getSAXParser().parse(
            new CloseShieldInputStream(stream),
            new OfflineContentHandler(handler));
}
@Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { //set OfficeParserConfig if the user hasn't specified one configure(context); final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); try { context.getSAXParser().parse( new CloseShieldInputStream(stream), new OfflineContentHandler(new EmbeddedContentHandler( new Word2006MLDocHandler(xhtml, metadata, context)))); } catch (SAXException e) { throw new TikaException("XML parse error", e); } finally { xhtml.endDocument(); } }
/**
 * Parses an XML document and emits XHTML with the content inside a single {@code <p>} element.
 *
 * <p>If no content type is present in {@code metadata}, it is set to
 * {@code application/xml}. The caller's handler is wrapped in a
 * {@link TaggedContentHandler}; when the SAX parse fails, {@code throwIfCauseOf} is
 * called on that wrapper first, and only if it does not throw is the failure rethrown
 * as a {@link TikaException}. The {@code <p>} element and the XHTML document are
 * always closed in the {@code finally} clause.
 *
 * @param stream   document bytes to parse
 * @param handler  receiver of the generated events
 * @param metadata document metadata; may receive a default content type
 * @param context  parse context supplying the SAX parser
 * @throws IOException   propagated from the underlying parse
 * @throws SAXException  propagated from the handlers
 * @throws TikaException if the SAX parse reports an XML error
 */
public void parse(
        InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    // Only fill in a content type when the caller has not provided one.
    if (metadata.get(Metadata.CONTENT_TYPE) == null) {
        metadata.set(Metadata.CONTENT_TYPE, "application/xml");
    }

    final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
    document.startDocument();
    document.startElement("p");

    TaggedContentHandler taggedHandler = new TaggedContentHandler(handler);
    try {
        context.getSAXParser().parse(
                new CloseShieldInputStream(stream),
                new OfflineContentHandler(
                        new EmbeddedContentHandler(
                                getContentHandler(taggedHandler, metadata, context))));
    } catch (SAXException saxFailure) {
        // Give the tagged handler the first chance to rethrow the failure.
        taggedHandler.throwIfCauseOf(saxFailure);
        throw new TikaException("XML parse error", saxFailure);
    } finally {
        document.endElement("p");
        document.endDocument();
    }
}
break; default: throw new TikaException("Unhandled iWorks file " + type); metadata.add(Metadata.CONTENT_TYPE, type.getType().toString()); xhtml.startDocument(); if (contentHandler != null) { context.getSAXParser().parse( new CloseShieldInputStream(entryStream), new OfflineContentHandler(contentHandler) );
private void handlePart(PackagePart packagePart, XWPFStylesShim styles, XWPFListManager listManager, XHTMLContentHandler xhtml) throws IOException, SAXException { Map<String, String> linkedRelationships = loadLinkedRelationships(packagePart, true, metadata); try (InputStream stream = packagePart.getInputStream()) { context.getSAXParser().parse( new CloseShieldInputStream(stream), new OfflineContentHandler(new EmbeddedContentHandler( new OOXMLWordAndPowerPointTextHandler( new OOXMLTikaBodyPartHandler(xhtml, styles, listManager, context.get(OfficeParserConfig.class)), linkedRelationships)))); } catch (TikaException e) { //swallow } }
throws IOException, SAXException, TikaException { PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig); if (localConfig.getSetKCMS()) { System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider"); pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), password, memoryUsageSetting); metadata.set(PDF.IS_ENCRYPTED, Boolean.toString(pdfDocument.isEncrypted())); metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString()); extractMetadata(pdfDocument, metadata, context); AccessChecker checker = localConfig.getAccessChecker(); handleXFAOnly(pdfDocument, handler, metadata, context); } else if (localConfig.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) { metadata.add("X-Parsed-By", TesseractOCRParser.class.toString()); OCR2XHTML.process(pdfDocument, handler, context, metadata, localConfig); } else {
stream = new CloseShieldInputStream(stream); } else { stream = new BufferedInputStream(new CloseShieldInputStream(stream)); try { CompressorParserOptions options = context.get(CompressorParserOptions.class, new CompressorParserOptions() { public boolean decompressConcatenated(Metadata metadata) { return false; cis = factory.createCompressorInputStream(stream); MediaType type = getMediaType(cis); if (!type.equals(MediaType.OCTET_STREAM)) { metadata.set(CONTENT_TYPE, type.toString()); throw new TikaMemoryLimitException(e.getMessage()); throw new TikaException("Unable to uncompress document stream", e); Metadata entrydata = new Metadata(); String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); if (name != null) { if (name.endsWith(".tbz")) {
throws IOException, SAXException, TikaException { try (AutoDetectReader reader = new AutoDetectReader( new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) { Charset charset = reader.getCharset(); String mediaType = metadata.get(Metadata.CONTENT_TYPE); String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); if (mediaType != null && name != null) { MediaType type = MediaType.parse(mediaType); metadata.set(Metadata.CONTENT_TYPE, type.toString()); metadata.set(Metadata.CONTENT_ENCODING, charset.name()); metadata.set("LoC", String.valueOf(nbLines)); Renderer renderer = getRenderer(type.toString()); Schema schema = context.get(Schema.class, HTML_SCHEMA);
Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { String type = metadata.get(Metadata.CONTENT_TYPE); if (type != null) { if (OLD_BMP_TYPE.toString().equals(type)) { type = MAIN_BMP_TYPE.toString(); try { try (ImageInputStream imageStream = ImageIO.createImageInputStream( new CloseShieldInputStream(stream))) { reader.setInput(imageStream); metadata.set(Metadata.IMAGE_WIDTH, Integer.toString(reader.getWidth(0))); metadata.set(Metadata.IMAGE_LENGTH, Integer.toString(reader.getHeight(0))); metadata.set("height", Integer.toString(reader.getHeight(0))); metadata.set("width", Integer.toString(reader.getWidth(0))); e.getMessage().equals("Unexpected block type 0!") && type.equals("image/gif"))) { throw new TikaException(type + " parse error", e);
extractInlineImagesFromPDFs(); type = NO_OUTPUT; context.set(EmbeddedDocumentExtractor.class, new FileEmbeddedDocumentExtractor()); } else if (arg.equals("-r") || arg.equals("--pretty-print")) { prettyPrint = true; new CloseShieldInputStream(System.in))) { type.process(stream, System.out, new Metadata()); handleRecursiveJson(url, System.out); } else { Metadata metadata = new Metadata(); try (InputStream input = TikaInputStream.get(url, metadata)) {
@Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { EmbeddedDocumentExtractor ex = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); short numEntries = readThroughNumEntries(stream); long bytesRead = 26; List<FieldInfo> fieldInfoList = getSortedFieldInfoList(stream, numEntries); bytesRead += 12*numEntries; Metadata embeddedMetadata = new Metadata(); bytesRead = processFieldEntries(stream, fieldInfoList, embeddedMetadata, bytesRead); FieldInfo contentFieldInfo = getContentFieldInfo(fieldInfoList); XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); if (contentFieldInfo != null) { long diff = contentFieldInfo.offset-bytesRead; IOUtils.skipFully(stream, diff); if (ex.shouldParseEmbedded(embeddedMetadata)) { // TODO: we should probably add a readlimiting wrapper around this // stream to ensure that not more than contentFieldInfo.length bytes // are read ex.parseEmbedded(new CloseShieldInputStream(stream), xhtml, embeddedMetadata, false); } } xhtml.endDocument(); }
break; default: throw new TikaException("Unhandled iWorks file " + type); metadata.add(Metadata.CONTENT_TYPE, type.getType().toString()); xhtml.startDocument(); if (contentHandler != null) { XMLReaderUtils.parseSAX( new CloseShieldInputStream(entryStream), new OfflineContentHandler(contentHandler), context
new InputSource(new CloseShieldInputStream(stream))); metadata.set(TikaCoreProperties.TITLE, title); metadata.set(TikaCoreProperties.DESCRIPTION, description); throw new TikaException("RSS parse error", e);
PDDocument.load(new CloseShieldInputStream(stream), scratchFile, true); String password = metadata.get(PASSWORD); if (password == null) { password = ""; metadata.set(Metadata.CONTENT_TYPE, "application/pdf"); extractMetadata(pdfDocument, metadata); PDF2XHTML.process(pdfDocument, handler, metadata, extractAnnotationText, enableAutoSpace);
prc = mainDocument.getRelationshipsByType(XSLFRelation.COMMENT_AUTHORS.getRelation()); } catch (InvalidFormatException e) { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); commentAuthorsPart = mainDocument.getRelatedPart(prc.getRelationship(i)); } catch (InvalidFormatException e) { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e)); new CloseShieldInputStream(stream), new OfflineContentHandler(new XSLFCommentAuthorHandler()), context); metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
@Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { // Only outputting the MIME type as metadata metadata.set(Metadata.CONTENT_TYPE, ENVI_MIME_TYPE); // The following code was taken from the TXTParser // Automatically detect the character encoding TikaConfig tikaConfig = context.get(TikaConfig.class); if (tikaConfig == null) { tikaConfig = TikaConfig.getDefaultConfig(); } try (AutoDetectReader reader = new AutoDetectReader( new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))){ Charset charset = reader.getCharset(); // deprecated, see TIKA-431 metadata.set(Metadata.CONTENT_ENCODING, charset.name()); xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); readLines(reader, metadata); xhtml.endDocument(); } catch (IOException | TikaException e) { LOG.error("Error reading input data stream.", e); } }
metadata.set(Metadata.CONTENT_TYPE, ENVI_MIME_TYPE); new CloseShieldInputStream(stream), metadata)) { Charset charset = reader.getCharset(); MediaType type = new MediaType(MediaType.TEXT_PLAIN, charset); metadata.set(Metadata.CONTENT_ENCODING, charset.name());
/**
 * Unwraps PKCS7 signed data and passes the signed content to the delegate parser.
 *
 * <p>A digest calculator provider is built with the {@code "BC"} provider and used to
 * open a {@link CMSSignedDataParser} over the stream. If the parser reports no signed
 * content, a {@link TikaException} is thrown. Otherwise the content stream is handed
 * to the {@link Parser} found in {@code context}, falling back to
 * {@link EmptyParser#INSTANCE}. The CMS parser is closed in a {@code finally} clause.
 *
 * @param stream   PKCS7 bytes to parse
 * @param handler  receiver of the delegate parser's output
 * @param metadata document metadata, passed to the delegate parser
 * @param context  parse context supplying the delegate parser
 * @throws IOException   propagated from reading the content
 * @throws SAXException  propagated from the delegate parser
 * @throws TikaException for a detached signature, or when the digest provider or the
 *                       signed data cannot be processed
 */
public void parse(
        InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    try {
        DigestCalculatorProvider digestProvider =
                new JcaDigestCalculatorProviderBuilder().setProvider("BC").build();
        CMSSignedDataParser cmsParser =
                new CMSSignedDataParser(digestProvider, new CloseShieldInputStream(stream));
        try {
            CMSTypedStream signedContent = cmsParser.getSignedContent();
            if (signedContent == null) {
                throw new TikaException("cannot parse detached pkcs7 signature (no signed data to parse)");
            }
            try (InputStream contentStream = signedContent.getContentStream()) {
                Parser delegateParser = context.get(Parser.class, EmptyParser.INSTANCE);
                delegateParser.parse(contentStream, handler, metadata, context);
            }
        } finally {
            // Close the CMS parser on both success and failure.
            cmsParser.close();
        }
    } catch (OperatorCreationException providerFailure) {
        throw new TikaException("Unable to create DigestCalculatorProvider", providerFailure);
    } catch (CMSException cmsFailure) {
        throw new TikaException("Unable to parse pkcs7 signed data", cmsFailure);
    }
}
public static void parseInvestigation(InputStream stream, XHTMLContentHandler handler, Metadata metadata, ParseContext context, String studyFileName) throws IOException, TikaException, SAXException { TikaConfig tikaConfig = context.get(TikaConfig.class); if (tikaConfig == null) { tikaConfig = TikaConfig.getDefaultConfig(); } // Automatically detect the character encoding try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, tikaConfig.getEncodingDetector())) { extractMetadata(reader, metadata, studyFileName); } }