/**
 * Enables inline-image extraction for the PDF parser as a CLI convenience.
 *
 * <p>Nothing happens when a config file path has been supplied or when the
 * parse context already holds a {@code PDFParserConfig}; in both cases the
 * existing configuration is left untouched.
 */
private void extractInlineImagesFromPDFs() {
    // Leave any explicitly provided configuration alone.
    if (configFilePath != null || context.get(PDFParserConfig.class) != null) {
        return;
    }

    PDFParserConfig inlineImageConfig = new PDFParserConfig();
    inlineImageConfig.setExtractInlineImages(true);

    // Tell the user that the CLI deviates from the library defaults here.
    LOG.info("As a convenience, TikaCLI has turned on extraction of\n"
            + "inline images for the PDFParser (TIKA-2374).\n"
            + "Aside from the -z option, this is not the default behavior\n"
            + "in Tika generally or in tika-server.");

    context.set(PDFParserConfig.class, inlineImageConfig);
}
public static void fillParseContext(ParseContext parseContext, MultivaluedMap<String, String> httpHeaders, Parser embeddedParser) { //lazily initialize configs //if a header is submitted, any params set in --tika-config tika-config.xml //upon server startup will be ignored. TesseractOCRConfig ocrConfig = null; PDFParserConfig pdfParserConfig = null; for (String key : httpHeaders.keySet()) { if (StringUtils.startsWith(key, X_TIKA_OCR_HEADER_PREFIX)) { ocrConfig = (ocrConfig == null) ? new TesseractOCRConfig() : ocrConfig; processHeaderConfig(httpHeaders, ocrConfig, key, X_TIKA_OCR_HEADER_PREFIX); } else if (StringUtils.startsWith(key, X_TIKA_PDF_HEADER_PREFIX)) { pdfParserConfig = (pdfParserConfig == null) ? new PDFParserConfig() : pdfParserConfig; processHeaderConfig(httpHeaders, pdfParserConfig, key, X_TIKA_PDF_HEADER_PREFIX); } } if (ocrConfig != null) { parseContext.set(TesseractOCRConfig.class, ocrConfig); } if (pdfParserConfig != null) { parseContext.set(PDFParserConfig.class, pdfParserConfig); } if (embeddedParser != null) { parseContext.set(Parser.class, embeddedParser); } }
if (pdfParserConfig == null) { try (final InputStream in = new FileInputStream(pdfParserConfigPath)) { pdfParserConfig = new PDFParserConfig(in); } catch (Exception e) { logger.warn("Could not load " + pdfParserConfigPath, e); pdfParserConfig = new PDFParserConfig();
// Start from a fresh PDF parser configuration and switch on extraction of inline images.
PDFParserConfig pdfConfig = new PDFParserConfig(); pdfConfig.setExtractInlineImages(true);
private static void fillParseContext(ParseContext parseContext, Map<String, Object> options) { final TesseractOCRConfig ocrConfig = new TesseractOCRConfig(); if (options == null) { // Disable OCR and return if no options are specified. disableOcr(ocrConfig); parseContext.set(TesseractOCRConfig.class, ocrConfig); return; } fillOcrOptions(ocrConfig, options); parseContext.set(TesseractOCRConfig.class, ocrConfig); final PDFParserConfig pdfParserConfig = new PDFParserConfig(); fillPdfOptions(pdfParserConfig, options); parseContext.set(PDFParserConfig.class, pdfParserConfig); // Allow a password to be specified for encrypted files. fillPassword(parseContext, options); }
// Context that carries the per-parser configuration into the parse call.
ParseContext parseContext = new ParseContext();

// Register the parser itself in the context;
// need to add this to make sure recursive parsing happens!
Parser parser = new AutoDetectParser();
parseContext.set(Parser.class, parser);

// OCR configuration, left at whatever the no-arg constructor provides.
TesseractOCRConfig config = new TesseractOCRConfig();
parseContext.set(TesseractOCRConfig.class, config);

// PDF configuration with inline-image extraction switched on.
PDFParserConfig pdfConfig = new PDFParserConfig();
pdfConfig.setExtractInlineImages(true);
parseContext.set(PDFParserConfig.class, pdfConfig);

// Body handler constructed with Integer.MAX_VALUE, then run the parse with fresh metadata.
BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE);
parser.parse(stream, handler, new Metadata(), parseContext);
Parser parser = new AutoDetectParser(); BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE); TesseractOCRConfig config = new TesseractOCRConfig(); config.setTesseractPath(tPath); PDFParserConfig pdfConfig = new PDFParserConfig(); pdfConfig.setExtractInlineImages(true); pdfConfig.setExtractUniqueInlineImagesOnly(false); // set to false if pdf contains multiple images. ParseContext parseContext = new ParseContext(); parseContext.set(TesseractOCRConfig.class, config); parseContext.set(PDFParserConfig.class, pdfConfig); //need to add this to make sure recursive parsing happens! parseContext.set(Parser.class, parser);
context.set(Parser.class, recursiveParser); PDFParserConfig pdfConfig = new PDFParserConfig(); OCRConfig ocrConfig = parseHints.getOcrConfig(); if (!ocrConfig.isEmpty()