/**
 * Sets whether inline images are extracted by forwarding the flag to
 * {@code defaultConfig}.
 *
 * @param extractInlineImages value handed unchanged to
 *                            {@code defaultConfig.setExtractInlineImages(boolean)}
 */
@Field
void setExtractInlineImages(boolean extractInlineImages) {
    defaultConfig.setExtractInlineImages(extractInlineImages);
}
/**
 * Registers a {@link PDFParserConfig} with inline-image extraction enabled in
 * {@code context}, but only when no config file path is set and the context
 * does not already hold a {@code PDFParserConfig}. Logs an informational
 * notice (see TIKA-2374) whenever it does so.
 */
private void extractInlineImagesFromPDFs() {
    // An explicit config file or an already-registered PDF config wins; do nothing.
    if (configFilePath != null || context.get(PDFParserConfig.class) != null) {
        return;
    }

    PDFParserConfig inlineImageConfig = new PDFParserConfig();
    inlineImageConfig.setExtractInlineImages(true);

    String notice = "As a convenience, TikaCLI has turned on extraction of\n"
            + "inline images for the PDFParser (TIKA-2374).\n"
            + "Aside from the -z option, this is not the default behavior\n"
            + "in Tika generally or in tika-server.";
    LOG.info(notice);

    context.set(PDFParserConfig.class, inlineImageConfig);
}
getBooleanProp(props.getProperty("extractBookmarksText"), getExtractBookmarksText())); setExtractInlineImages( getBooleanProp(props.getProperty("extractInlineImages"), getExtractInlineImages()));
/**
 * Disable OCR. This method only has an effect if Tesseract is installed.
 *
 * <p>The first call excludes {@link TesseractOCRParser}, records that OCR is
 * disabled and switches off inline image extraction on {@code pdfConfig}.
 * Later calls return immediately.
 */
public void disableOcr() {
    // Already disabled: nothing left to do.
    if (ocrDisabled) {
        return;
    }

    excludeParser(TesseractOCRParser.class);
    ocrDisabled = true;
    pdfConfig.setExtractInlineImages(false);
}
/**
 * Sets whether inline images are extracted by forwarding the flag to
 * {@code defaultConfig}.
 *
 * @param extractInlineImages value handed unchanged to
 *                            {@code defaultConfig.setExtractInlineImages(boolean)}
 */
@Field
void setExtractInlineImages(boolean extractInlineImages) {
    defaultConfig.setExtractInlineImages(extractInlineImages);
}
/**
 * Sets whether inline images are extracted by forwarding the flag to
 * {@code defaultConfig}.
 *
 * @param extractInlineImages value handed unchanged to
 *                            {@code defaultConfig.setExtractInlineImages(boolean)}
 */
@Field
void setExtractInlineImages(boolean extractInlineImages) {
    defaultConfig.setExtractInlineImages(extractInlineImages);
}
pdfParserConfig.setExtractInlineImages((Boolean) extractInlineImages); } else { pdfParserConfig.setExtractInlineImages(true);
// Turn on inline image extraction on the PDF configuration object.
pdfConfig.setExtractInlineImages(true);
getBooleanProp(props.getProperty("extractAcroFormContent"), getExtractAcroFormContent())); setExtractInlineImages( getBooleanProp(props.getProperty("extractInlineImages"), getExtractInlineImages()));
getBooleanProp(props.getProperty("extractBookmarksText"), getExtractBookmarksText())); setExtractInlineImages( getBooleanProp(props.getProperty("extractInlineImages"), getExtractInlineImages()));
/**
 * Create a new extractor, which will OCR images by default if Tesseract is available locally, extract inline
 * images from PDF files and OCR them and use PDFBox's non-sequential PDF parser.
 *
 * <p>Defaults applied here: SHA-256 digests, inline image extraction from PDFs with the
 * {@code NO_OCR} page strategy, non-unique inline image extraction, a one-day OCR timeout,
 * image processing set to {@code 0}, and {@code "eng"} as the OCR language.
 *
 * <p>NOTE(review): nothing in this constructor visibly selects PDFBox's non-sequential
 * parser; confirm whether that part of the description is still accurate.
 */
public Extractor() {

    // Calculate the SHA256 digest by default.
    setDigestAlgorithms(DigestAlgorithm.SHA256);

    // Run OCR on images contained within PDFs and not on pages.
    pdfConfig.setExtractInlineImages(true);
    pdfConfig.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);

    // By default, only the object IDs are used for determining uniqueness.
    // In scanned documents under test from the Panama registry, different embedded images had the same ID,
    // leading to incomplete OCRing when uniqueness detection was turned on.
    pdfConfig.setExtractUniqueInlineImagesOnly(false);

    // Set a long OCR timeout by default, because Tika's is too short.
    setOcrTimeout(Duration.ofDays(1));
    ocrConfig.setEnableImageProcessing(0); // See TIKA-2167. Image processing causes OCR to fail.

    // English text recognition by default.
    ocrConfig.setLanguage("eng");
}
// Parse `stream` with an auto-detecting parser, with inline image extraction enabled for PDFs.
Parser parser = new AutoDetectParser();
// NOTE(review): Integer.MAX_VALUE is presumably the handler's write limit; confirm against BodyContentHandler.
BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE);

// Tesseract configuration, left at whatever its no-arg constructor provides.
TesseractOCRConfig config = new TesseractOCRConfig();

// PDF configuration with inline image extraction switched on.
PDFParserConfig pdfConfig = new PDFParserConfig();
pdfConfig.setExtractInlineImages(true);

// Register both configurations and the parser itself in the parse context.
ParseContext parseContext = new ParseContext();
parseContext.set(TesseractOCRConfig.class, config);
parseContext.set(PDFParserConfig.class, pdfConfig);
parseContext.set(Parser.class, parser); //need to add this to make sure recursive parsing happens!

// Metadata is collected into a throwaway instance; only the handler receives output that is kept here.
parser.parse(stream, handler, new Metadata(), parseContext);
// Build a parser, content handler and parse context with inline image extraction enabled for PDFs.
Parser parser = new AutoDetectParser();
// NOTE(review): Integer.MAX_VALUE is presumably the handler's write limit; confirm against BodyContentHandler.
BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE);

// Tesseract configuration using the path supplied in tPath.
TesseractOCRConfig config = new TesseractOCRConfig();
config.setTesseractPath(tPath);

// PDF configuration: extract inline images, including ones not considered unique.
PDFParserConfig pdfConfig = new PDFParserConfig();
pdfConfig.setExtractInlineImages(true);
pdfConfig.setExtractUniqueInlineImagesOnly(false); // set to false if pdf contains multiple images.

// Register both configurations and the parser itself in the parse context.
ParseContext parseContext = new ParseContext();
parseContext.set(TesseractOCRConfig.class, config);
parseContext.set(PDFParserConfig.class, pdfConfig);
//need to add this to make sure recursive parsing happens!
parseContext.set(Parser.class, parser);
|| contentType.matches(ocrConfig.getContentTypes()))) { context.set(TesseractOCRConfig.class, ocrTesseractConfig); pdfConfig.setExtractInlineImages(true); } else { pdfConfig.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);