/**
 * Sets the OCR strategy from its string form by forwarding the value,
 * unchanged, to {@code defaultConfig}.
 *
 * @param ocrStrategyString the strategy name; it is not validated here, so
 *                          whatever values {@code defaultConfig} accepts apply
 */
@Field
public void setOcrStrategy(String ocrStrategyString) {
    defaultConfig.setOcrStrategy(ocrStrategyString);
}
isCatchIntermediateIOExceptions())); setOcrStrategy(OCR_STRATEGY.parse(props.getProperty("ocrStrategy")));
/**
 * Sets the OCR strategy from its string form by forwarding the value,
 * unchanged, to {@code defaultConfig}.
 *
 * @param ocrStrategyString the strategy name; it is not validated here, so
 *                          whatever values {@code defaultConfig} accepts apply
 */
@Field
public void setOcrStrategy(String ocrStrategyString) {
    defaultConfig.setOcrStrategy(ocrStrategyString);
}
/**
 * Sets the OCR strategy from its string form by forwarding the value,
 * unchanged, to {@code defaultConfig}.
 *
 * @param ocrStrategyString the strategy name; it is not validated here, so
 *                          whatever values {@code defaultConfig} accepts apply
 */
@Field
public void setOcrStrategy(String ocrStrategyString) {
    defaultConfig.setOcrStrategy(ocrStrategyString);
}
/**
 * Creates a new extractor with this class's default settings:
 * <ul>
 *   <li>SHA-256 is the digest algorithm;</li>
 *   <li>inline images are extracted from PDF files, while the PDF parser's own OCR strategy
 *       is {@code NO_OCR};</li>
 *   <li>inline images are not restricted to unique ones;</li>
 *   <li>the OCR timeout is one day;</li>
 *   <li>OCR image processing is switched off;</li>
 *   <li>the OCR language is English ({@code "eng"}).</li>
 * </ul>
 */
public Extractor() {
    // Digest: SHA-256 unless the caller configures something else.
    setDigestAlgorithms(DigestAlgorithm.SHA256);

    // PDFs: pull out the embedded images and keep page-level OCR off, so that OCR is
    // applied to the images inside a PDF rather than to its pages.
    pdfConfig.setExtractInlineImages(true);
    pdfConfig.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);

    // Uniqueness detection relies on object IDs alone by default. Scanned documents from
    // the Panama registry used in testing contained different embedded images sharing one
    // ID, so OCR was incomplete whenever uniqueness detection was enabled. Keep it off.
    pdfConfig.setExtractUniqueInlineImagesOnly(false);

    // Tika's own OCR timeout is too short; allow a full day by default.
    setOcrTimeout(Duration.ofDays(1));

    // See TIKA-2167: image processing causes OCR to fail, so disable it.
    ocrConfig.setEnableImageProcessing(0);

    // Recognise English text by default.
    ocrConfig.setLanguage("eng");
}
isCatchIntermediateIOExceptions())); setOcrStrategy(OCR_STRATEGY.parse(props.getProperty("ocrStrategy")));
isCatchIntermediateIOExceptions())); setOcrStrategy(OCR_STRATEGY.parse(props.getProperty("ocrStrategy")));
pdfConfig.setExtractInlineImages(true); } else { pdfConfig.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);