setEnableAutoSpace( getBooleanProp(props.getProperty("enableAutoSpace"), getEnableAutoSpace())); setSuppressDuplicateOverlappingText( getBooleanProp(props.getProperty("suppressDuplicateOverlappingText"), getSuppressDuplicateOverlappingText())); setExtractAnnotationText( getBooleanProp(props.getProperty("extractAnnotationText"), getExtractAnnotationText())); setSortByPosition( getBooleanProp(props.getProperty("sortByPosition"), getSortByPosition())); setExtractAcroFormContent( getBooleanProp(props.getProperty("extractAcroFormContent"), getExtractAcroFormContent())); setExtractBookmarksText( getBooleanProp(props.getProperty("extractBookmarksText"), getExtractBookmarksText())); setExtractInlineImages( getBooleanProp(props.getProperty("extractInlineImages"), getExtractInlineImages())); setExtractUniqueInlineImagesOnly( getBooleanProp(props.getProperty("extractUniqueInlineImagesOnly"), getExtractUniqueInlineImagesOnly())); setIfXFAExtractOnlyXFA( getBooleanProp(props.getProperty("ifXFAExtractOnlyXFA"), getIfXFAExtractOnlyXFA())); setCatchIntermediateIOExceptions(
/**
 * Enables inline-image extraction for the PDF parser as a CLI convenience.
 * <p>
 * Nothing happens when a config file path has been supplied or when the
 * parse context already holds a {@link PDFParserConfig}; otherwise a fresh
 * config with inline-image extraction turned on is registered in the context
 * and a notice is logged.
 */
private void extractInlineImagesFromPDFs() {
    // An explicit configuration (file or context entry) always wins.
    if (configFilePath != null || context.get(PDFParserConfig.class) != null) {
        return;
    }

    PDFParserConfig inlineImageConfig = new PDFParserConfig();
    inlineImageConfig.setExtractInlineImages(true);

    // Tell the user that the CLI deviates from the library default here.
    LOG.info("As a convenience, TikaCLI has turned on extraction of\n"
            + "inline images for the PDFParser (TIKA-2374).\n"
            + "Aside from the -z option, this is not the default behavior\n"
            + "in Tika generally or in tika-server.");

    context.set(PDFParserConfig.class, inlineImageConfig);
}
if (config.getDetectAngles()) { pdf2XHTML = new AngleDetectingPDF2XHTML(document, handler, context, metadata, config); } else { pdf2XHTML = new PDF2XHTML(document, handler, context, metadata, config); config.configure(pdf2XHTML);
pdfParserConfig.setAverageCharTolerance(Float.parseFloat(averageCharTolerance.toString())); pdfParserConfig.setEnableAutoSpace((Boolean) enableAutoSpace); pdfParserConfig.setExtractAcroFormContent((Boolean) extractAcroFormContent); pdfParserConfig.setExtractAnnotationText((Boolean) extractAnnotationText); pdfParserConfig.setExtractInlineImages((Boolean) extractInlineImages); } else { pdfParserConfig.setExtractInlineImages(true); pdfParserConfig.setExtractUniqueInlineImagesOnly((Boolean) extractUniqueInlineImagesOnly); pdfParserConfig.setSortByPosition((Boolean) sortByPosition); pdfParserConfig.setSpacingTolerance(Float.parseFloat(spacingTolerance.toString())); pdfParserConfig.setSuppressDuplicateOverlappingText((Boolean) suppressDuplicateOverlappingText);
context.set(Parser.class, recursiveParser); PDFParserConfig pdfConfig = new PDFParserConfig(); OCRConfig ocrConfig = parseHints.getOcrConfig(); if (!ocrConfig.isEmpty() || contentType.matches(ocrConfig.getContentTypes()))) { context.set(TesseractOCRConfig.class, ocrTesseractConfig); pdfConfig.setExtractInlineImages(true); } else { pdfConfig.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR); pdfConfig.setSuppressDuplicateOverlappingText(true); context.set(PDFParserConfig.class, pdfConfig); modifyParseContext(context);
Parser parser = new AutoDetectParser(); BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE); TesseractOCRConfig config = new TesseractOCRConfig(); config.setTesseractPath(tPath); PDFParserConfig pdfConfig = new PDFParserConfig(); pdfConfig.setExtractInlineImages(true); pdfConfig.setExtractUniqueInlineImagesOnly(false); // set to false if pdf contains multiple images. ParseContext parseContext = new ParseContext(); parseContext.set(TesseractOCRConfig.class, config); parseContext.set(PDFParserConfig.class, pdfConfig); //need to add this to make sure recursive parsing happens! parseContext.set(Parser.class, parser);
/**
 * Creates an extractor pre-loaded with this class's defaults: SHA-256 digests,
 * extraction of every inline image found in PDF files (not only the unique ones),
 * the {@code NO_OCR} strategy for PDF pages, a one-day OCR timeout, OCR image
 * processing switched off and English as the recognition language.
 */
public Extractor() {
    // SHA-256 is the digest computed unless the caller changes it.
    setDigestAlgorithms(DigestAlgorithm.SHA256);

    // PDF handling: pull the embedded images out so they can be OCRed individually,
    // rather than OCRing whole pages.
    pdfConfig.setExtractInlineImages(true);
    // Uniqueness is decided by object ID only. Scanned documents from the Panama
    // registry contained distinct images sharing an ID, so de-duplication caused
    // some images to be skipped; keep every image instead.
    pdfConfig.setExtractUniqueInlineImagesOnly(false);
    pdfConfig.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);

    // Tika's stock OCR timeout is too short, so allow a full day.
    setOcrTimeout(Duration.ofDays(1));

    // Image processing makes OCR fail (see TIKA-2167), so keep it off.
    ocrConfig.setEnableImageProcessing(0);

    // Recognise English text unless told otherwise.
    ocrConfig.setLanguage("eng");
}
/**
 * Passes the inline-image extraction flag through to {@code defaultConfig}.
 *
 * @param extractInlineImages value forwarded to
 *                            {@code defaultConfig.setExtractInlineImages}
 */
@Field
void setExtractInlineImages(boolean extractInlineImages) {
    this.defaultConfig.setExtractInlineImages(extractInlineImages);
}
public static void fillParseContext(ParseContext parseContext, MultivaluedMap<String, String> httpHeaders, Parser embeddedParser) { //lazily initialize configs //if a header is submitted, any params set in --tika-config tika-config.xml //upon server startup will be ignored. TesseractOCRConfig ocrConfig = null; PDFParserConfig pdfParserConfig = null; for (String key : httpHeaders.keySet()) { if (StringUtils.startsWith(key, X_TIKA_OCR_HEADER_PREFIX)) { ocrConfig = (ocrConfig == null) ? new TesseractOCRConfig() : ocrConfig; processHeaderConfig(httpHeaders, ocrConfig, key, X_TIKA_OCR_HEADER_PREFIX); } else if (StringUtils.startsWith(key, X_TIKA_PDF_HEADER_PREFIX)) { pdfParserConfig = (pdfParserConfig == null) ? new PDFParserConfig() : pdfParserConfig; processHeaderConfig(httpHeaders, pdfParserConfig, key, X_TIKA_PDF_HEADER_PREFIX); } } if (ocrConfig != null) { parseContext.set(TesseractOCRConfig.class, ocrConfig); } if (pdfParserConfig != null) { parseContext.set(PDFParserConfig.class, pdfParserConfig); } if (embeddedParser != null) { parseContext.set(Parser.class, embeddedParser); } }
/**
 * Passes the unique-inline-images-only flag through to {@code defaultConfig}.
 *
 * @param extractUniqueInlineImagesOnly value forwarded to
 *                                      {@code defaultConfig.setExtractUniqueInlineImagesOnly}
 */
@Field
void setExtractUniqueInlineImagesOnly(boolean extractUniqueInlineImagesOnly) {
    this.defaultConfig.setExtractUniqueInlineImagesOnly(extractUniqueInlineImagesOnly);
}
/**
 * Passes the OCR strategy, given as a string, through to {@code defaultConfig}.
 *
 * @param ocrStrategyString value forwarded to {@code defaultConfig.setOcrStrategy};
 *                          accepted values are defined by that method
 */
@Field
public void setOcrStrategy(String ocrStrategyString) {
    this.defaultConfig.setOcrStrategy(ocrStrategyString);
}
/**
 * If true, the parser should try to remove duplicated
 * text over the same region. This is needed for some
 * PDFs that achieve bolding by re-writing the same
 * text in the same area. Note that this can
 * slow down extraction substantially (PDFBOX-956) and
 * sometimes remove characters that were not in fact
 * duplicated (PDFBOX-1155). By default this is disabled.
 *
 * @param v value forwarded to {@code defaultConfig.setSuppressDuplicateOverlappingText}
 * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
 */
@Deprecated
public void setSuppressDuplicateOverlappingText(boolean v) {
    defaultConfig.setSuppressDuplicateOverlappingText(v);
}
/**
 * If true (the default), text in annotations will be
 * extracted.
 *
 * @param v value forwarded to {@code defaultConfig.setExtractAnnotationText}
 * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
 */
@Deprecated
public void setExtractAnnotationText(boolean v) {
    defaultConfig.setExtractAnnotationText(v);
}
/**
 * If true (the default), the parser should estimate
 * where spaces should be inserted between words. For
 * many PDFs this is necessary as they do not include
 * explicit whitespace characters.
 *
 * @param v value forwarded to {@code defaultConfig.setEnableAutoSpace}
 * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
 */
@Deprecated
public void setEnableAutoSpace(boolean v) {
    defaultConfig.setEnableAutoSpace(v);
}
/**
 * Passes the AcroForm-content extraction flag through to {@code defaultConfig}.
 *
 * @param extractAcroFormContent value forwarded to
 *                               {@code defaultConfig.setExtractAcroFormContent}
 */
@Field
void setExtractAcroFormContent(boolean extractAcroFormContent) {
    this.defaultConfig.setExtractAcroFormContent(extractAcroFormContent);
}
/**
 * If true, sort text tokens by their x/y position
 * before extracting text. This may be necessary for
 * some PDFs (if the text tokens are not rendered "in
 * order"), while for other PDFs it can produce the
 * wrong result (for example if there are 2 columns,
 * the text will be interleaved). Default is false.
 *
 * @param v value forwarded to {@code defaultConfig.setSortByPosition}
 * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
 */
@Deprecated
@Field
public void setSortByPosition(boolean v) {
    defaultConfig.setSortByPosition(v);
}
@Override public void configure(PDF2XHTML pdf2XHTML) { super.configure(pdf2XHTML); // Better paragraph detection pdf2XHTML.setDropThreshold(2.0f); } }
/**
 * Turns OCR off. Calling this only makes a difference when Tesseract is installed.
 * <p>
 * The first call excludes the Tesseract parser, records that OCR is disabled and
 * stops inline-image extraction from PDFs; later calls do nothing.
 */
public void disableOcr() {
    // Already disabled: nothing left to do.
    if (ocrDisabled) {
        return;
    }

    excludeParser(TesseractOCRParser.class);
    ocrDisabled = true;
    pdfConfig.setExtractInlineImages(false);
}
if (pdfParserConfig == null) { try (final InputStream in = new FileInputStream(pdfParserConfigPath)) { pdfParserConfig = new PDFParserConfig(in); } catch (Exception e) { logger.warn("Could not load " + pdfParserConfigPath, e); pdfParserConfig = new PDFParserConfig();
/**
 * Forwards the unique-inline-images-only switch to {@code defaultConfig}.
 *
 * @param extractUniqueInlineImagesOnly value handed to
 *                                      {@code defaultConfig.setExtractUniqueInlineImagesOnly}
 */
@Field
void setExtractUniqueInlineImagesOnly(boolean extractUniqueInlineImagesOnly) {
    this.defaultConfig.setExtractUniqueInlineImagesOnly(extractUniqueInlineImagesOnly);
}