/**
 * Compares this configuration with another object, field by field.
 *
 * <p>Two configurations are equal when every boolean flag, the OCR DPI, the
 * OCR image type, and every object-valued setting (tolerances, OCR strategy,
 * OCR image format name, access checker) match. Object-valued settings are
 * compared null-safely: {@link #configure} treats the tolerances as optional
 * (nullable), so an unset value must not cause a {@link NullPointerException}
 * here. Two unset (null) values are considered equal.
 *
 * @param o the object to compare against; may be {@code null}
 * @return {@code true} if {@code o} is a {@code PDFPureJavaParserConfig}
 *         with the same settings, {@code false} otherwise
 */
@Override
public boolean equals(Object o) {
    if (this == o) {
        return true;
    }
    if (!(o instanceof PDFPureJavaParserConfig)) {
        return false;
    }

    PDFPureJavaParserConfig config = (PDFPureJavaParserConfig) o;

    // Primitive-valued settings.
    if (getEnableAutoSpace() != config.getEnableAutoSpace()) {
        return false;
    }
    if (getSuppressDuplicateOverlappingText() != config.getSuppressDuplicateOverlappingText()) {
        return false;
    }
    if (getExtractAnnotationText() != config.getExtractAnnotationText()) {
        return false;
    }
    if (getSortByPosition() != config.getSortByPosition()) {
        return false;
    }
    if (getExtractAcroFormContent() != config.getExtractAcroFormContent()) {
        return false;
    }
    if (getExtractInlineImages() != config.getExtractInlineImages()) {
        return false;
    }
    if (getExtractUniqueInlineImagesOnly() != config.getExtractUniqueInlineImagesOnly()) {
        return false;
    }
    if (getIfXFAExtractOnlyXFA() != config.getIfXFAExtractOnlyXFA()) {
        return false;
    }
    if (getOcrDPI() != config.getOcrDPI()) {
        return false;
    }
    if (isCatchIntermediateIOExceptions() != config.isCatchIntermediateIOExceptions()) {
        return false;
    }

    // Object-valued settings: compared null-safely because the tolerances
    // (at least) may legitimately be unset.
    if (!java.util.Objects.equals(getAverageCharTolerance(), config.getAverageCharTolerance())) {
        return false;
    }
    if (!java.util.Objects.equals(getSpacingTolerance(), config.getSpacingTolerance())) {
        return false;
    }
    if (!java.util.Objects.equals(getOcrStrategy(), config.getOcrStrategy())) {
        return false;
    }
    // Identity comparison is kept as in the original implementation.
    if (getOcrImageType() != config.getOcrImageType()) {
        return false;
    }
    if (!java.util.Objects.equals(getOcrImageFormatName(), config.getOcrImageFormatName())) {
        return false;
    }
    if (getExtractActions() != config.getExtractActions()) {
        return false;
    }
    return java.util.Objects.equals(getAccessChecker(), config.getAccessChecker());
}
// Load each boolean option from props, keyed by the option's name. The
// option's current value is passed as the second argument to getBooleanProp
// (presumably used as the fallback when the property is absent -- confirm
// against getBooleanProp).
setEnableAutoSpace(
        getBooleanProp(props.getProperty("enableAutoSpace"), getEnableAutoSpace()));
setSuppressDuplicateOverlappingText(
        getBooleanProp(props.getProperty("suppressDuplicateOverlappingText"),
                getSuppressDuplicateOverlappingText()));
setExtractAnnotationText(
        getBooleanProp(props.getProperty("extractAnnotationText"),
                getExtractAnnotationText()));
setSortByPosition(
        getBooleanProp(props.getProperty("sortByPosition"), getSortByPosition()));
setExtractAcroFormContent(
        getBooleanProp(props.getProperty("extractAcroFormContent"),
                getExtractAcroFormContent()));
setExtractInlineImages(
        getBooleanProp(props.getProperty("extractInlineImages"),
                getExtractInlineImages()));
setExtractUniqueInlineImagesOnly(
        getBooleanProp(props.getProperty("extractUniqueInlineImagesOnly"),
                getExtractUniqueInlineImagesOnly()));
setIfXFAExtractOnlyXFA(
        getBooleanProp(props.getProperty("ifXFAExtractOnlyXFA"),
                getIfXFAExtractOnlyXFA()));
setCatchIntermediateIOExceptions(
        getBooleanProp(props.getProperty("catchIntermediateIOExceptions"),
                isCatchIntermediateIOExceptions()));
/**
 * Pushes this configuration's text-extraction settings onto an extractor.
 *
 * <p>The sort-by-position and suppress-duplicate-overlapping-text flags are
 * always copied. The word separator becomes a single space when auto-space
 * is enabled and the empty string otherwise. The average-character and
 * spacing tolerances are only applied when they are non-null; when they are
 * unset, the corresponding setter is simply not called.
 *
 * @param pdf2XHTML the extractor to configure
 */
public void configure(PDF2XHTMLPureJava pdf2XHTML) {
    pdf2XHTML.setSortByPosition(getSortByPosition());

    // Auto-space on: separate words with a space; off: no separator at all.
    pdf2XHTML.setWordSeparator(getEnableAutoSpace() ? " " : "");

    // Tolerances are optional; skip the setter when no value is configured.
    if (getAverageCharTolerance() != null) {
        pdf2XHTML.setAverageCharTolerance(getAverageCharTolerance());
    }
    if (getSpacingTolerance() != null) {
        pdf2XHTML.setSpacingTolerance(getSpacingTolerance());
    }

    pdf2XHTML.setSuppressDuplicateOverlappingText(getSuppressDuplicateOverlappingText());
}
private void extractImages(PDResources resources, Set<COSBase> seenThisPage) throws SAXException, IOException { if (resources == null || config.getExtractInlineImages() == false) { return; if (config.getExtractUniqueInlineImagesOnly() == true) { if (processedInlineImages.containsKey(cosStream)) { continue;
AccessChecker checker = localConfig.getAccessChecker(); checker.check(metadata); if (handler != null) { if (shouldHandleXFAOnly(pdfDocument, localConfig)) { handleXFAOnly(pdfDocument, handler, metadata, context); } else if (localConfig.getOcrStrategy().equals(PDFPureJavaParserConfig.OCR_STRATEGY.OCR_ONLY)) { metadata.add("X-Parsed-By", "org.apache.tika.parser.ocr.TesseractOCRParser"); if (localConfig.getOcrStrategy().equals(PDFPureJavaParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) { metadata.add("X-Parsed-By", "org.apache.tika.parser.ocr.TesseractOCRParser");
/**
 * If true, text in annotations will be extracted.
 *
 * <p>Delegates to the default parser configuration.
 *
 * @return the value of the default configuration's extract-annotation-text flag
 * @deprecated use {@link #getPDFParserConfig()}
 */
@Deprecated
public boolean getExtractAnnotationText() {
    return defaultConfig.getExtractAnnotationText();
}
/**
 * Returns the auto-space flag of the default parser configuration.
 *
 * @return the value of the default configuration's enable-auto-space flag
 * @see #setEnableAutoSpace(boolean)
 * @deprecated use {@link #getPDFParserConfig()}
 */
@Deprecated
public boolean getEnableAutoSpace() {
    return defaultConfig.getEnableAutoSpace();
}
config.configure(pdf2XHTML);
private void handleDestinationOrAction(PDDestinationOrAction action, ActionTrigger actionTrigger) throws IOException, SAXException, TikaException { if (action == null || ! config.getExtractActions()) { return;
if (config.getExtractAcroFormContent() == true) { try { extractAcroForm(pdf);
if (config.getExtractAnnotationText()) { if (annotation instanceof PDAnnotationLink) { PDAnnotationLink annotationlink = (PDAnnotationLink) annotation;
/**
 * Computes a hash code from the same settings that {@link #equals(Object)}
 * compares, so that equal configurations hash equally.
 *
 * <p>Object-valued settings are hashed null-safely (an unset value
 * contributes {@code 0}): {@link #configure} treats the tolerances as
 * optional, so they may be null. Every setting is read through its getter,
 * matching {@code equals}.
 *
 * @return the hash code of this configuration
 */
@Override
public int hashCode() {
    int result = (getEnableAutoSpace() ? 1 : 0);
    result = 31 * result + (getSuppressDuplicateOverlappingText() ? 1 : 0);
    result = 31 * result + (getExtractAnnotationText() ? 1 : 0);
    result = 31 * result + (getSortByPosition() ? 1 : 0);
    result = 31 * result + (getExtractAcroFormContent() ? 1 : 0);
    result = 31 * result + (getExtractInlineImages() ? 1 : 0);
    result = 31 * result + (getExtractUniqueInlineImagesOnly() ? 1 : 0);
    // Tolerances may be unset (null); Objects.hashCode yields 0 in that case.
    result = 31 * result + java.util.Objects.hashCode(getAverageCharTolerance());
    result = 31 * result + java.util.Objects.hashCode(getSpacingTolerance());
    result = 31 * result + (getIfXFAExtractOnlyXFA() ? 1 : 0);
    // Use the getter (not the raw field) so this agrees with equals().
    result = 31 * result + java.util.Objects.hashCode(getOcrStrategy());
    result = 31 * result + getOcrDPI();
    result = 31 * result + java.util.Objects.hashCode(getOcrImageType());
    result = 31 * result + java.util.Objects.hashCode(getOcrImageFormatName());
    result = 31 * result + java.util.Objects.hashCode(getAccessChecker());
    result = 31 * result + (isCatchIntermediateIOExceptions() ? 1 : 0);
    result = 31 * result + (getExtractActions() ? 1 : 0);
    return result;
}