org.apache.tika.parser.pdf.PDFPureJavaParserConfig java code examples

/**
 * Compares this configuration with another object, setting by setting.
 *
 * <p>Reference-typed settings are compared null-safely. The average-char and spacing
 * tolerances are null-checked by {@code configure}, so they may legitimately be unset;
 * calling {@code equals} on them directly would throw a {@link NullPointerException}.
 *
 * @param o the object to compare against; may be {@code null}
 * @return {@code true} if {@code o} is a {@code PDFPureJavaParserConfig} whose settings
 *     all match this one's
 */
@Override
public boolean equals(Object o) {
  if (this == o) return true;
  if (!(o instanceof PDFPureJavaParserConfig)) return false;
  PDFPureJavaParserConfig config = (PDFPureJavaParserConfig) o;

  // Primitive-valued settings.
  if (getEnableAutoSpace() != config.getEnableAutoSpace()) return false;
  if (getSuppressDuplicateOverlappingText() != config.getSuppressDuplicateOverlappingText()) return false;
  if (getExtractAnnotationText() != config.getExtractAnnotationText()) return false;
  if (getSortByPosition() != config.getSortByPosition()) return false;
  if (getExtractAcroFormContent() != config.getExtractAcroFormContent()) return false;
  if (getExtractInlineImages() != config.getExtractInlineImages()) return false;
  if (getExtractUniqueInlineImagesOnly() != config.getExtractUniqueInlineImagesOnly()) return false;
  if (getIfXFAExtractOnlyXFA() != config.getIfXFAExtractOnlyXFA()) return false;
  if (getOcrDPI() != config.getOcrDPI()) return false;
  if (isCatchIntermediateIOExceptions() != config.isCatchIntermediateIOExceptions()) return false;

  // Reference-valued settings: Objects.equals tolerates null on either side.
  if (!java.util.Objects.equals(getAverageCharTolerance(), config.getAverageCharTolerance())) return false;
  if (!java.util.Objects.equals(getSpacingTolerance(), config.getSpacingTolerance())) return false;
  if (!java.util.Objects.equals(getOcrStrategy(), config.getOcrStrategy())) return false;
  if (getOcrImageType() != config.getOcrImageType()) return false;
  if (!java.util.Objects.equals(getOcrImageFormatName(), config.getOcrImageFormatName())) return false;
  if (getExtractActions() != config.getExtractActions()) return false;
  return java.util.Objects.equals(getAccessChecker(), config.getAccessChecker());
}

// Load each boolean setting from props. The current value is passed as the second
// argument to getBooleanProp (presumably the fallback when the property is absent —
// confirm against getBooleanProp).
setEnableAutoSpace(getBooleanProp(
    props.getProperty("enableAutoSpace"),
    getEnableAutoSpace()));
setSuppressDuplicateOverlappingText(getBooleanProp(
    props.getProperty("suppressDuplicateOverlappingText"),
    getSuppressDuplicateOverlappingText()));
setExtractAnnotationText(getBooleanProp(
    props.getProperty("extractAnnotationText"),
    getExtractAnnotationText()));
setSortByPosition(getBooleanProp(
    props.getProperty("sortByPosition"),
    getSortByPosition()));
setExtractAcroFormContent(getBooleanProp(
    props.getProperty("extractAcroFormContent"),
    getExtractAcroFormContent()));
setExtractInlineImages(getBooleanProp(
    props.getProperty("extractInlineImages"),
    getExtractInlineImages()));
setExtractUniqueInlineImagesOnly(getBooleanProp(
    props.getProperty("extractUniqueInlineImagesOnly"),
    getExtractUniqueInlineImagesOnly()));
setIfXFAExtractOnlyXFA(getBooleanProp(
    props.getProperty("ifXFAExtractOnlyXFA"),
    getIfXFAExtractOnlyXFA()));
setCatchIntermediateIOExceptions(getBooleanProp(
    props.getProperty("catchIntermediateIOExceptions"),
    isCatchIntermediateIOExceptions()));

/**
 * Applies this configuration's text-extraction settings to the given extractor.
 *
 * <p>Sets, in order: sort-by-position, the word separator (a single space when
 * auto-space is enabled, otherwise the empty string), the average-char and spacing
 * tolerances (each only when non-null), and duplicate-overlapping-text suppression.
 *
 * @param pdf2XHTML the extractor to configure
 */
public void configure(PDF2XHTMLPureJava pdf2XHTML) {
  pdf2XHTML.setSortByPosition(getSortByPosition());

  // Auto-space on: separate words with a space; off: emit no separator at all.
  pdf2XHTML.setWordSeparator(getEnableAutoSpace() ? " " : "");

  // An unset (null) tolerance leaves the extractor's own value untouched.
  if (getAverageCharTolerance() != null) {
    pdf2XHTML.setAverageCharTolerance(getAverageCharTolerance());
  }
  if (getSpacingTolerance() != null) {
    pdf2XHTML.setSpacingTolerance(getSpacingTolerance());
  }

  pdf2XHTML.setSuppressDuplicateOverlappingText(getSuppressDuplicateOverlappingText());
}

private void extractImages(PDResources resources, Set<COSBase> seenThisPage) throws SAXException, IOException {
  if (resources == null || config.getExtractInlineImages() == false) {
    return;
      if (config.getExtractUniqueInlineImagesOnly() == true) {
        if (processedInlineImages.containsKey(cosStream)) {
          continue;

AccessChecker checker = localConfig.getAccessChecker();
checker.check(metadata);
if (handler != null) {
  if (shouldHandleXFAOnly(pdfDocument, localConfig)) {
    handleXFAOnly(pdfDocument, handler, metadata, context);
  } else if (localConfig.getOcrStrategy().equals(PDFPureJavaParserConfig.OCR_STRATEGY.OCR_ONLY)) {
    metadata.add("X-Parsed-By", "org.apache.tika.parser.ocr.TesseractOCRParser");
    if (localConfig.getOcrStrategy().equals(PDFPureJavaParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
      metadata.add("X-Parsed-By", "org.apache.tika.parser.ocr.TesseractOCRParser");

/**
 * If true, text in annotations will be extracted.
 *
 * <p>Delegates to the default configuration.
 *
 * @return the default configuration's extractAnnotationText setting
 * @deprecated use {@link #getPDFParserConfig()}
 */
@Deprecated
public boolean getExtractAnnotationText() {
  return defaultConfig.getExtractAnnotationText();
}

/**
 * Returns the enableAutoSpace setting of the default configuration.
 *
 * @return the default configuration's enableAutoSpace setting
 * @see #setEnableAutoSpace(boolean)
 * @deprecated use {@link #getPDFParserConfig()}
 */
@Deprecated
public boolean getEnableAutoSpace() {
  return defaultConfig.getEnableAutoSpace();
}

config.configure(pdf2XHTML);

private void handleDestinationOrAction(PDDestinationOrAction action,
                    ActionTrigger actionTrigger) throws IOException, SAXException, TikaException {
  if (action == null || ! config.getExtractActions()) {
    return;

if (config.getExtractAcroFormContent() == true) {
  try {
    extractAcroForm(pdf);

if (config.getExtractAnnotationText()) {
  if (annotation instanceof PDAnnotationLink) {
    PDAnnotationLink annotationlink = (PDAnnotationLink) annotation;

/**
 * Computes a hash code from the same settings that {@code equals} compares.
 *
 * <p>Reference-typed settings are hashed null-safely (a null contributes 0). The
 * average-char and spacing tolerances are null-checked by {@code configure}, so they may
 * be unset; calling {@code hashCode()} on them directly would throw a
 * {@link NullPointerException}. For non-null values the result is unchanged.
 *
 * @return the combined hash of all settings
 */
@Override
public int hashCode() {
  int result = (getEnableAutoSpace() ? 1 : 0);
  result = 31 * result + (getSuppressDuplicateOverlappingText() ? 1 : 0);
  result = 31 * result + (getExtractAnnotationText() ? 1 : 0);
  result = 31 * result + (getSortByPosition() ? 1 : 0);
  result = 31 * result + (getExtractAcroFormContent() ? 1 : 0);
  result = 31 * result + (getExtractInlineImages() ? 1 : 0);
  result = 31 * result + (getExtractUniqueInlineImagesOnly() ? 1 : 0);
  result = 31 * result + java.util.Objects.hashCode(getAverageCharTolerance());
  result = 31 * result + java.util.Objects.hashCode(getSpacingTolerance());
  result = 31 * result + (getIfXFAExtractOnlyXFA() ? 1 : 0);
  // Read through the getter, as equals does, rather than the raw field.
  result = 31 * result + java.util.Objects.hashCode(getOcrStrategy());
  result = 31 * result + getOcrDPI();
  result = 31 * result + java.util.Objects.hashCode(getOcrImageType());
  result = 31 * result + java.util.Objects.hashCode(getOcrImageFormatName());
  result = 31 * result + java.util.Objects.hashCode(getAccessChecker());
  result = 31 * result + (isCatchIntermediateIOExceptions() ? 1 : 0);
  result = 31 * result + (getExtractActions() ? 1 : 0);
  return result;
}

Javadoc

Config for PDFParser.

This allows parameters to be set programmatically:

Calls to PDFParser, i.e. parser.getPDFParserConfig().setEnableAutoSpace() (as before)
Constructor of PDFParser
Passing to PDFParser through a ParseContext: context.set(PDFParserConfig.class, config);

Parameters can also be set by modifying the PDFParserConfig.properties file, which lives in the expected places, in trunk: tika-parsers/src/main/resources/org/apache/tika/parser/pdf

Or, in tika-app-x.x.jar or tika-parsers-x.x.jar: org/apache/tika/parser/pdf

Most used methods

Popular in Java

Running tasks concurrently on multiple threads
getContentResolver (Context)
startActivity (Activity)
scheduleAtFixedRate (Timer)
Comparator (java.util)
A Comparator is used to compare two objects to determine their ordering with respect to each other.
PriorityQueue (java.util)
A PriorityQueue holds elements on a priority heap, which orders the elements according to their natural order…
Properties (java.util)
A Properties object is a Hashtable where the keys and values must be Strings. Each property can have
XPath (javax.xml.xpath)
XPath provides access to the XPath evaluation environment and expressions. Evaluation of XPath expressions…
Menu (java.awt)
Join (org.hibernate.mapping)
Top PhpStorm plugins

How to use PDFPureJavaParserConfig in org.apache.tika.parser.pdf

Best Java code snippets using org.apache.tika.parser.pdf.PDFPureJavaParserConfig (Showing top 12 results out of 315)

How to use
PDFPureJavaParserConfig
in
org.apache.tika.parser.pdf