org.apache.pdfbox.pdfparser.PDFParser Java code examples

/**
 * Parses a PDF that is held entirely in a byte array.
 * 
 * If parsing fails with an IOException, the scratch file created for this call
 * is closed before the exception is rethrown.
 * 
 * @param input byte array that contains the document.
 * @param password password to be used for decryption
 * @param keyStore key store to be used for decryption when using public key security 
 * @param alias alias to be used for decryption when using public key security
 * @param memUsageSetting defines how memory is used for buffering input stream and PDF streams 
 * 
 * @return loaded document
 * 
 * @throws InvalidPasswordException If the password is incorrect.
 * @throws IOException In case of a reading or parsing error.
 */
public static PDDocument load(byte[] input, String password, InputStream keyStore, 
    String alias, MemoryUsageSetting memUsageSetting) throws IOException
{
  ScratchFile scratchFile = new ScratchFile(memUsageSetting);
  try
  {
    RandomAccessRead source = new RandomAccessBuffer(input);
    PDFParser parser = new PDFParser(source, password, keyStore, alias, scratchFile);
    parser.parse();
    return parser.getPDDocument();
  }
  catch (IOException ioe)
  {
    // The caller gets no document back on this path, so release the scratch
    // file here instead of leaving it open.
    try
    {
      scratchFile.close();
    }
    catch (IOException ignored)
    {
      // The parse failure is the error worth reporting; a secondary failure
      // while closing is intentionally dropped.
    }
    throw ioe;
  }
}

/**
 * Stores the given source as {@code pdfSource}, parses it with a new
 * PDFParser and keeps the resulting document in {@code visualSignature}.
 *
 * @param rar the random access source to parse
 *
 * @throws IOException if parsing the source fails
 */
private void initFromRandomAccessRead(RandomAccessRead rar) throws IOException
{
  pdfSource = rar;

  PDFParser signatureParser = new PDFParser(pdfSource);
  signatureParser.parse();

  visualSignature = signatureParser.getDocument();
}

if (!parsePDFHeader() && !parseFDFHeader())
  initialParse();

/**
 * Builds a new PD-layer document from the parsed document, the source and
 * the access permission, and attaches the encryption dictionary to it.
 * When you are done with the returned document you must call close() on it
 * to release resources.
 *
 * @return The document at the PD layer.
 *
 * @throws IOException If there is an error getting the document.
 */
public PDDocument getPDDocument() throws IOException
{
  PDDocument pdDocument = new PDDocument(getDocument(), source, getAccessPermission());
  pdDocument.setEncryptionDictionary(getEncryption());
  return pdDocument;
}

/**
 * Extracts the text of the PDF file at the given path.
 *
 * The text is first read with a PDFTextStripper. If that result is null,
 * empty, or rejected by verifyValidOCRlenght, fileOCR(url) is tried; if the
 * result is still unusable, convertEncryptedPDFDocument(url) is tried. A
 * TesseractException in either fallback yields an empty string for that step.
 * The final text is passed through
 * NormalizationForm.removeOffsetProblemSituation before being returned.
 *
 * The input stream, the parser resources and the document are released even
 * when parsing or text extraction fails.
 *
 * @param url path of the PDF file to read
 *
 * @return the normalized text
 *
 * @throws FileNotFoundException if the file cannot be opened
 * @throws IOException if reading or parsing the file fails
 */
public static String convertPDFDocument(String url) throws FileNotFoundException, IOException {
  PDFTextStripper stripper = new PDFTextStripper();
  String text;
  FileInputStream in = new FileInputStream(url);
  try {
    PDFParser parser = new PDFParser(in);
    parser.parse();
    PDDocument doc = parser.getPDDocument();
    try {
      text = stripper.getText(doc);
    } finally {
      // release parser and document even if text extraction throws
      parser.clearResources();
      doc.close();
    }
  } finally {
    // the stream was opened here, so it is closed here on every path
    in.close();
  }
  if (text == null || text.isEmpty() || !verifyValidOCRlenght(text))
  {
    try {
      text = fileOCR(url);
    } catch (TesseractException e) {
      // best effort: an OCR failure leaves an empty result so the next fallback runs
      text = "";
    }
  }
  if (text == null || text.isEmpty() || !verifyValidOCRlenght(text))
  {
    try {
      text = convertEncryptedPDFDocument(url);
    } catch (TesseractException e) {
      // best effort: an OCR failure leaves an empty result
      text = "";
    }
  }
  return NormalizationForm.removeOffsetProblemSituation(text);
}

try
  super.parse();

COSDictionary trailer = retrieveTrailer();
COSBase base = parseTrailerValuesDynamically(trailer);
if (!(base instanceof COSDictionary))
if (isLenient() && !root.containsKey(COSName.TYPE))
parseDictObjects(root, (COSName[]) null);
  parseDictObjects((COSDictionary) infoBase, (COSName[]) null);
checkPages(root);
document.setDecrypted();
initialParseDone = true;

/**
 * Builds a new PD-layer document from the parsed document, the source and
 * the stored access permission, and attaches the stored encryption
 * dictionary to it. When you are done with the returned document you must
 * call close() on it to release resources.
 *
 * @return The document at the PD layer.
 *
 * @throws IOException If there is an error getting the document.
 */
public PDDocument getPDDocument() throws IOException
{
  PDDocument pdDocument = new PDDocument(getDocument(), source, accessPermission);
  pdDocument.setEncryptionDictionary(encryption);
  return pdDocument;
}

/**
 * Fill the CosDocument with some object that isn't set by the NonSequentialParser.
 *
 * Runs the inherited initial parse, then visits every entry of the document's
 * xref table and parses each pooled object that has no content yet.
 *
 * @throws IOException if the inherited parse or parsing an object fails
 */
@Override
protected void initialParse() throws IOException
{
  super.initialParse();

  // Check every xref key for an object that was not loaded (useful for linearized PDFs).
  Map<COSObjectKey, Long> xrefEntries = document.getXrefTable();
  for (Entry<COSObjectKey, Long> xrefEntry : xrefEntries.entrySet())
  {
    COSObject pooled = document.getObjectFromPool(xrefEntry.getKey());
    if (pooled.getObject() != null)
    {
      // already loaded, nothing to do
      continue;
    }
    parseObjectDynamically(pooled, true);
  }
}

try
  super.parse();

COSDictionary trailer = retrieveTrailer();
prepareDecryption();
COSBase base = parseTrailerValuesDynamically(trailer);
if (!(base instanceof COSDictionary))
if (isLenient() && !root.containsKey(COSName.TYPE))
parseDictObjects(root, (COSName[]) null);
  parseDictObjects((COSDictionary) infoBase, (COSName[]) null);
checkPages(root);
document.setDecrypted();
initialParseDone = true;

/**
 * Creates the PD-layer view of the parsed document: a new PDDocument is
 * built from the parsed document, the source and the access permission,
 * and the encryption dictionary is set on it. The caller must call close()
 * on the returned document to release resources.
 *
 * @return The document at the PD layer.
 *
 * @throws IOException If there is an error getting the document.
 */
public PDDocument getPDDocument() throws IOException
{
  PDDocument result = new PDDocument(getDocument(), source, getAccessPermission());
  result.setEncryptionDictionary(getEncryption());
  return result;
}

/**
 * Fill the CosDocument with some object that isn't set by the NonSequentialParser.
 *
 * After the inherited initial parse, each key of the document's xref table is
 * looked up in the object pool and parsed if its object is still null.
 *
 * @throws IOException if the inherited parse or parsing an object fails
 */
@Override
protected void initialParse() throws IOException
{
  super.initialParse();

  // Load whatever the xref table lists but the first pass left empty
  // (useful for linearized PDFs).
  Map<COSObjectKey, Long> table = document.getXrefTable();
  for (Entry<COSObjectKey, Long> row : table.entrySet())
  {
    COSObjectKey key = row.getKey();
    COSObject candidate = document.getObjectFromPool(key);
    boolean notLoaded = candidate.getObject() == null;
    if (notLoaded)
    {
      parseObjectDynamically(candidate, true);
    }
  }
}

PDFParser parser = new PDFParser(source, password, keyStore, alias, scratchFile);
parser.parse();
return parser.getPDDocument();

/**
 * Remembers the given source in {@code pdfSource}, runs a PDFParser over it
 * and assigns the parser's document to {@code visualSignature}.
 *
 * @param rar the random access source to parse
 *
 * @throws IOException if parsing the source fails
 */
private void initFromRandomAccessRead(RandomAccessRead rar) throws IOException
{
  pdfSource = rar;

  PDFParser sourceParser = new PDFParser(pdfSource);
  sourceParser.parse();

  visualSignature = sourceParser.getDocument();
}

try
  super.parse();

COSDictionary trailer = retrieveTrailer();
COSBase base = parseTrailerValuesDynamically(trailer);
if (!(base instanceof COSDictionary))
if (isLenient() && !root.containsKey(COSName.TYPE))
parseDictObjects(root, (COSName[]) null);
  parseDictObjects((COSDictionary) infoBase, (COSName[]) null);
checkPages(root);
if (!(root.getDictionaryObject(COSName.PAGES) instanceof COSDictionary))

if (!parsePDFHeader() && !parseFDFHeader())
  initialParse();

/**
 * Fill the CosDocument with some object that isn't set by the NonSequentialParser.
 *
 * Calls the inherited initial parse first, then goes through the document's
 * xref table and parses every pooled object whose content is still missing.
 *
 * @throws IOException if the inherited parse or parsing an object fails
 */
@Override
protected void initialParse() throws IOException
{
  super.initialParse();

  // Useful for linearized PDFs: make sure each object referenced by the xref table is loaded.
  Map<COSObjectKey, Long> offsetsByKey = document.getXrefTable();
  for (Entry<COSObjectKey, Long> offsetEntry : offsetsByKey.entrySet())
  {
    COSObject cosObject = document.getObjectFromPool(offsetEntry.getKey());
    if (cosObject.getObject() != null)
    {
      continue;
    }
    // object isn't loaded yet - parse it to load its content
    parseObjectDynamically(cosObject, true);
  }
}

try
  PDFParser parser = new PDFParser(raFile, password, keyStore, alias, scratchFile);
  parser.parse();
  return parser.getPDDocument();

Most used methods

parse
This will parse the stream and populate the COSDocument object. This will close the keystore stream
<init>
Constructor.
getPDDocument
This will get the PD document that was parsed. When you are done with this document you must call close() on it to release resources.
getDocument
initialParse
The initial parse will first parse only the trailer, the xrefstart and all xref tables to have a poi
checkPages
init
isLenient
lastIndexOf
parseCOSArray
parseCOSName
parseCOSStream

Popular in Java

Running tasks concurrently on multiple threads
getResourceAsStream (ClassLoader)
setContentView (Activity)
addToBackStack (FragmentTransaction)
SocketException (java.net)
This SocketException may be thrown during socket creation or setting options, and is the superclass
DecimalFormat (java.text)
A concrete subclass of NumberFormat that formats decimal numbers. It has a variety of features desig
Dictionary (java.util)
Note: Do not use this class since it is obsolete. Please use the Map interface for new implementatio
Manifest (java.util.jar)
The Manifest class is used to obtain attribute information for a JarFile and its entries.
BorderLayout (java.awt)
A border layout lays out a container, arranging and resizing its components to fit in five regions:
Option (scala)
Top plugins for Android Studio

How to use PDFParser in org.apache.pdfbox.pdfparser

Best Java code snippets using org.apache.pdfbox.pdfparser.PDFParser (Showing top 20 results out of 315)

How to use PDFParser in org.apache.pdfbox.pdfparser