/**
 * Parses a PDF that is held entirely in a byte array.
 *
 * @param input byte array that contains the document.
 * @param password password to be used for decryption
 * @param keyStore key store to be used for decryption when using public key security
 * @param alias alias to be used for decryption when using public key security
 * @param memUsageSetting defines how memory is used for buffering input stream and PDF streams
 *
 * @return loaded document
 *
 * @throws InvalidPasswordException If the password is incorrect.
 * @throws IOException In case of a reading or parsing error.
 */
public static PDDocument load(byte[] input, String password, InputStream keyStore,
        String alias, MemoryUsageSetting memUsageSetting) throws IOException
{
    ScratchFile scratchFile = new ScratchFile(memUsageSetting);
    try
    {
        RandomAccessRead source = new RandomAccessBuffer(input);
        PDFParser parser = new PDFParser(source, password, keyStore, alias, scratchFile);
        parser.parse();
        return parser.getPDDocument();
    }
    catch (IOException ioe)
    {
        // Parsing failed, so no document is handed back to the caller that could
        // release the scratch file later; release it here before rethrowing.
        try
        {
            scratchFile.close();
        }
        catch (IOException ignored)
        {
            // The parsing failure is the error the caller needs to see.
        }
        throw ioe;
    }
}
/**
 * Stores the given source in {@code pdfSource}, parses it with a
 * {@code PDFParser} and keeps the parser's document in {@code visualSignature}.
 *
 * @param rar the source to parse
 *
 * @throws IOException if constructing the parser or parsing fails
 */
private void initFromRandomAccessRead(RandomAccessRead rar) throws IOException
{
    pdfSource = rar;

    PDFParser signatureParser = new PDFParser(pdfSource);
    signatureParser.parse();

    visualSignature = signatureParser.getDocument();
}
// As written, initialParse() runs only when both parsePDFHeader() and
// parseFDFHeader() return false.
// NOTE(review): this reads like an excerpt with statements elided between the
// guard and the call - confirm against the full method before relying on it.
if (!parsePDFHeader() && !parseFDFHeader()) initialParse();
/**
 * Builds the PD-layer document for the content that was parsed. The caller
 * owns the returned document and must call close() on it to release
 * resources.
 *
 * @return The document at the PD layer.
 *
 * @throws IOException If there is an error getting the document.
 */
public PDDocument getPDDocument() throws IOException
{
    PDDocument pdDocument = new PDDocument(getDocument(), source, getAccessPermission());
    // Hand the parser's encryption information over to the new document.
    pdDocument.setEncryptionDictionary(getEncryption());
    return pdDocument;
}
/**
 * Extracts the text of the PDF file at the given path.
 *
 * The text is first read with a {@code PDFTextStripper}. If that yields no
 * usable text (null, empty, or rejected by {@code verifyValidOCRlenght}),
 * {@code fileOCR(url)} is tried, and if the result is still unusable,
 * {@code convertEncryptedPDFDocument(url)} is tried. The final text is passed
 * through {@code NormalizationForm.removeOffsetProblemSituation}.
 *
 * @param url path of the PDF file to read
 *
 * @return the normalized text
 *
 * @throws FileNotFoundException if the file cannot be opened
 * @throws IOException if reading or parsing the PDF fails
 */
public static String convertPDFDocument(String url) throws FileNotFoundException, IOException {
    PDFTextStripper stripper = new PDFTextStripper();
    String text;

    FileInputStream input = new FileInputStream(url);
    try {
        PDFParser parser = new PDFParser(input);
        parser.parse();
        PDDocument doc = parser.getPDDocument();
        try {
            text = stripper.getText(doc);
        } finally {
            // Release parser and document resources even when text extraction fails.
            try {
                parser.clearResources();
            } finally {
                doc.close();
            }
        }
    } finally {
        // The stream was opened here, so it is closed here on every path.
        input.close();
    }

    if (lacksUsableText(text)) {
        try {
            text = fileOCR(url);
        } catch (TesseractException e) {
            // Best effort: a failed fallback leaves an empty text for the next step.
            text = "";
        }
    }
    if (lacksUsableText(text)) {
        try {
            text = convertEncryptedPDFDocument(url);
        } catch (TesseractException e) {
            // Best effort: a failed fallback results in an empty text.
            text = "";
        }
    }
    return NormalizationForm.removeOffsetProblemSituation(text);
}

/** Tells whether the text is null, empty, or rejected by verifyValidOCRlenght. */
private static boolean lacksUsableText(String text) {
    return text == null || text.isEmpty() || !verifyValidOCRlenght(text);
}
try super.parse();
COSDictionary trailer = retrieveTrailer(); COSBase base = parseTrailerValuesDynamically(trailer); if (!(base instanceof COSDictionary)) if (isLenient() && !root.containsKey(COSName.TYPE)) parseDictObjects(root, (COSName[]) null); parseDictObjects((COSDictionary) infoBase, (COSName[]) null); checkPages(root); document.setDecrypted(); initialParseDone = true;
/**
 * Builds the PD-layer document for the content that was parsed. The caller
 * owns the returned document and must call close() on it to release
 * resources.
 *
 * @return The document at the PD layer.
 *
 * @throws IOException If there is an error getting the document.
 */
public PDDocument getPDDocument() throws IOException
{
    PDDocument pdDocument = new PDDocument(getDocument(), source, accessPermission);
    // Hand the parser's encryption information over to the new document.
    pdDocument.setEncryptionDictionary(encryption);
    return pdDocument;
}
@Override /** * Fill the CosDocument with some object that isn't set by the NonSequentialParser */ protected void initialParse() throws IOException { super.initialParse(); // For each ObjectKey, we check if the object has been loaded // useful for linearized PDFs Map<COSObjectKey, Long> xrefTable = document.getXrefTable(); for (Entry<COSObjectKey, Long> entry : xrefTable.entrySet()) { COSObject co = document.getObjectFromPool(entry.getKey()); if (co.getObject() == null) { // object isn't loaded - parse the object to load its content parseObjectDynamically(co, true); } } }
try super.parse();
COSDictionary trailer = retrieveTrailer(); prepareDecryption(); COSBase base = parseTrailerValuesDynamically(trailer); if (!(base instanceof COSDictionary)) if (isLenient() && !root.containsKey(COSName.TYPE)) parseDictObjects(root, (COSName[]) null); parseDictObjects((COSDictionary) infoBase, (COSName[]) null); checkPages(root); document.setDecrypted(); initialParseDone = true;
/**
 * Builds the PD-layer document for the content that was parsed. The caller
 * owns the returned document and must call close() on it to release
 * resources.
 *
 * @return The document at the PD layer.
 *
 * @throws IOException If there is an error getting the document.
 */
public PDDocument getPDDocument() throws IOException
{
    PDDocument pdDocument = new PDDocument(getDocument(), source, getAccessPermission());
    // Hand the parser's encryption information over to the new document.
    pdDocument.setEncryptionDictionary(getEncryption());
    return pdDocument;
}
@Override /** * Fill the CosDocument with some object that isn't set by the NonSequentialParser */ protected void initialParse() throws IOException { super.initialParse(); // For each ObjectKey, we check if the object has been loaded // useful for linearized PDFs Map<COSObjectKey, Long> xrefTable = document.getXrefTable(); for (Entry<COSObjectKey, Long> entry : xrefTable.entrySet()) { COSObject co = document.getObjectFromPool(entry.getKey()); if (co.getObject() == null) { // object isn't loaded - parse the object to load its content parseObjectDynamically(co, true); } } }
// Build a parser from the source, the decryption inputs (password, key store,
// alias) and the scratch file, parse the input, and return the resulting
// PD-layer document.
PDFParser parser = new PDFParser(source, password, keyStore, alias, scratchFile); parser.parse(); return parser.getPDDocument();
/**
 * Stores the given source in {@code pdfSource}, parses it with a
 * {@code PDFParser} and keeps the parser's document in {@code visualSignature}.
 *
 * @param rar the source to parse
 *
 * @throws IOException if constructing the parser or parsing fails
 */
private void initFromRandomAccessRead(RandomAccessRead rar) throws IOException
{
    pdfSource = rar;

    PDFParser signatureParser = new PDFParser(pdfSource);
    signatureParser.parse();

    visualSignature = signatureParser.getDocument();
}
try super.parse();
COSDictionary trailer = retrieveTrailer(); COSBase base = parseTrailerValuesDynamically(trailer); if (!(base instanceof COSDictionary)) if (isLenient() && !root.containsKey(COSName.TYPE)) parseDictObjects(root, (COSName[]) null); parseDictObjects((COSDictionary) infoBase, (COSName[]) null); checkPages(root); if (!(root.getDictionaryObject(COSName.PAGES) instanceof COSDictionary))
// As written, initialParse() runs only when both parsePDFHeader() and
// parseFDFHeader() return false.
// NOTE(review): this reads like an excerpt with statements elided between the
// guard and the call - confirm against the full method before relying on it.
if (!parsePDFHeader() && !parseFDFHeader()) initialParse();
@Override /** * Fill the CosDocument with some object that isn't set by the NonSequentialParser */ protected void initialParse() throws IOException { super.initialParse(); // For each ObjectKey, we check if the object has been loaded // useful for linearized PDFs Map<COSObjectKey, Long> xrefTable = document.getXrefTable(); for (Entry<COSObjectKey, Long> entry : xrefTable.entrySet()) { COSObject co = document.getObjectFromPool(entry.getKey()); if (co.getObject() == null) { // object isn't loaded - parse the object to load its content parseObjectDynamically(co, true); } } }
try PDFParser parser = new PDFParser(raFile, password, keyStore, alias, scratchFile); parser.parse(); return parser.getPDDocument();