/**
 * Closes the wrapped document.
 *
 * @throws IOException if the document reports an error while closing
 */
public void close() throws IOException {
    document.close();
}
/**
 * Creates the destination of an annotation from the page it belongs to.
 * <p>
 * The page's node index is incremented by one before it is handed to
 * {@code PageDestination} (presumably turning a zero-based index into a
 * one-based page number - confirm against PageDestination).
 *
 * @param annotation the annotation whose page is looked up
 * @return a page destination for the annotation's page
 */
public APDObjectDestination getDestination(PDAnnotation annotation) {
    int pageNumber = PDAnnotationTools.getPage(annotation).getNodeIndex() + 1;
    return new PageDestination(pageNumber);
}
/**
 * Determines the page number a bookmark (outline item) points to.
 * <p>
 * The explicit destination of the item is tried first. If it cannot be
 * resolved to a page, the item's {@code /A} entry is consulted and its
 * destination array is searched for a page.
 *
 * @param item the outline item, may be {@code null}
 * @return the page's node index plus one, or {@code null} if the item is
 *         {@code null} or no target page could be determined
 * @throws IOException      if the document cannot be read
 * @throws COSLoadException if the document cannot be parsed
 */
private Integer getBookmarkDestinationPage(PDOutlineItem item) throws IOException, COSLoadException {
    if (item == null) {
        return null;
    }

    if (item.getDestination() != null) {
        PDExplicitDestination destination = item.getDestination().getResolvedDestination(getDocument());
        if (destination != null) {
            PDPage page = destination.getPage(getDocument());
            // A destination may resolve without yielding a page; fall through
            // to the action lookup instead of failing with a NullPointerException.
            if (page != null) {
                return page.getNodeIndex() + 1;
            }
        }
    }

    // Only a dictionary can be interpreted as an action. Any other value
    // (including the COS null object) means there is nothing to resolve; the
    // previous unchecked cast threw a ClassCastException for such values.
    COSObject action = item.cosGetField(PDOutlineItem.DK_A);
    if (action instanceof COSDictionary) {
        COSArray target = getCOSArrayFromDestination((COSDictionary) action);
        return getPageFromCOSArray(target);
    }
    return null;
}
private Integer getPageFromCOSArray(COSArray destination) { // DOCEAR: fallback if no entry was found if (destination == null) { return 1; } Iterator<?> it = destination.iterator(); while (it.hasNext()) { COSObject o = (COSObject) it.next(); if (o.isIndirect()) { // the page is indirect referenced o.dereference(); } PDPage page = getDocument().getPageTree().getFirstPage(); while (page != null) { if (page.cosGetObject().equals(o)) { return page.getNodeIndex() + 1; } page = page.getNextPage(); } } return null; }
/**
 * Computes {@code uniqueHash} from the beginning of the document.
 * <p>
 * Text extraction is attempted on the first page, or on the following page
 * if the first page's content has no entries. If text extraction yields no
 * entry, the hash is taken from image extraction on that page instead.
 * The document is closed afterwards in every case.
 *
 * @throws IOException if the document cannot be opened or closed
 */
private void onlyHashExtraction() throws IOException {
    try {
        PDPage page = getPDDocument(getDocument()).getPageTree().getFirstPage();
        // A document without pages has nothing to hash; previously this
        // dereferenced a null page.
        if (page != null && page.isPage()) {
            try {
                if (!page.cosGetContents().basicIterator().hasNext()) {
                    page = page.getNextPage();
                }
                // The fallback page does not exist in a single-page document.
                if (page != null) {
                    TreeMap<PdfTextEntity, StringBuilder> map = tryTextExtraction(page);
                    Entry<PdfTextEntity, StringBuilder> entry = map.firstEntry();
                    if (entry == null) {
                        UniqueImageHashExtractor handler = new UniqueImageHashExtractor();
                        tryImageExtraction(page, handler);
                        uniqueHash = handler.getUniqueHash();
                    }
                }
            } catch (Exception ignored) {
                // Deliberately best effort: a page that cannot be interpreted
                // simply leaves uniqueHash at whatever value it had so far.
            }
        }
    } finally {
        close();
    }
}
/**
 * Appends the text content of every page below {@code pageTree} to {@code sb},
 * descending recursively into nested page trees.
 * <p>
 * A page whose content stream raises a {@code CSException} is reported via
 * {@code printStackTrace()} and skipped; the remaining pages are still processed.
 *
 * @param pageTree the (sub)tree whose kids are visited
 * @param sb       receives the extracted text
 */
private void extractText(PDPageTree pageTree, StringBuilder sb) {
    Iterator<?> kids = pageTree.getKids().iterator();
    while (kids.hasNext()) {
        PDPageNode child = (PDPageNode) kids.next();

        // Inner node: recurse into the subtree.
        if (!child.isPage()) {
            extractText((PDPageTree) child, sb);
            continue;
        }

        // Leaf: interpret the page's content stream with a text-collecting device.
        try {
            CSTextExtractor textDevice = new CSTextExtractor();
            PDPage leaf = (PDPage) child;
            AffineTransform deviceTx = new AffineTransform();
            PDFGeometryTools.adjustTransform(deviceTx, leaf);
            textDevice.setDeviceTransform(deviceTx);
            new CSDeviceBasedInterpreter(null, textDevice)
                    .process(leaf.getContentStream(), leaf.getResources());
            sb.append(textDevice.getContent());
        } catch (CSException failure) {
            failure.printStackTrace();
        }
    }
}
/**
 * Attempts to open the given file as a PDF through JPod.
 * <p>
 * Failures to open or parse the file are logged as warnings and reported
 * to the caller as {@code null}.
 *
 * @param imgPath the provided (PDF) input file
 * @return a JPod-based loader sized to the document's page count, or
 *         {@code null} if the document could not be created
 */
private static Loader getJPodLoader (Path imgPath) {
    logger.debug("getJPodLoader {}", imgPath);

    final PDDocument pdf;
    try {
        pdf = PDDocument.createFromLocator(new FileLocator(imgPath.toFile()));
    } catch (IOException ex) {
        logger.warn("Error opening pdf file " + imgPath, ex);
        return null;
    } catch (COSLoadException ex) {
        logger.warn("Invalid pdf file " + imgPath, ex);
        return null;
    }

    if (pdf == null) {
        return null;
    }

    // One "image" per page of the document.
    return new JPodLoader(pdf, pdf.getPageTree().getCount());
}
/**
 * Interprets the page's content stream with an image-extracting device that
 * is constructed around {@code imageHandler}.
 *
 * @param page         the page whose content stream is processed
 * @param imageHandler handler passed to the image extractor
 */
private void tryImageExtraction(PDPage page, IDocearPdfImageHandler imageHandler) {
    CSDeviceBasedInterpreter pageInterpreter =
            new CSDeviceBasedInterpreter(null, new CSImageExtractor(imageHandler));
    pageInterpreter.process(page.getContentStream(), page.getResources());
}
/**
 * Collects the document's bookmarks (outline entries) into a list.
 *
 * @param bookmarkList list to append to; a new list is created when {@code null}
 * @return the list that received the bookmarks
 * @throws IOException if reading the outline fails; an {@code IOException}
 *                     raised during traversal is passed on unchanged, any
 *                     other exception is wrapped as its cause
 */
private List<APDMetaObject> getBookmarks(List<APDMetaObject> bookmarkList) throws IOException {
    if (bookmarkList == null) {
        bookmarkList = new ArrayList<APDMetaObject>();
    }

    try {
        PDOutlineNode outline;
        try {
            outline = getDocument().getOutline();
        } catch (ClassCastException ex) {
            // Fallback kept from the original code: when getOutline() fails
            // with a ClassCastException, build the outline node directly from
            // the catalog's outline object.
            outline = (PDOutlineNode) PDOutline.META.createFromCos(getDocument().getCatalog().cosGetOutline());
        }
        getBookmarks(outline, bookmarkList);
    } catch (IOException e) {
        // Already the declared type: do not bury it inside a second IOException.
        throw e;
    } catch (Exception e) {
        throw new IOException(e);
    }
    return bookmarkList;
}
|| annotation.getClass() == PDSquigglyAnnotation.class) && !ignoreHighlights()) { Integer objectNumber = annotation.cosGetObject().getIndirectObject().getObjectNumber(); COSObjectContext context = new COSObjectContext(annotation); APDMetaObject meta = new HighlightAnnotation(getOrCreateUID(context), context); if (annotation.getContents() != null && annotation.getContents().length() > 0) { meta.setText(annotation.getContents()); String subject = ((PDHighlightAnnotation) annotation).getSubject(); if (subject != null && subject.length() > 0) { if (!subject.equalsIgnoreCase("Highlight") && !subject.equalsIgnoreCase("Hervorheben")) { meta.setText(((PDHighlightAnnotation) annotation).getSubject());
/**
 * Converts the children of an outline node into {@code Bookmark} objects,
 * recursing into children that have children of their own, and appends them
 * to {@code bookmarks} in iteration order.
 *
 * @param parent    the outline node whose children are converted; nothing
 *                  happens when {@code null}
 * @param bookmarks receives one bookmark per child
 */
private void getBookmarks(PDOutlineNode parent, List<APDMetaObject> bookmarks) throws IOException, COSLoadException, COSRuntimeException {
    if (parent == null) {
        return;
    }

    @SuppressWarnings("unchecked")
    List<PDOutlineItem> items = parent.getChildren();
    for (PDOutlineItem item : items) {
        COSObjectContext ctx = new COSObjectContext(item);
        Bookmark bookmark = new Bookmark(getOrCreateUID(ctx), ctx);
        setBookmarkDestination(bookmark, item);

        int number = item.cosGetObject().getIndirectObject().getObjectNumber();
        bookmark.setObjectNumber(number);
        bookmark.setText(item.getTitle());

        // Descend only when there is something below this item.
        if (!item.getChildren().isEmpty()) {
            getBookmarks(item, bookmark.getChildren());
        }
        bookmarks.add(bookmark);
    }
}
/**
 * Wraps an annotation as a comment meta object.
 * <p>
 * Only annotations whose exact class is {@code PDAnyAnnotation} or
 * {@code PDTextAnnotation} qualify, and only while comments are not ignored.
 *
 * @param annotation the annotation to convert
 * @return the comment carrying the annotation's object number, contents and
 *         destination, or {@code null} if the annotation does not qualify
 */
public APDMetaObject getComment(PDAnnotation annotation) {
    Class<?> type = annotation.getClass();
    boolean commentType = type == PDAnyAnnotation.class || type == PDTextAnnotation.class;
    if (!commentType || ignoreComments()) {
        return null;
    }

    Integer number = annotation.cosGetObject().getIndirectObject().getObjectNumber();
    COSObjectContext ctx = new COSObjectContext(annotation);
    APDMetaObject comment = new CommentAnnotation(getOrCreateUID(ctx), ctx);
    comment.setObjectNumber(number);
    comment.setText(annotation.getContents());
    comment.setDestination(getDestination(annotation));
    return comment;
}
protected APDMetaObject getMetaObject(PDAnnotation annotation, String lastString) { // Avoid empty entries // support repligo highlights if (annotation.getClass() == PDHighlightAnnotation.class) { // ignore Highlight if Subject is "Highlight" and Contents is "" if (((PDHighlightAnnotation) annotation).getSubject() != null && ((PDHighlightAnnotation) annotation).getSubject().length() > 0 && ((PDHighlightAnnotation) annotation).getSubject().equals("Highlight") && annotation.getContents().equals("")) { return null; } } else if (!(annotation.getClass() == PDSquigglyAnnotation.class) && !(annotation.getClass() == PDUnderlineAnnotation.class) && !(annotation.getClass() == PDStrikeOutAnnotation.class) && !(annotation.getClass() == PDTextMarkupAnnotation.class)) { // ignore annotations with Contents is "" if ("".equals(annotation.getContents())/* && !annotation.isMarkupAnnotation() */) { return null; } // Avoid double entries (Foxit Reader) if (annotation.getContents().equals(lastString)) { return null; } lastString = annotation.getContents(); } APDMetaObject metaObject = getComment(annotation); if(metaObject == null) { metaObject = getHighlight(annotation); } return metaObject; }
/**
 * Extracts the text of all pages of the document as one string and closes
 * the document afterwards, whether or not extraction succeeded.
 *
 * @return the concatenated text of all pages
 * @throws IOException if the document cannot be opened or closed
 */
public String extractPlainText() throws IOException {
    StringBuilder text = new StringBuilder();
    try {
        extractText(getPDDocument(getDocument()).getPageTree(), text);
        return text.toString();
    } finally {
        close();
    }
}
/**
 * Creates a {@code PDDocument} on top of the given COS document.
 * <p>
 * If creation fails, the COS document is closed (when not {@code null})
 * and the failure is reported as an {@code IOException} whose cause is the
 * original exception. A failure while closing is attached as a suppressed
 * exception so that it can no longer replace the original cause.
 *
 * @param cosDoc the low-level document to wrap
 * @return the PD-level document
 * @throws IOException if the document could not be created
 */
public static PDDocument getPDDocument(COSDocument cosDoc) throws IOException {
    try {
        return PDDocument.createFromCos(cosDoc);
    } catch (Exception e) {
        IOException failure = new IOException(e);
        if (cosDoc != null) {
            try {
                cosDoc.close();
            } catch (IOException closeFailure) {
                // Keep the creation error as the primary failure.
                failure.addSuppressed(closeFailure);
            }
        }
        throw failure;
    }
}
public void handleImage(CSBasicDevice device, PDImage image) { byte[] buffer = image.getBytes(); // long time = System.currentTimeMillis(); // this.uniqueHash = Long.toString(HashUtililities.hashFNV64(buffer), 16); // System.out.println("time FNV64: "+(System.currentTimeMillis()-time)); // System.out.println(this.uniqueHash); // time = System.currentTimeMillis(); // this.uniqueHash = Long.toString(HashUtililities.hashBerkeleyDB64(buffer), 16); // System.out.println("time BerkeleyDB64: "+(System.currentTimeMillis()-time)); // System.out.println(this.uniqueHash); // time = System.currentTimeMillis(); // this.uniqueHash = Integer.toHexString(Arrays.deepHashCode(new Object[]{buffer})); // System.out.println("time default: "+(System.currentTimeMillis()-time)); // System.out.println(this.uniqueHash); this.uniqueHash = HashUtililities.hashSHA2(buffer); }
/**
 * Converts the document's annotations into meta objects and appends every
 * non-{@code null} result to {@code annotations}.
 * <p>
 * NOTE(review): when {@code annotations} is {@code null} the results go into
 * a list local to this method and are not visible to the caller.
 * <p>
 * NOTE(review): the string handed to {@code getMetaObject} is always the
 * empty string because it is never reassigned here; if suppression of
 * consecutive duplicates is intended, this method would have to track the
 * previous contents itself - confirm the desired behaviour before changing.
 *
 * @param annotations list that receives the converted annotations
 */
private void getAnnotations(List<APDMetaObject> annotations) throws IOException {
    if (annotations == null) {
        annotations = new ArrayList<APDMetaObject>();
    }

    String previousContents = "";
    List<PDAnnotation> source = document.getAnnotations();
    Iterator<PDAnnotation> cursor = source.iterator();
    while (cursor.hasNext()) {
        APDMetaObject converted = getMetaObject(cursor.next(), previousContents);
        if (converted == null) {
            continue;
        }
        annotations.add(converted);
    }
}
/**
 * Runs formatted text extraction on a page.
 * <p>
 * Side effect: {@code uniqueHash} is overwritten with the hash reported by
 * the extractor after the page has been processed.
 *
 * @param page the page to interpret
 * @return the map produced by the extractor
 */
private TreeMap<PdfTextEntity, StringBuilder> tryTextExtraction(PDPage page) {
    CSFormatedTextExtractor textDevice = new CSFormatedTextExtractor();

    AffineTransform deviceTx = new AffineTransform();
    PDFGeometryTools.adjustTransform(deviceTx, page);
    textDevice.setDeviceTransform(deviceTx);

    new CSDeviceBasedInterpreter(null, textDevice)
            .process(page.getContentStream(), page.getResources());

    TreeMap<PdfTextEntity, StringBuilder> extracted = textDevice.getMap();
    uniqueHash = textDevice.getHash();
    return extracted;
}
/**
 * Closes the underlying PDF document. A failure to close is logged as a
 * warning and not propagated.
 */
@Override
public void dispose () {
    try {
        doc.close();
    } catch (IOException closeFailure) {
        logger.warn("Could not close PDDocument", closeFailure);
    }
}
/**
 * Closes the COS document and the document held by this object.
 * <p>
 * Both resources are always attempted: a failure while closing the first
 * one no longer prevents the second from being closed. A resource that was
 * closed successfully is set to {@code null}; one that failed to close is
 * kept, so a later call tries it again.
 *
 * @return {@code true} if nothing failed to close, {@code false} otherwise
 */
public boolean close() {
    synchronized (this) {
        boolean allClosed = true;

        if (cosDoc != null) {
            try {
                cosDoc.close();
                cosDoc = null;
            } catch (IOException e) {
                e.printStackTrace();
                allClosed = false;
            }
        }

        if (document != null) {
            try {
                document.close();
                document = null;
            } catch (IOException e) {
                e.printStackTrace();
                allClosed = false;
            }
        }

        return allClosed;
    }
}
}