/**
 * Processes a single AcroForm field and, for non-terminal fields, its children.
 * <p>
 * In order, this method: passes the field's additional actions (C, F, K and V) to
 * {@code handleDestinationOrAction} with the matching {@code ActionTrigger}; passes
 * every widget of the field to {@code handleWidget}; writes the field itself via
 * {@code addFieldString}; and, if the field is a {@code PDNonTerminalField}, processes
 * each child inside an {@code <ol>} element.
 *
 * @param field                 the form field to process
 * @param currentRecursiveDepth depth of {@code field} in the field tree; when it is at
 *                              or above {@code MAX_ACROFORM_RECURSIONS} the method
 *                              returns without emitting anything
 * @throws SAXException  propagated from the XHTML handler or the delegated helpers
 * @throws IOException   propagated from the delegated helpers
 * @throws TikaException propagated from the delegated helpers
 */
private void processAcroField(PDField field, final int currentRecursiveDepth)
        throws SAXException, IOException, TikaException {
    // Stop descending once the configured recursion cap has been reached.
    if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
        return;
    }

    PDFormFieldAdditionalActions fieldActions = field.getActions();
    if (fieldActions != null) {
        handleDestinationOrAction(fieldActions.getC(), ActionTrigger.FORM_FIELD_RECALCULATE);
        handleDestinationOrAction(fieldActions.getF(), ActionTrigger.FORM_FIELD_FORMATTED);
        handleDestinationOrAction(fieldActions.getK(), ActionTrigger.FORM_FIELD_KEYSTROKE);
        handleDestinationOrAction(fieldActions.getV(), ActionTrigger.FORM_FIELD_VALUE_CHANGE);
    }

    if (field.getWidgets() != null) {
        for (PDAnnotationWidget fieldWidget : field.getWidgets()) {
            handleWidget(fieldWidget);
        }
    }

    addFieldString(field);

    // Terminal fields have no children to descend into.
    if (!(field instanceof PDNonTerminalField)) {
        return;
    }

    final int childDepth = currentRecursiveDepth + 1;
    PDNonTerminalField parentField = (PDNonTerminalField) field;
    xhtml.startElement("ol");
    for (PDField childField : parentField.getChildren()) {
        processAcroField(childField, childDepth);
    }
    xhtml.endElement("ol");
}
String actionOrDestString = (action instanceof PDAction) ? "action" : "destination"; addNonNullAttribute("class", actionOrDestString, attributes); addNonNullAttribute("type", action.getClass().getSimpleName(), attributes); addNonNullAttribute("trigger", actionTrigger.name(), attributes); processDoc("", ((PDActionImportData)action).getFile(), attributes); } else if (action instanceof PDActionLaunch) { PDActionLaunch pdActionLaunch = (PDActionLaunch)action; addNonNullAttribute("id", pdActionLaunch.getF(), attributes); addNonNullAttribute("defaultDirectory", pdActionLaunch.getD(), attributes); addNonNullAttribute("operation", pdActionLaunch.getO(), attributes); addNonNullAttribute("parameters", pdActionLaunch.getP(), attributes); processDoc(pdActionLaunch.getF(), pdActionLaunch.getFile(), attributes); } else if (action instanceof PDActionRemoteGoTo) { PDActionRemoteGoTo remoteGoTo = (PDActionRemoteGoTo)action; processDoc("", remoteGoTo.getFile(), attributes); } else if (action instanceof PDActionJavaScript) { PDActionJavaScript jsAction = (PDActionJavaScript)action; addNonNullAttribute("class", "javascript", attributes); addNonNullAttribute("type", jsAction.getType(), attributes); addNonNullAttribute("subtype", jsAction.getSubType(), attributes); xhtml.startElement("div", attributes); xhtml.endElement("div");
AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "source", "source", "CDATA", "annotation"); extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, attributes); } catch (SAXException e) { throw new IOExceptionWithCause("file embedded in annotation sax exception", e); throw new IOExceptionWithCause("file embedded in annotation tika exception", e); } catch (IOException e) { handleCatchableIOE(e); handleWidget((PDAnnotationWidget)annotation); PDActionURI uri = getActionURI(annotation); if (uri != null) { String link = uri.getURI(); doOCROnCurrentPage(); handleDestinationOrAction(pageActions.getC(), ActionTrigger.PAGE_CLOSE); handleDestinationOrAction(pageActions.getO(), ActionTrigger.PAGE_OPEN);
extractBookmarkText(); extractEmbeddedDocuments(pdf); } catch (IOException e) { handleCatchableIOE(e); extractAcroForm(pdf); } catch (IOException e) { handleCatchableIOE(e); handleDestinationOrAction(additionalActions.getDP(), ActionTrigger.AFTER_DOCUMENT_PRINT); handleDestinationOrAction(additionalActions.getDS(), ActionTrigger.AFTER_DOCUMENT_SAVE); handleDestinationOrAction(additionalActions.getWC(), ActionTrigger.BEFORE_DOCUMENT_CLOSE); handleDestinationOrAction(additionalActions.getWP(), ActionTrigger.BEFORE_DOCUMENT_PRINT); handleDestinationOrAction(additionalActions.getWS(), ActionTrigger.BEFORE_DOCUMENT_SAVE); xhtml.endDocument(); } catch (TikaException e) {
void extractBookmarkText(PDOutlineNode bookmark) throws SAXException, IOException, TikaException { PDOutlineItem current = bookmark.getFirstChild(); if (current != null) { xhtml.startElement("ul"); while (current != null) { xhtml.startElement("li"); xhtml.characters(current.getTitle()); xhtml.endElement("li"); handleDestinationOrAction(current.getAction(), ActionTrigger.BOOKMARK); // Recurse: extractBookmarkText(current); current = current.getNextSibling(); } xhtml.endElement("ul"); } }
/**
 * Writes the document's outline (bookmarks), if it has one.
 * <p>
 * Looks up the outline on the document catalog and hands it to
 * {@code extractBookmarkText(PDOutlineNode)}; does nothing when the catalog reports no
 * outline.
 *
 * @throws SAXException  propagated from the outline traversal
 * @throws IOException   propagated from the outline traversal
 * @throws TikaException propagated from the outline traversal
 */
void extractBookmarkText() throws SAXException, IOException, TikaException {
    PDDocumentOutline rootOutline = document.getDocumentCatalog().getDocumentOutline();
    if (rootOutline == null) {
        return;
    }
    extractBookmarkText(rootOutline);
}
/**
 * Handles a file specification referenced from the PDF.
 * <ul>
 *   <li>Simple specification: adds {@code class="linked"} and an {@code id} holding the
 *       specification's file name to {@code attributes}, then writes an empty
 *       {@code <div>} carrying those attributes.</li>
 *   <li>Complex specification: adds {@code source="attachment"} unless a {@code source}
 *       attribute is already present, then delegates to
 *       {@code extractMultiOSPDEmbeddedFiles}.</li>
 *   <li>Anything else, including {@code null}: no action.</li>
 * </ul>
 * The supplied {@code attributes} object is modified in place.
 *
 * @param name       display name forwarded to the embedded-file extraction
 * @param spec       the file specification to handle; may be {@code null}
 * @param attributes attributes to extend and attach to the emitted output
 * @throws TikaException propagated from the embedded-file extraction
 * @throws SAXException  propagated from the XHTML handler or the extraction
 * @throws IOException   propagated from the embedded-file extraction
 */
private void processDoc(String name, PDFileSpecification spec, AttributesImpl attributes)
        throws TikaException, SAXException, IOException {
    if (spec instanceof PDSimpleFileSpecification) {
        // NOTE(review): AttributesImpl.addAttribute does not check for existing names,
        // so a caller that already set "class" or "id" would end up with duplicates
        // on the emitted <div> -- confirm against the callers.
        attributes.addAttribute("", "class", "class", "CDATA", "linked");
        String linkedFileName = spec.getFile();
        attributes.addAttribute("", "id", "id", "CDATA", linkedFileName);
        xhtml.startElement("div", attributes);
        xhtml.endElement("div");
        return;
    }

    if (!(spec instanceof PDComplexFileSpecification)) {
        return;
    }

    // Only supply the default source when the caller has not set one.
    boolean sourceMissing = attributes.getIndex("source") < 0;
    if (sourceMissing) {
        attributes.addAttribute("", "source", "source", "CDATA", "attachment");
    }
    PDComplexFileSpecification complexSpec = (PDComplexFileSpecification) spec;
    extractMultiOSPDEmbeddedFiles(name, complexSpec, attributes);
}
private void extractMultiOSPDEmbeddedFiles(String displayName, PDComplexFileSpecification spec, AttributesImpl attributes) throws IOException, SAXException, TikaException { if (spec == null) { return; } //current strategy is to pull all, not just first non-null extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFile(), spec.getEmbeddedFile(), attributes); extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileMac(), spec.getEmbeddedFileMac(), attributes); extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileDos(), spec.getEmbeddedFileDos(), attributes); extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileUnix(), spec.getEmbeddedFileUnix(), attributes); }
/**
 * Finishes the current page: closes the open paragraph, extracts the page's images and
 * then delegates to the superclass.
 * <p>
 * Error handling is layered:
 * <ul>
 *   <li>an {@code IOException} from image extraction is passed to
 *       {@code handleCatchableIOE} and, if that returns, page completion continues;</li>
 *   <li>any other {@code IOException} raised in the body is added to
 *       {@code exceptions} instead of being thrown;</li>
 *   <li>a {@code SAXException} is rethrown wrapped in an {@code IOException}.</li>
 * </ul>
 *
 * @param page the page that has just been processed
 * @throws IOException if a {@code SAXException} occurs while ending the page
 */
@Override
protected void endPage(PDPage page) throws IOException {
    try {
        writeParagraphEnd();

        // Fresh, empty set of COS objects handed to the image extraction for this page.
        HashSet<COSBase> seenThisPage = new HashSet<COSBase>();
        try {
            extractImages(page.getResources(), seenThisPage);
        } catch (IOException imageFailure) {
            handleCatchableIOE(imageFailure);
        }

        super.endPage(page);
    } catch (SAXException saxFailure) {
        throw new IOException("Unable to end a page", saxFailure);
    } catch (IOException pageFailure) {
        // Recorded rather than propagated.
        exceptions.add(pageFailure);
    }
}
handleCatchableIOE(e); } catch (SAXException e) { throw new IOExceptionWithCause("error writing OCR content from PDF", e);
try { extractBookmarkText(); try { extractEmbeddedDocuments(pdf); } catch (IOException e) { handleCatchableIOE(e); extractAcroForm(pdf); } catch (IOException e) { handleCatchableIOE(e); handleDestinationOrAction(additionalActions.getDP(), ActionTrigger.AFTER_DOCUMENT_PRINT); handleDestinationOrAction(additionalActions.getDS(), ActionTrigger.AFTER_DOCUMENT_SAVE); handleDestinationOrAction(additionalActions.getWC(), ActionTrigger.BEFORE_DOCUMENT_CLOSE); handleDestinationOrAction(additionalActions.getWP(), ActionTrigger.BEFORE_DOCUMENT_PRINT); handleDestinationOrAction(additionalActions.getWS(), ActionTrigger.BEFORE_DOCUMENT_SAVE); xhtml.endDocument(); } catch (TikaException e) {
void extractBookmarkText(PDOutlineNode bookmark) throws SAXException, IOException, TikaException { PDOutlineItem current = bookmark.getFirstChild(); if (current != null) { xhtml.startElement("ul"); while (current != null) { xhtml.startElement("li"); xhtml.characters(current.getTitle()); xhtml.endElement("li"); handleDestinationOrAction(current.getAction(), ActionTrigger.BOOKMARK); // Recurse: extractBookmarkText(current); current = current.getNextSibling(); } xhtml.endElement("ul"); } }
/**
 * Writes the document's outline (bookmarks), if it has one.
 * <p>
 * Looks up the outline on the document catalog and hands it to
 * {@code extractBookmarkText(PDOutlineNode)}; does nothing when the catalog reports no
 * outline.
 *
 * @throws SAXException  propagated from the outline traversal
 * @throws IOException   propagated from the outline traversal
 * @throws TikaException propagated from the outline traversal
 */
void extractBookmarkText() throws SAXException, IOException, TikaException {
    PDDocumentOutline rootOutline = document.getDocumentCatalog().getDocumentOutline();
    if (rootOutline == null) {
        return;
    }
    extractBookmarkText(rootOutline);
}
/**
 * Handles a file specification referenced from the PDF.
 * <ul>
 *   <li>Simple specification: adds {@code class="linked"} and an {@code id} holding the
 *       specification's file name to {@code attributes}, then writes an empty
 *       {@code <div>} carrying those attributes.</li>
 *   <li>Complex specification: adds {@code source="attachment"} unless a {@code source}
 *       attribute is already present, then delegates to
 *       {@code extractMultiOSPDEmbeddedFiles}.</li>
 *   <li>Anything else, including {@code null}: no action.</li>
 * </ul>
 * The supplied {@code attributes} object is modified in place.
 *
 * @param name       display name forwarded to the embedded-file extraction
 * @param spec       the file specification to handle; may be {@code null}
 * @param attributes attributes to extend and attach to the emitted output
 * @throws TikaException propagated from the embedded-file extraction
 * @throws SAXException  propagated from the XHTML handler or the extraction
 * @throws IOException   propagated from the embedded-file extraction
 */
private void processDoc(String name, PDFileSpecification spec, AttributesImpl attributes)
        throws TikaException, SAXException, IOException {
    if (spec instanceof PDSimpleFileSpecification) {
        // NOTE(review): AttributesImpl.addAttribute does not check for existing names,
        // so a caller that already set "class" or "id" would end up with duplicates
        // on the emitted <div> -- confirm against the callers.
        attributes.addAttribute("", "class", "class", "CDATA", "linked");
        String linkedFileName = spec.getFile();
        attributes.addAttribute("", "id", "id", "CDATA", linkedFileName);
        xhtml.startElement("div", attributes);
        xhtml.endElement("div");
        return;
    }

    if (!(spec instanceof PDComplexFileSpecification)) {
        return;
    }

    // Only supply the default source when the caller has not set one.
    boolean sourceMissing = attributes.getIndex("source") < 0;
    if (sourceMissing) {
        attributes.addAttribute("", "source", "source", "CDATA", "attachment");
    }
    PDComplexFileSpecification complexSpec = (PDComplexFileSpecification) spec;
    extractMultiOSPDEmbeddedFiles(name, complexSpec, attributes);
}
private void extractMultiOSPDEmbeddedFiles(String displayName, PDComplexFileSpecification spec, AttributesImpl attributes) throws IOException, SAXException, TikaException { if (spec == null) { return; } //current strategy is to pull all, not just first non-null extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFile(), spec.getEmbeddedFile(), attributes); extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileMac(), spec.getEmbeddedFileMac(), attributes); extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileDos(), spec.getEmbeddedFileDos(), attributes); extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileUnix(), spec.getEmbeddedFileUnix(), attributes); }
/**
 * Finishes the current page: closes the open paragraph, extracts the page's images and
 * then delegates to the superclass.
 * <p>
 * Error handling is layered:
 * <ul>
 *   <li>an {@code IOException} from image extraction is passed to
 *       {@code handleCatchableIOE} and, if that returns, page completion continues;</li>
 *   <li>any other {@code IOException} raised in the body is added to
 *       {@code exceptions} instead of being thrown;</li>
 *   <li>a {@code SAXException} is rethrown wrapped in an {@code IOException}.</li>
 * </ul>
 *
 * @param page the page that has just been processed
 * @throws IOException if a {@code SAXException} occurs while ending the page
 */
@Override
protected void endPage(PDPage page) throws IOException {
    try {
        writeParagraphEnd();

        // Fresh, empty set of COS objects handed to the image extraction for this page.
        HashSet<COSBase> seenThisPage = new HashSet<COSBase>();
        try {
            extractImages(page.getResources(), seenThisPage);
        } catch (IOException imageFailure) {
            handleCatchableIOE(imageFailure);
        }

        super.endPage(page);
    } catch (SAXException saxFailure) {
        throw new IOException("Unable to end a page", saxFailure);
    } catch (IOException pageFailure) {
        // Recorded rather than propagated.
        exceptions.add(pageFailure);
    }
}
handleCatchableIOE(e); } catch (SAXException e) { throw new IOExceptionWithCause("error writing OCR content from PDF", e);
AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "source", "source", "CDATA", "annotation"); extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, attributes); } catch (SAXException e) { throw new IOExceptionWithCause("file embedded in annotation sax exception", e); throw new IOExceptionWithCause("file embedded in annotation tika exception", e); } catch (IOException e) { handleCatchableIOE(e); handleWidget((PDAnnotationWidget)annotation); PDActionURI uri = getActionURI(annotation); if (uri != null) { String link = uri.getURI(); doOCROnCurrentPage(); handleDestinationOrAction(pageActions.getC(), ActionTrigger.PAGE_CLOSE); handleDestinationOrAction(pageActions.getO(), ActionTrigger.PAGE_OPEN);
extractBookmarkText(); extractEmbeddedDocuments(pdf); } catch (IOException e) { handleCatchableIOE(e); extractAcroForm(pdf); } catch (IOException e) { handleCatchableIOE(e); handleDestinationOrAction(additionalActions.getDP(), ActionTrigger.AFTER_DOCUMENT_PRINT); handleDestinationOrAction(additionalActions.getDS(), ActionTrigger.AFTER_DOCUMENT_SAVE); handleDestinationOrAction(additionalActions.getWC(), ActionTrigger.BEFORE_DOCUMENT_CLOSE); handleDestinationOrAction(additionalActions.getWP(), ActionTrigger.BEFORE_DOCUMENT_PRINT); handleDestinationOrAction(additionalActions.getWS(), ActionTrigger.BEFORE_DOCUMENT_SAVE); xhtml.endDocument(); } catch (TikaException e) {
/**
 * Processes a single AcroForm field and, for non-terminal fields, its children.
 * <p>
 * In order, this method: passes the field's additional actions (C, F, K and V) to
 * {@code handleDestinationOrAction} with the matching {@code ActionTrigger}; passes
 * every widget of the field to {@code handleWidget}; writes the field itself via
 * {@code addFieldString}; and, if the field is a {@code PDNonTerminalField}, processes
 * each child inside an {@code <ol>} element.
 *
 * @param field                 the form field to process
 * @param currentRecursiveDepth depth of {@code field} in the field tree; when it is at
 *                              or above {@code MAX_ACROFORM_RECURSIONS} the method
 *                              returns without emitting anything
 * @throws SAXException  propagated from the XHTML handler or the delegated helpers
 * @throws IOException   propagated from the delegated helpers
 * @throws TikaException propagated from the delegated helpers
 */
private void processAcroField(PDField field, final int currentRecursiveDepth)
        throws SAXException, IOException, TikaException {
    // Stop descending once the configured recursion cap has been reached.
    if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
        return;
    }

    PDFormFieldAdditionalActions fieldActions = field.getActions();
    if (fieldActions != null) {
        handleDestinationOrAction(fieldActions.getC(), ActionTrigger.FORM_FIELD_RECALCULATE);
        handleDestinationOrAction(fieldActions.getF(), ActionTrigger.FORM_FIELD_FORMATTED);
        handleDestinationOrAction(fieldActions.getK(), ActionTrigger.FORM_FIELD_KEYSTROKE);
        handleDestinationOrAction(fieldActions.getV(), ActionTrigger.FORM_FIELD_VALUE_CHANGE);
    }

    if (field.getWidgets() != null) {
        for (PDAnnotationWidget fieldWidget : field.getWidgets()) {
            handleWidget(fieldWidget);
        }
    }

    addFieldString(field);

    // Terminal fields have no children to descend into.
    if (!(field instanceof PDNonTerminalField)) {
        return;
    }

    final int childDepth = currentRecursiveDepth + 1;
    PDNonTerminalField parentField = (PDNonTerminalField) field;
    xhtml.startElement("ol");
    for (PDField childField : parentField.getChildren()) {
        processAcroField(childField, childDepth);
    }
    xhtml.endElement("ol");
}