/**
 * Closes out a page: ends the open paragraph, extracts the images reachable
 * from the page's resources, and then delegates to the superclass.
 *
 * <p>An {@link IOException} raised while extracting images is handed to
 * {@code handleCatchableIOE}. Any {@link IOException} that reaches the outer
 * handler is appended to {@code exceptions} instead of being propagated.
 *
 * @param page the page that has just been processed
 * @throws IOException wrapping a {@link SAXException} raised while ending the page
 */
@Override
protected void endPage(PDPage page) throws IOException {
    try {
        writeParagraphEnd();

        // Each page starts with a fresh, empty set of already-seen objects.
        HashSet<COSBase> seenThisPage = new HashSet<COSBase>();
        try {
            extractImages(page.getResources(), seenThisPage);
        } catch (IOException imageFailure) {
            handleCatchableIOE(imageFailure);
        }

        super.endPage(page);
    } catch (SAXException saxFailure) {
        throw new IOException("Unable to end a page", saxFailure);
    } catch (IOException ioFailure) {
        // Recorded rather than rethrown, so the caller is not interrupted here.
        exceptions.add(ioFailure);
    }
}
/**
 * Emits the configured word separator as character data on the XHTML output.
 *
 * @throws IOException wrapping any {@link SAXException} raised by the XHTML handler
 */
@Override
protected void writeWordSeparator() throws IOException {
    try {
        String separator = getWordSeparator();
        xhtml.characters(separator);
    } catch (SAXException saxFailure) {
        throw new IOException(
                "Unable to write a space character", saxFailure);
    }
}
/**
 * Copies this configuration's text-extraction settings onto the given converter.
 *
 * <p>The word separator becomes a single space when auto-space is enabled and
 * the empty string otherwise. The average-character and spacing tolerances are
 * only applied when this configuration holds a non-null value for them.
 *
 * @param pdf2XHTML the converter instance to configure
 */
public void configure(PDF2XHTMLPureJava pdf2XHTML) {
    pdf2XHTML.setSortByPosition(getSortByPosition());

    pdf2XHTML.setWordSeparator(getEnableAutoSpace() ? " " : "");

    // Tolerances are optional: leave the converter's own values alone when unset.
    if (null != getAverageCharTolerance()) {
        pdf2XHTML.setAverageCharTolerance(getAverageCharTolerance());
    }
    if (null != getSpacingTolerance()) {
        pdf2XHTML.setSpacingTolerance(getSpacingTolerance());
    }

    pdf2XHTML.setSuppressDuplicateOverlappingText(
            getSuppressDuplicateOverlappingText());
}
extractImages(((PDFormXObject) object).getResources(), seenThisPage); } else if (object instanceof PDImageXObject) { extension = getJBIG2Suffix(image); writeToBuffer(image, extension, buffer); } catch (IOException e) { EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); handleCatchableIOE(e);
pdf2XHTML = new PDF2XHTMLPureJava(document, handler, context, metadata, config); config.configure(pdf2XHTML); pdf2XHTML.writeText(document, new Writer() { @Override public void write(char[] cbuf, int off, int len) {
/**
 * Processes a single page via the superclass, routing any {@link IOException}
 * it raises through {@code handleCatchableIOE} instead of propagating it directly.
 *
 * @param page the page to process
 * @throws IOException presumably when {@code handleCatchableIOE} decides to rethrow
 *         (that helper is not visible here; confirm)
 * @throws PdfTimeoutException declared by this override; not caught here
 */
@Override
public void processPage(PDPage page) throws IOException, PdfTimeoutException {
    try {
        super.processPage(page);
    } catch (IOException pageFailure) {
        handleCatchableIOE(pageFailure);
    }
}
metadata.add("X-Parsed-By", "org.apache.tika.parser.ocr.TesseractOCRParser"); PDF2XHTMLPureJava.process(pdfDocument, handler, context, metadata, localConfig);