public class PDFReader{ public static void main(String args[]) { PDFTextStripper pdfStripper = null; PDDocument pdDoc = null; COSDocument cosDoc = null; File file = new File("C:/my.pdf"); try { PDFParser parser = new PDFParser(new FileInputStream(file)); parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); pdfStripper.setStartPage(1); pdfStripper.setEndPage(5); String parsedText = pdfStripper.getText(pdDoc); System.out.println(parsedText); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }
/**
 * Reads the PDF behind {@code url} with Tika's {@code PDFParser} and returns
 * the metadata collected during parsing. The extracted body text is written
 * to a throw-away {@code BodyContentHandler} and discarded.
 *
 * @param url location of the PDF document
 * @return the metadata object populated by the parser
 * @throws IOException if the URL cannot be opened or read
 * @throws SAXException if the content handler reports a failure
 * @throws TikaException if the document cannot be parsed
 */
public static Metadata getMet(URL url) throws IOException, SAXException, TikaException {
    Metadata met = new Metadata();
    PDFParser parser = new PDFParser();
    // Parser.parse consumes the stream but leaves it open, so the connection
    // opened here has to be closed here as well.
    try (java.io.InputStream in = url.openStream()) {
        parser.parse(in, new BodyContentHandler(), met, new ParseContext());
    }
    return met;
}
/**
 * Spools {@code stream} to a file, hands that file's path to a
 * {@code GrobidRESTParser}, and then runs Tika's {@code PDFParser} over the
 * same file. Both parsers receive the same handler, metadata and context.
 *
 * @param stream   the document to parse; it is read but not closed here
 * @param handler  receives the content events of both parsers
 * @param metadata populated by both parsers
 * @param context  parse context passed through to both parsers
 * @throws IOException if spooling or reading the file fails
 * @throws SAXException if the handler reports a failure
 * @throws TikaException if either parser fails
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    // Closing the TemporaryResources removes any temporary file that was
    // created while spooling the stream to disk.
    try (TemporaryResources tmp = new TemporaryResources()) {
        TikaInputStream tis = TikaInputStream.get(stream, tmp);
        File tmpFile = tis.getFile();

        GrobidRESTParser grobidParser = new GrobidRESTParser();
        grobidParser.parse(tmpFile.getAbsolutePath(), handler, metadata, context);

        PDFParser parser = new PDFParser();
        // The parser does not close the stream it is given.
        try (InputStream pdfInput = new FileInputStream(tmpFile)) {
            parser.parse(pdfInput, handler, metadata, context);
        }
    }
}
}
try { parser = new PDFParser(new FileInputStream(file)); parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper();
/**
 * Downloads a sample PDF over HTTP, extracts its text and asserts that a
 * known sentence is present.
 *
 * @throws Exception if the download, the parse or the text extraction fails
 */
public void ReadPDF() throws Exception {
    URL testUrl = new URL("http://www.axmag.com/download/pdfurl-guide.pdf");
    try (BufferedInputStream pdfStream = new BufferedInputStream(testUrl.openStream())) {
        PDFParser pdfParser = new PDFParser(pdfStream);
        pdfParser.parse();
        try {
            String text = new PDFTextStripper().getText(pdfParser.getPDDocument());
            Assert.assertTrue(text.contains("Open the setting.xml, you can see it is like this"));
        } finally {
            // Release the parsed document even when the assertion fails.
            pdfParser.getDocument().close();
        }
    }
}
// Opens somefile.pdf, points the parser at a temp directory before parsing,
// then parses and fetches the resulting PDDocument.
// tempDirectoryPath is defined outside this snippet.
// NOTE(review): nothing visible here closes `fin` or `document` - confirm the surrounding code does.
File in = new File("somefile.pdf"); InputStream fin = new FileInputStream(in); PDFParser parser = new PDFParser(fin); parser.setTempDirectory(new File(tempDirectoryPath)); parser.parse(); PDDocument document = parser.getPDDocument();
// Parse the file at resourceLocation with Tika's PDFParser, then print the
// title, author and extracted body text to `out`.
InputStream input = new FileInputStream(new File(resourceLocation));
ContentHandler textHandler = new BodyContentHandler();
Metadata metadata = new Metadata();
PDFParser parser = new PDFParser();
try {
    parser.parse(input, textHandler, metadata);
} finally {
    // Close the file even when parsing fails; the parser leaves it open.
    input.close();
}
out.println("Title: " + metadata.get("title"));
out.println("Author: " + metadata.get("Author"));
out.println("content: " + textHandler.toString());
// Parse the file at resourceLocation with Tika's PDFParser, then print the
// title, author and extracted body text to `out`.
InputStream input = new FileInputStream(new File(resourceLocation));
ContentHandler textHandler = new BodyContentHandler();
Metadata metadata = new Metadata();
PDFParser parser = new PDFParser();
try {
    parser.parse(input, textHandler, metadata);
} finally {
    // Close the file even when parsing fails; the parser leaves it open.
    input.close();
}
out.println("Title: " + metadata.get("title"));
out.println("Author: " + metadata.get("Author"));
out.println("content: " + textHandler.toString());
import java.io.InputStream;
import java.net.URL;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;

/**
 * Command-line example that downloads a PDF from a fixed URL and prints the
 * text Tika extracts from it.
 */
public class URLReader {

    /**
     * Opens the URL, parses the PDF and prints the extracted body text.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the download or the parse fails
     */
    public static void main(String[] args) throws Exception {
        URL url = new URL("http://website.com/document.pdf");
        ContentHandler contenthandler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        PDFParser pdfparser = new PDFParser();
        // The stream has to be opened from the URL before it can be parsed,
        // and the parser leaves it open, so it is closed here.
        try (InputStream is = url.openStream()) {
            pdfparser.parse(is, contenthandler, metadata, new ParseContext());
        }
        System.out.println(contenthandler.toString());
    }
}
/**
 * Parses {@code pdfFile} and, when the document is encrypted, opens its
 * protection with a fixed password. The document is closed before returning
 * because no reference to it leaves this method.
 *
 * @param pdfFile the PDF file to open
 * @throws Exception if the file cannot be read, parsed or decrypted
 */
private void openPDFDoc(final File pdfFile) throws Exception {
    try (BufferedInputStream input = new BufferedInputStream(new FileInputStream(pdfFile))) {
        PDFParser parser = new PDFParser(input);
        parser.parse();
        PDDocument originalPdfDoc = parser.getPDDocument();
        try {
            if (originalPdfDoc.isEncrypted()) {
                originalPdfDoc.openProtection(new StandardDecryptionMaterial("password"));
            }
        } finally {
            // Release the parsed document even when decryption fails.
            originalPdfDoc.close();
        }
    }
}
/**
 * Spools {@code stream} to a file, hands that file's path to a
 * {@code GrobidRESTParser}, and then runs Tika's {@code PDFParser} over the
 * same file. Both parsers receive the same handler, metadata and context.
 *
 * @param stream   the document to parse; it is read but not closed here
 * @param handler  receives the content events of both parsers
 * @param metadata populated by both parsers
 * @param context  parse context passed through to both parsers
 * @throws IOException if spooling or reading the file fails
 * @throws SAXException if the handler reports a failure
 * @throws TikaException if either parser fails
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    // Closing the TemporaryResources removes any temporary file that was
    // created while spooling the stream to disk.
    try (TemporaryResources tmp = new TemporaryResources()) {
        TikaInputStream tis = TikaInputStream.get(stream, tmp);
        File tmpFile = tis.getFile();

        GrobidRESTParser grobidParser = new GrobidRESTParser();
        grobidParser.parse(tmpFile.getAbsolutePath(), handler, metadata, context);

        PDFParser parser = new PDFParser();
        // The parser does not close the stream it is given.
        try (InputStream pdfInput = new FileInputStream(tmpFile)) {
            parser.parse(pdfInput, handler, metadata, context);
        }
    }
}
}
/**
 * Spools {@code stream} to a file, hands that file's path to a
 * {@code GrobidRESTParser}, and then runs Tika's {@code PDFParser} over the
 * same file. Both parsers receive the same handler, metadata and context.
 *
 * @param stream   the document to parse; it is read but not closed here
 * @param handler  receives the content events of both parsers
 * @param metadata populated by both parsers
 * @param context  parse context passed through to both parsers
 * @throws IOException if spooling or reading the file fails
 * @throws SAXException if the handler reports a failure
 * @throws TikaException if either parser fails
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    // Closing the TemporaryResources removes any temporary file that was
    // created while spooling the stream to disk.
    try (TemporaryResources tmp = new TemporaryResources()) {
        TikaInputStream tis = TikaInputStream.get(stream, tmp);
        File tmpFile = tis.getFile();

        GrobidRESTParser grobidParser = new GrobidRESTParser();
        grobidParser.parse(tmpFile.getAbsolutePath(), handler, metadata, context);

        PDFParser parser = new PDFParser();
        // The parser does not close the stream it is given.
        try (InputStream pdfInput = new FileInputStream(tmpFile)) {
            parser.parse(pdfInput, handler, metadata, context);
        }
    }
}
}
// Parses a fixed PDF file and prints its text to standard output.
// The stripper is created with "UTF-8", position sorting is switched off,
// and "###" is set as the word separator.
// NOTE(review): nothing visible here closes the input stream or pdDocument - confirm the surrounding code does.
PDFParser pdfParser = new PDFParser(new FileInputStream("c:\\temp\\owgr49f2013.pdf")); pdfParser.parse(); PDDocument pdDocument = pdfParser.getPDDocument(); PDFTextStripper stripper = new PDFTextStripper("UTF-8"); stripper.setSortByPosition(false); stripper.setWordSeparator("###"); System.out.println(stripper.getText(pdDocument));
// Runs the parse, stores the document returned by parser.getDocument() in
// cosDoc, and creates a new PDFTextStripper.
// parser, cosDoc and pdfStripper are declared outside this snippet.
parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper();
/**
 * Parses the stream with the shared {@code pdfParser} (position sorting
 * switched on) and turns the handler's per-key content into {@code Page}
 * objects, one per map entry, in the map's iteration order.
 *
 * @param fs             the document to parse
 * @param filterPatterns patterns handed to the {@code PageContentHandler}
 * @return one {@code Page} per entry of the handler's improved content
 * @throws Exception if parsing fails
 */
@Override
public List<Page> parse(InputStream fs, List<String> filterPatterns) throws Exception {
    List<Page> pages = new ArrayList<>();
    PageContentHandler handler = new PageContentHandler(filterPatterns);
    Metadata metadata = new Metadata();
    pdfParser.setSortByPosition(true);
    pdfParser.parse(fs, handler, metadata, new ParseContext());

    Map<Integer, List<String>> content = handler.getImprovedContent();
    // Iterate entries directly instead of looking each key up again.
    for (Map.Entry<Integer, List<String>> entry : content.entrySet()) {
        Page page = new Page(entry.getKey());
        for (String paragraph : entry.getValue()) {
            page.getParagraphs().add(paragraph);
        }
        pages.add(page);
    }
    return pages;
}
}
// Runs the parse, stores the document returned by parser.getDocument() in
// cosDoc, and creates a new PDFTextStripper.
// parser, cosDoc and pdfStripper are declared outside this snippet.
parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper();
// Runs the parse, stores the document returned by parser.getDocument() in
// cosDoc, and creates a new PDFTextStripper.
// parser, cosDoc and pdfStripper are declared outside this snippet.
parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper();
// Runs the parser; `parser` is declared outside this snippet.
parser.parse();
public static String pdftoText(File file) { PDFParser parser=null; String parsedText = null; PDFTextStripper pdfStripper = null; PDDocument pdDoc = null; COSDocument cosDoc = null; try { parser = new PDFParser(new FileInputStream(file)); parser.parse(); cosDoc = parser.getDocument(); pdfStripper = new PDFTextStripper(); pdDoc = new PDDocument(cosDoc); parsedText = pdfStripper.getText(pdDoc); } catch (Exception e) { // handle exception } finally { try { if (cosDoc != null) cosDoc.close(); if (pdDoc != null) pdDoc.close(); } catch (Exception e) { // handle excpetion } } return parsedText; }
public void testFopMacroLibrary() throws Exception { String screentextUrl = screenUrl.concat("Fop"); HttpClient http = initHttpClient(); http.setUrl(screentextUrl.concat(authentificationQuery)); //FIXME need to check if the stream is an application-pdf that don't contains ftl stack trace InputStream screenInputStream = http.postStream(); assertNotNull("Response failed from ofbiz", screenInputStream); assertEquals("Response contentType isn't good : " + http.getResponseContentType(), "application/pdf;charset=UTF-8", http.getResponseContentType()); String screenOutString = ""; try { BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE); Metadata metadata = new Metadata(); new PDFParser().parse(screenInputStream, handler, metadata, new ParseContext()); screenOutString = handler.toString(); } finally { screenInputStream.close(); } //Test if a ftl macro error is present assertFalse("Fop Screen contains Macro on error : see " + screentextUrl + " for more detail", screenOutString.contains("FreeMarker template error:")); } }