org.apache.tika.parser.pdf.PDFParser.parse Java code examples

 /**
  * Command-line example that prints the text of the first five pages of a PDF
  * file, using {@code PDFParser} to load it and {@code PDFTextStripper} to
  * extract the text.
  */
 public class PDFReader {
  /**
   * Parses {@code C:/my.pdf}, extracts the text of pages 1 to 5 and writes it
   * to standard output. An {@link IOException} is reported by printing its
   * stack trace.
   *
   * @param args command-line arguments (not used)
   */
  public static void main(String[] args) {
    File file = new File("C:/my.pdf");
    PDDocument pdDoc = null;
    // try-with-resources guarantees the file handle is released on every path.
    try (FileInputStream in = new FileInputStream(file)) {
      PDFParser parser = new PDFParser(in);
      parser.parse();
      COSDocument cosDoc = parser.getDocument();
      // Wrap the COS document immediately so the finally block can close it
      // even if creating the stripper or extracting the text fails.
      pdDoc = new PDDocument(cosDoc);
      PDFTextStripper pdfStripper = new PDFTextStripper();
      pdfStripper.setStartPage(1);
      pdfStripper.setEndPage(5);
      String parsedText = pdfStripper.getText(pdDoc);
      System.out.println(parsedText);
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      // The document was never closed before; close it once the text is printed.
      if (pdDoc != null) {
        try {
          pdDoc.close();
        } catch (IOException e) {
          e.printStackTrace();
        }
      }
    }
  }
}

/**
 * Parses the PDF served at {@code url} and returns the metadata collected by
 * the parser. The extracted body text is sent to a throw-away
 * {@code BodyContentHandler}.
 *
 * @param url location of the PDF document
 * @return the metadata populated during parsing
 * @throws IOException if the URL cannot be opened or read
 * @throws SAXException if the content handler reports an error
 * @throws TikaException if the document cannot be parsed
 */
public static Metadata getMet(URL url) throws IOException, SAXException,
    TikaException {
  Metadata met = new Metadata();
  PDFParser parser = new PDFParser();
  // Close the network stream on every path; it was previously left open.
  try (java.io.InputStream in = url.openStream()) {
    parser.parse(in, new BodyContentHandler(), met, new ParseContext());
  }
  return met;
}

 /**
  * Parses the document twice into the same handler and metadata: first with
  * {@code GrobidRESTParser}, which is given the path of a file obtained from
  * the stream, and then with {@code PDFParser}, which reads that same file.
  *
  * @param stream the document to parse
  * @param handler receives the content events from both parsers
  * @param metadata receives the metadata from both parsers
  * @param context parse context passed to both parsers
  * @throws IOException if the file cannot be obtained or read
  * @throws SAXException if the handler reports an error
  * @throws TikaException if either parser fails
  */
 public void parse(InputStream stream, ContentHandler handler,
   Metadata metadata, ParseContext context) throws IOException,
   SAXException, TikaException {
  // Closing the TemporaryResources on exit releases anything TikaInputStream
  // registered with it while producing the file; it was never released before.
  try (TemporaryResources tmp = new TemporaryResources()) {
   TikaInputStream tis = TikaInputStream.get(stream, tmp);
   File tmpFile = tis.getFile();

   GrobidRESTParser grobidParser = new GrobidRESTParser();
   grobidParser.parse(tmpFile.getAbsolutePath(), handler, metadata, context);

   PDFParser parser = new PDFParser();
   // The second pass needs its own stream on the file; close it when done.
   try (FileInputStream fileStream = new FileInputStream(tmpFile)) {
    parser.parse(fileStream, handler, metadata, context);
   }
  }
 }
}

try {
  parser = new PDFParser(new FileInputStream(file));
  parser.parse();
  cosDoc = parser.getDocument();
  pdfStripper = new PDFTextStripper();

/**
 * Downloads a sample PDF, extracts its text and asserts that the text contains
 * the expected sentence.
 *
 * @throws Exception if the download, parsing or text extraction fails
 */
public void ReadPDF() throws Exception {
 URL testUrl = new URL("http://www.axmag.com/download/pdfurl-guide.pdf");
 // try-with-resources releases the network stream on every path.
 try (BufferedInputStream pdfStream = new BufferedInputStream(testUrl.openStream())) {
  PDFParser parser = new PDFParser(pdfStream);
  parser.parse();
  PDDocument document = parser.getPDDocument();
  try {
   String text = new PDFTextStripper().getText(document);
   Assert.assertTrue(text.contains("Open the setting.xml, you can see it is like this"));
  } finally {
   // Close the document even when the assertion fails.
   document.close();
  }
 }
}

 // Parse "somefile.pdf" through the stream-based PDFParser constructor.
 File in = new File("somefile.pdf");
InputStream fin = new FileInputStream(in);
PDFParser parser = new PDFParser(fin);
// Point the parser at tempDirectoryPath (defined outside this snippet) before parsing.
parser.setTempDirectory(new File(tempDirectoryPath));
parser.parse();
PDDocument document = parser.getPDDocument();
// NOTE(review): neither fin nor document is closed within this snippet — confirm the surrounding code closes them.

 // Extract the body text and metadata of the PDF at resourceLocation, then print them.
 InputStream input = new FileInputStream(new File(resourceLocation));
ContentHandler textHandler = new BodyContentHandler();
Metadata metadata = new Metadata();
PDFParser parser = new PDFParser();
try {
  parser.parse(input, textHandler, metadata);
} finally {
  // Close in finally so the file handle is released even when parse() throws.
  input.close();
}
out.println("Title: " + metadata.get("title"));
out.println("Author: " + metadata.get("Author"));
out.println("content: " + textHandler.toString());

 // Extract the body text and metadata of the PDF at resourceLocation, then print them.
 InputStream input = new FileInputStream(new File(resourceLocation));
ContentHandler textHandler = new BodyContentHandler();
Metadata metadata = new Metadata();
PDFParser parser = new PDFParser();
try {
  parser.parse(input, textHandler, metadata);
} finally {
  // Close in finally so the file handle is released even when parse() throws.
  input.close();
}
out.println("Title: " + metadata.get("title"));
out.println("Author: " + metadata.get("Author"));
out.println("content: " + textHandler.toString());

 import java.io.InputStream;
import java.net.URL;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;

/**
 * Example that downloads a PDF from a fixed URL and prints the text collected
 * by a {@code BodyContentHandler}.
 */
public class URLReader {
  /**
   * Opens the URL, parses the response with {@code PDFParser} and writes the
   * handler's text to standard output.
   *
   * @param args command-line arguments (not used)
   * @throws Exception if the download or the parse fails
   */
  public static void main(String[] args) throws Exception {
    URL url = new URL("http://website.com/document.pdf");
    ContentHandler contenthandler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    PDFParser pdfparser = new PDFParser();

    // The stream variable was never declared and url was never used: open the
    // stream from url here and close it when parsing finishes.
    try (InputStream is = url.openStream()) {
      pdfparser.parse(is, contenthandler, metadata, new ParseContext());
    }

    System.out.println(contenthandler.toString());
  }
}

/**
 * Parses {@code pdfFile} and, when the resulting document reports itself as
 * encrypted, opens its protection with the hard-coded password. The document
 * and the file stream are closed before the method returns.
 *
 * @param pdfFile the PDF file to open
 * @throws Exception if the file cannot be read, parsed or decrypted
 */
private void openPDFDoc(final File pdfFile) throws Exception {
   // try-with-resources releases the file handle on every path.
   try (BufferedInputStream pdfStream =
       new BufferedInputStream(new FileInputStream(pdfFile))) {
     PDFParser parser = new PDFParser(pdfStream);
     parser.parse();
     PDDocument originalPdfDoc = parser.getPDDocument();
     try {
       if (originalPdfDoc.isEncrypted()) {
         originalPdfDoc.openProtection(new StandardDecryptionMaterial("password"));
       }
     } finally {
       // The document is local to this method, so nothing else can close it.
       originalPdfDoc.close();
     }
   }
 }

 /**
  * Parses the document twice into the same handler and metadata: first with
  * {@code GrobidRESTParser}, which is given the path of a file obtained from
  * the stream, and then with {@code PDFParser}, which reads that same file.
  *
  * @param stream the document to parse
  * @param handler receives the content events from both parsers
  * @param metadata receives the metadata from both parsers
  * @param context parse context passed to both parsers
  * @throws IOException if the file cannot be obtained or read
  * @throws SAXException if the handler reports an error
  * @throws TikaException if either parser fails
  */
 public void parse(InputStream stream, ContentHandler handler,
   Metadata metadata, ParseContext context) throws IOException,
   SAXException, TikaException {
  // Closing the TemporaryResources on exit releases anything TikaInputStream
  // registered with it while producing the file; it was never released before.
  try (TemporaryResources tmp = new TemporaryResources()) {
   TikaInputStream tis = TikaInputStream.get(stream, tmp);
   File tmpFile = tis.getFile();

   GrobidRESTParser grobidParser = new GrobidRESTParser();
   grobidParser.parse(tmpFile.getAbsolutePath(), handler, metadata, context);

   PDFParser parser = new PDFParser();
   // The second pass needs its own stream on the file; close it when done.
   try (FileInputStream fileStream = new FileInputStream(tmpFile)) {
    parser.parse(fileStream, handler, metadata, context);
   }
  }
 }
}

 /**
  * Parses the document twice into the same handler and metadata: first with
  * {@code GrobidRESTParser}, which is given the path of a file obtained from
  * the stream, and then with {@code PDFParser}, which reads that same file.
  *
  * @param stream the document to parse
  * @param handler receives the content events from both parsers
  * @param metadata receives the metadata from both parsers
  * @param context parse context passed to both parsers
  * @throws IOException if the file cannot be obtained or read
  * @throws SAXException if the handler reports an error
  * @throws TikaException if either parser fails
  */
 public void parse(InputStream stream, ContentHandler handler,
   Metadata metadata, ParseContext context) throws IOException,
   SAXException, TikaException {
  // Closing the TemporaryResources on exit releases anything TikaInputStream
  // registered with it while producing the file; it was never released before.
  try (TemporaryResources tmp = new TemporaryResources()) {
   TikaInputStream tis = TikaInputStream.get(stream, tmp);
   File tmpFile = tis.getFile();

   GrobidRESTParser grobidParser = new GrobidRESTParser();
   grobidParser.parse(tmpFile.getAbsolutePath(), handler, metadata, context);

   PDFParser parser = new PDFParser();
   // The second pass needs its own stream on the file; close it when done.
   try (FileInputStream fileStream = new FileInputStream(tmpFile)) {
    parser.parse(fileStream, handler, metadata, context);
   }
  }
 }
}

 // Parse the PDF at the hard-coded path and obtain its PDDocument.
 PDFParser pdfParser = new PDFParser(new FileInputStream("c:\\temp\\owgr49f2013.pdf"));
pdfParser.parse();
PDDocument pdDocument = pdfParser.getPDDocument();

// Configure the stripper: "UTF-8" is passed to its constructor, position
// sorting is turned off, and "###" is set as the word separator.
PDFTextStripper stripper = new PDFTextStripper("UTF-8");
stripper.setSortByPosition(false);
stripper.setWordSeparator("###");
// Print the extracted text to standard output.
System.out.println(stripper.getText(pdDocument));
// NOTE(review): pdDocument is not closed within this snippet — confirm the surrounding code closes it.

parser.parse();
cosDoc = parser.getDocument();
pdfStripper = new PDFTextStripper();

  /**
   * Parses the stream with {@code pdfParser} (position sorting enabled) into a
   * {@code PageContentHandler} built from {@code filterPatterns}, then turns
   * the handler's improved content into one {@code Page} per map entry, adding
   * each string of the entry's list to that page's paragraphs.
   *
   * @param fs the document stream to parse
   * @param filterPatterns patterns handed to the content handler
   * @return the pages, in the iteration order of the handler's content map
   * @throws Exception if parsing fails
   */
  @Override
  public List<Page> parse(InputStream fs, List<String> filterPatterns) throws Exception {
    PageContentHandler contentHandler = new PageContentHandler(filterPatterns);
    pdfParser.setSortByPosition(true);
    pdfParser.parse(fs, contentHandler, new Metadata(), new ParseContext());

    Map<Integer, List<String>> paragraphsByPage = contentHandler.getImprovedContent();
    List<Page> result = new ArrayList<>();
    for (Map.Entry<Integer, List<String>> entry : paragraphsByPage.entrySet()) {
      Page current = new Page(entry.getKey());
      for (String paragraph : entry.getValue()) {
        current.getParagraphs().add(paragraph);
      }
      result.add(current);
    }
    return result;
  }
}

parser.parse();
cosDoc = parser.getDocument();
pdfStripper = new PDFTextStripper();

parser.parse();
cosDoc = parser.getDocument();
pdfStripper = new PDFTextStripper();

parser.parse();

 /**
  * Extracts the text of a PDF file.
  *
  * <p>This is a best-effort helper: any failure while reading, parsing or
  * closing is swallowed and reported to the caller only through the return
  * value.
  *
  * @param file the PDF file to read
  * @return the extracted text, or {@code null} if extraction failed
  */
 public static String pdftoText(File file) {
  String parsedText = null;
  PDDocument pdDoc = null;
  COSDocument cosDoc = null;
  // try-with-resources releases the file handle on every path, including a
  // failure inside the parser constructor.
  try (FileInputStream in = new FileInputStream(file)) {
    PDFParser parser = new PDFParser(in);
    parser.parse();
    cosDoc = parser.getDocument();
    PDFTextStripper pdfStripper = new PDFTextStripper();
    pdDoc = new PDDocument(cosDoc);
    parsedText = pdfStripper.getText(pdDoc);
  } catch (Exception ignored) {
    // Deliberately ignored: a null (or partial) result tells the caller that
    // extraction did not complete.
  } finally {
    try {
      if (cosDoc != null) {
        cosDoc.close();
      }
      if (pdDoc != null) {
        pdDoc.close();
      }
    } catch (Exception ignored) {
      // Deliberately ignored: a failure to close must not mask the result.
    }
  }
  return parsedText;
}

  /**
   * Posts to the "Fop" screen URL, checks that the response is a PDF, extracts
   * its text and asserts that the text contains no FreeMarker template error.
   *
   * @throws Exception if the request, the PDF parsing or closing the stream fails
   */
  public void testFopMacroLibrary() throws Exception {
    String screentextUrl = screenUrl.concat("Fop");
    HttpClient http = initHttpClient();
    http.setUrl(screentextUrl.concat(authentificationQuery));
    //FIXME need to check that the stream is an application/pdf that does not contain an ftl stack trace
    InputStream screenInputStream = http.postStream();
    assertNotNull("Response failed from ofbiz", screenInputStream);

    String screenOutString = "";
    try {
      // Checked inside the try block so the stream is still closed when the
      // content type is not the expected one.
      assertEquals("Response contentType isn't good : " + http.getResponseContentType(), "application/pdf;charset=UTF-8", http.getResponseContentType());

      BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE);
      Metadata metadata = new Metadata();
      new PDFParser().parse(screenInputStream, handler, metadata, new ParseContext());
      screenOutString = handler.toString();
    } finally {
      screenInputStream.close();
    }
    //Test if a ftl macro error is present
    assertFalse("Fop Screen contains Macro on error : see " + screentextUrl + " for more detail", screenOutString.contains("FreeMarker template error:"));
  }
}

Popular methods of PDFParser

<init>
addMetadata
decode
extractDublinCoreListItems
This tries to read a list from a particular property in XMPSchemaDublinCore. If it can't find the in
extractMetadata
extractMultilingualItems
Try to extract all multilingual items from the XMPSchema. This relies on the property having a valid…
getPassword
getXMPBagOrSeqList
As of this writing, XMPSchema can contain bags or sequence lists for some attributes...despite stand
handleXFAOnly
loadDOM
shouldHandleXFAOnly
getDocument

Popular in Java

Running tasks concurrently on multiple threads
startActivity (Activity)
addToBackStack (FragmentTransaction)
getSupportFragmentManager (FragmentActivity)
IOException (java.io)
Signals a general, I/O-related error. Error details may be specified when calling the constructor, a
Connection (java.sql)
A connection represents a link from a Java application to a database. All SQL statements and results
MessageFormat (java.text)
Produces concatenated messages in language-neutral way. New code should probably use java.util.Forma
Timer (java.util)
Timers schedule one-shot or recurring TimerTask for execution. Prefer java.util.concurrent.Scheduled
FileUtils (org.apache.commons.io)
General file manipulation utilities. Facilities are provided in the following areas: * writing to a
JLabel (javax.swing)
From CI to AI: The AI layer in your organization

How to use the parse method in org.apache.tika.parser.pdf.PDFParser

Best Java code snippets using org.apache.tika.parser.pdf.PDFParser.parse (Showing top 20 results out of 315)

How to use the parse method in org.apache.tika.parser.pdf.PDFParser