/**
 * Builds a new {@link JCas} that carries one {@link Header} annotation.
 *
 * @param filePath value recorded as the header's source
 * @param docId    value recorded as the header's document id
 * @return the newly created JCas, with the header already added to its indexes
 * @throws UIMAException propagated from the JCas creation
 */
public static JCas newCasFromFile(String filePath, String docId)
        throws UIMAException {
    final JCas cas = JCasFactory.createJCas();

    // Attach the identifying metadata before handing the CAS back.
    final Header docHeader = new Header(cas);
    docHeader.setDocId(docId);
    docHeader.setSource(filePath);
    docHeader.addToIndexes();

    return cas;
}
@Override public void getNext(JCas jcas) throws IOException, CollectionException { File f = fileIterator.next(); Header header = new Header(jcas); // .* removes the tmp part header.setDocId(f.getName().replaceAll("\\.pdf.*", "")); header.setSource(f.getAbsolutePath()); header.addToIndexes(); PDFTextStream pdf = new PDFTextStream(f); BlockHandler blueHandler = new BlockHandler(); pdf.pipe(blueHandler); pdf.close(); extractText(jcas, blueHandler.getDoc(), header.getDocId(), expandAbbrevs); if (extractTables) extractTables(tableExtractor, f, jcas); // printHtml(jcas, new File("target/" + header.getDocId() + ".html")); }
// Fill in the header: doc id from pubmedId, title, and the component id constant of PubmedWebServiceCollectionReader.
header.setDocId(pubmedId); header.setTitle(title); header.setComponentId(PubmedWebServiceCollectionReader.COMPONENT_ID);