@Override public void process(JCas aJCas) throws AnalysisEngineProcessException { DocumentMetaData meta = iterate(aJCas, DocumentMetaData.class).iterator().next(); // make a new, empty document Document doc = new Document(); // Add the document metadata. Use fields that are indexed (i.e. searchable), but don't // tokenize the field into words. doc.add(new Field("documentUri", meta.getDocumentUri(), Store.YES, Index.NOT_ANALYZED)); doc.add(new Field("documentBaseUri", meta.getDocumentBaseUri(), Store.YES, Index.NOT_ANALYZED)); doc.add(new Field("language", aJCas.getDocumentLanguage(), Store.YES, Index.NOT_ANALYZED)); // Add all tokens to the index without any further processing. doc.add(new Field("token", AnnotationStream.create(iterate(aJCas, Token.class)), TermVector.YES)); // Optionally store the document text. It can be retrieved but not searched. if (storeText) { doc.add(new Field("text", aJCas.getDocumentText(), Store.YES, Index.NO)); } try { writer.addDocument(doc); } catch (IOException e) { throw new AnalysisEngineProcessException(e); } }
/**
 * Dumps the given CAS to standard output: first the document URI and language, then the
 * document text, and finally one line per annotation giving its type name, begin/end offsets
 * and covered text.
 *
 * @param aJCas
 *            the CAS to print.
 * @throws AnalysisEngineProcessException
 *             declared by the overridden method; not thrown explicitly here.
 */
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
    DocumentMetaData docInfo = iterate(aJCas, DocumentMetaData.class).iterator().next();

    // Header section: where the document came from and which language it is in.
    System.out.println("=== METADATA ========================================");
    String uriLine = "URI : " + docInfo.getDocumentUri();
    System.out.println(uriLine);
    String languageLine = "Language: " + aJCas.getDocumentLanguage();
    System.out.println(languageLine);

    // Raw document text.
    System.out.println("=== TEXT ============================================");
    String documentText = aJCas.getDocumentText();
    System.out.println(documentText);

    // One line per annotation: typeName(begin,end) [coveredText]
    System.out.println("=== ANNOTATIONS =====================================");
    for (Annotation annotation : iterate(aJCas, Annotation.class)) {
        String typeName = annotation.getType().getName();
        String span = "(" + annotation.getBegin() + "," + annotation.getEnd() + ")";
        String covered = " [" + annotation.getCoveredText() + "]";
        System.out.println(typeName + span + covered);
    }
}
}
/**
 * Reads all documents below {@code src/test/resources/textfiles} with a
 * {@code TextFileReader}. For every entry of {@code testFileContent} whose key is a suffix of
 * a document URI, the document text must equal the entry's value. At the end, the number of
 * such matches must equal the number of entries in {@code testFileContent}.
 */
@Test
public void test() throws Exception {
    CollectionReader textReader = CollectionReaderFactory.createCollectionReader(
            TextFileReader.class,
            createTypeSystemDescription(),
            TextFileReader.PARAM_PATH, "src/test/resources/textfiles",
            TextFileReader.PARAM_LANGUAGE, "Latin");

    // A single CAS is reused for all documents and reset after each one.
    CAS document = CasCreationUtils.createCas(textReader.getProcessingResourceMetaData());
    int matches = 0;

    while (textReader.hasNext()) {
        textReader.getNext(document);

        DocumentMetaData docInfo =
                iterate(document.getJCas(), DocumentMetaData.class).iterator().next();
        String uri = docInfo.getDocumentUri();

        for (Entry<String, String> expected : testFileContent.entrySet()) {
            if (!uri.endsWith(expected.getKey())) {
                continue;
            }
            assertEquals(expected.getValue(), document.getDocumentText());
            matches++;
        }

        document.reset();
    }

    assertEquals(testFileContent.size(), matches);
}
}