@Override public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException { DocumentAnnotation doc = UimaSupport.getDocumentAnnotation(jCas); doc.setSourceUri(source); doc.setTimestamp(System.currentTimeMillis()); // Add metadata item to capture which content extractor was used addMetadata(jCas, METADATA_KEY_CONTENT_EXTRACTOR, this.getClass().getName()); }
@Override public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException { DocumentAnnotation doc = UimaSupport.getDocumentAnnotation(jCas); doc.setSourceUri(source); doc.setTimestamp(System.currentTimeMillis()); // Add metadata item to capture which content extractor was used addMetadata(jCas, METADATA_KEY_CONTENT_EXTRACTOR, this.getClass().getName()); }
/**
 * Uses {@code parentDir} as the base directory and expects the doc type to equal the part of
 * {@code tmp}'s absolute path that lies between the base directory and the file name (path
 * separators on either side excluded).
 */
@Test
public void testBaseDirectoryOneLayers() throws Exception {
  final DocumentAnnotation annotation = getDocumentAnnotation();
  annotation.setSourceUri(tmp.getAbsolutePath());

  processJCas(BASE_DIRECTORY, parentDir.getAbsolutePath());

  final String filePath = tmp.getAbsolutePath();
  // Skip the base directory and the separator that follows it
  final int start = parentDir.getAbsolutePath().length() + 1;
  // Stop before the separator that precedes the file name
  final int end = filePath.length() - tmp.getName().length() - 1;
  final String expected = filePath.substring(start, end);

  assertEquals(expected, annotation.getDocType());
}
/**
 * Uses {@code topDir} as the base directory and expects the doc type to equal the part of
 * {@code tmp}'s absolute path that lies between the base directory and the file name (path
 * separators on either side excluded).
 */
@Test
public void testBaseDirectoryTwoLayers() throws Exception {
  final DocumentAnnotation annotation = getDocumentAnnotation();
  annotation.setSourceUri(tmp.getAbsolutePath());

  processJCas(BASE_DIRECTORY, topDir.getAbsolutePath());

  final String filePath = tmp.getAbsolutePath();
  // Skip the base directory and the separator that follows it
  final int start = topDir.getAbsolutePath().length() + 1;
  // Stop before the separator that precedes the file name
  final int end = filePath.length() - tmp.getName().length() - 1;
  final String expected = filePath.substring(start, end);

  assertEquals(expected, annotation.getDocType());
}
}
/**
 * Copies a MUC entry into the given JCas.
 *
 * <p>The document language is set to {@code "en"}, the document text to the entry's text, and the
 * source URI of the document annotation to the entry's id.
 *
 * @param entry the entry supplying the text and id
 * @param jCas the JCas to populate
 */
@Override
protected void apply(MucEntry entry, JCas jCas) {
  jCas.setDocumentLanguage("en");
  jCas.setDocumentText(entry.getText());

  final DocumentAnnotation annotation = UimaSupport.getDocumentAnnotation(jCas);
  annotation.setSourceUri(entry.getId());
}
/**
 * Populates {@code jCas} from a single MUC entry: language {@code "en"}, text taken from the
 * entry, and the entry id stored as the source URI on the document annotation.
 *
 * @param entry the entry to read the text and id from
 * @param jCas the JCas that receives the language, text and source URI
 */
@Override
protected void apply(MucEntry entry, JCas jCas) {
  jCas.setDocumentLanguage("en");
  jCas.setDocumentText(entry.getText());

  // The entry id doubles as the document's source URI
  final DocumentAnnotation documentAnnotation = UimaSupport.getDocumentAnnotation(jCas);
  documentAnnotation.setSourceUri(entry.getId());
}
/**
 * Uses {@code childDir} as the base directory with {@code tmp} as the source and expects the
 * resulting doc type to be the empty string.
 */
@Test
public void testBaseDirectory() throws Exception {
  final DocumentAnnotation annotation = getDocumentAnnotation();
  annotation.setSourceUri(tmp.getAbsolutePath());

  processJCas(BASE_DIRECTORY, childDir.getAbsolutePath());

  final String docType = annotation.getDocType();
  assertEquals("", docType);
}
/**
 * Creates a {@code TestFileConsumer} engine without passing any configuration parameters, runs it
 * over a document whose source URI is {@code FILENAME}, and checks that a file with that name
 * exists afterwards. The file is deleted at the end of the test.
 */
@Test
public void testNullBasePath() throws Exception {
  final AnalysisEngine engine =
      AnalysisEngineFactory.createEngine(
          TestFileConsumer.class, TypeSystemSingleton.getTypeSystemDescriptionInstance());

  final DocumentAnnotation annotation = (DocumentAnnotation) jCas.getDocumentAnnotationFs();
  annotation.setSourceUri(FILENAME);

  engine.process(jCas);

  final File output = new File(FILENAME);
  assertTrue(output.exists());
  // Clean up the file produced by the consumer
  output.delete();
}
}
/**
 * Sets a source path containing {@code 2017/01/23} and an initial timestamp of 1, processes the
 * document, and expects the timestamp to have become 23 January 2017 as produced by
 * {@link GregorianCalendar} (which uses the default time zone).
 */
@Test
public void test() throws Exception {
  getDocumentAnnotation().setSourceUri("/this/is/a/2017/01/23/valid/path/index.html");
  getDocumentAnnotation().setTimestamp(1);
  jCas.setDocumentText("Hello world.");

  processJCas();

  final long actual = getDocumentAnnotation().getTimestamp();
  // Calendar months are zero-based, so JANUARY is 0
  final long expected =
      new GregorianCalendar(2017, GregorianCalendar.JANUARY, 23).getTimeInMillis();
  assertEquals(expected, actual);
}
}
/**
 * Supplies a pattern with a lower-case character class and a default of {@code "unknown"},
 * without setting {@code PARAM_CASE_SENSITIVE}, and expects the doc type {@code "t"} for a file
 * name whose matching character is the upper-case {@code T}.
 */
@Test
public void testPatternCaseSensitiveFalse() throws Exception {
  final String pattern = "\\d{8}-([a-z]).*";
  final String fallback = "unknown";

  final DocumentAnnotation annotation = getDocumentAnnotation();
  annotation.setSourceUri("20170127-Test_Document.docx");

  processJCas(
      DocumentTypeByFilename.PARAM_PATTERN, pattern,
      DocumentTypeByFilename.PARAM_DEFAULT, fallback);

  assertEquals("t", annotation.getDocType());
}
/**
 * Processes a document whose source URI is {@code file} with no parameters and expects exactly
 * one {@code Metadata} annotation, with key {@code "source"} and value equal to {@code file}.
 */
@Test
public void testPath() throws Exception {
  ((DocumentAnnotation) jCas.getDocumentAnnotationFs()).setSourceUri(file);

  processJCas();

  final int metadataCount = JCasUtil.select(jCas, Metadata.class).size();
  assertEquals(1, metadataCount);

  final Metadata metadata = JCasUtil.selectByIndex(jCas, Metadata.class, 0);
  assertEquals("source", metadata.getKey());
  assertEquals(file, metadata.getValue());
}
/**
 * Supplies a pattern requiring two leading letters and a default of {@code "unknown"}; for a
 * file name that starts with digits the doc type is expected to be the default.
 */
@Test
public void testPatternNoMatch() throws Exception {
  final String pattern = "([a-z]{2}).*";
  final String fallback = "unknown";

  final DocumentAnnotation annotation = getDocumentAnnotation();
  annotation.setSourceUri("20170127-Test_Document.docx");

  processJCas(
      DocumentTypeByFilename.PARAM_PATTERN, pattern,
      DocumentTypeByFilename.PARAM_DEFAULT, fallback);

  assertEquals(fallback, annotation.getDocType());
}
/**
 * Prepares the shared JCas for a test: sets the source URI to {@code TEST1_TXT} and adds one
 * {@code CommsIdentifier} and one {@code Person} annotation to the indexes.
 */
private void createDocument() {
  ((DocumentAnnotation) jCas.getDocumentAnnotationFs()).setSourceUri(TEST1_TXT);

  final CommsIdentifier commsIdentifier = new CommsIdentifier(jCas);
  commsIdentifier.addToIndexes();

  final Person person = new Person(jCas);
  person.addToIndexes();
}
/**
 * Supplies a pattern with three capture groups (4, 2 and 2 digits) and selects group 2; for the
 * file name {@code 20170127-...} the doc type is expected to be {@code "01"}.
 */
@Test
public void testGroup() throws Exception {
  final String pattern = "(\\d{4})(\\d{2})(\\d{2}).*";

  final DocumentAnnotation annotation = getDocumentAnnotation();
  annotation.setSourceUri("20170127-Test_Document.docx");

  processJCas(
      DocumentTypeByFilename.PARAM_PATTERN, pattern,
      DocumentTypeByFilename.PARAM_GROUP, 2);

  assertEquals("01", annotation.getDocType());
}
/**
 * Sets the prefix parameter to {@code "filetype_"} and expects the doc type for a {@code .docx}
 * file name to be {@code "filetype_docx"}.
 */
@Test
public void testPrefix() throws Exception {
  final DocumentAnnotation annotation = getDocumentAnnotation();
  annotation.setSourceUri("20170127-Test_Document.docx");

  processJCas(DocumentTypeByFilename.PARAM_PREFIX, "filetype_");

  final String docType = annotation.getDocType();
  assertEquals("filetype_docx", docType);
}
/**
 * Supplies a pattern capturing the first four digits of the file name and expects the doc type
 * to be {@code "2017"}.
 */
@Test
public void testPattern() throws Exception {
  final String pattern = "(\\d{4}).*";

  final DocumentAnnotation annotation = getDocumentAnnotation();
  annotation.setSourceUri("20170127-Test_Document.docx");

  processJCas(DocumentTypeByFilename.PARAM_PATTERN, pattern);

  assertEquals("2017", annotation.getDocType());
}
/**
 * Sets {@code PARAM_LOWER_CASE} to {@code false} with a pattern capturing the first character
 * after the date prefix, and expects the doc type to keep its original case ({@code "T"}).
 */
@Test
public void testLowerCase() throws Exception {
  final String pattern = "\\d{8}-([a-z]).*";

  final DocumentAnnotation annotation = getDocumentAnnotation();
  annotation.setSourceUri("20170127-Test_Document.docx");

  processJCas(
      DocumentTypeByFilename.PARAM_PATTERN, pattern,
      DocumentTypeByFilename.PARAM_LOWER_CASE, false);

  assertEquals("T", annotation.getDocType());
}
}
/**
 * Processes a {@code .docx} file name with no parameters and expects the doc type to be
 * {@code "docx"}.
 */
@Test
public void testDefault() throws Exception {
  final DocumentAnnotation annotation = getDocumentAnnotation();
  annotation.setSourceUri("20170127-Test_Document.docx");

  processJCas();

  final String docType = annotation.getDocType();
  assertEquals("docx", docType);
}
/**
 * Processes a document whose source URI is {@code file} with {@code nameOnly=true} and
 * {@code key="title"}, and expects exactly one {@code Metadata} annotation with key
 * {@code "title"} and value {@code "Test Document"}.
 */
@Test
public void testName() throws Exception {
  ((DocumentAnnotation) jCas.getDocumentAnnotationFs()).setSourceUri(file);

  processJCas("nameOnly", true, "key", "title");

  final int metadataCount = JCasUtil.select(jCas, Metadata.class).size();
  assertEquals(1, metadataCount);

  final Metadata metadata = JCasUtil.selectByIndex(jCas, Metadata.class, 0);
  assertEquals("title", metadata.getKey());
  assertEquals("Test Document", metadata.getValue());
}
}
/**
 * Supplies a pattern with a lower-case character class, a default of {@code "unknown"}, and
 * {@code PARAM_CASE_SENSITIVE} set to {@code true}; since the candidate character in the file
 * name is the upper-case {@code T}, the doc type is expected to be the default.
 */
@Test
public void testPatternCaseSensitiveTrue() throws Exception {
  final String pattern = "\\d{8}-([a-z]).*";
  final String fallback = "unknown";

  final DocumentAnnotation annotation = getDocumentAnnotation();
  annotation.setSourceUri("20170127-Test_Document.docx");

  processJCas(
      DocumentTypeByFilename.PARAM_PATTERN, pattern,
      DocumentTypeByFilename.PARAM_DEFAULT, fallback,
      DocumentTypeByFilename.PARAM_CASE_SENSITIVE, true);

  assertEquals(fallback, annotation.getDocType());
}