/**
 * Resolves the folder that output for the given document should go to.
 *
 * <p>The starting point is the configured {@code destination}. When {@code splitByType} is set
 * and the document has a non-empty type, a sub-folder named after that type is returned instead.
 *
 * @param da the document annotation whose type may select a sub-folder
 * @return the destination folder, or its type-named sub-folder
 */
private File getDestinationFolder(DocumentAnnotation da) {
  final File root = new File(destination);
  if (!splitByType) {
    return root;
  }

  // Only documents that actually carry a type get their own sub-folder.
  final String docType = da.getDocType();
  return Strings.isNullOrEmpty(docType) ? root : new File(root, docType);
}
}
/**
 * Copies the fields of a document annotation into the supplied properties map, using the
 * {@code AnalysisConstants} keys.
 *
 * <p>Caveats and releasability are converted with {@code UimaTypesUtils.toList} and the timestamp
 * is wrapped in a {@link Date}; all other values are stored as returned by the annotation.
 *
 * @param properties the map to populate (modified in place)
 * @param da the document annotation to read from
 */
private void addDocumentAnnotationToProperties(
    final Map<String, Object> properties, final DocumentAnnotation da) {
  // Type and protective-marking style fields.
  properties.put(AnalysisConstants.DOCUMENT_TYPE, da.getDocType());
  properties.put(AnalysisConstants.CAVEATS, UimaTypesUtils.toList(da.getDocumentCaveats()));
  properties.put(AnalysisConstants.CLASSIFICATION, da.getDocumentClassification());
  properties.put(
      AnalysisConstants.RELEASABILITY, UimaTypesUtils.toList(da.getDocumentReleasability()));

  // Remaining descriptive fields.
  properties.put(AnalysisConstants.LANGUAGE, da.getLanguage());
  properties.put(AnalysisConstants.HASH, da.getHash());
  properties.put(AnalysisConstants.SOURCE, da.getSourceUri());

  final Date timestamp = new Date(da.getTimestamp());
  properties.put(AnalysisConstants.TIMESTAMP, timestamp);
}
/**
 * Builds a map representation of a document annotation, keyed by the {@code JsonJCas.DA_*}
 * constants.
 *
 * <p>Caveats and releasability are stored as {@code String[]}; when the annotation returns
 * {@code null} for either, an empty array is stored instead.
 *
 * @param da the document annotation to serialise
 * @return a new map holding the annotation's values
 */
private Map<String, Object> serialiseDocumentAnnotation(final DocumentAnnotation da) {
  final Map<String, Object> serialised = new HashMap<>();

  serialised.put(JsonJCas.DA_DOCUMENT_TYPE, da.getDocType());
  serialised.put(JsonJCas.DA_LANGUAGE, da.getLanguage());
  serialised.put(JsonJCas.DA_SOURCE_URI, da.getSourceUri());
  serialised.put(JsonJCas.DA_CLASSIFICATION, da.getDocumentClassification());

  final String[] caveatValues;
  if (da.getDocumentCaveats() == null) {
    caveatValues = new String[0];
  } else {
    caveatValues = da.getDocumentCaveats().toArray();
  }
  serialised.put(JsonJCas.DA_CAVEATS, caveatValues);

  final String[] releasabilityValues;
  if (da.getDocumentReleasability() == null) {
    releasabilityValues = new String[0];
  } else {
    releasabilityValues = da.getDocumentReleasability().toArray();
  }
  serialised.put(JsonJCas.DA_RELEASABILITY, releasabilityValues);

  return serialised;
}
/**
 * With the base directory set to {@code parentDir}, expects the document type to equal the part
 * of the source path that lies between the base directory and the file name.
 */
@Test
public void testBaseDirectoryOneLayers() throws Exception {
  DocumentAnnotation annotation = getDocumentAnnotation();
  final String sourcePath = tmp.getAbsolutePath();
  annotation.setSourceUri(sourcePath);

  processJCas(BASE_DIRECTORY, parentDir.getAbsolutePath());

  // Skip the base directory plus one separator; stop one separator before the file name.
  final int start = parentDir.getAbsolutePath().length() + 1;
  final int end = sourcePath.length() - tmp.getName().length() - 1;
  final String expectedType = sourcePath.substring(start, end);

  assertEquals(expectedType, annotation.getDocType());
}
/**
 * With the base directory set to {@code topDir}, expects the document type to equal the part of
 * the source path that lies between the base directory and the file name.
 */
@Test
public void testBaseDirectoryTwoLayers() throws Exception {
  DocumentAnnotation annotation = getDocumentAnnotation();
  final String sourcePath = tmp.getAbsolutePath();
  annotation.setSourceUri(sourcePath);

  processJCas(BASE_DIRECTORY, topDir.getAbsolutePath());

  // Skip the base directory plus one separator; stop one separator before the file name.
  final int start = topDir.getAbsolutePath().length() + 1;
  final int end = sourcePath.length() - tmp.getName().length() - 1;
  final String expectedType = sourcePath.substring(start, end);

  assertEquals(expectedType, annotation.getDocType());
}
}
/**
 * Runs the annotator with a {@code null} base directory and then with a base directory of
 * {@code "/not/the/path"}, and expects that in neither case the document type equals the path
 * relative to {@code parentDir}.
 */
@Test
public void testBadParameters() throws Exception {
  DocumentAnnotation annotation = getDocumentAnnotation();
  final String sourcePath = tmp.getAbsolutePath();
  annotation.setSourceUri(sourcePath);

  // First attempt: no base directory at all.
  processJCas(BASE_DIRECTORY, null);

  final int start = parentDir.getAbsolutePath().length() + 1;
  final int end = sourcePath.length() - tmp.getName().length() - 1;
  final String relativeToParent = sourcePath.substring(start, end);
  assertNotEquals(relativeToParent, annotation.getDocType());

  // Second attempt: a base directory that is not the parent directory.
  processJCas(BASE_DIRECTORY, "/not/the/path");
  assertNotEquals(relativeToParent, annotation.getDocType());
}
@Test public void testThrehold() throws Exception { jCas.setDocumentText("This text isn't going to score above the threshold."); processJCas( DocumentType.PARAM_MODEL, getClass().getResource(DOCUMENTTYPE_BIN).getPath(), DocumentType.PARAM_CONFIDENCE_THRESHOLD, "0.99"); // Model trained on IOM and BBC reporting, and is OFFICIAL DocumentAnnotation da = getDocumentAnnotation(); assertEquals(null, da.getDocType()); } }
@Test public void test() throws Exception { try { DocumentAnnotation da = getDocumentAnnotation(); da.setSourceUri(tmp.getAbsolutePath()); processJCas(); // Remove slash (requried for unix paths) String absolutePath = childDir.getAbsolutePath(); if (absolutePath.startsWith(File.separator)) { absolutePath = absolutePath.substring(1); } assertEquals(absolutePath, da.getDocType()); } finally { tmp.delete(); } }
/**
 * With the base directory set to {@code childDir}, expects the document type to be the empty
 * string.
 */
@Test
public void testBaseDirectory() throws Exception {
  DocumentAnnotation annotation = getDocumentAnnotation();
  final String sourcePath = tmp.getAbsolutePath();
  annotation.setSourceUri(sourcePath);

  final String baseDirectory = childDir.getAbsolutePath();
  processJCas(BASE_DIRECTORY, baseDirectory);

  assertEquals("", annotation.getDocType());
}
/** With the type parameter set to {@code "test"}, expects that value as the document type. */
@Test
public void testType() throws AnalysisEngineProcessException, ResourceInitializationException {
  processJCas(DocumentTypeByParameter.PARAM_TYPE, "test");

  DocumentAnnotation annotation = getDocumentAnnotation();
  assertEquals("test", annotation.getDocType());
}
/** With the type parameter set to {@code null}, expects the document type to be null. */
@Test
public void testNullType()
    throws AnalysisEngineProcessException, ResourceInitializationException {
  processJCas(DocumentTypeByParameter.PARAM_TYPE, null);

  DocumentAnnotation annotation = getDocumentAnnotation();
  assertNull(annotation.getDocType());
}
/** With the type parameter set to the empty string, expects the document type to be null. */
@Test
public void testEmptyType()
    throws AnalysisEngineProcessException, ResourceInitializationException {
  processJCas(DocumentTypeByParameter.PARAM_TYPE, "");

  DocumentAnnotation annotation = getDocumentAnnotation();
  assertNull(annotation.getDocType());
}
}
/**
 * With a lower-case-only capture pattern and no case-sensitivity parameter supplied, expects the
 * upper-case letter after the date prefix to be reported as {@code "t"}.
 */
@Test
public void testPatternCaseSensitiveFalse() throws Exception {
  final String sourceUri = "20170127-Test_Document.docx";
  final String pattern = "\\d{8}-([a-z]).*";

  DocumentAnnotation annotation = getDocumentAnnotation();
  annotation.setSourceUri(sourceUri);

  processJCas(
      DocumentTypeByFilename.PARAM_PATTERN,
      pattern,
      DocumentTypeByFilename.PARAM_DEFAULT,
      "unknown");

  assertEquals("t", annotation.getDocType());
}
/**
 * With no parameters, expects a source URI of {@code 20170127-Test_Document.docx} to yield the
 * document type {@code "docx"}.
 */
@Test
public void testDefault() throws Exception {
  final String sourceUri = "20170127-Test_Document.docx";

  DocumentAnnotation annotation = getDocumentAnnotation();
  annotation.setSourceUri(sourceUri);

  processJCas();

  assertEquals("docx", annotation.getDocType());
}
/**
 * With the prefix parameter set to {@code "filetype_"}, expects the document type
 * {@code "filetype_docx"} for a {@code .docx} source URI.
 */
@Test
public void testPrefix() throws Exception {
  final String sourceUri = "20170127-Test_Document.docx";

  DocumentAnnotation annotation = getDocumentAnnotation();
  annotation.setSourceUri(sourceUri);

  processJCas(DocumentTypeByFilename.PARAM_PREFIX, "filetype_");

  assertEquals("filetype_docx", annotation.getDocType());
}
/**
 * With a pattern capturing the first four digits, expects the document type {@code "2017"} for
 * the sample source URI.
 */
@Test
public void testPattern() throws Exception {
  final String sourceUri = "20170127-Test_Document.docx";
  final String pattern = "(\\d{4}).*";

  DocumentAnnotation annotation = getDocumentAnnotation();
  annotation.setSourceUri(sourceUri);

  processJCas(DocumentTypeByFilename.PARAM_PATTERN, pattern);

  assertEquals("2017", annotation.getDocType());
}
/**
 * With the lower-case parameter set to {@code false}, expects the captured letter to keep its
 * original case, giving the document type {@code "T"}.
 */
@Test
public void testLowerCase() throws Exception {
  final String sourceUri = "20170127-Test_Document.docx";
  final String pattern = "\\d{8}-([a-z]).*";

  DocumentAnnotation annotation = getDocumentAnnotation();
  annotation.setSourceUri(sourceUri);

  processJCas(
      DocumentTypeByFilename.PARAM_PATTERN,
      pattern,
      DocumentTypeByFilename.PARAM_LOWER_CASE,
      false);

  assertEquals("T", annotation.getDocType());
}
}
/**
 * With a three-group date pattern and the group parameter set to {@code 2}, expects the second
 * group ({@code "01"}) as the document type.
 */
@Test
public void testGroup() throws Exception {
  final String sourceUri = "20170127-Test_Document.docx";
  final String pattern = "(\\d{4})(\\d{2})(\\d{2}).*";

  DocumentAnnotation annotation = getDocumentAnnotation();
  annotation.setSourceUri(sourceUri);

  processJCas(
      DocumentTypeByFilename.PARAM_PATTERN,
      pattern,
      DocumentTypeByFilename.PARAM_GROUP,
      2);

  assertEquals("01", annotation.getDocType());
}
/**
 * With a pattern that requires two leading letters (the sample source URI starts with digits),
 * expects the configured default {@code "unknown"} as the document type.
 */
@Test
public void testPatternNoMatch() throws Exception {
  final String sourceUri = "20170127-Test_Document.docx";
  final String pattern = "([a-z]{2}).*";

  DocumentAnnotation annotation = getDocumentAnnotation();
  annotation.setSourceUri(sourceUri);

  processJCas(
      DocumentTypeByFilename.PARAM_PATTERN,
      pattern,
      DocumentTypeByFilename.PARAM_DEFAULT,
      "unknown");

  assertEquals("unknown", annotation.getDocType());
}
/**
 * With case sensitivity switched on, expects the lower-case-only capture pattern to yield the
 * configured default {@code "unknown"} for a source URI whose letter after the date prefix is
 * upper-case.
 */
@Test
public void testPatternCaseSensitiveTrue() throws Exception {
  final String sourceUri = "20170127-Test_Document.docx";
  final String pattern = "\\d{8}-([a-z]).*";

  DocumentAnnotation annotation = getDocumentAnnotation();
  annotation.setSourceUri(sourceUri);

  processJCas(
      DocumentTypeByFilename.PARAM_PATTERN,
      pattern,
      DocumentTypeByFilename.PARAM_DEFAULT,
      "unknown",
      DocumentTypeByFilename.PARAM_CASE_SENSITIVE,
      true);

  assertEquals("unknown", annotation.getDocType());
}