/**
 * Copies the fields of a document annotation into a property map.
 *
 * <p>The caveats and releasability arrays are converted with {@code UimaTypesUtils.toList}, and
 * the numeric timestamp is wrapped in a {@link Date}. Every other value is stored exactly as
 * returned by its getter.
 *
 * @param properties the map that receives the values; existing entries with the same keys are
 *     overwritten
 * @param da the document annotation to read from
 */
private void addDocumentAnnotationToProperties(
    final Map<String, Object> properties, final DocumentAnnotation da) {
  properties.put(AnalysisConstants.DOCUMENT_TYPE, da.getDocType());

  final Object caveatList = UimaTypesUtils.toList(da.getDocumentCaveats());
  properties.put(AnalysisConstants.CAVEATS, caveatList);

  properties.put(AnalysisConstants.CLASSIFICATION, da.getDocumentClassification());

  final Object releasabilityList = UimaTypesUtils.toList(da.getDocumentReleasability());
  properties.put(AnalysisConstants.RELEASABILITY, releasabilityList);

  properties.put(AnalysisConstants.LANGUAGE, da.getLanguage());
  properties.put(AnalysisConstants.HASH, da.getHash());
  properties.put(AnalysisConstants.SOURCE, da.getSourceUri());

  final Date documentTime = new Date(da.getTimestamp());
  properties.put(AnalysisConstants.TIMESTAMP, documentTime);
}
/**
 * Populates a document annotation from a deserialised map of values.
 *
 * <p>Keys that are absent from the map fall back to an empty string for the text fields, to
 * {@code 0} for the timestamp and to {@code null} for the caveat and releasability collections.
 *
 * @param jCas the JCas passed to {@code UimaTypesUtils.toArray} when building the array values
 * @param da the document annotation to populate
 * @param map the source values, keyed by the {@code JsonJCas.DA_*} constants
 */
private void processDocumentAnnotation(
    final JCas jCas, final DocumentAnnotation da, final Map<String, Object> map) {
  // Plain text fields
  final String docType = (String) map.getOrDefault(JsonJCas.DA_DOCUMENT_TYPE, "");
  da.setDocType(docType);

  final String classification = (String) map.getOrDefault(JsonJCas.DA_CLASSIFICATION, "");
  da.setDocumentClassification(classification);

  final String language = (String) map.getOrDefault(JsonJCas.DA_LANGUAGE, "");
  da.setLanguage(language);

  final String sourceUri = (String) map.getOrDefault(JsonJCas.DA_SOURCE_URI, "");
  da.setSourceUri(sourceUri);

  // The timestamp may arrive as any Number subtype, so normalise it to a long
  final Number timestamp = (Number) map.getOrDefault(JsonJCas.DA_TIMESTAMP, 0);
  da.setTimestamp(timestamp.longValue());

  // Collection-valued fields are converted by UimaTypesUtils.toArray
  final Collection<String> caveats =
      (Collection<String>) map.getOrDefault(JsonJCas.DA_CAVEATS, null);
  da.setDocumentCaveats(UimaTypesUtils.toArray(jCas, caveats));

  final Collection<String> releasability =
      (Collection<String>) map.getOrDefault(JsonJCas.DA_RELEASABILITY, null);
  da.setDocumentReleasability(UimaTypesUtils.toArray(jCas, releasability));
}
@Override public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException { DocumentAnnotation doc = UimaSupport.getDocumentAnnotation(jCas); doc.setSourceUri(source); doc.setTimestamp(System.currentTimeMillis()); // Add metadata item to capture which content extractor was used addMetadata(jCas, METADATA_KEY_CONTENT_EXTRACTOR, this.getClass().getName()); }
da.setDocType(DOCTYPE); da.setDocumentCaveats(UimaTypesUtils.toArray(jCas, Arrays.asList(CAVEAT))); da.setDocumentReleasability(UimaTypesUtils.toArray(jCas, Arrays.asList(RELEASABILITY))); da.setDocumentClassification(CLASSIFICATION); da.setSourceUri(SOURCE); da.setLanguage(LANGUAGE); da.setTimestamp(DOC_TIMESTAMP); final Map<String, Object> properties = document.getProperties(); properties.put(AnalysisConstants.CAVEATS, Arrays.asList(CAVEAT)); properties.put(AnalysisConstants.CLASSIFICATION, da.getDocumentClassification()); properties.put(AnalysisConstants.DOCUMENT_TYPE, da.getDocType()); properties.put(AnalysisConstants.HASH, da.getHash()); properties.put(AnalysisConstants.RELEASABILITY, Arrays.asList(RELEASABILITY)); properties.put(AnalysisConstants.LANGUAGE, da.getLanguage()); properties.put(AnalysisConstants.TIMESTAMP, new Date(DOC_TIMESTAMP)); properties.put(AnalysisConstants.SOURCE, SOURCE);
/**
 * Checks that, with the base directory set to {@code parentDir}, the document type becomes the
 * part of the source path between that base directory and the file name.
 */
@Test
public void testBaseDirectoryOneLayers() throws Exception {
  final String sourcePath = tmp.getAbsolutePath();
  final String basePath = parentDir.getAbsolutePath();

  DocumentAnnotation da = getDocumentAnnotation();
  da.setSourceUri(sourcePath);

  processJCas(BASE_DIRECTORY, basePath);

  // Strip "<base>/" from the front and "/<fileName>" from the back
  final int start = basePath.length() + 1;
  final int end = sourcePath.length() - tmp.getName().length() - 1;
  final String relative = sourcePath.substring(start, end);

  assertEquals(relative, da.getDocType());
}
/**
 * Loads a MUC entry into the JCas: sets the language to "en", the text to the entry's text and
 * the source URI to the entry's id.
 *
 * @param entry the MUC entry to load
 * @param jCas the JCas to populate
 */
@Override
protected void apply(MucEntry entry, JCas jCas) {
  jCas.setDocumentLanguage("en");

  final String text = entry.getText();
  jCas.setDocumentText(text);

  final DocumentAnnotation annotation = UimaSupport.getDocumentAnnotation(jCas);
  annotation.setSourceUri(entry.getId());
}
/**
 * Returns the base filename from DocumentAnnotation source URI in the given JCas.
 *
 * <p>The basename is the main part of the filename, without extension or enclosing paths, e.g.
 * for path '/some/directory/SomeFile.txt' this method will return 'SomeFile'.
 *
 * @param jCas the {@link JCas} from which to get the document annotation.
 * @return the filename without its directory path or extension
 * @throws IllegalArgumentException if the underlying filename parsing rejects the document source
 *     URI (NOTE(review): depends on the Commons IO version in use - confirm).
 */
public static String getDocumentSourceBaseName(final JCas jCas) {
  DocumentAnnotation documentAnnotation = UimaSupport.getDocumentAnnotation(jCas);
  String sourceUri = documentAnnotation.getSourceUri();
  // getBaseName strips both the directory path and the extension; getName would keep the
  // extension ('SomeFile.txt'), contradicting the documented contract above.
  return FilenameUtils.getBaseName(sourceUri);
}
}
/**
 * Looks up the history for a document.
 *
 * <p>The history is keyed by the pipeline name and the document annotation's hash, joined with a
 * colon.
 *
 * @param jCas the target document
 * @return the history that the history provider returns for that key
 */
public DocumentHistory getDocumentHistory(JCas jCas) {
  final String contentHash = getDocumentAnnotation(jCas).getHash();
  final String historyKey = pipelineName + ":" + contentHash;
  return history.getHistory(historyKey);
}
/**
 * Checks that processing a document whose source path contains "2017/01/23" replaces the
 * initial timestamp with midnight on 23 January 2017 (default time zone).
 */
@Test
public void test() throws Exception {
  final DocumentAnnotation before = getDocumentAnnotation();
  before.setSourceUri("/this/is/a/2017/01/23/valid/path/index.html");
  before.setTimestamp(1);
  jCas.setDocumentText("Hello world.");

  processJCas();

  // Month is zero-based in GregorianCalendar, so 0 means January
  final long expected = new GregorianCalendar(2017, 0, 23).getTimeInMillis();
  final long actual = getDocumentAnnotation().getTimestamp();
  assertEquals(expected, actual);
}
}
/**
 * Produces an external identifier for a document.
 *
 * @param da document annotation
 * @param contentHashAsId true to return the annotation's hash; false to return a hash of the
 *     source URI
 * @return the content hash, the hashed source URI, or the result of the UUID fallback when
 *     hashing the source URI fails
 */
public static String getExternalId(DocumentAnnotation da, boolean contentHashAsId) {
  if (contentHashAsId) {
    return da.getHash();
  }

  String externalId;
  try {
    externalId = IdentityUtils.hashStrings(da.getSourceUri());
  } catch (BaleenException e) {
    // Hashing failed, so delegate to the UUID fallback
    externalId = fallbackToUUID(e);
  }
  return externalId;
}
/**
 * Works out the folder a document should be written to.
 *
 * @param da the document annotation, consulted for its document type only when splitting by type
 *     is enabled
 * @return the configured destination, or a sub-folder of it named after the document type when
 *     splitting by type is enabled and the type is neither null nor empty
 */
private File getDestinationFolder(DocumentAnnotation da) {
  final File root = new File(destination);
  if (!splitByType) {
    return root;
  }

  final String docType = da.getDocType();
  if (Strings.isNullOrEmpty(docType)) {
    return root;
  }
  return new File(root, docType);
}
}
da.setSourceUri("test.txt"); da.setDocumentClassification("UK OFFICIAL"); da.setDocumentCaveats(UimaTypesUtils.toArray(jCas, Arrays.asList("Test", "Caveats")));
private static DocumentMetaData initDocumentMetaData(DocumentMetaData aMetaData) { // If there is already a DocumentAnnotation copy it's information and delete it DocumentAnnotation da = getDocumentAnnotation(aMetaData.getView()); if (da != null) { aMetaData.setLanguage(da.getLanguage()); aMetaData.setBegin(da.getBegin()); aMetaData.setEnd(da.getEnd()); da.removeFromIndexes(); } else if (aMetaData.getView().getDocumentText() != null) { aMetaData.setBegin(0); aMetaData.setEnd(aMetaData.getView().getDocumentText().length()); } aMetaData.addToIndexes(); return aMetaData; }
@Test public void testReindexEntities() throws Exception { createEntitiesDocument(); ae.process(jCas); ae.process(jCas); // Change the last document so we can check its been updated getDocumentAnnotation(jCas).setDocumentClassification("TEST"); ae.process(jCas); elasticsearch.flush(BALEEN_INDEX); assertEquals(new Long(1), getCount()); SearchHit result = elasticsearch.client().search(new SearchRequest()).actionGet().getHits().getHits()[0]; // This checks the last document is tone we are getting assertEquals("TEST", result.getSource().get("classification")); }
public void assertTopLevel() { // Top level jCas assertEquals(in.getDocumentText(), out.getDocumentText()); assertEquals(in.getDocumentLanguage(), out.getDocumentLanguage()); // Doc annotations final DocumentAnnotation outDa = (DocumentAnnotation) out.getDocumentAnnotationFs(); assertNotNull(outDa); final DocumentAnnotation inDa = (DocumentAnnotation) out.getDocumentAnnotationFs(); assertEquals(inDa.getDocumentClassification(), outDa.getDocumentClassification()); }
/**
 * Determines the date to use for a document.
 *
 * @param documentMetadata the metadata passed to {@code findDateFromMetadata}
 * @param da the document annotation whose timestamp is the fallback
 * @return the date found in the metadata, or a date built from the annotation's timestamp when
 *     none is found
 */
private Date findDocumentDate(
    final List<BaleenDocumentMetadata> documentMetadata, final DocumentAnnotation da) {
  final Date fallback = new Date(da.getTimestamp());
  final Optional<Date> fromMetadata = findDateFromMetadata(documentMetadata);
  if (fromMetadata.isPresent()) {
    return fromMetadata.get();
  }
  return fallback;
}
/**
 * Checks that, with the base directory set to {@code topDir}, the document type becomes the
 * part of the source path between that base directory and the file name.
 */
@Test
public void testBaseDirectoryTwoLayers() throws Exception {
  final String sourcePath = tmp.getAbsolutePath();
  final String basePath = topDir.getAbsolutePath();

  DocumentAnnotation da = getDocumentAnnotation();
  da.setSourceUri(sourcePath);

  processJCas(BASE_DIRECTORY, basePath);

  // Strip "<base>/" from the front and "/<fileName>" from the back
  final int start = basePath.length() + 1;
  final int end = sourcePath.length() - tmp.getName().length() - 1;
  final String relative = sourcePath.substring(start, end);

  assertEquals(relative, da.getDocType());
}
}
@Override public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException { DocumentAnnotation doc = UimaSupport.getDocumentAnnotation(jCas); doc.setSourceUri(source); doc.setTimestamp(System.currentTimeMillis()); // Add metadata item to capture which content extractor was used addMetadata(jCas, METADATA_KEY_CONTENT_EXTRACTOR, this.getClass().getName()); }
/**
 * Transfers a MUC entry into the JCas. The language is fixed to "en", the document text comes
 * from the entry's text, and the entry's id becomes the source URI.
 *
 * @param entry the MUC entry supplying the text and id
 * @param jCas the JCas to populate
 */
@Override
protected void apply(MucEntry entry, JCas jCas) {
  jCas.setDocumentLanguage("en");
  jCas.setDocumentText(entry.getText());

  final String entryId = entry.getId();
  UimaSupport.getDocumentAnnotation(jCas).setSourceUri(entryId);
}
/**
 * Writes one record for each event in the document.
 *
 * <p>Each event is turned into a row by {@code extracted}, using the document's source URI and
 * an index of the sentences covering each event. Zero-length rows are dropped.
 *
 * @param jCas the document to export
 */
@Override
protected void write(JCas jCas) {
  final String sourceUri = getDocumentAnnotation(jCas).getSourceUri();

  // Index the covering sentences once up front rather than searching per event
  final Map<Event, Collection<Sentence>> sentencesByEvent =
      JCasUtil.indexCovering(jCas, Event.class, Sentence.class);

  JCasUtil.select(jCas, Event.class)
      .stream()
      .map(event -> extracted(sourceUri, sentencesByEvent, event))
      .filter(row -> row.length > 0)
      .forEach(row -> write(row));
}