@Override public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException { DocumentAnnotation doc = UimaSupport.getDocumentAnnotation(jCas); doc.setSourceUri(source); doc.setTimestamp(System.currentTimeMillis()); // Add metadata item to capture which content extractor was used addMetadata(jCas, METADATA_KEY_CONTENT_EXTRACTOR, this.getClass().getName()); }
@Override public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException { DocumentAnnotation doc = UimaSupport.getDocumentAnnotation(jCas); doc.setSourceUri(source); doc.setTimestamp(System.currentTimeMillis()); // Add metadata item to capture which content extractor was used addMetadata(jCas, METADATA_KEY_CONTENT_EXTRACTOR, this.getClass().getName()); }
/**
 * Copies document-level properties from a deserialised map onto the document annotation.
 *
 * <p>String properties that are absent from the map default to the empty string. The timestamp
 * defaults to 0 when the key is absent or explicitly mapped to {@code null}. Caveats and
 * releasability are converted with {@code UimaTypesUtils.toArray}; when absent, {@code null} is
 * passed to that helper.
 *
 * @param jCas the JCas used when building the caveat and releasability arrays
 * @param da the document annotation to populate
 * @param map property values keyed by the {@code JsonJCas.DA_*} constants
 * @throws ClassCastException if a present value does not have the expected type
 */
private void processDocumentAnnotation(
    final JCas jCas, final DocumentAnnotation da, final Map<String, Object> map) {
  da.setDocType((String) map.getOrDefault(JsonJCas.DA_DOCUMENT_TYPE, ""));
  da.setDocumentClassification((String) map.getOrDefault(JsonJCas.DA_CLASSIFICATION, ""));
  da.setLanguage((String) map.getOrDefault(JsonJCas.DA_LANGUAGE, ""));
  da.setSourceUri((String) map.getOrDefault(JsonJCas.DA_SOURCE_URI, ""));

  // getOrDefault only falls back when the key is absent. A key that is present but mapped to
  // null would yield null and fail on longValue(), so check for null explicitly.
  final Object timestamp = map.get(JsonJCas.DA_TIMESTAMP);
  da.setTimestamp(timestamp == null ? 0L : ((Number) timestamp).longValue());

  // A missing key yields null here, exactly as getOrDefault(key, null) would.
  da.setDocumentCaveats(
      UimaTypesUtils.toArray(jCas, (Collection<String>) map.get(JsonJCas.DA_CAVEATS)));
  da.setDocumentReleasability(
      UimaTypesUtils.toArray(jCas, (Collection<String>) map.get(JsonJCas.DA_RELEASABILITY)));
}
doc.setTimestamp(System.currentTimeMillis());
doc.setTimestamp(System.currentTimeMillis());
da.setTimestamp(calculateBestDate(message, file)); da.setDocType("email"); da.setDocumentClassification("O");
/**
 * Expects that, after processing, the document timestamp is midnight (default time zone) on
 * 23 January 2017, matching the date segments in the source URI rather than the value set
 * beforehand.
 */
@Test
public void test() throws Exception {
  getDocumentAnnotation().setSourceUri("/this/is/a/2017/01/23/valid/path/index.html");
  getDocumentAnnotation().setTimestamp(1);
  jCas.setDocumentText("Hello world.");

  processJCas();

  // GregorianCalendar months are zero-based, so 0 is January
  final long expected = new GregorianCalendar(2017, 0, 23).getTimeInMillis();
  final long actual = getDocumentAnnotation().getTimestamp();
  assertEquals(expected, actual);
}
}
da.setTimestamp(calculateBestDate(message, file)); da.setDocType("email"); da.setDocumentClassification("O");
/** Process a single body part */ private boolean processBody(JCas jCas, Body body, String sourceUri) throws IOException { if (body instanceof TextBody) { // Process plain text body processTextBody(jCas, (TextBody) body); // Add fields from parent for (Field f : body.getParent().getHeader().getFields()) { addMetadata(jCas, f.getName(), f.getBody()); } // Set up document annotation - this is done by the content extractor in other cases DocumentAnnotation doc = UimaSupport.getDocumentAnnotation(jCas); doc.setSourceUri(sourceUri); doc.setTimestamp(System.currentTimeMillis()); } else if (body instanceof BinaryBody) { processBinaryBody(jCas, (BinaryBody) body, sourceUri); } else if (body instanceof Multipart) { // Multipart message, so recurse Multipart mp = (Multipart) body; return processMultipart(jCas, mp, sourceUri); } else { // No body processed return false; } return true; }
da.setTimestamp(System.currentTimeMillis()); da.setDocType("re3d"); da.setDocumentClassification("O"); .minusDays(random.nextInt(30)) .atTime(random.nextInt(24), random.nextInt(60), random.nextInt(60)); da.setTimestamp(date.atOffset(ZoneOffset.UTC).toInstant().toEpochMilli());
da.setTimestamp(System.currentTimeMillis()); da.setDocType("re3d"); da.setDocumentClassification("O"); .minusDays(random.nextInt(30)) .atTime(random.nextInt(24), random.nextInt(60), random.nextInt(60)); da.setTimestamp(date.atOffset(ZoneOffset.UTC).toInstant().toEpochMilli());
/** Process a single body part */ private boolean processBody(JCas jCas, Body body, String sourceUri) throws IOException { if (body instanceof TextBody) { // Process plain text body processTextBody(jCas, (TextBody) body); // Add fields from parent for (Field f : body.getParent().getHeader().getFields()) { addMetadata(jCas, f.getName(), f.getBody()); } // Set up document annotation - this is done by the content extractor in other cases DocumentAnnotation doc = UimaSupport.getDocumentAnnotation(jCas); doc.setSourceUri(sourceUri); doc.setTimestamp(System.currentTimeMillis()); } else if (body instanceof BinaryBody) { processBinaryBody(jCas, (BinaryBody) body, sourceUri); } else if (body instanceof Multipart) { // Multipart message, so recurse Multipart mp = (Multipart) body; return processMultipart(jCas, mp, sourceUri); } else { // No body processed return false; } return true; }
/**
 * Resets the JCas and fills it with a short English test document, setting the document
 * annotation's properties but adding no entity annotations.
 *
 * @return the timestamp (from {@code System.currentTimeMillis()}) stored on the document
 *     annotation
 */
protected long createNoEntitiesDocument() {
  jCas.reset();
  jCas.setDocumentText("Hello World");
  jCas.setDocumentLanguage("en");

  final long now = System.currentTimeMillis();

  final DocumentAnnotation annotation = getDocumentAnnotation(jCas);
  annotation.setDocType("test");
  annotation.setSourceUri("test/no_entities");
  annotation.setTimestamp(now);
  annotation.setDocumentClassification("OFFICIAL");
  annotation.setDocumentCaveats(UimaTypesUtils.toArray(jCas, Arrays.asList("TEST_A", "TEST_B")));
  annotation.setDocumentReleasability(
      UimaTypesUtils.toArray(jCas, Arrays.asList("ENG", "SCO", "WAL")));

  return now;
}
da.setTimestamp(timestamp); da.setSourceUri("test/no_entities"); da.setDocType("test");
da.setSourceUri(SOURCE); da.setLanguage(LANGUAGE); da.setTimestamp(DOC_TIMESTAMP);
da.setSourceUri("http://test.com"); da.setLanguage("en"); da.setTimestamp(new Date().getTime()); da.setDocumentCaveats(new StringArray(jCas, 2)); da.setDocumentCaveats(0, "GITHUB");