org.apache.uima.jcas.tcas.DocumentAnnotation java code examples

/**
 * Copies the fields of a document annotation into a property map, keyed by the
 * {@code AnalysisConstants} names.
 *
 * <p>Caveats and releasability are converted with {@code UimaTypesUtils.toList}, and the
 * timestamp is wrapped in a {@link Date}.
 *
 * @param properties the map to add the entries to
 * @param da the document annotation to read from
 */
private void addDocumentAnnotationToProperties(
  final Map<String, Object> properties, final DocumentAnnotation da) {
 final String docType = da.getDocType();
 properties.put(AnalysisConstants.DOCUMENT_TYPE, docType);

 // Protective markings
 properties.put(AnalysisConstants.CAVEATS, UimaTypesUtils.toList(da.getDocumentCaveats()));
 properties.put(AnalysisConstants.CLASSIFICATION, da.getDocumentClassification());
 properties.put(
   AnalysisConstants.RELEASABILITY, UimaTypesUtils.toList(da.getDocumentReleasability()));

 properties.put(AnalysisConstants.LANGUAGE, da.getLanguage());

 // Identity and provenance
 final String hash = da.getHash();
 properties.put(AnalysisConstants.HASH, hash);
 final String sourceUri = da.getSourceUri();
 properties.put(AnalysisConstants.SOURCE, sourceUri);

 final long timestamp = da.getTimestamp();
 properties.put(AnalysisConstants.TIMESTAMP, new Date(timestamp));
}

/**
 * Populates a document annotation from a map of values keyed by the {@code JsonJCas.DA_*}
 * names.
 *
 * <p>Absent string values default to the empty string and an absent timestamp defaults to 0.
 * Absent caveats / releasability are passed to {@code UimaTypesUtils.toArray} as {@code null}.
 *
 * @param jCas the JCas used when building the string arrays
 * @param da the document annotation to update
 * @param map the values to apply
 */
private void processDocumentAnnotation(
  final JCas jCas, final DocumentAnnotation da, final Map<String, Object> map) {
 final String docType = (String) map.getOrDefault(JsonJCas.DA_DOCUMENT_TYPE, "");
 da.setDocType(docType);

 final String classification = (String) map.getOrDefault(JsonJCas.DA_CLASSIFICATION, "");
 da.setDocumentClassification(classification);

 final String language = (String) map.getOrDefault(JsonJCas.DA_LANGUAGE, "");
 da.setLanguage(language);

 final String sourceUri = (String) map.getOrDefault(JsonJCas.DA_SOURCE_URI, "");
 da.setSourceUri(sourceUri);

 // Any numeric representation is accepted and narrowed to a long
 final Number timestamp = (Number) map.getOrDefault(JsonJCas.DA_TIMESTAMP, 0);
 da.setTimestamp(timestamp.longValue());

 final Collection<String> caveats =
   (Collection<String>) map.getOrDefault(JsonJCas.DA_CAVEATS, null);
 da.setDocumentCaveats(UimaTypesUtils.toArray(jCas, caveats));

 final Collection<String> releasability =
   (Collection<String>) map.getOrDefault(JsonJCas.DA_RELEASABILITY, null);
 da.setDocumentReleasability(UimaTypesUtils.toArray(jCas, releasability));
}

/**
 * Records the source and the current time on the document annotation, then adds a metadata
 * entry naming this class as the content extractor.
 *
 * @param stream the content stream (not read here)
 * @param source the source to store as the document's source URI
 * @param jCas the JCas being populated
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
 final DocumentAnnotation documentAnnotation = UimaSupport.getDocumentAnnotation(jCas);
 documentAnnotation.setSourceUri(source);

 final long now = System.currentTimeMillis();
 documentAnnotation.setTimestamp(now);

 // Capture which content extractor was used as a metadata item
 final String extractorName = getClass().getName();
 addMetadata(jCas, METADATA_KEY_CONTENT_EXTRACTOR, extractorName);
}

// Populate the document annotation from the fixture constants.
da.setDocType(DOCTYPE);
da.setDocumentCaveats(UimaTypesUtils.toArray(jCas, Arrays.asList(CAVEAT)));
da.setDocumentReleasability(UimaTypesUtils.toArray(jCas, Arrays.asList(RELEASABILITY)));
da.setDocumentClassification(CLASSIFICATION);
da.setSourceUri(SOURCE);
da.setLanguage(LANGUAGE);
da.setTimestamp(DOC_TIMESTAMP);
// Fill the document's property map with the matching values, keyed by AnalysisConstants.
final Map<String, Object> properties = document.getProperties();
properties.put(AnalysisConstants.CAVEATS, Arrays.asList(CAVEAT));
properties.put(AnalysisConstants.CLASSIFICATION, da.getDocumentClassification());
properties.put(AnalysisConstants.DOCUMENT_TYPE, da.getDocType());
properties.put(AnalysisConstants.HASH, da.getHash());
properties.put(AnalysisConstants.RELEASABILITY, Arrays.asList(RELEASABILITY));
properties.put(AnalysisConstants.LANGUAGE, da.getLanguage());
// The timestamp is stored as a Date rather than the raw value.
properties.put(AnalysisConstants.TIMESTAMP, new Date(DOC_TIMESTAMP));
properties.put(AnalysisConstants.SOURCE, SOURCE);

/**
 * Checks that, with the base directory set to {@code parentDir}, the doc type becomes the part
 * of the source path between the base directory and the file name.
 */
@Test
public void testBaseDirectoryOneLayers() throws Exception {
 final DocumentAnnotation da = getDocumentAnnotation();
 da.setSourceUri(tmp.getAbsolutePath());

 processJCas(BASE_DIRECTORY, parentDir.getAbsolutePath());

 // Expected value: drop "<base dir>/" from the front and "/<file name>" from the end
 final String fullPath = tmp.getAbsolutePath();
 final int start = parentDir.getAbsolutePath().length() + 1;
 final int end = fullPath.length() - tmp.getName().length() - 1;
 final String expectedDocType = fullPath.substring(start, end);

 assertEquals(expectedDocType, da.getDocType());
}

/**
 * Loads a MUC entry into the JCas: sets the language to "en", sets the text from the entry and
 * stores the entry id as the document's source URI.
 *
 * @param entry the entry to read the text and id from
 * @param jCas the JCas to populate
 */
@Override
protected void apply(MucEntry entry, JCas jCas) {
 jCas.setDocumentLanguage("en");

 final String text = entry.getText();
 jCas.setDocumentText(text);

 // Fetch the annotation only after the text has been set, as in the original ordering
 final DocumentAnnotation documentAnnotation = UimaSupport.getDocumentAnnotation(jCas);
 documentAnnotation.setSourceUri(entry.getId());
}

 /**
  * Returns the base filename from DocumentAnnotation source URI in the given JCas.
  *
  * <p>The basename is the main part of the filename, without extension or enclosing paths, e.g.
  * for path '/some/directory/SomeFile.txt' this method will return 'SomeFile'.
  *
  * @param jCas the {@link JCas} from which to get the document annotation.
  * @return the filename without its enclosing path or extension, as computed by {@code
  *     FilenameUtils.getBaseName}
  * @throws IllegalArgumentException if there is an error parsing the document source URI
  *     (NOTE(review): depends on the Commons IO version in use - confirm).
  */
 public static String getDocumentSourceBaseName(final JCas jCas) {
  final DocumentAnnotation documentAnnotation = UimaSupport.getDocumentAnnotation(jCas);
  final String sourceUri = documentAnnotation.getSourceUri();
  // getBaseName strips the extension as well as the path; getName would have returned
  // 'SomeFile.txt', contradicting the documented contract.
  return FilenameUtils.getBaseName(sourceUri);
 }
}

/**
 * Looks up the history for a document.
 *
 * <p>The history is keyed by the pipeline name and the document hash, joined with a colon.
 *
 * @param jCas the target document
 * @return the history returned by the underlying history store for that key
 */
public DocumentHistory getDocumentHistory(JCas jCas) {
 final String hash = getDocumentAnnotation(jCas).getHash();
 final String documentId = pipelineName + ":" + hash;
 return history.getHistory(documentId);
}

 /**
  * Checks that after processing, the document timestamp equals midnight on 23 January 2017 (in
  * the default time zone), matching the date embedded in the source path.
  */
 @Test
 public void test() throws Exception {
  final DocumentAnnotation before = getDocumentAnnotation();
  before.setSourceUri("/this/is/a/2017/01/23/valid/path/index.html");
  before.setTimestamp(1);

  jCas.setDocumentText("Hello world.");
  processJCas();

  // Re-read the annotation after processing rather than reusing the earlier reference
  final long actual = getDocumentAnnotation().getTimestamp();
  final long expected =
    new GregorianCalendar(2017, GregorianCalendar.JANUARY, 23).getTimeInMillis();

  assertEquals(expected, actual);
 }
}

/**
 * Produces an external identifier for a document.
 *
 * @param da document annotation
 * @param contentHashAsId true to use the document hash, false to use a hash of the source URI
 * @return the document hash, or the hashed source URI, or the result of the UUID fallback if
 *     hashing the source fails
 */
public static String getExternalId(DocumentAnnotation da, boolean contentHashAsId) {
 if (contentHashAsId) {
  return da.getHash();
 }

 try {
  final String sourceUri = da.getSourceUri();
  return IdentityUtils.hashStrings(sourceUri);
 } catch (BaleenException e) {
  // Hashing failed, so hand the exception to the UUID fallback
  return fallbackToUUID(e);
 }
}

 /**
  * Works out the folder to write to for a document.
  *
  * @param da the document annotation supplying the doc type
  * @return the configured destination, or a sub-folder of it named after the doc type when
  *     splitting by type is enabled and the doc type is non-empty
  */
 private File getDestinationFolder(DocumentAnnotation da) {
  final File root = new File(destination);
  if (!splitByType) {
   return root;
  }

  final String docType = da.getDocType();
  return Strings.isNullOrEmpty(docType) ? root : new File(root, docType);
 }
}

// Give the document annotation a source, a classification and two caveats.
da.setSourceUri("test.txt");
da.setDocumentClassification("UK OFFICIAL");
da.setDocumentCaveats(UimaTypesUtils.toArray(jCas, Arrays.asList("Test", "Caveats")));

/**
 * Initialises a metadata annotation and adds it to the indexes.
 *
 * <p>If the view already has a DocumentAnnotation, its language and offsets are copied and it
 * is removed from the indexes. Otherwise, if the view has document text, the metadata is made
 * to span the whole text.
 *
 * @param aMetaData the metadata annotation to initialise
 * @return the same metadata annotation
 */
private static DocumentMetaData initDocumentMetaData(DocumentMetaData aMetaData)
{
  DocumentAnnotation existing = getDocumentAnnotation(aMetaData.getView());
  if (existing == null) {
    // Nothing to copy from: cover the full text when there is any
    String text = aMetaData.getView().getDocumentText();
    if (text != null) {
      aMetaData.setBegin(0);
      aMetaData.setEnd(text.length());
    }
  }
  else {
    // Take over the existing annotation's information, then delete it
    aMetaData.setLanguage(existing.getLanguage());
    aMetaData.setBegin(existing.getBegin());
    aMetaData.setEnd(existing.getEnd());
    existing.removeFromIndexes();
  }
  aMetaData.addToIndexes();
  return aMetaData;
}

/**
 * Processes the same document three times, changing its classification before the last run,
 * and checks that the index holds a single document carrying the updated classification.
 */
@Test
public void testReindexEntities() throws Exception {
 createEntitiesDocument();
 ae.process(jCas);
 ae.process(jCas);
 // Change the last document so we can check it's been updated
 getDocumentAnnotation(jCas).setDocumentClassification("TEST");
 ae.process(jCas);
 elasticsearch.flush(BALEEN_INDEX);
 // Long.valueOf replaces the deprecated Long(long) constructor
 assertEquals(Long.valueOf(1L), getCount());
 SearchHit result =
   elasticsearch.client().search(new SearchRequest()).actionGet().getHits().getHits()[0];
 // This checks the last document is the one we are getting
 assertEquals("TEST", result.getSource().get("classification"));
}

/**
 * Asserts that the top-level content of the {@code out} JCas matches the {@code in} JCas: the
 * document text, the document language and the document annotation's classification.
 */
public void assertTopLevel() {
 // Top level jCas
 assertEquals(in.getDocumentText(), out.getDocumentText());
 assertEquals(in.getDocumentLanguage(), out.getDocumentLanguage());
 // Doc annotations
 final DocumentAnnotation outDa = (DocumentAnnotation) out.getDocumentAnnotationFs();
 assertNotNull(outDa);
 // Read the expected annotation from the input JCas; reading it from `out` would compare the
 // output with itself and the assertion could never fail.
 final DocumentAnnotation inDa = (DocumentAnnotation) in.getDocumentAnnotationFs();
 assertEquals(inDa.getDocumentClassification(), outDa.getDocumentClassification());
}

/**
 * Determines the date of a document.
 *
 * @param documentMetadata the metadata searched for a date by {@code findDateFromMetadata}
 * @param da the document annotation whose timestamp is the fallback
 * @return the date found in the metadata, or otherwise a {@link Date} built from the
 *     annotation's timestamp
 */
private Date findDocumentDate(
  final List<BaleenDocumentMetadata> documentMetadata, final DocumentAnnotation da) {
 final Optional<Date> date = findDateFromMetadata(documentMetadata);
 // orElseGet only reads the timestamp and allocates the fallback when no date was found
 return date.orElseGet(() -> new Date(da.getTimestamp()));
}

 /**
  * Checks that, with the base directory set to {@code topDir}, the doc type becomes the part
  * of the source path between the base directory and the file name.
  */
 @Test
 public void testBaseDirectoryTwoLayers() throws Exception {
  final DocumentAnnotation da = getDocumentAnnotation();
  da.setSourceUri(tmp.getAbsolutePath());

  processJCas(BASE_DIRECTORY, topDir.getAbsolutePath());

  // Expected value: drop "<base dir>/" from the front and "/<file name>" from the end
  final String fullPath = tmp.getAbsolutePath();
  final int start = topDir.getAbsolutePath().length() + 1;
  final int end = fullPath.length() - tmp.getName().length() - 1;
  final String expectedDocType = fullPath.substring(start, end);

  assertEquals(expectedDocType, da.getDocType());
 }
}

/**
 * Stamps the document annotation with the given source and the current time, then records this
 * class's name as the content extractor in the document metadata.
 *
 * @param stream the content stream (not read here)
 * @param source the source to store as the document's source URI
 * @param jCas the JCas being populated
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
 final DocumentAnnotation annotation = UimaSupport.getDocumentAnnotation(jCas);
 annotation.setSourceUri(source);
 final long processedAt = System.currentTimeMillis();
 annotation.setTimestamp(processedAt);

 // Metadata item noting which content extractor handled the document
 final String extractorClass = getClass().getName();
 addMetadata(jCas, METADATA_KEY_CONTENT_EXTRACTOR, extractorClass);
}

/**
 * Fills the JCas from a MUC entry: language "en", text from the entry, and the entry id as the
 * source URI of the document annotation.
 *
 * @param entry the entry supplying text and id
 * @param jCas the JCas to populate
 */
@Override
protected void apply(MucEntry entry, JCas jCas) {
 jCas.setDocumentLanguage("en");
 jCas.setDocumentText(entry.getText());

 // The annotation is looked up after the text is set, preserving the original ordering
 final DocumentAnnotation da = UimaSupport.getDocumentAnnotation(jCas);
 da.setSourceUri(entry.getId());
}

/**
 * Writes out every event in the document.
 *
 * <p>Each event is turned into a row by {@code extracted}, using the document's source URI and
 * the sentences covering the event; empty rows are skipped and the rest are passed to the
 * row-level {@code write}.
 *
 * @param jCas the document to write
 */
@Override
protected void write(JCas jCas) {
 final String sourceUri = getDocumentAnnotation(jCas).getSourceUri();
 final Map<Event, Collection<Sentence>> sentencesByEvent =
   JCasUtil.indexCovering(jCas, Event.class, Sentence.class);

 final Collection<Event> events = JCasUtil.select(jCas, Event.class);
 events
   .stream()
   .map(event -> extracted(sourceUri, sentencesByEvent, event))
   .filter(row -> row.length > 0)
   .forEach(this::write);
}

Javadoc

Overriding the base DocumentAnnotation to add additional features. The JCasGen code generated from this annotation replaces the default type in uima-document-annotation.jar (which should be removed from the classpath). Updated by JCasGen Wed Apr 13 13:23:15 BST 2016 XML source: H:/git/TextProcessing/core/baleen/baleen-uima/src/main/resources/types/common_type_system.xml

Most used methods

getDocType
getter for docType - gets The document type
getDocumentClassification
getter for documentClassification - gets The security classification of the document
getHash
Get hash of current document text
getLanguage
getter for language - gets
getSourceUri
getter for sourceUri - gets A URI representing the source of the document
setDocumentClassification
setter for documentClassification - sets The security classification of the document
setSourceUri
setter for sourceUri - sets A URI representing the source of the document
setTimestamp
setter for timestamp - sets The time at which the document was processed
getDocumentCaveats
indexed getter for documentCaveats - gets an indexed value - An array of string values specifying ha
getTimestamp
getter for timestamp - gets The time at which the document was processed
setDocType
setter for docType - sets The document type
setLanguage
setter for language - sets

Popular in Java

Creating JSON documents from java classes using gson
scheduleAtFixedRate (Timer)
putExtra (Intent)
runOnUiThread (Activity)
ObjectMapper (com.fasterxml.jackson.databind)
ObjectMapper provides functionality for reading and writing JSON, either to and from basic POJOs (Pl
SimpleDateFormat (java.text)
Formats and parses dates in a locale-sensitive manner. Formatting turns a Date into a String, and pa
Deque (java.util)
A linear collection that supports element insertion and removal at both ends. The name deque is shor
Table (com.google.common.collect)
A collection that associates an ordered pair of keys, called a row key and a column key, with a sing
Point (java.awt)
A point representing a location in (x,y) coordinate space, specified in integer precision.
Table (org.hibernate.mapping)
A relational table
From CI to AI: The AI layer in your organization

How to use DocumentAnnotation in org.apache.uima.jcas.tcas

Best Java code snippets using org.apache.uima.jcas.tcas.DocumentAnnotation (Showing top 20 results out of 315)

How to use
DocumentAnnotation
in
org.apache.uima.jcas.tcas