/**
 * Copies the document-level fields of a {@link DocumentAnnotation} into the supplied property
 * map, keyed by the matching {@code AnalysisConstants} entries.
 *
 * <p>Caveats and releasability are converted with {@code UimaTypesUtils.toList}; the timestamp
 * is stored as a {@link Date} built from {@code da.getTimestamp()}. All other values are stored
 * exactly as returned by the annotation's getters.
 *
 * @param properties the map to populate; existing entries under the same keys are overwritten
 * @param da the document annotation to read from
 */
private void addDocumentAnnotationToProperties(
    final Map<String, Object> properties, final DocumentAnnotation da) {
  // Document type and protective-marking information.
  properties.put(AnalysisConstants.DOCUMENT_TYPE, da.getDocType());
  properties.put(AnalysisConstants.CAVEATS, UimaTypesUtils.toList(da.getDocumentCaveats()));
  properties.put(AnalysisConstants.CLASSIFICATION, da.getDocumentClassification());
  properties.put(
      AnalysisConstants.RELEASABILITY, UimaTypesUtils.toList(da.getDocumentReleasability()));

  // Language, hash and source of the document.
  properties.put(AnalysisConstants.LANGUAGE, da.getLanguage());
  properties.put(AnalysisConstants.HASH, da.getHash());
  properties.put(AnalysisConstants.SOURCE, da.getSourceUri());

  // The annotation's timestamp value is stored wrapped in a Date.
  final long timestampValue = da.getTimestamp();
  final Date timestamp = new Date(timestampValue);
  properties.put(AnalysisConstants.TIMESTAMP, timestamp);
}
/**
 * Converts a {@link DocumentAnnotation} into a map keyed by the {@code JsonJCas.DA_*} constants.
 *
 * <p>Document type, language, source URI and classification are stored as returned by the
 * annotation. Caveats and releasability are stored as {@code String[]}; when the annotation
 * returns {@code null} for either, an empty array is stored instead.
 *
 * @param da the document annotation to serialise
 * @return a new map holding the serialised values
 */
private Map<String, Object> serialiseDocumentAnnotation(final DocumentAnnotation da) {
  final Map<String, Object> serialised = new HashMap<>();

  serialised.put(JsonJCas.DA_DOCUMENT_TYPE, da.getDocType());
  serialised.put(JsonJCas.DA_LANGUAGE, da.getLanguage());
  serialised.put(JsonJCas.DA_SOURCE_URI, da.getSourceUri());
  serialised.put(JsonJCas.DA_CLASSIFICATION, da.getDocumentClassification());

  // A missing caveats array is represented as an empty String[] rather than null.
  final String[] caveatValues;
  if (da.getDocumentCaveats() == null) {
    caveatValues = new String[0];
  } else {
    caveatValues = da.getDocumentCaveats().toArray();
  }
  serialised.put(JsonJCas.DA_CAVEATS, caveatValues);

  // Same treatment for releasability.
  final String[] releasabilityValues;
  if (da.getDocumentReleasability() == null) {
    releasabilityValues = new String[0];
  } else {
    releasabilityValues = da.getDocumentReleasability().toArray();
  }
  serialised.put(JsonJCas.DA_RELEASABILITY, releasabilityValues);

  return serialised;
}
// Pass the document's language, and its timestamp wrapped in a java.util.Date, to setIfValue
// for the FIELD_DOCUMENT_LANGUAGE and FIELD_DOCUMENT_TIMESTAMP keys of `variables`.
// NOTE(review): setIfValue presumably skips absent values - confirm against its definition.
setIfValue(variables, FIELD_DOCUMENT_LANGUAGE, da.getLanguage()); setIfValue(variables, FIELD_DOCUMENT_TIMESTAMP, new Date(da.getTimestamp()));
// Pass the document's language, and its timestamp wrapped in a java.util.Date, to setIfValue
// for the FIELD_DOCUMENT_LANGUAGE and FIELD_DOCUMENT_TIMESTAMP keys of `variables`.
// NOTE(review): setIfValue presumably skips absent values - confirm against its definition.
setIfValue(variables, FIELD_DOCUMENT_LANGUAGE, da.getLanguage()); setIfValue(variables, FIELD_DOCUMENT_TIMESTAMP, new Date(da.getTimestamp()));
Jsoup.parse("<!DOCTYPE html>\n<html lang=\"" + da.getLanguage() + "\"></html>"); doc.outputSettings(new Document.OutputSettings().prettyPrint(false)); final Element head = doc.head();
/**
 * Asserts that {@code variables} matches the document annotation of the given JCas.
 *
 * <p>Checks, in order: document type, source, language, timestamp (as a {@link Date}),
 * classification and caveats against the annotation; that no releasability key is present;
 * that the first published id has id {@code "12"} and type {@code "test"}; that the
 * {@code "test"} metadata entry holds exactly two values including {@code "1"} and
 * {@code "2"}; that no content is stored; and that {@code "externalId"} equals the id produced
 * by {@code ConsumerUtils.getExternalId(da, false)}.
 *
 * @param jCas the JCas whose document annotation provides the expected values
 * @param variables the stored document fields under test
 */
@SuppressWarnings("unchecked")
private void assertMetadata(JCas jCas, Map<String, Object> variables) {
  DocumentAnnotation da = UimaSupport.getDocumentAnnotation(jCas);
  String expectedExternalId = ConsumerUtils.getExternalId(da, false);

  // Fields copied from the document annotation.
  assertEquals(da.getDocType(), variables.get(FIELD_DOCUMENT_TYPE));
  assertEquals(da.getSourceUri(), variables.get(FIELD_DOCUMENT_SOURCE));
  assertEquals(da.getLanguage(), variables.get(FIELD_DOCUMENT_LANGUAGE));
  assertEquals(new Date(da.getTimestamp()), variables.get(FIELD_DOCUMENT_TIMESTAMP));
  assertEquals(da.getDocumentClassification(), variables.get(FIELD_DOCUMENT_CLASSIFICATION));
  assertEquals(
      UimaTypesUtils.toList(da.getDocumentCaveats()), variables.get(FIELD_DOCUMENT_CAVEATS));
  assertFalse(variables.containsKey(FIELD_DOCUMENT_RELEASABILITY));

  // Only the first published id is inspected.
  List<Map<String, String>> publishedIds =
      (List<Map<String, String>>) variables.get(FIELD_PUBLISHEDIDS);
  Map<String, String> firstPublishedId = publishedIds.get(0);
  assertEquals("12", firstPublishedId.get(FIELD_PUBLISHEDIDS_ID));
  assertEquals("test", firstPublishedId.get(FIELD_PUBLISHEDIDS_TYPE));

  // The "test" metadata key must hold "1" and "2" and nothing else.
  Map<String, Collection<Object>> metadata =
      (Map<String, Collection<Object>>) variables.get(FIELD_METADATA);
  Collection<Object> testValues = metadata.get("test");
  assertTrue(testValues.contains("1"));
  assertTrue(testValues.contains("2"));
  assertEquals(2, testValues.size());

  assertNull(variables.get(FIELD_CONTENT));
  assertEquals(expectedExternalId, variables.get("externalId"));
}
private static DocumentMetaData initDocumentMetaData(DocumentMetaData aMetaData) { // If there is already a DocumentAnnotation copy it's information and delete it DocumentAnnotation da = getDocumentAnnotation(aMetaData.getView()); if (da != null) { aMetaData.setLanguage(da.getLanguage()); aMetaData.setBegin(da.getBegin()); aMetaData.setEnd(da.getEnd()); da.removeFromIndexes(); } else if (aMetaData.getView().getDocumentText() != null) { aMetaData.setBegin(0); aMetaData.setEnd(aMetaData.getView().getDocumentText().length()); } aMetaData.addToIndexes(); return aMetaData; }
private static DocumentMetaData initDocumentMetaData(DocumentMetaData aMetaData) { // If there is already a DocumentAnnotation copy it's information and delete it DocumentAnnotation da = getDocumentAnnotation(aMetaData.getView()); if (da != null) { aMetaData.setLanguage(da.getLanguage()); aMetaData.setBegin(da.getBegin()); aMetaData.setEnd(da.getEnd()); da.removeFromIndexes(); } else if (aMetaData.getView().getDocumentText() != null) { aMetaData.setBegin(0); aMetaData.setEnd(aMetaData.getView().getDocumentText().length()); } aMetaData.addToIndexes(); return aMetaData; }
/**
 * Builds the record for one document and inserts it into {@code documentsCollection}.
 *
 * <p>The record holds the external id (under the key from {@code fields.getExternalId()}), a
 * nested document of values read from the JCas's {@link DocumentAnnotation}, whatever
 * {@code addPublishedIds} and {@code addMetadata} contribute, and - only when
 * {@code outputContent} is set - the document text.
 *
 * @param documentId the external id to store for this document
 * @param jCas the JCas to read the document annotation and text from
 */
private void saveDocument(String documentId, JCas jCas) {
  Document record = new Document();
  DocumentAnnotation da = getDocumentAnnotation(jCas);

  record.append(fields.getExternalId(), documentId);

  // Nested document carrying the annotation's own fields.
  Document details = new Document();
  details.append(FIELD_DOCUMENT_TYPE, da.getDocType());
  details.append(FIELD_DOCUMENT_SOURCE, da.getSourceUri());
  details.append(FIELD_DOCUMENT_LANGUAGE, da.getLanguage());
  details.append(FIELD_DOCUMENT_TIMESTAMP, new Date(da.getTimestamp()));
  details.append(FIELD_DOCUMENT_CLASSIFICATION, da.getDocumentClassification());
  details.append(FIELD_DOCUMENT_CAVEATS, toList(da.getDocumentCaveats()));
  details.append(FIELD_DOCUMENT_RELEASABILITY, toList(da.getDocumentReleasability()));
  record.append(FIELD_DOCUMENT, details);

  addPublishedIds(jCas, record);
  addMetadata(jCas, record);

  // Document text is included only when content output is enabled.
  if (outputContent) {
    record.append(FIELD_CONTENT, jCas.getDocumentText());
  }

  documentsCollection.insertOne(record);
}
// Add the annotation's language, timestamp and classification to vDoc under the keys
// "language", "timestamp" and "classification". The timestamp is passed exactly as returned by
// da.getTimestamp(); it is not wrapped in a Date here.
addProperty(vDoc, "language", da.getLanguage()); addProperty(vDoc, "timestamp", da.getTimestamp()); addProperty(vDoc, "classification", da.getDocumentClassification());
// Fill in the hash, releasability, language, timestamp and source properties. Hash and language
// are read from da; releasability, timestamp and source come from the RELEASABILITY,
// DOC_TIMESTAMP and SOURCE constants (timestamp wrapped in a Date, releasability as a List).
// NOTE(review): this looks like the expected-value side of a test - confirm against the caller.
properties.put(AnalysisConstants.HASH, da.getHash()); properties.put(AnalysisConstants.RELEASABILITY, Arrays.asList(RELEASABILITY)); properties.put(AnalysisConstants.LANGUAGE, da.getLanguage()); properties.put(AnalysisConstants.TIMESTAMP, new Date(DOC_TIMESTAMP)); properties.put(AnalysisConstants.SOURCE, SOURCE);