/**
 * Look up the history that belongs to the given document.
 *
 * <p>The history is keyed by {@code pipelineName + ":" + hash}, where the hash is read from the
 * document annotation of the supplied JCas. The lookup itself is delegated to {@code
 * history.getHistory}; whether a missing history is created on demand is decided there.
 *
 * @param jCas the target document
 * @return the history associated with the document
 */
public DocumentHistory getDocumentHistory(JCas jCas) {
  final String hash = getDocumentAnnotation(jCas).getHash();
  final String historyKey = pipelineName + ":" + hash;
  return history.getHistory(historyKey);
}
/**
 * Look up the history that belongs to the given document.
 *
 * <p>The history is keyed by {@code pipelineName + ":" + hash}, where the hash is read from the
 * document annotation of the supplied JCas. The lookup itself is delegated to {@code
 * history.getHistory}; whether a missing history is created on demand is decided there.
 *
 * @param jCas the target document
 * @return the history associated with the document
 */
public DocumentHistory getDocumentHistory(JCas jCas) {
  final String hash = getDocumentAnnotation(jCas).getHash();
  final String historyKey = pipelineName + ":" + hash;
  return history.getHistory(historyKey);
}
/**
 * Copy the document-level metadata held on the document annotation into the property map.
 *
 * <p>Entries are written in a fixed order (type, caveats, classification, releasability,
 * language, hash, source, timestamp) so that an order-preserving map sees the same sequence on
 * every call. Existing entries under the same keys are overwritten.
 *
 * @param properties the map to populate
 * @param da the document annotation to read from
 */
private void addDocumentAnnotationToProperties(
    final Map<String, Object> properties, final DocumentAnnotation da) {
  // Document classification / handling metadata.
  properties.put(AnalysisConstants.DOCUMENT_TYPE, da.getDocType());
  properties.put(
      AnalysisConstants.CAVEATS,
      UimaTypesUtils.toList(da.getDocumentCaveats()));
  properties.put(AnalysisConstants.CLASSIFICATION, da.getDocumentClassification());
  properties.put(
      AnalysisConstants.RELEASABILITY,
      UimaTypesUtils.toList(da.getDocumentReleasability()));

  // Content identity and origin.
  properties.put(AnalysisConstants.LANGUAGE, da.getLanguage());
  properties.put(AnalysisConstants.HASH, da.getHash());
  properties.put(AnalysisConstants.SOURCE, da.getSourceUri());

  // The annotation's raw timestamp is wrapped in a Date before being stored.
  final Date timestamp = new Date(da.getTimestamp());
  properties.put(AnalysisConstants.TIMESTAMP, timestamp);
}
/**
 * Serialise the document and place it on the queue, keyed by the document hash.
 *
 * <p>The document is serialised first; the method then waits until the queue is below capacity
 * before writing the entry.
 *
 * @param jCas the document to serialise and enqueue
 * @throws AnalysisEngineProcessException wrapping any {@link IOException} raised while
 *     serialising, waiting or writing
 */
@Override
protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException {
  try {
    final DocumentAnnotation da = getDocumentAnnotation(jCas);
    final String payload = converter.serialise(jCas);

    // Apply back-pressure before adding another entry.
    waitForQueueToBeBelowCapacity();
    writeToQueue(da.getHash(), payload);
  } catch (final IOException ioe) {
    throw new AnalysisEngineProcessException(ioe);
  }
}
/**
 * Clear the id mappings when the supplied JCas holds a different document from the last one
 * seen.
 *
 * <p>Documents are identified by the hash on their document annotation. If no document id has
 * been recorded yet, the incoming document always counts as new.
 *
 * @param jCas the JCas about to be processed
 * @return true if the mappings were reset, false if the document is unchanged
 */
public boolean resetIfNewJCas(final JCas jCas) {
  final String incomingId = UimaSupport.getDocumentAnnotation(jCas).getHash();

  final boolean sameDocument =
      currentDocumentId != null && currentDocumentId.equals(incomingId);
  if (sameDocument) {
    return false;
  }

  getMonitor().debug("Reset id cache for document {}", incomingId);
  clearMappings();
  currentDocumentId = incomingId;
  return true;
}
/**
 * Derive an external id for a document.
 *
 * <p>When {@code contentHashAsId} is set, the document hash is returned as-is. Otherwise the
 * source URI is hashed via {@code IdentityUtils.hashStrings}; if that hashing fails with a
 * {@code BaleenException}, the result of {@code fallbackToUUID} is returned instead.
 *
 * @param da document annotation
 * @param contentHashAsId true to use the content hash, false to use a hash of the source URI
 * @return the content hash, the hashed source URI, or the fallback id if hashing fails
 */
public static String getExternalId(DocumentAnnotation da, boolean contentHashAsId) {
  if (contentHashAsId) {
    return da.getHash();
  }

  try {
    return IdentityUtils.hashStrings(da.getSourceUri());
  } catch (BaleenException e) {
    return fallbackToUUID(e);
  }
}
/**
 * Clear the id mappings when the supplied JCas holds a different document from the last one
 * seen.
 *
 * <p>Documents are identified by the hash on their document annotation. If no document id has
 * been recorded yet, the incoming document always counts as new.
 *
 * @param jCas the JCas about to be processed
 * @return true if the mappings were reset, false if the document is unchanged
 */
public boolean resetIfNewJCas(final JCas jCas) {
  final String incomingId = UimaSupport.getDocumentAnnotation(jCas).getHash();

  final boolean sameDocument =
      currentDocumentId != null && currentDocumentId.equals(incomingId);
  if (sameDocument) {
    return false;
  }

  getMonitor().debug("Reset id cache for document {}", incomingId);
  clearMappings();
  currentDocumentId = incomingId;
  return true;
}
appendMeta(head, "externalId", da.getHash());
/**
 * Verify that configuring {@code Html5.PARAM_CSS} produces exactly one stylesheet link, pointing
 * at the configured file, in the generated HTML.
 *
 * <p>The output file is expected at {@code <outputFolder>/<document hash>.html}.
 *
 * @throws UIMAException if the engine cannot be created or processing fails
 * @throws IOException if the generated file cannot be read
 */
@Test
public void testCSS() throws UIMAException, IOException {
  AnalysisEngine consumer =
      AnalysisEngineFactory.createEngine(
          Html5.class,
          TypeSystemSingleton.getTypeSystemDescriptionInstance(),
          Html5.PARAM_OUTPUT_FOLDER,
          outputFolder.getPath(),
          Html5.PARAM_CSS,
          "test.css");

  try {
    jCas.setDocumentText("This is a test document.");
    consumer.process(jCas);

    DocumentAnnotation da = (DocumentAnnotation) jCas.getDocumentAnnotationFs();
    File f = new File(outputFolder, da.getHash() + ".html");
    assertTrue(f.exists());

    Document doc = Jsoup.parse(f, "UTF-8");
    Elements links = doc.select("link");
    assertEquals(1, links.size());

    Element link = links.get(0);
    assertEquals("stylesheet", link.attr("rel"));
    assertEquals("test.css", link.attr("href"));
  } finally {
    // Release the engine even when an assertion fails, so it does not leak between tests.
    consumer.destroy();
  }
}
/**
 * Check that the document annotation of a freshly created JCas reports the expected hash for a
 * fixed piece of document text.
 *
 * @throws Exception if the JCas cannot be created
 */
@Test
public void docHash() throws Exception {
  final String expectedHash = "87cebccde680225b7640878d334b4cbb1c048ba1c8e66763f72cca5396a37807";

  JCas cas = JCasFactory.createJCas(TypeSystemSingleton.getTypeSystemDescriptionInstance());
  cas.setDocumentText("There is the mention of some entity in this sentence.");

  DocumentAnnotation annotation = (DocumentAnnotation) cas.getDocumentAnnotationFs();
  assertEquals(expectedHash, annotation.getHash());
}
}
/**
 * Verify that history events recorded for two person entities are persisted alongside the
 * entities when the document is processed.
 *
 * <p>The first entity is expected to carry its own events plus those of the second entity; the
 * second entity is expected to carry only its own events.
 *
 * @throws AnalysisEngineProcessException if processing the document fails
 */
@SuppressWarnings("unchecked")
@Test
public void testHistory() throws AnalysisEngineProcessException {
  jCas.setDocumentText("Bill went to London. William came back.");

  Person bill = Annotations.createPerson(jCas, 0, 4, "Bill");
  Person other = Annotations.createPerson(jCas, 21, 28, NAME_2);

  // Record a sequence of events against both entities before the document is processed.
  String historyKey = "unknown:" + getDocumentAnnotation(jCas).getHash();
  DocumentHistory documentHistory = history.getHistory(historyKey);
  documentHistory.add(HistoryEvents.createAdded(bill, "test"));
  documentHistory.add(HistoryEvents.createAdded(other, "test"));
  documentHistory.add(HistoryEvents.createMerged(bill, "test", other.getInternalId()));
  documentHistory.add(HistoryEvents.createMerged(bill, "fakeId merge", 500));
  documentHistory.add(HistoryEvents.createRemoved(other, "test"));

  ae.process(jCas);

  Collection<HistoryEvent> billEvents = documentHistory.getHistory(bill.getInternalId());
  Collection<HistoryEvent> otherEvents = documentHistory.getHistory(other.getInternalId());

  assertEquals(1, documents.count());
  assertEquals(2, entities.count());

  final String valueField = Mongo.FIELD_ENTITIES + "." + VALUE;

  // First entity: stored history should contain its own events plus the other entity's.
  Document billRecord = entities.find(new Document(valueField, "Bill")).first();
  List<Document> billEntities = (List<Document>) billRecord.get(Mongo.FIELD_ENTITIES);
  List<Document> billStored = (List<Document>) billEntities.get(0).get(fields.getHistory());
  assertEquals(billEvents.size() + otherEvents.size(), billStored.size());

  // Second entity: stored history should contain only its own events.
  Document otherRecord = entities.find(new Document(valueField, NAME_2)).first();
  List<Document> otherEntities = (List<Document>) otherRecord.get(Mongo.FIELD_ENTITIES);
  List<Document> otherStored = (List<Document>) otherEntities.get(0).get(fields.getHistory());
  assertEquals(otherEvents.size(), otherStored.size());
}
.toArray()); assertEquals(getDocumentAnnotation(jCas).getHash(), result.get(fields.getExternalId()));
properties.put(AnalysisConstants.CLASSIFICATION, da.getDocumentClassification()); properties.put(AnalysisConstants.DOCUMENT_TYPE, da.getDocType()); properties.put(AnalysisConstants.HASH, da.getHash()); properties.put(AnalysisConstants.RELEASABILITY, Arrays.asList(RELEASABILITY)); properties.put(AnalysisConstants.LANGUAGE, da.getLanguage());