/**
 * Returns the base filename from the DocumentAnnotation source URI in the given JCas.
 *
 * <p>The basename is the main part of the filename, without extension or enclosing paths, e.g.
 * for path '/some/directory/SomeFile.txt' this method will return 'SomeFile'.
 *
 * @param jCas the {@link JCas} from which to get the document annotation.
 * @return the filename, with neither its enclosing path nor its extension
 * @throws IllegalArgumentException if there is an error parsing the document source URI.
 */
public static String getDocumentSourceBaseName(final JCas jCas) {
  DocumentAnnotation documentAnnotation = UimaSupport.getDocumentAnnotation(jCas);
  String sourceUri = documentAnnotation.getSourceUri();
  // getBaseName (rather than getName) drops the extension as well as the path, which is what
  // the documented contract above requires.
  return FilenameUtils.getBaseName(sourceUri);
}
}
/**
 * Writes one row per {@link Event} found in the document.
 *
 * <p>Each event is turned into a row by {@code extracted}, which is given the document source URI
 * and an index of the sentences covering each event. Rows of length zero are skipped; the rest are
 * handed to {@code write}.
 *
 * @param jCas the document being processed
 */
@Override
protected void write(JCas jCas) {
  final String source = getDocumentAnnotation(jCas).getSourceUri();

  final Map<Event, Collection<Sentence>> sentencesByEvent =
      JCasUtil.indexCovering(jCas, Event.class, Sentence.class);

  final Collection<Event> events = JCasUtil.select(jCas, Event.class);
  events
      .stream()
      .map(event -> extracted(source, sentencesByEvent, event))
      .filter(row -> row.length != 0)
      .forEach(row -> write(row));
}
/**
 * Writes a single line for the document to {@code output}: the document source URI, a tab, and
 * the number of {@link Entity} annotations in the JCas.
 *
 * <p>The file writer is opened with its third argument set to {@code true} and UTF-8 encoding. An
 * {@link IOException} caught here is logged as a warning and not rethrown.
 *
 * @param jCas the document being processed
 */
@Override
public void doProcess(JCas jCas) throws AnalysisEngineProcessException {
  DocumentAnnotation docAnnotation = getDocumentAnnotation(jCas);

  try (PrintWriter writer =
      new PrintWriter(
          new BufferedWriter(
              new FileWriterWithEncoding(output, StandardCharsets.UTF_8, true)))) {
    int entityCount = JCasUtil.select(jCas, Entity.class).size();
    String line = docAnnotation.getSourceUri() + "\t" + entityCount;
    writer.println(line);
  } catch (IOException e) {
    getMonitor().warn("Unable to write to output", e);
  }
}
/**
 * Copies the fields of a document annotation into the given property map, keyed by the
 * corresponding {@code AnalysisConstants} names.
 *
 * <p>Caveats and releasability are converted with {@code UimaTypesUtils.toList}; the timestamp is
 * stored as a {@link Date}.
 *
 * @param properties the map to add the entries to
 * @param da the document annotation to read from
 */
private void addDocumentAnnotationToProperties(
    final Map<String, Object> properties, final DocumentAnnotation da) {
  // Type and protective markings
  properties.put(AnalysisConstants.DOCUMENT_TYPE, da.getDocType());
  final Object caveats = UimaTypesUtils.toList(da.getDocumentCaveats());
  properties.put(AnalysisConstants.CAVEATS, caveats);
  properties.put(AnalysisConstants.CLASSIFICATION, da.getDocumentClassification());
  final Object releasability = UimaTypesUtils.toList(da.getDocumentReleasability());
  properties.put(AnalysisConstants.RELEASABILITY, releasability);

  // Remaining document details
  properties.put(AnalysisConstants.LANGUAGE, da.getLanguage());
  properties.put(AnalysisConstants.HASH, da.getHash());
  properties.put(AnalysisConstants.SOURCE, da.getSourceUri());
  final Date timestamp = new Date(da.getTimestamp());
  properties.put(AnalysisConstants.TIMESTAMP, timestamp);
}
/**
 * Reads the source URI from the document annotation of the given JCas.
 *
 * @param jCas the document whose annotation is read
 * @return the value of {@code getSourceUri()} on the document annotation
 */
private String getSource(JCas jCas) {
  // The document annotation feature structure is cast to the project's DocumentAnnotation type.
  return ((DocumentAnnotation) jCas.getDocumentAnnotationFs()).getSourceUri();
}
}
/**
 * Reads the source URI from the document annotation of the given JCas.
 *
 * @param jCas the document whose annotation is read
 * @return the value of {@code getSourceUri()} on the document annotation
 */
private String getSource(JCas jCas) {
  // The document annotation feature structure is cast to the project's DocumentAnnotation type.
  return ((DocumentAnnotation) jCas.getDocumentAnnotationFs()).getSourceUri();
}
}
/**
 * Reads the source URI from the document annotation of the given JCas.
 *
 * @param jCas the document whose annotation is read
 * @return the value of {@code getSourceUri()} on the document annotation
 */
private String getSource(JCas jCas) {
  // The document annotation feature structure is cast to the project's DocumentAnnotation type.
  return ((DocumentAnnotation) jCas.getDocumentAnnotationFs()).getSourceUri();
}
}
/**
 * Reads the source URI from the document annotation of the given JCas.
 *
 * @param jCas the document whose annotation is read
 * @return the value of {@code getSourceUri()} on the document annotation
 */
private String getSource(JCas jCas) {
  // The document annotation feature structure is cast to the project's DocumentAnnotation type.
  return ((DocumentAnnotation) jCas.getDocumentAnnotationFs()).getSourceUri();
}
}
/**
 * Reads the source URI from the document annotation of the given JCas.
 *
 * @param jCas the document whose annotation is read
 * @return the value of {@code getSourceUri()} on the document annotation
 */
private String getSource(JCas jCas) {
  // The document annotation feature structure is cast to the project's DocumentAnnotation type.
  return ((DocumentAnnotation) jCas.getDocumentAnnotationFs()).getSourceUri();
}
/**
 * Builds a map representation of a document annotation, keyed by the {@code JsonJCas.DA_*}
 * constants.
 *
 * <p>Caveats and releasability are stored as {@code String[]}; when the annotation returns
 * {@code null} for either, an empty array is stored instead.
 *
 * @param da the document annotation to serialise
 * @return a new map holding the document type, language, source URI, classification, caveats and
 *     releasability
 */
private Map<String, Object> serialiseDocumentAnnotation(final DocumentAnnotation da) {
  final Map<String, Object> serialised = new HashMap<>();

  serialised.put(JsonJCas.DA_DOCUMENT_TYPE, da.getDocType());
  serialised.put(JsonJCas.DA_LANGUAGE, da.getLanguage());
  serialised.put(JsonJCas.DA_SOURCE_URI, da.getSourceUri());
  serialised.put(JsonJCas.DA_CLASSIFICATION, da.getDocumentClassification());

  final String[] caveatValues;
  if (da.getDocumentCaveats() == null) {
    caveatValues = new String[0];
  } else {
    caveatValues = da.getDocumentCaveats().toArray();
  }
  serialised.put(JsonJCas.DA_CAVEATS, caveatValues);

  final String[] releasabilityValues;
  if (da.getDocumentReleasability() == null) {
    releasabilityValues = new String[0];
  } else {
    releasabilityValues = da.getDocumentReleasability().toArray();
  }
  serialised.put(JsonJCas.DA_RELEASABILITY, releasabilityValues);

  return serialised;
}
@Override protected void write(JCas jCas) { final String source = getDocumentAnnotation(jCas).getSourceUri(); // For each entity we need to find all the other sentences they are contained in // This should be all entities and sentences final Map<Entity, Collection<Sentence>> coveringSentence = JCasUtil.indexCovering(jCas, Entity.class, Sentence.class); final Map<Sentence, Collection<Entity>> coveredEntities = JCasUtil.indexCovered(jCas, Sentence.class, Entity.class); final Map<Sentence, Collection<WordToken>> coveredTokens = JCasUtil.indexCovered(jCas, Sentence.class, WordToken.class); final Map<WordToken, Collection<Entity>> coveringEntity = JCasUtil.indexCovering(jCas, WordToken.class, Entity.class); JCasUtil.select(jCas, Entity.class) .stream() .map( e -> convertEntityToRow( source, coveringSentence, coveredEntities, coveredTokens, coveringEntity, e)) .filter(s -> s.length > 0) .forEach(this::write); }
/**
 * Get an identifier for a document that can be used externally.
 *
 * @param da document annotation
 * @param contentHashAsId true to return the annotation's hash; false to return a hash of the
 *     annotation's source URI
 * @return the document hash, or the hashed source URI, or the result of {@code fallbackToUUID} if
 *     hashing the source URI throws a {@code BaleenException}
 */
public static String getExternalId(DocumentAnnotation da, boolean contentHashAsId) {
  if (contentHashAsId) {
    return da.getHash();
  }

  try {
    return IdentityUtils.hashStrings(da.getSourceUri());
  } catch (BaleenException e) {
    return fallbackToUUID(e);
  }
}
/**
 * Binds the document's details to {@code insertDocStatement}, executes it, and returns the key
 * read back through {@code getKey}.
 *
 * @param jCas the document to insert
 * @return the key reported for the inserted document
 * @throws SQLException if binding parameters or executing the statement fails
 * @throws BaleenException if no key is returned for the insert
 */
private Integer executeDocInsert(JCas jCas) throws SQLException, BaleenException {
  DocumentAnnotation docAnnotation = getDocumentAnnotation(jCas);
  String externalId = ConsumerUtils.getExternalId(docAnnotation, contentHashAsId);

  insertDocStatement.clearParameters();

  // Parameters 1-7: scalar document fields
  insertDocStatement.setString(1, externalId);
  insertDocStatement.setString(2, docAnnotation.getDocType());
  insertDocStatement.setString(3, docAnnotation.getSourceUri());
  insertDocStatement.setString(4, jCas.getDocumentText());
  insertDocStatement.setString(5, jCas.getDocumentLanguage());
  Timestamp timestamp = new Timestamp(docAnnotation.getTimestamp());
  insertDocStatement.setTimestamp(6, timestamp);
  insertDocStatement.setString(7, docAnnotation.getDocumentClassification());

  // Parameters 8-9: caveats and releasability as arrays
  insertDocStatement.setArray(
      8,
      createVarcharArray(postgresResource.getConnection(), docAnnotation.getDocumentCaveats()));
  insertDocStatement.setArray(
      9,
      createVarcharArray(
          postgresResource.getConnection(), docAnnotation.getDocumentReleasability()));

  insertDocStatement.executeUpdate();

  Integer insertedKey = getKey(insertDocStatement);
  if (insertedKey != null) {
    return insertedKey;
  }
  throw new BaleenException("No document key returned");
}
/**
 * Writes the document to a file under {@code basePath}.
 *
 * <p>The file location is derived from the document source URI, or from
 * {@code ConsumerUtils.getExternalId(..., false)} when the source URI is null or empty. If
 * {@code getExtension()} returns a non-empty value it is appended after a dot. Any exception
 * raised while writing is logged as a warning and the file is then deleted; a failed delete is
 * logged as a further warning.
 *
 * @param jCas the document being processed
 */
@Override
protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
  DocumentAnnotation docAnnotation = getDocumentAnnotation(jCas);

  String sourceUri = docAnnotation.getSourceUri();
  String url =
      Strings.isNullOrEmpty(sourceUri)
          ? ConsumerUtils.getExternalId(docAnnotation, false)
          : sourceUri;

  String suffix = getExtension();
  if (!Strings.isNullOrEmpty(suffix)) {
    url += "." + suffix;
  }

  File target = SourceUtils.urlToFile(basePath, url);

  try {
    getMonitor().debug("Writing {} to {}", url, target.getAbsolutePath());
    writeToFile(jCas, target);
  } catch (Exception e) {
    getMonitor().warn("Failed to write file {}, deleting", target.getAbsolutePath(), e);
    boolean deleted = target.delete();
    if (!deleted) {
      getMonitor().warn("Failed to delete file {}", target.getAbsolutePath(), e);
    }
  }
}
@Override protected void doProcess(JCas jCas) throws AnalysisEngineProcessException { String documentId = getUniqueId(jCas); // Delete any existing content in the database deleteAllContent(documentId); // Save try { saveRelations(documentId, jCas); } catch (MongoException | BsonSerializationException e) { getMonitor() .error( "Unable to persist relations to database - document {} will contain no relations", getDocumentAnnotation(jCas).getSourceUri(), e); } }
@Override protected void doProcess(JCas jCas) throws AnalysisEngineProcessException { String documentId = ConsumerUtils.getExternalId(getDocumentAnnotation(jCas), contentHashAsId); // Delete any existing content in the database deleteAllContent(documentId); // Save try { saveEvents(documentId, jCas, textClass); } catch (MongoException | BsonSerializationException e) { getMonitor() .error( "Unable to persist relations to database - document {} will contain no relations", getDocumentAnnotation(jCas).getSourceUri(), e); } }
/**
 * Deletes or moves the file named by the document source URI and records the outcome as a
 * {@code Metadata} annotation with key {@code movedDocumentLocation}.
 *
 * <p>When {@code destination} is null or empty the file is deleted and the metadata value is
 * {@code deleted}. Otherwise the file is moved to the folder given by
 * {@code getDestinationFolder} and the metadata value is the path of the moved file. An
 * {@link IOException} is logged as an error and not rethrown.
 *
 * @param jCas the document being processed
 */
@Override
protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
  DocumentAnnotation docAnnotation = getDocumentAnnotation(jCas);
  String source = docAnnotation.getSourceUri();

  try {
    File sourceFile = new File(source);

    String newLocation;
    if (Strings.isNullOrEmpty(destination)) {
      deleteFile(sourceFile);
      newLocation = "deleted";
    } else {
      File movedFile = moveFile(sourceFile, getDestinationFolder(docAnnotation));
      newLocation = movedFile.getPath();
    }

    // Record what happened to the source file on the document itself
    Metadata locationMetadata = new Metadata(jCas);
    locationMetadata.setKey("movedDocumentLocation");
    locationMetadata.setValue(newLocation);
    addToJCasIndex(locationMetadata);
  } catch (IOException ioe) {
    getMonitor().error("Unable to move source file", ioe);
  }
}
/**
 * Asserts that the given variables map matches the document annotation of the JCas and the
 * fixture values this test expects.
 *
 * <p>Checks the document type, source, language, timestamp, classification and caveats against
 * the annotation; that no releasability entry is present; that the first published id is
 * {@code 12} of type {@code test}; that the {@code test} metadata holds exactly {@code 1} and
 * {@code 2}; that no content is stored; and that {@code externalId} equals the id derived from
 * the annotation's source.
 *
 * @param jCas the document the variables were produced from
 * @param variables the stored values to verify
 */
@SuppressWarnings("unchecked")
private void assertMetadata(JCas jCas, Map<String, Object> variables) {
  DocumentAnnotation docAnnotation = UimaSupport.getDocumentAnnotation(jCas);
  String expectedExternalId = ConsumerUtils.getExternalId(docAnnotation, false);

  // Fields copied straight from the document annotation
  assertEquals(docAnnotation.getDocType(), variables.get(FIELD_DOCUMENT_TYPE));
  assertEquals(docAnnotation.getSourceUri(), variables.get(FIELD_DOCUMENT_SOURCE));
  assertEquals(docAnnotation.getLanguage(), variables.get(FIELD_DOCUMENT_LANGUAGE));
  assertEquals(
      new Date(docAnnotation.getTimestamp()), variables.get(FIELD_DOCUMENT_TIMESTAMP));
  assertEquals(
      docAnnotation.getDocumentClassification(), variables.get(FIELD_DOCUMENT_CLASSIFICATION));
  assertEquals(
      UimaTypesUtils.toList(docAnnotation.getDocumentCaveats()),
      variables.get(FIELD_DOCUMENT_CAVEATS));
  assertFalse(variables.containsKey(FIELD_DOCUMENT_RELEASABILITY));

  // Published ids
  List<Map<String, String>> publishedIds =
      (List<Map<String, String>>) variables.get(FIELD_PUBLISHEDIDS);
  Map<String, String> firstPublishedId = publishedIds.get(0);
  assertEquals("12", firstPublishedId.get(FIELD_PUBLISHEDIDS_ID));
  assertEquals("test", firstPublishedId.get(FIELD_PUBLISHEDIDS_TYPE));

  // Metadata
  Map<String, Collection<Object>> metadata =
      (Map<String, Collection<Object>>) variables.get(FIELD_METADATA);
  Collection<Object> testValues = metadata.get("test");
  assertTrue(testValues.contains("1"));
  assertTrue(testValues.contains("2"));
  assertEquals(2, testValues.size());

  assertNull(variables.get(FIELD_CONTENT));

  assertEquals(expectedExternalId, variables.get("externalId"));
}
/**
 * Reads one document through the collection reader and checks that the JCas then holds a
 * {@link DocumentAnnotation} whose source URI equals {@code SOURCE_URL}.
 */
@Test
public void testDocumentMetadata() throws Exception {
  String folder = tmpDir.toAbsolutePath().toString();
  BaleenCollectionReader reader =
      getCollectionReader(
          Re3dReader.PARAM_FOLDER,
          folder,
          Re3dReader.PARAM_ENTITIES,
          true,
          Re3dReader.PARAM_RANDOM_DATES,
          true);

  assertTrue(reader.doHasNext());
  reader.getNext(jCas.getCas());

  AnnotationIndex<DocumentAnnotation> documentAnnotations =
      jCas.getAnnotationIndex(DocumentAnnotation.class);
  boolean sourceFound =
      contains(documentAnnotations, d -> d.getSourceUri().equals(SOURCE_URL));
  assertTrue(sourceFound);

  reader.close();
}
/**
 * Builds a {@code Document} for the given JCas and inserts it into {@code documentsCollection}.
 *
 * <p>The stored document holds the external id, a nested document of fields read from the
 * document annotation, whatever {@code addPublishedIds} and {@code addMetadata} add, and, when
 * {@code outputContent} is set, the document text.
 *
 * @param documentId the external id to store against the document
 * @param jCas the document to save
 */
private void saveDocument(String documentId, JCas jCas) {
  DocumentAnnotation docAnnotation = getDocumentAnnotation(jCas);

  // Nested document holding the fields read from the document annotation
  Document annotationFields = new Document();
  annotationFields.append(FIELD_DOCUMENT_TYPE, docAnnotation.getDocType());
  annotationFields.append(FIELD_DOCUMENT_SOURCE, docAnnotation.getSourceUri());
  annotationFields.append(FIELD_DOCUMENT_LANGUAGE, docAnnotation.getLanguage());
  annotationFields.append(FIELD_DOCUMENT_TIMESTAMP, new Date(docAnnotation.getTimestamp()));
  annotationFields.append(
      FIELD_DOCUMENT_CLASSIFICATION, docAnnotation.getDocumentClassification());
  annotationFields.append(FIELD_DOCUMENT_CAVEATS, toList(docAnnotation.getDocumentCaveats()));
  annotationFields.append(
      FIELD_DOCUMENT_RELEASABILITY, toList(docAnnotation.getDocumentReleasability()));

  Document record = new Document();
  record.append(fields.getExternalId(), documentId);
  record.append(FIELD_DOCUMENT, annotationFields);

  addPublishedIds(jCas, record);
  addMetadata(jCas, record);

  if (outputContent) {
    record.append(FIELD_CONTENT, jCas.getDocumentText());
  }

  documentsCollection.insertOne(record);
}