/**
 * Verifies parsing of a brat document that carries entity annotations:
 * checks the document id, the start/end of the decoded text, the total
 * annotation count, and the notes attached to two entities.
 *
 * @throws IOException if the test resources cannot be read
 */
@Test
public void testDocumentWithEntitiesParsing() throws IOException {
  Map<String, String> typeToClassMap = new HashMap<>();
  BratAnnotationStreamTest.addEntityTypes(typeToClassMap);
  AnnotationConfiguration config = new AnnotationConfiguration(typeToClassMap);

  // try-with-resources: the original never closed either stream,
  // leaking them on every run (and especially on assertion failure).
  try (InputStream txtIn = BratDocumentTest.class.getResourceAsStream(
          "/opennlp/tools/formats/brat/voa-with-entities.txt");
       InputStream annIn = BratDocumentTest.class.getResourceAsStream(
          "/opennlp/tools/formats/brat/voa-with-entities.ann")) {

    BratDocument doc = BratDocument.parseDocument(config, "voa-with-entities", txtIn, annIn);

    Assert.assertEquals("voa-with-entities", doc.getId());
    Assert.assertTrue(doc.getText().startsWith(" U . S . President "));
    Assert.assertTrue(doc.getText().endsWith("multinational process . \n"));

    Assert.assertEquals(18, doc.getAnnotations().size());

    BratAnnotation annotation = doc.getAnnotation("T2");
    checkNote(annotation, "Barack Obama", "President Obama was the 44th U.S. president");

    annotation = doc.getAnnotation("T3");
    checkNote(annotation, "South Korea", "The capital of South Korea is Seoul");
  }
}
/**
 * Parses a brat document from a text stream and an annotation stream.
 *
 * @param config the annotation configuration used to interpret the .ann entries
 * @param id the document id
 * @param txtIn stream over the document text, decoded as UTF-8 (not closed here)
 * @param annIn stream over the brat .ann annotations (closed by this method)
 * @return the parsed {@link BratDocument}
 * @throws IOException if reading from either stream fails
 */
public static BratDocument parseDocument(AnnotationConfiguration config, String id,
    InputStream txtIn, InputStream annIn) throws IOException {

  Reader txtReader = new InputStreamReader(txtIn, StandardCharsets.UTF_8);

  StringBuilder text = new StringBuilder();
  char[] cbuf = new char[1024];
  int len;
  // Reader.read may legally return 0 without reaching end-of-stream,
  // so only a negative return value terminates the loop (original used > 0).
  while ((len = txtReader.read(cbuf)) >= 0) {
    text.append(cbuf, 0, len);
  }

  Collection<BratAnnotation> annotations = new ArrayList<>();
  // try-with-resources closes the stream even if read() throws;
  // the original only closed it on the success path.
  try (ObjectStream<BratAnnotation> annStream =
      new BratAnnotationStream(config, id, annIn)) {
    BratAnnotation ann;
    while ((ann = annStream.read()) != null) {
      annotations.add(ann);
    }
  }

  return new BratDocument(config, id, text.toString(), annotations);
}
}
// NOTE(review): this line is a garbled fragment — disconnected statements from the
// middle of a larger method (presumably a brat-to-NameSample conversion loop; the
// enclosing definition is not visible here). It is not compilable on its own and is
// preserved byte-for-byte — TODO: restore from the complete source file.
Map<Integer, Span> coveredIndexes = new HashMap<>(); for (BratAnnotation ann : sample.getAnnotations()) { if (isSpanAnnotation(ann)) { entityIdSet.add(ann.getId()); for (Span sentence : sentDetector.sentPosDetect(sample.getText())) { Span conflictingName = coveredIndexes.get(sentence.getStart()); sample.getId()); sample.getText()).toString(); for (BratAnnotation ann : sample.getAnnotations()) { entityIdSet.remove(ann.getId()); entitySpan = entitySpan.trim(sample.getText()); } else { System.err.println("Dropped entity " + entity.getId() + " (" + entitySpan.getCoveredText(sample.getText()) + ") " + " in document " + sample.getId() + ", it is not matching tokenization!"); samples.add(new NameSample(sample.getId(), Span.spansToStrings(tokens, sentenceText), names.toArray(new Span[names.size()]), null, samples.size() == 0)); sample.getId() + ", is not matching sentence segmentation!");
/**
 * Advances to the next document id and parses the corresponding
 * {@code <id>.txt} / {@code <id>.ann} file pair.
 *
 * @return the parsed document, or {@code null} once all ids are consumed
 * @throws IOException if either file cannot be opened or read
 */
public BratDocument read() throws IOException {
  // Guard clause instead of the original's nested-if-with-result-variable shape.
  if (!documentIdIterator.hasNext()) {
    return null;
  }
  String docId = documentIdIterator.next();
  try (InputStream textStream = new BufferedInputStream(new FileInputStream(docId + ".txt"));
       InputStream annotationStream = new BufferedInputStream(new FileInputStream(docId + ".ann"))) {
    return BratDocument.parseDocument(config, docId, textStream, annotationStream);
  }
}
// NOTE(review): this line starts mid-statement and is a fragment of a test method
// (apparently asserting span offsets for the opennlp-1193 regression document);
// the enclosing definition is not visible here. Preserved byte-for-byte —
// TODO: restore from the complete source file.
"/opennlp/tools/formats/brat/opennlp-1193.ann"); BratDocument doc = BratDocument.parseDocument(config, "opennlp-1193", txtIn, annIn); SpanAnnotation t1 = (SpanAnnotation) doc.getAnnotation("T1"); Assert.assertEquals(t1.getSpans()[0].getStart(), 0); Assert.assertEquals(t1.getSpans()[0].getEnd(), 7); Assert.assertEquals(t1.getSpans()[2].getEnd(), 24); SpanAnnotation t2 = (SpanAnnotation) doc.getAnnotation("T2"); Assert.assertEquals(t2.getSpans()[0].getStart(), 26); Assert.assertEquals(t2.getSpans()[0].getEnd(), 33);
// NOTE(review): garbled fragment — disconnected statements from the middle of a
// larger method (a variant of the fragment elsewhere in this file, using
// `instanceof SpanAnnotation` instead of a helper check). Not compilable on its
// own; preserved byte-for-byte — TODO: restore from the complete source file.
Map<Integer, Span> coveredIndexes = new HashMap<>(); for (BratAnnotation ann : sample.getAnnotations()) { if (ann instanceof SpanAnnotation) { entityIdSet.add(ann.getId()); for (Span sentence : sentDetector.sentPosDetect(sample.getText())) { Span conflictingName = coveredIndexes.get(sentence.getStart()); sample.getId()); sample.getText()).toString(); for (BratAnnotation ann : sample.getAnnotations()) { entityIdSet.remove(ann.getId()); entitySpan = entitySpan.trim(sample.getText()); + entitySpan.getCoveredText(sample.getText()) + ") " + " in document " + sample.getId() + ", it is not matching tokenization!"); samples.add(new NameSample(sample.getId(), Span.spansToStrings(tokens, sentenceText), names.toArray(new Span[names.size()]), null, samples.size() == 0)); sample.getId() + ", is not matching sentence segmentation!");
/**
 * Reads the next document: takes the next id from the iterator and parses
 * the matching {@code .txt} and {@code .ann} files.
 *
 * @return the next {@link BratDocument}, or {@code null} when exhausted
 * @throws IOException if either backing file cannot be opened or read
 */
public BratDocument read() throws IOException {
  // Early exit when there is nothing left, avoiding the original's
  // mutable result variable.
  if (!documentIdIterator.hasNext()) {
    return null;
  }
  String nextId = documentIdIterator.next();
  try (InputStream txtStream = new BufferedInputStream(new FileInputStream(nextId + ".txt"));
       InputStream annStream = new BufferedInputStream(new FileInputStream(nextId + ".ann"))) {
    return BratDocument.parseDocument(config, nextId, txtStream, annStream);
  }
}
// NOTE(review): garbled fragment — disconnected statements from the middle of a
// larger method (duplicate of another fragment in this file). Not compilable on
// its own; preserved byte-for-byte — TODO: restore from the complete source file.
Map<Integer, Span> coveredIndexes = new HashMap<>(); for (BratAnnotation ann : sample.getAnnotations()) { if (isSpanAnnotation(ann)) { entityIdSet.add(ann.getId()); for (Span sentence : sentDetector.sentPosDetect(sample.getText())) { Span conflictingName = coveredIndexes.get(sentence.getStart()); sample.getId()); sample.getText()).toString(); for (BratAnnotation ann : sample.getAnnotations()) { entityIdSet.remove(ann.getId()); entitySpan = entitySpan.trim(sample.getText()); } else { System.err.println("Dropped entity " + entity.getId() + " (" + entitySpan.getCoveredText(sample.getText()) + ") " + " in document " + sample.getId() + ", it is not matching tokenization!"); samples.add(new NameSample(sample.getId(), Span.spansToStrings(tokens, sentenceText), names.toArray(new Span[names.size()]), null, samples.size() == 0)); sample.getId() + ", is not matching sentence segmentation!");
/**
 * Produces the next {@link BratDocument} by pairing {@code <id>.txt} with
 * {@code <id>.ann} for the next available document id.
 *
 * @return the parsed document, or {@code null} after the last id
 * @throws IOException on any failure opening or reading the files
 */
public BratDocument read() throws IOException {
  // Flip the condition and bail out early rather than threading a
  // nullable local through the method.
  if (!documentIdIterator.hasNext()) {
    return null;
  }
  String documentId = documentIdIterator.next();
  try (InputStream textIn = new BufferedInputStream(new FileInputStream(documentId + ".txt"));
       InputStream annotationsIn = new BufferedInputStream(new FileInputStream(documentId + ".ann"))) {
    return BratDocument.parseDocument(config, documentId, textIn, annotationsIn);
  }
}
/**
 * Parses a brat document from a text stream and an annotation stream.
 *
 * @param config the annotation configuration used to interpret the .ann entries
 * @param id the document id
 * @param txtIn stream over the document text, decoded as UTF-8 (not closed here)
 * @param annIn stream over the brat .ann annotations (closed by this method)
 * @return the parsed {@link BratDocument}
 * @throws IOException if reading from either stream fails
 */
public static BratDocument parseDocument(AnnotationConfiguration config, String id,
    InputStream txtIn, InputStream annIn) throws IOException {

  Reader txtReader = new InputStreamReader(txtIn, StandardCharsets.UTF_8);

  StringBuilder text = new StringBuilder();
  char[] cbuf = new char[1024];
  int len;
  // Reader.read may legally return 0 without reaching end-of-stream,
  // so only a negative return value terminates the loop (original used > 0).
  while ((len = txtReader.read(cbuf)) >= 0) {
    text.append(cbuf, 0, len);
  }

  Collection<BratAnnotation> annotations = new ArrayList<>();
  // try-with-resources closes the stream even if read() throws;
  // the original only closed it on the success path.
  try (ObjectStream<BratAnnotation> annStream =
      new BratAnnotationStream(config, id, annIn)) {
    BratAnnotation ann;
    while ((ann = annStream.read()) != null) {
      annotations.add(ann);
    }
  }

  return new BratDocument(config, id, text.toString(), annotations);
}
}
// NOTE(review): this line starts mid-statement and is a fragment of a test method
// whose enclosing definition is not visible here. Preserved byte-for-byte —
// TODO: restore from the complete source file.
"/opennlp/tools/formats/brat/opennlp-1193.ann"); BratDocument doc = BratDocument.parseDocument(config, "opennlp-1193", txtIn, annIn);
/**
 * Parses a brat document from a text stream and an annotation stream.
 *
 * @param config the annotation configuration used to interpret the .ann entries
 * @param id the document id
 * @param txtIn stream over the document text, decoded as UTF-8 (not closed here)
 * @param annIn stream over the brat .ann annotations (closed by this method)
 * @return the parsed {@link BratDocument}
 * @throws IOException if reading from either stream fails
 */
public static BratDocument parseDocument(AnnotationConfiguration config, String id,
    InputStream txtIn, InputStream annIn) throws IOException {

  Reader txtReader = new InputStreamReader(txtIn, StandardCharsets.UTF_8);

  StringBuilder text = new StringBuilder();
  char[] cbuf = new char[1024];
  int len;
  // Reader.read may legally return 0 without reaching end-of-stream,
  // so only a negative return value terminates the loop (original used > 0).
  while ((len = txtReader.read(cbuf)) >= 0) {
    text.append(cbuf, 0, len);
  }

  Collection<BratAnnotation> annotations = new ArrayList<>();
  // try-with-resources closes the stream even if read() throws;
  // the original only closed it on the success path.
  try (ObjectStream<BratAnnotation> annStream =
      new BratAnnotationStream(config, id, annIn)) {
    BratAnnotation ann;
    while ((ann = annStream.read()) != null) {
      annotations.add(ann);
    }
  }

  return new BratDocument(config, id, text.toString(), annotations);
}
}