// Hand back a freshly constructed XmlTextAnnotationMaker wired to the given
// text annotation builder and XML processor.
return new XmlTextAnnotationMaker(textAnnotationBuilder, xmlProcessor);
/**
 * Reads one entry of the corpus file list (see {@link #getFileListing()}) and turns it into
 * XmlTextAnnotation objects.
 *
 * <p>An entry is a list of paths so that corpora using standoff annotation can supply the source
 * document first, followed by its markup files. This default implementation assumes a single
 * file carries both source text and markup, so only the first path is read; any further paths
 * in the entry are not consulted.
 *
 * <p>Side effects: {@code fileId} is set to the last name element of the first path,
 * {@code numFiles} is incremented before the file is read, and {@code numTextAnnotations} is
 * incremented when the maker yields a non-null annotation.
 *
 * @param corpusFileListEntry a list of files, the first of which is a source file.
 * @return a list containing the XmlTextAnnotation built from the file, or an empty list when
 *         the maker returned {@code null}.
 * @throws Exception if reading the file or creating the annotation fails.
 */
@Override
public List<XmlTextAnnotation> getAnnotationsFromFile(List<Path> corpusFileListEntry) throws Exception {
    final Path combinedFile = corpusFileListEntry.get(0);

    // The id of the document is the final name element of its path.
    final int lastNameIndex = combinedFile.getNameCount() - 1;
    fileId = combinedFile.getName(lastNameIndex).toString();
    logger.debug("read source file {}", fileId);
    numFiles++;

    final String rawContents = LineIO.slurp(combinedFile.toString());
    final List<XmlTextAnnotation> results = new ArrayList<>(1);
    final XmlTextAnnotation parsed =
            xmlTextAnnotationMaker.createTextAnnotation(rawContents, this.corpusName, fileId);

    // Nothing was produced for this file: report an empty result and leave the counter alone.
    if (parsed == null) {
        return results;
    }

    results.add(parsed);
    numTextAnnotations++;
    return results;
}
// Build an XmlTextAnnotationMaker on top of a StatefulTokenizer-backed builder and the existing
// xmlProcessor, run it over `data` (labelled "OntoNotes 5.0", with `docid` as the last argument),
// then pull out the TextAnnotation and the list of XML markup spans from the result.
// NOTE(review): xta is dereferenced without a null check — confirm createTextAnnotation cannot
// return null for this input.
StatefulTokenizer st = new StatefulTokenizer(); TokenizerTextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(st); XmlTextAnnotationMaker xtam = new XmlTextAnnotationMaker(taBuilder, xmlProcessor); XmlTextAnnotation xta = xtam.createTextAnnotation(data, "OntoNotes 5.0", docid); TextAnnotation ta = xta.getTextAnnotation(); List<SpanInfo> fudge = xta.getXmlMarkup();
// Tokenizer -> builder -> maker: wrap a new StatefulTokenizer in a TokenizerTextAnnotationBuilder
// and combine it with xmlProcessor to create the XmlTextAnnotation for `data`, passing
// "OntoNotes 5.0" and `docid` as the remaining arguments. `ta` is the extracted TextAnnotation;
// `fudge` holds the SpanInfo list returned by getXmlMarkup().
// NOTE(review): no null check on xta before it is used — verify the maker never returns null here.
StatefulTokenizer st = new StatefulTokenizer(); TokenizerTextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(st); XmlTextAnnotationMaker xtam = new XmlTextAnnotationMaker(taBuilder, xmlProcessor); XmlTextAnnotation xta = xtam.createTextAnnotation(data, "OntoNotes 5.0", docid); TextAnnotation ta = xta.getTextAnnotation(); List<SpanInfo> fudge = xta.getXmlMarkup();
// Build an XmlTextAnnotationMaker on top of a StatefulTokenizer-backed builder and the existing
// xmlProcessor, run it over `document` with the fixed arguments "OntoNotes 5.0" and "test",
// then pull out the TextAnnotation and the list of XML markup spans from the result.
// NOTE(review): xta is dereferenced without a null check — confirm createTextAnnotation cannot
// return null for this input.
StatefulTokenizer st = new StatefulTokenizer(); TokenizerTextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(st); XmlTextAnnotationMaker xtam = new XmlTextAnnotationMaker(taBuilder, xmlProcessor); XmlTextAnnotation xta = xtam.createTextAnnotation(document, "OntoNotes 5.0", "test"); TextAnnotation ta = xta.getTextAnnotation(); List<SpanInfo> fudge = xta.getXmlMarkup();
// Return a new XmlTextAnnotationMaker that pairs the supplied text annotation builder
// with the supplied XML processor.
return new XmlTextAnnotationMaker(textAnnotationBuilder, xmlProcessor);
/**
 * Reads one entry of the corpus file list (see {@link #getFileListing()}) and turns it into
 * XmlTextAnnotation objects.
 *
 * <p>An entry is a list of paths so that corpora using standoff annotation can supply the source
 * document first, followed by its markup files. This default implementation assumes a single
 * file carries both source text and markup, so only the first path is read; any further paths
 * in the entry are not consulted.
 *
 * <p>Side effects: {@code fileId} is set to the last name element of the first path,
 * {@code numFiles} is incremented before the file is read, and {@code numTextAnnotations} is
 * incremented when the maker yields a non-null annotation.
 *
 * @param corpusFileListEntry a list of files, the first of which is a source file.
 * @return a list containing the XmlTextAnnotation built from the file, or an empty list when
 *         the maker returned {@code null}.
 * @throws Exception if reading the file or creating the annotation fails.
 */
@Override
public List<XmlTextAnnotation> getAnnotationsFromFile(List<Path> corpusFileListEntry) throws Exception {
    final Path combinedFile = corpusFileListEntry.get(0);

    // The id of the document is the final name element of its path.
    final int lastNameIndex = combinedFile.getNameCount() - 1;
    fileId = combinedFile.getName(lastNameIndex).toString();
    logger.debug("read source file {}", fileId);
    numFiles++;

    final String rawContents = LineIO.slurp(combinedFile.toString());
    final List<XmlTextAnnotation> results = new ArrayList<>(1);
    final XmlTextAnnotation parsed =
            xmlTextAnnotationMaker.createTextAnnotation(rawContents, this.corpusName, fileId);

    // Nothing was produced for this file: report an empty result and leave the counter alone.
    if (parsed == null) {
        return results;
    }

    results.add(parsed);
    numTextAnnotations++;
    return results;
}
// Tokenizer -> builder -> maker: wrap a new StatefulTokenizer in a TokenizerTextAnnotationBuilder
// and combine it with xmlProcessor to create the XmlTextAnnotation for `document`, passing the
// literals "OntoNotes 5.0" and "test" as the remaining arguments. `ta` is the extracted
// TextAnnotation; `fudge` holds the SpanInfo list returned by getXmlMarkup().
// NOTE(review): no null check on xta before it is used — verify the maker never returns null here.
StatefulTokenizer st = new StatefulTokenizer(); TokenizerTextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(st); XmlTextAnnotationMaker xtam = new XmlTextAnnotationMaker(taBuilder, xmlProcessor); XmlTextAnnotation xta = xtam.createTextAnnotation(document, "OntoNotes 5.0", "test"); TextAnnotation ta = xta.getTextAnnotation(); List<SpanInfo> fudge = xta.getXmlMarkup();
// Construct and return an XmlTextAnnotationMaker from the text annotation builder
// and XML processor in scope.
return new XmlTextAnnotationMaker(textAnnotationBuilder, xmlProcessor);
// The result is a new XmlTextAnnotationMaker built from textAnnotationBuilder and xmlProcessor.
return new XmlTextAnnotationMaker(textAnnotationBuilder, xmlProcessor);