/**
 * Tokenizes and POS-tags the given text, in that order.
 * <p>
 * The shared {@code jCas} is reset and loaded with {@code text}, a single
 * {@code Sentence} annotation spanning the whole text is added to the indexes,
 * and then {@code jtbd} and {@code pennbioIEPosTagger} are run on the CAS.
 *
 * @param text the text to analyze; its full length is used as the sentence span
 * @return one two-element array per {@code Token} found in the CAS: the token's
 *         covered text at index 0 and the value of the token's POS tag at
 *         position 0 at index 1
 * @throws AnalysisEngineProcessException if raised while the CAS is processed
 */
private List<String[]> tokenize(String text) throws AnalysisEngineProcessException {
    // Start from a clean CAS that contains nothing but the new document text.
    jCas.reset();
    jCas.setDocumentText(text);

    // The whole input is treated as one sentence.
    Sentence wholeText = new Sentence(jCas, 0, text.length());
    wholeText.addToIndexes();

    jtbd.process(jCas.getCas());
    pennbioIEPosTagger.process(jCas.getCas());

    return JCasUtil.select(jCas, Token.class).stream().map(token -> {
        String surface = token.getCoveredText();
        String tag = token.getPosTag(0).getValue();
        return new String[] { surface, tag };
    }).collect(Collectors.toList());
}
/** * Returns the first POSTag annotation associated with the given token that has the * required type (i.e. that belongs to the requested posTagSet). If no such POSTag * is found, returns null. (In general tokens may be provided with POSTags from * different POSTagSets.) * * @param token * @return */ private POSTag getPrefPOSTag(Token token) { FSArray posTags = token.getPosTag(); for (int i = 0; i < posTags.size(); i++) { POSTag posTag = (POSTag) posTags.get(i); if (posTag != null) { // compare to the desired type of POS Tag Set if (posTag.getType().getName().equals(posTagSetPreference)) { return posTag; } } } return null; }
// Take the token's POS tag at position 0; no tag-set preference is applied here.
postag = token.getPosTag(0);