/**
 * Collects POS samples from the given CASes.
 * <p>
 * For every CAS, the tokens are grouped by the sentence covering them and each group is handed
 * to {@code createPosSample}. Only results that are actually present end up in the returned
 * list. The number of collected samples is logged at debug level.
 *
 * @param aCasses
 *            the CASes to read sentences and tokens from
 * @return the collected samples; empty if no sample could be created
 */
private List<POSSample> extractPosSamples(List<CAS> aCasses) {
    List<POSSample> posSamples = new ArrayList<>();

    for (CAS cas : aCasses) {
        Type sentenceType = getType(cas, Sentence.class);
        Type tokenType = getType(cas, Token.class);

        Map<AnnotationFS, Collection<AnnotationFS>> sentences = indexCovered(cas, sentenceType,
                tokenType);
        for (Map.Entry<AnnotationFS, Collection<AnnotationFS>> e : sentences.entrySet()) {
            // Adding to the list is a pure side effect, so ifPresent() is used instead of
            // map(), whose mapped result would simply be discarded.
            createPosSample(cas, e.getKey(), e.getValue()).ifPresent(posSamples::add);
        }
    }

    LOG.debug("Extracted {} POS samples", posSamples.size());

    return posSamples;
}
/**
 * Collects name samples from the given CASes.
 * <p>
 * Tokens are grouped by the sentence covering them and each group is turned into a sample via
 * {@code createNameSample}. A sample is kept only if its names array is not empty.
 *
 * @param aCasses
 *            the CASes to read sentences and tokens from
 * @return the samples that contain at least one name
 */
private List<NameSample> extractNameSamples(List<CAS> aCasses) {
    List<NameSample> result = new ArrayList<>();

    for (CAS cas : aCasses) {
        Map<AnnotationFS, Collection<AnnotationFS>> tokensBySentence = indexCovered(cas,
                getType(cas, Sentence.class), getType(cas, Token.class));

        tokensBySentence.forEach((sentence, tokens) -> {
            NameSample candidate = createNameSample(cas, sentence, tokens);
            // Samples without any names are left out
            if (candidate.getNames().length == 0) {
                return;
            }
            result.add(candidate);
        });
    }

    return result;
}
private Collection<ImmutablePair<String, Collection<AnnotationFS>>> extractNamedEntities( List<CAS> aCasList) { Type tokenType = org.apache.uima.fit.util.CasUtil .getType(aCasList.get(0), recommender.getLayer().getName()); Feature feature = tokenType.getFeatureByBaseName(recommender.getFeature().getName()); Collection<ImmutablePair<String, Collection<AnnotationFS>>> nameSamples = new HashSet<>(); for (CAS cas : aCasList) { Collection<AnnotationFS> namesPerDocument = new ArrayList<>(); Type sentenceType = getType(cas, Sentence.class); Map<AnnotationFS, Collection<AnnotationFS>> sentences = indexCovered(cas, sentenceType, tokenType); for (Map.Entry<AnnotationFS, Collection<AnnotationFS>> e : sentences.entrySet()) { Collection<AnnotationFS> tokens = e.getValue().stream() // If the identifier has not been set .filter(a -> a.getStringValue(feature) == null) .collect(Collectors.toSet()); namesPerDocument.addAll(tokens); } // TODO #176 use the document Id once it is available in the CAS nameSamples.add( new ImmutablePair<>(DocumentMetaData.get(cas).getDocumentUri(), namesPerDocument)); } return nameSamples; }
private Collection<ImmutablePair<String, Collection<AnnotationFS>>> extractNamedEntities( List<CAS> aCasList) { Type tokenType = org.apache.uima.fit.util.CasUtil .getType(aCasList.get(0), recommender.getLayer().getName()); Feature feature = tokenType.getFeatureByBaseName(recommender.getFeature().getName()); Collection<ImmutablePair<String, Collection<AnnotationFS>>> nameSamples = new HashSet<>(); for (CAS cas : aCasList) { Collection<AnnotationFS> namesPerDocument = new ArrayList<>(); Type sentenceType = getType(cas, Sentence.class); Map<AnnotationFS, Collection<AnnotationFS>> sentences = indexCovered(cas, sentenceType, tokenType); for (Map.Entry<AnnotationFS, Collection<AnnotationFS>> e : sentences.entrySet()) { Collection<AnnotationFS> tokens = e.getValue().stream() // If the identifier has not been set .filter(a -> a.getStringValue(feature) == null) .collect(Collectors.toSet()); namesPerDocument.addAll(tokens); } // TODO #176 use the document Id once it is available in the CAS nameSamples.add( new ImmutablePair<>(DocumentMetaData.get(cas).getDocumentUri(), namesPerDocument)); } return nameSamples; }
Map<AnnotationFS, Collection<AnnotationFS>> idxSentences = indexCovered( Map<AnnotationFS, Collection<AnnotationFS>> idxProperties = indexCovered( Map<AnnotationFS, Collection<AnnotationFS>> idxNeurons = indexCovered(
/**
 * Builds document samples from the annotations of the configured layer.
 * <p>
 * Tokens are grouped by the sentence covering them. For every annotation of the layer
 * {@code layerName} that is covered by a sentence, one sample is created from the covered texts
 * of the sentence's tokens and the annotation's {@code featureName} value. If that value is
 * {@code null}, {@code NO_CATEGORY} is used as the category instead.
 *
 * @param aCasses
 *            the CASes to read sentences, tokens and layer annotations from
 * @return the collected samples
 */
private List<DocumentSample> extractSamples(List<CAS> aCasses) {
    List<DocumentSample> result = new ArrayList<>();

    for (CAS cas : aCasses) {
        Map<AnnotationFS, Collection<AnnotationFS>> tokensBySentence = indexCovered(cas,
                getType(cas, Sentence.class), getType(cas, Token.class));

        for (Entry<AnnotationFS, Collection<AnnotationFS>> entry : tokensBySentence.entrySet()) {
            // The same token texts are shared by all samples of this sentence
            String[] words = entry.getValue().stream()
                    .map(AnnotationFS::getCoveredText)
                    .toArray(String[]::new);

            Type layerType = getType(cas, layerName);
            Feature labelFeature = layerType.getFeatureByBaseName(featureName);

            for (AnnotationFS annotation : selectCovered(layerType, entry.getKey())) {
                String category = annotation.getFeatureValueAsString(labelFeature);
                if (category == null) {
                    category = NO_CATEGORY;
                }

                DocumentSample sample = new DocumentSample(category, words);
                if (sample.getCategory() == null) {
                    continue;
                }
                result.add(sample);
            }
        }
    }

    return result;
}
Map<AnnotationFS, Collection<AnnotationFS>> idxSentences = indexCovered( Map<AnnotationFS, Collection<AnnotationFS>> idxProperties = indexCovered( Map<AnnotationFS, Collection<AnnotationFS>> idxNeurons = indexCovered(
JCas jCas, Class<? extends T> type, Class<? extends S> coveredType) { return cast(CasUtil .indexCovered(jCas.getCas(), getType(jCas, type), getType(jCas, coveredType)));
public IobEncoder(CAS aCas, Type aType, Feature aValueFeature, boolean aIob1) { iob1 = aIob1; // fill map for whole JCas in order to efficiently encode IOB iobBeginMap = new Int2ObjectOpenHashMap<String>(); iobInsideMap = new Int2ObjectOpenHashMap<String>(); Map<AnnotationFS, Collection<AnnotationFS>> idx = CasUtil.indexCovered(aCas, aType, CasUtil.getType(aCas, Token.class)); String lastValue = null; for (AnnotationFS chunk : CasUtil.select(aCas, aType)) { String value = chunk.getStringValue(aValueFeature); for (AnnotationFS token : idx.get(chunk)) { if ( token.getBegin() == chunk.getBegin() && (!iob1 || (lastValue != null && lastValue.equals(value))) ) { iobBeginMap.put(token.getBegin(), value); } else { iobInsideMap.put(token.getBegin(), value); } } lastValue = value; } }
public IobEncoder(CAS aCas, Type aType, Feature aValueFeature, boolean aIob1) { iob1 = aIob1; // fill map for whole JCas in order to efficiently encode IOB iobBeginMap = new Int2ObjectOpenHashMap<String>(); iobInsideMap = new Int2ObjectOpenHashMap<String>(); Map<AnnotationFS, Collection<AnnotationFS>> idx = CasUtil.indexCovered(aCas, aType, CasUtil.getType(aCas, Token.class)); String lastValue = null; for (AnnotationFS chunk : CasUtil.select(aCas, aType)) { String value = chunk.getStringValue(aValueFeature); for (AnnotationFS token : idx.get(chunk)) { if ( token.getBegin() == chunk.getBegin() && (!iob1 || (lastValue != null && lastValue.equals(value))) ) { iobBeginMap.put(token.getBegin(), value); } else { iobInsideMap.put(token.getBegin(), value); } } lastValue = value; } }