/**
 * Parses rules from the given reader and builds a new extractor bound to the supplied environment.
 *
 * @param env Environment used for binding variables while the rules are parsed
 * @param r Reader supplying the rule text
 * @return a new CoreMapExpressionExtractor backed by the parsed rule list
 * @throws ParseException if the rule grammar is malformed
 * @throws TokenSequenceParseException if the rule text cannot be tokenized
 */
public CoreMapExpressionExtractor getExpressionExtractor(Env env, Reader r) throws ParseException, TokenSequenceParseException {
  try {
    TokenSequenceParser parser = new TokenSequenceParser(r);
    List<SequenceMatchRules.Rule> parsedRules = parser.RuleList(env);
    return new CoreMapExpressionExtractor(env, parsedRules);
  } catch (TokenMgrError error) {
    // TokenMgrError is an Error from the generated lexer; surface it as a checked parse exception
    throw new TokenSequenceParseException("Parsing failed. Error: " + error);
  }
}
// NOTE(review): fragment of a larger stage-application method — the enclosing signature and the
// statements before/after are not visible in this chunk, so the code is left byte-identical.
// Visible flow: log the basic-rule matches, annotate them onto the annotation, prune null-valued
// and nested matches, then — if the stage defines a composite extract rule — apply it (bounded by
// stage.limitIters), keep the updated merged list and expressions, filter invalid expressions,
// and clear temporary tags from the annotation.
log.info("extractExpressions() extracting with " + basicExtractRule + " from " + annotation + " gives " + matchedExpressions); annotateExpressions(annotation, matchedExpressions); matchedExpressions = MatchedExpression.removeNullValues(matchedExpressions); matchedExpressions = MatchedExpression.removeNested(matchedExpressions); SequenceMatchRules.ExtractRule<List<? extends CoreMap>, T> compositeExtractRule = stage.compositeExtractRule; if (compositeExtractRule != null) { Pair<List<? extends CoreMap>, List<T>> p = applyCompositeRule( compositeExtractRule, merged, matchedExpressions, stage.limitIters); merged = p.first(); matchedExpressions = p.second(); matchedExpressions = filterInvalidExpressions(stage.filterRule, matchedExpressions); cleanupTags(annotation);
/**
 * Parses additional rules from the reader and appends them to an existing extractor,
 * reusing that extractor's own environment for variable bindings.
 *
 * @param extractor extractor to extend with the newly parsed rules
 * @param r Reader supplying the rule text
 * @throws ParseException if the rule grammar is malformed
 * @throws TokenSequenceParseException if the rule text cannot be tokenized
 */
public void updateExpressionExtractor(CoreMapExpressionExtractor extractor, Reader r) throws ParseException, TokenSequenceParseException {
  try {
    TokenSequenceParser parser = new TokenSequenceParser(r);
    extractor.appendRules(parser.RuleList(extractor.getEnv()));
  } catch (TokenMgrError error) {
    // Translate the generated lexer's Error into the checked exception callers expect
    throw new TokenSequenceParseException("Parsing failed. Error: " + error);
  }
}
/** Runs the extractor over the annotation, honoring the extractWithTokens and flatten options. */
private List<CoreMap> extract(CoreMap annotation) {
  List<CoreMap> extracted = options.extractWithTokens
      ? extractor.extractCoreMapsMergedWithTokens(annotation)
      : extractor.extractCoreMaps(annotation);
  return options.flatten ? extractor.flatten(extracted) : extracted;
}
public KBPTokensregexExtractor(String tokensregexDir, boolean verbose) { if (verbose) logger.log("Creating TokensRegexExtractor"); // Create extractors for (RelationType rel : RelationType.values()) { String relFileNameComponent = rel.canonicalName.replaceAll(":", "_"); String path = tokensregexDir + File.separator + relFileNameComponent.replaceAll("/", "SLASH") + ".rules"; if (IOUtils.existsInClasspathOrFileSystem(path)) { List<String> listFiles = new ArrayList<>(); listFiles.add(tokensregexDir + File.separator + "defs.rules"); listFiles.add(path); if (verbose) logger.log("Rule files for relation " + rel + " is " + path); Env env = TokenSequencePattern.getNewEnv(); env.bind("collapseExtractionRules", true); env.bind("verbose", verbose); CoreMapExpressionExtractor extr = CoreMapExpressionExtractor.createExtractorFromFiles(env, listFiles).keepTemporaryTags(); rules.put(rel, extr); } } }
/**
 * Creates an extractor using the specified environment, reading the rules from a single file.
 * Convenience wrapper around the list-based {@code createExtractorFromFiles}.
 *
 * @param env Environment to use for binding variables and applying rules
 * @param filename Name of the rule file to read
 * @throws RuntimeException if the rules cannot be read or parsed
 */
public static CoreMapExpressionExtractor createExtractorFromFile(Env env, String filename) throws RuntimeException {
  List<String> singleFile = Collections.singletonList(filename);
  return createExtractorFromFiles(env, singleFile);
}
/**
 * Clears temporary tags from the coremap. A fresh identity-keyed visited map is passed
 * to the recursive overload so shared or cyclic structures are handled safely.
 */
private void cleanupTags(CoreMap cm) {
  IdentityHashMap<Object, Boolean> visited = new IdentityHashMap<>();
  cleanupTags(cm, visited);
}
/**
 * Extracts matched expressions from the annotation and appends each expression's
 * backing coremap to the supplied result list.
 *
 * @param res list to append results to (also returned for chaining)
 * @param annotation annotation to extract from
 * @return the {@code res} list with one coremap per matched expression appended
 */
private List<CoreMap> extractCoreMapsToList(List<CoreMap> res, CoreMap annotation) {
  for (T expr : extractExpressions(annotation)) {
    res.add(expr.getAnnotation());
  }
  return res;
}
// NOTE(review): disjoint fragments from a usage/demo context (rule-file loading with a fresh
// environment, token POS/NER debug printing, and iteration over extracted matches). The
// enclosing method(s) are not visible in this chunk, so the code is left byte-identical.
.createExtractorFromFiles(TokenSequencePattern.getNewEnv(), rules); token.get(CoreAnnotations.PartOfSpeechAnnotation.class) + ", ne=" + token.get(CoreAnnotations.NamedEntityTagAnnotation.class)); List<MatchedExpression> matchedExpressions = extractor.extractExpressions(sentence); for (MatchedExpression matched : matchedExpressions) {
/**
 * Creates an instance with the specified environment and list of rules.
 *
 * @param env Environment to use for binding variables and applying rules
 * @param rules List of rules for this extractor
 */
public CoreMapExpressionExtractor(Env env, List<SequenceMatchRules.Rule> rules) {
  // Delegate to the env-only constructor, then attach the rules.
  this(env);
  appendRules(rules);
}
/**
 * Returns the list of coremaps matching this extractor's rules.
 *
 * @param annotation annotation to run the rules over
 * @return freshly allocated list of matching coremaps
 */
public List<CoreMap> extractCoreMaps(CoreMap annotation) {
  return extractCoreMapsToList(new ArrayList<>(), annotation);
}
/**
 * Returns a list interleaving the annotation's original tokens with the matched coremaps.
 * Token offsets in the matches are shifted by the annotation's own token-begin offset
 * (defaulting to 0 when absent) so they index into the annotation's token list.
 *
 * @param annotation annotation to extract from
 */
public List<CoreMap> extractCoreMapsMergedWithTokens(CoreMap annotation) {
  List<CoreMap> matches = extractCoreMaps(annotation);
  Integer tokenBegin = annotation.get(CoreAnnotations.TokenBeginAnnotation.class);
  final int offset = (tokenBegin == null) ? 0 : tokenBegin;
  return CollectionUtils.mergeListWithSortedMatchedPreAggregated(
      annotation.get(tokensAnnotationKey), matches,
      (CoreMap in) -> Interval.toInterval(
          in.get(CoreAnnotations.TokenBeginAnnotation.class) - offset,
          in.get(CoreAnnotations.TokenEndAnnotation.class) - offset));
}
// NOTE(review): fragment from around the composite-rule application loop — the enclosing
// signature and the rest of the loop are not visible here, so the code is left byte-identical.
// Visible flow: when the composite rule extracted anything, log (if verbose), annotate the new
// expressions onto the merged coremaps, prune null-valued results, and (in the truncated branch)
// presumably continue iterating while new non-empty expressions appear — TODO confirm against
// the full method.
if (verbose && extracted) log.info("applyCompositeRule() extracting with " + compositeExtractRule + " from " + merged + " gives " + newExprs); if (extracted) { annotateExpressions(merged, newExprs); newExprs = MatchedExpression.removeNullValues(newExprs); if ( ! newExprs.isEmpty()) {
/** Extracts coremaps from the annotation per the configured options (token merging, flattening). */
private List<CoreMap> extract(CoreMap annotation) {
  final List<CoreMap> cms = options.extractWithTokens
      ? extractor.extractCoreMapsMergedWithTokens(annotation)
      : extractor.extractCoreMaps(annotation);
  if (!options.flatten) {
    return cms;
  }
  return extractor.flatten(cms);
}
/**
 * Creates an extractor using the specified environment, reading the rules from the given
 * filenames. Varargs convenience wrapper around the list-based overload.
 *
 * @param env Environment to use for binding variables and applying rules
 * @param filenames Rule files to read, in order
 * @throws RuntimeException if the rules cannot be read or parsed
 */
public static <M extends MatchedExpression> CoreMapExpressionExtractor<M> createExtractorFromFiles(Env env, String... filenames) throws RuntimeException {
  List<String> files = Arrays.asList(filenames);
  return createExtractorFromFiles(env, files);
}
/**
 * Recursively clears temporary tags from every coremap reachable through the collection.
 * The identity-keyed {@code cleaned} map records objects already seen, so shared or cyclic
 * structures are processed exactly once.
 */
private void cleanupTags(Collection objs, Map<Object, Boolean> cleaned) {
  for (Object obj : objs) {
    if (cleaned.containsKey(obj)) {
      continue;  // already visited (or in progress) — skip to avoid rework and cycles
    }
    cleaned.put(obj, false);  // mark in-progress before recursing
    if (obj instanceof CoreMap) {
      cleanupTags((CoreMap) obj, cleaned);
    } else if (obj instanceof Collection) {
      cleanupTags((Collection) obj, cleaned);
    }
    cleaned.put(obj, true);  // fully cleaned
  }
}
/**
 * Extracts matched expressions from the annotation, first computing and caching the
 * numerized-token view on the annotation if that key is not already present.
 *
 * @param annotation annotation to extract from
 * @return matched expressions found by the underlying extractor
 */
public List<MatchedExpression> extract(CoreMap annotation) {
  // containsKey (not a null check) preserves behavior when the key is present with a null value
  boolean hasNumerizedTokens = annotation.containsKey(CoreAnnotations.NumerizedTokensAnnotation.class);
  if (!hasNumerizedTokens) {
    annotation.set(CoreAnnotations.NumerizedTokensAnnotation.class,
        NumberNormalizer.findAndMergeNumbers(annotation));
  }
  return extractor.extractExpressions(annotation);
}
/**
 * Creates an instance with the specified environment and list of rules.
 *
 * @param env Environment to use for binding variables and applying rules
 * @param rules List of rules for this extractor
 */
public CoreMapExpressionExtractor(Env env, List<SequenceMatchRules.Rule> rules) {
  this(env);          // set up the environment first
  appendRules(rules); // then register the supplied rules
}
/**
 * Returns the list of coremaps matching this extractor's rules.
 *
 * @param annotation annotation to run the rules over
 * @return freshly allocated list of matching coremaps
 */
public List<CoreMap> extractCoreMaps(CoreMap annotation) {
  List<CoreMap> results = new ArrayList<>();
  extractCoreMapsToList(results, annotation);
  return results;
}
/**
 * Returns a list of the annotation's original tokens merged with the matched coremaps.
 * Token offsets in the matches are shifted by the annotation's token-begin offset
 * (defaulting to 0 when absent) so they index into the annotation's own token list.
 *
 * @param annotation annotation to extract from
 */
public List<CoreMap> extractCoreMapsMergedWithTokens(CoreMap annotation) {
  List<CoreMap> res = extractCoreMaps(annotation);
  Integer startTokenOffset = annotation.get(CoreAnnotations.TokenBeginAnnotation.class);
  if (startTokenOffset == null) {
    startTokenOffset = 0;
  }
  final Integer startTokenOffsetFinal = startTokenOffset;
  // Lambda replaces the legacy anonymous Function class, matching the lambda style already
  // used elsewhere in this file; behavior is unchanged.
  List<CoreMap> merged = CollectionUtils.mergeListWithSortedMatchedPreAggregated(
      (List<CoreMap>) annotation.get(tokensAnnotationKey), res,
      (CoreMap in) -> Interval.toInterval(
          in.get(CoreAnnotations.TokenBeginAnnotation.class) - startTokenOffsetFinal,
          in.get(CoreAnnotations.TokenEndAnnotation.class) - startTokenOffsetFinal));
  return merged;
}