/**
 * Creates an extractor restricted to the given part-of-speech tags.
 *
 * <p>The tags are copied into a private set, so later changes to the supplied collection do
 * not affect this extractor.
 *
 * @param acceptablePOSTags the part-of-speech tags to accept; must not be {@code null}
 */
public TokenTextForSelectedPosExtractor(Collection<String> acceptablePOSTags) {
  HashSet<String> tagCopy = new HashSet<String>(acceptablePOSTags);
  this.acceptablePOSTags = tagCopy;
  this.extractor = new CoveredTextExtractor<Token>();
}
/**
 * Extracts the delegate extractor's features for a token, but only when the token's
 * part-of-speech tag is accepted.
 *
 * <p>Tags longer than two characters are cut down to their first two characters before being
 * looked up in the set of acceptable tags. A token without a tag, or with a tag that is not
 * accepted, yields an empty list.
 *
 * @param view the CAS view handed on to the delegate extractor
 * @param token the token to inspect
 * @return the delegate's features for accepted tokens, otherwise an empty list
 * @throws CleartkExtractorException if the delegate extractor fails
 */
@Override
public List<Feature> extract(JCas view, Token token) throws CleartkExtractorException {
  List<Feature> result = new ArrayList<Feature>();

  String tag = token.getPos();
  if (tag == null) {
    return result;
  }

  // Only the leading two characters of the tag take part in the lookup.
  String tagPrefix = tag.length() > 2 ? tag.substring(0, 2) : tag;
  if (!this.acceptablePOSTags.contains(tagPrefix)) {
    return result;
  }

  result.addAll(this.extractor.extract(view, token));
  return result;
}
} // closes the enclosing type, whose header lies outside this excerpt
/**
 * Reports the feature name of the wrapped extractor.
 *
 * @return whatever name the wrapped extractor reports
 */
@Override
public String getFeatureName() {
  String delegateName = this.extractor.getFeatureName();
  return delegateName;
}
@Override public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); // add features: word, stem, pos this.tokenFeatureExtractors = Lists.newArrayList(); this.tokenFeatureExtractors.add(new CoveredTextExtractor<Token>()); this.tokenFeatureExtractors.add(new TypePathExtractor<Token>(Token.class, "stem")); this.tokenFeatureExtractors.add(new TypePathExtractor<Token>(Token.class, "pos")); this.tokenFeatureExtractors.add(new ParentNodeFeaturesExtractor()); // add window of features before and after this.contextExtractors = Lists.newArrayList(); this.contextExtractors.add(new CleartkExtractor<Token, Token>( Token.class, new CoveredTextExtractor<Token>(), new Preceding(3), new Following(3))); }
/**
 * Initializes the superclass and then registers the feature extractors applied to
 * {@code Time} annotations.
 *
 * @param context the UIMA context, forwarded to the superclass
 * @throws ResourceInitializationException if superclass initialization fails
 */
@Override
public void initialize(UimaContext context) throws ResourceInitializationException {
  super.initialize(context);

  this.featuresExtractors = Lists.newArrayList();

  // last word of the time expression
  this.featuresExtractors.add(new LastWordExtractor<Time>());

  // character-category pattern of the time expression
  FeatureExtractor1<Time> characterPattern = CharacterCategoryPatternFunction.createExtractor();
  this.featuresExtractors.add(characterPattern);

  this.featuresExtractors.add(new TimeWordsExtractor<Time>());

  // bag of the token texts covered by the time expression
  CleartkExtractor<Time, Token> coveredTokenBag = new CleartkExtractor<Time, Token>(
      Token.class,
      new CoveredTextExtractor<Token>(),
      new Bag(new Covered()));
  this.featuresExtractors.add(coveredTokenBag);
}
@Override public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); // I explored a ton of features here, and the following were the only ones that worked // The only feature that I didn't try that seems like it might still have some promise // would be to find any times within, say, 5 tokens, and do the time value comparison // to see whether the nearby time is before, overlapping with or after the DCT List<FeatureExtractor1<Event>> srcExtractors = Lists.newArrayList(); srcExtractors.add(new TypePathExtractor<Event>(Event.class, "tense")); srcExtractors.add(new TypePathExtractor<Event>(Event.class, "aspect")); srcExtractors.add(new TypePathExtractor<Event>(Event.class, "eventClass")); srcExtractors.add(new TypePathExtractor<Event>(Event.class, "polarity")); srcExtractors.add(new TypePathExtractor<Event>(Event.class, "modality")); // the word, but only if it's an aspectual event srcExtractors.add( new FilteringExtractor<Event>(Event.class, new CoveredTextExtractor<Event>()) { @Override protected boolean accept(Event event) { return event.getEventClass().equals("ASPECTUAL"); } }); this.setSourceExtractors(srcExtractors); }
public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); // Create an extractor that gives word counts for a document this.extractor = new CleartkExtractor<DocumentAnnotation, Token>( Token.class, new CoveredTextExtractor<Token>(), new Count(new Covered())); }
/**
 * Creates the annotator and wires up its feature extractors.
 *
 * <p>Source and target tokens get the same four kinds of features (covered text, part of
 * speech, stem, and the text of selected preceding tokens), each wrapped in a naming
 * extractor labelled {@code "Source"} or {@code "Target"}. The tokens between the two
 * anchors are extracted as a bag labelled {@code "WordsBetween"}, and a path extractor is
 * created as well.
 */
public VerbClauseTemporalAnnotator() {
  this.eventID = 1;

  // Text of the 3 preceding tokens, limited to the listed part-of-speech tags
  FeatureExtractor1<Token> auxiliaryWindow = new CleartkExtractor<Token, Token>(
      Token.class,
      new TokenTextForSelectedPosExtractor("MD", "TO", "IN", "VB", "RB"),
      new Preceding(3));
  FeatureExtractor1<Token> stemOfToken = new TypePathExtractor<Token>(Token.class, "stem");
  FeatureExtractor1<Token> posOfToken = new TypePathExtractor<Token>(Token.class, "pos");

  // Features of the source token
  this.sourceFeatureExtractors = Lists.newArrayList();
  CoveredTextExtractor<Token> sourceText = new CoveredTextExtractor<Token>();
  this.sourceFeatureExtractors.add(new NamingExtractor1<Token>("Source", sourceText));
  this.sourceFeatureExtractors.add(new NamingExtractor1<Token>("Source", posOfToken));
  this.sourceFeatureExtractors.add(new NamingExtractor1<Token>("Source", stemOfToken));
  this.sourceFeatureExtractors.add(new NamingExtractor1<Token>("Source", auxiliaryWindow));

  // Features of the target token
  this.targetFeatureExtractors = Lists.newArrayList();
  CoveredTextExtractor<Token> targetText = new CoveredTextExtractor<Token>();
  this.targetFeatureExtractors.add(new NamingExtractor1<Token>("Target", targetText));
  this.targetFeatureExtractors.add(new NamingExtractor1<Token>("Target", posOfToken));
  this.targetFeatureExtractors.add(new NamingExtractor1<Token>("Target", stemOfToken));
  this.targetFeatureExtractors.add(new NamingExtractor1<Token>("Target", auxiliaryWindow));

  // Bag of the token texts covered by the annotation between the anchors
  this.betweenAnchorsFeatureExtractors = new ArrayList<FeatureExtractor1<Annotation>>();
  CleartkExtractor<Annotation, Token> coveredTokenBag = new CleartkExtractor<Annotation, Token>(
      Token.class,
      new CoveredTextExtractor<Token>(),
      new Bag(new Covered()));
  this.betweenAnchorsFeatureExtractors.add(
      new NamingExtractor1<Annotation>("WordsBetween", coveredTokenBag));

  this.pathExtractor = new TargetPathExtractor();
}
/**
 * Initializes the superclass and then installs the source, target and between-anchor
 * feature extractors.
 *
 * <p>The same list of event extractors is used for both the source and the target.
 *
 * @param context the UIMA context, forwarded to the superclass
 * @throws ResourceInitializationException if superclass initialization fails
 */
@Override
public void initialize(UimaContext context) throws ResourceInitializationException {
  super.initialize(context);

  // Event attributes plus a syntactic feature; shared by source and target
  List<FeatureExtractor1<Event>> eventExtractors = Lists.newArrayList();
  TypePathExtractor<Event> tense = new TypePathExtractor<Event>(Event.class, "tense");
  eventExtractors.add(tense);
  TypePathExtractor<Event> aspect = new TypePathExtractor<Event>(Event.class, "aspect");
  eventExtractors.add(aspect);
  TypePathExtractor<Event> eventClass = new TypePathExtractor<Event>(Event.class, "eventClass");
  eventExtractors.add(eventClass);
  eventExtractors.add(new SyntacticFirstChildOfGrandparentOfLeafExtractor<Event>());
  this.setSourceExtractors(eventExtractors);
  this.setTargetExtractors(eventExtractors);

  // Features computed over the pair of anchors
  List<FeatureExtractor2<Anchor, Anchor>> betweenExtractors = Lists.newArrayList();
  betweenExtractors.add(new SyntacticLeafToLeafPathPartsExtractor<Anchor, Anchor>());
  betweenExtractors.add(
      new CleartkExtractor<Anchor, Token>(
          Token.class,
          new CoveredTextExtractor<Token>(),
          new Bag(new Covered())));
  this.setBetweenExtractors(betweenExtractors);
}
@Override public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); // define chunking type this.chunking = new BioChunking<Token, Time>(Token.class, Time.class); // add features: word, character pattern, stem, pos this.tokenFeatureExtractors = Lists.newArrayList(); this.tokenFeatureExtractors.add(new CoveredTextExtractor<Token>()); NamedFeatureExtractor1<Token> ex = CharacterCategoryPatternFunction.createExtractor(); this.tokenFeatureExtractors.add(ex); this.tokenFeatureExtractors.add(new TimeWordsExtractor<Token>()); this.tokenFeatureExtractors.add(new TypePathExtractor<Token>(Token.class, "stem")); this.tokenFeatureExtractors.add(new TypePathExtractor<Token>(Token.class, "pos")); // add window of features before and after this.contextFeatureExtractors = Lists.newArrayList(); for (FeatureExtractor1<Token> extractor : this.tokenFeatureExtractors) { this.contextFeatureExtractors.add(new CleartkExtractor<Token, Token>(Token.class, extractor, new Preceding( 3), new Following(3))); } }
/**
 * Builds the tf-idf extractor over document token counts.
 *
 * <p>If {@code tfIdfUri} is set, the extractor is loaded from that URI before it is
 * returned; otherwise it is returned as constructed.
 *
 * @return the tf-idf extractor
 * @throws IOException if loading from {@code tfIdfUri} fails
 */
private TfidfExtractor<String, DocumentAnnotation> initTfIdfExtractor() throws IOException {
  // Counts of the covered token texts within a document
  CleartkExtractor.Count coveredCounts =
      new CleartkExtractor.Count(new CleartkExtractor.Covered());
  CleartkExtractor<DocumentAnnotation, Token> tokenCounts =
      new CleartkExtractor<DocumentAnnotation, Token>(
          Token.class,
          new CoveredTextExtractor<Token>(),
          coveredCounts);

  TfidfExtractor<String, DocumentAnnotation> result =
      new TfidfExtractor<String, DocumentAnnotation>(
          DocumentClassificationAnnotator.TFIDF_EXTRACTOR_KEY,
          tokenCounts);

  if (this.tfIdfUri == null) {
    return result;
  }
  result.load(this.tfIdfUri);
  return result;
}
/**
 * Builds the centroid tf-idf similarity extractor over document token counts.
 *
 * <p>If {@code tfIdfCentroidSimilarityUri} is set, the extractor is loaded from that URI
 * before it is returned; otherwise it is returned as constructed.
 *
 * @return the centroid tf-idf similarity extractor
 * @throws IOException if loading from {@code tfIdfCentroidSimilarityUri} fails
 */
private CentroidTfidfSimilarityExtractor<String, DocumentAnnotation> initCentroidTfIdfSimilarityExtractor()
    throws IOException {
  // Counts of the covered token texts within a document
  CleartkExtractor.Count coveredCounts =
      new CleartkExtractor.Count(new CleartkExtractor.Covered());
  CleartkExtractor<DocumentAnnotation, Token> tokenCounts =
      new CleartkExtractor<DocumentAnnotation, Token>(
          Token.class,
          new CoveredTextExtractor<Token>(),
          coveredCounts);

  CentroidTfidfSimilarityExtractor<String, DocumentAnnotation> result =
      new CentroidTfidfSimilarityExtractor<String, DocumentAnnotation>(
          DocumentClassificationAnnotator.CENTROID_TFIDF_SIM_EXTRACTOR_KEY,
          tokenCounts);

  if (this.tfIdfCentroidSimilarityUri == null) {
    return result;
  }
  result.load(this.tfIdfCentroidSimilarityUri);
  return result;
}
public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); // a feature extractor that creates features corresponding to the word, the word lower cased // the capitalization of the word, the numeric characterization of the word, and character ngram // suffixes of length 2 and 3. this.tokenFeatureExtractor = new FeatureFunctionExtractor<Token>( new CoveredTextExtractor<Token>(), new LowerCaseFeatureFunction(), new CapitalTypeFeatureFunction(), new NumericTypeFeatureFunction(), new CharacterNgramFeatureFunction(Orientation.RIGHT_TO_LEFT, 0, 2), new CharacterNgramFeatureFunction(Orientation.RIGHT_TO_LEFT, 0, 3)); // a feature extractor that extracts the surrounding token texts (within the same sentence) this.contextFeatureExtractor = new CleartkExtractor<Token, Token>( Token.class, new CoveredTextExtractor<Token>(), new Preceding(2), new Following(2)); }
new CombinedExtractor1 new CoveredTextExtractor(), new TypePathExtractor(AssertionCuePhraseAnnotation.class, "cuePhrase"), new TypePathExtractor(AssertionCuePhraseAnnotation.class, "cuePhraseCategory"),
new CombinedExtractor1 new CoveredTextExtractor(), new TypePathExtractor(AssertionCuePhraseAnnotation.class, "cuePhrase"), new TypePathExtractor(AssertionCuePhraseAnnotation.class, "cuePhraseCategory"),
new CoveredTextExtractor<Token>()) { @Override protected boolean accept(Token token) { tgtExtractors.add(new CleartkExtractor<Time, Token>(Token.class, new CoveredTextExtractor<Token>(), new Bag(new Covered()))); tgtExtractors.add(new TypePathExtractor<Time>(Time.class, "timeType")); tgtExtractors.add(new TypePathExtractor<Time>(Time.class, "value"));
/**
 * Builds an extractor that counts, per sentence, the values of the configured token field
 * after stopword removal.
 *
 * <p>The token field is chosen by {@code tokenField}: the {@code stem} attribute, the
 * {@code lemma} attribute, or otherwise the token's covered text.
 *
 * @return the sentence-level token count extractor
 */
private FeatureExtractor1<Sentence> createTokenCountsExtractor() {
  FeatureExtractor1<Token> fieldExtractor;
  switch (this.tokenField) {
    case STEM:
      fieldExtractor = new TypePathExtractor<Token>(Token.class, "stem");
      break;
    case LEMMA:
      fieldExtractor = new TypePathExtractor<Token>(Token.class, "lemma");
      break;
    case COVERED_TEXT:
    default:
      // covered text is also the fallback for any other field value
      fieldExtractor = new CoveredTextExtractor<Token>();
      break;
  }

  StopwordRemovingExtractor<Token> withoutStopwords =
      new StopwordRemovingExtractor<Token>(this.stopwords, fieldExtractor);
  return new CleartkExtractor<Sentence, Token>(
      Token.class,
      withoutStopwords,
      new CleartkExtractor.Count(new CleartkExtractor.Covered()));
}
@Override public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); // the token feature extractor: text, char pattern (uppercase, digits, etc.), and part-of-speech this.extractor = new CombinedExtractor1<Token>( new FeatureFunctionExtractor<Token>( new CoveredTextExtractor<Token>(), new CharacterCategoryPatternFunction<Token>(PatternType.REPEATS_MERGED)), new TypePathExtractor<Token>(Token.class, "pos")); // the context feature extractor: the features above for the 3 preceding and 3 following tokens this.contextExtractor = new CleartkExtractor<Token, Token>( Token.class, this.extractor, new Preceding(3), new Following(3)); // the chunking definition: Tokens will be combined to form NamedEntityMentions, with labels // from the "mentionType" attribute so that we get B-location, I-person, etc. this.chunking = new BioChunking<Token, NamedEntityMention>( Token.class, NamedEntityMention.class, "mentionType"); }
new CombinedExtractor1 new CoveredTextExtractor(), new TypePathExtractor(AssertionCuePhraseAnnotation.class, "cuePhrase"), new TypePathExtractor(AssertionCuePhraseAnnotation.class, "cuePhraseCategory"), new CleartkExtractor( BaseToken.class, new CoveredTextExtractor(),
new CombinedExtractor1 new CoveredTextExtractor(), new TypePathExtractor(AssertionCuePhraseAnnotation.class, "cuePhrase"), new TypePathExtractor(AssertionCuePhraseAnnotation.class, "cuePhraseCategory"), new CleartkExtractor( BaseToken.class, new CoveredTextExtractor(),