/**
 * Builds the sentence-level extractor that produces counts over the {@link Token}s covered by a
 * {@link Sentence}.
 *
 * <p>The per-token extractor is selected from {@code this.tokenField}: {@code STEM} and
 * {@code LEMMA} read the "stem" and "lemma" type paths of the token respectively; any other
 * value (including {@code COVERED_TEXT}) uses the token's covered text. The chosen extractor is
 * wrapped in a {@code StopwordRemovingExtractor} configured with {@code this.stopwords} before
 * being handed to the counting extractor.
 *
 * @return a {@code CleartkExtractor} over sentences that applies a {@code Count} context on
 *     {@code Covered} tokens
 */
private FeatureExtractor1<Sentence> createTokenCountsExtractor() {
  // Decide which token attribute feeds the counts; covered text is the fallback.
  final FeatureExtractor1<Token> perTokenExtractor;
  switch (this.tokenField) {
    case STEM:
      perTokenExtractor = new TypePathExtractor<Token>(Token.class, "stem");
      break;
    case LEMMA:
      perTokenExtractor = new TypePathExtractor<Token>(Token.class, "lemma");
      break;
    case COVERED_TEXT:
    default:
      perTokenExtractor = new CoveredTextExtractor<Token>();
      break;
  }

  return new CleartkExtractor<Sentence, Token>(
      Token.class,
      new StopwordRemovingExtractor<Token>(this.stopwords, perTokenExtractor),
      new CleartkExtractor.Count(new CleartkExtractor.Covered()));
}
/**
 * Creates the centroid TF-IDF similarity extractor for documents.
 *
 * <p>The extractor is keyed by
 * {@code DocumentClassificationAnnotator.CENTROID_TFIDF_SIM_EXTRACTOR_KEY} and is fed by a
 * counting extractor over the covered text of each {@link Token} in a
 * {@link DocumentAnnotation}. When {@code this.tfIdfCentroidSimilarityUri} is non-null,
 * {@code load} is invoked with that URI before the extractor is returned; otherwise the
 * extractor is returned without loading anything.
 *
 * @return the newly constructed similarity extractor
 * @throws IOException declared by this method; presumably raised by {@code load} (confirm
 *     against the extractor's API)
 */
private CentroidTfidfSimilarityExtractor<String, DocumentAnnotation> initCentroidTfIdfSimilarityExtractor()
    throws IOException {
  // Counts of covered-token text across the document.
  CoveredTextExtractor<Token> tokenText = new CoveredTextExtractor<Token>();
  CleartkExtractor.Count coveredTokenCounts =
      new CleartkExtractor.Count(new CleartkExtractor.Covered());
  CleartkExtractor<DocumentAnnotation, Token> documentTokenCounts =
      new CleartkExtractor<DocumentAnnotation, Token>(
          Token.class,
          tokenText,
          coveredTokenCounts);

  CentroidTfidfSimilarityExtractor<String, DocumentAnnotation> similarity =
      new CentroidTfidfSimilarityExtractor<String, DocumentAnnotation>(
          DocumentClassificationAnnotator.CENTROID_TFIDF_SIM_EXTRACTOR_KEY,
          documentTokenCounts);

  // Only load when a URI has been configured.
  if (this.tfIdfCentroidSimilarityUri != null) {
    similarity.load(this.tfIdfCentroidSimilarityUri);
  }
  return similarity;
}
/**
 * Creates the TF-IDF extractor for documents.
 *
 * <p>The extractor is keyed by {@code DocumentClassificationAnnotator.TFIDF_EXTRACTOR_KEY} and
 * is fed by a counting extractor over the covered text of each {@link Token} in a
 * {@link DocumentAnnotation}. When {@code this.tfIdfUri} is non-null, {@code load} is invoked
 * with that URI before the extractor is returned; otherwise the extractor is returned without
 * loading anything.
 *
 * @return the newly constructed TF-IDF extractor
 * @throws IOException declared by this method; presumably raised by {@code load} (confirm
 *     against the extractor's API)
 */
private TfidfExtractor<String, DocumentAnnotation> initTfIdfExtractor() throws IOException {
  // Counts of covered-token text across the document.
  CoveredTextExtractor<Token> tokenText = new CoveredTextExtractor<Token>();
  CleartkExtractor.Count coveredTokenCounts =
      new CleartkExtractor.Count(new CleartkExtractor.Covered());
  CleartkExtractor<DocumentAnnotation, Token> documentTokenCounts =
      new CleartkExtractor<DocumentAnnotation, Token>(
          Token.class,
          tokenText,
          coveredTokenCounts);

  TfidfExtractor<String, DocumentAnnotation> tfIdf =
      new TfidfExtractor<String, DocumentAnnotation>(
          DocumentClassificationAnnotator.TFIDF_EXTRACTOR_KEY,
          documentTokenCounts);

  // Only load when a URI has been configured.
  if (this.tfIdfUri != null) {
    tfIdf.load(this.tfIdfUri);
  }
  return tfIdf;
}
public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); // Create an extractor that gives word counts for a document this.extractor = new CleartkExtractor<DocumentAnnotation, Token>( Token.class, new CoveredTextExtractor<Token>(), new Count(new Covered())); }