/**
 * Initializes this annotator by building the combined feature extractor from
 * its four components: TF-IDF, centroid TF-IDF similarity, zero-mean/unit-stddev
 * normalization, and min-max normalization.
 *
 * @param context the UIMA context supplied by the framework
 * @throws ResourceInitializationException if an extractor's model data cannot be read
 */
@Override
public void initialize(UimaContext context) throws ResourceInitializationException {
  super.initialize(context);
  try {
    // Each init* helper builds one component extractor; some load model
    // statistics from configured URIs and may therefore throw IOException.
    TfidfExtractor<String, DocumentAnnotation> tfIdfExtractor = initTfIdfExtractor();
    CentroidTfidfSimilarityExtractor<String, DocumentAnnotation> simExtractor =
        initCentroidTfIdfSimilarityExtractor();
    ZeroMeanUnitStddevExtractor<String, DocumentAnnotation> zmusExtractor = initZmusExtractor();
    MinMaxNormalizationExtractor<String, DocumentAnnotation> minmaxExtractor = initMinMaxExtractor();
    this.extractor = new CombinedExtractor1<DocumentAnnotation>(
        tfIdfExtractor,
        simExtractor,
        zmusExtractor,
        minmaxExtractor);
  } catch (IOException e) {
    // Wrap I/O failures (e.g. missing model files) in the UIMA initialization exception,
    // preserving the cause.
    throw new ResourceInitializationException(e);
  }
}
public void process(JCas jCas) throws AnalysisEngineProcessException { DocumentAnnotation doc = (DocumentAnnotation) jCas.getDocumentAnnotationFs(); Instance<String> instance = new Instance<String>(); instance.addAll(this.extractor.extract(jCas, doc)); if (isTraining()) { UsenetDocument document = JCasUtil.selectSingle(jCas, UsenetDocument.class); instance.setOutcome(document.getCategory()); this.dataWriter.write(instance); } else { // This is classification, so classify and create UsenetDocument annotation String result = this.classifier.classify(instance.getFeatures()); UsenetDocument document = new UsenetDocument(jCas, 0, jCas.getDocumentText().length()); document.setCategory(result); document.addToIndexes(); // System.out.println("classified " + ViewURIUtil.getURI(jCas) + " as " + result + "."); } }
/**
 * Builds the min-max normalization extractor over sentence and token counts.
 *
 * @return the configured extractor, with previously saved min/max statistics
 *     loaded when {@code minmaxUri} is set
 * @throws IOException if the statistics cannot be read from {@code minmaxUri}
 */
private MinMaxNormalizationExtractor<String, DocumentAnnotation> initMinMaxExtractor()
    throws IOException {
  // Raw features to be normalized: counts of Sentence and Token annotations.
  CombinedExtractor1<DocumentAnnotation> countsExtractor =
      new CombinedExtractor1<DocumentAnnotation>(
          new CountAnnotationExtractor<DocumentAnnotation>(Sentence.class),
          new CountAnnotationExtractor<DocumentAnnotation>(Token.class));
  MinMaxNormalizationExtractor<String, DocumentAnnotation> result =
      new MinMaxNormalizationExtractor<String, DocumentAnnotation>(
          MINMAX_EXTRACTOR_KEY,
          countsExtractor);
  // A null URI means no pre-computed statistics are available to load.
  if (this.minmaxUri != null) {
    result.load(this.minmaxUri);
  }
  return result;
}
/**
 * Builds the zero-mean/unit-stddev normalization extractor over sentence and
 * token counts.
 *
 * @return the configured extractor, with previously saved mean/stddev
 *     statistics loaded when {@code zmusUri} is set
 * @throws IOException if the statistics cannot be read from {@code zmusUri}
 */
private ZeroMeanUnitStddevExtractor<String, DocumentAnnotation> initZmusExtractor()
    throws IOException {
  // Raw features to be normalized: counts of Sentence and Token annotations.
  CombinedExtractor1<DocumentAnnotation> countsExtractor =
      new CombinedExtractor1<DocumentAnnotation>(
          new CountAnnotationExtractor<DocumentAnnotation>(Sentence.class),
          new CountAnnotationExtractor<DocumentAnnotation>(Token.class));
  ZeroMeanUnitStddevExtractor<String, DocumentAnnotation> result =
      new ZeroMeanUnitStddevExtractor<String, DocumentAnnotation>(
          ZMUS_EXTRACTOR_KEY,
          countsExtractor);
  // A null URI means no pre-computed statistics are available to load.
  if (this.zmusUri != null) {
    result.load(this.zmusUri);
  }
  return result;
}
@Override public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); // the token feature extractor: text, char pattern (uppercase, digits, etc.), and part-of-speech this.extractor = new CombinedExtractor1<Token>( new FeatureFunctionExtractor<Token>( new CoveredTextExtractor<Token>(), new CharacterCategoryPatternFunction<Token>(PatternType.REPEATS_MERGED)), new TypePathExtractor<Token>(Token.class, "pos")); // the context feature extractor: the features above for the 3 preceding and 3 following tokens this.contextExtractor = new CleartkExtractor<Token, Token>( Token.class, this.extractor, new Preceding(3), new Following(3)); // the chunking definition: Tokens will be combined to form NamedEntityMentions, with labels // from the "mentionType" attribute so that we get B-location, I-person, etc. this.chunking = new BioChunking<Token, NamedEntityMention>( Token.class, NamedEntityMention.class, "mentionType"); }