/**
 * Entry point: runs a full NLP pre-processing pipeline over every file in a directory.
 *
 * <p>args[0] is the path of the directory containing the input files. The pipeline reads
 * each file's URI, loads its text, then applies sentence detection, tokenization,
 * part-of-speech tagging, and constituency parsing. Results are left in the CAS only
 * (no writer component is added here).
 *
 * @throws Exception any failure while building or running the pipeline is propagated
 */
public static void main(String[] args) throws Exception {
  File filesDirectory = new File(args[0]); // directory of input documents
  SimplePipeline.runPipeline(
      UriCollectionReader.getCollectionReaderFromDirectory(filesDirectory), // one CAS per file URI
      UriToDocumentTextAnnotator.getDescription(), // read each URI's contents into the document text
      SentenceAnnotator.getDescription(),
      TokenAnnotator.getDescription(),
      PosTaggerAnnotator.getDescription(),
      ParserAnnotator.getDescription());
} }
/**
 * Tokenizes the CAS, either one configured window annotation at a time or, when no
 * window type is set, over the whole document text. Token offsets are shifted by the
 * window's begin so they are absolute within the document.
 *
 * @param jCas the CAS to annotate
 * @throws AnalysisEngineProcessException wrapping any failure during tokenization
 */
public void process(JCas jCas) throws AnalysisEngineProcessException {
  try {
    if (!typesInitialized) initializeTypes(jCas);
    if (windowType == null) {
      // No window type configured: tokenize the entire document text at offset 0.
      createTokens(tokenizer.getTokens(jCas.getDocumentText()), 0, jCas);
    } else {
      // Tokenize each window annotation independently, offsetting by its begin position.
      FSIterator<Annotation> windowIterator = jCas.getAnnotationIndex(windowType).iterator();
      while (windowIterator.hasNext()) {
        Annotation window = windowIterator.next();
        List<Token> windowTokens = tokenizer.getTokens(window.getCoveredText());
        createTokens(windowTokens, window.getBegin(), jCas);
      }
    }
  } catch (Exception e) {
    // Re-throw as the UIMA-standard processing exception, preserving the original cause.
    throw new AnalysisEngineProcessException(e);
  }
}
UriToDocumentTextAnnotator.getDescription(),      // read each URI's contents into the document text
SentenceAnnotator.getDescription(),
TokenAnnotator.getDescription(),
PosTaggerAnnotator.getDescription(),
DefaultSnowballStemmer.getDescription("English"), // Snowball stemmer configured for English
/**
 * Entry point: tags every document under the configured input directory with
 * part-of-speech labels and writes one annotated token per line to the output file.
 *
 * @throws UIMAException if a pipeline component fails to initialize or run
 * @throws IOException if the input cannot be read or the output cannot be written
 */
public static void main(String[] args) throws UIMAException, IOException {
  Options options = new Options();
  options.parseOptions(args);

  // One CAS per file found in the input directory.
  CollectionReader collectionReader =
      UriCollectionReader.getCollectionReaderFromDirectory(options.inputDirectoryName);

  // Standard pre-processing chain: load text, then sentences, tokens, POS tags.
  AnalysisEngineDescription textLoader = UriToDocumentTextAnnotator.getDescription();
  AnalysisEngineDescription sentenceDetector = SentenceAnnotator.getDescription();
  AnalysisEngineDescription tokenSplitter = TokenAnnotator.getDescription();
  AnalysisEngineDescription tagger = PosTaggerAnnotator.getDescription();

  // Writes each Token annotation as one line, formatted by TokenAnnotationWriter.
  AnalysisEngineDescription writer = AnalysisEngineFactory.createPrimitiveDescription(
      LineWriter.class,
      LineWriter.PARAM_OUTPUT_FILE_NAME,
      options.outputFileName,
      LineWriter.PARAM_OUTPUT_ANNOTATION_CLASS_NAME,
      Token.class.getName(),
      LineWriter.PARAM_ANNOTATION_WRITER_CLASS_NAME,
      TokenAnnotationWriter.class.getName());

  SimplePipeline.runPipeline(
      collectionReader, textLoader, sentenceDetector, tokenSplitter, tagger, writer);
  System.out.println("results written to " + options.outputFileName);
}
/**
 * Entry point: parses every file in a directory and serializes each annotated CAS
 * as XMI into the output directory.
 *
 * <p>args[0] is the input directory of documents; args[1] is the output directory
 * for the XMI files. Output files are named from each CAS's view URI.
 *
 * @throws Exception any failure while building or running the pipeline is propagated
 */
public static void main(String[] args) throws Exception {
  File filesDirectory = new File(args[0]); // directory of input documents
  String outputDirectory = args[1];        // destination for the serialized XMI files
  SimplePipeline.runPipeline(
      UriCollectionReader.getCollectionReaderFromDirectory(filesDirectory), // one CAS per file URI
      UriToDocumentTextAnnotator.getDescription(), // read each URI's contents into the document text
      SentenceAnnotator.getDescription(),
      TokenAnnotator.getDescription(),
      PosTaggerAnnotator.getDescription(),
      ParserAnnotator.getDescription(),
      AnalysisEngineFactory.createPrimitiveDescription(
          XWriter.class,
          XWriter.PARAM_OUTPUT_DIRECTORY_NAME,
          outputDirectory,
          XWriter.PARAM_FILE_NAMER_CLASS_NAME,
          ViewURIFileNamer.class.getName())); // name output files from each CAS's view URI
} }
/**
 * Entry point: tags every document under the input directory with part-of-speech
 * labels and writes one annotated token per line to the output file.
 *
 * @throws Exception if option parsing or any pipeline stage fails
 */
public static void main(String[] args) throws Exception {
  Options options = CliFactory.parseArguments(Options.class, args);

  // One CAS per file found in the input directory.
  CollectionReader collectionReader =
      UriCollectionReader.getCollectionReaderFromDirectory(options.getInputDirectory());

  // Standard pre-processing chain: load text, then sentences, tokens, POS tags.
  AnalysisEngineDescription textLoader = UriToDocumentTextAnnotator.getDescription();
  AnalysisEngineDescription sentenceDetector = SentenceAnnotator.getDescription();
  AnalysisEngineDescription tokenSplitter = TokenAnnotator.getDescription();
  AnalysisEngineDescription tagger = PosTaggerAnnotator.getDescription();

  // Writes each Token annotation as one line, formatted by TokenAnnotationWriter.
  AnalysisEngineDescription writer = AnalysisEngineFactory.createEngineDescription(
      LineWriter.class,
      LineWriter.PARAM_OUTPUT_FILE_NAME,
      options.getOutputFile(),
      LineWriter.PARAM_OUTPUT_ANNOTATION_CLASS_NAME,
      Token.class.getName(),
      LineWriter.PARAM_ANNOTATION_WRITER_CLASS_NAME,
      TokenAnnotationWriter.class.getName());

  SimplePipeline.runPipeline(
      collectionReader, textLoader, sentenceDetector, tokenSplitter, tagger, writer);
  System.out.println("results written to " + options.getOutputFile());
}
builder.add(TokenAnnotator.getDescription()); // tokenization stage
public AggregateBuilder buildTrainingAggregate() throws ResourceInitializationException { AggregateBuilder builder = new AggregateBuilder(); builder.add(UriToDocumentTextAnnotator.getDescription()); // NLP pre-processing components builder.add(SentenceAnnotator.getDescription()); builder.add(TokenAnnotator.getDescription()); builder.add(PosTaggerAnnotator.getDescription()); builder.add(DefaultSnowballStemmer.getDescription("English")); // This will extract the features for summarization builder.add(AnalysisEngineFactory.createEngineDescription( SumBasicAnnotator.class, DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME, SumBasicDataWriter.class.getName(), DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY, this.modelDirectory.getPath(), SumBasicAnnotator.PARAM_TOKEN_FIELD, this.tokenField.name(), SumBasicAnnotator.PARAM_STOPWORDS_URI, stopwordsFile.toURI())); // Save off xmis for re-reading builder.add(AnalysisEngineFactory.createEngineDescription( XmiWriter.class, XmiWriter.PARAM_OUTPUT_DIRECTORY, xmiDirectory.getPath())); return builder; }
builder.add(TokenAnnotator.getDescription()); // tokenization stage
FilesCollectionReader.getCollectionReader(inputFileOrDir), // one CAS per input file
SentenceAnnotator.getDescription(),
TokenAnnotator.getDescription(),
PosTaggerAnnotator.getDescription(),
DefaultSnowballStemmer.getDescription("English"),          // Snowball stemmer configured for English
builder.add(TokenAnnotator.getDescription()); // Tokenization
builder.add(DefaultSnowballStemmer.getDescription("English")); // Stemming
public static void main(String[] args) throws Exception { Options options = new Options(); options.parseOptions(args); // a reader that loads the URIs of the text file CollectionReader reader = UriCollectionReader.getCollectionReaderFromFiles(Arrays.asList(options.textFile)); // assemble the classification pipeline AggregateBuilder aggregate = new AggregateBuilder(); // an annotator that loads the text from the training file URIs aggregate.add(UriToDocumentTextAnnotator.getDescription()); // annotators that identify sentences, tokens and part-of-speech tags in the text aggregate.add(SentenceAnnotator.getDescription()); aggregate.add(TokenAnnotator.getDescription()); aggregate.add(PosTaggerAnnotator.getDescription()); // our NamedEntityChunker annotator, configured to classify on the new texts aggregate.add(AnalysisEngineFactory.createPrimitiveDescription( NamedEntityChunker.class, CleartkSequenceAnnotator.PARAM_IS_TRAINING, false, GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH, new File(options.modelDirectory, "model.jar"))); // a very simple annotator that just prints out any named entities we found aggregate.add(AnalysisEngineFactory.createPrimitiveDescription(PrintNamedEntityMentions.class)); // run the classification pipeline on the new texts SimplePipeline.runPipeline(reader, aggregate.createAggregateDescription()); }
builder.add(TokenAnnotator.getDescription()); // Tokenization
builder.add(DefaultSnowballStemmer.getDescription("English")); // Stemming
builder.add(TokenAnnotator.getDescription()); // Tokenization
builder.add(DefaultSnowballStemmer.getDescription("English")); // Stemming
public static void main(String[] args) throws Exception { Options options = CliFactory.parseArguments(Options.class, args); // a reader that loads the URIs of the text file CollectionReader reader = UriCollectionReader.getCollectionReaderFromFiles(Arrays.asList(options.getTextFile())); // assemble the classification pipeline AggregateBuilder aggregate = new AggregateBuilder(); // an annotator that loads the text from the training file URIs aggregate.add(UriToDocumentTextAnnotator.getDescription()); // annotators that identify sentences, tokens and part-of-speech tags in the text aggregate.add(SentenceAnnotator.getDescription()); aggregate.add(TokenAnnotator.getDescription()); aggregate.add(PosTaggerAnnotator.getDescription()); // our NamedEntityChunker annotator, configured to classify on the new texts aggregate.add(AnalysisEngineFactory.createEngineDescription( NamedEntityChunker.class, CleartkSequenceAnnotator.PARAM_IS_TRAINING, false, GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH, JarClassifierBuilder.getModelJarFile(options.getModelDirectory()))); // a very simple annotator that just prints out any named entities we found aggregate.add(AnalysisEngineFactory.createEngineDescription(PrintNamedEntityMentions.class)); // run the classification pipeline on the new texts SimplePipeline.runPipeline(reader, aggregate.createAggregateDescription()); }
builder.add(TokenAnnotator.getDescription()); // tokenization stage
builder.add(DefaultSnowballStemmer.getDescription("English")); // English Snowball stemming
aggregate.add(TokenAnnotator.getDescription()); // tokenization stage
aggregate.add(PosTaggerAnnotator.getDescription()); // part-of-speech tagging stage
aggregate.add(AnalysisEngineFactory.createEngineDescription( // call continues on the following lines
builder.add(TokenAnnotator.getDescription()); // tokenization stage
builder.add(DefaultSnowballStemmer.getDescription("English")); // English Snowball stemming
builder.add(TokenAnnotator.getDescription()); // Tokenization
builder.add(DefaultSnowballStemmer.getDescription("English")); // Stemming
aggregate.add(TokenAnnotator.getDescription()); // tokenization stage
aggregate.add(PosTaggerAnnotator.getDescription()); // part-of-speech tagging stage
aggregate.add(AnalysisEngineFactory.createPrimitiveDescription( // call continues on the following lines