/**
 * Builds a {@code PTBTokenizerFactory} whose tokens are {@code CoreLabel}
 * objects created through a fresh {@code CoreLabelTokenFactory}.
 *
 * @param options A String of options, handed unchanged to the factory
 *                constructor. For the default, recommended options for
 *                PTB-style tokenization compatibility, pass in an empty String.
 * @return A PTBTokenizerFactory that returns CoreLabel objects
 */
public static PTBTokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory(String options) {
  CoreLabelTokenFactory coreLabelMaker = new CoreLabelTokenFactory();
  return new PTBTokenizerFactory<>(coreLabelMaker, options);
}
/**
 * Creates a {@code FrenchTokenizerFactory} backed by a new
 * {@code CoreLabelTokenFactory}.
 *
 * @return A TokenizerFactory that produces CoreLabel tokens
 */
public static TokenizerFactory<CoreLabel> newTokenizerFactory() {
  CoreLabelTokenFactory coreLabelMaker = new CoreLabelTokenFactory();
  return new FrenchTokenizerFactory<>(coreLabelMaker);
}
/**
 * Creates a {@code PTBTokenizerFactory} that makes CoreLabel tokens.
 * The third constructor flag is always passed as {@code false}.
 *
 * @param tokenizeNLs Forwarded as the first constructor argument
 * @param invertible Forwarded as the second constructor argument
 * @return A PTBTokenizerFactory backed by a new CoreLabelTokenFactory
 */
public static PTBTokenizerFactory<CoreLabel> newPTBTokenizerFactory(boolean tokenizeNLs, boolean invertible) {
  CoreLabelTokenFactory coreLabelMaker = new CoreLabelTokenFactory();
  return new PTBTokenizerFactory<>(tokenizeNLs, invertible, false, coreLabelMaker);
}
/**
 * Creates a {@code SpanishTokenizerFactory} backed by a new
 * {@code CoreLabelTokenFactory}.
 *
 * @return A TokenizerFactory that produces CoreLabel tokens
 */
public static TokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory() {
  CoreLabelTokenFactory coreLabelMaker = new CoreLabelTokenFactory();
  return new SpanishTokenizerFactory<>(coreLabelMaker);
}
/**
 * Creates an {@code ArabicTokenizer} that makes CoreLabel tokens.
 *
 * @param r The Reader handed to the tokenizer
 * @param lexerProperties Properties handed unchanged to the tokenizer constructor
 * @return An ArabicTokenizer backed by a new CoreLabelTokenFactory
 */
public static ArabicTokenizer<CoreLabel> newArabicTokenizer(Reader r, Properties lexerProperties) {
  CoreLabelTokenFactory coreLabelMaker = new CoreLabelTokenFactory();
  return new ArabicTokenizer<>(r, coreLabelMaker, lexerProperties);
}
/**
 * Creates a {@code WhitespaceTokenizer} that makes CoreLabel tokens.
 *
 * @param r The Reader handed to the tokenizer
 * @param tokenizeNLs Forwarded as the last constructor argument
 * @return A WhitespaceTokenizer backed by a new CoreLabelTokenFactory
 */
public static WhitespaceTokenizer<CoreLabel> newCoreLabelWhitespaceTokenizer(Reader r, boolean tokenizeNLs) {
  CoreLabelTokenFactory coreLabelMaker = new CoreLabelTokenFactory();
  return new WhitespaceTokenizer<>(coreLabelMaker, r, tokenizeNLs);
}
/**
 * Creates a {@code WhitespaceTokenizerFactory} that makes CoreLabel tokens.
 *
 * @param options Option string handed unchanged to the factory constructor
 * @return A WhitespaceTokenizerFactory backed by a new CoreLabelTokenFactory
 */
public static WhitespaceTokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory(String options) {
  CoreLabelTokenFactory coreLabelMaker = new CoreLabelTokenFactory();
  return new WhitespaceTokenizerFactory<>(coreLabelMaker, options);
}
/**
 * Creates a {@code WhitespaceTokenizerFactory} that makes CoreLabel tokens,
 * using the factory's single-argument constructor.
 *
 * @return A WhitespaceTokenizerFactory backed by a new CoreLabelTokenFactory
 */
public static WhitespaceTokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory() {
  CoreLabelTokenFactory coreLabelMaker = new CoreLabelTokenFactory();
  return new WhitespaceTokenizerFactory<>(coreLabelMaker);
}
/**
 * Creates a {@code WhitespaceTokenizer} that makes CoreLabel tokens, with
 * the final boolean constructor argument fixed to {@code false}.
 *
 * @param r The Reader handed to the tokenizer
 * @return A WhitespaceTokenizer backed by a new CoreLabelTokenFactory
 */
public static WhitespaceTokenizer<CoreLabel> newCoreLabelWhitespaceTokenizer(Reader r) {
  CoreLabelTokenFactory coreLabelMaker = new CoreLabelTokenFactory();
  return new WhitespaceTokenizer<>(coreLabelMaker, r, false);
}
/**
 * Creates an {@code ArabicTokenizerFactory} backed by a new
 * {@code CoreLabelTokenFactory}.
 *
 * @return A TokenizerFactory that produces CoreLabel tokens
 */
public static TokenizerFactory<CoreLabel> newTokenizerFactory() {
  CoreLabelTokenFactory coreLabelMaker = new CoreLabelTokenFactory();
  return new ArabicTokenizerFactory<>(coreLabelMaker);
}
/**
 * Builds a PTBTokenizer that makes CoreLabel tokens.
 * Newlines can optionally be returned as their own token; such tokens
 * carry the text {@code AbstractTokenizer.NEWLINE_TOKEN}.
 * The fourth constructor flag is always passed as {@code false}.
 *
 * @param r The Reader to read tokens from
 * @param tokenizeNLs Whether to return newlines as separate tokens
 *                    (otherwise they normally disappear as whitespace)
 * @param invertible If true, the produced CoreLabels will have fields for
 *                   the string before and after, and the character offsets
 * @return A PTBTokenizer which returns CoreLabel objects
 */
public static PTBTokenizer<CoreLabel> newPTBTokenizer(Reader r, boolean tokenizeNLs, boolean invertible) {
  CoreLabelTokenFactory coreLabelMaker = new CoreLabelTokenFactory();
  return new PTBTokenizer<>(r, tokenizeNLs, invertible, false, coreLabelMaker);
}
/**
 * Returns the default tokens aggregator, building and caching it on first use.
 * <p>
 * An aggregator is only built when {@code defaultTokensAggregators} is set or
 * {@code aggregateToTokens} is true; otherwise the cached field is returned
 * as is (which may be {@code null}). When no explicit aggregators are
 * configured, {@code CoreMapAttributeAggregator.DEFAULT_NUMERIC_TOKENS_AGGREGATORS}
 * is used. A {@code CoreLabelTokenFactory} is supplied only when
 * {@code aggregateToTokens} is true.
 *
 * @return The cached aggregator, or {@code null} if none is configured
 */
public CoreMapAggregator getDefaultTokensAggregator() {
  // Already built: reuse the cached instance.
  if (defaultTokensAggregator != null) {
    return defaultTokensAggregator;
  }
  // Nothing configured that would require an aggregator.
  if (defaultTokensAggregators == null && !aggregateToTokens) {
    return defaultTokensAggregator;
  }

  CoreLabelTokenFactory labelFactory = null;
  if (aggregateToTokens) {
    labelFactory = new CoreLabelTokenFactory();
  }

  Map<Class, CoreMapAttributeAggregator> attributeAggregators;
  if (defaultTokensAggregators != null) {
    attributeAggregators = defaultTokensAggregators;
  } else {
    attributeAggregators = CoreMapAttributeAggregator.DEFAULT_NUMERIC_TOKENS_AGGREGATORS;
  }

  defaultTokensAggregator = CoreMapAggregator.getAggregator(attributeAggregators, null, labelFactory);
  return defaultTokensAggregator;
}
/**
 * Return a tokenizer factory which might be suitable for tokenizing text that
 * will be used with this Treebank/Language pair.
 * This implementation returns the factory produced by
 * {@code SpanishTokenizer.factory}, given a new {@code CoreLabelTokenFactory}
 * and the option string {@code "invertible,ptb3Escaping=true,splitAll=true"}.
 *
 * @return A tokenizer factory
 */
@Override
public TokenizerFactory<? extends HasWord> getTokenizerFactory() {
  final String tokenizerOptions = "invertible,ptb3Escaping=true,splitAll=true";
  return SpanishTokenizer.factory(new CoreLabelTokenFactory(), tokenizerOptions);
}
/**
 * Initializes this object from the given map string, using a new
 * {@code CoreLabelTokenFactory} as the token factory and
 * {@code edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation} as the
 * tokens annotation class name.
 *
 * @param map The map specification, forwarded to the three-argument {@code init}
 */
public void init(String map) {
  CoreTokenFactory<IN> defaultFactory = (CoreTokenFactory<IN>) new CoreLabelTokenFactory();
  String defaultTokensClassName = "edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation";
  init(map, defaultFactory, defaultTokensClassName);
}
/**
 * Obtains a tokenizer factory from
 * {@code PTBTokenizerFactory.newPTBTokenizerFactory} using a new
 * {@code CoreLabelTokenFactory}.
 *
 * @param options Option string forwarded unchanged to the factory method
 * @return A PTBTokenizerFactory that vends CoreLabel tokens, configured
 *         with the given options
 */
public static TokenizerFactory<CoreLabel> coreLabelFactory(String options) {
  CoreLabelTokenFactory coreLabelMaker = new CoreLabelTokenFactory();
  return PTBTokenizerFactory.newPTBTokenizerFactory(coreLabelMaker, options);
}
/**
 * Reads the tokenFactory and tokensAnnotationClassName from
 * {@link SeqClassifierFlags}, then delegates to the three-argument
 * {@code init}.
 * <p>
 * When {@code flags.tokensAnnotationClassName} is unset,
 * {@code edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation} is used.
 * When {@code flags.tokenFactory} is unset, a {@code CoreLabelTokenFactory}
 * is used; otherwise the named class is loaded and instantiated through its
 * no-argument constructor.
 *
 * @param flags The flags to read the settings from
 * @throws RuntimeException if the class named by {@code flags.tokenFactory}
 *         cannot be loaded, instantiated, or cast to a token factory
 */
@Override
public void init(SeqClassifierFlags flags) {
  if (flags.tokensAnnotationClassName != null) {
    this.tokensAnnotationClassName = flags.tokensAnnotationClassName;
  } else {
    this.tokensAnnotationClassName = "edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation";
  }

  if (flags.tokenFactory != null) {
    try {
      // Class.newInstance() is deprecated because it rethrows checked constructor
      // exceptions unchecked; go through the no-arg Constructor instead.
      this.tokenFactory = (CoreTokenFactory<IN>)
          Class.forName(flags.tokenFactory).getDeclaredConstructor().newInstance();
    } catch (Exception e) {
      throw new RuntimeException("Unable to instantiate token factory " + flags.tokenFactory, e);
    }
  } else {
    this.tokenFactory = (CoreTokenFactory<IN>) new CoreLabelTokenFactory();
  }

  init(flags, this.tokenFactory, this.tokensAnnotationClassName);
}
/**
 * Initializes this object from properties.
 * <p>
 * Every key is looked up under the prefix {@code name + "."} (no prefix when
 * {@code name} is null):
 * <ul>
 *   <li>{@code delimiter} - if present, compiled into {@code delimiterPattern}</li>
 *   <li>{@code replaceWhitespace} - boolean, defaulting to the current value</li>
 *   <li>{@code columns} - the map string forwarded to the three-argument {@code init}</li>
 *   <li>{@code tokens} - tokens annotation class name, defaulting to
 *       {@code edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation}</li>
 *   <li>{@code tokenFactory} - class name of the token factory, instantiated
 *       through its no-argument constructor; a {@code CoreLabelTokenFactory}
 *       is used when absent</li>
 * </ul>
 *
 * @param name Property name prefix, or null for none
 * @param props The properties to read
 * @throws RuntimeException if the configured token factory class cannot be
 *         loaded, instantiated, or cast to a token factory
 */
public void init(String name, Properties props) {
  String prefix = (name == null) ? "" : name + ".";

  String delimiterRegex = props.getProperty(prefix + "delimiter");
  if (delimiterRegex != null) {
    delimiterPattern = Pattern.compile(delimiterRegex);
  }
  replaceWhitespace = PropertiesUtils.getBool(props, prefix + "replaceWhitespace", replaceWhitespace);

  String mapString = props.getProperty(prefix + "columns");
  tokensAnnotationClassName =
      props.getProperty(prefix + "tokens", "edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation");

  String tokenFactoryClassName = props.getProperty(prefix + "tokenFactory");
  if (tokenFactoryClassName != null) {
    try {
      // Class.newInstance() is deprecated because it rethrows checked constructor
      // exceptions unchecked; go through the no-arg Constructor instead.
      this.tokenFactory = (CoreTokenFactory<IN>)
          Class.forName(tokenFactoryClassName).getDeclaredConstructor().newInstance();
    } catch (Exception e) {
      throw new RuntimeException("Unable to instantiate token factory " + tokenFactoryClassName, e);
    }
  } else {
    this.tokenFactory = (CoreTokenFactory<IN>) new CoreLabelTokenFactory();
  }

  init(mapString, this.tokenFactory, this.tokensAnnotationClassName);
}
/**
 * Creates a mention extractor for the file named by the
 * {@code Constants.MUC_PROP} property.
 * <p>
 * Stores the result of {@code IOUtils.slurpFile} for that file name, resets
 * the current offset to 0, sets up a PTB tokenizer factory that uses
 * {@code new CoreLabelTokenFactory(false)} with an empty option string, and
 * finally loads the Stanford processor from the same properties.
 *
 * @param dict Dictionaries passed to the superclass constructor
 * @param props Properties supplying the file name and processor settings
 * @param semantics Semantics passed to the superclass constructor
 * @throws Exception propagated from file reading or processor loading
 */
public MUCMentionExtractor(Dictionaries dict, Properties props, Semantics semantics) throws Exception {
  super(dict, semantics);

  final String mucFileName = props.getProperty(Constants.MUC_PROP);
  this.fileContents = IOUtils.slurpFile(mucFileName);
  this.currentOffset = 0;

  this.tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(false), "");
  this.stanfordProcessor = loadStanfordProcessor(props);
}
public static void main(String[] args) throws Exception { if (args.length != 2) { log.info("usage: java TaggerDemo2 modelFile fileToTag"); return; } MaxentTagger tagger = new MaxentTagger(args[0]); TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "untokenizable=noneKeep"); BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8")); PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8")); DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r); documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory); for (List<HasWord> sentence : documentPreprocessor) { List<TaggedWord> tSentence = tagger.tagSentence(sentence); pw.println(SentenceUtils.listToString(tSentence, false)); } // print the adjectives in one more sentence. This shows how to get at words and tags in a tagged sentence. List<HasWord> sent = SentenceUtils.toWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", "."); List<TaggedWord> taggedSent = tagger.tagSentence(sent); for (TaggedWord tw : taggedSent) { if (tw.tag().startsWith("JJ")) { pw.println(tw.word()); } } pw.close(); }
/** * Initialize the featureFactory and other variables based on the passed in * flags. * * @param flags A specification of the AbstractSequenceClassifier to construct. */ public AbstractSequenceClassifier(SeqClassifierFlags flags) { this.flags = flags; // Thang Sep13: allow for multiple feature factories. this.featureFactories = Generics.newArrayList(); if (flags.featureFactory != null) { FeatureFactory<IN> factory = new MetaClass(flags.featureFactory).createInstance(flags.featureFactoryArgs); // for compatibility featureFactories.add(factory); } if (flags.featureFactories != null) { for (int i = 0; i < flags.featureFactories.length; i++) { FeatureFactory<IN> indFeatureFactory = new MetaClass(flags.featureFactories[i]). createInstance(flags.featureFactoriesArgs.get(i)); this.featureFactories.add(indFeatureFactory); } } if (flags.tokenFactory == null) { tokenFactory = (CoreTokenFactory<IN>) new CoreLabelTokenFactory(); } else { this.tokenFactory = new MetaClass(flags.tokenFactory).createInstance(flags.tokenFactoryArgs); } pad = tokenFactory.makeToken(); windowSize = flags.maxLeft + 1; reinit(); }