edu.stanford.nlp.process.CoreLabelTokenFactory.<init> java code examples

/**
 * Builds a {@link PTBTokenizerFactory} whose tokens are {@link CoreLabel}
 * objects, configured with the supplied option string.
 *
 * @param options A String of options. For the default, recommended
 *                options for PTB-style tokenization compatibility, pass
 *                in an empty String.
 * @return A PTBTokenizerFactory backed by a fresh CoreLabelTokenFactory
 */
public static PTBTokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory(String options) {
 CoreLabelTokenFactory labelFactory = new CoreLabelTokenFactory();
 return new PTBTokenizerFactory<>(labelFactory, options);
}

/**
 * Creates a {@code FrenchTokenizerFactory} that produces {@link CoreLabel} tokens.
 *
 * @return A tokenizer factory backed by a fresh CoreLabelTokenFactory
 */
public static TokenizerFactory<CoreLabel> newTokenizerFactory() {
 CoreLabelTokenFactory labelFactory = new CoreLabelTokenFactory();
 return new FrenchTokenizerFactory<>(labelFactory);
}

/**
 * Creates a {@link PTBTokenizerFactory} that produces {@link CoreLabel} tokens.
 * The factory's third boolean constructor argument is always passed as
 * {@code false} (NOTE(review): its meaning is not visible here - confirm against
 * the PTBTokenizerFactory constructor).
 *
 * @param tokenizeNLs forwarded unchanged as the first constructor argument
 * @param invertible forwarded unchanged as the second constructor argument
 * @return A PTBTokenizerFactory backed by a fresh CoreLabelTokenFactory
 */
public static PTBTokenizerFactory<CoreLabel> newPTBTokenizerFactory(boolean tokenizeNLs, boolean invertible) {
 CoreLabelTokenFactory labelFactory = new CoreLabelTokenFactory();
 return new PTBTokenizerFactory<>(tokenizeNLs, invertible, false, labelFactory);
}

/**
 * Creates a {@code SpanishTokenizerFactory} that produces {@link CoreLabel} tokens.
 *
 * @return A tokenizer factory backed by a fresh CoreLabelTokenFactory
 */
public static TokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory() {
 CoreLabelTokenFactory labelFactory = new CoreLabelTokenFactory();
 return new SpanishTokenizerFactory<>(labelFactory);
}

/**
 * Creates an {@link ArabicTokenizer} over the given reader that produces
 * {@link CoreLabel} tokens.
 *
 * @param r The Reader handed to the tokenizer
 * @param lexerProperties Properties handed through to the tokenizer unchanged
 * @return An ArabicTokenizer backed by a fresh CoreLabelTokenFactory
 */
public static ArabicTokenizer<CoreLabel> newArabicTokenizer(Reader r, Properties lexerProperties) {
 CoreLabelTokenFactory labelFactory = new CoreLabelTokenFactory();
 return new ArabicTokenizer<>(r, labelFactory, lexerProperties);
}

/**
 * Creates a {@link WhitespaceTokenizer} over the given reader that produces
 * {@link CoreLabel} tokens.
 *
 * @param r The Reader handed to the tokenizer
 * @param tokenizeNLs forwarded unchanged to the tokenizer constructor
 * @return A WhitespaceTokenizer backed by a fresh CoreLabelTokenFactory
 */
public static WhitespaceTokenizer<CoreLabel> newCoreLabelWhitespaceTokenizer(Reader r, boolean tokenizeNLs) {
 CoreLabelTokenFactory labelFactory = new CoreLabelTokenFactory();
 return new WhitespaceTokenizer<>(labelFactory, r, tokenizeNLs);
}

/**
 * Creates a {@link WhitespaceTokenizerFactory} that produces {@link CoreLabel}
 * tokens and is configured with the supplied option string.
 *
 * @param options Option string forwarded unchanged to the factory constructor
 * @return A WhitespaceTokenizerFactory backed by a fresh CoreLabelTokenFactory
 */
public static WhitespaceTokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory(String options) {
 CoreLabelTokenFactory labelFactory = new CoreLabelTokenFactory();
 return new WhitespaceTokenizerFactory<>(labelFactory, options);
}

/**
 * Creates a {@link WhitespaceTokenizerFactory} that produces {@link CoreLabel}
 * tokens, using the factory's single-argument constructor.
 *
 * @return A WhitespaceTokenizerFactory backed by a fresh CoreLabelTokenFactory
 */
public static WhitespaceTokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory() {
 CoreLabelTokenFactory labelFactory = new CoreLabelTokenFactory();
 return new WhitespaceTokenizerFactory<>(labelFactory);
}

/**
 * Creates a {@link WhitespaceTokenizer} over the given reader that produces
 * {@link CoreLabel} tokens. The tokenizer's third constructor argument is
 * always passed as {@code false}.
 *
 * @param r The Reader handed to the tokenizer
 * @return A WhitespaceTokenizer backed by a fresh CoreLabelTokenFactory
 */
public static WhitespaceTokenizer<CoreLabel> newCoreLabelWhitespaceTokenizer(Reader r) {
 CoreLabelTokenFactory labelFactory = new CoreLabelTokenFactory();
 return new WhitespaceTokenizer<>(labelFactory, r, false);
}

/**
 * Creates an {@code ArabicTokenizerFactory} that produces {@link CoreLabel} tokens.
 *
 * @return A tokenizer factory backed by a fresh CoreLabelTokenFactory
 */
public static TokenizerFactory<CoreLabel> newTokenizerFactory() {
 CoreLabelTokenFactory labelFactory = new CoreLabelTokenFactory();
 return new ArabicTokenizerFactory<>(labelFactory);
}

/**
 * Creates a {@link PTBTokenizer} over the given reader that makes
 * {@link CoreLabel} tokens. Newlines can optionally be returned as their
 * own tokens, whose text is the value of
 * {@code AbstractTokenizer.NEWLINE_TOKEN}.
 *
 * @param r The Reader to read tokens from
 * @param tokenizeNLs Whether to return newlines as separate tokens
 *         (otherwise they normally disappear as whitespace)
 * @param invertible if set to true, then will produce CoreLabels which
 *         will have fields for the string before and after, and the
 *         character offsets
 * @return A PTBTokenizer which returns CoreLabel objects
 */
public static PTBTokenizer<CoreLabel> newPTBTokenizer(Reader r, boolean tokenizeNLs, boolean invertible) {
 CoreLabelTokenFactory labelFactory = new CoreLabelTokenFactory();
 // The fourth constructor argument is fixed to false for this convenience overload.
 return new PTBTokenizer<>(r, tokenizeNLs, invertible, false, labelFactory);
}

/**
 * Returns the default tokens aggregator, creating and caching it on first use.
 * An aggregator is only created when either {@code defaultTokensAggregators}
 * is set or {@code aggregateToTokens} is true; otherwise the cached value
 * (possibly {@code null}) is returned as is.
 *
 * @return The cached or newly created aggregator, or {@code null} if none applies
 */
public CoreMapAggregator getDefaultTokensAggregator() {
 // Already built: reuse the cached instance.
 if (defaultTokensAggregator != null) {
  return defaultTokensAggregator;
 }
 // Nothing configured that would call for an aggregator.
 if (defaultTokensAggregators == null && !aggregateToTokens) {
  return defaultTokensAggregator;
 }

 // A token factory is only supplied when results are aggregated into tokens.
 CoreLabelTokenFactory labelFactory = null;
 if (aggregateToTokens) {
  labelFactory = new CoreLabelTokenFactory();
 }

 // Fall back to the numeric-token defaults when no aggregators were configured.
 Map<Class, CoreMapAttributeAggregator> attributeAggregators = defaultTokensAggregators;
 if (attributeAggregators == null) {
  attributeAggregators = CoreMapAttributeAggregator.DEFAULT_NUMERIC_TOKENS_AGGREGATORS;
 }

 defaultTokensAggregator = CoreMapAggregator.getAggregator(attributeAggregators, null, labelFactory);
 return defaultTokensAggregator;
}

/**
 * Return a tokenizer factory which might be suitable for tokenizing text that
 * will be used with this Treebank/Language pair. This override returns the
 * factory obtained from {@code SpanishTokenizer.factory}, producing CoreLabel
 * tokens with the options {@code invertible,ptb3Escaping=true,splitAll=true}.
 *
 * @return A tokenizer factory
 */
@Override
public TokenizerFactory<? extends HasWord> getTokenizerFactory() {
 final String tokenizerOptions = "invertible,ptb3Escaping=true,splitAll=true";
 CoreLabelTokenFactory labelFactory = new CoreLabelTokenFactory();
 return SpanishTokenizer.factory(labelFactory, tokenizerOptions);
}

/**
 * Initializes this object from a map string, using a fresh
 * CoreLabelTokenFactory and the TokensAnnotation class name as defaults
 * for the three-argument {@code init}.
 *
 * @param map forwarded unchanged as the first argument of the delegate {@code init}
 */
public void init(String map) {
 final String tokensAnnotationName = "edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation";
 CoreTokenFactory<IN> defaultFactory = (CoreTokenFactory<IN>) new CoreLabelTokenFactory();
 init(map, defaultFactory, tokensAnnotationName);
}

/**
 * Creates a tokenizer factory that vends CoreLabel tokens, configured with
 * the supplied option string.
 *
 * @param options Option string forwarded unchanged to
 *                {@code PTBTokenizerFactory.newPTBTokenizerFactory}
 * @return A PTBTokenizerFactory that vends CoreLabel tokens
 */
public static TokenizerFactory<CoreLabel> coreLabelFactory(String options) {
 CoreLabelTokenFactory labelFactory = new CoreLabelTokenFactory();
 return PTBTokenizerFactory.newPTBTokenizerFactory(labelFactory, options);
}

/**
 * Reads the tokenFactory and tokensAnnotationClassName from
 * {@link SeqClassifierFlags}, then delegates to the three-argument
 * {@code init}. When the flags leave a value unset, the TokensAnnotation
 * class name and a fresh CoreLabelTokenFactory are used instead.
 *
 * @param flags Flags supplying {@code tokensAnnotationClassName} and
 *              {@code tokenFactory} (a class name); either may be null
 * @throws RuntimeException if the class named by {@code flags.tokenFactory}
 *         cannot be loaded or instantiated through its no-argument constructor
 */
@Override
public void init(SeqClassifierFlags flags) {
 if (flags.tokensAnnotationClassName != null) {
  this.tokensAnnotationClassName = flags.tokensAnnotationClassName;
 } else {
  this.tokensAnnotationClassName = "edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation";
 }
 if (flags.tokenFactory != null) {
  try {
   // Class.newInstance() is deprecated (it rethrows checked constructor
   // exceptions unchecked); go through the no-arg Constructor instead.
   this.tokenFactory = (CoreTokenFactory<IN>)
     Class.forName(flags.tokenFactory).getDeclaredConstructor().newInstance();
  } catch (Exception e) {
   // Keep the cause and say which class failed so misconfiguration is easy to spot.
   throw new RuntimeException("Unable to instantiate token factory " + flags.tokenFactory, e);
  }
 } else {
  this.tokenFactory = (CoreTokenFactory<IN>) new CoreLabelTokenFactory();
 }
 init(flags, this.tokenFactory, this.tokensAnnotationClassName);
}

/**
 * Configures this object from properties and then delegates to the
 * three-argument {@code init}. Every property key is prefixed with
 * {@code name + "."} (no prefix when {@code name} is null). The keys read are
 * {@code delimiter}, {@code replaceWhitespace}, {@code columns},
 * {@code tokens} and {@code tokenFactory}.
 *
 * @param name Prefix for the property keys; may be null for no prefix
 * @param props Properties to read the configuration from
 * @throws RuntimeException if the class named by the {@code tokenFactory}
 *         property cannot be loaded or instantiated through its no-argument
 *         constructor
 */
public void init(String name, Properties props) {
 String prefix = (name == null)? "":name + ".";
 String delimiterRegex = props.getProperty(prefix + "delimiter");
 if (delimiterRegex != null) {
  delimiterPattern = Pattern.compile(delimiterRegex);
 }
 // The current field value serves as the default when the property is absent.
 replaceWhitespace = PropertiesUtils.getBool(props, prefix + "replaceWhitespace", replaceWhitespace);
 String mapString = props.getProperty(prefix + "columns");
 tokensAnnotationClassName = props.getProperty(prefix + "tokens",
     "edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation");
 String tokenFactoryClassName =  props.getProperty(prefix + "tokenFactory");
 if (tokenFactoryClassName != null) {
  try {
   // Class.newInstance() is deprecated (it rethrows checked constructor
   // exceptions unchecked); go through the no-arg Constructor instead.
   this.tokenFactory = (CoreTokenFactory<IN>)
     Class.forName(tokenFactoryClassName).getDeclaredConstructor().newInstance();
  } catch (Exception e) {
   // Keep the cause and say which class failed so misconfiguration is easy to spot.
   throw new RuntimeException("Unable to instantiate token factory " + tokenFactoryClassName, e);
  }
 } else {
  this.tokenFactory = (CoreTokenFactory<IN>) new CoreLabelTokenFactory();
 }
 init(mapString, this.tokenFactory, this.tokensAnnotationClassName);
}

/**
 * Creates an extractor that reads the whole file named by the
 * {@code Constants.MUC_PROP} property into memory, starts at offset 0, and
 * sets up a PTB tokenizer factory (empty option string) plus the Stanford
 * processor built from the same properties.
 *
 * @param dict passed to the superclass constructor
 * @param props source of the input file name and the processor configuration
 * @param semantics passed to the superclass constructor
 * @throws Exception if reading the file or loading the processor fails
 */
public MUCMentionExtractor(Dictionaries dict, Properties props, Semantics semantics) throws Exception {
 super(dict, semantics);
 currentOffset = 0;

 String mucPath = props.getProperty(Constants.MUC_PROP);
 fileContents = IOUtils.slurpFile(mucPath);

 // NOTE(review): the meaning of the boolean passed to CoreLabelTokenFactory is not visible here - confirm.
 CoreLabelTokenFactory labelFactory = new CoreLabelTokenFactory(false);
 tokenizerFactory = PTBTokenizer.factory(labelFactory, "");

 stanfordProcessor = loadStanfordProcessor(props);
}

/**
 * Tags every sentence of a UTF-8 text file and prints the tagged sentences
 * to standard output, followed by the adjectives found in one hard-coded
 * example sentence.
 *
 * @param args exactly two arguments: the tagger model file and the file to tag;
 *             otherwise a usage message is logged and the method returns
 * @throws Exception if the model or the input file cannot be read
 */
public static void main(String[] args) throws Exception {
 if (args.length != 2) {
  log.info("usage: java TaggerDemo2 modelFile fileToTag");
  return;
 }
 MaxentTagger tagger = new MaxentTagger(args[0]);
 TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
                   "untokenizable=noneKeep");
 // try-with-resources closes the input reader (previously never closed) and
 // flushes/closes the writer even when tagging throws part-way through.
 try (BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
      PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"))) {
  DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
  documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
  for (List<HasWord> sentence : documentPreprocessor) {
   List<TaggedWord> tSentence = tagger.tagSentence(sentence);
   pw.println(SentenceUtils.listToString(tSentence, false));
  }
  // print the adjectives in one more sentence. This shows how to get at words and tags in a tagged sentence.
  List<HasWord> sent = SentenceUtils.toWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
  List<TaggedWord> taggedSent = tagger.tagSentence(sent);
  for (TaggedWord tw : taggedSent) {
   if (tw.tag().startsWith("JJ")) {
    pw.println(tw.word());
   }
  }
 }
}

/**
 * Initialize the featureFactory and other variables based on the passed in
 * flags.
 *
 * @param flags A specification of the AbstractSequenceClassifier to construct.
 */
public AbstractSequenceClassifier(SeqClassifierFlags flags) {
 this.flags = flags;
 // Thang Sep13: allow for multiple feature factories.
 this.featureFactories = Generics.newArrayList();
 if (flags.featureFactory != null) {
  FeatureFactory<IN> factory = new MetaClass(flags.featureFactory).createInstance(flags.featureFactoryArgs); // for compatibility
  featureFactories.add(factory);
 }
 if (flags.featureFactories != null) {
  for (int i = 0; i < flags.featureFactories.length; i++) {
   FeatureFactory<IN> indFeatureFactory = new MetaClass(flags.featureFactories[i]).
     createInstance(flags.featureFactoriesArgs.get(i));
   this.featureFactories.add(indFeatureFactory);
  }
 }
 if (flags.tokenFactory == null) {
  tokenFactory = (CoreTokenFactory<IN>) new CoreLabelTokenFactory();
 } else {
  this.tokenFactory = new MetaClass(flags.tokenFactory).createInstance(flags.tokenFactoryArgs);
 }
 pad = tokenFactory.makeToken();
 windowSize = flags.maxLeft + 1;
 reinit();
}

Javadoc

Constructor for a new token factory which will add in the word, the "current" annotation, and the begin/end position annotations.

Popular methods of CoreLabelTokenFactory

makeToken

Popular in Java

Finding current android device location
setScale (BigDecimal)
onCreateOptionsMenu (Activity)
runOnUiThread (Activity)
Runnable (java.lang)
Represents a command that can be executed. Often used to run code in a different Thread.
ByteBuffer (java.nio)
A buffer for bytes. A byte buffer can be created in either one of the following ways: * #allocate
Hashtable (java.util)
A plug-in replacement for JDK1.5 java.util.Hashtable. This version is based on org.cliffc.high_scale
Executor (java.util.concurrent)
An object that executes submitted Runnable tasks. This interface provides a way of decoupling task s
Executors (java.util.concurrent)
Factory and utility methods for Executor, ExecutorService, ScheduledExecutorService, ThreadFactory,
Join (org.hibernate.mapping)
Top Sublime Text plugins

How to use the edu.stanford.nlp.process.CoreLabelTokenFactory constructor

Best Java code snippets using edu.stanford.nlp.process.CoreLabelTokenFactory.<init> (Showing top 20 results out of 315)

How to use
edu.stanford.nlp.process.CoreLabelTokenFactory
constructor