edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer java code examples

/**
 * Reads mention-relation annotations -- including coreference -- from an ERE corpus, tokenizing
 * with a {@link TokenizerTextAnnotationBuilder} that wraps a default-constructed
 * {@link StatefulTokenizer}.
 *
 * @param ereCorpus the ERE corpus release (values from
 *        {@link edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREDocumentReader.EreCorpus})
 * @param corpusRoot corpus root location; forwarded unchanged to the delegate constructor
 * @param throwExceptionOnXmlParseFailure if 'true', throws exception if xml parser encounters
 *        e.g. mismatched open/close tags
 * @throws Exception if the delegate constructor fails
 */
public EREMentionRelationReader(
    EreCorpus ereCorpus, String corpusRoot, boolean throwExceptionOnXmlParseFailure)
    throws Exception {
  this(
      ereCorpus,
      new TokenizerTextAnnotationBuilder(new StatefulTokenizer()),
      corpusRoot,
      throwExceptionOnXmlParseFailure);
}

// Fragment from inside a loop body: `sentences`, `types` and `i` are defined outside this excerpt.
StatefulTokenizer statefulTokenizer = new StatefulTokenizer();
  // Fetch the i-th sentence and its parallel type entry.
  String sentence = sentences.get(i);
  String type = types.get(i);
  // tokenizeSentence yields a (String[], IntPair[]) pair; presumably token strings and their
  // character offsets -- confirm against StatefulTokenizer.
  Pair<String[], IntPair[]> tokenizedSentence = statefulTokenizer.tokenizeSentence(sentence);
  // Copy the String[] half into a resizable list (Arrays.asList alone is a fixed-size view).
  List<String> curTokens = new LinkedList<>(Arrays.asList(tokenizedSentence.getFirst()));
  int firstArgStart = 0;

/**
 * Creates a reader over an ERE English corpus that tokenizes with a
 * {@link TokenizerTextAnnotationBuilder} wrapping a default-constructed {@link StatefulTokenizer}.
 *
 * <p>NOTE(review): the earlier summary said this reads "mention-relation annotations -- including
 * coreference"; that wording looks copied from the mention-relation reader -- confirm what this
 * event reader actually extracts.
 *
 * @param ereCorpus the ERE corpus release (values from {@link EreCorpus})
 * @param corpusRoot the data root directory for the ERE corpus to be processed
 * @param throwExceptionOnXmlParseFailure if 'true', throws exception if xml parser encounters
 *        e.g. mismatched open/close tags
 * @throws Exception if the delegate constructor fails
 */
public EREEventReader(
    EreCorpus ereCorpus, String corpusRoot, boolean throwExceptionOnXmlParseFailure)
    throws Exception {
  this(
      ereCorpus,
      new TokenizerTextAnnotationBuilder(new StatefulTokenizer()),
      corpusRoot,
      throwExceptionOnXmlParseFailure);
}

/**
 * Creates a reader over an ERE English corpus that tokenizes with a
 * {@link TokenizerTextAnnotationBuilder} wrapping a default-constructed {@link StatefulTokenizer}.
 *
 * <p>NOTE(review): the earlier summary said this reads "mention-relation annotations -- including
 * coreference"; that wording looks copied from the mention-relation reader -- confirm what this
 * event reader actually extracts.
 *
 * @param ereCorpus the ERE corpus release (values from {@link EreCorpus})
 * @param corpusRoot the data root directory for the ERE corpus to be processed
 * @param throwExceptionOnXmlParseFailure if 'true', throws exception if xml parser encounters
 *        e.g. mismatched open/close tags
 * @throws Exception if the delegate constructor fails
 */
public EREEventReader(
    EreCorpus ereCorpus, String corpusRoot, boolean throwExceptionOnXmlParseFailure)
    throws Exception {
  this(
      ereCorpus,
      new TokenizerTextAnnotationBuilder(new StatefulTokenizer()),
      corpusRoot,
      throwExceptionOnXmlParseFailure);
}

/**
 * Reads mention-relation annotations -- including coreference -- from an ERE corpus, tokenizing
 * with a {@link TokenizerTextAnnotationBuilder} that wraps a default-constructed
 * {@link StatefulTokenizer}.
 *
 * @param ereCorpus the ERE corpus release (values from
 *        {@link edu.illinois.cs.cogcomp.nlp.corpusreaders.ereReader.EREDocumentReader.EreCorpus})
 * @param corpusRoot corpus root location; forwarded unchanged to the delegate constructor
 * @param throwExceptionOnXmlParseFailure if 'true', throws exception if xml parser encounters
 *        e.g. mismatched open/close tags
 * @throws Exception if the delegate constructor fails
 */
public EREMentionRelationReader(
    EreCorpus ereCorpus, String corpusRoot, boolean throwExceptionOnXmlParseFailure)
    throws Exception {
  this(
      ereCorpus,
      new TokenizerTextAnnotationBuilder(new StatefulTokenizer()),
      corpusRoot,
      throwExceptionOnXmlParseFailure);
}

/**
 * Initializes {@code tab} with a {@link TokenizerTextAnnotationBuilder} wrapping a
 * default-constructed {@link StatefulTokenizer}.
 */
public EnglishTokenizer() {
  StatefulTokenizer tokenizer = new StatefulTokenizer();
  tab = new TokenizerTextAnnotationBuilder(tokenizer);
}

/**
 * Initializes {@code tab} with a {@link TokenizerTextAnnotationBuilder} wrapping a
 * default-constructed {@link StatefulTokenizer}.
 */
public EnglishTokenizer() {
  StatefulTokenizer tokenizer = new StatefulTokenizer();
  tab = new TokenizerTextAnnotationBuilder(tokenizer);
}

/**
 * Reads Named Entity -- and possibly nominal mention -- annotation from an English ERE-format
 * corpus, tokenizing with a {@link TokenizerTextAnnotationBuilder} that wraps a
 * default-constructed {@link StatefulTokenizer}.
 *
 * @param ereCorpus the ERE corpus release; forwarded to the delegate constructor
 * @param corpusRoot corpus root location; forwarded to the delegate constructor
 * @param throwExceptionOnXmlParseFailure forwarded to the delegate constructor; by its name,
 *        selects whether an XML parse failure raises an exception
 * @param addNominalMentions a flag that if true, indicates that nominal mentions should be read,
 *        and that the view created should be named {@code ViewNames.MENTION_ERE}.
 * @param addFillers if 'true', indicates that non-coreferable mentions should be added.
 * @throws Exception if the delegate constructor fails
 */
public ERENerReader(
    EreCorpus ereCorpus,
    String corpusRoot,
    boolean throwExceptionOnXmlParseFailure,
    boolean addNominalMentions,
    boolean addFillers)
    throws Exception {
  this(
      ereCorpus,
      new TokenizerTextAnnotationBuilder(new StatefulTokenizer()),
      corpusRoot,
      throwExceptionOnXmlParseFailure,
      addNominalMentions,
      addFillers);
}

/**
 * Reads Named Entity -- and possibly nominal mention -- annotation from an English ERE-format
 * corpus, tokenizing with a {@link TokenizerTextAnnotationBuilder} that wraps a
 * default-constructed {@link StatefulTokenizer}.
 *
 * @param ereCorpus the ERE corpus release; forwarded to the delegate constructor
 * @param corpusRoot corpus root location; forwarded to the delegate constructor
 * @param throwExceptionOnXmlParseFailure forwarded to the delegate constructor; by its name,
 *        selects whether an XML parse failure raises an exception
 * @param addNominalMentions a flag that if true, indicates that nominal mentions should be read,
 *        and that the view created should be named {@code ViewNames.MENTION_ERE}.
 * @param addFillers if 'true', indicates that non-coreferable mentions should be added.
 * @throws Exception if the delegate constructor fails
 */
public ERENerReader(
    EreCorpus ereCorpus,
    String corpusRoot,
    boolean throwExceptionOnXmlParseFailure,
    boolean addNominalMentions,
    boolean addFillers)
    throws Exception {
  this(
      ereCorpus,
      new TokenizerTextAnnotationBuilder(new StatefulTokenizer()),
      corpusRoot,
      throwExceptionOnXmlParseFailure,
      addNominalMentions,
      addFillers);
}

/**
 * Builds an {@link XmlTextAnnotationMaker} for reading an ERE format English corpus.
 *
 * <p>Convenience overload: supplies a {@link TokenizerTextAnnotationBuilder} wrapping a
 * default-constructed {@link StatefulTokenizer} and delegates to the overload that takes a
 * {@link TextAnnotationBuilder}.
 *
 * @param ereCorpus which ERE release is being processed -- affects which tag blocks are marked
 * @param throwExceptionOnXmlParseFail if 'true', throw an exception if xml parser fails
 * @return an XmlTextAnnotationMaker configured for English ERE.
 */
public static XmlTextAnnotationMaker buildXmlTextAnnotationMaker(
    EreCorpus ereCorpus, boolean throwExceptionOnXmlParseFail) {
  return buildXmlTextAnnotationMaker(
      new TokenizerTextAnnotationBuilder(new StatefulTokenizer()),
      ereCorpus,
      throwExceptionOnXmlParseFail);
}

/**
 * Builds an {@link XmlTextAnnotationMaker} for reading an ERE format English corpus.
 *
 * <p>Convenience overload: supplies a {@link TokenizerTextAnnotationBuilder} wrapping a
 * default-constructed {@link StatefulTokenizer} and delegates to the overload that takes a
 * {@link TextAnnotationBuilder}.
 *
 * @param ereCorpus which ERE release is being processed -- affects which tag blocks are marked
 * @param throwExceptionOnXmlParseFail if 'true', throw an exception if xml parser fails
 * @return an XmlTextAnnotationMaker configured for English ERE.
 */
public static XmlTextAnnotationMaker buildXmlTextAnnotationMaker(
    EreCorpus ereCorpus, boolean throwExceptionOnXmlParseFail) {
  return buildXmlTextAnnotationMaker(
      new TokenizerTextAnnotationBuilder(new StatefulTokenizer()),
      ereCorpus,
      throwExceptionOnXmlParseFail);
}

/**
 * assumes files are all from a single source directory, and that no extraneous files are included in that directory.
 *
 * @param corpusName
 * @param sourceDirectory
 * @throws IOException
 */
public XmlFragmentWhitespacingDocumentReader(String corpusName, String sourceDirectory, String sourceFileExtension, String annotationFileExtension)
    throws Exception {
  super(CorpusReaderConfigurator.buildResourceManager(corpusName, sourceDirectory, sourceDirectory, sourceFileExtension, annotationFileExtension));
  taBuilder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
  numFiles = 0;
  numTextAnnotations = 0;
}

/**
 * assumes files are all from a single source directory, and that no extraneous files are included in that directory.
 *
 * @param corpusName
 * @param sourceDirectory
 * @throws IOException
 */
public XmlFragmentWhitespacingDocumentReader(String corpusName, String sourceDirectory, String sourceFileExtension, String annotationFileExtension)
    throws Exception {
  super(CorpusReaderConfigurator.buildResourceManager(corpusName, sourceDirectory, sourceDirectory, sourceFileExtension, annotationFileExtension));
  taBuilder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
  numFiles = 0;
  numTextAnnotations = 0;
}

/**
 * Builds a {@link TextAnnotation} for the given text and passes it to {@code co.getView}.
 *
 * <p>The tokenizer is created with hyphen splitting switched off. Both the corpus id and the text
 * id of the annotation are the literal "001".
 *
 * @param s the raw text to annotate
 * @return the TextAnnotation built from {@code s}; it is returned even when {@code co.getView}
 *         throws
 */
public TextAnnotation runNER(String s) {
  final boolean splitOnHyphens = false;
  StatefulTokenizer tokenizer = new StatefulTokenizer(splitOnHyphens, false);
  TextAnnotationBuilder builder = new TokenizerTextAnnotationBuilder(tokenizer);
  TextAnnotation annotation = builder.createTextAnnotation("001", "001", s);
  try {
    co.getView(annotation);
  } catch (Exception e) {
    // Best-effort: report the failure on stderr and still hand back the annotation.
    e.printStackTrace();
  }
  return annotation;
}

/**
 * Creates a TAC reader for English. The configuration comes from
 * {@code buildTACConfig(corpusRoot, Language.English)}; the annotation maker is built around a
 * {@link TokenizerTextAnnotationBuilder} wrapping a default-constructed {@link StatefulTokenizer}.
 *
 * @param corpusRoot corpus root handed to {@code buildTACConfig}
 * @param throwExceptionOnXmlParseFail handed to {@code buildXmlTextAnnotationMaker}
 * @throws Exception if configuration building or the superclass constructor fails
 */
public TACReader(String corpusRoot, boolean throwExceptionOnXmlParseFail) throws Exception {
  super(
      TACReader.buildTACConfig(corpusRoot, Language.English),
      buildXmlTextAnnotationMaker(
          new TokenizerTextAnnotationBuilder(new StatefulTokenizer()),
          throwExceptionOnXmlParseFail));
}

/**
 * Creates a TAC reader for English. The configuration comes from
 * {@code buildTACConfig(corpusRoot, Language.English)}; the annotation maker is built around a
 * {@link TokenizerTextAnnotationBuilder} wrapping a default-constructed {@link StatefulTokenizer}.
 *
 * @param corpusRoot corpus root handed to {@code buildTACConfig}
 * @param throwExceptionOnXmlParseFail handed to {@code buildXmlTextAnnotationMaker}
 * @throws Exception if configuration building or the superclass constructor fails
 */
public TACReader(String corpusRoot, boolean throwExceptionOnXmlParseFail) throws Exception {
  super(
      TACReader.buildTACConfig(corpusRoot, Language.English),
      buildXmlTextAnnotationMaker(
          new TokenizerTextAnnotationBuilder(new StatefulTokenizer()),
          throwExceptionOnXmlParseFail));
}

/**
 * Assembles a {@link BasicAnnotatorService} from a tokenizer-backed text annotation builder, the
 * annotators returned by {@code buildAnnotators()}, and the supplied configuration.
 *
 * @param rm non-default config options
 * @return AnnotatorService with specified NLP components
 * @throws IOException declared by this method; raised by the pipeline construction calls
 * @throws AnnotatorException declared by this method; raised by the pipeline construction calls
 */
public static BasicAnnotatorService buildPipeline(ResourceManager rm)
    throws IOException, AnnotatorException {
  // Both tokenizer flags are false; the first presumably disables hyphen splitting -- confirm
  // against StatefulTokenizer.
  StatefulTokenizer tokenizer = new StatefulTokenizer(false, false);
  TextAnnotationBuilder builder = new TokenizerTextAnnotationBuilder(tokenizer);
  Map<String, Annotator> annotatorsByView = buildAnnotators();
  return new BasicAnnotatorService(builder, annotatorsByView, rm);
}

/**
 * Creates a preprocessor backed by a {@link BasicAnnotatorService} with POS, lemma, shallow-parse
 * and dependency annotators.
 *
 * <p>Construction is best-effort: if building the service fails, the error is logged and
 * {@code annotator} is not assigned by this constructor.
 *
 * @param rm configuration merged with the {@link AnnotatorServiceConfigurator} default config
 */
public Preprocessor(ResourceManager rm) {
  ResourceManager fullRm =
      Configurator.mergeProperties(new AnnotatorServiceConfigurator().getDefaultConfig(), rm);
  try {
    // (true, false): the first flag presumably enables splitting on hyphens -- confirm against
    // StatefulTokenizer.
    TextAnnotationBuilder taBldr =
        new TokenizerTextAnnotationBuilder(new StatefulTokenizer(true, false));
    Map<String, Annotator> annotators = new HashMap<>();
    annotators.put(ViewNames.POS, new POSAnnotator());
    annotators.put(ViewNames.LEMMA, new IllinoisLemmatizer());
    annotators.put(ViewNames.SHALLOW_PARSE, new ChunkerAnnotator());
    annotators.put(ViewNames.DEPENDENCY, new DepAnnotator());
    annotator = new BasicAnnotatorService(taBldr, annotators, fullRm);
  } catch (Exception e) {
    // Pass the exception as the trailing argument so the stack trace and cause are logged too,
    // not only the (possibly null) message.
    logger.error("Unable to create preprocessor. \n{}", e.getMessage(), e);
  }
}

/**
 * Reads the file line by line, builds a {@link TextAnnotation} per line, runs the preprocessor and
 * the dependency annotator on it, and prints the resulting view to standard output.
 *
 * <p>A line whose annotation fails with {@link AnnotatorException} is reported on stderr and
 * skipped; processing continues with the next line.
 *
 * @param filepath path of the text file to annotate, one input per line
 * @throws IOException if the file cannot be opened
 */
private static void annotate(String filepath) throws IOException {
  DepAnnotator annotator = new DepAnnotator();
  TextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(true, false));
  Preprocessor preprocessor = new Preprocessor();
  // Files.lines keeps the file open until the stream is closed, so scope it with
  // try-with-resources to avoid leaking the file handle.
  try (java.util.stream.Stream<String> lines = Files.lines(Paths.get(filepath))) {
    lines.forEach(line -> {
      TextAnnotation ta = taBuilder.createTextAnnotation(line);
      try {
        preprocessor.annotate(ta);
        annotator.addView(ta);
        System.out.println(ta.getView(annotator.getViewName()).toString());
      } catch (AnnotatorException e) {
        e.printStackTrace();
      }
    });
  }
}

/**
 * Creates a preprocessor backed by a {@link BasicAnnotatorService} with POS, lemma, shallow-parse
 * and dependency annotators.
 *
 * <p>Construction is best-effort: if building the service fails, the error is logged and
 * {@code annotator} is not assigned by this constructor.
 *
 * @param rm configuration merged with the {@link AnnotatorServiceConfigurator} default config
 */
public Preprocessor(ResourceManager rm) {
  ResourceManager fullRm =
      Configurator.mergeProperties(new AnnotatorServiceConfigurator().getDefaultConfig(), rm);
  try {
    // (true, false): the first flag presumably enables splitting on hyphens -- confirm against
    // StatefulTokenizer.
    TextAnnotationBuilder taBldr =
        new TokenizerTextAnnotationBuilder(new StatefulTokenizer(true, false));
    Map<String, Annotator> annotators = new HashMap<>();
    annotators.put(ViewNames.POS, new POSAnnotator());
    annotators.put(ViewNames.LEMMA, new IllinoisLemmatizer());
    annotators.put(ViewNames.SHALLOW_PARSE, new ChunkerAnnotator());
    annotators.put(ViewNames.DEPENDENCY, new DepAnnotator());
    annotator = new BasicAnnotatorService(taBldr, annotators, fullRm);
  } catch (Exception e) {
    // Pass the exception as the trailing argument so the stack trace and cause are logged too,
    // not only the (possibly null) message.
    logger.error("Unable to create preprocessor. \n{}", e.getMessage(), e);
  }
}

Javadoc

This is the entry point to the tokenizer state machine. This class is thread-safe; the TokenizerStateMachine is not.

Most used methods

<init>
Takes a boolean indicating whether to split on dashes. The default constructor assumes we do.
tokenizeSentence

Popular in Java

Making http post requests using okhttp
getOriginalFilename (MultipartFile)
Return the original filename in the client's filesystem. This may contain path information depending…
requestLocationUpdates (LocationManager)
findViewById (Activity)
InetAddress (java.net)
An Internet Protocol (IP) address. This can be either an IPv4 address or an IPv6 address, and in pra
DecimalFormat (java.text)
A concrete subclass of NumberFormat that formats decimal numbers. It has a variety of features desig
NoSuchElementException (java.util)
Thrown when trying to retrieve an element past the end of an Enumeration or Iterator.
Pattern (java.util.regex)
Patterns are compiled regular expressions. In many cases, convenience methods such as String#matches
Kernel (java.awt.image)
JButton (javax.swing)
Top plugins for Android Studio

How to use StatefulTokenizer in edu.illinois.cs.cogcomp.nlp.tokenizer

Best Java code snippets using edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer (Showing top 20 results out of 315)

How to use
StatefulTokenizer
in
edu.illinois.cs.cogcomp.nlp.tokenizer