edu.illinois.cs.cogcomp.nlp.tokenizer.TokenType Java code examples

/**
 * Classify a character into one of the tokenizer's character categories.
 *
 * <p>Underscores, alphabetic characters and digits are TEXT; whitespace is WHITESPACE; any
 * other character that is not an ISO control character and belongs to an assigned Unicode
 * block other than Specials is PUNCTUATION; everything else is UNPRINTABLE.
 *
 * @param c the character to categorize.
 * @return the ordinal of the associated {@code TokenType}.
 */
private int classify(char c) {
  if (c == '_' || Character.isAlphabetic(c) || Character.isDigit(c)) {
    return TokenType.TEXT.ordinal();
  }
  if (Character.isWhitespace(c)) {
    return TokenType.WHITESPACE.ordinal();
  }
  // U+FFFF (the value of java.awt.event.KeyEvent.CHAR_UNDEFINED) lies inside the Specials
  // block (U+FFF0..U+FFFF), so the block test below already rejects it; no AWT constant is
  // needed here.
  Character.UnicodeBlock block = Character.UnicodeBlock.of(c);
  boolean printable = !Character.isISOControl(c)
      && block != null
      && block != Character.UnicodeBlock.SPECIALS;
  return printable ? TokenType.PUNCTUATION.ordinal() : TokenType.UNPRINTABLE.ordinal();
}

/**
 * Classify a character into one of the tokenizer's character categories.
 *
 * <p>Underscores, alphabetic characters and digits are TEXT; whitespace is WHITESPACE; any
 * other character that is not an ISO control character and belongs to an assigned Unicode
 * block other than Specials is PUNCTUATION; everything else is UNPRINTABLE.
 *
 * @param c the character to categorize.
 * @return the ordinal of the associated {@code TokenType}.
 */
private int classify(char c) {
  if (c == '_' || Character.isAlphabetic(c) || Character.isDigit(c)) {
    return TokenType.TEXT.ordinal();
  }
  if (Character.isWhitespace(c)) {
    return TokenType.WHITESPACE.ordinal();
  }
  // U+FFFF (the value of java.awt.event.KeyEvent.CHAR_UNDEFINED) lies inside the Specials
  // block (U+FFF0..U+FFFF), so the block test below already rejects it; no AWT constant is
  // needed here.
  Character.UnicodeBlock block = Character.UnicodeBlock.of(c);
  boolean printable = !Character.isISOControl(c)
      && block != null
      && block != Character.UnicodeBlock.SPECIALS;
  return printable ? TokenType.PUNCTUATION.ordinal() : TokenType.UNPRINTABLE.ordinal();
}

/**
 * Classify a character into one of the tokenizer's character categories.
 *
 * <p>Underscores, alphabetic characters and digits are TEXT; whitespace is WHITESPACE; any
 * other character that is not an ISO control character and belongs to an assigned Unicode
 * block other than Specials is PUNCTUATION; everything else is UNPRINTABLE.
 *
 * @param c the character to categorize.
 * @return the ordinal of the associated {@code TokenType}.
 */
private int classify(char c) {
  if (c == '_' || Character.isAlphabetic(c) || Character.isDigit(c)) {
    return TokenType.TEXT.ordinal();
  }
  if (Character.isWhitespace(c)) {
    return TokenType.WHITESPACE.ordinal();
  }
  // U+FFFF (the value of java.awt.event.KeyEvent.CHAR_UNDEFINED) lies inside the Specials
  // block (U+FFF0..U+FFFF), so the block test below already rejects it; no AWT constant is
  // needed here.
  Character.UnicodeBlock block = Character.UnicodeBlock.of(c);
  boolean printable = !Character.isISOControl(c)
      && block != null
      && block != Character.UnicodeBlock.SPECIALS;
  return printable ? TokenType.PUNCTUATION.ordinal() : TokenType.UNPRINTABLE.ordinal();
}

/**
 * Look ahead from the current position and return the run of TEXT characters found there.
 * The scan stops at the first character that {@code classify} does not report as TEXT, or at
 * the end of the input; {@code current} itself is not modified.
 *
 * @return the substring from {@code current} up to (not including) the first non-TEXT
 *         character; empty when the character at {@code current} is not TEXT.
 */
String getNextWord() {
  final int wordType = TokenType.TEXT.ordinal();
  int end = current;
  // Advance past every consecutive TEXT character.
  while (end < this.text.length && classify(this.text[end]) == wordType) {
    end++;
  }
  return textstring.substring(current, end);
}

/**
 * Look ahead from the current position and return the run of TEXT characters found there.
 * The scan stops at the first character that {@code classify} does not report as TEXT, or at
 * the end of the input; {@code current} itself is not modified.
 *
 * @return the substring from {@code current} up to (not including) the first non-TEXT
 *         character; empty when the character at {@code current} is not TEXT.
 */
String getNextWord() {
  final int wordType = TokenType.TEXT.ordinal();
  int end = current;
  // Advance past every consecutive TEXT character.
  while (end < this.text.length && classify(this.text[end]) == wordType) {
    end++;
  }
  return textstring.substring(current, end);
}

/**
 * Look ahead from the current position and return the run of TEXT characters found there.
 * The scan stops at the first character that {@code classify} does not report as TEXT, or at
 * the end of the input; {@code current} itself is not modified.
 *
 * @return the substring from {@code current} up to (not including) the first non-TEXT
 *         character; empty when the character at {@code current} is not TEXT.
 */
String getNextWord() {
  final int wordType = TokenType.TEXT.ordinal();
  int end = current;
  // Advance past every consecutive TEXT character.
  while (end < this.text.length && classify(this.text[end]) == wordType) {
    end++;
  }
  return textstring.substring(current, end);
}

  /**
   * Handle a character while the word of the current state is being examined.
   *
   * <p>If the current word is an apostrophe and the upcoming word is a known contraction, or
   * the current word is a period and {@code token} is a digit (probably a decimal number),
   * the current state is re-typed as TEXT so it ends up as part of a word. Otherwise the
   * current state is popped and a new {@code IN_WORD} state is pushed.
   *
   * @param token the character being processed.
   */
  @Override
  public void process(char token) {
    final String pending = getCurrent().getWord();
    // Short-circuiting keeps the lookahead restricted to the apostrophe case.
    final boolean mergeIntoWord =
        (pending.equals("'") && Contractions.contains(getNextWord()))
            || (pending.equals(".") && Character.isDigit(token));
    if (mergeIntoWord) {
      // Keep the current state, only switch its type to text.
      getCurrent().stateindex = TokenType.TEXT.ordinal();
      state = getCurrent().stateindex;
    } else {
      pop(current);
      push(new State(TokenizerState.IN_WORD), current);
    }
  }
},

  /**
   * Handle a character while the word of the current state is being examined.
   *
   * <p>If the current word is an apostrophe and the upcoming word is a known contraction, or
   * the current word is a period and {@code token} is a digit (probably a decimal number),
   * the current state is re-typed as TEXT so it ends up as part of a word. Otherwise the
   * current state is popped and a new {@code IN_WORD} state is pushed.
   *
   * @param token the character being processed.
   */
  @Override
  public void process(char token) {
    final String pending = getCurrent().getWord();
    // Short-circuiting keeps the lookahead restricted to the apostrophe case.
    final boolean mergeIntoWord =
        (pending.equals("'") && Contractions.contains(getNextWord()))
            || (pending.equals(".") && Character.isDigit(token));
    if (mergeIntoWord) {
      // Keep the current state, only switch its type to text.
      getCurrent().stateindex = TokenType.TEXT.ordinal();
      state = getCurrent().stateindex;
    } else {
      pop(current);
      push(new State(TokenizerState.IN_WORD), current);
    }
  }
},

  /**
   * Handle a character while the word of the current state is being examined.
   *
   * <p>If the current word is an apostrophe and the upcoming word is a known contraction, or
   * the current word is a period and {@code token} is a digit (probably a decimal number),
   * the current state is re-typed as TEXT so it ends up as part of a word. Otherwise the
   * current state is popped and a new {@code IN_WORD} state is pushed.
   *
   * @param token the character being processed.
   */
  @Override
  public void process(char token) {
    final String pending = getCurrent().getWord();
    // Short-circuiting keeps the lookahead restricted to the apostrophe case.
    final boolean mergeIntoWord =
        (pending.equals("'") && Contractions.contains(getNextWord()))
            || (pending.equals(".") && Character.isDigit(token));
    if (mergeIntoWord) {
      // Keep the current state, only switch its type to text.
      getCurrent().stateindex = TokenType.TEXT.ordinal();
      state = getCurrent().stateindex;
    } else {
      pop(current);
      push(new State(TokenizerState.IN_WORD), current);
    }
  }
},

Javadoc

These are the types of tokens we deal with.

Most used methods

ordinal

Popular in Java

Reactive rest calls using spring rest template
putExtra (Intent)
getOriginalFilename (MultipartFile)
Return the original filename in the client's filesystem. This may contain path information depending
onCreateOptionsMenu (Activity)
ObjectMapper (com.fasterxml.jackson.databind)
ObjectMapper provides functionality for reading and writing JSON, either to and from basic POJOs (Pl
Executors (java.util.concurrent)
Factory and utility methods for Executor, ExecutorService, ScheduledExecutorService, ThreadFactory,
ReentrantLock (java.util.concurrent.locks)
A reentrant mutual exclusion Lock with the same basic behavior and semantics as the implicit monitor
VirtualMachine (com.sun.tools.attach)
A Java virtual machine. A VirtualMachine represents a Java virtual machine to which this Java vir
JList (javax.swing)
Scheduler (org.quartz)
This is the main interface of a Quartz Scheduler. A Scheduler maintains a registry of org.quartz.Job
Github Copilot alternatives

How to use TokenType in edu.illinois.cs.cogcomp.nlp.tokenizer

Best Java code snippets using edu.illinois.cs.cogcomp.nlp.tokenizer.TokenType (Showing top 9 results out of 315)

How to use
TokenType
in
edu.illinois.cs.cogcomp.nlp.tokenizer