/** Builds a sequence pre-sized to {@code tokens.length}, copying each token in order. */
public TokenSequence(Token[] tokens) {
    this(tokens.length);
    for (Token token : tokens)
        this.add(token);
}
/** Appends every element of {@code tokens} to this sequence, preserving array order. */
public void addAll(Token[] tokens) {
    for (Token token : tokens)
        add(token);
}
/**
 * Accumulates the iterated pipe's output into this sequence.
 * The carrier's data must be a {@link Token}; {@code iteratedPipe} is unused here
 * but required by the accumulator interface.
 *
 * @throws IllegalArgumentException if the carrier's data is not a Token
 */
public void pipeOutputAccumulate(Instance carrier, Pipe iteratedPipe) {
    Object data = carrier.getData();
    // FIX: corrected the garbled error message ("can only accumulator Token's").
    if (!(data instanceof Token))
        throw new IllegalArgumentException("TokenSequence can only accumulate Tokens");
    add((Token) data);
}
/** Builds a sequence of Tokens from the string form of each object, in array order. */
public TokenSequence(Object[] tokens) {
    this(tokens.length);
    for (Object token : tokens)
        this.add(new Token(token.toString()));
}
/** Appends every token of {@code ts} to this sequence, preserving order. */
public void addAll(TokenSequence ts) {
    int count = ts.size();
    for (int index = 0; index < count; index++)
        add(ts.getToken(index));
}
/**
 * Generates a random token sequence of the given length by delegating to
 * {@code randomFeatureSequence} and wrapping each sampled object as a token entry.
 */
public TokenSequence randomTokenSequence(Random r, int length) {
    FeatureSequence features = randomFeatureSequence(r, length);
    TokenSequence result = new TokenSequence(length);
    for (int pos = 0; pos < length; pos++)
        result.add(features.getObjectAtPosition(pos));
    return result;
}
/**
 * Reads the entire stream into memory, lexes it, and returns one Token per lexeme.
 *
 * @param br the reader to consume fully
 * @return a TokenSequence containing one Token per lexer-produced string
 * @throws java.io.IOException if reading fails
 */
public TokenSequence pipe (BufferedReader br) throws java.io.IOException {
    final int BUFSIZE = 2048;
    char[] buf = new char[BUFSIZE];
    StringBuffer sb = new StringBuffer (BUFSIZE);
    int count;
    // BUG FIX: the original did `sb.append(buf)` inside a do/while, appending
    // the whole 2048-char buffer regardless of how many characters read()
    // actually returned — and even on immediate EOF (count == -1) — injecting
    // stale/NUL characters into the text. Append only the chars actually read.
    while ((count = br.read (buf, 0, BUFSIZE)) > 0)
        sb.append (buf, 0, count);
    lexer.setCharSequence ((CharSequence) sb);
    TokenSequence ts = new TokenSequence ();
    while (lexer.hasNext())
        ts.add (new Token ((String) lexer.next()));
    return ts;
}
/**
 * Splits each token's text with {@code regex}: group {@code targetGroup} is
 * collected into a new target sequence, and group {@code dataGroup} replaces
 * the token's text in place. Non-matching tokens are logged and left
 * unchanged (they contribute nothing to the target sequence).
 */
public Instance pipe (Instance carrier) {
    TokenSequence ts = (TokenSequence) carrier.getData();
    int n = ts.size();
    TokenSequence targets = new TokenSequence (n);
    for (int i = 0; i < n; i++) {
        Token token = ts.getToken(i);
        Matcher m = regex.matcher (token.getText());
        if (!m.matches()) {
            logger.warning ("Skipping token: No match of "+regex.pattern()
                            +" at token #"+i+" with text "+token.getText());
            continue;
        }
        targets.add (m.group(targetGroup));
        token.setText (m.group (dataGroup));
    }
    carrier.setTarget(targets);
    carrier.setData(ts);
    return carrier;
}
/**
 * Replaces the token sequence with n-grams: for every position and every
 * size in {@code gramSizes}, emits the '_'-joined text of the {@code len}
 * tokens ending at that position (unigrams reuse the Token object itself).
 */
public Instance pipe (Instance carrier) {
    TokenSequence ts = (TokenSequence) carrier.getData();
    TokenSequence tmpTS = new TokenSequence();
    for (int i = 0; i < ts.size(); i++) {
        Token t = ts.getToken(i);
        for (int j = 0; j < gramSizes.length; j++) {
            int len = gramSizes[j];
            // Skip invalid sizes and grams that would run off the front.
            if (len <= 0 || len > (i+1)) continue;
            if (len == 1) { tmpTS.add(t); continue; }
            // BUG FIX: the original concatenated the Token object itself
            // (`ts.getToken(i-k) + "_"`), relying on Token.toString(), which
            // may include feature/property info rather than just the text.
            // Use getText() explicitly; also dropped the redundant
            // `new String(...)` copy of the immutable text.
            String newTerm = t.getText();
            for (int k = 1; k < len; k++)
                newTerm = ts.getToken(i-k).getText() + "_" + newTerm;
            tmpTS.add(newTerm);
        }
    }
    carrier.setData(tmpTS);
    return carrier;
}
public Instance pipe (Instance carrier) { TokenSequence ts = (TokenSequence) carrier.getData(); // xxx This doesn't seem so efficient. Perhaps have TokenSequence // use a LinkedList, and remove Tokens from it? -? // But a LinkedList implementation of TokenSequence would be quite inefficient -AKM TokenSequence ret = new TokenSequence (); Token prevToken = null; for (int i = 0; i < ts.size(); i++) { Token t = ts.getToken(i); String s = t.getText(); if (CharSequenceLexer.LEX_ALPHA.matcher(s).matches()) { ret.add (t); prevToken = t; } else if (markDeletions && prevToken != null) prevToken.setProperty (FeatureSequenceWithBigrams.deletionMark, t.getText()); } carrier.setData(ret); return carrier; }
public Instance pipe (Instance carrier) { TokenSequence ts = (TokenSequence) carrier.getData(); // xxx This doesn't seem so efficient. Perhaps have TokenSequence // use a LinkedList, and remove Tokens from it? -? // But a LinkedList implementation of TokenSequence would be quite inefficient -AKM TokenSequence ret = new TokenSequence (); Token prevToken = null; for (int i = 0; i < ts.size(); i++) { Token t = ts.getToken(i); if (! stoplist.contains (caseSensitive ? t.getText() : t.getText().toLowerCase())) { // xxx Should we instead make and add a copy of the Token? ret.add (t); prevToken = t; } else if (markDeletions && prevToken != null) prevToken.setProperty (FeatureSequenceWithBigrams.deletionMark, t.getText()); } carrier.setData(ret); return carrier; }
/**
 * Drops single-space tokens and labels each remaining token "start" when it
 * immediately followed a space (or began the sequence), else "notstart".
 * The concatenated text of the kept tokens becomes the instance source.
 */
public Instance pipe(Instance carrier) {
    TokenSequence input = (TokenSequence) carrier.getData();
    TokenSequence kept = new TokenSequence();
    FeatureSequence labels = new FeatureSequence(getTargetAlphabet());
    StringBuffer text = new StringBuffer();
    boolean afterSpace = true;
    for (int i = 0; i < input.size(); i++) {
        Token token = input.getToken(i);
        if (token.getText().equals(" ")) {
            afterSpace = true;
            continue;
        }
        text.append(token.getText());
        kept.add(token);
        labels.add(afterSpace ? "start" : "notstart");
        afterSpace = false;
    }
    if (isTargetProcessing())
        carrier.setTarget(labels);
    carrier.setData(kept);
    carrier.setSource(text.toString());
    return carrier;
}
/**
 * Filters out " " tokens; every surviving token is labeled "start" if the
 * previous input token was a space (or it is the first token), otherwise
 * "notstart". Also stores the space-free concatenated text as the source.
 */
public Instance pipe(Instance carrier) {
    TokenSequence original = (TokenSequence) carrier.getData();
    TokenSequence filtered = new TokenSequence();
    FeatureSequence labelSeq = new FeatureSequence(getTargetAlphabet());
    StringBuffer joined = new StringBuffer();
    boolean precededBySpace = true;
    for (int pos = 0; pos < original.size(); pos++) {
        Token current = original.getToken(pos);
        if (current.getText().equals(" ")) {
            precededBySpace = true;
        } else {
            joined.append(current.getText());
            filtered.add(current);
            labelSeq.add(precededBySpace ? "start" : "notstart");
            precededBySpace = false;
        }
    }
    if (isTargetProcessing())
        carrier.setTarget(labelSeq);
    carrier.setData(filtered);
    carrier.setSource(joined.toString());
    return carrier;
}
/**
 * Removes space tokens and emits a parallel label sequence: "start" for a
 * token that opened the sequence or followed a space, "notstart" otherwise.
 * The surviving tokens' concatenated text is saved as the instance source.
 */
public Instance pipe (Instance carrier) {
    TokenSequence input = (TokenSequence) carrier.getData();
    TokenSequence kept = new TokenSequence();
    FeatureSequence labels = new FeatureSequence(getTargetAlphabet());
    StringBuffer text = new StringBuffer();
    boolean sawSpace = true;
    for (int i = 0; i < input.size(); i++) {
        Token token = input.getToken(i);
        if (token.getText().equals(" ")) {
            sawSpace = true;
            continue;
        }
        text.append (token.getText());
        kept.add (token);
        labels.add (sawSpace ? "start" : "notstart");
        sawSpace = false;
    }
    if (isTargetProcessing())
        carrier.setTarget(labels);
    carrier.setData(kept);
    carrier.setSource(text.toString());
    return carrier;
}
/**
 * Strips " " tokens from the data, labeling each kept token "start" when it
 * begins the sequence or directly follows a space, "notstart" otherwise;
 * the joined text of the kept tokens is stored on the instance as source.
 */
public Instance pipe (Instance carrier) {
    TokenSequence source = (TokenSequence) carrier.getData();
    TokenSequence result = new TokenSequence();
    FeatureSequence boundaryLabels = new FeatureSequence(getTargetAlphabet());
    StringBuffer concatenated = new StringBuffer();
    boolean boundary = true;
    int size = source.size();
    for (int idx = 0; idx < size; idx++) {
        Token token = source.getToken(idx);
        if (token.getText().equals(" ")) {
            boundary = true;
        } else {
            concatenated.append (token.getText());
            result.add (token);
            boundaryLabels.add (boundary ? "start" : "notstart");
            boundary = false;
        }
    }
    if (isTargetProcessing())
        carrier.setTarget(boundaryLabels);
    carrier.setData(result);
    carrier.setSource(concatenated.toString());
    return carrier;
}
/**
 * Converts the carrier's data to n-grams: a raw CharSequence is ngramified
 * wholesale into a new TokenSequence; a TokenSequence is ngramified
 * token-by-token. Any other data type is rejected.
 *
 * @throws IllegalArgumentException if the data is neither a CharSequence
 *         nor a TokenSequence
 */
public Instance pipe (Instance carrier) {
    Object data = carrier.getData();
    if (data instanceof CharSequence) {
        carrier.setData(new TokenSequence (ngramify ((CharSequence) data)));
    } else if (data instanceof TokenSequence) {
        TokenSequence input = (TokenSequence) data;
        TokenSequence output = new TokenSequence ();
        for (int i = 0; i < input.size(); i++)
            output.add (ngramify (input.getToken(i).getText()));
        carrier.setData(output);
    } else {
        throw new IllegalArgumentException ("Unhandled type "+data.getClass());
    }
    return carrier;
}
/**
 * Tokenizes the carrier's CharSequence data with {@code lexer}, storing each
 * token as a StringSpan (start/end offsets into the original string) inside
 * a StringTokenization that wraps the source text.
 */
public Instance pipe (Instance carrier) {
    CharSequence text = (CharSequence) carrier.getData();
    lexer.setCharSequence (text);
    TokenSequence tokens = new StringTokenization (text);
    while (lexer.hasNext()) {
        lexer.next();
        tokens.add (new StringSpan (text, lexer.getStartOffset (), lexer.getEndOffset ()));
    }
    carrier.setData(tokens);
    return carrier;
}