/**
 * Replaces the TokenSequence in the instance's data field with a sequence of
 * n-grams: for each position, one entry per requested size in gramSizes whose
 * window fits within the sequence so far.
 *
 * @param carrier instance whose data is a TokenSequence
 * @return the same instance, with its data replaced by the n-gram sequence
 */
public Instance pipe (Instance carrier)
{
	TokenSequence ts = (TokenSequence) carrier.getData();
	TokenSequence tmpTS = new TokenSequence();
	for (int i = 0; i < ts.size(); i++) {
		Token t = ts.getToken(i);
		for (int j = 0; j < gramSizes.length; j++) {
			int len = gramSizes[j];
			// Skip non-positive sizes and windows reaching before the start.
			if (len <= 0 || len > (i+1)) continue;
			if (len == 1) {
				// Unigram: keep the original Token (with its features) as-is.
				tmpTS.add(t);
				continue;
			}
			// Build the n-gram from the text of this token and the len-1
			// preceding tokens, joined with underscores.
			// Bug fix: call getText() on the preceding tokens — the old code
			// concatenated the Token object itself, which invoked
			// Token.toString() and embedded labels/features in the n-gram.
			String newTerm = t.getText();
			for (int k = 1; k < len; k++)
				newTerm = ts.getToken(i-k).getText() + "_" + newTerm;
			tmpTS.add(newTerm);
		}
	}
	carrier.setData(tmpTS);
	return carrier;
}
/** Constructs an iterator that walks the tokens of the given sequence. */
public TokenIterator (TokenSequence ts)
{
	this.subiterator = ts.iterator();
}
/**
 * Converts this object to a FeatureVector by first encoding it as a
 * FeatureSequence over the given alphabet.
 *
 * @param dict the alphabet used to index features
 * @return a FeatureVector built from the encoded sequence
 */
public FeatureVector toFeatureVector (Alphabet dict)
{
	// Encode against the dictionary, then wrap the result as a vector.
	return new FeatureVector (toFeatureSequence (dict));
}
/**
 * Builds one feature vector per token, from each token's feature list.
 *
 * @param dict         alphabet used to index feature names
 * @param tokens       source tokens; one vector is created per token
 * @param binary       whether vectors store presence only (no values)
 * @param augmentable  if true, create AugmentableFeatureVector instances
 * @param growAlphabet whether unseen features may be added to the alphabet
 */
public FeatureVectorSequence (Alphabet dict, TokenSequence tokens, boolean binary, boolean augmentable, boolean growAlphabet)
{
	int n = tokens.size();
	this.sequence = new FeatureVector[n];
	// Single loop; the augmentable flag just selects the concrete class.
	for (int t = 0; t < n; t++)
		sequence[t] = augmentable
			? new AugmentableFeatureVector (dict, tokens.getToken(t).getFeatures(), binary, growAlphabet)
			: new FeatureVector (dict, tokens.getToken(t).getFeatures(), binary, growAlphabet);
}
/**
 * Generates a random TokenSequence of the given length by drawing a random
 * FeatureSequence and copying its objects into tokens.
 *
 * @param r      source of randomness
 * @param length number of tokens to generate
 * @return the random token sequence
 */
public TokenSequence randomTokenSequence (Random r, int length)
{
	FeatureSequence featSeq = randomFeatureSequence (r, length);
	TokenSequence result = new TokenSequence (length);
	for (int pos = 0; pos < length; pos++)
		result.add (featSeq.getObjectAtPosition (pos));
	return result;
}
/**
 * Renders this sequence as one line per token, prefixed with its index,
 * ensuring each token's text ends with a newline.
 */
public String toString ()
{
	StringBuffer buf = new StringBuffer ();
	buf.append ("TokenSequence " + super.toString() + "\n");
	int n = tokens.size();
	for (int i = 0; i < n; i++) {
		String text = getToken (i).toString ();
		buf.append ("Token#" + i + ":");
		buf.append (text);
		// Guarantee a trailing newline per token without doubling one up.
		if (!text.endsWith ("\n"))
			buf.append ("\n");
	}
	return buf.toString ();
}
// Renders an HTML error lattice for each fixed-size window of the sequence
// where the predicted labeling disagrees with the target labeling.
//
// NOTE(review): the window step is LENGTH - 1, so consecutive windows overlap
// by one position — presumably intentional so boundary tokens appear in both
// rendered rows; confirm against error2html's layout.
static void lattice2html (PrintWriter out, ExtorInfo info)
{
	// Predicted, target, and input sequences must be aligned.
	assert (info.target.size() == info.predicted.size());
	assert (info.input.size() == info.predicted.size());
	int N = info.target.size();
	for (int start = 0; start < N; start += LENGTH - 1) {
		// Clamp the final window to the end of the sequence.
		int end = Math.min (N, start + LENGTH);
		// Only emit HTML for windows containing at least one mismatch.
		if (!allSeqMatches (info.predicted, info.target, start, end)) {
			error2html (out, info, start, end);
		}
	}
}
/** Constructs a sequence containing the given tokens, preserving order. */
public TokenSequence (Token[] tokens)
{
	this (tokens.length);
	// Delegate element insertion to addAll, which appends in array order.
	addAll (tokens);
}
// Returns a new TokenSequence accumulator initialized from this one's tokens.
//
// NOTE(review): `properties` is copied by reference, so the clone shares the
// same property list as the original — a mutation through either object is
// visible in both. Confirm whether a deep copy was intended here.
public PipeOutputAccumulator clonePipeOutputAccumulator() { TokenSequence ret = new TokenSequence( tokens ); ret.properties = this.properties; return ret; }
/**
 * Command-line demo: treats each argument as a file name, tokenizes the
 * file's contents, and prints the resulting token sequence to standard out.
 */
public static void main (String[] args)
{
	try {
		for (int fi = 0; fi < args.length; fi++) {
			Instance inst = new Instance (new File (args[fi]), null, null, null);
			// Pipeline: raw file -> character sequence -> token sequence.
			Pipe pipeline = new SerialPipes (new Pipe[] {
				new Input2CharSequence (),
				new CharSequence2TokenSequence (new CharSequenceLexer ())});
			inst = pipeline.pipe (inst);
			TokenSequence toks = (TokenSequence) inst.getData ();
			System.out.println ("===");
			System.out.println (args[fi]);
			System.out.println (toks.toString ());
		}
	} catch (Exception e) {
		System.out.println (e);
		e.printStackTrace ();
	}
}
/**
 * Collects the distinct output labels observed at every position where the
 * input token's text equals the given word.
 *
 * @param word the word to look up in the input sequence
 * @return the set of matching output labels, as an array (order unspecified)
 */
public String[] fieldNamesForWord (String word)
{
	assert input.size() == output.size();
	HashSet fields = new HashSet();
	for (int pos = 0; pos < input.size(); pos++) {
		String text = input.getToken(pos).getText();
		if (text.equals(word))
			fields.add(output.get(pos).toString());
	}
	return (String[]) fields.toArray(new String[fields.size()]);
}
/**
 * Reads the entire contents of the reader, runs it through the lexer, and
 * returns the resulting TokenSequence.
 *
 * @param br the reader to consume fully
 * @return a TokenSequence containing one Token per lexer token
 * @throws java.io.IOException if reading fails
 */
public TokenSequence pipe (BufferedReader br) throws java.io.IOException
{
	final int BUFSIZE = 2048;
	char[] buf = new char[BUFSIZE];
	StringBuffer sb = new StringBuffer (BUFSIZE);
	int count;
	// Bug fix: append only the characters actually read. The old code
	// appended the entire buffer after every read, picking up stale/NUL
	// characters on a short read — and even on an immediate end-of-stream,
	// where read() returns -1 without filling the buffer at all. It also
	// stopped as soon as a read returned fewer than BUFSIZE chars, which a
	// Reader may legitimately do before end-of-stream.
	while ((count = br.read (buf, 0, BUFSIZE)) > 0)
		sb.append (buf, 0, count);
	lexer.setCharSequence ((CharSequence)sb);
	TokenSequence ts = new TokenSequence ();
	while (lexer.hasNext())
		ts.add (new Token ((String) lexer.next()));
	return ts;
}
/**
 * Writes one HTML table row containing the input token texts for positions
 * [start, end), preceded by an empty label cell.
 */
private static void outputInputRow (PrintWriter out, TokenSequence input, int start, int end)
{
	out.println (" <tr class=\"input\">");
	out.println (" <td class=\"label\"></td>");
	// One cell per token in the window.
	for (int pos = start; pos < end; pos++)
		out.print ("<td>" + input.getToken (pos).getText () + "</td>");
	out.println (" </tr>");
}
/**
 * Renders paired HTML error lattices for two extractors, emitting both
 * extractors' lattices for every window where their predictions disagree.
 * Windows overlap by one position (step is LENGTH - 1).
 */
public static void dualLattice2html (PrintWriter out, String desc, ExtorInfo info1, ExtorInfo info2)
{
	// Each extractor's sequences must be internally aligned.
	assert (info1.predicted.size() == info1.target.size());
	assert (info1.input.size() == info1.predicted.size());
	assert (info2.input.size() == info2.predicted.size());
	assert (info2.predicted.size() == info2.target.size());
	int seqLen = info1.target.size();
	for (int windowStart = 0; windowStart < seqLen; windowStart += LENGTH - 1) {
		int windowEnd = Math.min (info1.predicted.size(), windowStart + LENGTH);
		// Skip windows where the two extractors fully agree.
		if (allSeqMatches (info1.predicted, info2.predicted, windowStart, windowEnd))
			continue;
		error2html (out, info1, windowStart, windowEnd);
		error2html (out, info2, windowStart, windowEnd);
	}
}
/** Appends every token of the array to this sequence, in array order. */
public void addAll (Token[] tokens)
{
	int count = tokens.length;
	for (int idx = 0; idx < count; idx++)
		add (tokens[idx]);
}
/**
 * Computes the maxent classification of an instance described by the set of
 * features that are "on" for it.
 *
 * @param classifier the classifier to apply
 * @param features   the names of the features that are on for this instance
 * @return the resulting classification
 */
static public Classification classify (Classifier classifier, String[] features)
{
	// Wrap the feature names as a token sequence, then pipe it through the
	// classifier's own instance pipe so the encoding matches training.
	TokenSequence toks = new TokenSequence (features);
	Instance inst = new Instance (toks, null, null, null, classifier.getInstancePipe ());
	return classifier.classify (inst);
}
/**
 * Walks the lexicon trie starting at position {@code start} of the token
 * sequence and returns the index of the last token of the longest lexicon
 * entry matched, or -1 if no entry matches (or start is out of bounds).
 *
 * @param ts    the token sequence to match against
 * @param start index of the first token of the candidate entry
 * @return index of the final token of the longest match, or -1
 */
private int endOfWord(TokenSequence ts, int start) {
	if (start < 0 || start >= ts.size()) {
		// Bug fix: the diagnostic previously named the wrong method
		// ("Lexicon.lastIndexOf"); corrected to this method's name.
		System.err
				.println("Lexicon.endOfWord: error - out of TokenSequence boundaries");
		return -1;
	}
	Hashtable currentLevel = lex;
	int end = -1;
	for (int i = start; i < ts.size(); i++) {
		Token t = ts.getToken(i);
		String s = t.getText();
		if (ignoreCase)
			s = s.toLowerCase();
		// Descend one trie level per token; a dead end stops the search.
		currentLevel = (Hashtable) currentLevel.get(s);
		if (currentLevel == null) {
			return end;
		}
		// Remember the deepest position that completes a lexicon entry,
		// but keep walking in case a longer entry also matches.
		if (currentLevel.containsKey(END_OF_WORD_TOKEN)) {
			end = i;
		}
	}
	return end;
}
// Splits each token's text with the regex: the targetGroup capture becomes a
// token in the instance's target sequence, and the token's own text is
// replaced by the dataGroup capture. Non-matching tokens are left unchanged
// in the data and contribute nothing to the target.
//
// NOTE(review): because skipped tokens add nothing to targetTokenSeq, the
// data and target sequences can end up with different lengths — confirm that
// downstream consumers tolerate this misalignment.
public Instance pipe (Instance carrier)
{
	TokenSequence ts = (TokenSequence) carrier.getData();
	TokenSequence targetTokenSeq = new TokenSequence (ts.size());
	for (int i = 0; i < ts.size(); i++) {
		Token t = ts.getToken(i);
		Matcher matcher = regex.matcher (t.getText());
		if (matcher.matches()) {
			// Extract the label, then strip it from the data token's text.
			targetTokenSeq.add (matcher.group(targetGroup));
			t.setText (matcher.group (dataGroup));
		} else {
			logger.warning ("Skipping token: No match of "+regex.pattern()
					+" at token #"+i+" with text "+t.getText());
		}
	}
	carrier.setTarget(targetTokenSeq);
	carrier.setData(ts);
	return carrier;
}
throw new ClassCastException ("carrier.data is a " + carrier.getData().getClass().getName() + " not a CharSequence"); TokenSequence dataTokens = new TokenSequence (); TokenSequence targetTokens = new TokenSequence (); CharSequence string = (CharSequence) carrier.getData(); String tag = backgroundTag; lexer.setCharSequence (string.subSequence (textStart, textEnd)); while (lexer.hasNext()) { dataTokens.add (new Token ((String) lexer.next())); targetTokens.add (new Token (tag));