/**
 * Builds a Token carrying two properties and three feature values (one of
 * them zero), converts it to a FeatureVector over a fresh Alphabet, and
 * checks the resulting vector.
 *
 * The test expects exactly two locations in the vector and a value of 3
 * for the "length" feature.
 * NOTE(review): presumably the zero-valued feature is what gets left out
 * of the vector -- confirm against Token.toFeatureVector.
 */
public void testOne ()
{
    Token token = new Token ("foo");

    // Properties set on the token.
    token.setProperty ("color", "red");
    token.setProperty ("font", "TimesRoman");

    // Feature values set on the token; the last one is zero.
    token.setFeatureValue ("length", 3);
    token.setFeatureValue ("containsVowel", 1);
    token.setFeatureValue ("in /usr/dict/words", 0);

    Alphabet alphabet = new Alphabet();
    FeatureVector vector = token.toFeatureVector (alphabet, false);

    boolean hasTwoLocations = vector.numLocations() == 2;
    assertTrue (hasTwoLocations);

    boolean lengthIsThree = vector.value (alphabet.lookupIndex("length")) == 3;
    assertTrue (lengthIsThree);
}
/**
 * Builds a Token carrying two properties and three feature values (one of
 * them zero), converts it to a FeatureVector over a fresh Alphabet, and
 * checks the resulting vector.
 *
 * The test expects exactly two locations in the vector and a value of 3
 * for the "length" feature.
 * NOTE(review): presumably the zero-valued feature is what gets left out
 * of the vector -- confirm against Token.toFeatureVector.
 */
public void testOne ()
{
    Token token = new Token ("foo");

    // Properties set on the token.
    token.setProperty ("color", "red");
    token.setProperty ("font", "TimesRoman");

    // Feature values set on the token; the last one is zero.
    token.setFeatureValue ("length", 3);
    token.setFeatureValue ("containsVowel", 1);
    token.setFeatureValue ("in /usr/dict/words", 0);

    Alphabet alphabet = new Alphabet();
    FeatureVector vector = token.toFeatureVector (alphabet, false);

    boolean hasTwoLocations = vector.numLocations() == 2;
    assertTrue (hasTwoLocations);

    boolean lengthIsThree = vector.value (alphabet.lookupIndex("length")) == 3;
    assertTrue (lengthIsThree);
}
public Instance pipe (Instance carrier) { TokenSequence ts = (TokenSequence) carrier.getData(); // xxx This doesn't seem so efficient. Perhaps have TokenSequence // use a LinkedList, and remove Tokens from it? -? // But a LinkedList implementation of TokenSequence would be quite inefficient -AKM TokenSequence ret = new TokenSequence (); Token prevToken = null; for (int i = 0; i < ts.size(); i++) { Token t = ts.get(i); String s = t.getText(); if (CharSequenceLexer.LEX_ALPHA.matcher(s).matches()) { ret.add (t); prevToken = t; } else if (markDeletions && prevToken != null) prevToken.setProperty (FeatureSequenceWithBigrams.deletionMark, t.getText()); } carrier.setData(ret); return carrier; }
public Instance pipe (Instance carrier) { TokenSequence ts = (TokenSequence) carrier.getData(); // xxx This doesn't seem so efficient. Perhaps have TokenSequence // use a LinkedList, and remove Tokens from it? -? // But a LinkedList implementation of TokenSequence would be quite inefficient -AKM TokenSequence ret = new TokenSequence (); Token prevToken = null; for (int i = 0; i < ts.size(); i++) { Token t = ts.get(i); String s = t.getText(); if (CharSequenceLexer.LEX_ALPHA.matcher(s).matches()) { ret.add (t); prevToken = t; } else if (markDeletions && prevToken != null) prevToken.setProperty (FeatureSequenceWithBigrams.deletionMark, t.getText()); } carrier.setData(ret); return carrier; }
public Instance pipe (Instance carrier) { TokenSequence ts = (TokenSequence) carrier.getData(); // xxx This doesn't seem so efficient. Perhaps have TokenSequence // use a LinkedList, and remove Tokens from it? -? // But a LinkedList implementation of TokenSequence would be quite inefficient -AKM TokenSequence ret = new TokenSequence (); Token prevToken = null; for (int i = 0; i < ts.size(); i++) { Token t = ts.get(i); String s = t.getText(); if (CharSequenceLexer.LEX_ALPHA.matcher(s).matches()) { ret.add (t); prevToken = t; } else if (markDeletions && prevToken != null) prevToken.setProperty (FeatureSequenceWithBigrams.deletionMark, t.getText()); } carrier.setData(ret); return carrier; }
public Instance pipe (Instance carrier) { TokenSequence ts = (TokenSequence) carrier.getData(); // xxx This doesn't seem so efficient. Perhaps have TokenSequence // use a LinkedList, and remove Tokens from it? -? // But a LinkedList implementation of TokenSequence would be quite inefficient -AKM TokenSequence ret = new TokenSequence (); Token prevToken = null; for (int i = 0; i < ts.size(); i++) { Token t = ts.get(i); if (! stoplist.contains (caseSensitive ? t.getText() : t.getText().toLowerCase())) { // xxx Should we instead make and add a copy of the Token? ret.add (t); prevToken = t; } else if (markDeletions && prevToken != null) prevToken.setProperty (FeatureSequenceWithBigrams.deletionMark, t.getText()); } carrier.setData(ret); return carrier; }
public Instance pipe (Instance carrier) { TokenSequence ts = (TokenSequence) carrier.getData(); // xxx This doesn't seem so efficient. Perhaps have TokenSequence // use a LinkedList, and remove Tokens from it? -? // But a LinkedList implementation of TokenSequence would be quite inefficient -AKM TokenSequence ret = new TokenSequence (); Token prevToken = null; for (int i = 0; i < ts.size(); i++) { Token t = ts.get(i); if (! stoplist.contains (caseSensitive ? t.getText() : t.getText().toLowerCase())) { // xxx Should we instead make and add a copy of the Token? ret.add (t); prevToken = t; } else if (markDeletions && prevToken != null) prevToken.setProperty (FeatureSequenceWithBigrams.deletionMark, t.getText()); } carrier.setData(ret); return carrier; }
public Instance pipe (Instance carrier) { TokenSequence ts = (TokenSequence) carrier.getData(); // xxx This doesn't seem so efficient. Perhaps have TokenSequence // use a LinkedList, and remove Tokens from it? -? // But a LinkedList implementation of TokenSequence would be quite inefficient -AKM TokenSequence ret = new TokenSequence (); Token prevToken = null; for (int i = 0; i < ts.size(); i++) { Token t = ts.get(i); if (! stoplist.contains (caseSensitive ? t.getText() : t.getText().toLowerCase())) { // xxx Should we instead make and add a copy of the Token? ret.add (t); prevToken = t; } else if (markDeletions && prevToken != null) prevToken.setProperty (FeatureSequenceWithBigrams.deletionMark, t.getText()); } carrier.setData(ret); return carrier; }