/**
 * Constructs a topic model pipe from the given stopwords and alphabet.
 *
 * <p>The pipes are chained in this order: {@code CharSequenceLowercase}, a regex
 * tokenizer, {@code RemoveStopwords}, and {@code TokenSequence2FeatureSequence}.
 * The tokenizer regex matches a letter, then one or more letters or punctuation
 * characters, then a letter, so every match is at least three characters long.
 *
 * @param stopwords words handed to the {@code RemoveStopwords} pipe
 * @param alphabet alphabet handed to the {@code TokenSequence2FeatureSequence} pipe
 */
public TopicModelPipe(Collection<String> stopwords, Alphabet alphabet) {
    // The formatter markers must sit on their own lines: as line comments they would
    // otherwise swallow the super(...) call that follows them.
    // @formatter:off
    super(
        ImmutableList.of(
            new CharSequenceLowercase(),
            new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")),
            new RemoveStopwords(stopwords),
            new TokenSequence2FeatureSequence(alphabet)));
    // @formatter:on
}
} // closes the enclosing type, whose header lies outside this excerpt
/**
 * Constructs a topic model pipe from the given stopwords and alphabet.
 *
 * <p>The pipes are chained in this order: {@code CharSequenceLowercase}, a regex
 * tokenizer, {@code RemoveStopwords}, and {@code TokenSequence2FeatureSequence}.
 * The tokenizer regex matches a letter, then one or more letters or punctuation
 * characters, then a letter, so every match is at least three characters long.
 *
 * @param stopwords words handed to the {@code RemoveStopwords} pipe
 * @param alphabet alphabet handed to the {@code TokenSequence2FeatureSequence} pipe
 */
public TopicModelPipe(Collection<String> stopwords, Alphabet alphabet) {
    // The formatter markers must sit on their own lines: as line comments they would
    // otherwise swallow the super(...) call that follows them.
    // @formatter:off
    super(
        ImmutableList.of(
            new CharSequenceLowercase(),
            new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")),
            new RemoveStopwords(stopwords),
            new TokenSequence2FeatureSequence(alphabet)));
    // @formatter:on
}
} // closes the enclosing type, whose header lies outside this excerpt
/**
 * Runs the configured input file through a tokenizing pipeline and saves the
 * resulting instance list to the configured output file.
 *
 * <p>Pipeline order: a {@code CharSequenceLowercase} pipe (omitted when
 * {@code preserveCase} is set), the supplied tokenizer, then a
 * {@code StringList2FeatureSequence} over a newly created alphabet.
 *
 * @param prunedTokenizer tokenizer pipe placed after the optional lowercasing step
 * @throws IOException if the input file cannot be opened or its reader cannot be closed
 */
public static void writeInstanceList(SimpleTokenizer prunedTokenizer) throws IOException {
    FileReader input = new FileReader(inputFile.value);
    InstanceList instances;
    try {
        CsvIterator reader = new CsvIterator(input, lineRegex.value,
                dataGroup.value, labelGroup.value, nameGroup.value);

        ArrayList<Pipe> pipes = new ArrayList<Pipe>();
        if (!preserveCase.value) {
            pipes.add(new CharSequenceLowercase());
        }
        pipes.add(prunedTokenizer);
        pipes.add(new StringList2FeatureSequence(new Alphabet()));

        instances = new InstanceList(new SerialPipes(pipes));
        instances.addThruPipe(reader);
    } finally {
        // Release the file handle whether or not reading succeeded; the original
        // never closed this reader.
        input.close();
    }

    instances.save(outputFile.value);
}
/**
 * Constructs a classifier pipe that starts with the given pipe and then applies
 * the text-processing pipes listed below.
 *
 * <p>The pipes are chained in this order: {@code pipe},
 * {@code CharSequenceLowercase}, a regex tokenizer, {@code RemoveStopwords},
 * {@code TokenSequence2FeatureSequence}, and {@code FeatureSequence2FeatureVector}.
 * The tokenizer regex matches a letter, then one or more letters or punctuation
 * characters, then a letter, so every match is at least three characters long.
 *
 * @param pipe initial pipe, placed first in the chain
 * @param stopwords words handed to the {@code RemoveStopwords} pipe
 */
public AbstractClassifierPipe(Pipe pipe, Collection<String> stopwords) {
    // The formatter markers must sit on their own lines: as line comments they would
    // otherwise swallow the super(...) call that follows them.
    // @formatter:off
    super(
        ImmutableList.of(
            pipe,
            new CharSequenceLowercase(),
            new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")),
            new RemoveStopwords(stopwords),
            new TokenSequence2FeatureSequence(),
            new FeatureSequence2FeatureVector()));
    // @formatter:on
}
} // closes the enclosing type, whose header lies outside this excerpt
/**
 * Constructs a classifier pipe that starts with the given pipe and then applies
 * the text-processing pipes listed below.
 *
 * <p>The pipes are chained in this order: {@code pipe},
 * {@code CharSequenceLowercase}, a regex tokenizer, {@code RemoveStopwords},
 * {@code TokenSequence2FeatureSequence}, and {@code FeatureSequence2FeatureVector}.
 * The tokenizer regex matches a letter, then one or more letters or punctuation
 * characters, then a letter, so every match is at least three characters long.
 *
 * @param pipe initial pipe, placed first in the chain
 * @param stopwords words handed to the {@code RemoveStopwords} pipe
 */
public AbstractClassifierPipe(Pipe pipe, Collection<String> stopwords) {
    // The formatter markers must sit on their own lines: as line comments they would
    // otherwise swallow the super(...) call that follows them.
    // @formatter:off
    super(
        ImmutableList.of(
            pipe,
            new CharSequenceLowercase(),
            new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")),
            new RemoveStopwords(stopwords),
            new TokenSequence2FeatureSequence(),
            new FeatureSequence2FeatureVector()));
    // @formatter:on
}
} // closes the enclosing type, whose header lies outside this excerpt
/**
 * Runs the configured input file through a tokenizing pipeline and saves the
 * resulting instance list to the configured output file.
 *
 * <p>Pipeline order: a {@code CharSequenceLowercase} pipe (omitted when
 * {@code preserveCase} is set), the supplied tokenizer, then a
 * {@code StringList2FeatureSequence} over a newly created alphabet.
 *
 * @param prunedTokenizer tokenizer pipe placed after the optional lowercasing step
 * @throws IOException if the input file cannot be opened or its reader cannot be closed
 */
public static void writeInstanceList(SimpleTokenizer prunedTokenizer) throws IOException {
    FileReader input = new FileReader(inputFile.value);
    InstanceList instances;
    try {
        CsvIterator reader = new CsvIterator(input, lineRegex.value,
                dataGroup.value, labelGroup.value, nameGroup.value);

        ArrayList<Pipe> pipes = new ArrayList<Pipe>();
        if (!preserveCase.value) {
            pipes.add(new CharSequenceLowercase());
        }
        pipes.add(prunedTokenizer);
        pipes.add(new StringList2FeatureSequence(new Alphabet()));

        instances = new InstanceList(new SerialPipes(pipes));
        instances.addThruPipe(reader);
    } finally {
        // Release the file handle whether or not reading succeeded; the original
        // never closed this reader.
        input.close();
    }

    instances.save(outputFile.value);
}
/**
 * Runs the configured input file through a tokenizing pipeline and saves the
 * resulting instance list to the configured output file.
 *
 * <p>Pipeline order: a {@code CharSequenceLowercase} pipe (omitted when
 * {@code preserveCase} is set), the supplied tokenizer, then a
 * {@code StringList2FeatureSequence} over a newly created alphabet.
 *
 * @param prunedTokenizer tokenizer pipe placed after the optional lowercasing step
 * @throws IOException if the input file cannot be opened or its reader cannot be closed
 */
public static void writeInstanceList(SimpleTokenizer prunedTokenizer) throws IOException {
    FileReader input = new FileReader(inputFile.value);
    InstanceList instances;
    try {
        CsvIterator reader = new CsvIterator(input, lineRegex.value,
                dataGroup.value, labelGroup.value, nameGroup.value);

        ArrayList<Pipe> pipes = new ArrayList<Pipe>();
        if (!preserveCase.value) {
            pipes.add(new CharSequenceLowercase());
        }
        pipes.add(prunedTokenizer);
        pipes.add(new StringList2FeatureSequence(new Alphabet()));

        instances = new InstanceList(new SerialPipes(pipes));
        instances.addThruPipe(reader);
    } finally {
        // Release the file handle whether or not reading succeeded; the original
        // never closed this reader.
        input.close();
    }

    instances.save(outputFile.value);
}
// Newly constructed alphabet; passed to the feature-sequence pipe below.
Alphabet alphabet = new Alphabet();
// Lowercasing pipe; where it is added to a pipeline is outside this excerpt.
CharSequenceLowercase csl = new CharSequenceLowercase();
// Result of prunedTokenizer.deepClone() — presumably independent of the original; TODO confirm.
SimpleTokenizer st = prunedTokenizer.deepClone();
// Feature-sequence pipe constructed over the alphabet created above.
StringList2FeatureSequence sl2fs = new StringList2FeatureSequence(alphabet);
// Newly constructed alphabet; passed to the feature-sequence pipe below.
Alphabet alphabet = new Alphabet();
// Lowercasing pipe; where it is added to a pipeline is outside this excerpt.
CharSequenceLowercase csl = new CharSequenceLowercase();
// Result of prunedTokenizer.deepClone() — presumably independent of the original; TODO confirm.
SimpleTokenizer st = prunedTokenizer.deepClone();
// Feature-sequence pipe constructed over the alphabet created above.
StringList2FeatureSequence sl2fs = new StringList2FeatureSequence(alphabet);
// Newly constructed alphabet; passed to the feature-sequence pipe below.
Alphabet alphabet = new Alphabet();
// Lowercasing pipe; where it is added to a pipeline is outside this excerpt.
CharSequenceLowercase csl = new CharSequenceLowercase();
// Result of prunedTokenizer.deepClone() — presumably independent of the original; TODO confirm.
SimpleTokenizer st = prunedTokenizer.deepClone();
// Feature-sequence pipe constructed over the alphabet created above.
StringList2FeatureSequence sl2fs = new StringList2FeatureSequence(alphabet);
// Append a CharSequenceLowercase pipe to pipeList.
pipeList.add( new CharSequenceLowercase() );
// Append a regex tokenizer. The pattern matches a letter, then one or more letters or
// punctuation characters, then a letter, so every match is at least three characters long.
pipeList.add( new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")) );
// Append a stopword-removal pipe built from stoplists/en.txt with encoding "UTF-8". The path is
// relative, so it resolves against the process working directory.
// NOTE(review): the meaning of the three boolean flags is not visible here — confirm against the constructor.
pipeList.add( new TokenSequenceRemoveStopwords(new File("stoplists/en.txt"), "UTF-8", false, false, false) );
// Append a CharSequenceLowercase pipe to pipeList.
pipeList.add( new CharSequenceLowercase() );
// Append a regex tokenizer. The pattern matches a letter, then one or more letters or
// punctuation characters, then a letter, so every match is at least three characters long.
pipeList.add( new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")) );
// Append a stopword-removal pipe built from stoplists/en.txt with encoding "UTF-8". The path is
// relative, so it resolves against the process working directory.
// NOTE(review): the meaning of the three boolean flags is not visible here — confirm against the constructor.
pipeList.add( new TokenSequenceRemoveStopwords(new File("stoplists/en.txt"), "UTF-8", false, false, false) );
// Append a CharSequenceLowercase pipe to pipeList.
pipeList.add( new CharSequenceLowercase() );
// Append a regex tokenizer. The pattern matches a letter, then one or more letters or
// punctuation characters, then a letter, so every match is at least three characters long.
pipeList.add( new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}")) );
// Append a stopword-removal pipe built from stoplists/en.txt with encoding "UTF-8". The path is
// relative, so it resolves against the process working directory.
// NOTE(review): the meaning of the three boolean flags is not visible here — confirm against the constructor.
pipeList.add( new TokenSequenceRemoveStopwords(new File("stoplists/en.txt"), "UTF-8", false, false, false) );
// Newly constructed alphabet; passed to the feature-sequence pipe below.
Alphabet alphabet = new Alphabet();
// Lowercasing pipe; where it is added to a pipeline is outside this excerpt.
CharSequenceLowercase csl = new CharSequenceLowercase();
// Feature-sequence pipe constructed over the alphabet created above.
StringList2FeatureSequence sl2fs = new StringList2FeatureSequence(alphabet);
// Newly constructed alphabet; passed to the feature-sequence pipe below.
Alphabet alphabet = new Alphabet();
// Lowercasing pipe; where it is added to a pipeline is outside this excerpt.
CharSequenceLowercase csl = new CharSequenceLowercase();
// Feature-sequence pipe constructed over the alphabet created above.
StringList2FeatureSequence sl2fs = new StringList2FeatureSequence(alphabet);
// Newly constructed alphabet; passed to the feature-sequence pipe below.
Alphabet alphabet = new Alphabet();
// Lowercasing pipe; where it is added to a pipeline is outside this excerpt.
CharSequenceLowercase csl = new CharSequenceLowercase();
// Feature-sequence pipe constructed over the alphabet created above.
StringList2FeatureSequence sl2fs = new StringList2FeatureSequence(alphabet);
// Append a newly constructed CharSequenceLowercase pipe to pipeList.
pipeList.add(new CharSequenceLowercase());
// Append a newly constructed CharSequenceLowercase pipe to pipeList.
pipeList.add(new CharSequenceLowercase());
// Append a newly constructed CharSequenceLowercase pipe to pipeList.
pipeList.add(new CharSequenceLowercase());
// Append a newly constructed CharSequenceLowercase pipe to pipeList.
pipeList.add(new CharSequenceLowercase());