/**
 * Builds the stopwords scheme instance to be used.
 *
 * @return a {@code WordsFromFile} handler whose stopwords file is
 *         "WordsFromFile.txt" inside the directory returned by
 *         {@code getTmpDirectory()}
 */
@Override
public StopwordsHandler getStopwords() {
  WordsFromFile handler = new WordsFromFile();
  String stopwordsPath = getTmpDirectory() + File.separator + "WordsFromFile.txt";
  handler.setStopwords(new File(stopwordsPath));
  return handler;
}
/** * Performs intialization of the scheme. */ @Override protected void initialize() { List<String> words; super.initialize(); m_Words = new HashSet<String>(); words = read(); for (String word: words) { // comment? if (!word.startsWith("#")) m_Words.add(word); } }
/**
 * Builds the stopwords scheme instance to be used.
 *
 * @return a {@code WordsFromFile} handler whose stopwords file is
 *         "WordsFromFile.txt" inside the directory returned by
 *         {@code getTmpDirectory()}
 */
@Override
public StopwordsHandler getStopwords() {
  WordsFromFile handler = new WordsFromFile();
  String stopwordsPath = getTmpDirectory() + File.separator + "WordsFromFile.txt";
  handler.setStopwords(new File(stopwordsPath));
  return handler;
}
/** * Performs intialization of the scheme. */ @Override protected void initialize() { List<String> words; super.initialize(); m_Words = new HashSet<String>(); words = read(); for (String word: words) { // comment? if (!word.startsWith("#")) m_Words.add(word); } }
StringToWordVector() filter = new StringToWordVector(); filter.setWordsToKeep(1000000); if(useIdf){ filter.setIDFTransform(true); } filter.setTFTransform(true); filter.setLowerCaseTokens(true); filter.setOutputWordCounts(true); filter.setMinTermFreq(minTermFreq); filter.setNormalizeDocLength(new SelectedTag(StringToWordVector.FILTER_NORMALIZE_ALL,StringToWordVector.TAGS_FILTER)); NGramTokenizer t = new NGramTokenizer(); t.setNGramMaxSize(maxGrams); t.setNGramMinSize(minGrams); filter.setTokenizer(t); WordsFromFile stopwords = new WordsFromFile(); stopwords.setStopwords(new File("data/stopwords/stopwords.txt")); filter.setStopwordsHandler(stopwords); if (useStemmer){ Stemmer s = new /*Iterated*/LovinsStemmer(); filter.setStemmer(s); } filter.setInputFormat(trainingData);
StringToWordVector() filter = new StringToWordVector(); filter.setWordsToKeep(1000000); if(useIdf){ filter.setIDFTransform(true); } filter.setTFTransform(true); filter.setLowerCaseTokens(true); filter.setOutputWordCounts(true); filter.setMinTermFreq(minTermFreq); filter.setNormalizeDocLength(new SelectedTag(StringToWordVector.FILTER_NORMALIZE_ALL,StringToWordVector.TAGS_FILTER)); NGramTokenizer t = new NGramTokenizer(); t.setNGramMaxSize(maxGrams); t.setNGramMinSize(minGrams); filter.setTokenizer(t); WordsFromFile stopwords = new WordsFromFile(); stopwords.setStopwords(new File("data/stopwords/stopwords.txt")); filter.setStopwordsHandler(stopwords); if (useStemmer){ Stemmer s = new /*Iterated*/LovinsStemmer(); filter.setStemmer(s); } filter.setInputFormat(trainingData);
/**
 * Builds the stopwords scheme instance to be used.
 *
 * @return a {@code MultiStopwords} handler combining, in this order, a
 *         {@code Rainbow} handler, a {@code WordsFromFile} handler reading
 *         "WordsFromFile.txt" and a {@code RegExpFromFile} handler reading
 *         "RegExpFromFile.txt"; both files are located in the directory
 *         returned by {@code getTmpDirectory()}
 */
@Override
public StopwordsHandler getStopwords() {
  MultiStopwords combined = new MultiStopwords();

  Rainbow rainbow = new Rainbow();

  WordsFromFile wordList = new WordsFromFile();
  wordList.setStopwords(
      new File(getTmpDirectory() + File.separator + "WordsFromFile.txt"));

  RegExpFromFile patterns = new RegExpFromFile();
  patterns.setStopwords(
      new File(getTmpDirectory() + File.separator + "RegExpFromFile.txt"));

  combined.setStopwords(new StopwordsHandler[] {rainbow, wordList, patterns});
  return combined;
}
/**
 * Builds the stopwords scheme instance to be used.
 *
 * @return a {@code MultiStopwords} handler combining, in this order, a
 *         {@code Rainbow} handler, a {@code WordsFromFile} handler reading
 *         "WordsFromFile.txt" and a {@code RegExpFromFile} handler reading
 *         "RegExpFromFile.txt"; both files are located in the directory
 *         returned by {@code getTmpDirectory()}
 */
@Override
public StopwordsHandler getStopwords() {
  MultiStopwords combined = new MultiStopwords();

  Rainbow rainbow = new Rainbow();

  WordsFromFile wordList = new WordsFromFile();
  wordList.setStopwords(
      new File(getTmpDirectory() + File.separator + "WordsFromFile.txt"));

  RegExpFromFile patterns = new RegExpFromFile();
  patterns.setStopwords(
      new File(getTmpDirectory() + File.separator + "RegExpFromFile.txt"));

  combined.setStopwords(new StopwordsHandler[] {rainbow, wordList, patterns});
  return combined;
}