/**
 * Builds the stopword {@link CharArraySet} for a comma separated stopword {@code String}.
 *
 * @param stopwords comma separated list of stopwords
 * @return the stopwords as a (case-insensitive) char array set
 */
private static CharArraySet getStopwords(String stopwords) {
    List<String> parsed = new ArrayList<>();
    for (String token : stopwords.split(",")) {
        // Surrounding whitespace around each comma-separated entry is ignored.
        parsed.add(token.trim());
    }
    return new CharArraySet(parsed, true);
}
/**
 * Creates a suffix-stripping rule with an explicit set of exception words.
 *
 * @param suffix      the suffix this rule strips
 * @param min         minimum stem length for the rule to apply
 * @param replacement replacement text for the stripped suffix
 * @param exceptions  words this rule must never be applied to; each must end with
 *                    {@code suffix}, otherwise the entry could never match
 * @throws IllegalArgumentException if an exception word does not end with {@code suffix}
 */
public RuleWithSetExceptions(String suffix, int min, String replacement, String[] exceptions) {
    super(suffix, min, replacement);
    for (String exception : exceptions) {
        // An exception that does not end with the suffix can never be hit by this
        // rule, so listing it indicates a configuration error.
        if (!exception.endsWith(suffix)) {
            throw new IllegalArgumentException(
                "useless exception '" + exception + "' does not end with '" + suffix + "'");
        }
    }
    // ignoreCase=false: exceptions are matched case-sensitively.
    this.exceptions = new CharArraySet(Arrays.asList(exceptions), false);
}
/**
 * Reads lines from a Reader and adds every line as an entry to a CharArraySet,
 * omitting leading and trailing whitespace. Every line of the Reader should contain
 * only one word. The words need to be in lowercase if you make use of an Analyzer
 * which uses LowerCaseFilter (like StandardAnalyzer).
 *
 * @param reader Reader containing the wordlist
 * @return A {@link CharArraySet} with the reader's words
 */
public static CharArraySet getWordSet(Reader reader) throws IOException {
    CharArraySet target = new CharArraySet(INITIAL_CAPACITY, false);
    return getWordSet(reader, target);
}
/**
 * Creates a stopword set from the given stopword array.
 *
 * @param stopWords  an array of stopwords
 * @param ignoreCase if true, all words are lower cased first
 * @return a Set containing the words
 */
public static CharArraySet makeStopSet(String[] stopWords, boolean ignoreCase) {
    // Presized to the exact number of entries.
    final CharArraySet result = new CharArraySet(stopWords.length, ignoreCase);
    for (String word : stopWords) {
        result.add(word);
    }
    return result;
}
/**
 * Reads lines from a Reader and adds every non-comment line as an entry to a
 * CharArraySet, omitting leading and trailing whitespace. Every line of the Reader
 * should contain only one word. The words need to be in lowercase if you make use
 * of an Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
 *
 * @param reader  Reader containing the wordlist
 * @param comment the string marking a comment line
 * @return a CharArraySet with the reader's words
 */
public static CharArraySet getWordSet(Reader reader, String comment) throws IOException {
    CharArraySet target = new CharArraySet(INITIAL_CAPACITY, false);
    return getWordSet(reader, comment, target);
}
/**
 * Creates a stopword set from the given stopword array.
 *
 * @param stopWords An array of stopwords
 * @param ignoreCase If true, all words are lower cased first.
 * @return a Set containing the words
 */
public static CharArraySet makeStopSet(String[] stopWords, boolean ignoreCase) {
    // Presize to the exact number of entries; addAll copies every word into the set.
    CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase);
    stopSet.addAll(Arrays.asList(stopWords));
    return stopSet;
}
/**
 * Creates a stopword set from the given stopword list.
 *
 * @param stopWords  a List of Strings or char[] or any other toString()-able list
 *                   representing the stopwords
 * @param ignoreCase if true, all words are lower cased first
 * @return a Set ({@link CharArraySet}) containing the words
 */
public static CharArraySet makeStopSet(List<?> stopWords, boolean ignoreCase) {
    final CharArraySet result = new CharArraySet(stopWords.size(), ignoreCase);
    result.addAll(stopWords);
    return result;
}
/**
 * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
 * leading and trailing whitespace). Every line of the Reader should contain only
 * one word. The words need to be in lowercase if you make use of an
 * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
 *
 * @param reader Reader containing the wordlist
 * @return A {@link CharArraySet} with the reader's words
 */
public static CharArraySet getWordSet(Reader reader) throws IOException {
    // Delegates to the accumulating overload with a fresh, case-sensitive set.
    return getWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false));
}
/**
 * Creates a stopword set from the given stopword list.
 *
 * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
 * @param ignoreCase if true, all words are lower cased first
 * @return A Set ({@link CharArraySet}) containing the words
 */
public static CharArraySet makeStopSet(List<?> stopWords, boolean ignoreCase) {
    // Presize to the list length; addAll copies every entry into the set.
    CharArraySet stopSet = new CharArraySet(stopWords.size(), ignoreCase);
    stopSet.addAll(stopWords);
    return stopSet;
}
/**
 * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
 * leading and trailing whitespace). Every line of the Reader should contain only
 * one word. The words need to be in lowercase if you make use of an
 * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
 *
 * @param reader Reader containing the wordlist
 * @param comment The string representing a comment.
 * @return A CharArraySet with the reader's words
 */
public static CharArraySet getWordSet(Reader reader, String comment) throws IOException {
    // Delegates to the accumulating overload with a fresh, case-sensitive set.
    return getWordSet(reader, comment, new CharArraySet(INITIAL_CAPACITY, false));
}
/**
 * Creates a feature vector whose text analysis removes the given stop words.
 *
 * @param stopper source of stop words; may be {@code null} or empty, in which case
 *                the analyzer removes no stop words
 */
public FeatureVector(Stopper stopper) {
    this.stopper = stopper;
    if (stopper == null || stopper.asSet().isEmpty()) {
        // No stop words configured: analyze with an empty stopword set.
        analyzer = new StandardAnalyzer(Version.LUCENE_41, CharArraySet.EMPTY_SET);
    } else {
        // ignoreCase=true: stop words are matched case-insensitively.
        CharArraySet charArraySet = new CharArraySet(Version.LUCENE_41, stopper.asSet(), true);
        analyzer = new StandardAnalyzer(Version.LUCENE_41, charArraySet);
    }
    features = new HashMap<>();
}
/**
 * Reads stopwords from a stopword list in Snowball format.
 * <p>
 * The snowball format is the following:
 * <ul>
 * <li>Lines may contain multiple words separated by whitespace.
 * <li>The comment character is the vertical line (|).
 * <li>Lines may contain trailing comments.
 * </ul>
 *
 * @param reader Reader containing a Snowball stopword list
 * @return a {@link CharArraySet} with the reader's words
 */
public static CharArraySet getSnowballWordSet(Reader reader) throws IOException {
    CharArraySet target = new CharArraySet(INITIAL_CAPACITY, false);
    return getSnowballWordSet(reader, target);
}
/**
 * Reads stopwords from a stopword list in Snowball format.
 * <p>
 * The snowball format is the following:
 * <ul>
 * <li>Lines may contain multiple words separated by whitespace.
 * <li>The comment character is the vertical line (|).
 * <li>Lines may contain trailing comments.
 * </ul>
 *
 * @param reader Reader containing a Snowball stopword list
 * @return A {@link CharArraySet} with the reader's words
 */
public static CharArraySet getSnowballWordSet(Reader reader) throws IOException {
    // Delegates to the accumulating overload with a fresh, case-sensitive set.
    return getSnowballWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false));
}
/**
 * Returns an unmodifiable {@link CharArraySet}. This allows to provide
 * unmodifiable views of internal sets for "read-only" use.
 *
 * @param set a set for which the unmodifiable set is returned
 * @return a new unmodifiable {@link CharArraySet}
 * @throws NullPointerException if the given set is <code>null</code>
 */
public static CharArraySet unmodifiableSet(CharArraySet set) {
    if (set == null) {
        throw new NullPointerException("Given set is null");
    }
    // The shared empty set is already immutable; return it as-is.
    if (set == EMPTY_SET) {
        return EMPTY_SET;
    }
    // Avoid wrapping a set whose backing map is already unmodifiable.
    if (set.map instanceof CharArrayMap.UnmodifiableCharArrayMap) {
        return set;
    }
    return new CharArraySet(CharArrayMap.unmodifiableMap(set.map));
}
/**
 * Resolves the configured stopword list: loads the 'words' files in the requested
 * format, or falls back to the default English stop set when no files were given.
 */
@Override
public void inform(ResourceLoader loader) throws IOException {
    if (stopWordFiles == null) {
        // No explicit word files: 'format' must not be set, and the default
        // English stop set is used.
        if (null != format) {
            throw new IllegalArgumentException("'format' can not be specified w/o an explicit 'words' file: " + format);
        }
        stopWords = new CharArraySet(StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
        return;
    }
    // Dispatch on the declared file format.
    if (FORMAT_WORDSET.equalsIgnoreCase(format)) {
        stopWords = getWordSet(loader, stopWordFiles, ignoreCase);
    } else if (FORMAT_SNOWBALL.equalsIgnoreCase(format)) {
        stopWords = getSnowballWordSet(loader, stopWordFiles, ignoreCase);
    } else {
        throw new IllegalArgumentException("Unknown 'format' specified for 'words' file: " + format);
    }
}
/**
 * Appends a per-field stop filter to the analysis chain, or returns the chain
 * unchanged when no stop words are registered for the field.
 */
@Override
protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
    final Set<String> fieldStopWords = stopWordsPerField.get(fieldName);
    if (fieldStopWords == null) {
        // No stop words for this field; leave the token stream untouched.
        return components;
    }
    // ignoreCase=false: stop words are matched case-sensitively.
    final CharArraySet stopSet = new CharArraySet(fieldStopWords, false);
    final StopFilter filter = new StopFilter(components.getTokenStream(), stopSet);
    return new TokenStreamComponents(components.getTokenizer(), filter);
}
/** * Returns as {@link CharArraySet} from wordFiles, which * can be a comma-separated list of filenames */ protected final CharArraySet getWordSet(ResourceLoader loader, String wordFiles, boolean ignoreCase) throws IOException { List<String> files = splitFileNames(wordFiles); CharArraySet words = null; if (files.size() > 0) { // default stopwords list has 35 or so words, but maybe don't make it that // big to start words = new CharArraySet(files.size() * 10, ignoreCase); for (String file : files) { List<String> wlist = getLines(loader, file.trim()); words.addAll(StopFilter.makeStopSet(wlist, ignoreCase)); } } return words; }
/**
 * Builds the analysis chain: n-gram tokenizer, then standard filtering,
 * lower-casing, and case-insensitive stop word removal.
 */
@Override
protected TokenStreamComponents createComponents(final String field) {
    final Tokenizer source = new NGramTokenizer(minNgram(), maxNgram());
    final CharArraySet stopSet = new CharArraySet(asList(stopWords()), true);
    // Stage the filters explicitly instead of nesting the constructor calls.
    TokenStream filtered = new StandardFilter(source);
    filtered = new LowerCaseFilter(filtered);
    filtered = new StopFilter(filtered, stopSet);
    return new TokenStreamComponents(source, filtered);
}
protected TokenStreamComponents createComponents(String fieldName, Reader reader) { final Tokenizer source = new ChineseTokenStream(reader); if (stopWordManager != null) { //走停止词过滤 CharArraySet stopWords = new CharArraySet(Version.LUCENE_CURRENT, stopWordManager.getStopWords(), true); TokenStream result = new StopFilter(Version.LUCENE_CURRENT, source, stopWords); return new TokenStreamComponents(source, result); } else { //走原始逻辑 return new TokenStreamComponents(source); } }
protected TokenStreamComponents createComponents(String fieldName, Reader reader) { final Tokenizer source = new NewMMSegTokenizer(newSeg(), reader); if (stopWordManager != null) { //执行停止词逻辑 CharArraySet stopWords = new CharArraySet(Version.LUCENE_CURRENT, stopWordManager.getStopWords(), true); TokenStream result = new StopFilter(Version.LUCENE_CURRENT, source, stopWords); return new TokenStreamComponents(source, result); } else { //原有逻辑 return new TokenStreamComponents(source); } } }