org.apache.lucene.search.suggest.analyzing.XAnalyzingSuggester java code examples

try {
 Automaton lookupAutomaton = toLookupAutomaton(key);
 List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);
   if (sameSurfaceForm(utf8Key, output2)) {
    results.add(getLookupResult(completion.output.output1, output2, spare));
    break;
 prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
  LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);

/**
 * Builds a {@link LookupResult} from a single completion output.
 *
 * <p>When {@code hasPayloads} is set, {@code output2} is split at the first
 * {@code payloadSep} byte: the bytes before it are decoded as the UTF-8 surface
 * form and the bytes after it are copied into a fresh payload {@link BytesRef}.
 * Otherwise the whole of {@code output2} is decoded as the surface form.
 *
 * @param output1 encoded weight, passed through {@code decodeWeight}
 * @param output2 surface form bytes, optionally followed by separator and payload
 * @param spare   scratch builder used to decode the UTF-8 surface form
 * @return the lookup result for this completion
 */
private LookupResult getLookupResult(Long output1, BytesRef output2, CharsRefBuilder spare) {
 LookupResult result;
 if (hasPayloads) {
  // sepIndex is relative to output2.offset, not to the backing array.
  int sepIndex = -1;
  for(int i=0;i<output2.length;i++) {
   if (output2.bytes[output2.offset+i] == payloadSep) {
    sepIndex = i;
    break;
   }
  }
  assert sepIndex != -1;
  final int payloadLen = output2.length - sepIndex - 1;
  spare.copyUTF8Bytes(output2.bytes, output2.offset, sepIndex);
  BytesRef payload = new BytesRef(payloadLen);
  // The source position must include output2.offset; without it the payload
  // is read from the wrong place whenever output2 is a slice that does not
  // start at index 0 of its backing array.
  System.arraycopy(output2.bytes, output2.offset+sepIndex+1, payload.bytes, 0, payloadLen);
  payload.length = payloadLen;
  result = new LookupResult(spare.toString(), decodeWeight(output1), payload);
 } else {
  spare.copyUTF8Bytes(output2);
  result = new LookupResult(spare.toString(), decodeWeight(output1));
 }
 return result;
}

public void addSurface(BytesRef surface, BytesRef payload, long cost) throws IOException {
  int surfaceIndex = -1;
  long encodedWeight = cost == -1 ? cost : encodeWeight(cost);

/**
 * Converts an analyzed token stream into an automaton by running it through
 * {@code ts2a}, then {@code replaceSep}, then {@code convertAutomaton}.
 *
 * @param ts   the analyzed token stream to consume
 * @param ts2a the converter that turns the token stream into an automaton
 * @return the converted automaton
 * @throws IOException if the token stream cannot be consumed
 */
final Automaton toAutomaton(TokenStream ts, final TokenStreamToAutomaton ts2a) throws IOException {
  // Labels are the bytes of each analyzed token, with byte 0 acting as the
  // separator between tokens.
  final Automaton tokenGraph = ts2a.toAutomaton(ts);
  final Automaton withSeparators = replaceSep(tokenGraph);
  // TODO: LUCENE-5660 re-enable the SpecialOperations.isFinite(automaton)
  // assertion once we disallow massive suggestion strings.
  // The result may contain more than one path, eg if the analyzer created a
  // graph using SynFilter or WDF.
  return convertAutomaton(withSeparators);
}

/**
 * Analyzes a lookup key with the query analyzer and returns the resulting
 * automaton after separator replacement and determinization.
 *
 * @param key the text typed so far
 * @return the determinized lookup automaton
 * @throws IOException if analysis of the key fails
 */
final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
  // TODO: is there a Reader from a CharSequence?
  final Automaton analyzed;
  try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
    // Turn the token stream into an automaton while the stream is still open.
    analyzed = getTokenStreamToAutomaton().toAutomaton(ts);
  }
  final Automaton withSeparators = replaceSep(analyzed);
  // TODO: we can optimize this somewhat by determinizing while we convert.
  // This automaton should not blow up during determinize:
  return Operations.determinize(withSeparators, Integer.MAX_VALUE);
}

/**
 * Enumerates the finite strings accepted by the automaton built from the given
 * token stream, limited by {@code maxGraphExpansions}.
 *
 * <p>The stream is closed by this method.
 *
 * @param stream the token stream to analyze; closed before returning
 * @return an unmodifiable set holding a deep copy of every enumerated string
 * @throws IOException if the token stream cannot be consumed or closed
 */
public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
  final TokenStreamToAutomaton converter = getTokenStreamToAutomaton();
  final Automaton graph;
  try (TokenStream ts = stream) {
    graph = toAutomaton(ts, converter);
  }
  final LimitedFiniteStringsIterator paths =
      new LimitedFiniteStringsIterator(graph, maxGraphExpansions);
  final Set<IntsRef> collected = new HashSet<>();
  IntsRef current;
  while ((current = paths.next()) != null) {
    // Store a copy rather than the iterator's own instance
    // (presumably the iterator reuses its buffer between calls; TODO confirm).
    collected.add(IntsRef.deepCopyOf(current));
  }
  return Collections.unmodifiableSet(collected);
}

@Override
public void build(InputIterator iterator) throws IOException {
 String prefix = getClass().getSimpleName();
  Directory tempDir = getTempDir();
  OfflineSorter sorter = new OfflineSorter(tempDir, prefix, new AnalyzingComparator(hasPayloads));
 TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
  String tempSortedFileName = null;
       new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
   for (IntsRef string; (string = finiteStrings.next()) != null; count++) {
    Util.toBytesRef(string, scratch);
    output.writeInt(encodeWeight(iterator.weight()));

BytesRefBuilder scratch = new BytesRefBuilder();
TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
      new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
  for (IntsRef string; (string = finiteStrings.next()) != null; count++) {
   Util.toBytesRef(string, scratch);
   output.writeInt(encodeWeight(iterator.weight()));

/**
 * Returns the superclass converter with its unicode-arcs setting set to
 * {@code unicodeAware}.
 *
 * @return the configured token-stream-to-automaton converter
 */
@Override
public TokenStreamToAutomaton getTokenStreamToAutomaton() {
 final TokenStreamToAutomaton converter = super.getTokenStreamToAutomaton();
 converter.setUnicodeArcs(unicodeAware);
 return converter;
}

/**
 * Analyzes a UTF-8 surface form with the index analyzer and converts the
 * resulting token stream into an automaton.
 *
 * @param surfaceForm the UTF-8 encoded surface form
 * @param ts2a        the converter that turns the token stream into an automaton
 * @return the automaton for the analyzed surface form
 * @throws IOException if analysis fails
 */
final Automaton toAutomaton(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
  final String surfaceText = surfaceForm.utf8ToString();
  try (TokenStream analyzed = indexAnalyzer.tokenStream("", surfaceText)) {
    return toAutomaton(analyzed, ts2a);
  }
}

/**
 * Converts the automaton via the superclass and, when {@code unicodeAware} is
 * set, additionally rewrites it with {@link UTF32ToUTF8} and determinizes it.
 *
 * @param a the automaton to convert
 * @return the converted automaton
 */
@Override
protected Automaton convertAutomaton(Automaton a) {
 if (!unicodeAware) {
  return super.convertAutomaton(a);
 }
 // Start from the superclass conversion, then convert to UTF-8.
 final UTF32ToUTF8 utf8Converter = new UTF32ToUTF8();
 final Automaton utf8Form = utf8Converter.convert(super.convertAutomaton(a));
 // This automaton should not blow up during determinize:
 return Operations.determinize(utf8Form, Integer.MAX_VALUE);
}

@Override
public void build(InputIterator iterator) throws IOException {
 String prefix = getClass().getSimpleName();
  Directory tempDir = getTempDir();
  OfflineSorter sorter = new OfflineSorter(tempDir, prefix, new AnalyzingComparator(hasPayloads));
 TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
  String tempSortedFileName = null;
       new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
   for (IntsRef string; (string = finiteStrings.next()) != null; count++) {
    Util.toBytesRef(string, scratch);
    output.writeInt(encodeWeight(iterator.weight()));

/**
 * Enumerates the finite strings accepted by the automaton built from the given
 * token stream, limited by {@code maxGraphExpansions}.
 *
 * <p>The stream is closed by this method.
 *
 * @param stream the token stream to analyze; closed before returning
 * @return an unmodifiable set holding a deep copy of every enumerated string
 * @throws IOException if the token stream cannot be consumed or closed
 */
public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
  final TokenStreamToAutomaton converter = getTokenStreamToAutomaton();
  final Automaton graph;
  try (TokenStream ts = stream) {
    graph = toAutomaton(ts, converter);
  }
  final LimitedFiniteStringsIterator paths =
      new LimitedFiniteStringsIterator(graph, maxGraphExpansions);
  final Set<IntsRef> collected = new HashSet<>();
  IntsRef current;
  while ((current = paths.next()) != null) {
    // Store a copy rather than the iterator's own instance
    // (presumably the iterator reuses its buffer between calls; TODO confirm).
    collected.add(IntsRef.deepCopyOf(current));
  }
  return Collections.unmodifiableSet(collected);
}

/**
 * Converts an analyzed token stream into an automaton by running it through
 * {@code ts2a}, then {@code replaceSep}, then {@code convertAutomaton}.
 *
 * @param ts   the analyzed token stream to consume
 * @param ts2a the converter that turns the token stream into an automaton
 * @return the converted automaton
 * @throws IOException if the token stream cannot be consumed
 */
final Automaton toAutomaton(TokenStream ts, final TokenStreamToAutomaton ts2a) throws IOException {
  // Labels are the bytes of each analyzed token, with byte 0 acting as the
  // separator between tokens.
  final Automaton tokenGraph = ts2a.toAutomaton(ts);
  final Automaton withSeparators = replaceSep(tokenGraph);
  // TODO: LUCENE-5660 re-enable the SpecialOperations.isFinite(automaton)
  // assertion once we disallow massive suggestion strings.
  // The result may contain more than one path, eg if the analyzer created a
  // graph using SynFilter or WDF.
  return convertAutomaton(withSeparators);
}

/**
 * Analyzes a lookup key with the query analyzer and returns the resulting
 * automaton after separator replacement and determinization.
 *
 * @param key the text typed so far
 * @return the determinized lookup automaton
 * @throws IOException if analysis of the key fails
 */
final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
  // TODO: is there a Reader from a CharSequence?
  final Automaton analyzed;
  try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
    // Turn the token stream into an automaton while the stream is still open.
    analyzed = getTokenStreamToAutomaton().toAutomaton(ts);
  }
  final Automaton withSeparators = replaceSep(analyzed);
  // TODO: we can optimize this somewhat by determinizing while we convert.
  // This automaton should not blow up during determinize:
  return Operations.determinize(withSeparators, Integer.MAX_VALUE);
}

/**
 * Returns the superclass converter with its unicode-arcs setting set to
 * {@code unicodeAware}.
 *
 * @return the configured token-stream-to-automaton converter
 */
@Override
public TokenStreamToAutomaton getTokenStreamToAutomaton() {
 final TokenStreamToAutomaton converter = super.getTokenStreamToAutomaton();
 converter.setUnicodeArcs(unicodeAware);
 return converter;
}

/**
 * Analyzes a UTF-8 surface form with the index analyzer and converts the
 * resulting token stream into an automaton.
 *
 * @param surfaceForm the UTF-8 encoded surface form
 * @param ts2a        the converter that turns the token stream into an automaton
 * @return the automaton for the analyzed surface form
 * @throws IOException if analysis fails
 */
final Automaton toAutomaton(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
  final String surfaceText = surfaceForm.utf8ToString();
  try (TokenStream analyzed = indexAnalyzer.tokenStream("", surfaceText)) {
    return toAutomaton(analyzed, ts2a);
  }
}

/**
 * Converts the automaton via the superclass and, when {@code unicodeAware} is
 * set, additionally rewrites it with {@link UTF32ToUTF8} and determinizes it.
 *
 * @param a the automaton to convert
 * @return the converted automaton
 */
@Override
protected Automaton convertAutomaton(Automaton a) {
 if (!unicodeAware) {
  return super.convertAutomaton(a);
 }
 // Start from the superclass conversion, then convert to UTF-8.
 final UTF32ToUTF8 utf8Converter = new UTF32ToUTF8();
 final Automaton utf8Form = utf8Converter.convert(super.convertAutomaton(a));
 // This automaton should not blow up during determinize:
 return Operations.determinize(utf8Form, Integer.MAX_VALUE);
}

try {
 Automaton lookupAutomaton = toLookupAutomaton(key);
 List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);
   if (sameSurfaceForm(utf8Key, output2)) {
    results.add(getLookupResult(completion.output.output1, output2, spare));
    break;
 prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
  LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);

@Override
public void build(InputIterator iterator) throws IOException {
 String prefix = getClass().getSimpleName();
  Directory tempDir = getTempDir();
  OfflineSorter sorter = new OfflineSorter(tempDir, prefix, new AnalyzingComparator(hasPayloads));
 TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
  String tempSortedFileName = null;
       new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
   for (IntsRef string; (string = finiteStrings.next()) != null; count++) {
    Util.toBytesRef(string, scratch);
    output.writeInt(encodeWeight(iterator.weight()));

Javadoc

Suggester that first analyzes the surface form, adds the analyzed form to a weighted FST, and then does the same thing at lookup time. This means lookup is based on the analyzed form while suggestions are still the surface form(s).

This can result in powerful suggester functionality. For example, if you use an analyzer removing stop words, then the partial text "ghost chr..." could see the suggestion "The Ghost of Christmas Past". Note that position increments MUST NOT be preserved for this example to work, so you should call the constructor with the preservePositionIncrements parameter set to false.

If SynonymFilter is used to map wifi and wireless network to hotspot then the partial text "wirele..." could suggest "wifi router". Token normalization like stemmers, accent removal, etc., would allow suggestions to ignore such variations.

When two matching suggestions have the same weight, they are tie-broken by the analyzed form. If their analyzed form is the same then the order is undefined.

There are some limitations:

A lookup from a query like "net" in English won't be any different than "net " (ie, user added a trailing space) because analyzers don't reflect when they've seen a token separator and when they haven't.
If you're using StopFilter, and the user will type "fast apple", but so far all they've typed is "fast a", again because the analyzer doesn't convey whether it's seen a token separator after the "a", StopFilter will remove that "a" causing far more matches than you'd expect.
Lookups with the empty string return no results instead of all results.

Most used methods

convertAutomaton
decodeWeight
cost -> weight
encodeWeight
weight -> cost
getFullPrefixPaths
Returns all completion paths to initialize the search.
getLookupResult
getTokenStreamToAutomaton
replaceSep
sameSurfaceForm
toAutomaton
toLookupAutomaton
topoSortStates
getTempDir

Popular in Java

Finding current android device location
scheduleAtFixedRate (Timer)
runOnUiThread (Activity)
findViewById (Activity)
FileOutputStream (java.io)
An output stream that writes bytes to a file. If the output file exists, it can be replaced or appen
PrintWriter (java.io)
Wraps either an existing OutputStream or an existing Writer and provides convenience methods for prin
Queue (java.util)
A collection designed for holding elements prior to processing. Besides basic java.util.Collection o
TimeZone (java.util)
TimeZone represents a time zone offset, and also figures out daylight savings. Typically, you get a
SSLHandshakeException (javax.net.ssl)
The exception that is thrown when a handshake could not be completed successfully.
Font (java.awt)
The Font class represents fonts, which are used to render text in a visible way. A font provides the
Top Vim plugins

How to use XAnalyzingSuggester in org.apache.lucene.search.suggest.analyzing

Best Java code snippets using org.apache.lucene.search.suggest.analyzing.XAnalyzingSuggester (Showing top 20 results out of 315)

How to use
XAnalyzingSuggester
in
org.apache.lucene.search.suggest.analyzing