org.apache.lucene.search.suggest.analyzing Java code examples

/**
 * Computes the prefix paths for a fuzzy lookup by intersecting a
 * Levenshtein expansion of the lookup automaton with the suggester FST.
 *
 * <p>The incoming {@code prefixPaths} are not consulted; the result is
 * derived solely from {@code lookupAutomaton} and {@code fst}.
 *
 * @param prefixPaths     paths already matched by the caller (unused here)
 * @param lookupAutomaton automaton built from the text being looked up
 * @param fst             the FST to intersect against
 * @return the paths of {@code fst} accepted by the converted Levenshtein automaton
 * @throws IOException if intersecting with the FST fails
 */
@Override
protected List<FSTUtil.Path<PairOutputs.Pair<Long,BytesRef>>> getFullPrefixPaths(
    List<FSTUtil.Path<PairOutputs.Pair<Long,BytesRef>>> prefixPaths,
    Automaton lookupAutomaton,
    FST<PairOutputs.Pair<Long,BytesRef>> fst) throws IOException {
  // TODO: edits currently carry no penalty: a completion whose prefix
  // matched exactly what the user typed gets no boost over one that
  // needed an edit, which in turn gets no boost over one needing two
  // edits.  A multiplicative factor is probably appropriate (e.g. a
  // fuzzy match must have at least 2X better weight than the non-fuzzy
  // match to "compete"), in which case the wFST likely needs to hold
  // log weights or something similar.
  final Automaton editTolerant = toLevenshteinAutomata(lookupAutomaton);
  final Automaton levA = convertAutomaton(editTolerant);
  // For debugging, levA.toDot() can be written out as a .dot file.
  return FSTUtil.intersectPrefixPaths(levA, fst);
}

/**
 * Converts an analyzed token stream into the automaton used by the suggester.
 *
 * <p>The stream is turned into an automaton by {@code ts2a}, passed through
 * {@code replaceSep}, and finally through {@code convertAutomaton}.
 *
 * @param ts   the token stream to consume
 * @param ts2a the converter that builds the initial automaton
 * @return the converted automaton; it may contain more than one path,
 *         e.g. when the analyzer produced a graph (SynFilter, WDF)
 * @throws IOException if reading the token stream fails
 */
final Automaton toAutomaton(TokenStream ts, final TokenStreamToAutomaton ts2a) throws IOException {
  // Labels are the bytes of each analyzed token, with byte 0 acting as the
  // separator between tokens.
  // TODO: LUCENE-5660 re-enable the finiteness assertion
  // (SpecialOperations.isFinite) once massive suggestion strings are disallowed.
  return convertAutomaton(replaceSep(ts2a.toAutomaton(ts)));
}

/**
 * Wraps the given stream in a stop-word filter.
 *
 * @param tokenStream the stream to filter
 * @return a {@code StopFilter} when {@code removeTrailing} is set,
 *         otherwise a {@code SuggestStopFilter}; both use {@code stopWords}
 */
@Override
public TokenStream create(TokenStream tokenStream) {
  return removeTrailing
      ? new StopFilter(tokenStream, stopWords)
      : new SuggestStopFilter(tokenStream, stopWords);
}

try {
 Automaton lookupAutomaton = toLookupAutomaton(key);
 List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);
   if (sameSurfaceForm(utf8Key, output2)) {
    results.add(getLookupResult(completion.output.output1, output2, spare));
    break;
 prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
  LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);

@Override
public void build(InputIterator iterator) throws IOException {
 String prefix = getClass().getSimpleName();
  Directory tempDir = getTempDir();
  OfflineSorter sorter = new OfflineSorter(tempDir, prefix, new AnalyzingComparator(hasPayloads));
 TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
  String tempSortedFileName = null;
       new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
   for (IntsRef string; (string = finiteStrings.next()) != null; count++) {
    Util.toBytesRef(string, scratch);
    output.writeInt(encodeWeight(iterator.weight()));

/**
 * Builds the determinized automaton for a lookup key.
 *
 * <p>The key is analyzed with {@code queryAnalyzer}, converted to an
 * automaton, passed through {@code replaceSep} and then determinized.
 *
 * @param key the text to look up
 * @return the determinized automaton for {@code key}
 * @throws IOException if analysis fails
 */
final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
  // TODO: is there a Reader from a CharSequence?
  Automaton fromTokens;
  try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
    // Turn the token stream into an automaton.
    fromTokens = getTokenStreamToAutomaton().toAutomaton(ts);
  }
  final Automaton separated = replaceSep(fromTokens);
  // TODO: this could be optimized somewhat by determinizing during conversion.
  // Determinizing this automaton is not expected to blow up, hence no limit.
  return Operations.determinize(separated, Integer.MAX_VALUE);
}

/**
 * Enumerates the finite strings accepted by the automaton of a token stream.
 *
 * <p>The stream is closed once its automaton has been built. Enumeration is
 * bounded by {@code maxGraphExpansions}.
 *
 * @param stream the token stream to consume; closed by this method
 * @return an unmodifiable set holding a deep copy of every enumerated string
 * @throws IOException if reading the token stream fails
 */
public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
  final TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();
  Automaton automaton;
  try (TokenStream ts = stream) {
    automaton = toAutomaton(ts, ts2a);
  }

  final LimitedFiniteStringsIterator iterator =
      new LimitedFiniteStringsIterator(automaton, maxGraphExpansions);
  final Set<IntsRef> collected = new HashSet<>();
  IntsRef current;
  while ((current = iterator.next()) != null) {
    // Copy each string before storing it rather than keeping the iterator's instance.
    collected.add(IntsRef.deepCopyOf(current));
  }
  return Collections.unmodifiableSet(collected);
}

public void addSurface(BytesRef surface, BytesRef payload, long cost) throws IOException {
  int surfaceIndex = -1;
  long encodedWeight = cost == -1 ? cost : encodeWeight(cost);
    surfaceFormsAndPayload[surfaceIndex] = new SurfaceFormAndPayload(payloadRef, encodedWeight);
  } else {
    surfaceFormsAndPayload[surfaceIndex].payload = payloadRef;

/**
 * Orders entries by weight first and by payload when the weights tie.
 *
 * @param o the entry to compare against
 * @return the weight comparison if it is non-zero, otherwise the payload comparison
 */
@Override
public int compareTo(SurfaceFormAndPayload o) {
  final int byWeight = compare(weight, o.weight);
  return byWeight != 0 ? byWeight : payload.compareTo(o.payload);
}
public static int compare(long x, long y) {

/**
 * Returns the superclass converter with its unicode-arcs setting
 * aligned to {@code unicodeAware}.
 *
 * @return the configured token-stream-to-automaton converter
 */
@Override
public TokenStreamToAutomaton getTokenStreamToAutomaton() {
  final TokenStreamToAutomaton converter = super.getTokenStreamToAutomaton();
  converter.setUnicodeArcs(unicodeAware);
  return converter;
}

/**
 * Converts the automaton via the superclass and, when {@code unicodeAware}
 * is set, additionally translates it with {@code UTF32ToUTF8} and
 * determinizes the result.
 *
 * @param a the automaton to convert
 * @return the converted automaton
 */
@Override
protected Automaton convertAutomaton(Automaton a) {
  if (!unicodeAware) {
    return super.convertAutomaton(a);
  }
  final UTF32ToUTF8 utf8Converter = new UTF32ToUTF8();
  // Start from the superclass conversion rather than the raw input.
  final Automaton base = super.convertAutomaton(a);
  final Automaton utf8Automaton = utf8Converter.convert(base);
  // Determinizing this automaton is not expected to blow up, hence no limit.
  return Operations.determinize(utf8Automaton, Integer.MAX_VALUE);
}

/**
 * Creates the converter that turns token streams into automata.
 *
 * <p>An {@code EscapingTokenStreamToAutomaton} built with {@code sepLabel}
 * is used when {@code preserveSep} is set; otherwise a plain
 * {@code TokenStreamToAutomaton}. Either way the converter is configured
 * with {@code preservePositionIncrements}.
 *
 * @return a newly created, configured converter
 */
public TokenStreamToAutomaton getTokenStreamToAutomaton() {
  // When separators are not preserved the 0xff byte is not stolen,
  // so no escaping is required and the plain converter suffices.
  final TokenStreamToAutomaton converter = preserveSep
      ? new EscapingTokenStreamToAutomaton((char) sepLabel)
      : new TokenStreamToAutomaton();
  converter.setPreservePositionIncrements(preservePositionIncrements);
  return converter;
}

/**
 * Analyzes a surface form with {@code indexAnalyzer} and converts the
 * resulting token stream into an automaton.
 *
 * @param surfaceForm the UTF-8 bytes of the surface form
 * @param ts2a        the converter used to build the automaton
 * @return the automaton for the analyzed surface form
 * @throws IOException if analysis fails
 */
final Automaton toAutomaton(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
  try (TokenStream analyzed = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString())) {
    final Automaton result = toAutomaton(analyzed, ts2a);
    return result;
  }
}

/**
 * Builds a {@link LookupResult} from one completion output.
 *
 * <p>Without payloads, {@code output2} is decoded as the UTF-8 surface form.
 * With payloads, {@code output2} holds the surface form, a single
 * {@code payloadSep} byte, and then the payload; the bytes before the first
 * separator become the key and the bytes after it are copied into a new
 * payload {@code BytesRef}.
 *
 * @param output1 the encoded weight, decoded via {@code decodeWeight}
 * @param output2 the surface form bytes, optionally followed by separator and payload
 * @param spare   scratch builder used to decode the surface form
 * @return the lookup result for this completion
 */
private LookupResult getLookupResult(Long output1, BytesRef output2, CharsRefBuilder spare) {
  if (!hasPayloads) {
    spare.copyUTF8Bytes(output2);
    return new LookupResult(spare.toString(), decodeWeight(output1));
  }

  // Locate the first separator; sepIndex is relative to output2.offset.
  int sepIndex = -1;
  for (int i = 0; i < output2.length; i++) {
    if (output2.bytes[output2.offset + i] == payloadSep) {
      sepIndex = i;
      break;
    }
  }
  assert sepIndex != -1;

  final int payloadLen = output2.length - sepIndex - 1;
  spare.copyUTF8Bytes(output2.bytes, output2.offset, sepIndex);
  BytesRef payload = new BytesRef(payloadLen);
  // The source position must include output2.offset, just like the scan and
  // the surface-form copy above; otherwise the wrong bytes are copied when
  // output2 is a slice that does not start at index 0 of its backing array.
  System.arraycopy(output2.bytes, output2.offset + sepIndex + 1, payload.bytes, 0, payloadLen);
  payload.length = payloadLen;
  return new LookupResult(spare.toString(), decodeWeight(output1), payload);
}

/**
 * Retrieve suggestions, specifying whether all terms
 * must match ({@code allTermsRequired}) and whether the hits
 * should be highlighted ({@code doHighlight}).
 *
 * <p>The {@code contextInfo} map is converted with {@code toQuery} and the
 * call is forwarded to the {@code lookup} overload that accepts the result.
 *
 * @param key              the text to find suggestions for
 * @param contextInfo      contexts mapped to the clause occurrence each should have
 * @param num              passed through unchanged to the delegate overload
 * @param allTermsRequired whether all terms must match
 * @param doHighlight      whether the hits should be highlighted
 * @return the results of the delegate {@code lookup} call
 * @throws IOException if the delegate lookup fails
 */
public List<LookupResult> lookup(CharSequence key, Map<BytesRef, BooleanClause.Occur> contextInfo, int num, boolean allTermsRequired, boolean doHighlight) throws IOException {
  return lookup(key, toQuery(contextInfo), num, allTermsRequired, doHighlight);
}

/**
 * Flushes every surface form collected for the current analyzed term into
 * the builder, then resets the per-term state.
 *
 * <p>The collected entries are sorted, and each is added under the analyzed
 * bytes followed by a zero byte and a trailing byte holding the entry's
 * position in the sorted order.
 *
 * @param defaultWeight weight used for entries whose stored weight is {@code -1};
 *                      it is capped at {@code Integer.MAX_VALUE} before encoding
 * @throws IOException if adding to the builder fails
 */
public void finishTerm(long defaultWeight) throws IOException {
  ArrayUtil.timSort(surfaceFormsAndPayload, 0, count);

  // Append a zero byte, then reserve one more trailing slot that is
  // overwritten for every entry below.
  analyzed.append((byte) 0);
  analyzed.setLength(analyzed.length() + 1);
  analyzed.grow(analyzed.length());

  for (int position = 0; position < count; position++) {
    // The position in sorted order doubles as the de-duplication byte.
    analyzed.setByteAt(analyzed.length() - 1, (byte) position);
    Util.toIntsRef(analyzed.get(), scratchInts);

    final SurfaceFormAndPayload candidate = surfaceFormsAndPayload[position];
    final long cost;
    if (candidate.weight == -1) {
      cost = encodeWeight(Math.min(Integer.MAX_VALUE, defaultWeight));
    } else {
      cost = candidate.weight;
    }
    builder.add(scratchInts.get(), outputs.newPair(cost, candidate.payload));
  }

  seenSurfaceForms.clear();
  count = 0;
}

int[] topoSortStates = topoSortStates(a);
for(int i=0;i<topoSortStates.length;i++) {
 int state = topoSortStates[topoSortStates.length-1-i];

/**
 * Forwards the lookup to the superclass with {@code num} left untouched.
 *
 * @return the results produced by the superclass lookup
 * @throws IOException if the superclass lookup fails
 */
@Override
public List<Lookup.LookupResult> lookup(CharSequence key, Set<BytesRef> contexts, int num, boolean allTermsRequired, boolean doHighlight) throws IOException {
  // num is deliberately not multiplied by numFactor here: that happens
  // exactly once further down the call chain.
  final List<Lookup.LookupResult> results =
      super.lookup(key, contexts, num, allTermsRequired, doHighlight);
  return results;
}

/**
 * Creates a provider and the prototype {@code XAnalyzingSuggester} it delegates to.
 *
 * @param preserveSep                whether {@code XAnalyzingSuggester.PRESERVE_SEP} is set in the options
 * @param exactFirst                 currently not applied to the suggester options
 * @param preservePositionIncrements passed to the prototype suggester
 * @param hasPayloads                stored on this provider
 */
public AnalyzingCompletionLookupProvider(boolean preserveSep, boolean exactFirst, boolean preservePositionIncrements, boolean hasPayloads) {
  this.preserveSep = preserveSep;
  this.hasPayloads = hasPayloads;
  this.preservePositionIncrements = preservePositionIncrements;
  this.maxGraphExpansions = MAX_GRAPH_EXPANSIONS;
  this.maxSurfaceFormsPerAnalyzedForm = MAX_SURFACE_FORMS_PER_ANALYZED_FORM;

  int options = 0;
  if (preserveSep) {
    options |= XAnalyzingSuggester.PRESERVE_SEP;
  }
  // XAnalyzingSuggester.EXACT_FIRST is intentionally not set from exactFirst:
  // it needs to be fixed in the suggester first before it can be supported.

  prototype = new XAnalyzingSuggester(
      null,
      null,
      null,
      options,
      maxSurfaceFormsPerAnalyzedForm,
      maxGraphExpansions,
      preservePositionIncrements,
      null,
      false,
      1,
      XAnalyzingSuggester.SEP_LABEL,
      XAnalyzingSuggester.PAYLOAD_SEP,
      XAnalyzingSuggester.END_BYTE,
      XAnalyzingSuggester.HOLE_CHARACTER);
}

/**
 * Delegates finite-string enumeration to the prototype suggester.
 *
 * @param stream the token stream to expand
 * @return whatever {@code prototype.toFiniteStrings} returns for {@code stream}
 * @throws IOException if the prototype fails while reading the stream
 */
@Override
public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
  final Set<IntsRef> finiteStrings = prototype.toFiniteStrings(stream);
  return finiteStrings;
}

How to use org.apache.lucene.search.suggest.analyzing

Best Java code snippets using org.apache.lucene.search.suggest.analyzing (Showing top 20 results out of 315)