@Override protected List<FSTUtil.Path<PairOutputs.Pair<Long,BytesRef>>> getFullPrefixPaths( List<FSTUtil.Path<PairOutputs.Pair<Long,BytesRef>>> prefixPaths, Automaton lookupAutomaton, FST<PairOutputs.Pair<Long,BytesRef>> fst) throws IOException { // TODO: right now there's no penalty for fuzzy/edits, // ie a completion whose prefix matched exactly what the // user typed gets no boost over completions that // required an edit, which get no boost over completions // requiring two edits. I suspect a multiplicative // factor is appropriate (eg, say a fuzzy match must be at // least 2X better weight than the non-fuzzy match to // "compete") ... in which case I think the wFST needs // to be log weights or something ... Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton)); /* Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8"); w.write(levA.toDot()); w.close(); System.out.println("Wrote LevA to out.dot"); */ return FSTUtil.intersectPrefixPaths(levA, fst); }
final Automaton toAutomaton(TokenStream ts, final TokenStreamToAutomaton ts2a) throws IOException { // Create corresponding automaton: labels are bytes // from each analyzed token, with byte 0 used as // separator between tokens: Automaton automaton = ts2a.toAutomaton(ts); automaton = replaceSep(automaton); automaton = convertAutomaton(automaton); // TODO: LUCENE-5660 re-enable this once we disallow massive suggestion strings // assert SpecialOperations.isFinite(automaton); // Get all paths from the automaton (there can be // more than one path, eg if the analyzer created a // graph using SynFilter or WDF): return automaton; }
@Override public TokenStream create(TokenStream tokenStream) { if (removeTrailing) { return new StopFilter(tokenStream, stopWords); } else { return new SuggestStopFilter(tokenStream, stopWords); } }
try { Automaton lookupAutomaton = toLookupAutomaton(key); List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst); if (sameSurfaceForm(utf8Key, output2)) { results.add(getLookupResult(completion.output.output1, output2, spare)); break; prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst); LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);
@Override public void build(InputIterator iterator) throws IOException { String prefix = getClass().getSimpleName(); Directory tempDir = getTempDir(); OfflineSorter sorter = new OfflineSorter(tempDir, prefix, new AnalyzingComparator(hasPayloads)); TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton(); String tempSortedFileName = null; new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions); for (IntsRef string; (string = finiteStrings.next()) != null; count++) { Util.toBytesRef(string, scratch); output.writeInt(encodeWeight(iterator.weight()));
final Automaton toLookupAutomaton(final CharSequence key) throws IOException { // TODO: is there a Reader from a CharSequence? // Turn tokenstream into automaton: Automaton automaton = null; try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) { automaton = getTokenStreamToAutomaton().toAutomaton(ts); } automaton = replaceSep(automaton); // TODO: we can optimize this somewhat by determinizing // while we convert // This automaton should not blow up during determinize: automaton = Operations.determinize(automaton, Integer.MAX_VALUE); return automaton; }
public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException { final TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton(); Automaton automaton; try (TokenStream ts = stream) { automaton = toAutomaton(ts, ts2a); } LimitedFiniteStringsIterator finiteStrings = new LimitedFiniteStringsIterator(automaton, maxGraphExpansions); Set<IntsRef> set = new HashSet<>(); for (IntsRef string = finiteStrings.next(); string != null; string = finiteStrings.next()) { set.add(IntsRef.deepCopyOf(string)); } return Collections.unmodifiableSet(set); }
public void addSurface(BytesRef surface, BytesRef payload, long cost) throws IOException { int surfaceIndex = -1; long encodedWeight = cost == -1 ? cost : encodeWeight(cost); surfaceFormsAndPayload[surfaceIndex] = new SurfaceFormAndPayload(payloadRef, encodedWeight); } else { surfaceFormsAndPayload[surfaceIndex].payload = payloadRef;
@Override public int compareTo(SurfaceFormAndPayload o) { int res = compare(weight, o.weight); if (res == 0 ){ return payload.compareTo(o.payload); } return res; } public static int compare(long x, long y) {
@Override public TokenStreamToAutomaton getTokenStreamToAutomaton() { final TokenStreamToAutomaton tsta = super.getTokenStreamToAutomaton(); tsta.setUnicodeArcs(unicodeAware); return tsta; }
@Override protected Automaton convertAutomaton(Automaton a) { if (unicodeAware) { // FLORIAN EDIT: get converted Automaton from superclass Automaton utf8automaton = new UTF32ToUTF8().convert(super.convertAutomaton(a)); // This automaton should not blow up during determinize: utf8automaton = Operations.determinize(utf8automaton, Integer.MAX_VALUE); return utf8automaton; } else { return super.convertAutomaton(a); } }
public TokenStreamToAutomaton getTokenStreamToAutomaton() { final TokenStreamToAutomaton tsta; if (preserveSep) { tsta = new EscapingTokenStreamToAutomaton((char) sepLabel); } else { // When we're not preserving sep, we don't steal 0xff // byte, so we don't need to do any escaping: tsta = new TokenStreamToAutomaton(); } tsta.setPreservePositionIncrements(preservePositionIncrements); return tsta; }
final Automaton toAutomaton(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException { try (TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString())) { return toAutomaton(ts, ts2a); } }
private LookupResult getLookupResult(Long output1, BytesRef output2, CharsRefBuilder spare) { LookupResult result; if (hasPayloads) { int sepIndex = -1; for(int i=0;i<output2.length;i++) { if (output2.bytes[output2.offset+i] == payloadSep) { sepIndex = i; break; } } assert sepIndex != -1; final int payloadLen = output2.length - sepIndex - 1; spare.copyUTF8Bytes(output2.bytes, output2.offset, sepIndex); BytesRef payload = new BytesRef(payloadLen); System.arraycopy(output2.bytes, sepIndex+1, payload.bytes, 0, payloadLen); payload.length = payloadLen; result = new LookupResult(spare.toString(), decodeWeight(output1), payload); } else { spare.copyUTF8Bytes(output2); result = new LookupResult(spare.toString(), decodeWeight(output1)); } return result; }
/** Retrieve suggestions, specifying whether all terms * must match ({@code allTermsRequired}) and whether the hits * should be highlighted ({@code doHighlight}). */ public List<LookupResult> lookup(CharSequence key, Map<BytesRef, BooleanClause.Occur> contextInfo, int num, boolean allTermsRequired, boolean doHighlight) throws IOException { return lookup(key, toQuery(contextInfo), num, allTermsRequired, doHighlight); }
public void finishTerm(long defaultWeight) throws IOException { ArrayUtil.timSort(surfaceFormsAndPayload, 0, count); int deduplicator = 0; analyzed.append((byte) 0); analyzed.setLength(analyzed.length() + 1); analyzed.grow(analyzed.length()); for (int i = 0; i < count; i++) { analyzed.setByteAt(analyzed.length() - 1, (byte) deduplicator++); Util.toIntsRef(analyzed.get(), scratchInts); SurfaceFormAndPayload candiate = surfaceFormsAndPayload[i]; long cost = candiate.weight == -1 ? encodeWeight(Math.min(Integer.MAX_VALUE, defaultWeight)) : candiate.weight; builder.add(scratchInts.get(), outputs.newPair(cost, candiate.payload)); } seenSurfaceForms.clear(); count = 0; }
int[] topoSortStates = topoSortStates(a); for(int i=0;i<topoSortStates.length;i++) { int state = topoSortStates[topoSortStates.length-1-i];
@Override public List<Lookup.LookupResult> lookup(CharSequence key, Set<BytesRef> contexts, int num, boolean allTermsRequired, boolean doHighlight) throws IOException { // Don't * numFactor here since we do it down below, once, in the call chain: return super.lookup(key, contexts, num, allTermsRequired, doHighlight); }
public AnalyzingCompletionLookupProvider(boolean preserveSep, boolean exactFirst, boolean preservePositionIncrements, boolean hasPayloads) { this.preserveSep = preserveSep; this.preservePositionIncrements = preservePositionIncrements; this.hasPayloads = hasPayloads; this.maxSurfaceFormsPerAnalyzedForm = MAX_SURFACE_FORMS_PER_ANALYZED_FORM; this.maxGraphExpansions = MAX_GRAPH_EXPANSIONS; int options = preserveSep ? XAnalyzingSuggester.PRESERVE_SEP : 0; // needs to fixed in the suggester first before it can be supported //options |= exactFirst ? XAnalyzingSuggester.EXACT_FIRST : 0; prototype = new XAnalyzingSuggester(null, null, null, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, preservePositionIncrements, null, false, 1, XAnalyzingSuggester.SEP_LABEL, XAnalyzingSuggester.PAYLOAD_SEP, XAnalyzingSuggester.END_BYTE, XAnalyzingSuggester.HOLE_CHARACTER); }
@Override public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException { return prototype.toFiniteStrings(stream); }