Refine search
final Builder<BytesRef> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, false, Integer.MAX_VALUE, outputs, true, 15); assert bytes.length > 0; scratchBytes.writeTo(bytes, 0); indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length)); scratchBytes.reset(); index = indexBuilder.finish();
SegmentTermsEnumFrame f = getFrame(ord); assert f != null; final BytesRef prefix = new BytesRef(term.get().bytes, 0, f.prefix); if (f.nextEnt == -1) { out.println(" frame " + (isSeekFrame ? "(seek)" : "(next)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + prefix + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp<< BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS:0) + (f.isFloor ? BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR:0)) + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd()); if (f.prefix > 0 && isSeekFrame && f.arc.label != (term.byteAt(f.prefix-1)&0xFF)) { out.println(" broken seek state: arc.label=" + (char) f.arc.label + " vs term byte=" + (char) (term.byteAt(f.prefix-1)&0xFF)); throw new RuntimeException("seek state is broken"); BytesRef output = Util.get(fr.index, prefix); if (output == null) { out.println(" broken seek state: prefix is not final in index");
/** Reverse lookup (lookup by output instead of by input), * in the special case when your FSTs outputs are * strictly ascending. This locates the input/output * pair where the output is equal to the target, and will * return null if that output does not exist. * * <p>NOTE: this only works with {@code FST<Long>}, only * works when the outputs are ascending in order with * the inputs. * For example, simple ordinals (0, 1, * 2, ...), or file offets (when appending to a file) * fit this. */ public static IntsRef getByOutput(FST<Long> fst, long targetOutput) throws IOException { final BytesReader in = fst.getBytesReader(); // TODO: would be nice not to alloc this on every lookup FST.Arc<Long> arc = fst.getFirstArc(new FST.Arc<Long>()); FST.Arc<Long> scratchArc = new FST.Arc<>(); final IntsRefBuilder result = new IntsRefBuilder(); return getByOutput(fst, targetOutput, in, arc, scratchArc, result); }
emitDotState(out, "initial", "point", "white", ""); emitDotState(out, Long.toString(startArc.target), isFinal ? finalStateShape : stateShape, stateColor, finalOutput == null ? "" : fst.outputs.outputToString(finalOutput)); emitDotState(out, Long.toString(arc.target), stateShape, stateColor, finalOutput); out.write(" " + node + " -> " + arc.target + " [label=\"" + printableLabel(arc.label) + outs + "\"" + (arc.isFinal() ? " style=\"bold\"" : "" ) + " color=\"" + arcColor + "\"]\n");
BytesRefBuilder scratch = new BytesRefBuilder(); new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions); for (IntsRef string; (string = finiteStrings.next()) != null; count++) { Util.toBytesRef(string, scratch); if (scratch.length() > Short.MAX_VALUE-2) { throw new IllegalArgumentException( "cannot handle analyzed forms > " + (Short.MAX_VALUE-2) + " in length (got " + scratch.length() + ")"); short analyzedLength = (short) scratch.length(); Builder<Pair<Long,BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); analyzed.append((byte) dedup); Util.toIntsRef(analyzed.get(), scratchInts); builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface))); } else { int payloadOffset = input.getPosition() + surface.length; System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length+1, payloadLength); br.length = br.bytes.length; builder.add(scratchInts.get(), outputs.newPair(cost, br));
BytesRefBuilder flagsScratch = new BytesRefBuilder(); IntsRefBuilder scratchInts = new IntsRefBuilder(); BytesRef scratch1 = new BytesRef(); BytesRef scratch2 = new BytesRef(); IntsRefBuilder currentOrds = new IntsRefBuilder(); } else { encodeFlags(flagsScratch, wordForm); int ord = flagLookup.add(flagsScratch.get()); if (ord < 0) { Util.toUTF32(currentEntry, scratchInts); words.add(scratchInts.get(), currentOrds.get()); Util.toUTF32(currentEntry, scratchInts); words.add(scratchInts.get(), currentOrds.get()); success2 = true; } finally {
BytesRefBuilder b = new BytesRefBuilder(); b.append(tokenBytes); lastTokens[gramCount-1] = b; for(int i=token.length()-1;i>=0;i--) { if (token.byteAt(i) == separator) { BytesRef context = new BytesRef(token.bytes(), 0, i); Long output = Util.get(fst, Util.toIntsRef(context, new IntsRefBuilder())); assert output != null; contextCount = decodeWeight(output); lastTokenFragment = new BytesRef(token.bytes(), i + 1, token.length() - i - 1); break; searcher.addStartPaths(arc, prefixOutput, true, new IntsRefBuilder()); token.setLength(prefixLength); Util.toBytesRef(completion.input, suffix); token.append(suffix);
BytesRef scratch = new BytesRef(); InputIterator iter = new WFSTInputIterator(iterator); IntsRefBuilder scratchInts = new IntsRefBuilder(); BytesRefBuilder previous = null; PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); while ((scratch = iter.next()) != null) { long cost = iter.weight(); previous = new BytesRefBuilder(); } else if (scratch.equals(previous.get())) { continue; // for duplicate suggestions, the best weight is actually Util.toIntsRef(scratch, scratchInts); builder.add(scratchInts.get(), cost); previous.copyBytes(scratch); count++; fst = builder.finish();
/**
 * Flushes the surface forms collected for the current analyzed term into
 * {@code builder}, then resets the per-term state.
 *
 * <p>The first {@code count} entries of {@code surfaceFormsAndPayload} are sorted,
 * and each one is added under the analyzed bytes extended by a zero byte plus one
 * trailing byte holding that entry's position in the sorted order.
 *
 * @param defaultWeight weight used for entries whose own weight is {@code -1};
 *     it is capped at {@code Integer.MAX_VALUE} and passed through
 *     {@code encodeWeight}
 * @throws IOException if adding to the builder fails
 */
public void finishTerm(long defaultWeight) throws IOException {
  ArrayUtil.timSort(surfaceFormsAndPayload, 0, count);

  // Reserve two extra bytes: a zero byte, then the slot rewritten for every entry.
  analyzed.append((byte) 0);
  analyzed.setLength(analyzed.length() + 1);
  analyzed.grow(analyzed.length());
  final int dedupSlot = analyzed.length() - 1;

  for (int i = 0; i < count; i++) {
    // The loop index doubles as the deduplication byte (0, 1, 2, ...).
    analyzed.setByteAt(dedupSlot, (byte) i);
    Util.toIntsRef(analyzed.get(), scratchInts);

    final SurfaceFormAndPayload candidate = surfaceFormsAndPayload[i];
    final long cost;
    if (candidate.weight == -1) {
      cost = encodeWeight(Math.min(Integer.MAX_VALUE, defaultWeight));
    } else {
      cost = candidate.weight;
    }
    builder.add(scratchInts.get(), outputs.newPair(cost, candidate.payload));
  }

  seenSurfaceForms.clear();
  count = 0;
}
/**
 * Looks up the key in {@code fst} whose output equals {@code id} and returns it
 * decoded as a UTF-8 string.
 *
 * @param id the output value to look up
 * @return the decoded key, or {@code null} when no key has that output or when the
 *     lookup fails with an {@link IOException}
 */
public String getUrl(int id) {
  final IntsRef key;
  try {
    key = Util.getByOutput(fst, id);
  } catch (IOException e) {
    // Pass the exception to the logger so the cause is recorded together with
    // the message instead of being dumped separately to stderr.
    LOG.error("Error id " + id, e);
    return null;
  }
  if (key == null) {
    return null;
  }
  // The scratch buffer is only needed once a key has actually been found.
  return Util.toBytesRef(key, new BytesRef()).utf8ToString();
}
new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE4, outputs); BytesRefBuilder scratch = new BytesRefBuilder(); ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput(); Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator()); final IntsRefBuilder scratchIntsRef = new IntsRefBuilder(); scratch.grow(estimatedSize); scratchOutput.reset(scratch.bytes()); scratch.setLength(scratchOutput.getPosition()); builder.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef()); FST<BytesRef> fst = builder.finish(); return new SynonymMap(fst, words, maxHorizontalContext);
/** * Builds the final automaton from a list of entries. */ private FST<Object> buildAutomaton(BytesRefSorter sorter) throws IOException { // Build the automaton. final Outputs<Object> outputs = NoOutputs.getSingleton(); final Object empty = outputs.getNoOutput(); final Builder<Object> builder = new Builder<Object>( FST.INPUT_TYPE.BYTE1, 0, 0, true, true, shareMaxTailLength, outputs, null, false); BytesRef scratch = new BytesRef(); BytesRef entry; final IntsRef scratchIntsRef = new IntsRef(); int count = 0; BytesRefIterator iter = sorter.iterator(); while((entry = iter.next()) != null) { count++; if (scratch.compareTo(entry) != 0) { builder.add(Util.toIntsRef(entry, scratchIntsRef), empty); scratch.copyBytes(entry); } } return count == 0 ? null : builder.finish(); } }
OrdsSegmentTermsEnumFrame f = getFrame(ord); assert f != null; final BytesRef prefix = new BytesRef(term.bytes(), 0, f.prefix); if (f.nextEnt == -1) { out.println(" frame " + (isSeekFrame ? "(seek)" : "(next)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + brToString(prefix) + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp<<OrdsBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? OrdsBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS:0) + (f.isFloor ? OrdsBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR:0)) + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd() + " termOrd=" + f.termOrd); if (f.prefix > 0 && isSeekFrame && f.arc.label != (term.byteAt(f.prefix-1)&0xFF)) { out.println(" broken seek state: arc.label=" + (char) f.arc.label + " vs term byte=" + (char) (term.byteAt(f.prefix-1)&0xFF)); throw new RuntimeException("seek state is broken"); Output output = Util.get(fr.index, prefix); if (output == null) { out.println(" broken seek state: prefix is not final in index");
@Override public void build(TermFreqIterator iterator) throws IOException { BytesRef scratch = new BytesRef(); TermFreqIterator iter = new WFSTTermFreqIteratorWrapper(iterator, BytesRef.getUTF8SortedAsUnicodeComparator()); IntsRef scratchInts = new IntsRef(); BytesRef previous = null; PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs); while ((scratch = iter.next()) != null) { long cost = iter.weight(); if (previous == null) { previous = new BytesRef(); } else if (scratch.equals(previous)) { continue; // for duplicate suggestions, the best weight is actually // added } Util.toIntsRef(scratch, scratchInts); builder.add(scratchInts, cost); previous.copyBytes(scratch); } fst = builder.finish(); }
new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs); BytesRef scratch = new BytesRef(64); ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput(); scratch.grow(estimatedSize); scratchOutput.reset(scratch.bytes, scratch.offset, scratch.bytes.length); assert scratch.offset == 0; builder.add(Util.toUTF32(input, scratchIntsRef), BytesRef.deepCopyOf(scratch)); FST<BytesRef> fst = builder.finish(); return new SynonymMap(fst, words, maxHorizontalContext);
/**
 * Writes the metadata header for {@code field}, builds an FST mapping each value
 * to its ordinal (its 0-based position in {@code values}), saves that FST to
 * {@code data}, and finally records the number of values in {@code meta}.
 *
 * @param field the field whose number is written to {@code meta}
 * @param values the values to add to the FST, in iteration order
 * @throws IOException if writing to {@code meta} or {@code data} fails
 */
private void writeFST(FieldInfo field, Iterable<BytesRef> values) throws IOException {
  // Header: field number, format marker, and where the FST starts in the data file.
  meta.writeVInt(field.number);
  meta.writeByte(FST);
  meta.writeLong(data.getFilePointer());

  final PositiveIntOutputs ordOutputs = PositiveIntOutputs.getSingleton();
  final Builder<Long> fstBuilder = new Builder<>(INPUT_TYPE.BYTE1, ordOutputs);
  final IntsRefBuilder intsScratch = new IntsRefBuilder();

  long valueCount = 0;
  for (BytesRef value : values) {
    // The running count before the increment is this value's ordinal.
    fstBuilder.add(Util.toIntsRef(value, intsScratch), valueCount++);
  }

  final FST<Long> built = fstBuilder.finish();
  if (built != null) {
    built.save(data);
  }
  meta.writeVLong(valueCount);
}
final PairOutputs<Long,PairOutputs.Pair<Long,Long>> outputs = new PairOutputs<>(posIntOutputs, outputsInner); b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs); IndexInput in = SimpleTextFieldsReader.this.in.clone(); in.seek(termsStart); final BytesRefBuilder lastTerm = new BytesRefBuilder(); long lastDocsStart = -1; int docFreq = 0; long totalTermFreq = 0; FixedBitSet visitedDocs = new FixedBitSet(maxDoc); final IntsRefBuilder scratchIntsRef = new IntsRefBuilder(); while(true) { SimpleTextUtil.readLine(in, scratch); if (scratch.get().equals(END) || StringHelper.startsWith(scratch.get(), FIELD)) { if (lastDocsStart != -1) { b.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef), outputs.newPair(lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq))); } else if (StringHelper.startsWith(scratch.get(), TERM)) { if (lastDocsStart != -1) { b.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef), outputs.newPair(lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq)));
/**
 * Reads {@code num} conversion lines from {@code reader} and compiles them into an
 * FST that maps each input string to its replacement.
 *
 * <p>Every line must split on whitespace into exactly three fields; the second
 * field is the input and the third is the replacement.
 *
 * @param reader source of the conversion lines
 * @param num number of lines to read
 * @return the FST built from the parsed mappings
 * @throws IOException if reading from {@code reader} fails
 * @throws ParseException if the input ends before {@code num} lines were read, or
 *     a line does not have exactly three fields
 * @throws IllegalStateException if the same input is mapped more than once
 */
private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException {
  // TreeMap iterates its keys in sorted order, which is the order the entries
  // are handed to the builder below.
  Map<String,String> mappings = new TreeMap<>();

  for (int i = 0; i < num; i++) {
    String line = reader.readLine();
    if (line == null) {
      // readLine() returns null at end of stream; report a truncated input as a
      // parse error instead of failing with a NullPointerException on split().
      throw new ParseException(
          "unexpected end of input: expected " + num + " conversion lines but found only " + i,
          reader.getLineNumber());
    }
    String[] parts = line.split("\\s+");
    if (parts.length != 3) {
      throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
    }
    if (mappings.put(parts[1], parts[2]) != null) {
      throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
    }
  }

  Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
  Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
  IntsRefBuilder scratchInts = new IntsRefBuilder();
  for (Map.Entry<String,String> entry : mappings.entrySet()) {
    Util.toUTF16(entry.getKey(), scratchInts);
    builder.add(scratchInts.get(), new CharsRef(entry.getValue()));
  }

  return builder.finish();
}
/** * Builds the final automaton from a list of entries. */ private FST<Object> buildAutomaton(BytesRefSorter sorter) throws IOException { // Build the automaton. final Outputs<Object> outputs = NoOutputs.getSingleton(); final Object empty = outputs.getNoOutput(); final Builder<Object> builder = new Builder<>( FST.INPUT_TYPE.BYTE1, 0, 0, true, true, shareMaxTailLength, outputs, false, PackedInts.DEFAULT, true, 15); BytesRefBuilder scratch = new BytesRefBuilder(); BytesRef entry; final IntsRefBuilder scratchIntsRef = new IntsRefBuilder(); int count = 0; BytesRefIterator iter = sorter.iterator(); while((entry = iter.next()) != null) { count++; if (scratch.get().compareTo(entry) != 0) { builder.add(Util.toIntsRef(entry, scratchIntsRef), empty); scratch.copyBytes(entry); } } return count == 0 ? null : builder.finish(); } }