org.apache.lucene.util.fst.Util Java code examples

Refine search

final Builder<BytesRef> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
                           0, 0, true, false, Integer.MAX_VALUE,
                           outputs, true, 15);
assert bytes.length > 0;
scratchBytes.writeTo(bytes, 0);
indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length));
scratchBytes.reset();
index = indexBuilder.finish();

SegmentTermsEnumFrame f = getFrame(ord);
assert f != null;
final BytesRef prefix = new BytesRef(term.get().bytes, 0, f.prefix);
if (f.nextEnt == -1) {
 out.println("    frame " + (isSeekFrame ? "(seek)" : "(next)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + prefix + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp<< BlockTreeTermsReader.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? BlockTreeTermsReader.OUTPUT_FLAG_HAS_TERMS:0) + (f.isFloor ? BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR:0)) + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd());
 if (f.prefix > 0 && isSeekFrame && f.arc.label != (term.byteAt(f.prefix-1)&0xFF)) {
  out.println("      broken seek state: arc.label=" + (char) f.arc.label + " vs term byte=" + (char) (term.byteAt(f.prefix-1)&0xFF));
  throw new RuntimeException("seek state is broken");
 BytesRef output = Util.get(fr.index, prefix);
 if (output == null) {
  out.println("      broken seek state: prefix is not final in index");

/**
 * Reverse lookup: finds the input whose output equals {@code targetOutput},
 * instead of looking up an output by its input. Returns {@code null} if no
 * such output exists.
 *
 * <p>NOTE: this only works with {@code FST<Long>}, and only when the outputs
 * are strictly ascending in the same order as the inputs. For example, simple
 * ordinals (0, 1, 2, ...) or file offsets (when appending to a file) fit this.
 *
 * <p>This convenience overload obtains a bytes reader and the first arc from
 * {@code fst}, allocates a scratch arc and a result builder, and delegates to
 * the overload that takes all of them explicitly.
 *
 * @param fst the FST to search
 * @param targetOutput the output value to locate
 * @return the input mapped to {@code targetOutput}, or {@code null} if absent
 * @throws IOException propagated from the FST accesses and the delegated lookup
 */
public static IntsRef getByOutput(FST<Long> fst, long targetOutput) throws IOException {
 final BytesReader reader = fst.getBytesReader();
 // TODO: would be nice not to alloc this on every lookup
 final FST.Arc<Long> firstArc = fst.getFirstArc(new FST.Arc<Long>());
 return getByOutput(fst, targetOutput, reader, firstArc, new FST.Arc<Long>(), new IntsRefBuilder());
}

emitDotState(out, "initial", "point", "white", "");
 emitDotState(out, Long.toString(startArc.target), isFinal ? finalStateShape : stateShape, stateColor, finalOutput == null ? "" : fst.outputs.outputToString(finalOutput));
     emitDotState(out, Long.toString(arc.target), stateShape, stateColor, finalOutput);
    out.write("  " + node + " -> " + arc.target + " [label=\"" + printableLabel(arc.label) + outs + "\"" + (arc.isFinal() ? " style=\"bold\"" : "" ) + " color=\"" + arcColor + "\"]\n");

BytesRefBuilder scratch = new BytesRefBuilder();
      new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
  for (IntsRef string; (string = finiteStrings.next()) != null; count++) {
   Util.toBytesRef(string, scratch);
   if (scratch.length() > Short.MAX_VALUE-2) {
    throw new IllegalArgumentException(
      "cannot handle analyzed forms > " + (Short.MAX_VALUE-2) + " in length (got " + scratch.length() + ")");
   short analyzedLength = (short) scratch.length();
 Builder<Pair<Long,BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
  analyzed.append((byte) dedup);
  Util.toIntsRef(analyzed.get(), scratchInts);
   builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
  } else {
   int payloadOffset = input.getPosition() + surface.length;
   System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length+1, payloadLength);
   br.length = br.bytes.length;
   builder.add(scratchInts.get(), outputs.newPair(cost, br));

BytesRefBuilder flagsScratch = new BytesRefBuilder();
IntsRefBuilder scratchInts = new IntsRefBuilder();
 BytesRef scratch1 = new BytesRef();
 BytesRef scratch2 = new BytesRef();
 IntsRefBuilder currentOrds = new IntsRefBuilder();
  } else {
   encodeFlags(flagsScratch, wordForm);
   int ord = flagLookup.add(flagsScratch.get());
   if (ord < 0) {
    Util.toUTF32(currentEntry, scratchInts);
    words.add(scratchInts.get(), currentOrds.get());
 Util.toUTF32(currentEntry, scratchInts);
 words.add(scratchInts.get(), currentOrds.get());
 success2 = true;
} finally {

BytesRefBuilder b = new BytesRefBuilder();
b.append(tokenBytes);
lastTokens[gramCount-1] = b;
for(int i=token.length()-1;i>=0;i--) {
 if (token.byteAt(i) == separator) {
  BytesRef context = new BytesRef(token.bytes(), 0, i);
  Long output = Util.get(fst, Util.toIntsRef(context, new IntsRefBuilder()));
  assert output != null;
  contextCount = decodeWeight(output);
  lastTokenFragment = new BytesRef(token.bytes(), i + 1, token.length() - i - 1);
  break;
 searcher.addStartPaths(arc, prefixOutput, true, new IntsRefBuilder());
  token.setLength(prefixLength);
  Util.toBytesRef(completion.input, suffix);
  token.append(suffix);

BytesRef scratch = new BytesRef();
InputIterator iter = new WFSTInputIterator(iterator);
IntsRefBuilder scratchInts = new IntsRefBuilder();
BytesRefBuilder previous = null;
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
while ((scratch = iter.next()) != null) {
 long cost = iter.weight();
  previous = new BytesRefBuilder();
 } else if (scratch.equals(previous.get())) {
  continue; // for duplicate suggestions, the best weight is actually
 Util.toIntsRef(scratch, scratchInts);
 builder.add(scratchInts.get(), cost);
 previous.copyBytes(scratch);
 count++;
fst = builder.finish();

/**
 * Adds every collected surface form for the current analyzed term to the
 * builder, then resets the per-term state.
 *
 * <p>The first {@code count} collected entries are sorted, and each one is
 * added under the analyzed bytes followed by a 0 byte and a trailing byte
 * holding the entry's position in the sorted order.
 *
 * @param defaultWeight weight used for entries whose own weight is {@code -1};
 *        it is capped at {@code Integer.MAX_VALUE} and passed through
 *        {@code encodeWeight}
 * @throws IOException propagated from adding to the builder
 */
public void finishTerm(long defaultWeight) throws IOException {
  ArrayUtil.timSort(surfaceFormsAndPayload, 0, count);

  // Append a 0 byte, then make room for one more trailing byte that is
  // overwritten for each entry in the loop below.
  analyzed.append((byte) 0);
  analyzed.setLength(analyzed.length() + 1);
  analyzed.grow(analyzed.length());

  for (int position = 0; position < count; position++) {
    // The trailing byte is the entry's index, so equal analyzed forms get distinct keys.
    analyzed.setByteAt(analyzed.length() - 1, (byte) position);
    Util.toIntsRef(analyzed.get(), scratchInts);

    final SurfaceFormAndPayload candidate = surfaceFormsAndPayload[position];
    final long cost;
    if (candidate.weight == -1) {
      cost = encodeWeight(Math.min(Integer.MAX_VALUE, defaultWeight));
    } else {
      cost = candidate.weight;
    }
    builder.add(scratchInts.get(), outputs.newPair(cost, candidate.payload));
  }

  // Reset per-term state for the next term.
  seenSurfaceForms.clear();
  count = 0;
}

/**
 * Returns the URL whose FST output equals {@code id}.
 *
 * @param id the output value to look up via {@code Util.getByOutput}
 * @return the matching key decoded as a UTF-8 string, or {@code null} if no
 *         key has that output or the lookup failed with an {@link IOException}
 */
public String getUrl(int id) {
 final IntsRef key;
 try {
  key = Util.getByOutput(fst, id);
 } catch (IOException e) {
  // Hand the exception to the logger so the cause and stack trace end up in
  // the log output instead of being printed separately to stderr.
  LOG.error("Error id " + id, e);
  return null;
 }
 if (key == null) {
  return null;
 }
 return Util.toBytesRef(key, new BytesRef()).utf8ToString();
}

 new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
BytesRefBuilder scratch = new BytesRefBuilder();
ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator());
final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
 scratch.grow(estimatedSize);
 scratchOutput.reset(scratch.bytes());
 scratch.setLength(scratchOutput.getPosition());
 builder.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef());
FST<BytesRef> fst = builder.finish();
return new SynonymMap(fst, words, maxHorizontalContext);

 /**
  * Builds the final automaton from a list of entries.
  *
  * <p>Entries are read from the sorter's iterator in order; an entry that
  * compares equal to the most recently added one is skipped, so consecutive
  * duplicates are added only once. Every entry is added with the "no output"
  * value.
  *
  * @param sorter source of the entries
  * @return the finished automaton, or {@code null} if the sorter yielded no entries
  * @throws IOException propagated from iterating the sorter or building the FST
  */
 private FST<Object> buildAutomaton(BytesRefSorter sorter) throws IOException {
  final Outputs<Object> noOutputs = NoOutputs.getSingleton();
  final Object noOutput = noOutputs.getNoOutput();
  final Builder<Object> fstBuilder = new Builder<Object>(
    FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
    shareMaxTailLength, noOutputs, null, false);

  // NOTE(review): lastAdded starts as a fresh BytesRef, so a leading entry that
  // compares equal to it would be skipped — confirm this is intended.
  final BytesRef lastAdded = new BytesRef();
  final IntsRef intsScratch = new IntsRef();
  final BytesRefIterator entries = sorter.iterator();
  int seen = 0;
  for (BytesRef entry = entries.next(); entry != null; entry = entries.next()) {
   seen++;
   if (lastAdded.compareTo(entry) == 0) {
    continue; // same as the entry added last
   }
   fstBuilder.add(Util.toIntsRef(entry, intsScratch), noOutput);
   lastAdded.copyBytes(entry);
  }

  if (seen == 0) {
   return null;
  }
  return fstBuilder.finish();
 }
}

OrdsSegmentTermsEnumFrame f = getFrame(ord);
assert f != null;
final BytesRef prefix = new BytesRef(term.bytes(), 0, f.prefix);
if (f.nextEnt == -1) {
 out.println("    frame " + (isSeekFrame ? "(seek)" : "(next)") + " ord=" + ord + " fp=" + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" + f.prefix + " prefix=" + brToString(prefix) + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + " hasTerms=" + f.hasTerms + " isFloor=" + f.isFloor + " code=" + ((f.fp<<OrdsBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS) + (f.hasTerms ? OrdsBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS:0) + (f.isFloor ? OrdsBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR:0)) + " isLastInFloor=" + f.isLastInFloor + " mdUpto=" + f.metaDataUpto + " tbOrd=" + f.getTermBlockOrd() + " termOrd=" + f.termOrd);
 if (f.prefix > 0 && isSeekFrame && f.arc.label != (term.byteAt(f.prefix-1)&0xFF)) {
  out.println("      broken seek state: arc.label=" + (char) f.arc.label + " vs term byte=" + (char) (term.byteAt(f.prefix-1)&0xFF));
  throw new RuntimeException("seek state is broken");
 Output output = Util.get(fr.index, prefix);
 if (output == null) {
  out.println("      broken seek state: prefix is not final in index");

/**
 * Builds the weighted FST from the given terms and stores it in {@code fst}.
 *
 * <p>The input is wrapped in a {@code WFSTTermFreqIteratorWrapper} using the
 * UTF-8-sorted-as-Unicode comparator. Each term is added with the weight the
 * wrapper reports for it; a term equal to the previously added one is skipped.
 *
 * @param iterator source of terms and weights
 * @throws IOException propagated from the iterator or the FST builder
 */
@Override
public void build(TermFreqIterator iterator) throws IOException {
 final TermFreqIterator sorted = new WFSTTermFreqIteratorWrapper(iterator,
   BytesRef.getUTF8SortedAsUnicodeComparator());
 final IntsRef intsScratch = new IntsRef();
 final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
 final Builder<Long> fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);

 BytesRef lastAdded = null;
 for (BytesRef term = sorted.next(); term != null; term = sorted.next()) {
  final long cost = sorted.weight();

  final boolean firstTerm = lastAdded == null;
  if (firstTerm) {
   lastAdded = new BytesRef();
  }
  if (!firstTerm && term.equals(lastAdded)) {
   // for duplicate suggestions, the best weight is actually added
   continue;
  }

  Util.toIntsRef(term, intsScratch);
  fstBuilder.add(intsScratch, cost);
  lastAdded.copyBytes(term);
 }
 fst = fstBuilder.finish();
}

 new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);
BytesRef scratch = new BytesRef(64);
ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
 scratch.grow(estimatedSize);
 scratchOutput.reset(scratch.bytes, scratch.offset, scratch.bytes.length);
 assert scratch.offset == 0;
 builder.add(Util.toUTF32(input, scratchIntsRef), BytesRef.deepCopyOf(scratch));
FST<BytesRef> fst = builder.finish();
return new SynonymMap(fst, words, maxHorizontalContext);

/**
 * Writes the given values as an FST that maps each value to its ordinal.
 *
 * <p>To {@code meta} it writes the field number, the {@code FST} type byte,
 * the current file pointer of {@code data}, and finally the number of values.
 * The FST itself is saved to {@code data} unless {@code finish()} returned
 * {@code null}.
 *
 * @param field the field whose number is recorded in the metadata
 * @param values the values to add, each mapped to its position in iteration order
 * @throws IOException propagated from the metadata/data outputs or the FST builder
 */
private void writeFST(FieldInfo field, Iterable<BytesRef> values) throws IOException {
 meta.writeVInt(field.number);
 meta.writeByte(FST);
 meta.writeLong(data.getFilePointer());

 final Builder<Long> fstBuilder = new Builder<>(INPUT_TYPE.BYTE1, PositiveIntOutputs.getSingleton());
 final IntsRefBuilder intsScratch = new IntsRefBuilder();
 long valueCount = 0;
 for (BytesRef value : values) {
  // The ordinal is the number of values added so far.
  fstBuilder.add(Util.toIntsRef(value, intsScratch), valueCount);
  valueCount += 1;
 }

 final FST<Long> built = fstBuilder.finish();
 if (built != null) {
  built.save(data);
 }
 meta.writeVLong(valueCount);
}

final PairOutputs<Long,PairOutputs.Pair<Long,Long>> outputs = new PairOutputs<>(posIntOutputs,
  outputsInner);
b = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
IndexInput in = SimpleTextFieldsReader.this.in.clone();
in.seek(termsStart);
final BytesRefBuilder lastTerm = new BytesRefBuilder();
long lastDocsStart = -1;
int docFreq = 0;
long totalTermFreq = 0;
FixedBitSet visitedDocs = new FixedBitSet(maxDoc);
final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
while(true) {
 SimpleTextUtil.readLine(in, scratch);
 if (scratch.get().equals(END) || StringHelper.startsWith(scratch.get(), FIELD)) {
  if (lastDocsStart != -1) {
   b.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef),
     outputs.newPair(lastDocsStart,
       outputsInner.newPair((long) docFreq, totalTermFreq)));
 } else if (StringHelper.startsWith(scratch.get(), TERM)) {
  if (lastDocsStart != -1) {
   b.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef), outputs.newPair(lastDocsStart,
     outputsInner.newPair((long) docFreq, totalTermFreq)));

/**
 * Returns the id stored in the FST for the given URL.
 *
 * @param url the URL to look up
 * @return the stored id narrowed to {@code int}, or {@code -1} if the URL is
 *         not in the FST or the lookup failed with an {@link IOException}
 */
public int getID(String url) {
 final Long id;
 try {
  id = Util.get(fst, new BytesRef(url));
 } catch (IOException e) {
  // Log error (including the cause, via the logger rather than stderr),
  // but assume that URL doesn't exist.
  LOG.error("Error fetching " + url, e);
  return -1;
 }
 return id == null ? -1 : id.intValue();
}

/**
 * Reads {@code num} conversion lines from {@code reader} and builds an FST
 * mapping each line's second whitespace-separated field to its third.
 *
 * <p>Mappings are collected in a {@link TreeMap}, so they are added to the
 * FST builder in sorted key order.
 *
 * @param reader source of the conversion lines
 * @param num number of lines to read
 * @return the FST built from the collected mappings
 * @throws IOException if reading fails or the FST builder fails
 * @throws ParseException if the input ends before {@code num} lines were read,
 *         or a line does not consist of exactly three fields
 * @throws IllegalStateException if the same key is mapped more than once
 */
private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException {
 Map<String,String> mappings = new TreeMap<>();

 for (int i = 0; i < num; i++) {
  String line = reader.readLine();
  if (line == null) {
   // readLine() returns null at end of stream; report it as a parse error
   // instead of failing with a NullPointerException on split().
   throw new ParseException("unexpected end of input: expected " + num
     + " conversion lines but found " + i, reader.getLineNumber());
  }
  String[] parts = line.split("\\s+");
  if (parts.length != 3) {
   throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
  }
  if (mappings.put(parts[1], parts[2]) != null) {
   throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
  }
 }

 Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
 Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
 IntsRefBuilder scratchInts = new IntsRefBuilder();
 for (Map.Entry<String,String> entry : mappings.entrySet()) {
  Util.toUTF16(entry.getKey(), scratchInts);
  builder.add(scratchInts.get(), new CharsRef(entry.getValue()));
 }

 return builder.finish();
}

 /**
  * Builds the final automaton from a list of entries.
  *
  * <p>Entries are read from the sorter's iterator in order; an entry that
  * compares equal to the most recently added one is skipped, so consecutive
  * duplicates are added only once. Every entry is added with the "no output"
  * value.
  *
  * @param sorter source of the entries
  * @return the finished automaton, or {@code null} if the sorter yielded no entries
  * @throws IOException propagated from iterating the sorter or building the FST
  */
 private FST<Object> buildAutomaton(BytesRefSorter sorter) throws IOException {
  final Outputs<Object> noOutputs = NoOutputs.getSingleton();
  final Object noOutput = noOutputs.getNoOutput();
  final Builder<Object> fstBuilder = new Builder<>(
    FST.INPUT_TYPE.BYTE1, 0, 0, true, true,
    shareMaxTailLength, noOutputs, false,
    PackedInts.DEFAULT, true, 15);

  // NOTE(review): lastAdded starts empty, so a leading entry that compares
  // equal to the empty builder contents would be skipped — confirm intended.
  final BytesRefBuilder lastAdded = new BytesRefBuilder();
  final IntsRefBuilder intsScratch = new IntsRefBuilder();
  final BytesRefIterator entries = sorter.iterator();
  int seen = 0;
  for (BytesRef entry = entries.next(); entry != null; entry = entries.next()) {
   seen++;
   if (lastAdded.get().compareTo(entry) == 0) {
    continue; // same as the entry added last
   }
   fstBuilder.add(Util.toIntsRef(entry, intsScratch), noOutput);
   lastAdded.copyBytes(entry);
  }

  if (seen == 0) {
   return null;
  }
  return fstBuilder.finish();
 }
}

Javadoc

Static helper methods.

Most used methods

toIntsRef
Just takes unsigned byte values from the BytesRef and converts into an IntsRef.
get
Looks up the output for this input, or null if the input is not accepted.
toBytesRef
Just converts IntsRef to BytesRef; you must ensure the int values fit into a byte.
getByOutput
Expert: like Util#getByOutput(FST,long) except reusing BytesReader, initial and scratch Arc, and res
emitDotState
Emit a single state in the dot language.
printableLabel
Ensures an arc's label is indeed printable (dot uses US-ASCII).
toUTF16
Just maps each UTF16 unit (char) to the ints in an IntsRef.
toUTF32
Decodes the Unicode codepoints from the provided char[] and places them in the provided scratch Ints
readCeilArc
Reads the first arc greater than or equal to the given label into the provided arc in place and returns
shortestPaths

Popular in Java

Reading from database using SQL prepared statement
getSupportFragmentManager (FragmentActivity)
scheduleAtFixedRate (ScheduledExecutorService)
putExtra (Intent)
Format (java.text)
The base class for all formats. This is an abstract base class which specifies the protocol for clas
Scanner (java.util)
A parser that parses a text string of primitive types and strings with the help of regular expressio
Executor (java.util.concurrent)
An object that executes submitted Runnable tasks. This interface provides a way of decoupling task s
Semaphore (java.util.concurrent)
A counting semaphore. Conceptually, a semaphore maintains a set of permits. Each #acquire blocks if
JCheckBox (javax.swing)
Scheduler (org.quartz)
This is the main interface of a Quartz Scheduler. A Scheduler maintains a registry of org.quartz.Job
Top 12 Jupyter Notebook extensions

How to use Util in org.apache.lucene.util.fst

Best Java code snippets using org.apache.lucene.util.fst.Util (Showing top 20 results out of 315)

Refine search

How to use
Util
in
org.apache.lucene.util.fst