org.apache.lucene.util.automaton.UTF32ToUTF8 java code examples

/**
 * Expert: builds a byte-based run automaton from {@code a}.
 *
 * <p>If {@code isBinary} is true, the input is already byte-based and is handed to the
 * superclass constructor unchanged. Otherwise it is first passed through
 * {@code new UTF32ToUTF8().convert(a)}.
 *
 * @param a the automaton to wrap
 * @param isBinary true if {@code a} is already byte-based and must not be converted
 * @param maxDeterminizedStates forwarded unchanged to the superclass constructor
 */
public ByteRunAutomaton(Automaton a, boolean isBinary, int maxDeterminizedStates) {
 // NOTE(review): 256 is the second superclass argument -- presumably the alphabet size for byte labels; confirm against the superclass.
 super(isBinary ? a : new UTF32ToUTF8().convert(a), 256, maxDeterminizedStates);
}

 build(n, end, startUTF8, endUTF8, 1+upto);
 start(start, end, startUTF8, upto, false);
 if (endUTF8.byteAt(upto) - startUTF8.byteAt(upto) > 1) {
  all(start, end, startUTF8.byteAt(upto)+1, endUTF8.byteAt(upto)-1, startUTF8.len-upto-1);
 end(start, end, endUTF8, upto, false);
start(start, end, startUTF8, upto, true);
 all(start, end,
   tmpUTF8a.byteAt(0),
   tmpUTF8b.byteAt(0),
end(start, end, endUTF8, upto, true);

convertOneEdge(utf8State, destUTF8, scratch.min, scratch.max);

/**
 * Adds the transitions for the lower ("start") bound of a range, beginning at byte
 * position {@code upto} of {@code startUTF8}.
 *
 * <p>At the last byte, a single transition {@code start -> end} is added whose labels run
 * from that byte up to the byte with its {@code MASKS[numBits - 1]} bits set. At any earlier
 * byte, a fresh state is created, reached on exactly the current byte, and the method
 * recurses on the next position; afterwards, when {@code doAll} is set and the current byte
 * is not already the top of its range, {@code all(...)} is invoked for the labels above it.
 *
 * @param start state the new transitions leave from
 * @param end state the completed sequences arrive at
 * @param startUTF8 byte sequence of the lower bound
 * @param upto index of the byte being processed
 * @param doAll whether to also cover the labels above the current byte at this position
 */
private void start(int start, int end, UTF8Sequence startUTF8, int upto, boolean doAll) {
 final int current = startUTF8.byteAt(upto);
 final int highest = current | MASKS[startUTF8.numBits(upto) - 1];
 if (upto == startUTF8.len - 1) {
  // Last byte: recursion bottoms out with one ranged transition (type=start).
  utf8.addTransition(start, end, current, highest);
  return;
 }
 final int next = utf8.createState();
 utf8.addTransition(start, next, current);
 // Deeper positions always request the full range above their byte.
 start(next, end, startUTF8, upto + 1, true);
 if (doAll && current != highest) {
  all(start, end, current + 1, highest, startUTF8.len - upto - 1);
 }
}

/**
 * Adds the transitions for the upper ("end") bound of a range, beginning at byte
 * position {@code upto} of {@code endUTF8}.
 *
 * <p>At the last byte, a single transition {@code start -> end} is added whose labels run
 * from the byte with its {@code MASKS[numBits - 1]} bits cleared up to the byte itself. At
 * any earlier byte, {@code all(...)} is first invoked (when {@code doAll} is set and the
 * byte is not already the bottom of its range) for the labels below it; then a fresh state
 * is created, reached on exactly the current byte, and the method recurses on the next
 * position.
 *
 * @param start state the new transitions leave from
 * @param end state the completed sequences arrive at
 * @param endUTF8 byte sequence of the upper bound
 * @param upto index of the byte being processed
 * @param doAll whether to also cover the labels below the current byte at this position
 */
private void end(int start, int end, UTF8Sequence endUTF8, int upto, boolean doAll) {
 final int current = endUTF8.byteAt(upto);
 final int bits = endUTF8.numBits(upto);
 if (upto == endUTF8.len - 1) {
  // Last byte: recursion bottoms out with one ranged transition (type=end).
  utf8.addTransition(start, end, current & ~MASKS[bits - 1], current);
  return;
 }
 // Special case for a byte carrying 5 bits: begin at 194 so that no unused edges are
 // created (certain byte sequences are never accepted). Other cases could be optimized too.
 // NOTE(review): 194 is 0xC2, presumably the smallest non-overlong 2-byte lead -- confirm.
 final int lowest = (bits == 5) ? 194 : (current & ~MASKS[bits - 1]);
 if (doAll && current != lowest) {
  all(start, end, lowest, current - 1, endUTF8.len - upto - 1);
 }
 final int next = utf8.createState();
 utf8.addTransition(start, next, current);
 // Deeper positions always request the full range below their byte.
 end(next, end, endUTF8, upto + 1, true);
}

/**
 * Converts one edge labelled with the code point range
 * {@code [startCodePoint, endCodePoint]}.
 *
 * <p>Loads both bounds into the {@code startUTF8} and {@code endUTF8} sequences held outside
 * this method, then delegates to {@code build(...)} starting at byte position 0.
 *
 * @param start state passed to {@code build} as the origin
 * @param end state passed to {@code build} as the destination
 * @param startCodePoint first code point of the range
 * @param endCodePoint last code point of the range
 */
void convertOneEdge(int start, int end, int startCodePoint, int endCodePoint) {
 startUTF8.set(startCodePoint);
 endUTF8.set(endCodePoint);
 // 0 = begin at the first byte of both sequences.
 build(start, end, startUTF8, endUTF8, 0);
}

/**
 * Adds the transitions for the lower ("start") bound of a range, beginning at byte
 * position {@code upto} of {@code startUTF8}.
 *
 * <p>At the last byte, a single transition {@code start -> end} is added whose labels run
 * from that byte up to the byte with its {@code MASKS[numBits - 1]} bits set. At any earlier
 * byte, a fresh state is created, reached on exactly the current byte, and the method
 * recurses on the next position; afterwards, when {@code doAll} is set and the current byte
 * is not already the top of its range, {@code all(...)} is invoked for the labels above it.
 *
 * @param start state the new transitions leave from
 * @param end state the completed sequences arrive at
 * @param startUTF8 byte sequence of the lower bound
 * @param upto index of the byte being processed
 * @param doAll whether to also cover the labels above the current byte at this position
 */
private void start(int start, int end, UTF8Sequence startUTF8, int upto, boolean doAll) {
 final int current = startUTF8.byteAt(upto);
 final int highest = current | MASKS[startUTF8.numBits(upto) - 1];
 if (upto == startUTF8.len - 1) {
  // Last byte: recursion bottoms out with one ranged transition (type=start).
  utf8.addTransition(start, end, current, highest);
  return;
 }
 final int next = utf8.createState();
 utf8.addTransition(start, next, current);
 // Deeper positions always request the full range above their byte.
 start(next, end, startUTF8, upto + 1, true);
 if (doAll && current != highest) {
  all(start, end, current + 1, highest, startUTF8.len - upto - 1);
 }
}

/**
 * Adds the transitions for the upper ("end") bound of a range, beginning at byte
 * position {@code upto} of {@code endUTF8}.
 *
 * <p>At the last byte, a single transition {@code start -> end} is added whose labels run
 * from the byte with its {@code MASKS[numBits - 1]} bits cleared up to the byte itself. At
 * any earlier byte, {@code all(...)} is first invoked (when {@code doAll} is set and the
 * byte is not already the bottom of its range) for the labels below it; then a fresh state
 * is created, reached on exactly the current byte, and the method recurses on the next
 * position.
 *
 * @param start state the new transitions leave from
 * @param end state the completed sequences arrive at
 * @param endUTF8 byte sequence of the upper bound
 * @param upto index of the byte being processed
 * @param doAll whether to also cover the labels below the current byte at this position
 */
private void end(int start, int end, UTF8Sequence endUTF8, int upto, boolean doAll) {
 final int current = endUTF8.byteAt(upto);
 final int bits = endUTF8.numBits(upto);
 if (upto == endUTF8.len - 1) {
  // Last byte: recursion bottoms out with one ranged transition (type=end).
  utf8.addTransition(start, end, current & ~MASKS[bits - 1], current);
  return;
 }
 // Special case for a byte carrying 5 bits: begin at 194 so that no unused edges are
 // created (certain byte sequences are never accepted). Other cases could be optimized too.
 // NOTE(review): 194 is 0xC2, presumably the smallest non-overlong 2-byte lead -- confirm.
 final int lowest = (bits == 5) ? 194 : (current & ~MASKS[bits - 1]);
 if (doAll && current != lowest) {
  all(start, end, lowest, current - 1, endUTF8.len - upto - 1);
 }
 final int next = utf8.createState();
 utf8.addTransition(start, next, current);
 // Deeper positions always request the full range below their byte.
 end(next, end, endUTF8, upto + 1, true);
}

/**
 * Converts one edge labelled with the code point range
 * {@code [startCodePoint, endCodePoint]}.
 *
 * <p>Loads both bounds into the {@code startUTF8} and {@code endUTF8} sequences held outside
 * this method, then delegates to {@code build(...)} starting at byte position 0.
 *
 * @param start state passed to {@code build} as the origin
 * @param end state passed to {@code build} as the destination
 * @param startCodePoint first code point of the range
 * @param endCodePoint last code point of the range
 */
void convertOneEdge(int start, int end, int startCodePoint, int endCodePoint) {
 startUTF8.set(startCodePoint);
 endUTF8.set(endCodePoint);
 // 0 = begin at the first byte of both sequences.
 build(start, end, startUTF8, endUTF8, 0);
}

} else {
 binary = new UTF32ToUTF8().convert(automaton);

 build(n, end, startUTF8, endUTF8, 1+upto);
 start(start, end, startUTF8, upto, false);
 if (endUTF8.byteAt(upto) - startUTF8.byteAt(upto) > 1) {
  all(start, end, startUTF8.byteAt(upto)+1, endUTF8.byteAt(upto)-1, startUTF8.len-upto-1);
 end(start, end, endUTF8, upto, false);
start(start, end, startUTF8, upto, true);
 all(start, end,
   tmpUTF8a.byteAt(0),
   tmpUTF8b.byteAt(0),
end(start, end, endUTF8, upto, true);

/**
 * Adds the transitions for the lower ("start") bound of a range, beginning at byte
 * position {@code upto} of {@code startUTF8}.
 *
 * <p>At the last byte, a single transition {@code start -> end} is added whose labels run
 * from that byte up to the byte with its {@code MASKS[numBits - 1]} bits set. At any earlier
 * byte, a fresh state is created, reached on exactly the current byte, and the method
 * recurses on the next position; afterwards, when {@code doAll} is set and the current byte
 * is not already the top of its range, {@code all(...)} is invoked for the labels above it.
 *
 * @param start state the new transitions leave from
 * @param end state the completed sequences arrive at
 * @param startUTF8 byte sequence of the lower bound
 * @param upto index of the byte being processed
 * @param doAll whether to also cover the labels above the current byte at this position
 */
private void start(int start, int end, UTF8Sequence startUTF8, int upto, boolean doAll) {
 final int current = startUTF8.byteAt(upto);
 final int highest = current | MASKS[startUTF8.numBits(upto) - 1];
 if (upto == startUTF8.len - 1) {
  // Last byte: recursion bottoms out with one ranged transition (type=start).
  utf8.addTransition(start, end, current, highest);
  return;
 }
 final int next = utf8.createState();
 utf8.addTransition(start, next, current);
 // Deeper positions always request the full range above their byte.
 start(next, end, startUTF8, upto + 1, true);
 if (doAll && current != highest) {
  all(start, end, current + 1, highest, startUTF8.len - upto - 1);
 }
}

/**
 * Adds the transitions for the upper ("end") bound of a range, beginning at byte
 * position {@code upto} of {@code endUTF8}.
 *
 * <p>At the last byte, a single transition {@code start -> end} is added whose labels run
 * from the byte with its {@code MASKS[numBits - 1]} bits cleared up to the byte itself. At
 * any earlier byte, {@code all(...)} is first invoked (when {@code doAll} is set and the
 * byte is not already the bottom of its range) for the labels below it; then a fresh state
 * is created, reached on exactly the current byte, and the method recurses on the next
 * position.
 *
 * @param start state the new transitions leave from
 * @param end state the completed sequences arrive at
 * @param endUTF8 byte sequence of the upper bound
 * @param upto index of the byte being processed
 * @param doAll whether to also cover the labels below the current byte at this position
 */
private void end(int start, int end, UTF8Sequence endUTF8, int upto, boolean doAll) {
 final int current = endUTF8.byteAt(upto);
 final int bits = endUTF8.numBits(upto);
 if (upto == endUTF8.len - 1) {
  // Last byte: recursion bottoms out with one ranged transition (type=end).
  utf8.addTransition(start, end, current & ~MASKS[bits - 1], current);
  return;
 }
 // Special case for a byte carrying 5 bits: begin at 194 so that no unused edges are
 // created (certain byte sequences are never accepted). Other cases could be optimized too.
 // NOTE(review): 194 is 0xC2, presumably the smallest non-overlong 2-byte lead -- confirm.
 final int lowest = (bits == 5) ? 194 : (current & ~MASKS[bits - 1]);
 if (doAll && current != lowest) {
  all(start, end, lowest, current - 1, endUTF8.len - upto - 1);
 }
 final int next = utf8.createState();
 utf8.addTransition(start, next, current);
 // Deeper positions always request the full range below their byte.
 end(next, end, endUTF8, upto + 1, true);
}

/**
 * Converts one edge labelled with the code point range
 * {@code [startCodePoint, endCodePoint]}.
 *
 * <p>Loads both bounds into the {@code startUTF8} and {@code endUTF8} sequences held outside
 * this method, then delegates to {@code build(...)} starting at byte position 0.
 *
 * @param start state passed to {@code build} as the origin
 * @param end state passed to {@code build} as the destination
 * @param startCodePoint first code point of the range
 * @param endCodePoint last code point of the range
 */
void convertOneEdge(int start, int end, int startCodePoint, int endCodePoint) {
 startUTF8.set(startCodePoint);
 endUTF8.set(endCodePoint);
 // 0 = begin at the first byte of both sequences.
 build(start, end, startUTF8, endUTF8, 0);
}

convertOneEdge(utf8State, destUTF8, scratch.min, scratch.max);

/**
 * Converts the automaton via the superclass and, when {@code unicodeAware} is set,
 * additionally translates the result to UTF-8 and determinizes it.
 *
 * @param a the automaton to convert
 * @return the superclass result as-is when not unicode aware; otherwise its determinized
 *     UTF-8 equivalent
 */
@Override
protected Automaton convertAutomaton(Automaton a) {
 if (!unicodeAware) {
  return super.convertAutomaton(a);
 }
 // The input to the UTF-8 translation is the superclass's converted automaton,
 // not the raw argument.
 final UTF32ToUTF8 converter = new UTF32ToUTF8();
 final Automaton utf8Automaton = converter.convert(super.convertAutomaton(a));
 // Per the original author, this automaton should not blow up during determinize,
 // hence the effectively unlimited state budget.
 return Operations.determinize(utf8Automaton, Integer.MAX_VALUE);
}

 build(n, end, startUTF8, endUTF8, 1+upto);
 start(start, end, startUTF8, upto, false);
 if (endUTF8.byteAt(upto) - startUTF8.byteAt(upto) > 1) {
  all(start, end, startUTF8.byteAt(upto)+1, endUTF8.byteAt(upto)-1, startUTF8.len-upto-1);
 end(start, end, endUTF8, upto, false);
start(start, end, startUTF8, upto, true);
 all(start, end,
   tmpUTF8a.byteAt(0),
   tmpUTF8b.byteAt(0),
end(start, end, endUTF8, upto, true);

/**
 * Adds the transitions for the lower ("start") bound of a range, beginning at byte
 * position {@code upto} of {@code startUTF8}.
 *
 * <p>At the last byte, a single transition {@code start -> end} is added whose labels run
 * from that byte up to the byte with its {@code MASKS[numBits - 1]} bits set. At any earlier
 * byte, a fresh state is created, reached on exactly the current byte, and the method
 * recurses on the next position; afterwards, when {@code doAll} is set and the current byte
 * is not already the top of its range, {@code all(...)} is invoked for the labels above it.
 *
 * @param start state the new transitions leave from
 * @param end state the completed sequences arrive at
 * @param startUTF8 byte sequence of the lower bound
 * @param upto index of the byte being processed
 * @param doAll whether to also cover the labels above the current byte at this position
 */
private void start(int start, int end, UTF8Sequence startUTF8, int upto, boolean doAll) {
 final int current = startUTF8.byteAt(upto);
 final int highest = current | MASKS[startUTF8.numBits(upto) - 1];
 if (upto == startUTF8.len - 1) {
  // Last byte: recursion bottoms out with one ranged transition (type=start).
  utf8.addTransition(start, end, current, highest);
  return;
 }
 final int next = utf8.createState();
 utf8.addTransition(start, next, current);
 // Deeper positions always request the full range above their byte.
 start(next, end, startUTF8, upto + 1, true);
 if (doAll && current != highest) {
  all(start, end, current + 1, highest, startUTF8.len - upto - 1);
 }
}

/**
 * Adds the transitions for the upper ("end") bound of a range, beginning at byte
 * position {@code upto} of {@code endUTF8}.
 *
 * <p>At the last byte, a single transition {@code start -> end} is added whose labels run
 * from the byte with its {@code MASKS[numBits - 1]} bits cleared up to the byte itself. At
 * any earlier byte, {@code all(...)} is first invoked (when {@code doAll} is set and the
 * byte is not already the bottom of its range) for the labels below it; then a fresh state
 * is created, reached on exactly the current byte, and the method recurses on the next
 * position.
 *
 * @param start state the new transitions leave from
 * @param end state the completed sequences arrive at
 * @param endUTF8 byte sequence of the upper bound
 * @param upto index of the byte being processed
 * @param doAll whether to also cover the labels below the current byte at this position
 */
private void end(int start, int end, UTF8Sequence endUTF8, int upto, boolean doAll) {
 final int current = endUTF8.byteAt(upto);
 final int bits = endUTF8.numBits(upto);
 if (upto == endUTF8.len - 1) {
  // Last byte: recursion bottoms out with one ranged transition (type=end).
  utf8.addTransition(start, end, current & ~MASKS[bits - 1], current);
  return;
 }
 // Special case for a byte carrying 5 bits: begin at 194 so that no unused edges are
 // created (certain byte sequences are never accepted). Other cases could be optimized too.
 // NOTE(review): 194 is 0xC2, presumably the smallest non-overlong 2-byte lead -- confirm.
 final int lowest = (bits == 5) ? 194 : (current & ~MASKS[bits - 1]);
 if (doAll && current != lowest) {
  all(start, end, lowest, current - 1, endUTF8.len - upto - 1);
 }
 final int next = utf8.createState();
 utf8.addTransition(start, next, current);
 // Deeper positions always request the full range below their byte.
 end(next, end, endUTF8, upto + 1, true);
}

/**
 * Converts one edge labelled with the code point range
 * {@code [startCodePoint, endCodePoint]}.
 *
 * <p>Loads both bounds into the {@code startUTF8} and {@code endUTF8} sequences held outside
 * this method, then delegates to {@code build(...)} starting at byte position 0.
 *
 * @param start state passed to {@code build} as the origin
 * @param end state passed to {@code build} as the destination
 * @param startCodePoint first code point of the range
 * @param endCodePoint last code point of the range
 */
void convertOneEdge(int start, int end, int startCodePoint, int endCodePoint) {
 startUTF8.set(startCodePoint);
 endUTF8.set(endCodePoint);
 // 0 = begin at the first byte of both sequences.
 build(start, end, startUTF8, endUTF8, 0);
}

Javadoc

Converts UTF-32 automata to the equivalent UTF-8 representation.

Most used methods

<init>
Sole constructor.
convert
Converts an incoming UTF-32 automaton to an equivalent UTF-8 one. The incoming automaton need not be deterministic.
all
build
convertOneEdge
end
start

Popular in Java

Updating database using SQL prepared statement
setScale (BigDecimal)
getResourceAsStream (ClassLoader)
getSharedPreferences (Context)
SecureRandom (java.security)
This class generates cryptographically secure pseudo-random numbers. It is best to invoke SecureRand
NumberFormat (java.text)
The abstract base class for all number formats. This class provides the interface for formatting and
ReentrantLock (java.util.concurrent.locks)
A reentrant mutual exclusion Lock with the same basic behavior and semantics as the implicit monitor
Stream (java.util.stream)
A sequence of elements supporting sequential and parallel aggregate operations. The following exampl
Container (java.awt)
A generic Abstract Window Toolkit(AWT) container object is a component that can contain other AWT co
Join (org.hibernate.mapping)
From CI to AI: The AI layer in your organization

How to use UTF32ToUTF8 in org.apache.lucene.util.automaton

Best Java code snippets using org.apache.lucene.util.automaton.UTF32ToUTF8 (Showing top 20 results out of 315)

How to use
UTF32ToUTF8
in
org.apache.lucene.util.automaton