/**
 * Expert constructor.
 *
 * <p>When {@code isBinary} is {@code true} the supplied automaton is handed to the superclass
 * unchanged, i.e. it is treated as already byte-based. Otherwise it is first run through
 * {@code new UTF32ToUTF8().convert(a)}.
 *
 * @param a the automaton to run
 * @param isBinary {@code true} if {@code a} is already byte-based and must not be converted
 * @param maxDeterminizedStates forwarded unchanged to the superclass constructor
 */
public ByteRunAutomaton(Automaton a, boolean isBinary, int maxDeterminizedStates) {
  // NOTE(review): 256 is presumably the byte alphabet size expected by the superclass; confirm.
  super(
      isBinary ? a : new UTF32ToUTF8().convert(a),
      256,
      maxDeterminizedStates);
}
build(n, end, startUTF8, endUTF8, 1+upto); start(start, end, startUTF8, upto, false); if (endUTF8.byteAt(upto) - startUTF8.byteAt(upto) > 1) { all(start, end, startUTF8.byteAt(upto)+1, endUTF8.byteAt(upto)-1, startUTF8.len-upto-1); end(start, end, endUTF8, upto, false); start(start, end, startUTF8, upto, true); all(start, end, tmpUTF8a.byteAt(0), tmpUTF8b.byteAt(0), end(start, end, endUTF8, upto, true);
convertOneEdge(utf8State, destUTF8, scratch.min, scratch.max);
private void start(int start, int end, UTF8Sequence startUTF8, int upto, boolean doAll) { if (upto == startUTF8.len-1) { // Done recursing utf8.addTransition(start, end, startUTF8.byteAt(upto), startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1]); // type=start //start.addTransition(new Transition(startUTF8.byteAt(upto), startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1], end)); // type=start } else { int n = utf8.createState(); utf8.addTransition(start, n, startUTF8.byteAt(upto)); //start.addTransition(new Transition(startUTF8.byteAt(upto), n)); // type=start start(n, end, startUTF8, 1+upto, true); int endCode = startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1]; if (doAll && startUTF8.byteAt(upto) != endCode) { all(start, end, startUTF8.byteAt(upto)+1, endCode, startUTF8.len-upto-1); } } }
private void end(int start, int end, UTF8Sequence endUTF8, int upto, boolean doAll) { if (upto == endUTF8.len-1) { // Done recursing //start.addTransition(new Transition(endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]), endUTF8.byteAt(upto), end)); // type=end utf8.addTransition(start, end, endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]), endUTF8.byteAt(upto)); } else { final int startCode; if (endUTF8.numBits(upto) == 5) { // special case -- avoid created unused edges (endUTF8 // doesn't accept certain byte sequences) -- there // are other cases we could optimize too: startCode = 194; } else { startCode = endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]); } if (doAll && endUTF8.byteAt(upto) != startCode) { all(start, end, startCode, endUTF8.byteAt(upto)-1, endUTF8.len-upto-1); } int n = utf8.createState(); //start.addTransition(new Transition(endUTF8.byteAt(upto), n)); // type=end utf8.addTransition(start, n, endUTF8.byteAt(upto)); end(n, end, endUTF8, 1+upto, true); } }
/**
 * Converts one code point range edge into byte-level transitions.
 *
 * <p>Loads {@code startCodePoint} and {@code endCodePoint} into the reusable {@code startUTF8}
 * and {@code endUTF8} sequences, then calls {@code build} starting at byte index 0.
 *
 * @param start source state of the edge
 * @param end destination state of the edge
 * @param startCodePoint lower bound of the edge's code point range
 * @param endCodePoint upper bound of the edge's code point range
 */
void convertOneEdge(int start, int end, int startCodePoint, int endCodePoint) {
  // Both scratch sequences are overwritten on every call.
  startUTF8.set(startCodePoint);
  endUTF8.set(endCodePoint);

  final int firstByteIndex = 0;
  build(start, end, startUTF8, endUTF8, firstByteIndex);
}
private void start(int start, int end, UTF8Sequence startUTF8, int upto, boolean doAll) { if (upto == startUTF8.len-1) { // Done recursing utf8.addTransition(start, end, startUTF8.byteAt(upto), startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1]); // type=start //start.addTransition(new Transition(startUTF8.byteAt(upto), startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1], end)); // type=start } else { int n = utf8.createState(); utf8.addTransition(start, n, startUTF8.byteAt(upto)); //start.addTransition(new Transition(startUTF8.byteAt(upto), n)); // type=start start(n, end, startUTF8, 1+upto, true); int endCode = startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1]; if (doAll && startUTF8.byteAt(upto) != endCode) { all(start, end, startUTF8.byteAt(upto)+1, endCode, startUTF8.len-upto-1); } } }
private void end(int start, int end, UTF8Sequence endUTF8, int upto, boolean doAll) { if (upto == endUTF8.len-1) { // Done recursing //start.addTransition(new Transition(endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]), endUTF8.byteAt(upto), end)); // type=end utf8.addTransition(start, end, endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]), endUTF8.byteAt(upto)); } else { final int startCode; if (endUTF8.numBits(upto) == 5) { // special case -- avoid created unused edges (endUTF8 // doesn't accept certain byte sequences) -- there // are other cases we could optimize too: startCode = 194; } else { startCode = endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]); } if (doAll && endUTF8.byteAt(upto) != startCode) { all(start, end, startCode, endUTF8.byteAt(upto)-1, endUTF8.len-upto-1); } int n = utf8.createState(); //start.addTransition(new Transition(endUTF8.byteAt(upto), n)); // type=end utf8.addTransition(start, n, endUTF8.byteAt(upto)); end(n, end, endUTF8, 1+upto, true); } }
/**
 * Converts one code point range edge into byte-level transitions.
 *
 * <p>Loads {@code startCodePoint} and {@code endCodePoint} into the reusable {@code startUTF8}
 * and {@code endUTF8} sequences, then calls {@code build} starting at byte index 0.
 *
 * @param start source state of the edge
 * @param end destination state of the edge
 * @param startCodePoint lower bound of the edge's code point range
 * @param endCodePoint upper bound of the edge's code point range
 */
void convertOneEdge(int start, int end, int startCodePoint, int endCodePoint) {
  // Both scratch sequences are overwritten on every call.
  startUTF8.set(startCodePoint);
  endUTF8.set(endCodePoint);

  final int firstByteIndex = 0;
  build(start, end, startUTF8, endUTF8, firstByteIndex);
}
} else { binary = new UTF32ToUTF8().convert(automaton);
build(n, end, startUTF8, endUTF8, 1+upto); start(start, end, startUTF8, upto, false); if (endUTF8.byteAt(upto) - startUTF8.byteAt(upto) > 1) { all(start, end, startUTF8.byteAt(upto)+1, endUTF8.byteAt(upto)-1, startUTF8.len-upto-1); end(start, end, endUTF8, upto, false); start(start, end, startUTF8, upto, true); all(start, end, tmpUTF8a.byteAt(0), tmpUTF8b.byteAt(0), end(start, end, endUTF8, upto, true);
private void start(int start, int end, UTF8Sequence startUTF8, int upto, boolean doAll) { if (upto == startUTF8.len-1) { // Done recursing utf8.addTransition(start, end, startUTF8.byteAt(upto), startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1]); // type=start //start.addTransition(new Transition(startUTF8.byteAt(upto), startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1], end)); // type=start } else { int n = utf8.createState(); utf8.addTransition(start, n, startUTF8.byteAt(upto)); //start.addTransition(new Transition(startUTF8.byteAt(upto), n)); // type=start start(n, end, startUTF8, 1+upto, true); int endCode = startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1]; if (doAll && startUTF8.byteAt(upto) != endCode) { all(start, end, startUTF8.byteAt(upto)+1, endCode, startUTF8.len-upto-1); } } }
private void end(int start, int end, UTF8Sequence endUTF8, int upto, boolean doAll) { if (upto == endUTF8.len-1) { // Done recursing //start.addTransition(new Transition(endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]), endUTF8.byteAt(upto), end)); // type=end utf8.addTransition(start, end, endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]), endUTF8.byteAt(upto)); } else { final int startCode; if (endUTF8.numBits(upto) == 5) { // special case -- avoid created unused edges (endUTF8 // doesn't accept certain byte sequences) -- there // are other cases we could optimize too: startCode = 194; } else { startCode = endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]); } if (doAll && endUTF8.byteAt(upto) != startCode) { all(start, end, startCode, endUTF8.byteAt(upto)-1, endUTF8.len-upto-1); } int n = utf8.createState(); //start.addTransition(new Transition(endUTF8.byteAt(upto), n)); // type=end utf8.addTransition(start, n, endUTF8.byteAt(upto)); end(n, end, endUTF8, 1+upto, true); } }
/**
 * Converts one code point range edge into byte-level transitions.
 *
 * <p>Loads {@code startCodePoint} and {@code endCodePoint} into the reusable {@code startUTF8}
 * and {@code endUTF8} sequences, then calls {@code build} starting at byte index 0.
 *
 * @param start source state of the edge
 * @param end destination state of the edge
 * @param startCodePoint lower bound of the edge's code point range
 * @param endCodePoint upper bound of the edge's code point range
 */
void convertOneEdge(int start, int end, int startCodePoint, int endCodePoint) {
  // Both scratch sequences are overwritten on every call.
  startUTF8.set(startCodePoint);
  endUTF8.set(endCodePoint);

  final int firstByteIndex = 0;
  build(start, end, startUTF8, endUTF8, firstByteIndex);
}
convertOneEdge(utf8State, destUTF8, scratch.min, scratch.max);
@Override protected Automaton convertAutomaton(Automaton a) { if (unicodeAware) { // FLORIAN EDIT: get converted Automaton from superclass Automaton utf8automaton = new UTF32ToUTF8().convert(super.convertAutomaton(a)); // This automaton should not blow up during determinize: utf8automaton = Operations.determinize(utf8automaton, Integer.MAX_VALUE); return utf8automaton; } else { return super.convertAutomaton(a); } }
build(n, end, startUTF8, endUTF8, 1+upto); start(start, end, startUTF8, upto, false); if (endUTF8.byteAt(upto) - startUTF8.byteAt(upto) > 1) { all(start, end, startUTF8.byteAt(upto)+1, endUTF8.byteAt(upto)-1, startUTF8.len-upto-1); end(start, end, endUTF8, upto, false); start(start, end, startUTF8, upto, true); all(start, end, tmpUTF8a.byteAt(0), tmpUTF8b.byteAt(0), end(start, end, endUTF8, upto, true);
private void start(int start, int end, UTF8Sequence startUTF8, int upto, boolean doAll) { if (upto == startUTF8.len-1) { // Done recursing utf8.addTransition(start, end, startUTF8.byteAt(upto), startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1]); // type=start //start.addTransition(new Transition(startUTF8.byteAt(upto), startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1], end)); // type=start } else { int n = utf8.createState(); utf8.addTransition(start, n, startUTF8.byteAt(upto)); //start.addTransition(new Transition(startUTF8.byteAt(upto), n)); // type=start start(n, end, startUTF8, 1+upto, true); int endCode = startUTF8.byteAt(upto) | MASKS[startUTF8.numBits(upto)-1]; if (doAll && startUTF8.byteAt(upto) != endCode) { all(start, end, startUTF8.byteAt(upto)+1, endCode, startUTF8.len-upto-1); } } }
private void end(int start, int end, UTF8Sequence endUTF8, int upto, boolean doAll) { if (upto == endUTF8.len-1) { // Done recursing //start.addTransition(new Transition(endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]), endUTF8.byteAt(upto), end)); // type=end utf8.addTransition(start, end, endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]), endUTF8.byteAt(upto)); } else { final int startCode; if (endUTF8.numBits(upto) == 5) { // special case -- avoid created unused edges (endUTF8 // doesn't accept certain byte sequences) -- there // are other cases we could optimize too: startCode = 194; } else { startCode = endUTF8.byteAt(upto) & (~MASKS[endUTF8.numBits(upto)-1]); } if (doAll && endUTF8.byteAt(upto) != startCode) { all(start, end, startCode, endUTF8.byteAt(upto)-1, endUTF8.len-upto-1); } int n = utf8.createState(); //start.addTransition(new Transition(endUTF8.byteAt(upto), n)); // type=end utf8.addTransition(start, n, endUTF8.byteAt(upto)); end(n, end, endUTF8, 1+upto, true); } }
/**
 * Converts one code point range edge into byte-level transitions.
 *
 * <p>Loads {@code startCodePoint} and {@code endCodePoint} into the reusable {@code startUTF8}
 * and {@code endUTF8} sequences, then calls {@code build} starting at byte index 0.
 *
 * @param start source state of the edge
 * @param end destination state of the edge
 * @param startCodePoint lower bound of the edge's code point range
 * @param endCodePoint upper bound of the edge's code point range
 */
void convertOneEdge(int start, int end, int startCodePoint, int endCodePoint) {
  // Both scratch sequences are overwritten on every call.
  startUTF8.set(startCodePoint);
  endUTF8.set(endCodePoint);

  final int firstByteIndex = 0;
  build(start, end, startUTF8, endUTF8, firstByteIndex);
}