/**
 * Reads the successor range stored with the ngram at the given index.
 * The begin/end pointers live after the word id and the quantized weight
 * bits inside each packed trie entry.
 *
 * @param ngramIdx index of the ngram whose successor range is read
 * @param range    output parameter filled with the successor range
 */
void readNextRange(int ngramIdx, TrieRange range) {
    // Skip past the word id and the quantized weights of this entry.
    int bitPos = ngramIdx * totalBits + wordBits + getQuantBits();
    range.begin = bitArr.readInt(memPtr, bitPos, nextMask);
    // The matching end pointer sits at the same offset of the next entry.
    range.end = bitArr.readInt(memPtr, bitPos + totalBits, nextMask);
}
/**
 * Returns the backing byte array to which the ngram trie is mapped.
 *
 * @return raw trie memory as a {@code byte[]}
 */
public byte[] getMem() {
    return bitArr.getArr();
}
/**
 * Finds an ngram of a certain order in the specified range and reads its probability.
 * After this call the range holds the ngram's successors; if the ngram was not
 * found the range is invalid.
 *
 * @param wordId - word id to look for
 * @param orderMinusTwo - order of the ngram minus two
 * @param range - range to search; contains the ngram successors afterwards
 * @param quant - quantization object used to decode the compressed probability
 * @return probability of the ngram, or 0.0f when it is absent
 */
public float readNgramProb(int wordId, int orderMinusTwo, TrieRange range, NgramTrieQuant quant) {
    NgramSet ngram = getNgram(orderMinusTwo);
    int ptr = findNgram(ngram, wordId, range);
    if (ptr < 0)
        return 0.0f;
    return quant.readProb(bitArr, ngram.memPtr, ngram.getNgramWeightsOffset(ptr), orderMinusTwo);
}
/** * Reads weights quantation object from stream * @param order - max order of ngrams for this model * @return quantation object, see {@link NgramTrieQuant} * @throws IOException if reading from stream failed */ public NgramTrieQuant readQuant(int order) throws IOException { int quantTypeInt = Utilities.readLittleEndianInt(inStream); if (quantTypeInt < 0 || quantTypeInt >= NgramTrieQuant.QuantType.values().length) throw new Error("Unknown quantatization type: " + quantTypeInt); NgramTrieQuant.QuantType quantType = NgramTrieQuant.QuantType.values()[quantTypeInt]; NgramTrieQuant quant = new NgramTrieQuant(order, quantType); //reading tables for (int i = 2; i <= order; i++) { quant.setTable(readFloatArr(quant.getProbTableLen()), i, true); if (i < order) quant.setTable(readFloatArr(quant.getBackoffTableLen()), i, false); } return quant; }
/**
 * Finds an ngram of a certain order in the specified range and reads its backoff.
 * After this call the range holds the ngram's successors; if the ngram was not
 * found the range is invalid.
 *
 * @param wordId - word id to look for
 * @param orderMinusTwo - order of the ngram minus two
 * @param range - range to search; contains the ngram successors afterwards
 * @param quant - quantization object used to decode the compressed backoff
 * @return backoff of the ngram, or 0.0f when it is absent
 */
public float readNgramBackoff(int wordId, int orderMinusTwo, TrieRange range, NgramTrieQuant quant) {
    NgramSet ngram = getNgram(orderMinusTwo);
    int ptr = findNgram(ngram, wordId, range);
    if (ptr < 0)
        return 0.0f;
    return quant.readBackoff(bitArr, ngram.memPtr, ngram.getNgramWeightsOffset(ptr), orderMinusTwo);
}
/**
 * Interpolation search for the entry with the given word id inside range.
 * Word ids within a range are sorted, so the probe position is estimated
 * from the id's relative position in the shrinking vocabulary span.
 *
 * @return index of the matching entry, or -1 if the word id is not present
 */
private int uniformFind(NgramSet ngram, TrieRange range, int wordId) {
    TrieRange vocab = new TrieRange(0, ngram.maxVocab);
    while (range.getWidth() > 1) {
        int probe = range.begin + 1
                + calculatePivot(wordId - vocab.begin, vocab.getWidth(), range.getWidth() - 1);
        int probeWord = ngram.readNgramWord(probe);
        if (probeWord == wordId)
            return probe;
        if (probeWord < wordId) {
            // Target lies above the probe: tighten the lower bounds.
            range.begin = probe;
            vocab.begin = probeWord;
        } else {
            // Target lies below the probe: tighten the upper bounds.
            range.end = probe;
            vocab.end = probeWord;
        }
    }
    return -1;
}
// NOTE(review): this fragment appears garbled/truncated — the braces are unbalanced
// and statements that look like they belong to different parts of trie construction
// (per-order entry sizing, bit-array allocation, middle/longest ngram set setup) are
// interleaved. Restore the original constructor from version control before editing.
int[] ngramMemSize = new int[counts.length - 1]; for (int i = 1; i <= counts.length - 1; i++) { int entryLen = requiredBits(counts[0]); if (i == counts.length - 1) { } else { entryLen += requiredBits(counts[i + 1]); entryLen += quantProbBoLen; memLen += tmpLen; bitArr = new NgramTrieBitarr(memLen); this.quantProbLen = quantProbLen; this.quantProbBoLen = quantProbBoLen; middles[i - 2] = new MiddleNgramSet(startPtrs[i - 2], quantProbBoLen, counts[i-1], counts[0], counts[i]); longest = new LongestNgramSet(startPtr, quantProbLen, counts[0]); ordersNum = middles.length + 1;
/**
 * Accumulates backoff weights for the part of the word sequence that was NOT
 * matched by {@link #getAvailableProb(WordSequence, TrieRange, float) getAvailableProb}.
 * The number of already-consumed words is given by the field {@code curDepth}.
 *
 * @param wordSequence - full word sequence that is being scored
 * @return summed backoff for the unmatched history words
 */
private float getAvailableBackoff(WordSequence wordSequence) {
    float backoff = 0.0f;
    int wordsNum = wordSequence.size();
    // Start from the word just before the last one (the last word's probability
    // was already taken by the forward pass).
    int wordId = unigramIDMap.get(wordSequence.getWord(wordsNum - 2));
    // Successor range of that unigram: [next of this entry, next of the following entry).
    TrieRange range = new TrieRange(unigrams[wordId].next, unigrams[wordId + 1].next);
    if (curDepth == 1) {
        // Only the unigram matched, so its own backoff applies as well.
        backoff += unigrams[wordId].backoff;
    }
    // Walk further back through the history, adding backoffs of increasing order
    // until an ngram is missing or its successor range becomes unsearchable.
    int sequenceIdx, orderMinusTwo;
    for (sequenceIdx = wordsNum - 3, orderMinusTwo = 0; sequenceIdx >= 0; sequenceIdx--, orderMinusTwo++) {
        int tmpWordId = unigramIDMap.get(wordSequence.getWord(sequenceIdx));
        float tmpBackoff = trie.readNgramBackoff(tmpWordId, orderMinusTwo, range, quant);
        if (!range.getFound())
            break;
        backoff += tmpBackoff;
        if (!range.isSearchable())
            break;
    }
    return backoff;
}
/** * Finds ngram index which corresponds to ngram with specified wordId. * Search is performed in specified range. * Fills range with ngram successors if ngram was found, makes range invalid otherwise. * @param ngramSet - set of ngrams of certain order to look in * @param wordId - word id to look for * @param range - range to look in. range contains ngram successors or is invalid after method usage. * @return ngram index that can be converted into byte offset if ngram was found, -1 otherwise */ private int findNgram(NgramSet ngramSet, int wordId, TrieRange range) { int ptr; range.begin--; if ((ptr = uniformFind(ngramSet, range, wordId)) < 0) { range.setFound(false); return -1; } //read next order ngrams for future searches if (ngramSet instanceof MiddleNgramSet) ((MiddleNgramSet)ngramSet).readNextRange(ptr, range); return ptr; }
/** * Reads encoded probability from provided trie bit array and decodes it into actual value * for specific ngram * @param bitArr - trie bit array * @param memPtr - memory pointer for specific ngram order * @param bitOffset - offset from memPtr that is calculated according to ngram index * @param orderMinusTwo - order of ngram minus two * @return probability of ngram */ public float readProb(NgramTrieBitarr bitArr, int memPtr, int bitOffset, int orderMinusTwo) { switch (quantType) { case NO_QUANT: return bitArr.readNegativeFloat(memPtr, bitOffset); case QUANT_16: int tableIdx = orderMinusTwo * 2; if (tableIdx < tables.length - 1) bitOffset += backoffBits; return binsDecode(tableIdx, bitArr.readInt(memPtr, bitOffset, backoffMask)); //TODO implement different quantization stages default: throw new Error("Unsupported quantization type: " + quantType); } }
/** * Reads encoded backoff from provided trie bit array and decodes it into actual value * for specific ngram * @param bitArr - trie bit array * @param memPtr - memory pointer for specific ngram order * @param bitOffset - offset from memPtr that is calculated according to ngram index * @param orderMinusTwo - order of ngram minus two * @return backoffs of ngram */ public float readBackoff(NgramTrieBitarr bitArr, int memPtr, int bitOffset, int orderMinusTwo) { switch (quantType) { case NO_QUANT: bitOffset += 31; return bitArr.readFloat(memPtr, bitOffset); case QUANT_16: int tableIdx = orderMinusTwo * 2 + 1; return binsDecode(tableIdx, bitArr.readInt(memPtr, bitOffset, probMask)); //TODO implement different quantization stages default: throw new Error("Unsupported quantization type: " + quantType); } }
/** * extracts raw word sequence probability without using caching, * making fresh LM trie traversing * @param wordSequence - sequence of words to get probability for * @return probability of specialized sequence of words */ private float getProbabilityRaw(WordSequence wordSequence) { int wordsNum = wordSequence.size(); int wordId = unigramIDMap.get(wordSequence.getWord(wordsNum - 1)); TrieRange range = new TrieRange(unigrams[wordId].next, unigrams[wordId + 1].next); float prob = unigrams[wordId].prob; curDepth = 1; if (wordsNum == 1) return prob; //find prob of ngrams of higher order if any prob = getAvailableProb(wordSequence, range, prob); if (curDepth < wordsNum) { //use backoff for rest of ngram prob += getAvailableBackoff(wordSequence); } return prob; }
/**
 * Selects the highest-order ngram available for the given word sequence
 * and returns its probability, starting from the supplied unigram probability.
 *
 * @param wordSequence - word sequence to score
 * @param range - range to look the bigram up in
 * @param prob - probability of the unigram
 * @return probability of the highest-order ngram found
 */
private float getAvailableProb(WordSequence wordSequence, TrieRange range, float prob) {
    if (!range.isSearchable())
        return prob;
    int size = wordSequence.size();
    // Extend the match one history word at a time, from bigram upwards.
    for (int orderMinusTwo = 0; orderMinusTwo <= size - 2; orderMinusTwo++) {
        if (orderMinusTwo + 1 == maxDepth)
            break;
        int wordIdx = size - 2 - orderMinusTwo;
        int wordId = unigramIDMap.get(wordSequence.getWord(wordIdx));
        float candidate = trie.readNgramProb(wordId, orderMinusTwo, range, quant);
        if (!range.getFound())
            break;
        // Higher-order match found: it supersedes the lower-order probability.
        prob = candidate;
        curDepth++;
        if (!range.isSearchable())
            break;
    }
    return prob;
}
/**
 * Tells whether this range still contains any entries to search.
 */
boolean isSearchable() {
    return getWidth() > 0;
}
}
/**
 * Reads the word id stored in the trie entry at the given ngram index.
 */
int readNgramWord(int ngramIdx) {
    // The word id occupies the leading wordBits of each packed entry.
    return bitArr.readInt(memPtr, ngramIdx * totalBits, wordMask);
}
/**
 * Creates a loader that reads a binary trie language model from the given location.
 *
 * @param location URL of the binary language model file; must not be null
 * @throws IOException if the stream cannot be opened or reading the model fails
 * @throws NullPointerException if {@code location} is null
 */
public BinaryLoader(URL location) throws IOException {
    // Fail fast with a clear message instead of an unexplained NPE from openStream().
    if (location == null)
        throw new NullPointerException("location must not be null");
    // The stream is intentionally left open: subsequent read* calls consume it.
    loadModelData(location.openStream());
}
/**
 * Called by the lexicon after recognition of an utterance.
 * Clears the score caches and, when logging is enabled, writes an
 * utterance-end marker to the log file.
 */
public void onUtteranceEnd() {
    clearCache();
    if (logFile == null)
        return;
    logFile.println("<END_UTT>");
    logFile.flush();
}
/**
 * Reads the file header from the stream and checks that it matches the
 * expected trie header.
 *
 * @throws IOException if reading from the stream failed
 */
public void verifyHeader() throws IOException {
    String header = readString(inStream, TRIE_HEADER.length());
    if (!header.equals(TRIE_HEADER))
        throw new Error("Bad binary LM file header: " + header);
}
/**
 * Reads the language model order and the per-order ngram counts.
 *
 * @return array of counts indexed by ngram order (0-based)
 * @throws IOException if reading from the stream failed
 */
public int[] readCounts() throws IOException {
    int order = readOrder();
    int[] counts = new int[order];
    for (int i = 0; i < order; i++)
        counts[i] = Utilities.readLittleEndianInt(inStream);
    return counts;
}
/**
 * Reads the array of language model unigrams.
 *
 * @param count - number of unigrams according to the previously read counts
 * @return array of unigrams, see {@link NgramTrieModel.TrieUnigram}; contains
 *         one extra entry so that entry i+1's {@code next} can bound entry i's
 *         successor range
 * @throws IOException if reading from the stream failed
 */
public TrieUnigram[] readUnigrams(int count) throws IOException {
    TrieUnigram[] unigrams = new TrieUnigram[count + 1];
    for (int i = 0; i <= count; i++) {
        TrieUnigram unigram = new TrieUnigram();
        unigram.prob = Utilities.readLittleEndianFloat(inStream);
        unigram.backoff = Utilities.readLittleEndianFloat(inStream);
        unigram.next = Utilities.readLittleEndianInt(inStream);
        unigrams[i] = unigram;
    }
    return unigrams;
}