/**
 * Looks for the ngram entry whose word id equals {@code wordId} inside {@code range}.
 * Each iteration probes one index picked by {@code calculatePivot}, then moves either
 * {@code range.begin} or {@code range.end} onto that index, so {@code range} is
 * modified by the search. A parallel id interval (initially {@code [0, ngram.maxVocab]})
 * is narrowed alongside it and fed back into the pivot calculation.
 * NOTE(review): begin/end appear to be treated as exclusive bounds, since the probe
 * always starts from {@code range.begin + 1} - confirm against calculatePivot.
 * @param ngram - set of ngrams to read word ids from
 * @param range - index range to search in; narrowed in place
 * @param wordId - word id to look for
 * @return index of the matching entry, or -1 once the range width drops to 1 or less
 */
private int uniformFind(NgramSet ngram, TrieRange range, int wordId) {
    TrieRange idBounds = new TrieRange(0, ngram.maxVocab);
    while (range.getWidth() > 1) {
        int idOffset = wordId - idBounds.begin;
        int probe = range.begin + 1 + calculatePivot(idOffset, idBounds.getWidth(), range.getWidth() - 1);
        int probedId = ngram.readNgramWord(probe);
        if (probedId == wordId) {
            return probe;
        }
        if (probedId < wordId) {
            // target lies above the probe: raise the lower bounds
            range.begin = probe;
            idBounds.begin = probedId;
        } else {
            // target lies below the probe: lower the upper bounds
            range.end = probe;
            idBounds.end = probedId;
        }
    }
    return -1;
}
/**
 * Locates the ngram of the given order inside {@code range} and reads its backoff.
 * The lookup is delegated to {@code findNgram}, which modifies {@code range}
 * (successors of the ngram on success, marked not found on failure).
 * @param wordId - word id to look for
 * @param orderMinusTwo - order of the ngram minus two
 * @param range - range to look in; updated by the lookup
 * @param quant - quantization object used to decode the stored backoff
 * @return backoff of the ngram, or 0.0f if the ngram was not found
 */
public float readNgramBackoff(int wordId, int orderMinusTwo, TrieRange range, NgramTrieQuant quant) {
    NgramSet ngramSet = getNgram(orderMinusTwo);
    int ngramIdx = findNgram(ngramSet, wordId, range);
    if (ngramIdx < 0) {
        return 0.0f;
    }
    return quant.readBackoff(bitArr, ngramSet.memPtr, ngramSet.getNgramWeightsOffset(ngramIdx), orderMinusTwo);
}
/**
 * Accumulates backoffs for the history part of a word sequence.
 * Starts from the second-to-last word: its unigram backoff is added only when
 * {@code curDepth} (state kept outside this method, advanced by
 * {@link #getAvailableProb(WordSequence, TrieRange, float) getAvailableProb}) equals 1.
 * Then walks towards the beginning of the sequence, adding the backoff of every
 * higher-order ngram that is found, and stops at the first miss or as soon as the
 * range is no longer searchable.
 * @param wordSequence - full word sequence that is scored
 * @return accumulated backoff
 */
private float getAvailableBackoff(WordSequence wordSequence) {
    int wordsNum = wordSequence.size();
    int historyWordId = unigramIDMap.get(wordSequence.getWord(wordsNum - 2));
    TrieRange range = new TrieRange(unigrams[historyWordId].next, unigrams[historyWordId + 1].next);

    float total = 0.0f;
    if (curDepth == 1) {
        total += unigrams[historyWordId].backoff;
    }

    // orderMinusTwo counts up while the sequence index counts down from wordsNum - 3
    int firstIdx = wordsNum - 3;
    for (int orderMinusTwo = 0; orderMinusTwo <= firstIdx; orderMinusTwo++) {
        int sequenceIdx = firstIdx - orderMinusTwo;
        int contextWordId = unigramIDMap.get(wordSequence.getWord(sequenceIdx));
        float ngramBackoff = trie.readNgramBackoff(contextWordId, orderMinusTwo, range, quant);
        if (!range.getFound()) {
            break;
        }
        total += ngramBackoff;
        if (!range.isSearchable()) {
            break;
        }
    }
    return total;
}
/** * Finds ngram index which corresponds to ngram with specified wordId. * Search is performed in specified range. * Fills range with ngram successors if ngram was found, makes range invalid otherwise. * @param ngramSet - set of ngrams of certain order to look in * @param wordId - word id to look for * @param range - range to look in. range contains ngram successors or is invalid after method usage. * @return ngram index that can be converted into byte offset if ngram was found, -1 otherwise */ private int findNgram(NgramSet ngramSet, int wordId, TrieRange range) { int ptr; range.begin--; if ((ptr = uniformFind(ngramSet, range, wordId)) < 0) { range.setFound(false); return -1; } //read next order ngrams for future searches if (ngramSet instanceof MiddleNgramSet) ((MiddleNgramSet)ngramSet).readNextRange(ptr, range); return ptr; }
/**
 * Finds the highest-order ngram available for the word sequence and returns its
 * probability. Starting at the second-to-last word and moving towards the beginning,
 * each step looks up the next-order ngram; every hit replaces the probability and
 * increments {@code curDepth}. The walk ends at the first miss, when the range stops
 * being searchable, or when the next order would reach {@code maxDepth}.
 * @param wordSequence - word sequence to score
 * @param range - range to look the bigram in; updated by each lookup
 * @param prob - probability of the unigram, returned unchanged if nothing longer is found
 * @return probability of the highest-order ngram available
 */
private float getAvailableProb(WordSequence wordSequence, TrieRange range, float prob) {
    if (!range.isSearchable()) {
        return prob;
    }
    float best = prob;
    int lastContextIdx = wordSequence.size() - 2;
    for (int orderMinusTwo = 0; orderMinusTwo <= lastContextIdx; orderMinusTwo++) {
        if (orderMinusTwo + 1 == maxDepth) {
            break;
        }
        int wordIdx = lastContextIdx - orderMinusTwo;
        int wordId = unigramIDMap.get(wordSequence.getWord(wordIdx));
        float candidate = trie.readNgramProb(wordId, orderMinusTwo, range, quant);
        if (!range.getFound()) {
            break;
        }
        best = candidate;
        curDepth++;
        if (!range.isSearchable()) {
            break;
        }
    }
    return best;
}
int[] ngramMemSize = new int[counts.length - 1]; for (int i = 1; i <= counts.length - 1; i++) { int entryLen = requiredBits(counts[0]); if (i == counts.length - 1) { } else { entryLen += requiredBits(counts[i + 1]); entryLen += quantProbBoLen;
/**
 * Accumulates backoffs for the history part of a word sequence.
 * Starts from the second-to-last word: its unigram backoff is added only when
 * {@code curDepth} (state kept outside this method, advanced by
 * {@link #getAvailableProb(WordSequence, TrieRange, float) getAvailableProb}) equals 1.
 * Then walks towards the beginning of the sequence, adding the backoff of every
 * higher-order ngram that is found, and stops at the first miss or as soon as the
 * range is no longer searchable.
 * @param wordSequence - full word sequence that is scored
 * @return accumulated backoff
 */
private float getAvailableBackoff(WordSequence wordSequence) {
    int wordsNum = wordSequence.size();
    int historyWordId = unigramIDMap.get(wordSequence.getWord(wordsNum - 2));
    TrieRange range = new TrieRange(unigrams[historyWordId].next, unigrams[historyWordId + 1].next);

    float total = 0.0f;
    if (curDepth == 1) {
        total += unigrams[historyWordId].backoff;
    }

    // orderMinusTwo counts up while the sequence index counts down from wordsNum - 3
    int firstIdx = wordsNum - 3;
    for (int orderMinusTwo = 0; orderMinusTwo <= firstIdx; orderMinusTwo++) {
        int sequenceIdx = firstIdx - orderMinusTwo;
        int contextWordId = unigramIDMap.get(wordSequence.getWord(sequenceIdx));
        float ngramBackoff = trie.readNgramBackoff(contextWordId, orderMinusTwo, range, quant);
        if (!range.getFound()) {
            break;
        }
        total += ngramBackoff;
        if (!range.isSearchable()) {
            break;
        }
    }
    return total;
}
/** * Finds ngram index which corresponds to ngram with specified wordId. * Search is performed in specified range. * Fills range with ngram successors if ngram was found, makes range invalid otherwise. * @param ngramSet - set of ngrams of certain order to look in * @param wordId - word id to look for * @param range - range to look in. range contains ngram successors or is invalid after method usage. * @return ngram index that can be converted into byte offset if ngram was found, -1 otherwise */ private int findNgram(NgramSet ngramSet, int wordId, TrieRange range) { int ptr; range.begin--; if ((ptr = uniformFind(ngramSet, range, wordId)) < 0) { range.setFound(false); return -1; } //read next order ngrams for future searches if (ngramSet instanceof MiddleNgramSet) ((MiddleNgramSet)ngramSet).readNextRange(ptr, range); return ptr; }
/**
 * Finds the highest-order ngram available for the word sequence and returns its
 * probability. Starting at the second-to-last word and moving towards the beginning,
 * each step looks up the next-order ngram; every hit replaces the probability and
 * increments {@code curDepth}. The walk ends at the first miss, when the range stops
 * being searchable, or when the next order would reach {@code maxDepth}.
 * @param wordSequence - word sequence to score
 * @param range - range to look the bigram in; updated by each lookup
 * @param prob - probability of the unigram, returned unchanged if nothing longer is found
 * @return probability of the highest-order ngram available
 */
private float getAvailableProb(WordSequence wordSequence, TrieRange range, float prob) {
    if (!range.isSearchable()) {
        return prob;
    }
    float best = prob;
    int lastContextIdx = wordSequence.size() - 2;
    for (int orderMinusTwo = 0; orderMinusTwo <= lastContextIdx; orderMinusTwo++) {
        if (orderMinusTwo + 1 == maxDepth) {
            break;
        }
        int wordIdx = lastContextIdx - orderMinusTwo;
        int wordId = unigramIDMap.get(wordSequence.getWord(wordIdx));
        float candidate = trie.readNgramProb(wordId, orderMinusTwo, range, quant);
        if (!range.getFound()) {
            break;
        }
        best = candidate;
        curDepth++;
        if (!range.isSearchable()) {
            break;
        }
    }
    return best;
}
int[] ngramMemSize = new int[counts.length - 1]; for (int i = 1; i <= counts.length - 1; i++) { int entryLen = requiredBits(counts[0]); if (i == counts.length - 1) { } else { entryLen += requiredBits(counts[i + 1]); entryLen += quantProbBoLen;
/**
 * Locates the ngram of the given order inside {@code range} and reads its probability.
 * The lookup is delegated to {@code findNgram}, which modifies {@code range}
 * (successors of the ngram on success, marked not found on failure).
 * @param wordId - word id to look for
 * @param orderMinusTwo - order of the ngram minus two
 * @param range - range to look in; updated by the lookup
 * @param quant - quantization object used to decode the stored probability
 * @return probability of the ngram, or 0.0f if the ngram was not found
 */
public float readNgramProb(int wordId, int orderMinusTwo, TrieRange range, NgramTrieQuant quant) {
    NgramSet ngramSet = getNgram(orderMinusTwo);
    int ngramIdx = findNgram(ngramSet, wordId, range);
    if (ngramIdx < 0) {
        return 0.0f;
    }
    return quant.readProb(bitArr, ngramSet.memPtr, ngramSet.getNgramWeightsOffset(ngramIdx), orderMinusTwo);
}
/**
 * Looks for the ngram entry whose word id equals {@code wordId} inside {@code range}.
 * Each iteration probes one index picked by {@code calculatePivot}, then moves either
 * {@code range.begin} or {@code range.end} onto that index, so {@code range} is
 * modified by the search. A parallel id interval (initially {@code [0, ngram.maxVocab]})
 * is narrowed alongside it and fed back into the pivot calculation.
 * NOTE(review): begin/end appear to be treated as exclusive bounds, since the probe
 * always starts from {@code range.begin + 1} - confirm against calculatePivot.
 * @param ngram - set of ngrams to read word ids from
 * @param range - index range to search in; narrowed in place
 * @param wordId - word id to look for
 * @return index of the matching entry, or -1 once the range width drops to 1 or less
 */
private int uniformFind(NgramSet ngram, TrieRange range, int wordId) {
    TrieRange idBounds = new TrieRange(0, ngram.maxVocab);
    while (range.getWidth() > 1) {
        int idOffset = wordId - idBounds.begin;
        int probe = range.begin + 1 + calculatePivot(idOffset, idBounds.getWidth(), range.getWidth() - 1);
        int probedId = ngram.readNgramWord(probe);
        if (probedId == wordId) {
            return probe;
        }
        if (probedId < wordId) {
            // target lies above the probe: raise the lower bounds
            range.begin = probe;
            idBounds.begin = probedId;
        } else {
            // target lies below the probe: lower the upper bounds
            range.end = probe;
            idBounds.end = probedId;
        }
    }
    return -1;
}
/**
 * Locates the ngram of the given order inside {@code range} and reads its backoff.
 * The lookup is delegated to {@code findNgram}, which modifies {@code range}
 * (successors of the ngram on success, marked not found on failure).
 * @param wordId - word id to look for
 * @param orderMinusTwo - order of the ngram minus two
 * @param range - range to look in; updated by the lookup
 * @param quant - quantization object used to decode the stored backoff
 * @return backoff of the ngram, or 0.0f if the ngram was not found
 */
public float readNgramBackoff(int wordId, int orderMinusTwo, TrieRange range, NgramTrieQuant quant) {
    NgramSet ngramSet = getNgram(orderMinusTwo);
    int ngramIdx = findNgram(ngramSet, wordId, range);
    if (ngramIdx < 0) {
        return 0.0f;
    }
    return quant.readBackoff(bitArr, ngramSet.memPtr, ngramSet.getNgramWeightsOffset(ngramIdx), orderMinusTwo);
}
/**
 * Locates the ngram of the given order inside {@code range} and reads its probability.
 * The lookup is delegated to {@code findNgram}, which modifies {@code range}
 * (successors of the ngram on success, marked not found on failure).
 * @param wordId - word id to look for
 * @param orderMinusTwo - order of the ngram minus two
 * @param range - range to look in; updated by the lookup
 * @param quant - quantization object used to decode the stored probability
 * @return probability of the ngram, or 0.0f if the ngram was not found
 */
public float readNgramProb(int wordId, int orderMinusTwo, TrieRange range, NgramTrieQuant quant) {
    NgramSet ngramSet = getNgram(orderMinusTwo);
    int ngramIdx = findNgram(ngramSet, wordId, range);
    if (ngramIdx < 0) {
        return 0.0f;
    }
    return quant.readProb(bitArr, ngramSet.memPtr, ngramSet.getNgramWeightsOffset(ngramIdx), orderMinusTwo);
}