/** * extracts raw word sequence probability without using caching, * making fresh LM trie traversing * @param wordSequence - sequence of words to get probability for * @return probability of specialized sequence of words */ private float getProbabilityRaw(WordSequence wordSequence) { int wordsNum = wordSequence.size(); int wordId = unigramIDMap.get(wordSequence.getWord(wordsNum - 1)); TrieRange range = new TrieRange(unigrams[wordId].next, unigrams[wordId + 1].next); float prob = unigrams[wordId].prob; curDepth = 1; if (wordsNum == 1) return prob; //find prob of ngrams of higher order if any prob = getAvailableProb(wordSequence, range, prob); if (curDepth < wordsNum) { //use backoff for rest of ngram prob += getAvailableBackoff(wordSequence); } return prob; }
/**
 * Sums the backoff weights of the contexts that were not covered by the
 * n-gram matched in
 * {@link #getAvailableProb(WordSequence, TrieRange, float) getAvailableProb}.
 * The order of that matched n-gram is taken from the shared variable
 * {@code curDepth}: a context contributes its backoff only when its length
 * is at least {@code curDepth}. Contexts shorter than that are already
 * accounted for by the matched n-gram probability and must not be added again.
 * @param wordSequence - full word sequence that is scored; must hold at least two words
 * @return accumulated backoff for the unmatched part of the sequence
 */
private float getAvailableBackoff(WordSequence wordSequence) {
    float backoff = 0.0f;
    int wordsNum = wordSequence.size();
    // the context starts at the word right before the predicted (last) one
    int wordId = unigramIDMap.get(wordSequence.getWord(wordsNum - 2));
    TrieRange range = new TrieRange(unigrams[wordId].next, unigrams[wordId + 1].next);
    // context of length 1 is needed only when nothing above the unigram was matched
    if (curDepth == 1) {
        backoff += unigrams[wordId].backoff;
    }
    for (int sequenceIdx = wordsNum - 3, orderMinusTwo = 0;
            sequenceIdx >= 0;
            sequenceIdx--, orderMinusTwo++) {
        int tmpWordId = unigramIDMap.get(wordSequence.getWord(sequenceIdx));
        // always perform the lookup, even when the value is skipped below:
        // presumably it moves range on to the next order - verify against the trie reader
        float tmpBackoff = trie.readNgramBackoff(tmpWordId, orderMinusTwo, range, quant);
        if (!range.getFound()) {
            break;
        }
        // this entry is a context of length orderMinusTwo + 2
        int contextLength = orderMinusTwo + 2;
        if (contextLength >= curDepth) {
            backoff += tmpBackoff;
        }
        if (!range.isSearchable()) {
            break;
        }
    }
    return backoff;
}
/**
 * Sums the backoff weights of the contexts that were not covered by the
 * n-gram matched in
 * {@link #getAvailableProb(WordSequence, TrieRange, float) getAvailableProb}.
 * The order of that matched n-gram is taken from the shared variable
 * {@code curDepth}: a context contributes its backoff only when its length
 * is at least {@code curDepth}. Contexts shorter than that are already
 * accounted for by the matched n-gram probability and must not be added again.
 * @param wordSequence - full word sequence that is scored; must hold at least two words
 * @return accumulated backoff for the unmatched part of the sequence
 */
private float getAvailableBackoff(WordSequence wordSequence) {
    float backoff = 0.0f;
    int wordsNum = wordSequence.size();
    // the context starts at the word right before the predicted (last) one
    int wordId = unigramIDMap.get(wordSequence.getWord(wordsNum - 2));
    TrieRange range = new TrieRange(unigrams[wordId].next, unigrams[wordId + 1].next);
    // context of length 1 is needed only when nothing above the unigram was matched
    if (curDepth == 1) {
        backoff += unigrams[wordId].backoff;
    }
    for (int sequenceIdx = wordsNum - 3, orderMinusTwo = 0;
            sequenceIdx >= 0;
            sequenceIdx--, orderMinusTwo++) {
        int tmpWordId = unigramIDMap.get(wordSequence.getWord(sequenceIdx));
        // always perform the lookup, even when the value is skipped below:
        // presumably it moves range on to the next order - verify against the trie reader
        float tmpBackoff = trie.readNgramBackoff(tmpWordId, orderMinusTwo, range, quant);
        if (!range.getFound()) {
            break;
        }
        // this entry is a context of length orderMinusTwo + 2
        int contextLength = orderMinusTwo + 2;
        if (contextLength >= curDepth) {
            backoff += tmpBackoff;
        }
        if (!range.isSearchable()) {
            break;
        }
    }
    return backoff;
}
/** * extracts raw word sequence probability without using caching, * making fresh LM trie traversing * @param wordSequence - sequence of words to get probability for * @return probability of specialized sequence of words */ private float getProbabilityRaw(WordSequence wordSequence) { int wordsNum = wordSequence.size(); int wordId = unigramIDMap.get(wordSequence.getWord(wordsNum - 1)); TrieRange range = new TrieRange(unigrams[wordId].next, unigrams[wordId + 1].next); float prob = unigrams[wordId].prob; curDepth = 1; if (wordsNum == 1) return prob; //find prob of ngrams of higher order if any prob = getAvailableProb(wordSequence, range, prob); if (curDepth < wordsNum) { //use backoff for rest of ngram prob += getAvailableBackoff(wordSequence); } return prob; }
/** * Finds ngram index which corresponds to ngram with specified wordId. * Search is performed in specified range. * Fills range with ngram successors if ngram was found, makes range invalid otherwise. * @param ngramSet - set of ngrams of certain order to look in * @param wordId - word id to look for * @param range - range to look in. range contains ngram successors or is invalid after method usage. * @return ngram index that can be converted into byte offset if ngram was found, -1 otherwise */ private int findNgram(NgramSet ngramSet, int wordId, TrieRange range) { int ptr; range.begin--; if ((ptr = uniformFind(ngramSet, range, wordId)) < 0) { range.setFound(false); return -1; } //read next order ngrams for future searches if (ngramSet instanceof MiddleNgramSet) ((MiddleNgramSet)ngramSet).readNextRange(ptr, range); return ptr; }
/**
 * Walks from the bigram upwards, looking for the highest order ngram that
 * exists for the given word sequence, and returns its probability.
 * Every successful step increments {@code curDepth}.
 * @param wordSequence - word sequence to score
 * @param range - range to look bigram in; updated by each lookup
 * @param prob - probability of unigram, returned as is when nothing higher is found
 * @return probability of the highest order ngram available
 */
private float getAvailableProb(WordSequence wordSequence, TrieRange range, float prob) {
    if (!range.isSearchable()) {
        return prob;
    }
    // index of the word right before the last one; the walk moves towards index 0
    final int firstContextIdx = wordSequence.size() - 2;
    float best = prob;
    for (int orderMinusTwo = 0;
            orderMinusTwo <= firstContextIdx && orderMinusTwo + 1 != maxDepth;
            orderMinusTwo++) {
        int contextWordId = unigramIDMap.get(wordSequence.getWord(firstContextIdx - orderMinusTwo));
        float candidate = trie.readNgramProb(contextWordId, orderMinusTwo, range, quant);
        if (!range.getFound()) {
            // no ngram of this order: keep the probability found so far
            break;
        }
        best = candidate;
        curDepth++;
        if (!range.isSearchable()) {
            break;
        }
    }
    return best;
}
/**
 * Searches the provided range for the ngram whose word id equals wordId.
 * The probe position is chosen by {@code calculatePivot} from the current
 * word id bounds and the range width; both are narrowed after every miss.
 * The range passed in is modified by the search.
 * @return index of the matching ngram, -1 if the range is exhausted without a match
 */
private int uniformFind(NgramSet ngram, TrieRange range, int wordId) {
    // bounds on the word ids that can still occur inside the remaining range
    TrieRange idBounds = new TrieRange(0, ngram.maxVocab);
    while (range.getWidth() > 1) {
        int offset = calculatePivot(
                wordId - idBounds.begin, idBounds.getWidth(), range.getWidth() - 1);
        int probe = range.begin + 1 + offset;
        int probedId = ngram.readNgramWord(probe);
        if (probedId == wordId) {
            return probe;
        }
        if (probedId < wordId) {
            // target lies above the probe
            range.begin = probe;
            idBounds.begin = probedId;
        } else {
            // target lies below the probe
            range.end = probe;
            idBounds.end = probedId;
        }
    }
    return -1;
}
/**
 * Tells whether there is anything left to search in this range.
 * @return true when {@code getWidth()} is greater than zero
 */
boolean isSearchable() { return getWidth() > 0; } }
/** * Finds ngram index which corresponds to ngram with specified wordId. * Search is performed in specified range. * Fills range with ngram successors if ngram was found, makes range invalid otherwise. * @param ngramSet - set of ngrams of certain order to look in * @param wordId - word id to look for * @param range - range to look in. range contains ngram successors or is invalid after method usage. * @return ngram index that can be converted into byte offset if ngram was found, -1 otherwise */ private int findNgram(NgramSet ngramSet, int wordId, TrieRange range) { int ptr; range.begin--; if ((ptr = uniformFind(ngramSet, range, wordId)) < 0) { range.setFound(false); return -1; } //read next order ngrams for future searches if (ngramSet instanceof MiddleNgramSet) ((MiddleNgramSet)ngramSet).readNextRange(ptr, range); return ptr; }
/**
 * Walks from the bigram upwards, looking for the highest order ngram that
 * exists for the given word sequence, and returns its probability.
 * Every successful step increments {@code curDepth}.
 * @param wordSequence - word sequence to score
 * @param range - range to look bigram in; updated by each lookup
 * @param prob - probability of unigram, returned as is when nothing higher is found
 * @return probability of the highest order ngram available
 */
private float getAvailableProb(WordSequence wordSequence, TrieRange range, float prob) {
    if (!range.isSearchable()) {
        return prob;
    }
    // index of the word right before the last one; the walk moves towards index 0
    final int firstContextIdx = wordSequence.size() - 2;
    float best = prob;
    for (int orderMinusTwo = 0;
            orderMinusTwo <= firstContextIdx && orderMinusTwo + 1 != maxDepth;
            orderMinusTwo++) {
        int contextWordId = unigramIDMap.get(wordSequence.getWord(firstContextIdx - orderMinusTwo));
        float candidate = trie.readNgramProb(contextWordId, orderMinusTwo, range, quant);
        if (!range.getFound()) {
            // no ngram of this order: keep the probability found so far
            break;
        }
        best = candidate;
        curDepth++;
        if (!range.isSearchable()) {
            break;
        }
    }
    return best;
}
/**
 * Searches the provided range for the ngram whose word id equals wordId.
 * The probe position is chosen by {@code calculatePivot} from the current
 * word id bounds and the range width; both are narrowed after every miss.
 * The range passed in is modified by the search.
 * @return index of the matching ngram, -1 if the range is exhausted without a match
 */
private int uniformFind(NgramSet ngram, TrieRange range, int wordId) {
    // bounds on the word ids that can still occur inside the remaining range
    TrieRange idBounds = new TrieRange(0, ngram.maxVocab);
    while (range.getWidth() > 1) {
        int offset = calculatePivot(
                wordId - idBounds.begin, idBounds.getWidth(), range.getWidth() - 1);
        int probe = range.begin + 1 + offset;
        int probedId = ngram.readNgramWord(probe);
        if (probedId == wordId) {
            return probe;
        }
        if (probedId < wordId) {
            // target lies above the probe
            range.begin = probe;
            idBounds.begin = probedId;
        } else {
            // target lies below the probe
            range.end = probe;
            idBounds.end = probedId;
        }
    }
    return -1;
}
/**
 * Tells whether there is anything left to search in this range.
 * @return true when {@code getWidth()} is greater than zero
 */
boolean isSearchable() { return getWidth() > 0; } }