/**
 * Sums the back-off weights of the context (history) n-grams for the part of the
 * word sequence that was not matched by
 * {@link #getAvailableProb(WordSequence, TrieRange, float) getAvailableProb}.
 * How much was matched is communicated through the {@code curDepth} member.
 *
 * <p>The context starts at the second-to-last word; its unigram back-off is added
 * only when {@code curDepth == 1}. The context is then extended one word at a time
 * towards the beginning of the sequence, adding each back-off that
 * {@code trie.readNgramBackoff} finds, until a lookup misses or the range can no
 * longer be searched.
 *
 * @param wordSequence full word sequence that is being scored; must contain at
 *                     least two words, since index {@code size() - 2} is read
 * @return accumulated back-off weight
 */
private float getAvailableBackoff(WordSequence wordSequence) {
    final int length = wordSequence.size();
    final int contextWordId = unigramIDMap.get(wordSequence.getWord(length - 2));
    // Range handed to (and updated by) each readNgramBackoff call below.
    final TrieRange contextRange =
            new TrieRange(unigrams[contextWordId].next, unigrams[contextWordId + 1].next);

    float total = 0.0f;
    if (curDepth == 1) {
        total += unigrams[contextWordId].backoff;
    }

    // NOTE(review): only the unigram term above is gated on curDepth; every longer
    // context back-off that is found is added regardless of curDepth. Confirm this
    // is the intended behaviour when curDepth > 2.
    final int firstContextIdx = length - 3;
    for (int pos = firstContextIdx; pos >= 0; pos--) {
        // 0 for the first extension, growing by one per additional context word.
        final int orderMinusTwo = firstContextIdx - pos;
        final int extensionWordId = unigramIDMap.get(wordSequence.getWord(pos));
        final float contextBackoff =
                trie.readNgramBackoff(extensionWordId, orderMinusTwo, contextRange, quant);
        if (!contextRange.getFound()) {
            break;
        }
        total += contextBackoff;
        if (!contextRange.isSearchable()) {
            break;
        }
    }
    return total;
}
/** * extracts raw word sequence probability without using caching, * making fresh LM trie traversing * @param wordSequence - sequence of words to get probability for * @return probability of specialized sequence of words */ private float getProbabilityRaw(WordSequence wordSequence) { int wordsNum = wordSequence.size(); int wordId = unigramIDMap.get(wordSequence.getWord(wordsNum - 1)); TrieRange range = new TrieRange(unigrams[wordId].next, unigrams[wordId + 1].next); float prob = unigrams[wordId].prob; curDepth = 1; if (wordsNum == 1) return prob; //find prob of ngrams of higher order if any prob = getAvailableProb(wordSequence, range, prob); if (curDepth < wordsNum) { //use backoff for rest of ngram prob += getAvailableBackoff(wordSequence); } return prob; }
/**
 * Sums the back-off weights of the context (history) n-grams for the part of the
 * word sequence that was not matched by
 * {@link #getAvailableProb(WordSequence, TrieRange, float) getAvailableProb}.
 * How much was matched is communicated through the {@code curDepth} member.
 *
 * <p>The context starts at the second-to-last word; its unigram back-off is added
 * only when {@code curDepth == 1}. The context is then extended one word at a time
 * towards the beginning of the sequence, adding each back-off that
 * {@code trie.readNgramBackoff} finds, until a lookup misses or the range can no
 * longer be searched.
 *
 * @param wordSequence full word sequence that is being scored; must contain at
 *                     least two words, since index {@code size() - 2} is read
 * @return accumulated back-off weight
 */
private float getAvailableBackoff(WordSequence wordSequence) {
    final int length = wordSequence.size();
    final int contextWordId = unigramIDMap.get(wordSequence.getWord(length - 2));
    // Range handed to (and updated by) each readNgramBackoff call below.
    final TrieRange contextRange =
            new TrieRange(unigrams[contextWordId].next, unigrams[contextWordId + 1].next);

    float total = 0.0f;
    if (curDepth == 1) {
        total += unigrams[contextWordId].backoff;
    }

    // NOTE(review): only the unigram term above is gated on curDepth; every longer
    // context back-off that is found is added regardless of curDepth. Confirm this
    // is the intended behaviour when curDepth > 2.
    final int firstContextIdx = length - 3;
    for (int pos = firstContextIdx; pos >= 0; pos--) {
        // 0 for the first extension, growing by one per additional context word.
        final int orderMinusTwo = firstContextIdx - pos;
        final int extensionWordId = unigramIDMap.get(wordSequence.getWord(pos));
        final float contextBackoff =
                trie.readNgramBackoff(extensionWordId, orderMinusTwo, contextRange, quant);
        if (!contextRange.getFound()) {
            break;
        }
        total += contextBackoff;
        if (!contextRange.isSearchable()) {
            break;
        }
    }
    return total;
}
/** * extracts raw word sequence probability without using caching, * making fresh LM trie traversing * @param wordSequence - sequence of words to get probability for * @return probability of specialized sequence of words */ private float getProbabilityRaw(WordSequence wordSequence) { int wordsNum = wordSequence.size(); int wordId = unigramIDMap.get(wordSequence.getWord(wordsNum - 1)); TrieRange range = new TrieRange(unigrams[wordId].next, unigrams[wordId + 1].next); float prob = unigrams[wordId].prob; curDepth = 1; if (wordsNum == 1) return prob; //find prob of ngrams of higher order if any prob = getAvailableProb(wordSequence, range, prob); if (curDepth < wordsNum) { //use backoff for rest of ngram prob += getAvailableBackoff(wordSequence); } return prob; }
/**
 * Searches the n-gram set for the entry whose word id equals {@code wordId},
 * looking only at indices inside {@code range}.
 *
 * <p>Each step asks {@code calculatePivot} for a probe offset based on where
 * {@code wordId} lies between the word ids currently bracketing the search, then
 * narrows both the index range and the word-id bracket around the probe. The
 * search stops when the index range width drops to 1 or below.
 *
 * <p>{@code range} is modified in place while searching.
 *
 * @param ngram  n-gram set to read word ids from
 * @param range  index range to search; probing starts at {@code range.begin + 1}
 *               plus the computed offset, so {@code begin} presumably acts as an
 *               exclusive lower bound - TODO confirm against callers
 * @param wordId word id to look for
 * @return index of the matching entry, or {@code -1} if none was found
 */
private int uniformFind(NgramSet ngram, TrieRange range, int wordId) {
    // Word ids known to bracket the target; tightened together with 'range'.
    final TrieRange idBracket = new TrieRange(0, ngram.maxVocab);
    while (range.getWidth() > 1) {
        final int probe = range.begin + 1
                + calculatePivot(wordId - idBracket.begin, idBracket.getWidth(), range.getWidth() - 1);
        final int probedId = ngram.readNgramWord(probe);
        if (probedId == wordId) {
            return probe;
        }
        if (probedId < wordId) {
            // Target lies above the probe: raise the lower bounds.
            range.begin = probe;
            idBracket.begin = probedId;
        } else {
            // Target lies below the probe: lower the upper bounds.
            range.end = probe;
            idBracket.end = probedId;
        }
    }
    return -1;
}
/**
 * Searches the n-gram set for the entry whose word id equals {@code wordId},
 * looking only at indices inside {@code range}.
 *
 * <p>Each step asks {@code calculatePivot} for a probe offset based on where
 * {@code wordId} lies between the word ids currently bracketing the search, then
 * narrows both the index range and the word-id bracket around the probe. The
 * search stops when the index range width drops to 1 or below.
 *
 * <p>{@code range} is modified in place while searching.
 *
 * @param ngram  n-gram set to read word ids from
 * @param range  index range to search; probing starts at {@code range.begin + 1}
 *               plus the computed offset, so {@code begin} presumably acts as an
 *               exclusive lower bound - TODO confirm against callers
 * @param wordId word id to look for
 * @return index of the matching entry, or {@code -1} if none was found
 */
private int uniformFind(NgramSet ngram, TrieRange range, int wordId) {
    // Word ids known to bracket the target; tightened together with 'range'.
    final TrieRange idBracket = new TrieRange(0, ngram.maxVocab);
    while (range.getWidth() > 1) {
        final int probe = range.begin + 1
                + calculatePivot(wordId - idBracket.begin, idBracket.getWidth(), range.getWidth() - 1);
        final int probedId = ngram.readNgramWord(probe);
        if (probedId == wordId) {
            return probe;
        }
        if (probedId < wordId) {
            // Target lies above the probe: raise the lower bounds.
            range.begin = probe;
            idBracket.begin = probedId;
        } else {
            // Target lies below the probe: lower the upper bounds.
            range.end = probe;
            idBracket.end = probedId;
        }
    }
    return -1;
}