edu.cmu.sphinx.linguist.language.ngram.trie.NgramTrie Java code examples

// Construct the trie from `counts` and the two entry sizes reported by the quantizer,
// then pass the trie's backing byte array (getMem) to the loader.
// NOTE(review): presumably the loader fills that array with the serialized trie — confirm against the loader.
trie = new NgramTrie(counts, quant.getProbBoSize(), quant.getProbSize());
loader.readTrieByteArr(trie.getMem());

/**
 * Looks up the ngram index that stores the given word id inside the provided range.
 * The range is narrowed in place while searching, so its bounds are modified on return.
 * @param ngram - ngram set whose words are probed
 * @param range - index range to search in; shrunk during the search
 * @param wordId - word id to look for
 * @return index of the matching ngram, -1 if the range shrinks to width 1 or less without a match
 */
private int uniformFind(NgramSet ngram, TrieRange range, int wordId) {
  // Word-id bounds known to enclose the target; tightened together with the index range.
  final TrieRange idBounds = new TrieRange(0, ngram.maxVocab);
  for (;;) {
    if (range.getWidth() <= 1) {
      return -1;
    }
    final int probe = range.begin + 1
        + calculatePivot(wordId - idBounds.begin, idBounds.getWidth(), range.getWidth() - 1);
    final int probedWord = ngram.readNgramWord(probe);
    if (probedWord == wordId) {
      return probe;
    }
    if (probedWord < wordId) {
      // Target lies above the probe: raise the lower bounds.
      range.begin = probe;
      idBounds.begin = probedWord;
    } else {
      // Target lies below the probe: lower the upper bounds.
      range.end = probe;
      idBounds.end = probedWord;
    }
  }
}

/**
 * Finds an ngram of a certain order in the specified range and reads its backoff.
 * The lookup is delegated to findNgram, which updates the range in place.
 * @param wordId - word id to look for
 * @param orderMinusTwo - order of ngram minus two
 * @param range - range to look in; modified by the lookup
 * @param quant - quantization object used to decode the backoff stored in the trie
 * @return backoff of the ngram, or 0.0f when the ngram is not found
 */
public float readNgramBackoff(int wordId, int orderMinusTwo, TrieRange range, NgramTrieQuant quant) {
  final NgramSet ngramSet = getNgram(orderMinusTwo);
  final int ngramIdx = findNgram(ngramSet, wordId, range);
  if (ngramIdx < 0) {
    // No such ngram: contribute a neutral backoff.
    return 0.0f;
  }
  return quant.readBackoff(bitArr, ngramSet.memPtr, ngramSet.getNgramWeightsOffset(ngramIdx), orderMinusTwo);
}

/**
 * Accumulates backoffs for the history part of a word sequence, i.e. the part
 * not consumed by {@link #getAvailableProb(WordSequence, TrieRange, float) getAvailableProb}.
 * The unigram backoff of the second-to-last word is included only when curDepth equals 1.
 * @param wordSequence - full word sequence that is scored
 * @return sum of the backoffs that could be read
 */
private float getAvailableBackoff(WordSequence wordSequence) {
  final int wordsNum = wordSequence.size();
  final int historyWordId = unigramIDMap.get(wordSequence.getWord(wordsNum - 2));
  final TrieRange range =
      new TrieRange(unigrams[historyWordId].next, unigrams[historyWordId + 1].next);

  float total = 0.0f;
  if (curDepth == 1) {
    total += unigrams[historyWordId].backoff;
  }

  // Walk the remaining history backwards; the ngram order grows as the index decreases.
  final int firstIdx = wordsNum - 3;
  for (int idx = firstIdx; idx >= 0; idx--) {
    final int orderMinusTwo = firstIdx - idx;
    final int historyId = unigramIDMap.get(wordSequence.getWord(idx));
    final float step = trie.readNgramBackoff(historyId, orderMinusTwo, range, quant);
    if (!range.getFound()) {
      return total;
    }
    total += step;
    if (!range.isSearchable()) {
      return total;
    }
  }
  return total;
}

/**
 * Locates the index of the ngram with the specified word id inside the given range.
 * The range's begin is decremented before searching and the range is narrowed by the search.
 * On a hit in a MiddleNgramSet the range is refilled via readNextRange for the next lookup;
 * on a miss the range is flagged as not found.
 * @param ngramSet - set of ngrams of certain order to look in
 * @param wordId - word id to look for
 * @param range - range to look in; modified in place as described above
 * @return ngram index if the ngram was found, -1 otherwise
 */
private int findNgram(NgramSet ngramSet, int wordId, TrieRange range) {
  range.begin -= 1;
  final int ngramIdx = uniformFind(ngramSet, range, wordId);
  if (ngramIdx >= 0) {
    // Prepare the range of next-order ngrams for future searches.
    if (ngramSet instanceof MiddleNgramSet) {
      final MiddleNgramSet middleSet = (MiddleNgramSet) ngramSet;
      middleSet.readNextRange(ngramIdx, range);
    }
    return ngramIdx;
  }
  range.setFound(false);
  return -1;
}

/**
 * Walks from the bigram upwards and returns the probability of the highest order
 * ngram that can be found for the word sequence. curDepth is incremented once for
 * every order that was found.
 * @param wordSequence - word sequence to score
 * @param range - range to look the bigram in; updated by each lookup
 * @param prob - probability of unigram, returned unchanged if no higher order is found
 * @return probability of the highest order ngram available
 */
private float getAvailableProb(WordSequence wordSequence, TrieRange range, float prob) {
  if (!range.isSearchable()) {
    return prob;
  }
  float best = prob;
  // Index of the word preceding the last one; lookups move backwards from here.
  final int lastHistoryIdx = wordSequence.size() - 2;
  for (int orderMinusTwo = 0; orderMinusTwo <= lastHistoryIdx; orderMinusTwo++) {
    if (orderMinusTwo + 1 == maxDepth) {
      return best;
    }
    final int historyWordId = unigramIDMap.get(wordSequence.getWord(lastHistoryIdx - orderMinusTwo));
    final float candidate = trie.readNgramProb(historyWordId, orderMinusTwo, range, quant);
    if (!range.getFound()) {
      return best;
    }
    best = candidate;
    curDepth++;
    if (!range.isSearchable()) {
      return best;
    }
  }
  return best;
}

int[] ngramMemSize = new int[counts.length - 1];
for (int i = 1; i <= counts.length - 1; i++) {
  int entryLen = requiredBits(counts[0]);
  if (i == counts.length - 1) {
  } else {
    entryLen += requiredBits(counts[i + 1]);
    entryLen += quantProbBoLen;

/**
 * Accumulates backoffs for the history part of a word sequence, i.e. the part
 * not consumed by {@link #getAvailableProb(WordSequence, TrieRange, float) getAvailableProb}.
 * The unigram backoff of the second-to-last word is included only when curDepth equals 1.
 * @param wordSequence - full word sequence that is scored
 * @return sum of the backoffs that could be read
 */
private float getAvailableBackoff(WordSequence wordSequence) {
  final int wordsNum = wordSequence.size();
  final int historyWordId = unigramIDMap.get(wordSequence.getWord(wordsNum - 2));
  final TrieRange range =
      new TrieRange(unigrams[historyWordId].next, unigrams[historyWordId + 1].next);

  float total = 0.0f;
  if (curDepth == 1) {
    total += unigrams[historyWordId].backoff;
  }

  // Walk the remaining history backwards; the ngram order grows as the index decreases.
  final int firstIdx = wordsNum - 3;
  for (int idx = firstIdx; idx >= 0; idx--) {
    final int orderMinusTwo = firstIdx - idx;
    final int historyId = unigramIDMap.get(wordSequence.getWord(idx));
    final float step = trie.readNgramBackoff(historyId, orderMinusTwo, range, quant);
    if (!range.getFound()) {
      return total;
    }
    total += step;
    if (!range.isSearchable()) {
      return total;
    }
  }
  return total;
}

/**
 * Locates the index of the ngram with the specified word id inside the given range.
 * The range's begin is decremented before searching and the range is narrowed by the search.
 * On a hit in a MiddleNgramSet the range is refilled via readNextRange for the next lookup;
 * on a miss the range is flagged as not found.
 * @param ngramSet - set of ngrams of certain order to look in
 * @param wordId - word id to look for
 * @param range - range to look in; modified in place as described above
 * @return ngram index if the ngram was found, -1 otherwise
 */
private int findNgram(NgramSet ngramSet, int wordId, TrieRange range) {
  range.begin -= 1;
  final int ngramIdx = uniformFind(ngramSet, range, wordId);
  if (ngramIdx >= 0) {
    // Prepare the range of next-order ngrams for future searches.
    if (ngramSet instanceof MiddleNgramSet) {
      final MiddleNgramSet middleSet = (MiddleNgramSet) ngramSet;
      middleSet.readNextRange(ngramIdx, range);
    }
    return ngramIdx;
  }
  range.setFound(false);
  return -1;
}

/**
 * Walks from the bigram upwards and returns the probability of the highest order
 * ngram that can be found for the word sequence. curDepth is incremented once for
 * every order that was found.
 * @param wordSequence - word sequence to score
 * @param range - range to look the bigram in; updated by each lookup
 * @param prob - probability of unigram, returned unchanged if no higher order is found
 * @return probability of the highest order ngram available
 */
private float getAvailableProb(WordSequence wordSequence, TrieRange range, float prob) {
  if (!range.isSearchable()) {
    return prob;
  }
  float best = prob;
  // Index of the word preceding the last one; lookups move backwards from here.
  final int lastHistoryIdx = wordSequence.size() - 2;
  for (int orderMinusTwo = 0; orderMinusTwo <= lastHistoryIdx; orderMinusTwo++) {
    if (orderMinusTwo + 1 == maxDepth) {
      return best;
    }
    final int historyWordId = unigramIDMap.get(wordSequence.getWord(lastHistoryIdx - orderMinusTwo));
    final float candidate = trie.readNgramProb(historyWordId, orderMinusTwo, range, quant);
    if (!range.getFound()) {
      return best;
    }
    best = candidate;
    curDepth++;
    if (!range.isSearchable()) {
      return best;
    }
  }
  return best;
}

int[] ngramMemSize = new int[counts.length - 1];
for (int i = 1; i <= counts.length - 1; i++) {
  int entryLen = requiredBits(counts[0]);
  if (i == counts.length - 1) {
  } else {
    entryLen += requiredBits(counts[i + 1]);
    entryLen += quantProbBoLen;

/**
 * Finds an ngram of a certain order in the specified range and reads its probability.
 * The lookup is delegated to findNgram, which updates the range in place.
 * @param wordId - word id to look for
 * @param orderMinusTwo - order of ngram minus two
 * @param range - range to look in; modified by the lookup
 * @param quant - quantization object used to decode the probability stored in the trie
 * @return probability of the ngram, or 0.0f when the ngram is not found
 */
public float readNgramProb(int wordId, int orderMinusTwo, TrieRange range, NgramTrieQuant quant) {
  final NgramSet ngramSet = getNgram(orderMinusTwo);
  final int ngramIdx = findNgram(ngramSet, wordId, range);
  if (ngramIdx < 0) {
    // No such ngram: callers check the range's found flag rather than this value.
    return 0.0f;
  }
  return quant.readProb(bitArr, ngramSet.memPtr, ngramSet.getNgramWeightsOffset(ngramIdx), orderMinusTwo);
}

// Construct the trie from `counts` and the two entry sizes reported by the quantizer,
// then pass the trie's backing byte array (getMem) to the loader.
// NOTE(review): presumably the loader fills that array with the serialized trie — confirm against the loader.
trie = new NgramTrie(counts, quant.getProbBoSize(), quant.getProbSize());
loader.readTrieByteArr(trie.getMem());

/**
 * Looks up the ngram index that stores the given word id inside the provided range.
 * The range is narrowed in place while searching, so its bounds are modified on return.
 * @param ngram - ngram set whose words are probed
 * @param range - index range to search in; shrunk during the search
 * @param wordId - word id to look for
 * @return index of the matching ngram, -1 if the range shrinks to width 1 or less without a match
 */
private int uniformFind(NgramSet ngram, TrieRange range, int wordId) {
  // Word-id bounds known to enclose the target; tightened together with the index range.
  final TrieRange idBounds = new TrieRange(0, ngram.maxVocab);
  for (;;) {
    if (range.getWidth() <= 1) {
      return -1;
    }
    final int probe = range.begin + 1
        + calculatePivot(wordId - idBounds.begin, idBounds.getWidth(), range.getWidth() - 1);
    final int probedWord = ngram.readNgramWord(probe);
    if (probedWord == wordId) {
      return probe;
    }
    if (probedWord < wordId) {
      // Target lies above the probe: raise the lower bounds.
      range.begin = probe;
      idBounds.begin = probedWord;
    } else {
      // Target lies below the probe: lower the upper bounds.
      range.end = probe;
      idBounds.end = probedWord;
    }
  }
}

/**
 * Finds an ngram of a certain order in the specified range and reads its backoff.
 * The lookup is delegated to findNgram, which updates the range in place.
 * @param wordId - word id to look for
 * @param orderMinusTwo - order of ngram minus two
 * @param range - range to look in; modified by the lookup
 * @param quant - quantization object used to decode the backoff stored in the trie
 * @return backoff of the ngram, or 0.0f when the ngram is not found
 */
public float readNgramBackoff(int wordId, int orderMinusTwo, TrieRange range, NgramTrieQuant quant) {
  final NgramSet ngramSet = getNgram(orderMinusTwo);
  final int ngramIdx = findNgram(ngramSet, wordId, range);
  if (ngramIdx < 0) {
    // No such ngram: contribute a neutral backoff.
    return 0.0f;
  }
  return quant.readBackoff(bitArr, ngramSet.memPtr, ngramSet.getNgramWeightsOffset(ngramIdx), orderMinusTwo);
}

/**
 * Finds an ngram of a certain order in the specified range and reads its probability.
 * The lookup is delegated to findNgram, which updates the range in place.
 * @param wordId - word id to look for
 * @param orderMinusTwo - order of ngram minus two
 * @param range - range to look in; modified by the lookup
 * @param quant - quantization object used to decode the probability stored in the trie
 * @return probability of the ngram, or 0.0f when the ngram is not found
 */
public float readNgramProb(int wordId, int orderMinusTwo, TrieRange range, NgramTrieQuant quant) {
  final NgramSet ngramSet = getNgram(orderMinusTwo);
  final int ngramIdx = findNgram(ngramSet, wordId, range);
  if (ngramIdx < 0) {
    // No such ngram: callers check the range's found flag rather than this value.
    return 0.0f;
  }
  return quant.readProb(bitArr, ngramSet.memPtr, ngramSet.getNgramWeightsOffset(ngramIdx), orderMinusTwo);
}

Javadoc

Trie structure that contains ngrams of order 2+ in reversed order. Ngrams are stored in bit array for space efficiency.

Most used methods

<init>
calculatePivot
Calculates pivot for binary search
findNgram
Finds ngram index which corresponds to ngram with specified wordId. Search is performed in specified range.
getMem
Getter for allocated byte array to which trie is mapped
getNgram
Getter for ngram set by ngram order
readNgramBackoff
Finds ngram of certain order in specified range and reads its backoff. Range contains ngram successors after function execution.
readNgramProb
Finds ngram of certain order in specified range and reads its probability. Range contains ngram successors after function execution.
requiredBits
Calculates minimum amount of bits to store provided int
uniformFind
Searches ngram index for given wordId in provided range

Popular in Java

Creating JSON documents from java classes using gson
addToBackStack (FragmentTransaction)
runOnUiThread (Activity)
getSystemService (Context)
ByteBuffer (java.nio)
A buffer for bytes. A byte buffer can be created in either one of the following ways: * #allocate
SortedMap (java.util)
A map that has its keys ordered. The sorting is according to either the natural ordering of its keys
BlockingQueue (java.util.concurrent)
A java.util.Queue that additionally supports operations that wait for the queue to become non-empty
ExecutorService (java.util.concurrent)
An Executor that provides methods to manage termination and methods that can produce a Future for tr
ZipFile (java.util.zip)
This class provides random read access to a zip file. You pay more to read the zip file's central di
Container (java.awt)
A generic Abstract Window Toolkit(AWT) container object is a component that can contain other AWT co
From CI to AI: The AI layer in your organization

How to use NgramTrie in edu.cmu.sphinx.linguist.language.ngram.trie

Best Java code snippets using edu.cmu.sphinx.linguist.language.ngram.trie.NgramTrie (Showing top 16 results out of 315)

How to use
NgramTrie
in
edu.cmu.sphinx.linguist.language.ngram.trie