/**
 * Reads the successor range stored with the ngram at the given index.
 * The begin/end pointers live after the word id and the quantized weight
 * bits inside each packed trie entry.
 *
 * @param ngramIdx index of the ngram whose successor range is read
 * @param range    output parameter filled with the successor range
 */
void readNextRange(int ngramIdx, TrieRange range) {
    // Skip past the word id and the quantized weights of this entry.
    int bitPos = ngramIdx * totalBits + wordBits + getQuantBits();
    range.begin = bitArr.readInt(memPtr, bitPos, nextMask);
    // The matching end pointer sits at the same offset of the next entry.
    range.end = bitArr.readInt(memPtr, bitPos + totalBits, nextMask);
}
/**
 * Returns the backing byte array to which the ngram trie is mapped.
 *
 * @return raw trie memory as a {@code byte[]}
 */
public byte[] getMem() {
    return bitArr.getArr();
}
/**
 * Finds an ngram of a certain order in the specified range and reads its probability.
 * After this call the range holds the ngram's successors; if the ngram was not
 * found the range is invalid.
 *
 * @param wordId - word id to look for
 * @param orderMinusTwo - order of the ngram minus two
 * @param range - range to search; contains the ngram successors afterwards
 * @param quant - quantization object used to decode the compressed probability
 * @return probability of the ngram, or 0.0f when it is absent
 */
public float readNgramProb(int wordId, int orderMinusTwo, TrieRange range, NgramTrieQuant quant) {
    NgramSet ngram = getNgram(orderMinusTwo);
    int ptr = findNgram(ngram, wordId, range);
    if (ptr < 0)
        return 0.0f;
    return quant.readProb(bitArr, ngram.memPtr, ngram.getNgramWeightsOffset(ptr), orderMinusTwo);
}
/** * Reads weights quantation object from stream * @param order - max order of ngrams for this model * @return quantation object, see {@link NgramTrieQuant} * @throws IOException if reading from stream failed */ public NgramTrieQuant readQuant(int order) throws IOException { int quantTypeInt = Utilities.readLittleEndianInt(inStream); if (quantTypeInt < 0 || quantTypeInt >= NgramTrieQuant.QuantType.values().length) throw new Error("Unknown quantatization type: " + quantTypeInt); NgramTrieQuant.QuantType quantType = NgramTrieQuant.QuantType.values()[quantTypeInt]; NgramTrieQuant quant = new NgramTrieQuant(order, quantType); //reading tables for (int i = 2; i <= order; i++) { quant.setTable(readFloatArr(quant.getProbTableLen()), i, true); if (i < order) quant.setTable(readFloatArr(quant.getBackoffTableLen()), i, false); } return quant; }
/**
 * Finds an ngram of a certain order in the specified range and reads its backoff.
 * After this call the range holds the ngram's successors; if the ngram was not
 * found the range is invalid.
 *
 * @param wordId - word id to look for
 * @param orderMinusTwo - order of the ngram minus two
 * @param range - range to search; contains the ngram successors afterwards
 * @param quant - quantization object used to decode the compressed backoff
 * @return backoff of the ngram, or 0.0f when it is absent
 */
public float readNgramBackoff(int wordId, int orderMinusTwo, TrieRange range, NgramTrieQuant quant) {
    NgramSet ngram = getNgram(orderMinusTwo);
    int ptr = findNgram(ngram, wordId, range);
    if (ptr < 0)
        return 0.0f;
    return quant.readBackoff(bitArr, ngram.memPtr, ngram.getNgramWeightsOffset(ptr), orderMinusTwo);
}
/**
 * Interpolation search for the entry with the given word id inside range.
 * Word ids within a range are sorted, so the probe position is estimated
 * from the id's relative position in the shrinking vocabulary span.
 *
 * @return index of the matching entry, or -1 if the word id is not present
 */
private int uniformFind(NgramSet ngram, TrieRange range, int wordId) {
    TrieRange vocab = new TrieRange(0, ngram.maxVocab);
    while (range.getWidth() > 1) {
        int probe = range.begin + 1
                + calculatePivot(wordId - vocab.begin, vocab.getWidth(), range.getWidth() - 1);
        int probeWord = ngram.readNgramWord(probe);
        if (probeWord == wordId)
            return probe;
        if (probeWord < wordId) {
            // Target lies above the probe: tighten the lower bounds.
            range.begin = probe;
            vocab.begin = probeWord;
        } else {
            // Target lies below the probe: tighten the upper bounds.
            range.end = probe;
            vocab.end = probeWord;
        }
    }
    return -1;
}
// NOTE(review): this fragment appears garbled/truncated — the braces are unbalanced
// and statements that look like they belong to different parts of trie construction
// (per-order entry sizing, bit-array allocation, middle/longest ngram set setup) are
// interleaved. Restore the original constructor from version control before editing.
int[] ngramMemSize = new int[counts.length - 1]; for (int i = 1; i <= counts.length - 1; i++) { int entryLen = requiredBits(counts[0]); if (i == counts.length - 1) { } else { entryLen += requiredBits(counts[i + 1]); entryLen += quantProbBoLen; memLen += tmpLen; bitArr = new NgramTrieBitarr(memLen); this.quantProbLen = quantProbLen; this.quantProbBoLen = quantProbBoLen; middles[i - 2] = new MiddleNgramSet(startPtrs[i - 2], quantProbBoLen, counts[i-1], counts[0], counts[i]); longest = new LongestNgramSet(startPtr, quantProbLen, counts[0]); ordersNum = middles.length + 1;
/**
 * Accumulates backoff weights for the part of the word sequence that was NOT
 * matched by {@link #getAvailableProb(WordSequence, TrieRange, float) getAvailableProb}.
 * The number of already-consumed words is given by the field {@code curDepth}.
 *
 * @param wordSequence - full word sequence that is being scored
 * @return summed backoff for the unmatched history words
 */
private float getAvailableBackoff(WordSequence wordSequence) {
    float backoff = 0.0f;
    int wordsNum = wordSequence.size();
    // Start from the word just before the last one (the last word's probability
    // was already taken by the forward pass).
    int wordId = unigramIDMap.get(wordSequence.getWord(wordsNum - 2));
    // Successor range of that unigram: [next of this entry, next of the following entry).
    TrieRange range = new TrieRange(unigrams[wordId].next, unigrams[wordId + 1].next);
    if (curDepth == 1) {
        // Only the unigram matched, so its own backoff applies as well.
        backoff += unigrams[wordId].backoff;
    }
    // Walk further back through the history, adding backoffs of increasing order
    // until an ngram is missing or its successor range becomes unsearchable.
    int sequenceIdx, orderMinusTwo;
    for (sequenceIdx = wordsNum - 3, orderMinusTwo = 0; sequenceIdx >= 0; sequenceIdx--, orderMinusTwo++) {
        int tmpWordId = unigramIDMap.get(wordSequence.getWord(sequenceIdx));
        float tmpBackoff = trie.readNgramBackoff(tmpWordId, orderMinusTwo, range, quant);
        if (!range.getFound())
            break;
        backoff += tmpBackoff;
        if (!range.isSearchable())
            break;
    }
    return backoff;
}
/** * Finds ngram index which corresponds to ngram with specified wordId. * Search is performed in specified range. * Fills range with ngram successors if ngram was found, makes range invalid otherwise. * @param ngramSet - set of ngrams of certain order to look in * @param wordId - word id to look for * @param range - range to look in. range contains ngram successors or is invalid after method usage. * @return ngram index that can be converted into byte offset if ngram was found, -1 otherwise */ private int findNgram(NgramSet ngramSet, int wordId, TrieRange range) { int ptr; range.begin--; if ((ptr = uniformFind(ngramSet, range, wordId)) < 0) { range.setFound(false); return -1; } //read next order ngrams for future searches if (ngramSet instanceof MiddleNgramSet) ((MiddleNgramSet)ngramSet).readNextRange(ptr, range); return ptr; }
/** * Reads encoded probability from provided trie bit array and decodes it into actual value * for specific ngram * @param bitArr - trie bit array * @param memPtr - memory pointer for specific ngram order * @param bitOffset - offset from memPtr that is calculated according to ngram index * @param orderMinusTwo - order of ngram minus two * @return probability of ngram */ public float readProb(NgramTrieBitarr bitArr, int memPtr, int bitOffset, int orderMinusTwo) { switch (quantType) { case NO_QUANT: return bitArr.readNegativeFloat(memPtr, bitOffset); case QUANT_16: int tableIdx = orderMinusTwo * 2; if (tableIdx < tables.length - 1) bitOffset += backoffBits; return binsDecode(tableIdx, bitArr.readInt(memPtr, bitOffset, backoffMask)); //TODO implement different quantization stages default: throw new Error("Unsupported quantization type: " + quantType); } }
/** * Reads encoded backoff from provided trie bit array and decodes it into actual value * for specific ngram * @param bitArr - trie bit array * @param memPtr - memory pointer for specific ngram order * @param bitOffset - offset from memPtr that is calculated according to ngram index * @param orderMinusTwo - order of ngram minus two * @return backoffs of ngram */ public float readBackoff(NgramTrieBitarr bitArr, int memPtr, int bitOffset, int orderMinusTwo) { switch (quantType) { case NO_QUANT: bitOffset += 31; return bitArr.readFloat(memPtr, bitOffset); case QUANT_16: int tableIdx = orderMinusTwo * 2 + 1; return binsDecode(tableIdx, bitArr.readInt(memPtr, bitOffset, probMask)); //TODO implement different quantization stages default: throw new Error("Unsupported quantization type: " + quantType); } }
/** * extracts raw word sequence probability without using caching, * making fresh LM trie traversing * @param wordSequence - sequence of words to get probability for * @return probability of specialized sequence of words */ private float getProbabilityRaw(WordSequence wordSequence) { int wordsNum = wordSequence.size(); int wordId = unigramIDMap.get(wordSequence.getWord(wordsNum - 1)); TrieRange range = new TrieRange(unigrams[wordId].next, unigrams[wordId + 1].next); float prob = unigrams[wordId].prob; curDepth = 1; if (wordsNum == 1) return prob; //find prob of ngrams of higher order if any prob = getAvailableProb(wordSequence, range, prob); if (curDepth < wordsNum) { //use backoff for rest of ngram prob += getAvailableBackoff(wordSequence); } return prob; }
/**
 * Selects the highest-order ngram available for the given word sequence
 * and returns its probability, starting from the supplied unigram probability.
 *
 * @param wordSequence - word sequence to score
 * @param range - range to look the bigram up in
 * @param prob - probability of the unigram
 * @return probability of the highest-order ngram found
 */
private float getAvailableProb(WordSequence wordSequence, TrieRange range, float prob) {
    if (!range.isSearchable())
        return prob;
    int size = wordSequence.size();
    // Extend the match one history word at a time, from bigram upwards.
    for (int orderMinusTwo = 0; orderMinusTwo <= size - 2; orderMinusTwo++) {
        if (orderMinusTwo + 1 == maxDepth)
            break;
        int wordIdx = size - 2 - orderMinusTwo;
        int wordId = unigramIDMap.get(wordSequence.getWord(wordIdx));
        float candidate = trie.readNgramProb(wordId, orderMinusTwo, range, quant);
        if (!range.getFound())
            break;
        // Higher-order match found: it supersedes the lower-order probability.
        prob = candidate;
        curDepth++;
        if (!range.isSearchable())
            break;
    }
    return prob;
}
/**
 * Tells whether this range still contains any entries to search.
 */
boolean isSearchable() {
    return getWidth() > 0;
}
}
/**
 * Reads the word id stored in the trie entry at the given ngram index.
 */
int readNgramWord(int ngramIdx) {
    // The word id occupies the leading wordBits of each packed entry.
    return bitArr.readInt(memPtr, ngramIdx * totalBits, wordMask);
}
/**
 * Creates a loader that reads a binary trie language model from the given location.
 *
 * @param location URL of the binary language model file; must not be null
 * @throws IOException if the stream cannot be opened or reading the model fails
 * @throws NullPointerException if {@code location} is null
 */
public BinaryLoader(URL location) throws IOException {
    // Fail fast with a clear message instead of an unexplained NPE from openStream().
    if (location == null)
        throw new NullPointerException("location must not be null");
    // The stream is intentionally left open: subsequent read* calls consume it.
    loadModelData(location.openStream());
}
/**
 * Called by the lexicon after recognition of an utterance.
 * Clears the score caches and, when logging is enabled, writes an
 * utterance-end marker to the log file.
 */
public void onUtteranceEnd() {
    clearCache();
    if (logFile == null)
        return;
    logFile.println("<END_UTT>");
    logFile.flush();
}
/**
 * Reads the file header from the stream and checks that it matches the
 * expected trie header.
 *
 * @throws IOException if reading from the stream failed
 */
public void verifyHeader() throws IOException {
    String header = readString(inStream, TRIE_HEADER.length());
    if (!header.equals(TRIE_HEADER))
        throw new Error("Bad binary LM file header: " + header);
}
/**
 * Reads the language model order and the per-order ngram counts.
 *
 * @return array of counts indexed by ngram order (0-based)
 * @throws IOException if reading from the stream failed
 */
public int[] readCounts() throws IOException {
    int order = readOrder();
    int[] counts = new int[order];
    for (int i = 0; i < order; i++)
        counts[i] = Utilities.readLittleEndianInt(inStream);
    return counts;
}
/**
 * Reads the array of language model unigrams.
 *
 * @param count - number of unigrams according to the previously read counts
 * @return array of unigrams, see {@link NgramTrieModel.TrieUnigram}; contains
 *         one extra entry so that entry i+1's {@code next} can bound entry i's
 *         successor range
 * @throws IOException if reading from the stream failed
 */
public TrieUnigram[] readUnigrams(int count) throws IOException {
    TrieUnigram[] unigrams = new TrieUnigram[count + 1];
    for (int i = 0; i <= count; i++) {
        TrieUnigram unigram = new TrieUnigram();
        unigram.prob = Utilities.readLittleEndianFloat(inStream);
        unigram.backoff = Utilities.readLittleEndianFloat(inStream);
        unigram.next = Utilities.readLittleEndianInt(inStream);
        unigrams[i] = unigram;
    }
    return unigrams;
}