/**
 * Creates the n-gram map implementation selected by {@code compress}.
 *
 * @param <V> type of the values stored in the map
 * @param opts configuration options handed to whichever map is built
 * @param numNgramsForEachWord passed only to the hash-based map factory
 * @param numNgramsForEachOrder passed only to the compressed map constructor
 * @param reversed passed only to the hash-based map factory
 * @param values value container; cast to {@code CompressibleValueContainer<V>} when
 *            {@code compress} is {@code true}, so it must be of that type in that case
 * @param compress {@code true} to build a {@code CompressedNgramMap}, {@code false} to
 *            build an implicit-word {@code HashNgramMap}
 * @return the newly created map
 */
private static <V> AbstractNgramMap<V> createNgramMap(final ConfigOptions opts, final LongArray[] numNgramsForEachWord,
        final long[] numNgramsForEachOrder, final boolean reversed, final ValueContainer<V> values, final boolean compress) {
    if (compress) {
        final CompressibleValueContainer<V> compressibleValues = (CompressibleValueContainer<V>) values;
        return new CompressedNgramMap<V>(compressibleValues, numNgramsForEachOrder, opts);
    }
    return HashNgramMap.createImplicitWordHashNgramMap(values, opts, numNgramsForEachWord, reversed);
}
/**
 * Looks up an entry either by key or by offset.
 *
 * For order 0 nothing is decompressed: the word index is range-checked against
 * {@code maps[0].size()} and used directly. For higher orders the block that may hold the
 * entry is located with {@link #binarySearchBlocks} and then scanned with
 * {@code decompressLinearSearch}.
 *
 * @param compressed compressed key array for this order; may be {@code null} (only
 *            consulted when {@code ngramOrder != 0})
 * @param searchKey key to look for; for order 0 a non-negative value selects lookup by key
 * @param ngramOrder zero-based n-gram order
 * @param outputVal if non-null, receives the value of the entry (order 0) or is forwarded
 *            to the linear search (higher orders)
 * @param searchOffset offset to look for; for order 0 it is used as the word index when
 *            {@code searchKey} is negative
 * @return -1 if nothing was found; otherwise, for order 0, the word index (lookup by key)
 *         or the combined key (lookup by offset), and for higher orders whatever
 *         {@code decompressLinearSearch} returns
 */
private long decompressSearch(final LongArray compressed, final long searchKey, final int ngramOrder, final T outputVal,
        final long searchOffset) {
    if (ngramOrder != 0) {
        if (compressed == null) {
            return -1;
        }
        final long lastBlock = (compressed.size() / compressedBlockSize) - 1;
        final long blockStart = binarySearchBlocks(compressed, compressed.size(), searchKey, 0L, lastBlock, searchOffset);
        if (blockStart < 0) {
            return -1;
        }
        return decompressLinearSearch(compressed, blockStart, searchKey, ngramOrder, outputVal, searchOffset);
    }

    // Order 0: the word index doubles as the offset.
    final boolean keyGiven = searchKey >= 0;
    final int word;
    if (keyGiven) {
        word = wordOf(searchKey);
    } else {
        word = (int) searchOffset;
    }
    if (word < 0 || word >= maps[0].size()) {
        return -1;
    }
    if (outputVal != null) {
        values.getFromOffset(word, 0, outputVal);
    }
    if (keyGiven) {
        return word;
    }
    return combineToKey(word, 0);
}
protected void sort(final LongArray array, final long left0, final long right0, final int ngramOrder) { long left, right; long pivot; left = left0; right = right0 + 1; final long pivotIndex = (left0 + right0) >>> 1; pivot = array.get(pivotIndex);//[outerArrayPart(pivotIndex)][innerArrayPart(pivotIndex)]; swap(pivotIndex, left0, array, ngramOrder); do { do left++; while (left <= right0 && compareLongsRaw(array.get(left), pivot) < 0); do right--; while (compareLongsRaw(array.get(right), pivot) > 0); if (left < right) { swap(left, right, array, ngramOrder); } } while (left <= right); swap(left0, right, array, ngramOrder); if (left0 < right) sort(array, left0, right, ngramOrder); if (left < right0) sort(array, left, right0, ngramOrder); }
/**
 * Resolves the offset of the context {@code ngram[startPos..endPos)} by looking it up one
 * word at a time: the offset found at each order is combined with the next word to form
 * the key searched at the following order.
 *
 * @param ngram word indices
 * @param startPos first position of the context (inclusive)
 * @param endPos end of the context (exclusive)
 * @return 0 if the context is empty ({@code endPos <= startPos}); -1 if some prefix of the
 *         context is missing, including when no map exists yet for one of the orders;
 *         otherwise the offset returned by the last lookup
 */
private long getContextOffset(final int[] ngram, final int startPos, final int endPos) {
    final int contextLength = endPos - startPos;
    long lastSuffix = 0L;
    for (int ngramOrder = 0; ngramOrder < contextLength; ++ngramOrder) {
        // Walk the context from the end when the trie is reversed, from the start otherwise.
        final int firstWord = reverseTrie ? ngram[endPos - ngramOrder - 1] : ngram[startPos + ngramOrder];
        final long key = combineToKey(firstWord, lastSuffix);
        // Maps are created lazily by put(); a missing map means nothing of this order exists.
        if (maps[ngramOrder] == null) {
            return -1;
        }
        final LongArray compressedKeys = maps[ngramOrder].compressedKeys;
        final long currIndex = decompressSearch(compressedKeys, key, ngramOrder, null);
        if (currIndex < 0) {
            return -1;
        }
        lastSuffix = currIndex;
    }
    return lastSuffix;
}
numKeyBits = 0; numValueBits = 0; long lastFirstWord = wordOf(firstKey); long lastSuffixPart = contextOffsetOf(firstKey); headerBits = makeHeader(offsetBits, firstValueBits, wordBitOn); bodyBits = new BitList(); for (currUncompressedPos = uncompressedPos + 1; currUncompressedPos < uncompressedSize; ++currUncompressedPos) { final long currKey = uncompressed.get(currUncompressedPos); final long currFirstWord = wordOf(currKey); final long currSuffixPart = contextOffsetOf(currKey); numValueBits += compressValue(ngramOrder, currUncompressedPos, currBits); if (blockFull(currBlockBits, bodyBits, headerBits, currBits)) { break; writeBlockToArray(currBlockBits, compressedLongArray); logCompressionInfo(uncompressedSize, compressedLongArray, totalNumKeyBits, totalNumValueBits);
final BitStream bits = getCompressedBits(compressed, pos + 1); final long offset = offsetCoder.decompress(bits); final boolean wordBitOn = bits.nextBit(); int currWord = wordOf(firstKey); long currSuffix = contextOffsetOf(firstKey); final boolean foundKeyFirst = searchOffset >= 0 ? searchOffset == offset : firstKey == searchKey; nextSuffix = (currSuffix + suffixDelta); currKey = combineToKey(newWord, nextSuffix); currWord = newWord; currSuffix = nextSuffix;
/**
 * Finalizes the storage for an order once all of its n-grams have been added: sorts the
 * uncompressed keys, trims the map and the value container, then hands the order to
 * {@code compress}. Does nothing when no map exists for that order.
 *
 * @param justFinishedOrder one-based order that has just been completed; the map at
 *            index {@code justFinishedOrder - 1} is processed
 */
@Override
public void handleNgramsFinished(final int justFinishedOrder) {
    final int orderIndex = justFinishedOrder - 1;
    final CompressedMap finishedMap = maps[orderIndex];
    if (finishedMap == null) {
        return;
    }
    final LongArray keys = finishedMap.getUncompressedKeys();
    final long numKeys = keys.size();
    sort(keys, 0, numKeys - 1, orderIndex);
    finishedMap.trim();
    values.trimAfterNgram(orderIndex, numKeys);
    compress(orderIndex);
}
/**
 * Adds the n-gram {@code ngram[startPos..endPos)} with value {@code val}.
 *
 * The n-gram is split into one word and a context; which end the word is taken from
 * depends on {@code reverseTrie}. The context must already be present. The map for this
 * order is created on first use, sized from {@code numNgramsForEachOrder}.
 *
 * @param ngram word indices
 * @param startPos first position of the n-gram (inclusive)
 * @param endPos end of the n-gram (exclusive)
 * @param val value to store
 * @return -1 if the context could not be found; otherwise the value returned by the
 *         map's {@code add}
 */
@Override
public long put(final int[] ngram, int startPos, int endPos, final T val) {
    final int ngramOrder = endPos - startPos - 1;

    final int word;
    final long contextOffset;
    if (reverseTrie) {
        word = ngram[startPos];
        contextOffset = getContextOffset(ngram, startPos + 1, endPos);
    } else {
        word = ngram[endPos - 1];
        contextOffset = getContextOffset(ngram, startPos, endPos - 1);
    }
    if (contextOffset < 0) {
        return -1;
    }

    if (maps[ngramOrder] == null) {
        // First n-gram of this order: allocate the map and matching value storage.
        final CompressedMap created = new CompressedMap();
        maps[ngramOrder] = created;
        final long expectedSize = numNgramsForEachOrder[ngramOrder];
        created.init(expectedSize);
        values.setSizeAtLeast(expectedSize, ngramOrder);
    }
    final CompressedMap map = maps[ngramOrder];

    final long sizeBeforeAdd = map.size();
    final long key = combineToKey(word, contextOffset);
    final long newOffset = map.add(key);
    // The last argument is true when the add did not change the map's size.
    values.add(ngram, startPos, endPos, ngramOrder, map.size() - 1, contextOffset, word, val, -1,
            map.size() == sizeBeforeAdd);
    return newOffset;
}
/** * @param compressed * @param searchKey * @return */ private long binarySearchBlocks(final LongArray compressed, final long size, final long searchKey, final long low_, final long high_, final long searchOffset) { final long toFind = searchOffset >= 0 ? searchOffset : searchKey; long low = low_; long high = high_; assert size % compressedBlockSize == 0; while (low <= high) { final long mid = (low + high) >>> 1; final long currPos = mid * compressedBlockSize; final long midVal = searchOffset >= 0 ? offsetCoder.decompress(getCompressedBits(compressed, currPos + 1)) : compressed.get(currPos); final int compare = compareLongsRaw(midVal, toFind); if (compare < 0) //midVal < key low = mid + 1; else if (compare > 0) // midVal > key high = mid - 1; else { low = mid + 1; break;// key found } } if (low <= 0) return -1; final long i = (low - 1) * compressedBlockSize; return i; }
/**
 * Looks up an entry either by key or by offset.
 *
 * For order 0 nothing is decompressed: the word index is range-checked against
 * {@code maps[0].size()} and used directly. For higher orders the block that may hold the
 * entry is located with {@link #binarySearchBlocks} and then scanned with
 * {@code decompressLinearSearch}.
 *
 * @param compressed compressed key array for this order; may be {@code null} when the
 *            order has not been compressed, in which case nothing can be found (only
 *            consulted when {@code ngramOrder != 0})
 * @param searchKey key to look for; for order 0 a non-negative value selects lookup by key
 * @param ngramOrder zero-based n-gram order
 * @param outputVal if non-null, receives the value of the entry (order 0) or is forwarded
 *            to the linear search (higher orders)
 * @param searchOffset offset to look for; for order 0 it is used as the word index when
 *            {@code searchKey} is negative
 * @return -1 if nothing was found; otherwise, for order 0, the word index (lookup by key)
 *         or the combined key (lookup by offset), and for higher orders whatever
 *         {@code decompressLinearSearch} returns
 */
private long decompressSearch(final LongArray compressed, final long searchKey, final int ngramOrder, final T outputVal,
        final long searchOffset) {
    if (ngramOrder == 0) {
        final boolean lookingForOffset = searchKey >= 0;
        final int word = lookingForOffset ? AbstractNgramMap.wordOf(searchKey) : (int) searchOffset;
        if (word < 0 || word >= maps[0].size()) {
            return -1;
        }
        if (outputVal != null) {
            values.getFromOffset(word, 0, outputVal);
        }
        return lookingForOffset ? word : AbstractNgramMap.combineToKey(word, 0);
    }
    // Keys for this order may not have been compressed yet; report "not found"
    // instead of failing with a NullPointerException on compressed.size().
    if (compressed == null) {
        return -1;
    }
    final long fromIndex = 0;
    final long toIndex = (compressed.size() / compressedBlockSize) - 1;
    final long low = binarySearchBlocks(compressed, compressed.size(), searchKey, fromIndex, toIndex, searchOffset);
    if (low < 0) {
        return -1;
    }
    return decompressLinearSearch(compressed, low, searchKey, ngramOrder, outputVal, searchOffset);
}
/**
 * Compresses the keys of the given order and releases the uncompressed storage.
 * Order 0 is left untouched.
 *
 * @param ngramOrder zero-based order to compress; a map must exist for it when it is
 *            greater than 0
 */
private void compress(final int ngramOrder) {
    if (ngramOrder <= 0) {
        return;
    }
    final CompressedMap map = maps[ngramOrder];
    map.compressedKeys = compress(map.getUncompressedKeys(), map.size(), ngramOrder);
    // values is cast unchecked; it must be a CompressibleValueContainer here.
    ((CompressibleValueContainer<T>) values).clearStorageAfterCompression(ngramOrder);
    map.clearUncompressedKeys();
}
/**
 * Convenience overload that searches by key only: delegates to the five-argument
 * {@code decompressSearch} with a search offset of -1.
 *
 * @param compressed compressed key array for this order
 * @param searchKey key to look for
 * @param ngramOrder zero-based n-gram order
 * @param outputVal forwarded to the five-argument overload; may be {@code null}
 * @return the result of the five-argument overload
 */
private long decompressSearch(final LongArray compressed, final long searchKey, final int ngramOrder, final T outputVal) {
    final long noSearchOffset = -1;
    return decompressSearch(compressed, searchKey, ngramOrder, outputVal, noSearchOffset);
}
numKeyBits = 0; numValueBits = 0; long lastFirstWord = wordOf(firstKey); long lastSuffixPart = contextOffsetOf(firstKey); headerBits = makeHeader(offsetBits, firstValueBits, wordBitOn); bodyBits = new BitList(); for (currUncompressedPos = uncompressedPos + 1; currUncompressedPos < uncompressedSize; ++currUncompressedPos) { final long currKey = uncompressed.get(currUncompressedPos); final long currFirstWord = wordOf(currKey); final long currSuffixPart = contextOffsetOf(currKey); numValueBits += compressValue(ngramOrder, currUncompressedPos, currBits); if (blockFull(currBlockBits, bodyBits, headerBits, currBits)) { break; writeBlockToArray(currBlockBits, compressedLongArray); logCompressionInfo(uncompressedSize, compressedLongArray, totalNumKeyBits, totalNumValueBits);
final BitStream bits = getCompressedBits(compressed, pos + 1); final long offset = offsetCoder.decompress(bits); final boolean wordBitOn = bits.nextBit(); int currWord = wordOf(firstKey); long currSuffix = contextOffsetOf(firstKey); final boolean foundKeyFirst = searchOffset >= 0 ? searchOffset == offset : firstKey == searchKey; nextSuffix = (currSuffix + suffixDelta); currKey = combineToKey(newWord, nextSuffix); currWord = newWord; currSuffix = nextSuffix;
/**
 * Resolves the offset of the context {@code ngram[startPos..endPos)} by looking it up one
 * word at a time: the offset found at each order is combined with the next word to form
 * the key searched at the following order.
 *
 * @param ngram word indices
 * @param startPos first position of the context (inclusive)
 * @param endPos end of the context (exclusive)
 * @param val passed as the output value to every per-order lookup; may be {@code null}
 * @return 0 if the context is empty ({@code endPos <= startPos}); -1 if no map exists for
 *         one of the orders or a lookup fails; otherwise the offset returned by the last
 *         lookup
 */
private long getContextOffset(final int[] ngram, final int startPos, final int endPos, T val) {
    final int contextLength = endPos - startPos;
    long offsetSoFar = 0L;
    int order = 0;
    while (order < contextLength) {
        // Walk the context from the end when the trie is reversed, from the start otherwise.
        final int nextWord;
        if (reverseTrie) {
            nextWord = ngram[endPos - order - 1];
        } else {
            nextWord = ngram[startPos + order];
        }
        final long key = combineToKey(nextWord, offsetSoFar);
        if (maps[order] == null) {
            return -1;
        }
        final long found = decompressSearch(maps[order].compressedKeys, key, order, val);
        if (found < 0) {
            return -1;
        }
        offsetSoFar = found;
        ++order;
    }
    return offsetSoFar;
}
/**
 * Finalizes the storage for an order once all of its n-grams have been added: sorts the
 * uncompressed keys, trims the map and the value container, then hands the order to
 * {@code compress}.
 *
 * Maps are created lazily by {@code put}, so an order for which nothing was ever added
 * has no map; in that case this method does nothing instead of failing with a
 * {@code NullPointerException}.
 *
 * @param justFinishedOrder one-based order that has just been completed; the map at
 *            index {@code justFinishedOrder - 1} is processed
 */
@Override
public void handleNgramsFinished(final int justFinishedOrder) {
    final int ngramOrder = justFinishedOrder - 1;
    final CompressedMap compressedMap = maps[ngramOrder];
    if (compressedMap == null) {
        // Nothing was stored for this order.
        return;
    }
    final LongArray currKeys = compressedMap.getUncompressedKeys();
    final long currSize = currKeys.size();
    sort(currKeys, 0, currSize - 1, ngramOrder);
    compressedMap.trim();
    values.trimAfterNgram(ngramOrder, currSize);
    compress(ngramOrder);
}
/**
 * Adds the n-gram {@code ngram[startPos..endPos)} with value {@code val}.
 *
 * The n-gram is split into one word and a context; which end the word is taken from
 * depends on {@code reverseTrie}. The context must already be present. The map for this
 * order is created on first use, sized from {@code numNgramsForEachOrder}.
 *
 * @param ngram word indices
 * @param startPos first position of the n-gram (inclusive)
 * @param endPos end of the n-gram (exclusive)
 * @param val value to store
 * @return -1 if the context could not be found or the value container's {@code add}
 *         reported failure; otherwise the value returned by the map's {@code add}
 */
@Override
public long put(final int[] ngram, final int startPos, final int endPos, final T val) {
    final int ngramOrder = endPos - startPos - 1;

    final int word;
    final long contextOffset;
    if (reverseTrie) {
        word = ngram[startPos];
        contextOffset = getContextOffset(ngram, startPos + 1, endPos, null);
    } else {
        word = ngram[endPos - 1];
        contextOffset = getContextOffset(ngram, startPos, endPos - 1, null);
    }
    if (contextOffset < 0) {
        return -1;
    }

    if (maps[ngramOrder] == null) {
        // First n-gram of this order: allocate the map and matching value storage.
        final CompressedMap created = new CompressedMap();
        maps[ngramOrder] = created;
        final long expectedSize = numNgramsForEachOrder[ngramOrder];
        created.init(expectedSize);
        values.setSizeAtLeast(expectedSize, ngramOrder);
    }
    final CompressedMap map = maps[ngramOrder];

    final long sizeBeforeAdd = map.size();
    final long key = combineToKey(word, contextOffset);
    final long newOffset = map.add(key);
    // The last argument is true when the add did not change the map's size.
    final boolean stored = values.add(ngram, startPos, endPos, ngramOrder, map.size() - 1, contextOffset, word, val,
            -1, map.size() == sizeBeforeAdd);
    return stored ? newOffset : -1;
}
/** * @param compressed * @param searchKey * @return */ private long binarySearchBlocks(final LongArray compressed, final long size, final long searchKey, final long low_, final long high_, final long searchOffset) { final long toFind = searchOffset >= 0 ? searchOffset : searchKey; long low = low_; long high = high_; assert size % compressedBlockSize == 0; while (low <= high) { final long mid = (low + high) >>> 1; final long currPos = mid * compressedBlockSize; final long midVal = searchOffset >= 0 ? offsetCoder.decompress(getCompressedBits(compressed, currPos + 1)) : compressed.get(currPos); final int compare = compareLongsRaw(midVal, toFind); if (compare < 0) //midVal < key low = mid + 1; else if (compare > 0) // midVal > key high = mid - 1; else { low = mid + 1; break;// key found } } if (low <= 0) return -1; final long i = (low - 1) * compressedBlockSize; return i; }
/**
 * Compresses the keys of the given order (orders above 0 only) and then releases the
 * uncompressed key storage, which happens for every order including 0.
 *
 * @param ngramOrder zero-based order to process; a map must exist for it
 */
private void compress(final int ngramOrder) {
    final CompressedMap map = maps[ngramOrder];
    final boolean needsCompression = ngramOrder > 0;
    if (needsCompression) {
        map.compressedKeys = compress(map.getUncompressedKeys(), map.size(), ngramOrder);
        // values is cast unchecked; it must be a CompressibleValueContainer here.
        ((CompressibleValueContainer<T>) values).clearStorageAfterCompression(ngramOrder);
    }
    map.clearUncompressedKeys();
}
/**
 * Convenience overload that searches by key only: delegates to the five-argument
 * {@code decompressSearch} with a search offset of -1.
 *
 * @param compressed compressed key array for this order
 * @param searchKey key to look for
 * @param ngramOrder zero-based n-gram order
 * @param outputVal forwarded to the five-argument overload; may be {@code null}
 * @return the result of the five-argument overload
 */
private long decompressSearch(final LongArray compressed, final long searchKey, final int ngramOrder, final T outputVal) {
    final long keyOnlySearch = -1;
    final long result = decompressSearch(compressed, searchKey, ngramOrder, outputVal, keyOnlySearch);
    return result;
}