/**
 * Rebuilds {@code lookupTable} from the given dictionary.
 *
 * <p>Every id from 0 through {@code dictionary.getMaxId()} (inclusive) is decoded as a binary
 * value, passed through {@code convert}, and appended in id order, so the list index of an
 * entry equals its dictionary id.
 *
 * @param dictionary the dictionary whose entries are converted and cached
 */
@Override
public void setDictionary(Dictionary dictionary) {
  final int entryCount = dictionary.getMaxId() + 1;
  lookupTable = new ArrayList<T>();
  int id = 0;
  while (id < entryCount) {
    lookupTable.add(convert(dictionary.decodeToBinary(id)));
    id++;
  }
}
/**
 * Prints every entry of a column's local dictionary to standard output, one entry per line in
 * the form {@code id: value}.
 *
 * <p>The decode method is chosen from the column's physical type. INT96, BINARY and
 * FIXED_LEN_BYTE_ARRAY entries are rendered by turning their raw bytes into a {@code String}
 * using the platform default charset. Types without a case below print nothing.
 *
 * @param columnDescriptor descriptor of the column; its type selects the decode method
 * @param localDictionary the dictionary to dump
 */
public static void printDictionary(ColumnDescriptor columnDescriptor, Dictionary localDictionary) {
  System.out.println("Dictionary for column " + columnDescriptor.toString());
  // getMaxId() is the largest valid id, so the bound is inclusive; a strict '<' would
  // silently drop the last dictionary entry.
  for (int i = 0; i <= localDictionary.getMaxId(); ++i) {
    switch (columnDescriptor.getType()) {
      case INT32:
        System.out.println(format("%d: %d", i, localDictionary.decodeToInt(i)));
        break;
      case INT64:
        System.out.println(format("%d: %d", i, localDictionary.decodeToLong(i)));
        break;
      case INT96:
      case BINARY:
      case FIXED_LEN_BYTE_ARRAY:
        System.out.println(format("%d: %s", i, new String(localDictionary.decodeToBinary(i).getBytesUnsafe())));
        break;
      case FLOAT:
        System.out.println(format("%d: %f", i, localDictionary.decodeToFloat(i)));
        break;
      case DOUBLE:
        System.out.println(format("%d: %f", i, localDictionary.decodeToDouble(i)));
        break;
      case BOOLEAN:
        System.out.println(format("%d: %b", i, localDictionary.decodeToBoolean(i)));
        break;
      default:
        break;
    }
  }
}
} // closes the enclosing scope, whose opening lies outside this view
for (int i=0; i<=dict.getMaxId(); i++) { switch (col.getType()) { case BINARY: dictSet.add((T) conversion.apply(dict.decodeToBinary(i)));
localIdToGlobalId = new int[pageReader.dictionary.getMaxId() + 1]; final VectorContainer vectorContainer = globalDictionaries.getDictionaries().get(schemaElement.getName()); switch (schemaElement.getType()) { valueLookup.put(intVector.get(i), i); for (int i = 0; i <= pageReader.dictionary.getMaxId(); ++i) { localIdToGlobalId[i] = valueLookup.get(pageReader.dictionary.decodeToInt(i)); valueLookup.put(longVector.get(i), i); for (int i = 0; i <= pageReader.dictionary.getMaxId(); ++i) { localIdToGlobalId[i] = valueLookup.get(pageReader.dictionary.decodeToLong(i)); for (int i = 0; i <= pageReader.dictionary.getMaxId(); ++i) { localIdToGlobalId[i] = valueLookup.get(pageReader.dictionary.decodeToBinary(i)); for (int i = 0; i <= pageReader.dictionary.getMaxId(); ++i) { localIdToGlobalId[i] = valueLookup.get(pageReader.dictionary.decodeToFloat(i)); for (int i = 0; i <= pageReader.dictionary.getMaxId(); ++i) { localIdToGlobalId[i] = valueLookup.get(pageReader.dictionary.decodeToDouble(i)); for (int i = 0; i <= pageReader.dictionary.getMaxId(); ++i) { localIdToGlobalId[i] = pageReader.dictionary.decodeToBoolean(i) ? 1 : 0; this.dictionaryWidthBits = BytesUtils.getWidthFromMaxInt(pageReader.dictionary.getMaxId() - 1);
@SuppressWarnings("unchecked") private <T extends Comparable<T>> Set<T> expandDictionary(ColumnChunkMetaData meta) throws IOException { ColumnDescriptor col = new ColumnDescriptor(meta.getPath().toArray(), meta.getPrimitiveType(), -1, -1); DictionaryPage page = dictionaries.readDictionaryPage(col); // the chunk may not be dictionary-encoded if (page == null) { return null; } Dictionary dict = page.getEncoding().initDictionary(col, page); Set dictSet = new HashSet<T>(); for (int i=0; i<=dict.getMaxId(); i++) { switch(meta.getType()) { case BINARY: dictSet.add(dict.decodeToBinary(i)); break; case INT32: dictSet.add(dict.decodeToInt(i)); break; case INT64: dictSet.add(dict.decodeToLong(i)); break; case FLOAT: dictSet.add(dict.decodeToFloat(i)); break; case DOUBLE: dictSet.add(dict.decodeToDouble(i)); break; default: LOG.warn("Unknown dictionary type{}", meta.getType()); } } return (Set<T>) dictSet; }
for (int i = 0; i <= dict.getMaxId(); i++) { dictSet.add((T) dictValueProvider.apply(i));
/**
 * Builds a global dictionary of 64-bit integer values for one column.
 *
 * <p>Every value decoded from the given dictionaries, together with any values already stored
 * in {@code existingDict}, is merged into a sorted, de-duplicated set and then written in
 * ascending order into a single {@code BigIntVector} owned by a new container.
 *
 * @param dictionaries dictionaries whose entries are decoded with {@code decodeToLong}
 * @param existingDict previously built dictionary to merge in, or {@code null} if there is none
 * @param columnDescriptor column whose path provides the output field name
 * @param bufferAllocator allocator used to create the returned container
 * @return a container whose record count equals the number of distinct merged values
 */
private static VectorContainer buildLongGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final String fieldName = SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath();
  final Field field = new Field(fieldName, true, new ArrowType.Int(64, true), null);
  final VectorContainer container = new VectorContainer(bufferAllocator);
  final BigIntVector output = container.addOrGet(field);
  output.allocateNew();

  // Tree-ordered set: removes duplicates and fixes the ascending write order.
  final SortedSet<Long> merged = Sets.newTreeSet();
  for (Dictionary local : dictionaries) {
    int id = 0;
    while (id <= local.getMaxId()) {
      merged.add(local.decodeToLong(id));
      id++;
    }
  }

  if (existingDict != null) {
    final BigIntVector previous = existingDict.getValueAccessorById(BigIntVector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); row++) {
      merged.add(previous.get(row));
    }
  }

  int written = 0;
  for (Long value : merged) {
    output.setSafe(written, value);
    written++;
  }

  output.setValueCount(written);
  container.setRecordCount(written);
  container.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return container;
}
/**
 * Builds a global dictionary of 32-bit integer values for one column.
 *
 * <p>Every value decoded from the given dictionaries, together with any values already stored
 * in {@code existingDict}, is merged into a sorted, de-duplicated set and then written in
 * ascending order into a single {@code IntVector} owned by a new container.
 *
 * @param dictionaries dictionaries whose entries are decoded with {@code decodeToInt}
 * @param existingDict previously built dictionary to merge in, or {@code null} if there is none
 * @param columnDescriptor column whose path provides the output field name
 * @param bufferAllocator allocator used to create the returned container
 * @return a container whose record count equals the number of distinct merged values
 */
private static VectorContainer buildIntegerGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final String fieldName = SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath();
  final Field field = new Field(fieldName, true, new ArrowType.Int(32, true), null);
  final VectorContainer container = new VectorContainer(bufferAllocator);
  final IntVector output = container.addOrGet(field);
  output.allocateNew();

  // Tree-ordered set: removes duplicates and fixes the ascending write order.
  final SortedSet<Integer> merged = Sets.newTreeSet();
  for (Dictionary local : dictionaries) {
    int id = 0;
    while (id <= local.getMaxId()) {
      merged.add(local.decodeToInt(id));
      id++;
    }
  }

  if (existingDict != null) {
    final IntVector previous = existingDict.getValueAccessorById(IntVector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); row++) {
      merged.add(previous.get(row));
    }
  }

  int written = 0;
  for (Integer value : merged) {
    output.setSafe(written, value);
    written++;
  }

  output.setValueCount(written);
  container.setRecordCount(written);
  container.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return container;
}
/**
 * Builds a global dictionary of single-precision floating-point values for one column.
 *
 * <p>Every value decoded from the given dictionaries, together with any values already stored
 * in {@code existingDict}, is merged into a sorted, de-duplicated set and then written in
 * ascending order into a single {@code Float4Vector} owned by a new container.
 *
 * @param dictionaries dictionaries whose entries are decoded with {@code decodeToFloat}
 * @param existingDict previously built dictionary to merge in, or {@code null} if there is none
 * @param columnDescriptor column whose path provides the output field name
 * @param bufferAllocator allocator used to create the returned container
 * @return a container whose record count equals the number of distinct merged values
 */
private static VectorContainer buildFloatGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final String fieldName = SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath();
  final Field field = new Field(fieldName, true, new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE), null);
  final VectorContainer container = new VectorContainer(bufferAllocator);
  final Float4Vector output = container.addOrGet(field);
  output.allocateNew();

  // Tree-ordered set: removes duplicates and fixes the ascending write order.
  final SortedSet<Float> merged = Sets.newTreeSet();
  for (Dictionary local : dictionaries) {
    int id = 0;
    while (id <= local.getMaxId()) {
      merged.add(local.decodeToFloat(id));
      id++;
    }
  }

  if (existingDict != null) {
    final Float4Vector previous = existingDict.getValueAccessorById(Float4Vector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); row++) {
      merged.add(previous.get(row));
    }
  }

  int written = 0;
  for (Float value : merged) {
    output.setSafe(written, value);
    written++;
  }

  output.setValueCount(written);
  container.setRecordCount(written);
  container.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return container;
}
/**
 * Builds a global dictionary of double-precision floating-point values for one column.
 *
 * <p>Every value decoded from the given dictionaries, together with any values already stored
 * in {@code existingDict}, is merged into a sorted, de-duplicated set and then written in
 * ascending order into a single {@code Float8Vector} owned by a new container.
 *
 * @param dictionaries dictionaries whose entries are decoded with {@code decodeToDouble}
 * @param existingDict previously built dictionary to merge in, or {@code null} if there is none
 * @param columnDescriptor column whose path provides the output field name
 * @param bufferAllocator allocator used to create the returned container
 * @return a container whose record count equals the number of distinct merged values
 */
private static VectorContainer buildDoubleGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final String fieldName = SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath();
  final Field field = new Field(fieldName, true, new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE), null);
  final VectorContainer container = new VectorContainer(bufferAllocator);
  final Float8Vector output = container.addOrGet(field);
  output.allocateNew();

  // Tree-ordered set: removes duplicates and fixes the ascending write order.
  final SortedSet<Double> merged = Sets.newTreeSet();
  for (Dictionary local : dictionaries) {
    int id = 0;
    while (id <= local.getMaxId()) {
      merged.add(local.decodeToDouble(id));
      id++;
    }
  }

  if (existingDict != null) {
    final Float8Vector previous = existingDict.getValueAccessorById(Float8Vector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); row++) {
      merged.add(previous.get(row));
    }
  }

  int written = 0;
  for (Double value : merged) {
    output.setSafe(written, value);
    written++;
  }

  output.setValueCount(written);
  container.setRecordCount(written);
  container.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return container;
}
/**
 * Builds a global dictionary of binary values for one column.
 *
 * <p>Every value decoded from the given dictionaries, together with any values already stored
 * in {@code existingDict} (wrapped via {@code Binary.fromConstantByteArray}), is merged into a
 * sorted, de-duplicated set ordered by {@code Binary}'s natural ordering. The bytes of each
 * element are then written in that order into a single {@code VarBinaryVector} owned by a new
 * container.
 *
 * @param dictionaries dictionaries whose entries are decoded with {@code decodeToBinary}
 * @param existingDict previously built dictionary to merge in, or {@code null} if there is none
 * @param columnDescriptor column whose path provides the output field name
 * @param bufferAllocator allocator used to create the returned container
 * @return a container whose record count equals the number of distinct merged values
 */
private static VectorContainer buildBinaryGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final String fieldName = SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath();
  final Field field = new Field(fieldName, true, new ArrowType.Binary(), null);
  final VectorContainer container = new VectorContainer(bufferAllocator);
  final VarBinaryVector output = container.addOrGet(field);
  output.allocateNew();

  // Tree-ordered set: removes duplicates and fixes the write order.
  final SortedSet<Binary> merged = new TreeSet<>();
  for (Dictionary local : dictionaries) {
    int id = 0;
    while (id <= local.getMaxId()) {
      merged.add(local.decodeToBinary(id));
      id++;
    }
  }

  if (existingDict != null) {
    final VarBinaryVector previous = existingDict.getValueAccessorById(VarBinaryVector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); row++) {
      merged.add(Binary.fromConstantByteArray(previous.get(row)));
    }
  }

  int written = 0;
  for (Binary value : merged) {
    final byte[] bytes = value.getBytes();
    output.setSafe(written, bytes, 0, bytes.length);
    written++;
  }

  output.setValueCount(written);
  container.setRecordCount(written);
  container.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return container;
}