/**
 * Creates a metric field spec from a name and a data type.
 * <p>Delegates to the superclass constructor with {@code true} as its third argument (presumably the single-value
 * flag -- confirm against the superclass), then caches the size of the resulting {@code _dataType} as the field size.
 *
 * @param name Name of the metric field
 * @param dataType Data type of the metric field
 */
public MetricFieldSpec(@Nonnull String name, @Nonnull DataType dataType) {
  super(name, dataType, true);

  // Read the size from _dataType (as initialized by the superclass), not from the raw argument.
  _fieldSize = _dataType.size();
}
/**
 * Sets the data type of this spec.
 * <p>The value kept is the stored type of the given data type (as returned by {@code getStoredType()}), not
 * necessarily the given data type itself.
 *
 * @param dataType Data type to set
 */
public void setDataType(@Nonnull DataType dataType) {
  _dataType = dataType.getStoredType();
}
/**
 * Extracts the data type stored in Pinot for the given Avro field.
 * <p>The field schema is first reduced via {@code extractSupportedSchema}. For an ARRAY schema the data type is
 * derived from the (reduced) element schema; for any other schema it is derived from the schema type directly.
 *
 * @param field Avro field
 * @return Data type corresponding to the Avro type of the field (or of its elements for an array)
 * @throws RuntimeException wrapping any exception raised during the extraction, with the field name in the message
 */
public static FieldSpec.DataType extractFieldDataType(Field field) {
  try {
    org.apache.avro.Schema supportedSchema = extractSupportedSchema(field.schema());
    org.apache.avro.Schema.Type avroType = supportedSchema.getType();
    if (avroType != org.apache.avro.Schema.Type.ARRAY) {
      return FieldSpec.DataType.valueOf(avroType);
    }
    // Array: the data type comes from the element schema
    org.apache.avro.Schema elementSchema = extractSupportedSchema(supportedSchema.getElementType());
    return FieldSpec.DataType.valueOf(elementSchema.getType());
  } catch (Exception e) {
    throw new RuntimeException("Caught exception while extracting data type from field: " + field.name(), e);
  }
}
// Document counts: total raw docs falls back to totalDocs, total aggregate docs falls back to 0 when the key is absent.
builder.setTotalRawDocs(config.getInt(getKeyFor(column, TOTAL_RAW_DOCS), totalDocs));
builder.setTotalAggDocs(config.getInt(getKeyFor(column, TOTAL_AGG_DOCS), 0));

// Upper-case with Locale.ROOT so that the enum lookup does not depend on the JVM default locale (e.g. under a Turkish
// locale "int" would otherwise become "İNT" and DataType.valueOf() would fail).
String dataTypeName = config.getString(getKeyFor(column, DATA_TYPE)).toUpperCase(java.util.Locale.ROOT);
DataType dataType = DataType.valueOf(dataTypeName);
builder.setDataType(dataType);

// No default is supplied for bits per element.
builder.setBitsPerElement(config.getInt(getKeyFor(column, BITS_PER_ELEMENT)));
int indexColumnSize = FieldSpec.DataType.INT.size(); if (noDictionaryColumns.contains(column) && fieldSpec.isSingleValueField() && dataType != FieldSpec.DataType.STRING && !invertedIndexColumns.contains(column)) { indexColumnSize = dataType.size(); } else { int dictionaryColumnSize; dictionaryColumnSize = _statsHistory.getEstimatedAvgColSize(column); } else { dictionaryColumnSize = dataType.size();
Object sortedValues = dictionary.getSortedValues(); List<Comparable> actualSortedValues = (dataType.equals(FieldSpec.DataType.STRING) || dataType.equals(FieldSpec.DataType.BYTES)) ? Arrays .asList((Comparable[]) dictionary.getSortedValues()) : primitiveArrayToList(dataType, sortedValues); Assert.assertEquals(actualSortedValues, expectedSortedValues); if (!dataType.equals(FieldSpec.DataType.BYTES)) { for (int i = 0; i < dictionary.length(); i++) { Assert.assertTrue(dictionary.inRange(expectedMin.toString(), expectedMax.toString(), i, true, true));
/**
 * Decides whether a dictionary should be created for the given column.
 * <p>The decision is made in the following order:
 * <ul>
 *   <li>
 *     SegmentGeneratorConfig: if the column is listed as a raw index creation column, or has a raw index compression
 *     type configured, no dictionary is created. This is only allowed for single-valued columns.
 *   </li>
 *   <li>BYTES columns that are not fixed length (per the ColumnIndexCreationInfo) get no dictionary.</li>
 *   <li>Otherwise the ColumnIndexCreationInfo decides.</li>
 * </ul>
 *
 * @param info Column index creation info
 * @param config Segment generation config
 * @param spec Field spec for the column
 * @return True if dictionary should be created for the column, false otherwise
 * @throws RuntimeException if a raw (no-dictionary) index is configured for a multi-valued column
 */
private boolean createDictionaryForColumn(ColumnIndexCreationInfo info, SegmentGeneratorConfig config,
    FieldSpec spec) {
  String columnName = spec.getName();

  boolean rawIndexConfigured = config.getRawIndexCreationColumns().contains(columnName)
      || config.getRawIndexCompressionType().containsKey(columnName);
  if (rawIndexConfigured) {
    if (spec.isSingleValueField()) {
      return false;
    }
    throw new RuntimeException(
        "Creation of indices without dictionaries is supported for single valued columns only.");
  }

  if (spec.getDataType().equals(FieldSpec.DataType.BYTES) && !info.isFixedLength()) {
    return false;
  }

  return info.isCreateDictionary();
}
/**
 * Tests all {@link FieldSpec.DataType}: the stored type of each data type, the size of the numeric data types, and
 * the conversion from Avro schema types.
 */
@Test
public void testDataType() {
  // Each row is {data type, expected stored type}
  FieldSpec.DataType[][] storedTypeCases = {
      {INT, INT}, {LONG, LONG}, {FLOAT, FLOAT}, {DOUBLE, DOUBLE}, {BOOLEAN, STRING}, {STRING, STRING}, {BYTES, BYTES}
  };
  for (FieldSpec.DataType[] storedTypeCase : storedTypeCases) {
    Assert.assertEquals(storedTypeCase[0].getStoredType(), storedTypeCase[1]);
  }

  // Sizes of the numeric types must match the sizes of the corresponding Java primitives
  FieldSpec.DataType[] numericTypes = {INT, LONG, FLOAT, DOUBLE};
  int[] expectedSizes = {Integer.BYTES, Long.BYTES, Float.BYTES, Double.BYTES};
  for (int i = 0; i < numericTypes.length; i++) {
    Assert.assertEquals(numericTypes[i].size(), expectedSizes[i]);
  }

  // Avro schema type at index i must convert to the data type at index i
  Schema.Type[] avroTypes = {
      Schema.Type.INT, Schema.Type.LONG, Schema.Type.FLOAT, Schema.Type.DOUBLE, Schema.Type.BOOLEAN,
      Schema.Type.STRING, Schema.Type.ENUM, Schema.Type.BYTES
  };
  FieldSpec.DataType[] expectedDataTypes = {INT, LONG, FLOAT, DOUBLE, STRING, STRING, STRING, BYTES};
  for (int i = 0; i < avroTypes.length; i++) {
    Assert.assertEquals(FieldSpec.DataType.valueOf(avroTypes[i]), expectedDataTypes[i]);
  }
}
private boolean shouldConvertColumn(FieldSpec fieldSpec) { String columnName = fieldSpec.getName(); FieldSpec.DataType dataType = fieldSpec.getDataType(); int numTotalDocs = _originalSegmentMetadata.getTotalDocs(); ColumnMetadata columnMetadata = _originalSegmentMetadata.getColumnMetadataFor(columnName); int cardinality = columnMetadata.getCardinality(); // In bits int lengthOfEachEntry; if (dataType.equals(FieldSpec.DataType.STRING)) { lengthOfEachEntry = columnMetadata.getColumnMaxLength() * Byte.SIZE; } else { lengthOfEachEntry = dataType.size() * Byte.SIZE; } long dictionaryBasedIndexSize = (long) numTotalDocs * columnMetadata.getBitsPerElement() + (long) cardinality * lengthOfEachEntry; long rawIndexSize = (long) numTotalDocs * lengthOfEachEntry; LOGGER.info( "For column: {}, size of dictionary based index: {} bits, size of raw index (without compression): {} bits", columnName, dictionaryBasedIndexSize, rawIndexSize); return rawIndexSize <= dictionaryBasedIndexSize * CONVERSION_THRESHOLD; }
@Override public int getLengthOfShortestElement() { // Length of longest string int minStringLength = Integer.MAX_VALUE; // If this column is a string/bytes column, iterate over the dictionary to find the maximum length FieldSpec.DataType dataType = _dataSource.getDataSourceMetadata().getDataType(); final int length = _dictionaryReader.length(); if (dataType.equals(FieldSpec.DataType.STRING)) { for (int i = 0; i < length; i++) { minStringLength = Math.min(_dictionaryReader.getStringValue(i).length(), minStringLength); } } else if (dataType.equals(FieldSpec.DataType.BYTES)) { for (int i = 0; i < length; i++) { minStringLength = Math.min(_dictionaryReader.getBytesValue(i).length, minStringLength); } } return minStringLength; }
/**
 * Returns the data type for the given Avro field.
 * <p>The field schema (and every nested schema inspected below) is first passed through
 * {@code extractSchemaFromUnionIfNeeded}.
 * <ul>
 *   <li>Non-array field: the data type is derived from the schema type.</li>
 *   <li>Array field: the data type is derived from the element schema type.</li>
 *   <li>Array of records: the record must have exactly one field, and the schema of that field is used.</li>
 * </ul>
 *
 * @param field Avro field
 * @return Data type of the column
 * @throws RuntimeException if the array elements are records whose number of fields is not exactly one
 */
public static DataType getColumnType(Field field) {
  org.apache.avro.Schema fieldSchema = extractSchemaFromUnionIfNeeded(field.schema());
  final Type type = fieldSchema.getType();
  if (type != Type.ARRAY) {
    return DataType.valueOf(type);
  }

  org.apache.avro.Schema elementSchema = extractSchemaFromUnionIfNeeded(fieldSchema.getElementType());
  if (elementSchema.getType() == Type.RECORD) {
    if (elementSchema.getFields().size() != 1) {
      throw new RuntimeException("More than one schema in Multi-value column!");
    }
    // Unwrap the single field of the record
    elementSchema = extractSchemaFromUnionIfNeeded(elementSchema.getFields().get(0).schema());
  }
  return DataType.valueOf(elementSchema.getType());
}
/**
 * Returns the data type for the given Avro field.
 * <p>The field schema (and every nested schema inspected below) is first passed through
 * {@code extractSchemaFromUnionIfNeeded}.
 * <ul>
 *   <li>Non-array field: the data type is derived from the schema type.</li>
 *   <li>Array field: the data type is derived from the element schema type.</li>
 *   <li>Array of records: the record must have exactly one field, and the schema of that field is used.</li>
 * </ul>
 *
 * @param field Avro field
 * @return Data type of the column
 * @throws RuntimeException if the array elements are records whose number of fields is not exactly one
 */
public static DataType getColumnType(Field field) {
  org.apache.avro.Schema fieldSchema = extractSchemaFromUnionIfNeeded(field.schema());
  final Type type = fieldSchema.getType();
  if (type != Type.ARRAY) {
    return DataType.valueOf(type);
  }

  org.apache.avro.Schema elementSchema = extractSchemaFromUnionIfNeeded(fieldSchema.getElementType());
  if (elementSchema.getType() == Type.RECORD) {
    if (elementSchema.getFields().size() != 1) {
      throw new RuntimeException("More than one schema in Multi-value column!");
    }
    // Unwrap the single field of the record
    elementSchema = extractSchemaFromUnionIfNeeded(elementSchema.getFields().get(0).schema());
  }
  return DataType.valueOf(elementSchema.getType());
}
@Override public int getLengthOfLargestElement() { // Length of longest string int maximumStringLength = 0; // If this column is a string/bytes column, iterate over the dictionary to find the maximum length FieldSpec.DataType dataType = _dataSource.getDataSourceMetadata().getDataType(); final int length = _dictionaryReader.length(); if (dataType.equals(FieldSpec.DataType.STRING)) { for (int i = 0; i < length; i++) { maximumStringLength = Math.max(_dictionaryReader.getStringValue(i).length(), maximumStringLength); } } else if (dataType.equals(FieldSpec.DataType.BYTES)) { for (int i = 0; i < length; i++) { maximumStringLength = Math.max(_dictionaryReader.getBytesValue(i).length, maximumStringLength); } } return maximumStringLength; }
/**
 * Builds the {@link ObjectNode} representation of this field spec.
 * <p>"name" and "dataType" are always written. "singleValueField" is written only when the field is not
 * single-valued, and "maxLength" only when it differs from the default. The default null value is appended last via
 * {@code appendDefaultNullValue}.
 * <p>NOTE: here we use {@link ObjectNode} to preserve the insertion order.
 *
 * @return JSON object for the field spec
 */
public ObjectNode toJsonObject() {
  ObjectNode fieldSpecJson = JsonUtils.newObjectNode();

  // Mandatory entries
  fieldSpecJson.put("name", _name);
  fieldSpecJson.put("dataType", _dataType.name());

  // Entries written only for non-default values
  if (!_isSingleValueField) {
    fieldSpecJson.put("singleValueField", false);
  }
  if (_maxLength != DEFAULT_MAX_LENGTH) {
    fieldSpecJson.put("maxLength", _maxLength);
  }

  appendDefaultNullValue(fieldSpecJson);
  return fieldSpecJson;
}
/**
 * Builds the {@link ObjectNode} representation of this time granularity spec.
 * <p>"name", "dataType" and "timeType" are always written. "timeUnitSize" and "timeFormat" are written only when
 * they differ from their defaults.
 * <p>NOTE: here we use {@link ObjectNode} to preserve the insertion order.
 *
 * @return JSON object for the time granularity spec
 */
public ObjectNode toJsonObject() {
  ObjectNode granularityJson = JsonUtils.newObjectNode();

  // Mandatory entries
  granularityJson.put("name", _name);
  granularityJson.put("dataType", _dataType.name());
  granularityJson.put("timeType", _timeType.name());

  // Entries written only for non-default values
  if (_timeUnitSize != DEFAULT_TIME_UNIT_SIZE) {
    granularityJson.put("timeUnitSize", _timeUnitSize);
  }
  if (!_timeFormat.equals(DEFAULT_TIME_FORMAT)) {
    granularityJson.put("timeFormat", _timeFormat);
  }

  return granularityJson;
}
/**
 * Sets the data type of this spec and refreshes the default null value accordingly.
 * <p>The value kept is the stored type of the given data type (as returned by {@code getStoredType()}).
 *
 * @param dataType Data type to set
 */
public void setDataType(DataType dataType) {
  _dataType = dataType.getStoredType();

  // The default null value is recomputed from the field type, the new (stored) data type and the string default
  _defaultNullValue = getDefaultNullValue(getFieldType(), _dataType, _stringDefaultNullValue);
}
/**
 * Sets the data type on this builder.
 * <p>The value kept is the stored type of the given data type (as returned by {@code getStoredType()}).
 *
 * @param dataType Data type to set
 * @return This builder, to allow call chaining
 */
public Builder setDataType(DataType dataType) {
  this.dataType = dataType.getStoredType();
  return this;
}
/**
 * Creates a metric field spec from a name, a data type and a default null value.
 * <p>Delegates to the superclass constructor with {@code true} as its third argument (presumably the single-value
 * flag -- confirm against the superclass), then caches the size of the resulting {@code _dataType} as the field size.
 *
 * @param name Name of the metric field
 * @param dataType Data type of the metric field
 * @param defaultNullValue Default null value of the metric field
 */
public MetricFieldSpec(@Nonnull String name, @Nonnull DataType dataType, @Nonnull Object defaultNullValue) {
  super(name, dataType, true, defaultNullValue);

  // Read the size from _dataType (as initialized by the superclass), not from the raw argument.
  _fieldSize = _dataType.size();
}
/**
 * Computes the hash code by combining the hash codes of the column and the data type via
 * {@code EqualityUtils.hashCodeOf}.
 */
@Override
public int hashCode() {
  int columnHash = _column.hashCode();
  int dataTypeHash = _dataType.hashCode();
  return EqualityUtils.hashCodeOf(columnHash, dataTypeHash);
}
public TimeGranularitySpec(@Nonnull DataType dataType, int timeUnitSize, @Nonnull TimeUnit timeType, @Nonnull String timeFormat, @Nonnull String name) { Preconditions.checkNotNull(timeType); Preconditions.checkNotNull(name); Preconditions.checkNotNull(timeFormat); Preconditions.checkArgument(timeFormat.equals(TimeFormat.EPOCH.toString()) || (timeFormat .startsWith(TimeFormat.SIMPLE_DATE_FORMAT.toString()))); _dataType = dataType.getStoredType(); _timeType = timeType; _timeUnitSize = timeUnitSize; _name = name; _timeFormat = timeFormat; }