public SanitationTransformer(Schema schema) {
  for (Map.Entry<String, FieldSpec> entry : schema.getFieldSpecMap().entrySet()) {
    FieldSpec fieldSpec = entry.getValue();
    if (fieldSpec.getDataType() == FieldSpec.DataType.STRING) {
      _stringColumnMaxLengthMap.put(entry.getKey(), fieldSpec.getMaxLength());
    }
  }
}
public static Object convertToDataType(String token, FieldSpec fieldSpec) {
  if ((token == null) || token.isEmpty()) {
    return fieldSpec.getDefaultNullValue();
  }
  DataType dataType = fieldSpec.getDataType();
  switch (dataType) {
    case INT:
      return Integer.parseInt(token);
    case LONG:
      return Long.parseLong(token);
    case FLOAT:
      return Float.parseFloat(token);
    case DOUBLE:
      return Double.parseDouble(token);
    case STRING:
      return token;
    default:
      throw new IllegalStateException("Illegal data type: " + dataType);
  }
}
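// A minimal usage sketch for convertToDataType, assuming a hypothetical
// single-valued INT column named "clicks":
FieldSpec clicksSpec = new DimensionFieldSpec("clicks", FieldSpec.DataType.INT, true);
Object parsed = convertToDataType("42", clicksSpec); // -> Integer 42
Object fallback = convertToDataType("", clicksSpec); // -> clicksSpec.getDefaultNullValue()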
/**
 * Transform a single-value Avro value into an object in Pinot format.
 */
public static Object transformAvroValueToObject(Object avroValue, FieldSpec fieldSpec) {
  if (avroValue == null) {
    return fieldSpec.getDefaultNullValue();
  }
  if (avroValue instanceof GenericData.Record) {
    return transformAvroValueToObject(((GenericData.Record) avroValue).get(0), fieldSpec);
  }
  if (fieldSpec.getDataType() == FieldSpec.DataType.STRING) {
    return avroValue.toString();
  } else if (fieldSpec.getDataType() == FieldSpec.DataType.BYTES && avroValue instanceof ByteBuffer) {
    // Avro ByteBuffer maps to byte[].
    ByteBuffer byteBuffer = (ByteBuffer) avroValue;
    // Assumes the byte buffer is ready to read. Also, avoid getting the underlying array, as it may be over-sized.
    byte[] bytes = new byte[byteBuffer.remaining()];
    byteBuffer.get(bytes);
    return bytes;
  }
  return avroValue;
}
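// Sketch of the BYTES branch above, with an illustrative field named "payload":
// a ready-to-read ByteBuffer is copied into an exactly-sized byte[] rather than
// exposing the (possibly over-sized) backing array.
FieldSpec payloadSpec = new DimensionFieldSpec("payload", FieldSpec.DataType.BYTES, true);
byte[] copied = (byte[]) transformAvroValueToObject(ByteBuffer.wrap(new byte[]{1, 2, 3}), payloadSpec); // -> {1, 2, 3}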
/**
 * For REALTIME segment.
 */
public ColumnDataSource(FieldSpec fieldSpec, int numDocs, int maxNumMultiValues, DataFileReader forwardIndex,
    InvertedIndexReader invertedIndex, MutableDictionary dictionary, BloomFilterReader bloomFilter) {
  this(fieldSpec.getName(), fieldSpec.getDataType(), fieldSpec.isSingleValueField(), false, numDocs,
      maxNumMultiValues, forwardIndex, invertedIndex, dictionary, bloomFilter, Constants.UNKNOWN_CARDINALITY);
}
public static Object extractValue(@Nullable JsonNode jsonValue, FieldSpec fieldSpec) {
  if (fieldSpec.isSingleValueField()) {
    if (jsonValue != null && !jsonValue.isNull()) {
      return extractSingleValue(jsonValue, fieldSpec.getDataType());
    } else {
      return fieldSpec.getDefaultNullValue();
    }
  } else {
    if (jsonValue != null && !jsonValue.isNull()) {
      if (jsonValue.isArray()) {
        int numValues = jsonValue.size();
        if (numValues != 0) {
          Object[] values = new Object[numValues];
          for (int i = 0; i < numValues; i++) {
            values[i] = extractSingleValue(jsonValue.get(i), fieldSpec.getDataType());
          }
          return values;
        } else {
          return new Object[]{fieldSpec.getDefaultNullValue()};
        }
      } else {
        return new Object[]{extractSingleValue(jsonValue, fieldSpec.getDataType())};
      }
    } else {
      return new Object[]{fieldSpec.getDefaultNullValue()};
    }
  }
}
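// Usage sketch with Jackson, assuming a hypothetical multi-valued STRING column
// named "tags" (readTree can throw a checked exception, elided here for brevity):
ObjectMapper mapper = new ObjectMapper();
JsonNode record = mapper.readTree("{\"tags\": [\"a\", \"b\"]}");
FieldSpec tagsSpec = new DimensionFieldSpec("tags", FieldSpec.DataType.STRING, false);
Object[] tags = (Object[]) extractValue(record.get("tags"), tagsSpec); // -> {"a", "b"}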
@Override
public void logStats() {
  try {
    for (final String column : columnStatsCollectorMap.keySet()) {
      AbstractColumnStatisticsCollector statisticsCollector = columnStatsCollectorMap.get(column);
      LOGGER.info("********** logging for column : " + column + " ********************* ");
      LOGGER.info("min value : " + statisticsCollector.getMinValue());
      LOGGER.info("max value : " + statisticsCollector.getMaxValue());
      LOGGER.info("cardinality : " + statisticsCollector.getCardinality());
      LOGGER.info("length of largest column : " + statisticsCollector.getLengthOfLargestElement());
      LOGGER.info("is sorted : " + statisticsCollector.isSorted());
      LOGGER.info("column type : " + _statsCollectorConfig.getSchema().getFieldSpecFor(column).getDataType());
      if (statisticsCollector.getPartitionFunction() != null) {
        LOGGER.info("partitions: " + statisticsCollector.getPartitions().toString());
      }
      LOGGER.info("***********************************************");
    }
  } catch (final Exception e) {
    LOGGER.error("Caught exception while logging column stats", e);
  }
}
public AbstractColumnStatisticsCollector(String column, StatsCollectorConfig statsCollectorConfig) {
  this.column = column;
  fieldSpec = statsCollectorConfig.getFieldSpecForColumn(column);
  partitionFunction = statsCollectorConfig.getPartitionFunction(column);
  numPartitions = statsCollectorConfig.getNumPartitions(column);
  if (partitionFunction != null) {
    _partitions = new HashSet<>();
  } else {
    _partitions = null;
  }
  addressNull(previousValue, fieldSpec.getDataType());
  previousValue = null;
}
Object thisVal = _row.getValue(column);
if (fieldSpec.isSingleValueField()) {
  switch (fieldSpec.getDataType()) {
    case INT:
      compare = ((Integer) thisVal).compareTo((Integer) otherVal);
dataTypes.put(col, fs.getDataType());
fieldTypes.put(col, fs.getFieldType());
public int indexOfSV(Object value) {
  switch (_fieldSpec.getDataType()) {
    case INT:
      return _intValueToIndexMap.get((int) value);
    case LONG:
      return _longValueToIndexMap.get((long) value);
    case FLOAT:
      return _floatValueToIndexMap.get((float) value);
    case DOUBLE:
      return _doubleValueToIndexMap.get((double) value);
    case STRING:
      return _stringValueToIndexMap.getInt(value);
    case BYTES:
      return _bytesValueToIndexMap.get(new ByteArray((byte[]) value));
    default:
      throw new UnsupportedOperationException("Unsupported data type : " + _fieldSpec.getDataType());
  }
}
private void validateSchema() {
  org.apache.avro.Schema avroSchema = _avroReader.getSchema();
  for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
    String fieldName = fieldSpec.getName();
    Field avroField = avroSchema.getField(fieldName);
    if (avroField == null) {
      LOGGER.warn("Pinot field: {} does not exist in Avro Schema", fieldName);
    } else {
      boolean isPinotFieldSingleValue = fieldSpec.isSingleValueField();
      boolean isAvroFieldSingleValue = AvroUtils.isSingleValueField(avroField);
      if (isPinotFieldSingleValue != isAvroFieldSingleValue) {
        String errorMessage = "Pinot field: " + fieldName + " is " + (isPinotFieldSingleValue ? "Single" : "Multi")
            + "-valued in Pinot schema but not in Avro schema";
        LOGGER.error(errorMessage);
        throw new IllegalStateException(errorMessage);
      }
      DataType pinotFieldDataType = fieldSpec.getDataType();
      DataType avroFieldDataType = AvroUtils.extractFieldDataType(avroField);
      if (pinotFieldDataType != avroFieldDataType) {
        LOGGER.warn("Pinot field: {} of type: {} mismatches with corresponding field in Avro Schema of type: {}",
            fieldName, pinotFieldDataType, avroFieldDataType);
      }
    }
  }
}
/**
 * Return the row for the given docId.
 */
private GenericRow getRecord(GenericRow reuse, int docId) {
  for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
    String fieldName = fieldSpec.getName();
    if (fieldSpec.isSingleValueField()) {
      reuse.putField(fieldName, _columnReaderMap.get(fieldName).readSV(docId, fieldSpec.getDataType()));
    } else {
      reuse.putField(fieldName, _columnReaderMap.get(fieldName).readMV(docId));
    }
  }
  return reuse;
}
/**
 * Returns true if a dictionary should be created for a column, false otherwise.
 * Currently there are two sources for this config:
 * <ul>
 *   <li>ColumnIndexCreationInfo (this is currently hard-coded to always return dictionary)</li>
 *   <li>SegmentGeneratorConfig</li>
 * </ul>
 *
 * This method gives preference to the SegmentGeneratorConfig.
 *
 * @param info Column index creation info
 * @param config Segment generation config
 * @param spec Field spec for the column
 * @return True if a dictionary should be created for the column, false otherwise
 */
private boolean createDictionaryForColumn(ColumnIndexCreationInfo info, SegmentGeneratorConfig config,
    FieldSpec spec) {
  String column = spec.getName();
  if (config.getRawIndexCreationColumns().contains(column) || config.getRawIndexCompressionType()
      .containsKey(column)) {
    if (!spec.isSingleValueField()) {
      throw new RuntimeException(
          "Creation of indices without dictionaries is supported for single valued columns only.");
    }
    return false;
  } else if (spec.getDataType().equals(FieldSpec.DataType.BYTES) && !info.isFixedLength()) {
    return false;
  }
  return info.isCreateDictionary();
}
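// Hedged sketch: given an existing SegmentGeneratorConfig `config` (construction
// omitted), marking a column for raw index creation makes the method above
// return false for that column. The column name "rawCol" is an assumption.
config.setRawIndexCreationColumns(Collections.singletonList("rawCol"));
// createDictionaryForColumn(info, config, rawColSpec) -> false (raw index, no dictionary)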
private Object getRandomValueForColumn(FieldSpec fieldSpec, boolean isSimpleDate) {
  if (fieldSpec.getName().equals(TIME_COL_NAME)) {
    return getRandomValueForTimeColumn(isSimpleDate);
  }
  return RawIndexCreatorTest.getRandomValue(_random, fieldSpec.getDataType());
}
public static PinotDataType getPinotDataType(FieldSpec fieldSpec) {
  FieldSpec.DataType dataType = fieldSpec.getDataType();
  switch (dataType) {
    case INT:
      return fieldSpec.isSingleValueField() ? PinotDataType.INTEGER : PinotDataType.INTEGER_ARRAY;
    case LONG:
      return fieldSpec.isSingleValueField() ? PinotDataType.LONG : PinotDataType.LONG_ARRAY;
    case FLOAT:
      return fieldSpec.isSingleValueField() ? PinotDataType.FLOAT : PinotDataType.FLOAT_ARRAY;
    case DOUBLE:
      return fieldSpec.isSingleValueField() ? PinotDataType.DOUBLE : PinotDataType.DOUBLE_ARRAY;
    case STRING:
      return fieldSpec.isSingleValueField() ? PinotDataType.STRING : PinotDataType.STRING_ARRAY;
    case BYTES:
      if (fieldSpec.isSingleValueField()) {
        return PinotDataType.BYTES;
      } else {
        throw new UnsupportedOperationException("Unsupported multi-valued type: BYTES");
      }
    default:
      throw new UnsupportedOperationException(
          "Unsupported data type: " + dataType + " in field: " + fieldSpec.getName());
  }
}
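// Usage sketch: a hypothetical multi-valued LONG column maps to the array
// variant of the Pinot data type.
FieldSpec mvLongSpec = new DimensionFieldSpec("events", FieldSpec.DataType.LONG, false);
PinotDataType type = getPinotDataType(mvLongSpec); // -> PinotDataType.LONG_ARRAY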
for (final FieldSpec spec : dataSchema.getAllFieldSpecs()) {
  String column = spec.getName();
  switch (spec.getDataType()) {
    case BOOLEAN:
    case STRING:
public static List<GenericRow> createTestData(Schema schema, int numRows) {
  List<GenericRow> rows = new ArrayList<>();
  final Random random = new Random();
  Map<String, Object> fields;
  for (int i = 0; i < numRows; i++) {
    fields = new HashMap<>();
    for (FieldSpec fieldSpec : schema.getAllFieldSpecs()) {
      Object value;
      if (fieldSpec.isSingleValueField()) {
        value = generateSingleValue(random, fieldSpec.getDataType());
      } else {
        value = generateMultiValue(random, fieldSpec.getDataType());
      }
      fields.put(fieldSpec.getName(), value);
    }
    GenericRow row = new GenericRow();
    row.init(fields);
    rows.add(row);
  }
  return rows;
}
private boolean aggregateMetrics(GenericRow row, int docId) {
  for (FieldSpec metricSpec : _schema.getMetricFieldSpecs()) {
    String column = metricSpec.getName();
    Object value = row.getValue(column);
    Preconditions.checkState(metricSpec.isSingleValueField(), "Multi-valued metrics cannot be updated.");
    FixedByteSingleColumnSingleValueReaderWriter indexReaderWriter =
        (FixedByteSingleColumnSingleValueReaderWriter) _indexReaderWriterMap.get(column);
    Preconditions.checkState(_dictionaryMap.get(column) == null, "Updating metrics is not supported with dictionary.");
    FieldSpec.DataType dataType = metricSpec.getDataType();
    // Aggregate by adding the incoming value to the value already stored for this docId.
    switch (dataType) {
      case INT:
        indexReaderWriter.setInt(docId, (Integer) value + indexReaderWriter.getInt(docId));
        break;
      case LONG:
        indexReaderWriter.setLong(docId, (Long) value + indexReaderWriter.getLong(docId));
        break;
      case FLOAT:
        indexReaderWriter.setFloat(docId, (Float) value + indexReaderWriter.getFloat(docId));
        break;
      case DOUBLE:
        indexReaderWriter.setDouble(docId, (Double) value + indexReaderWriter.getDouble(docId));
        break;
      default:
        throw new UnsupportedOperationException(
            "Unsupported data type: " + dataType + " for no-dictionary column: " + column);
    }
  }
  return true;
}
private boolean shouldConvertColumn(FieldSpec fieldSpec) {
  String columnName = fieldSpec.getName();
  FieldSpec.DataType dataType = fieldSpec.getDataType();
  int numTotalDocs = _originalSegmentMetadata.getTotalDocs();
  ColumnMetadata columnMetadata = _originalSegmentMetadata.getColumnMetadataFor(columnName);
  int cardinality = columnMetadata.getCardinality();

  // Entry length in bits
  int lengthOfEachEntry;
  if (dataType.equals(FieldSpec.DataType.STRING)) {
    lengthOfEachEntry = columnMetadata.getColumnMaxLength() * Byte.SIZE;
  } else {
    lengthOfEachEntry = dataType.size() * Byte.SIZE;
  }

  long dictionaryBasedIndexSize =
      (long) numTotalDocs * columnMetadata.getBitsPerElement() + (long) cardinality * lengthOfEachEntry;
  long rawIndexSize = (long) numTotalDocs * lengthOfEachEntry;
  LOGGER.info(
      "For column: {}, size of dictionary based index: {} bits, size of raw index (without compression): {} bits",
      columnName, dictionaryBasedIndexSize, rawIndexSize);

  return rawIndexSize <= dictionaryBasedIndexSize * CONVERSION_THRESHOLD;
}
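// Worked example with hypothetical numbers: an INT column (32 bits per raw
// entry) with numTotalDocs = 1,000,000 and cardinality = 100, so
// bitsPerElement = ceil(log2(100)) = 7:
//   dictionaryBasedIndexSize = 1,000,000 * 7 + 100 * 32 =  7,003,200 bits
//   rawIndexSize             = 1,000,000 * 32           = 32,000,000 bits
// The column is converted to a raw index only if
// 32,000,000 <= 7,003,200 * CONVERSION_THRESHOLD.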
Object value = getRandomValue(_random, fieldSpec.getDataType());
map.put(fieldSpec.getName(), value);