@Override
public boolean isSorted() {
  // A multi-value column is never considered sorted, regardless of the tracked flag.
  if (!fieldSpec.isSingleValueField()) {
    return false;
  }
  return isSorted;
}
/**
 * Maps a schema {@link FieldSpec} to the corresponding {@link PinotDataType}: the scalar
 * variant for single-value fields and the array variant for multi-value fields.
 *
 * @throws UnsupportedOperationException for multi-valued BYTES or any unrecognized data type
 */
public static PinotDataType getPinotDataType(FieldSpec fieldSpec) {
  FieldSpec.DataType dataType = fieldSpec.getDataType();
  boolean singleValue = fieldSpec.isSingleValueField();
  switch (dataType) {
    case INT:
      return singleValue ? PinotDataType.INTEGER : PinotDataType.INTEGER_ARRAY;
    case LONG:
      return singleValue ? PinotDataType.LONG : PinotDataType.LONG_ARRAY;
    case FLOAT:
      return singleValue ? PinotDataType.FLOAT : PinotDataType.FLOAT_ARRAY;
    case DOUBLE:
      return singleValue ? PinotDataType.DOUBLE : PinotDataType.DOUBLE_ARRAY;
    case STRING:
      return singleValue ? PinotDataType.STRING : PinotDataType.STRING_ARRAY;
    case BYTES:
      // BYTES has no array variant.
      if (!singleValue) {
        throw new UnsupportedOperationException("Unsupported multi-valued type: BYTES");
      }
      return PinotDataType.BYTES;
    default:
      throw new UnsupportedOperationException(
          "Unsupported data type: " + dataType + " in field: " + fieldSpec.getName());
  }
}
}
/**
 * Constructor for a REALTIME (mutable) segment column.
 * Delegates to the full constructor with {@code Constants.UNKNOWN_CARDINALITY}.
 */
public ColumnDataSource(FieldSpec fieldSpec, int numDocs, int maxNumMultiValues, DataFileReader forwardIndex,
    InvertedIndexReader invertedIndex, MutableDictionary dictionary, BloomFilterReader bloomFilter) {
  // NOTE(review): the hard-coded 'false' appears to mark the realtime path — confirm flag semantics
  // against the delegated constructor's parameter list.
  this(fieldSpec.getName(), fieldSpec.getDataType(), fieldSpec.isSingleValueField(), false, numDocs,
      maxNumMultiValues, forwardIndex, invertedIndex, dictionary, bloomFilter, Constants.UNKNOWN_CARDINALITY);
}
private void addInvertedIndex(int docId, Map<String, Object> dictIdMap) { // Update inverted index at last // NOTE: inverted index have to be updated at last because once it gets updated, the latest record will become // queryable for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) { String column = fieldSpec.getName(); RealtimeInvertedIndexReader invertedIndex = _invertedIndexMap.get(column); if (invertedIndex != null) { if (fieldSpec.isSingleValueField()) { invertedIndex.add(((Integer) dictIdMap.get(column)), docId); } else { int[] dictIds = (int[]) dictIdMap.get(column); for (int dictId : dictIds) { invertedIndex.add(dictId, docId); } } } } }
@Override
public GenericRow next(GenericRow reuse) {
  Map record = _iterator.next();
  for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
    String fieldName = fieldSpec.getName();
    Object jsonValue = record.get(fieldName);
    Object converted;
    if (fieldSpec.isSingleValueField()) {
      // Single-value fields are converted from their string form; null maps to the default value.
      converted = RecordReaderUtils.convertToDataType(jsonValue == null ? null : jsonValue.toString(), fieldSpec);
    } else {
      converted = RecordReaderUtils.convertToDataTypeArray((ArrayList) jsonValue, fieldSpec);
    }
    reuse.putField(fieldName, converted);
  }
  return reuse;
}
@Override public int compare(int i1, int i2) { int docId1 = sortedDocIds[i1]; int docId2 = sortedDocIds[i2]; int compare = 0; for (int index : _sortOrder) { String dimensionName = _dimensionNames.get(index); FieldSpec fieldSpec = _schema.getFieldSpecFor(dimensionName); PinotSegmentColumnReader columnReader = _columnReaderMap.get(dimensionName); // Multi value column or no dictionary column is not supported boolean isMultiValueColumn = !fieldSpec.isSingleValueField(); boolean isNoDictionaryColumn = !columnReader.hasDictionary(); if (isMultiValueColumn || isNoDictionaryColumn) { throw new IllegalStateException( "Multi value column or no dictionary column is not supported. ( column name: " + dimensionName + ", multi value column: " + isMultiValueColumn + ", no dictionary column: " + isNoDictionaryColumn + " )"); } // Compute the order compare = columnReader.getDictionaryId(docId1) - columnReader.getDictionaryId(docId2); if (compare != 0) { return compare; } } return compare; }
@Override
public void indexRow(GenericRow row) {
  for (String columnName : _forwardIndexCreatorMap.keySet()) {
    Object columnValueToIndex = row.getValue(columnName);
    if (columnValueToIndex == null) {
      throw new RuntimeException("Null value for column:" + columnName);
    }
    SegmentDictionaryCreator dictionaryCreator = _dictionaryCreatorMap.get(columnName);
    boolean singleValue = schema.getFieldSpecFor(columnName).isSingleValueField();
    if (!singleValue) {
      // Multi-value path: dictionary-encode, then write forward + optional inverted index.
      int[] dictIds = dictionaryCreator.indexOfMV(columnValueToIndex);
      ((MultiValueForwardIndexCreator) _forwardIndexCreatorMap.get(columnName)).index(docIdCounter, dictIds);
      if (_invertedIndexCreatorMap.containsKey(columnName)) {
        _invertedIndexCreatorMap.get(columnName).add(dictIds, dictIds.length);
      }
    } else if (dictionaryCreator != null) {
      // Single-value, dictionary-encoded path.
      int dictId = dictionaryCreator.indexOfSV(columnValueToIndex);
      ((SingleValueForwardIndexCreator) _forwardIndexCreatorMap.get(columnName)).index(docIdCounter, dictId);
      if (_invertedIndexCreatorMap.containsKey(columnName)) {
        _invertedIndexCreatorMap.get(columnName).add(dictId);
      }
    } else {
      // Single-value raw (no-dictionary) path: write the value directly.
      ((SingleValueRawIndexCreator) _forwardIndexCreatorMap.get(columnName))
          .index(docIdCounter, columnValueToIndex);
    }
  }
  docIdCounter++;
}
@Override protected void updateDefaultColumn(String column, DefaultColumnAction action) throws Exception { LOGGER.info("Starting default column action: {} on column: {}", action, column); // For V3 segment format, only support ADD action // For UPDATE and REMOVE action, throw exception to drop and re-download the segment if (!action.isAddAction()) { throw new V3RemoveIndexException( "Default value indices for column: " + column + " cannot be removed for V3 format segment."); } // Create new dictionary and forward index, and update column metadata createColumnV1Indices(column); // Write index to V3 format. FieldSpec fieldSpec = _schema.getFieldSpecFor(column); Preconditions.checkNotNull(fieldSpec); boolean isSingleValue = fieldSpec.isSingleValueField(); File dictionaryFile = new File(_indexDir, column + V1Constants.Dict.FILE_EXTENSION); File forwardIndexFile; if (isSingleValue) { forwardIndexFile = new File(_indexDir, column + V1Constants.Indexes.SORTED_SV_FORWARD_INDEX_FILE_EXTENSION); } else { forwardIndexFile = new File(_indexDir, column + V1Constants.Indexes.UNSORTED_MV_FORWARD_INDEX_FILE_EXTENSION); } LoaderUtils.writeIndexToV3Format(_segmentWriter, column, dictionaryFile, ColumnIndexType.DICTIONARY); LoaderUtils.writeIndexToV3Format(_segmentWriter, column, forwardIndexFile, ColumnIndexType.FORWARD_INDEX); } }
/**
 * Validates the Pinot schema against the Avro reader's schema: a single/multi-value mismatch
 * is fatal, while a missing field or a data-type mismatch only logs a warning.
 */
private void validateSchema() {
  org.apache.avro.Schema avroSchema = _avroReader.getSchema();
  for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
    String fieldName = fieldSpec.getName();
    Field avroField = avroSchema.getField(fieldName);
    if (avroField == null) {
      LOGGER.warn("Pinot field: {} does not exist in Avro Schema", fieldName);
      continue;
    }
    boolean isPinotFieldSingleValue = fieldSpec.isSingleValueField();
    boolean isAvroFieldSingleValue = AvroUtils.isSingleValueField(avroField);
    if (isPinotFieldSingleValue != isAvroFieldSingleValue) {
      // Cardinality mismatch cannot be reconciled at read time — fail fast.
      String errorMessage = "Pinot field: " + fieldName + " is " + (isPinotFieldSingleValue ? "Single" : "Multi")
          + "-valued in Pinot schema but not in Avro schema";
      LOGGER.error(errorMessage);
      throw new IllegalStateException(errorMessage);
    }
    DataType pinotFieldDataType = fieldSpec.getDataType();
    DataType avroFieldDataType = AvroUtils.extractFieldDataType(avroField);
    if (pinotFieldDataType != avroFieldDataType) {
      LOGGER.warn("Pinot field: {} of type: {} mismatches with corresponding field in Avro Schema of type: {}",
          fieldName, pinotFieldDataType, avroFieldDataType);
    }
  }
}
/**
 * Returns the row at the given docId, filling the reusable {@link GenericRow} with one
 * value per schema field (single-value read or multi-value read per field spec).
 */
private GenericRow getRecord(GenericRow reuse, int docId) {
  for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
    String fieldName = fieldSpec.getName();
    Object value = fieldSpec.isSingleValueField()
        ? _columnReaderMap.get(fieldName).readSV(docId, fieldSpec.getDataType())
        : _columnReaderMap.get(fieldName).readMV(docId);
    reuse.putField(fieldName, value);
  }
  return reuse;
}
/**
 * Transforms an Avro record into a {@link GenericRow}, converting each schema field's value.
 */
@Nonnull
public GenericRow transform(@Nonnull GenericData.Record from, @Nonnull GenericRow to) {
  for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
    // TIME fields read from the incoming time field spec; everything else uses its own spec.
    FieldSpec incomingFieldSpec =
        fieldSpec.getFieldType() == FieldSpec.FieldType.TIME ? _incomingTimeFieldSpec : fieldSpec;
    String fieldName = incomingFieldSpec.getName();
    Object avroValue = from.get(fieldName);
    Object converted = incomingFieldSpec.isSingleValueField()
        ? AvroUtils.transformAvroValueToObject(avroValue, incomingFieldSpec)
        : AvroUtils.transformAvroArrayToObjectArray((Array) avroValue, incomingFieldSpec);
    to.putField(fieldName, converted);
  }
  return to;
}
}
/**
 * Extracts a field value from a JSON node. Single-value fields yield a scalar (or the field's
 * default null value); multi-value fields always yield an {@code Object[]}, falling back to a
 * one-element array holding the default null value when the node is absent or empty.
 */
public static Object extractValue(@Nullable JsonNode jsonValue, FieldSpec fieldSpec) {
  boolean absent = jsonValue == null || jsonValue.isNull();
  if (fieldSpec.isSingleValueField()) {
    return absent ? fieldSpec.getDefaultNullValue() : extractSingleValue(jsonValue, fieldSpec.getDataType());
  }
  if (absent) {
    return new Object[]{fieldSpec.getDefaultNullValue()};
  }
  if (!jsonValue.isArray()) {
    // A scalar node for a multi-value field is wrapped into a singleton array.
    return new Object[]{extractSingleValue(jsonValue, fieldSpec.getDataType())};
  }
  int numValues = jsonValue.size();
  if (numValues == 0) {
    return new Object[]{fieldSpec.getDefaultNullValue()};
  }
  Object[] values = new Object[numValues];
  for (int i = 0; i < numValues; i++) {
    values[i] = extractSingleValue(jsonValue.get(i), fieldSpec.getDataType());
  }
  return values;
}
/**
 * Fills the data in a {@link GenericRecord} into a {@link GenericRow}, converting each
 * schema field from its Avro representation.
 */
public static void fillGenericRow(GenericRecord from, GenericRow to, Schema schema) {
  for (FieldSpec fieldSpec : schema.getAllFieldSpecs()) {
    String fieldName = fieldSpec.getName();
    Object avroValue = from.get(fieldName);
    Object converted = fieldSpec.isSingleValueField()
        ? transformAvroValueToObject(avroValue, fieldSpec)
        : transformAvroArrayToObjectArray((GenericData.Array) avroValue, fieldSpec);
    to.putField(fieldName, converted);
  }
}
// Backing buffer files for the off-heap inverted index of this column.
_invertedIndexValueBufferFile = new File(indexDir, columnName + INVERTED_INDEX_VALUE_BUFFER_SUFFIX);
_invertedIndexLengthBufferFile = new File(indexDir, columnName + INVERTED_INDEX_LENGTH_BUFFER_SUFFIX);
// Whether the column stores a single value per document.
_singleValue = fieldSpec.isSingleValueField();
_cardinality = cardinality;
_numDocs = numDocs;
@Override
public GenericRow next(GenericRow reuse) {
  CSVRecord record = _iterator.next();
  for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
    String column = fieldSpec.getName();
    // A column missing from the CSV record is passed as null to the converter.
    String token = record.isSet(column) ? record.get(column) : null;
    Object value;
    if (fieldSpec.isSingleValueField()) {
      value = RecordReaderUtils.convertToDataType(token, fieldSpec);
    } else {
      // Multi-value cells are split on the configured delimiter before conversion.
      value = RecordReaderUtils.convertToDataTypeArray(
          token == null ? null : StringUtils.split(token, _multiValueDelimiter), fieldSpec);
    }
    reuse.putField(column, value);
  }
  return reuse;
}
/**
 * Returns true if a dictionary should be created for a column, false otherwise.
 * Currently there are two sources for this config:
 * <ul>
 *   <li>ColumnIndexCreationInfo (this is currently hard-coded to always return dictionary).</li>
 *   <li>SegmentGeneratorConfig</li>
 * </ul>
 *
 * This method gives preference to the SegmentGeneratorConfig first.
 *
 * @param info Column index creation info
 * @param config Segment generation config
 * @param spec Field spec for the column
 * @return True if dictionary should be created for the column, false otherwise
 * @throws RuntimeException if a raw (no-dictionary) index is requested for a multi-value column
 */
private boolean createDictionaryForColumn(ColumnIndexCreationInfo info, SegmentGeneratorConfig config,
    FieldSpec spec) {
  String column = spec.getName();
  // Raw index explicitly requested via the generator config takes precedence.
  if (config.getRawIndexCreationColumns().contains(column) || config.getRawIndexCompressionType()
      .containsKey(column)) {
    if (!spec.isSingleValueField()) {
      throw new RuntimeException(
          "Creation of indices without dictionaries is supported for single valued columns only.");
    }
    return false;
  }
  // Compare enum constants with '==' (idiomatic and null-safe) instead of equals().
  // Variable-length BYTES values cannot be dictionary-encoded.
  if (spec.getDataType() == FieldSpec.DataType.BYTES && !info.isFixedLength()) {
    return false;
  }
  return info.isCreateDictionary();
}
/**
 * Folds the metric values of {@code row} into the values already stored at {@code docId}.
 * Only single-value, no-dictionary metrics of numeric type are supported.
 *
 * @return always true
 * @throws IllegalStateException if a metric is multi-valued or dictionary-encoded
 * @throws UnsupportedOperationException for non-numeric metric data types
 */
private boolean aggregateMetrics(GenericRow row, int docId) {
  for (FieldSpec metricSpec : _schema.getMetricFieldSpecs()) {
    String column = metricSpec.getName();
    Object value = row.getValue(column);
    Preconditions.checkState(metricSpec.isSingleValueField(), "Multivalued metrics cannot be updated.");
    FixedByteSingleColumnSingleValueReaderWriter indexReaderWriter =
        (FixedByteSingleColumnSingleValueReaderWriter) _indexReaderWriterMap.get(column);
    Preconditions.checkState(_dictionaryMap.get(column) == null, "Updating metrics not supported with dictionary.");
    FieldSpec.DataType dataType = metricSpec.getDataType();
    switch (dataType) {
      case INT:
        indexReaderWriter.setInt(docId, (Integer) value + indexReaderWriter.getInt(docId));
        break;
      case LONG:
        indexReaderWriter.setLong(docId, (Long) value + indexReaderWriter.getLong(docId));
        break;
      case FLOAT:
        // BUG FIX: previously added the stored value to itself, ignoring the incoming value.
        indexReaderWriter.setFloat(docId, (Float) value + indexReaderWriter.getFloat(docId));
        break;
      case DOUBLE:
        // BUG FIX: previously added the stored value to itself, ignoring the incoming value.
        indexReaderWriter.setDouble(docId, (Double) value + indexReaderWriter.getDouble(docId));
        break;
      default:
        throw new UnsupportedOperationException(
            "Unsupported data type: " + dataType + " for no-dictionary column: " + column);
    }
  }
  return true;
}
/**
 * Generates {@code numRows} rows of random test data matching the given schema.
 */
public static List<GenericRow> createTestData(Schema schema, int numRows) {
  List<GenericRow> rows = new ArrayList<>(numRows);
  Random random = new Random();
  for (int rowId = 0; rowId < numRows; rowId++) {
    Map<String, Object> fields = new HashMap<>();
    for (FieldSpec fieldSpec : schema.getAllFieldSpecs()) {
      Object value = fieldSpec.isSingleValueField()
          ? generateSingleValue(random, fieldSpec.getDataType())
          : generateMultiValue(random, fieldSpec.getDataType());
      fields.put(fieldSpec.getName(), value);
    }
    GenericRow row = new GenericRow();
    row.init(fields);
    rows.add(row);
  }
  return rows;
}
@Test
public void testDataSourceForSVColumns() {
  for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
    if (!fieldSpec.isSingleValueField()) {
      continue;
    }
    String column = fieldSpec.getName();
    DataSource actualDataSource = _mutableSegmentImpl.getDataSource(column);
    DataSource expectedDataSource = _immutableSegment.getDataSource(column);
    Dictionary actualDictionary = actualDataSource.getDictionary();
    Dictionary expectedDictionary = expectedDataSource.getDictionary();
    Assert.assertEquals(actualDictionary.length(), expectedDictionary.length());
    BlockSingleValIterator actualIterator =
        (BlockSingleValIterator) actualDataSource.nextBlock().getBlockValueSet().iterator();
    BlockSingleValIterator expectedIterator =
        (BlockSingleValIterator) expectedDataSource.nextBlock().getBlockValueSet().iterator();
    // Walk both iterators in lock-step and compare the dictionary-decoded values.
    while (expectedIterator.hasNext()) {
      Assert.assertTrue(actualIterator.hasNext());
      int actualDictId = actualIterator.nextIntVal();
      int expectedDictId = expectedIterator.nextIntVal();
      Assert.assertEquals(actualDictionary.get(actualDictId), expectedDictionary.get(expectedDictId));
    }
    // The mutable segment must not contain extra documents.
    Assert.assertFalse(actualIterator.hasNext());
  }
}
@Test
public void testDataSourceForMVColumns() {
  for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
    if (fieldSpec.isSingleValueField()) {
      continue;
    }
    String column = fieldSpec.getName();
    DataSource actualDataSource = _mutableSegmentImpl.getDataSource(column);
    DataSource expectedDataSource = _immutableSegment.getDataSource(column);
    Dictionary actualDictionary = actualDataSource.getDictionary();
    Dictionary expectedDictionary = expectedDataSource.getDictionary();
    Assert.assertEquals(actualDictionary.length(), expectedDictionary.length());
    BlockMultiValIterator actualIterator =
        (BlockMultiValIterator) actualDataSource.nextBlock().getBlockValueSet().iterator();
    BlockMultiValIterator expectedIterator =
        (BlockMultiValIterator) expectedDataSource.nextBlock().getBlockValueSet().iterator();
    // Scratch buffers sized to the widest multi-value entry in the expected segment.
    int maxNumMultiValues = expectedDataSource.getDataSourceMetadata().getMaxNumMultiValues();
    int[] actualDictIds = new int[maxNumMultiValues];
    int[] expectedDictIds = new int[maxNumMultiValues];
    // Walk both iterators in lock-step and compare each multi-value entry element-wise.
    while (expectedIterator.hasNext()) {
      Assert.assertTrue(actualIterator.hasNext());
      int actualLength = actualIterator.nextIntVal(actualDictIds);
      int expectedLength = expectedIterator.nextIntVal(expectedDictIds);
      Assert.assertEquals(actualLength, expectedLength);
      for (int i = 0; i < expectedLength; i++) {
        Assert.assertEquals(actualDictionary.get(actualDictIds[i]), expectedDictionary.get(expectedDictIds[i]));
      }
    }
    // The mutable segment must not contain extra documents.
    Assert.assertFalse(actualIterator.hasNext());
  }
}