/**
 * Registers every column of the schema for inverted index creation.
 *
 * <p>When no schema has been set, a warning is logged and the collection of inverted index
 * columns is left untouched.
 */
public void createInvertedIndexForAllColumns() {
  if (_schema != null) {
    for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
      _invertedIndexCreationColumns.add(fieldSpec.getName());
    }
    return;
  }
  // No schema available: there are no column names to register
  LOGGER.warn("Schema has not been set, will not create inverted index for all columns.");
}
public SegmentDictionaryCreator(Object sortedValues, FieldSpec fieldSpec, File indexDir) throws IOException { _sortedValues = sortedValues; _fieldSpec = fieldSpec; _dictionaryFile = new File(indexDir, fieldSpec.getName() + V1Constants.Dict.FILE_EXTENSION); FileUtils.touch(_dictionaryFile); }
/**
 * Stores the segment information and collects the names of all non-metric columns as dimensions.
 *
 * @param numDocs number of documents, stored as-is
 * @param schema schema whose field specs are scanned for dimensions
 * @param columnReaderMap column readers keyed by column name, stored as-is
 */
public PinotSegmentSorter(int numDocs, Schema schema, Map<String, PinotSegmentColumnReader> columnReaderMap) {
  _numDocs = numDocs;
  _schema = schema;
  _columnReaderMap = columnReaderMap;
  _dimensionNames = new ArrayList<>();

  for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
    // Metrics are skipped; every other field type is counted as a dimension
    if (fieldSpec.getFieldType() == FieldSpec.FieldType.METRIC) {
      continue;
    }
    _dimensionNames.add(fieldSpec.getName());
    _numDimensions++;
  }
}
/**
 * Fills the given row with the value of every schema column for the given document.
 *
 * @param docId id of the document to read
 * @param reuse row to fill; it is also the returned object
 * @return the {@code reuse} row with one field per schema column
 */
@Override
public GenericRow getRecord(int docId, GenericRow reuse) {
  for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
    String column = fieldSpec.getName();
    // Columns without a recorded max number of values fall back to 0
    Object value = IndexSegmentUtils.getValue(
        docId,
        fieldSpec,
        _indexReaderWriterMap.get(column),
        _dictionaryMap.get(column),
        _maxNumValuesMap.getOrDefault(column, 0));
    reuse.putField(column, value);
  }
  return reuse;
}
/**
 * For REALTIME segment.
 *
 * <p>Column name, data type and the single-value flag are derived from the field spec; all other
 * arguments are forwarded unchanged to the delegate constructor.
 *
 * @param fieldSpec field spec of the column
 * @param numDocs number of documents
 * @param maxNumMultiValues maximum number of multi-values
 * @param forwardIndex forward index reader
 * @param invertedIndex inverted index reader
 * @param dictionary mutable dictionary
 * @param bloomFilter bloom filter reader
 */
public ColumnDataSource(
    FieldSpec fieldSpec,
    int numDocs,
    int maxNumMultiValues,
    DataFileReader forwardIndex,
    InvertedIndexReader invertedIndex,
    MutableDictionary dictionary,
    BloomFilterReader bloomFilter) {
  this(
      fieldSpec.getName(),
      fieldSpec.getDataType(),
      fieldSpec.isSingleValueField(),
      false, // NOTE(review): presumably the "is sorted" flag of the delegate constructor - confirm
      numDocs,
      maxNumMultiValues,
      forwardIndex,
      invertedIndex,
      dictionary,
      bloomFilter,
      Constants.UNKNOWN_CARDINALITY);
}
/**
 * Builds a comma separated list of the names of all schema fields with the given field type.
 * The names are sorted in natural {@link String} order before being joined.
 *
 * @param type FieldType to filter on
 * @param excludeVirtualColumns when {@code true}, fields that the schema reports as virtual
 *                              columns are left out
 * @return Comma separated qualifying field names.
 */
@JsonIgnore
private String getQualifyingFields(FieldType type, boolean excludeVirtualColumns) {
  List<String> qualifyingNames = new ArrayList<>();
  for (final FieldSpec spec : getSchema().getAllFieldSpecs()) {
    String name = spec.getName();
    // The virtual column lookup only happens when exclusion was requested
    boolean excluded = excludeVirtualColumns && getSchema().isVirtualColumn(name);
    if (!excluded && spec.getFieldType() == type) {
      qualifyingNames.add(name);
    }
  }
  Collections.sort(qualifyingNames);
  return StringUtils.join(qualifyingNames, ",");
}
}
/**
 * Decodes a JSON payload into the given row, extracting one value per schema column.
 *
 * @param payload bytes of the JSON message
 * @param destination row to fill; it is also the returned object on success
 * @return the {@code destination} row, or {@code null} when any exception occurs while decoding
 *         (the exception is logged and the row is discarded)
 */
@Override
public GenericRow decode(byte[] payload, GenericRow destination) {
  try {
    JsonNode message = JsonUtils.bytesToJsonNode(payload);
    for (FieldSpec fieldSpec : schema.getAllFieldSpecs()) {
      String column = fieldSpec.getName();
      JsonNode columnNode = message.get(column);
      destination.putField(column, JsonUtils.extractValue(columnNode, fieldSpec));
    }
    return destination;
  } catch (Exception e) {
    LOGGER.error("Caught exception while decoding row, discarding row.", e);
    return null;
  }
}
private void addInvertedIndex(int docId, Map<String, Object> dictIdMap) { // Update inverted index at last // NOTE: inverted index have to be updated at last because once it gets updated, the latest record will become // queryable for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) { String column = fieldSpec.getName(); RealtimeInvertedIndexReader invertedIndex = _invertedIndexMap.get(column); if (invertedIndex != null) { if (fieldSpec.isSingleValueField()) { invertedIndex.add(((Integer) dictIdMap.get(column)), docId); } else { int[] dictIds = (int[]) dictIdMap.get(column); for (int dictId : dictIds) { invertedIndex.add(dictId, docId); } } } } }
/**
 * Reads the next record from the iterator and fills the given row with one converted value per
 * schema column.
 *
 * <p>Single-value fields are converted from the string form of the raw value ({@code null} when
 * the raw value is absent); multi-value fields are cast to {@link ArrayList} and converted as an
 * array.
 *
 * @param reuse row to fill; it is also the returned object
 * @return the {@code reuse} row
 */
@Override
public GenericRow next(GenericRow reuse) {
  Map record = _iterator.next();
  for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
    String fieldName = fieldSpec.getName();
    Object jsonValue = record.get(fieldName);

    Object value;
    if (!fieldSpec.isSingleValueField()) {
      value = RecordReaderUtils.convertToDataTypeArray((ArrayList) jsonValue, fieldSpec);
    } else {
      String token = null;
      if (jsonValue != null) {
        token = jsonValue.toString();
      }
      value = RecordReaderUtils.convertToDataType(token, fieldSpec);
    }
    reuse.putField(fieldName, value);
  }
  return reuse;
}
/**
 * Converts the given object to JSON and copies one extracted value per schema column into the
 * shared record instance.
 *
 * <p>The shared record is cleared first and then returned, so the result of a previous call is
 * overwritten.
 *
 * @param t object to serialize
 * @return the shared record filled with the values of {@code t}
 */
@Override
public PinotRecord serialize(T t) {
  _record.clear();
  JsonNode jsonRecord = JsonUtils.objectToJsonNode(t);
  for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
    String column = fieldSpec.getName();
    JsonNode columnNode = jsonRecord.get(column);
    _record.putField(column, JsonUtils.extractValue(columnNode, fieldSpec));
  }
  return _record;
}
/**
 * Checks every field of the Pinot schema against the schema of the Avro reader.
 *
 * <ul>
 *   <li>A Pinot field missing from the Avro schema only produces a warning.</li>
 *   <li>A single-value/multi-value mismatch is logged as an error and aborts validation.</li>
 *   <li>A data type mismatch only produces a warning.</li>
 * </ul>
 *
 * @throws IllegalStateException if a field is single-valued in one schema and multi-valued in
 *                               the other
 */
private void validateSchema() {
  org.apache.avro.Schema avroSchema = _avroReader.getSchema();
  for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
    String fieldName = fieldSpec.getName();
    Field avroField = avroSchema.getField(fieldName);
    if (avroField == null) {
      LOGGER.warn("Pinot field: {} does not exist in Avro Schema", fieldName);
      continue;
    }

    // Single-value vs multi-value must agree, otherwise validation fails
    boolean pinotSingleValue = fieldSpec.isSingleValueField();
    boolean avroSingleValue = AvroUtils.isSingleValueField(avroField);
    if (pinotSingleValue != avroSingleValue) {
      String valueKind = pinotSingleValue ? "Single" : "Multi";
      String errorMessage =
          "Pinot field: " + fieldName + " is " + valueKind + "-valued in Pinot schema but not in Avro schema";
      LOGGER.error(errorMessage);
      throw new IllegalStateException(errorMessage);
    }

    // A data type mismatch is tolerated, but reported
    DataType pinotDataType = fieldSpec.getDataType();
    DataType avroDataType = AvroUtils.extractFieldDataType(avroField);
    if (pinotDataType != avroDataType) {
      LOGGER.warn("Pinot field: {} of type: {} mismatches with corresponding field in Avro Schema of type: {}",
          fieldName, pinotDataType, avroDataType);
    }
  }
}
/**
 * Fills the given row with the values of all schema columns for the given docId.
 *
 * @param reuse row to fill; it is also the returned object
 * @param docId id of the document to read
 * @return the {@code reuse} row
 */
private GenericRow getRecord(GenericRow reuse, int docId) {
  for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
    String column = fieldSpec.getName();
    if (!fieldSpec.isSingleValueField()) {
      // Multi-value column
      reuse.putField(column, _columnReaderMap.get(column).readMV(docId));
      continue;
    }
    // Single-value column: the reader is given the data type of the field
    reuse.putField(column, _columnReaderMap.get(column).readSV(docId, fieldSpec.getDataType()));
  }
  return reuse;
}
/**
 * Copies the values of an Avro record into a row, one field per schema column.
 *
 * <p>For the TIME field the incoming time field spec replaces the schema's field spec, so the
 * value is read from, and stored under, the incoming field's name.
 *
 * @param from Avro record to read from
 * @param to row to fill; it is also the returned object
 * @return the {@code to} row
 */
@Nonnull
public GenericRow transform(@Nonnull GenericData.Record from, @Nonnull GenericRow to) {
  for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
    FieldSpec incomingFieldSpec = fieldSpec;
    if (fieldSpec.getFieldType() == FieldSpec.FieldType.TIME) {
      incomingFieldSpec = _incomingTimeFieldSpec;
    }

    String fieldName = incomingFieldSpec.getName();
    Object avroValue = from.get(fieldName);
    if (!incomingFieldSpec.isSingleValueField()) {
      to.putField(fieldName, AvroUtils.transformAvroArrayToObjectArray((Array) avroValue, incomingFieldSpec));
    } else {
      to.putField(fieldName, AvroUtils.transformAvroValueToObject(avroValue, incomingFieldSpec));
    }
  }
  return to;
}
}
/**
 * Fill the data in a {@link GenericRecord} to a {@link GenericRow}.
 *
 * <p>One field is written per column of the schema: single-value fields go through
 * {@code transformAvroValueToObject}, multi-value fields are cast to {@link GenericData.Array}
 * and go through {@code transformAvroArrayToObjectArray}.
 *
 * @param from Avro record to read from
 * @param to row that receives the fields
 * @param schema schema listing the columns to copy
 */
public static void fillGenericRow(GenericRecord from, GenericRow to, Schema schema) {
  for (FieldSpec fieldSpec : schema.getAllFieldSpecs()) {
    String column = fieldSpec.getName();
    Object rawValue = from.get(column);
    if (!fieldSpec.isSingleValueField()) {
      to.putField(column, transformAvroArrayToObjectArray((GenericData.Array) rawValue, fieldSpec));
      continue;
    }
    to.putField(column, transformAvroValueToObject(rawValue, fieldSpec));
  }
}
/**
 * Reads the next CSV record from the iterator and fills the given row with one converted value
 * per schema column.
 *
 * <p>A column that is not set in the record yields a {@code null} token. For multi-value fields
 * a non-null token is split on the multi-value delimiter before conversion.
 *
 * @param reuse row to fill; it is also the returned object
 * @return the {@code reuse} row
 */
@Override
public GenericRow next(GenericRow reuse) {
  CSVRecord record = _iterator.next();
  for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
    String column = fieldSpec.getName();

    String token = null;
    if (record.isSet(column)) {
      token = record.get(column);
    }

    Object value;
    if (!fieldSpec.isSingleValueField()) {
      String[] tokens = null;
      if (token != null) {
        tokens = StringUtils.split(token, _multiValueDelimiter);
      }
      value = RecordReaderUtils.convertToDataTypeArray(tokens, fieldSpec);
    } else {
      value = RecordReaderUtils.convertToDataType(token, fieldSpec);
    }
    reuse.putField(column, value);
  }
  return reuse;
}
/**
 * Fills the given row with the value of every column in the segment metadata's schema for the
 * given document.
 *
 * @param docId id of the document to read
 * @param reuse row to fill; it is also the returned object
 * @return the {@code reuse} row
 */
@Override
public GenericRow getRecord(int docId, GenericRow reuse) {
  for (FieldSpec fieldSpec : _segmentMetadata.getSchema().getAllFieldSpecs()) {
    String column = fieldSpec.getName();
    ColumnIndexContainer indexContainer = _indexContainerMap.get(column);
    // Forward index and dictionary come from the index container; the max number of multi-values
    // comes from the column metadata
    Object value = IndexSegmentUtils.getValue(
        docId,
        fieldSpec,
        indexContainer.getForwardIndex(),
        indexContainer.getDictionary(),
        _segmentMetadata.getColumnMetadataFor(column).getMaxNumberOfMultiValues());
    reuse.putField(column, value);
  }
  return reuse;
}
}
/**
 * Produces a random value for the given column.
 *
 * <p>The column named {@code TIME_COL_NAME} gets a random time value; every other column gets a
 * random value of the field's data type.
 *
 * @param fieldSpec field spec of the column
 * @param isSimpleDate forwarded to the time value generator; only used for the time column
 * @return the generated value
 */
private Object getRandomValueForColumn(FieldSpec fieldSpec, boolean isSimpleDate) {
  if (!fieldSpec.getName().equals(TIME_COL_NAME)) {
    return RawIndexCreatorTest.getRandomValue(_random, fieldSpec.getDataType());
  }
  return getRandomValueForTimeColumn(isSimpleDate);
}
private boolean shouldConvertColumn(FieldSpec fieldSpec) { String columnName = fieldSpec.getName(); FieldSpec.DataType dataType = fieldSpec.getDataType(); int numTotalDocs = _originalSegmentMetadata.getTotalDocs(); ColumnMetadata columnMetadata = _originalSegmentMetadata.getColumnMetadataFor(columnName); int cardinality = columnMetadata.getCardinality(); // In bits int lengthOfEachEntry; if (dataType.equals(FieldSpec.DataType.STRING)) { lengthOfEachEntry = columnMetadata.getColumnMaxLength() * Byte.SIZE; } else { lengthOfEachEntry = dataType.size() * Byte.SIZE; } long dictionaryBasedIndexSize = (long) numTotalDocs * columnMetadata.getBitsPerElement() + (long) cardinality * lengthOfEachEntry; long rawIndexSize = (long) numTotalDocs * lengthOfEachEntry; LOGGER.info( "For column: {}, size of dictionary based index: {} bits, size of raw index (without compression): {} bits", columnName, dictionaryBasedIndexSize, rawIndexSize); return rawIndexSize <= dictionaryBasedIndexSize * CONVERSION_THRESHOLD; }
/**
 * Creates rows of random test data for the given schema.
 *
 * <p>Every row holds one value per schema column: a generated single value for single-value
 * fields, a generated multi value otherwise. The random generator is unseeded, so results differ
 * between calls.
 *
 * @param schema schema listing the columns to generate values for
 * @param numRows number of rows to create
 * @return list of the generated rows
 */
public static List<GenericRow> createTestData(Schema schema, int numRows) {
  List<GenericRow> rows = new ArrayList<>();
  final Random random = new Random();

  for (int rowId = 0; rowId < numRows; rowId++) {
    Map<String, Object> fields = new HashMap<>();
    for (FieldSpec fieldSpec : schema.getAllFieldSpecs()) {
      Object value;
      if (!fieldSpec.isSingleValueField()) {
        value = generateMultiValue(random, fieldSpec.getDataType());
      } else {
        value = generateSingleValue(random, fieldSpec.getDataType());
      }
      fields.put(fieldSpec.getName(), value);
    }

    GenericRow row = new GenericRow();
    row.init(fields);
    rows.add(row);
  }
  return rows;
}
/**
 * Verifies that the mutable segment reports the same metadata as the immutable segment: total
 * number of docs and, per schema column, data type, single-value flag and number of docs. The
 * max number of multi-values is only compared for multi-value columns.
 */
@Test
public void testMetadata() {
  SegmentMetadata actualSegmentMetadata = _mutableSegmentImpl.getSegmentMetadata();
  SegmentMetadata expectedSegmentMetadata = _immutableSegment.getSegmentMetadata();
  Assert.assertEquals(actualSegmentMetadata.getTotalDocs(), expectedSegmentMetadata.getTotalDocs());

  for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
    String column = fieldSpec.getName();
    DataSourceMetadata actual = _mutableSegmentImpl.getDataSource(column).getDataSourceMetadata();
    DataSourceMetadata expected = _immutableSegment.getDataSource(column).getDataSourceMetadata();

    Assert.assertEquals(actual.getDataType(), expected.getDataType());
    Assert.assertEquals(actual.isSingleValue(), expected.isSingleValue());
    Assert.assertEquals(actual.getNumDocs(), expected.getNumDocs());

    if (expected.isSingleValue()) {
      // Max number of multi-values is only meaningful for multi-value columns
      continue;
    }
    Assert.assertEquals(actual.getMaxNumMultiValues(), expected.getMaxNumMultiValues());
  }
}