private List<Integer> computeDefaultSplitOrder() {
  List<Integer> defaultSplitOrder = new ArrayList<>();

  // Sort on all non-time dimensions that are not skipped in descending order
  Set<String> timeDimensions = new HashSet<>(_schema.getDateTimeNames());
  String timeColumnName = _schema.getTimeColumnName();
  if (timeColumnName != null) {
    timeDimensions.add(timeColumnName);
  }
  for (int i = 0; i < _numDimensions; i++) {
    if (!_skipMaterializationDimensions.contains(i) && !timeDimensions.contains(_dimensionNames.get(i))) {
      defaultSplitOrder.add(i);
    }
  }
  defaultSplitOrder.sort((o1, o2) -> {
    // Descending order
    return _dimensionDictionaries.get(o2).size() - _dimensionDictionaries.get(o1).size();
  });
  return defaultSplitOrder;
}
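// A minimal, self-contained sketch of the same descending-cardinality ordering, using hypothetical
// dimension indices and dictionary sizes (the sizes below are assumptions, not from the source).
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class SplitOrderSketch {
  public static void main(String[] args) {
    // Hypothetical dictionary sizes for dimension indices 0..3.
    int[] dictionarySizes = {10, 500, 42, 500};
    List<Integer> order = new ArrayList<>(Arrays.asList(0, 1, 2, 3));
    // Same comparator shape as computeDefaultSplitOrder(); for very large sizes,
    // Integer.compare(dictionarySizes[o2], dictionarySizes[o1]) avoids subtraction overflow.
    order.sort((o1, o2) -> dictionarySizes[o2] - dictionarySizes[o1]);
    System.out.println(order); // [1, 3, 2, 0] -- ties keep encounter order (the sort is stable)
  }
}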
private int getOrCreateDocId(Map<String, Object> dictIdMap) {
  if (!_aggregateMetrics) {
    return _numDocsIndexed;
  }

  int i = 0;
  int[] dictIds = new int[_numKeyColumns]; // dimensions + time column

  for (String column : _schema.getDimensionNames()) {
    dictIds[i++] = (Integer) dictIdMap.get(column);
  }
  String timeColumnName = _schema.getTimeColumnName();
  if (timeColumnName != null) {
    dictIds[i] = (Integer) dictIdMap.get(timeColumnName);
  }
  return _recordIdMap.put(new FixedIntArray(dictIds));
}
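// _recordIdMap assigns one doc id per unique (dimensions + time) key, so rows with identical keys
// aggregate into the same document. A minimal sketch of such an id-assigning map, assuming a
// hypothetical put() contract of "return the existing id, or assign the next free one".
import java.util.HashMap;
import java.util.Map;

final class IdMapSketch<K> {
  private final Map<K, Integer> _idByKey = new HashMap<>();

  // Returns the id already assigned to this key, or assigns the next sequential id.
  // Keys must implement equals()/hashCode(), as a dict-id array wrapper like FixedIntArray would.
  int put(K key) {
    return _idByKey.computeIfAbsent(key, k -> _idByKey.size());
  }
}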
/**
 * Splits the leaf nodes on the time column if we have not split on the time column yet, and the time column is
 * still preserved (i.e. not replaced by StarTreeNode.all()).
 * <p>The method visits each leaf node and does the following:
 * <ul>
 *   <li>Re-orders the documents under the leaf node based on the time column</li>
 *   <li>Creates a child node for each time value under this leaf node</li>
 * </ul>
 */
private void splitLeafNodesOnTimeColumn()
    throws IOException {
  String timeColumnName = _schema.getTimeColumnName();
  if (timeColumnName != null) {
    int timeColumnId = _dimensionNames.indexOf(timeColumnName);
    if (!_skipMaterializationDimensions.contains(timeColumnId) && !_dimensionsSplitOrder.contains(timeColumnId)) {
      try (StarTreeDataTable dataTable = new StarTreeDataTable(PinotDataBuffer
          .mapFile(_dataFile, false, 0, _dataFile.length(), PinotDataBuffer.NATIVE_ORDER,
              "OffHeapStarTreeBuilder#splitLeafNodesOnTimeColumn: data buffer"), _dimensionSize, _metricSize, 0)) {
        splitLeafNodesOnTimeColumnHelper(dataTable, _rootNode, 0, timeColumnId);
      }
    }
  }
}
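// A simplified, self-contained sketch of the per-leaf split described in the javadoc above, using
// hypothetical time-column dict ids; the real helper does this over the memory-mapped data table.
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.Map;

public class TimeSplitSketch {
  public static void main(String[] args) {
    // Hypothetical time-column dict ids for the documents under one leaf node.
    int[] timeDictIds = {7, 3, 3, 7, 1};
    // Step 1: re-order the leaf's documents on the time column.
    Arrays.sort(timeDictIds);
    // Step 2: one child (represented here as a [start, end) doc range) per distinct time value.
    Map<Integer, int[]> children = new LinkedHashMap<>();
    int start = 0;
    for (int i = 1; i <= timeDictIds.length; i++) {
      if (i == timeDictIds.length || timeDictIds[i] != timeDictIds[start]) {
        children.put(timeDictIds[start], new int[]{start, i});
        start = i;
      }
    }
    System.out.println(children.keySet()); // [1, 3, 7]
  }
}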
String timeColumn = schema.getTimeColumnName();
if (noDictionaryColumns.contains(timeColumn)) {
  _logger
String timeColumn = schema.getTimeColumnName();
if (timeColumn != null) {
  uniqueSingleValueDimensions.add(timeColumn);
/**
 * @deprecated Load the schema outside the class and use {@link #setSchema(Schema)} instead.
 * @throws IOException
 */
@Deprecated
public void loadConfigFiles()
    throws IOException {
  Schema schema;
  if (_schemaFile != null) {
    schema = Schema.fromFile(new File(_schemaFile));
    setSchema(schema);
  } else if (_format == FileFormat.AVRO) {
    schema = AvroUtils.getPinotSchemaFromAvroDataFile(new File(_inputFilePath));
    setSchema(schema);
  } else {
    throw new RuntimeException("Input format " + _format + " requires schema.");
  }
  setTimeColumnName(schema.getTimeColumnName());
  TimeFieldSpec timeFieldSpec = schema.getTimeFieldSpec();
  if (timeFieldSpec != null) {
    setSegmentTimeUnit(timeFieldSpec.getIncomingGranularitySpec().getTimeType());
  } else {
    setSegmentTimeUnit(TimeUnit.DAYS);
  }
  if (_readerConfigFile != null) {
    setReaderConfig(JsonUtils.fileToObject(new File(_readerConfigFile), CSVRecordReaderConfig.class));
  }
}
private static void printSchema(Schema schema) {
  LOGGER.info("schemaName: {}", schema.getSchemaName());
  LOGGER.info("Dimension columnNames: ");
  int i = 0;
  for (DimensionFieldSpec spec : schema.getDimensionFieldSpecs()) {
    String columnInfo = i + " " + spec.getName();
    if (!spec.isSingleValueField()) {
      LOGGER.info(columnInfo + " Multi-Value.");
    } else {
      LOGGER.info(columnInfo);
    }
    i += 1;
  }
  LOGGER.info("Metric columnNames: ");
  i = 0;
  for (MetricFieldSpec spec : schema.getMetricFieldSpecs()) {
    String columnInfo = i + " " + spec.getName();
    if (!spec.isSingleValueField()) {
      LOGGER.info(columnInfo + " Multi-Value.");
    } else {
      LOGGER.info(columnInfo);
    }
    i += 1;
  }
  LOGGER.info("Time column: {}", schema.getTimeColumnName());
}
/**
 * Rolls up input segments using the segment converter.
 *
 * @param schema input schema
 * @return a list of rolled-up segments
 */
private List<File> rollupSegments(Schema schema)
    throws Exception {
  // Compute group-by columns for roll-up preparation (all dimensions + time column)
  List<String> groupByColumns = new ArrayList<>();
  for (DimensionFieldSpec dimensionFieldSpec : schema.getDimensionFieldSpecs()) {
    groupByColumns.add(dimensionFieldSpec.getName());
  }
  String timeColumn = schema.getTimeColumnName();
  if (timeColumn != null) {
    groupByColumns.add(timeColumn);
  }

  // Initialize roll-up record transformer
  // TODO: add support for roll-up with time granularity change
  RecordTransformer rollupRecordTransformer = (row) -> row;

  // Initialize roll-up record aggregator
  RecordAggregator rollupRecordAggregator = new RollupRecordAggregator(schema, _rolllupPreAggregateType);

  SegmentConverter rollupSegmentConverter =
      new SegmentConverter.Builder().setTableName(_tableName).setSegmentName(_segmentName)
          .setInputIndexDirs(_inputIndexDirs).setWorkingDir(_workingDir).setRecordTransformer(rollupRecordTransformer)
          .setRecordAggregator(rollupRecordAggregator).setGroupByColumns(groupByColumns)
          .setIndexingConfig(_indexingConfig).build();
  return rollupSegmentConverter.convertSegment();
}
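// Roll-up groups rows on (all dimensions + time column) and aggregates metrics within each group.
// A minimal sketch of a SUM pre-aggregation over hypothetical rows, in plain Java rather than the
// RecordAggregator interface used above.
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class RollupSketch {
  public static void main(String[] args) {
    // Hypothetical rows: (dimension value, time value, metric value).
    Object[][] rows = {{"us", 100, 3L}, {"us", 100, 4L}, {"eu", 100, 5L}};
    Map<List<Object>, Long> rollup = new LinkedHashMap<>();
    for (Object[] row : rows) {
      List<Object> groupKey = Arrays.asList(row[0], row[1]); // dimensions + time
      rollup.merge(groupKey, (Long) row[2], Long::sum);      // SUM pre-aggregation
    }
    System.out.println(rollup); // {[us, 100]=7, [eu, 100]=5}
  }
}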
public static SegmentGeneratorConfig getSegmentGeneratorConfigWithSchema(File inputAvro, File outputDir,
    String tableName, Schema schema) {
  SegmentGeneratorConfig segmentGeneratorConfig = new SegmentGeneratorConfig(schema);
  segmentGeneratorConfig.setInputFilePath(inputAvro.getAbsolutePath());
  segmentGeneratorConfig.setOutDir(outputDir.getAbsolutePath());
  segmentGeneratorConfig.setFormat(FileFormat.AVRO);
  segmentGeneratorConfig.setSegmentVersion(SegmentVersion.v1);
  segmentGeneratorConfig.setTableName(tableName);
  segmentGeneratorConfig.setTimeColumnName(schema.getTimeColumnName());
  segmentGeneratorConfig.setSegmentTimeUnit(schema.getOutgoingTimeUnit());
  return segmentGeneratorConfig;
}
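// A hedged usage sketch: hand the config to Pinot's segment creation driver. The driver's
// init()/build() pattern exists in Pinot, but the exact class wiring depends on the Pinot version,
// and inputAvro, outputDir and schema are hypothetical caller-supplied values.
SegmentGeneratorConfig config = getSegmentGeneratorConfigWithSchema(inputAvro, outputDir, "myTable", schema);
SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl();
driver.init(config);  // validate the config and open the record reader
driver.build();       // write the segment under outputDir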
public void addColumnMinMaxValue()
    throws Exception {
  Preconditions.checkState(_columnMinMaxValueGeneratorMode != ColumnMinMaxValueGeneratorMode.NONE);
  Schema schema = _segmentMetadata.getSchema();

  // Process time column
  String timeColumnName = schema.getTimeColumnName();
  if (timeColumnName != null) {
    addColumnMinMaxValueForColumn(timeColumnName);
  }
  if (_columnMinMaxValueGeneratorMode == ColumnMinMaxValueGeneratorMode.TIME) {
    saveMetadata();
    return;
  }

  // Process dimension columns
  for (String dimensionColumnName : schema.getDimensionNames()) {
    addColumnMinMaxValueForColumn(dimensionColumnName);
  }
  if (_columnMinMaxValueGeneratorMode == ColumnMinMaxValueGeneratorMode.NON_METRIC) {
    saveMetadata();
    return;
  }

  // Process metric columns
  for (String metricColumnName : schema.getMetricNames()) {
    addColumnMinMaxValueForColumn(metricColumnName);
  }
  saveMetadata();
}
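// The early returns above implement a cumulative mode: TIME covers the time column only,
// NON_METRIC adds dimension columns, ALL adds metric columns. A minimal sketch of the same cascade:
enum ModeSketch { NONE, TIME, NON_METRIC, ALL }

class MinMaxModeSketch {
  static void process(ModeSketch mode) {
    if (mode == ModeSketch.NONE) {
      return;
    }
    System.out.println("time column");        // TIME, NON_METRIC and ALL
    if (mode == ModeSketch.TIME) {
      return;
    }
    System.out.println("dimension columns");  // NON_METRIC and ALL
    if (mode == ModeSketch.NON_METRIC) {
      return;
    }
    System.out.println("metric columns");     // ALL only
  }
}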
protected void setUpTable(File avroFile)
    throws Exception {
  String schemaName = _schema.getSchemaName();
  addSchema(getSchemaFile(), schemaName);

  String timeColumnName = _schema.getTimeColumnName();
  Assert.assertNotNull(timeColumnName);
  TimeUnit outgoingTimeUnit = _schema.getOutgoingTimeUnit();
  Assert.assertNotNull(outgoingTimeUnit);
  String timeType = outgoingTimeUnit.toString();

  addHybridTable(getTableName(), useLlc(), KafkaStarterUtils.DEFAULT_KAFKA_BROKER, KafkaStarterUtils.DEFAULT_ZK_STR,
      getKafkaTopic(), getRealtimeSegmentFlushSize(), avroFile, timeColumnName, timeType, schemaName, TENANT_NAME,
      TENANT_NAME, getLoadMode(), getSortedColumn(), getInvertedIndexColumns(), getBloomFilterIndexColumns(),
      getRawIndexColumns(), getTaskConfig(), getStreamConsumerFactoryClassName());
  completeTableConfiguration();
}
protected void setUpTable(File avroFile)
    throws Exception {
  File schemaFile = getSchemaFile();
  Schema schema = Schema.fromFile(schemaFile);
  String schemaName = schema.getSchemaName();
  addSchema(schemaFile, schemaName);

  String timeColumnName = schema.getTimeColumnName();
  Assert.assertNotNull(timeColumnName);
  TimeUnit outgoingTimeUnit = schema.getOutgoingTimeUnit();
  Assert.assertNotNull(outgoingTimeUnit);
  String timeType = outgoingTimeUnit.toString();

  addRealtimeTable(getTableName(), useLlc(), KafkaStarterUtils.DEFAULT_KAFKA_BROKER, KafkaStarterUtils.DEFAULT_ZK_STR,
      getKafkaTopic(), getRealtimeSegmentFlushSize(), avroFile, timeColumnName, timeType, schemaName, null, null,
      getLoadMode(), getSortedColumn(), getInvertedIndexColumns(), getBloomFilterIndexColumns(), getRawIndexColumns(),
      getTaskConfig(), getStreamConsumerFactoryClassName());
  completeTableConfiguration();
}
private void setupRealtimeTable(String table)
    throws Exception {
  _offlineTableConfig = null;
  File schemaFile = getSchemaFile();
  Schema schema = Schema.fromFile(schemaFile);
  String schemaName = schema.getSchemaName();
  addSchema(schemaFile, schemaName);

  String timeColumnName = schema.getTimeColumnName();
  Assert.assertNotNull(timeColumnName);
  TimeUnit outgoingTimeUnit = schema.getOutgoingTimeUnit();
  Assert.assertNotNull(outgoingTimeUnit);
  String timeType = outgoingTimeUnit.toString();

  addRealtimeTable(table, useLlc(), KafkaStarterUtils.DEFAULT_KAFKA_BROKER, KafkaStarterUtils.DEFAULT_ZK_STR,
      getKafkaTopic(), getRealtimeSegmentFlushSize(), null, timeColumnName, timeType, schemaName, null, null,
      getLoadMode(), getSortedColumn(), getInvertedIndexColumns(), getBloomFilterIndexColumns(), getRawIndexColumns(),
      getTaskConfig(), getStreamConsumerFactoryClassName());
  completeTableConfiguration();
}
String timeColumnName = schema.getTimeColumnName();
Assert.assertNotNull(timeColumnName);
TimeUnit outgoingTimeUnit = schema.getOutgoingTimeUnit();