public PartitionIterator executeInternal(ReadExecutionController controller) { // Note that the only difference between the command in a group must be the partition key on which // they applied. boolean enforceStrictLiveness = commands.get(0).metadata().enforceStrictLiveness(); return limits.filter(UnfilteredPartitionIterators.filter(executeLocally(controller, false), nowInSec), nowInSec, selectsFullPartitions, enforceStrictLiveness); }
/**
 * Initializes the pager state shared by all concrete pagers: the command being paged,
 * the protocol version used to serialize paging state, and the remaining-row counters
 * seeded from the command's limits.
 */
protected AbstractQueryPager(ReadCommand command, ProtocolVersion protocolVersion)
{
    this.command = command;
    this.protocolVersion = protocolVersion;
    this.limits = command.limits();
    // Mirrors the table's enforceStrictLiveness flag from the command's metadata.
    this.enforceStrictLiveness = command.metadata().enforceStrictLiveness();
    // Total rows still to return, and rows still to return within the current partition.
    this.remaining = limits.count();
    this.remainingInPartition = limits.perPartitionCount();
}
/**
 * Deserializes a legacy (pre-3.0 wire format) slice read into a
 * {@code SinglePartitionReadCommand}, reconstructing the clustering filter,
 * column selection and limits from the legacy count/compositesToGroup encoding.
 */
private SinglePartitionReadCommand deserializeSliceCommand(DataInputPlus in, boolean isDigest, CFMetaData metadata, DecoratedKey key, int nowInSeconds, int version) throws IOException
{
    Pair<ClusteringIndexSliceFilter, Boolean> p = deserializeSlicePartitionFilter(in, metadata);
    ClusteringIndexSliceFilter filter = p.left;
    boolean selectsStatics = p.right;
    int count = in.readInt();
    int compositesToGroup = in.readInt();

    // if a slice query from a pre-3.0 node doesn't cover statics, we shouldn't select them at all
    ColumnFilter columnFilter = LegacyRangeSliceCommandSerializer.getColumnSelectionForSlice(selectsStatics, compositesToGroup, metadata);

    // We have 2 types of DISTINCT queries: ones on only the partition key, and ones on the partition key and static columns. For the former,
    // we can easily detect the case because compositesToGroup is -2 and that's the only case it can be that. The latter is problematic
    // however as we have no way to distinguish it from a normal select with a limit of 1 (and this, contrary to the range query case
    // where the countCQL3Rows boolean allows us to decide).
    // So we consider this case not distinct here. This is ok because even if it is a distinct (with static), the count will be 1 and
    // we'll still just query one row (a distinct DataLimits currently behaves exactly like a CQL limit with a count of 1). The only
    // drawback is that we'll send back the first row entirely while a 2.1/2.2 node would return only the first cell in that same
    // situation. This isn't a problem for 2.1/2.2 code however (it would be for a range query, as it would throw off the count for
    // reasons similar to CASSANDRA-10762, but it's ok for single partition queries).
    // We do _not_ want to do the reverse however and consider a 'SELECT * FROM foo LIMIT 1' as a DISTINCT query as that would make
    // us only return the 1st cell rather than the 1st row.
    DataLimits limits;
    if (compositesToGroup == -2)
        limits = DataLimits.distinctLimits(count); // See CASSANDRA-8490 for the explanation of this value
    else if (compositesToGroup == -1)
        limits = DataLimits.thriftLimits(1, count);
    else
        limits = DataLimits.cqlLimits(count);

    return SinglePartitionReadCommand.legacySliceCommand(isDigest, version, metadata, nowInSeconds, columnFilter, limits, key, filter);
}
private DataLimits getDataLimits(int userLimit, int perPartitionLimit, int pageSize) { int cqlRowLimit = DataLimits.NO_LIMIT; int cqlPerPartitionLimit = DataLimits.NO_LIMIT; // If we do post ordering we need to get all the results sorted before we can trim them. if (aggregationSpec != AggregationSpecification.AGGREGATE_EVERYTHING) { if (!needsPostQueryOrdering()) cqlRowLimit = userLimit; cqlPerPartitionLimit = perPartitionLimit; } // Group by and aggregation queries will always be paged internally to avoid OOM. // If the user provided a pageSize we'll use that to page internally (because why not), otherwise we use our default if (pageSize <= 0) pageSize = DEFAULT_PAGE_SIZE; // Aggregation queries work fine on top of the group by paging but to maintain // backward compatibility we need to use the old way. if (aggregationSpec != null && aggregationSpec != AggregationSpecification.AGGREGATE_EVERYTHING) { if (parameters.isDistinct) return DataLimits.distinctLimits(cqlRowLimit); return DataLimits.groupByLimits(cqlRowLimit, cqlPerPartitionLimit, pageSize, aggregationSpec); } if (parameters.isDistinct) return cqlRowLimit == DataLimits.NO_LIMIT ? DataLimits.DISTINCT_NONE : DataLimits.distinctLimits(cqlRowLimit); return DataLimits.cqlLimits(cqlRowLimit, cqlPerPartitionLimit); }
private void serializeSliceCommand(SinglePartitionReadCommand command, DataOutputPlus out) throws IOException { CFMetaData metadata = command.metadata(); ClusteringIndexSliceFilter filter = (ClusteringIndexSliceFilter)command.clusteringIndexFilter(); Slices slices = filter.requestedSlices(); boolean makeStaticSlice = !command.columnFilter().fetchedColumns().statics.isEmpty() && !slices.selects(Clustering.STATIC_CLUSTERING); serializeSlices(out, slices, filter.isReversed(), makeStaticSlice, metadata); out.writeBoolean(filter.isReversed()); boolean selectsStatics = !command.columnFilter().fetchedColumns().statics.isEmpty() || slices.selects(Clustering.STATIC_CLUSTERING); DataLimits limits = command.limits(); if (limits.isDistinct()) out.writeInt(1); // the limit is always 1 for DISTINCT queries else out.writeInt(updateLimitForQuery(command.limits().count(), filter.requestedSlices())); int compositesToGroup; if (limits.kind() == DataLimits.Kind.THRIFT_LIMIT || metadata.isDense()) compositesToGroup = -1; else if (limits.isDistinct() && !selectsStatics) compositesToGroup = -2; // for DISTINCT queries (CASSANDRA-8490) else compositesToGroup = metadata.clusteringColumns().size(); out.writeInt(compositesToGroup); }
// NOTE(review): fragment — the enclosing method is not visible in this chunk.
// DISTINCT queries get distinct limits (counting partitions), others plain CQL row limits.
DataLimits limits;
if (isDistinct)
    limits = DataLimits.distinctLimits(maxResults);
else
    limits = DataLimits.cqlLimits(maxResults);
// Page size equals the total limit — presumably to exercise the paging path while still
// fitting everything in a single page. TODO confirm intent against the enclosing method.
limits = limits.forPaging(maxResults);
/**
 * Returns whether the underlying data was exhausted by the query this counter counted.
 * If fewer rows were counted than the limit asked for, there was nothing more to fetch.
 */
public boolean isExhausted(Counter counter)
{
    return count() > counter.counted();
}
// NOTE(review): fragment — the enclosing method (short-read protection, presumably) is
// not fully visible in this chunk.
assert !command.limits().isUnlimited();

// If the single-source counter isn't done and there is no per-partition limit, there is
// nothing to size a retry by — bail out.
if (!singleResultCounter.isDone() && command.limits().perPartitionCount() == DataLimits.NO_LIMIT)
    return null;

// Size the follow-up query: either the global row limit minus what was already counted
// in the merged result, or (absent a global limit) the per-partition limit.
int toQuery = command.limits().count() != DataLimits.NO_LIMIT
            ? command.limits().count() - counted(mergedResultCounter)
            : command.limits().perPartitionCount();
/**
 * Reads the partition for {@code buffer} from memtables and disk and builds the
 * row-cache entry for it, limited to {@code rowsToCache} rows.
 */
public Pair<RowCacheKey, IRowCacheEntry> call() throws Exception
{
    DecoratedKey key = cfs.decorateKey(buffer);
    int nowInSec = FBUtilities.nowInSeconds();
    SinglePartitionReadCommand cmd = SinglePartitionReadCommand.fullPartitionRead(cfs.metadata, nowInSec, key);
    // try-with-resources guarantees both the execution controller and the iterator are closed.
    try (ReadExecutionController controller = cmd.executionController();
         UnfilteredRowIterator iter = cmd.queryMemtableAndDisk(cfs, controller))
    {
        // Cap the cached rows via a CQL limit; NOTE(review): the 'true' argument presumably
        // flags a full-partition selection — confirm against DataLimits#filter.
        CachedPartition toCache = CachedBTreePartition.create(DataLimits.cqlLimits(rowsToCache).filter(iter, nowInSec, true), nowInSec);
        return Pair.create(new RowCacheKey(cfs.metadata.ksAndCFName, key), (IRowCacheEntry)toCache);
    }
}
});
/**
 * Executes a partition range (token range) read at the given consistency level, querying
 * ranges concurrently with a concurrency factor estimated from local data.
 */
@SuppressWarnings("resource")
public static PartitionIterator getRangeSlice(PartitionRangeReadCommand command, ConsistencyLevel consistencyLevel, long queryStartNanoTime)
{
    Tracing.trace("Computing ranges to query");

    Keyspace keyspace = Keyspace.open(command.metadata().ksName);
    RangeIterator ranges = new RangeIterator(command, keyspace, consistencyLevel);

    // our estimate of how many result rows there will be per-range
    float resultsPerRange = estimateResultsPerRange(command, keyspace);
    // underestimate how many rows we will get per-range in order to increase the likelihood that we'll
    // fetch enough rows in the first round
    resultsPerRange -= resultsPerRange * CONCURRENT_SUBREQUESTS_MARGIN;
    // Number of ranges to query in parallel: enough to satisfy the row limit given the
    // per-range estimate, clamped to [1, rangeCount]. A zero estimate falls back to 1.
    int concurrencyFactor = resultsPerRange == 0.0
                          ? 1
                          : Math.max(1, Math.min(ranges.rangeCount(), (int) Math.ceil(command.limits().count() / resultsPerRange)));
    logger.trace("Estimated result rows per range: {}; requested rows: {}, ranges.size(): {}; concurrent range requests: {}",
                 resultsPerRange, command.limits().count(), ranges.rangeCount(), concurrencyFactor);
    Tracing.trace("Submitting range requests on {} ranges with a concurrency of {} ({} rows per range expected)",
                  ranges.rangeCount(), concurrencyFactor, resultsPerRange);

    // Note that in general, a RangeCommandIterator will honor the command limit for each range,
    // but will not enforce it globally — hence the outer limits.filter(...).
    return command.limits().filter(command.postReconciliationProcessing(new RangeCommandIterator(ranges, command, concurrencyFactor, keyspace, consistencyLevel, queryStartNanoTime)),
                                   command.nowInSec(),
                                   command.selectsFullPartition(),
                                   command.metadata().enforceStrictLiveness());
}
/**
 * Builds the read command for the next page. The first page (and any Thrift paging)
 * pages purely by size; subsequent CQL pages also resume from the last returned row
 * within its partition.
 */
protected ReadCommand nextPageReadCommand(int pageSize)
{
    Clustering clustering;
    DataLimits limits;
    if (lastReturned == null)
    {
        clustering = null;
        limits = limits().forPaging(pageSize);
    }
    else
    {
        clustering = lastReturned.clustering(command.metadata());
        limits = command.isForThrift()
               ? limits().forPaging(pageSize)
               : limits().forPaging(pageSize, key(), remainingInPartition());
    }
    return command.forPaging(clustering, limits);
}
public DataLimits limits() { // What we return here doesn't matter much in practice. However, returning DataLimits.NONE means // "no particular limit", which makes SelectStatement.execute() take the slightly more complex "paging" // path. Not a big deal but it's easy enough to return a limit of 0 rows which avoids this. return DataLimits.cqlLimits(0); }
/**
 * Builds the retry command for a short read across partitions: re-query the original
 * key range past the last partition already seen, with limits sized for the retry.
 */
private PartitionRangeReadCommand makeFetchAdditionalPartitionReadCommand(int toQuery)
{
    PartitionRangeReadCommand rangeCommand = (PartitionRangeReadCommand) command;
    DataLimits retryLimits = rangeCommand.limits().forShortReadRetry(toQuery);

    // Resume strictly after lastPartitionKey, preserving the right-bound inclusiveness
    // of the original range.
    AbstractBounds<PartitionPosition> keyRange = rangeCommand.dataRange().keyRange();
    AbstractBounds<PartitionPosition> remainingBounds;
    if (keyRange.inclusiveRight())
        remainingBounds = new Range<>(lastPartitionKey, keyRange.right);
    else
        remainingBounds = new ExcludingBounds<>(lastPartitionKey, keyRange.right);

    return rangeCommand.withUpdatedLimitsAndDataRange(retryLimits, rangeCommand.dataRange().forSubRange(remainingBounds));
}
/** * Estimate the number of result rows (either cql3 rows or "thrift" rows, as called for by the command) per * range in the ring based on our local data. This assumes that ranges are uniformly distributed across the cluster * and that the queried data is also uniformly distributed. */ private static float estimateResultsPerRange(PartitionRangeReadCommand command, Keyspace keyspace) { ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(command.metadata().cfId); Index index = command.getIndex(cfs); float maxExpectedResults = index == null ? command.limits().estimateTotalResults(cfs) : index.getEstimatedResultRows(); // adjust maxExpectedResults by the number of tokens this node has and the replication factor for this ks return (maxExpectedResults / DatabaseDescriptor.getNumTokens()) / keyspace.getReplicationStrategy().getReplicationFactor(); }
/**
 * Updates the pager with the new limits if needed.
 *
 * @param pager the pager previously used
 * @param limits the DataLimits
 * @param lastPartitionKey the partition key of the last row returned
 * @param lastClustering the clustering of the last row returned
 * @return the pager to use to query the next page of data
 */
protected QueryPager updatePagerLimit(QueryPager pager, DataLimits limits, ByteBuffer lastPartitionKey, Clustering lastClustering)
{
    // Resume group-by internal paging from the last (partition key, clustering) seen.
    GroupingState resumeState = new GroupingState(lastPartitionKey, lastClustering);
    return pager.withUpdatedLimit(limits.forGroupByInternalPaging(resumeState));
}
/**
 * Returns whether the cached partition is guaranteed to contain everything the given
 * filter (under the given limits) could select, so the cache can answer the query.
 */
public boolean isFilterFullyCoveredBy(ClusteringIndexFilter filter, DataLimits limits, CachedPartition cached, int nowInSec)
{
    // We can use the cached value only if we know that no data it doesn't contain could be covered
    // by the query filter, that is if:
    //   1) either the whole partition is cached
    //   2) or we can ensure that any data the filter selects is in the cached partition

    // We can guarantee that a partition is fully cached if the number of rows it contains is less than
    // what we're caching. When doing that, we should be careful about expiring cells: we should count
    // something expired that wasn't when the partition was cached, or we could decide that the whole
    // partition is cached when it's not. This is why we use CachedPartition#cachedLiveRows.
    if (cached.cachedLiveRows() < metadata.params.caching.rowsPerPartitionToCache())
        return true;

    // If the whole partition isn't cached, then we must guarantee that the filter cannot select data that
    // is not in the cache. We can guarantee that if either the filter is a "head filter" and the cached
    // partition has more live rows than queried (where live rows refers to the rows that are live now),
    // or if we can prove that everything the filter selects is in the cached partition based on its content.
    return (filter.isHeadFilter() && limits.hasEnoughLiveData(cached, nowInSec, filter.selectsAllPartition(), metadata.enforceStrictLiveness()))
           || filter.isFullyCoveredBy(cached);
}
/**
 * Serializes a slice read in the legacy (pre-3.0) wire format: slices, reversed flag,
 * row count and the compositesToGroup discriminator.
 */
private void serializeSliceCommand(SinglePartitionReadCommand command, DataOutputPlus out) throws IOException
{
    CFMetaData metadata = command.metadata();
    ClusteringIndexSliceFilter filter = (ClusteringIndexSliceFilter)command.clusteringIndexFilter();
    Slices slices = filter.requestedSlices();

    // Legacy nodes need an explicit static slice when statics are fetched but the
    // requested slices don't already cover the static clustering.
    boolean makeStaticSlice = !command.columnFilter().fetchedColumns().statics.isEmpty() && !slices.selects(Clustering.STATIC_CLUSTERING);
    serializeSlices(out, slices, filter.isReversed(), makeStaticSlice, metadata);

    out.writeBoolean(filter.isReversed());

    boolean selectsStatics = !command.columnFilter().fetchedColumns().statics.isEmpty() || slices.selects(Clustering.STATIC_CLUSTERING);
    DataLimits limits = command.limits();
    if (limits.isDistinct())
        out.writeInt(1); // the limit is always 1 for DISTINCT queries
    else
        out.writeInt(updateLimitForQuery(command.limits().count(), filter.requestedSlices()));

    // Legacy discriminator: -1 for thrift/dense, -2 for DISTINCT without statics,
    // otherwise the number of clustering columns.
    int compositesToGroup;
    if (limits.kind() == DataLimits.Kind.THRIFT_LIMIT || metadata.isDense())
        compositesToGroup = -1;
    else if (limits.isDistinct() && !selectsStatics)
        compositesToGroup = -2; // for DISTINCT queries (CASSANDRA-8490)
    else
        compositesToGroup = metadata.clusteringColumns().size();
    out.writeInt(compositesToGroup);
}
/**
 * Builds the {@code DataLimits} for this query from the user-provided limits and the page
 * size, accounting for DISTINCT, GROUP BY/aggregation and post-query ordering.
 */
private DataLimits getDataLimits(int userLimit, int perPartitionLimit, int pageSize)
{
    int cqlRowLimit = DataLimits.NO_LIMIT;
    int cqlPerPartitionLimit = DataLimits.NO_LIMIT;

    // If we do post ordering we need to get all the results sorted before we can trim them,
    // so only push down the row limit when no post-query ordering is needed.
    if (aggregationSpec != AggregationSpecification.AGGREGATE_EVERYTHING)
    {
        if (!needsPostQueryOrdering())
            cqlRowLimit = userLimit;
        cqlPerPartitionLimit = perPartitionLimit;
    }

    // Group by and aggregation queries will always be paged internally to avoid OOM.
    // If the user provided a pageSize we'll use that to page internally (because why not),
    // otherwise we use our default.
    if (pageSize <= 0)
        pageSize = DEFAULT_PAGE_SIZE;

    // Aggregation queries work fine on top of the group by paging but to maintain
    // backward compatibility we need to use the old way.
    if (aggregationSpec != null && aggregationSpec != AggregationSpecification.AGGREGATE_EVERYTHING)
    {
        if (parameters.isDistinct)
            return DataLimits.distinctLimits(cqlRowLimit);

        return DataLimits.groupByLimits(cqlRowLimit, cqlPerPartitionLimit, pageSize, aggregationSpec);
    }

    if (parameters.isDistinct)
        return cqlRowLimit == DataLimits.NO_LIMIT ? DataLimits.DISTINCT_NONE : DataLimits.distinctLimits(cqlRowLimit);

    return DataLimits.cqlLimits(cqlRowLimit, cqlPerPartitionLimit);
}
// NOTE(review): fragment — enclosing method not visible in this chunk.
// Pick DISTINCT limits (partition-counting) or plain CQL row limits based on the query.
DataLimits limits;
if (isDistinct)
    limits = DataLimits.distinctLimits(maxResults);
else
    limits = DataLimits.cqlLimits(maxResults);
// forPaging with the full limit as page size — looks intended to route through the
// paging machinery while keeping a single page; TODO confirm against the caller.
limits = limits.forPaging(maxResults);