48
48
import org .apache .cassandra .db .Keyspace ;
49
49
import org .apache .cassandra .db .ReadCommand ;
50
50
import org .apache .cassandra .db .filter .RowFilter ;
51
+ import org .apache .cassandra .db .partitions .BasePartitionIterator ;
51
52
import org .apache .cassandra .db .partitions .ParallelCommandProcessor ;
53
+ import org .apache .cassandra .db .partitions .PartitionIterator ;
52
54
import org .apache .cassandra .db .partitions .UnfilteredPartitionIterator ;
53
55
import org .apache .cassandra .db .rows .BaseRowIterator ;
54
56
import org .apache .cassandra .db .rows .Row ;
58
60
import org .apache .cassandra .index .sai .IndexContext ;
59
61
import org .apache .cassandra .index .sai .StorageAttachedIndex ;
60
62
import org .apache .cassandra .index .sai .utils .AbortedOperationException ;
63
+ import org .apache .cassandra .index .sai .utils .InMemoryPartitionIterator ;
61
64
import org .apache .cassandra .index .sai .utils .InMemoryUnfilteredPartitionIterator ;
62
65
import org .apache .cassandra .index .sai .utils .PartitionInfo ;
63
66
import org .apache .cassandra .index .sai .utils .TypeUtil ;
71
74
/**
72
75
* Processor applied to SAI based ORDER BY queries. This class could likely be refactored into either two filter
73
76
* methods depending on where the processing is happening or into two classes.
74
- * Ordering on the coordinator is delegated to CQL.
77
+ *
78
+ * This processor performs the following steps on a replica:
79
+ * - collect LIMIT rows from partition iterator, making sure that all are valid.
80
+ * - return rows in Primary Key order
81
+ *
82
+ * This processor performs the following steps on a coordinator:
83
+ * - consume all rows from the provided partition iterator and sort them according to the specified order.
84
+ * For vectors, that is similarit score and for all others, that is the ordering defined by their
85
+ * {@link org.apache.cassandra.db.marshal.AbstractType}. If there are multiple vector indexes,
86
+ * the final score is the sum of all vector index scores.
87
+ * - remove rows with the lowest scores from PQ if PQ size exceeds limit
88
+ * - return rows from PQ in primary key order to caller
75
89
*/
76
90
public class TopKProcessor
77
91
{
@@ -109,7 +123,7 @@ public TopKProcessor(ReadCommand command)
109
123
/**
110
124
* Executor to use for parallel index reads.
111
125
* Defined by -Dcassandra.index_read.parallele=true/false, true by default.
112
- * </p>
126
+ *
113
127
* INDEX_READ uses 2 * cpus threads by default but can be overridden with {@literal -Dcassandra.index_read.parallel_thread_num=<value>}
114
128
*
115
129
* @return stage to use, default INDEX_READ
@@ -133,7 +147,7 @@ private static LocalAwareExecutorPlus getExecutor()
133
147
* Filter given partitions and keep the rows with highest scores. In case of {@link UnfilteredPartitionIterator},
134
148
* all tombstones will be kept. Caller must close the supplied iterator.
135
149
*/
136
- public UnfilteredPartitionIterator filter (UnfilteredPartitionIterator partitions )
150
+ public < U extends Unfiltered , R extends BaseRowIterator < U >, P extends BasePartitionIterator < R >> BasePartitionIterator <?> filter (P partitions )
137
151
{
138
152
// filterInternal consumes the partitions iterator and creates a new one. Use a try-with-resources block
139
153
// to ensure the original iterator is closed. We do not expect exceptions from filterInternal, but if they
@@ -145,14 +159,12 @@ public UnfilteredPartitionIterator filter(UnfilteredPartitionIterator partitions
145
159
}
146
160
}
147
161
148
- private UnfilteredPartitionIterator filterInternal (UnfilteredPartitionIterator partitions )
162
+ private < U extends Unfiltered , R extends BaseRowIterator < U >, P extends BasePartitionIterator < R >> BasePartitionIterator <?> filterInternal (P partitions )
149
163
{
150
164
// priority queue ordered by score in descending order
151
165
Comparator <Triple <PartitionInfo , Row , ?>> comparator ;
152
166
if (queryVector != null )
153
- {
154
167
comparator = Comparator .comparing ((Triple <PartitionInfo , Row , ?> t ) -> (Float ) t .getRight ()).reversed ();
155
- }
156
168
else
157
169
{
158
170
comparator = Comparator .comparing (t -> (ByteBuffer ) t .getRight (), indexContext .getValidator ());
@@ -163,15 +175,13 @@ private UnfilteredPartitionIterator filterInternal(UnfilteredPartitionIterator p
163
175
// to store top-k results in primary key order
164
176
TreeMap <PartitionInfo , TreeSet <Unfiltered >> unfilteredByPartition = new TreeMap <>(Comparator .comparing (p -> p .key ));
165
177
166
- if (PARALLEL_EXECUTOR != ImmediateExecutor .INSTANCE && partitions instanceof ParallelCommandProcessor )
167
- {
178
+ if (PARALLEL_EXECUTOR != ImmediateExecutor .INSTANCE && partitions instanceof ParallelCommandProcessor ) {
168
179
ParallelCommandProcessor pIter = (ParallelCommandProcessor ) partitions ;
169
180
var commands = pIter .getUninitializedCommands ();
170
181
List <CompletableFuture <PartitionResults >> results = new ArrayList <>(commands .size ());
171
182
172
183
int count = commands .size ();
173
- for (var command : commands )
174
- {
184
+ for (var command : commands ) {
175
185
CompletableFuture <PartitionResults > future = new CompletableFuture <>();
176
186
results .add (future );
177
187
@@ -191,8 +201,7 @@ private UnfilteredPartitionIterator filterInternal(UnfilteredPartitionIterator p
191
201
});
192
202
}
193
203
194
- for (CompletableFuture <PartitionResults > triplesFuture : results )
195
- {
204
+ for (CompletableFuture <PartitionResults > triplesFuture : results ) {
196
205
PartitionResults pr ;
197
206
try
198
207
{
@@ -207,12 +216,10 @@ private UnfilteredPartitionIterator filterInternal(UnfilteredPartitionIterator p
207
216
if (pr == null )
208
217
continue ;
209
218
topK .addAll (pr .rows );
210
- for (var uf : pr .tombstones )
219
+ for (var uf : pr .tombstones )
211
220
addUnfiltered (unfilteredByPartition , pr .partitionInfo , uf );
212
221
}
213
- }
214
- else if (partitions instanceof StorageAttachedIndexSearcher .ScoreOrderedResultRetriever )
215
- {
222
+ } else if (partitions instanceof StorageAttachedIndexSearcher .ScoreOrderedResultRetriever ) {
216
223
// FilteredPartitions does not implement ParallelizablePartitionIterator.
217
224
// Realistically, this won't benefit from parallelizm as these are coming from in-memory/memtable data.
218
225
int rowsMatched = 0 ;
@@ -225,9 +232,7 @@ else if (partitions instanceof StorageAttachedIndexSearcher.ScoreOrderedResultRe
225
232
rowsMatched += processSingleRowPartition (unfilteredByPartition , partitionRowIterator );
226
233
}
227
234
}
228
- }
229
- else
230
- {
235
+ } else {
231
236
// FilteredPartitions does not implement ParallelizablePartitionIterator.
232
237
// Realistically, this won't benefit from parallelizm as these are coming from in-memory/memtable data.
233
238
while (partitions .hasNext ())
@@ -239,7 +244,7 @@ else if (partitions instanceof StorageAttachedIndexSearcher.ScoreOrderedResultRe
239
244
{
240
245
PartitionResults pr = processPartition (partitionRowIterator );
241
246
topK .addAll (pr .rows );
242
- for (var uf : pr .tombstones )
247
+ for (var uf : pr .tombstones )
243
248
addUnfiltered (unfilteredByPartition , pr .partitionInfo , uf );
244
249
}
245
250
else
@@ -250,6 +255,7 @@ else if (partitions instanceof StorageAttachedIndexSearcher.ScoreOrderedResultRe
250
255
topK .add (Triple .of (PartitionInfo .create (partitionRowIterator ), row , row .getCell (expression .column ()).buffer ()));
251
256
}
252
257
}
258
+
253
259
}
254
260
}
255
261
}
@@ -258,17 +264,17 @@ else if (partitions instanceof StorageAttachedIndexSearcher.ScoreOrderedResultRe
258
264
for (var triple : topK .getUnsortedShared ())
259
265
addUnfiltered (unfilteredByPartition , triple .getLeft (), triple .getMiddle ());
260
266
267
+ if (partitions instanceof PartitionIterator )
268
+ return new InMemoryPartitionIterator (command , unfilteredByPartition );
261
269
return new InMemoryUnfilteredPartitionIterator (command , unfilteredByPartition );
262
270
}
263
271
264
- private class PartitionResults
265
- {
272
+ private class PartitionResults {
266
273
final PartitionInfo partitionInfo ;
267
274
final SortedSet <Unfiltered > tombstones = new TreeSet <>(command .metadata ().comparator );
268
275
final List <Triple <PartitionInfo , Row , Float >> rows = new ArrayList <>();
269
276
270
- PartitionResults (PartitionInfo partitionInfo )
271
- {
277
+ PartitionResults (PartitionInfo partitionInfo ) {
272
278
this .partitionInfo = partitionInfo ;
273
279
}
274
280
@@ -277,17 +283,15 @@ void addTombstone(Unfiltered uf)
277
283
tombstones .add (uf );
278
284
}
279
285
280
- void addRow (Triple <PartitionInfo , Row , Float > triple )
281
- {
286
+ void addRow (Triple <PartitionInfo , Row , Float > triple ) {
282
287
rows .add (triple );
283
288
}
284
289
}
285
290
286
291
/**
287
292
* Processes a single partition, calculating scores for rows and extracting tombstones.
288
293
*/
289
- private PartitionResults processPartition (BaseRowIterator <?> partitionRowIterator )
290
- {
294
+ private PartitionResults processPartition (BaseRowIterator <?> partitionRowIterator ) {
291
295
// Compute key and static row score once per partition
292
296
DecoratedKey key = partitionRowIterator .partitionKey ();
293
297
Row staticRow = partitionRowIterator .staticRow ();
@@ -318,8 +322,7 @@ private PartitionResults processPartition(BaseRowIterator<?> partitionRowIterato
318
322
* Processes a single partition, without scoring it.
319
323
*/
320
324
private int processSingleRowPartition (TreeMap <PartitionInfo , TreeSet <Unfiltered >> unfilteredByPartition ,
321
- BaseRowIterator <?> partitionRowIterator )
322
- {
325
+ BaseRowIterator <?> partitionRowIterator ) {
323
326
if (!partitionRowIterator .hasNext ())
324
327
return 0 ;
325
328
0 commit comments