|
1 | 1 | # -*- coding: utf-8 -*-
|
2 | 2 | #
|
3 | 3 | # Copyright (C) 2021 CERN.
|
| 4 | +# Copyright (C) 2023 Northwestern University. |
4 | 5 | #
|
5 | 6 | # Invenio-Records-Resources is free software; you can redistribute it and/or
|
6 | 7 | # modify it under the terms of the MIT License; see LICENSE file for more
|
7 | 8 | # details.
|
8 | 9 |
|
9 | 10 | """Facets types defined."""
|
10 | 11 |
|
| 12 | +from functools import reduce |
| 13 | + |
11 | 14 | from invenio_search.engine import dsl
|
12 | 15 |
|
13 | 16 |
|
@@ -103,15 +106,7 @@ class NestedTermsFacet(TermsFacet):
|
103 | 106 | splitchar='::',
|
104 | 107 | label=_('Resource types'),
|
105 | 108 | value_labels=VocabularyL10NLabels(current_service)
|
106 |
| - ), |
107 |
| -
|
108 |
| - 'resource_type': NestedTermsFacet( |
109 |
| - field='metadata.resource_type.type', |
110 |
| - subfield='metadata.resource_type.subtype', |
111 |
| - splitchar='::', |
112 |
| - label=_('Resource types'), |
113 |
| - value_labels=VocabularyL10NLabels(current_service) |
114 |
| - ), |
| 109 | + ) |
115 | 110 | }
|
116 | 111 | """
|
117 | 112 |
|
@@ -149,7 +144,7 @@ def _parse_values(self, filter_values):
|
149 | 144 | .. code-block:: python
|
150 | 145 |
|
151 | 146 | {
|
152 |
| - 'publication': ['publication::book', 'publication::journal'], |
| 147 | + 'publication': ['book', 'journal'], |
153 | 148 | 'dataset': []
|
154 | 149 | }
|
155 | 150 |
|
@@ -178,12 +173,10 @@ def get_value_filter(self, parsed_value):
|
178 | 173 | # Expects to get a value from the output of "_parse_values()"."
|
179 | 174 | field_value, subfield_values = parsed_value
|
180 | 175 |
|
| 176 | + q = dsl.Q("term", **{self._field: field_value}) |
181 | 177 | if subfield_values:
|
182 |
| - return dsl.Q("term", **{self._field: field_value}) & dsl.Q( |
183 |
| - "terms", **{self._subfield: subfield_values} |
184 |
| - ) |
185 |
| - else: |
186 |
| - return dsl.Q("term", **{self._field: field_value}) |
| 178 | + q &= dsl.Q("terms", **{self._subfield: subfield_values}) |
| 179 | + return q |
187 | 180 |
|
188 | 181 | def add_filter(self, filter_values):
|
189 | 182 | """Construct a filter query for the facet."""
|
@@ -246,6 +239,211 @@ def get_labelled_values(
|
246 | 239 | return ret_val
|
247 | 240 |
|
248 | 241 |
|
class CombinedTermsFacet(NestedTermsFacet):
    """Two-level terms facet that does not need a 'nested' mapped field.

    A regular NestedTermsFacet suffers from a "crossed wires" problem when a
    document holds several 2-level objects. The motivating use case is a
    "subjects" field mapped as a plain object:

    .. code-block:: json

        "subjects": {
            "type": "object",
            "properties": {
                "scheme": {
                    "type": "keyword"
                },
                "subject": {
                    "type": "keyword"
                }
            }
        }

    A document with these subjects:

    .. code-block:: json

        "subjects": [
            {"scheme": "SC1", "subject": "SU1"},
            {"scheme": "SC2", "subject": "SU2"}
        ]

    is indexed internally as:

    .. code-block:: json

        "subjects.scheme": ["SC1", "SC2"]
        "subjects.subject": ["SU1", "SU2"]

    The original pairing of scheme and subject is lost, so searches and
    aggregations for scheme = SC1 and subject = SU2 surface this document even
    though they should not. The classic remedy is a "nested" type with nested
    searches; this facet solves the problem for aggregations without it.

    The facet requires the following indexed format:

    .. code-block:: json

        "<field>": ["<parent>", ...]
        // may have independent "<child>" entries
        "<combined field>": ["<parent><split char><child>", ..., "<child>"]

    "nested" fields are avoided so that regular queries keep working on those
    fields (only nested queries can be run against a "nested" field). This is
    a UX concern: end-users can query metadata fields directly and could no
    longer do so without many more changes.

    The price is extra work that gets thrown away; see `get_aggregation` and
    `get_labelled_values`. The aggregation result is reshaped so that it looks
    like the output of a nested aggregation.
    """

    def __init__(self, field, combined_field, parents, splitchar="::", **kwargs):
        """Constructor.

        :param field: top-level/parent field
        :type field: str
        :param combined_field: field containing combined terms
        :type combined_field: str
        :param parents: parent/top-level values, or a callable returning them
        :type parents: Iterable[str] or Callable[[], Iterable[str]]
        :param splitchar: splitting/combining token, defaults to "::"
        :type splitchar: str, optional
        """
        self._splitchar = splitchar
        self._parents = parents
        # Filled lazily by get_parents().
        self._cached_parents = None
        self._combined_field = combined_field
        self._field = field
        # TermsFacet is initialised directly, bypassing NestedTermsFacet's own
        # constructor (presumably because that one expects a subfield - confirm).
        TermsFacet.__init__(self, **kwargs)

    def get_parents(self):
        """Return the parent values, resolving them on first use.

        Resolution is delayed because obtaining the parents may require an
        application context. A falsy cached value (``None`` or an empty
        collection) does not count as cached, so the source is consulted again
        on the next call.
        """
        cached = self._cached_parents
        if cached:
            return cached

        source = self._parents
        self._cached_parents = source() if callable(source) else source
        return self._cached_parents

    def get_aggregation(self):
        """Build the aggregation.

        EVERY parent's sub-aggregation is repeated inside each bucket produced
        by the top-level terms aggregation. This works around the "irrelevant
        flooding" problem: aggregating on a subfield picks its most frequent
        terms (top 10 by default), but without "nested" the parent-child
        relationship is lost, so those terms may be unrelated to the parent.
        The "include" pattern restricts each sub-aggregation to the combined
        terms that start with its parent.

        get_labelled_values keeps only the sub-aggregation that matches each
        top-level bucket.
        """
        sub_aggregations = {}
        for parent in self.get_parents():
            sub_aggregations[f"inner_{parent}"] = {
                "terms": {
                    "field": self._combined_field,
                    "include": f"{parent}{self._splitchar}.*",
                },
            }

        return dsl.A(
            {"terms": {"field": self._field, "aggs": sub_aggregations}}
        )

    def get_labelled_values(self, data, filter_values):
        """Get a labelled version of a bucket.

        :param data: Bucket data returned by document engine for a field
        :type data: dsl.response.aggs.FieldBucketData
        :param filter_values: values handed to ``is_filtered`` to flag the
            selected buckets
        :return: dict with the labelled "buckets" and the facet "label"
        :rtype: dict
        """

        def label_children(parent_bucket, parent_key):
            """Label the child buckets that belong to ``parent_key``."""
            # Only inner_<parent_key> is read; the other sub-aggregations are
            # ignored. It is missing only when the parents used to build the
            # sub-aggregations differ from the parents actually present, in
            # which case an empty bucket list is used instead of failing.
            no_children = dsl.AttrDict({"buckets": []})
            sub_agg = getattr(parent_bucket, f"inner_{parent_key}", no_children)

            labelled = []
            for child in sub_agg.buckets:
                combined_key = self.get_value(child)
                # Drop the leading "<parent><splitchar>" from the combined term.
                child_key = combined_key[len(parent_key + self._splitchar):]
                labelled.append(
                    {
                        "key": child_key,
                        "doc_count": self.get_metric(child),
                        "label": child_key,
                        "is_selected": self.is_filtered(
                            combined_key, filter_values
                        ),
                    }
                )
            return labelled

        labels_by_key = self.get_label_mapping(data.buckets)

        parent_buckets = []
        for parent_bucket in data.buckets:
            parent_key = self.get_value(parent_bucket)
            parent_buckets.append(
                {
                    "key": parent_key,
                    "doc_count": self.get_metric(parent_bucket),
                    "label": labels_by_key[parent_key],
                    "is_selected": self.is_filtered(parent_key, filter_values),
                    "inner": {
                        "buckets": label_children(parent_bucket, parent_key)
                    },
                }
            )

        return {"buckets": parent_buckets, "label": str(self._label)}

    def get_value_filter(self, parsed_value):
        """Return a filter for a single parsed value.

        :param parsed_value: ``(parent value, child values)`` pair taken from
            the output of `_parse_values()`
        :return: query on the parent field, additionally restricted to the
            recombined child terms when any child values were given
        """
        parent_value, child_values = parsed_value

        # Recombine each child with its parent to match the combined field.
        combined_terms = [
            f"{parent_value}{self._splitchar}{child}" for child in child_values
        ]

        parent_query = dsl.Q("term", **{self._field: parent_value})
        if not combined_terms:
            return parent_query
        return parent_query & dsl.Q(
            "terms", **{self._combined_field: combined_terms}
        )
| 445 | + |
| 446 | + |
249 | 447 | class CFFacetMixin:
|
250 | 448 | """Mixin to abstract the custom fields path."""
|
251 | 449 |
|
|
0 commit comments