Skip to content

Commit ea4b1b1

Browse files
committed
facets: provide CombinedTermsFacet to fix #798 [+]
This approach doesn't use 'nested' fields, but instead relies on a field containing <parent><split char><child> to aggregate correctly.
1 parent 8d028fc commit ea4b1b1

File tree

11 files changed

+533
-107
lines changed

11 files changed

+533
-107
lines changed

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -129,3 +129,6 @@ dmypy.json
129129
# Pyre type checker
130130
.pyre/
131131
.DS_Store
132+
133+
# VSCode editor
134+
.vscode/
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
# -*- coding: utf-8 -*-
22
#
33
# Copyright (C) 2021 CERN.
4+
# Copyright (C) 2023 Northwestern University.
45
#
56
# Invenio-Records-Resources is free software; you can redistribute it and/or
67
# modify it under the terms of the MIT License; see LICENSE file for more
78
# details.
89

910
"""Facets."""
1011

11-
from .facets import CFTermsFacet, NestedTermsFacet, TermsFacet
12+
from .facets import CFTermsFacet, CombinedTermsFacet, NestedTermsFacet, TermsFacet
1213
from .labels import RecordRelationLabels
1314
from .response import FacetsResponse
1415

@@ -17,5 +18,6 @@
1718
"FacetsResponse",
1819
"NestedTermsFacet",
1920
"RecordRelationLabels",
21+
"CombinedTermsFacet",
2022
"TermsFacet",
2123
)

invenio_records_resources/services/records/facets/facets.py

+213-15
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,16 @@
11
# -*- coding: utf-8 -*-
22
#
33
# Copyright (C) 2021 CERN.
4+
# Copyright (C) 2023 Northwestern University.
45
#
56
# Invenio-Records-Resources is free software; you can redistribute it and/or
67
# modify it under the terms of the MIT License; see LICENSE file for more
78
# details.
89

910
"""Facets types defined."""
1011

12+
from functools import reduce
13+
1114
from invenio_search.engine import dsl
1215

1316

@@ -103,15 +106,7 @@ class NestedTermsFacet(TermsFacet):
103106
splitchar='::',
104107
label=_('Resource types'),
105108
value_labels=VocabularyL10NLabels(current_service)
106-
),
107-
108-
'resource_type': NestedTermsFacet(
109-
field='metadata.resource_type.type',
110-
subfield='metadata.resource_type.subtype',
111-
splitchar='::',
112-
label=_('Resource types'),
113-
value_labels=VocabularyL10NLabels(current_service)
114-
),
109+
)
115110
}
116111
"""
117112

@@ -149,7 +144,7 @@ def _parse_values(self, filter_values):
149144
.. code-block:: python
150145
151146
{
152-
'publication': ['publication::book', 'publication::journal'],
147+
'publication': ['book', 'journal'],
153148
'dataset': []
154149
}
155150
@@ -178,12 +173,10 @@ def get_value_filter(self, parsed_value):
178173
# Expects to get a value from the output of "_parse_values()"."
179174
field_value, subfield_values = parsed_value
180175

176+
q = dsl.Q("term", **{self._field: field_value})
181177
if subfield_values:
182-
return dsl.Q("term", **{self._field: field_value}) & dsl.Q(
183-
"terms", **{self._subfield: subfield_values}
184-
)
185-
else:
186-
return dsl.Q("term", **{self._field: field_value})
178+
q &= dsl.Q("terms", **{self._subfield: subfield_values})
179+
return q
187180

188181
def add_filter(self, filter_values):
189182
"""Construct a filter query for the facet."""
@@ -246,6 +239,211 @@ def get_labelled_values(
246239
return ret_val
247240

248241

242+
class CombinedTermsFacet(NestedTermsFacet):
243+
"""
244+
Facet to mimic a nested aggregation without having to define a 'nested' field.
245+
246+
This facet is needed to prevent the "crossed wires" problem of a regular
247+
NestedTermsFacet applied to documents with multiple 2-level objects. For example,
248+
and the motivating use case for this facet, a "subjects" field with the
249+
following mapping:
250+
251+
.. code-block:: json
252+
253+
"subjects": {
254+
"type": "object",
255+
"properties": {
256+
"scheme": {
257+
"type": "keyword"
258+
},
259+
"subject": {
260+
"type": "keyword"
261+
}
262+
}
263+
}
264+
265+
will lead the document with the following subjects field:
266+
267+
.. code-block:: json
268+
269+
"subjects": [
270+
{"scheme": "SC1", "subject": "SU1"},
271+
{"scheme": "SC2", "subject": "SU2"}
272+
]
273+
274+
to be internally-indexed in the following manner:
275+
276+
.. code-block:: json
277+
278+
"subjects.scheme": ["SC1", "SC2"]
279+
"subjects.subject": ["SU1", "SU2"]
280+
281+
. This indexing loses the original pairwise relationships. This causes searches
282+
and aggregations for scheme = SC1 and subject = SU2 to surface the above document
283+
when they shouldn't. This is the "crossed wires" problem that this Facet class
284+
resolves for aggregations without using "nested" types and searches (the classic
285+
solution to this problem).
286+
287+
This facet requires the following indexed format:
288+
289+
.. code-block:: json
290+
291+
"<field>": ["<parent>", ...]
292+
// may have independent "<child>" entries
293+
"<combined field>": ["<parent><split char><child>", ..., "<child>"]
294+
295+
The reasoning given for avoiding "nested" fields is to allow regular queries on
296+
those fields that would have had to be made "nested" (only nested queries can be
297+
done on those fields). This is a UX concern since end-users can make queries to
298+
metadata field directly and they wouldn't be able to anymore (without a lot more
299+
changes).
300+
301+
Although this facet allows us to forego the need for a "nested" type field and
302+
nested queries to filter on that field, it *does* do extra work that is thrown away.
303+
See `get_aggregation` and `get_labelled_values`.
304+
305+
This facet formats the result of the aggregation such that it looks like it was
306+
a nested aggregation.
307+
"""
308+
309+
def __init__(self, field, combined_field, parents, splitchar="::", **kwargs):
310+
"""Constructor.
311+
312+
:param field: top-level/parent field
313+
:type field: str
314+
:param combined_field: field containing combined terms
315+
:type combined_field: str
316+
:param groups: iterable of parent/top-level values
317+
:type groups: Iterable[str]
318+
:param splitchar: splitting/combining token, defaults to "::"
319+
:type splitchar: str, optional
320+
"""
321+
self._field = field
322+
self._combined_field = combined_field
323+
self._parents = parents
324+
self._cached_parents = None
325+
self._splitchar = splitchar
326+
TermsFacet.__init__(self, **kwargs)
327+
328+
def get_parents(self):
329+
"""Return parents.
330+
331+
We have to delay getting the parents since it may require an application
332+
context.
333+
"""
334+
if not self._cached_parents:
335+
if callable(self._parents):
336+
self._cached_parents = self._parents()
337+
else:
338+
self._cached_parents = self._parents
339+
return self._cached_parents
340+
341+
def get_aggregation(self):
342+
"""Aggregate.
343+
344+
This aggregation repeats ALL group subaggregation for each bucket generated
345+
by the top-level terms aggregation. This is to overcome the
346+
"irrelevant flooding" problem: when aggregating on a subfield, the top 10
347+
(by default) most frequent terms of that subfield are selected, but those
348+
terms may not be relevant to the parent because the parent-child relationship
349+
is lost when not using "nested". So to make sure only relevant terms are
350+
used to select the documents in the aggregation, we "include" (filter) for them.
351+
352+
Only the subaggregation corresponding to the top-level group will be kept in
353+
get_labelled_values.
354+
"""
355+
return dsl.A(
356+
{
357+
"terms": {
358+
"field": self._field,
359+
"aggs": {
360+
f"inner_{parent}": {
361+
"terms": {
362+
"field": self._combined_field,
363+
"include": f"{parent}{self._splitchar}.*",
364+
},
365+
}
366+
for parent in self.get_parents()
367+
},
368+
}
369+
}
370+
)
371+
372+
def get_labelled_values(self, data, filter_values):
373+
"""Get a labelled version of a bucket.
374+
375+
:param data: Bucket data returned by document engine for a field
376+
:type data: dsl.response.aggs.FieldBucketData
377+
"""
378+
379+
def get_child_buckets(bucket, key):
380+
"""Get lower-level/child buckets."""
381+
result = []
382+
383+
# Ignore other subaggregations, and only retrieve inner_{key} one.
384+
# inner_{key} should always be present unless disconnect between
385+
# parents passed to generate subaggregations and parents actually present.
386+
# To not break in that case, we put a default empty list value.
387+
inner_data = getattr(bucket, f"inner_{key}", dsl.AttrDict({"buckets": []}))
388+
389+
for inner_bucket in inner_data.buckets:
390+
# get raw key and appropriately formatted key
391+
key_raw_inner = self.get_value(inner_bucket)
392+
prefix = key + self._splitchar
393+
key_inner = key_raw_inner[len(prefix):] # fmt: skip
394+
395+
result.append(
396+
{
397+
"key": key_inner,
398+
"doc_count": self.get_metric(inner_bucket),
399+
"label": key_inner,
400+
"is_selected": self.is_filtered(key_raw_inner, filter_values),
401+
}
402+
)
403+
404+
return result
405+
406+
def get_parent_buckets(data):
407+
"""Get top-level/group buckets.
408+
409+
:param data: Bucket data returned by document engine for a field
410+
:type data: dsl.response.aggs.FieldBucketData
411+
:return: list of labelled buckets
412+
:rtype: List[dict]
413+
"""
414+
label_map = self.get_label_mapping(data.buckets)
415+
result = []
416+
for bucket in data.buckets:
417+
key = self.get_value(bucket)
418+
result.append(
419+
{
420+
"key": key,
421+
"doc_count": self.get_metric(bucket),
422+
"label": label_map[key],
423+
"is_selected": self.is_filtered(key, filter_values),
424+
"inner": {"buckets": get_child_buckets(bucket, key)},
425+
}
426+
)
427+
return result
428+
429+
return {"buckets": get_parent_buckets(data), "label": str(self._label)}
430+
431+
def get_value_filter(self, parsed_value):
432+
"""Return a filter for a single parsed value."""
433+
# Expect to get a value from the output of `_parse_values()`
434+
field_value, subfield_values = parsed_value
435+
436+
# recombine
437+
subfield_values = [
438+
f"{field_value}{self._splitchar}{subvalue}" for subvalue in subfield_values
439+
]
440+
441+
q = dsl.Q("term", **{self._field: field_value})
442+
if subfield_values:
443+
q &= dsl.Q("terms", **{self._combined_field: subfield_values})
444+
return q
445+
446+
249447
class CFFacetMixin:
250448
"""Mixin to abstract the custom fields path."""
251449

invenio_records_resources/services/records/facets/response.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,12 @@ class FacetsResponseForRequest(cls):
4747
def _iter_facets(self):
4848
# _facets_param instance is added to _search by the FacetsParam.apply
4949
for name, facet in self._facets_param.facets.items():
50-
yield name, facet, getattr(
51-
self.aggregations, name
52-
), self._facets_param.selected_values.get(name, [])
50+
yield (
51+
name,
52+
facet,
53+
getattr(self.aggregations, name),
54+
self._facets_param.selected_values.get(name, []),
55+
)
5356

5457
@property
5558
def facets(self):

tests/mock_module/config.py

+7-5
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# -*- coding: utf-8 -*-
22
#
33
# Copyright (C) 2020-2021 CERN.
4-
# Copyright (C) 2020-2021 Northwestern University.
4+
# Copyright (C) 2020-2023 Northwestern University.
55
#
66
# Invenio-Records-Resources is free software; you can redistribute it and/or
77
# modify it under the terms of the MIT License; see LICENSE file for more
@@ -18,8 +18,8 @@
1818
from invenio_records_resources.services.records.components import FilesComponent
1919
from invenio_records_resources.services.records.config import SearchOptions
2020
from invenio_records_resources.services.records.facets import (
21+
CombinedTermsFacet,
2122
NestedTermsFacet,
22-
TermsFacet,
2323
)
2424
from invenio_records_resources.services.records.links import (
2525
RecordLink,
@@ -44,9 +44,11 @@ class MockSearchOptions(SearchOptions):
4444
splitchar="**",
4545
label="Type",
4646
),
47-
"subject": TermsFacet(
48-
field="metadata.subject",
49-
label="Subject",
47+
"subjects": CombinedTermsFacet(
48+
field="metadata.subjects.scheme",
49+
combined_field="metadata.combined_subjects",
50+
parents=["SC1", "SC2"],
51+
label="Subjects",
5052
),
5153
}
5254

tests/mock_module/mappings/os-v1/records/record-v1.0.0.json

+17-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,23 @@
2121
}
2222
}
2323
},
24-
"subject": {
24+
"subjects": {
25+
"type": "object",
26+
"properties": {
27+
"subject": {
28+
"type": "text",
29+
"fields": {
30+
"keyword": {
31+
"type": "keyword"
32+
}
33+
}
34+
},
35+
"scheme": {
36+
"type": "keyword"
37+
}
38+
}
39+
},
40+
"combined_subjects": {
2541
"type": "keyword"
2642
},
2743
"inner_record": {

0 commit comments

Comments
 (0)