From ea4b1b1e456ed123328e32a9d585cbaec6dcd1a1 Mon Sep 17 00:00:00 2001
From: Guillaume Viger <fenekku@fenekku.com>
Date: Mon, 20 Nov 2023 09:25:49 -0500
Subject: [PATCH] facets: provide CombinedTermsFacet to fix #798 [+]

This approach doesn't use 'nested' fields, but instead relies on a
field containing <parent><split char><child> to aggregate correctly.
---
 .gitignore                                    |   3 +
 .../services/records/facets/__init__.py       |   4 +-
 .../services/records/facets/facets.py         | 228 ++++++++++++++++--
 .../services/records/facets/response.py       |   9 +-
 tests/mock_module/config.py                   |  12 +-
 .../mappings/os-v1/records/record-v1.0.0.json |  18 +-
 .../mappings/os-v2/records/record-v1.0.0.json |  18 +-
 .../mappings/v7/records/record-v1.0.0.json    |  18 +-
 tests/mock_module/schemas.py                  |  12 +-
 tests/resources/test_resource_faceting.py     | 134 +++++++++-
 tests/services/test_service_facets.py         | 184 ++++++++------
 11 files changed, 533 insertions(+), 107 deletions(-)
diff --git a/.gitignore b/.gitignore
index e110aa3c..115b6df7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -129,3 +129,6 @@ dmypy.json
 # Pyre type checker
 .pyre/
 .DS_Store
+
+# VSCode editor
+.vscode/
diff --git a/invenio_records_resources/services/records/facets/__init__.py b/invenio_records_resources/services/records/facets/__init__.py
index 55f41322..ba5ab798 100644
--- a/invenio_records_resources/services/records/facets/__init__.py
+++ b/invenio_records_resources/services/records/facets/__init__.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 # Copyright (C) 2021 CERN.
+# Copyright (C) 2023 Northwestern University.
 #
 # Invenio-Records-Resources is free software; you can redistribute it and/or
 # modify it under the terms of the MIT License; see LICENSE file for more
@@ -8,7 +9,7 @@
 
 """Facets."""
 
-from .facets import CFTermsFacet, NestedTermsFacet, TermsFacet
+from .facets import CFTermsFacet, CombinedTermsFacet, NestedTermsFacet, TermsFacet
 from .labels import RecordRelationLabels
 from .response import FacetsResponse
 
@@ -17,5 +18,6 @@
     "FacetsResponse",
     "NestedTermsFacet",
     "RecordRelationLabels",
+    "CombinedTermsFacet",
     "TermsFacet",
 )
diff --git a/invenio_records_resources/services/records/facets/facets.py b/invenio_records_resources/services/records/facets/facets.py
index 26127663..b4bb3ccc 100644
--- a/invenio_records_resources/services/records/facets/facets.py
+++ b/invenio_records_resources/services/records/facets/facets.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 # Copyright (C) 2021 CERN.
+# Copyright (C) 2023 Northwestern University.
 #
 # Invenio-Records-Resources is free software; you can redistribute it and/or
 # modify it under the terms of the MIT License; see LICENSE file for more
@@ -8,6 +9,8 @@
 
 """Facets types defined."""
 
+from functools import reduce
+
 from invenio_search.engine import dsl
 
 
@@ -103,15 +106,7 @@ class NestedTermsFacet(TermsFacet):
                 splitchar='::',
                 label=_('Resource types'),
                 value_labels=VocabularyL10NLabels(current_service)
-            ),
-
-            'resource_type': NestedTermsFacet(
-                field='metadata.resource_type.type',
-                subfield='metadata.resource_type.subtype',
-                splitchar='::',
-                label=_('Resource types'),
-                value_labels=VocabularyL10NLabels(current_service)
-            ),
+            )
         }
     """
 
@@ -149,7 +144,7 @@ def _parse_values(self, filter_values):
         .. code-block:: python
 
             {
-                'publication': ['publication::book', 'publication::journal'],
+                'publication': ['book', 'journal'],
                 'dataset': []
             }
 
@@ -178,12 +173,10 @@ def get_value_filter(self, parsed_value):
         # Expects to get a value from the output of "_parse_values()"."
         field_value, subfield_values = parsed_value
 
+        q = dsl.Q("term", **{self._field: field_value})
         if subfield_values:
-            return dsl.Q("term", **{self._field: field_value}) & dsl.Q(
-                "terms", **{self._subfield: subfield_values}
-            )
-        else:
-            return dsl.Q("term", **{self._field: field_value})
+            q &= dsl.Q("terms", **{self._subfield: subfield_values})
+        return q
 
     def add_filter(self, filter_values):
         """Construct a filter query for the facet."""
@@ -246,6 +239,211 @@ def get_labelled_values(
         return ret_val
 
 
+class CombinedTermsFacet(NestedTermsFacet):
+    """
+    Facet to mimic a nested aggregation without having to define a 'nested' field.
+
+    This facet is needed to prevent the "crossed wires" problem of a regular
+    NestedTermsFacet applied to documents with multiple 2-level objects. For example,
+    and the motivating use case for this facet, a "subjects" field with the
+    following mapping:
+
+    .. code-block:: json
+
+        "subjects": {
+            "type": "object",
+            "properties": {
+                "scheme": {
+                    "type": "keyword"
+                },
+                "subject": {
+                    "type": "keyword"
+                }
+            }
+        }
+
+    will lead the document with the following subjects field:
+
+    .. code-block:: json
+
+        "subjects": [
+            {"scheme": "SC1", "subject": "SU1"},
+            {"scheme": "SC2", "subject": "SU2"}
+        ]
+
+    to be internally-indexed in the following manner:
+
+    .. code-block:: json
+
+        "subjects.scheme": ["SC1", "SC2"]
+        "subjects.subject": ["SU1", "SU2"]
+
+    This indexing loses the original pairwise relationships. This causes searches
+    and aggregations for scheme = SC1 and subject = SU2 to surface the above document
+    when they shouldn't. This is the "crossed wires" problem that this Facet class
+    resolves for aggregations without using "nested" types and "nested" searches
+    (the classic solution to this problem).
+
+    This facet requires the following indexed format:
+
+    .. code-block:: json
+
+        "<field>": ["<parent>", ...]
+        // may have independent "<child>" entries
+        "<combined field>": ["<parent><split char><child>", ..., "<child>"]
+
+    The reasoning given for avoiding "nested" fields is to allow regular queries on
+    those fields that would have had to be made "nested" (only nested queries can be
+    done on those fields). This is a UX concern since end-users can make queries to
+    metadata fields directly and they wouldn't be able to anymore (without a lot more
+    changes).
+
+    Although this facet allows us to forego the need for a "nested" type field and
+    nested queries to filter on that field, it *does* do extra work that is thrown away.
+    See `get_aggregation` and `get_labelled_values`.
+
+    This facet formats the result of the aggregation such that it looks like it was
+    a nested aggregation.
+    """
+
+    def __init__(self, field, combined_field, parents, splitchar="::", **kwargs):
+        """Constructor.
+
+        :param field: top-level/parent field
+        :type field: str
+        :param combined_field: field containing combined terms
+        :type combined_field: str
+        :param parents: parent/top-level values, or a callable returning them
+        :type parents: Iterable[str] | Callable[[], Iterable[str]]
+        :param splitchar: splitting/combining token, defaults to "::"
+        :type splitchar: str, optional
+        """
+        self._field = field
+        self._combined_field = combined_field
+        self._parents = parents
+        self._cached_parents = None
+        self._splitchar = splitchar
+        TermsFacet.__init__(self, **kwargs)
+
+    def get_parents(self):
+        """Return parents.
+
+        We have to delay getting the parents since it may require an application
+        context.
+        """
+        if not self._cached_parents:
+            if callable(self._parents):
+                self._cached_parents = self._parents()
+            else:
+                self._cached_parents = self._parents
+        return self._cached_parents
+
+    def get_aggregation(self):
+        """Aggregate.
+
+        This aggregation repeats ALL group subaggregations for each bucket generated
+        by the top-level terms aggregation. This is to overcome the
+        "irrelevant flooding" problem: when aggregating on a subfield, the top 10
+        (by default) most frequent terms of that subfield are selected, but those
+        terms may not be relevant to the parent because the parent-child relationship
+        is lost when not using "nested". So to make sure only relevant terms are
+        used to select the documents in the aggregation, we "include" (filter) for them.
+
+        Only the subaggregation corresponding to the top-level group will be kept in
+        get_labelled_values.
+        """
+        return dsl.A(
+            {
+                "terms": {
+                    "field": self._field,
+                    "aggs": {
+                        f"inner_{parent}": {
+                            "terms": {
+                                "field": self._combined_field,
+                                "include": f"{parent}{self._splitchar}.*",
+                            },
+                        }
+                        for parent in self.get_parents()
+                    },
+                }
+            }
+        )
+
+    def get_labelled_values(self, data, filter_values):
+        """Get a labelled version of a bucket.
+
+        :param data: Bucket data returned by document engine for a field
+        :type data: dsl.response.aggs.FieldBucketData
+        """
+
+        def get_child_buckets(bucket, key):
+            """Get lower-level/child buckets."""
+            result = []
+
+            # Ignore other subaggregations and only retrieve the inner_{key} one.
+            # inner_{key} should always be present unless there is a mismatch between
+            # the parents passed to generate subaggregations and the parents actually
+            # present. To not break in that case, default to an empty bucket list.
+            inner_data = getattr(bucket, f"inner_{key}", dsl.AttrDict({"buckets": []}))
+
+            for inner_bucket in inner_data.buckets:
+                # get raw key and appropriately formatted key
+                key_raw_inner = self.get_value(inner_bucket)
+                prefix = key + self._splitchar
+                key_inner = key_raw_inner[len(prefix):]  # fmt: skip
+
+                result.append(
+                    {
+                        "key": key_inner,
+                        "doc_count": self.get_metric(inner_bucket),
+                        "label": key_inner,
+                        "is_selected": self.is_filtered(key_raw_inner, filter_values),
+                    }
+                )
+
+            return result
+
+        def get_parent_buckets(data):
+            """Get top-level/group buckets.
+
+            :param data: Bucket data returned by document engine for a field
+            :type data: dsl.response.aggs.FieldBucketData
+            :return: list of labelled buckets
+            :rtype: List[dict]
+            """
+            label_map = self.get_label_mapping(data.buckets)
+            result = []
+            for bucket in data.buckets:
+                key = self.get_value(bucket)
+                result.append(
+                    {
+                        "key": key,
+                        "doc_count": self.get_metric(bucket),
+                        "label": label_map[key],
+                        "is_selected": self.is_filtered(key, filter_values),
+                        "inner": {"buckets": get_child_buckets(bucket, key)},
+                    }
+                )
+            return result
+
+        return {"buckets": get_parent_buckets(data), "label": str(self._label)}
+
+    def get_value_filter(self, parsed_value):
+        """Return a filter for a single parsed value."""
+        # Expect to get a value from the output of `_parse_values()`
+        field_value, subfield_values = parsed_value
+
+        # recombine
+        subfield_values = [
+            f"{field_value}{self._splitchar}{subvalue}" for subvalue in subfield_values
+        ]
+
+        q = dsl.Q("term", **{self._field: field_value})
+        if subfield_values:
+            q &= dsl.Q("terms", **{self._combined_field: subfield_values})
+        return q
+
+
 class CFFacetMixin:
     """Mixin to abstract the custom fields path."""
 
diff --git a/invenio_records_resources/services/records/facets/response.py b/invenio_records_resources/services/records/facets/response.py
index 07da05a1..ddb75580 100644
--- a/invenio_records_resources/services/records/facets/response.py
+++ b/invenio_records_resources/services/records/facets/response.py
@@ -47,9 +47,12 @@ class FacetsResponseForRequest(cls):
     def _iter_facets(self):
         # _facets_param instance is added to _search by the FacetsParam.apply
         for name, facet in self._facets_param.facets.items():
-            yield name, facet, getattr(
-                self.aggregations, name
-            ), self._facets_param.selected_values.get(name, [])
+            yield (
+                name,
+                facet,
+                getattr(self.aggregations, name),
+                self._facets_param.selected_values.get(name, []),
+            )
 
     @property
     def facets(self):
diff --git a/tests/mock_module/config.py b/tests/mock_module/config.py
index 1d90173e..98c0cb30 100644
--- a/tests/mock_module/config.py
+++ b/tests/mock_module/config.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 # Copyright (C) 2020-2021 CERN.
-# Copyright (C) 2020-2021 Northwestern University.
+# Copyright (C) 2020-2023 Northwestern University.
 #
 # Invenio-Records-Resources is free software; you can redistribute it and/or
 # modify it under the terms of the MIT License; see LICENSE file for more
@@ -18,8 +18,8 @@
 from invenio_records_resources.services.records.components import FilesComponent
 from invenio_records_resources.services.records.config import SearchOptions
 from invenio_records_resources.services.records.facets import (
+    CombinedTermsFacet,
     NestedTermsFacet,
-    TermsFacet,
 )
 from invenio_records_resources.services.records.links import (
     RecordLink,
@@ -44,9 +44,11 @@ class MockSearchOptions(SearchOptions):
             splitchar="**",
             label="Type",
         ),
-        "subject": TermsFacet(
-            field="metadata.subject",
-            label="Subject",
+        "subjects": CombinedTermsFacet(
+            field="metadata.subjects.scheme",
+            combined_field="metadata.combined_subjects",
+            parents=["SC1", "SC2"],
+            label="Subjects",
         ),
     }
 
diff --git a/tests/mock_module/mappings/os-v1/records/record-v1.0.0.json b/tests/mock_module/mappings/os-v1/records/record-v1.0.0.json
index 9ab10ced..cff83f4d 100644
--- a/tests/mock_module/mappings/os-v1/records/record-v1.0.0.json
+++ b/tests/mock_module/mappings/os-v1/records/record-v1.0.0.json
@@ -21,7 +21,23 @@
               }
             }
           },
-          "subject": {
+          "subjects": {
+            "type": "object",
+            "properties": {
+              "subject": {
+                "type": "text",
+                "fields": {
+                  "keyword": {
+                    "type": "keyword"
+                  }
+                }
+              },
+              "scheme": {
+                "type": "keyword"
+              }
+            }
+          },
+          "combined_subjects": {
             "type": "keyword"
           },
           "inner_record": {
diff --git a/tests/mock_module/mappings/os-v2/records/record-v1.0.0.json b/tests/mock_module/mappings/os-v2/records/record-v1.0.0.json
index 9ab10ced..cff83f4d 100644
--- a/tests/mock_module/mappings/os-v2/records/record-v1.0.0.json
+++ b/tests/mock_module/mappings/os-v2/records/record-v1.0.0.json
@@ -21,7 +21,23 @@
               }
             }
           },
-          "subject": {
+          "subjects": {
+            "type": "object",
+            "properties": {
+              "subject": {
+                "type": "text",
+                "fields": {
+                  "keyword": {
+                    "type": "keyword"
+                  }
+                }
+              },
+              "scheme": {
+                "type": "keyword"
+              }
+            }
+          },
+          "combined_subjects": {
             "type": "keyword"
           },
           "inner_record": {
diff --git a/tests/mock_module/mappings/v7/records/record-v1.0.0.json b/tests/mock_module/mappings/v7/records/record-v1.0.0.json
index 9ab10ced..cff83f4d 100644
--- a/tests/mock_module/mappings/v7/records/record-v1.0.0.json
+++ b/tests/mock_module/mappings/v7/records/record-v1.0.0.json
@@ -21,7 +21,23 @@
               }
             }
           },
-          "subject": {
+          "subjects": {
+            "type": "object",
+            "properties": {
+              "subject": {
+                "type": "text",
+                "fields": {
+                  "keyword": {
+                    "type": "keyword"
+                  }
+                }
+              },
+              "scheme": {
+                "type": "keyword"
+              }
+            }
+          },
+          "combined_subjects": {
             "type": "keyword"
           },
           "inner_record": {
diff --git a/tests/mock_module/schemas.py b/tests/mock_module/schemas.py
index 4eaafd8e..4b34e61d 100644
--- a/tests/mock_module/schemas.py
+++ b/tests/mock_module/schemas.py
@@ -2,7 +2,7 @@
 #
 # This file is part of Invenio.
 # Copyright (C) 2021 CERN.
-# Copyright (C) 2021 Northwestern University.
+# Copyright (C) 2021-2023 Northwestern University.
 #
 # Invenio-Records-Resources is free software; you can redistribute it and/or
 # modify it under the terms of the MIT License; see LICENSE file for more
@@ -24,6 +24,13 @@ class TypeSchema(Schema):
     subtype = fields.Str()
 
 
+class SubjectSchema(Schema):
+    """Nested subject schema used for faceting tests."""
+
+    scheme = fields.Str()
+    subject = fields.Str()
+
+
 class ReferencedCreatedBySchema(Schema):
     """Nested type schema for fake created by field."""
 
@@ -48,7 +55,8 @@ class MetadataSchema(Schema):
 
     title = fields.Str(required=True, validate=validate.Length(min=3))
     type = fields.Nested(TypeSchema)
-    subject = fields.Str()
+    subjects = fields.List(fields.Nested(SubjectSchema))
+    combined_subjects = fields.List(fields.Str)
     inner_record = fields.Dict()
     # referenced records
     referenced_created_by = fields.Nested(ReferencedCreatedBySchema)
diff --git a/tests/resources/test_resource_faceting.py b/tests/resources/test_resource_faceting.py
index 2be4e2a0..787feb8d 100644
--- a/tests/resources/test_resource_faceting.py
+++ b/tests/resources/test_resource_faceting.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 # Copyright (C) 2020 CERN.
-# Copyright (C) 2020 Northwestern University.
+# Copyright (C) 2020-2023 Northwestern University.
 #
 # Invenio-Records-Resources is free software; you can redistribute it and/or
 # modify it under the terms of the MIT License; see LICENSE file for more
@@ -21,8 +21,8 @@
 # 2- links are generated
 
 
-@pytest.fixture(scope="module")
-def three_indexed_records(app, identity_simple, search):
+@pytest.fixture()
+def three_indexed_records(app, identity_simple, search_clear):
     # NOTE: search is used (and not search_clear) here because all tests
     #       assume 3 records have been indexed and NO tests in this module
     #       adds/deletes any.
@@ -51,9 +51,9 @@ def test_aggregating(client, headers, three_indexed_records):
     response_aggs = response.json["aggregations"]
 
     expected_aggs = {
-        "subject": {
+        "subjects": {
             "buckets": [],
-            "label": "Subject",
+            "label": "Subjects",
         },
         "type": {
             "label": "Type",
@@ -101,9 +101,9 @@ def test_post_filtering(client, headers, three_indexed_records):
     # Test aggregation is the same
     response_aggs = response.json["aggregations"]
     expected_aggs = {
-        "subject": {
+        "subjects": {
             "buckets": [],
-            "label": "Subject",
+            "label": "Subjects",
         },
         "type": {
             "label": "Type",
@@ -150,6 +150,126 @@ def test_post_filtering(client, headers, three_indexed_records):
     )
 
 
+def test_nested_post_filtering(client, headers, identity_simple, service, search_clear):
+    data = {
+        "metadata": {
+            "title": "SU1 + (SC1, SU2)",
+            "subjects": [
+                {"subject": "SU1"},  # should be ignored in aggregation results
+                {"scheme": "SC1", "subject": "SU2"},
+            ],
+            "combined_subjects": ["SC1::SU2"],
+        },
+    }
+    service.create(identity_simple, data)
+    data = {
+        "metadata": {
+            "title": "(SC1, SU4) + (SC2, SU2)",
+            "subjects": [
+                {"scheme": "SC1", "subject": "SU4"},
+                {"scheme": "SC2", "subject": "SU2"},
+            ],
+            "combined_subjects": ["SC1::SU4", "SC2::SU2"],
+        },
+    }
+    service.create(identity_simple, data)
+    data = {
+        "metadata": {
+            "title": "(SC2, SU3)",
+            "subjects": [
+                {"scheme": "SC2", "subject": "SU3"},
+            ],
+            "combined_subjects": ["SC2::SU3"],
+        },
+    }
+    service.create(identity_simple, data)
+    Record.index.refresh()
+
+    # First scenario
+    # Test that:
+    # - different subjects/subfields but same scheme result in a union post_filter
+    #   (a record that only has 1 of the scheme+subject should be selected)
+    # - hierarchical dependency is enforced (selects for scheme AND subject together)
+    response = client.get("/mocks?subjects=SC1::SU2&subjects=SC1::SU3", headers=headers)
+    resource_aggs = response.json["aggregations"]
+
+    expected_aggs = {
+        "subjects": {
+            "buckets": [
+                {
+                    "doc_count": 2,
+                    "inner": {
+                        "buckets": [
+                            {
+                                "doc_count": 1,
+                                "is_selected": True,
+                                "key": "SU2",
+                                "label": "SU2",
+                            },
+                            {
+                                "doc_count": 1,
+                                "is_selected": False,
+                                "key": "SU4",
+                                "label": "SU4",
+                            },
+                        ]
+                    },
+                    "is_selected": False,  # only selected if SC1 is separately passed
+                    "key": "SC1",
+                    "label": "SC1",
+                },
+                {
+                    "doc_count": 2,
+                    "inner": {
+                        "buckets": [
+                            {
+                                "doc_count": 1,
+                                "is_selected": False,
+                                "key": "SU2",
+                                "label": "SU2",
+                            },
+                            {
+                                "doc_count": 1,
+                                "is_selected": False,
+                                "key": "SU3",
+                                "label": "SU3",
+                            },
+                        ]
+                    },
+                    "is_selected": False,
+                    "key": "SC2",
+                    "label": "SC2",
+                },
+            ],
+            "label": "Subjects",
+        },
+        "type": {
+            "buckets": [],
+            "label": "Type",
+        },
+    }
+    assert expected_aggs == resource_aggs
+    hits = response.json["hits"]["hits"]
+    assert 1 == len(hits)
+    assert "SU1 + (SC1, SU2)" == hits[0]["metadata"]["title"]
+
+    # 2nd scenario
+    # Test that:
+    # - different schemes/fields result in a union post_filter as well
+    #   (a record that only has 1 of the scheme+subject should be selected)
+    response = client.get("/mocks?subjects=SC1::SU1&subjects=SC2::SU3", headers=headers)
+    resource_aggs = response.json["aggregations"]
+
+    # Reformat expected_aggs
+    inner = expected_aggs["subjects"]["buckets"][0]["inner"]
+    inner["buckets"][0]["is_selected"] = False
+    expected_aggs["subjects"]["buckets"][1]["inner"]["buckets"][1]["is_selected"] = True
+    assert expected_aggs == resource_aggs
+    hits = response.json["hits"]["hits"]
+    assert 1 == len(hits)
+    assert "(SC2, SU3)" == hits[0]["metadata"]["title"]
+
+
 #
 # 2- links are generated
 #
diff --git a/tests/services/test_service_facets.py b/tests/services/test_service_facets.py
index c1a96e1a..4e17147a 100644
--- a/tests/services/test_service_facets.py
+++ b/tests/services/test_service_facets.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 #
 # Copyright (C) 2020 CERN.
-# Copyright (C) 2020 Northwestern University.
+# Copyright (C) 2020-2023 Northwestern University.
 # Copyright (C) 2022 Graz University of Technology.
 #
 # Invenio-Records-Resources is free software; you can redistribute it and/or
@@ -17,8 +17,8 @@
 #
 # Fixtures
 #
-@pytest.fixture(scope="module")
-def records(app, service, identity_simple):
+@pytest.fixture()
+def records(app, service, identity_simple, search_clear):
     """Input data (as coming from the view layer)."""
     items = []
     for idx in range(2):
@@ -26,7 +26,6 @@ def records(app, service, identity_simple):
             "metadata": {
                 "title": f"00{idx}",
                 "type": {"type": f"Foo{idx}", "subtype": f"Bar{idx}"},
-                "subject": f"Subject{idx}",
             },
         }
         items.append(service.create(identity_simple, data))
@@ -38,28 +37,14 @@ def records(app, service, identity_simple):
 # Tests
 #
 def test_facets(app, service, identity_simple, records):
-    """Create a record."""
     # Search it
     res = service.search(identity_simple)
     service_aggs = res.aggregations
 
     expected_aggs = {
-        "subject": {
-            "buckets": [
-                {
-                    "doc_count": 1,
-                    "is_selected": False,
-                    "key": "Subject0",
-                    "label": "Subject0",
-                },
-                {
-                    "doc_count": 1,
-                    "is_selected": False,
-                    "key": "Subject1",
-                    "label": "Subject1",
-                },
-            ],
-            "label": "Subject",
+        "subjects": {
+            "buckets": [],
+            "label": "Subjects",
         },
         "type": {
             "buckets": [
@@ -99,7 +84,6 @@ def test_facets(app, service, identity_simple, records):
             "label": "Type",
         },
     }
-
     assert expected_aggs == service_aggs
 
 
@@ -109,22 +93,9 @@ def test_facets_post_filtering_union(app, service, identity_simple, records):
     res = service.search(identity_simple, facets={"type": ["Foo0", "Foo1"]})
     service_aggs = res.aggregations
     expected_aggs = {
-        "subject": {
-            "buckets": [
-                {
-                    "doc_count": 1,
-                    "is_selected": False,
-                    "key": "Subject0",
-                    "label": "Subject0",
-                },
-                {
-                    "doc_count": 1,
-                    "is_selected": False,
-                    "key": "Subject1",
-                    "label": "Subject1",
-                },
-            ],
-            "label": "Subject",
+        "subjects": {
+            "buckets": [],
+            "label": "Subjects",
         },
         "type": {
             "label": "Type",
@@ -174,26 +145,13 @@ def test_facets_post_filtering_intersection(app, service, identity_simple, recor
     """Different facets should result in intersection of results."""
     # No records match both facets
     res = service.search(
-        identity_simple, facets={"type": ["Foo1"], "subject": ["Subject0"]}
+        identity_simple, facets={"type": ["Foo1"], "subjects": ["Subject0"]}
     )
     service_aggs = res.aggregations
     expected_aggs = {
-        "subject": {
-            "buckets": [
-                {
-                    "doc_count": 1,
-                    "is_selected": True,
-                    "key": "Subject0",
-                    "label": "Subject0",
-                },
-                {
-                    "doc_count": 1,
-                    "is_selected": False,
-                    "key": "Subject1",
-                    "label": "Subject1",
-                },
-            ],
-            "label": "Subject",
+        "subjects": {
+            "buckets": [],
+            "label": "Subjects",
         },
         "type": {
             "label": "Type",
@@ -244,22 +202,9 @@ def test_facets_post_filtering(app, service, identity_simple, records):
     res = service.search(identity_simple, facets={"type": ["Foo1"]})
     service_aggs = res.aggregations
     expected_aggs = {
-        "subject": {
-            "buckets": [
-                {
-                    "doc_count": 1,
-                    "is_selected": False,
-                    "key": "Subject0",
-                    "label": "Subject0",
-                },
-                {
-                    "doc_count": 1,
-                    "is_selected": False,
-                    "key": "Subject1",
-                    "label": "Subject1",
-                },
-            ],
-            "label": "Subject",
+        "subjects": {
+            "buckets": [],
+            "label": "Subjects",
         },
         "type": {
             "label": "Type",
@@ -303,3 +248,100 @@ def test_facets_post_filtering(app, service, identity_simple, records):
     # Test hits are filtered
     assert 1 == len(res)
     assert set(["001"]) == set([h["metadata"]["title"] for h in res])
+
+
+def test_combined_terms_facets(app, service, identity_simple, search_clear):
+    # Create records with nested patterns of interest
+    data = {
+        "metadata": {
+            "title": "SU1 + (SC1, SU2)",
+            "subjects": [
+                {"subject": "SU1"},  # should be ignored in aggregation results
+                {"scheme": "SC1", "subject": "SU2"},
+            ],
+            # Note that you would typically want to have a mechanism in place to
+            # auto-fill this field based on the values in "subjects"
+            "combined_subjects": [
+                "SU1",  # should be ignored in aggregation results
+                "SC1::SU2",
+            ],
+        },
+    }
+    service.create(identity_simple, data)
+    data = {
+        "metadata": {
+            "title": "(SC1, SU2) + (SC2, SU3)",
+            "subjects": [
+                {"scheme": "SC1", "subject": "SU2"},
+                {"scheme": "SC2", "subject": "SU3"},
+            ],
+            "combined_subjects": ["SC1::SU2", "SC2::SU3"],
+        },
+    }
+    service.create(identity_simple, data)
+    data = {
+        "metadata": {
+            "title": "(SC1, SU3)",
+            "subjects": [
+                {"scheme": "SC1", "subject": "SU3"},
+            ],
+            "combined_subjects": ["SC1::SU3"],
+        },
+    }
+    service.create(identity_simple, data)
+    Record.index.refresh()
+
+    # Action
+    res = service.search(identity_simple)
+    service_aggs = res.aggregations
+
+    expected_aggs = {
+        "subjects": {
+            "buckets": [
+                {
+                    "doc_count": 3,
+                    "inner": {
+                        "buckets": [
+                            {
+                                "doc_count": 2,
+                                "is_selected": False,
+                                "key": "SU2",
+                                "label": "SU2",
+                            },
+                            {
+                                "doc_count": 1,
+                                "is_selected": False,
+                                "key": "SU3",
+                                "label": "SU3",
+                            },
+                        ]
+                    },
+                    "is_selected": False,
+                    "key": "SC1",
+                    "label": "SC1",
+                },
+                {
+                    "doc_count": 1,
+                    "inner": {
+                        "buckets": [
+                            {
+                                "doc_count": 1,
+                                "is_selected": False,
+                                "key": "SU3",
+                                "label": "SU3",
+                            },
+                        ]
+                    },
+                    "is_selected": False,
+                    "key": "SC2",
+                    "label": "SC2",
+                },
+            ],
+            "label": "Subjects",
+        },
+        "type": {
+            "buckets": [],
+            "label": "Type",
+        },
+    }
+    assert expected_aggs == service_aggs