openedx · bradenmacdonald · Feb 8, 2024 · Jan 31, 2024 · Feb 1, 2024 · Feb 8, 2024
diff --git a/openedx_tagging/core/tagging/data.py b/openedx_tagging/core/tagging/data.py
@@ -21,6 +21,7 @@ class TagData(TypedDict):
     value: str
     external_id: str | None
     child_count: int
+    descendant_count: int
     depth: int
     parent_value: str | None
     # Note: usage_count may or may not be present, depending on the request.

diff --git a/openedx_tagging/core/tagging/models/base.py b/openedx_tagging/core/tagging/models/base.py
@@ -170,6 +170,18 @@ def child_count(self) -> int:
             return self.taxonomy.tag_set.filter(parent=self).count()
         return 0
 
+    @cached_property
+    def descendant_count(self) -> int:
+        """
+        How many descendant tags this tag has in the taxonomy.
+        """
+        if self.taxonomy and not self.taxonomy.allow_free_text:
+            return self.taxonomy.tag_set.filter(
+                Q(parent__parent=self) |
+                Q(parent__parent__parent=self)
+            ).count() + self.child_count
+        return 0
+
     def clean(self):
         """
         Validate this tag before saving
@@ -413,6 +425,7 @@ def _get_filtered_tags_free_text(
         qs = qs.annotate(
             depth=Value(0),
             child_count=Value(0),
+            descendant_count=Value(0),
             external_id=Value(None, output_field=models.CharField()),
             parent_value=Value(None, output_field=models.CharField()),
             _id=Value(None, output_field=models.CharField()),
@@ -444,12 +457,16 @@ def _get_filtered_tags_one_level(
         else:
             qs = self.tag_set.filter(parent=None).annotate(depth=Value(0))
             qs = qs.annotate(parent_value=Value(None, output_field=models.CharField()))
-        qs = qs.annotate(child_count=models.Count("children"))
+        qs = qs.annotate(child_count=models.Count("children", distinct=True))
+        qs = qs.annotate(grandchild_count=models.Count("children__children", distinct=True))
+        qs = qs.annotate(great_grandchild_count=models.Count("children__children__children"))
+        qs = qs.annotate(descendant_count=F("child_count") + F("grandchild_count") + F("great_grandchild_count"))
         # Filter by search term:
         if search_term:
             qs = qs.filter(value__icontains=search_term)
         qs = qs.annotate(_id=F("id"))  # ID has an underscore to encourage use of 'value' rather than this internal ID
-        qs = qs.values("value", "child_count", "depth", "parent_value", "external_id", "_id").order_by("value")
+        qs = qs.values("value", "child_count", "descendant_count", "depth", "parent_value", "external_id", "_id")
+        qs = qs.order_by("value")
         if include_counts:
             # We need to include the count of how many times this tag is used to tag objects.
             # You'd think we could just use:
@@ -501,14 +518,27 @@ def _get_filtered_tags_deep(
                     if pk is not None:
                         matching_ids.append(pk)
             qs = qs.filter(pk__in=matching_ids)
-            qs = qs.annotate(child_count=models.Count("children", filter=Q(children__pk__in=matching_ids)))
+            qs = qs.annotate(
+                child_count=models.Count("children", filter=Q(children__pk__in=matching_ids), distinct=True),
+                grandchild_count=models.Count(
+                    "children__children", filter=Q(children__children__pk__in=matching_ids), distinct=True,
+                ),
+                great_grandchild_count=models.Count(
+                    "children__children__children",
+                    filter=Q(children__children__children__pk__in=matching_ids),
+                ),
+            )
+            qs = qs.annotate(descendant_count=F("child_count") + F("grandchild_count") + F("great_grandchild_count"))
         elif excluded_values:
             raise NotImplementedError("Using excluded_values without search_term is not currently supported.")
             # We could implement this in the future but I'd prefer to get rid of the "excluded_values" API altogether.
             # It remains to be seen if it's useful to do that on the backend, or if we can do it better/simpler on the
             # frontend.
         else:
-            qs = qs.annotate(child_count=models.Count("children"))
+            qs = qs.annotate(child_count=models.Count("children", distinct=True))
+            qs = qs.annotate(grandchild_count=models.Count("children__children", distinct=True))
+            qs = qs.annotate(great_grandchild_count=models.Count("children__children__children"))
+            qs = qs.annotate(descendant_count=F("child_count") + F("grandchild_count") + F("great_grandchild_count"))
 
         # Add the "depth" to each tag:
         qs = Tag.annotate_depth(qs)
@@ -527,7 +557,8 @@ def _get_filtered_tags_deep(
         # Add the parent value
         qs = qs.annotate(parent_value=F("parent__value"))
         qs = qs.annotate(_id=F("id"))  # ID has an underscore to encourage use of 'value' rather than this internal ID
-        qs = qs.values("value", "child_count", "depth", "parent_value", "external_id", "_id").order_by("sort_key")
+        qs = qs.values("value", "child_count", "descendant_count", "depth", "parent_value", "external_id", "_id")
+        qs = qs.order_by("sort_key")
         if include_counts:
             # Including the counts is a bit tricky; see the comment above in _get_filtered_tags_one_level()
             obj_tags = ObjectTag.objects.filter(tag_id=models.OuterRef("pk")).order_by().annotate(

diff --git a/openedx_tagging/core/tagging/rest_api/v1/serializers.py b/openedx_tagging/core/tagging/rest_api/v1/serializers.py
@@ -208,6 +208,7 @@ class TagDataSerializer(UserPermissionsSerializerMixin, serializers.Serializer):
     value = serializers.CharField()
     external_id = serializers.CharField(allow_null=True)
     child_count = serializers.IntegerField()
+    descendant_count = serializers.IntegerField()
     depth = serializers.IntegerField()
     parent_value = serializers.CharField(allow_null=True)
     usage_count = serializers.IntegerField(required=False)

diff --git a/tests/openedx_tagging/core/tagging/import_export/test_template.py b/tests/openedx_tagging/core/tagging/import_export/test_template.py
@@ -53,27 +53,27 @@ def test_import_template(self, template_file, parser_format):
             'Electronic instruments (ELECTRIC) (None) (children: 2)',
             '  Synthesizer (SYNTH) (Electronic instruments) (children: 0)',
             '  Theramin (THERAMIN) (Electronic instruments) (children: 0)',
-            'Percussion instruments (PERCUSS) (None) (children: 3)',
+            'Percussion instruments (PERCUSS) (None) (children: 3 + 6)',
             '  Chordophone (CHORD) (Percussion instruments) (children: 1)',
             '    Piano (PIANO) (Chordophone) (children: 0)',
             '  Idiophone (BELLS) (Percussion instruments) (children: 2)',
             '    Celesta (CELESTA) (Idiophone) (children: 0)',
             '    Hi-hat (HI-HAT) (Idiophone) (children: 0)',
-            '  Membranophone (DRUMS) (Percussion instruments) (children: 2)',
+            '  Membranophone (DRUMS) (Percussion instruments) (children: 2 + 1)',
             '    Cajón (CAJÓN) (Membranophone) (children: 1)',
             # This tag is present in the import files, but it will be omitted from get_tags()
             # because it sits beyond TAXONOMY_MAX_DEPTH.
             #      Pyle Stringed Jam Cajón (PYLE) (Cajón) (children: 0)
             '    Tabla (TABLA) (Membranophone) (children: 0)',
-            'String instruments (STRINGS) (None) (children: 2)',
+            'String instruments (STRINGS) (None) (children: 2 + 5)',
             '  Bowed strings (BOW) (String instruments) (children: 2)',
             '    Cello (CELLO) (Bowed strings) (children: 0)',
             '    Violin (VIOLIN) (Bowed strings) (children: 0)',
             '  Plucked strings (PLUCK) (String instruments) (children: 3)',
             '    Banjo (BANJO) (Plucked strings) (children: 0)',
             '    Harp (HARP) (Plucked strings) (children: 0)',
             '    Mandolin (MANDOLIN) (Plucked strings) (children: 0)',
-            'Wind instruments (WINDS) (None) (children: 2)',
+            'Wind instruments (WINDS) (None) (children: 2 + 5)',
             '  Brass (BRASS) (Wind instruments) (children: 2)',
             '    Trumpet (TRUMPET) (Brass) (children: 0)',
             '    Tuba (TUBA) (Brass) (children: 0)',

diff --git a/tests/openedx_tagging/core/tagging/test_api.py b/tests/openedx_tagging/core/tagging/test_api.py
@@ -126,8 +126,8 @@ def test_get_tags(self) -> None:
             "Bacteria (children: 2)",
             "  Archaebacteria (children: 0)",
             "  Eubacteria (children: 0)",
-            "Eukaryota (children: 5)",
-            "  Animalia (children: 7)",
+            "Eukaryota (children: 5 + 8)",
+            "  Animalia (children: 7 + 1)",
             "    Arthropoda (children: 0)",
             "    Chordata (children: 1)",  # The child of this is excluded due to depth limit
             "    Cnidaria (children: 0)",
@@ -155,7 +155,7 @@ def test_get_root_tags(self):
         assert pretty_format_tags(root_life_on_earth_tags, parent=False) == [
             'Archaea (children: 3)',
             'Bacteria (children: 2)',
-            'Eukaryota (children: 5)',
+            'Eukaryota (children: 5 + 8)',
         ]
 
     @override_settings(LANGUAGES=test_languages)
@@ -615,7 +615,7 @@ def get_object_tags():
             "  Proteoarchaeota (used: 0, children: 0)",
             "Bacteria (used: 0, children: 1)",  # does not contain "ar" but a child does
             "  Archaebacteria (used: 1, children: 0)",
-            "Eukaryota (used: 0, children: 1)",
+            "Eukaryota (used: 0, children: 1 + 2)",
             "  Animalia (used: 1, children: 2)",  # does not contain "ar" but a child does
             "    Arthropoda (used: 1, children: 0)",
             "    Cnidaria (used: 0, children: 0)",
@@ -637,7 +637,7 @@ def get_object_tags():
             "Bacteria (used: 0, children: 2)",
             "  Archaebacteria (used: 1, children: 0)",
             "  Eubacteria (used: 0, children: 0)",
-            "Eukaryota (used: 0, children: 4)",
+            "Eukaryota (used: 0, children: 4 + 7)",
             "  Animalia (used: 1, children: 7)",
             "    Arthropoda (used: 1, children: 0)",
             "    Chordata (used: 0, children: 0)",  # <<< Chordata has a matching child but we only support searching

diff --git a/tests/openedx_tagging/core/tagging/test_models.py b/tests/openedx_tagging/core/tagging/test_models.py
@@ -285,9 +285,9 @@ def test_get_root(self) -> None:
             del r["_id"]  # Remove the internal database IDs; they aren't interesting here and a other tests check them
         assert result == [
             # These are the root tags, in alphabetical order:
-            {"value": "Archaea", "child_count": 3, **common_fields},
-            {"value": "Bacteria", "child_count": 2, **common_fields},
-            {"value": "Eukaryota", "child_count": 5, **common_fields},
+            {"value": "Archaea", "child_count": 3, "descendant_count": 3, **common_fields},
+            {"value": "Bacteria", "child_count": 2, "descendant_count": 2, **common_fields},
+            {"value": "Eukaryota", "child_count": 5, "descendant_count": 13, **common_fields},
         ]
 
     def test_get_child_tags_one_level(self) -> None:
@@ -301,11 +301,11 @@ def test_get_child_tags_one_level(self) -> None:
             del r["_id"]  # Remove the internal database IDs; they aren't interesting here and a other tests check them
         assert result == [
             # These are the Eukaryota tags, in alphabetical order:
-            {"value": "Animalia", "child_count": 7, **common_fields},
-            {"value": "Fungi", "child_count": 0, **common_fields},
-            {"value": "Monera", "child_count": 0, **common_fields},
-            {"value": "Plantae", "child_count": 0, **common_fields},
-            {"value": "Protista", "child_count": 0, **common_fields},
+            {"value": "Animalia", "child_count": 7, "descendant_count": 8, **common_fields},
+            {"value": "Fungi", "child_count": 0, "descendant_count": 0, **common_fields},
+            {"value": "Monera", "child_count": 0, "descendant_count": 0, **common_fields},
+            {"value": "Plantae", "child_count": 0, "descendant_count": 0, **common_fields},
+            {"value": "Protista", "child_count": 0, "descendant_count": 0, **common_fields},
         ]
 
     def test_get_grandchild_tags_one_level(self) -> None:
@@ -319,13 +319,13 @@ def test_get_grandchild_tags_one_level(self) -> None:
             del r["_id"]  # Remove the internal database IDs; they aren't interesting here and a other tests check them
         assert result == [
             # These are the Eukaryota tags, in alphabetical order:
-            {"value": "Arthropoda", "child_count": 0, **common_fields},
-            {"value": "Chordata", "child_count": 1, **common_fields},
-            {"value": "Cnidaria", "child_count": 0, **common_fields},
-            {"value": "Ctenophora", "child_count": 0, **common_fields},
-            {"value": "Gastrotrich", "child_count": 0, **common_fields},
-            {"value": "Placozoa", "child_count": 0, **common_fields},
-            {"value": "Porifera", "child_count": 0, **common_fields},
+            {"value": "Arthropoda", "child_count": 0, "descendant_count": 0, **common_fields},
+            {"value": "Chordata", "child_count": 1, "descendant_count": 1, **common_fields},
+            {"value": "Cnidaria", "child_count": 0, "descendant_count": 0, **common_fields},
+            {"value": "Ctenophora", "child_count": 0, "descendant_count": 0, **common_fields},
+            {"value": "Gastrotrich", "child_count": 0, "descendant_count": 0, **common_fields},
+            {"value": "Placozoa", "child_count": 0, "descendant_count": 0, **common_fields},
+            {"value": "Porifera", "child_count": 0, "descendant_count": 0, **common_fields},
         ]
 
     def test_get_depth_1_search_term(self) -> None:
@@ -337,6 +337,7 @@ def test_get_depth_1_search_term(self) -> None:
             {
                 "value": "Archaea",
                 "child_count": 3,
+                "descendant_count": 3,
                 "depth": 0,
                 "usage_count": 0,
                 "parent_value": None,
@@ -355,6 +356,7 @@ def test_get_depth_1_child_search_term(self) -> None:
             {
                 "value": "Archaebacteria",
                 "child_count": 0,
+                "descendant_count": 0,
                 "depth": 1,
                 "parent_value": "Bacteria",
                 "external_id": None,
@@ -395,8 +397,8 @@ def test_get_all(self) -> None:
             "Bacteria (None) (children: 2)",
             "  Archaebacteria (Bacteria) (children: 0)",
             "  Eubacteria (Bacteria) (children: 0)",
-            "Eukaryota (None) (children: 5)",
-            "  Animalia (Eukaryota) (children: 7)",
+            "Eukaryota (None) (children: 5 + 8)",
+            "  Animalia (Eukaryota) (children: 7 + 1)",
             "    Arthropoda (Animalia) (children: 0)",
             "    Chordata (Animalia) (children: 1)",  # note this has a child but the child is not included
             "    Cnidaria (Animalia) (children: 0)",
@@ -431,7 +433,7 @@ def test_search_2(self) -> None:
         """
         result = pretty_format_tags(self.taxonomy.get_filtered_tags(search_term="chordata"))
         assert result == [
-            "Eukaryota (None) (children: 1)",  # Has one child that matches
+            "Eukaryota (None) (children: 1 + 1)",  # Has one child that matches, plus one additional matching descendant
             "  Animalia (Eukaryota) (children: 1)",
             "    Chordata (Animalia) (children: 0)",  # this is the matching tag.
         ]
@@ -445,7 +447,7 @@ def test_search_3(self) -> None:
         assert result == [
             "Archaea (None) (children: 1)",
             "  Proteoarchaeota (Archaea) (children: 0)",
-            "Eukaryota (None) (children: 2)",  # Note the "children: 2" is correct - 2 direct children are in the result
+            "Eukaryota (None) (children: 2 + 2)",  # 2 direct matching children, 2 additional matching descendants
             "  Animalia (Eukaryota) (children: 2)",
             "    Arthropoda (Animalia) (children: 0)",  # match
             "    Gastrotrich (Animalia) (children: 0)",  # match
@@ -464,6 +466,7 @@ def test_tags_deep(self) -> None:
                 "depth": 3,
                 "usage_count": 0,
                 "child_count": 0,
+                "descendant_count": 0,
                 "external_id": None,
                 "_id": 21,  # These IDs are hard-coded in the test fixture file
             }
@@ -527,7 +530,7 @@ def test_tree_sort(self) -> None:
         taxonomy = self.create_sort_test_taxonomy()
         result = pretty_format_tags(taxonomy.get_filtered_tags())
         assert result == [
-            "1 (None) (children: 4)",
+            "1 (None) (children: 4 + 1)",
             "  1 A (1) (children: 0)",
             "  11 (1) (children: 0)",
             "  11111 (1) (children: 1)",
@@ -547,6 +550,37 @@ def test_tree_sort(self) -> None:
             "  azure (ALPHABET) (children: 0)",
         ]
 
+    def test_descendant_counts(self) -> None:
+        """
+        Test getting the descendant count on a taxonomy known to cause aggregation
+        bugs unless the aggregations are correctly specified with distinct=True
+
+        https://docs.djangoproject.com/en/5.0/topics/db/aggregation/#combining-multiple-aggregations
+        """
+        taxonomy = api.create_taxonomy("ESDC Subset")
+        api.add_tag_to_taxonomy(taxonomy, "Interests")  # root tag
+        api.add_tag_to_taxonomy(taxonomy, "Holland Codes", parent_tag_value="Interests")  # child tag
+        # Create the grandchild tag:
+        g_tag = api.add_tag_to_taxonomy(taxonomy, "Interests - Holland Codes", parent_tag_value="Holland Codes")
+        # Create the 6 great-grandchild tags:
+        api.add_tag_to_taxonomy(taxonomy, "Artistic", parent_tag_value=g_tag.value)
+        api.add_tag_to_taxonomy(taxonomy, "Conventional", parent_tag_value=g_tag.value)
+        api.add_tag_to_taxonomy(taxonomy, "Enterprising", parent_tag_value=g_tag.value)
+        api.add_tag_to_taxonomy(taxonomy, "Investigative", parent_tag_value=g_tag.value)
+        api.add_tag_to_taxonomy(taxonomy, "Realistic", parent_tag_value=g_tag.value)
+        api.add_tag_to_taxonomy(taxonomy, "Social", parent_tag_value=g_tag.value)
+
+        result = pretty_format_tags(taxonomy.get_filtered_tags(depth=1, include_counts=True))
+        assert result == [
+            "Interests (None) (used: 0, children: 1 + 7)",  # 1 child + (1 grandchild and 6 great grandchild tags)
+        ]
+        result2 = pretty_format_tags(taxonomy.get_filtered_tags(depth=None, include_counts=True))
+        assert result2 == [
+            "Interests (None) (used: 0, children: 1 + 7)",
+            "  Holland Codes (Interests) (used: 0, children: 1 + 6)",
+            "    Interests - Holland Codes (Holland Codes) (used: 0, children: 6)",
+        ]
+
 
 class TestFilteredTagsFreeTextTaxonomy(TestCase):
     """