Skip to content

Commit

Permalink
Merge pull request #4653 from voxel51/cherry-pick-indexes-in-progress
Browse files Browse the repository at this point in the history
Cherry picking #4633 for release
  • Loading branch information
brimoor authored Aug 11, 2024
2 parents ffeb1f4 + 76f06a6 commit 4ec6b9f
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 20 deletions.
36 changes: 23 additions & 13 deletions fiftyone/core/collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -602,7 +602,8 @@ def stats(
Args:
include_media (False): whether to include stats about the size of
the raw media in the collection
include_indexes (False): whether to return the stats on the indexes
include_indexes (False): whether to include stats on the dataset's
indexes
compressed (False): whether to return the sizes of collections in
their compressed form on disk (True) or the logical
uncompressed size of the collections (False). This option is
Expand Down Expand Up @@ -649,13 +650,17 @@ def stats(
total_bytes += media_bytes

if include_indexes:
ii = self.get_index_information(include_size=True)
ii = self.get_index_information(include_stats=True)
index_bytes = {k: v["size"] for k, v in ii.items()}
indexes_bytes = sum(index_bytes.values())
indexes_in_progress = [
k for k, v in ii.items() if v.get("in_progress", False)
]

stats["indexes_count"] = len(index_bytes)
stats["indexes_bytes"] = indexes_bytes
stats["indexes_size"] = etau.to_human_bytes_str(indexes_bytes)
stats["indexes_in_progress"] = indexes_in_progress
stats["index_bytes"] = index_bytes
stats["index_sizes"] = {
k: etau.to_human_bytes_str(v) for k, v in index_bytes.items()
Expand Down Expand Up @@ -9060,14 +9065,16 @@ def list_indexes(self):
"""
return list(self.get_index_information().keys())

def get_index_information(self, include_size=False):
def get_index_information(self, include_stats=False):
"""Returns a dictionary of information about the indexes on this
collection.
See :meth:`pymongo:pymongo.collection.Collection.index_information` for
details on the structure of this dictionary.
include_size (False): whether to include the size of each index
Args:
include_stats (False): whether to include the size and build status
of each index
Returns:
a dict mapping index names to info dicts
Expand All @@ -9078,15 +9085,16 @@ def get_index_information(self, include_size=False):
fields_map = self._get_db_fields_map(reverse=True)
sample_info = self._dataset._sample_collection.index_information()

if include_size:
conn = foo.get_db_conn()
cs = conn.command(
"collstats", self._dataset._sample_collection_name
)
if include_stats:
cs = self._dataset._sample_collstats()
for key, size in cs["indexSizes"].items():
if key in sample_info:
sample_info[key]["size"] = size

for key in cs["indexBuilds"]:
if key in sample_info:
sample_info[key]["in_progress"] = True

for key, info in sample_info.items():
if len(info["key"]) == 1:
field = info["key"][0][0]
Expand All @@ -9099,14 +9107,16 @@ def get_index_information(self, include_size=False):
fields_map = self._get_db_fields_map(frames=True, reverse=True)
frame_info = self._dataset._frame_collection.index_information()

if include_size:
cs = conn.command(
"collstats", self._dataset._frame_collection_name
)
if include_stats:
cs = self._dataset._frame_collstats()
for key, size in cs["indexSizes"].items():
if key in frame_info:
frame_info[key]["size"] = size

for key in cs["indexBuilds"]:
if key in frame_info:
frame_info[key]["in_progress"] = True

for key, info in frame_info.items():
if len(info["key"]) == 1:
field = info["key"][0][0]
Expand Down
26 changes: 20 additions & 6 deletions fiftyone/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1107,7 +1107,8 @@ def stats(
Args:
include_media (False): whether to include stats about the size of
the raw media in the dataset
include_indexes (False): whether to return the stats on the indexes
include_indexes (False): whether to include stats on the dataset's
indexes
compressed (False): whether to return the sizes of collections in
their compressed form on disk (True) or the logical
uncompressed size of the collections (False)
Expand All @@ -1119,17 +1120,15 @@ def stats(

stats = {}

conn = foo.get_db_conn()

cs = conn.command("collstats", self._sample_collection_name)
cs = self._sample_collstats()
samples_bytes = cs["storageSize"] if compressed else cs["size"]
stats["samples_count"] = cs["count"]
stats["samples_bytes"] = samples_bytes
stats["samples_size"] = etau.to_human_bytes_str(samples_bytes)
total_bytes = samples_bytes

if contains_videos:
cs = conn.command("collstats", self._frame_collection_name)
cs = self._frame_collstats()
frames_bytes = cs["storageSize"] if compressed else cs["size"]
stats["frames_count"] = cs["count"]
stats["frames_bytes"] = frames_bytes
Expand All @@ -1149,13 +1148,17 @@ def stats(
total_bytes += media_bytes

if include_indexes:
ii = self.get_index_information(include_size=True)
ii = self.get_index_information(include_stats=True)
index_bytes = {k: v["size"] for k, v in ii.items()}
indexes_bytes = sum(index_bytes.values())
indexes_in_progress = [
k for k, v in ii.items() if v.get("in_progress", False)
]

stats["indexes_count"] = len(index_bytes)
stats["indexes_bytes"] = indexes_bytes
stats["indexes_size"] = etau.to_human_bytes_str(indexes_bytes)
stats["indexes_in_progress"] = indexes_in_progress
stats["index_bytes"] = index_bytes
stats["index_sizes"] = {
k: etau.to_human_bytes_str(v) for k, v in index_bytes.items()
Expand All @@ -1167,6 +1170,17 @@ def stats(

return stats

def _sample_collstats(self):
    """Returns the raw ``collstats`` command output for this dataset's
    sample collection.

    Returns:
        a dict of collection statistics (sizes, counts, ``indexSizes``,
        ``indexBuilds``, etc.)
    """
    db = foo.get_db_conn()
    return db.command("collstats", self._sample_collection_name)

def _frame_collstats(self):
    """Returns the raw ``collstats`` command output for this dataset's
    frame collection, if one exists.

    Returns:
        a dict of collection statistics, or ``None`` if the dataset has
        no frame collection (i.e., it contains no videos)
    """
    name = self._frame_collection_name
    if name is None:
        return None

    db = foo.get_db_conn()
    return db.command("collstats", name)

def first(self):
"""Returns the first sample in the dataset.
Expand Down
39 changes: 38 additions & 1 deletion tests/unittests/dataset_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
| `voxel51.com <https://voxel51.com/>`_
|
"""

import time
from copy import deepcopy, copy
from datetime import date, datetime, timedelta
Expand All @@ -13,6 +14,7 @@
import random
import string
import unittest
from unittest.mock import patch

from bson import ObjectId
from mongoengine import ValidationError
Expand Down Expand Up @@ -544,7 +546,7 @@ def test_index_sizes(self):
dataset.create_index("gt.detections.label")
dataset.create_index("frames.gt.detections.label")

info = dataset.get_index_information(include_size=True)
info = dataset.get_index_information(include_stats=True)

indexes = [
"id",
Expand All @@ -560,6 +562,41 @@ def test_index_sizes(self):
for d in info.values():
self.assertTrue(d.get("size") is not None)

@drop_datasets
def test_index_in_progress(self):
    """Verifies that in-progress index builds are surfaced by
    ``get_index_information(include_stats=True)`` and by
    ``stats(include_indexes=True)``.
    """
    # Build a video sample with detections at both the sample and frame
    # level, so the dataset has both a sample and a frame collection
    gt = fo.Detections(detections=[fo.Detection(label="foo")])
    sample = fo.Sample(filepath="video.mp4", gt=gt)
    sample.frames[1] = fo.Frame(gt=gt)

    dataset = fo.Dataset()
    dataset.add_sample(sample)

    dataset.create_index("gt.detections.label")
    dataset.create_index("frames.gt.detections.label")

    # Grab the real collstats payloads, then simulate in-progress builds
    # by injecting `indexBuilds` entries (the MongoDB field that lists
    # indexes currently being built)
    sample_stats = dataset._sample_collstats()
    sample_stats["indexBuilds"] = ["gt.detections.label_1"]

    frame_stats = dataset._frame_collstats()
    frame_stats["indexBuilds"] = ["gt.detections.label_1"]

    # Patch the collstats helpers so the code under test sees the doctored
    # payloads rather than live server state
    with patch.object(
        dataset, "_sample_collstats", return_value=sample_stats
    ), patch.object(dataset, "_frame_collstats", return_value=frame_stats):
        # Each injected index should be flagged as `in_progress`
        info = dataset.get_index_information(include_stats=True)
        for key in [
            "gt.detections.label",
            "frames.gt.detections.label",
        ]:
            self.assertTrue(info[key].get("in_progress"))

        # stats() should expose the same names via `indexes_in_progress`
        stats = dataset.stats(include_indexes=True)
        self.assertTrue("indexes_in_progress" in stats)
        self.assertEqual(
            set(stats["indexes_in_progress"]),
            {"gt.detections.label", "frames.gt.detections.label"},
        )

@drop_datasets
def test_iter_samples(self):
dataset = fo.Dataset()
Expand Down

0 comments on commit 4ec6b9f

Please sign in to comment.