Skip to content

Commit

Permalink
Merge pull request #4653 from voxel51/cherry-pick-indexes-in-progress
Browse files Browse the repository at this point in the history
Cherry picking #4633 for release
  • Loading branch information
brimoor authored Aug 11, 2024
2 parents ffeb1f4 + 76f06a6 commit 4ec6b9f
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 20 deletions.
36 changes: 23 additions & 13 deletions fiftyone/core/collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -602,7 +602,8 @@ def stats(
Args:
include_media (False): whether to include stats about the size of
the raw media in the collection
include_indexes (False): whether to return the stats on the indexes
include_indexes (False): whether to include stats on the dataset's
indexes
compressed (False): whether to return the sizes of collections in
their compressed form on disk (True) or the logical
uncompressed size of the collections (False). This option is
Expand Down Expand Up @@ -649,13 +650,17 @@ def stats(
total_bytes += media_bytes

if include_indexes:
ii = self.get_index_information(include_size=True)
ii = self.get_index_information(include_stats=True)
index_bytes = {k: v["size"] for k, v in ii.items()}
indexes_bytes = sum(index_bytes.values())
indexes_in_progress = [
k for k, v in ii.items() if v.get("in_progress", False)
]

stats["indexes_count"] = len(index_bytes)
stats["indexes_bytes"] = indexes_bytes
stats["indexes_size"] = etau.to_human_bytes_str(indexes_bytes)
stats["indexes_in_progress"] = indexes_in_progress
stats["index_bytes"] = index_bytes
stats["index_sizes"] = {
k: etau.to_human_bytes_str(v) for k, v in index_bytes.items()
Expand Down Expand Up @@ -9060,14 +9065,16 @@ def list_indexes(self):
"""
return list(self.get_index_information().keys())

def get_index_information(self, include_size=False):
def get_index_information(self, include_stats=False):
"""Returns a dictionary of information about the indexes on this
collection.
See :meth:`pymongo:pymongo.collection.Collection.index_information` for
details on the structure of this dictionary.
include_size (False): whether to include the size of each index
Args:
include_stats (False): whether to include the size and build status
of each index
Returns:
a dict mapping index names to info dicts
Expand All @@ -9078,15 +9085,16 @@ def get_index_information(self, include_size=False):
fields_map = self._get_db_fields_map(reverse=True)
sample_info = self._dataset._sample_collection.index_information()

if include_size:
conn = foo.get_db_conn()
cs = conn.command(
"collstats", self._dataset._sample_collection_name
)
if include_stats:
cs = self._dataset._sample_collstats()
for key, size in cs["indexSizes"].items():
if key in sample_info:
sample_info[key]["size"] = size

for key in cs["indexBuilds"]:
if key in sample_info:
sample_info[key]["in_progress"] = True

for key, info in sample_info.items():
if len(info["key"]) == 1:
field = info["key"][0][0]
Expand All @@ -9099,14 +9107,16 @@ def get_index_information(self, include_size=False):
fields_map = self._get_db_fields_map(frames=True, reverse=True)
frame_info = self._dataset._frame_collection.index_information()

if include_size:
cs = conn.command(
"collstats", self._dataset._frame_collection_name
)
if include_stats:
cs = self._dataset._frame_collstats()
for key, size in cs["indexSizes"].items():
if key in frame_info:
frame_info[key]["size"] = size

for key in cs["indexBuilds"]:
if key in frame_info:
frame_info[key]["in_progress"] = True

for key, info in frame_info.items():
if len(info["key"]) == 1:
field = info["key"][0][0]
Expand Down
26 changes: 20 additions & 6 deletions fiftyone/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1107,7 +1107,8 @@ def stats(
Args:
include_media (False): whether to include stats about the size of
the raw media in the dataset
include_indexes (False): whether to return the stats on the indexes
include_indexes (False): whether to include stats on the dataset's
indexes
compressed (False): whether to return the sizes of collections in
their compressed form on disk (True) or the logical
uncompressed size of the collections (False)
Expand All @@ -1119,17 +1120,15 @@ def stats(

stats = {}

conn = foo.get_db_conn()

cs = conn.command("collstats", self._sample_collection_name)
cs = self._sample_collstats()
samples_bytes = cs["storageSize"] if compressed else cs["size"]
stats["samples_count"] = cs["count"]
stats["samples_bytes"] = samples_bytes
stats["samples_size"] = etau.to_human_bytes_str(samples_bytes)
total_bytes = samples_bytes

if contains_videos:
cs = conn.command("collstats", self._frame_collection_name)
cs = self._frame_collstats()
frames_bytes = cs["storageSize"] if compressed else cs["size"]
stats["frames_count"] = cs["count"]
stats["frames_bytes"] = frames_bytes
Expand All @@ -1149,13 +1148,17 @@ def stats(
total_bytes += media_bytes

if include_indexes:
ii = self.get_index_information(include_size=True)
ii = self.get_index_information(include_stats=True)
index_bytes = {k: v["size"] for k, v in ii.items()}
indexes_bytes = sum(index_bytes.values())
indexes_in_progress = [
k for k, v in ii.items() if v.get("in_progress", False)
]

stats["indexes_count"] = len(index_bytes)
stats["indexes_bytes"] = indexes_bytes
stats["indexes_size"] = etau.to_human_bytes_str(indexes_bytes)
stats["indexes_in_progress"] = indexes_in_progress
stats["index_bytes"] = index_bytes
stats["index_sizes"] = {
k: etau.to_human_bytes_str(v) for k, v in index_bytes.items()
Expand All @@ -1167,6 +1170,17 @@ def stats(

return stats

def _sample_collstats(self):
    """Returns the raw ``collstats`` command output for this dataset's
    sample collection.

    Returns:
        a dict of collection statistics (sizes, counts, ``indexSizes``,
        ``indexBuilds``, etc.)
    """
    db = foo.get_db_conn()
    return db.command("collstats", self._sample_collection_name)

def _frame_collstats(self):
    """Returns the raw ``collstats`` command output for this dataset's
    frame collection, if one exists.

    Returns:
        a dict of collection statistics, or ``None`` if the dataset has
        no frame collection (i.e., it contains no videos)
    """
    name = self._frame_collection_name
    if name is None:
        return None

    db = foo.get_db_conn()
    return db.command("collstats", name)

def first(self):
"""Returns the first sample in the dataset.
Expand Down
39 changes: 38 additions & 1 deletion tests/unittests/dataset_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
| `voxel51.com <https://voxel51.com/>`_
|
"""

import time
from copy import deepcopy, copy
from datetime import date, datetime, timedelta
Expand All @@ -13,6 +14,7 @@
import random
import string
import unittest
from unittest.mock import patch

from bson import ObjectId
from mongoengine import ValidationError
Expand Down Expand Up @@ -544,7 +546,7 @@ def test_index_sizes(self):
dataset.create_index("gt.detections.label")
dataset.create_index("frames.gt.detections.label")

info = dataset.get_index_information(include_size=True)
info = dataset.get_index_information(include_stats=True)

indexes = [
"id",
Expand All @@ -560,6 +562,41 @@ def test_index_sizes(self):
for d in info.values():
self.assertTrue(d.get("size") is not None)

@drop_datasets
def test_index_in_progress(self):
    """Verifies that in-progress index builds are surfaced by
    ``get_index_information(include_stats=True)`` and by
    ``stats(include_indexes=True)``.
    """
    # Build a video sample with detections at both the sample and frame
    # level, so the dataset has both a sample and a frame collection
    gt = fo.Detections(detections=[fo.Detection(label="foo")])
    sample = fo.Sample(filepath="video.mp4", gt=gt)
    sample.frames[1] = fo.Frame(gt=gt)

    dataset = fo.Dataset()
    dataset.add_sample(sample)

    dataset.create_index("gt.detections.label")
    dataset.create_index("frames.gt.detections.label")

    # Grab the real collstats payloads, then simulate in-progress builds
    # by injecting `indexBuilds` entries (the MongoDB field that lists
    # indexes currently being built)
    sample_stats = dataset._sample_collstats()
    sample_stats["indexBuilds"] = ["gt.detections.label_1"]

    frame_stats = dataset._frame_collstats()
    frame_stats["indexBuilds"] = ["gt.detections.label_1"]

    # Patch the collstats helpers so the code under test sees the doctored
    # payloads rather than live server state
    with patch.object(
        dataset, "_sample_collstats", return_value=sample_stats
    ), patch.object(dataset, "_frame_collstats", return_value=frame_stats):
        # Each injected index should be flagged as `in_progress`
        info = dataset.get_index_information(include_stats=True)
        for key in [
            "gt.detections.label",
            "frames.gt.detections.label",
        ]:
            self.assertTrue(info[key].get("in_progress"))

        # stats() should expose the same names via `indexes_in_progress`
        stats = dataset.stats(include_indexes=True)
        self.assertTrue("indexes_in_progress" in stats)
        self.assertEqual(
            set(stats["indexes_in_progress"]),
            {"gt.detections.label", "frames.gt.detections.label"},
        )

@drop_datasets
def test_iter_samples(self):
dataset = fo.Dataset()
Expand Down

0 comments on commit 4ec6b9f

Please sign in to comment.