diff --git a/cl/api/tests.py b/cl/api/tests.py index e15bbfe279..1956af2fef 100644 --- a/cl/api/tests.py +++ b/cl/api/tests.py @@ -1,8 +1,10 @@ import json +from collections import OrderedDict, defaultdict from datetime import date, datetime, timedelta, timezone from http import HTTPStatus from typing import Any, Dict from unittest import mock +from unittest.mock import MagicMock, patch from urllib.parse import parse_qs, urlparse from asgiref.sync import async_to_sync, sync_to_async @@ -29,7 +31,7 @@ from cl.api.factories import WebhookEventFactory, WebhookFactory from cl.api.models import WEBHOOK_EVENT_STATUS, WebhookEvent, WebhookEventType from cl.api.pagination import VersionBasedPagination -from cl.api.utils import LoggingMixin, get_logging_prefix +from cl.api.utils import LoggingMixin, get_logging_prefix, invert_user_logs from cl.api.views import build_chart_data, coverage_data, make_court_variable from cl.api.webhooks import send_webhook_event from cl.audio.api_views import AudioViewSet @@ -3297,3 +3299,159 @@ async def test_count_with_no_results(self): self.assertEqual(response.status_code, 200) self.assertEqual(response.data["count"], 0) + + +class TestApiUsage(SimpleTestCase): + """Tests for combining v3 and v4 API usage data""" + + def setUp(self): + """ + Set up test environment before each test. + We create fresh mocks to avoid state bleeding between tests. + """ + self.mock_redis = MagicMock() + self.mock_pipeline = MagicMock() + self.mock_redis.pipeline.return_value = self.mock_pipeline + + @patch("cl.api.utils.get_redis_interface") + def test_single_date_combined_usage(self, mock_get_redis): + """ + Test that for a single date: + 1. Both v3 and v4 usage is fetched + 2. Counts are properly combined + 3. The output format matches template expectations + """ + mock_get_redis.return_value = self.mock_redis + + self.mock_pipeline.execute.return_value = [ + # First result: v3 API data + [("1", 100.0), ("2", 50.0)], + # Second result: v4 API data + [("1", 50.0), ("2", 25.0)], + ] + + results = invert_user_logs( + start="2023-01-01", end="2023-01-01", add_usernames=False + ) + + expected = defaultdict(dict) + expected[1] = OrderedDict({"2023-01-01": 150, "total": 150}) + expected[2] = OrderedDict({"2023-01-01": 75, "total": 75}) + + self.assertEqual(results, expected) + + @patch("cl.api.utils.get_redis_interface") + def test_multiple_dates_combined_usage(self, mock_get_redis): + """ + Test that across multiple dates: + 1. API usage is correctly combined from both v3 and v4 + 2. Totals accumulate properly across dates + 3. The data structure matches template expectations + 4. Date ordering is preserved + """ + mock_get_redis.return_value = self.mock_redis + self.mock_pipeline.execute.return_value = [ + # January 1st - v3 API + [("1", 100.0), ("2", 50.0)], + # January 1st - v4 API + [("1", 50.0), ("2", 25.0)], + # January 2nd - v3 API + [("1", 200.0), ("2", 100.0)], + # January 2nd - v4 API + [("1", 100.0), ("2", 50.0)], + ] + + results = invert_user_logs( + start="2023-01-01", end="2023-01-02", add_usernames=False + ) + + # User 1: + # Jan 1: 150 (100 from v3 + 50 from v4) + # Jan 2: 300 (200 from v3 + 100 from v4) + # Total: 450 + # User 2: + # Jan 1: 75 (50 from v3 + 25 from v4) + # Jan 2: 150 (100 from v3 + 50 from v4) + # Total: 225 + expected = defaultdict(dict) + expected[1] = OrderedDict( + {"2023-01-01": 150, "2023-01-02": 300, "total": 450} + ) + expected[2] = OrderedDict( + {"2023-01-01": 75, "2023-01-02": 150, "total": 225} + ) + + self.assertEqual(results, expected) + + @patch("cl.api.utils.get_redis_interface") + def test_anonymous_user_handling(self, mock_get_redis): + """ + Test the handling of anonymous users, which have special requirements: + 1. Both 'None' and 'AnonymousUser' should be treated as the same user + 2. They should be combined under 'AnonymousUser' in the output + 3. Their usage should be summed correctly across both identifiers + 4. They should work with both API versions + """ + mock_get_redis.return_value = self.mock_redis + self.mock_pipeline.execute.return_value = [ + # January 1st - v3 + [ + ("None", 30.0), + ("1", 100.0), + ("AnonymousUser", 20.0), + ], + # January 1st - v4 + [ + ("AnonymousUser", 25.0), + ("1", 50.0), + ], + # January 2nd - v3 + [("None", 40.0), ("1", 200.0)], + # January 2nd - v4 + [ + ("1", 100.0), + ("AnonymousUser", 35.0), + ], + ] + + results = invert_user_logs( + start="2023-01-01", end="2023-01-02", add_usernames=False + ) + + expected = defaultdict(dict) + # Expected results: + # Anonymous user on Jan 1: + # - v3: 30 (None) + 20 (AnonymousUser) = 50 + # - v4: 25 (AnonymousUser) = 25 + # - Total for Jan 1: 75 + # Anonymous user on Jan 2: + # - v3: 40 (None) = 40 + # - v4: 35 (AnonymousUser) = 35 + # - Total for Jan 2: 75 + # Anonymous total across all dates: 150 + expected["AnonymousUser"] = OrderedDict( + { + "2023-01-01": 75, + "2023-01-02": 75, + "total": 150, + } + ) + expected[1] = OrderedDict( + { + "2023-01-01": 150, + "2023-01-02": 300, + "total": 450, + } + ) + + self.assertEqual(results, expected) + + self.assertNotIn("None", results) + + self.assertIn("AnonymousUser", results) + + # Verify ordering is maintained even with anonymous users + anonymous_data = results["AnonymousUser"] + dates = list(anonymous_data.keys()) + dates.remove("total") + self.assertEqual(dates, ["2023-01-01", "2023-01-02"]) diff --git a/cl/api/utils.py b/cl/api/utils.py index 570c48c769..84630f7b5d 100644 --- a/cl/api/utils.py +++ b/cl/api/utils.py @@ -1,6 +1,7 @@ import logging from collections import OrderedDict, defaultdict from datetime import date, datetime, timedelta, timezone +from itertools import batched, chain from typing import Any, Dict, List, Set, TypedDict, Union import eyecite @@ -704,56 +705,64 @@ def invert_user_logs( end: Union[str, datetime], add_usernames: bool = True, ) -> Dict[str, Dict[str, int]]: - """Invert the user logs for a period of time + """Aggregate API usage statistics per user over a date range. - The user logs have the date in the key and the user as part of the set: + - Anonymous users are aggregated under the key 'AnonymousUser'. + - Both v3 and v4 API counts are combined in the results. - 'api:v3.user.d:2016-10-01.counts': { - mlissner: 22, - joe_hazard: 33, - } - - This inverts these entries to: + :param start: Beginning date (inclusive) for the query range + :param end: End date (inclusive) for the query range + :param add_usernames: If True, replaces user IDs with usernames as keys. + When False, uses only user IDs as keys. - users: { - mlissner: { - 2016-10-01: 22, - total: 22, - }, - joe_hazard: { - 2016-10-01: 33, - total: 33, - } - } - :param start: The beginning date (inclusive) you want the results for. A - :param end: The end date (inclusive) you want the results for. - :param add_usernames: Stats are stored with the user ID. If this is True, - add an alias in the returned dictionary that contains the username as well. - :return The inverted dictionary + :return: Dictionary mapping user identifiers (usernames if `add_usernames=True`, + otherwise user IDs) to their daily API usage counts and totals. + Inner dictionaries are ordered by date. Only dates with usage are included. """ r = get_redis_interface("STATS") pipe = r.pipeline() dates = make_date_str_list(start, end) + versions = ["v3", "v4"] for d in dates: - pipe.zrange(f"api:v3.user.d:{d}.counts", 0, -1, withscores=True) + for version in versions: + pipe.zrange( + f"api:{version}.user.d:{d}.counts", + 0, + -1, + withscores=True, + ) + + # results contains alternating v3/v4 API usage data for each date queried. + # For example, if querying 2023-01-01 to 2023-01-02, results might look like: + # [ + # # 2023-01-01 v3 data: [(user_id, count), ...] + # [("1", 100.0), ("2", 50.0)], + # # 2023-01-01 v4 data + # [("1", 50.0), ("2", 25.0)], + # # 2023-01-02 v3 data + # [("1", 200.0), ("2", 100.0)], + # # 2023-01-02 v4 data + # [("1", 100.0), ("2", 50.0)] + # ] + # We zip this with dates to combine v3/v4 counts per user per day results = pipe.execute() - # results is a list of results for each of the zrange queries above. Zip - # those results with the date that created it, and invert the whole thing. out: defaultdict = defaultdict(dict) - for d, result in zip(dates, results): - for user_id, count in result: - if user_id == "None" or user_id == "AnonymousUser": - user_id = "AnonymousUser" - else: - user_id = int(user_id) - count = int(count) - if out.get(user_id): - out[user_id][d] = count - out[user_id]["total"] += count - else: - out[user_id] = {d: count, "total": count} + + def update_user_counts(_user_id, _count, _date): + user_is_anonymous = _user_id == "None" or _user_id == "AnonymousUser" + _user_id = "AnonymousUser" if user_is_anonymous else int(_user_id) + _count = int(_count) + out.setdefault(_user_id, OrderedDict()) + out[_user_id].setdefault(_date, 0) + out[_user_id][_date] += _count + out[_user_id].setdefault("total", 0) + out[_user_id]["total"] += _count + + for d, api_usage in zip(dates, batched(results, len(versions))): + for user_id, count in chain(*api_usage): + update_user_counts(user_id, count, d) # Sort the values for k, v in out.items():