From 931c3bc9f4514ae690d6a61fc1943c9364a11e4d Mon Sep 17 00:00:00 2001 From: Elisa Anguita Date: Tue, 31 Dec 2024 12:41:46 -0300 Subject: [PATCH 1/5] fix(api): include v4 usage in user's recent_api_usage --- cl/api/utils.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/cl/api/utils.py b/cl/api/utils.py index 570c48c769..795992b0a7 100644 --- a/cl/api/utils.py +++ b/cl/api/utils.py @@ -1,6 +1,7 @@ import logging from collections import OrderedDict, defaultdict from datetime import date, datetime, timedelta, timezone +from itertools import chain from typing import Any, Dict, List, Set, TypedDict, Union import eyecite @@ -737,23 +738,26 @@ def invert_user_logs( dates = make_date_str_list(start, end) for d in dates: pipe.zrange(f"api:v3.user.d:{d}.counts", 0, -1, withscores=True) + pipe.zrange(f"api:v4.user.d:{d}.counts", 0, -1, withscores=True) results = pipe.execute() # results is a list of results for each of the zrange queries above. Zip # those results with the date that created it, and invert the whole thing. out: defaultdict = defaultdict(dict) - for d, result in zip(dates, results): - for user_id, count in result: - if user_id == "None" or user_id == "AnonymousUser": - user_id = "AnonymousUser" - else: - user_id = int(user_id) - count = int(count) - if out.get(user_id): - out[user_id][d] = count - out[user_id]["total"] += count - else: - out[user_id] = {d: count, "total": count} + + def update_user_counts(_user_id, _count, _date): + user_is_anonymous = _user_id == "None" or _user_id == "AnonymousUser" + _user_id = "AnonymousUser" if user_is_anonymous else int(_user_id) + _count = int(_count) + out.setdefault(_user_id, OrderedDict()) + out[_user_id].setdefault(_date, 0) + out[_user_id][_date] += _count + out[_user_id].setdefault("total", 0) + out[_user_id]["total"] += _count # Update total + + for d, v3_data, v4_data in zip(dates, results[::2], results[1::2]): + for user_id, count in chain(v3_data, v4_data): + update_user_counts(user_id, count, d) # Sort the values for k, v in out.items(): From 7e6317e22795d910a902e4a3f85a696993c9d64c Mon Sep 17 00:00:00 2001 From: Elisa Anguita Date: Tue, 31 Dec 2024 13:14:20 -0300 Subject: [PATCH 2/5] docs(api): update invert_user_logs docstring and add descriptive comment --- cl/api/utils.py | 51 ++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/cl/api/utils.py b/cl/api/utils.py index 795992b0a7..5970b0bce9 100644 --- a/cl/api/utils.py +++ b/cl/api/utils.py @@ -705,32 +705,19 @@ def invert_user_logs( end: Union[str, datetime], add_usernames: bool = True, ) -> Dict[str, Dict[str, int]]: - """Invert the user logs for a period of time + """Aggregate API usage statistics per user over a date range. - The user logs have the date in the key and the user as part of the set: + - Anonymous users are aggregated under the key 'AnonymousUser'. + - Both v3 and v4 API counts are combined in the results. - 'api:v3.user.d:2016-10-01.counts': { - mlissner: 22, - joe_hazard: 33, - } - - This inverts these entries to: + :param start: Beginning date (inclusive) for the query range + :param end: End date (inclusive) for the query range + :param add_usernames: If True, replaces user IDs with usernames as keys. + When False, uses only user IDs as keys. - users: { - mlissner: { - 2016-10-01: 22, - total: 22, - }, - joe_hazard: { - 2016-10-01: 33, - total: 33, - } - } - :param start: The beginning date (inclusive) you want the results for. A - :param end: The end date (inclusive) you want the results for. - :param add_usernames: Stats are stored with the user ID. If this is True, - add an alias in the returned dictionary that contains the username as well. - :return The inverted dictionary + :return: Dictionary mapping user identifiers (usernames if `add_usernames=True`, + otherwise user IDs) to their daily API usage counts and totals. + Inner dictionaries are ordered by date. Only dates with usage are included. """ r = get_redis_interface("STATS") pipe = r.pipeline() @@ -739,10 +726,22 @@ def invert_user_logs( for d in dates: pipe.zrange(f"api:v3.user.d:{d}.counts", 0, -1, withscores=True) pipe.zrange(f"api:v4.user.d:{d}.counts", 0, -1, withscores=True) + + # results contains alternating v3/v4 API usage data for each date queried. + # For example, if querying 2023-01-01 to 2023-01-02, results might look like: + # [ + # # 2023-01-01 v3 data: [(user_id, count), ...] + # [("1", 100.0), ("2", 50.0)], + # # 2023-01-01 v4 data + # [("1", 50.0), ("2", 25.0)], + # # 2023-01-02 v3 data + # [("1", 200.0), ("2", 100.0)], + # # 2023-01-02 v4 data + # [("1", 100.0), ("2", 50.0)] + # ] + # We zip this with dates to combine v3/v4 counts per user per day results = pipe.execute() - # results is a list of results for each of the zrange queries above. Zip - # those results with the date that created it, and invert the whole thing. out: defaultdict = defaultdict(dict) def update_user_counts(_user_id, _count, _date): @@ -753,7 +752,7 @@ def update_user_counts(_user_id, _count, _date): out[_user_id].setdefault(_date, 0) out[_user_id][_date] += _count out[_user_id].setdefault("total", 0) - out[_user_id]["total"] += _count # Update total + out[_user_id]["total"] += _count for d, v3_data, v4_data in zip(dates, results[::2], results[1::2]): for user_id, count in chain(v3_data, v4_data): From ef20c82b11400901e1efa8fdd01fb6aa0b0b6668 Mon Sep 17 00:00:00 2001 From: Elisa Anguita Date: Tue, 31 Dec 2024 13:23:05 -0300 Subject: [PATCH 3/5] test(api): include tests for combining v3 and v4 API usage data --- cl/api/tests.py | 160 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 159 insertions(+), 1 deletion(-) diff --git a/cl/api/tests.py b/cl/api/tests.py index e15bbfe279..1956af2fef 100644 --- a/cl/api/tests.py +++ b/cl/api/tests.py @@ -1,8 +1,10 @@ import json +from collections import OrderedDict, defaultdict from datetime import date, datetime, timedelta, timezone from http import HTTPStatus from typing import Any, Dict from unittest import mock +from unittest.mock import MagicMock, patch from urllib.parse import parse_qs, urlparse from asgiref.sync import async_to_sync, sync_to_async @@ -29,7 +31,7 @@ from cl.api.factories import WebhookEventFactory, WebhookFactory from cl.api.models import WEBHOOK_EVENT_STATUS, WebhookEvent, WebhookEventType from cl.api.pagination import VersionBasedPagination -from cl.api.utils import LoggingMixin, get_logging_prefix +from cl.api.utils import LoggingMixin, get_logging_prefix, invert_user_logs from cl.api.views import build_chart_data, coverage_data, make_court_variable from cl.api.webhooks import send_webhook_event from cl.audio.api_views import AudioViewSet @@ -3297,3 +3299,159 @@ async def test_count_with_no_results(self): self.assertEqual(response.status_code, 200) self.assertEqual(response.data["count"], 0) + + +class TestApiUsage(SimpleTestCase): + """Tests for combining v3 and v4 API usage data""" + + def setUp(self): + """ + Set up test environment before each test. + We create fresh mocks to avoid state bleeding between tests. + """ + self.mock_redis = MagicMock() + self.mock_pipeline = MagicMock() + self.mock_redis.pipeline.return_value = self.mock_pipeline + + @patch("cl.api.utils.get_redis_interface") + def test_single_date_combined_usage(self, mock_get_redis): + """ + Test that for a single date: + 1. Both v3 and v4 usage is fetched + 2. Counts are properly combined + 3. The output format matches template expectations + """ + mock_get_redis.return_value = self.mock_redis + + self.mock_pipeline.execute.return_value = [ + # First result: v3 API data + [("1", 100.0), ("2", 50.0)], + # Second result: v4 API data + [("1", 50.0), ("2", 25.0)], + ] + + results = invert_user_logs( + start="2023-01-01", end="2023-01-01", add_usernames=False + ) + + expected = defaultdict(dict) + expected[1] = OrderedDict({"2023-01-01": 150, "total": 150}) + expected[2] = OrderedDict({"2023-01-01": 75, "total": 75}) + + self.assertEqual(results, expected) + + @patch("cl.api.utils.get_redis_interface") + def test_multiple_dates_combined_usage(self, mock_get_redis): + """ + Test that across multiple dates: + 1. API usage is correctly combined from both v3 and v4 + 2. Totals accumulate properly across dates + 3. The data structure matches template expectations + 4. Date ordering is preserved + """ + mock_get_redis.return_value = self.mock_redis + self.mock_pipeline.execute.return_value = [ + # January 1st - v3 API + [("1", 100.0), ("2", 50.0)], + # January 1st - v4 API + [("1", 50.0), ("2", 25.0)], + # January 2nd - v3 API + [("1", 200.0), ("2", 100.0)], + # January 2nd - v4 API + [("1", 100.0), ("2", 50.0)], + ] + + results = invert_user_logs( + start="2023-01-01", end="2023-01-02", add_usernames=False + ) + + # User 1: + # Jan 1: 150 (100 from v3 + 50 from v4) + # Jan 2: 300 (200 from v3 + 100 from v4) + # Total: 450 + # User 2: + # Jan 1: 75 (50 from v3 + 25 from v4) + # Jan 2: 150 (100 from v3 + 50 from v4) + # Total: 225 + expected = defaultdict(dict) + expected[1] = OrderedDict( + {"2023-01-01": 150, "2023-01-02": 300, "total": 450} + ) + expected[2] = OrderedDict( + {"2023-01-01": 75, "2023-01-02": 150, "total": 225} + ) + + self.assertEqual(results, expected) + + @patch("cl.api.utils.get_redis_interface") + def test_anonymous_user_handling(self, mock_get_redis): + """ + Test the handling of anonymous users, which have special requirements: + 1. Both 'None' and 'AnonymousUser' should be treated as the same user + 2. They should be combined under 'AnonymousUser' in the output + 3. Their usage should be summed correctly across both identifiers + 4. They should work with both API versions + """ + mock_get_redis.return_value = self.mock_redis + self.mock_pipeline.execute.return_value = [ + # January 1st - v3 + [ + ("None", 30.0), + ("1", 100.0), + ("AnonymousUser", 20.0), + ], + # January 1st - v4 + [ + ("AnonymousUser", 25.0), + ("1", 50.0), + ], + # January 2nd - v3 + [("None", 40.0), ("1", 200.0)], + # January 2nd - v4 + [ + ("1", 100.0), + ("AnonymousUser", 35.0), + ], + ] + + results = invert_user_logs( + start="2023-01-01", end="2023-01-02", add_usernames=False + ) + + expected = defaultdict(dict) + # Expected results: + # Anonymous user on Jan 1: + # - v3: 30 (None) + 20 (AnonymousUser) = 50 + # - v4: 25 (AnonymousUser) = 25 + # - Total for Jan 1: 75 + # Anonymous user on Jan 2: + # - v3: 40 (None) = 40 + # - v4: 35 (AnonymousUser) = 35 + # - Total for Jan 2: 75 + # Anonymous total across all dates: 150 + expected["AnonymousUser"] = OrderedDict( + { + "2023-01-01": 75, + "2023-01-02": 75, + "total": 150, + } + ) + expected[1] = OrderedDict( + { + "2023-01-01": 150, + "2023-01-02": 300, + "total": 450, + } + ) + + self.assertEqual(results, expected) + + self.assertNotIn("None", results) + + self.assertIn("AnonymousUser", results) + + # Verify ordering is maintained even with anonymous users + anonymous_data = results["AnonymousUser"] + dates = list(anonymous_data.keys()) + dates.remove("total") + self.assertEqual(dates, ["2023-01-01", "2023-01-02"]) From eb44e9663d1f54ffe9373ed9993483642b2a77cd Mon Sep 17 00:00:00 2001 From: Elisa Anguita Date: Fri, 3 Jan 2025 13:21:00 -0300 Subject: [PATCH 4/5] refactor(api): enhance maintainability by using itertools.batched instead of slicing --- cl/api/utils.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/cl/api/utils.py b/cl/api/utils.py index 5970b0bce9..4155135e07 100644 --- a/cl/api/utils.py +++ b/cl/api/utils.py @@ -1,7 +1,7 @@ import logging from collections import OrderedDict, defaultdict from datetime import date, datetime, timedelta, timezone -from itertools import chain +from itertools import batched, chain from typing import Any, Dict, List, Set, TypedDict, Union import eyecite @@ -723,9 +723,15 @@ def invert_user_logs( pipe = r.pipeline() dates = make_date_str_list(start, end) + versions = ["v3", "v4"] for d in dates: - pipe.zrange(f"api:v3.user.d:{d}.counts", 0, -1, withscores=True) - pipe.zrange(f"api:v4.user.d:{d}.counts", 0, -1, withscores=True) + for v in versions: + pipe.zrange( + f"api:{v}.user.d:{d}.counts", + 0, + -1, + withscores=True, + ) # results contains alternating v3/v4 API usage data for each date queried. # For example, if querying 2023-01-01 to 2023-01-02, results might look like: @@ -754,8 +760,8 @@ def update_user_counts(_user_id, _count, _date): out[_user_id].setdefault("total", 0) out[_user_id]["total"] += _count - for d, v3_data, v4_data in zip(dates, results[::2], results[1::2]): - for user_id, count in chain(v3_data, v4_data): + for d, api_usage in zip(dates, batched(results, len(versions))): + for user_id, count in chain(*api_usage): update_user_counts(user_id, count, d) # Sort the values From 27340541eeff1eb98f419239da357f08657fbde1 Mon Sep 17 00:00:00 2001 From: Elisa Anguita Date: Fri, 3 Jan 2025 13:34:25 -0300 Subject: [PATCH 5/5] refactor(api): rename variable to avoid type checker error --- cl/api/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cl/api/utils.py b/cl/api/utils.py index 4155135e07..84630f7b5d 100644 --- a/cl/api/utils.py +++ b/cl/api/utils.py @@ -725,9 +725,9 @@ def invert_user_logs( dates = make_date_str_list(start, end) versions = ["v3", "v4"] for d in dates: - for v in versions: + for version in versions: pipe.zrange( - f"api:{v}.user.d:{d}.counts", + f"api:{version}.user.d:{d}.counts", 0, -1, withscores=True,