Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(api): include v4 in recent API usage #4881

Merged
merged 8 commits into from
Jan 3, 2025
160 changes: 159 additions & 1 deletion cl/api/tests.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import json
from collections import OrderedDict, defaultdict
from datetime import date, datetime, timedelta, timezone
from http import HTTPStatus
from typing import Any, Dict
from unittest import mock
from unittest.mock import MagicMock, patch
from urllib.parse import parse_qs, urlparse

from asgiref.sync import async_to_sync, sync_to_async
Expand All @@ -29,7 +31,7 @@
from cl.api.factories import WebhookEventFactory, WebhookFactory
from cl.api.models import WEBHOOK_EVENT_STATUS, WebhookEvent, WebhookEventType
from cl.api.pagination import VersionBasedPagination
from cl.api.utils import LoggingMixin, get_logging_prefix
from cl.api.utils import LoggingMixin, get_logging_prefix, invert_user_logs
from cl.api.views import build_chart_data, coverage_data, make_court_variable
from cl.api.webhooks import send_webhook_event
from cl.audio.api_views import AudioViewSet
Expand Down Expand Up @@ -3297,3 +3299,159 @@ async def test_count_with_no_results(self):

self.assertEqual(response.status_code, 200)
self.assertEqual(response.data["count"], 0)


class TestApiUsage(SimpleTestCase):
"""Tests for combining v3 and v4 API usage data"""

def setUp(self):
"""
Set up test environment before each test.
We create fresh mocks to avoid state bleeding between tests.
"""
self.mock_redis = MagicMock()
self.mock_pipeline = MagicMock()
self.mock_redis.pipeline.return_value = self.mock_pipeline

@patch("cl.api.utils.get_redis_interface")
def test_single_date_combined_usage(self, mock_get_redis):
"""
Test that for a single date:
1. Both v3 and v4 usage is fetched
2. Counts are properly combined
3. The output format matches template expectations
"""
mock_get_redis.return_value = self.mock_redis

self.mock_pipeline.execute.return_value = [
# First result: v3 API data
[("1", 100.0), ("2", 50.0)],
# Second result: v4 API data
[("1", 50.0), ("2", 25.0)],
]

results = invert_user_logs(
start="2023-01-01", end="2023-01-01", add_usernames=False
)

expected = defaultdict(dict)
expected[1] = OrderedDict({"2023-01-01": 150, "total": 150})
expected[2] = OrderedDict({"2023-01-01": 75, "total": 75})

self.assertEqual(results, expected)

@patch("cl.api.utils.get_redis_interface")
def test_multiple_dates_combined_usage(self, mock_get_redis):
"""
Test that across multiple dates:
1. API usage is correctly combined from both v3 and v4
2. Totals accumulate properly across dates
3. The data structure matches template expectations
4. Date ordering is preserved
"""
mock_get_redis.return_value = self.mock_redis
self.mock_pipeline.execute.return_value = [
# January 1st - v3 API
[("1", 100.0), ("2", 50.0)],
# January 1st - v4 API
[("1", 50.0), ("2", 25.0)],
# January 2nd - v3 API
[("1", 200.0), ("2", 100.0)],
# January 2nd - v4 API
[("1", 100.0), ("2", 50.0)],
]

results = invert_user_logs(
start="2023-01-01", end="2023-01-02", add_usernames=False
)

# User 1:
# Jan 1: 150 (100 from v3 + 50 from v4)
# Jan 2: 300 (200 from v3 + 100 from v4)
# Total: 450
# User 2:
# Jan 1: 75 (50 from v3 + 25 from v4)
# Jan 2: 150 (100 from v3 + 50 from v4)
# Total: 225
expected = defaultdict(dict)
expected[1] = OrderedDict(
{"2023-01-01": 150, "2023-01-02": 300, "total": 450}
)
expected[2] = OrderedDict(
{"2023-01-01": 75, "2023-01-02": 150, "total": 225}
)

self.assertEqual(results, expected)

@patch("cl.api.utils.get_redis_interface")
def test_anonymous_user_handling(self, mock_get_redis):
"""
Test the handling of anonymous users, which have special requirements:
1. Both 'None' and 'AnonymousUser' should be treated as the same user
2. They should be combined under 'AnonymousUser' in the output
3. Their usage should be summed correctly across both identifiers
4. They should work with both API versions
"""
mock_get_redis.return_value = self.mock_redis
self.mock_pipeline.execute.return_value = [
# January 1st - v3
[
("None", 30.0),
("1", 100.0),
("AnonymousUser", 20.0),
],
# January 1st - v4
[
("AnonymousUser", 25.0),
("1", 50.0),
],
# January 2nd - v3
[("None", 40.0), ("1", 200.0)],
# January 2nd - v4
[
("1", 100.0),
("AnonymousUser", 35.0),
],
]

results = invert_user_logs(
start="2023-01-01", end="2023-01-02", add_usernames=False
)

expected = defaultdict(dict)
# Expected results:
# Anonymous user on Jan 1:
# - v3: 30 (None) + 20 (AnonymousUser) = 50
# - v4: 25 (AnonymousUser) = 25
# - Total for Jan 1: 75
# Anonymous user on Jan 2:
# - v3: 40 (None) = 40
# - v4: 35 (AnonymousUser) = 35
# - Total for Jan 2: 75
# Anonymous total across all dates: 150
expected["AnonymousUser"] = OrderedDict(
{
"2023-01-01": 75,
"2023-01-02": 75,
"total": 150,
}
)
expected[1] = OrderedDict(
{
"2023-01-01": 150,
"2023-01-02": 300,
"total": 450,
}
)

self.assertEqual(results, expected)

self.assertNotIn("None", results)

self.assertIn("AnonymousUser", results)

# Verify ordering is maintained even with anonymous users
anonymous_data = results["AnonymousUser"]
dates = list(anonymous_data.keys())
dates.remove("total")
self.assertEqual(dates, ["2023-01-01", "2023-01-02"])
77 changes: 40 additions & 37 deletions cl/api/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
from collections import OrderedDict, defaultdict
from datetime import date, datetime, timedelta, timezone
from itertools import chain
from typing import Any, Dict, List, Set, TypedDict, Union

import eyecite
Expand Down Expand Up @@ -704,56 +705,58 @@ def invert_user_logs(
end: Union[str, datetime],
add_usernames: bool = True,
) -> Dict[str, Dict[str, int]]:
"""Invert the user logs for a period of time
"""Aggregate API usage statistics per user over a date range.

The user logs have the date in the key and the user as part of the set:
- Anonymous users are aggregated under the key 'AnonymousUser'.
- Both v3 and v4 API counts are combined in the results.

'api:v3.user.d:2016-10-01.counts': {
mlissner: 22,
joe_hazard: 33,
}

This inverts these entries to:
:param start: Beginning date (inclusive) for the query range
:param end: End date (inclusive) for the query range
:param add_usernames: If True, replaces user IDs with usernames as keys.
When False, uses only user IDs as keys.

users: {
mlissner: {
2016-10-01: 22,
total: 22,
},
joe_hazard: {
2016-10-01: 33,
total: 33,
}
}
:param start: The beginning date (inclusive) you want the results for. A
:param end: The end date (inclusive) you want the results for.
:param add_usernames: Stats are stored with the user ID. If this is True,
add an alias in the returned dictionary that contains the username as well.
:return The inverted dictionary
:return: Dictionary mapping user identifiers (usernames if `add_usernames=True`,
otherwise user IDs) to their daily API usage counts and totals.
Inner dictionaries are ordered by date. Only dates with usage are included.
"""
r = get_redis_interface("STATS")
pipe = r.pipeline()

dates = make_date_str_list(start, end)
for d in dates:
pipe.zrange(f"api:v3.user.d:{d}.counts", 0, -1, withscores=True)
pipe.zrange(f"api:v4.user.d:{d}.counts", 0, -1, withscores=True)

# results contains alternating v3/v4 API usage data for each date queried.
# For example, if querying 2023-01-01 to 2023-01-02, results might look like:
# [
# # 2023-01-01 v3 data: [(user_id, count), ...]
# [("1", 100.0), ("2", 50.0)],
# # 2023-01-01 v4 data
# [("1", 50.0), ("2", 25.0)],
# # 2023-01-02 v3 data
# [("1", 200.0), ("2", 100.0)],
# # 2023-01-02 v4 data
# [("1", 100.0), ("2", 50.0)]
# ]
# We zip this with dates to combine v3/v4 counts per user per day
results = pipe.execute()

# results is a list of results for each of the zrange queries above. Zip
# those results with the date that created it, and invert the whole thing.
out: defaultdict = defaultdict(dict)
for d, result in zip(dates, results):
for user_id, count in result:
if user_id == "None" or user_id == "AnonymousUser":
user_id = "AnonymousUser"
else:
user_id = int(user_id)
count = int(count)
if out.get(user_id):
out[user_id][d] = count
out[user_id]["total"] += count
else:
out[user_id] = {d: count, "total": count}

def update_user_counts(_user_id, _count, _date):
user_is_anonymous = _user_id == "None" or _user_id == "AnonymousUser"
_user_id = "AnonymousUser" if user_is_anonymous else int(_user_id)
_count = int(_count)
out.setdefault(_user_id, OrderedDict())
out[_user_id].setdefault(_date, 0)
out[_user_id][_date] += _count
out[_user_id].setdefault("total", 0)
out[_user_id]["total"] += _count

for d, v3_data, v4_data in zip(dates, results[::2], results[1::2]):
for user_id, count in chain(v3_data, v4_data):
update_user_counts(user_id, count, d)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can use the itertools.batched function in the outer loop to process the data in chunks. This function iterates over the list, yielding tuples of items with a maximum size of n.

Here's the documentation for reference: itertools.batched

Suggested change
for d, v3_data, v4_data in zip(dates, results[::2], results[1::2]):
for user_id, count in chain(v3_data, v4_data):
update_user_counts(user_id, count, d)
for d, api_usage in zip(dates, batched(results, 2)):
for user_id, count in chain(*api_usage):
update_user_counts(user_id, count, d)

This approach enhances future maintainability. If a new API version (e.g., v5) is introduced, we simply need to adjust the batch size instead of modifying the slicing expressions for previous versions (v3, v4) and adding a new slicing expression for v5.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a versions array and used len(versions) instead of a hardcoded 2 so now we'd simply have to add "v5" to the versions array. What do you think? it does add a new nested for loop but it's a fixed and small number of inner iterations (just a couple of versions) so I'm guessing it's not too bad and it's probably worth the tradeoff.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@elisa-a-v looks great!


# Sort the values
for k, v in out.items():
Expand Down
Loading