From f3eeb988165a2ef636e49440d1aaa71757592e0a Mon Sep 17 00:00:00 2001 From: rgraber Date: Wed, 28 Aug 2024 15:57:37 -0400 Subject: [PATCH] feat: delete old access logs --- dependencies/pip/dev_requirements.txt | 2 + dependencies/pip/requirements.in | 1 + dependencies/pip/requirements.txt | 2 + kobo/apps/audit_log/tasks.py | 88 +++++++++++++ kobo/apps/audit_log/tests/test_tasks.py | 163 ++++++++++++++++++++++++ kobo/settings/base.py | 14 +- 6 files changed, 269 insertions(+), 1 deletion(-) create mode 100644 kobo/apps/audit_log/tasks.py create mode 100644 kobo/apps/audit_log/tests/test_tasks.py diff --git a/dependencies/pip/dev_requirements.txt b/dependencies/pip/dev_requirements.txt index d0b6ee9169..b23b517cd4 100644 --- a/dependencies/pip/dev_requirements.txt +++ b/dependencies/pip/dev_requirements.txt @@ -390,6 +390,8 @@ modilabs-python-utils==0.1.5 # via -r dependencies/pip/requirements.in mongomock==4.1.2 # via -r dependencies/pip/dev_requirements.in +more-itertools==10.4.0 + # via -r dependencies/pip/requirements.in multidict==6.0.5 # via # aiohttp diff --git a/dependencies/pip/requirements.in b/dependencies/pip/requirements.in index 86365bcaea..e5769de2e2 100644 --- a/dependencies/pip/requirements.in +++ b/dependencies/pip/requirements.in @@ -67,6 +67,7 @@ jsonfield jsonschema kombu lxml +more-itertools oauthlib openpyxl #py-gfm # Incompatible with markdown 3.x diff --git a/dependencies/pip/requirements.txt b/dependencies/pip/requirements.txt index d81ea2e13e..9df8d9e292 100644 --- a/dependencies/pip/requirements.txt +++ b/dependencies/pip/requirements.txt @@ -316,6 +316,8 @@ markupsafe==2.1.5 # via werkzeug modilabs-python-utils==0.1.5 # via -r dependencies/pip/requirements.in +more-itertools==10.4.0 + # via -r dependencies/pip/requirements.in multidict==6.0.5 # via # aiohttp diff --git a/kobo/apps/audit_log/tasks.py b/kobo/apps/audit_log/tasks.py new file mode 100644 index 0000000000..3a02e92e28 --- /dev/null +++ b/kobo/apps/audit_log/tasks.py @@ -0,0 
+1,88 @@ +from datetime import timedelta + +from constance import config +from django.conf import settings +from django.db.models import Count, Q +from django.utils import timezone +from more_itertools import chunked + +from kobo.apps.audit_log.models import ( + AccessLog, + AuditLog, + SubmissionAccessLog, + SubmissionGroup, +) +from kobo.celery import celery_app +from kpi.constants import ACCESS_LOG_SUBMISSION_GROUP_AUTH_TYPE +from kpi.utils.log import logging + + +def remove_expired_submissions_from_groups(expiration_date): + expired_submissions = SubmissionAccessLog.objects.filter( + date_created__lt=expiration_date + ) + for submission in expired_submissions: + submission.submission_group = None + SubmissionAccessLog.objects.bulk_update( + expired_submissions, fields=['submission_group'], batch_size=100 + ) + + +def get_empty_submission_group_ids(): + # (groups will still contain themselves, so check for groups with only one "submission") + return ( + SubmissionGroup.objects.annotate(Count('submissions')) + .filter(submissions__count=1) + .values_list('id', flat=True) + .iterator() + ) + + +@celery_app.task() +def spawn_access_log_cleaning_tasks(): + """ + Enqueue tasks to delete access logs older than ACCESS_LOG_LIFESPAN days old, in batches + + ACCESS_LOG_LIFESPAN is configured via constance + """ + + expiration_date = timezone.now() - timedelta( + days=config.ACCESS_LOG_LIFESPAN + ) + # step 1: remove expired submissions from groups + remove_expired_submissions_from_groups(expiration_date) + + # step 2: remove empty submission groups + empty_submission_groups = get_empty_submission_group_ids() + for group_batch in chunked( + empty_submission_groups, settings.ACCESS_LOG_DELETION_BATCH_SIZE + ): + batch_delete_audit_logs_by_id.delay(ids=group_batch) + + # step 3: delete everything else that is > ACCESS_LOG_LIFESPAN days ago + expired_logs = ( + # .exclude works funny with jsonfields, so use Q instead to exclude submission groups + # (need to explicitly 
include cases where the auth_type is null. It shouldn't happen + # outside of tests, but we should handle it anyway) + AccessLog.objects.filter( + Q(date_created__lt=expiration_date) + & ( + ~Q(metadata__auth_type=ACCESS_LOG_SUBMISSION_GROUP_AUTH_TYPE) + | Q(metadata__auth_type__isnull=True) + ) + ) + .values_list('id', flat=True) + .iterator() + ) + for id_batch in chunked( + expired_logs, settings.ACCESS_LOG_DELETION_BATCH_SIZE + ): + # queue up a new task for each batch of expired ids + batch_delete_audit_logs_by_id.delay(ids=id_batch) + + +@celery_app.task() +def batch_delete_audit_logs_by_id(ids): + logs = AuditLog.objects.filter(id__in=ids) + count, _ = logs.delete() + logging.info(f'Deleted {count} audit logs from database') diff --git a/kobo/apps/audit_log/tests/test_tasks.py b/kobo/apps/audit_log/tests/test_tasks.py new file mode 100644 index 0000000000..cd1aa428c2 --- /dev/null +++ b/kobo/apps/audit_log/tests/test_tasks.py @@ -0,0 +1,163 @@ +from datetime import timedelta +from unittest.mock import Mock, call, patch + +from constance.test import override_config +from django.test import override_settings +from django.utils import timezone + +from kobo.apps.audit_log.models import ( + AccessLog, + SubmissionAccessLog, + SubmissionGroup, +) +from kobo.apps.audit_log.tasks import ( + batch_delete_audit_logs_by_id, + get_empty_submission_group_ids, + remove_expired_submissions_from_groups, + spawn_access_log_cleaning_tasks, +) +from kobo.apps.audit_log.tests.test_utils import skip_all_signals +from kobo.apps.kobo_auth.shortcuts import User +from kpi.tests.base_test_case import BaseTestCase + + +@override_config(ACCESS_LOG_LIFESPAN=1) +class AuditLogTasksTestCase(BaseTestCase): + + fixtures = ['test_data'] + + def get_ids_queued_for_deletion(self, patched_task: Mock): + """ + Given a patched version of batch_delete_audit_logs_by_id, get all the ids it was called with + """ + # get the list of ids passed for each call + id_lists = [kwargs['ids'] for _, _, kwargs 
in patched_task.mock_calls] + # flatten the list + all_ids = [log_id for id_list in id_lists for log_id in id_list] + return all_ids + + @patch('kobo.apps.audit_log.tasks.batch_delete_audit_logs_by_id.delay') + def test_spawn_deletion_task_deletes_expired_groups(self, patched_delete): + # basic plumbing test to make sure we are getting and deleting expired groups + with patch( + 'kobo.apps.audit_log.tasks.get_empty_submission_group_ids', + return_value=[1, 2], + ): + spawn_access_log_cleaning_tasks() + ids = self.get_ids_queued_for_deletion(patched_delete) + self.assertEquals(ids, [1, 2]) + + def test_spawn_deletion_task_identifies_expired_logs(self): + user = User.objects.get(username='someuser') + three_days_ago = timezone.now() - timedelta(days=3) + old_log = AccessLog.objects.create( + user=user, + metadata={'auth_type': 'token'}, + date_created=three_days_ago, + ) + new_log = AccessLog.objects.create( + user=user, metadata={'auth_type': 'token'} + ) + + with patch( + 'kobo.apps.audit_log.tasks.batch_delete_audit_logs_by_id.delay' + ) as patched_spawned_task: + spawn_access_log_cleaning_tasks() + deleted_ids = self.get_ids_queued_for_deletion(patched_spawned_task) + self.assertIn(old_log.id, deleted_ids) + self.assertNotIn(new_log.id, deleted_ids) + + def test_remove_expired_submissions_from_groups(self): + user = User.objects.get(username='someuser') + expiration_date = timezone.now() - timedelta(days=1) + three_days_ago = timezone.now() - timedelta(days=3) + with skip_all_signals(): + # skip signals so we can manually assign submissions to groups + group = SubmissionGroup.objects.create(user=user) + group.submission_group = group + group.save() + old_log = SubmissionAccessLog.objects.create( + user=user, date_created=three_days_ago, submission_group=group + ) + new_log = SubmissionAccessLog.objects.create( + user=user, submission_group=group + ) + remove_expired_submissions_from_groups(expiration_date) + self.assertEqual(group.submissions.count(), 2) + 
self.assertIn(new_log, group.submissions.all()) + self.assertNotIn(old_log, group.submissions.all()) + + def test_get_empty_submission_group_ids(self): + user = User.objects.get(username='someuser') + + with skip_all_signals(): + # skip post_save signals so we can manually assign submissions to groups + group_to_delete = SubmissionGroup.objects.create(user=user) + group_to_delete.submission_group = group_to_delete + group_to_delete.save() + + group_to_keep = SubmissionGroup.objects.create(user=user) + group_to_keep.submission_group = group_to_keep + group_to_keep.save() + + SubmissionAccessLog.objects.create( + user=user, + submission_group=group_to_keep, + ) + + empty = get_empty_submission_group_ids() + self.assertEqual(list(empty), [group_to_delete.id]) + + @override_settings(ACCESS_LOG_DELETION_BATCH_SIZE=2) + @patch('kobo.apps.audit_log.tasks.batch_delete_audit_logs_by_id.delay') + def test_spawn_task_batches_ids(self, patched_task): + three_days_ago = timezone.now() - timedelta(days=3) + user = User.objects.get(username='someuser') + old_log_1 = AccessLog.objects.create( + user=user, date_created=three_days_ago + ) + old_log_2 = AccessLog.objects.create( + user=user, date_created=three_days_ago + ) + old_log_3 = AccessLog.objects.create( + user=user, date_created=three_days_ago + ) + group1 = SubmissionGroup.objects.create(user=user) + group2 = SubmissionGroup.objects.create(user=user) + group3 = SubmissionGroup.objects.create(user=user) + # force deletion of the 3 groups + with patch( + 'kobo.apps.audit_log.tasks.get_empty_submission_group_ids', + return_value=[group1.id, group2.id, group3.id], + ): + spawn_access_log_cleaning_tasks() + + # groups are batched separately from other logs, so it will be 2 batches for the groups + # and 2 for the other logs + self.assertEquals(patched_task.call_count, 4) + # make sure all batches were <= ACCESS_LOG_DELETION_BATCH_SIZE + for task_call in patched_task.mock_calls: + _, _, kwargs = task_call + id_list = 
kwargs['ids'] + self.assertLessEqual(len(id_list), 2) + + # make sure we queued everything for deletion + all_deleted_ids = self.get_ids_queued_for_deletion(patched_task) + self.assertIn(old_log_1.id, all_deleted_ids) + self.assertIn(old_log_2.id, all_deleted_ids) + self.assertIn(old_log_3.id, all_deleted_ids) + self.assertIn(group1.id, all_deleted_ids) + self.assertIn(group2.id, all_deleted_ids) + self.assertIn(group3.id, all_deleted_ids) + + def test_batch_delete_audit_logs_by_id(self): + user = User.objects.get(username='someuser') + log_1 = AccessLog.objects.create(user=user) + log_2 = AccessLog.objects.create(user=user) + log_3 = AccessLog.objects.create(user=user) + self.assertEquals(AccessLog.objects.count(), 3) + + batch_delete_audit_logs_by_id(ids=[log_1.id, log_2.id]) + # only log_3 should remain + self.assertEquals(AccessLog.objects.count(), 1) + self.assertEquals(AccessLog.objects.first().id, log_3.id) diff --git a/kobo/settings/base.py b/kobo/settings/base.py index e207737429..9b209866f7 100644 --- a/kobo/settings/base.py +++ b/kobo/settings/base.py @@ -586,6 +586,11 @@ ), 'Email message to sent to admins on failure.', ), + 'ACCESS_LOG_LIFESPAN': ( + 60, + 'Length of time in days to keep access logs.', + 'positive_int' + ) } CONSTANCE_ADDITIONAL_FIELDS = { @@ -649,6 +654,7 @@ 'EXPOSE_GIT_REV', 'FRONTEND_MIN_RETRY_TIME', 'FRONTEND_MAX_RETRY_TIME', + 'ACCESS_LOG_LIFESPAN', ), 'Rest Services': ( 'ALLOW_UNSECURED_HOOK_ENDPOINTS', @@ -660,7 +666,6 @@ 'ASR_MT_GOOGLE_STORAGE_BUCKET_PREFIX', 'ASR_MT_GOOGLE_TRANSLATION_LOCATION', 'ASR_MT_GOOGLE_CREDENTIALS', - 'ASR_MT_GOOGLE_REQUEST_TIMEOUT', ), 'Security': ( 'SSRF_ALLOWED_IP_ADDRESS', @@ -1198,6 +1203,11 @@ def dj_stripe_request_callback_method(): 'schedule': crontab(minute=0, hour=0), 'options': {'queue': 'kpi_low_priority_queue'} }, + 'delete-expired-access-logs': { + 'task': 'kobo.apps.audit_log.tasks.spawn_access_log_cleaning_tasks', + 'schedule': crontab(minute=0, hour=0), + 'options': {'queue': 
'kpi_low_priority_queue'} + } } @@ -1748,6 +1758,8 @@ def dj_stripe_request_callback_method(): 'application/x-zip-compressed' ] +ACCESS_LOG_DELETION_BATCH_SIZE = 1000 + # Silence Django Guardian warning. Authentication backend is hooked, but # Django Guardian does not recognize it because it is extended SILENCED_SYSTEM_CHECKS = ['guardian.W001']