From 08666c950f0a2760773295c9f5f3f49b5f7f1c06 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Tue, 6 Feb 2024 16:36:48 -0500 Subject: [PATCH 01/15] start on changes I think andrew wanted Signed-off-by: Isaac Milarsky --- augur/tasks/init/celery_app.py | 13 +++++++++++-- augur/tasks/util/collection_util.py | 1 + 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/augur/tasks/init/celery_app.py b/augur/tasks/init/celery_app.py index ee6eaeccdf..20c9dc6e4b 100644 --- a/augur/tasks/init/celery_app.py +++ b/augur/tasks/init/celery_app.py @@ -85,7 +85,7 @@ class CollectionState(Enum): #Classes for tasks that take a repo_git as an argument. class AugurCoreRepoCollectionTask(celery.Task): - def augur_handle_task_failure(self,exc,task_id,repo_git,logger_name,collection_hook='core'): + def augur_handle_task_failure(self,exc,task_id,repo_git,logger_name,collection_hook='core',after_fail=CollectionState.ERROR.value): from augur.tasks.init.celery_app import engine logger = AugurLogger(logger_name).get_logger() @@ -104,7 +104,7 @@ def augur_handle_task_failure(self,exc,task_id,repo_git,logger_name,collection_h prevStatus = getattr(repoStatus, f"{collection_hook}_status") if prevStatus == CollectionState.COLLECTING.value or prevStatus == CollectionState.INITIALIZING.value: - setattr(repoStatus, f"{collection_hook}_status", CollectionState.ERROR.value) + setattr(repoStatus, f"{collection_hook}_status", after_fail) setattr(repoStatus, f"{collection_hook}_task_id", None) session.commit() @@ -129,6 +129,15 @@ def on_failure(self,exc,task_id,args,kwargs,einfo): repo_git = args[0] self.augur_handle_task_failure(exc,task_id,repo_git, "ml_task_failure", collection_hook='ml') +#Create task subclasses that set their status to standby instead of error so that they can be retried. +class AugurCoreRepoCollectionStandbyTask(AugurCoreRepoCollectionTask): + def on_failure(self,exc,task_id,args,kwargs,einfo): + repo_git = args[0] + self.augur_handle_task_failure(exc, task_id, + repo_git, "core_task_failure",after_fail=CollectionState.STANDBY.value) + +#TODO: Make certain tasks such as detect_github_repo_move able to softly fail and be retried later. + #task_cls='augur.tasks.init.celery_app:AugurCoreRepoCollectionTask' celery_app = Celery('tasks', broker=BROKER_URL, backend=BACKEND_URL, include=tasks) diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index 47705785e9..bf5d2d8348 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -35,6 +35,7 @@ class CollectionState(Enum): INITIALIZING = "Initializing" UPDATE = "Update" FAILED_CLONE = "Failed Clone" + STANDBY = "Standby" def get_list_of_all_users(session): #Get a list of all users. From a8bf6f77831644f34b15b8ab6d7e51df303eb156 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Tue, 6 Feb 2024 16:39:10 -0500 Subject: [PATCH 02/15] subclasses not needed Signed-off-by: Isaac Milarsky --- augur/tasks/init/celery_app.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/augur/tasks/init/celery_app.py b/augur/tasks/init/celery_app.py index 20c9dc6e4b..0c3696776c 100644 --- a/augur/tasks/init/celery_app.py +++ b/augur/tasks/init/celery_app.py @@ -129,14 +129,6 @@ def on_failure(self,exc,task_id,args,kwargs,einfo): repo_git = args[0] self.augur_handle_task_failure(exc,task_id,repo_git, "ml_task_failure", collection_hook='ml') -#Create task subclasses that set their status to standby instead of error so that they can be retried. -class AugurCoreRepoCollectionStandbyTask(AugurCoreRepoCollectionTask): - def on_failure(self,exc,task_id,args,kwargs,einfo): - repo_git = args[0] - self.augur_handle_task_failure(exc, task_id, - repo_git, "core_task_failure",after_fail=CollectionState.STANDBY.value) - -#TODO: Make certain tasks such as detect_github_repo_move able to softly fail and be retried later. #task_cls='augur.tasks.init.celery_app:AugurCoreRepoCollectionTask' celery_app = Celery('tasks', broker=BROKER_URL, backend=BACKEND_URL, include=tasks) From 1934a2d86563dea22ff3950b953d8219b53fbe7b Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Tue, 6 Feb 2024 16:40:25 -0500 Subject: [PATCH 03/15] implement change Signed-off-by: Isaac Milarsky --- augur/tasks/github/detect_move/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/augur/tasks/github/detect_move/core.py b/augur/tasks/github/detect_move/core.py index 304574bc80..eac68166d3 100644 --- a/augur/tasks/github/detect_move/core.py +++ b/augur/tasks/github/detect_move/core.py @@ -104,11 +104,11 @@ def ping_github_for_repo_move(augur_db, key_auth, repo, logger,collection_hook=' collectionRecord = execute_session_query(statusQuery,'one') if collection_hook == 'core': - collectionRecord.core_status = CollectionState.PENDING.value + collectionRecord.core_status = CollectionState.STANDBY.value collectionRecord.core_task_id = None collectionRecord.core_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') elif collection_hook == 'secondary': - collectionRecord.secondary_status = CollectionState.PENDING.value + collectionRecord.secondary_status = CollectionState.STANDBY.value collectionRecord.secondary_task_id = None collectionRecord.secondary_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') From 01c5c7396b67b20286b0b57d3d65cb7a0478abd5 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Thu, 8 Feb 2024 09:38:27 -0500 Subject: [PATCH 04/15] fix enum import issues and add mechanism to retry repos marked as Standby Signed-off-by: Isaac Milarsky --- augur/application/cli/backend.py | 3 ++- augur/tasks/git/facade_tasks.py | 3 ++- augur/tasks/github/detect_move/core.py | 9 ++------- augur/tasks/init/celery_app.py | 16 +++++----------- augur/tasks/start_tasks.py | 17 +++++++++++++++++ augur/tasks/util/collection_state.py | 11 +++++++++++ augur/tasks/util/collection_util.py | 12 +----------- 7 files changed, 40 insertions(+), 31 deletions(-) create mode 100644 augur/tasks/util/collection_state.py diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index fc466f021c..9b6894a7dd 100644 --- a/augur/application/cli/backend.py +++ b/augur/application/cli/backend.py @@ -19,7 +19,8 @@ from datetime import datetime from augur import instance_id -from augur.tasks.start_tasks import augur_collection_monitor, CollectionState, create_collection_status_records +from augur.tasks.util.collection_state import CollectionState +from augur.tasks.start_tasks import augur_collection_monitor, create_collection_status_records from augur.tasks.git.facade_tasks import clone_repos from augur.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker import contributor_breadth_model from augur.tasks.init.redis_connection import redis_connection diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index c763a2a2c1..ee3dc047ff 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -31,7 +31,8 @@ from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_facade_weight_time_factor, get_repo_commit_count, update_facade_scheduling_fields, get_facade_weight_with_commit_count, facade_bulk_insert_commits from augur.tasks.github.facade_github.tasks import * -from augur.tasks.util.collection_util import CollectionState, get_collection_status_repo_git_from_filter +from augur.tasks.util.collection_state import CollectionState +from augur.tasks.util.collection_util import get_collection_status_repo_git_from_filter from augur.tasks.git.util.facade_worker.facade_worker.repofetch import GitCloneError, git_repo_initialize diff --git a/augur/tasks/github/detect_move/core.py b/augur/tasks/github/detect_move/core.py index eac68166d3..6fb7c76378 100644 --- a/augur/tasks/github/detect_move/core.py +++ b/augur/tasks/github/detect_move/core.py @@ -6,14 +6,9 @@ from augur.tasks.github.util.util import parse_json_response import logging from datetime import datetime -from enum import Enum +from augur.tasks.utl.collection_state import CollectionState from augur.application.db.util import execute_session_query -class CollectionState(Enum): - SUCCESS = "Success" - PENDING = "Pending" - ERROR = "Error" - COLLECTING = "Collecting" def update_repo_with_dict(current_dict,new_dict,logger,db): @@ -114,6 +109,6 @@ def ping_github_for_repo_move(augur_db, key_auth, repo, logger,collection_hook=' augur_db.session.commit() - raise Exception("ERROR: Repo has moved! Marked repo as pending and stopped collection") + raise Exception("ERROR: Repo has moved! Marked repo as standby and stopped collection") diff --git a/augur/tasks/init/celery_app.py b/augur/tasks/init/celery_app.py index 0c3696776c..1c3c9e2088 100644 --- a/augur/tasks/init/celery_app.py +++ b/augur/tasks/init/celery_app.py @@ -20,16 +20,7 @@ from augur.application.db.engine import get_database_string from augur.tasks.init import get_redis_conn_values, get_rabbitmq_conn_string from augur.application.db.models import CollectionStatus, Repo - -class CollectionState(Enum): - SUCCESS = "Success" - PENDING = "Pending" - ERROR = "Error" - COLLECTING = "Collecting" - INITIALIZING = "Initializing" - UPDATE = "Update" - FAILED_CLONE = "Failed Clone" - +from augur.tasks.util.collection_state import CollectionState logger = logging.getLogger(__name__) @@ -210,7 +201,7 @@ def setup_periodic_tasks(sender, **kwargs): """ from celery.schedules import crontab from augur.tasks.start_tasks import augur_collection_monitor, augur_collection_update_weights - from augur.tasks.start_tasks import non_repo_domain_tasks + from augur.tasks.start_tasks import non_repo_domain_tasks, retry_404_repos from augur.tasks.git.facade_tasks import clone_repos from augur.tasks.db.refresh_materialized_views import refresh_materialized_views from augur.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker import contributor_breadth_model @@ -235,6 +226,9 @@ def setup_periodic_tasks(sender, **kwargs): logger.info(f"Scheduling update of collection weights on midnight each day") sender.add_periodic_task(crontab(hour=0, minute=0),augur_collection_update_weights.s()) + logger.info(f"Setting 404 repos to be marked for retry on midnight each day") + sender.add_periodic_task(crontab(hour=0, minute=0),retry_404_repos.s()) + logger.info(f"Scheduling contributor breadth every 30 days") thirty_days_in_seconds = 30*24*60*60 sender.add_periodic_task(thirty_days_in_seconds, contributor_breadth_model.s()) diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 10f04e40b7..bb158104c3 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -36,6 +36,7 @@ from enum import Enum from augur.tasks.util.redis_list import RedisList from augur.application.db.models import CollectionStatus, Repo +from augur.tasks.util.collection_state import CollectionState from augur.tasks.util.collection_util import * from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_facade_weight_time_factor @@ -328,6 +329,22 @@ def augur_collection_update_weights(): session.commit() #git_update_commit_count_weight(repo_git) +@celery.task +def retry_404_repos(): + from augur.tasks.init.celery_app import engine + logger = logging.getLogger(create_collection_status_records.__name__) + + with DatabaseSession(logger,engine) as session: + query = s.sql.text(f"""UPDATE repo SET secondary_staus = {CollectionState.STANDBY.value}""" + f""" WHERE secondary_status = '{CollectionState.PENDING.value}' ;""" + f"""UPDATE repo SET core_status = {CollectionState.STANDBY.value}""" + f""" WHERE core_status = '{CollectionState.PENDING.value}' ;""" + ) + + session.execute_sql(query) + + + #Retry this task for every issue so that repos that were added manually get the chance to be added to the collection_status table. @celery.task(autoretry_for=(Exception,), retry_backoff=True, retry_backoff_max=300, retry_jitter=True, max_retries=None) def create_collection_status_records(): diff --git a/augur/tasks/util/collection_state.py b/augur/tasks/util/collection_state.py new file mode 100644 index 0000000000..37a3b1ed8f --- /dev/null +++ b/augur/tasks/util/collection_state.py @@ -0,0 +1,11 @@ +from enum import Enum + +class CollectionState(Enum): + SUCCESS = "Success" + PENDING = "Pending" + ERROR = "Error" + COLLECTING = "Collecting" + INITIALIZING = "Initializing" + UPDATE = "Update" + FAILED_CLONE = "Failed Clone" + STANDBY = "Standby" diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index bf5d2d8348..89ae5f3d53 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -24,19 +24,9 @@ from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.application.db.session import DatabaseSession from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps +from augur.tasks.util.collection_state import CollectionState -# class syntax -class CollectionState(Enum): - SUCCESS = "Success" - PENDING = "Pending" - ERROR = "Error" - COLLECTING = "Collecting" - INITIALIZING = "Initializing" - UPDATE = "Update" - FAILED_CLONE = "Failed Clone" - STANDBY = "Standby" - def get_list_of_all_users(session): #Get a list of all users. query = s.sql.text(""" From 801e5e0286670b831440ee6c0da43b57b2382a7c Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Thu, 8 Feb 2024 10:00:36 -0500 Subject: [PATCH 05/15] add docstring to update_repo_with_dict Signed-off-by: Isaac Milarsky --- augur/tasks/github/detect_move/core.py | 19 ++++++++++++++----- augur/tasks/util/collection_state.py | 1 + 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/augur/tasks/github/detect_move/core.py b/augur/tasks/github/detect_move/core.py index 6fb7c76378..c7af7e0ee8 100644 --- a/augur/tasks/github/detect_move/core.py +++ b/augur/tasks/github/detect_move/core.py @@ -11,10 +11,19 @@ -def update_repo_with_dict(current_dict,new_dict,logger,db): +def update_repo_with_dict(repo,new_dict,logger,db): +""" + Update a repository record in the database using a dictionary tagged with + the appropriate table fields + + Args: + repo: orm repo object to update + new_dict: dict of new values to add to the repo record + logger: logging object + db: db object +""" - - to_insert = current_dict + to_insert = repo.__dict__ del to_insert['_sa_instance_state'] to_insert.update(new_dict) @@ -63,7 +72,7 @@ def ping_github_for_repo_move(augur_db, key_auth, repo, logger,collection_hook=' 'data_collection_date': datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') } - update_repo_with_dict(current_repo_dict, repo_update_dict, logger, augur_db) + update_repo_with_dict(repo, repo_update_dict, logger, augur_db) raise Exception(f"ERROR: Repo not found at requested host {repo.repo_git}") elif attempts >= 10: @@ -93,7 +102,7 @@ def ping_github_for_repo_move(augur_db, key_auth, repo, logger,collection_hook=' 'description': f"(Originally hosted at {url}) {old_description}" } - update_repo_with_dict(current_repo_dict, repo_update_dict, logger,augur_db) + update_repo_with_dict(repo, repo_update_dict, logger,augur_db) statusQuery = augur_db.session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo.repo_id) diff --git a/augur/tasks/util/collection_state.py b/augur/tasks/util/collection_state.py index 37a3b1ed8f..b2bbf4fc55 100644 --- a/augur/tasks/util/collection_state.py +++ b/augur/tasks/util/collection_state.py @@ -1,3 +1,4 @@ + from enum import Enum class CollectionState(Enum): From 9a76471da461f13a4cc7e532476b71eabf7cbb00 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Thu, 8 Feb 2024 10:54:25 -0500 Subject: [PATCH 06/15] fix misunderstanding in implementation Signed-off-by: Isaac Milarsky --- augur/tasks/github/detect_move/core.py | 7 +++---- augur/tasks/init/celery_app.py | 4 ++-- augur/tasks/start_tasks.py | 10 +++++----- augur/tasks/util/collection_state.py | 1 + 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/augur/tasks/github/detect_move/core.py b/augur/tasks/github/detect_move/core.py index c7af7e0ee8..2dc3863f8a 100644 --- a/augur/tasks/github/detect_move/core.py +++ b/augur/tasks/github/detect_move/core.py @@ -49,7 +49,6 @@ def ping_github_for_repo_move(augur_db, key_auth, repo, logger,collection_hook=' owner, name = get_owner_repo(repo.repo_git) url = f"https://api.github.com/repos/{owner}/{name}" - current_repo_dict = repo.__dict__ attempts = 0 while attempts < 10: @@ -108,16 +107,16 @@ def ping_github_for_repo_move(augur_db, key_auth, repo, logger,collection_hook=' collectionRecord = execute_session_query(statusQuery,'one') if collection_hook == 'core': - collectionRecord.core_status = CollectionState.STANDBY.value + collectionRecord.core_status = CollectionState.IGNORE.value collectionRecord.core_task_id = None collectionRecord.core_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') elif collection_hook == 'secondary': - collectionRecord.secondary_status = CollectionState.STANDBY.value + collectionRecord.secondary_status = CollectionState.IGNORE.value collectionRecord.secondary_task_id = None collectionRecord.secondary_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') augur_db.session.commit() - raise Exception("ERROR: Repo has moved! Marked repo as standby and stopped collection") + raise Exception("ERROR: Repo has moved! Marked repo as IGNORE and stopped collection!") diff --git a/augur/tasks/init/celery_app.py b/augur/tasks/init/celery_app.py index 1c3c9e2088..274305449a 100644 --- a/augur/tasks/init/celery_app.py +++ b/augur/tasks/init/celery_app.py @@ -201,7 +201,7 @@ def setup_periodic_tasks(sender, **kwargs): """ from celery.schedules import crontab from augur.tasks.start_tasks import augur_collection_monitor, augur_collection_update_weights - from augur.tasks.start_tasks import non_repo_domain_tasks, retry_404_repos + from augur.tasks.start_tasks import non_repo_domain_tasks, retry_errored_repos from augur.tasks.git.facade_tasks import clone_repos from augur.tasks.db.refresh_materialized_views import refresh_materialized_views from augur.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker import contributor_breadth_model @@ -227,7 +227,7 @@ def setup_periodic_tasks(sender, **kwargs): sender.add_periodic_task(crontab(hour=0, minute=0),augur_collection_update_weights.s()) logger.info(f"Setting 404 repos to be marked for retry on midnight each day") - sender.add_periodic_task(crontab(hour=0, minute=0),retry_404_repos.s()) + sender.add_periodic_task(crontab(hour=0, minute=0),retry_errored_repos.s()) logger.info(f"Scheduling contributor breadth every 30 days") thirty_days_in_seconds = 30*24*60*60 diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index bb158104c3..dd08a926a2 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -330,15 +330,15 @@ def augur_collection_update_weights(): #git_update_commit_count_weight(repo_git) @celery.task -def retry_404_repos(): +def retry_errored_repos(): from augur.tasks.init.celery_app import engine logger = logging.getLogger(create_collection_status_records.__name__) with DatabaseSession(logger,engine) as session: - query = s.sql.text(f"""UPDATE repo SET secondary_staus = {CollectionState.STANDBY.value}""" - f""" WHERE secondary_status = '{CollectionState.PENDING.value}' ;""" - f"""UPDATE repo SET core_status = {CollectionState.STANDBY.value}""" - f""" WHERE core_status = '{CollectionState.PENDING.value}' ;""" + query = s.sql.text(f"""UPDATE repo SET secondary_staus = {CollectionState.PENDING.value}""" + f""" WHERE secondary_status = '{CollectionState.ERROR.value}' ;""" + f"""UPDATE repo SET core_status = {CollectionState.PENDING.value}""" + f""" WHERE core_status = '{CollectionState.ERROR.value}' ;""" ) session.execute_sql(query) diff --git a/augur/tasks/util/collection_state.py b/augur/tasks/util/collection_state.py index b2bbf4fc55..27acc98de1 100644 --- a/augur/tasks/util/collection_state.py +++ b/augur/tasks/util/collection_state.py @@ -10,3 +10,4 @@ class CollectionState(Enum): UPDATE = "Update" FAILED_CLONE = "Failed Clone" STANDBY = "Standby" + IGNORE = "Ignore" From 5a77d652af9561b84028fa036ad6a6963b39b95c Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Thu, 8 Feb 2024 10:56:13 -0500 Subject: [PATCH 07/15] fix format Signed-off-by: Isaac Milarsky --- augur/tasks/github/detect_move/core.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/augur/tasks/github/detect_move/core.py b/augur/tasks/github/detect_move/core.py index 2dc3863f8a..762b20779a 100644 --- a/augur/tasks/github/detect_move/core.py +++ b/augur/tasks/github/detect_move/core.py @@ -12,16 +12,16 @@ def update_repo_with_dict(repo,new_dict,logger,db): -""" - Update a repository record in the database using a dictionary tagged with - the appropriate table fields - - Args: - repo: orm repo object to update - new_dict: dict of new values to add to the repo record - logger: logging object - db: db object -""" + """ + Update a repository record in the database using a dictionary tagged with + the appropriate table fields + + Args: + repo: orm repo object to update + new_dict: dict of new values to add to the repo record + logger: logging object + db: db object + """ to_insert = repo.__dict__ del to_insert['_sa_instance_state'] From 23f17e940fee29bb7c7f53f2fe71e077f76f414f Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Fri, 9 Feb 2024 09:32:03 -0500 Subject: [PATCH 08/15] retry all status' Signed-off-by: Isaac Milarsky --- augur/tasks/start_tasks.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index dd08a926a2..5b3e259d9c 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -331,14 +331,23 @@ def augur_collection_update_weights(): @celery.task def retry_errored_repos(): + """ + Periodic task to reset repositories that have errored and try again. + """ from augur.tasks.init.celery_app import engine logger = logging.getLogger(create_collection_status_records.__name__) + #TODO: Isaac needs to normalize the status's to be abstract in the + #collection_status table once augur dev is less unstable. with DatabaseSession(logger,engine) as session: query = s.sql.text(f"""UPDATE repo SET secondary_staus = {CollectionState.PENDING.value}""" f""" WHERE secondary_status = '{CollectionState.ERROR.value}' ;""" f"""UPDATE repo SET core_status = {CollectionState.PENDING.value}""" f""" WHERE core_status = '{CollectionState.ERROR.value}' ;""" + f"""UPDATE repo SET facade_status = {CollectionState.PENDING.value}""" + f""" WHERE facade_status = '{CollectionState.ERROR.value}' ;""" + f"""UPDATE repo SET ml_status = {CollectionState.PENDING.value}""" + f""" WHERE ml_status = '{CollectionState.ERROR.value}' ;""" ) session.execute_sql(query) From b70f524251244ca7af01c3ad87d66f7afea41816 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Fri, 9 Feb 2024 09:50:53 -0500 Subject: [PATCH 09/15] linting Signed-off-by: Isaac Milarsky --- augur/tasks/start_tasks.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 5b3e259d9c..b0badb89d2 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -33,7 +33,6 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.application.db.session import DatabaseSession from logging import Logger -from enum import Enum from augur.tasks.util.redis_list import RedisList from augur.application.db.models import CollectionStatus, Repo from augur.tasks.util.collection_state import CollectionState @@ -357,6 +356,13 @@ def retry_errored_repos(): #Retry this task for every issue so that repos that were added manually get the chance to be added to the collection_status table. @celery.task(autoretry_for=(Exception,), retry_backoff=True, retry_backoff_max=300, retry_jitter=True, max_retries=None) def create_collection_status_records(): + """ + Automatic task that runs and checks for repos that haven't been given a collection_status + record corresponding to the state of their collection at the monent. + + A special celery task that automatically retries itself and has no max retries. + """ + from augur.tasks.init.celery_app import engine logger = logging.getLogger(create_collection_status_records.__name__) From 68e50dec153b3e5ac9903ba2835490f995bff8dd Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Fri, 9 Feb 2024 09:58:52 -0500 Subject: [PATCH 10/15] doc-string Signed-off-by: Isaac Milarsky --- augur/tasks/util/collection_state.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/augur/tasks/util/collection_state.py b/augur/tasks/util/collection_state.py index 27acc98de1..b5b8f0d264 100644 --- a/augur/tasks/util/collection_state.py +++ b/augur/tasks/util/collection_state.py @@ -2,6 +2,23 @@ from enum import Enum class CollectionState(Enum): + """ + Enum of possible states a repository's collection + can have whether it is core, secondary, facade, etc. + + Attributes: + + SUCCESS: State of success for the jobs in that collection hook + PENDING: Means the repo has not had collection run at all + ERROR: The collection hook has crashed + COLLECTING: The collection hook is running + INITIALIZING: Only for facade, indicates the repo is being cloned via git + UPDATE: Only for facade, indicates the repo has been cloned + FAILED_CLONE: Only for facade, indicates the clone has failed (usually 404) + STANDBY: Indicates the repo has been paused + IGNORE: Repo has encountered an error and we will not try again (usually 404) + """ + SUCCESS = "Success" PENDING = "Pending" ERROR = "Error" From 9aa97258465b1941e6d8b3bc29671f5f9627b1f0 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Mon, 12 Feb 2024 10:14:01 -0600 Subject: [PATCH 11/15] implement requested changes Signed-off-by: Isaac Milarsky --- augur/tasks/github/detect_move/core.py | 94 +++++++++++++------------- 1 file changed, 46 insertions(+), 48 deletions(-) diff --git a/augur/tasks/github/detect_move/core.py b/augur/tasks/github/detect_move/core.py index 762b20779a..79c4fbd728 100644 --- a/augur/tasks/github/detect_move/core.py +++ b/augur/tasks/github/detect_move/core.py @@ -59,64 +59,62 @@ def ping_github_for_repo_move(augur_db, key_auth, repo, logger,collection_hook=' attempts += 1 - #Mark as errored if not found - if response_from_gh.status_code == 404: - logger.error(f"Repo {repo.repo_git} responded 404 when pinged!") + #Update Url and retry if 301 + #301 moved permanently + if response_from_gh.status_code == 301: + + owner, name = extract_owner_and_repo_from_endpoint(key_auth, response_from_gh.headers['location'], logger) + + try: + old_description = str(repo.description) + except Exception: + old_description = "" + #Create new repo object to update existing repo_update_dict = { - 'repo_git': repo.repo_git, - 'repo_path': None, - 'repo_name': None, - 'description': f"During our check for this repo on {datetime.today().strftime('%Y-%m-%d')}, a 404 error was returned. The repository does not appear to have moved. Instead, it appears to be deleted", - 'data_collection_date': datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') + 'repo_git': f"https://github.com/{owner}/{name}", + 'repo_path': None, + 'repo_name': None, + 'description': f"(Originally hosted at {url}) {old_description}" } - update_repo_with_dict(repo, repo_update_dict, logger, augur_db) - - raise Exception(f"ERROR: Repo not found at requested host {repo.repo_git}") - elif attempts >= 10: - logger.warning(f"Could not check if repo moved because the api timed out 10 times. Url: {url}") - return - + update_repo_with_dict(repo, repo_update_dict, logger,augur_db) - #skip if not moved - #301 moved permanently - if response_from_gh.status_code != 301: - logger.info(f"Repo found at url: {url}") - return + raise Exception("ERROR: Repo has moved! Resetting Collection!") - owner, name = extract_owner_and_repo_from_endpoint(key_auth, response_from_gh.headers['location'], logger) - - - try: - old_description = str(repo.description) - except: - old_description = "" - - #Create new repo object to update existing - repo_update_dict = { - 'repo_git': f"https://github.com/{owner}/{name}", - 'repo_path': None, - 'repo_name': None, - 'description': f"(Originally hosted at {url}) {old_description}" - } + #Mark as ignore if 404 + if response_from_gh.status_code == 404: + repo_update_dict = { + 'repo_git': repo.repo_git, + 'repo_path': None, + 'repo_name': None, + 'description': f"During our check for this repo on {datetime.today().strftime('%Y-%m-%d')}, a 404 error was returned. The repository does not appear to have moved. Instead, it appears to be deleted", + 'data_collection_date': datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') + } - update_repo_with_dict(repo, repo_update_dict, logger,augur_db) + update_repo_with_dict(repo, repo_update_dict, logger, augur_db) - statusQuery = augur_db.session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo.repo_id) + statusQuery = augur_db.session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo.repo_id) - collectionRecord = execute_session_query(statusQuery,'one') - if collection_hook == 'core': - collectionRecord.core_status = CollectionState.IGNORE.value - collectionRecord.core_task_id = None - collectionRecord.core_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') - elif collection_hook == 'secondary': - collectionRecord.secondary_status = CollectionState.IGNORE.value - collectionRecord.secondary_task_id = None - collectionRecord.secondary_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') + collectionRecord = execute_session_query(statusQuery,'one') + if collection_hook == 'core': + collectionRecord.core_status = CollectionState.IGNORE.value + collectionRecord.core_task_id = None + collectionRecord.core_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') + elif collection_hook == 'secondary': + collectionRecord.secondary_status = CollectionState.IGNORE.value + collectionRecord.secondary_task_id = None + collectionRecord.secondary_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') - augur_db.session.commit() + augur_db.session.commit() + raise Exception("ERROR: Repo has moved! Resetting Collection!") - raise Exception("ERROR: Repo has moved! Marked repo as IGNORE and stopped collection!") + if attempts >= 10: + logger.error(f"Could not check if repo moved because the api timed out 10 times. Url: {url}") + raise Exception(f"ERROR: Could not get api response for repo: {url}") + #skip if not 404 + logger.info(f"Repo found at url: {url}") + return + From 4c8edae5a49e0336fc1b62e1eccfa88d7e7fd006 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Mon, 12 Feb 2024 17:49:08 -0600 Subject: [PATCH 12/15] typo causing cli error Signed-off-by: Isaac Milarsky --- augur/application/cli/_multicommand.py | 2 +- augur/tasks/github/detect_move/core.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/augur/application/cli/_multicommand.py b/augur/application/cli/_multicommand.py index 2df6e8b115..c0d8b1a967 100644 --- a/augur/application/cli/_multicommand.py +++ b/augur/application/cli/_multicommand.py @@ -27,7 +27,7 @@ def get_command(self, ctx, name): try: module = importlib.import_module('.' + name, 'augur.application.cli') return module.cli - except ModuleNotFoundError: + except ModuleNotFoundError as e: pass @click.command(cls=AugurMultiCommand, context_settings=CONTEXT_SETTINGS) diff --git a/augur/tasks/github/detect_move/core.py b/augur/tasks/github/detect_move/core.py index 79c4fbd728..875c649e24 100644 --- a/augur/tasks/github/detect_move/core.py +++ b/augur/tasks/github/detect_move/core.py @@ -6,7 +6,7 @@ from augur.tasks.github.util.util import parse_json_response import logging from datetime import datetime -from augur.tasks.utl.collection_state import CollectionState +from augur.tasks.util.collection_state import CollectionState from augur.application.db.util import execute_session_query From be5c19c71f7cc8be53d3b41ff34de71682de6bfc Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Mon, 12 Feb 2024 18:08:29 -0600 Subject: [PATCH 13/15] make sure all status are updated on 404 as per andrew suggestion Signed-off-by: Isaac Milarsky --- augur/tasks/github/detect_move/core.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/augur/tasks/github/detect_move/core.py b/augur/tasks/github/detect_move/core.py index 875c649e24..6c5a43d8bb 100644 --- a/augur/tasks/github/detect_move/core.py +++ b/augur/tasks/github/detect_move/core.py @@ -97,14 +97,18 @@ def ping_github_for_repo_move(augur_db, key_auth, repo, logger,collection_hook=' statusQuery = augur_db.session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo.repo_id) collectionRecord = execute_session_query(statusQuery,'one') - if collection_hook == 'core': - collectionRecord.core_status = CollectionState.IGNORE.value - collectionRecord.core_task_id = None - collectionRecord.core_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') - elif collection_hook == 'secondary': - collectionRecord.secondary_status = CollectionState.IGNORE.value - collectionRecord.secondary_task_id = None - collectionRecord.secondary_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') + + collectionRecord.core_status = CollectionState.IGNORE.value + collectionRecord.core_task_id = None + collectionRecord.core_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') + + collectionRecord.secondary_status = CollectionState.IGNORE.value + collectionRecord.secondary_task_id = None + collectionRecord.secondary_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') + + collectionRecord.facade_status = CollectionState.IGNORE.value + collectionRecord.facade_task_id = None + collectionRecord.facade_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') augur_db.session.commit() raise Exception("ERROR: Repo has moved! Resetting Collection!") From 0a5299918d448c3e2837c62be763a55ddd274188 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Mon, 12 Feb 2024 18:29:26 -0600 Subject: [PATCH 14/15] i need to normalize this for the love of gosh Signed-off-by: Isaac Milarsky --- augur/tasks/github/detect_move/core.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/augur/tasks/github/detect_move/core.py b/augur/tasks/github/detect_move/core.py index 6c5a43d8bb..cf7d2d1e5a 100644 --- a/augur/tasks/github/detect_move/core.py +++ b/augur/tasks/github/detect_move/core.py @@ -110,6 +110,11 @@ def ping_github_for_repo_move(augur_db, key_auth, repo, logger,collection_hook=' collectionRecord.facade_task_id = None collectionRecord.facade_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') + collectionRecord.ml_status = CollectionState.IGNORE.value + collectionRecord.ml_task_id = None + collectionRecord.ml_data_last_collected = datetime.today().strftime('%Y-%m-%dT%H:%M:%SZ') + + augur_db.session.commit() raise Exception("ERROR: Repo has moved! Resetting Collection!") From 84ab7b0c69f58eac50f0cf19851bc4f9fa7615e3 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Wed, 14 Feb 2024 04:51:06 -0600 Subject: [PATCH 15/15] Fixing Errant Nulls in PR Reports Query --- augur/api/routes/pull_request_reports.py | 49 ++++++++++++------------ 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/augur/api/routes/pull_request_reports.py b/augur/api/routes/pull_request_reports.py index 72b7a2c44e..9e65779542 100644 --- a/augur/api/routes/pull_request_reports.py +++ b/augur/api/routes/pull_request_reports.py @@ -53,7 +53,7 @@ def pull_request_data_collection(repo_id, start_date, end_date): ( EXTRACT ( EPOCH FROM last_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 86400 AS days_to_last_response, first_response_time, last_response_time, - average_time_between_responses, + EXTRACT ( EPOCH FROM average_time_between_responses), assigned_count, review_requested_count, labeled_count, @@ -62,15 +62,15 @@ def pull_request_data_collection(repo_id, start_date, end_date): referenced_count, closed_count, head_ref_force_pushed_count, - merged_count, + merged_count::INT, milestoned_count, unlabeled_count, head_ref_deleted_count, comment_count, - lines_added, - lines_removed, + COALESCE(lines_added, 0), + COALESCE(lines_removed, 0), commit_count, - file_count + COALESCE(file_count, 0) FROM repo, repo_groups, @@ -87,46 +87,47 @@ def pull_request_data_collection(repo_id, start_date, end_date): count(*) FILTER (WHERE action = 'head_ref_force_pushed') AS head_ref_force_pushed_count, count(*) FILTER (WHERE action = 'head_ref_deleted') AS head_ref_deleted_count, count(*) FILTER (WHERE action = 'milestoned') AS milestoned_count, - count(*) FILTER (WHERE action = 'merged') AS merged_count, - MIN(message.msg_timestamp) AS first_response_time, - COUNT(DISTINCT message.msg_timestamp) AS comment_count, - MAX(message.msg_timestamp) AS last_response_time, - (MAX(message.msg_timestamp) - MIN(message.msg_timestamp)) / COUNT(DISTINCT message.msg_timestamp) AS average_time_between_responses - FROM pull_request_events, pull_requests, repo, pull_request_message_ref, message - WHERE repo.repo_id = {repo_id} - AND repo.repo_id = pull_requests.repo_id - AND pull_requests.pull_request_id = pull_request_events.pull_request_id - AND pull_requests.pull_request_id = pull_request_message_ref.pull_request_id - AND pull_request_message_ref.msg_id = message.msg_id + COALESCE(count(*) FILTER (WHERE action = 'merged'), 0) AS merged_count, + COALESCE(MIN(message.msg_timestamp), pull_requests.pr_merged_at, pull_requests.pr_closed_at) AS first_response_time, + COALESCE(COUNT(DISTINCT message.msg_timestamp), 0) AS comment_count, + COALESCE(MAX(message.msg_timestamp), pull_requests.pr_closed_at) AS last_response_time, + COALESCE((MAX(message.msg_timestamp) - MIN(message.msg_timestamp)) / COUNT(DISTINCT message.msg_timestamp), pull_requests.pr_created_at - pull_requests.pr_closed_at) AS average_time_between_responses + FROM pull_requests + LEFT OUTER JOIN pull_request_events on pull_requests.pull_request_id = pull_request_events.pull_request_id + JOIN repo on repo.repo_id = pull_requests.repo_id + LEFT OUTER JOIN pull_request_message_ref on pull_requests.pull_request_id = pull_request_message_ref.pull_request_id + LEFT OUTER JOIN message on pull_request_message_ref.msg_id = message.msg_id + WHERE repo.repo_id = 1 GROUP BY pull_requests.pull_request_id ) response_times ON pull_requests.pull_request_id = response_times.pull_request_id - LEFT OUTER JOIN ( - SELECT pull_request_commits.pull_request_id, count(DISTINCT pr_cmt_sha) AS commit_count FROM pull_request_commits, pull_requests, pull_request_meta + LEFT JOIN ( + SELECT pull_request_commits.pull_request_id, count(DISTINCT pr_cmt_sha) AS commit_count + FROM pull_request_commits, pull_requests, pull_request_meta WHERE pull_requests.pull_request_id = pull_request_commits.pull_request_id AND pull_requests.pull_request_id = pull_request_meta.pull_request_id - AND pull_requests.repo_id = {repo_id} + AND pull_requests.repo_id = 1 AND pr_cmt_sha <> pull_requests.pr_merge_commit_sha AND pr_cmt_sha <> pull_request_meta.pr_sha GROUP BY pull_request_commits.pull_request_id ) all_commit_counts ON pull_requests.pull_request_id = all_commit_counts.pull_request_id - LEFT OUTER JOIN ( + LEFT JOIN ( SELECT MAX(pr_repo_meta_id), pull_request_meta.pull_request_id, pr_head_or_base, pr_src_meta_label FROM pull_requests, pull_request_meta WHERE pull_requests.pull_request_id = pull_request_meta.pull_request_id - AND pull_requests.repo_id = {repo_id} + AND pull_requests.repo_id = 1 AND pr_head_or_base = 'base' GROUP BY pull_request_meta.pull_request_id, pr_head_or_base, pr_src_meta_label ) base_labels ON base_labels.pull_request_id = all_commit_counts.pull_request_id - LEFT OUTER JOIN ( + LEFT JOIN ( SELECT sum(cmt_added) AS lines_added, sum(cmt_removed) AS lines_removed, pull_request_commits.pull_request_id, count(DISTINCT cmt_filename) AS file_count FROM pull_request_commits, commits, pull_requests, pull_request_meta WHERE cmt_commit_hash = pr_cmt_sha AND pull_requests.pull_request_id = pull_request_commits.pull_request_id AND pull_requests.pull_request_id = pull_request_meta.pull_request_id - AND pull_requests.repo_id = {repo_id} + AND pull_requests.repo_id = 1 AND commits.repo_id = pull_requests.repo_id AND commits.cmt_commit_hash <> pull_requests.pr_merge_commit_sha AND commits.cmt_commit_hash <> pull_request_meta.pr_sha @@ -136,7 +137,7 @@ def pull_request_data_collection(repo_id, start_date, end_date): WHERE repo.repo_group_id = repo_groups.repo_group_id AND repo.repo_id = pull_requests.repo_id - AND repo.repo_id = {repo_id} + AND repo.repo_id = 1 ORDER BY merged_count DESC """)