From 5a249af35d8c9d30513bc9e9842d3266d6a76e11 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Wed, 25 Sep 2024 09:26:06 -0500 Subject: [PATCH 1/6] update version Signed-off-by: Sean P. Goggins --- README.md | 4 ++-- metadata.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 0ff218d81e..69aa2f551d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Augur NEW Release v0.76.2 +# Augur NEW Release v0.76.3 Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data. Less data carpentry for everyone else! The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot) ... A public instance of 8Knot is available at https://metrix.chaoss.io ... That is tied to a public instance of Augur at https://ai.chaoss.io @@ -10,7 +10,7 @@ The primary way of looking at Augur data is through [8Knot](https://github.com/o ## NEW RELEASE ALERT! ### [If you want to jump right in, updated docker build/compose and bare metal installation instructions are available here](docs/new-install.md) -Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.76.2 +Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.76.3 - The `main` branch is a stable version of our new architecture, which features: - Dramatic improvement in the speed of large scale data collection (100,000+ repos). All data is obtained for 100k+ repos within 2 weeks. diff --git a/metadata.py b/metadata.py index ae807e7a36..018f51d328 100644 --- a/metadata.py +++ b/metadata.py @@ -5,8 +5,8 @@ __short_description__ = "Python 3 package for free/libre and open-source software community metrics, models & data collection" -__version__ = "0.76.2" -__release__ = "v0.76.2 (Pumpkin Space)" +__version__ = "0.76.3" +__release__ = "v0.76.3 (Pumpkin Laser)" __license__ = "MIT" -__copyright__ = "University of Missouri, University of Nebraska-Omaha, CHAOSS, Brian Warner & Augurlabs 2024" +__copyright__ = "University of Missouri, University of Nebraska-Omaha, CHAOSS, Brian Warner & Augurlabs 2112" From 3bab4ca137aa80dd634a77dd824c88cb4eead964 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 28 Sep 2024 10:22:31 -0500 Subject: [PATCH 2/6] add support for nested repos for gitlab --- augur/api/view/api.py | 2 +- augur/application/db/models/augur_data.py | 2 +- augur/tasks/github/util/util.py | 5 +++ augur/tasks/gitlab/events_task.py | 14 ++++---- augur/tasks/gitlab/issues_task.py | 20 +++++++----- augur/tasks/gitlab/merge_request_task.py | 40 +++++++++++++++-------- 6 files changed, 53 insertions(+), 30 deletions(-) diff --git a/augur/api/view/api.py b/augur/api/view/api.py index cbd7e4a0f1..345f4a6427 100644 --- a/augur/api/view/api.py +++ b/augur/api/view/api.py @@ -106,7 +106,7 @@ def av_add_user_repo(): # matches https://gitlab.com/{org}/{repo}/ or http://gitlab.com/{org}/{repo} elif Repo.parse_gitlab_repo_url(url)[0]: - org_name, repo_name = Repo.parse_github_repo_url(url) + org_name, repo_name = Repo.parse_gitlab_repo_url(url) repo_git = f"https://gitlab.com/{org_name}/{repo_name}" # TODO: gitlab ensure the whole repo git is inserted so it can be found here diff --git a/augur/application/db/models/augur_data.py b/augur/application/db/models/augur_data.py index 934949138e..c07f727124 100644 --- a/augur/application/db/models/augur_data.py +++ b/augur/application/db/models/augur_data.py @@ -1030,7 +1030,7 @@ def parse_gitlab_repo_url(url: str) -> tuple: Tuple of owner and repo. Or a tuple of None and None if the url is invalid. """ - result = re.search(r"https?:\/\/gitlab\.com\/([A-Za-z0-9 \- _]+)\/([A-Za-z0-9 \- _ \.]+)(.git)?\/?$", url) + result = re.search(r"https?:\/\/gitlab\.com\/([A-Za-z0-9\-_\/]+)\/([A-Za-z0-9\-_]+)(\.git)?\/?$", url) if not result: return None, None diff --git a/augur/tasks/github/util/util.py b/augur/tasks/github/util/util.py index 8dd6e4d81b..a8bde66605 100644 --- a/augur/tasks/github/util/util.py +++ b/augur/tasks/github/util/util.py @@ -1,6 +1,7 @@ """Utility functions that are useful for several Github tasks""" from typing import Any, List, Tuple import logging +import urllib.parse import json import httpx from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth @@ -46,6 +47,10 @@ def get_owner_repo(git_url: str) -> Tuple[str, str]: return owner, repo +def get_gitlab_repo_identifier(owner, repo): + + return urllib.parse.quote(f"{owner}/{repo}", safe='') + def parse_json_response(logger: logging.Logger, response: httpx.Response) -> dict: # try to get json from response diff --git a/augur/tasks/gitlab/events_task.py b/augur/tasks/gitlab/events_task.py index c8d9a8f8a7..5c85bc5d84 100644 --- a/augur/tasks/gitlab/events_task.py +++ b/augur/tasks/gitlab/events_task.py @@ -7,8 +7,8 @@ from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler from augur.application.db.data_parse import extract_gitlab_mr_event_data, extract_gitlab_issue_event_data -from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.models import Issue, IssueEvent, PullRequest, PullRequestEvent +from augur.tasks.github.util.util import get_gitlab_repo_identifier +from augur.application.db.models import Issue, IssueEvent, PullRequest, PullRequestEvent, Repo from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_session from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth @@ -24,7 +24,7 @@ def collect_gitlab_issue_events(repo_git) -> int: repo_git: the repo url string """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) logger = logging.getLogger(collect_gitlab_issue_events.__name__) @@ -52,7 +52,7 @@ def collect_gitlab_merge_request_events(repo_git) -> int: repo_git: the repo url string """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) logger = logging.getLogger(collect_gitlab_issue_events.__name__) @@ -82,11 +82,13 @@ def retrieve_all_gitlab_event_data(gtype, repo_git, logger, key_auth) -> None: key_auth: key auth cache and rotator object """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) + + repo_identifier = get_gitlab_repo_identifier(owner, repo) logger.info(f"Collecting gitlab issue events for {owner}/{repo}") - url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/events?target_type={gtype}" + url = f"https://gitlab.com/api/v4/projects/{repo_identifier}/events?target_type={gtype}" events = GitlabApiHandler(key_auth, logger) all_data = [] diff --git a/augur/tasks/gitlab/issues_task.py b/augur/tasks/gitlab/issues_task.py index 8a987a7744..509b4fc14c 100644 --- a/augur/tasks/gitlab/issues_task.py +++ b/augur/tasks/gitlab/issues_task.py @@ -8,8 +8,8 @@ from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler from augur.application.db.data_parse import extract_needed_issue_data_from_gitlab_issue, extract_needed_gitlab_issue_label_data, extract_needed_gitlab_issue_assignee_data, extract_needed_gitlab_issue_message_ref_data, extract_needed_gitlab_message_data, extract_needed_gitlab_contributor_data -from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts -from augur.application.db.models import Issue, IssueLabel, IssueAssignee, IssueMessageRef, Message, Contributor +from augur.tasks.github.util.util import get_gitlab_repo_identifier, add_key_value_pair_to_dicts +from augur.application.db.models import Issue, IssueLabel, IssueAssignee, IssueMessageRef, Message, Contributor, Repo from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_session from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth @@ -32,7 +32,7 @@ def collect_gitlab_issues(repo_git : str) -> int: key_auth = GitlabRandomKeyAuth(logger) try: - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) issue_data = retrieve_all_gitlab_issue_data(repo_git, logger, key_auth) @@ -57,11 +57,13 @@ def retrieve_all_gitlab_issue_data(repo_git, logger, key_auth) -> None: key_auth: key auth cache and rotator object """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_github_repo_url(repo_git) + + repo_identifier = get_gitlab_repo_identifier(owner, repo) logger.info(f"Collecting gitlab issues for {owner}/{repo}") - url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/issues?with_labels_details=True" + url = f"https://gitlab.com/api/v4/projects/{repo_identifier}/issues?with_labels_details=True" issues = GitlabApiHandler(key_auth, logger) all_data = [] @@ -207,7 +209,7 @@ def collect_gitlab_issue_comments(issue_ids, repo_git) -> int: repo_git: repo url """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) logger = logging.getLogger(collect_gitlab_issues.__name__) @@ -237,7 +239,9 @@ def retrieve_all_gitlab_issue_comments(key_auth, logger, issue_ids, repo_git): repo_git: repo url """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) + + repo_identifier = get_gitlab_repo_identifier(owner, repo) all_comments = {} issue_count = len(issue_ids) @@ -249,7 +253,7 @@ def retrieve_all_gitlab_issue_comments(key_auth, logger, issue_ids, repo_git): logger.info(f"Collecting {owner}/{repo} gitlab issue comments for issue {index} of {issue_count}") - url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/issues/{id}/notes" + url = f"https://gitlab.com/api/v4/projects/{repo_identifier}/issues/{id}/notes" for page_data, _ in comments.iter_pages(url): diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py index 5e56067c53..cdcd27a387 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ b/augur/tasks/gitlab/merge_request_task.py @@ -4,7 +4,7 @@ from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data, extract_needed_mr_reviewer_data, extract_needed_mr_commit_data, extract_needed_mr_file_data, extract_needed_mr_metadata, extract_needed_gitlab_mr_message_ref_data, extract_needed_gitlab_message_data, extract_needed_gitlab_contributor_data -from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts +from augur.tasks.github.util.util import get_gitlab_repo_identifier, add_key_value_pair_to_dicts from augur.application.db.models import PullRequest, PullRequestLabel, PullRequestMeta, PullRequestCommit, PullRequestFile, PullRequestMessageRef, Repo, Message, Contributor, PullRequestAssignee from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth from augur.tasks.util.worker_util import remove_duplicate_dicts @@ -26,7 +26,7 @@ def collect_gitlab_merge_requests(repo_git: str) -> int: repo_id = get_repo_by_repo_git(repo_git).repo_id - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) key_auth = GitlabRandomKeyAuth(logger) @@ -51,11 +51,13 @@ def retrieve_all_mr_data(repo_git: str, logger, key_auth) -> None: key_auth: key auth cache and rotator object """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) + + repo_identifier = get_gitlab_repo_identifier(owner, repo) logger.info(f"Collecting pull requests for {owner}/{repo}") - url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests?with_labels_details=True" + url = f"https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests?with_labels_details=True" mrs = GitlabApiHandler(key_auth, logger) all_data = [] @@ -171,15 +173,17 @@ def collect_merge_request_comments(mr_ids, repo_git) -> int: repo_git: the repo url string """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) logger = logging.getLogger(collect_merge_request_comments.__name__) + repo_identifier = get_gitlab_repo_identifier(owner, repo) + repo_id = get_repo_by_repo_git(repo_git).repo_id key_auth = GitlabRandomKeyAuth(logger) - url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/notes".format(owner=owner, repo=repo, id="{id}") + url = "https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests/{id}/notes".format(repo_identifier=repo_identifier, id="{id}") comments = retrieve_merge_request_data(mr_ids, url, "comments", owner, repo, key_auth, logger, response_type="list") with get_session() as session: @@ -282,7 +286,9 @@ def collect_merge_request_metadata(mr_ids, repo_git) -> int: repo_git: the repo url string """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) + + repo_identifier = get_gitlab_repo_identifier(owner, repo) logger = logging.getLogger(collect_merge_request_metadata.__name__) @@ -290,7 +296,7 @@ def collect_merge_request_metadata(mr_ids, repo_git) -> int: key_auth = GitlabRandomKeyAuth(logger) - url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}".format(owner=owner, repo=repo, id="{id}") + url = "https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests/{id}".format(repo_identifier=repo_identifier, id="{id}") metadata_list = retrieve_merge_request_data(mr_ids, url, "metadata", owner, repo, key_auth, logger, response_type="dict") with get_session() as session: @@ -347,7 +353,9 @@ def collect_merge_request_reviewers(mr_ids, repo_git) -> int: repo_git: the repo url string """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) + + repo_identifier = get_gitlab_repo_identifier(owner, repo) logger = logging.getLogger(collect_merge_request_reviewers.__name__) @@ -355,7 +363,7 @@ def collect_merge_request_reviewers(mr_ids, repo_git) -> int: key_auth = GitlabRandomKeyAuth(logger) - url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/approvals".format(owner=owner, repo=repo, id="{id}") + url = "https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests/{id}/approvals".format(repo_identifier=repo_identifier, id="{id}") reviewers = retrieve_merge_request_data(mr_ids, url, "reviewers", owner, repo, key_auth, logger, response_type="dict") with get_session() as session: @@ -414,7 +422,9 @@ def collect_merge_request_commits(mr_ids, repo_git) -> int: repo_git: the repo url string """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) + + repo_identifier = get_gitlab_repo_identifier(owner, repo) logger = logging.getLogger(collect_merge_request_commits.__name__) @@ -422,7 +432,7 @@ def collect_merge_request_commits(mr_ids, repo_git) -> int: key_auth = GitlabRandomKeyAuth(logger) - url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/commits".format(owner=owner, repo=repo, id="{id}") + url = "https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests/{id}/commits".format(repo_identifier=repo_identifier, id="{id}") commits = retrieve_merge_request_data(mr_ids, url, "commits", owner, repo, key_auth, logger, response_type="list") with get_session() as session: @@ -484,13 +494,15 @@ def collect_merge_request_files(mr_ids, repo_git) -> int: logger = logging.getLogger(collect_merge_request_files.__name__) - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) + + repo_identifier = get_gitlab_repo_identifier(owner, repo) repo_id = get_repo_by_repo_git(repo_git).repo_id key_auth = GitlabRandomKeyAuth(logger) - url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/changes".format(owner=owner, repo=repo, id="{id}") + url = "https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests/{id}/changes".format(repo_identifier=repo_identifier, id="{id}") files = retrieve_merge_request_data(mr_ids, url, "files", owner, repo, key_auth, logger, response_type="dict") with get_session() as session: From 504026d600edc0a9c90ad70ec19f91c4181db4bb Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Sat, 28 Sep 2024 11:01:45 -0500 Subject: [PATCH 3/6] update insert logic to get repo identifier --- augur/application/db/models/augur_data.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/augur/application/db/models/augur_data.py b/augur/application/db/models/augur_data.py index c07f727124..c1e6d3e65c 100644 --- a/augur/application/db/models/augur_data.py +++ b/augur/application/db/models/augur_data.py @@ -26,6 +26,7 @@ import logging import re import json +import urllib.parse from augur.application.db.models.base import Base @@ -971,7 +972,7 @@ def is_valid_gitlab_repo(gl_session, url: str) -> bool: return False, {"status": "Invalid repo URL"} # Encode namespace and project name for the API request - project_identifier = f"{owner}%2F{repo}" + project_identifier = urllib.parse.quote(f"{owner}/{repo}", safe='') url = REPO_ENDPOINT.format(project_identifier) attempts = 0 From 1825ead3d8e94fb5ca6a00465c716355bfa06310 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 30 Sep 2024 18:22:12 -0500 Subject: [PATCH 4/6] change method to use gitlab one --- augur/tasks/gitlab/issues_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/gitlab/issues_task.py b/augur/tasks/gitlab/issues_task.py index 509b4fc14c..fe210a9f29 100644 --- a/augur/tasks/gitlab/issues_task.py +++ b/augur/tasks/gitlab/issues_task.py @@ -57,7 +57,7 @@ def retrieve_all_gitlab_issue_data(repo_git, logger, key_auth) -> None: key_auth: key auth cache and rotator object """ - owner, repo = Repo.parse_github_repo_url(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) repo_identifier = get_gitlab_repo_identifier(owner, repo) From ea304c206d70e26874cd6cbc06b37a50a2ebc025 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Tue, 1 Oct 2024 18:11:09 -0500 Subject: [PATCH 5/6] run machine learning and contributor breadth every 30 days --- augur/tasks/init/celery_app.py | 10 +++------- augur/tasks/start_tasks.py | 2 +- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/augur/tasks/init/celery_app.py b/augur/tasks/init/celery_app.py index da97751db9..0c18c21b2d 100644 --- a/augur/tasks/init/celery_app.py +++ b/augur/tasks/init/celery_app.py @@ -217,9 +217,9 @@ def setup_periodic_tasks(sender, **kwargs): sender.add_periodic_task(collection_interval, augur_collection_monitor.s()) #Do longer tasks less often - non_domain_collection_interval = collection_interval * 300 - logger.info(f"Scheduling non-repo-domain collection every {non_domain_collection_interval/60} minutes") - sender.add_periodic_task(non_domain_collection_interval, non_repo_domain_tasks.s()) + logger.info(f"Scheduling data analysis every 30 days") + thirty_days_in_seconds = 30*24*60*60 + sender.add_periodic_task(thirty_days_in_seconds, non_repo_domain_tasks.s()) mat_views_interval = int(config.get_value('Celery', 'refresh_materialized_views_interval_in_days')) logger.info(f"Scheduling refresh materialized view every night at 1am CDT") @@ -231,10 +231,6 @@ def setup_periodic_tasks(sender, **kwargs): logger.info(f"Setting 404 repos to be marked for retry on midnight each day") sender.add_periodic_task(crontab(hour=0, minute=0),retry_errored_repos.s()) - logger.info(f"Scheduling contributor breadth every 30 days") - thirty_days_in_seconds = 30*24*60*60 - sender.add_periodic_task(thirty_days_in_seconds, contributor_breadth_model.s()) - @after_setup_logger.connect def setup_loggers(*args,**kwargs): """Override Celery loggers with our own.""" diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 562069ce84..8e9aad8c78 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -143,7 +143,7 @@ def non_repo_domain_tasks(self): enabled_tasks = [] if not RUNNING_DOCKER and machine_learning_phase.__name__ in enabled_phase_names: - #enabled_tasks.extend(machine_learning_phase()) + enabled_tasks.extend(machine_learning_phase()) from augur.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker import contributor_breadth_model enabled_tasks.append(contributor_breadth_model.si()) From 4a4d12104588f26eaa3385281f30b028a359ecaa Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 15 Oct 2024 17:56:41 -0500 Subject: [PATCH 6/6] Fixed bug Signed-off-by: Sean P. Goggins --- augur/tasks/start_tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 8e9aad8c78..562069ce84 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -143,7 +143,7 @@ def non_repo_domain_tasks(self): enabled_tasks = [] if not RUNNING_DOCKER and machine_learning_phase.__name__ in enabled_phase_names: - enabled_tasks.extend(machine_learning_phase()) + #enabled_tasks.extend(machine_learning_phase()) from augur.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker import contributor_breadth_model enabled_tasks.append(contributor_breadth_model.si())