diff --git a/README.md b/README.md index 0ff218d81e..69aa2f551d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Augur NEW Release v0.76.2 +# Augur NEW Release v0.76.3 Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data. Less data carpentry for everyone else! The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot) ... A public instance of 8Knot is available at https://metrix.chaoss.io ... That is tied to a public instance of Augur at https://ai.chaoss.io @@ -10,7 +10,7 @@ The primary way of looking at Augur data is through [8Knot](https://github.com/o ## NEW RELEASE ALERT! ### [If you want to jump right in, updated docker build/compose and bare metal installation instructions are available here](docs/new-install.md) -Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.76.2 +Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.76.3 - The `main` branch is a stable version of our new architecture, which features: - Dramatic improvement in the speed of large scale data collection (100,000+ repos). All data is obtained for 100k+ repos within 2 weeks. diff --git a/augur/api/view/api.py b/augur/api/view/api.py index cbd7e4a0f1..345f4a6427 100644 --- a/augur/api/view/api.py +++ b/augur/api/view/api.py @@ -106,7 +106,7 @@ def av_add_user_repo(): # matches https://gitlab.com/{org}/{repo}/ or http://gitlab.com/{org}/{repo} elif Repo.parse_gitlab_repo_url(url)[0]: - org_name, repo_name = Repo.parse_github_repo_url(url) + org_name, repo_name = Repo.parse_gitlab_repo_url(url) repo_git = f"https://gitlab.com/{org_name}/{repo_name}" # TODO: gitlab ensure the whole repo git is inserted so it can be found here diff --git a/augur/application/db/models/augur_data.py b/augur/application/db/models/augur_data.py index 934949138e..c1e6d3e65c 100644 --- a/augur/application/db/models/augur_data.py +++ b/augur/application/db/models/augur_data.py @@ -26,6 +26,7 @@ import logging import re import json +import urllib.parse from augur.application.db.models.base import Base @@ -971,7 +972,7 @@ def is_valid_gitlab_repo(gl_session, url: str) -> bool: return False, {"status": "Invalid repo URL"} # Encode namespace and project name for the API request - project_identifier = f"{owner}%2F{repo}" + project_identifier = urllib.parse.quote(f"{owner}/{repo}", safe='') url = REPO_ENDPOINT.format(project_identifier) attempts = 0 @@ -1030,7 +1031,7 @@ def parse_gitlab_repo_url(url: str) -> tuple: Tuple of owner and repo. Or a tuple of None and None if the url is invalid. """ - result = re.search(r"https?:\/\/gitlab\.com\/([A-Za-z0-9 \- _]+)\/([A-Za-z0-9 \- _ \.]+)(.git)?\/?$", url) + result = re.search(r"https?:\/\/gitlab\.com\/([A-Za-z0-9\-_\/]+)\/([A-Za-z0-9\-_]+)(\.git)?\/?$", url) if not result: return None, None diff --git a/augur/tasks/github/util/util.py b/augur/tasks/github/util/util.py index 8dd6e4d81b..a8bde66605 100644 --- a/augur/tasks/github/util/util.py +++ b/augur/tasks/github/util/util.py @@ -1,6 +1,7 @@ """Utility functions that are useful for several Github tasks""" from typing import Any, List, Tuple import logging +import urllib.parse import json import httpx from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth @@ -46,6 +47,10 @@ def get_owner_repo(git_url: str) -> Tuple[str, str]: return owner, repo +def get_gitlab_repo_identifier(owner, repo): + + return urllib.parse.quote(f"{owner}/{repo}", safe='') + def parse_json_response(logger: logging.Logger, response: httpx.Response) -> dict: # try to get json from response diff --git a/augur/tasks/gitlab/events_task.py b/augur/tasks/gitlab/events_task.py index c8d9a8f8a7..5c85bc5d84 100644 --- a/augur/tasks/gitlab/events_task.py +++ b/augur/tasks/gitlab/events_task.py @@ -7,8 +7,8 @@ from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler from augur.application.db.data_parse import extract_gitlab_mr_event_data, extract_gitlab_issue_event_data -from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.models import Issue, IssueEvent, PullRequest, PullRequestEvent +from augur.tasks.github.util.util import get_gitlab_repo_identifier +from augur.application.db.models import Issue, IssueEvent, PullRequest, PullRequestEvent, Repo from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_session from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth @@ -24,7 +24,7 @@ def collect_gitlab_issue_events(repo_git) -> int: repo_git: the repo url string """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) logger = logging.getLogger(collect_gitlab_issue_events.__name__) @@ -52,7 +52,7 @@ def collect_gitlab_merge_request_events(repo_git) -> int: repo_git: the repo url string """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) logger = logging.getLogger(collect_gitlab_issue_events.__name__) @@ -82,11 +82,13 @@ def retrieve_all_gitlab_event_data(gtype, repo_git, logger, key_auth) -> None: key_auth: key auth cache and rotator object """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) + + repo_identifier = get_gitlab_repo_identifier(owner, repo) logger.info(f"Collecting gitlab issue events for {owner}/{repo}") - url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/events?target_type={gtype}" + url = f"https://gitlab.com/api/v4/projects/{repo_identifier}/events?target_type={gtype}" events = GitlabApiHandler(key_auth, logger) all_data = [] diff --git a/augur/tasks/gitlab/issues_task.py b/augur/tasks/gitlab/issues_task.py index 8a987a7744..fe210a9f29 100644 --- a/augur/tasks/gitlab/issues_task.py +++ b/augur/tasks/gitlab/issues_task.py @@ -8,8 +8,8 @@ from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler from augur.application.db.data_parse import extract_needed_issue_data_from_gitlab_issue, extract_needed_gitlab_issue_label_data, extract_needed_gitlab_issue_assignee_data, extract_needed_gitlab_issue_message_ref_data, extract_needed_gitlab_message_data, extract_needed_gitlab_contributor_data -from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts -from augur.application.db.models import Issue, IssueLabel, IssueAssignee, IssueMessageRef, Message, Contributor +from augur.tasks.github.util.util import get_gitlab_repo_identifier, add_key_value_pair_to_dicts +from augur.application.db.models import Issue, IssueLabel, IssueAssignee, IssueMessageRef, Message, Contributor, Repo from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_session from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth @@ -32,7 +32,7 @@ def collect_gitlab_issues(repo_git : str) -> int: key_auth = GitlabRandomKeyAuth(logger) try: - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) issue_data = retrieve_all_gitlab_issue_data(repo_git, logger, key_auth) @@ -57,11 +57,13 @@ def retrieve_all_gitlab_issue_data(repo_git, logger, key_auth) -> None: key_auth: key auth cache and rotator object """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) + + repo_identifier = get_gitlab_repo_identifier(owner, repo) logger.info(f"Collecting gitlab issues for {owner}/{repo}") - url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/issues?with_labels_details=True" + url = f"https://gitlab.com/api/v4/projects/{repo_identifier}/issues?with_labels_details=True" issues = GitlabApiHandler(key_auth, logger) all_data = [] @@ -207,7 +209,7 @@ def collect_gitlab_issue_comments(issue_ids, repo_git) -> int: repo_git: repo url """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) logger = logging.getLogger(collect_gitlab_issues.__name__) @@ -237,7 +239,9 @@ def retrieve_all_gitlab_issue_comments(key_auth, logger, issue_ids, repo_git): repo_git: repo url """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) + + repo_identifier = get_gitlab_repo_identifier(owner, repo) all_comments = {} issue_count = len(issue_ids) @@ -249,7 +253,7 @@ def retrieve_all_gitlab_issue_comments(key_auth, logger, issue_ids, repo_git): logger.info(f"Collecting {owner}/{repo} gitlab issue comments for issue {index} of {issue_count}") - url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/issues/{id}/notes" + url = f"https://gitlab.com/api/v4/projects/{repo_identifier}/issues/{id}/notes" for page_data, _ in comments.iter_pages(url): diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py index 5e56067c53..cdcd27a387 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ b/augur/tasks/gitlab/merge_request_task.py @@ -4,7 +4,7 @@ from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data, extract_needed_mr_reviewer_data, extract_needed_mr_commit_data, extract_needed_mr_file_data, extract_needed_mr_metadata, extract_needed_gitlab_mr_message_ref_data, extract_needed_gitlab_message_data, extract_needed_gitlab_contributor_data -from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts +from augur.tasks.github.util.util import get_gitlab_repo_identifier, add_key_value_pair_to_dicts from augur.application.db.models import PullRequest, PullRequestLabel, PullRequestMeta, PullRequestCommit, PullRequestFile, PullRequestMessageRef, Repo, Message, Contributor, PullRequestAssignee from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth from augur.tasks.util.worker_util import remove_duplicate_dicts @@ -26,7 +26,7 @@ def collect_gitlab_merge_requests(repo_git: str) -> int: repo_id = get_repo_by_repo_git(repo_git).repo_id - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) key_auth = GitlabRandomKeyAuth(logger) @@ -51,11 +51,13 @@ def retrieve_all_mr_data(repo_git: str, logger, key_auth) -> None: key_auth: key auth cache and rotator object """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) + + repo_identifier = get_gitlab_repo_identifier(owner, repo) logger.info(f"Collecting pull requests for {owner}/{repo}") - url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests?with_labels_details=True" + url = f"https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests?with_labels_details=True" mrs = GitlabApiHandler(key_auth, logger) all_data = [] @@ -171,15 +173,17 @@ def collect_merge_request_comments(mr_ids, repo_git) -> int: repo_git: the repo url string """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) logger = logging.getLogger(collect_merge_request_comments.__name__) + repo_identifier = get_gitlab_repo_identifier(owner, repo) + repo_id = get_repo_by_repo_git(repo_git).repo_id key_auth = GitlabRandomKeyAuth(logger) - url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/notes".format(owner=owner, repo=repo, id="{id}") + url = "https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests/{id}/notes".format(repo_identifier=repo_identifier, id="{id}") comments = retrieve_merge_request_data(mr_ids, url, "comments", owner, repo, key_auth, logger, response_type="list") with get_session() as session: @@ -282,7 +286,9 @@ def collect_merge_request_metadata(mr_ids, repo_git) -> int: repo_git: the repo url string """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) + + repo_identifier = get_gitlab_repo_identifier(owner, repo) logger = logging.getLogger(collect_merge_request_metadata.__name__) @@ -290,7 +296,7 @@ def collect_merge_request_metadata(mr_ids, repo_git) -> int: key_auth = GitlabRandomKeyAuth(logger) - url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}".format(owner=owner, repo=repo, id="{id}") + url = "https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests/{id}".format(repo_identifier=repo_identifier, id="{id}") metadata_list = retrieve_merge_request_data(mr_ids, url, "metadata", owner, repo, key_auth, logger, response_type="dict") with get_session() as session: @@ -347,7 +353,9 @@ def collect_merge_request_reviewers(mr_ids, repo_git) -> int: repo_git: the repo url string """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) + + repo_identifier = get_gitlab_repo_identifier(owner, repo) logger = logging.getLogger(collect_merge_request_reviewers.__name__) @@ -355,7 +363,7 @@ def collect_merge_request_reviewers(mr_ids, repo_git) -> int: key_auth = GitlabRandomKeyAuth(logger) - url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/approvals".format(owner=owner, repo=repo, id="{id}") + url = "https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests/{id}/approvals".format(repo_identifier=repo_identifier, id="{id}") reviewers = retrieve_merge_request_data(mr_ids, url, "reviewers", owner, repo, key_auth, logger, response_type="dict") with get_session() as session: @@ -414,7 +422,9 @@ def collect_merge_request_commits(mr_ids, repo_git) -> int: repo_git: the repo url string """ - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) + + repo_identifier = get_gitlab_repo_identifier(owner, repo) logger = logging.getLogger(collect_merge_request_commits.__name__) @@ -422,7 +432,7 @@ def collect_merge_request_commits(mr_ids, repo_git) -> int: key_auth = GitlabRandomKeyAuth(logger) - url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/commits".format(owner=owner, repo=repo, id="{id}") + url = "https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests/{id}/commits".format(repo_identifier=repo_identifier, id="{id}") commits = retrieve_merge_request_data(mr_ids, url, "commits", owner, repo, key_auth, logger, response_type="list") with get_session() as session: @@ -484,13 +494,15 @@ def collect_merge_request_files(mr_ids, repo_git) -> int: logger = logging.getLogger(collect_merge_request_files.__name__) - owner, repo = get_owner_repo(repo_git) + owner, repo = Repo.parse_gitlab_repo_url(repo_git) + + repo_identifier = get_gitlab_repo_identifier(owner, repo) repo_id = get_repo_by_repo_git(repo_git).repo_id key_auth = GitlabRandomKeyAuth(logger) - url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/changes".format(owner=owner, repo=repo, id="{id}") + url = "https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests/{id}/changes".format(repo_identifier=repo_identifier, id="{id}") files = retrieve_merge_request_data(mr_ids, url, "files", owner, repo, key_auth, logger, response_type="dict") with get_session() as session: diff --git a/augur/tasks/init/celery_app.py b/augur/tasks/init/celery_app.py index da97751db9..0c18c21b2d 100644 --- a/augur/tasks/init/celery_app.py +++ b/augur/tasks/init/celery_app.py @@ -217,9 +217,9 @@ def setup_periodic_tasks(sender, **kwargs): sender.add_periodic_task(collection_interval, augur_collection_monitor.s()) #Do longer tasks less often - non_domain_collection_interval = collection_interval * 300 - logger.info(f"Scheduling non-repo-domain collection every {non_domain_collection_interval/60} minutes") - sender.add_periodic_task(non_domain_collection_interval, non_repo_domain_tasks.s()) + logger.info(f"Scheduling data analysis every 30 days") + thirty_days_in_seconds = 30*24*60*60 + sender.add_periodic_task(thirty_days_in_seconds, non_repo_domain_tasks.s()) mat_views_interval = int(config.get_value('Celery', 'refresh_materialized_views_interval_in_days')) logger.info(f"Scheduling refresh materialized view every night at 1am CDT") @@ -231,10 +231,6 @@ def setup_periodic_tasks(sender, **kwargs): logger.info(f"Setting 404 repos to be marked for retry on midnight each day") sender.add_periodic_task(crontab(hour=0, minute=0),retry_errored_repos.s()) - logger.info(f"Scheduling contributor breadth every 30 days") - thirty_days_in_seconds = 30*24*60*60 - sender.add_periodic_task(thirty_days_in_seconds, contributor_breadth_model.s()) - @after_setup_logger.connect def setup_loggers(*args,**kwargs): """Override Celery loggers with our own.""" diff --git a/metadata.py b/metadata.py index ae807e7a36..018f51d328 100644 --- a/metadata.py +++ b/metadata.py @@ -5,8 +5,8 @@ __short_description__ = "Python 3 package for free/libre and open-source software community metrics, models & data collection" -__version__ = "0.76.2" -__release__ = "v0.76.2 (Pumpkin Space)" +__version__ = "0.76.3" +__release__ = "v0.76.3 (Pumpkin Laser)" __license__ = "MIT" -__copyright__ = "University of Missouri, University of Nebraska-Omaha, CHAOSS, Brian Warner & Augurlabs 2024" +__copyright__ = "University of Missouri, University of Nebraska-Omaha, CHAOSS, Brian Warner & Augurlabs 2112"