Skip to content

Commit

Permalink
Merge pull request #2930 from chaoss/dev
Browse files Browse the repository at this point in the history
Release 0.76.3: Fixes GitLab URL nesting Issue
  • Loading branch information
sgoggins authored Oct 15, 2024
2 parents c8eba65 + 4a4d121 commit 6a6dbff
Show file tree
Hide file tree
Showing 9 changed files with 63 additions and 43 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Augur NEW Release v0.76.2
# Augur NEW Release v0.76.3

Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data. Less data carpentry for everyone else!
The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot) ... A public instance of 8Knot is available at https://metrix.chaoss.io ... That is tied to a public instance of Augur at https://ai.chaoss.io
Expand All @@ -10,7 +10,7 @@ The primary way of looking at Augur data is through [8Knot](https://github.com/o
## NEW RELEASE ALERT!
### [If you want to jump right in, updated docker build/compose and bare metal installation instructions are available here](docs/new-install.md)

Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.76.2
Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.76.3

- The `main` branch is a stable version of our new architecture, which features:
- Dramatic improvement in the speed of large-scale data collection (100,000+ repos). All data is obtained for 100k+ repos within 2 weeks.
Expand Down
2 changes: 1 addition & 1 deletion augur/api/view/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def av_add_user_repo():
# matches https://gitlab.com/{org}/{repo}/ or http://gitlab.com/{org}/{repo}
elif Repo.parse_gitlab_repo_url(url)[0]:

org_name, repo_name = Repo.parse_github_repo_url(url)
org_name, repo_name = Repo.parse_gitlab_repo_url(url)
repo_git = f"https://gitlab.com/{org_name}/{repo_name}"

# TODO: gitlab ensure the whole repo git is inserted so it can be found here
Expand Down
5 changes: 3 additions & 2 deletions augur/application/db/models/augur_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import logging
import re
import json
import urllib.parse


from augur.application.db.models.base import Base
Expand Down Expand Up @@ -971,7 +972,7 @@ def is_valid_gitlab_repo(gl_session, url: str) -> bool:
return False, {"status": "Invalid repo URL"}

# Encode namespace and project name for the API request
project_identifier = f"{owner}%2F{repo}"
project_identifier = urllib.parse.quote(f"{owner}/{repo}", safe='')
url = REPO_ENDPOINT.format(project_identifier)

attempts = 0
Expand Down Expand Up @@ -1030,7 +1031,7 @@ def parse_gitlab_repo_url(url: str) -> tuple:
Tuple of owner and repo. Or a tuple of None and None if the url is invalid.
"""

result = re.search(r"https?:\/\/gitlab\.com\/([A-Za-z0-9 \- _]+)\/([A-Za-z0-9 \- _ \.]+)(.git)?\/?$", url)
result = re.search(r"https?:\/\/gitlab\.com\/([A-Za-z0-9\-_\/]+)\/([A-Za-z0-9\-_]+)(\.git)?\/?$", url)

if not result:
return None, None
Expand Down
5 changes: 5 additions & 0 deletions augur/tasks/github/util/util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Utility functions that are useful for several Github tasks"""
from typing import Any, List, Tuple
import logging
import urllib.parse
import json
import httpx
from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth
Expand Down Expand Up @@ -46,6 +47,10 @@ def get_owner_repo(git_url: str) -> Tuple[str, str]:

return owner, repo

def get_gitlab_repo_identifier(owner: str, repo: str) -> str:
    """Return the URL-encoded ``owner/repo`` path used as a GitLab project identifier.

    Args:
        owner: Namespace portion of the project path. It may itself contain
            ``/`` separators when the project lives in nested groups.
        repo: Project name.

    Returns:
        ``f"{owner}/{repo}"`` percent-encoded with no safe characters, so
        every ``/`` (including those inside ``owner``) becomes ``%2F``.
    """
    # safe='' overrides quote()'s default of leaving "/" unescaped.
    return urllib.parse.quote(f"{owner}/{repo}", safe='')


def parse_json_response(logger: logging.Logger, response: httpx.Response) -> dict:
# try to get json from response
Expand Down
14 changes: 8 additions & 6 deletions augur/tasks/gitlab/events_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask
from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler
from augur.application.db.data_parse import extract_gitlab_mr_event_data, extract_gitlab_issue_event_data
from augur.tasks.github.util.util import get_owner_repo
from augur.application.db.models import Issue, IssueEvent, PullRequest, PullRequestEvent
from augur.tasks.github.util.util import get_gitlab_repo_identifier
from augur.application.db.models import Issue, IssueEvent, PullRequest, PullRequestEvent, Repo
from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_session
from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth

Expand All @@ -24,7 +24,7 @@ def collect_gitlab_issue_events(repo_git) -> int:
repo_git: the repo url string
"""

owner, repo = get_owner_repo(repo_git)
owner, repo = Repo.parse_gitlab_repo_url(repo_git)

logger = logging.getLogger(collect_gitlab_issue_events.__name__)

Expand Down Expand Up @@ -52,7 +52,7 @@ def collect_gitlab_merge_request_events(repo_git) -> int:
repo_git: the repo url string
"""

owner, repo = get_owner_repo(repo_git)
owner, repo = Repo.parse_gitlab_repo_url(repo_git)

logger = logging.getLogger(collect_gitlab_issue_events.__name__)

Expand Down Expand Up @@ -82,11 +82,13 @@ def retrieve_all_gitlab_event_data(gtype, repo_git, logger, key_auth) -> None:
key_auth: key auth cache and rotator object
"""

owner, repo = get_owner_repo(repo_git)
owner, repo = Repo.parse_gitlab_repo_url(repo_git)

repo_identifier = get_gitlab_repo_identifier(owner, repo)

logger.info(f"Collecting gitlab issue events for {owner}/{repo}")

url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/events?target_type={gtype}"
url = f"https://gitlab.com/api/v4/projects/{repo_identifier}/events?target_type={gtype}"
events = GitlabApiHandler(key_auth, logger)

all_data = []
Expand Down
20 changes: 12 additions & 8 deletions augur/tasks/gitlab/issues_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask
from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler
from augur.application.db.data_parse import extract_needed_issue_data_from_gitlab_issue, extract_needed_gitlab_issue_label_data, extract_needed_gitlab_issue_assignee_data, extract_needed_gitlab_issue_message_ref_data, extract_needed_gitlab_message_data, extract_needed_gitlab_contributor_data
from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts
from augur.application.db.models import Issue, IssueLabel, IssueAssignee, IssueMessageRef, Message, Contributor
from augur.tasks.github.util.util import get_gitlab_repo_identifier, add_key_value_pair_to_dicts
from augur.application.db.models import Issue, IssueLabel, IssueAssignee, IssueMessageRef, Message, Contributor, Repo
from augur.tasks.util.worker_util import remove_duplicate_dicts
from augur.application.db.lib import bulk_insert_dicts, get_repo_by_repo_git, get_session
from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth
Expand All @@ -32,7 +32,7 @@ def collect_gitlab_issues(repo_git : str) -> int:
key_auth = GitlabRandomKeyAuth(logger)

try:
owner, repo = get_owner_repo(repo_git)
owner, repo = Repo.parse_gitlab_repo_url(repo_git)

issue_data = retrieve_all_gitlab_issue_data(repo_git, logger, key_auth)

Expand All @@ -57,11 +57,13 @@ def retrieve_all_gitlab_issue_data(repo_git, logger, key_auth) -> None:
key_auth: key auth cache and rotator object
"""

owner, repo = get_owner_repo(repo_git)
owner, repo = Repo.parse_gitlab_repo_url(repo_git)

repo_identifier = get_gitlab_repo_identifier(owner, repo)

logger.info(f"Collecting gitlab issues for {owner}/{repo}")

url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/issues?with_labels_details=True"
url = f"https://gitlab.com/api/v4/projects/{repo_identifier}/issues?with_labels_details=True"
issues = GitlabApiHandler(key_auth, logger)

all_data = []
Expand Down Expand Up @@ -207,7 +209,7 @@ def collect_gitlab_issue_comments(issue_ids, repo_git) -> int:
repo_git: repo url
"""

owner, repo = get_owner_repo(repo_git)
owner, repo = Repo.parse_gitlab_repo_url(repo_git)

logger = logging.getLogger(collect_gitlab_issues.__name__)

Expand Down Expand Up @@ -237,7 +239,9 @@ def retrieve_all_gitlab_issue_comments(key_auth, logger, issue_ids, repo_git):
repo_git: repo url
"""

owner, repo = get_owner_repo(repo_git)
owner, repo = Repo.parse_gitlab_repo_url(repo_git)

repo_identifier = get_gitlab_repo_identifier(owner, repo)

all_comments = {}
issue_count = len(issue_ids)
Expand All @@ -249,7 +253,7 @@ def retrieve_all_gitlab_issue_comments(key_auth, logger, issue_ids, repo_git):

logger.info(f"Collecting {owner}/{repo} gitlab issue comments for issue {index} of {issue_count}")

url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/issues/{id}/notes"
url = f"https://gitlab.com/api/v4/projects/{repo_identifier}/issues/{id}/notes"

for page_data, _ in comments.iter_pages(url):

Expand Down
40 changes: 26 additions & 14 deletions augur/tasks/gitlab/merge_request_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask
from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler
from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data, extract_needed_mr_reviewer_data, extract_needed_mr_commit_data, extract_needed_mr_file_data, extract_needed_mr_metadata, extract_needed_gitlab_mr_message_ref_data, extract_needed_gitlab_message_data, extract_needed_gitlab_contributor_data
from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts
from augur.tasks.github.util.util import get_gitlab_repo_identifier, add_key_value_pair_to_dicts
from augur.application.db.models import PullRequest, PullRequestLabel, PullRequestMeta, PullRequestCommit, PullRequestFile, PullRequestMessageRef, Repo, Message, Contributor, PullRequestAssignee
from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth
from augur.tasks.util.worker_util import remove_duplicate_dicts
Expand All @@ -26,7 +26,7 @@ def collect_gitlab_merge_requests(repo_git: str) -> int:

repo_id = get_repo_by_repo_git(repo_git).repo_id

owner, repo = get_owner_repo(repo_git)
owner, repo = Repo.parse_gitlab_repo_url(repo_git)

key_auth = GitlabRandomKeyAuth(logger)

Expand All @@ -51,11 +51,13 @@ def retrieve_all_mr_data(repo_git: str, logger, key_auth) -> None:
key_auth: key auth cache and rotator object
"""

owner, repo = get_owner_repo(repo_git)
owner, repo = Repo.parse_gitlab_repo_url(repo_git)

repo_identifier = get_gitlab_repo_identifier(owner, repo)

logger.info(f"Collecting pull requests for {owner}/{repo}")

url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests?with_labels_details=True"
url = f"https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests?with_labels_details=True"
mrs = GitlabApiHandler(key_auth, logger)

all_data = []
Expand Down Expand Up @@ -171,15 +173,17 @@ def collect_merge_request_comments(mr_ids, repo_git) -> int:
repo_git: the repo url string
"""

owner, repo = get_owner_repo(repo_git)
owner, repo = Repo.parse_gitlab_repo_url(repo_git)

logger = logging.getLogger(collect_merge_request_comments.__name__)

repo_identifier = get_gitlab_repo_identifier(owner, repo)

repo_id = get_repo_by_repo_git(repo_git).repo_id

key_auth = GitlabRandomKeyAuth(logger)

url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/notes".format(owner=owner, repo=repo, id="{id}")
url = "https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests/{id}/notes".format(repo_identifier=repo_identifier, id="{id}")
comments = retrieve_merge_request_data(mr_ids, url, "comments", owner, repo, key_auth, logger, response_type="list")

with get_session() as session:
Expand Down Expand Up @@ -282,15 +286,17 @@ def collect_merge_request_metadata(mr_ids, repo_git) -> int:
repo_git: the repo url string
"""

owner, repo = get_owner_repo(repo_git)
owner, repo = Repo.parse_gitlab_repo_url(repo_git)

repo_identifier = get_gitlab_repo_identifier(owner, repo)

logger = logging.getLogger(collect_merge_request_metadata.__name__)

repo_id = get_repo_by_repo_git(repo_git).repo_id

key_auth = GitlabRandomKeyAuth(logger)

url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}".format(owner=owner, repo=repo, id="{id}")
url = "https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests/{id}".format(repo_identifier=repo_identifier, id="{id}")
metadata_list = retrieve_merge_request_data(mr_ids, url, "metadata", owner, repo, key_auth, logger, response_type="dict")

with get_session() as session:
Expand Down Expand Up @@ -347,15 +353,17 @@ def collect_merge_request_reviewers(mr_ids, repo_git) -> int:
repo_git: the repo url string
"""

owner, repo = get_owner_repo(repo_git)
owner, repo = Repo.parse_gitlab_repo_url(repo_git)

repo_identifier = get_gitlab_repo_identifier(owner, repo)

logger = logging.getLogger(collect_merge_request_reviewers.__name__)

repo_id = get_repo_by_repo_git(repo_git).repo_id

key_auth = GitlabRandomKeyAuth(logger)

url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/approvals".format(owner=owner, repo=repo, id="{id}")
url = "https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests/{id}/approvals".format(repo_identifier=repo_identifier, id="{id}")
reviewers = retrieve_merge_request_data(mr_ids, url, "reviewers", owner, repo, key_auth, logger, response_type="dict")

with get_session() as session:
Expand Down Expand Up @@ -414,15 +422,17 @@ def collect_merge_request_commits(mr_ids, repo_git) -> int:
repo_git: the repo url string
"""

owner, repo = get_owner_repo(repo_git)
owner, repo = Repo.parse_gitlab_repo_url(repo_git)

repo_identifier = get_gitlab_repo_identifier(owner, repo)

logger = logging.getLogger(collect_merge_request_commits.__name__)

repo_id = get_repo_by_repo_git(repo_git).repo_id

key_auth = GitlabRandomKeyAuth(logger)

url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/commits".format(owner=owner, repo=repo, id="{id}")
url = "https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests/{id}/commits".format(repo_identifier=repo_identifier, id="{id}")
commits = retrieve_merge_request_data(mr_ids, url, "commits", owner, repo, key_auth, logger, response_type="list")

with get_session() as session:
Expand Down Expand Up @@ -484,13 +494,15 @@ def collect_merge_request_files(mr_ids, repo_git) -> int:

logger = logging.getLogger(collect_merge_request_files.__name__)

owner, repo = get_owner_repo(repo_git)
owner, repo = Repo.parse_gitlab_repo_url(repo_git)

repo_identifier = get_gitlab_repo_identifier(owner, repo)

repo_id = get_repo_by_repo_git(repo_git).repo_id

key_auth = GitlabRandomKeyAuth(logger)

url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/changes".format(owner=owner, repo=repo, id="{id}")
url = "https://gitlab.com/api/v4/projects/{repo_identifier}/merge_requests/{id}/changes".format(repo_identifier=repo_identifier, id="{id}")
files = retrieve_merge_request_data(mr_ids, url, "files", owner, repo, key_auth, logger, response_type="dict")

with get_session() as session:
Expand Down
10 changes: 3 additions & 7 deletions augur/tasks/init/celery_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,9 +217,9 @@ def setup_periodic_tasks(sender, **kwargs):
sender.add_periodic_task(collection_interval, augur_collection_monitor.s())

#Do longer tasks less often
non_domain_collection_interval = collection_interval * 300
logger.info(f"Scheduling non-repo-domain collection every {non_domain_collection_interval/60} minutes")
sender.add_periodic_task(non_domain_collection_interval, non_repo_domain_tasks.s())
logger.info(f"Scheduling data analysis every 30 days")
thirty_days_in_seconds = 30*24*60*60
sender.add_periodic_task(thirty_days_in_seconds, non_repo_domain_tasks.s())

mat_views_interval = int(config.get_value('Celery', 'refresh_materialized_views_interval_in_days'))
logger.info(f"Scheduling refresh materialized view every night at 1am CDT")
Expand All @@ -231,10 +231,6 @@ def setup_periodic_tasks(sender, **kwargs):
logger.info(f"Setting 404 repos to be marked for retry on midnight each day")
sender.add_periodic_task(crontab(hour=0, minute=0),retry_errored_repos.s())

logger.info(f"Scheduling contributor breadth every 30 days")
thirty_days_in_seconds = 30*24*60*60
sender.add_periodic_task(thirty_days_in_seconds, contributor_breadth_model.s())

@after_setup_logger.connect
def setup_loggers(*args,**kwargs):
"""Override Celery loggers with our own."""
Expand Down
6 changes: 3 additions & 3 deletions metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

__short_description__ = "Python 3 package for free/libre and open-source software community metrics, models & data collection"

__version__ = "0.76.2"
__release__ = "v0.76.2 (Pumpkin Space)"
__version__ = "0.76.3"
__release__ = "v0.76.3 (Pumpkin Laser)"

__license__ = "MIT"
__copyright__ = "University of Missouri, University of Nebraska-Omaha, CHAOSS, Brian Warner & Augurlabs 2024"
# Copyright year: keep in sync with the release year, not a future date.
__copyright__ = "University of Missouri, University of Nebraska-Omaha, CHAOSS, Brian Warner & Augurlabs 2024"

0 comments on commit 6a6dbff

Please sign in to comment.