Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GitHub data access pr migration #2864

Merged
merged 4 commits into from
Jul 9, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 17 additions & 56 deletions augur/tasks/github/pull_requests/tasks.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import logging

Check warning on line 1 in augur/tasks/github/pull_requests/tasks.py

View workflow job for this annotation

GitHub Actions / runner / pylint

[pylint] reported by reviewdog 🐶 C0114: Missing module docstring (missing-module-docstring) Raw Output: augur/tasks/github/pull_requests/tasks.py:1:0: C0114: Missing module docstring (missing-module-docstring)
from datetime import datetime, timedelta, timezone

from augur.tasks.github.pull_requests.core import extract_data_from_pr_list
from augur.tasks.init.celery_app import celery_app as celery
from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask, AugurSecondaryRepoCollectionTask
from augur.application.db.data_parse import *

Check warning on line 7 in augur/tasks/github/pull_requests/tasks.py

View workflow job for this annotation

GitHub Actions / runner / pylint

[pylint] reported by reviewdog 🐶 W0401: Wildcard import augur.application.db.data_parse (wildcard-import) Raw Output: augur/tasks/github/pull_requests/tasks.py:7:0: W0401: Wildcard import augur.application.db.data_parse (wildcard-import)
from augur.tasks.github.util.github_data_access import GithubDataAccess
from augur.tasks.github.util.github_paginator import GithubPaginator
from augur.tasks.util.worker_util import remove_duplicate_dicts
Expand All @@ -28,7 +28,6 @@
logger = logging.getLogger(collect_pull_requests.__name__)

with GithubTaskManifest(logger) as manifest:
#with GithubTaskManifest() as manifest:

augur_db = manifest.augur_db

Expand All @@ -50,37 +49,37 @@
all_data.append(pr)

if len(all_data) >= 1000:
process_pull_requests(all_data, f"{owner}/{repo}: Pr task", repo_id, logger, augur_db)
process_pull_requests(all_data, f"{owner}/{repo}: Github Pr task", repo_id, logger, augur_db)
total_count += len(all_data)
all_data.clear()

if len(all_data):
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[pylint] reported by reviewdog 🐶
C1802: Do not use len(SEQUENCE) without comparison to determine if a sequence is empty (use-implicit-booleaness-not-len)

process_pull_requests(all_data, f"{owner}/{repo}: Pr task", repo_id, logger, augur_db)
process_pull_requests(all_data, f"{owner}/{repo}: Github Pr task", repo_id, logger, augur_db)
total_count += len(all_data)

if total_count > 0:
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[pylint] reported by reviewdog 🐶
R1705: Unnecessary "else" after "return", remove the "else" and de-indent the code inside it (no-else-return)

return total_count
else:
logger.info(f"{owner}/{repo} has no pull requests")
logger.debug(f"{owner}/{repo} has no pull requests")
return 0



# TODO: Rename pull_request_reviewers table to pull_request_requested_reviewers

Check warning on line 68 in augur/tasks/github/pull_requests/tasks.py

View workflow job for this annotation

GitHub Actions / runner / pylint

[pylint] reported by reviewdog 🐶 W0511: TODO: Rename pull_request_reviewers table to pull_request_requested_reviewers (fixme) Raw Output: augur/tasks/github/pull_requests/tasks.py:68:1: W0511: TODO: Rename pull_request_reviewers table to pull_request_requested_reviewers (fixme)
# TODO: Fix column names in pull request labels table

Check warning on line 69 in augur/tasks/github/pull_requests/tasks.py

View workflow job for this annotation

GitHub Actions / runner / pylint

[pylint] reported by reviewdog 🐶 W0511: TODO: Fix column names in pull request labels table (fixme) Raw Output: augur/tasks/github/pull_requests/tasks.py:69:1: W0511: TODO: Fix column names in pull request labels table (fixme)
def retrieve_all_pr_data(repo_git: str, logger, key_auth, since): #-> Generator[List[Dict]]:

owner, repo = get_owner_repo(repo_git)

logger.info(f"Collecting pull requests for {owner}/{repo}")
logger.debug(f"Collecting pull requests for {owner}/{repo}")

url = f"https://api.github.com/repos/{owner}/{repo}/pulls?state=all&direction=desc&sort=updated"

github_data_access = GithubDataAccess(key_auth, logger)

num_pages = github_data_access.get_resource_page_count(url)

logger.info(f"{owner}/{repo}: Retrieving {num_pages} pages of pull requests")
logger.debug(f"{owner}/{repo}: Retrieving {num_pages} pages of pull requests")

# returns a generator so this method can be used by doing for x in retrieve_all_pr_data()

Expand All @@ -92,7 +91,7 @@
if since and datetime.fromisoformat(pr["updated_at"].replace("Z", "+00:00")).replace(tzinfo=timezone.utc) < since:
return

def process_pull_requests(pull_requests, task_name, repo_id, logger, augur_db):

Check warning on line 94 in augur/tasks/github/pull_requests/tasks.py

View workflow job for this annotation

GitHub Actions / runner / pylint

[pylint] reported by reviewdog 🐶 R0914: Too many local variables (32/30) (too-many-locals) Raw Output: augur/tasks/github/pull_requests/tasks.py:94:0: R0914: Too many local variables (32/30) (too-many-locals)
"""
Parse and insert all retrieved PR data.

Expand All @@ -107,7 +106,7 @@
tool_version = "2.0"
data_source = "Github API"

pr_dicts, pr_mapping_data, pr_numbers, contributors = extract_data_from_pr_list(pull_requests, repo_id, tool_source, tool_version, data_source)

Check warning on line 109 in augur/tasks/github/pull_requests/tasks.py

View workflow job for this annotation

GitHub Actions / runner / pylint

[pylint] reported by reviewdog 🐶 W0612: Unused variable 'pr_numbers' (unused-variable) Raw Output: augur/tasks/github/pull_requests/tasks.py:109:31: W0612: Unused variable 'pr_numbers' (unused-variable)

# remove duplicate contributors before inserting
contributors = remove_duplicate_dicts(contributors)
Expand Down Expand Up @@ -221,14 +220,14 @@
return pr_review_cntrb

@celery.task(base=AugurSecondaryRepoCollectionTask)
def collect_pull_request_review_comments(repo_git: str) -> None:

Check warning on line 223 in augur/tasks/github/pull_requests/tasks.py

View workflow job for this annotation

GitHub Actions / runner / pylint

[pylint] reported by reviewdog 🐶 R0914: Too many local variables (34/30) (too-many-locals) Raw Output: augur/tasks/github/pull_requests/tasks.py:223:0: R0914: Too many local variables (34/30) (too-many-locals)

Check warning on line 223 in augur/tasks/github/pull_requests/tasks.py

View workflow job for this annotation

GitHub Actions / runner / pylint

[pylint] reported by reviewdog 🐶 R0915: Too many statements (52/50) (too-many-statements) Raw Output: augur/tasks/github/pull_requests/tasks.py:223:0: R0915: Too many statements (52/50) (too-many-statements)

owner, repo = get_owner_repo(repo_git)

review_msg_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/comments"

logger = logging.getLogger(collect_pull_request_review_comments.__name__)
logger.info(f"Collecting pull request review comments for {owner}/{repo}")
logger.debug(f"Collecting pull request review comments for {owner}/{repo}")

repo_id = get_repo_by_repo_git(repo_git).repo_id

Expand All @@ -245,23 +244,9 @@
data_source = "Github API"

key_auth = GithubRandomKeyAuth(logger)
pr_review_messages = GithubPaginator(review_msg_url, key_auth, logger)
num_pages = pr_review_messages.get_num_pages()

all_raw_pr_review_messages = []
for page_data, page in pr_review_messages.iter_pages():

if page_data is None:
break

if len(page_data) == 0:
logger.debug(f"{owner}/{repo} Pr Review Messages Page {page} contains no data...returning")
logger.info(f"{owner}/{repo} Pr Review Messages Page {page} of {num_pages}")
break

logger.info(f"{owner}/{repo} Pr Review Messages Page {page} of {num_pages}")
github_data_access = GithubDataAccess(key_auth, logger)

all_raw_pr_review_messages += page_data
all_raw_pr_review_messages = list(github_data_access.paginate_resource(review_msg_url))

contributors = []
for comment in all_raw_pr_review_messages:
Expand All @@ -278,8 +263,7 @@
pr_review_msg_mapping_data = {}

pr_review_comments_len = len(all_raw_pr_review_messages)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[pylint] reported by reviewdog 🐶
W0612: Unused variable 'pr_review_comments_len' (unused-variable)

logger.info(f"{owner}/{repo}: Pr review comments len: {pr_review_comments_len}")
for index, comment in enumerate(all_raw_pr_review_messages):
for comment in all_raw_pr_review_messages:

# pull_request_review_id is required to map it to the correct pr review
if not comment["pull_request_review_id"]:
Expand Down Expand Up @@ -319,9 +303,7 @@
try:
augur_pr_review_id = pr_review_id_mapping[github_pr_review_id]
except KeyError:
logger.info(f"{owner}/{repo}: Could not find related pr review")
logger.info(f"{owner}/{repo}: We were searching for pr review with id: {github_pr_review_id}")
logger.info("Skipping")
logger.warning(f"{owner}/{repo}: Could not find related pr review. We were searching for pr review with id: {github_pr_review_id}")
continue

pr_review_message_ref = extract_pr_review_message_ref_data(comment, augur_pr_review_id, github_pr_review_id, repo_id, tool_version, data_source)
Expand Down Expand Up @@ -364,50 +346,30 @@

pr_count = len(prs)

github_data_access = GithubDataAccess(manifest.key_auth, logger)

all_pr_reviews = {}
for index, pr in enumerate(prs):

pr_number = pr.pr_src_number
pull_request_id = pr.pull_request_id

logger.info(f"{owner}/{repo} Collecting Pr Reviews for pr {index + 1} of {pr_count}")
logger.debug(f"{owner}/{repo} Collecting Pr Reviews for pr {index + 1} of {pr_count}")

pr_review_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/reviews"

pr_reviews = []
pr_reviews_generator = GithubPaginator(pr_review_url, manifest.key_auth, logger)
for page_data, page in pr_reviews_generator.iter_pages():

if page_data is None:
break

if len(page_data) == 0:
break

if isinstance(page_data, list):
page_data = [
element.decode('utf-8').replace('\x00', ' ') if isinstance(element, bytes) else element
for element in page_data
]
logger.info(f"NUL characters were found in PR Reviews and replaced with spaces.")
elif isinstance(page_data, bytes):
page_data = page_data.decode('utf-8').replace('\x00', ' ')
logger.info(f"NUL characters were found in PR Reviews and replaced with spaces.")


pr_reviews.extend(page_data)
pr_reviews = list(github_data_access.paginate_resource(pr_review_url))

if pr_reviews:
all_pr_reviews[pull_request_id] = pr_reviews

if not list(all_pr_reviews.keys()):
logger.info(f"{owner}/{repo} No pr reviews for repo")
logger.debug(f"{owner}/{repo} No pr reviews for repo")
return

contributors = []
for pull_request_id in all_pr_reviews.keys():
for pull_request_id, reviews in all_pr_reviews.items():

reviews = all_pr_reviews[pull_request_id]
for review in reviews:
contributor = process_pull_request_review_contributor(review, tool_source, tool_version, data_source)
if contributor:
Expand All @@ -418,9 +380,8 @@


pr_reviews = []
for pull_request_id in all_pr_reviews.keys():
for pull_request_id, reviews in all_pr_reviews.items():

reviews = all_pr_reviews[pull_request_id]
for review in reviews:

if "cntrb_id" in review:
Expand Down
Loading