diff --git a/.github/workflows/build_docker.yml b/.github/workflows/build_docker.yml index 0cf2441838..b29ab2ed89 100644 --- a/.github/workflows/build_docker.yml +++ b/.github/workflows/build_docker.yml @@ -3,9 +3,11 @@ on: push: branches: - main + - dev pull_request: branches: - main + - dev release: types: - published diff --git a/README.md b/README.md index 02ec125fb6..c0c99157cb 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Augur NEW Release v0.70.0 +# Augur NEW Release v0.71.0 Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data. Less data carpentry for everyone else! The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot) ... A public instance of 8Knot is available at https://metrix.chaoss.io ... That is tied to a public instance of Augur at https://ai.chaoss.io @@ -10,7 +10,7 @@ The primary way of looking at Augur data is through [8Knot](https://github.com/o ## NEW RELEASE ALERT! ### [If you want to jump right in, updated docker build/compose and bare metal installation instructions are available here](docs/new-install.md) -Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.70.0 +Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.71.0 - The `main` branch is a stable version of our new architecture, which features: - Dramatic improvement in the speed of large scale data collection (100,000+ repos). All data is obtained for 100k+ repos within 2 weeks. 
diff --git a/augur/application/db/lib.py b/augur/application/db/lib.py index 0b4ebbdd6b..35dee313b6 100644 --- a/augur/application/db/lib.py +++ b/augur/application/db/lib.py @@ -305,8 +305,8 @@ def bulk_insert_dicts(logger, data: Union[List[dict], dict], table, natural_keys # print(str(stmnt.compile(dialect=postgresql.dialect()))) attempts = 0 - # creates list from 1 to 10 - sleep_time_list = list(range(1,11)) + # creates list from 1 to 10 / changed to 10-30 because deadlocks are taking longer + sleep_time_list = list(range(10,30)) deadlock_detected = False engine = get_engine() diff --git a/augur/application/logs.py b/augur/application/logs.py index 11e1cb6ea5..0d6649ce48 100644 --- a/augur/application/logs.py +++ b/augur/application/logs.py @@ -36,12 +36,29 @@ def getFormatter(logLevel): return logging.Formatter(fmt=ERROR_FORMAT_STRING) # create a file handler and set the format and log level -def create_file_handler(file, formatter, level): - handler = FileHandler(filename=file, mode='a') - handler.setFormatter(fmt=formatter) - handler.setLevel(level) +# def create_file_handler(file, formatter, level): +# handler = FileHandler(filename=file, mode='a') +# handler.setFormatter(fmt=formatter) +# handler.setLevel(level) + +# return handler - return handler +def create_file_handler(file, formatter, level): + try: + # Ensure the directory exists + directory = os.path.dirname(file) + if not os.path.exists(directory): + os.makedirs(directory) + + # Create the file handler + handler = logging.FileHandler(filename=file, mode='a') + handler.setFormatter(formatter) + handler.setLevel(level) + + return handler + except Exception as e: + print(f"Failed to create file handler: {e}") + return None # function to create two file handlers and add them to a logger def initialize_file_handlers(logger, file, log_level): diff --git a/augur/tasks/git/dependency_libyear_tasks/libyear_util/npm_libyear_utils.py b/augur/tasks/git/dependency_libyear_tasks/libyear_util/npm_libyear_utils.py 
index bcfe810a9c..9dfa10b4ea 100644 --- a/augur/tasks/git/dependency_libyear_tasks/libyear_util/npm_libyear_utils.py +++ b/augur/tasks/git/dependency_libyear_tasks/libyear_util/npm_libyear_utils.py @@ -1,4 +1,7 @@ import requests +import logging + +logger = logging.getLogger(__name__) def get_NPM_data(package): url = "https://registry.npmjs.org/%s" % package @@ -42,10 +45,16 @@ def get_latest_patch(version, data): def get_lastest_minor(version, data): - versions = data['versions'] + try: + versions = data['versions'] + except Exception as e: + logger.info(f'error is {e} on the NPM. Hey, its NODEJS, of course it does not work :D ') + raise e + try: index = list(versions.keys()).index(version) except ValueError as e: + logger.info(f'error is {e} on the NPM. Some kind of value error. Probably a VALUES error for Node, #AmIRight?') raise e major,minor,patch = split_version(version) diff --git a/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_parser.py b/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_parser.py index 7aaaf1f190..dab06b1a09 100644 --- a/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_parser.py +++ b/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_parser.py @@ -160,7 +160,14 @@ def parse_conda(file_handle): pip = None if not contents: return [] - dependencies = contents['dependencies'] + #dependencies = contents['dependencies'] + dependencies = contents.get('dependencies', []) + + if not dependencies: + print("No dependencies found.") + return [] + else: + print("Dependencies found.") for dep in dependencies: if (type(dep) is dict) and dep['pip']: pip = dep diff --git a/augur/tasks/git/dependency_tasks/core.py b/augur/tasks/git/dependency_tasks/core.py index 167a450f4b..8b76033baa 100644 --- a/augur/tasks/git/dependency_tasks/core.py +++ b/augur/tasks/git/dependency_tasks/core.py @@ -6,6 +6,7 @@ from augur.tasks.git.dependency_tasks.dependency_util import dependency_calculator as dep_calc from 
augur.tasks.util.worker_util import parse_json_from_subprocess_call from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth def generate_deps_data(logger, repo_git): @@ -82,10 +83,22 @@ def generate_scorecard(logger, repo_git): #setting the environmental variable which is required by scorecard with get_session() as session: - + #key_handler = GithubRandomKeyAuth(logger) key_handler = GithubApiKeyHandler(logger) os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key() + + # This seems outdated + #setting the environmental variable which is required by scorecard + #key_handler = GithubApiKeyHandler(session, session.logger) + #os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key() + try: + required_output = parse_json_from_subprocess_call(logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard) + except Exception as e: + session.logger.error(f"Could not parse required output! 
Error: {e}") + raise e + + # end logger.info('adding to database...') logger.debug(f"output: {required_output}") diff --git a/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py b/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py index 8e7bb1a6e7..d92f17b692 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py @@ -397,7 +397,8 @@ def rebuild_unknown_affiliation_and_web_caches(facade_helper): # ("DELETE c.* FROM dm_repo_group_weekly c " # "JOIN repo_groups p ON c.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - execute_sql(clear_dm_repo_group_weekly) + +# session.execute_sql(clear_dm_repo_group_weekly) clear_dm_repo_group_monthly = s.sql.text(""" DELETE @@ -411,7 +412,8 @@ def rebuild_unknown_affiliation_and_web_caches(facade_helper): # ("DELETE c.* FROM dm_repo_group_monthly c " # "JOIN repo_groups p ON c.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - execute_sql(clear_dm_repo_group_monthly) + +# session.execute_sql(clear_dm_repo_group_monthly) clear_dm_repo_group_annual = s.sql.text(""" DELETE @@ -425,7 +427,7 @@ def rebuild_unknown_affiliation_and_web_caches(facade_helper): # ("DELETE c.* FROM dm_repo_group_annual c " # "JOIN repo_groups p ON c.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - execute_sql(clear_dm_repo_group_annual) +# session.execute_sql(clear_dm_repo_group_annual) clear_dm_repo_weekly = s.sql.text(""" DELETE @@ -442,7 +444,7 @@ def rebuild_unknown_affiliation_and_web_caches(facade_helper): # "JOIN repo r ON c.repo_id = r.repo_id " # "JOIN repo_groups p ON r.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - execute_sql(clear_dm_repo_weekly) +# session.execute_sql(clear_dm_repo_weekly) clear_dm_repo_monthly = s.sql.text(""" DELETE @@ -459,7 +461,7 @@ def rebuild_unknown_affiliation_and_web_caches(facade_helper): # "JOIN repo r ON c.repo_id = r.repo_id " # "JOIN 
repo_groups p ON r.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - execute_sql(clear_dm_repo_monthly) +# session.execute_sql(clear_dm_repo_monthly) clear_dm_repo_annual = s.sql.text(""" DELETE @@ -476,7 +478,7 @@ def rebuild_unknown_affiliation_and_web_caches(facade_helper): # "JOIN repo r ON c.repo_id = r.repo_id " # "JOIN repo_groups p ON r.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - execute_sql(clear_dm_repo_annual) +# session.execute_sql(clear_dm_repo_annual) clear_unknown_cache = s.sql.text(""" DELETE @@ -574,7 +576,7 @@ def rebuild_unknown_affiliation_and_web_caches(facade_helper): "r.repo_group_id, info.a, info.b, info.c") ).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) - execute_sql(cache_projects_by_week) +# session.execute_sql(cache_projects_by_week) cache_projects_by_month = s.sql.text( ("INSERT INTO dm_repo_group_monthly (repo_group_id, email, affiliation, month, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source) " @@ -610,7 +612,7 @@ def rebuild_unknown_affiliation_and_web_caches(facade_helper): "r.repo_group_id, info.a, info.b, info.c" )).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) - execute_sql(cache_projects_by_month) +# session.execute_sql(cache_projects_by_month) cache_projects_by_year = s.sql.text(( "INSERT INTO dm_repo_group_annual (repo_group_id, email, affiliation, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source) " @@ -650,7 +652,7 @@ def rebuild_unknown_affiliation_and_web_caches(facade_helper): - execute_sql(cache_projects_by_year) + # session.execute_sql(cache_projects_by_year) # Start caching by repo facade_helper.log_activity('Verbose','Caching repos') @@ -690,7 +692,7 @@ def rebuild_unknown_affiliation_and_web_caches(facade_helper): "a.repo_id, info.a, 
info.b, info.c" )).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) - execute_sql(cache_repos_by_week) +# session.execute_sql(cache_repos_by_week) cache_repos_by_month = s.sql.text(( "INSERT INTO dm_repo_monthly (repo_id, email, affiliation, month, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source)" @@ -726,7 +728,7 @@ def rebuild_unknown_affiliation_and_web_caches(facade_helper): "a.repo_id, info.a, info.b, info.c" )).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) - execute_sql(cache_repos_by_month) +# session.execute_sql(cache_repos_by_month) cache_repos_by_year = s.sql.text(( "INSERT INTO dm_repo_annual (repo_id, email, affiliation, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source)" @@ -760,7 +762,7 @@ def rebuild_unknown_affiliation_and_web_caches(facade_helper): "a.repo_id, info.a, info.b, info.c" )).bindparams(tool_source=facade_helper.tool_source,tool_version=facade_helper.tool_version,data_source=facade_helper.data_source) - execute_sql(cache_repos_by_year) +# session.execute_sql(cache_repos_by_year) # Reset cache flags diff --git a/augur/tasks/github/messages/tasks.py b/augur/tasks/github/messages/tasks.py index bee7412489..3e104fc6dc 100644 --- a/augur/tasks/github/messages/tasks.py +++ b/augur/tasks/github/messages/tasks.py @@ -5,12 +5,12 @@ from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.application.db.data_parse import * from augur.tasks.github.util.github_paginator import GithubPaginator -from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth +from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import get_owner_repo -from 
augur.application.db.models import Message, PullRequestMessageRef, IssueMessageRef, Contributor -from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts, get_issues_by_repo_id, get_pull_requests_by_repo_id - +from augur.application.db.models import PullRequest, Message, Issue, PullRequestMessageRef, IssueMessageRef, Contributor, Repo, CollectionStatus +from augur.application.db import get_engine, get_session +from sqlalchemy.sql import text platform_id = 1 @@ -27,8 +27,8 @@ def collect_github_messages(repo_git: str) -> None: Repo.repo_git == repo_git).one().repo_id owner, repo = get_owner_repo(repo_git) - task_name = f"{owner}/{repo}: Message Task" + if is_repo_small(repo_id): message_data = fast_retrieve_all_pr_and_issue_messages(repo_git, logger, manifest.key_auth, task_name) @@ -133,7 +133,7 @@ def process_large_issue_and_pr_message_collection(repo_id, repo_git: str, logger process_messages(all_data, task_name, repo_id, logger, augur_db) -def process_messages(messages, task_name, repo_id, logger): +def process_messages(messages, task_name, repo_id, logger, augur_db): tool_source = "Pr comment task" tool_version = "2.0" @@ -152,13 +152,13 @@ def process_messages(messages, task_name, repo_id, logger): # create mapping from issue url to issue id of current issues issue_url_to_id_map = {} - issues = get_issues_by_repo_id(repo_id) + issues = augur_db.session.query(Issue).filter(Issue.repo_id == repo_id).all() for issue in issues: issue_url_to_id_map[issue.issue_url] = issue.issue_id # create mapping from pr url to pr id of current pull requests pr_issue_url_to_id_map = {} - prs = get_pull_requests_by_repo_id(repo_id) + prs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() for pr in prs: pr_issue_url_to_id_map[pr.pr_issue_url] = pr.pull_request_id @@ -229,13 +229,13 @@ def process_messages(messages, task_name, repo_id, logger): contributors = remove_duplicate_dicts(contributors) logger.info(f"{task_name}: Inserting 
{len(contributors)} contributors") - bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) + augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) logger.info(f"{task_name}: Inserting {len(message_dicts)} messages") message_natural_keys = ["platform_msg_id", "pltfrm_id"] message_return_columns = ["msg_id", "platform_msg_id"] message_string_fields = ["msg_text"] - message_return_data = bulk_insert_dicts(logger, message_dicts, Message, message_natural_keys, + message_return_data = augur_db.insert_data(message_dicts, Message, message_natural_keys, return_columns=message_return_columns, string_fields=message_string_fields) if message_return_data is None: return @@ -258,11 +258,11 @@ def process_messages(messages, task_name, repo_id, logger): logger.info(f"{task_name}: Inserting {len(pr_message_ref_dicts)} pr messages ref rows") pr_message_ref_natural_keys = ["pull_request_id", "pr_message_ref_src_comment_id"] - bulk_insert_dicts(logger, pr_message_ref_dicts, PullRequestMessageRef, pr_message_ref_natural_keys) + augur_db.insert_data(pr_message_ref_dicts, PullRequestMessageRef, pr_message_ref_natural_keys) logger.info(f"{task_name}: Inserting {len(issue_message_ref_dicts)} issue messages ref rows") issue_message_ref_natural_keys = ["issue_id", "issue_msg_ref_src_comment_id"] - bulk_insert_dicts(logger, issue_message_ref_dicts, IssueMessageRef, issue_message_ref_natural_keys) + augur_db.insert_data(issue_message_ref_dicts, IssueMessageRef, issue_message_ref_natural_keys) logger.info(f"{task_name}: Inserted {len(message_dicts)} messages. 
{len(issue_message_ref_dicts)} from issues and {len(pr_message_ref_dicts)} from prs") @@ -287,4 +287,4 @@ def process_github_comment_contributors(message, tool_source, tool_version, data # This is done by searching all the dicts for the given key that has the specified value def find_dict_in_list_of_dicts(data, key, value): - return next((item for item in data if item[key] == value), None) + return next((item for item in data if item[key] == value), None) \ No newline at end of file diff --git a/augur/tasks/github/pull_requests/files_model/core.py b/augur/tasks/github/pull_requests/files_model/core.py index 24cd81574e..fc57fecd58 100644 --- a/augur/tasks/github/pull_requests/files_model/core.py +++ b/augur/tasks/github/pull_requests/files_model/core.py @@ -2,22 +2,26 @@ from augur.tasks.github.util.gh_graphql_entities import GraphQlPageCollection from augur.application.db.models import * from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.lib import bulk_insert_dicts, execute_sql +from augur.application.db.util import execute_session_query +import traceback -def pull_request_files_model(repo,logger, key_auth): +def pull_request_files_model(repo_id,logger, augur_db, key_auth): # query existing PRs and the respective url we will append the commits url to pr_number_sql = s.sql.text(""" SELECT DISTINCT pr_src_number as pr_src_number, pull_requests.pull_request_id FROM pull_requests--, pull_request_meta WHERE repo_id = :repo_id - """).bindparams(repo_id=repo.repo_id) + """).bindparams(repo_id=repo_id) pr_numbers = [] #pd.read_sql(pr_number_sql, self.db, params={}) - result = execute_sql(pr_number_sql)#.fetchall() + result = augur_db.execute_sql(pr_number_sql)#.fetchall() pr_numbers = [dict(row) for row in result.mappings()] + query = augur_db.session.query(Repo).filter(Repo.repo_id == repo_id) + repo = execute_session_query(query, 'one') + owner, name = get_owner_repo(repo.repo_git) pr_file_rows = [] @@ -59,20 +63,31 @@ def 
pull_request_files_model(repo,logger, key_auth): 'values' : values } - + logger.debug(f"query: {query}; key_auth: {key_auth}; params: {params}") file_collection = GraphQlPageCollection(query, key_auth, logger,bind=params) - pr_file_rows += [{ - 'pull_request_id': pr_info['pull_request_id'], - 'pr_file_additions': pr_file['additions'] if 'additions' in pr_file else None, - 'pr_file_deletions': pr_file['deletions'] if 'deletions' in pr_file else None, - 'pr_file_path': pr_file['path'], - 'data_source': 'GitHub API', - 'repo_id': repo.repo_id, - } for pr_file in file_collection if pr_file and 'path' in pr_file] + logger.debug(f"Results of file_collection: {file_collection}") + + for pr_file in file_collection: + logger.debug(f"CHECK: {repr(file_collection)}") + if pr_file and 'path' in pr_file: + logger.debug(f"Checks out for {repr(pr_file)} and {repr(file_collection)}") + + try: + pr_file_rows += [{ + 'pull_request_id': pr_info['pull_request_id'], + 'pr_file_additions': pr_file['additions'] if 'additions' in pr_file else None, + 'pr_file_deletions': pr_file['deletions'] if 'deletions' in pr_file else None, + 'pr_file_path': pr_file['path'], + 'data_source': 'GitHub API', + 'repo_id': repo_id, + } for pr_file in file_collection if pr_file and 'path' in pr_file] + except Exception as e: + logger.error(f"PR Files Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") + if len(pr_file_rows) > 0: #Execute a bulk upsert with sqlalchemy pr_file_natural_keys = ["pull_request_id", "repo_id", "pr_file_path"] - bulk_insert_dicts(logger, pr_file_rows, PullRequestFile, pr_file_natural_keys) + augur_db.insert_data(pr_file_rows, PullRequestFile, pr_file_natural_keys) \ No newline at end of file diff --git a/augur/tasks/github/pull_requests/files_model/tasks.py b/augur/tasks/github/pull_requests/files_model/tasks.py index 134e05e900..988261f6c8 100644 --- a/augur/tasks/github/pull_requests/files_model/tasks.py +++ 
b/augur/tasks/github/pull_requests/files_model/tasks.py @@ -1,21 +1,18 @@ import logging from augur.tasks.github.pull_requests.files_model.core import * +from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurSecondaryRepoCollectionTask -from augur.application.db.lib import get_repo_by_repo_git -from augur.application.db import get_engine -from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth - +from augur.application.db.util import execute_session_query @celery.task(base=AugurSecondaryRepoCollectionTask) def process_pull_request_files(repo_git: str) -> None: - engine = get_engine() - logger = logging.getLogger(process_pull_request_files.__name__) - repo = get_repo_by_repo_git(repo_git) - - key_auth = GithubRandomKeyAuth(logger) + with GithubTaskManifest(logger) as manifest: + augur_db = manifest.augur_db + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo = execute_session_query(query, 'one') - pull_request_files_model(repo, logger, key_auth) \ No newline at end of file + pull_request_files_model(repo.repo_id, logger, augur_db, manifest.key_auth) \ No newline at end of file diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index 9ccd398478..08f70b89e9 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -7,7 +7,8 @@ from augur.tasks.github.util.github_paginator import GithubPaginator from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import add_key_value_pair_to_dicts, get_owner_repo -from augur.application.db.models import PullRequest, Message, PullRequestReview, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestAssignee, PullRequestReviewMessageRef, Contributor +from augur.application.db.models import PullRequest, Message, 
PullRequestReview, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestAssignee, PullRequestReviewMessageRef, Contributor, Repo +from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.application.db.lib import get_session, get_repo_by_repo_git, bulk_insert_dicts, get_pull_request_reviews_by_repo_id from augur.application.db.util import execute_session_query from ..messages.tasks import process_github_comment_contributors @@ -24,6 +25,7 @@ def collect_pull_requests(repo_git: str) -> int: logger = logging.getLogger(collect_pull_requests.__name__) with GithubTaskManifest(logger) as manifest: + #with GithubTaskManifest() as manifest: augur_db = manifest.augur_db @@ -82,8 +84,7 @@ def retrieve_all_pr_data(repo_git: str, logger, key_auth): #-> Generator[List[Di yield page_data - -def process_pull_requests(pull_requests, task_name, repo_id, logger): +def process_pull_requests(pull_requests, task_name, repo_id, logger, augur_db): """ Parse and insert all retrieved PR data. @@ -92,6 +93,7 @@ def process_pull_requests(pull_requests, task_name, repo_id, logger): task_name: Name of the calling task and the repo repo_id: augur id of the repository logger: logging object + augur_db: sqlalchemy db object """ tool_source = "Pr Task" tool_version = "2.0" @@ -104,7 +106,7 @@ def process_pull_requests(pull_requests, task_name, repo_id, logger): # insert contributors from these prs logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) + augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) # insert the prs into the pull_requests table. 
@@ -114,7 +116,7 @@ def process_pull_requests(pull_requests, task_name, repo_id, logger): pr_natural_keys = ["repo_id", "pr_src_id"] pr_return_columns = ["pull_request_id", "pr_url"] pr_string_fields = ["pr_src_title", "pr_body"] - pr_return_data = bulk_insert_dicts(logger, pr_dicts, PullRequest, pr_natural_keys, + pr_return_data = augur_db.insert_data(pr_dicts, PullRequest, pr_natural_keys, return_columns=pr_return_columns, string_fields=pr_string_fields) if pr_return_data is None: @@ -153,24 +155,24 @@ def process_pull_requests(pull_requests, task_name, repo_id, logger): # we are using pr_src_id and pull_request_id to determine if the label is already in the database. pr_label_natural_keys = ['pr_src_id', 'pull_request_id'] pr_label_string_fields = ["pr_src_description"] - bulk_insert_dicts(logger, pr_label_dicts, PullRequestLabel, pr_label_natural_keys, string_fields=pr_label_string_fields) + augur_db.insert_data(pr_label_dicts, PullRequestLabel, pr_label_natural_keys, string_fields=pr_label_string_fields) # inserting pr assignees # we are using pr_assignee_src_id and pull_request_id to determine if the label is already in the database. pr_assignee_natural_keys = ['pr_assignee_src_id', 'pull_request_id'] - bulk_insert_dicts(logger, pr_assignee_dicts, PullRequestAssignee, pr_assignee_natural_keys) + augur_db.insert_data(pr_assignee_dicts, PullRequestAssignee, pr_assignee_natural_keys) # inserting pr requested reviewers # we are using pr_src_id and pull_request_id to determine if the label is already in the database. pr_reviewer_natural_keys = ["pull_request_id", "pr_reviewer_src_id"] - bulk_insert_dicts(logger, pr_reviewer_dicts, PullRequestReviewer, pr_reviewer_natural_keys) + augur_db.insert_data(pr_reviewer_dicts, PullRequestReviewer, pr_reviewer_natural_keys) # inserting pr metadata # we are using pull_request_id, pr_head_or_base, and pr_sha to determine if the label is already in the database. 
pr_metadata_natural_keys = ['pull_request_id', 'pr_head_or_base', 'pr_sha'] pr_metadata_string_fields = ["pr_src_meta_label"] - bulk_insert_dicts(logger, pr_metadata_dicts, PullRequestMeta, + augur_db.insert_data(pr_metadata_dicts, PullRequestMeta, pr_metadata_natural_keys, string_fields=pr_metadata_string_fields) @@ -187,6 +189,11 @@ def process_pull_requests(pull_requests, task_name, repo_id, logger): + + + + + diff --git a/augur/tasks/github/traffic/tasks.py b/augur/tasks/github/traffic/tasks.py index 573c691301..4101faa3ff 100644 --- a/augur/tasks/github/traffic/tasks.py +++ b/augur/tasks/github/traffic/tasks.py @@ -32,27 +32,27 @@ def collect_github_repo_clones_data(repo_git: str) -> None: logger.info(f"{owner}/{repo} has no clones") def retrieve_all_clones_data(repo_git: str, logger, key_auth): - owner, repo = get_owner_repo(repo_git) + # owner, repo = get_owner_repo(repo_git) - url = f"https://api.github.com/repos/{owner}/{repo}/traffic/clones" + # url = f"https://api.github.com/repos/{owner}/{repo}/traffic/clones" - clones = GithubPaginator(url, key_auth, logger) + # clones = GithubPaginator(url, key_auth, logger) - num_pages = clones.get_num_pages() + # num_pages = clones.get_num_pages() all_data = [] - for page_data, page in clones.iter_pages(): + # for page_data, page in clones.iter_pages(): - if page_data is None: - return all_data + # if page_data is None: + # return all_data - elif len(page_data) == 0: - logger.debug(f"{repo.capitalize()} Traffic Page {page} contains no data...returning") - logger.info(f"Traffic Page {page} of {num_pages}") - return all_data + # elif len(page_data) == 0: + # logger.debug(f"{repo.capitalize()} Traffic Page {page} contains no data...returning") + # logger.info(f"Traffic Page {page} of {num_pages}") + # return all_data - logger.info(f"{repo} Traffic Page {page} of {num_pages}") + # logger.info(f"{repo} Traffic Page {page} of {num_pages}") - all_data += page_data + # all_data += page_data return all_data diff --git 
a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index 574adbbaf0..0667ab3315 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -250,9 +250,11 @@ def hit_api(self,query,variables={}): def extract_paginate_result(self,responseDict): if not responseDict: + self.logger.error(f"DEBUG CHECK THIS {responseDict}") raise TimeoutError("No data received from endpoint.") #err = process_graphql_dict_response(self.logger, responseObject, response) if 'data' not in responseDict: + self.logger.error(f"DEBUG CHECK THIS {responseDict}") self.logger.error(responseDict) raise KeyError @@ -293,6 +295,8 @@ def __getitem__(self, index):# -> dict: #extract the content from the graphql query result coreData = self.extract_paginate_result(data) + self.logger.debug(f"for page in range 298: {data}") + content = [data['node'] for data in list(coreData['edges'])] if self.repaginate: @@ -323,6 +327,8 @@ def __len__(self): data = self.request_graphql_dict(variables=params) coreData = self.extract_paginate_result(data) + self.logger.debug(f"__len__: debug: {data}") + totalCount = int(coreData['totalCount']) return totalCount @@ -342,7 +348,7 @@ def __iter__(self): coreData = self.extract_paginate_result(data) if coreData is not None: if coreData.get('totalCount') is not None: - self.logger.info("... core data obtained") + self.logger.info("debug-gog: ... 
core data obtained") else: self.logger.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") coreData['totalCount'] = 0 @@ -384,6 +390,7 @@ def __iter__(self): data = self.request_graphql_dict(variables=params) coreData = self.extract_paginate_result(data) + self.logger.debug(f"while core data: {data}") #print(coreData) if len(coreData['edges']) == 0: diff --git a/augur/tasks/github/util/github_random_key_auth.py b/augur/tasks/github/util/github_random_key_auth.py index 95788da1cc..397670407d 100644 --- a/augur/tasks/github/util/github_random_key_auth.py +++ b/augur/tasks/github/util/github_random_key_auth.py @@ -24,4 +24,25 @@ def __init__(self, logger): header_name = "Authorization" key_format = "token {0}" - super().__init__(github_api_keys, header_name, logger, key_format) \ No newline at end of file + super().__init__(github_api_keys, header_name, logger, key_format) + + # This is what it needs to be. And until it is, the PR task will fail: + + # Right now many other tasks fail with the modified syntax + + # def __init__(self, session: Session, logger): + # """Creates a GithubRandomKeyAuth object and initializes the RandomKeyAuth parent class""" + + + # # gets the github api keys from the database via the GithubApiKeyHandler + # github_api_keys = GithubApiKeyHandler(session, logger).keys + # #github_api_keys = random.sample(github_api_keys, len(github_api_keys)) + + # if not github_api_keys: + # print("Failed to find github api keys. 
This is usually because your key has expired") + + # # defines the structure of the github api key + # header_name = "Authorization" + # key_format = "token {0}" + + # super().__init__(github_api_keys, header_name, logger, key_format) \ No newline at end of file diff --git a/augur/tasks/github/util/github_task_session.py b/augur/tasks/github/util/github_task_session.py index a21fbc233a..b84c5201f5 100644 --- a/augur/tasks/github/util/github_task_session.py +++ b/augur/tasks/github/util/github_task_session.py @@ -4,6 +4,27 @@ from augur.application.db.session import DatabaseSession from augur.application.db import get_engine +class GithubTaskManifest: + + def __init__(self, logger): + + engine = get_engine() + + self.augur_db = DatabaseSession(logger, engine) + #self.key_auth = GithubRandomKeyAuth(self.augur_db.session, logger) + #totalHack + self.key_auth = GithubRandomKeyAuth(logger) + self.logger = logger + self.platform_id = 1 + + def __enter__(self): + + return self + + def __exit__(self, exception_type, exception_value, exception_traceback): + + self.augur_db.close() + class GithubTaskSession(DatabaseSession): """ORM session used in github tasks. 
@@ -20,6 +41,5 @@ def __init__(self, logger: Logger, engine=None): super().__init__(logger, engine=engine) - self.oauths = GithubRandomKeyAuth(logger) - self.platform_id = 1 - + self.oauths = GithubRandomKeyAuth(logger) + self.platform_id = 1 \ No newline at end of file diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index ca1401d88d..3f1793c79d 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -154,7 +154,7 @@ def non_repo_domain_tasks(self): tasks.apply_async() -def build_primary_repo_collect_request(logger, enabled_phase_names, days_until_collect_again = 1): +def build_primary_repo_collect_request(session, logger, enabled_phase_names, days_until_collect_again = 1): #Add all required tasks to a list and pass it to the CollectionRequest primary_enabled_phases = [] primary_gitlab_enabled_phases = [] @@ -174,10 +174,10 @@ def core_task_success_util_gen(repo_git): primary_gitlab_enabled_phases.append(core_task_success_util_gen) primary_request = CollectionRequest("core",primary_enabled_phases,max_repo=40, days_until_collect_again=7, gitlab_phases=primary_gitlab_enabled_phases) - primary_request.get_valid_repos(logger) + primary_request.get_valid_repos(session) return primary_request -def build_secondary_repo_collect_request(logger, enabled_phase_names, days_until_collect_again = 1): +def build_secondary_repo_collect_request(session, logger, enabled_phase_names, days_until_collect_again = 1): #Deal with secondary collection secondary_enabled_phases = [] @@ -193,11 +193,11 @@ def secondary_task_success_util_gen(repo_git): secondary_enabled_phases.append(secondary_task_success_util_gen) request = CollectionRequest("secondary",secondary_enabled_phases,max_repo=10, days_until_collect_again=10) - request.get_valid_repos(logger) + request.get_valid_repos(session) return request -def build_facade_repo_collect_request(logger, enabled_phase_names, days_until_collect_again = 1): +def build_facade_repo_collect_request(session,
logger, enabled_phase_names, days_until_collect_again = 1): #Deal with facade collection facade_enabled_phases = [] @@ -215,10 +215,10 @@ def facade_task_update_weight_util_gen(repo_git): request = CollectionRequest("facade",facade_enabled_phases,max_repo=30, days_until_collect_again=7) - request.get_valid_repos(logger) + request.get_valid_repos(session) return request -def build_ml_repo_collect_request(logger, enabled_phase_names, days_until_collect_again = 1): +def build_ml_repo_collect_request(session, logger, enabled_phase_names, days_until_collect_again = 1): ml_enabled_phases = [] ml_enabled_phases.append(machine_learning_phase) @@ -229,7 +229,7 @@ def ml_task_success_util_gen(repo_git): ml_enabled_phases.append(ml_task_success_util_gen) request = CollectionRequest("ml",ml_enabled_phases,max_repo=5, days_until_collect_again=10) - request.get_valid_repos(logger) + request.get_valid_repos(session) return request @celery.task(bind=True) @@ -247,26 +247,28 @@ def augur_collection_monitor(self): enabled_collection_hooks = [] - if primary_repo_collect_phase.__name__ in enabled_phase_names: - enabled_collection_hooks.append(build_primary_repo_collect_request(logger, enabled_phase_names)) - - if secondary_repo_collect_phase.__name__ in enabled_phase_names: - enabled_collection_hooks.append(build_secondary_repo_collect_request(logger, enabled_phase_names)) - #start_secondary_collection(session, max_repo=10) + with DatabaseSession(logger, self.app.engine) as session: - if facade_phase.__name__ in enabled_phase_names: - #start_facade_collection(session, max_repo=30) - enabled_collection_hooks.append(build_facade_repo_collect_request(logger, enabled_phase_names)) - - if machine_learning_phase.__name__ in enabled_phase_names: - enabled_collection_hooks.append(build_ml_repo_collect_request(logger, enabled_phase_names)) - #start_ml_collection(session,max_repo=5) - - logger.info(f"Starting collection phases: {[h.name for h in enabled_collection_hooks]}") + if 
primary_repo_collect_phase.__name__ in enabled_phase_names: + enabled_collection_hooks.append(build_primary_repo_collect_request(session, logger, enabled_phase_names)) + + if secondary_repo_collect_phase.__name__ in enabled_phase_names: + enabled_collection_hooks.append(build_secondary_repo_collect_request(session, logger, enabled_phase_names)) + #start_secondary_collection(session, max_repo=10) + + if facade_phase.__name__ in enabled_phase_names: + #start_facade_collection(session, max_repo=30) + enabled_collection_hooks.append(build_facade_repo_collect_request(session, logger, enabled_phase_names)) + + if machine_learning_phase.__name__ in enabled_phase_names: + enabled_collection_hooks.append(build_ml_repo_collect_request(session, logger, enabled_phase_names)) + #start_ml_collection(session,max_repo=5) + + logger.info(f"Starting collection phases: {[h.name for h in enabled_collection_hooks]}") - main_routine = AugurTaskRoutine(logger, enabled_collection_hooks) + main_routine = AugurTaskRoutine(logger, enabled_collection_hooks) - main_routine.start_data_collection() + main_routine.start_data_collection() # have a pipe of 180 diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index f274a286da..f8156c8bf9 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -129,13 +129,9 @@ def __init__(self,name,phases,max_repo = 10,days_until_collect_again = 1, gitlab if name == "facade": self.new_status = CollectionState.UPDATE.value - def get_active_repo_count(self,session): - return len(session.query(CollectionStatus).filter(getattr(CollectionStatus,f"{self.name}_status" ) == CollectionState.COLLECTING.value).all()) - - def get_valid_repos(self,session): - active_repo_count = self.get_active_repo_count(session) + active_repo_count = get_active_repo_count(self.name) limit = self.max_repo-active_repo_count if limit <= 0: diff --git a/docker/backend/Dockerfile b/docker/backend/Dockerfile index 
6e158d199b..9676b40ce5 100644 --- a/docker/backend/Dockerfile +++ b/docker/backend/Dockerfile @@ -2,7 +2,7 @@ FROM python:3.10-bookworm LABEL maintainer="outdoors@acm.org" -LABEL version="0.70.0" +LABEL version="0.71.0" ENV DEBIAN_FRONTEND=noninteractive diff --git a/docker/database/Dockerfile b/docker/database/Dockerfile index 1421e1f76c..df88b16c1e 100644 --- a/docker/database/Dockerfile +++ b/docker/database/Dockerfile @@ -2,7 +2,7 @@ FROM postgres:14 LABEL maintainer="outdoors@acm.org" -LABEL version="0.70.0" +LABEL version="0.71.0" ENV POSTGRES_DB "test" ENV POSTGRES_USER "augur" diff --git a/docker/rabbitmq/Dockerfile b/docker/rabbitmq/Dockerfile index 9feca83cd9..266bec64a5 100644 --- a/docker/rabbitmq/Dockerfile +++ b/docker/rabbitmq/Dockerfile @@ -1,7 +1,7 @@ FROM rabbitmq:3.12-management-alpine LABEL maintainer="574/augur@simplelogin.com" -LABEL version="0.70.0" +LABEL version="0.71.0" ARG RABBIT_MQ_DEFAULT_USER=augur ARG RABBIT_MQ_DEFAULT_PASSWORD=password123 @@ -20,4 +20,4 @@ RUN chmod 777 /etc/rabbitmq/conf.d/augur.conf RUN apk add --no-cache python3 COPY docker/rabbitmq/update_config.py / -RUN exec python3 update_config.py \ No newline at end of file +RUN exec python3 update_config.py diff --git a/metadata.py b/metadata.py index b914869d58..497e74ad46 100644 --- a/metadata.py +++ b/metadata.py @@ -5,8 +5,8 @@ __short_description__ = "Python 3 package for free/libre and open-source software community metrics, models & data collection" -__version__ = "0.70.0" -__release__ = "v0.70.0 (Windows 95 Man!)" +__version__ = "0.71.0" +__release__ = "v0.71.0 (Taylor Baby!)" __license__ = "MIT" __copyright__ = "University of Missouri, University of Nebraska-Omaha, CHAOSS, Brian Warner & Augurlabs 2024" diff --git a/scripts/install/contributor.sql b/scripts/install/contributor.sql new file mode 100644 index 0000000000..7632f9706e --- /dev/null +++ b/scripts/install/contributor.sql @@ -0,0 +1,250 @@ +create materialized view augur_data.explorer_contributor_metrics 
as + SELECT * FROM ( + SELECT ID AS + cntrb_id, + A.created_at AS created_at, + date_part('month', A.created_at::DATE) AS month, + date_part('year', A.created_at::DATE) AS year, + A.repo_id, + repo_name, + full_name, + login, + ACTION, + rank() OVER ( + PARTITION BY id + ORDER BY A.created_at ASC + ) + FROM + ( + ( + SELECT + canonical_id AS ID, + created_at AS created_at, + repo_id, + 'issue_opened' AS ACTION, + contributors.cntrb_full_name AS full_name, + contributors.cntrb_login AS login + FROM + augur_data.issues + LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = issues.reporter_id + LEFT OUTER JOIN ( + SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, + cntrb_canonical AS canonical_email, + data_collection_date, + cntrb_id AS canonical_id + FROM augur_data.contributors + WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical + ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical + WHERE + --repo_id = {repo_id} + pull_request IS NULL + GROUP BY + canonical_id, + repo_id, + issues.created_at, + contributors.cntrb_full_name, + contributors.cntrb_login + ) UNION ALL + ( + SELECT + canonical_id AS ID, + TO_TIMESTAMP( cmt_author_date, 'YYYY-MM-DD' ) AS created_at, + repo_id, + 'commit' AS ACTION, + contributors.cntrb_full_name AS full_name, + contributors.cntrb_login AS login + FROM + augur_data.commits + LEFT OUTER JOIN augur_data.contributors ON cntrb_email = cmt_author_email + LEFT OUTER JOIN ( + SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, + cntrb_canonical AS canonical_email, + data_collection_date, cntrb_id AS canonical_id + FROM augur_data.contributors + WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical + ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical + --WHERE + -- repo_id = {repo_id} + GROUP BY + repo_id, + canonical_email, + canonical_id, + commits.cmt_author_date, + contributors.cntrb_full_name, + 
contributors.cntrb_login + ) UNION ALL + ( + SELECT + message.cntrb_id AS ID, + created_at AS created_at, + commits.repo_id, + 'commit_comment' AS ACTION, + contributors.cntrb_full_name AS full_name, + contributors.cntrb_login AS login + + FROM + augur_data.commit_comment_ref, + augur_data.commits, + augur_data.message + LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = message.cntrb_id + LEFT OUTER JOIN ( + SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, + cntrb_canonical AS canonical_email, + data_collection_date, cntrb_id AS canonical_id + FROM augur_data.contributors + WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical + ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical + WHERE + commits.cmt_id = commit_comment_ref.cmt_id + -- AND commits.repo_id = {repo_id} + AND commit_comment_ref.msg_id = message.msg_id + + GROUP BY + ID, + commits.repo_id, + commit_comment_ref.created_at, + contributors.cntrb_full_name, + contributors.cntrb_login + ) UNION ALL + ( + SELECT + issue_events.cntrb_id AS ID, + issue_events.created_at AS created_at, + issues.repo_id, + 'issue_closed' AS ACTION, + contributors.cntrb_full_name AS full_name, + contributors.cntrb_login AS login + FROM + augur_data.issues, + augur_data.issue_events + LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = issue_events.cntrb_id + LEFT OUTER JOIN ( + SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, + cntrb_canonical AS canonical_email, + data_collection_date, + cntrb_id AS canonical_id + FROM augur_data.contributors + WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical + ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical + WHERE + --issues.repo_id = {repo_id} + issues.issue_id = issue_events.issue_id + AND issues.pull_request IS NULL + AND issue_events.cntrb_id IS NOT NULL + AND ACTION = 'closed' + GROUP BY + issue_events.cntrb_id, + issues.repo_id, + 
issue_events.created_at, + contributors.cntrb_full_name, + contributors.cntrb_login + ) UNION ALL + ( + SELECT + pr_augur_contributor_id AS ID, + pr_created_at AS created_at, + pull_requests.repo_id, + 'open_pull_request' AS ACTION, + contributors.cntrb_full_name AS full_name, + contributors.cntrb_login AS login + FROM + augur_data.pull_requests + LEFT OUTER JOIN augur_data.contributors ON pull_requests.pr_augur_contributor_id = contributors.cntrb_id + LEFT OUTER JOIN ( + SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, + cntrb_canonical AS canonical_email, + data_collection_date, + cntrb_id AS canonical_id + FROM augur_data.contributors + WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical + ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical + -- WHERE + --pull_requests.repo_id = {repo_id} + GROUP BY + pull_requests.pr_augur_contributor_id, + pull_requests.repo_id, + pull_requests.pr_created_at, + contributors.cntrb_full_name, + contributors.cntrb_login + ) UNION ALL + ( + SELECT + message.cntrb_id AS ID, + msg_timestamp AS created_at, + pull_requests.repo_id as repo_id, + 'pull_request_comment' AS ACTION, + contributors.cntrb_full_name AS full_name, + contributors.cntrb_login AS login + FROM + augur_data.pull_requests, + augur_data.pull_request_message_ref, + augur_data.message + LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = message.cntrb_id + LEFT OUTER JOIN ( + SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, + cntrb_canonical AS canonical_email, + data_collection_date, + cntrb_id AS canonical_id + FROM augur_data.contributors + WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical + ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical + WHERE + -- pull_requests.repo_id = {repo_id} + pull_request_message_ref.pull_request_id = pull_requests.pull_request_id + AND pull_request_message_ref.msg_id = message.msg_id + GROUP BY + 
message.cntrb_id, + pull_requests.repo_id, + message.msg_timestamp, + contributors.cntrb_full_name, + contributors.cntrb_login + ) UNION ALL + ( + SELECT + issues.reporter_id AS ID, + msg_timestamp AS created_at, + issues.repo_id as repo_id, + 'issue_comment' AS ACTION, + contributors.cntrb_full_name AS full_name, + contributors.cntrb_login AS login + FROM + augur_data.issues, + augur_data.issue_message_ref, + augur_data.message + LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = message.cntrb_id + LEFT OUTER JOIN ( + SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, + cntrb_canonical AS canonical_email, + data_collection_date, + cntrb_id AS canonical_id + FROM augur_data.contributors + WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical + ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical + WHERE + --issues.repo_id = {repo_id} + issue_message_ref.msg_id = message.msg_id + AND issues.issue_id = issue_message_ref.issue_id + AND issues.pull_request IS NULL + GROUP BY + issues.reporter_id, + issues.repo_id, + message.msg_timestamp, + contributors.cntrb_full_name, + contributors.cntrb_login + ) + ) A, + augur_data.repo + WHERE + ID IS NOT NULL + AND A.repo_id = repo.repo_id + GROUP BY + A.ID, + A.repo_id, + A.ACTION, + A.created_at, + repo.repo_name, + A.full_name, + A.login + ORDER BY + cntrb_id + ) b diff --git a/scripts/install/explorer-index.sql b/scripts/install/explorer-index.sql new file mode 100644 index 0000000000..95ae45be6a --- /dev/null +++ b/scripts/install/explorer-index.sql @@ -0,0 +1,5 @@ + +-- View indexes: +CREATE UNIQUE INDEX explorer_contributor_recent_actions_unique_idx ON augur_data.explorer_contributor_recent_actions USING btree (cntrb_id, created_at, repo_id, action, repo_name, login, rank); +CREATE INDEX explorer_contributor_recent_actions_cntrb_id_idx ON augur_data.explorer_contributor_recent_actions USING btree (cntrb_id); +CREATE INDEX
explorer_contributor_recent_actions_repo_id_idx ON augur_data.explorer_contributor_recent_actions USING btree (repo_id DESC); diff --git a/scripts/install/explorer_contributor_recent_actions.sql b/scripts/install/explorer_contributor_recent_actions.sql new file mode 100644 index 0000000000..2368be5a44 --- /dev/null +++ b/scripts/install/explorer_contributor_recent_actions.sql @@ -0,0 +1,104 @@ +-- augur_data.explorer_contributor_recent_actions source +DROP MATERIALIZED VIEW if exists augur_data.explorer_contributor_recent_actions; +CREATE MATERIALIZED VIEW augur_data.explorer_contributor_recent_actions +AS SELECT a.id AS cntrb_id, + a.created_at, + a.repo_id, + a.action, + repo.repo_name, + a.login, + row_number() OVER (PARTITION BY a.id, a.repo_id ORDER BY a.created_at DESC) AS rank + FROM ( SELECT commits.cmt_ght_author_id AS id, + commits.cmt_author_timestamp AS created_at, + commits.repo_id, + 'commit'::text AS action, + contributors.cntrb_login AS login + FROM augur_data.commits + LEFT JOIN augur_data.contributors ON contributors.cntrb_id::text = commits.cmt_ght_author_id::text and commits.cmt_author_timestamp >= now() - interval '13 months' + GROUP BY commits.cmt_commit_hash, commits.cmt_ght_author_id, commits.repo_id, commits.cmt_author_timestamp, 'commit'::text, contributors.cntrb_login + UNION ALL + SELECT issues.reporter_id AS id, + issues.created_at, + issues.repo_id, + 'issue_opened'::text AS action, + contributors.cntrb_login AS login + FROM augur_data.issues + LEFT JOIN augur_data.contributors ON contributors.cntrb_id = issues.reporter_id and issues.created_at >= now() - interval '13 months' + WHERE issues.pull_request IS NULL + UNION ALL + SELECT pull_request_events.cntrb_id AS id, + pull_request_events.created_at, + pull_requests.repo_id, + 'pull_request_closed'::text AS action, + contributors.cntrb_login AS login + FROM augur_data.pull_requests, + augur_data.pull_request_events + LEFT JOIN augur_data.contributors ON contributors.cntrb_id = 
pull_request_events.cntrb_id and pull_request_events.created_at >= now() - interval '13 months' + WHERE pull_requests.pull_request_id = pull_request_events.pull_request_id AND pull_requests.pr_merged_at IS NULL AND pull_request_events.action::text = 'closed'::text + UNION ALL + SELECT pull_request_events.cntrb_id AS id, + pull_request_events.created_at, + pull_requests.repo_id, + 'pull_request_merged'::text AS action, + contributors.cntrb_login AS login + FROM augur_data.pull_requests, + augur_data.pull_request_events + LEFT JOIN augur_data.contributors ON contributors.cntrb_id = pull_request_events.cntrb_id and pull_request_events.created_at >= now() - interval '13 months' + WHERE pull_requests.pull_request_id = pull_request_events.pull_request_id AND pull_request_events.action::text = 'merged'::text + UNION ALL + SELECT issue_events.cntrb_id AS id, + issue_events.created_at, + issues.repo_id, + 'issue_closed'::text AS action, + contributors.cntrb_login AS login + FROM augur_data.issues, + augur_data.issue_events + LEFT JOIN augur_data.contributors ON contributors.cntrb_id = issue_events.cntrb_id and issue_events.created_at >= now() - interval '13 months' + WHERE issues.issue_id = issue_events.issue_id AND issues.pull_request IS NULL AND issue_events.action::text = 'closed'::text + UNION ALL + SELECT pull_request_reviews.cntrb_id AS id, + pull_request_reviews.pr_review_submitted_at AS created_at, + pull_requests.repo_id, + 'pull_request_review_'::text || pull_request_reviews.pr_review_state::text AS action, + contributors.cntrb_login AS login + FROM augur_data.pull_requests, + augur_data.pull_request_reviews + LEFT JOIN augur_data.contributors ON contributors.cntrb_id = pull_request_reviews.cntrb_id and pull_request_reviews.pr_review_submitted_at >= now() - interval '13 months' + WHERE pull_requests.pull_request_id = pull_request_reviews.pull_request_id + UNION ALL + SELECT pull_requests.pr_augur_contributor_id AS id, + pull_requests.pr_created_at AS created_at, + 
pull_requests.repo_id, + 'pull_request_open'::text AS action, + contributors.cntrb_login AS login + FROM augur_data.pull_requests + LEFT JOIN augur_data.contributors ON pull_requests.pr_augur_contributor_id = contributors.cntrb_id and pull_requests.pr_created_at >= now() - interval '13 months' + UNION ALL + SELECT message.cntrb_id AS id, + message.msg_timestamp AS created_at, + pull_requests.repo_id, + 'pull_request_comment'::text AS action, + contributors.cntrb_login AS login + FROM augur_data.pull_requests, + augur_data.pull_request_message_ref, + augur_data.message + LEFT JOIN augur_data.contributors ON contributors.cntrb_id = message.cntrb_id + WHERE pull_request_message_ref.pull_request_id = pull_requests.pull_request_id AND pull_request_message_ref.msg_id = message.msg_id + and pull_requests.pr_created_at >= now() - interval '13 months' + UNION ALL + SELECT issues.reporter_id AS id, + message.msg_timestamp AS created_at, + issues.repo_id, + 'issue_comment'::text AS action, + contributors.cntrb_login AS login + FROM augur_data.issues, + augur_data.issue_message_ref, + augur_data.message + LEFT JOIN augur_data.contributors ON contributors.cntrb_id = message.cntrb_id and message.msg_timestamp >= now() - interval '13 months' + WHERE issue_message_ref.msg_id = message.msg_id AND issues.issue_id = issue_message_ref.issue_id AND issues.closed_at <> message.msg_timestamp) a, + augur_data.repo + WHERE a.repo_id = repo.repo_id and a.created_at >= now() - interval '13 months' + ORDER BY a.created_at DESC +WITH DATA; + +-- View indexes: diff --git a/scripts/install/matview.sh b/scripts/install/matview.sh new file mode 100755 index 0000000000..ab3c178e75 --- /dev/null +++ b/scripts/install/matview.sh @@ -0,0 +1,115 @@ +#!/bin/bash +set -eo pipefail + +psql -U augur -p 5432 -h data.chaoss.io augur -c 'create materialized view augur_data.pull_request_metrics as + SELECT + repo.repo_id AS repo_id, + pull_requests.pr_src_id AS pr_src_id, + repo.repo_name AS repo_name, +
pr_src_author_association, + repo_groups.rg_name AS repo_group, + pull_requests.pr_src_state, + pull_requests.pr_merged_at, + pull_requests.pr_created_at AS pr_created_at, + pull_requests.pr_closed_at AS pr_closed_at, + date_part( 'year', pr_created_at :: DATE ) AS CREATED_YEAR, + date_part( 'month', pr_created_at :: DATE ) AS CREATED_MONTH, + date_part( 'year', pr_closed_at :: DATE ) AS CLOSED_YEAR, + date_part( 'month', pr_closed_at :: DATE ) AS CLOSED_MONTH, + pr_src_meta_label, + pr_head_or_base, + ( EXTRACT ( EPOCH FROM pull_requests.pr_closed_at ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 3600 AS hours_to_close, + ( EXTRACT ( EPOCH FROM pull_requests.pr_closed_at ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 86400 AS days_to_close, + ( EXTRACT ( EPOCH FROM first_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 3600 AS hours_to_first_response, + ( EXTRACT ( EPOCH FROM first_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 86400 AS days_to_first_response, + ( EXTRACT ( EPOCH FROM last_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 3600 AS hours_to_last_response, + ( EXTRACT ( EPOCH FROM last_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 86400 AS days_to_last_response, + first_response_time, + last_response_time, + EXTRACT ( EPOCH FROM average_time_between_responses), + assigned_count, + review_requested_count, + labeled_count, + subscribed_count, + mentioned_count, + referenced_count, + closed_count, + head_ref_force_pushed_count, + merged_count::INT, + milestoned_count, + unlabeled_count, + head_ref_deleted_count, + comment_count, + COALESCE(lines_added, 0) as lines_added, + COALESCE(lines_removed, 0) as lines_removed, + commit_count, + COALESCE(file_count, 0) as file_count + FROM + augur_data.repo, + augur_data.repo_groups, + augur_data.pull_requests LEFT OUTER JOIN ( + SELECT pull_requests.pull_request_id, + count(*) FILTER (WHERE 
action = 'assigned') AS assigned_count, + count(*) FILTER (WHERE action = 'review_requested') AS review_requested_count, + count(*) FILTER (WHERE action = 'labeled') AS labeled_count, + count(*) FILTER (WHERE action = 'unlabeled') AS unlabeled_count, + count(*) FILTER (WHERE action = 'subscribed') AS subscribed_count, + count(*) FILTER (WHERE action = 'mentioned') AS mentioned_count, + count(*) FILTER (WHERE action = 'referenced') AS referenced_count, + count(*) FILTER (WHERE action = 'closed') AS closed_count, + count(*) FILTER (WHERE action = 'head_ref_force_pushed') AS head_ref_force_pushed_count, + count(*) FILTER (WHERE action = 'head_ref_deleted') AS head_ref_deleted_count, + count(*) FILTER (WHERE action = 'milestoned') AS milestoned_count, + COALESCE(count(*) FILTER (WHERE action = 'merged'), 0) AS merged_count, + COALESCE(MIN(message.msg_timestamp), pull_requests.pr_merged_at, pull_requests.pr_closed_at) AS first_response_time, + COALESCE(COUNT(DISTINCT message.msg_timestamp), 0) AS comment_count, + COALESCE(MAX(message.msg_timestamp), pull_requests.pr_closed_at) AS last_response_time, + COALESCE((MAX(message.msg_timestamp) - MIN(message.msg_timestamp)) / COUNT(DISTINCT message.msg_timestamp), pull_requests.pr_created_at - pull_requests.pr_closed_at) AS average_time_between_responses + FROM augur_data.pull_requests + LEFT OUTER JOIN augur_data.pull_request_events on augur_data.pull_requests.pull_request_id = augur_data.pull_request_events.pull_request_id + JOIN augur_data.repo on repo.repo_id = pull_requests.repo_id + LEFT OUTER JOIN augur_data.pull_request_message_ref on pull_requests.pull_request_id = pull_request_message_ref.pull_request_id + LEFT OUTER JOIN augur_data.message on pull_request_message_ref.msg_id = augur_data.message.msg_id + --WHERE repo.repo_id = {repo_id} + GROUP BY pull_requests.pull_request_id + ) response_times + ON pull_requests.pull_request_id = response_times.pull_request_id + LEFT JOIN ( + SELECT 
pull_request_commits.pull_request_id, count(DISTINCT pr_cmt_sha) AS commit_count + FROM augur_data.pull_request_commits, augur_data.pull_requests, augur_data.pull_request_meta + WHERE pull_requests.pull_request_id = pull_request_commits.pull_request_id + AND pull_requests.pull_request_id = pull_request_meta.pull_request_id + --AND pull_requests.repo_id = {repo_id} + AND pr_cmt_sha <> pull_requests.pr_merge_commit_sha + AND pr_cmt_sha <> pull_request_meta.pr_sha + GROUP BY pull_request_commits.pull_request_id + ) all_commit_counts + ON pull_requests.pull_request_id = all_commit_counts.pull_request_id + LEFT JOIN ( + SELECT MAX(pr_repo_meta_id), pull_request_meta.pull_request_id, pr_head_or_base, pr_src_meta_label + FROM augur_data.pull_requests, augur_data.pull_request_meta + WHERE pull_requests.pull_request_id = pull_request_meta.pull_request_id + --AND pull_requests.repo_id = {repo_id} + AND pr_head_or_base = 'base' + GROUP BY pull_request_meta.pull_request_id, pr_head_or_base, pr_src_meta_label + ) base_labels + ON base_labels.pull_request_id = all_commit_counts.pull_request_id + LEFT JOIN ( + SELECT sum(cmt_added) AS lines_added, sum(cmt_removed) AS lines_removed, pull_request_commits.pull_request_id, count(DISTINCT cmt_filename) AS file_count + FROM augur_data.pull_request_commits, augur_data.commits, augur_data.pull_requests, augur_data.pull_request_meta + WHERE cmt_commit_hash = pr_cmt_sha + AND pull_requests.pull_request_id = pull_request_commits.pull_request_id + AND pull_requests.pull_request_id = pull_request_meta.pull_request_id + --AND pull_requests.repo_id = {repo_id} + AND commits.repo_id = pull_requests.repo_id + AND commits.cmt_commit_hash <> pull_requests.pr_merge_commit_sha + AND commits.cmt_commit_hash <> pull_request_meta.pr_sha + GROUP BY pull_request_commits.pull_request_id + ) master_merged_counts + ON base_labels.pull_request_id = master_merged_counts.pull_request_id + WHERE + repo.repo_group_id = repo_groups.repo_group_id + AND repo.repo_id 
= pull_requests.repo_id + --AND repo.repo_id = {repo_id} + ORDER BY + merged_count DESC' diff --git a/scripts/install/matview.sql b/scripts/install/matview.sql new file mode 100644 index 0000000000..9a56dcd7dc --- /dev/null +++ b/scripts/install/matview.sql @@ -0,0 +1,112 @@ +create materialized view augur_data.explorer_pr_metrics as + SELECT + repo.repo_id AS repo_id, + pull_requests.pr_src_id AS pr_src_id, + repo.repo_name AS repo_name, + pr_src_author_association, + repo_groups.rg_name AS repo_group, + pull_requests.pr_src_state, + pull_requests.pr_merged_at, + pull_requests.pr_created_at AS pr_created_at, + pull_requests.pr_closed_at AS pr_closed_at, + date_part( 'year', pr_created_at :: DATE ) AS CREATED_YEAR, + date_part( 'month', pr_created_at :: DATE ) AS CREATED_MONTH, + date_part( 'year', pr_closed_at :: DATE ) AS CLOSED_YEAR, + date_part( 'month', pr_closed_at :: DATE ) AS CLOSED_MONTH, + pr_src_meta_label, + pr_head_or_base, + ( EXTRACT ( EPOCH FROM pull_requests.pr_closed_at ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 3600 AS hours_to_close, + ( EXTRACT ( EPOCH FROM pull_requests.pr_closed_at ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 86400 AS days_to_close, + ( EXTRACT ( EPOCH FROM first_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 3600 AS hours_to_first_response, + ( EXTRACT ( EPOCH FROM first_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 86400 AS days_to_first_response, + ( EXTRACT ( EPOCH FROM last_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 3600 AS hours_to_last_response, + ( EXTRACT ( EPOCH FROM last_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 86400 AS days_to_last_response, + first_response_time, + last_response_time, + EXTRACT ( EPOCH FROM average_time_between_responses), + assigned_count, + review_requested_count, + labeled_count, + subscribed_count, + mentioned_count, + referenced_count, + 
closed_count, + head_ref_force_pushed_count, + merged_count::INT, + milestoned_count, + unlabeled_count, + head_ref_deleted_count, + comment_count, + COALESCE(lines_added, 0) as lines_added, + COALESCE(lines_removed, 0) as lines_removed, + commit_count, + COALESCE(file_count, 0) as file_count + FROM + augur_data.repo, + augur_data.repo_groups, + augur_data.pull_requests LEFT OUTER JOIN ( + SELECT pull_requests.pull_request_id, + count(*) FILTER (WHERE action = 'assigned') AS assigned_count, + count(*) FILTER (WHERE action = 'review_requested') AS review_requested_count, + count(*) FILTER (WHERE action = 'labeled') AS labeled_count, + count(*) FILTER (WHERE action = 'unlabeled') AS unlabeled_count, + count(*) FILTER (WHERE action = 'subscribed') AS subscribed_count, + count(*) FILTER (WHERE action = 'mentioned') AS mentioned_count, + count(*) FILTER (WHERE action = 'referenced') AS referenced_count, + count(*) FILTER (WHERE action = 'closed') AS closed_count, + count(*) FILTER (WHERE action = 'head_ref_force_pushed') AS head_ref_force_pushed_count, + count(*) FILTER (WHERE action = 'head_ref_deleted') AS head_ref_deleted_count, + count(*) FILTER (WHERE action = 'milestoned') AS milestoned_count, + COALESCE(count(*) FILTER (WHERE action = 'merged'), 0) AS merged_count, + COALESCE(MIN(message.msg_timestamp), pull_requests.pr_merged_at, pull_requests.pr_closed_at) AS first_response_time, + COALESCE(COUNT(DISTINCT message.msg_timestamp), 0) AS comment_count, + COALESCE(MAX(message.msg_timestamp), pull_requests.pr_closed_at) AS last_response_time, + COALESCE((MAX(message.msg_timestamp) - MIN(message.msg_timestamp)) / COUNT(DISTINCT message.msg_timestamp), pull_requests.pr_created_at - pull_requests.pr_closed_at) AS average_time_between_responses + FROM augur_data.pull_requests + LEFT OUTER JOIN augur_data.pull_request_events on augur_data.pull_requests.pull_request_id = augur_data.pull_request_events.pull_request_id + JOIN augur_data.repo on repo.repo_id = 
pull_requests.repo_id + LEFT OUTER JOIN augur_data.pull_request_message_ref on pull_requests.pull_request_id = pull_request_message_ref.pull_request_id + LEFT OUTER JOIN augur_data.message on pull_request_message_ref.msg_id = augur_data.message.msg_id + --WHERE repo.repo_id = {repo_id} + GROUP BY pull_requests.pull_request_id + ) response_times + ON pull_requests.pull_request_id = response_times.pull_request_id + LEFT JOIN ( + SELECT pull_request_commits.pull_request_id, count(DISTINCT pr_cmt_sha) AS commit_count + FROM augur_data.pull_request_commits, augur_data.pull_requests, augur_data.pull_request_meta + WHERE pull_requests.pull_request_id = pull_request_commits.pull_request_id + AND pull_requests.pull_request_id = pull_request_meta.pull_request_id + --AND pull_requests.repo_id = {repo_id} + AND pr_cmt_sha <> pull_requests.pr_merge_commit_sha + AND pr_cmt_sha <> pull_request_meta.pr_sha + GROUP BY pull_request_commits.pull_request_id + ) all_commit_counts + ON pull_requests.pull_request_id = all_commit_counts.pull_request_id + LEFT JOIN ( + SELECT MAX(pr_repo_meta_id), pull_request_meta.pull_request_id, pr_head_or_base, pr_src_meta_label + FROM augur_data.pull_requests, augur_data.pull_request_meta + WHERE pull_requests.pull_request_id = pull_request_meta.pull_request_id + --AND pull_requests.repo_id = {repo_id} + AND pr_head_or_base = 'base' + GROUP BY pull_request_meta.pull_request_id, pr_head_or_base, pr_src_meta_label + ) base_labels + ON base_labels.pull_request_id = all_commit_counts.pull_request_id + LEFT JOIN ( + SELECT sum(cmt_added) AS lines_added, sum(cmt_removed) AS lines_removed, pull_request_commits.pull_request_id, count(DISTINCT cmt_filename) AS file_count + FROM augur_data.pull_request_commits, augur_data.commits, augur_data.pull_requests, augur_data.pull_request_meta + WHERE cmt_commit_hash = pr_cmt_sha + AND pull_requests.pull_request_id = pull_request_commits.pull_request_id + AND pull_requests.pull_request_id = 
pull_request_meta.pull_request_id
+ --AND pull_requests.repo_id = {repo_id}
+ AND commits.repo_id = pull_requests.repo_id
+ AND commits.cmt_commit_hash <> pull_requests.pr_merge_commit_sha
+ AND commits.cmt_commit_hash <> pull_request_meta.pr_sha
+ GROUP BY pull_request_commits.pull_request_id
+ ) master_merged_counts
+ ON base_labels.pull_request_id = master_merged_counts.pull_request_id
+ WHERE
+ repo.repo_group_id = repo_groups.repo_group_id
+ AND repo.repo_id = pull_requests.repo_id
+ --AND repo.repo_id = {repo_id}
+ ORDER BY
+ merged_count DESC
diff --git a/scripts/mat_view_explore/materialized_view_pr.sql b/scripts/mat_view_explore/materialized_view_pr.sql
new file mode 100644
index 0000000000..f5a5889259
--- /dev/null
+++ b/scripts/mat_view_explore/materialized_view_pr.sql
@@ -0,0 +1,112 @@
+ create materialized view augur_data.explorer_pr_metrics as -- one row per pull request: timing, event, comment, and commit metrics
+ SELECT
+ repo.repo_id AS repo_id,
+ pull_requests.pr_src_id AS pr_src_id,
+ repo.repo_name AS repo_name,
+ pr_src_author_association,
+ repo_groups.rg_name AS repo_group,
+ pull_requests.pr_src_state,
+ pull_requests.pr_merged_at,
+ pull_requests.pr_created_at AS pr_created_at,
+ pull_requests.pr_closed_at AS pr_closed_at,
+ date_part( 'year', pr_created_at :: DATE ) AS CREATED_YEAR,
+ date_part( 'month', pr_created_at :: DATE ) AS CREATED_MONTH,
+ date_part( 'year', pr_closed_at :: DATE ) AS CLOSED_YEAR,
+ date_part( 'month', pr_closed_at :: DATE ) AS CLOSED_MONTH,
+ pr_src_meta_label,
+ pr_head_or_base,
+ ( EXTRACT ( EPOCH FROM pull_requests.pr_closed_at ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 3600 AS hours_to_close,
+ ( EXTRACT ( EPOCH FROM pull_requests.pr_closed_at ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 86400 AS days_to_close,
+ ( EXTRACT ( EPOCH FROM first_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 3600 AS hours_to_first_response,
+ ( EXTRACT ( EPOCH FROM first_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 86400 AS days_to_first_response,
+ ( EXTRACT ( EPOCH FROM last_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 3600 AS hours_to_last_response,
+ ( EXTRACT ( EPOCH FROM last_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 86400 AS days_to_last_response,
+ first_response_time,
+ last_response_time,
+ (EXTRACT ( EPOCH FROM average_time_between_responses) ) / 3600 as average_hours_between_responses,
+ assigned_count,
+ review_requested_count,
+ labeled_count,
+ subscribed_count,
+ mentioned_count,
+ referenced_count,
+ closed_count,
+ head_ref_force_pushed_count,
+ merged_count::INT,
+ milestoned_count,
+ unlabeled_count,
+ head_ref_deleted_count,
+ comment_count,
+ COALESCE(lines_added, 0) as lines_added,
+ COALESCE(lines_removed, 0) as lines_removed,
+ commit_count,
+ COALESCE(file_count, 0) as file_count
+ FROM
+ augur_data.repo,
+ augur_data.repo_groups,
+ augur_data.pull_requests LEFT OUTER JOIN (
+ SELECT pull_requests.pull_request_id,
+ count(*) FILTER (WHERE action = 'assigned') AS assigned_count,
+ count(*) FILTER (WHERE action = 'review_requested') AS review_requested_count,
+ count(*) FILTER (WHERE action = 'labeled') AS labeled_count,
+ count(*) FILTER (WHERE action = 'unlabeled') AS unlabeled_count,
+ count(*) FILTER (WHERE action = 'subscribed') AS subscribed_count,
+ count(*) FILTER (WHERE action = 'mentioned') AS mentioned_count,
+ count(*) FILTER (WHERE action = 'referenced') AS referenced_count,
+ count(*) FILTER (WHERE action = 'closed') AS closed_count,
+ count(*) FILTER (WHERE action = 'head_ref_force_pushed') AS head_ref_force_pushed_count,
+ count(*) FILTER (WHERE action = 'head_ref_deleted') AS head_ref_deleted_count,
+ count(*) FILTER (WHERE action = 'milestoned') AS milestoned_count,
+ COALESCE(count(*) FILTER (WHERE action = 'merged'), 0) AS merged_count,
+ COALESCE(MIN(message.msg_timestamp), pull_requests.pr_merged_at, pull_requests.pr_closed_at) AS first_response_time,
+ COALESCE(COUNT(DISTINCT message.msg_timestamp), 0) AS comment_count,
+ COALESCE(MAX(message.msg_timestamp), pull_requests.pr_closed_at) AS last_response_time,
+ COALESCE((MAX(message.msg_timestamp) - MIN(message.msg_timestamp)) / COUNT(DISTINCT message.msg_timestamp), pull_requests.pr_closed_at - pull_requests.pr_created_at) AS average_time_between_responses -- fallback is PR lifetime (closed - created); was created - closed, a negative interval. NOTE(review): divides by message count, not gaps (count-1) -- confirm intended
+ FROM augur_data.pull_requests
+ LEFT OUTER JOIN augur_data.pull_request_events on augur_data.pull_requests.pull_request_id = augur_data.pull_request_events.pull_request_id
+ JOIN augur_data.repo on repo.repo_id = pull_requests.repo_id
+ LEFT OUTER JOIN augur_data.pull_request_message_ref on pull_requests.pull_request_id = pull_request_message_ref.pull_request_id
+ LEFT OUTER JOIN augur_data.message on pull_request_message_ref.msg_id = augur_data.message.msg_id
+ --WHERE repo.repo_id = {repo_id}
+ GROUP BY pull_requests.pull_request_id
+ ) response_times -- per-PR event counts and message-response timing aggregates
+ ON pull_requests.pull_request_id = response_times.pull_request_id
+ LEFT JOIN (
+ SELECT pull_request_commits.pull_request_id, count(DISTINCT pr_cmt_sha) AS commit_count
+ FROM augur_data.pull_request_commits, augur_data.pull_requests, augur_data.pull_request_meta
+ WHERE pull_requests.pull_request_id = pull_request_commits.pull_request_id
+ AND pull_requests.pull_request_id = pull_request_meta.pull_request_id
+ --AND pull_requests.repo_id = {repo_id}
+ AND pr_cmt_sha <> pull_requests.pr_merge_commit_sha
+ AND pr_cmt_sha <> pull_request_meta.pr_sha
+ GROUP BY pull_request_commits.pull_request_id
+ ) all_commit_counts -- distinct commits per PR, excluding merge commit and meta sha
+ ON pull_requests.pull_request_id = all_commit_counts.pull_request_id
+ LEFT JOIN (
+ SELECT MAX(pr_repo_meta_id), pull_request_meta.pull_request_id, pr_head_or_base, pr_src_meta_label
+ FROM augur_data.pull_requests, augur_data.pull_request_meta
+ WHERE pull_requests.pull_request_id = pull_request_meta.pull_request_id
+ --AND pull_requests.repo_id = {repo_id}
+ AND pr_head_or_base = 'base'
+ GROUP BY pull_request_meta.pull_request_id, pr_head_or_base, pr_src_meta_label
+ ) base_labels -- NOTE(review): joined via all_commit_counts, so PRs with no counted commits get no base label -- confirm intended
+ ON base_labels.pull_request_id = all_commit_counts.pull_request_id
+ LEFT JOIN (
+ SELECT sum(cmt_added) AS lines_added, sum(cmt_removed) AS lines_removed, pull_request_commits.pull_request_id, count(DISTINCT cmt_filename) AS file_count
+ FROM augur_data.pull_request_commits, augur_data.commits, augur_data.pull_requests, augur_data.pull_request_meta
+ WHERE cmt_commit_hash = pr_cmt_sha
+ AND pull_requests.pull_request_id = pull_request_commits.pull_request_id
+ AND pull_requests.pull_request_id = pull_request_meta.pull_request_id
+ --AND pull_requests.repo_id = {repo_id}
+ AND commits.repo_id = pull_requests.repo_id
+ AND commits.cmt_commit_hash <> pull_requests.pr_merge_commit_sha
+ AND commits.cmt_commit_hash <> pull_request_meta.pr_sha
+ GROUP BY pull_request_commits.pull_request_id
+ ) master_merged_counts -- lines/files changed per PR from matched commits
+ ON base_labels.pull_request_id = master_merged_counts.pull_request_id
+ WHERE
+ repo.repo_group_id = repo_groups.repo_group_id
+ AND repo.repo_id = pull_requests.repo_id
+ --AND repo.repo_id = {repo_id}
+ ORDER BY
+ merged_count DESC