From bed9a5a9e888db8d96342e9899439d9482264958 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Fri, 10 May 2024 19:51:32 +0000 Subject: [PATCH 01/44] update to materialized views testing scripts Signed-off-by: Sean P. Goggins --- .../mat_view_explore/materialized_view_pr.sql | 112 ++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 scripts/mat_view_explore/materialized_view_pr.sql diff --git a/scripts/mat_view_explore/materialized_view_pr.sql b/scripts/mat_view_explore/materialized_view_pr.sql new file mode 100644 index 0000000000..f5a5889259 --- /dev/null +++ b/scripts/mat_view_explore/materialized_view_pr.sql @@ -0,0 +1,112 @@ + create materialized view augur_data.explorer_pr_metrics as + SELECT + repo.repo_id AS repo_id, + pull_requests.pr_src_id AS pr_src_id, + repo.repo_name AS repo_name, + pr_src_author_association, + repo_groups.rg_name AS repo_group, + pull_requests.pr_src_state, + pull_requests.pr_merged_at, + pull_requests.pr_created_at AS pr_created_at, + pull_requests.pr_closed_at AS pr_closed_at, + date_part( 'year', pr_created_at :: DATE ) AS CREATED_YEAR, + date_part( 'month', pr_created_at :: DATE ) AS CREATED_MONTH, + date_part( 'year', pr_closed_at :: DATE ) AS CLOSED_YEAR, + date_part( 'month', pr_closed_at :: DATE ) AS CLOSED_MONTH, + pr_src_meta_label, + pr_head_or_base, + ( EXTRACT ( EPOCH FROM pull_requests.pr_closed_at ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 3600 AS hours_to_close, + ( EXTRACT ( EPOCH FROM pull_requests.pr_closed_at ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 86400 AS days_to_close, + ( EXTRACT ( EPOCH FROM first_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 3600 AS hours_to_first_response, + ( EXTRACT ( EPOCH FROM first_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 86400 AS days_to_first_response, + ( EXTRACT ( EPOCH FROM last_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 3600 AS 
hours_to_last_response, + ( EXTRACT ( EPOCH FROM last_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 86400 AS days_to_last_response, + first_response_time, + last_response_time, + (EXTRACT ( EPOCH FROM average_time_between_responses) ) / 3600 as average_hours_between_responses, + assigned_count, + review_requested_count, + labeled_count, + subscribed_count, + mentioned_count, + referenced_count, + closed_count, + head_ref_force_pushed_count, + merged_count::INT, + milestoned_count, + unlabeled_count, + head_ref_deleted_count, + comment_count, + COALESCE(lines_added, 0) as lines_added, + COALESCE(lines_removed, 0) as lines_removed, + commit_count, + COALESCE(file_count, 0) as file_count + FROM + augur_data.repo, + augur_data.repo_groups, + augur_data.pull_requests LEFT OUTER JOIN ( + SELECT pull_requests.pull_request_id, + count(*) FILTER (WHERE action = 'assigned') AS assigned_count, + count(*) FILTER (WHERE action = 'review_requested') AS review_requested_count, + count(*) FILTER (WHERE action = 'labeled') AS labeled_count, + count(*) FILTER (WHERE action = 'unlabeled') AS unlabeled_count, + count(*) FILTER (WHERE action = 'subscribed') AS subscribed_count, + count(*) FILTER (WHERE action = 'mentioned') AS mentioned_count, + count(*) FILTER (WHERE action = 'referenced') AS referenced_count, + count(*) FILTER (WHERE action = 'closed') AS closed_count, + count(*) FILTER (WHERE action = 'head_ref_force_pushed') AS head_ref_force_pushed_count, + count(*) FILTER (WHERE action = 'head_ref_deleted') AS head_ref_deleted_count, + count(*) FILTER (WHERE action = 'milestoned') AS milestoned_count, + COALESCE(count(*) FILTER (WHERE action = 'merged'), 0) AS merged_count, + COALESCE(MIN(message.msg_timestamp), pull_requests.pr_merged_at, pull_requests.pr_closed_at) AS first_response_time, + COALESCE(COUNT(DISTINCT message.msg_timestamp), 0) AS comment_count, + COALESCE(MAX(message.msg_timestamp), pull_requests.pr_closed_at) AS last_response_time, + 
COALESCE((MAX(message.msg_timestamp) - MIN(message.msg_timestamp)) / COUNT(DISTINCT message.msg_timestamp), pull_requests.pr_created_at - pull_requests.pr_closed_at) AS average_time_between_responses + FROM augur_data.pull_requests + LEFT OUTER JOIN augur_data.pull_request_events on augur_data.pull_requests.pull_request_id = augur_data.pull_request_events.pull_request_id + JOIN augur_data.repo on repo.repo_id = pull_requests.repo_id + LEFT OUTER JOIN augur_data.pull_request_message_ref on pull_requests.pull_request_id = pull_request_message_ref.pull_request_id + LEFT OUTER JOIN augur_data.message on pull_request_message_ref.msg_id = augur_data.message.msg_id + --WHERE repo.repo_id = {repo_id} + GROUP BY pull_requests.pull_request_id + ) response_times + ON pull_requests.pull_request_id = response_times.pull_request_id + LEFT JOIN ( + SELECT pull_request_commits.pull_request_id, count(DISTINCT pr_cmt_sha) AS commit_count + FROM augur_data.pull_request_commits, augur_data.pull_requests, augur_data.pull_request_meta + WHERE pull_requests.pull_request_id = pull_request_commits.pull_request_id + AND pull_requests.pull_request_id = pull_request_meta.pull_request_id + --AND pull_requests.repo_id = {repo_id} + AND pr_cmt_sha <> pull_requests.pr_merge_commit_sha + AND pr_cmt_sha <> pull_request_meta.pr_sha + GROUP BY pull_request_commits.pull_request_id + ) all_commit_counts + ON pull_requests.pull_request_id = all_commit_counts.pull_request_id + LEFT JOIN ( + SELECT MAX(pr_repo_meta_id), pull_request_meta.pull_request_id, pr_head_or_base, pr_src_meta_label + FROM augur_data.pull_requests, augur_data.pull_request_meta + WHERE pull_requests.pull_request_id = pull_request_meta.pull_request_id + --AND pull_requests.repo_id = {repo_id} + AND pr_head_or_base = 'base' + GROUP BY pull_request_meta.pull_request_id, pr_head_or_base, pr_src_meta_label + ) base_labels + ON base_labels.pull_request_id = all_commit_counts.pull_request_id + LEFT JOIN ( + SELECT sum(cmt_added) AS 
lines_added, sum(cmt_removed) AS lines_removed, pull_request_commits.pull_request_id, count(DISTINCT cmt_filename) AS file_count + FROM augur_data.pull_request_commits, augur_data.commits, augur_data.pull_requests, augur_data.pull_request_meta + WHERE cmt_commit_hash = pr_cmt_sha + AND pull_requests.pull_request_id = pull_request_commits.pull_request_id + AND pull_requests.pull_request_id = pull_request_meta.pull_request_id + --AND pull_requests.repo_id = {repo_id} + AND commits.repo_id = pull_requests.repo_id + AND commits.cmt_commit_hash <> pull_requests.pr_merge_commit_sha + AND commits.cmt_commit_hash <> pull_request_meta.pr_sha + GROUP BY pull_request_commits.pull_request_id + ) master_merged_counts + ON base_labels.pull_request_id = master_merged_counts.pull_request_id + WHERE + repo.repo_group_id = repo_groups.repo_group_id + AND repo.repo_id = pull_requests.repo_id + --AND repo.repo_id = {repo_id} + ORDER BY + merged_count DESC From 5384e0fc48561bf3b41c7f140d06136275f09b85 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Mon, 13 May 2024 19:27:53 -0500 Subject: [PATCH 02/44] Update build_docker.yml Signed-off-by: Sean P. Goggins --- .github/workflows/build_docker.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build_docker.yml b/.github/workflows/build_docker.yml index 0cf2441838..b29ab2ed89 100644 --- a/.github/workflows/build_docker.yml +++ b/.github/workflows/build_docker.yml @@ -3,9 +3,11 @@ on: push: branches: - main + - dev pull_request: branches: - main + - dev release: types: - published From 16f6d6b3e4d96514542a39b4be3f70b55f20d4a7 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 21 May 2024 13:33:21 +0300 Subject: [PATCH 03/44] commented out the rebuilding of the dm_ tables. This should be rebuilt using a materialized view. Signed-off-by: Sean P. 
Goggins --- .../facade_worker/rebuildcache.py | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py b/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py index 5668739767..e4697dbc19 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py @@ -396,7 +396,8 @@ def rebuild_unknown_affiliation_and_web_caches(session): # ("DELETE c.* FROM dm_repo_group_weekly c " # "JOIN repo_groups p ON c.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_group_weekly) + +# session.execute_sql(clear_dm_repo_group_weekly) clear_dm_repo_group_monthly = s.sql.text(""" DELETE @@ -410,7 +411,8 @@ def rebuild_unknown_affiliation_and_web_caches(session): # ("DELETE c.* FROM dm_repo_group_monthly c " # "JOIN repo_groups p ON c.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_group_monthly) + +# session.execute_sql(clear_dm_repo_group_monthly) clear_dm_repo_group_annual = s.sql.text(""" DELETE @@ -424,7 +426,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): # ("DELETE c.* FROM dm_repo_group_annual c " # "JOIN repo_groups p ON c.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_group_annual) +# session.execute_sql(clear_dm_repo_group_annual) clear_dm_repo_weekly = s.sql.text(""" DELETE @@ -441,7 +443,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): # "JOIN repo r ON c.repo_id = r.repo_id " # "JOIN repo_groups p ON r.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_weekly) +# session.execute_sql(clear_dm_repo_weekly) clear_dm_repo_monthly = s.sql.text(""" DELETE @@ -458,7 +460,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): # "JOIN repo r ON c.repo_id = r.repo_id 
" # "JOIN repo_groups p ON r.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_monthly) +# session.execute_sql(clear_dm_repo_monthly) clear_dm_repo_annual = s.sql.text(""" DELETE @@ -475,7 +477,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): # "JOIN repo r ON c.repo_id = r.repo_id " # "JOIN repo_groups p ON r.repo_group_id = p.repo_group_id WHERE " # "p.rg_recache=TRUE") - session.execute_sql(clear_dm_repo_annual) +# session.execute_sql(clear_dm_repo_annual) clear_unknown_cache = s.sql.text(""" DELETE @@ -573,7 +575,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): "r.repo_group_id, info.a, info.b, info.c") ).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) - session.execute_sql(cache_projects_by_week) +# session.execute_sql(cache_projects_by_week) cache_projects_by_month = s.sql.text( ("INSERT INTO dm_repo_group_monthly (repo_group_id, email, affiliation, month, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source) " @@ -609,7 +611,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): "r.repo_group_id, info.a, info.b, info.c" )).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) - session.execute_sql(cache_projects_by_month) +# session.execute_sql(cache_projects_by_month) cache_projects_by_year = s.sql.text(( "INSERT INTO dm_repo_group_annual (repo_group_id, email, affiliation, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source) " @@ -649,7 +651,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): - session.execute_sql(cache_projects_by_year) + # session.execute_sql(cache_projects_by_year) # Start caching by repo session.log_activity('Verbose','Caching repos') @@ -689,7 +691,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): "a.repo_id, info.a, info.b, info.c" 
)).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) - session.execute_sql(cache_repos_by_week) +# session.execute_sql(cache_repos_by_week) cache_repos_by_month = s.sql.text(( "INSERT INTO dm_repo_monthly (repo_id, email, affiliation, month, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source)" @@ -725,7 +727,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): "a.repo_id, info.a, info.b, info.c" )).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) - session.execute_sql(cache_repos_by_month) +# session.execute_sql(cache_repos_by_month) cache_repos_by_year = s.sql.text(( "INSERT INTO dm_repo_annual (repo_id, email, affiliation, year, added, removed, whitespace, files, patches, tool_source, tool_version, data_source)" @@ -759,7 +761,7 @@ def rebuild_unknown_affiliation_and_web_caches(session): "a.repo_id, info.a, info.b, info.c" )).bindparams(tool_source=session.tool_source,tool_version=session.tool_version,data_source=session.data_source) - session.execute_sql(cache_repos_by_year) +# session.execute_sql(cache_repos_by_year) # Reset cache flags From 2c8e856c26421f65d82bbe790c6b501d541fd76b Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Fri, 24 May 2024 10:32:30 +0300 Subject: [PATCH 04/44] checking pr file errors with more logging Signed-off-by: Sean P. 
Goggins --- augur/tasks/github/util/gh_graphql_entities.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index 574adbbaf0..dd92e3a37f 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -250,9 +250,11 @@ def hit_api(self,query,variables={}): def extract_paginate_result(self,responseDict): if not responseDict: + self.logger.error(f"DEBUG CHECK THIS {responseDict}") raise TimeoutError("No data received from endpoint.") #err = process_graphql_dict_response(self.logger, responseObject, response) if 'data' not in responseDict: + self.logger.error(f"DEBUG CHECK THIS {responseDict}") self.logger.error(responseDict) raise KeyError From 04b9987e81604d6a8f6ccc54e387cee324da8e90 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Fri, 24 May 2024 11:59:41 +0300 Subject: [PATCH 05/44] more debugging of pr files Signed-off-by: Sean P. Goggins --- augur/tasks/github/pull_requests/files_model/core.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/augur/tasks/github/pull_requests/files_model/core.py b/augur/tasks/github/pull_requests/files_model/core.py index 138aa61cb3..004642408b 100644 --- a/augur/tasks/github/pull_requests/files_model/core.py +++ b/augur/tasks/github/pull_requests/files_model/core.py @@ -65,6 +65,8 @@ def pull_request_files_model(repo_id,logger, augur_db, key_auth): file_collection = GraphQlPageCollection(query, key_auth, logger,bind=params) + logger.debug(f"Results of file_collection: {file_collection}") + pr_file_rows += [{ 'pull_request_id': pr_info['pull_request_id'], 'pr_file_additions': pr_file['additions'] if 'additions' in pr_file else None, From 8ad033fda8eb16485aa8663edae0966808eb4ed0 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Fri, 24 May 2024 12:11:41 +0300 Subject: [PATCH 06/44] update Signed-off-by: Sean P. 
Goggins --- augur/tasks/github/pull_requests/files_model/core.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/augur/tasks/github/pull_requests/files_model/core.py b/augur/tasks/github/pull_requests/files_model/core.py index 004642408b..f7b16c8a49 100644 --- a/augur/tasks/github/pull_requests/files_model/core.py +++ b/augur/tasks/github/pull_requests/files_model/core.py @@ -67,6 +67,11 @@ def pull_request_files_model(repo_id,logger, augur_db, key_auth): logger.debug(f"Results of file_collection: {file_collection}") + for pr_file in file_collection: + logger.debug(f"CHECK: {pr_file['path']}") + if pr_file and 'path' in pr_file: + logger.debug(f"Checks out for {pr_info['pull_request_id']}") + pr_file_rows += [{ 'pull_request_id': pr_info['pull_request_id'], 'pr_file_additions': pr_file['additions'] if 'additions' in pr_file else None, From d66bf45d6a2649de3e07a09313111897340b343d Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Fri, 24 May 2024 12:42:54 +0300 Subject: [PATCH 07/44] debugging files Signed-off-by: Sean P. 
Goggins --- augur/tasks/github/util/gh_graphql_entities.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index dd92e3a37f..0667ab3315 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -295,6 +295,8 @@ def __getitem__(self, index):# -> dict: #extract the content from the graphql query result coreData = self.extract_paginate_result(data) + self.logger.debug(f"for page in range 298: {data}") + content = [data['node'] for data in list(coreData['edges'])] if self.repaginate: @@ -325,6 +327,8 @@ def __len__(self): data = self.request_graphql_dict(variables=params) coreData = self.extract_paginate_result(data) + self.logger.debug(f"__len__: debug: {data}") + totalCount = int(coreData['totalCount']) return totalCount @@ -344,7 +348,7 @@ def __iter__(self): coreData = self.extract_paginate_result(data) if coreData is not None: if coreData.get('totalCount') is not None: - self.logger.info("... core data obtained") + self.logger.info("debug-gog: ... core data obtained") else: self.logger.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") coreData['totalCount'] = 0 @@ -386,6 +390,7 @@ def __iter__(self): data = self.request_graphql_dict(variables=params) coreData = self.extract_paginate_result(data) + self.logger.debug(f"while core data: {data}") #print(coreData) if len(coreData['edges']) == 0: From 72b28a739ada90e9eac827e0a0fca8c14f9a8b2d Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Fri, 24 May 2024 12:47:42 +0300 Subject: [PATCH 08/44] resorting to traceback Signed-off-by: Sean P. 
Goggins --- .../github/pull_requests/files_model/core.py | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/augur/tasks/github/pull_requests/files_model/core.py b/augur/tasks/github/pull_requests/files_model/core.py index f7b16c8a49..752a9d1d1d 100644 --- a/augur/tasks/github/pull_requests/files_model/core.py +++ b/augur/tasks/github/pull_requests/files_model/core.py @@ -3,6 +3,7 @@ from augur.application.db.models import * from augur.tasks.github.util.util import get_owner_repo from augur.application.db.util import execute_session_query +import traceback def pull_request_files_model(repo_id,logger, augur_db, key_auth): @@ -72,14 +73,18 @@ def pull_request_files_model(repo_id,logger, augur_db, key_auth): if pr_file and 'path' in pr_file: logger.debug(f"Checks out for {pr_info['pull_request_id']}") - pr_file_rows += [{ - 'pull_request_id': pr_info['pull_request_id'], - 'pr_file_additions': pr_file['additions'] if 'additions' in pr_file else None, - 'pr_file_deletions': pr_file['deletions'] if 'deletions' in pr_file else None, - 'pr_file_path': pr_file['path'], - 'data_source': 'GitHub API', - 'repo_id': repo_id, - } for pr_file in file_collection if pr_file and 'path' in pr_file] + try: + pr_file_rows += [{ + 'pull_request_id': pr_info['pull_request_id'], + 'pr_file_additions': pr_file['additions'] if 'additions' in pr_file else None, + 'pr_file_deletions': pr_file['deletions'] if 'deletions' in pr_file else None, + 'pr_file_path': pr_file['path'], + 'data_source': 'GitHub API', + 'repo_id': repo_id, + } for pr_file in file_collection if pr_file and 'path' in pr_file] + except Exception as e: + logger.error(f"PR Files Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") + if len(pr_file_rows) > 0: From e13776c01f95a258b75d258b05c2f91e11bc0360 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Fri, 24 May 2024 13:33:22 +0300 Subject: [PATCH 09/44] fixing error Signed-off-by: Sean P. 
Goggins --- augur/tasks/github/pull_requests/files_model/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/augur/tasks/github/pull_requests/files_model/core.py b/augur/tasks/github/pull_requests/files_model/core.py index 752a9d1d1d..cf42c9f341 100644 --- a/augur/tasks/github/pull_requests/files_model/core.py +++ b/augur/tasks/github/pull_requests/files_model/core.py @@ -69,9 +69,9 @@ def pull_request_files_model(repo_id,logger, augur_db, key_auth): logger.debug(f"Results of file_collection: {file_collection}") for pr_file in file_collection: - logger.debug(f"CHECK: {pr_file['path']}") + logger.debug(f"CHECK: {repr(file_collection)}") if pr_file and 'path' in pr_file: - logger.debug(f"Checks out for {pr_info['pull_request_id']}") + logger.debug(f"Checks out for {repr(pr_file)} and {repr(file_collection)}") try: pr_file_rows += [{ From 023477fef9a05d3b32e1a7aa1621c1b2c79b9c34 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Wed, 29 May 2024 17:29:13 +0300 Subject: [PATCH 10/44] logging removeal Signed-off-by: Sean P. 
Goggins --- augur/tasks/github/pull_requests/files_model/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/github/pull_requests/files_model/core.py b/augur/tasks/github/pull_requests/files_model/core.py index cf42c9f341..fc97b2b599 100644 --- a/augur/tasks/github/pull_requests/files_model/core.py +++ b/augur/tasks/github/pull_requests/files_model/core.py @@ -63,7 +63,7 @@ def pull_request_files_model(repo_id,logger, augur_db, key_auth): 'values' : values } - + logger.debug(f"query: {query}; key_auth: {key_auth}; params: {params}") file_collection = GraphQlPageCollection(query, key_auth, logger,bind=params) logger.debug(f"Results of file_collection: {file_collection}") From 3de5ba9d511cd5b1ca77227528c78f18686b464a Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 3 Jun 2024 18:51:42 -0500 Subject: [PATCH 11/44] use correct get active repo count Signed-off-by: Andrew Brain --- augur/tasks/util/collection_util.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index f274a286da..c70740b570 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -129,13 +129,9 @@ def __init__(self,name,phases,max_repo = 10,days_until_collect_again = 1, gitlab if name == "facade": self.new_status = CollectionState.UPDATE.value - def get_active_repo_count(self,session): - return len(session.query(CollectionStatus).filter(getattr(CollectionStatus,f"{self.name}_status" ) == CollectionState.COLLECTING.value).all()) - - def get_valid_repos(self,session): - active_repo_count = self.get_active_repo_count(session) + active_repo_count = get_active_repo_count(session) limit = self.max_repo-active_repo_count if limit <= 0: From 0c5d7348950ae2a59459c2c90df9db64cbfc7dc8 Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 3 Jun 2024 18:55:55 -0500 Subject: [PATCH 12/44] Pass correct thign Signed-off-by: Andrew Brain --- 
augur/tasks/util/collection_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index c70740b570..f8156c8bf9 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -131,7 +131,7 @@ def __init__(self,name,phases,max_repo = 10,days_until_collect_again = 1, gitlab def get_valid_repos(self,session): - active_repo_count = get_active_repo_count(session) + active_repo_count = get_active_repo_count(self.name) limit = self.max_repo-active_repo_count if limit <= 0: From 29790915c9c346c0f6ab8fe23b744dc5a20f86fa Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 3 Jun 2024 19:02:03 -0500 Subject: [PATCH 13/44] Fix Signed-off-by: Andrew Brain --- augur/tasks/start_tasks.py | 56 ++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index ca1401d88d..117fd9497f 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -154,7 +154,7 @@ def non_repo_domain_tasks(self): tasks.apply_async() -def build_primary_repo_collect_request(logger, enabled_phase_names, days_until_collect_again = 1): +def build_primary_repo_collect_request(session, logger, enabled_phase_names, days_until_collect_again = 1): #Add all required tasks to a list and pass it to the CollectionRequest primary_enabled_phases = [] primary_gitlab_enabled_phases = [] @@ -174,10 +174,10 @@ def core_task_success_util_gen(repo_git): primary_gitlab_enabled_phases.append(core_task_success_util_gen) primary_request = CollectionRequest("core",primary_enabled_phases,max_repo=40, days_until_collect_again=7, gitlab_phases=primary_gitlab_enabled_phases) - primary_request.get_valid_repos(logger) + primary_request.get_valid_repos(session) return primary_request -def build_secondary_repo_collect_request(logger, enabled_phase_names, days_until_collect_again = 1): +def 
build_secondary_repo_collect_request(session, logger, enabled_phase_names, days_until_collect_again = 1): #Deal with secondary collection secondary_enabled_phases = [] @@ -193,11 +193,11 @@ def secondary_task_success_util_gen(repo_git): secondary_enabled_phases.append(secondary_task_success_util_gen) request = CollectionRequest("secondary",secondary_enabled_phases,max_repo=10, days_until_collect_again=10) - request.get_valid_repos(logger) + request.get_valid_repos(session) return request -def build_facade_repo_collect_request(logger, enabled_phase_names, days_until_collect_again = 1): +def build_facade_repo_collect_request(session, logger, enabled_phase_names, days_until_collect_again = 1): #Deal with facade collection facade_enabled_phases = [] @@ -215,10 +215,10 @@ def facade_task_update_weight_util_gen(repo_git): request = CollectionRequest("facade",facade_enabled_phases,max_repo=30, days_until_collect_again=7) - request.get_valid_repos(logger) + request.get_valid_repos(session) return request -def build_ml_repo_collect_request(logger, enabled_phase_names, days_until_collect_again = 1): +def build_ml_repo_collect_request(session, logger, enabled_phase_names, days_until_collect_again = 1): ml_enabled_phases = [] ml_enabled_phases.append(machine_learning_phase) @@ -229,7 +229,7 @@ def ml_task_success_util_gen(repo_git): ml_enabled_phases.append(ml_task_success_util_gen) request = CollectionRequest("ml",ml_enabled_phases,max_repo=5, days_until_collect_again=10) - request.get_valid_repos(logger) + request.get_valid_repos(session) return request @celery.task(bind=True) @@ -247,26 +247,28 @@ def augur_collection_monitor(self): enabled_collection_hooks = [] - if primary_repo_collect_phase.__name__ in enabled_phase_names: - enabled_collection_hooks.append(build_primary_repo_collect_request(logger, enabled_phase_names)) - - if secondary_repo_collect_phase.__name__ in enabled_phase_names: - enabled_collection_hooks.append(build_secondary_repo_collect_request(logger, 
enabled_phase_names)) - #start_secondary_collection(session, max_repo=10) - - if facade_phase.__name__ in enabled_phase_names: - #start_facade_collection(session, max_repo=30) - enabled_collection_hooks.append(build_facade_repo_collect_request(logger, enabled_phase_names)) - - if machine_learning_phase.__name__ in enabled_phase_names: - enabled_collection_hooks.append(build_ml_repo_collect_request(logger, enabled_phase_names)) - #start_ml_collection(session,max_repo=5) - - logger.info(f"Starting collection phases: {[h.name for h in enabled_collection_hooks]}") - - main_routine = AugurTaskRoutine(logger, enabled_collection_hooks) + with get_session() as session: - main_routine.start_data_collection() + if primary_repo_collect_phase.__name__ in enabled_phase_names: + enabled_collection_hooks.append(build_primary_repo_collect_request(session, logger, enabled_phase_names)) + + if secondary_repo_collect_phase.__name__ in enabled_phase_names: + enabled_collection_hooks.append(build_secondary_repo_collect_request(session, logger, enabled_phase_names)) + #start_secondary_collection(session, max_repo=10) + + if facade_phase.__name__ in enabled_phase_names: + #start_facade_collection(session, max_repo=30) + enabled_collection_hooks.append(build_facade_repo_collect_request(session, logger, enabled_phase_names)) + + if machine_learning_phase.__name__ in enabled_phase_names: + enabled_collection_hooks.append(build_ml_repo_collect_request(session, logger, enabled_phase_names)) + #start_ml_collection(session,max_repo=5) + + logger.info(f"Starting collection phases: {[h.name for h in enabled_collection_hooks]}") + + main_routine = AugurTaskRoutine(logger, enabled_collection_hooks) + + main_routine.start_data_collection() # have a pipe of 180 From f8217ed9f3e2decaf191be223b52476cd0da982c Mon Sep 17 00:00:00 2001 From: Andrew Brain Date: Mon, 3 Jun 2024 19:07:08 -0500 Subject: [PATCH 14/44] Pass DatabaseSession Signed-off-by: Andrew Brain --- augur/tasks/start_tasks.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 117fd9497f..3f1793c79d 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -247,7 +247,7 @@ def augur_collection_monitor(self): enabled_collection_hooks = [] - with get_session() as session: + with DatabaseSession(logger, self.app.engine) as session: if primary_repo_collect_phase.__name__ in enabled_phase_names: enabled_collection_hooks.append(build_primary_repo_collect_request(session, logger, enabled_phase_names)) From fc686e55b83379ef97b953b374ade7697087704e Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 4 Jun 2024 16:12:55 -0500 Subject: [PATCH 15/44] fixing scorecard Signed-off-by: Sean P. Goggins --- augur/tasks/git/dependency_tasks/core.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/augur/tasks/git/dependency_tasks/core.py b/augur/tasks/git/dependency_tasks/core.py index 167a450f4b..5eb83a0ece 100644 --- a/augur/tasks/git/dependency_tasks/core.py +++ b/augur/tasks/git/dependency_tasks/core.py @@ -85,7 +85,19 @@ def generate_scorecard(logger, repo_git): key_handler = GithubApiKeyHandler(logger) os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key() + # start + + #setting the environmental variable which is required by scorecard + key_handler = GithubApiKeyHandler(session, session.logger) + os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key() + try: + required_output = parse_json_from_subprocess_call(session.logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard) + except Exception as e: + session.logger.error(f"Could not parse required output! Error: {e}") + raise e + + # end logger.info('adding to database...') logger.debug(f"output: {required_output}") From 7b914d7b854e49b385bf685b238a771d447ac375 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 4 Jun 2024 16:39:04 -0500 Subject: [PATCH 16/44] fixing augur startup issue Signed-off-by: Sean P. 
Goggins --- augur/application/logs.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/augur/application/logs.py b/augur/application/logs.py index 11e1cb6ea5..0d6649ce48 100644 --- a/augur/application/logs.py +++ b/augur/application/logs.py @@ -36,12 +36,29 @@ def getFormatter(logLevel): return logging.Formatter(fmt=ERROR_FORMAT_STRING) # create a file handler and set the format and log level -def create_file_handler(file, formatter, level): - handler = FileHandler(filename=file, mode='a') - handler.setFormatter(fmt=formatter) - handler.setLevel(level) +# def create_file_handler(file, formatter, level): +# handler = FileHandler(filename=file, mode='a') +# handler.setFormatter(fmt=formatter) +# handler.setLevel(level) + +# return handler - return handler +def create_file_handler(file, formatter, level): + try: + # Ensure the directory exists + directory = os.path.dirname(file) + if not os.path.exists(directory): + os.makedirs(directory) + + # Create the file handler + handler = logging.FileHandler(filename=file, mode='a') + handler.setFormatter(formatter) + handler.setLevel(level) + + return handler + except Exception as e: + print(f"Failed to create file handler: {e}") + return None # function to create two file handlers and add them to a logger def initialize_file_handlers(logger, file, log_level): From e2217e595ea9ccf23ffaca1a9ef91f23cd2a3ee3 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 4 Jun 2024 17:04:20 -0500 Subject: [PATCH 17/44] updated for facade repo out of sync error Signed-off-by: Sean P. 
Goggins --- augur/tasks/git/util/facade_worker/facade_worker/repofetch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py b/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py index 874f338902..dc2d22300c 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py @@ -315,7 +315,7 @@ def git_repo_updates(facade_helper, repo_git): try: - firstpull = (f"git -C {absolute_path} pull") + firstpull = (f"git -C {absolute_path} pull --rebase") return_code_remote = subprocess.Popen( [firstpull], shell=True).wait() From c2ee07c4a188b817f9d4add35262ce64c57b56d1 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 4 Jun 2024 17:14:49 -0500 Subject: [PATCH 18/44] fixing dependency logic issue Signed-off-by: Sean P. Goggins --- augur/tasks/git/dependency_tasks/core.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/augur/tasks/git/dependency_tasks/core.py b/augur/tasks/git/dependency_tasks/core.py index 5eb83a0ece..626b45484e 100644 --- a/augur/tasks/git/dependency_tasks/core.py +++ b/augur/tasks/git/dependency_tasks/core.py @@ -6,6 +6,7 @@ from augur.tasks.git.dependency_tasks.dependency_util import dependency_calculator as dep_calc from augur.tasks.util.worker_util import parse_json_from_subprocess_call from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path +from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth def generate_deps_data(logger, repo_git): @@ -82,14 +83,14 @@ def generate_scorecard(logger, repo_git): #setting the environmental variable which is required by scorecard with get_session() as session: - - key_handler = GithubApiKeyHandler(logger) + key_handler = GithubRandomKeyAuth(logger) + #key_handler = GithubApiKeyHandler(logger) os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key() 
- # start + # This seems outdated #setting the environmental variable which is required by scorecard - key_handler = GithubApiKeyHandler(session, session.logger) - os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key() + #key_handler = GithubApiKeyHandler(session, session.logger) + #os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key() try: required_output = parse_json_from_subprocess_call(session.logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard) From c3f4eaab8c266f744a681b36d93d0b53b5cc25dc Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 4 Jun 2024 17:25:23 -0500 Subject: [PATCH 19/44] node dependency checker Signed-off-by: Sean P. Goggins --- .../libyear_util/npm_libyear_utils.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/augur/tasks/git/dependency_libyear_tasks/libyear_util/npm_libyear_utils.py b/augur/tasks/git/dependency_libyear_tasks/libyear_util/npm_libyear_utils.py index bcfe810a9c..9dfa10b4ea 100644 --- a/augur/tasks/git/dependency_libyear_tasks/libyear_util/npm_libyear_utils.py +++ b/augur/tasks/git/dependency_libyear_tasks/libyear_util/npm_libyear_utils.py @@ -1,4 +1,7 @@ import requests +import logging + +logger = logging.getLogger(__name__) def get_NPM_data(package): url = "https://registry.npmjs.org/%s" % package @@ -42,10 +45,16 @@ def get_latest_patch(version, data): def get_lastest_minor(version, data): - versions = data['versions'] + try: + versions = data['versions'] + except Exception as e: + logger.info(f'error is {e} on the NPM. Hey, its NODEJS, of course it does not work :D ') + raise e + try: index = list(versions.keys()).index(version) except ValueError as e: + logger.info(f'error is {e} on the NPM. Some kind of value error. 
Probably a VALUES error for Node, #AmIRight?') raise e major,minor,patch = split_version(version) From 90b149c8952afe6e4d2f5557c71237e70b17ceb6 Mon Sep 17 00:00:00 2001 From: Sean Goggins Date: Tue, 4 Jun 2024 17:28:07 -0500 Subject: [PATCH 20/44] version update Signed-off-by: Sean Goggins --- README.md | 4 +- flower_db | Bin 0 -> 16384 bytes scripts/install/contributor.sql | 250 ++++++++++++++++++ scripts/install/explorer-index.sql | 5 + .../explorer_contributor_recent_actions.sql | 104 ++++++++ scripts/install/matview.sh | 115 ++++++++ scripts/install/matview.sql | 112 ++++++++ 7 files changed, 588 insertions(+), 2 deletions(-) create mode 100644 flower_db create mode 100644 scripts/install/contributor.sql create mode 100644 scripts/install/explorer-index.sql create mode 100644 scripts/install/explorer_contributor_recent_actions.sql create mode 100755 scripts/install/matview.sh create mode 100644 scripts/install/matview.sql diff --git a/README.md b/README.md index 02ec125fb6..c0c99157cb 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Augur NEW Release v0.70.0 +# Augur NEW Release v0.71.0 Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data. Less data carpentry for everyone else! The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot) ... A public instance of 8Knot is available at https://metrix.chaoss.io ... That is tied to a public instance of Augur at https://ai.chaoss.io @@ -10,7 +10,7 @@ The primary way of looking at Augur data is through [8Knot](https://github.com/o ## NEW RELEASE ALERT! ### [If you want to jump right in, updated docker build/compose and bare metal installation instructions are available here](docs/new-install.md) -Augur is now releasing a dramatically improved new version to the main branch. 
It is also available here: https://github.com/chaoss/augur/releases/tag/v0.70.0 +Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.71.0 - The `main` branch is a stable version of our new architecture, which features: - Dramatic improvement in the speed of large scale data collection (100,000+ repos). All data is obtained for 100k+ repos within 2 weeks. diff --git a/flower_db b/flower_db new file mode 100644 index 0000000000000000000000000000000000000000..f03fe98e2b3029b3d1103880728d018993291465 GIT binary patch literal 16384 zcmeI%!3l#v5Czc5(VLJ4EWr}&BE4Ba+OZ))-K^q3yyg&j!tTHxL>_$blTlfqS3_^nf1F z1A0IY=m9-&vmSWVTYdij?~jAu%w>D7-t4@paNv;dhdjuGJY)fR$Qt@V9^@el$V1l9 h5Aq-nSwJ4XHJlJ2K!5-N0t5&UAV7cs0RndxSOLec*~S0> literal 0 HcmV?d00001 diff --git a/scripts/install/contributor.sql b/scripts/install/contributor.sql new file mode 100644 index 0000000000..7632f9706e --- /dev/null +++ b/scripts/install/contributor.sql @@ -0,0 +1,250 @@ +create materialized view augur_data.explorer_contributor_metrics as + SELECT * FROM ( + SELECT ID AS + cntrb_id, + A.created_at AS created_at, + date_part('month', A.created_at::DATE) AS month, + date_part('year', A.created_at::DATE) AS year, + A.repo_id, + repo_name, + full_name, + login, + ACTION, + rank() OVER ( + PARTITION BY id + ORDER BY A.created_at ASC + ) + FROM + ( + ( + SELECT + canonical_id AS ID, + created_at AS created_at, + repo_id, + 'issue_opened' AS ACTION, + contributors.cntrb_full_name AS full_name, + contributors.cntrb_login AS login + FROM + augur_data.issues + LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = issues.reporter_id + LEFT OUTER JOIN ( + SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, + cntrb_canonical AS canonical_email, + data_collection_date, + cntrb_id AS canonical_id + FROM augur_data.contributors + WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical + ) 
canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical + WHERE + --repo_id = {repo_id} + pull_request IS NULL + GROUP BY + canonical_id, + repo_id, + issues.created_at, + contributors.cntrb_full_name, + contributors.cntrb_login + ) UNION ALL + ( + SELECT + canonical_id AS ID, + TO_TIMESTAMP( cmt_author_date, 'YYYY-MM-DD' ) AS created_at, + repo_id, + 'commit' AS ACTION, + contributors.cntrb_full_name AS full_name, + contributors.cntrb_login AS login + FROM + augur_data.commits + LEFT OUTER JOIN augur_data.contributors ON cntrb_email = cmt_author_email + LEFT OUTER JOIN ( + SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, + cntrb_canonical AS canonical_email, + data_collection_date, cntrb_id AS canonical_id + FROM augur_data.contributors + WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical + ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical + --WHERE + -- repo_id = {repo_id} + GROUP BY + repo_id, + canonical_email, + canonical_id, + commits.cmt_author_date, + contributors.cntrb_full_name, + contributors.cntrb_login + ) UNION ALL + ( + SELECT + message.cntrb_id AS ID, + created_at AS created_at, + commits.repo_id, + 'commit_comment' AS ACTION, + contributors.cntrb_full_name AS full_name, + contributors.cntrb_login AS login + + FROM + augur_data.commit_comment_ref, + augur_data.commits, + augur_data.message + LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = message.cntrb_id + LEFT OUTER JOIN ( + SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, + cntrb_canonical AS canonical_email, + data_collection_date, cntrb_id AS canonical_id + FROM augur_data.contributors + WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical + ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical + WHERE + commits.cmt_id = commit_comment_ref.cmt_id + -- AND commits.repo_id = {repo_id} + AND commit_comment_ref.msg_id = 
message.msg_id + + GROUP BY + ID, + commits.repo_id, + commit_comment_ref.created_at, + contributors.cntrb_full_name, + contributors.cntrb_login + ) UNION ALL + ( + SELECT + issue_events.cntrb_id AS ID, + issue_events.created_at AS created_at, + issues.repo_id, + 'issue_closed' AS ACTION, + contributors.cntrb_full_name AS full_name, + contributors.cntrb_login AS login + FROM + augur_data.issues, + augur_data.issue_events + LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = issue_events.cntrb_id + LEFT OUTER JOIN ( + SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, + cntrb_canonical AS canonical_email, + data_collection_date, + cntrb_id AS canonical_id + FROM augur_data.contributors + WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical + ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical + WHERE + --issues.repo_id = {repo_id} + issues.issue_id = issue_events.issue_id + AND issues.pull_request IS NULL + AND issue_events.cntrb_id IS NOT NULL + AND ACTION = 'closed' + GROUP BY + issue_events.cntrb_id, + issues.repo_id, + issue_events.created_at, + contributors.cntrb_full_name, + contributors.cntrb_login + ) UNION ALL + ( + SELECT + pr_augur_contributor_id AS ID, + pr_created_at AS created_at, + pull_requests.repo_id, + 'open_pull_request' AS ACTION, + contributors.cntrb_full_name AS full_name, + contributors.cntrb_login AS login + FROM + augur_data.pull_requests + LEFT OUTER JOIN augur_data.contributors ON pull_requests.pr_augur_contributor_id = contributors.cntrb_id + LEFT OUTER JOIN ( + SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, + cntrb_canonical AS canonical_email, + data_collection_date, + cntrb_id AS canonical_id + FROM augur_data.contributors + WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical + ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical + -- WHERE + --pull_requests.repo_id = {repo_id} + GROUP BY + 
pull_requests.pr_augur_contributor_id, + pull_requests.repo_id, + pull_requests.pr_created_at, + contributors.cntrb_full_name, + contributors.cntrb_login + ) UNION ALL + ( + SELECT + message.cntrb_id AS ID, + msg_timestamp AS created_at, + pull_requests.repo_id as repo_id, + 'pull_request_comment' AS ACTION, + contributors.cntrb_full_name AS full_name, + contributors.cntrb_login AS login + FROM + augur_data.pull_requests, + augur_data.pull_request_message_ref, + augur_data.message + LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = message.cntrb_id + LEFT OUTER JOIN ( + SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, + cntrb_canonical AS canonical_email, + data_collection_date, + cntrb_id AS canonical_id + FROM augur_data.contributors + WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical + ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical + WHERE + -- pull_requests.repo_id = {repo_id} + pull_request_message_ref.pull_request_id = pull_requests.pull_request_id + AND pull_request_message_ref.msg_id = message.msg_id + GROUP BY + message.cntrb_id, + pull_requests.repo_id, + message.msg_timestamp, + contributors.cntrb_full_name, + contributors.cntrb_login + ) UNION ALL + ( + SELECT + issues.reporter_id AS ID, + msg_timestamp AS created_at, + issues.repo_id as repo_id, + 'issue_comment' AS ACTION, + contributors.cntrb_full_name AS full_name, + contributors.cntrb_login AS login + FROM + augur_data.issues, + augur_data.issue_message_ref, + augur_data.message + LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = message.cntrb_id + LEFT OUTER JOIN ( + SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, + cntrb_canonical AS canonical_email, + data_collection_date, + cntrb_id AS canonical_id + FROM augur_data.contributors + WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical + ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical + 
WHERE + --issues.repo_id = {repo_id} + issue_message_ref.msg_id = message.msg_id + AND issues.issue_id = issue_message_ref.issue_id + AND issues.pull_request_id = NULL + GROUP BY + issues.reporter_id, + issues.repo_id, + message.msg_timestamp, + contributors.cntrb_full_name, + contributors.cntrb_login + ) + ) A, + augur_data.repo + WHERE + ID IS NOT NULL + AND A.repo_id = repo.repo_id + GROUP BY + A.ID, + A.repo_id, + A.ACTION, + A.created_at, + repo.repo_name, + A.full_name, + A.login + ORDER BY + cntrb_id + ) b diff --git a/scripts/install/explorer-index.sql b/scripts/install/explorer-index.sql new file mode 100644 index 0000000000..95ae45be6a --- /dev/null +++ b/scripts/install/explorer-index.sql @@ -0,0 +1,5 @@ + +-- View indexes: +CREATE UNIQUE INDEX explorer_contributor_recent_actions_unique_idx ON augur_data.explorer_contributor_recent_actions USING btree (cntrb_id, created_at, repo_id, action, repo_name, login, rank); +CREATE INDEX explorer_contributor_recent_actions_cntrb_id_idx ON augur_data.explorer_contributor_recent_actions USING btree (cntrb_id); +CREATE INDEX explorer_contributor_recent_actions_repo_id_idx ON augur_data.explorer_contributor_recent_actions USING btree (repo_id DESC); diff --git a/scripts/install/explorer_contributor_recent_actions.sql b/scripts/install/explorer_contributor_recent_actions.sql new file mode 100644 index 0000000000..2368be5a44 --- /dev/null +++ b/scripts/install/explorer_contributor_recent_actions.sql @@ -0,0 +1,104 @@ +-- augur_data.explorer_contributor_recent_actions source +DROP MATERIALIZED VIEW if exists augur_data.explorer_contributor_recent_actions; +CREATE MATERIALIZED VIEW augur_data.explorer_contributor_recent_actions +AS SELECT a.id AS cntrb_id, + a.created_at, + a.repo_id, + a.action, + repo.repo_name, + a.login, + row_number() OVER (PARTITION BY a.id, a.repo_id ORDER BY a.created_at DESC) AS rank + FROM ( SELECT commits.cmt_ght_author_id AS id, + commits.cmt_author_timestamp AS created_at, + commits.repo_id, 
+ 'commit'::text AS action, + contributors.cntrb_login AS login + FROM augur_data.commits + LEFT JOIN augur_data.contributors ON contributors.cntrb_id::text = commits.cmt_ght_author_id::text and commits.cmt_author_timestamp >= now() - interval '13 months' + GROUP BY commits.cmt_commit_hash, commits.cmt_ght_author_id, commits.repo_id, commits.cmt_author_timestamp, 'commit'::text, contributors.cntrb_login + UNION ALL + SELECT issues.reporter_id AS id, + issues.created_at, + issues.repo_id, + 'issue_opened'::text AS action, + contributors.cntrb_login AS login + FROM augur_data.issues + LEFT JOIN augur_data.contributors ON contributors.cntrb_id = issues.reporter_id and issues.created_at >= now() - interval '13 months' + WHERE issues.pull_request IS NULL + UNION ALL + SELECT pull_request_events.cntrb_id AS id, + pull_request_events.created_at, + pull_requests.repo_id, + 'pull_request_closed'::text AS action, + contributors.cntrb_login AS login + FROM augur_data.pull_requests, + augur_data.pull_request_events + LEFT JOIN augur_data.contributors ON contributors.cntrb_id = pull_request_events.cntrb_id and pull_request_events.created_at >= now() - interval '13 months' + WHERE pull_requests.pull_request_id = pull_request_events.pull_request_id AND pull_requests.pr_merged_at IS NULL AND pull_request_events.action::text = 'closed'::text + UNION ALL + SELECT pull_request_events.cntrb_id AS id, + pull_request_events.created_at, + pull_requests.repo_id, + 'pull_request_merged'::text AS action, + contributors.cntrb_login AS login + FROM augur_data.pull_requests, + augur_data.pull_request_events + LEFT JOIN augur_data.contributors ON contributors.cntrb_id = pull_request_events.cntrb_id and pull_request_events.created_at >= now() - interval '13 months' + WHERE pull_requests.pull_request_id = pull_request_events.pull_request_id AND pull_request_events.action::text = 'merged'::text + UNION ALL + SELECT issue_events.cntrb_id AS id, + issue_events.created_at, + issues.repo_id, + 
'issue_closed'::text AS action, + contributors.cntrb_login AS login + FROM augur_data.issues, + augur_data.issue_events + LEFT JOIN augur_data.contributors ON contributors.cntrb_id = issue_events.cntrb_id and issue_events.created_at >= now() - interval '13 months' + WHERE issues.issue_id = issue_events.issue_id AND issues.pull_request IS NULL AND issue_events.action::text = 'closed'::text + UNION ALL + SELECT pull_request_reviews.cntrb_id AS id, + pull_request_reviews.pr_review_submitted_at AS created_at, + pull_requests.repo_id, + 'pull_request_review_'::text || pull_request_reviews.pr_review_state::text AS action, + contributors.cntrb_login AS login + FROM augur_data.pull_requests, + augur_data.pull_request_reviews + LEFT JOIN augur_data.contributors ON contributors.cntrb_id = pull_request_reviews.cntrb_id and pull_request_reviews.pr_review_submitted_at >= now() - interval '13 months' + WHERE pull_requests.pull_request_id = pull_request_reviews.pull_request_id + UNION ALL + SELECT pull_requests.pr_augur_contributor_id AS id, + pull_requests.pr_created_at AS created_at, + pull_requests.repo_id, + 'pull_request_open'::text AS action, + contributors.cntrb_login AS login + FROM augur_data.pull_requests + LEFT JOIN augur_data.contributors ON pull_requests.pr_augur_contributor_id = contributors.cntrb_id and pull_requests.pr_created_at >= now() - interval '13 months' + UNION ALL + SELECT message.cntrb_id AS id, + message.msg_timestamp AS created_at, + pull_requests.repo_id, + 'pull_request_comment'::text AS action, + contributors.cntrb_login AS login + FROM augur_data.pull_requests, + augur_data.pull_request_message_ref, + augur_data.message + LEFT JOIN augur_data.contributors ON contributors.cntrb_id = message.cntrb_id + WHERE pull_request_message_ref.pull_request_id = pull_requests.pull_request_id AND pull_request_message_ref.msg_id = message.msg_id + and pull_requests.pr_created_at >= now() - interval '13 months' + UNION ALL + SELECT issues.reporter_id AS id, + 
message.msg_timestamp AS created_at, + issues.repo_id, + 'issue_comment'::text AS action, + contributors.cntrb_login AS login + FROM augur_data.issues, + augur_data.issue_message_ref, + augur_data.message + LEFT JOIN augur_data.contributors ON contributors.cntrb_id = message.cntrb_id and message.msg_timestamp >= now() - interval '13 months' + WHERE issue_message_ref.msg_id = message.msg_id AND issues.issue_id = issue_message_ref.issue_id AND issues.closed_at <> message.msg_timestamp) a, + augur_data.repo + WHERE a.repo_id = repo.repo_id and a.created_at >= now() - interval '13 months' + ORDER BY a.created_at DESC +WITH DATA; + +-- View indexes: diff --git a/scripts/install/matview.sh b/scripts/install/matview.sh new file mode 100755 index 0000000000..ab3c178e75 --- /dev/null +++ b/scripts/install/matview.sh @@ -0,0 +1,115 @@ +#!/bin/bash +set -eo pipefail + +psql -U augur -p 5432 -h data.chaoss.io augur < 'create materialized view augur_data.pull_request_metrics as + SELECT + repo.repo_id AS repo_id, + pull_requests.pr_src_id AS pr_src_id, + repo.repo_name AS repo_name, + pr_src_author_association, + repo_groups.rg_name AS repo_group, + pull_requests.pr_src_state, + pull_requests.pr_merged_at, + pull_requests.pr_created_at AS pr_created_at, + pull_requests.pr_closed_at AS pr_closed_at, + date_part( 'year', pr_created_at :: DATE ) AS CREATED_YEAR, + date_part( 'month', pr_created_at :: DATE ) AS CREATED_MONTH, + date_part( 'year', pr_closed_at :: DATE ) AS CLOSED_YEAR, + date_part( 'month', pr_closed_at :: DATE ) AS CLOSED_MONTH, + pr_src_meta_label, + pr_head_or_base, + ( EXTRACT ( EPOCH FROM pull_requests.pr_closed_at ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 3600 AS hours_to_close, + ( EXTRACT ( EPOCH FROM pull_requests.pr_closed_at ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 86400 AS days_to_close, + ( EXTRACT ( EPOCH FROM first_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 3600 AS 
hours_to_first_response, + ( EXTRACT ( EPOCH FROM first_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 86400 AS days_to_first_response, + ( EXTRACT ( EPOCH FROM last_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 3600 AS hours_to_last_response, + ( EXTRACT ( EPOCH FROM last_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 86400 AS days_to_last_response, + first_response_time, + last_response_time, + EXTRACT ( EPOCH FROM average_time_between_responses), + assigned_count, + review_requested_count, + labeled_count, + subscribed_count, + mentioned_count, + referenced_count, + closed_count, + head_ref_force_pushed_count, + merged_count::INT, + milestoned_count, + unlabeled_count, + head_ref_deleted_count, + comment_count, + COALESCE(lines_added, 0) as lines_added, + COALESCE(lines_removed, 0) as lines_removed, + commit_count, + COALESCE(file_count, 0) as file_count + FROM + augur_data.repo, + augur_data.repo_groups, + augur_data.pull_requests LEFT OUTER JOIN ( + SELECT pull_requests.pull_request_id, + count(*) FILTER (WHERE action = 'assigned') AS assigned_count, + count(*) FILTER (WHERE action = 'review_requested') AS review_requested_count, + count(*) FILTER (WHERE action = 'labeled') AS labeled_count, + count(*) FILTER (WHERE action = 'unlabeled') AS unlabeled_count, + count(*) FILTER (WHERE action = 'subscribed') AS subscribed_count, + count(*) FILTER (WHERE action = 'mentioned') AS mentioned_count, + count(*) FILTER (WHERE action = 'referenced') AS referenced_count, + count(*) FILTER (WHERE action = 'closed') AS closed_count, + count(*) FILTER (WHERE action = 'head_ref_force_pushed') AS head_ref_force_pushed_count, + count(*) FILTER (WHERE action = 'head_ref_deleted') AS head_ref_deleted_count, + count(*) FILTER (WHERE action = 'milestoned') AS milestoned_count, + COALESCE(count(*) FILTER (WHERE action = 'merged'), 0) AS merged_count, + COALESCE(MIN(message.msg_timestamp), 
pull_requests.pr_merged_at, pull_requests.pr_closed_at) AS first_response_time, + COALESCE(COUNT(DISTINCT message.msg_timestamp), 0) AS comment_count, + COALESCE(MAX(message.msg_timestamp), pull_requests.pr_closed_at) AS last_response_time, + COALESCE((MAX(message.msg_timestamp) - MIN(message.msg_timestamp)) / COUNT(DISTINCT message.msg_timestamp), pull_requests.pr_created_at - pull_requests.pr_closed_at) AS average_time_between_responses + FROM augur_data.pull_requests + LEFT OUTER JOIN augur_data.pull_request_events on augur_data.pull_requests.pull_request_id = augur_data.pull_request_events.pull_request_id + JOIN augur_data.repo on repo.repo_id = pull_requests.repo_id + LEFT OUTER JOIN augur_data.pull_request_message_ref on pull_requests.pull_request_id = pull_request_message_ref.pull_request_id + LEFT OUTER JOIN augur_data.message on pull_request_message_ref.msg_id = augur_data.message.msg_id + --WHERE repo.repo_id = {repo_id} + GROUP BY pull_requests.pull_request_id + ) response_times + ON pull_requests.pull_request_id = response_times.pull_request_id + LEFT JOIN ( + SELECT pull_request_commits.pull_request_id, count(DISTINCT pr_cmt_sha) AS commit_count + FROM augur_data.pull_request_commits, augur_data.pull_requests, augur_data.pull_request_meta + WHERE pull_requests.pull_request_id = pull_request_commits.pull_request_id + AND pull_requests.pull_request_id = pull_request_meta.pull_request_id + --AND pull_requests.repo_id = {repo_id} + AND pr_cmt_sha <> pull_requests.pr_merge_commit_sha + AND pr_cmt_sha <> pull_request_meta.pr_sha + GROUP BY pull_request_commits.pull_request_id + ) all_commit_counts + ON pull_requests.pull_request_id = all_commit_counts.pull_request_id + LEFT JOIN ( + SELECT MAX(pr_repo_meta_id), pull_request_meta.pull_request_id, pr_head_or_base, pr_src_meta_label + FROM augur_data.pull_requests, augur_data.pull_request_meta + WHERE pull_requests.pull_request_id = pull_request_meta.pull_request_id + --AND pull_requests.repo_id = {repo_id} + 
AND pr_head_or_base = 'base' + GROUP BY pull_request_meta.pull_request_id, pr_head_or_base, pr_src_meta_label + ) base_labels + ON base_labels.pull_request_id = all_commit_counts.pull_request_id + LEFT JOIN ( + SELECT sum(cmt_added) AS lines_added, sum(cmt_removed) AS lines_removed, pull_request_commits.pull_request_id, count(DISTINCT cmt_filename) AS file_count + FROM augur_data.pull_request_commits, augur_data.commits, augur_data.pull_requests, augur_data.pull_request_meta + WHERE cmt_commit_hash = pr_cmt_sha + AND pull_requests.pull_request_id = pull_request_commits.pull_request_id + AND pull_requests.pull_request_id = pull_request_meta.pull_request_id + --AND pull_requests.repo_id = {repo_id} + AND commits.repo_id = pull_requests.repo_id + AND commits.cmt_commit_hash <> pull_requests.pr_merge_commit_sha + AND commits.cmt_commit_hash <> pull_request_meta.pr_sha + GROUP BY pull_request_commits.pull_request_id + ) master_merged_counts + ON base_labels.pull_request_id = master_merged_counts.pull_request_id + WHERE + repo.repo_group_id = repo_groups.repo_group_id + AND repo.repo_id = pull_requests.repo_id + --AND repo.repo_id = {repo_id} + ORDER BY + merged_count DESC' diff --git a/scripts/install/matview.sql b/scripts/install/matview.sql new file mode 100644 index 0000000000..9a56dcd7dc --- /dev/null +++ b/scripts/install/matview.sql @@ -0,0 +1,112 @@ +create materialized view augur_data.explorer_pr_metrics as + SELECT + repo.repo_id AS repo_id, + pull_requests.pr_src_id AS pr_src_id, + repo.repo_name AS repo_name, + pr_src_author_association, + repo_groups.rg_name AS repo_group, + pull_requests.pr_src_state, + pull_requests.pr_merged_at, + pull_requests.pr_created_at AS pr_created_at, + pull_requests.pr_closed_at AS pr_closed_at, + date_part( 'year', pr_created_at :: DATE ) AS CREATED_YEAR, + date_part( 'month', pr_created_at :: DATE ) AS CREATED_MONTH, + date_part( 'year', pr_closed_at :: DATE ) AS CLOSED_YEAR, + date_part( 'month', pr_closed_at :: DATE ) AS 
CLOSED_MONTH, + pr_src_meta_label, + pr_head_or_base, + ( EXTRACT ( EPOCH FROM pull_requests.pr_closed_at ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 3600 AS hours_to_close, + ( EXTRACT ( EPOCH FROM pull_requests.pr_closed_at ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 86400 AS days_to_close, + ( EXTRACT ( EPOCH FROM first_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 3600 AS hours_to_first_response, + ( EXTRACT ( EPOCH FROM first_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 86400 AS days_to_first_response, + ( EXTRACT ( EPOCH FROM last_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 3600 AS hours_to_last_response, + ( EXTRACT ( EPOCH FROM last_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 86400 AS days_to_last_response, + first_response_time, + last_response_time, + EXTRACT ( EPOCH FROM average_time_between_responses), + assigned_count, + review_requested_count, + labeled_count, + subscribed_count, + mentioned_count, + referenced_count, + closed_count, + head_ref_force_pushed_count, + merged_count::INT, + milestoned_count, + unlabeled_count, + head_ref_deleted_count, + comment_count, + COALESCE(lines_added, 0) as lines_added, + COALESCE(lines_removed, 0) as lines_removed, + commit_count, + COALESCE(file_count, 0) as file_count + FROM + augur_data.repo, + augur_data.repo_groups, + augur_data.pull_requests LEFT OUTER JOIN ( + SELECT pull_requests.pull_request_id, + count(*) FILTER (WHERE action = 'assigned') AS assigned_count, + count(*) FILTER (WHERE action = 'review_requested') AS review_requested_count, + count(*) FILTER (WHERE action = 'labeled') AS labeled_count, + count(*) FILTER (WHERE action = 'unlabeled') AS unlabeled_count, + count(*) FILTER (WHERE action = 'subscribed') AS subscribed_count, + count(*) FILTER (WHERE action = 'mentioned') AS mentioned_count, + count(*) FILTER (WHERE action = 'referenced') AS 
referenced_count, + count(*) FILTER (WHERE action = 'closed') AS closed_count, + count(*) FILTER (WHERE action = 'head_ref_force_pushed') AS head_ref_force_pushed_count, + count(*) FILTER (WHERE action = 'head_ref_deleted') AS head_ref_deleted_count, + count(*) FILTER (WHERE action = 'milestoned') AS milestoned_count, + COALESCE(count(*) FILTER (WHERE action = 'merged'), 0) AS merged_count, + COALESCE(MIN(message.msg_timestamp), pull_requests.pr_merged_at, pull_requests.pr_closed_at) AS first_response_time, + COALESCE(COUNT(DISTINCT message.msg_timestamp), 0) AS comment_count, + COALESCE(MAX(message.msg_timestamp), pull_requests.pr_closed_at) AS last_response_time, + COALESCE((MAX(message.msg_timestamp) - MIN(message.msg_timestamp)) / COUNT(DISTINCT message.msg_timestamp), pull_requests.pr_created_at - pull_requests.pr_closed_at) AS average_time_between_responses + FROM augur_data.pull_requests + LEFT OUTER JOIN augur_data.pull_request_events on augur_data.pull_requests.pull_request_id = augur_data.pull_request_events.pull_request_id + JOIN augur_data.repo on repo.repo_id = pull_requests.repo_id + LEFT OUTER JOIN augur_data.pull_request_message_ref on pull_requests.pull_request_id = pull_request_message_ref.pull_request_id + LEFT OUTER JOIN augur_data.message on pull_request_message_ref.msg_id = augur_data.message.msg_id + --WHERE repo.repo_id = {repo_id} + GROUP BY pull_requests.pull_request_id + ) response_times + ON pull_requests.pull_request_id = response_times.pull_request_id + LEFT JOIN ( + SELECT pull_request_commits.pull_request_id, count(DISTINCT pr_cmt_sha) AS commit_count + FROM augur_data.pull_request_commits, augur_data.pull_requests, augur_data.pull_request_meta + WHERE pull_requests.pull_request_id = pull_request_commits.pull_request_id + AND pull_requests.pull_request_id = pull_request_meta.pull_request_id + --AND pull_requests.repo_id = {repo_id} + AND pr_cmt_sha <> pull_requests.pr_merge_commit_sha + AND pr_cmt_sha <> pull_request_meta.pr_sha + 
GROUP BY pull_request_commits.pull_request_id + ) all_commit_counts + ON pull_requests.pull_request_id = all_commit_counts.pull_request_id + LEFT JOIN ( + SELECT MAX(pr_repo_meta_id), pull_request_meta.pull_request_id, pr_head_or_base, pr_src_meta_label + FROM augur_data.pull_requests, augur_data.pull_request_meta + WHERE pull_requests.pull_request_id = pull_request_meta.pull_request_id + --AND pull_requests.repo_id = {repo_id} + AND pr_head_or_base = 'base' + GROUP BY pull_request_meta.pull_request_id, pr_head_or_base, pr_src_meta_label + ) base_labels + ON base_labels.pull_request_id = all_commit_counts.pull_request_id + LEFT JOIN ( + SELECT sum(cmt_added) AS lines_added, sum(cmt_removed) AS lines_removed, pull_request_commits.pull_request_id, count(DISTINCT cmt_filename) AS file_count + FROM augur_data.pull_request_commits, augur_data.commits, augur_data.pull_requests, augur_data.pull_request_meta + WHERE cmt_commit_hash = pr_cmt_sha + AND pull_requests.pull_request_id = pull_request_commits.pull_request_id + AND pull_requests.pull_request_id = pull_request_meta.pull_request_id + --AND pull_requests.repo_id = {repo_id} + AND commits.repo_id = pull_requests.repo_id + AND commits.cmt_commit_hash <> pull_requests.pr_merge_commit_sha + AND commits.cmt_commit_hash <> pull_request_meta.pr_sha + GROUP BY pull_request_commits.pull_request_id + ) master_merged_counts + ON base_labels.pull_request_id = master_merged_counts.pull_request_id + WHERE + repo.repo_group_id = repo_groups.repo_group_id + AND repo.repo_id = pull_requests.repo_id + --AND repo.repo_id = {repo_id} + ORDER BY + merged_count DESC From 30b325d9f486c345bb1edafe86a1d739ede03e88 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 4 Jun 2024 17:31:25 -0500 Subject: [PATCH 21/44] update Signed-off-by: Sean P. 
Goggins --- augur/tasks/git/util/facade_worker/facade_worker/repofetch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py b/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py index dc2d22300c..874f338902 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py @@ -315,7 +315,7 @@ def git_repo_updates(facade_helper, repo_git): try: - firstpull = (f"git -C {absolute_path} pull --rebase") + firstpull = (f"git -C {absolute_path} pull") return_code_remote = subprocess.Popen( [firstpull], shell=True).wait() From 0f00cdee65d6e5792ddd2d599d7561633e0fc9ef Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 4 Jun 2024 17:38:43 -0500 Subject: [PATCH 22/44] fixing Signed-off-by: Sean P. Goggins --- augur/tasks/git/dependency_tasks/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/augur/tasks/git/dependency_tasks/core.py b/augur/tasks/git/dependency_tasks/core.py index 626b45484e..29c04d4585 100644 --- a/augur/tasks/git/dependency_tasks/core.py +++ b/augur/tasks/git/dependency_tasks/core.py @@ -83,8 +83,8 @@ def generate_scorecard(logger, repo_git): #setting the environmental variable which is required by scorecard with get_session() as session: - key_handler = GithubRandomKeyAuth(logger) - #key_handler = GithubApiKeyHandler(logger) + #key_handler = GithubRandomKeyAuth(logger) + key_handler = GithubApiKeyHandler(logger) os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key() # This seems outdated From e7d305da4b4352dfaf22142f49ed03235f6a2679 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 4 Jun 2024 17:45:07 -0500 Subject: [PATCH 23/44] fixing open ssf score card Signed-off-by: Sean P. 
Goggins --- augur/tasks/git/dependency_tasks/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/git/dependency_tasks/core.py b/augur/tasks/git/dependency_tasks/core.py index 29c04d4585..8b76033baa 100644 --- a/augur/tasks/git/dependency_tasks/core.py +++ b/augur/tasks/git/dependency_tasks/core.py @@ -93,7 +93,7 @@ def generate_scorecard(logger, repo_git): #os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key() try: - required_output = parse_json_from_subprocess_call(session.logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard) + required_output = parse_json_from_subprocess_call(logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard) except Exception as e: session.logger.error(f"Could not parse required output! Error: {e}") raise e From 2d945ca1950d8dbaff190c766773362fa5525f88 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 4 Jun 2024 17:54:05 -0500 Subject: [PATCH 24/44] possibly fixed PR worker task Signed-off-by: Sean P. 
Goggins --- augur/tasks/github/pull_requests/tasks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index 9ccd398478..fcc354054b 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -8,6 +8,7 @@ from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import add_key_value_pair_to_dicts, get_owner_repo from augur.application.db.models import PullRequest, Message, PullRequestReview, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestAssignee, PullRequestReviewMessageRef, Contributor +from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.application.db.lib import get_session, get_repo_by_repo_git, bulk_insert_dicts, get_pull_request_reviews_by_repo_id from augur.application.db.util import execute_session_query from ..messages.tasks import process_github_comment_contributors From 84b7fea7afbd16c0f313b3661f10d94d04d7e8ea Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 4 Jun 2024 17:57:55 -0500 Subject: [PATCH 25/44] fixing missing task manifest info Signed-off-by: Sean P. 
Goggins --- .../tasks/github/util/github_task_session.py | 24 ++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/augur/tasks/github/util/github_task_session.py b/augur/tasks/github/util/github_task_session.py index a21fbc233a..a74e8752bf 100644 --- a/augur/tasks/github/util/github_task_session.py +++ b/augur/tasks/github/util/github_task_session.py @@ -4,6 +4,25 @@ from augur.application.db.session import DatabaseSession from augur.application.db import get_engine +class GithubTaskManifest: + + def __init__(self, logger): + + engine = get_engine() + + self.augur_db = DatabaseSession(logger, engine) + self.key_auth = GithubRandomKeyAuth(self.augur_db.session, logger) + self.logger = logger + self.platform_id = 1 + + def __enter__(self): + + return self + + def __exit__(self, exception_type, exception_value, exception_traceback): + + self.augur_db.close() + class GithubTaskSession(DatabaseSession): """ORM session used in github tasks. @@ -20,6 +39,5 @@ def __init__(self, logger: Logger, engine=None): super().__init__(logger, engine=engine) - self.oauths = GithubRandomKeyAuth(logger) - self.platform_id = 1 - + self.oauths = GithubRandomKeyAuth(self, logger) + self.platform_id = 1 \ No newline at end of file From f3002c1c061ff649d24bb9f1c5d7363cb850cc5b Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 4 Jun 2024 18:05:35 -0500 Subject: [PATCH 26/44] fixing random key auth Signed-off-by: Sean P. 
Goggins --- augur/tasks/github/util/github_random_key_auth.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/augur/tasks/github/util/github_random_key_auth.py b/augur/tasks/github/util/github_random_key_auth.py index 95788da1cc..ed539430d8 100644 --- a/augur/tasks/github/util/github_random_key_auth.py +++ b/augur/tasks/github/util/github_random_key_auth.py @@ -9,12 +9,12 @@ class GithubRandomKeyAuth(RandomKeyAuth): github collections can have a class randomly selects an api key for each request """ - def __init__(self, logger): + def __init__(self, session: Session, logger): """Creates a GithubRandomKeyAuth object and initializes the RandomKeyAuth parent class""" # gets the github api keys from the database via the GithubApiKeyHandler - github_api_keys = GithubApiKeyHandler(logger).keys + github_api_keys = GithubApiKeyHandler(session, logger).keys #github_api_keys = random.sample(github_api_keys, len(github_api_keys)) if not github_api_keys: From 6b2667aebb04de2908f31165b989de7aa16a237e Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 4 Jun 2024 18:11:53 -0500 Subject: [PATCH 27/44] can you overload __init__ in Python? Signed-off-by: Sean P. 
Goggins --- .../tasks/github/util/github_random_key_auth.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/augur/tasks/github/util/github_random_key_auth.py b/augur/tasks/github/util/github_random_key_auth.py index ed539430d8..c0ef68d4c2 100644 --- a/augur/tasks/github/util/github_random_key_auth.py +++ b/augur/tasks/github/util/github_random_key_auth.py @@ -9,6 +9,23 @@ class GithubRandomKeyAuth(RandomKeyAuth): github collections can have a class randomly selects an api key for each request """ + def __init__(self, logger): + """Creates a GithubRandomKeyAuth object and initializes the RandomKeyAuth parent class""" + + + # gets the github api keys from the database via the GithubApiKeyHandler + github_api_keys = GithubApiKeyHandler(logger).keys + #github_api_keys = random.sample(github_api_keys, len(github_api_keys)) + + if not github_api_keys: + print("Failed to find github api keys. This is usually because your key has expired") + + # defines the structure of the github api key + header_name = "Authorization" + key_format = "token {0}" + + super().__init__(github_api_keys, header_name, logger, key_format) + def __init__(self, session: Session, logger): """Creates a GithubRandomKeyAuth object and initializes the RandomKeyAuth parent class""" From 082d5d29d1e54b779245ca601285d07d29c72b70 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 4 Jun 2024 18:27:04 -0500 Subject: [PATCH 28/44] updating things Signed-off-by: Sean P. 
Goggins --- .../github/util/github_random_key_auth.py | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/augur/tasks/github/util/github_random_key_auth.py b/augur/tasks/github/util/github_random_key_auth.py index c0ef68d4c2..397670407d 100644 --- a/augur/tasks/github/util/github_random_key_auth.py +++ b/augur/tasks/github/util/github_random_key_auth.py @@ -26,19 +26,23 @@ def __init__(self, logger): super().__init__(github_api_keys, header_name, logger, key_format) - def __init__(self, session: Session, logger): - """Creates a GithubRandomKeyAuth object and initializes the RandomKeyAuth parent class""" + # This is what it needs to be. And until it is, the PR task will fail: + + # Right now many other tasks fail with the modified syntax + + # def __init__(self, session: Session, logger): + # """Creates a GithubRandomKeyAuth object and initializes the RandomKeyAuth parent class""" - # gets the github api keys from the database via the GithubApiKeyHandler - github_api_keys = GithubApiKeyHandler(session, logger).keys - #github_api_keys = random.sample(github_api_keys, len(github_api_keys)) + # # gets the github api keys from the database via the GithubApiKeyHandler + # github_api_keys = GithubApiKeyHandler(session, logger).keys + # #github_api_keys = random.sample(github_api_keys, len(github_api_keys)) - if not github_api_keys: - print("Failed to find github api keys. This is usually because your key has expired") + # if not github_api_keys: + # print("Failed to find github api keys. 
This is usually because your key has expired") - # defines the structure of the github api key - header_name = "Authorization" - key_format = "token {0}" + # # defines the structure of the github api key + # header_name = "Authorization" + # key_format = "token {0}" - super().__init__(github_api_keys, header_name, logger, key_format) \ No newline at end of file + # super().__init__(github_api_keys, header_name, logger, key_format) \ No newline at end of file From 6d25bd9d6b80b2a2dbd96d0422c4d7c2b8960060 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 4 Jun 2024 18:34:08 -0500 Subject: [PATCH 29/44] trying a hack Signed-off-by: Sean P. Goggins --- augur/tasks/github/pull_requests/tasks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index fcc354054b..440c294954 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -24,7 +24,8 @@ def collect_pull_requests(repo_git: str) -> int: logger = logging.getLogger(collect_pull_requests.__name__) - with GithubTaskManifest(logger) as manifest: + #with GithubTaskManifest(logger) as manifest: + with GithubTaskManifest() as manifest: augur_db = manifest.augur_db From 85e7b2dda4478fe6c5626f1c80fa6a7d4f7839ca Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 4 Jun 2024 18:36:29 -0500 Subject: [PATCH 30/44] hacking Signed-off-by: Sean P. 
Goggins --- augur/tasks/github/pull_requests/tasks.py | 4 ++-- augur/tasks/github/util/github_task_session.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index 440c294954..2ea28cd11c 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -24,8 +24,8 @@ def collect_pull_requests(repo_git: str) -> int: logger = logging.getLogger(collect_pull_requests.__name__) - #with GithubTaskManifest(logger) as manifest: - with GithubTaskManifest() as manifest: + with GithubTaskManifest(logger) as manifest: + #with GithubTaskManifest() as manifest: augur_db = manifest.augur_db diff --git a/augur/tasks/github/util/github_task_session.py b/augur/tasks/github/util/github_task_session.py index a74e8752bf..b84c5201f5 100644 --- a/augur/tasks/github/util/github_task_session.py +++ b/augur/tasks/github/util/github_task_session.py @@ -11,7 +11,9 @@ def __init__(self, logger): engine = get_engine() self.augur_db = DatabaseSession(logger, engine) - self.key_auth = GithubRandomKeyAuth(self.augur_db.session, logger) + #self.key_auth = GithubRandomKeyAuth(self.augur_db.session, logger) + #totalHack + self.key_auth = GithubRandomKeyAuth(logger) self.logger = logger self.platform_id = 1 From da9952bac61ebfee35738af90f2992c3835f54d9 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 4 Jun 2024 18:37:53 -0500 Subject: [PATCH 31/44] update Signed-off-by: Sean P. 
Goggins --- augur/tasks/github/messages/tasks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/augur/tasks/github/messages/tasks.py b/augur/tasks/github/messages/tasks.py index bee7412489..f552d1f685 100644 --- a/augur/tasks/github/messages/tasks.py +++ b/augur/tasks/github/messages/tasks.py @@ -5,6 +5,7 @@ from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.application.db.data_parse import * from augur.tasks.github.util.github_paginator import GithubPaginator +from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import get_owner_repo From e69f65a939c2b5c02cd9765f1db9a2379ab6eeaa Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 4 Jun 2024 18:44:31 -0500 Subject: [PATCH 32/44] added Repo object Signed-off-by: Sean P. Goggins --- augur/tasks/github/pull_requests/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index 2ea28cd11c..2826be2f87 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -7,7 +7,7 @@ from augur.tasks.github.util.github_paginator import GithubPaginator from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import add_key_value_pair_to_dicts, get_owner_repo -from augur.application.db.models import PullRequest, Message, PullRequestReview, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestAssignee, PullRequestReviewMessageRef, Contributor +from augur.application.db.models import PullRequest, Message, PullRequestReview, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestAssignee, PullRequestReviewMessageRef, Contributor, Repo from augur.tasks.github.util.github_task_session 
import GithubTaskManifest from augur.application.db.lib import get_session, get_repo_by_repo_git, bulk_insert_dicts, get_pull_request_reviews_by_repo_id from augur.application.db.util import execute_session_query From bf04ebc75c23dc81788715ba59c0ace686a75157 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 4 Jun 2024 18:48:18 -0500 Subject: [PATCH 33/44] messages fix Signed-off-by: Sean P. Goggins --- augur/tasks/github/messages/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/github/messages/tasks.py b/augur/tasks/github/messages/tasks.py index f552d1f685..0c1199988a 100644 --- a/augur/tasks/github/messages/tasks.py +++ b/augur/tasks/github/messages/tasks.py @@ -9,7 +9,7 @@ from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.models import Message, PullRequestMessageRef, IssueMessageRef, Contributor +from augur.application.db.models import Message, PullRequestMessageRef, IssueMessageRef, Contributor, Repo from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts, get_issues_by_repo_id, get_pull_requests_by_repo_id From 18ef7743c3b324a0f615a6193f57ead054d576bd Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 4 Jun 2024 19:02:46 -0500 Subject: [PATCH 34/44] update session Signed-off-by: Sean P. 
Goggins --- augur/tasks/github/messages/tasks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/augur/tasks/github/messages/tasks.py b/augur/tasks/github/messages/tasks.py index 0c1199988a..e4bad90328 100644 --- a/augur/tasks/github/messages/tasks.py +++ b/augur/tasks/github/messages/tasks.py @@ -11,6 +11,7 @@ from augur.tasks.github.util.util import get_owner_repo from augur.application.db.models import Message, PullRequestMessageRef, IssueMessageRef, Contributor, Repo from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts, get_issues_by_repo_id, get_pull_requests_by_repo_id +from augur.application.db import get_engine, get_session platform_id = 1 From ad1bc2d184dd30dc856e39df8cc052a68b25ad59 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 4 Jun 2024 19:05:19 -0500 Subject: [PATCH 35/44] update for CollectionStatus Signed-off-by: Sean P. Goggins --- augur/tasks/github/messages/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/github/messages/tasks.py b/augur/tasks/github/messages/tasks.py index e4bad90328..f883db2997 100644 --- a/augur/tasks/github/messages/tasks.py +++ b/augur/tasks/github/messages/tasks.py @@ -9,7 +9,7 @@ from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.models import Message, PullRequestMessageRef, IssueMessageRef, Contributor, Repo +from augur.application.db.models import Message, PullRequestMessageRef, IssueMessageRef, Contributor, Repo, CollectionStatus from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts, get_issues_by_repo_id, get_pull_requests_by_repo_id from augur.application.db import get_engine, get_session From 22312a706743ba2b6ad8743ec3ae06967f92f90f Mon Sep 17 00:00:00 2001 From: "Sean P.
Goggins" Date: Tue, 4 Jun 2024 19:07:27 -0500 Subject: [PATCH 36/44] fixing pr files Signed-off-by: Sean P. Goggins --- .../github/pull_requests/files_model/core.py | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/augur/tasks/github/pull_requests/files_model/core.py b/augur/tasks/github/pull_requests/files_model/core.py index 5dded9eed4..fc57fecd58 100644 --- a/augur/tasks/github/pull_requests/files_model/core.py +++ b/augur/tasks/github/pull_requests/files_model/core.py @@ -2,24 +2,26 @@ from augur.tasks.github.util.gh_graphql_entities import GraphQlPageCollection from augur.application.db.models import * from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.lib import bulk_insert_dicts, execute_sql from augur.application.db.util import execute_session_query -import traceback +import traceback -def pull_request_files_model(repo,logger, key_auth): +def pull_request_files_model(repo_id,logger, augur_db, key_auth): # query existing PRs and the respective url we will append the commits url to pr_number_sql = s.sql.text(""" SELECT DISTINCT pr_src_number as pr_src_number, pull_requests.pull_request_id FROM pull_requests--, pull_request_meta WHERE repo_id = :repo_id - """).bindparams(repo_id=repo.repo_id) + """).bindparams(repo_id=repo_id) pr_numbers = [] #pd.read_sql(pr_number_sql, self.db, params={}) - result = execute_sql(pr_number_sql)#.fetchall() + result = augur_db.execute_sql(pr_number_sql)#.fetchall() pr_numbers = [dict(row) for row in result.mappings()] + query = augur_db.session.query(Repo).filter(Repo.repo_id == repo_id) + repo = execute_session_query(query, 'one') + owner, name = get_owner_repo(repo.repo_git) pr_file_rows = [] @@ -66,21 +68,21 @@ def pull_request_files_model(repo,logger, key_auth): logger.debug(f"Results of file_collection: {file_collection}") - for pr_file in file_collection: + for pr_file in file_collection: logger.debug(f"CHECK: {repr(file_collection)}") - if pr_file and 
'path' in pr_file: + if pr_file and 'path' in pr_file: logger.debug(f"Checks out for {repr(pr_file)} and {repr(file_collection)}") - try: + try: pr_file_rows += [{ 'pull_request_id': pr_info['pull_request_id'], 'pr_file_additions': pr_file['additions'] if 'additions' in pr_file else None, 'pr_file_deletions': pr_file['deletions'] if 'deletions' in pr_file else None, 'pr_file_path': pr_file['path'], 'data_source': 'GitHub API', - 'repo_id': repo_id, + 'repo_id': repo_id, } for pr_file in file_collection if pr_file and 'path' in pr_file] - except Exception as e: + except Exception as e: logger.error(f"PR Files Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") @@ -88,4 +90,4 @@ def pull_request_files_model(repo,logger, key_auth): if len(pr_file_rows) > 0: #Execute a bulk upsert with sqlalchemy pr_file_natural_keys = ["pull_request_id", "repo_id", "pr_file_path"] - bulk_insert_dicts(logger, pr_file_rows, PullRequestFile, pr_file_natural_keys) + augur_db.insert_data(pr_file_rows, PullRequestFile, pr_file_natural_keys) \ No newline at end of file From 2c4341df3942502aeb1346fddf5c0367a94efee4 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 4 Jun 2024 19:08:53 -0500 Subject: [PATCH 37/44] files model Signed-off-by: Sean P. 
Goggins --- .../github/pull_requests/files_model/tasks.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/augur/tasks/github/pull_requests/files_model/tasks.py b/augur/tasks/github/pull_requests/files_model/tasks.py index 134e05e900..988261f6c8 100644 --- a/augur/tasks/github/pull_requests/files_model/tasks.py +++ b/augur/tasks/github/pull_requests/files_model/tasks.py @@ -1,21 +1,18 @@ import logging from augur.tasks.github.pull_requests.files_model.core import * +from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurSecondaryRepoCollectionTask -from augur.application.db.lib import get_repo_by_repo_git -from augur.application.db import get_engine -from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth - +from augur.application.db.util import execute_session_query @celery.task(base=AugurSecondaryRepoCollectionTask) def process_pull_request_files(repo_git: str) -> None: - engine = get_engine() - logger = logging.getLogger(process_pull_request_files.__name__) - repo = get_repo_by_repo_git(repo_git) - - key_auth = GithubRandomKeyAuth(logger) + with GithubTaskManifest(logger) as manifest: + augur_db = manifest.augur_db + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo = execute_session_query(query, 'one') - pull_request_files_model(repo, logger, key_auth) \ No newline at end of file + pull_request_files_model(repo.repo_id, logger, augur_db, manifest.key_auth) \ No newline at end of file From 8a97792bdbfa02459ddbaf54f37632debe6e6dbf Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 4 Jun 2024 19:19:35 -0500 Subject: [PATCH 38/44] pr fix Signed-off-by: Sean P. 
Goggins --- augur/tasks/github/pull_requests/tasks.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index 2826be2f87..08f70b89e9 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -84,8 +84,7 @@ def retrieve_all_pr_data(repo_git: str, logger, key_auth): #-> Generator[List[Di yield page_data - -def process_pull_requests(pull_requests, task_name, repo_id, logger): +def process_pull_requests(pull_requests, task_name, repo_id, logger, augur_db): """ Parse and insert all retrieved PR data. @@ -94,6 +93,7 @@ def process_pull_requests(pull_requests, task_name, repo_id, logger): task_name: Name of the calling task and the repo repo_id: augur id of the repository logger: logging object + augur_db: sqlalchemy db object """ tool_source = "Pr Task" tool_version = "2.0" @@ -106,7 +106,7 @@ def process_pull_requests(pull_requests, task_name, repo_id, logger): # insert contributors from these prs logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) + augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) # insert the prs into the pull_requests table. 
@@ -116,7 +116,7 @@ def process_pull_requests(pull_requests, task_name, repo_id, logger): pr_natural_keys = ["repo_id", "pr_src_id"] pr_return_columns = ["pull_request_id", "pr_url"] pr_string_fields = ["pr_src_title", "pr_body"] - pr_return_data = bulk_insert_dicts(logger, pr_dicts, PullRequest, pr_natural_keys, + pr_return_data = augur_db.insert_data(pr_dicts, PullRequest, pr_natural_keys, return_columns=pr_return_columns, string_fields=pr_string_fields) if pr_return_data is None: @@ -155,24 +155,24 @@ def process_pull_requests(pull_requests, task_name, repo_id, logger): # we are using pr_src_id and pull_request_id to determine if the label is already in the database. pr_label_natural_keys = ['pr_src_id', 'pull_request_id'] pr_label_string_fields = ["pr_src_description"] - bulk_insert_dicts(logger, pr_label_dicts, PullRequestLabel, pr_label_natural_keys, string_fields=pr_label_string_fields) + augur_db.insert_data(pr_label_dicts, PullRequestLabel, pr_label_natural_keys, string_fields=pr_label_string_fields) # inserting pr assignees # we are using pr_assignee_src_id and pull_request_id to determine if the label is already in the database. pr_assignee_natural_keys = ['pr_assignee_src_id', 'pull_request_id'] - bulk_insert_dicts(logger, pr_assignee_dicts, PullRequestAssignee, pr_assignee_natural_keys) + augur_db.insert_data(pr_assignee_dicts, PullRequestAssignee, pr_assignee_natural_keys) # inserting pr requested reviewers # we are using pr_src_id and pull_request_id to determine if the label is already in the database. pr_reviewer_natural_keys = ["pull_request_id", "pr_reviewer_src_id"] - bulk_insert_dicts(logger, pr_reviewer_dicts, PullRequestReviewer, pr_reviewer_natural_keys) + augur_db.insert_data(pr_reviewer_dicts, PullRequestReviewer, pr_reviewer_natural_keys) # inserting pr metadata # we are using pull_request_id, pr_head_or_base, and pr_sha to determine if the label is already in the database. 
pr_metadata_natural_keys = ['pull_request_id', 'pr_head_or_base', 'pr_sha'] pr_metadata_string_fields = ["pr_src_meta_label"] - bulk_insert_dicts(logger, pr_metadata_dicts, PullRequestMeta, + augur_db.insert_data(pr_metadata_dicts, PullRequestMeta, pr_metadata_natural_keys, string_fields=pr_metadata_string_fields) @@ -189,6 +189,11 @@ def process_pull_requests(pull_requests, task_name, repo_id, logger): + + + + + From b6adf9475aa416dfe5b532704674c8e9f3bcd748 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 4 Jun 2024 21:35:49 -0500 Subject: [PATCH 39/44] update to message collection Signed-off-by: Sean P. Goggins --- augur/tasks/github/messages/tasks.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/augur/tasks/github/messages/tasks.py b/augur/tasks/github/messages/tasks.py index f883db2997..3e104fc6dc 100644 --- a/augur/tasks/github/messages/tasks.py +++ b/augur/tasks/github/messages/tasks.py @@ -6,13 +6,11 @@ from augur.application.db.data_parse import * from augur.tasks.github.util.github_paginator import GithubPaginator from augur.tasks.github.util.github_task_session import GithubTaskManifest -from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.models import Message, PullRequestMessageRef, IssueMessageRef, Contributor, Repo, CollectionStatus -from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts, get_issues_by_repo_id, get_pull_requests_by_repo_id +from augur.application.db.models import PullRequest, Message, Issue, PullRequestMessageRef, IssueMessageRef, Contributor, Repo, CollectionStatus from augur.application.db import get_engine, get_session - +from sqlalchemy.sql import text platform_id = 1 @@ -29,8 +27,8 @@ def collect_github_messages(repo_git: str) -> None: Repo.repo_git == repo_git).one().repo_id 
owner, repo = get_owner_repo(repo_git) - task_name = f"{owner}/{repo}: Message Task" + if is_repo_small(repo_id): message_data = fast_retrieve_all_pr_and_issue_messages(repo_git, logger, manifest.key_auth, task_name) @@ -135,7 +133,7 @@ def process_large_issue_and_pr_message_collection(repo_id, repo_git: str, logger process_messages(all_data, task_name, repo_id, logger, augur_db) -def process_messages(messages, task_name, repo_id, logger): +def process_messages(messages, task_name, repo_id, logger, augur_db): tool_source = "Pr comment task" tool_version = "2.0" @@ -154,13 +152,13 @@ def process_messages(messages, task_name, repo_id, logger): # create mapping from issue url to issue id of current issues issue_url_to_id_map = {} - issues = get_issues_by_repo_id(repo_id) + issues = augur_db.session.query(Issue).filter(Issue.repo_id == repo_id).all() for issue in issues: issue_url_to_id_map[issue.issue_url] = issue.issue_id # create mapping from pr url to pr id of current pull requests pr_issue_url_to_id_map = {} - prs = get_pull_requests_by_repo_id(repo_id) + prs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() for pr in prs: pr_issue_url_to_id_map[pr.pr_issue_url] = pr.pull_request_id @@ -231,13 +229,13 @@ def process_messages(messages, task_name, repo_id, logger): contributors = remove_duplicate_dicts(contributors) logger.info(f"{task_name}: Inserting {len(contributors)} contributors") - bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) + augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) logger.info(f"{task_name}: Inserting {len(message_dicts)} messages") message_natural_keys = ["platform_msg_id", "pltfrm_id"] message_return_columns = ["msg_id", "platform_msg_id"] message_string_fields = ["msg_text"] - message_return_data = bulk_insert_dicts(logger, message_dicts, Message, message_natural_keys, + message_return_data = augur_db.insert_data(message_dicts, Message, message_natural_keys, 
return_columns=message_return_columns, string_fields=message_string_fields) if message_return_data is None: return @@ -260,11 +258,11 @@ def process_messages(messages, task_name, repo_id, logger): logger.info(f"{task_name}: Inserting {len(pr_message_ref_dicts)} pr messages ref rows") pr_message_ref_natural_keys = ["pull_request_id", "pr_message_ref_src_comment_id"] - bulk_insert_dicts(logger, pr_message_ref_dicts, PullRequestMessageRef, pr_message_ref_natural_keys) + augur_db.insert_data(pr_message_ref_dicts, PullRequestMessageRef, pr_message_ref_natural_keys) logger.info(f"{task_name}: Inserting {len(issue_message_ref_dicts)} issue messages ref rows") issue_message_ref_natural_keys = ["issue_id", "issue_msg_ref_src_comment_id"] - bulk_insert_dicts(logger, issue_message_ref_dicts, IssueMessageRef, issue_message_ref_natural_keys) + augur_db.insert_data(issue_message_ref_dicts, IssueMessageRef, issue_message_ref_natural_keys) logger.info(f"{task_name}: Inserted {len(message_dicts)} messages. {len(issue_message_ref_dicts)} from issues and {len(pr_message_ref_dicts)} from prs") @@ -289,4 +287,4 @@ def process_github_comment_contributors(message, tool_source, tool_version, data # This is done by searching all the dicts for the given key that has the specified value def find_dict_in_list_of_dicts(data, key, value): - return next((item for item in data if item[key] == value), None) + return next((item for item in data if item[key] == value), None) \ No newline at end of file From 7989c1290d6830f350a3773a747625f92915e3b1 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 4 Jun 2024 22:23:53 -0500 Subject: [PATCH 40/44] we never get clones data so I commented it out. You need an API key with admin on the repo for the platform to return anything. It never works out. Saving resources. Signed-off-by: Sean P. 
Goggins --- augur/tasks/github/traffic/tasks.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/augur/tasks/github/traffic/tasks.py b/augur/tasks/github/traffic/tasks.py index 573c691301..4101faa3ff 100644 --- a/augur/tasks/github/traffic/tasks.py +++ b/augur/tasks/github/traffic/tasks.py @@ -32,27 +32,27 @@ def collect_github_repo_clones_data(repo_git: str) -> None: logger.info(f"{owner}/{repo} has no clones") def retrieve_all_clones_data(repo_git: str, logger, key_auth): - owner, repo = get_owner_repo(repo_git) + # owner, repo = get_owner_repo(repo_git) - url = f"https://api.github.com/repos/{owner}/{repo}/traffic/clones" + # url = f"https://api.github.com/repos/{owner}/{repo}/traffic/clones" - clones = GithubPaginator(url, key_auth, logger) + # clones = GithubPaginator(url, key_auth, logger) - num_pages = clones.get_num_pages() + # num_pages = clones.get_num_pages() all_data = [] - for page_data, page in clones.iter_pages(): + # for page_data, page in clones.iter_pages(): - if page_data is None: - return all_data + # if page_data is None: + # return all_data - elif len(page_data) == 0: - logger.debug(f"{repo.capitalize()} Traffic Page {page} contains no data...returning") - logger.info(f"Traffic Page {page} of {num_pages}") - return all_data + # elif len(page_data) == 0: + # logger.debug(f"{repo.capitalize()} Traffic Page {page} contains no data...returning") + # logger.info(f"Traffic Page {page} of {num_pages}") + # return all_data - logger.info(f"{repo} Traffic Page {page} of {num_pages}") + # logger.info(f"{repo} Traffic Page {page} of {num_pages}") - all_data += page_data + # all_data += page_data return all_data From 33890da083c4f4c0cf08f0f38fdb79785292bcf9 Mon Sep 17 00:00:00 2001 From: Sean Goggins Date: Wed, 5 Jun 2024 04:15:58 -0500 Subject: [PATCH 41/44] Updating version Signed-off-by: Sean Goggins --- metadata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata.py 
b/metadata.py index b914869d58..497e74ad46 100644 --- a/metadata.py +++ b/metadata.py @@ -5,8 +5,8 @@ __short_description__ = "Python 3 package for free/libre and open-source software community metrics, models & data collection" -__version__ = "0.70.0" -__release__ = "v0.70.0 (Windows 95 Man!)" +__version__ = "0.71.0" +__release__ = "v0.71.0 (Taylor Baby!)" __license__ = "MIT" __copyright__ = "University of Missouri, University of Nebraska-Omaha, CHAOSS, Brian Warner & Augurlabs 2024" From ed6f5e169d6087424b0172faf091eafc8cfd0679 Mon Sep 17 00:00:00 2001 From: Sean Goggins Date: Wed, 5 Jun 2024 08:30:21 -0500 Subject: [PATCH 42/44] updated Dockerfile version info Signed-off-by: Sean Goggins --- docker/backend/Dockerfile | 2 +- docker/database/Dockerfile | 2 +- docker/rabbitmq/Dockerfile | 4 ++-- flower_db | Bin 16384 -> 0 bytes 4 files changed, 4 insertions(+), 4 deletions(-) delete mode 100644 flower_db diff --git a/docker/backend/Dockerfile b/docker/backend/Dockerfile index 6e158d199b..9676b40ce5 100644 --- a/docker/backend/Dockerfile +++ b/docker/backend/Dockerfile @@ -2,7 +2,7 @@ FROM python:3.10-bookworm LABEL maintainer="outdoors@acm.org" -LABEL version="0.70.0" +LABEL version="0.71.0" ENV DEBIAN_FRONTEND=noninteractive diff --git a/docker/database/Dockerfile b/docker/database/Dockerfile index 1421e1f76c..df88b16c1e 100644 --- a/docker/database/Dockerfile +++ b/docker/database/Dockerfile @@ -2,7 +2,7 @@ FROM postgres:14 LABEL maintainer="outdoors@acm.org" -LABEL version="0.70.0" +LABEL version="0.71.0" ENV POSTGRES_DB "test" ENV POSTGRES_USER "augur" diff --git a/docker/rabbitmq/Dockerfile b/docker/rabbitmq/Dockerfile index 9feca83cd9..266bec64a5 100644 --- a/docker/rabbitmq/Dockerfile +++ b/docker/rabbitmq/Dockerfile @@ -1,7 +1,7 @@ FROM rabbitmq:3.12-management-alpine LABEL maintainer="574/augur@simplelogin.com" -LABEL version="0.70.0" +LABEL version="0.71.0" ARG RABBIT_MQ_DEFAULT_USER=augur ARG RABBIT_MQ_DEFAULT_PASSWORD=password123 @@ -20,4 +20,4 @@ RUN 
chmod 777 /etc/rabbitmq/conf.d/augur.conf RUN apk add --no-cache python3 COPY docker/rabbitmq/update_config.py / -RUN exec python3 update_config.py \ No newline at end of file +RUN exec python3 update_config.py diff --git a/flower_db b/flower_db deleted file mode 100644 index f03fe98e2b3029b3d1103880728d018993291465..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16384 zcmeI%!3l#v5Czc5(VLJ4EWr}&BE4Ba+OZ))-K^q3yyg&j!tTHxL>_$blTlfqS3_^nf1F z1A0IY=m9-&vmSWVTYdij?~jAu%w>D7-t4@paNv;dhdjuGJY)fR$Qt@V9^@el$V1l9 h5Aq-nSwJ4XHJlJ2K!5-N0t5&UAV7cs0RndxSOLec*~S0> From 7cb9d97868f025e61c077ad8f143c3b19ac554df Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Wed, 5 Jun 2024 10:25:40 -0500 Subject: [PATCH 43/44] dependencies fix Signed-off-by: Sean P. Goggins --- .../dependency_libyear_tasks/libyear_util/pypi_parser.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_parser.py b/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_parser.py index 7aaaf1f190..dab06b1a09 100644 --- a/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_parser.py +++ b/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_parser.py @@ -160,7 +160,14 @@ def parse_conda(file_handle): pip = None if not contents: return [] - dependencies = contents['dependencies'] + #dependencies = contents['dependencies'] + dependencies = contents.get('dependencies', []) + + if not dependencies: + print("No dependencies found.") + return [] + else: + print("Dependencies found.") for dep in dependencies: if (type(dep) is dict) and dep['pip']: pip = dep From 744bcf5d31245baeb389dfd34e1adbc54261c271 Mon Sep 17 00:00:00 2001 From: "Sean P. 
Goggins" Date: Wed, 5 Jun 2024 10:34:11 -0500 Subject: [PATCH 44/44] increasing db sleep due to errors like /home/sean/github/rh-k12/augur/tasks/git/dependency_libyear_tasks/tasks.py,: cannot open `/home/sean/github/rh-k12/augur/tasks/git/dependency_libyear_tasks/tasks.py,' (No such file or directory) line: cannot open `line' (No such file or directory) 12,: cannot open `12,' (No such file or directory) in: cannot open `in' (No such file or directory) process_libyear_dependency_metrics: cannot open `process_libyear_dependency_metrics' (No such file or directory) Signed-off-by: Sean P. Goggins --- augur/application/db/lib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/augur/application/db/lib.py b/augur/application/db/lib.py index 0b4ebbdd6b..35dee313b6 100644 --- a/augur/application/db/lib.py +++ b/augur/application/db/lib.py @@ -305,8 +305,8 @@ def bulk_insert_dicts(logger, data: Union[List[dict], dict], table, natural_keys # print(str(stmnt.compile(dialect=postgresql.dialect()))) attempts = 0 - # creates list from 1 to 10 - sleep_time_list = list(range(1,11)) + # creates list from 1 to 10 / changed to 10-30 because deadlocks are taking longer + sleep_time_list = list(range(10,30)) deadlock_detected = False engine = get_engine()