diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml new file mode 100644 index 0000000000..c23bfd7bb3 --- /dev/null +++ b/.github/workflows/checks.yml @@ -0,0 +1,31 @@ +name: "run-linting-checks" +on: + pull_request: + branches: [main, dev] + +jobs: + run-pylint: + name: runner / pylint + permissions: write-all + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: dciborow/action-pylint@0.1.0 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + reporter: github-pr-review + level: warning + glob_pattern: "**/*.py" + filter_mode: "file" + + misspell: + name: runner / misspell + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Highlight any misspellings in changes + uses: reviewdog/action-misspell@v1 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + locale: "US" \ No newline at end of file diff --git a/.pylintrc b/.pylintrc index 0b1b7d2049..0056af873b 100644 --- a/.pylintrc +++ b/.pylintrc @@ -12,7 +12,7 @@ #refactoring checker #enable=R -disable=E0611,E1101,W1203,R0801,W0614,W0611,C0411,C0103,C0301,C0303,C0304,C0305,W0311 +disable=E0611,E1101,W1203,R0801,W0614,W0611,C0411,C0103,C0301,C0303,C0304,C0305,W0311,E0401 # Analyse import fallback blocks. This can be used to support both Python 2 and diff --git a/README.md b/README.md index 075a397fc5..3b4d7f4ddc 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,6 @@ -# Augur NEW Release v0.60.2 +# Augur NEW Release v0.62.0 + +Check out Augur's public instance at https://ai.chaoss.io [![first-timers-only](https://img.shields.io/badge/first--timers--only-friendly-blue.svg?style=flat-square)](https://www.firsttimersonly.com/) We follow the [First Timers Only](https://www.firsttimersonly.com/) philosophy of tagging issues for first timers only, and walking one newcomer through the resolution process weekly. [You can find these issues tagged with "first timers only" on our issues list.](https://github.com/chaoss/augur/labels/first-timers-only). @@ -7,7 +9,7 @@ ## NEW RELEASE ALERT! ### [If you want to jump right in, updated docker build/compose and bare metal installation instructions are available here](docs/new-install.md) -Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.60.2 +Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.62.0 - The `main` branch is a stable version of our new architecture, which features: - Dramatic improvement in the speed of large scale data collection (100,000+ repos). All data is obtained for 100k+ repos within 2 weeks.
- A new job management architecture that uses Celery and Redis to manage queues, and enables users to run a Flower job monitoring dashboard diff --git a/augur/api/routes/__init__.py b/augur/api/routes/__init__.py index 5e601f54e8..03c2e2fa71 100644 --- a/augur/api/routes/__init__.py +++ b/augur/api/routes/__init__.py @@ -11,3 +11,4 @@ from .user import * from .dei import * from .util import * +from .complexity import * diff --git a/augur/api/routes/complexity.py b/augur/api/routes/complexity.py index ba82a12599..bee39eb923 100644 --- a/augur/api/routes/complexity.py +++ b/augur/api/routes/complexity.py @@ -6,32 +6,71 @@ import os import requests -AUGUR_API_VERSION = 'api/unstable' +from augur.api.routes import AUGUR_API_VERSION +from ..server import app, engine -def create_routes(server): - @server.app.route('/{}/complexity/project_languages'.format(AUGUR_API_VERSION), methods=["GET"]) - def get_project_languages(): - project_languages_sql = s.sql.text(""" - SELECT - e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, - e.programming_language, - e.code_lines, - e.files - FROM - augur_data.repo, - (SELECT +@app.route('/{}/complexity/project_languages'.format(AUGUR_API_VERSION), methods=["GET"]) +def get_project_languages(): + project_languages_sql = s.sql.text(""" + SELECT + e.repo_id, + augur_data.repo.repo_git, + augur_data.repo.repo_name, + e.programming_language, + e.code_lines, + e.files + FROM + augur_data.repo, + (SELECT + d.repo_id, + d.programming_language, + SUM(d.code_lines) AS code_lines, + COUNT(*)::int AS files + FROM + (SELECT + augur_data.repo_labor.repo_id, + augur_data.repo_labor.programming_language, + augur_data.repo_labor.code_lines + FROM + augur_data.repo_labor, + ( SELECT + augur_data.repo_labor.repo_id, + MAX ( data_collection_date ) AS last_collected + FROM + augur_data.repo_labor + GROUP BY augur_data.repo_labor.repo_id) recent + WHERE + augur_data.repo_labor.repo_id = recent.repo_id + AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + GROUP BY d.repo_id, d.programming_language) e + WHERE augur_data.repo.repo_id = e.repo_id + ORDER BY e.repo_id + """) + + with engine.connect() as conn: + results = pd.read_sql(project_languages_sql, conn) + data = results.to_json(orient="records", date_format='iso', date_unit='ms') + return Response(response=data, + status=200, + mimetype="application/json") + +@app.route('/{}/complexity/project_files'.format(AUGUR_API_VERSION), methods=["GET"]) +def get_project_files(): + project_files_sql = s.sql.text(""" + SELECT + e.repo_id, + augur_data.repo.repo_git, + augur_data.repo.repo_name, + e.files + FROM + augur_data.repo, + (SELECT d.repo_id, - d.programming_language, - SUM(d.code_lines) AS code_lines, - COUNT(*)::int AS files + count(*) AS files FROM (SELECT - augur_data.repo_labor.repo_id, - augur_data.repo_labor.programming_language, - augur_data.repo_labor.code_lines + augur_data.repo_labor.repo_id FROM augur_data.repo_labor, ( SELECT @@ -43,119 +82,122 @@ def get_project_languages(): WHERE augur_data.repo_labor.repo_id = recent.repo_id AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d - GROUP BY d.repo_id, d.programming_language) e - WHERE augur_data.repo.repo_id = e.repo_id - ORDER BY e.repo_id - """) + GROUP BY d.repo_id) e + WHERE augur_data.repo.repo_id = e.repo_id + ORDER BY e.repo_id + """) - with server.engine.connect() as conn: - results = pd.read_sql(project_languages_sql, conn) - data = 
results.to_json(orient="records", date_format='iso', date_unit='ms') - return Response(response=data, - status=200, - mimetype="application/json") + with engine.connect() as conn: + results = pd.read_sql(project_files_sql, conn) + data = results.to_json(orient="records", date_format='iso', date_unit='ms') + return Response(response=data, + status=200, + mimetype="application/json") - @server.app.route('/{}/complexity/project_files'.format(AUGUR_API_VERSION), methods=["GET"]) - def get_project_files(): - project_files_sql = s.sql.text(""" - SELECT - e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, - e.files +@app.route('/{}/complexity/project_lines'.format(AUGUR_API_VERSION), methods=["GET"]) +def get_project_lines(): + project_lines_sql = s.sql.text(""" + SELECT + e.repo_id, + augur_data.repo.repo_git, + augur_data.repo.repo_name, + e.total_lines, + e.average_lines + FROM + augur_data.repo, + (SELECT + d.repo_id, + SUM(d.total_lines) AS total_lines, + AVG(d.total_lines)::INT AS average_lines FROM - augur_data.repo, - (SELECT - d.repo_id, - count(*) AS files - FROM - (SELECT - augur_data.repo_labor.repo_id - FROM - augur_data.repo_labor, - ( SELECT - augur_data.repo_labor.repo_id, - MAX ( data_collection_date ) AS last_collected - FROM - augur_data.repo_labor - GROUP BY augur_data.repo_labor.repo_id) recent - WHERE - augur_data.repo_labor.repo_id = recent.repo_id - AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d - GROUP BY d.repo_id) e - WHERE augur_data.repo.repo_id = e.repo_id - ORDER BY e.repo_id - """) + (SELECT + augur_data.repo_labor.repo_id, + augur_data.repo_labor.total_lines + FROM + augur_data.repo_labor, + ( SELECT + augur_data.repo_labor.repo_id, + MAX ( data_collection_date ) AS last_collected + FROM + augur_data.repo_labor + GROUP BY augur_data.repo_labor.repo_id) recent + WHERE + augur_data.repo_labor.repo_id = recent.repo_id + AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + GROUP BY d.repo_id) e + WHERE augur_data.repo.repo_id = e.repo_id + ORDER BY e.repo_id + """) - with server.engine.connect() as conn: - results = pd.read_sql(project_files_sql, conn) - data = results.to_json(orient="records", date_format='iso', date_unit='ms') - return Response(response=data, - status=200, - mimetype="application/json") + with engine.connect() as conn: + results = pd.read_sql(project_lines_sql, conn) + data = results.to_json(orient="records", date_format='iso', date_unit='ms') + return Response(response=data, + status=200, + mimetype="application/json") - @server.app.route('/{}/complexity/project_lines'.format(AUGUR_API_VERSION), methods=["GET"]) - def get_project_lines(): - project_lines_sql = s.sql.text(""" - SELECT - e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, - e.total_lines, - e.average_lines +@app.route('/{}/complexity/project_comment_lines'.format(AUGUR_API_VERSION), methods=["GET"]) +def get_project_comment_lines(): + comment_lines_sql = s.sql.text(""" + SELECT + e.repo_id, + augur_data.repo.repo_git, + augur_data.repo.repo_name, + e.comment_lines, + e.avg_comment_lines + FROM + augur_data.repo, + (SELECT + d.repo_id, + SUM(d.comment_lines) AS comment_lines, + AVG(d.comment_lines)::INT AS avg_comment_lines FROM - augur_data.repo, - (SELECT - d.repo_id, - SUM(d.total_lines) AS total_lines, - AVG(d.total_lines)::INT AS average_lines - FROM - (SELECT - augur_data.repo_labor.repo_id, - augur_data.repo_labor.total_lines - FROM - 
augur_data.repo_labor, - ( SELECT - augur_data.repo_labor.repo_id, - MAX ( data_collection_date ) AS last_collected - FROM - augur_data.repo_labor - GROUP BY augur_data.repo_labor.repo_id) recent - WHERE - augur_data.repo_labor.repo_id = recent.repo_id - AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d - GROUP BY d.repo_id) e - WHERE augur_data.repo.repo_id = e.repo_id - ORDER BY e.repo_id - """) + (SELECT + augur_data.repo_labor.repo_id, + augur_data.repo_labor.comment_lines + FROM + augur_data.repo_labor, + ( SELECT + augur_data.repo_labor.repo_id, + MAX ( data_collection_date ) AS last_collected + FROM + augur_data.repo_labor + GROUP BY augur_data.repo_labor.repo_id) recent + WHERE + augur_data.repo_labor.repo_id = recent.repo_id + AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + GROUP BY d.repo_id) e + WHERE augur_data.repo.repo_id = e.repo_id + ORDER BY e.repo_id + """) - with server.engine.connect() as conn: - results = pd.read_sql(project_lines_sql, conn) - data = results.to_json(orient="records", date_format='iso', date_unit='ms') - return Response(response=data, - status=200, - mimetype="application/json") + with engine.connect() as conn: + results = pd.read_sql(comment_lines_sql, conn) + data = results.to_json(orient="records", date_format='iso', date_unit='ms') + return Response(response=data, + status=200, + mimetype="application/json") - @server.app.route('/{}/complexity/project_comment_lines'.format(AUGUR_API_VERSION), methods=["GET"]) - def get_project_comment_lines(): - comment_lines_sql = s.sql.text(""" - SELECT - e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, - e.comment_lines, - e.avg_comment_lines +@app.route('/{}/complexity/project_blank_lines'.format(AUGUR_API_VERSION), methods=["GET"]) +def get_project_blank_lines(): + blank_lines_sql = s.sql.text(""" + SELECT + e.repo_id, + augur_data.repo.repo_git, + augur_data.repo.repo_name, + e.blank_lines, + e.avg_blank_lines + FROM + augur_data.repo, + (SELECT + d.repo_id, + SUM(d.blank_lines) AS blank_lines, + AVG(d.blank_lines)::int AS avg_blank_lines FROM - augur_data.repo, - (SELECT - d.repo_id, - SUM(d.comment_lines) AS comment_lines, - AVG(d.comment_lines)::INT AS avg_comment_lines - FROM - (SELECT - augur_data.repo_labor.repo_id, - augur_data.repo_labor.comment_lines - FROM + (SELECT + augur_data.repo_labor.repo_id, + augur_data.repo_labor.blank_lines + FROM augur_data.repo_labor, ( SELECT augur_data.repo_labor.repo_id, @@ -167,99 +209,57 @@ def get_project_comment_lines(): augur_data.repo_labor.repo_id = recent.repo_id AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d GROUP BY d.repo_id) e - WHERE augur_data.repo.repo_id = e.repo_id - ORDER BY e.repo_id + WHERE augur_data.repo.repo_id = e.repo_id + ORDER BY e.repo_id """) - with server.engine.connect() as conn: - results = pd.read_sql(comment_lines_sql, conn) - data = results.to_json(orient="records", date_format='iso', date_unit='ms') - return Response(response=data, - status=200, - mimetype="application/json") - - @server.app.route('/{}/complexity/project_blank_lines'.format(AUGUR_API_VERSION), methods=["GET"]) - def get_project_blank_lines(): - blank_lines_sql = s.sql.text(""" - SELECT - e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, - e.blank_lines, - e.avg_blank_lines - FROM - augur_data.repo, - (SELECT - d.repo_id, - SUM(d.blank_lines) AS blank_lines, - 
AVG(d.blank_lines)::int AS avg_blank_lines - FROM - (SELECT - augur_data.repo_labor.repo_id, - augur_data.repo_labor.blank_lines - FROM - augur_data.repo_labor, - ( SELECT - augur_data.repo_labor.repo_id, - MAX ( data_collection_date ) AS last_collected - FROM - augur_data.repo_labor - GROUP BY augur_data.repo_labor.repo_id) recent - WHERE - augur_data.repo_labor.repo_id = recent.repo_id - AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d - GROUP BY d.repo_id) e - WHERE augur_data.repo.repo_id = e.repo_id - ORDER BY e.repo_id - """) - - with server.engine.connect() as conn: - results = pd.read_sql(blank_lines_sql, conn) - data = results.to_json(orient="records", date_format='iso', date_unit='ms') - return Response(response=data, - status=200, - mimetype="application/json") - + with engine.connect() as conn: + results = pd.read_sql(blank_lines_sql, conn) + data = results.to_json(orient="records", date_format='iso', date_unit='ms') + return Response(response=data, + status=200, + mimetype="application/json") + - @server.app.route('/{}/complexity/project_file_complexity'.format(AUGUR_API_VERSION), methods=["GET"]) - def get_project_file_complexity(): - project_file_complexity_sql = s.sql.text(""" - SELECT - e.repo_id, - augur_data.repo.repo_git, - augur_data.repo.repo_name, - e.sum_code_complexity, - e.average_code_complexity +@app.route('/{}/complexity/project_file_complexity'.format(AUGUR_API_VERSION), methods=["GET"]) +def get_project_file_complexity(): + project_file_complexity_sql = s.sql.text(""" + SELECT + e.repo_id, + augur_data.repo.repo_git, + augur_data.repo.repo_name, + e.sum_code_complexity, + e.average_code_complexity + FROM + augur_data.repo, + (SELECT + d.repo_id, + SUM(d.code_complexity) AS sum_code_complexity, + AVG(d.code_complexity)::int AS average_code_complexity FROM - augur_data.repo, - (SELECT - d.repo_id, - SUM(d.code_complexity) AS sum_code_complexity, - AVG(d.code_complexity)::int AS average_code_complexity - FROM - (SELECT - augur_data.repo_labor.repo_id, - augur_data.repo_labor.code_complexity - FROM - augur_data.repo_labor, - ( SELECT - augur_data.repo_labor.repo_id, - MAX ( data_collection_date ) AS last_collected - FROM - augur_data.repo_labor - GROUP BY augur_data.repo_labor.repo_id) recent - WHERE - augur_data.repo_labor.repo_id = recent.repo_id - AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d - GROUP BY d.repo_id) e - WHERE augur_data.repo.repo_id = e.repo_id - ORDER BY e.repo_id - """) - - with server.engine.connect() as conn: - results = pd.read_sql(project_file_complexity_sql, conn) - data = results.to_json(orient="records", date_format='iso', date_unit='ms') - return Response(response=data, - status=200, - mimetype="application/json") + (SELECT + augur_data.repo_labor.repo_id, + augur_data.repo_labor.code_complexity + FROM + augur_data.repo_labor, + ( SELECT + augur_data.repo_labor.repo_id, + MAX ( data_collection_date ) AS last_collected + FROM + augur_data.repo_labor + GROUP BY augur_data.repo_labor.repo_id) recent + WHERE + augur_data.repo_labor.repo_id = recent.repo_id + AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + GROUP BY d.repo_id) e + WHERE augur_data.repo.repo_id = e.repo_id + ORDER BY e.repo_id + """) + with engine.connect() as conn: + results = pd.read_sql(project_file_complexity_sql, conn) + data = results.to_json(orient="records", date_format='iso', date_unit='ms') + return 
Response(response=data, + status=200, + mimetype="application/json") + diff --git a/augur/api/routes/dei.py b/augur/api/routes/dei.py index dea79b79c2..82324a8d62 100644 --- a/augur/api/routes/dei.py +++ b/augur/api/routes/dei.py @@ -52,7 +52,7 @@ def dei_track_repo(application: ClientApplication): return jsonify({"status": "Repo already exists"}) frontend_repo_group: RepoGroup = session.query(RepoGroup).filter(RepoGroup.rg_name == FRONTEND_REPO_GROUP_NAME).first() - repo_id = Repo.insert(session, repo_url, frontend_repo_group.repo_group_id, "API.DEI", repo_type="") + repo_id = Repo.insert_github_repo(session, repo_url, frontend_repo_group.repo_group_id, "API.DEI", repo_type="") if not repo_id: return jsonify({"status": "Error adding repo"}) diff --git a/augur/api/routes/user.py b/augur/api/routes/user.py index dfaeb81f7f..62bc44068a 100644 --- a/augur/api/routes/user.py +++ b/augur/api/routes/user.py @@ -227,7 +227,7 @@ def add_user_repo(): repo = request.args.get("repo_url") group_name = request.args.get("group_name") - result = current_user.add_repo(group_name, repo) + result = current_user.add_github_repo(group_name, repo) return jsonify(result[1]) @@ -260,7 +260,7 @@ def add_user_org(): org = request.args.get("org_url") group_name = request.args.get("group_name") - result = current_user.add_org(group_name, org) + result = current_user.add_github_org(group_name, org) return jsonify(result[1]) diff --git a/augur/api/routes/util.py b/augur/api/routes/util.py index 1f95b9b7a2..71d3526b96 100644 --- a/augur/api/routes/util.py +++ b/augur/api/routes/util.py @@ -1,10 +1,11 @@ #SPDX-License-Identifier: MIT +from augur.api.routes import AUGUR_API_VERSION +from ..server import app, engine import base64 import sqlalchemy as s import pandas as pd import json from flask import Response -import logging from augur.application.db.session import DatabaseSession from augur.application.logs import AugurLogger @@ -12,10 +13,6 @@ logger = AugurLogger("augur").get_logger() -from augur.api.routes import AUGUR_API_VERSION -from ..server import app, engine - - @app.route('/{}/repo-groups'.format(AUGUR_API_VERSION)) def get_all_repo_groups(): #TODO: make this name automatic - wrapper? 
repoGroupsSQL = s.sql.text(""" @@ -54,9 +51,9 @@ def get_all_repos(): (select * from api_get_all_repos_issues) b on repo.repo_id = b.repo_id - left outer join - (select * from api_get_all_repo_prs) c - on repo.repo_id=c.repo_id + left outer join + (select * from api_get_all_repo_prs) c + on repo.repo_id=c.repo_id JOIN repo_groups ON repo_groups.repo_group_id = repo.repo_group_id order by repo_name """) @@ -95,9 +92,9 @@ def get_repos_in_repo_group(repo_group_id): (select * from api_get_all_repos_issues) b on repo.repo_id = b.repo_id - left outer join - (select * from api_get_all_repo_prs) c - on repo.repo_id=c.repo_id + left outer join + (select * from api_get_all_repo_prs) c + on repo.repo_id=c.repo_id JOIN repo_groups ON repo_groups.repo_group_id = repo.repo_group_id WHERE repo_groups.repo_group_id = :repo_group_id @@ -111,6 +108,49 @@ def get_repos_in_repo_group(repo_group_id): status=200, mimetype="application/json") +@app.route('/{}/repos/<repo_id>'.format(AUGUR_API_VERSION)) +def get_repo_by_id(repo_id: int) -> Response: + repo_by_id_SQL = s.sql.text(""" + SELECT + repo.repo_id, + repo.repo_name, + repo.description, + repo.repo_git AS url, + a.commits_all_time, + b.issues_all_time, + c.pull_requests_all_time, + rg_name, + repo.repo_group_id + FROM + repo + LEFT OUTER JOIN + (SELECT * FROM api_get_all_repos_commits) a + ON repo.repo_id = a.repo_id + LEFT OUTER JOIN + (SELECT * FROM api_get_all_repos_issues) b + ON repo.repo_id = b.repo_id + LEFT OUTER JOIN + (SELECT * FROM api_get_all_repo_prs) c + ON repo.repo_id = c.repo_id + JOIN repo_groups ON repo_groups.repo_group_id = repo.repo_group_id + WHERE + repo.repo_id = :id + """) + + results = pd.read_sql(repo_by_id_SQL, engine, params={"id": repo_id}) + results["url"] = results["url"].apply(lambda datum: datum.split("//")[1]) # cut "https://" off the URL + results["base64_url"] = [base64.b64encode(results.at[i, "url"].encode()) for i in results.index] + data = results.to_json(orient="records", date_format="iso", date_unit="ms") + + if not data or data == "[]": + return Response(response='{"status": "Repository ' + str(repo_id) + ' does not exist"}', + status=400, + mimetype="application/json") + + return Response(response=data[1:-1], # cut off brackets at each end, turns list of length 1 into single value + status=200, + mimetype="application/json") + @app.route('/{}/owner/<owner>/repo/<repo>'.format(AUGUR_API_VERSION)) def get_repo_by_git_name(owner, repo): diff --git a/augur/api/view/api.py b/augur/api/view/api.py index 287b079436..598c0cdb6d 100644 --- a/augur/api/view/api.py +++ b/augur/api/view/api.py @@ -102,7 +102,18 @@ def av_add_user_repo(): if rg_obj: # add the orgs repos to the group add_existing_org_to_group(session, current_user.user_id, group, rg_obj.repo_group_id) - + + # matches https://gitlab.com/{org}/{repo}/ or http://gitlab.com/{org}/{repo} + elif Repo.parse_gitlab_repo_url(url)[0]: + + org_name, repo_name = Repo.parse_gitlab_repo_url(url) + repo_git = f"https://gitlab.com/{org_name}/{repo_name}" + + # TODO: gitlab ensure the whole repo git is inserted so it can be found here + repo_obj = Repo.get_by_repo_git(session, repo_git) + if repo_obj: + add_existing_repo_to_group(session, current_user.user_id, group, repo_obj.repo_id) + else: invalid_urls.append(url) diff --git a/augur/api/view/routes.py b/augur/api/view/routes.py index ed617cd36c..72164a9291 100644 --- a/augur/api/view/routes.py +++ b/augur/api/view/routes.py @@ -1,4 +1,8 @@ +""" +Defines the api routes for the augur views +""" import logging +import math from flask import
Flask, render_template, render_template_string, request, abort, jsonify, redirect, url_for, session, flash from sqlalchemy.orm.exc import NoResultFound from .utils import * @@ -37,9 +41,9 @@ def root(path=""): def logo(brand=None): if brand is None: return redirect(url_for('static', filename='img/augur_logo.png')) - elif "augur" in brand: + if "augur" in brand: return logo(None) - elif "chaoss" in brand: + if "chaoss" in brand: return redirect(url_for('static', filename='img/Chaoss_Logo_white.png')) return "" @@ -74,17 +78,15 @@ def repo_table_view(): pagination_offset = config.get_value("frontend", "pagination_offset") if current_user.is_authenticated: - data = load_repos_test(user = current_user, search = query, page = page, sort = sorting, direction = direction, source = "user") - page_count = load_repos_test(user = current_user, search = query, count = True, source = "user") - # data = current_user.get_repos(page = page, sort = sorting, direction = direction, search=query)[0] - # page_count = (current_user.get_repo_count(search = query)[0] or 0) // pagination_offset + data = current_user.get_repos(page = page, sort = sorting, direction = direction, search=query)[0] + repos_count = (current_user.get_repo_count(search = query)[0] or 0) else: - data = load_repos_test(search = query, page = page, sort = sorting, direction = direction) - page_count = load_repos_test(search = query, count = True) - # data = get_all_repos(page = page, sort = sorting, direction = direction, search=query)[0] - # page_count = (get_all_repos_count(search = query)[0] or 0) // pagination_offset + data = get_all_repos(page = page, sort = sorting, direction = direction, search=query)[0] + repos_count = (get_all_repos_count(search = query)[0] or 0) + + page_count = math.ceil(repos_count / pagination_offset) - 1 - if not data.count(): + if not data: data = None diff --git a/augur/api/view/utils.py b/augur/api/view/utils.py index 76551a6ab9..298e9950ae 100644 --- a/augur/api/view/utils.py +++ b/augur/api/view/utils.py @@ -1,6 +1,10 @@ +""" +Defines utility functions used by the augur api views +""" from pathlib import Path from concurrent.futures import ThreadPoolExecutor from flask import render_template, flash, url_for, Flask +from .init import init_logging from .init import * from ..server import app, db_session from augur.application.config import AugurConfig @@ -9,11 +13,13 @@ from augur.application.db.session import DatabaseSession from augur.application.db.engine import DatabaseEngine from augur.application.db.models import User, Repo, RepoGroup, UserGroup, UserRepo -from sqlalchemy import Column, Table, Integer, MetaData, or_, Label +from sqlalchemy import Column, Table, Integer, MetaData, or_ from sqlalchemy.sql.operators import ilike_op, distinct_op from sqlalchemy.sql.functions import coalesce from augur.application.db.models.base import Base +from sqlalchemy.orm import Query + init_logging() from .init import logger @@ -309,67 +315,5 @@ def render_module(module, **args): args.setdefault("body", module) return render_template('index.j2', **args) -""" ---------------------------------------------------------------- - No longer used -""" -# My attempt at a loading page -def renderLoading(dest, query, request): - cache_files_requested.append(request) - return render_template('index.j2', body="loading", title="Loading", d=dest, query_key=query, api_url=getSetting('serving')) - -with DatabaseEngine() as engine: - augur_data_schema = MetaData(schema = "augur_data") - augur_data_schema.reflect(bind = engine, views = 
True) - - commits_materialized_view: Table = augur_data_schema.tables["augur_data.api_get_all_repos_commits"] - issues_materialized_view: Table = augur_data_schema.tables["augur_data.api_get_all_repos_issues"] - """ ---------------------------------------------------------------- """ -def load_repos_test(count = False, source = None, **kwargs): - columns: list[Label] = [ - Repo.repo_id.distinct().label("repo_id"), - Repo.description.label("description"), - Repo.repo_git.label("url"), - coalesce(commits_materialized_view.columns.commits_all_time, 0).label("commits_all_time"), - coalesce(issues_materialized_view.columns.issues_all_time, 0).label("issues_all_time"), - RepoGroup.rg_name.label("rg_name"), - Repo.repo_git.regexp_replace('.*github\.com\/[A-Za-z0-9 \- _]+\/([A-Za-z0-9 \- _ .]+)$', "\\1").label("repo_name"), - Repo.repo_git.regexp_replace('.*github\.com\/([A-Za-z0-9 \- _]+)\/[A-Za-z0-9 \- _ .]+$', "\\1").label("repo_owner"), - RepoGroup.repo_group_id.label("repo_group_id") - ] - - def get_colum_by_label(label: str)-> Label: - for column in columns: - if column.name == label: - return column - - repos = db_session.query(*columns)\ - .outerjoin(commits_materialized_view, Repo.repo_id == commits_materialized_view.columns.repo_id)\ - .outerjoin(issues_materialized_view, Repo.repo_id == issues_materialized_view.columns.repo_id)\ - .join(RepoGroup, Repo.repo_group_id == RepoGroup.repo_group_id) - - user: User = kwargs.get("user") - if user: - repos = repos.join(UserRepo, Repo.repo_id == UserRepo.repo_id)\ - .join(UserGroup, UserGroup.group_id == UserRepo.group_id)\ - .filter(UserGroup.user_id == user.user_id) - - search = kwargs.get("search") - qkey = kwargs.get("query_key") or ["repo_name", "repo_owner"] - if search: - if isinstance(qkey, list) and len(qkey) > 0: - repos = repos.filter(or_(ilike_op(get_colum_by_label(filter_column), f"%{search}%") for filter_column in qkey)) - else: - repos = repos.filter(ilike_op(get_colum_by_label(qkey), f"%{search}%")) - - page_size: int = kwargs.get("page_size") or 25 - if count: - c = repos.count() - return math.ceil(c / page_size) - 1 - - page: int = kwargs.get("page") or 0 - offset = page * page_size - - return repos.slice(offset, offset + page_size) - diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index 29afab2b0d..fc466f021c 100644 --- a/augur/application/cli/backend.py +++ b/augur/application/cli/backend.py @@ -91,9 +91,12 @@ def start(disable_collection, development, port): logger.info("Deleting old task schedule") os.remove("celerybeat-schedule.db") - celery_beat_process = None - celery_command = "celery -A augur.tasks.init.celery_app.celery_app beat -l debug" - celery_beat_process = subprocess.Popen(celery_command.split(" ")) + with DatabaseSession(logger) as db_session: + config = AugurConfig(logger, db_session) + log_level = config.get_value("Logging", "log_level") + celery_beat_process = None + celery_command = f"celery -A augur.tasks.init.celery_app.celery_app beat -l {log_level.lower()}" + celery_beat_process = subprocess.Popen(celery_command.split(" ")) if not disable_collection: diff --git a/augur/application/db/data_parse.py b/augur/application/db/data_parse.py index abdc6de54c..7562181398 100644 --- a/augur/application/db/data_parse.py +++ b/augur/application/db/data_parse.py @@ -37,8 +37,63 @@ def extract_needed_pr_label_data(labels: List[dict], repo_id: int, tool_source: return label_dicts -# retrieve only the needed data for pr assignees from the api response + +def 
extract_needed_mr_label_data(labels: List[dict], repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Retrieve only the needed data for mr label data from the api response + + Arguments: + labels: List of dictionaries of label data + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of parsed label dicts + """ + + if len(labels) == 0: + return [] + + label_dicts = [] + for label in labels: + + label_dict = { + 'pr_src_id': label['id'], + 'pr_src_node_id': None, + 'pr_src_url': None, + 'pr_src_description': label['name'], + 'pr_src_color': label['color'], + # TODO: Populate this by making an api call for each label + 'pr_src_default_bool': None, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source, + 'repo_id': repo_id + } + + label_dicts.append(label_dict) + + return label_dicts + + def extract_needed_pr_assignee_data(assignees: List[dict], repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Retrieve only the needed data for pr assignees from the api response + + Arguments: + assignees: List of dictionaries of assignee data + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of parsed assignee dicts + """ if len(assignees) == 0: return [] @@ -48,7 +103,6 @@ def extract_needed_pr_assignee_data(assignees: List[dict], repo_id: int, tool_so for assignee in assignees: assignee_dict = { - # store the pr_url data on in the pr assignee data for now so we can relate it back to a pr later 'contrib_id': assignee["cntrb_id"], 'pr_assignee_src_id': int(assignee['id']), 'tool_source': tool_source, @@ -61,8 +115,59 @@ def extract_needed_pr_assignee_data(assignees: List[dict], repo_id: int, tool_so return assignee_dicts -# retrieve only the needed data for pr reviewers from the api response +def extract_needed_merge_request_assignee_data(assignees: List[dict], repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Retrieve only the needed data for merge request assignees from the api response + + Arguments: + assignees: List of dictionaries of assignee data + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of parsed assignee dicts + """ + + if len(assignees) == 0: + return [] + + assignee_dicts = [] + for assignee in assignees: + + assignee_dict = { + 'contrib_id': None, + 'repo_id': repo_id, + # TODO: Temporarily setting this to id, which is the id of the contributor, until we can get the contrib_id set and create a unique on the contrib_id and the pull_request_id + 'pr_assignee_src_id': assignee["id"], + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + + assignee_dicts.append(assignee_dict) + + return assignee_dicts + + + def extract_needed_pr_reviewer_data(reviewers: List[dict], repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Retrieve only the needed data for pr reviewers from the api response + + Arguments: + reviewers: List of dictionaries of reviewer data +
repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of parsed reviewer dicts + """ if len(reviewers) == 0: return [] @@ -247,6 +352,42 @@ def extract_needed_issue_assignee_data(assignees: List[dict], repo_id: int, tool return assignee_dicts +def extract_needed_gitlab_issue_assignee_data(assignees: List[dict], repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Retrieve only the needed data for gitlab issue assignees from the api response + + Arguments: + assignees: List of dictionaries of gitlab assignee data + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of parsed assignee dicts + """ + + if len(assignees) == 0: + return [] + + assignee_dicts = [] + for assignee in assignees: + + assignee_dict = { + "cntrb_id": None, + "tool_source": tool_source, + "tool_version": tool_version, + "data_source": data_source, + "issue_assignee_src_id": assignee['id'], + "issue_assignee_src_node": None, + "repo_id": repo_id + } + + assignee_dicts.append(assignee_dict) + + return assignee_dicts + # retrieve only the needed data for pr labels from the api response @@ -277,9 +418,62 @@ def extract_needed_issue_label_data(labels: List[dict], repo_id: int, tool_sourc return label_dicts +def extract_needed_gitlab_issue_label_data(labels: List[dict], repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Retrieve only the needed data for gitlab issue labels from the api response + + Arguments: + labels: List of dictionaries of gitlab issue label data + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of parsed label dicts + """ + + if len(labels) == 0: + return [] + + label_dicts = [] + for label in labels: + + label_dict = { + "label_text": label["name"], + "label_description": label.get("description", None), + "label_color": label['color'], + "tool_source": tool_source, + "tool_version": tool_version, + "data_source": data_source, + "label_src_id": label['id'], + "label_src_node_id": None, + "repo_id": repo_id + } + + label_dicts.append(label_dict) + + return label_dicts + + -# retrieve only the needed data for pr labels from the api response def extract_needed_issue_message_ref_data(message: dict, issue_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Retrieve only the needed data for pr labels from the api response + + Arguments: + message: Message data dict + issue_id: id of the issue + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + Dict of message ref data. 
+ """ message_ref_dict = { 'issue_id': issue_id, @@ -311,9 +505,21 @@ def extract_needed_pr_message_ref_data(comment: dict, pull_request_id: int, repo def extract_needed_pr_data(pr, repo_id, tool_source, tool_version): + """ + Retrieve only the needed data for the pr api response + + Arguments: + pr: PR data dict + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + + Returns: + Parsed pr dict + """ - pr_dict = { + pr = { 'repo_id': repo_id, 'pr_url': pr['url'], # 1-22-2022 inconsistent casting; sometimes int, sometimes float in bulk_insert @@ -367,9 +573,23 @@ def extract_needed_pr_data(pr, repo_id, tool_source, tool_version): 'data_source': 'GitHub API' } - return pr_dict + return pr def extract_needed_issue_data(issue: dict, repo_id: int, tool_source: str, tool_version: str, data_source: str): + """ + Retrieve only the needed data for the issue api response + + Arguments: + issue: Issue data dict + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: platform source + + + Returns: + Parsed issue dict + """ dict_data = { 'cntrb_id': None, # this the contributor who closed the issue @@ -513,8 +733,438 @@ def extract_needed_pr_review_data(review, pull_request_id, repo_id, platform_id, return review_row +def extract_needed_pr_data_from_gitlab_merge_request(pr, repo_id, tool_source, tool_version): + """ + Retrieve only the needed data for the pr gitlab api response - + Arguments: + pr: PR data dict + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + + + Returns: + Parsed pr dict + """ + + pr_dict = { + 'repo_id': repo_id, + 'pr_url': pr['web_url'], + 'pr_src_id': pr['id'], + 'pr_src_node_id': None, + 'pr_html_url': pr['web_url'], + 'pr_diff_url': None, + 'pr_patch_url': None, + 'pr_issue_url': None, + 'pr_augur_issue_id': None, + 'pr_src_number': pr['iid'], + 'pr_src_state': pr['state'], + 'pr_src_locked': pr['discussion_locked'], + 'pr_src_title': pr['title'], + # TODO: Add contributor logic for gitlab + 'pr_augur_contributor_id': None, + 'pr_body': pr['description'], + 'pr_created_at': pr['created_at'], + 'pr_updated_at': pr['updated_at'], + 'pr_closed_at': pr['closed_at'], + 'pr_merged_at': pr['merged_at'], + 'pr_merge_commit_sha': pr['merge_commit_sha'], + 'pr_teams': None, + 'pr_milestone': pr['milestone'].get('title') if pr['milestone'] else None, + 'pr_commits_url': None, + 'pr_review_comments_url': None, + 'pr_review_comment_url': None, + 'pr_comments_url': None, + 'pr_statuses_url': None, + 'pr_meta_head_id': None, + 'pr_meta_base_id': None, + 'pr_src_issue_url': None, + 'pr_src_comments_url': None, + 'pr_src_review_comments_url': None, + 'pr_src_commits_url': None, + 'pr_src_statuses_url': None, + 'pr_src_author_association': None, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': 'Gitlab API' + } + + return pr_dict + + +def extract_needed_issue_data_from_gitlab_issue(issue: dict, repo_id: int, tool_source: str, tool_version: str, data_source: str): + """ + Retrieve only the needed data for the issue gitlab api response + + Arguments: + issue: Issue data dict + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that 
processed the data + data_source: platform source + + Returns: + Parsed issue dict + """ + + issue_dict = { + "repo_id": repo_id, + "reporter_id": None, + "pull_request": None, + "pull_request_id": None, + "created_at": issue['created_at'], + "issue_title": issue['title'], + "issue_body": issue['description'] if 'description' in issue else None, + "comment_count": issue['user_notes_count'], + "updated_at": issue['updated_at'], + "closed_at": issue['closed_at'], + "repository_url": issue['_links']['project'], + "issue_url": issue['_links']['self'], + "labels_url": None, + "comments_url": issue['_links']['notes'], + "events_url": None, + "html_url": issue['_links']['self'], + "issue_state": issue['state'], + "issue_node_id": None, + "gh_issue_id": issue['id'], + "gh_issue_number": issue['iid'], + "gh_user_id": issue['author']['id'], + "tool_source": tool_source, + "tool_version": tool_version, + "data_source": data_source + } + + return issue_dict + + + +def extract_gitlab_mr_event_data(event: dict, pr_id: int, platform_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> dict: + """ + Retrieve only the needed data for the mr event gitlab api response + + Arguments: + event: Event data dict + pr_id: id of the pr + platform_id: id of the platform + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: platform source + + + Returns: + Parsed event dict + """ + + mr_event = { + 'pull_request_id': pr_id, + 'cntrb_id': None, + 'action': event['action_name'], + 'action_commit_hash': None, + 'created_at': event['created_at'], + 'issue_event_src_id': event['target_id'], + 'repo_id': repo_id, + 'platform_id': platform_id, + 'node_id': None, + 'node_url': None, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + + return mr_event + +def extract_gitlab_issue_event_data(event: dict, issue_id: int, platform_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> dict: + """ + Retrieve only the needed data for the issue event gitlab api response + + Arguments: + event: Event data dict + issue_id: id of the issue + platform_id: id of the platform + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: platform source + + + Returns: + Parsed event dict + """ + + issue_event = { + "issue_event_src_id": event['target_id'], + "issue_id": issue_id, + "node_id": None, + "node_url": None, + "cntrb_id": None, + "created_at": event['created_at'], + "action": event["action_name"], + "action_commit_hash": None, + "platform_id": platform_id, + "repo_id" : repo_id, + "tool_source": tool_source, + "tool_version": tool_version, + "data_source": data_source + } + + return issue_event + + +def extract_needed_mr_reviewer_data(data: List[dict], pull_request_id, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Retrieve only the needed data for pr reviewers from the api response + + Arguments: + data: List of dictionaries that contain mr reviewer data to parse + pull_request_id: id of the PR + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of extracted relevant data from needed mr reviwer data + """ + + if len(data) == 0: + 
return [] + + reviewer_dicts = [] + for x in data: + + for _ in x["suggested_approvers"]: + + reviewer_dict = { + 'pull_request_id': pull_request_id, + 'cntrb_id': None, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + + reviewer_dicts.append(reviewer_dict) + + return reviewer_dicts + + +def extract_needed_mr_commit_data(commit, repo_id, pull_request_id, tool_source, tool_version, data_source): + """ + Retrieve only the needed data for mr commit data from the api response + + Arguments: + commit: commit data dictionary + repo_id: augur id of the repository + pull_request_id: id of the PR + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + Dictionary of the extracted commit data + """ + + commit = { + 'pull_request_id': pull_request_id, + 'pr_cmt_sha': commit['id'], + 'pr_cmt_node_id': None, + 'pr_cmt_message': commit['message'], + 'repo_id': repo_id, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source, + } + + return commit + + +def extract_needed_mr_file_data(gitlab_file_data, repo_id, pull_request_id, tool_source, tool_version, data_source): + """ + Retrieve only the needed data for mr file data from the api response + Arguments: + gitlab_file_data: file data dictionary + repo_id: augur id of the repository + pull_request_id: id of the PR + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of dicts of parsed gitlab file changes + """ + files = [] + + changes = gitlab_file_data["changes"] + for file_changes in changes: + try: + deletes = int(file_changes['diff'].split('@@')[1].strip().split(' ')[0].split(',')[1]) + adds = int(file_changes['diff'].split('@@')[1].strip().split(' ')[1].split(',')[1]) + except Exception: + deletes = 0 + adds = 0 + + file_dict = { + 'pull_request_id': pull_request_id, + 'repo_id': repo_id, + 'pr_file_additions': adds, + 'pr_file_deletions': deletes, + 'pr_file_path': file_changes['old_path'], + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source, + } + + files.append(file_dict) + + return files + + +def extract_needed_mr_metadata(mr_dict, repo_id, pull_request_id, tool_source, tool_version, data_source): + """ + Retrieve only the needed data for mr metadata from the api response + + Arguments: + mr_dict: mr data dictionary + repo_id: augur id of the repository + pull_request_id: id of the PR + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of dicts of parsed mr metadata + """ + head = {'sha': mr_dict['diff_refs']['head_sha'], + 'ref': mr_dict['target_branch'], + 'label': str(mr_dict['target_project_id']) + ':' + mr_dict['target_branch'], + 'author': mr_dict['author']['username'], + 'repo': str(mr_dict['target_project_id']) + } + + base = {'sha': mr_dict['diff_refs']['base_sha'], + 'ref': mr_dict['source_branch'], + 'label': str(mr_dict['source_project_id']) + ':' + mr_dict['source_branch'], + 'author': mr_dict['author']['username'], + 'repo': str(mr_dict['source_project_id']) + } + + pr_meta_dict = { + 'head': head, + 'base': base + } + all_meta = [] + for pr_side, pr_meta_data in pr_meta_dict.items(): + pr_meta = { + 'pull_request_id': 
pull_request_id, + 'repo_id': repo_id, + 'pr_head_or_base': pr_side, + 'pr_src_meta_label': pr_meta_data['label'], + 'pr_src_meta_ref': pr_meta_data['ref'], + 'pr_sha': pr_meta_data['sha'], + 'cntrb_id': None, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + all_meta.append(pr_meta) + + return all_meta + + +def extract_needed_gitlab_issue_message_ref_data(message: dict, issue_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Extract the message id for a given message on an issue from an api response + and connect it to the relevant repo id. + + Arguments: + message: message data dict + issue_id: id of the issue + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + Dict containing the message ref id as well as the repo id. + """ + + message_ref_dict = { + 'issue_id': issue_id, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source, + 'issue_msg_ref_src_comment_id': int(message['id']), + 'issue_msg_ref_src_node_id': None, + 'repo_id': repo_id + } + + return message_ref_dict + + +def extract_needed_gitlab_message_data(comment: dict, platform_id: int, tool_source: str, tool_version: str, data_source: str): + """ + Extract specific metadata for a comment from an api response + and connect it to the relevant platform id. + + Arguments: + comment: comment data dict + platform_id: augur id of the relevant platform + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + Dict containing parsed comment text and metadata + """ + + comment_dict = { + "pltfrm_id": platform_id, + "msg_text": comment['body'], + "msg_timestamp": comment['created_at'], + "cntrb_id": None, + "platform_msg_id": int(comment['id']), + "tool_source": tool_source, + "tool_version": tool_version, + "data_source": data_source + } + + return comment_dict + +def extract_needed_gitlab_mr_message_ref_data(comment: dict, pull_request_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: + """ + Retrieve only the needed data for pr labels from the api response + + Arguments: + comment: comment data dict + pull_request_id: id of the PR + repo_id: augur id of the repository + platform_id: augur id of the relevant platform + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + Dict containing the comment, pr and repo id of the parsed comment data. 
+ """ + + pr_msg_ref = { + 'pull_request_id': pull_request_id, + 'pr_message_ref_src_comment_id': comment['id'], + 'repo_id': repo_id, + 'pr_message_ref_src_node_id': None, + 'tool_source': tool_source, + 'tool_version': tool_version, + 'data_source': data_source + } + + return pr_msg_ref diff --git a/augur/application/db/models/augur_data.py b/augur/application/db/models/augur_data.py index 3ed584ee14..7f97e4bbdc 100644 --- a/augur/application/db/models/augur_data.py +++ b/augur/application/db/models/augur_data.py @@ -564,6 +564,8 @@ class RepoGroup(Base): data_source = Column(String) data_collection_date = Column(TIMESTAMP(precision=0)) + repo = relationship("Repo", back_populates="repo_group") + @staticmethod def is_valid_repo_group_id(session, repo_group_id: int) -> bool: """Deterime is repo_group_id exists. @@ -866,8 +868,8 @@ class Repo(Base): TIMESTAMP(precision=0), server_default=text("CURRENT_TIMESTAMP") ) - repo_group = relationship("RepoGroup") - user_repo = relationship("UserRepo") + repo_group = relationship("RepoGroup", back_populates="repo") + user_repo = relationship("UserRepo", back_populates="repo") collection_status = relationship("CollectionStatus", back_populates="repo") issues = relationship("Issue", back_populates="repo") prs = relationship("PullRequest", back_populates="repo") @@ -927,6 +929,44 @@ def is_valid_github_repo(gh_session, url: str) -> bool: return False, {"status": f"Github Error: {data['message']}"} return True, {"status": "Valid repo", "repo_type": data["owner"]["type"]} + + @staticmethod + def is_valid_gitlab_repo(gl_session, url: str) -> bool: + """Determine whether a GitLab repo URL is valid. + + Args: + gl_session: GitLab session object with API key + url: Repository URL + + Returns: + True if repo URL is valid, False otherwise + """ + from augur.tasks.github.util.github_paginator import hit_api + + REPO_ENDPOINT = "https://gitlab.com/api/v4/projects/{}/" + + owner, repo = Repo.parse_gitlab_repo_url(url) + if not owner or not repo: + return False, {"status": "Invalid repo URL"} + + # Encode namespace and project name for the API request + project_identifier = f"{owner}%2F{repo}" + url = REPO_ENDPOINT.format(project_identifier) + + attempts = 0 + while attempts < 10: + response = hit_api(gl_session.oauths, url, logger) + + if response.status_code == 404: + return False, {"status": "Invalid repo"} + + if response.status_code == 200: + return True, {"status": "Valid repo"} + + attempts += 1 + + return False, {"status": "Failed to validate repo after multiple attempts"} + @staticmethod def parse_github_repo_url(url: str) -> tuple: @@ -946,6 +986,29 @@ def parse_github_repo_url(url: str) -> tuple: capturing_groups = result.groups() + owner = capturing_groups[0] + repo = capturing_groups[1] + + return owner, repo + + @staticmethod + def parse_gitlab_repo_url(url: str) -> tuple: + """ Gets the owner and repo from a gitlab url. + + Args: + url: Gitlab url + + Returns: + Tuple of owner and repo. Or a tuple of None and None if the url is invalid. 
+ """ + + result = re.search(r"https?:\/\/gitlab\.com\/([A-Za-z0-9 \- _]+)\/([A-Za-z0-9 \- _ \.]+)(.git)?\/?$", url) + + if not result: + return None, None + + capturing_groups = result.groups() + owner = capturing_groups[0] repo = capturing_groups[1] @@ -972,12 +1035,60 @@ def parse_github_org_url(url): return result.groups()[0] @staticmethod - def insert(session, url: str, repo_group_id: int, tool_source, repo_type): + def insert_gitlab_repo(session, url: str, repo_group_id: int, tool_source): + """Add a repo to the repo table. + + Args: + url: repo url + repo_group_id: group to assign repo to + + Note: + If repo row exists then it will update the repo_group_id if param repo_group_id is not a default. If it does not exist is will simply insert the repo. + """ + + if not isinstance(url, str) or not isinstance(repo_group_id, int) or not isinstance(tool_source, str): + return None + + if not RepoGroup.is_valid_repo_group_id(session, repo_group_id): + return None + + if url.endswith("/"): + url = url[:-1] + + url = url.lower() + + owner, repo = Repo.parse_gitlab_repo_url(url) + if not owner or not repo: + return None + + repo_data = { + "repo_group_id": repo_group_id, + "repo_git": url, + "repo_path": f"gitlab.com/{owner}/", + "repo_name": repo, + "repo_type": None, + "tool_source": tool_source, + "tool_version": "1.0", + "data_source": "Git" + } + + repo_unique = ["repo_git"] + return_columns = ["repo_id"] + result = session.insert_data(repo_data, Repo, repo_unique, return_columns, on_conflict_update=False) + + if not result: + return None + + return result[0]["repo_id"] + + @staticmethod + def insert_github_repo(session, url: str, repo_group_id: int, tool_source, repo_type): """Add a repo to the repo table. Args: url: repo url repo_group_id: group to assign repo to + repo_type: github or gitlab Note: If repo row exists then it will update the repo_group_id if param repo_group_id is not a default. If it does not exist is will simply insert the repo. 
@@ -1208,10 +1319,6 @@ class Commit(Base): primaryjoin="Commit.cmt_author_platform_username == Contributor.cntrb_login", back_populates="commits" ) - contributor1 = relationship( - "Contributor", - primaryjoin="Commit.cmt_author_platform_username == Contributor.cntrb_login", - ) repo = relationship("Repo", back_populates="commits") message_ref = relationship("CommitCommentRef", back_populates="cmt") diff --git a/augur/application/db/models/augur_operations.py b/augur/application/db/models/augur_operations.py index f702d829a3..47f28b12f2 100644 --- a/augur/application/db/models/augur_operations.py +++ b/augur/application/db/models/augur_operations.py @@ -271,9 +271,9 @@ class User(Base): {"schema": "augur_operations"} ) - groups = relationship("UserGroup") - tokens = relationship("UserSessionToken") - applications = relationship("ClientApplication") + groups = relationship("UserGroup", back_populates="user") + tokens = relationship("UserSessionToken", back_populates="user") + applications = relationship("ClientApplication", back_populates="user") _is_authenticated = False _is_active = True @@ -449,17 +449,30 @@ def remove_group(self, group_name): return result - def add_repo(self, group_name, repo_url): + def add_github_repo(self, group_name, repo_url): from augur.tasks.github.util.github_task_session import GithubTaskSession from augur.tasks.github.util.github_api_key_handler import NoValidKeysError try: with GithubTaskSession(logger) as session: - result = UserRepo.add(session, repo_url, self.user_id, group_name) + result = UserRepo.add_github_repo(session, repo_url, self.user_id, group_name) except NoValidKeysError: return False, {"status": "No valid keys"} return result + + def add_gitlab_repo(self, group_name, repo_url): + + from augur.tasks.gitlab.gitlab_task_session import GitlabTaskSession + from augur.tasks.github.util.github_api_key_handler import NoValidKeysError + try: + with GitlabTaskSession(logger) as session: + result = UserRepo.add_gitlab_repo(session, repo_url, self.user_id, group_name) + except NoValidKeysError: + return False, {"status": "No valid keys"} + + return result + def remove_repo(self, group_name, repo_id): @@ -468,14 +481,14 @@ def remove_repo(self, group_name, repo_id): return result - def add_org(self, group_name, org_url): + def add_github_org(self, group_name, org_url): from augur.tasks.github.util.github_task_session import GithubTaskSession from augur.tasks.github.util.github_api_key_handler import NoValidKeysError try: with GithubTaskSession(logger) as session: - result = UserRepo.add_org_repos(session, org_url, self.user_id, group_name) + result = UserRepo.add_github_org_repos(session, org_url, self.user_id, group_name) except NoValidKeysError: return False, {"status": "No valid keys"} @@ -628,8 +641,8 @@ class UserGroup(Base): {"schema": "augur_operations"} ) - user = relationship("User") - repos = relationship("UserRepo") + user = relationship("User", back_populates="groups") + repos = relationship("UserRepo", back_populates="group") @staticmethod def insert(session, user_id:int, group_name:str) -> dict: @@ -739,8 +752,8 @@ class UserRepo(Base): ForeignKey("augur_data.repo.repo_id", name="user_repo_user_id_fkey"), primary_key=True, nullable=False ) - repo = relationship("Repo") - group = relationship("UserGroup") + repo = relationship("Repo", back_populates="user_repo") + group = relationship("UserGroup", back_populates="repos") @staticmethod def insert(session, repo_id: int, group_id:int = 1) -> bool: @@ -769,9 +782,69 @@ def insert(session, 
repo_id: int, group_id:int = 1) -> bool: return False return data[0]["group_id"] == group_id and data[0]["repo_id"] == repo_id + + @staticmethod + def add_gitlab_repo(session, url: List[str], user_id: int, group_name=None, group_id=None, from_org_list=False, repo_group_id=None) -> dict: + """Add repo to the user repo table + + Args: + urls: list of repo urls + user_id: id of user_id from users table + group_name: name of group to add repo to. + group_id: id of the group + valid_repo: boolean that indicates whether the repo has already been validated + + Note: + Either the group_name or group_id can be passed not both + + Returns: + Dict that contains the key "status" and additional useful data + """ + + if group_name and group_id: + return False, {"status": "Pass only the group name or group id not both"} + + if not group_name and not group_id: + return False, {"status": "Need group name or group id to add a repo"} + + if group_id is None: + + group_id = UserGroup.convert_group_name_to_id(session, user_id, group_name) + if group_id is None: + return False, {"status": "Invalid group name"} + + if not from_org_list: + result = Repo.is_valid_gitlab_repo(session, url) + if not result[0]: + return False, {"status": result[1]["status"], "repo_url": url} + + # if no repo_group_id is passed then assign the repo to the frontend repo group + if repo_group_id is None: + + frontend_repo_group = session.query(RepoGroup).filter(RepoGroup.rg_name == FRONTEND_REPO_GROUP_NAME).first() + if not frontend_repo_group: + return False, {"status": "Could not find repo group with name 'Frontend Repos'", "repo_url": url} + + repo_group_id = frontend_repo_group.repo_group_id + + + repo_id = Repo.insert_gitlab_repo(session, url, repo_group_id, "Frontend") + if not repo_id: + return False, {"status": "Repo insertion failed", "repo_url": url} + + result = UserRepo.insert(session, repo_id, group_id) + if not result: + return False, {"status": "repo_user insertion failed", "repo_url": url} + + #collection_status records are now only added during collection -IM 5/1/23 + #status = CollectionStatus.insert(session, repo_id) + #if not status: + # return False, {"status": "Failed to create status for repo", "repo_url": url} + + return True, {"status": "Repo Added", "repo_url": url} @staticmethod - def add(session, url: List[str], user_id: int, group_name=None, group_id=None, from_org_list=False, repo_type=None, repo_group_id=None) -> dict: + def add_github_repo(session, url: List[str], user_id: int, group_name=None, group_id=None, from_org_list=False, repo_type=None, repo_group_id=None) -> dict: """Add repo to the user repo table Args: @@ -820,7 +893,7 @@ def add(session, url: List[str], user_id: int, group_name=None, group_id=None, f repo_group_id = frontend_repo_group.repo_group_id - repo_id = Repo.insert(session, url, repo_group_id, "Frontend", repo_type) + repo_id = Repo.insert_github_repo(session, url, repo_group_id, "Frontend", repo_type) if not repo_id: return False, {"status": "Repo insertion failed", "repo_url": url} @@ -862,7 +935,7 @@ def delete(session, repo_id:int, user_id:int, group_name:str) -> dict: return True, {"status": "Repo Removed"} @staticmethod - def add_org_repos(session, url: List[str], user_id: int, group_name: int): + def add_github_org_repos(session, url: List[str], user_id: int, group_name: int): """Add list of orgs and their repos to a users repos. 
Args: @@ -911,7 +984,7 @@ def add_org_repos(session, url: List[str], user_id: int, group_name: int): failed_repos = [] for repo in repos: - result = UserRepo.add(session, repo, user_id, group_id=group_id, from_org_list=True, repo_type=type, repo_group_id=repo_group_id) + result = UserRepo.add_github_repo(session, repo, user_id, group_id=group_id, from_org_list=True, repo_type=type, repo_group_id=repo_group_id) # keep track of all the repos that failed if not result[0]: @@ -949,9 +1022,9 @@ class UserSessionToken(Base): application_id = Column(ForeignKey("augur_operations.client_applications.id", name="user_session_token_application_id_fkey"), nullable=False) created_at = Column(BigInteger) - user = relationship("User") - application = relationship("ClientApplication") - refresh_tokens = relationship("RefreshToken") + user = relationship("User", back_populates="tokens") + application = relationship("ClientApplication", back_populates="sessions") + refresh_tokens = relationship("RefreshToken", back_populates="user_session") @staticmethod def create(session, user_id, application_id, seconds_to_expire=86400): @@ -991,9 +1064,9 @@ class ClientApplication(Base): redirect_url = Column(String, nullable=False) api_key = Column(String, nullable=False) - user = relationship("User") + user = relationship("User", back_populates="applications") sessions = relationship("UserSessionToken") - subscriptions = relationship("Subscription") + subscriptions = relationship("Subscription", back_populates="application") def __eq__(self, other): return isinstance(other, ClientApplication) and str(self.id) == str(other.id) @@ -1013,8 +1086,8 @@ class Subscription(Base): application_id = Column(ForeignKey("augur_operations.client_applications.id", name="subscriptions_application_id_fkey"), primary_key=True) type_id = Column(ForeignKey("augur_operations.subscription_types.id", name="subscriptions_type_id_fkey"), primary_key=True) - application = relationship("ClientApplication") - type = relationship("SubscriptionType") + application = relationship("ClientApplication", back_populates="subscriptions") + type = relationship("SubscriptionType", back_populates="subscriptions") class SubscriptionType(Base): __tablename__ = "subscription_types" @@ -1027,7 +1100,7 @@ class SubscriptionType(Base): id = Column(BigInteger, primary_key=True) name = Column(String, nullable=False) - subscriptions = relationship("Subscription") + subscriptions = relationship("Subscription", back_populates="type") class RefreshToken(Base): @@ -1040,7 +1113,7 @@ class RefreshToken(Base): id = Column(String, primary_key=True) user_session_token = Column(ForeignKey("augur_operations.user_session_tokens.token", name="refresh_token_session_token_id_fkey"), nullable=False) - user_session = relationship("UserSessionToken") + user_session = relationship("UserSessionToken", back_populates="refresh_tokens") @staticmethod def create(session, user_session_token_id): @@ -1159,16 +1232,28 @@ def insert(session, repo_id): repo_git = repo.repo_git collection_status_unique = ["repo_id"] + pr_issue_count = 0 + github_weight = 0 + if "github" in repo_git: - try: - pr_issue_count = get_repo_weight_by_issue(session.logger, repo_git) - #session.logger.info(f"date weight: {calculate_date_weight_from_timestamps(repo.repo_added, None)}") - github_weight = pr_issue_count - calculate_date_weight_from_timestamps(repo.repo_added, None) - except Exception as e: - pr_issue_count = None - github_weight = None - session.logger.error( - ''.join(traceback.format_exception(None, e, 
e.__traceback__))) + try: + pr_issue_count = get_repo_weight_by_issue(session.logger, repo_git) + #session.logger.info(f"date weight: {calculate_date_weight_from_timestamps(repo.repo_added, None)}") + github_weight = pr_issue_count - calculate_date_weight_from_timestamps(repo.repo_added, None) + except Exception as e: + pr_issue_count = None + github_weight = None + session.logger.error( + ''.join(traceback.format_exception(None, e, e.__traceback__))) + else: + try: + pr_issue_count = 0 + github_weight = pr_issue_count - calculate_date_weight_from_timestamps(repo.repo_added, None) + except Exception as e: + pr_issue_count = None + github_weight = None + session.logger.error( + ''.join(traceback.format_exception(None, e, e.__traceback__))) record = { @@ -1178,6 +1263,7 @@ def insert(session, repo_id): "secondary_weight": github_weight, "ml_weight": github_weight } + result = session.insert_data(record, CollectionStatus, collection_status_unique, on_conflict_update=False) diff --git a/augur/application/db/session.py b/augur/application/db/session.py index f1d1e64dd0..22379ad050 100644 --- a/augur/application/db/session.py +++ b/augur/application/db/session.py @@ -194,14 +194,15 @@ def insert_data(self, data: Union[List[dict], dict], table, natural_keys: List[s except Exception as e: #self.logger.info(e) - if(len(data) == 1): + if len(data) == 1: raise e - else: - first_half = data[:len(data)//2] - second_half = data[len(data)//2:] + + time.sleep(3) + first_half = data[:len(data)//2] + second_half = data[len(data)//2:] - self.insert_data(first_half, table,natural_keys, return_columns, string_fields, on_conflict_update) - self.insert_data(second_half,table, natural_keys, return_columns, string_fields, on_conflict_update) + self.insert_data(first_half, table, natural_keys, return_columns, string_fields, on_conflict_update) + self.insert_data(second_half,table, natural_keys, return_columns, string_fields, on_conflict_update) else: self.logger.error("Unable to insert data in 10 attempts") @@ -231,14 +232,15 @@ def insert_data(self, data: Union[List[dict], dict], table, natural_keys: List[s raise e except Exception as e: - if(len(data) == 1): + if len(data) == 1: raise e - else: - first_half = data[:len(data)//2] - second_half = data[len(data)//2:] + + time.sleep(3) + first_half = data[:len(data)//2] + second_half = data[len(data)//2:] - self.insert_data(first_half, natural_keys, return_columns, string_fields, on_conflict_update) - self.insert_data(second_half, natural_keys, return_columns, string_fields, on_conflict_update) + self.insert_data(first_half, table, natural_keys, return_columns, string_fields, on_conflict_update) + self.insert_data(second_half, table, natural_keys, return_columns, string_fields, on_conflict_update) else: self.logger.error("Unable to insert and return data in 10 attempts") diff --git a/augur/application/schema/alembic/versions/26_materialized_view_unique_updates.py b/augur/application/schema/alembic/versions/26_materialized_view_unique_updates.py new file mode 100644 index 0000000000..f381ec48ef --- /dev/null +++ b/augur/application/schema/alembic/versions/26_materialized_view_unique_updates.py @@ -0,0 +1,245 @@ +""" Updating materialized views and associated indices + +Revision ID: 26 +Revises: 25 +Create Date: 2023-08-23 18:17:22.651191 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql +from sqlalchemy import text + +# revision identifiers, used by Alembic. 
+revision = '26' +down_revision = '25' +branch_labels = None +depends_on = None + + +def upgrade(): + + mview_keys_26() + +def downgrade(): + + upgrade=False + + mview_keys_26(upgrade) + +def mview_keys_26(upgrade=True): + + if upgrade: + conn = op.get_bind() + conn.execute(text(""" + drop materialized view if exists augur_data.explorer_pr_assignments; + drop materialized view if exists augur_data.explorer_user_repos; + drop materialized view if exists augur_data.explorer_pr_response_times; + drop materialized view if exists augur_data.explorer_pr_response; + drop materialized view if exists augur_data.explorer_issue_assignments;""")) + + conn.execute(text(""" + create materialized view augur_data.explorer_pr_assignments as + SELECT + pr.pull_request_id, + pr.repo_id AS ID, + pr.pr_created_at AS created, + pr.pr_closed_at AS closed, + pre.created_at AS assign_date, + pre.ACTION AS assignment_action, + pre.cntrb_id AS assignee, + pre.node_id AS node_id + FROM + ( + augur_data.pull_requests pr + LEFT JOIN augur_data.pull_request_events pre ON ( + ( + ( pr.pull_request_id = pre.pull_request_id ) + AND ( + ( pre.ACTION ) :: TEXT = ANY ( ARRAY [ ( 'unassigned' :: CHARACTER VARYING ) :: TEXT, ( 'assigned' :: CHARACTER VARYING ) :: TEXT ] ) + ) + ) + ) + );""")) + conn.execute(text(""" + create materialized view augur_data.explorer_pr_response as + SELECT pr.pull_request_id, + pr.repo_id AS id, + pr.pr_augur_contributor_id AS cntrb_id, + m.msg_timestamp, + m.msg_cntrb_id, + pr.pr_created_at, + pr.pr_closed_at + FROM (augur_data.pull_requests pr + LEFT JOIN ( SELECT prr.pull_request_id, + m_1.msg_timestamp, + m_1.cntrb_id AS msg_cntrb_id + FROM augur_data.pull_request_review_message_ref prrmr, + augur_data.pull_requests pr_1, + augur_data.message m_1, + augur_data.pull_request_reviews prr + WHERE ((prrmr.pr_review_id = prr.pr_review_id) AND (prrmr.msg_id = m_1.msg_id) AND (prr.pull_request_id = pr_1.pull_request_id)) + UNION + SELECT prmr.pull_request_id, + m_1.msg_timestamp, + m_1.cntrb_id AS msg_cntrb_id + FROM augur_data.pull_request_message_ref prmr, + augur_data.pull_requests pr_1, + augur_data.message m_1 + WHERE ((prmr.pull_request_id = pr_1.pull_request_id) AND (prmr.msg_id = m_1.msg_id))) m ON ((m.pull_request_id = pr.pull_request_id)));""")) + + + + conn.execute(text(""" + create materialized view augur_data.explorer_user_repos as + SELECT a.login_name, + a.user_id, + b.group_id, + c.repo_id + FROM augur_operations.users a, + augur_operations.user_groups b, + augur_operations.user_repos c + WHERE ((a.user_id = b.user_id) AND (b.group_id = c.group_id)) + ORDER BY a.user_id;""")) + + conn.execute(text(""" + create materialized view augur_data.explorer_pr_response_times as + SELECT repo.repo_id, + pull_requests.pr_src_id, + repo.repo_name, + pull_requests.pr_src_author_association, + repo_groups.rg_name AS repo_group, + pull_requests.pr_src_state, + pull_requests.pr_merged_at, + pull_requests.pr_created_at, + pull_requests.pr_closed_at, + date_part('year'::text, (pull_requests.pr_created_at)::date) AS created_year, + date_part('month'::text, (pull_requests.pr_created_at)::date) AS created_month, + date_part('year'::text, (pull_requests.pr_closed_at)::date) AS closed_year, + date_part('month'::text, (pull_requests.pr_closed_at)::date) AS closed_month, + base_labels.pr_src_meta_label, + base_labels.pr_head_or_base, + ((EXTRACT(epoch FROM pull_requests.pr_closed_at) - EXTRACT(epoch FROM pull_requests.pr_created_at)) / (3600)::numeric) AS hours_to_close, + ((EXTRACT(epoch FROM 
pull_requests.pr_closed_at) - EXTRACT(epoch FROM pull_requests.pr_created_at)) / (86400)::numeric) AS days_to_close, + ((EXTRACT(epoch FROM response_times.first_response_time) - EXTRACT(epoch FROM pull_requests.pr_created_at)) / (3600)::numeric) AS hours_to_first_response, + ((EXTRACT(epoch FROM response_times.first_response_time) - EXTRACT(epoch FROM pull_requests.pr_created_at)) / (86400)::numeric) AS days_to_first_response, + ((EXTRACT(epoch FROM response_times.last_response_time) - EXTRACT(epoch FROM pull_requests.pr_created_at)) / (3600)::numeric) AS hours_to_last_response, + ((EXTRACT(epoch FROM response_times.last_response_time) - EXTRACT(epoch FROM pull_requests.pr_created_at)) / (86400)::numeric) AS days_to_last_response, + response_times.first_response_time, + response_times.last_response_time, + response_times.average_time_between_responses, + response_times.assigned_count, + response_times.review_requested_count, + response_times.labeled_count, + response_times.subscribed_count, + response_times.mentioned_count, + response_times.referenced_count, + response_times.closed_count, + response_times.head_ref_force_pushed_count, + response_times.merged_count, + response_times.milestoned_count, + response_times.unlabeled_count, + response_times.head_ref_deleted_count, + response_times.comment_count, + master_merged_counts.lines_added, + master_merged_counts.lines_removed, + all_commit_counts.commit_count, + master_merged_counts.file_count + FROM augur_data.repo, + augur_data.repo_groups, + ((((augur_data.pull_requests + LEFT JOIN ( SELECT pull_requests_1.pull_request_id, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'assigned'::text)) AS assigned_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'review_requested'::text)) AS review_requested_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'labeled'::text)) AS labeled_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'unlabeled'::text)) AS unlabeled_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'subscribed'::text)) AS subscribed_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'mentioned'::text)) AS mentioned_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'referenced'::text)) AS referenced_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'closed'::text)) AS closed_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'head_ref_force_pushed'::text)) AS head_ref_force_pushed_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'head_ref_deleted'::text)) AS head_ref_deleted_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'milestoned'::text)) AS milestoned_count, + count(*) FILTER (WHERE ((pull_request_events.action)::text = 'merged'::text)) AS merged_count, + min(message.msg_timestamp) AS first_response_time, + count(DISTINCT message.msg_timestamp) AS comment_count, + max(message.msg_timestamp) AS last_response_time, + ((max(message.msg_timestamp) - min(message.msg_timestamp)) / (count(DISTINCT message.msg_timestamp))::double precision) AS average_time_between_responses + FROM augur_data.pull_request_events, + augur_data.pull_requests pull_requests_1, + augur_data.repo repo_1, + augur_data.pull_request_message_ref, + augur_data.message + WHERE ((repo_1.repo_id = pull_requests_1.repo_id) AND (pull_requests_1.pull_request_id = pull_request_events.pull_request_id) AND (pull_requests_1.pull_request_id = 
pull_request_message_ref.pull_request_id) AND (pull_request_message_ref.msg_id = message.msg_id)) + GROUP BY pull_requests_1.pull_request_id) response_times ON ((pull_requests.pull_request_id = response_times.pull_request_id))) + LEFT JOIN ( SELECT pull_request_commits.pull_request_id, + count(DISTINCT pull_request_commits.pr_cmt_sha) AS commit_count + FROM augur_data.pull_request_commits, + augur_data.pull_requests pull_requests_1, + augur_data.pull_request_meta + WHERE ((pull_requests_1.pull_request_id = pull_request_commits.pull_request_id) AND (pull_requests_1.pull_request_id = pull_request_meta.pull_request_id) AND ((pull_request_commits.pr_cmt_sha)::text <> (pull_requests_1.pr_merge_commit_sha)::text) AND ((pull_request_commits.pr_cmt_sha)::text <> (pull_request_meta.pr_sha)::text)) + GROUP BY pull_request_commits.pull_request_id) all_commit_counts ON ((pull_requests.pull_request_id = all_commit_counts.pull_request_id))) + LEFT JOIN ( SELECT max(pull_request_meta.pr_repo_meta_id) AS max, + pull_request_meta.pull_request_id, + pull_request_meta.pr_head_or_base, + pull_request_meta.pr_src_meta_label + FROM augur_data.pull_requests pull_requests_1, + augur_data.pull_request_meta + WHERE ((pull_requests_1.pull_request_id = pull_request_meta.pull_request_id) AND ((pull_request_meta.pr_head_or_base)::text = 'base'::text)) + GROUP BY pull_request_meta.pull_request_id, pull_request_meta.pr_head_or_base, pull_request_meta.pr_src_meta_label) base_labels ON ((base_labels.pull_request_id = all_commit_counts.pull_request_id))) + LEFT JOIN ( SELECT sum(commits.cmt_added) AS lines_added, + sum(commits.cmt_removed) AS lines_removed, + pull_request_commits.pull_request_id, + count(DISTINCT commits.cmt_filename) AS file_count + FROM augur_data.pull_request_commits, + augur_data.commits, + augur_data.pull_requests pull_requests_1, + augur_data.pull_request_meta + WHERE (((commits.cmt_commit_hash)::text = (pull_request_commits.pr_cmt_sha)::text) AND (pull_requests_1.pull_request_id = pull_request_commits.pull_request_id) AND (pull_requests_1.pull_request_id = pull_request_meta.pull_request_id) AND (commits.repo_id = pull_requests_1.repo_id) AND ((commits.cmt_commit_hash)::text <> (pull_requests_1.pr_merge_commit_sha)::text) AND ((commits.cmt_commit_hash)::text <> (pull_request_meta.pr_sha)::text)) + GROUP BY pull_request_commits.pull_request_id) master_merged_counts ON ((base_labels.pull_request_id = master_merged_counts.pull_request_id))) + WHERE ((repo.repo_group_id = repo_groups.repo_group_id) AND (repo.repo_id = pull_requests.repo_id)) + ORDER BY response_times.merged_count DESC;""")) + + conn.execute(text(""" + create materialized view augur_data.explorer_issue_assignments as + SELECT + i.issue_id, + i.repo_id AS ID, + i.created_at AS created, + i.closed_at AS closed, + ie.created_at AS assign_date, + ie.ACTION AS assignment_action, + ie.cntrb_id AS assignee, + ie.node_id as node_id + FROM + ( + augur_data.issues i + LEFT JOIN augur_data.issue_events ie ON ( + ( + ( i.issue_id = ie.issue_id ) + AND ( + ( ie.ACTION ) :: TEXT = ANY ( ARRAY [ ( 'unassigned' :: CHARACTER VARYING ) :: TEXT, ( 'assigned' :: CHARACTER VARYING ) :: TEXT ] ) + ) + ) + ) + );""")) + + conn = op.get_bind() + conn.execute(text("""CREATE UNIQUE INDEX ON augur_data.explorer_user_repos(login_name,user_id,group_id,repo_id);""")) + conn.execute(text("""COMMIT;""")) + + conn = op.get_bind() + conn.execute(text("""CREATE UNIQUE INDEX ON augur_data.explorer_pr_response_times(repo_id, pr_src_id, pr_src_meta_label);""")) + 
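# Note (illustrative commentary, not part of the migration): PostgreSQL only
# permits REFRESH MATERIALIZED VIEW CONCURRENTLY on a view that carries at least
# one UNIQUE index covering all rows, which is why each new explorer_* view gets
# a unique index in this revision. A hedged sketch of how the refresh task added
# elsewhere in this patch is expected to use these indexes (session and engine
# names are assumptions taken from refresh_materialized_views.py):
#
#     refresh = s.sql.text("""
#         REFRESH MATERIALIZED VIEW CONCURRENTLY augur_data.explorer_pr_assignments WITH DATA;
#         COMMIT;
#     """)
#     with DatabaseSession(logger, engine) as session:
#         session.execute_sql(refresh)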
conn.execute(text("""COMMIT;""")) + + conn = op.get_bind() + conn.execute(text("""CREATE UNIQUE INDEX ON augur_data.explorer_pr_assignments(pull_request_id, id, node_id);""")) + conn.execute(text("""COMMIT;""")) + + conn = op.get_bind() + conn.execute(text("""CREATE UNIQUE INDEX ON augur_data.explorer_issue_assignments(issue_id, id, node_id);""")) + conn.execute(text("""COMMIT;""")) + + conn = op.get_bind() + conn.execute(text("""CREATE UNIQUE INDEX ON augur_data.explorer_pr_response(pull_request_id, id, cntrb_id, msg_cntrb_id, msg_timestamp);""")) + conn.execute(text("""COMMIT;""")) \ No newline at end of file diff --git a/augur/tasks/data_analysis/clustering_worker/setup.py b/augur/tasks/data_analysis/clustering_worker/setup.py index 56e01e52f3..78fb0b4b50 100644 --- a/augur/tasks/data_analysis/clustering_worker/setup.py +++ b/augur/tasks/data_analysis/clustering_worker/setup.py @@ -29,11 +29,11 @@ def read(filename): 'psycopg2-binary==2.9.3', #'sklearn==0.0.0', 'scikit-learn==1.1.3', - 'numpy==1.22.0', + 'numpy==1.26.0', 'nltk==3.6.6', 'seaborn==0.11.1', 'pandas==1.5.3', - 'matplotlib==3.5.1' + 'matplotlib>=3.5.1' ], classifiers=[ 'Development Status :: 2 - Pre-Alpha', diff --git a/augur/tasks/data_analysis/discourse_analysis/setup.py b/augur/tasks/data_analysis/discourse_analysis/setup.py index f109164ffd..37d6557ec5 100644 --- a/augur/tasks/data_analysis/discourse_analysis/setup.py +++ b/augur/tasks/data_analysis/discourse_analysis/setup.py @@ -28,13 +28,13 @@ def read(filename): 'requests==2.28.0', 'psycopg2-binary==2.9.3', 'click==8.0.3', - 'scipy==1.7.3', + 'scipy>=1.10.0', 'nltk==3.6.6', 'pandas==1.5.3', 'scikit-learn==1.1.3', 'textblob==0.15.3', - 'python-crfsuite==0.9.8', - 'sklearn-crfsuite==0.3.6', + 'python-crfsuite>=0.9.8', + 'sklearn-crfsuite>=0.3.6', 'tabulate==0.8.9' ], # python-crfsuite-0.9.8 sklearn-crfsuite-0.3.6 tabulate-0.8.9 entry_points={ diff --git a/augur/tasks/data_analysis/insight_worker/setup.py b/augur/tasks/data_analysis/insight_worker/setup.py index 0eb35d8a78..1ee6e8a4bd 100644 --- a/augur/tasks/data_analysis/insight_worker/setup.py +++ b/augur/tasks/data_analysis/insight_worker/setup.py @@ -29,9 +29,9 @@ def read(filename): 'requests==2.28.0', 'psycopg2-binary==2.9.3', 'click==8.0.3', - 'scipy>=1.7.3', + 'scipy>=1.10.0', 'sklearn==0.0', - 'numpy==1.22.0', + 'numpy==1.26.0', ], entry_points={ 'console_scripts': [ diff --git a/augur/tasks/data_analysis/message_insights/setup.py b/augur/tasks/data_analysis/message_insights/setup.py index e3dedb4191..a4f6a30c43 100644 --- a/augur/tasks/data_analysis/message_insights/setup.py +++ b/augur/tasks/data_analysis/message_insights/setup.py @@ -30,22 +30,22 @@ def read(filename): 'requests==2.28.0', 'psycopg2-binary==2.9.3', 'click==8.0.3', - 'scipy==1.7.3', + 'scipy>=1.10.0', 'scikit-learn==1.1.3', #0.24.2', - 'numpy==1.22.0', + 'numpy==1.26.0', 'nltk==3.6.6', 'pandas==1.5.3', 'emoji==1.2.0', - 'Keras==2.13.1', + 'keras>=2.15.0', 'Keras-Preprocessing', - 'tensorflow==2.13.1', + 'tensorflow==2.15.0', 'h5py==3.10.0', 'scikit-image==0.19.1', - 'joblib==1.0.1', + 'joblib==1.2.0', 'xgboost', 'bs4==0.0.1', 'xlrd==2.0.1', - 'gensim==4.2.0' + 'gensim>=4.2.0' ], classifiers=[ 'Development Status :: 3 - Alpha', diff --git a/augur/tasks/data_analysis/pull_request_analysis_worker/setup.py b/augur/tasks/data_analysis/pull_request_analysis_worker/setup.py index 5132f29d2e..3341f24ff1 100644 --- a/augur/tasks/data_analysis/pull_request_analysis_worker/setup.py +++ b/augur/tasks/data_analysis/pull_request_analysis_worker/setup.py @@ 
-29,12 +29,12 @@ def read(filename): 'psycopg2-binary==2.9.3', 'sklearn==0.0', 'nltk==3.6.6', - 'numpy==1.22.0', + 'numpy==1.26.0', 'pandas==1.5.3', 'emoji==1.2.0', - 'joblib==1.0.1', + 'joblib==1.2.0', 'xgboost==1.4.2', - 'scipy==1.7.3' + 'scipy>=1.10.0' ], classifiers=[ 'Development Status :: 2 - Pre-Alpha', diff --git a/augur/tasks/db/refresh_materialized_views.py b/augur/tasks/db/refresh_materialized_views.py index 76420c253e..f04d01552b 100644 --- a/augur/tasks/db/refresh_materialized_views.py +++ b/augur/tasks/db/refresh_materialized_views.py @@ -59,15 +59,35 @@ def refresh_materialized_views(): COMMIT; """) + mv9_refresh = s.sql.text(""" + REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_user_repos with data; + COMMIT; + """) - try: - with DatabaseSession(logger, engine) as session: - session.execute_sql(mv1_refresh) - except Exception as e: - logger.info(f"error is {e}") - pass + mv10_refresh = s.sql.text(""" + + REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_pr_response_times with data; + COMMIT; + """) + mv11_refresh = s.sql.text(""" + + REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_pr_assignments with data; + COMMIT; + """) + + mv12_refresh = s.sql.text(""" + + REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_issue_assignments with data; + COMMIT; + """) + + mv13_refresh = s.sql.text(""" + + REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_pr_response with data; + COMMIT; + """) try: with DatabaseSession(logger, engine) as session: @@ -125,7 +145,40 @@ def refresh_materialized_views(): logger.info(f"error is {e}") pass + try: + with DatabaseSession(logger, engine) as session: + session.execute_sql(mv9_refresh) + except Exception as e: + logger.info(f"error is {e}") + pass + + try: + with DatabaseSession(logger, engine) as session: + session.execute_sql(mv10_refresh) + except Exception as e: + logger.info(f"error is {e}") + pass + try: + with DatabaseSession(logger, engine) as session: + session.execute_sql(mv11_refresh) + except Exception as e: + logger.info(f"error is {e}") + pass + + try: + with DatabaseSession(logger, engine) as session: + session.execute_sql(mv12_refresh) + except Exception as e: + logger.info(f"error is {e}") + pass + + try: + with DatabaseSession(logger, engine) as session: + session.execute_sql(mv13_refresh) + except Exception as e: + logger.info(f"error is {e}") + pass diff --git a/augur/tasks/frontend.py b/augur/tasks/frontend.py index b8eb8b203c..fffd79d330 100644 --- a/augur/tasks/frontend.py +++ b/augur/tasks/frontend.py @@ -30,15 +30,15 @@ def add_org_repo_list(user_id, group_name, urls): valid_repos = [] for url in urls: - # matches https://github.com/{org}/ or htts://github.com/{org} + # matches https://github.com/{org}/ or http://github.com/{org} if Repo.parse_github_org_url(url): - added = user.add_org(group_name, url)[0] + added = user.add_github_org(group_name, url)[0] if added: valid_orgs.append(url) - # matches https://github.com/{org}/{repo}/ or htts://github.com/{org}/{repo} + # matches https://github.com/{org}/{repo}/ or http://github.com/{org}/{repo} elif Repo.parse_github_repo_url(url)[0]: - added = user.add_repo(group_name, url)[0] + added = user.add_github_repo(group_name, url)[0] if added: valid_repos.append(url) @@ -46,7 +46,7 @@ def add_org_repo_list(user_id, group_name, urls): elif (match := parse_org_and_repo_name(url)): org, repo = match.groups() repo_url = f"https://github.com/{org}/{repo}/" - added = user.add_repo(group_name, repo_url)[0] + added = 
user.add_github_repo(group_name, repo_url)[0] if added: valid_repos.append(url) @@ -54,9 +54,17 @@ def add_org_repo_list(user_id, group_name, urls): elif (match := parse_org_name(url)): org = match.group(1) org_url = f"https://github.com/{org}/" - added = user.add_org(group_name, org_url)[0] + added = user.add_github_org(group_name, org_url)[0] if added: valid_orgs.append(url) + + # matches https://gitlab.com/{org}/{repo}/ or http://gitlab.com/{org}/{repo} + elif Repo.parse_gitlab_repo_url(url)[0]: + + added = user.add_gitlab_repo(group_name, url)[0] + if added: + valid_repos.append(url) + else: invalid_urls.append(url) @@ -66,24 +74,25 @@ def add_org_repo_list(user_id, group_name, urls): - +# TODO: Change to github specific @celery.task def add_repo(user_id, group_name, repo_url): logger = logging.getLogger(add_org.__name__) with GithubTaskSession(logger) as session: - result = UserRepo.add(session, repo_url, user_id, group_name) + result = UserRepo.add_github_repo(session, repo_url, user_id, group_name) print(repo_url, result) +# TODO: Change to github specific @celery.task def add_org(user_id, group_name, org_url): logger = logging.getLogger(add_org.__name__) with GithubTaskSession(logger) as session: - result = UserRepo.add_org_repos(session, org_url, user_id, group_name) + result = UserRepo.add_github_org_repos(session, org_url, user_id, group_name) print(org_url, result) diff --git a/augur/tasks/github/issues/tasks.py b/augur/tasks/github/issues/tasks.py index 5380b8bf10..0ba793470e 100644 --- a/augur/tasks/github/issues/tasks.py +++ b/augur/tasks/github/issues/tasks.py @@ -195,7 +195,7 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: issue_assignee_dicts += add_key_value_pair_to_dicts(other_issue_data["assignees"], "issue_id", issue_id) - logger.info(f"{task_name}: Inserting other issue data of lengths: Labels: {len(issue_label_dicts)} - Assignees: {len(issue_assignee_dicts)}") + logger.info(f"{task_name}: Inserting other github issue data of lengths: Labels: {len(issue_label_dicts)} - Assignees: {len(issue_assignee_dicts)}") # inserting issue labels # we are using label_src_id and issue_id to determine if the label is already in the database. diff --git a/augur/tasks/github/messages/tasks.py b/augur/tasks/github/messages/tasks.py index 6e23434bae..4dfd3a634b 100644 --- a/augur/tasks/github/messages/tasks.py +++ b/augur/tasks/github/messages/tasks.py @@ -187,7 +187,8 @@ def process_messages(messages, task_name, repo_id, logger, augur_db): message_string_fields = ["msg_text"] message_return_data = augur_db.insert_data(message_dicts, Message, message_natural_keys, return_columns=message_return_columns, string_fields=message_string_fields) - + if message_return_data is None: + return pr_message_ref_dicts = [] issue_message_ref_dicts = [] diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index 3af6e39e08..8db394754c 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -74,9 +74,18 @@ def retrieve_all_pr_data(repo_git: str, logger, key_auth) -> None: return all_data - -def process_pull_requests(pull_requests, task_name, repo_id, logger, augur_db): +def process_pull_requests(pull_requests, task_name, repo_id, logger, augur_db): + """ + Parse and insert all retrieved PR data. 
+ + Arguments: + pull_requests: List of paginated pr endpoint data + task_name: Name of the calling task and the repo + repo_id: augur id of the repository + logger: logging object + augur_db: sqlalchemy db object + """ tool_source = "Pr Task" tool_version = "2.0" data_source = "Github API" diff --git a/augur/tasks/github/releases/core.py b/augur/tasks/github/releases/core.py index f3050fc1b3..5957d4cb57 100644 --- a/augur/tasks/github/releases/core.py +++ b/augur/tasks/github/releases/core.py @@ -84,7 +84,8 @@ def insert_release(augur_db, logger, repo_id, owner, release, tag_only = False): release_inf = get_release_inf(repo_id, release, tag_only) #Do an upsert - augur_db.insert_data(release_inf,Release,['release_id']) + string_fields = ["release_name", "release_description", "release_author", "release_tag_name"] + augur_db.insert_data(release_inf,Release,['release_id'], string_fields=string_fields) logger.info(f"Inserted info for {owner}/{repo_id}/{release['name']}\n") diff --git a/augur/tasks/github/util/github_api_key_handler.py b/augur/tasks/github/util/github_api_key_handler.py index 8a19430e87..20ce07f066 100644 --- a/augur/tasks/github/util/github_api_key_handler.py +++ b/augur/tasks/github/util/github_api_key_handler.py @@ -32,7 +32,7 @@ def __init__(self, session: DatabaseSession): self.logger = session.logger self.config = AugurConfig(self.logger, session) - self.oauth_redis_key = "oauth_keys_list" + self.oauth_redis_key = "github_oauth_keys_list" self.redis_key_list = RedisList(self.oauth_redis_key) diff --git a/augur/tasks/github/util/github_paginator.py b/augur/tasks/github/util/github_paginator.py index 548d25b0f9..31c14565df 100644 --- a/augur/tasks/github/util/github_paginator.py +++ b/augur/tasks/github/util/github_paginator.py @@ -154,6 +154,8 @@ class GithubApiResult(Enum): SECONDARY_RATE_LIMIT = 4 RATE_LIMIT_EXCEEDED = 5 ABUSE_MECHANISM_TRIGGERED = 6 + # TODO: Add bad credentials detection that removes key + # from redis if bad credentials are detected BAD_CREDENTIALS = 7 HTML = 8 EMPTY_STRING = 9 diff --git a/augur/tasks/github/util/util.py b/augur/tasks/github/util/util.py index fbb23dd6e8..42989dcca3 100644 --- a/augur/tasks/github/util/util.py +++ b/augur/tasks/github/util/util.py @@ -54,10 +54,21 @@ def parse_json_response(logger: logging.Logger, response: httpx.Response) -> dic try: return response.json() except json.decoder.JSONDecodeError as e: - logger.warning(f"invalid return from GitHub. Response was: {response.text}. Exception: {e}") + logger.warning(f"invalid return. Response was: {response.text}. Exception: {e}") return json.loads(json.dumps(response.text)) def get_repo_weight_by_issue(logger,repo_git): + """ + Retrieve the sum of the number of issues and prs in a repository from a graphql query. 
+ + Arguments: + logger: logger object + repo_git: repository url + + Returns: + Sum of issues and prs for that repo + """ + from augur.tasks.github.util.gh_graphql_entities import GitHubRepo as GitHubRepoGraphql owner,name = get_owner_repo(repo_git) diff --git a/augur/tasks/gitlab/events_task.py b/augur/tasks/gitlab/events_task.py new file mode 100644 index 0000000000..8058831ba3 --- /dev/null +++ b/augur/tasks/gitlab/events_task.py @@ -0,0 +1,209 @@ +""" +Module to define the task methods to collect gitlab event data for augur +""" +import logging + +from augur.tasks.init.celery_app import celery_app as celery +from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask +from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler +from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest +from augur.application.db.data_parse import extract_gitlab_mr_event_data, extract_gitlab_issue_event_data +from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts +from augur.application.db.models import Repo, Issue, IssueEvent, PullRequest, PullRequestEvent +from augur.application.db.util import execute_session_query + +platform_id = 2 + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_gitlab_issue_events(repo_git) -> int: + """ + Retrieve and parse gitlab events for the desired repo + + Arguments: + repo_git: the repo url string + """ + + owner, repo = get_owner_repo(repo_git) + + logger = logging.getLogger(collect_gitlab_issue_events.__name__) + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + + events = retrieve_all_gitlab_event_data("issue", repo_git, logger, manifest.key_auth) + + if events: + logger.info(f"Length of gitlab issue events: {len(events)}") + process_issue_events(events, f"{owner}/{repo}: Gitlab Issue Events task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab issue events") + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_gitlab_merge_request_events(repo_git) -> int: + """ + Retrieve and parse gitlab mrs for the desired repo + + Arguments: + repo_git: the repo url string + """ + + + owner, repo = get_owner_repo(repo_git) + + logger = logging.getLogger(collect_gitlab_issue_events.__name__) + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + + events = retrieve_all_gitlab_event_data("merge_request", repo_git, logger, manifest.key_auth) + + if events: + logger.info(f"Length of gitlab merge request events: {len(events)}") + process_mr_events(events, f"{owner}/{repo}: Gitlab MR Events task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab merge request events") + + +def retrieve_all_gitlab_event_data(gtype, repo_git, logger, key_auth) -> None: + """ + Retrieve only the needed data for mr label data from the api response + + Arguments: + gtype: type of event data + repo_git: url of the relevant repo + logger: loggin object + key_auth: key auth cache and rotator object + """ + + owner, repo = get_owner_repo(repo_git) + + logger.info(f"Collecting gitlab issue events for {owner}/{repo}") + + url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/events?target_type={gtype}" + 
events = GitlabApiHandler(key_auth, logger) + + all_data = [] + num_pages = events.get_num_pages(url) + for page_data, page in events.iter_pages(url): + + if page_data is None: + return all_data + + if len(page_data) == 0: + logger.debug( + f"{owner}/{repo}: Gitlab {gtype} Events Page {page} contains no data...returning") + logger.info(f"{owner}/{repo}: {gtype} Events Page {page} of {num_pages}") + return all_data + + logger.info(f"{owner}/{repo}: Gitlab {gtype} Events Page {page} of {num_pages}") + + all_data += page_data + + return all_data + +def process_issue_events(events, task_name, repo_id, logger, augur_db): + """ + Retrieve only the needed data for mr label data from the api response + + Arguments: + events: List of dictionaries of issue event data + task_name: name of the task as well as the repo being processed + repo_id: augur id of the repo + logger: logging object + augur_db: sqlalchemy db object + """ + + tool_source = "Gitlab issue events task" + tool_version = "2.0" + data_source = "Gitlab API" + + issue_event_dicts = [] + + # create mapping from issue number to issue id of current issues + issue_url_to_id_map = {} + issues = augur_db.session.query(Issue).filter(Issue.repo_id == repo_id).all() + for issue in issues: + issue_url_to_id_map[issue.gh_issue_number] = issue.issue_id + + for event in events: + + issue_number = event["target_iid"] + + try: + issue_id = issue_url_to_id_map[issue_number] + except KeyError: + logger.info(f"{task_name}: Could not find related issue") + logger.info(f"{task_name}: We were searching for an issue with number {issue_number} in repo {repo_id}") + logger.info(f"{task_name}: Skipping") + continue + + issue_event_dicts.append( + extract_gitlab_issue_event_data(event, issue_id, platform_id, repo_id, + tool_source, tool_version, data_source) + ) + + logger.info(f"{task_name}: Inserting {len(issue_event_dicts)} gitlab issue events") + issue_event_natural_keys = ["issue_id", "issue_event_src_id"] + augur_db.insert_data(issue_event_dicts, IssueEvent, issue_event_natural_keys) + + +def process_mr_events(events, task_name, repo_id, logger, augur_db): + """ + Retrieve only the needed data for mr events from the api response + + Arguments: + labels: List of dictionaries of label data + repo_id: augur id of the repository + tool_source: The part of augur that processed the data + tool_version: The version of the augur task that processed the data + data_source: The source of the data + + + Returns: + List of parsed label dicts + """ + + tool_source = "Gitlab mr events task" + tool_version = "2.0" + data_source = "Gitlab API" + + mr_event_dicts = [] + + # create mapping from mr number to pull request id of current mrs + mr_number_to_id_map = {} + mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + for mr in mrs: + mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id + + for event in events: + + mr_number = event["target_iid"] + + try: + issue_id = mr_number_to_id_map[mr_number] + except KeyError: + logger.info(f"{task_name}: Could not find related mr") + logger.info(f"{task_name}: We were searching for an mr with number {mr_number} in repo {repo_id}") + logger.info(f"{task_name}: Skipping") + continue + + mr_event_dicts.append( + extract_gitlab_mr_event_data(event, issue_id, platform_id, repo_id, + tool_source, tool_version, data_source) + ) + + # TODO: Add unique key for this + logger.info(f"{task_name}: Inserting {len(mr_event_dicts)} gitlab mr events") + mr_event_natural_keys = ["pull_request_id", 
"issue_event_src_id"] + augur_db.insert_data(mr_event_dicts, PullRequestEvent, mr_event_natural_keys) + + diff --git a/augur/tasks/gitlab/gitlab_api_handler.py b/augur/tasks/gitlab/gitlab_api_handler.py new file mode 100644 index 0000000000..5303d606e9 --- /dev/null +++ b/augur/tasks/gitlab/gitlab_api_handler.py @@ -0,0 +1,386 @@ +""" +Defines a GitlabApiHandler class to paginate and handle interaction with GitLab's +api through automatic use of relevant key auth and pagination tools. +""" +import httpx +import time +import logging + +from typing import List, Optional, Union, Generator, Tuple +from urllib.parse import urlencode, urlparse, parse_qs, urlunparse +from enum import Enum + +from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth +from augur.tasks.github.util.util import parse_json_response + +class GitlabApiResult(Enum): + """All the different results of querying the Gitlab API.""" + + SUCCESS = 0 + TIMEOUT = 1 + NO_MORE_ATTEMPTS = 2 + NOT_FOUND = 3 + SECONDARY_RATE_LIMIT = 4 + RATE_LIMIT_EXCEEDED = 5 + ABUSE_MECHANISM_TRIGGERED = 6 + # TODO: Add bad credentials detection that removes key + # from redis if bad credentials are detected + BAD_CREDENTIALS = 7 + +class GitlabApiHandler(): + """This class is a sequence that handles retrieving data from the Gitlab API. + + Attributes: + url (str): The url that we are collecting data + key_mangager (GitlabRandomKeyAuth): Custom httpx auth class + that randomizes the github api key a request gets. + This is how the requests are getting their api keys + logger (logging.Logger): Logger that handler printing information to files and stdout + """ + + def __init__(self, key_manager: GitlabRandomKeyAuth, logger: logging.Logger): + """Initialize the class GitlabPaginator. + + Args: + url: url that the data is being collected + key_manager: class that randomly selects a Gitlab API key for each request + logger: handles logging + from_datetime: collects data after this datatime (not yet implemented) + to_datetime: collects data before this datatime (not yet implemented) + """ + self.key_manager = key_manager + self.logger = logger + + def get_length(self, url): + """Get the length of the Gitlab API data. + + Returns: + The length of the Gitlab API data at the url. + + Examples: + This function is called when len() is called on the GitlabPaginator class for example. + + issues = GitlabPaginator(url, session.oauths, logger) + issue_len = len(issues) + """ + + num_pages = self.get_num_pages(url) + + self.logger.info(f"Num pages: {num_pages}") + + params = {"page": num_pages} + url = add_query_params(url, params) + + # get the amount of data on last page + data, _, result = self.retrieve_data(url) + + if result == GitlabApiResult.SUCCESS: + return (100 * (num_pages -1)) + len(data) + + self.logger.debug("Unable to retrieve data length from api") + return 0 + + def iter(self, url) -> Generator[Optional[dict], None, None]: + """Provide data from Gitlab API via a generator that yields one dict at a time. 
+ + Yields: + A piece of data from the github api as the specified url + """ + + url = self._set_paginaton_query_params(url) + + data_list, response, result = self.retrieve_data(url) + + if result != GitlabApiResult.SUCCESS: + self.logger.debug("Failed to retrieve the data even though 10 attempts were given") + yield None + return + + # yield the first page data + for data in data_list: + yield data + + while 'next' in response.links.keys(): + next_page = response.links['next']['url'] + + # Here we don't need to pass in params with the page, or the default params because the url from the headers already has those values + data_list, response, result = self.retrieve_data(next_page) + + if result != GitlabApiResult.SUCCESS: + self.logger.debug("Failed to retrieve the data even though 10 attempts were given") + return + + for data in data_list: + yield data + + def iter_pages(self, url) -> Generator[Tuple[Optional[List[dict]], int], None, None]: + """Provide data from Gitlab API via a generator that yields a page of dicts at a time. + + Returns: + A page of data from the Gitlab API at the specified url + """ + + url = self._set_paginaton_query_params(url) + + # retrieves the data for the given url + data_list, response, result = self.retrieve_data(url) + + if result != GitlabApiResult.SUCCESS: + self.logger.debug("Failed to retrieve the data even though 10 attempts were given") + yield None, None + return + + # this retrieves the page for the given url + page_number = get_url_page_number(url) + + # yields the first page of data and its page number + yield data_list, page_number + + while 'next' in response.links.keys(): + + # gets the next page from the last responses header + next_page = response.links['next']['url'] + + # Here we don't need to pass in params with the page, or the default params because the url from the headers already has those values + data_list, response, result = self.retrieve_data(next_page) + + if result != GitlabApiResult.SUCCESS: + self.logger.debug(f"Failed to retrieve the data for even though 10 attempts were given. Url: {next_page}") + return + + page_number = get_url_page_number(next_page) + + # if either the data or response is None then yield None and return + if data_list is None or response is None: + return + + # yield the data from the page and its number + yield data_list, page_number + + def retrieve_data(self, url: str) -> Tuple[Optional[List[dict]], Optional[httpx.Response]]: + """Attempt to retrieve data at given url. + + Args: + url: The url to retrieve the data from + + Returns + The response object from hitting the url and the data on the page + """ + + timeout = 30 + timeout_count = 0 + num_attempts = 1 + while num_attempts <= 10: + + response = hit_api(self.key_manager, url, self.logger, timeout) + + num_attempts += 1 + + if response is None: + if timeout_count == 10: + self.logger.error(f"Request timed out 10 times for {url}") + return None, None, GitlabApiResult.TIMEOUT + + timeout = timeout * 1.1 + num_attempts += 1 + continue + + if response.status_code == 500: + self.logger.error(f"Gitlab returned {response.status_code} error when fetching {url}. 
Message: {response.json()}") + continue + + if response.status_code == 429: + + current_epoch = int(time.time()) + epoch_when_key_resets = int(response.headers["ratelimit-reset"]) + key_reset_time = epoch_when_key_resets - current_epoch + + if key_reset_time < 0: + self.logger.error(f"Key reset time was less than 0 setting it to 0.\nThe current epoch is {current_epoch} and the epoch that the key resets at is {epoch_when_key_resets}") + key_reset_time = 0 + + self.logger.info(f"\n\n\nGitlab API rate limit exceeded. Sleeping until the key resets ({key_reset_time} seconds)") + time.sleep(key_reset_time) + continue + + if response.status_code == 404: + self.logger.info(f"ERROR: 404 not found for {url}") + return [], response, GitlabApiResult.NOT_FOUND + + if response.status_code == 204: + return [], response, GitlabApiResult.SUCCESS + + if response.status_code >= 200 and response.status_code <=299: + + page_data = parse_json_response(self.logger, response) + return page_data, response, GitlabApiResult.SUCCESS + + self.logger.warning(f"Unhandled gitlab response. Status code: {response.status_code}. Body: {response.json()}") + + + + self.logger.error("Unable to collect data in 10 attempts") + return None, None, GitlabApiResult.NO_MORE_ATTEMPTS + + def get_num_pages(self, url) -> Optional[int]: + """Get the number of pages of data that a url can paginate through. + + Returns: + The number of pages a url can access + """ + + url = self._set_paginaton_query_params(url) + + timeout: float = 5 + num_attempts = 0 + while num_attempts < 10: + r = self.hit_api(url=url, timeout=timeout, method="HEAD") + + if r: + break + + timeout = timeout * 1.2 + else: + raise RuntimeError("Unable to get the number of pages of data in 10 attempts") + + if 'last' not in r.links.keys(): + return 1 + + # get the last url from header + last_page_url = r.links['last']['url'] + + parsed_url = urlparse(last_page_url) + try: + num_pages = int(parse_qs(parsed_url.query)['page'][0]) + except (KeyError, ValueError): + return None + + return num_pages + + def hit_api(self, url, timeout, method): + """Attempt to retrieve data at given url. + + Args: + url: The url to retrieve the data from + timeout: time to wait until timeout + method: GET, POST, etc. + + Returns + The response object from hitting the url and the data on the page + """ + + return hit_api(self.key_manager, url, self.logger, timeout, method=method) + + def _set_paginaton_query_params(self, url): + + remove_fields = ["per_page", "page"] + url = clean_url(url, remove_fields) + + # we need to add query params directly to the url, instead of passing the param to the httpx.Client.request + # this is because github will only append specified params to the links in the headers if they are a part + # of the url, and not the params with the request + params = {"per_page": 100} + url = add_query_params(url, params) + + return url + +################################################################################ + +# Url Helper Method to remove query parameters from the url +def clean_url(url: str, keys: List[str]) -> str: + """Remove query params from url. 
+ + Args: + url: the url that is being modified + keys: the query params that are being removed + + Returns: + A url with the params in keys removed + """ + u = urlparse(url) + query = parse_qs(u.query, keep_blank_values=True) + + for key in keys: + query.pop(key, None) + + u = u._replace(query=urlencode(query, True)) + + return urlunparse(u) + + +def add_query_params(url: str, additional_params: dict) -> str: + """Add query params to a url. + + Args: + url: the url that is being modified + additional_params: key value pairs specifying the parameters to be added + + Returns: + The url with the key value pairs in additional_params added as query params + """ + url_components = urlparse(url) + original_params = parse_qs(url_components.query) + # Before Python 3.5 you could update original_params with + # additional_params, but here all the variables are immutable. + merged_params = {**original_params, **additional_params} + updated_query = urlencode(merged_params, doseq=True) + # _replace() is how you can create a new NamedTuple with a changed field + return url_components._replace(query=updated_query).geturl() + + +def get_url_page_number(url: str) -> int: + """Parse the page number from the url. + + Note: + If the url does not contain a page number the function returns 1 + + Args: + url: url to get the page number from + + Returns: + The page number that the url contains + """ + parsed_url = urlparse(url) + try: + # if page is not a url query param then this is page 1 + page_number = int(parse_qs(parsed_url.query)['page'][0]) + + except KeyError: + return 1 + + return page_number + +################################################################################ + +def hit_api(key_manager, url: str, logger: logging.Logger, timeout: float = 10, method: str = 'GET', ) -> Optional[httpx.Response]: + """Ping the api and get the data back for the page. + + Returns: + A httpx response that contains the data. None if a timeout occurs + """ + # self.logger.info(f"Hitting endpoint with {method} request: {url}...\n") + + with httpx.Client() as client: + + try: + response = client.request( + method=method, url=url, auth=key_manager, timeout=timeout, follow_redirects=True) + + except TimeoutError: + logger.info(f"Request timed out. Sleeping {round(timeout)} seconds and trying again...\n") + time.sleep(round(timeout)) + return None + except httpx.TimeoutException: + logger.info(f"Request timed out. Sleeping {round(timeout)} seconds and trying again...\n") + time.sleep(round(timeout)) + return None + except httpx.NetworkError: + logger.info(f"Network Error. Sleeping {round(timeout)} seconds and trying again...\n") + time.sleep(round(timeout)) + return None + except httpx.ProtocolError: + logger.info(f"Protocol Error. Sleeping {round(timeout*1.5)} seconds and trying again...\n") + time.sleep(round(timeout*1.5)) + return None + + return response diff --git a/augur/tasks/gitlab/gitlab_api_key_handler.py b/augur/tasks/gitlab/gitlab_api_key_handler.py new file mode 100644 index 0000000000..20bc1219ca --- /dev/null +++ b/augur/tasks/gitlab/gitlab_api_key_handler.py @@ -0,0 +1,176 @@ +""" +Defines the handler logic needed to effectively fetch GitLab auth keys +from either the redis cache or the database. Follows the same patterns as +the github api key handler. 
+""" +import httpx +import time +import random + +from typing import Optional, List + +from augur.tasks.util.redis_list import RedisList +from augur.application.db.session import DatabaseSession +from augur.application.config import AugurConfig +from sqlalchemy import func + + +class NoValidKeysError(Exception): + """Defines an exception that is thrown when no gitlab keys are valid""" + + +class GitlabApiKeyHandler(): + """Handles Gitlab API key retrieval from the database and redis + + Attributes: + session (DatabaseSession): Database connection + logger (logging.Logger): Handles all logs + oauth_redis_key (str): The key where the gitlab api keys are cached in redis + redis_key_list (RedisList): Acts like a python list, and interacts directly with the redis cache + config_key (str): The api key that is stored in the users config table + key: (List[str]): List of keys retrieve from database or cache + """ + + def __init__(self, session: DatabaseSession): + + self.session = session + self.logger = session.logger + self.config = AugurConfig(self.logger, session) + + self.oauth_redis_key = "gitlab_oauth_keys_list" + + self.redis_key_list = RedisList(self.oauth_redis_key) + + self.config_key = self.get_config_key() + + self.keys = self.get_api_keys() + + self.logger.info(f"Retrieved {len(self.keys)} gitlab api keys for use") + + def get_random_key(self): + """Retrieves a random key from the list of keys + + Returns: + A random gitlab api key + """ + + return random.choice(self.keys) + + def get_config_key(self) -> str: + """Retrieves the users gitlab api key from their config table + + Returns: + Github API key from config table + """ + return self.config.get_value("Keys", "gitlab_api_key") + + def get_api_keys_from_database(self) -> List[str]: + """Retieves all gitlab api keys from database + + Note: + It retrieves all the keys from the database except the one defined in the users config + + Returns: + Github api keys that are in the database + """ + from augur.application.db.models import WorkerOauth + + select = WorkerOauth.access_token + # randomizing the order at db time + #select.order_by(func.random()) + where = [WorkerOauth.access_token != self.config_key, WorkerOauth.platform == 'gitlab'] + + return [key_tuple[0] for key_tuple in self.session.query(select).filter(*where).order_by(func.random()).all()] + #return [key_tuple[0] for key_tuple in self.session.query(select).filter(*where).all()] + + + def get_api_keys(self) -> List[str]: + """Retrieves all valid Github API Keys + + Note: + It checks to see if the keys are in the redis cache first. + It removes bad keys before returning. + If keys were taken from the database, it caches all the valid keys that were found + + Returns: + Valid Github api keys + """ + + redis_keys = list(self.redis_key_list) + + if redis_keys: + return redis_keys + + attempts = 0 + while attempts < 3: + + try: + keys = self.get_api_keys_from_database() + break + except Exception as e: + self.logger.error(f"Ran into issue when fetching key from database:\n {e}\n") + self.logger.error("Sleeping for 5 seconds...") + time.sleep(5) + attempts += 1 + + if self.config_key is not None: + keys += [self.config_key] + + if len(keys) == 0: + return [] + + valid_keys = [] + with httpx.Client() as client: + + for key in keys: + + # removes key if it returns "Bad Credentials" + if self.is_bad_api_key(client, key) is False: + valid_keys.append(key) + else: + print(f"WARNING: The key '{key}' is not a valid key. 
Hint: If valid in past it may have expired") + + # just in case the mulitprocessing adds extra values to the list. + # we are clearing it before we push the values we got + self.redis_key_list.clear() + + # add all the keys to redis + self.redis_key_list.extend(valid_keys) + + if not valid_keys: + raise NoValidKeysError("No valid gitlab api keys found in the config or worker oauth table") + + + # shuffling the keys so not all processes get the same keys in the same order + #valid_now = valid_keys + #try: + #self.logger.info(f'valid keys before shuffle: {valid_keys}') + #valid_keys = random.sample(valid_keys, len(valid_keys)) + #self.logger.info(f'valid keys AFTER shuffle: {valid_keys}') + #except Exception as e: + # self.logger.debug(f'{e}') + # valid_keys = valid_now + # pass + + return valid_keys + + def is_bad_api_key(self, client: httpx.Client, oauth_key: str) -> bool: + """Determines if a Gitlab API key is bad + + Args: + client: makes the http requests + oauth_key: gitlab api key that is being tested + + Returns: + True if key is bad. False if the key is good + """ + + url = "https://gitlab.com/api/v4/user" + + headers = {'Authorization': f'Bearer {oauth_key}'} + + response = client.request(method="GET", url=url, headers=headers, timeout=180) + if response.status_code == 401: + return True + + return False \ No newline at end of file diff --git a/augur/tasks/gitlab/gitlab_random_key_auth.py b/augur/tasks/gitlab/gitlab_random_key_auth.py new file mode 100644 index 0000000000..64ba31dd19 --- /dev/null +++ b/augur/tasks/gitlab/gitlab_random_key_auth.py @@ -0,0 +1,26 @@ +"""Defines the GitlabRandomKeyAuth class""" + +from augur.tasks.util.random_key_auth import RandomKeyAuth +from augur.tasks.gitlab.gitlab_api_key_handler import GitlabApiKeyHandler +from augur.application.db.session import DatabaseSession + + +class GitlabRandomKeyAuth(RandomKeyAuth): + """Defines a gitlab specific RandomKeyAuth class so + gitlab collections can have a class randomly selects an api key for each request + """ + + def __init__(self, session: DatabaseSession, logger): + """Creates a GitlabRandomKeyAuth object and initializes the RandomKeyAuth parent class""" + + + # gets the gitlab api keys from the database via the GitlabApiKeyHandler + gitlab_api_keys = GitlabApiKeyHandler(session).keys + + if not gitlab_api_keys: + print("Failed to find github api keys. This is usually because your key has expired") + + header_name = "Authorization" + key_format = "Bearer {0}" + + super().__init__(gitlab_api_keys, header_name, session.logger, key_format) \ No newline at end of file diff --git a/augur/tasks/gitlab/gitlab_task_session.py b/augur/tasks/gitlab/gitlab_task_session.py new file mode 100644 index 0000000000..58a6e64373 --- /dev/null +++ b/augur/tasks/gitlab/gitlab_task_session.py @@ -0,0 +1,55 @@ +""" +Defines a GitLab-specific session and manifest object for use in GitLab tasks +""" +from logging import Logger + +from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth +from augur.application.db.session import DatabaseSession + +class GitlabTaskManifest: + """ + Manifest object that represents the state and common elements of + the specified task. GitLab version for the GitLab tasks. 
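For reference, the validity test the handler applies to each key is simply GitLab's `/user` endpoint: any key answered with 401 is dropped before the list is cached in redis. A standalone sketch of that check, outside the handler class (the function name is illustrative, and running it does hit the live API):

```python
# Illustrative stand-alone version of the 401-based key check used by
# GitlabApiKeyHandler.is_bad_api_key above.
import httpx


def is_bad_gitlab_key(token: str, timeout: float = 30.0) -> bool:
    """Return True when GitLab rejects the token with 401 Unauthorized."""
    url = "https://gitlab.com/api/v4/user"
    headers = {"Authorization": f"Bearer {token}"}
    with httpx.Client() as client:
        response = client.get(url, headers=headers, timeout=timeout)
    return response.status_code == 401


if __name__ == "__main__":
    # An obviously invalid token should come back as bad.
    print(is_bad_gitlab_key("not-a-real-token"))  # expected: True
```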
+ + Attributes: + augur_db: sqlalchemy db object + key_auth: GitLab specific key auth retrieval collection + logger: logging object + platform_id: GitLab specific platform id (github is 1) + """ + + def __init__(self, logger): + + from augur.tasks.init.celery_app import engine + + self.augur_db = DatabaseSession(logger, engine) + self.key_auth = GitlabRandomKeyAuth(self.augur_db.session, logger) + self.logger = logger + self.platform_id = 2 + + def __enter__(self): + + return self + + def __exit__(self, exception_type, exception_value, exception_traceback): + + self.augur_db.close() + +class GitlabTaskSession(DatabaseSession): + """ORM session used in gitlab tasks. + This class adds the platform_id and the gitlab key authentication class, + to the already existing DatabaseSession so there is a central location to access + api keys and a single platform_id reference + + Attributes: + oauths (GitlabRandomKeyAuth): Class that handles randomly assigning gitlab api keys to httpx requests + platform_id (int): The id that refers to the Gitlab platform + """ + + def __init__(self, logger: Logger, engine=None): + + super().__init__(logger, engine=engine) + + self.oauths = GitlabRandomKeyAuth(self, logger) + self.platform_id = 2 + diff --git a/augur/tasks/gitlab/issues_task.py b/augur/tasks/gitlab/issues_task.py new file mode 100644 index 0000000000..cf6e5e5dab --- /dev/null +++ b/augur/tasks/gitlab/issues_task.py @@ -0,0 +1,320 @@ +""" +Defines the set of tasks used to retrieve GitLab issue data. +""" +import logging +import traceback + +from augur.tasks.init.celery_app import celery_app as celery +from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask +from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler +from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest +from augur.application.db.data_parse import extract_needed_issue_data_from_gitlab_issue, extract_needed_gitlab_issue_label_data, extract_needed_gitlab_issue_assignee_data, extract_needed_gitlab_issue_message_ref_data, extract_needed_gitlab_message_data +from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts +from augur.application.db.models import Issue, IssueLabel, IssueAssignee, IssueMessageRef, Message, Repo +from augur.application.db.util import execute_session_query + +platform_id = 2 + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_gitlab_issues(repo_git : str) -> int: + """ + Retrieve and parse gitlab issues for the desired repo + + Arguments: + repo_git: the repo url string + """ + + logger = logging.getLogger(collect_gitlab_issues.__name__) + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + try: + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + + owner, repo = get_owner_repo(repo_git) + + issue_data = retrieve_all_gitlab_issue_data(repo_git, logger, manifest.key_auth) + + if issue_data: + issue_ids = process_issues(issue_data, f"{owner}/{repo}: Gitlab Issue task", repo_id, logger, augur_db) + + return issue_ids + else: + logger.info(f"{owner}/{repo} has no issues") + return [] + except Exception as e: + logger.error(f"Could not collect gitlab issues for repo {repo_git}\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") + return -1 + + + +def retrieve_all_gitlab_issue_data(repo_git, logger, key_auth) -> None: + """ + Retrieve only the needed data for issues from the api 
response + + Arguments: + repo_git: url of the relevant repo + logger: loggin object + key_auth: key auth cache and rotator object + """ + + owner, repo = get_owner_repo(repo_git) + + logger.info(f"Collecting gitlab issues for {owner}/{repo}") + + url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/issues?with_labels_details=True" + issues = GitlabApiHandler(key_auth, logger) + + all_data = [] + num_pages = issues.get_num_pages(url) + for page_data, page in issues.iter_pages(url): + + if page_data is None: + return all_data + + if len(page_data) == 0: + logger.debug( + f"{owner}/{repo}: Gitlab Issues Page {page} contains no data...returning") + logger.info(f"{owner}/{repo}: Issues Page {page} of {num_pages}") + return all_data + + logger.info(f"{owner}/{repo}: Gitlab Issues Page {page} of {num_pages}") + + all_data += page_data + + return all_data + +def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: + """ + Retrieve only the needed data for issues from the api response + + Arguments: + issues: List of dictionaries of issue data + task_name: name of the task as well as the repo being processed + repo_id: augur id of the repo + logger: logging object + augur_db: sqlalchemy db object + """ + + # get repo_id or have it passed + tool_source = "Gitlab Issue Task" + tool_version = "2.0" + data_source = "Gitlab API" + + issue_dicts = [] + issue_ids = [] + issue_mapping_data = {} + for issue in issues: + + issue_ids.append(issue["iid"]) + + issue_dicts.append( + extract_needed_issue_data_from_gitlab_issue(issue, repo_id, tool_source, tool_version, data_source) + ) + + issue_labels = extract_needed_gitlab_issue_label_data(issue["labels"], repo_id, + tool_source, tool_version, data_source) + + issue_assignees = extract_needed_gitlab_issue_assignee_data(issue["assignees"], repo_id, + tool_source, tool_version, data_source) + + mapping_data_key = issue["id"] + issue_mapping_data[mapping_data_key] = { + "labels": issue_labels, + "assignees": issue_assignees, + } + + + if len(issue_dicts) == 0: + print("No gitlab issues found while processing") + return + + logger.info(f"{task_name}: Inserting {len(issue_dicts)} gitlab issues") + issue_natural_keys = ["repo_id", "gh_issue_id"] + issue_string_columns = ["issue_title", "issue_body"] + issue_return_columns = ["gh_issue_id", "issue_id"] + + issue_return_data = augur_db.insert_data(issue_dicts, Issue, issue_natural_keys, return_columns=issue_return_columns, string_fields=issue_string_columns) + + issue_label_dicts = [] + issue_assignee_dicts = [] + for data in issue_return_data: + + gh_issue_id = data["gh_issue_id"] + issue_id = data["issue_id"] + + try: + other_issue_data = issue_mapping_data[gh_issue_id] + except KeyError as e: + logger.info(f"{task_name}: Cold not find other gitlab issue data. This should never happen. Error: {e}") + + + # add the issue id to the lables and assignees, then add them to a list of dicts that will be inserted soon + dict_key = "issue_id" + issue_label_dicts += add_key_value_pair_to_dicts(other_issue_data["labels"], dict_key, issue_id) + issue_assignee_dicts += add_key_value_pair_to_dicts(other_issue_data["assignees"], dict_key, issue_id) + + + logger.info(f"{task_name}: Inserting other gitlab issue data of lengths: Labels: {len(issue_label_dicts)} - Assignees: {len(issue_assignee_dicts)}") + + # inserting issue labels + # we are using label_src_id and issue_id to determine if the label is already in the database. 
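`process_issues` above is a two-phase insert: it stashes each issue's labels and assignees under the issue's source id, inserts the issues while asking for `gh_issue_id`/`issue_id` back, then stitches the generated `issue_id` onto the child rows before inserting those. A toy sketch of that stitching step with plain dicts, where `fake_insert` is only a stand-in for `augur_db.insert_data`:

```python
# Minimal sketch of the two-phase insert in process_issues: parents first,
# then child rows keyed back to the generated parent ids.

def fake_insert(rows, start_id=1000):
    """Pretend to upsert rows and return the generated primary keys."""
    return [
        {"gh_issue_id": row["gh_issue_id"], "issue_id": start_id + i}
        for i, row in enumerate(rows)
    ]


issues = [{"gh_issue_id": 11, "issue_title": "bug"},
          {"gh_issue_id": 12, "issue_title": "feature"}]

# Phase 1: remember each issue's labels under its *source* id.
mapping = {11: {"labels": [{"label_text": "critical"}]},
           12: {"labels": [{"label_text": "good first issue"}]}}

# Phase 2: insert the issues and get the generated issue_id per source id.
returned = fake_insert(issues)

# Phase 3: attach the generated id to every child row, then insert those.
label_rows = []
for row in returned:
    for label in mapping[row["gh_issue_id"]]["labels"]:
        label_rows.append({**label, "issue_id": row["issue_id"]})

print(label_rows)
# [{'label_text': 'critical', 'issue_id': 1000},
#  {'label_text': 'good first issue', 'issue_id': 1001}]
```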
+ issue_label_natural_keys = ['label_src_id', 'issue_id'] + issue_label_string_fields = ["label_text", "label_description"] + augur_db.insert_data(issue_label_dicts, IssueLabel, + issue_label_natural_keys, string_fields=issue_label_string_fields) + + # inserting issue assignees + # we are using issue_assignee_src_id and issue_id to determine if the label is already in the database. + # issue_assignee_natural_keys = ['issue_assignee_src_id', 'issue_id'] + # augur_db.insert_data(issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) + + return issue_ids + + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_gitlab_issue_comments(issue_ids, repo_git) -> int: + """ + Retrieve and parse gitlab events for the desired repo + + Arguments: + issue_ids: Set of issue ids to collect coments for + repo_git: repo url + """ + + owner, repo = get_owner_repo(repo_git) + + logger = logging.getLogger(collect_gitlab_issues.__name__) + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + + comments = retrieve_all_gitlab_issue_comments(manifest.key_auth, logger, issue_ids, repo_git) + + if comments: + logger.info(f"Length of comments: {len(comments)}") + process_gitlab_issue_messages(comments, f"{owner}/{repo}: Gitlab issue messages task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab issue comments") + + +def retrieve_all_gitlab_issue_comments(key_auth, logger, issue_ids, repo_git): + """ + Retrieve only the needed data for issue comments + + Arguments: + key_auth: key auth cache and rotator object + logger: loggin object + issue_ids: ids of issues to find comements for + repo_git: repo url + """ + + owner, repo = get_owner_repo(repo_git) + + all_comments = {} + issue_count = len(issue_ids) + index = 1 + + comments = GitlabApiHandler(key_auth, logger) + + for id in issue_ids: + + logger.info(f"Collecting {owner}/{repo} gitlab issue comments for issue {index} of {issue_count}") + + url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/issues/{id}/notes" + + for page_data, page in comments.iter_pages(url): + + if page_data is None or len(page_data) == 0: + break + + if id in all_comments: + all_comments[id].extend(page_data) + else: + all_comments[id] = page_data + + index += 1 + + return all_comments + + +def process_gitlab_issue_messages(data, task_name, repo_id, logger, augur_db): + """ + Retrieve only the needed data for issue messages from the api response + + Arguments: + data: List of dictionaries of issue event data + task_name: name of the task as well as the repo being processed + repo_id: augur id of the repo + logger: logging object + augur_db: sqlalchemy db object + """ + + tool_source = "Gitlab issue comments" + tool_version = "2.0" + data_source = "Gitlab API" + + # create mapping from mr number to pull request id of current mrs + issue_number_to_id_map = {} + issues = augur_db.session.query(Issue).filter(Issue.repo_id == repo_id).all() + for issue in issues: + issue_number_to_id_map[issue.gh_issue_number] = issue.issue_id + + message_dicts = [] + message_ref_mapping_data = {} + for id, messages in data.items(): + + try: + issue_id = issue_number_to_id_map[id] + except KeyError: + logger.info(f"{task_name}: Could not find related issue") + logger.info(f"{task_name}: We were searching for issue number {id} in repo {repo_id}") + logger.info(f"{task_name}: 
Skipping") + continue + + for message in messages: + + issue_message_ref_data = extract_needed_gitlab_issue_message_ref_data(message, issue_id, repo_id, tool_source, tool_version, data_source) + + message_ref_mapping_data[message["id"]] = { + "msg_ref_data": issue_message_ref_data + } + + message_dicts.append( + extract_needed_gitlab_message_data(message, platform_id, tool_source, tool_version, data_source) + ) + + + logger.info(f"{task_name}: Inserting {len(message_dicts)} messages") + message_natural_keys = ["platform_msg_id"] + message_return_columns = ["msg_id", "platform_msg_id"] + message_string_fields = ["msg_text"] + message_return_data = augur_db.insert_data(message_dicts, Message, message_natural_keys, + return_columns=message_return_columns, string_fields=message_string_fields) + + issue_message_ref_dicts = [] + for data in message_return_data: + + augur_msg_id = data["msg_id"] + platform_message_id = data["platform_msg_id"] + + ref = message_ref_mapping_data[platform_message_id] + message_ref_data = ref["msg_ref_data"] + message_ref_data["msg_id"] = augur_msg_id + + issue_message_ref_dicts.append(message_ref_data) + + logger.info(f"{task_name}: Inserting {len(issue_message_ref_dicts)} gitlab issue messages ref rows") + issue_message_ref_natural_keys = ["issue_id", "issue_msg_ref_src_comment_id"] + augur_db.insert_data(issue_message_ref_dicts, IssueMessageRef, issue_message_ref_natural_keys) + + diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py new file mode 100644 index 0000000000..ccf3c7e012 --- /dev/null +++ b/augur/tasks/gitlab/merge_request_task.py @@ -0,0 +1,560 @@ +import logging + +from augur.tasks.init.celery_app import celery_app as celery +from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask +from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler +from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest +from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data, extract_needed_mr_reviewer_data, extract_needed_mr_commit_data, extract_needed_mr_file_data, extract_needed_mr_metadata, extract_needed_gitlab_mr_message_ref_data, extract_needed_gitlab_message_data +from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts +from augur.application.db.models import PullRequest, PullRequestAssignee, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestCommit, PullRequestFile, PullRequestMessageRef, Repo, Message +from augur.application.db.util import execute_session_query + +platform_id = 2 + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_gitlab_merge_requests(repo_git: str) -> int: + """ + Retrieve and parse gitlab MRs for the desired repo + + Arguments: + repo_git: the repo url string + """ + + + logger = logging.getLogger(collect_gitlab_merge_requests.__name__) + + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + repo_id = augur_db.session.query(Repo).filter( + Repo.repo_git == repo_git).one().repo_id + + owner, repo = get_owner_repo(repo_git) + mr_data = retrieve_all_mr_data(repo_git, logger, manifest.key_auth) + + if mr_data: + mr_ids = process_merge_requests(mr_data, f"{owner}/{repo}: Mr task", repo_id, logger, augur_db) + + return mr_ids + else: + logger.info(f"{owner}/{repo} has no merge requests") + return [] + + +def retrieve_all_mr_data(repo_git: str, logger, key_auth) -> None: + """ + 
Retrieve only the needed data for MRs from the api response + + Arguments: + repo_git: url of the relevant repo + logger: loggin object + key_auth: key auth cache and rotator object + """ + + owner, repo = get_owner_repo(repo_git) + + logger.info(f"Collecting pull requests for {owner}/{repo}") + + url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests?with_labels_details=True" + mrs = GitlabApiHandler(key_auth, logger) + + all_data = [] + num_pages = mrs.get_num_pages(url) + for page_data, page in mrs.iter_pages(url): + + if page_data is None: + return all_data + + if len(page_data) == 0: + logger.debug( + f"{owner}/{repo} Mrs Page {page} contains no data...returning") + logger.info(f"{owner}/{repo} Mrs Page {page} of {num_pages}") + return all_data + + logger.info(f"{owner}/{repo} Mrs Page {page} of {num_pages}") + + all_data += page_data + + return all_data + + +def process_merge_requests(data, task_name, repo_id, logger, augur_db): + """ + Retrieve only the needed data for mr label data from the api response + + Arguments: + data: collection of mr data + task_name: name of the task as well as the repo being processed + repo_id: augur id of the repo + logger: logging object + augur_db: sqlalchemy db object + + Returns: + List of parsed MR ids. + """ + + tool_source = "Mr Task" + tool_version = "2.0" + data_source = "Gitlab API" + + merge_requests = [] + mr_ids = [] + mr_mapping_data = {} + for mr in data: + + mr_ids.append(mr["iid"]) + + merge_requests.append(extract_needed_pr_data_from_gitlab_merge_request(mr, repo_id, tool_source, tool_version)) + + assignees = extract_needed_merge_request_assignee_data(mr["assignees"], repo_id, tool_source, tool_version, data_source) + + labels = extract_needed_mr_label_data(mr["labels"], repo_id, tool_source, tool_version, data_source) + + mapping_data_key = mr["id"] + mr_mapping_data[mapping_data_key] = { + "assignees": assignees, + "labels": labels + } + + logger.info(f"{task_name}: Inserting mrs of length: {len(merge_requests)}") + pr_natural_keys = ["repo_id", "pr_src_id"] + pr_string_fields = ["pr_src_title", "pr_body"] + pr_return_columns = ["pull_request_id", "pr_src_id"] + pr_return_data = augur_db.insert_data(merge_requests, PullRequest, pr_natural_keys, return_columns=pr_return_columns, string_fields=pr_string_fields) + + + mr_assignee_dicts = [] + mr_label_dicts = [] + for data in pr_return_data: + + mr_src_id = data["pr_src_id"] + pull_request_id = data["pull_request_id"] + + try: + other_mr_data = mr_mapping_data[mr_src_id] + except KeyError as e: + logger.info(f"Cold not find other pr data. This should never happen. 
Error: {e}") + + dict_key = "pull_request_id" + mr_assignee_dicts += add_key_value_pair_to_dicts(other_mr_data["assignees"], dict_key, pull_request_id) + mr_label_dicts += add_key_value_pair_to_dicts(other_mr_data["labels"], dict_key, pull_request_id) + + logger.info(f"{task_name}: Inserting other pr data of lengths: Labels: {len(mr_label_dicts)} - Assignees: {len(mr_assignee_dicts)}") + + # TODO: Setup unique key on asignees with a value of ('cntrb_id', 'pull_request_id') and add 'cntrb_id' to assingee data + # mr_assignee_natural_keys = ['pr_assignee_src_id', 'pull_request_id'] + # augur_db.insert_data(mr_assignee_dicts, PullRequestAssignee, mr_assignee_natural_keys) + + pr_label_natural_keys = ['pr_src_id', 'pull_request_id'] + pr_label_string_fields = ["pr_src_description"] + augur_db.insert_data(mr_label_dicts, PullRequestLabel, pr_label_natural_keys, string_fields=pr_label_string_fields) + + return mr_ids + + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_merge_request_comments(mr_ids, repo_git) -> int: + """ + Retrieve and parse gitlab events for the desired repo + + Arguments: + mr_ids: ids of MRs to paginate comments for + repo_git: the repo url string + """ + + owner, repo = get_owner_repo(repo_git) + + logger = logging.getLogger(collect_merge_request_comments.__name__) + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/notes".format(owner=owner, repo=repo, id="{id}") + comments = retrieve_merge_request_data(mr_ids, url, "comments", owner, repo, manifest.key_auth, logger, response_type="list") + + if comments: + logger.info(f"Length of merge request comments: {len(comments)}") + process_gitlab_mr_messages(comments, f"{owner}/{repo}: Gitlab mr messages task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab merge request comments") + + +def process_gitlab_mr_messages(data, task_name, repo_id, logger, augur_db): + """ + Retrieve only the needed data for mr label data from the api response + + Arguments: + data: List of dictionaries of mr message data + task_name: name of the task as well as the repo being processed + repo_id: augur id of the repo + logger: logging object + augur_db: sqlalchemy db object + """ + + tool_source = "Gitlab mr comments" + tool_version = "2.0" + data_source = "Gitlab API" + + # create mapping from mr number to pull request id of current mrs + mr_number_to_id_map = {} + mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + for mr in mrs: + mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id + + message_dicts = [] + message_ref_mapping_data = {} + for id, messages in data.items(): + + try: + pull_request_id = mr_number_to_id_map[id] + except KeyError: + logger.info(f"{task_name}: Could not find related mr") + logger.info(f"{task_name}: We were searching for mr number {id} in repo {repo_id}") + logger.info(f"{task_name}: Skipping") + continue + + for message in messages: + + mr_message_ref_data = extract_needed_gitlab_mr_message_ref_data(message, pull_request_id, repo_id, tool_source, tool_version, data_source) + + message_ref_mapping_data[message["id"]] = { + "msg_ref_data": mr_message_ref_data + } + + message_dicts.append( + extract_needed_gitlab_message_data(message, platform_id, tool_source, tool_version, 
data_source) + ) + + + logger.info(f"{task_name}: Inserting {len(message_dicts)} messages") + message_natural_keys = ["platform_msg_id"] + message_return_columns = ["msg_id", "platform_msg_id"] + message_string_fields = ["msg_text"] + message_return_data = augur_db.insert_data(message_dicts, Message, message_natural_keys, + return_columns=message_return_columns, string_fields=message_string_fields) + + mr_message_ref_dicts = [] + for data in message_return_data: + + augur_msg_id = data["msg_id"] + platform_message_id = data["platform_msg_id"] + + ref = message_ref_mapping_data[platform_message_id] + message_ref_data = ref["msg_ref_data"] + message_ref_data["msg_id"] = augur_msg_id + + mr_message_ref_dicts.append(message_ref_data) + + logger.info(f"{task_name}: Inserting {len(mr_message_ref_dicts)} mr messages ref rows") + mr_message_ref_natural_keys = ["pull_request_id", "pr_message_ref_src_comment_id"] + augur_db.insert_data(mr_message_ref_dicts, PullRequestMessageRef, mr_message_ref_natural_keys) + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_merge_request_metadata(mr_ids, repo_git) -> int: + """ + Retrieve and parse gitlab events for the desired repo + + Arguments: + mr_ids: list of mr ids to find metadata for + repo_git: the repo url string + """ + + owner, repo = get_owner_repo(repo_git) + + logger = logging.getLogger(collect_merge_request_metadata.__name__) + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}".format(owner=owner, repo=repo, id="{id}") + metadata_list = retrieve_merge_request_data(mr_ids, url, "metadata", owner, repo, manifest.key_auth, logger, response_type="dict") + + if metadata_list: + logger.info(f"Length of merge request metadata: {len(metadata_list)}") + process_mr_metadata(metadata_list, f"{owner}/{repo}: Mr metadata task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab merge request metadata") + +def process_mr_metadata(data, task_name, repo_id, logger, augur_db): + """ + Retrieve only the needed data for mr label data from the api response + + Arguments: + data: List of dictionaries of mr metadata + task_name: name of the task as well as the repo being processed + repo_id: augur id of the repo + logger: logging object + augur_db: sqlalchemy db object + """ + + tool_source = "Mr Metadata Task" + tool_version = "2.0" + data_source = "Gitlab API" + + # create mapping from mr number to pull request id of current mrs + mr_number_to_id_map = {} + mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + for mr in mrs: + mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id + + all_metadata = [] + for id, metadata in data.items(): + + pull_request_id = mr_number_to_id_map[id] + + all_metadata.extend(extract_needed_mr_metadata(metadata, repo_id, pull_request_id, tool_source, tool_version, data_source)) + + logger.info(f"{task_name}: Inserting {len(all_metadata)} merge request metadata") + pr_metadata_natural_keys = ['pull_request_id', 'pr_head_or_base', 'pr_sha'] + augur_db.insert_data(all_metadata, PullRequestMeta, pr_metadata_natural_keys) + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_merge_request_reviewers(mr_ids, repo_git) -> int: + """ + Retrieve and parse mr reviewers for the desired repo + + 
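The per-MR collectors above all pre-fill `owner` and `repo` in the endpoint template while leaving `{id}` unformatted so it can be filled once per merge request, and the project path is percent-encoded (`owner%2frepo`) as GitLab's projects API expects. A small sketch of that partial-`format` trick, using `urllib.parse.quote` instead of a hard-coded `%2f`:

```python
# Sketch of the partial .format() trick used above: fill owner/repo now,
# keep {id} as a placeholder to fill per merge request.
from urllib.parse import quote

owner, repo = "chaoss", "augur"

# Equivalent to hard-coding %2f: percent-encode the "/" in the project path.
project = quote(f"{owner}/{repo}", safe="")

template = "https://gitlab.com/api/v4/projects/{project}/merge_requests/{id}/notes".format(
    project=project, id="{id}")

for mr_iid in (1, 2, 3):
    print(template.format(id=mr_iid))
# https://gitlab.com/api/v4/projects/chaoss%2Faugur/merge_requests/1/notes
# ... and so on for ids 2 and 3
```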
Arguments: + mr_ids: mrs to search for reviewers for + repo_git: the repo url string + """ + + owner, repo = get_owner_repo(repo_git) + + logger = logging.getLogger(collect_merge_request_reviewers.__name__) + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/approvals".format(owner=owner, repo=repo, id="{id}") + reviewers = retrieve_merge_request_data(mr_ids, url, "reviewers", owner, repo, manifest.key_auth, logger, response_type="dict") + + if reviewers: + logger.info(f"Length of merge request reviewers: {len(reviewers)}") + process_mr_reviewers(reviewers, f"{owner}/{repo}: Mr reviewer task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab merge request reviewers") + +def process_mr_reviewers(data, task_name, repo_id, logger, augur_db): + """ + Retrieve only the needed data for mr Reviewer data from the api response + + Arguments: + data: List of dictionaries of mr Reviewer data + repo_id: augur id of the repo + logger: logging object + augur_db: sqlalchemy db object + """ + + tool_source = "Mr Reviewer Task" + tool_version = "2.0" + data_source = "Gitlab API" + + logger.info(f"Running {task_name}...") + + # create mapping from mr number to pull request id of current mrs + mr_number_to_id_map = {} + mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + for mr in mrs: + mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id + + all_reviewers = [] + for id, values in data.items(): + + pull_request_id = mr_number_to_id_map[id] + + reviewers = extract_needed_mr_reviewer_data(values, pull_request_id, tool_source, tool_version, data_source) + + all_reviewers += reviewers + + # TODO: Need to add unique key with pull_request_id and cntrb_id to insert gitlab reviewers + # pr_reviewer_natural_keys = ["pull_request_id", "cntrb_id"] + # augur_db.insert_data(all_reviewers, PullRequestReviewer, pr_reviewer_natural_keys) + + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_merge_request_commits(mr_ids, repo_git) -> int: + """ + Retrieve and parse mr commits for the desired repo + + Arguments: + mr_ids: ids of mrs to get commits for + repo_git: the repo url string + """ + + owner, repo = get_owner_repo(repo_git) + + logger = logging.getLogger(collect_merge_request_commits.__name__) + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/commits".format(owner=owner, repo=repo, id="{id}") + commits = retrieve_merge_request_data(mr_ids, url, "commits", owner, repo, manifest.key_auth, logger, response_type="list") + + if commits: + logger.info(f"Length of merge request commits: {len(commits)}") + process_mr_commits(commits, f"{owner}/{repo}: Mr commit task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab merge request commits") + + +def process_mr_commits(data, task_name, repo_id, logger, augur_db): + """ + Retrieve only the needed data for mr commits from the api response + + Arguments: + data: List of dictionaries of mr commit data + task_name: name of the task as well as the 
repo being processed + repo_id: augur id of the repo + logger: logging object + augur_db: sqlalchemy db object + """ + + tool_source = "Mr Commit Task" + tool_version = "2.0" + data_source = "Gitlab API" + + # create mapping from mr number to pull request id of current mrs + mr_number_to_id_map = {} + mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + for mr in mrs: + mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id + + all_commits = [] + for id, values in data.items(): + + pull_request_id = mr_number_to_id_map[id] + + for commit in values: + + all_commits.append(extract_needed_mr_commit_data(commit, repo_id, pull_request_id, tool_source, tool_version, data_source)) + + + logger.info(f"{task_name}: Inserting {len(all_commits)} merge request commits") + pr_commits_natural_keys = ["pull_request_id", "repo_id", "pr_cmt_sha"] + augur_db.insert_data(all_commits,PullRequestCommit,pr_commits_natural_keys) + + + +@celery.task(base=AugurCoreRepoCollectionTask) +def collect_merge_request_files(mr_ids, repo_git) -> int: + """ + Retrieve and parse gitlab events for the desired repo + + Arguments: + mr_ids: the ids of mrs to get files for. + repo_git: the repo url string + """ + + owner, repo = get_owner_repo(repo_git) + + logger = logging.getLogger(collect_merge_request_files.__name__) + with GitlabTaskManifest(logger) as manifest: + + augur_db = manifest.augur_db + + query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) + repo_obj = execute_session_query(query, 'one') + repo_id = repo_obj.repo_id + + url = "https://gitlab.com/api/v4/projects/{owner}%2f{repo}/merge_requests/{id}/changes".format(owner=owner, repo=repo, id="{id}") + files = retrieve_merge_request_data(mr_ids, url, "files", owner, repo, manifest.key_auth, logger, response_type="dict") + + if files: + logger.info(f"Length of merge request files: {len(files)}") + process_mr_files(files, f"{owner}/{repo}: Mr files task", repo_id, logger, augur_db) + else: + logger.info(f"{owner}/{repo} has no gitlab merge request files") + +def process_mr_files(data, task_name, repo_id, logger, augur_db): + + tool_source = "Mr files Task" + tool_version = "2.0" + data_source = "Gitlab API" + + # create mapping from mr number to pull request id of current mrs + mr_number_to_id_map = {} + mrs = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).all() + for mr in mrs: + mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id + + all_files = [] + for id, gitlab_file_data in data.items(): + + pull_request_id = mr_number_to_id_map[id] + + all_files.extend(extract_needed_mr_file_data(gitlab_file_data, repo_id, pull_request_id, tool_source, tool_version, data_source)) + + logger.info(f"{task_name}: Inserting {len(all_files)} merge request files") + pr_file_natural_keys = ["pull_request_id", "repo_id", "pr_file_path"] + augur_db.insert_data(all_files, PullRequestFile, pr_file_natural_keys) + + +def retrieve_merge_request_data(ids, url, name, owner, repo, key_auth, logger, response_type): + """ + Retrieve specific mr data from the GitLab api. 
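Below is a sketch of the accumulation pattern `retrieve_merge_request_data` (introduced here) uses: loop over the MR ids, format the endpoint per id, and collect results keyed by id, either one object per id (`"dict"`, e.g. metadata) or a concatenation of pages (`"list"`, e.g. comments). `fetch_pages` is a stub standing in for `GitlabApiHandler.iter_pages`, and the canned pages are illustrative:

```python
# Schematic version of the per-id accumulation in retrieve_merge_request_data.

def fetch_pages(url):
    """Yield (page_data, page_number) tuples; an empty page ends the stream."""
    yield [{"note": f"first page of {url}"}], 1
    yield [{"note": f"second page of {url}"}], 2
    yield [], 3


def collect(ids, url_template, response_type="list"):
    all_data = {}
    for mr_id in ids:
        url = url_template.format(id=mr_id)
        if response_type == "list":
            for page_data, _ in fetch_pages(url):
                if not page_data:
                    break
                all_data.setdefault(mr_id, []).extend(page_data)
        elif response_type == "dict":
            # one object per id, e.g. the MR metadata or approvals endpoint
            all_data[mr_id] = {"url": url}
        else:
            raise ValueError(f"Unexpected response type: {response_type}")
    return all_data


print(collect([7, 8], "https://gitlab.example/api/v4/.../{id}/notes"))
```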
+ + Arguments: + ids: mr ids to paginate info for + url: endpoint to paginate or hit + name: name of data to collect + owner: owner of the repo + repo: repo name + key_auth: key auth cache and rotator object + logger: loggin object + response_type: type of data to get from the api + """ + + all_data = {} + mr_count = len(ids) + index = 1 + + api_handler = GitlabApiHandler(key_auth, logger) + for id in ids: + + print(f"Collecting {owner}/{repo} gitlab merge request {name} for merge request {index} of {mr_count}") + formatted_url = url.format(id=id) + + if response_type == "dict": + page_data, _, _ = api_handler.retrieve_data(formatted_url) + if page_data: + all_data[id] = page_data + + elif response_type == "list": + + for page_data, _ in api_handler.iter_pages(formatted_url): + + if page_data is None or len(page_data) == 0: + break + + if id in all_data: + all_data[id].extend(page_data) + else: + all_data[id] = page_data + else: + raise Exception(f"Unexpected response type: {response_type}") + + index += 1 + + return all_data diff --git a/augur/tasks/init/celery_app.py b/augur/tasks/init/celery_app.py index 706541d1c7..ee6eaeccdf 100644 --- a/augur/tasks/init/celery_app.py +++ b/augur/tasks/init/celery_app.py @@ -50,6 +50,10 @@ class CollectionState(Enum): 'augur.tasks.github.pull_requests.commits_model.tasks', 'augur.tasks.github.traffic.tasks'] +gitlab_tasks = ['augur.tasks.gitlab.merge_request_task', + 'augur.tasks.gitlab.issues_task', + 'augur.tasks.gitlab.events_task'] + git_tasks = ['augur.tasks.git.facade_tasks', 'augur.tasks.git.dependency_tasks.tasks', 'augur.tasks.git.dependency_libyear_tasks.tasks', @@ -66,7 +70,7 @@ class CollectionState(Enum): frontend_tasks = ['augur.tasks.frontend'] -tasks = start_tasks + github_tasks + git_tasks + materialized_view_tasks + frontend_tasks +tasks = start_tasks + github_tasks + gitlab_tasks + git_tasks + materialized_view_tasks + frontend_tasks if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1": tasks += data_analysis_tasks diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 225f78ffde..10f04e40b7 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -24,6 +24,9 @@ from augur.tasks.github.pull_requests.commits_model.tasks import process_pull_request_commits from augur.tasks.git.dependency_tasks.tasks import process_ossf_dependency_metrics from augur.tasks.github.traffic.tasks import collect_github_repo_clones_data +from augur.tasks.gitlab.merge_request_task import collect_gitlab_merge_requests, collect_merge_request_comments, collect_merge_request_metadata, collect_merge_request_reviewers, collect_merge_request_commits, collect_merge_request_files +from augur.tasks.gitlab.issues_task import collect_gitlab_issues, collect_gitlab_issue_comments +from augur.tasks.gitlab.events_task import collect_gitlab_issue_events, collect_gitlab_merge_request_events from augur.tasks.git.facade_tasks import * from augur.tasks.db.refresh_materialized_views import * # from augur.tasks.data_analysis import * @@ -93,6 +96,27 @@ def primary_repo_collect_phase(repo_git): return repo_task_group +def primary_repo_collect_phase_gitlab(repo_git): + + logger = logging.getLogger(primary_repo_collect_phase_gitlab.__name__) + + jobs = group( + chain(collect_gitlab_merge_requests.si(repo_git), group( + #collect_merge_request_comments.s(repo_git), + #collect_merge_request_reviewers.s(repo_git), + collect_merge_request_metadata.s(repo_git), + collect_merge_request_commits.s(repo_git), + collect_merge_request_files.s(repo_git), + 
collect_gitlab_merge_request_events.si(repo_git), + )), + chain(collect_gitlab_issues.si(repo_git), group( + #collect_gitlab_issue_comments.s(repo_git), + collect_gitlab_issue_events.si(repo_git), + )), + ) + + return jobs + #This phase creates the message for secondary collection tasks. #These are less important and have their own worker. @@ -146,20 +170,23 @@ def non_repo_domain_tasks(): def build_primary_repo_collect_request(session,enabled_phase_names, days_until_collect_again = 1): #Add all required tasks to a list and pass it to the CollectionRequest primary_enabled_phases = [] + primary_gitlab_enabled_phases = [] #Primary jobs if prelim_phase.__name__ in enabled_phase_names: primary_enabled_phases.append(prelim_phase) primary_enabled_phases.append(primary_repo_collect_phase) + primary_gitlab_enabled_phases.append(primary_repo_collect_phase_gitlab) #task success is scheduled no matter what the config says. def core_task_success_util_gen(repo_git): return core_task_success_util.si(repo_git) primary_enabled_phases.append(core_task_success_util_gen) + primary_gitlab_enabled_phases.append(core_task_success_util_gen) - primary_request = CollectionRequest("core",primary_enabled_phases,max_repo=40, days_until_collect_again=7) + primary_request = CollectionRequest("core",primary_enabled_phases,max_repo=40, days_until_collect_again=7, gitlab_phases=primary_gitlab_enabled_phases) primary_request.get_valid_repos(session) return primary_request diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index 4d5b663a20..47705785e9 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -132,9 +132,10 @@ def get_required_conditions_for_ml_repos(allow_collected_before = False, days_un class CollectionRequest: - def __init__(self,name,phases,max_repo = 10,days_until_collect_again = 1): + def __init__(self,name,phases,max_repo = 10,days_until_collect_again = 1, gitlab_phases=None): self.name = name self.phases = phases + self.gitlab_phases = gitlab_phases self.max_repo = max_repo self.days_until_collect_again = days_until_collect_again self.new_status = CollectionState.PENDING.value @@ -587,27 +588,44 @@ def send_messages(self): for col_hook in self.collection_hooks: self.logger.info(f"Starting collection on {len(col_hook.repo_list)} {col_hook.name} repos") - + for repo_git in col_hook.repo_list: - #repo = self.session.query(Repo).filter(Repo.repo_git == repo_git).one() - #repo_id = repo.repo_id - - augur_collection_sequence = [] - for job in col_hook.phases: - #Add the phase to the sequence in order as a celery task. - #The preliminary task creates the larger task chain - augur_collection_sequence.append(job(repo_git)) - - #augur_collection_sequence.append(core_task_success_util.si(repo_git)) - #Link all phases in a chain and send to celery - augur_collection_chain = chain(*augur_collection_sequence) - task_id = augur_collection_chain.apply_async().task_id - - self.logger.info(f"Setting repo {col_hook.name} status to collecting for repo: {repo_git}") - - #yield the value of the task_id to the calling method so that the proper collectionStatus field can be updated - yield repo_git, task_id, col_hook.name + repo = self.session.query(Repo).filter(Repo.repo_git == repo_git).one() + if "github" in repo.repo_git: + augur_collection_sequence = [] + for job in col_hook.phases: + #Add the phase to the sequence in order as a celery task. 
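`send_messages` now branches per repository: GitHub URLs keep the existing phase list, anything else falls through to the new `gitlab_phases`, and the chosen phases are chained and sent to celery. A plain-Python sketch of just that dispatch decision, with ordinary callables in place of celery signatures and illustrative names throughout:

```python
# Plain-Python sketch of the per-platform dispatch added to send_messages.
# The real code builds a celery chain from the chosen phases and calls
# apply_async(); here the phases are just functions run in order.

def github_collect(repo_git):
    return f"github collection for {repo_git}"


def gitlab_collect(repo_git):
    return f"gitlab collection for {repo_git}"


class CollectionHook:
    def __init__(self, phases, gitlab_phases=None):
        self.phases = phases
        self.gitlab_phases = gitlab_phases


def dispatch(hook, repo_git):
    if "github" in repo_git:
        phases = hook.phases
    elif hook.gitlab_phases is not None:
        phases = hook.gitlab_phases
    else:
        return []  # non-GitHub repo and no GitLab phases configured
    return [phase(repo_git) for phase in phases]


hook = CollectionHook(phases=[github_collect], gitlab_phases=[gitlab_collect])
print(dispatch(hook, "https://github.com/chaoss/augur"))
print(dispatch(hook, "https://gitlab.com/example/project"))
```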
+ #The preliminary task creates the larger task chain + augur_collection_sequence.append(job(repo_git)) + + #augur_collection_sequence.append(core_task_success_util.si(repo_git)) + #Link all phases in a chain and send to celery + augur_collection_chain = chain(*augur_collection_sequence) + task_id = augur_collection_chain.apply_async().task_id + + self.logger.info(f"Setting github repo {col_hook.name} status to collecting for repo: {repo_git}") + + #yield the value of the task_id to the calling method so that the proper collectionStatus field can be updated + yield repo_git, task_id, col_hook.name + else: + if col_hook.gitlab_phases is not None: + + augur_collection_sequence = [] + for job in col_hook.gitlab_phases: + #Add the phase to the sequence in order as a celery task. + #The preliminary task creates the larger task chain + augur_collection_sequence.append(job(repo_git)) + + #augur_collection_sequence.append(core_task_success_util.si(repo_git)) + #Link all phases in a chain and send to celery + augur_collection_chain = chain(*augur_collection_sequence) + task_id = augur_collection_chain.apply_async().task_id + + self.logger.info(f"Setting gitlab repo {col_hook.name} status to collecting for repo: {repo_git}") + + #yield the value of the task_id to the calling method so that the proper collectionStatus field can be updated + yield repo_git, task_id, col_hook.name #def start_block_of_repos(logger,session,repo_git_identifiers,phases,repos_type,hook="core"): # diff --git a/augur/tasks/util/worker_util.py b/augur/tasks/util/worker_util.py index 6380ed22b0..84c177724b 100644 --- a/augur/tasks/util/worker_util.py +++ b/augur/tasks/util/worker_util.py @@ -138,7 +138,7 @@ def parse_json_from_subprocess_call(logger, subprocess_arr, cwd=None): try: required_output = json.loads(output) except json.decoder.JSONDecodeError as e: - session.logger.error(f"Could not parse required output! \n output: {output} \n Error: {e}") + logger.error(f"Could not parse required output! 
\n output: {output} \n Error: {e}") raise e return required_output diff --git a/augur/util/repo_load_controller.py b/augur/util/repo_load_controller.py index f6841d976e..943e3373a6 100644 --- a/augur/util/repo_load_controller.py +++ b/augur/util/repo_load_controller.py @@ -12,10 +12,23 @@ from augur.application.db.models.augur_operations import retrieve_owner_repos from augur.application.db.util import execute_session_query +from sqlalchemy import Column, Table, MetaData, or_ +from sqlalchemy.sql.operators import ilike_op +from sqlalchemy.sql.functions import coalesce +from sqlalchemy.orm import Query + logger = logging.getLogger(__name__) +with DatabaseEngine() as engine: + augur_data_schema = MetaData(schema = "augur_data") + augur_data_schema.reflect(bind = engine, views = True) + + commits_materialized_view: Table = augur_data_schema.tables["augur_data.api_get_all_repos_commits"] + issues_materialized_view: Table = augur_data_schema.tables["augur_data.api_get_all_repos_issues"] + + class RepoLoadController: def __init__(self, gh_session): @@ -49,7 +62,7 @@ def add_cli_repo(self, repo_data: Dict[str, Any], from_org_list=False, repo_type # if the repo doesn't exist it adds it - repo_id = Repo.insert(self.session, url, repo_group_id, "CLI", repo_type) + repo_id = Repo.insert_github_repo(self.session, url, repo_group_id, "CLI", repo_type) if not repo_id: logger.warning(f"Invalid repo group id specified for {url}, skipping.") @@ -131,34 +144,33 @@ def paginate_repos(self, source, page=0, page_size=25, sort="repo_id", direction order_by = sort if sort else "repo_id" order_direction = direction if direction else "ASC" - query, query_args, result = self.generate_repo_query(source, count=False, order_by=order_by, direction=order_direction, + + query, result = self.generate_repo_query(source, count=False, order_by=order_by, direction=order_direction, page=page, page_size=page_size, **kwargs) + + + # query, query_args, result = self.generate_repo_query(source, count=False, order_by=order_by, direction=order_direction, + # page=page, page_size=page_size, **kwargs) if not query: return None, {"status": result["status"]} if result["status"] == "No data": return [], {"status": "No data"} - get_page_of_repos_sql = s.sql.text(query) - - with DatabaseEngine(connection_pool_size=1).connect() as conn: + # get_page_of_repos_sql = s.sql.text(query) - results = pd.read_sql(get_page_of_repos_sql, conn, params=query_args) + # with DatabaseEngine(connection_pool_size=1).connect() as conn: - results['url'] = results['url'].apply(lambda datum: datum.split('//')[1]) + # results = pd.read_sql(get_page_of_repos_sql, conn, params=query_args) - b64_urls = [] - for i in results.index: - b64_urls.append(base64.b64encode((results.at[i, 'url']).encode())) - results['base64_url'] = b64_urls + results = [dict(x._mapping) for x in query] - data = results.to_dict(orient="records") + for row in results: - # The SELECT statement in generate_repo_query has been updated to include `repo_name` - # for row in data: - # row["repo_name"] = re.search(r"github\.com\/[A-Za-z0-9 \- _]+\/([A-Za-z0-9 \- _ .]+)$", row["url"]).groups()[0] + row["url"] = row["url"].split('//')[1] + row["base64_url"] = base64.b64encode(row["url"].encode()) - return data, {"status": "success"} + return results, {"status": "success"} def get_repo_count(self, source, **kwargs): @@ -169,142 +181,97 @@ def get_repo_count(self, source, **kwargs): if source not in ["all", "user", "group"]: print("Func: get_repo_count. 
Error: Invalid source") return None, {"status": "Invalid source"} - - query, query_args, result = self.generate_repo_query(source, count=True, **kwargs) + + query, result = self.generate_repo_query(source, count=True, **kwargs) if not query: return None, result if result["status"] == "No data": return 0, {"status": "No data"} - - # surround query with count query so we just get the count of the rows - final_query = f"SELECT count(*) FROM ({query}) a;" - - get_page_of_repos_sql = s.sql.text(final_query) - - result = self.session.execute(get_page_of_repos_sql, query_args).fetchall() + + count = query.count() - return result[0]["count"], {"status": "success"} + return count, {"status": "success"} def generate_repo_query(self, source, count, **kwargs): - # TODO: need more flexible way of calculating count for variable column queries - - query_args = {} - - if count: - # only query for repos ids so the query is faster for getting the count - select = """ DISTINCT(augur_data.repo.repo_id), - (regexp_match(augur_data.repo.repo_git, 'github\.com\/[A-Za-z0-9 \- _]+\/([A-Za-z0-9 \- _ .]+)$'))[1] as repo_name, - (regexp_match(augur_data.repo.repo_git, 'github\.com\/([A-Za-z0-9 \- _]+)\/[A-Za-z0-9 \- _ .]+$'))[1] as repo_owner""" - else: - - select = """ DISTINCT(augur_data.repo.repo_id), - augur_data.repo.description, - augur_data.repo.repo_git AS url, - COALESCE(a.commits_all_time, 0) as commits_all_time, - COALESCE(b.issues_all_time, 0) as issues_all_time, - rg_name, - (regexp_match(augur_data.repo.repo_git, 'github\.com\/[A-Za-z0-9 \- _]+\/([A-Za-z0-9 \- _ .]+)$'))[1] as repo_name, - (regexp_match(augur_data.repo.repo_git, 'github\.com\/([A-Za-z0-9 \- _]+)\/[A-Za-z0-9 \- _ .]+$'))[1] as repo_owner, - augur_data.repo.repo_group_id""" - - query = f""" - SELECT - {select} - FROM - augur_data.repo - LEFT OUTER JOIN augur_data.api_get_all_repos_commits a ON augur_data.repo.repo_id = a.repo_id - LEFT OUTER JOIN augur_data.api_get_all_repos_issues b ON augur_data.repo.repo_id = b.repo_id - JOIN augur_data.repo_groups ON augur_data.repo.repo_group_id = augur_data.repo_groups.repo_group_id\n""" - + + columns: list[Column] = [ + Repo.repo_id.distinct().label("repo_id"), + Repo.description.label("description"), + Repo.repo_git.label("url"), + coalesce(commits_materialized_view.columns.commits_all_time, 0).label("commits_all_time"), + coalesce(issues_materialized_view.columns.issues_all_time, 0).label("issues_all_time"), + RepoGroup.rg_name.label("rg_name"), + Repo.repo_git.regexp_replace('.*github\.com\/[A-Za-z0-9 \- _]+\/([A-Za-z0-9 \- _ .]+)$', "\\1").label("repo_name"), + Repo.repo_git.regexp_replace('.*github\.com\/([A-Za-z0-9 \- _]+)\/[A-Za-z0-9 \- _ .]+$', "\\1").label("repo_owner"), + RepoGroup.repo_group_id.label("repo_group_id") + ] + + def get_colum_by_label(label: str)-> Column: + for column in columns: + if column.name == label: + return column + + repos: Query = self.session.query(*columns)\ + .outerjoin(commits_materialized_view, Repo.repo_id == commits_materialized_view.columns.repo_id)\ + .outerjoin(issues_materialized_view, Repo.repo_id == issues_materialized_view.columns.repo_id)\ + .join(RepoGroup, Repo.repo_group_id == RepoGroup.repo_group_id) + if source == "user": + user: User = kwargs.get("user") - user = kwargs.get("user") if not user: - print("Func: generate_repo_query. 
Error: User not passed when trying to get user repos") return None, {"status": "User not passed when trying to get user repos"} - if not user.groups: return None, {"status": "No data"} - - query += "\t\t JOIN augur_operations.user_repos ON augur_data.repo.repo_id = augur_operations.user_repos.repo_id\n" - query += "\t\t JOIN augur_operations.user_groups ON augur_operations.user_repos.group_id = augur_operations.user_groups.group_id\n" - query += "\t\t WHERE augur_operations.user_groups.user_id = :user_id\n" - - query_args["user_id"] = user.user_id - + + repos = repos.join(UserRepo, Repo.repo_id == UserRepo.repo_id)\ + .join(UserGroup, UserGroup.group_id == UserRepo.group_id)\ + .filter(UserGroup.user_id == user.user_id) + elif source == "group": - - user = kwargs.get("user") + user: User = kwargs.get("user") + if not user: - print("Func: generate_repo_query. Error: User not specified") return None, {"status": "User not specified"} - group_name = kwargs.get("group_name") if not group_name: - print("Func: generate_repo_query. Error: Group name not specified") return None, {"status": "Group name not specified"} - + group_id = UserGroup.convert_group_name_to_id(self.session, user.user_id, group_name) if group_id is None: - print("Func: generate_repo_query. Error: Group does not exist") return None, {"status": "Group does not exists"} - - query += "\t\t JOIN augur_operations.user_repos ON augur_data.repo.repo_id = augur_operations.user_repos.repo_id\n" - query += "\t\t WHERE augur_operations.user_repos.group_id = :group_id \n" - - query_args["group_id"] = group_id + + repos = repos.join(UserRepo, Repo.repo_id == UserRepo.repo_id)\ + .filter(UserRepo.group_id == group_id) - # implement sorting by query_key search = kwargs.get("search") qkey = kwargs.get("query_key") or ["repo_name", "repo_owner"] - if search: - # The WHERE clause cannot use a column alias created in the directly preceeding SELECT clause - # We must wrap the query in an additional SELECT with a table alias - # This way, we can use WHERE with the computed repo_name column alias - query = f"""\tSELECT * from ( - {query} - ) res\n""" - # This is done so repos with a NULL repo_name can still be sorted. 
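The string-built SQL here is being replaced with a composable SQLAlchemy `Query`: searchable columns OR'd together with `ilike`, ordering chosen by calling the column's `asc()`/`desc()` method dynamically, and paging expressed with `slice()`. A self-contained sketch of those three pieces against an in-memory SQLite table; the model and rows are made up for illustration and are not Augur's real schema:

```python
# Self-contained sketch of the ORM patterns used in generate_repo_query:
# OR'd ilike search, dynamic asc()/desc() ordering, and slice() pagination.
from sqlalchemy import Column, Integer, String, create_engine, or_
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()


class Repo(Base):
    __tablename__ = "repo"
    repo_id = Column(Integer, primary_key=True)
    repo_name = Column(String)
    repo_owner = Column(String)


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add_all([
        Repo(repo_id=1, repo_name="augur", repo_owner="chaoss"),
        Repo(repo_id=2, repo_name="grimoirelab", repo_owner="chaoss"),
        Repo(repo_id=3, repo_name="augur-spdx", repo_owner="chaoss"),
    ])
    session.commit()

    search, query_key = "augur", [Repo.repo_name, Repo.repo_owner]
    order_by, direction = Repo.repo_id, "desc"
    page, page_size = 0, 25

    repos = (
        session.query(Repo.repo_id, Repo.repo_name)
        .filter(or_(*[col.ilike(f"%{search}%") for col in query_key]))
        .order_by(getattr(order_by, direction)())  # .asc() or .desc()
        .slice(page * page_size, (page + 1) * page_size)
    )
    print(repos.all())  # [(3, 'augur-spdx'), (1, 'augur')]
```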
- # "res" here is a randomly chosen table alias, short for "result" - # It is only included because it is required by the SQL syntax - if isinstance(qkey, list) and len(qkey) > 0: - query += f"\tWHERE :qkey_where ilike :search\n" - query_args["qkey_where"] = qkey.pop(0) - - for i, key in enumerate(qkey): - param_name = f"qkey_or_{i}" - query += f"OR :{param_name} ilike :search\n" - query_args[param_name] = key + repos = repos.filter(or_(ilike_op(get_colum_by_label(filter_column), f"%{search}%") for filter_column in qkey)) else: - query += f"\tWHERE :qkey ilike :search\n" - query_args["qkey"] = qkey - - query_args["search"] = f'%{search}%' - - - if not count: - order_by = kwargs.get("order_by") or "repo_id" - page = kwargs.get("page") or 0 - page_size = kwargs.get("page_size") or 25 + repos = repos.filter(ilike_op(get_colum_by_label(qkey), f"%{search}%")) + + page_size: int = kwargs.get("page_size") or 25 + if count: + return repos, {"status": "success"} + else: + page: int = kwargs.get("page") or 0 + offset = page * page_size direction = kwargs.get("direction") or "ASC" - + order_by = kwargs.get("order_by") or "repo_id" + if direction not in ["ASC", "DESC"]: - return None, None, {"status": "Invalid direction"} + return None, {"status": "Invalid direction"} if order_by not in ["repo_id", "repo_name", "repo_owner", "commits_all_time", "issues_all_time"]: - return None, None, {"status": "Invalid order by"} - - offset = page*page_size - - query += f"\tORDER BY {order_by} {direction}\n" - query += "\tLIMIT :page_size\n" - query += "\tOFFSET :offset;\n" - - query_args["page_size"] = page_size - query_args["offset"] = offset + return None, {"status": "Invalid order by"} + + # Find the column named in the 'order_by', and get its asc() or desc() method + directive: function = getattr(get_colum_by_label(order_by), direction.lower()) + + repos = repos.order_by(directive()) - return query, query_args, {"status": "success"} + return repos.slice(offset, offset + page_size), {"status": "success"} diff --git a/docker-compose.yml b/docker-compose.yml index bc8186914a..75ad63ea7f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -8,8 +8,11 @@ services: - "POSTGRES_DB=augur" - "POSTGRES_USER=${AUGUR_DB_USER:-augur}" - "POSTGRES_PASSWORD=${AUGUR_DB_PASSWORD:-augur}" + - "PGDATA=/var/lib/postgresql/data/pgdata" ports: - "127.0.0.1:${AUGUR_DB_PORT:-5432}:5432" + volumes: + - augurpostgres:/var/lib/postgresql/data redis: image: "redis:alpine" @@ -44,5 +47,7 @@ services: volumes: facade: driver: local + augurpostgres: + driver: local diff --git a/docker/backend/Dockerfile b/docker/backend/Dockerfile index b16deb4b34..9c09a4a2f8 100644 --- a/docker/backend/Dockerfile +++ b/docker/backend/Dockerfile @@ -1,8 +1,8 @@ #SPDX-License-Identifier: MIT -FROM python:3.8.11-slim-buster +FROM python:3.10-slim-bullseye LABEL maintainer="outdoors@acm.org" -LABEL version="0.53.1" +LABEL version="0.62.0" ENV DEBIAN_FRONTEND=noninteractive @@ -13,7 +13,9 @@ RUN set -x \ bash \ curl \ gcc \ - python3-pip \ + musl-dev \ + python3-dev \ + python3-distutils \ wget \ postgresql-client \ && rm -rf /var/lib/apt/lists/* @@ -32,6 +34,17 @@ COPY ./scripts/ scripts/ RUN python3 -m venv /opt/venv RUN set -x \ + && /opt/venv/bin/pip install --upgrade pip + +RUN set -x \ + && /opt/venv/bin/pip install wheel + +RUN set -x \ + && /opt/venv/bin/pip install . + +RUN set -x \ + && /opt/venv/bin/pip install --upgrade pip \ + && /opt/venv/bin/pip install wheel \ && /opt/venv/bin/pip install . 
RUN ./scripts/docker/install-workers-deps.sh diff --git a/docker/database/Dockerfile b/docker/database/Dockerfile index 5a91307297..1670599754 100644 --- a/docker/database/Dockerfile +++ b/docker/database/Dockerfile @@ -1,8 +1,8 @@ #SPDX-License-Identifier: MIT -FROM postgres:12 +FROM postgres:14 LABEL maintainer="outdoors@acm.org" -LABEL version="0.53.1" +LABEL version="0.62.0" ENV POSTGRES_DB "test" ENV POSTGRES_USER "augur" diff --git a/docs/source/getting-started/new-install.rst b/docs/source/getting-started/new-install.rst index 14e276fc51..ba38fb7758 100644 --- a/docs/source/getting-started/new-install.rst +++ b/docs/source/getting-started/new-install.rst @@ -44,7 +44,7 @@ Executable sudo snap install go --classic && #required: Go Needs to be version 1.19.x or higher. Snap is the package manager that gets you to the right version. Classic enables it to actually be installed at the correct version. sudo apt install nginx && # required for hosting sudo add-apt-repository ppa:mozillateam/firefox-next && - sudo apt install firefox=115.0~b2+build1-0ubuntu0.22.04.1 && + sudo apt install firefox=121.0~b7+build1-0ubuntu0.22.04.1 && sudo apt install firefox-geckodriver # You will almost certainly need to reboot after this. diff --git a/docs/source/rest-api/spec.yml b/docs/source/rest-api/spec.yml index 566e998346..7b969d5803 100644 --- a/docs/source/rest-api/spec.yml +++ b/docs/source/rest-api/spec.yml @@ -96,6 +96,49 @@ paths: type: array tags: - utility + /repos/:id: + get: + description: Get a downloaded repo by its ID in Augur. The schema block below says it is an array, but it is not. + operationId: Get Repo By ID + responses: + '200': + description: OK + schema: + items: + properties: + base64_url: + description: 'Base64 encode of the full URL. Example Z2l0aHViLmNvbS8zc2NhbGUvM3NjYWxlLW9wZXJhdG9yLW1ldGFkYXRh' + type: string + description: + description: 'Repository description. Example: null' + type: string + commits_all_time: + description: 'How many commits have been made to this repository? Example: 24' + type: integer + issues_all_time: + description: 'How many issues have been raised on this repository? Example: 1' + type: integer + pull_requests_all_time: + description: 'How many pull requests have been made to this repository? Example: 7' + type: integer + repo_id: + description: 'Repository ID, should match provided URL parameter. Example: 25551' + type: integer + repo_name: + description: 'Name of the provided repository. Example: 3scale-operator-metadata' + type: string + rg_name: + description: 'Name of the repository group containing this repo. Example: 3scale' + type: string + repo_group_id: + description: 'ID of the repository group containing this repo. Example: 25431' + type: integer + url: + description: 'URL of this repository, sans leading protocol. Example: github.com/3scale/3scale-operator-metadata' + type: string + type: array + tags: + - utility /owner/:owner/repo/:repo: get: description: Get the repo_group_id and repo_id of a particular repo. @@ -284,12 +327,12 @@ paths: tags: - utility #risk endpoints - /metadata/repo_info: - get: - description: 'Returns the metadata about all repositories in an Augur instance, using information from a git platform (GitHub, GitLab, etc.). Each record includes the default branch name, repository license file, forks, stars, watchers, and committers. Also includes metadata about current repository issue and pull request status and counts.' 
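The spec now documents a `GET /repos/:id` route returning a single downloaded-repo record (the schema block is declared as an array, but as the description notes the response is not). A hedged sketch of calling it from Python; the base URL and port are assumptions about a local Augur instance, and the example id comes from the field descriptions above:

```python
# Sketch of querying the new endpoint documented above. Adjust the base URL
# to match your deployment; field names follow the schema in the spec.
import httpx

AUGUR_BASE = "http://localhost:5000/api/unstable"  # assumed local instance

repo_id = 25551  # example id from the spec above
response = httpx.get(f"{AUGUR_BASE}/repos/{repo_id}", timeout=30)
response.raise_for_status()

repo = response.json()
print(repo["repo_name"], repo["url"], repo["commits_all_time"])
```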
+  /metadata/repo_info:
+    get:
+      description: 'Returns the metadata about all repositories in an Augur instance, using information from a git platform (GitHub, GitLab, etc.). Each record includes the default branch name, repository license file, forks, stars, watchers, and committers. Also includes metadata about current repository issue and pull request status and counts.'
       externalDocs:
         description: CHAOSS Metric Definition
-        url: https://github.com/chaoss/wg-risk/blob/main/focus-areas/business-risk.md
+        url: https://github.com/chaoss/wg-risk/blob/main/focus-areas/business-risk.md
       operationId: Activity Metadata (Repo)
       responses:
         '200':
@@ -315,45 +358,45 @@ paths:
                 fork_count:
                   description: 'Example: 554'
                   type: integer
-                watchers_count:
+                watchers_count:
                   description: 'Example: 424'
                   type: integer
-                stars_count:
+                stars_count:
                   description: 'Example: 443'
                   type: integer
-                commits_count:
+                commits_count:
                   description: '4434'
-                  type: integer
-                committers_count:
+                  type: integer
+                committers_count:
                   description: 'Example: 42'
                   type: integer
-                open_issues:
+                open_issues:
                   description: 'Example: 7'
-                  type: integer
-                issues_count:
+                  type: integer
+                issues_count:
                   description: 'Example: 23332'
                   type: integer
-                issues_closed:
+                issues_closed:
                   description: 'Example: 23322'
                   type: integer
-                pull_request_count:
+                pull_request_count:
                   description: 'Example: 19445'
                   type: integer
-                pull_requests_open:
+                pull_requests_open:
                   description: 'Example: 10'
                   type: integer
-                pull_requests_closed:
+                pull_requests_closed:
                   description: 'Example: 19435'
                   type: integer
-                pull_requests_merged:
+                pull_requests_merged:
                   description: 'Example: 17473'
                   type: integer
             type: array
       tags:
         - risk
-  /metadata/contributions_count:
-    get:
-      description: 'Returns a list of repositories contributed to by all the contributors in an Augur Instance: INCLUDING all repositories on a platform, *not* merely those repositories in the Augur Instance. Numerical totals represent total CONTRIBUTIONS.'
+  /metadata/contributions_count:
+    get:
+      description: 'Returns a list of repositories contributed to by all the contributors in an Augur Instance: INCLUDING all repositories on a platform, *not* merely those repositories in the Augur Instance. Numerical totals represent total CONTRIBUTIONS.'
       externalDocs:
         description: CHAOSS Metric Definition
         url: https://github.com/chaoss/wg-risk/blob/main/focus-areas/business-risk.md
@@ -373,9 +416,9 @@ paths:
             type: array
       tags:
         - risk
-  /metadata/contributors_count:
-    get:
-      description: 'Returns a list of repositories contributed to by all the contributors in an Augur Instance: INCLUDING all repositories on a platform, *not* merely those repositories in the Augur Instance. Numerical totals represent total CONTRIBUTORS.'
+  /metadata/contributors_count:
+    get:
+      description: 'Returns a list of repositories contributed to by all the contributors in an Augur Instance: INCLUDING all repositories on a platform, *not* merely those repositories in the Augur Instance. Numerical totals represent total CONTRIBUTORS.'
       externalDocs:
         description: CHAOSS Metric Definition
         url: https://github.com/chaoss/wg-risk/blob/main/focus-areas/business-risk.md
@@ -5228,9 +5271,9 @@ paths:
             type: object
       tags:
         - visualizations
-  /complexity/project_lines:
-    get:
-      description: 'Returns project line data for all repositories in an Augur instance, using information from a git platform (GitHub, GitLab, etc.). Each record includes the total and average number of lines in the project repository.'
+  /complexity/project_lines:
+    get:
+      description: 'Returns project line data for all repositories in an Augur instance, using information from a git platform (GitHub, GitLab, etc.). Each record includes the total and average number of lines in the project repository.'
       operationId: Total Lines (repo)
       responses:
         '200':
@@ -5256,9 +5299,9 @@ paths:
             type: array
       tags:
         - complexity
-  /complexity/project_file_complexity:
-    get:
-      description: 'Returns project file complexity data for all repositories in an Augur instance, using information from a git platform (GitHub, GitLab, etc.). Each record includes the total and average file complexity of the project repository.'
+  /complexity/project_file_complexity:
+    get:
+      description: 'Returns project file complexity data for all repositories in an Augur instance, using information from a git platform (GitHub, GitLab, etc.). Each record includes the total and average file complexity of the project repository.'
       operationId: File Complexity (repo)
       responses:
         '200':
@@ -5284,9 +5327,9 @@ paths:
             type: array
       tags:
         - complexity
-  /complexity/project_blank_lines:
-    get:
-      description: 'Returns project blank line data for all repositories in an Augur instance, using information from a git platform (GitHub, GitLab, etc.). Each record includes the total and average number of blank lines in the project repository.'
+  /complexity/project_blank_lines:
+    get:
+      description: 'Returns project blank line data for all repositories in an Augur instance, using information from a git platform (GitHub, GitLab, etc.). Each record includes the total and average number of blank lines in the project repository.'
       operationId: Total Blank Lines (repo)
       responses:
         '200':
@@ -5312,9 +5355,9 @@ paths:
             type: array
       tags:
         - complexity
-  /complexity/project_comment_lines:
-    get:
-      description: 'Returns project comment line data for all repositories in an Augur instance, using information from a git platform (GitHub, GitLab, etc.). Each record includes the total and average number of comment lines in the project repository.'
+  /complexity/project_comment_lines:
+    get:
+      description: 'Returns project comment line data for all repositories in an Augur instance, using information from a git platform (GitHub, GitLab, etc.). Each record includes the total and average number of comment lines in the project repository.'
       operationId: Total Comment Lines (repo)
       responses:
         '200':
@@ -5340,9 +5383,9 @@ paths:
             type: array
       tags:
         - complexity
-  /complexity/project_files:
-    get:
-      description: 'Returns project file data for all repositories in an Augur instance, using information from a git platform (GitHub, GitLab, etc.). Each record includes the total number of files in the project repository.'
+  /complexity/project_files:
+    get:
+      description: 'Returns project file data for all repositories in an Augur instance, using information from a git platform (GitHub, GitLab, etc.). Each record includes the total number of files in the project repository.'
       operationId: Total Files (repo)
       responses:
         '200':
@@ -5365,9 +5408,9 @@ paths:
             type: array
       tags:
         - complexity
-  /complexity/project_languages:
-    get:
-      description: 'Returns project language data for all repositories in an Augur instance, using information from a git platform (GitHub, GitLab, etc.). Each record includes the lines and files of a language in a repository.'
+  /complexity/project_languages:
+    get:
+      description: 'Returns project language data for all repositories in an Augur instance, using information from a git platform (GitHub, GitLab, etc.). Each record includes the lines and files of a language in a repository.'
       operationId: Project Languages (repo)
       responses:
         '200':
@@ -5401,7 +5444,7 @@ paths:
       description: 'The number of messages exchanged for a repository group over a specified period.'
       externalDocs:
         description: CHAOSS Metric Definition
-        url:
+        url:
       operationId: Repository Messages (Repo Group)
       parameters:
         - description: Repository Group ID
@@ -5717,4 +5760,4 @@ paths:
             type: string
             enum: ["Missing argument"]
       tags:
-        - DEI Badging
\ No newline at end of file
+        - DEI Badging
diff --git a/metadata.py b/metadata.py
index 3a08e2ba55..c3c1597b49 100644
--- a/metadata.py
+++ b/metadata.py
@@ -5,8 +5,8 @@
 __short_description__ = "Python 3 package for free/libre and open-source software community metrics, models & data collection"

-__version__ = "0.60.2"
-__release__ = "v0.60.2 (Swifty Kelce)"
+__version__ = "0.62.0"
+__release__ = "v0.62.0 (AI for Pets)"

 __license__ = "MIT"
-__copyright__ = "University of Missouri, University of Nebraska-Omaha, CHAOSS, Brian Warner & Augurlabs 2023"
+__copyright__ = "University of Missouri, University of Nebraska-Omaha, CHAOSS, Brian Warner & Augurlabs 2024"

diff --git a/setup.py b/setup.py
index a456352065..1a72c87b33 100644
--- a/setup.py
+++ b/setup.py
@@ -49,7 +49,7 @@
         "Flask-Login==0.5.0",
         "Flask-WTF==1.0.0",
         "pandas==1.5.3", # 1.4.3
-        "numpy==1.22", # 1.23.2
+        "numpy==1.26.0", # 1.23.2
         "requests==2.28.0", # 2.28.1
         "psycopg2-binary==2.9.3", #2.9.3 what is pscopg-binary 3.0.16
         "click==8.0.3", # 8.1.3
@@ -66,26 +66,27 @@
         "distributed >= 2021.03.0", # 2022.8.1
         "nltk==3.6.6", # 3.7
         "h5py==3.10.0", # 3.7
-        "scipy==1.7.3", # 1.9.0
+        "scipy>=1.10.0", # 1.9.0
         "blinker==1.4", # 1.5
         "protobuf<3.22", # 4.21.5
         "slack==0.0.2", # 0.0.2
         "boto3==1.17.57", # 1.24.56
         "toml", # 0.10.2
-        "mistune==0.8.4", # 2.0.4
+        "mistune", # 2.0.4
         "pyYaml", # 6.0
         "redis==4.3.3", # 4.3.4
         "XlsxWriter==1.3.7", # 3.0.3
         "celery==5.2.7", # 5.2.7
         "httpx==0.23.0", # 0.23.0
         "eventlet==0.33.3",
-        "flower==1.2.0",
-        "tornado==6.1", # added because it sometimes errors when tornado is not 6.1 even though nothing we install depends on it
+        "flower==2.0.1",
+        "tornado==6.3.3", # added because it sometimes errors when tornado is not 6.1 even though nothing we install depends on it
         "pylint==2.15.5",
         "dnspython==2.2.1",
         'Werkzeug~=2.0.0',
         "pylint==2.15.5",
-        "mdpdf==0.0.18"
+        "mdpdf==0.0.18",
+        "typing-extensions==4.7.1"
     ],
     extras_require={
         "dev": [
@@ -93,11 +94,11 @@
             "pytest==6.2.5", # 7.1.2
             "toml >= 0.10.2", # 0.10.2
             "ipdb==0.13.9", # 0.13.9
-            "sphinx==4.2.0", # 5.1.1
-            "sphinx_rtd_theme==1.0.0", # 1.0.0
-            "sphinxcontrib-openapi==0.7.0", # 0.7.0
+            "sphinx==7.2.6", #4.2.0", # 5.1.1
+            "sphinx_rtd_theme==2.0.0", # 1.0.0
+            "sphinxcontrib-openapi==0.8.3", # 0.7.0
             "sphinxcontrib-redoc==1.6.0", # 1.6.0
-            "docutils==0.17.1" # 0.19
+            "docutils==0.20.1" # 0.19
         ]
     },
     entry_points={
diff --git a/tests/test_applicaton/test_db/test_models/test_augur_data/test_repo.py b/tests/test_applicaton/test_db/test_models/test_augur_data/test_repo.py
index bf22254244..dd1ef44b79 100644
--- a/tests/test_applicaton/test_db/test_models/test_augur_data/test_repo.py
+++ b/tests/test_applicaton/test_db/test_models/test_augur_data/test_repo.py
@@ -77,20 +77,20 @@ def test_insert_repo(test_db_engine):

         with DatabaseSession(logger, test_db_engine) as session:

-            assert Repo.insert(session, data["repo_urls"][0], data["rg_id"], data["tool_source"]) is not None
-            assert Repo.insert(session, data["repo_urls"][1], data["rg_id"], data["tool_source"]) is not None
+            assert Repo.insert_github_repo(session, data["repo_urls"][0], data["rg_id"], data["tool_source"], None) is not None
+            assert Repo.insert_github_repo(session, data["repo_urls"][1], data["rg_id"], data["tool_source"], None) is not None

             # invalid rg_id
-            assert Repo.insert(session, data["repo_urls"][0], 12, data["tool_source"]) is None
+            assert Repo.insert_github_repo(session, data["repo_urls"][0], 12, data["tool_source"], None) is None

             # invalid type for repo url
-            assert Repo.insert(session, 1, data["rg_id"], data["tool_source"]) is None
+            assert Repo.insert_github_repo(session, 1, data["rg_id"], data["tool_source"], None) is None

             # invalid type for rg_id
-            assert Repo.insert(session, data["repo_urls"][1], "1", data["tool_source"]) is None
+            assert Repo.insert_github_repo(session, data["repo_urls"][1], "1", data["tool_source"], None) is None

             # invalid type for tool_source
-            assert Repo.insert(session, data["repo_urls"][1], data["rg_id"], 52) is None
+            assert Repo.insert_github_repo(session, data["repo_urls"][1], data["rg_id"], 52, None) is None

     with test_db_engine.connect() as connection:
diff --git a/tests/test_applicaton/test_db/test_models/test_augur_operations/test_user_repo.py b/tests/test_applicaton/test_db/test_models/test_augur_operations/test_user_repo.py
index 3fc5451791..4b288cbabb 100644
--- a/tests/test_applicaton/test_db/test_models/test_augur_operations/test_user_repo.py
+++ b/tests/test_applicaton/test_db/test_models/test_augur_operations/test_user_repo.py
@@ -124,7 +124,7 @@ def test_add_frontend_repos_with_invalid_repo(test_db_engine):

         with GithubTaskSession(logger, test_db_engine) as session:

-            result = UserRepo.add(session, url, data["user_id"], data["user_group_name"])
+            result = UserRepo.add_github_repo(session, url, data["user_id"], data["user_group_name"])

             assert result[1]["status"] == "Invalid repo"

@@ -163,11 +163,11 @@ def test_add_frontend_repos_with_duplicates(test_db_engine):

         with GithubTaskSession(logger, test_db_engine) as session:

-            result = UserRepo.add(session, url, data["user_id"], data["user_group_name"])
-            result2 = UserRepo.add(session, url, data["user_id"], data["user_group_name"])
+            result = UserRepo.add_github_repo(session, url, data["user_id"], data["user_group_name"])
+            result2 = UserRepo.add_github_repo(session, url, data["user_id"], data["user_group_name"])

             # add repo with invalid group name
-            result3 = UserRepo.add(session, url, data["user_id"], "Invalid group name")
+            result3 = UserRepo.add_github_repo(session, url, data["user_id"], "Invalid group name")

             assert result[1]["status"] == "Repo Added"
             assert result2[1]["status"] == "Repo Added"
@@ -263,11 +263,11 @@ def test_add_frontend_org_with_invalid_org(test_db_engine):

         with GithubTaskSession(logger, test_db_engine) as session:
             url = f"https://github.com/{data['org_name']}/"
-            result = UserRepo.add_org_repos(session, url, data["user_id"], data["user_group_name"])
+            result = UserRepo.add_github_org_repos(session, url, data["user_id"], data["user_group_name"])
             assert result[1]["status"] == "Invalid owner url"

             # test with invalid group name
-            result = UserRepo.add_org_repos(session, url, data["user_id"], "Invalid group name")
+            result = UserRepo.add_github_org_repos(session, url, data["user_id"], "Invalid group name")
             assert result[1]["status"] == "Invalid group name"

     with test_db_engine.connect() as connection:
@@ -305,7 +305,7 @@ def test_add_frontend_org_with_valid_org(test_db_engine):

         with GithubTaskSession(logger, test_db_engine) as session:
             url = "https://github.com/{}/".format(data["org_name"])
-            result = UserRepo.add_org_repos(session, url, data["user_id"], data["user_group_name"])
+            result = UserRepo.add_github_org_repos(session, url, data["user_id"], data["user_group_name"])
             assert result[1]["status"] == "Org repos added"

     with test_db_engine.connect() as connection:
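# The test changes above reflect renames introduced elsewhere in this PR:
# Repo.insert -> Repo.insert_github_repo (with one extra trailing argument,
# passed as None throughout the updated tests), and UserRepo.add /
# UserRepo.add_org_repos -> UserRepo.add_github_repo / UserRepo.add_github_org_repos.
# A hedged migration sketch for calling code: the session objects are assumed to
# be the same DatabaseSession / GithubTaskSession instances the tests construct,
# the URL, IDs, and group name are placeholder values, and the meaning of the new
# trailing argument is not shown in this diff, so it is forwarded as None here too.
from augur.application.db.models import Repo, UserRepo  # assumed import path

def migrate_repo_calls(db_session, github_session, rg_id, user_id, group_name, tool_source="Test tool"):
    url = "https://github.com/chaoss/augur"  # placeholder repo URL

    # Before: Repo.insert(db_session, url, rg_id, tool_source)
    repo_id = Repo.insert_github_repo(db_session, url, rg_id, tool_source, None)

    # Before: UserRepo.add(github_session, url, user_id, group_name)
    result = UserRepo.add_github_repo(github_session, url, user_id, group_name)
    assert result[1]["status"] == "Repo Added"

    # Before: UserRepo.add_org_repos(github_session, owner_url, user_id, group_name)
    org_result = UserRepo.add_github_org_repos(github_session, "https://github.com/chaoss/", user_id, group_name)
    assert org_result[1]["status"] == "Org repos added"

    return repo_id, result, org_result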