diff --git a/.docker-setup.sh b/.docker-setup.sh deleted file mode 100755 index dfdc797a38..0000000000 --- a/.docker-setup.sh +++ /dev/null @@ -1,100 +0,0 @@ -#!/bin/bash -#automate the small things for setting up docker containers -#This file sets up the backend and the frontend and optional database container. -#NOTE: The frontend is currently under construction. -# -#The script is needed to handle: -# -Environment variables for -# -Runtime values needing to be set and accurate for the backend's database credentials and github api key -# -Pre-runtime values needing to be set and accurate for the database hostname and type (whether it is test data or not). The ip address needs to be added in the extra_hosts argument of the yml markup. -# -Setting up a network alias in order to let the docker container communicate with local hosts. -# -Easily seeing console output and process statistics from one convienient window. -# -Easily save console output to logs. -# -#This file uses two environment files -# - One called docker_env.txt which holds the runtime enviroment variables that the container itself uses -# - One called .env which holds the environment variables that docker-compose.yml uses and holds the database type. -#TODO: - #Let users know how to configure the database to work for local connection because its not *that* clear right now. Its in the docs at least. - #Make container work with gitlab key - #Test this script on macOS -# -missingModules="" - -#Check everything that needs to be in the $PATH is in there. -#Bash doesn't let this work if this is in an if statement for some reason it has to be chained -type -P "docker" &>/dev/null && echo "docker found..." || missingModules="${missingModules} docker" -type -P "ifconfig" &>/dev/null && echo "ifconfig found..." || missingModules="${missingModules} ifconfig (part of net-tools)" -type -P "psql" &>/dev/null && echo "psql found..." || missingModules="${missingModules} psql" -type -P "watch" &>/dev/null && echo "watch found..." || missingModules="${missingModules} watch" - -if [ ! -z "$missingModules" ] -then - echo "One or more modules required to run this script is missing or not in your \$PATH:" - echo "Note: OSX users will need to install watch with \"brew install watch\"" - echo "Including:$missingModules" - exit 1 -fi -unset $missingModules - -if [ "$EUID" -ne 0 ]; - then echo "Please run as root" - exit 1 -fi - -#Always use a clean .env file because it is a subset of docker_env.txt so we can just generate it from that. -if [[ -f ".env" ]] -then - rm .env -fi -touch .env - -#This is differant for MacOS -#Script uses an alias for localhost that is the below ip -echo "Setting up network alias..." -#Check kernel for OS, assumes either linux or macOS -if [ "$(uname -s)" == "Linux" ] -then - ifconfig lo:0 10.254.254.254 - ifconfig lo:0 - echo "Linux detected..." -else - ifconfig lo0 alias 10.254.254.254 - ifconfig lo0 -fi - -#Prompt for deploy type. -echo "Types of docker deployment: " -echo -echo "1. Deploy the backend using docker connected to a non-docker database." -echo "2. Deploy backend and database together in docker containers." -echo "3. Deploy the backend and database together in docker containers using premade test data." -echo -read -p "Would you like to use : " deployChoice - -case $deployChoice in - - 1) - #Start script to set up just two containers - exec scripts/docker/docker-setup-external.sh - ;; - - 2) - #Start script to set up all three containers. - #Set env variable to not use test data - echo "AUGUR_DB_TYPE=database" >> .env - exec scripts/docker/docker-setup-database.sh - ;; - - 3) - #Start script to set up all three containers - #Set env variable to use test data. - echo "AUGUR_DB_TYPE=test_data" >> .env - exec scripts/docker/docker-setup-database.sh - ;; - - *) - echo "Invalid choice!" - exit 1 - ;; -esac diff --git a/.github/workflows/auto_merge.yml b/.github/workflows/auto_merge.yml new file mode 100644 index 0000000000..d3e0e6f59f --- /dev/null +++ b/.github/workflows/auto_merge.yml @@ -0,0 +1,27 @@ +name: Merge main into dev + +on: + push: + branches: + - main + +jobs: + update-dev: + permissions: write-all + name: update-dev + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - run: | + git config user.name 'GitHub Actions' + git config user.email 'actions@users.noreply.github.com' + git checkout dev + git merge main + echo "Done with merge" + - name: Push to dev + uses: CasperWA/push-protected@v2 + with: + token: ${{ secrets.GITHUB_TOKEN }} + branch: dev \ No newline at end of file diff --git a/.pylintrc b/.pylintrc index 0056af873b..aec2f59d4c 100644 --- a/.pylintrc +++ b/.pylintrc @@ -12,7 +12,7 @@ #refactoring checker #enable=R -disable=E0611,E1101,W1203,R0801,W0614,W0611,C0411,C0103,C0301,C0303,C0304,C0305,W0311,E0401 +disable=E0611,E1101,W1203,R0801,W0614,W0611,C0411,C0103,C0301,C0303,C0304,C0305,W0311,E0401,C0116 # Analyse import fallback blocks. This can be used to support both Python 2 and diff --git a/README.md b/README.md index 13fbe0dca3..02ec125fb6 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Augur NEW Release v0.62.4 +# Augur NEW Release v0.70.0 Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data. Less data carpentry for everyone else! The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot) ... A public instance of 8Knot is available at https://metrix.chaoss.io ... That is tied to a public instance of Augur at https://ai.chaoss.io @@ -10,7 +10,8 @@ The primary way of looking at Augur data is through [8Knot](https://github.com/o ## NEW RELEASE ALERT! ### [If you want to jump right in, updated docker build/compose and bare metal installation instructions are available here](docs/new-install.md) -Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.62.4 +Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.70.0 + - The `main` branch is a stable version of our new architecture, which features: - Dramatic improvement in the speed of large scale data collection (100,000+ repos). All data is obtained for 100k+ repos within 2 weeks. - A new job management architecture that uses Celery and Redis to manage queues, and enables users to run a Flower job monitoring dashboard diff --git a/augur/api/gunicorn_conf.py b/augur/api/gunicorn_conf.py index cabf88e5c5..09c21161a0 100644 --- a/augur/api/gunicorn_conf.py +++ b/augur/api/gunicorn_conf.py @@ -1,47 +1,48 @@ # from augur import ROOT_AUGUR_DIRECTORY import multiprocessing import logging -import os from pathlib import Path from glob import glob -import shutil -from augur.application.db.session import DatabaseSession -from augur.application.config import AugurConfig +from augur.application.db.lib import get_value, get_section +from augur.application.db import dispose_database_engine logger = logging.getLogger(__name__) -with DatabaseSession(logger) as session: - augur_config = AugurConfig(logger, session) - - - # ROOT_AUGUR_DIRECTORY = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) - # base_log_dir = ROOT_AUGUR_DIRECTORY + "/logs/" +# ROOT_AUGUR_DIRECTORY = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + +# base_log_dir = ROOT_AUGUR_DIRECTORY + "/logs/" + +# Path(base_log_dir).mkdir(exist_ok=True) - # Path(base_log_dir).mkdir(exist_ok=True) +workers = multiprocessing.cpu_count() * 2 + 1 +umask = 0o007 +reload = True +reload_extra_files = glob(str(Path.cwd() / '**/*.j2'), recursive=True) - workers = multiprocessing.cpu_count() * 2 + 1 - umask = 0o007 - reload = True - reload_extra_files = glob(str(Path.cwd() / '**/*.j2'), recursive=True) +# set the log location for gunicorn +logs_directory = get_value('Logging', 'logs_directory') +accesslog = f"{logs_directory}/gunicorn.log" +errorlog = f"{logs_directory}/gunicorn.log" - # set the log location for gunicorn - logs_directory = augur_config.get_value('Logging', 'logs_directory') - accesslog = f"{logs_directory}/gunicorn.log" - errorlog = f"{logs_directory}/gunicorn.log" +ssl_bool = get_value('Server', 'ssl') + +if ssl_bool is True: + + workers = int(get_value('Server', 'workers')) + bind = '%s:%s' % (get_value("Server", "host"), get_value("Server", "port")) + timeout = int(get_value('Server', 'timeout')) + certfile = str(get_value('Server', 'ssl_cert_file')) + keyfile = str(get_value('Server', 'ssl_key_file')) + +else: + workers = int(get_value('Server', 'workers')) + bind = '%s:%s' % (get_value("Server", "host"), get_value("Server", "port")) + timeout = int(get_value('Server', 'timeout')) - ssl_bool = augur_config.get_value('Server', 'ssl') - if ssl_bool is True: +def worker_exit(server, worker): + print("Stopping gunicorn worker process") + dispose_database_engine() - workers = int(augur_config.get_value('Server', 'workers')) - bind = '%s:%s' % (augur_config.get_value("Server", "host"), augur_config.get_value("Server", "port")) - timeout = int(augur_config.get_value('Server', 'timeout')) - certfile = str(augur_config.get_value('Server', 'ssl_cert_file')) - keyfile = str(augur_config.get_value('Server', 'ssl_key_file')) - - else: - workers = int(augur_config.get_value('Server', 'workers')) - bind = '%s:%s' % (augur_config.get_value("Server", "host"), augur_config.get_value("Server", "port")) - timeout = int(augur_config.get_value('Server', 'timeout')) diff --git a/augur/api/metrics/README.md b/augur/api/metrics/README.md index 5990291bf1..97d90ebbcb 100644 --- a/augur/api/metrics/README.md +++ b/augur/api/metrics/README.md @@ -14,7 +14,6 @@ import datetime import sqlalchemy as s import pandas as pd from augur.api.util import register_metric -from augur.application.db.engine import engine ``` 3. Defining the function 1. Add the decorator @register_metric to the function diff --git a/augur/api/metrics/commit.py b/augur/api/metrics/commit.py index 41d86abbff..a8e12dca70 100644 --- a/augur/api/metrics/commit.py +++ b/augur/api/metrics/commit.py @@ -6,9 +6,9 @@ import datetime import sqlalchemy as s import pandas as pd -from augur.api.util import register_metric +from flask import current_app -from ..server import engine +from augur.api.util import register_metric @register_metric() def committers(repo_group_id, repo_id=None, begin_date=None, end_date=None, period='month'): @@ -90,7 +90,7 @@ def committers(repo_group_id, repo_id=None, begin_date=None, end_date=None, peri """ ) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(committersSQL, conn, params={'repo_id': repo_id, 'repo_group_id': repo_group_id,'begin_date': begin_date, 'end_date': end_date, 'period':period}) @@ -168,7 +168,7 @@ def annual_commit_count_ranked_by_new_repo_in_repo_group(repo_group_id, repo_id= ORDER BY YEAR ASC """.format(table, period)) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(cdRgNewrepRankedCommitsSQL, conn, params={'repo_id': repo_id, 'repo_group_id': repo_group_id,'begin_date': begin_date, 'end_date': end_date}) return results @@ -267,7 +267,7 @@ def annual_commit_count_ranked_by_repo_in_repo_group(repo_group_id, repo_id=None LIMIT 10 """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(cdRgTpRankedCommitsSQL, conn, params={ "repo_group_id": repo_group_id, "repo_id": repo_id}) return results @@ -299,7 +299,7 @@ def top_committers(repo_group_id, repo_id=None, year=None, threshold=0.8): ORDER BY patches DESC) a """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(total_commits_SQL, conn, params={'year': year, 'repo_group_id': repo_group_id}) else: @@ -312,7 +312,7 @@ def top_committers(repo_group_id, repo_id=None, year=None, threshold=0.8): ORDER BY patches DESC) a """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(total_commits_SQL, conn, params={'year': year, 'repo_id': repo_id}) @@ -339,7 +339,7 @@ def top_committers(repo_group_id, repo_id=None, year=None, threshold=0.8): ORDER BY commits DESC """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(committers_SQL, conn, params={'year': year, 'repo_group_id': repo_group_id}) else: @@ -359,7 +359,7 @@ def top_committers(repo_group_id, repo_id=None, year=None, threshold=0.8): ORDER BY commits DESC """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(committers_SQL, conn, params={'year': year, 'repo_id': repo_id}) diff --git a/augur/api/metrics/contributor.py b/augur/api/metrics/contributor.py index 3f25236d0f..b89e36e76c 100644 --- a/augur/api/metrics/contributor.py +++ b/augur/api/metrics/contributor.py @@ -6,10 +6,10 @@ import datetime import sqlalchemy as s import pandas as pd +from flask import current_app + from augur.api.util import register_metric -import uuid -from ..server import engine @register_metric() def contributors(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=None): @@ -125,7 +125,7 @@ def contributors(repo_group_id, repo_id=None, period='day', begin_date=None, end ORDER BY total DESC """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(contributorsSQL, conn, params={'repo_id': repo_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) else: @@ -212,7 +212,7 @@ def contributors(repo_group_id, repo_id=None, period='day', begin_date=None, end ORDER BY total DESC """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(contributorsSQL, conn, params={'repo_group_id': repo_group_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) return results @@ -283,7 +283,7 @@ def contributors_new(repo_group_id, repo_id=None, period='day', begin_date=None, GROUP BY date, repo.repo_id, repo_name """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(contributorsNewSQL, conn, params={'repo_id': repo_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) else: @@ -333,7 +333,7 @@ def contributors_new(repo_group_id, repo_id=None, period='day', begin_date=None, GROUP BY date, repo.repo_id, repo_name """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(contributorsNewSQL, conn, params={'repo_group_id': repo_group_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) return results @@ -355,7 +355,7 @@ def lines_changed_by_author(repo_group_id, repo_id=None): GROUP BY commits.repo_id, date_trunc('week', cmt_author_date::date), cmt_author_affiliation, cmt_author_email, repo_name ORDER BY date_trunc('week', cmt_author_date::date) ASC; """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(linesChangedByAuthorSQL, conn, params={"repo_id": repo_id}) return results else: @@ -367,7 +367,7 @@ def lines_changed_by_author(repo_group_id, repo_id=None): GROUP BY repo_id, date_trunc('week', cmt_author_date::date), cmt_author_affiliation, cmt_author_email ORDER BY date_trunc('week', cmt_author_date::date) ASC; """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(linesChangedByAuthorSQL, conn, params={"repo_group_id": repo_group_id}) return results @@ -426,7 +426,7 @@ def contributors_code_development(repo_group_id, repo_id=None, period='all', beg GROUP BY a.email, a.repo_id, repo_name """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(contributorsSQL, conn, params={'repo_id': repo_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) else: @@ -462,7 +462,7 @@ def contributors_code_development(repo_group_id, repo_id=None, period='all', beg ORDER BY commits desc, email """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(contributorsSQL, conn, params={'repo_group_id': repo_group_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) return results diff --git a/augur/api/metrics/deps.py b/augur/api/metrics/deps.py index d92371d896..909ae4cb51 100644 --- a/augur/api/metrics/deps.py +++ b/augur/api/metrics/deps.py @@ -5,10 +5,11 @@ import sqlalchemy as s import pandas as pd -from augur.api.util import register_metric import datetime +from flask import current_app + +from augur.api.util import register_metric -from ..server import engine @register_metric() def deps(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=None): @@ -46,8 +47,8 @@ def deps(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=No AND repo_dependencies.repo_id = :repo_id """) - with engine.connect() as conn: - results = pd.read_sql(depsSQL, conn) + with current_app.engine.connect() as conn: + results = pd.read_sql(depsSQL, conn, params={'repo_id': repo_id}) else: @@ -71,8 +72,8 @@ def deps(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=No AND repo.repo_group_id = :repo_group_id """) - with engine.connect() as conn: - results = pd.read_sql(depsSQL, conn) + with current_app.engine.connect() as conn: + results = pd.read_sql(depsSQL, conn, params={'repo_group_id': repo_group_id}) return results diff --git a/augur/api/metrics/insight.py b/augur/api/metrics/insight.py index 848161e1a8..e5dad61829 100644 --- a/augur/api/metrics/insight.py +++ b/augur/api/metrics/insight.py @@ -7,7 +7,7 @@ import pandas as pd from augur.api.util import register_metric -from ..server import engine +from flask import current_app @register_metric(type="repo_group_only") def top_insights(repo_group_id, num_repos=6): @@ -29,6 +29,6 @@ def top_insights(repo_group_id, num_repos=6): LIMIT :num_repos ) """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(topInsightsSQL, conn, params={'repo_group_id': repo_group_id, 'num_repos': num_repos}) return results diff --git a/augur/api/metrics/issue.py b/augur/api/metrics/issue.py index 22ee2630b5..3410fbf806 100644 --- a/augur/api/metrics/issue.py +++ b/augur/api/metrics/issue.py @@ -6,9 +6,10 @@ import datetime import sqlalchemy as s import pandas as pd +from flask import current_app + from augur.api.util import register_metric -from ..server import engine @register_metric() def issues_first_time_opened(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=None): @@ -51,7 +52,7 @@ def issues_first_time_opened(repo_group_id, repo_id=None, period='day', begin_da ORDER BY issue_date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(issueNewContributor, conn, params={'repo_id': repo_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) else: @@ -78,7 +79,7 @@ def issues_first_time_opened(repo_group_id, repo_id=None, period='day', begin_da GROUP BY repo.repo_id, issue_date ORDER BY issue_date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(issueNewContributor, conn, params={'repo_group_id': repo_group_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) @@ -122,7 +123,7 @@ def issues_first_time_closed(repo_group_id, repo_id=None, period='day', begin_da ) AS iss_close GROUP BY issue_date, repo_name """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(issuesClosedSQL, conn, params={'repo_id': repo_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) else: @@ -146,7 +147,7 @@ def issues_first_time_closed(repo_group_id, repo_id=None, period='day', begin_da GROUP BY repo_id, repo_name,issue_date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(issuesClosedSQL, conn, params={'repo_group_id': repo_group_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) @@ -185,7 +186,7 @@ def issues_new(repo_group_id, repo_id=None, period='day', begin_date=None, end_d ORDER BY issues.repo_id, date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(issues_new_SQL, conn, params={'repo_group_id': repo_group_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) @@ -205,7 +206,7 @@ def issues_new(repo_group_id, repo_id=None, period='day', begin_date=None, end_d ORDER BY date; """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(issues_new_SQL, conn, params={'repo_id': repo_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) return results @@ -243,7 +244,7 @@ def issues_active(repo_group_id, repo_id=None, period='day', begin_date=None, en ORDER BY issues.repo_id, date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(issues_active_SQL, conn, params={'repo_group_id': repo_group_id, 'period':period, 'begin_date': begin_date, 'end_date':end_date}) @@ -263,7 +264,7 @@ def issues_active(repo_group_id, repo_id=None, period='day', begin_date=None, en ORDER BY date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(issues_active_SQL, conn, params={'repo_id': repo_id, 'period':period, 'begin_date': begin_date, 'end_date':end_date}) return results @@ -300,7 +301,7 @@ def issues_closed(repo_group_id, repo_id=None, period='day', begin_date=None, en ORDER BY issues.repo_id, date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(issues_closed_SQL, conn, params={'repo_group_id': repo_group_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) @@ -319,7 +320,7 @@ def issues_closed(repo_group_id, repo_id=None, period='day', begin_date=None, en ORDER BY date; """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(issues_closed_SQL, conn, params={'repo_id': repo_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) @@ -359,7 +360,7 @@ def issue_duration(repo_group_id, repo_id=None, begin_date=None, end_date=None): ORDER BY repo_id, issue_id """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(issue_duration_SQL, conn, params={'repo_group_id': repo_group_id, 'begin_date': begin_date, 'end_date': end_date}) @@ -384,7 +385,7 @@ def issue_duration(repo_group_id, repo_id=None, begin_date=None, end_date=None): ORDER BY issue_id; """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(issue_duration_SQL, conn, params={'repo_id': repo_id, 'begin_date': begin_date, 'end_date': end_date}) @@ -431,7 +432,7 @@ def issue_participants(repo_group_id, repo_id=None, begin_date=None, end_date=No ORDER BY issues.repo_id, issues.created_at """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: result = pd.read_sql(issue_participants_SQL, conn, params={'repo_group_id': repo_group_id, 'begin_date': begin_date, 'end_date': end_date}) @@ -460,7 +461,7 @@ def issue_participants(repo_group_id, repo_id=None, begin_date=None, end_date=No ORDER BY issues.created_at """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: result = pd.read_sql(issue_participants_SQL, conn, params={'repo_id': repo_id, 'begin_date': begin_date, 'end_date': end_date}) @@ -485,7 +486,7 @@ def issue_backlog(repo_group_id, repo_id=None): ORDER BY issues.repo_id """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: result = pd.read_sql(issue_backlog_SQL, conn, params={'repo_group_id': repo_group_id}) return result @@ -499,7 +500,7 @@ def issue_backlog(repo_group_id, repo_id=None): GROUP BY repo_name """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: result = pd.read_sql(issue_backlog_SQL, conn, params={'repo_id': repo_id}) return result @@ -528,7 +529,7 @@ def issue_throughput(repo_group_id, repo_id=None): AND table1.repo_id = repo.repo_id """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(issue_throughput_SQL, conn, params={'repo_group_id': repo_group_id}) return results @@ -545,7 +546,7 @@ def issue_throughput(repo_group_id, repo_id=None): WHERE table1.repo_id = repo.repo_id """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: result = pd.read_sql(issue_throughput_SQL, conn, params={'repo_id': repo_id}) return result @@ -595,7 +596,7 @@ def issues_open_age(repo_group_id, repo_id=None, period='day', begin_date=None, ORDER BY open_date DESC """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(openAgeSQL, conn, params={'repo_id': repo_id, 'repo_group_id': repo_group_id, 'period': period, 'begin_date':begin_date, 'end_date':end_date}) @@ -656,7 +657,7 @@ def issues_closed_resolution_duration(repo_group_id, repo_id=None, period='day', ORDER BY gh_issue_number """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(issueSQL, conn, params={'repo_id': repo_id, 'repo_group_id': repo_group_id, @@ -690,7 +691,7 @@ def average_issue_resolution_time(repo_group_id, repo_id=None): """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(avg_issue_resolution_SQL, conn, params={'repo_group_id': repo_group_id}) return results @@ -707,7 +708,7 @@ def average_issue_resolution_time(repo_group_id, repo_id=None): GROUP BY repo.repo_name """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(avg_issue_resolution_SQL, conn, params={'repo_id': repo_id}) return results @@ -782,7 +783,7 @@ def issues_maintainer_response_duration(repo_group_id, repo_id=None, begin_date= group by repo_id, repo_name """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(issuesSQL, conn, params={'repo_id': repo_id, 'repo_group_id': repo_group_id,'begin_date': begin_date, 'end_date': end_date}) return results @@ -806,7 +807,7 @@ def open_issues_count(repo_group_id, repo_id=None): GROUP BY date, repo_groups.rg_name ORDER BY date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(openIssueCountSQL, conn, params={'repo_group_id': repo_group_id}) return results else: @@ -821,7 +822,7 @@ def open_issues_count(repo_group_id, repo_id=None): GROUP BY date, repo.repo_id ORDER BY date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(openIssueCountSQL, conn, params={'repo_id': repo_id}) return results @@ -845,7 +846,7 @@ def closed_issues_count(repo_group_id, repo_id=None): GROUP BY date, repo_groups.rg_name ORDER BY date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(closedIssueCountSQL, conn, params={'repo_group_id': repo_group_id}) return results else: @@ -860,7 +861,7 @@ def closed_issues_count(repo_group_id, repo_id=None): GROUP BY date, repo.repo_id ORDER BY date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(closedIssueCountSQL, conn, params={'repo_id': repo_id}) return results @@ -923,7 +924,7 @@ def issue_comments_mean(repo_group_id, repo_id=None, group_by='week'): else: raise ValueError("Incorrect value for 'group_by'") - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(issue_comments_mean_std_SQL, conn, params={'repo_group_id': repo_group_id}) return results @@ -977,7 +978,7 @@ def issue_comments_mean(repo_group_id, repo_id=None, group_by='week'): else: raise ValueError("Incorrect value for 'group_by'") - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(issue_comments_mean_std_SQL, conn, params={'repo_id': repo_id}) return results @@ -1010,7 +1011,7 @@ def issue_comments_mean_std(repo_group_id, repo_id=None, group_by='week'): """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(issue_comments_mean_std_SQL, conn, params={'repo_group_id': repo_group_id, 'group_by': group_by}) @@ -1039,7 +1040,7 @@ def issue_comments_mean_std(repo_group_id, repo_id=None, group_by='week'): ORDER BY date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(issue_comments_mean_std_SQL, conn, params={'repo_id': repo_id, 'group_by': group_by}) return results @@ -1091,7 +1092,7 @@ def abandoned_issues(repo_group_id, repo_id=None, period='day', begin_date=None, ''' ) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(abandonedSQL, conn, params={'repo_id': repo_id, 'repo_group_id': repo_group_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) return results diff --git a/augur/api/metrics/message.py b/augur/api/metrics/message.py index 9988f5a0d5..78a8338d97 100644 --- a/augur/api/metrics/message.py +++ b/augur/api/metrics/message.py @@ -7,9 +7,10 @@ import datetime import sqlalchemy as s import pandas as pd +from flask import current_app + from augur.api.util import register_metric -from ..server import engine @register_metric() def repo_messages(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=None): @@ -56,7 +57,7 @@ def repo_messages(repo_group_id, repo_id=None, period='day', begin_date=None, en """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(repomessagesSQL, conn, params={'repo_id': repo_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) else: @@ -86,7 +87,7 @@ def repo_messages(repo_group_id, repo_id=None, period='day', begin_date=None, en message_date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(repomessagesSQL, conn, params={'repo_group_id': repo_group_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) diff --git a/augur/api/metrics/pull_request.py b/augur/api/metrics/pull_request.py index 3b1798ec01..447c9557ae 100644 --- a/augur/api/metrics/pull_request.py +++ b/augur/api/metrics/pull_request.py @@ -3,12 +3,12 @@ Metrics that provide data about pull requests & their associated activity """ -import datetime +from datetime import datetime import sqlalchemy as s import pandas as pd -from augur.api.util import register_metric +from flask import current_app -from ..server import engine +from augur.api.util import register_metric @register_metric() def pull_requests_new(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=None): @@ -30,14 +30,15 @@ def pull_requests_new(repo_group_id, repo_id=None, period='day', begin_date=None if repo_id: new_pull_requests_query = s.sql.text(""" SELECT DATE_TRUNC(:period, pr_created_at) AS created_date, - COUNT(pr_id) AS new_pull_requests + COUNT(*) AS new_pull_requests FROM pull_requests WHERE repo_id = :repo_id AND pr_created_at BETWEEN :begin_date AND :end_date GROUP BY created_date """) - results = pd.read_sql(new_pull_requests_query, engine, params={'repo_id': repo_id, 'period': period, + with current_app.engine.connect() as conn: + results = pd.read_sql(new_pull_requests_query, conn, params={'repo_id': repo_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) else: @@ -50,8 +51,9 @@ def pull_requests_new(repo_group_id, repo_id=None, period='day', begin_date=None GROUP BY created_date """) - results = pd.read_sql(new_pull_requests_query, engine, - params={'repo_group_id': repo_group_id, 'period': period, + with current_app.engine.connect() as conn: + results = pd.read_sql(new_pull_requests_query, conn, + params={'repo_group_id': repo_group_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) @@ -72,7 +74,7 @@ def pull_requests_merge_contributor_new(repo_group_id, repo_id=None, period='day if not begin_date: begin_date = '1970-1-1 00:00:01' if not end_date: - end_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + end_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S') if repo_id: commitNewContributor = s.sql.text(""" @@ -87,7 +89,7 @@ def pull_requests_merge_contributor_new(repo_group_id, repo_id=None, period='day """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(commitNewContributor, conn, params={'repo_id': repo_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) @@ -106,7 +108,7 @@ def pull_requests_merge_contributor_new(repo_group_id, repo_id=None, period='day GROUP BY abc.repo_id, repo_name, commit_date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(commitNewContributor, conn, params={'repo_group_id': repo_group_id, 'period': period, 'begin_date': begin_date, @@ -128,7 +130,7 @@ def pull_requests_closed_no_merge(repo_group_id, repo_id=None, period='day', beg if not begin_date: begin_date = '1970-1-1 00:00:01' if not end_date: - end_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + end_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S') if repo_id: closedNoMerge = s.sql.text(""" @@ -144,7 +146,7 @@ def pull_requests_closed_no_merge(repo_group_id, repo_id=None, period='day', beg - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(closedNoMerge, conn, params={'repo_id': repo_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) @@ -159,7 +161,7 @@ def pull_requests_closed_no_merge(repo_group_id, repo_id=None, period='day', beg ORDER BY closed_date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(closedNoMerge, conn, params={'repo_group_id': repo_group_id, 'period': period, 'begin_date': begin_date, @@ -180,7 +182,7 @@ def reviews(repo_group_id, repo_id=None, period='day', begin_date=None, end_date if not begin_date: begin_date = '1970-1-1' if not end_date: - end_date = datetime.datetime.now().strftime('%Y-%m-%d') + end_date = datetime.now().strftime('%Y-%m-%d') if not repo_id: reviews_SQL = s.sql.text(""" @@ -200,7 +202,7 @@ def reviews(repo_group_id, repo_id=None, period='day', begin_date=None, end_date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(reviews_SQL, conn, params={'period': period, 'repo_group_id': repo_group_id, 'begin_date': begin_date, 'end_date': end_date }) @@ -221,7 +223,7 @@ def reviews(repo_group_id, repo_id=None, period='day', begin_date=None, end_date ORDER BY date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(reviews_SQL, conn, params={'period': period, 'repo_id': repo_id, 'begin_date': begin_date, 'end_date': end_date}) @@ -241,7 +243,7 @@ def reviews_accepted(repo_group_id, repo_id=None, period='day', begin_date=None, if not begin_date: begin_date = '1970-1-1' if not end_date: - end_date = datetime.datetime.now().strftime('%Y-%m-%d') + end_date = datetime.now().strftime('%Y-%m-%d') if not repo_id: reviews_accepted_SQL = s.sql.text(""" @@ -261,7 +263,7 @@ def reviews_accepted(repo_group_id, repo_id=None, period='day', begin_date=None, ORDER BY pull_requests.repo_id, date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(reviews_accepted_SQL, conn, params={'period': period, 'repo_group_id': repo_group_id, 'begin_date': begin_date, 'end_date': end_date}) @@ -282,7 +284,7 @@ def reviews_accepted(repo_group_id, repo_id=None, period='day', begin_date=None, ORDER BY date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(reviews_accepted_SQL, conn, params={'period': period, 'repo_id': repo_id, 'begin_date': begin_date, 'end_date': end_date}) @@ -302,7 +304,7 @@ def reviews_declined(repo_group_id, repo_id=None, period='day', begin_date=None, if not begin_date: begin_date = '1970-1-1' if not end_date: - end_date = datetime.datetime.now().strftime('%Y-%m-%d') + end_date = datetime.now().strftime('%Y-%m-%d') if not repo_id: reviews_declined_SQL = s.sql.text(""" @@ -322,7 +324,7 @@ def reviews_declined(repo_group_id, repo_id=None, period='day', begin_date=None, ORDER BY pull_requests.repo_id, date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(reviews_declined_SQL, conn, params={'period': period, 'repo_group_id': repo_group_id, 'begin_date': begin_date, 'end_date': end_date }) @@ -343,7 +345,7 @@ def reviews_declined(repo_group_id, repo_id=None, period='day', begin_date=None, ORDER BY date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(reviews_declined_SQL, conn, params={'period': period, 'repo_id': repo_id, 'begin_date': begin_date, 'end_date': end_date}) @@ -362,7 +364,7 @@ def review_duration(repo_group_id, repo_id=None, begin_date=None, end_date=None) if not begin_date: begin_date = '1970-1-1' if not end_date: - end_date = datetime.datetime.now().strftime('%Y-%m-%d') + end_date = datetime.now().strftime('%Y-%m-%d') if not repo_id: review_duration_SQL = s.sql.text(""" @@ -383,7 +385,7 @@ def review_duration(repo_group_id, repo_id=None, begin_date=None, end_date=None) ORDER BY pull_requests.repo_id, pull_requests.pull_request_id """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(review_duration_SQL, conn, params={'repo_group_id': repo_group_id, 'begin_date': begin_date, @@ -407,7 +409,7 @@ def review_duration(repo_group_id, repo_id=None, begin_date=None, end_date=None) ORDER BY pull_requests.repo_id, pull_request_id """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(review_duration_SQL, conn, params={'repo_id': repo_id, 'begin_date': begin_date, @@ -427,7 +429,7 @@ def pull_request_acceptance_rate(repo_group_id, repo_id=None, begin_date=None, e if not begin_date: begin_date = '1970-1-1 00:00:01' if not end_date: - end_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + end_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S') if not repo_id: prAccRateSQL = s.sql.text(""" @@ -461,7 +463,7 @@ def pull_request_acceptance_rate(repo_group_id, repo_id=None, begin_date=None, e ON opened.date_created = accepted.accepted_on """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(prAccRateSQL, conn, params={'repo_group_id': repo_group_id, 'group_by': group_by, 'begin_date': begin_date, 'end_date': end_date}) return results @@ -495,7 +497,7 @@ def pull_request_acceptance_rate(repo_group_id, repo_id=None, begin_date=None, e ON opened.date_created = accepted.accepted_on """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(prAccRateSQL, conn, params={'repo_id': repo_id, 'group_by': group_by, 'begin_date': begin_date, 'end_date': end_date}) return results @@ -516,7 +518,7 @@ def pull_request_average_time_to_close(repo_group_id, repo_id=None, group_by='mo if not begin_date: begin_date = '1970-1-1' if not end_date: - end_date = datetime.datetime.now().strftime('%Y-%m-%d') + end_date = datetime.now().strftime('%Y-%m-%d') unit_options = ['year', 'month', 'week', 'day'] @@ -601,10 +603,15 @@ def pull_request_average_time_to_close(repo_group_id, repo_id=None, group_by='mo - with engine.connect() as conn: + with current_app.engine.connect() as conn: pr_all = pd.read_sql(pr_all_SQL, conn, params={'repo_id': repo_id, 'repo_group_id':repo_group_id, 'begin_date': begin_date, 'end_date': end_date}) + + + if pr_all.empty: + return [] + if not repo_id: pr_avg_time_to_close = pr_all.groupby(['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys).mean().reset_index()[['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys + ['average_{}_to_close'.format(time_unit)]] else: @@ -632,7 +639,7 @@ def pull_request_merged_status_counts(repo_group_id, repo_id=None, begin_date='1 if not begin_date: begin_date = '1970-1-1' if not end_date: - end_date = datetime.datetime.now().strftime('%Y-%m-%d') + end_date = datetime.now().strftime('%Y-%m-%d') unit_options = ['year', 'month', 'week', 'day'] @@ -714,10 +721,14 @@ def pull_request_merged_status_counts(repo_group_id, repo_id=None, begin_date='1 """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: pr_all = pd.read_sql(pr_all_SQL, conn, params={'repo_id': repo_id, 'repo_group_id':repo_group_id, 'begin_date': begin_date, 'end_date': end_date}) + + if pr_all.empty: + return [] + if not repo_id: pr_avg_time_between_responses = pr_all.groupby(['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys).mean().reset_index()[['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys + ['average_{}_between_responses'.format(time_unit)]] else: @@ -740,7 +751,7 @@ def pull_request_average_commit_counts(repo_group_id, repo_id=None, group_by='mo if not begin_date: begin_date = '1970-1-1' if not end_date: - end_date = datetime.datetime.now().strftime('%Y-%m-%d') + end_date = datetime.now().strftime('%Y-%m-%d') unit_options = ['year', 'month', 'week', 'day'] @@ -825,10 +836,14 @@ def pull_request_average_commit_counts(repo_group_id, repo_id=None, group_by='mo """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: pr_all = pd.read_sql(pr_all_SQL, conn, params={'repo_id': repo_id, 'repo_group_id':repo_group_id, 'begin_date': begin_date, 'end_date': end_date}) + + if pr_all.empty: + return [] + if not repo_id: pr_avg_commit_counts = pr_all.groupby(['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys).mean().reset_index()[['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys + ['average_commits_per_pull_request']] else: @@ -851,7 +866,7 @@ def pull_request_average_event_counts(repo_group_id, repo_id=None, group_by='mon if not begin_date: begin_date = '1970-1-1' if not end_date: - end_date = datetime.datetime.now().strftime('%Y-%m-%d') + end_date = datetime.now().strftime('%Y-%m-%d') unit_options = ['year', 'month', 'week', 'day'] @@ -985,7 +1000,7 @@ def pull_request_average_event_counts(repo_group_id, repo_id=None, group_by='mon """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: pr_all = pd.read_sql(pr_all_SQL, conn, params={'repo_id': repo_id, 'repo_group_id':repo_group_id, 'begin_date': begin_date, 'end_date': end_date}) @@ -995,6 +1010,10 @@ def pull_request_average_event_counts(repo_group_id, repo_id=None, group_by='mon for name in count_names.copy(): average_count_names.append('average_' + name) + + if pr_all.empty: + return [] + if not repo_id: pr_avg_event_counts = pr_all.groupby(['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys).mean().reset_index()[['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys + average_count_names] else: @@ -1018,7 +1037,7 @@ def pull_request_average_time_to_responses_and_close(repo_group_id, repo_id=None if not begin_date: begin_date = '1970-1-1' if not end_date: - end_date = datetime.datetime.now().strftime('%Y-%m-%d') + end_date = datetime.now().strftime('%Y-%m-%d') unit_options = ['year', 'month', 'week', 'day'] @@ -1109,11 +1128,14 @@ def pull_request_average_time_to_responses_and_close(repo_group_id, repo_id=None GROUP BY closed_year, merged_status, response_times.first_response_time, response_times.last_response_time, response_times.pr_created_at, response_times.pr_closed_at """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: pr_all = pd.read_sql(pr_all_SQL, conn, params={'repo_id': repo_id, 'repo_group_id':repo_group_id, 'begin_date': begin_date, 'end_date': end_date}) + if pr_all.empty: + return [] + if not repo_id: avg_pr_time_to_responses_and_close = pr_all.groupby(['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys).mean().reset_index()[['merged_status', 'repo_id', 'repo_name', 'repo_group_id', 'repo_group_name'] + time_group_bys + ['average_{}_to_first_response'.format(time_unit), 'average_{}_to_last_response'.format(time_unit), 'average_{}_to_close'.format(time_unit)]] else: @@ -1134,7 +1156,7 @@ def pull_request_merged_status_counts(repo_group_id, repo_id=None, begin_date='1 """ if not end_date: - end_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + end_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S') unit_options = ['year', 'month', 'week', 'day'] time_group_bys = [] @@ -1193,7 +1215,7 @@ def pull_request_merged_status_counts(repo_group_id, repo_id=None, begin_date='1 """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: pr_all = pd.read_sql(pr_all_sql, conn, params={'repo_group_id': repo_group_id, 'repo_id': repo_id, 'begin_date': begin_date, 'end_date': end_date}) diff --git a/augur/api/metrics/release.py b/augur/api/metrics/release.py index 5594f7ef08..890bb481b7 100644 --- a/augur/api/metrics/release.py +++ b/augur/api/metrics/release.py @@ -6,9 +6,9 @@ import datetime import sqlalchemy as s import pandas as pd -from augur.api.util import register_metric +from flask import current_app -from ..server import engine +from augur.api.util import register_metric @register_metric() def releases(repo_group_id, repo_id=None, period='day', begin_date=None, end_date=None): @@ -50,7 +50,7 @@ def releases(repo_group_id, repo_id=None, period='day', begin_date=None, end_dat ORDER BY releases.release_published_at DESC """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(releases_SQL, conn, params={'period': period, 'repo_group_id': repo_group_id, 'begin_date': begin_date, 'end_date': end_date }) @@ -80,7 +80,7 @@ def releases(repo_group_id, repo_id=None, period='day', begin_date=None, end_dat ORDER BY releases.release_published_at DESC """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(releases_SQL, conn, params={'period': period, 'repo_id': repo_id, 'begin_date': begin_date, 'end_date': end_date}) @@ -127,7 +127,7 @@ def tag_only_releases(repo_group_id, repo_id=None, period='day', begin_date=None ORDER BY releases.release_published_at DESC """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(releases_SQL, conn, params={'period': period, 'repo_group_id': repo_group_id, 'begin_date': begin_date, 'end_date': end_date }) @@ -150,7 +150,7 @@ def tag_only_releases(repo_group_id, repo_id=None, period='day', begin_date=None ORDER BY releases.release_published_at DESC """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(releases_SQL, conn, params={'period': period, 'repo_id': repo_id, 'begin_date': begin_date, 'end_date': end_date}) diff --git a/augur/api/metrics/repo_meta.py b/augur/api/metrics/repo_meta.py index c5d8e1138d..ffc8fc84ef 100644 --- a/augur/api/metrics/repo_meta.py +++ b/augur/api/metrics/repo_meta.py @@ -6,11 +6,11 @@ import datetime import sqlalchemy as s import pandas as pd -import math import logging +from flask import current_app from augur.api.util import register_metric -from ..server import engine + logger = logging.getLogger("augur") @@ -46,7 +46,7 @@ def code_changes(repo_group_id, repo_id=None, period='week', begin_date=None, en ORDER BY week """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(code_changes_SQL, conn, params={'repo_group_id': repo_group_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) results['week'] = results['week'].apply(lambda x: x - 1) @@ -68,7 +68,7 @@ def code_changes(repo_group_id, repo_id=None, period='week', begin_date=None, en ORDER BY week """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(code_changes_SQL, conn, params={'repo_id': repo_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) @@ -111,7 +111,7 @@ def code_changes_lines(repo_group_id, repo_id=None, period='day', begin_date=Non ORDER BY commits.repo_id, date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(code_changes_lines_SQL, conn, params={'repo_group_id': repo_group_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) @@ -131,7 +131,7 @@ def code_changes_lines(repo_group_id, repo_id=None, period='day', begin_date=Non ORDER BY date; """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(code_changes_lines_SQL, conn, params={'repo_id': repo_id, 'period': period, 'begin_date': begin_date, 'end_date': end_date}) return results @@ -164,7 +164,7 @@ def sub_projects(repo_group_id, repo_id=None, begin_date=None, end_date=None): AND repo_added BETWEEN :begin_date AND :end_date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(sub_projectsSQL, conn, params={'repo_id': repo_id, 'begin_date': begin_date, 'end_date': end_date}) else: @@ -175,7 +175,7 @@ def sub_projects(repo_group_id, repo_id=None, begin_date=None, end_date=None): AND repo_added BETWEEN :begin_date AND :end_date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(sub_projectsSQL, conn, params={'repo_group_id': repo_group_id, 'begin_date': begin_date, 'end_date': end_date}) return results @@ -197,10 +197,111 @@ def sbom_download(repo_group_id, repo_id=None): logger.debug(dosocs_SQL) params = {'repo_id': repo_id} - with engine.connect() as conn: + with current_app.engine.connect() as conn: return pd.read_sql(dosocs_SQL, conn, params=params) #return [json.dumps(license_information)] + +def calculate_nadia_project_category(unique_contributor_count,stargazers_count): + """ + Calculates the correct nadia eghbal project label based on Microsoft's cutoff + values for the taxonomy. + + A Club is a project with a lot of dev activity but not a lot of users + + A Federation is a project that has a lot of contributors and users + + A Stadium is a project with a lot of users but not a lot of devs + + A toy is a project with not a lot of devs or users + + ContribMid is a misc category. + + :param unique_contributor_count: The count of contributors the repo has + :param stargazers_count: The count of stargazers the repo has + :return: String containing the project category + """ + + ratio_stargazers_to_contribs = stargazers_count / unique_contributor_count + + if unique_contributor_count > 75 and ratio_stargazers_to_contribs < 2: + return "club" + elif unique_contributor_count > 75 and ratio_stargazers_to_contribs > 2 and stargazers_count > 1000: + return "federation" + elif unique_contributor_count < 6 and stargazers_count > 100: + return "stadium" + elif unique_contributor_count < 6 and stargazers_count < 100: + return "toy" + + #"ContribMid" is the label for repos that don't make sense in the other + #categories. Contribs > 6 and < 75 + return "contribMid" + + + +@register_metric() +def nadia_project_labeling_badge(repo_group_id, repo_id=None): + """Returns the project type of the desired repo according to + Microsoft's implementation of 'Road's and Bridges' style + project catagorization + + :param repo_group_id: The repository's repo_group_id + :param repo_id: The repository's repo_id + + :return: JSON object with project label and url to badge + """ + + if not repo_id: + return {} + + get_unique_contributor_ids_sql = s.sql.text(""" + SELECT repo_id, COUNT(*) AS repo_contributor_count FROM + ( + SELECT cntrb_id, repo_id, COUNT(*) FROM explorer_contributor_actions GROUP BY cntrb_id, repo_id + ) a + WHERE repo_id= :repo_id_param + GROUP BY repo_id + ORDER BY repo_id; + """).bindparams(repo_id_param=repo_id) + + with current_app.engine.connect() as conn: + raw_df = pd.read_sql(get_unique_contributor_ids_sql, conn) + #print(raw_df) + try: + unique_contribs = int(raw_df.at[0,'repo_contributor_count']) + except KeyError: + result = { + "nadia_badge_level": "unknown" + } + return pd.DataFrame(result, index=[0]) + + stars_count_SQL = s.sql.text(""" + SELECT repo_name, stars_count AS stars + FROM repo_info JOIN repo ON repo_info.repo_id = repo.repo_id + WHERE repo_info.repo_id = :repo_id_param + ORDER BY repo_info.data_collection_date DESC + LIMIT 1 + """).bindparams(repo_id_param=repo_id) + + with current_app.engine.connect() as conn: + raw_df = pd.read_sql(stars_count_SQL, conn) + + if raw_df.empty: + return {"status": "Not enough data"} + + stargazers_count = int(raw_df.at[0,'stars']) + repo_name = str(raw_df.at[0,'repo_name']) + + category = calculate_nadia_project_category(unique_contribs, stargazers_count) + + result = { + "repo_name" : repo_name, + "nadia_badge_level": category + } + + return pd.DataFrame(result, index=[0]) + + @register_metric() def cii_best_practices_badge(repo_group_id, repo_id=None): """Returns the CII best practices badge level @@ -226,7 +327,7 @@ def cii_best_practices_badge(repo_group_id, repo_id=None): LIMIT 1 """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: raw_df = pd.read_sql(cii_best_practices_badge_SQL, conn, params={'repo_id': repo_id}) if len(raw_df) == 0: @@ -267,7 +368,7 @@ def forks(repo_group_id, repo_id=None): ORDER BY repo_info.repo_id, date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(forks_SQL, conn, params={'repo_group_id': repo_group_id}) return results @@ -282,7 +383,7 @@ def forks(repo_group_id, repo_id=None): ORDER BY date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(forks_SQL, conn, params={'repo_id': repo_id}) return results @@ -307,7 +408,7 @@ def fork_count(repo_group_id, repo_id=None): WHERE repo_group_id = :repo_group_id) """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(fork_count_SQL, conn, params={'repo_group_id': repo_group_id}) return results else: @@ -319,7 +420,7 @@ def fork_count(repo_group_id, repo_id=None): LIMIT 1 """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(fork_count_SQL, conn, params={'repo_id': repo_id}) return results @@ -338,7 +439,7 @@ def languages(repo_group_id, repo_id=None): WHERE repo_id IN (SELECT repo_id FROM repo WHERE repo_group_id = :repo_group_id) """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(languages_SQL, conn, params={'repo_group_id': repo_group_id}) return results @@ -349,7 +450,7 @@ def languages(repo_group_id, repo_id=None): WHERE repo_id = :repo_id """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(languages_SQL, conn, params={'repo_id': repo_id}) return results @@ -386,7 +487,7 @@ def license_files(license_id, spdx_binary, repo_group_id, repo_id=None,): b.license_id in ( 369,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482)); """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(license_data_SQL, conn, params={'repo_id': repo_id, 'spdx_binary': spdx_binary, 'license_id': license_id}) return results @@ -456,7 +557,7 @@ def license_declared(repo_group_id, repo_id=None): short_name; """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(license_declared_SQL, conn, params={'repo_id': repo_id}) return results @@ -541,7 +642,7 @@ def license_coverage(repo_group_id, repo_id=None): GROUP BY a.name, a.licensed, a.licensed, b.total """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(license_declared_SQL, conn, params={'repo_id': repo_id, 'repo_group_id':repo_group_id}) return results @@ -603,7 +704,7 @@ def license_count(repo_group_id, repo_id=None): GROUP BY a.name, a.number_of_license, a.licensed, b.total """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(license_declared_SQL, conn, params={'repo_id': repo_id, 'repo_group_id':repo_group_id}) return results @@ -632,7 +733,7 @@ def stars(repo_group_id, repo_id=None): ORDER BY repo_info.repo_id, date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(stars_SQL, conn, params={'repo_group_id': repo_group_id}) return results @@ -647,7 +748,7 @@ def stars(repo_group_id, repo_id=None): ORDER BY date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(stars_SQL, conn, params={'repo_id': repo_id}) return results @@ -672,7 +773,7 @@ def stars_count(repo_group_id, repo_id=None): WHERE repo_group_id = :repo_group_id) """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(stars_count_SQL, conn, params={'repo_group_id': repo_group_id}) return results else: @@ -684,7 +785,7 @@ def stars_count(repo_group_id, repo_id=None): LIMIT 1 """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(stars_count_SQL, conn, params={'repo_id': repo_id}) return results @@ -711,7 +812,7 @@ def watchers(repo_group_id, repo_id=None): ORDER BY repo_info.repo_id, date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(watchers_SQL, conn, params={'repo_group_id': repo_group_id}) return results @@ -726,7 +827,7 @@ def watchers(repo_group_id, repo_id=None): ORDER BY date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(watchers_SQL, conn, params={'repo_id': repo_id}) return results @@ -751,7 +852,7 @@ def watchers_count(repo_group_id, repo_id=None): WHERE repo_group_id = :repo_group_id) """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(watchers_count_SQL, conn, params={'repo_group_id': repo_group_id}) return results else: @@ -763,7 +864,7 @@ def watchers_count(repo_group_id, repo_id=None): LIMIT 1 """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(watchers_count_SQL, conn, params={'repo_id': repo_id}) return results @@ -808,7 +909,7 @@ def annual_lines_of_code_count_ranked_by_new_repo_in_repo_group(repo_group_id, r LIMIT 10 """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(cdRgNewrepRankedCommitsSQL, conn, params={ "repo_group_id": repo_group_id, "repo_id": repo_id, "calendar_year": calendar_year}) return results @@ -905,7 +1006,7 @@ def annual_lines_of_code_count_ranked_by_repo_in_repo_group(repo_group_id, repo_ """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(cdRgTpRankedCommitsSQL, conn, params={ "repo_group_id": repo_group_id, "repo_id": repo_id}) return results @@ -959,7 +1060,7 @@ def lines_of_code_commit_counts_by_calendar_year_grouped(repo_url, calendar_year GROUP BY week """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(cdRepTpIntervalLocCommitsSQL, conn, params={"repourl": '%{}%'.format(repo_url), 'calendar_year': calendar_year}) return results @@ -980,7 +1081,7 @@ def average_weekly_commits(repo_group_id=None, repo_id=None, calendar_year=None) ORDER BY repo_name """.format(extra_and)) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(average_weekly_commits_sql, conn, params={"repo_group_id": repo_group_id, "repo_id": repo_id, "calendar_year": calendar_year}) return results @@ -1065,7 +1166,7 @@ def aggregate_summary(repo_group_id, repo_id=None, begin_date=None, end_date=Non ) commit_data """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(summarySQL, conn, params={'repo_group_id': repo_group_id, 'begin_date': begin_date, 'end_date': end_date}) return results @@ -1135,7 +1236,7 @@ def aggregate_summary(repo_group_id, repo_id=None, begin_date=None, end_date=Non ) commit_data """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(summarySQL, conn, params={'repo_id': repo_id, 'begin_date': begin_date, 'end_date': end_date}) return results diff --git a/augur/api/metrics/toss.py b/augur/api/metrics/toss.py index d3e91ad405..40a4a12b00 100644 --- a/augur/api/metrics/toss.py +++ b/augur/api/metrics/toss.py @@ -2,9 +2,10 @@ import datetime import sqlalchemy as s import pandas as pd +from flask import current_app + from augur.api.util import register_metric -from ..server import engine @register_metric(type="toss") def toss_pull_request_acceptance_rate(repo_id, begin_date=None, end_date=None, group_by='week'): @@ -27,7 +28,7 @@ def toss_pull_request_acceptance_rate(repo_id, begin_date=None, end_date=None, g ( SELECT COUNT ( pull_request_events.pull_request_id ) AS num_approved, - repo_id + pull_requests.repo_id FROM pull_requests JOIN pull_request_events ON pull_request_events.pull_request_id = pull_requests.pull_request_id @@ -38,12 +39,12 @@ def toss_pull_request_acceptance_rate(repo_id, begin_date=None, end_date=None, g AND pull_request_events.created_at BETWEEN :begin_date AND :end_date GROUP BY - repo_id + pull_requests.repo_id ) merged JOIN ( SELECT COUNT ( pull_request_events.pull_request_id ) AS num_opened, - repo_id + pull_requests.repo_id FROM pull_requests JOIN pull_request_events ON pull_request_events.pull_request_id = pull_requests.pull_request_id @@ -53,11 +54,11 @@ def toss_pull_request_acceptance_rate(repo_id, begin_date=None, end_date=None, g AND pull_request_events.created_at BETWEEN :begin_date AND :end_date GROUP BY - repo_id + pull_requests.repo_id ) opened ON merged.repo_id = opened.repo_id """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(pr_acceptance_rate_sql, conn, params={'repo_id': repo_id, 'group_by': group_by, 'begin_date': begin_date, 'end_date': end_date}) return results @@ -90,7 +91,7 @@ def toss_review_duration(repo_id, begin_date=None, end_date=None): AND :end_date """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(pr_acceptance_rate_sql, conn, params={'repo_id': repo_id, 'begin_date': begin_date, 'end_date': end_date}) if results.iloc[0]['duration'] is None: @@ -122,6 +123,6 @@ def toss_repo_info(repo_id): LIMIT 1; """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(license_file_sql, conn, params={'repo_id': repo_id}) return results diff --git a/augur/api/routes/application.py b/augur/api/routes/application.py index 3d2b22b8ed..a978bb1e66 100644 --- a/augur/api/routes/application.py +++ b/augur/api/routes/application.py @@ -4,31 +4,13 @@ """ import logging -import requests -import os -import base64 -import time -import secrets -import pandas as pd -from flask import request, Response, jsonify, session -from flask_login import login_user, logout_user, current_user, login_required -from werkzeug.security import check_password_hash -from sqlalchemy.sql import text -from sqlalchemy.orm import sessionmaker -from sqlalchemy.orm.exc import NoResultFound -from augur.application.db.session import DatabaseSession -from augur.tasks.github.util.github_task_session import GithubTaskSession -from augur.util.repo_load_controller import RepoLoadController +from flask import request, jsonify from augur.api.util import api_key_required, ssl_required -from augur.application.db.models import User, UserRepo, UserGroup, UserSessionToken, ClientApplication, RefreshToken -from augur.application.config import get_development_flag -from augur.tasks.init.redis_connection import redis_connection as redis -from ..server import app, engine +from augur.application.db.models import User, ClientApplication +from ..server import app logger = logging.getLogger(__name__) -development = get_development_flag() -Session = sessionmaker(bind=engine) from augur.api.routes import AUGUR_API_VERSION diff --git a/augur/api/routes/collection_status.py b/augur/api/routes/collection_status.py index 8afd8eb2da..0f6f3cfacb 100644 --- a/augur/api/routes/collection_status.py +++ b/augur/api/routes/collection_status.py @@ -2,10 +2,10 @@ import sqlalchemy as s import pandas as pd import json -from flask import Response +from flask import Response, current_app from augur.api.routes import AUGUR_API_VERSION -from ..server import app, engine +from ..server import app @app.route('/{}/collection_status/commits'.format(AUGUR_API_VERSION)) @@ -26,7 +26,7 @@ def commit_collection_status(): # TODO: make this name automatic - wrapper? c.facade_status = 'Success'; """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(commit_collection_sql, conn) data = results.to_json( orient="records", date_format='iso', date_unit='ms') @@ -89,7 +89,7 @@ def issue_collection_status(): # TODO: make this name automatic - wrapper? WHERE d.issues_enabled = 'true'; """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(issue_collection_sql, conn) data = results.to_json( orient="records", date_format='iso', date_unit='ms') @@ -161,7 +161,7 @@ def pull_request_collection_status(): # TODO: make this name automatic - wrappe ratio_abs; """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(pull_request_collection_sql, conn) data = results.to_json( orient="records", date_format='iso', date_unit='ms') diff --git a/augur/api/routes/complexity.py b/augur/api/routes/complexity.py index bee39eb923..feb58c6c39 100644 --- a/augur/api/routes/complexity.py +++ b/augur/api/routes/complexity.py @@ -1,17 +1,19 @@ #SPDX-License-Identifier: MIT -from flask import Response -import sqlalchemy as s +from flask import Response, current_app, request import pandas as pd +import sqlalchemy as s from augur.api.util import metric_metadata import os import requests from augur.api.routes import AUGUR_API_VERSION -from ..server import app, engine +from ..server import app @app.route('/{}/complexity/project_languages'.format(AUGUR_API_VERSION), methods=["GET"]) def get_project_languages(): + + repo_id = request.args.get('repo_id') project_languages_sql = s.sql.text(""" SELECT e.repo_id, @@ -48,7 +50,7 @@ def get_project_languages(): ORDER BY e.repo_id """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(project_languages_sql, conn) data = results.to_json(orient="records", date_format='iso', date_unit='ms') return Response(response=data, @@ -87,7 +89,7 @@ def get_project_files(): ORDER BY e.repo_id """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(project_files_sql, conn) data = results.to_json(orient="records", date_format='iso', date_unit='ms') return Response(response=data, @@ -96,8 +98,10 @@ def get_project_files(): @app.route('/{}/complexity/project_lines'.format(AUGUR_API_VERSION), methods=["GET"]) def get_project_lines(): + + repo_id = request.args.get('repo_id') project_lines_sql = s.sql.text(""" - SELECT + SELECT e.repo_id, augur_data.repo.repo_git, augur_data.repo.repo_name, @@ -125,11 +129,11 @@ def get_project_lines(): augur_data.repo_labor.repo_id = recent.repo_id AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d GROUP BY d.repo_id) e - WHERE augur_data.repo.repo_id = e.repo_id + WHERE augur_data.repo.repo_id = e.repo_id and augur_data.repo.repo_id = :repo_id_param ORDER BY e.repo_id - """) + """).bindparams(repo_id_param=repo_id) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(project_lines_sql, conn) data = results.to_json(orient="records", date_format='iso', date_unit='ms') return Response(response=data, @@ -138,6 +142,8 @@ def get_project_lines(): @app.route('/{}/complexity/project_comment_lines'.format(AUGUR_API_VERSION), methods=["GET"]) def get_project_comment_lines(): + + repo_id = request.args.get('repo_id') comment_lines_sql = s.sql.text(""" SELECT e.repo_id, @@ -167,11 +173,12 @@ def get_project_comment_lines(): augur_data.repo_labor.repo_id = recent.repo_id AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d GROUP BY d.repo_id) e - WHERE augur_data.repo.repo_id = e.repo_id + WHERE augur_data.repo.repo_id = e.repo_id + AND e.repo_id = :repo_id_param ORDER BY e.repo_id - """) + """).bindparams(repo_id_param=repo_id) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(comment_lines_sql, conn) data = results.to_json(orient="records", date_format='iso', date_unit='ms') return Response(response=data, @@ -180,40 +187,43 @@ def get_project_comment_lines(): @app.route('/{}/complexity/project_blank_lines'.format(AUGUR_API_VERSION), methods=["GET"]) def get_project_blank_lines(): + + repo_id = request.args.get('repo_id') blank_lines_sql = s.sql.text(""" - SELECT + SELECT e.repo_id, augur_data.repo.repo_git, augur_data.repo.repo_name, e.blank_lines, e.avg_blank_lines - FROM - augur_data.repo, - (SELECT - d.repo_id, - SUM(d.blank_lines) AS blank_lines, - AVG(d.blank_lines)::int AS avg_blank_lines - FROM - (SELECT - augur_data.repo_labor.repo_id, - augur_data.repo_labor.blank_lines - FROM - augur_data.repo_labor, - ( SELECT - augur_data.repo_labor.repo_id, - MAX ( data_collection_date ) AS last_collected - FROM - augur_data.repo_labor - GROUP BY augur_data.repo_labor.repo_id) recent - WHERE - augur_data.repo_labor.repo_id = recent.repo_id - AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d - GROUP BY d.repo_id) e - WHERE augur_data.repo.repo_id = e.repo_id - ORDER BY e.repo_id - """) + FROM + augur_data.repo, + (SELECT + d.repo_id, + SUM(d.blank_lines) AS blank_lines, + AVG(d.blank_lines)::int AS avg_blank_lines + FROM + (SELECT + augur_data.repo_labor.repo_id, + augur_data.repo_labor.blank_lines + FROM + augur_data.repo_labor, + ( SELECT + augur_data.repo_labor.repo_id, + MAX ( data_collection_date ) AS last_collected + FROM + augur_data.repo_labor + GROUP BY augur_data.repo_labor.repo_id) recent + WHERE + augur_data.repo_labor.repo_id = recent.repo_id + AND augur_data.repo_labor.data_collection_date > recent.last_collected - (5 * interval '1 minute')) d + GROUP BY d.repo_id) e + WHERE augur_data.repo.repo_id = e.repo_id + AND e.repo_id = :repo_id_param + ORDER BY e.repo_id + """).bindparams(repo_id_param=repo_id) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(blank_lines_sql, conn) data = results.to_json(orient="records", date_format='iso', date_unit='ms') return Response(response=data, @@ -256,7 +266,7 @@ def get_project_file_complexity(): ORDER BY e.repo_id """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(project_file_complexity_sql, conn) data = results.to_json(orient="records", date_format='iso', date_unit='ms') return Response(response=data, diff --git a/augur/api/routes/config.py b/augur/api/routes/config.py index f6075a8e2b..6a2f82976e 100644 --- a/augur/api/routes/config.py +++ b/augur/api/routes/config.py @@ -3,9 +3,7 @@ Creates routes for config functionality """ import logging -import requests -import os -from flask import request, jsonify, Response +from flask import request, jsonify, current_app import sqlalchemy as s # Disable the requirement for SSL by setting env["AUGUR_DEV"] = True @@ -33,7 +31,7 @@ def get_config(): if not development and not request.is_secure: return generate_upgrade_request() - with DatabaseSession(logger) as session: + with DatabaseSession(logger, engine=current_app.engine) as session: config_dict = AugurConfig(logger, session).config.load_config() @@ -47,7 +45,7 @@ def update_config(): update_dict = request.get_json() - with DatabaseSession(logger) as session: + with DatabaseSession(logger, engine=current_app.engine) as session: for section, data in update_dict.items(): diff --git a/augur/api/routes/contributor_reports.py b/augur/api/routes/contributor_reports.py index c600e81416..711f321b3e 100644 --- a/augur/api/routes/contributor_reports.py +++ b/augur/api/routes/contributor_reports.py @@ -6,7 +6,7 @@ import datetime import pandas as pd from math import pi -from flask import request, send_file, Response +from flask import request, send_file, Response, current_app # import visualization libraries from bokeh.io import export_png @@ -18,7 +18,7 @@ from bokeh.transform import cumsum from augur.api.routes import AUGUR_API_VERSION -from ..server import app, engine +from ..server import app warnings.filterwarnings('ignore') @@ -294,7 +294,7 @@ def new_contributor_data_collection(repo_id, required_contributions): """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: df = pd.read_sql(contributor_query, conn) df = df.loc[~df['full_name'].str.contains('bot', na=False)] @@ -337,7 +337,7 @@ def months_data_collection(start_date, end_date): ) y """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: months_df = pd.read_sql(months_query, conn) # add yearmonths to months_df diff --git a/augur/api/routes/dei.py b/augur/api/routes/dei.py index 82324a8d62..990a6e7368 100644 --- a/augur/api/routes/dei.py +++ b/augur/api/routes/dei.py @@ -2,31 +2,23 @@ Creates routes for DEI badging functionality """ -import logging, subprocess, inspect +import logging, subprocess -from flask import request, Response, jsonify, render_template, send_file +from flask import request, jsonify, render_template, send_file, current_app from pathlib import Path -from sqlalchemy.orm import sessionmaker -from sqlalchemy.orm.exc import NoResultFound - from augur.api.util import api_key_required, ssl_required -from augur.util.repo_load_controller import RepoLoadController -from augur.application.db.models import User, ClientApplication, CollectionStatus, Repo, RepoGroup, BadgingDEI +from augur.application.db.models import ClientApplication, CollectionStatus, Repo, RepoGroup, BadgingDEI from augur.application.db.session import DatabaseSession -from augur.application.config import AugurConfig from augur.tasks.util.collection_util import CollectionRequest,AugurTaskRoutine, get_enabled_phase_names_from_config, core_task_success_util from augur.tasks.start_tasks import prelim_phase, primary_repo_collect_phase -from augur.tasks.github.util.github_task_session import GithubTaskSession -from augur.tasks.init.redis_connection import redis_connection as redis from augur.tasks.github.util.util import get_repo_weight_by_issue -from ..server import app, engine +from ..server import app logger = logging.getLogger(__name__) -Session = sessionmaker(bind=engine, autocommit=True) from augur.api.routes import AUGUR_API_VERSION from augur.application.db.models.augur_operations import FRONTEND_REPO_GROUP_NAME @@ -44,7 +36,7 @@ def dei_track_repo(application: ClientApplication): repo_url = repo_url.lower() - session = DatabaseSession(logger) + session = DatabaseSession(logger, engine=current_app.engine) session.autocommit = True repo: Repo = session.query(Repo).filter(Repo.repo_git==repo_url).first() if repo: @@ -77,7 +69,7 @@ def dei_track_repo(application: ClientApplication): "repo_id": repo_id } - enabled_phase_names = get_enabled_phase_names_from_config(logger, session) + enabled_phase_names = get_enabled_phase_names_from_config() #Primary collection hook. primary_enabled_phases = [] @@ -117,7 +109,7 @@ def dei_report(application: ClientApplication): if not dei_id: return jsonify({"status": "Missing argument"}), 400 - session = DatabaseSession(logger) + session = DatabaseSession(logger, engine=current_app.engine) project: BadgingDEI = session.query(BadgingDEI).filter(BadgingDEI.badging_id==dei_id).first() diff --git a/augur/api/routes/metadata.py b/augur/api/routes/metadata.py index f49dbb88f8..ced7b60efd 100644 --- a/augur/api/routes/metadata.py +++ b/augur/api/routes/metadata.py @@ -1,19 +1,11 @@ #SPDX-License-Identifier: MIT -from flask import Response -from flask import request -import datetime -import base64 +from flask import Response, current_app import sqlalchemy as s import pandas as pd -from augur.api.util import metric_metadata -import boto3 import json -from boto3.dynamodb.conditions import Key, Attr -import os -import requests from augur.api.routes import AUGUR_API_VERSION -from ..server import app, engine +from ..server import app @app.route('/{}/metadata/repo_info'.format(AUGUR_API_VERSION), methods=["GET"]) def get_repo_info(): @@ -48,7 +40,7 @@ def get_repo_info(): repo.repo_name; """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(repo_info_sql, conn) data = results.to_json(orient="records", date_format='iso', date_unit='ms') parsed_data = json.loads(data) @@ -64,7 +56,7 @@ def contributions_count(): order by contributions desc; """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(repo_info_sql, conn) data = results.to_json(orient="records", date_format='iso', date_unit='ms') parsed_data = json.loads(data) @@ -80,7 +72,7 @@ def contributors_count(): order by contributors desc; """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(repo_info_sql, conn) data = results.to_json(orient="records", date_format='iso', date_unit='ms') parsed_data = json.loads(data) diff --git a/augur/api/routes/pull_request_reports.py b/augur/api/routes/pull_request_reports.py index 9e65779542..9520fc21f7 100644 --- a/augur/api/routes/pull_request_reports.py +++ b/augur/api/routes/pull_request_reports.py @@ -6,7 +6,7 @@ import datetime import json # from scipy import stats -from flask import request, send_file, Response +from flask import request, send_file, Response, current_app import math from bokeh.palettes import Colorblind, mpl, Category20 @@ -24,7 +24,7 @@ warnings.filterwarnings('ignore') from augur.api.routes import AUGUR_API_VERSION -from ..server import app, engine +from ..server import app def pull_request_data_collection(repo_id, start_date, end_date): @@ -67,10 +67,10 @@ def pull_request_data_collection(repo_id, start_date, end_date): unlabeled_count, head_ref_deleted_count, comment_count, - COALESCE(lines_added, 0), - COALESCE(lines_removed, 0), + COALESCE(lines_added, 0) as lines_added, + COALESCE(lines_removed, 0) as lines_removed, commit_count, - COALESCE(file_count, 0) + COALESCE(file_count, 0) as file_count FROM repo, repo_groups, @@ -97,7 +97,7 @@ def pull_request_data_collection(repo_id, start_date, end_date): JOIN repo on repo.repo_id = pull_requests.repo_id LEFT OUTER JOIN pull_request_message_ref on pull_requests.pull_request_id = pull_request_message_ref.pull_request_id LEFT OUTER JOIN message on pull_request_message_ref.msg_id = message.msg_id - WHERE repo.repo_id = 1 + WHERE repo.repo_id = {repo_id} GROUP BY pull_requests.pull_request_id ) response_times ON pull_requests.pull_request_id = response_times.pull_request_id @@ -106,7 +106,7 @@ def pull_request_data_collection(repo_id, start_date, end_date): FROM pull_request_commits, pull_requests, pull_request_meta WHERE pull_requests.pull_request_id = pull_request_commits.pull_request_id AND pull_requests.pull_request_id = pull_request_meta.pull_request_id - AND pull_requests.repo_id = 1 + AND pull_requests.repo_id = {repo_id} AND pr_cmt_sha <> pull_requests.pr_merge_commit_sha AND pr_cmt_sha <> pull_request_meta.pr_sha GROUP BY pull_request_commits.pull_request_id @@ -116,7 +116,7 @@ def pull_request_data_collection(repo_id, start_date, end_date): SELECT MAX(pr_repo_meta_id), pull_request_meta.pull_request_id, pr_head_or_base, pr_src_meta_label FROM pull_requests, pull_request_meta WHERE pull_requests.pull_request_id = pull_request_meta.pull_request_id - AND pull_requests.repo_id = 1 + AND pull_requests.repo_id = {repo_id} AND pr_head_or_base = 'base' GROUP BY pull_request_meta.pull_request_id, pr_head_or_base, pr_src_meta_label ) base_labels @@ -127,7 +127,7 @@ def pull_request_data_collection(repo_id, start_date, end_date): WHERE cmt_commit_hash = pr_cmt_sha AND pull_requests.pull_request_id = pull_request_commits.pull_request_id AND pull_requests.pull_request_id = pull_request_meta.pull_request_id - AND pull_requests.repo_id = 1 + AND pull_requests.repo_id = {repo_id} AND commits.repo_id = pull_requests.repo_id AND commits.cmt_commit_hash <> pull_requests.pr_merge_commit_sha AND commits.cmt_commit_hash <> pull_request_meta.pr_sha @@ -137,12 +137,12 @@ def pull_request_data_collection(repo_id, start_date, end_date): WHERE repo.repo_group_id = repo_groups.repo_group_id AND repo.repo_id = pull_requests.repo_id - AND repo.repo_id = 1 + AND repo.repo_id = {repo_id} ORDER BY merged_count DESC """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: pr_all = pd.read_sql(pr_query, conn) pr_all[['assigned_count', diff --git a/augur/api/routes/user.py b/augur/api/routes/user.py index 62bc44068a..31f18ce663 100644 --- a/augur/api/routes/user.py +++ b/augur/api/routes/user.py @@ -6,21 +6,21 @@ import logging import secrets -from flask import request, jsonify, session +from flask import request, jsonify from flask_login import login_user, logout_user, current_user, login_required from werkzeug.security import check_password_hash -from sqlalchemy.orm import sessionmaker -from augur.application.db.session import DatabaseSession +from sqlalchemy.orm import object_session + +from augur.application.db import get_session from augur.api.util import api_key_required from augur.api.util import ssl_required from augur.application.db.models import User, UserSessionToken, RefreshToken from augur.tasks.init.redis_connection import redis_connection as redis -from ..server import app, engine +from ..server import app logger = logging.getLogger(__name__) current_user: User = current_user -Session = sessionmaker(bind=engine) @app.route(f"/{AUGUR_API_VERSION}/user/validate", methods=['POST']) @ssl_required @@ -31,9 +31,8 @@ def validate_user(): # https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/400 return jsonify({"status": "Missing argument"}), 400 - session = Session() - user = session.query(User).filter(User.login_name == username).first() - session.close() + with get_session() as session: + user = session.query(User).filter(User.login_name == username).first() if user is None: return jsonify({"status": "Invalid username"}) @@ -88,7 +87,7 @@ def generate_session(application): if not username: return jsonify({"status": "Invalid authorization code"}) - with DatabaseSession(logger) as session: + with get_session() as session: user = User.get_user(session, username) if not user: @@ -123,7 +122,7 @@ def refresh_session(application): if request.args.get("grant_type") != "refresh_token": return jsonify({"status": "Invalid grant type"}) - with DatabaseSession(logger) as session: + with get_session() as session: refresh_token = session.query(RefreshToken).filter(RefreshToken.id == refresh_token_str).first() if not refresh_token: @@ -190,21 +189,20 @@ def update_user(): new_login_name = request.args.get("new_username") new_password = request.args.get("new_password") + session = object_session(current_user) + if email is not None: - existing_user = session.query(User).filter(User.email == email).one() + existing_user = session.query(User).filter(User.email == email).one_or_none() if existing_user is not None: - session = Session() return jsonify({"status": "Already an account with this email"}) - + current_user.email = email session.commit() - session = Session() return jsonify({"status": "Email Updated"}) if new_password is not None: current_user.login_hashword = User.compute_hashsed_password(new_password) session.commit() - session = Session() return jsonify({"status": "Password Updated"}) if new_login_name is not None: @@ -214,7 +212,6 @@ def update_user(): current_user.login_name = new_login_name session.commit() - session = Session() return jsonify({"status": "Username Updated"}) return jsonify({"status": "Missing argument"}), 400 diff --git a/augur/api/routes/util.py b/augur/api/routes/util.py index 71d3526b96..457afaf6ed 100644 --- a/augur/api/routes/util.py +++ b/augur/api/routes/util.py @@ -1,15 +1,14 @@ #SPDX-License-Identifier: MIT from augur.api.routes import AUGUR_API_VERSION -from ..server import app, engine +from ..server import app import base64 import sqlalchemy as s import pandas as pd import json -from flask import Response +from flask import Response, current_app -from augur.application.db.session import DatabaseSession +from augur.application.db.lib import get_value from augur.application.logs import AugurLogger -from augur.application.config import AugurConfig logger = AugurLogger("augur").get_logger() @@ -21,7 +20,7 @@ def get_all_repo_groups(): #TODO: make this name automatic - wrapper? ORDER BY rg_name """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(repoGroupsSQL, conn) data = results.to_json(orient="records", date_format='iso', date_unit='ms') return Response(response=data, @@ -58,7 +57,7 @@ def get_all_repos(): order by repo_name """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(get_all_repos_sql, conn) results['url'] = results['url'].apply(lambda datum: datum.split('//')[1]) @@ -101,7 +100,7 @@ def get_repos_in_repo_group(repo_group_id): ORDER BY repo.repo_git """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(repos_in_repo_groups_SQL, conn, params={'repo_group_id': repo_group_id}) data = results.to_json(orient="records", date_format='iso', date_unit='ms') return Response(response=data, @@ -137,7 +136,7 @@ def get_repo_by_id(repo_id: int) -> Response: repo.repo_id = :id """) - results = pd.read_sql(repo_by_id_SQL, engine, params={"id": repo_id}) + results = pd.read_sql(repo_by_id_SQL, current_app.engine, params={"id": repo_id}) results["url"] = results["url"].apply(lambda datum: datum.split("//")[1]) # cut "https://" off the URL results["base64_url"] = [base64.b64encode(results.at[i, "url"].encode()) for i in results.index] data = results.to_json(orient="records", date_format="iso", date_unit="ms") @@ -161,7 +160,7 @@ def get_repo_by_git_name(owner, repo): GROUP BY repo_id, rg_name """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(get_repo_by_git_name_sql, conn, params={'owner': '%{}%'.format(owner), 'repo': repo,}) data = results.to_json(orient="records", date_format='iso', date_unit='ms') return Response(response=data, @@ -179,7 +178,7 @@ def get_repo_by_name(rg_name, repo_name): AND LOWER(repo_name) = LOWER(:repo_name) """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(get_repo_by_name_sql, conn, params={'rg_name': rg_name, 'repo_name': repo_name}) results['url'] = results['url'].apply(lambda datum: datum.split('//')[1]) data = results.to_json(orient="records", date_format='iso', date_unit='ms') @@ -195,7 +194,7 @@ def get_group_by_name(rg_name): WHERE lower(rg_name) = lower(:rg_name) """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(groupSQL, conn, params={'rg_name': rg_name}) data = results.to_json(orient="records", date_format='iso', date_unit='ms') return Response(response=data, @@ -210,7 +209,7 @@ def get_repos_for_dosocs(): WHERE a.setting='repo_directory' """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(get_repos_for_dosocs_SQL, conn) data = results.to_json(orient="records", date_format='iso', date_unit='ms') return Response(response=data, @@ -240,7 +239,7 @@ def get_issues(repo_group_id, repo_id=None): ORDER by OPEN_DAY DESC """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(get_issues_sql, conn, params={'repo_group_id': repo_group_id}) else: get_issues_sql = s.sql.text(""" @@ -262,7 +261,7 @@ def get_issues(repo_group_id, repo_id=None): ORDER by OPEN_DAY DESC """) - with engine.connect() as conn: + with current_app.engine.connect() as conn: results = pd.read_sql(get_issues_sql, conn, params={'repo_id': repo_id}) data = results.to_json(orient="records", date_format='iso', date_unit='ms') return Response(response=data, @@ -272,9 +271,7 @@ def get_issues(repo_group_id, repo_id=None): @app.route('/{}/api-port'.format(AUGUR_API_VERSION)) def api_port(): - with DatabaseSession(logger) as session: - - response = {'port': AugurConfig(logger, session).get_value('Server', 'port')} - return Response(response=json.dumps(response), - status=200, - mimetype="application/json") + response = {'port': get_value('Server', 'port')} + return Response(response=json.dumps(response), + status=200, + mimetype="application/json") diff --git a/augur/api/server.py b/augur/api/server.py index d3e92ad99e..64a4e94bf5 100644 --- a/augur/api/server.py +++ b/augur/api/server.py @@ -4,15 +4,13 @@ import glob import sys import inspect -import types import json import os import base64 -import logging import importlib import graphene -from typing import Optional, List, Any, Tuple +from typing import List, Any from pathlib import Path @@ -21,19 +19,20 @@ import pandas as pd from beaker.util import parse_cache_config_options from beaker.cache import CacheManager, Cache -from sqlalchemy import create_engine from sqlalchemy.pool import StaticPool from flask_graphql import GraphQLView from graphene_sqlalchemy import SQLAlchemyObjectType from augur.application.logs import AugurLogger -from augur.application.config import AugurConfig from augur.application.db.session import DatabaseSession +from augur.application.config import AugurConfig from augur.application.db.engine import get_database_string, create_database_engine -from metadata import __version__ as augur_code_version from augur.application.db.models import Repo, Issue, PullRequest, Message, PullRequestReview, Commit, IssueAssignee, PullRequestAssignee, PullRequestCommit, PullRequestFile, Contributor, IssueLabel, PullRequestLabel, ContributorsAlias, Release, ClientApplication +from metadata import __version__ as augur_code_version + + # from augur.api.routes import AUGUR_API_VERSION AUGUR_API_VERSION = "api/unstable" @@ -313,14 +312,14 @@ def create_cache_manager() -> CacheManager: return cache -def get_server_cache(config, cache_manager) -> Cache: +def get_server_cache(cache_manager) -> Cache: """Create the server cache, set expiration, and clear Returns: server cache """ - expire = int(config.get_value('Server', 'cache_expire')) + expire = int(augur_config.get_value('Server', 'cache_expire')) server_cache = cache_manager.get_cache('server', expire=expire) server_cache.clear() @@ -672,7 +671,7 @@ def resolve_contributor(self, info, id): # so when we pass the flask app to the routes files we # know can access the api version via the app variable app.augur_api_version = AUGUR_API_VERSION - +app.engine = engine CORS(app) app.url_map.strict_slashes = False @@ -733,7 +732,7 @@ def dispatch_request(self): from .view.api import * cache_manager = create_cache_manager() -server_cache = get_server_cache(augur_config, cache_manager) +server_cache = get_server_cache(cache_manager) diff --git a/augur/api/util.py b/augur/api/util.py index bdcd70d124..eaefab8bf7 100644 --- a/augur/api/util.py +++ b/augur/api/util.py @@ -4,21 +4,16 @@ """ import os import re -import inspect -import types -import sys import beaker -from flask import request, jsonify +from flask import request, jsonify, current_app -from .server import engine +from augur.application.db import get_session from functools import wraps -from sqlalchemy.orm import sessionmaker from sqlalchemy.orm.exc import NoResultFound from augur.application.config import get_development_flag -from augur.application.db.models import User, UserRepo, UserGroup, UserSessionToken, ClientApplication, RefreshToken +from augur.application.db.models import ClientApplication -Session = sessionmaker(bind=engine) development = get_development_flag() __ROOT = os.path.abspath(os.path.dirname(__file__)) @@ -133,13 +128,13 @@ def wrapper(*args, **kwargs): # If valid: if client_token: - session = Session() - try: - kwargs["application"] = session.query(ClientApplication).filter(ClientApplication.api_key == client_token).one() - except NoResultFound as e: - return {"status": "Unauthorized client"} + with get_session() as session: + try: + kwargs["application"] = session.query(ClientApplication).filter(ClientApplication.api_key == client_token).one() + except NoResultFound as e: + return {"status": "Unauthorized client"} - return fun(*args, **kwargs) + return fun(*args, **kwargs) return {"status": "Unauthorized client"} diff --git a/augur/api/view/api.py b/augur/api/view/api.py index 598c0cdb6d..cbd7e4a0f1 100644 --- a/augur/api/view/api.py +++ b/augur/api/view/api.py @@ -1,10 +1,10 @@ -from flask import Flask, render_template, render_template_string, request, abort, jsonify, redirect, url_for, session, flash +from flask import request, jsonify, redirect, url_for, flash, current_app import re from flask_login import current_user, login_required from augur.application.db.models import Repo, RepoGroup, UserGroup, UserRepo from augur.tasks.frontend import add_org_repo_list, parse_org_and_repo_name, parse_org_name from .utils import * -from ..server import app, engine +from ..server import app from augur.application.db.session import DatabaseSession @app.route('/cache/file/') @@ -68,7 +68,7 @@ def av_add_user_repo(): invalid_urls = [] - with DatabaseSession(logger, engine) as session: + with DatabaseSession(logger, current_app.engine) as session: for url in urls: # matches https://github.com/{org}/ or htts://github.com/{org} diff --git a/augur/api/view/augur_view.py b/augur/api/view/augur_view.py index b39d698452..ff4b25145c 100644 --- a/augur/api/view/augur_view.py +++ b/augur/api/view/augur_view.py @@ -1,4 +1,4 @@ -from flask import Flask, render_template, redirect, url_for, session, request, jsonify +from flask import render_template, redirect, url_for, session, request, jsonify from flask_login import LoginManager from io import StringIO from .utils import * @@ -48,6 +48,7 @@ def internal_server_error(error): traceback.print_tb(error.__traceback__, file=errout) # traceback.print_exception(error, file=errout) stacktrace = errout.getvalue() + stacktrace += f"\n{type(error).__name__}: {str(error)}" errout.close() except Exception as e: logger.error(e) @@ -83,7 +84,6 @@ def load_user(user_id): repos = group.repos for token in tokens: application = token.application - db_session.expunge(user) # The flask_login library sets a unique session["_id"] diff --git a/augur/api/view/init.py b/augur/api/view/init.py index 8ad7936700..e7c8386d30 100644 --- a/augur/api/view/init.py +++ b/augur/api/view/init.py @@ -1,6 +1,6 @@ from pathlib import Path from augur.application.logs import AugurLogger -import logging, secrets, yaml +import secrets, yaml # load configuration files and initialize globals configFile = Path("config.yml") diff --git a/augur/api/view/routes.py b/augur/api/view/routes.py index 72164a9291..00d456733f 100644 --- a/augur/api/view/routes.py +++ b/augur/api/view/routes.py @@ -3,16 +3,14 @@ """ import logging import math -from flask import Flask, render_template, render_template_string, request, abort, jsonify, redirect, url_for, session, flash -from sqlalchemy.orm.exc import NoResultFound +from flask import render_template, request, redirect, url_for, session, flash from .utils import * from flask_login import login_user, logout_user, current_user, login_required from augur.application.db.models import User, Repo, ClientApplication from .server import LoginException -from augur.tasks.init.redis_connection import redis_connection as redis from augur.application.util import * -from augur.application.config import AugurConfig +from augur.application.db.lib import get_value from ..server import app, db_session logger = logging.getLogger(__name__) @@ -73,9 +71,7 @@ def repo_table_view(): direction = "DESC" if rev else "ASC" - config = AugurConfig(logger, db_session) - - pagination_offset = config.get_value("frontend", "pagination_offset") + pagination_offset = get_value("frontend", "pagination_offset") if current_user.is_authenticated: data = current_user.get_repos(page = page, sort = sorting, direction = direction, search=query)[0] @@ -243,9 +239,7 @@ def repo_repo_view(id): def user_groups_view(): params = {} - config = AugurConfig(logger, db_session) - - pagination_offset = config.get_value("frontend", "pagination_offset") + pagination_offset = get_value("frontend", "pagination_offset") params = {} @@ -319,9 +313,7 @@ def user_group_view(group = None): rev = True params["direction"] = "DESC" - config = AugurConfig(logger, db_session) - - pagination_offset = config.get_value("frontend", "pagination_offset") + pagination_offset = get_value("frontend", "pagination_offset") data = current_user.get_group_repos(group, **params)[0] page_count = current_user.get_group_repo_count(group, search = query)[0] or 0 diff --git a/augur/api/view/utils.py b/augur/api/view/utils.py index 298e9950ae..aae5140cd7 100644 --- a/augur/api/view/utils.py +++ b/augur/api/view/utils.py @@ -3,29 +3,17 @@ """ from pathlib import Path from concurrent.futures import ThreadPoolExecutor -from flask import render_template, flash, url_for, Flask +from flask import render_template, flash, url_for from .init import init_logging from .init import * -from ..server import app, db_session -from augur.application.config import AugurConfig -import urllib.request, urllib.error, json, os, math, yaml, urllib3, time, logging, re, math +from augur.application.db.lib import get_value +import urllib.error, math, yaml, urllib3, time, math -from augur.application.db.session import DatabaseSession -from augur.application.db.engine import DatabaseEngine -from augur.application.db.models import User, Repo, RepoGroup, UserGroup, UserRepo -from sqlalchemy import Column, Table, Integer, MetaData, or_ -from sqlalchemy.sql.operators import ilike_op, distinct_op -from sqlalchemy.sql.functions import coalesce -from augur.application.db.models.base import Base - -from sqlalchemy.orm import Query init_logging() from .init import logger -config = AugurConfig(logger, db_session) - """ ---------------------------------------------------------------- loadSettings: This function attempts to load the application settings from the config file @@ -76,7 +64,7 @@ def getSetting(key, section = "View"): return "http://127.0.0.1:5000/api/unstable" return settings[key] else: - return config.get_value(section, key) + return get_value(section, key) loadSettings() diff --git a/augur/application/cli/__init__.py b/augur/application/cli/__init__.py index aaf548432a..f15758c9cf 100644 --- a/augur/application/cli/__init__.py +++ b/augur/application/cli/__init__.py @@ -3,11 +3,12 @@ from functools import update_wrapper import os import sys -import socket import re import json +import httpx from augur.application.db.engine import DatabaseEngine +from augur.application.db import get_engine, dispose_database_engine from sqlalchemy.exc import OperationalError @@ -15,13 +16,22 @@ def test_connection(function_internet_connection): @click.pass_context def new_func(ctx, *args, **kwargs): usage = re.search(r"Usage:\s(.*)\s\[OPTIONS\]", str(ctx.get_usage())).groups()[0] - try: - #try to ping google's dns server - socket.create_connection(("8.8.8.8",53)) - return ctx.invoke(function_internet_connection, *args, **kwargs) - except OSError as e: - print(e) - print(f"\n\n{usage} command setup failed\nYou are not connect to the internet. Please connect to the internet to run Augur\n") + with httpx.Client() as client: + try: + _ = client.request( + method="GET", url="http://chaoss.community", timeout=10, follow_redirects=True) + + return ctx.invoke(function_internet_connection, *args, **kwargs) + except (TimeoutError, httpx.TimeoutException): + print("Request timed out.") + except httpx.NetworkError: + print(f"Network Error: {httpx.NetworkError}") + except httpx.ProtocolError: + print(f"Protocol Error: {httpx.ProtocolError}") + print(f"\n\n{usage} command setup failed\n \ + You are not connected to the internet.\n \ + Please connect to the internet to run Augur\n \ + Consider setting http_proxy variables for limited access installations.") sys.exit() return update_wrapper(new_func, function_internet_connection) @@ -72,6 +82,22 @@ def new_func(ctx, *args, **kwargs): return update_wrapper(new_func, function_db_connection) + +class DatabaseContext(): + def __init__(self): + self.engine = None + +def with_database(f): + @click.pass_context + def new_func(ctx, *args, **kwargs): + ctx.obj.engine = get_engine() + try: + return ctx.invoke(f, *args, **kwargs) + finally: + dispose_database_engine() + return new_func + + # def pass_application(f): # @click.pass_context # def new_func(ctx, *args, **kwargs): diff --git a/augur/application/cli/_cli_util.py b/augur/application/cli/_cli_util.py new file mode 100644 index 0000000000..d5837f9f90 --- /dev/null +++ b/augur/application/cli/_cli_util.py @@ -0,0 +1,69 @@ +import resource +import os +import subprocess +import psutil +import signal +from urllib.parse import urlparse + +from augur.tasks.init.redis_connection import redis_connection + +def clear_redis_caches(logger): + """Clears the redis databases that celery and redis use.""" + + logger.info("Flushing all redis databases this instance was using") + celery_purge_command = "celery -A augur.tasks.init.celery_app.celery_app purge -f" + subprocess.call(celery_purge_command.split(" ")) + redis_connection.flushdb() + + +def clear_rabbitmq_messages(connection_string, queues, logger): + #virtual_host_string = connection_string.split("/")[-1] + + logger.info("Clearing all messages from celery queue in rabbitmq") + from augur.tasks.init.celery_app import celery_app + celery_app.control.purge() + + clear_message_queues(connection_string, queues) + + +def clear_message_queues(connection_string, queues): + queues = ['celery','secondary','scheduling','facade'] + + virtual_host_string = connection_string.split("/")[-1] + + #Parse username and password with urllib + parsed = urlparse(connection_string) + + for q in queues: + curl_cmd = f"curl -i -u {parsed.username}:{parsed.password} -XDELETE http://localhost:15672/api/queues/{virtual_host_string}/{q}" + subprocess.call(curl_cmd.split(" "),stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + +def _broadcast_signal_to_processes(processes, logger, broadcast_signal=signal.SIGTERM): + + for process in processes: + if process.pid != os.getpid(): + logger.info(f"Stopping process {process.pid}") + try: + process.send_signal(broadcast_signal) + except psutil.NoSuchProcess: + pass + + +def raise_open_file_limit(num_files): + """ + sets number of open files soft limit + """ + current_soft, current_hard = resource.getrlimit(resource.RLIMIT_NOFILE) + + # if soft is already greater than the requested amount then don't change it + if current_soft > num_files: + return + + # if the requested amount is greater than the hard limit then set the hard limit to the num_files value + if current_hard <= num_files: + current_hard = num_files + + resource.setrlimit(resource.RLIMIT_NOFILE, (num_files, current_hard)) + + return \ No newline at end of file diff --git a/augur/application/cli/_multicommand.py b/augur/application/cli/_multicommand.py index c0d8b1a967..2a1bfd1c71 100644 --- a/augur/application/cli/_multicommand.py +++ b/augur/application/cli/_multicommand.py @@ -7,6 +7,8 @@ import click import importlib import traceback + +from pathlib import Path # import augur.application CONTEXT_SETTINGS = dict(auto_envvar_prefix='AUGUR') @@ -24,11 +26,16 @@ def list_commands(self, ctx): return rv def get_command(self, ctx, name): - try: - module = importlib.import_module('.' + name, 'augur.application.cli') - return module.cli - except ModuleNotFoundError as e: - pass + cmdfile = "augur/application/cli" / Path(name + ".py") + + # Check that the command exists before importing + if not cmdfile.is_file(): + + return + + # Prefer to raise exception instead of silcencing it + module = importlib.import_module('.' + name, 'augur.application.cli') + return module.cli @click.command(cls=AugurMultiCommand, context_settings=CONTEXT_SETTINGS) @click.pass_context diff --git a/augur/application/cli/api.py b/augur/application/cli/api.py new file mode 100644 index 0000000000..d716957c0b --- /dev/null +++ b/augur/application/cli/api.py @@ -0,0 +1,158 @@ +#SPDX-License-Identifier: MIT +""" +Augur library commands for controlling the backend components +""" +import os +import time +import subprocess +import click +import logging +import psutil +import signal +import uuid +import traceback + +from augur.application.db.session import DatabaseSession +from augur.application.logs import AugurLogger +from augur.application.cli import test_connection, test_db_connection, with_database +from augur.application.cli._cli_util import _broadcast_signal_to_processes, raise_open_file_limit, clear_redis_caches, clear_rabbitmq_messages +from augur.application.db.lib import get_value + +logger = AugurLogger("augur", reset_logfiles=True).get_logger() + +@click.group('api', short_help='Commands for controlling the backend API server') +def cli(): + pass + +@cli.command("start") +@click.option("--development", is_flag=True, default=False, help="Enable development mode") +@click.option('--port') +@test_connection +@test_db_connection +@with_database +@click.pass_context +def start(ctx, development, port): + """Start Augur's backend server.""" + + try: + if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1": + raise_open_file_limit(100000) + except Exception as e: + logger.error( + ''.join(traceback.format_exception(None, e, e.__traceback__))) + + logger.error("Failed to raise open file limit!") + raise e + + if development: + os.environ["AUGUR_DEV"] = "1" + logger.info("Starting in development mode") + + try: + gunicorn_location = os.getcwd() + "/augur/api/gunicorn_conf.py" + except FileNotFoundError: + logger.error("\n\nPlease run augur commands in the root directory\n\n") + + host = get_value("Server", "host") + + if not port: + port = get_value("Server", "port") + + gunicorn_command = f"gunicorn -c {gunicorn_location} -b {host}:{port} augur.api.server:app --log-file gunicorn.log" + server = subprocess.Popen(gunicorn_command.split(" ")) + + time.sleep(3) + logger.info('Gunicorn webserver started...') + logger.info(f'Augur is running at: {"http" if development else "https"}://{host}:{port}') + + frontend_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=1 -n frontend:{uuid.uuid4().hex}@%h -Q frontend" + frontend_worker_process = subprocess.Popen(frontend_worker.split(" ")) + + try: + server.wait() + except KeyboardInterrupt: + + if server: + logger.info("Shutting down server") + server.terminate() + + logger.info("Shutting down frontend celery worker process") + if frontend_worker_process: + frontend_worker_process.terminate() + +@cli.command('stop') +@with_database +@click.pass_context +def stop(ctx): + """ + Sends SIGTERM to all Augur api processes + """ + logger = logging.getLogger("augur.cli") + + augur_stop(signal.SIGTERM, logger, ctx.obj.engine) + +@cli.command('kill') +@with_database +@click.pass_context +def kill(ctx): + """ + Sends SIGKILL to all Augur api processes + """ + logger = logging.getLogger("augur.cli") + augur_stop(signal.SIGKILL, logger, ctx.obj.engine) + +@cli.command('processes') +def processes(): + """ + Outputs the name/PID of all Augur api process""" + augur_processes = get_augur_api_processes() + for process in augur_processes: + logger.info(f"Found process {process.pid}") + +def augur_stop(signal, logger, engine): + """ + Stops augur with the given signal, + and cleans up the api + """ + + augur_processes = get_augur_api_processes() + + _broadcast_signal_to_processes(augur_processes, logger=logger, broadcast_signal=signal) + + cleanup_after_api_halt(logger, engine) + + +def cleanup_after_api_halt(logger, engine): + + queues = ['frontend','celery'] + connection_string = get_value("RabbitMQ", "connection_string") + + clear_rabbitmq_messages(connection_string, queues, logger) + clear_redis_caches(logger) + +def get_augur_api_processes(): + augur_api_processes = [] + for process in psutil.process_iter(['cmdline', 'name', 'environ']): + if process.info['cmdline'] is not None and process.info['environ'] is not None: + try: + if is_api_process(process): + augur_api_processes.append(process) + except (KeyError, FileNotFoundError): + pass + return augur_api_processes + +def is_api_process(process): + + command = ''.join(process.info['cmdline'][:]).lower() + if os.getenv('VIRTUAL_ENV') in process.info['environ']['VIRTUAL_ENV'] and 'python' in command: + + if process.pid != os.getpid(): + + if ("augur.api.server:app" in command or + "augurbackendapi" in command or + ("augur.tasks.init.celery_app.celery_app" in command and "frontend" in command)): + return True + + return False + + diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index 9b6894a7dd..d7a8ad745d 100644 --- a/augur/application/cli/backend.py +++ b/augur/application/cli/backend.py @@ -10,37 +10,30 @@ import logging import psutil import signal -import sys from redis.exceptions import ConnectionError as RedisConnectionError -from celery import chain, signature, group import uuid import traceback from urllib.parse import urlparse -from datetime import datetime -from augur import instance_id -from augur.tasks.util.collection_state import CollectionState from augur.tasks.start_tasks import augur_collection_monitor, create_collection_status_records from augur.tasks.git.facade_tasks import clone_repos from augur.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker import contributor_breadth_model from augur.tasks.init.redis_connection import redis_connection -from augur.application.db.models import Repo, CollectionStatus, UserRepo +from augur.application.db.models import UserRepo from augur.application.db.session import DatabaseSession -from augur.application.db.util import execute_session_query from augur.application.logs import AugurLogger -from augur.application.config import AugurConfig -from augur.application.cli import test_connection, test_db_connection +from augur.application.db.lib import get_value +from augur.application.cli import test_connection, test_db_connection, with_database, DatabaseContext import sqlalchemy as s -from sqlalchemy import or_, and_ logger = AugurLogger("augur", reset_logfiles=True).get_logger() - @click.group('server', short_help='Commands for controlling the backend API server & data collection workers') -def cli(): - pass +@click.pass_context +def cli(ctx): + ctx.obj = DatabaseContext() @cli.command("start") @click.option("--disable-collection", is_flag=True, default=False, help="Turns off data collection workers") @@ -48,7 +41,9 @@ def cli(): @click.option('--port') @test_connection @test_db_connection -def start(disable_collection, development, port): +@with_database +@click.pass_context +def start(ctx, disable_collection, development, port): """Start Augur's backend server.""" try: @@ -70,14 +65,12 @@ def start(disable_collection, development, port): except FileNotFoundError: logger.error("\n\nPlease run augur commands in the root directory\n\n") - with DatabaseSession(logger) as db_session: - config = AugurConfig(logger, db_session) - host = config.get_value("Server", "host") + host = get_value("Server", "host") - if not port: - port = config.get_value("Server", "port") - - worker_vmem_cap = config.get_value("Celery", 'worker_process_vmem_cap') + if not port: + port = get_value("Server", "port") + + worker_vmem_cap = get_value("Celery", 'worker_process_vmem_cap') gunicorn_command = f"gunicorn -c {gunicorn_location} -b {host}:{port} augur.api.server:app --log-file gunicorn.log" server = subprocess.Popen(gunicorn_command.split(" ")) @@ -92,16 +85,14 @@ def start(disable_collection, development, port): logger.info("Deleting old task schedule") os.remove("celerybeat-schedule.db") - with DatabaseSession(logger) as db_session: - config = AugurConfig(logger, db_session) - log_level = config.get_value("Logging", "log_level") - celery_beat_process = None - celery_command = f"celery -A augur.tasks.init.celery_app.celery_app beat -l {log_level.lower()}" - celery_beat_process = subprocess.Popen(celery_command.split(" ")) + log_level = get_value("Logging", "log_level") + celery_beat_process = None + celery_command = f"celery -A augur.tasks.init.celery_app.celery_app beat -l {log_level.lower()}" + celery_beat_process = subprocess.Popen(celery_command.split(" ")) if not disable_collection: - with DatabaseSession(logger) as session: + with DatabaseSession(logger, engine=ctx.obj.engine) as session: clean_collection_status(session) assign_orphan_repos_to_default_user(session) @@ -139,7 +130,7 @@ def start(disable_collection, development, port): if not disable_collection: try: - cleanup_after_collection_halt(logger) + cleanup_after_collection_halt(logger, ctx.obj.engine) except RedisConnectionError: pass @@ -181,7 +172,7 @@ def determine_worker_processes(ratio,maximum): sleep_time += 6 #20% of estimate, Maximum value of 25 - secondary_num_processes = determine_worker_processes(.25, 25) + secondary_num_processes = determine_worker_processes(.25, 45) logger.info(f"Starting secondary worker processes with concurrency={secondary_num_processes}") secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_num_processes} -n secondary:{uuid.uuid4().hex}@%h -Q secondary" process_list.append(subprocess.Popen(secondary_worker.split(" "))) @@ -201,24 +192,32 @@ def determine_worker_processes(ratio,maximum): @cli.command('stop') -def stop(): +@test_connection +@test_db_connection +@with_database +@click.pass_context +def stop(ctx): """ Sends SIGTERM to all Augur server & worker processes """ logger = logging.getLogger("augur.cli") - augur_stop(signal.SIGTERM, logger) + augur_stop(signal.SIGTERM, logger, ctx.obj.engine) @cli.command('kill') -def kill(): +@test_connection +@test_db_connection +@with_database +@click.pass_context +def kill(ctx): """ Sends SIGKILL to all Augur server & worker processes """ logger = logging.getLogger("augur.cli") - augur_stop(signal.SIGKILL, logger) + augur_stop(signal.SIGKILL, logger, ctx.obj.engine) -def augur_stop(signal, logger): +def augur_stop(signal, logger, engine): """ Stops augur with the given signal, and cleans up collection if it was running @@ -231,15 +230,15 @@ def augur_stop(signal, logger): _broadcast_signal_to_processes(augur_processes, broadcast_signal=signal, given_logger=logger) if "celery" in process_names: - cleanup_after_collection_halt(logger) + cleanup_after_collection_halt(logger, engine) -def cleanup_after_collection_halt(logger): +def cleanup_after_collection_halt(logger, engine): clear_redis_caches() - connection_string = "" - with DatabaseSession(logger) as session: - config = AugurConfig(logger, session) - connection_string = config.get_section("RabbitMQ")['connection_string'] + + connection_string = get_value("RabbitMQ", "connection_string") + + with DatabaseSession(logger, engine=engine) as session: clean_collection_status(session) diff --git a/augur/application/cli/collection.py b/augur/application/cli/collection.py new file mode 100644 index 0000000000..7d65cad978 --- /dev/null +++ b/augur/application/cli/collection.py @@ -0,0 +1,304 @@ +#SPDX-License-Identifier: MIT +""" +Augur library commands for controlling the backend components +""" +import resource +import os +import time +import subprocess +import click +import logging +import psutil +import signal +from redis.exceptions import ConnectionError as RedisConnectionError +import uuid +import traceback +import sqlalchemy as s + +from augur.tasks.start_tasks import augur_collection_monitor, create_collection_status_records +from augur.tasks.git.facade_tasks import clone_repos +from augur.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker import contributor_breadth_model +from augur.application.db.models import UserRepo +from augur.application.db.session import DatabaseSession +from augur.application.logs import AugurLogger +from augur.application.db.lib import get_value +from augur.application.cli import test_connection, test_db_connection, with_database +from augur.application.cli._cli_util import _broadcast_signal_to_processes, raise_open_file_limit, clear_redis_caches, clear_rabbitmq_messages + +logger = AugurLogger("augur", reset_logfiles=True).get_logger() + +@click.group('server', short_help='Commands for controlling the backend API server & data collection workers') +def cli(): + pass + +@cli.command("start") +@click.option("--development", is_flag=True, default=False, help="Enable development mode, implies --disable-collection") +@test_connection +@test_db_connection +@with_database +@click.pass_context +def start(ctx, development): + """Start Augur's backend server.""" + + try: + if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1": + raise_open_file_limit(100000) + except Exception as e: + logger.error( + ''.join(traceback.format_exception(None, e, e.__traceback__))) + + logger.error("Failed to raise open file limit!") + raise e + + if development: + os.environ["AUGUR_DEV"] = "1" + logger.info("Starting in development mode") + + worker_vmem_cap = get_value("Celery", 'worker_process_vmem_cap') + + processes = start_celery_collection_processes(float(worker_vmem_cap)) + + if os.path.exists("celerybeat-schedule.db"): + logger.info("Deleting old task schedule") + os.remove("celerybeat-schedule.db") + + log_level = get_value("Logging", "log_level") + celery_beat_process = None + celery_command = f"celery -A augur.tasks.init.celery_app.celery_app beat -l {log_level.lower()}" + celery_beat_process = subprocess.Popen(celery_command.split(" ")) + + + with DatabaseSession(logger, ctx.obj.engine) as session: + + clean_collection_status(session) + assign_orphan_repos_to_default_user(session) + + create_collection_status_records.si().apply_async() + time.sleep(3) + + contributor_breadth_model.si().apply_async() + + # start cloning repos when augur starts + clone_repos.si().apply_async() + + augur_collection_monitor.si().apply_async() + + + try: + processes[0].wait() + except KeyboardInterrupt: + + logger.info("Shutting down all celery worker processes") + for p in processes: + if p: + p.terminate() + + if celery_beat_process: + logger.info("Shutting down celery beat process") + celery_beat_process.terminate() + try: + cleanup_after_collection_halt(logger, ctx.obj.engine) + except RedisConnectionError: + pass + +def start_celery_collection_processes(vmem_cap_ratio): + + #Calculate process scaling based on how much memory is available on the system in bytes. + #Each celery process takes ~500MB or 500 * 1024^2 bytes + + process_list = [] + + #Cap memory usage to 30% of total virtual memory + available_memory_in_bytes = psutil.virtual_memory().total * vmem_cap_ratio + available_memory_in_megabytes = available_memory_in_bytes / (1024 ** 2) + max_process_estimate = available_memory_in_megabytes // 500 + sleep_time = 0 + + #Get a subset of the maximum processes available using a ratio, not exceeding a maximum value + def determine_worker_processes(ratio,maximum): + return max(min(round(max_process_estimate * ratio),maximum),1) + + #2 processes are always reserved as a baseline. + scheduling_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=2 -n scheduling:{uuid.uuid4().hex}@%h -Q scheduling" + max_process_estimate -= 2 + process_list.append(subprocess.Popen(scheduling_worker.split(" "))) + sleep_time += 6 + + #60% of estimate, Maximum value of 45 + core_num_processes = determine_worker_processes(.6, 45) + logger.info(f"Starting core worker processes with concurrency={core_num_processes}") + core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={core_num_processes} -n core:{uuid.uuid4().hex}@%h" + process_list.append(subprocess.Popen(core_worker.split(" "))) + sleep_time += 6 + + #20% of estimate, Maximum value of 25 + secondary_num_processes = determine_worker_processes(.25, 45) + logger.info(f"Starting secondary worker processes with concurrency={secondary_num_processes}") + secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_num_processes} -n secondary:{uuid.uuid4().hex}@%h -Q secondary" + process_list.append(subprocess.Popen(secondary_worker.split(" "))) + sleep_time += 6 + + #15% of estimate, Maximum value of 20 + facade_num_processes = determine_worker_processes(.15, 20) + logger.info(f"Starting facade worker processes with concurrency={facade_num_processes}") + facade_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={facade_num_processes} -n facade:{uuid.uuid4().hex}@%h -Q facade" + + process_list.append(subprocess.Popen(facade_worker.split(" "))) + sleep_time += 6 + + time.sleep(sleep_time) + + return process_list + + +@cli.command('stop') +@with_database +@click.pass_context +def stop(ctx): + """ + Sends SIGTERM to all Augur server & worker processes + """ + logger = logging.getLogger("augur.cli") + + augur_stop(signal.SIGTERM, logger, ctx.obj.engine) + +@cli.command('kill') +@with_database +@click.pass_context +def kill(ctx): + """ + Sends SIGKILL to all Augur server & worker processes + """ + logger = logging.getLogger("augur.cli") + augur_stop(signal.SIGKILL, logger, ctx.obj.engine) + +@cli.command('repo-reset') +@test_connection +@test_db_connection +@with_database +@click.pass_context +def repo_reset(ctx): + """ + Refresh repo collection to force data collection + """ + with ctx.obj.engine.connect() as connection: + connection.execute(s.sql.text(""" + UPDATE augur_operations.collection_status + SET core_status='Pending',core_task_id = NULL, core_data_last_collected = NULL; + + UPDATE augur_operations.collection_status + SET secondary_status='Pending',secondary_task_id = NULL, secondary_data_last_collected = NULL; + + UPDATE augur_operations.collection_status + SET facade_status='Pending', facade_task_id=NULL, facade_data_last_collected = NULL; + + TRUNCATE augur_data.commits CASCADE; + """)) + + logger.info("Repos successfully reset") + +@cli.command('processes') +def processes(): + """ + Outputs the name/PID of all Augur server & worker processes""" + augur_processes = get_augur_collection_processes() + for process in augur_processes: + logger.info(f"Found process {process.pid}") + +def get_augur_collection_processes(): + augur_processes = [] + for process in psutil.process_iter(['cmdline', 'name', 'environ']): + if process.info['cmdline'] is not None and process.info['environ'] is not None: + try: + if is_collection_process(process): + augur_processes.append(process) + except (KeyError, FileNotFoundError): + pass + return augur_processes + +def is_collection_process(process): + + command = ''.join(process.info['cmdline'][:]).lower() + if os.getenv('VIRTUAL_ENV') in process.info['environ']['VIRTUAL_ENV'] and 'python' in command: + if process.pid != os.getpid(): + + if "augurbackendcollection" in command or "celery_app.celery_appbeat" in command: + return True + if "augur.tasks.init.celery_app.celery_app" in command: + + if ("scheduling" in command or + "facade" in command or + "secondary" in command or + "core" in command): + + return True + + return False + + +def augur_stop(signal, logger, engine): + """ + Stops augur with the given signal, + and cleans up collection if it was running + """ + + augur_collection_processes = get_augur_collection_processes() + + _broadcast_signal_to_processes(augur_collection_processes, logger=logger, broadcast_signal=signal) + + cleanup_after_collection_halt(logger, engine) + +def cleanup_after_collection_halt(logger, engine): + + queues = ['celery', 'core', 'secondary','scheduling','facade'] + + connection_string = get_value("RabbitMQ", "connection_string") + + with DatabaseSession(logger, engine) as session: + clean_collection_status(session) + + clear_rabbitmq_messages(connection_string, queues, logger) + clear_redis_caches(logger) + +#Make sure that database reflects collection status when processes are killed/stopped. +def clean_collection_status(session): + session.execute_sql(s.sql.text(""" + UPDATE augur_operations.collection_status + SET core_status='Pending',core_task_id = NULL + WHERE core_status='Collecting' AND core_data_last_collected IS NULL; + + UPDATE augur_operations.collection_status + SET core_status='Success',core_task_id = NULL + WHERE core_status='Collecting' AND core_data_last_collected IS NOT NULL; + + UPDATE augur_operations.collection_status + SET secondary_status='Pending',secondary_task_id = NULL + WHERE secondary_status='Collecting' AND secondary_data_last_collected IS NULL; + + UPDATE augur_operations.collection_status + SET secondary_status='Success',secondary_task_id = NULL + WHERE secondary_status='Collecting' AND secondary_data_last_collected IS NOT NULL; + + UPDATE augur_operations.collection_status + SET facade_status='Update', facade_task_id=NULL + WHERE facade_status LIKE '%Collecting%' and facade_data_last_collected IS NULL; + + UPDATE augur_operations.collection_status + SET facade_status='Success', facade_task_id=NULL + WHERE facade_status LIKE '%Collecting%' and facade_data_last_collected IS NOT NULL; + + UPDATE augur_operations.collection_status + SET facade_status='Pending', facade_task_id=NULL + WHERE facade_status='Failed Clone' OR facade_status='Initializing'; + """)) + #TODO: write timestamp for currently running repos. + +def assign_orphan_repos_to_default_user(session): + query = s.sql.text(""" + SELECT repo_id FROM repo WHERE repo_id NOT IN (SELECT repo_id FROM augur_operations.user_repos) + """) + + repos = session.execute_sql(query).fetchall() + + for repo in repos: + UserRepo.insert(session,repo[0],1) diff --git a/augur/application/cli/config.py b/augur/application/cli/config.py index b748dd30b0..e9786e1ef8 100644 --- a/augur/application/cli/config.py +++ b/augur/application/cli/config.py @@ -7,13 +7,10 @@ import json import logging -from pathlib import Path - from augur.application.db.models import Config from augur.application.db.session import DatabaseSession -from augur.application.logs import AugurLogger from augur.application.config import AugurConfig -from augur.application.cli import test_connection, test_db_connection +from augur.application.cli import DatabaseContext, test_connection, test_db_connection, with_database from augur.util.inspect_without_import import get_phase_names_without_import ROOT_AUGUR_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))) @@ -22,8 +19,9 @@ ENVVAR_PREFIX = "AUGUR_" @click.group('config', short_help='Generate an augur.config.json') -def cli(): - pass +@click.pass_context +def cli(ctx): + ctx.obj = DatabaseContext() @cli.command('init') @click.option('--github-api-key', help="GitHub API key for data collection from the GitHub API", envvar=ENVVAR_PREFIX + 'GITHUB_API_KEY') @@ -33,8 +31,10 @@ def cli(): @click.option('--rabbitmq-conn-string', help="String to connect to rabbitmq broker", envvar=ENVVAR_PREFIX + 'RABBITMQ_CONN_STRING') @test_connection @test_db_connection -def init_config(github_api_key, facade_repo_directory, gitlab_api_key, redis_conn_string, rabbitmq_conn_string): - print(__file__, ROOT_AUGUR_DIRECTORY, "\n", Path.cwd()) +@with_database +@click.pass_context +def init_config(ctx, github_api_key, facade_repo_directory, gitlab_api_key, redis_conn_string, rabbitmq_conn_string): + if not github_api_key: github_api_key = str(input("Please enter a valid github api key: ")) @@ -63,7 +63,7 @@ def init_config(github_api_key, facade_repo_directory, gitlab_api_key, redis_con keys["github_api_key"] = github_api_key keys["gitlab_api_key"] = gitlab_api_key - with DatabaseSession(logger) as session: + with DatabaseSession(logger, engine=ctx.obj.engine) as session: config = AugurConfig(logger, session) @@ -108,9 +108,11 @@ def init_config(github_api_key, facade_repo_directory, gitlab_api_key, redis_con @click.option('--file', required=True) @test_connection @test_db_connection -def load_config(file): +@with_database +@click.pass_context +def load_config(ctx, file): - with DatabaseSession(logger) as session: + with DatabaseSession(logger, engine=ctx.obj.engine) as session: config = AugurConfig(logger, session) print("WARNING: This will override your current config") @@ -131,9 +133,11 @@ def load_config(file): @click.option('--file', required=True) @test_connection @test_db_connection -def add_section(section_name, file): +@with_database +@click.pass_context +def add_section(ctx, section_name, file): - with DatabaseSession(logger) as session: + with DatabaseSession(logger, engine=ctx.obj.engine) as session: config = AugurConfig(logger, session) if config.is_section_in_config(section_name): @@ -160,9 +164,11 @@ def add_section(section_name, file): @click.option('--data-type') @test_connection @test_db_connection -def config_set(section, setting, value, data_type): +@with_database +@click.pass_context +def config_set(ctx, section, setting, value, data_type): - with DatabaseSession(logger) as session: + with DatabaseSession(logger, engine=ctx.obj.engine) as session: config = AugurConfig(logger, session) if not data_type: @@ -190,9 +196,11 @@ def config_set(section, setting, value, data_type): @click.option('--setting') @test_connection @test_db_connection -def config_get(section, setting): +@with_database +@click.pass_context +def config_get(ctx, section, setting): - with DatabaseSession(logger) as session: + with DatabaseSession(logger, engine=ctx.obj.engine) as session: config = AugurConfig(logger, session) if setting: @@ -236,9 +244,11 @@ def config_get_all_json(): @cli.command('clear') @test_connection @test_db_connection -def clear_config(): +@with_database +@click.pass_context +def clear_config(ctx): - with DatabaseSession(logger) as session: + with DatabaseSession(logger, ctx.obj.engine) as session: config = AugurConfig(logger, session) if not config.empty(): diff --git a/augur/application/cli/db.py b/augur/application/cli/db.py index 42d57ecc6b..c20fcf0b2e 100644 --- a/augur/application/cli/db.py +++ b/augur/application/cli/db.py @@ -1,10 +1,8 @@ # SPDX-License-Identifier: MIT -from os import walk, chdir, environ, chmod, path import os +from os import environ, chmod, path, getenv, stat import logging from sys import exit -import stat -from collections import OrderedDict from subprocess import call import random import string @@ -12,73 +10,88 @@ import click import sqlalchemy as s import pandas as pd -import requests import json -import sqlalchemy as s import re +import stat as stat_module -from augur.application.cli import test_connection, test_db_connection +from augur.application.cli import ( + test_connection, + test_db_connection, + with_database, + DatabaseContext, +) from augur.application.db.session import DatabaseSession -from augur.application.logs import AugurLogger -from augur.application.db.engine import DatabaseEngine from sqlalchemy import update from datetime import datetime from augur.application.db.models import Repo logger = logging.getLogger(__name__) + @click.group("db", short_help="Database utilities") -def cli(): - pass +@click.pass_context +def cli(ctx): + ctx.obj = DatabaseContext() @cli.command("add-repos") @click.argument("filename", type=click.Path(exists=True)) @test_connection @test_db_connection -def add_repos(filename): - """Add repositories to Augur's database. +@with_database +@click.pass_context +def add_repos(ctx, filename): + """Add repositories to Augur's database. The .csv file format should be repo_url,group_id NOTE: The Group ID must already exist in the REPO_Groups Table. - If you want to add an entire GitHub organization, refer to the command: augur db add-github-org""" + If you want to add an entire GitHub organization, refer to the command: augur db add-github-org""" from augur.tasks.github.util.github_task_session import GithubTaskSession from augur.util.repo_load_controller import RepoLoadController - with GithubTaskSession(logger) as session: - + with GithubTaskSession(logger, engine=ctx.obj.engine) as session: controller = RepoLoadController(session) + line_total = len(open(filename).readlines()) with open(filename) as upload_repos_file: data = csv.reader(upload_repos_file, delimiter=",") - for row in data: - + for line_num, row in enumerate(data): repo_data = {} repo_data["url"] = row[0] try: repo_data["repo_group_id"] = int(row[1]) except ValueError: - print(f"Invalid repo group_id: {row[1]} for Git url: `{repo_data['url']}`") + print( + f"Invalid repo group_id: {row[1]} for Git url: `{repo_data['url']}`" + ) continue - + print( - f"Inserting repo with Git URL `{repo_data['url']}` into repo group {repo_data['repo_group_id']}") - controller.add_cli_repo(repo_data) + f"Inserting repo {line_num}/{line_total} with Git URL `{repo_data['url']}` into repo group {repo_data['repo_group_id']}" + ) + succeeded, message = controller.add_cli_repo(repo_data) + if not succeeded: + logger.error(f"insert repo failed with error: {message['status']}`") + else: + logger.info(f"Repo added: {repo_data}") + print("Success") @cli.command("get-repo-groups") @test_connection @test_db_connection -def get_repo_groups(): +@with_database +@click.pass_context +def get_repo_groups(ctx): """ List all repo groups and their associated IDs """ - with DatabaseEngine() as engine, engine.connect() as connection: + with ctx.obj.engine.connect() as connection: df = pd.read_sql( s.sql.text( "SELECT repo_group_id, rg_name, rg_description FROM augur_data.repo_groups" @@ -86,21 +99,21 @@ def get_repo_groups(): connection, ) print(df) - engine.dispose() return df @cli.command("add-repo-groups") +@click.argument("filename", type=click.Path(exists=True)) @test_connection @test_db_connection -@click.argument("filename", type=click.Path(exists=True)) -def add_repo_groups(filename): +@with_database +@click.pass_context +def add_repo_groups(ctx, filename): """ Create new repo groups in Augur's database """ - with DatabaseEngine() as engine, engine.begin() as connection: - + with ctx.obj.engine.begin() as connection: df = pd.read_sql( s.sql.text("SELECT repo_group_id FROM augur_data.repo_groups"), connection, @@ -116,7 +129,6 @@ def add_repo_groups(filename): with open(filename) as create_repo_groups_file: data = csv.reader(create_repo_groups_file, delimiter=",") for row in data: - # Handle case where there's a hanging empty row. if not row: logger.info("Skipping empty data...") @@ -126,53 +138,51 @@ def add_repo_groups(filename): if int(row[0]) not in repo_group_IDs: repo_group_IDs.append(int(row[0])) connection.execute( - insert_repo_group_sql, - repo_group_id=int(row[0]), - repo_group_name=row[1], + insert_repo_group_sql.bindparams( + repo_group_id=int(row[0]), + repo_group_name=row[1], + ) ) else: logger.info( f"Repo group with ID {row[1]} for repo group {row[1]} already exists, skipping..." ) - engine.dispose() - @cli.command("add-github-org") @click.argument("organization_name") @test_connection @test_db_connection -def add_github_org(organization_name): +@with_database +@click.pass_context +def add_github_org(ctx, organization_name): """ Create new repo groups in Augur's database """ from augur.tasks.github.util.github_task_session import GithubTaskSession from augur.util.repo_load_controller import RepoLoadController - with GithubTaskSession(logger) as session: - + with GithubTaskSession(logger, engine=ctx.obj.engine) as session: controller = RepoLoadController(session) controller.add_cli_org(organization_name) -# get_db_version is a helper function to print_db_version and upgrade_db_version -def get_db_version(): +# get_db_version is a helper function to print_db_version and upgrade_db_version +def get_db_version(engine): db_version_sql = s.sql.text( """ SELECT * FROM augur_operations.augur_settings WHERE setting = 'augur_data_version' """ ) - with DatabaseEngine() as engine, engine.connect() as connection: - + with engine.connect() as connection: result = int(connection.execute(db_version_sql).fetchone()[2]) engine.dispose() return result - @cli.command("print-db-version") @test_connection @test_db_connection @@ -234,7 +244,9 @@ def generate_api_key(ctx): @click.argument("api_key") @test_connection @test_db_connection -def update_api_key(api_key): +@with_database +@click.pass_context +def update_api_key(ctx, api_key): """ Update the API key in the database to the given key """ @@ -248,18 +260,17 @@ def update_api_key(api_key): """ ) - with DatabaseEngine() as engine, engine.begin() as connection: - + with ctx.obj.engine.begin() as connection: connection.execute(update_api_key_sql, api_key=api_key) logger.info(f"Updated Augur API key to: {api_key}") - engine.dispose() - @cli.command("get-api-key") @test_connection @test_db_connection -def get_api_key(): +@with_database +@click.pass_context +def get_api_key(ctx): get_api_key_sql = s.sql.text( """ SELECT value FROM augur_operations.augur_settings WHERE setting='augur_api_key'; @@ -267,35 +278,34 @@ def get_api_key(): ) try: - with DatabaseEngine() as engine, engine.connect() as connection: + with ctx.obj.engine.connect() as connection: print(connection.execute(get_api_key_sql).fetchone()[0]) except TypeError: print("No Augur API key found.") - engine.dispose() - @cli.command( "check-pgpass", short_help="Check the ~/.pgpass file for Augur's database credentials", ) def check_pgpass(): - augur_db_env_var = os.getenv("AUGUR_DB") + augur_db_env_var = getenv("AUGUR_DB") if augur_db_env_var: - # gets the user, passowrd, host, port, and database_name out of environment variable # assumes database string of structure //:@:/ # it returns a tuple like (, , , , str: +def parse_database_string(db_string: str) -> str: """Parse database string into the following components: username, password, host, port, database """ @@ -91,7 +88,7 @@ def get_database_string() -> str: return db_conn_string -def create_database_engine(url, **kwargs): +def create_database_engine(url: str, **kwargs) -> Engine: """Create sqlalchemy database engine Note: diff --git a/augur/application/db/lib.py b/augur/application/db/lib.py new file mode 100644 index 0000000000..c1da707dbf --- /dev/null +++ b/augur/application/db/lib.py @@ -0,0 +1,97 @@ +import sqlalchemy as s +import logging +from typing import List, Any, Optional +from augur.application.db.models import Config +from augur.application.db import get_session +from augur.application.db.util import execute_session_query + +logger = logging.getLogger("db_lib") + +def convert_type_of_value(config_dict, logger=None): + + + data_type = config_dict["type"] + + if data_type == "str" or data_type is None: + return config_dict + + elif data_type == "int": + config_dict["value"] = int(config_dict["value"]) + + elif data_type == "bool": + value = config_dict["value"] + + if value.lower() == "false": + config_dict["value"] = False + else: + config_dict["value"] = True + + elif data_type == "float": + config_dict["value"] = float(config_dict["value"]) + + else: + if logger: + logger.error(f"Need to add support for {data_type} types to config") + else: + print(f"Need to add support for {data_type} types to config") + + return config_dict + + +def get_section(section_name) -> dict: + """Get a section of data from the config. + + Args: + section_name: The name of the section being retrieved + + Returns: + The section data as a dict + """ + with get_session() as session: + + query = session.query(Config).filter_by(section_name=section_name) + section_data = execute_session_query(query, 'all') + + section_dict = {} + for setting in section_data: + setting_dict = setting.__dict__ + + setting_dict = convert_type_of_value(setting_dict, logger) + + setting_name = setting_dict["setting_name"] + setting_value = setting_dict["value"] + + section_dict[setting_name] = setting_value + + return section_dict + + +def get_value(section_name: str, setting_name: str) -> Optional[Any]: + """Get the value of a setting from the config. + + Args: + section_name: The name of the section that the setting belongs to + setting_name: The name of the setting + + Returns: + The value from config if found, and None otherwise + """ + + with get_session() as session: + + + # TODO temporary until added to the DB schema + if section_name == "frontend" and setting_name == "pagination_offset": + return 25 + + try: + query = session.query(Config).filter(Config.section_name == section_name, Config.setting_name == setting_name) + config_setting = execute_session_query(query, 'one') + except s.orm.exc.NoResultFound: + return None + + setting_dict = config_setting.__dict__ + + setting_dict = convert_type_of_value(setting_dict, logger) + + return setting_dict["value"] diff --git a/augur/application/db/models/augur_data.py b/augur/application/db/models/augur_data.py index 7f97e4bbdc..dffe065353 100644 --- a/augur/application/db/models/augur_data.py +++ b/augur/application/db/models/augur_data.py @@ -22,14 +22,13 @@ from sqlalchemy.orm import relationship from sqlalchemy.sql import text from sqlalchemy.orm.exc import MultipleResultsFound, NoResultFound +from time import sleep, mktime, gmtime, time, localtime import logging import re -from typing import List, Any, Dict import json from augur.application.db.models.base import Base -from augur.application import requires_db_session from augur.application.db.util import execute_session_query DEFAULT_REPO_GROUP_ID = 1 @@ -880,7 +879,13 @@ class Repo(Base): @staticmethod def get_by_id(session, repo_id): - return session.query(Repo).filter(Repo.repo_id == repo_id).first() + try: + return session.query(Repo).filter(Repo.repo_id == repo_id).first() + except Exception as e: + session.rollback() + raise e + + @staticmethod def get_by_repo_git(session, repo_git): @@ -920,6 +925,17 @@ def is_valid_github_repo(gh_session, url: str) -> bool: continue data = result.json() + if result.status_code == 403: #GH Rate limiting + wait_until = int(result.headers.get("x-ratelimit-reset")) + # use time package to find how many seconds to wait + wait_in_seconds = int( + mktime(gmtime(wait_until)) - + mktime(gmtime(time())) + ) + wait_until_time = localtime(wait_until) + logger.error(f"rate limited fetching {url}z") + logger.error(f"sleeping until {wait_until_time.tm_hour}:{wait_until_time.tm_min} ({wait_in_seconds} seconds)") + sleep(wait_in_seconds) # if there was an error return False if "message" in data.keys(): @@ -930,6 +946,8 @@ def is_valid_github_repo(gh_session, url: str) -> bool: return True, {"status": "Valid repo", "repo_type": data["owner"]["type"]} + return False, {"status": "Failed to validate repo after multiple attempts"} + @staticmethod def is_valid_gitlab_repo(gl_session, url: str) -> bool: """Determine whether a GitLab repo URL is valid. @@ -957,6 +975,11 @@ def is_valid_gitlab_repo(gl_session, url: str) -> bool: while attempts < 10: response = hit_api(gl_session.oauths, url, logger) + if wait_in_seconds := response.headers.get("Retry-After") is not None: + logger.info(f"rate limited fetching {url}, sleeping for {wait_in_seconds}") + print(f"rate limited fetching {url}, sleeping for {wait_in_seconds}") + sleep(int(wait_in_seconds)) + if response.status_code == 404: return False, {"status": "Invalid repo"} @@ -964,6 +987,8 @@ def is_valid_gitlab_repo(gl_session, url: str) -> bool: return True, {"status": "Valid repo"} attempts += 1 + logger.info(f"could not validate {url}, will attempt again in {attempts*5} seconds") + sleep(attempts*3) return False, {"status": "Failed to validate repo after multiple attempts"} @@ -1476,7 +1501,7 @@ class LstmAnomalyResult(Base): class Message(Base): __tablename__ = "message" __table_args__ = ( - UniqueConstraint("platform_msg_id", name="message-insert-unique"), + UniqueConstraint("platform_msg_id", "pltfrm_id", name="message-insert-unique"), Index("msg-cntrb-id-idx", "cntrb_id"), Index("platformgrouper", "msg_id", "pltfrm_id"), Index("messagegrouper", "msg_id", "rgls_id", unique=True), diff --git a/augur/application/db/models/augur_operations.py b/augur/application/db/models/augur_operations.py index 47f28b12f2..029444215e 100644 --- a/augur/application/db/models/augur_operations.py +++ b/augur/application/db/models/augur_operations.py @@ -329,6 +329,9 @@ def get_user(session, username: str): return user except NoResultFound: return None + except Exception as e: + session.rollback() + raise e @staticmethod def get_by_id(session, user_id: int): @@ -1073,7 +1076,13 @@ def __eq__(self, other): @staticmethod def get_by_id(session, client_id): - return session.query(ClientApplication).filter(ClientApplication.id == client_id).first() + + try: + return session.query(ClientApplication).filter(ClientApplication.id == client_id).first() + except Exception as e: + session.rollback() + raise e + class Subscription(Base): __tablename__ = "subscriptions" diff --git a/augur/application/db/session.py b/augur/application/db/session.py index 22379ad050..1bc9878e82 100644 --- a/augur/application/db/session.py +++ b/augur/application/db/session.py @@ -1,9 +1,5 @@ -import os -import re import time -import sys import random -import logging from sqlalchemy.orm import Session from sqlalchemy.dialects import postgresql from sqlalchemy.exc import OperationalError @@ -12,8 +8,7 @@ from psycopg2.errors import DeadlockDetected # from augur.tasks.util.random_key_auth import RandomKeyAuth -from augur.application.db.engine import EngineConnection -from augur.tasks.util.worker_util import remove_duplicate_dicts, remove_duplicates_by_uniques +from augur.tasks.util.worker_util import remove_duplicates_by_uniques def remove_null_characters_from_string(string): diff --git a/augur/application/log_analysis/http/http_server.py b/augur/application/log_analysis/http/http_server.py index c9f39bae7d..80fe4d0490 100644 --- a/augur/application/log_analysis/http/http_server.py +++ b/augur/application/log_analysis/http/http_server.py @@ -1,7 +1,6 @@ import http.server import socketserver import logging -import cgi import json from pathlib import Path import re diff --git a/augur/application/logs.py b/augur/application/logs.py index 2c976c2af7..11e1cb6ea5 100644 --- a/augur/application/logs.py +++ b/augur/application/logs.py @@ -3,17 +3,11 @@ import logging import logging.config import logging.handlers -from logging import FileHandler, StreamHandler, Formatter -from multiprocessing import Process, Queue, Event, current_process -from inspect import getmembers, isfunction -from time import sleep +from logging import FileHandler import os from pathlib import Path -import atexit import shutil import coloredlogs -from copy import deepcopy -import typing from sqlalchemy.orm import Session from augur.application.db.models import Config diff --git a/augur/application/schema/alembic/env.py b/augur/application/schema/alembic/env.py index 94127a43be..b6be9dee05 100644 --- a/augur/application/schema/alembic/env.py +++ b/augur/application/schema/alembic/env.py @@ -1,13 +1,10 @@ from logging.config import fileConfig -from sqlalchemy import engine_from_config -from sqlalchemy import pool from alembic import context from augur.application.db.models.base import Base -from augur.application.db.engine import DatabaseEngine, get_database_string +from augur.application.db.engine import get_database_string from sqlalchemy import create_engine, event -from sqlalchemy.pool import NullPool # this is the Alembic Config object, which provides # access to the values within the .ini file in use. diff --git a/augur/application/schema/alembic/versions/18_schedule_any_old_facade_repositories_to.py b/augur/application/schema/alembic/versions/18_schedule_any_old_facade_repositories_to.py index 84f8f088b1..bbe22c724a 100644 --- a/augur/application/schema/alembic/versions/18_schedule_any_old_facade_repositories_to.py +++ b/augur/application/schema/alembic/versions/18_schedule_any_old_facade_repositories_to.py @@ -6,13 +6,11 @@ """ from alembic import op -import sqlalchemy as sa -from sqlalchemy.dialects import postgresql from sqlalchemy.sql import text import pathlib import shutil from augur.application.config import AugurConfig -from augur.application.db.session import DatabaseSession +from augur.application.db.lib import get_value import logging # revision identifiers, used by Alembic. @@ -43,9 +41,7 @@ def total_facade_reset(): try: - with DatabaseSession(logger) as session: - config = AugurConfig(logger, session) - facade_base_dir = config.get_section("Facade")['repo_directory'] + facade_base_dir = get_value("Facade", "repo_directory") #remove path path = pathlib.Path(facade_base_dir) diff --git a/augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py b/augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py index 24d7fd08c6..29da454ed6 100644 --- a/augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py +++ b/augur/application/schema/alembic/versions/19_add_extra_celery_options_to_the_config_.py @@ -6,7 +6,6 @@ """ from alembic import op -import sqlalchemy as sa from augur.application.db.session import DatabaseSession from augur.application.config import * from sqlalchemy.sql import text diff --git a/augur/application/schema/alembic/versions/27_update_messages_unique.py b/augur/application/schema/alembic/versions/27_update_messages_unique.py new file mode 100644 index 0000000000..9c60349412 --- /dev/null +++ b/augur/application/schema/alembic/versions/27_update_messages_unique.py @@ -0,0 +1,33 @@ +""" Update messages unique + +Revision ID: 27 +Revises: 26 +Create Date: 2024-03-10 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy import text + +# revision identifiers, used by Alembic. +revision = '27' +down_revision = '26' +branch_labels = None +depends_on = None + + +schema_name = 'augur_data' +table_name = "message" +constraint_name = "message-insert-unique" + +def upgrade(): + + op.drop_constraint(constraint_name, table_name, schema=schema_name, type_='unique') + + op.create_unique_constraint(constraint_name, table_name, ['platform_msg_id', 'pltfrm_id'], schema=schema_name) + +def downgrade(): + + op.drop_constraint(constraint_name, table_name, schema=schema_name, type_='unique') + + op.create_unique_constraint(constraint_name, table_name, ['platform_msg_id'], schema=schema_name) diff --git a/augur/application/schema/alembic/versions/28_Performance_Indexes_a.py b/augur/application/schema/alembic/versions/28_Performance_Indexes_a.py new file mode 100644 index 0000000000..906cb2c121 --- /dev/null +++ b/augur/application/schema/alembic/versions/28_Performance_Indexes_a.py @@ -0,0 +1,93 @@ +""" Updating materialized views and associated indices + +Revision ID: 28 +Revises: 27 +Create Date: 2023-08-23 18:17:22.651191 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql +from sqlalchemy import text + +# revision identifiers, used by Alembic. +revision = '28' +down_revision = '27' +branch_labels = None +depends_on = None + +def upgrade(): + + mview_keys_28() + +def mview_keys_28(upgrade=True): + + if upgrade: + conn = op.get_bind() + conn.execute(text(""" + + DROP INDEX if exists "pr_ID_prs_table"; + DROP INDEX if exists "pr_id_pr_files"; + DROP INDEX if exists "pr_id_pr_reviews"; + DROP materialized view if exists augur_data.explorer_repo_languages; + + + + CREATE INDEX "pr_ID_prs_table" ON "augur_data"."pull_requests" USING btree ( + "pull_request_id" "pg_catalog"."int8_ops" ASC NULLS LAST + ); + + CREATE INDEX "pr_id_pr_files" ON "augur_data"."pull_request_files" USING btree ( + "pull_request_id" "pg_catalog"."int8_ops" ASC NULLS LAST + ); + + CREATE INDEX "pr_id_pr_reviews" ON "augur_data"."pull_request_reviews" USING btree ( + "pull_request_id" "pg_catalog"."int8_ops" ASC NULLS LAST + );""")) + + conn = op.get_bind() + conn.execute(text(""" + CREATE MATERIALIZED VIEW augur_data.explorer_repo_languages as + SELECT e.repo_id, + repo.repo_git, + repo.repo_name, + e.programming_language, + e.code_lines, + e.files + FROM augur_data.repo, + ( SELECT d.repo_id, + d.programming_language, + sum(d.code_lines) AS code_lines, + (count(*))::integer AS files + FROM ( SELECT repo_labor.repo_id, + repo_labor.programming_language, + repo_labor.code_lines + FROM augur_data.repo_labor, + ( SELECT repo_labor_1.repo_id, + max(repo_labor_1.data_collection_date) AS last_collected + FROM augur_data.repo_labor repo_labor_1 + GROUP BY repo_labor_1.repo_id) recent + WHERE ((repo_labor.repo_id = recent.repo_id) AND (repo_labor.data_collection_date > (recent.last_collected - ((5)::double precision * '00:01:00'::interval))))) d + GROUP BY d.repo_id, d.programming_language) e + WHERE (repo.repo_id = e.repo_id) + ORDER BY e.repo_id;""")) + + conn.execute(text("""COMMIT;""")) + + conn = op.get_bind() + conn.execute(text(""" + CREATE UNIQUE INDEX ON augur_data.explorer_repo_languages(repo_id, programming_language); """)) + conn.execute(text("""COMMIT;""")) +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + conn = op.get_bind() + + #Make unique initially deferred + conn.execute(text(f""" + DROP INDEX if exists "pr_ID_prs_table"; + DROP INDEX if exists "pr_id_pr_files"; + DROP INDEX if exists "pr_id_pr_reviews"; + DROP materialized view if exists augur_data.explorer_repo_languages; + """)) + + # ### end Alembic commands ### diff --git a/augur/application/schema/alembic/versions/3_oauth_and_user_groups.py b/augur/application/schema/alembic/versions/3_oauth_and_user_groups.py index 0d9c6d744a..9a19512514 100644 --- a/augur/application/schema/alembic/versions/3_oauth_and_user_groups.py +++ b/augur/application/schema/alembic/versions/3_oauth_and_user_groups.py @@ -10,7 +10,6 @@ from alembic import op import sqlalchemy as sa from augur.application.db.session import DatabaseSession -from augur.application.db.models.augur_operations import UserGroup, UserRepo CLI_USER_ID = 1 diff --git a/augur/application/schema/alembic/versions/7_no_null_repo_path_and_repo_name.py b/augur/application/schema/alembic/versions/7_no_null_repo_path_and_repo_name.py index f856d938bb..e2711ca197 100644 --- a/augur/application/schema/alembic/versions/7_no_null_repo_path_and_repo_name.py +++ b/augur/application/schema/alembic/versions/7_no_null_repo_path_and_repo_name.py @@ -6,10 +6,7 @@ """ from alembic import op -import sqlalchemy as sa -from sqlalchemy.dialects import postgresql from sqlalchemy.sql import text -import re from augur.application.db.models import Repo diff --git a/augur/tasks/data_analysis/__init__.py b/augur/tasks/data_analysis/__init__.py index 3787493523..b600bcac77 100644 --- a/augur/tasks/data_analysis/__init__.py +++ b/augur/tasks/data_analysis/__init__.py @@ -1,8 +1,4 @@ -from augur.application.db.session import DatabaseSession -from augur.application.db.models import Repo -from augur.application.db.util import execute_session_query -from celery import group, chain, chord, signature -from augur.tasks.init.celery_app import celery_app as celery +from celery import chain import logging def machine_learning_phase(repo_git): @@ -12,8 +8,6 @@ def machine_learning_phase(repo_git): from augur.tasks.data_analysis.message_insights.tasks import message_insight_task from augur.tasks.data_analysis.pull_request_analysis_worker.tasks import pull_request_analysis_task - from augur.tasks.init.celery_app import engine - logger = logging.getLogger(machine_learning_phase.__name__) ml_tasks = [] diff --git a/augur/tasks/data_analysis/clustering_worker/tasks.py b/augur/tasks/data_analysis/clustering_worker/tasks.py index c102e6c227..e59951ab0e 100644 --- a/augur/tasks/data_analysis/clustering_worker/tasks.py +++ b/augur/tasks/data_analysis/clustering_worker/tasks.py @@ -21,7 +21,7 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.application.db.session import DatabaseSession -from augur.application.config import AugurConfig +from augur.application.db.lib import get_value from augur.application.db.models import Repo, RepoClusterMessage, RepoTopic, TopicWord from augur.application.db.util import execute_session_query from augur.tasks.init.celery_app import AugurMlRepoCollectionTask @@ -31,11 +31,11 @@ stemmer = nltk.stem.snowball.SnowballStemmer("english") -@celery.task(base=AugurMlRepoCollectionTask) -def clustering_task(repo_git): +@celery.task(base=AugurMlRepoCollectionTask, bind=True) +def clustering_task(self, repo_git): logger = logging.getLogger(clustering_model.__name__) - from augur.tasks.init.celery_app import engine + engine = self.app.engine with DatabaseSession(logger, engine) as session: clustering_model(repo_git, logger, engine, session) @@ -56,15 +56,13 @@ def clustering_model(repo_git: str,logger,engine, session) -> None: tool_version = '0.2.0' data_source = 'Augur Collected Messages' - config = AugurConfig(logger, session) - query = session.query(Repo).filter(Repo.repo_git == repo_git) repo_id = execute_session_query(query, 'one').repo_id - num_clusters = config.get_value("Clustering_Task", 'num_clusters') - max_df = config.get_value("Clustering_Task", 'max_df') - max_features = config.get_value("Clustering_Task", 'max_features') - min_df = config.get_value("Clustering_Task", 'min_df') + num_clusters = get_value("Clustering_Task", 'num_clusters') + max_df = get_value("Clustering_Task", 'max_df') + max_features = get_value("Clustering_Task", 'max_features') + min_df = get_value("Clustering_Task", 'min_df') logger.info(f"Min df: {min_df}. Max df: {max_df}") diff --git a/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py b/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py index 4521a722e2..15660e763b 100644 --- a/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py +++ b/augur/tasks/data_analysis/contributor_breadth_worker/contributor_breadth_worker.py @@ -1,6 +1,5 @@ #SPDX-License-Identifier: MIT -import logging, json -import pandas as pd +import logging import sqlalchemy as s from datetime import datetime @@ -15,10 +14,10 @@ ### Logic: For each unique platform contributor, gather non duplicate events, using the GitHub "id" ### for the event API (GitLab coming!) -@celery.task -def contributor_breadth_model() -> None: +@celery.task(bind=True) +def contributor_breadth_model(self) -> None: - from augur.tasks.init.celery_app import engine + engine = self.app.engine logger = logging.getLogger(contributor_breadth_model.__name__) diff --git a/augur/tasks/data_analysis/discourse_analysis/tasks.py b/augur/tasks/data_analysis/discourse_analysis/tasks.py index 5a9941679c..450ec15a29 100644 --- a/augur/tasks/data_analysis/discourse_analysis/tasks.py +++ b/augur/tasks/data_analysis/discourse_analysis/tasks.py @@ -32,14 +32,13 @@ ROOT_AUGUR_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))) DISCOURSE_ANALYSIS_DIR = f"{ROOT_AUGUR_DIRECTORY}/tasks/data_analysis/discourse_analysis/" -@celery.task(base=AugurMlRepoCollectionTask) -def discourse_analysis_task(repo_git): +@celery.task(base=AugurMlRepoCollectionTask, bind=True) +def discourse_analysis_task(self, repo_git): logger = logging.getLogger(discourse_analysis_task.__name__) - from augur.tasks.init.celery_app import engine + engine = self.app.engine - with DatabaseSession(logger, engine) as session: - discourse_analysis_model(repo_git, logger, engine) + discourse_analysis_model(repo_git, logger, engine) def discourse_analysis_model(repo_git: str,logger,engine) -> None: diff --git a/augur/tasks/data_analysis/insight_worker/tasks.py b/augur/tasks/data_analysis/insight_worker/tasks.py index 37ae5f484c..5bf159d2fa 100644 --- a/augur/tasks/data_analysis/insight_worker/tasks.py +++ b/augur/tasks/data_analysis/insight_worker/tasks.py @@ -1,11 +1,7 @@ # SPDX-License-Identifier: MIT -from multiprocessing import Process, Queue -from urllib.parse import urlparse import requests import pandas as pd import sqlalchemy as s -from sqlalchemy.ext.automap import automap_base -from sqlalchemy import MetaData, and_ import logging, json import numpy as np import scipy.stats @@ -15,7 +11,7 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.application.db.session import DatabaseSession -from augur.application.config import AugurConfig +from augur.application.db.lib import get_value from augur.application.db.models import Repo, ChaossMetricStatus, RepoInsight, RepoInsightsRecord from augur.application.db.util import execute_session_query from augur.tasks.init.celery_app import AugurMlRepoCollectionTask @@ -23,11 +19,11 @@ warnings.filterwarnings('ignore') -@celery.task(base=AugurMlRepoCollectionTask) -def insight_task(repo_git): +@celery.task(base=AugurMlRepoCollectionTask, bind=True) +def insight_task(self, repo_git): logger = logging.getLogger(insight_task.__name__) - from augur.tasks.init.celery_app import engine + engine = self.app.engine with DatabaseSession(logger, engine) as session: insight_model(repo_git, logger, engine, session) @@ -44,17 +40,15 @@ def insight_model(repo_git: str,logger,engine,session) -> None: metrics = {"issues-new": "issues", "code-changes": "commit_count", "code-changes-lines": "added", "reviews": "pull_requests", "contributors-new": "new_contributors"} - config = AugurConfig(logger, session) - query = session.query(Repo).filter(Repo.repo_git == repo_git) repo_id = execute_session_query(query, 'one').repo_id - anomaly_days = config.get_value('Insight_Task', 'anomaly_days') - training_days = config.get_value('Insight_Task', 'training_days') - contamination = config.get_value('Insight_Task', 'contamination') - confidence = config.get_value('Insight_Task', 'confidence_interval') / 100 - api_host = config.get_value('Server', 'host') - api_port = config.get_value('Server', 'port') + anomaly_days = get_value('Insight_Task', 'anomaly_days') + training_days = get_value('Insight_Task', 'training_days') + contamination = get_value('Insight_Task', 'contamination') + confidence = get_value('Insight_Task', 'confidence_interval') / 100 + api_host = get_value('Server', 'host') + api_port = get_value('Server', 'port') logger.info("Discovering insights for repo {}\n".format(repo_git)) @@ -114,7 +108,9 @@ def insight_model(repo_git: str,logger,engine,session) -> None: repo_id = :repo_id AND ri_date < :min_date """) - result = engine.execute(delete_record_SQL, repo_id=repo_id, min_date=min_date) + + with engine.connect() as conn: + result = conn.execute(delete_record_SQL, parameters=dict(repo_id=repo_id, min_date=min_date)) logger.info("Deleting out of date data points ...\n") delete_points_SQL = s.sql.text(""" @@ -135,8 +131,8 @@ def insight_model(repo_git: str,logger,engine,session) -> None: AND repo_insights.ri_field = to_delete.ri_field """) - with engine.connect as conn: - result = conn.execute(delete_points_SQL, repo_id=repo_id, min_date=min_date) + with engine.connect() as conn: + result = conn.execute(delete_points_SQL, parameters=dict(repo_id=repo_id, min_date=min_date)) # get table values to check for dupes later on @@ -260,7 +256,7 @@ def classify_anomalies(df, metric): repo_insight_record_obj.ri_id)) # Send insight to Jonah for slack bot - send_insight(record, abs(next_recent_anomaly.iloc[0][metric] - mean), logger,engine) + send_insight(record, abs(next_recent_anomaly.iloc[0][metric] - mean), logger,engine, anomaly_days, send_insights) insight_count += 1 else: @@ -522,7 +518,7 @@ def is_unique_key(key): else: logger.info("Key: {} has empty raw_values, should not have key here".format(key)) -def send_insight(insight, units_from_mean, logger, engine): +def send_insight(insight, units_from_mean, logger, engine, anomaly_days, send_insights): try: repoSQL = s.sql.text(""" SELECT repo_git, rg_name diff --git a/augur/tasks/data_analysis/message_insights/message_novelty.py b/augur/tasks/data_analysis/message_insights/message_novelty.py index 5a7ebb4ff0..7821cbf278 100644 --- a/augur/tasks/data_analysis/message_insights/message_novelty.py +++ b/augur/tasks/data_analysis/message_insights/message_novelty.py @@ -1,20 +1,8 @@ #SPDX-License-Identifier: MIT ## Added imports -import re -import unicodedata -import nltk -import string -from nltk.tokenize import word_tokenize -from nltk.stem.snowball import SnowballStemmer -from bs4 import BeautifulSoup -import matplotlib.pyplot as plt -from datetime import date ## -import logging -import multiprocessing import os -import traceback from datetime import datetime, timedelta import numpy as np @@ -22,7 +10,6 @@ from gensim.models.doc2vec import Doc2Vec, TaggedDocument from keras.layers import Dense, Input from keras.models import Model, load_model -from scipy.spatial.distance import cosine from skimage.filters import threshold_otsu from sklearn import utils as skl_utils diff --git a/augur/tasks/data_analysis/message_insights/preprocess_text.py b/augur/tasks/data_analysis/message_insights/preprocess_text.py index fd849376c1..f5f9b0e701 100644 --- a/augur/tasks/data_analysis/message_insights/preprocess_text.py +++ b/augur/tasks/data_analysis/message_insights/preprocess_text.py @@ -5,7 +5,6 @@ import unicodedata import nltk -import pandas as pd from bs4 import BeautifulSoup from nltk.stem.snowball import SnowballStemmer from nltk.tokenize import word_tokenize diff --git a/augur/tasks/data_analysis/message_insights/tasks.py b/augur/tasks/data_analysis/message_insights/tasks.py index 4727d3def7..6cc0446ab8 100644 --- a/augur/tasks/data_analysis/message_insights/tasks.py +++ b/augur/tasks/data_analysis/message_insights/tasks.py @@ -13,7 +13,7 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.application.db.session import DatabaseSession -from augur.application.config import AugurConfig +from augur.application.db.lib import get_value from augur.application.db.models import Repo, MessageAnalysis, MessageAnalysisSummary from augur.application.db.util import execute_session_query from augur.tasks.init.celery_app import AugurMlRepoCollectionTask @@ -22,11 +22,11 @@ ROOT_AUGUR_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))) -@celery.task(base=AugurMlRepoCollectionTask) -def message_insight_task(repo_git): +@celery.task(base=AugurMlRepoCollectionTask, bind=True) +def message_insight_task(self, repo_git): logger = logging.getLogger(message_insight_task.__name__) - from augur.tasks.init.celery_app import engine + engine = self.app.engine with DatabaseSession(logger, engine) as session: message_insight_model(repo_git, logger, engine, session) @@ -45,13 +45,11 @@ def message_insight_model(repo_git: str,logger,engine, session) -> None: now = datetime.datetime.utcnow() run_id = int(now.timestamp())+5 - config = AugurConfig(logger, session) - query = session.query(Repo).filter(Repo.repo_git == repo_git) repo_id = execute_session_query(query, 'one').repo_id - models_dir = os.path.join(ROOT_AUGUR_DIRECTORY, "tasks", "data_analysis", "message_insights", config.get_value("Message_Insights", 'models_dir')) - insight_days = config.get_value("Message_Insights", 'insight_days') + models_dir = os.path.join(ROOT_AUGUR_DIRECTORY, "tasks", "data_analysis", "message_insights", get_value("Message_Insights", 'models_dir')) + insight_days = get_value("Message_Insights", 'insight_days') # Any initial database instructions, like finding the last tuple inserted or generate the next ID value diff --git a/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py b/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py index 9d6d5be78e..af806bcdd1 100644 --- a/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py +++ b/augur/tasks/data_analysis/pull_request_analysis_worker/tasks.py @@ -10,7 +10,7 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.application.db.session import DatabaseSession -from augur.application.config import AugurConfig +from augur.application.db.lib import get_value from augur.application.db.models import Repo, PullRequestAnalysis from augur.application.db.util import execute_session_query from augur.tasks.init.celery_app import AugurMlRepoCollectionTask @@ -22,15 +22,13 @@ ROOT_AUGUR_DIRECTORY = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))) -@celery.task(base=AugurMlRepoCollectionTask) -def pull_request_analysis_task(repo_git): +@celery.task(base=AugurMlRepoCollectionTask, bind=True) +def pull_request_analysis_task(self, repo_git): logger = logging.getLogger(pull_request_analysis_task.__name__) - from augur.tasks.init.celery_app import engine - - with DatabaseSession(logger, engine) as session: - pull_request_analysis_model(repo_git, logger, engine) + engine = self.app.engine + pull_request_analysis_model(repo_git, logger, engine) def pull_request_analysis_model(repo_git: str,logger,engine) -> None: @@ -44,12 +42,10 @@ def pull_request_analysis_model(repo_git: str,logger,engine) -> None: with DatabaseSession(logger, engine) as session: - config = AugurConfig(logger, session) - query = session.query(Repo).filter(Repo.repo_git == repo_git) repo_id = execute_session_query(query, 'one').repo_id - senti_models_dir = os.path.join(ROOT_AUGUR_DIRECTORY, "tasks", "data_analysis", "message_insights", config.get_value("Message_Insights", 'models_dir')) + senti_models_dir = os.path.join(ROOT_AUGUR_DIRECTORY, "tasks", "data_analysis", "message_insights", get_value("Message_Insights", 'models_dir')) logger.info(f'Sentiment model dir located - {senti_models_dir}') diff --git a/augur/tasks/db/refresh_materialized_views.py b/augur/tasks/db/refresh_materialized_views.py index f04d01552b..c191b56039 100644 --- a/augur/tasks/db/refresh_materialized_views.py +++ b/augur/tasks/db/refresh_materialized_views.py @@ -1,20 +1,16 @@ from __future__ import annotations import logging import sqlalchemy as s -from celery import signature -from celery import group, chain, chord, signature from augur.tasks.init.celery_app import celery_app as celery from augur.application.db.session import DatabaseSession -from augur.application.logs import AugurLogger - -@celery.task -def refresh_materialized_views(): +@celery.task(bind=True) +def refresh_materialized_views(self): #self.logger = AugurLogger("data_collection_jobs").get_logger() - from augur.tasks.init.celery_app import engine + engine = self.app.engine logger = logging.getLogger(refresh_materialized_views.__name__) #self.logger = logging.getLogger(refresh_materialized_views.__name__) diff --git a/augur/tasks/git/dependency_libyear_tasks/core.py b/augur/tasks/git/dependency_libyear_tasks/core.py index f4ebdd4b3b..9e48757d61 100644 --- a/augur/tasks/git/dependency_libyear_tasks/core.py +++ b/augur/tasks/git/dependency_libyear_tasks/core.py @@ -1,15 +1,7 @@ from datetime import datetime -import logging -import requests -import re -import os, subprocess -import traceback -import sqlalchemy as s from augur.application.db.models import * -from augur.application.db.session import DatabaseSession -from augur.application.config import AugurConfig +from augur.application.db.lib import get_value from augur.application.db.util import execute_session_query -from urllib.parse import quote from augur.tasks.git.dependency_libyear_tasks.libyear_util.util import get_deps_libyear_data from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path @@ -25,10 +17,8 @@ def deps_libyear_model( session, repo_id,repo_git,repo_group_id): Repo.repo_git == repo_git) result = execute_session_query(query, 'one') - - config = AugurConfig(session.logger, session) - absolute_repo_path = get_absolute_repo_path(config.get_section("Facade")['repo_directory'],repo_id,result.repo_path,result.repo_name) + absolute_repo_path = get_absolute_repo_path(get_value("Facade", "repo_directory"),repo_id,result.repo_path,result.repo_name) #config.get_section("Facade")['repo_directory'] + relative_repo_path#self.config['repo_directory'] + relative_repo_path generate_deps_libyear_data(session,repo_id, absolute_repo_path) diff --git a/augur/tasks/git/dependency_libyear_tasks/libyear_util/npm_libyear_utils.py b/augur/tasks/git/dependency_libyear_tasks/libyear_util/npm_libyear_utils.py index 9fbf2a88f6..bcfe810a9c 100644 --- a/augur/tasks/git/dependency_libyear_tasks/libyear_util/npm_libyear_utils.py +++ b/augur/tasks/git/dependency_libyear_tasks/libyear_util/npm_libyear_utils.py @@ -1,4 +1,3 @@ -import os, re import requests def get_NPM_data(package): diff --git a/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_libyear_util.py b/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_libyear_util.py index e17fdafcc9..46304490fa 100644 --- a/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_libyear_util.py +++ b/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_libyear_util.py @@ -1,4 +1,3 @@ -from distutils import version import requests import dateutil.parser # from packaging import version diff --git a/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_parser.py b/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_parser.py index 7ad45e1471..7aaaf1f190 100644 --- a/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_parser.py +++ b/augur/tasks/git/dependency_libyear_tasks/libyear_util/pypi_parser.py @@ -1,10 +1,6 @@ import re, os import json -from typing import Dict import toml -import dateutil.parser -from augur.tasks.git.dependency_libyear_tasks.libyear_util.pypi_libyear_util import sort_dependency_requirement,get_pypi_data,get_latest_version,get_release_date -from augur.tasks.git.dependency_libyear_tasks.libyear_util.pypi_libyear_util import get_libyear import logging import yaml diff --git a/augur/tasks/git/dependency_libyear_tasks/libyear_util/util.py b/augur/tasks/git/dependency_libyear_tasks/libyear_util/util.py index 2d60976983..111d3fc631 100644 --- a/augur/tasks/git/dependency_libyear_tasks/libyear_util/util.py +++ b/augur/tasks/git/dependency_libyear_tasks/libyear_util/util.py @@ -1,6 +1,4 @@ -from distutils.version import LooseVersion import dateutil.parser -from distutils import version import os from augur.tasks.git.dependency_libyear_tasks.libyear_util.pypi_parser import parse_conda, parse_pipfile,parse_pipfile_lock,parse_poetry,parse_poetry_lock,parse_requirement_txt,parse_setup_py from augur.tasks.git.dependency_libyear_tasks.libyear_util.npm_parser import parse_package_json diff --git a/augur/tasks/git/dependency_libyear_tasks/tasks.py b/augur/tasks/git/dependency_libyear_tasks/tasks.py index cfcfc7883f..ff15c61d91 100644 --- a/augur/tasks/git/dependency_libyear_tasks/tasks.py +++ b/augur/tasks/git/dependency_libyear_tasks/tasks.py @@ -1,16 +1,15 @@ import logging -import traceback from augur.application.db.session import DatabaseSession from augur.tasks.git.dependency_libyear_tasks.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask from augur.application.db.util import execute_session_query -@celery.task(base=AugurFacadeRepoCollectionTask) -def process_libyear_dependency_metrics(repo_git): +@celery.task(base=AugurFacadeRepoCollectionTask, bind=True) +def process_libyear_dependency_metrics(self, repo_git): #raise NotImplementedError - from augur.tasks.init.celery_app import engine + engine = self.app.engine logger = logging.getLogger(process_libyear_dependency_metrics.__name__) diff --git a/augur/tasks/git/dependency_tasks/core.py b/augur/tasks/git/dependency_tasks/core.py index 9176a93283..e4c6273479 100644 --- a/augur/tasks/git/dependency_tasks/core.py +++ b/augur/tasks/git/dependency_tasks/core.py @@ -1,16 +1,7 @@ from datetime import datetime -import logging -import requests -import json import os -import subprocess -import re -import traceback from augur.application.db.models import * -from augur.application.db.session import DatabaseSession -from augur.application.config import AugurConfig from augur.tasks.github.util.github_api_key_handler import GithubApiKeyHandler -from augur.application.db.util import execute_session_query from augur.tasks.git.dependency_tasks.dependency_util import dependency_calculator as dep_calc from augur.tasks.util.worker_util import parse_json_from_subprocess_call @@ -78,10 +69,14 @@ def generate_scorecard(session,repo_id,path): path_to_scorecard = os.environ['HOME'] + '/scorecard' #setting the environmental variable which is required by scorecard - key_handler = GithubApiKeyHandler(session) + key_handler = GithubApiKeyHandler(session, session.logger) os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key() - required_output = parse_json_from_subprocess_call(session.logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard) + try: + required_output = parse_json_from_subprocess_call(session.logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard) + except Exception as e: + session.logger.error(f"Could not parse required output! Error: {e}") + raise e session.logger.info('adding to database...') session.logger.debug(f"output: {required_output}") diff --git a/augur/tasks/git/dependency_tasks/dependency_util/c_deps.py b/augur/tasks/git/dependency_tasks/dependency_util/c_deps.py index 51a375058b..dc1ad1099c 100644 --- a/augur/tasks/git/dependency_tasks/dependency_util/c_deps.py +++ b/augur/tasks/git/dependency_tasks/dependency_util/c_deps.py @@ -1,4 +1,3 @@ -import sys import re from pathlib import Path diff --git a/augur/tasks/git/dependency_tasks/dependency_util/cpp_deps.py b/augur/tasks/git/dependency_tasks/dependency_util/cpp_deps.py index 28a54fdf74..3b262304fc 100644 --- a/augur/tasks/git/dependency_tasks/dependency_util/cpp_deps.py +++ b/augur/tasks/git/dependency_tasks/dependency_util/cpp_deps.py @@ -1,4 +1,3 @@ -import sys import re from pathlib import Path diff --git a/augur/tasks/git/dependency_tasks/dependency_util/csharp_deps.py b/augur/tasks/git/dependency_tasks/dependency_util/csharp_deps.py index 342903e9d6..7f29a60bcc 100644 --- a/augur/tasks/git/dependency_tasks/dependency_util/csharp_deps.py +++ b/augur/tasks/git/dependency_tasks/dependency_util/csharp_deps.py @@ -1,4 +1,3 @@ -import sys import re from pathlib import Path diff --git a/augur/tasks/git/dependency_tasks/dependency_util/go_deps.py b/augur/tasks/git/dependency_tasks/dependency_util/go_deps.py index 84e3f63684..8c8718c6b3 100644 --- a/augur/tasks/git/dependency_tasks/dependency_util/go_deps.py +++ b/augur/tasks/git/dependency_tasks/dependency_util/go_deps.py @@ -1,4 +1,4 @@ -import sys, re +import re from pathlib import Path def get_files(path): diff --git a/augur/tasks/git/dependency_tasks/dependency_util/java_deps.py b/augur/tasks/git/dependency_tasks/dependency_util/java_deps.py index b55c357e3a..2e40a5f40b 100644 --- a/augur/tasks/git/dependency_tasks/dependency_util/java_deps.py +++ b/augur/tasks/git/dependency_tasks/dependency_util/java_deps.py @@ -1,4 +1,3 @@ -import sys import re from pathlib import Path diff --git a/augur/tasks/git/dependency_tasks/dependency_util/javascript_deps.py b/augur/tasks/git/dependency_tasks/dependency_util/javascript_deps.py index fda86bd4ec..cf323fabfd 100644 --- a/augur/tasks/git/dependency_tasks/dependency_util/javascript_deps.py +++ b/augur/tasks/git/dependency_tasks/dependency_util/javascript_deps.py @@ -1,4 +1,3 @@ -import sys import re from pathlib import Path diff --git a/augur/tasks/git/dependency_tasks/dependency_util/php_deps.py b/augur/tasks/git/dependency_tasks/dependency_util/php_deps.py index 690c06fbcf..740e188cb5 100644 --- a/augur/tasks/git/dependency_tasks/dependency_util/php_deps.py +++ b/augur/tasks/git/dependency_tasks/dependency_util/php_deps.py @@ -1,4 +1,3 @@ -import sys import re from pathlib import Path diff --git a/augur/tasks/git/dependency_tasks/dependency_util/python_deps.py b/augur/tasks/git/dependency_tasks/dependency_util/python_deps.py index 0d0709fd09..ba3105eeec 100644 --- a/augur/tasks/git/dependency_tasks/dependency_util/python_deps.py +++ b/augur/tasks/git/dependency_tasks/dependency_util/python_deps.py @@ -1,7 +1,5 @@ -import sys import re from pathlib import Path -import codecs import ast diff --git a/augur/tasks/git/dependency_tasks/dependency_util/ruby_deps.py b/augur/tasks/git/dependency_tasks/dependency_util/ruby_deps.py index 802a8951f1..d7275bb41a 100644 --- a/augur/tasks/git/dependency_tasks/dependency_util/ruby_deps.py +++ b/augur/tasks/git/dependency_tasks/dependency_util/ruby_deps.py @@ -1,4 +1,3 @@ -import sys import re from pathlib import Path diff --git a/augur/tasks/git/dependency_tasks/dependency_util/rust_deps.py b/augur/tasks/git/dependency_tasks/dependency_util/rust_deps.py index 92380d8098..c2fab3963f 100644 --- a/augur/tasks/git/dependency_tasks/dependency_util/rust_deps.py +++ b/augur/tasks/git/dependency_tasks/dependency_util/rust_deps.py @@ -1,4 +1,3 @@ -import sys import re from pathlib import Path diff --git a/augur/tasks/git/dependency_tasks/dependency_util/vb_deps.py b/augur/tasks/git/dependency_tasks/dependency_util/vb_deps.py index e3ed8e7845..685e9a68d0 100644 --- a/augur/tasks/git/dependency_tasks/dependency_util/vb_deps.py +++ b/augur/tasks/git/dependency_tasks/dependency_util/vb_deps.py @@ -1,4 +1,3 @@ -import sys import re from pathlib import Path diff --git a/augur/tasks/git/dependency_tasks/tasks.py b/augur/tasks/git/dependency_tasks/tasks.py index 5e7c1d846f..152c053080 100644 --- a/augur/tasks/git/dependency_tasks/tasks.py +++ b/augur/tasks/git/dependency_tasks/tasks.py @@ -3,17 +3,17 @@ from augur.application.db.session import DatabaseSession from augur.tasks.git.dependency_tasks.core import * from augur.tasks.init.celery_app import celery_app as celery -from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask, AugurCoreRepoCollectionTask, AugurSecondaryRepoCollectionTask +from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask, AugurSecondaryRepoCollectionTask from augur.application.db.util import execute_session_query from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path -from augur.application.config import AugurConfig +from augur.application.db.lib import get_value -@celery.task(base=AugurFacadeRepoCollectionTask) -def process_dependency_metrics(repo_git): +@celery.task(base=AugurFacadeRepoCollectionTask, bind=True) +def process_dependency_metrics(self, repo_git): #raise NotImplementedError - from augur.tasks.init.celery_app import engine + engine = self.app.engine logger = logging.getLogger(process_dependency_metrics.__name__) @@ -23,19 +23,18 @@ def process_dependency_metrics(repo_git): repo = execute_session_query(query,'one') - - config = AugurConfig(session.logger, session) - absolute_repo_path = get_absolute_repo_path(config.get_section("Facade")['repo_directory'],repo.repo_id,repo.repo_path,repo.repo_name) + absolute_repo_path = get_absolute_repo_path(get_value("Facade", "repo_directory"),repo.repo_id,repo.repo_path,repo.repo_name) session.logger.debug(f"This is the deps model repo: {repo_git}.") generate_deps_data(session,repo.repo_id,absolute_repo_path) -@celery.task(base=AugurSecondaryRepoCollectionTask) -def process_ossf_dependency_metrics(repo_git): - from augur.tasks.init.celery_app import engine +@celery.task(base=AugurSecondaryRepoCollectionTask, bind=True) +def process_ossf_dependency_metrics(self, repo_git): + + engine = self.app.engine logger = logging.getLogger(process_ossf_dependency_metrics.__name__) diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index ee3dc047ff..9c699f7e79 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -1,59 +1,37 @@ #SPDX-License-Identifier: MIT -import sys -import time -import traceback import logging -import platform -import imp -import time -import datetime -import html.parser -import subprocess -import os -import getopt -import xlsxwriter -import configparser -import multiprocessing -import numpy as np -from celery import group, chain, chord, signature -from celery.utils.log import get_task_logger -from celery.result import allow_join_result -from celery.signals import after_setup_logger -from datetime import timedelta +from celery import group, chain import sqlalchemy as s -from sqlalchemy import or_, and_, update, insert -from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import update_repo_log, trim_commits, store_working_author, trim_author +from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import trim_commits from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path, get_parent_commits_set, get_existing_commits_set from augur.tasks.git.util.facade_worker.facade_worker.analyzecommit import analyze_commit -from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_facade_weight_time_factor, get_repo_commit_count, update_facade_scheduling_fields, get_facade_weight_with_commit_count, facade_bulk_insert_commits +from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_repo_commit_count, update_facade_scheduling_fields, get_facade_weight_with_commit_count, facade_bulk_insert_commits +from augur.tasks.git.util.facade_worker.facade_worker.rebuildcache import fill_empty_affiliations, invalidate_caches, nuke_affiliations, rebuild_unknown_affiliation_and_web_caches +from augur.tasks.git.util.facade_worker.facade_worker.postanalysiscleanup import git_repo_cleanup + from augur.tasks.github.facade_github.tasks import * from augur.tasks.util.collection_state import CollectionState from augur.tasks.util.collection_util import get_collection_status_repo_git_from_filter -from augur.tasks.git.util.facade_worker.facade_worker.repofetch import GitCloneError, git_repo_initialize +from augur.tasks.git.util.facade_worker.facade_worker.repofetch import GitCloneError, git_repo_initialize, git_repo_updates -from augur.tasks.util.worker_util import create_grouped_task_load from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask -from augur.tasks.util.AugurUUID import GithubUUID, UnresolvableUUID -from augur.application.db.models import PullRequest, Message, PullRequestReview, PullRequestLabel, PullRequestReviewer, PullRequestEvent, PullRequestMeta, PullRequestAssignee, PullRequestReviewMessageRef, Issue, IssueEvent, IssueLabel, IssueAssignee, PullRequestMessageRef, IssueMessageRef, Contributor, Repo, CollectionStatus +from augur.application.db.models import Repo, CollectionStatus from augur.tasks.git.dependency_tasks.tasks import process_dependency_metrics from augur.tasks.git.dependency_libyear_tasks.tasks import process_libyear_dependency_metrics from augur.tasks.git.scc_value_tasks.tasks import process_scc_value_metrics -from augur.tasks.github.util.github_paginator import GithubPaginator, hit_api -from augur.tasks.github.util.gh_graphql_entities import PullRequest from augur.tasks.github.util.github_task_session import * -from augur.application.logs import TaskLogConfig #define an error callback for chains in facade collection so facade doesn't make the program crash #if it does. @@ -252,7 +230,7 @@ def analyze_commits_in_parallel(repo_git, multithreaded: bool)-> None: session.log_activity('Debug',f"Commits missing from repo {repo_id}: {len(missing_commits)}") - if not len(missing_commits): + if not len(missing_commits) or repo_id is None: #session.log_activity('Info','Type of missing_commits: %s' % type(missing_commits)) return @@ -388,22 +366,23 @@ def clone_repos(): -#@celery.task -#def check_for_repo_updates_facade_task(repo_git): +#@celery.task(bind=True) +#def check_for_repo_updates_facade_task(self, repo_git): # -# from augur.tasks.init.celery_app import engine +# engine = self.app.engine # # logger = logging.getLogger(check_for_repo_updates_facade_task.__name__) # # with FacadeSession(logger) as session: # check_for_repo_updates(session, repo_git) -@celery.task(base=AugurFacadeRepoCollectionTask) -def git_update_commit_count_weight(repo_git): +@celery.task(base=AugurFacadeRepoCollectionTask, bind=True) +def git_update_commit_count_weight(self, repo_git): - from augur.tasks.init.celery_app import engine + engine = self.app.engine logger = logging.getLogger(git_update_commit_count_weight.__name__) + # Change facade session to take in engine with FacadeSession(logger) as session: commit_count = get_repo_commit_count(session, repo_git) facade_weight = get_facade_weight_with_commit_count(session, repo_git, commit_count) @@ -460,32 +439,6 @@ def generate_analysis_sequence(logger,repo_git, session): return analysis_sequence - -def generate_contributor_sequence(logger,repo_git, session): - - contributor_sequence = [] - #all_repo_ids = [] - repo_id = None - - #contributor_sequence.append(facade_start_contrib_analysis_task.si()) - query = s.sql.text("""SELECT repo_id FROM repo - WHERE repo_git=:value""").bindparams(value=repo_git) - - repo = session.execute_sql(query).fetchone() - session.logger.info(f"repo: {repo}") - repo_id = repo[0] - #pdb.set_trace() - #breakpoint() - #for repo in all_repos: - # contributor_sequence.append(insert_facade_contributors.si(repo['repo_id'])) - #all_repo_ids = [repo['repo_id'] for repo in all_repos] - - #contrib_group = create_grouped_task_load(dataList=all_repo_ids,task=insert_facade_contributors)#group(contributor_sequence) - #contrib_group.link_error(facade_error_handler.s()) - #return contrib_group#chain(facade_start_contrib_analysis_task.si(), contrib_group) - return insert_facade_contributors.si(repo_id) - - def facade_phase(repo_git): logger = logging.getLogger(facade_phase.__name__) logger.info("Generating facade sequence") @@ -527,7 +480,7 @@ def facade_phase(repo_git): #Generate contributor analysis task group. if not limited_run or (limited_run and run_facade_contributors): - facade_core_collection.append(generate_contributor_sequence(logger,repo_git,session)) + facade_core_collection.append(insert_facade_contributors.si(repo_git)) #These tasks need repos to be cloned by facade before they can work. diff --git a/augur/tasks/git/scc_value_tasks/core.py b/augur/tasks/git/scc_value_tasks/core.py index 5fd7afb7b8..71993ebcd1 100644 --- a/augur/tasks/git/scc_value_tasks/core.py +++ b/augur/tasks/git/scc_value_tasks/core.py @@ -1,16 +1,6 @@ from datetime import datetime -import logging -import requests -import json import os -import subprocess -import re -import traceback from augur.application.db.models import * -from augur.application.db.session import DatabaseSession -from augur.application.config import AugurConfig -from augur.tasks.github.util.github_api_key_handler import GithubApiKeyHandler -from augur.application.db.util import execute_session_query from augur.tasks.util.worker_util import parse_json_from_subprocess_call def value_model(session,repo_git,repo_id, path): diff --git a/augur/tasks/git/scc_value_tasks/tasks.py b/augur/tasks/git/scc_value_tasks/tasks.py index a2e4d11fc8..37ff4ac4b1 100644 --- a/augur/tasks/git/scc_value_tasks/tasks.py +++ b/augur/tasks/git/scc_value_tasks/tasks.py @@ -1,18 +1,17 @@ import logging -import traceback from augur.application.db.session import DatabaseSession from augur.tasks.git.scc_value_tasks.core import * from augur.tasks.init.celery_app import celery_app as celery -from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask, AugurCoreRepoCollectionTask +from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask from augur.application.db.util import execute_session_query -from augur.application.config import AugurConfig +from augur.application.db.lib import get_value from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path -@celery.task(base=AugurFacadeRepoCollectionTask) -def process_scc_value_metrics(repo_git): +@celery.task(base=AugurFacadeRepoCollectionTask, bind=True) +def process_scc_value_metrics(self, repo_git): - from augur.tasks.init.celery_app import engine + engine = self.app.engine logger = logging.getLogger(process_scc_value_metrics.__name__) @@ -22,7 +21,6 @@ def process_scc_value_metrics(repo_git): query = session.query(Repo).filter(Repo.repo_git == repo_git) repo = execute_session_query(query, 'one') - config = AugurConfig(session.logger, session) - absolute_repo_path = get_absolute_repo_path(config.get_section("Facade")['repo_directory'],repo.repo_id,repo.repo_path,repo.repo_name) + absolute_repo_path = get_absolute_repo_path(get_value("Facade", "repo_directory"),repo.repo_id,repo.repo_path,repo.repo_name) value_model(session,repo_git,repo.repo_id, absolute_repo_path) \ No newline at end of file diff --git a/augur/tasks/git/util/facade_worker/facade_worker/analyzecommit.py b/augur/tasks/git/util/facade_worker/facade_worker/analyzecommit.py index 285ec6c780..a0ca29701a 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/analyzecommit.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/analyzecommit.py @@ -25,20 +25,9 @@ # and checks for any parents of HEAD that aren't already accounted for in the # repos. It also rebuilds analysis data, checks any changed affiliations and # aliases, and caches data for display. -import sys -import platform -import imp -import time -import datetime -import html.parser import subprocess import os -import getopt -import xlsxwriter -import configparser -import traceback import sqlalchemy as s -from sqlalchemy.exc import IntegrityError, DataError def analyze_commit(session, repo_id, repo_loc, commit): diff --git a/augur/tasks/git/util/facade_worker/facade_worker/config.py b/augur/tasks/git/util/facade_worker/facade_worker/config.py index ed5424b1f3..19539d79de 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/config.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/config.py @@ -24,17 +24,8 @@ # repos. It also rebuilds analysis data, checks any changed affiliations and # aliases, and caches data for display. import sys -import platform -import imp import time -import datetime -import html.parser -import subprocess import os -import getopt -import xlsxwriter -import configparser -import psycopg2 import json import logging import random @@ -44,7 +35,6 @@ from psycopg2.errors import DeadlockDetected from augur.tasks.github.util.github_task_session import * -from augur.application.logs import AugurLogger from augur.application.config import AugurConfig from logging import Logger @@ -112,7 +102,8 @@ class FacadeSession(GithubTaskSession): """ def __init__(self,logger: Logger): - from augur.tasks.init.celery_app import engine + from augur.application.db import get_engine + engine = get_engine() #self.cfg = FacadeConfig(logger) self.repos_processed = 0 super().__init__(logger=logger, engine=engine) @@ -293,7 +284,7 @@ def __init__(self, logger: Logger): #worker_options = read_config("Workers", "facade_worker", None, None) - with DatabaseSession(logger) as session: + with DatabaseSession(logger, engine) as session: config = AugurConfig(logger, session) worker_options = config.get_section("Facade") diff --git a/augur/tasks/git/util/facade_worker/facade_worker/excel_generators/example.py b/augur/tasks/git/util/facade_worker/facade_worker/excel_generators/example.py index f837fcffc3..83192da5cd 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/excel_generators/example.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/excel_generators/example.py @@ -23,7 +23,6 @@ # places to be modified when creating a derivative script are marked with #--> import sys -import MySQLdb import imp import time import datetime diff --git a/augur/tasks/git/util/facade_worker/facade_worker/facade00mainprogram.py b/augur/tasks/git/util/facade_worker/facade_worker/facade00mainprogram.py index 909c418094..b41c6f14da 100755 --- a/augur/tasks/git/util/facade_worker/facade_worker/facade00mainprogram.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/facade00mainprogram.py @@ -26,24 +26,14 @@ # repos. It also rebuilds analysis data, checks any changed affiliations and # aliases, and caches data for display. from __future__ import annotations -import traceback -import sys, platform, imp, time, datetime, html.parser, subprocess, os, getopt, xlsxwriter, configparser, logging -from multiprocessing import Process, Queue +import html.parser from .config import FacadeSession as FacadeSession -from .utilitymethods import trim_commits, store_working_author, trim_author -from .analyzecommit import analyze_commit -from .postanalysiscleanup import git_repo_cleanup -from .repofetch import git_repo_initialize, check_for_repo_updates, force_repo_updates, force_repo_analysis, git_repo_updates #.facade06analyze analysis moved to facade_tasks.py - IM 10/12/22 -from .rebuildcache import nuke_affiliations, fill_empty_affiliations, invalidate_caches, rebuild_unknown_affiliation_and_web_caches #from contributor_interfaceable.facade08contributorinterfaceable import ContributorInterfaceable from augur.tasks.github.facade_github.contributor_interfaceable.contributor_interface import * -from augur.tasks.github.util.github_task_session import GithubTaskSession -from logging import Logger -from sqlalchemy.sql.elements import TextClause diff --git a/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py b/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py index b9323921ce..3ec2013274 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/postanalysiscleanup.py @@ -25,17 +25,7 @@ # and checks for any parents of HEAD that aren't already accounted for in the # repos. It also rebuilds analysis data, checks any changed affiliations and # aliases, and caches data for display. -import sys -import platform -import imp -import time -import datetime -import html.parser import subprocess -import os -import getopt -import xlsxwriter -import configparser import sqlalchemy as s from augur.application.db.util import execute_session_query from .utilitymethods import get_absolute_repo_path @@ -60,7 +50,7 @@ def git_repo_cleanup(session,repo_git): # Remove the files on disk - absolute_path = get_absolute_repo_path(session.repo_base_directory, row.repo_id, row.repo_path,repo.repo_name) + absolute_path = get_absolute_repo_path(session.repo_base_directory, row.repo_id, row.repo_path,row.repo_name) cmd = ("rm -rf %s" % (absolute_path)) diff --git a/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py b/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py index 03206b0242..5668739767 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/rebuildcache.py @@ -25,19 +25,8 @@ # and checks for any parents of HEAD that aren't already accounted for in the # repos. It also rebuilds analysis data, checks any changed affiliations and # aliases, and caches data for display. -import sys -import platform -import imp -import time -import datetime -import html.parser -import subprocess -import os -import getopt -import xlsxwriter -import configparser import sqlalchemy as s -from .utilitymethods import update_repo_log, trim_commits, store_working_author, trim_author +from .utilitymethods import store_working_author, trim_author # if platform.python_implementation() == 'PyPy': # import pymysql # else: diff --git a/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py b/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py index 35110239bf..64571bdd9b 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py @@ -25,20 +25,12 @@ # and checks for any parents of HEAD that aren't already accounted for in the # repos. It also rebuilds analysis data, checks any changed affiliations and # aliases, and caches data for display. -import sys -import platform -import imp -import time -import datetime import html.parser import subprocess import os -import getopt -import xlsxwriter -import configparser import pathlib import sqlalchemy as s -from .utilitymethods import update_repo_log, trim_commits, store_working_author, trim_author, get_absolute_repo_path +from .utilitymethods import update_repo_log, get_absolute_repo_path from augur.application.db.models.augur_data import * from augur.application.db.models.augur_operations import CollectionStatus from augur.application.db.util import execute_session_query, convert_orm_list_to_dict_list diff --git a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py index aef4e59989..848cb38917 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py @@ -25,21 +25,11 @@ # and checks for any parents of HEAD that aren't already accounted for in the # repos. It also rebuilds analysis data, checks any changed affiliations and # aliases, and caches data for display. -import sys -import platform -import imp -import time -import datetime -import html.parser import subprocess from subprocess import check_output import os -import getopt -import xlsxwriter -import configparser import sqlalchemy as s -from sqlalchemy.exc import IntegrityError, DataError -from .config import get_database_args_from_env +from sqlalchemy.exc import DataError from augur.application.db.models import * from .config import FacadeSession as FacadeSession from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps diff --git a/augur/tasks/github/contributors/tasks.py b/augur/tasks/github/contributors/tasks.py index 02970c35d6..882725d205 100644 --- a/augur/tasks/github/contributors/tasks.py +++ b/augur/tasks/github/contributors/tasks.py @@ -1,14 +1,12 @@ import time import logging - from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask -from augur.tasks.github.util.github_paginator import GithubPaginator, hit_api +from augur.tasks.github.util.github_paginator import hit_api from augur.tasks.github.util.github_task_session import GithubTaskManifest -from augur.tasks.util.worker_util import wait_child_tasks from augur.tasks.github.facade_github.tasks import * -from augur.application.db.models import PullRequest, Message, PullRequestReview, PullRequestLabel, PullRequestReviewer, PullRequestEvent, PullRequestMeta, PullRequestAssignee, PullRequestReviewMessageRef, Issue, IssueEvent, IssueLabel, IssueAssignee, PullRequestMessageRef, IssueMessageRef, Contributor, Repo +from augur.application.db.models import Contributor, Repo from augur.application.db.util import execute_session_query @@ -105,10 +103,10 @@ def retrieve_dict_data(url: str, key_auth, logger): return None -@celery.task(base=AugurCoreRepoCollectionTask) -def grab_comitters(repo_git,platform="github"): +@celery.task(base=AugurCoreRepoCollectionTask, bind=True) +def grab_comitters(self, repo_git,platform="github"): - from augur.tasks.init.celery_app import engine + engine = self.app.engine logger = logging.getLogger(grab_comitters.__name__) with DatabaseSession(logger,engine) as session: diff --git a/augur/tasks/github/detect_move/core.py b/augur/tasks/github/detect_move/core.py index cf7d2d1e5a..2bf96ffa1f 100644 --- a/augur/tasks/github/detect_move/core.py +++ b/augur/tasks/github/detect_move/core.py @@ -1,10 +1,8 @@ from augur.tasks.github.util.github_task_session import * from augur.application.db.models import * -from augur.tasks.github.util.github_paginator import GithubPaginator from augur.tasks.github.util.github_paginator import hit_api from augur.tasks.github.util.util import get_owner_repo from augur.tasks.github.util.util import parse_json_response -import logging from datetime import datetime from augur.tasks.util.collection_state import CollectionState from augur.application.db.util import execute_session_query diff --git a/augur/tasks/github/detect_move/tasks.py b/augur/tasks/github/detect_move/tasks.py index c1ddeab1e2..c9da0d3ca2 100644 --- a/augur/tasks/github/detect_move/tasks.py +++ b/augur/tasks/github/detect_move/tasks.py @@ -1,9 +1,10 @@ +import logging + from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.github.detect_move.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask, AugurSecondaryRepoCollectionTask from augur.application.db.util import execute_session_query -import traceback diff --git a/augur/tasks/github/events/tasks.py b/augur/tasks/github/events/tasks.py index 640079d852..442af9922f 100644 --- a/augur/tasks/github/events/tasks.py +++ b/augur/tasks/github/events/tasks.py @@ -1,4 +1,3 @@ -import time import logging import traceback import sqlalchemy as s @@ -6,12 +5,11 @@ from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.application.db.data_parse import * -from augur.tasks.github.util.github_paginator import GithubPaginator, hit_api +from augur.tasks.github.util.github_paginator import GithubPaginator from augur.tasks.github.util.github_task_session import GithubTaskManifest -from augur.application.db.session import DatabaseSession from augur.tasks.github.util.util import get_owner_repo from augur.tasks.util.worker_util import remove_duplicate_dicts -from augur.application.db.models import PullRequest, Message, PullRequestReview, PullRequestLabel, PullRequestReviewer, PullRequestEvent, PullRequestMeta, PullRequestAssignee, PullRequestReviewMessageRef, Issue, IssueEvent, IssueLabel, IssueAssignee, PullRequestMessageRef, IssueMessageRef, Contributor, Repo +from augur.application.db.models import PullRequest, PullRequestEvent, Issue, IssueEvent, Contributor, Repo from augur.application.db.util import execute_session_query platform_id = 1 @@ -214,14 +212,24 @@ def update_issue_closed_cntrbs_from_events(engine, repo_id): with engine.connect() as conn: result = conn.execute(get_ranked_issues).fetchall() - update_data = [{'issue_id': row[0], 'cntrb_id': row[1], 'repo_id': repo_id} for row in result] - with engine.connect() as connection: - update_stmt = s.text(""" - UPDATE issues - SET cntrb_id = :cntrb_id - WHERE issue_id = :issue_id - AND repo_id = :repo_id - """) - connection.execute(update_stmt, update_data) + update_data = [] + for row in result: + update_data.append( + { + 'issue_id': row[0], + 'cntrb_id': row[1], + 'repo_id': repo_id + } + ) + + if update_data: + with engine.connect() as connection: + update_stmt = s.text(""" + UPDATE issues + SET cntrb_id = :cntrb_id + WHERE issue_id = :issue_id + AND repo_id = :repo_id + """) + connection.execute(update_stmt, update_data) diff --git a/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py b/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py index 515ebe9ac2..44b6c706f8 100644 --- a/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py +++ b/augur/tasks/github/facade_github/contributor_interfaceable/contributor_interface.py @@ -1,19 +1,9 @@ -from requests.api import head from augur.tasks.github.util.github_task_session import * -import logging -from logging import FileHandler, Formatter, StreamHandler, log -from psycopg2.errors import UniqueViolation -from random import randint import json -import multiprocessing import time -import numpy as np import sqlalchemy as s -import math -import traceback from augur.application.db.models import * -from augur.tasks.util.AugurUUID import AugurUUID, GithubUUID, UnresolvableUUID -from augur.tasks.github.util.github_paginator import GithubPaginator, hit_api, process_dict_response, retrieve_dict_from_endpoint +from augur.tasks.github.util.github_paginator import hit_api, process_dict_response, retrieve_dict_from_endpoint # Debugger import traceback from augur.tasks.github.util.github_paginator import GithubApiResult @@ -367,7 +357,12 @@ def get_login_with_supplemental_data(logger,db,auth, commit_data): return None # Grab first result and make sure it has the highest match score - match = login_json['items'][0] + try: + match = login_json['items'][0] + except IndexError as e: + logger.error(f"Ran into error {e} when parsing users with search url: {url}\n return dict: {login_json}") + return None + for item in login_json['items']: if item['score'] > match['score']: match = item diff --git a/augur/tasks/github/facade_github/core.py b/augur/tasks/github/facade_github/core.py index c74a09e6fb..10f4affc6a 100644 --- a/augur/tasks/github/facade_github/core.py +++ b/augur/tasks/github/facade_github/core.py @@ -1,14 +1,9 @@ from augur.tasks.github.facade_github.contributor_interfaceable.contributor_interface import * from augur.tasks.github.util.util import get_owner_repo -from numpy.lib.utils import source from augur.tasks.github.util.github_task_session import * from augur.tasks.github.util.github_paginator import * from augur.application.db.models import * -import sqlalchemy as s -import time -import math -import traceback -from augur.tasks.util.AugurUUID import AugurUUID, GithubUUID, UnresolvableUUID +from augur.tasks.util.AugurUUID import GithubUUID diff --git a/augur/tasks/github/facade_github/tasks.py b/augur/tasks/github/facade_github/tasks.py index 26d1027538..6bf9888c07 100644 --- a/augur/tasks/github/facade_github/tasks.py +++ b/augur/tasks/github/facade_github/tasks.py @@ -1,20 +1,14 @@ -import time import logging from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurFacadeRepoCollectionTask -from augur.tasks.github.util.github_paginator import GithubPaginator, hit_api, retrieve_dict_from_endpoint -from augur.tasks.github.util.github_task_session import GithubTaskSession, GithubTaskManifest -from augur.tasks.github.util.util import get_owner_repo -from augur.tasks.util.worker_util import remove_duplicate_dicts -from augur.application.db.models import PullRequest, Message, PullRequestReview, PullRequestLabel, PullRequestReviewer, PullRequestEvent, PullRequestMeta, PullRequestAssignee, PullRequestReviewMessageRef, Issue, IssueEvent, IssueLabel, IssueAssignee, PullRequestMessageRef, IssueMessageRef, Contributor, Repo +from augur.tasks.github.util.github_paginator import retrieve_dict_from_endpoint +from augur.tasks.github.util.github_task_session import GithubTaskManifest +from augur.application.db.models import Contributor from augur.tasks.github.facade_github.core import * -from augur.tasks.util.worker_util import create_grouped_task_load -from celery.result import allow_join_result from augur.application.db.util import execute_session_query from augur.tasks.git.util.facade_worker.facade_worker.facade00mainprogram import * -from sqlalchemy.orm.exc import NoResultFound def process_commit_metadata(logger,db,auth,contributorQueue,repo_id,platform_id): @@ -199,15 +193,23 @@ def link_commits_to_contributor(session,contributorQueue): # Update the contributors table from the data facade has gathered. -@celery.task(base=AugurFacadeRepoCollectionTask) -def insert_facade_contributors(repo_id): +@celery.task(base=AugurFacadeRepoCollectionTask, bind=True) +def insert_facade_contributors(self, repo_git): - from augur.tasks.init.celery_app import engine + engine = self.app.engine logger = logging.getLogger(insert_facade_contributors.__name__) + repo_id = None with GithubTaskManifest(logger) as manifest: + #contributor_sequence.append(facade_start_contrib_analysis_task.si()) + query = s.sql.text("""SELECT repo_id FROM repo + WHERE repo_git=:value""").bindparams(value=repo_git) + + repo = manifest.augur_db.execute_sql(query).fetchone() + logger.info(f"repo: {repo}") + repo_id = repo[0] # Get all of the commit data's emails and names from the commit table that do not appear # in the contributors table or the contributors_aliases table. diff --git a/augur/tasks/github/issues/tasks.py b/augur/tasks/github/issues/tasks.py index 0ba793470e..baccfdc60e 100644 --- a/augur/tasks/github/issues/tasks.py +++ b/augur/tasks/github/issues/tasks.py @@ -1,21 +1,17 @@ -import time import logging import traceback -import re from sqlalchemy.exc import IntegrityError -from augur.tasks.github.util.github_api_key_handler import GithubApiKeyHandler from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.application.db.data_parse import * -from augur.tasks.github.util.github_paginator import GithubPaginator, hit_api +from augur.tasks.github.util.github_paginator import GithubPaginator from augur.tasks.github.util.github_task_session import GithubTaskManifest -from augur.application.db.session import DatabaseSession from augur.tasks.github.util.util import add_key_value_pair_to_dicts, get_owner_repo from augur.tasks.util.worker_util import remove_duplicate_dicts -from augur.application.db.models import PullRequest, Message, PullRequestReview, PullRequestLabel, PullRequestReviewer, PullRequestEvent, PullRequestMeta, PullRequestAssignee, PullRequestReviewMessageRef, Issue, IssueEvent, IssueLabel, IssueAssignee, PullRequestMessageRef, IssueMessageRef, Contributor, Repo +from augur.application.db.models import Issue, IssueLabel, IssueAssignee, Contributor, Repo from augur.application.config import get_development_flag from augur.application.db.util import execute_session_query diff --git a/augur/tasks/github/messages/tasks.py b/augur/tasks/github/messages/tasks.py index 4dfd3a634b..f3a30a54f6 100644 --- a/augur/tasks/github/messages/tasks.py +++ b/augur/tasks/github/messages/tasks.py @@ -1,24 +1,19 @@ -import time import logging -import traceback from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.application.db.data_parse import * -from augur.tasks.github.util.github_paginator import GithubPaginator, hit_api +from augur.tasks.github.util.github_paginator import GithubPaginator from augur.tasks.github.util.github_task_session import GithubTaskManifest -from augur.application.db.session import DatabaseSession from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.models import PullRequest, Message, PullRequestReview, PullRequestLabel, PullRequestReviewer, PullRequestEvent, PullRequestMeta, PullRequestAssignee, PullRequestReviewMessageRef, Issue, IssueEvent, IssueLabel, IssueAssignee, PullRequestMessageRef, IssueMessageRef, Contributor, Repo -from augur.application.db.util import execute_session_query - - +from augur.application.db.models import PullRequest, Message, Issue, PullRequestMessageRef, IssueMessageRef, Contributor, Repo, CollectionStatus +from augur.application.db import get_engine, get_session +from sqlalchemy.sql import text platform_id = 1 - @celery.task(base=AugurCoreRepoCollectionTask) def collect_github_messages(repo_git: str) -> None: @@ -33,18 +28,30 @@ def collect_github_messages(repo_git: str) -> None: owner, repo = get_owner_repo(repo_git) task_name = f"{owner}/{repo}: Message Task" - message_data = retrieve_all_pr_and_issue_messages(repo_git, logger, manifest.key_auth, task_name) - - if message_data: + - process_messages(message_data, task_name, repo_id, logger, augur_db) + if is_repo_small(repo_id): + message_data = fast_retrieve_all_pr_and_issue_messages(repo_git, logger, manifest.key_auth, task_name) + + if message_data: + process_messages(message_data, task_name, repo_id, logger, augur_db) + + else: + logger.info(f"{owner}/{repo} has no messages") else: - logger.info(f"{owner}/{repo} has no messages") + process_large_issue_and_pr_message_collection(repo_id, repo_git, logger, manifest.key_auth, task_name, augur_db) + +def is_repo_small(repo_id): + with get_session() as session: -def retrieve_all_pr_and_issue_messages(repo_git: str, logger, key_auth, task_name) -> None: + result = session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo_id, CollectionStatus.issue_pr_sum <= 10).first() + + return result != None + +def fast_retrieve_all_pr_and_issue_messages(repo_git: str, logger, key_auth, task_name) -> None: owner, repo = get_owner_repo(repo_git) @@ -81,7 +88,50 @@ def retrieve_all_pr_and_issue_messages(repo_git: str, logger, key_auth, task_nam return all_data - + + +def process_large_issue_and_pr_message_collection(repo_id, repo_git: str, logger, key_auth, task_name, augur_db) -> None: + + owner, repo = get_owner_repo(repo_git) + + # define logger for task + logger.info(f"Collecting github comments for {owner}/{repo}") + + engine = get_engine() + + with engine.connect() as connection: + + query = text(f""" + (select pr_comments_url from pull_requests WHERE repo_id={repo_id} order by pr_created_at desc) + UNION + (select comments_url as comment_url from issues WHERE repo_id={repo_id} order by created_at desc); + """) + + result = connection.execute(query).fetchall() + comment_urls = [x[0] for x in result] + + all_data = [] + for index, comment_url in enumerate(comment_urls): + + logger.info(f"{task_name}: Github messages index {index+1} of {len(comment_urls)}") + + messages = GithubPaginator(comment_url, key_auth, logger) + for page_data, _ in messages.iter_pages(): + + if page_data is None or len(page_data) == 0: + break + + all_data += page_data + + logger.info(f"All data size: {len(all_data)}") + + if len(all_data) >= 20: + process_messages(all_data, task_name, repo_id, logger, augur_db) + all_data.clear() + + if len(all_data) > 0: + process_messages(all_data, task_name, repo_id, logger, augur_db) + def process_messages(messages, task_name, repo_id, logger, augur_db): @@ -182,7 +232,7 @@ def process_messages(messages, task_name, repo_id, logger, augur_db): augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) logger.info(f"{task_name}: Inserting {len(message_dicts)} messages") - message_natural_keys = ["platform_msg_id"] + message_natural_keys = ["platform_msg_id", "pltfrm_id"] message_return_columns = ["msg_id", "platform_msg_id"] message_string_fields = ["msg_text"] message_return_data = augur_db.insert_data(message_dicts, Message, message_natural_keys, diff --git a/augur/tasks/github/pull_requests/commits_model/core.py b/augur/tasks/github/pull_requests/commits_model/core.py index 33acb4bfbc..ea91a597da 100644 --- a/augur/tasks/github/pull_requests/commits_model/core.py +++ b/augur/tasks/github/pull_requests/commits_model/core.py @@ -1,9 +1,5 @@ -import logging -from typing import Dict, List, Tuple, Optional -import traceback import sqlalchemy as s -from augur.application.db.session import DatabaseSession -from augur.tasks.github.util.github_paginator import GithubPaginator, hit_api +from augur.tasks.github.util.github_paginator import GithubPaginator from augur.application.db.models import * from augur.tasks.github.util.util import get_owner_repo from augur.application.db.util import execute_session_query diff --git a/augur/tasks/github/pull_requests/commits_model/tasks.py b/augur/tasks/github/pull_requests/commits_model/tasks.py index 61bb66fc1c..f0a065bdd1 100644 --- a/augur/tasks/github/pull_requests/commits_model/tasks.py +++ b/augur/tasks/github/pull_requests/commits_model/tasks.py @@ -1,6 +1,4 @@ import logging -import traceback -from augur.application.db.session import DatabaseSession from augur.tasks.github.pull_requests.commits_model.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurSecondaryRepoCollectionTask diff --git a/augur/tasks/github/pull_requests/core.py b/augur/tasks/github/pull_requests/core.py index 16d843063d..5bc86cd676 100644 --- a/augur/tasks/github/pull_requests/core.py +++ b/augur/tasks/github/pull_requests/core.py @@ -1,4 +1,3 @@ -import time import logging from typing import Dict, List, Tuple, Optional @@ -7,7 +6,7 @@ from augur.application.db.session import DatabaseSession from augur.tasks.github.util.util import add_key_value_pair_to_dicts from augur.tasks.util.worker_util import remove_duplicate_dicts -from augur.application.db.models import PullRequest, Message, PullRequestReview, PullRequestLabel, PullRequestReviewer, PullRequestEvent, PullRequestMeta, PullRequestAssignee, PullRequestReviewMessageRef, PullRequestMessageRef, Contributor, Repo +from augur.application.db.models import PullRequest, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestAssignee, Contributor PLATFORM_ID = 1 diff --git a/augur/tasks/github/pull_requests/files_model/core.py b/augur/tasks/github/pull_requests/files_model/core.py index 81b4c4397a..138aa61cb3 100644 --- a/augur/tasks/github/pull_requests/files_model/core.py +++ b/augur/tasks/github/pull_requests/files_model/core.py @@ -1,10 +1,5 @@ -import logging -from typing import Dict, List, Tuple, Optional -import traceback import sqlalchemy as s -from augur.application.db.session import DatabaseSession -from augur.tasks.github.util.github_paginator import GithubPaginator, hit_api -from augur.tasks.github.util.gh_graphql_entities import GraphQlPageCollection, hit_api_graphql +from augur.tasks.github.util.gh_graphql_entities import GraphQlPageCollection from augur.application.db.models import * from augur.tasks.github.util.util import get_owner_repo from augur.application.db.util import execute_session_query diff --git a/augur/tasks/github/pull_requests/files_model/tasks.py b/augur/tasks/github/pull_requests/files_model/tasks.py index 851b9e4118..988261f6c8 100644 --- a/augur/tasks/github/pull_requests/files_model/tasks.py +++ b/augur/tasks/github/pull_requests/files_model/tasks.py @@ -1,6 +1,4 @@ import logging -import traceback -from augur.application.db.session import DatabaseSession from augur.tasks.github.pull_requests.files_model.core import * from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.init.celery_app import celery_app as celery diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index 8db394754c..73ea1b025a 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -1,20 +1,19 @@ -import time import logging -import traceback from augur.tasks.github.pull_requests.core import extract_data_from_pr_list from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask, AugurSecondaryRepoCollectionTask from augur.application.db.data_parse import * -from augur.tasks.github.util.github_paginator import GithubPaginator, hit_api +from augur.tasks.github.util.github_paginator import GithubPaginator from augur.tasks.github.util.github_task_session import GithubTaskManifest -from augur.application.db.session import DatabaseSession from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import add_key_value_pair_to_dicts, get_owner_repo -from augur.application.db.models import PullRequest, Message, PullRequestReview, PullRequestLabel, PullRequestReviewer, PullRequestEvent, PullRequestMeta, PullRequestAssignee, PullRequestReviewMessageRef, PullRequestMessageRef, Contributor, Repo +from augur.application.db.models import PullRequest, Message, PullRequestReview, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestAssignee, PullRequestReviewMessageRef, Contributor, Repo from augur.application.db.util import execute_session_query from ..messages.tasks import process_github_comment_contributors +from typing import Generator, List, Dict + platform_id = 1 @@ -32,20 +31,32 @@ def collect_pull_requests(repo_git: str) -> int: Repo.repo_git == repo_git).one().repo_id owner, repo = get_owner_repo(repo_git) - pr_data = retrieve_all_pr_data(repo_git, logger, manifest.key_auth) - if pr_data: - process_pull_requests(pr_data, f"{owner}/{repo}: Pr task", repo_id, logger, augur_db) + total_count = 0 + all_data = [] + for page in retrieve_all_pr_data(repo_git, logger, manifest.key_auth): + all_data += page + + if len(all_data) >= 1000: + process_pull_requests(all_data, f"{owner}/{repo}: Pr task", repo_id, logger, augur_db) + total_count += len(all_data) + all_data.clear() + + if len(all_data): + process_pull_requests(all_data, f"{owner}/{repo}: Pr task", repo_id, logger, augur_db) + total_count += len(all_data) - return len(pr_data) + if total_count > 0: + return total_count else: logger.info(f"{owner}/{repo} has no pull requests") return 0 + # TODO: Rename pull_request_reviewers table to pull_request_requested_reviewers # TODO: Fix column names in pull request labels table -def retrieve_all_pr_data(repo_git: str, logger, key_auth) -> None: +def retrieve_all_pr_data(repo_git: str, logger, key_auth): #-> Generator[List[Dict]]: owner, repo = get_owner_repo(repo_git) @@ -55,24 +66,21 @@ def retrieve_all_pr_data(repo_git: str, logger, key_auth) -> None: # returns an iterable of all prs at this url (this essentially means you can treat the prs variable as a list of the prs) prs = GithubPaginator(url, key_auth, logger) - all_data = [] num_pages = prs.get_num_pages() for page_data, page in prs.iter_pages(): if page_data is None: - return all_data + return if len(page_data) == 0: logger.debug( f"{owner}/{repo} Prs Page {page} contains no data...returning") logger.info(f"{owner}/{repo} Prs Page {page} of {num_pages}") - return all_data + return logger.info(f"{owner}/{repo} Prs Page {page} of {num_pages}") - - all_data += page_data - - return all_data + + yield page_data def process_pull_requests(pull_requests, task_name, repo_id, logger, augur_db): @@ -282,7 +290,7 @@ def collect_pull_request_review_comments(repo_git: str) -> None: logger.info(f"Inserting {len(pr_review_comment_dicts)} pr review comments") - message_natural_keys = ["platform_msg_id"] + message_natural_keys = ["platform_msg_id", "pltfrm_id"] message_return_columns = ["msg_id", "platform_msg_id"] message_return_data = augur_db.insert_data(pr_review_comment_dicts, Message, message_natural_keys, message_return_columns) if message_return_data is None: diff --git a/augur/tasks/github/releases/core.py b/augur/tasks/github/releases/core.py index 5957d4cb57..b7f953c618 100644 --- a/augur/tasks/github/releases/core.py +++ b/augur/tasks/github/releases/core.py @@ -1,16 +1,8 @@ #SPDX-License-Identifier: MIT -import logging, os, sys, time, requests, json -from datetime import datetime -from multiprocessing import Process, Queue -from urllib.parse import urlparse -import pandas as pd -import sqlalchemy as s -from sqlalchemy import MetaData -from sqlalchemy.ext.automap import automap_base from augur.tasks.github.util.github_task_session import * from augur.application.db.models import * from augur.tasks.github.util.util import get_owner_repo -from augur.tasks.github.util.gh_graphql_entities import hit_api_graphql, request_graphql_dict +from augur.tasks.github.util.gh_graphql_entities import request_graphql_dict from augur.application.db.util import execute_session_query diff --git a/augur/tasks/github/releases/tasks.py b/augur/tasks/github/releases/tasks.py index 11b73b3424..310da90d74 100644 --- a/augur/tasks/github/releases/tasks.py +++ b/augur/tasks/github/releases/tasks.py @@ -1,9 +1,10 @@ +import logging + from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.github.releases.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.application.db.util import execute_session_query -import traceback @celery.task(base=AugurCoreRepoCollectionTask) def collect_releases(repo_git): diff --git a/augur/tasks/github/repo_info/core.py b/augur/tasks/github/repo_info/core.py index 50142f614e..2a9f21af72 100644 --- a/augur/tasks/github/repo_info/core.py +++ b/augur/tasks/github/repo_info/core.py @@ -1,15 +1,10 @@ #SPDX-License-Identifier: MIT -import logging, os, sys, time, requests, json -from datetime import datetime -from multiprocessing import Process, Queue -import pandas as pd +import json import sqlalchemy as s -import httpx -import logging from augur.tasks.github.util.github_paginator import GithubPaginator from augur.tasks.github.util.github_paginator import hit_api from augur.tasks.github.util.util import get_owner_repo -from augur.tasks.github.util.gh_graphql_entities import hit_api_graphql, request_graphql_dict +from augur.tasks.github.util.gh_graphql_entities import request_graphql_dict from augur.application.db.models import * from augur.tasks.github.util.github_task_session import * from augur.application.db.models.augur_data import RepoBadging diff --git a/augur/tasks/github/repo_info/tasks.py b/augur/tasks/github/repo_info/tasks.py index d35c5dbdf8..b31bc7bf62 100644 --- a/augur/tasks/github/repo_info/tasks.py +++ b/augur/tasks/github/repo_info/tasks.py @@ -1,10 +1,10 @@ +import logging + from augur.tasks.github.util.github_task_session import GithubTaskManifest -from augur.application.db.session import DatabaseSession from augur.tasks.github.repo_info.core import * from augur.tasks.init.celery_app import celery_app as celery from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.application.db.util import execute_session_query -import traceback #Task to get regular misc github info diff --git a/augur/tasks/github/traffic/tasks.py b/augur/tasks/github/traffic/tasks.py index 70bf2fde15..068c9616b7 100644 --- a/augur/tasks/github/traffic/tasks.py +++ b/augur/tasks/github/traffic/tasks.py @@ -1,7 +1,6 @@ -import time import logging -from augur.tasks.init.celery_app import celery_app as celery, engine +from augur.tasks.init.celery_app import celery_app as celery from augur.application.db.data_parse import extract_needed_clone_history_data from augur.tasks.github.util.github_paginator import GithubPaginator from augur.tasks.github.util.github_task_session import GithubTaskManifest diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index e4f718af68..574adbbaf0 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -1,10 +1,8 @@ from augur.tasks.github.util.github_task_session import * -from typing import List, Optional, Union, Generator, Tuple #from gql import gql, Client #from gql.transport.aiohttp import AIOHTTPTransport import httpx import json -from random import choice import collections import time import traceback @@ -340,17 +338,21 @@ def __iter__(self): #self.logger.info(f"{params}") data = self.request_graphql_dict(variables=params) try: - coreData = self.extract_paginate_result(data) - #Check to make sure we have data - coreData['totalCount'] + coreData = self.extract_paginate_result(data) + if coreData is not None: + if coreData.get('totalCount') is not None: + self.logger.info("... core data obtained") + else: + self.logger.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") + coreData['totalCount'] = 0 + else: + self.logger.error("Core data is None, cannot proceed with operations on it, but assigning a value of Zero to ensure continued collection.") + yield None + return except KeyError as e: self.logger.error("Could not extract paginate result because there was no data returned") - self.logger.error( - ''.join(traceback.format_exception(None, e, e.__traceback__))) - - self.logger.info(f"Graphql paramters: {params}") - return + self.logger.error(''.join(traceback.format_exception(None, e, e.__traceback__))) if int(coreData['totalCount']) == 0: diff --git a/augur/tasks/github/util/github_api_key_handler.py b/augur/tasks/github/util/github_api_key_handler.py index 20ce07f066..d87d7495eb 100644 --- a/augur/tasks/github/util/github_api_key_handler.py +++ b/augur/tasks/github/util/github_api_key_handler.py @@ -2,11 +2,12 @@ import time import random -from typing import Optional, List +from typing import List +from sqlalchemy.orm import Session from augur.tasks.util.redis_list import RedisList from augur.application.db.session import DatabaseSession -from augur.application.config import AugurConfig +from augur.application.db.lib import get_value from sqlalchemy import func @@ -26,11 +27,10 @@ class GithubApiKeyHandler(): key: (List[str]): List of keys retrieve from database or cache """ - def __init__(self, session: DatabaseSession): + def __init__(self, session: Session, logger): self.session = session - self.logger = session.logger - self.config = AugurConfig(self.logger, session) + self.logger = logger self.oauth_redis_key = "github_oauth_keys_list" @@ -58,7 +58,7 @@ def get_config_key(self) -> str: Github API key from config table """ - return self.config.get_value("Keys", "github_api_key") + return get_value("Keys", "github_api_key") def get_api_keys_from_database(self) -> List[str]: """Retieves all github api keys from database diff --git a/augur/tasks/github/util/github_paginator.py b/augur/tasks/github/util/github_paginator.py index 31c14565df..90593cedf6 100644 --- a/augur/tasks/github/util/github_paginator.py +++ b/augur/tasks/github/util/github_paginator.py @@ -4,8 +4,6 @@ import httpx import time import json -import asyncio -import datetime import logging @@ -305,8 +303,7 @@ def __iter__(self) -> Generator[Optional[dict], None, None]: return # yield the first page data - for data in data_list: - yield data + yield from data_list while 'next' in response.links.keys(): next_page = response.links['next']['url'] @@ -317,9 +314,8 @@ def __iter__(self) -> Generator[Optional[dict], None, None]: if result != GithubApiResult.SUCCESS: self.logger.debug("Failed to retrieve the data even though 10 attempts were given") return - - for data in data_list: - yield data + + yield from data_list def iter_pages(self) -> Generator[Tuple[Optional[List[dict]], int], None, None]: """Provide data from Github API via a generator that yields a page of dicts at a time. @@ -391,9 +387,37 @@ def retrieve_data(self, url: str) -> Tuple[Optional[List[dict]], Optional[httpx. if response.status_code == 204: return [], response, GithubApiResult.SUCCESS + if response.status_code == 404: + return None, response, GithubApiResult.REPO_NOT_FOUND - page_data = parse_json_response(self.logger, response) + if response.status_code in [403, 429]: + + if "Retry-After" in response.headers: + retry_after = int(response.headers["Retry-After"]) + self.logger.info( + f'\n\n\n\nSleeping for {retry_after} seconds due to secondary rate limit issue.\n\n\n\n') + time.sleep(retry_after) + + elif "X-RateLimit-Remaining" in response.headers and int(response.headers["X-RateLimit-Remaining"]) == 0: + current_epoch = int(time.time()) + epoch_when_key_resets = int(response.headers["X-RateLimit-Reset"]) + key_reset_time = epoch_when_key_resets - current_epoch + + if key_reset_time < 0: + self.logger.error(f"Key reset time was less than 0 setting it to 0.\nThe current epoch is {current_epoch} and the epoch that the key resets at is {epoch_when_key_resets}") + key_reset_time = 0 + + self.logger.info(f"\n\n\nAPI rate limit exceeded. Sleeping until the key resets ({key_reset_time} seconds)") + time.sleep(key_reset_time) + num_attempts = 0 + + else: + time.sleep(60) + + continue + + page_data = parse_json_response(self.logger, response) # if the data is a list, then return it and the response if isinstance(page_data, list) is True: @@ -403,6 +427,8 @@ def retrieve_data(self, url: str) -> Tuple[Optional[List[dict]], Optional[httpx. if isinstance(page_data, dict) is True: dict_processing_result = process_dict_response(self.logger, response, page_data) + self.logger.info(f"Used string interogation of dict to determine result. Response code: {response.status_code}. Processing result: {dict_processing_result}. Response body: {page_data}") + if dict_processing_result == GithubApiResult.NEW_RESULT: self.logger.info(f"Encountered new dict response from api on url: {url}. Response: {page_data}") return None, None, GithubApiResult.NEW_RESULT @@ -587,6 +613,7 @@ def retrieve_dict_from_endpoint(logger, key_auth, url, timeout_wait=10) -> Tuple page_data = parse_json_response(logger, response) if isinstance(page_data, str): + # TODO: Define process_str_response as outside the class and fix this reference str_processing_result: Union[str, List[dict]] = process_str_response(logger,page_data) if isinstance(str_processing_result, dict): diff --git a/augur/tasks/github/util/github_random_key_auth.py b/augur/tasks/github/util/github_random_key_auth.py index 926ac04216..ed539430d8 100644 --- a/augur/tasks/github/util/github_random_key_auth.py +++ b/augur/tasks/github/util/github_random_key_auth.py @@ -2,21 +2,19 @@ from augur.tasks.util.random_key_auth import RandomKeyAuth from augur.tasks.github.util.github_api_key_handler import GithubApiKeyHandler -from augur.application.db.session import DatabaseSession -import random - +from sqlalchemy.orm import Session class GithubRandomKeyAuth(RandomKeyAuth): """Defines a github specific RandomKeyAuth class so github collections can have a class randomly selects an api key for each request """ - def __init__(self, session: DatabaseSession, logger): + def __init__(self, session: Session, logger): """Creates a GithubRandomKeyAuth object and initializes the RandomKeyAuth parent class""" # gets the github api keys from the database via the GithubApiKeyHandler - github_api_keys = GithubApiKeyHandler(session).keys + github_api_keys = GithubApiKeyHandler(session, logger).keys #github_api_keys = random.sample(github_api_keys, len(github_api_keys)) if not github_api_keys: @@ -26,4 +24,4 @@ def __init__(self, session: DatabaseSession, logger): header_name = "Authorization" key_format = "token {0}" - super().__init__(github_api_keys, header_name, session.logger, key_format) \ No newline at end of file + super().__init__(github_api_keys, header_name, logger, key_format) \ No newline at end of file diff --git a/augur/tasks/github/util/github_task_session.py b/augur/tasks/github/util/github_task_session.py index 76343a62f2..0acbbf64cd 100644 --- a/augur/tasks/github/util/github_task_session.py +++ b/augur/tasks/github/util/github_task_session.py @@ -2,14 +2,13 @@ from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth from augur.application.db.session import DatabaseSession - +from augur.application.db import get_engine class GithubTaskManifest: def __init__(self, logger): - from augur.tasks.init.celery_app import engine - from augur.application.db.session import DatabaseSession + engine = get_engine() self.augur_db = DatabaseSession(logger, engine) self.key_auth = GithubRandomKeyAuth(self.augur_db.session, logger) diff --git a/augur/tasks/github/util/util.py b/augur/tasks/github/util/util.py index 42989dcca3..5dfe100977 100644 --- a/augur/tasks/github/util/util.py +++ b/augur/tasks/github/util/util.py @@ -1,6 +1,5 @@ """Utility functions that are useful for several Github tasks""" from typing import Any, List, Tuple -from httpx import Response import logging import json import httpx @@ -81,7 +80,9 @@ def get_repo_weight_by_issue(logger,repo_git): #Get the weight for each repo for the core collection hook def get_repo_weight_core(logger,repo_git): - from augur.tasks.init.celery_app import engine + + from augur.application.db import get_engine + engine = get_engine() with DatabaseSession(logger,engine) as session: repo = Repo.get_by_repo_git(session, repo_git) diff --git a/augur/tasks/init/test.py b/augur/tasks/gitlab/__init__.py similarity index 100% rename from augur/tasks/init/test.py rename to augur/tasks/gitlab/__init__.py diff --git a/augur/tasks/gitlab/events_task.py b/augur/tasks/gitlab/events_task.py index 8058831ba3..a7b886da2d 100644 --- a/augur/tasks/gitlab/events_task.py +++ b/augur/tasks/gitlab/events_task.py @@ -8,7 +8,7 @@ from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest from augur.application.db.data_parse import extract_gitlab_mr_event_data, extract_gitlab_issue_event_data -from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts +from augur.tasks.github.util.util import get_owner_repo from augur.application.db.models import Repo, Issue, IssueEvent, PullRequest, PullRequestEvent from augur.application.db.util import execute_session_query @@ -201,9 +201,8 @@ def process_mr_events(events, task_name, repo_id, logger, augur_db): tool_source, tool_version, data_source) ) - # TODO: Add unique key for this logger.info(f"{task_name}: Inserting {len(mr_event_dicts)} gitlab mr events") - mr_event_natural_keys = ["pull_request_id", "issue_event_src_id"] + mr_event_natural_keys = ["platform_id", "node_id"] augur_db.insert_data(mr_event_dicts, PullRequestEvent, mr_event_natural_keys) diff --git a/augur/tasks/gitlab/gitlab_api_handler.py b/augur/tasks/gitlab/gitlab_api_handler.py index 5303d606e9..711688b2bb 100644 --- a/augur/tasks/gitlab/gitlab_api_handler.py +++ b/augur/tasks/gitlab/gitlab_api_handler.py @@ -6,7 +6,7 @@ import time import logging -from typing import List, Optional, Union, Generator, Tuple +from typing import List, Optional, Generator, Tuple from urllib.parse import urlencode, urlparse, parse_qs, urlunparse from enum import Enum diff --git a/augur/tasks/gitlab/gitlab_api_key_handler.py b/augur/tasks/gitlab/gitlab_api_key_handler.py index 20bc1219ca..c3a76f6ddc 100644 --- a/augur/tasks/gitlab/gitlab_api_key_handler.py +++ b/augur/tasks/gitlab/gitlab_api_key_handler.py @@ -7,11 +7,11 @@ import time import random -from typing import Optional, List +from typing import List +from sqlalchemy.orm import Session from augur.tasks.util.redis_list import RedisList -from augur.application.db.session import DatabaseSession -from augur.application.config import AugurConfig +from augur.application.db.lib import get_value from sqlalchemy import func @@ -31,11 +31,10 @@ class GitlabApiKeyHandler(): key: (List[str]): List of keys retrieve from database or cache """ - def __init__(self, session: DatabaseSession): + def __init__(self, session: Session, logger): self.session = session - self.logger = session.logger - self.config = AugurConfig(self.logger, session) + self.logger = logger self.oauth_redis_key = "gitlab_oauth_keys_list" @@ -62,7 +61,7 @@ def get_config_key(self) -> str: Returns: Github API key from config table """ - return self.config.get_value("Keys", "gitlab_api_key") + return get_value("Keys", "gitlab_api_key") def get_api_keys_from_database(self) -> List[str]: """Retieves all gitlab api keys from database diff --git a/augur/tasks/gitlab/gitlab_random_key_auth.py b/augur/tasks/gitlab/gitlab_random_key_auth.py index 64ba31dd19..b2afded3ae 100644 --- a/augur/tasks/gitlab/gitlab_random_key_auth.py +++ b/augur/tasks/gitlab/gitlab_random_key_auth.py @@ -1,8 +1,9 @@ """Defines the GitlabRandomKeyAuth class""" +from sqlalchemy.orm import Session + from augur.tasks.util.random_key_auth import RandomKeyAuth from augur.tasks.gitlab.gitlab_api_key_handler import GitlabApiKeyHandler -from augur.application.db.session import DatabaseSession class GitlabRandomKeyAuth(RandomKeyAuth): @@ -10,12 +11,12 @@ class GitlabRandomKeyAuth(RandomKeyAuth): gitlab collections can have a class randomly selects an api key for each request """ - def __init__(self, session: DatabaseSession, logger): + def __init__(self, session: Session, logger): """Creates a GitlabRandomKeyAuth object and initializes the RandomKeyAuth parent class""" # gets the gitlab api keys from the database via the GitlabApiKeyHandler - gitlab_api_keys = GitlabApiKeyHandler(session).keys + gitlab_api_keys = GitlabApiKeyHandler(session, logger).keys if not gitlab_api_keys: print("Failed to find github api keys. This is usually because your key has expired") diff --git a/augur/tasks/gitlab/gitlab_task_session.py b/augur/tasks/gitlab/gitlab_task_session.py index 58a6e64373..0892087d22 100644 --- a/augur/tasks/gitlab/gitlab_task_session.py +++ b/augur/tasks/gitlab/gitlab_task_session.py @@ -5,6 +5,7 @@ from augur.tasks.gitlab.gitlab_random_key_auth import GitlabRandomKeyAuth from augur.application.db.session import DatabaseSession +from augur.application.db import get_engine class GitlabTaskManifest: """ @@ -20,7 +21,7 @@ class GitlabTaskManifest: def __init__(self, logger): - from augur.tasks.init.celery_app import engine + engine = get_engine() self.augur_db = DatabaseSession(logger, engine) self.key_auth = GitlabRandomKeyAuth(self.augur_db.session, logger) diff --git a/augur/tasks/gitlab/issues_task.py b/augur/tasks/gitlab/issues_task.py index cf6e5e5dab..b96650c9a1 100644 --- a/augur/tasks/gitlab/issues_task.py +++ b/augur/tasks/gitlab/issues_task.py @@ -8,10 +8,11 @@ from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest -from augur.application.db.data_parse import extract_needed_issue_data_from_gitlab_issue, extract_needed_gitlab_issue_label_data, extract_needed_gitlab_issue_assignee_data, extract_needed_gitlab_issue_message_ref_data, extract_needed_gitlab_message_data +from augur.application.db.data_parse import extract_needed_issue_data_from_gitlab_issue, extract_needed_gitlab_issue_label_data, extract_needed_gitlab_issue_assignee_data, extract_needed_gitlab_issue_message_ref_data, extract_needed_gitlab_message_data, extract_needed_gitlab_contributor_data from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts -from augur.application.db.models import Issue, IssueLabel, IssueAssignee, IssueMessageRef, Message, Repo +from augur.application.db.models import Issue, IssueLabel, IssueAssignee, IssueMessageRef, Message, Repo, Contributor from augur.application.db.util import execute_session_query +from augur.tasks.util.worker_util import remove_duplicate_dicts platform_id = 2 @@ -50,8 +51,6 @@ def collect_gitlab_issues(repo_git : str) -> int: logger.error(f"Could not collect gitlab issues for repo {repo_git}\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") return -1 - - def retrieve_all_gitlab_issue_data(repo_git, logger, key_auth) -> None: """ Retrieve only the needed data for issues from the api response @@ -108,10 +107,15 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: issue_dicts = [] issue_ids = [] issue_mapping_data = {} + contributors = [] for issue in issues: issue_ids.append(issue["iid"]) + issue, contributor_data = process_issue_contributors(issue, tool_source, tool_version, data_source) + + contributors += contributor_data + issue_dicts.append( extract_needed_issue_data_from_gitlab_issue(issue, repo_id, tool_source, tool_version, data_source) ) @@ -132,6 +136,13 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: if len(issue_dicts) == 0: print("No gitlab issues found while processing") return + + # remove duplicate contributors before inserting + contributors = remove_duplicate_dicts(contributors) + + # insert contributors from these issues + logger.info(f"{task_name}: Inserting {len(contributors)} contributors") + augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) logger.info(f"{task_name}: Inserting {len(issue_dicts)} gitlab issues") issue_natural_keys = ["repo_id", "gh_issue_id"] @@ -169,13 +180,26 @@ def process_issues(issues, task_name, repo_id, logger, augur_db) -> None: issue_label_natural_keys, string_fields=issue_label_string_fields) # inserting issue assignees - # we are using issue_assignee_src_id and issue_id to determine if the label is already in the database. - # issue_assignee_natural_keys = ['issue_assignee_src_id', 'issue_id'] - # augur_db.insert_data(issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) + issue_assignee_natural_keys = ['issue_assignee_src_id', 'issue_id'] + augur_db.insert_data(issue_assignee_dicts, IssueAssignee, issue_assignee_natural_keys) return issue_ids +def process_issue_contributors(issue, tool_source, tool_version, data_source): + + contributors = [] + + issue_cntrb = extract_needed_gitlab_contributor_data(issue["author"], tool_source, tool_version, data_source) + issue["cntrb_id"] = issue_cntrb["cntrb_id"] + contributors.append(issue_cntrb) + for assignee in issue["assignees"]: + + issue_assignee_cntrb = extract_needed_gitlab_contributor_data(assignee, tool_source, tool_version, data_source) + assignee["cntrb_id"] = issue_assignee_cntrb["cntrb_id"] + contributors.append(issue_assignee_cntrb) + + return issue, contributors @celery.task(base=AugurCoreRepoCollectionTask) def collect_gitlab_issue_comments(issue_ids, repo_git) -> int: @@ -232,7 +256,7 @@ def retrieve_all_gitlab_issue_comments(key_auth, logger, issue_ids, repo_git): url = f"https://gitlab.com/api/v4/projects/{owner}%2f{repo}/issues/{id}/notes" - for page_data, page in comments.iter_pages(url): + for page_data, _ in comments.iter_pages(url): if page_data is None or len(page_data) == 0: break @@ -270,6 +294,7 @@ def process_gitlab_issue_messages(data, task_name, repo_id, logger, augur_db): issue_number_to_id_map[issue.gh_issue_number] = issue.issue_id message_dicts = [] + contributors = [] message_ref_mapping_data = {} for id, messages in data.items(): @@ -283,6 +308,11 @@ def process_gitlab_issue_messages(data, task_name, repo_id, logger, augur_db): for message in messages: + message, contributor = process_gitlab_issue_comment_contributors(message, tool_source, tool_version, data_source) + + if contributor: + contributors.append(contributor) + issue_message_ref_data = extract_needed_gitlab_issue_message_ref_data(message, issue_id, repo_id, tool_source, tool_version, data_source) message_ref_mapping_data[message["id"]] = { @@ -293,9 +323,13 @@ def process_gitlab_issue_messages(data, task_name, repo_id, logger, augur_db): extract_needed_gitlab_message_data(message, platform_id, tool_source, tool_version, data_source) ) + contributors = remove_duplicate_dicts(contributors) + + logger.info(f"{task_name}: Inserting {len(contributors)} contributors") + augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) logger.info(f"{task_name}: Inserting {len(message_dicts)} messages") - message_natural_keys = ["platform_msg_id"] + message_natural_keys = ["platform_msg_id", "pltfrm_id"] message_return_columns = ["msg_id", "platform_msg_id"] message_string_fields = ["msg_text"] message_return_data = augur_db.insert_data(message_dicts, Message, message_natural_keys, @@ -318,3 +352,12 @@ def process_gitlab_issue_messages(data, task_name, repo_id, logger, augur_db): augur_db.insert_data(issue_message_ref_dicts, IssueMessageRef, issue_message_ref_natural_keys) +def process_gitlab_issue_comment_contributors(message, tool_source, tool_version, data_source): + + contributor = extract_needed_gitlab_contributor_data(message["author"], tool_source, tool_version, data_source) + if contributor: + message["cntrb_id"] = contributor["cntrb_id"] + else: + message["cntrb_id"] = None + + return message, contributor \ No newline at end of file diff --git a/augur/tasks/gitlab/merge_request_task.py b/augur/tasks/gitlab/merge_request_task.py index ccf3c7e012..d5212a52d4 100644 --- a/augur/tasks/gitlab/merge_request_task.py +++ b/augur/tasks/gitlab/merge_request_task.py @@ -4,10 +4,11 @@ from augur.tasks.init.celery_app import AugurCoreRepoCollectionTask from augur.tasks.gitlab.gitlab_api_handler import GitlabApiHandler from augur.tasks.gitlab.gitlab_task_session import GitlabTaskManifest -from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data, extract_needed_mr_reviewer_data, extract_needed_mr_commit_data, extract_needed_mr_file_data, extract_needed_mr_metadata, extract_needed_gitlab_mr_message_ref_data, extract_needed_gitlab_message_data +from augur.application.db.data_parse import extract_needed_pr_data_from_gitlab_merge_request, extract_needed_merge_request_assignee_data, extract_needed_mr_label_data, extract_needed_mr_reviewer_data, extract_needed_mr_commit_data, extract_needed_mr_file_data, extract_needed_mr_metadata, extract_needed_gitlab_mr_message_ref_data, extract_needed_gitlab_message_data, extract_needed_gitlab_contributor_data from augur.tasks.github.util.util import get_owner_repo, add_key_value_pair_to_dicts -from augur.application.db.models import PullRequest, PullRequestAssignee, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestCommit, PullRequestFile, PullRequestMessageRef, Repo, Message +from augur.application.db.models import PullRequest, PullRequestLabel, PullRequestMeta, PullRequestCommit, PullRequestFile, PullRequestMessageRef, Repo, Message, Contributor from augur.application.db.util import execute_session_query +from augur.tasks.util.worker_util import remove_duplicate_dicts platform_id = 2 @@ -99,12 +100,17 @@ def process_merge_requests(data, task_name, repo_id, logger, augur_db): data_source = "Gitlab API" merge_requests = [] + contributors = [] mr_ids = [] mr_mapping_data = {} for mr in data: mr_ids.append(mr["iid"]) + mr, contributor_data = process_mr_contributors(mr, tool_source, tool_version, data_source) + + contributors += contributor_data + merge_requests.append(extract_needed_pr_data_from_gitlab_merge_request(mr, repo_id, tool_source, tool_version)) assignees = extract_needed_merge_request_assignee_data(mr["assignees"], repo_id, tool_source, tool_version, data_source) @@ -117,6 +123,11 @@ def process_merge_requests(data, task_name, repo_id, logger, augur_db): "labels": labels } + contributors = remove_duplicate_dicts(contributors) + + logger.info(f"{task_name}: Inserting {len(contributors)} contributors") + augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) + logger.info(f"{task_name}: Inserting mrs of length: {len(merge_requests)}") pr_natural_keys = ["repo_id", "pr_src_id"] pr_string_fields = ["pr_src_title", "pr_body"] @@ -142,9 +153,8 @@ def process_merge_requests(data, task_name, repo_id, logger, augur_db): logger.info(f"{task_name}: Inserting other pr data of lengths: Labels: {len(mr_label_dicts)} - Assignees: {len(mr_assignee_dicts)}") - # TODO: Setup unique key on asignees with a value of ('cntrb_id', 'pull_request_id') and add 'cntrb_id' to assingee data - # mr_assignee_natural_keys = ['pr_assignee_src_id', 'pull_request_id'] - # augur_db.insert_data(mr_assignee_dicts, PullRequestAssignee, mr_assignee_natural_keys) + mr_assignee_natural_keys = ['pr_assignee_src_id', 'pull_request_id'] + augur_db.insert_data(mr_assignee_dicts, PullRequestAssignee, mr_assignee_natural_keys) pr_label_natural_keys = ['pr_src_id', 'pull_request_id'] pr_label_string_fields = ["pr_src_description"] @@ -208,6 +218,7 @@ def process_gitlab_mr_messages(data, task_name, repo_id, logger, augur_db): mr_number_to_id_map[mr.pr_src_number] = mr.pull_request_id message_dicts = [] + contributors = [] message_ref_mapping_data = {} for id, messages in data.items(): @@ -221,6 +232,11 @@ def process_gitlab_mr_messages(data, task_name, repo_id, logger, augur_db): for message in messages: + message, contributor = process_gitlab_mr_comment_contributors(message, tool_source, tool_version, data_source) + + if contributor: + contributors.append(contributor) + mr_message_ref_data = extract_needed_gitlab_mr_message_ref_data(message, pull_request_id, repo_id, tool_source, tool_version, data_source) message_ref_mapping_data[message["id"]] = { @@ -231,9 +247,13 @@ def process_gitlab_mr_messages(data, task_name, repo_id, logger, augur_db): extract_needed_gitlab_message_data(message, platform_id, tool_source, tool_version, data_source) ) + contributors = remove_duplicate_dicts(contributors) + + logger.info(f"{task_name}: Inserting {len(contributors)} mr message contributors") + augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) - logger.info(f"{task_name}: Inserting {len(message_dicts)} messages") - message_natural_keys = ["platform_msg_id"] + logger.info(f"{task_name}: Inserting {len(message_dicts)} mr messages") + message_natural_keys = ["platform_msg_id", "pltfrm_id"] message_return_columns = ["msg_id", "platform_msg_id"] message_string_fields = ["msg_text"] message_return_data = augur_db.insert_data(message_dicts, Message, message_natural_keys, @@ -312,8 +332,10 @@ def process_mr_metadata(data, task_name, repo_id, logger, augur_db): for id, metadata in data.items(): pull_request_id = mr_number_to_id_map[id] - - all_metadata.extend(extract_needed_mr_metadata(metadata, repo_id, pull_request_id, tool_source, tool_version, data_source)) + + #ignore blank metadata + if metadata: + all_metadata.extend(extract_needed_mr_metadata(metadata, repo_id, pull_request_id, tool_source, tool_version, data_source)) logger.info(f"{task_name}: Inserting {len(all_metadata)} merge request metadata") pr_metadata_natural_keys = ['pull_request_id', 'pr_head_or_base', 'pr_sha'] @@ -558,3 +580,30 @@ def retrieve_merge_request_data(ids, url, name, owner, repo, key_auth, logger, r index += 1 return all_data + + +def process_mr_contributors(mr, tool_source, tool_version, data_source): + + contributors = [] + + issue_cntrb = extract_needed_gitlab_contributor_data(mr["author"], tool_source, tool_version, data_source) + mr["cntrb_id"] = issue_cntrb["cntrb_id"] + contributors.append(issue_cntrb) + + for assignee in mr["assignees"]: + + issue_assignee_cntrb = extract_needed_gitlab_contributor_data(assignee, tool_source, tool_version, data_source) + assignee["cntrb_id"] = issue_assignee_cntrb["cntrb_id"] + contributors.append(issue_assignee_cntrb) + + return mr, contributors + +def process_gitlab_mr_comment_contributors(message, tool_source, tool_version, data_source): + + contributor = extract_needed_gitlab_contributor_data(message["author"], tool_source, tool_version, data_source) + if contributor: + message["cntrb_id"] = contributor["cntrb_id"] + else: + message["cntrb_id"] = None + + return message, contributor \ No newline at end of file diff --git a/augur/tasks/init/celery_app.py b/augur/tasks/init/celery_app.py index 274305449a..a18284186c 100644 --- a/augur/tasks/init/celery_app.py +++ b/augur/tasks/init/celery_app.py @@ -1,25 +1,23 @@ """Defines the Celery app.""" -from celery.signals import worker_process_init, worker_process_shutdown, eventlet_pool_started, eventlet_pool_preshutdown, eventlet_pool_postshutdown +from celery.signals import worker_process_init, worker_process_shutdown import logging from typing import List, Dict import os import datetime -from enum import Enum import traceback import celery from celery import Celery from celery import current_app from celery.signals import after_setup_logger -from sqlalchemy import create_engine, event, or_, and_ from augur.application.logs import TaskLogConfig, AugurLogger from augur.application.db.session import DatabaseSession from augur.application.db.engine import DatabaseEngine +from augur.application.db import get_engine from augur.application.config import AugurConfig -from augur.application.db.engine import get_database_string from augur.tasks.init import get_redis_conn_values, get_rabbitmq_conn_string -from augur.application.db.models import CollectionStatus, Repo +from augur.application.db.models import Repo from augur.tasks.util.collection_state import CollectionState logger = logging.getLogger(__name__) @@ -77,7 +75,9 @@ class AugurCoreRepoCollectionTask(celery.Task): def augur_handle_task_failure(self,exc,task_id,repo_git,logger_name,collection_hook='core',after_fail=CollectionState.ERROR.value): - from augur.tasks.init.celery_app import engine + + # Note: I think self.app.engine would work but leaving it to try later + engine = get_engine() logger = AugurLogger(logger_name).get_logger() @@ -205,8 +205,10 @@ def setup_periodic_tasks(sender, **kwargs): from augur.tasks.git.facade_tasks import clone_repos from augur.tasks.db.refresh_materialized_views import refresh_materialized_views from augur.tasks.data_analysis.contributor_breadth_worker.contributor_breadth_worker import contributor_breadth_model - - with DatabaseEngine() as engine, DatabaseSession(logger, engine) as session: + from augur.application.db import temporary_database_engine + + # Need to engine to be temporary so that there isn't an engine defined when the parent is forked to create worker processes + with temporary_database_engine() as engine, DatabaseSession(logger, engine) as session: config = AugurConfig(logger, session) @@ -233,7 +235,6 @@ def setup_periodic_tasks(sender, **kwargs): thirty_days_in_seconds = 30*24*60*60 sender.add_periodic_task(thirty_days_in_seconds, contributor_breadth_model.s()) - @after_setup_logger.connect def setup_loggers(*args,**kwargs): """Override Celery loggers with our own.""" @@ -245,23 +246,29 @@ def setup_loggers(*args,**kwargs): TaskLogConfig(split_tasks_into_groups(augur_tasks)) -engine = None +#engine = None @worker_process_init.connect def init_worker(**kwargs): - global engine + celery_app.engine = get_engine() - from augur.application.db.engine import DatabaseEngine - from sqlalchemy.pool import NullPool, StaticPool + # global engine - engine = DatabaseEngine(poolclass=StaticPool).engine + # from augur.application.db.engine import DatabaseEngine + # from sqlalchemy.pool import NullPool, StaticPool + + # engine = DatabaseEngine(poolclass=StaticPool).engine @worker_process_shutdown.connect def shutdown_worker(**kwargs): - global engine - if engine: - logger.info('Closing database connectionn for worker') - engine.dispose() + + from augur.application.db import dispose_database_engine + dispose_database_engine() + + # global engine + # if engine: + # logger.info('Closing database connectionn for worker') + # engine.dispose() diff --git a/augur/tasks/init/celery_worker.py b/augur/tasks/init/celery_worker.py deleted file mode 100644 index c9a76569a7..0000000000 --- a/augur/tasks/init/celery_worker.py +++ /dev/null @@ -1,4 +0,0 @@ -from celery.signals import worker_process_init, worker_process_shutdown - -print("Celery worker") - diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index a9ba7e1634..866b7a0288 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -1,17 +1,9 @@ from __future__ import annotations -from typing import List -import time import logging import os -from enum import Enum -import math -import numpy as np -import datetime -import random #from celery.result import AsyncResult -from celery import signature -from celery import group, chain, chord, signature -from sqlalchemy import or_, and_,tuple_, update +from celery import group, chain +from sqlalchemy import and_,update from augur.tasks.github import * @@ -24,16 +16,13 @@ from augur.tasks.github.pull_requests.commits_model.tasks import process_pull_request_commits from augur.tasks.git.dependency_tasks.tasks import process_ossf_dependency_metrics from augur.tasks.github.traffic.tasks import collect_github_repo_clones_data -from augur.tasks.gitlab.merge_request_task import collect_gitlab_merge_requests, collect_merge_request_comments, collect_merge_request_metadata, collect_merge_request_reviewers, collect_merge_request_commits, collect_merge_request_files +from augur.tasks.gitlab.merge_request_task import collect_gitlab_merge_requests, collect_merge_request_metadata, collect_merge_request_commits, collect_merge_request_files, collect_merge_request_comments from augur.tasks.gitlab.issues_task import collect_gitlab_issues, collect_gitlab_issue_comments from augur.tasks.gitlab.events_task import collect_gitlab_issue_events, collect_gitlab_merge_request_events from augur.tasks.git.facade_tasks import * from augur.tasks.db.refresh_materialized_views import * -# from augur.tasks.data_analysis import * from augur.tasks.init.celery_app import celery_app as celery from augur.application.db.session import DatabaseSession -from logging import Logger -from augur.tasks.util.redis_list import RedisList from augur.application.db.models import CollectionStatus, Repo from augur.tasks.util.collection_state import CollectionState from augur.tasks.util.collection_util import * @@ -101,16 +90,16 @@ def primary_repo_collect_phase_gitlab(repo_git): logger = logging.getLogger(primary_repo_collect_phase_gitlab.__name__) jobs = group( - chain(collect_gitlab_merge_requests.si(repo_git), group( - #collect_merge_request_comments.s(repo_git), - #collect_merge_request_reviewers.s(repo_git), + chain(collect_gitlab_merge_requests.si(repo_git), group( + collect_merge_request_comments.s(repo_git), + #collect_merge_request_reviewers.s(repo_git), collect_merge_request_metadata.s(repo_git), collect_merge_request_commits.s(repo_git), collect_merge_request_files.s(repo_git), collect_gitlab_merge_request_events.si(repo_git), )), chain(collect_gitlab_issues.si(repo_git), group( - #collect_gitlab_issue_comments.s(repo_git), + collect_gitlab_issue_comments.s(repo_git), collect_gitlab_issue_events.si(repo_git), )), ) @@ -137,19 +126,16 @@ def secondary_repo_collect_phase(repo_git): #This is a periodic task that runs less often to handle less important collection tasks such as #refreshing the materialized views. -@celery.task -def non_repo_domain_tasks(): +@celery.task(bind=True) +def non_repo_domain_tasks(self): - from augur.tasks.init.celery_app import engine + engine = self.app.engine logger = logging.getLogger(non_repo_domain_tasks.__name__) logger.info("Executing non-repo domain tasks") - enabled_phase_names = [] - with DatabaseSession(logger, engine) as session: - - enabled_phase_names = get_enabled_phase_names_from_config(session.logger, session) + enabled_phase_names = get_enabled_phase_names_from_config() enabled_tasks = [] @@ -245,10 +231,10 @@ def ml_task_success_util_gen(repo_git): request.get_valid_repos(session) return request -@celery.task -def augur_collection_monitor(): +@celery.task(bind=True) +def augur_collection_monitor(self): - from augur.tasks.init.celery_app import engine + engine = self.app.engine logger = logging.getLogger(augur_collection_monitor.__name__) @@ -256,7 +242,7 @@ def augur_collection_monitor(): with DatabaseSession(logger, engine) as session: #Get list of enabled phases - enabled_phase_names = get_enabled_phase_names_from_config(session.logger, session) + enabled_phase_names = get_enabled_phase_names_from_config() enabled_collection_hooks = [] @@ -283,10 +269,10 @@ def augur_collection_monitor(): # have a pipe of 180 -@celery.task -def augur_collection_update_weights(): +@celery.task(bind=True) +def augur_collection_update_weights(self): - from augur.tasks.init.celery_app import engine + engine = self.app.engine logger = logging.getLogger(augur_collection_update_weights.__name__) @@ -328,25 +314,34 @@ def augur_collection_update_weights(): session.commit() #git_update_commit_count_weight(repo_git) -@celery.task -def retry_errored_repos(): +@celery.task(bind=True) +def retry_errored_repos(self): """ Periodic task to reset repositories that have errored and try again. """ - from augur.tasks.init.celery_app import engine + engine = self.app.engine logger = logging.getLogger(create_collection_status_records.__name__) #TODO: Isaac needs to normalize the status's to be abstract in the #collection_status table once augur dev is less unstable. with DatabaseSession(logger,engine) as session: - query = s.sql.text(f"""UPDATE repo SET secondary_status = {CollectionState.PENDING.value}""" - f""" WHERE secondary_status = '{CollectionState.ERROR.value}' ;""" - f"""UPDATE repo SET core_status = {CollectionState.PENDING.value}""" - f""" WHERE core_status = '{CollectionState.ERROR.value}' ;""" - f"""UPDATE repo SET facade_status = {CollectionState.PENDING.value}""" - f""" WHERE facade_status = '{CollectionState.ERROR.value}' ;""" - f"""UPDATE repo SET ml_status = {CollectionState.PENDING.value}""" - f""" WHERE ml_status = '{CollectionState.ERROR.value}' ;""" + query = s.sql.text(f"""UPDATE collection_status SET secondary_status = '{CollectionState.PENDING.value}'""" + f""" WHERE secondary_status = '{CollectionState.ERROR.value}' and secondary_data_last_collected is NULL;""" + f"""UPDATE collection_status SET core_status = '{CollectionState.PENDING.value}'""" + f""" WHERE core_status = '{CollectionState.ERROR.value}' and core_data_last_collected is NULL;""" + f"""UPDATE collection_status SET facade_status = '{CollectionState.PENDING.value}'""" + f""" WHERE facade_status = '{CollectionState.ERROR.value}' and facade_data_last_collected is NULL;""" + f"""UPDATE collection_status SET ml_status = '{CollectionState.PENDING.value}'""" + f""" WHERE ml_status = '{CollectionState.ERROR.value}' and ml_data_last_collected is NULL;""" + + f"""UPDATE collection_status SET secondary_status = '{CollectionState.SUCCESS.value}'""" + f""" WHERE secondary_status = '{CollectionState.ERROR.value}' and secondary_data_last_collected is not NULL;""" + f"""UPDATE collection_status SET core_status = '{CollectionState.SUCCESS.value}'""" + f""" WHERE core_status = '{CollectionState.ERROR.value}' and core_data_last_collected is not NULL;;""" + f"""UPDATE collection_status SET facade_status = '{CollectionState.SUCCESS.value}'""" + f""" WHERE facade_status = '{CollectionState.ERROR.value}' and facade_data_last_collected is not NULL;;""" + f"""UPDATE collection_status SET ml_status = '{CollectionState.SUCCESS.value}'""" + f""" WHERE ml_status = '{CollectionState.ERROR.value}' and ml_data_last_collected is not NULL;;""" ) session.execute_sql(query) @@ -354,8 +349,8 @@ def retry_errored_repos(): #Retry this task for every issue so that repos that were added manually get the chance to be added to the collection_status table. -@celery.task(autoretry_for=(Exception,), retry_backoff=True, retry_backoff_max=300, retry_jitter=True, max_retries=None) -def create_collection_status_records(): +@celery.task(autoretry_for=(Exception,), retry_backoff=True, retry_backoff_max=300, retry_jitter=True, max_retries=None, bind=True) +def create_collection_status_records(self): """ Automatic task that runs and checks for repos that haven't been given a collection_status record corresponding to the state of their collection at the monent. @@ -363,7 +358,7 @@ def create_collection_status_records(): A special celery task that automatically retries itself and has no max retries. """ - from augur.tasks.init.celery_app import engine + engine = self.app.engine logger = logging.getLogger(create_collection_status_records.__name__) with DatabaseSession(logger,engine) as session: diff --git a/augur/tasks/test.py b/augur/tasks/test.py index 3db8c58c52..efdacb77f4 100644 --- a/augur/tasks/test.py +++ b/augur/tasks/test.py @@ -1,10 +1,5 @@ -from celery import signature -from celery import group, chain, chord, signature - - from augur.tasks.init.celery_app import celery_app as celery - @celery.task() def successful_task(): pass diff --git a/augur/tasks/util/AugurUUID.py b/augur/tasks/util/AugurUUID.py index 5dfabc8ac4..ae8f05f124 100644 --- a/augur/tasks/util/AugurUUID.py +++ b/augur/tasks/util/AugurUUID.py @@ -129,6 +129,19 @@ class GithubUUID(AugurUUID): def __init__(self): super().__init__(platform = 1) +class GitlabUUID(AugurUUID): + struct = { + "platform": {"start": 0, "size": 1}, + "user": {"start": 1, "size": 4}, + "repo": {"start": 5, "size": 3}, + "issue": {"start": 8, "size": 4}, + "event": {"start": 12, "size": 4}, + "metadata": {"start": 12, "size": 4} + } + + def __init__(self): + super().__init__(platform = 2) + class UnresolvableUUID(GithubUUID): def __init__(self): super(GithubUUID, self).__init__(platform = 0) diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index 89ae5f3d53..9776258626 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -1,28 +1,19 @@ from __future__ import annotations -from typing import List -import time import logging import random -import os -from enum import Enum -import math -import numpy as np import datetime #from celery.result import AsyncResult -from celery import signature -from celery import group, chain, chord, signature +from celery import chain import sqlalchemy as s -from sqlalchemy import or_, and_, update +from sqlalchemy import or_, update from augur.application.logs import AugurLogger from augur.tasks.init.celery_app import celery_app as celery from augur.application.db.models import CollectionStatus, Repo from augur.application.db.util import execute_session_query -from augur.application.config import AugurConfig -from augur.tasks.github.util.util import get_owner_repo, get_repo_weight_core, get_repo_weight_by_issue -from augur.tasks.github.util.gh_graphql_entities import GitHubRepo as GitHubRepoGraphql -from augur.tasks.github.util.gh_graphql_entities import GraphQlPageCollection -from augur.tasks.github.util.github_task_session import GithubTaskManifest +from augur.application.db.lib import get_section +from augur.tasks.github.util.util import get_repo_weight_core, get_repo_weight_by_issue from augur.application.db.session import DatabaseSession +from augur.application.db import get_engine from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps from augur.tasks.util.collection_state import CollectionState @@ -141,64 +132,81 @@ def __init__(self,name,phases,max_repo = 10,days_until_collect_again = 1, gitlab def get_active_repo_count(self,session): return len(session.query(CollectionStatus).filter(getattr(CollectionStatus,f"{self.name}_status" ) == CollectionState.COLLECTING.value).all()) - #Get repo urls based on passed in info. + def get_valid_repos(self,session): - #getattr(CollectionStatus,f"{hook}_status" ) represents the status of the given hook - #Get the count of repos that are currently running this collection hook - #status_column = f"{hook}_status" + active_repo_count = self.get_active_repo_count(session) + limit = self.max_repo-active_repo_count - #Will always disallow errored repos and repos that are already collecting + if limit <= 0: + return - #The maximum amount of repos to schedule is affected by the existing repos running tasks - limit = self.max_repo-active_repo_count + collection_list = get_newly_added_repos(session, limit, hook=self.name) + self.repo_list.extend(collection_list) + limit -= len(collection_list) - #Extract the user id from the randomized list and split into four chunks - split_user_list = split_random_users_list(session,f"{self.name}_status",self.new_status) + #Now start recollecting other repos if there is space to do so. + if limit <= 0: + return - session.logger.info(f"User_list: {split_user_list}") + collection_list = get_repos_for_recollection(session, limit, hook=self.name, days_until_collect_again=self.days_until_collect_again) - #Iterate through each fourth of the users fetched - for quarter_list in split_user_list: - if limit <= 0: - return + self.repo_list.extend(collection_list) - collection_list = get_valid_repos_for_users(session,limit,tuple(quarter_list),hook=self.name, days_to_wait_until_next_collection=self.days_until_collect_again) - self.repo_list.extend(collection_list) - #Update limit with amount of repos started - limit -= len(collection_list) +def get_newly_added_repos(session, limit, hook): - #Now start old repos if there is space to do so. - if limit <= 0: - return + condition_string = "" + if hook in ["core", "secondary", "ml"]: + condition_string += f"""{hook}_status='{str(CollectionState.PENDING.value)}'""" + + elif hook == "facade": + condition_string += f"""facade_status='{str(CollectionState.UPDATE.value)}'""" + + if hook == "secondary": + condition_string += f""" and core_status='{str(CollectionState.SUCCESS.value)}'""" + repo_query = s.sql.text(f""" + select repo_git + from augur_operations.collection_status x, augur_data.repo y + where x.repo_id=y.repo_id + and {condition_string} + order by repo_added + limit :limit_num + """).bindparams(limit_num=limit) + + valid_repos = session.execute_sql(repo_query).fetchall() + valid_repo_git_list = [repo[0] for repo in valid_repos] - user_list = get_list_of_all_users(session) - random.shuffle(user_list) + return valid_repo_git_list - #Extract the user id from the randomized list and split into four chunks - split_user_list = split_list_into_chunks([row[0] for row in user_list], 4) +def get_repos_for_recollection(session, limit, hook, days_until_collect_again): - for quarter_list in split_user_list: + if hook in ["core", "secondary", "ml"]: + condition_string = f"""{hook}_status='{str(CollectionState.SUCCESS.value)}'""" - #Break out if limit has been reached - if limit <= 0: - return + elif hook == "facade": + condition_string = f"""facade_status='{str(CollectionState.SUCCESS.value)}'""" - #only start repos older than the specified amount of days - #Query a set of valid repositories sorted by weight, also making sure that the repos aren't new or errored - #Order by the relevant weight for the collection hook - collection_list = get_valid_repos_for_users(session,limit,tuple(quarter_list),allow_old_repos=True,hook=self.name, days_to_wait_until_next_collection=self.days_until_collect_again) + repo_query = s.sql.text(f""" + select repo_git + from augur_operations.collection_status x, repo y + where x.repo_id = y.repo_id + and {condition_string} + and {hook}_data_last_collected <= NOW() - INTERVAL '{days_until_collect_again} DAYS' + order by {hook}_data_last_collected + limit :limit_num + """).bindparams(limit_num=limit) + + valid_repos = session.execute_sql(repo_query).fetchall() + valid_repo_git_list = [repo[0] for repo in valid_repos] - self.repo_list.extend(collection_list) - limit -= len(collection_list) + return valid_repo_git_list -def get_enabled_phase_names_from_config(logger, session): +def get_enabled_phase_names_from_config(): - config = AugurConfig(logger, session) - phase_options = config.get_section("Task_Routine") + phase_options = get_section("Task_Routine") #Get list of enabled phases enabled_phase_names = [name for name, phase in phase_options.items() if phase == 1] @@ -224,10 +232,10 @@ def split_list_into_chunks(given_list, num_chunks): return [given_list[i:i + n] for i in range(0, len(given_list),n)] -@celery.task -def task_failed_util(request,exc,traceback): +@celery.task(bind=True) +def task_failed_util(self, request,exc,traceback): - from augur.tasks.init.celery_app import engine + engine = self.app.engine logger = logging.getLogger(task_failed_util.__name__) @@ -281,9 +289,10 @@ def task_failed_util(request,exc,traceback): #This task updates the core and secondary weight with the issues and prs already passed in -@celery.task -def issue_pr_task_update_weight_util(issue_and_pr_nums,repo_git=None,session=None): - from augur.tasks.init.celery_app import engine +@celery.task(bind=True) +def issue_pr_task_update_weight_util(self, issue_and_pr_nums,repo_git=None,session=None): + + engine = self.app.engine logger = logging.getLogger(issue_pr_task_update_weight_util.__name__) if repo_git is None: @@ -296,10 +305,10 @@ def issue_pr_task_update_weight_util(issue_and_pr_nums,repo_git=None,session=Non update_issue_pr_weights(logger,session,repo_git,sum(issue_and_pr_nums)) -@celery.task -def core_task_success_util(repo_git): +@celery.task(bind=True) +def core_task_success_util(self, repo_git): - from augur.tasks.init.celery_app import engine + engine = self.app.engine logger = logging.getLogger(core_task_success_util.__name__) @@ -363,10 +372,10 @@ def update_issue_pr_weights(logger,session,repo_git,raw_sum): -@celery.task -def secondary_task_success_util(repo_git): +@celery.task(bind=True) +def secondary_task_success_util(self, repo_git): - from augur.tasks.init.celery_app import engine + engine = self.app.engine logger = logging.getLogger(secondary_task_success_util.__name__) @@ -395,7 +404,8 @@ def secondary_task_success_util(repo_git): #Get the weight for each repo for the secondary collection hook. def get_repo_weight_secondary(logger,repo_git): - from augur.tasks.init.celery_app import engine + + engine = get_engine() with DatabaseSession(logger,engine) as session: repo = Repo.get_by_repo_git(session, repo_git) @@ -415,10 +425,10 @@ def get_repo_weight_secondary(logger,repo_git): return get_repo_weight_by_issue(logger, repo_git, days) -@celery.task -def facade_task_success_util(repo_git): +@celery.task(bind=True) +def facade_task_success_util(self, repo_git): - from augur.tasks.init.celery_app import engine + engine = self.app.engine logger = logging.getLogger(facade_task_success_util.__name__) @@ -438,9 +448,10 @@ def facade_task_success_util(repo_git): session.commit() -@celery.task -def ml_task_success_util(repo_git): - from augur.tasks.init.celery_app import engine +@celery.task(bind=True) +def ml_task_success_util(self, repo_git): + + engine = self.app.engine logger = logging.getLogger(facade_task_success_util.__name__) @@ -462,10 +473,10 @@ def ml_task_success_util(repo_git): -@celery.task -def facade_clone_success_util(repo_git): +@celery.task(bind=True) +def facade_clone_success_util(self, repo_git): - from augur.tasks.init.celery_app import engine + engine = self.app.engine logger = logging.getLogger(facade_clone_success_util.__name__) @@ -617,80 +628,3 @@ def send_messages(self): #yield the value of the task_id to the calling method so that the proper collectionStatus field can be updated yield repo_git, task_id, col_hook.name - -#def start_block_of_repos(logger,session,repo_git_identifiers,phases,repos_type,hook="core"): -# -# logger.info(f"Starting collection on {len(repo_git_identifiers)} {repos_type} {hook} repos") -# if len(repo_git_identifiers) == 0: -# return 0 -# -# logger.info(f"Collection starting for {hook}: {tuple(repo_git_identifiers)}") -# -# routine = AugurTaskRoutine(session,repos=repo_git_identifiers,collection_phases=phases,collection_hook=hook) -# -# routine.start_data_collection() -# -# return len(repo_git_identifiers) - -def get_valid_repos_for_users(session,limit,users,allow_old_repos = False,hook="core",days_to_wait_until_next_collection = 1): - - condition_string = "1" - - if hook == "core": - condition_string = get_required_conditions_for_core_repos(allow_collected_before=allow_old_repos,days_until_collect_again= days_to_wait_until_next_collection) - elif hook == "secondary": - condition_string = get_required_conditions_for_secondary_repos(allow_collected_before=allow_old_repos,days_until_collect_again = days_to_wait_until_next_collection) - elif hook == "facade": - condition_string = get_required_conditions_for_facade_repos(allow_collected_before=allow_old_repos,days_until_collect_again = days_to_wait_until_next_collection) - elif hook == "ml": - condition_string = get_required_conditions_for_ml_repos(allow_collected_before=allow_old_repos,days_until_collect_again = days_to_wait_until_next_collection) - - #Query a set of valid repositories sorted by weight, also making sure that the repos are new - #Order by the relevant weight for the collection hook - repo_query = s.sql.text(f""" - SELECT DISTINCT repo.repo_id, repo.repo_git, collection_status.{hook}_weight - FROM augur_operations.user_groups - JOIN augur_operations.user_repos ON augur_operations.user_groups.group_id = augur_operations.user_repos.group_id - JOIN augur_data.repo ON augur_operations.user_repos.repo_id = augur_data.repo.repo_id - JOIN augur_operations.collection_status ON augur_operations.user_repos.repo_id = augur_operations.collection_status.repo_id - WHERE user_id IN :list_of_user_ids AND {condition_string} - ORDER BY augur_operations.collection_status.{hook}_weight - LIMIT :limit_num - """).bindparams(list_of_user_ids=users,limit_num=limit) - - #Get a list of valid repo ids, limit set to 2 times the usual - valid_repos = session.execute_sql(repo_query).fetchall() - valid_repo_git_list = [repo[1] for repo in valid_repos] - - session.logger.info(f"valid repo git list: {tuple(valid_repo_git_list)}") - - #start repos for new primary collection hook - #collection_size = start_block_of_repos( - # session.logger, session, - # valid_repo_git_list, - # phases, repos_type=repos_type, hook=hook - #) - - return valid_repo_git_list - -def split_random_users_list(session,status_col, status_new): - #Split all users that have new repos into four lists and randomize order - query = s.sql.text(f""" - SELECT - user_id - FROM augur_operations.user_groups - JOIN augur_operations.user_repos ON augur_operations.user_groups.group_id = augur_operations.user_repos.group_id - JOIN augur_data.repo ON augur_operations.user_repos.repo_id = augur_data.repo.repo_id - JOIN augur_operations.collection_status ON augur_operations.user_repos.repo_id = augur_operations.collection_status.repo_id - WHERE {status_col}='{str(status_new)}' - GROUP BY user_id - """) - - user_list = session.execute_sql(query).fetchall() - random.shuffle(user_list) - - #Extract the user id from the randomized list and split into four chunks - split_user_list = split_list_into_chunks([row[0] for row in user_list], 4) - - return split_user_list - diff --git a/augur/tasks/util/redis_list.py b/augur/tasks/util/redis_list.py index 0137273c1e..8da8f12746 100644 --- a/augur/tasks/util/redis_list.py +++ b/augur/tasks/util/redis_list.py @@ -6,7 +6,6 @@ from collections.abc import MutableSequence from augur.tasks.init.redis_connection import redis_connection as redis from augur import instance_id -from redis import exceptions class RedisList(MutableSequence): diff --git a/augur/tasks/util/worker_util.py b/augur/tasks/util/worker_util.py index 84c177724b..6198f1ccdb 100644 --- a/augur/tasks/util/worker_util.py +++ b/augur/tasks/util/worker_util.py @@ -1,14 +1,13 @@ #SPDX-License-Identifier: MIT -import os, json, requests, logging -from flask import Flask, Response, jsonify, request +import json #import gunicorn.app.base import numpy as np from celery import group from celery.result import AsyncResult from celery.result import allow_join_result -from typing import Optional, List, Any, Tuple -from datetime import datetime, timedelta +from typing import List +from datetime import datetime import json import subprocess diff --git a/augur/templates/settings.j2 b/augur/templates/settings.j2 index c75b6522ad..c10a0c914c 100644 --- a/augur/templates/settings.j2 +++ b/augur/templates/settings.j2 @@ -56,7 +56,7 @@
  • - Repo Tracker + My Repos
  • @@ -170,7 +170,7 @@