#!/usr/bin/env python3

"""
Command line tool for testing SPARQL queries against an endpoint.
"""

import argparse
import math
import os
import subprocess
import sys
import time
import urllib.request
from dataclasses import dataclass
from http import HTTPStatus
from pathlib import Path
from typing import Optional
from urllib.error import HTTPError

import SPARQLWrapper as SPARQL
from SPARQLWrapper import SPARQLExceptions
from tqdm.auto import tqdm

# Process exit statuses. EXIT_CLI_ERROR mirrors argparse's own exit code
# for bad command-line usage.
EXIT_SUCCESS = 0
EXIT_FAILURE = 1
EXIT_CLI_ERROR = 2

# Directory name that marks the root of the project tree on disk.
PROJECT_ROOT = "Scribe-Data"


@dataclass(repr=False, frozen=True)
class QueryFile:
    """
    Holds a reference to a file containing a SPARQL query.
    """

    path: Path

    def load(self, limit: int) -> str:
        """Load the SPARQL query from 'path' into a string.

        Args:
            limit (int): the maximum number of results a query should return.

        Returns:
            str: the SPARQL query.
        """
        with open(self.path, encoding="utf-8") as in_stream:
            return f"{in_stream.read()}\nLIMIT {limit}\n"

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(path={self.path})"


class QueryExecutionException(Exception):
    """
    Raised when execution of a query fails.
    """

    def __init__(self, message: str, query: QueryFile) -> None:
        """
        Args:
            message (str): why the query failed.
            query (QueryFile): the query that failed.
        """
        self.message = message
        self.query = query
        super().__init__(self.message)

    def __str__(self) -> str:
        return f"{self.query.path} : {self.message}"


def ping(url: str, timeout: int) -> bool:
    """
    Test if a URL is reachable.

    Args:
        url (str): the URL to test.
        timeout (int): the maximum number of seconds to wait for a reply.

    Returns:
        bool: True if connectivity established. False otherwise.
    """
    try:
        with urllib.request.urlopen(url, timeout=timeout) as response:
            return response.getcode() == HTTPStatus.OK
    # HTTPError is a subclass of Exception, so a single broad handler
    # covers it; any failure to reach the endpoint is reported, not raised.
    except Exception as err:
        print(f"{type(err).__name__}: {str(err)}", file=sys.stderr)

    return False


def all_queries() -> list[QueryFile]:
    """
    All the SPARQL queries in, and below, 'Scribe-Data/'.

    Returns:
        list[QueryFile]: the SPARQL query files.
    """
    # Locate the project root by walking up this file's resolved path.
    parts = Path(__file__).resolve().parts
    prj_root_idx = parts.index(PROJECT_ROOT)
    prj_root = str(Path(*parts[: prj_root_idx + 1]))

    queries: list[QueryFile] = []

    for root, _, fnames in os.walk(prj_root):
        for fname in fnames:
            fpath = Path(root, fname)
            if fpath.suffix == ".sparql":
                queries.append(QueryFile(fpath))

    return queries


def changed_queries() -> Optional[list[QueryFile]]:
    """
    Find all the SPARQL queries that have changed.

    Includes new queries.

    Returns:
        Optional[list[QueryFile]]: list of changed/new SPARQL queries or None if error.
    """

    result = subprocess.run(
        (
            "git",
            "status",
            "--short",
        ),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
        check=False,
    )

    if result.returncode != EXIT_SUCCESS:
        print(f"ERROR: {result.stderr}", file=sys.stderr)
        return None

    # example 'result.stdout'
    #
    #  M a/b/c/changed.sparql
    # ?? ../new.sparql

    changed_files = [
        Path(norm_line.split(maxsplit=1)[1]).resolve()
        for line in result.stdout.split("\n")
        if (norm_line := line.strip())
    ]

    return [QueryFile(fpath) for fpath in changed_files if fpath.suffix == ".sparql"]


def sparql_context(url: str) -> SPARQL.SPARQLWrapper:
    """
    Configure a SPARQL context.

    A context allows the execution of SPARQL queries.

    Args:
        url (str): a valid URL of a SPARQL endpoint.

    Returns:
        SPARQLWrapper: the context.
    """
    context = SPARQL.SPARQLWrapper(url)
    context.setReturnFormat(SPARQL.JSON)
    context.setMethod(SPARQL.POST)

    return context


def execute(
    query: QueryFile, limit: int, context: SPARQL.SPARQLWrapper, tries: int = 3
) -> dict:
    """
    Execute a SPARQL query in a given context.

    Args:
        query (QueryFile): the SPARQL query to run.
        limit (int): the maximum number of results a query should return.
        context (SPARQLWrapper): the SPARQL context.
        tries (int): the maximum number of times the query should be executed
            after failure.

    Returns:
        dict: results of the query.
    """

    def delay_in_seconds() -> int:
        """How long to wait, in seconds, between executing repeat queries."""
        # Back off less the fewer retries remain.
        return int(math.ceil(10.0 / math.sqrt(tries)))

    if tries <= 0:
        raise QueryExecutionException("Failed too many times.", query)

    try:
        context.setQuery(query.load(limit))
        return context.queryAndConvert()

    # Transient HTTP problems: wait, then retry recursively.
    except HTTPError:
        time.sleep(delay_in_seconds())
        return execute(query, limit, context, tries - 1)

    except SPARQLExceptions.SPARQLWrapperException as err:
        raise QueryExecutionException(err.msg, query) from err

    except Exception as err:
        raise QueryExecutionException(
            f"{type(err).__name__} - {str(err)}", query
        ) from err


def check_sparql_file(fpath: str) -> Path:
    """
    Check meta information of SPARQL query file.

    Args:
        fpath (str): the file to validate.

    Returns:
        Path: the validated file.
    """
    path = Path(fpath)

    if not path.is_file():
        raise argparse.ArgumentTypeError(f"Not a valid file path: {path}")

    if path.suffix != ".sparql":
        raise argparse.ArgumentTypeError(f"{path} does not have a '.sparql' extension")

    return path


def check_positive_int(value: str, err_msg: str) -> int:
    """
    Ensure 'value' is a positive number.

    Args:
        value (str): the value to be validated.
        err_msg (str): used when value fails validation.

    Raises:
        argparse.ArgumentTypeError

    Returns:
        int: the validated number.
    """
    try:
        number = int(value)
        if number >= 1:
            return number
    except ValueError:
        pass

    raise argparse.ArgumentTypeError(err_msg)


def check_limit(limit: str) -> int:
    """
    Validate the 'limit' argument.

    Args:
        limit (str): the LIMIT to be validated.

    Raises:
        argparse.ArgumentTypeError

    Returns:
        int: the validated LIMIT
    """
    return check_positive_int(limit, "LIMIT must be an integer of value 1 or greater.")


def check_timeout(timeout: str) -> int:
    """
    Validate the 'timeout' argument.

    Args:
        timeout (str): the timeout to be validated.

    Raises:
        argparse.ArgumentTypeError

    Returns:
        int: the validated timeout.
    """
    return check_positive_int(
        timeout, "timeout must be an integer of value 1 or greater."
    )


def main(argv=None) -> int:
    """
    The main function.

    Args:
        argv : If set to None then argparse will use sys.argv as the arguments.

    Returns:
        int: the exit status - 0: success, any other value - failure.
    """
    cli = argparse.ArgumentParser(
        description=f"run SPARQL queries from the '{PROJECT_ROOT}' project",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Exactly one action must be chosen.
    group = cli.add_mutually_exclusive_group(required=True)

    group.add_argument(
        "-c",
        "--changed",
        action="store_true",
        help="run only changed/new SPARQL queries",
    )

    group.add_argument(
        "-a", "--all", action="store_true", help="run all SPARQL queries"
    )

    group.add_argument(
        "-f",
        "--file",
        help="path to a file containing a valid SPARQL query",
        type=check_sparql_file,
    )

    group.add_argument(
        "-p",
        "--ping",
        action="store_true",
        default=False,
        help="check responsiveness of endpoint",
    )

    cli.add_argument(
        "--timeout",
        type=check_timeout,
        default=10,
        help="maximum number of seconds to wait for a response from the endpoint when 'pinging'",
    )

    cli.add_argument(
        "-e",
        "--endpoint",
        type=str,
        default="https://query.wikidata.org/sparql",
        help="URL of the SPARQL endpoint",
    )

    cli.add_argument(
        "-l",
        "--limit",
        type=check_limit,
        default=5,
        help="the maximum number of results a query should return",
    )

    cli.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        default=False,
        help="increase output verbosity",
    )

    args = cli.parse_args(argv)

    endpoint = args.endpoint

    if args.ping:
        if ping(endpoint, args.timeout):
            print(f"Success: pinged '{endpoint}'")
            return EXIT_SUCCESS

        print(
            f"FAILURE: unable to contact '{endpoint}'. Network problems? "
            "Malformed URL? Increase timeout?",
            file=sys.stderr,
        )
        return EXIT_FAILURE

    queries = None

    if args.all:
        queries = all_queries()
    elif args.changed:
        queries = changed_queries()
    elif args.file:
        queries = [QueryFile(args.file)]
    else:
        assert False, "Unknown option"

    if queries is None:
        return EXIT_FAILURE

    context = sparql_context(endpoint)

    failures = []
    successes = []

    for query in tqdm(queries, position=0):
        try:
            results = execute(query, args.limit, context)
            successes.append((query, results))
        except QueryExecutionException as err:
            failures.append(err)

    success_report(successes, display=args.verbose)
    error_report(failures)

    print("\nSummary")
    print(
        f"\tQueries run: {len(queries)}, passed: {len(successes)}, failed: {len(failures)}\n"
    )

    return EXIT_FAILURE if failures else EXIT_SUCCESS


def error_report(failures: list[QueryExecutionException]) -> None:
    """
    Report failed queries.

    Args:
        failures (list[QueryExecutionException]): failed queries.
    """
    if not failures:
        return

    qword = "query" if len(failures) == 1 else "queries"
    print(f"\nFollowing {qword} failed:\n", file=sys.stderr)
    for failed_query in failures:
        print(str(failed_query), file=sys.stderr)


def success_report(successes: list[tuple[QueryFile, dict]], display: bool) -> None:
    """
    Report successful queries.

    Args:
        successes (list[tuple[QueryFile, dict]]): successful queries.
        display (bool): should there be output?
    """
    if not (display and successes):
        return

    qword = "query" if len(successes) == 1 else "queries"
    print(f"\nFollowing {qword} ran successfully:\n")
    for query, results in successes:
        print(f"{query.path} returned: {results}")


if __name__ == "__main__":
    sys.exit(main())


# --- tests/load/test_checkquery.py --------------------------------------

import argparse
from http import HTTPStatus
from pathlib import Path
from unittest.mock import MagicMock, mock_open, patch
from urllib.error import HTTPError

import pytest

from scribe_data.checkquery import (
    QueryExecutionException,
    QueryFile,
    all_queries,
    changed_queries,
    check_limit,
    check_sparql_file,
    check_timeout,
    error_report,
    execute,
    main,
    ping,
    success_report,
)

# Fixed fake query path shared by most tests.
S_PATH = "/root/project/src/dir/query.sparql"
A_PATH = Path(S_PATH)


@pytest.fixture
def a_query():
    """A QueryFile pointing at the shared fake path."""
    return QueryFile(A_PATH)


# Query
def test_full_path(a_query):
    assert a_query.path == A_PATH


@patch("builtins.open", new_callable=mock_open, read_data="QUERY")
def test_query_load(_, a_query):
    assert a_query.load(12) == "QUERY\nLIMIT 12\n"


def test_query_equals(a_query):
    assert a_query == QueryFile(A_PATH)


def test_query_not_equals(a_query):
    assert a_query != QueryFile("/root/project/src/Dir/query.sparql")


def test_query_not_equals_object(a_query):
    assert a_query != object()


def test_query_str(a_query):
    assert str(a_query) == "QueryFile(path=/root/project/src/dir/query.sparql)"


def test_query_repr(a_query):
    assert repr(a_query) == "QueryFile(path=/root/project/src/dir/query.sparql)"


def test_query_execution_exception(a_query):
    exception = QueryExecutionException("failure", a_query)
    assert str(exception) == f"{S_PATH} : failure"


# ping
@patch("urllib.request.urlopen")
def test_ping_pass(mock_urlopen):
    mock_urlopen.return_value.__enter__.return_value.getcode.return_value = (
        HTTPStatus.OK
    )

    assert ping("http://www.python.org", 0)
@patch("urllib.request.urlopen")
def test_ping_httperror_fail(mock_urlopen):
    mock_urlopen.return_value.__enter__.side_effect = HTTPError

    assert not ping("http://www.python.org", 0)


@patch("urllib.request.urlopen")
def test_ping_exception_fail(mock_urlopen):
    mock_urlopen.return_value.__enter__.side_effect = Exception

    assert not ping("http://www.python.org", 0)


@patch("urllib.request.urlopen")
def test_ping_fail(mock_urlopen):
    mock_urlopen.return_value.__enter__.return_value.getcode.return_value = (
        HTTPStatus.BAD_REQUEST
    )
    assert not ping("http://www.python.org", 0)


# check_sparql_file
@patch.object(Path, "is_file", return_value=True)
def test_check_sparql_file_exists(_):
    assert check_sparql_file(S_PATH) == A_PATH


@patch.object(Path, "is_file", return_value=False)
def test_check_sparql_file_not_exists(_):
    with pytest.raises(argparse.ArgumentTypeError) as err:
        _ = check_sparql_file(S_PATH)

    assert str(err.value) == f"Not a valid file path: {S_PATH}"


@patch.object(Path, "is_file", return_value=True)
def test_check_sparql_file_not_sparql_extension(_):
    fpath = Path("/root/query.txt")
    with pytest.raises(argparse.ArgumentTypeError) as err:
        _ = check_sparql_file(fpath)

    assert str(err.value) == f"{fpath} does not have a '.sparql' extension"


# changed_queries
@pytest.mark.parametrize(
    "git_status, expected",
    [
        ("", []),
        ("\n", []),
        ("?? README.md \n M .gitignore", []),
        (" M /a/b/a.sparql", [QueryFile(Path("/a/b/a.sparql"))]),
        (
            " M /a/with space/a.sparql \n ",
            [QueryFile(Path("/a/with space/a.sparql"))],
        ),
        (" M /a/a.sparql\nM a.txt", [QueryFile(Path("/a/a.sparql"))]),
        (
            " M /a/a.sparql\nM /a/a.txt\n ?? /a/b.sparql",
            [QueryFile(Path("/a/a.sparql")), QueryFile(Path("/a/b.sparql"))],
        ),
    ],
)
@patch("subprocess.run")
def test_changed_queries(mock_run, git_status, expected):
    mock_result = MagicMock()
    mock_result.configure_mock(**{"returncode": 0, "stdout": git_status})

    mock_run.return_value = mock_result
    assert changed_queries() == expected


@patch("subprocess.run")
def test_changed_queries_failure(mock_run, capsys):
    mock_result = MagicMock()
    mock_result.configure_mock(**{"returncode": 1, "stderr": "no git"})

    mock_run.return_value = mock_result
    assert changed_queries() is None

    err_out = capsys.readouterr().err
    assert "ERROR: no git" == err_out.strip()


# all_queries
@pytest.mark.parametrize(
    "tree, expected",
    [
        (
            [
                ("/root", ("src",), ("README.txt",)),
                ("/root/src", (), ("spam.sh", "eggs.py")),
            ],
            [],
        ),
        (
            [
                ("/root", ("src",), ("a.sparql",)),
                ("/root/src", (), ("sparql.pdf", "b.sparql")),
            ],
            [
                QueryFile(Path("/root/a.sparql")),
                QueryFile(Path("/root/src/b.sparql")),
            ],
        ),
    ],
)
def test_all_queries(tree, expected):
    with patch("os.walk") as mock_walk:
        mock_walk.return_value = tree
        assert all_queries() == expected


# execute


def test_execute(a_query):
    with pytest.raises(QueryExecutionException) as err:
        _ = execute(a_query, 1, None, 0)

    # Compare against the raised exception itself, not the ExceptionInfo
    # wrapper (str(err) is not the exception message).
    assert str(err.value) == f"{a_query.path} : Failed too many times."


# check_limit
@pytest.mark.parametrize(
    "candidate, limit",
    [
        ("1", 1),
        ("34", 34),
        ("1000", 1000),
    ],
)
def test_check_limit_pos(candidate, limit):
    assert check_limit(candidate) == limit


@pytest.mark.parametrize(
    "candidate",
    [
        "0",
        "-1",
        "a",
        "word",
    ],
)
def test_check_limit_neg(candidate):
    with pytest.raises(argparse.ArgumentTypeError) as err:
        _ = check_limit(candidate)

    assert str(err.value) == "LIMIT must be an integer of value 1 or greater."


# check_timeout
@pytest.mark.parametrize(
    "candidate, timeout",
    [
        ("1", 1),
        ("9", 9),
        ("8888", 8888),
    ],
)
def test_check_timeout_pos(candidate, timeout):
    assert check_timeout(candidate) == timeout


@pytest.mark.parametrize(
    "candidate",
    [
        "0",
        "-1",
        "O",
        "ten",
    ],
)
def test_check_timeout_neg(candidate):
    with pytest.raises(argparse.ArgumentTypeError) as err:
        _ = check_timeout(candidate)

    assert str(err.value) == "timeout must be an integer of value 1 or greater."


# main


@pytest.mark.parametrize("arg", ["-h", "--help"])
def test_main_help(arg):
    # argv must be a list: a bare string would be split into characters
    # by argparse.
    with pytest.raises(SystemExit) as err:
        _ = main([arg])
    assert err.value.code == 0


@pytest.mark.parametrize(
    "args",
    [
        ["-a", "-c"],
        ["-p", "-f"],
        ["-p", "-c"],
        ["-p", "-a"],
        ["-c", "-p", "-a"],
        ["-c", "-f", "-a"],
    ],
)
def test_main_mutex_opts(args):
    """
    Some options cannot be used together.
    """
    # 'args' is already an argv list; do not wrap it in another list.
    with pytest.raises(SystemExit) as err:
        _ = main(args)
    assert err.value.code == 2


def test_error_report_single(a_query, capsys):
    failures = [QueryExecutionException("timeout", a_query)]
    error_report(failures)
    err_out = capsys.readouterr().err

    assert (
        err_out == "\nFollowing query failed:\n\n"
        "/root/project/src/dir/query.sparql : timeout\n"
    )


def test_error_report_multiple(a_query, capsys):
    failures = [
        QueryExecutionException("timeout", a_query),
        QueryExecutionException("bad format", a_query),
    ]
    error_report(failures)
    err_out = capsys.readouterr().err

    assert (
        err_out == "\nFollowing queries failed:\n\n"
        "/root/project/src/dir/query.sparql : timeout\n"
        "/root/project/src/dir/query.sparql : bad format\n"
    )


def test_error_report_no_errors(capsys):
    error_report([])
    assert capsys.readouterr().err == ""


def test_success_report_single_display_set(a_query, capsys):
    successes = [(a_query, {"a": 23})]
    success_report(successes, display=True)

    out = capsys.readouterr().out

    assert (
        out == "\nFollowing query ran successfully:\n\n"
        "/root/project/src/dir/query.sparql returned: {'a': 23}\n"
    )


def test_success_report_no_success_display_set(capsys):
    success_report([], display=True)

    assert capsys.readouterr().out == ""


# Use a real QueryFile here: at module scope 'a_query' is the fixture
# function, not a QueryFile instance.
@pytest.mark.parametrize(
    "successes",
    [
        [],
        [(QueryFile(A_PATH), {"a": 23})],
        [(QueryFile(A_PATH), {"a": 23}), (QueryFile(A_PATH), {"b": 53})],
    ],
)
def test_success_report_display_not_set(successes, capsys):
    success_report(successes, display=False)

    out = capsys.readouterr().out

    assert out == ""


def test_success_report_multiple_display_set(a_query, capsys):
    successes = [(a_query, {"a": 23}), (a_query, {"b": 57})]
    success_report(successes, display=True)
    out = capsys.readouterr().out

    assert (
        out == "\nFollowing queries ran successfully:\n\n"
        "/root/project/src/dir/query.sparql returned: {'a': 23}\n"
        "/root/project/src/dir/query.sparql returned: {'b': 57}\n"
    )