From 7ece335e98c5830453b4e36260ab777a3b989924 Mon Sep 17 00:00:00 2001 From: sam-baird Date: Thu, 7 Dec 2023 23:50:24 -0700 Subject: [PATCH 01/43] Fix reference to table files --- theiavalidate/Validator.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index c0dda71..ff92109 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -5,6 +5,7 @@ import os import pandas as pd import pdfkit as pdf +import subprocess import sys class Validator: @@ -317,6 +318,18 @@ def compare(self): self.logger.info("Counting how many cells have values") self.count_populated_cells() + + # test localizing files to compare using gcloud storage + # create directories for holding files to compare + dir1 = f"{self.table1_name}/" + dir2 = f"{self.table2_name}/" + os.mkdir(dir1) + os.mkdir(dir2) + + # localize files to compare + # TODO map gs:// URI to local path + self.table1.apply(localize_files, dir=dir1, axis=1) + self.table2.apply(localize_files, dir=dir2, axis=1) self.logger.info("Performing an exact string match") self.perform_exact_match() @@ -329,4 +342,9 @@ def compare(self): self.make_pdf_report() self.logger.info("Done!") - \ No newline at end of file + +def localize_files(row, dir): + for value in row: + if isinstance(value, str) and value.startswith("gs://"): + # copy file to local directory + subprocess.run(["gcloud", "storage", "cp", value, dir]) \ No newline at end of file From f9942d4d469ba9a555dac6bcf8c312e74ed8a007 Mon Sep 17 00:00:00 2001 From: sam-baird Date: Fri, 8 Dec 2023 19:17:08 -0700 Subject: [PATCH 02/43] Remove test localize files code --- theiavalidate/Validator.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index ff92109..c9f8386 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -319,18 +319,6 @@ def compare(self): self.logger.info("Counting how many cells have values") self.count_populated_cells() - # test localizing files to compare using gcloud storage - # create directories for holding files to compare - dir1 = f"{self.table1_name}/" - dir2 = f"{self.table2_name}/" - os.mkdir(dir1) - os.mkdir(dir2) - - # localize files to compare - # TODO map gs:// URI to local path - self.table1.apply(localize_files, dir=dir1, axis=1) - self.table2.apply(localize_files, dir=dir2, axis=1) - self.logger.info("Performing an exact string match") self.perform_exact_match() @@ -342,9 +330,3 @@ def compare(self): self.make_pdf_report() self.logger.info("Done!") - -def localize_files(row, dir): - for value in row: - if isinstance(value, str) and value.startswith("gs://"): - # copy file to local directory - subprocess.run(["gcloud", "storage", "cp", value, dir]) \ No newline at end of file From 6d3cbbd584bb3815362d5b7a5d863fa5fae7527d Mon Sep 17 00:00:00 2001 From: sam-baird Date: Sat, 9 Dec 2023 05:49:56 +0000 Subject: [PATCH 03/43] Retain path while localizing files --- .gitignore | 5 ++++- theiavalidate/Validator.py | 24 ++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index cb5fc37..7de3fa6 100644 --- a/.gitignore +++ b/.gitignore @@ -160,4 +160,7 @@ cython_debug/ #.idea/ # IDE -.vscode/ \ No newline at end of file +.vscode/ + +# testing files +sandbox/ \ No newline at end of file diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index c9f8386..9eb1266 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -319,6 +319,19 @@ def compare(self): self.logger.info("Counting how many cells have values") self.count_populated_cells() + # test localizing files to compare using gcloud storage + # create directories for holding files to compare + dir1 = f"{self.table1_name}/" + dir2 = f"{self.table2_name}/" + os.mkdir(dir1) + os.mkdir(dir2) + + # localize files to compare + self.table1.apply(localize_files, dir=dir1, axis=1) + self.table2.apply(localize_files, dir=dir2, axis=1) + + subprocess.run(["ls", "-R", "compare_files"]) + self.logger.info("Performing an exact string match") self.perform_exact_match() @@ -330,3 +343,14 @@ def compare(self): self.make_pdf_report() self.logger.info("Done!") + +def localize_files(row, dir): + for value in row: + if isinstance(value, str) and value.startswith("gs://"): + # copy files to to compare_files/ directory + # it would be much faster to copy them all at once, but any files with + # the same name would be clobbered, so create local directories matching + # gsutil path and loop to copy + destination_path = os.path.dirname(value[5:]) + os.mkdirs(os.path.join("./compare_files/", destination_path) + subprocess.run(["gsutil", "-m", "cp", value, destination_path]) \ No newline at end of file From 8975b9cded3e2ccc4dbfb0625b1dbe033f02bfc8 Mon Sep 17 00:00:00 2001 From: sam-baird Date: Fri, 8 Dec 2023 23:21:07 -0700 Subject: [PATCH 04/43] Change directory names --- theiavalidate/Validator.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index 9eb1266..47b4a8b 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -321,8 +321,8 @@ def compare(self): # test localizing files to compare using gcloud storage # create directories for holding files to compare - dir1 = f"{self.table1_name}/" - dir2 = f"{self.table2_name}/" + dir1 = f"{self.table1_name}_files/" + dir2 = f"{self.table2_name}_files/" os.mkdir(dir1) os.mkdir(dir2) @@ -344,7 +344,7 @@ def compare(self): self.logger.info("Done!") -def localize_files(row, dir): +def localize_files(row, directory): for value in row: if isinstance(value, str) and value.startswith("gs://"): # copy files to to compare_files/ directory @@ -352,5 +352,5 @@ def localize_files(row, dir): # the same name would be clobbered, so create local directories matching # gsutil path and loop to copy destination_path = os.path.dirname(value[5:]) - os.mkdirs(os.path.join("./compare_files/", destination_path) + os.makedirs(os.path.join(directory, destination_path)) subprocess.run(["gsutil", "-m", "cp", value, destination_path]) \ No newline at end of file From 5692fb500a7ebd9f08ed4605742c74e3883af7c6 Mon Sep 17 00:00:00 2001 From: sam-baird Date: Sat, 9 Dec 2023 00:35:02 -0700 Subject: [PATCH 05/43] Fix bugs with localizing files --- theiavalidate/Validator.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index 47b4a8b..60c7c20 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -319,18 +319,17 @@ def compare(self): self.logger.info("Counting how many cells have values") self.count_populated_cells() - # test localizing files to compare using gcloud storage - # create directories for holding files to compare - dir1 = f"{self.table1_name}_files/" - dir2 = f"{self.table2_name}_files/" + dir1 = f"table1_files/" + dir2 = f"table2_files/" os.mkdir(dir1) os.mkdir(dir2) # localize files to compare - self.table1.apply(localize_files, dir=dir1, axis=1) - self.table2.apply(localize_files, dir=dir2, axis=1) + self.table1.apply(localize_files, directory=dir1, axis=1) + self.table2.apply(localize_files, directory=dir2, axis=1) - subprocess.run(["ls", "-R", "compare_files"]) + subprocess.run(["ls", "-R", dir1]) + subprocess.run(["ls", "-R", dir2]) self.logger.info("Performing an exact string match") self.perform_exact_match() @@ -351,6 +350,7 @@ def localize_files(row, directory): # it would be much faster to copy them all at once, but any files with # the same name would be clobbered, so create local directories matching # gsutil path and loop to copy - destination_path = os.path.dirname(value[5:]) - os.makedirs(os.path.join(directory, destination_path)) - subprocess.run(["gsutil", "-m", "cp", value, destination_path]) \ No newline at end of file + remote_path = os.path.dirname(value[5:]) # exclude 'gs://' prefix + destination_path = os.path.join(directory, remote_path) + os.makedirs(destination_path) + subprocess.run(["gsutil", "-m", "cp", value, destination_path]) From 205c68abfbf306eeeef2e27948706cf7254fd9c9 Mon Sep 17 00:00:00 2001 From: sam-baird Date: Sat, 9 Dec 2023 13:53:11 -0700 Subject: [PATCH 06/43] Determine columns with GCP URIs --- theiavalidate/Validator.py | 35 +++++++++++++++++++++++++++++----- theiavalidate/theiavalidate.py | 9 +++++++-- 2 files changed, 37 insertions(+), 7 deletions(-) diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index 60c7c20..b1fd3d0 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -39,7 +39,8 @@ def __init__(self, options): self.validation_criteria = options.validation_criteria self.columns_to_compare = options.columns_to_compare self.columns_to_compare.append("samples") - + self.file_columns = set() # columns that contain GCP URIs to files + self.output_prefix = options.output_prefix self.na_values = options.na_values @@ -135,6 +136,29 @@ def count_populated_cells(self): self.logger.debug("Creating the summary table with the number of populated cells") self.summary_output = pd.concat([table1_populated_rows, table2_populated_rows], join="outer", axis=1) + + """ + This function determines columns with GCP URIs for file comparisons so that + they are excluded from regular comparisons and instead use filecmp to compare + the downloaded files + """ + def determine_file_columns(self): + for df in [self.table1, self.table2]: + # select columns with at least one GCP URI among nulls + file_columns = df.columns[(df.apply(lambda x: x.astype(str).str.startswith("gs://") + | x.isnull()).all()) + & (~df.isnull().all())] + + file_columns = file_columns.tolist() + self.file_columns.update(file_columns) + + # Ensure file_columns set only has GCP URIs and nulls + for df in [self.table1, self.table2]: + remove_columns = df.columns[df.apply(lambda x: x.astype(str).str.startswith("gs://") + | x.isnull().all())] + remove_columns = set(remove_columns.tolist()) + self.file_columns = self.file_columns - remove_columns + """ This function performs an exact match and creates and Excel file that contains the exact match differences """ @@ -319,17 +343,18 @@ def compare(self): self.logger.info("Counting how many cells have values") self.count_populated_cells() + self.logger.info("Determining columns for file comparisons") + self.determine_file_columns() + dir1 = f"table1_files/" dir2 = f"table2_files/" os.mkdir(dir1) os.mkdir(dir2) - # localize files to compare + self.logger.info("Localizing files to compare...") self.table1.apply(localize_files, directory=dir1, axis=1) self.table2.apply(localize_files, directory=dir2, axis=1) - subprocess.run(["ls", "-R", dir1]) - subprocess.run(["ls", "-R", dir2]) self.logger.info("Performing an exact string match") self.perform_exact_match() @@ -350,7 +375,7 @@ def localize_files(row, directory): # it would be much faster to copy them all at once, but any files with # the same name would be clobbered, so create local directories matching # gsutil path and loop to copy - remote_path = os.path.dirname(value[5:]) # exclude 'gs://' prefix + remote_path = os.path.dirname(value.removeprefix('gs://')) destination_path = os.path.join(directory, remote_path) os.makedirs(destination_path) subprocess.run(["gsutil", "-m", "cp", value, destination_path]) diff --git a/theiavalidate/theiavalidate.py b/theiavalidate/theiavalidate.py index bbdc82d..c04ae3b 100644 --- a/theiavalidate/theiavalidate.py +++ b/theiavalidate/theiavalidate.py @@ -5,6 +5,11 @@ from __init__ import __VERSION__ from Validator import Validator +DEFAULT_NA_VALUES = [ + '-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a', + '', '#NA', 'NULL', 'null', 'NaN','-NaN', 'nan', '-nan', 'None' +] + def main(): parser = argparse.ArgumentParser( description = "This tool compares two tab-delimited files and outputs a report of the differences between the two files.", @@ -25,8 +30,8 @@ def main(): parser.add_argument("-o", "--output_prefix", help="the output file name prefix\ndo not include any spaces", default="theiavalidate", metavar="\b") parser.add_argument("-n", "--na_values", - help="the values that should be considered NA\ndefault values = ['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a', '', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', 'None']", - default= ['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a', '', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', 'None'], metavar="\b", type=int) + help=f"the values that should be considered NA\ndefault values = {DEFAULT_NA_VALUES}", + default=DEFAULT_NA_VALUES, metavar="\b", type=int) parser.add_argument("--verbose", help="increase stdout verbosity", action="store_true", default=False) parser.add_argument("--debug", From 7c02221f708491d2ff2abf25bec6e3d1978c6dd0 Mon Sep 17 00:00:00 2001 From: sam-baird Date: Sat, 9 Dec 2023 15:25:22 -0700 Subject: [PATCH 07/43] Add unit tests for determine_file_columns() --- .devcontainer/devcontainer.json | 26 ++++++++ __init__.py | 0 tests/__init__.py | 0 tests/test_validator.py | 109 ++++++++++++++++++++++++++++++++ theiavalidate/Validator.py | 6 +- theiavalidate/theiavalidate.py | 6 +- 6 files changed, 142 insertions(+), 5 deletions(-) create mode 100644 .devcontainer/devcontainer.json create mode 100644 __init__.py create mode 100644 tests/__init__.py create mode 100644 tests/test_validator.py diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..8d96444 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,26 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile +{ + "name": "Existing Dockerfile", + "build": { + // Sets the run context to one level up instead of the .devcontainer folder. + "context": "..", + // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename. + "dockerfile": "../Dockerfile" + } + + // Features to add to the dev container. More info: https://containers.dev/features. + // "features": {}, + + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // "forwardPorts": [], + + // Uncomment the next line to run commands after the container is created. + // "postCreateCommand": "cat /etc/os-release", + + // Configure tool-specific properties. + // "customizations": {}, + + // Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root. + // "remoteUser": "devcontainer" +} diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_validator.py b/tests/test_validator.py new file mode 100644 index 0000000..224cc07 --- /dev/null +++ b/tests/test_validator.py @@ -0,0 +1,109 @@ +from theiavalidate.Validator import Validator +from theiavalidate.theiavalidate import DEFAULT_NA_VALUES + +import numpy as np +import pandas as pd +import unittest + +class MockOptions: + """ + Mock the "options" object that is created in theiavalidate.py. In + theiavalidate.py, this object is created from command-line arguments using + the argparse package, but here we will simulate this object with a + different class to more easily create Validator objects. + """ + def __init__(self, options_dict=None): + # defaults + self.table1 = None + self.table2 = None + self.version = None + self.columns_to_compare = [] + self.validation_criteria = None + self.column_translation = None + self.output_prefix = None + self.na_values = DEFAULT_NA_VALUES + self.verbose = False + self.debug = False + + # overwrite defaults with options_dict + if options_dict is not None: + for key, value in options_dict.items(): + setattr(self, key, value) + + +class TestDetermineFileColumns(unittest.TestCase): + def setUp(self): + self.validator = Validator(MockOptions()) + + def run_determine_file_columns(self, data1, data2): + self.validator.table1 = pd.DataFrame(data1) + self.validator.table2 = pd.DataFrame(data2) + self.validator.determine_file_columns() + + def test_no_file_columns(self): + data = { + "col1": [1, 2, 3], + "col2": ["foo", "bar", "baz"] + } + self.run_determine_file_columns(data, data) + self.assertEqual(len(self.validator.file_columns), 0) + + def test_some_file_columns(self): + data1 = { + "col1": [1, 2, 3], + "col2": ["gs://foo", "gs://bar", "gs://baz"] + } + data2 = { + "col1": [1, 2, 3], + "col2": ["gs://eggs", "gs://spam", "gs://monty"] + } + self.run_determine_file_columns(data1, data2) + self.assertEqual(self.validator.file_columns, {"col2"}) + + def test_missing_uri(self): + data1 = { + "col1": [1, 2, 3], + "col2": ["gs://foo", np.nan, "gs://baz"] + } + data2 = { + "col1": [1, 2, 3], + "col2": ["gs://eggs", "gs://spam", "gs://monty"] + } + self.run_determine_file_columns(data1, data2) + self.assertEqual(self.validator.file_columns, {"col2"}) + + def test_both_columns_null(self): + data1 = { + "col1": ["gs://foo", "gs://bar", "gs://baz"], + "col2": [np.nan, np.nan, np.nan] + } + data2 = { + "col1": ["gs://eggs", "gs://spam", "gs://monty"], + "col2": [np.nan, np.nan, np.nan] + } + self.run_determine_file_columns(data1, data2) + self.assertEqual(self.validator.file_columns, {"col1"}) + + def test_one_column_null(self): + data1 = { + "col1": ["gs://foo", "gs://bar", "gs://baz"], + "col2": ["gs://x", "gs://y", "gs://z"] + } + data2 = { + "col1": ["gs://eggs", "gs://spam", "gs://monty"], + "col2": [np.nan, np.nan, np.nan] + } + self.run_determine_file_columns(data1, data2) + self.assertEqual(self.validator.file_columns, {"col1", "col2"}) + + def test_one_column_not_null(self): + data1 = { + "col1": ["gs://foo", "gs://bar", "gs://baz"], + "col2": ["gs://x", "gs://y", "gs://z"] + } + data2 = { + "col1": ["gs://eggs", "gs://spam", "gs://monty"], + "col2": [1, 2, 3] + } + self.run_determine_file_columns(data1, data2) + self.assertEqual(self.validator.file_columns, {"col1"}) \ No newline at end of file diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index b1fd3d0..a3e29f7 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -154,8 +154,10 @@ def determine_file_columns(self): # Ensure file_columns set only has GCP URIs and nulls for df in [self.table1, self.table2]: - remove_columns = df.columns[df.apply(lambda x: x.astype(str).str.startswith("gs://") - | x.isnull().all())] + remove_columns = df.columns[~(df.apply(lambda x: x.astype(str).str.startswith('gs://') + | x.isnull()).all())] + +# Convert the Index object to a set remove_columns = set(remove_columns.tolist()) self.file_columns = self.file_columns - remove_columns diff --git a/theiavalidate/theiavalidate.py b/theiavalidate/theiavalidate.py index c04ae3b..f18ff5e 100644 --- a/theiavalidate/theiavalidate.py +++ b/theiavalidate/theiavalidate.py @@ -1,9 +1,9 @@ #!/usr/bin/env python3 import argparse -import CheckInputs -from __init__ import __VERSION__ -from Validator import Validator +from . import CheckInputs +from .__init__ import __VERSION__ +from .Validator import Validator DEFAULT_NA_VALUES = [ '-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a', From 8e4a3e3b57eca9fd8565c91ac3e80892fa892c5f Mon Sep 17 00:00:00 2001 From: sam-baird Date: Sat, 9 Dec 2023 15:34:57 -0700 Subject: [PATCH 08/43] Only apply localize_files to file columns --- tests/test_validator.py | 3 ++- theiavalidate/Validator.py | 12 ++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/test_validator.py b/tests/test_validator.py index 224cc07..3db1996 100644 --- a/tests/test_validator.py +++ b/tests/test_validator.py @@ -106,4 +106,5 @@ def test_one_column_not_null(self): "col2": [1, 2, 3] } self.run_determine_file_columns(data1, data2) - self.assertEqual(self.validator.file_columns, {"col1"}) \ No newline at end of file + self.assertEqual(self.validator.file_columns, {"col1"}) + \ No newline at end of file diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index a3e29f7..c5fb2f0 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -140,7 +140,7 @@ def count_populated_cells(self): """ This function determines columns with GCP URIs for file comparisons so that they are excluded from regular comparisons and instead use filecmp to compare - the downloaded files + the downloaded files. """ def determine_file_columns(self): for df in [self.table1, self.table2]: @@ -157,7 +157,7 @@ def determine_file_columns(self): remove_columns = df.columns[~(df.apply(lambda x: x.astype(str).str.startswith('gs://') | x.isnull()).all())] -# Convert the Index object to a set + # Convert the Index object to a set remove_columns = set(remove_columns.tolist()) self.file_columns = self.file_columns - remove_columns @@ -348,14 +348,14 @@ def compare(self): self.logger.info("Determining columns for file comparisons") self.determine_file_columns() - dir1 = f"table1_files/" - dir2 = f"table2_files/" + dir1 = "table1_files/" + dir2 = "table2_files/" os.mkdir(dir1) os.mkdir(dir2) self.logger.info("Localizing files to compare...") - self.table1.apply(localize_files, directory=dir1, axis=1) - self.table2.apply(localize_files, directory=dir2, axis=1) + self.table1[self.file_columns].apply(localize_files, directory=dir1) + self.table2[self.file_columns].apply(localize_files, directory=dir2) self.logger.info("Performing an exact string match") From c1c5c3bb98bae5631836e27211c4cc103f232aaf Mon Sep 17 00:00:00 2001 From: sam-baird Date: Sat, 9 Dec 2023 20:07:44 -0700 Subject: [PATCH 09/43] Perform exact file comparisons --- theiavalidate/Validator.py | 77 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 73 insertions(+), 4 deletions(-) diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index c5fb2f0..0d59f77 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -1,5 +1,8 @@ from datetime import date from pretty_html_table import build_table + +import difflib +import filecmp import logging import numpy as np import os @@ -12,6 +15,7 @@ class Validator: """ This class runs the parsing module for theiavalidate """ + NUMBER_OF_DIFFERENCES_COLUMN_HEADER = def __init__(self, options): logging.basicConfig(encoding='utf-8', level=logging.ERROR, stream=sys.stderr) self.logger = logging.getLogger(__name__) @@ -166,30 +170,96 @@ def determine_file_columns(self): """ def perform_exact_match(self): self.logger.debug("Performing an exact match and removing the sample name column") + + if self.file_columns: + # exclude file_columns for string comparison + table1 = self.table1.drop(list(self.file_columns), axis=1) + table2 = self.table2.drop(list(self.file_columns), axis=1) + + # handle file comparisons separately from strings + + # TODO: set index to samples column in main table earlier? + files_df1 = self.table1.set_index("samples") + files_df2 = self.table2.set_index("samples") + files_df1 = files_df1[list(self.file_columns)] + files_df2 = files_df2[list(self.file_columns)] + file_number_of_differences = compare_files(files_df1, files_df2) + else: + table1 = self.table1 + table2 = self.table2 + # count the number of differences using exact string matches # temporarily make NaNs null since NaN != NaN for the pd.DataFrame.eq() function # also: remove the samplename row - number_of_differences = pd.DataFrame((~self.table1.fillna("NULL").astype(str).eq(self.table2.fillna("NULL").astype(str))).sum(), columns = ["Number of differences (exact match)"]) + number_of_differences = pd.DataFrame((~table1.fillna("NULL").astype(str).eq(table2.fillna("NULL").astype(str))).sum(), columns = ["Number of differences (exact match)"]) + number_of_differences.drop("samples", axis=0, inplace=True) + # add the number of differences to the summary output table self.logger.debug("Adding the number of exact match differences to the summary table") self.summary_output = pd.concat([self.summary_output, number_of_differences], join="outer", axis=1) + # get a table of self-other differences # also: temporarily drop the sample name column for comparison and then set it as the index for the output data frame self.logger.debug("Creating a table of self-other differences") - exact_differences_table = self.table1.drop("samples", axis=1).compare(self.table2.drop("samples", axis=1), keep_shape=True).set_index(self.table1["samples"]) + exact_differences_table = table1.drop("samples", axis=1).compare(table2.drop("samples", axis=1), keep_shape=True).set_index(table1["samples"]) # rename the self and other with the table names self.logger.debug("Renaming the self and other to be the table names") exact_differences_table.rename(columns={"self": self.table1_name, "other": self.table2_name}, level=-1, inplace=True) # replace matching values (NAs) with blanks self.logger.debug("Replacing all NA values with blanks") exact_differences_table.replace(np.nan, "", inplace=True) + self.logger.debug("Writing the self-other differences table to a TSV file") exact_differences_table.to_csv(self.output_prefix + "_exact_differences.tsv", sep="\t", index=True) + + def compare_files(df1, df2): + comparison_df = pd.DataFrame(index=df1.index, columns=df1.columns) + + for row in df1.index: + for col in df1.columns: + uri1 = df1.loc[row, col] + uri2 = df2.loc[row, col] + file1 = os.path.join("table1_files", uri1.removeprefix("gs://")) + file2 = os.path.join("table2_files", uri2.removeprefix("gs://")) + if pd.isnull(file1) and pd.isnull(file2): + # count two nulls as matching + comparison_df.loc[row, col] = True + elif (not pd.isnull(file1) and not pd.isnull(file2)): + is_match = filecmp.cmp(file1, file2) + comparison_df.loc[row, col] = is_match + if not is_match: + output_filename = f"{row}_{col}_diff.txt" + create_diff(file1, file2, output_filename) + else: + # count as not matching if pair is missing + comparison_df.loc[row, col] = False + + number_of_differences = pd.DataFrame(columns=["Number of differences (exact match)"]) + for col in number_of_differences.columns: + count = comparison_df[col].dropna().ne(True).sum() + number_of_differences.loc[col] = count + + return number_of_differences + + def create_diff(file1, file2, output_filename): + # create unified diff + with open(file1, "r") as f1, open(file2, "r") as f2: + diff = difflib.unified_diff( + f1.readlines(), + f2.readlines(), + fromfile=file1, + tofile=file2, + lineterm='', + ) + diff = "".join(diff) + with open(output_filename, "w") as out: + out.write(diff) + """ This function calculates the percent difference between two values """ @@ -356,7 +426,6 @@ def compare(self): self.logger.info("Localizing files to compare...") self.table1[self.file_columns].apply(localize_files, directory=dir1) self.table2[self.file_columns].apply(localize_files, directory=dir2) - self.logger.info("Performing an exact string match") self.perform_exact_match() @@ -377,7 +446,7 @@ def localize_files(row, directory): # it would be much faster to copy them all at once, but any files with # the same name would be clobbered, so create local directories matching # gsutil path and loop to copy - remote_path = os.path.dirname(value.removeprefix('gs://')) + remote_path = os.path.dirname(value.removeprefix("gs://")) destination_path = os.path.join(directory, remote_path) os.makedirs(destination_path) subprocess.run(["gsutil", "-m", "cp", value, destination_path]) From cf4c0ff87b9acc4aa687c2e903b1273125314da9 Mon Sep 17 00:00:00 2001 From: sam-baird Date: Sat, 9 Dec 2023 20:21:34 -0700 Subject: [PATCH 10/43] Fix typos in Validator --- tests/test_validator.py | 6 +++++- theiavalidate/Validator.py | 14 +++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/test_validator.py b/tests/test_validator.py index 3db1996..c16bb95 100644 --- a/tests/test_validator.py +++ b/tests/test_validator.py @@ -107,4 +107,8 @@ def test_one_column_not_null(self): } self.run_determine_file_columns(data1, data2) self.assertEqual(self.validator.file_columns, {"col1"}) - \ No newline at end of file + + +class TestCompareFiles(unittest.TestCase): + def setUp(self): + self.validator = Validator(MockOptions()) \ No newline at end of file diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index 0d59f77..df41bcb 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -15,7 +15,6 @@ class Validator: """ This class runs the parsing module for theiavalidate """ - NUMBER_OF_DIFFERENCES_COLUMN_HEADER = def __init__(self, options): logging.basicConfig(encoding='utf-8', level=logging.ERROR, stream=sys.stderr) self.logger = logging.getLogger(__name__) @@ -217,13 +216,14 @@ def perform_exact_match(self): exact_differences_table.to_csv(self.output_prefix + "_exact_differences.tsv", sep="\t", index=True) - def compare_files(df1, df2): - comparison_df = pd.DataFrame(index=df1.index, columns=df1.columns) + def compare_files(file_df1, file_df2): + comparison_df = pd.DataFrame(index=file_df1.index, + columns=file_df1.columns) - for row in df1.index: - for col in df1.columns: - uri1 = df1.loc[row, col] - uri2 = df2.loc[row, col] + for row in file_df1.index: + for col in file_df1.columns: + uri1 = file_df1.loc[row, col] + uri2 = file_df2.loc[row, col] file1 = os.path.join("table1_files", uri1.removeprefix("gs://")) file2 = os.path.join("table2_files", uri2.removeprefix("gs://")) if pd.isnull(file1) and pd.isnull(file2): From dd6de05bacc8c2c9ae8dcc8c338a35b74534f9cd Mon Sep 17 00:00:00 2001 From: sam-baird Date: Sat, 9 Dec 2023 21:09:56 -0700 Subject: [PATCH 11/43] Fix indentation --- theiavalidate/Validator.py | 86 +++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index df41bcb..913e9f0 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -216,49 +216,49 @@ def perform_exact_match(self): exact_differences_table.to_csv(self.output_prefix + "_exact_differences.tsv", sep="\t", index=True) - def compare_files(file_df1, file_df2): - comparison_df = pd.DataFrame(index=file_df1.index, - columns=file_df1.columns) - - for row in file_df1.index: - for col in file_df1.columns: - uri1 = file_df1.loc[row, col] - uri2 = file_df2.loc[row, col] - file1 = os.path.join("table1_files", uri1.removeprefix("gs://")) - file2 = os.path.join("table2_files", uri2.removeprefix("gs://")) - if pd.isnull(file1) and pd.isnull(file2): - # count two nulls as matching - comparison_df.loc[row, col] = True - elif (not pd.isnull(file1) and not pd.isnull(file2)): - is_match = filecmp.cmp(file1, file2) - comparison_df.loc[row, col] = is_match - if not is_match: - output_filename = f"{row}_{col}_diff.txt" - create_diff(file1, file2, output_filename) - else: - # count as not matching if pair is missing - comparison_df.loc[row, col] = False - - number_of_differences = pd.DataFrame(columns=["Number of differences (exact match)"]) - for col in number_of_differences.columns: - count = comparison_df[col].dropna().ne(True).sum() - number_of_differences.loc[col] = count - - return number_of_differences - - def create_diff(file1, file2, output_filename): - # create unified diff - with open(file1, "r") as f1, open(file2, "r") as f2: - diff = difflib.unified_diff( - f1.readlines(), - f2.readlines(), - fromfile=file1, - tofile=file2, - lineterm='', - ) - diff = "".join(diff) - with open(output_filename, "w") as out: - out.write(diff) + def compare_files(file_df1, file_df2): + comparison_df = pd.DataFrame(index=file_df1.index, + columns=file_df1.columns) + + for row in file_df1.index: + for col in file_df1.columns: + uri1 = file_df1.loc[row, col] + uri2 = file_df2.loc[row, col] + file1 = os.path.join("table1_files", uri1.removeprefix("gs://")) + file2 = os.path.join("table2_files", uri2.removeprefix("gs://")) + if pd.isnull(file1) and pd.isnull(file2): + # count two nulls as matching + comparison_df.loc[row, col] = True + elif (not pd.isnull(file1) and not pd.isnull(file2)): + is_match = filecmp.cmp(file1, file2) + comparison_df.loc[row, col] = is_match + if not is_match: + output_filename = f"{row}_{col}_diff.txt" + create_diff(file1, file2, output_filename) + else: + # count as not matching if pair is missing + comparison_df.loc[row, col] = False + + number_of_differences = pd.DataFrame(columns=["Number of differences (exact match)"]) + for col in number_of_differences.columns: + count = comparison_df[col].dropna().ne(True).sum() + number_of_differences.loc[col] = count + + return number_of_differences + + def create_diff(file1, file2, output_filename): + # create unified diff + with open(file1, "r") as f1, open(file2, "r") as f2: + diff = difflib.unified_diff( + f1.readlines(), + f2.readlines(), + fromfile=file1, + tofile=file2, + lineterm='', + ) + diff = "".join(diff) + with open(output_filename, "w") as out: + out.write(diff) """ This function calculates the percent difference between two values From 2b2d6d252d12747337f0cf5d3b2b16b5f26083e9 Mon Sep 17 00:00:00 2001 From: sam-baird Date: Sat, 9 Dec 2023 21:11:26 -0700 Subject: [PATCH 12/43] Add self as argument to methods --- theiavalidate/Validator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index 913e9f0..328ca88 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -216,7 +216,7 @@ def perform_exact_match(self): exact_differences_table.to_csv(self.output_prefix + "_exact_differences.tsv", sep="\t", index=True) - def compare_files(file_df1, file_df2): + def compare_files(self, file_df1, file_df2): comparison_df = pd.DataFrame(index=file_df1.index, columns=file_df1.columns) @@ -246,7 +246,7 @@ def compare_files(file_df1, file_df2): return number_of_differences - def create_diff(file1, file2, output_filename): + def create_diff(self, file1, file2, output_filename): # create unified diff with open(file1, "r") as f1, open(file2, "r") as f2: diff = difflib.unified_diff( From 901c2805fb705d43dc713eb01e7583254f90facb Mon Sep 17 00:00:00 2001 From: sam-baird Date: Sat, 9 Dec 2023 21:20:28 -0700 Subject: [PATCH 13/43] Add directory to store files as Validator init variable --- tests/test_validator.py | 30 +++++++++++++++++++++++++++++- theiavalidate/Validator.py | 14 ++++++++------ 2 files changed, 37 insertions(+), 7 deletions(-) diff --git a/tests/test_validator.py b/tests/test_validator.py index c16bb95..678811a 100644 --- a/tests/test_validator.py +++ b/tests/test_validator.py @@ -111,4 +111,32 @@ def test_one_column_not_null(self): class TestCompareFiles(unittest.TestCase): def setUp(self): - self.validator = Validator(MockOptions()) \ No newline at end of file + self.validator = Validator(MockOptions()) + self.file_comparison_dir = "tests/file1_files" + self.file_comparison_dir = "tests/file2_files" + + def test_matching_files(self): + df1 = pd.DataFrame({ + "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-2.txt"], + "col2": ["gs://match2-1.txt", "gs://match2-2.txt", "gs://match2-3.txt"] + }) + df2 = pd.DataFrame({ + "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-2.txt"], + "col2": ["gs://match2-1.txt", "gs://match2-2.txt", "gs://match2-3.txt"] + }) + observed = self.validator.compare_files(df1, df2) + expected = pd.DataFrame({ + "Number of differences (exact match)": [0, 0] + }) + expected.index = ["col1, col2"] + pd.testing.assert_frame_equal(observed, expected) + + def test_mismatching_files(self): + pass + + def test_mix_matching_files(self): + pass + + def test_null_file(self): + pass + diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index 328ca88..7b37be9 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -43,6 +43,8 @@ def __init__(self, options): self.columns_to_compare = options.columns_to_compare self.columns_to_compare.append("samples") self.file_columns = set() # columns that contain GCP URIs to files + self.table1_files_dir = "table1_files" + self.table2_files_dir = "table2_files" self.output_prefix = options.output_prefix self.na_values = options.na_values @@ -182,7 +184,7 @@ def perform_exact_match(self): files_df2 = self.table2.set_index("samples") files_df1 = files_df1[list(self.file_columns)] files_df2 = files_df2[list(self.file_columns)] - file_number_of_differences = compare_files(files_df1, files_df2) + file_number_of_differences = self.compare_files(files_df1, files_df2) else: table1 = self.table1 table2 = self.table2 @@ -224,8 +226,8 @@ def compare_files(self, file_df1, file_df2): for col in file_df1.columns: uri1 = file_df1.loc[row, col] uri2 = file_df2.loc[row, col] - file1 = os.path.join("table1_files", uri1.removeprefix("gs://")) - file2 = os.path.join("table2_files", uri2.removeprefix("gs://")) + file1 = os.path.join(self.table1_files_dir, uri1.removeprefix("gs://")) + file2 = os.path.join(self.table2_files_dir, uri2.removeprefix("gs://")) if pd.isnull(file1) and pd.isnull(file2): # count two nulls as matching comparison_df.loc[row, col] = True @@ -234,7 +236,7 @@ def compare_files(self, file_df1, file_df2): comparison_df.loc[row, col] = is_match if not is_match: output_filename = f"{row}_{col}_diff.txt" - create_diff(file1, file2, output_filename) + self._create_diff(file1, file2, output_filename) else: # count as not matching if pair is missing comparison_df.loc[row, col] = False @@ -418,8 +420,8 @@ def compare(self): self.logger.info("Determining columns for file comparisons") self.determine_file_columns() - dir1 = "table1_files/" - dir2 = "table2_files/" + dir1 = f"{self.table1_files_dir}/" + dir2 = f"{self.table2_files_dir}/" os.mkdir(dir1) os.mkdir(dir2) From 22aa10272f9f0cb2378cbe0f88332334a9fa6cbb Mon Sep 17 00:00:00 2001 From: sam-baird Date: Sat, 9 Dec 2023 21:42:28 -0700 Subject: [PATCH 14/43] Fix wrong DataFrame variable in compare_files() --- theiavalidate/Validator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index 7b37be9..ccca839 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -220,7 +220,7 @@ def perform_exact_match(self): def compare_files(self, file_df1, file_df2): comparison_df = pd.DataFrame(index=file_df1.index, - columns=file_df1.columns) + columns=file_df1.columns) for row in file_df1.index: for col in file_df1.columns: @@ -242,7 +242,7 @@ def compare_files(self, file_df1, file_df2): comparison_df.loc[row, col] = False number_of_differences = pd.DataFrame(columns=["Number of differences (exact match)"]) - for col in number_of_differences.columns: + for col in comparison_df.columns: count = comparison_df[col].dropna().ne(True).sum() number_of_differences.loc[col] = count From 425ff5cfad2ac7ebb7d44a0445f0a56cdaae32d2 Mon Sep 17 00:00:00 2001 From: sam-baird Date: Sat, 9 Dec 2023 21:58:41 -0700 Subject: [PATCH 15/43] Fix typo _create_diff -> create_dff --- theiavalidate/Validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index ccca839..c7233d1 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -236,7 +236,7 @@ def compare_files(self, file_df1, file_df2): comparison_df.loc[row, col] = is_match if not is_match: output_filename = f"{row}_{col}_diff.txt" - self._create_diff(file1, file2, output_filename) + self.create_diff(file1, file2, output_filename) else: # count as not matching if pair is missing comparison_df.loc[row, col] = False From cf461db59e90d4fd7e98950a6207006aa1d9451e Mon Sep 17 00:00:00 2001 From: sam-baird Date: Sat, 9 Dec 2023 22:29:20 -0700 Subject: [PATCH 16/43] Fix another indentation error --- theiavalidate/Validator.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index c7233d1..3a5bd05 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -45,6 +45,7 @@ def __init__(self, options): self.file_columns = set() # columns that contain GCP URIs to files self.table1_files_dir = "table1_files" self.table2_files_dir = "table2_files" + self.diff_dir = "file_diffs" self.output_prefix = options.output_prefix self.na_values = options.na_values @@ -222,12 +223,13 @@ def compare_files(self, file_df1, file_df2): comparison_df = pd.DataFrame(index=file_df1.index, columns=file_df1.columns) - for row in file_df1.index: - for col in file_df1.columns: + for col in file_df1.columns: + for row in file_df1.index: uri1 = file_df1.loc[row, col] uri2 = file_df2.loc[row, col] file1 = os.path.join(self.table1_files_dir, uri1.removeprefix("gs://")) file2 = os.path.join(self.table2_files_dir, uri2.removeprefix("gs://")) + print(f"files: {file1}, {file2}") if pd.isnull(file1) and pd.isnull(file2): # count two nulls as matching comparison_df.loc[row, col] = True @@ -236,19 +238,20 @@ def compare_files(self, file_df1, file_df2): comparison_df.loc[row, col] = is_match if not is_match: output_filename = f"{row}_{col}_diff.txt" - self.create_diff(file1, file2, output_filename) + output_path = os.path.join(self.diff_dir, output_filename) + self.create_diff(file1, file2, output_path) else: # count as not matching if pair is missing comparison_df.loc[row, col] = False - - number_of_differences = pd.DataFrame(columns=["Number of differences (exact match)"]) - for col in comparison_df.columns: - count = comparison_df[col].dropna().ne(True).sum() - number_of_differences.loc[col] = count - return number_of_differences + number_of_differences = pd.DataFrame(columns=["Number of differences (exact match)"]) + for col in comparison_df.columns: + count = comparison_df[col].dropna().ne(True).sum() + number_of_differences.loc[col] = count + + return number_of_differences - def create_diff(self, file1, file2, output_filename): + def create_diff(self, file1, file2, output_path): # create unified diff with open(file1, "r") as f1, open(file2, "r") as f2: diff = difflib.unified_diff( @@ -259,7 +262,9 @@ def create_diff(self, file1, file2, output_filename): lineterm='', ) diff = "".join(diff) - with open(output_filename, "w") as out: + + os.makedirs(os.path.dirname(output_path), exist_ok=True) + with open(output_path, "w") as out: out.write(diff) """ From 494b8b4dc6e29fd1152cbcb00e13b0e469a53085 Mon Sep 17 00:00:00 2001 From: sam-baird Date: Sat, 9 Dec 2023 22:47:56 -0700 Subject: [PATCH 17/43] rearrange order of execution in compare_files() --- theiavalidate/Validator.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index 3a5bd05..7620684 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -227,13 +227,12 @@ def compare_files(self, file_df1, file_df2): for row in file_df1.index: uri1 = file_df1.loc[row, col] uri2 = file_df2.loc[row, col] - file1 = os.path.join(self.table1_files_dir, uri1.removeprefix("gs://")) - file2 = os.path.join(self.table2_files_dir, uri2.removeprefix("gs://")) - print(f"files: {file1}, {file2}") - if pd.isnull(file1) and pd.isnull(file2): + if pd.isnull(uri1) and pd.isnull(uri2): # count two nulls as matching comparison_df.loc[row, col] = True - elif (not pd.isnull(file1) and not pd.isnull(file2)): + elif (not pd.isnull(uri1) and not pd.isnull(uri2)): + file1 = os.path.join(self.table1_files_dir, uri1.removeprefix("gs://")) + file2 = os.path.join(self.table2_files_dir, uri2.removeprefix("gs://")) is_match = filecmp.cmp(file1, file2) comparison_df.loc[row, col] = is_match if not is_match: From 710b30aff9b67bc33255a59782651ba7bf49acc5 Mon Sep 17 00:00:00 2001 From: sam-baird Date: Sat, 9 Dec 2023 22:49:10 -0700 Subject: [PATCH 18/43] Add unit tests for compare_files() --- .DS_Store | Bin 0 -> 6148 bytes file_diffs/0_col1_diff.txt | 5 +++ file_diffs/0_col2_diff.txt | 3 ++ file_diffs/1_col1_diff.txt | 3 ++ file_diffs/1_col2_diff.txt | 3 ++ file_diffs/2_col1_diff.txt | 5 +++ file_diffs/2_col2_diff.txt | 3 ++ tests/table1_files/match1-1.txt | 3 ++ tests/table1_files/match1-2.txt | 3 ++ tests/table1_files/match1-3.txt | 3 ++ tests/table1_files/match2-1.txt | 2 + tests/table1_files/match2-2.txt | 2 + tests/table1_files/match2-3.txt | 2 + tests/table1_files/mismatch1-1.txt | 3 ++ tests/table1_files/mismatch1-2.txt | 2 + tests/table1_files/mismatch1-3.txt | 4 ++ tests/table1_files/mismatch2-1.txt | 2 + tests/table1_files/mismatch2-2.txt | 2 + tests/table1_files/mismatch2-3.txt | 2 + tests/table2_files/match1-1.txt | 3 ++ tests/table2_files/match1-2.txt | 3 ++ tests/table2_files/match1-3.txt | 3 ++ tests/table2_files/match2-1.txt | 2 + tests/table2_files/match2-2.txt | 2 + tests/table2_files/match2-3.txt | 2 + tests/table2_files/mismatch1-1.txt | 3 ++ tests/table2_files/mismatch1-2.txt | 3 ++ tests/table2_files/mismatch1-3.txt | 3 ++ tests/table2_files/mismatch2-1.txt | 1 + tests/table2_files/mismatch2-2.txt | 2 + tests/table2_files/mismatch2-3.txt | 1 + tests/test_validator.py | 58 ++++++++++++++++++++++++----- 32 files changed, 129 insertions(+), 9 deletions(-) create mode 100644 .DS_Store create mode 100644 file_diffs/0_col1_diff.txt create mode 100644 file_diffs/0_col2_diff.txt create mode 100644 file_diffs/1_col1_diff.txt create mode 100644 file_diffs/1_col2_diff.txt create mode 100644 file_diffs/2_col1_diff.txt create mode 100644 file_diffs/2_col2_diff.txt create mode 100644 tests/table1_files/match1-1.txt create mode 100644 tests/table1_files/match1-2.txt create mode 100644 tests/table1_files/match1-3.txt create mode 100644 tests/table1_files/match2-1.txt create mode 100644 tests/table1_files/match2-2.txt create mode 100644 tests/table1_files/match2-3.txt create mode 100644 tests/table1_files/mismatch1-1.txt create mode 100644 tests/table1_files/mismatch1-2.txt create mode 100644 tests/table1_files/mismatch1-3.txt create mode 100644 tests/table1_files/mismatch2-1.txt create mode 100644 tests/table1_files/mismatch2-2.txt create mode 100644 tests/table1_files/mismatch2-3.txt create mode 100644 tests/table2_files/match1-1.txt create mode 100644 tests/table2_files/match1-2.txt create mode 100644 tests/table2_files/match1-3.txt create mode 100644 tests/table2_files/match2-1.txt create mode 100644 tests/table2_files/match2-2.txt create mode 100644 tests/table2_files/match2-3.txt create mode 100644 tests/table2_files/mismatch1-1.txt create mode 100644 tests/table2_files/mismatch1-2.txt create mode 100644 tests/table2_files/mismatch1-3.txt create mode 100644 tests/table2_files/mismatch2-1.txt create mode 100644 tests/table2_files/mismatch2-2.txt create mode 100644 tests/table2_files/mismatch2-3.txt diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..51227d71a94e52683c4d00d01ac912a7dfa75f3b GIT binary patch literal 6148 zcmeHKI|>3p3{CuiU}I@HSMUad=n3`$7K)81_^Y?_TprDrPoXS!S|~4&yqQeiEc=Sh zMnrUeSOjyq1*=D6P-_S;VFyM%Fva+Zyp5TE&WLZbpy zfC^9nDnJE3tw46z(fHFB^FAs-1%6%u`#u!7VNGlU{nLTqBLHxKv>VnwO8|={fHkoV zL$0gG>bW2DSK<@<*e5h_zG?{ceokWPC@W?4D@!4jkV*c7e!sMHO^~d U8|ZZ8oet#BfayY`0^e5P0W;M1& literal 0 HcmV?d00001 diff --git a/file_diffs/0_col1_diff.txt b/file_diffs/0_col1_diff.txt new file mode 100644 index 0000000..a0a1ba9 --- /dev/null +++ b/file_diffs/0_col1_diff.txt @@ -0,0 +1,5 @@ +--- tests/table1_files/mismatch1-1.txt+++ tests/table2_files/mismatch1-1.txt@@ -1,3 +1,3 @@-foo +-bar ++eggs ++spam + diff --git a/file_diffs/0_col2_diff.txt b/file_diffs/0_col2_diff.txt new file mode 100644 index 0000000..852e058 --- /dev/null +++ b/file_diffs/0_col2_diff.txt @@ -0,0 +1,3 @@ +--- tests/table1_files/mismatch2-3.txt+++ tests/table2_files/mismatch2-3.txt@@ -1,2 +1 @@-hello, world +- ++hello, world! diff --git a/file_diffs/1_col1_diff.txt b/file_diffs/1_col1_diff.txt new file mode 100644 index 0000000..7f6663f --- /dev/null +++ b/file_diffs/1_col1_diff.txt @@ -0,0 +1,3 @@ +--- tests/table1_files/mismatch1-2.txt+++ tests/table2_files/mismatch1-2.txt@@ -1,2 +1,3 @@+foo + foo + diff --git a/file_diffs/1_col2_diff.txt b/file_diffs/1_col2_diff.txt new file mode 100644 index 0000000..e08f21a --- /dev/null +++ b/file_diffs/1_col2_diff.txt @@ -0,0 +1,3 @@ +--- tests/table1_files/mismatch2-2.txt+++ tests/table2_files/mismatch2-2.txt@@ -1,2 +1,2 @@-5 6 6 ++4 5 6 + diff --git a/file_diffs/2_col1_diff.txt b/file_diffs/2_col1_diff.txt new file mode 100644 index 0000000..89a7f0c --- /dev/null +++ b/file_diffs/2_col1_diff.txt @@ -0,0 +1,5 @@ +--- tests/table1_files/mismatch1-3.txt+++ tests/table2_files/mismatch1-3.txt@@ -1,4 +1,3 @@+spam + +-spam + eggs +- diff --git a/file_diffs/2_col2_diff.txt b/file_diffs/2_col2_diff.txt new file mode 100644 index 0000000..852e058 --- /dev/null +++ b/file_diffs/2_col2_diff.txt @@ -0,0 +1,3 @@ +--- tests/table1_files/mismatch2-3.txt+++ tests/table2_files/mismatch2-3.txt@@ -1,2 +1 @@-hello, world +- ++hello, world! diff --git a/tests/table1_files/match1-1.txt b/tests/table1_files/match1-1.txt new file mode 100644 index 0000000..7bd112e --- /dev/null +++ b/tests/table1_files/match1-1.txt @@ -0,0 +1,3 @@ +foo +bar + diff --git a/tests/table1_files/match1-2.txt b/tests/table1_files/match1-2.txt new file mode 100644 index 0000000..42f0295 --- /dev/null +++ b/tests/table1_files/match1-2.txt @@ -0,0 +1,3 @@ +baz +eggs + diff --git a/tests/table1_files/match1-3.txt b/tests/table1_files/match1-3.txt new file mode 100644 index 0000000..fe05684 --- /dev/null +++ b/tests/table1_files/match1-3.txt @@ -0,0 +1,3 @@ +spam +monty + diff --git a/tests/table1_files/match2-1.txt b/tests/table1_files/match2-1.txt new file mode 100644 index 0000000..2b70035 --- /dev/null +++ b/tests/table1_files/match2-1.txt @@ -0,0 +1,2 @@ +1 2 3 + diff --git a/tests/table1_files/match2-2.txt b/tests/table1_files/match2-2.txt new file mode 100644 index 0000000..8db5eef --- /dev/null +++ b/tests/table1_files/match2-2.txt @@ -0,0 +1,2 @@ +4 5 6 + diff --git a/tests/table1_files/match2-3.txt b/tests/table1_files/match2-3.txt new file mode 100644 index 0000000..ee64adb --- /dev/null +++ b/tests/table1_files/match2-3.txt @@ -0,0 +1,2 @@ +7 8 9 + diff --git a/tests/table1_files/mismatch1-1.txt b/tests/table1_files/mismatch1-1.txt new file mode 100644 index 0000000..7bd112e --- /dev/null +++ b/tests/table1_files/mismatch1-1.txt @@ -0,0 +1,3 @@ +foo +bar + diff --git a/tests/table1_files/mismatch1-2.txt b/tests/table1_files/mismatch1-2.txt new file mode 100644 index 0000000..75d7bfb --- /dev/null +++ b/tests/table1_files/mismatch1-2.txt @@ -0,0 +1,2 @@ +foo + diff --git a/tests/table1_files/mismatch1-3.txt b/tests/table1_files/mismatch1-3.txt new file mode 100644 index 0000000..d86174f --- /dev/null +++ b/tests/table1_files/mismatch1-3.txt @@ -0,0 +1,4 @@ + +spam +eggs + diff --git a/tests/table1_files/mismatch2-1.txt b/tests/table1_files/mismatch2-1.txt new file mode 100644 index 0000000..2b70035 --- /dev/null +++ b/tests/table1_files/mismatch2-1.txt @@ -0,0 +1,2 @@ +1 2 3 + diff --git a/tests/table1_files/mismatch2-2.txt b/tests/table1_files/mismatch2-2.txt new file mode 100644 index 0000000..a28f8ae --- /dev/null +++ b/tests/table1_files/mismatch2-2.txt @@ -0,0 +1,2 @@ +5 6 6 + diff --git a/tests/table1_files/mismatch2-3.txt b/tests/table1_files/mismatch2-3.txt new file mode 100644 index 0000000..ae0e511 --- /dev/null +++ b/tests/table1_files/mismatch2-3.txt @@ -0,0 +1,2 @@ +hello, world + diff --git a/tests/table2_files/match1-1.txt b/tests/table2_files/match1-1.txt new file mode 100644 index 0000000..7bd112e --- /dev/null +++ b/tests/table2_files/match1-1.txt @@ -0,0 +1,3 @@ +foo +bar + diff --git a/tests/table2_files/match1-2.txt b/tests/table2_files/match1-2.txt new file mode 100644 index 0000000..42f0295 --- /dev/null +++ b/tests/table2_files/match1-2.txt @@ -0,0 +1,3 @@ +baz +eggs + diff --git a/tests/table2_files/match1-3.txt b/tests/table2_files/match1-3.txt new file mode 100644 index 0000000..fe05684 --- /dev/null +++ b/tests/table2_files/match1-3.txt @@ -0,0 +1,3 @@ +spam +monty + diff --git a/tests/table2_files/match2-1.txt b/tests/table2_files/match2-1.txt new file mode 100644 index 0000000..2b70035 --- /dev/null +++ b/tests/table2_files/match2-1.txt @@ -0,0 +1,2 @@ +1 2 3 + diff --git a/tests/table2_files/match2-2.txt b/tests/table2_files/match2-2.txt new file mode 100644 index 0000000..8db5eef --- /dev/null +++ b/tests/table2_files/match2-2.txt @@ -0,0 +1,2 @@ +4 5 6 + diff --git a/tests/table2_files/match2-3.txt b/tests/table2_files/match2-3.txt new file mode 100644 index 0000000..ee64adb --- /dev/null +++ b/tests/table2_files/match2-3.txt @@ -0,0 +1,2 @@ +7 8 9 + diff --git a/tests/table2_files/mismatch1-1.txt b/tests/table2_files/mismatch1-1.txt new file mode 100644 index 0000000..34ae2c6 --- /dev/null +++ b/tests/table2_files/mismatch1-1.txt @@ -0,0 +1,3 @@ +eggs +spam + diff --git a/tests/table2_files/mismatch1-2.txt b/tests/table2_files/mismatch1-2.txt new file mode 100644 index 0000000..7cd519a --- /dev/null +++ b/tests/table2_files/mismatch1-2.txt @@ -0,0 +1,3 @@ +foo +foo + diff --git a/tests/table2_files/mismatch1-3.txt b/tests/table2_files/mismatch1-3.txt new file mode 100644 index 0000000..fbabddf --- /dev/null +++ b/tests/table2_files/mismatch1-3.txt @@ -0,0 +1,3 @@ +spam + +eggs diff --git a/tests/table2_files/mismatch2-1.txt b/tests/table2_files/mismatch2-1.txt new file mode 100644 index 0000000..8d04f96 --- /dev/null +++ b/tests/table2_files/mismatch2-1.txt @@ -0,0 +1 @@ +1 2 diff --git a/tests/table2_files/mismatch2-2.txt b/tests/table2_files/mismatch2-2.txt new file mode 100644 index 0000000..336a0f9 --- /dev/null +++ b/tests/table2_files/mismatch2-2.txt @@ -0,0 +1,2 @@ +4 5 6 + diff --git a/tests/table2_files/mismatch2-3.txt b/tests/table2_files/mismatch2-3.txt new file mode 100644 index 0000000..270c611 --- /dev/null +++ b/tests/table2_files/mismatch2-3.txt @@ -0,0 +1 @@ +hello, world! diff --git a/tests/test_validator.py b/tests/test_validator.py index 678811a..39b2b61 100644 --- a/tests/test_validator.py +++ b/tests/test_validator.py @@ -112,31 +112,71 @@ def test_one_column_not_null(self): class TestCompareFiles(unittest.TestCase): def setUp(self): self.validator = Validator(MockOptions()) - self.file_comparison_dir = "tests/file1_files" - self.file_comparison_dir = "tests/file2_files" + self.validator.table1_files_dir = "tests/table1_files" + self.validator.table2_files_dir = "tests/table2_files" + self.diff_dir = "/dev/null" def test_matching_files(self): df1 = pd.DataFrame({ - "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-2.txt"], + "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-3.txt"], "col2": ["gs://match2-1.txt", "gs://match2-2.txt", "gs://match2-3.txt"] }) df2 = pd.DataFrame({ - "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-2.txt"], + "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-3.txt"], "col2": ["gs://match2-1.txt", "gs://match2-2.txt", "gs://match2-3.txt"] }) observed = self.validator.compare_files(df1, df2) expected = pd.DataFrame({ "Number of differences (exact match)": [0, 0] }) - expected.index = ["col1, col2"] + expected.index = ["col1", "col2"] pd.testing.assert_frame_equal(observed, expected) def test_mismatching_files(self): - pass + df1 = pd.DataFrame({ + "col1": ["gs://mismatch1-1.txt", "gs://mismatch1-2.txt", "gs://mismatch1-3.txt"], + "col2": ["gs://mismatch2-1.txt", "gs://mismatch2-2.txt", "gs://mismatch2-3.txt"] + }) + df2 = pd.DataFrame({ + "col1": ["gs://mismatch1-1.txt", "gs://mismatch1-2.txt", "gs://mismatch1-3.txt"], + "col2": ["gs://mismatch2-1.txt", "gs://mismatch2-2.txt", "gs://mismatch2-3.txt"] + }) + observed = self.validator.compare_files(df1, df2) + expected = pd.DataFrame({ + "Number of differences (exact match)": [3, 3] + }) + expected.index = ["col1", "col2"] + pd.testing.assert_frame_equal(observed, expected) def test_mix_matching_files(self): - pass + df1 = pd.DataFrame({ + "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-3.txt"], + "col2": ["gs://mismatch2-3.txt", "gs://match2-2.txt", "gs://mismatch2-3.txt"] + }) + df2 = pd.DataFrame({ + "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-3.txt"], + "col2": ["gs://mismatch2-3.txt", "gs://match2-2.txt", "gs://mismatch2-3.txt"] + }) + observed = self.validator.compare_files(df1, df2) + expected = pd.DataFrame({ + "Number of differences (exact match)": [0, 2] + }) + expected.index = ["col1", "col2"] + pd.testing.assert_frame_equal(observed, expected) - def test_null_file(self): - pass + def test_null_files(self): + df1 = pd.DataFrame({ + "col1": [np.nan, "gs://match1-2.txt", "gs://match1-3.txt"], + "col2": [np.nan, "gs://match2-2.txt", "gs://mismatch2-3.txt"] + }) + df2 = pd.DataFrame({ + "col1": [np.nan, "gs://match1-2.txt", "gs://match1-3.txt"], + "col2": ["gs://match2-3.txt", np.nan, np.nan] + }) + observed = self.validator.compare_files(df1, df2) + expected = pd.DataFrame({ + "Number of differences (exact match)": [0, 3] + }) + expected.index = ["col1", "col2"] + pd.testing.assert_frame_equal(observed, expected) From e7fb41af00c390bb9e08b696d1793d4be938858e Mon Sep 17 00:00:00 2001 From: sam-baird Date: Sat, 9 Dec 2023 23:01:40 -0700 Subject: [PATCH 19/43] Apppend file number of differences to summary_output --- theiavalidate/Validator.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index 7620684..f3b111c 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -173,6 +173,7 @@ def determine_file_columns(self): def perform_exact_match(self): self.logger.debug("Performing an exact match and removing the sample name column") + file_number_of_differences = None if self.file_columns: # exclude file_columns for string comparison table1 = self.table1.drop(list(self.file_columns), axis=1) @@ -201,7 +202,8 @@ def perform_exact_match(self): # add the number of differences to the summary output table self.logger.debug("Adding the number of exact match differences to the summary table") self.summary_output = pd.concat([self.summary_output, number_of_differences], join="outer", axis=1) - + if file_number_of_differences is not None: + self.summary_output = pd.concat([self.summary_output, file_number_of_differences], join="outer", axis="1") # get a table of self-other differences # also: temporarily drop the sample name column for comparison and then set it as the index for the output data frame From 59912a3252ce667cf1d2b501188d9ee34dfad08c Mon Sep 17 00:00:00 2001 From: sam-baird Date: Sat, 9 Dec 2023 23:19:46 -0700 Subject: [PATCH 20/43] __init__ hacks to get imports to work --- __init__.py | 2 ++ tests/__init__.py | 2 ++ theiavalidate/__init__.py | 3 ++- theiavalidate/theiavalidate.py | 6 +++--- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/__init__.py b/__init__.py index e69de29..c0986f5 100644 --- a/__init__.py +++ b/__init__.py @@ -0,0 +1,2 @@ +__VERSION__ = "v0.0.1" +import os, sys; sys.path.append(os.path.dirname(os.path.realpath(__file__))) \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py index e69de29..c0986f5 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -0,0 +1,2 @@ +__VERSION__ = "v0.0.1" +import os, sys; sys.path.append(os.path.dirname(os.path.realpath(__file__))) \ No newline at end of file diff --git a/theiavalidate/__init__.py b/theiavalidate/__init__.py index 9a65ac3..c0986f5 100644 --- a/theiavalidate/__init__.py +++ b/theiavalidate/__init__.py @@ -1 +1,2 @@ -__VERSION__ = "v0.0.1" \ No newline at end of file +__VERSION__ = "v0.0.1" +import os, sys; sys.path.append(os.path.dirname(os.path.realpath(__file__))) \ No newline at end of file diff --git a/theiavalidate/theiavalidate.py b/theiavalidate/theiavalidate.py index f18ff5e..c04ae3b 100644 --- a/theiavalidate/theiavalidate.py +++ b/theiavalidate/theiavalidate.py @@ -1,9 +1,9 @@ #!/usr/bin/env python3 import argparse -from . import CheckInputs -from .__init__ import __VERSION__ -from .Validator import Validator +import CheckInputs +from __init__ import __VERSION__ +from Validator import Validator DEFAULT_NA_VALUES = [ '-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a', From 0d752d1c9a1b1c1731d5bb409bd383d16b14ff97 Mon Sep 17 00:00:00 2001 From: sam-baird Date: Sun, 10 Dec 2023 00:51:31 -0700 Subject: [PATCH 21/43] Use list instead of set for indexing --- theiavalidate/Validator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index f3b111c..18102c8 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -432,8 +432,8 @@ def compare(self): os.mkdir(dir2) self.logger.info("Localizing files to compare...") - self.table1[self.file_columns].apply(localize_files, directory=dir1) - self.table2[self.file_columns].apply(localize_files, directory=dir2) + self.table1[list(self.file_columns)].apply(localize_files, directory=dir1) + self.table2[list(self.file_columns)].apply(localize_files, directory=dir2) self.logger.info("Performing an exact string match") self.perform_exact_match() From abf7f0044a5474b8c776a51f7b7219e2ddba8a01 Mon Sep 17 00:00:00 2001 From: sam-baird Date: Sun, 10 Dec 2023 01:13:51 -0700 Subject: [PATCH 22/43] Add exist_ok=True to os.makedirs() --- theiavalidate/Validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index 18102c8..2641a5e 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -456,5 +456,5 @@ def localize_files(row, directory): # gsutil path and loop to copy remote_path = os.path.dirname(value.removeprefix("gs://")) destination_path = os.path.join(directory, remote_path) - os.makedirs(destination_path) + os.makedirs(destination_path, exist_ok=True) subprocess.run(["gsutil", "-m", "cp", value, destination_path]) From 275281eedb5d2cbfe602d87a8b28c7d0921e5eec Mon Sep 17 00:00:00 2001 From: sam-baird Date: Sun, 10 Dec 2023 01:22:49 -0700 Subject: [PATCH 23/43] fix typo axis="1" -> axis=1 --- theiavalidate/Validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index 2641a5e..5d1a73b 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -203,7 +203,7 @@ def perform_exact_match(self): self.logger.debug("Adding the number of exact match differences to the summary table") self.summary_output = pd.concat([self.summary_output, number_of_differences], join="outer", axis=1) if file_number_of_differences is not None: - self.summary_output = pd.concat([self.summary_output, file_number_of_differences], join="outer", axis="1") + self.summary_output = pd.concat([self.summary_output, file_number_of_differences], join="outer", axis=1) # get a table of self-other differences # also: temporarily drop the sample name column for comparison and then set it as the index for the output data frame From 169dbfd9906ac5ab6c8783c39ef8631c65895d5b Mon Sep 17 00:00:00 2001 From: sam-baird Date: Sun, 10 Dec 2023 20:35:07 -0700 Subject: [PATCH 24/43] Fix issues appending file number of differences to summary output --- theiavalidate/Validator.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index 5d1a73b..778f3df 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -15,6 +15,7 @@ class Validator: """ This class runs the parsing module for theiavalidate """ + NUM_DIFFERENCES_COL = "Number of differences (exact match)" def __init__(self, options): logging.basicConfig(encoding='utf-8', level=logging.ERROR, stream=sys.stderr) self.logger = logging.getLogger(__name__) @@ -180,7 +181,6 @@ def perform_exact_match(self): table2 = self.table2.drop(list(self.file_columns), axis=1) # handle file comparisons separately from strings - # TODO: set index to samples column in main table earlier? files_df1 = self.table1.set_index("samples") files_df2 = self.table2.set_index("samples") @@ -194,7 +194,7 @@ def perform_exact_match(self): # count the number of differences using exact string matches # temporarily make NaNs null since NaN != NaN for the pd.DataFrame.eq() function # also: remove the samplename row - number_of_differences = pd.DataFrame((~table1.fillna("NULL").astype(str).eq(table2.fillna("NULL").astype(str))).sum(), columns = ["Number of differences (exact match)"]) + number_of_differences = pd.DataFrame((~table1.fillna("NULL").astype(str).eq(table2.fillna("NULL").astype(str))).sum(), columns = [self.NUM_DIFFERENCES_COL]) number_of_differences.drop("samples", axis=0, inplace=True) @@ -203,7 +203,11 @@ def perform_exact_match(self): self.logger.debug("Adding the number of exact match differences to the summary table") self.summary_output = pd.concat([self.summary_output, number_of_differences], join="outer", axis=1) if file_number_of_differences is not None: - self.summary_output = pd.concat([self.summary_output, file_number_of_differences], join="outer", axis=1) + self.summary_output = self.summary_output.combine_first(file_number_of_differences) + self.summary_output[self.NUM_DIFFERENCES_COL] = self.summary_output[self.NUM_DIFFERENCES_COL].astype(int) + + # Ensure number of differences column is the last column + self.summary_output[self.NUM_DIFFERENCES_COL] = self.summary_output.pop(self.NUM_DIFFERENCES_COL) # get a table of self-other differences # also: temporarily drop the sample name column for comparison and then set it as the index for the output data frame @@ -245,7 +249,7 @@ def compare_files(self, file_df1, file_df2): # count as not matching if pair is missing comparison_df.loc[row, col] = False - number_of_differences = pd.DataFrame(columns=["Number of differences (exact match)"]) + number_of_differences = pd.DataFrame(columns=[self.NUM_DIFFERENCES_COL]) for col in comparison_df.columns: count = comparison_df[col].dropna().ne(True).sum() number_of_differences.loc[col] = count From f5e9392b8894390e7db143f2ee98177ea0525405 Mon Sep 17 00:00:00 2001 From: sam-baird Date: Sun, 10 Dec 2023 21:20:45 -0700 Subject: [PATCH 25/43] Refactor compare_files() and set instead of return The DataFrames created in compare_files() will be useful in other parts of the code. Particularly, for creating a table similar to the exact_differences_table and for implementing validation criteria for factors. Break calculating file number of differences in a new method. Also, set DataFrames as properties rather than return values to make the DataFrames more accessible. --- tests/test_validator.py | 30 +++++++++++++++++------------- theiavalidate/Validator.py | 27 ++++++++++++++++----------- 2 files changed, 33 insertions(+), 24 deletions(-) diff --git a/tests/test_validator.py b/tests/test_validator.py index 39b2b61..ed99cc7 100644 --- a/tests/test_validator.py +++ b/tests/test_validator.py @@ -109,7 +109,7 @@ def test_one_column_not_null(self): self.assertEqual(self.validator.file_columns, {"col1"}) -class TestCompareFiles(unittest.TestCase): +class TestFileNumberOfDifferences(unittest.TestCase): def setUp(self): self.validator = Validator(MockOptions()) self.validator.table1_files_dir = "tests/table1_files" @@ -125,12 +125,13 @@ def test_matching_files(self): "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-3.txt"], "col2": ["gs://match2-1.txt", "gs://match2-2.txt", "gs://match2-3.txt"] }) - observed = self.validator.compare_files(df1, df2) + self.validator.compare_files(df1, df2) + self.validator.set_file_number_of_differences() expected = pd.DataFrame({ - "Number of differences (exact match)": [0, 0] + self.validator.NUM_DIFFERENCES_COL: [0, 0] }) expected.index = ["col1", "col2"] - pd.testing.assert_frame_equal(observed, expected) + pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected) def test_mismatching_files(self): df1 = pd.DataFrame({ @@ -141,12 +142,13 @@ def test_mismatching_files(self): "col1": ["gs://mismatch1-1.txt", "gs://mismatch1-2.txt", "gs://mismatch1-3.txt"], "col2": ["gs://mismatch2-1.txt", "gs://mismatch2-2.txt", "gs://mismatch2-3.txt"] }) - observed = self.validator.compare_files(df1, df2) + self.validator.compare_files(df1, df2) + self.validator.set_file_number_of_differences() expected = pd.DataFrame({ - "Number of differences (exact match)": [3, 3] + self.validator.NUM_DIFFERENCES_COL: [3, 3] }) expected.index = ["col1", "col2"] - pd.testing.assert_frame_equal(observed, expected) + pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected) def test_mix_matching_files(self): df1 = pd.DataFrame({ @@ -157,12 +159,13 @@ def test_mix_matching_files(self): "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-3.txt"], "col2": ["gs://mismatch2-3.txt", "gs://match2-2.txt", "gs://mismatch2-3.txt"] }) - observed = self.validator.compare_files(df1, df2) + self.validator.compare_files(df1, df2) + self.validator.set_file_number_of_differences() expected = pd.DataFrame({ - "Number of differences (exact match)": [0, 2] + self.validator.NUM_DIFFERENCES_COL: [0, 2] }) expected.index = ["col1", "col2"] - pd.testing.assert_frame_equal(observed, expected) + pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected) def test_null_files(self): df1 = pd.DataFrame({ @@ -173,10 +176,11 @@ def test_null_files(self): "col1": [np.nan, "gs://match1-2.txt", "gs://match1-3.txt"], "col2": ["gs://match2-3.txt", np.nan, np.nan] }) - observed = self.validator.compare_files(df1, df2) + self.validator.compare_files(df1, df2) + self.validator.set_file_number_of_differences() expected = pd.DataFrame({ - "Number of differences (exact match)": [0, 3] + self.validator.NUM_DIFFERENCES_COL: [0, 3] }) expected.index = ["col1", "col2"] - pd.testing.assert_frame_equal(observed, expected) + pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected) diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index 778f3df..85f8f33 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -43,11 +43,17 @@ def __init__(self, options): self.validation_criteria = options.validation_criteria self.columns_to_compare = options.columns_to_compare self.columns_to_compare.append("samples") + self.file_columns = set() # columns that contain GCP URIs to files self.table1_files_dir = "table1_files" self.table2_files_dir = "table2_files" self.diff_dir = "file_diffs" + # DataFrames for holding file comparison results + self.file_exact_matches = None + self.file_exact_differences_table = None + self.file_number_of_differences = None + self.output_prefix = options.output_prefix self.na_values = options.na_values @@ -226,8 +232,8 @@ def perform_exact_match(self): def compare_files(self, file_df1, file_df2): - comparison_df = pd.DataFrame(index=file_df1.index, - columns=file_df1.columns) + self.file_exact_matches = pd.DataFrame(index=file_df1.index, + columns=file_df1.columns) for col in file_df1.columns: for row in file_df1.index: @@ -235,26 +241,25 @@ def compare_files(self, file_df1, file_df2): uri2 = file_df2.loc[row, col] if pd.isnull(uri1) and pd.isnull(uri2): # count two nulls as matching - comparison_df.loc[row, col] = True + self.file_exact_matches.loc[row, col] = True elif (not pd.isnull(uri1) and not pd.isnull(uri2)): file1 = os.path.join(self.table1_files_dir, uri1.removeprefix("gs://")) file2 = os.path.join(self.table2_files_dir, uri2.removeprefix("gs://")) is_match = filecmp.cmp(file1, file2) - comparison_df.loc[row, col] = is_match + self.file_exact_matches.loc[row, col] = is_match if not is_match: output_filename = f"{row}_{col}_diff.txt" output_path = os.path.join(self.diff_dir, output_filename) self.create_diff(file1, file2, output_path) else: # count as not matching if pair is missing - comparison_df.loc[row, col] = False - - number_of_differences = pd.DataFrame(columns=[self.NUM_DIFFERENCES_COL]) - for col in comparison_df.columns: - count = comparison_df[col].dropna().ne(True).sum() - number_of_differences.loc[col] = count + self.file_exact_matches.loc[row, col] = False - return number_of_differences + def set_file_number_of_differences(self): + self.file_number_of_differences = pd.DataFrame(columns=[self.NUM_DIFFERENCES_COL]) + for col in self.file_exact_matches.columns: + count = self.file_exact_matches[col].dropna().ne(True).sum() + self.file_number_of_differences.loc[col] = count def create_diff(self, file1, file2, output_path): # create unified diff From 3b768097836d36c7ab3174c3c7e1c3cf26b35379 Mon Sep 17 00:00:00 2001 From: sam-baird Date: Sun, 10 Dec 2023 21:34:34 -0700 Subject: [PATCH 26/43] Update perform_exact_match() to use new Validator props --- theiavalidate/Validator.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index 85f8f33..9cf26b8 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -192,7 +192,7 @@ def perform_exact_match(self): files_df2 = self.table2.set_index("samples") files_df1 = files_df1[list(self.file_columns)] files_df2 = files_df2[list(self.file_columns)] - file_number_of_differences = self.compare_files(files_df1, files_df2) + self.compare_files(files_df1, files_df2) else: table1 = self.table1 table2 = self.table2 @@ -208,8 +208,10 @@ def perform_exact_match(self): # add the number of differences to the summary output table self.logger.debug("Adding the number of exact match differences to the summary table") self.summary_output = pd.concat([self.summary_output, number_of_differences], join="outer", axis=1) - if file_number_of_differences is not None: - self.summary_output = self.summary_output.combine_first(file_number_of_differences) + + self.set_file_number_of_differences() + if self.file_number_of_differences is not None: + self.summary_output = self.summary_output.combine_first(self.file_number_of_differences) self.summary_output[self.NUM_DIFFERENCES_COL] = self.summary_output[self.NUM_DIFFERENCES_COL].astype(int) # Ensure number of differences column is the last column From 1358a75126a2b9d011a11eecef01572730a6e937 Mon Sep 17 00:00:00 2001 From: sam-baird Date: Sun, 10 Dec 2023 22:47:40 -0700 Subject: [PATCH 27/43] Add file URIs to exact_differences_table --- theiavalidate/Validator.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index 9cf26b8..55c47ba 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -51,7 +51,7 @@ def __init__(self, options): # DataFrames for holding file comparison results self.file_exact_matches = None - self.file_exact_differences_table = None + self.file_exact_differences = None self.file_number_of_differences = None self.output_prefix = options.output_prefix @@ -180,7 +180,6 @@ def determine_file_columns(self): def perform_exact_match(self): self.logger.debug("Performing an exact match and removing the sample name column") - file_number_of_differences = None if self.file_columns: # exclude file_columns for string comparison table1 = self.table1.drop(list(self.file_columns), axis=1) @@ -214,7 +213,7 @@ def perform_exact_match(self): self.summary_output = self.summary_output.combine_first(self.file_number_of_differences) self.summary_output[self.NUM_DIFFERENCES_COL] = self.summary_output[self.NUM_DIFFERENCES_COL].astype(int) - # Ensure number of differences column is the last column + # ensure number of differences column is the last column self.summary_output[self.NUM_DIFFERENCES_COL] = self.summary_output.pop(self.NUM_DIFFERENCES_COL) # get a table of self-other differences @@ -224,6 +223,10 @@ def perform_exact_match(self): # rename the self and other with the table names self.logger.debug("Renaming the self and other to be the table names") exact_differences_table.rename(columns={"self": self.table1_name, "other": self.table2_name}, level=-1, inplace=True) + + # add file exact differences + exact_differences_table = pd.concat([exact_differences_table, self.file_exact_differences], axis=1) + # replace matching values (NAs) with blanks self.logger.debug("Replacing all NA values with blanks") exact_differences_table.replace(np.nan, "", inplace=True) @@ -236,6 +239,13 @@ def perform_exact_match(self): def compare_files(self, file_df1, file_df2): self.file_exact_matches = pd.DataFrame(index=file_df1.index, columns=file_df1.columns) + + # create similar table to one generated by df1.compare(df2) + # for adding to the exact differences TSV + self.file_exact_differences = pd.DataFrame( + index=file_df1.index, + columns=pd.MultiIndex.from_product([file_df1.columns, [self.table1_name, self.table2_name]]) + ) for col in file_df1.columns: for row in file_df1.index: @@ -249,13 +259,22 @@ def compare_files(self, file_df1, file_df2): file2 = os.path.join(self.table2_files_dir, uri2.removeprefix("gs://")) is_match = filecmp.cmp(file1, file2) self.file_exact_matches.loc[row, col] = is_match - if not is_match: + if is_match: + # don't add URIs to exact differences table if files match + self.file_exact_differences.loc[row, (col, self.table1_name)] = np.nan + self.file_exact_differences.loc[row, (col, self.table2_name)] = np.nan + continue + else: output_filename = f"{row}_{col}_diff.txt" output_path = os.path.join(self.diff_dir, output_filename) self.create_diff(file1, file2, output_path) else: # count as not matching if pair is missing self.file_exact_matches.loc[row, col] = False + + self.file_exact_differences.loc[row, (col, self.table1_name)] = uri1 + self.file_exact_differences.loc[row, (col, self.table2_name)] = uri2 + def set_file_number_of_differences(self): self.file_number_of_differences = pd.DataFrame(columns=[self.NUM_DIFFERENCES_COL]) From 208bfcd7c48a7f3097da9047da83c5b1aac4289a Mon Sep 17 00:00:00 2001 From: sam-baird Date: Sun, 10 Dec 2023 23:54:49 -0700 Subject: [PATCH 28/43] Only run set_file_number_of_differences() if have files --- theiavalidate/Validator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index 55c47ba..89f9edb 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -192,6 +192,7 @@ def perform_exact_match(self): files_df1 = files_df1[list(self.file_columns)] files_df2 = files_df2[list(self.file_columns)] self.compare_files(files_df1, files_df2) + self.set_file_number_of_differences() else: table1 = self.table1 table2 = self.table2 @@ -208,7 +209,6 @@ def perform_exact_match(self): self.logger.debug("Adding the number of exact match differences to the summary table") self.summary_output = pd.concat([self.summary_output, number_of_differences], join="outer", axis=1) - self.set_file_number_of_differences() if self.file_number_of_differences is not None: self.summary_output = self.summary_output.combine_first(self.file_number_of_differences) self.summary_output[self.NUM_DIFFERENCES_COL] = self.summary_output[self.NUM_DIFFERENCES_COL].astype(int) @@ -322,7 +322,6 @@ def validate(self, column): self.validation_table[(column.name, self.table1_name)] = self.table1[column.name].where(exact_matches) self.validation_table[(column.name, self.table2_name)] = self.table2[column.name].where(exact_matches) - number_of_differences = exact_matches.sum() return ("EXACT", number_of_differences) elif column[0] == "IGNORE": # do not check; there are no failures (0) @@ -367,7 +366,8 @@ def run_validation_checks(self): self.logger.debug("Performing the validation checks") self.summary_output[["Validation Criteria", "Number of samples failing the validation criteria"]] = pd.DataFrame(self.validation_criteria.apply(lambda x: self.validate(x), result_type="expand")).transpose() - + print("validation criteria table:") + print(self.validation_table) # format the validation criteria differences table self.logger.debug("Formatting the validation criteria differences table") self.validation_table.set_index(self.table1["samples"], inplace=True) From 5dc584becc14fc53fdc9deaa1113c6d460b39500 Mon Sep 17 00:00:00 2001 From: sam-baird Date: Mon, 11 Dec 2023 07:16:55 +0000 Subject: [PATCH 29/43] Remove ununsed .transpose() --- theiavalidate/Validator.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index 89f9edb..6bb615a 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -367,12 +367,10 @@ def run_validation_checks(self): self.logger.debug("Performing the validation checks") self.summary_output[["Validation Criteria", "Number of samples failing the validation criteria"]] = pd.DataFrame(self.validation_criteria.apply(lambda x: self.validate(x), result_type="expand")).transpose() print("validation criteria table:") - print(self.validation_table) # format the validation criteria differences table self.logger.debug("Formatting the validation criteria differences table") self.validation_table.set_index(self.table1["samples"], inplace=True) self.validation_table.rename_axis(None, axis="index", inplace=True) - self.validation_table.transpose() self.validation_table.columns = pd.MultiIndex.from_tuples(self.validation_table.columns, names=["Column", "Table"]) From 745011781c2374e7e20d5362f8f9d279834d1780 Mon Sep 17 00:00:00 2001 From: sam-baird Date: Mon, 11 Dec 2023 14:12:14 -0700 Subject: [PATCH 30/43] Implement EXACT and IGNORE validation for file comparisons --- theiavalidate/Validator.py | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index 6bb615a..2041c9e 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -53,6 +53,7 @@ def __init__(self, options): self.file_exact_matches = None self.file_exact_differences = None self.file_number_of_differences = None + self.file_validations = None self.output_prefix = options.output_prefix self.na_values = options.na_values @@ -274,7 +275,8 @@ def compare_files(self, file_df1, file_df2): self.file_exact_differences.loc[row, (col, self.table1_name)] = uri1 self.file_exact_differences.loc[row, (col, self.table2_name)] = uri2 - + + self.file_exact_matches = self.file_exact_matches.astype(bool) def set_file_number_of_differences(self): self.file_number_of_differences = pd.DataFrame(columns=[self.NUM_DIFFERENCES_COL]) @@ -315,7 +317,11 @@ def percent_difference(self, value1, value2): def validate(self, column): if column.name in self.table1.columns: # check the data type of the validation criteria; based on its type, we can assume the comparison to perform - if pd.api.types.is_string_dtype(column) == True: # if a string + if column.name in self.file_columns: + # handle file validation separately from strings, floats + validation_criterion, number_of_differences = self.validate_files(column) + return (validation_criterion, number_of_differences) + elif pd.api.types.is_string_dtype(column) == True: # if a string if column[0] == "EXACT": # count the number of exact match failures/differences self.logger.debug("Performing an exact match on column {} and counting the number of differences".format(column.name)) exact_matches = ~self.table1[column.name].fillna("NULL").eq(self.table2[column.name].fillna("NULL")) @@ -357,16 +363,38 @@ def validate(self, column): else: self.logger.debug("Column {} was not found; indicating np.nan failures".format(column.name)) return ("COLUMN " + column.name + " NOT FOUND", np.nan) + + def validate_files(self, column): + validation_criterion = column.iloc[0] + if validation_criterion == "EXACT": + # we already know where the exact matches are from compare_files() + self.validation_table[(column.name, self.table1_name)] = (self.table1 + .set_index("samples")[column.name] + .where(~self.file_exact_matches[column.name]) + .reset_index()[column.name] + ) + self.validation_table[(column.name, self.table2_name)] = (self.table2 + .set_index("samples")[column.name] + .where(~self.file_exact_matches[column.name]) + .reset_index()[column.name] + ) + number_of_differences = self.file_number_of_differences.loc[column.name, self.NUM_DIFFERENCES_COL] + elif validation_criterion == "IGNORE": + number_of_differences = 0 + elif validation_criterion == "SET": + pass + else: + raise Exception("Only EXACT, IGNORE, and SET validation criteria implemented for file columns") + return (validation_criterion, number_of_differences) """ This function creates, formats, and runs the validation criteria checks - """ + """ def run_validation_checks(self): self.validation_table = pd.DataFrame() self.logger.debug("Performing the validation checks") self.summary_output[["Validation Criteria", "Number of samples failing the validation criteria"]] = pd.DataFrame(self.validation_criteria.apply(lambda x: self.validate(x), result_type="expand")).transpose() - print("validation criteria table:") # format the validation criteria differences table self.logger.debug("Formatting the validation criteria differences table") self.validation_table.set_index(self.table1["samples"], inplace=True) From 40eb2849a515a48b4ab4fbf6dd0dd543ba195efd Mon Sep 17 00:00:00 2001 From: sam-baird Date: Mon, 11 Dec 2023 18:04:01 -0700 Subject: [PATCH 31/43] Implement SET validation for file comparisons --- theiavalidate/Validator.py | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index 2041c9e..44645c5 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -382,11 +382,41 @@ def validate_files(self, column): elif validation_criterion == "IGNORE": number_of_differences = 0 elif validation_criterion == "SET": - pass + # for SET, sort lines in files then compare + concat_columns = pd.concat([self.table1[column.name], self.table2[column.name]], axis=1) + concat_columns = concat_columns.applymap( + lambda x: x.removeprefix("gs://") if pd.notnull(x) else x + ) + sorted_file_matches = concat_columns.apply(self.compare_sorted_files, axis=1) + self.validation_table[(column.name, self.table1_name)] = (self.table1[column.name] + .where(~sorted_file_matches) + ) + self.validation_table[(column.name, self.table2_name)] = (self.table2[column.name] + .where(~sorted_file_matches) + ) + number_of_differences = len(sorted_file_matches) - sorted_file_matches.sum() else: raise Exception("Only EXACT, IGNORE, and SET validation criteria implemented for file columns") return (validation_criterion, number_of_differences) + def compare_sorted_files(self, row): + file1 = row.iloc[0] + file2 = row.iloc[1] + if pd.isnull(file1) and pd.isnull(file2): + # count two nulls as matching + return True + if pd.notnull(file1) and pd.notnull(file2): + file1 = os.path.join(self.table1_files_dir, file1) + file2 = os.path.join(self.table2_files_dir, file2) + with open(file1, "r") as f1, open(file2, "r") as f2: + lines1 = f1.readlines() + lines2 = f2.readlines() + lines1.sort() + lines2.sort() + return lines1 == lines2 + # count null + not-null as mismatching + return False + """ This function creates, formats, and runs the validation criteria checks """ From eec95fa5b1efdb8f3d333a85011d86dba5685f1b Mon Sep 17 00:00:00 2001 From: sam-baird Date: Mon, 11 Dec 2023 19:39:46 -0700 Subject: [PATCH 32/43] Add more unit tests for compare_files() --- tests/test_validator.py | 191 +++++++++++++++++++++++++++++++++------- 1 file changed, 160 insertions(+), 31 deletions(-) diff --git a/tests/test_validator.py b/tests/test_validator.py index ed99cc7..610ab41 100644 --- a/tests/test_validator.py +++ b/tests/test_validator.py @@ -96,6 +96,18 @@ def test_one_column_null(self): self.run_determine_file_columns(data1, data2) self.assertEqual(self.validator.file_columns, {"col1", "col2"}) + def test_mixed_nulls(self): + data1 = { + "col1": ["gs://foo", "gs://foo", np.nan], + "col2": ["gs://x", "gs://y", np.nan] + } + data2 = { + "col1": ["gs://eggs", np.nan, np.nan], + "col2": [np.nan, "gs://b", np.nan] + } + self.run_determine_file_columns(data1, data2) + self.assertEqual(self.validator.file_columns, {"col1", "col2"}) + def test_one_column_not_null(self): data1 = { "col1": ["gs://foo", "gs://bar", "gs://baz"], @@ -109,14 +121,19 @@ def test_one_column_not_null(self): self.assertEqual(self.validator.file_columns, {"col1"}) -class TestFileNumberOfDifferences(unittest.TestCase): +class TestCompareFiles(unittest.TestCase): + SAMPLES_INDEX = ["sample1", "sample2", "sample3"] + COLUMNS_INDEX = ["col1", "col2"] + def setUp(self): self.validator = Validator(MockOptions()) + self.validator.table1_name = "table1" + self.validator.table2_name = "table2" self.validator.table1_files_dir = "tests/table1_files" self.validator.table2_files_dir = "tests/table2_files" self.diff_dir = "/dev/null" - def test_matching_files(self): + def create_matching_files_tables(self): df1 = pd.DataFrame({ "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-3.txt"], "col2": ["gs://match2-1.txt", "gs://match2-2.txt", "gs://match2-3.txt"] @@ -125,15 +142,11 @@ def test_matching_files(self): "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-3.txt"], "col2": ["gs://match2-1.txt", "gs://match2-2.txt", "gs://match2-3.txt"] }) - self.validator.compare_files(df1, df2) - self.validator.set_file_number_of_differences() - expected = pd.DataFrame({ - self.validator.NUM_DIFFERENCES_COL: [0, 0] - }) - expected.index = ["col1", "col2"] - pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected) + for df in [df1, df2]: + df.index = self.SAMPLES_INDEX + return df1, df2 - def test_mismatching_files(self): + def create_mismatching_files_tables(self): df1 = pd.DataFrame({ "col1": ["gs://mismatch1-1.txt", "gs://mismatch1-2.txt", "gs://mismatch1-3.txt"], "col2": ["gs://mismatch2-1.txt", "gs://mismatch2-2.txt", "gs://mismatch2-3.txt"] @@ -142,45 +155,161 @@ def test_mismatching_files(self): "col1": ["gs://mismatch1-1.txt", "gs://mismatch1-2.txt", "gs://mismatch1-3.txt"], "col2": ["gs://mismatch2-1.txt", "gs://mismatch2-2.txt", "gs://mismatch2-3.txt"] }) - self.validator.compare_files(df1, df2) - self.validator.set_file_number_of_differences() - expected = pd.DataFrame({ - self.validator.NUM_DIFFERENCES_COL: [3, 3] - }) - expected.index = ["col1", "col2"] - pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected) - - def test_mix_matching_files(self): + for df in [df1, df2]: + df.index = self.SAMPLES_INDEX + return df1, df2 + + def create_mix_matching_files_tables(self): df1 = pd.DataFrame({ "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-3.txt"], - "col2": ["gs://mismatch2-3.txt", "gs://match2-2.txt", "gs://mismatch2-3.txt"] + "col2": ["gs://mismatch2-1.txt", "gs://match2-2.txt", "gs://mismatch2-3.txt"] }) df2 = pd.DataFrame({ "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-3.txt"], - "col2": ["gs://mismatch2-3.txt", "gs://match2-2.txt", "gs://mismatch2-3.txt"] + "col2": ["gs://mismatch2-1.txt", "gs://match2-2.txt", "gs://mismatch2-3.txt"] }) - self.validator.compare_files(df1, df2) - self.validator.set_file_number_of_differences() - expected = pd.DataFrame({ - self.validator.NUM_DIFFERENCES_COL: [0, 2] - }) - expected.index = ["col1", "col2"] - pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected) + for df in [df1, df2]: + df.index = self.SAMPLES_INDEX + return df1, df2 - def test_null_files(self): + def create_null_files_tables(self): df1 = pd.DataFrame({ "col1": [np.nan, "gs://match1-2.txt", "gs://match1-3.txt"], "col2": [np.nan, "gs://match2-2.txt", "gs://mismatch2-3.txt"] }) df2 = pd.DataFrame({ "col1": [np.nan, "gs://match1-2.txt", "gs://match1-3.txt"], - "col2": ["gs://match2-3.txt", np.nan, np.nan] + "col2": ["gs://match2-1.txt", np.nan, np.nan] + }) + for df in [df1, df2]: + df.index = self.SAMPLES_INDEX + return df1, df2 + + def test_matching_files_exact_matches(self): + df1, df2 = self.create_matching_files_tables() + self.validator.compare_files(df1, df2) + expected = pd.DataFrame({ + "col1": [True, True, True], + "col2": [True, True, True] + }) + expected.index = self.SAMPLES_INDEX + pd.testing.assert_frame_equal(self.validator.file_exact_matches, expected) + + def test_mismatching_files_exact_matches(self): + df1, df2 = self.create_mismatching_files_tables() + self.validator.compare_files(df1, df2) + expected = pd.DataFrame({ + "col1": [False, False, False], + "col2": [False, False, False] + }) + expected.index = self.SAMPLES_INDEX + pd.testing.assert_frame_equal(self.validator.file_exact_matches, expected) + + def test_mix_matching_files_exact_matches(self): + df1, df2 = self.create_mix_matching_files_tables() + self.validator.compare_files(df1, df2) + expected = pd.DataFrame({ + "col1": [True, True, True], + "col2": [False, True, False] }) + expected.index = self.SAMPLES_INDEX + pd.testing.assert_frame_equal(self.validator.file_exact_matches, expected) + + def test_null_files_exact_matches(self): + df1, df2 = self.create_null_files_tables() + self.validator.compare_files(df1, df2) + expected = pd.DataFrame({ + "col1": [True, True, True], + "col2": [False, False, False] + }) + expected.index = self.SAMPLES_INDEX + pd.testing.assert_frame_equal(self.validator.file_exact_matches, expected) + + def test_null_files_number_of_differences(self): + df1, df2 = self.create_null_files_tables() self.validator.compare_files(df1, df2) self.validator.set_file_number_of_differences() expected = pd.DataFrame({ self.validator.NUM_DIFFERENCES_COL: [0, 3] }) - expected.index = ["col1", "col2"] + expected.index = self.COLUMNS_INDEX + pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected) + + def test_mismatching_files_number_of_differences(self): + df1, df2 = self.create_mismatching_files_tables() + self.validator.compare_files(df1, df2) + self.validator.set_file_number_of_differences() + expected = pd.DataFrame({ + self.validator.NUM_DIFFERENCES_COL: [3, 3] + }) + expected.index = self.COLUMNS_INDEX + pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected) + + def test_mix_matching_files_number_of_differences(self): + df1, df2 = self.create_mix_matching_files_tables() + self.validator.compare_files(df1, df2) + self.validator.set_file_number_of_differences() + expected = pd.DataFrame({ + self.validator.NUM_DIFFERENCES_COL: [0, 2] + }) + expected.index = self.COLUMNS_INDEX pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected) + def test_null_files_number_of_differences(self): + df1, df2 = self.create_null_files_tables() + self.validator.compare_files(df1, df2) + self.validator.set_file_number_of_differences() + expected = pd.DataFrame({ + self.validator.NUM_DIFFERENCES_COL: [0, 3] + }) + expected.index = self.COLUMNS_INDEX + pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected) + + def test_matching_files_exact_differences(self): + df1, df2 = self.create_matching_files_tables() + self.validator.compare_files(df1, df2) + expected = pd.DataFrame({ + ("col1", "table1"): [np.nan, np.nan, np.nan], + ("col1", "table2"): [np.nan, np.nan, np.nan], + ("col2", "table1"): [np.nan, np.nan, np.nan], + ("col2", "table2"): [np.nan, np.nan, np.nan] + }).astype(object) + expected.index = self.SAMPLES_INDEX + pd.testing.assert_frame_equal(self.validator.file_exact_differences, expected) + + def test_mismatching_files_exact_differences(self): + df1, df2 = self.create_mismatching_files_tables() + self.validator.compare_files(df1, df2) + expected = pd.DataFrame({ + ("col1", "table1"): ["gs://mismatch1-1.txt", "gs://mismatch1-2.txt", "gs://mismatch1-3.txt"], + ("col1", "table2"): ["gs://mismatch1-1.txt", "gs://mismatch1-2.txt", "gs://mismatch1-3.txt"], + ("col2", "table1"): ["gs://mismatch2-1.txt", "gs://mismatch2-2.txt", "gs://mismatch2-3.txt"], + ("col2", "table2"): ["gs://mismatch2-1.txt", "gs://mismatch2-2.txt", "gs://mismatch2-3.txt"] + }).astype(object) + expected.index = self.SAMPLES_INDEX + pd.testing.assert_frame_equal(self.validator.file_exact_differences, expected) + + def test_mix_matching_files_exact_differences(self): + df1, df2 = self.create_mix_matching_files_tables() + self.validator.compare_files(df1, df2) + expected = pd.DataFrame({ + ("col1", "table1"): [np.nan, np.nan, np.nan], + ("col1", "table2"): [np.nan, np.nan, np.nan], + ("col2", "table1"): ["gs://mismatch2-1.txt", np.nan, "gs://mismatch2-3.txt"], + ("col2", "table2"): ["gs://mismatch2-1.txt", np.nan, "gs://mismatch2-3.txt"] + }).astype(object) + expected.index = self.SAMPLES_INDEX + pd.testing.assert_frame_equal(self.validator.file_exact_differences, expected) + + def test_null_files_exact_differences(self): + df1, df2 = self.create_mix_matching_files_tables() + self.validator.compare_files(df2, df2) + expected = pd.DataFrame({ + ("col1", "table1"): [np.nan, np.nan, np.nan], + ("col1", "table2"): [np.nan, np.nan, np.nan], + ("col2", "table1"): [np.nan, "gs://match2-2.txt", "gs://mismatch2-3.txt"], + ("col2", "table2"): ["gs://match2-1.txt", np.nan, np.nan] + }).astype(object) + expected.index = self.SAMPLES_INDEX + pd.testing.assert_frame_equal(self.validator.file_exact_differences, expected) + From 5a8209792b89c52ce3b59b2340b91fbf6c14bb33 Mon Sep 17 00:00:00 2001 From: sam-baird Date: Mon, 11 Dec 2023 21:04:11 -0700 Subject: [PATCH 33/43] Add unit tests for validate_files() --- tests/test_validator.py | 109 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 107 insertions(+), 2 deletions(-) diff --git a/tests/test_validator.py b/tests/test_validator.py index 610ab41..ba51957 100644 --- a/tests/test_validator.py +++ b/tests/test_validator.py @@ -302,8 +302,8 @@ def test_mix_matching_files_exact_differences(self): pd.testing.assert_frame_equal(self.validator.file_exact_differences, expected) def test_null_files_exact_differences(self): - df1, df2 = self.create_mix_matching_files_tables() - self.validator.compare_files(df2, df2) + df1, df2 = self.create_null_files_tables() + self.validator.compare_files(df1, df2) expected = pd.DataFrame({ ("col1", "table1"): [np.nan, np.nan, np.nan], ("col1", "table2"): [np.nan, np.nan, np.nan], @@ -313,3 +313,108 @@ def test_null_files_exact_differences(self): expected.index = self.SAMPLES_INDEX pd.testing.assert_frame_equal(self.validator.file_exact_differences, expected) +class TestValidateFiles(unittest.TestCase): + SAMPLES_INDEX = ["sample1", "sample2", "sample3", "sample4", "sample5"] + COLUMNS_INDEX = ["exact_col", "set_col", "ignore_col", "float_col"] + TABLE1_FILE_URIS = ["gs://match1-1.txt", "gs://mismatch1-2.txt", "gs://match1-3", "gs://sortmatch1-1.txt", np.nan] + TABLE2_FILE_URIS = ["gs://match1-1.txt", "gs://mismatch1-2.txt", np.nan, "gs://sortmatch1-1.txt", np.nan] + EXACT_MATCHES_MASK = [True, False, False, False, True] + + def setUp(self): + self.validator = Validator(MockOptions()) + self.validator.validation_criteria = pd.DataFrame({ + "exact_col": "EXACT", + "set_col": "SET", + "ignore_col": "IGNORE", + "float_col": 0.1, + }, index=["column", "criteria"] + ) + + # This numeric convertion is done in Validator init method + self.validator.validation_criteria = (self.validator.validation_criteria + .apply(pd.to_numeric, errors="ignore").convert_dtypes() + ) + + self.validator.table1 = pd.DataFrame({ + "samples": self.SAMPLES_INDEX, + "exact_col": self.TABLE1_FILE_URIS, + "set_col": self.TABLE1_FILE_URIS, + "ignore_col": self.TABLE1_FILE_URIS, + "float_col": self.TABLE1_FILE_URIS # uh-oh + }) + + self.validator.table2 = pd.DataFrame({ + "samples": self.SAMPLES_INDEX, + "exact_col": self.TABLE2_FILE_URIS, + "set_col": self.TABLE2_FILE_URIS, + "ignore_col": self.TABLE2_FILE_URIS, + "float_col": self.TABLE2_FILE_URIS # uh-oh + }) + + self.validator.file_exact_matches = pd.DataFrame({ + "exact_col": self.EXACT_MATCHES_MASK, + "set_col": self.EXACT_MATCHES_MASK, + "ignore_col": self.EXACT_MATCHES_MASK, + "float_col": self.EXACT_MATCHES_MASK + }) + self.validator.file_exact_matches.index = self.SAMPLES_INDEX + + self.validator.file_number_of_differences = pd.DataFrame({ + self.validator.NUM_DIFFERENCES_COL: [3, 3, 3, 3] + }) + self.validator.file_number_of_differences.index = self.COLUMNS_INDEX + + self.validator.table1_name = "table1" + self.validator.table2_name = "table2" + self.validator.table1_files_dir = "tests/table1_files" + self.validator.table2_files_dir = "tests/table2_files" + + self.validator.validation_table = pd.DataFrame() + + def test_validate_exact(self): + column = self.validator.validation_criteria["exact_col"] + observed = self.validator.validate_files(column) + expected = ("EXACT", 3) + self.assertEqual(observed, expected) + + def test_validate_ignore(self): + column = self.validator.validation_criteria["ignore_col"] + observed = self.validator.validate_files(column) + expected = ("IGNORE", 0) + self.assertEqual(observed, expected) + + def test_validate_set(self): + column = self.validator.validation_criteria["set_col"] + observed = self.validator.validate_files(column) + expected = ("SET", 2) # sorted file should not count as different + self.assertEqual(observed, expected) + + def test_validate_float(self): + # have not implemented % difference for files + column = self.validator.validation_criteria["set_col"] + self.assertRaises(Exception, self.validator.validate_files(column)) + + def test_validation_table(self): + for column in ["exact_col", "set_col", "ignore_col"]: + column = self.validator.validation_criteria[column] + self.validator.validate_files(column) + + # these steps are done in run_validation_checks + self.validator.validation_table.set_index(self.validator.table1["samples"], inplace=True) + self.validator.validation_table.rename_axis(None, axis="index", inplace=True) + self.validator.validation_table.columns = pd.MultiIndex.from_tuples( + self.validator.validation_table.columns, names=["Column", "Table"] + ) + + # exact_col should count sortmatch file as a mismatch, while set_col should + # count it as a match. No column should be generated for ignore_col + expected = pd.DataFrame({ + ("exact_col", "table1"): [np.nan, "gs://mismatch1-2.txt", "gs://match1-3", "gs://sortmatch1-1.txt", np.nan], + ("exact_col", "table2"): [np.nan, "gs://mismatch1-2.txt", np.nan, "gs://sortmatch1-1.txt", np.nan], + ("set_col", "table1"): [np.nan, "gs://mismatch1-2.txt", "gs://match1-3", np.nan, np.nan], + ("set_col", "table2"): [np.nan, "gs://mismatch1-2.txt", np.nan, np.nan, np.nan], + }) + expected.set_index(self.validator.table1["samples"], inplace=True) + expected.rename_axis(None, axis="index", inplace=True) + expected.columns = pd.MultiIndex.from_tuples(expected.columns, names=["Column", "Table"] + pd.testing.assert_frame_equal(self.validator.validation_table, expected) From 2d31b845dc815ddf07a2844814043eb23e55ce7e Mon Sep 17 00:00:00 2001 From: sam-baird Date: Mon, 11 Dec 2023 21:17:51 -0700 Subject: [PATCH 34/43] Reformat and add more documentation to testing file --- tests/test_validator.py | 197 ++++++++++++++++++++++------------------ 1 file changed, 108 insertions(+), 89 deletions(-) diff --git a/tests/test_validator.py b/tests/test_validator.py index ba51957..baee834 100644 --- a/tests/test_validator.py +++ b/tests/test_validator.py @@ -32,96 +32,105 @@ def __init__(self, options_dict=None): class TestDetermineFileColumns(unittest.TestCase): - def setUp(self): - self.validator = Validator(MockOptions()) - - def run_determine_file_columns(self, data1, data2): - self.validator.table1 = pd.DataFrame(data1) - self.validator.table2 = pd.DataFrame(data2) - self.validator.determine_file_columns() - - def test_no_file_columns(self): - data = { - "col1": [1, 2, 3], - "col2": ["foo", "bar", "baz"] - } - self.run_determine_file_columns(data, data) - self.assertEqual(len(self.validator.file_columns), 0) - - def test_some_file_columns(self): - data1 = { - "col1": [1, 2, 3], - "col2": ["gs://foo", "gs://bar", "gs://baz"] - } - data2 = { - "col1": [1, 2, 3], - "col2": ["gs://eggs", "gs://spam", "gs://monty"] - } - self.run_determine_file_columns(data1, data2) - self.assertEqual(self.validator.file_columns, {"col2"}) - - def test_missing_uri(self): - data1 = { - "col1": [1, 2, 3], - "col2": ["gs://foo", np.nan, "gs://baz"] - } - data2 = { - "col1": [1, 2, 3], - "col2": ["gs://eggs", "gs://spam", "gs://monty"] - } - self.run_determine_file_columns(data1, data2) - self.assertEqual(self.validator.file_columns, {"col2"}) - - def test_both_columns_null(self): - data1 = { - "col1": ["gs://foo", "gs://bar", "gs://baz"], - "col2": [np.nan, np.nan, np.nan] - } - data2 = { - "col1": ["gs://eggs", "gs://spam", "gs://monty"], - "col2": [np.nan, np.nan, np.nan] - } - self.run_determine_file_columns(data1, data2) - self.assertEqual(self.validator.file_columns, {"col1"}) - - def test_one_column_null(self): - data1 = { - "col1": ["gs://foo", "gs://bar", "gs://baz"], - "col2": ["gs://x", "gs://y", "gs://z"] - } - data2 = { - "col1": ["gs://eggs", "gs://spam", "gs://monty"], - "col2": [np.nan, np.nan, np.nan] - } - self.run_determine_file_columns(data1, data2) - self.assertEqual(self.validator.file_columns, {"col1", "col2"}) - - def test_mixed_nulls(self): - data1 = { - "col1": ["gs://foo", "gs://foo", np.nan], - "col2": ["gs://x", "gs://y", np.nan] - } - data2 = { - "col1": ["gs://eggs", np.nan, np.nan], - "col2": [np.nan, "gs://b", np.nan] - } - self.run_determine_file_columns(data1, data2) - self.assertEqual(self.validator.file_columns, {"col1", "col2"}) - - def test_one_column_not_null(self): - data1 = { - "col1": ["gs://foo", "gs://bar", "gs://baz"], - "col2": ["gs://x", "gs://y", "gs://z"] - } - data2 = { - "col1": ["gs://eggs", "gs://spam", "gs://monty"], - "col2": [1, 2, 3] - } - self.run_determine_file_columns(data1, data2) - self.assertEqual(self.validator.file_columns, {"col1"}) + """ + Test detecting which columns in the tables correspond to files. If there is at + least one URI and no other values except np.nan in both tables, we should + treat the column as a "file_column". + """ + def setUp(self): + self.validator = Validator(MockOptions()) + + def run_determine_file_columns(self, data1, data2): + self.validator.table1 = pd.DataFrame(data1) + self.validator.table2 = pd.DataFrame(data2) + self.validator.determine_file_columns() + + def test_no_file_columns(self): + data = { + "col1": [1, 2, 3], + "col2": ["foo", "bar", "baz"] + } + self.run_determine_file_columns(data, data) + self.assertEqual(len(self.validator.file_columns), 0) + + def test_some_file_columns(self): + data1 = { + "col1": [1, 2, 3], + "col2": ["gs://foo", "gs://bar", "gs://baz"] + } + data2 = { + "col1": [1, 2, 3], + "col2": ["gs://eggs", "gs://spam", "gs://monty"] + } + self.run_determine_file_columns(data1, data2) + self.assertEqual(self.validator.file_columns, {"col2"}) + + def test_missing_uri(self): + data1 = { + "col1": [1, 2, 3], + "col2": ["gs://foo", np.nan, "gs://baz"] + } + data2 = { + "col1": [1, 2, 3], + "col2": ["gs://eggs", "gs://spam", "gs://monty"] + } + self.run_determine_file_columns(data1, data2) + self.assertEqual(self.validator.file_columns, {"col2"}) + + def test_both_columns_null(self): + data1 = { + "col1": ["gs://foo", "gs://bar", "gs://baz"], + "col2": [np.nan, np.nan, np.nan] + } + data2 = { + "col1": ["gs://eggs", "gs://spam", "gs://monty"], + "col2": [np.nan, np.nan, np.nan] + } + self.run_determine_file_columns(data1, data2) + self.assertEqual(self.validator.file_columns, {"col1"}) + + def test_one_column_null(self): + data1 = { + "col1": ["gs://foo", "gs://bar", "gs://baz"], + "col2": ["gs://x", "gs://y", "gs://z"] + } + data2 = { + "col1": ["gs://eggs", "gs://spam", "gs://monty"], + "col2": [np.nan, np.nan, np.nan] + } + self.run_determine_file_columns(data1, data2) + self.assertEqual(self.validator.file_columns, {"col1", "col2"}) + + def test_mixed_nulls(self): + data1 = { + "col1": ["gs://foo", "gs://foo", np.nan], + "col2": ["gs://x", "gs://y", np.nan] + } + data2 = { + "col1": ["gs://eggs", np.nan, np.nan], + "col2": [np.nan, "gs://b", np.nan] + } + self.run_determine_file_columns(data1, data2) + self.assertEqual(self.validator.file_columns, {"col1", "col2"}) + + def test_one_column_not_null(self): + data1 = { + "col1": ["gs://foo", "gs://bar", "gs://baz"], + "col2": ["gs://x", "gs://y", "gs://z"] + } + data2 = { + "col1": ["gs://eggs", "gs://spam", "gs://monty"], + "col2": [1, 2, 3] + } + self.run_determine_file_columns(data1, data2) + self.assertEqual(self.validator.file_columns, {"col1"}) class TestCompareFiles(unittest.TestCase): + """ + Test comparing files (exact match). Identical files or two np.nans + should count as an exact match, anything else should count as a mismatch. + """ SAMPLES_INDEX = ["sample1", "sample2", "sample3"] COLUMNS_INDEX = ["col1", "col2"] @@ -131,7 +140,7 @@ def setUp(self): self.validator.table2_name = "table2" self.validator.table1_files_dir = "tests/table1_files" self.validator.table2_files_dir = "tests/table2_files" - self.diff_dir = "/dev/null" + self.diff_dir = "/dev/null" # discard diff files def create_matching_files_tables(self): df1 = pd.DataFrame({ @@ -314,6 +323,12 @@ def test_null_files_exact_differences(self): pd.testing.assert_frame_equal(self.validator.file_exact_differences, expected) class TestValidateFiles(unittest.TestCase): + """ + Test comparing files using the validation criteria. EXACT follows the same + logic as compare_files(), SET should treat files as matching if after + sorting they are identical, IGNORE should "skip" the files. Other criteria + should result in an Exception. + """ SAMPLES_INDEX = ["sample1", "sample2", "sample3", "sample4", "sample5"] COLUMNS_INDEX = ["exact_col", "set_col", "ignore_col", "float_col"] TABLE1_FILE_URIS = ["gs://match1-1.txt", "gs://mismatch1-2.txt", "gs://match1-3", "gs://sortmatch1-1.txt", np.nan] @@ -335,6 +350,8 @@ def setUp(self): .apply(pd.to_numeric, errors="ignore").convert_dtypes() ) + # assign the same URIs to each column, will test that the validation + # results vary depending on the the validation criterion self.validator.table1 = pd.DataFrame({ "samples": self.SAMPLES_INDEX, "exact_col": self.TABLE1_FILE_URIS, @@ -351,6 +368,7 @@ def setUp(self): "float_col": self.TABLE2_FILE_URIS # uh-oh }) + # the exact matches will be identical regardless of validation criteria self.validator.file_exact_matches = pd.DataFrame({ "exact_col": self.EXACT_MATCHES_MASK, "set_col": self.EXACT_MATCHES_MASK, @@ -407,7 +425,8 @@ def test_validation_table(self): ) # exact_col should count sortmatch file as a mismatch, while set_col should - # count it as a match. No column should be generated for ignore_col + # count it as a match. + # no column should be generated for ignore_col. expected = pd.DataFrame({ ("exact_col", "table1"): [np.nan, "gs://mismatch1-2.txt", "gs://match1-3", "gs://sortmatch1-1.txt", np.nan], ("exact_col", "table2"): [np.nan, "gs://mismatch1-2.txt", np.nan, "gs://sortmatch1-1.txt", np.nan], From cd3c67f1906daed4022e6fe1793fa04fa91a6b5b Mon Sep 17 00:00:00 2001 From: sam-baird Date: Mon, 11 Dec 2023 21:32:40 -0700 Subject: [PATCH 35/43] Reformat/add docstrings --- theiavalidate/Validator.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index 44645c5..61a6ae3 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -151,12 +151,11 @@ def count_populated_cells(self): self.summary_output = pd.concat([table1_populated_rows, table2_populated_rows], join="outer", axis=1) - """ - This function determines columns with GCP URIs for file comparisons so that - they are excluded from regular comparisons and instead use filecmp to compare - the downloaded files. - """ def determine_file_columns(self): + """ + Determine the columns with GCP URIs so that they are excluded from regular + comparisons and instead file comparisons are performed. + """ for df in [self.table1, self.table2]: # select columns with at least one GCP URI among nulls file_columns = df.columns[(df.apply(lambda x: x.astype(str).str.startswith("gs://") @@ -238,6 +237,9 @@ def perform_exact_match(self): def compare_files(self, file_df1, file_df2): + """ + Determine which pairs of files referenced in the DataFrames are identical + """ self.file_exact_matches = pd.DataFrame(index=file_df1.index, columns=file_df1.columns) @@ -275,7 +277,7 @@ def compare_files(self, file_df1, file_df2): self.file_exact_differences.loc[row, (col, self.table1_name)] = uri1 self.file_exact_differences.loc[row, (col, self.table2_name)] = uri2 - + self.file_exact_matches = self.file_exact_matches.astype(bool) def set_file_number_of_differences(self): @@ -365,6 +367,11 @@ def validate(self, column): return ("COLUMN " + column.name + " NOT FOUND", np.nan) def validate_files(self, column): + """ + Perform validation of matching file contents based on which of EXACT, + IGNORE, or SET is assigned as the column's validation criterion. For SET, + sort lines in file before comparing. + """ validation_criterion = column.iloc[0] if validation_criterion == "EXACT": # we already know where the exact matches are from compare_files() @@ -400,6 +407,9 @@ def validate_files(self, column): return (validation_criterion, number_of_differences) def compare_sorted_files(self, row): + """ + Compare two files sorted alphabetically by line for a pair of file URIs. + """ file1 = row.iloc[0] file2 = row.iloc[1] if pd.isnull(file1) and pd.isnull(file2): @@ -534,6 +544,9 @@ def compare(self): self.logger.info("Done!") def localize_files(row, directory): + """ + Download files to compare from GCP. + """ for value in row: if isinstance(value, str) and value.startswith("gs://"): # copy files to to compare_files/ directory From c292900938f3294e73d4259fbcd0c2edcbc82abe Mon Sep 17 00:00:00 2001 From: sam-baird Date: Mon, 11 Dec 2023 21:37:03 -0700 Subject: [PATCH 36/43] Remove unnecessary argument to MockOptions --- tests/test_validator.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tests/test_validator.py b/tests/test_validator.py index baee834..69e2fd4 100644 --- a/tests/test_validator.py +++ b/tests/test_validator.py @@ -1,3 +1,6 @@ +# To run these unit tests, run "python3 -m unittest" from the root of the +# project directory. + from theiavalidate.Validator import Validator from theiavalidate.theiavalidate import DEFAULT_NA_VALUES @@ -12,8 +15,7 @@ class MockOptions: the argparse package, but here we will simulate this object with a different class to more easily create Validator objects. """ - def __init__(self, options_dict=None): - # defaults + def __init__(self): self.table1 = None self.table2 = None self.version = None @@ -25,10 +27,6 @@ def __init__(self, options_dict=None): self.verbose = False self.debug = False - # overwrite defaults with options_dict - if options_dict is not None: - for key, value in options_dict.items(): - setattr(self, key, value) class TestDetermineFileColumns(unittest.TestCase): @@ -435,5 +433,5 @@ def test_validation_table(self): }) expected.set_index(self.validator.table1["samples"], inplace=True) expected.rename_axis(None, axis="index", inplace=True) - expected.columns = pd.MultiIndex.from_tuples(expected.columns, names=["Column", "Table"] + expected.columns = pd.MultiIndex.from_tuples(expected.columns, names=["Column", "Table"]) pd.testing.assert_frame_equal(self.validator.validation_table, expected) From 0354c743ff95f079f990aca6b35e2a9cde4792b0 Mon Sep 17 00:00:00 2001 From: sam-baird Date: Mon, 11 Dec 2023 21:38:17 -0700 Subject: [PATCH 37/43] Adjust whitespace --- tests/test_validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_validator.py b/tests/test_validator.py index 69e2fd4..453f79f 100644 --- a/tests/test_validator.py +++ b/tests/test_validator.py @@ -8,6 +8,7 @@ import pandas as pd import unittest + class MockOptions: """ Mock the "options" object that is created in theiavalidate.py. In @@ -28,7 +29,6 @@ def __init__(self): self.debug = False - class TestDetermineFileColumns(unittest.TestCase): """ Test detecting which columns in the tables correspond to files. If there is at From 3422a73869972b7e1910a59f3d8d7b310c25f99f Mon Sep 17 00:00:00 2001 From: sam-baird Date: Mon, 11 Dec 2023 21:39:32 -0700 Subject: [PATCH 38/43] Add sortmatch files for testing SET criterion --- file_diffs/sample1_col1_diff.txt | 5 +++++ file_diffs/sample1_col2_diff.txt | 3 +++ file_diffs/sample2_col1_diff.txt | 3 +++ file_diffs/sample2_col2_diff.txt | 3 +++ file_diffs/sample3_col1_diff.txt | 5 +++++ file_diffs/sample3_col2_diff.txt | 3 +++ tests/table1_files/sortmatch1-1.txt | 3 +++ tests/table2_files/sortmatch1-1.txt | 3 +++ 8 files changed, 28 insertions(+) create mode 100644 file_diffs/sample1_col1_diff.txt create mode 100644 file_diffs/sample1_col2_diff.txt create mode 100644 file_diffs/sample2_col1_diff.txt create mode 100644 file_diffs/sample2_col2_diff.txt create mode 100644 file_diffs/sample3_col1_diff.txt create mode 100644 file_diffs/sample3_col2_diff.txt create mode 100644 tests/table1_files/sortmatch1-1.txt create mode 100644 tests/table2_files/sortmatch1-1.txt diff --git a/file_diffs/sample1_col1_diff.txt b/file_diffs/sample1_col1_diff.txt new file mode 100644 index 0000000..a0a1ba9 --- /dev/null +++ b/file_diffs/sample1_col1_diff.txt @@ -0,0 +1,5 @@ +--- tests/table1_files/mismatch1-1.txt+++ tests/table2_files/mismatch1-1.txt@@ -1,3 +1,3 @@-foo +-bar ++eggs ++spam + diff --git a/file_diffs/sample1_col2_diff.txt b/file_diffs/sample1_col2_diff.txt new file mode 100644 index 0000000..4ca418d --- /dev/null +++ b/file_diffs/sample1_col2_diff.txt @@ -0,0 +1,3 @@ +--- tests/table1_files/mismatch2-1.txt+++ tests/table2_files/mismatch2-1.txt@@ -1,2 +1 @@-1 2 3 +- ++1 2 diff --git a/file_diffs/sample2_col1_diff.txt b/file_diffs/sample2_col1_diff.txt new file mode 100644 index 0000000..7f6663f --- /dev/null +++ b/file_diffs/sample2_col1_diff.txt @@ -0,0 +1,3 @@ +--- tests/table1_files/mismatch1-2.txt+++ tests/table2_files/mismatch1-2.txt@@ -1,2 +1,3 @@+foo + foo + diff --git a/file_diffs/sample2_col2_diff.txt b/file_diffs/sample2_col2_diff.txt new file mode 100644 index 0000000..e08f21a --- /dev/null +++ b/file_diffs/sample2_col2_diff.txt @@ -0,0 +1,3 @@ +--- tests/table1_files/mismatch2-2.txt+++ tests/table2_files/mismatch2-2.txt@@ -1,2 +1,2 @@-5 6 6 ++4 5 6 + diff --git a/file_diffs/sample3_col1_diff.txt b/file_diffs/sample3_col1_diff.txt new file mode 100644 index 0000000..89a7f0c --- /dev/null +++ b/file_diffs/sample3_col1_diff.txt @@ -0,0 +1,5 @@ +--- tests/table1_files/mismatch1-3.txt+++ tests/table2_files/mismatch1-3.txt@@ -1,4 +1,3 @@+spam + +-spam + eggs +- diff --git a/file_diffs/sample3_col2_diff.txt b/file_diffs/sample3_col2_diff.txt new file mode 100644 index 0000000..852e058 --- /dev/null +++ b/file_diffs/sample3_col2_diff.txt @@ -0,0 +1,3 @@ +--- tests/table1_files/mismatch2-3.txt+++ tests/table2_files/mismatch2-3.txt@@ -1,2 +1 @@-hello, world +- ++hello, world! diff --git a/tests/table1_files/sortmatch1-1.txt b/tests/table1_files/sortmatch1-1.txt new file mode 100644 index 0000000..86e041d --- /dev/null +++ b/tests/table1_files/sortmatch1-1.txt @@ -0,0 +1,3 @@ +foo +bar +baz diff --git a/tests/table2_files/sortmatch1-1.txt b/tests/table2_files/sortmatch1-1.txt new file mode 100644 index 0000000..4fc6926 --- /dev/null +++ b/tests/table2_files/sortmatch1-1.txt @@ -0,0 +1,3 @@ +baz +foo +bar From 5a06a70ccf83fede28686300a1fdc67f1637c0c9 Mon Sep 17 00:00:00 2001 From: sam-baird Date: Mon, 11 Dec 2023 21:43:42 -0700 Subject: [PATCH 39/43] Remove ignored files --- .devcontainer/devcontainer.json | 26 -------------------------- .gitignore | 4 +++- file_diffs/0_col1_diff.txt | 5 ----- file_diffs/0_col2_diff.txt | 3 --- file_diffs/1_col1_diff.txt | 3 --- file_diffs/1_col2_diff.txt | 3 --- file_diffs/2_col1_diff.txt | 5 ----- file_diffs/2_col2_diff.txt | 3 --- file_diffs/sample1_col1_diff.txt | 5 ----- file_diffs/sample1_col2_diff.txt | 3 --- file_diffs/sample2_col1_diff.txt | 3 --- file_diffs/sample2_col2_diff.txt | 3 --- file_diffs/sample3_col1_diff.txt | 5 ----- file_diffs/sample3_col2_diff.txt | 3 --- 14 files changed, 3 insertions(+), 71 deletions(-) delete mode 100644 .devcontainer/devcontainer.json delete mode 100644 file_diffs/0_col1_diff.txt delete mode 100644 file_diffs/0_col2_diff.txt delete mode 100644 file_diffs/1_col1_diff.txt delete mode 100644 file_diffs/1_col2_diff.txt delete mode 100644 file_diffs/2_col1_diff.txt delete mode 100644 file_diffs/2_col2_diff.txt delete mode 100644 file_diffs/sample1_col1_diff.txt delete mode 100644 file_diffs/sample1_col2_diff.txt delete mode 100644 file_diffs/sample2_col1_diff.txt delete mode 100644 file_diffs/sample2_col2_diff.txt delete mode 100644 file_diffs/sample3_col1_diff.txt delete mode 100644 file_diffs/sample3_col2_diff.txt diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json deleted file mode 100644 index 8d96444..0000000 --- a/.devcontainer/devcontainer.json +++ /dev/null @@ -1,26 +0,0 @@ -// For format details, see https://aka.ms/devcontainer.json. For config options, see the -// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile -{ - "name": "Existing Dockerfile", - "build": { - // Sets the run context to one level up instead of the .devcontainer folder. - "context": "..", - // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename. - "dockerfile": "../Dockerfile" - } - - // Features to add to the dev container. More info: https://containers.dev/features. - // "features": {}, - - // Use 'forwardPorts' to make a list of ports inside the container available locally. - // "forwardPorts": [], - - // Uncomment the next line to run commands after the container is created. - // "postCreateCommand": "cat /etc/os-release", - - // Configure tool-specific properties. - // "customizations": {}, - - // Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root. - // "remoteUser": "devcontainer" -} diff --git a/.gitignore b/.gitignore index 7de3fa6..08a404d 100644 --- a/.gitignore +++ b/.gitignore @@ -161,6 +161,8 @@ cython_debug/ # IDE .vscode/ +.devcontainer # testing files -sandbox/ \ No newline at end of file +sandbox/ +file_diffs/ \ No newline at end of file diff --git a/file_diffs/0_col1_diff.txt b/file_diffs/0_col1_diff.txt deleted file mode 100644 index a0a1ba9..0000000 --- a/file_diffs/0_col1_diff.txt +++ /dev/null @@ -1,5 +0,0 @@ ---- tests/table1_files/mismatch1-1.txt+++ tests/table2_files/mismatch1-1.txt@@ -1,3 +1,3 @@-foo --bar -+eggs -+spam - diff --git a/file_diffs/0_col2_diff.txt b/file_diffs/0_col2_diff.txt deleted file mode 100644 index 852e058..0000000 --- a/file_diffs/0_col2_diff.txt +++ /dev/null @@ -1,3 +0,0 @@ ---- tests/table1_files/mismatch2-3.txt+++ tests/table2_files/mismatch2-3.txt@@ -1,2 +1 @@-hello, world -- -+hello, world! diff --git a/file_diffs/1_col1_diff.txt b/file_diffs/1_col1_diff.txt deleted file mode 100644 index 7f6663f..0000000 --- a/file_diffs/1_col1_diff.txt +++ /dev/null @@ -1,3 +0,0 @@ ---- tests/table1_files/mismatch1-2.txt+++ tests/table2_files/mismatch1-2.txt@@ -1,2 +1,3 @@+foo - foo - diff --git a/file_diffs/1_col2_diff.txt b/file_diffs/1_col2_diff.txt deleted file mode 100644 index e08f21a..0000000 --- a/file_diffs/1_col2_diff.txt +++ /dev/null @@ -1,3 +0,0 @@ ---- tests/table1_files/mismatch2-2.txt+++ tests/table2_files/mismatch2-2.txt@@ -1,2 +1,2 @@-5 6 6 -+4 5 6 - diff --git a/file_diffs/2_col1_diff.txt b/file_diffs/2_col1_diff.txt deleted file mode 100644 index 89a7f0c..0000000 --- a/file_diffs/2_col1_diff.txt +++ /dev/null @@ -1,5 +0,0 @@ ---- tests/table1_files/mismatch1-3.txt+++ tests/table2_files/mismatch1-3.txt@@ -1,4 +1,3 @@+spam - --spam - eggs -- diff --git a/file_diffs/2_col2_diff.txt b/file_diffs/2_col2_diff.txt deleted file mode 100644 index 852e058..0000000 --- a/file_diffs/2_col2_diff.txt +++ /dev/null @@ -1,3 +0,0 @@ ---- tests/table1_files/mismatch2-3.txt+++ tests/table2_files/mismatch2-3.txt@@ -1,2 +1 @@-hello, world -- -+hello, world! diff --git a/file_diffs/sample1_col1_diff.txt b/file_diffs/sample1_col1_diff.txt deleted file mode 100644 index a0a1ba9..0000000 --- a/file_diffs/sample1_col1_diff.txt +++ /dev/null @@ -1,5 +0,0 @@ ---- tests/table1_files/mismatch1-1.txt+++ tests/table2_files/mismatch1-1.txt@@ -1,3 +1,3 @@-foo --bar -+eggs -+spam - diff --git a/file_diffs/sample1_col2_diff.txt b/file_diffs/sample1_col2_diff.txt deleted file mode 100644 index 4ca418d..0000000 --- a/file_diffs/sample1_col2_diff.txt +++ /dev/null @@ -1,3 +0,0 @@ ---- tests/table1_files/mismatch2-1.txt+++ tests/table2_files/mismatch2-1.txt@@ -1,2 +1 @@-1 2 3 -- -+1 2 diff --git a/file_diffs/sample2_col1_diff.txt b/file_diffs/sample2_col1_diff.txt deleted file mode 100644 index 7f6663f..0000000 --- a/file_diffs/sample2_col1_diff.txt +++ /dev/null @@ -1,3 +0,0 @@ ---- tests/table1_files/mismatch1-2.txt+++ tests/table2_files/mismatch1-2.txt@@ -1,2 +1,3 @@+foo - foo - diff --git a/file_diffs/sample2_col2_diff.txt b/file_diffs/sample2_col2_diff.txt deleted file mode 100644 index e08f21a..0000000 --- a/file_diffs/sample2_col2_diff.txt +++ /dev/null @@ -1,3 +0,0 @@ ---- tests/table1_files/mismatch2-2.txt+++ tests/table2_files/mismatch2-2.txt@@ -1,2 +1,2 @@-5 6 6 -+4 5 6 - diff --git a/file_diffs/sample3_col1_diff.txt b/file_diffs/sample3_col1_diff.txt deleted file mode 100644 index 89a7f0c..0000000 --- a/file_diffs/sample3_col1_diff.txt +++ /dev/null @@ -1,5 +0,0 @@ ---- tests/table1_files/mismatch1-3.txt+++ tests/table2_files/mismatch1-3.txt@@ -1,4 +1,3 @@+spam - --spam - eggs -- diff --git a/file_diffs/sample3_col2_diff.txt b/file_diffs/sample3_col2_diff.txt deleted file mode 100644 index 852e058..0000000 --- a/file_diffs/sample3_col2_diff.txt +++ /dev/null @@ -1,3 +0,0 @@ ---- tests/table1_files/mismatch2-3.txt+++ tests/table2_files/mismatch2-3.txt@@ -1,2 +1 @@-hello, world -- -+hello, world! From ce7fae175af0fc029cc255bbdf7f37b0e5d9ac7e Mon Sep 17 00:00:00 2001 From: sam-baird Date: Tue, 12 Dec 2023 21:19:55 -0700 Subject: [PATCH 40/43] Remove outdated comment --- theiavalidate/Validator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index 61a6ae3..8fdd3a7 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -549,8 +549,7 @@ def localize_files(row, directory): """ for value in row: if isinstance(value, str) and value.startswith("gs://"): - # copy files to to compare_files/ directory - # it would be much faster to copy them all at once, but any files with + # it would be much faster to copy files all at once, but any files with # the same name would be clobbered, so create local directories matching # gsutil path and loop to copy remote_path = os.path.dirname(value.removeprefix("gs://")) From 5c914c097afb065f7b6a398ecfc139b95bd1ee72 Mon Sep 17 00:00:00 2001 From: sam-baird Date: Tue, 12 Dec 2023 21:44:08 -0700 Subject: [PATCH 41/43] Add example files for file comparison --- ...mple-validation_criteria_exact_sort_file.tsv | 7 +++++++ .../file_comparison_column_translation.tsv | 2 ++ .../file_comparison_columns_to_compare.txt | 1 + .../file_comparison/file_comparison_table1.tsv | 6 ++++++ .../file_comparison/file_comparison_table2.tsv | 6 ++++++ .../outputs/diffs/sample02_file_column_diff.txt | 5 +++++ .../diffs/sample02_sort_file_column_diff.txt | 5 +++++ .../outputs/diffs/sample03_file_column_diff.txt | 3 +++ .../diffs/sample03_sort_file_column_diff.txt | 4 ++++ .../file_comparison_exact_differences.tsv | 8 ++++++++ .../outputs/file_comparison_summary.pdf | Bin 0 -> 28661 bytes ...ison_validation_criteria_differences (2).tsv | 7 +++++++ .../outputs/filtered_file_comparison_table1.tsv | 6 ++++++ .../outputs/filtered_file_comparison_table2.tsv | 6 ++++++ 14 files changed, 66 insertions(+) create mode 100644 examples/file_comparison/example-validation_criteria_exact_sort_file.tsv create mode 100644 examples/file_comparison/file_comparison_column_translation.tsv create mode 100644 examples/file_comparison/file_comparison_columns_to_compare.txt create mode 100644 examples/file_comparison/file_comparison_table1.tsv create mode 100644 examples/file_comparison/file_comparison_table2.tsv create mode 100644 examples/file_comparison/outputs/diffs/sample02_file_column_diff.txt create mode 100644 examples/file_comparison/outputs/diffs/sample02_sort_file_column_diff.txt create mode 100644 examples/file_comparison/outputs/diffs/sample03_file_column_diff.txt create mode 100644 examples/file_comparison/outputs/diffs/sample03_sort_file_column_diff.txt create mode 100644 examples/file_comparison/outputs/file_comparison_exact_differences.tsv create mode 100644 examples/file_comparison/outputs/file_comparison_summary.pdf create mode 100644 examples/file_comparison/outputs/file_comparison_validation_criteria_differences (2).tsv create mode 100644 examples/file_comparison/outputs/filtered_file_comparison_table1.tsv create mode 100644 examples/file_comparison/outputs/filtered_file_comparison_table2.tsv diff --git a/examples/file_comparison/example-validation_criteria_exact_sort_file.tsv b/examples/file_comparison/example-validation_criteria_exact_sort_file.tsv new file mode 100644 index 0000000..1590aa0 --- /dev/null +++ b/examples/file_comparison/example-validation_criteria_exact_sort_file.tsv @@ -0,0 +1,7 @@ +column criteria +assembly_length 0.01 +gambit_predicted_taxon EXACT +amrfinderplus_amr_core_genes SET +extra_column IGNORE +file_column EXACT +sort_file_column SET diff --git a/examples/file_comparison/file_comparison_column_translation.tsv b/examples/file_comparison/file_comparison_column_translation.tsv new file mode 100644 index 0000000..3cf7192 --- /dev/null +++ b/examples/file_comparison/file_comparison_column_translation.tsv @@ -0,0 +1,2 @@ +amrfinderplus_amr_genes amrfinderplus_amr_core_genes +extra_column2 extra_column \ No newline at end of file diff --git a/examples/file_comparison/file_comparison_columns_to_compare.txt b/examples/file_comparison/file_comparison_columns_to_compare.txt new file mode 100644 index 0000000..d67db40 --- /dev/null +++ b/examples/file_comparison/file_comparison_columns_to_compare.txt @@ -0,0 +1 @@ +"assembly_length,gambit_predicted_taxon,amrfinderplus_amr_core_genes,extra_column,file_column,sort_file_column" \ No newline at end of file diff --git a/examples/file_comparison/file_comparison_table1.tsv b/examples/file_comparison/file_comparison_table1.tsv new file mode 100644 index 0000000..1d42049 --- /dev/null +++ b/examples/file_comparison/file_comparison_table1.tsv @@ -0,0 +1,6 @@ +entity:table1_with_files_id amrfinderplus_amr_core_genes assembly_length extra_column file_column gambit_predicted_taxon sort_file_column +sample01 tet(A),aph(6)-Id,aph(3'')-Ib 4783605 extra_value gs://path/to/table1_files/match1-1.txt Salmonella enterica gs://path/to/table1_files/match1-1.txt +sample02 glpT_E448K,gyrA_D87G,gyrA_S83L,sat2,dfrA1,parC_S80I,blaCTX-M-27 5226301 gs://path/to/table1_files/mismatch1-1.txt Shigella sonnei gs://path/to/table1_files/mismatch1-1.txt +sample03 4719410 extra_value gs://path/to/table1_files/mismatch2-1.txt Shigella gs://path/to/table1_files/sortmatch1-1.txt +sample04 sul1,aadA7,parC_S87L,gyrA_T83I 6674526 gs://path/to/table1_files/mismatch2-1.txt Pseudomonas aeruginosa gs://path/to/table1_files/mismatch1-1.txt +sample05 parC_S80Y,tet(38),mecR1,murA_G257D,fosB,gyrA_S84L,mecA 2773544 Staphylococcus aureus diff --git a/examples/file_comparison/file_comparison_table2.tsv b/examples/file_comparison/file_comparison_table2.tsv new file mode 100644 index 0000000..0e39e38 --- /dev/null +++ b/examples/file_comparison/file_comparison_table2.tsv @@ -0,0 +1,6 @@ +entity:table2_with_files_id amrfinderplus_amr_genes assembly_length extra_column2 file_column gambit_predicted_taxon sort_file_column +sample01 aph(3'')-Ib,aph(6)-Id,tet(A) 4783610 extra_value gs://path/to/table2_files/match1-1.txt Salmonella enterica gs://path/to/table2_files/match1-1.txt +sample02 glpT_E448K,gyrA_D87G,gyrA_S83L,parC_S80I,blaCTX-M-27,sat2,dfrA1 5274928 gs://path/to/table2_files/mismatch1-1.txt Shigella sonnei gs://path/to/table2_files/mismatch1-1.txt +sample03 glpT_E448K,gyrA_D87G,gyrA_S83L,sat2 5287603 gs://path/to/table2_files/mismatch2-1.txt Shigella sonnei gs://path/to/table2_files/sortmatch1-1.txt +sample04 parC_S87L,gyrA_T83I,sul1,aadA7 6674503 extra_value Pseudomonas aeruginosa +sample05 parC_S80Y,tet(38),fosB,gyrA_S84L,mecA,mecR1,murA_G257D 2771914 Staphylococcus aureus diff --git a/examples/file_comparison/outputs/diffs/sample02_file_column_diff.txt b/examples/file_comparison/outputs/diffs/sample02_file_column_diff.txt new file mode 100644 index 0000000..c6aa9ad --- /dev/null +++ b/examples/file_comparison/outputs/diffs/sample02_file_column_diff.txt @@ -0,0 +1,5 @@ +--- table1_files/path/to/table1_files/mismatch1-1.txt+++ table2_files/path/to/table2_files/mismatch1-1.txt@@ -1,3 +1,3 @@-foo +-bar ++eggs ++spam + diff --git a/examples/file_comparison/outputs/diffs/sample02_sort_file_column_diff.txt b/examples/file_comparison/outputs/diffs/sample02_sort_file_column_diff.txt new file mode 100644 index 0000000..c6aa9ad --- /dev/null +++ b/examples/file_comparison/outputs/diffs/sample02_sort_file_column_diff.txt @@ -0,0 +1,5 @@ +--- table1_files/path/to/table1_files/mismatch1-1.txt+++ table2_files/path/to/table2_files/mismatch1-1.txt@@ -1,3 +1,3 @@-foo +-bar ++eggs ++spam + diff --git a/examples/file_comparison/outputs/diffs/sample03_file_column_diff.txt b/examples/file_comparison/outputs/diffs/sample03_file_column_diff.txt new file mode 100644 index 0000000..aebe16f --- /dev/null +++ b/examples/file_comparison/outputs/diffs/sample03_file_column_diff.txt @@ -0,0 +1,3 @@ +--- table1_files/path/to/table1_files/mismatch2-1.txt+++ table2_files/path/to/table2_files/mismatch2-1.txt@@ -1,2 +1 @@-1 2 3 +- ++1 2 diff --git a/examples/file_comparison/outputs/diffs/sample03_sort_file_column_diff.txt b/examples/file_comparison/outputs/diffs/sample03_sort_file_column_diff.txt new file mode 100644 index 0000000..fad4e1e --- /dev/null +++ b/examples/file_comparison/outputs/diffs/sample03_sort_file_column_diff.txt @@ -0,0 +1,4 @@ +--- table1_files/path/to/table1_files/sortmatch1-1.txt+++ table2_files/path/to/table2_files/sortmatch1-1.txt@@ -1,3 +1,3 @@+baz + foo + bar +-baz diff --git a/examples/file_comparison/outputs/file_comparison_exact_differences.tsv b/examples/file_comparison/outputs/file_comparison_exact_differences.tsv new file mode 100644 index 0000000..9e07948 --- /dev/null +++ b/examples/file_comparison/outputs/file_comparison_exact_differences.tsv @@ -0,0 +1,8 @@ + amrfinderplus_amr_core_genes amrfinderplus_amr_core_genes assembly_length assembly_length extra_column extra_column gambit_predicted_taxon gambit_predicted_taxon sort_file_column sort_file_column file_column file_column + table1_with_files.tsv table2_with_files.tsv table1_with_files.tsv table2_with_files.tsv table1_with_files.tsv table2_with_files.tsv table1_with_files.tsv table2_with_files.tsv table1_with_files.tsv table2_with_files.tsv table1_with_files.tsv table2_with_files.tsv +samples +sample01 tet(A),aph(6)-Id,aph(3'')-Ib aph(3'')-Ib,aph(6)-Id,tet(A) 4783605 4783610 +sample02 glpT_E448K,gyrA_D87G,gyrA_S83L,sat2,dfrA1,parC_S80I,blaCTX-M-27 glpT_E448K,gyrA_D87G,gyrA_S83L,parC_S80I,blaCTX-M-27,sat2,dfrA1 5226301 5274928 gs://path/to/table1_files/mismatch1-1.txt gs://path/to/table2_files/mismatch1-1.txt gs://path/to/table1_files/mismatch1-1.txt gs://path/to/table2_files/mismatch1-1.txt +sample03 glpT_E448K,gyrA_D87G,gyrA_S83L,sat2 4719410 5287603 extra_value Shigella Shigella sonnei gs://path/to/table1_files/sortmatch1-1.txt gs://path/to/table2_files/sortmatch1-1.txt gs://path/to/table1_files/mismatch2-1.txt gs://path/to/table2_files/mismatch2-1.txt +sample04 sul1,aadA7,parC_S87L,gyrA_T83I parC_S87L,gyrA_T83I,sul1,aadA7 6674526 6674503 extra_value gs://path/to/table1_files/mismatch1-1.txt gs://path/to/table1_files/mismatch2-1.txt +sample05 parC_S80Y,tet(38),mecR1,murA_G257D,fosB,gyrA_S84L,mecA parC_S80Y,tet(38),fosB,gyrA_S84L,mecA,mecR1,murA_G257D 2773544 2771914 diff --git a/examples/file_comparison/outputs/file_comparison_summary.pdf b/examples/file_comparison/outputs/file_comparison_summary.pdf new file mode 100644 index 0000000000000000000000000000000000000000..f36eb86164536b50619171e7611d61b95d8cb1f6 GIT binary patch literal 28661 zcmeFZWmH_vw(pI*lb`__cWc~(1b27W;O-hcxH|+5?gaPX7Tlcx!3h=!@HUWVKYO3O z&wJ1PaPOzvV|3T5TC-+Z*BWz;-+wJ~c@a^1W(HOS@}AtDJOpL{6TsH+JpwN;0;95p zla&d8iW-4Y*uliW$<_ft_3#Mh3T6pr4(0@A4Q2)UwFR>SGX^sS1AsAsF@Q0H0YSg) zpc?WHw#LpzCNEVKK!yM?wr7K44go1h;Rd$fGo^FW*`$Akb{|BlNy1a zAHl@N_^B_zf9{jJoe6+Z*ucra%GM0jjDeYnBY*{Dp!iP}SpTiUJ2xjWB_~i*2#iXC z04E3Mrvn)P69W?vfzikSz|6n|8l~V%Dkz}@Fg37pe6}%gv;;89+S-`>-5<6;?zKS8 zJ2{y+*Z>$sOk6CCOcce0bpGlz`@i+!sjp8xb}+GV`qO1bVOtv~P}UK^%=A14Mnw}x zTW1F&(4d%~hOLOeC}?A2`(*GkNJbeGV+#WzTQ`8#Qw?@zAOq9$2f)Gg*P+foO~td9 zzZO*3*2>mF$LKXJb&szi0V*z)wBb z0{mlApJ(Z>(MX%vm^qmPn4kW#pb>~#Sb^38z$j`3T4oUwBU@vWCvT2Up#QElf?LLs z&Wi04JDTsc-ltt~)n3L}h+uGuG$foV@OKc02J<}=75L%x>a!vDccj#capd=8chXa` zyD*=PzO`-cneroM&%c;j>U9n_&$(7!QPeTvl$Scj^&6T22 zsk0jQ7I(`>*GfhgO(oQ#$h9TvSS8Pm;g<3AbQk3GtD&iTIegXMmcDnN*1twyT^gMY zSE?wqqGl|I@LM}sONu2~++9^}XsN0+PY2`MRH|)#Iqvbb@p&k`IZW2#DA{aHvTtfR ziPkxm;>i4!y>Zdj*n-_hqry~NvYEZcK}0u0YEe@XQ-Wqg6ofZbCaJ>QVzwxL!xKb0 z)mOV67Kj$XL+wUhE3u)@&HQFu*^uf$QWq&v2C2X1wJk}guJY(yctE>hBX4}laTOuD^2K9n0N?o@qXM?BM4q2`ck3PiD!$E1S( z3|_jnN{=KOY^f8qq=M@mP!^Rl35YLcTNy>3&O+p&zms@MK`q_Qxf{xzCnY+asv@Oa zK>7ozAKf`PKH@VAvYavn*$?(LOcO_ZXPUZA#cfgAejtlxJzZ^XqERYtIqz_*`Hn`R zC~ZrMdps{pIo&t+JRX>G-fvvB+|RilE|~H+-!!H;pOS@90bEVoM1t~&p(s^R6EWJn zn!H4Xg>OTjx6R2*xcM}>atFVakf-5*Y`4vK!a#oIoCwQ*k9UcjU9{|uj^;6_>+t$k z4LkQNUF|F&s9QBrWvyYaHTSK%e*M{xY*Y|e|82-OpJ2l$pE|J~c7~m%C-juG#8`z? z`?{7W)=s3}t0WGQ`f6M0F19%#Bh~xY-rMr;G!1G!q5G;65oFd7Y+74)OGuau2dCvYs5UV8+mDwg!R;zM4F>7fg^RTcskuTf`Y zs&h!Q)ie$rQ$3toc=0!cV-g}hmse#UJU@Tn_=ZEt-MQFwBy${5Na{S;in#Dmx>73q z8g{(vD)=(^8ukO%>?Nr-eqAQmcNR-TCWqt9+DV+_s%70*m(4C!qSdyThiT3&qkVYR ze8UmU8G{ZO@j2(%xs^hu>;@6l#{SGJ{br{jUlZ){md{WO-``@;DcYk0n=EmfgFp31 zi-iG0-I0YWX^^N*g%ErU3K2%5iG+i6YmJHYFcYL=UD~qfx^noAu!p7vc@f>nTzf(mb}8qK19A`1!?O|TXrLyD z$x~tWKvzy2rV0gXv-h6dynni;%&Wy=Z>bDfAR<{P+6vYZHnFEqaHJR&{wxfsUkyoy*Q?AyQMkjjH!H zum-Iquk2_K^$uk&%~Qu(M(gZwdYN;j+doUbyll4K(f{lWc+vL`|Ng4f%{~Ve%fZw= zO*DZx+$vSEr9NZj$WG}FZ%^(hvV?C@pYeTDmQY9BTi?ZaQuXvJnY=lAcv5}_Vt5_= z2n)n3Ici&KdjvDi#wivH6LOWax_2S>V3d>z6rF+02A){#cTm(_twyYS6$Y5&$A_G7 zxDcpYx>^wjx9EJXSs^X6D!%c=Jg!p%z{b89PhV}*jKU$Cd_ImufHkMp*D{RUi!NWY zuiHX9OmK5Gj!pEtrUYho-O*xI0*c{0aMi6sgVV3Xsjsx zd-tme^%IqVE2+%9%W$Uia*I+>{2`jKf^V`l@1zrHr@!gvl|HU=TRE$>+0sidHUYbw zy9!~hW92UuqP>oe`S=ZzV|YNZ-eb{$S!kmvFM=7KZ#!pq5q+Yp+H@T) z)i81&<-6wbP#GjcJAQx^T}ISqpJW_wO5CrWQ?bgV=uniEB&eY6NR)-96OoX^sZ>1*%tiY%<`BxK~!M>2IVd-&OdrsQ$*K z*~;RPdh>V%kpc36{)u(z8?Q9x-K-M}zux3hK8}eSCjp-@n>RNBse0_}4oO+#iGD>= zNP-VxS%k#y9z`Pau{8#_cY?5FDXb<&CQN3yET?Q!OkUQyDjb2OrzNl0es=a9nkDS= zxL-Zwu^#hmmTZ5Oqee1Cx(PpF0=p`${kRu0`98TACj*EKh zhN4UkzSfmdjG2cix+j4y{wzwN=$Z28FApwf593f8^VSf3hLPvJ5~CR5;^;l3y9NdE zm1TD5=murzgdMYUdhqt*(g6aMq>1k%IJZg1ptKccnp@}DtB5crjKDjn_Q2^QSsLHB zLJWk6NfeD!k{GkTu}sJzAXKC84D04jo0`eZuY~geEhVmwH$U zYAV->3hKp%33>zJ_5@MIGUxWQ#HZV}N8)Qz-&d}qw)R8$6nBobYT~OhEfBSD6kBy) zC%@vYIxB*9&X81?^hU~nN^aYB%6@fDwGUlD9@7Aj*19UNW;LPV?PwN?#E6RLPjeMi zl%}^<6q(P4uInXDP-PZ`C5tpX^hUx%9$J8P#X@7Z@L;fepEmr3S~Uofv(*uYnhM!K zTU|ku-ojQB?NkOSZRV^}JL`r~8_SmC5!^OME9xn<{S!|8iOip4ucvqn$ie~qEAnOj zJNyPPDnG~TPtl-=iKCH&g&pWN;w3tiHLwOn@(OZ-pbxEx$$JA;XC(t0N6<+t12acZ zWcgwy^c?HbGcy617}%INIGF+Ttn5GrPA(>PARB<0gNucMm5qam1r#r{02x?;%$yvc zc#%;8bc0}FBxqv>x`SZ?xe|0VdJ4O_fSe3KAdsDn9aQ$zqo+c8APXAz_REap%FseVI>A8bdikwh>XM{iJC z-D5^yo?ow~>F#RM?=SXOwCnhEQS5 zU08gKeq)8tO7}w$(@&h%xeGGbtt?cOv7oaKFnRAvy04o*@0H+wjaa2cEVd?tHFj!;%o3866Q)N>H6S>U zYw?oV$4XPxR5UadZ!LE-HXu3ZTP-#gDi1$-UgLEvHBv7+_7}={Gk!Hn=-Xrr3MeH# z{v_O|R*|YGN#wJ==G$>ktjQkPp{vKAzUsXDV0O~{b8&y>tvjXsgX8sGKH+?dg+VqK zq{=vqXVL4 zkMXwx_sf0AGNa?mo)i=U4F{-g`w4gJsCbnf4=%Rs4UHRpTv_cG*FMF|GG^lycM?)2 zDvndL)7D>37Z5qDZ-?_rxV>RcUEW+YA=UwBCc`iJkAGjCI!6(q_;#IjHE9J!(1!$% zzk!I)gud~GbgPP2>I~SF((b_bIl0`p20J{Axuf`W@CEz_il1R1t7<}1vN8u;K8%jV zVct1>^Z9a@Fhlu~?a~+u0fZC+p$n-KmleY>?x$vifR2rqf@1;+s5g(kWCqQef>=AyoY>JQ9_1-r3qG zj;Fz=_-nSk8*RF~TC!=njGTI`nw;Z4EycXRA-`S`v$e(ZnV!cyRr-N?Xy2;1y6BSq zP3p^o#{K4o*s`#b<63&lT%{{i$eN{%hn?0>gcu$CU*#<@WjY2LEy2l2(yj^PC0Kz_ zqcm!xtu+>rQbA6~X^n-pETS&_Li?mV)GLLSXYGfbE?bqlK}2|~)dg!Se02s2^ec}U z_KR*m7k15cjtaBuM>C~-Z;cj(J=be%7VAhww$Oe>RFR6Sz!0hQ@3Z!=&P7~@aCCep zyge12TQakCXuIEt?3^F)#o=f)#bFF<6*5z$vmdjU^N*PI69qm^!N;`(z)8i@k2IEDGh?@@h-?&*}m*-{m zyR_Y`BuBzV+_hf=*|_rMz;$yt+rZs;?7R7#u-e|ucMN@BIqxdj8siCOi5lCm%|<(( zb1`p3ck?9b)N@uzb_n0N42?{ z#bM=W8wR(Zaz{9w`lrT#mjEE?QenZP!N{?|U<4t{b6;OlsLK?jVlNBmv^2K+oGn(| ze*JjR(%xWje;aw%;WOvaY=8fg$fwHGo-e4r{_^AgW-!C<8079T=epO~^7mdB1S7 z_8P&=#_fWcw!#Jc-N4#R%n|Jnm~|w9WKOn&bUB)p$<&aUil2#2B_1PB#pv;>cjwKm zrC4;kD97dGU_B4ZjV0 zHs9S{;R>+H9xaKPX(K+6e$F2IB^S0Y&@E2F{$9kG698-+jH>-UcP${k|Ka-aRt+_j z^|G^a4?F5j;l(NXR!E;^Nc0=7LzD?@6uc9hlUBRab;Jt~kNYm{cKaW@eLwqt7Wv4^ zW#+CZ-~OWhw0pn)1#iNd$uR|BNTeE>^l@65zIvrt{F=m(+tw47r7*KS-|XsgoKsgYsxew%vsK*XNtLOPA2u(C>#rgHep zg0lFoxCcJ_Df@(YgbvHB%XY_*{cSxw862UF889uLXS`-^Sv?OlWXW?gAd5ewL z1pU~ue7vn^8^qT$J&qY`+@6>#?|I)s>3I3J?>F#p=>GgF5K3Zk5j|j!;zVcD!2Onz6 zy>bXdigg`?yXTFZs+{V`6Wc%84>bzTTdbb{w##|cI@`3^wDb#qx5^U5fjguw9ARX# zb1(Z~ue|A3kwqI@p)n1gQ6xd{yGQfLOyd%xBsP~Ww^TNl_;GG_jV+AsH>jFh)cHu; zEd5G@$g=4nr4{#!zj?~eX12fYtjxOMr8fVrEVswCr+);O&u17ocS2XhHj*^`HdkK# zLHbiiZJ%rkM_Kj95eH>WBaYM~^Xfiw<6u&g$f6j@{;gRz8jbI6qj-8MZ6m+9EWOzF zb*}u`vP4bkA4}L6NR^U^C}i{_k#NaqBw`3BrJM#9e@Z0K`C0O5KvlxvI732F(t?Rn z;2a>DP}S-4U5s08!fpW;G_v%{B2IVzLBEhaLz4(7A) z!}1cZ+U=%d?=fdN)Do@%Lx)y}uO4g^77F9km*_Er(QgK~g;?i2RP$>Nv_vbHku;T; zxw!b@<>LfisaN;e-xc-zHN4K0iiV_nqX0qq&Z==-2vkC!a1BN2y+c`2Dxp`&6aV#4{F;Fhwonk3dTEtyiLL!XSPL<9O2YOn6arsTs zq+KmUpn^CN(yIbn2Q3Q7TA2EB|EO2f`h0t>{;b^rW$)q1M-MFV@jfa<3b7KFa~iT( zmuL~1O-0gL$j74KQJT9Flr=<5chQ<|?7_aNuTt|bMMz3O&kkp=gl+{M61O#HX{f4U zL+0NvXvz$hC~#GL=5w&t4BknCK(QjKzIU=>4Q1PK1>RO55$4B$`@PQcaQ`i*|2`;D z&~N{1CcOG^jyCsg^F&%WkBAdo*?k@>mu8fgm&wP?1FuOYw@AvCeMpQ5gnD-NNn8y# zct-Z(L;GKjvo}{`!x?_BgGmX@06v57S(H>HRhyjaO&igdN(>6<)a zBo}L%`)gV(=;PD&jS4$FV;G^!v-|FGX$@r%5cFs0iKNtZ0mLXuQzPBQq}X}))ZupB zs4|D_FyPkbHn%kq;I_BACjQCFxjVh9IJ#`xv55Hd{A zKNiP>U-;{eNH!Owz8ZTiX*`^RG?7(Ut|18XP5;X-uY;R|!>j?! zM(9EWET(Jsr}L@izoGS)Wi^l`AX4?6DovcO ztC$JSEDTfJrhmDCHGap>{ zORgYoK)n%pWp(!(`NrtH5e$bhCJyYov|{9+bg>jwMFB;CFIL&zmi)fbeOTOCBAdpE zbYbs5va)^C{S+^FV8vXFnAsoOpRyw%aiPSU`>=69&J>X<%&gNeki2WPLE?&bwqLv?Ux~ zl~_*Zlw&vJI{2JoMm|F8-g*_0^i!NiS2UZjCUMs|p}UaPPw}gr?+)j#fr;$#Hy)g_ zg(7Rv5~ea^;yAZE8MV?vx0;r7R^8_lsbX%KOH*jck6X~jv77?s7~xq;GL{s~UE|F> zgCp^&+fp{TblEXWLHZ`h%WOSXEQ?=1(i*m5IWtAcyMu48z5Z%c{yz|MDfYM}K&xblp^tpBK8?~S5J*$?n!wjs|&VB2@@tbpr_ znx}MQ=8Ej}%6iYN=96#kIE&&++_`32?g9RV=nVdWPn{^tKICgraTA&$HaA8J^cclD zm7l9&17bxc7`CXlhk|MR5=z1+=vf&JuK|Ax`1m}EMi&Tc@NWoS@8l5!6ng_MatMNW zM;rCzb%Nw6%mjyr3^V!_E%zt4h4*}Aq8xB`P&+^7{YFm-%du|tTkEsDlchC8^J1A6 z#U9#y#|IVLjSz^b`RW#Fo0maZKapZOxhj3V6_?swfpo*3yt?|erWx;vJ{LkcRW>Ty z>$iJMd*5JBAC@EbqkR(U2Pf;prY=0EC?TL1t%Y!dOw=G%vUikpr**1db<0}DKDNFp zgl?TRQ}Q1>pFOE6v;2uEI23DNSA%8rEks_o&Y2UNlpd$!2y0h|3w3zck6~Eg1J_vi zrDcCCGDqsN!DfvGwpJDHJ+;0)&X`3{pg!h0n`1KSSUna0?p{-?j+X1_em(h}!VL`E3zp>7G3^X*# zLSk`SO;?7oE7M90i($0v3jgIP%7?skpIyzqzO~qyWYRkHN&b;$k;jNnid?=YH><4v zw4~REVfaAAyz{W7KvaY8#2aD>2y89|I87*Zx+6J{hA#AAvf_THZRx)pbj@UN9lxEL zMm2nQpYttou73{IaO>*B&Ff>>YKhAa;k3WZ!R3DGh9Gk_q5N%OuR{H$GCfJ zR}|EvUNgAqSWq*3M|ns&r)IdtTkef_D`~aduBe+!JEdlbd4X6Vtrr%HctpGN_``|o zPV$!!24u2W^;~mTOq-=JDjsN+j)+jv=ObGPe2Y)QXAWo{>>$D(#wy zVfH2iCjN}ckidcNAOq&z00U-+p$>%$xDLfbwR0-Y4}D~Pov`J!#jEby!~=qAd+P^@ z%c%L3RrJSR53$46wmdJ?y{!|1YQ#MPnZWCIXowB|eba;sJXi+7%kkcW7rSKb|ZloO=D!*gdtSjyaoH* z;Q;QCJA|teW|9ZY4I_VBCE}`@42@c5{sDI6jf|;LK)iq8YN@faK1E^`96UlvvEU*FA zV;lqOazEg*I;yIZ$6tdvMq0Kh0tcH-q%1pOM8cHGisE28}?zttCwl6w1j z*0{utjT~91uJcNrBnvHyg524T4+l(<8cTRkW+n_QQF)yr zGq|`7tUK0(r^CErD?A(0exr)8cm+}V%0MfO267mx%xyo3I1Xx-ZvO3FzMjN3PW+cW zG-mA~V?b;EaXuZE^a6M-PF1Ud2YKeTB%Xf4)H`l;-Q??Iobf^S^fYa8ZEoNX!w-`9{{Jj@fTwJgl0&B-un`!3z7JXL~94f39 ziDerx(RGNs8C7H7+M(t5#8swcHIExj-+cP{*BfBy=Pn|%XZncOE@fM#zX?VOkM9)C z2Cz|T^+WHLcp!aUNAclKUq6U*Vw}4# zL6=R)T8dSz^lJv_5R9&d3e0z@-MoEu67Dy?jVxk;;C6pRxO!-P-GE?kMFKJMin0=g zg1zpR+l$z?v!^GsouUKBkMN#YPYH@x@H!_9tY+!eA>5s+YZp}K0vWHBbvmkqADHfq zwa}RSZxl4O^)M*m*f+G6_XgP^p281RRpL@qaEAzw1_Mu&`;9;KSKgnE6y*Lm zhZqWh@-egpWL8rSR#?!lYEgS4(){Ql-wTBpcNfr&l!SSf8L8JtQ78EAlA|e~4|Z+q zCDtQlD2DlPaj7F#m2eKn#EWoOe5RIiQr1(}GfhU|J`vnzerFcGs5Fe*6MhgURW$f8 z+_O{sAw>iB>|^l<^>&m^Yz^4zYUh~5?YvNl+VKc`glVs?ZiWKYO_%aseu8=omZQ_^ z8Oaf(d8v}bd6Tk4^>$TI9op*L=5> zJ`y_^w1(K(_>8WDy^{zX+i_t!vvFY>Si9NWw}Y?w4_kv!JN&vsWVCNi zC0_9(A!ctbZsZ~2tps-ve1uU&OkR}wmEFaCxOCaog5qh${JT__{=u6K(($V=ea4Ld z(g6&HcUUtNSt)&e>Z){RA~WN6U#>^&S$Uqm|G-$DxWE%*VP|Iko3Z>YFZyT30unVj z8#=v6q$EV1sLK<70U|I88916eOL9Qa<8RU;48j%ejv%lhVPk3wQtJRf@*^_~5aMwM zPzi#hS0>Z|MmYy#69)?$GXRx@F$k7eIJtugm7MMDtW2z*;0Y4|ggKt&K7Ruqz%v4R zQWZ&vDBFrjh{za#)J1>QM_^QY5-(}7GlBF=EbM>2Y@ief(lar?Ug$^|{zy{BVHEIY9L1*(WEcE+;F91pSK`>Pd<8 zA|C?$gXZLh_V2Npy{ze0BNd>EkKH;m)!HO zk+p%{^Kkz+=2Jyj>EEXkU~FOvas-;N7e4m94D^3Z9X+Vd-xkko4b&G<4d$25Jc*th zpY&Z01~wqE8Sj%aj32-&3i>>&>Hf=k;7ct-(|>K{9TUi>c@X>4HQ z^t4YM0b0yYTU-%@GL>yrY%HEu4n#Yjiv9^JK5@qXB8OxBC(+yE@fu$i@DDAJPK<2nC<6dZ3LB{D*e&56pf(|F5(81ZhE7_-{xH;=cbgu^|t9-mlH3@s0`-4%RcGTgVq@s@m z{bK?K1zTdQc{G}*ER;;u7msXscJ`|2kRhRmI;E$QOwFicZcI8JJx}&e(SdRV<25C!!=J$(s9GJhomf`=biX;mm5va}*8G)R*?oH*Smhf%s7Q9J zR2d($A7W6+41*t4nY;ovXO~SEMLx3<5;fwM5+haEbx9EBq^`-{>cysw=tVDK;TlXw z5ovc%r~NY_AtCVx>}ue_7dVZjx1Wy)hZPJddv$F)ve7&eA1u7HMCfbbs-?3{>$$D4 zJ{}>*ao7ItL&&K;BzI`j|p5hUtQMVo@B_ssSk|HdwXwvXFEW-uKhYFe)EXxzQKf8 z=DmNlT3zL$uVeW9fae?->w2N_#f7CyNO`t)P|EHIgQGGx4J{1~ZJH`88#^2laul#X z_eWYm9;+yQf-G&RhPFx9AZZRM+Wo`##7| zN#Y^!v0iCO-J#+cEY#W_LTf#C5zH|Z$z(%i62dgOa+<~ucNQ9l#W?r zD%j>#$lx*hMt@K;!P#=QOlg|!V79#JTZ^^j@jlP>Vbjn3??Gbut%=c z@MtP1t`1jIotLSfUnnNh{qfVAzt(e)<;$wq!RR8(npeW4En)7A)1vU&M7y+F`F^zC zNtQ)%^lGaQpN{fOE`c8ZHNtJDuBOB=(eFOtJiV|AJ;b6oedtrq1N$6)SaU|Ue73}u zItXb)KHXi-r8IXYf$u1~1YNzVJd{%{Xlzh=Qm!0C4C zcUgG6`Yoiz)1#S)@aU+}c#Z$4(Z!4C_r!wJRl9emC-Uh7^u(uiZxjaxD~QT>JD13` zUD~sZzqNJ)O7{?sEUZsrNfmy%_a6$3=uI5*$jC-&-cg_34=3q%*8aS(Yye+xCOl1G zq)%zBnS`-je^BXO-FY2Mwf>6j)V3HvtZZDW>`LJxQmIqzbk^y?UFr+lwuN%kb}?}* zccKh@-)cav)5==?Fn)i}5fy=)AJAD{C4_t8gjm_>t>0@a zM#~zyiXB{v-~+UTmK|7+W?b!^xp)LgwB0l%8GAHzYf@dIrWw0`U(FD6szHQwJXGWx81 z*%?5;`>@!NMX`vn_8a?^Je}8PxtjZonwkCs5BoV4YLbYc2gSM z&J5>}4_1_O_7~gfFO69Z-JQxPWr%J<4$@{vnlfCjSEGv+QVh($bUwnYe-V6bFc3m8 zEaN_-#hOo$vt%9~EK684avSgR!(3n?rwtiRL`$D^k=cNF;5~9~qj`Up8^m#)@OM41 z%N-2=1Bc&YrDd}vF3tCQ(R*yCEjpfp_`kj>SohBQ{z#Z%H?7U!=6YYBF)1w}NU0>Q z>EjZ&`j|)}K~US@ZW1av=$P|UZgNRx+(ULgZneT&)EvAf&708fxKmTo|0swRL8CqZ z<-PSHligj`jJ?Av*<;GytD1ZI8n8|%wcK7dEND|cGGPUcVLJZdWI2A5;&XY)goeC5*&Gie}4@l^r^G0Yb^5k_;^+KsHuNFWcKnh5Wv#rcGTL` zrggI`?sX-yZ+4y9F373Eb&US`$en_BUv7$hq8Fjn;)ChfICKHm;k z(v`io=-E||wsgg7kE7nD4$uAhYBgP-yXeY?hp)-H-lNlGkTS9%h~b2O3~zl)$oFI_ zJ0xra#%(P;W!p-Rs>|x>CJBoOj2NT&Bw*`c!EbY1y3eD=dUhf`PG>^9!|?drKHl9P ztbCbR^yF}F!#|ml{k>nWrsr{XUHNc!l&r_)a#*m|M!)xgi06H0He!R0=p3?xHBvc~#gF#QhzUwNB73_(7qpd5!+W?Ed%BzZ(ia>v`YJH~U(o4GXk1%9#4 z*GrnGSqTsBWWL$t&NaY)E4BV$woH}9vbko=Vki3 z+$xQuqjoE11|4qor(fkq7|82oOe8quu_;aBt&2Ql`471W^zS<{7Fbcc4RMVpKC3P@ zaNHYoB@!UMOG4{-yqvZcxSKYyd=O(j#`LX&Hi;H) zRWhL6g4G|2!|R=uK3CgWd2JF+TDX}_v&1MqZ6Elio`^Va%3~n!iYTV%ifq3&3lIXL z$E&W0Wws=24MhX8o(UIWcS)231_zY`Uh1~kmVSIzk3e@w4}3)|;-k&Qp#jG$iW;=5 z>!mEPH)|wo^>cASr|uK>^afYS63U%DsqSFxA&sF>ZN~>r%qvip9eL3c9h#bSUo_jz ze`W1-luwn-X>e${stXw1Qy!p}k1tv^)HnDn(Kje8U0eCqv>c%)l!deU>EXo5-84ZU zBsTdH=@^tzr^@QF%wielo#0v0rdn+DEy5h+R5!#+(Q)8g!ZwYZr&MQ&4$(8>AM6|Z zJEvB+JI!)bv=6I~18CKJ2i3>Prqnvj%Z9_OFX3~_qDFgp#eP%~?z-!h?_$errkDnZI7QE)0=^ zDq0P#$^%{zvb`>XGbR}-BMJvkzUfB}eS*u$0_L`57X|5s_gTsfwZ_zungRSCTHsP30um?XohYL3HeNutlo3!jtk2X54H8 z9OBi~U|QRmigHmQMvCv{#~dVL-Fd&VH2I70j*#x>zmfwtDRh} z0u?I4o{t$@{MKzl9i2(QEQYY|9Tsbf=zRs)$LV;AMq=6pH}0=$jX%;QnbX~pBl~G5 z7}6u+sYvDs-m)!EZIh?T`tz%RWKWa|UA?923b`n!KRAQILJ;+*PK*5(OnQrQJ~V<= zA~BH_mZYjUJEOnlPH;4t;vxr$R)%s8B?s+V>O&!?K1mm4;c#ze*(u@@=)mAk8RLv0 zd}(JIqhYG=%0%bH6dvGQ2C;J*nBa-D05p;WsM$~UX>S?S^$o3j+wNJ8%bN4!kSm1JuRv_@6Jfb4urfUdR5^n zIL}}8c-9cl0Td;0RbBC1=E@6{ro5%ET`A8KMGuP5_>G{dWQmd0ReJh2gR#X!h38mV z>`Df{y7?k(aczWfyI_9cEdNhfbn_H64FlXYYR8B z3k-rRXMy4tU&E_;ddt+7WZh?*vf#p*JOC2-elw90l3HtBR{Bx zg_=SQq1K?0DSZ%R4g)a_dW*7|kFdW==9{rk2^r)Q2u5we6qPb@3&Xf#=t~=!QqAXZ zV!vi(V#ds0Z+$N!+^}7SOA8(Ode_g8GYTj*z}2&?{ll`E{X&x@vn$J@EMU3H8$$r` z*OrylCqr_KL=_;JL!$W)=_(;gQc>$~AbKtq+aJ0i0e_)H-jU}BqUY2#?$vXtua&#Q zar!f2J86@WsRA+(JPX3>nAH{4oFz?B5+^*J*4Qi}=H#xtO@K|t!BEhs@e`tE%hcoedjyYnXiq}vS6J!+#qbJx85DJ$XLUQp6bBh+-V;?Rx~M^F)n2OiOe{0gipV(3xp-av zX8;0$v#!LPt&mc*{UCmz-*szM|EFlg&P>9__x>wbbszChLhoj9BHOC4aPIT*?e{q$ zP8sZM{R#Zjx9S@gp?1Cfq&hvsi6&3M8US|RL+U$M;9Cez_nF7wLrv%RgDtCVNOKWq z_J4uV1Xh!-Losp}5zjNq)9(y@d?ho+y+H$ZvlQ84(|PpHE5MiXO3d(x6#%skVZLx{yq5&YBFLiBKw z#9R58M^xJuf^LG&m|2rtKk-$u4j{0SKS^bxetOK}-2n`(fip&(hv-uVHrPY@U{>z) zSceLpj>_Y&P-Wv^u8SeypJq(tPMC?UZb7_LwGj1&Z-F1viiyK#!^lv^z|^XCrud1H zR5THM?k=xH;KmK-anb2+>mzn8#yGTSwWrx%eAjl2N$_sbS`yX$HX0$0w` zshn?Ct>S%}y>S*#`tqkOphdD~74&^^&pK1eT(D0-CU|953$OfK_f1CL_n$5nI}FEo zh%4syYx{IV^gf ztqeIF&x)iEjd)EAw+f|$%vL^fnHZk%m~3KO5B2hQCWQH#D^UbyD5rpX-_(*`ft!k%b>D4_MU9MbJHJciJTus3I-|~- z)E)>!<*5uC?=@?1Ouwnh4I5AWj)H9{lqcKi%SSDZhck(;#vr75hN!veX}y_l|BG?z zthBaf1a@1*qjC1@_@w zWKzvpC8fof(-`)3j;80?LQTIR@2Gw70T79CWtm{8y{0CiBsam96YDoJD3el+JEQXn zVaVEH)`<0=%^@jx>qI$#VhE07E{&UzsV(D3&DPCQ69?^g;fH|t@dE-GAxt4|hO$vj zagiNo=18Z#{o6?JCw@K@a8;r}K2z^U*Ii)6P5|P{(eEZVv=wEG<7jTe`Y`qLOQA)f z*1MdP31BYO=13^RGZZFcwbeygV>#ilhz5#l*8LN~IJ-tRL>NdoaidDBon)a+uAv(#M?s9HrqhIt!^=niNW16gAExdpCzA_oZ_CSRp9n@aF z|3yp$#-45<(sm{wQMC(6!&^9+H1iLSSUAliyFfF=Q9iDCtC z0wzT5;>Np{{)N!}1LGTtn$Uz8Q?erLK92EEUK!Xr^k(3)n zsp%9v`iwcd0a+NO$d#WuGpcF+DT4(X)CW%Kf>gU1)bGmV`2Yj(j}9a>DetdORXZ4a zKRnFJSMscHtO8CDZaiD*v#@lJP`aJ#!E)y!C) zntOc;^gWYvw8%0QndC^xK_FDCinlxtt~f zh86r)g4GEuO3TY&FF(u_L_3ZX2Uwxi@5d;vey+~2&A$av*UiD}_$w>fz?6A23DF?X z*zQ^ypjxeuh$+E>^rr@ez+FEUO%cp*Z~Kj)lE88j+Jh(ieem>vva4EI`PRkZCfp_8 zxpn-?EXZZi>p>RP^3wHAbD6kut)Ii~?W^lxl;|=1(8=YED;I9}k1bOx6ur!0p+?P| z(9TTY6W`C=A&yZfQC-~A+6=)xE?{^^iFx6hVP%{EEu%hq6SYnF?CTj4-Gd%WeBuyL zc0C=Y+elxp{Tm_V28iHxH1;oH_+*b!+&*-Zl!FUY(H`#GzC+!E@kZGCh2#?y*}E!o z;->_$h*tG=>liFhunWejDz0{2Cu_xkD-U6fJwQt?GDY-qd0(V*CLXNOP747-ZLNnq zfjpG1jN8M7_LY@FRHny|uNUB}$cf@|ZL!3-Kz-HxxVr91l2oMrTAY1(h;=o4fO{)nVQ-vD-bN)Rwam5_r=>J>6qPYMZ8&z5Il27*j{5 zpDrb(QRjdDNSud0OjQ4*2C73_9oN48#)F()rLuRt9VE1BQ?$QHs*{o*CzjCu>=K51 zgdtGr5|&u?aXb>c*~GAzRp2BkOlKk~43{2L;5sQR(l+&`sZQEzGAYbACMhgvnJ*Ln zKET@bj>}oB4Djohli0{%)D0*pYvkG7ohGQ3Xd(6CUjY&HxVM8G{8mGKA-;ZnA(>5Q z8)n~bn5?`cJwvW-&6R)tzMN-e!uDz$+qH!`u(SDs=yPD_g4Ojn|Dj<|Na_CwNPlK^{}YhT2?F9AATZ6$#0mr*fT%hLCx8_M;#ffRoE=2P zSy@2j{E4VPQF73E7SO|vtRS1ej-M*CgRtBaY3Bm5v8Ox>Cy1=G{V50e0zGI6U}t4} zPB}P0HY`tepte9Z|4Q0{Pj4&vi!uIrz>$rU^Z$|d{}E{if{6DMX=i-_;2baHn_c9Y zZnM7taQ2r%j+Y$M3qNOn!Qt#L7@gyVinG7Kc#ap0&My4yh2w?0v;Rr{RR06q|Iqsv zXB;o2o&ANa|6%eRe?a}8zPvO9d@7W1n|AgtEN%OygWdPg1V*3Bc>Ho9F=fC0ff15%Ov z#|No7K#2bN0cIuuJqs&413L%T)8zrk%*4jP3OagX^h_X%&+&92_`m)jHSo{7g8ndi zmY268{jGR-KL4*%31al1N4Nj!L28!&;R}JjawEFEnrYe{;m07QVGaxpmyV+#ZYEJHGRjf8i6|2gNSqR)?-?0pDXpxSq)sy42%%3;ok3N}#f#^aDdp7xk1FH%e1dDfb z9bH2(HwQnEQQUmw&wh(B`M`cEMfH5wb);d#21ww5u&iyZifOw$$2NE%VtS0@c%^mu zEwRZUkix>?p*Ck56DNfIus;D zxN8*hZbs91HzjOTS|teUzX{$E+tfpG5 zlr_(#UUojwVVp=xm>A+2Dq$+i(AAO?tq!$(N%E4U^d7w>v&fT0!|wIh4Rd1uyyF#0s=f+O4wXk?ELUIvdps%74-jpO# zWjVb!p}Fqv=@?<>`D0axJQiEp(eF`^H*O=w8{AG=MfQ&Yu6tw;&&SlKHcl^`kFF$N z5uQOMTmgE41-=Dxeu{=HKDd>78d|AILg(o0>}Bby&vbM&RCRSOms_jab{C^s8e2&| zNJ)JFG2dHubZK0yut$U|gj@%lAP%=8CxEuy2@Kel_>|a_pNXni1Pk_siW<9X8qcI& z_0dQNIUamm!lroQWiq_Q`L!$4e*&p1E%gbfjrj~ubNlw`J|Rm9#k}{)K;#~=wgwdH za5HUiCjH2Ue9L1eDn=NG9D7yd=A~lFTP!^kH3EMOFf{Ny5c-}s&M%16uqd4KdylR6 zk#4ibwAWsI!Fs2-%~^GK!QjK49k2*+iEu5Yed^GeE`Q3-_!QyTbk1Ztt`bUVHG5$l{9H%ZUmXvV5z(-}TlN=)PppM63p1T|XeUT$l?sWl1@YMUUAYC8fF z6ZWr?xGvwHjj!69t+S?7uiJwoY3H3$7cItqLpqLezQhOC@GE%{EP;dBIR+)eP@Avf zJj)yPb7h*z(9}e5^|yVb6n+@jZbjn~)<)(ry}vQMoCsU{hZ1jjwRH0`etHHx!;Pyg zNOReD&(vDwS#Xt&E<>YRbTw}(~UqD+qY1+K=1A2`T^#319!Qj zQc=UQ01ps0{|-5~#+Ti)khaQaJ_zf!a3Lm@eelt!hkmM3_kMs90XSzBP6ed+Nar8& zEwWerThNz8#vh2)RvXNYL2fKYJ)fUj8|eD%H2eCDK|P`U6_%VmE6();$yFzQ*%S-Z=F$(1MiTOlj{UDOCldX!jo$fASWA>@ zW3|-GET>C#Mp~Y-@DP@E#>uW$hsriPj0FAk-eOo9wH|w|bAS=jE-?7kl%A zY})pW7ZAp9#d^#=gBnCUzK2c57ZOi1-QRj3>D%{+<^4!qXB=*!poePQ3{;@2XUc#k zRNLN`S#`Lhel#|2qZQVk9e3LZ-)6Rk9xk3#R};)DzcM}tD=jML>4NOFc)R9N&fz>U z7!=c#zi|1r(vyHvh5gdJbLXFW)39qt`taAe-dtbx+xj#v%&AnQRcDuBkZoA$Q}WkG zt*2H9)$X34;T3RRzte)dkxqsyH2~nf=>GUpVN`N7aR4z_aocWJtD&c~B(LMVpeJE> zH8>Z3!Uj*$4x_m#-7EzHLUqFVenN z&?eg3v)@~}e2i_9T&b!f-=M@#Iw5npoPB!bY?)x72NKimfBem{{tL}hg(K^|9lq(^ z8R2x$#%nS=TK7%1wMis6dzEQ|PLX-{$?fHGy^Mw`G$C^~lVwFwu9QLthLW5O`|Oto z^K&63LyipMnWWx%<+ZhYG1lcg%GMVq^FNMvgwIW`8l88@WTfp-Tq4_Lrb|72Ox_>Q zSn`MGPcG$JX;DD-z&STE9k!EjwYt%^DZ-D#O%%0`3_~)z`&Mgw0j0{OUDRrL1|cX{ z^R_^si_0EAYA-pKt%h)fyw@Aqp&Y3w>!;OcCohy;wD+G@7eU?j1b1GsP&bzI>48Wp zHYs$QoTh7bE<5&H*pH1o_u@&P)lAP+n|zLBNMbu)N?b?zYF$0RhY24g|IOZpi6L%* z0jt@TDG2|4Z&c?H$-(ewyBF{~U2Pz=aRyh7BO7O!S;qdr8w2eIi(~$DIpyJ%i+OXJhR5S&F|m zqt^nIM#;jGVw|r3%Kq^dc{Gx#quls(dMr0RNqC=J0}Y8G)l15~&B!;eOy+ZZSm!q1 z**a->7}^c^DFcDbNOPOn20?(n(Fzv4Ol^8$k-9DCHS?S*G+%Q>PCc1M5O~ydg3&o< zwp<7z3w7sMD6A14s@?B7sydePbdU^LS&53;6KO9exqy4FzRrsb=;OI$C{hXPwmG`M zalPhIf#5OPZCb?Iav|e_nY9$(6wp)JZnWm5lUlY$rSWvGiSVBk&Py1Ry>_uzhbDbe zI^F@itUJ4E&=o+f#a9G8JNDgS{v!3>FVs=97k+St2!AL0?Q@l?)}_fz;gEe53K2IK zfSqV97)fa@yEwpKQbILY)0s{|yehq88f)5`_d=qtQ0``v(e-SPV=?Soi-Gr3x3x!K zM*~l+XJ_`#2DEY%V#)QxrF&rOG5qn-*q=!sCnvoXh@jUiO+VUGgv?5>A==#6I-G1h zy}pvOqhVL|cA)fj#H^>7^!@RJfJRnGSpcIAYgtvTmUAzpK>UR5dFEgvmGMYv*ys4omx3nB;Ie5|f@T-oe%v-3@ zW=I{<%8c@^CPlB62v|he^}+|xp4U85-=eA6pGYwsKOOlM$VfJl_L)dEC(a)|d3)8N z$Jv4PkYlEIIW$d|1x4{fM_ZcXRLzk_YX?(}Ch8$-tCO4iBv!XDNbrQUL4K6c{4&M@ zL{j}(o|Oz|TSx=><3s5Y>(y9RS^STsFT@p<8_hq&ey`3cgOV+}?w<$OyFw2syf;dt zcBcwGZ#0XLA-$;=lV67kqsQJ|JnU2Ni)w8`St*|U)P0KB9XTV{=k0;-YP?+E)-kzg z7j{T|uq5mw-bQvdD8G$VO1?p8JRCsAy+Ascnq`mYEo?V*>Dc*0H>wWT#t>nqE&U2P zgyLiIkK&rXc|>l6eYf;&YuX?}T5i^F@HuvybBD5;uPO@8zw4E9>3dCX`IJ$RKDmmL znV3_IW6ji?Whf(=@naM5Kq7l7|HyUzMVE)P-94LW5O=kVo3xv$NfhNsd5!rBNMx?2 zO1TJnKE<{6RM}^5NA1(#xNL-33r=3Dbdzk3phL_^G$*4{?fty#xW432$Y51>YFhsy z1X<00nqLNTu{p)gIkUK^Q;X}$PTGW*^k@zvD=2}l!Igo8nTVNC#QW&_aLZU(Jy-l8 zccby=*pc<_p!!bCnC~Y`&Uy7gP1ef|HEz-*AI(-#mb$xUP8?0Y+?U>@SO|E>!?0Gc zxLkBvU58IU$JGqUs%|lT)8*5?JQvYa%8WdjOhz%8m|=Nv!_R41w<$a{h!$=wFgF+8 zqo;bBCD-OMD9kTf84aT(n`rK@E93)6fB!SVA(l4w} z4~a6j^&=ykMT8uhEz$HZ#@kMT}=LyJ@_?_)hJ7i+3&j@DoY`&DEK*hw^f%SXB@-_r21p9&+t0 z#-PwEJSO#8S+tmIWdDwuP4a=6S=|T$Mau#FSdtoxv%WMr#(w?&vKQD8X6$$c zTvOT1#i}H0Qu4z7iq>qWGo8Qd{;e`)HbQXJmE=Sx78k;GM@50LH`7@TaWf8jfEm?CV%DM z9=$J$=R0JsN0gekBjz=8F^s=QwUkvbG8~^P+(Oo?iwZqXi!Pf)z!(mAM6+b}88fHl z2cOCVH)zo*LTf~kG6>Tv?f~J_U}bugX^zJ*;dj96nJ_)bJx_y`w;PF$vklq}J{D!E z9^g+F9FzLFAFK)fZHs7hqByK`?rjjL|G)G5(D=CbO z*}^GJjqiucOzT9k2PqUszEfpS>gi6(ElzLm)_!(kcD_0!HB|T}F86glKkv64+u%l;H7WfUDY{#%UEaKRVd(LOfo|n0k^4+(! zqDU5_6_!IeDA>xA?ocVMzZ>r}c=Wox8_Jm0Qw0B*@fETP{<#XrVGi+^2NrroNT{7D-DM*@H%f}p zDhh@kBM_V2mjF(lBXHG7;o!)Z4xW`WJzY#=9;(uVH{QNui$c>GC44^$az*4&v3VI zC?B}lTNj#~aIqaa8DtA%=RCn|Pen|;5z-Gf*~2#oel8Dv7dk6{DoL-+k&j!Tzf}Mc za?(~%Q)er4PxCvZQw4T?_Y4ROmlFe&ahV!X9;0g#5{)_mUWPlwkPrbxp+!NG#3zWz zfbzq)?y;yl+(%0}JRD3pi8L?gzcZ<~KC_cHJS-O81XN(CvT`-X4M(9!i>>#J#t9WL zMh}RcMTxN%c`Gp%IEiafAF#kD@UMYBM5M`K=eABDG;uYiWq+H-$6!r-)~=iWT-A9* z*{faNQ9Rt7UUG7TPHrj66q4U(oxVLBZF-*@sOmv)l@V${qC_Sv0hCZ1l1oj3gz841 z5Y$JRgGYZPB?`U~NSKu!$u(YSW~WmX8JF3-rz$JyAGAKk8K_pGQ_Kj z9noZWD=2HRqAyC zyK~~E301F8uyj^bw*z%9&Vn)PPB9B}p=9A#p0q*_y&P{G=91PgzEas6xVTWJ!uR^i zJl5X<^Fu(1z0d2=hu*k_XCY;s4u zV3#L7<0%xRr)s1X_Jmky@?`;fi&8xFIhq=uj0ivDoRw%QWT$826F^x?6!G2jxd8Ew%#hj=E|d+hNVNcGVaQ=ogJ zOpWaK>C^zHJl%Z1?Xal`JDY-d1r3_}LoiN0Equ^vqd-79|PduD%p(aJGtX zi>}S`t9d!+OZYY0w7qh`icOdr($A*@MaMfm`y`!{m;y;31l+{qmmn$Z);~~u$+R<@ z>`WG0SEtU=q{0$r#`=UYK4hkmrZ`ZVw9PvFXov?(eyS?&yzo)4Zh<$ReZ21lHB{n5 zLUx9^80ffAtJ_FMjVvoldr*(C$dcWCF@)~O#dJ++pF&;F-2KK}w!d~O!#6cBjnT@C z{F#O3;3tbgxp_C32|P^m?Rz%ey)E^5+nfs($9i`@xqa{X8f9${F@foXqGJg;7y2tF zKF!hbS;JuQyv0)!Iqd}2A*Jx1{F5ZW9w93DoO-5NBaqZbB7E4rLT#&Xg?L9H*wY?a zeqx{8l_fH=Nz*G66Svwk^E!8I0mr5AP!%)IyZY5~M9%vw)_lAi_~Siov-ENG1)0k6 zH{ZM%Ka1ItZrGUGA1ji+N%2>q@gkAYSB;%a@10D$bkfB?r{H2dPSQRwSPC?iQc~Ks zOClzK5I7!!Er!D7y&W8`kBQ!1^7jcF#zaXW)f409!0R7|_r%7HvP(+{r?h@SG^-#c z@YAx@K<$yN_dAsogYY8*WIUu0DB~axgxY1^7h#~}WfG(A-a7>l5OFF}8SBTiOyWOd zXTpcK17IHkTvTF-6U92DH0Y$(daRkK1p0mBHGjb9#ck>foNpp4=`70guP96|65 zS`9h&9}4SZVo>G6O_;M!lKtG^MU`B^K!L}EMS_B1NT<#Dl|KFZ#dCco3ItPeVd>y@ z8BI;VOasP&Bsl^lZt>VyYxQKbbeE`^mYk|SAOd|Bnj2Jcad-l^WFM8YizfrVphQ<(R_K4X%rjC|JZ}O4^LfIL5(4LA%)u`p+ z7U@fp$O?+-g!++~I49`P#U8>rMfbZs}WyP_XXt@K~$^Gr$i;c zqly-ZzhHLcqs73!e9*ghbQYY8)#3;%1pq}xOq zVM+8oOPs`e2_zz_nBEXXkwb`FNe@&r5Uqm;aV5Ia5lvr>*zbd#o$u+32Ekx_g5H~J zWusnE`aasa#{t2p-V7yx@A9hk14MMjRl&nbRtHkFPGLne?Skd3rtXr#^sm4`7b@|x zc+bb(?=`uCN%eyzson=P)_X$ke8DE2r0jp{hU)NWR_>3DEK^6GL zxcYt7)%dhAii$EAbO&OwVpRILzBnBO*zRJS!5=E)2Ph5Zs<>svMgW8#vV%Cst(M95 zco`OrY@J|>PFb64{cT;Rn#xSWQaHGs+nYbOb5ska?PVxYP?6K8IilIxt}52iT>?r6 zt(MrrX749P0vEn6m0RzXYuzi;wZ<&ZFfA<)sR(l9ZHlw4t<%l$9WyVKAz?1f_ouRa z$4{DSWKmLI_as9e!!vi>oQ9tazBt5|Pau8b^^jJazwN$L&Fn)~-!2R@Yptib2t_(G ztW=0p7M1lJ_Bv^)8OZ#r0a6J2PRRp_ym7{Ues*RkG53X#is9JdQEa!UgC3?TmX~|B zj888+=_ptuCYlxYn%dF`Zr7wJDwWQ6Pd*jJSNAHFT7&s2%)q*{7|qjTrWVysdBND6 z^h$E~S4HDKnA#^l!Ss}A%I}av&Xy_d8d=JV&h{9VWO-5FVY5@ox*LGxD<2K#&-0WK z4Z9uj2COnvbjFLFv}8YXe~6YPtn5A^f~|wjL)3c_z=xIcyJ!r-;ThS=&xOcVlBR;o z^fvFqNhyKg2MN_#sCKEWp26K)c%i$7OT+bLmbGF0yR3R~stD$o`M~rFf9sjSN^N8coPKqHd-%VR|&A%XoW{Us;&xsG2#~q>fXFXhEdrdZsJ&-`M5wT zY=8B~j31N4u_JpQ^MVkVRk3MQDap5@emsv>fII+}sxD@f;mhb2lXAen;m*7%VwA~N zJJ$;B@I&{Fh;^41_!js`jpw~&;kM(4ZCnAK?o+i@8|-JHCt3#rH$!cqS|jZIa6Sn& zj!vVe_5|PO%@8NJ7mxIQEUK_vc)=UwV~s4Aga-4e^*S83{ot6~^nvpb5f=HIIr%va zc{^3f$IbI^=H!2!nEqdwlV3Bt|98B}&qe3|g}8waHt!9F;UXXyp>lij@^Dh|aKccL z+mnaqrWMZ5bHia@#+w4PKZ8f*)r3 zzMaYDza5tVEH3b?{_XD{$jX1N>c2x)_&E9gKeF;)AuAvlGjfZp+~FQ~%*h>W!h468 z{DMvXg`V88D!-VMJAmaD41u+BN3Q$}S@{ihxr1BoXqY=4#GNDJ&I7^yiSefD1Kc7&#x=G(_P#lGI#I} zNc`tm?ra%%Obz%~1)g8s^86xY?kJi&%!m8`nw0rZCCa~L5dVeC;#Vg6Zz-9dM*QDo z7TiC5_y0yaezG3Fqa9Rye^4?vH|_tAMEehF26JWnbl?2k6%3t2-rU5I3jHbiubc+! zT*B5FW~|}*xjWfUxeIIuFG*<_A@l+QGrl~whHcO#e)lW!pWYaV15B}Uo9EDPohZK! z81x2)A$6uy|HN)L0k}P=Z2pOH@xnN`-(uW+{4hZ8_ZaLIg=M-wV<1@W`6~v?Y`@32 z`FLU3^!FGSKQ9mLns1MP#sb58{wl}Gd7B}BFUJYu0{uB=7>LFPdsx4<2dfLqw131v zT;M;)2eZZ8KIq@;3h@42PCx*b-G48~1smU=F-`&AzsAP}%NKv$11^}a=Fj$EAD6#k z{IDzevmF1eM(B^R@bSXFD1XHG|9+NS0$hLf%guWWTK-;_o0p&e&-=p7$M@IsfYpUT zm;cmtaxj8zIN|Vfdp>n@_ZP5t5+*14sc*WYp44n@VQ!P3mXe=ZBO?c=pRX@Bp8)JL Mj7~= Date: Tue, 12 Dec 2023 23:05:56 -0700 Subject: [PATCH 42/43] Update README with file comparison info --- README.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 2a09ae8..30d5bc7 100644 --- a/README.md +++ b/README.md @@ -72,16 +72,15 @@ column2 SET column3 0.01 ``` -Currently implmented validation criteria include: +Currently implemented validation criteria include: | validation_criteria | explanation | | --- | --- | -| EXACT | the values in the two columns must be exactly the same; in this case `[foo,bar] != [bar,foo]` | -| SET | the values in the two columns must be the same set of values; in this case `[foo,bar] == [bar,foo]` | -| \ | the values in the two columns must be within `*100` of each other; e.g., 0.3 -> 30% difference allowed | -| IGNORE | the values in the two columns are assumed to match; in this case `foo == bar` | +| EXACT | The values in the two columns must be exactly the same; in this case `[foo,bar] != [bar,foo]`. When applied to columns referencing files, file contents will be compared to check if they are identical.| +| SET | The values in the two columns must be the same set of values; in this case `[foo,bar] == [bar,foo]`. When applied to columns referencing files, the lines within the files will be sorted alphabetically before comparing.| +| \ | The values in the two columns must be within `*100` of each other; e.g., 0.3 -> 30% difference allowed. | +| IGNORE | The values in the two columns are assumed to match; in this case `foo == bar`. | -Future comparisons to include `FILE-EXACT`, `FILE-SET`, `FILE-`. #### Optional: `column_translation` @@ -149,3 +148,6 @@ This file (available as an HTML and PDF) is a summary of the differences between - the number of samples failing the validation criteria If a `validation_criteria.tsv` file was provided, a definition of the (currently implemented) validation criteria are provided at the bottom of the table + +#### `__diff.txt` +Shows the differing lines within mismatching files for a given sample and column. Each pair of mismatching files generates a separate file. \ No newline at end of file From 69afb0bccec902063356e780daa16515f6ac8875 Mon Sep 17 00:00:00 2001 From: sam-baird Date: Sat, 27 Jan 2024 00:06:33 +0000 Subject: [PATCH 43/43] Use shallow=False for filecmp.cmp() --- theiavalidate/Validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py index 8fdd3a7..5c66144 100644 --- a/theiavalidate/Validator.py +++ b/theiavalidate/Validator.py @@ -260,7 +260,7 @@ def compare_files(self, file_df1, file_df2): elif (not pd.isnull(uri1) and not pd.isnull(uri2)): file1 = os.path.join(self.table1_files_dir, uri1.removeprefix("gs://")) file2 = os.path.join(self.table2_files_dir, uri2.removeprefix("gs://")) - is_match = filecmp.cmp(file1, file2) + is_match = filecmp.cmp(file1, file2, shallow=False) self.file_exact_matches.loc[row, col] = is_match if is_match: # don't add URIs to exact differences table if files match