From 7ece335e98c5830453b4e36260ab777a3b989924 Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Thu, 7 Dec 2023 23:50:24 -0700
Subject: [PATCH 01/43] Fix reference to table files

---
 theiavalidate/Validator.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index c0dda71..ff92109 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -5,6 +5,7 @@
 import os
 import pandas as pd
 import pdfkit as pdf
+import subprocess
 import sys
 
 class Validator:
@@ -317,6 +318,18 @@ def compare(self):
     
     self.logger.info("Counting how many cells have values")
     self.count_populated_cells()
+
+    # test localizing files to compare using gcloud storage
+      # create directories for holding files to compare
+    dir1 = f"{self.table1_name}/"
+    dir2 = f"{self.table2_name}/"
+    os.mkdir(dir1)
+    os.mkdir(dir2)
+
+    # localize files to compare
+    # TODO map gs:// URI to local path
+    self.table1.apply(localize_files, dir=dir1, axis=1)
+    self.table2.apply(localize_files, dir=dir2, axis=1)
     
     self.logger.info("Performing an exact string match")
     self.perform_exact_match()
@@ -329,4 +342,9 @@ def compare(self):
     self.make_pdf_report()
     
     self.logger.info("Done!")
-    
\ No newline at end of file
+
+def localize_files(row, dir):
+  for value in row:
+    if isinstance(value, str) and value.startswith("gs://"):
+      # copy file to local directory
+      subprocess.run(["gcloud", "storage", "cp", value, dir])
\ No newline at end of file

From f9942d4d469ba9a555dac6bcf8c312e74ed8a007 Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Fri, 8 Dec 2023 19:17:08 -0700
Subject: [PATCH 02/43] Remove test localize files code

---
 theiavalidate/Validator.py | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index ff92109..c9f8386 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -319,18 +319,6 @@ def compare(self):
     self.logger.info("Counting how many cells have values")
     self.count_populated_cells()
 
-    # test localizing files to compare using gcloud storage
-      # create directories for holding files to compare
-    dir1 = f"{self.table1_name}/"
-    dir2 = f"{self.table2_name}/"
-    os.mkdir(dir1)
-    os.mkdir(dir2)
-
-    # localize files to compare
-    # TODO map gs:// URI to local path
-    self.table1.apply(localize_files, dir=dir1, axis=1)
-    self.table2.apply(localize_files, dir=dir2, axis=1)
-    
     self.logger.info("Performing an exact string match")
     self.perform_exact_match()
 
@@ -342,9 +330,3 @@ def compare(self):
     self.make_pdf_report()
     
     self.logger.info("Done!")
-
-def localize_files(row, dir):
-  for value in row:
-    if isinstance(value, str) and value.startswith("gs://"):
-      # copy file to local directory
-      subprocess.run(["gcloud", "storage", "cp", value, dir])
\ No newline at end of file

From 6d3cbbd584bb3815362d5b7a5d863fa5fae7527d Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Sat, 9 Dec 2023 05:49:56 +0000
Subject: [PATCH 03/43] Retain path while localizing files

---
 .gitignore                 |  5 ++++-
 theiavalidate/Validator.py | 24 ++++++++++++++++++++++++
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index cb5fc37..7de3fa6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -160,4 +160,7 @@ cython_debug/
 #.idea/
 
 # IDE
-.vscode/
\ No newline at end of file
+.vscode/
+
+# testing files
+sandbox/
\ No newline at end of file
diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index c9f8386..9eb1266 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -319,6 +319,19 @@ def compare(self):
     self.logger.info("Counting how many cells have values")
     self.count_populated_cells()
 
+    # test localizing files to compare using gcloud storage
+      # create directories for holding files to compare
+    dir1 = f"{self.table1_name}/"
+    dir2 = f"{self.table2_name}/"
+    os.mkdir(dir1)
+    os.mkdir(dir2)
+
+    # localize files to compare
+    self.table1.apply(localize_files, dir=dir1, axis=1)
+    self.table2.apply(localize_files, dir=dir2, axis=1)
+
+    subprocess.run(["ls", "-R", "compare_files"])
+    
     self.logger.info("Performing an exact string match")
     self.perform_exact_match()
 
@@ -330,3 +343,14 @@ def compare(self):
     self.make_pdf_report()
     
     self.logger.info("Done!")
+
+def localize_files(row, dir):
+  for value in row:
+    if isinstance(value, str) and value.startswith("gs://"):
+      # copy files to to compare_files/ directory
+      # it would be much faster to copy them all at once, but any files with
+      # the same name would be clobbered, so create local directories matching
+      # gsutil path and loop to copy
+      destination_path = os.path.dirname(value[5:])
+      os.mkdirs(os.path.join("./compare_files/", destination_path)
+      subprocess.run(["gsutil", "-m", "cp", value, destination_path])
\ No newline at end of file

From 8975b9cded3e2ccc4dbfb0625b1dbe033f02bfc8 Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Fri, 8 Dec 2023 23:21:07 -0700
Subject: [PATCH 04/43] Change directory names

---
 theiavalidate/Validator.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index 9eb1266..47b4a8b 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -321,8 +321,8 @@ def compare(self):
 
     # test localizing files to compare using gcloud storage
       # create directories for holding files to compare
-    dir1 = f"{self.table1_name}/"
-    dir2 = f"{self.table2_name}/"
+    dir1 = f"{self.table1_name}_files/"
+    dir2 = f"{self.table2_name}_files/"
     os.mkdir(dir1)
     os.mkdir(dir2)
 
@@ -344,7 +344,7 @@ def compare(self):
     
     self.logger.info("Done!")
 
-def localize_files(row, dir):
+def localize_files(row, directory):
   for value in row:
     if isinstance(value, str) and value.startswith("gs://"):
       # copy files to to compare_files/ directory
@@ -352,5 +352,5 @@ def localize_files(row, dir):
       # the same name would be clobbered, so create local directories matching
       # gsutil path and loop to copy
       destination_path = os.path.dirname(value[5:])
-      os.mkdirs(os.path.join("./compare_files/", destination_path)
+      os.makedirs(os.path.join(directory, destination_path))
       subprocess.run(["gsutil", "-m", "cp", value, destination_path])
\ No newline at end of file

From 5692fb500a7ebd9f08ed4605742c74e3883af7c6 Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Sat, 9 Dec 2023 00:35:02 -0700
Subject: [PATCH 05/43] Fix bugs with localizing files

---
 theiavalidate/Validator.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index 47b4a8b..60c7c20 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -319,18 +319,17 @@ def compare(self):
     self.logger.info("Counting how many cells have values")
     self.count_populated_cells()
 
-    # test localizing files to compare using gcloud storage
-      # create directories for holding files to compare
-    dir1 = f"{self.table1_name}_files/"
-    dir2 = f"{self.table2_name}_files/"
+    dir1 = f"table1_files/"
+    dir2 = f"table2_files/"
     os.mkdir(dir1)
     os.mkdir(dir2)
 
     # localize files to compare
-    self.table1.apply(localize_files, dir=dir1, axis=1)
-    self.table2.apply(localize_files, dir=dir2, axis=1)
+    self.table1.apply(localize_files, directory=dir1, axis=1)
+    self.table2.apply(localize_files, directory=dir2, axis=1)
 
-    subprocess.run(["ls", "-R", "compare_files"])
+    subprocess.run(["ls", "-R", dir1])
+    subprocess.run(["ls", "-R", dir2])
     
     self.logger.info("Performing an exact string match")
     self.perform_exact_match()
@@ -351,6 +350,7 @@ def localize_files(row, directory):
       # it would be much faster to copy them all at once, but any files with
       # the same name would be clobbered, so create local directories matching
       # gsutil path and loop to copy
-      destination_path = os.path.dirname(value[5:])
-      os.makedirs(os.path.join(directory, destination_path))
-      subprocess.run(["gsutil", "-m", "cp", value, destination_path])
\ No newline at end of file
+      remote_path = os.path.dirname(value[5:])  # exclude 'gs://' prefix
+      destination_path = os.path.join(directory, remote_path)
+      os.makedirs(destination_path)
+      subprocess.run(["gsutil", "-m", "cp", value, destination_path])

From 205c68abfbf306eeeef2e27948706cf7254fd9c9 Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Sat, 9 Dec 2023 13:53:11 -0700
Subject: [PATCH 06/43] Determine columns with GCP URIs

---
 theiavalidate/Validator.py     | 35 +++++++++++++++++++++++++++++-----
 theiavalidate/theiavalidate.py |  9 +++++++--
 2 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index 60c7c20..b1fd3d0 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -39,7 +39,8 @@ def __init__(self, options):
     self.validation_criteria = options.validation_criteria
     self.columns_to_compare = options.columns_to_compare
     self.columns_to_compare.append("samples")
-    
+    self.file_columns = set()  # columns that contain GCP URIs to files
+
     self.output_prefix = options.output_prefix
     self.na_values = options.na_values
       
@@ -135,6 +136,29 @@ def count_populated_cells(self):
     self.logger.debug("Creating the summary table with the number of populated cells")
     self.summary_output = pd.concat([table1_populated_rows, table2_populated_rows], join="outer", axis=1)
   
+
+  """
+  This function determines columns with GCP URIs for file comparisons so that
+  they are excluded from regular comparisons and instead use filecmp to compare
+  the downloaded files
+  """
+  def determine_file_columns(self):
+    for df in [self.table1, self.table2]:
+      # select columns with at least one GCP URI among nulls
+      file_columns = df.columns[(df.apply(lambda x: x.astype(str).str.startswith("gs://")
+                                          | x.isnull()).all())
+                                & (~df.isnull().all())]
+
+      file_columns = file_columns.tolist()
+      self.file_columns.update(file_columns)
+
+    # Ensure file_columns set only has GCP URIs and nulls
+    for df in [self.table1, self.table2]:
+      remove_columns = df.columns[df.apply(lambda x: x.astype(str).str.startswith("gs://")
+                                          | x.isnull().all())]
+      remove_columns = set(remove_columns.tolist())
+      self.file_columns = self.file_columns - remove_columns
+
   """
   This function performs an exact match and creates and Excel file that contains the exact match differences
   """
@@ -319,17 +343,18 @@ def compare(self):
     self.logger.info("Counting how many cells have values")
     self.count_populated_cells()
 
+    self.logger.info("Determining columns for file comparisons")
+    self.determine_file_columns()
+
     dir1 = f"table1_files/"
     dir2 = f"table2_files/"
     os.mkdir(dir1)
     os.mkdir(dir2)
 
-    # localize files to compare
+    self.logger.info("Localizing files to compare...")
     self.table1.apply(localize_files, directory=dir1, axis=1)
     self.table2.apply(localize_files, directory=dir2, axis=1)
 
-    subprocess.run(["ls", "-R", dir1])
-    subprocess.run(["ls", "-R", dir2])
     
     self.logger.info("Performing an exact string match")
     self.perform_exact_match()
@@ -350,7 +375,7 @@ def localize_files(row, directory):
       # it would be much faster to copy them all at once, but any files with
       # the same name would be clobbered, so create local directories matching
       # gsutil path and loop to copy
-      remote_path = os.path.dirname(value[5:])  # exclude 'gs://' prefix
+      remote_path = os.path.dirname(value.removeprefix('gs://'))
       destination_path = os.path.join(directory, remote_path)
       os.makedirs(destination_path)
       subprocess.run(["gsutil", "-m", "cp", value, destination_path])
diff --git a/theiavalidate/theiavalidate.py b/theiavalidate/theiavalidate.py
index bbdc82d..c04ae3b 100644
--- a/theiavalidate/theiavalidate.py
+++ b/theiavalidate/theiavalidate.py
@@ -5,6 +5,11 @@
 from __init__ import __VERSION__
 from Validator import Validator
 
+DEFAULT_NA_VALUES = [
+  '-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a',
+  '', '#NA', 'NULL', 'null', 'NaN','-NaN', 'nan', '-nan', 'None'
+]
+
 def main():
   parser = argparse.ArgumentParser(
     description = "This tool compares two tab-delimited files and outputs a report of the differences between the two files.",
@@ -25,8 +30,8 @@ def main():
   parser.add_argument("-o", "--output_prefix", 
                       help="the output file name prefix\ndo not include any spaces", default="theiavalidate", metavar="\b")
   parser.add_argument("-n", "--na_values", 
-                      help="the values that should be considered NA\ndefault values = ['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a', '', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', 'None']", 
-                      default= ['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a', '', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', 'None'], metavar="\b", type=int)
+                      help=f"the values that should be considered NA\ndefault values = {DEFAULT_NA_VALUES}", 
+                      default=DEFAULT_NA_VALUES, metavar="\b", type=int)
   parser.add_argument("--verbose", 
                       help="increase stdout verbosity", action="store_true", default=False)
   parser.add_argument("--debug", 

From 7c02221f708491d2ff2abf25bec6e3d1978c6dd0 Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Sat, 9 Dec 2023 15:25:22 -0700
Subject: [PATCH 07/43] Add unit tests for determine_file_columns()

---
 .devcontainer/devcontainer.json |  26 ++++++++
 __init__.py                     |   0
 tests/__init__.py               |   0
 tests/test_validator.py         | 109 ++++++++++++++++++++++++++++++++
 theiavalidate/Validator.py      |   6 +-
 theiavalidate/theiavalidate.py  |   6 +-
 6 files changed, 142 insertions(+), 5 deletions(-)
 create mode 100644 .devcontainer/devcontainer.json
 create mode 100644 __init__.py
 create mode 100644 tests/__init__.py
 create mode 100644 tests/test_validator.py

diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 0000000..8d96444
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,26 @@
+// For format details, see https://aka.ms/devcontainer.json. For config options, see the
+// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
+{
+	"name": "Existing Dockerfile",
+	"build": {
+		// Sets the run context to one level up instead of the .devcontainer folder.
+		"context": "..",
+		// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
+		"dockerfile": "../Dockerfile"
+	}
+
+	// Features to add to the dev container. More info: https://containers.dev/features.
+	// "features": {},
+
+	// Use 'forwardPorts' to make a list of ports inside the container available locally.
+	// "forwardPorts": [],
+
+	// Uncomment the next line to run commands after the container is created.
+	// "postCreateCommand": "cat /etc/os-release",
+
+	// Configure tool-specific properties.
+	// "customizations": {},
+
+	// Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
+	// "remoteUser": "devcontainer"
+}
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_validator.py b/tests/test_validator.py
new file mode 100644
index 0000000..224cc07
--- /dev/null
+++ b/tests/test_validator.py
@@ -0,0 +1,109 @@
+from theiavalidate.Validator import Validator
+from theiavalidate.theiavalidate import DEFAULT_NA_VALUES
+
+import numpy as np
+import pandas as pd
+import unittest
+
+class MockOptions:
+  """
+  Mock the "options" object that is created in theiavalidate.py. In
+  theiavalidate.py, this object is created from command-line arguments using
+  the argparse package, but here we will simulate this object with a
+  different class to more easily create Validator objects.
+  """
+  def __init__(self, options_dict=None):
+    # defaults
+    self.table1 = None
+    self.table2 = None
+    self.version = None
+    self.columns_to_compare = []
+    self.validation_criteria = None
+    self.column_translation = None
+    self.output_prefix = None
+    self.na_values = DEFAULT_NA_VALUES
+    self.verbose = False
+    self.debug = False
+
+    # overwrite defaults with options_dict
+    if options_dict is not None:
+      for key, value in options_dict.items():
+        setattr(self, key, value)
+
+
+class TestDetermineFileColumns(unittest.TestCase):
+    def setUp(self):
+      self.validator = Validator(MockOptions())
+
+    def run_determine_file_columns(self, data1, data2):
+      self.validator.table1 = pd.DataFrame(data1)
+      self.validator.table2 = pd.DataFrame(data2)
+      self.validator.determine_file_columns()
+
+    def test_no_file_columns(self):
+      data = {
+        "col1": [1, 2, 3],
+        "col2": ["foo", "bar", "baz"]
+      }
+      self.run_determine_file_columns(data, data)
+      self.assertEqual(len(self.validator.file_columns), 0)
+
+    def test_some_file_columns(self):
+      data1 = {
+        "col1": [1, 2, 3],
+        "col2": ["gs://foo", "gs://bar", "gs://baz"]
+      }
+      data2 = {
+        "col1": [1, 2, 3],
+        "col2": ["gs://eggs", "gs://spam", "gs://monty"]
+      }
+      self.run_determine_file_columns(data1, data2)
+      self.assertEqual(self.validator.file_columns, {"col2"})
+
+    def test_missing_uri(self):
+      data1 = {
+        "col1": [1, 2, 3],
+        "col2": ["gs://foo", np.nan, "gs://baz"]
+      }
+      data2 = {
+        "col1": [1, 2, 3],
+        "col2": ["gs://eggs", "gs://spam", "gs://monty"]
+      }
+      self.run_determine_file_columns(data1, data2)
+      self.assertEqual(self.validator.file_columns, {"col2"})
+
+    def test_both_columns_null(self):
+      data1 = {
+        "col1": ["gs://foo", "gs://bar", "gs://baz"],
+        "col2": [np.nan, np.nan, np.nan]
+      }
+      data2 = {
+        "col1": ["gs://eggs", "gs://spam", "gs://monty"],
+        "col2": [np.nan, np.nan, np.nan]
+      }
+      self.run_determine_file_columns(data1, data2)
+      self.assertEqual(self.validator.file_columns, {"col1"})
+
+    def test_one_column_null(self):
+      data1 = {
+        "col1": ["gs://foo", "gs://bar", "gs://baz"],
+        "col2": ["gs://x", "gs://y", "gs://z"]
+      }
+      data2 = {
+        "col1": ["gs://eggs", "gs://spam", "gs://monty"],
+        "col2": [np.nan, np.nan, np.nan]
+      }
+      self.run_determine_file_columns(data1, data2)
+      self.assertEqual(self.validator.file_columns, {"col1", "col2"})
+
+    def test_one_column_not_null(self):
+      data1 = {
+        "col1": ["gs://foo", "gs://bar", "gs://baz"],
+        "col2": ["gs://x", "gs://y", "gs://z"]
+      }
+      data2 = {
+        "col1": ["gs://eggs", "gs://spam", "gs://monty"],
+        "col2": [1, 2, 3]
+      }
+      self.run_determine_file_columns(data1, data2)
+      self.assertEqual(self.validator.file_columns, {"col1"})
\ No newline at end of file
diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index b1fd3d0..a3e29f7 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -154,8 +154,10 @@ def determine_file_columns(self):
 
     # Ensure file_columns set only has GCP URIs and nulls
     for df in [self.table1, self.table2]:
-      remove_columns = df.columns[df.apply(lambda x: x.astype(str).str.startswith("gs://")
-                                          | x.isnull().all())]
+      remove_columns = df.columns[~(df.apply(lambda x: x.astype(str).str.startswith('gs://')
+                                             | x.isnull()).all())]
+
+# Convert the Index object to a set
       remove_columns = set(remove_columns.tolist())
       self.file_columns = self.file_columns - remove_columns
 
diff --git a/theiavalidate/theiavalidate.py b/theiavalidate/theiavalidate.py
index c04ae3b..f18ff5e 100644
--- a/theiavalidate/theiavalidate.py
+++ b/theiavalidate/theiavalidate.py
@@ -1,9 +1,9 @@
 #!/usr/bin/env python3
 
 import argparse
-import CheckInputs
-from __init__ import __VERSION__
-from Validator import Validator
+from . import CheckInputs
+from .__init__ import __VERSION__
+from .Validator import Validator
 
 DEFAULT_NA_VALUES = [
   '-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a',

From 8e4a3e3b57eca9fd8565c91ac3e80892fa892c5f Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Sat, 9 Dec 2023 15:34:57 -0700
Subject: [PATCH 08/43] Only apply localize_files to file columns

---
 tests/test_validator.py    |  3 ++-
 theiavalidate/Validator.py | 12 ++++++------
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/tests/test_validator.py b/tests/test_validator.py
index 224cc07..3db1996 100644
--- a/tests/test_validator.py
+++ b/tests/test_validator.py
@@ -106,4 +106,5 @@ def test_one_column_not_null(self):
         "col2": [1, 2, 3]
       }
       self.run_determine_file_columns(data1, data2)
-      self.assertEqual(self.validator.file_columns, {"col1"})
\ No newline at end of file
+      self.assertEqual(self.validator.file_columns, {"col1"})
+      
\ No newline at end of file
diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index a3e29f7..c5fb2f0 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -140,7 +140,7 @@ def count_populated_cells(self):
   """
   This function determines columns with GCP URIs for file comparisons so that
   they are excluded from regular comparisons and instead use filecmp to compare
-  the downloaded files
+  the downloaded files.
   """
   def determine_file_columns(self):
     for df in [self.table1, self.table2]:
@@ -157,7 +157,7 @@ def determine_file_columns(self):
       remove_columns = df.columns[~(df.apply(lambda x: x.astype(str).str.startswith('gs://')
                                              | x.isnull()).all())]
 
-# Convert the Index object to a set
+      # Convert the Index object to a set
       remove_columns = set(remove_columns.tolist())
       self.file_columns = self.file_columns - remove_columns
 
@@ -348,14 +348,14 @@ def compare(self):
     self.logger.info("Determining columns for file comparisons")
     self.determine_file_columns()
 
-    dir1 = f"table1_files/"
-    dir2 = f"table2_files/"
+    dir1 = "table1_files/"
+    dir2 = "table2_files/"
     os.mkdir(dir1)
     os.mkdir(dir2)
 
     self.logger.info("Localizing files to compare...")
-    self.table1.apply(localize_files, directory=dir1, axis=1)
-    self.table2.apply(localize_files, directory=dir2, axis=1)
+    self.table1[self.file_columns].apply(localize_files, directory=dir1)
+    self.table2[self.file_columns].apply(localize_files, directory=dir2)
 
     
     self.logger.info("Performing an exact string match")

From c1c5c3bb98bae5631836e27211c4cc103f232aaf Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Sat, 9 Dec 2023 20:07:44 -0700
Subject: [PATCH 09/43] Perform exact file comparisons

---
 theiavalidate/Validator.py | 77 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 73 insertions(+), 4 deletions(-)

diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index c5fb2f0..0d59f77 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -1,5 +1,8 @@
 from datetime import date
 from pretty_html_table import build_table
+
+import difflib
+import filecmp
 import logging
 import numpy as np
 import os
@@ -12,6 +15,7 @@ class Validator:
   """
   This class runs the parsing module for theiavalidate
   """
+  NUMBER_OF_DIFFERENCES_COLUMN_HEADER = 
   def __init__(self, options):
     logging.basicConfig(encoding='utf-8', level=logging.ERROR, stream=sys.stderr)
     self.logger = logging.getLogger(__name__)
@@ -166,30 +170,96 @@ def determine_file_columns(self):
   """
   def perform_exact_match(self):
     self.logger.debug("Performing an exact match and removing the sample name column")
+
+    if self.file_columns:
+      # exclude file_columns for string comparison
+      table1 = self.table1.drop(list(self.file_columns), axis=1)
+      table2 = self.table2.drop(list(self.file_columns), axis=1)
+
+      # handle file comparisons separately from strings
+
+      # TODO: set index to samples column in main table earlier?
+      files_df1 = self.table1.set_index("samples") 
+      files_df2 = self.table2.set_index("samples")
+      files_df1 = files_df1[list(self.file_columns)]
+      files_df2 = files_df2[list(self.file_columns)]
+      file_number_of_differences = compare_files(files_df1, files_df2)
+    else:
+      table1 = self.table1
+      table2 = self.table2
+    
     # count the number of differences using exact string matches
     # temporarily make NaNs null since NaN != NaN for the pd.DataFrame.eq() function
     # also: remove the samplename row
-    number_of_differences = pd.DataFrame((~self.table1.fillna("NULL").astype(str).eq(self.table2.fillna("NULL").astype(str))).sum(), columns = ["Number of differences (exact match)"])
+    number_of_differences = pd.DataFrame((~table1.fillna("NULL").astype(str).eq(table2.fillna("NULL").astype(str))).sum(), columns = ["Number of differences (exact match)"])
+
     number_of_differences.drop("samples", axis=0, inplace=True)
+
     
     # add the number of differences to the summary output table
     self.logger.debug("Adding the number of exact match differences to the summary table")
     self.summary_output = pd.concat([self.summary_output, number_of_differences], join="outer", axis=1)
     
+
     # get a table of self-other differences
     # also: temporarily drop the sample name column for comparison and then set it as the index for the output data frame
     self.logger.debug("Creating a table of self-other differences")
-    exact_differences_table = self.table1.drop("samples", axis=1).compare(self.table2.drop("samples", axis=1), keep_shape=True).set_index(self.table1["samples"])
+    exact_differences_table = table1.drop("samples", axis=1).compare(table2.drop("samples", axis=1), keep_shape=True).set_index(table1["samples"])
     # rename the self and other with the table names
     self.logger.debug("Renaming the self and other to be the table names")
     exact_differences_table.rename(columns={"self": self.table1_name, "other": self.table2_name}, level=-1, inplace=True)
     # replace matching values (NAs) with blanks
     self.logger.debug("Replacing all NA values with blanks")
     exact_differences_table.replace(np.nan, "", inplace=True)
+
     
     self.logger.debug("Writing the self-other differences table to a TSV file")
     exact_differences_table.to_csv(self.output_prefix + "_exact_differences.tsv", sep="\t", index=True)
 
+
+    def compare_files(df1, df2):
+      comparison_df = pd.DataFrame(index=df1.index, columns=df1.columns)
+
+      for row in df1.index:
+        for col in df1.columns:
+          uri1 = df1.loc[row, col]
+          uri2 = df2.loc[row, col]
+          file1 = os.path.join("table1_files", uri1.removeprefix("gs://"))
+          file2 = os.path.join("table2_files", uri2.removeprefix("gs://"))
+          if pd.isnull(file1) and pd.isnull(file2):
+            # count two nulls as matching
+            comparison_df.loc[row, col] = True
+          elif (not pd.isnull(file1) and not pd.isnull(file2)):
+            is_match = filecmp.cmp(file1, file2)
+            comparison_df.loc[row, col] = is_match
+            if not is_match:
+              output_filename = f"{row}_{col}_diff.txt"
+              create_diff(file1, file2, output_filename)
+          else:
+            # count as not matching if pair is missing
+            comparison_df.loc[row, col] = False
+        
+        number_of_differences = pd.DataFrame(columns=["Number of differences (exact match)"])
+        for col in number_of_differences.columns:
+          count = comparison_df[col].dropna().ne(True).sum()
+          number_of_differences.loc[col] = count
+
+        return number_of_differences
+
+    def create_diff(file1, file2, output_filename):
+      # create unified diff
+      with open(file1, "r") as f1, open(file2, "r") as f2:
+        diff = difflib.unified_diff(
+          f1.readlines(),
+          f2.readlines(),
+          fromfile=file1,
+          tofile=file2,
+          lineterm='',
+        )
+        diff = "".join(diff)
+        with open(output_filename, "w") as out:
+          out.write(diff)
+
   """
   This function calculates the percent difference between two values
   """
@@ -356,7 +426,6 @@ def compare(self):
     self.logger.info("Localizing files to compare...")
     self.table1[self.file_columns].apply(localize_files, directory=dir1)
     self.table2[self.file_columns].apply(localize_files, directory=dir2)
-
     
     self.logger.info("Performing an exact string match")
     self.perform_exact_match()
@@ -377,7 +446,7 @@ def localize_files(row, directory):
       # it would be much faster to copy them all at once, but any files with
       # the same name would be clobbered, so create local directories matching
       # gsutil path and loop to copy
-      remote_path = os.path.dirname(value.removeprefix('gs://'))
+      remote_path = os.path.dirname(value.removeprefix("gs://"))
       destination_path = os.path.join(directory, remote_path)
       os.makedirs(destination_path)
       subprocess.run(["gsutil", "-m", "cp", value, destination_path])

From cf4c0ff87b9acc4aa687c2e903b1273125314da9 Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Sat, 9 Dec 2023 20:21:34 -0700
Subject: [PATCH 10/43] Fix typos in Validator

---
 tests/test_validator.py    |  6 +++++-
 theiavalidate/Validator.py | 14 +++++++-------
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/tests/test_validator.py b/tests/test_validator.py
index 3db1996..c16bb95 100644
--- a/tests/test_validator.py
+++ b/tests/test_validator.py
@@ -107,4 +107,8 @@ def test_one_column_not_null(self):
       }
       self.run_determine_file_columns(data1, data2)
       self.assertEqual(self.validator.file_columns, {"col1"})
-      
\ No newline at end of file
+
+
+class TestCompareFiles(unittest.TestCase):
+  def setUp(self):
+    self.validator = Validator(MockOptions())
\ No newline at end of file
diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index 0d59f77..df41bcb 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -15,7 +15,6 @@ class Validator:
   """
   This class runs the parsing module for theiavalidate
   """
-  NUMBER_OF_DIFFERENCES_COLUMN_HEADER = 
   def __init__(self, options):
     logging.basicConfig(encoding='utf-8', level=logging.ERROR, stream=sys.stderr)
     self.logger = logging.getLogger(__name__)
@@ -217,13 +216,14 @@ def perform_exact_match(self):
     exact_differences_table.to_csv(self.output_prefix + "_exact_differences.tsv", sep="\t", index=True)
 
 
-    def compare_files(df1, df2):
-      comparison_df = pd.DataFrame(index=df1.index, columns=df1.columns)
+    def compare_files(file_df1, file_df2):
+      comparison_df = pd.DataFrame(index=file_df1.index,
+                                   columns=file_df1.columns)
 
-      for row in df1.index:
-        for col in df1.columns:
-          uri1 = df1.loc[row, col]
-          uri2 = df2.loc[row, col]
+      for row in file_df1.index:
+        for col in file_df1.columns:
+          uri1 = file_df1.loc[row, col]
+          uri2 = file_df2.loc[row, col]
           file1 = os.path.join("table1_files", uri1.removeprefix("gs://"))
           file2 = os.path.join("table2_files", uri2.removeprefix("gs://"))
           if pd.isnull(file1) and pd.isnull(file2):

From dd6de05bacc8c2c9ae8dcc8c338a35b74534f9cd Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Sat, 9 Dec 2023 21:09:56 -0700
Subject: [PATCH 11/43] Fix indentation

---
 theiavalidate/Validator.py | 86 +++++++++++++++++++-------------------
 1 file changed, 43 insertions(+), 43 deletions(-)

diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index df41bcb..913e9f0 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -216,49 +216,49 @@ def perform_exact_match(self):
     exact_differences_table.to_csv(self.output_prefix + "_exact_differences.tsv", sep="\t", index=True)
 
 
-    def compare_files(file_df1, file_df2):
-      comparison_df = pd.DataFrame(index=file_df1.index,
-                                   columns=file_df1.columns)
-
-      for row in file_df1.index:
-        for col in file_df1.columns:
-          uri1 = file_df1.loc[row, col]
-          uri2 = file_df2.loc[row, col]
-          file1 = os.path.join("table1_files", uri1.removeprefix("gs://"))
-          file2 = os.path.join("table2_files", uri2.removeprefix("gs://"))
-          if pd.isnull(file1) and pd.isnull(file2):
-            # count two nulls as matching
-            comparison_df.loc[row, col] = True
-          elif (not pd.isnull(file1) and not pd.isnull(file2)):
-            is_match = filecmp.cmp(file1, file2)
-            comparison_df.loc[row, col] = is_match
-            if not is_match:
-              output_filename = f"{row}_{col}_diff.txt"
-              create_diff(file1, file2, output_filename)
-          else:
-            # count as not matching if pair is missing
-            comparison_df.loc[row, col] = False
-        
-        number_of_differences = pd.DataFrame(columns=["Number of differences (exact match)"])
-        for col in number_of_differences.columns:
-          count = comparison_df[col].dropna().ne(True).sum()
-          number_of_differences.loc[col] = count
-
-        return number_of_differences
-
-    def create_diff(file1, file2, output_filename):
-      # create unified diff
-      with open(file1, "r") as f1, open(file2, "r") as f2:
-        diff = difflib.unified_diff(
-          f1.readlines(),
-          f2.readlines(),
-          fromfile=file1,
-          tofile=file2,
-          lineterm='',
-        )
-        diff = "".join(diff)
-        with open(output_filename, "w") as out:
-          out.write(diff)
+  def compare_files(file_df1, file_df2):
+    comparison_df = pd.DataFrame(index=file_df1.index,
+                                  columns=file_df1.columns)
+
+    for row in file_df1.index:
+      for col in file_df1.columns:
+        uri1 = file_df1.loc[row, col]
+        uri2 = file_df2.loc[row, col]
+        file1 = os.path.join("table1_files", uri1.removeprefix("gs://"))
+        file2 = os.path.join("table2_files", uri2.removeprefix("gs://"))
+        if pd.isnull(file1) and pd.isnull(file2):
+          # count two nulls as matching
+          comparison_df.loc[row, col] = True
+        elif (not pd.isnull(file1) and not pd.isnull(file2)):
+          is_match = filecmp.cmp(file1, file2)
+          comparison_df.loc[row, col] = is_match
+          if not is_match:
+            output_filename = f"{row}_{col}_diff.txt"
+            create_diff(file1, file2, output_filename)
+        else:
+          # count as not matching if pair is missing
+          comparison_df.loc[row, col] = False
+      
+      number_of_differences = pd.DataFrame(columns=["Number of differences (exact match)"])
+      for col in number_of_differences.columns:
+        count = comparison_df[col].dropna().ne(True).sum()
+        number_of_differences.loc[col] = count
+
+      return number_of_differences
+
+  def create_diff(file1, file2, output_filename):
+    # create unified diff
+    with open(file1, "r") as f1, open(file2, "r") as f2:
+      diff = difflib.unified_diff(
+        f1.readlines(),
+        f2.readlines(),
+        fromfile=file1,
+        tofile=file2,
+        lineterm='',
+      )
+      diff = "".join(diff)
+      with open(output_filename, "w") as out:
+        out.write(diff)
 
   """
   This function calculates the percent difference between two values

From 2b2d6d252d12747337f0cf5d3b2b16b5f26083e9 Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Sat, 9 Dec 2023 21:11:26 -0700
Subject: [PATCH 12/43] Add self as argument to methods

---
 theiavalidate/Validator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index 913e9f0..328ca88 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -216,7 +216,7 @@ def perform_exact_match(self):
     exact_differences_table.to_csv(self.output_prefix + "_exact_differences.tsv", sep="\t", index=True)
 
 
-  def compare_files(file_df1, file_df2):
+  def compare_files(self, file_df1, file_df2):
     comparison_df = pd.DataFrame(index=file_df1.index,
                                   columns=file_df1.columns)
 
@@ -246,7 +246,7 @@ def compare_files(file_df1, file_df2):
 
       return number_of_differences
 
-  def create_diff(file1, file2, output_filename):
+  def create_diff(self, file1, file2, output_filename):
     # create unified diff
     with open(file1, "r") as f1, open(file2, "r") as f2:
       diff = difflib.unified_diff(

From 901c2805fb705d43dc713eb01e7583254f90facb Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Sat, 9 Dec 2023 21:20:28 -0700
Subject: [PATCH 13/43] Add directory to store files as Validator init variable

---
 tests/test_validator.py    | 30 +++++++++++++++++++++++++++++-
 theiavalidate/Validator.py | 14 ++++++++------
 2 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/tests/test_validator.py b/tests/test_validator.py
index c16bb95..678811a 100644
--- a/tests/test_validator.py
+++ b/tests/test_validator.py
@@ -111,4 +111,32 @@ def test_one_column_not_null(self):
 
 class TestCompareFiles(unittest.TestCase):
   def setUp(self):
-    self.validator = Validator(MockOptions())
\ No newline at end of file
+    self.validator = Validator(MockOptions())
+    self.file_comparison_dir = "tests/file1_files"
+    self.file_comparison_dir = "tests/file2_files"
+
+  def test_matching_files(self):
+    df1 = pd.DataFrame({
+      "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-2.txt"],
+      "col2": ["gs://match2-1.txt", "gs://match2-2.txt", "gs://match2-3.txt"]
+    })
+    df2 = pd.DataFrame({
+      "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-2.txt"],
+      "col2": ["gs://match2-1.txt", "gs://match2-2.txt", "gs://match2-3.txt"]
+    })
+    observed = self.validator.compare_files(df1, df2)
+    expected = pd.DataFrame({
+      "Number of differences (exact match)": [0, 0]
+    })
+    expected.index = ["col1, col2"]
+    pd.testing.assert_frame_equal(observed, expected)
+
+  def test_mismatching_files(self):
+    pass
+
+  def test_mix_matching_files(self):
+    pass
+
+  def test_null_file(self):
+    pass
+
diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index 328ca88..7b37be9 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -43,6 +43,8 @@ def __init__(self, options):
     self.columns_to_compare = options.columns_to_compare
     self.columns_to_compare.append("samples")
     self.file_columns = set()  # columns that contain GCP URIs to files
+    self.table1_files_dir = "table1_files"
+    self.table2_files_dir = "table2_files"
 
     self.output_prefix = options.output_prefix
     self.na_values = options.na_values
@@ -182,7 +184,7 @@ def perform_exact_match(self):
       files_df2 = self.table2.set_index("samples")
       files_df1 = files_df1[list(self.file_columns)]
       files_df2 = files_df2[list(self.file_columns)]
-      file_number_of_differences = compare_files(files_df1, files_df2)
+      file_number_of_differences = self.compare_files(files_df1, files_df2)
     else:
       table1 = self.table1
       table2 = self.table2
@@ -224,8 +226,8 @@ def compare_files(self, file_df1, file_df2):
       for col in file_df1.columns:
         uri1 = file_df1.loc[row, col]
         uri2 = file_df2.loc[row, col]
-        file1 = os.path.join("table1_files", uri1.removeprefix("gs://"))
-        file2 = os.path.join("table2_files", uri2.removeprefix("gs://"))
+        file1 = os.path.join(self.table1_files_dir, uri1.removeprefix("gs://"))
+        file2 = os.path.join(self.table2_files_dir, uri2.removeprefix("gs://"))
         if pd.isnull(file1) and pd.isnull(file2):
           # count two nulls as matching
           comparison_df.loc[row, col] = True
@@ -234,7 +236,7 @@ def compare_files(self, file_df1, file_df2):
           comparison_df.loc[row, col] = is_match
           if not is_match:
             output_filename = f"{row}_{col}_diff.txt"
-            create_diff(file1, file2, output_filename)
+            self._create_diff(file1, file2, output_filename)
         else:
           # count as not matching if pair is missing
           comparison_df.loc[row, col] = False
@@ -418,8 +420,8 @@ def compare(self):
     self.logger.info("Determining columns for file comparisons")
     self.determine_file_columns()
 
-    dir1 = "table1_files/"
-    dir2 = "table2_files/"
+    dir1 = f"{self.table1_files_dir}/"
+    dir2 = f"{self.table2_files_dir}/"
     os.mkdir(dir1)
     os.mkdir(dir2)
 

From 22aa10272f9f0cb2378cbe0f88332334a9fa6cbb Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Sat, 9 Dec 2023 21:42:28 -0700
Subject: [PATCH 14/43] Fix wrong DataFrame variable in compare_files()

---
 theiavalidate/Validator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index 7b37be9..ccca839 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -220,7 +220,7 @@ def perform_exact_match(self):
 
   def compare_files(self, file_df1, file_df2):
     comparison_df = pd.DataFrame(index=file_df1.index,
-                                  columns=file_df1.columns)
+                                 columns=file_df1.columns)
 
     for row in file_df1.index:
       for col in file_df1.columns:
@@ -242,7 +242,7 @@ def compare_files(self, file_df1, file_df2):
           comparison_df.loc[row, col] = False
       
       number_of_differences = pd.DataFrame(columns=["Number of differences (exact match)"])
-      for col in number_of_differences.columns:
+      for col in comparison_df.columns:
         count = comparison_df[col].dropna().ne(True).sum()
         number_of_differences.loc[col] = count
 

From 425ff5cfad2ac7ebb7d44a0445f0a56cdaae32d2 Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Sat, 9 Dec 2023 21:58:41 -0700
Subject: [PATCH 15/43] Fix typo _create_diff -> create_diff

---
 theiavalidate/Validator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index ccca839..c7233d1 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -236,7 +236,7 @@ def compare_files(self, file_df1, file_df2):
           comparison_df.loc[row, col] = is_match
           if not is_match:
             output_filename = f"{row}_{col}_diff.txt"
-            self._create_diff(file1, file2, output_filename)
+            self.create_diff(file1, file2, output_filename)
         else:
           # count as not matching if pair is missing
           comparison_df.loc[row, col] = False

From cf461db59e90d4fd7e98950a6207006aa1d9451e Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Sat, 9 Dec 2023 22:29:20 -0700
Subject: [PATCH 16/43] Fix another indentation error

---
 theiavalidate/Validator.py | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index c7233d1..3a5bd05 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -45,6 +45,7 @@ def __init__(self, options):
     self.file_columns = set()  # columns that contain GCP URIs to files
     self.table1_files_dir = "table1_files"
     self.table2_files_dir = "table2_files"
+    self.diff_dir = "file_diffs"
 
     self.output_prefix = options.output_prefix
     self.na_values = options.na_values
@@ -222,12 +223,13 @@ def compare_files(self, file_df1, file_df2):
     comparison_df = pd.DataFrame(index=file_df1.index,
                                  columns=file_df1.columns)
 
-    for row in file_df1.index:
-      for col in file_df1.columns:
+    for col in file_df1.columns:
+      for row in file_df1.index:
         uri1 = file_df1.loc[row, col]
         uri2 = file_df2.loc[row, col]
         file1 = os.path.join(self.table1_files_dir, uri1.removeprefix("gs://"))
         file2 = os.path.join(self.table2_files_dir, uri2.removeprefix("gs://"))
+        print(f"files: {file1}, {file2}")
         if pd.isnull(file1) and pd.isnull(file2):
           # count two nulls as matching
           comparison_df.loc[row, col] = True
@@ -236,19 +238,20 @@ def compare_files(self, file_df1, file_df2):
           comparison_df.loc[row, col] = is_match
           if not is_match:
             output_filename = f"{row}_{col}_diff.txt"
-            self.create_diff(file1, file2, output_filename)
+            output_path = os.path.join(self.diff_dir, output_filename)
+            self.create_diff(file1, file2, output_path)
         else:
           # count as not matching if pair is missing
           comparison_df.loc[row, col] = False
-      
-      number_of_differences = pd.DataFrame(columns=["Number of differences (exact match)"])
-      for col in comparison_df.columns:
-        count = comparison_df[col].dropna().ne(True).sum()
-        number_of_differences.loc[col] = count
 
-      return number_of_differences
+    number_of_differences = pd.DataFrame(columns=["Number of differences (exact match)"])
+    for col in comparison_df.columns:
+      count = comparison_df[col].dropna().ne(True).sum()
+      number_of_differences.loc[col] = count
+
+    return number_of_differences
 
-  def create_diff(self, file1, file2, output_filename):
+  def create_diff(self, file1, file2, output_path):
     # create unified diff
     with open(file1, "r") as f1, open(file2, "r") as f2:
       diff = difflib.unified_diff(
@@ -259,7 +262,9 @@ def create_diff(self, file1, file2, output_filename):
         lineterm='',
       )
       diff = "".join(diff)
-      with open(output_filename, "w") as out:
+
+      os.makedirs(os.path.dirname(output_path), exist_ok=True)
+      with open(output_path, "w") as out:
         out.write(diff)
 
   """

From 494b8b4dc6e29fd1152cbcb00e13b0e469a53085 Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Sat, 9 Dec 2023 22:47:56 -0700
Subject: [PATCH 17/43] rearrange order of execution in compare_files()

---
 theiavalidate/Validator.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index 3a5bd05..7620684 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -227,13 +227,12 @@ def compare_files(self, file_df1, file_df2):
       for row in file_df1.index:
         uri1 = file_df1.loc[row, col]
         uri2 = file_df2.loc[row, col]
-        file1 = os.path.join(self.table1_files_dir, uri1.removeprefix("gs://"))
-        file2 = os.path.join(self.table2_files_dir, uri2.removeprefix("gs://"))
-        print(f"files: {file1}, {file2}")
-        if pd.isnull(file1) and pd.isnull(file2):
+        if pd.isnull(uri1) and pd.isnull(uri2):
           # count two nulls as matching
           comparison_df.loc[row, col] = True
-        elif (not pd.isnull(file1) and not pd.isnull(file2)):
+        elif (not pd.isnull(uri1) and not pd.isnull(uri2)):
+          file1 = os.path.join(self.table1_files_dir, uri1.removeprefix("gs://"))
+          file2 = os.path.join(self.table2_files_dir, uri2.removeprefix("gs://"))
           is_match = filecmp.cmp(file1, file2)
           comparison_df.loc[row, col] = is_match
           if not is_match:

From 710b30aff9b67bc33255a59782651ba7bf49acc5 Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Sat, 9 Dec 2023 22:49:10 -0700
Subject: [PATCH 18/43] Add unit tests for compare_files()

---
 .DS_Store                          | Bin 0 -> 6148 bytes
 file_diffs/0_col1_diff.txt         |   5 +++
 file_diffs/0_col2_diff.txt         |   3 ++
 file_diffs/1_col1_diff.txt         |   3 ++
 file_diffs/1_col2_diff.txt         |   3 ++
 file_diffs/2_col1_diff.txt         |   5 +++
 file_diffs/2_col2_diff.txt         |   3 ++
 tests/table1_files/match1-1.txt    |   3 ++
 tests/table1_files/match1-2.txt    |   3 ++
 tests/table1_files/match1-3.txt    |   3 ++
 tests/table1_files/match2-1.txt    |   2 +
 tests/table1_files/match2-2.txt    |   2 +
 tests/table1_files/match2-3.txt    |   2 +
 tests/table1_files/mismatch1-1.txt |   3 ++
 tests/table1_files/mismatch1-2.txt |   2 +
 tests/table1_files/mismatch1-3.txt |   4 ++
 tests/table1_files/mismatch2-1.txt |   2 +
 tests/table1_files/mismatch2-2.txt |   2 +
 tests/table1_files/mismatch2-3.txt |   2 +
 tests/table2_files/match1-1.txt    |   3 ++
 tests/table2_files/match1-2.txt    |   3 ++
 tests/table2_files/match1-3.txt    |   3 ++
 tests/table2_files/match2-1.txt    |   2 +
 tests/table2_files/match2-2.txt    |   2 +
 tests/table2_files/match2-3.txt    |   2 +
 tests/table2_files/mismatch1-1.txt |   3 ++
 tests/table2_files/mismatch1-2.txt |   3 ++
 tests/table2_files/mismatch1-3.txt |   3 ++
 tests/table2_files/mismatch2-1.txt |   1 +
 tests/table2_files/mismatch2-2.txt |   2 +
 tests/table2_files/mismatch2-3.txt |   1 +
 tests/test_validator.py            |  58 ++++++++++++++++++++++++-----
 32 files changed, 129 insertions(+), 9 deletions(-)
 create mode 100644 .DS_Store
 create mode 100644 file_diffs/0_col1_diff.txt
 create mode 100644 file_diffs/0_col2_diff.txt
 create mode 100644 file_diffs/1_col1_diff.txt
 create mode 100644 file_diffs/1_col2_diff.txt
 create mode 100644 file_diffs/2_col1_diff.txt
 create mode 100644 file_diffs/2_col2_diff.txt
 create mode 100644 tests/table1_files/match1-1.txt
 create mode 100644 tests/table1_files/match1-2.txt
 create mode 100644 tests/table1_files/match1-3.txt
 create mode 100644 tests/table1_files/match2-1.txt
 create mode 100644 tests/table1_files/match2-2.txt
 create mode 100644 tests/table1_files/match2-3.txt
 create mode 100644 tests/table1_files/mismatch1-1.txt
 create mode 100644 tests/table1_files/mismatch1-2.txt
 create mode 100644 tests/table1_files/mismatch1-3.txt
 create mode 100644 tests/table1_files/mismatch2-1.txt
 create mode 100644 tests/table1_files/mismatch2-2.txt
 create mode 100644 tests/table1_files/mismatch2-3.txt
 create mode 100644 tests/table2_files/match1-1.txt
 create mode 100644 tests/table2_files/match1-2.txt
 create mode 100644 tests/table2_files/match1-3.txt
 create mode 100644 tests/table2_files/match2-1.txt
 create mode 100644 tests/table2_files/match2-2.txt
 create mode 100644 tests/table2_files/match2-3.txt
 create mode 100644 tests/table2_files/mismatch1-1.txt
 create mode 100644 tests/table2_files/mismatch1-2.txt
 create mode 100644 tests/table2_files/mismatch1-3.txt
 create mode 100644 tests/table2_files/mismatch2-1.txt
 create mode 100644 tests/table2_files/mismatch2-2.txt
 create mode 100644 tests/table2_files/mismatch2-3.txt

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..51227d71a94e52683c4d00d01ac912a7dfa75f3b
GIT binary patch
literal 6148
zcmeHKI|>3p3{CuiU}I@HSMUad=n3`$7K)81_^Y?_TprDrPoXS!S|~4&yqQeiEc=Sh
zMnrUeS<gf!A~J#-%FRO8?A*L#z04>Ojyq1*=D6P-_S;VFyM%Fva+Zyp5TE&WLZbpy
zfC^9nDnJE3tw46z(fHFB^FAs-1%6%u`#u!7VNGlU{nLTqBLHxKv>VnwO8|={fHkoV
zL<Xio1qM}f#L%E4U$U+ywt+zx&EZ4y$(j?2`qOcL@p91`$VdgKz^elN*iNkfFX3<I
z|5p-sRDcTnl>$0gG>bW2DSK<@<*e5h_zG?{ceokWPC@W?4D@!4jkV*c7e!sMHO^~d
U8|ZZ8oet#BfayY`0^e5P0W<p)>;M1&

literal 0
HcmV?d00001

diff --git a/file_diffs/0_col1_diff.txt b/file_diffs/0_col1_diff.txt
new file mode 100644
index 0000000..a0a1ba9
--- /dev/null
+++ b/file_diffs/0_col1_diff.txt
@@ -0,0 +1,5 @@
+--- tests/table1_files/mismatch1-1.txt+++ tests/table2_files/mismatch1-1.txt@@ -1,3 +1,3 @@-foo
+-bar
++eggs
++spam
+ 
diff --git a/file_diffs/0_col2_diff.txt b/file_diffs/0_col2_diff.txt
new file mode 100644
index 0000000..852e058
--- /dev/null
+++ b/file_diffs/0_col2_diff.txt
@@ -0,0 +1,3 @@
+--- tests/table1_files/mismatch2-3.txt+++ tests/table2_files/mismatch2-3.txt@@ -1,2 +1 @@-hello, world
+-
++hello, world!
diff --git a/file_diffs/1_col1_diff.txt b/file_diffs/1_col1_diff.txt
new file mode 100644
index 0000000..7f6663f
--- /dev/null
+++ b/file_diffs/1_col1_diff.txt
@@ -0,0 +1,3 @@
+--- tests/table1_files/mismatch1-2.txt+++ tests/table2_files/mismatch1-2.txt@@ -1,2 +1,3 @@+foo
+ foo
+ 
diff --git a/file_diffs/1_col2_diff.txt b/file_diffs/1_col2_diff.txt
new file mode 100644
index 0000000..e08f21a
--- /dev/null
+++ b/file_diffs/1_col2_diff.txt
@@ -0,0 +1,3 @@
+--- tests/table1_files/mismatch2-2.txt+++ tests/table2_files/mismatch2-2.txt@@ -1,2 +1,2 @@-5 6 6
++4 5 6 
+ 
diff --git a/file_diffs/2_col1_diff.txt b/file_diffs/2_col1_diff.txt
new file mode 100644
index 0000000..89a7f0c
--- /dev/null
+++ b/file_diffs/2_col1_diff.txt
@@ -0,0 +1,5 @@
+--- tests/table1_files/mismatch1-3.txt+++ tests/table2_files/mismatch1-3.txt@@ -1,4 +1,3 @@+spam
+ 
+-spam
+ eggs
+-
diff --git a/file_diffs/2_col2_diff.txt b/file_diffs/2_col2_diff.txt
new file mode 100644
index 0000000..852e058
--- /dev/null
+++ b/file_diffs/2_col2_diff.txt
@@ -0,0 +1,3 @@
+--- tests/table1_files/mismatch2-3.txt+++ tests/table2_files/mismatch2-3.txt@@ -1,2 +1 @@-hello, world
+-
++hello, world!
diff --git a/tests/table1_files/match1-1.txt b/tests/table1_files/match1-1.txt
new file mode 100644
index 0000000..7bd112e
--- /dev/null
+++ b/tests/table1_files/match1-1.txt
@@ -0,0 +1,3 @@
+foo
+bar
+
diff --git a/tests/table1_files/match1-2.txt b/tests/table1_files/match1-2.txt
new file mode 100644
index 0000000..42f0295
--- /dev/null
+++ b/tests/table1_files/match1-2.txt
@@ -0,0 +1,3 @@
+baz
+eggs
+
diff --git a/tests/table1_files/match1-3.txt b/tests/table1_files/match1-3.txt
new file mode 100644
index 0000000..fe05684
--- /dev/null
+++ b/tests/table1_files/match1-3.txt
@@ -0,0 +1,3 @@
+spam
+monty
+
diff --git a/tests/table1_files/match2-1.txt b/tests/table1_files/match2-1.txt
new file mode 100644
index 0000000..2b70035
--- /dev/null
+++ b/tests/table1_files/match2-1.txt
@@ -0,0 +1,2 @@
+1 2 3
+
diff --git a/tests/table1_files/match2-2.txt b/tests/table1_files/match2-2.txt
new file mode 100644
index 0000000..8db5eef
--- /dev/null
+++ b/tests/table1_files/match2-2.txt
@@ -0,0 +1,2 @@
+4 5 6
+
diff --git a/tests/table1_files/match2-3.txt b/tests/table1_files/match2-3.txt
new file mode 100644
index 0000000..ee64adb
--- /dev/null
+++ b/tests/table1_files/match2-3.txt
@@ -0,0 +1,2 @@
+7 8 9
+
diff --git a/tests/table1_files/mismatch1-1.txt b/tests/table1_files/mismatch1-1.txt
new file mode 100644
index 0000000..7bd112e
--- /dev/null
+++ b/tests/table1_files/mismatch1-1.txt
@@ -0,0 +1,3 @@
+foo
+bar
+
diff --git a/tests/table1_files/mismatch1-2.txt b/tests/table1_files/mismatch1-2.txt
new file mode 100644
index 0000000..75d7bfb
--- /dev/null
+++ b/tests/table1_files/mismatch1-2.txt
@@ -0,0 +1,2 @@
+foo
+
diff --git a/tests/table1_files/mismatch1-3.txt b/tests/table1_files/mismatch1-3.txt
new file mode 100644
index 0000000..d86174f
--- /dev/null
+++ b/tests/table1_files/mismatch1-3.txt
@@ -0,0 +1,4 @@
+
+spam
+eggs
+
diff --git a/tests/table1_files/mismatch2-1.txt b/tests/table1_files/mismatch2-1.txt
new file mode 100644
index 0000000..2b70035
--- /dev/null
+++ b/tests/table1_files/mismatch2-1.txt
@@ -0,0 +1,2 @@
+1 2 3
+
diff --git a/tests/table1_files/mismatch2-2.txt b/tests/table1_files/mismatch2-2.txt
new file mode 100644
index 0000000..a28f8ae
--- /dev/null
+++ b/tests/table1_files/mismatch2-2.txt
@@ -0,0 +1,2 @@
+5 6 6
+
diff --git a/tests/table1_files/mismatch2-3.txt b/tests/table1_files/mismatch2-3.txt
new file mode 100644
index 0000000..ae0e511
--- /dev/null
+++ b/tests/table1_files/mismatch2-3.txt
@@ -0,0 +1,2 @@
+hello, world
+
diff --git a/tests/table2_files/match1-1.txt b/tests/table2_files/match1-1.txt
new file mode 100644
index 0000000..7bd112e
--- /dev/null
+++ b/tests/table2_files/match1-1.txt
@@ -0,0 +1,3 @@
+foo
+bar
+
diff --git a/tests/table2_files/match1-2.txt b/tests/table2_files/match1-2.txt
new file mode 100644
index 0000000..42f0295
--- /dev/null
+++ b/tests/table2_files/match1-2.txt
@@ -0,0 +1,3 @@
+baz
+eggs
+
diff --git a/tests/table2_files/match1-3.txt b/tests/table2_files/match1-3.txt
new file mode 100644
index 0000000..fe05684
--- /dev/null
+++ b/tests/table2_files/match1-3.txt
@@ -0,0 +1,3 @@
+spam
+monty
+
diff --git a/tests/table2_files/match2-1.txt b/tests/table2_files/match2-1.txt
new file mode 100644
index 0000000..2b70035
--- /dev/null
+++ b/tests/table2_files/match2-1.txt
@@ -0,0 +1,2 @@
+1 2 3
+
diff --git a/tests/table2_files/match2-2.txt b/tests/table2_files/match2-2.txt
new file mode 100644
index 0000000..8db5eef
--- /dev/null
+++ b/tests/table2_files/match2-2.txt
@@ -0,0 +1,2 @@
+4 5 6
+
diff --git a/tests/table2_files/match2-3.txt b/tests/table2_files/match2-3.txt
new file mode 100644
index 0000000..ee64adb
--- /dev/null
+++ b/tests/table2_files/match2-3.txt
@@ -0,0 +1,2 @@
+7 8 9
+
diff --git a/tests/table2_files/mismatch1-1.txt b/tests/table2_files/mismatch1-1.txt
new file mode 100644
index 0000000..34ae2c6
--- /dev/null
+++ b/tests/table2_files/mismatch1-1.txt
@@ -0,0 +1,3 @@
+eggs
+spam
+
diff --git a/tests/table2_files/mismatch1-2.txt b/tests/table2_files/mismatch1-2.txt
new file mode 100644
index 0000000..7cd519a
--- /dev/null
+++ b/tests/table2_files/mismatch1-2.txt
@@ -0,0 +1,3 @@
+foo
+foo
+
diff --git a/tests/table2_files/mismatch1-3.txt b/tests/table2_files/mismatch1-3.txt
new file mode 100644
index 0000000..fbabddf
--- /dev/null
+++ b/tests/table2_files/mismatch1-3.txt
@@ -0,0 +1,3 @@
+spam
+
+eggs
diff --git a/tests/table2_files/mismatch2-1.txt b/tests/table2_files/mismatch2-1.txt
new file mode 100644
index 0000000..8d04f96
--- /dev/null
+++ b/tests/table2_files/mismatch2-1.txt
@@ -0,0 +1 @@
+1 2
diff --git a/tests/table2_files/mismatch2-2.txt b/tests/table2_files/mismatch2-2.txt
new file mode 100644
index 0000000..336a0f9
--- /dev/null
+++ b/tests/table2_files/mismatch2-2.txt
@@ -0,0 +1,2 @@
+4 5 6 
+
diff --git a/tests/table2_files/mismatch2-3.txt b/tests/table2_files/mismatch2-3.txt
new file mode 100644
index 0000000..270c611
--- /dev/null
+++ b/tests/table2_files/mismatch2-3.txt
@@ -0,0 +1 @@
+hello, world!
diff --git a/tests/test_validator.py b/tests/test_validator.py
index 678811a..39b2b61 100644
--- a/tests/test_validator.py
+++ b/tests/test_validator.py
@@ -112,31 +112,71 @@ def test_one_column_not_null(self):
 class TestCompareFiles(unittest.TestCase):
   def setUp(self):
     self.validator = Validator(MockOptions())
-    self.file_comparison_dir = "tests/file1_files"
-    self.file_comparison_dir = "tests/file2_files"
+    self.validator.table1_files_dir = "tests/table1_files"
+    self.validator.table2_files_dir = "tests/table2_files"
+    self.diff_dir = "/dev/null"
 
   def test_matching_files(self):
     df1 = pd.DataFrame({
-      "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-2.txt"],
+      "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-3.txt"],
       "col2": ["gs://match2-1.txt", "gs://match2-2.txt", "gs://match2-3.txt"]
     })
     df2 = pd.DataFrame({
-      "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-2.txt"],
+      "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-3.txt"],
       "col2": ["gs://match2-1.txt", "gs://match2-2.txt", "gs://match2-3.txt"]
     })
     observed = self.validator.compare_files(df1, df2)
     expected = pd.DataFrame({
       "Number of differences (exact match)": [0, 0]
     })
-    expected.index = ["col1, col2"]
+    expected.index = ["col1", "col2"]
     pd.testing.assert_frame_equal(observed, expected)
 
   def test_mismatching_files(self):
-    pass
+    df1 = pd.DataFrame({
+      "col1": ["gs://mismatch1-1.txt", "gs://mismatch1-2.txt", "gs://mismatch1-3.txt"],
+      "col2": ["gs://mismatch2-1.txt", "gs://mismatch2-2.txt", "gs://mismatch2-3.txt"]
+    })
+    df2 = pd.DataFrame({
+      "col1": ["gs://mismatch1-1.txt", "gs://mismatch1-2.txt", "gs://mismatch1-3.txt"],
+      "col2": ["gs://mismatch2-1.txt", "gs://mismatch2-2.txt", "gs://mismatch2-3.txt"]
+    })
+    observed = self.validator.compare_files(df1, df2)
+    expected = pd.DataFrame({
+      "Number of differences (exact match)": [3, 3]
+    })
+    expected.index = ["col1", "col2"]
+    pd.testing.assert_frame_equal(observed, expected)
 
   def test_mix_matching_files(self):
-    pass
+    df1 = pd.DataFrame({
+      "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-3.txt"],
+      "col2": ["gs://mismatch2-3.txt", "gs://match2-2.txt", "gs://mismatch2-3.txt"]
+    })
+    df2 = pd.DataFrame({
+      "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-3.txt"],
+      "col2": ["gs://mismatch2-3.txt", "gs://match2-2.txt", "gs://mismatch2-3.txt"]
+    })
+    observed = self.validator.compare_files(df1, df2)
+    expected = pd.DataFrame({
+      "Number of differences (exact match)": [0, 2]
+    })
+    expected.index = ["col1", "col2"]
+    pd.testing.assert_frame_equal(observed, expected) 
 
-  def test_null_file(self):
-    pass
+  def test_null_files(self):
+    df1 = pd.DataFrame({
+      "col1": [np.nan, "gs://match1-2.txt", "gs://match1-3.txt"],
+      "col2": [np.nan, "gs://match2-2.txt", "gs://mismatch2-3.txt"]
+    })
+    df2 = pd.DataFrame({
+      "col1": [np.nan, "gs://match1-2.txt", "gs://match1-3.txt"],
+      "col2": ["gs://match2-3.txt", np.nan, np.nan]
+    })
+    observed = self.validator.compare_files(df1, df2)
+    expected = pd.DataFrame({
+      "Number of differences (exact match)": [0, 3]
+    })
+    expected.index = ["col1", "col2"]
+    pd.testing.assert_frame_equal(observed, expected) 
 

From e7fb41af00c390bb9e08b696d1793d4be938858e Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Sat, 9 Dec 2023 23:01:40 -0700
Subject: [PATCH 19/43] Append file number of differences to summary_output

---
 theiavalidate/Validator.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index 7620684..f3b111c 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -173,6 +173,7 @@ def determine_file_columns(self):
   def perform_exact_match(self):
     self.logger.debug("Performing an exact match and removing the sample name column")
 
+    file_number_of_differences = None
     if self.file_columns:
       # exclude file_columns for string comparison
       table1 = self.table1.drop(list(self.file_columns), axis=1)
@@ -201,7 +202,8 @@ def perform_exact_match(self):
     # add the number of differences to the summary output table
     self.logger.debug("Adding the number of exact match differences to the summary table")
     self.summary_output = pd.concat([self.summary_output, number_of_differences], join="outer", axis=1)
-    
+    if file_number_of_differences is not None:
+      self.summary_output = pd.concat([self.summary_output, file_number_of_differences], join="outer", axis="1")
 
     # get a table of self-other differences
     # also: temporarily drop the sample name column for comparison and then set it as the index for the output data frame

From 59912a3252ce667cf1d2b501188d9ee34dfad08c Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Sat, 9 Dec 2023 23:19:46 -0700
Subject: [PATCH 20/43] __init__ hacks to get imports to work

---
 __init__.py                    | 2 ++
 tests/__init__.py              | 2 ++
 theiavalidate/__init__.py      | 3 ++-
 theiavalidate/theiavalidate.py | 6 +++---
 4 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/__init__.py b/__init__.py
index e69de29..c0986f5 100644
--- a/__init__.py
+++ b/__init__.py
@@ -0,0 +1,2 @@
+__VERSION__ = "v0.0.1"
+import os, sys; sys.path.append(os.path.dirname(os.path.realpath(__file__)))
\ No newline at end of file
diff --git a/tests/__init__.py b/tests/__init__.py
index e69de29..c0986f5 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -0,0 +1,2 @@
+__VERSION__ = "v0.0.1"
+import os, sys; sys.path.append(os.path.dirname(os.path.realpath(__file__)))
\ No newline at end of file
diff --git a/theiavalidate/__init__.py b/theiavalidate/__init__.py
index 9a65ac3..c0986f5 100644
--- a/theiavalidate/__init__.py
+++ b/theiavalidate/__init__.py
@@ -1 +1,2 @@
-__VERSION__ = "v0.0.1"
\ No newline at end of file
+__VERSION__ = "v0.0.1"
+import os, sys; sys.path.append(os.path.dirname(os.path.realpath(__file__)))
\ No newline at end of file
diff --git a/theiavalidate/theiavalidate.py b/theiavalidate/theiavalidate.py
index f18ff5e..c04ae3b 100644
--- a/theiavalidate/theiavalidate.py
+++ b/theiavalidate/theiavalidate.py
@@ -1,9 +1,9 @@
 #!/usr/bin/env python3
 
 import argparse
-from . import CheckInputs
-from .__init__ import __VERSION__
-from .Validator import Validator
+import CheckInputs
+from __init__ import __VERSION__
+from Validator import Validator
 
 DEFAULT_NA_VALUES = [
   '-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a',

From 0d752d1c9a1b1c1731d5bb409bd383d16b14ff97 Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Sun, 10 Dec 2023 00:51:31 -0700
Subject: [PATCH 21/43] Use list instead of set for indexing

---
 theiavalidate/Validator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index f3b111c..18102c8 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -432,8 +432,8 @@ def compare(self):
     os.mkdir(dir2)
 
     self.logger.info("Localizing files to compare...")
-    self.table1[self.file_columns].apply(localize_files, directory=dir1)
-    self.table2[self.file_columns].apply(localize_files, directory=dir2)
+    self.table1[list(self.file_columns)].apply(localize_files, directory=dir1)
+    self.table2[list(self.file_columns)].apply(localize_files, directory=dir2)
     
     self.logger.info("Performing an exact string match")
     self.perform_exact_match()

From abf7f0044a5474b8c776a51f7b7219e2ddba8a01 Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Sun, 10 Dec 2023 01:13:51 -0700
Subject: [PATCH 22/43] Add exist_ok=True to os.makedirs()

---
 theiavalidate/Validator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index 18102c8..2641a5e 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -456,5 +456,5 @@ def localize_files(row, directory):
       # gsutil path and loop to copy
       remote_path = os.path.dirname(value.removeprefix("gs://"))
       destination_path = os.path.join(directory, remote_path)
-      os.makedirs(destination_path)
+      os.makedirs(destination_path, exist_ok=True)
       subprocess.run(["gsutil", "-m", "cp", value, destination_path])

From 275281eedb5d2cbfe602d87a8b28c7d0921e5eec Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Sun, 10 Dec 2023 01:22:49 -0700
Subject: [PATCH 23/43] fix typo axis="1" -> axis=1

---
 theiavalidate/Validator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index 2641a5e..5d1a73b 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -203,7 +203,7 @@ def perform_exact_match(self):
     self.logger.debug("Adding the number of exact match differences to the summary table")
     self.summary_output = pd.concat([self.summary_output, number_of_differences], join="outer", axis=1)
     if file_number_of_differences is not None:
-      self.summary_output = pd.concat([self.summary_output, file_number_of_differences], join="outer", axis="1")
+      self.summary_output = pd.concat([self.summary_output, file_number_of_differences], join="outer", axis=1)
 
     # get a table of self-other differences
     # also: temporarily drop the sample name column for comparison and then set it as the index for the output data frame

From 169dbfd9906ac5ab6c8783c39ef8631c65895d5b Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Sun, 10 Dec 2023 20:35:07 -0700
Subject: [PATCH 24/43] Fix issues appending file number of differences to
 summary output

---
 theiavalidate/Validator.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index 5d1a73b..778f3df 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -15,6 +15,7 @@ class Validator:
   """
   This class runs the parsing module for theiavalidate
   """
+  NUM_DIFFERENCES_COL = "Number of differences (exact match)"
   def __init__(self, options):
     logging.basicConfig(encoding='utf-8', level=logging.ERROR, stream=sys.stderr)
     self.logger = logging.getLogger(__name__)
@@ -180,7 +181,6 @@ def perform_exact_match(self):
       table2 = self.table2.drop(list(self.file_columns), axis=1)
 
       # handle file comparisons separately from strings
-
       # TODO: set index to samples column in main table earlier?
       files_df1 = self.table1.set_index("samples") 
       files_df2 = self.table2.set_index("samples")
@@ -194,7 +194,7 @@ def perform_exact_match(self):
     # count the number of differences using exact string matches
     # temporarily make NaNs null since NaN != NaN for the pd.DataFrame.eq() function
     # also: remove the samplename row
-    number_of_differences = pd.DataFrame((~table1.fillna("NULL").astype(str).eq(table2.fillna("NULL").astype(str))).sum(), columns = ["Number of differences (exact match)"])
+    number_of_differences = pd.DataFrame((~table1.fillna("NULL").astype(str).eq(table2.fillna("NULL").astype(str))).sum(), columns = [self.NUM_DIFFERENCES_COL])
 
     number_of_differences.drop("samples", axis=0, inplace=True)
 
@@ -203,7 +203,11 @@ def perform_exact_match(self):
     self.logger.debug("Adding the number of exact match differences to the summary table")
     self.summary_output = pd.concat([self.summary_output, number_of_differences], join="outer", axis=1)
     if file_number_of_differences is not None:
-      self.summary_output = pd.concat([self.summary_output, file_number_of_differences], join="outer", axis=1)
+      self.summary_output = self.summary_output.combine_first(file_number_of_differences)
+    self.summary_output[self.NUM_DIFFERENCES_COL] = self.summary_output[self.NUM_DIFFERENCES_COL].astype(int)
+
+    # Ensure number of differences column is the last column
+    self.summary_output[self.NUM_DIFFERENCES_COL] = self.summary_output.pop(self.NUM_DIFFERENCES_COL)
 
     # get a table of self-other differences
     # also: temporarily drop the sample name column for comparison and then set it as the index for the output data frame
@@ -245,7 +249,7 @@ def compare_files(self, file_df1, file_df2):
           # count as not matching if pair is missing
           comparison_df.loc[row, col] = False
 
-    number_of_differences = pd.DataFrame(columns=["Number of differences (exact match)"])
+    number_of_differences = pd.DataFrame(columns=[self.NUM_DIFFERENCES_COL])
     for col in comparison_df.columns:
       count = comparison_df[col].dropna().ne(True).sum()
       number_of_differences.loc[col] = count

From f5e9392b8894390e7db143f2ee98177ea0525405 Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Sun, 10 Dec 2023 21:20:45 -0700
Subject: [PATCH 25/43] Refactor compare_files() and set instead of return

The DataFrames created in compare_files() will be useful
in other parts of the code, particularly for creating
a table similar to the exact_differences_table
and for implementing validation criteria for factors.
Break out calculating the file number of differences into a new
method. Also, set the DataFrames as properties rather than return values
to make them more accessible.
---
 tests/test_validator.py    | 30 +++++++++++++++++-------------
 theiavalidate/Validator.py | 27 ++++++++++++++++-----------
 2 files changed, 33 insertions(+), 24 deletions(-)

diff --git a/tests/test_validator.py b/tests/test_validator.py
index 39b2b61..ed99cc7 100644
--- a/tests/test_validator.py
+++ b/tests/test_validator.py
@@ -109,7 +109,7 @@ def test_one_column_not_null(self):
       self.assertEqual(self.validator.file_columns, {"col1"})
 
 
-class TestCompareFiles(unittest.TestCase):
+class TestFileNumberOfDifferences(unittest.TestCase):
   def setUp(self):
     self.validator = Validator(MockOptions())
     self.validator.table1_files_dir = "tests/table1_files"
@@ -125,12 +125,13 @@ def test_matching_files(self):
       "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-3.txt"],
       "col2": ["gs://match2-1.txt", "gs://match2-2.txt", "gs://match2-3.txt"]
     })
-    observed = self.validator.compare_files(df1, df2)
+    self.validator.compare_files(df1, df2)
+    self.validator.set_file_number_of_differences()
     expected = pd.DataFrame({
-      "Number of differences (exact match)": [0, 0]
+      self.validator.NUM_DIFFERENCES_COL: [0, 0]
     })
     expected.index = ["col1", "col2"]
-    pd.testing.assert_frame_equal(observed, expected)
+    pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected)
 
   def test_mismatching_files(self):
     df1 = pd.DataFrame({
@@ -141,12 +142,13 @@ def test_mismatching_files(self):
       "col1": ["gs://mismatch1-1.txt", "gs://mismatch1-2.txt", "gs://mismatch1-3.txt"],
       "col2": ["gs://mismatch2-1.txt", "gs://mismatch2-2.txt", "gs://mismatch2-3.txt"]
     })
-    observed = self.validator.compare_files(df1, df2)
+    self.validator.compare_files(df1, df2)
+    self.validator.set_file_number_of_differences()
     expected = pd.DataFrame({
-      "Number of differences (exact match)": [3, 3]
+      self.validator.NUM_DIFFERENCES_COL: [3, 3]
     })
     expected.index = ["col1", "col2"]
-    pd.testing.assert_frame_equal(observed, expected)
+    pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected)
 
   def test_mix_matching_files(self):
     df1 = pd.DataFrame({
@@ -157,12 +159,13 @@ def test_mix_matching_files(self):
       "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-3.txt"],
       "col2": ["gs://mismatch2-3.txt", "gs://match2-2.txt", "gs://mismatch2-3.txt"]
     })
-    observed = self.validator.compare_files(df1, df2)
+    self.validator.compare_files(df1, df2)
+    self.validator.set_file_number_of_differences()
     expected = pd.DataFrame({
-      "Number of differences (exact match)": [0, 2]
+      self.validator.NUM_DIFFERENCES_COL: [0, 2]
     })
     expected.index = ["col1", "col2"]
-    pd.testing.assert_frame_equal(observed, expected) 
+    pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected) 
 
   def test_null_files(self):
     df1 = pd.DataFrame({
@@ -173,10 +176,11 @@ def test_null_files(self):
       "col1": [np.nan, "gs://match1-2.txt", "gs://match1-3.txt"],
       "col2": ["gs://match2-3.txt", np.nan, np.nan]
     })
-    observed = self.validator.compare_files(df1, df2)
+    self.validator.compare_files(df1, df2)
+    self.validator.set_file_number_of_differences()
     expected = pd.DataFrame({
-      "Number of differences (exact match)": [0, 3]
+      self.validator.NUM_DIFFERENCES_COL: [0, 3]
     })
     expected.index = ["col1", "col2"]
-    pd.testing.assert_frame_equal(observed, expected) 
+    pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected) 
 
diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index 778f3df..85f8f33 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -43,11 +43,17 @@ def __init__(self, options):
     self.validation_criteria = options.validation_criteria
     self.columns_to_compare = options.columns_to_compare
     self.columns_to_compare.append("samples")
+
     self.file_columns = set()  # columns that contain GCP URIs to files
     self.table1_files_dir = "table1_files"
     self.table2_files_dir = "table2_files"
     self.diff_dir = "file_diffs"
 
+    # DataFrames for holding file comparison results
+    self.file_exact_matches = None
+    self.file_exact_differences_table = None
+    self.file_number_of_differences = None
+
     self.output_prefix = options.output_prefix
     self.na_values = options.na_values
       
@@ -226,8 +232,8 @@ def perform_exact_match(self):
 
 
   def compare_files(self, file_df1, file_df2):
-    comparison_df = pd.DataFrame(index=file_df1.index,
-                                 columns=file_df1.columns)
+    self.file_exact_matches = pd.DataFrame(index=file_df1.index,
+                                           columns=file_df1.columns)
 
     for col in file_df1.columns:
       for row in file_df1.index:
@@ -235,26 +241,25 @@ def compare_files(self, file_df1, file_df2):
         uri2 = file_df2.loc[row, col]
         if pd.isnull(uri1) and pd.isnull(uri2):
           # count two nulls as matching
-          comparison_df.loc[row, col] = True
+          self.file_exact_matches.loc[row, col] = True
         elif (not pd.isnull(uri1) and not pd.isnull(uri2)):
           file1 = os.path.join(self.table1_files_dir, uri1.removeprefix("gs://"))
           file2 = os.path.join(self.table2_files_dir, uri2.removeprefix("gs://"))
           is_match = filecmp.cmp(file1, file2)
-          comparison_df.loc[row, col] = is_match
+          self.file_exact_matches.loc[row, col] = is_match
           if not is_match:
             output_filename = f"{row}_{col}_diff.txt"
             output_path = os.path.join(self.diff_dir, output_filename)
             self.create_diff(file1, file2, output_path)
         else:
           # count as not matching if pair is missing
-          comparison_df.loc[row, col] = False
-
-    number_of_differences = pd.DataFrame(columns=[self.NUM_DIFFERENCES_COL])
-    for col in comparison_df.columns:
-      count = comparison_df[col].dropna().ne(True).sum()
-      number_of_differences.loc[col] = count
+          self.file_exact_matches.loc[row, col] = False
 
-    return number_of_differences
+  def set_file_number_of_differences(self):
+    self.file_number_of_differences = pd.DataFrame(columns=[self.NUM_DIFFERENCES_COL])
+    for col in self.file_exact_matches.columns:
+      count = self.file_exact_matches[col].dropna().ne(True).sum()
+      self.file_number_of_differences.loc[col] = count
 
   def create_diff(self, file1, file2, output_path):
     # create unified diff

From 3b768097836d36c7ab3174c3c7e1c3cf26b35379 Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Sun, 10 Dec 2023 21:34:34 -0700
Subject: [PATCH 26/43] Update perform_exact_match() to use new Validator props

---
 theiavalidate/Validator.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index 85f8f33..9cf26b8 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -192,7 +192,7 @@ def perform_exact_match(self):
       files_df2 = self.table2.set_index("samples")
       files_df1 = files_df1[list(self.file_columns)]
       files_df2 = files_df2[list(self.file_columns)]
-      file_number_of_differences = self.compare_files(files_df1, files_df2)
+      self.compare_files(files_df1, files_df2)
     else:
       table1 = self.table1
       table2 = self.table2
@@ -208,8 +208,10 @@ def perform_exact_match(self):
     # add the number of differences to the summary output table
     self.logger.debug("Adding the number of exact match differences to the summary table")
     self.summary_output = pd.concat([self.summary_output, number_of_differences], join="outer", axis=1)
-    if file_number_of_differences is not None:
-      self.summary_output = self.summary_output.combine_first(file_number_of_differences)
+
+    self.set_file_number_of_differences()
+    if self.file_number_of_differences is not None:
+      self.summary_output = self.summary_output.combine_first(self.file_number_of_differences)
     self.summary_output[self.NUM_DIFFERENCES_COL] = self.summary_output[self.NUM_DIFFERENCES_COL].astype(int)
 
     # Ensure number of differences column is the last column

From 1358a75126a2b9d011a11eecef01572730a6e937 Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Sun, 10 Dec 2023 22:47:40 -0700
Subject: [PATCH 27/43] Add file URIs to exact_differences_table

---
 theiavalidate/Validator.py | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index 9cf26b8..55c47ba 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -51,7 +51,7 @@ def __init__(self, options):
 
     # DataFrames for holding file comparison results
     self.file_exact_matches = None
-    self.file_exact_differences_table = None
+    self.file_exact_differences = None
     self.file_number_of_differences = None
 
     self.output_prefix = options.output_prefix
@@ -180,7 +180,6 @@ def determine_file_columns(self):
   def perform_exact_match(self):
     self.logger.debug("Performing an exact match and removing the sample name column")
 
-    file_number_of_differences = None
     if self.file_columns:
       # exclude file_columns for string comparison
       table1 = self.table1.drop(list(self.file_columns), axis=1)
@@ -214,7 +213,7 @@ def perform_exact_match(self):
       self.summary_output = self.summary_output.combine_first(self.file_number_of_differences)
     self.summary_output[self.NUM_DIFFERENCES_COL] = self.summary_output[self.NUM_DIFFERENCES_COL].astype(int)
 
-    # Ensure number of differences column is the last column
+    # ensure number of differences column is the last column
     self.summary_output[self.NUM_DIFFERENCES_COL] = self.summary_output.pop(self.NUM_DIFFERENCES_COL)
 
     # get a table of self-other differences
@@ -224,6 +223,10 @@ def perform_exact_match(self):
     # rename the self and other with the table names
     self.logger.debug("Renaming the self and other to be the table names")
     exact_differences_table.rename(columns={"self": self.table1_name, "other": self.table2_name}, level=-1, inplace=True)
+
+    # add file exact differences
+    exact_differences_table = pd.concat([exact_differences_table, self.file_exact_differences], axis=1)
+
     # replace matching values (NAs) with blanks
     self.logger.debug("Replacing all NA values with blanks")
     exact_differences_table.replace(np.nan, "", inplace=True)
@@ -236,6 +239,13 @@ def perform_exact_match(self):
   def compare_files(self, file_df1, file_df2):
     self.file_exact_matches = pd.DataFrame(index=file_df1.index,
                                            columns=file_df1.columns)
+    
+    # create similar table to one generated by df1.compare(df2)
+    # for adding to the exact differences TSV
+    self.file_exact_differences = pd.DataFrame(
+      index=file_df1.index,
+      columns=pd.MultiIndex.from_product([file_df1.columns, [self.table1_name, self.table2_name]])
+    )
 
     for col in file_df1.columns:
       for row in file_df1.index:
@@ -249,13 +259,22 @@ def compare_files(self, file_df1, file_df2):
           file2 = os.path.join(self.table2_files_dir, uri2.removeprefix("gs://"))
           is_match = filecmp.cmp(file1, file2)
           self.file_exact_matches.loc[row, col] = is_match
-          if not is_match:
+          if is_match:
+            # don't add URIs to exact differences table if files match
+            self.file_exact_differences.loc[row, (col, self.table1_name)] = np.nan
+            self.file_exact_differences.loc[row, (col, self.table2_name)] = np.nan
+            continue
+          else:
             output_filename = f"{row}_{col}_diff.txt"
             output_path = os.path.join(self.diff_dir, output_filename)
             self.create_diff(file1, file2, output_path)
         else:
           # count as not matching if pair is missing
           self.file_exact_matches.loc[row, col] = False
+        
+        self.file_exact_differences.loc[row, (col, self.table1_name)] = uri1
+        self.file_exact_differences.loc[row, (col, self.table2_name)] = uri2
+
 
   def set_file_number_of_differences(self):
     self.file_number_of_differences = pd.DataFrame(columns=[self.NUM_DIFFERENCES_COL])

From 208bfcd7c48a7f3097da9047da83c5b1aac4289a Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Sun, 10 Dec 2023 23:54:49 -0700
Subject: [PATCH 28/43] Only run set_file_number_of_differences() if have files

---
 theiavalidate/Validator.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index 55c47ba..89f9edb 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -192,6 +192,7 @@ def perform_exact_match(self):
       files_df1 = files_df1[list(self.file_columns)]
       files_df2 = files_df2[list(self.file_columns)]
       self.compare_files(files_df1, files_df2)
+      self.set_file_number_of_differences()
     else:
       table1 = self.table1
       table2 = self.table2
@@ -208,7 +209,6 @@ def perform_exact_match(self):
     self.logger.debug("Adding the number of exact match differences to the summary table")
     self.summary_output = pd.concat([self.summary_output, number_of_differences], join="outer", axis=1)
 
-    self.set_file_number_of_differences()
     if self.file_number_of_differences is not None:
       self.summary_output = self.summary_output.combine_first(self.file_number_of_differences)
     self.summary_output[self.NUM_DIFFERENCES_COL] = self.summary_output[self.NUM_DIFFERENCES_COL].astype(int)
@@ -322,7 +322,6 @@ def validate(self, column):
 
           self.validation_table[(column.name, self.table1_name)] = self.table1[column.name].where(exact_matches)
           self.validation_table[(column.name, self.table2_name)] = self.table2[column.name].where(exact_matches)
-
           number_of_differences = exact_matches.sum()
           return ("EXACT", number_of_differences)
         elif column[0] == "IGNORE": # do not check; there are no failures (0)
@@ -367,7 +366,8 @@ def run_validation_checks(self):
       
       self.logger.debug("Performing the validation checks")
       self.summary_output[["Validation Criteria", "Number of samples failing the validation criteria"]] = pd.DataFrame(self.validation_criteria.apply(lambda x: self.validate(x), result_type="expand")).transpose()
-      
+      print("validation criteria table:")
+      print(self.validation_table)
       # format the validation criteria differences table
       self.logger.debug("Formatting the validation criteria differences table")
       self.validation_table.set_index(self.table1["samples"], inplace=True)

From 5dc584becc14fc53fdc9deaa1113c6d460b39500 Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Mon, 11 Dec 2023 07:16:55 +0000
Subject: [PATCH 29/43] Remove unused .transpose()

---
 theiavalidate/Validator.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index 89f9edb..6bb615a 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -367,12 +367,10 @@ def run_validation_checks(self):
       self.logger.debug("Performing the validation checks")
       self.summary_output[["Validation Criteria", "Number of samples failing the validation criteria"]] = pd.DataFrame(self.validation_criteria.apply(lambda x: self.validate(x), result_type="expand")).transpose()
       print("validation criteria table:")
-      print(self.validation_table)
       # format the validation criteria differences table
       self.logger.debug("Formatting the validation criteria differences table")
       self.validation_table.set_index(self.table1["samples"], inplace=True)
       self.validation_table.rename_axis(None, axis="index", inplace=True)
-      self.validation_table.transpose()
       
       self.validation_table.columns = pd.MultiIndex.from_tuples(self.validation_table.columns, names=["Column", "Table"])
 

From 745011781c2374e7e20d5362f8f9d279834d1780 Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Mon, 11 Dec 2023 14:12:14 -0700
Subject: [PATCH 30/43] Implement EXACT and IGNORE validation for file
 comparisons

---
 theiavalidate/Validator.py | 36 ++++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index 6bb615a..2041c9e 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -53,6 +53,7 @@ def __init__(self, options):
     self.file_exact_matches = None
     self.file_exact_differences = None
     self.file_number_of_differences = None
+    self.file_validations = None
 
     self.output_prefix = options.output_prefix
     self.na_values = options.na_values
@@ -274,7 +275,8 @@ def compare_files(self, file_df1, file_df2):
         
         self.file_exact_differences.loc[row, (col, self.table1_name)] = uri1
         self.file_exact_differences.loc[row, (col, self.table2_name)] = uri2
-
+    
+    self.file_exact_matches = self.file_exact_matches.astype(bool)
 
   def set_file_number_of_differences(self):
     self.file_number_of_differences = pd.DataFrame(columns=[self.NUM_DIFFERENCES_COL])
@@ -315,7 +317,11 @@ def percent_difference(self, value1, value2):
   def validate(self, column):
     if column.name in self.table1.columns:
       # check the data type of the validation criteria; based on its type, we can assume the comparison to perform
-      if pd.api.types.is_string_dtype(column) == True: # if a string
+      if column.name in self.file_columns:
+        # handle file validation separately from strings, floats
+        validation_criterion, number_of_differences = self.validate_files(column)
+        return (validation_criterion, number_of_differences)
+      elif pd.api.types.is_string_dtype(column) == True: # if a string
         if column[0] == "EXACT": # count the number of exact match failures/differences
           self.logger.debug("Performing an exact match on column {} and counting the number of differences".format(column.name))
           exact_matches = ~self.table1[column.name].fillna("NULL").eq(self.table2[column.name].fillna("NULL"))
@@ -357,16 +363,38 @@ def validate(self, column):
     else:
       self.logger.debug("Column {} was not found; indicating np.nan failures".format(column.name))
       return ("COLUMN " + column.name + " NOT FOUND", np.nan)
+
+  def validate_files(self, column):
+    validation_criterion = column.iloc[0]
+    if validation_criterion == "EXACT":
+      # we already know where the exact matches are from compare_files()
+      self.validation_table[(column.name, self.table1_name)] = (self.table1
+        .set_index("samples")[column.name]
+        .where(~self.file_exact_matches[column.name])
+        .reset_index()[column.name]
+      )
+      self.validation_table[(column.name, self.table2_name)] = (self.table2
+        .set_index("samples")[column.name]
+        .where(~self.file_exact_matches[column.name])
+        .reset_index()[column.name]
+      )
+      number_of_differences = self.file_number_of_differences.loc[column.name, self.NUM_DIFFERENCES_COL]
+    elif validation_criterion == "IGNORE":
+      number_of_differences = 0
+    elif validation_criterion == "SET":
+      pass
+    else:
+      raise Exception("Only EXACT, IGNORE, and SET validation criteria implemented for file columns")
+    return (validation_criterion, number_of_differences)
   
   """ 
   This function creates, formats, and runs the validation criteria checks
-  """                                                                
+  """
   def run_validation_checks(self):
       self.validation_table = pd.DataFrame()
       
       self.logger.debug("Performing the validation checks")
       self.summary_output[["Validation Criteria", "Number of samples failing the validation criteria"]] = pd.DataFrame(self.validation_criteria.apply(lambda x: self.validate(x), result_type="expand")).transpose()
-      print("validation criteria table:")
       # format the validation criteria differences table
       self.logger.debug("Formatting the validation criteria differences table")
       self.validation_table.set_index(self.table1["samples"], inplace=True)

From 40eb2849a515a48b4ab4fbf6dd0dd543ba195efd Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Mon, 11 Dec 2023 18:04:01 -0700
Subject: [PATCH 31/43] Implement SET validation for file comparisons

---
 theiavalidate/Validator.py | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index 2041c9e..44645c5 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -382,11 +382,41 @@ def validate_files(self, column):
     elif validation_criterion == "IGNORE":
       number_of_differences = 0
     elif validation_criterion == "SET":
-      pass
+      # for SET, sort lines in files then compare
+      concat_columns = pd.concat([self.table1[column.name], self.table2[column.name]], axis=1)
+      concat_columns = concat_columns.applymap(
+        lambda x: x.removeprefix("gs://") if pd.notnull(x) else x
+      )
+      sorted_file_matches = concat_columns.apply(self.compare_sorted_files, axis=1)
+      self.validation_table[(column.name, self.table1_name)] = (self.table1[column.name]
+        .where(~sorted_file_matches)
+      )
+      self.validation_table[(column.name, self.table2_name)] = (self.table2[column.name]
+        .where(~sorted_file_matches)
+      )
+      number_of_differences = len(sorted_file_matches) - sorted_file_matches.sum()
     else:
       raise Exception("Only EXACT, IGNORE, and SET validation criteria implemented for file columns")
     return (validation_criterion, number_of_differences)
   
+  def compare_sorted_files(self, row):
+    file1 = row.iloc[0]
+    file2 = row.iloc[1]
+    if pd.isnull(file1) and pd.isnull(file2):
+      # count two nulls as matching
+      return True
+    if pd.notnull(file1) and pd.notnull(file2):
+      file1 = os.path.join(self.table1_files_dir, file1)
+      file2 = os.path.join(self.table2_files_dir, file2)
+      with open(file1, "r") as f1, open(file2, "r") as f2:
+          lines1 = f1.readlines()
+          lines2 = f2.readlines()
+      lines1.sort()
+      lines2.sort()
+      return lines1 == lines2
+    # count null + not-null as mismatching
+    return False
+  
   """ 
   This function creates, formats, and runs the validation criteria checks
   """

From eec95fa5b1efdb8f3d333a85011d86dba5685f1b Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Mon, 11 Dec 2023 19:39:46 -0700
Subject: [PATCH 32/43] Add more unit tests for compare_files()

---
 tests/test_validator.py | 191 +++++++++++++++++++++++++++++++++-------
 1 file changed, 160 insertions(+), 31 deletions(-)

diff --git a/tests/test_validator.py b/tests/test_validator.py
index ed99cc7..610ab41 100644
--- a/tests/test_validator.py
+++ b/tests/test_validator.py
@@ -96,6 +96,18 @@ def test_one_column_null(self):
       self.run_determine_file_columns(data1, data2)
       self.assertEqual(self.validator.file_columns, {"col1", "col2"})
 
+    def test_mixed_nulls(self):
+      data1 = {
+        "col1": ["gs://foo", "gs://foo", np.nan],
+        "col2": ["gs://x", "gs://y", np.nan]
+      }
+      data2 = {
+        "col1": ["gs://eggs", np.nan, np.nan],
+        "col2": [np.nan, "gs://b", np.nan]
+      }
+      self.run_determine_file_columns(data1, data2)
+      self.assertEqual(self.validator.file_columns, {"col1", "col2"})
+
     def test_one_column_not_null(self):
       data1 = {
         "col1": ["gs://foo", "gs://bar", "gs://baz"],
@@ -109,14 +121,19 @@ def test_one_column_not_null(self):
       self.assertEqual(self.validator.file_columns, {"col1"})
 
 
-class TestFileNumberOfDifferences(unittest.TestCase):
+class TestCompareFiles(unittest.TestCase):
+  SAMPLES_INDEX = ["sample1", "sample2", "sample3"]
+  COLUMNS_INDEX = ["col1", "col2"]
+
   def setUp(self):
     self.validator = Validator(MockOptions())
+    self.validator.table1_name = "table1"
+    self.validator.table2_name = "table2"
     self.validator.table1_files_dir = "tests/table1_files"
     self.validator.table2_files_dir = "tests/table2_files"
     self.diff_dir = "/dev/null"
 
-  def test_matching_files(self):
+  def create_matching_files_tables(self):
     df1 = pd.DataFrame({
       "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-3.txt"],
       "col2": ["gs://match2-1.txt", "gs://match2-2.txt", "gs://match2-3.txt"]
@@ -125,15 +142,11 @@ def test_matching_files(self):
       "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-3.txt"],
       "col2": ["gs://match2-1.txt", "gs://match2-2.txt", "gs://match2-3.txt"]
     })
-    self.validator.compare_files(df1, df2)
-    self.validator.set_file_number_of_differences()
-    expected = pd.DataFrame({
-      self.validator.NUM_DIFFERENCES_COL: [0, 0]
-    })
-    expected.index = ["col1", "col2"]
-    pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected)
+    for df in [df1, df2]:
+      df.index = self.SAMPLES_INDEX
+    return df1, df2
 
-  def test_mismatching_files(self):
+  def create_mismatching_files_tables(self):
     df1 = pd.DataFrame({
       "col1": ["gs://mismatch1-1.txt", "gs://mismatch1-2.txt", "gs://mismatch1-3.txt"],
       "col2": ["gs://mismatch2-1.txt", "gs://mismatch2-2.txt", "gs://mismatch2-3.txt"]
@@ -142,45 +155,161 @@ def test_mismatching_files(self):
       "col1": ["gs://mismatch1-1.txt", "gs://mismatch1-2.txt", "gs://mismatch1-3.txt"],
       "col2": ["gs://mismatch2-1.txt", "gs://mismatch2-2.txt", "gs://mismatch2-3.txt"]
     })
-    self.validator.compare_files(df1, df2)
-    self.validator.set_file_number_of_differences()
-    expected = pd.DataFrame({
-      self.validator.NUM_DIFFERENCES_COL: [3, 3]
-    })
-    expected.index = ["col1", "col2"]
-    pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected)
-
-  def test_mix_matching_files(self):
+    for df in [df1, df2]:
+      df.index = self.SAMPLES_INDEX
+    return df1, df2
+  
+  def create_mix_matching_files_tables(self):
     df1 = pd.DataFrame({
       "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-3.txt"],
-      "col2": ["gs://mismatch2-3.txt", "gs://match2-2.txt", "gs://mismatch2-3.txt"]
+      "col2": ["gs://mismatch2-1.txt", "gs://match2-2.txt", "gs://mismatch2-3.txt"]
     })
     df2 = pd.DataFrame({
       "col1": ["gs://match1-1.txt", "gs://match1-2.txt", "gs://match1-3.txt"],
-      "col2": ["gs://mismatch2-3.txt", "gs://match2-2.txt", "gs://mismatch2-3.txt"]
+      "col2": ["gs://mismatch2-1.txt", "gs://match2-2.txt", "gs://mismatch2-3.txt"]
     })
-    self.validator.compare_files(df1, df2)
-    self.validator.set_file_number_of_differences()
-    expected = pd.DataFrame({
-      self.validator.NUM_DIFFERENCES_COL: [0, 2]
-    })
-    expected.index = ["col1", "col2"]
-    pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected) 
+    for df in [df1, df2]:
+      df.index = self.SAMPLES_INDEX
+    return df1, df2
 
-  def test_null_files(self):
+  def create_null_files_tables(self):
     df1 = pd.DataFrame({
       "col1": [np.nan, "gs://match1-2.txt", "gs://match1-3.txt"],
       "col2": [np.nan, "gs://match2-2.txt", "gs://mismatch2-3.txt"]
     })
     df2 = pd.DataFrame({
       "col1": [np.nan, "gs://match1-2.txt", "gs://match1-3.txt"],
-      "col2": ["gs://match2-3.txt", np.nan, np.nan]
+      "col2": ["gs://match2-1.txt", np.nan, np.nan]
+    })
+    for df in [df1, df2]:
+      df.index = self.SAMPLES_INDEX
+    return df1, df2
+  
+  def test_matching_files_exact_matches(self):
+    df1, df2 = self.create_matching_files_tables()
+    self.validator.compare_files(df1, df2)
+    expected = pd.DataFrame({
+      "col1": [True, True, True],
+      "col2": [True, True, True]
+    })
+    expected.index = self.SAMPLES_INDEX
+    pd.testing.assert_frame_equal(self.validator.file_exact_matches, expected)
+
+  def test_mismatching_files_exact_matches(self):
+    df1, df2 = self.create_mismatching_files_tables()
+    self.validator.compare_files(df1, df2)
+    expected = pd.DataFrame({
+      "col1": [False, False, False],
+      "col2": [False, False, False]
+    })
+    expected.index = self.SAMPLES_INDEX
+    pd.testing.assert_frame_equal(self.validator.file_exact_matches, expected)
+
+  def test_mix_matching_files_exact_matches(self):
+    df1, df2 = self.create_mix_matching_files_tables()
+    self.validator.compare_files(df1, df2)
+    expected = pd.DataFrame({
+      "col1": [True, True, True],
+      "col2": [False, True, False]
     })
+    expected.index = self.SAMPLES_INDEX
+    pd.testing.assert_frame_equal(self.validator.file_exact_matches, expected)
+
+  def test_null_files_exact_matches(self):
+    df1, df2 = self.create_null_files_tables()
+    self.validator.compare_files(df1, df2)
+    expected = pd.DataFrame({
+      "col1": [True, True, True],
+      "col2": [False, False, False]
+    })
+    expected.index = self.SAMPLES_INDEX
+    pd.testing.assert_frame_equal(self.validator.file_exact_matches, expected)
+
+  def test_null_files_number_of_differences(self):
+    df1, df2 = self.create_null_files_tables()
     self.validator.compare_files(df1, df2)
     self.validator.set_file_number_of_differences()
     expected = pd.DataFrame({
       self.validator.NUM_DIFFERENCES_COL: [0, 3]
     })
-    expected.index = ["col1", "col2"]
+    expected.index = self.COLUMNS_INDEX
+    pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected)
+
+  def test_mismatching_files_number_of_differences(self):
+    df1, df2 = self.create_mismatching_files_tables()
+    self.validator.compare_files(df1, df2)
+    self.validator.set_file_number_of_differences()
+    expected = pd.DataFrame({
+      self.validator.NUM_DIFFERENCES_COL: [3, 3]
+    })
+    expected.index = self.COLUMNS_INDEX
+    pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected)
+
+  def test_mix_matching_files_number_of_differences(self):
+    df1, df2 = self.create_mix_matching_files_tables()
+    self.validator.compare_files(df1, df2)
+    self.validator.set_file_number_of_differences()
+    expected = pd.DataFrame({
+      self.validator.NUM_DIFFERENCES_COL: [0, 2]
+    })
+    expected.index = self.COLUMNS_INDEX
     pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected) 
 
+  def test_matching_files_number_of_differences(self):
+    df1, df2 = self.create_matching_files_tables()
+    self.validator.compare_files(df1, df2)
+    self.validator.set_file_number_of_differences()
+    expected = pd.DataFrame({
+      self.validator.NUM_DIFFERENCES_COL: [0, 0]  # identical files: no differences in either column
+    })
+    expected.index = self.COLUMNS_INDEX
+    pd.testing.assert_frame_equal(self.validator.file_number_of_differences, expected)
+
+  def test_matching_files_exact_differences(self):
+    df1, df2 = self.create_matching_files_tables()
+    self.validator.compare_files(df1, df2)
+    expected = pd.DataFrame({
+      ("col1", "table1"): [np.nan, np.nan, np.nan],
+      ("col1", "table2"): [np.nan, np.nan, np.nan],
+      ("col2", "table1"): [np.nan, np.nan, np.nan],
+      ("col2", "table2"): [np.nan, np.nan, np.nan]
+    }).astype(object)
+    expected.index = self.SAMPLES_INDEX
+    pd.testing.assert_frame_equal(self.validator.file_exact_differences, expected)
+
+  def test_mismatching_files_exact_differences(self):
+    df1, df2 = self.create_mismatching_files_tables()
+    self.validator.compare_files(df1, df2)
+    expected = pd.DataFrame({
+      ("col1", "table1"): ["gs://mismatch1-1.txt", "gs://mismatch1-2.txt", "gs://mismatch1-3.txt"],
+      ("col1", "table2"): ["gs://mismatch1-1.txt", "gs://mismatch1-2.txt", "gs://mismatch1-3.txt"],
+      ("col2", "table1"): ["gs://mismatch2-1.txt", "gs://mismatch2-2.txt", "gs://mismatch2-3.txt"],
+      ("col2", "table2"): ["gs://mismatch2-1.txt", "gs://mismatch2-2.txt", "gs://mismatch2-3.txt"]
+    }).astype(object)
+    expected.index = self.SAMPLES_INDEX
+    pd.testing.assert_frame_equal(self.validator.file_exact_differences, expected)
+
+  def test_mix_matching_files_exact_differences(self):
+    df1, df2 = self.create_mix_matching_files_tables()
+    self.validator.compare_files(df1, df2)
+    expected = pd.DataFrame({
+      ("col1", "table1"): [np.nan, np.nan, np.nan],
+      ("col1", "table2"): [np.nan, np.nan, np.nan],
+      ("col2", "table1"): ["gs://mismatch2-1.txt", np.nan, "gs://mismatch2-3.txt"],
+      ("col2", "table2"): ["gs://mismatch2-1.txt", np.nan, "gs://mismatch2-3.txt"]
+    }).astype(object)
+    expected.index = self.SAMPLES_INDEX
+    pd.testing.assert_frame_equal(self.validator.file_exact_differences, expected)
+
+  def test_null_files_exact_differences(self):
+    df1, df2 = self.create_mix_matching_files_tables()
+    self.validator.compare_files(df2, df2)
+    expected = pd.DataFrame({
+      ("col1", "table1"): [np.nan, np.nan, np.nan],
+      ("col1", "table2"): [np.nan, np.nan, np.nan],
+      ("col2", "table1"): [np.nan, "gs://match2-2.txt", "gs://mismatch2-3.txt"],
+      ("col2", "table2"): ["gs://match2-1.txt", np.nan, np.nan]
+    }).astype(object)
+    expected.index = self.SAMPLES_INDEX
+    pd.testing.assert_frame_equal(self.validator.file_exact_differences, expected)
+

From 5a8209792b89c52ce3b59b2340b91fbf6c14bb33 Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Mon, 11 Dec 2023 21:04:11 -0700
Subject: [PATCH 33/43] Add unit tests for validate_files()

---
 tests/test_validator.py | 109 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 107 insertions(+), 2 deletions(-)

diff --git a/tests/test_validator.py b/tests/test_validator.py
index 610ab41..ba51957 100644
--- a/tests/test_validator.py
+++ b/tests/test_validator.py
@@ -302,8 +302,8 @@ def test_mix_matching_files_exact_differences(self):
     pd.testing.assert_frame_equal(self.validator.file_exact_differences, expected)
 
   def test_null_files_exact_differences(self):
-    df1, df2 = self.create_mix_matching_files_tables()
-    self.validator.compare_files(df2, df2)
+    df1, df2 = self.create_null_files_tables()
+    self.validator.compare_files(df1, df2)
     expected = pd.DataFrame({
       ("col1", "table1"): [np.nan, np.nan, np.nan],
       ("col1", "table2"): [np.nan, np.nan, np.nan],
@@ -313,3 +313,108 @@ def test_null_files_exact_differences(self):
     expected.index = self.SAMPLES_INDEX
     pd.testing.assert_frame_equal(self.validator.file_exact_differences, expected)
 
+class TestValidateFiles(unittest.TestCase):
+  SAMPLES_INDEX = ["sample1", "sample2", "sample3", "sample4", "sample5"]
+  COLUMNS_INDEX = ["exact_col", "set_col", "ignore_col", "float_col"]
+  TABLE1_FILE_URIS = ["gs://match1-1.txt", "gs://mismatch1-2.txt", "gs://match1-3", "gs://sortmatch1-1.txt", np.nan]
+  TABLE2_FILE_URIS = ["gs://match1-1.txt", "gs://mismatch1-2.txt", np.nan, "gs://sortmatch1-1.txt", np.nan]
+  EXACT_MATCHES_MASK = [True, False, False, False, True]
+
+  def setUp(self):
+    self.validator = Validator(MockOptions())
+    self.validator.validation_criteria = pd.DataFrame({
+      "exact_col": "EXACT",
+      "set_col": "SET",
+      "ignore_col": "IGNORE",
+      "float_col": 0.1,
+    }, index=["column", "criteria"]
+    )
+
+    # This numeric conversion is done in the Validator init method
+    self.validator.validation_criteria = (self.validator.validation_criteria
+      .apply(pd.to_numeric, errors="ignore").convert_dtypes()
+    )
+
+    self.validator.table1 = pd.DataFrame({
+      "samples": self.SAMPLES_INDEX,
+      "exact_col": self.TABLE1_FILE_URIS,
+      "set_col": self.TABLE1_FILE_URIS,
+      "ignore_col": self.TABLE1_FILE_URIS,
+      "float_col": self.TABLE1_FILE_URIS  # uh-oh
+    })
+    
+    self.validator.table2 = pd.DataFrame({
+      "samples": self.SAMPLES_INDEX,
+      "exact_col": self.TABLE2_FILE_URIS,
+      "set_col": self.TABLE2_FILE_URIS,
+      "ignore_col": self.TABLE2_FILE_URIS,
+      "float_col": self.TABLE2_FILE_URIS  # uh-oh
+    })
+
+    self.validator.file_exact_matches = pd.DataFrame({
+      "exact_col": self.EXACT_MATCHES_MASK,
+      "set_col": self.EXACT_MATCHES_MASK,
+      "ignore_col": self.EXACT_MATCHES_MASK,
+      "float_col": self.EXACT_MATCHES_MASK
+    })
+    self.validator.file_exact_matches.index = self.SAMPLES_INDEX
+
+    self.validator.file_number_of_differences = pd.DataFrame({
+      self.validator.NUM_DIFFERENCES_COL: [3, 3, 3, 3]
+    })
+    self.validator.file_number_of_differences.index = self.COLUMNS_INDEX
+
+    self.validator.table1_name = "table1"
+    self.validator.table2_name = "table2"
+    self.validator.table1_files_dir = "tests/table1_files"
+    self.validator.table2_files_dir = "tests/table2_files"
+
+    self.validator.validation_table = pd.DataFrame()
+
+  def test_validate_exact(self):
+    column = self.validator.validation_criteria["exact_col"]
+    observed = self.validator.validate_files(column)
+    expected = ("EXACT", 3)
+    self.assertEqual(observed, expected)
+
+  def test_validate_ignore(self):
+    column = self.validator.validation_criteria["ignore_col"]
+    observed = self.validator.validate_files(column)
+    expected = ("IGNORE", 0)
+    self.assertEqual(observed, expected)
+
+  def test_validate_set(self):
+    column = self.validator.validation_criteria["set_col"]
+    observed = self.validator.validate_files(column)
+    expected = ("SET", 2)  # sorted file should not count as different
+    self.assertEqual(observed, expected)
+
+  def test_validate_float(self):
+    # numeric (% difference) criteria are not implemented for file columns
+    column = self.validator.validation_criteria["float_col"]
+    self.assertRaises(Exception, self.validator.validate_files, column)
+
+  def test_validation_table(self):
+    for column in ["exact_col", "set_col", "ignore_col"]:
+      column = self.validator.validation_criteria[column]
+      self.validator.validate_files(column)
+    
+    # these steps are done in run_validation_checks
+    self.validator.validation_table.set_index(self.validator.table1["samples"], inplace=True)
+    self.validator.validation_table.rename_axis(None, axis="index", inplace=True)
+    self.validator.validation_table.columns = pd.MultiIndex.from_tuples(
+      self.validator.validation_table.columns, names=["Column", "Table"]
+    )
+
+    # exact_col should count sortmatch file as a mismatch, while set_col should
+    # count it as a match. No column should be generated for ignore_col
+    expected = pd.DataFrame({
+      ("exact_col", "table1"): [np.nan, "gs://mismatch1-2.txt", "gs://match1-3", "gs://sortmatch1-1.txt", np.nan],
+      ("exact_col", "table2"): [np.nan, "gs://mismatch1-2.txt", np.nan, "gs://sortmatch1-1.txt", np.nan],
+      ("set_col", "table1"): [np.nan, "gs://mismatch1-2.txt", "gs://match1-3", np.nan, np.nan],
+      ("set_col", "table2"): [np.nan, "gs://mismatch1-2.txt", np.nan, np.nan, np.nan],
+    })
+    expected.set_index(self.validator.table1["samples"], inplace=True)
+    expected.rename_axis(None, axis="index", inplace=True)
+    expected.columns = pd.MultiIndex.from_tuples(expected.columns, names=["Column", "Table"]
+    pd.testing.assert_frame_equal(self.validator.validation_table, expected)

From 2d31b845dc815ddf07a2844814043eb23e55ce7e Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Mon, 11 Dec 2023 21:17:51 -0700
Subject: [PATCH 34/43] Reformat and add more documentation to testing file

---
 tests/test_validator.py | 197 ++++++++++++++++++++++------------------
 1 file changed, 108 insertions(+), 89 deletions(-)

diff --git a/tests/test_validator.py b/tests/test_validator.py
index ba51957..baee834 100644
--- a/tests/test_validator.py
+++ b/tests/test_validator.py
@@ -32,96 +32,105 @@ def __init__(self, options_dict=None):
 
 
 class TestDetermineFileColumns(unittest.TestCase):
-    def setUp(self):
-      self.validator = Validator(MockOptions())
-
-    def run_determine_file_columns(self, data1, data2):
-      self.validator.table1 = pd.DataFrame(data1)
-      self.validator.table2 = pd.DataFrame(data2)
-      self.validator.determine_file_columns()
-
-    def test_no_file_columns(self):
-      data = {
-        "col1": [1, 2, 3],
-        "col2": ["foo", "bar", "baz"]
-      }
-      self.run_determine_file_columns(data, data)
-      self.assertEqual(len(self.validator.file_columns), 0)
-
-    def test_some_file_columns(self):
-      data1 = {
-        "col1": [1, 2, 3],
-        "col2": ["gs://foo", "gs://bar", "gs://baz"]
-      }
-      data2 = {
-        "col1": [1, 2, 3],
-        "col2": ["gs://eggs", "gs://spam", "gs://monty"]
-      }
-      self.run_determine_file_columns(data1, data2)
-      self.assertEqual(self.validator.file_columns, {"col2"})
-
-    def test_missing_uri(self):
-      data1 = {
-        "col1": [1, 2, 3],
-        "col2": ["gs://foo", np.nan, "gs://baz"]
-      }
-      data2 = {
-        "col1": [1, 2, 3],
-        "col2": ["gs://eggs", "gs://spam", "gs://monty"]
-      }
-      self.run_determine_file_columns(data1, data2)
-      self.assertEqual(self.validator.file_columns, {"col2"})
-
-    def test_both_columns_null(self):
-      data1 = {
-        "col1": ["gs://foo", "gs://bar", "gs://baz"],
-        "col2": [np.nan, np.nan, np.nan]
-      }
-      data2 = {
-        "col1": ["gs://eggs", "gs://spam", "gs://monty"],
-        "col2": [np.nan, np.nan, np.nan]
-      }
-      self.run_determine_file_columns(data1, data2)
-      self.assertEqual(self.validator.file_columns, {"col1"})
-
-    def test_one_column_null(self):
-      data1 = {
-        "col1": ["gs://foo", "gs://bar", "gs://baz"],
-        "col2": ["gs://x", "gs://y", "gs://z"]
-      }
-      data2 = {
-        "col1": ["gs://eggs", "gs://spam", "gs://monty"],
-        "col2": [np.nan, np.nan, np.nan]
-      }
-      self.run_determine_file_columns(data1, data2)
-      self.assertEqual(self.validator.file_columns, {"col1", "col2"})
-
-    def test_mixed_nulls(self):
-      data1 = {
-        "col1": ["gs://foo", "gs://foo", np.nan],
-        "col2": ["gs://x", "gs://y", np.nan]
-      }
-      data2 = {
-        "col1": ["gs://eggs", np.nan, np.nan],
-        "col2": [np.nan, "gs://b", np.nan]
-      }
-      self.run_determine_file_columns(data1, data2)
-      self.assertEqual(self.validator.file_columns, {"col1", "col2"})
-
-    def test_one_column_not_null(self):
-      data1 = {
-        "col1": ["gs://foo", "gs://bar", "gs://baz"],
-        "col2": ["gs://x", "gs://y", "gs://z"]
-      }
-      data2 = {
-        "col1": ["gs://eggs", "gs://spam", "gs://monty"],
-        "col2": [1, 2, 3]
-      }
-      self.run_determine_file_columns(data1, data2)
-      self.assertEqual(self.validator.file_columns, {"col1"})
+  """
+  Test detecting which columns in the tables correspond to files. If there is at
+  least one URI and no other values except np.nan in both tables, we should
+  treat the column as a "file_column".
+  """
+  def setUp(self):
+    self.validator = Validator(MockOptions())
+
+  def run_determine_file_columns(self, data1, data2):
+    self.validator.table1 = pd.DataFrame(data1)
+    self.validator.table2 = pd.DataFrame(data2)
+    self.validator.determine_file_columns()
+
+  def test_no_file_columns(self):
+    data = {
+      "col1": [1, 2, 3],
+      "col2": ["foo", "bar", "baz"]
+    }
+    self.run_determine_file_columns(data, data)
+    self.assertEqual(len(self.validator.file_columns), 0)
+
+  def test_some_file_columns(self):
+    data1 = {
+      "col1": [1, 2, 3],
+      "col2": ["gs://foo", "gs://bar", "gs://baz"]
+    }
+    data2 = {
+      "col1": [1, 2, 3],
+      "col2": ["gs://eggs", "gs://spam", "gs://monty"]
+    }
+    self.run_determine_file_columns(data1, data2)
+    self.assertEqual(self.validator.file_columns, {"col2"})
+
+  def test_missing_uri(self):
+    data1 = {
+      "col1": [1, 2, 3],
+      "col2": ["gs://foo", np.nan, "gs://baz"]
+    }
+    data2 = {
+      "col1": [1, 2, 3],
+      "col2": ["gs://eggs", "gs://spam", "gs://monty"]
+    }
+    self.run_determine_file_columns(data1, data2)
+    self.assertEqual(self.validator.file_columns, {"col2"})
+
+  def test_both_columns_null(self):
+    data1 = {
+      "col1": ["gs://foo", "gs://bar", "gs://baz"],
+      "col2": [np.nan, np.nan, np.nan]
+    }
+    data2 = {
+      "col1": ["gs://eggs", "gs://spam", "gs://monty"],
+      "col2": [np.nan, np.nan, np.nan]
+    }
+    self.run_determine_file_columns(data1, data2)
+    self.assertEqual(self.validator.file_columns, {"col1"})
+
+  def test_one_column_null(self):
+    data1 = {
+      "col1": ["gs://foo", "gs://bar", "gs://baz"],
+      "col2": ["gs://x", "gs://y", "gs://z"]
+    }
+    data2 = {
+      "col1": ["gs://eggs", "gs://spam", "gs://monty"],
+      "col2": [np.nan, np.nan, np.nan]
+    }
+    self.run_determine_file_columns(data1, data2)
+    self.assertEqual(self.validator.file_columns, {"col1", "col2"})
+
+  def test_mixed_nulls(self):
+    data1 = {
+      "col1": ["gs://foo", "gs://foo", np.nan],
+      "col2": ["gs://x", "gs://y", np.nan]
+    }
+    data2 = {
+      "col1": ["gs://eggs", np.nan, np.nan],
+      "col2": [np.nan, "gs://b", np.nan]
+    }
+    self.run_determine_file_columns(data1, data2)
+    self.assertEqual(self.validator.file_columns, {"col1", "col2"})
+
+  def test_one_column_not_null(self):
+    data1 = {
+      "col1": ["gs://foo", "gs://bar", "gs://baz"],
+      "col2": ["gs://x", "gs://y", "gs://z"]
+    }
+    data2 = {
+      "col1": ["gs://eggs", "gs://spam", "gs://monty"],
+      "col2": [1, 2, 3]
+    }
+    self.run_determine_file_columns(data1, data2)
+    self.assertEqual(self.validator.file_columns, {"col1"})
 
 
 class TestCompareFiles(unittest.TestCase):
+  """
+  Test comparing files (exact match). Identical files or two np.nans
+  should count as an exact match, anything else should count as a mismatch.
+  """
   SAMPLES_INDEX = ["sample1", "sample2", "sample3"]
   COLUMNS_INDEX = ["col1", "col2"]
 
@@ -131,7 +140,7 @@ def setUp(self):
     self.validator.table2_name = "table2"
     self.validator.table1_files_dir = "tests/table1_files"
     self.validator.table2_files_dir = "tests/table2_files"
-    self.diff_dir = "/dev/null"
+    self.diff_dir = "/dev/null"  # discard diff files
 
   def create_matching_files_tables(self):
     df1 = pd.DataFrame({
@@ -314,6 +323,12 @@ def test_null_files_exact_differences(self):
     pd.testing.assert_frame_equal(self.validator.file_exact_differences, expected)
 
 class TestValidateFiles(unittest.TestCase):
+  """
+  Test comparing files using the validation criteria. EXACT follows the same
+  logic as compare_files(), SET should treat files as matching if after
+  sorting they are identical, IGNORE should "skip" the files. Other criteria
+  should result in an Exception.
+  """
   SAMPLES_INDEX = ["sample1", "sample2", "sample3", "sample4", "sample5"]
   COLUMNS_INDEX = ["exact_col", "set_col", "ignore_col", "float_col"]
   TABLE1_FILE_URIS = ["gs://match1-1.txt", "gs://mismatch1-2.txt", "gs://match1-3", "gs://sortmatch1-1.txt", np.nan]
@@ -335,6 +350,8 @@ def setUp(self):
       .apply(pd.to_numeric, errors="ignore").convert_dtypes()
     )
 
+    # assign the same URIs to each column, will test that the validation
+    # results vary depending on the validation criterion
     self.validator.table1 = pd.DataFrame({
       "samples": self.SAMPLES_INDEX,
       "exact_col": self.TABLE1_FILE_URIS,
@@ -351,6 +368,7 @@ def setUp(self):
       "float_col": self.TABLE2_FILE_URIS  # uh-oh
     })
 
+    # the exact matches will be identical regardless of validation criteria
     self.validator.file_exact_matches = pd.DataFrame({
       "exact_col": self.EXACT_MATCHES_MASK,
       "set_col": self.EXACT_MATCHES_MASK,
@@ -407,7 +425,8 @@ def test_validation_table(self):
     )
 
     # exact_col should count sortmatch file as a mismatch, while set_col should
-    # count it as a match. No column should be generated for ignore_col
+    # count it as a match.
+    # no column should be generated for ignore_col.
     expected = pd.DataFrame({
       ("exact_col", "table1"): [np.nan, "gs://mismatch1-2.txt", "gs://match1-3", "gs://sortmatch1-1.txt", np.nan],
       ("exact_col", "table2"): [np.nan, "gs://mismatch1-2.txt", np.nan, "gs://sortmatch1-1.txt", np.nan],

From cd3c67f1906daed4022e6fe1793fa04fa91a6b5b Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Mon, 11 Dec 2023 21:32:40 -0700
Subject: [PATCH 35/43] Reformat/add docstrings

---
 theiavalidate/Validator.py | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index 44645c5..61a6ae3 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -151,12 +151,11 @@ def count_populated_cells(self):
     self.summary_output = pd.concat([table1_populated_rows, table2_populated_rows], join="outer", axis=1)
   
 
-  """
-  This function determines columns with GCP URIs for file comparisons so that
-  they are excluded from regular comparisons and instead use filecmp to compare
-  the downloaded files.
-  """
   def determine_file_columns(self):
+    """
+    Determine the columns with GCP URIs so that they are excluded from regular
+    comparisons and instead file comparisons are performed.
+    """
     for df in [self.table1, self.table2]:
       # select columns with at least one GCP URI among nulls
       file_columns = df.columns[(df.apply(lambda x: x.astype(str).str.startswith("gs://")
@@ -238,6 +237,9 @@ def perform_exact_match(self):
 
 
   def compare_files(self, file_df1, file_df2):
+    """
+    Determine which pairs of files referenced in the DataFrames are identical
+    """
     self.file_exact_matches = pd.DataFrame(index=file_df1.index,
                                            columns=file_df1.columns)
     
@@ -275,7 +277,7 @@ def compare_files(self, file_df1, file_df2):
         
         self.file_exact_differences.loc[row, (col, self.table1_name)] = uri1
         self.file_exact_differences.loc[row, (col, self.table2_name)] = uri2
-    
+
     self.file_exact_matches = self.file_exact_matches.astype(bool)
 
   def set_file_number_of_differences(self):
@@ -365,6 +367,11 @@ def validate(self, column):
       return ("COLUMN " + column.name + " NOT FOUND", np.nan)
 
   def validate_files(self, column):
+    """
+    Perform validation of matching file contents based on which of EXACT,
+    IGNORE, or SET is assigned as the column's validation criterion. For SET,
+    sort lines in file before comparing.
+    """
     validation_criterion = column.iloc[0]
     if validation_criterion == "EXACT":
       # we already know where the exact matches are from compare_files()
@@ -400,6 +407,9 @@ def validate_files(self, column):
     return (validation_criterion, number_of_differences)
   
   def compare_sorted_files(self, row):
+    """
+    Compare two files sorted alphabetically by line for a pair of file URIs.
+    """
     file1 = row.iloc[0]
     file2 = row.iloc[1]
     if pd.isnull(file1) and pd.isnull(file2):
@@ -534,6 +544,9 @@ def compare(self):
     self.logger.info("Done!")
 
 def localize_files(row, directory):
+  """
+  Download files to compare from GCP.
+  """
   for value in row:
     if isinstance(value, str) and value.startswith("gs://"):
       # copy files to to compare_files/ directory

From c292900938f3294e73d4259fbcd0c2edcbc82abe Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Mon, 11 Dec 2023 21:37:03 -0700
Subject: [PATCH 36/43] Remove unnecessary argument to MockOptions

---
 tests/test_validator.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/tests/test_validator.py b/tests/test_validator.py
index baee834..69e2fd4 100644
--- a/tests/test_validator.py
+++ b/tests/test_validator.py
@@ -1,3 +1,6 @@
+# To run these unit tests, run "python3 -m unittest" from the root of the
+# project directory.
+
 from theiavalidate.Validator import Validator
 from theiavalidate.theiavalidate import DEFAULT_NA_VALUES
 
@@ -12,8 +15,7 @@ class MockOptions:
   the argparse package, but here we will simulate this object with a
   different class to more easily create Validator objects.
   """
-  def __init__(self, options_dict=None):
-    # defaults
+  def __init__(self):
     self.table1 = None
     self.table2 = None
     self.version = None
@@ -25,10 +27,6 @@ def __init__(self, options_dict=None):
     self.verbose = False
     self.debug = False
 
-    # overwrite defaults with options_dict
-    if options_dict is not None:
-      for key, value in options_dict.items():
-        setattr(self, key, value)
 
 
 class TestDetermineFileColumns(unittest.TestCase):
@@ -435,5 +433,5 @@ def test_validation_table(self):
     })
     expected.set_index(self.validator.table1["samples"], inplace=True)
     expected.rename_axis(None, axis="index", inplace=True)
-    expected.columns = pd.MultiIndex.from_tuples(expected.columns, names=["Column", "Table"]
+    expected.columns = pd.MultiIndex.from_tuples(expected.columns, names=["Column", "Table"])
     pd.testing.assert_frame_equal(self.validator.validation_table, expected)

From 0354c743ff95f079f990aca6b35e2a9cde4792b0 Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Mon, 11 Dec 2023 21:38:17 -0700
Subject: [PATCH 37/43] Adjust whitespace

---
 tests/test_validator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_validator.py b/tests/test_validator.py
index 69e2fd4..453f79f 100644
--- a/tests/test_validator.py
+++ b/tests/test_validator.py
@@ -8,6 +8,7 @@
 import pandas as pd
 import unittest
 
+
 class MockOptions:
   """
   Mock the "options" object that is created in theiavalidate.py. In
@@ -28,7 +29,6 @@ def __init__(self):
     self.debug = False
 
 
-
 class TestDetermineFileColumns(unittest.TestCase):
   """
   Test detecting which columns in the tables correspond to files. If there is at

From 3422a73869972b7e1910a59f3d8d7b310c25f99f Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Mon, 11 Dec 2023 21:39:32 -0700
Subject: [PATCH 38/43] Add sortmatch files for testing SET criterion

---
 file_diffs/sample1_col1_diff.txt    | 5 +++++
 file_diffs/sample1_col2_diff.txt    | 3 +++
 file_diffs/sample2_col1_diff.txt    | 3 +++
 file_diffs/sample2_col2_diff.txt    | 3 +++
 file_diffs/sample3_col1_diff.txt    | 5 +++++
 file_diffs/sample3_col2_diff.txt    | 3 +++
 tests/table1_files/sortmatch1-1.txt | 3 +++
 tests/table2_files/sortmatch1-1.txt | 3 +++
 8 files changed, 28 insertions(+)
 create mode 100644 file_diffs/sample1_col1_diff.txt
 create mode 100644 file_diffs/sample1_col2_diff.txt
 create mode 100644 file_diffs/sample2_col1_diff.txt
 create mode 100644 file_diffs/sample2_col2_diff.txt
 create mode 100644 file_diffs/sample3_col1_diff.txt
 create mode 100644 file_diffs/sample3_col2_diff.txt
 create mode 100644 tests/table1_files/sortmatch1-1.txt
 create mode 100644 tests/table2_files/sortmatch1-1.txt

diff --git a/file_diffs/sample1_col1_diff.txt b/file_diffs/sample1_col1_diff.txt
new file mode 100644
index 0000000..a0a1ba9
--- /dev/null
+++ b/file_diffs/sample1_col1_diff.txt
@@ -0,0 +1,5 @@
+--- tests/table1_files/mismatch1-1.txt+++ tests/table2_files/mismatch1-1.txt@@ -1,3 +1,3 @@-foo
+-bar
++eggs
++spam
+ 
diff --git a/file_diffs/sample1_col2_diff.txt b/file_diffs/sample1_col2_diff.txt
new file mode 100644
index 0000000..4ca418d
--- /dev/null
+++ b/file_diffs/sample1_col2_diff.txt
@@ -0,0 +1,3 @@
+--- tests/table1_files/mismatch2-1.txt+++ tests/table2_files/mismatch2-1.txt@@ -1,2 +1 @@-1 2 3
+-
++1 2
diff --git a/file_diffs/sample2_col1_diff.txt b/file_diffs/sample2_col1_diff.txt
new file mode 100644
index 0000000..7f6663f
--- /dev/null
+++ b/file_diffs/sample2_col1_diff.txt
@@ -0,0 +1,3 @@
+--- tests/table1_files/mismatch1-2.txt+++ tests/table2_files/mismatch1-2.txt@@ -1,2 +1,3 @@+foo
+ foo
+ 
diff --git a/file_diffs/sample2_col2_diff.txt b/file_diffs/sample2_col2_diff.txt
new file mode 100644
index 0000000..e08f21a
--- /dev/null
+++ b/file_diffs/sample2_col2_diff.txt
@@ -0,0 +1,3 @@
+--- tests/table1_files/mismatch2-2.txt+++ tests/table2_files/mismatch2-2.txt@@ -1,2 +1,2 @@-5 6 6
++4 5 6 
+ 
diff --git a/file_diffs/sample3_col1_diff.txt b/file_diffs/sample3_col1_diff.txt
new file mode 100644
index 0000000..89a7f0c
--- /dev/null
+++ b/file_diffs/sample3_col1_diff.txt
@@ -0,0 +1,5 @@
+--- tests/table1_files/mismatch1-3.txt+++ tests/table2_files/mismatch1-3.txt@@ -1,4 +1,3 @@+spam
+ 
+-spam
+ eggs
+-
diff --git a/file_diffs/sample3_col2_diff.txt b/file_diffs/sample3_col2_diff.txt
new file mode 100644
index 0000000..852e058
--- /dev/null
+++ b/file_diffs/sample3_col2_diff.txt
@@ -0,0 +1,3 @@
+--- tests/table1_files/mismatch2-3.txt+++ tests/table2_files/mismatch2-3.txt@@ -1,2 +1 @@-hello, world
+-
++hello, world!
diff --git a/tests/table1_files/sortmatch1-1.txt b/tests/table1_files/sortmatch1-1.txt
new file mode 100644
index 0000000..86e041d
--- /dev/null
+++ b/tests/table1_files/sortmatch1-1.txt
@@ -0,0 +1,3 @@
+foo
+bar
+baz
diff --git a/tests/table2_files/sortmatch1-1.txt b/tests/table2_files/sortmatch1-1.txt
new file mode 100644
index 0000000..4fc6926
--- /dev/null
+++ b/tests/table2_files/sortmatch1-1.txt
@@ -0,0 +1,3 @@
+baz
+foo
+bar

From 5a06a70ccf83fede28686300a1fdc67f1637c0c9 Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Mon, 11 Dec 2023 21:43:42 -0700
Subject: [PATCH 39/43] Remove ignored files

---
 .devcontainer/devcontainer.json  | 26 --------------------------
 .gitignore                       |  4 +++-
 file_diffs/0_col1_diff.txt       |  5 -----
 file_diffs/0_col2_diff.txt       |  3 ---
 file_diffs/1_col1_diff.txt       |  3 ---
 file_diffs/1_col2_diff.txt       |  3 ---
 file_diffs/2_col1_diff.txt       |  5 -----
 file_diffs/2_col2_diff.txt       |  3 ---
 file_diffs/sample1_col1_diff.txt |  5 -----
 file_diffs/sample1_col2_diff.txt |  3 ---
 file_diffs/sample2_col1_diff.txt |  3 ---
 file_diffs/sample2_col2_diff.txt |  3 ---
 file_diffs/sample3_col1_diff.txt |  5 -----
 file_diffs/sample3_col2_diff.txt |  3 ---
 14 files changed, 3 insertions(+), 71 deletions(-)
 delete mode 100644 .devcontainer/devcontainer.json
 delete mode 100644 file_diffs/0_col1_diff.txt
 delete mode 100644 file_diffs/0_col2_diff.txt
 delete mode 100644 file_diffs/1_col1_diff.txt
 delete mode 100644 file_diffs/1_col2_diff.txt
 delete mode 100644 file_diffs/2_col1_diff.txt
 delete mode 100644 file_diffs/2_col2_diff.txt
 delete mode 100644 file_diffs/sample1_col1_diff.txt
 delete mode 100644 file_diffs/sample1_col2_diff.txt
 delete mode 100644 file_diffs/sample2_col1_diff.txt
 delete mode 100644 file_diffs/sample2_col2_diff.txt
 delete mode 100644 file_diffs/sample3_col1_diff.txt
 delete mode 100644 file_diffs/sample3_col2_diff.txt

diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
deleted file mode 100644
index 8d96444..0000000
--- a/.devcontainer/devcontainer.json
+++ /dev/null
@@ -1,26 +0,0 @@
-// For format details, see https://aka.ms/devcontainer.json. For config options, see the
-// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
-{
-	"name": "Existing Dockerfile",
-	"build": {
-		// Sets the run context to one level up instead of the .devcontainer folder.
-		"context": "..",
-		// Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
-		"dockerfile": "../Dockerfile"
-	}
-
-	// Features to add to the dev container. More info: https://containers.dev/features.
-	// "features": {},
-
-	// Use 'forwardPorts' to make a list of ports inside the container available locally.
-	// "forwardPorts": [],
-
-	// Uncomment the next line to run commands after the container is created.
-	// "postCreateCommand": "cat /etc/os-release",
-
-	// Configure tool-specific properties.
-	// "customizations": {},
-
-	// Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
-	// "remoteUser": "devcontainer"
-}
diff --git a/.gitignore b/.gitignore
index 7de3fa6..08a404d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -161,6 +161,8 @@ cython_debug/
 
 # IDE
 .vscode/
+.devcontainer
 
 # testing files
-sandbox/
\ No newline at end of file
+sandbox/
+file_diffs/
\ No newline at end of file
diff --git a/file_diffs/0_col1_diff.txt b/file_diffs/0_col1_diff.txt
deleted file mode 100644
index a0a1ba9..0000000
--- a/file_diffs/0_col1_diff.txt
+++ /dev/null
@@ -1,5 +0,0 @@
---- tests/table1_files/mismatch1-1.txt+++ tests/table2_files/mismatch1-1.txt@@ -1,3 +1,3 @@-foo
--bar
-+eggs
-+spam
- 
diff --git a/file_diffs/0_col2_diff.txt b/file_diffs/0_col2_diff.txt
deleted file mode 100644
index 852e058..0000000
--- a/file_diffs/0_col2_diff.txt
+++ /dev/null
@@ -1,3 +0,0 @@
---- tests/table1_files/mismatch2-3.txt+++ tests/table2_files/mismatch2-3.txt@@ -1,2 +1 @@-hello, world
--
-+hello, world!
diff --git a/file_diffs/1_col1_diff.txt b/file_diffs/1_col1_diff.txt
deleted file mode 100644
index 7f6663f..0000000
--- a/file_diffs/1_col1_diff.txt
+++ /dev/null
@@ -1,3 +0,0 @@
---- tests/table1_files/mismatch1-2.txt+++ tests/table2_files/mismatch1-2.txt@@ -1,2 +1,3 @@+foo
- foo
- 
diff --git a/file_diffs/1_col2_diff.txt b/file_diffs/1_col2_diff.txt
deleted file mode 100644
index e08f21a..0000000
--- a/file_diffs/1_col2_diff.txt
+++ /dev/null
@@ -1,3 +0,0 @@
---- tests/table1_files/mismatch2-2.txt+++ tests/table2_files/mismatch2-2.txt@@ -1,2 +1,2 @@-5 6 6
-+4 5 6 
- 
diff --git a/file_diffs/2_col1_diff.txt b/file_diffs/2_col1_diff.txt
deleted file mode 100644
index 89a7f0c..0000000
--- a/file_diffs/2_col1_diff.txt
+++ /dev/null
@@ -1,5 +0,0 @@
---- tests/table1_files/mismatch1-3.txt+++ tests/table2_files/mismatch1-3.txt@@ -1,4 +1,3 @@+spam
- 
--spam
- eggs
--
diff --git a/file_diffs/2_col2_diff.txt b/file_diffs/2_col2_diff.txt
deleted file mode 100644
index 852e058..0000000
--- a/file_diffs/2_col2_diff.txt
+++ /dev/null
@@ -1,3 +0,0 @@
---- tests/table1_files/mismatch2-3.txt+++ tests/table2_files/mismatch2-3.txt@@ -1,2 +1 @@-hello, world
--
-+hello, world!
diff --git a/file_diffs/sample1_col1_diff.txt b/file_diffs/sample1_col1_diff.txt
deleted file mode 100644
index a0a1ba9..0000000
--- a/file_diffs/sample1_col1_diff.txt
+++ /dev/null
@@ -1,5 +0,0 @@
---- tests/table1_files/mismatch1-1.txt+++ tests/table2_files/mismatch1-1.txt@@ -1,3 +1,3 @@-foo
--bar
-+eggs
-+spam
- 
diff --git a/file_diffs/sample1_col2_diff.txt b/file_diffs/sample1_col2_diff.txt
deleted file mode 100644
index 4ca418d..0000000
--- a/file_diffs/sample1_col2_diff.txt
+++ /dev/null
@@ -1,3 +0,0 @@
---- tests/table1_files/mismatch2-1.txt+++ tests/table2_files/mismatch2-1.txt@@ -1,2 +1 @@-1 2 3
--
-+1 2
diff --git a/file_diffs/sample2_col1_diff.txt b/file_diffs/sample2_col1_diff.txt
deleted file mode 100644
index 7f6663f..0000000
--- a/file_diffs/sample2_col1_diff.txt
+++ /dev/null
@@ -1,3 +0,0 @@
---- tests/table1_files/mismatch1-2.txt+++ tests/table2_files/mismatch1-2.txt@@ -1,2 +1,3 @@+foo
- foo
- 
diff --git a/file_diffs/sample2_col2_diff.txt b/file_diffs/sample2_col2_diff.txt
deleted file mode 100644
index e08f21a..0000000
--- a/file_diffs/sample2_col2_diff.txt
+++ /dev/null
@@ -1,3 +0,0 @@
---- tests/table1_files/mismatch2-2.txt+++ tests/table2_files/mismatch2-2.txt@@ -1,2 +1,2 @@-5 6 6
-+4 5 6 
- 
diff --git a/file_diffs/sample3_col1_diff.txt b/file_diffs/sample3_col1_diff.txt
deleted file mode 100644
index 89a7f0c..0000000
--- a/file_diffs/sample3_col1_diff.txt
+++ /dev/null
@@ -1,5 +0,0 @@
---- tests/table1_files/mismatch1-3.txt+++ tests/table2_files/mismatch1-3.txt@@ -1,4 +1,3 @@+spam
- 
--spam
- eggs
--
diff --git a/file_diffs/sample3_col2_diff.txt b/file_diffs/sample3_col2_diff.txt
deleted file mode 100644
index 852e058..0000000
--- a/file_diffs/sample3_col2_diff.txt
+++ /dev/null
@@ -1,3 +0,0 @@
---- tests/table1_files/mismatch2-3.txt+++ tests/table2_files/mismatch2-3.txt@@ -1,2 +1 @@-hello, world
--
-+hello, world!

From ce7fae175af0fc029cc255bbdf7f37b0e5d9ac7e Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Tue, 12 Dec 2023 21:19:55 -0700
Subject: [PATCH 40/43] Remove outdated comment

---
 theiavalidate/Validator.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index 61a6ae3..8fdd3a7 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -549,8 +549,7 @@ def localize_files(row, directory):
   """
   for value in row:
     if isinstance(value, str) and value.startswith("gs://"):
-      # copy files to to compare_files/ directory
-      # it would be much faster to copy them all at once, but any files with
+      # it would be much faster to copy files all at once, but any files with
       # the same name would be clobbered, so create local directories matching
       # gsutil path and loop to copy
       remote_path = os.path.dirname(value.removeprefix("gs://"))

From 5c914c097afb065f7b6a398ecfc139b95bd1ee72 Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Tue, 12 Dec 2023 21:44:08 -0700
Subject: [PATCH 41/43] Add example files for file comparison

---
 ...mple-validation_criteria_exact_sort_file.tsv |   7 +++++++
 .../file_comparison_column_translation.tsv      |   2 ++
 .../file_comparison_columns_to_compare.txt      |   1 +
 .../file_comparison/file_comparison_table1.tsv  |   6 ++++++
 .../file_comparison/file_comparison_table2.tsv  |   6 ++++++
 .../outputs/diffs/sample02_file_column_diff.txt |   5 +++++
 .../diffs/sample02_sort_file_column_diff.txt    |   5 +++++
 .../outputs/diffs/sample03_file_column_diff.txt |   3 +++
 .../diffs/sample03_sort_file_column_diff.txt    |   4 ++++
 .../file_comparison_exact_differences.tsv       |   8 ++++++++
 .../outputs/file_comparison_summary.pdf         | Bin 0 -> 28661 bytes
 ...ison_validation_criteria_differences (2).tsv |   7 +++++++
 .../outputs/filtered_file_comparison_table1.tsv |   6 ++++++
 .../outputs/filtered_file_comparison_table2.tsv |   6 ++++++
 14 files changed, 66 insertions(+)
 create mode 100644 examples/file_comparison/example-validation_criteria_exact_sort_file.tsv
 create mode 100644 examples/file_comparison/file_comparison_column_translation.tsv
 create mode 100644 examples/file_comparison/file_comparison_columns_to_compare.txt
 create mode 100644 examples/file_comparison/file_comparison_table1.tsv
 create mode 100644 examples/file_comparison/file_comparison_table2.tsv
 create mode 100644 examples/file_comparison/outputs/diffs/sample02_file_column_diff.txt
 create mode 100644 examples/file_comparison/outputs/diffs/sample02_sort_file_column_diff.txt
 create mode 100644 examples/file_comparison/outputs/diffs/sample03_file_column_diff.txt
 create mode 100644 examples/file_comparison/outputs/diffs/sample03_sort_file_column_diff.txt
 create mode 100644 examples/file_comparison/outputs/file_comparison_exact_differences.tsv
 create mode 100644 examples/file_comparison/outputs/file_comparison_summary.pdf
 create mode 100644 examples/file_comparison/outputs/file_comparison_validation_criteria_differences (2).tsv
 create mode 100644 examples/file_comparison/outputs/filtered_file_comparison_table1.tsv
 create mode 100644 examples/file_comparison/outputs/filtered_file_comparison_table2.tsv

diff --git a/examples/file_comparison/example-validation_criteria_exact_sort_file.tsv b/examples/file_comparison/example-validation_criteria_exact_sort_file.tsv
new file mode 100644
index 0000000..1590aa0
--- /dev/null
+++ b/examples/file_comparison/example-validation_criteria_exact_sort_file.tsv
@@ -0,0 +1,7 @@
+column	criteria
+assembly_length	0.01
+gambit_predicted_taxon	EXACT
+amrfinderplus_amr_core_genes	SET
+extra_column	IGNORE
+file_column	EXACT
+sort_file_column	SET
diff --git a/examples/file_comparison/file_comparison_column_translation.tsv b/examples/file_comparison/file_comparison_column_translation.tsv
new file mode 100644
index 0000000..3cf7192
--- /dev/null
+++ b/examples/file_comparison/file_comparison_column_translation.tsv
@@ -0,0 +1,2 @@
+amrfinderplus_amr_genes	amrfinderplus_amr_core_genes
+extra_column2	extra_column
\ No newline at end of file
diff --git a/examples/file_comparison/file_comparison_columns_to_compare.txt b/examples/file_comparison/file_comparison_columns_to_compare.txt
new file mode 100644
index 0000000..d67db40
--- /dev/null
+++ b/examples/file_comparison/file_comparison_columns_to_compare.txt
@@ -0,0 +1 @@
+"assembly_length,gambit_predicted_taxon,amrfinderplus_amr_core_genes,extra_column,file_column,sort_file_column"
\ No newline at end of file
diff --git a/examples/file_comparison/file_comparison_table1.tsv b/examples/file_comparison/file_comparison_table1.tsv
new file mode 100644
index 0000000..1d42049
--- /dev/null
+++ b/examples/file_comparison/file_comparison_table1.tsv
@@ -0,0 +1,6 @@
+entity:table1_with_files_id	amrfinderplus_amr_core_genes	assembly_length	extra_column	file_column	gambit_predicted_taxon	sort_file_column
+sample01	tet(A),aph(6)-Id,aph(3'')-Ib	4783605	extra_value	gs://path/to/table1_files/match1-1.txt	Salmonella enterica	gs://path/to/table1_files/match1-1.txt
+sample02	glpT_E448K,gyrA_D87G,gyrA_S83L,sat2,dfrA1,parC_S80I,blaCTX-M-27	5226301		gs://path/to/table1_files/mismatch1-1.txt	Shigella sonnei	gs://path/to/table1_files/mismatch1-1.txt
+sample03		4719410	extra_value	gs://path/to/table1_files/mismatch2-1.txt	Shigella	gs://path/to/table1_files/sortmatch1-1.txt
+sample04	sul1,aadA7,parC_S87L,gyrA_T83I	6674526		gs://path/to/table1_files/mismatch2-1.txt	Pseudomonas aeruginosa	gs://path/to/table1_files/mismatch1-1.txt
+sample05	parC_S80Y,tet(38),mecR1,murA_G257D,fosB,gyrA_S84L,mecA	2773544			Staphylococcus aureus	
diff --git a/examples/file_comparison/file_comparison_table2.tsv b/examples/file_comparison/file_comparison_table2.tsv
new file mode 100644
index 0000000..0e39e38
--- /dev/null
+++ b/examples/file_comparison/file_comparison_table2.tsv
@@ -0,0 +1,6 @@
+entity:table2_with_files_id	amrfinderplus_amr_genes	assembly_length	extra_column2	file_column	gambit_predicted_taxon	sort_file_column
+sample01	aph(3'')-Ib,aph(6)-Id,tet(A)	4783610	extra_value	gs://path/to/table2_files/match1-1.txt	Salmonella enterica	gs://path/to/table2_files/match1-1.txt
+sample02	glpT_E448K,gyrA_D87G,gyrA_S83L,parC_S80I,blaCTX-M-27,sat2,dfrA1	5274928		gs://path/to/table2_files/mismatch1-1.txt	Shigella sonnei	gs://path/to/table2_files/mismatch1-1.txt
+sample03	glpT_E448K,gyrA_D87G,gyrA_S83L,sat2	5287603		gs://path/to/table2_files/mismatch2-1.txt	Shigella sonnei	gs://path/to/table2_files/sortmatch1-1.txt
+sample04	parC_S87L,gyrA_T83I,sul1,aadA7	6674503	extra_value		Pseudomonas aeruginosa	
+sample05	parC_S80Y,tet(38),fosB,gyrA_S84L,mecA,mecR1,murA_G257D	2771914			Staphylococcus aureus	
diff --git a/examples/file_comparison/outputs/diffs/sample02_file_column_diff.txt b/examples/file_comparison/outputs/diffs/sample02_file_column_diff.txt
new file mode 100644
index 0000000..c6aa9ad
--- /dev/null
+++ b/examples/file_comparison/outputs/diffs/sample02_file_column_diff.txt
@@ -0,0 +1,5 @@
+--- table1_files/path/to/table1_files/mismatch1-1.txt+++ table2_files/path/to/table2_files/mismatch1-1.txt@@ -1,3 +1,3 @@-foo
+-bar
++eggs
++spam
+ 
diff --git a/examples/file_comparison/outputs/diffs/sample02_sort_file_column_diff.txt b/examples/file_comparison/outputs/diffs/sample02_sort_file_column_diff.txt
new file mode 100644
index 0000000..c6aa9ad
--- /dev/null
+++ b/examples/file_comparison/outputs/diffs/sample02_sort_file_column_diff.txt
@@ -0,0 +1,5 @@
+--- table1_files/path/to/table1_files/mismatch1-1.txt+++ table2_files/path/to/table2_files/mismatch1-1.txt@@ -1,3 +1,3 @@-foo
+-bar
++eggs
++spam
+ 
diff --git a/examples/file_comparison/outputs/diffs/sample03_file_column_diff.txt b/examples/file_comparison/outputs/diffs/sample03_file_column_diff.txt
new file mode 100644
index 0000000..aebe16f
--- /dev/null
+++ b/examples/file_comparison/outputs/diffs/sample03_file_column_diff.txt
@@ -0,0 +1,3 @@
+--- table1_files/path/to/table1_files/mismatch2-1.txt+++ table2_files/path/to/table2_files/mismatch2-1.txt@@ -1,2 +1 @@-1 2 3
+-
++1 2
diff --git a/examples/file_comparison/outputs/diffs/sample03_sort_file_column_diff.txt b/examples/file_comparison/outputs/diffs/sample03_sort_file_column_diff.txt
new file mode 100644
index 0000000..fad4e1e
--- /dev/null
+++ b/examples/file_comparison/outputs/diffs/sample03_sort_file_column_diff.txt
@@ -0,0 +1,4 @@
+--- table1_files/path/to/table1_files/sortmatch1-1.txt+++ table2_files/path/to/table2_files/sortmatch1-1.txt@@ -1,3 +1,3 @@+baz
+ foo
+ bar
+-baz
diff --git a/examples/file_comparison/outputs/file_comparison_exact_differences.tsv b/examples/file_comparison/outputs/file_comparison_exact_differences.tsv
new file mode 100644
index 0000000..9e07948
--- /dev/null
+++ b/examples/file_comparison/outputs/file_comparison_exact_differences.tsv
@@ -0,0 +1,8 @@
+	amrfinderplus_amr_core_genes	amrfinderplus_amr_core_genes	assembly_length	assembly_length	extra_column	extra_column	gambit_predicted_taxon	gambit_predicted_taxon	sort_file_column	sort_file_column	file_column	file_column
+	table1_with_files.tsv	table2_with_files.tsv	table1_with_files.tsv	table2_with_files.tsv	table1_with_files.tsv	table2_with_files.tsv	table1_with_files.tsv	table2_with_files.tsv	table1_with_files.tsv	table2_with_files.tsv	table1_with_files.tsv	table2_with_files.tsv
+samples												
+sample01	tet(A),aph(6)-Id,aph(3'')-Ib	aph(3'')-Ib,aph(6)-Id,tet(A)	4783605	4783610								
+sample02	glpT_E448K,gyrA_D87G,gyrA_S83L,sat2,dfrA1,parC_S80I,blaCTX-M-27	glpT_E448K,gyrA_D87G,gyrA_S83L,parC_S80I,blaCTX-M-27,sat2,dfrA1	5226301	5274928					gs://path/to/table1_files/mismatch1-1.txt	gs://path/to/table2_files/mismatch1-1.txt	gs://path/to/table1_files/mismatch1-1.txt	gs://path/to/table2_files/mismatch1-1.txt
+sample03		glpT_E448K,gyrA_D87G,gyrA_S83L,sat2	4719410	5287603	extra_value		Shigella	Shigella sonnei	gs://path/to/table1_files/sortmatch1-1.txt	gs://path/to/table2_files/sortmatch1-1.txt	gs://path/to/table1_files/mismatch2-1.txt	gs://path/to/table2_files/mismatch2-1.txt
+sample04	sul1,aadA7,parC_S87L,gyrA_T83I	parC_S87L,gyrA_T83I,sul1,aadA7	6674526	6674503		extra_value			gs://path/to/table1_files/mismatch1-1.txt		gs://path/to/table1_files/mismatch2-1.txt	
+sample05	parC_S80Y,tet(38),mecR1,murA_G257D,fosB,gyrA_S84L,mecA	parC_S80Y,tet(38),fosB,gyrA_S84L,mecA,mecR1,murA_G257D	2773544	2771914								
diff --git a/examples/file_comparison/outputs/file_comparison_summary.pdf b/examples/file_comparison/outputs/file_comparison_summary.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..f36eb86164536b50619171e7611d61b95d8cb1f6
GIT binary patch
literal 28661
zcmeFZWmH_vw(pI*lb`__cWc~(1b27W;O-hcxH|+5?gaPX7Tlcx!3h=!@HUWVKYO3O
z&wJ1PaPOzvV|3T5TC-+Z*BWz;-+wJ~c@a^1W(HOS@}AtDJOpL{6TsH+JpwN;0;95p
zla&d8iW-4Y*uliW$<_ft_3#Mh3T6pr4(0@A4Q2)UwFR>SGX^sS1AsAsF@Q0H0YSg)
zpc?WHw#LpzCNEVKK!yM?wr7K<U*KOZEo^N>44go1h;Rd$fGo^FW*`$Akb{|BlNy1a
zAHl@N_^B_zf9{jJoe6+Z*ucra%GM0jjDeYnBY*{Dp!iP}SpTiUJ2xjWB_~i*2#iXC
z04E3Mrvn)P69W?vfzikSz|6n|8l~V%Dkz}@Fg37pe6}%gv;;89+S-`>-5<6;?zKS8
zJ2{y+*Z>$sOk6CCOcce0bpGlz`@i+!sjp8xb}+GV`qO1bVOtv~P}UK^%=A14Mnw}x
zTW1F&(4d%~hOLOeC}?A2`(*GkNJbeGV+#WzTQ`8#Qw?@zAOq9$2f)Gg*P+foO~td9
zzZO*3*2>mF$<DyY1n{)J^1@1XfTzKLT6sGBdlb)kv%d<&91Pr_Tr&P+7N4t&DH#Bs
zX8L*N|1lLH$4{oBpn*MGiE;oqUYZc)1aSU2;sS8}qoTT;;d>LKXJb&szi0V*z)wBb
z0{mlApJ(Z>(MX%vm^qmPn4kW#pb>~#Sb^38z$j`3T4oUwBU@vWCvT2Up#QElf?LLs
z&Wi04JDTsc-ltt~)n3L}h+uGuG$foV@OKc02J<}=75L%x>a!vDccj#capd=8chXa`
zyD*=PzO`-<ocLT5Yc~CQ*#EK_&G>cneroM&%c;j>U9n_&$(7!QPeTvl$Scj^&6T22
zsk0jQ7I(`>*GfhgO(oQ#$h9TvSS8Pm;g<3AbQk3GtD&iTIegXMmcDnN*1twyT^gMY
zSE?wqqGl|I@LM}sONu2~++9^}XsN0+PY2`MRH|)#Iqvbb@p&k`IZW2#DA{aHvTtfR
ziPkxm;>i4!y>Zdj*n-_hqry~NvYEZcK}0u0YEe@XQ-Wqg6ofZbCaJ>QVzwxL!xKb0
z)mOV67Kj$XL+wUhE3u)@&HQFu*^uf$QWq&v2C2X1wJ<rRu;Wgzeuf!lPqmXFXI619
zQ_Y#-fR`a>k}guJY(yctE>hBX4}laTOuD^2K9n0N?o@qXM?BM4q2`ck3PiD!$E1S(
z3|_jnN{=KOY^f8qq=M@mP!^Rl35YLcTNy>3&O+p&zms@MK`q_Qxf{xzCnY+asv@Oa
zK>7ozAKf`PKH@VAvYavn*$?(LOcO_ZXPUZA#cfgAejtlxJzZ^XqERYtIqz_*`Hn`R
zC~ZrMdps{pIo&t+JRX>G-fvvB+|RilE|~H+-!!H;pOS@90bEVoM1t~&p(s^R6EWJn
zn!H4Xg>OTjx6R2*xcM}>atFVakf-5*Y`4vK!a#oIoCwQ*k9UcjU9{|uj^;6_>+t$k
z4LkQNUF|F&s9QBrWvyYaHTSK%e*M{xY*Y|e|82-OpJ2l$pE|J~c7~m%C-juG#8`z?
z`?{7W)=s3}t0WGQ`f6M0F19%#Bh~xY-rMr;G!1G!q5G;65oFd7Y+74)<cGEF(vi7e
zmY9mtMw)(-7H%7u6A`R)sZV+~3fOipYTn_Dxu23Y*HLJkD0(7!r45d4-1|?f&s2<t
zm!vK@TF<Xmp~Nks7}<}x%|!H|^NMxGn!^fOe?Wh?9-pLn(1ki)bX=<t=9$3?MhRW|
zf%?@_Uj&#^Lhcz29#4DF<e>OGuau2dCvYs5UV8+mDwg!R;zM4F>7fg^RTcskuTf`Y
zs&h!Q)ie$rQ$3toc=0!cV-g}hmse#UJU@Tn_=ZEt-MQFwBy${5Na{S;in#Dmx>73q
z8g{(vD)=(^8ukO%>?Nr-eqAQmcNR-TCWqt9+DV+_s%70*m(4C!qSdyThiT3&qkVYR
ze8UmU8G{ZO@j2(%xs^hu>;@6l#{SGJ{br{jUlZ){md{WO-``@;DcYk0n=EmfgFp31
zi-iG0-I0YWX^^N*g%ErU3<k7&CLURkENq16b*yB5I){9!E!7N8T==MheNH?=pTdSx
z>K2%5iG+i6YmJHYFcYL=UD~qfx^noAu!p7vc@f>nTzf(mb}8qK19A`1!?O|TXrLyD
z$x~tWKvzy2rV0gXv-h6dynni;%&Wy=Z>bDfAR<{P+6vYZHn<F$x3d_-1e=K{v?@7P
z7gr~w{Z|yMfXLCF{S*!f5qa__z#&9-i_`>FEqaHJR&{wxfsUkyoy*Q?AyQMkjjH!H
zum-Iquk2_K^$uk&%~Qu(M(gZwdYN;j+doUbyll4K(f{lWc+vL`|Ng4f%{~Ve%fZw=
zO*DZx+$vSEr9NZj$WG}FZ%^(hvV?C@pYeTDmQY9BTi?ZaQuXvJnY=lAcv5}_Vt5_=
z2n)n3Ici&KdjvDi#wivH6LOWax_2S>V3d>z6rF+02A){#cTm(_twyYS6$Y5&$A_G7
zxDcpYx>^wjx9EJXSs^X6D!%c=Jg!p%z{b89PhV}*jKU$Cd_ImufHkMp*D{RUi!NWY
zuiHX9OmK5Gj!pEtrUYho-O*xI0*c{0aMi6sgVV3Xsjs<bf!~|HR_v5&d@`@<N?o>x
zd-tme^%IqVE2+%9%W$Uia*I+>{2`jKf^V`l@1zrHr@!gvl|HU=TRE$>+0sidHUYbw
zy9!~hW92Uu<R88lh1s(>qP>oe`S=ZzV|YNZ-eb{$S!kmvFM=7KZ#!pq5q+Yp+H@T)
z)i81&<-6wbP#GjcJAQx^T}ISqpJW_wO5CrWQ?bgV=uniEB&eY6NR)-96OoX^<w*7o
z9xR?Qur3zD?Dfw+8C^f;F}J`MufnO$k>sZ>1*%tiY%<`BxK~!M>2IVd-&OdrsQ$*K
z*~;RPdh>V%kpc36{)u(z8?Q9x-K-M}zux3hK8}eSCjp-@n>RNBse0_}4oO+#iGD>=
zNP-VxS%k#y9z`Pau{8#_cY?5FDXb<&CQN3yET?Q!OkUQyDjb2OrzNl0es=a9nkDS=
zxL-Zwu^#hmmTZ5Oqee1C<Fcw<!)j3eayKU!M`e?dD(`{oZ=@!p@%uMTMn1<qEi{m$
z9dV#vZ*qoXY{Fu|^i7M#oEgnXgOu~3g0yPn6mu&(9qLXKbPDRUIov!Y2lKvFS=jiI
zz7CKHd{+x<kJ&kMk$e^N_R~Hj!GI(sZnXBA)4r>x(PpF0=p`${kRu0`98TACj*EKh
zhN4UkzSfmdjG2cix+j4y{wzwN=$Z28FApwf593f8^VSf3hLPvJ5~CR5;^;l3y9NdE
zm1TD5=murzgdMYUdhqt*(g6aMq>1k%IJZg1ptKccnp@}DtB5crjKDjn_Q2^QSsLHB
zLJWk6NfeD!k{GkTu}sJzAXKC84D04<L!U4rUJ97TLQ9DIL|#wu`&BM<+lF}fPFHiG
zI^0U_xxOWV*2M1y+P;2z!i<)kkp^j3R1~vqv<<9H5}!DnhFER4)Ve6~40uJRusF3%
zt^~Y<xS0)3M{Jg`K^sgL8)6ut7Tvdml4DX;2SL(-n*s^D>jo0`eZuY~geEhVmwH$U
zYAV->3hKp%33>zJ_5@MIGUxWQ#HZV}N8)Qz-&d}qw)R8$6nBobYT~OhEfBSD6kBy)
zC%@vYIxB*9&X81?^hU~nN^aYB%6@fDwGUlD9@7Aj*19UNW;LPV?PwN?#E6RLPjeMi
zl%}^<6q(P4uInXDP-PZ`C5tpX^hUx%9$J8P#X@7Z@L;fepEmr3S~Uofv(*uYnhM!K
zTU|ku-ojQB?NkOSZRV^}JL`r~8_SmC5!^OME9xn<{S!|8iOip4ucvqn$ie~qEAnOj
zJNyPPDnG~TPtl-=iKCH&g&pWN;w3tiHLwOn@(OZ-pbxEx$$JA;XC(t0N6<+t12acZ
zWcgwy^c?HbGcy617}%INIGF+Ttn5GrPA(>PARB<0gNucMm5qam1r#r{02x?;%$yvc
zc#%;8bc0}FBxqv>x`SZ?xe|0VdJ4O_fSe3KAdsDn9aQ$zqo+c8APXA<Ckq!RD+{Pp
z*uYNQ#KO${zZfeynOLg=SXmgjSlK`~Dr|qc2Z}mPfB@#dqG$;bB@-tA^GlrjPqFM@
zw+v5V_e<Q$46*_OIYDmzieq0w;{SCV%Y$gAJ>z_REap%FseVI>A8bdikwh>XM{iJC
z-D5^yo?ow~>F#RM?<XMFVp~w}a!r5ImPnVP&%_QB0Uk2jweF|%Ce7X=%@m`s{HzW)
z)CfO58k3EaY#%L|`lv2YBAuFfR%x8?r0rh3CPn~<TdCY-&HFmb^I>=SXOwCnhEQS5
zU08gKeq)8tO7}w$(@&h%xeGGbtt?cOv7oaKFnRAvy04o*@0H+wjaa2<Eh1u)OYZmB
zt-)VAdMe!}F%q@^ptP&(@un*lDL60s{u}3bE$}W~X)+0^DYg#1n;-F-m%d;dKk*8C
zeN}jXUJ1@wM<$!hMraxN*gJ4yJXLL%QGD5+o;a$QTz2<Wa?i%CS{xzQyI`^TWf?^6
zK;H(I<BzdR@I<)OsRkoC;|Ya(sWO34zkkA^3!q?dTs3bVe*l<J6Hg7(43*Fh(QGm}
z$U5r!9upYZ?#~eY)}_|~i(wYhhFuY6YDVD9c|8_f6P&7IqGIw|DuX=u1D;gGr%ziw
z_{%f9W1MT8bEh&}U@I3iesQ~^J_q4lWAAh1nQL>cEVd?tHFj!;%o3866Q)N>H6S>U
zYw?oV$4XPxR5UadZ!LE-HXu3ZTP-#gDi1$-UgLEvHBv7+_7}={Gk!Hn=-Xrr3MeH#
z{v_O|R*|YGN#wJ==G$>ktjQkPp{vKAzUsXDV0O~{b8&y>tvjXsgX8sGKH+?dg+VqK
zq<FH{MJX-)ZL9r$T)F{L$Iql?B&6BoafkYShw)(S4yS(X<omCD9WpkBo>{=vqXVL4
zkMXwx_sf0AGNa?mo)i=U4F{-g`w4gJsCbnf4=%Rs4UHRpTv_cG*FMF|GG^lycM?)2
zDvndL)7D>37Z5qDZ-?_rxV>RcUEW+YA=UwBCc`iJkAGjCI!6(q_;#IjHE9J!(1!$%
zzk!I)gud~GbgPP2>I~SF((b_bIl0`p20J{Axuf`W@CEz_il1R1t7<}1vN8u;K8%jV
zVct1>^Z9a@Fhlu~?a~+u0fZC+p$n-KmleY>?x$v<y`ucOd;}!vY@o{*SSsYY6ZVHd
zLY@|D+5)XFnQXF+2f@D5YPYV(>ifR2rqf@1;+s5g(kWCqQef>=AyoY>JQ9_1-r3qG
zj;Fz=_-nSk8*RF~TC!=njGTI`nw;Z4EycXRA-`S`v$e(ZnV!cyRr-N?Xy2;1y6BSq
zP3p^o#{K4o*s`#b<63&lT%{{i$eN{%hn?0>gcu$CU*#<@WjY2LEy2l2(yj^PC0Kz_
zqcm!xtu+>rQbA6~X^n-pETS&_Li?mV)GLLSXYGfbE?bqlK}2|~)dg!Se02s2^ec}U
z_KR*m7k15cjtaBuM>C~-Z;cj(J=be%7VAhww$Oe>RFR6Sz!0hQ@3Z!=&P7~@aCCep
zyge12TQakCXuIEt?3^F)#o=f)#bFF<6*5z$vmd<y{cQ^<v(P;7aqWHa%b79D!TGsX
z6j&;An*)-^)A*5F>jU^N*PI69qm^!N;`<Kz5N>(z)8i@k2IEDGh?@@h-?&*}m*-{m
zyR_Y`BuBzV+_hf=*|_rMz;$yt+rZs;?7R7#u-e|ucMN@BIqxdj8siCOi5lCm%|<((
zb1`p3ck?9b)N@uzb<b6R;%{Ah#QlvETsj1%Z42J}C6xGc=;fE*@Bt=NK_q@~)uv#$
zGg_H=)_|f!8AwdQlth1Hwiwt#9=v33m!p%t{JL7B$A__~sIhx9qQ&<6PqjosZc`x*
zEj-tJmaPUEUbh;O+1_cZ()d{`uIqcuMn%;>_n0N42?{<qoz5!^#JfgkQ0w@ya%o5>
z#bM=W8wR(Zaz{9w`lrT#mjEE?QenZP!N{?|U<4t{b6;OlsLK?jVlNBmv^2K+oGn(|
ze*JjR(%xWje;aw%;WOvaY=8fg$fwHGo-e4r{_^AgW-!C<8079T=epO~^7<URje5sf
z6QpYeY{-i6J?$5PJ-zy&$MpukN1rs%HYZP&rPSjp%YTD$S-IHI)8SCy+Yec)YJI=u
zo2U(aAf{-MCfjy?y`NCEpj)*d8B-n}^S){hW1w);xi*m}S$#o;iC*^tz5jX~dfG?~
zKR@k<{9IIISi65$dvCKVk#}9qkFf(6UZ=f@PY;*^+F4e5zoPwBUcZ*1#Ma>mdB1S7
z_8P&=#_fWcw!#Jc-N4#R%n|Jnm~|w9WKOn&bUB)p$<&aUil2#2B_1PB#pv;>cjwKm
zrC4;kD97<aHNE3?^2w6P@7<ng{#&NZ_9j|hk?U5@R<_^ns09Mt5=yL>dGU_B4ZjV0
zHs9S{;R>+H9xaKPX(K+6e$F2IB^S0Y&@E2F{$9kG698-+jH>-UcP${k|Ka-aRt+_j
z^|G^a4?F5j;l(NXR!E;^Nc0=7LzD?@6uc9hlUBRab;Jt~kNYm{cKaW@eLwqt7Wv4^
zW#+CZ-~OWhw0pn)1#iNd$uR|BNTeE>^l@65zIvrt{F=m(+tw4<n&xw9v9%Jp;y^+t
zHYK)A)#iB5(SCWJ?E<3~(t)>7r7*KS-|XsgoKsgYsxew%vsK*XNtLO<ell~rF!xQR
zG;6+nNKWtfW=%`ImaRYLQ42?sOdem@i5FLmsGS?4an{Z({N!8e->PA2u(C>#rgHep
zg0lFoxCcJ_Df@(YgbvHB%XY_*{cSxw862UF889uLXS`-^Sv?OlW<j+>XW?gAd5ewL
z1pU~ue7vn^8^qT$J&qY`+@6>#?|I)s>3I3J?>F#p=>GgF5K3Zk5j|j!<C~DMO<^Es
zF$6X_7xVb5sJ2uJ8&KGt%~io6zN{l_c<M98_u<e<PbR8mEFvT5?u(89Y5$5`rzhjL
z<z0Qk&u?`cmhG`4vi%QS2kxSDw(0#=G;Wca@GCr}@IqBFz-}CB*1Q>;zVcD!2Onz6
zy>bXdigg`?yXTFZs+{V`6Wc%84>bzTTdbb{w##|cI@`3^wDb#qx5^U5fjguw9ARX#
zb1(Z~ue|A3kwqI@p)n1gQ6xd{yGQfLOyd%xBsP~Ww^TNl_;GG_jV+AsH>jFh)cHu;
zEd5G@$g=4nr4{#!zj?~eX12fYtjxOMr8fVrEVswCr+);O&u17ocS2XhHj*^`HdkK#
zLHbiiZJ%rkM_Kj95eH>WBaYM~^Xfiw<6u&g$f6j@{;gRz8jbI6qj-8MZ6m+9EWOzF
zb*}u`vP4bkA4}L6NR^U^C}i{_k#NaqBw`3BrJM#9e@Z0K`C0O5KvlxvI732F(t?Rn
z;<c)fV20&~!kXTbA)nSWoyU>2a>DP}S-4U5s08!fpW;G_v%{B2IVzLBEhaLz4(7A)
z!}1cZ+U=%d?=fdN)Do@%Lx)y}uO4g^77F9km*_Er(QgK~g;?i2RP$>Nv_vbHku;T;
zxw!b@<>LfisaN;e-xc-zHN4K0iiV_nqX0qq&Z==-2vkC!a1BN2y+c`2<U+HuRiyZR
z@u*~TNM(%2cx|=DDqZ>Dxp`&6aV#4{F;Fhwonk3dTEtyiLL!XSPL<9O2YOn6arsTs
zq+KmUpn^CN(yIbn2Q3Q7TA2EB|EO2f`h0t>{;b^rW$)q1M-MFV@jfa<3b7KFa~iT(
zmuL~1O-0gL$j74KQJT9Flr=<5chQ<|?7_aNuTt|bMMz3O&kkp=gl+{M61O#HX{f4U
zL+0NvXvz$hC~#GL=5w&t4BknCK(QjKzIU=>4Q1PK1>RO55$4B$`@PQcaQ`i*|2`;D
z&~N{1CcOG^jyCsg^F&%WkBAdo*?k@>mu8fgm&wP?1FuOYw@AvCeMpQ5gnD-NNn8y#
zct-Z(L;GKjvo}{`!x?_BgGmX@0<jC_=e(JU{Gj>6v57S(H>HRhyjaO&igdN(>6<)a
zBo}L%`)gV(=;PD&jS4$FV;G^!v-|FGX$@r%5cFs0iKNtZ0mLXuQzPBQq}X}))ZupB
zs4|D_FyPkbHn%kq;I_BACjQCFxjVh9IJ#`xv<RMLQvAURXwaxK#8?DsR`<OI33VhE
zK2iO=(6RMUkK8wv`}YBcx|D_#9~3Yl5fqlK9WjOBoi6Ha)MxBx^1CQ<)~j>55Hd{A
zKNiP>U-;{eNH!Owz8ZTiX*`^R<z6sojpe*YT@{5>G?7(Ut|18XP5;X-uY;R|!>j?!
zM(9EWET(Js<ooJN%B02Fj6rLP-EZ9IiPz1oB9Nb>r}L@izoGS)Wi^l`AX4?6DovcO
ztC$<IycI#uhBsQ0yRn}fvT6v*73T?lLrEu_Y~UnHcTCp#S>JSEDTfJrhmDCHGap>{
zORgYoK)n%pWp(!(`NrtH5e$bhCJyYov|{9+bg>jwMFB;CFIL&zmi)fbeOTOCBAdpE
zbYbs5va)^C{S+^FV8vXFnAsoOpRyw%aiPS<VdOyLghyND{%LN3@^c`iA%vqS#)-OY
zE{+4uVIsqXsNA=v_lh-+p95EoZYyMr>U`>=69&J>X<%&gNeki2WPLE?&bwqLv?Ux~
zl~_*Zlw&vJI{2JoMm|F8-g*_0^i!NiS2UZjCUMs|p}UaPPw}gr?+)j#fr;$#Hy)g_
zg(7Rv5~ea^;yAZE8MV?vx0;r7R^8_lsbX%KOH*jck6X~jv77?s7~xq;GL{s~UE|F>
zgCp^&+fp{TblE<N;qvW!`55jtpE<Z9$px@3w#n;lStPzTub2qN#xKP)mANY$6H6zM
z$aN6^{vLH^Q6Yl)1K|p6Heb@$k)QHzuZK-$qFIzO^-64LMV$_KBX(J5M7_i*#ktpI
zC*Zqv0?UH9fq|AjDO>XWLHZ`h%WOSXEQ?=1(i*m5<k{0lK=%Rmg|sQ6(7X_q5A;RX
zzq>IWtAcyMu48z5Z%c{yz|MDfYM}K&xblp^tpBK8?~S5J*$?n!wjs|&VB2@@tbpr_
znx}MQ=8Ej}%6iYN=96#kIE&&++_`32?g9RV=nVdWPn{^tKICgraTA&$HaA8J^cclD
zm7l9&17bxc7`CXlhk|MR5=z1+=vf&JuK|Ax`1m}EMi&Tc@NWoS@8l5!6ng_MatMNW
zM;rCzb%Nw6%mjyr3^V!_E%zt4h4*}Aq8xB`P&+^7{YFm-%du|tTkEsDlchC8^J1A6
z#U9#y#|IVLjSz^b`RW#Fo0maZKapZOxhj3V6_?swfpo*3yt?|erWx;vJ{LkcRW>Ty
z>$iJMd*5JBAC@EbqkR(U2Pf;prY=0EC?TL1t%Y!dOw=G%vUikpr**1db<0}DKDNFp
zgl?TRQ}Q1>pFOE6v;2uEI23DNSA%8rEks_o&Y2UNlpd$!2y0h|3w3zck6~Eg1J_vi
zrDcCCGDqsN!DfvGwpJDHJ+;0)&X`3{pg!h0n`1KSSUna0?p{-?j+X<rYUHo?bydd~
zueKMvB|AsD%i+@Tpp&!NvXe$wXdlaz)LSC21FZ!J((7AwF29$evbd&?iv+3IA{{u^
z!c04;gatU~#Fa|J*s=K*nw71FDPCkyQc1a8c{<b`)>1_em(h}!VL`E3zp>7G3^X*#
zLSk`SO;?7oE7M90i($0v3jgIP%7?skpIyzqzO~qyWYRkHN&b;$k;jNnid?=YH><4v
zw4~REVfaAAyz{W7KvaY8#2aD>2y89|I87*Zx+6J{hA#AAvf_THZRx)pbj@UN9lxEL
zMm2nQpYttou73{IaO>*B&Ff>>YKhAa;k3WZ!R3DGh9Gk_q5N<oz`6>%OuR{H$GCfJ
zR}|EvUNgAqSWq*3M|ns&r)IdtTkef_D`~aduBe+!JEdlbd4X6Vtrr%HctpGN_``|o
zPV$!<?J;fU$pxRU>!24u2W^;~mTOq-=JDjsN+j)+jv=ObGPe2Y)QXAWo{>>$D(#wy
zVfH2iCjN}ckidcNAOq&z00U-+p$>%$xDLfbwR0-Y4}D~Pov`J!#jEby!~=qAd+P^@
z%c%L3RrJSR53$46wmdJ?y{!|1YQ#MPnZWC<BZ7t0%iokk^HHcRp;2-M$=@AZyynTB
zz54iTb+r3KUORbF{NT}!LW$&uc|0Opv(Oh8Tb{v~4A7gv&{M(Ga~bv)h#H?WPIVoY
zFwqyJ&+f%udhc%Oz%*5QCPI_<x%thOy^4f=CC#;#9z?ChI`)3eckF$HL8hdlc2QMw
zo0_CMc0w)A4OINQwl)dho#^P?jER(9XO-|lYKfyQNiSBcK5i4X#EDZWNBGF=p%SDz
z4`piPT+g>IXowB|eba;sJ<dT=`+`}qD#+|qE8OOE;50FAuTo6Gim!(nvT_y36T$7k
zbFLv=CTcQf-+Cr~jvIpZqw6D=QObpf->Xi+7%kkcW7rSKb|ZloO=D!*gdtSjyaoH*
z;Q;QCJA|teW|9ZY4I_VBCE}`@42@c5{sDI6jf|;LK)iq8YN@faK1E^`96UlvvE<wI
z88F?@tqp1o@zDvvI$b$%<meoYqR-m#T_|9gV5;<IVzF2w;ENIRbz{;q#Gm|i?aG+(
z193Ms*pQ)2mT{CC3*8!%j#lZ!oS8|tKhjFru@lQHOY~z1p;+S@p}(bcro9JB>U*FA
zV;lqOazEg*I;yIZ$6tdvMq0Kh0tcH-q%1pOM8cHGisE28<lQ9|cI>}?zttCwl6w1j
z*0{utjT~91uJcNrBnvHyg524T4+l(<8cTRkW+n_QQF)y<JwcfQ?yE8Vw|G%@j}i4C
znQq|%D!{j1lyo9%R&xJ1aB$_6gf~R~_5I&Tqmk#OC}G`mJ4aK|#1g-DoDrAdm<z>r
zGq|`7tUK0(r^CErD?A(0exr)8cm+}V%0MfO267mx%xyo3I1Xx-ZvO3FzMjN3PW+cW
zG-mA~V?b;EaXuZE^a6M-PF1Ud2YKeTB%Xf4)H`l;-Q??Iobf^<tR8n&T%egGXRHbv
zIdv}HC`AO>S^fYa8ZEoNX!w-`9{{J<K+~XGg>j@fTwJgl0&B-un`!3z7JXL~94f39
ziDerx(RGNs8C7H7+M(t5#8swcHIExj-+cP{*BfBy=Pn|%XZncOE@fM#zX?VOkM9)C
z2Cz|T^+WHLcp!a<Mt<?6LFkM=M5(qzTscFL@~+!ni-B2?g=*0&`{=0yEU$yo7T0H6
zx`|H3X5H043QU4?`Zu9npUc`(ZQA}oa1(B=rJcq_x&sqq$vHZ8tqb%EuUvr@0?RGb
z-))ZRsHflbmqAats1hGoI<&kFR(0nDAO;K&M*-?P$=BE0*$f>UNAclKUq6U*Vw}4#
zL6=R)T8dSz^lJv_5R9&d3e0z@-MoEu67Dy?jVxk;;C6pRxO!-P-GE?kMFKJMin0=g
zg1zpR+l$z?v!^GsouUKBkMN#YPYH@x@H!_9tY+!eA>5s+YZp}K0vWHBbvmkqADHfq
zwa}RSZxl4O^)M*m*f+G6_XgP^p281RRpL@qaEAz<Fj^Ck&2O{2LD$2MZc#&<GFPyK
zd29Ij!=Er<2I^Jdm}PKSt;1i)Y0R89mQ?N1i_xb(r}FUa)th2~VcW<EuctY31{%SW
zAj%Q2wCtc-*}w4|nf|yB*@`bq!gASnYck-MCQcqc^eR>w1_Mu&`;9;KSKgnE6y*Lm
zhZqWh@-egpWL8rSR#?!lYEgS4(){Ql-wTBpcNfr&l!SSf8L8JtQ78EAlA|e~4|Z+q
zCDtQlD2DlPaj7F#m2eKn#EWoOe5RIiQr1(}GfhU|J`vnzerFcGs5Fe*6MhgURW$f8
z+_O{sAw>iB>|^l<^>&m^Yz^4zYUh~5?YvNl+VKc`glVs?ZiWKYO_%aseu8=omZQ_^
z8Oaf(d8v}bd6Tk4^>$TI9op*L=5><y<Sy-+VRG%NhS2qj1~*uVQg&IwagEwL7_~#>
zJ`y_^w1(K(_>8WDy^{zX+i_t!vvFY><d_{dabb}RsW)E}q^*|Y!t}%kdr4N=j%Ysw
zSlceWQxT(M{8gYVHc}CF14_!OxHqv?Ko|c~QXlG>Si9NWw}Y?w4_kv!JN&vsWVCNi
zC0_9(A!ctbZsZ~2tps-ve1uU&OkR}wmEFaCxOCaog5qh${JT__{=u6K(($V=ea4Ld
z(g6&HcUUtNSt)&e>Z){RA~WN6U#>^&S$Uqm|G-$DxWE%*VP|Iko3Z>YFZyT30unVj
z8#=v6q$EV1sLK<70U|I88916eOL9Qa<8RU;48j%ejv%lhVPk3wQtJRf@*^_~5aMwM
zPzi#hS0>Z|MmYy#69)?$GXRx@F$k7eIJtugm7MMDtW2z*;0Y4|ggKt&K7Ruqz%v4R
zQWZ&vDBFrjh{za#)J1>QM_^QY5-(}7GlBF=EbM>2Y@ief(lar?<Uv4(oeNaP#R=eG
z<pi*SY&kfX06<U~8wVSJg@psa0>Ug$^|{zy{BVHEIY9L1*(WEcE+;F91pSK`>Pd<8
zA|C?$g<aS{t^O?&dg`%+jfE57Nx)|G#Bxk6Y>XZLh_V2Npy{ze0BNd>EkKH;m)!HO
zk+p%{^Kkz+=2Jyj>EEXkU~FOvas-;N7e4m94D^3Z9X+Vd-xkko4b&G<4d$25Jc*th
zpY&Z01~wqE8Sj%aj32-&3i>>&>Hf=k;7ct-(|<MpQ|mc-_Q4GD!NkPO$qr7Wz^
zSr)eEgpK7nVf~Z)bMB=E+f(CAAVuGc$sfyiFNLBng)bFYxn6SY&k2j*a{_#^WO?xd
zd})E@k28*!3NKz*UR<+*dI)Nd71a8(WbChjfoSFP0{vrJjhr2xRCv#;_&nK9lW1XM
z@{fhKv$cELiO=7E-;#gr$fq6iH{krI&G%FY5)}UDUH8rgv<y!>>K{9TUi>c@X>4HQ
z^t4YM0b0yYTU-%@GL>yrY%HEu4n#Yjiv9^JK5@qXB8OxBC(+yE<D=id_a68p9Q<n+
z{N4S(x9`93S(X37XX%A(t&IO$NXyQ}z{0}D#R7t~%xnxyEKigcz|6$~I_BbF2k}~F
z&>@fu$i@DDAJPK<2nC<6dZ3LB{D*e&56pf(|F5(81ZhE7_-{xH;=cbgu^|t<oz_n3
zW99HZ5uCmFVIbQ15?2KFI~04XsSK(HE81#uvAF0r>9-mlH3@s0`-4%RcGTgVq@s@m
z{bK?K1zTdQc{G}*ER;;u7msXscJ`|2kRhRmI;E$QOwFicZcI8JJx}&e(S<M|cxqMY
z?oWIlU9(R<N>dRV<25C!!=J$(s9GJhomf`=biX;mm5va}*8G)R*?oH*Smhf%s7Q9J
zR2d($A7W6+41*t4nY;ovXO~SEMLx3<5;fwM5+haEbx9EBq^`-{>cysw=tVDK;TlXw
z5ovc%r~NY_AtCVx>}ue_7dVZjx1Wy)hZPJddv$F)ve7&eA1u7HMCfbbs-?3{>$$D4
zJ{}>*ao<Y;UDD`}E9=pHB-Exci8D^8iU?}!7o2Q0;@W5B_V%ZW!fvW#K3_R&P5Yqb
z(~)kUpO>7ItL&&K;BzI`j|p5hUtQMVo@B_ssSk|HdwXwvXFEW-uKhYFe)EXxzQKf8
z=DmNlT3zL$uVeW9fae?->w2N_#f7CyNO`t)P|EHIgQGGx4J{1~ZJH`88#^2laul#X
z_eWYm9;+yQf-G&RhPF<!i=JHG;_9f@7s{+~k^QZtp46l4`;3Fy(%G^WFZQGTRm);N
zu4dm(6%Jmq&%Bc23tJ88AkCAx)%!Bo--&*nCPqDzX$5RECQ-mfV3H%$H?BVFCDb&5
z`ukwY9k`j}+KC_U+RdjZ4J5G7-<IujtM{^B1R5@sd(Vo&R9fZ>x9AZZRM+Wo`##7|
zN#Y^!v0iCO-J#+cEY#W_LTf#C5zH|Z$z(%i62dgOa+<~ucNQ9l#W<UcPQDBHeK>?r
zD%j>#$lx*hMt@K;!P#=QOlg|!V79#JTZ^^j@jlP>Vbjn3<ymW8YsScSbOe50JcJ+*
zLNBUmqhE3sO&<w4^CK;aP~Ma{A|JwPVoiG9s_;PaX$xZcQ+!tjkGu^p>??Gbut%=c
z@MtP1t`1jIotLSfUnnNh{qfVAzt(e)<;$wq!RR8(npeW4En)7A)1vU&M7y+F`F^zC
zNtQ)%^lGaQpN{fOE`c8ZHNtJDuBOB=(eFOtJiV|AJ;b6oedtrq1N$6)SaU|Ue73}u
zItXb)KHXi-<L0FniGi(!Mc1EYT(y?v&abC5){=ycimdL}<SJT|6ia2?Oo+^@V5)RP
zu^{+`9WQeRDLSopny*UUem+l?iA<t&t<>r8IXYf$u1~1YNzVJd{%{Xlzh=Qm!0C4C
zcUgG6`Yoiz)1#S)@aU+}c#Z$4(Z!4C_r!wJRl9emC-Uh7^u(uiZxjaxD~QT>JD13`
zUD~sZzqNJ)O7{?sEUZsrNfmy%_a6$3=uI5*$jC-&-cg_34=3q%*8aS(Yye+xCOl1G
zq)%zBnS`-je^BXO-FY2Mwf>6j)V3HvtZZDW>`LJxQmIqzbk^y?UFr+lwuN%kb}?}*
zccKh@-)cav)5==?Fn)i}5fy=)AJ<U7s5QJhmdW8;P;1C=*K6oIXY|<Wd{CD9aepM(
zU5&2MVdlK)HA|sRz5CBkq<kwEHK|bE+iXj6emznW1gCIO%>AD{C4_t8gjm_>t>0@a
zM#~<cZIDXCVW?lNfq^T2v2x6x5#CyCJJR~HdUYfcKH1UnQT4H;X$SGg+eV&JQ^&gF
z3cd@1=KQ_`4Mky&hJyDecr8b<pGz6=xK6h^Rzxe2W@abCC9&x{&TFKo3GmRF^DgII
z!OUuzO~ZSpkrIZHt=VzGO;X8ui^)_`41Ryck-F<AbglYO5Nl{XNgDcNL&T}0egtAk
z>zyiXB{v-~+UTmK|7+W?b!^xp)LgwB0l%8GAHzYf@dIrWw0`U(FD6szHQwJXGWx81
z*%?5;`>@!NMX`vn_8a<wbz$<_(jx++S$59{zjqfYClEuC<!(m#AeQ<~r)C0;NMRkn
zkkO)d`fbB@)`BBtf$On5MN9dK0iP0;<*Y)pP2}b0f|Hf4yPgVek3Fz@1=`k@r0m<&
zSVo~dR9Io57I*4lC$3=`jjxsEkK0rUA=t>?^Je}8PxtjZonwkCs5BoV4YLbYc2gSM
z&J5>}4_1_O_7~gfFO69Z-JQxPWr%J<4$@{vnlfCjSEGv+QVh($bUwnYe-V6bFc3m8
zEaN_-#hOo$vt%9~EK684avSgR!(3n?rwtiRL`$D^k=cNF;5~9~qj`Up8^m#)@OM41
z%N-2=1Bc&YrDd}vF3tCQ(R*yCEjpfp_`kj>SohBQ{z#Z%H?7U!=6YYBF)1w}NU0>Q
z>EjZ&`j|)}K~US@ZW1av=$P|UZgNRx+(ULgZneT&)EvAf&708fxKmTo|0swRL8CqZ
z<-PSHligj`jJ?Av*<;GytD1ZI8n8|%wcK7dEND|cGGPUcVLJZdWI2A5;&X<Q!gA{-
zYaUF}FDo9ykr#2E?F~Nn<9l95&fMAUH6VQ6Bd?o0?KE*^NkbbP?CWmiHRtLtySA@p
zjjVrNV&7C++!z>Y)goeC5*&Gie}4@l^r^G0Yb^5k_;^+KsHuNFWcKnh5Wv#rcGTL`
zrgg<PUmG+@`@Lc!>I`?sX-yZ+4y9F373Eb&US`$en_BUv7$hq8Fjn;)ChfICKHm;k
z(v`io=-E||wsgg7kE7nD4$uAhYBgP-yXeY?hp)-H-lNlGkTS9%h~b2O3~zl)$oFI_
zJ0xra#%(P;W!p-Rs>|x>CJBoOj2NT&Bw*`c!EbY1y3eD=dUhf`PG>^9!|?drKHl9P
ztbCbR^yF}F!#|ml{k>nWrsr{XUHNc!l&r_)a#*m|M!)xgi06H0He!<Th0CqG2S=Ot
zp?i7PLwdZJRr_^?f4oBI<B`6^#AzIfw`$m?`TN#M@qU@Bqn1~fQ;WO0H%GR5Il|eh
zi8^6j>R0=p3?xHBvc~#gF#QhzUwN<T5-l#8%qq1fKHs-`D_jz|w*KlPaD7|6N(CHq
z88yl_!1wp6oy^fFS4;O0to>B73_(7qpd5!+W?Ed%BzZ(ia>v`YJH~U(o4GXk1%9#4
z*GrnGSqTsBWWL$t&NaY)E4BV$woH}9<H#GL8QAQ5vF&*$rsE75ok24v>vbko=Vki3
z+$xQuqjoE11|4qor(fkq7|82oOe8quu_;aBt&2Ql`471W^zS<{7Fbcc4RMVpKC3P@
z<oTYkq+0trUTRle&$ABmP~t~>aNHYoB@!UMOG4{-yqvZcxSKYyd=O(j#`LX&Hi;H)
zRWhL6g4G|2!|R=uK3CgWd2JF+TDX}_v&1MqZ6Elio`^Va%3~n!iYTV%ifq3&3lIXL
z$E&W0Wws=24MhX8o(UIWcS)231_zY`Uh1~kmVSIzk3e@w4}3)|;-k&Qp#jG$iW;=5
z>!mEPH)|wo^>cASr|uK>^afYS63U%DsqSFxA&sF>ZN~>r%qvip9eL3c9h#bSUo_jz
ze`W1-luwn-X>e${stXw1Qy!p}k1tv^)HnDn(Kje8U0eCqv>c%)l!deU>EXo5-84ZU
zBsTdH=@^tzr^@QF%wielo#0v0rdn+DEy5h+R5!#+(Q)8g!ZwYZr&MQ&4$(8>AM6|Z
zJEvB+JI!)bv=6I~18CKJ2i3>Prqnvj%Z9_OFX3~<txqomlS!Sc3-bdWmesWeSZ~dY
ztjROp_u&zKpqs<9<Vm#*!vT36a`*!BT0$?tIy*H;GprYJSs<mV!GU{#OV;2KR-Gk5
zqxv|no*Zvcx<)oiCId7Mq#~1rk%;(wI_=B`AFp|%Etu3(N~j8H7Z8tZCrYkyqY{dw
zDuY3J=plhH)}p^d0q<syF|akXnTV7P>_qDFgp#eP%~?z-!h?_$errkDnZI7QE)0=^
zDq0P#$^%{zvb`>XGbR}-BMJvkzUfB}eS*u$0_L`57X|5s_gTsfwZ_zun<cjw)`+Hq
zTYFfIPEU6nR`e#gF-_&Qh9q|r$7>gRSCTHsP30um?XohYL3HeNutlo3!jtk2X54H8
z9OBi~U|QRmigHmQMvCv{#~dVL-Fd&VH2I70j*#x>zmfwtD<FYU=%lkc=oF-e#>Rh}
z0u?I4o{t$@{MKzl9i2(QEQYY|9Tsbf=zRs)$LV;AMq=6pH}0=$jX%;QnbX~pBl~G5
z7}6u+sYvDs-m)!EZIh?T`tz%RWKWa|UA?923b`n!KRAQILJ;+*PK*5(OnQrQJ~V<=
zA~BH_mZYjUJEOnlPH;4t;vxr$R)%s8B?s+V>O&!?K1mm4;c#ze*(u@@=)mAk8RLv0
zd}(JIqhYG=%0%bH6dvGQ2C;J*nBa-D05p;WsM$~UX>S<Ch;baryEqeBk#yq+D_R1f
zN)5|pq!E#L*O&DU%M?e0<m}>?S^$o3j+wNJ8%bN4!kSm1JuRv_@6Jfb4urfUdR5^n
zIL}}8c-9cl0Td;0RbBC1=E@6{ro5%ET`A8KMGuP5_>G{dWQmd0ReJh2gR#X!h38mV
z>`Df{y7?k(aczWfyI_9cEdN<?R}*nod8g|fo2;C-17!szP&=T8qfaLM1kYhmWk|4I
zpQPM|b<-G#OwPCyBEAxl#r$2m-qC!G7!I8onu#PT14R{xzPL>hfbn_H64FlXYYR8B
z3k-rRX<ARG21@lFH?AmAsDY{?@+g^or8<ccOhGTLvyJ0tVY680Uhc4~Y3tE&pg2}e
zrqaO1$goBTZXl^CAEDE-y>My4tU&E_;ddt+7WZh?*vf#p*JOC2-elw90l3HtBR{Bx
zg_=SQq1K?0DSZ%R4g)a_dW*7|kFdW==9{rk2^r)Q2u5we6qPb@3&Xf#=t~=!QqAXZ
zV!vi(V#ds0Z+$N!+^}7SOA8(Ode_g8GYTj*z}2&?{ll`E{X&x@vn$J@EMU3H8$$r`
z*OrylCqr_KL=_;JL!$W)=_(;gQc>$~AbKtq+aJ0i0e_)H-jU}BqUY2#?$vXtua&#Q
zar!f2J86@WsRA+(JPX3>nAH{4oFz?B5+^*J*4Qi}=H#xtO@K|t!BEhs@e`tE<AT(g
zXSXR497P5vP~BgP2ENnT5<vef3v@5=n_C|I&Nms6d^RJ@ex4Dbn352zPy*MjCZ~v6
z`qLpgMBi?_tYPMzZehQ4f8#CC?<|X9lg+Hxf1gs=j)Om}&$tHvG6P7$1<BavA(5i6
z!z)s~;+!=~z;2P>%hcoedjyYnXiq}vS6J!+#qbJx85DJ$XLUQp6b<d9b~W-in`*7-
z;~28Uh&c<XgcC}b*U2Xwv;rKKy>Bh+-V;?Rx~M^F)n2OiOe{0gipV(3xp<dMX>-av
zX8;0$v#!LPt&mc*{UCmz-*szM|EFlg&P>9__x>wbbszChLhoj9BHOC4aPIT*?e{q$
zP8sZM{R#Zjx9S@gp?1Cfq&hvsi6&3M8US|RL+U$M;9Cez_nF7wLrv%RgDtCVNOKWq
z_J4uV1Xh!-Losp}5zjNq)9(y@d?ho+y+H$ZvlQ84(|PpHE5MiXO3d(x<z|`<`|Y8S
z7({L70@54j^~Cw3;Bf{{Y!UBf1=CC|qM-?dYiAAK0e>6#%skVZLx{yq5&YBFLiBKw
z#9R58M^xJuf^LG&m|2rtKk-$u4j{0SKS^bxetOK}-2n`(fip&(hv-uVHrPY@U{>z)
zSceLpj>_Y&P-Wv^u8SeypJq(tPMC?UZb7_LwGj1&Z-F1viiyK#!^lv^z|^XCrud1H
zR5THM?k=xH;KmK-anb2+>mzn8#yGTSwWrx%eAjl2N$_sbS`<XjIUrS{H<WCqb&5gs
zyluc&5Mc$vW~;49+zk5Q4(TB6cVLWzIY#+~A)Uu*0=_lDNWVC3pKVc>yX$HX0$0w`
zshn?Ct>S%}y>S*#`tqkOphdD~74&^^&pK1eT(D0-CU|953$OfK_f1CL_n$5nI}FEo
zh%4syYx{IV^<OjnvVxXQtsIt{x+@;)^5@FBYh~?E*cUJxemRw0pPSBU?eSkMmkVIN
zc5OpH9KR%FN<WdDoQEy^nK{U`N<V!O!OSC(%)1LegfZ)5mX)OO5ux#-U9sCnP<~i8
zceI%HRr;xMN|#5a5^i7pdnXE^_6&Q~!!^!hi}{R4={oT2X_!K~cxRNaw`vosxLX=D
zQFvJQ`NN&1Xg{0Eaxg!EdmksgF}n=7hPNsmxG4^p=+Z~uMW1FPy+x9nFixo)!JQKm
zaBiM}JdU2?4NV;R(C4a4w8|SQ!;QL$?M~P`_MT9AaWLeg8_fv!NW2+`V5U`fH-AkN
zMmCe0;iG^WO&aZM3ZzGlUCFZiZXYK{a`wTg#eRc|UW!cG*JRC{j^r-E?p<F~X_?Oj
zcO`=!ottAE3xrw|#}W%m+Lh#zFLoW<;%^#dl5war8D#iZ8ab!PU0{yRNsxV7i=>gf
ztqeIF&x)iEjd)EAw+f|$%vL^fnHZk%m~3KO5B2hQCWQH#D^Ub<?}3-$$7<;;*);l|
znTlB{ru!<_lCm4P^a|80pr<>yD5rpX-_(*`ft!k%b>D4_MU9MbJHJciJTus3I-|~-
z)E)>!<*5uC?=@?1Ouwnh4I5AWj)H9{lqcKi%SSDZhck(;#vr75hN!veX}y_l|BG?z
zthBaf1a@1*qjC1@_@<T}!)g-FU~dg?)=2(qm8jFtak36yNKkS0AXqHczZ^T<emM>w
zWKzvpC8fof(-`)3j;80?LQTIR@2Gw70T79CWtm{8y{0CiBsam96YDoJD3el+JEQXn
zVaVEH)`<0=%^@jx>qI$#VhE07E{&UzsV(D3&DPCQ69?^g;fH|t@dE-GAxt4|hO$vj
zagiNo=18Z#{o6?JCw@K@a8;r}K2z^U*Ii)6P5|P{(eEZVv=wEG<7jTe`Y`qLOQA)f
z*1MdP31BYO=13^RGZZFcwbeygV>#ilhz5#l*8LN~IJ-tRL>NdoaidDBon)a+<Lp1g
z<mBv?K-D$TWngb|x+`KSvN*+9jcADUP@B&dyv|DOLb2mUDZ|^0CZ$h-ti%0ep15Bk
zEY!nnR|XH(jr1)e4k7>uAv(#M?s9HrqhIt!^=niNW16gAExdpCzA_oZ_CSRp9n@aF
z|3yp$#-45<(sm{wQMC(6!&^<AT<H|-o^ZS`UnBr|Ab8dfITWK<WXuWSx6sC59f=$b
zyP6o#K^a1bA+vb0MZy9q7rK-5%v_r@EkVBPQXgErKpcvEPK6ReS<$ajTSzel@G7?U
ztm{<`^XGmVFnH9&Za*?wfV5A|7BzeV5?oGqe(yFM5>9+H1iL<CBF@Gc35S``n|D~g
zU&774lF+`QJ~RvtLE@*Rg=5W7CDG8$jP7|axYXKJ@@`%=Z>SSUAliyFfF=Q9iDCtC
z0wzT5;>Np{{)N!}1L<t`Y9LLk^`Y~6$rAhX>GTtn$Uz8Q?erLK92EEUK!Xr^k(3)n
zsp%9v`iwcd0a+NO$d#WuGpcF+DT4(X)CW%Kf>gU1)bGmV`2Yj(j}9a>DetdORXZ4a
zKRnFJSMscHtO8CDZaiD*v#@lJP`aJ#<F7r0j~<TELNLL<FxCa#X*Df<39>!E)y!C)
zntOc;^<ji)eRZq?r4sh4wf%e4F;-<|q%9a{wm>gWYvw8%0QndC^xK_FDCinlxtt~f
zh86r)g4GEuO3TY&FF(u_L_3ZX2Uwxi@5d;vey+~2&A$av*UiD}_$w>fz?6A23DF?X
z*zQ^ypjxeuh$+E>^rr@ez+FEUO%cp*Z~Kj)lE88j+Jh(ieem>vva4EI`PRkZCfp_8
zxpn-?EXZZi>p>RP^3wHAbD6kut)Ii~?W^lxl;|=1(8=YED;I9}k1bOx6ur!0p+?P|
z(9TTY6W`C=A&yZfQC-~A+6=)xE?{^^iFx6hVP%{EEu%hq6SYnF?CTj4-Gd%WeBuyL
zc0C=Y+elxp{Tm_V28iHxH1;oH_+*b!+&*-Zl!FUY(H`#GzC+!E@kZGCh2#?y*}E!o
z;->_$h*tG=>liFhunWejDz0{2Cu_xkD-U6fJwQt?GDY-qd0(V*CLXNOP747-ZLNnq
zfjpG1jN8M7_LY@FRHny|uNUB}$cf@|ZL!3-Kz-HxxVr91l2oMrTAY1(h<J;(?`LHR
zV1{7ixo~oV6$uz%Nu#wI81f@@!MNlofF4QhAP_-+b4YWB9|<hOF+m_-hlYnr6mppO
zrM7JaQ=;r8E*sIfUPkgB*8xNB-IU}c8o>;<s#<D!zz{6DJ6U;69dK_m!l2)IwR~#f
z!MitT|3k_u?3v^K2lY>=o4fO{)nVQ-vD-bN)Rwam5_r=>J>6qPYMZ8&z5Il27*j{5
zpDrb(QRjdDNSud0OjQ4*2C73_9oN48#)F()rLuRt9VE1BQ?$QHs*{o*CzjCu>=K51
zgdtGr5|&u?aXb>c*~GAzRp2BkOlKk~43{2L;5sQR(l+&`sZQEzGAYbACMhgvnJ*Ln
zKET@bj>}oB4Djohli0{%)D0*pYvkG7ohGQ3Xd(6CUjY&HxVM8G{8mGKA-;ZnA(>5Q
z8)n~bn5?`cJwvW-&6R)tzMN-e!uDz$+qH!`u(SDs=yPD_g4Ojn<GAryX{uj&;fpx_
z8#~y!=S=6UnUkz^r9zrc$;^1f@wP`!=m8r&u_^ICAg{lkzheizUF7NX6XpT_{a%s(
z2IGN9(f@*>|Dj<|Na_CwNPlK^{}YhT2?F9AATZ6$#0mr*fT%hLCx8_M;#ffRoE=2P
zSy@2j{E4VPQF73E7SO|vtRS1ej-M*CgRtBaY3Bm5v8Ox>Cy1=G{V50e0zGI6U}t4}
zPB}P0HY`tepte9Z|4Q0{Pj4&vi!uIrz>$rU^Z$|d{}E{if{6DMX=i-_;2baHn_c9Y
zZnM7taQ2r%j+Y$M3qNOn!Qt#L7@gyVinG7Kc#ap0&My4yh2w?0v;Rr{RR06q|Iqsv
zXB;o2o&ANa|6%eRe?a}8zPvO9d@<qpWBKBf?ZuMyPhVb~ae&VKA0_mEgYo}nYybZc
z`agU6e~9UyuOk17>7W1n|AgtEN%OygWdPg1V*3Bc>Ho9F=fC0ff<piF95qO>15%Ov
z#|No7K#2bN0cIuuJqs&413L%T)8zrk%*4jP3OagX^h_X%&+&92_`m)jHSo{7g8ndi
zmY268{jGR-KL4*%31al1N4Nj!L28!&;R}JjawEFEnrYe{;m07QVGaxpmyV+#ZYE<U
zrh|INz93UWynvi8;|A9|cx-jWm!7F3goI9H4GvlGImhp{HMI=&yZqVQ28AkfOY06z
z$>JHGRjf8i6|2gNSqR)?-?0pDXpxSq)sy42%%3;ok3N}#f#^aDdp7xk1FH%e1dDfb
z9bH2(HwQnEQQUmw&wh(B`M`cEMfH5wb);d#21ww5u&iyZifOw$$2NE%VtS0@c%^mu
zEwRZUkix>?p*Ck5<G7XkIQW+(({*5=3cB*gA813_@Q(a+fxR<&i~((4ltMx5f%S8n
zm81kh&3G&9fv5jpduJIH<+nF%7#d;dMnJl27)qo|x<jP|k?t581f;tg>6DNfIus;D
zx<k4IK}zZwl=DAFeb;)<m*?AiuXV3k{MdEJ`rRM)wXco2t?o$f)=-M`<-2=?QpF1Y
zUX!fPg5R|puS^)EJz(Y$Azy0L+0n*vT>N8*hZbs91HzjOTS|teU<ryBr4zZcNNT3=
z_0T{lT_7}&f=!UY^6kUZx7FaCKII#h5)PTO%OKG!y?HJty5XLNhC0qRVxm!olcks6
zI&dxr)~m2iA~R+>zX{$<S6J+0AP!lLpCiD_ZCqy6c0B<#0yMkMXP$c{wi>E+tfpG5
zlr_(#UUojwVVp=xm>A+2Dq$+i(AAO?tq!$(N%E4U^d7w>v&fT0!|wIh4Rd1<yQFH%
zOs_XLq80T%Ym}TXt<OVw7I~_gJf=t9)_>uyyF#0s=f+O4wXk?ELUIvdps%74-jpO#
zWjVb!p}Fqv=@?<>`D0axJQiEp(eF`^H*O=w8{AG=MfQ&Yu6tw;&&SlKHcl^`kFF$N
z5uQOMTmgE41-=Dxeu{=HKDd>78d|AILg(o0>}Bby&vbM&RCRSOms_jab{C^s8e2&|
zNJ)JFG2dHubZK0yut$U|gj@%lAP%=8CxEuy2@Kel_>|a_pNXni1Pk_siW<9X8qcI&
z_0dQNIUamm!lroQWiq_Q`L!$4e*&p1E%gbfjrj~ubNlw`J|Rm9#k}{)K;#~=wgwdH
za5HUiCjH2Ue9L1eDn=NG9D7yd=A~lFTP!^kH3EMOFf{Ny5c-}s&M%16uqd4KdylR6
zk#4ibwAWsI!Fs2-%~^GK!QjK49k2*+iEu5Yed^GeE`Q3-_!QyTbk1Ztt`bUV<xP-^
zJOaO{0c>HG5$l{9H%ZUmXvV5z(-}TlN=)PppM63p1T|XeUT$l?sWl1@YMUUAYC8fF
z6ZWr?xGvwHjj!69t+S?7uiJwoY3H3$7cItqLpqLezQhOC@GE%{EP;dBIR+)eP@Avf
zJj)yPb7h*z(9}e5^|yVb6n+@jZbjn~)<)(ry}vQMoCsU{hZ1jjwRH0`etHHx!;Pyg
zNOR<KVkpR5N{1gZbKn>eD&(vDwS#Xt&E<>YRbTw}(~UqD+qY1+K=1A2`T^#319!Qj
zQc=UQ01ps0{|-5~#+Ti)khaQaJ_zf!a3Lm@eelt!hkmM3_kMs90XSzBP6ed+Nar8&
zEwWerThNz8#vh2)RvXNYL2fKYJ)fUj8|eD%H2eCDK|P`U6_%VmE6();$yFzQ<?HSO
z*CpC^gI7+P^K*Tnbc9_I;~*Q0q8k)5QwOp0i%h69#2wo8Ov`&Eysw*nD`rTl!H~OZ
z?^(XQhsvxkeGXYi$e}JH!<V{nU0-D>*%S-Z=F$(1MiTOlj{UDOCldX!jo$fASWA>@
zW3|-GET>C#Mp~Y<Y!Y5j`*@aeyqeE^322(vB$#zg7ihZrT!O9&RT^ZZhG-L+Z9dV!
zAje;S31KyB|0vEAR?H|r>-@DP@E#>uW$hsriPj0FAk-eOo9wH|w|bAS=jE-?7kl%A
zY})pW7ZAp9#d^#=gBnCUzK2c57ZOi1-QRj3>D%{+<^4!qXB=*!poePQ3{;@2XUc#k
zRNLN`S#`Lhel#|2qZQVk9e3LZ-)6Rk9xk3#R};)DzcM}tD=jML>4NOFc)R9N&fz>U
z7!=c#zi|1r(vyHvh5gdJbLXFW)39qt`taAe-dtbx+xj#v%&AnQRcDuBkZoA$Q}WkG
zt*2H9)$X34;T3RRzte)dkxqsyH2~nf=>GUpVN`N7aR4z_aocWJtD&c~B(LMVpeJE>
zH8>Z3!Uj*<m7gbUrG9K#f3d<`G{Lj~xo&GZVtf{8mxr&C>$4x_m#-7EzHLUqFVenN
z&?eg3v)@~}e2i_9T&b!f-=M@#Iw5npoPB!bY?)x72NKimfBem{{tL}hg(K^|9lq(^
z8R2x$#%nS=TK7%1wMis6dzEQ|PLX-{$?fHGy^Mw`G$C^~lVwFwu9QLthLW5O`|Oto
z^K&63LyipMnWWx%<+ZhYG1lcg%GMVq^FNMvgwIW`8l88@WTfp-Tq4_Lrb|72Ox_>Q
zSn`MGPcG$JX;DD-z&STE9k!EjwYt%^DZ-D#O%%0`3_~)z`&Mgw0j0{OUDRrL1|cX{
z^R_^si_0EAYA-pKt%h)fyw@Aqp&Y3w>!;OcCohy;wD+G@7eU?j1b1GsP&bzI>48Wp
zHYs$QoTh7bE<5&H*pH1o_u@&P)lAP+n|zLBNMbu)N?b?zYF$0RhY24g|IOZpi6L%*
z0jt@TDG2|4Z&c?H$-(ewyBF{~U2Pz=aRyh7BO7O!S;qdr8w2eIi(~$DIpyJ%<b=eA
z@{&QedbX}fdWtR=Rzl(!N&k{@84J@fX<<@=BHz&KT`ln;sZUwMdjRHqP9uUC&j`aH
z@ml7@5&|w)>i+OXJ<M6kN5@Os58hO`@Lr#Ps&Y0wyUy^+UvACv5<<<2at(T=2|44t
zHas4D6r*V1LAvODNt~(0Z$RSaz4iQZexC^?fihy(yk11wp`UB%!g|nm8>hR5S&F|m
zqt^nIM#;jGVw|r3%Kq^dc{Gx#quls(dMr0RNqC=J0}Y8G)l15~&B!;eOy+ZZSm!q1
z**a->7}^c^DFcDbNOPOn20?(n(Fzv4Ol^8$k-9DCHS?S*G+%Q>PCc1M5O~ydg3&o<
zwp<7z3w7sMD6A14s@?B7sydePbdU^LS&53;6KO9exqy4FzRrsb=;OI$C{hXPwmG`M
zalPhIf#5OPZCb?Iav|e_nY9$(6wp)JZnWm5lUlY$rSWvGiSVBk&Py1Ry>_uzhbDbe
zI^F@itUJ4E&=o+f#a9G8JNDgS{v!3>FVs=97k+St2!AL0?Q@l?)}_fz;gEe53K2IK
zfSqV97)fa@yEwpKQbILY)0s{|yehq88f)5`_d=qtQ0``v(e-SPV=?Soi-Gr3x3x!K
zM*~l+XJ_`#2DEY%V#)QxrF&rOG5qn-*q=!sCnvoXh@jUiO+VUGgv?5>A==#6I-G1h
zy}pvOqhVL|cA)fj#H^>7^!@RJfJRnGSpcIAYgtvTmUAzpK>UR5dFEgvmGMYv*ys<T
zTo;QJe7D#daWbVwse0m%wpOjK_XdUT)xEW{X$CHyw0w}4O%ZI{GYV%gP|v?~p5ifx
zY`e%LPjlVTt5FhPbO10WzM0N@UUi+#I;}CLKkOKr=>4omx3nB;Ie5|f@T-oe%v-3@
zW=I{<%8c@^CPlB62v|he^}+|xp4U85-=eA6pGYwsKOOlM$VfJl_L)dEC(a)|d3)8N
z$Jv4PkYlEIIW$d|1x4{fM_ZcXRLzk_YX?(}Ch8$-tCO4iBv!XDNbrQUL4K6c{4&M@
zL{j}(o|Oz|TSx=><3s5Y>(y9RS^STsFT@p<8_hq&ey`3cgOV+}?w<$OyFw2syf;dt
zcBcwGZ#0XLA-$;=lV67kqsQJ|JnU2Ni)w8`St*|U)P0KB9XTV{=k0;-YP?+E)-kzg
z7j{T|uq5mw-bQvdD8G$VO1?p8JRCsAy+Ascnq`mYEo?V*>Dc*0H>wWT#t>nqE&U2P
zgyLiIkK&rXc|>l6eYf;&YuX?}T5i^F@HuvybBD5;uPO@8zw4E9>3dCX`IJ$RKDmmL
znV3_IW6ji?Whf(=@naM5Kq7l7|HyUzMVE)P-94LW5O=kVo3xv$NfhNsd5!rBNMx?2
zO1TJnKE<{6RM}^5NA1(#xNL-33r=3Dbdzk3phL_^G$*4{?fty#xW432$Y51>YFhsy
z1X<00nqLNTu{p)gIkUK^Q;X}$PTGW*^k@zvD=2}l!Igo8nTVNC#QW&_aLZU(Jy-l8
zccby=*pc<_p!!bCnC~Y`&Uy7gP1ef|HEz-*AI(-#mb$xUP8?0Y+?U>@SO|E>!?0Gc
zxLkBvU58IU$JGqUs%|lT)8*5?JQvYa%8WdjOhz%8m|=Nv!_R41w<$a{h!$=wFgF+8
zq<smOMOCk~Mk@@Tz=-nWnTDv9x;*JzS@U}WUmQzaqXsm}UK^Wf&!^SopR12Tp&rxP
z@+UwH_Hlis#5YSL&k@soupD4>o;bBCD-OMD9kTf84aT(n`rK@E93)6fB!SVA(l4w}
z4~a6j^&=ykMT8uhEz$HZ#@kMT<qN|}O^{wJN=K)GrsMIK26}TLL41sb$dhyP;Te!m
zoxr)V23}6rAs9XC@pTHceWb|V@L1q7V!ivkprgI~Xq|L}ZGYh$WwWy)|BBfJaK*Uk
zid$^u7IF&Q9hb1Dm>}=LyJ@_?_)hJ7i+3&j@DoY`&DEK*hw^f%SXB@-_r21p9&+t0
z#-PwEJSO#8S+tmIWdDwuP4a=6S=|T$Mau#FSdtox<aqpdq`85WzVwyYS#Cfg5p*}J
z3*cJ$hFPHflE=c?{p+j!1hDsIhOIG8X6???5cH!_nm)X~AsUW%1GzN9Sw7!V4dRrU
zulxOtk17EKSr`2878a%6P2%-Eu%R*OeFmmkvtPoXl{D+>v%WMr#(w?&vKQD8X6$$c
zTvOT1#i}H0Qu4z7iq>qWGo8Qd{;e`<VWr&z3s3;y$$-H3*KeKj4r5Wcw)Ec|yz)pp
z<UGk=+%*FL^A~4bF6@P_p75bNSRo6%)(|-Sm_{)INg7wudbl?@JBW$7zL$kf+V_TH
zahd2Fck}MSJ-^HYl)`vh#wc%5{dOG7*tS8+@|h{GaW)?Yj=gT5A)g|J!6)!eoFfcG
z%L-bueno>)H<wpX*$MPerYrUGE4sr?eSw=dC#Z*ZX~WOXEI%tsP(<Jij<*6mUK9n=
zoF-JHi{!Qddn@_0FN~)|(06Vko>bQXJmE=Sx78k;GM@50LH`7@TaWf8jfEm?CV%DM
z9=$J$=R0JsN0gekBjz=8F^s=QwUkvbG8~^P+(Oo?iwZqXi!Pf)z!(mAM6+b}88fHl
z2cOCVH)zo*LTf~kG6>Tv?f~J_U}bugX^zJ*;dj96nJ_)bJx_y`w;PF$vklq}J{D!E
z9^g+F9FzLF<zwu_Kud$+*4+Nd!ExuUi7nvvb%E_f#e_)qP5Je{ZvBh-q+`j6ig)q5
zfz$reJJnf(-bdU{)7D971BBydz%$KG?iG>AFK)fZHs7hqByK`?rjjL|G)G5(D=CbO
z*}^GJjqiucOzT9k2PqUszEfpS>gi6(ElzLm)_!(kc<nlMCdcDAQ$G!%PUYO=A={v4
z^AS*aKhtl>D_0!HB|T}F86glKkv64+u%l;H7WfUDY{#%UEaKRVd(LOfo|n0k^4+(!
zqDU5_6_!IeDA>xA?ocVMzZ>r}c=Wox8_Jm0Qw0B*@fETP{<#XrVGi+^2Nrro<eB3|
z=&fMRZFp3QaRdd7-UA{jYFrgm62tM(PB;U28a(vQB$-HB_LLz>NT{7D-DM*@H%f}p
zDhh@kBM_V2mjF(lBXHG7;o!)Z4x<gncMHB**mq^Q8PN%;5#{s)G7~rU@v*z{rIWUR
zfu5#j*D1!qaceSpSg82N_NQHM(HHPB4uJr=BcLi_E{l|_RHW|rdwTXTUb(X+XW*V{
z%xU1cY%{~A<2M^7+Otkks<=@ga}buBp~Rzw2hv`gr6_f@h#7h<v`DQ7#T=*;QhX*;
zu|?7)VnN_lJ^3W!1TVGZinm2vrq7$;CZ(9%8O6sHxJ*Y7t)uXfoj-_%1gCN%zb=wU
zIB*7$v%t$Br6(+rAgu)B8^otp0S-d#kll={Wc6O;lSRJ`*Z!y`w2k{ntwy^HHLP+z
zxm&Dr7wxoZc>W`WJzY#=9;(uVH{QN<L#J&kYVQRc+2wS>ui$c>GC44^$az*4&v3VI
zC?B}lTNj#~aIqaa8DtA%=RCn|Pen|;5z-Gf*~2#oel8Dv7dk6{DoL-+k&j!Tzf}Mc
za?(~%Q)er4PxCvZQw4T?_Y4ROmlFe&ahV!X9;0g#5{)_mUWPlwkPrbxp+!NG#3zWz
zfbzq)?y;yl+(%0}JRD3pi8L?gzcZ<~KC_cHJS-O81XN(CvT`-X4M(9!i>>#J#t9WL
zMh}RcMTxN%c`Gp%IEiafAF#kD@UMYBM5M`K=eABDG;uYiWq+H-$6!r-)~=iWT-A9*
z*{faNQ9Rt7UUG7TPHrj66q4U(oxVLBZF-*@sOmv)l@V${qC_Sv0hCZ1l1oj3gz841
z5Y$JRgGYZPB?`U~NSKu!$u(YSW~WmX8JF3-rz$JyAGAKk8K<I<yIIf{@Y+u}g#Hq4
zIp<6x*+~Rg&6-jf9X8^k9_bZ6DoC24<RS+bjys6KM3%Ra6OYV3fN10!Rb>_pGQ_Kj
z9no<I%8K-;ry!XYXPPW1ou!swg5&NbRTfHGVt=xt%=x5oeMVgu?6->ZWD=2HRqAyC
zyK~~E301F8uyj^bw*z%9&Vn)PPB9B}p=9A#p0q*_y&P{G=91PgzEas6xVTWJ!uR^i
zJl5X<<T2g-6FKGr8rciEKWtugL#{q%v8cO8I=+M?i&{Hz0oey1)3<zchZ092o1RiV
z+Gvv0imJw-Rl61^f#9(?jDYa$L9L;|tcxKegqwwG(5>^Fu(1z0d2=hu*k_XCY;s4u
zV3#L7<0%xRr)s1X_Jmky@?`;fi&8xFIhq=uj0ivDoRw%QWT$8<WXjgJ^&REYbjyPt
zDQ9u7sOC`|PhaQAxmJ-Yxvbzbt9KENGG`S)oye|BxmZSbQ;MdGi{vHox2RtAQqcA(
znLH0yi2}CUa=c4^Le!;Ezj6_E6Bd3vt#({V98%S|`Pia_M3?dY!|IOk;i|5I5JEAL
zt!xi1os~CPj_pcB0|S*71uk5l?^hd0=Gx<gqe(y9&TG&h2cFS2B(<Y=42}w<u)#sd
zmZ)E{nQ`SC!CT$O9}%;!@33GIV#A0`H}g$UN{+L~qn<C4`-EJW&g1e?pfjwm7;i#d
z0|c#B3q>26F<ghc9JHE5Y$(U4TwW9fOLf=LW<`Xn<Bj8VDA@7B$&@=uYdu_^2HM@%
z$f9R{*H<3VH;j5w!b;fZXMvWGiwvnlum*fa^b}ws3W$vpBP|M$q=4P6BKGW3=9ey}
zFSwPAiVYhz&Z&Ov%c*_MKCJIdDm;8>^x?6!G2jxd8Ew%#hj=E|d+hNVNcGVaQ=ogJ
zOpWaK>C^zHJl%Z1?Xal`JDY-d1r3_}L<zKhY@>oiN0Equ^vqd-79|PduD%p(aJGtX
zi>}S`t9d!+OZYY0w7qh`icOdr($A*@MaMfm`y`!{m;y;31l+{qmmn$Z);~~u$+R<@
z>`WG0SEtU=q{0$r#`=UYK4hkmrZ`ZVw9PvFXov?(eyS?&yzo)4Zh<$ReZ21lHB{n5
zLUx9^80ffAtJ_FMjVvoldr*(C$dcWCF@)~O#dJ++pF&;F-2KK}w!d~O!#6cBjnT@C
z{F#O3;3tbgxp_C32|P^m?Rz%ey)E^5+nfs($9i`@xqa{X8f9${F@foXqGJg;7y2tF
zKF!hbS;JuQyv0)!Iqd}2A*Jx1{F5ZW9w93DoO-5NBaqZbB7E4rLT#&Xg?L9H*wY?a
zeqx{8l_fH=Nz*G66Svwk^E!8I0mr5AP!%)IyZY5~M9%vw)_lAi_~Siov-ENG1)0k6
zH{ZM%Ka1ItZrGUGA1ji+N%2>q@gkAYSB;%a@10D$bkfB?r{H2dPSQRwSPC?iQc~Ks
zOClzK5I7!!Er!D7y&W8`kBQ!1^7jcF#zaXW)f409!0R7|_r%7HvP(+{r?h@SG^-#c
z@YAx@K<$yN_dAsogYY8*WIUu0DB~axgxY1^7h#~}WfG(A-a7>l5OFF}8SBTiOyWOd
zXTpcK17IHkTvTF-6U92DH0Y$(d<O>aRkK1p0mBHGjb9#ck>foNpp4=`70guP96|65
zS`9h&9}4SZVo>G6O_;M!lKtG^MU`B^K!L}EMS_B1NT<#Dl|KFZ#dCco3ItPeVd>y@
z8BI;VOasP&Bsl^lZt>VyYxQKbbeE`^mYk|SA<S2?hoF^mwjztS4rd*}DA=tbgLdqZ
zKOBe>Od|Bnj2Jcad-l^WFM8YizfrVphQ<(R_K4X%rjC|JZ}O4^LfIL5(4LA%)u`p+
z7U@fp$O?+-g!++~I49`P#U8>rM<ulYP{(Pw%i+nER>fbZs}WyP_XXt@K~$^Gr$i;c
zqly-ZzhHLcqs73!e9*ghbQYY8)<N7rfdN;%<f7iIIUaL{o71i>#3;%1pq}x<y(>Oq
zVM+8oOPs`e2_zz_nBEXXkwb`FNe@&r5Uqm;aV5Ia5lvr>*zbd#o$u+32Ekx_g5H~J
zWusnE`aasa#{t2p-V7yx@A9hk14MMjRl&nbRtHkFPGLne?Skd3rtXr#^sm4`7b@|x
zc+bb(?=`uCN%eyzson=<X3O$woWmdScR>P)_X$ke8DE2r0jp{hU)NWR_>3DEK^6GL
zxcYt7)%dhAii$EAbO&OwVpRILzBnBO*zRJS!5=E)2Ph5Zs<>svMgW8#vV%Cst(M95
zco`OrY@J|>PFb64{cT;Rn#xSWQaHGs+nYbOb5ska?PVxYP?6K8IilIxt}52iT>?r6
zt(MrrX749P0vEn6m0RzXYuzi;wZ<&ZFfA<)sR(l9ZHlw4t<%l$9WyVKAz?1f_ouRa
z$4{DSWKmLI_as9e!!vi>oQ9tazBt5|Pau8b^^jJazwN$L&Fn)~-!2R@Yptib2t_(G
ztW=0p7M1lJ_Bv^)8OZ#r0a6J2PRRp_ym7{Ues*RkG53X#is9JdQEa!UgC3?TmX~|B
zj888+=_ptuCYlxYn%dF`Zr7wJDwWQ6Pd*jJSNAHFT7&s2%)q*{7|qjTrWVysdBND6
z^h$E~S4HDKnA#^l!Ss}A%I}av&Xy_d8d=JV&h{9VWO-5FVY5@ox*LGxD<2K#&-0WK
z4Z9uj2COnvbjFLFv}8YXe~6YPtn5A^f~|wjL)3c_z=xIcyJ!r-;ThS=&xOcVlBR;o
z^fvFqNhyKg2MN_#sCKEWp26K)c%i$7OT+bLmbGF0yR3R~stD$o<v1-I`yBr3cFoC6
z>`M~rFf9sjSN^N8coPKqHd-%VR|&A%XoW{Us;&xsG2#~q>fXFXhEdrdZsJ&-`M5wT
zY=8B~j31N4u_JpQ^MVkVRk3MQDap5@emsv>fII+}sxD@f;mhb2lXAen;m*7%VwA~N
zJJ$;B@I&{Fh;^41_!js`jpw~&;kM(4ZCnAK?o+i@8|-JHCt3#rH$!cqS|jZIa6Sn&
zj!vVe_5|PO%@8NJ7mxIQEUK_vc)=UwV~s4Aga-4e^*S83{ot6~^nvpb5f=HIIr%va
zc{^3f$IbI^=H!2!nEqdwlV3Bt|98B}&qe3|g}8waHt!9F;UXXyp>lij@^Dh|aKccL
z+mnaqr<?%{Tks8rZR2wLKd=DzPuv6qlQ;<6A}RlfA7E{Oe*Jwbbhuq4ek*t2gH8S3
zA}F{0@bbd26Fv|Xn3o53{_5*zA26JSPe6d`cG8+3HU>WMZ5bHia@#+w4PKZ8f*)r3
zzMaYDza5tVEH3b?{_XD{$jX1N>c2x)_&E9gKeF;)AuAvlGjfZp+~FQ~%*h>W!h468
z{DMvXg`V88D!-VMJAmaD41u+BN3Q$}S@{ihxr1BoXqY=4#GNDJ&I7^yi<G$?6Rfd2
zWaW-PxpPbW%UAJlW5n&?e+}?XTXDy+@cu@w++i^P@>SefD1Kc7&#x=G(_P#lGI#I}
zNc`tm?ra%%Obz%~1)g8s^86xY?kJi&%!m8`nw0rZCCa~L5dVeC;#Vg6Zz-9dM*QDo
z7TiC5_y0yaezG3Fqa9Rye^4?vH|_tAMEehF26JWnbl?2k6%3t2-rU5I3jHbiubc+!
zT*B5FW~|}*xjWfUxeIIuFG*<_A@l+QGrl~whHcO#e)lW!pWYaV15B}Uo9EDPohZK!
z81x2)A$6uy|HN)L0k}P=Z2pOH@xnN`-(uW+{4hZ8_ZaLIg=M-wV<1@W`6~v?Y`@32
z`FLU3^!FGSKQ9mLns1MP#sb58{wl}Gd7B}BFUJYu0{uB=7>LFPdsx4<2dfLqw131v
zT;M;)2eZZ8KIq@;3h@42PCx*b-G48~1smU=F-`&AzsAP}%NKv$11^}a=Fj$EAD6#k
z{IDzevmF1eM(B^R@bSXFD1XHG|9+NS0$hLf%guWWTK-;_o0p&e&-=p7$M@IsfYpUT
zm;cmtaxj8zIN|Vfdp>n@_ZP5t5+*14sc*WYp44n@VQ!P3mXe=ZBO?c=pRX@Bp8)JL
Mj7~=<r7Vs9Kc?U(iU0rr

literal 0
HcmV?d00001

diff --git a/examples/file_comparison/outputs/file_comparison_validation_criteria_differences (2).tsv b/examples/file_comparison/outputs/file_comparison_validation_criteria_differences (2).tsv
new file mode 100644
index 0000000..8dd9d37
--- /dev/null
+++ b/examples/file_comparison/outputs/file_comparison_validation_criteria_differences (2).tsv	
@@ -0,0 +1,7 @@
+Column	assembly_length	assembly_length	gambit_predicted_taxon	gambit_predicted_taxon	amrfinderplus_amr_core_genes	amrfinderplus_amr_core_genes	file_column	file_column	sort_file_column	sort_file_column
+Table	table1_with_files.tsv	table2_with_files.tsv	table1_with_files.tsv	table2_with_files.tsv	table1_with_files.tsv	table2_with_files.tsv	table1_with_files.tsv	table2_with_files.tsv	table1_with_files.tsv	table2_with_files.tsv
+sample01										
+sample02							gs://path/to/table1_files/mismatch1-1.txt	gs://path/to/table2_files/mismatch1-1.txt	gs://path/to/table1_files/mismatch1-1.txt	gs://path/to/table2_files/mismatch1-1.txt
+sample03	4719410.0	5287603.0	Shigella	Shigella sonnei		glpT_E448K,gyrA_D87G,gyrA_S83L,sat2	gs://path/to/table1_files/mismatch2-1.txt	gs://path/to/table2_files/mismatch2-1.txt		
+sample04							gs://path/to/table1_files/mismatch2-1.txt		gs://path/to/table1_files/mismatch1-1.txt	
+sample05										
diff --git a/examples/file_comparison/outputs/filtered_file_comparison_table1.tsv b/examples/file_comparison/outputs/filtered_file_comparison_table1.tsv
new file mode 100644
index 0000000..ad16c0e
--- /dev/null
+++ b/examples/file_comparison/outputs/filtered_file_comparison_table1.tsv
@@ -0,0 +1,6 @@
+samples	amrfinderplus_amr_core_genes	assembly_length	extra_column	file_column	gambit_predicted_taxon	sort_file_column
+sample01	tet(A),aph(6)-Id,aph(3'')-Ib	4783605	extra_value	gs://path/to/table1_files/match1-1.txt	Salmonella enterica	gs://path/to/table1_files/match1-1.txt
+sample02	glpT_E448K,gyrA_D87G,gyrA_S83L,sat2,dfrA1,parC_S80I,blaCTX-M-27	5226301		gs://path/to/table1_files/mismatch1-1.txt	Shigella sonnei	gs://path/to/table1_files/mismatch1-1.txt
+sample03		4719410	extra_value	gs://path/to/table1_files/mismatch2-1.txt	Shigella	gs://path/to/table1_files/sortmatch1-1.txt
+sample04	sul1,aadA7,parC_S87L,gyrA_T83I	6674526		gs://path/to/table1_files/mismatch2-1.txt	Pseudomonas aeruginosa	gs://path/to/table1_files/mismatch1-1.txt
+sample05	parC_S80Y,tet(38),mecR1,murA_G257D,fosB,gyrA_S84L,mecA	2773544			Staphylococcus aureus	
diff --git a/examples/file_comparison/outputs/filtered_file_comparison_table2.tsv b/examples/file_comparison/outputs/filtered_file_comparison_table2.tsv
new file mode 100644
index 0000000..dfc036f
--- /dev/null
+++ b/examples/file_comparison/outputs/filtered_file_comparison_table2.tsv
@@ -0,0 +1,6 @@
+samples	amrfinderplus_amr_core_genes	assembly_length	extra_column	file_column	gambit_predicted_taxon	sort_file_column
+sample01	aph(3'')-Ib,aph(6)-Id,tet(A)	4783610	extra_value	gs://path/to/table2_files/match1-1.txt	Salmonella enterica	gs://path/to/table2_files/match1-1.txt
+sample02	glpT_E448K,gyrA_D87G,gyrA_S83L,parC_S80I,blaCTX-M-27,sat2,dfrA1	5274928		gs://path/to/table2_files/mismatch1-1.txt	Shigella sonnei	gs://path/to/table2_files/mismatch1-1.txt
+sample03	glpT_E448K,gyrA_D87G,gyrA_S83L,sat2	5287603		gs://path/to/table2_files/mismatch2-1.txt	Shigella sonnei	gs://path/to/table2_files/sortmatch1-1.txt
+sample04	parC_S87L,gyrA_T83I,sul1,aadA7	6674503	extra_value		Pseudomonas aeruginosa	
+sample05	parC_S80Y,tet(38),fosB,gyrA_S84L,mecA,mecR1,murA_G257D	2771914			Staphylococcus aureus	

From 81b77f1ccd94c04f9692265ed5e4f10c1d3c9e10 Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Tue, 12 Dec 2023 23:05:56 -0700
Subject: [PATCH 42/43] Update README with file comparison info

---
 README.md | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 2a09ae8..30d5bc7 100644
--- a/README.md
+++ b/README.md
@@ -72,16 +72,15 @@ column2         SET
 column3         0.01
 ```
 
-Currently implmented validation criteria include:
+Currently implemented validation criteria include:
 
 | validation_criteria | explanation |
 | --- | --- |
-| EXACT | the values in the two columns must be exactly the same; in this case `[foo,bar] != [bar,foo]` |
-| SET | the values in the two columns must be the same set of values; in this case `[foo,bar] == [bar,foo]` |
-| \<FLOAT\> | the values in the two columns must be within `<FLOAT>*100` of each other; e.g., 0.3 -> 30% difference allowed |
-| IGNORE | the values in the two columns are assumed to match; in this case `foo == bar` |
+| EXACT | The values in the two columns must be exactly the same; in this case `[foo,bar] != [bar,foo]`. When applied to columns referencing files, file contents will be compared to check if they are identical. |
+| SET | The values in the two columns must be the same set of values; in this case `[foo,bar] == [bar,foo]`. When applied to columns referencing files, the lines within the files will be sorted alphabetically before comparing. |
+| \<FLOAT\> | The values in the two columns must be within `<FLOAT>*100` percent of each other; e.g., 0.3 -> 30% difference allowed. |
+| IGNORE | The values in the two columns are assumed to match; in this case `foo == bar`. |
 
-Future comparisons to include `FILE-EXACT`, `FILE-SET`, `FILE-<FLOAT>`.
 
 #### Optional: `column_translation`
 
@@ -149,3 +148,6 @@ This file (available as an HTML and PDF) is a summary of the differences between
   - the number of samples failing the validation criteria
 
 If a `validation_criteria.tsv` file was provided, a definition of the (currently implemented) validation criteria are provided at the bottom of the table
+
+#### `<sample>_<column>_diff.txt`
+Shows the differing lines within mismatching files for a given sample and column. Each pair of mismatching files generates a separate file.
\ No newline at end of file

From 69afb0bccec902063356e780daa16515f6ac8875 Mon Sep 17 00:00:00 2001
From: sam-baird <sam.baird@state.co.us>
Date: Sat, 27 Jan 2024 00:06:33 +0000
Subject: [PATCH 43/43] Use shallow=False for filecmp.cmp()

---
 theiavalidate/Validator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/theiavalidate/Validator.py b/theiavalidate/Validator.py
index 8fdd3a7..5c66144 100644
--- a/theiavalidate/Validator.py
+++ b/theiavalidate/Validator.py
@@ -260,7 +260,7 @@ def compare_files(self, file_df1, file_df2):
         elif (not pd.isnull(uri1) and not pd.isnull(uri2)):
           file1 = os.path.join(self.table1_files_dir, uri1.removeprefix("gs://"))
           file2 = os.path.join(self.table2_files_dir, uri2.removeprefix("gs://"))
-          is_match = filecmp.cmp(file1, file2)
+          is_match = filecmp.cmp(file1, file2, shallow=False)
           self.file_exact_matches.loc[row, col] = is_match
           if is_match:
             # don't add URIs to exact differences table if files match