From 1eb6e332844dbf849d4bc2c1d2a6e03d117f5593 Mon Sep 17 00:00:00 2001
From: kishorek
Date: Mon, 22 Jun 2020 10:13:05 +0530
Subject: [PATCH] initial import

---
 LICENSE      |  21 ++++
 README.md    |  46 ++++++++
 cli.py       | 117 +++++++++++++++++++++
 comparesv.py | 292 +++++++++++++++++++++++++++++++++++++++++++++++++++
 setup.cfg    |   2 +
 setup.py     |  54 ++++++++++
 tests.py     | 100 ++++++++++++++++++
 version.py   |   1 +
 8 files changed, 633 insertions(+)
 create mode 100644 LICENSE
 create mode 100644 README.md
 create mode 100644 cli.py
 create mode 100644 comparesv.py
 create mode 100644 setup.cfg
 create mode 100644 setup.py
 create mode 100644 tests.py
 create mode 100644 version.py

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..a4ca423
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020 Kishore Kumar
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c83f261
--- /dev/null
+++ b/README.md
@@ -0,0 +1,46 @@
+# comparesv
+### CSV Comparison on steroids
+
+## Usage
+
+```console
+comparesv [-h] [-v] [--enc1 ENCODING] [--enc2 ENCODING] [-i]
+          [-rm ROW_MATCH] [-cm COLUMN_MATCH] [-sm STRING_MATCH] [-ir]
+          [-ic] [-is] [-s]
+          [FILE1] [FILE2]
+
+CSV files comparison
+
+positional arguments:
+  FILE1                 the first CSV file
+  FILE2                 the second CSV file
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -v, --version         show program's version number and exit
+  --enc1 ENCODING       encoding of the first file (default is to autodetect)
+  --enc2 ENCODING       encoding of the second file (default is to autodetect)
+  -i, --ignore-case     ignore case (default is case-sensitive)
+  -rm ROW_MATCH, --row-match ROW_MATCH
+                        Logic to be used to identify the rows. Possible
+                        options 'order', 'fuzzy', 'deep' (default is order)
+  -cm COLUMN_MATCH, --column-match COLUMN_MATCH
+                        Logic to be used to identify the columns. Possible
+                        options 'exact', 'fuzzy' (default is exact)
+  -sm STRING_MATCH, --string-match STRING_MATCH
+                        Logic to be used to compare string values. Possible
+                        options 'exact', 'fuzzy' (default is exact)
+  -ir, --include-addnl-rows
+                        Include additional rows from the second file
+                        (default is false)
+  -ic, --include-addnl-columns
+                        Include additional columns from the second file
+                        (default is false)
+  -is, --include-stats  Include stats (default is true)
+  -s, --save-output     Save output to file. This saves the output in the
+                        current directory (default is true)
+```
+
+## Description
+
+comparesv compares two CSV files cell by cell and reports, for every column, whether the values match. Rows and columns can be matched by position, exactly by header, or fuzzily, and string values can optionally be compared with fuzzy matching.
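Not part of the patch: the comparison the CLI drives is also usable as a plain library call, which is how tests.py exercises it. A minimal sketch, assuming comparesv.py is importable from the working directory:

```python
import comparesv

headers1 = ["id", "age"]
headers2 = ["id", "age"]
data1 = [["A1", 23], ["A2", 24]]
data2 = [["A1", 23], ["A2", 25]]

# run() returns a dict with 'results' (booleans), 'values' ("[left]:[right]" strings),
# 'added', 'deleted', 'headers' and, by default, per-column 'stats'.
output = comparesv.run(data1, headers1, data2, headers2)
print(output["results"])  # [[True, True], [True, False]]
print(output["stats"])    # {'id': '100.00', 'age': '50.00'}
```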
diff --git a/cli.py b/cli.py
new file mode 100644
index 0000000..7cd7906
--- /dev/null
+++ b/cli.py
@@ -0,0 +1,117 @@
+import sys
+import os
+import io
+import csv
+import logging
+import warnings
+import argparse
+import chardet
+import tqdm
+import comparesv
+from version import __version__
+
+from pprint import pprint
+
+# This file is derived from https://github.com/maxharlow/csvmatch/blob/master/cli.py
+
+def main():
+    logging.captureWarnings(True)
+    logging.basicConfig(level=logging.WARN, format='Warning: %(message)s')
+    warnings.formatwarning = lambda e, *args: str(e)
+    sys.stderr.write('Starting up...\n')
+    try:
+        file1, file2, args = arguments()
+        print(args)
+        data1, headers1 = read(*file1)
+        data2, headers2 = read(*file2)
+        results = comparesv.run(data1, headers1, data2, headers2, ticker=ticker, **args)
+        # formatted = format(results['values'], results['headers'])
+        if args.get("save_output"):
+            save_file("values.csv", results['headers'], results['values'])
+            save_file("results.csv", results['headers'], results['results'])
+        pprint(results['stats'])
+        sys.stdout.flush()
+    except BaseException as e:
+        sys.exit(e)
+
+
+def ticker(text, total):
+    progress = tqdm.tqdm(bar_format=text + ' |{bar}| {percentage:3.0f}% / {remaining} left', total=total)
+    return progress.update
+
+
+def read(filename, encoding):
+    if not os.path.isfile(filename) and filename != '-':
+        raise Exception(filename + ': no such file')
+    file = sys.stdin if filename == '-' else io.open(filename, 'rb')
+    text = file.read()
+    if text == '':
+        raise Exception(filename + ': file is empty')
+    if not encoding:
+        detector = chardet.universaldetector.UniversalDetector()
+        text_lines = text.split(b'\n')
+        for i in range(0, len(text_lines)):
+            detector.feed(text_lines[i])
+            if detector.done:
+                break
+        detector.close()
+        encoding = detector.result['encoding']  # can't always be relied upon
+        sys.stderr.write(filename + ': autodetected character encoding as ' + encoding.upper() + '\n')
+    try:
+        text_decoded = text.decode(encoding)
+        reader = csv.reader(io.StringIO(text_decoded, newline=None))
+        headers = next(reader)
+        return list(reader), headers
+    except UnicodeDecodeError as e:
+        raise Exception(filename + ': could not read file -- try specifying the encoding')
+    except csv.Error as e:
+        raise Exception(filename + ': could not read file as a CSV')
+
+
+def arguments():
+    parser = argparse.ArgumentParser(description='CSV files comparison')
+    parser.add_argument('-v', '--version', action='version', version=__version__)
+    parser.add_argument('FILE1', nargs='?', default='-', help='the first CSV file')
+    parser.add_argument('FILE2', nargs='?', default='-', help='the second CSV file')
+    parser.add_argument('--enc1', type=str, metavar='ENCODING', help='encoding of the first file (default is to autodetect)')
+    parser.add_argument('--enc2', type=str, metavar='ENCODING', help='encoding of the second file (default is to autodetect)')
+    parser.add_argument('-i', '--ignore-case', action='store_true', help='ignore case (default is case-sensitive)')
+    parser.add_argument('-rm', '--row-match', default='order', help='Logic to be used to identify the rows. Possible options \'order\', \'fuzzy\', \'deep\' (default is order)')
+    parser.add_argument('-cm', '--column-match', default='exact', help='Logic to be used to identify the columns. Possible options \'exact\', \'fuzzy\' (default is exact)')
+    parser.add_argument('-sm', '--string-match', default='exact', help='Logic to be used to compare string values. Possible options \'exact\', \'fuzzy\' (default is exact)')
+    parser.add_argument('-ir', '--include-addnl-rows', action='store_true', help='Include additional rows from second file (default is false)')
+    parser.add_argument('-ic', '--include-addnl-columns', action='store_true', help='Include additional columns from second file (default is false)')
+    parser.add_argument('-is', '--include-stats', default=True, action='store_true', help='Include stats (default is true)')
+    parser.add_argument('-s', '--save-output', default=True, action='store_true', help='Save output to file. This saves the output in the current directory (default is true)')
+
+    args = vars(parser.parse_args())
+    if args['FILE1'] == '-' and args['FILE2'] == '-':
+        parser.print_help(sys.stderr)
+        parser.exit(1)
+    file1 = args.pop('FILE1')
+    file2 = args.pop('FILE2')
+    enc1 = args.pop('enc1')
+    enc2 = args.pop('enc2')
+    return (file1, enc1), (file2, enc2), args
+
+
+def save_file(file_name, keys, results):
+    updated_keys = ['S.No'] + keys
+    updated_results = [[idx + 1] + result for idx, result in enumerate(results)]
+
+    curr_dir = os.getcwd()
+    with open(os.path.join(curr_dir, file_name), 'w') as file:
+        writer = csv.writer(file, lineterminator='\n')  # can't use DictWriter as headers are printed even when there are no results
+        writer.writerow(updated_keys)
+        writer.writerows(updated_results)
+
+
+def format(results, keys):
+    writer_io = io.StringIO()
+    writer = csv.writer(writer_io, lineterminator='\n')  # can't use DictWriter as headers are printed even when there are no results
+    writer.writerow(keys)
+    writer.writerows(results)
+    return writer_io.getvalue()[:-1]
+
+
+if __name__ == '__main__':
+    main()
+
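Not part of the patch: cli.read() sniffs the character encoding with chardet when --enc1/--enc2 are omitted, feeding the detector line by line until it is confident. A standalone sketch of that detection loop (sniff_encoding is an illustrative name, not a function in this codebase):

```python
from chardet.universaldetector import UniversalDetector

def sniff_encoding(raw_bytes):
    # Feed the detector one line at a time and stop as soon as it is confident,
    # mirroring the loop in cli.read().
    detector = UniversalDetector()
    for line in raw_bytes.split(b'\n'):
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    return detector.result['encoding']  # may be None or wrong for short or ambiguous input

print(sniff_encoding('id,äge\nA1,23\n'.encode('utf-8')))  # e.g. 'utf-8'
```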
diff --git a/comparesv.py b/comparesv.py
new file mode 100644
index 0000000..d2199b2
--- /dev/null
+++ b/comparesv.py
@@ -0,0 +1,292 @@
+import os
+from fuzzywuzzy import fuzz, process
+from collections import OrderedDict
+import time
+
+ROW_THRESHOLD = 80
+CELL_THRESHOLD = 80
+
+__version__ = '0.01'
+
+def run(data1,
+        headers1,
+        data2,
+        headers2,
+        ignore_case=False,
+        row_match='order',
+        column_match='exact',
+        string_match='exact',
+        include_addnl_rows=False,
+        include_addnl_columns=False,
+        include_stats=True,
+        save_output=True,
+        ticker=None):
+    headers1 = cleanup(headers1)
+    headers2 = cleanup(headers2)
+    matched_headers = prepare_headers(data1, headers1, headers2, column_match)
+    comparison_output, added_rows, deleted_rows = compare_data(data1, data2, headers1, headers2, matched_headers,
+                                                               row_match=row_match, string_match=string_match,
+                                                               include_addnl_rows=include_addnl_rows,
+                                                               include_addnl_columns=include_addnl_columns,
+                                                               ignore_case=ignore_case)
+
+    rows_result_list, rows_values_list = populate_output(comparison_output)
+    final_headers = headers1
+    if include_addnl_columns:
+        updated_headers = populate_headers(matched_headers, headers2)
+        final_headers = final_headers + updated_headers
+
+    final_result = {}
+    final_result["results"] = rows_result_list
+    final_result["values"] = rows_values_list
+    final_result["added"] = added_rows
+    final_result["deleted"] = deleted_rows
+    final_result["headers"] = final_headers
+
+    if include_stats:
+        stats = populate_stats(final_headers, rows_result_list)
+        final_result["stats"] = stats
+
+    return final_result
+
+
+def populate_stats(headers, results_list):
+    stat = {}
+    for index, header in enumerate(headers):
+        header_data = [result[index] for result in results_list]
+        total_records = len(header_data)
+        matched_records = header_data.count(True)
+        match_percentage = 100 * matched_records / total_records
+        stat[header] = "{:.2f}".format(match_percentage)
+    return stat
+
+
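+# Each cell produced by compare_rows() is a pair of a "[left]:[right]" display
+# string and a boolean match flag; populate_output() splits those pairs into
+# the parallel 'values' and 'results' lists returned by run().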
+def populate_output(rows_match):
+    rows_result_list = []
+    rows_values_list = []
+    for match in rows_match:
+        row_result = []
+        value_result = []
+        for item in match:
+            value_result.append(item[0])
+            row_result.append(item[1])
+        rows_result_list.append(row_result)
+        rows_values_list.append(value_result)
+
+    return rows_result_list, rows_values_list
+
+
+def populate_headers(header_index, headers2):
+    mapped_headers2 = [value['matched_header'] for item, value in header_index.items() if value['index'] > -1]
+    addnl_headers2 = [header for header in headers2 if header not in mapped_headers2]
+    return addnl_headers2
+
+
+def compare_data(data1, data2, headers1, headers2, matched_headers, **kwargs):
+    added_rows = []
+    common_rows = []
+    deleted_rows = []
+
+    rows_output = []
+    row_match = kwargs['row_match']
+    data2_compared_indices = []
+
+    # Comparing the data1 rows with available rows in data2
+    for index, row1 in enumerate(data1):
+        row2 = None
+        data2_indices = list(range(len(data2)))
+        data2_indices_left = [item for item in data2_indices if item not in data2_compared_indices]
+        if row_match == 'order' and index < len(data2):
+            row2 = data2[index]
+            data2_compared_indices.append(index)
+        elif row_match == 'fuzzy':
+            row2, row2_index = fuzzy_row_find(row1, data2, headers1, matched_headers, data2_indices_left)
+            data2_compared_indices.append(row2_index)
+        elif row_match == 'deep':
+            row2, row2_index = deep_row_find(row1, data2, headers1, headers2, matched_headers, data2_indices_left, kwargs)
+            data2_compared_indices.append(row2_index)
+
+        row_compare_result, mode = compare_rows(row1, row2, matched_headers, headers2, kwargs)
+        if mode == 'added':
+            added_rows.append(row2)
+        elif mode == 'deleted':
+            deleted_rows.append(row1)
+        # else:
+        #     common_rows.append(row1)
+        rows_output.append(row_compare_result)
+
+    if kwargs.get('include_addnl_rows'):
+        # Calculate and process the remaining records left in data2
+        data2_indices = list(range(len(data2)))
+        data2_indices_left = [item for item in data2_indices if item not in data2_compared_indices]
+        for index in data2_indices_left:
+            row1 = None
+            row2 = data2[index]
+            row_compare_result, mode = compare_rows(row1, row2, matched_headers, headers2, kwargs)
+            added_rows.append(row2)
+            rows_output.append(row_compare_result)
+    return rows_output, added_rows, deleted_rows
+
+
+def cleanup(headers):
+    return [header.strip() for header in headers]
+
+
+def exist_in_list(option, option_list):
+    cleaned_list = [str(o).lower().strip() for o in option_list]
+    exists = str(option).lower().strip() in cleaned_list
+    index = -1
+    if exists:
+        index = cleaned_list.index(str(option).lower().strip())
+    return exists, index
+
+
+def prepare_headers(data1, headers1, headers2, column_match):
+    mapped_headers_index = OrderedDict()
+    updated_indices2 = []
+    for position, header in enumerate(headers1):
+        index = -1
+        if column_match == 'exact':
+            exists, index = exist_in_list(header, headers2)
+        elif column_match == 'fuzzy':
+            indices2_left = [x for x in range(len(headers2)) if x not in updated_indices2]
+            index = fuzzy_column_index(header, headers2)
+            updated_indices2.append(index)
+
+        column_data = {}
+        column_data['index'] = index
+        if index != -1:
+            column_data['matched_header'] = headers2[index]
+            # predict the type from the data1 column this header belongs to
+            column_data['type'] = predict_column_type([val[position] for val in data1])
+        mapped_headers_index[header] = column_data
+
+    return mapped_headers_index
+
+
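+# Fuzzy header matching: exact (case- and whitespace-insensitive) hits win first;
+# otherwise the closest fuzzywuzzy candidate is taken, but only if it scores
+# above ROW_THRESHOLD. Anything weaker is treated as an unmatched column (-1).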
+def fuzzy_column_index(header, headers_list):
+    exist, index = exist_in_list(header, headers_list)
+    if exist:
+        return index
+
+    highest = process.extractOne(header, headers_list)
+    if highest[1] < ROW_THRESHOLD:
+        return -1
+    return headers_list.index(highest[0])
+
+
+def deep_row_find(row, data2, headers1, headers2, matched_headers, data2_indices_left, opts):
+    """
+    1. Take a row from data1
+    2. Compare against all the rows in data2 by column-wise data
+    3. Get the best matched one
+    """
+    count = 0
+    selected_index = -1
+    selected_row = None
+    for index in data2_indices_left:
+        row2 = data2[index]
+        row_comparison = compare_rows(row, row2, matched_headers, headers2, opts)
+        results = [x[1] for x in row_comparison[0]]
+        if results.count(True) > count:
+            count = results.count(True)
+            selected_index = index
+            selected_row = row2
+
+    return selected_row, selected_index
+
+
+def fuzzy_row_find(row, data2, headers1, matched_headers, data2_indices_left):
+    row1 = ' '.join(str(x) for x in row)
+    rows_list2 = [' '.join(str(x) for x in elem) for index, elem in enumerate(data2) if index in data2_indices_left]
+    highest = process.extractOne(row1, rows_list2)
+
+    if highest[1] < ROW_THRESHOLD:
+        return None, None
+
+    # map the position in the filtered candidate list back to the index in data2
+    index = data2_indices_left[rows_list2.index(highest[0])]
+    return data2[index], index
+
+
+def compare_rows(row1, row2, header_index, headers2, opts):
+    mode = "existing"
+    if not row1:
+        mode = "added"
+    if not row2:
+        mode = "deleted"
+
+    row_result = []
+    for index, column in enumerate(header_index.keys()):
+        result = None
+        column_info = header_index[column]
+        cell1 = row1[index] if row1 else ""
+        cell2 = row2[column_info['index']] if row2 and column_info['index'] > -1 else ""
+
+        result = compare_cells(cell1, cell2, fetch_compare_mode(column_info.get('type'), opts['string_match']), opts['ignore_case'])
+        output = [f"[{cell1}]:[{cell2}]", result]
+        row_result.append(output)
+
+    if opts.get('include_addnl_columns'):
+        mapped_headers2 = [value['matched_header'] for item, value in header_index.items() if value['index'] > -1]
+        addnl_headers2_indices = [headers2.index(header) for header in headers2 if header not in mapped_headers2]
+        for index in addnl_headers2_indices:
+            cell1 = ""
+            cell2 = row2[index]
+
+            result = compare_cells(cell1, cell2, "str", opts['ignore_case'])
+            output = [f"[{cell1}]:[{cell2}]", result]
+            row_result.append(output)
+
+    return row_result, mode
+
+
+def fetch_compare_mode(data_type, string_match):
+    if data_type == 'str' and string_match == 'fuzzy':
+        return "fuzzy_string"
+    else:
+        return data_type
+
+
+def compare_cells(cell1, cell2, comparison_type, ignore_case):
+    if not cell1 or not cell2:
+        return False
+
+    try:
+        if comparison_type == 'fuzzy_string':
+            return fuzz.token_set_ratio(cell1, cell2) > CELL_THRESHOLD
+        elif comparison_type == 'int':
+            return int(cell1) == int(cell2)
+        elif comparison_type == 'float':
+            return float(cell1) == float(cell2)
+        else:
+            if ignore_case:
+                cell1 = cell1.lower()
+                cell2 = cell2.lower()
+
+            return str(cell1).strip() == str(cell2).strip()
+    except Exception:
+        if ignore_case:
+            cell1 = cell1.lower()
+            cell2 = cell2.lower()
+        return str(cell1).strip() == str(cell2).strip()
+
+
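+# The predicted column type feeds fetch_compare_mode()/compare_cells(): 'int'
+# and 'float' columns are compared numerically, everything else as strings
+# (optionally fuzzily when string_match is 'fuzzy').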
+def predict_column_type(data):
+    """
+    Predict the data type of the elements present in a list. It will default to string.
+
+    Args:
+        data : array
+
+    Returns:
+        type: Column data type
+    """
+    data_types = [type(item) for item in data]
+    data_types = list(set(data_types))
+    if len(data_types) == 1:
+        return data_types[0].__name__
+    elif str in data_types:
+        return "str"
+    elif float in data_types:
+        return "float"
+    elif int in data_types:
+        return "int"
+    else:
+        return "str"
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..3c6e79c
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,2 @@
+[bdist_wheel]
+universal=1
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..c99cf86
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,54 @@
+try:
+    from setuptools import setup
+except ImportError:
+    from distutils.core import setup
+
+from version import __version__
+import os
+
+def open_file(fname):
+    return open(os.path.join(os.path.dirname(__file__), fname))
+
+setup(
+    name='comparesv',
+    py_modules=['cli', 'comparesv', 'version'],
+    version=__version__,
+    license='MIT',
+    description='CSV Comparison on steroids',
+    long_description=open_file('README.md').read(),
+    long_description_content_type='text/markdown',
+    author='Kishore Kumar',
+    author_email='ukisho@gmail.com',
+    url='https://github.com/kishorek',
+    download_url='https://github.com/user/reponame/archive/v_01.tar.gz',
+    keywords=['CSV', 'Comparison', 'Compare'],
+    install_requires=[
+        'chardet==3.0.4',
+        'tqdm==4.18.0',
+        'unidecode==1.1.1',
+        'doublemetaphone==0.1',
+        'fuzzywuzzy==0.18.0'
+    ],
+    setup_requires=[
+        'chardet==3.0.4',
+        'tqdm==4.18.0',
+        'unidecode==1.1.1',
+        'doublemetaphone==0.1',
+        'fuzzywuzzy==0.18.0'
+    ],
+    entry_points={
+        'console_scripts': [
+            'comparesv = cli:main'
+        ]
+    },
+    classifiers=[
+        'Development Status :: 3 - Alpha',  # Choose either "3 - Alpha", "4 - Beta" or "5 - Production/Stable" as the current state of your package
+        'Intended Audience :: Developers',
+        'Topic :: Software Development :: Build Tools',
+        'License :: OSI Approved :: MIT License',
+        'Programming Language :: Python :: 3.6',
+        'Natural Language :: English',
+        'Topic :: Scientific/Engineering :: Information Analysis',
+        'Topic :: Utilities'
+    ]
+)
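The tests that follow call comparesv.run() directly on small in-memory tables, pytest-style. As an illustration of the per-column stats the library returns by default, an additional case might look like this (a sketch, not part of the patch):

```python
def test_stats_percentages():
    h1 = ["id", "age"]
    h2 = ["id", "age"]
    d1 = [["A1", 23], ["A2", 24]]
    d2 = [["A1", 23], ["A2", 99]]

    output = comparesv.run(d1, h1, d2, h2)
    # both 'id' cells match, one of the two 'age' cells matches
    assert output["stats"] == {"id": "100.00", "age": "50.00"}
```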
diff --git a/tests.py b/tests.py
new file mode 100644
index 0000000..d12fd47
--- /dev/null
+++ b/tests.py
@@ -0,0 +1,100 @@
+import comparesv
+
+def test_basic():
+    h1 = ["id", "age"]
+    h2 = ["id", "age"]
+    d1 = [["A1", 23], ["A2", 24], ["A3", 34]]
+    d2 = [["A1", 23], ["A2", 24], ["A3", 34]]
+
+    result = [[True, True], [True, True], [True, True]]
+    values = [['[A1]:[A1]', '[23]:[23]'], ['[A2]:[A2]', '[24]:[24]'], ['[A3]:[A3]', '[34]:[34]']]
+    output = comparesv.run(d1, h1, d2, h2)
+    assert result == output['results']
+    assert values == output['values']
+
+def test_column_order():
+    h1 = ["id", "age"]
+    h2 = ["age", "id"]
+    d1 = [["A1", 23], ["A2", 24], ["A3", 34]]
+    d2 = [[23, "A1"], [24, "A2"], [34, "A3"]]
+
+    result = [[True, True], [True, True], [True, True]]
+    values = [['[A1]:[A1]', '[23]:[23]'], ['[A2]:[A2]', '[24]:[24]'], ['[A3]:[A3]', '[34]:[34]']]
+    output = comparesv.run(d1, h1, d2, h2)
+    assert result == output['results']
+    assert values == output['values']
+
+def test_fuzzy_column_order():
+    h1 = ["id", "age"]
+    h2 = ["age of student", "identity"]
+    d1 = [["A1", 23], ["A2", 24], ["A3", 34]]
+    d2 = [[23, "A1"], [24, "A2"], [34, "A3"]]
+
+    result = [[True, True], [True, True], [True, True]]
+    values = [['[A1]:[A1]', '[23]:[23]'], ['[A2]:[A2]', '[24]:[24]'], ['[A3]:[A3]', '[34]:[34]']]
+    output = comparesv.run(d1, h1, d2, h2, column_match='fuzzy')
+    assert result == output['results']
+    assert values == output['values']
+
+def test_row_order_fuzzy():
+    h1 = ["id", "age"]
+    h2 = ["id", "age"]
+    d1 = [["A1", 23], ["A2", 24], ["A3", 34]]
+    d2 = [["A2", 24], ["A1", 23], ["A3", 34]]
+
+    result = [[True, True], [True, True], [True, True]]
+    values = [['[A1]:[A1]', '[23]:[23]'], ['[A2]:[A2]', '[24]:[24]'], ['[A3]:[A3]', '[34]:[34]']]
+    output = comparesv.run(d1, h1, d2, h2, row_match='fuzzy')
+    assert result == output['results']
+    assert values == output['values']
+
+def test_extra_column():
+    h1 = ["id", "age", "name"]
+    h2 = ["id", "age"]
+    d1 = [["A1", 23, "Alpha"], ["A2", 24, "Beta"], ["A3", 34, "Gamma"]]
+    d2 = [["A2", 24], ["A1", 23], ["A3", 34]]
+
+    result = [[True, True, False], [True, True, False], [True, True, False]]
+    values = [['[A1]:[A1]', '[23]:[23]', '[Alpha]:[]'], ['[A2]:[A2]', '[24]:[24]', '[Beta]:[]'], ['[A3]:[A3]', '[34]:[34]', '[Gamma]:[]']]
+    output = comparesv.run(d1, h1, d2, h2, row_match='fuzzy')
+    assert result == output['results']
+    assert values == output['values']
+
+def test_include_extra_rows():
+    h1 = ["id", "age"]
+    h2 = ["id", "age"]
+    d1 = [["A1", 23], ["A2", 24], ["A3", 34]]
+    d2 = [["A1", 23], ["A2", 24], ["A3", 34], ["A4", 34]]
+
+    result = [[True, True], [True, True], [True, True], [False, False]]
+    values = [['[A1]:[A1]', '[23]:[23]'], ['[A2]:[A2]', '[24]:[24]'], ['[A3]:[A3]', '[34]:[34]'], ['[]:[A4]', '[]:[34]']]
+    output = comparesv.run(d1, h1, d2, h2, include_addnl_rows=True)
+    assert result == output['results']
+    assert values == output['values']
+
+def test_include_extra_column():
+    h1 = ["id", "age"]
+    h2 = ["id", "age", "name"]
+    d1 = [["A2", 24], ["A1", 23], ["A3", 34]]
+    d2 = [["A1", 23, "Alpha"], ["A2", 24, "Beta"], ["A3", 34, "Gamma"]]
+
+    output = comparesv.run(d1, h1, d2, h2, include_addnl_columns=True)
+    result = [[False, False, False], [False, False, False], [True, True, False]]
+    values = [['[A2]:[A1]', '[24]:[23]', '[]:[Alpha]'],
+              ['[A1]:[A2]', '[23]:[24]', '[]:[Beta]'],
+              ['[A3]:[A3]', '[34]:[34]', '[]:[Gamma]']]
+
+    assert result == output['results']
+    assert values == output['values']
+
+def test_basic_case():
+    h1 = ["id", "age"]
+    h2 = ["id", "age"]
+    d1 = [["A1", 23], ["A2", 24], ["A3", 34]]
+    d2 = [["a1", 23], ["a2", 24], ["a3", 34]]
+
+    result = [[True, True], [True, True], [True, True]]
+    values = [['[A1]:[a1]', '[23]:[23]'], ['[A2]:[a2]', '[24]:[24]'], ['[A3]:[a3]', '[34]:[34]']]
+    output = comparesv.run(d1, h1, d2, h2, ignore_case=True)
+    assert result == output['results']
+    assert values == output['values']
\ No newline at end of file
diff --git a/version.py b/version.py
new file mode 100644
index 0000000..76a9af4
--- /dev/null
+++ b/version.py
@@ -0,0 +1 @@
+__version__ = '0.01'
\ No newline at end of file
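Not part of the patch: with --save-output enabled, cli.py writes values.csv (the "[left]:[right]" cell pairs) and results.csv (the match flags), each with a leading S.No column, into the current directory. A rough sketch of consuming results.csv afterwards:

```python
import csv

# results.csv: header row followed by one row per compared record; the match
# flags were written by csv.writer, so booleans appear as the strings "True"/"False".
with open("results.csv", newline="") as fh:
    rows = list(csv.reader(fh))

header, body = rows[0], rows[1:]
mismatches = sum(1 for row in body for cell in row[1:] if cell == "False")  # skip the S.No column
print(f"{mismatches} mismatching cells across {len(body)} rows")
```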