initial import

kishorek · Jun 22, 2020 · 1eb6e33 · 1eb6e33
commit 1eb6e33
Show file tree

Hide file tree

Showing 8 changed files with 633 additions and 0 deletions.
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020 Kishore Kumar
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,46 @@
+# comparesc
+### CSV Comparison on steriods 
+
+## Usage
+
+```console
+comparesv [-h] [-v] [--enc1 ENCODING] [--enc2 ENCODING] [-i]
+              [-rm ROW_MATCH] [-cm COLUMN_MATCH] [-sm STRING_MATCH] [-ir]
+              [-ic] [-is] [-s]
+              [FILE1] [FILE2]
+
+CSV files comparison
+
+positional arguments:
+  FILE1                 the first CSV file
+  FILE2                 the second CSV file
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -v, --version         show program's version number and exit
+  --enc1 ENCODING       encoding of the first file (default is to autodetect)
+  --enc2 ENCODING       encoding of the second file (default is to autodetect)
+  -i, --ignore-case     ignore case (default is case-sensitive)
+  -rm ROW_MATCH, --row-match ROW_MATCH
+                        Logic to be used to identify the rows. Possible
+                        options 'order', 'fuzzy', 'deep' (default is order)
+  -cm COLUMN_MATCH, --column-match COLUMN_MATCH
+                        Logic to be used to identify the columns. Possible
+                        options 'exact','fuzzy' (default is exact)
+  -sm STRING_MATCH, --string-match STRING_MATCH
+                        Logic to be used to identify the columns. Possible
+                        options 'exact','fuzzy' (default is exact)
+  -ir, --include-addnl-rows
+                        Include added additional added rows from second file
+                        (default is false)
+  -ic, --include-addnl-columns
+                        Include added additional columns from second file
+                        (default is false)
+  -is, --include-stats  Include stats (default is false)
+  -s, --save-output     Save output to file
+```
+
+## Description
+
+
+
diff --git a/cli.py b/cli.py
@@ -0,0 +1,117 @@
+import sys
+import os
+import io
+import csv
+import logging
+import warnings
+import argparse
+import chardet
+import tqdm
+import comparesv
+from version import __version__
+
+from pprint import pprint
+
+# This file is derived from https://github.com/maxharlow/csvmatch/blob/master/cli.py
+
+def main():
+    logging.captureWarnings(True)
+    logging.basicConfig(level=logging.WARN, format='Warning: %(message)s')
+    warnings.formatwarning = lambda e, *args: str(e)
+    sys.stderr.write('Starting up...\n')
+    try:
+        file1, file2, args = arguments()
+        print(args)
+        data1, headers1 = read(*file1)
+        data2, headers2 = read(*file2)
+        results = comparesv.run(data1, headers1, data2, headers2, ticker=ticker, **args)
+        # formatted = format(results['values'],results['headers'])
+        if args.get("save_output"):
+            save_file("values.csv", results['headers'], results['values'])
+            save_file("results.csv", results['headers'], results['results'])
+        pprint(results['stats'])
+        sys.stdout.flush()
+    except BaseException as e:
+        sys.exit(e)
+
+
+def ticker(text, total):
+    progress = tqdm.tqdm(bar_format=text + ' |{bar}| {percentage:3.0f}% / {remaining} left', total=total)
+    return progress.update
+
+
+def read(filename, encoding):
+    if not os.path.isfile(filename) and filename != '-':
+        raise Exception(filename + ': no such file')
+    file = sys.stdin if filename == '-' else io.open(filename, 'rb')
+    text = file.read()
+    if text == '':
+        raise Exception(filename + ': file is empty')
+    if not encoding:
+        detector = chardet.universaldetector.UniversalDetector()
+        text_lines = text.split(b'\n')
+        for i in range(0, len(text_lines)):
+            detector.feed(text_lines[i])
+            if detector.done:
+                break
+        detector.close()
+        encoding = detector.result['encoding']  # can't always be relied upon
+        sys.stderr.write(filename + ': autodetected character encoding as ' + encoding.upper() + '\n')
+    try:
+        text_decoded = text.decode(encoding)
+        reader = csv.reader(io.StringIO(text_decoded, newline=None))
+        headers = next(reader)
+        return list(reader), headers
+    except UnicodeDecodeError as e:
+        raise Exception(filename + ': could not read file -- try specifying the encoding')
+    except csv.Error as e:
+        raise Exception(filename + ': could not read file as a CSV')
+
+
+def arguments():
+    parser = argparse.ArgumentParser(description='CSV files comparison')
+    parser.add_argument('-v', '--version', action='version', version=__version__)
+    parser.add_argument('FILE1', nargs='?', default='-', help='the first CSV file')
+    parser.add_argument('FILE2', nargs='?', default='-', help='the second CSV file')
+    parser.add_argument('--enc1', type=str, metavar='ENCODING', help='encoding of the first file (default is to autodetect)')
+    parser.add_argument('--enc2', type=str, metavar='ENCODING', help='encoding of the second file (default is to autodetect)')
+    parser.add_argument('-i', '--ignore-case', action='store_true', help='ignore case (default is case-sensitive)')
+    parser.add_argument('-rm', '--row-match', default='order', help='Logic to be used to identify the rows. Possible options \'order\', \'fuzzy\', \'deep\' (default is order)')
+    parser.add_argument('-cm', '--column-match', default='exact', help='Logic to be used to identify the columns. Possible options \'exact\',\'fuzzy\' (default is exact)')
+    parser.add_argument('-sm', '--string-match', default='exact', help='Logic to be used to identify the columns. Possible options \'exact\',\'fuzzy\' (default is exact)')
+    parser.add_argument('-ir', '--include-addnl-rows', action='store_true', help='Include additional rows from second file (default is false)')
+    parser.add_argument('-ic', '--include-addnl-columns', action='store_true', help='Include additional columns from second file (default is false)')
+    parser.add_argument('-is', '--include-stats', default=True, action='store_true', help='Include stats (default is true)')
+    parser.add_argument('-s', '--save-output', default=True, action='store_true', help='Save output to file. This saves the output in the current directory (default is true)')
+
+    args = vars(parser.parse_args())
+    if args['FILE1'] == '-' and args['FILE2'] == '-':
+        parser.print_help(sys.stderr)
+        parser.exit(1)
+    file1 = args.pop('FILE1')
+    file2 = args.pop('FILE2')
+    enc1 = args.pop('enc1')
+    enc2 = args.pop('enc2')
+    return (file1, enc1), (file2, enc2), args
+
+def save_file(file_name, keys, results):
+    updated_keys = ['S.No'] + keys
+    updated_results = [[idx+1]+result for idx,result in enumerate(results)]
+
+    curr_dir = os.getcwd()
+    with open(os.getcwd() + os.path.sep + file_name, 'w') as file:
+        writer = csv.writer(file, lineterminator='\n')  # can't use dictwriter as headers are printed even when there's no results
+        writer.writerow(updated_keys)
+        writer.writerows(updated_results)
+
+def format(results, keys):
+    writer_io = io.StringIO()
+    writer = csv.writer(writer_io, lineterminator='\n')  # can't use dictwriter as headers are printed even when there's no results
+    writer.writerow(keys)
+    writer.writerows(results)
+    return writer_io.getvalue()[:-1]
+
+
+if __name__ == '__main__':
+    main()
+