Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SVCS-531] Improve tabular renderer to handle more TSV cases #308

Open
wants to merge 4 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions mfr/extensions/tabular/libs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ def csv_stdlib():
return csv_stdlib


def tsv_stdlib():
    # Lazily resolve the stdlib-based TSV parser so importing this package
    # stays cheap until a renderer is actually needed.
    from ..libs import stdlib_tools
    return stdlib_tools.tsv_stdlib


def csv_pandas():
    # Lazily resolve the pandas-based CSV parser; defers the heavy pandas
    # import until this renderer is actually selected.
    from ..libs import panda_tools
    return panda_tools.csv_pandas
Expand Down
113 changes: 60 additions & 53 deletions mfr/extensions/tabular/libs/stdlib_tools.py
Original file line number Diff line number Diff line change
@@ -1,57 +1,87 @@
import re
import csv
from http import HTTPStatus

from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError
from mfr.extensions.tabular import utilities
from mfr.extensions.tabular.exceptions import (EmptyTableError,
TabularRendererError)


def csv_stdlib(fp):
    """Read and convert a CSV file to JSON format using the Python standard library.

    :param fp: file pointer object for the CSV file
    :return: dict mapping sheet name to a (columns, rows) tuple
    """
    try:
        # CSVs are always values separated by commas; sniff only for
        # quoting style and for spaces after the commas
        dialect = csv.Sniffer().sniff(fp.read(), ',')
    except csv.Error:
        # sniffing failed (e.g. empty or single-column file); fall back
        # to the default Excel-style comma dialect
        dialect = csv.excel
    fp.seek(0)

    reader = csv.DictReader(fp, dialect=dialect)
    return parse_stdlib(reader, 'csv')

def tsv_stdlib(fp):
    """Read and convert a TSV file to JSON format using the Python standard library.

    :param fp: file pointer object for the TSV file
    :return: dict mapping sheet name to a (columns, rows) tuple
    """
    try:
        # TSVs are always values separated by TABs; sniff only for
        # quoting style and for spaces after the TABs
        dialect = csv.Sniffer().sniff(fp.read(), '\t')
    except csv.Error:
        # sniffing failed (e.g. empty or single-column file); fall back
        # to the default Excel-style tab dialect
        dialect = csv.excel_tab
    fp.seek(0)

    reader = csv.DictReader(fp, dialect=dialect)
    return parse_stdlib(reader, 'tsv')

def parse_stdlib(reader, ext):
    """Read and convert a csv-like file to JSON format using the python standard library.

    :param reader: ``csv.DictReader`` positioned at the start of the data
    :param ext: file extension ('csv' or 'tsv'), used for error reporting
    :return: dict mapping sheet name to a (columns, rows) tuple
    :raises TabularRendererError: if a field is too large to render or the
        file cannot be parsed at all
    :raises EmptyTableError: if the file yields no columns and no rows
    """
    columns = []
    try:
        # De-duplicate the reader's field names before row extraction so
        # repeated headers do not collapse into one column: the second
        # occurrence of 'foo' is renamed 'foo-2', the third 'foo-3', etc.
        # The displayed column name keeps the original header text.
        for idx, fieldname in enumerate(reader.fieldnames or []):
            column_count = sum(1 for column in columns if fieldname == column['name'])
            if column_count:
                unique_fieldname = '{}-{}'.format(fieldname, column_count + 1)
                reader.fieldnames[idx] = unique_fieldname
            else:
                unique_fieldname = fieldname
            columns.append({
                'id': unique_fieldname,
                'field': unique_fieldname,
                'name': fieldname,
                'sortable': True,
            })

        rows = [row for row in reader]
    except csv.Error as e:
        if any("field larger than field limit" in errorMsg for errorMsg in e.args):
            raise TabularRendererError(
                'This file contains a field too large to render. '
                'Please download and view it locally.',
                code=HTTPStatus.BAD_REQUEST,
                extension=ext,
            ) from e
        else:
            raise TabularRendererError(
                'Cannot render file as {}. The file may be empty or corrupt'.format(ext),
                code=HTTPStatus.BAD_REQUEST,
                extension=ext
            ) from e

    # Outside other except because the `if any` line causes more errors to be raised
    # on certain exceptions
    except Exception as e:
        raise TabularRendererError(
            'Cannot render file as {}. The file may be empty or corrupt'.format(ext),
            code=HTTPStatus.BAD_REQUEST,
            extension=ext
        ) from e

    if not columns and not rows:
        raise EmptyTableError(
            'Cannot render file as {}. The file may be empty or corrupt'.format(ext),
            code=HTTPStatus.BAD_REQUEST,
            extension=ext)

    return {'Sheet 1': (columns, rows)}

Expand All @@ -67,26 +97,3 @@ def sav_stdlib(fp):
with open(csv_file.name, 'r') as file:
csv_file.close()
return csv_stdlib(file)


def _set_dialect_quote_attrs(dialect, data):
"""Set quote-related dialect attributes based on up to 2kb of csv data.

The regular expressions search for things that look like the beginning of
a list, wrapped in a quotation mark that is not dialect.quotechar, with
list items wrapped in dialect.quotechar and seperated by commas.

Example matches include:
"['1', '2', '3' for quotechar == '
'{"a", "b", "c" for quotechar == "
"""
if dialect.quotechar == '"':
if re.search('\'[[({]".+",', data):
dialect.quotechar = "'"
if re.search("'''[[({]\".+\",", data):
dialect.doublequote = True
elif dialect.quotechar == "'":
if re.search("\"[[({]'.+',", data):
dialect.quotechar = '"'
if re.search('"""[[({]\'.+\',', data):
dialect.doublequote = True
2 changes: 1 addition & 1 deletion mfr/extensions/tabular/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

LIBS = config.get('LIBS', {
'.csv': [libs.csv_stdlib],
'.tsv': [libs.csv_stdlib],
'.tsv': [libs.tsv_stdlib],
'.gsheet': [libs.xlsx_xlrd],
'.xlsx': [libs.xlsx_xlrd],
'.xls': [libs.xlsx_xlrd],
Expand Down
3 changes: 1 addition & 2 deletions tests/extensions/ipynb/files/no_metadata.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -528,8 +528,7 @@
]
}
],
"cells": [],
"metadata": {},
"nbformat": 3,
"nbformat_minor": 0
}
}
Binary file added tests/extensions/tabular/files/invalid_null.csv
Binary file not shown.
58 changes: 58 additions & 0 deletions tests/extensions/tabular/test_stdlib_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import os
from http import HTTPStatus
from collections import OrderedDict

import pytest

from mfr.extensions.tabular.libs import stdlib_tools
from mfr.extensions.tabular.exceptions import(EmptyTableError,
TabularRendererError)

BASE = os.path.dirname(os.path.abspath(__file__))


class TestTabularStdlibTools:
    """Tests for the stdlib-based CSV/TSV parsers in tabular.libs.stdlib_tools."""

    def test_csv_stdlib(self):
        with open(os.path.join(BASE, 'files', 'test.csv')) as fp:
            sheets = stdlib_tools.csv_stdlib(fp)

        sheet = sheets.popitem()[1]
        assert sheet[0] == [
            {'id': 'one', 'field': 'one', 'name': 'one', 'sortable': True},
            {'id': 'two', 'field': 'two', 'name': 'two', 'sortable': True},
            {'id': 'three', 'field': 'three', 'name': 'three', 'sortable': True}
        ]
        assert sheet[1][0] == OrderedDict([('one', 'à'), ('two', 'b'), ('three', 'c')])
        assert sheet[1][1] == OrderedDict([('one', '1'), ('two', '2'), ('three', '3')])

    def test_tsv_stdlib(self):
        with open(os.path.join(BASE, 'files', 'test.tsv')) as fp:
            sheets = stdlib_tools.tsv_stdlib(fp)

        sheet = sheets.popitem()[1]
        assert sheet[0] == [
            {'id': 'one', 'field': 'one', 'name': 'one', 'sortable': True},
            {'id': 'two', 'field': 'two', 'name': 'two', 'sortable': True},
            {'id': 'three', 'field': 'three', 'name': 'three', 'sortable': True}
        ]
        assert sheet[1][0] == OrderedDict([('one', 'a'), ('two', 'b'), ('three', 'c')])
        assert sheet[1][1] == OrderedDict([('one', '1'), ('two', '2'), ('three', '3')])

    def test_tsv_stdlib_exception_raises(self):
        with open(os.path.join(BASE, 'files', 'invalid.tsv')) as fp:
            with pytest.raises(EmptyTableError) as e:
                stdlib_tools.tsv_stdlib(fp)
        assert e.value.code == HTTPStatus.BAD_REQUEST

    def test_csv_stdlib_exception_raises(self):
        # BUGFIX: this test exercised tsv_stdlib on a CSV fixture; it must
        # go through the CSV entry point to cover that code path.
        with open(os.path.join(BASE, 'files', 'invalid.csv')) as fp:
            with pytest.raises(EmptyTableError) as e:
                stdlib_tools.csv_stdlib(fp)
        assert e.value.code == HTTPStatus.BAD_REQUEST

    def test_csv_stdlib_other_exception_raises(self):
        # BUGFIX: same mismatch -- the NUL-byte CSV fixture must be fed to
        # csv_stdlib, not tsv_stdlib, to test the CSV error handling.
        with open(os.path.join(BASE, 'files', 'invalid_null.csv')) as fp:
            with pytest.raises(TabularRendererError) as e:
                stdlib_tools.csv_stdlib(fp)
        assert e.value.code == HTTPStatus.BAD_REQUEST
2 changes: 1 addition & 1 deletion tests/extensions/zip/test_renderer.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ class TestZipRenderer:

def test_render(self, renderer):
body = renderer.render()
parsed_html = BeautifulSoup(body)
parsed_html = BeautifulSoup(body, "html.parser")
rows = parsed_html.findChildren('table')[0].findChildren(['tr'])

name = rows[2].findChildren('td')[0].get_text().strip()
Expand Down