Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SVCS-531] Improve tabular renderer to handle more TSV cases #308

Open
wants to merge 4 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions mfr/extensions/tabular/libs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ def csv_stdlib():
return csv_stdlib


def tsv_stdlib():
    # Lazily resolve the stdlib-based TSV parser so importing this package
    # stays cheap until a renderer is actually needed.
    from ..libs import stdlib_tools
    return stdlib_tools.tsv_stdlib


def csv_pandas():
    # Lazily resolve the pandas-based CSV parser; defers the heavy pandas
    # import until this renderer is actually selected.
    from ..libs import panda_tools
    return panda_tools.csv_pandas
Expand Down
113 changes: 60 additions & 53 deletions mfr/extensions/tabular/libs/stdlib_tools.py
Original file line number Diff line number Diff line change
@@ -1,57 +1,87 @@
import re
import csv
from http import HTTPStatus

from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError
from mfr.extensions.tabular import utilities
from mfr.extensions.tabular.exceptions import (EmptyTableError,
TabularRendererError)


def csv_stdlib(fp):
    """Read and convert a CSV file to JSON format using the Python standard library.

    :param fp: file pointer object for the CSV file
    :return: dict mapping sheet name to a (columns, rows) tuple
    """
    try:
        # CSVs are always values separated by commas; sniff only for
        # quoting style and for spaces after the commas
        dialect = csv.Sniffer().sniff(fp.read(), ',')
    except csv.Error:
        # sniffing failed (e.g. empty or single-column file); fall back
        # to the default Excel-style comma dialect
        dialect = csv.excel
    fp.seek(0)

    reader = csv.DictReader(fp, dialect=dialect)
    return parse_stdlib(reader, 'csv')

def tsv_stdlib(fp):
    """Read and convert a TSV file to JSON format using the Python standard library.

    :param fp: file pointer object for the TSV file
    :return: dict mapping sheet name to a (columns, rows) tuple
    """
    try:
        # TSVs are always values separated by TABs; sniff only for
        # quoting style and for spaces after the TABs
        dialect = csv.Sniffer().sniff(fp.read(), '\t')
    except csv.Error:
        # sniffing failed (e.g. empty or single-column file); fall back
        # to the default Excel-style tab dialect
        dialect = csv.excel_tab
    fp.seek(0)

    reader = csv.DictReader(fp, dialect=dialect)
    return parse_stdlib(reader, 'tsv')

def parse_stdlib(reader, ext):
    """Read and convert a csv-like file to JSON format using the python standard library.

    :param reader: ``csv.DictReader`` positioned at the start of the data
    :param ext: file extension ('csv' or 'tsv'), used for error reporting
    :return: dict mapping sheet name to a (columns, rows) tuple
    :raises TabularRendererError: if a field is too large to render or the
        file cannot be parsed at all
    :raises EmptyTableError: if the file yields no columns and no rows
    """
    columns = []
    try:
        # De-duplicate the reader's field names before row extraction so
        # repeated headers do not collapse into one column: the second
        # occurrence of 'foo' is renamed 'foo-2', the third 'foo-3', etc.
        # The displayed column name keeps the original header text.
        for idx, fieldname in enumerate(reader.fieldnames or []):
            column_count = sum(1 for column in columns if fieldname == column['name'])
            if column_count:
                unique_fieldname = '{}-{}'.format(fieldname, column_count + 1)
                reader.fieldnames[idx] = unique_fieldname
            else:
                unique_fieldname = fieldname
            columns.append({
                'id': unique_fieldname,
                'field': unique_fieldname,
                'name': fieldname,
                'sortable': True,
            })

        rows = [row for row in reader]
    except csv.Error as e:
        if any("field larger than field limit" in errorMsg for errorMsg in e.args):
            raise TabularRendererError(
                'This file contains a field too large to render. '
                'Please download and view it locally.',
                code=HTTPStatus.BAD_REQUEST,
                extension=ext,
            ) from e
        else:
            raise TabularRendererError(
                'Cannot render file as {}. The file may be empty or corrupt'.format(ext),
                code=HTTPStatus.BAD_REQUEST,
                extension=ext
            ) from e

    # Outside other except because the `if any` line causes more errors to be raised
    # on certain exceptions
    except Exception as e:
        raise TabularRendererError(
            'Cannot render file as {}. The file may be empty or corrupt'.format(ext),
            code=HTTPStatus.BAD_REQUEST,
            extension=ext
        ) from e

    if not columns and not rows:
        raise EmptyTableError(
            'Cannot render file as {}. The file may be empty or corrupt'.format(ext),
            code=HTTPStatus.BAD_REQUEST,
            extension=ext)

    return {'Sheet 1': (columns, rows)}

Expand All @@ -67,26 +97,3 @@ def sav_stdlib(fp):
with open(csv_file.name, 'r') as file:
csv_file.close()
return csv_stdlib(file)


def _set_dialect_quote_attrs(dialect, data):
"""Set quote-related dialect attributes based on up to 2kb of csv data.

The regular expressions search for things that look like the beginning of
a list, wrapped in a quotation mark that is not dialect.quotechar, with
list items wrapped in dialect.quotechar and seperated by commas.

Example matches include:
"['1', '2', '3' for quotechar == '
'{"a", "b", "c" for quotechar == "
"""
if dialect.quotechar == '"':
if re.search('\'[[({]".+",', data):
dialect.quotechar = "'"
if re.search("'''[[({]\".+\",", data):
dialect.doublequote = True
elif dialect.quotechar == "'":
if re.search("\"[[({]'.+',", data):
dialect.quotechar = '"'
if re.search('"""[[({]\'.+\',', data):
dialect.doublequote = True
2 changes: 1 addition & 1 deletion mfr/extensions/tabular/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

LIBS = config.get('LIBS', {
'.csv': [libs.csv_stdlib],
'.tsv': [libs.csv_stdlib],
'.tsv': [libs.tsv_stdlib],
'.gsheet': [libs.xlsx_xlrd],
'.xlsx': [libs.xlsx_xlrd],
'.xls': [libs.xlsx_xlrd],
Expand Down
3 changes: 1 addition & 2 deletions tests/extensions/ipynb/files/no_metadata.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -528,8 +528,7 @@
]
}
],
"cells": [],
"metadata": {},
"nbformat": 3,
"nbformat_minor": 0
}
}
Binary file added tests/extensions/tabular/files/invalid_null.csv
Binary file not shown.
58 changes: 58 additions & 0 deletions tests/extensions/tabular/test_stdlib_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import os
from http import HTTPStatus
from collections import OrderedDict

import pytest

from mfr.extensions.tabular.libs import stdlib_tools
from mfr.extensions.tabular.exceptions import(EmptyTableError,
TabularRendererError)

BASE = os.path.dirname(os.path.abspath(__file__))


class TestTabularStdlibTools:
    """Tests for the stdlib-based CSV/TSV parsers in tabular.libs.stdlib_tools."""

    def test_csv_stdlib(self):
        with open(os.path.join(BASE, 'files', 'test.csv')) as fp:
            sheets = stdlib_tools.csv_stdlib(fp)

        sheet = sheets.popitem()[1]
        assert sheet[0] == [
            {'id': 'one', 'field': 'one', 'name': 'one', 'sortable': True},
            {'id': 'two', 'field': 'two', 'name': 'two', 'sortable': True},
            {'id': 'three', 'field': 'three', 'name': 'three', 'sortable': True}
        ]
        assert sheet[1][0] == OrderedDict([('one', 'à'), ('two', 'b'), ('three', 'c')])
        assert sheet[1][1] == OrderedDict([('one', '1'), ('two', '2'), ('three', '3')])

    def test_tsv_stdlib(self):
        with open(os.path.join(BASE, 'files', 'test.tsv')) as fp:
            sheets = stdlib_tools.tsv_stdlib(fp)

        sheet = sheets.popitem()[1]
        assert sheet[0] == [
            {'id': 'one', 'field': 'one', 'name': 'one', 'sortable': True},
            {'id': 'two', 'field': 'two', 'name': 'two', 'sortable': True},
            {'id': 'three', 'field': 'three', 'name': 'three', 'sortable': True}
        ]
        assert sheet[1][0] == OrderedDict([('one', 'a'), ('two', 'b'), ('three', 'c')])
        assert sheet[1][1] == OrderedDict([('one', '1'), ('two', '2'), ('three', '3')])

    def test_tsv_stdlib_exception_raises(self):
        with open(os.path.join(BASE, 'files', 'invalid.tsv')) as fp:
            with pytest.raises(EmptyTableError) as e:
                stdlib_tools.tsv_stdlib(fp)
        assert e.value.code == HTTPStatus.BAD_REQUEST

    def test_csv_stdlib_exception_raises(self):
        # BUGFIX: this test exercised tsv_stdlib on a CSV fixture; it must
        # go through the CSV entry point to cover that code path.
        with open(os.path.join(BASE, 'files', 'invalid.csv')) as fp:
            with pytest.raises(EmptyTableError) as e:
                stdlib_tools.csv_stdlib(fp)
        assert e.value.code == HTTPStatus.BAD_REQUEST

    def test_csv_stdlib_other_exception_raises(self):
        # BUGFIX: same mismatch -- the NUL-byte CSV fixture must be fed to
        # csv_stdlib, not tsv_stdlib, to test the CSV error handling.
        with open(os.path.join(BASE, 'files', 'invalid_null.csv')) as fp:
            with pytest.raises(TabularRendererError) as e:
                stdlib_tools.csv_stdlib(fp)
        assert e.value.code == HTTPStatus.BAD_REQUEST
2 changes: 1 addition & 1 deletion tests/extensions/zip/test_renderer.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ class TestZipRenderer:

def test_render(self, renderer):
body = renderer.render()
parsed_html = BeautifulSoup(body)
parsed_html = BeautifulSoup(body, "html.parser")
rows = parsed_html.findChildren('table')[0].findChildren(['tr'])

name = rows[2].findChildren('td')[0].get_text().strip()
Expand Down