From 617306f8becdfcc33d6c872f4e27c22907f0b959 Mon Sep 17 00:00:00 2001 From: Addison Schiller Date: Wed, 11 Oct 2017 09:43:05 -0400 Subject: [PATCH 1/4] Separate csv and tsv function and remove use of sniff Csv.sniff could cause random characters or spaces to be used as the delimiter. Separating these functions and using a hard coded dialect fixes this display problem. --- mfr/extensions/tabular/libs/__init__.py | 5 +++ mfr/extensions/tabular/libs/stdlib_tools.py | 39 +++++++++++++++------ mfr/extensions/tabular/settings.py | 2 +- 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/mfr/extensions/tabular/libs/__init__.py b/mfr/extensions/tabular/libs/__init__.py index 609dc7a35..14e9cb375 100644 --- a/mfr/extensions/tabular/libs/__init__.py +++ b/mfr/extensions/tabular/libs/__init__.py @@ -8,6 +8,11 @@ def csv_stdlib(): return csv_stdlib +def tsv_stdlib(): + from ..libs.stdlib_tools import tsv_stdlib + return tsv_stdlib + + def csv_pandas(): from ..libs.panda_tools import csv_pandas return csv_pandas diff --git a/mfr/extensions/tabular/libs/stdlib_tools.py b/mfr/extensions/tabular/libs/stdlib_tools.py index 542d5744e..2a5d30913 100644 --- a/mfr/extensions/tabular/libs/stdlib_tools.py +++ b/mfr/extensions/tabular/libs/stdlib_tools.py @@ -1,26 +1,45 @@ import re import csv -from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError from mfr.extensions.tabular import utilities +from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError def csv_stdlib(fp): - """Read and convert a csv file to JSON format using the python standard library - :param fp: File pointer object - :return: tuple of table headers and data - """ - data = fp.read(2048) + data = fp.seek(2048) fp.seek(0) + # set the dialect instead of sniffing for it. + # sniffing can cause things like spaces or characters to be the delimiter + dialect = csv.excel + try: + _set_dialect_quote_attrs(dialect, data) + except: + # if this errors it is not an exception + pass + reader = csv.DictReader(fp, dialect=dialect) + return parse_stdlib(reader) + +def tsv_stdlib(fp): + data = fp.seek(2048) + fp.seek(0) + # set the dialect instead of sniffing for it. + # sniffing can cause things like spaces or characters to be the delimiter + dialect = csv.excel_tab try: - dialect = csv.Sniffer().sniff(data) - except csv.Error: - dialect = csv.excel - else: _set_dialect_quote_attrs(dialect, data) + except: + # if this errors it is not an exception + pass reader = csv.DictReader(fp, dialect=dialect) + return parse_stdlib(reader) + +def parse_stdlib(reader): + """Read and convert a csv like file to JSON format using the python standard library + :param fp: File pointer object + :return: tuple of table headers and data + """ columns = [] # update the reader field names to avoid duplicate column names when performing row extraction for idx, fieldname in enumerate(reader.fieldnames or []): diff --git a/mfr/extensions/tabular/settings.py b/mfr/extensions/tabular/settings.py index 87d46e885..8895e3cff 100644 --- a/mfr/extensions/tabular/settings.py +++ b/mfr/extensions/tabular/settings.py @@ -10,7 +10,7 @@ LIBS = config.get('LIBS', { '.csv': [libs.csv_stdlib], - '.tsv': [libs.csv_stdlib], + '.tsv': [libs.tsv_stdlib], '.gsheet': [libs.xlsx_xlrd], '.xlsx': [libs.xlsx_xlrd], '.xls': [libs.xlsx_xlrd], From 0ff858931090ae6dfd9e718628f9ae9fc05879dc Mon Sep 17 00:00:00 2001 From: Addison Schiller Date: Thu, 16 Nov 2017 14:39:54 -0500 Subject: [PATCH 2/4] Add tests --- tests/extensions/tabular/test_stdlib_tools.py | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 tests/extensions/tabular/test_stdlib_tools.py diff --git a/tests/extensions/tabular/test_stdlib_tools.py b/tests/extensions/tabular/test_stdlib_tools.py new file mode 100644 index 000000000..d6c927e2b --- /dev/null +++ b/tests/extensions/tabular/test_stdlib_tools.py @@ -0,0 +1,51 @@ +import os +from collections import OrderedDict + +import pytest + +from mfr.extensions.tabular.libs import stdlib_tools +from mfr.extensions.tabular.exceptions import EmptyTableError + + +BASE = os.path.dirname(os.path.abspath(__file__)) + + +class TestTabularStdlibTools: + + def test_csv_stdlib(self): + with open(os.path.join(BASE, 'files', 'test.csv')) as fp: + sheets = stdlib_tools.csv_stdlib(fp) + + sheet = sheets.popitem()[1] + assert sheet[0] == [ + {'id': 'one', 'field': 'one', 'name': 'one', 'sortable': True}, + {'id': 'two', 'field': 'two', 'name': 'two', 'sortable': True}, + {'id': 'three', 'field': 'three', 'name': 'three', 'sortable': True} + ] + assert sheet[1][0] == OrderedDict([('one', 'à'), ('two', 'b'), ('three', 'c')]) + assert sheet[1][1] == OrderedDict([('one', '1'), ('two', '2'), ('three', '3')]) + + def test_tsv_stdlib(self): + with open(os.path.join(BASE, 'files', 'test.tsv')) as fp: + sheets = stdlib_tools.tsv_stdlib(fp) + + sheet = sheets.popitem()[1] + assert sheet[0] == [ + {'id': 'one', 'field': 'one', 'name': 'one', 'sortable': True}, + {'id': 'two', 'field': 'two', 'name': 'two', 'sortable': True}, + {'id': 'three', 'field': 'three', 'name': 'three', 'sortable': True} + ] + assert sheet[1][0] == OrderedDict([('one', 'a'), ('two', 'b'), ('three', 'c')]) + assert sheet[1][1] == OrderedDict([('one', '1'), ('two', '2'), ('three', '3')]) + + def test_tsv_stdlib_exception_raises(self): + with open(os.path.join(BASE, 'files', 'invalid.tsv')) as fp: + with pytest.raises(EmptyTableError) as e: + stdlib_tools.tsv_stdlib(fp) + assert e.value.code == 400 + + def test_csv_stdlib_exception_raises(self): + with open(os.path.join(BASE, 'files', 'invalid.csv')) as fp: + with pytest.raises(EmptyTableError) as e: + stdlib_tools.tsv_stdlib(fp) + assert e.value.code == 400 From b1083c5fe5936b9ee8572839850260a2f8b8af74 Mon Sep 17 00:00:00 2001 From: Addison Schiller Date: Wed, 22 Nov 2017 11:49:54 -0500 Subject: [PATCH 3/4] Better error handling --- mfr/extensions/tabular/libs/stdlib_tools.py | 51 +++++++++++------- .../extensions/tabular/files/invalid_null.csv | Bin 0 -> 9991 bytes tests/extensions/tabular/test_stdlib_tools.py | 15 ++++-- 3 files changed, 44 insertions(+), 22 deletions(-) create mode 100644 tests/extensions/tabular/files/invalid_null.csv diff --git a/mfr/extensions/tabular/libs/stdlib_tools.py b/mfr/extensions/tabular/libs/stdlib_tools.py index 2a5d30913..5c0856015 100644 --- a/mfr/extensions/tabular/libs/stdlib_tools.py +++ b/mfr/extensions/tabular/libs/stdlib_tools.py @@ -1,8 +1,10 @@ import re import csv +from http import HTTPStatus from mfr.extensions.tabular import utilities -from mfr.extensions.tabular.exceptions import EmptyTableError, TabularRendererError +from mfr.extensions.tabular.exceptions import (EmptyTableError, + TabularRendererError) def csv_stdlib(fp): @@ -42,35 +44,48 @@ def parse_stdlib(reader): """ columns = [] # update the reader field names to avoid duplicate column names when performing row extraction - for idx, fieldname in enumerate(reader.fieldnames or []): - column_count = sum(1 for column in columns if fieldname == column['name']) - if column_count: - unique_fieldname = '{}-{}'.format(fieldname, column_count + 1) - reader.fieldnames[idx] = unique_fieldname - else: - unique_fieldname = fieldname - columns.append({ - 'id': unique_fieldname, - 'field': unique_fieldname, - 'name': fieldname, - 'sortable': True, - }) - try: + for idx, fieldname in enumerate(reader.fieldnames or []): + column_count = sum(1 for column in columns if fieldname == column['name']) + if column_count: + unique_fieldname = '{}-{}'.format(fieldname, column_count + 1) + reader.fieldnames[idx] = unique_fieldname + else: + unique_fieldname = fieldname + columns.append({ + 'id': unique_fieldname, + 'field': unique_fieldname, + 'name': fieldname, + 'sortable': True, + }) + rows = [row for row in reader] except csv.Error as e: if any("field larger than field limit" in errorMsg for errorMsg in e.args): raise TabularRendererError( 'This file contains a field too large to render. ' 'Please download and view it locally.', - code=400, + code=HTTPStatus.BAD_REQUEST, extension='csv', ) from e else: - raise TabularRendererError('csv.Error: {}'.format(e), extension='csv') from e + raise TabularRendererError('Cannot render file as csv/tsv. ' + 'The file may be empty or corrupt', + code=HTTPStatus.BAD_REQUEST, + extension='csv') from e + + # Outside other except because the `if any` line causes more errors to be raised + # on certain exceptions + except Exception as e: + raise TabularRendererError('Cannot render file as csv/tsv. ' + 'The file may be empty or corrupt', + code=HTTPStatus.BAD_REQUEST, + extension='csv') from e if not columns and not rows: - raise EmptyTableError('Table empty or corrupt.', extension='csv') + raise EmptyTableError('Cannot render file as csv/tsv. ' + 'The file may be empty or corrupt', + code=HTTPStatus.BAD_REQUEST, extension='csv') return {'Sheet 1': (columns, rows)} diff --git a/tests/extensions/tabular/files/invalid_null.csv b/tests/extensions/tabular/files/invalid_null.csv new file mode 100644 index 0000000000000000000000000000000000000000..c25eed4af969d306ae1c00637b9bd9ff3b15bacb GIT binary patch literal 9991 zcmbVxcT`hPv~K7qO?vN5q)8K{#RdWS0n(*NM5Kv;fbK$mm@ zh;x8P04fSfN=gbU(v6CWikgO=mWI@r=;*G|Gc&QUFf%bTvvTlpva<28Gc$7ubMf%= z3knLda*BwF2#E0s2nzf=2^r~A8fqFwT3SW{HfA<~|K~>R05H=60swz0$V33-%w!bI zWW;WOAOJu{MY8r^ga50MkyB8TtfakmosKl1_6C5QjDmukWIqW2X>=&*J%Ey#ibYuA z0X3_!J&lMLn__rkHm&Hxs!n#35saA9OYaZYu5)m5ar1~vNZypXrL1yC_3k~jM~}62 zK)QNQOwG(KET36fJG^pqa&~cj?eosp&p#kADB@#eRCG*i+}EUU$tkJdf28H)=H(X@ z78RHLtgfl8t8Zv*>Oyw+^!EMw-9I`uJ~25pJu|zEURhmR-`L#R#vUFWpPZiI&M*GO zMFybwPgwtr?Ek>UOu|J@atkHRzqrWA{YjmInUYFaftuxkF^#O1v3B$xRgtOau;z8 zU(lw42;V~?1$Yx*VfnBTV$)P_o*utcZ`@V))vV6vQ7l)X{V`F>W4xX?6JmjDT{?*N z`d(|(+cZAo4=S`?F3^f>7CQ4psvH47LtIe;+eaEf^nPK2H5&3MA6%>U_d0+N>{NKR zn+P>*bmjV{ZRr5|-!g{^SHD$`jgY{Zmlvm8h%5i%+7ETSms*2SsRB^pbF)i8SnXcf z9nNdK;}b-{b0%)I0$1VdAo-nQbQ-i99>epgdNE+YCiULZ!fn0_vW}8TEj4U@IQ4Lk z^-volDAjKu;(ThJhqD738DwT}+Lbc(2AFUN`T@lsdtI0>JX#v=+L`9zZeh10XC4_nwmkI>ohJg6R#7yB5irZKRIbgCp{)0~V45_p^QC5Zcu>O@G zZf6-N*b?;S4giRK_yl9?rP!?WH`sm`&YV?1Ys9zt^8nnmHfTqw+1{` zIV@#w--3iDIb#V5^8v$4W{WIX-qguGxfd=K%?sUyrI0#x7~U-o#|y>T+OZM=m3hF6 z8#z(eZ)b~}+}cJX;=b*GzFMmbIub?+%2OdUm?bwmfdJ7~x9Re;A+ERV_0AE!0SHQ* z7isw?lWjr)5Z(6Hv5o51ap8KHO0~8)e^&IS;Hx5VSBi!(1QxlHn~{H(tz4K90|vxiGkAlOr=Bh|h;OX(b#{^RJ1yzw?d4}%cHVX+S8xI~1S z%vbBpJEMYCRuP^2c}Ww)1oglEN7DJ2ecd}wDlgcZi_}uWV zw3|!C#?dw1p`Di87B%NLMKh)dw?p(NSDjk12?cS6A`z^iYg4v^Y&|*wdaFi2uB}O%2uSfycQz#gLdhYbb_+CZF+G$=Zjjs%7_nUq%_pQGBptl6QpXp<%VMLt%Z&1$sGq~gf<&GpTFXt0X7Tm z1|Mr!E)s5F8B@^+Mys6**}n}{Ezl6pzfg1gnebntRqShMj5l(iNmE8nPG#yxx$xW( z>w6lBHV53orT(4tBBQJ;pnh)qbQ&)?&<)ioP+|T>3^$Q7vTLMfrfoxhro2FuE&Yw=6 zeQ;oIS$yR@o%@Oz>=3}aM1zDgx>h|3%+;>2i^T6u;83m2dqz#Ub0=lPu|z<1;fD}V z{*ZjM%u5WRqUaxa&qT`4Gtbokp_Y`v#}$#^mu=S8#zwe8K`g0?DQyKb9q?I2CYJ3x z1U_t;-ziVVw7%Ok;h7xo&q|=zut(o)TO7@7yS$1GUAeSb)0pS7i`<^1i&Ojw$n*nH zVX_U2|E}Sw`niqMS|6g2YC4v_Yw6Lq?w$3IfqnZvkmdeWg9Md2Dnsll~DroV3dj=KDHz>SD6Ws|ai)zJOot)qAB;lhAkd zUh6|!GQPP*TeulR1VoBJ7D_a)mIU%LW8p}zE4M#Hz^mbFkI>Lheo4=J)@QbudC%F7 zolPiT5CI)AKn%}TN0;*L{py7b+{}|!Up+x>!VrXJPlGC3yZrU(mV7KibV0guc;*HX zFd%jeT+|%qrwYDH{{C)0JKC)H)=!_5-BYQt&v%RZUi5IW89#LqrWkF2#se8^ATf(< z>3ZYLA}#mrdXGGZJtZAr+d<6p!RY1)U2QY5W$~N)u^DV*@paDYOtJ5tgOE)pr_rVN zU$p)Fj#Tuc^rvYMr4(3-+iasI0)D^5!10m7M8Mz3VDR3^^3ANakt)FmFNp`x+^4aB zMxey3rABH97X?pZNj)djfYewR0RPR)0Cp>d}`fEGr1Cz0ZOYRFTDtulTM?DhCE;H-hyvdWK zG^6BgZ-168Id_ky;euxy;sB0e*@P)*(YI%ipzX+n4RT@G|MEGE1#oFCyKN57igtP7 z+LyJe?bm3U>tYtI1$Ja!)C(FngsGgL6)YOrLp&j_q98B#TSKe~*MENLhuoIld^>ir z&fOJU4*U&c6akubOZ9nfZ;S}mhl~kw;=HMU#lAbY0$u7nXg;QUM%uZEfN3IN*~1Q> zRS3r?%FmqnF0`sTpgbV-Q^A*}n@IsH%S=%O5!aDGsH;cya}oMP`jY(g%`)oHSVZcT z6k@xL6Uz^4&$ZJ2qv0B+=3elb+SAao!q@v*5Fg z4Uxy^ws4}|0^DoQQ;})e05#kC3-$UEy}wLOoflKpvkNwhHu*hySR=alvc$WN_PZeV zrSV)f!!ZlZeqVcYJGqS^B7h6%%mD5cym10#*;3n2ck~P!5Y^T&w6eX)a29Z{7s9ho z1UNHA1{HSU)vf9$RBUWc$0ZUTb8L1#6-m6ZU0rW=)-0Ybcm7nW7Bdc6Tq=F4SRx#7vgigiZ^uM0ELm=wgH?%@xhM=Z3A(UU{J=qF1F= z>SNoOeWKA>+#1s%`*>Jq(@#)&K>A~+r(-rqAFh4BDhpCzq_n=CU1Dz*9VjfghicZU z;kY zoG(Pc=EJr!e9eh&SsNoS%aVQ(7|9~F{cw%s#g*J6#5cmf(dh6$(RdyNF-sjj)X?w& zTYYYUmbAOW&qYNEU0)!(6Nf~=?twG^S#SIG&YM&mc*l(iGW(0Ti-`YF9k>5Wbx4J< zOyvG0$Obr={`jCRkjwI+ErQ#nQFvWyO+H)FLO|d7h0`St{f>XY{A*uY){EZ zol@1ftZuY><>sb&bQPD%d3f2wx{IMzX6RSxAIzXiZ>q429o0nF3-h((K@Rl*0Sn2j z>v`}s@OR5IiG!!Gy#}Qxqj2{@qZ5RiBv|kl_6Yp5)u@dWD6MJG4ta(ie;xx~s}{xt zRp~5$+Z#dgUUt&h^i#+7nkKjE1F7B(m%XiGcCL`?`A^@LRbN&wT0mf02 zd0p>RSek+`rvx2|^zU2WvISaYULXCYUR}M7Dc89bns^>s=4uW59CE5w8#(cLluy}` znQJ015euP{?w9G(x$8LbRtwyK3S=UKgedb@Yv zyE*n)G7)F_;mQ!7a;%OMZ^dY?xLQWY$n4wx_SpEVx@a_RS*Wj+Z@N%aj>{b`PBkw) zF!jIU)?Yf?Q}+z@qE5t8)0T^SGpp!w-uMdHD1p#iBB4*Ex4su7JR8-`e9lL0Vfy94 z=7VFzAdq=8tuR`pxt!Ok=Ibb7;?#gO;lYVOR>A{5X0UNs5AepmXL&^nMaiGh^t+#X z)AnaXG?vj9%?b5SpmdiG!En$CppPDlW&VB0oToj;gfOV{SGwN!3uBr~Y zs!HXOoW3h~HW7ltnD)vp0b_Z62F^J<%8a+d(%#l$NQnNFydKEBE>1N?gp; zXS9JfH=9%ApZ9rUPrF~X`QdX2TF+ekEbX{3=7-OBOeolUcEyza9I~$mDZ>pp5q8qn z%FfodRXXOSI`M&d!_xSw5MQR)D+p)w@%hITHCmS+-`4DwtJWc?yL2ZDwlp2X=Q3d| z|CJ+D=sxoD;+ql{OS}=4W~fUAH3=Q^ZKnXPRt4yjV)8$tJKNN(=&oGMdArNDg=whO ziRE535g_&oGHt?yPiqs=x%EhzT?hku>b8fPb0+Ba;d_LLfSFRXuE^Iw6E5St#6Rq2 zuaY*Yyw`vv3cT~q3ET&K)J7Jtj_QLm)Zm{EmKWXsFo{>{AAC<97JnPLvu78ccC~?1 z#R^*D+|Y@|)5Q(d>C7mrYJ2B+!I8eZ!03l=2f%399n40;JLQQh^&;^>5cxAGO9yJf zcf0Udv)ay1{n{R2r8!I-=t;^p?LkNb#X-md6q^V zfPft)d;l(LC1Z!fLxBEh1g9%Z@aM+?*^{oG@GB|!kJk6L@U8+}1RC$$Cm_{?Wo*0~ z9E;N(!`m`Ln|T=ffp&U3;HQK>L3+u*HN_e}71lLsQQ4fSg*`$@ZI3`j=IM4Wk{An~ zYWvz@pGlBBhq1P)G8YUSs__S(e#8YX;VmF^damWaCBAiyw0tt3z7A-h%3!&BjChE% zS!v}&3q~chW|xoJlpRpy588`a+ByrdD12bD#+9Dtk2sBl-PuY1TV^?!S_wapE*d6i za9NfTx(G6LpMBxfwu7-674l`{8G#YJuPDajr7oNrrI(7!)JhqS)r!#TJ54^*7bRbg zv%_i>&@D+5TDJIR4WN^Z1`r*Ce%zwe8ec)9C^5PNx4nVN4CLHJ_{i4xff>IiX!NB> zo>|0(BaHD)SH`Z&vz#A=67A3nX@pf$8fMw}L`Z9u1Ne_6obZe1@hrS|NNvqQmNC;) zWw3nfa2I?P7#YeyS4OeEJ4{^*Q}I3nEwcB8nBySXSC28{X?JRwElunu_it;=xhkBI zQh1e3(2`qKPFt9wSIB_R{5P{?Slq29v1Y}Y<0FF%mYt8Mxq9V7nyC+HTnj=)C& zS_Bssu{(x2vrDnbRPZ>ai@n#AP93e)X7g6-ecY+5)7y+dx9-E&Yhxc9i>il|!Lw7= zQQ`*9T^y<&YOUpCkKi~uYJ#t)8NR|pK%~j}gTrSmavqfeAOf0-l+GvIjZSqZMt+Zl zI5;S7gnPXn$$)g|6}@SHgrascyTn{FwM!_F%8WZhc{nHG`@IzXJ;X^iK)e5`(od__^1^Ll4j$rsE>-pNT|ceQYvP+17S+)DKNNM0vX1SL zSOhta?>-K>tTcCNB8aE8T!gT;Fz4Zj03_2g-bw#^d79Zb&E`m7$IU=WOIU#EsXGQ3 z9im;hQtRqkGu)1O3(D zsXhKnt@+_#AM^oW=Db{q+q0(TatNYPT_mhhr&ROL+ z@P1&rg>3&Dhrj2M+!gW&XRrk3S2Cz@DU#Rin{$UI1JTF>NJn^u?H$Kojkij33z<`o zw1PYa&#SvowML)c;U@Y0^~O|^jUz8DMmfWwgMi=Lqusr|s4M}|`* zp7_~%?{d%`ngCz5 z$F-kk;3CQ%pI(?eb5T&`-CI%ohvxF2dx*_?EY zC~ncb6nCw9^3#nV*VJ_1`(<_AGtzcI{u0&STG$~--@>}&NQ0c^yRqOvy$@bAL}Bi% zhEh|hCUpI^$mM@r84x#ohIrHkbVkge1NO79pl;s#Fe?^oS4D5VZ2`|sI6Obp>^&9W zT^6I1+b={+W}weMCoWGTxemLrklFNQ+eIK|UacaQrL8?*n#bSR|TF-Tsux`UoBeqz+-lk_{~uI-Ld;+?L<{ zrDs%tjR&HY4xTs2AZI``y#U95IufL~v zgX8cR8;XN4u_)G76AWR*g$?NXb8)ivd+>*LbQJ+@Cl(w|pGW`a4`2Yd*$MP%)R$uztKTzdQ zcG03i!zV^v#(!SShofc;Z zI|lULIpLz~_d%^W7w6PRbwowynJI?*3}MhwVe6r30j(~~Gt3aUHkzujnDc7H&V2>K zwxi}8D<@O7&vMtnO!%J{8`4RWRnZ9{I-sl29wH!2JA_*D6XsdaXzty9toxg$kl~{I zB+FtdO5=m@PaSk~g*axqJPX$Dqt{}fo zuS@rl61;m@$tygmAQq-g(WfOs1i*{h4}Rt1{Rc=jGO#xhaGTK8?GK*%pWybsJ9w?TO6N6KPK*O(&2m_EUnhmH11!FZn9izqN*li=L#(7u@Z3ooji3|(P( z7^WY3%A{g;@rmhOFKw?`Sbd-;M?rJe57JM~{=j5hmOpKb2V5w49peQm(0 z;7tzIpQ{PeTOJyTw~VdaEYQl6Z+#xMWBPNdoJ3~7_ClcK_n z$o9Q-o@r^nN&jV9@_bToZk)#?{?%Ap(>oL*E$lM~hMm>5HWbL(o*TE?n_KbvhTZlZ z)jNM5x$GCyftrVA&VB})n`3>EZhl!gvXMP$PXgAs!-bNzW@}RXqaz#lztx2b)U_9u z$3EjpIIVVr9IC%DnF8i)`&();hdyz#$Pg=|tStO%)-NM4WzKb@tyS*k(zbs3@9BeLCEH9wamf~|?dqC5AVtB&D4%=~RZQnGUKaT##DthTfy2oigIo(Azmn^vc^#iSD0TTyr_p)>qU z9*}{g`!^+N@RzTUC`T;!ECN@hnDCb>RM9=Urah1wWhXAn^`jbMl?{y#t@7sIS87%_EWNCpLIZn% zIBjUcT&J}f&P8%IY|xX-4xdU0a^UrM$gG*7bQ~r;CjwT75ralC3Uqs6f*8gm61WXK zAF>##<*0GV+>g5R z={G&@2t(LF1H&zmlWFl5tm5PT$7kJI#Y5X#)U(S@L=|_ zJ~pPtuG{Xp8a{EyrH~jcMa*w$jl`DATPZRIvV&Bo>TpJ?+%JLh)N?<*e!qZyhN9K*3SH4haDW0`shT#M=4anYi0HjDjGrOM z=fAl!!PR*ySERXJ?K3ZqoO?J7EP1J%7X!%RQt+UnXQ5zBHD6ZEtix@Ftf1!cwUJRgf{ zMVB&=4?ltp@7>mSc>8BNXt{HR*XJPdnKe_lV64Dh)wKF>4W%4J^vmR&n#Kva=VC3l z-ddT1`rDG-gpr`MTi0Y^O>U03nb2OytT}()+|&V~R`a?$Iaa(O4ry|!`FV~5Ch^IK zJV2f4&FDPNdeX9i+(I|MTHowGaAHs6whB_dhrvlUxNp{RGS!7C^7y1NCb#~;wg6qe zm(;6n2Br#e$13$Ex~45w=iq`+$ke|zF!$O4-bhiQkdGJfJjJif*lfPNI|>~eajQc- zzPWDdezP?c2y8gcl-9Wu#>;aLK8PRdTX(3~J%aY;LYR6N1?$H+UVfds#Z_p|&DhGz zz6PWr+|yr?CIVPo(+V=FlGovUQyc;|{PoN_x&nwWEz2)QKJEof#*IiS))~{8Tzp3m`f6Jemi+B4BW)3kLX;H0kgUSBA(&XbH z)u+A0KaVxX8tVtTCw{oC7B_@bC5f*%J9#yasR-g?Z*Dz&L>?ErT#XyEYG*-6<- z($i1o{$4x5%@-$x#N582%e0-*K83O?1u$>esMf8d?mHKGJltdM^xii8FZEMsOP*Hl z-u9MuTXBkAVi4>~Gm5Geqy#U{bh1{YOQd~w)8VgQy`gt+;!;hee&jr@BXlE_7eOTlLL4%bY{w&_k{}$0eZm2XH zCmlGP9Q?Atn9c6zE;6&f@@z|P@8mD)gRBPGf)hSedYO0i-Pql#Vejs zBH-hIoo5siX1Rs649X61=9eG19!0-5vwpB}rZD{EW)7<_@A~82eb3Gp$-4n)KGZF$Mouq3) zi{U58Ot; z0+*b$R9W!L)MmVb^)bFQhzQVKBS~^jtoz39MzOU-QmLLyn?zLbO*zAJ(h-oiVh|G+ z@o?PTcQ^{p>x;Kx#BC=AQ~u>3k|MiFHK>emIrYuTP|HyS0E^ST`p0YXpL z3I|~&sA4dBzqaDa0yCMsY;~QoPG`3l<#5)Bb`gC z9mK!ebLg@YX>ix?_OX4SSzZ*-ul|C?`|g3=+KQD-E+#DhPGP=Y$qb#7(u|n>?5q83 zBU5em%y$_pa%D{{Azzh>WP`m)v#MPtr zE?Ku#$JRw{4HX|IkCIDzCp@A2(MF`7CS&m_FGw?>j8rTQ*85u%JyO1}!HjqokY(Mp z@LKiyj%NnTecq}ufR{GAoQK((p!hjcvQ*D``{4RJRP>@;ElyplHGj*r6Kx=9-5Rho zcQilytH+kfjuBKuUHhhmb%hrk1!oL#$aguM--hut2B&9AJgBo0aTeR7!Bx#6l#L%( zDea(+d61p$iJMLJP|H3$Dy+q)`~c_Yb*=6!D|Ei(C8@7&-MG;wn`8^${q2zD@^!1O xu!A5nrlBYMBVnJg%x6CnGEST9-PcLqf^to3kttummU0{4aDKHj)4U literal 0 HcmV?d00001 diff --git a/tests/extensions/tabular/test_stdlib_tools.py b/tests/extensions/tabular/test_stdlib_tools.py index d6c927e2b..3d958eb01 100644 --- a/tests/extensions/tabular/test_stdlib_tools.py +++ b/tests/extensions/tabular/test_stdlib_tools.py @@ -1,11 +1,12 @@ import os +from http import HTTPStatus from collections import OrderedDict import pytest from mfr.extensions.tabular.libs import stdlib_tools -from mfr.extensions.tabular.exceptions import EmptyTableError - +from mfr.extensions.tabular.exceptions import(EmptyTableError, + TabularRendererError) BASE = os.path.dirname(os.path.abspath(__file__)) @@ -42,10 +43,16 @@ def test_tsv_stdlib_exception_raises(self): with open(os.path.join(BASE, 'files', 'invalid.tsv')) as fp: with pytest.raises(EmptyTableError) as e: stdlib_tools.tsv_stdlib(fp) - assert e.value.code == 400 + assert e.value.code == HTTPStatus.BAD_REQUEST def test_csv_stdlib_exception_raises(self): with open(os.path.join(BASE, 'files', 'invalid.csv')) as fp: with pytest.raises(EmptyTableError) as e: stdlib_tools.tsv_stdlib(fp) - assert e.value.code == 400 + assert e.value.code == HTTPStatus.BAD_REQUEST + + def test_csv_stdlib_other_exception_raises(self): + with open(os.path.join(BASE, 'files', 'invalid_null.csv')) as fp: + with pytest.raises(TabularRendererError) as e: + stdlib_tools.tsv_stdlib(fp) + assert e.value.code == HTTPStatus.BAD_REQUEST From 3048f77d3f319b5f93e688c53e7a60aa4895e337 Mon Sep 17 00:00:00 2001 From: TomBaxter Date: Thu, 21 Dec 2017 11:39:54 -0500 Subject: [PATCH 4/4] Simplify CVS/TVS handling SVCS-531 Our handling of CVS/TSV files was convoluted and unnecessary, this cleaned up and simplified. Collateral- Updated two tests that were working fine, but producing unnecessary noise in the test logs. --- mfr/extensions/tabular/libs/stdlib_tools.py | 83 +++++++------------ .../extensions/ipynb/files/no_metadata.ipynb | 3 +- tests/extensions/zip/test_renderer.py | 2 +- 3 files changed, 30 insertions(+), 58 deletions(-) diff --git a/mfr/extensions/tabular/libs/stdlib_tools.py b/mfr/extensions/tabular/libs/stdlib_tools.py index 5c0856015..2673d5084 100644 --- a/mfr/extensions/tabular/libs/stdlib_tools.py +++ b/mfr/extensions/tabular/libs/stdlib_tools.py @@ -1,4 +1,3 @@ -import re import csv from http import HTTPStatus @@ -8,36 +7,30 @@ def csv_stdlib(fp): - data = fp.seek(2048) - fp.seek(0) - # set the dialect instead of sniffing for it. - # sniffing can cause things like spaces or characters to be the delimiter - dialect = csv.excel try: - _set_dialect_quote_attrs(dialect, data) + # CSVs are always values seperated by commas + # sniff for quoting, and spaces after commas + dialect = csv.Sniffer().sniff(fp.read(), ',') except: - # if this errors it is not an exception - pass + dialect = csv.excel + fp.seek(0) reader = csv.DictReader(fp, dialect=dialect) - return parse_stdlib(reader) + return parse_stdlib(reader, 'csv') def tsv_stdlib(fp): - data = fp.seek(2048) - fp.seek(0) - # set the dialect instead of sniffing for it. - # sniffing can cause things like spaces or characters to be the delimiter - dialect = csv.excel_tab try: - _set_dialect_quote_attrs(dialect, data) + # TSVs are always values seperated by TABs + # sniff for quoting, and spaces after TABs + dialect = csv.Sniffer().sniff(fp.read(), '\t') except: - # if this errors it is not an exception - pass + dialect = csv.excel_tab + fp.seek(0) reader = csv.DictReader(fp, dialect=dialect) - return parse_stdlib(reader) + return parse_stdlib(reader, 'tsv') -def parse_stdlib(reader): +def parse_stdlib(reader, ext): """Read and convert a csv like file to JSON format using the python standard library :param fp: File pointer object :return: tuple of table headers and data @@ -66,26 +59,29 @@ def parse_stdlib(reader): 'This file contains a field too large to render. ' 'Please download and view it locally.', code=HTTPStatus.BAD_REQUEST, - extension='csv', + extension=ext, ) from e else: - raise TabularRendererError('Cannot render file as csv/tsv. ' - 'The file may be empty or corrupt', - code=HTTPStatus.BAD_REQUEST, - extension='csv') from e + raise TabularRendererError( + 'Cannot render file as {}. The file may be empty or corrupt'.format(ext), + code=HTTPStatus.BAD_REQUEST, + extension=ext + ) from e # Outside other except because the `if any` line causes more errors to be raised # on certain exceptions except Exception as e: - raise TabularRendererError('Cannot render file as csv/tsv. ' - 'The file may be empty or corrupt', - code=HTTPStatus.BAD_REQUEST, - extension='csv') from e + raise TabularRendererError( + 'Cannot render file as {}. The file may be empty or corrupt'.format(ext), + code=HTTPStatus.BAD_REQUEST, + extension=ext + ) from e if not columns and not rows: - raise EmptyTableError('Cannot render file as csv/tsv. ' - 'The file may be empty or corrupt', - code=HTTPStatus.BAD_REQUEST, extension='csv') + raise EmptyTableError( + 'Cannot render file as {}. The file may be empty or corrupt'.format(ext), + code=HTTPStatus.BAD_REQUEST, + extension=ext) return {'Sheet 1': (columns, rows)} @@ -101,26 +97,3 @@ def sav_stdlib(fp): with open(csv_file.name, 'r') as file: csv_file.close() return csv_stdlib(file) - - -def _set_dialect_quote_attrs(dialect, data): - """Set quote-related dialect attributes based on up to 2kb of csv data. - - The regular expressions search for things that look like the beginning of - a list, wrapped in a quotation mark that is not dialect.quotechar, with - list items wrapped in dialect.quotechar and seperated by commas. - - Example matches include: - "['1', '2', '3' for quotechar == ' - '{"a", "b", "c" for quotechar == " - """ - if dialect.quotechar == '"': - if re.search('\'[[({]".+",', data): - dialect.quotechar = "'" - if re.search("'''[[({]\".+\",", data): - dialect.doublequote = True - elif dialect.quotechar == "'": - if re.search("\"[[({]'.+',", data): - dialect.quotechar = '"' - if re.search('"""[[({]\'.+\',', data): - dialect.doublequote = True diff --git a/tests/extensions/ipynb/files/no_metadata.ipynb b/tests/extensions/ipynb/files/no_metadata.ipynb index 8d5457eb0..948a6718b 100644 --- a/tests/extensions/ipynb/files/no_metadata.ipynb +++ b/tests/extensions/ipynb/files/no_metadata.ipynb @@ -528,8 +528,7 @@ ] } ], - "cells": [], "metadata": {}, "nbformat": 3, "nbformat_minor": 0 -} \ No newline at end of file +} diff --git a/tests/extensions/zip/test_renderer.py b/tests/extensions/zip/test_renderer.py index 162e27349..fb00942d2 100644 --- a/tests/extensions/zip/test_renderer.py +++ b/tests/extensions/zip/test_renderer.py @@ -65,7 +65,7 @@ class TestZipRenderer: def test_render(self, renderer): body = renderer.render() - parsed_html = BeautifulSoup(body) + parsed_html = BeautifulSoup(body, "html.parser") rows = parsed_html.findChildren('table')[0].findChildren(['tr']) name = rows[2].findChildren('td')[0].get_text().strip()