Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SVCS-530] Xlsx duplicate header error fix #288

Open
wants to merge 4 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 31 additions & 5 deletions mfr/extensions/tabular/libs/xlrd_tools.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
import xlrd
import uuid
from collections import OrderedDict
from ..exceptions import TableTooBigError

from ..utilities import header_population
import xlrd

from mfr.extensions.tabular.compat import range, basestring
from mfr.extensions.tabular.utilities import header_population
from mfr.extensions.tabular.exceptions import TableTooBigError


def xlsx_xlrd(fp):
"""Read and convert a xlsx file to JSON format using the xlrd library
def xlsx_xlrd(fp, max_iterations=5000):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

"""Read and convert a xlsx file to JSON format using the xlrd library.
:param fp: File pointer object
:return: tuple of table headers and data
"""
Expand All @@ -34,6 +36,30 @@ def xlsx_xlrd(fp):
for index, value in enumerate(fields)
]

# Duplicate header fields create errors, we need to rename them
duplicate_fields = set([x for x in fields if fields.count(x) > 1])
if len(duplicate_fields):
counts = {}
for field in duplicate_fields:
counts[field] = 1

for i, field in enumerate(fields):
if field in duplicate_fields:
increased_name = field + ' ({})'.format(counts[field])

# this triggers if you try to rename a header, and that new name
# already exists in fields. it will then increment to look for the
# next available name.
iteration = 0
while increased_name in fields:
iteration += 1
if iteration > max_iterations:
increased_name = field + ' ({})'.format(uuid.uuid4())
else:
counts[field] += 1
increased_name = field + ' ({})'.format(counts[field])

fields[i] = increased_name
data = []
for i in range(1, sheet.nrows):
row = []
Expand Down
Binary file not shown.
Binary file not shown.
34 changes: 33 additions & 1 deletion tests/extensions/tabular/test_xlsx_tools.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import os

import pytest

from mfr.extensions.tabular.libs import xlrd_tools

BASE = os.path.dirname(os.path.abspath(__file__))
Expand All @@ -17,4 +19,34 @@ def test_xlsx_xlrd(self):
assert sheet[0][2] == {'field': 'three', 'id': 'three', 'name': 'three', 'sortable': True}
assert sheet[1][0] == {'one': 'a', 'two': 'b', 'three': 'c'}
assert sheet[1][1] == {'one': 1.0, 'two': 2.0, 'three': 3.0}
assert sheet[1][2] == {'one': u'wierd\\x97', 'two': u'char\\x98','three': u'set\\x99'}
assert sheet[1][2] == {'one': u'wierd\\x97', 'two': u'char\\x98', 'three': u'set\\x99'}

def test_xlsx_xlrd_duplicate_fields(self):
with open(os.path.join(BASE, 'files', 'test_duplicate.xlsx')) as fp:
sheets = xlrd_tools.xlsx_xlrd(fp)

sheet = sheets.popitem()[1]
assert sheet[0][0] == {'id': 'Name', 'name': 'Name', 'field': 'Name', 'sortable': True}
assert sheet[0][1] == {'id': 'Dup (1)', 'name': 'Dup (1)',
'field': 'Dup (1)', 'sortable': True}
assert sheet[0][2] == {'id': 'Dup (2)', 'name': 'Dup (2)',
'field': 'Dup (2)', 'sortable': True}
assert sheet[0][3] == {'id': 'Dup (3)', 'name': 'Dup (3)',
'field': 'Dup (3)', 'sortable': True}
assert sheet[0][4] == {'id': 'Dup (4)', 'name': 'Dup (4)',
'field': 'Dup (4)', 'sortable': True}
assert sheet[0][5] == {'id': 'Not Dup', 'name': 'Not Dup',
'field': 'Not Dup', 'sortable': True}
assert sheet[1][0] == {'Name': 1.0, 'Dup (1)': 2.0, 'Dup (2)': 3.0,
'Dup (3)': 4.0, 'Dup (4)': 5.0, 'Not Dup': 6.0}

def test_xlsx_xlrd_duplicate_fields_handle_naming(self):
with open(os.path.join(BASE, 'files', 'test_duplicate_uuid.xlsx')) as fp:
sheets = xlrd_tools.xlsx_xlrd(fp, max_iterations=10)

sheet = sheets.popitem()[1]

# this if you raise max iterations, it will be named dup (13) instead of dup (<uuid>)
assert sheet[0][1]['field'] != 'dup (13)'
# using `len` is an easy way to see the uuid has been appended
assert len(sheet[0][1]['field']) > 24