CenterForOpenScience · AddisonSchiller · Oct 10, 2017 · Nov 16, 2017 · Nov 16, 2017 · Nov 16, 2017
diff --git a/mfr/extensions/tabular/libs/xlrd_tools.py b/mfr/extensions/tabular/libs/xlrd_tools.py
@@ -1,13 +1,15 @@
-import xlrd
+import uuid
 from collections import OrderedDict
-from ..exceptions import TableTooBigError
 
-from ..utilities import header_population
+import xlrd
+
 from mfr.extensions.tabular.compat import range, basestring
+from mfr.extensions.tabular.utilities import header_population
+from mfr.extensions.tabular.exceptions import TableTooBigError
 
 
-def xlsx_xlrd(fp):
-    """Read and convert a xlsx file to JSON format using the xlrd library
+def xlsx_xlrd(fp, max_iterations=5000):
+    """Read and convert a xlsx file to JSON format using the xlrd library.
     :param fp: File pointer object
     :return: tuple of table headers and data
     """
@@ -34,6 +36,30 @@ def xlsx_xlrd(fp):
             for index, value in enumerate(fields)
         ]
 
+        # Duplicate header fields create errors, we need to rename them
+        duplicate_fields = set([x for x in fields if fields.count(x) > 1])
+        if len(duplicate_fields):
+            counts = {}
+            for field in duplicate_fields:
+                counts[field] = 1
+
+            for i, field in enumerate(fields):
+                if field in duplicate_fields:
+                    increased_name = field + ' ({})'.format(counts[field])
+
+                    # this triggers if you try to rename a header, and that new name
+                    # already exists in fields. it will then increment to look for the
+                    # next available name.
+                    iteration = 0
+                    while increased_name in fields:
+                        iteration += 1
+                        if iteration > max_iterations:
+                            increased_name = field + ' ({})'.format(uuid.uuid4())
+                        else:
+                            counts[field] += 1
+                            increased_name = field + ' ({})'.format(counts[field])
+
+                    fields[i] = increased_name
         data = []
         for i in range(1, sheet.nrows):
             row = []

diff --git a/tests/extensions/tabular/files/test_duplicate.xlsx b/tests/extensions/tabular/files/test_duplicate.xlsx
diff --git a/tests/extensions/tabular/files/test_duplicate_uuid.xlsx b/tests/extensions/tabular/files/test_duplicate_uuid.xlsx
diff --git a/tests/extensions/tabular/test_xlsx_tools.py b/tests/extensions/tabular/test_xlsx_tools.py
@@ -1,5 +1,7 @@
 import os
 
+import pytest
+
 from mfr.extensions.tabular.libs import xlrd_tools
 
 BASE = os.path.dirname(os.path.abspath(__file__))
@@ -17,4 +19,34 @@ def test_xlsx_xlrd(self):
         assert sheet[0][2] == {'field': 'three', 'id': 'three', 'name': 'three', 'sortable': True}
         assert sheet[1][0] == {'one': 'a', 'two': 'b', 'three': 'c'}
         assert sheet[1][1] == {'one': 1.0, 'two': 2.0, 'three': 3.0}
-        assert sheet[1][2] == {'one': u'wierd\\x97', 'two': u'char\\x98','three': u'set\\x99'}
+        assert sheet[1][2] == {'one': u'wierd\\x97', 'two': u'char\\x98', 'three': u'set\\x99'}
+
+    def test_xlsx_xlrd_duplicate_fields(self):
+        with open(os.path.join(BASE, 'files', 'test_duplicate.xlsx')) as fp:
+            sheets = xlrd_tools.xlsx_xlrd(fp)
+
+        sheet = sheets.popitem()[1]
+        assert sheet[0][0] == {'id': 'Name', 'name': 'Name', 'field': 'Name', 'sortable': True}
+        assert sheet[0][1] == {'id': 'Dup (1)', 'name': 'Dup (1)',
+                                'field': 'Dup (1)', 'sortable': True}
+        assert sheet[0][2] == {'id': 'Dup (2)', 'name': 'Dup (2)',
+                                'field': 'Dup (2)', 'sortable': True}
+        assert sheet[0][3] == {'id': 'Dup (3)', 'name': 'Dup (3)',
+                                'field': 'Dup (3)', 'sortable': True}
+        assert sheet[0][4] == {'id': 'Dup (4)', 'name': 'Dup (4)',
+                                'field': 'Dup (4)', 'sortable': True}
+        assert sheet[0][5] == {'id': 'Not Dup', 'name': 'Not Dup',
+                                'field': 'Not Dup', 'sortable': True}
+        assert sheet[1][0] == {'Name': 1.0, 'Dup (1)': 2.0, 'Dup (2)': 3.0,
+                            'Dup (3)': 4.0, 'Dup (4)': 5.0, 'Not Dup': 6.0}
+
+    def test_xlsx_xlrd_duplicate_fields_handle_naming(self):
+        with open(os.path.join(BASE, 'files', 'test_duplicate_uuid.xlsx')) as fp:
+            sheets = xlrd_tools.xlsx_xlrd(fp, max_iterations=10)
+
+        sheet = sheets.popitem()[1]
+
+        # this if you raise max iterations, it will be named dup (13) instead of dup (<uuid>)
+        assert sheet[0][1]['field'] != 'dup (13)'
+        # using `len` is an easy way to see the uuid has been appended
+        assert len(sheet[0][1]['field']) > 24