From 0d0dbea12d1ba5c7a0984b45898add15e33ad4fe Mon Sep 17 00:00:00 2001 From: Jesse Vickery Date: Tue, 7 May 2024 14:14:18 +0000 Subject: [PATCH 01/16] feat(logic): strip white space; - Strip white space from cell values when loading into DataStore. --- ckanext/xloader/loader.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index 85be3f34..997ba631 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -169,11 +169,17 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): try: with UnknownEncodingStream(csv_filepath, file_format, decoding_result, skip_rows=skip_rows) as stream: - stream.save(**save_args) + for row in stream: + for _index, _cell in enumerate(row): + row[_index] = str(_cell).strip() # strip white space around cell values + stream.save(**save_args) # have to save inside of the tabulator stream iterator except (EncodingError, UnicodeDecodeError): with Stream(csv_filepath, format=file_format, encoding=SINGLE_BYTE_ENCODING, skip_rows=skip_rows) as stream: - stream.save(**save_args) + for row in stream: + for _index, _cell in enumerate(row): + row[_index] = str(_cell).strip() # strip white space around cell values + stream.save(**save_args) # have to save inside of the tabulator stream iterator csv_filepath = f_write.name # datastore db connection @@ -441,6 +447,7 @@ def row_iterator(): logger.info('Saving chunk %s', i) for row in records: for column_index, column_name in enumerate(row): + row[column_name] = str(row[column_name]).strip() # strip white space around cell values if headers_dicts[column_index]['type'] in non_empty_types and row[column_name] == '': row[column_name] = None send_resource_to_datastore(resource_id, headers_dicts, records) From 616c948ce261955ddeb3c0ad56ba87efa8d13070 Mon Sep 17 00:00:00 2001 From: Jesse Vickery Date: Tue, 7 May 2024 14:35:57 +0000 Subject: [PATCH 02/16] fix(logic): strip white space for 
load table; - Fixed white space stripping for `load_table`. --- ckanext/xloader/loader.py | 1 - ckanext/xloader/parser.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index 997ba631..3f8df083 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -447,7 +447,6 @@ def row_iterator(): logger.info('Saving chunk %s', i) for row in records: for column_index, column_name in enumerate(row): - row[column_name] = str(row[column_name]).strip() # strip white space around cell values if headers_dicts[column_index]['type'] in non_empty_types and row[column_name] == '': row[column_name] = None send_resource_to_datastore(resource_id, headers_dicts, records) diff --git a/ckanext/xloader/parser.py b/ckanext/xloader/parser.py index 11e756cd..ee9d05eb 100644 --- a/ckanext/xloader/parser.py +++ b/ckanext/xloader/parser.py @@ -31,6 +31,8 @@ def convert_types(self, extended_rows): for cell_index, cell_value in enumerate(row): if cell_value is None: row[cell_index] = '' + cell_value = cell_value.strip() # strip white space around cell values + row[cell_index] = cell_value.strip() # strip white space around cell values if not cell_value: continue cell_type = self.types[cell_index] if self.types else None From 860ca9eb8a82e86a857d70ea62382a707418cfae Mon Sep 17 00:00:00 2001 From: Jesse Vickery Date: Tue, 7 May 2024 14:46:53 +0000 Subject: [PATCH 03/16] fix(logic): strip white space for load table; - Fixed white space stripping for `load_table`. 
--- ckanext/xloader/parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ckanext/xloader/parser.py b/ckanext/xloader/parser.py index ee9d05eb..b3ce1772 100644 --- a/ckanext/xloader/parser.py +++ b/ckanext/xloader/parser.py @@ -31,8 +31,8 @@ def convert_types(self, extended_rows): for cell_index, cell_value in enumerate(row): if cell_value is None: row[cell_index] = '' - cell_value = cell_value.strip() # strip white space around cell values - row[cell_index] = cell_value.strip() # strip white space around cell values + cell_value = str(cell_value).strip() # strip white space around cell values + row[cell_index] = str(cell_value).strip() # strip white space around cell values if not cell_value: continue cell_type = self.types[cell_index] if self.types else None From 88f96a866761aaeb90a03cdfa75655fc4bfc4376 Mon Sep 17 00:00:00 2001 From: Jesse Vickery Date: Wed, 8 May 2024 13:53:38 +0000 Subject: [PATCH 04/16] fix(logic): strip white space; - Condition stripping on `str` type. --- ckanext/xloader/loader.py | 10 ++++++++-- ckanext/xloader/parser.py | 7 +++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index 3f8df083..a38eb107 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -171,14 +171,20 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): skip_rows=skip_rows) as stream: for row in stream: for _index, _cell in enumerate(row): - row[_index] = str(_cell).strip() # strip white space around cell values + if isinstance(_cell, str): + # strip white space around cell values + #TODO: condition behind DataDictionary option?? 
+ row[_index] = _cell.strip() stream.save(**save_args) # have to save inside of the tabulator stream iterator except (EncodingError, UnicodeDecodeError): with Stream(csv_filepath, format=file_format, encoding=SINGLE_BYTE_ENCODING, skip_rows=skip_rows) as stream: for row in stream: for _index, _cell in enumerate(row): - row[_index] = str(_cell).strip() # strip white space around cell values + if isinstance(_cell, str): + # strip white space around cell values + #TODO: condition behind DataDictionary option?? + row[_index] = _cell.strip() stream.save(**save_args) # have to save inside of the tabulator stream iterator csv_filepath = f_write.name diff --git a/ckanext/xloader/parser.py b/ckanext/xloader/parser.py index b3ce1772..890e8776 100644 --- a/ckanext/xloader/parser.py +++ b/ckanext/xloader/parser.py @@ -31,8 +31,11 @@ def convert_types(self, extended_rows): for cell_index, cell_value in enumerate(row): if cell_value is None: row[cell_index] = '' - cell_value = str(cell_value).strip() # strip white space around cell values - row[cell_index] = str(cell_value).strip() # strip white space around cell values + if isinstance(cell_value, str): + # strip white space around cell values + #TODO: condition behind DataDictionary option?? + cell_value = cell_value.strip() + row[cell_index] = cell_value.strip() if not cell_value: continue cell_type = self.types[cell_index] if self.types else None From 21a1eced97557c7ea68e994a64662617357160aa Mon Sep 17 00:00:00 2001 From: Jesse Vickery Date: Wed, 8 May 2024 19:00:22 +0000 Subject: [PATCH 05/16] fix(tests): new code for tests; - Align test with new strip code. - Write headers to stream for `load_csv`. 
--- ckanext/xloader/loader.py | 2 ++ ckanext/xloader/tests/test_loader.py | 10 +++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index daddc534..f5291b14 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -174,6 +174,7 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): try: with UnknownEncodingStream(csv_filepath, file_format, decoding_result, skip_rows=skip_rows) as stream: + stream.save(**save_args) # have to save headers for row in stream: for _index, _cell in enumerate(row): if isinstance(_cell, str): @@ -184,6 +185,7 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): except (EncodingError, UnicodeDecodeError): with Stream(csv_filepath, format=file_format, encoding=SINGLE_BYTE_ENCODING, skip_rows=skip_rows) as stream: + stream.save(**save_args) # have to save headers for row in stream: for _index, _cell in enumerate(row): if isinstance(_cell, str): diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py index a8b23b53..3eacd5a0 100644 --- a/ckanext/xloader/tests/test_loader.py +++ b/ckanext/xloader/tests/test_loader.py @@ -1049,7 +1049,7 @@ def test_boston_311(self, Session): u"", u"ONTIME", u"Open", - u" ", + u"", # " " transforms to "" u"Street Light Outages", u"Public Works Department", u"Street Lights", @@ -1081,14 +1081,14 @@ def test_boston_311(self, Session): u"", u"ONTIME", u"Open", - u" ", + u"", # " " transforms to "" u"Graffiti Removal", u"Property Management", u"Graffiti", u"Graffiti Removal", u"PROP_GRAF_GraffitiRemoval", u"PROP", - u" https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg", + u"https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg", # strip white spaces u"", u"522 Saratoga St East Boston MA 02128", Decimal("1"), @@ -1113,14 +1113,14 @@ def test_boston_311(self, 
Session): u"", u"ONTIME", u"Open", - u" ", + u"", # " " transforms to "" u"Graffiti Removal", u"Property Management", u"Graffiti", u"Graffiti Removal", u"PROP_GRAF_GraffitiRemoval", u"PROP", - u" https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg", + u"https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg", # strip white spaces u"", u"965 Bennington St East Boston MA 02128", Decimal("1"), From a6ab0a045083cfa430d6542f2397f2b1d78bd731 Mon Sep 17 00:00:00 2001 From: Jesse Vickery Date: Sun, 12 May 2024 12:46:06 +0000 Subject: [PATCH 06/16] fix(logic): load csv white space; - Extended tabulator stream iterator to strip white space from cell values. --- ckanext/xloader/loader.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index f5291b14..a4c99cc0 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -174,25 +174,27 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): try: with UnknownEncodingStream(csv_filepath, file_format, decoding_result, skip_rows=skip_rows) as stream: - stream.save(**save_args) # have to save headers - for row in stream: - for _index, _cell in enumerate(row): - if isinstance(_cell, str): - # strip white space around cell values - #TODO: condition behind DataDictionary option?? 
- row[_index] = _cell.strip() - stream.save(**save_args) # have to save inside of the tabulator stream iterator + super_iter = stream.iter + def strip_white_space_iter(): + for row in super_iter(): + for _index, _cell in enumerate(row): + if isinstance(_cell, str): + row[_index] = _cell.strip() + yield row + stream.iter = strip_white_space_iter + stream.save(**save_args) except (EncodingError, UnicodeDecodeError): with Stream(csv_filepath, format=file_format, encoding=SINGLE_BYTE_ENCODING, skip_rows=skip_rows) as stream: - stream.save(**save_args) # have to save headers - for row in stream: - for _index, _cell in enumerate(row): - if isinstance(_cell, str): - # strip white space around cell values - #TODO: condition behind DataDictionary option?? - row[_index] = _cell.strip() - stream.save(**save_args) # have to save inside of the tabulator stream iterator + super_iter = stream.iter + def strip_white_space_iter(): + for row in super_iter(): + for _index, _cell in enumerate(row): + if isinstance(_cell, str): + row[_index] = _cell.strip() + yield row + stream.iter = strip_white_space_iter + stream.save(**save_args) csv_filepath = f_write.name # datastore db connection From 54f87e0c7c1a0eca2031cb20922936329483c030 Mon Sep 17 00:00:00 2001 From: Jesse Vickery Date: Tue, 14 May 2024 15:49:42 +0000 Subject: [PATCH 07/16] feat(logic): added `strip_extra_white` info field; - Added `strip_extra_white` info field and form fields. - Added validator for `strip_extra_white`. - Used `strip_extra_white` to control stripping white space. 
--- ckanext/xloader/loader.py | 64 ++++++++++--------- ckanext/xloader/parser.py | 13 ++-- ckanext/xloader/plugin.py | 17 ++++- .../datastore/snippets/dictionary_form.html | 11 ++++ ckanext/xloader/tests/test_loader.py | 12 ++-- ckanext/xloader/validators.py | 12 ++++ 6 files changed, 87 insertions(+), 42 deletions(-) create mode 100644 ckanext/xloader/templates/datastore/snippets/dictionary_form.html create mode 100644 ckanext/xloader/validators.py diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index a4c99cc0..abe03f41 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -170,33 +170,6 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): logger.info('Ensuring character coding is UTF8') f_write = tempfile.NamedTemporaryFile(suffix=file_format, delete=False) try: - save_args = {'target': f_write.name, 'format': 'csv', 'encoding': 'utf-8', 'delimiter': delimiter} - try: - with UnknownEncodingStream(csv_filepath, file_format, decoding_result, - skip_rows=skip_rows) as stream: - super_iter = stream.iter - def strip_white_space_iter(): - for row in super_iter(): - for _index, _cell in enumerate(row): - if isinstance(_cell, str): - row[_index] = _cell.strip() - yield row - stream.iter = strip_white_space_iter - stream.save(**save_args) - except (EncodingError, UnicodeDecodeError): - with Stream(csv_filepath, format=file_format, encoding=SINGLE_BYTE_ENCODING, - skip_rows=skip_rows) as stream: - super_iter = stream.iter - def strip_white_space_iter(): - for row in super_iter(): - for _index, _cell in enumerate(row): - if isinstance(_cell, str): - row[_index] = _cell.strip() - yield row - stream.iter = strip_white_space_iter - stream.save(**save_args) - csv_filepath = f_write.name - # datastore db connection engine = get_write_engine() @@ -238,11 +211,40 @@ def strip_white_space_iter(): else: fields = [ {'id': header_name, - 'type': 'text'} + 'type': 'text',} for header_name in headers] logger.info('Fields: 
%s', fields) + save_args = {'target': f_write.name, 'format': 'csv', 'encoding': 'utf-8', 'delimiter': delimiter} + try: + with UnknownEncodingStream(csv_filepath, file_format, decoding_result, + skip_rows=skip_rows) as stream: + super_iter = stream.iter + def strip_white_space_iter(): + for row in super_iter(): + for _index, _cell in enumerate(row): + # only strip white space if strip_extra_white is True + if fields[_index].get('info', {}).get('strip_extra_white', True) and isinstance(_cell, str): + row[_index] = _cell.strip() + yield row + stream.iter = strip_white_space_iter + stream.save(**save_args) + except (EncodingError, UnicodeDecodeError): + with Stream(csv_filepath, format=file_format, encoding=SINGLE_BYTE_ENCODING, + skip_rows=skip_rows) as stream: + super_iter = stream.iter + def strip_white_space_iter(): + for row in super_iter(): + for _index, _cell in enumerate(row): + # only strip white space if strip_extra_white is True + if fields[_index].get('info', {}).get('strip_extra_white', True) and isinstance(_cell, str): + row[_index] = _cell.strip() + yield row + stream.iter = strip_white_space_iter + stream.save(**save_args) + csv_filepath = f_write.name + # Create table from ckan import model context = {'model': model, 'ignore_auth': True} @@ -401,6 +403,7 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None): TYPES, TYPE_MAPPING = get_types() types = type_guess(stream.sample[1:], types=TYPES, strict=True) + info = [] # override with types user requested if existing_info: @@ -411,9 +414,12 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None): 'timestamp': datetime.datetime, }.get(existing_info.get(h, {}).get('type_override'), t) for t, h in zip(types, headers)] + for h in headers: + info.append(existing_info.get(h, {})) + headers = [header.strip()[:MAX_COLUMN_LENGTH] for header in headers if header.strip()] - type_converter = TypeConverter(types=types) + type_converter = TypeConverter(types=types, 
info=info) with UnknownEncodingStream(table_filepath, file_format, decoding_result, skip_rows=skip_rows, diff --git a/ckanext/xloader/parser.py b/ckanext/xloader/parser.py index 890e8776..d27cd0ce 100644 --- a/ckanext/xloader/parser.py +++ b/ckanext/xloader/parser.py @@ -18,8 +18,9 @@ class TypeConverter: as desired. """ - def __init__(self, types=None): + def __init__(self, types=None, info=None): self.types = types + self.info = info def convert_types(self, extended_rows): """ Try converting cells to numbers or timestamps if applicable. @@ -31,11 +32,11 @@ def convert_types(self, extended_rows): for cell_index, cell_value in enumerate(row): if cell_value is None: row[cell_index] = '' - if isinstance(cell_value, str): - # strip white space around cell values - #TODO: condition behind DataDictionary option?? - cell_value = cell_value.strip() - row[cell_index] = cell_value.strip() + if self.info: + # only strip white space if strip_extra_white is True + if self.info[cell_index].get('strip_extra_white', True) and isinstance(cell_value, str): + cell_value = cell_value.strip() + row[cell_index] = cell_value.strip() if not cell_value: continue cell_type = self.types[cell_index] if self.types else None diff --git a/ckanext/xloader/plugin.py b/ckanext/xloader/plugin.py index e0ce027e..6b22d8d8 100644 --- a/ckanext/xloader/plugin.py +++ b/ckanext/xloader/plugin.py @@ -4,12 +4,13 @@ from ckan import plugins from ckan.plugins import toolkit +from ckanext.datastore.interfaces import IDataDictionaryForm from ckan.model.domain_object import DomainObjectOperation from ckan.model.resource import Resource from ckan.model.package import Package -from . import action, auth, helpers as xloader_helpers, utils +from . 
import action, auth, helpers as xloader_helpers, utils, validators from ckanext.xloader.utils import XLoaderFormats try: @@ -34,6 +35,8 @@ class xloaderPlugin(plugins.SingletonPlugin): plugins.implements(plugins.IResourceController, inherit=True) plugins.implements(plugins.IClick) plugins.implements(plugins.IBlueprint) + plugins.implements(plugins.IValidators) + plugins.implements(IDataDictionaryForm, inherit=True) # IClick def get_commands(self): @@ -207,6 +210,18 @@ def get_helpers(self): "is_resource_supported_by_xloader": xloader_helpers.is_resource_supported_by_xloader, } + # IValidators + + def get_validators(self): + return {'xloader_datastore_fields_validator': validators.datastore_fields_validator} + + # IDataDictionaryForm + + def update_datastore_create_schema(self, schema): + info_validator = toolkit.get_validator('xloader_datastore_fields_validator') + schema['fields']['info'] = [info_validator] + schema['fields']['info'] + return schema + def _should_remove_unsupported_resource_from_datastore(res_dict): if not toolkit.asbool(toolkit.config.get('ckanext.xloader.clean_datastore_tables', False)): diff --git a/ckanext/xloader/templates/datastore/snippets/dictionary_form.html b/ckanext/xloader/templates/datastore/snippets/dictionary_form.html new file mode 100644 index 00000000..1a91b00f --- /dev/null +++ b/ckanext/xloader/templates/datastore/snippets/dictionary_form.html @@ -0,0 +1,11 @@ +{% ckan_extends %} +{% import 'macros/form.html' as form %} + +{% block additional_fields %} + {{ super() }} + {{ form.select('info__' ~ position ~ '__strip_extra_white', + label=_('Strip Extra Leading and Trailing White Space'), options=[ + {'text': 'Yes', 'value': true}, + {'text': 'No', 'value': false}, + ], selected=field.get('info', {}).get('strip_extra_white')) }} +{% endblock %} diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py index 3eacd5a0..51334543 100644 --- a/ckanext/xloader/tests/test_loader.py +++ 
b/ckanext/xloader/tests/test_loader.py @@ -227,7 +227,7 @@ def test_boston_311(self, Session): None, u"ONTIME", u"Open", - u" ", + None, # " " transforms to None u"Street Light Outages", u"Public Works Department", u"Street Lights", @@ -259,14 +259,14 @@ def test_boston_311(self, Session): None, u"ONTIME", u"Open", - u" ", + None, # " " transforms to None u"Graffiti Removal", u"Property Management", u"Graffiti", u"Graffiti Removal", u"PROP_GRAF_GraffitiRemoval", u"PROP", - u" https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg", + u"https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg", # strip white spaces None, u"522 Saratoga St East Boston MA 02128", u"1", @@ -291,14 +291,14 @@ def test_boston_311(self, Session): None, u"ONTIME", u"Open", - u" ", + None, # " " transforms to None u"Graffiti Removal", u"Property Management", u"Graffiti", u"Graffiti Removal", u"PROP_GRAF_GraffitiRemoval", u"PROP", - u" https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg", + u"https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg", # strip white spaces None, u"965 Bennington St East Boston MA 02128", u"1", @@ -1088,7 +1088,7 @@ def test_boston_311(self, Session): u"Graffiti Removal", u"PROP_GRAF_GraffitiRemoval", u"PROP", - u"https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg", # strip white spaces + u"https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg", # strip white spaces u"", u"522 Saratoga St East Boston MA 02128", Decimal("1"), diff --git a/ckanext/xloader/validators.py b/ckanext/xloader/validators.py new file mode 100644 index 00000000..a14f71f3 --- /dev/null +++ b/ckanext/xloader/validators.py @@ -0,0 +1,12 @@ +from ckan.plugins.toolkit import asbool + + +def datastore_fields_validator(value, context): + if 
'strip_extra_white' not in value: + # default to True + value['strip_extra_white'] = True + + # bool value for strip_extra_white + value['strip_extra_white'] = asbool(value['strip_extra_white']) + + return value From 50080ea14d725385bd25cbb2ff395b249f6599d0 Mon Sep 17 00:00:00 2001 From: Jesse Vickery Date: Tue, 14 May 2024 16:52:23 +0000 Subject: [PATCH 08/16] feat(logic): added `strip_extra_white` field; - Added `strip_extra_white` field and form fields. - Used `strip_extra_white` to control stripping white space. --- ckanext/xloader/loader.py | 26 ++++++++++++------- ckanext/xloader/parser.py | 8 +++--- ckanext/xloader/plugin.py | 19 +++++++------- .../datastore/snippets/dictionary_form.html | 4 +-- ckanext/xloader/validators.py | 12 --------- 5 files changed, 33 insertions(+), 36 deletions(-) delete mode 100644 ckanext/xloader/validators.py diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index abe03f41..817b55c1 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -177,10 +177,13 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): existing = datastore_resource_exists(resource_id) existing_info = {} if existing: - existing_fields = existing.get('fields', []) + ds_info = p.toolkit.get_action('datastore_info')({'ignore_auth': True}, {'id': resource_id}) + existing_fields = ds_info.get('fields', []) existing_info = dict((f['id'], f['info']) for f in existing_fields if 'info' in f) + existing_fields_by_headers = dict((f['id'], f) + for f in existing_fields) # Column types are either set (overridden) in the Data Dictionary page # or default to text type (which is robust) @@ -195,6 +198,7 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): for f in fields: if f['id'] in existing_info: f['info'] = existing_info[f['id']] + f['strip_extra_white'] = existing_fields_by_headers[f['id']].get('strip_extra_white', True) ''' Delete or truncate existing datastore table before proceeding, 
@@ -211,7 +215,8 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): else: fields = [ {'id': header_name, - 'type': 'text',} + 'type': 'text', + 'strip_extra_white': True,} for header_name in headers] logger.info('Fields: %s', fields) @@ -225,7 +230,7 @@ def strip_white_space_iter(): for row in super_iter(): for _index, _cell in enumerate(row): # only strip white space if strip_extra_white is True - if fields[_index].get('info', {}).get('strip_extra_white', True) and isinstance(_cell, str): + if fields[_index].get('strip_extra_white', True) and isinstance(_cell, str): row[_index] = _cell.strip() yield row stream.iter = strip_white_space_iter @@ -238,7 +243,7 @@ def strip_white_space_iter(): for row in super_iter(): for _index, _cell in enumerate(row): # only strip white space if strip_extra_white is True - if fields[_index].get('info', {}).get('strip_extra_white', True) and isinstance(_cell, str): + if fields[_index].get('strip_extra_white', True) and isinstance(_cell, str): row[_index] = _cell.strip() yield row stream.iter = strip_white_space_iter @@ -388,10 +393,13 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None): existing = datastore_resource_exists(resource_id) existing_info = None if existing: - existing_fields = existing.get('fields', []) + ds_info = p.toolkit.get_action('datastore_info')({'ignore_auth': True}, {'id': resource_id}) + existing_fields = ds_info.get('fields', []) existing_info = dict( (f['id'], f['info']) for f in existing_fields if 'info' in f) + existing_fields_by_headers = dict((f['id'], f) + for f in existing_fields) # Some headers might have been converted from strings to floats and such. 
headers = encode_headers(headers) @@ -403,7 +411,7 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None): TYPES, TYPE_MAPPING = get_types() types = type_guess(stream.sample[1:], types=TYPES, strict=True) - info = [] + fields = [] # override with types user requested if existing_info: @@ -415,11 +423,10 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None): }.get(existing_info.get(h, {}).get('type_override'), t) for t, h in zip(types, headers)] for h in headers: - info.append(existing_info.get(h, {})) - + fields.append(existing_fields_by_headers.get(h, {})) headers = [header.strip()[:MAX_COLUMN_LENGTH] for header in headers if header.strip()] - type_converter = TypeConverter(types=types, info=info) + type_converter = TypeConverter(types=types, fields=fields) with UnknownEncodingStream(table_filepath, file_format, decoding_result, skip_rows=skip_rows, @@ -440,6 +447,7 @@ def row_iterator(): for h in headers_dicts: if h['id'] in existing_info: h['info'] = existing_info[h['id']] + h['strip_extra_white'] = existing_fields_by_headers[h['id']].get('strip_extra_white', True) # create columns with types user requested type_override = existing_info[h['id']].get('type_override') if type_override in list(_TYPE_MAPPING.values()): diff --git a/ckanext/xloader/parser.py b/ckanext/xloader/parser.py index d27cd0ce..c587f187 100644 --- a/ckanext/xloader/parser.py +++ b/ckanext/xloader/parser.py @@ -18,9 +18,9 @@ class TypeConverter: as desired. """ - def __init__(self, types=None, info=None): + def __init__(self, types=None, fields=None): self.types = types - self.info = info + self.fields = fields def convert_types(self, extended_rows): """ Try converting cells to numbers or timestamps if applicable. 
@@ -32,9 +32,9 @@ def convert_types(self, extended_rows): for cell_index, cell_value in enumerate(row): if cell_value is None: row[cell_index] = '' - if self.info: + if self.fields: # only strip white space if strip_extra_white is True - if self.info[cell_index].get('strip_extra_white', True) and isinstance(cell_value, str): + if self.fields[cell_index].get('strip_extra_white', True) and isinstance(cell_value, str): cell_value = cell_value.strip() row[cell_index] = cell_value.strip() if not cell_value: diff --git a/ckanext/xloader/plugin.py b/ckanext/xloader/plugin.py index 6b22d8d8..051185e6 100644 --- a/ckanext/xloader/plugin.py +++ b/ckanext/xloader/plugin.py @@ -10,7 +10,7 @@ from ckan.model.resource import Resource from ckan.model.package import Package -from . import action, auth, helpers as xloader_helpers, utils, validators +from . import action, auth, helpers as xloader_helpers, utils from ckanext.xloader.utils import XLoaderFormats try: @@ -35,7 +35,6 @@ class xloaderPlugin(plugins.SingletonPlugin): plugins.implements(plugins.IResourceController, inherit=True) plugins.implements(plugins.IClick) plugins.implements(plugins.IBlueprint) - plugins.implements(plugins.IValidators) plugins.implements(IDataDictionaryForm, inherit=True) # IClick @@ -210,18 +209,20 @@ def get_helpers(self): "is_resource_supported_by_xloader": xloader_helpers.is_resource_supported_by_xloader, } - # IValidators - - def get_validators(self): - return {'xloader_datastore_fields_validator': validators.datastore_fields_validator} - # IDataDictionaryForm def update_datastore_create_schema(self, schema): - info_validator = toolkit.get_validator('xloader_datastore_fields_validator') - schema['fields']['info'] = [info_validator] + schema['fields']['info'] + default = toolkit.get_validator('default') + boolean_validator = toolkit.get_validator('boolean_validator') + to_datastore_plugin_data = toolkit.get_validator('to_datastore_plugin_data') + schema['fields']['strip_extra_white'] = 
[default(True), boolean_validator, to_datastore_plugin_data('xloader')] return schema + def update_datastore_info_field(self, field, plugin_data): + # expose all our non-secret plugin data in the field + field.update(plugin_data.get('xloader', {})) + return field + def _should_remove_unsupported_resource_from_datastore(res_dict): if not toolkit.asbool(toolkit.config.get('ckanext.xloader.clean_datastore_tables', False)): diff --git a/ckanext/xloader/templates/datastore/snippets/dictionary_form.html b/ckanext/xloader/templates/datastore/snippets/dictionary_form.html index 1a91b00f..afdf80ff 100644 --- a/ckanext/xloader/templates/datastore/snippets/dictionary_form.html +++ b/ckanext/xloader/templates/datastore/snippets/dictionary_form.html @@ -3,9 +3,9 @@ {% block additional_fields %} {{ super() }} - {{ form.select('info__' ~ position ~ '__strip_extra_white', + {{ form.select('fields__' ~ position ~ '__strip_extra_white', label=_('Strip Extra Leading and Trailing White Space'), options=[ {'text': 'Yes', 'value': true}, {'text': 'No', 'value': false}, - ], selected=field.get('info', {}).get('strip_extra_white')) }} + ], selected=field.get('strip_extra_white')) }} {% endblock %} diff --git a/ckanext/xloader/validators.py b/ckanext/xloader/validators.py deleted file mode 100644 index a14f71f3..00000000 --- a/ckanext/xloader/validators.py +++ /dev/null @@ -1,12 +0,0 @@ -from ckan.plugins.toolkit import asbool - - -def datastore_fields_validator(value, context): - if 'strip_extra_white' not in value: - # default to True - value['strip_extra_white'] = True - - # bool value for strip_extra_white - value['strip_extra_white'] = asbool(value['strip_extra_white']) - - return value From 116c29fa92897f0b190c763e4bbee3e41f6bf6ef Mon Sep 17 00:00:00 2001 From: Jesse Vickery Date: Tue, 14 May 2024 17:20:43 +0000 Subject: [PATCH 09/16] fix(logic): minor logic fixes; - Minor logic fixes for the new `strip_extra_white` field. 
--- ckanext/xloader/loader.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index 817b55c1..30940828 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -230,7 +230,7 @@ def strip_white_space_iter(): for row in super_iter(): for _index, _cell in enumerate(row): # only strip white space if strip_extra_white is True - if fields[_index].get('strip_extra_white', True) and isinstance(_cell, str): + if fields and fields[_index].get('strip_extra_white', True) and isinstance(_cell, str): row[_index] = _cell.strip() yield row stream.iter = strip_white_space_iter @@ -243,7 +243,7 @@ def strip_white_space_iter(): for row in super_iter(): for _index, _cell in enumerate(row): # only strip white space if strip_extra_white is True - if fields[_index].get('strip_extra_white', True) and isinstance(_cell, str): + if fields and fields[_index].get('strip_extra_white', True) and isinstance(_cell, str): row[_index] = _cell.strip() yield row stream.iter = strip_white_space_iter @@ -424,6 +424,10 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None): for t, h in zip(types, headers)] for h in headers: fields.append(existing_fields_by_headers.get(h, {})) + else: + # default strip_extra_white + for h in headers: + fields.append({'strip_extra_white': True}) headers = [header.strip()[:MAX_COLUMN_LENGTH] for header in headers if header.strip()] type_converter = TypeConverter(types=types, fields=fields) @@ -452,6 +456,10 @@ def row_iterator(): type_override = existing_info[h['id']].get('type_override') if type_override in list(_TYPE_MAPPING.values()): h['type'] = type_override + else: + # default strip_extra_white + for h in headers_dicts: + h['strip_extra_white'] = True logger.info('Determined headers and types: %s', headers_dicts) From 43b9f94cc9284ae0bab2276b7f6deb4cc5c9ca8f Mon Sep 17 00:00:00 2001 From: Jesse Vickery Date: Mon, 15 Jul 2024 15:34:40 +0000 
Subject: [PATCH 10/16] feat(tests,i18n): updated tests; - Updated various tests. - Added more gettext. --- .../datastore/snippets/dictionary_form.html | 4 +- .../tests/samples/boston_311_sample.csv | 8 +- ckanext/xloader/tests/test_loader.py | 507 +++++++++--------- 3 files changed, 260 insertions(+), 259 deletions(-) diff --git a/ckanext/xloader/templates/datastore/snippets/dictionary_form.html b/ckanext/xloader/templates/datastore/snippets/dictionary_form.html index afdf80ff..02919354 100644 --- a/ckanext/xloader/templates/datastore/snippets/dictionary_form.html +++ b/ckanext/xloader/templates/datastore/snippets/dictionary_form.html @@ -5,7 +5,7 @@ {{ super() }} {{ form.select('fields__' ~ position ~ '__strip_extra_white', label=_('Strip Extra Leading and Trailing White Space'), options=[ - {'text': 'Yes', 'value': true}, - {'text': 'No', 'value': false}, + {'text': _('Yes'), 'value': true}, + {'text': _('No'), 'value': false}, ], selected=field.get('strip_extra_white')) }} {% endblock %} diff --git a/ckanext/xloader/tests/samples/boston_311_sample.csv b/ckanext/xloader/tests/samples/boston_311_sample.csv index 83e0d5f2..e3a7e5be 100644 --- a/ckanext/xloader/tests/samples/boston_311_sample.csv +++ b/ckanext/xloader/tests/samples/boston_311_sample.csv @@ -1,4 +1,4 @@ -CASE_ENQUIRY_ID,open_dt,target_dt,closed_dt,OnTime_Status,CASE_STATUS,CLOSURE_REASON,CASE_TITLE,SUBJECT,REASON,TYPE,QUEUE,Department,SubmittedPhoto,ClosedPhoto,Location,Fire_district,pwd_district,city_council_district,police_district,neighborhood,neighborhood_services_district,ward,precinct,LOCATION_STREET_NAME,LOCATION_ZIPCODE,Latitude,Longitude,Source -101002153891,2017-07-06 23:38:43,2017-07-21 08:30:00,,ONTIME,Open, ,Street Light Outages,Public Works Department,Street Lights,Street Light Outages,PWDx_Street Light Outages,PWDx,,,480 Harvard St Dorchester MA 02124,8,07,4,B3,Greater Mattapan,9,Ward 14,1411,480 Harvard St,02124,42.288,-71.0927,Citizens Connect App -101002153890,2017-07-06 
23:29:13,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg,,522 Saratoga St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0110,522 Saratoga St,02128,42.3807,-71.0259,Citizens Connect App -101002153889,2017-07-06 23:24:20,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg,,965 Bennington St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0112,965 Bennington St,02128,42.386,-71.008,Citizens Connect App +CASE_ENQUIRY_ID,open_dt,target_dt,closed_dt,OnTime_Status,CASE_STATUS,CLOSURE_REASON,CASE_TITLE,SUBJECT,REASON,TYPE,QUEUE,Department,SubmittedPhoto,ClosedPhoto,Location,Fire_district,pwd_district,city_council_district,police_district,neighborhood,neighborhood_services_district,ward,precinct,LOCATION_STREET_NAME,LOCATION_ZIPCODE,Latitude,Longitude,Source +101002153891,2017-07-06 23:38:43,2017-07-21 08:30:00,,ONTIME,Open, ,Street Light Outages,Public Works Department ,Street Lights,Street Light Outages,PWDx_Street Light Outages,PWDx,,,480 Harvard St Dorchester MA 02124,8,07,4,B3,Greater Mattapan,9,Ward 14,1411,480 Harvard St,02124,42.288,-71.0927,Citizens Connect App +101002153890,2017-07-06 23:29:13,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg,,522 Saratoga St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0110,522 Saratoga St,02128,42.3807,-71.0259,Citizens Connect App +101002153889,2017-07-06 23:24:20,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti 
Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg,,965 Bennington St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0112,965 Bennington St,02128,42.386,-71.008,Citizens Connect App diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py index 51334543..ec0ee8e4 100644 --- a/ckanext/xloader/tests/test_loader.py +++ b/ckanext/xloader/tests/test_loader.py @@ -102,6 +102,20 @@ def test_simple(self, Session): logger=logger, ) + records = self._get_records(Session, resource_id) + print(self._get_column_names(Session, resource_id)) + assert self._get_column_names(Session, resource_id) == [ + u"_id", + u"_full_text", + u"date", + u"temperature", + u"place", + ] + print(self._get_column_types(Session, resource_id)) + assert self._get_column_types(Session, resource_id) == [ + u"int4", + u"tsvector", + ] + [u"text"] * (len(records[0]) - 1) assert self._get_records( Session, resource_id, limit=1, exclude_full_text_column=False ) == [ @@ -113,7 +127,8 @@ def test_simple(self, Session): u"Galway", ) ] - assert self._get_records(Session, resource_id) == [ + print(records) + assert records == [ (1, u"2011-01-01", u"1", u"Galway"), (2, u"2011-01-02", u"-1", u"Galway"), (3, u"2011-01-03", u"0", u"Galway"), @@ -121,20 +136,6 @@ def test_simple(self, Session): (5, None, None, u"Berkeley"), (6, u"2011-01-03", u"5", None), ] - assert self._get_column_names(Session, resource_id) == [ - u"_id", - u"_full_text", - u"date", - u"temperature", - u"place", - ] - assert self._get_column_types(Session, resource_id) == [ - u"int4", - u"tsvector", - u"text", - u"text", - u"text", - ] def test_simple_with_indexing(self, Session): csv_filepath = get_sample_filepath("simple.csv") @@ -217,6 +218,45 @@ def test_boston_311(self, Session): ) records = self._get_records(Session, resource_id) + print(self._get_column_names(Session, resource_id)) + assert 
self._get_column_names(Session, resource_id) == [ + u"_id", + u"_full_text", + u"CASE_ENQUIRY_ID", + u"open_dt", + u"target_dt", + u"closed_dt", + u"OnTime_Status", + u"CASE_STATUS", + u"CLOSURE_REASON", + u"CASE_TITLE", + u"SUBJECT", + u"REASON", + u"TYPE", + u"QUEUE", + u"Department", + u"SubmittedPhoto", + u"ClosedPhoto", + u"Location", + u"Fire_district", + u"pwd_district", + u"city_council_district", + u"police_district", + u"neighborhood", + u"neighborhood_services_district", + u"ward", + u"precinct", + u"LOCATION_STREET_NAME", + u"LOCATION_ZIPCODE", + u"Latitude", + u"Longitude", + u"Source", + ] # noqa + print(self._get_column_types(Session, resource_id)) + assert self._get_column_types(Session, resource_id) == [ + u"int4", + u"tsvector", + ] + [u"text"] * (len(records[0]) - 1) print(records) assert records == [ ( @@ -229,7 +269,7 @@ def test_boston_311(self, Session): u"Open", None, # " " transforms to None u"Street Light Outages", - u"Public Works Department", + u"Public Works Department", # " " trailing whitespace gets trimmed u"Street Lights", u"Street Light Outages", u"PWDx_Street Light Outages", @@ -316,45 +356,6 @@ def test_boston_311(self, Session): u"Citizens Connect App", ), ] # noqa - print(self._get_column_names(Session, resource_id)) - assert self._get_column_names(Session, resource_id) == [ - u"_id", - u"_full_text", - u"CASE_ENQUIRY_ID", - u"open_dt", - u"target_dt", - u"closed_dt", - u"OnTime_Status", - u"CASE_STATUS", - u"CLOSURE_REASON", - u"CASE_TITLE", - u"SUBJECT", - u"REASON", - u"TYPE", - u"QUEUE", - u"Department", - u"SubmittedPhoto", - u"ClosedPhoto", - u"Location", - u"Fire_district", - u"pwd_district", - u"city_council_district", - u"police_district", - u"neighborhood", - u"neighborhood_services_district", - u"ward", - u"precinct", - u"LOCATION_STREET_NAME", - u"LOCATION_ZIPCODE", - u"Latitude", - u"Longitude", - u"Source", - ] # noqa - print(self._get_column_types(Session, resource_id)) - assert self._get_column_types(Session, 
resource_id) == [ - u"int4", - u"tsvector", - ] + [u"text"] * (len(records[0]) - 1) def test_brazilian(self, Session): csv_filepath = get_sample_filepath("brazilian_sample.csv") @@ -368,105 +369,6 @@ def test_brazilian(self, Session): ) records = self._get_records(Session, resource_id) - print(records) - assert records[0] == ( - 1, - u"01/01/1996 12:00:00 AM", - u"1100015", - u"ALTA FLORESTA D'OESTE", - u"RO", - None, - u"128", - u"0", - u"8", - u"119", - u"1", - u"0", - u"3613", - u"3051", - u"130", - u"7", - u"121", - u"3716", - u"3078", - u"127", - u"7", - None, - None, - None, - None, - u"6794", - u"5036", - u"1758", - None, - None, - None, - None, - None, - None, - u"337", - u"0.26112759", - u"0.17210683", - u"0.43323442", - u"0.13353115", - u"24.833692447908199", - None, - None, - u"22.704964", - u"67.080006197818605", - u"65.144188573097907", - u"74.672390253375497", - u"16.7913561569619", - u"19.4894563570641", - u"8.649237411458509", - u"7.60165422117368", - u"11.1540090366186", - u"17.263407056738099", - u"8.5269823", - u"9.2213373", - u"5.3085136", - u"52.472769803217503", - None, - None, - None, - None, - None, - None, - u"25.0011414302354", - u"22.830887000000001", - u"66.8150490097632", - u"64.893674212235595", - u"74.288246611754104", - u"17.0725384713319", - u"19.8404105332814", - u"8.856561911292371", - u"7.74275834336647", - u"11.357671741889", - u"17.9410577459881", - u"8.3696527", - u"8.9979973", - u"5.0570836", - u"53.286314230720798", - None, - None, - None, - None, - None, - u"122988", - None, - u"10.155015000000001", - u"14.826086999999999", - u"11.671533", - u"9.072917", - None, - None, - None, - None, - None, - None, - None, - None, - ) # noqa print(self._get_column_names(Session, resource_id)) assert self._get_column_names(Session, resource_id) == [ u"_id", @@ -572,6 +474,105 @@ def test_brazilian(self, Session): u"int4", u"tsvector", ] + [u"text"] * (len(records[0]) - 1) + print(records) + assert records[0] == ( + 1, + u"01/01/1996 
12:00:00 AM", + u"1100015", + u"ALTA FLORESTA D'OESTE", + u"RO", + None, + u"128", + u"0", + u"8", + u"119", + u"1", + u"0", + u"3613", + u"3051", + u"130", + u"7", + u"121", + u"3716", + u"3078", + u"127", + u"7", + None, + None, + None, + None, + u"6794", + u"5036", + u"1758", + None, + None, + None, + None, + None, + None, + u"337", + u"0.26112759", + u"0.17210683", + u"0.43323442", + u"0.13353115", + u"24.833692447908199", + None, + None, + u"22.704964", + u"67.080006197818605", + u"65.144188573097907", + u"74.672390253375497", + u"16.7913561569619", + u"19.4894563570641", + u"8.649237411458509", + u"7.60165422117368", + u"11.1540090366186", + u"17.263407056738099", + u"8.5269823", + u"9.2213373", + u"5.3085136", + u"52.472769803217503", + None, + None, + None, + None, + None, + None, + u"25.0011414302354", + u"22.830887000000001", + u"66.8150490097632", + u"64.893674212235595", + u"74.288246611754104", + u"17.0725384713319", + u"19.8404105332814", + u"8.856561911292371", + u"7.74275834336647", + u"11.357671741889", + u"17.9410577459881", + u"8.3696527", + u"8.9979973", + u"5.0570836", + u"53.286314230720798", + None, + None, + None, + None, + None, + u"122988", + None, + u"10.155015000000001", + u"14.826086999999999", + u"11.671533", + u"9.072917", + None, + None, + None, + None, + None, + None, + None, + None, + ) # noqa def test_german(self, Session): csv_filepath = get_sample_filepath("german_sample.csv") @@ -585,20 +586,6 @@ def test_german(self, Session): ) records = self._get_records(Session, resource_id) - print(records) - assert records[0] == ( - 1, - u"Zürich", - u"68260", - u"65444", - u"62646", - u"6503", - u"28800", - u"1173", - u"6891", - u"24221", - u"672", - ) print(self._get_column_names(Session, resource_id)) assert self._get_column_names(Session, resource_id) == [ u"_id", @@ -619,6 +606,20 @@ def test_german(self, Session): u"int4", u"tsvector", ] + [u"text"] * (len(records[0]) - 1) + print(records) + assert records[0] == ( + 1, + u"Zürich", 
+ u"68260", + u"65444", + u"62646", + u"6503", + u"28800", + u"1173", + u"6891", + u"24221", + u"672", + ) def test_with_blanks(self, Session): csv_filepath = get_sample_filepath("sample_with_blanks.csv") @@ -699,7 +700,6 @@ def test_reload(self, Session): logger=logger, ) - assert len(self._get_records(Session, resource_id)) == 6 assert self._get_column_names(Session, resource_id) == [ u"_id", u"_full_text", @@ -714,6 +714,7 @@ def test_reload(self, Session): u"text", u"text", ] + assert len(self._get_records(Session, resource_id)) == 6 @pytest.mark.skipif( not p.toolkit.check_ckan_version(min_version="2.7"), @@ -752,7 +753,6 @@ def test_reload_with_overridden_types(self, Session): fields=fields, resource_id=resource_id, logger=logger ) - assert len(self._get_records(Session, resource_id)) == 6 assert self._get_column_names(Session, resource_id) == [ u"_id", u"_full_text", @@ -767,6 +767,7 @@ def test_reload_with_overridden_types(self, Session): u"numeric", u"text", ] + assert len(self._get_records(Session, resource_id)) == 6 # check that rows with nulls are indexed correctly records = self._get_records( @@ -919,6 +920,20 @@ def test_simple(self, Session): # "'-01':4,5 '00':6,7,8 '1':1 '2011':3 'galway':2" # "'-01':2,3 '00':5,6 '1':7 '2011':1 'galway':8 't00':4" + assert self._get_column_names(Session, resource_id) == [ + u"_id", + u"_full_text", + u"date", + u"temperature", + u"place", + ] + assert self._get_column_types(Session, resource_id) == [ + u"int4", + u"tsvector", + u"timestamp", + u"numeric", + u"text", + ] assert self._get_records(Session, resource_id) == [ (1, datetime.datetime(2011, 1, 1, 0, 0), Decimal("1"), u"Galway",), ( @@ -947,20 +962,6 @@ def test_simple(self, Session): u"Berkeley", ), ] - assert self._get_column_names(Session, resource_id) == [ - u"_id", - u"_full_text", - u"date", - u"temperature", - u"place", - ] - assert self._get_column_types(Session, resource_id) == [ - u"int4", - u"tsvector", - u"timestamp", - u"numeric", - u"text", - ] 
def test_simple_large_file(self, Session): csv_filepath = get_sample_filepath("simple-large.csv") @@ -1039,6 +1040,74 @@ def test_boston_311(self, Session): ) records = self._get_records(Session, resource_id) + print(self._get_column_names(Session, resource_id)) + assert self._get_column_names(Session, resource_id) == [ + u"_id", # int4 + u"_full_text", # tsvector + u"CASE_ENQUIRY_ID", # numeric + u"open_dt", # timestamp + u"target_dt", # timestamp + u"closed_dt", # text + u"OnTime_Status", # text + u"CASE_STATUS", # text + u"CLOSURE_REASON", # text + u"CASE_TITLE", # text + u"SUBJECT", # text + u"REASON", # text + u"TYPE", # text + u"QUEUE", # text + u"Department", # text + u"SubmittedPhoto", # text + u"ClosedPhoto", # text + u"Location", # text + u"Fire_district", # numeric + u"pwd_district", # numeric + u"city_council_district", # numeric + u"police_district", # text + u"neighborhood", # text + u"neighborhood_services_district", # numeric + u"ward", # text + u"precinct", # numeric + u"LOCATION_STREET_NAME", # text + u"LOCATION_ZIPCODE", # numeric + u"Latitude", # numeric + u"Longitude", # numeric + u"Source", # text + ] # noqa + print(self._get_column_types(Session, resource_id)) + assert self._get_column_types(Session, resource_id) == [ + u"int4", # _id + u"tsvector", # _full_text + u"numeric", # CASE_ENQUIRY_ID + u"timestamp", # open_dt + u"timestamp", # target_dt + u"text", # closed_dt + u"text", # OnTime_Status + u"text", # CASE_STATUS + u"text", # CLOSURE_REASON + u"text", # CASE_TITLE + u"text", # SUBJECT + u"text", # REASON + u"text", # TYPE + u"text", # QUEUE + u"text", # Department + u"text", # SubmittedPhoto + u"text", # ClosedPhoto + u"text", # Location + u"numeric", # Fire_district + u"numeric", # pwd_district + u"numeric", # city_council_district + u"text", # police_district + u"text", # neighborhood + u"numeric", # neighborhood_services_district + u"text", # ward + u"numeric", # precinct + u"text", # LOCATION_STREET_NAME + u"numeric", # 
LOCATION_ZIPCODE + u"numeric", # Latitude + u"numeric", # Longitude + u"text", # Source + ] # noqa print(records) assert records == [ ( @@ -1051,7 +1120,7 @@ def test_boston_311(self, Session): u"Open", u"", # " " transforms to "" u"Street Light Outages", - u"Public Works Department", + u"Public Works Department", # " " trailing whitespace gets trimmed u"Street Lights", u"Street Light Outages", u"PWDx_Street Light Outages", @@ -1138,74 +1207,6 @@ def test_boston_311(self, Session): u"Citizens Connect App", ), ] # noqa - print(self._get_column_names(Session, resource_id)) - assert self._get_column_names(Session, resource_id) == [ - u"_id", - u"_full_text", - u"CASE_ENQUIRY_ID", - u"open_dt", - u"target_dt", - u"closed_dt", - u"OnTime_Status", - u"CASE_STATUS", - u"CLOSURE_REASON", - u"CASE_TITLE", - u"SUBJECT", - u"REASON", - u"TYPE", - u"QUEUE", - u"Department", - u"SubmittedPhoto", - u"ClosedPhoto", - u"Location", - u"Fire_district", - u"pwd_district", - u"city_council_district", - u"police_district", - u"neighborhood", - u"neighborhood_services_district", - u"ward", - u"precinct", - u"LOCATION_STREET_NAME", - u"LOCATION_ZIPCODE", - u"Latitude", - u"Longitude", - u"Source", - ] # noqa - print(self._get_column_types(Session, resource_id)) - assert self._get_column_types(Session, resource_id) == [ - u"int4", - u"tsvector", - u"numeric", - u"timestamp", - u"timestamp", - u"text", - u"text", - u"text", - u"text", - u"text", - u"text", - u"text", - u"text", - u"text", - u"text", - u"text", - u"text", - u"text", - u"numeric", - u"numeric", - u"numeric", - u"text", - u"text", - u"numeric", - u"text", - u"numeric", - u"text", - u"numeric", - u"numeric", - u"numeric", - u"text", - ] # noqa def test_no_entries(self): csv_filepath = get_sample_filepath("no_entries.csv") From c00fb5ae3c90187e27fc42b67df365572cf261b5 Mon Sep 17 00:00:00 2001 From: Jesse Vickery Date: Mon, 15 Jul 2024 19:09:30 +0000 Subject: [PATCH 11/16] fix(tests,logic): misc fixes; - Updated post parser to 
set empty string type cells to `None` for parity with `load_csv`. - Updated some tests for new code. - Added check for ckan version `2.11` for the data dictionary form. - Updated conditions in `load_csv`. --- ckanext/xloader/loader.py | 18 +++++++++-------- ckanext/xloader/parser.py | 6 +++++- ckanext/xloader/plugin.py | 10 ++++++++-- .../datastore/snippets/dictionary_form.html | 12 ++++++----- ckanext/xloader/tests/test_loader.py | 20 +++++++++---------- requirements.txt | 4 ++-- 6 files changed, 42 insertions(+), 28 deletions(-) diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index f8454904..8528a657 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -229,10 +229,11 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): super_iter = stream.iter def strip_white_space_iter(): for row in super_iter(): - for _index, _cell in enumerate(row): - # only strip white space if strip_extra_white is True - if fields and fields[_index].get('strip_extra_white', True) and isinstance(_cell, str): - row[_index] = _cell.strip() + if len(row) == len(fields): + for _index, _cell in enumerate(row): + # only strip white space if strip_extra_white is True + if fields[_index].get('strip_extra_white', True) and isinstance(_cell, str): + row[_index] = _cell.strip() yield row stream.iter = strip_white_space_iter stream.save(**save_args) @@ -242,10 +243,11 @@ def strip_white_space_iter(): super_iter = stream.iter def strip_white_space_iter(): for row in super_iter(): - for _index, _cell in enumerate(row): - # only strip white space if strip_extra_white is True - if fields and fields[_index].get('strip_extra_white', True) and isinstance(_cell, str): - row[_index] = _cell.strip() + if len(row) == len(fields): + for _index, _cell in enumerate(row): + # only strip white space if strip_extra_white is True + if fields[_index].get('strip_extra_white', True) and isinstance(_cell, str): + row[_index] = _cell.strip() yield row
stream.iter = strip_white_space_iter stream.save(**save_args) diff --git a/ckanext/xloader/parser.py b/ckanext/xloader/parser.py index c587f187..26193203 100644 --- a/ckanext/xloader/parser.py +++ b/ckanext/xloader/parser.py @@ -34,10 +34,14 @@ def convert_types(self, extended_rows): row[cell_index] = '' if self.fields: # only strip white space if strip_extra_white is True - if self.fields[cell_index].get('strip_extra_white', True) and isinstance(cell_value, str): + if self.fields[cell_index].get('strip_extra_white', True) and isinstance(cell_value, six.text_type): cell_value = cell_value.strip() row[cell_index] = cell_value.strip() if not cell_value: + # load_csv parody: empty of string type should be None + if self.types and self.types[cell_index] == six.text_type: + cell_value = None + row[cell_index] = None continue cell_type = self.types[cell_index] if self.types else None if cell_type in [Decimal, None]: diff --git a/ckanext/xloader/plugin.py b/ckanext/xloader/plugin.py index ba74119e..fc4ed2e3 100644 --- a/ckanext/xloader/plugin.py +++ b/ckanext/xloader/plugin.py @@ -4,7 +4,6 @@ from ckan import plugins from ckan.plugins import toolkit -from ckanext.datastore.interfaces import IDataDictionaryForm from ckan.model.domain_object import DomainObjectOperation from ckan.model.resource import Resource @@ -21,6 +20,12 @@ def config_declarations(cls): return cls +if toolkit.check_ckan_version(min_version='2.11'): + from ckanext.datastore.interfaces import IDataDictionaryForm + has_idata_dictionary_form = True +else: + has_idata_dictionary_form = False + log = logging.getLogger(__name__) @@ -35,7 +40,8 @@ class xloaderPlugin(plugins.SingletonPlugin): plugins.implements(plugins.IResourceController, inherit=True) plugins.implements(plugins.IClick) plugins.implements(plugins.IBlueprint) - plugins.implements(IDataDictionaryForm, inherit=True) + if has_idata_dictionary_form: + plugins.implements(IDataDictionaryForm, inherit=True) # IClick def get_commands(self): diff --git 
a/ckanext/xloader/templates/datastore/snippets/dictionary_form.html b/ckanext/xloader/templates/datastore/snippets/dictionary_form.html index 02919354..f5c6d06f 100644 --- a/ckanext/xloader/templates/datastore/snippets/dictionary_form.html +++ b/ckanext/xloader/templates/datastore/snippets/dictionary_form.html @@ -3,9 +3,11 @@ {% block additional_fields %} {{ super() }} - {{ form.select('fields__' ~ position ~ '__strip_extra_white', - label=_('Strip Extra Leading and Trailing White Space'), options=[ - {'text': _('Yes'), 'value': true}, - {'text': _('No'), 'value': false}, - ], selected=field.get('strip_extra_white')) }} + {% if h.check_ckan_version(min_version='2.11') %} + {{ form.select('fields__' ~ position ~ '__strip_extra_white', + label=_('Strip Extra Leading and Trailing White Space'), options=[ + {'text': _('Yes'), 'value': true}, + {'text': _('No'), 'value': false}, + ], selected=field.get('strip_extra_white')) }} + {% endif %} {% endblock %} diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py index 0e225a06..2fae544e 100644 --- a/ckanext/xloader/tests/test_loader.py +++ b/ckanext/xloader/tests/test_loader.py @@ -1136,18 +1136,18 @@ def test_boston_311(self, Session): Decimal("101002153891"), datetime.datetime(2017, 7, 6, 23, 38, 43), datetime.datetime(2017, 7, 21, 8, 30), - u"", + None, u"ONTIME", u"Open", - u"", # " " transforms to "" + None, # " " transforms to None u"Street Light Outages", u"Public Works Department", # " " trailing whitespace gets trimmed u"Street Lights", u"Street Light Outages", u"PWDx_Street Light Outages", u"PWDx", - u"", - u"", + None, + None, u"480 Harvard St Dorchester MA 02124", Decimal("8"), Decimal("7"), @@ -1168,10 +1168,10 @@ def test_boston_311(self, Session): Decimal("101002153890"), datetime.datetime(2017, 7, 6, 23, 29, 13), datetime.datetime(2017, 9, 11, 8, 30), - u"", + None, u"ONTIME", u"Open", - u"", # " " transforms to "" + None, # " " transforms to None u"Graffiti Removal", 
u"Property Management", u"Graffiti", @@ -1179,7 +1179,7 @@ def test_boston_311(self, Session): u"PROP_GRAF_GraffitiRemoval", u"PROP", u"https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg", # strip white spaces - u"", + None, u"522 Saratoga St East Boston MA 02128", Decimal("1"), Decimal("9"), @@ -1200,10 +1200,10 @@ def test_boston_311(self, Session): Decimal("101002153889"), datetime.datetime(2017, 7, 6, 23, 24, 20), datetime.datetime(2017, 9, 11, 8, 30), - u"", + None, u"ONTIME", u"Open", - u"", # " " transforms to "" + None, # " " transforms to None u"Graffiti Removal", u"Property Management", u"Graffiti", @@ -1211,7 +1211,7 @@ def test_boston_311(self, Session): u"PROP_GRAF_GraffitiRemoval", u"PROP", u"https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg", # strip white spaces - u"", + None, u"965 Bennington St East Boston MA 02128", Decimal("1"), Decimal("9"), diff --git a/requirements.txt b/requirements.txt index fe92b6d7..ce7cd03e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ -ckantoolkit +ckantoolkit>=0.0.4 requests[security]>=2.11.1 six>=1.12.0 tabulator==1.53.5 Unidecode==1.0.22 python-dateutil>=2.8.2 -chardet==5.2.0 \ No newline at end of file +chardet==5.2.0 From 669930eb59c68135ed4f22a1f29b4f2aa22a1528 Mon Sep 17 00:00:00 2001 From: Jesse Vickery Date: Mon, 15 Jul 2024 20:04:09 +0000 Subject: [PATCH 12/16] fix(tests,logic): new output and parity; - Fixed test for new output with `strip_extra_white`. - Fixed CKAN version parity of `info` vs `_info` in `update_datastore_info_field` (upstream issue).
--- ckanext/xloader/plugin.py | 3 +++ ckanext/xloader/tests/test_jobs.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ckanext/xloader/plugin.py b/ckanext/xloader/plugin.py index fc4ed2e3..e8268776 100644 --- a/ckanext/xloader/plugin.py +++ b/ckanext/xloader/plugin.py @@ -228,6 +228,9 @@ def update_datastore_create_schema(self, schema): def update_datastore_info_field(self, field, plugin_data): # expose all our non-secret plugin data in the field field.update(plugin_data.get('xloader', {})) + # CKAN version parody + if '_info' in plugin_data: + field.update({'info': plugin_data['_info']}) return field diff --git a/ckanext/xloader/tests/test_jobs.py b/ckanext/xloader/tests/test_jobs.py index e819dad9..62ae7174 100644 --- a/ckanext/xloader/tests/test_jobs.py +++ b/ckanext/xloader/tests/test_jobs.py @@ -81,7 +81,7 @@ def test_xloader_data_into_datastore(self, cli, data): with mock.patch("ckanext.xloader.jobs.get_response", get_response): stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output assert "File hash: d44fa65eda3675e11710682fdb5f1648" in stdout - assert "Fields: [{'id': 'x', 'type': 'text'}, {'id': 'y', 'type': 'text'}]" in stdout + assert "Fields: [{'id': 'x', 'type': 'text', 'strip_extra_white': True}, {'id': 'y', 'type': 'text', 'strip_extra_white': True}]" in stdout assert "Copying to database..." in stdout assert "Creating search index..." in stdout assert "Express Load completed" in stdout From 3263bababcb0bcb0c20744cf360d119fd607ac0a Mon Sep 17 00:00:00 2001 From: Jesse Vickery Date: Tue, 16 Jul 2024 13:57:23 +0000 Subject: [PATCH 13/16] fix(logic): ckan versioning; - DS fields for ckan versions. 
--- ckanext/xloader/loader.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index 8528a657..da0edb1d 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -178,8 +178,11 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): existing = datastore_resource_exists(resource_id) existing_info = {} if existing: - ds_info = p.toolkit.get_action('datastore_info')({'ignore_auth': True}, {'id': resource_id}) - existing_fields = ds_info.get('fields', []) + if p.toolkit.check_ckan_version(max_version='2.9'): + existing_fields = existing.get('fields', []) + else: + ds_info = p.toolkit.get_action('datastore_info')({'ignore_auth': True}, {'id': resource_id}) + existing_fields = ds_info.get('fields', []) existing_info = dict((f['id'], f['info']) for f in existing_fields if 'info' in f) From 7cb6a84d682a5c691ab3d40eba5c31df3c61512c Mon Sep 17 00:00:00 2001 From: Jesse Vickery Date: Tue, 16 Jul 2024 14:15:00 +0000 Subject: [PATCH 14/16] fix(logic): ckan versioning; - DS fields for ckan versions. 
--- ckanext/xloader/loader.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index da0edb1d..1da79fee 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -178,11 +178,11 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): existing = datastore_resource_exists(resource_id) existing_info = {} if existing: - if p.toolkit.check_ckan_version(max_version='2.9'): - existing_fields = existing.get('fields', []) - else: + if p.toolkit.check_ckan_version(min_version='2.10'): ds_info = p.toolkit.get_action('datastore_info')({'ignore_auth': True}, {'id': resource_id}) existing_fields = ds_info.get('fields', []) + else: + existing_fields = existing.get('fields', []) existing_info = dict((f['id'], f['info']) for f in existing_fields if 'info' in f) From bf2e9396e541f36f71833cbb11124a30dca32240 Mon Sep 17 00:00:00 2001 From: Jesse Vickery Date: Tue, 16 Jul 2024 14:36:52 +0000 Subject: [PATCH 15/16] feat(templates,logic): pre-datadictionary implement; - Support current versions of CKAN for the DataDictionary form override for `strip_extra_white`. - Support current versions of CKAN for existing info before existing data dictionary custom fields. 
--- ckanext/xloader/loader.py | 6 ++++-- .../datastore/snippets/dictionary_form.html | 14 +++++++++----- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py index 1da79fee..54ab026b 100644 --- a/ckanext/xloader/loader.py +++ b/ckanext/xloader/loader.py @@ -202,7 +202,8 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None): for f in fields: if f['id'] in existing_info: f['info'] = existing_info[f['id']] - f['strip_extra_white'] = existing_fields_by_headers[f['id']].get('strip_extra_white', True) + f['strip_extra_white'] = existing_info[f['id']].get('strip_extra_white') if 'strip_extra_white' in existing_info[f['id']] \ + else existing_fields_by_headers[f['id']].get('strip_extra_white', True) ''' Delete or truncate existing datastore table before proceeding, @@ -459,7 +460,8 @@ def row_iterator(): for h in headers_dicts: if h['id'] in existing_info: h['info'] = existing_info[h['id']] - h['strip_extra_white'] = existing_fields_by_headers[h['id']].get('strip_extra_white', True) + h['strip_extra_white'] = existing_info[h['id']].get('strip_extra_white') if 'strip_extra_white' in existing_info[h['id']] \ + else existing_fields_by_headers[h['id']].get('strip_extra_white', True) # create columns with types user requested type_override = existing_info[h['id']].get('type_override') if type_override in list(_TYPE_MAPPING.values()): diff --git a/ckanext/xloader/templates/datastore/snippets/dictionary_form.html b/ckanext/xloader/templates/datastore/snippets/dictionary_form.html index f5c6d06f..808aa764 100644 --- a/ckanext/xloader/templates/datastore/snippets/dictionary_form.html +++ b/ckanext/xloader/templates/datastore/snippets/dictionary_form.html @@ -4,10 +4,14 @@ {% block additional_fields %} {{ super() }} {% if h.check_ckan_version(min_version='2.11') %} - {{ form.select('fields__' ~ position ~ '__strip_extra_white', - label=_('Strip Extra Leading and Trailing White Space'), options=[ - 
{'text': _('Yes'), 'value': true}, - {'text': _('No'), 'value': false}, - ], selected=field.get('strip_extra_white')) }} + {% set field_prefix = 'fields__' %} + {% else %} + {% set field_prefix = 'info__' %} {% endif %} + {% set is_selected = field.get('info', {}).get('strip_extra_white', field.get('strip_extra_white')) != 'False' %} + {{ form.select(field_prefix ~ position ~ '__strip_extra_white', + label=_('Strip Extra Leading and Trailing White Space'), options=[ + {'text': _('Yes'), 'value': true}, + {'text': _('No'), 'value': false}, + ], selected=is_selected) }} {% endblock %} From d6de1b18bcfcd22062dd7d038c2d9220869375aa Mon Sep 17 00:00:00 2001 From: Jesse Vickery Date: Mon, 22 Jul 2024 19:45:24 +0000 Subject: [PATCH 16/16] feat(tests): add coverage; - Added test coverage for no strip white extra space. --- ckanext/xloader/tests/test_loader.py | 379 +++++++++++++++++++++++++++ 1 file changed, 379 insertions(+) diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py index 2fae544e..ba1b9288 100644 --- a/ckanext/xloader/tests/test_loader.py +++ b/ckanext/xloader/tests/test_loader.py @@ -817,6 +817,181 @@ def test_column_names(self, Session): u"Galway", ) + def test_load_with_no_strip_white(self, Session): + csv_filepath = get_sample_filepath("boston_311_sample.csv") + resource = factories.Resource() + resource_id = resource['id'] + loader.load_csv( + csv_filepath, + resource_id=resource_id, + mimetype="text/csv", + logger=logger, + ) + + # Change strip_extra_white, as it would be done by Data Dictionary + rec = p.toolkit.get_action("datastore_search")( + None, {"resource_id": resource_id, "limit": 0} + ) + fields = [f for f in rec["fields"] if not f["id"].startswith("_")] + for field in fields: + field["info"] = {"strip_extra_white": False} # <=2.10 + field["strip_extra_white"] = False # >=2.11 + p.toolkit.get_action("datastore_create")( + {"ignore_auth": True}, + {"resource_id": resource_id, "force": True, "fields": fields}, 
+ ) + + # Load it again with new strip_extra_white + fields = loader.load_csv( + csv_filepath, + resource_id=resource_id, + mimetype="text/csv", + logger=logger, + ) + loader.create_column_indexes( + fields=fields, resource_id=resource_id, logger=logger + ) + + records = self._get_records(Session, resource_id) + print(self._get_column_names(Session, resource_id)) + assert self._get_column_names(Session, resource_id) == [ + u"_id", + u"_full_text", + u"CASE_ENQUIRY_ID", + u"open_dt", + u"target_dt", + u"closed_dt", + u"OnTime_Status", + u"CASE_STATUS", + u"CLOSURE_REASON", + u"CASE_TITLE", + u"SUBJECT", + u"REASON", + u"TYPE", + u"QUEUE", + u"Department", + u"SubmittedPhoto", + u"ClosedPhoto", + u"Location", + u"Fire_district", + u"pwd_district", + u"city_council_district", + u"police_district", + u"neighborhood", + u"neighborhood_services_district", + u"ward", + u"precinct", + u"LOCATION_STREET_NAME", + u"LOCATION_ZIPCODE", + u"Latitude", + u"Longitude", + u"Source", + ] # noqa + print(self._get_column_types(Session, resource_id)) + assert self._get_column_types(Session, resource_id) == [ + u"int4", + u"tsvector", + ] + [u"text"] * (len(records[0]) - 1) + print(records) + assert records == [ + ( + 4, # ds auto increment + u"101002153891", + u"2017-07-06 23:38:43", + u"2017-07-21 08:30:00", + None, + u"ONTIME", + u"Open", + u" ", # no strip_extra_white + u"Street Light Outages", + u"Public Works Department ", # no strip_extra_white + u"Street Lights", + u"Street Light Outages", + u"PWDx_Street Light Outages", + u"PWDx", + None, + None, + u"480 Harvard St Dorchester MA 02124", + u"8", + u"07", + u"4", + u"B3", + u"Greater Mattapan", + u"9", + u"Ward 14", + u"1411", + u"480 Harvard St", + u"02124", + u"42.288", + u"-71.0927", + u"Citizens Connect App", + ), # noqa + ( + 5, # ds auto increment + u"101002153890", + u"2017-07-06 23:29:13", + u"2017-09-11 08:30:00", + None, + u"ONTIME", + u"Open", + u" ", # no strip_extra_white + u"Graffiti Removal", + u"Property 
Management", + u"Graffiti", + u"Graffiti Removal", + u"PROP_GRAF_GraffitiRemoval", + u"PROP", + u" https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg", # no strip_extra_white + None, + u"522 Saratoga St East Boston MA 02128", + u"1", + u"09", + u"1", + u"A7", + u"East Boston", + u"1", + u"Ward 1", + u"0110", + u"522 Saratoga St", + u"02128", + u"42.3807", + u"-71.0259", + u"Citizens Connect App", + ), # noqa + ( + 6, # ds auto increment + u"101002153889", + u"2017-07-06 23:24:20", + u"2017-09-11 08:30:00", + None, + u"ONTIME", + u"Open", + u" ", # no strip_extra_white + u"Graffiti Removal", + u"Property Management", + u"Graffiti", + u"Graffiti Removal", + u"PROP_GRAF_GraffitiRemoval", + u"PROP", + u" https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg", # no strip_extra_white + None, + u"965 Bennington St East Boston MA 02128", + u"1", + u"09", + u"1", + u"A7", + u"East Boston", + u"1", + u"Ward 1", + u"0112", + u"965 Bennington St", + u"02128", + u"42.386", + u"-71.008", + u"Citizens Connect App", + ), + ] # noqa + class TestLoadUnhandledTypes(TestLoadBase): def test_kml(self): @@ -1299,3 +1474,207 @@ def test_preserving_time_ranges(self, Session): (3, "Barcaldine", 4725, Decimal("-23.55327901"), Decimal("145.289156"), "9:00-12:30", "13:30-16:30", datetime.datetime(2018, 7, 20)) ] + + def test_load_with_no_strip_white(self, Session): + csv_filepath = get_sample_filepath("boston_311_sample.csv") + resource = factories.Resource() + resource_id = resource['id'] + loader.load_table( + csv_filepath, + resource_id=resource_id, + mimetype="csv", + logger=logger, + ) + + # Change strip_extra_white, as it would be done by Data Dictionary + rec = p.toolkit.get_action("datastore_search")( + None, {"resource_id": resource_id, "limit": 0} + ) + fields = [f for f in rec["fields"] if not f["id"].startswith("_")] + for field in fields: + field["info"] = {"strip_extra_white": False} # <=2.10 
+ field["strip_extra_white"] = False # >=2.11 + p.toolkit.get_action("datastore_create")( + {"ignore_auth": True}, + {"resource_id": resource_id, "force": True, "fields": fields}, + ) + + # Load it again with new strip_extra_white + fields = loader.load_table( + csv_filepath, + resource_id=resource_id, + mimetype="csv", + logger=logger, + ) + loader.create_column_indexes( + fields=fields, resource_id=resource_id, logger=logger + ) + + records = self._get_records(Session, resource_id) + print(self._get_column_names(Session, resource_id)) + assert self._get_column_names(Session, resource_id) == [ + u"_id", # int4 + u"_full_text", # tsvector + u"CASE_ENQUIRY_ID", # numeric + u"open_dt", # timestamp + u"target_dt", # timestamp + u"closed_dt", # text + u"OnTime_Status", # text + u"CASE_STATUS", # text + u"CLOSURE_REASON", # text + u"CASE_TITLE", # text + u"SUBJECT", # text + u"REASON", # text + u"TYPE", # text + u"QUEUE", # text + u"Department", # text + u"SubmittedPhoto", # text + u"ClosedPhoto", # text + u"Location", # text + u"Fire_district", # numeric + u"pwd_district", # numeric + u"city_council_district", # numeric + u"police_district", # text + u"neighborhood", # text + u"neighborhood_services_district", # numeric + u"ward", # text + u"precinct", # numeric + u"LOCATION_STREET_NAME", # text + u"LOCATION_ZIPCODE", # numeric + u"Latitude", # numeric + u"Longitude", # numeric + u"Source", # text + ] # noqa + print(self._get_column_types(Session, resource_id)) + assert self._get_column_types(Session, resource_id) == [ + u"int4", # _id + u"tsvector", # _full_text + u"numeric", # CASE_ENQUIRY_ID + u"timestamp", # open_dt + u"timestamp", # target_dt + u"text", # closed_dt + u"text", # OnTime_Status + u"text", # CASE_STATUS + u"text", # CLOSURE_REASON + u"text", # CASE_TITLE + u"text", # SUBJECT + u"text", # REASON + u"text", # TYPE + u"text", # QUEUE + u"text", # Department + u"text", # SubmittedPhoto + u"text", # ClosedPhoto + u"text", # Location + u"numeric", # 
Fire_district + u"numeric", # pwd_district + u"numeric", # city_council_district + u"text", # police_district + u"text", # neighborhood + u"numeric", # neighborhood_services_district + u"text", # ward + u"numeric", # precinct + u"text", # LOCATION_STREET_NAME + u"numeric", # LOCATION_ZIPCODE + u"numeric", # Latitude + u"numeric", # Longitude + u"text", # Source + ] # noqa + print(records) + assert records == [ + ( + 4, # ds auto increment + Decimal("101002153891"), + datetime.datetime(2017, 7, 6, 23, 38, 43), + datetime.datetime(2017, 7, 21, 8, 30), + None, + u"ONTIME", + u"Open", + u" ", # no strip_extra_white + u"Street Light Outages", + u"Public Works Department ", # no strip_extra_white + u"Street Lights", + u"Street Light Outages", + u"PWDx_Street Light Outages", + u"PWDx", + None, + None, + u"480 Harvard St Dorchester MA 02124", + Decimal("8"), + Decimal("7"), + Decimal("4"), + u"B3", + u"Greater Mattapan", + Decimal("9"), + u"Ward 14", + Decimal("1411"), + u"480 Harvard St", + Decimal("2124"), + Decimal("42.288"), + Decimal("-71.0927"), + u"Citizens Connect App", + ), # noqa + ( + 5, # ds auto increment + Decimal("101002153890"), + datetime.datetime(2017, 7, 6, 23, 29, 13), + datetime.datetime(2017, 9, 11, 8, 30), + None, + u"ONTIME", + u"Open", + u" ", # no strip_extra_white + u"Graffiti Removal", + u"Property Management", + u"Graffiti", + u"Graffiti Removal", + u"PROP_GRAF_GraffitiRemoval", + u"PROP", + u" https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg", # no strip_extra_white + None, + u"522 Saratoga St East Boston MA 02128", + Decimal("1"), + Decimal("9"), + Decimal("1"), + u"A7", + u"East Boston", + Decimal("1"), + u"Ward 1", + Decimal("110"), + u"522 Saratoga St", + Decimal("2128"), + Decimal("42.3807"), + Decimal("-71.0259"), + u"Citizens Connect App", + ), # noqa + ( + 6, # ds auto increment + Decimal("101002153889"), + datetime.datetime(2017, 7, 6, 23, 24, 20), + datetime.datetime(2017, 9, 11, 8, 
30), + None, + u"ONTIME", + u"Open", + u" ", # no strip_extra_white + u"Graffiti Removal", + u"Property Management", + u"Graffiti", + u"Graffiti Removal", + u"PROP_GRAF_GraffitiRemoval", + u"PROP", + u" https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg", # no strip_extra_white + None, + u"965 Bennington St East Boston MA 02128", + Decimal("1"), + Decimal("9"), + Decimal("1"), + u"A7", + u"East Boston", + Decimal("1"), + u"Ward 1", + Decimal("112"), + u"965 Bennington St", + Decimal("2128"), + Decimal("42.386"), + Decimal("-71.008"), + u"Citizens Connect App", + ), + ] # noqa