diff --git a/tests/providers/dataverse/test_metadata.py b/tests/providers/dataverse/test_metadata.py index fd31b8419..44ff6a384 100644 --- a/tests/providers/dataverse/test_metadata.py +++ b/tests/providers/dataverse/test_metadata.py @@ -47,7 +47,7 @@ def test_file_metadata(self, file_metadata_object): assert not file_metadata_object.created_utc assert file_metadata_object.content_type == 'text/plain; charset=US-ASCII' assert file_metadata_object.etag == 'latest::20' - assert file_metadata_object.original_name == 'thefile.txt' + assert file_metadata_object.original_names == ['thefile.txt'] assert file_metadata_object.extra == { 'fileId': '20', 'datasetVersion': 'latest', @@ -71,7 +71,9 @@ def test_csv_file_metadata(self, csv_file_metadata_object): assert not csv_file_metadata_object.created_utc assert csv_file_metadata_object.content_type == 'text/tab-separated-values' assert csv_file_metadata_object.etag == 'latest::20' - assert csv_file_metadata_object.original_name == 'thefile.csv' + names = csv_file_metadata_object.original_names + assert 'thefile.csv' in names + assert 'thefile.CSV' in names assert csv_file_metadata_object.extra == { 'fileId': '20', 'datasetVersion': 'latest', diff --git a/tests/providers/dataverse/test_provider.py b/tests/providers/dataverse/test_provider.py index 141abe45c..01e4c6164 100644 --- a/tests/providers/dataverse/test_provider.py +++ b/tests/providers/dataverse/test_provider.py @@ -11,6 +11,7 @@ from waterbutler.core.path import WaterButlerPath from waterbutler.providers.dataverse import settings as dvs from waterbutler.providers.dataverse import DataverseProvider +from waterbutler.providers.dataverse.exceptions import DataverseIngestionLockError from waterbutler.providers.dataverse.metadata import DataverseFileMetadata, DataverseRevision from tests.providers.dataverse.fixtures import ( @@ -235,6 +236,32 @@ async def test_upload_create(self, provider, file_stream, native_file_metadata, assert aiohttpretty.has_call(method='GET', uri=latest_url) assert aiohttpretty.has_call(method='GET', uri=latest_published_url) + @pytest.mark.asyncio + @pytest.mark.aiohttpretty + async def test_upload_ingestion_exception(self, provider, file_stream, native_file_metadata, + empty_native_dataset_metadata, native_dataset_metadata): + path = WaterButlerPath('/thefile.txt') + url = provider.build_url(dvs.EDIT_MEDIA_BASE_URL, 'study', provider.doi) + aiohttpretty.register_uri('POST', url, status=400, body=b'something dataset lock: Ingest') + + with pytest.raises(DataverseIngestionLockError): + await provider.upload(file_stream, path) + + assert aiohttpretty.has_call(method='POST', uri=url) + + @pytest.mark.asyncio + @pytest.mark.aiohttpretty + async def test_upload_random_exception(self, provider, file_stream, native_file_metadata, + empty_native_dataset_metadata, native_dataset_metadata): + path = WaterButlerPath('/thefile.txt') + url = provider.build_url(dvs.EDIT_MEDIA_BASE_URL, 'study', provider.doi) + aiohttpretty.register_uri('POST', url, status=400, body=b'something something error') + + with pytest.raises(exceptions.UploadError): + await provider.upload(file_stream, path) + + assert aiohttpretty.has_call(method='POST', uri=url) + @pytest.mark.asyncio @pytest.mark.aiohttpretty async def test_upload_updates(self, provider, diff --git a/tests/providers/dataverse/test_utils.py b/tests/providers/dataverse/test_utils.py index 1d4846b74..f7c572b83 100644 --- a/tests/providers/dataverse/test_utils.py +++ b/tests/providers/dataverse/test_utils.py @@ -10,13 +10,11 @@ def format_dict(): 'originalFileFormat': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'originalFormatLabel': 'MS Excel (XLSX)', 'contentType': 'text/tab-separated-values', - }, 'RData': { 'originalFileFormat': 'application/x-rlang-transport', 'originalFormatLabel': 'R Data', 'contentType': 'text/tab-separated-values' - }, 'sav': { 'originalFileFormat': 'application/x-spss-sav', @@ -27,13 +25,11 @@ def format_dict(): 'originalFileFormat': 'application/x-stata', 'originalFormatLabel': 'Stata Binary', 'contentType': 'text/tab-separated-values' - }, 'por': { 'originalFileFormat': 'application/x-spss-por', 'originalFormatLabel': 'SPSS Portable', 'contentType': 'text/tab-separated-values' - }, 'csv': { 'originalFileFormat': 'text/csv', @@ -47,7 +43,7 @@ class TestUtils: def test_original_ext_from_raw_metadata(self, format_dict): for key in format_dict: - assert key == dv_utils.original_ext_from_raw_metadata(format_dict[key]) + assert key in dv_utils.original_ext_from_raw_metadata(format_dict[key]) def test_original_ext_from_raw_metadata_none_case(self, format_dict): for key in format_dict: diff --git a/waterbutler/providers/dataverse/exceptions.py b/waterbutler/providers/dataverse/exceptions.py new file mode 100644 index 000000000..b41364b7f --- /dev/null +++ b/waterbutler/providers/dataverse/exceptions.py @@ -0,0 +1,15 @@ +from http import HTTPStatus + +from waterbutler.core.exceptions import UploadError + + +class DataverseIngestionLockError(UploadError): + def __init__(self, message, code=HTTPStatus.BAD_REQUEST): + """``dummy`` argument is because children of ``WaterButlerError`` must be instantiable with + a single integer argument. See :class:`waterbutler.core.exceptions.WaterButlerError` + for details. + """ + super().__init__( + 'Some uploads to Dataverse will lock uploading for a time. Please wait' + ' a few seconds and try again.', + code=code) diff --git a/waterbutler/providers/dataverse/metadata.py b/waterbutler/providers/dataverse/metadata.py index 5047a6130..beafe3f6b 100644 --- a/waterbutler/providers/dataverse/metadata.py +++ b/waterbutler/providers/dataverse/metadata.py @@ -28,18 +28,21 @@ def name(self): return self.raw.get('name', None) or self.raw.get('filename', None) @property - def original_name(self): + def original_names(self): """ Dataverse 'ingests' some files types. This changes their extension. - This property will look through the metadata to try to determine the original - name of the file. + This property will look through the metadata to try to determine possible + original names of the file. """ - ext = dv_utils.original_ext_from_raw_metadata(self.raw) - if ext is None: - return self.name + extensions = dv_utils.original_ext_from_raw_metadata(self.raw) + if extensions is None: + return [self.name] else: - name = self.name[:self.name.rfind('.')] - return name + '.{}'.format(ext) + names = [] + for ext in extensions: + name = self.name[:self.name.rfind('.')] + names.append(name + '.{}'.format(ext)) + return names @property def path(self): diff --git a/waterbutler/providers/dataverse/provider.py b/waterbutler/providers/dataverse/provider.py index 82587eda1..30d6165fa 100644 --- a/waterbutler/providers/dataverse/provider.py +++ b/waterbutler/providers/dataverse/provider.py @@ -11,6 +11,7 @@ from waterbutler.providers.dataverse import settings from waterbutler.providers.dataverse.metadata import DataverseRevision from waterbutler.providers.dataverse.metadata import DataverseDatasetMetadata +from waterbutler.providers.dataverse.exceptions import DataverseIngestionLockError class DataverseProvider(provider.BaseProvider): @@ -170,16 +171,26 @@ async def upload(self, stream, path, **kwargs): headers=dv_headers, auth=(self.token, ), data=file_stream, - expects=(201, ), + expects=(201, 400,), throws=exceptions.UploadError ) + + if resp.status == 400: + data = await resp.read() + data = data.decode('utf-8') + + if 'dataset lock: Ingest' in data: + raise DataverseIngestionLockError({'response': data}) + else: + raise (await exceptions.exception_from_response(resp, + error=exceptions.UploadError)) await resp.release() # Find appropriate version of file metadata = await self._get_data('latest') files = metadata if isinstance(metadata, list) else [] - file_metadata = next(file for file in files if file.name == path.name or - file.original_name == path.name) + file_metadata = next(file for file in files if (file.name == path.name or + path.name in file.original_names)) if stream.writers['md5'].hexdigest != file_metadata.extra['hashes']['md5']: raise exceptions.UploadChecksumMismatchError() diff --git a/waterbutler/providers/dataverse/utils.py b/waterbutler/providers/dataverse/utils.py index f2d7651c8..4abdb871e 100644 --- a/waterbutler/providers/dataverse/utils.py +++ b/waterbutler/providers/dataverse/utils.py @@ -1,56 +1,46 @@ ORIGINAL_FORMATS = { - 'xlsx': { - 'original_format': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', - 'original_label': 'MS Excel (XLSX)', - 'content_type': 'text/tab-separated-values', - }, - # Rdata can come in a few different forms, so just list all of them here 'RData': { 'original_format': 'application/x-rlang-transport', 'original_label': 'R Data', - 'content_type': 'text/tab-separated-values' - - }, - 'rdata': { - 'original_format': 'application/x-rlang-transport', - 'original_label': 'R Data', - 'content_type': 'text/tab-separated-values' - - }, - 'Rdata': { - 'original_format': 'application/x-rlang-transport', - 'original_label': 'R Data', - 'content_type': 'text/tab-separated-values' - + 'content_type': 'text/tab-separated-values', + 'all_extensions': ['rdata', 'Rdata', 'RData'] }, 'sav': { 'original_format': 'application/x-spss-sav', 'original_label': 'SPSS SAV', - 'content_type': 'text/tab-separated-values' + 'content_type': 'text/tab-separated-values', + 'all_extensions': ['sav'] }, 'dta': { 'original_format': 'application/x-stata', 'original_label': 'Stata Binary', - 'content_type': 'text/tab-separated-values' - + 'content_type': 'text/tab-separated-values', + 'all_extensions': ['dta'] }, 'por': { 'original_format': 'application/x-spss-por', 'original_label': 'SPSS Portable', - 'content_type': 'text/tab-separated-values' - + 'content_type': 'text/tab-separated-values', + 'all_extensions': ['por'] }, 'csv': { 'original_format': 'text/csv', 'original_label': 'Comma Separated Values', - 'content_type': 'text/tab-separated-values' + 'content_type': 'text/tab-separated-values', + 'all_extensions': ['csv', 'CSV'] + }, + 'xlsx': { + 'original_format': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + 'original_label': 'MS Excel (XLSX)', + 'content_type': 'text/tab-separated-values', + 'all_extensions': ['xlsx'] } } def original_ext_from_raw_metadata(data): - """Use the raw metadata to figure out the original extension.""" + """Use the raw metadata to figure out possible original extensions.""" label = data.get('originalFormatLabel', None) file_format = data.get('originalFileFormat', None) content_type = data.get('contentType', None) @@ -63,6 +53,6 @@ def original_ext_from_raw_metadata(data): file_format == ORIGINAL_FORMATS[key]['original_format'] and content_type == ORIGINAL_FORMATS[key]['content_type']): - return key + return ORIGINAL_FORMATS[key]['all_extensions'] return None