Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SVCS-353] Look for Dataverse renamed files on upload #300

Open
wants to merge 3 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions tests/providers/dataverse/fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def settings():
'name': 'A look at wizards',
}


@pytest.fixture
def native_file_metadata():
with open(os.path.join(os.path.dirname(__file__), 'fixtures/root_provider.json'), 'r') as fp:
Expand Down Expand Up @@ -65,12 +66,20 @@ def dataset_metadata_object():
'Dataset Test Version'
)


@pytest.fixture
def file_metadata_object():
    """Build a ``DataverseFileMetadata`` from the ``native_file_metadata`` fixture entry.

    The raw datafile dict is read from ``fixtures/root_provider.json`` (located next to
    this module) and wrapped with the dataset version ``'latest'``.
    """
    fixture_path = os.path.join(os.path.dirname(__file__), 'fixtures/root_provider.json')
    with open(fixture_path, 'r') as fixture_file:
        payload = json.load(fixture_file)
    datafile = payload['native_file_metadata']['datafile']
    return DataverseFileMetadata(datafile, 'latest')


@pytest.fixture
def csv_file_metadata_object():
    """Build a ``DataverseFileMetadata`` from the ``csv_native_file_metadata`` fixture entry.

    The raw datafile dict is read from ``fixtures/root_provider.json`` (located next to
    this module) and wrapped with the dataset version ``'latest'``.
    """
    fixture_path = os.path.join(os.path.dirname(__file__), 'fixtures/root_provider.json')
    with open(fixture_path, 'r') as fixture_file:
        datafile = json.load(fixture_file)['csv_native_file_metadata']['datafile']
    return DataverseFileMetadata(datafile, 'latest')


@pytest.fixture
def revision_metadata_object():
    """Return a ``DataverseRevision`` wrapping a fixed sample string."""
    # NOTE(review): the spelling below is kept exactly as-is; other tests may
    # compare against this literal -- confirm before correcting it.
    raw = 'Test Dataset Verision'
    return DataverseRevision(raw)
16 changes: 16 additions & 0 deletions tests/providers/dataverse/fixtures/root_provider.json
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,22 @@
"label":"thefile.txt",
"version":1
},
"csv_native_file_metadata":{
"datafile":{
"contentType":"text/tab-separated-values",
"description":"",
"filename":"%2Fusr%2Flocal%2Fglassfish4%2Fglassfish%2Fdomains%2Fdomain1%2Ffiles%2F10.5072%2FFK2%2F232XYH%2F14c7a73d734-8383551cc713",
"id":20,
"md5":"6b50249f91258397fc5cb7d5a4127e15",
"name":"thefile.tab",
"originalFormatLabel":"Comma Separated Values",
"originalFileFormat": "text/csv"
},
"datasetVersionId":5,
"description":"",
"label":"thefile.tab",
"version":1
},
"checksum_mismatch_dataset_metadata":{
"data":{
"createTime":"2015-04-02T13:21:59Z",
Expand Down
29 changes: 29 additions & 0 deletions tests/providers/dataverse/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
from tests.providers.dataverse.fixtures import (
dataset_metadata_object,
revision_metadata_object,
csv_file_metadata_object,
file_metadata_object
)


class TestDatasetMetadata:

def test_dataset_metadata(self, dataset_metadata_object):
Expand Down Expand Up @@ -45,6 +47,7 @@ def test_file_metadata(self, file_metadata_object):
assert not file_metadata_object.created_utc
assert file_metadata_object.content_type == 'text/plain; charset=US-ASCII'
assert file_metadata_object.etag == 'latest::20'
assert file_metadata_object.original_names == ['thefile.txt']
assert file_metadata_object.extra == {
'fileId': '20',
'datasetVersion': 'latest',
Expand All @@ -53,3 +56,29 @@ def test_file_metadata(self, file_metadata_object):
'md5': '6b50249f91258397fc5cb7d5a4127e15',
},
}

def test_csv_file_metadata(self, csv_file_metadata_object):
    """Check the metadata exposed for the ``csv_native_file_metadata`` fixture.

    The fixture describes a file named ``thefile.tab`` whose original format is
    CSV, so ``original_names`` must offer both ``.csv`` and ``.CSV`` candidates.
    """
    meta = csv_file_metadata_object

    # Kind / identity
    assert meta.is_file
    assert not meta.is_folder
    assert meta.provider == 'dataverse'
    assert meta.kind == 'file'
    assert meta.file_id == '20'

    # Naming and paths
    assert meta.name == 'thefile.tab'
    assert meta.path == '/20'
    assert meta.materialized_path == '/thefile.tab'

    # Attributes expected to be falsy for this fixture
    assert not meta.size
    assert not meta.modified
    assert not meta.created_utc

    assert meta.content_type == 'text/tab-separated-values'
    assert meta.etag == 'latest::20'

    # Candidate pre-ingest names
    candidates = meta.original_names
    assert 'thefile.csv' in candidates
    assert 'thefile.CSV' in candidates

    expected_extra = {
        'fileId': '20',
        'datasetVersion': 'latest',
        'hasPublishedVersion': False,
        'hashes': {
            'md5': '6b50249f91258397fc5cb7d5a4127e15',
        },
    }
    assert meta.extra == expected_extra
27 changes: 27 additions & 0 deletions tests/providers/dataverse/test_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from waterbutler.core.path import WaterButlerPath
from waterbutler.providers.dataverse import settings as dvs
from waterbutler.providers.dataverse import DataverseProvider
from waterbutler.providers.dataverse.exceptions import DataverseIngestionLockError
from waterbutler.providers.dataverse.metadata import DataverseFileMetadata, DataverseRevision

from tests.providers.dataverse.fixtures import (
Expand Down Expand Up @@ -235,6 +236,32 @@ async def test_upload_create(self, provider, file_stream, native_file_metadata,
assert aiohttpretty.has_call(method='GET', uri=latest_url)
assert aiohttpretty.has_call(method='GET', uri=latest_published_url)

@pytest.mark.asyncio
@pytest.mark.aiohttpretty
async def test_upload_ingestion_exception(self, provider, file_stream, native_file_metadata,
                                          empty_native_dataset_metadata, native_dataset_metadata):
    """A 400 response whose body mentions the ingest lock raises
    ``DataverseIngestionLockError``.
    """
    upload_url = provider.build_url(dvs.EDIT_MEDIA_BASE_URL, 'study', provider.doi)
    aiohttpretty.register_uri(
        'POST',
        upload_url,
        status=400,
        body=b'something dataset lock: Ingest',
    )
    target = WaterButlerPath('/thefile.txt')

    with pytest.raises(DataverseIngestionLockError):
        await provider.upload(file_stream, target)

    # The upload must have reached the mocked endpoint before failing.
    assert aiohttpretty.has_call(method='POST', uri=upload_url)

@pytest.mark.asyncio
@pytest.mark.aiohttpretty
async def test_upload_random_exception(self, provider, file_stream, native_file_metadata,
                                       empty_native_dataset_metadata, native_dataset_metadata):
    """A 400 response without the ingest-lock marker raises ``exceptions.UploadError``."""
    upload_url = provider.build_url(dvs.EDIT_MEDIA_BASE_URL, 'study', provider.doi)
    aiohttpretty.register_uri(
        'POST',
        upload_url,
        status=400,
        body=b'something something error',
    )
    target = WaterButlerPath('/thefile.txt')

    with pytest.raises(exceptions.UploadError):
        await provider.upload(file_stream, target)

    # The upload must have reached the mocked endpoint before failing.
    assert aiohttpretty.has_call(method='POST', uri=upload_url)

@pytest.mark.asyncio
@pytest.mark.aiohttpretty
async def test_upload_updates(self, provider,
Expand Down
51 changes: 51 additions & 0 deletions tests/providers/dataverse/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import pytest

from waterbutler.providers.dataverse import utils as dv_utils


@pytest.fixture
def format_dict():
    """Return sample raw-metadata dicts keyed by the extension each should map to.

    Every sample shares the same ``contentType``; only the original MIME type
    and the original format label vary.
    """
    shared_content_type = 'text/tab-separated-values'

    # (key, originalFileFormat, originalFormatLabel)
    samples = (
        (
            'xlsx',
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
            'MS Excel (XLSX)',
        ),
        ('RData', 'application/x-rlang-transport', 'R Data'),
        ('sav', 'application/x-spss-sav', 'SPSS SAV'),
        ('dta', 'application/x-stata', 'Stata Binary'),
        ('por', 'application/x-spss-por', 'SPSS Portable'),
        ('csv', 'text/csv', 'Comma Separated Values'),
    )

    return {
        ext: {
            'originalFileFormat': mime_type,
            'originalFormatLabel': label,
            'contentType': shared_content_type,
        }
        for ext, mime_type, label in samples
    }


class TestUtils:
    """Tests for ``dv_utils.original_ext_from_raw_metadata``."""

    def test_original_ext_from_raw_metadata(self, format_dict):
        """Each sample's key is among the extensions returned for that sample."""
        for expected_ext, raw in format_dict.items():
            assert expected_ext in dv_utils.original_ext_from_raw_metadata(raw)

    def test_original_ext_from_raw_metadata_none_case(self, format_dict):
        """Replacing the format label with an unknown one yields ``None``."""
        for raw in format_dict.values():
            raw['originalFormatLabel'] = 'blarg'
            assert dv_utils.original_ext_from_raw_metadata(raw) is None
15 changes: 15 additions & 0 deletions waterbutler/providers/dataverse/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from http import HTTPStatus

from waterbutler.core.exceptions import UploadError


class DataverseIngestionLockError(UploadError):
    """Upload error carrying a fixed, user-facing "try again shortly" message.

    Intended for uploads that Dataverse rejects while the dataset is locked for
    ingestion.
    """

    def __init__(self, message, code=HTTPStatus.BAD_REQUEST):
        """Initialise the error with the fixed retry message.

        :param message: accepted but not used; the text passed to the parent is
            always the fixed message below.  The positional argument exists
            because children of ``WaterButlerError`` must be instantiable with a
            single integer argument.  See
            :class:`waterbutler.core.exceptions.WaterButlerError` for details.
        :param code: HTTP status reported with the error; defaults to
            ``HTTPStatus.BAD_REQUEST``.
        """
        super().__init__(
            'Some uploads to Dataverse will lock uploading for a time. Please wait'
            ' a few seconds and try again.',
            code=code)
18 changes: 18 additions & 0 deletions waterbutler/providers/dataverse/metadata.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from waterbutler.core import metadata
from waterbutler.providers.dataverse import utils as dv_utils


class BaseDataverseMetadata(metadata.BaseMetadata):
Expand Down Expand Up @@ -26,6 +27,23 @@ def file_id(self):
def name(self):
return self.raw.get('name', None) or self.raw.get('filename', None)

@property
def original_names(self):
    """Return the names this file may have had before Dataverse ingested it.

    Dataverse 'ingests' some file types, which changes their extension.  This
    property uses the raw metadata to work out which extensions the file could
    originally have had and pairs each with the current name's stem.

    :return: ``[self.name]`` when no original extension can be determined,
        otherwise one candidate name per possible original extension.
    :rtype: list
    """
    extensions = dv_utils.original_ext_from_raw_metadata(self.raw)
    if extensions is None:
        return [self.name]

    # Split on the last dot.  ``rpartition`` reports an empty separator when
    # the name has no dot at all; in that case keep the whole name as the stem
    # rather than slicing with ``rfind() == -1``, which would silently drop the
    # final character.
    stem, dot, _ = self.name.rpartition('.')
    if not dot:
        stem = self.name

    return ['{}.{}'.format(stem, ext) for ext in extensions]

@property
def path(self):
return self.build_path(self.file_id)
Expand Down
16 changes: 14 additions & 2 deletions waterbutler/providers/dataverse/provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from waterbutler.providers.dataverse import settings
from waterbutler.providers.dataverse.metadata import DataverseRevision
from waterbutler.providers.dataverse.metadata import DataverseDatasetMetadata
from waterbutler.providers.dataverse.exceptions import DataverseIngestionLockError


class DataverseProvider(provider.BaseProvider):
Expand Down Expand Up @@ -170,15 +171,26 @@ async def upload(self, stream, path, **kwargs):
headers=dv_headers,
auth=(self.token, ),
data=file_stream,
expects=(201, ),
expects=(201, 400,),
throws=exceptions.UploadError
)

if resp.status == 400:
data = await resp.read()
data = data.decode('utf-8')

if 'dataset lock: Ingest' in data:
raise DataverseIngestionLockError({'response': data})
else:
raise (await exceptions.exception_from_response(resp,
error=exceptions.UploadError))
await resp.release()

# Find appropriate version of file
metadata = await self._get_data('latest')
files = metadata if isinstance(metadata, list) else []
file_metadata = next(file for file in files if file.name == path.name)
file_metadata = next(file for file in files if (file.name == path.name or
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we not just use a list comprehension here?

path.name in file.original_names))

if stream.writers['md5'].hexdigest != file_metadata.extra['hashes']['md5']:
raise exceptions.UploadChecksumMismatchError()
Expand Down
58 changes: 58 additions & 0 deletions waterbutler/providers/dataverse/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Formats that Dataverse is known to ingest, keyed by canonical extension.
# Each entry records the three raw-metadata values used to recognise the format
# and every extension spelling the original file may have carried.
ORIGINAL_FORMATS = {
    # NOTE(review): a reviewer flagged that more than one entry may need the
    # 'R Data' label -- confirm against the formats Dataverse actually reports.
    'RData': {
        'original_format': 'application/x-rlang-transport',
        'original_label': 'R Data',
        'content_type': 'text/tab-separated-values',
        'all_extensions': ['rdata', 'Rdata', 'RData']
    },
    'sav': {
        'original_format': 'application/x-spss-sav',
        'original_label': 'SPSS SAV',
        'content_type': 'text/tab-separated-values',
        'all_extensions': ['sav']
    },
    'dta': {
        'original_format': 'application/x-stata',
        'original_label': 'Stata Binary',
        'content_type': 'text/tab-separated-values',
        'all_extensions': ['dta']
    },
    'por': {
        'original_format': 'application/x-spss-por',
        'original_label': 'SPSS Portable',
        'content_type': 'text/tab-separated-values',
        'all_extensions': ['por']
    },
    'csv': {
        'original_format': 'text/csv',
        'original_label': 'Comma Separated Values',
        'content_type': 'text/tab-separated-values',
        'all_extensions': ['csv', 'CSV']
    },
    'xlsx': {
        'original_format': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'original_label': 'MS Excel (XLSX)',
        'content_type': 'text/tab-separated-values',
        'all_extensions': ['xlsx']
    }
}


def original_ext_from_raw_metadata(data):
    """Use the raw metadata to figure out possible original extensions.

    :param dict data: raw file metadata; the keys ``originalFormatLabel``,
        ``originalFileFormat`` and ``contentType`` are consulted.
    :return: a new list of candidate extensions when all three values match an
        entry in ``ORIGINAL_FORMATS``; ``None`` when any value is missing or
        empty, or when no entry matches.
    :rtype: list or None
    """
    label = data.get('originalFormatLabel', None)
    file_format = data.get('originalFileFormat', None)
    content_type = data.get('contentType', None)

    if not label or not file_format or not content_type:
        return None

    for known in ORIGINAL_FORMATS.values():
        if (label == known['original_label'] and
                file_format == known['original_format'] and
                content_type == known['content_type']):
            # Hand back a copy so callers cannot mutate the shared table.
            return list(known['all_extensions'])

    return None