From 7dd691d52d00181fca7e84f60ae7603417f328d8 Mon Sep 17 00:00:00 2001
From: Alex Dutton
Date: Wed, 19 Aug 2020 10:47:38 +0100
Subject: [PATCH 01/41] Entry points for storage backends

This adds overridable config variables for mapping storage backend names
to classes (by default drawn from declared entry points) and setting a
default backend for new files. It also adds a pyfs storage backend entry
point, and makes it the default.

Storage backends should be `invenio_files_rest.storage:FileStorage`
subclasses, and declared with an `invenio_files_rest.storage` entry point.
---
 invenio_files_rest/config.py | 11 +++++++++++
 setup.py                     |  3 +++
 2 files changed, 14 insertions(+)

diff --git a/invenio_files_rest/config.py b/invenio_files_rest/config.py
index f45805da..94b72701 100644
--- a/invenio_files_rest/config.py
+++ b/invenio_files_rest/config.py
@@ -10,6 +10,8 @@
 
 from datetime import timedelta
 
+import pkg_resources
+
 from invenio_files_rest.helpers import create_file_streaming_redirect_response
 
 MAX_CONTENT_LENGTH = 16 * 1024 * 1024
@@ -57,6 +59,15 @@
 FILES_REST_STORAGE_FACTORY = 'invenio_files_rest.storage.pyfs_storage_factory'
 """Import path of factory used to create a storage instance."""
 
+FILES_REST_DEFAULT_STORAGE_BACKEND = 'pyfs'
+"""The default storage backend name."""
+
+FILES_REST_STORAGE_BACKENDS = {
+    ep.name: ep.load()
+    for ep in pkg_resources.iter_entry_points('invenio_files_rest.storage')
+}
+"""A mapping from storage backend names to classes."""
+
 FILES_REST_PERMISSION_FACTORY = \
     'invenio_files_rest.permissions.permission_factory'
 """Permission factory to control the files access from the REST interface."""
diff --git a/setup.py b/setup.py
index 94a276fb..0e19d113 100644
--- a/setup.py
+++ b/setup.py
@@ -153,6 +153,9 @@
         'invenio_db.models': [
            'invenio_files_rest = invenio_files_rest.models',
         ],
+        'invenio_files_rest.storage': [
+            'pyfs = invenio_files_rest.storage:PyFSFileStorage',
+        ]
     },
     extras_require=extras_require,
     install_requires=install_requires,

From 0bf2176e51c0ee08998ef00337f6264f00bbc334 Mon Sep 17 00:00:00 2001
From: Alex Dutton
Date: Wed, 19 Aug 2020 10:59:22 +0100
Subject: [PATCH 02/41] Add a class for computing checksums, counting bytes
 and reporting progress

At the moment, the storage backend is responsible for computing checksums
and reporting the size of files based on the number of bytes read. However,
these are common concerns across all storage backends, and so this
functionality can be pulled to the other side of the storage backend
interface.

This class is therefore intended to wrap any file-like object, and to
checksum the file as it is read. It also counts bytes and calls the
progress callback if one is provided.

In a subsequent commit we'll integrate it with FileInstance, and remove
the functionality from the PyFS storage backend.
---
 invenio_files_rest/utils.py | 40 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/invenio_files_rest/utils.py b/invenio_files_rest/utils.py
index 1345699c..84374aa1 100644
--- a/invenio_files_rest/utils.py
+++ b/invenio_files_rest/utils.py
@@ -7,8 +7,9 @@
 # under the terms of the MIT License; see LICENSE file for more details.
"""Implementation of various utility functions.""" - +import hashlib import mimetypes +from typing import Callable import six from flask import current_app @@ -56,3 +57,40 @@ def guess_mimetype(filename): if encoding: m = ENCODING_MIMETYPES.get(encoding, None) return m or 'application/octet-stream' + + +class PassthroughChecksum: + def __init__(self, fp, hash_name, progress_callback: Callable[[int, int], None] = None, offset: int = 0): + """ + :param fp: A file-like option open for reading + :param hash_name: A hashlib hash algorithm name + :param progress_callback: An optional function that will receive the number of bytes read, and the total file + size + """ + self.hash_name = hash_name + self._sum = hashlib.new(hash_name) + self._fp = fp + self._bytes_read = 0 + self._progress_callback = progress_callback + self._offset = offset + + def read(self, size=-1): + chunk = self._fp.read(size) + self._bytes_read += len(chunk) + self._sum.update(chunk) + if self._progress_callback: + self._progress_callback(self._bytes_read, self._bytes_read + self._offset) + return chunk + + def __getattr__(self, item): + return getattr(self._fp, item) + + @property + def checksum(self): + """The {hash_name}:{hash} string for the bytes read so far.""" + return '{0}:{1}'.format( + self.hash_name, self._sum.hexdigest()) + + @property + def bytes_read(self): + return self._bytes_read From 7096e9746013e3852fa09a207def4702418b5d9a Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Wed, 19 Aug 2020 11:16:06 +0100 Subject: [PATCH 03/41] Add a new extensible storage factory This new storage factory uses a `FileInstance.storage_backend` attribute to determine which backend to use. It uses the `FILES_REST_DEFAULT_STORAGE_BACKEND` config value when the file instance doesn't already have a storage backend. It can be extended to customize how the backend is chosen, and how it is initialised. The file path is chosen based on the `id` of the file instance, but this can be customised too. It's not set as the default storage factory to preserve backwards compatibility. 
--- invenio_files_rest/ext.py | 2 ++ invenio_files_rest/storage/__init__.py | 2 ++ invenio_files_rest/storage/factory.py | 35 ++++++++++++++++++++++++++ 3 files changed, 39 insertions(+) create mode 100644 invenio_files_rest/storage/factory.py diff --git a/invenio_files_rest/ext.py b/invenio_files_rest/ext.py index 5906ff42..e9ae6175 100644 --- a/invenio_files_rest/ext.py +++ b/invenio_files_rest/ext.py @@ -10,6 +10,8 @@ from __future__ import absolute_import, print_function +import warnings + from flask import abort from werkzeug.exceptions import UnprocessableEntity from werkzeug.utils import cached_property diff --git a/invenio_files_rest/storage/__init__.py b/invenio_files_rest/storage/__init__.py index 0112dca8..53e9f2a2 100644 --- a/invenio_files_rest/storage/__init__.py +++ b/invenio_files_rest/storage/__init__.py @@ -12,9 +12,11 @@ from .base import FileStorage from .pyfs import PyFSFileStorage, pyfs_storage_factory +from .factory import StorageFactory __all__ = ( 'FileStorage', 'pyfs_storage_factory', 'PyFSFileStorage', + 'StorageFactory', ) diff --git a/invenio_files_rest/storage/factory.py b/invenio_files_rest/storage/factory.py new file mode 100644 index 00000000..a1b9678a --- /dev/null +++ b/invenio_files_rest/storage/factory.py @@ -0,0 +1,35 @@ +import os +from typing import Any, Dict, Type + +from invenio_files_rest.models import FileInstance +from .base import FileStorage + + +class StorageFactory: + """A base storage factory, with sensible default behaviour.""" + + def __init__(self, app): + self.app = app + + def __call__(self, fileinstance: FileInstance) -> FileStorage: + """Returns a FileStorage instance for a file, for manipulating file contents.""" + storage_backend_cls = self.get_storage_backend_for_file_instance(fileinstance) + storage_backend_kwargs = self.get_storage_backend_kwargs(storage_backend_cls, fileinstance) + + if not fileinstance.uri: + fileinstance.uri = self.get_filepath_for_file_instance(fileinstance) + if not fileinstance.storage_backend: + fileinstance.storage_backend = storage_backend_cls.backend_name + + return storage_backend_cls(fileinstance.uri, **storage_backend_kwargs) + + def get_storage_backend_for_file_instance(self, fileinstance: FileInstance) -> Type[FileStorage]: + storage_backend = fileinstance.storage_backend or self.app.config['FILES_REST_DEFAULT_STORAGE_BACKEND'] + return self.app.config['FILES_REST_STORAGE_BACKENDS'][storage_backend] + + def get_filepath_for_file_instance(self, fileinstance: FileInstance) -> str: + id = fileinstance.id.hex + return os.path.join(id[0:2], id[2:4], id[4:6], id) + + def get_storage_backend_kwargs(self, storage_backend_cls: Type[FileStorage], fileinstance: FileInstance) -> Dict[str, Any]: + return {} From 9715111a616f24aee7edfd31dfbbcf96289d795e Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Wed, 19 Aug 2020 11:57:51 +0100 Subject: [PATCH 04/41] _FilesRESTState.storage_factory deprecates old storage factory, can init classes This property can now instantiate class-based storage factories, i.e. the new factory in `invenio_files_rest.storage.factory`. It's possible the deprecation should be in the pyfs storage factory itself. 
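
For example, an application would opt in to the class-based factory like
this (a sketch; the old `pyfs_storage_factory` remains the default for
backwards compatibility):

    # Application config. The import path is resolved by
    # load_or_import_from_config(); because it resolves to a class rather
    # than a function, the extension instantiates it with the Flask app.
    FILES_REST_STORAGE_FACTORY = 'invenio_files_rest.storage:StorageFactory'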
--- invenio_files_rest/ext.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/invenio_files_rest/ext.py b/invenio_files_rest/ext.py index e9ae6175..09eaa4f8 100644 --- a/invenio_files_rest/ext.py +++ b/invenio_files_rest/ext.py @@ -32,9 +32,19 @@ def __init__(self, app): @cached_property def storage_factory(self): """Load default storage factory.""" - return load_or_import_from_config( + if self.app.config['FILES_REST_STORAGE_FACTORY'] in [ + 'invenio_files_rest.storage.pyfs_storage_factory', + ]: + warnings.warn(DeprecationWarning( + "The " + self.app.config['FILES_REST_STORAGE_FACTORY'] + " storage factory has been deprecated in " + "favour of 'invenio_files_rest.storage:StorageFactory" + )) + storage_factory = load_or_import_from_config( 'FILES_REST_STORAGE_FACTORY', app=self.app ) + if isinstance(storage_factory, type): + storage_factory = storage_factory(self.app) + return storage_factory @cached_property def permission_factory(self): From 7774b7d4aeb1be9c455ff9ef3ede96ea439e8fc2 Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Wed, 19 Aug 2020 12:49:29 +0100 Subject: [PATCH 05/41] Add some type annotations These aren't Py3.6-compatible, so will probably need to be removed later, or replaced with comment-based annotations. --- invenio_files_rest/models.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/invenio_files_rest/models.py b/invenio_files_rest/models.py index 58181a18..b9f9b47a 100644 --- a/invenio_files_rest/models.py +++ b/invenio_files_rest/models.py @@ -33,7 +33,7 @@ model. """ -from __future__ import absolute_import, print_function +from __future__ import absolute_import, annotations, print_function import mimetypes @@ -42,6 +42,11 @@ import sys import uuid from datetime import datetime +from functools import wraps +from os.path import basename +from typing import TYPE_CHECKING + +import six from flask import current_app from functools import wraps from invenio_db import db @@ -60,6 +65,9 @@ from .proxies import current_files_rest from .utils import ENCODING_MIMETYPES, guess_mimetype +if TYPE_CHECKING: + from .storage import FileStorage + slug_pattern = re.compile('^[a-z][a-z0-9-]+$') @@ -740,7 +748,7 @@ def delete(self): self.query.filter_by(id=self.id).delete() return self - def storage(self, **kwargs): + def storage(self, **kwargs) -> FileStorage: """Get storage interface for object. Uses the applications storage factory to create a storage interface From bca7ee2f234503cb2a30ba1af237a3408c3d64f0 Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Wed, 19 Aug 2020 13:09:29 +0100 Subject: [PATCH 06/41] Deprecate passing **kwargs to the storage factory --- invenio_files_rest/models.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/invenio_files_rest/models.py b/invenio_files_rest/models.py index b9f9b47a..660228e2 100644 --- a/invenio_files_rest/models.py +++ b/invenio_files_rest/models.py @@ -41,6 +41,7 @@ import six import sys import uuid +import warnings from datetime import datetime from functools import wraps from os.path import basename @@ -756,6 +757,10 @@ def storage(self, **kwargs) -> FileStorage: :returns: Storage interface. 
""" + if kwargs: + warnings.warn("Passing **kwargs to .storage() is deprecated; override the storage factory with a subclass " + "of invenio_files_rest.storage.StorageFactory and implement get_storage_backend_kwargs() " + "instead.", DeprecationWarning) return current_files_rest.storage_factory(fileinstance=self, **kwargs) @ensure_readable() From f5244b39dc52ed2c4787aa70706fd6c2c5a5a55e Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Wed, 19 Aug 2020 13:13:23 +0100 Subject: [PATCH 07/41] Use the passthrough checksummer This will mean we can later remove this functionality from the storage backend. --- invenio_files_rest/models.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/invenio_files_rest/models.py b/invenio_files_rest/models.py index 660228e2..3abd6ae5 100644 --- a/invenio_files_rest/models.py +++ b/invenio_files_rest/models.py @@ -64,7 +64,7 @@ MultipartInvalidChunkSize, MultipartInvalidPartNumber, \ MultipartInvalidSize, MultipartMissingParts, MultipartNotCompleted from .proxies import current_files_rest -from .utils import ENCODING_MIMETYPES, guess_mimetype +from .utils import ENCODING_MIMETYPES, PassthroughChecksum, guess_mimetype if TYPE_CHECKING: from .storage import FileStorage @@ -836,10 +836,23 @@ def set_contents(self, stream, chunk_size=None, size=None, size_limit=None, from. :param stream: File-like stream. """ - self.set_uri( - *self.storage(**kwargs).save( - stream, chunk_size=chunk_size, size=size, - size_limit=size_limit, progress_callback=progress_callback)) + wrapped_stream = PassthroughChecksum(stream, + hash_name='md5', + progress_callback=progress_callback) + + storage = self.storage(**kwargs) + storage.save(wrapped_stream, chunk_size=chunk_size, size=size, + size_limit=size_limit, progress_callback=progress_callback) + + self.storage_backend = type(storage).backend_name + self.size = wrapped_stream.bytes_read + self.checksum = wrapped_stream.checksum + self.uri = storage.filepath + + # TODO: Should these be set here, or sent back from the storage backend? + self.readable = True + self.writable = False + self.storage_class = 'S' @ensure_writable() def copy_contents(self, fileinstance, progress_callback=None, @@ -850,11 +863,8 @@ def copy_contents(self, fileinstance, progress_callback=None, if not self.size == 0: raise ValueError('File instance has data.') - self.set_uri( - *self.storage(**kwargs).copy( - fileinstance.storage(**kwargs), - chunk_size=chunk_size, - progress_callback=progress_callback)) + with fileinstance.storage(**kwargs).open() as f: + self.set_contents(f, progress_callback=progress_callback, chunk_size=chunk_size, **kwargs) @ensure_readable() def send_file(self, filename, restricted=True, mimetype=None, From c39871afb9ffd94fc390376b764cb4517be56240 Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Wed, 19 Aug 2020 13:14:57 +0100 Subject: [PATCH 08/41] Add a metaclass for FileStorage to return the backend name This is exposed as a class property on FileStorage and can be accessed as `FileStorage.backend_name`. It returns the associated backend name in the application config, which can then be used to find the class. 
--- invenio_files_rest/storage/base.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/invenio_files_rest/storage/base.py b/invenio_files_rest/storage/base.py index a48b761e..08d540d6 100644 --- a/invenio_files_rest/storage/base.py +++ b/invenio_files_rest/storage/base.py @@ -14,6 +14,8 @@ from calendar import timegm from functools import partial +from flask import current_app + from ..errors import FileSizeError, StorageError, UnexpectedFileSizeError from ..helpers import chunk_size_or_default, compute_checksum, send_stream @@ -53,7 +55,22 @@ def check_size(bytes_written, total_size): description='File is smaller than expected.') -class FileStorage(object): +class FileStorageMeta(type): + @property + def backend_name(cls): + try: + return cls._backend_name + except AttributeError: + for name, backend_cls in current_app.config['FILES_REST_STORAGE_BACKENDS'].items(): + if cls is backend_cls: + cls._backend_name = name + break + else: + raise RuntimeError("{} isn't listed in FILES_REST_STORAGE_BACKENDS config".format(cls)) + return cls._backend_name + + +class FileStorage(metaclass=FileStorageMeta): """Base class for storage interface to a single file.""" def __init__(self, size=None, modified=None): From 37bd798f12c8820061ee16a08ba1aac78224534f Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Wed, 19 Aug 2020 13:17:35 +0100 Subject: [PATCH 09/41] Simplify the pyfs storage backend implementation It no longer needs to do checksumming, and the filepath can be moved to the superclass. --- invenio_files_rest/storage/base.py | 6 +++++- invenio_files_rest/storage/pyfs.py | 22 +++++++++++----------- tests/test_storage.py | 7 ++----- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/invenio_files_rest/storage/base.py b/invenio_files_rest/storage/base.py index 08d540d6..ded2ad14 100644 --- a/invenio_files_rest/storage/base.py +++ b/invenio_files_rest/storage/base.py @@ -11,7 +11,9 @@ from __future__ import absolute_import, print_function import hashlib +import warnings from calendar import timegm +from datetime import datetime from functools import partial from flask import current_app @@ -73,8 +75,9 @@ def backend_name(cls): class FileStorage(metaclass=FileStorageMeta): """Base class for storage interface to a single file.""" - def __init__(self, size=None, modified=None): + def __init__(self, filepath: str, size: int=None, modified: datetime=None): """Initialize storage object.""" + self.filepath = filepath self._size = size self._modified = timegm(modified.timetuple()) if modified else None @@ -159,6 +162,7 @@ def copy(self, src, chunk_size=None, progress_callback=None): :param src: Source stream. :param chunk_size: Chunk size to read from source stream. 
""" + warnings.warn("Call save with the other already-open FileStorage passed in instead.", DeprecationWarning) fp = src.open(mode='rb') try: return self.save( diff --git a/invenio_files_rest/storage/pyfs.py b/invenio_files_rest/storage/pyfs.py index 7a09ef08..81c15e1a 100644 --- a/invenio_files_rest/storage/pyfs.py +++ b/invenio_files_rest/storage/pyfs.py @@ -10,6 +10,8 @@ from __future__ import absolute_import, print_function +import shutil + from flask import current_app from fs.opener import opener from fs.path import basename, dirname @@ -33,11 +35,16 @@ class PyFSFileStorage(FileStorage): """ - def __init__(self, fileurl, size=None, modified=None, clean_dir=True): + def __init__(self, *args, clean_dir=True, **kwargs): """Storage initialization.""" - self.fileurl = fileurl + # if isinstance(args[0], str): + # raise NotImplementedError self.clean_dir = clean_dir - super(PyFSFileStorage, self).__init__(size=size, modified=modified) + super(PyFSFileStorage, self).__init__(*args, **kwargs) + + @property + def fileurl(self): + return self.filepath def _get_fs(self, create_dir=True): """Return tuple with filesystem and filename.""" @@ -98,10 +105,7 @@ def save(self, incoming_stream, size_limit=None, size=None, """Save file in the file system.""" fp = self.open(mode='wb') try: - bytes_written, checksum = self._write_stream( - incoming_stream, fp, chunk_size=chunk_size, - progress_callback=progress_callback, - size_limit=size_limit, size=size) + shutil.copyfileobj(incoming_stream, fp, length=chunk_size) except Exception: fp.close() self.delete() @@ -109,10 +113,6 @@ def save(self, incoming_stream, size_limit=None, size=None, finally: fp.close() - self._size = bytes_written - - return self.fileurl, bytes_written, checksum - def update(self, incoming_stream, seek=0, size=None, chunk_size=None, progress_callback=None): """Update a file in the file system.""" diff --git a/tests/test_storage.py b/tests/test_storage.py index a8f148ef..b1e4d656 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -27,7 +27,7 @@ def test_storage_interface(): """Test storage interface.""" - s = FileStorage() + s = FileStorage('some-path') pytest.raises(NotImplementedError, s.open) pytest.raises(NotImplementedError, s.initialize) pytest.raises(NotImplementedError, s.delete) @@ -75,11 +75,8 @@ def test_pyfs_delete_fail(pyfs, pyfs_testpath): def test_pyfs_save(pyfs, pyfs_testpath, get_md5): """Test basic save operation.""" data = b'somedata' - uri, size, checksum = pyfs.save(BytesIO(data)) + pyfs.save(BytesIO(data)) - assert uri == pyfs_testpath - assert size == len(data) - assert checksum == get_md5(data) assert exists(pyfs_testpath) assert open(pyfs_testpath, 'rb').read() == data From 568cf754dea50e0dca3d2ac581b0a87bd8ecd580 Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Wed, 19 Aug 2020 16:50:18 +0100 Subject: [PATCH 10/41] fixup! 
Add a new extensible storage factory --- invenio_files_rest/storage/factory.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/invenio_files_rest/storage/factory.py b/invenio_files_rest/storage/factory.py index a1b9678a..7abcc9c1 100644 --- a/invenio_files_rest/storage/factory.py +++ b/invenio_files_rest/storage/factory.py @@ -13,23 +13,24 @@ def __init__(self, app): def __call__(self, fileinstance: FileInstance) -> FileStorage: """Returns a FileStorage instance for a file, for manipulating file contents.""" - storage_backend_cls = self.get_storage_backend_for_file_instance(fileinstance) - storage_backend_kwargs = self.get_storage_backend_kwargs(storage_backend_cls, fileinstance) - if not fileinstance.uri: - fileinstance.uri = self.get_filepath_for_file_instance(fileinstance) if not fileinstance.storage_backend: - fileinstance.storage_backend = storage_backend_cls.backend_name + fileinstance.storage_backend = self.get_storage_backend_name(fileinstance) + + storage_backend_cls = self.app.config['FILES_REST_STORAGE_BACKENDS'][fileinstance.storage_backend] + storage_backend_kwargs = self.get_storage_backend_kwargs(fileinstance, storage_backend_cls) + + if not fileinstance.uri: + fileinstance.uri = self.get_uri(fileinstance, storage_backend_cls) return storage_backend_cls(fileinstance.uri, **storage_backend_kwargs) - def get_storage_backend_for_file_instance(self, fileinstance: FileInstance) -> Type[FileStorage]: - storage_backend = fileinstance.storage_backend or self.app.config['FILES_REST_DEFAULT_STORAGE_BACKEND'] - return self.app.config['FILES_REST_STORAGE_BACKENDS'][storage_backend] + def get_storage_backend_name(self, fileinstance: FileInstance) -> str: + return self.app.config['FILES_REST_DEFAULT_STORAGE_BACKEND'] - def get_filepath_for_file_instance(self, fileinstance: FileInstance) -> str: + def get_storage_backend_kwargs(self, fileinstance: FileInstance, storage_backend_cls: Type[FileStorage]) -> Dict[str, Any]: + return {} + + def get_uri(self, fileinstance: FileInstance, storage_backend_cls: Type[FileStorage]) -> str: id = fileinstance.id.hex return os.path.join(id[0:2], id[2:4], id[4:6], id) - - def get_storage_backend_kwargs(self, storage_backend_cls: Type[FileStorage], fileinstance: FileInstance) -> Dict[str, Any]: - return {} From 9bcd90038ae2803e00c1501e3ea6df400504f7fa Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Thu, 20 Aug 2020 12:45:25 +0100 Subject: [PATCH 11/41] fixup! 
Add a new extensible storage factory --- invenio_files_rest/storage/factory.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/invenio_files_rest/storage/factory.py b/invenio_files_rest/storage/factory.py index 7abcc9c1..8a2f5352 100644 --- a/invenio_files_rest/storage/factory.py +++ b/invenio_files_rest/storage/factory.py @@ -11,13 +11,13 @@ class StorageFactory: def __init__(self, app): self.app = app - def __call__(self, fileinstance: FileInstance) -> FileStorage: + def __call__(self, fileinstance: FileInstance, preferred_backend_name: str = None) -> FileStorage: """Returns a FileStorage instance for a file, for manipulating file contents.""" if not fileinstance.storage_backend: - fileinstance.storage_backend = self.get_storage_backend_name(fileinstance) + fileinstance.storage_backend = self.get_storage_backend_name(fileinstance, preferred_backend_name) - storage_backend_cls = self.app.config['FILES_REST_STORAGE_BACKENDS'][fileinstance.storage_backend] + storage_backend_cls = self.resolve_storage_backend(fileinstance.storage_backend) storage_backend_kwargs = self.get_storage_backend_kwargs(fileinstance, storage_backend_cls) if not fileinstance.uri: @@ -25,9 +25,13 @@ def __call__(self, fileinstance: FileInstance) -> FileStorage: return storage_backend_cls(fileinstance.uri, **storage_backend_kwargs) - def get_storage_backend_name(self, fileinstance: FileInstance) -> str: + def get_storage_backend_name(self, fileinstance: FileInstance, preferred_backend_name: str = None) -> str: + """""" return self.app.config['FILES_REST_DEFAULT_STORAGE_BACKEND'] + def resolve_storage_backend(self, backend_name: str) -> Type[FileStorage]: + return self.app.config['FILES_REST_STORAGE_BACKENDS'][backend_name] + def get_storage_backend_kwargs(self, fileinstance: FileInstance, storage_backend_cls: Type[FileStorage]) -> Dict[str, Any]: return {} From 6e9412828fa9b70e01f57b33c1b52b3db5e5fe25 Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Tue, 8 Sep 2020 09:42:46 +0100 Subject: [PATCH 12/41] intermediate --- docs/index.rst | 1 + docs/new-storage-backend.rst | 3 + invenio_files_rest/models.py | 75 +++++- invenio_files_rest/storage/base.py | 22 +- invenio_files_rest/storage/factory.py | 87 +++++- invenio_files_rest/storage/pyfs.py | 14 +- tests/conftest.py | 10 +- tests/legacy_storage_interface.py | 366 ++++++++++++++++++++++++++ tests/test_legacy_storage.py | 348 ++++++++++++++++++++++++ tests/test_storage.py | 8 +- 10 files changed, 897 insertions(+), 37 deletions(-) create mode 100644 docs/new-storage-backend.rst create mode 100644 tests/legacy_storage_interface.py create mode 100644 tests/test_legacy_storage.py diff --git a/docs/index.rst b/docs/index.rst index b7a74ed0..30155ec1 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -22,6 +22,7 @@ Invenio-Files-REST. 
installation configuration usage + new-storage-backend API Reference diff --git a/docs/new-storage-backend.rst b/docs/new-storage-backend.rst new file mode 100644 index 00000000..576cd3c6 --- /dev/null +++ b/docs/new-storage-backend.rst @@ -0,0 +1,3 @@ +Developing a new storage backend +================================ + diff --git a/invenio_files_rest/models.py b/invenio_files_rest/models.py index 3abd6ae5..79f02e2d 100644 --- a/invenio_files_rest/models.py +++ b/invenio_files_rest/models.py @@ -45,7 +45,7 @@ from datetime import datetime from functools import wraps from os.path import basename -from typing import TYPE_CHECKING +from typing import Dict, TYPE_CHECKING, Tuple, Union import six from flask import current_app @@ -271,6 +271,9 @@ class Location(db.Model, Timestamp): name = db.Column(db.String(20), unique=True, nullable=False) """External identifier of the location.""" + storage_backend = db.Column(db.String(32), nullable=True) + """Name of the storage backend to use for this location.""" + uri = db.Column(db.String(255), nullable=False) """URI of the location.""" @@ -674,6 +677,8 @@ class FileInstance(db.Model, Timestamp): storage_class = db.Column(db.String(1), nullable=True) """Storage class of file.""" + backend_name = db.Column(db.String(32), nullable=True) + size = db.Column(db.BigInteger, default=0, nullable=True) """Size of file.""" @@ -804,13 +809,47 @@ def verify_checksum(self, progress_callback=None, chunk_size=None, else (self.checksum == real_checksum)) self.last_check_at = datetime.utcnow() return self.last_check + return current_files_rest.storage_factory(fileinstance=self, **kwargs) @ensure_writable() - def init_contents(self, size=0, **kwargs): + def initialize(self, preferred_location: Location, size=0, **kwargs): """Initialize file.""" - self.set_uri( - *self.storage(**kwargs).initialize(size=size), - readable=False, writable=True) + if hasattr(current_files_rest.storage_factory, 'initialize'): + # New behaviour, with a new-style storage factory + result = current_files_rest.storage_factory.initialize( + fileinstance=self, + preferred_location=preferred_location, + size=size, + ) + else: + # Old behaviour, with an old-style storage factory + storage = self.storage(default_location=preferred_location.uri, **kwargs) + if isinstance(storage.initialize.__self__, type): + # New storage backend, but old storage factory + # New storage backends have `initialize` as a classmethod. The storage backend will have constructed + # a URI and passed it to the storage's __init__, so we can grab it from there. 
+ result = storage.initialize(suggested_uri=storage.uri, size=size) + else: + result = storage.initialize(size=size) + self.update_file_metadata( + result, + readable=False, + writable=True, + storage_backend=type(storage).backend_name, + ) + + + @ensure_writable() + def init_contents(self, size=0, default_location: str=None, **kwargs): + if default_location: + preferred_location = Location(uri=default_location) + else: + preferred_location = None + return self.initialize( + preferred_location=preferred_location, + size=size, + **kwargs + ) @ensure_writable() def update_contents(self, stream, seek=0, size=None, chunk_size=None, @@ -895,6 +934,22 @@ def set_uri(self, uri, size, checksum, readable=True, writable=False, storage_class return self + _FILE_METADATA_FIELDS = {'uri', 'size', 'checksum', 'writable', 'readable', 'storage_class'} + + def update_file_metadata(self, file_metadata: Union[Tuple,Dict] = None, **kwargs): + if file_metadata is None: + file_metadata = {} + + if isinstance(file_metadata, tuple): + self.set_uri(*file_metadata) + elif isinstance(file_metadata, dict): + file_metadata.update(kwargs) + for key in file_metadata: + if key in self._FILE_METADATA_FIELDS: + setattr(self, key, file_metadata[key]) + + + class ObjectVersion(db.Model, Timestamp): """Model for storing versions of objects. @@ -1036,7 +1091,7 @@ def set_contents(self, stream, chunk_size=None, size=None, size_limit=None, @ensure_no_file() @update_bucket_size - def set_location(self, uri, size, checksum, storage_class=None): + def set_location(self, uri, size, checksum, storage_class=None, storage_backend=None): """Set only URI location of for object. Useful to link files on externally controlled storage. If a file @@ -1050,8 +1105,12 @@ def set_location(self, uri, size, checksum, storage_class=None): :param storage_class: Storage class where file is stored () """ self.file = FileInstance() - self.file.set_uri( - uri, size, checksum, storage_class=storage_class + self.file.update_file_metadata( + storage_backend=storage_backend, + uri=uri, + size=size, + checksum=checksum, + storage_class=storage_class ) db.session.add(self.file) return self diff --git a/invenio_files_rest/storage/base.py b/invenio_files_rest/storage/base.py index ded2ad14..18c170a7 100644 --- a/invenio_files_rest/storage/base.py +++ b/invenio_files_rest/storage/base.py @@ -11,6 +11,7 @@ from __future__ import absolute_import, print_function import hashlib +import urllib.parse import warnings from calendar import timegm from datetime import datetime @@ -19,7 +20,8 @@ from flask import current_app from ..errors import FileSizeError, StorageError, UnexpectedFileSizeError -from ..helpers import chunk_size_or_default, compute_checksum, send_stream +from ..helpers import chunk_size_or_default, compute_checksum, make_path, send_stream +from ..models import FileInstance def check_sizelimit(size_limit, bytes_written, total_size): @@ -75,9 +77,9 @@ def backend_name(cls): class FileStorage(metaclass=FileStorageMeta): """Base class for storage interface to a single file.""" - def __init__(self, filepath: str, size: int=None, modified: datetime=None): + def __init__(self, uri: str=None, size: int=None, modified: datetime=None, *, filepath=None): """Initialize storage object.""" - self.filepath = filepath + self.uri = uri or filepath self._size = size self._modified = timegm(modified.timetuple()) if modified else None @@ -92,10 +94,12 @@ def delete(self): """Delete the file.""" raise NotImplementedError - def initialize(self, size=0): + @classmethod + def 
initialize(cls, suggested_uri, size=0): """Initialize the file on the storage + truncate to the given size.""" raise NotImplementedError + def save(self, incoming_stream, size_limit=None, size=None, chunk_size=None, progress_callback=None): """Save incoming stream to file storage.""" @@ -170,6 +174,16 @@ def copy(self, src, chunk_size=None, progress_callback=None): finally: fp.close() + @classmethod + def get_uri(self, fileinstance: FileInstance, base_uri: str) -> str: + return make_path( + base_uri, + str(fileinstance.id), + 'data', + current_app.config['FILES_REST_STORAGE_PATH_DIMENSIONS'], + current_app.config['FILES_REST_STORAGE_PATH_SPLIT_LENGTH'], + ) + # # Helpers # diff --git a/invenio_files_rest/storage/factory.py b/invenio_files_rest/storage/factory.py index 8a2f5352..b98e583e 100644 --- a/invenio_files_rest/storage/factory.py +++ b/invenio_files_rest/storage/factory.py @@ -1,8 +1,12 @@ import os +import urllib.parse from typing import Any, Dict, Type -from invenio_files_rest.models import FileInstance +from flask import current_app + +from invenio_files_rest.models import FileInstance, Location from .base import FileStorage +from ..helpers import make_path class StorageFactory: @@ -11,30 +15,85 @@ class StorageFactory: def __init__(self, app): self.app = app - def __call__(self, fileinstance: FileInstance, preferred_backend_name: str = None) -> FileStorage: - """Returns a FileStorage instance for a file, for manipulating file contents.""" - + def __call__( + self, + fileinstance: FileInstance, + ) -> FileStorage: + """Returns a FileStorage instance for a file, for manipulating file contents. + """ if not fileinstance.storage_backend: - fileinstance.storage_backend = self.get_storage_backend_name(fileinstance, preferred_backend_name) + return None + + storage_backend_cls = self.resolve_storage_backend(fileinstance.storage_backend) + storage_backend_kwargs = self.get_storage_backend_kwargs(fileinstance, storage_backend_cls) + + return storage_backend_cls( + uri=fileinstance.uri, + **storage_backend_kwargs + ) + + def initialize( + self, + fileinstance: FileInstance, + size: int = 0, + preferred_location: Location = None + ) -> FileStorage: + if fileinstance.storage_backend: + return self(fileinstance) + + location = self.get_location(fileinstance, preferred_location) + + fileinstance.storage_backend = location.storage_backend storage_backend_cls = self.resolve_storage_backend(fileinstance.storage_backend) storage_backend_kwargs = self.get_storage_backend_kwargs(fileinstance, storage_backend_cls) - if not fileinstance.uri: - fileinstance.uri = self.get_uri(fileinstance, storage_backend_cls) + if storage_backend_cls.initialize.__self__ == storage_backend_cls: + # New behaviour, as initialize should now be a classmethod + return storage_backend_cls.initialize( + suggested_uri=self.get_suggested_uri( + fileinstance=fileinstance, + location=location, + storage_backend_cls=storage_backend_cls, + ), + size=size, + ) + else: + # Old behaviour + return self( + fileinstance, + default_location=location.uri, + ).initialize(size=size) - return storage_backend_cls(fileinstance.uri, **storage_backend_kwargs) + def get_location(self, fileinstance: FileInstance, preferred_location: Location = None) -> Location: + return preferred_location or Location.get_default() - def get_storage_backend_name(self, fileinstance: FileInstance, preferred_backend_name: str = None) -> str: + def get_storage_backend_name(self, fileinstance: FileInstance, preferred_location: Location) -> str: """""" - return 
self.app.config['FILES_REST_DEFAULT_STORAGE_BACKEND'] + if not preferred_location: + raise ValueError("preferred_location required for default storage factory") + return preferred_location.storage_backend def resolve_storage_backend(self, backend_name: str) -> Type[FileStorage]: return self.app.config['FILES_REST_STORAGE_BACKENDS'][backend_name] - def get_storage_backend_kwargs(self, fileinstance: FileInstance, storage_backend_cls: Type[FileStorage]) -> Dict[str, Any]: + def get_storage_backend_kwargs( + self, + fileinstance: FileInstance, + storage_backend_cls: Type[FileStorage], + ) -> Dict[str, Any]: return {} - def get_uri(self, fileinstance: FileInstance, storage_backend_cls: Type[FileStorage]) -> str: - id = fileinstance.id.hex - return os.path.join(id[0:2], id[2:4], id[4:6], id) + def get_suggested_uri( + self, + fileinstance: FileInstance, + location: Location, + storage_backend_cls: Type[FileStorage], + ): + make_path( + location, + str(fileinstance.id), + 'data', + current_app.config['FILES_REST_STORAGE_PATH_DIMENSIONS'], + current_app.config['FILES_REST_STORAGE_PATH_SPLIT_LENGTH'], + ) diff --git a/invenio_files_rest/storage/pyfs.py b/invenio_files_rest/storage/pyfs.py index 81c15e1a..6a9921e9 100644 --- a/invenio_files_rest/storage/pyfs.py +++ b/invenio_files_rest/storage/pyfs.py @@ -43,13 +43,13 @@ def __init__(self, *args, clean_dir=True, **kwargs): super(PyFSFileStorage, self).__init__(*args, **kwargs) @property - def fileurl(self): - return self.filepath + def filepath(self): + return self.uri def _get_fs(self, create_dir=True): """Return tuple with filesystem and filename.""" - filedir = dirname(self.fileurl) - filename = basename(self.fileurl) + filedir = dirname(self.uri) + filename = basename(self.uri) return ( opener.opendir(filedir, writeable=True, create_dir=create_dir), @@ -77,8 +77,10 @@ def delete(self): fs.removedir('.') return True - def initialize(self, size=0): + @classmethod + def initialize(cls, suggested_uri, size=0): """Initialize file on storage and truncate to given size.""" + self = cls(uri=suggested_uri) fs, path = self._get_fs() # Required for reliably opening the file on certain file systems: @@ -98,7 +100,7 @@ def initialize(self, size=0): self._size = size - return self.fileurl, size, None + return {'uri': self.uri, 'size': size} def save(self, incoming_stream, size_limit=None, size=None, chunk_size=None, progress_callback=None): diff --git a/tests/conftest.py b/tests/conftest.py index 5daa35c7..a0cf8b51 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -43,6 +43,7 @@ object_read_version_all from invenio_files_rest.storage import PyFSFileStorage from invenio_files_rest.views import blueprint +from tests import legacy_storage_interface @compiles(DropTable, 'postgresql') @@ -135,7 +136,8 @@ def dummy_location(db): loc = Location( name='testloc', uri=tmppath, - default=True + storage_backend='pyfs', + default=True, ) db.session.add(loc) db.session.commit() @@ -157,6 +159,12 @@ def pyfs(dummy_location, pyfs_testpath): return PyFSFileStorage(pyfs_testpath) +@pytest.fixture() +def legacy_pyfs(dummy_location, pyfs_testpath): + """Instance of PyFSFileStorage.""" + return legacy_storage_interface.PyFSFileStorage(pyfs_testpath) + + @pytest.yield_fixture() def extra_location(db): """File system location.""" diff --git a/tests/legacy_storage_interface.py b/tests/legacy_storage_interface.py new file mode 100644 index 00000000..294186bf --- /dev/null +++ b/tests/legacy_storage_interface.py @@ -0,0 +1,366 @@ +# -*- coding: utf-8 -*- +# +# This file is 
part of Invenio. +# Copyright (C) 2016-2019 CERN. +# +# Invenio is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. + +"""The previous invenio_files_rest.storage implementation. + +These classes and factory are included so that we can test that there are no regressions against them when +implementing the new storage interface, and so ensure backwards compatibility.""" + +from __future__ import absolute_import, print_function + +import hashlib +from calendar import timegm +from functools import partial + +from flask import current_app + +from invenio_files_rest.helpers import make_path +from invenio_files_rest.errors import FileSizeError, StorageError, UnexpectedFileSizeError +from invenio_files_rest.helpers import chunk_size_or_default, compute_checksum, send_stream + +from fs.opener import opener +from fs.path import basename, dirname + +def check_sizelimit(size_limit, bytes_written, total_size): + """Check if size limit was exceeded. + :param size_limit: The size limit. + :param bytes_written: The total number of bytes written. + :param total_size: The total file size. + :raises invenio_files_rest.errors.UnexpectedFileSizeError: If the bytes + written exceed the total size. + :raises invenio_files_rest.errors.FileSizeError: If the bytes + written are major than the limit size. + """ + if size_limit is not None and bytes_written > size_limit: + desc = 'File size limit exceeded.' \ + if isinstance(size_limit, int) else size_limit.reason + raise FileSizeError(description=desc) + + # Never write more than advertised + if total_size is not None and bytes_written > total_size: + raise UnexpectedFileSizeError( + description='File is bigger than expected.') + + +def check_size(bytes_written, total_size): + """Check if expected amounts of bytes have been written. + :param bytes_written: The total number of bytes written. + :param total_size: The total file size. + :raises invenio_files_rest.errors.UnexpectedFileSizeError: If the bytes + written exceed the total size. + """ + if total_size and bytes_written < total_size: + raise UnexpectedFileSizeError( + description='File is smaller than expected.') + + +class FileStorage(object): + """Base class for storage interface to a single file.""" + + def __init__(self, size=None, modified=None): + """Initialize storage object.""" + self._size = size + self._modified = timegm(modified.timetuple()) if modified else None + + def open(self, mode=None): + """Open the file. + The caller is responsible for closing the file. 
+ """ + raise NotImplementedError + + def delete(self): + """Delete the file.""" + raise NotImplementedError + + def initialize(self, size=0): + """Initialize the file on the storage + truncate to the given size.""" + raise NotImplementedError + + def save(self, incoming_stream, size_limit=None, size=None, + chunk_size=None, progress_callback=None): + """Save incoming stream to file storage.""" + raise NotImplementedError + + def update(self, incoming_stream, seek=0, size=None, chunk_size=None, + progress_callback=None): + """Update part of file with incoming stream.""" + raise NotImplementedError + + # + # Default implementation + # + def send_file(self, filename, mimetype=None, restricted=True, + checksum=None, trusted=False, chunk_size=None, + as_attachment=False): + """Send the file to the client.""" + try: + fp = self.open(mode='rb') + except Exception as e: + raise StorageError('Could not send file: {}'.format(e)) + + try: + md5_checksum = None + if checksum: + algo, value = checksum.split(':') + if algo == 'md5': + md5_checksum = value + + # Send stream is responsible for closing the file. + return send_stream( + fp, + filename, + self._size, + self._modified, + mimetype=mimetype, + restricted=restricted, + etag=checksum, + content_md5=md5_checksum, + chunk_size=chunk_size, + trusted=trusted, + as_attachment=as_attachment, + ) + except Exception as e: + fp.close() + raise StorageError('Could not send file: {}'.format(e)) + + def checksum(self, chunk_size=None, progress_callback=None, **kwargs): + """Compute checksum of file.""" + fp = self.open(mode='rb') + try: + value = self._compute_checksum( + fp, size=self._size, chunk_size=None, + progress_callback=progress_callback) + except StorageError: + raise + finally: + fp.close() + return value + + def copy(self, src, chunk_size=None, progress_callback=None): + """Copy data from another file instance. + :param src: Source stream. + :param chunk_size: Chunk size to read from source stream. + """ + fp = src.open(mode='rb') + try: + return self.save( + fp, chunk_size=chunk_size, progress_callback=progress_callback) + finally: + fp.close() + + # + # Helpers + # + def _init_hash(self): + """Initialize message digest object. + Overwrite this method if you want to use different checksum + algorithm for your storage backend. + """ + return 'md5', hashlib.md5() + + def _compute_checksum(self, stream, size=None, chunk_size=None, + progress_callback=None, **kwargs): + """Get helper method to compute checksum from a stream. + Naive implementation that can be overwritten by subclasses in order to + provide more efficient implementation. + """ + if progress_callback and size: + progress_callback = partial(progress_callback, size) + else: + progress_callback = None + + try: + algo, m = self._init_hash() + return compute_checksum( + stream, algo, m, + chunk_size=chunk_size, + progress_callback=progress_callback, + **kwargs + ) + except Exception as e: + raise StorageError( + 'Could not compute checksum of file: {0}'.format(e)) + + def _write_stream(self, src, dst, size=None, size_limit=None, + chunk_size=None, progress_callback=None): + """Get helper to save stream from src to dest + compute checksum. + :param src: Source stream. + :param dst: Destination stream. + :param size: If provided, this exact amount of bytes will be + written to the destination file. + :param size_limit: ``FileSizeLimit`` instance to limit number of bytes + to write. 
+ """ + chunk_size = chunk_size_or_default(chunk_size) + + algo, m = self._init_hash() + bytes_written = 0 + + while 1: + # Check that size limits aren't bypassed + check_sizelimit(size_limit, bytes_written, size) + + chunk = src.read(chunk_size) + + if not chunk: + if progress_callback: + progress_callback(bytes_written, bytes_written) + break + + dst.write(chunk) + + bytes_written += len(chunk) + + if m: + m.update(chunk) + + if progress_callback: + progress_callback(None, bytes_written) + + check_size(bytes_written, size) + + return bytes_written, '{0}:{1}'.format( + algo, m.hexdigest()) if m else None + + +class PyFSFileStorage(FileStorage): + """File system storage using PyFilesystem for access the file. + This storage class will store files according to the following pattern: + ``//data``. + .. warning:: + File operations are not atomic. E.g. if errors happens during e.g. + updating part of a file it will leave the file in an inconsistent + state. The storage class tries as best as possible to handle errors + and leave the system in a consistent state. + """ + + def __init__(self, fileurl, size=None, modified=None, clean_dir=True): + """Storage initialization.""" + self.fileurl = fileurl + self.clean_dir = clean_dir + super(PyFSFileStorage, self).__init__(size=size, modified=modified) + + def _get_fs(self, create_dir=True): + """Return tuple with filesystem and filename.""" + filedir = dirname(self.fileurl) + filename = basename(self.fileurl) + + return ( + opener.opendir(filedir, writeable=True, create_dir=create_dir), + filename + ) + + def open(self, mode='rb'): + """Open file. + The caller is responsible for closing the file. + """ + fs, path = self._get_fs() + return fs.open(path, mode=mode) + + def delete(self): + """Delete a file. + The base directory is also removed, as it is assumed that only one file + exists in the directory. 
+ """ + fs, path = self._get_fs(create_dir=False) + if fs.exists(path): + fs.remove(path) + if self.clean_dir and fs.exists('.'): + fs.removedir('.') + return True + + def initialize(self, size=0): + """Initialize file on storage and truncate to given size.""" + fs, path = self._get_fs() + + # Required for reliably opening the file on certain file systems: + if fs.exists(path): + fp = fs.open(path, mode='r+b') + else: + fp = fs.open(path, mode='wb') + + try: + fp.truncate(size) + except Exception: + fp.close() + self.delete() + raise + finally: + fp.close() + + self._size = size + + return self.fileurl, size, None + + def save(self, incoming_stream, size_limit=None, size=None, + chunk_size=None, progress_callback=None): + """Save file in the file system.""" + fp = self.open(mode='wb') + try: + bytes_written, checksum = self._write_stream( + incoming_stream, fp, chunk_size=chunk_size, + progress_callback=progress_callback, + size_limit=size_limit, size=size) + except Exception: + fp.close() + self.delete() + raise + finally: + fp.close() + + self._size = bytes_written + + return self.fileurl, bytes_written, checksum + + def update(self, incoming_stream, seek=0, size=None, chunk_size=None, + progress_callback=None): + """Update a file in the file system.""" + fp = self.open(mode='r+b') + try: + fp.seek(seek) + bytes_written, checksum = self._write_stream( + incoming_stream, fp, chunk_size=chunk_size, + size=size, progress_callback=progress_callback) + finally: + fp.close() + + return bytes_written, checksum + + +def pyfs_storage_factory(fileinstance=None, default_location=None, + default_storage_class=None, + filestorage_class=PyFSFileStorage, fileurl=None, + size=None, modified=None, clean_dir=True): + """Get factory function for creating a PyFS file storage instance.""" + # Either the FileInstance needs to be specified or all filestorage + # class parameters need to be specified + assert fileinstance or (fileurl and size) + + if fileinstance: + # FIXME: Code here should be refactored since it assumes a lot on the + # directory structure where the file instances are written + fileurl = None + size = fileinstance.size + modified = fileinstance.updated + + if fileinstance.uri: + # Use already existing URL. + fileurl = fileinstance.uri + else: + assert default_location + # Generate a new URL. + fileurl = make_path( + default_location, + str(fileinstance.id), + 'data', + current_app.config['FILES_REST_STORAGE_PATH_DIMENSIONS'], + current_app.config['FILES_REST_STORAGE_PATH_SPLIT_LENGTH'], + ) + + return filestorage_class( + fileurl, size=size, modified=modified, clean_dir=clean_dir) diff --git a/tests/test_legacy_storage.py b/tests/test_legacy_storage.py new file mode 100644 index 00000000..b439c058 --- /dev/null +++ b/tests/test_legacy_storage.py @@ -0,0 +1,348 @@ + +# -*- coding: utf-8 -*- +# +# This file is part of Invenio. +# Copyright (C) 2016-2019 CERN. +# +# Invenio is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. 
+ +"""Storage module tests.""" + +from __future__ import absolute_import, print_function + +import errno +import os +from os.path import dirname, exists, getsize, join + +import pytest +from fs.errors import DirectoryNotEmptyError, ResourceNotFoundError +from mock import patch +from six import BytesIO + +from invenio_files_rest.errors import FileSizeError, StorageError, \ + UnexpectedFileSizeError +from invenio_files_rest.limiters import FileSizeLimit +from invenio_files_rest.storage import FileStorage, PyFSFileStorage +from tests import legacy_storage_interface + + +def test_storage_interface(): + """Test storage interface.""" + s = FileStorage('some-path') + pytest.raises(NotImplementedError, s.open) + pytest.raises(NotImplementedError, s.initialize, None, 'file:///') + pytest.raises(NotImplementedError, s.delete) + pytest.raises(NotImplementedError, s.save, None) + pytest.raises(NotImplementedError, s.update, None) + pytest.raises(NotImplementedError, s.checksum) + + +def test_pyfs_initialize(legacy_pyfs, pyfs_testpath): + """Test init of files.""" + # Create file object. + assert not exists(pyfs_testpath) + uri, size, checksum = legacy_pyfs.initialize(size=100) + + assert size == 100 + assert checksum is None + assert os.stat(pyfs_testpath).st_size == size + + uri, size, checksum = legacy_pyfs.initialize() + assert size == 0 + assert size == os.stat(pyfs_testpath).st_size + + +def test_pyfs_delete(app, db, dummy_location): + """Test init of files.""" + testurl = join(dummy_location.uri, 'subpath/data') + s = legacy_storage_interface.PyFSFileStorage(testurl) + s.initialize(size=100) + assert exists(testurl) + + s.delete() + assert not exists(testurl) + + s = legacy_storage_interface.PyFSFileStorage(join(dummy_location.uri, 'anotherpath/data')) + pytest.raises(ResourceNotFoundError, s.delete) + + +def test_pyfs_delete_fail(legacy_pyfs, pyfs_testpath): + """Test init of files.""" + legacy_pyfs.save(BytesIO(b'somedata')) + os.rename(pyfs_testpath, join(dirname(pyfs_testpath), 'newname')) + pytest.raises(DirectoryNotEmptyError, legacy_pyfs.delete) + + +def test_pyfs_save(legacy_pyfs, pyfs_testpath, get_md5): + """Test basic save operation.""" + data = b'somedata' + legacy_pyfs.save(BytesIO(data)) + + assert exists(pyfs_testpath) + assert open(pyfs_testpath, 'rb').read() == data + + +def test_pyfs_save_failcleanup(legacy_pyfs, pyfs_testpath, get_md5): + """Test basic save operation.""" + data = b'somedata' + + def fail_callback(total, size): + assert exists(pyfs_testpath) + raise Exception('Something bad happened') + + pytest.raises( + Exception, + legacy_pyfs.save, + BytesIO(data), chunk_size=4, progress_callback=fail_callback + ) + assert not exists(pyfs_testpath) + assert not exists(dirname(pyfs_testpath)) + + +def test_pyfs_save_callback(legacy_pyfs): + """Test progress callback.""" + data = b'somedata' + + counter = dict(size=0) + + def callback(total, size): + counter['size'] = size + + uri, size, checksum = legacy_pyfs.save( + BytesIO(data), progress_callback=callback) + + assert counter['size'] == len(data) + + +def test_pyfs_save_limits(legacy_pyfs): + """Test progress callback.""" + data = b'somedata' + uri, size, checksum = legacy_pyfs.save(BytesIO(data), size=len(data)) + assert size == len(data) + + uri, size, checksum = legacy_pyfs.save(BytesIO(data), size_limit=len(data)) + assert size == len(data) + + # Size doesn't match + pytest.raises( + UnexpectedFileSizeError, legacy_pyfs.save, BytesIO(data), size=len(data) - 1) + pytest.raises( + UnexpectedFileSizeError, 
legacy_pyfs.save, BytesIO(data), size=len(data) + 1) + + # Exceeds size limits + pytest.raises( + FileSizeError, legacy_pyfs.save, BytesIO(data), + size_limit=FileSizeLimit(len(data) - 1, 'bla')) + + +def test_pyfs_update(legacy_pyfs, pyfs_testpath, get_md5): + """Test update of file.""" + legacy_pyfs.initialize(size=100) + legacy_pyfs.update(BytesIO(b'cd'), seek=2, size=2) + legacy_pyfs.update(BytesIO(b'ab'), seek=0, size=2) + + with open(pyfs_testpath) as fp: + content = fp.read() + assert content[0:4] == 'abcd' + assert len(content) == 100 + + # Assert return parameters from update. + size, checksum = legacy_pyfs.update(BytesIO(b'ef'), seek=4, size=2) + assert size == 2 + assert get_md5(b'ef') == checksum + + +def test_pyfs_update_fail(legacy_pyfs, pyfs_testpath, get_md5): + """Test update of file.""" + def fail_callback(total, size): + assert exists(pyfs_testpath) + raise Exception('Something bad happened') + + legacy_pyfs.initialize(size=100) + legacy_pyfs.update(BytesIO(b'ab'), seek=0, size=2) + pytest.raises( + Exception, + legacy_pyfs.update, + BytesIO(b'cdef'), + seek=2, + size=4, + chunk_size=2, + progress_callback=fail_callback, + ) + + # Partial file can be written to disk! + with open(pyfs_testpath) as fp: + content = fp.read() + assert content[0:4] == 'abcd' + assert content[4:6] != 'ef' + + +def test_pyfs_checksum(get_md5): + """Test fixity.""" + # Compute checksum of license file/ + with open('LICENSE', 'rb') as fp: + data = fp.read() + checksum = get_md5(data) + + counter = dict(size=0) + + def callback(total, size): + counter['size'] = size + + # Now do it with storage interface + s = legacy_storage_interface.PyFSFileStorage('LICENSE', size=getsize('LICENSE')) + assert checksum == s.checksum(chunk_size=2, progress_callback=callback) + assert counter['size'] == getsize('LICENSE') + + # No size provided, means progress callback isn't called + counter['size'] = 0 + s = legacy_storage_interface.PyFSFileStorage('LICENSE') + assert checksum == s.checksum(chunk_size=2, progress_callback=callback) + assert counter['size'] == 0 + + +def test_pyfs_checksum_fail(): + """Test fixity problems.""" + # Raise an error during checksum calculation + def callback(total, size): + raise OSError(errno.EPERM, "Permission") + + s = legacy_storage_interface.PyFSFileStorage('LICENSE', size=getsize('LICENSE')) + + pytest.raises(StorageError, s.checksum, progress_callback=callback) + + +def test_pyfs_send_file(app, legacy_pyfs): + """Test send file.""" + data = b'sendthis' + uri, size, checksum = legacy_pyfs.save(BytesIO(data)) + + with app.test_request_context(): + res = legacy_pyfs.send_file( + 'myfilename.txt', mimetype='text/plain', checksum=checksum) + assert res.status_code == 200 + h = res.headers + assert h['Content-Type'] == 'text/plain; charset=utf-8' + assert h['Content-Length'] == str(size) + assert h['Content-MD5'] == checksum[4:] + assert h['ETag'] == '"{0}"'.format(checksum) + + # Content-Type: application/octet-stream + # ETag: "b234ee4d69f5fce4486a80fdaf4a4263" + # Last-Modified: Sat, 23 Jan 2016 06:21:04 GMT + # Cache-Control: max-age=43200, public + # Expires: Sat, 23 Jan 2016 19:21:04 GMT + # Date: Sat, 23 Jan 2016 07:21:04 GMT + + res = legacy_pyfs.send_file( + 'myfilename.txt', mimetype='text/plain', checksum='crc32:test') + assert res.status_code == 200 + assert 'Content-MD5' not in dict(res.headers) + + # Test for absence of Content-Disposition header to make sure that + # it's not present when as_attachment=False + res = legacy_pyfs.send_file('myfilename.txt', 
mimetype='text/plain', + checksum=checksum, as_attachment=False) + assert res.status_code == 200 + assert 'attachment' not in res.headers['Content-Disposition'] + + +def test_pyfs_send_file_for_download(app, legacy_pyfs): + """Test send file.""" + data = b'sendthis' + uri, size, checksum = legacy_pyfs.save(BytesIO(data)) + + with app.test_request_context(): + # Test for presence of Content-Disposition header to make sure that + # it's present when as_attachment=True + res = legacy_pyfs.send_file('myfilename.txt', mimetype='text/plain', + checksum=checksum, as_attachment=True) + assert res.status_code == 200 + assert (res.headers['Content-Disposition'] == + 'attachment; filename=myfilename.txt') + + +def test_pyfs_send_file_xss_prevention(app, legacy_pyfs): + """Test send file.""" + data = b'' + uri, size, checksum = legacy_pyfs.save(BytesIO(data)) + + with app.test_request_context(): + res = legacy_pyfs.send_file( + 'myfilename.html', mimetype='text/html', checksum=checksum) + assert res.status_code == 200 + h = res.headers + assert h['Content-Type'] == 'text/plain; charset=utf-8' + assert h['Content-Length'] == str(size) + assert h['Content-MD5'] == checksum[4:] + assert h['ETag'] == '"{0}"'.format(checksum) + # XSS prevention + assert h['Content-Security-Policy'] == 'default-src \'none\';' + assert h['X-Content-Type-Options'] == 'nosniff' + assert h['X-Download-Options'] == 'noopen' + assert h['X-Permitted-Cross-Domain-Policies'] == 'none' + assert h['X-Frame-Options'] == 'deny' + assert h['X-XSS-Protection'] == '1; mode=block' + assert h['Content-Disposition'] == 'inline' + + # Image + h = legacy_pyfs.send_file('image.png', mimetype='image/png').headers + assert h['Content-Type'] == 'image/png' + assert h['Content-Disposition'] == 'inline' + + # README text file + h = legacy_pyfs.send_file('README').headers + assert h['Content-Type'] == 'text/plain; charset=utf-8' + assert h['Content-Disposition'] == 'inline' + + # Zip + h = legacy_pyfs.send_file('archive.zip').headers + assert h['Content-Type'] == 'application/octet-stream' + assert h['Content-Disposition'] == 'attachment; filename=archive.zip' + + # PDF + h = legacy_pyfs.send_file('doc.pdf').headers + assert h['Content-Type'] == 'application/octet-stream' + assert h['Content-Disposition'] == 'attachment; filename=doc.pdf' + + +def test_pyfs_send_file_fail(app, legacy_pyfs): + """Test send file.""" + legacy_pyfs.save(BytesIO(b'content')) + + with patch('tests.legacy_storage_interface.send_stream') as send_stream: + send_stream.side_effect = OSError(errno.EPERM, "Permission problem") + with app.test_request_context(): + pytest.raises(StorageError, legacy_pyfs.send_file, 'test.txt') + + +def test_pyfs_copy(legacy_pyfs, dummy_location): + """Test send file.""" + s = PyFSFileStorage(join(dummy_location.uri, 'anotherpath/data')) + s.save(BytesIO(b'otherdata')) + + legacy_pyfs.copy(s) + fp = legacy_pyfs.open() + assert fp.read() == b'otherdata' + + +def test_non_unicode_filename(app, legacy_pyfs): + """Test sending the non-unicode filename in the header.""" + data = b'HelloWorld' + uri, size, checksum = legacy_pyfs.save(BytesIO(data)) + + with app.test_request_context(): + res = legacy_pyfs.send_file( + u'żółć.dat', mimetype='application/octet-stream', + checksum=checksum) + assert res.status_code == 200 + assert set(res.headers['Content-Disposition'].split('; ')) == \ + set(["attachment", "filename=zoc.dat", + "filename*=UTF-8''%C5%BC%C3%B3%C5%82%C4%87.dat"]) + + with app.test_request_context(): + res = legacy_pyfs.send_file( + 'żółć.txt', 
mimetype='text/plain', checksum=checksum) + assert res.status_code == 200 + assert res.headers['Content-Disposition'] == 'inline' diff --git a/tests/test_storage.py b/tests/test_storage.py index b1e4d656..ade90302 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -29,24 +29,24 @@ def test_storage_interface(): """Test storage interface.""" s = FileStorage('some-path') pytest.raises(NotImplementedError, s.open) - pytest.raises(NotImplementedError, s.initialize) + pytest.raises(NotImplementedError, s.initialize, None, 'file:///') pytest.raises(NotImplementedError, s.delete) pytest.raises(NotImplementedError, s.save, None) pytest.raises(NotImplementedError, s.update, None) pytest.raises(NotImplementedError, s.checksum) -def test_pyfs_initialize(pyfs, pyfs_testpath): +def test_pyfs_initialize(pyfs_testpath): """Test init of files.""" # Create file object. assert not exists(pyfs_testpath) - uri, size, checksum = pyfs.initialize(size=100) + uri, size, checksum = PyFSFileStorage.initialize(suggested_uri=pyfs_testpath, size=100) assert size == 100 assert checksum is None assert os.stat(pyfs_testpath).st_size == size - uri, size, checksum = pyfs.initialize() + uri, size, checksum = PyFSFileStorage.initialize(suggested_uri=pyfs_testpath) assert size == 0 assert size == os.stat(pyfs_testpath).st_size From 5899f79da323845e87a8425e570d5f0060082574 Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Tue, 6 Oct 2020 14:02:54 +0100 Subject: [PATCH 13/41] Lots of things. Needs pulling apart before merging. All together because this has been very exploratory. --- invenio_files_rest/models.py | 46 +++--- invenio_files_rest/storage/__init__.py | 6 +- invenio_files_rest/storage/base.py | 154 ++++++++---------- invenio_files_rest/storage/factory.py | 35 ++-- .../storage/legacy.py | 33 +--- invenio_files_rest/storage/pyfs.py | 99 ++++++----- invenio_files_rest/utils.py | 67 +++++++- setup.py | 2 +- tests/conftest.py | 7 +- tests/test_cli.py | 1 + tests/test_legacy_storage.py | 16 +- tests/test_models_multipart.py | 2 +- tests/test_storage.py | 68 ++++---- 13 files changed, 284 insertions(+), 252 deletions(-) rename tests/legacy_storage_interface.py => invenio_files_rest/storage/legacy.py (89%) diff --git a/invenio_files_rest/models.py b/invenio_files_rest/models.py index 79f02e2d..ebcf12c4 100644 --- a/invenio_files_rest/models.py +++ b/invenio_files_rest/models.py @@ -64,10 +64,11 @@ MultipartInvalidChunkSize, MultipartInvalidPartNumber, \ MultipartInvalidSize, MultipartMissingParts, MultipartNotCompleted from .proxies import current_files_rest +from .storage.base import StorageBackend from .utils import ENCODING_MIMETYPES, PassthroughChecksum, guess_mimetype if TYPE_CHECKING: - from .storage import FileStorage + from .storage import StorageBackend slug_pattern = re.compile('^[a-z][a-z0-9-]+$') @@ -754,7 +755,7 @@ def delete(self): self.query.filter_by(id=self.id).delete() return self - def storage(self, **kwargs) -> FileStorage: + def storage(self, **kwargs) -> StorageBackend: """Get storage interface for object. 
Uses the applications storage factory to create a storage interface @@ -814,6 +815,7 @@ def verify_checksum(self, progress_callback=None, chunk_size=None, @ensure_writable() def initialize(self, preferred_location: Location, size=0, **kwargs): """Initialize file.""" + print("WRI1", self.writable) if hasattr(current_files_rest.storage_factory, 'initialize'): # New behaviour, with a new-style storage factory result = current_files_rest.storage_factory.initialize( @@ -821,23 +823,19 @@ def initialize(self, preferred_location: Location, size=0, **kwargs): preferred_location=preferred_location, size=size, ) + print("WRI2", self.writable) else: # Old behaviour, with an old-style storage factory storage = self.storage(default_location=preferred_location.uri, **kwargs) - if isinstance(storage.initialize.__self__, type): - # New storage backend, but old storage factory - # New storage backends have `initialize` as a classmethod. The storage backend will have constructed - # a URI and passed it to the storage's __init__, so we can grab it from there. - result = storage.initialize(suggested_uri=storage.uri, size=size) - else: - result = storage.initialize(size=size) + result = storage.initialize(size=size) + print("WRI3", self.writable, storage) self.update_file_metadata( result, readable=False, writable=True, - storage_backend=type(storage).backend_name, + storage_backend=type(storage).backend_name if isinstance(storage, StorageBackend) else None, ) - + print("WRI4", self.writable, result) @ensure_writable() def init_contents(self, size=0, default_location: str=None, **kwargs): @@ -880,18 +878,11 @@ def set_contents(self, stream, chunk_size=None, size=None, size_limit=None, progress_callback=progress_callback) storage = self.storage(**kwargs) - storage.save(wrapped_stream, chunk_size=chunk_size, size=size, - size_limit=size_limit, progress_callback=progress_callback) + self.update_file_metadata(storage.save(wrapped_stream, chunk_size=chunk_size, size=size, + size_limit=size_limit, progress_callback=progress_callback)) - self.storage_backend = type(storage).backend_name - self.size = wrapped_stream.bytes_read - self.checksum = wrapped_stream.checksum - self.uri = storage.filepath + self.storage_backend = type(storage).backend_name if isinstance(storage, StorageBackend) else None - # TODO: Should these be set here, or sent back from the storage backend? 
- self.readable = True - self.writable = False - self.storage_class = 'S' @ensure_writable() def copy_contents(self, fileinstance, progress_callback=None, @@ -941,6 +932,15 @@ def update_file_metadata(self, file_metadata: Union[Tuple,Dict] = None, **kwargs file_metadata = {} if isinstance(file_metadata, tuple): + assert len(file_metadata) >= 3 + # Carry across defaults from **kwargs + if len(file_metadata) < 4: + file_metadata += (kwargs.get('readable', True),) + if len(file_metadata) < 5: + file_metadata += (kwargs.get('writable', False),) + if len(file_metadata) < 6: + file_metadata += (kwargs.get('storage_class', None),) + print(file_metadata) self.set_uri(*file_metadata) elif isinstance(file_metadata, dict): file_metadata.update(kwargs) @@ -1626,6 +1626,7 @@ def create(cls, bucket, key, size, chunk_size): with db.session.begin_nested(): file_ = FileInstance.create() + print("WR2", file_.writable) file_.size = size obj = cls( upload_id=uuid.uuid4(), @@ -1638,11 +1639,13 @@ def create(cls, bucket, key, size, chunk_size): ) bucket.size += size db.session.add(obj) + print("WR3", file_.writable) file_.init_contents( size=size, default_location=bucket.location.uri, default_storage_class=bucket.default_storage_class, ) + print("WR4", file_.writable) return obj @classmethod @@ -1783,6 +1786,7 @@ def set_contents(self, stream, progress_callback=None): :param chunk_size: Desired chunk size to read stream in. It is up to the storage interface if it respects this value. """ + print("writable", self.multipart.file.writable) size, checksum = self.multipart.file.update_contents( stream, seek=self.start_byte, size=self.part_size, progress_callback=progress_callback, diff --git a/invenio_files_rest/storage/__init__.py b/invenio_files_rest/storage/__init__.py index 53e9f2a2..6bc498b8 100644 --- a/invenio_files_rest/storage/__init__.py +++ b/invenio_files_rest/storage/__init__.py @@ -10,13 +10,15 @@ from __future__ import absolute_import, print_function -from .base import FileStorage -from .pyfs import PyFSFileStorage, pyfs_storage_factory +from .base import FileStorage, StorageBackend +from .pyfs import PyFSFileStorage, pyfs_storage_factory, PyFSStorageBackend from .factory import StorageFactory __all__ = ( 'FileStorage', + 'StorageBackend', 'pyfs_storage_factory', 'PyFSFileStorage', + 'PyFSStorageBackend', 'StorageFactory', ) diff --git a/invenio_files_rest/storage/base.py b/invenio_files_rest/storage/base.py index 18c170a7..d7404c2b 100644 --- a/invenio_files_rest/storage/base.py +++ b/invenio_files_rest/storage/base.py @@ -8,7 +8,7 @@ """File storage base module.""" -from __future__ import absolute_import, print_function +from __future__ import absolute_import, annotations, print_function import hashlib import urllib.parse @@ -16,50 +16,24 @@ from calendar import timegm from datetime import datetime from functools import partial +from typing import Any, Callable, Dict, TYPE_CHECKING, Tuple from flask import current_app from ..errors import FileSizeError, StorageError, UnexpectedFileSizeError from ..helpers import chunk_size_or_default, compute_checksum, make_path, send_stream -from ..models import FileInstance +from ..utils import check_size, check_sizelimit, PassthroughChecksum +from .legacy import FileStorage -def check_sizelimit(size_limit, bytes_written, total_size): - """Check if size limit was exceeded. +if TYPE_CHECKING: + from ..models import FileInstance - :param size_limit: The size limit. - :param bytes_written: The total number of bytes written. - :param total_size: The total file size. 
- :raises invenio_files_rest.errors.UnexpectedFileSizeError: If the bytes - written exceed the total size. - :raises invenio_files_rest.errors.FileSizeError: If the bytes - written are major than the limit size. - """ - if size_limit is not None and bytes_written > size_limit: - desc = 'File size limit exceeded.' \ - if isinstance(size_limit, int) else size_limit.reason - raise FileSizeError(description=desc) - # Never write more than advertised - if total_size is not None and bytes_written > total_size: - raise UnexpectedFileSizeError( - description='File is bigger than expected.') +__all__ = ['FileStorage', 'StorageBackend'] -def check_size(bytes_written, total_size): - """Check if expected amounts of bytes have been written. - - :param bytes_written: The total number of bytes written. - :param total_size: The total file size. - :raises invenio_files_rest.errors.UnexpectedFileSizeError: If the bytes - written exceed the total size. - """ - if total_size and bytes_written < total_size: - raise UnexpectedFileSizeError( - description='File is smaller than expected.') - - -class FileStorageMeta(type): +class StorageBackendMeta(type): @property def backend_name(cls): try: @@ -74,9 +48,11 @@ def backend_name(cls): return cls._backend_name -class FileStorage(metaclass=FileStorageMeta): +class StorageBackend(metaclass=StorageBackendMeta): """Base class for storage interface to a single file.""" + checksum_hash_name = 'md5' + def __init__(self, uri: str=None, size: int=None, modified: datetime=None, *, filepath=None): """Initialize storage object.""" self.uri = uri or filepath @@ -94,20 +70,76 @@ def delete(self): """Delete the file.""" raise NotImplementedError - @classmethod - def initialize(cls, suggested_uri, size=0): + def initialize(self, size=0): """Initialize the file on the storage + truncate to the given size.""" + return { + 'readable': False, + 'writable': True, + 'uri': self.uri, + 'size': size, + **self._initialize(size=size), + } + + def _initialize(self, size=0): raise NotImplementedError - def save(self, incoming_stream, size_limit=None, size=None, - chunk_size=None, progress_callback=None): + chunk_size=None, progress_callback: Callable[[int, int], None] = None + ): + """Save incoming stream to file storage.""" + + incoming_stream = PassthroughChecksum( + incoming_stream, + hash_name=self.checksum_hash_name, + progress_callback=progress_callback, + size_limit=size_limit, + size=size, + ) + + result = self._save( + incoming_stream, + size=None, + chunk_size=None + ) + + check_size(incoming_stream.bytes_read, size) + self._size = incoming_stream.total_size + + return { + 'checksum': incoming_stream.checksum, + 'size': incoming_stream.total_size, + 'uri': self.uri, + 'readable': True, + 'writable': False, + 'storage_class': 'S', + **result, + } + + def _save(self, incoming_stream, size_limit=None, size=None, + chunk_size=None) -> Dict[str, Any]: """Save incoming stream to file storage.""" raise NotImplementedError def update(self, incoming_stream, seek=0, size=None, chunk_size=None, - progress_callback=None): + progress_callback=None) -> Tuple[int, str]: """Update part of file with incoming stream.""" + incoming_stream = PassthroughChecksum( + incoming_stream, + hash_name=self.checksum_hash_name, + progress_callback=progress_callback, + size=size, + ) + + self._update( + incoming_stream, + seek=seek, + size=None, + chunk_size=chunk_size, + ) + + return incoming_stream.bytes_read, incoming_stream.checksum + + def _update(self, incoming_stream, seek=0, size=None, chunk_size=None): 
raise NotImplementedError # @@ -218,45 +250,3 @@ def _compute_checksum(self, stream, size=None, chunk_size=None, except Exception as e: raise StorageError( 'Could not compute checksum of file: {0}'.format(e)) - - def _write_stream(self, src, dst, size=None, size_limit=None, - chunk_size=None, progress_callback=None): - """Get helper to save stream from src to dest + compute checksum. - - :param src: Source stream. - :param dst: Destination stream. - :param size: If provided, this exact amount of bytes will be - written to the destination file. - :param size_limit: ``FileSizeLimit`` instance to limit number of bytes - to write. - """ - chunk_size = chunk_size_or_default(chunk_size) - - algo, m = self._init_hash() - bytes_written = 0 - - while 1: - # Check that size limits aren't bypassed - check_sizelimit(size_limit, bytes_written, size) - - chunk = src.read(chunk_size) - - if not chunk: - if progress_callback: - progress_callback(bytes_written, bytes_written) - break - - dst.write(chunk) - - bytes_written += len(chunk) - - if m: - m.update(chunk) - - if progress_callback: - progress_callback(None, bytes_written) - - check_size(bytes_written, size) - - return bytes_written, '{0}:{1}'.format( - algo, m.hexdigest()) if m else None diff --git a/invenio_files_rest/storage/factory.py b/invenio_files_rest/storage/factory.py index b98e583e..d58068ee 100644 --- a/invenio_files_rest/storage/factory.py +++ b/invenio_files_rest/storage/factory.py @@ -1,6 +1,6 @@ import os import urllib.parse -from typing import Any, Dict, Type +from typing import Any, Dict, Optional, Type from flask import current_app @@ -18,7 +18,7 @@ def __init__(self, app): def __call__( self, fileinstance: FileInstance, - ) -> FileStorage: + ) -> Optional[FileStorage]: """Returns a FileStorage instance for a file, for manipulating file contents. 
""" if not fileinstance.storage_backend: @@ -29,6 +29,7 @@ def __call__( return storage_backend_cls( uri=fileinstance.uri, + size=fileinstance.size, **storage_backend_kwargs ) @@ -48,22 +49,18 @@ def initialize( storage_backend_cls = self.resolve_storage_backend(fileinstance.storage_backend) storage_backend_kwargs = self.get_storage_backend_kwargs(fileinstance, storage_backend_cls) - if storage_backend_cls.initialize.__self__ == storage_backend_cls: - # New behaviour, as initialize should now be a classmethod - return storage_backend_cls.initialize( - suggested_uri=self.get_suggested_uri( - fileinstance=fileinstance, - location=location, - storage_backend_cls=storage_backend_cls, - ), - size=size, - ) - else: - # Old behaviour - return self( - fileinstance, - default_location=location.uri, - ).initialize(size=size) + uri = self.get_suggested_uri( + fileinstance=fileinstance, + location=location, + storage_backend_cls=storage_backend_cls, + ) + + return storage_backend_cls( + uri=uri, + **storage_backend_kwargs, + ).initialize( + size=size, + ) def get_location(self, fileinstance: FileInstance, preferred_location: Location = None) -> Location: return preferred_location or Location.get_default() @@ -90,7 +87,7 @@ def get_suggested_uri( location: Location, storage_backend_cls: Type[FileStorage], ): - make_path( + return make_path( location, str(fileinstance.id), 'data', diff --git a/tests/legacy_storage_interface.py b/invenio_files_rest/storage/legacy.py similarity index 89% rename from tests/legacy_storage_interface.py rename to invenio_files_rest/storage/legacy.py index 294186bf..f387152a 100644 --- a/tests/legacy_storage_interface.py +++ b/invenio_files_rest/storage/legacy.py @@ -22,42 +22,11 @@ from invenio_files_rest.helpers import make_path from invenio_files_rest.errors import FileSizeError, StorageError, UnexpectedFileSizeError from invenio_files_rest.helpers import chunk_size_or_default, compute_checksum, send_stream +from invenio_files_rest.utils import check_size, check_sizelimit from fs.opener import opener from fs.path import basename, dirname -def check_sizelimit(size_limit, bytes_written, total_size): - """Check if size limit was exceeded. - :param size_limit: The size limit. - :param bytes_written: The total number of bytes written. - :param total_size: The total file size. - :raises invenio_files_rest.errors.UnexpectedFileSizeError: If the bytes - written exceed the total size. - :raises invenio_files_rest.errors.FileSizeError: If the bytes - written are major than the limit size. - """ - if size_limit is not None and bytes_written > size_limit: - desc = 'File size limit exceeded.' \ - if isinstance(size_limit, int) else size_limit.reason - raise FileSizeError(description=desc) - - # Never write more than advertised - if total_size is not None and bytes_written > total_size: - raise UnexpectedFileSizeError( - description='File is bigger than expected.') - - -def check_size(bytes_written, total_size): - """Check if expected amounts of bytes have been written. - :param bytes_written: The total number of bytes written. - :param total_size: The total file size. - :raises invenio_files_rest.errors.UnexpectedFileSizeError: If the bytes - written exceed the total size. 
- """ - if total_size and bytes_written < total_size: - raise UnexpectedFileSizeError( - description='File is smaller than expected.') - class FileStorage(object): """Base class for storage interface to a single file.""" diff --git a/invenio_files_rest/storage/pyfs.py b/invenio_files_rest/storage/pyfs.py index 6a9921e9..ebbe9fdb 100644 --- a/invenio_files_rest/storage/pyfs.py +++ b/invenio_files_rest/storage/pyfs.py @@ -16,11 +16,16 @@ from fs.opener import opener from fs.path import basename, dirname -from ..helpers import make_path -from .base import FileStorage +from ..helpers import chunk_size_or_default, make_path +from .base import StorageBackend +from .legacy import PyFSFileStorage, pyfs_storage_factory +__all__ = ['PyFSFileStorage', 'pyfs_storage_factory', 'PyFSStorageBackend'] -class PyFSFileStorage(FileStorage): +from ..utils import check_size, check_sizelimit + + +class PyFSStorageBackend(StorageBackend): """File system storage using PyFilesystem for access the file. This storage class will store files according to the following pattern: @@ -40,7 +45,7 @@ def __init__(self, *args, clean_dir=True, **kwargs): # if isinstance(args[0], str): # raise NotImplementedError self.clean_dir = clean_dir - super(PyFSFileStorage, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) @property def filepath(self): @@ -77,10 +82,8 @@ def delete(self): fs.removedir('.') return True - @classmethod - def initialize(cls, suggested_uri, size=0): + def _initialize(self, size=0): """Initialize file on storage and truncate to given size.""" - self = cls(uri=suggested_uri) fs, path = self._get_fs() # Required for reliably opening the file on certain file systems: @@ -100,10 +103,10 @@ def initialize(cls, suggested_uri, size=0): self._size = size - return {'uri': self.uri, 'size': size} + return {} - def save(self, incoming_stream, size_limit=None, size=None, - chunk_size=None, progress_callback=None): + def _save(self, incoming_stream, size_limit=None, size=None, + chunk_size=None): """Save file in the file system.""" fp = self.open(mode='wb') try: @@ -115,50 +118,56 @@ def save(self, incoming_stream, size_limit=None, size=None, finally: fp.close() - def update(self, incoming_stream, seek=0, size=None, chunk_size=None, + return {} + + def _update(self, incoming_stream, seek=0, size=None, chunk_size=None, progress_callback=None): """Update a file in the file system.""" fp = self.open(mode='r+b') try: fp.seek(seek) - bytes_written, checksum = self._write_stream( - incoming_stream, fp, chunk_size=chunk_size, - size=size, progress_callback=progress_callback) + shutil.copyfileobj(incoming_stream, fp, length=chunk_size) finally: fp.close() - return bytes_written, checksum + def _write_stream(self, src, dst, size=None, size_limit=None, + chunk_size=None, progress_callback=None): + """Get helper to save stream from src to dest + compute checksum. + :param src: Source stream. + :param dst: Destination stream. + :param size: If provided, this exact amount of bytes will be + written to the destination file. + :param size_limit: ``FileSizeLimit`` instance to limit number of bytes + to write. 
+ """ + chunk_size = chunk_size_or_default(chunk_size) -def pyfs_storage_factory(fileinstance=None, default_location=None, - default_storage_class=None, - filestorage_class=PyFSFileStorage, fileurl=None, - size=None, modified=None, clean_dir=True): - """Get factory function for creating a PyFS file storage instance.""" - # Either the FileInstance needs to be specified or all filestorage - # class parameters need to be specified - assert fileinstance or (fileurl and size) + algo, m = self._init_hash() + bytes_written = 0 - if fileinstance: - # FIXME: Code here should be refactored since it assumes a lot on the - # directory structure where the file instances are written - fileurl = None - size = fileinstance.size - modified = fileinstance.updated + while 1: + # Check that size limits aren't bypassed + check_sizelimit(size_limit, bytes_written, size) - if fileinstance.uri: - # Use already existing URL. - fileurl = fileinstance.uri - else: - assert default_location - # Generate a new URL. - fileurl = make_path( - default_location, - str(fileinstance.id), - 'data', - current_app.config['FILES_REST_STORAGE_PATH_DIMENSIONS'], - current_app.config['FILES_REST_STORAGE_PATH_SPLIT_LENGTH'], - ) - - return filestorage_class( - fileurl, size=size, modified=modified, clean_dir=clean_dir) + chunk = src.read(chunk_size) + + if not chunk: + if progress_callback: + progress_callback(bytes_written, bytes_written) + break + + dst.write(chunk) + + bytes_written += len(chunk) + + if m: + m.update(chunk) + + if progress_callback: + progress_callback(None, bytes_written) + + check_size(bytes_written, size) + + return bytes_written, '{0}:{1}'.format( + algo, m.hexdigest()) if m else None diff --git a/invenio_files_rest/utils.py b/invenio_files_rest/utils.py index 84374aa1..d667f1dd 100644 --- a/invenio_files_rest/utils.py +++ b/invenio_files_rest/utils.py @@ -15,6 +15,8 @@ from flask import current_app from werkzeug.utils import import_string +from invenio_files_rest.errors import FileSizeError, UnexpectedFileSizeError + ENCODING_MIMETYPES = { 'gzip': 'application/gzip', 'compress': 'application/gzip', @@ -59,8 +61,53 @@ def guess_mimetype(filename): return m or 'application/octet-stream' + +def check_sizelimit(size_limit, bytes_written, total_size): + """Check if size limit was exceeded. + + :param size_limit: The size limit. + :param bytes_written: The total number of bytes written. + :param total_size: The total file size. + :raises invenio_files_rest.errors.UnexpectedFileSizeError: If the bytes + written exceed the total size. + :raises invenio_files_rest.errors.FileSizeError: If the bytes + written are major than the limit size. + """ + if size_limit is not None and bytes_written > size_limit: + desc = 'File size limit exceeded.' \ + if isinstance(size_limit, int) else size_limit.reason + raise FileSizeError(description=desc) + + # Never write more than advertised + if total_size is not None and bytes_written > total_size: + raise UnexpectedFileSizeError( + description='File is bigger than expected.') + + +def check_size(bytes_written, total_size): + """Check if expected amounts of bytes have been written. + + :param bytes_written: The total number of bytes written. + :param total_size: The total file size. + :raises invenio_files_rest.errors.UnexpectedFileSizeError: If the bytes + written exceed the total size. 
+ """ + if total_size and bytes_written < total_size: + raise UnexpectedFileSizeError( + description='File is smaller than expected.') + + + class PassthroughChecksum: - def __init__(self, fp, hash_name, progress_callback: Callable[[int, int], None] = None, offset: int = 0): + def __init__( + self, + fp, + hash_name, + progress_callback: Callable[[int, int], None] = None, + offset: int = 0, + size_limit: int = None, + size: int=None + ): """ :param fp: A file-like option open for reading :param hash_name: A hashlib hash algorithm name @@ -68,16 +115,21 @@ def __init__(self, fp, hash_name, progress_callback: Callable[[int, int], None] size """ self.hash_name = hash_name - self._sum = hashlib.new(hash_name) + self._sum = hashlib.new(hash_name) if hash_name else None + self._update_sum = self._sum.update if self._sum else lambda chunk: None self._fp = fp self._bytes_read = 0 self._progress_callback = progress_callback self._offset = offset + self._size_limit = size_limit + self._size = size def read(self, size=-1): chunk = self._fp.read(size) self._bytes_read += len(chunk) - self._sum.update(chunk) + print("CSL", self._size_limit, self.bytes_read, self._size) + check_sizelimit(self._size_limit, self.bytes_read, self._size) + self._update_sum(chunk) if self._progress_callback: self._progress_callback(self._bytes_read, self._bytes_read + self._offset) return chunk @@ -88,9 +140,14 @@ def __getattr__(self, item): @property def checksum(self): """The {hash_name}:{hash} string for the bytes read so far.""" - return '{0}:{1}'.format( - self.hash_name, self._sum.hexdigest()) + if self._sum: + return '{0}:{1}'.format( + self.hash_name, self._sum.hexdigest()) @property def bytes_read(self): return self._bytes_read + + @property + def total_size(self): + return self._bytes_read + self._offset diff --git a/setup.py b/setup.py index 0e19d113..42a9e516 100644 --- a/setup.py +++ b/setup.py @@ -154,7 +154,7 @@ 'invenio_files_rest = invenio_files_rest.models', ], 'invenio_files_rest.storage': [ - 'pyfs = invenio_files_rest.storage:PyFSFileStorage', + 'pyfs = invenio_files_rest.storage:PyFSStorageBackend', ] }, extras_require=extras_require, diff --git a/tests/conftest.py b/tests/conftest.py index a0cf8b51..4ee46344 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -41,9 +41,8 @@ location_update_all, multipart_delete_all, multipart_read_all, \ object_delete_all, object_delete_version_all, object_read_all, \ object_read_version_all -from invenio_files_rest.storage import PyFSFileStorage +from invenio_files_rest.storage import PyFSFileStorage, PyFSStorageBackend from invenio_files_rest.views import blueprint -from tests import legacy_storage_interface @compiles(DropTable, 'postgresql') @@ -156,13 +155,13 @@ def pyfs_testpath(dummy_location): @pytest.fixture() def pyfs(dummy_location, pyfs_testpath): """Instance of PyFSFileStorage.""" - return PyFSFileStorage(pyfs_testpath) + return PyFSStorageBackend(pyfs_testpath) @pytest.fixture() def legacy_pyfs(dummy_location, pyfs_testpath): """Instance of PyFSFileStorage.""" - return legacy_storage_interface.PyFSFileStorage(pyfs_testpath) + return PyFSFileStorage(pyfs_testpath) @pytest.yield_fixture() diff --git a/tests/test_cli.py b/tests/test_cli.py index 5270e2fc..8ff2e2bd 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -103,6 +103,7 @@ def test_simple_workflow(app, db, tmpdir): ], obj=script_info) assert 0 == result.exit_code + print(tmpdir.listdir()) assert len(tmpdir.listdir()) == 2 # Specify a file. 
diff --git a/tests/test_legacy_storage.py b/tests/test_legacy_storage.py index b439c058..a13b4056 100644 --- a/tests/test_legacy_storage.py +++ b/tests/test_legacy_storage.py @@ -24,14 +24,12 @@ UnexpectedFileSizeError from invenio_files_rest.limiters import FileSizeLimit from invenio_files_rest.storage import FileStorage, PyFSFileStorage -from tests import legacy_storage_interface - def test_storage_interface(): """Test storage interface.""" s = FileStorage('some-path') pytest.raises(NotImplementedError, s.open) - pytest.raises(NotImplementedError, s.initialize, None, 'file:///') + pytest.raises(NotImplementedError, s.initialize, 'file:///some/path') pytest.raises(NotImplementedError, s.delete) pytest.raises(NotImplementedError, s.save, None) pytest.raises(NotImplementedError, s.update, None) @@ -56,14 +54,14 @@ def test_pyfs_initialize(legacy_pyfs, pyfs_testpath): def test_pyfs_delete(app, db, dummy_location): """Test init of files.""" testurl = join(dummy_location.uri, 'subpath/data') - s = legacy_storage_interface.PyFSFileStorage(testurl) + s = PyFSFileStorage(testurl) s.initialize(size=100) assert exists(testurl) s.delete() assert not exists(testurl) - s = legacy_storage_interface.PyFSFileStorage(join(dummy_location.uri, 'anotherpath/data')) + s = PyFSFileStorage(join(dummy_location.uri, 'anotherpath/data')) pytest.raises(ResourceNotFoundError, s.delete) @@ -191,13 +189,13 @@ def callback(total, size): counter['size'] = size # Now do it with storage interface - s = legacy_storage_interface.PyFSFileStorage('LICENSE', size=getsize('LICENSE')) + s = PyFSFileStorage('LICENSE', size=getsize('LICENSE')) assert checksum == s.checksum(chunk_size=2, progress_callback=callback) assert counter['size'] == getsize('LICENSE') # No size provided, means progress callback isn't called counter['size'] = 0 - s = legacy_storage_interface.PyFSFileStorage('LICENSE') + s = PyFSFileStorage('LICENSE') assert checksum == s.checksum(chunk_size=2, progress_callback=callback) assert counter['size'] == 0 @@ -208,7 +206,7 @@ def test_pyfs_checksum_fail(): def callback(total, size): raise OSError(errno.EPERM, "Permission") - s = legacy_storage_interface.PyFSFileStorage('LICENSE', size=getsize('LICENSE')) + s = PyFSFileStorage('LICENSE', size=getsize('LICENSE')) pytest.raises(StorageError, s.checksum, progress_callback=callback) @@ -311,7 +309,7 @@ def test_pyfs_send_file_fail(app, legacy_pyfs): """Test send file.""" legacy_pyfs.save(BytesIO(b'content')) - with patch('tests.legacy_storage_interface.send_stream') as send_stream: + with patch('invenio_files_rest.storage.legacy.send_stream') as send_stream: send_stream.side_effect = OSError(errno.EPERM, "Permission problem") with app.test_request_context(): pytest.raises(StorageError, legacy_pyfs.send_file, 'test.txt') diff --git a/tests/test_models_multipart.py b/tests/test_models_multipart.py index ba4fedb7..a30e2c9b 100644 --- a/tests/test_models_multipart.py +++ b/tests/test_models_multipart.py @@ -55,7 +55,7 @@ def test_part_creation(app, db, bucket, get_md5): mp = MultipartObject.create(bucket, 'test.txt', 5, 2) db.session.commit() assert bucket.size == 5 - + print("WR", mp.file.writable) Part.create(mp, 2, stream=BytesIO(b'p')) Part.create(mp, 0, stream=BytesIO(b'p1')) Part.create(mp, 1, stream=BytesIO(b'p2')) diff --git a/tests/test_storage.py b/tests/test_storage.py index ade90302..b1823815 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -29,26 +29,26 @@ def test_storage_interface(): """Test storage interface.""" s = FileStorage('some-path') 
pytest.raises(NotImplementedError, s.open) - pytest.raises(NotImplementedError, s.initialize, None, 'file:///') + pytest.raises(NotImplementedError, s.initialize, 'file:///some/path') pytest.raises(NotImplementedError, s.delete) pytest.raises(NotImplementedError, s.save, None) pytest.raises(NotImplementedError, s.update, None) pytest.raises(NotImplementedError, s.checksum) -def test_pyfs_initialize(pyfs_testpath): +def test_pyfs_initialize(pyfs, pyfs_testpath): """Test init of files.""" # Create file object. assert not exists(pyfs_testpath) - uri, size, checksum = PyFSFileStorage.initialize(suggested_uri=pyfs_testpath, size=100) + result = pyfs.initialize(size=100) - assert size == 100 - assert checksum is None - assert os.stat(pyfs_testpath).st_size == size + assert result.get('size') == 100 + assert 'checksum' not in result + assert os.stat(pyfs_testpath).st_size == result['size'] - uri, size, checksum = PyFSFileStorage.initialize(suggested_uri=pyfs_testpath) - assert size == 0 - assert size == os.stat(pyfs_testpath).st_size + result = pyfs.initialize() + assert result.get('size') == 0 + assert result['size'] == os.stat(pyfs_testpath).st_size def test_pyfs_delete(app, db, dummy_location): @@ -107,7 +107,7 @@ def test_pyfs_save_callback(pyfs): def callback(total, size): counter['size'] = size - uri, size, checksum = pyfs.save( + result = pyfs.save( BytesIO(data), progress_callback=callback) assert counter['size'] == len(data) @@ -116,12 +116,13 @@ def callback(total, size): def test_pyfs_save_limits(pyfs): """Test progress callback.""" data = b'somedata' - uri, size, checksum = pyfs.save(BytesIO(data), size=len(data)) - assert size == len(data) + result = pyfs.save(BytesIO(data), size=len(data)) + assert result['size'] == len(data) - uri, size, checksum = pyfs.save(BytesIO(data), size_limit=len(data)) - assert size == len(data) + result = pyfs.save(BytesIO(data), size_limit=len(data)) + assert result['size'] == len(data) + print("-----") # Size doesn't match pytest.raises( UnexpectedFileSizeError, pyfs.save, BytesIO(data), size=len(data) - 1) @@ -137,8 +138,11 @@ def test_pyfs_save_limits(pyfs): def test_pyfs_update(pyfs, pyfs_testpath, get_md5): """Test update of file.""" pyfs.initialize(size=100) + print("E1", pyfs.open().read()[:6]) pyfs.update(BytesIO(b'cd'), seek=2, size=2) + print("E2", pyfs.open().read()[:6]) pyfs.update(BytesIO(b'ab'), seek=0, size=2) + print("E3", pyfs.open().read()[:6]) with open(pyfs_testpath) as fp: content = fp.read() @@ -155,7 +159,9 @@ def test_pyfs_update_fail(pyfs, pyfs_testpath, get_md5): """Test update of file.""" def fail_callback(total, size): assert exists(pyfs_testpath) - raise Exception('Something bad happened') + # + if total > 2: + raise Exception('Something bad happened') pyfs.initialize(size=100) pyfs.update(BytesIO(b'ab'), seek=0, size=2) @@ -214,17 +220,17 @@ def callback(total, size): def test_pyfs_send_file(app, pyfs): """Test send file.""" data = b'sendthis' - uri, size, checksum = pyfs.save(BytesIO(data)) + result = pyfs.save(BytesIO(data)) with app.test_request_context(): res = pyfs.send_file( - 'myfilename.txt', mimetype='text/plain', checksum=checksum) + 'myfilename.txt', mimetype='text/plain', checksum=result['checksum']) assert res.status_code == 200 h = res.headers assert h['Content-Type'] == 'text/plain; charset=utf-8' - assert h['Content-Length'] == str(size) - assert h['Content-MD5'] == checksum[4:] - assert h['ETag'] == '"{0}"'.format(checksum) + assert h['Content-Length'] == str(result['size']) + assert h['Content-MD5'] == 
result['checksum'][4:] + assert h['ETag'] == '"{0}"'.format(result['checksum']) # Content-Type: application/octet-stream # ETag: "b234ee4d69f5fce4486a80fdaf4a4263" @@ -241,7 +247,7 @@ def test_pyfs_send_file(app, pyfs): # Test for absence of Content-Disposition header to make sure that # it's not present when as_attachment=False res = pyfs.send_file('myfilename.txt', mimetype='text/plain', - checksum=checksum, as_attachment=False) + checksum=result['checksum'], as_attachment=False) assert res.status_code == 200 assert 'attachment' not in res.headers['Content-Disposition'] @@ -249,13 +255,13 @@ def test_pyfs_send_file(app, pyfs): def test_pyfs_send_file_for_download(app, pyfs): """Test send file.""" data = b'sendthis' - uri, size, checksum = pyfs.save(BytesIO(data)) + result = pyfs.save(BytesIO(data)) with app.test_request_context(): # Test for presence of Content-Disposition header to make sure that # it's present when as_attachment=True res = pyfs.send_file('myfilename.txt', mimetype='text/plain', - checksum=checksum, as_attachment=True) + checksum=result['checksum'], as_attachment=True) assert res.status_code == 200 assert (res.headers['Content-Disposition'] == 'attachment; filename=myfilename.txt') @@ -264,17 +270,17 @@ def test_pyfs_send_file_for_download(app, pyfs): def test_pyfs_send_file_xss_prevention(app, pyfs): """Test send file.""" data = b'' - uri, size, checksum = pyfs.save(BytesIO(data)) + result = pyfs.save(BytesIO(data)) with app.test_request_context(): res = pyfs.send_file( - 'myfilename.html', mimetype='text/html', checksum=checksum) + 'myfilename.html', mimetype='text/html', checksum=result['checksum']) assert res.status_code == 200 h = res.headers assert h['Content-Type'] == 'text/plain; charset=utf-8' - assert h['Content-Length'] == str(size) - assert h['Content-MD5'] == checksum[4:] - assert h['ETag'] == '"{0}"'.format(checksum) + assert h['Content-Length'] == str(result['size']) + assert h['Content-MD5'] == result['checksum'][4:] + assert h['ETag'] == '"{0}"'.format(result['checksum']) # XSS prevention assert h['Content-Security-Policy'] == 'default-src \'none\';' assert h['X-Content-Type-Options'] == 'nosniff' @@ -328,12 +334,12 @@ def test_pyfs_copy(pyfs, dummy_location): def test_non_unicode_filename(app, pyfs): """Test sending the non-unicode filename in the header.""" data = b'HelloWorld' - uri, size, checksum = pyfs.save(BytesIO(data)) + result = pyfs.save(BytesIO(data)) with app.test_request_context(): res = pyfs.send_file( u'żółć.dat', mimetype='application/octet-stream', - checksum=checksum) + checksum=result['checksum']) assert res.status_code == 200 assert set(res.headers['Content-Disposition'].split('; ')) == \ set(["attachment", "filename=zoc.dat", @@ -341,6 +347,6 @@ def test_non_unicode_filename(app, pyfs): with app.test_request_context(): res = pyfs.send_file( - 'żółć.txt', mimetype='text/plain', checksum=checksum) + 'żółć.txt', mimetype='text/plain', checksum=result['checksum']) assert res.status_code == 200 assert res.headers['Content-Disposition'] == 'inline' From 2e1a82aafb7220c49ab4a1e82b175e9192a3f995 Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Tue, 6 Oct 2020 14:11:06 +0100 Subject: [PATCH 14/41] Remove debugging print statements --- invenio_files_rest/models.py | 9 --------- tests/test_models_multipart.py | 1 - tests/test_storage.py | 4 ---- 3 files changed, 14 deletions(-) diff --git a/invenio_files_rest/models.py b/invenio_files_rest/models.py index ebcf12c4..966f8c6f 100644 --- a/invenio_files_rest/models.py +++ 
b/invenio_files_rest/models.py @@ -815,7 +815,6 @@ def verify_checksum(self, progress_callback=None, chunk_size=None, @ensure_writable() def initialize(self, preferred_location: Location, size=0, **kwargs): """Initialize file.""" - print("WRI1", self.writable) if hasattr(current_files_rest.storage_factory, 'initialize'): # New behaviour, with a new-style storage factory result = current_files_rest.storage_factory.initialize( @@ -823,19 +822,16 @@ def initialize(self, preferred_location: Location, size=0, **kwargs): preferred_location=preferred_location, size=size, ) - print("WRI2", self.writable) else: # Old behaviour, with an old-style storage factory storage = self.storage(default_location=preferred_location.uri, **kwargs) result = storage.initialize(size=size) - print("WRI3", self.writable, storage) self.update_file_metadata( result, readable=False, writable=True, storage_backend=type(storage).backend_name if isinstance(storage, StorageBackend) else None, ) - print("WRI4", self.writable, result) @ensure_writable() def init_contents(self, size=0, default_location: str=None, **kwargs): @@ -940,7 +936,6 @@ def update_file_metadata(self, file_metadata: Union[Tuple,Dict] = None, **kwargs file_metadata += (kwargs.get('writable', False),) if len(file_metadata) < 6: file_metadata += (kwargs.get('storage_class', None),) - print(file_metadata) self.set_uri(*file_metadata) elif isinstance(file_metadata, dict): file_metadata.update(kwargs) @@ -1626,7 +1621,6 @@ def create(cls, bucket, key, size, chunk_size): with db.session.begin_nested(): file_ = FileInstance.create() - print("WR2", file_.writable) file_.size = size obj = cls( upload_id=uuid.uuid4(), @@ -1639,13 +1633,11 @@ def create(cls, bucket, key, size, chunk_size): ) bucket.size += size db.session.add(obj) - print("WR3", file_.writable) file_.init_contents( size=size, default_location=bucket.location.uri, default_storage_class=bucket.default_storage_class, ) - print("WR4", file_.writable) return obj @classmethod @@ -1786,7 +1778,6 @@ def set_contents(self, stream, progress_callback=None): :param chunk_size: Desired chunk size to read stream in. It is up to the storage interface if it respects this value. 
""" - print("writable", self.multipart.file.writable) size, checksum = self.multipart.file.update_contents( stream, seek=self.start_byte, size=self.part_size, progress_callback=progress_callback, diff --git a/tests/test_models_multipart.py b/tests/test_models_multipart.py index a30e2c9b..8105b878 100644 --- a/tests/test_models_multipart.py +++ b/tests/test_models_multipart.py @@ -55,7 +55,6 @@ def test_part_creation(app, db, bucket, get_md5): mp = MultipartObject.create(bucket, 'test.txt', 5, 2) db.session.commit() assert bucket.size == 5 - print("WR", mp.file.writable) Part.create(mp, 2, stream=BytesIO(b'p')) Part.create(mp, 0, stream=BytesIO(b'p1')) Part.create(mp, 1, stream=BytesIO(b'p2')) diff --git a/tests/test_storage.py b/tests/test_storage.py index b1823815..5c3fcc27 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -122,7 +122,6 @@ def test_pyfs_save_limits(pyfs): result = pyfs.save(BytesIO(data), size_limit=len(data)) assert result['size'] == len(data) - print("-----") # Size doesn't match pytest.raises( UnexpectedFileSizeError, pyfs.save, BytesIO(data), size=len(data) - 1) @@ -138,11 +137,8 @@ def test_pyfs_save_limits(pyfs): def test_pyfs_update(pyfs, pyfs_testpath, get_md5): """Test update of file.""" pyfs.initialize(size=100) - print("E1", pyfs.open().read()[:6]) pyfs.update(BytesIO(b'cd'), seek=2, size=2) - print("E2", pyfs.open().read()[:6]) pyfs.update(BytesIO(b'ab'), seek=0, size=2) - print("E3", pyfs.open().read()[:6]) with open(pyfs_testpath) as fp: content = fp.read() From 282e6181419a7f895dcc28f59fd97ed628cdca5b Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Tue, 6 Oct 2020 15:13:07 +0100 Subject: [PATCH 15/41] Fix mypy errors --- invenio_files_rest/models.py | 4 +++- invenio_files_rest/storage/factory.py | 14 +++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/invenio_files_rest/models.py b/invenio_files_rest/models.py index 966f8c6f..64338499 100644 --- a/invenio_files_rest/models.py +++ b/invenio_files_rest/models.py @@ -48,6 +48,7 @@ from typing import Dict, TYPE_CHECKING, Tuple, Union import six +import typing from flask import current_app from functools import wraps from invenio_db import db @@ -835,6 +836,7 @@ def initialize(self, preferred_location: Location, size=0, **kwargs): @ensure_writable() def init_contents(self, size=0, default_location: str=None, **kwargs): + preferred_location: typing.Optional[Location] if default_location: preferred_location = Location(uri=default_location) else: @@ -1042,7 +1044,7 @@ def mimetype(self): """Get MIME type of object.""" return self._mimetype if self._mimetype else guess_mimetype(self.key) - @mimetype.setter + @mimetype.setter # type: ignore def mimetype(self, value): """Setter for MIME type.""" self._mimetype = value diff --git a/invenio_files_rest/storage/factory.py b/invenio_files_rest/storage/factory.py index d58068ee..43b42256 100644 --- a/invenio_files_rest/storage/factory.py +++ b/invenio_files_rest/storage/factory.py @@ -5,7 +5,7 @@ from flask import current_app from invenio_files_rest.models import FileInstance, Location -from .base import FileStorage +from .base import StorageBackend from ..helpers import make_path @@ -18,7 +18,7 @@ def __init__(self, app): def __call__( self, fileinstance: FileInstance, - ) -> Optional[FileStorage]: + ) -> Optional[StorageBackend]: """Returns a FileStorage instance for a file, for manipulating file contents. 
""" if not fileinstance.storage_backend: @@ -38,9 +38,9 @@ def initialize( fileinstance: FileInstance, size: int = 0, preferred_location: Location = None - ) -> FileStorage: + ) -> StorageBackend: if fileinstance.storage_backend: - return self(fileinstance) + return self(fileinstance) # type: ignore location = self.get_location(fileinstance, preferred_location) @@ -71,13 +71,13 @@ def get_storage_backend_name(self, fileinstance: FileInstance, preferred_locatio raise ValueError("preferred_location required for default storage factory") return preferred_location.storage_backend - def resolve_storage_backend(self, backend_name: str) -> Type[FileStorage]: + def resolve_storage_backend(self, backend_name: str) -> Type[StorageBackend]: return self.app.config['FILES_REST_STORAGE_BACKENDS'][backend_name] def get_storage_backend_kwargs( self, fileinstance: FileInstance, - storage_backend_cls: Type[FileStorage], + storage_backend_cls: Type[StorageBackend], ) -> Dict[str, Any]: return {} @@ -85,7 +85,7 @@ def get_suggested_uri( self, fileinstance: FileInstance, location: Location, - storage_backend_cls: Type[FileStorage], + storage_backend_cls: Type[StorageBackend], ): return make_path( location, From c62cfa0df5ebb6acdafb7460f81d1095b9d593ff Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Tue, 6 Oct 2020 19:17:25 +0100 Subject: [PATCH 16/41] Remove StorageBackend metaclass MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's always a better option than metaclasses. In this case, a classmethod, which also simplifies getting the backend name on instances (`type(instance).backend_name` → `instance.get_backend_name()`) --- invenio_files_rest/models.py | 2 +- invenio_files_rest/storage/base.py | 28 +++++++++++++--------------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/invenio_files_rest/models.py b/invenio_files_rest/models.py index 64338499..490f41bb 100644 --- a/invenio_files_rest/models.py +++ b/invenio_files_rest/models.py @@ -831,7 +831,7 @@ def initialize(self, preferred_location: Location, size=0, **kwargs): result, readable=False, writable=True, - storage_backend=type(storage).backend_name if isinstance(storage, StorageBackend) else None, + storage_backend=storage.get_backend_name() if isinstance(storage, StorageBackend) else None, ) @ensure_writable() diff --git a/invenio_files_rest/storage/base.py b/invenio_files_rest/storage/base.py index d7404c2b..15d06d11 100644 --- a/invenio_files_rest/storage/base.py +++ b/invenio_files_rest/storage/base.py @@ -33,9 +33,19 @@ __all__ = ['FileStorage', 'StorageBackend'] -class StorageBackendMeta(type): - @property - def backend_name(cls): +class StorageBackend: + """Base class for storage interface to a single file.""" + + checksum_hash_name = 'md5' + + def __init__(self, uri: str=None, size: int=None, modified: datetime=None, *): + """Initialize storage object.""" + self.uri = uri + self._size = size + self._modified = timegm(modified.timetuple()) if modified else None + + @classmethod + def get_backend_name(cls): try: return cls._backend_name except AttributeError: @@ -47,18 +57,6 @@ def backend_name(cls): raise RuntimeError("{} isn't listed in FILES_REST_STORAGE_BACKENDS config".format(cls)) return cls._backend_name - -class StorageBackend(metaclass=StorageBackendMeta): - """Base class for storage interface to a single file.""" - - checksum_hash_name = 'md5' - - def __init__(self, uri: str=None, size: int=None, modified: datetime=None, *, filepath=None): - """Initialize storage object.""" 
- self.uri = uri or filepath - self._size = size - self._modified = timegm(modified.timetuple()) if modified else None - def open(self, mode=None): """Open the file. From 88e5fa8a1d7115d63e76d17481bedb9dd561834a Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Tue, 6 Oct 2020 19:20:15 +0100 Subject: [PATCH 17/41] Fix typo --- invenio_files_rest/storage/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/invenio_files_rest/storage/base.py b/invenio_files_rest/storage/base.py index 15d06d11..1738a37e 100644 --- a/invenio_files_rest/storage/base.py +++ b/invenio_files_rest/storage/base.py @@ -38,7 +38,7 @@ class StorageBackend: checksum_hash_name = 'md5' - def __init__(self, uri: str=None, size: int=None, modified: datetime=None, *): + def __init__(self, uri: str=None, size: int=None, modified: datetime=None): """Initialize storage object.""" self.uri = uri self._size = size From 705b95e16f08a3ee7198da67f7adceb7fdca17e2 Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Tue, 6 Oct 2020 20:40:27 +0100 Subject: [PATCH 18/41] Reinstate _write_stream, and use context managers --- invenio_files_rest/models.py | 2 +- invenio_files_rest/storage/base.py | 119 ++++++++++++++++------------- invenio_files_rest/storage/pyfs.py | 68 ++--------------- 3 files changed, 76 insertions(+), 113 deletions(-) diff --git a/invenio_files_rest/models.py b/invenio_files_rest/models.py index 490f41bb..ec1cbb4e 100644 --- a/invenio_files_rest/models.py +++ b/invenio_files_rest/models.py @@ -1088,7 +1088,7 @@ def set_contents(self, stream, chunk_size=None, size=None, size_limit=None, @ensure_no_file() @update_bucket_size - def set_location(self, uri, size, checksum, storage_class=None, storage_backend=None): + def set_location(self, uri, size, checksum, storage_class=None, storage_backend=None): """Set only URI location of for object. Useful to link files on externally controlled storage. 
If a file diff --git a/invenio_files_rest/storage/base.py b/invenio_files_rest/storage/base.py index 1738a37e..a98f4c9e 100644 --- a/invenio_files_rest/storage/base.py +++ b/invenio_files_rest/storage/base.py @@ -11,20 +11,19 @@ from __future__ import absolute_import, annotations, print_function import hashlib -import urllib.parse import warnings from calendar import timegm from datetime import datetime from functools import partial -from typing import Any, Callable, Dict, TYPE_CHECKING, Tuple +from typing import Callable, TYPE_CHECKING, Tuple +import typing from flask import current_app -from ..errors import FileSizeError, StorageError, UnexpectedFileSizeError -from ..helpers import chunk_size_or_default, compute_checksum, make_path, send_stream -from ..utils import check_size, check_sizelimit, PassthroughChecksum - from .legacy import FileStorage +from ..errors import StorageError +from ..helpers import chunk_size_or_default, compute_checksum, make_path, send_stream +from ..utils import PassthroughChecksum, check_size, check_sizelimit if TYPE_CHECKING: from ..models import FileInstance @@ -85,60 +84,78 @@ def save(self, incoming_stream, size_limit=None, size=None, chunk_size=None, progress_callback: Callable[[int, int], None] = None ): """Save incoming stream to file storage.""" + with self.get_save_stream() as output_stream: + result = self._write_stream( + incoming_stream, + output_stream, + size_limit=size_limit, + size=size, + chunk_size=chunk_size, + progress_callback=progress_callback, + ) + self._size = result['size'] + return { + 'uri': self.uri, + 'readable': True, + 'writable': False, + 'storage_class': 'S', + **result, + } + + def get_save_stream(self) -> typing.ContextManager: + """Save incoming stream to file storage.""" + raise NotImplementedError - incoming_stream = PassthroughChecksum( - incoming_stream, - hash_name=self.checksum_hash_name, - progress_callback=progress_callback, - size_limit=size_limit, - size=size, - ) + def update(self,incoming_stream, seek=0, size=None, chunk_size=None, + progress_callback=None) -> Tuple[int, str]: + """Update part of file with incoming stream.""" + with self.get_update_stream(seek) as output_stream: + result = self._write_stream( + incoming_stream, + output_stream, + size=size, + chunk_size=chunk_size, + progress_callback=progress_callback, + ) + self._size = seek + result['size'] + return result['size'], result['checksum'] + + def get_update_stream(self, seek) -> typing.ContextManager: + raise NotImplementedError - result = self._save( - incoming_stream, - size=None, - chunk_size=None - ) + def _write_stream(self, incoming_stream, output_stream, *, size_limit=None, size=None, chunk_size=None, progress_callback=None): + chunk_size = chunk_size_or_default(chunk_size) + checksum = hashlib.new(self.checksum_hash_name) if self.checksum_hash_name else None + update_sum = checksum.update if checksum else lambda chunk: None - check_size(incoming_stream.bytes_read, size) - self._size = incoming_stream.total_size + bytes_written = 0 - return { - 'checksum': incoming_stream.checksum, - 'size': incoming_stream.total_size, - 'uri': self.uri, - 'readable': True, - 'writable': False, - 'storage_class': 'S', - **result, - } + while True: + # Check that size limits aren't bypassed + check_sizelimit(size_limit, bytes_written, size) - def _save(self, incoming_stream, size_limit=None, size=None, - chunk_size=None) -> Dict[str, Any]: - """Save incoming stream to file storage.""" - raise NotImplementedError + chunk = incoming_stream.read(chunk_size) - def 
update(self, incoming_stream, seek=0, size=None, chunk_size=None, - progress_callback=None) -> Tuple[int, str]: - """Update part of file with incoming stream.""" - incoming_stream = PassthroughChecksum( - incoming_stream, - hash_name=self.checksum_hash_name, - progress_callback=progress_callback, - size=size, - ) + if not chunk: + if progress_callback: + progress_callback(bytes_written, bytes_written) + break - self._update( - incoming_stream, - seek=seek, - size=None, - chunk_size=chunk_size, - ) + output_stream.write(chunk) - return incoming_stream.bytes_read, incoming_stream.checksum + bytes_written += len(chunk) - def _update(self, incoming_stream, seek=0, size=None, chunk_size=None): - raise NotImplementedError + update_sum(chunk) + + if progress_callback: + progress_callback(None, bytes_written) + + check_size(bytes_written, size) + + return { + 'checksum': f'{self.checksum_hash_name}:{checksum.hexdigest()}' if checksum else None, + 'size': bytes_written, + } # # Default implementation diff --git a/invenio_files_rest/storage/pyfs.py b/invenio_files_rest/storage/pyfs.py index ebbe9fdb..cfc3b417 100644 --- a/invenio_files_rest/storage/pyfs.py +++ b/invenio_files_rest/storage/pyfs.py @@ -10,6 +10,7 @@ from __future__ import absolute_import, print_function +import contextlib import shutil from flask import current_app @@ -47,10 +48,6 @@ def __init__(self, *args, clean_dir=True, **kwargs): self.clean_dir = clean_dir super().__init__(*args, **kwargs) - @property - def filepath(self): - return self.uri - def _get_fs(self, create_dir=True): """Return tuple with filesystem and filename.""" filedir = dirname(self.uri) @@ -105,69 +102,18 @@ def _initialize(self, size=0): return {} - def _save(self, incoming_stream, size_limit=None, size=None, - chunk_size=None): - """Save file in the file system.""" + @contextlib.contextmanager + def get_save_stream(self): fp = self.open(mode='wb') try: - shutil.copyfileobj(incoming_stream, fp, length=chunk_size) + yield fp except Exception: - fp.close() self.delete() raise finally: fp.close() - return {} - - def _update(self, incoming_stream, seek=0, size=None, chunk_size=None, - progress_callback=None): - """Update a file in the file system.""" + def get_update_stream(self, seek): fp = self.open(mode='r+b') - try: - fp.seek(seek) - shutil.copyfileobj(incoming_stream, fp, length=chunk_size) - finally: - fp.close() - - def _write_stream(self, src, dst, size=None, size_limit=None, - chunk_size=None, progress_callback=None): - """Get helper to save stream from src to dest + compute checksum. - - :param src: Source stream. - :param dst: Destination stream. - :param size: If provided, this exact amount of bytes will be - written to the destination file. - :param size_limit: ``FileSizeLimit`` instance to limit number of bytes - to write. 
- """ - chunk_size = chunk_size_or_default(chunk_size) - - algo, m = self._init_hash() - bytes_written = 0 - - while 1: - # Check that size limits aren't bypassed - check_sizelimit(size_limit, bytes_written, size) - - chunk = src.read(chunk_size) - - if not chunk: - if progress_callback: - progress_callback(bytes_written, bytes_written) - break - - dst.write(chunk) - - bytes_written += len(chunk) - - if m: - m.update(chunk) - - if progress_callback: - progress_callback(None, bytes_written) - - check_size(bytes_written, size) - - return bytes_written, '{0}:{1}'.format( - algo, m.hexdigest()) if m else None + fp.seek(seek) + return contextlib.closing(fp) From 93bd542713457c026b447fd1aaa205119f28775a Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Tue, 6 Oct 2020 22:36:50 +0100 Subject: [PATCH 19/41] Improve checksumming --- invenio_files_rest/storage/base.py | 78 ++++++++++++------------------ 1 file changed, 32 insertions(+), 46 deletions(-) diff --git a/invenio_files_rest/storage/base.py b/invenio_files_rest/storage/base.py index a98f4c9e..e995deca 100644 --- a/invenio_files_rest/storage/base.py +++ b/invenio_files_rest/storage/base.py @@ -93,14 +93,16 @@ def save(self, incoming_stream, size_limit=None, size=None, chunk_size=chunk_size, progress_callback=progress_callback, ) - self._size = result['size'] - return { - 'uri': self.uri, - 'readable': True, - 'writable': False, - 'storage_class': 'S', - **result, - } + self._size = result['size'] + if not result['checksum']: + result['checksum'] = self.checksum(chunk_size=chunk_size) + return { + 'uri': self.uri, + 'readable': True, + 'writable': False, + 'storage_class': 'S', + **result, + } def get_save_stream(self) -> typing.ContextManager: """Save incoming stream to file storage.""" @@ -117,15 +119,16 @@ def update(self,incoming_stream, seek=0, size=None, chunk_size=None, chunk_size=chunk_size, progress_callback=progress_callback, ) - self._size = seek + result['size'] - return result['size'], result['checksum'] + self._size = seek + result['size'] + return result['size'], result['checksum'] def get_update_stream(self, seek) -> typing.ContextManager: raise NotImplementedError def _write_stream(self, incoming_stream, output_stream, *, size_limit=None, size=None, chunk_size=None, progress_callback=None): chunk_size = chunk_size_or_default(chunk_size) - checksum = hashlib.new(self.checksum_hash_name) if self.checksum_hash_name else None + + algo, checksum = self._init_hash() update_sum = checksum.update if checksum else lambda chunk: None bytes_written = 0 @@ -194,18 +197,22 @@ def send_file(self, filename, mimetype=None, restricted=True, fp.close() raise StorageError('Could not send file: {}'.format(e)) - def checksum(self, chunk_size=None, progress_callback=None, **kwargs): + def checksum(self, chunk_size=None, progress_callback=None): """Compute checksum of file.""" - fp = self.open(mode='rb') - try: - value = self._compute_checksum( - fp, size=self._size, chunk_size=None, - progress_callback=progress_callback) - except StorageError: - raise - finally: - fp.close() - return value + + algo, m = self._init_hash() + if not m: + return None + + chunk_size = chunk_size_or_default(chunk_size) + + with self.open(mode='rb') as fp: + algo, m = self._init_hash() + return compute_checksum( + fp, algo, m, + chunk_size=chunk_size, + progress_callback=progress_callback + ) def copy(self, src, chunk_size=None, progress_callback=None): """Copy data from another file instance. 
@@ -240,28 +247,7 @@ def _init_hash(self): Overwrite this method if you want to use different checksum algorithm for your storage backend. """ - return 'md5', hashlib.md5() - - def _compute_checksum(self, stream, size=None, chunk_size=None, - progress_callback=None, **kwargs): - """Get helper method to compute checksum from a stream. - - Naive implementation that can be overwritten by subclasses in order to - provide more efficient implementation. - """ - if progress_callback and size: - progress_callback = partial(progress_callback, size) + if self.checksum_hash_name: + return self.checksum_hash_name, hashlib.new(self.checksum_hash_name) else: - progress_callback = None - - try: - algo, m = self._init_hash() - return compute_checksum( - stream, algo, m, - chunk_size=chunk_size, - progress_callback=progress_callback, - **kwargs - ) - except Exception as e: - raise StorageError( - 'Could not compute checksum of file: {0}'.format(e)) + return None, None From a8fbd15b1bf55ba5283b11beb67ee09d5cc99d16 Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Tue, 6 Oct 2020 22:37:16 +0100 Subject: [PATCH 20/41] Add the beginning of documentation for writing storage backends --- docs/new-storage-backend.rst | 86 ++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/docs/new-storage-backend.rst b/docs/new-storage-backend.rst index 576cd3c6..684c4e13 100644 --- a/docs/new-storage-backend.rst +++ b/docs/new-storage-backend.rst @@ -1,3 +1,89 @@ Developing a new storage backend ================================ +A storage backend should subclass ``invenio_files_rest.storage.StorageBackend`` and should minimally implement the +``open()``, ``_initialize(size)``, ``get_save_stream()`` and ``update_stream(seek)`` methods. You should +register the backend with an entry point in your ``setup.py``: + +.. code-block:: python + + setup( + ..., + entry_points={ + ..., + 'invenio_files_rest.storage': [ + 'mybackend = mypackage.storage:MyStorageBackend', + ], + ... + }, + ... + ) + +Implementation +-------------- + +The base class handles reporting progress, file size limits and checksumming. + +Here's an example implementation of a storage backend that stores files remotely over HTTP with no authentication. + +.. code-block:: python + + import contextlib + import httplib + import urllib + import urllib.parse + + from invenio_files_rest.storage import StorageBackend + + class StorageBackend(StorageBackend): + def open(self): + return contextlib.closing( + urllib.urlopen('GET', self.uri) + ) + + def _initialize(self, size=0): + # Allocate space for the file. You can use `self.uri` as a suggested location, or return + # a new location as e.g. `{"uri": the_new_uri}`. + urllib.urlopen('POST', self.uri, headers={'X-Expected-Size': str(size)}) + return {} + + @contextlib.contextmanager + def get_save_stream(self): + # This should be a context manager (i.e. something that can be used in a `with` statement) + # which closes the file when exiting the context manager and performs any clear-up if + # an error occurs. 
+ parsed_uri = urllib.parse.urlparse(self.uri) + # Assume the URI is HTTP, not HTTPS, and doesn't have a port or querystring + conn = httplib.HTTPConnection(parsed_uri.netloc) + + conn.putrequest('PUT', parsed_uri.path) + conn.endheaders() + + # HTTPConnections have a `send` method, whereas a file-like object should have `write` + conn.write = conn.send + + try: + yield conn.send + response = conn.getresponse() + if not 200 <= response.status < 300: + raise IOError("HTTP error") + finally: + conn.close() + + +Checksumming +------------ + +The base class performs checksumming by default, using the ``checksum_hash_name`` class or instance attribute as +the hashlib hashing function to use. If your underlying storage system provides checksumming functionality you can set +this to ``None`` and override ``checksum()``: + +.. code-block:: python + + class RemoteChecksumStorageBackend(StorageBackend): + checksum_hash_name = None + + def checksum(self, chunk_size=None): + checksum = urllib.urlopen(self.uri + '?checksum=sha256').read() + return f'sha256:{checksum}' + From 8a25500f0aefeec63f95a7ffd60cdf62160c7464 Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Tue, 13 Oct 2020 20:25:55 +0100 Subject: [PATCH 21/41] Opening via the open() method should only be for reading --- invenio_files_rest/storage/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/invenio_files_rest/storage/base.py b/invenio_files_rest/storage/base.py index e995deca..8b71732a 100644 --- a/invenio_files_rest/storage/base.py +++ b/invenio_files_rest/storage/base.py @@ -56,7 +56,7 @@ def get_backend_name(cls): raise RuntimeError("{} isn't listed in FILES_REST_STORAGE_BACKENDS config".format(cls)) return cls._backend_name - def open(self, mode=None): + def open(self): """Open the file. The caller is responsible for closing the file. From 7820f846f22f66782fbd2eb695f7e9d2a1d8eb3c Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Tue, 13 Oct 2020 20:26:35 +0100 Subject: [PATCH 22/41] Add delete() to list of methods in docs that need defining --- docs/new-storage-backend.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/new-storage-backend.rst b/docs/new-storage-backend.rst index 684c4e13..f5ee1cc6 100644 --- a/docs/new-storage-backend.rst +++ b/docs/new-storage-backend.rst @@ -2,7 +2,7 @@ Developing a new storage backend ================================ A storage backend should subclass ``invenio_files_rest.storage.StorageBackend`` and should minimally implement the -``open()``, ``_initialize(size)``, ``get_save_stream()`` and ``update_stream(seek)`` methods. You should +``open()``, ``delete()``, ``_initialize(size)``, ``get_save_stream()`` and ``update_stream(seek)`` methods. You should register the backend with an entry point in your ``setup.py``: .. 
code-block:: python @@ -35,6 +35,7 @@ Here's an example implementation of a storage backend that stores files remotely from invenio_files_rest.storage import StorageBackend + class StorageBackend(StorageBackend): def open(self): return contextlib.closing( From d4e660ef950688c96f94dd78dc27b7eb94f571e9 Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Tue, 13 Oct 2020 20:32:01 +0100 Subject: [PATCH 23/41] Simplify by removing now-unused PassthroughChecksum --- invenio_files_rest/models.py | 8 ++--- invenio_files_rest/storage/base.py | 2 +- invenio_files_rest/utils.py | 56 ------------------------------ 3 files changed, 3 insertions(+), 63 deletions(-) diff --git a/invenio_files_rest/models.py b/invenio_files_rest/models.py index ec1cbb4e..67609fcc 100644 --- a/invenio_files_rest/models.py +++ b/invenio_files_rest/models.py @@ -66,7 +66,7 @@ MultipartInvalidSize, MultipartMissingParts, MultipartNotCompleted from .proxies import current_files_rest from .storage.base import StorageBackend -from .utils import ENCODING_MIMETYPES, PassthroughChecksum, guess_mimetype +from .utils import ENCODING_MIMETYPES, guess_mimetype if TYPE_CHECKING: from .storage import StorageBackend @@ -871,12 +871,8 @@ def set_contents(self, stream, chunk_size=None, size=None, size_limit=None, from. :param stream: File-like stream. """ - wrapped_stream = PassthroughChecksum(stream, - hash_name='md5', - progress_callback=progress_callback) - storage = self.storage(**kwargs) - self.update_file_metadata(storage.save(wrapped_stream, chunk_size=chunk_size, size=size, + self.update_file_metadata(storage.save(stream, chunk_size=chunk_size, size=size, size_limit=size_limit, progress_callback=progress_callback)) self.storage_backend = type(storage).backend_name if isinstance(storage, StorageBackend) else None diff --git a/invenio_files_rest/storage/base.py b/invenio_files_rest/storage/base.py index 8b71732a..9626461d 100644 --- a/invenio_files_rest/storage/base.py +++ b/invenio_files_rest/storage/base.py @@ -23,7 +23,7 @@ from .legacy import FileStorage from ..errors import StorageError from ..helpers import chunk_size_or_default, compute_checksum, make_path, send_stream -from ..utils import PassthroughChecksum, check_size, check_sizelimit +from ..utils import check_size, check_sizelimit if TYPE_CHECKING: from ..models import FileInstance diff --git a/invenio_files_rest/utils.py b/invenio_files_rest/utils.py index d667f1dd..95e548fd 100644 --- a/invenio_files_rest/utils.py +++ b/invenio_files_rest/utils.py @@ -95,59 +95,3 @@ def check_size(bytes_written, total_size): if total_size and bytes_written < total_size: raise UnexpectedFileSizeError( description='File is smaller than expected.') - - - -class PassthroughChecksum: - def __init__( - self, - fp, - hash_name, - progress_callback: Callable[[int, int], None] = None, - offset: int = 0, - size_limit: int = None, - size: int=None - ): - """ - :param fp: A file-like option open for reading - :param hash_name: A hashlib hash algorithm name - :param progress_callback: An optional function that will receive the number of bytes read, and the total file - size - """ - self.hash_name = hash_name - self._sum = hashlib.new(hash_name) if hash_name else None - self._update_sum = self._sum.update if self._sum else lambda chunk: None - self._fp = fp - self._bytes_read = 0 - self._progress_callback = progress_callback - self._offset = offset - self._size_limit = size_limit - self._size = size - - def read(self, size=-1): - chunk = self._fp.read(size) - self._bytes_read += len(chunk) - print("CSL", 
self._size_limit, self.bytes_read, self._size) - check_sizelimit(self._size_limit, self.bytes_read, self._size) - self._update_sum(chunk) - if self._progress_callback: - self._progress_callback(self._bytes_read, self._bytes_read + self._offset) - return chunk - - def __getattr__(self, item): - return getattr(self._fp, item) - - @property - def checksum(self): - """The {hash_name}:{hash} string for the bytes read so far.""" - if self._sum: - return '{0}:{1}'.format( - self.hash_name, self._sum.hexdigest()) - - @property - def bytes_read(self): - return self._bytes_read - - @property - def total_size(self): - return self._bytes_read + self._offset From a508e3e496ef5e7586e49434e4edf00c7c952db8 Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Tue, 13 Oct 2020 20:32:20 +0100 Subject: [PATCH 24/41] No longer need to get class object to look up storage name --- invenio_files_rest/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/invenio_files_rest/models.py b/invenio_files_rest/models.py index 67609fcc..c7fa912b 100644 --- a/invenio_files_rest/models.py +++ b/invenio_files_rest/models.py @@ -875,7 +875,7 @@ def set_contents(self, stream, chunk_size=None, size=None, size_limit=None, self.update_file_metadata(storage.save(stream, chunk_size=chunk_size, size=size, size_limit=size_limit, progress_callback=progress_callback)) - self.storage_backend = type(storage).backend_name if isinstance(storage, StorageBackend) else None + self.storage_backend = storage.get_backend_name() if isinstance(storage, StorageBackend) else None @ensure_writable() From 429504f5cfc8ec150f2b77c470c3e6e5d1e77ee0 Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Tue, 13 Oct 2020 20:56:46 +0100 Subject: [PATCH 25/41] docstrings for the storage factory --- invenio_files_rest/storage/factory.py | 46 ++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/invenio_files_rest/storage/factory.py b/invenio_files_rest/storage/factory.py index 43b42256..ec9bb963 100644 --- a/invenio_files_rest/storage/factory.py +++ b/invenio_files_rest/storage/factory.py @@ -1,3 +1,5 @@ +"""Contains the base storage factory implementation.""" + import os import urllib.parse from typing import Any, Dict, Optional, Type @@ -10,16 +12,24 @@ class StorageFactory: - """A base storage factory, with sensible default behaviour.""" + """A base storage factory, with sensible default behaviour. + + You may subclass this factory to implement custom behaviour. If you do this, remember to set + FILES_REST_STORAGE_FACTORY to the right import path for your subclass. + """ def __init__(self, app): + """Initialize the storage factory.""" self.app = app def __call__( self, fileinstance: FileInstance, ) -> Optional[StorageBackend]: - """Returns a FileStorage instance for a file, for manipulating file contents. + """Return a FileStorage instance for a file, for manipulating file contents. + + This requires that the fileinstance already has an associated storage backend. If not, + you should call initialize() instead to initialize storage for the file instance. """ if not fileinstance.storage_backend: return None @@ -39,6 +49,11 @@ def initialize( size: int = 0, preferred_location: Location = None ) -> StorageBackend: + """Initialize storage for a new file. + + If provided, `preferred_location` will inform where the file will be stored, but may be ignored + if this factory is subclassed and `get_location()` is overridden. 
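For example, a subclass might pin all new files to one named location and fall back to the default behaviour otherwise. A minimal sketch (the location name ``'primary'`` is purely illustrative):

.. code-block:: python

    from invenio_files_rest.models import Location
    from invenio_files_rest.storage import StorageFactory


    class PinnedLocationStorageFactory(StorageFactory):
        """Illustrative factory that prefers a location named 'primary'."""

        def get_location(self, fileinstance, preferred_location=None):
            pinned = Location.query.filter_by(name='primary').first()
            # Fall back to the base behaviour (preferred or default location)
            # when the pinned location does not exist.
            return pinned or super().get_location(
                fileinstance, preferred_location)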
+ """ if fileinstance.storage_backend: return self(fileinstance) # type: ignore @@ -63,15 +78,19 @@ def initialize( ) def get_location(self, fileinstance: FileInstance, preferred_location: Location = None) -> Location: - return preferred_location or Location.get_default() + """Return a Location for storing a new file. - def get_storage_backend_name(self, fileinstance: FileInstance, preferred_location: Location) -> str: - """""" - if not preferred_location: - raise ValueError("preferred_location required for default storage factory") - return preferred_location.storage_backend + This base implementation returns the preferred_location if it's provided, or the default location + recorded in the database. This method can be overridden if some other methodology is required. + """ + return preferred_location or Location.get_default() def resolve_storage_backend(self, backend_name: str) -> Type[StorageBackend]: + """Resolve a storage backend name to the associated storage backend class. + + This base implementation resolves backends from the FILES_REST_STORAGE_BACKENDS app config + setting. + """ return self.app.config['FILES_REST_STORAGE_BACKENDS'][backend_name] def get_storage_backend_kwargs( @@ -79,6 +98,11 @@ def get_storage_backend_kwargs( fileinstance: FileInstance, storage_backend_cls: Type[StorageBackend], ) -> Dict[str, Any]: + """Retrieve any instantiation kwargs for the storage backend. + + This returns an empty dict by defaut, but can be overridden to provide backend-specific + instantiation parameters if necessary. + """ return {} def get_suggested_uri( @@ -87,6 +111,12 @@ def get_suggested_uri( location: Location, storage_backend_cls: Type[StorageBackend], ): + """Generate a suggested URI for new files. + + This can be overridden if your implementation requires some other file layout for storage. + Note that individual storage backends may choose to ignore the suggested URI if they + organise files by some other scheme. 
+ """ return make_path( location, str(fileinstance.id), From 0aeefc4f7eb7b7329df62a84216e0d8331f89c3e Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Tue, 13 Oct 2020 21:47:54 +0100 Subject: [PATCH 26/41] pydocstyle fixes --- invenio_files_rest/models.py | 2 ++ invenio_files_rest/storage/base.py | 24 +++++++++++++++++++----- invenio_files_rest/storage/legacy.py | 11 ++++++++++- invenio_files_rest/storage/pyfs.py | 2 ++ 4 files changed, 33 insertions(+), 6 deletions(-) diff --git a/invenio_files_rest/models.py b/invenio_files_rest/models.py index c7fa912b..132a8d15 100644 --- a/invenio_files_rest/models.py +++ b/invenio_files_rest/models.py @@ -836,6 +836,7 @@ def initialize(self, preferred_location: Location, size=0, **kwargs): @ensure_writable() def init_contents(self, size=0, default_location: str=None, **kwargs): + """Initialize storage for this FileInstance.""" preferred_location: typing.Optional[Location] if default_location: preferred_location = Location(uri=default_location) @@ -922,6 +923,7 @@ def set_uri(self, uri, size, checksum, readable=True, writable=False, _FILE_METADATA_FIELDS = {'uri', 'size', 'checksum', 'writable', 'readable', 'storage_class'} def update_file_metadata(self, file_metadata: Union[Tuple,Dict] = None, **kwargs): + """Update the file metadata, usually as a result of a storage backend operation.""" if file_metadata is None: file_metadata = {} diff --git a/invenio_files_rest/storage/base.py b/invenio_files_rest/storage/base.py index 9626461d..784660e0 100644 --- a/invenio_files_rest/storage/base.py +++ b/invenio_files_rest/storage/base.py @@ -15,7 +15,7 @@ from calendar import timegm from datetime import datetime from functools import partial -from typing import Callable, TYPE_CHECKING, Tuple +from typing import Any, Callable, Dict, TYPE_CHECKING, Tuple import typing from flask import current_app @@ -45,6 +45,10 @@ def __init__(self, uri: str=None, size: int=None, modified: datetime=None): @classmethod def get_backend_name(cls): + """Return the backend name for this StorageBackend. + + This performs a reverse-lookup in FILES_REST_STORAGE_BACKENDS and then caches the result. + """ try: return cls._backend_name except AttributeError: @@ -68,7 +72,7 @@ def delete(self): raise NotImplementedError def initialize(self, size=0): - """Initialize the file on the storage + truncate to the given size.""" + """Initialize the file on the storage and truncate to the given size.""" return { 'readable': False, 'writable': True, @@ -77,7 +81,8 @@ def initialize(self, size=0): **self._initialize(size=size), } - def _initialize(self, size=0): + def _initialize(self, size=0) -> Dict[Any, str]: + """Override this to perform file storage initialization.""" raise NotImplementedError def save(self, incoming_stream, size_limit=None, size=None, @@ -105,7 +110,11 @@ def save(self, incoming_stream, size_limit=None, size=None, } def get_save_stream(self) -> typing.ContextManager: - """Save incoming stream to file storage.""" + """Return a context manager for a file-like object for writing to the file. + + The return value should be a context manager that provides a file-like object when entered, and performs any necessary + clean-up when exited (e.g. closing the file). 
+ """ raise NotImplementedError def update(self,incoming_stream, seek=0, size=None, chunk_size=None, @@ -123,9 +132,15 @@ def update(self,incoming_stream, seek=0, size=None, chunk_size=None, return result['size'], result['checksum'] def get_update_stream(self, seek) -> typing.ContextManager: + """Return a context manager for a file-like object for updating the file. + + The return value should be a context manager that provides a file-like object when entered, and performs any necessary + clean-up when exited (e.g. closing the file). + """ raise NotImplementedError def _write_stream(self, incoming_stream, output_stream, *, size_limit=None, size=None, chunk_size=None, progress_callback=None): + """Copy from one stream to another while honoring size limits and requested progress callbacks.""" chunk_size = chunk_size_or_default(chunk_size) algo, checksum = self._init_hash() @@ -199,7 +214,6 @@ def send_file(self, filename, mimetype=None, restricted=True, def checksum(self, chunk_size=None, progress_callback=None): """Compute checksum of file.""" - algo, m = self._init_hash() if not m: return None diff --git a/invenio_files_rest/storage/legacy.py b/invenio_files_rest/storage/legacy.py index f387152a..1a308cf7 100644 --- a/invenio_files_rest/storage/legacy.py +++ b/invenio_files_rest/storage/legacy.py @@ -9,7 +9,8 @@ """The previous invenio_files_rest.storage implementation. These classes and factory are included so that we can test that there are no regressions against them when -implementing the new storage interface, and so ensure backwards compatibility.""" +implementing the new storage interface, and so ensure backwards compatibility. +""" from __future__ import absolute_import, print_function @@ -38,6 +39,7 @@ def __init__(self, size=None, modified=None): def open(self, mode=None): """Open the file. + The caller is responsible for closing the file. """ raise NotImplementedError @@ -112,6 +114,7 @@ def checksum(self, chunk_size=None, progress_callback=None, **kwargs): def copy(self, src, chunk_size=None, progress_callback=None): """Copy data from another file instance. + :param src: Source stream. :param chunk_size: Chunk size to read from source stream. """ @@ -127,6 +130,7 @@ def copy(self, src, chunk_size=None, progress_callback=None): # def _init_hash(self): """Initialize message digest object. + Overwrite this method if you want to use different checksum algorithm for your storage backend. """ @@ -135,6 +139,7 @@ def _init_hash(self): def _compute_checksum(self, stream, size=None, chunk_size=None, progress_callback=None, **kwargs): """Get helper method to compute checksum from a stream. + Naive implementation that can be overwritten by subclasses in order to provide more efficient implementation. """ @@ -158,6 +163,7 @@ def _compute_checksum(self, stream, size=None, chunk_size=None, def _write_stream(self, src, dst, size=None, size_limit=None, chunk_size=None, progress_callback=None): """Get helper to save stream from src to dest + compute checksum. + :param src: Source stream. :param dst: Destination stream. :param size: If provided, this exact amount of bytes will be @@ -199,6 +205,7 @@ def _write_stream(self, src, dst, size=None, size_limit=None, class PyFSFileStorage(FileStorage): """File system storage using PyFilesystem for access the file. + This storage class will store files according to the following pattern: ``//data``. .. warning:: @@ -226,6 +233,7 @@ def _get_fs(self, create_dir=True): def open(self, mode='rb'): """Open file. 
+ The caller is responsible for closing the file. """ fs, path = self._get_fs() @@ -233,6 +241,7 @@ def open(self, mode='rb'): def delete(self): """Delete a file. + The base directory is also removed, as it is assumed that only one file exists in the directory. """ diff --git a/invenio_files_rest/storage/pyfs.py b/invenio_files_rest/storage/pyfs.py index cfc3b417..c126f686 100644 --- a/invenio_files_rest/storage/pyfs.py +++ b/invenio_files_rest/storage/pyfs.py @@ -104,6 +104,7 @@ def _initialize(self, size=0): @contextlib.contextmanager def get_save_stream(self): + """Open the underlying file for writing, and delete it if there was a problem.""" fp = self.open(mode='wb') try: yield fp @@ -114,6 +115,7 @@ def get_save_stream(self): fp.close() def get_update_stream(self, seek): + """Open the underlying file for updates, seeking as requested.""" fp = self.open(mode='r+b') fp.seek(seek) return contextlib.closing(fp) From 8a55a0bfdb0edd588e8143433f55f0cd1dd0619c Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Tue, 13 Oct 2020 21:48:04 +0100 Subject: [PATCH 27/41] Remove unused method on base StorageBackend --- invenio_files_rest/storage/base.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/invenio_files_rest/storage/base.py b/invenio_files_rest/storage/base.py index 784660e0..f875e0a1 100644 --- a/invenio_files_rest/storage/base.py +++ b/invenio_files_rest/storage/base.py @@ -242,16 +242,6 @@ def copy(self, src, chunk_size=None, progress_callback=None): finally: fp.close() - @classmethod - def get_uri(self, fileinstance: FileInstance, base_uri: str) -> str: - return make_path( - base_uri, - str(fileinstance.id), - 'data', - current_app.config['FILES_REST_STORAGE_PATH_DIMENSIONS'], - current_app.config['FILES_REST_STORAGE_PATH_SPLIT_LENGTH'], - ) - # # Helpers # From f03154a4205679a421540440d00f87efbd2de3be Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Tue, 13 Oct 2020 22:34:28 +0100 Subject: [PATCH 28/41] list to tuple for __all__, to placate pydocstyle --- invenio_files_rest/storage/base.py | 2 +- invenio_files_rest/storage/pyfs.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/invenio_files_rest/storage/base.py b/invenio_files_rest/storage/base.py index f875e0a1..4661515c 100644 --- a/invenio_files_rest/storage/base.py +++ b/invenio_files_rest/storage/base.py @@ -29,7 +29,7 @@ from ..models import FileInstance -__all__ = ['FileStorage', 'StorageBackend'] +__all__ = ('FileStorage', 'StorageBackend') class StorageBackend: diff --git a/invenio_files_rest/storage/pyfs.py b/invenio_files_rest/storage/pyfs.py index c126f686..c664dc38 100644 --- a/invenio_files_rest/storage/pyfs.py +++ b/invenio_files_rest/storage/pyfs.py @@ -21,7 +21,7 @@ from .base import StorageBackend from .legacy import PyFSFileStorage, pyfs_storage_factory -__all__ = ['PyFSFileStorage', 'pyfs_storage_factory', 'PyFSStorageBackend'] +__all__ = ('PyFSFileStorage', 'pyfs_storage_factory', 'PyFSStorageBackend') from ..utils import check_size, check_sizelimit From 501f35c630584d18fb550af515d67f3d08427e56 Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Wed, 14 Oct 2020 09:23:16 +0100 Subject: [PATCH 29/41] docstring fixes for pydocstyle --- invenio_files_rest/storage/legacy.py | 2 ++ invenio_files_rest/storage/pyfs.py | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/invenio_files_rest/storage/legacy.py b/invenio_files_rest/storage/legacy.py index 1a308cf7..d8da54a5 100644 --- a/invenio_files_rest/storage/legacy.py +++ b/invenio_files_rest/storage/legacy.py @@ -208,7 +208,9 
@@ class PyFSFileStorage(FileStorage): This storage class will store files according to the following pattern: ``//data``. + .. warning:: + File operations are not atomic. E.g. if errors happens during e.g. updating part of a file it will leave the file in an inconsistent state. The storage class tries as best as possible to handle errors diff --git a/invenio_files_rest/storage/pyfs.py b/invenio_files_rest/storage/pyfs.py index c664dc38..188df412 100644 --- a/invenio_files_rest/storage/pyfs.py +++ b/invenio_files_rest/storage/pyfs.py @@ -38,7 +38,6 @@ class PyFSStorageBackend(StorageBackend): updating part of a file it will leave the file in an inconsistent state. The storage class tries as best as possible to handle errors and leave the system in a consistent state. - """ def __init__(self, *args, clean_dir=True, **kwargs): From 5ac96fedfe3d07376e9363c8ba342ac394dee386 Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Wed, 14 Oct 2020 09:33:38 +0100 Subject: [PATCH 30/41] Remove unused imports and unused Py3.7+ annotations future --- invenio_files_rest/models.py | 2 +- invenio_files_rest/storage/base.py | 11 ++++------- invenio_files_rest/storage/factory.py | 2 -- invenio_files_rest/utils.py | 3 +-- 4 files changed, 6 insertions(+), 12 deletions(-) diff --git a/invenio_files_rest/models.py b/invenio_files_rest/models.py index 132a8d15..7f49d543 100644 --- a/invenio_files_rest/models.py +++ b/invenio_files_rest/models.py @@ -33,7 +33,7 @@ model. """ -from __future__ import absolute_import, annotations, print_function +from __future__ import absolute_import, print_function import mimetypes diff --git a/invenio_files_rest/storage/base.py b/invenio_files_rest/storage/base.py index 4661515c..f8596459 100644 --- a/invenio_files_rest/storage/base.py +++ b/invenio_files_rest/storage/base.py @@ -8,25 +8,22 @@ """File storage base module.""" -from __future__ import absolute_import, annotations, print_function +from __future__ import absolute_import, print_function import hashlib import warnings from calendar import timegm from datetime import datetime -from functools import partial -from typing import Any, Callable, Dict, TYPE_CHECKING, Tuple +from typing import Any, Callable, Dict, Tuple import typing from flask import current_app from .legacy import FileStorage from ..errors import StorageError -from ..helpers import chunk_size_or_default, compute_checksum, make_path, send_stream +from ..helpers import chunk_size_or_default, compute_checksum, send_stream from ..utils import check_size, check_sizelimit -if TYPE_CHECKING: - from ..models import FileInstance __all__ = ('FileStorage', 'StorageBackend') @@ -37,7 +34,7 @@ class StorageBackend: checksum_hash_name = 'md5' - def __init__(self, uri: str=None, size: int=None, modified: datetime=None): + def __init__(self, uri: str = None, size: int = None, modified: datetime = None): """Initialize storage object.""" self.uri = uri self._size = size diff --git a/invenio_files_rest/storage/factory.py b/invenio_files_rest/storage/factory.py index ec9bb963..d03cdcee 100644 --- a/invenio_files_rest/storage/factory.py +++ b/invenio_files_rest/storage/factory.py @@ -1,7 +1,5 @@ """Contains the base storage factory implementation.""" -import os -import urllib.parse from typing import Any, Dict, Optional, Type from flask import current_app diff --git a/invenio_files_rest/utils.py b/invenio_files_rest/utils.py index 95e548fd..f64cb4ea 100644 --- a/invenio_files_rest/utils.py +++ b/invenio_files_rest/utils.py @@ -7,9 +7,8 @@ # under the terms of the MIT License; see 
LICENSE file for more details. """Implementation of various utility functions.""" -import hashlib + import mimetypes -from typing import Callable import six from flask import current_app From 554689933ba072f98bef504f364f596b914ef7a1 Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Wed, 14 Oct 2020 11:09:50 +0100 Subject: [PATCH 31/41] PEP-8 style fixes. Mostly line-length-related. --- invenio_files_rest/ext.py | 5 +- invenio_files_rest/models.py | 63 +++++++++++++------ invenio_files_rest/storage/base.py | 91 +++++++++++++++++++-------- invenio_files_rest/storage/factory.py | 64 ++++++++++++------- invenio_files_rest/storage/legacy.py | 11 ++-- invenio_files_rest/storage/pyfs.py | 6 +- invenio_files_rest/utils.py | 1 - tests/test_legacy_storage.py | 5 +- 8 files changed, 169 insertions(+), 77 deletions(-) diff --git a/invenio_files_rest/ext.py b/invenio_files_rest/ext.py index 09eaa4f8..bba5822b 100644 --- a/invenio_files_rest/ext.py +++ b/invenio_files_rest/ext.py @@ -36,8 +36,9 @@ def storage_factory(self): 'invenio_files_rest.storage.pyfs_storage_factory', ]: warnings.warn(DeprecationWarning( - "The " + self.app.config['FILES_REST_STORAGE_FACTORY'] + " storage factory has been deprecated in " - "favour of 'invenio_files_rest.storage:StorageFactory" + "The " + self.app.config['FILES_REST_STORAGE_FACTORY'] + + " storage factory has been deprecated in favour of" + " 'invenio_files_rest.storage:StorageFactory" )) storage_factory = load_or_import_from_config( 'FILES_REST_STORAGE_FACTORY', app=self.app diff --git a/invenio_files_rest/models.py b/invenio_files_rest/models.py index 7f49d543..4e236bc2 100644 --- a/invenio_files_rest/models.py +++ b/invenio_files_rest/models.py @@ -111,7 +111,7 @@ def as_bucket_id(value): def as_object_version(value): - """Get an object version object from an object version ID or an object version. + """Get an ObjectVersion object from an ID or an ObjectVersion. :param value: A :class:`invenio_files_rest.models.ObjectVersion` or an object version ID. @@ -765,9 +765,13 @@ def storage(self, **kwargs) -> StorageBackend: :returns: Storage interface. 
""" if kwargs: - warnings.warn("Passing **kwargs to .storage() is deprecated; override the storage factory with a subclass " - "of invenio_files_rest.storage.StorageFactory and implement get_storage_backend_kwargs() " - "instead.", DeprecationWarning) + warnings.warn( + "Passing **kwargs to .storage() is deprecated; override the " + "storage factory with a subclass of " + "invenio_files_rest.storage.StorageFactory and implement " + "get_storage_backend_kwargs() instead.", + DeprecationWarning + ) return current_files_rest.storage_factory(fileinstance=self, **kwargs) @ensure_readable() @@ -825,17 +829,22 @@ def initialize(self, preferred_location: Location, size=0, **kwargs): ) else: # Old behaviour, with an old-style storage factory - storage = self.storage(default_location=preferred_location.uri, **kwargs) + storage = self.storage( + default_location=preferred_location.uri, **kwargs + ) result = storage.initialize(size=size) self.update_file_metadata( result, readable=False, writable=True, - storage_backend=storage.get_backend_name() if isinstance(storage, StorageBackend) else None, + storage_backend=( + storage.get_backend_name() + if isinstance(storage, StorageBackend) else None + ), ) @ensure_writable() - def init_contents(self, size=0, default_location: str=None, **kwargs): + def init_contents(self, size=0, default_location: str = None, **kwargs): """Initialize storage for this FileInstance.""" preferred_location: typing.Optional[Location] if default_location: @@ -873,11 +882,20 @@ def set_contents(self, stream, chunk_size=None, size=None, size_limit=None, :param stream: File-like stream. """ storage = self.storage(**kwargs) - self.update_file_metadata(storage.save(stream, chunk_size=chunk_size, size=size, - size_limit=size_limit, progress_callback=progress_callback)) - - self.storage_backend = storage.get_backend_name() if isinstance(storage, StorageBackend) else None + self.update_file_metadata( + storage.save( + stream, + chunk_size=chunk_size, + size=size, + size_limit=size_limit, + progress_callback=progress_callback, + ) + ) + self.storage_backend = ( + storage.get_backend_name() + if isinstance(storage, StorageBackend) else None + ) @ensure_writable() def copy_contents(self, fileinstance, progress_callback=None, @@ -889,7 +907,12 @@ def copy_contents(self, fileinstance, progress_callback=None, raise ValueError('File instance has data.') with fileinstance.storage(**kwargs).open() as f: - self.set_contents(f, progress_callback=progress_callback, chunk_size=chunk_size, **kwargs) + self.set_contents( + f, + progress_callback=progress_callback, + chunk_size=chunk_size, + **kwargs + ) @ensure_readable() def send_file(self, filename, restricted=True, mimetype=None, @@ -920,10 +943,14 @@ def set_uri(self, uri, size, checksum, readable=True, writable=False, storage_class return self - _FILE_METADATA_FIELDS = {'uri', 'size', 'checksum', 'writable', 'readable', 'storage_class'} + _FILE_METADATA_FIELDS = { + 'uri', 'size', 'checksum', 'writable', 'readable', 'storage_class' + } - def update_file_metadata(self, file_metadata: Union[Tuple,Dict] = None, **kwargs): - """Update the file metadata, usually as a result of a storage backend operation.""" + def update_file_metadata( + self, file_metadata: Union[Tuple, Dict] = None, **kwargs + ): + """Update the file metadata as a result of a storage operation.""" if file_metadata is None: file_metadata = {} @@ -944,8 +971,6 @@ def update_file_metadata(self, file_metadata: Union[Tuple,Dict] = None, **kwargs setattr(self, key, file_metadata[key]) - - 
class ObjectVersion(db.Model, Timestamp): """Model for storing versions of objects. @@ -1086,7 +1111,9 @@ def set_contents(self, stream, chunk_size=None, size=None, size_limit=None, @ensure_no_file() @update_bucket_size - def set_location(self, uri, size, checksum, storage_class=None, storage_backend=None): + def set_location( + self, uri, size, checksum, storage_class=None, storage_backend=None + ): """Set only URI location of for object. Useful to link files on externally controlled storage. If a file diff --git a/invenio_files_rest/storage/base.py b/invenio_files_rest/storage/base.py index f8596459..2cd7ca53 100644 --- a/invenio_files_rest/storage/base.py +++ b/invenio_files_rest/storage/base.py @@ -25,7 +25,6 @@ from ..utils import check_size, check_sizelimit - __all__ = ('FileStorage', 'StorageBackend') @@ -34,7 +33,11 @@ class StorageBackend: checksum_hash_name = 'md5' - def __init__(self, uri: str = None, size: int = None, modified: datetime = None): + def __init__( + self, uri: str = None, + size: int = None, + modified: datetime = None + ): """Initialize storage object.""" self.uri = uri self._size = size @@ -44,17 +47,22 @@ def __init__(self, uri: str = None, size: int = None, modified: datetime = None) def get_backend_name(cls): """Return the backend name for this StorageBackend. - This performs a reverse-lookup in FILES_REST_STORAGE_BACKENDS and then caches the result. + This performs a reverse-lookup in FILES_REST_STORAGE_BACKENDS and then + caches the result. """ try: return cls._backend_name except AttributeError: - for name, backend_cls in current_app.config['FILES_REST_STORAGE_BACKENDS'].items(): + backends = current_app.config['FILES_REST_STORAGE_BACKENDS'] + for name, backend_cls in backends.items(): if cls is backend_cls: cls._backend_name = name break else: - raise RuntimeError("{} isn't listed in FILES_REST_STORAGE_BACKENDS config".format(cls)) + raise RuntimeError( + "{} isn't listed in FILES_REST_STORAGE_BACKENDS " + "config".format(cls) + ) return cls._backend_name def open(self): @@ -82,9 +90,14 @@ def _initialize(self, size=0) -> Dict[Any, str]: """Override this to perform file storage initialization.""" raise NotImplementedError - def save(self, incoming_stream, size_limit=None, size=None, - chunk_size=None, progress_callback: Callable[[int, int], None] = None - ): + def save( + self, + incoming_stream, + size_limit=None, + size=None, + chunk_size=None, + progress_callback: Callable[[int, int], None] = None + ): """Save incoming stream to file storage.""" with self.get_save_stream() as output_stream: result = self._write_stream( @@ -107,14 +120,15 @@ def save(self, incoming_stream, size_limit=None, size=None, } def get_save_stream(self) -> typing.ContextManager: - """Return a context manager for a file-like object for writing to the file. + """Return a context manager for a file-like object for writing. - The return value should be a context manager that provides a file-like object when entered, and performs any necessary - clean-up when exited (e.g. closing the file). + The return value should be a context manager that provides a file-like + object when entered, and performs any necessary clean-up when exited + (e.g. closing the file). 
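For a concrete feel, a bare-bones hypothetical backend that writes straight to a local path held in ``self.uri`` could provide these context managers as follows, leaving ``open()``, ``delete()`` and ``_initialize()`` aside for brevity:

.. code-block:: python

    import contextlib

    from invenio_files_rest.storage import StorageBackend


    class LocalPathStorageBackend(StorageBackend):
        """Illustrative backend writing directly to the path in ``self.uri``."""

        @contextlib.contextmanager
        def get_save_stream(self):
            fp = open(self.uri, 'wb')
            try:
                yield fp
            finally:
                fp.close()

        @contextlib.contextmanager
        def get_update_stream(self, seek):
            fp = open(self.uri, 'r+b')
            fp.seek(seek)
            try:
                yield fp
            finally:
                fp.close()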
""" raise NotImplementedError - def update(self,incoming_stream, seek=0, size=None, chunk_size=None, + def update(self, incoming_stream, seek=0, size=None, chunk_size=None, progress_callback=None) -> Tuple[int, str]: """Update part of file with incoming stream.""" with self.get_update_stream(seek) as output_stream: @@ -129,15 +143,29 @@ def update(self,incoming_stream, seek=0, size=None, chunk_size=None, return result['size'], result['checksum'] def get_update_stream(self, seek) -> typing.ContextManager: - """Return a context manager for a file-like object for updating the file. + """Return a context manager for a file-like object for updating. - The return value should be a context manager that provides a file-like object when entered, and performs any necessary - clean-up when exited (e.g. closing the file). + The return value should be a context manager that provides a file-like + object when entered, and performs any necessary clean-up when exited + (e.g. closing the file). """ raise NotImplementedError - def _write_stream(self, incoming_stream, output_stream, *, size_limit=None, size=None, chunk_size=None, progress_callback=None): - """Copy from one stream to another while honoring size limits and requested progress callbacks.""" + def _write_stream( + self, + incoming_stream, + output_stream, + *, + size_limit=None, + size=None, + chunk_size=None, + progress_callback=None, + ): + """Copy from one stream to another. + + This honors size limits and performs requested progress callbacks once + data has been written to the output stream. + """ chunk_size = chunk_size_or_default(chunk_size) algo, checksum = self._init_hash() @@ -168,16 +196,26 @@ def _write_stream(self, incoming_stream, output_stream, *, size_limit=None, size check_size(bytes_written, size) return { - 'checksum': f'{self.checksum_hash_name}:{checksum.hexdigest()}' if checksum else None, + 'checksum': ( + f'{self.checksum_hash_name}:{checksum.hexdigest()}' + if checksum else None + ), 'size': bytes_written, } # # Default implementation # - def send_file(self, filename, mimetype=None, restricted=True, - checksum=None, trusted=False, chunk_size=None, - as_attachment=False): + def send_file( + self, + filename, + mimetype=None, + restricted=True, + checksum=None, + trusted=False, + chunk_size=None, + as_attachment=False + ): """Send the file to the client.""" try: fp = self.open(mode='rb') @@ -231,13 +269,14 @@ def copy(self, src, chunk_size=None, progress_callback=None): :param src: Source stream. :param chunk_size: Chunk size to read from source stream. """ - warnings.warn("Call save with the other already-open FileStorage passed in instead.", DeprecationWarning) - fp = src.open(mode='rb') - try: + warnings.warn( + "Call save() with the other already-open FileStorage passed in " + "instead.", + DeprecationWarning + ) + with src.open() as fp: return self.save( fp, chunk_size=chunk_size, progress_callback=progress_callback) - finally: - fp.close() # # Helpers diff --git a/invenio_files_rest/storage/factory.py b/invenio_files_rest/storage/factory.py index d03cdcee..8ff9dc00 100644 --- a/invenio_files_rest/storage/factory.py +++ b/invenio_files_rest/storage/factory.py @@ -12,8 +12,9 @@ class StorageFactory: """A base storage factory, with sensible default behaviour. - You may subclass this factory to implement custom behaviour. If you do this, remember to set - FILES_REST_STORAGE_FACTORY to the right import path for your subclass. + You may subclass this factory to implement custom behaviour. 
If you do + this, remember to set FILES_REST_STORAGE_FACTORY to the right import path + for your subclass. """ def __init__(self, app): @@ -24,16 +25,21 @@ def __call__( self, fileinstance: FileInstance, ) -> Optional[StorageBackend]: - """Return a FileStorage instance for a file, for manipulating file contents. + """Return a FileStorage instance for a file, for manipulating contents. - This requires that the fileinstance already has an associated storage backend. If not, - you should call initialize() instead to initialize storage for the file instance. + This requires that the fileinstance already has an associated storage + backend. If not, you should call initialize() instead to initialize + storage for the file instance. """ if not fileinstance.storage_backend: return None - storage_backend_cls = self.resolve_storage_backend(fileinstance.storage_backend) - storage_backend_kwargs = self.get_storage_backend_kwargs(fileinstance, storage_backend_cls) + storage_backend_cls = self.resolve_storage_backend( + fileinstance.storage_backend + ) + storage_backend_kwargs = self.get_storage_backend_kwargs( + fileinstance, storage_backend_cls + ) return storage_backend_cls( uri=fileinstance.uri, @@ -49,8 +55,9 @@ def initialize( ) -> StorageBackend: """Initialize storage for a new file. - If provided, `preferred_location` will inform where the file will be stored, but may be ignored - if this factory is subclassed and `get_location()` is overridden. + If provided, `preferred_location` will inform where the file will be + stored, but may be ignored if this factory is subclassed and + `get_location()` is overridden. """ if fileinstance.storage_backend: return self(fileinstance) # type: ignore @@ -59,8 +66,12 @@ def initialize( fileinstance.storage_backend = location.storage_backend - storage_backend_cls = self.resolve_storage_backend(fileinstance.storage_backend) - storage_backend_kwargs = self.get_storage_backend_kwargs(fileinstance, storage_backend_cls) + storage_backend_cls = self.resolve_storage_backend( + fileinstance.storage_backend + ) + storage_backend_kwargs = self.get_storage_backend_kwargs( + fileinstance, storage_backend_cls + ) uri = self.get_suggested_uri( fileinstance=fileinstance, @@ -75,19 +86,26 @@ def initialize( size=size, ) - def get_location(self, fileinstance: FileInstance, preferred_location: Location = None) -> Location: + def get_location( + self, + fileinstance: FileInstance, + preferred_location: Location = None + ) -> Location: """Return a Location for storing a new file. - This base implementation returns the preferred_location if it's provided, or the default location - recorded in the database. This method can be overridden if some other methodology is required. + This base implementation returns the preferred_location if it's + provided, or the default location recorded in the database. This method + can be overridden if some other methodology is required. """ return preferred_location or Location.get_default() - def resolve_storage_backend(self, backend_name: str) -> Type[StorageBackend]: - """Resolve a storage backend name to the associated storage backend class. + def resolve_storage_backend( + self, backend_name: str + ) -> Type[StorageBackend]: + """Resolve a storage backend name to the associated backend class. - This base implementation resolves backends from the FILES_REST_STORAGE_BACKENDS app config - setting. + This base implementation resolves backends from the + FILES_REST_STORAGE_BACKENDS app config setting. 
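In application configuration that mapping is simply backend names to classes, for example (the ``s3`` entry is hypothetical):

.. code-block:: python

    from invenio_files_rest.storage import PyFSStorageBackend

    from mysite.storage import S3StorageBackend  # hypothetical custom backend

    FILES_REST_STORAGE_BACKENDS = {
        'pyfs': PyFSStorageBackend,
        's3': S3StorageBackend,
    }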
""" return self.app.config['FILES_REST_STORAGE_BACKENDS'][backend_name] @@ -98,8 +116,8 @@ def get_storage_backend_kwargs( ) -> Dict[str, Any]: """Retrieve any instantiation kwargs for the storage backend. - This returns an empty dict by defaut, but can be overridden to provide backend-specific - instantiation parameters if necessary. + This returns an empty dict by defaut, but can be overridden to provide + backend-specific instantiation parameters if necessary. """ return {} @@ -111,9 +129,9 @@ def get_suggested_uri( ): """Generate a suggested URI for new files. - This can be overridden if your implementation requires some other file layout for storage. - Note that individual storage backends may choose to ignore the suggested URI if they - organise files by some other scheme. + This can be overridden if your implementation requires some other file + layout for storage. Note that individual storage backends may choose to + ignore the suggested URI if they organise files by some other scheme. """ return make_path( location, diff --git a/invenio_files_rest/storage/legacy.py b/invenio_files_rest/storage/legacy.py index d8da54a5..db17ee39 100644 --- a/invenio_files_rest/storage/legacy.py +++ b/invenio_files_rest/storage/legacy.py @@ -8,8 +8,9 @@ """The previous invenio_files_rest.storage implementation. -These classes and factory are included so that we can test that there are no regressions against them when -implementing the new storage interface, and so ensure backwards compatibility. +These classes and factory are included so that we can test that there are no +regressions against them when implementing the new storage interface, and so +ensure backwards compatibility. """ from __future__ import absolute_import, print_function @@ -21,8 +22,10 @@ from flask import current_app from invenio_files_rest.helpers import make_path -from invenio_files_rest.errors import FileSizeError, StorageError, UnexpectedFileSizeError -from invenio_files_rest.helpers import chunk_size_or_default, compute_checksum, send_stream +from invenio_files_rest.errors import StorageError +from invenio_files_rest.helpers import ( + chunk_size_or_default, compute_checksum, send_stream +) from invenio_files_rest.utils import check_size, check_sizelimit from fs.opener import opener diff --git a/invenio_files_rest/storage/pyfs.py b/invenio_files_rest/storage/pyfs.py index 188df412..992ed779 100644 --- a/invenio_files_rest/storage/pyfs.py +++ b/invenio_files_rest/storage/pyfs.py @@ -103,7 +103,11 @@ def _initialize(self, size=0): @contextlib.contextmanager def get_save_stream(self): - """Open the underlying file for writing, and delete it if there was a problem.""" + """Return the underlying file for writing. + + This implementation deletes the file if an exception is thrown by code + executing in this context. + """ fp = self.open(mode='wb') try: yield fp diff --git a/invenio_files_rest/utils.py b/invenio_files_rest/utils.py index f64cb4ea..13b20474 100644 --- a/invenio_files_rest/utils.py +++ b/invenio_files_rest/utils.py @@ -60,7 +60,6 @@ def guess_mimetype(filename): return m or 'application/octet-stream' - def check_sizelimit(size_limit, bytes_written, total_size): """Check if size limit was exceeded. 
diff --git a/tests/test_legacy_storage.py b/tests/test_legacy_storage.py index a13b4056..82ad5f39 100644 --- a/tests/test_legacy_storage.py +++ b/tests/test_legacy_storage.py @@ -25,6 +25,7 @@ from invenio_files_rest.limiters import FileSizeLimit from invenio_files_rest.storage import FileStorage, PyFSFileStorage + def test_storage_interface(): """Test storage interface.""" s = FileStorage('some-path') @@ -241,7 +242,7 @@ def test_pyfs_send_file(app, legacy_pyfs): # Test for absence of Content-Disposition header to make sure that # it's not present when as_attachment=False res = legacy_pyfs.send_file('myfilename.txt', mimetype='text/plain', - checksum=checksum, as_attachment=False) + checksum=checksum, as_attachment=False) assert res.status_code == 200 assert 'attachment' not in res.headers['Content-Disposition'] @@ -255,7 +256,7 @@ def test_pyfs_send_file_for_download(app, legacy_pyfs): # Test for presence of Content-Disposition header to make sure that # it's present when as_attachment=True res = legacy_pyfs.send_file('myfilename.txt', mimetype='text/plain', - checksum=checksum, as_attachment=True) + checksum=checksum, as_attachment=True) assert res.status_code == 200 assert (res.headers['Content-Disposition'] == 'attachment; filename=myfilename.txt') From 6b33fd8327685548053fd5211dca30f21201223c Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Wed, 14 Oct 2020 11:20:21 +0100 Subject: [PATCH 32/41] More PEP-8 fixes --- invenio_files_rest/models.py | 8 +++----- invenio_files_rest/storage/base.py | 12 +++++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/invenio_files_rest/models.py b/invenio_files_rest/models.py index 4e236bc2..050091a6 100644 --- a/invenio_files_rest/models.py +++ b/invenio_files_rest/models.py @@ -846,11 +846,9 @@ def initialize(self, preferred_location: Location, size=0, **kwargs): @ensure_writable() def init_contents(self, size=0, default_location: str = None, **kwargs): """Initialize storage for this FileInstance.""" - preferred_location: typing.Optional[Location] - if default_location: - preferred_location = Location(uri=default_location) - else: - preferred_location = None + preferred_location = ( # type: typing.Optional[Location] + Location(uri=default_location) if default_location else None + ) return self.initialize( preferred_location=preferred_location, size=size, diff --git a/invenio_files_rest/storage/base.py b/invenio_files_rest/storage/base.py index 2cd7ca53..5fa5c4db 100644 --- a/invenio_files_rest/storage/base.py +++ b/invenio_files_rest/storage/base.py @@ -77,7 +77,7 @@ def delete(self): raise NotImplementedError def initialize(self, size=0): - """Initialize the file on the storage and truncate to the given size.""" + """Initialize the file on the storage and truncate to given size.""" return { 'readable': False, 'writable': True, @@ -163,9 +163,9 @@ def _write_stream( ): """Copy from one stream to another. - This honors size limits and performs requested progress callbacks once - data has been written to the output stream. - """ + This honors size limits and performs requested progress callbacks once + data has been written to the output stream. + """ chunk_size = chunk_size_or_default(chunk_size) algo, checksum = self._init_hash() @@ -288,6 +288,8 @@ def _init_hash(self): algorithm for your storage backend. 
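With the new base class, the simplest way to change algorithm is usually to override ``checksum_hash_name`` rather than ``_init_hash()`` itself, since ``_init_hash()`` already feeds that name to ``hashlib``. A sketch, assuming the algorithm is available in ``hashlib``:

.. code-block:: python

    from invenio_files_rest.storage import PyFSStorageBackend


    class Sha256PyFSStorageBackend(PyFSStorageBackend):
        """Illustrative PyFS backend reporting sha256 instead of md5 checksums."""

        checksum_hash_name = 'sha256'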
""" if self.checksum_hash_name: - return self.checksum_hash_name, hashlib.new(self.checksum_hash_name) + return ( + self.checksum_hash_name, hashlib.new(self.checksum_hash_name) + ) else: return None, None From da397f8f1d48f8fe54944536d5c0705bf051a5c2 Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Wed, 14 Oct 2020 11:40:39 +0100 Subject: [PATCH 33/41] The last PEP-8 warnings --- invenio_files_rest/models.py | 4 +++- invenio_files_rest/storage/base.py | 2 +- invenio_files_rest/storage/pyfs.py | 5 ----- tests/test_legacy_storage.py | 8 ++++---- tests/test_storage.py | 10 ++++++++-- 5 files changed, 16 insertions(+), 13 deletions(-) diff --git a/invenio_files_rest/models.py b/invenio_files_rest/models.py index 050091a6..cd3986ff 100644 --- a/invenio_files_rest/models.py +++ b/invenio_files_rest/models.py @@ -946,7 +946,9 @@ def set_uri(self, uri, size, checksum, readable=True, writable=False, } def update_file_metadata( - self, file_metadata: Union[Tuple, Dict] = None, **kwargs + self, + file_metadata=None, # type: Union[Tuple, Dict] + **kwargs ): """Update the file metadata as a result of a storage operation.""" if file_metadata is None: diff --git a/invenio_files_rest/storage/base.py b/invenio_files_rest/storage/base.py index 5fa5c4db..7cf4ff77 100644 --- a/invenio_files_rest/storage/base.py +++ b/invenio_files_rest/storage/base.py @@ -96,7 +96,7 @@ def save( size_limit=None, size=None, chunk_size=None, - progress_callback: Callable[[int, int], None] = None + progress_callback=None, # type: Callable[[int, int], None] ): """Save incoming stream to file storage.""" with self.get_save_stream() as output_stream: diff --git a/invenio_files_rest/storage/pyfs.py b/invenio_files_rest/storage/pyfs.py index 992ed779..41f0f7b2 100644 --- a/invenio_files_rest/storage/pyfs.py +++ b/invenio_files_rest/storage/pyfs.py @@ -11,20 +11,15 @@ from __future__ import absolute_import, print_function import contextlib -import shutil -from flask import current_app from fs.opener import opener from fs.path import basename, dirname -from ..helpers import chunk_size_or_default, make_path from .base import StorageBackend from .legacy import PyFSFileStorage, pyfs_storage_factory __all__ = ('PyFSFileStorage', 'pyfs_storage_factory', 'PyFSStorageBackend') -from ..utils import check_size, check_sizelimit - class PyFSStorageBackend(StorageBackend): """File system storage using PyFilesystem for access the file. 
diff --git a/tests/test_legacy_storage.py b/tests/test_legacy_storage.py index 82ad5f39..29006e4c 100644 --- a/tests/test_legacy_storage.py +++ b/tests/test_legacy_storage.py @@ -124,10 +124,10 @@ def test_pyfs_save_limits(legacy_pyfs): assert size == len(data) # Size doesn't match - pytest.raises( - UnexpectedFileSizeError, legacy_pyfs.save, BytesIO(data), size=len(data) - 1) - pytest.raises( - UnexpectedFileSizeError, legacy_pyfs.save, BytesIO(data), size=len(data) + 1) + with pytest.raises(UnexpectedFileSizeError): + legacy_pyfs.save(BytesIO(data), size=len(data) - 1) + with pytest.raises(UnexpectedFileSizeError): + legacy_pyfs.save(BytesIO(data), size=len(data) + 1) # Exceeds size limits pytest.raises( diff --git a/tests/test_storage.py b/tests/test_storage.py index 5c3fcc27..ed9cadee 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -220,7 +220,10 @@ def test_pyfs_send_file(app, pyfs): with app.test_request_context(): res = pyfs.send_file( - 'myfilename.txt', mimetype='text/plain', checksum=result['checksum']) + 'myfilename.txt', + mimetype='text/plain', + checksum=result['checksum'] + ) assert res.status_code == 200 h = res.headers assert h['Content-Type'] == 'text/plain; charset=utf-8' @@ -270,7 +273,10 @@ def test_pyfs_send_file_xss_prevention(app, pyfs): with app.test_request_context(): res = pyfs.send_file( - 'myfilename.html', mimetype='text/html', checksum=result['checksum']) + 'myfilename.html', + mimetype='text/html', + checksum=result['checksum'] + ) assert res.status_code == 200 h = res.headers assert h['Content-Type'] == 'text/plain; charset=utf-8' From f56a7c390809018bb065d55bdfccc45b48551d68 Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Wed, 14 Oct 2020 12:28:02 +0100 Subject: [PATCH 34/41] Run isort, for CI happiness --- invenio_files_rest/config.py | 3 +-- invenio_files_rest/ext.py | 1 - invenio_files_rest/models.py | 8 ++------ invenio_files_rest/storage/__init__.py | 2 +- invenio_files_rest/storage/base.py | 8 +++----- invenio_files_rest/storage/factory.py | 6 +++--- invenio_files_rest/storage/legacy.py | 14 +++++--------- invenio_files_rest/storage/pyfs.py | 1 - 8 files changed, 15 insertions(+), 28 deletions(-) diff --git a/invenio_files_rest/config.py b/invenio_files_rest/config.py index 94b72701..1424dcab 100644 --- a/invenio_files_rest/config.py +++ b/invenio_files_rest/config.py @@ -8,9 +8,8 @@ """Invenio Files Rest module configuration file.""" -from datetime import timedelta - import pkg_resources +from datetime import timedelta from invenio_files_rest.helpers import create_file_streaming_redirect_response diff --git a/invenio_files_rest/ext.py b/invenio_files_rest/ext.py index bba5822b..6eb956d1 100644 --- a/invenio_files_rest/ext.py +++ b/invenio_files_rest/ext.py @@ -11,7 +11,6 @@ from __future__ import absolute_import, print_function import warnings - from flask import abort from werkzeug.exceptions import UnprocessableEntity from werkzeug.utils import cached_property diff --git a/invenio_files_rest/models.py b/invenio_files_rest/models.py index cd3986ff..108fb098 100644 --- a/invenio_files_rest/models.py +++ b/invenio_files_rest/models.py @@ -40,15 +40,10 @@ import re import six import sys +import typing import uuid import warnings from datetime import datetime -from functools import wraps -from os.path import basename -from typing import Dict, TYPE_CHECKING, Tuple, Union - -import six -import typing from flask import current_app from functools import wraps from invenio_db import db @@ -58,6 +53,7 @@ from sqlalchemy.orm import 
validates from sqlalchemy.orm.exc import MultipleResultsFound from sqlalchemy_utils.types import UUIDType +from typing import TYPE_CHECKING, Dict, Tuple, Union from .errors import BucketLockedError, FileInstanceAlreadySetError, \ FileInstanceUnreadableError, FileSizeError, InvalidKeyError, \ diff --git a/invenio_files_rest/storage/__init__.py b/invenio_files_rest/storage/__init__.py index 6bc498b8..1db026f7 100644 --- a/invenio_files_rest/storage/__init__.py +++ b/invenio_files_rest/storage/__init__.py @@ -11,8 +11,8 @@ from __future__ import absolute_import, print_function from .base import FileStorage, StorageBackend -from .pyfs import PyFSFileStorage, pyfs_storage_factory, PyFSStorageBackend from .factory import StorageFactory +from .pyfs import PyFSFileStorage, PyFSStorageBackend, pyfs_storage_factory __all__ = ( 'FileStorage', diff --git a/invenio_files_rest/storage/base.py b/invenio_files_rest/storage/base.py index 7cf4ff77..0720caf0 100644 --- a/invenio_files_rest/storage/base.py +++ b/invenio_files_rest/storage/base.py @@ -11,19 +11,17 @@ from __future__ import absolute_import, print_function import hashlib +import typing import warnings from calendar import timegm from datetime import datetime -from typing import Any, Callable, Dict, Tuple - -import typing from flask import current_app +from typing import Any, Callable, Dict, Tuple -from .legacy import FileStorage from ..errors import StorageError from ..helpers import chunk_size_or_default, compute_checksum, send_stream from ..utils import check_size, check_sizelimit - +from .legacy import FileStorage __all__ = ('FileStorage', 'StorageBackend') diff --git a/invenio_files_rest/storage/factory.py b/invenio_files_rest/storage/factory.py index 8ff9dc00..69704e01 100644 --- a/invenio_files_rest/storage/factory.py +++ b/invenio_files_rest/storage/factory.py @@ -1,12 +1,12 @@ """Contains the base storage factory implementation.""" -from typing import Any, Dict, Optional, Type - from flask import current_app +from typing import Any, Dict, Optional, Type from invenio_files_rest.models import FileInstance, Location -from .base import StorageBackend + from ..helpers import make_path +from .base import StorageBackend class StorageFactory: diff --git a/invenio_files_rest/storage/legacy.py b/invenio_files_rest/storage/legacy.py index db17ee39..ed577183 100644 --- a/invenio_files_rest/storage/legacy.py +++ b/invenio_files_rest/storage/legacy.py @@ -17,20 +17,16 @@ import hashlib from calendar import timegm -from functools import partial - from flask import current_app +from fs.opener import opener +from fs.path import basename, dirname +from functools import partial -from invenio_files_rest.helpers import make_path from invenio_files_rest.errors import StorageError -from invenio_files_rest.helpers import ( - chunk_size_or_default, compute_checksum, send_stream -) +from invenio_files_rest.helpers import chunk_size_or_default, \ + compute_checksum, make_path, send_stream from invenio_files_rest.utils import check_size, check_sizelimit -from fs.opener import opener -from fs.path import basename, dirname - class FileStorage(object): """Base class for storage interface to a single file.""" diff --git a/invenio_files_rest/storage/pyfs.py b/invenio_files_rest/storage/pyfs.py index 41f0f7b2..7344be8a 100644 --- a/invenio_files_rest/storage/pyfs.py +++ b/invenio_files_rest/storage/pyfs.py @@ -11,7 +11,6 @@ from __future__ import absolute_import, print_function import contextlib - from fs.opener import opener from fs.path import basename, dirname From 
3b0868f6ab873897beed13a78ae9cb0dcb8b9866 Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Wed, 14 Oct 2020 14:40:03 +0100 Subject: [PATCH 35/41] Remove more extraneous code --- invenio_files_rest/models.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/invenio_files_rest/models.py b/invenio_files_rest/models.py index 108fb098..d53009c4 100644 --- a/invenio_files_rest/models.py +++ b/invenio_files_rest/models.py @@ -35,8 +35,6 @@ from __future__ import absolute_import, print_function -import mimetypes - import re import six import sys @@ -53,7 +51,7 @@ from sqlalchemy.orm import validates from sqlalchemy.orm.exc import MultipleResultsFound from sqlalchemy_utils.types import UUIDType -from typing import TYPE_CHECKING, Dict, Tuple, Union +from typing import Dict, Tuple, Union from .errors import BucketLockedError, FileInstanceAlreadySetError, \ FileInstanceUnreadableError, FileSizeError, InvalidKeyError, \ @@ -61,11 +59,9 @@ MultipartInvalidChunkSize, MultipartInvalidPartNumber, \ MultipartInvalidSize, MultipartMissingParts, MultipartNotCompleted from .proxies import current_files_rest -from .storage.base import StorageBackend -from .utils import ENCODING_MIMETYPES, guess_mimetype +from .storage import StorageBackend +from .utils import guess_mimetype -if TYPE_CHECKING: - from .storage import StorageBackend slug_pattern = re.compile('^[a-z][a-z0-9-]+$') From 985d64d52582c0718f1cd8cf5716eda4dcd331c6 Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Wed, 14 Oct 2020 14:43:02 +0100 Subject: [PATCH 36/41] Add migration for backend_name --- .../alembic/0999e27defd5_add_backend_name.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 invenio_files_rest/alembic/0999e27defd5_add_backend_name.py diff --git a/invenio_files_rest/alembic/0999e27defd5_add_backend_name.py b/invenio_files_rest/alembic/0999e27defd5_add_backend_name.py new file mode 100644 index 00000000..45764d1b --- /dev/null +++ b/invenio_files_rest/alembic/0999e27defd5_add_backend_name.py @@ -0,0 +1,34 @@ +# +# This file is part of Invenio. +# Copyright (C) 2016-2018 CERN. +# +# Invenio is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. + +"""Add backend_name""" + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = '0999e27defd5' +down_revision = '8ae99b034410' +branch_labels = () +depends_on = None + + +def upgrade(): + """Upgrade database.""" + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('files_files', sa.Column('backend_name', sa.String(length=32), nullable=True)) + op.add_column('files_location', sa.Column('storage_backend', sa.String(length=32), nullable=True)) + # ### end Alembic commands ### + + +def downgrade(): + """Downgrade database.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_column('files_location', 'storage_backend') + op.drop_column('files_files', 'backend_name') + # ### end Alembic commands ### From 55d059f45b169d44d37e369ab0ad0e1d8ea58bcc Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Wed, 14 Oct 2020 14:47:17 +0100 Subject: [PATCH 37/41] Coalesce on `storage_backend`, not `backend_name` for model attr --- invenio_files_rest/alembic/0999e27defd5_add_backend_name.py | 4 ++-- invenio_files_rest/models.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/invenio_files_rest/alembic/0999e27defd5_add_backend_name.py b/invenio_files_rest/alembic/0999e27defd5_add_backend_name.py index 45764d1b..26f9e428 100644 --- a/invenio_files_rest/alembic/0999e27defd5_add_backend_name.py +++ b/invenio_files_rest/alembic/0999e27defd5_add_backend_name.py @@ -21,7 +21,7 @@ def upgrade(): """Upgrade database.""" # ### commands auto generated by Alembic - please adjust! ### - op.add_column('files_files', sa.Column('backend_name', sa.String(length=32), nullable=True)) + op.add_column('files_files', sa.Column('storage_backend', sa.String(length=32), nullable=True)) op.add_column('files_location', sa.Column('storage_backend', sa.String(length=32), nullable=True)) # ### end Alembic commands ### @@ -30,5 +30,5 @@ def downgrade(): """Downgrade database.""" # ### commands auto generated by Alembic - please adjust! ### op.drop_column('files_location', 'storage_backend') - op.drop_column('files_files', 'backend_name') + op.drop_column('files_files', 'storage_backend') # ### end Alembic commands ### diff --git a/invenio_files_rest/models.py b/invenio_files_rest/models.py index d53009c4..9a0a17e1 100644 --- a/invenio_files_rest/models.py +++ b/invenio_files_rest/models.py @@ -671,7 +671,7 @@ class FileInstance(db.Model, Timestamp): storage_class = db.Column(db.String(1), nullable=True) """Storage class of file.""" - backend_name = db.Column(db.String(32), nullable=True) + storage_backend = db.Column(db.String(32), nullable=True) size = db.Column(db.BigInteger, default=0, nullable=True) """Size of file.""" From 0fede69b4ffb863b69782b9cfd6aefe15102bb27 Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Wed, 14 Oct 2020 15:54:54 +0100 Subject: [PATCH 38/41] fixup! Add migration for backend_name --- invenio_files_rest/alembic/0999e27defd5_add_backend_name.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/invenio_files_rest/alembic/0999e27defd5_add_backend_name.py b/invenio_files_rest/alembic/0999e27defd5_add_backend_name.py index 26f9e428..830111d7 100644 --- a/invenio_files_rest/alembic/0999e27defd5_add_backend_name.py +++ b/invenio_files_rest/alembic/0999e27defd5_add_backend_name.py @@ -5,7 +5,7 @@ # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. 
-"""Add backend_name""" +"""Add backend_name.""" from alembic import op import sqlalchemy as sa From dd1a55733a58f2bcbcf3d215cba10df7e298cf76 Mon Sep 17 00:00:00 2001 From: Alex Dutton Date: Thu, 15 Oct 2020 12:08:23 +0100 Subject: [PATCH 39/41] More style fixes in the alembic migration --- .../alembic/0999e27defd5_add_backend_name.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/invenio_files_rest/alembic/0999e27defd5_add_backend_name.py b/invenio_files_rest/alembic/0999e27defd5_add_backend_name.py index 830111d7..6224df0b 100644 --- a/invenio_files_rest/alembic/0999e27defd5_add_backend_name.py +++ b/invenio_files_rest/alembic/0999e27defd5_add_backend_name.py @@ -21,8 +21,16 @@ def upgrade(): """Upgrade database.""" # ### commands auto generated by Alembic - please adjust! ### - op.add_column('files_files', sa.Column('storage_backend', sa.String(length=32), nullable=True)) - op.add_column('files_location', sa.Column('storage_backend', sa.String(length=32), nullable=True)) + op.add_column( + 'files_files', + sa.Column('storage_backend', + sa.String(length=32), + nullable=True)) + op.add_column( + 'files_location', + sa.Column('storage_backend', + sa.String(length=32), + nullable=True)) # ### end Alembic commands ### From 142660d435997f182f17a447e39a049881db9d6a Mon Sep 17 00:00:00 2001 From: Lars Holm Nielsen Date: Fri, 20 Nov 2020 06:59:44 +0100 Subject: [PATCH 40/41] global: copyright headers update --- LICENSE | 3 ++- docs/new-storage-backend.rst | 8 ++++++++ .../alembic/0999e27defd5_add_backend_name.py | 5 ++--- invenio_files_rest/config.py | 3 ++- invenio_files_rest/ext.py | 3 ++- invenio_files_rest/models.py | 4 ++-- invenio_files_rest/storage/__init__.py | 1 + invenio_files_rest/storage/base.py | 3 ++- invenio_files_rest/storage/factory.py | 8 ++++++++ invenio_files_rest/storage/legacy.py | 3 ++- invenio_files_rest/storage/pyfs.py | 3 ++- invenio_files_rest/utils.py | 1 + setup.py | 1 + tests/conftest.py | 1 + tests/test_legacy_storage.py | 4 ++-- tests/test_storage.py | 4 ++-- 16 files changed, 40 insertions(+), 15 deletions(-) diff --git a/LICENSE b/LICENSE index e2986635..e8198804 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,7 @@ MIT License -Copyright (C) 2015-2019 CERN. +Copyright (C) 2015-2020 CERN. +Copyright (C) 2020 Cottage Labs LLP. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in diff --git a/docs/new-storage-backend.rst b/docs/new-storage-backend.rst index f5ee1cc6..03f25d10 100644 --- a/docs/new-storage-backend.rst +++ b/docs/new-storage-backend.rst @@ -1,3 +1,11 @@ +.. + This file is part of Invenio. + Copyright (C) 2020 Cottage Labs LLP + + Invenio is free software; you can redistribute it and/or modify it + under the terms of the MIT License; see LICENSE file for more details. + + Developing a new storage backend ================================ diff --git a/invenio_files_rest/alembic/0999e27defd5_add_backend_name.py b/invenio_files_rest/alembic/0999e27defd5_add_backend_name.py index 6224df0b..5fd0437c 100644 --- a/invenio_files_rest/alembic/0999e27defd5_add_backend_name.py +++ b/invenio_files_rest/alembic/0999e27defd5_add_backend_name.py @@ -1,15 +1,14 @@ # # This file is part of Invenio. -# Copyright (C) 2016-2018 CERN. +# Copyright (C) 2020 Cottage Labs LLP # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. 
"""Add backend_name.""" -from alembic import op import sqlalchemy as sa - +from alembic import op # revision identifiers, used by Alembic. revision = '0999e27defd5' diff --git a/invenio_files_rest/config.py b/invenio_files_rest/config.py index 1424dcab..ea10b946 100644 --- a/invenio_files_rest/config.py +++ b/invenio_files_rest/config.py @@ -1,7 +1,8 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2015-2019 CERN. +# Copyright (C) 2015-2020 CERN. +# Copyright (C) 2020 Cottage Labs LLP # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. diff --git a/invenio_files_rest/ext.py b/invenio_files_rest/ext.py index 6eb956d1..9f0ce08e 100644 --- a/invenio_files_rest/ext.py +++ b/invenio_files_rest/ext.py @@ -1,7 +1,8 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2015-2019 CERN. +# Copyright (C) 2015-2020 CERN. +# Copyright (C) 2020 Cottage Labs LLP. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. diff --git a/invenio_files_rest/models.py b/invenio_files_rest/models.py index 9a0a17e1..9cc1d102 100644 --- a/invenio_files_rest/models.py +++ b/invenio_files_rest/models.py @@ -1,7 +1,8 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2015-2019 CERN. +# Copyright (C) 2015-2020 CERN. +# Copyright (C) 2020 Cottage Labs LLP. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. @@ -62,7 +63,6 @@ from .storage import StorageBackend from .utils import guess_mimetype - slug_pattern = re.compile('^[a-z][a-z0-9-]+$') diff --git a/invenio_files_rest/storage/__init__.py b/invenio_files_rest/storage/__init__.py index 1db026f7..055c66c3 100644 --- a/invenio_files_rest/storage/__init__.py +++ b/invenio_files_rest/storage/__init__.py @@ -2,6 +2,7 @@ # # This file is part of Invenio. # Copyright (C) 2015-2019 CERN. +# Copyright (C) 2020 Cottage Labs LLP. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. diff --git a/invenio_files_rest/storage/base.py b/invenio_files_rest/storage/base.py index 0720caf0..31d2405b 100644 --- a/invenio_files_rest/storage/base.py +++ b/invenio_files_rest/storage/base.py @@ -1,7 +1,8 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2016-2019 CERN. +# Copyright (C) 2016-2020 CERN. +# Copyright (C) 2020 Cottage Labs LLP. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. diff --git a/invenio_files_rest/storage/factory.py b/invenio_files_rest/storage/factory.py index 69704e01..c9db85ef 100644 --- a/invenio_files_rest/storage/factory.py +++ b/invenio_files_rest/storage/factory.py @@ -1,3 +1,11 @@ +# -*- coding: utf-8 -*- +# +# This file is part of Invenio. +# Copyright (C) 2020 Cottage Labs LLP. +# +# Invenio is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. 
+ """Contains the base storage factory implementation.""" from flask import current_app diff --git a/invenio_files_rest/storage/legacy.py b/invenio_files_rest/storage/legacy.py index ed577183..3a34d7b4 100644 --- a/invenio_files_rest/storage/legacy.py +++ b/invenio_files_rest/storage/legacy.py @@ -1,7 +1,8 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2016-2019 CERN. +# Copyright (C) 2016-2020 CERN. +# Copyright (C) 2020 Cottage Labs LLP. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. diff --git a/invenio_files_rest/storage/pyfs.py b/invenio_files_rest/storage/pyfs.py index 7344be8a..1704e82a 100644 --- a/invenio_files_rest/storage/pyfs.py +++ b/invenio_files_rest/storage/pyfs.py @@ -1,7 +1,8 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2016-2019 CERN. +# Copyright (C) 2016-2020 CERN. +# Copyright (C) 2020 Cottage Labs LLP. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. diff --git a/invenio_files_rest/utils.py b/invenio_files_rest/utils.py index 13b20474..8f8bc2b7 100644 --- a/invenio_files_rest/utils.py +++ b/invenio_files_rest/utils.py @@ -2,6 +2,7 @@ # # This file is part of Invenio. # Copyright (C) 2016-2019 CERN. +# Copyright (C) 2020 Cottage Labs LLP. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. diff --git a/setup.py b/setup.py index 42a9e516..2df9e179 100644 --- a/setup.py +++ b/setup.py @@ -2,6 +2,7 @@ # # This file is part of Invenio. # Copyright (C) 2015-2019 CERN. +# Copyright (C) 2020 Cottage Labs LLP. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. diff --git a/tests/conftest.py b/tests/conftest.py index 4ee46344..25ecf384 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,6 +2,7 @@ # # This file is part of Invenio. # Copyright (C) 2015-2019 CERN. +# Copyright (C) 2020 Cottage Labs LLP. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. diff --git a/tests/test_legacy_storage.py b/tests/test_legacy_storage.py index 29006e4c..499cfa95 100644 --- a/tests/test_legacy_storage.py +++ b/tests/test_legacy_storage.py @@ -1,8 +1,8 @@ - # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2016-2019 CERN. +# Copyright (C) 2016-2020 CERN. +# Copyright (C) 2020 Cottage Labs LLP. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. diff --git a/tests/test_storage.py b/tests/test_storage.py index ed9cadee..1d40036f 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -1,8 +1,8 @@ - # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2016-2019 CERN. +# Copyright (C) 2016-2020 CERN. +# Copyright (C) 2020 Cottage Labs LLP. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. 
From 4879f59854cfb73e4ff5a16cb9255016c2aa5686 Mon Sep 17 00:00:00 2001 From: Lars Holm Nielsen Date: Fri, 20 Nov 2020 07:09:18 +0100 Subject: [PATCH 41/41] models: remove python 2 support --- invenio_files_rest/models.py | 12 +++--------- setup.py | 1 + tests/test_cli.py | 1 - tests/test_legacy_storage.py | 3 +-- 4 files changed, 5 insertions(+), 12 deletions(-) diff --git a/invenio_files_rest/models.py b/invenio_files_rest/models.py index 9cc1d102..21757899 100644 --- a/invenio_files_rest/models.py +++ b/invenio_files_rest/models.py @@ -1044,15 +1044,9 @@ def __unicode__(self): return u"{0}:{1}:{2}".format( self.bucket_id, self.version_id, self.key) - # https://docs.python.org/3.3/howto/pyporting.html#str-unicode - if sys.version_info[0] >= 3: # Python 3 - def __repr__(self): - """Return representation of location.""" - return self.__unicode__() - else: # Python 2 - def __repr__(self): - """Return representation of location.""" - return self.__unicode__().encode('utf8') + def __repr__(self): + """Return representation of location.""" + return self.__unicode__() @hybrid_property def mimetype(self): diff --git a/setup.py b/setup.py index 2df9e179..d1673b51 100644 --- a/setup.py +++ b/setup.py @@ -173,6 +173,7 @@ 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: Implementation :: CPython', 'Development Status :: 5 - Production/Stable', ], diff --git a/tests/test_cli.py b/tests/test_cli.py index 8ff2e2bd..5270e2fc 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -103,7 +103,6 @@ def test_simple_workflow(app, db, tmpdir): ], obj=script_info) assert 0 == result.exit_code - print(tmpdir.listdir()) assert len(tmpdir.listdir()) == 2 # Specify a file. diff --git a/tests/test_legacy_storage.py b/tests/test_legacy_storage.py index 499cfa95..9d9af69b 100644 --- a/tests/test_legacy_storage.py +++ b/tests/test_legacy_storage.py @@ -13,11 +13,10 @@ import errno import os -from os.path import dirname, exists, getsize, join - import pytest from fs.errors import DirectoryNotEmptyError, ResourceNotFoundError from mock import patch +from os.path import dirname, exists, getsize, join from six import BytesIO from invenio_files_rest.errors import FileSizeError, StorageError, \
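
For readers following the series end to end: the practical effect of migration `0999e27defd5` together with the model change in patch 37 is a new nullable `storage_backend` string column on `FileInstance` (and a matching column on the `files_location` table), which the extensible storage factory can use to pick a backend per file. The snippet below is a minimal illustrative sketch only and is not part of the patches themselves: `describe_storage` is a hypothetical helper, and the `'<application default>'` placeholder simply stands in for whatever default backend name the application configures.

    from invenio_files_rest.models import FileInstance


    def describe_storage(file_instance):
        # type: (FileInstance) -> str
        """Summarise where a file instance is stored.

        ``storage_backend`` is nullable, so rows created before this
        patch series (or migrated from older deployments) carry NULL and
        fall back to the application-wide default backend.
        """
        backend = file_instance.storage_backend or '<application default>'
        return '{0} bytes via {1}'.format(file_instance.size or 0, backend)

Assuming an application context and a populated database, calling `describe_storage(fi)` on a freshly written file would report the backend name recorded at save time, while legacy rows report the placeholder.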