diff --git a/LICENSE b/LICENSE
index e2986635..e8198804 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,7 @@
 MIT License
 
-Copyright (C) 2015-2019 CERN.
+Copyright (C) 2015-2020 CERN.
+Copyright (C) 2020 Cottage Labs LLP.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal in
diff --git a/docs/index.rst b/docs/index.rst
index b7a74ed0..30155ec1 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -22,6 +22,7 @@ Invenio-Files-REST.
    installation
    configuration
    usage
+   new-storage-backend
 
 API Reference
diff --git a/docs/new-storage-backend.rst b/docs/new-storage-backend.rst
new file mode 100644
index 00000000..03f25d10
--- /dev/null
+++ b/docs/new-storage-backend.rst
@@ -0,0 +1,98 @@
+..
+    This file is part of Invenio.
+    Copyright (C) 2020 Cottage Labs LLP
+
+    Invenio is free software; you can redistribute it and/or modify it
+    under the terms of the MIT License; see LICENSE file for more details.
+
+
+Developing a new storage backend
+================================
+
+A storage backend should subclass ``invenio_files_rest.storage.StorageBackend`` and should minimally implement the
+``open()``, ``delete()``, ``_initialize(size)``, ``get_save_stream()`` and ``get_update_stream(seek)`` methods. You should
+register the backend with an entry point in your ``setup.py``:
+
+.. code-block:: python
+
+    setup(
+        ...,
+        entry_points={
+            ...,
+            'invenio_files_rest.storage': [
+                'mybackend = mypackage.storage:MyStorageBackend',
+            ],
+            ...
+        },
+        ...
+    )
+
+Implementation
+--------------
+
+The base class handles reporting progress, file size limits and checksumming.
+
+Here's an example implementation of a storage backend that stores files remotely over HTTP with no authentication.
+
+.. code-block:: python
+
+    import contextlib
+    import http.client
+    import urllib.parse
+    import urllib.request
+
+    from invenio_files_rest.storage import StorageBackend
+
+
+    class MyStorageBackend(StorageBackend):
+        def open(self):
+            return contextlib.closing(
+                urllib.request.urlopen(self.uri)
+            )
+
+        def _initialize(self, size=0):
+            # Allocate space for the file. You can use `self.uri` as a suggested location,
+            # or return a new location as e.g. `{"uri": the_new_uri}`.
+            request = urllib.request.Request(
+                self.uri, method='POST', headers={'X-Expected-Size': str(size)}
+            )
+            urllib.request.urlopen(request)
+            return {}
+
+        @contextlib.contextmanager
+        def get_save_stream(self):
+            # This should be a context manager (i.e. something that can be used in a `with`
+            # statement) which closes the file when exiting the context manager and performs
+            # any clean-up if an error occurs.
+            parsed_uri = urllib.parse.urlparse(self.uri)
+            # Assume the URI is HTTP, not HTTPS, and doesn't have a port or querystring
+            conn = http.client.HTTPConnection(parsed_uri.netloc)
+
+            conn.putrequest('PUT', parsed_uri.path)
+            conn.endheaders()
+
+            # HTTPConnection has a `send` method, whereas a file-like object should have
+            # `write`, so alias one to the other.
+            conn.write = conn.send
+
+            try:
+                yield conn
+                response = conn.getresponse()
+                if not 200 <= response.status < 300:
+                    raise IOError("HTTP error")
+            finally:
+                conn.close()
+
+
+Checksumming
+------------
+
+The base class performs checksumming by default, using the ``checksum_hash_name`` class or instance attribute as
+the name of the hashlib algorithm to use. If your underlying storage system provides checksumming functionality you can set
+this to ``None`` and override ``checksum()``:
+
+.. code-block:: python
+
+    class RemoteChecksumStorageBackend(StorageBackend):
+        checksum_hash_name = None
+
+        def checksum(self, chunk_size=None):
+            checksum = urllib.request.urlopen(
+                self.uri + '?checksum=sha256'
+            ).read().decode()
+            return f'sha256:{checksum}'
+
diff --git a/invenio_files_rest/alembic/0999e27defd5_add_backend_name.py b/invenio_files_rest/alembic/0999e27defd5_add_backend_name.py
new file mode 100644
index 00000000..5fd0437c
--- /dev/null
+++ b/invenio_files_rest/alembic/0999e27defd5_add_backend_name.py
@@ -0,0 +1,41 @@
+#
+# This file is part of Invenio.
+# Copyright (C) 2020 Cottage Labs LLP
+#
+# Invenio is free software; you can redistribute it and/or modify it
+# under the terms of the MIT License; see LICENSE file for more details.
+
+"""Add backend_name."""
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = '0999e27defd5'
+down_revision = '8ae99b034410'
+branch_labels = ()
+depends_on = None
+
+
+def upgrade():
+    """Upgrade database."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column(
+        'files_files',
+        sa.Column('storage_backend',
+                  sa.String(length=32),
+                  nullable=True))
+    op.add_column(
+        'files_location',
+        sa.Column('storage_backend',
+                  sa.String(length=32),
+                  nullable=True))
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    """Downgrade database."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column('files_location', 'storage_backend')
+    op.drop_column('files_files', 'storage_backend')
+    # ### end Alembic commands ###
diff --git a/invenio_files_rest/config.py b/invenio_files_rest/config.py
index f45805da..ea10b946 100644
--- a/invenio_files_rest/config.py
+++ b/invenio_files_rest/config.py
@@ -1,13 +1,15 @@
 # -*- coding: utf-8 -*-
 #
 # This file is part of Invenio.
-# Copyright (C) 2015-2019 CERN.
+# Copyright (C) 2015-2020 CERN.
+# Copyright (C) 2020 Cottage Labs LLP
 #
 # Invenio is free software; you can redistribute it and/or modify it
 # under the terms of the MIT License; see LICENSE file for more details.
 
 """Invenio Files Rest module configuration file."""
 
+import pkg_resources
 from datetime import timedelta
 
 from invenio_files_rest.helpers import create_file_streaming_redirect_response
@@ -57,6 +59,15 @@
 FILES_REST_STORAGE_FACTORY = 'invenio_files_rest.storage.pyfs_storage_factory'
 """Import path of factory used to create a storage instance."""
 
+FILES_REST_DEFAULT_STORAGE_BACKEND = 'pyfs'
+"""The default storage backend name."""
+
+FILES_REST_STORAGE_BACKENDS = {
+    ep.name: ep.load()
+    for ep in pkg_resources.iter_entry_points('invenio_files_rest.storage')
+}
+"""A mapping from storage backend names to classes."""
+
 FILES_REST_PERMISSION_FACTORY = \
     'invenio_files_rest.permissions.permission_factory'
 """Permission factory to control the files access from the REST interface."""
diff --git a/invenio_files_rest/ext.py b/invenio_files_rest/ext.py
index 5906ff42..9f0ce08e 100644
--- a/invenio_files_rest/ext.py
+++ b/invenio_files_rest/ext.py
@@ -1,7 +1,8 @@
 # -*- coding: utf-8 -*-
 #
 # This file is part of Invenio.
-# Copyright (C) 2015-2019 CERN.
+# Copyright (C) 2015-2020 CERN.
+# Copyright (C) 2020 Cottage Labs LLP.
 #
 # Invenio is free software; you can redistribute it and/or modify it
 # under the terms of the MIT License; see LICENSE file for more details.
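The two configuration variables above work together: backends are discovered from the
``invenio_files_rest.storage`` entry point group, and ``FILES_REST_DEFAULT_STORAGE_BACKEND``
selects the default by name. A minimal sketch, assuming a Flask application created by the
caller; only the two config keys and ``PyFSStorageBackend`` come from this changeset, while
``MyStorageBackend`` is hypothetical:

    from flask import Flask

    from invenio_files_rest.storage import PyFSStorageBackend

    app = Flask(__name__)
    app.config.update(
        FILES_REST_DEFAULT_STORAGE_BACKEND='pyfs',
        FILES_REST_STORAGE_BACKENDS={
            'pyfs': PyFSStorageBackend,
            # 'mybackend': MyStorageBackend,  # hypothetical backend from mypackage
        },
    )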
@@ -10,6 +11,7 @@ from __future__ import absolute_import, print_function +import warnings from flask import abort from werkzeug.exceptions import UnprocessableEntity from werkzeug.utils import cached_property @@ -30,9 +32,20 @@ def __init__(self, app): @cached_property def storage_factory(self): """Load default storage factory.""" - return load_or_import_from_config( + if self.app.config['FILES_REST_STORAGE_FACTORY'] in [ + 'invenio_files_rest.storage.pyfs_storage_factory', + ]: + warnings.warn(DeprecationWarning( + "The " + self.app.config['FILES_REST_STORAGE_FACTORY'] + + " storage factory has been deprecated in favour of" + " 'invenio_files_rest.storage:StorageFactory" + )) + storage_factory = load_or_import_from_config( 'FILES_REST_STORAGE_FACTORY', app=self.app ) + if isinstance(storage_factory, type): + storage_factory = storage_factory(self.app) + return storage_factory @cached_property def permission_factory(self): diff --git a/invenio_files_rest/models.py b/invenio_files_rest/models.py index 58181a18..21757899 100644 --- a/invenio_files_rest/models.py +++ b/invenio_files_rest/models.py @@ -1,7 +1,8 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2015-2019 CERN. +# Copyright (C) 2015-2020 CERN. +# Copyright (C) 2020 Cottage Labs LLP. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. @@ -35,12 +36,12 @@ from __future__ import absolute_import, print_function -import mimetypes - import re import six import sys +import typing import uuid +import warnings from datetime import datetime from flask import current_app from functools import wraps @@ -51,6 +52,7 @@ from sqlalchemy.orm import validates from sqlalchemy.orm.exc import MultipleResultsFound from sqlalchemy_utils.types import UUIDType +from typing import Dict, Tuple, Union from .errors import BucketLockedError, FileInstanceAlreadySetError, \ FileInstanceUnreadableError, FileSizeError, InvalidKeyError, \ @@ -58,7 +60,8 @@ MultipartInvalidChunkSize, MultipartInvalidPartNumber, \ MultipartInvalidSize, MultipartMissingParts, MultipartNotCompleted from .proxies import current_files_rest -from .utils import ENCODING_MIMETYPES, guess_mimetype +from .storage import StorageBackend +from .utils import guess_mimetype slug_pattern = re.compile('^[a-z][a-z0-9-]+$') @@ -100,7 +103,7 @@ def as_bucket_id(value): def as_object_version(value): - """Get an object version object from an object version ID or an object version. + """Get an ObjectVersion object from an ID or an ObjectVersion. :param value: A :class:`invenio_files_rest.models.ObjectVersion` or an object version ID. @@ -262,6 +265,9 @@ class Location(db.Model, Timestamp): name = db.Column(db.String(20), unique=True, nullable=False) """External identifier of the location.""" + storage_backend = db.Column(db.String(32), nullable=True) + """Name of the storage backend to use for this location.""" + uri = db.Column(db.String(255), nullable=False) """URI of the location.""" @@ -665,6 +671,8 @@ class FileInstance(db.Model, Timestamp): storage_class = db.Column(db.String(1), nullable=True) """Storage class of file.""" + storage_backend = db.Column(db.String(32), nullable=True) + size = db.Column(db.BigInteger, default=0, nullable=True) """Size of file.""" @@ -740,7 +748,7 @@ def delete(self): self.query.filter_by(id=self.id).delete() return self - def storage(self, **kwargs): + def storage(self, **kwargs) -> StorageBackend: """Get storage interface for object. 
        Uses the application's storage factory to create a storage interface
@@ -748,6 +756,14 @@
 
         :returns: Storage interface.
         """
+        if kwargs:
+            warnings.warn(
+                "Passing **kwargs to .storage() is deprecated; override the "
+                "storage factory with a subclass of "
+                "invenio_files_rest.storage.StorageFactory and implement "
+                "get_storage_backend_kwargs() instead.",
+                DeprecationWarning
+            )
         return current_files_rest.storage_factory(fileinstance=self, **kwargs)
 
     @ensure_readable()
@@ -791,13 +807,45 @@ def verify_checksum(self, progress_callback=None, chunk_size=None,
             else (self.checksum == real_checksum))
         self.last_check_at = datetime.utcnow()
         return self.last_check
 
     @ensure_writable()
-    def init_contents(self, size=0, **kwargs):
+    def initialize(self, preferred_location: Location, size=0, **kwargs):
         """Initialize file."""
-        self.set_uri(
-            *self.storage(**kwargs).initialize(size=size),
-            readable=False, writable=True)
+        if hasattr(current_files_rest.storage_factory, 'initialize'):
+            # New behaviour, with a new-style storage factory
+            storage = None
+            result = current_files_rest.storage_factory.initialize(
+                fileinstance=self,
+                preferred_location=preferred_location,
+                size=size,
+            )
+        else:
+            # Old behaviour, with an old-style storage factory
+            storage = self.storage(
+                default_location=preferred_location.uri, **kwargs
+            )
+            result = storage.initialize(size=size)
+        self.update_file_metadata(
+            result,
+            readable=False,
+            writable=True,
+            storage_backend=(
+                storage.get_backend_name()
+                if isinstance(storage, StorageBackend) else None
+            ),
+        )
+
+    @ensure_writable()
+    def init_contents(self, size=0, default_location: str = None, **kwargs):
+        """Initialize storage for this FileInstance."""
+        preferred_location = (  # type: typing.Optional[Location]
+            Location(uri=default_location) if default_location else None
+        )
+        return self.initialize(
+            preferred_location=preferred_location,
+            size=size,
+            **kwargs
+        )
 
     @ensure_writable()
     def update_contents(self, stream, seek=0, size=None, chunk_size=None,
@@ -823,10 +871,21 @@ def set_contents(self, stream, chunk_size=None, size=None, size_limit=None,
             from.
 
         :param stream: File-like stream.
""" - self.set_uri( - *self.storage(**kwargs).save( - stream, chunk_size=chunk_size, size=size, - size_limit=size_limit, progress_callback=progress_callback)) + storage = self.storage(**kwargs) + self.update_file_metadata( + storage.save( + stream, + chunk_size=chunk_size, + size=size, + size_limit=size_limit, + progress_callback=progress_callback, + ) + ) + + self.storage_backend = ( + storage.get_backend_name() + if isinstance(storage, StorageBackend) else None + ) @ensure_writable() def copy_contents(self, fileinstance, progress_callback=None, @@ -837,11 +896,13 @@ def copy_contents(self, fileinstance, progress_callback=None, if not self.size == 0: raise ValueError('File instance has data.') - self.set_uri( - *self.storage(**kwargs).copy( - fileinstance.storage(**kwargs), + with fileinstance.storage(**kwargs).open() as f: + self.set_contents( + f, + progress_callback=progress_callback, chunk_size=chunk_size, - progress_callback=progress_callback)) + **kwargs + ) @ensure_readable() def send_file(self, filename, restricted=True, mimetype=None, @@ -872,6 +933,35 @@ def set_uri(self, uri, size, checksum, readable=True, writable=False, storage_class return self + _FILE_METADATA_FIELDS = { + 'uri', 'size', 'checksum', 'writable', 'readable', 'storage_class' + } + + def update_file_metadata( + self, + file_metadata=None, # type: Union[Tuple, Dict] + **kwargs + ): + """Update the file metadata as a result of a storage operation.""" + if file_metadata is None: + file_metadata = {} + + if isinstance(file_metadata, tuple): + assert len(file_metadata) >= 3 + # Carry across defaults from **kwargs + if len(file_metadata) < 4: + file_metadata += (kwargs.get('readable', True),) + if len(file_metadata) < 5: + file_metadata += (kwargs.get('writable', False),) + if len(file_metadata) < 6: + file_metadata += (kwargs.get('storage_class', None),) + self.set_uri(*file_metadata) + elif isinstance(file_metadata, dict): + file_metadata.update(kwargs) + for key in file_metadata: + if key in self._FILE_METADATA_FIELDS: + setattr(self, key, file_metadata[key]) + class ObjectVersion(db.Model, Timestamp): """Model for storing versions of objects. @@ -954,22 +1044,16 @@ def __unicode__(self): return u"{0}:{1}:{2}".format( self.bucket_id, self.version_id, self.key) - # https://docs.python.org/3.3/howto/pyporting.html#str-unicode - if sys.version_info[0] >= 3: # Python 3 - def __repr__(self): - """Return representation of location.""" - return self.__unicode__() - else: # Python 2 - def __repr__(self): - """Return representation of location.""" - return self.__unicode__().encode('utf8') + def __repr__(self): + """Return representation of location.""" + return self.__unicode__() @hybrid_property def mimetype(self): """Get MIME type of object.""" return self._mimetype if self._mimetype else guess_mimetype(self.key) - @mimetype.setter + @mimetype.setter # type: ignore def mimetype(self, value): """Setter for MIME type.""" self._mimetype = value @@ -1013,7 +1097,9 @@ def set_contents(self, stream, chunk_size=None, size=None, size_limit=None, @ensure_no_file() @update_bucket_size - def set_location(self, uri, size, checksum, storage_class=None): + def set_location( + self, uri, size, checksum, storage_class=None, storage_backend=None + ): """Set only URI location of for object. Useful to link files on externally controlled storage. 
If a file @@ -1027,8 +1113,12 @@ def set_location(self, uri, size, checksum, storage_class=None): :param storage_class: Storage class where file is stored () """ self.file = FileInstance() - self.file.set_uri( - uri, size, checksum, storage_class=storage_class + self.file.update_file_metadata( + storage_backend=storage_backend, + uri=uri, + size=size, + checksum=checksum, + storage_class=storage_class ) db.session.add(self.file) return self diff --git a/invenio_files_rest/storage/__init__.py b/invenio_files_rest/storage/__init__.py index 0112dca8..055c66c3 100644 --- a/invenio_files_rest/storage/__init__.py +++ b/invenio_files_rest/storage/__init__.py @@ -2,6 +2,7 @@ # # This file is part of Invenio. # Copyright (C) 2015-2019 CERN. +# Copyright (C) 2020 Cottage Labs LLP. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. @@ -10,11 +11,15 @@ from __future__ import absolute_import, print_function -from .base import FileStorage -from .pyfs import PyFSFileStorage, pyfs_storage_factory +from .base import FileStorage, StorageBackend +from .factory import StorageFactory +from .pyfs import PyFSFileStorage, PyFSStorageBackend, pyfs_storage_factory __all__ = ( 'FileStorage', + 'StorageBackend', 'pyfs_storage_factory', 'PyFSFileStorage', + 'PyFSStorageBackend', + 'StorageFactory', ) diff --git a/invenio_files_rest/storage/base.py b/invenio_files_rest/storage/base.py index a48b761e..31d2405b 100644 --- a/invenio_files_rest/storage/base.py +++ b/invenio_files_rest/storage/base.py @@ -1,7 +1,8 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2016-2019 CERN. +# Copyright (C) 2016-2020 CERN. +# Copyright (C) 2020 Cottage Labs LLP. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. @@ -11,57 +12,59 @@ from __future__ import absolute_import, print_function import hashlib +import typing +import warnings from calendar import timegm -from functools import partial +from datetime import datetime +from flask import current_app +from typing import Any, Callable, Dict, Tuple -from ..errors import FileSizeError, StorageError, UnexpectedFileSizeError +from ..errors import StorageError from ..helpers import chunk_size_or_default, compute_checksum, send_stream +from ..utils import check_size, check_sizelimit +from .legacy import FileStorage +__all__ = ('FileStorage', 'StorageBackend') -def check_sizelimit(size_limit, bytes_written, total_size): - """Check if size limit was exceeded. - :param size_limit: The size limit. - :param bytes_written: The total number of bytes written. - :param total_size: The total file size. - :raises invenio_files_rest.errors.UnexpectedFileSizeError: If the bytes - written exceed the total size. - :raises invenio_files_rest.errors.FileSizeError: If the bytes - written are major than the limit size. - """ - if size_limit is not None and bytes_written > size_limit: - desc = 'File size limit exceeded.' \ - if isinstance(size_limit, int) else size_limit.reason - raise FileSizeError(description=desc) - - # Never write more than advertised - if total_size is not None and bytes_written > total_size: - raise UnexpectedFileSizeError( - description='File is bigger than expected.') - - -def check_size(bytes_written, total_size): - """Check if expected amounts of bytes have been written. - - :param bytes_written: The total number of bytes written. - :param total_size: The total file size. 
- :raises invenio_files_rest.errors.UnexpectedFileSizeError: If the bytes - written exceed the total size. - """ - if total_size and bytes_written < total_size: - raise UnexpectedFileSizeError( - description='File is smaller than expected.') - - -class FileStorage(object): +class StorageBackend: """Base class for storage interface to a single file.""" - def __init__(self, size=None, modified=None): + checksum_hash_name = 'md5' + + def __init__( + self, uri: str = None, + size: int = None, + modified: datetime = None + ): """Initialize storage object.""" + self.uri = uri self._size = size self._modified = timegm(modified.timetuple()) if modified else None - def open(self, mode=None): + @classmethod + def get_backend_name(cls): + """Return the backend name for this StorageBackend. + + This performs a reverse-lookup in FILES_REST_STORAGE_BACKENDS and then + caches the result. + """ + try: + return cls._backend_name + except AttributeError: + backends = current_app.config['FILES_REST_STORAGE_BACKENDS'] + for name, backend_cls in backends.items(): + if cls is backend_cls: + cls._backend_name = name + break + else: + raise RuntimeError( + "{} isn't listed in FILES_REST_STORAGE_BACKENDS " + "config".format(cls) + ) + return cls._backend_name + + def open(self): """Open the file. The caller is responsible for closing the file. @@ -73,25 +76,145 @@ def delete(self): raise NotImplementedError def initialize(self, size=0): - """Initialize the file on the storage + truncate to the given size.""" + """Initialize the file on the storage and truncate to given size.""" + return { + 'readable': False, + 'writable': True, + 'uri': self.uri, + 'size': size, + **self._initialize(size=size), + } + + def _initialize(self, size=0) -> Dict[Any, str]: + """Override this to perform file storage initialization.""" raise NotImplementedError - def save(self, incoming_stream, size_limit=None, size=None, - chunk_size=None, progress_callback=None): + def save( + self, + incoming_stream, + size_limit=None, + size=None, + chunk_size=None, + progress_callback=None, # type: Callable[[int, int], None] + ): """Save incoming stream to file storage.""" + with self.get_save_stream() as output_stream: + result = self._write_stream( + incoming_stream, + output_stream, + size_limit=size_limit, + size=size, + chunk_size=chunk_size, + progress_callback=progress_callback, + ) + self._size = result['size'] + if not result['checksum']: + result['checksum'] = self.checksum(chunk_size=chunk_size) + return { + 'uri': self.uri, + 'readable': True, + 'writable': False, + 'storage_class': 'S', + **result, + } + + def get_save_stream(self) -> typing.ContextManager: + """Return a context manager for a file-like object for writing. + + The return value should be a context manager that provides a file-like + object when entered, and performs any necessary clean-up when exited + (e.g. closing the file). + """ raise NotImplementedError def update(self, incoming_stream, seek=0, size=None, chunk_size=None, - progress_callback=None): + progress_callback=None) -> Tuple[int, str]: """Update part of file with incoming stream.""" + with self.get_update_stream(seek) as output_stream: + result = self._write_stream( + incoming_stream, + output_stream, + size=size, + chunk_size=chunk_size, + progress_callback=progress_callback, + ) + self._size = seek + result['size'] + return result['size'], result['checksum'] + + def get_update_stream(self, seek) -> typing.ContextManager: + """Return a context manager for a file-like object for updating. 
+ + The return value should be a context manager that provides a file-like + object when entered, and performs any necessary clean-up when exited + (e.g. closing the file). + """ raise NotImplementedError + def _write_stream( + self, + incoming_stream, + output_stream, + *, + size_limit=None, + size=None, + chunk_size=None, + progress_callback=None, + ): + """Copy from one stream to another. + + This honors size limits and performs requested progress callbacks once + data has been written to the output stream. + """ + chunk_size = chunk_size_or_default(chunk_size) + + algo, checksum = self._init_hash() + update_sum = checksum.update if checksum else lambda chunk: None + + bytes_written = 0 + + while True: + # Check that size limits aren't bypassed + check_sizelimit(size_limit, bytes_written, size) + + chunk = incoming_stream.read(chunk_size) + + if not chunk: + if progress_callback: + progress_callback(bytes_written, bytes_written) + break + + output_stream.write(chunk) + + bytes_written += len(chunk) + + update_sum(chunk) + + if progress_callback: + progress_callback(None, bytes_written) + + check_size(bytes_written, size) + + return { + 'checksum': ( + f'{self.checksum_hash_name}:{checksum.hexdigest()}' + if checksum else None + ), + 'size': bytes_written, + } + # # Default implementation # - def send_file(self, filename, mimetype=None, restricted=True, - checksum=None, trusted=False, chunk_size=None, - as_attachment=False): + def send_file( + self, + filename, + mimetype=None, + restricted=True, + checksum=None, + trusted=False, + chunk_size=None, + as_attachment=False + ): """Send the file to the client.""" try: fp = self.open(mode='rb') @@ -123,18 +246,21 @@ def send_file(self, filename, mimetype=None, restricted=True, fp.close() raise StorageError('Could not send file: {}'.format(e)) - def checksum(self, chunk_size=None, progress_callback=None, **kwargs): + def checksum(self, chunk_size=None, progress_callback=None): """Compute checksum of file.""" - fp = self.open(mode='rb') - try: - value = self._compute_checksum( - fp, size=self._size, chunk_size=None, - progress_callback=progress_callback) - except StorageError: - raise - finally: - fp.close() - return value + algo, m = self._init_hash() + if not m: + return None + + chunk_size = chunk_size_or_default(chunk_size) + + with self.open(mode='rb') as fp: + algo, m = self._init_hash() + return compute_checksum( + fp, algo, m, + chunk_size=chunk_size, + progress_callback=progress_callback + ) def copy(self, src, chunk_size=None, progress_callback=None): """Copy data from another file instance. @@ -142,12 +268,14 @@ def copy(self, src, chunk_size=None, progress_callback=None): :param src: Source stream. :param chunk_size: Chunk size to read from source stream. """ - fp = src.open(mode='rb') - try: + warnings.warn( + "Call save() with the other already-open FileStorage passed in " + "instead.", + DeprecationWarning + ) + with src.open() as fp: return self.save( fp, chunk_size=chunk_size, progress_callback=progress_callback) - finally: - fp.close() # # Helpers @@ -158,70 +286,9 @@ def _init_hash(self): Overwrite this method if you want to use different checksum algorithm for your storage backend. """ - return 'md5', hashlib.md5() - - def _compute_checksum(self, stream, size=None, chunk_size=None, - progress_callback=None, **kwargs): - """Get helper method to compute checksum from a stream. - - Naive implementation that can be overwritten by subclasses in order to - provide more efficient implementation. 
- """ - if progress_callback and size: - progress_callback = partial(progress_callback, size) - else: - progress_callback = None - - try: - algo, m = self._init_hash() - return compute_checksum( - stream, algo, m, - chunk_size=chunk_size, - progress_callback=progress_callback, - **kwargs + if self.checksum_hash_name: + return ( + self.checksum_hash_name, hashlib.new(self.checksum_hash_name) ) - except Exception as e: - raise StorageError( - 'Could not compute checksum of file: {0}'.format(e)) - - def _write_stream(self, src, dst, size=None, size_limit=None, - chunk_size=None, progress_callback=None): - """Get helper to save stream from src to dest + compute checksum. - - :param src: Source stream. - :param dst: Destination stream. - :param size: If provided, this exact amount of bytes will be - written to the destination file. - :param size_limit: ``FileSizeLimit`` instance to limit number of bytes - to write. - """ - chunk_size = chunk_size_or_default(chunk_size) - - algo, m = self._init_hash() - bytes_written = 0 - - while 1: - # Check that size limits aren't bypassed - check_sizelimit(size_limit, bytes_written, size) - - chunk = src.read(chunk_size) - - if not chunk: - if progress_callback: - progress_callback(bytes_written, bytes_written) - break - - dst.write(chunk) - - bytes_written += len(chunk) - - if m: - m.update(chunk) - - if progress_callback: - progress_callback(None, bytes_written) - - check_size(bytes_written, size) - - return bytes_written, '{0}:{1}'.format( - algo, m.hexdigest()) if m else None + else: + return None, None diff --git a/invenio_files_rest/storage/factory.py b/invenio_files_rest/storage/factory.py new file mode 100644 index 00000000..c9db85ef --- /dev/null +++ b/invenio_files_rest/storage/factory.py @@ -0,0 +1,150 @@ +# -*- coding: utf-8 -*- +# +# This file is part of Invenio. +# Copyright (C) 2020 Cottage Labs LLP. +# +# Invenio is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. + +"""Contains the base storage factory implementation.""" + +from flask import current_app +from typing import Any, Dict, Optional, Type + +from invenio_files_rest.models import FileInstance, Location + +from ..helpers import make_path +from .base import StorageBackend + + +class StorageFactory: + """A base storage factory, with sensible default behaviour. + + You may subclass this factory to implement custom behaviour. If you do + this, remember to set FILES_REST_STORAGE_FACTORY to the right import path + for your subclass. + """ + + def __init__(self, app): + """Initialize the storage factory.""" + self.app = app + + def __call__( + self, + fileinstance: FileInstance, + ) -> Optional[StorageBackend]: + """Return a FileStorage instance for a file, for manipulating contents. + + This requires that the fileinstance already has an associated storage + backend. If not, you should call initialize() instead to initialize + storage for the file instance. + """ + if not fileinstance.storage_backend: + return None + + storage_backend_cls = self.resolve_storage_backend( + fileinstance.storage_backend + ) + storage_backend_kwargs = self.get_storage_backend_kwargs( + fileinstance, storage_backend_cls + ) + + return storage_backend_cls( + uri=fileinstance.uri, + size=fileinstance.size, + **storage_backend_kwargs + ) + + def initialize( + self, + fileinstance: FileInstance, + size: int = 0, + preferred_location: Location = None + ) -> StorageBackend: + """Initialize storage for a new file. 
+ + If provided, `preferred_location` will inform where the file will be + stored, but may be ignored if this factory is subclassed and + `get_location()` is overridden. + """ + if fileinstance.storage_backend: + return self(fileinstance) # type: ignore + + location = self.get_location(fileinstance, preferred_location) + + fileinstance.storage_backend = location.storage_backend + + storage_backend_cls = self.resolve_storage_backend( + fileinstance.storage_backend + ) + storage_backend_kwargs = self.get_storage_backend_kwargs( + fileinstance, storage_backend_cls + ) + + uri = self.get_suggested_uri( + fileinstance=fileinstance, + location=location, + storage_backend_cls=storage_backend_cls, + ) + + return storage_backend_cls( + uri=uri, + **storage_backend_kwargs, + ).initialize( + size=size, + ) + + def get_location( + self, + fileinstance: FileInstance, + preferred_location: Location = None + ) -> Location: + """Return a Location for storing a new file. + + This base implementation returns the preferred_location if it's + provided, or the default location recorded in the database. This method + can be overridden if some other methodology is required. + """ + return preferred_location or Location.get_default() + + def resolve_storage_backend( + self, backend_name: str + ) -> Type[StorageBackend]: + """Resolve a storage backend name to the associated backend class. + + This base implementation resolves backends from the + FILES_REST_STORAGE_BACKENDS app config setting. + """ + return self.app.config['FILES_REST_STORAGE_BACKENDS'][backend_name] + + def get_storage_backend_kwargs( + self, + fileinstance: FileInstance, + storage_backend_cls: Type[StorageBackend], + ) -> Dict[str, Any]: + """Retrieve any instantiation kwargs for the storage backend. + + This returns an empty dict by defaut, but can be overridden to provide + backend-specific instantiation parameters if necessary. + """ + return {} + + def get_suggested_uri( + self, + fileinstance: FileInstance, + location: Location, + storage_backend_cls: Type[StorageBackend], + ): + """Generate a suggested URI for new files. + + This can be overridden if your implementation requires some other file + layout for storage. Note that individual storage backends may choose to + ignore the suggested URI if they organise files by some other scheme. + """ + return make_path( + location, + str(fileinstance.id), + 'data', + current_app.config['FILES_REST_STORAGE_PATH_DIMENSIONS'], + current_app.config['FILES_REST_STORAGE_PATH_SPLIT_LENGTH'], + ) diff --git a/invenio_files_rest/storage/legacy.py b/invenio_files_rest/storage/legacy.py new file mode 100644 index 00000000..3a34d7b4 --- /dev/null +++ b/invenio_files_rest/storage/legacy.py @@ -0,0 +1,346 @@ +# -*- coding: utf-8 -*- +# +# This file is part of Invenio. +# Copyright (C) 2016-2020 CERN. +# Copyright (C) 2020 Cottage Labs LLP. +# +# Invenio is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. + +"""The previous invenio_files_rest.storage implementation. + +These classes and factory are included so that we can test that there are no +regressions against them when implementing the new storage interface, and so +ensure backwards compatibility. 
+""" + +from __future__ import absolute_import, print_function + +import hashlib +from calendar import timegm +from flask import current_app +from fs.opener import opener +from fs.path import basename, dirname +from functools import partial + +from invenio_files_rest.errors import StorageError +from invenio_files_rest.helpers import chunk_size_or_default, \ + compute_checksum, make_path, send_stream +from invenio_files_rest.utils import check_size, check_sizelimit + + +class FileStorage(object): + """Base class for storage interface to a single file.""" + + def __init__(self, size=None, modified=None): + """Initialize storage object.""" + self._size = size + self._modified = timegm(modified.timetuple()) if modified else None + + def open(self, mode=None): + """Open the file. + + The caller is responsible for closing the file. + """ + raise NotImplementedError + + def delete(self): + """Delete the file.""" + raise NotImplementedError + + def initialize(self, size=0): + """Initialize the file on the storage + truncate to the given size.""" + raise NotImplementedError + + def save(self, incoming_stream, size_limit=None, size=None, + chunk_size=None, progress_callback=None): + """Save incoming stream to file storage.""" + raise NotImplementedError + + def update(self, incoming_stream, seek=0, size=None, chunk_size=None, + progress_callback=None): + """Update part of file with incoming stream.""" + raise NotImplementedError + + # + # Default implementation + # + def send_file(self, filename, mimetype=None, restricted=True, + checksum=None, trusted=False, chunk_size=None, + as_attachment=False): + """Send the file to the client.""" + try: + fp = self.open(mode='rb') + except Exception as e: + raise StorageError('Could not send file: {}'.format(e)) + + try: + md5_checksum = None + if checksum: + algo, value = checksum.split(':') + if algo == 'md5': + md5_checksum = value + + # Send stream is responsible for closing the file. + return send_stream( + fp, + filename, + self._size, + self._modified, + mimetype=mimetype, + restricted=restricted, + etag=checksum, + content_md5=md5_checksum, + chunk_size=chunk_size, + trusted=trusted, + as_attachment=as_attachment, + ) + except Exception as e: + fp.close() + raise StorageError('Could not send file: {}'.format(e)) + + def checksum(self, chunk_size=None, progress_callback=None, **kwargs): + """Compute checksum of file.""" + fp = self.open(mode='rb') + try: + value = self._compute_checksum( + fp, size=self._size, chunk_size=None, + progress_callback=progress_callback) + except StorageError: + raise + finally: + fp.close() + return value + + def copy(self, src, chunk_size=None, progress_callback=None): + """Copy data from another file instance. + + :param src: Source stream. + :param chunk_size: Chunk size to read from source stream. + """ + fp = src.open(mode='rb') + try: + return self.save( + fp, chunk_size=chunk_size, progress_callback=progress_callback) + finally: + fp.close() + + # + # Helpers + # + def _init_hash(self): + """Initialize message digest object. + + Overwrite this method if you want to use different checksum + algorithm for your storage backend. + """ + return 'md5', hashlib.md5() + + def _compute_checksum(self, stream, size=None, chunk_size=None, + progress_callback=None, **kwargs): + """Get helper method to compute checksum from a stream. + + Naive implementation that can be overwritten by subclasses in order to + provide more efficient implementation. 
+ """ + if progress_callback and size: + progress_callback = partial(progress_callback, size) + else: + progress_callback = None + + try: + algo, m = self._init_hash() + return compute_checksum( + stream, algo, m, + chunk_size=chunk_size, + progress_callback=progress_callback, + **kwargs + ) + except Exception as e: + raise StorageError( + 'Could not compute checksum of file: {0}'.format(e)) + + def _write_stream(self, src, dst, size=None, size_limit=None, + chunk_size=None, progress_callback=None): + """Get helper to save stream from src to dest + compute checksum. + + :param src: Source stream. + :param dst: Destination stream. + :param size: If provided, this exact amount of bytes will be + written to the destination file. + :param size_limit: ``FileSizeLimit`` instance to limit number of bytes + to write. + """ + chunk_size = chunk_size_or_default(chunk_size) + + algo, m = self._init_hash() + bytes_written = 0 + + while 1: + # Check that size limits aren't bypassed + check_sizelimit(size_limit, bytes_written, size) + + chunk = src.read(chunk_size) + + if not chunk: + if progress_callback: + progress_callback(bytes_written, bytes_written) + break + + dst.write(chunk) + + bytes_written += len(chunk) + + if m: + m.update(chunk) + + if progress_callback: + progress_callback(None, bytes_written) + + check_size(bytes_written, size) + + return bytes_written, '{0}:{1}'.format( + algo, m.hexdigest()) if m else None + + +class PyFSFileStorage(FileStorage): + """File system storage using PyFilesystem for access the file. + + This storage class will store files according to the following pattern: + ``//data``. + + .. warning:: + + File operations are not atomic. E.g. if errors happens during e.g. + updating part of a file it will leave the file in an inconsistent + state. The storage class tries as best as possible to handle errors + and leave the system in a consistent state. + """ + + def __init__(self, fileurl, size=None, modified=None, clean_dir=True): + """Storage initialization.""" + self.fileurl = fileurl + self.clean_dir = clean_dir + super(PyFSFileStorage, self).__init__(size=size, modified=modified) + + def _get_fs(self, create_dir=True): + """Return tuple with filesystem and filename.""" + filedir = dirname(self.fileurl) + filename = basename(self.fileurl) + + return ( + opener.opendir(filedir, writeable=True, create_dir=create_dir), + filename + ) + + def open(self, mode='rb'): + """Open file. + + The caller is responsible for closing the file. + """ + fs, path = self._get_fs() + return fs.open(path, mode=mode) + + def delete(self): + """Delete a file. + + The base directory is also removed, as it is assumed that only one file + exists in the directory. 
+ """ + fs, path = self._get_fs(create_dir=False) + if fs.exists(path): + fs.remove(path) + if self.clean_dir and fs.exists('.'): + fs.removedir('.') + return True + + def initialize(self, size=0): + """Initialize file on storage and truncate to given size.""" + fs, path = self._get_fs() + + # Required for reliably opening the file on certain file systems: + if fs.exists(path): + fp = fs.open(path, mode='r+b') + else: + fp = fs.open(path, mode='wb') + + try: + fp.truncate(size) + except Exception: + fp.close() + self.delete() + raise + finally: + fp.close() + + self._size = size + + return self.fileurl, size, None + + def save(self, incoming_stream, size_limit=None, size=None, + chunk_size=None, progress_callback=None): + """Save file in the file system.""" + fp = self.open(mode='wb') + try: + bytes_written, checksum = self._write_stream( + incoming_stream, fp, chunk_size=chunk_size, + progress_callback=progress_callback, + size_limit=size_limit, size=size) + except Exception: + fp.close() + self.delete() + raise + finally: + fp.close() + + self._size = bytes_written + + return self.fileurl, bytes_written, checksum + + def update(self, incoming_stream, seek=0, size=None, chunk_size=None, + progress_callback=None): + """Update a file in the file system.""" + fp = self.open(mode='r+b') + try: + fp.seek(seek) + bytes_written, checksum = self._write_stream( + incoming_stream, fp, chunk_size=chunk_size, + size=size, progress_callback=progress_callback) + finally: + fp.close() + + return bytes_written, checksum + + +def pyfs_storage_factory(fileinstance=None, default_location=None, + default_storage_class=None, + filestorage_class=PyFSFileStorage, fileurl=None, + size=None, modified=None, clean_dir=True): + """Get factory function for creating a PyFS file storage instance.""" + # Either the FileInstance needs to be specified or all filestorage + # class parameters need to be specified + assert fileinstance or (fileurl and size) + + if fileinstance: + # FIXME: Code here should be refactored since it assumes a lot on the + # directory structure where the file instances are written + fileurl = None + size = fileinstance.size + modified = fileinstance.updated + + if fileinstance.uri: + # Use already existing URL. + fileurl = fileinstance.uri + else: + assert default_location + # Generate a new URL. + fileurl = make_path( + default_location, + str(fileinstance.id), + 'data', + current_app.config['FILES_REST_STORAGE_PATH_DIMENSIONS'], + current_app.config['FILES_REST_STORAGE_PATH_SPLIT_LENGTH'], + ) + + return filestorage_class( + fileurl, size=size, modified=modified, clean_dir=clean_dir) diff --git a/invenio_files_rest/storage/pyfs.py b/invenio_files_rest/storage/pyfs.py index 7a09ef08..1704e82a 100644 --- a/invenio_files_rest/storage/pyfs.py +++ b/invenio_files_rest/storage/pyfs.py @@ -1,7 +1,8 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2016-2019 CERN. +# Copyright (C) 2016-2020 CERN. +# Copyright (C) 2020 Cottage Labs LLP. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. 
@@ -10,15 +11,17 @@
 
 from __future__ import absolute_import, print_function
 
-from flask import current_app
+import contextlib
 from fs.opener import opener
 from fs.path import basename, dirname
 
-from ..helpers import make_path
-from .base import FileStorage
+from .base import StorageBackend
+from .legacy import PyFSFileStorage, pyfs_storage_factory
 
+__all__ = ('PyFSFileStorage', 'pyfs_storage_factory', 'PyFSStorageBackend')
 
-class PyFSFileStorage(FileStorage):
+
+class PyFSStorageBackend(StorageBackend):
     """File system storage using PyFilesystem for access the file.
 
     This storage class will store files according to the following pattern:
     ``//data``.
@@ -30,19 +33,19 @@ class PyFSFileStorage(FileStorage):
         updating part of a file it will leave the file in an inconsistent
         state. The storage class tries as best as possible to handle errors
         and leave the system in a consistent state.
     """
 
-    def __init__(self, fileurl, size=None, modified=None, clean_dir=True):
+    def __init__(self, *args, clean_dir=True, **kwargs):
         """Storage initialization."""
-        self.fileurl = fileurl
         self.clean_dir = clean_dir
-        super(PyFSFileStorage, self).__init__(size=size, modified=modified)
+        super().__init__(*args, **kwargs)
 
     def _get_fs(self, create_dir=True):
         """Return tuple with filesystem and filename."""
-        filedir = dirname(self.fileurl)
-        filename = basename(self.fileurl)
+        filedir = dirname(self.uri)
+        filename = basename(self.uri)
 
         return (
             opener.opendir(filedir, writeable=True, create_dir=create_dir),
             filename
         )
@@ -70,7 +73,7 @@ def delete(self):
             fs.removedir('.')
         return True
 
-    def initialize(self, size=0):
+    def _initialize(self, size=0):
         """Initialize file on storage and truncate to given size."""
         fs, path = self._get_fs()
@@ -91,72 +94,26 @@ def initialize(self, size=0):
 
         self._size = size
 
-        return self.fileurl, size, None
+        return {}
 
-    def save(self, incoming_stream, size_limit=None, size=None,
-             chunk_size=None, progress_callback=None):
-        """Save file in the file system."""
+    @contextlib.contextmanager
+    def get_save_stream(self):
+        """Return the underlying file for writing.
+
+        This implementation deletes the file if an exception is thrown by code
+        executing in this context.
+ """ fp = self.open(mode='wb') try: - bytes_written, checksum = self._write_stream( - incoming_stream, fp, chunk_size=chunk_size, - progress_callback=progress_callback, - size_limit=size_limit, size=size) + yield fp except Exception: - fp.close() self.delete() raise finally: fp.close() - self._size = bytes_written - - return self.fileurl, bytes_written, checksum - - def update(self, incoming_stream, seek=0, size=None, chunk_size=None, - progress_callback=None): - """Update a file in the file system.""" + def get_update_stream(self, seek): + """Open the underlying file for updates, seeking as requested.""" fp = self.open(mode='r+b') - try: - fp.seek(seek) - bytes_written, checksum = self._write_stream( - incoming_stream, fp, chunk_size=chunk_size, - size=size, progress_callback=progress_callback) - finally: - fp.close() - - return bytes_written, checksum - - -def pyfs_storage_factory(fileinstance=None, default_location=None, - default_storage_class=None, - filestorage_class=PyFSFileStorage, fileurl=None, - size=None, modified=None, clean_dir=True): - """Get factory function for creating a PyFS file storage instance.""" - # Either the FileInstance needs to be specified or all filestorage - # class parameters need to be specified - assert fileinstance or (fileurl and size) - - if fileinstance: - # FIXME: Code here should be refactored since it assumes a lot on the - # directory structure where the file instances are written - fileurl = None - size = fileinstance.size - modified = fileinstance.updated - - if fileinstance.uri: - # Use already existing URL. - fileurl = fileinstance.uri - else: - assert default_location - # Generate a new URL. - fileurl = make_path( - default_location, - str(fileinstance.id), - 'data', - current_app.config['FILES_REST_STORAGE_PATH_DIMENSIONS'], - current_app.config['FILES_REST_STORAGE_PATH_SPLIT_LENGTH'], - ) - - return filestorage_class( - fileurl, size=size, modified=modified, clean_dir=clean_dir) + fp.seek(seek) + return contextlib.closing(fp) diff --git a/invenio_files_rest/utils.py b/invenio_files_rest/utils.py index 1345699c..8f8bc2b7 100644 --- a/invenio_files_rest/utils.py +++ b/invenio_files_rest/utils.py @@ -2,6 +2,7 @@ # # This file is part of Invenio. # Copyright (C) 2016-2019 CERN. +# Copyright (C) 2020 Cottage Labs LLP. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. @@ -14,6 +15,8 @@ from flask import current_app from werkzeug.utils import import_string +from invenio_files_rest.errors import FileSizeError, UnexpectedFileSizeError + ENCODING_MIMETYPES = { 'gzip': 'application/gzip', 'compress': 'application/gzip', @@ -56,3 +59,38 @@ def guess_mimetype(filename): if encoding: m = ENCODING_MIMETYPES.get(encoding, None) return m or 'application/octet-stream' + + +def check_sizelimit(size_limit, bytes_written, total_size): + """Check if size limit was exceeded. + + :param size_limit: The size limit. + :param bytes_written: The total number of bytes written. + :param total_size: The total file size. + :raises invenio_files_rest.errors.UnexpectedFileSizeError: If the bytes + written exceed the total size. + :raises invenio_files_rest.errors.FileSizeError: If the bytes + written are major than the limit size. + """ + if size_limit is not None and bytes_written > size_limit: + desc = 'File size limit exceeded.' 
\ + if isinstance(size_limit, int) else size_limit.reason + raise FileSizeError(description=desc) + + # Never write more than advertised + if total_size is not None and bytes_written > total_size: + raise UnexpectedFileSizeError( + description='File is bigger than expected.') + + +def check_size(bytes_written, total_size): + """Check if expected amounts of bytes have been written. + + :param bytes_written: The total number of bytes written. + :param total_size: The total file size. + :raises invenio_files_rest.errors.UnexpectedFileSizeError: If the bytes + written exceed the total size. + """ + if total_size and bytes_written < total_size: + raise UnexpectedFileSizeError( + description='File is smaller than expected.') diff --git a/setup.py b/setup.py index 94a276fb..d1673b51 100644 --- a/setup.py +++ b/setup.py @@ -2,6 +2,7 @@ # # This file is part of Invenio. # Copyright (C) 2015-2019 CERN. +# Copyright (C) 2020 Cottage Labs LLP. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. @@ -153,6 +154,9 @@ 'invenio_db.models': [ 'invenio_files_rest = invenio_files_rest.models', ], + 'invenio_files_rest.storage': [ + 'pyfs = invenio_files_rest.storage:PyFSStorageBackend', + ] }, extras_require=extras_require, install_requires=install_requires, @@ -169,6 +173,7 @@ 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: Implementation :: CPython', 'Development Status :: 5 - Production/Stable', ], diff --git a/tests/conftest.py b/tests/conftest.py index 5daa35c7..25ecf384 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,6 +2,7 @@ # # This file is part of Invenio. # Copyright (C) 2015-2019 CERN. +# Copyright (C) 2020 Cottage Labs LLP. # # Invenio is free software; you can redistribute it and/or modify it # under the terms of the MIT License; see LICENSE file for more details. @@ -41,7 +42,7 @@ location_update_all, multipart_delete_all, multipart_read_all, \ object_delete_all, object_delete_version_all, object_read_all, \ object_read_version_all -from invenio_files_rest.storage import PyFSFileStorage +from invenio_files_rest.storage import PyFSFileStorage, PyFSStorageBackend from invenio_files_rest.views import blueprint @@ -135,7 +136,8 @@ def dummy_location(db): loc = Location( name='testloc', uri=tmppath, - default=True + storage_backend='pyfs', + default=True, ) db.session.add(loc) db.session.commit() @@ -153,6 +155,12 @@ def pyfs_testpath(dummy_location): @pytest.fixture() def pyfs(dummy_location, pyfs_testpath): + """Instance of PyFSFileStorage.""" + return PyFSStorageBackend(pyfs_testpath) + + +@pytest.fixture() +def legacy_pyfs(dummy_location, pyfs_testpath): """Instance of PyFSFileStorage.""" return PyFSFileStorage(pyfs_testpath) diff --git a/tests/test_legacy_storage.py b/tests/test_legacy_storage.py new file mode 100644 index 00000000..9d9af69b --- /dev/null +++ b/tests/test_legacy_storage.py @@ -0,0 +1,346 @@ +# -*- coding: utf-8 -*- +# +# This file is part of Invenio. +# Copyright (C) 2016-2020 CERN. +# Copyright (C) 2020 Cottage Labs LLP. +# +# Invenio is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. 
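An illustrative sketch of how the new ``pyfs`` fixture above could be exercised against the
dict-based ``save()`` API; the test name and payload are hypothetical:

    from io import BytesIO


    def test_pyfs_backend_roundtrip(pyfs, pyfs_testpath):
        result = pyfs.save(BytesIO(b'hello'))
        assert result['size'] == 5
        assert result['checksum'].startswith('md5:')
        with open(pyfs_testpath, 'rb') as fp:
            assert fp.read() == b'hello'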
+ +"""Storage module tests.""" + +from __future__ import absolute_import, print_function + +import errno +import os +import pytest +from fs.errors import DirectoryNotEmptyError, ResourceNotFoundError +from mock import patch +from os.path import dirname, exists, getsize, join +from six import BytesIO + +from invenio_files_rest.errors import FileSizeError, StorageError, \ + UnexpectedFileSizeError +from invenio_files_rest.limiters import FileSizeLimit +from invenio_files_rest.storage import FileStorage, PyFSFileStorage + + +def test_storage_interface(): + """Test storage interface.""" + s = FileStorage('some-path') + pytest.raises(NotImplementedError, s.open) + pytest.raises(NotImplementedError, s.initialize, 'file:///some/path') + pytest.raises(NotImplementedError, s.delete) + pytest.raises(NotImplementedError, s.save, None) + pytest.raises(NotImplementedError, s.update, None) + pytest.raises(NotImplementedError, s.checksum) + + +def test_pyfs_initialize(legacy_pyfs, pyfs_testpath): + """Test init of files.""" + # Create file object. + assert not exists(pyfs_testpath) + uri, size, checksum = legacy_pyfs.initialize(size=100) + + assert size == 100 + assert checksum is None + assert os.stat(pyfs_testpath).st_size == size + + uri, size, checksum = legacy_pyfs.initialize() + assert size == 0 + assert size == os.stat(pyfs_testpath).st_size + + +def test_pyfs_delete(app, db, dummy_location): + """Test init of files.""" + testurl = join(dummy_location.uri, 'subpath/data') + s = PyFSFileStorage(testurl) + s.initialize(size=100) + assert exists(testurl) + + s.delete() + assert not exists(testurl) + + s = PyFSFileStorage(join(dummy_location.uri, 'anotherpath/data')) + pytest.raises(ResourceNotFoundError, s.delete) + + +def test_pyfs_delete_fail(legacy_pyfs, pyfs_testpath): + """Test init of files.""" + legacy_pyfs.save(BytesIO(b'somedata')) + os.rename(pyfs_testpath, join(dirname(pyfs_testpath), 'newname')) + pytest.raises(DirectoryNotEmptyError, legacy_pyfs.delete) + + +def test_pyfs_save(legacy_pyfs, pyfs_testpath, get_md5): + """Test basic save operation.""" + data = b'somedata' + legacy_pyfs.save(BytesIO(data)) + + assert exists(pyfs_testpath) + assert open(pyfs_testpath, 'rb').read() == data + + +def test_pyfs_save_failcleanup(legacy_pyfs, pyfs_testpath, get_md5): + """Test basic save operation.""" + data = b'somedata' + + def fail_callback(total, size): + assert exists(pyfs_testpath) + raise Exception('Something bad happened') + + pytest.raises( + Exception, + legacy_pyfs.save, + BytesIO(data), chunk_size=4, progress_callback=fail_callback + ) + assert not exists(pyfs_testpath) + assert not exists(dirname(pyfs_testpath)) + + +def test_pyfs_save_callback(legacy_pyfs): + """Test progress callback.""" + data = b'somedata' + + counter = dict(size=0) + + def callback(total, size): + counter['size'] = size + + uri, size, checksum = legacy_pyfs.save( + BytesIO(data), progress_callback=callback) + + assert counter['size'] == len(data) + + +def test_pyfs_save_limits(legacy_pyfs): + """Test progress callback.""" + data = b'somedata' + uri, size, checksum = legacy_pyfs.save(BytesIO(data), size=len(data)) + assert size == len(data) + + uri, size, checksum = legacy_pyfs.save(BytesIO(data), size_limit=len(data)) + assert size == len(data) + + # Size doesn't match + with pytest.raises(UnexpectedFileSizeError): + legacy_pyfs.save(BytesIO(data), size=len(data) - 1) + with pytest.raises(UnexpectedFileSizeError): + legacy_pyfs.save(BytesIO(data), size=len(data) + 1) + + # Exceeds size limits + pytest.raises( + 
FileSizeError, legacy_pyfs.save, BytesIO(data), + size_limit=FileSizeLimit(len(data) - 1, 'bla')) + + +def test_pyfs_update(legacy_pyfs, pyfs_testpath, get_md5): + """Test update of file.""" + legacy_pyfs.initialize(size=100) + legacy_pyfs.update(BytesIO(b'cd'), seek=2, size=2) + legacy_pyfs.update(BytesIO(b'ab'), seek=0, size=2) + + with open(pyfs_testpath) as fp: + content = fp.read() + assert content[0:4] == 'abcd' + assert len(content) == 100 + + # Assert return parameters from update. + size, checksum = legacy_pyfs.update(BytesIO(b'ef'), seek=4, size=2) + assert size == 2 + assert get_md5(b'ef') == checksum + + +def test_pyfs_update_fail(legacy_pyfs, pyfs_testpath, get_md5): + """Test update of file.""" + def fail_callback(total, size): + assert exists(pyfs_testpath) + raise Exception('Something bad happened') + + legacy_pyfs.initialize(size=100) + legacy_pyfs.update(BytesIO(b'ab'), seek=0, size=2) + pytest.raises( + Exception, + legacy_pyfs.update, + BytesIO(b'cdef'), + seek=2, + size=4, + chunk_size=2, + progress_callback=fail_callback, + ) + + # Partial file can be written to disk! + with open(pyfs_testpath) as fp: + content = fp.read() + assert content[0:4] == 'abcd' + assert content[4:6] != 'ef' + + +def test_pyfs_checksum(get_md5): + """Test fixity.""" + # Compute checksum of license file/ + with open('LICENSE', 'rb') as fp: + data = fp.read() + checksum = get_md5(data) + + counter = dict(size=0) + + def callback(total, size): + counter['size'] = size + + # Now do it with storage interface + s = PyFSFileStorage('LICENSE', size=getsize('LICENSE')) + assert checksum == s.checksum(chunk_size=2, progress_callback=callback) + assert counter['size'] == getsize('LICENSE') + + # No size provided, means progress callback isn't called + counter['size'] = 0 + s = PyFSFileStorage('LICENSE') + assert checksum == s.checksum(chunk_size=2, progress_callback=callback) + assert counter['size'] == 0 + + +def test_pyfs_checksum_fail(): + """Test fixity problems.""" + # Raise an error during checksum calculation + def callback(total, size): + raise OSError(errno.EPERM, "Permission") + + s = PyFSFileStorage('LICENSE', size=getsize('LICENSE')) + + pytest.raises(StorageError, s.checksum, progress_callback=callback) + + +def test_pyfs_send_file(app, legacy_pyfs): + """Test send file.""" + data = b'sendthis' + uri, size, checksum = legacy_pyfs.save(BytesIO(data)) + + with app.test_request_context(): + res = legacy_pyfs.send_file( + 'myfilename.txt', mimetype='text/plain', checksum=checksum) + assert res.status_code == 200 + h = res.headers + assert h['Content-Type'] == 'text/plain; charset=utf-8' + assert h['Content-Length'] == str(size) + assert h['Content-MD5'] == checksum[4:] + assert h['ETag'] == '"{0}"'.format(checksum) + + # Content-Type: application/octet-stream + # ETag: "b234ee4d69f5fce4486a80fdaf4a4263" + # Last-Modified: Sat, 23 Jan 2016 06:21:04 GMT + # Cache-Control: max-age=43200, public + # Expires: Sat, 23 Jan 2016 19:21:04 GMT + # Date: Sat, 23 Jan 2016 07:21:04 GMT + + res = legacy_pyfs.send_file( + 'myfilename.txt', mimetype='text/plain', checksum='crc32:test') + assert res.status_code == 200 + assert 'Content-MD5' not in dict(res.headers) + + # Test for absence of Content-Disposition header to make sure that + # it's not present when as_attachment=False + res = legacy_pyfs.send_file('myfilename.txt', mimetype='text/plain', + checksum=checksum, as_attachment=False) + assert res.status_code == 200 + assert 'attachment' not in res.headers['Content-Disposition'] + + +def 
+    """Test sending a file as an attachment."""
+    data = b'sendthis'
+    uri, size, checksum = legacy_pyfs.save(BytesIO(data))
+
+    with app.test_request_context():
+        # Test for presence of Content-Disposition header to make sure that
+        # it's present when as_attachment=True
+        res = legacy_pyfs.send_file('myfilename.txt', mimetype='text/plain',
+                                    checksum=checksum, as_attachment=True)
+        assert res.status_code == 200
+        assert (res.headers['Content-Disposition'] ==
+                'attachment; filename=myfilename.txt')
+
+
+def test_pyfs_send_file_xss_prevention(app, legacy_pyfs):
+    """Test XSS prevention when sending a file."""
+    data = b'<html><body><script>alert("xss");</script></body></html>'
+    uri, size, checksum = legacy_pyfs.save(BytesIO(data))
+
+    with app.test_request_context():
+        res = legacy_pyfs.send_file(
+            'myfilename.html', mimetype='text/html', checksum=checksum)
+        assert res.status_code == 200
+        h = res.headers
+        assert h['Content-Type'] == 'text/plain; charset=utf-8'
+        assert h['Content-Length'] == str(size)
+        assert h['Content-MD5'] == checksum[4:]
+        assert h['ETag'] == '"{0}"'.format(checksum)
+        # XSS prevention
+        assert h['Content-Security-Policy'] == 'default-src \'none\';'
+        assert h['X-Content-Type-Options'] == 'nosniff'
+        assert h['X-Download-Options'] == 'noopen'
+        assert h['X-Permitted-Cross-Domain-Policies'] == 'none'
+        assert h['X-Frame-Options'] == 'deny'
+        assert h['X-XSS-Protection'] == '1; mode=block'
+        assert h['Content-Disposition'] == 'inline'
+
+        # Image
+        h = legacy_pyfs.send_file('image.png', mimetype='image/png').headers
+        assert h['Content-Type'] == 'image/png'
+        assert h['Content-Disposition'] == 'inline'
+
+        # README text file
+        h = legacy_pyfs.send_file('README').headers
+        assert h['Content-Type'] == 'text/plain; charset=utf-8'
+        assert h['Content-Disposition'] == 'inline'
+
+        # Zip
+        h = legacy_pyfs.send_file('archive.zip').headers
+        assert h['Content-Type'] == 'application/octet-stream'
+        assert h['Content-Disposition'] == 'attachment; filename=archive.zip'
+
+        # PDF
+        h = legacy_pyfs.send_file('doc.pdf').headers
+        assert h['Content-Type'] == 'application/octet-stream'
+        assert h['Content-Disposition'] == 'attachment; filename=doc.pdf'
+
+
+def test_pyfs_send_file_fail(app, legacy_pyfs):
+    """Test a failing file send."""
+    legacy_pyfs.save(BytesIO(b'content'))
+
+    with patch('invenio_files_rest.storage.legacy.send_stream') as send_stream:
+        send_stream.side_effect = OSError(errno.EPERM, "Permission problem")
+        with app.test_request_context():
+            pytest.raises(StorageError, legacy_pyfs.send_file, 'test.txt')
+
+
+def test_pyfs_copy(legacy_pyfs, dummy_location):
+    """Test copying a file."""
+    s = PyFSFileStorage(join(dummy_location.uri, 'anotherpath/data'))
+    s.save(BytesIO(b'otherdata'))
+
+    legacy_pyfs.copy(s)
+    fp = legacy_pyfs.open()
+    assert fp.read() == b'otherdata'
+
+
+def test_non_unicode_filename(app, legacy_pyfs):
+    """Test sending a non-ASCII filename in the header."""
+    data = b'HelloWorld'
+    uri, size, checksum = legacy_pyfs.save(BytesIO(data))
+
+    with app.test_request_context():
+        res = legacy_pyfs.send_file(
+            u'żółć.dat', mimetype='application/octet-stream',
+            checksum=checksum)
+        assert res.status_code == 200
+        assert set(res.headers['Content-Disposition'].split('; ')) == \
+            set(["attachment", "filename=zoc.dat",
+                 "filename*=UTF-8''%C5%BC%C3%B3%C5%82%C4%87.dat"])
+
+    with app.test_request_context():
+        res = legacy_pyfs.send_file(
+            'żółć.txt', mimetype='text/plain', checksum=checksum)
+        assert res.status_code == 200
+        assert res.headers['Content-Disposition'] == 'inline'
diff --git a/tests/test_models_multipart.py b/tests/test_models_multipart.py
index ba4fedb7..8105b878 100644
--- a/tests/test_models_multipart.py
+++ b/tests/test_models_multipart.py
@@ -55,7 +55,6 @@ def test_part_creation(app, db, bucket, get_md5):
     mp = MultipartObject.create(bucket, 'test.txt', 5, 2)
     db.session.commit()
     assert bucket.size == 5
-    Part.create(mp, 2, stream=BytesIO(b'p'))
     Part.create(mp, 0, stream=BytesIO(b'p1'))
     Part.create(mp, 1, stream=BytesIO(b'p2'))
diff --git a/tests/test_storage.py b/tests/test_storage.py
index a8f148ef..1d40036f 100644
--- a/tests/test_storage.py
+++ b/tests/test_storage.py
@@ -1,8 +1,8 @@
-
 # -*- coding: utf-8 -*-
 #
 # This file is part of Invenio.
-# Copyright (C) 2016-2019 CERN.
+# Copyright (C) 2016-2020 CERN.
+# Copyright (C) 2020 Cottage Labs LLP.
 #
 # Invenio is free software; you can redistribute it and/or modify it
 # under the terms of the MIT License; see LICENSE file for more details.
@@ -27,9 +27,9 @@
 
 def test_storage_interface():
     """Test storage interface."""
-    s = FileStorage()
+    s = FileStorage('some-path')
     pytest.raises(NotImplementedError, s.open)
-    pytest.raises(NotImplementedError, s.initialize)
+    pytest.raises(NotImplementedError, s.initialize, 'file:///some/path')
     pytest.raises(NotImplementedError, s.delete)
    pytest.raises(NotImplementedError, s.save, None)
     pytest.raises(NotImplementedError, s.update, None)
@@ -40,15 +40,15 @@ def test_pyfs_initialize(pyfs, pyfs_testpath):
     """Test init of files."""
     # Create file object.
     assert not exists(pyfs_testpath)
-    uri, size, checksum = pyfs.initialize(size=100)
+    result = pyfs.initialize(size=100)
 
-    assert size == 100
-    assert checksum is None
-    assert os.stat(pyfs_testpath).st_size == size
+    assert result.get('size') == 100
+    assert 'checksum' not in result
+    assert os.stat(pyfs_testpath).st_size == result['size']
 
-    uri, size, checksum = pyfs.initialize()
-    assert size == 0
-    assert size == os.stat(pyfs_testpath).st_size
+    result = pyfs.initialize()
+    assert result.get('size') == 0
+    assert result['size'] == os.stat(pyfs_testpath).st_size
 
 
 def test_pyfs_delete(app, db, dummy_location):
@@ -75,11 +75,8 @@ def test_pyfs_delete_fail(pyfs, pyfs_testpath):
 def test_pyfs_save(pyfs, pyfs_testpath, get_md5):
     """Test basic save operation."""
     data = b'somedata'
-    uri, size, checksum = pyfs.save(BytesIO(data))
+    pyfs.save(BytesIO(data))
 
-    assert uri == pyfs_testpath
-    assert size == len(data)
-    assert checksum == get_md5(data)
     assert exists(pyfs_testpath)
     assert open(pyfs_testpath, 'rb').read() == data
@@ -110,7 +107,7 @@ def callback(total, size):
         counter['size'] = size
 
-    uri, size, checksum = pyfs.save(
+    result = pyfs.save(
         BytesIO(data), progress_callback=callback)
 
     assert counter['size'] == len(data)
@@ -119,11 +116,11 @@ def callback(total, size):
 def test_pyfs_save_limits(pyfs):
     """Test progress callback."""
     data = b'somedata'
-    uri, size, checksum = pyfs.save(BytesIO(data), size=len(data))
-    assert size == len(data)
+    result = pyfs.save(BytesIO(data), size=len(data))
+    assert result['size'] == len(data)
 
-    uri, size, checksum = pyfs.save(BytesIO(data), size_limit=len(data))
-    assert size == len(data)
+    result = pyfs.save(BytesIO(data), size_limit=len(data))
+    assert result['size'] == len(data)
 
     # Size doesn't match
     pytest.raises(
@@ -158,7 +155,9 @@ def test_pyfs_update_fail(pyfs, pyfs_testpath, get_md5):
     """Test update of file."""
     def fail_callback(total, size):
         assert exists(pyfs_testpath)
-        raise Exception('Something bad happened')
+        # Fail only part-way through the update.
+        if total > 2:
+            raise Exception('Something bad happened')
 
     pyfs.initialize(size=100)
     pyfs.update(BytesIO(b'ab'), seek=0, size=2)
@@ -217,17 +216,20 @@ def callback(total, size):
 def test_pyfs_send_file(app, pyfs):
     """Test send file."""
     data = b'sendthis'
-    uri, size, checksum = pyfs.save(BytesIO(data))
+    result = pyfs.save(BytesIO(data))
 
     with app.test_request_context():
         res = pyfs.send_file(
-            'myfilename.txt', mimetype='text/plain', checksum=checksum)
+            'myfilename.txt',
+            mimetype='text/plain',
+            checksum=result['checksum']
+        )
         assert res.status_code == 200
         h = res.headers
         assert h['Content-Type'] == 'text/plain; charset=utf-8'
-        assert h['Content-Length'] == str(size)
-        assert h['Content-MD5'] == checksum[4:]
-        assert h['ETag'] == '"{0}"'.format(checksum)
+        assert h['Content-Length'] == str(result['size'])
+        assert h['Content-MD5'] == result['checksum'][4:]
+        assert h['ETag'] == '"{0}"'.format(result['checksum'])
 
         # Content-Type: application/octet-stream
         # ETag: "b234ee4d69f5fce4486a80fdaf4a4263"
@@ -244,7 +246,7 @@ def test_pyfs_send_file(app, pyfs):
 
         # Test for absence of Content-Disposition header to make sure that
         # it's not present when as_attachment=False
         res = pyfs.send_file('myfilename.txt', mimetype='text/plain',
-                             checksum=checksum, as_attachment=False)
+                             checksum=result['checksum'], as_attachment=False)
         assert res.status_code == 200
         assert 'attachment' not in res.headers['Content-Disposition']
@@ -252,13 +254,13 @@ def test_pyfs_send_file(app, pyfs):
 def test_pyfs_send_file_for_download(app, pyfs):
     """Test send file."""
     data = b'sendthis'
-    uri, size, checksum = pyfs.save(BytesIO(data))
+    result = pyfs.save(BytesIO(data))
 
     with app.test_request_context():
         # Test for presence of Content-Disposition header to make sure that
         # it's present when as_attachment=True
         res = pyfs.send_file('myfilename.txt', mimetype='text/plain',
-                             checksum=checksum, as_attachment=True)
+                             checksum=result['checksum'], as_attachment=True)
         assert res.status_code == 200
         assert (res.headers['Content-Disposition'] ==
                 'attachment; filename=myfilename.txt')
@@ -267,17 +269,20 @@ def test_pyfs_send_file_for_download(app, pyfs):
 def test_pyfs_send_file_xss_prevention(app, pyfs):
     """Test send file."""
     data = b'<html><body><script>alert("xss");</script></body></html>'
-    uri, size, checksum = pyfs.save(BytesIO(data))
+    result = pyfs.save(BytesIO(data))
 
     with app.test_request_context():
         res = pyfs.send_file(
-            'myfilename.html', mimetype='text/html', checksum=checksum)
+            'myfilename.html',
+            mimetype='text/html',
+            checksum=result['checksum']
+        )
         assert res.status_code == 200
         h = res.headers
         assert h['Content-Type'] == 'text/plain; charset=utf-8'
-        assert h['Content-Length'] == str(size)
-        assert h['Content-MD5'] == checksum[4:]
-        assert h['ETag'] == '"{0}"'.format(checksum)
+        assert h['Content-Length'] == str(result['size'])
+        assert h['Content-MD5'] == result['checksum'][4:]
+        assert h['ETag'] == '"{0}"'.format(result['checksum'])
         # XSS prevention
         assert h['Content-Security-Policy'] == 'default-src \'none\';'
         assert h['X-Content-Type-Options'] == 'nosniff'
@@ -331,12 +336,12 @@ def test_pyfs_copy(pyfs, dummy_location):
 def test_non_unicode_filename(app, pyfs):
     """Test sending the non-unicode filename in the header."""
     data = b'HelloWorld'
-    uri, size, checksum = pyfs.save(BytesIO(data))
+    result = pyfs.save(BytesIO(data))
 
     with app.test_request_context():
         res = pyfs.send_file(
             u'żółć.dat', mimetype='application/octet-stream',
-            checksum=checksum)
+            checksum=result['checksum'])
         assert res.status_code == 200
         assert set(res.headers['Content-Disposition'].split('; ')) == \
             set(["attachment", "filename=zoc.dat",
@@ -344,6 +349,6 @@ def test_non_unicode_filename(app, pyfs):
 
     with app.test_request_context():
         res = pyfs.send_file(
-            'żółć.txt', mimetype='text/plain', checksum=checksum)
+            'żółć.txt', mimetype='text/plain', checksum=result['checksum'])
         assert res.status_code == 200
         assert res.headers['Content-Disposition'] == 'inline'