Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

storage: customizable checksum algorithms for fixity #187

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion invenio_files_rest/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
# as an Intergovernmental Organization or submit itself to any jurisdiction.

"""Invenio Files Rest module configuration file."""

import hashlib
from datetime import timedelta

MAX_CONTENT_LENGTH = 16 * 1024 * 1024
Expand Down Expand Up @@ -132,3 +132,16 @@

FILES_REST_TASK_WAIT_MAX_SECONDS = 600
"""Maximum number of seconds to wait for a task to finish."""

FILES_REST_SUPPORTED_CHECKSUM_ALGORITHMS = dict(md5=hashlib.md5)
"""Mapping of checksum algorithm name to its ``hashlib`` constructor.

Only algorithms listed here can be used to compute and verify file checksums.
"""

FILES_REST_CHECKSUM_ALGORITHM = 'md5'
"""Checksum algorithm applied to all newly uploaded files.

.. note::
    The value must be one of the keys of
    ``FILES_REST_SUPPORTED_CHECKSUM_ALGORITHMS``.
"""
13 changes: 13 additions & 0 deletions invenio_files_rest/ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,19 @@ def upload_factories(self):
self.app.config.get('FILES_REST_UPLOAD_FACTORIES', [])
]

@cached_property
def supported_checksums(self):
    """Return the supported checksum algorithms from the app config.

    The value stored under ``FILES_REST_SUPPORTED_CHECKSUM_ALGORITHMS`` is
    resolved through ``load_or_import_from_config``.
    """
    config_key = 'FILES_REST_SUPPORTED_CHECKSUM_ALGORITHMS'
    return load_or_import_from_config(config_key, app=self.app)

@cached_property
def checksum_algorithm(self):
    """Return the configured default checksum algorithm name.

    Reads ``FILES_REST_CHECKSUM_ALGORITHM`` from the application config and
    falls back to ``'md5'`` when the key is not set.
    """
    config = self.app.config
    return config.get('FILES_REST_CHECKSUM_ALGORITHM', 'md5')

def multipart_partfactory(self):
"""Get factory for content length, part number, stream for a part."""
for factory in self.part_factories:
Expand Down
6 changes: 5 additions & 1 deletion invenio_files_rest/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -727,9 +727,13 @@ def verify_checksum(self, progress_callback=None, chunk_size=None,
``storage().checksum``.
"""
try:
checksum_kwargs = {
'algo': self.checksum.split(':', 1)[0],
**(checksum_kwargs or {})
}
real_checksum = self.storage(**kwargs).checksum(
progress_callback=progress_callback, chunk_size=chunk_size,
**(checksum_kwargs or {}))
**checksum_kwargs)
except Exception as exc:
current_app.logger.exception(str(exc))
if throws:
Expand Down
18 changes: 13 additions & 5 deletions invenio_files_rest/storage/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,12 @@

from __future__ import absolute_import, print_function

import hashlib
from calendar import timegm
from functools import partial

from ..errors import FileSizeError, StorageError, UnexpectedFileSizeError
from ..helpers import chunk_size_or_default, compute_checksum, send_stream
from ..proxies import current_files_rest


def check_sizelimit(size_limit, bytes_written, total_size):
Expand Down Expand Up @@ -145,7 +145,7 @@ def checksum(self, chunk_size=None, progress_callback=None, **kwargs):
try:
value = self._compute_checksum(
fp, size=self._size, chunk_size=None,
progress_callback=progress_callback)
progress_callback=progress_callback, **kwargs)
except StorageError:
raise
finally:
Expand All @@ -168,13 +168,21 @@ def copy(self, src, chunk_size=None, progress_callback=None):
#
# Helpers
#
def _init_hash(self):
def _init_hash(self, algo=None):
"""Initialize message digest object.

Overwrite this method if you want to use a different checksum
algorithm for your storage backend.

:param algo: Name of the checksum algorithm. When falsy, the value of
``current_files_rest.checksum_algorithm`` is used instead.
:returns: A tuple ``(algo, m)`` holding the algorithm name and the object
created by calling the matching entry of
``current_files_rest.supported_checksums``.
:raises AttributeError: If no entry for ``algo`` is found in
``current_files_rest.supported_checksums``.
"""
return 'md5', hashlib.md5()
# No explicit algorithm requested: use the configured default.
if not algo:
algo = current_files_rest.checksum_algorithm

# Look up the constructor registered for this algorithm name.
m = current_files_rest.supported_checksums.get(algo)
if not m:
raise AttributeError(
'Unsupported checksum algorithm: {}'.format(algo))

# Call the constructor so callers get a fresh digest object.
return algo, m()

def _compute_checksum(self, stream, size=None, chunk_size=None,
progress_callback=None, **kwargs):
Expand All @@ -189,7 +197,7 @@ def _compute_checksum(self, stream, size=None, chunk_size=None,
progress_callback = None

try:
algo, m = self._init_hash()
algo, m = self._init_hash(algo=kwargs.pop('algo', None))
return compute_checksum(
stream, algo, m,
chunk_size=chunk_size,
Expand Down
15 changes: 15 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,11 @@ def base_app():
FILES_REST_MULTIPART_MAX_PARTS=100,
FILES_REST_TASK_WAIT_INTERVAL=0.1,
FILES_REST_TASK_WAIT_MAX_SECONDS=1,
FILES_REST_SUPPORTED_CHECKSUM_ALGORITHMS={
'md5': hashlib.md5,
'sha256': hashlib.sha256
},
FILES_REST_CHECKSUM_ALGORITHM='md5'
)

FlaskCeleryExt(app_)
Expand Down Expand Up @@ -366,6 +371,16 @@ def inner(data, prefix=True):
return inner


@pytest.fixture()
def get_sha256():
    """Provide a helper that computes the SHA256 hex digest of bytes.

    The helper returns the digest prefixed with ``sha256:`` unless it is
    called with ``prefix=False``.
    """
    def inner(data, prefix=True):
        digest = hashlib.sha256(data).hexdigest()
        if prefix:
            return "sha256:{0}".format(digest)
        return digest
    return inner


@pytest.fixture()
def get_json():
"""Get JSON from response."""
Expand Down
50 changes: 47 additions & 3 deletions tests/test_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ def fail_callback(total, size):
assert content[4:6] != 'ef'


def test_pyfs_checksum(get_md5):
def test_pyfs_checksum(app, get_md5):
"""Test fixity."""
# Compute checksum of the license file.
with open('LICENSE', 'rb') as fp:
Expand All @@ -210,16 +210,60 @@ def callback(total, size):

# Now do it with storage interface
s = PyFSFileStorage('LICENSE', size=getsize('LICENSE'))
assert checksum == s.checksum(chunk_size=2, progress_callback=callback)
with app.test_request_context():
assert checksum == s.checksum(chunk_size=2,
progress_callback=callback)

assert counter['size'] == getsize('LICENSE')

# No size provided, means progress callback isn't called
counter['size'] = 0
s = PyFSFileStorage('LICENSE')
assert checksum == s.checksum(chunk_size=2, progress_callback=callback)
with app.test_request_context():
assert checksum == s.checksum(chunk_size=2,
progress_callback=callback)

assert counter['size'] == 0


def test_pyfs_custom_checksums(app, get_md5, get_sha256):
    """Check that the storage honours the configured checksum algorithm."""
    files_rest = app.extensions['invenio-files-rest']
    license_size = getsize('LICENSE')

    # Reference digests of the license file, computed without the storage.
    with open('LICENSE', 'rb') as license_file:
        content = license_file.read()
    expected_md5 = get_md5(content)
    expected_sha256 = get_sha256(content)

    progress = dict(size=0)

    def callback(total, size):
        progress['size'] = size

    # With the default algorithm (MD5) the storage must return the MD5 digest
    # and report progress up to the full file size.
    md5_storage = PyFSFileStorage('LICENSE', size=license_size)
    with app.test_request_context():
        md5_result = md5_storage.checksum(
            chunk_size=2, progress_callback=callback)
        assert expected_md5 == md5_result
    assert progress['size'] == license_size

    # Switch the default algorithm to SHA-256.
    files_rest.checksum_algorithm = 'sha256'
    assert files_rest.checksum_algorithm == 'sha256'

    # The storage must now produce SHA-256 digests.
    sha256_storage = PyFSFileStorage('LICENSE')
    with app.test_request_context():
        sha256_result = sha256_storage.checksum(
            chunk_size=2, progress_callback=callback)
        assert expected_sha256 == sha256_result

    # An algorithm name that is not supported must surface as StorageError.
    files_rest.checksum_algorithm = '256ahs'
    broken_storage = PyFSFileStorage('LICENSE')
    with app.test_request_context():
        with pytest.raises(StorageError):
            broken_storage.checksum(progress_callback=callback)


def test_pyfs_checksum_fail():
"""Test fixity problems."""
# Raise an error during checksum calculation
Expand Down
14 changes: 14 additions & 0 deletions tests/test_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,20 @@ def test_verify_checksum(app, db, dummy_location):
f = FileInstance.query.get(file_id)
assert f.last_check is None

# Create another file using MD5 checksum
with open('LICENSE', 'rb') as fp:
obj = ObjectVersion.create(b1, 'LICENSE', stream=fp)
db.session.commit()
file_id = obj.file_id

# Switch the default algorithm to SHA256; verifying the existing MD5 checksum must still work
app.extensions['invenio-files-rest'].checksum_algorithm = 'sha256'
verify_checksum(str(file_id))

f = FileInstance.query.get(file_id)
assert f.last_check_at
assert f.last_check is True


def test_schedule_checksum_verification(app, db, dummy_location):
"""Test file checksum verification scheduling celery task."""
Expand Down