Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP fix derive logic and redundant hash calc #351

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 12 additions & 29 deletions internetarchive/item.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,7 @@

from internetarchive.utils import (IdentifierListAsItems, get_md5,
chunk_generator, IterableToFileAdapter,
iter_directory, recursive_file_count,
norm_filepath)
iter_directory, norm_filepath)
from internetarchive.files import File
from internetarchive.iarequest import MetadataRequest, S3Request
from internetarchive.auth import S3Auth
Expand Down Expand Up @@ -1176,45 +1175,32 @@ def upload(self, files,
"""
queue_derive = True if queue_derive is None else queue_derive
remote_dir_name = None
total_files = None

if isinstance(files, dict):
if files.get('name'):
files = [files]
total_files = 1
else:
files = list(files.items())
if not isinstance(files, (list, tuple)):
files = [files]
if all(isinstance(f, dict) and f.get('name') for f in files):
total_files = len(files)

responses = []
file_index = 0
if queue_derive and total_files is None:
if checksum:
total_files = recursive_file_count(files, item=self, checksum=True)
else:
total_files = recursive_file_count(files, item=self, checksum=False)
file_metadata = None
for f in files:

if isinstance(f, dict):
if f.get('name'):
file_metadata = f.copy()
del file_metadata['name']
f = f['name']

if ((isinstance(f, string_types) and is_dir(f))
or (isinstance(f, tuple) and is_dir(f[-1]))):
if isinstance(f, tuple):
remote_dir_name = f[0].strip('/')
f = f[-1]
for filepath, key in iter_directory(f):
file_index += 1
# Set derive header if queue_derive is True,
# and this is the last request being made.
if queue_derive is True and file_index >= total_files:
_queue_derive = True
else:
_queue_derive = False
if not f.endswith('/'):
if remote_dir_name:
key = '{0}{1}/{2}'.format(remote_dir_name, f, key)
Expand All @@ -1230,7 +1216,7 @@ def upload(self, files,
headers=headers,
access_key=access_key,
secret_key=secret_key,
queue_derive=_queue_derive,
queue_derive=False,
verbose=verbose,
verify=verify,
checksum=checksum,
Expand All @@ -1242,15 +1228,6 @@ def upload(self, files,
request_kwargs=request_kwargs)
responses.append(resp)
else:
file_index += 1
# Set derive header if queue_derive is True,
# and this is the last request being made.
# if queue_derive is True and file_index >= len(files):
if queue_derive is True and file_index >= total_files:
_queue_derive = True
else:
_queue_derive = False

if not isinstance(f, (list, tuple)):
key, body = (None, f)
else:
Expand All @@ -1264,7 +1241,7 @@ def upload(self, files,
headers=headers,
access_key=access_key,
secret_key=secret_key,
queue_derive=_queue_derive,
queue_derive=False,
verbose=verbose,
verify=verify,
checksum=checksum,
Expand All @@ -1275,6 +1252,12 @@ def upload(self, files,
validate_identifier=validate_identifier,
request_kwargs=request_kwargs)
responses.append(resp)

if queue_derive:
# Came this far without any exceptions raised, so all uploads
# probably completed successfully. Derive now.
self.derive()

return responses


Expand Down
48 changes: 0 additions & 48 deletions internetarchive/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,54 +221,6 @@ def iter_directory(directory):
yield (filepath, key)


def recursive_file_count(files, item=None, checksum=False):
    """Given a filepath or list of filepaths, return the total number of files.

    :param files: A single local path or file-like object, a list/set of
                  them, a list of ``(key, local_path)`` tuples, or a dict
                  whose values are the local paths. Directories are expanded
                  with ``iter_directory`` and each yielded file is counted.
    :param item: Object whose ``files`` attribute iterates over dict-like
                 entries that may carry an ``md5`` key. Only consulted when
                 ``checksum`` is ``True``; if it is ``None`` there is nothing
                 to compare against and every file is counted.
    :param checksum: When ``True``, local files whose md5 (as computed by
                     ``get_md5``) matches an md5 listed in ``item.files`` are
                     not counted.
    :returns: The number of files counted.
    """
    if isinstance(files, dict):
        # Make sure to use local filenames (the values), not the keys.
        # This check has to run before the list-wrapping below; otherwise a
        # dict gets wrapped in a list and is never recognised as a dict.
        local_files = list(files.values())
    else:
        if not isinstance(files, (list, set)):
            files = [files]
        # Sets are not subscriptable, so materialise before peeking at [0].
        files = list(files)
        # The emptiness guard avoids an IndexError on an empty input.
        if files and isinstance(files[0], tuple):
            local_files = list(dict(files).values())
        else:
            local_files = files

    # A set gives constant-time membership tests for the md5 comparison.
    if checksum is True and item is not None:
        remote_md5s = set(f.get('md5') for f in item.files)
    else:
        remote_md5s = set()

    total_files = 0
    for f in local_files:
        # Named ``path_is_dir`` so the module-level ``is_dir`` function is
        # not shadowed inside this function.
        try:
            path_is_dir = os.path.isdir(f)
        except TypeError:
            # Not a path; it may be a sequence whose first element is one.
            try:
                f = f[0]
                path_is_dir = os.path.isdir(f)
            except (AttributeError, TypeError):
                path_is_dir = False
        if path_is_dir:
            for x, _ in iter_directory(f):
                if checksum is True:
                    with open(x, 'rb') as fh:
                        lmd5 = get_md5(fh)
                    if lmd5 in remote_md5s:
                        continue
                total_files += 1
        else:
            if checksum is True:
                # Only the open() call is guarded, so a TypeError raised
                # inside get_md5 is not mistaken for "f is file-like".
                try:
                    fh = open(f, 'rb')
                except TypeError:
                    # Support file-like objects.
                    lmd5 = get_md5(f)
                else:
                    with fh:
                        lmd5 = get_md5(fh)
                if lmd5 in remote_md5s:
                    continue
            total_files += 1
    return total_files


def is_dir(obj):
"""Special is_dir function to handle file-like object cases that
cannot be stat'd"""
Expand Down