Skip to content

Commit

Permalink
WIP fix derive logic and redundant hash calc
Browse files Browse the repository at this point in the history
Removed all file counting whatsoever; the derive is now simply queued at the end.
  • Loading branch information
Dobatymo committed Mar 28, 2022
1 parent e3130ac commit a8d781f
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 77 deletions.
41 changes: 12 additions & 29 deletions internetarchive/item.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,7 @@

from internetarchive.utils import (IdentifierListAsItems, get_md5,
chunk_generator, IterableToFileAdapter,
iter_directory, recursive_file_count,
norm_filepath)
iter_directory, norm_filepath)
from internetarchive.files import File
from internetarchive.iarequest import MetadataRequest, S3Request
from internetarchive.auth import S3Auth
Expand Down Expand Up @@ -1176,45 +1175,32 @@ def upload(self, files,
"""
queue_derive = True if queue_derive is None else queue_derive
remote_dir_name = None
total_files = None

if isinstance(files, dict):
if files.get('name'):
files = [files]
total_files = 1
else:
files = list(files.items())
if not isinstance(files, (list, tuple)):
files = [files]
if all(isinstance(f, dict) and f.get('name') for f in files):
total_files = len(files)

responses = []
file_index = 0
if queue_derive and total_files is None:
if checksum:
total_files = recursive_file_count(files, item=self, checksum=True)
else:
total_files = recursive_file_count(files, item=self, checksum=False)
file_metadata = None
for f in files:

if isinstance(f, dict):
if f.get('name'):
file_metadata = f.copy()
del file_metadata['name']
f = f['name']

if ((isinstance(f, string_types) and is_dir(f))
or (isinstance(f, tuple) and is_dir(f[-1]))):
if isinstance(f, tuple):
remote_dir_name = f[0].strip('/')
f = f[-1]
for filepath, key in iter_directory(f):
file_index += 1
# Set derive header if queue_derive is True,
# and this is the last request being made.
if queue_derive is True and file_index >= total_files:
_queue_derive = True
else:
_queue_derive = False
if not f.endswith('/'):
if remote_dir_name:
key = '{0}{1}/{2}'.format(remote_dir_name, f, key)
Expand All @@ -1230,7 +1216,7 @@ def upload(self, files,
headers=headers,
access_key=access_key,
secret_key=secret_key,
queue_derive=_queue_derive,
queue_derive=False,
verbose=verbose,
verify=verify,
checksum=checksum,
Expand All @@ -1242,15 +1228,6 @@ def upload(self, files,
request_kwargs=request_kwargs)
responses.append(resp)
else:
file_index += 1
# Set derive header if queue_derive is True,
# and this is the last request being made.
# if queue_derive is True and file_index >= len(files):
if queue_derive is True and file_index >= total_files:
_queue_derive = True
else:
_queue_derive = False

if not isinstance(f, (list, tuple)):
key, body = (None, f)
else:
Expand All @@ -1264,7 +1241,7 @@ def upload(self, files,
headers=headers,
access_key=access_key,
secret_key=secret_key,
queue_derive=_queue_derive,
queue_derive=False,
verbose=verbose,
verify=verify,
checksum=checksum,
Expand All @@ -1275,6 +1252,12 @@ def upload(self, files,
validate_identifier=validate_identifier,
request_kwargs=request_kwargs)
responses.append(resp)

if queue_derive:
# Came this far without any exceptions raised, so all uploads
# probably completed successfully. Derive now.
self.derive()

return responses


Expand Down
48 changes: 0 additions & 48 deletions internetarchive/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,54 +221,6 @@ def iter_directory(directory):
yield (filepath, key)


def recursive_file_count(files, item=None, checksum=False):
    """Given a filepath or list of filepaths, return the total number of files.

    :param files: A single entry, or a list/set/dict of entries. An entry
                  may be a local path, a ``(remote_name, local_path)``
                  tuple, or a file-like object. For a dict, the values are
                  used as the local entries. Directories are expanded with
                  ``iter_directory`` and each contained file is counted.

    :param item: Object whose ``files`` attribute is an iterable of
                 mappings; their ``'md5'`` values are the checksums to
                 skip. Only consulted when ``checksum`` is ``True``. If it
                 is ``None``, nothing is skipped.

    :param checksum: When exactly ``True``, hash every candidate with
                     ``get_md5`` and leave out those whose digest is
                     already present in ``item.files``.

    :returns: The number of files counted.
    """
    if isinstance(files, dict):
        # Must be tested before the list-wrapping below, otherwise the
        # dict would be wrapped and never recognised as a mapping.
        # Make sure to use local filenames.
        local_files = list(files.values())
    else:
        if not isinstance(files, (list, set)):
            files = [files]
        # Peek at one element without indexing so that sets and empty
        # collections do not raise.
        first = next(iter(files), None)
        if isinstance(first, tuple):
            # (remote_name, local_path) pairs: keep the local paths only.
            local_files = list(dict(files).values())
        else:
            local_files = list(files)

    if checksum is True and item is not None:
        # A set gives constant-time membership tests in the loops below.
        md5s = {f.get('md5') for f in item.files}
    else:
        md5s = set()

    total_files = 0
    for f in local_files:
        try:
            path_is_dir = os.path.isdir(f)
        except TypeError:
            # Not path-like: it may be a sequence whose first element is
            # the path, or a file-like object that cannot be stat'd.
            try:
                f = f[0]
                path_is_dir = os.path.isdir(f)
            except (AttributeError, TypeError):
                path_is_dir = False

        if path_is_dir:
            for x, _ in iter_directory(f):
                if checksum is True:
                    with open(x, 'rb') as fh:
                        lmd5 = get_md5(fh)
                    if lmd5 in md5s:
                        continue
                total_files += 1
        else:
            if checksum is True:
                try:
                    with open(f, 'rb') as fh:
                        lmd5 = get_md5(fh)
                except TypeError:
                    # Support file-like objects.
                    lmd5 = get_md5(f)
                if lmd5 in md5s:
                    continue
            total_files += 1
    return total_files


def is_dir(obj):
"""Special is_dir function to handle file-like object cases that
cannot be stat'd"""
Expand Down

0 comments on commit a8d781f

Please sign in to comment.