Improve sync of files with special characters
const-cloudinary committed Nov 9, 2020
1 parent 0473eab commit 6fd7447
Showing 6 changed files with 100 additions and 35 deletions.
91 changes: 70 additions & 21 deletions cloudinary_cli/modules/sync.py
@@ -1,20 +1,21 @@
 import logging
 from functools import reduce
 from itertools import product
-from os import remove
-from os.path import join as path_join, abspath
+from os import path, remove
 
 from click import command, argument, option, style
 from cloudinary import api
 
 from cloudinary_cli.utils.api_utils import query_cld_folder, upload_file, download_file
 from cloudinary_cli.utils.file_utils import walk_dir, delete_empty_dirs, get_destination_folder
-from cloudinary_cli.utils.json_utils import print_json
-from cloudinary_cli.utils.utils import logger, run_tasks_concurrently, get_user_action
+from cloudinary_cli.utils.json_utils import print_json, read_json_from_file, write_json_to_file
+from cloudinary_cli.utils.utils import logger, run_tasks_concurrently, get_user_action, invert_dict
 
 _DEFAULT_DELETION_BATCH_SIZE = 30
 _DEFAULT_CONCURRENT_WORKERS = 30
 
+_SYNC_META_FILE = '.cld-sync'
+
 
 @command("sync",
          short_help="Synchronize between a local directory and a Cloudinary folder.",
@@ -50,31 +51,52 @@ class SyncDir:
     def __init__(self, local_dir, remote_dir, include_hidden, concurrent_workers, force, keep_deleted,
                  deletion_batch_size):
         self.local_dir = local_dir
-        self.remote_dir = remote_dir
+        self.remote_dir = remote_dir.strip('/')
         self.include_hidden = include_hidden
         self.concurrent_workers = concurrent_workers
         self.force = force
         self.keep_unique = keep_deleted
         self.deletion_batch_size = deletion_batch_size
 
+        self.sync_meta_file = path.join(self.local_dir, _SYNC_META_FILE)
+
         self.verbose = logger.getEffectiveLevel() < logging.INFO
 
-        self.local_files = walk_dir(abspath(self.local_dir), include_hidden)
+        self.local_files = walk_dir(path.abspath(self.local_dir), include_hidden)
         logger.info(f"Found {len(self.local_files)} items in local folder '{local_dir}'")
 
         self.remote_files = query_cld_folder(self.remote_dir)
         logger.info(f"Found {len(self.remote_files)} items in Cloudinary folder '{self.remote_dir}'")
 
         local_file_names = self.local_files.keys()
         remote_file_names = self.remote_files.keys()
+        """
+        Cloudinary is a very permissive service. When uploaded files contain invalid characters,
+        unicode characters, etc., Cloudinary makes a best effort to store them, usually by
+        sanitizing the file names and stripping the invalid characters. Although that is a good
+        default for general use, it is a poor fit for syncing a local folder with Cloudinary,
+        since the directories would always be out of sync.
+        To overcome this limitation, cloudinary-cli keeps a hidden .cld-sync file in the sync
+        directory that maps the divergent file names, tracking them so that both sync directions work.
+        """
+        self.diverse_file_names = read_json_from_file(self.sync_meta_file, does_not_exist_ok=True)
+        inverted_diverse_file_names = invert_dict(self.diverse_file_names)
+
+        cloudinarized_local_file_names = [self.diverse_file_names.get(f, f) for f in local_file_names]
+        self.recovered_remote_files = {inverted_diverse_file_names.get(f, f): dt for f, dt in self.remote_files.items()}
 
-        self.unique_remote_file_names = remote_file_names - local_file_names
-        self.unique_local_file_names = local_file_names - remote_file_names
+        self.unique_remote_file_names = remote_file_names - cloudinarized_local_file_names
+        self.unique_local_file_names = local_file_names - self.recovered_remote_files.keys()
         common_file_names = local_file_names - self.unique_local_file_names
 
-        self.out_of_sync_file_names = self._get_out_of_sync_file_names(common_file_names)
+        self.out_of_sync_local_file_names = self._get_out_of_sync_file_names(common_file_names)
+        self.out_of_sync_remote_file_names = set(self.diverse_file_names.get(f, f) for f in
+                                                 self.out_of_sync_local_file_names)
 
-        skipping = len(common_file_names) - len(self.out_of_sync_file_names)
+        skipping = len(common_file_names) - len(self.out_of_sync_local_file_names)
 
         if skipping:
             logger.info(f"Skipping {skipping} items")
@@ -83,12 +105,16 @@ def _get_out_of_sync_file_names(self, common_file_names):
logger.debug("\nCalculating differences...\n")
out_of_sync_file_names = set()
for f in common_file_names:
if self.local_files[f]['etag'] != self.remote_files[f]['etag']:
logger.warning(f"{f} is out of sync")
logger.debug(f"Local etag: {self.local_files[f]['etag']}. Remote etag: {self.remote_files[f]['etag']}")
local_etag = self.local_files[f]['etag']
remote_etag = self.recovered_remote_files[f]['etag']
if local_etag != remote_etag:
logger.warning(f"{f} is out of sync" +
(f" with '{self.diverse_file_names[f]}" if f in self.diverse_file_names else ""))
logger.debug(f"Local etag: {local_etag}. Remote etag: {remote_etag}")
out_of_sync_file_names.add(f)
continue
logger.debug(f"{f} is in sync")
logger.debug(f"'{f}' is in sync" +
(f" with '{self.diverse_file_names[f]}" if f in self.diverse_file_names else ""))

return out_of_sync_file_names

@@ -97,7 +123,7 @@ def push(self):
logger.info("Aborting...")
return False

files_to_push = self.unique_local_file_names | self.out_of_sync_file_names
files_to_push = self.unique_local_file_names | self.out_of_sync_local_file_names
if not files_to_push:
return True

@@ -109,14 +135,37 @@ def push(self):
             'invalidate': True,
             'resource_type': 'auto'
         }
+        upload_results = {}
         uploads = []
         for file in files_to_push:
             folder = get_destination_folder(self.remote_dir, file)
 
-            uploads.append((self.local_files[file]['path'], {**options, 'folder': folder}))
+            uploads.append((self.local_files[file]['path'], {**options, 'folder': folder}, upload_results))
 
         run_tasks_concurrently(upload_file, uploads, self.concurrent_workers)
 
+        self.save_sync_meta_file(upload_results)
+
+    def save_sync_meta_file(self, upload_results):
+        diverse_filenames = {}
+        for local_path, remote_path in upload_results.items():
+            local = path.relpath(local_path, self.local_dir)
+            remote = path.relpath(remote_path, self.remote_dir)
+            if local != remote:
+                diverse_filenames[local] = remote
+
+        # filter out outdated meta file entries
+        current_diverse_files = {k: v for k, v in self.diverse_file_names.items() if k in self.local_files.keys()}
+
+        if diverse_filenames or current_diverse_files != self.diverse_file_names:
+            current_diverse_files.update(diverse_filenames)
+            try:
+                write_json_to_file(current_diverse_files, self.sync_meta_file)
+                logger.debug(f"Updated '{self.sync_meta_file}' file")
+            except Exception as e:
+                # The meta file is not critical for the sync itself; if we cannot write it, we just log a warning
+                logger.warning(f"Failed updating '{self.sync_meta_file}' file: {e}")
+
     def _handle_unique_remote_files(self):
         handled = self._handle_files_deletion(len(self.unique_remote_file_names), "remote")
         if handled is not None:
@@ -155,7 +204,7 @@ def pull(self):
         if not self._handle_unique_local_files():
             return False
 
-        files_to_pull = self.unique_remote_file_names | self.out_of_sync_file_names
+        files_to_pull = self.unique_remote_file_names | self.out_of_sync_remote_file_names
 
         if not files_to_pull:
             return True
@@ -164,7 +213,7 @@ def pull(self):
         downloads = []
         for file in files_to_pull:
             remote_file = self.remote_files[file]
-            local_path = abspath(path_join(self.local_dir, file))
+            local_path = path.abspath(path.join(self.local_dir, file))
 
             downloads.append((remote_file, local_path))
 
@@ -177,9 +226,9 @@ def _handle_unique_local_files(self):

logger.info(f"Deleting {len(self.unique_local_file_names)} local files...")
for file in self.unique_local_file_names:
path = abspath(self.local_files[file]['path'])
remove(path)
logger.info(f"Deleted '{path}'")
full_path = path.abspath(self.local_files[file]['path'])
remove(full_path)
logger.info(f"Deleted '{full_path}'")

logger.info("Deleting empty folders...")
delete_empty_dirs(self.local_dir)
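
The meta file mechanism above is easiest to see end to end. Here is a minimal sketch of one push round trip, using hypothetical file names; the sanitized remote name is whatever Cloudinary actually returns at upload time:

    # Hypothetical example of the .cld-sync round trip
    upload_results = {}
    upload_file('backup/my:file.jpg', {'folder': 'backup'}, upload_results)
    # upload_results might now map the local path to the sanitized remote path:
    #   {'backup/my:file.jpg': 'backup/my_file.jpg'}
    # save_sync_meta_file() then writes {'my:file.jpg': 'my_file.jpg'} into
    # .cld-sync, and the next run's __init__ uses that mapping (and its
    # inversion) to pair 'my:file.jpg' with 'my_file.jpg' instead of treating
    # both names as unique to their side.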
4 changes: 2 additions & 2 deletions cloudinary_cli/utils/api_utils.py
@@ -42,7 +42,7 @@ def query_cld_folder(folder):


 def upload_file(file_path, options, uploaded=None, skipped=None):
-    uploaded = uploaded if uploaded is not None else []
+    uploaded = uploaded if uploaded is not None else {}
     skipped = skipped if skipped is not None else []
     verbose = logger.getEffectiveLevel() < logging.INFO

@@ -55,7 +55,7 @@ def upload_file(file_path, options, uploaded=None, skipped=None):
         logger.info(style(f"Successfully uploaded {file_path} as {result['public_id']}", fg="green"))
         if verbose:
             print_json(result)
-        uploaded.append(result['public_id'])
+        uploaded[file_path] = asset_source(result)
     except Exception as e:
         log_exception(e, f"Failed uploading {file_path}")
         skipped.append(file_path)
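
Since `uploaded` now defaults to a dict rather than a list, a caller can pass in a shared mapping and read back which remote asset each local file became; a sketch with a hypothetical path:

    results = {}
    upload_file('img/logo one.png', {'folder': 'assets'}, uploaded=results)
    # On success, results maps the local path to the asset source derived from
    # the upload response by asset_source(), e.g.
    #   {'img/logo one.png': 'assets/logo_one.png'}
    # On failure, the path is appended to the `skipped` list instead.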
9 changes: 2 additions & 7 deletions cloudinary_cli/utils/config_utils.py
@@ -10,10 +10,7 @@


 def load_config():
-    if not os.path.exists(CLOUDINARY_CLI_CONFIG_FILE) or os.path.getsize(CLOUDINARY_CLI_CONFIG_FILE) < 1:
-        return {}
-
-    return read_json_from_file(CLOUDINARY_CLI_CONFIG_FILE)
+    return read_json_from_file(CLOUDINARY_CLI_CONFIG_FILE, does_not_exist_ok=True)
 
 
 def save_config(config):
@@ -68,9 +65,7 @@ def migrate_old_config():
f"please fix or remove it")
raise

new_config = load_config()
new_config.update(old_config)
save_config(new_config)
update_config(old_config)

os.remove(OLD_CLOUDINARY_CLI_CONFIG_FILE)

6 changes: 5 additions & 1 deletion cloudinary_cli/utils/file_utils.py
@@ -35,7 +35,11 @@ def is_hidden_path(filepath):


 def has_hidden_attribute(filepath):
-    st = os.stat(filepath)
+    try:
+        st = os.stat(filepath)
+    except OSError as e:
+        logger.debug(f"Failed getting os.stat for file '{filepath}': {e}")
+        return False
 
     if not hasattr(st, 'st_file_attributes'):  # not a pythonic way, but it's relevant only for windows, no need to try
         return False
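
The attribute test that follows the guarded `os.stat()` call is Windows-only; a sketch of what such a check looks like, assuming the stat succeeded (the constant comes from the standard stat module; this is an illustration, not the exact code of the commit):

    import stat

    def is_windows_hidden(st):
        # st_file_attributes exists only on Windows stat results
        return bool(getattr(st, 'st_file_attributes', 0) & stat.FILE_ATTRIBUTE_HIDDEN)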
17 changes: 13 additions & 4 deletions cloudinary_cli/utils/json_utils.py
@@ -1,18 +1,27 @@
 import json
 from platform import system
 
+from os import path
 import click
 from pygments import highlight, lexers, formatters
 
 
+def read_json_from_file(filename, does_not_exist_ok=False):
+    if does_not_exist_ok and (not path.exists(filename) or path.getsize(filename) < 1):
+        return {}
+
+    with open(filename, 'r') as file:
+        return json.loads(file.read() or "{}")
+
+
 def write_json_to_file(json_obj, filename, indent=2, sort_keys=False):
     with open(filename, 'w') as file:
         json.dump(json_obj, file, indent=indent, sort_keys=sort_keys)
 
 
-def read_json_from_file(filename):
-    with open(filename, 'r') as file:
-        return json.loads(file.read() or "{}")
+def update_json_file(json_obj, filename, indent=2, sort_keys=False):
+    curr_obj = read_json_from_file(filename, True)
+    curr_obj.update(json_obj)
+    write_json_to_file(curr_obj, filename, indent, sort_keys)
 
 
 def print_json(res):
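
Together, these helpers give a tolerant read/merge/write cycle for small JSON state files such as .cld-sync and the CLI config. A usage sketch with a hypothetical file name:

    state = read_json_from_file('state.json', does_not_exist_ok=True)  # {} if missing or empty
    update_json_file({'last_sync': '2020-11-09'}, 'state.json')        # merge one key and rewrite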
8 changes: 8 additions & 0 deletions cloudinary_cli/utils/utils.py
@@ -102,6 +102,14 @@ def remove_string_prefix(string, prefix):
     return string[string.startswith(prefix) and len(prefix):]
 
 
+def invert_dict(d):
+    inv_dict = {}
+    for k, v in d.items():
+        inv_dict[v] = k
+
+    return inv_dict
+
+
 def write_json_list_to_csv(json_list, filename, fields_to_keep=()):
     with open(f'{filename}.csv', 'w') as f:
         if not fields_to_keep:
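
A quick note on `invert_dict`: values become keys, so if two keys share a value, the last one wins; the sync meta file mapping is expected to be one-to-one. For example:

    invert_dict({'my:file.jpg': 'my_file.jpg'})  # -> {'my_file.jpg': 'my:file.jpg'}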
