Improve sync of files with special characters
const-cloudinary committed Nov 9, 2020
1 parent 0473eab commit 6fd7447
Showing 6 changed files with 100 additions and 35 deletions.
91 changes: 70 additions & 21 deletions cloudinary_cli/modules/sync.py
@@ -1,20 +1,21 @@
 import logging
 from functools import reduce
 from itertools import product
-from os import remove
-from os.path import join as path_join, abspath
+from os import path, remove
 
 from click import command, argument, option, style
 from cloudinary import api
 
 from cloudinary_cli.utils.api_utils import query_cld_folder, upload_file, download_file
 from cloudinary_cli.utils.file_utils import walk_dir, delete_empty_dirs, get_destination_folder
-from cloudinary_cli.utils.json_utils import print_json
-from cloudinary_cli.utils.utils import logger, run_tasks_concurrently, get_user_action
+from cloudinary_cli.utils.json_utils import print_json, read_json_from_file, write_json_to_file
+from cloudinary_cli.utils.utils import logger, run_tasks_concurrently, get_user_action, invert_dict
 
 _DEFAULT_DELETION_BATCH_SIZE = 30
 _DEFAULT_CONCURRENT_WORKERS = 30
 
+_SYNC_META_FILE = '.cld-sync'
+
 
 @command("sync",
          short_help="Synchronize between a local directory and a Cloudinary folder.",
@@ -50,31 +51,52 @@ class SyncDir:
     def __init__(self, local_dir, remote_dir, include_hidden, concurrent_workers, force, keep_deleted,
                  deletion_batch_size):
         self.local_dir = local_dir
-        self.remote_dir = remote_dir
+        self.remote_dir = remote_dir.strip('/')
         self.include_hidden = include_hidden
         self.concurrent_workers = concurrent_workers
         self.force = force
         self.keep_unique = keep_deleted
         self.deletion_batch_size = deletion_batch_size
 
+        self.sync_meta_file = path.join(self.local_dir, _SYNC_META_FILE)
+
         self.verbose = logger.getEffectiveLevel() < logging.INFO
 
-        self.local_files = walk_dir(abspath(self.local_dir), include_hidden)
+        self.local_files = walk_dir(path.abspath(self.local_dir), include_hidden)
         logger.info(f"Found {len(self.local_files)} items in local folder '{local_dir}'")
 
         self.remote_files = query_cld_folder(self.remote_dir)
         logger.info(f"Found {len(self.remote_files)} items in Cloudinary folder '{self.remote_dir}'")
 
         local_file_names = self.local_files.keys()
         remote_file_names = self.remote_files.keys()
+        """
+        Cloudinary is a very permissive service. When uploaded files contain invalid characters,
+        unicode characters, etc., Cloudinary makes a best effort to store them, usually by
+        sanitizing the file names and stripping the invalid characters. Although that is a good
+        default for general use, it is a poor fit for syncing a local folder with Cloudinary,
+        since the directories would always be out of sync.
+        To overcome this limitation, cloudinary-cli keeps a hidden .cld-sync file in the sync
+        directory that maps the divergent file names, tracking them so that both sync directions work.
+        """
+        self.diverse_file_names = read_json_from_file(self.sync_meta_file, does_not_exist_ok=True)
+        inverted_diverse_file_names = invert_dict(self.diverse_file_names)
+
+        cloudinarized_local_file_names = [self.diverse_file_names.get(f, f) for f in local_file_names]
+        self.recovered_remote_files = {inverted_diverse_file_names.get(f, f): dt for f, dt in self.remote_files.items()}
 
-        self.unique_remote_file_names = remote_file_names - local_file_names
-        self.unique_local_file_names = local_file_names - remote_file_names
+        self.unique_remote_file_names = remote_file_names - cloudinarized_local_file_names
+        self.unique_local_file_names = local_file_names - self.recovered_remote_files.keys()
         common_file_names = local_file_names - self.unique_local_file_names
 
-        self.out_of_sync_file_names = self._get_out_of_sync_file_names(common_file_names)
+        self.out_of_sync_local_file_names = self._get_out_of_sync_file_names(common_file_names)
+        self.out_of_sync_remote_file_names = set(self.diverse_file_names.get(f, f) for f in
+                                                 self.out_of_sync_local_file_names)
 
-        skipping = len(common_file_names) - len(self.out_of_sync_file_names)
+        skipping = len(common_file_names) - len(self.out_of_sync_local_file_names)
 
         if skipping:
             logger.info(f"Skipping {skipping} items")
@@ -83,12 +105,16 @@ def _get_out_of_sync_file_names(self, common_file_names):
logger.debug("\nCalculating differences...\n")
out_of_sync_file_names = set()
for f in common_file_names:
if self.local_files[f]['etag'] != self.remote_files[f]['etag']:
logger.warning(f"{f} is out of sync")
logger.debug(f"Local etag: {self.local_files[f]['etag']}. Remote etag: {self.remote_files[f]['etag']}")
local_etag = self.local_files[f]['etag']
remote_etag = self.recovered_remote_files[f]['etag']
if local_etag != remote_etag:
logger.warning(f"{f} is out of sync" +
(f" with '{self.diverse_file_names[f]}" if f in self.diverse_file_names else ""))
logger.debug(f"Local etag: {local_etag}. Remote etag: {remote_etag}")
out_of_sync_file_names.add(f)
continue
logger.debug(f"{f} is in sync")
logger.debug(f"'{f}' is in sync" +
(f" with '{self.diverse_file_names[f]}" if f in self.diverse_file_names else ""))

return out_of_sync_file_names

@@ -97,7 +123,7 @@ def push(self):
logger.info("Aborting...")
return False

files_to_push = self.unique_local_file_names | self.out_of_sync_file_names
files_to_push = self.unique_local_file_names | self.out_of_sync_local_file_names
if not files_to_push:
return True

@@ -109,14 +135,37 @@ def push(self):
             'invalidate': True,
             'resource_type': 'auto'
         }
+        upload_results = {}
         uploads = []
         for file in files_to_push:
             folder = get_destination_folder(self.remote_dir, file)
 
-            uploads.append((self.local_files[file]['path'], {**options, 'folder': folder}))
+            uploads.append((self.local_files[file]['path'], {**options, 'folder': folder}, upload_results))
 
         run_tasks_concurrently(upload_file, uploads, self.concurrent_workers)
 
+        self.save_sync_meta_file(upload_results)
+
+    def save_sync_meta_file(self, upload_results):
+        diverse_filenames = {}
+        for local_path, remote_path in upload_results.items():
+            local = path.relpath(local_path, self.local_dir)
+            remote = path.relpath(remote_path, self.remote_dir)
+            if local != remote:
+                diverse_filenames[local] = remote
+
+        # filter out outdated meta file entries
+        current_diverse_files = {k: v for k, v in self.diverse_file_names.items() if k in self.local_files.keys()}
+
+        if diverse_filenames or current_diverse_files != self.diverse_file_names:
+            current_diverse_files.update(diverse_filenames)
+            try:
+                write_json_to_file(current_diverse_files, self.sync_meta_file)
+                logger.debug(f"Updated '{self.sync_meta_file}' file")
+            except Exception as e:
+                # The meta file is not critical for the sync itself; if we cannot write it, we just log a warning
+                logger.warning(f"Failed updating '{self.sync_meta_file}' file: {e}")
+
     def _handle_unique_remote_files(self):
         handled = self._handle_files_deletion(len(self.unique_remote_file_names), "remote")
         if handled is not None:
@@ -155,7 +204,7 @@ def pull(self):
         if not self._handle_unique_local_files():
             return False
 
-        files_to_pull = self.unique_remote_file_names | self.out_of_sync_file_names
+        files_to_pull = self.unique_remote_file_names | self.out_of_sync_remote_file_names
 
         if not files_to_pull:
             return True
@@ -164,7 +213,7 @@ def pull(self):
         downloads = []
         for file in files_to_pull:
             remote_file = self.remote_files[file]
-            local_path = abspath(path_join(self.local_dir, file))
+            local_path = path.abspath(path.join(self.local_dir, file))
 
             downloads.append((remote_file, local_path))
 
@@ -177,9 +226,9 @@ def _handle_unique_local_files(self):

logger.info(f"Deleting {len(self.unique_local_file_names)} local files...")
for file in self.unique_local_file_names:
path = abspath(self.local_files[file]['path'])
remove(path)
logger.info(f"Deleted '{path}'")
full_path = path.abspath(self.local_files[file]['path'])
remove(full_path)
logger.info(f"Deleted '{full_path}'")

logger.info("Deleting empty folders...")
delete_empty_dirs(self.local_dir)
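
The meta file mechanism above is easiest to see end to end. Here is a minimal sketch of one push round trip, using hypothetical file names; the sanitized remote name is whatever Cloudinary actually returns at upload time:

    # Hypothetical example of the .cld-sync round trip
    upload_results = {}
    upload_file('backup/my:file.jpg', {'folder': 'backup'}, upload_results)
    # upload_results might now map the local path to the sanitized remote path:
    #   {'backup/my:file.jpg': 'backup/my_file.jpg'}
    # save_sync_meta_file() then writes {'my:file.jpg': 'my_file.jpg'} into
    # .cld-sync, and the next run's __init__ uses that mapping (and its
    # inversion) to pair 'my:file.jpg' with 'my_file.jpg' instead of treating
    # both names as unique to their side.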
4 changes: 2 additions & 2 deletions cloudinary_cli/utils/api_utils.py
@@ -42,7 +42,7 @@ def query_cld_folder(folder):


 def upload_file(file_path, options, uploaded=None, skipped=None):
-    uploaded = uploaded if uploaded is not None else []
+    uploaded = uploaded if uploaded is not None else {}
     skipped = skipped if skipped is not None else []
     verbose = logger.getEffectiveLevel() < logging.INFO

@@ -55,7 +55,7 @@ def upload_file(file_path, options, uploaded=None, skipped=None):
         logger.info(style(f"Successfully uploaded {file_path} as {result['public_id']}", fg="green"))
         if verbose:
             print_json(result)
-        uploaded.append(result['public_id'])
+        uploaded[file_path] = asset_source(result)
     except Exception as e:
         log_exception(e, f"Failed uploading {file_path}")
         skipped.append(file_path)
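
Since `uploaded` now defaults to a dict rather than a list, a caller can pass in a shared mapping and read back which remote asset each local file became; a sketch with a hypothetical path:

    results = {}
    upload_file('img/logo one.png', {'folder': 'assets'}, uploaded=results)
    # On success, results maps the local path to the asset source derived from
    # the upload response by asset_source(), e.g.
    #   {'img/logo one.png': 'assets/logo_one.png'}
    # On failure, the path is appended to the `skipped` list instead.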
9 changes: 2 additions & 7 deletions cloudinary_cli/utils/config_utils.py
@@ -10,10 +10,7 @@


 def load_config():
-    if not os.path.exists(CLOUDINARY_CLI_CONFIG_FILE) or os.path.getsize(CLOUDINARY_CLI_CONFIG_FILE) < 1:
-        return {}
-
-    return read_json_from_file(CLOUDINARY_CLI_CONFIG_FILE)
+    return read_json_from_file(CLOUDINARY_CLI_CONFIG_FILE, does_not_exist_ok=True)
 
 
 def save_config(config):
@@ -68,9 +65,7 @@ def migrate_old_config():
f"please fix or remove it")
raise

new_config = load_config()
new_config.update(old_config)
save_config(new_config)
update_config(old_config)

os.remove(OLD_CLOUDINARY_CLI_CONFIG_FILE)

6 changes: 5 additions & 1 deletion cloudinary_cli/utils/file_utils.py
@@ -35,7 +35,11 @@ def is_hidden_path(filepath):


 def has_hidden_attribute(filepath):
-    st = os.stat(filepath)
+    try:
+        st = os.stat(filepath)
+    except OSError as e:
+        logger.debug(f"Failed getting os.stat for file '{filepath}': {e}")
+        return False
 
     if not hasattr(st, 'st_file_attributes'):  # not a pythonic way, but it's relevant only for windows, no need to try
         return False
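
The attribute test that follows the guarded `os.stat()` call is Windows-only; a sketch of what such a check looks like, assuming the stat succeeded (the constant comes from the standard stat module; this is an illustration, not the exact code of the commit):

    import stat

    def is_windows_hidden(st):
        # st_file_attributes exists only on Windows stat results
        return bool(getattr(st, 'st_file_attributes', 0) & stat.FILE_ATTRIBUTE_HIDDEN)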
17 changes: 13 additions & 4 deletions cloudinary_cli/utils/json_utils.py
@@ -1,18 +1,27 @@
 import json
 from platform import system
 
+from os import path
 import click
 from pygments import highlight, lexers, formatters
 
 
+def read_json_from_file(filename, does_not_exist_ok=False):
+    if does_not_exist_ok and (not path.exists(filename) or path.getsize(filename) < 1):
+        return {}
+
+    with open(filename, 'r') as file:
+        return json.loads(file.read() or "{}")
+
+
 def write_json_to_file(json_obj, filename, indent=2, sort_keys=False):
     with open(filename, 'w') as file:
         json.dump(json_obj, file, indent=indent, sort_keys=sort_keys)
 
 
-def read_json_from_file(filename):
-    with open(filename, 'r') as file:
-        return json.loads(file.read() or "{}")
+def update_json_file(json_obj, filename, indent=2, sort_keys=False):
+    curr_obj = read_json_from_file(filename, True)
+    curr_obj.update(json_obj)
+    write_json_to_file(curr_obj, filename, indent, sort_keys)
 
 
 def print_json(res):
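
Together, these helpers give a tolerant read/merge/write cycle for small JSON state files such as .cld-sync and the CLI config. A usage sketch with a hypothetical file name:

    state = read_json_from_file('state.json', does_not_exist_ok=True)  # {} if missing or empty
    update_json_file({'last_sync': '2020-11-09'}, 'state.json')        # merge one key and rewrite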
8 changes: 8 additions & 0 deletions cloudinary_cli/utils/utils.py
@@ -102,6 +102,14 @@ def remove_string_prefix(string, prefix):
     return string[string.startswith(prefix) and len(prefix):]
 
 
+def invert_dict(d):
+    inv_dict = {}
+    for k, v in d.items():
+        inv_dict[v] = k
+
+    return inv_dict
+
+
 def write_json_list_to_csv(json_list, filename, fields_to_keep=()):
     with open(f'{filename}.csv', 'w') as f:
         if not fields_to_keep:
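
A quick note on `invert_dict`: values become keys, so if two keys share a value, the last one wins; the sync meta file mapping is expected to be one-to-one. For example:

    invert_dict({'my:file.jpg': 'my_file.jpg'})  # -> {'my_file.jpg': 'my:file.jpg'}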
