updating fastdup to 1.85

superlinear-ai · Jan 17, 2024 · 62cf17c · 62cf17c
1 parent 12ce82f
commit 62cf17c
Show file tree

Hide file tree

Showing 23 changed files with 1,894 additions and 193 deletions.
diff --git a/fastdup/__init__.py b/fastdup/__init__.py
diff --git a/fastdup/ascii_art.py b/fastdup/ascii_art.py
@@ -0,0 +1,40 @@
+import random 
+
+ascii_arts = [
+    """                                                                                                 
+               :++.                                                  ...                                       
+             ;&&$+.                               ;x:                ;;;                                       
+            .$&+                                  X&x                ;;;                                       
+           X$&&&$$;    :$&&$+. ;$x    .x&&&X:   ;$&&&$$$:    :;;;;:. ;;;  :;;.      .;;.  :;;  :;;;;;.         
+           +x&&Xxx: :$&&Xx++X&&$&$  .&&$+;+$&&; :x&&$xxx: .;;;;;:;;;;;;;  :;;.      .;;:  :;;;;;;:;;;;;:       
+            .$&;   :$&X       +&&$  ;&$.    ;XX   X&x    .;;;       ;;;;  :;;.      .;;:  :;;;.      :;;:      
+            .$&;   +&&.        x&$  .$&&$+.       X&x    ;;;         ;;;  :;;.      .;;:  :;;.        ;;;      
+            .$&;   +&&.        x&$     ;X&&&&$.   X&x    ;;;         ;;;  :;;.      .;;:  :;;         ;;;      
+            .$&;   :&&;       :&&$  .:.     x&&   X&x    :;;;       :;;;  .;;:      :;;:  :;;;       .;;;      
+            .$&;    +&&$;   :X&&&$  ;&&+.  .X&&   X&&.    :;;;:   :;;;;;   ;;;;.  .;;;;:  :;;;;:.  .;;;;       
+            .$&;      +&&&&&&X.+&$   .X&&&&&&+     X&&&&;   :;;;;;;;.;;;    .;;;;;;;:;;:  :;; :;;;;;;;.        
+                                                                                          :;;                  
+                                                                                          :;;                  
+                                                                                          :;;                  
+                                                                                          .::                                                                                                                                                                                                                                                                                                                                                       
+""",
+"""
+                                                                               
+   ad88                                          88                            
+  d8"                             ,d             88                            
+  88                              88             88                            
+MM88MMM  ,adPPYYba,  ,adPPYba,  MM88MMM  ,adPPYb,88  88       88  8b,dPPYba,   
+  88     ""     `Y8  I8[    ""    88    a8"    `Y88  88       88  88P'    "8a  
+  88     ,adPPPPP88   `"Y8ba,     88    8b       88  88       88  88       d8  
+  88     88,    ,88  aa    ]8I    88,   "8a,   ,d88  "8a,   ,a88  88b,   ,a8"  
+  88     `"8bbdP"Y8  `"YbbdP"'    "Y888  `"8bbdP"Y8   `"YbbdP'Y8  88`YbbdP"'   
+                                                                  88           
+                                                                  88           
+"""
+
+]
+
+
+def get_ascii_art():
+    choice = random.randint(0, len(ascii_arts) - 1)
+    return ascii_arts[choice]
diff --git a/fastdup/captions.py b/fastdup/captions.py
@@ -1,10 +1,15 @@
+import torch
 from fastdup.sentry import fastdup_capture_exception
 from fastdup.definitions import MISSING_LABEL
 from fastdup.galleries import fastdup_imread
 import cv2
+from tqdm import tqdm
 
+device_to_captioner = {}
+
+def init_captioning(model_name='automatic', device='cpu', batch_size=8, max_new_tokens=20,
+                        use_float_16=True):
 
-def generate_labels(filenames, model_name='automatic', device = 'cpu', batch_size=8):
     '''
     This function generates captions for a given set of images, and takes the following arguments:
         - filenames: the list of images passed to the function
@@ -15,64 +20,82 @@ def generate_labels(filenames, model_name='automatic', device = 'cpu', batch_siz
             - BLIP: 'blip'
         - batch_size: the size of image batches to caption (default: 8)
         - device: whether to use a GPU (default: -1, CPU only ; set to 0 for GPU)
+        - max_new_tokens: the maximum number of new tokens to generate per caption (default: 20)
     '''
+
+    global device_to_captioner
     # use GPU if device is specified
     if device == 'gpu':
         device = 0
     elif device == 'cpu':
         device = -1
+        use_float_16 = False
     else:
-        assert False, "Incompatible device name entered. Available device names are gpu and cpu."
+        assert False, "Incompatible device name entered {device}. Available device names are gpu and cpu."
 
     # confirm necessary dependencies are installed, and import them
     try:
         from transformers import pipeline
         from transformers.utils import logging
-        logging.set_verbosity_info()
-        import torch
-        from PIL import Image
-        from tqdm import tqdm
+        logging.set_verbosity(50)
+
     except Exception as e:
         fastdup_capture_exception("Auto generate labels", e)
         print("Auto captioning requires an installation of the following libraries:\n")
-        print("   huggingface transformers\n   pytorch\n   pillow\n   tqdm\n")
-        print("to install, use `pip install transformers torch pillow tqdm`")
-        return [MISSING_LABEL] * len(filenames)
+        print("   huggingface transformers\n   pytorch\n")
+        print("   to install, use `pip3 install transformers torch`")
+        raise
 
     # dictionary of captioning models
     models = {
         'automatic': "nlpconnect/vit-gpt2-image-captioning",
         'vitgpt2': "nlpconnect/vit-gpt2-image-captioning",
-        'blip2': "Salesforce/blip2-opt-2.7b",
+        'blip-2': "Salesforce/blip2-opt-2.7b",
         'blip': "Salesforce/blip-image-captioning-large"
     }
-
+    assert model_name in models.keys(), f"Unknown captioning model {model_name} allowed models are {models.keys()}"
     model = models[model_name]
+    has_gpu = torch.cuda.is_available()
+    captioner = pipeline("image-to-text", model=model, device=device if has_gpu else "cpu", max_new_tokens=max_new_tokens,
+                         torch_dtype=torch.float16 if use_float_16 else torch.float32)
+    device_to_captioner[device] = captioner
 
-    # generate captions
-    try:
-        captioner = pipeline("image-to-text", model=model, device=device)
-
-        captions = []
-
-        for pred in captioner(filenames, batch_size=batch_size):
-            #caption = pred['generated_text']
-            caption = ''.join([d['generated_text'] for d in pred])
-            captions.append(caption)
+    return captioner
 
+def generate_labels(filenames, model_name='automatic', device = 'cpu', batch_size=8, max_new_tokens=20, use_float_16=True):
+    global device_to_captioner
+    if device not in device_to_captioner:
+        captioner = init_captioning(model_name, device, batch_size, max_new_tokens, use_float_16)
+    else:
+        captioner = device_to_captioner[device]
 
-        '''for image_path in tqdm(filenames):
-            img = Image.open(image_path)
-            pred = captioner(img)
-            caption = pred[0]['generated_text']
-            captions.append(caption)'''
-        return captions
-
+    captions = []
+    # generate captions
+    try:
+        for i in tqdm(range(0, len(filenames), batch_size)):
+            chunk = filenames[i:i + batch_size]
+            try:
+                for pred in captioner(chunk, batch_size=batch_size):
+                    charstring = '' if model_name != 'blip' else ' '
+                    caption = charstring.join([d['generated_text'] for d in pred])
+                    # Split the sentence into words
+                    words = caption.split()
+                    # Filter out words containing '#'
+                    filtered_words = [word for word in words if '#' not in word]
+                    # Join the filtered words back into a sentence
+                    caption = ' '.join(filtered_words)
+                    caption = caption.strip()
+                    captions.append(caption)
+            except Exception as ex:
+                print("Failed to caption chunk", chunk[:5], ex)
+                captions.extend([MISSING_LABEL] * len(chunk))
 
     except Exception as e:
         fastdup_capture_exception("Auto caption image", e)
         return [MISSING_LABEL] * len(filenames)
 
+    return captions
+
 
 def generate_vqa_labels(filenames, text, kwargs):
     # confirm necessary dependencies are installed, and import them
@@ -156,3 +179,15 @@ def generate_age_labels(filenames, kwargs):
         fastdup_capture_exception("Age label", e)
         return [MISSING_LABEL] * len(filenames)
 
+if __name__ == "__main__":
+    import fastdup
+    from fastdup.captions import generate_labels
+    file = "/Users/dannybickson/visual_database/cxx/unittests/two_images/"
+    import os
+    files = os.listdir(file)
+    files = [os.path.join(file, f) for f in files]
+    ret = generate_labels(files, model_name='blip')
+    assert(len(ret) == 2)
+    print(ret)
+    for r in ret:
+        assert "shelf" in r or "shelves" in r or "store" in r
diff --git a/fastdup/coco.py b/fastdup/coco.py
@@ -2,7 +2,7 @@
 # FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
 # This software is free for non-commercial and academic usage under the Creative Common Attribution-NonCommercial-NoDerivatives
 # 4.0 International license. Please reach out to [email protected] for licensing options.
-
+from fastdup.sentry import fastdup_capture_exception
 
 import os
 cat = {0: u'__background__',
@@ -103,8 +103,8 @@ def read_coco_labels(path):
           #print('cat is', cat[int_cat])
           label_dict[os.path.join(path.replace('labels','images'),f).replace('.txt','.jpg')] = cat[int_cat]
       except Exception as ex:
-        print('Failed to read file ', os.path.join(path, f), ' with error: ', ex)
-        
+        fastdup_capture_exception(f'Failed to read coco file {os.path.join(path, f)}', ex)
+
   return label_dict
 
 
diff --git a/fastdup/cvat.py b/fastdup/cvat.py
@@ -277,8 +277,8 @@ def copy_images_and_zip(files, save_path):
             rel_path = os.path.relpath(f, save_path)
             zip_path = os.path.join('data', rel_path)
             zipf.write(f, zip_path)
-    assert os.path.exists(local_file)
-    print('Zipped file:', local_file, ' for cvat')
+    assert os.path.exists(zip_path)
+    print('Zipped file:', zip_path, ' for cvat')
     return 0    
 
 

diff --git a/fastdup/datasets.py b/fastdup/datasets.py
@@ -7,7 +7,7 @@
 import logging
 from typing import Optional, Any
 from concurrent.futures import ThreadPoolExecutor
-
+from fastdup.sentry import fastdup_capture_exception
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -67,7 +67,7 @@ def __init__(
                 dataset_name, split=split, cache_dir=self.cache_dir, **kwargs
             )
         except Exception as e:
-            logging.error(f"Error loading dataset: {e}")
+            fastdup_capture_exception(f"dataset/__init__", e)
             return
 
         super().__init__(
@@ -87,7 +87,7 @@ def __init__(
             current_hash: str = self._generate_img_folder_hash()
             previous_hash: Optional[str] = self._retrieve_cached_metadata()
         except Exception as e:
-            logging.error(f"Error generating or retrieving hash: {e}")
+            fastdup_capture_exception(f"Error generating or retrieving hash:", e)
             return
 
         if (current_hash != previous_hash) or reconvert_jpg:
@@ -122,6 +122,7 @@ def _cache_metadata(self, cache_hash: str) -> None:
             with open(cache_file, "w") as f:
                 f.write(cache_hash)
         except Exception as e:
+            fastdup_capture_exception("error extracting metadata", e)
             logging.error(f"Error caching metadata: {e}")
 
     def _retrieve_cached_metadata(self) -> Optional[str]:
@@ -131,6 +132,7 @@ def _retrieve_cached_metadata(self) -> Optional[str]:
                 with open(cache_file, "r") as f:
                     return f.read()
             except Exception as e:
+                fastdup_capture_exception("Error opening cache file ", e)
                 logging.error(f"Error reading cached metadata: {e}")
         return None
 
@@ -150,6 +152,7 @@ def _save_single_image(self, idx: int, item: dict, pbar) -> None:
             image.save(os.path.join(label_dir, f"{idx}.jpg"))
             pbar.update(1)
         except Exception as e:
+            fastdup_capture_exception("Error saving an image", e)
             logging.error(f"Error in saving image at index {idx}: {e}")
 
     def _save_as_image_files(self) -> None:
@@ -177,6 +180,7 @@ def annotations(self) -> pd.DataFrame:
                             filenames.append(subentry.path)
                             labels.append(label)
         except Exception as e:
+            fastdup_capture_exception("Error generating annotation", e)
             logging.error(f"Error in generating annotations: {e}")
             return pd.DataFrame()
 

diff --git a/fastdup/definitions.py b/fastdup/definitions.py
@@ -4,6 +4,7 @@
 import tempfile
 
 FILENAME_SIMILARITY = "similarity.csv"
+FILENAME_SEARCH = "search.csv"
 FILENAME_OUTLIERS = "outliers.csv"
 FILENAME_NNF_INDEX = "nnf.index"
 FILENAME_FEATURES = "features.dat"
@@ -49,7 +50,7 @@
 
 DEFUALT_METRIC_ZERO = 0
 DEFAULT_METRIC_MINUS_ONE = -1
-VERSION__ = "1.39"
+VERSION__ = "1.86"
 
 GITHUB_URL = "https://github.com/visual-layer/fastdup/issues"
 
@@ -58,8 +59,8 @@
 "and special care needs to select the right backend for your OS/Hardware combination. You can install matplot lib using "
 "python3.8 -m pip install matplotlib matplotlib-inline. (change the python3.8 to your python version). "
 
-SUPPORTED_IMG_FORMATS = [".png", ".jpg", ".jpeg", ".giff", ".jpeg", ".tif", ".tiff", ".heic", ".heif", ".bmp", ".webp"]
-SUPPORTED_VID_FORMATS = ["mp4", ".avi"]
+SUPPORTED_IMG_FORMATS = [".png", ".jpg", ".jpeg", ".giff", ".jpeg", ".tif", ".tiff", ".heic", ".heif", ".bmp", ".webp", ".jp2"]
+SUPPORTED_VID_FORMATS = [".mp4", ".avi", ".dav", ".m4a", ".m4v", ".mov"]
 
 RUN_ALL = 0
 RUN_EXTRACT = 1