chore: update models of local backend to use torch, remove tensorflow

nuffin · Oct 20, 2024 · cfda35a · cfda35a
1 parent 406dd0f
commit cfda35a
Show file tree

Hide file tree

Showing 8 changed files with 120 additions and 89 deletions.
diff --git a/llmpa/backends/base.py b/llmpa/backends/base.py
@@ -3,7 +3,7 @@
 from typing import Optional, List
 
 
-class BaseBackend:
+class BackendBase:
     def get_model_info(self, model: str) -> Optional[dict]:
         raise NotImplementedError
 

diff --git a/llmpa/backends/local/models/__init__.py b/llmpa/backends/local/models/__init__.py
@@ -1,2 +1,4 @@
-from . import image
-from . import video
+# from . import image
+# from . import video
+
+# __all__ = ["image", "video"]
diff --git a/llmpa/backends/local/models/image.py b/llmpa/backends/local/models/image.py
@@ -1,73 +1,117 @@
-import tensorflow as tf
 import cv2
 import numpy as np
-from transformers import TFAutoModel, AutoConfig
-from tensorflow.keras.applications import (
-    EfficientNetV2B0,
-    ResNet50,
-    preprocess_input as keras_preprocess_input,
-)
+import torch
+import torchvision.transforms as transforms
+from torchvision.models import resnet50, efficientnet_v2_s
+
+from transformers import AutoModel, AutoFeatureExtractor
+
+# from tensorflow.keras.applications import (
+#     EfficientNetV2B0,
+#     ResNet50,
+# )
 
 
 # EmbeddingExtractor Class with model name as a parameter
 class EmbeddingExtractor:
     def __init__(self, model_name="EfficientNetV2B0"):
         self.model_name = model_name
-        self.model, self.preprocess_fn = self.load_model()
+        self.use_pretrained_model = False
+        self.feature_extractor = None
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+        self.load_model()
 
     def load_model(self):
         if self.model_name == "EfficientNetV2B0":
-            base_model = EfficientNetV2B0(include_top=False, pooling="avg")
-            preprocess_fn = (
-                keras_preprocess_input  # Define custom preprocessing if needed
+            from torchvision.models import EfficientNet_V2_S_Weights
+            base_model = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.DEFAULT)
+            self.model = torch.nn.Sequential(*list(base_model.children())[:-1]).to(
+                self.device
             )
         elif self.model_name == "ResNet50":
-            base_model = ResNet50(include_top=False, pooling="avg")
-            preprocess_fn = (
-                keras_preprocess_input  # Define custom preprocessing if needed
+            from torchvision.models import ResNet50_Weights
+            base_model = resnet50(weights=ResNet50_Weights.DEFAULT)
+            self.model = torch.nn.Sequential(*list(base_model.children())[:-1]).to(
+                self.device
             )
         else:
-            config = AutoConfig.from_pretrained(self.model_name)
-            base_model = TFAutoModel.from_pretrained(self.model_name, config=config)
-            preprocess_fn = (
-                keras_preprocess_input  # Define custom preprocessing if needed
+            self.use_pretrained_model = True
+            self.feature_extractor = AutoFeatureExtractor.from_pretrained(
+                self.model_name
             )
-
-        return (
-            tf.keras.Model(inputs=base_model.input, outputs=base_model.output),
-            preprocess_fn,
-        )
+            self.model = AutoModel.from_pretrained(self.model_name).to(self.device)
 
     def preprocess_image(self, image):
-        image = cv2.resize(image, (224, 224))
-        image = image.astype("float32")
-        image = self.preprocess_fn(image)
-        return image
+        if self.use_pretrained_model:
+            # If using Hugging Face models, do not apply torchvision transforms
+            if image.shape[-1] == 3:  # Ensure 3 channels for RGB
+                image = cv2.resize(image, (2048, 2048))  # Adjust size as needed
+                image = np.expand_dims(image, axis=0)  # Add batch dimension
+                return image
+            else:
+                raise ValueError(
+                    f"Expected 3 channels (RGB) but got {image.shape[-1]} channels."
+                )
+        else:
+            transform = transforms.Compose(
+                [
+                    transforms.ToTensor(),
+                    transforms.Resize((2048, 2048)),  # Adjust size as needed
+                    transforms.Normalize(
+                        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
+                    ),
+                ]
+            )
+            image = transform(image)
+            image = image.unsqueeze(0).to(self.device)
+            return image
 
     def extract_image_embedding(self, image_path):
         image = cv2.imread(image_path)
         if image is not None:
             image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
             image = self.preprocess_image(image)
-            embedding = self.model.predict(np.expand_dims(image, axis=0))
-            return embedding.squeeze()
+            with torch.no_grad():
+                if self.use_pretrained_model:
+                    inputs = self.feature_extractor(
+                        images=image, return_tensors="pt"
+                    ).to(self.device)
+                    embedding = self.model(**inputs).last_hidden_state
+                    return embedding.cpu().numpy().squeeze()
+                else:
+                    embedding = self.model(image).squeeze()
+                    return embedding.cpu().numpy()
         return None
 
-    def process_image(self, image_id, file_path):
+    def process_image(self, file_path):
         embedding = self.extract_image_embedding(file_path)
         if embedding is not None:
-            print(f"Extracted embedding for image {image_id}: {embedding.shape}")
+            print(f"Extracted embedding for image {file_path}: {embedding.shape}")
         else:
-            print(f"Failed to extract embedding for image {image_id}")
+            print(f"Failed to extract embedding for image {file_path}")
         return embedding
 
 
 if __name__ == "__main__":
-    image_path = "path_to_your_image.jpg"
+    import os
+    import sys
 
-    # Pass the model name as a parameter
-    model_name = "microsoft/resnet-50"  # Example for Hugging Face model
-    extractor = EmbeddingExtractor(model_name=model_name)
+    if len(sys.argv) < 2:
+        print(f"Usage: {os.path.basename(__file__)} <filepath>")
+        sys.exit(1)
+
+    image_path = sys.argv[1]
 
-    # Process the image
-    extractor.process_image(1, image_path)
+    def test_model(model_name):
+        extractor = EmbeddingExtractor(model_name=model_name)
+        extractor.process_image(image_path)
+
+    # Pass the model name as a parameter
+    for model_name in [
+        "EfficientNetV2B0",  ## embedding.shape == (1280,)
+        "ResNet50",  ## embedding.shape == (2048,)
+        "microsoft/resnet-50",  # Example for Hugging Face model
+    ]:
+        print(f"Testing model: {model_name}")
+        test_model(model_name)
diff --git a/llmpa/backends/local/models/video.py b/llmpa/backends/local/models/video.py
@@ -3,11 +3,9 @@
 import torchvision.transforms as transforms
 import torchvision.models.video as models
 
-# import tensorflow as tf
 import cv2
 import numpy as np
 
-# from tensorflow.keras.applications import EfficientNetV2B0  # Replaceable with other models
 from transformers import TFAutoModel  # For Hugging Face models
 
 # from tensorflow.keras.models import load_model  # To load the converted X3D model
@@ -26,15 +24,6 @@ def __init__(
         self.model = self.load_model()
 
     def load_model(self):
-        ## if self.model_name == "EfficientNetV2B0":
-        ##     base_model = EfficientNetV2B0(include_top=False, pooling="avg")
-        ##     model = tf.keras.Model(inputs=base_model.input, outputs=base_model.output)
-        ## else if self.model_name == "x3d_model_tf":
-        ##     # Load the converted X3D model
-        ##     model = load_model(self.model_name)
-        ## else:
-        ##     # Load a model from Hugging Face if it's a supported video model
-        ##     model = TFAutoModel.from_pretrained(self.model_name)
         # Load the pre-trained X3D model from torchvision
         if self.model_name == "x3d_m":
             model = models.video.x3d_x3d_m(pretrained=True)
@@ -53,10 +42,6 @@ def load_model(self):
         return model
 
     def preprocess_video_frames(self, frames):
-        ## Resize each frame to the input shape for the specific model
-        # frames = [cv2.resize(frame, (self.input_shape[0], self.input_shape[1])) for frame in frames]
-        # frames = np.array(frames).astype("float32") / 255.0  # Normalize to [0, 1]
-        # return frames
         # Resize and normalize each frame to the input shape for X3D (3D CNN models expect normalization)
         transform = transforms.Compose(
             [
@@ -90,9 +75,6 @@ def extract_video_embeddings(self, video_path):
         cap.release()
 
         if len(frames) > 0:
-            ## frames = self.preprocess_video_frames(frames)
-            ## embeddings = self.model.predict(np.expand_dims(frames, axis=0))  # Add batch dimension
-            ## return embeddings.squeeze()  # Return embedding as numpy array
             frames_tensor = (
                 self.preprocess_video_frames(frames).unsqueeze(0).to(self.device)
             )  # Add batch dimension
@@ -101,19 +83,25 @@ def extract_video_embeddings(self, video_path):
             return embeddings.squeeze().cpu().numpy()  # Convert to numpy array
         return None
 
-    def process_video(self, video_id, file_path):
+    def process_video(self, file_path):
         embeddings = self.extract_video_embeddings(file_path)
         if embeddings is not None:
-            print(f"Extracted embeddings for video {video_id}: {embeddings.shape}")
+            print(f"Extracted embeddings for video {file_path}: {embeddings.shape}")
         else:
-            print(f"Failed to extract embeddings for video {video_id}")
+            print(f"Failed to extract embeddings for video {file_path}")
         return embeddings
 
 
 if __name__ == "__main__":
-    video_path = "path_to_your_video.mp4"
-    ## extractor = EmbeddingExtractor(model_name="EfficientNetV2B0")  # Change model name here
-    extractor = EmbeddingExtractor(
-        model_name="x3d_m"
-    )  # You can change to x3d_s or x3d_l
-    extractor.process_video(1, video_path)
+    import os
+    import sys
+
+    if len(sys.argv) < 2:
+        print(f"Usage: {os.path.basename(__file__)} <filepath>")
+        sys.exit(1)
+
+    video_path = sys.argv[1]
+
+    def test_model(model_name):
+        extractor = EmbeddingExtractor(model_name=model_name)
+        extractor.process_video(video_path)
diff --git a/llmpa/backends/localai.py b/llmpa/backends/localai.py
@@ -10,17 +10,17 @@
     sys.path.insert(0, project_root)
 
 from clients.http import HttpClient
-from .base import BaseBackend
+from .base import BackendBase
 
 
-class Backend(BaseBackend, HttpClient):
+class Backend(BackendBase):
     def __init__(self, base_url: str, api_key=None, verify_ssl=True, timeout=10):
-        super(Backend, self).__init__(base_url, api_key, verify_ssl, timeout)
+        super(Backend, self).__init__()
         self.client = HttpClient(base_url, api_key, verify_ssl, timeout)
 
     def list_available_models(self) -> Optional[list]:
         try:
-            response = self.client.get("/v1/models", timeout=self.timeout)
+            response = self.client.get("/v1/models")
             print(response.json())
             if not response:
                 return None
@@ -50,7 +50,6 @@ def generate(
             response = self.client.post(
                 "/v1/chat/completions",
                 json=payload,
-                timeout=self.timeout,
             )
             if not response:
                 return None
@@ -78,7 +77,6 @@ def embedding(self, text: str, model: str) -> Optional[List[float]]:
                 "/embeddings",
                 extra_headers=headers,
                 json=payload,
-                timeout=self.timeout,
             )
             if not response:
                 return None

diff --git a/llmpa/db/milvus.py b/llmpa/db/milvus.py
@@ -20,7 +20,11 @@ def list_collections(self):
 
     def create_collections(self):
         self.create_collection("video_embeddings", 1280)
-        self.create_collection("image_embeddings", 512)
+
+        ## embedding length of image
+        #     EfficientNetV2B0: 1280
+        #     ResNet50: 2048
+        self.create_collection("image_embeddings", 2048)
         self.create_collection("audio_embeddings", 1024)
         self.create_collection("text_embeddings", 768)
 

diff --git a/llmpa/fileparser/mimetype.py b/llmpa/fileparser/mimetype.py
@@ -55,11 +55,13 @@ def detect(filepath: str, follow_symlinks: bool = True) -> str:
 
 
 def main():
+    import os
     import sys
 
     if len(sys.argv) < 2:
-        print("Usage: mimetype.py <filepath>")
+        print(f"Usage: {os.path.basename(__file__)} <filepath>")
         sys.exit(1)
+
     filepath = sys.argv[1]
     meme_type_real = _detect(filepath)
     meme_type = detect(filepath)

diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,6 @@
-torch==2.4.1 ## for fastai
-torchaudio==2.4.1
-torchvision==0.19.1
+torch==2.5.0
+torchaudio==2.5.0
+torchvision==0.20.0
 python-magic==0.4.27
 flask==3.0.3
 dash==2.18.1
@@ -17,21 +17,14 @@ black==24.10.0
 bcrypt==4.2.0
 uvicorn==0.32.0
 asgiref==3.8.1
-requests[security]=2.32.3
-tensorflow[and-cuda]==2.17.0
+requests[security]==2.32.3
 transformers==4.45.2
 scikit-learn==1.5.2
-#fastai==2.7.17
-# bark==0.1.5
+# fastai==2.7.17 ## needs torch>=1.10,<2.5
+bark==0.1.5
 joblib==1.4.2
 dspy-ai==2.5.12
 pgvector==0.3.5
 pytorchvideo==0.1.5
-tensorrt==10.5.0
-tensorflow-addons==0.23.0
-tensorflow-hub==0.16.1
 opencv-python==4.10.0.84
 pymilvus==2.4.8
-tensorrt==10.5.0
-nvidia-tensorrt==99.0.0
-huggingface-hub==0.25.2