diff --git a/llmpa/backends/base.py b/llmpa/backends/base.py
index 64f8b13..1ff89d4 100644
--- a/llmpa/backends/base.py
+++ b/llmpa/backends/base.py
@@ -3,7 +3,7 @@ from typing import Optional, List
 
 
-class BaseBackend:
+class BackendBase:
     def get_model_info(self, model: str) -> Optional[dict]:
         raise NotImplementedError
 
diff --git a/llmpa/backends/local/models/__init__.py b/llmpa/backends/local/models/__init__.py
index 0b04770..76a2175 100644
--- a/llmpa/backends/local/models/__init__.py
+++ b/llmpa/backends/local/models/__init__.py
@@ -1,2 +1,4 @@
-from . import image
-from . import video
+# from . import image
+# from . import video
+
+# __all__ = ["image", "video"]
diff --git a/llmpa/backends/local/models/image.py b/llmpa/backends/local/models/image.py
index c56836c..0bec180 100644
--- a/llmpa/backends/local/models/image.py
+++ b/llmpa/backends/local/models/image.py
@@ -1,73 +1,117 @@
-import tensorflow as tf
 import cv2
 import numpy as np
-from transformers import TFAutoModel, AutoConfig
-from tensorflow.keras.applications import (
-    EfficientNetV2B0,
-    ResNet50,
-    preprocess_input as keras_preprocess_input,
-)
+import torch
+import torchvision.transforms as transforms
+from torchvision.models import resnet50, efficientnet_v2_s
+
+from transformers import AutoModel, AutoFeatureExtractor
+
+# from tensorflow.keras.applications import (
+#     EfficientNetV2B0,
+#     ResNet50,
+# )
 
 
 # EmbeddingExtractor Class with model name as a parameter
 class EmbeddingExtractor:
     def __init__(self, model_name="EfficientNetV2B0"):
         self.model_name = model_name
-        self.model, self.preprocess_fn = self.load_model()
+        self.use_pretrained_model = False
+        self.feature_extractor = None
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+        self.load_model()
 
     def load_model(self):
         if self.model_name == "EfficientNetV2B0":
-            base_model = EfficientNetV2B0(include_top=False, pooling="avg")
-            preprocess_fn = (
-                keras_preprocess_input  # Define custom preprocessing if needed
-            )
+            from torchvision.models import EfficientNet_V2_S_Weights
+
+            base_model = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.DEFAULT)
+            # Drop the classifier head; average pooling leaves a 1280-d vector.
+            self.model = torch.nn.Sequential(*list(base_model.children())[:-1]).to(
+                self.device
+            )
         elif self.model_name == "ResNet50":
-            base_model = ResNet50(include_top=False, pooling="avg")
-            preprocess_fn = (
-                keras_preprocess_input  # Define custom preprocessing if needed
-            )
+            from torchvision.models import ResNet50_Weights
+
+            base_model = resnet50(weights=ResNet50_Weights.DEFAULT)
+            # Drop the fc layer; average pooling leaves a 2048-d vector.
+            self.model = torch.nn.Sequential(*list(base_model.children())[:-1]).to(
+                self.device
+            )
         else:
-            config = AutoConfig.from_pretrained(self.model_name)
-            base_model = TFAutoModel.from_pretrained(self.model_name, config=config)
-            preprocess_fn = (
-                keras_preprocess_input  # Define custom preprocessing if needed
-            )
-
-        return (
-            tf.keras.Model(inputs=base_model.input, outputs=base_model.output),
-            preprocess_fn,
-        )
+            self.use_pretrained_model = True
+            self.feature_extractor = AutoFeatureExtractor.from_pretrained(
+                self.model_name
+            )
+            self.model = AutoModel.from_pretrained(self.model_name).to(self.device)
+        # Inference only: disable dropout and use running batch-norm statistics.
+        self.model.eval()
 
     def preprocess_image(self, image):
-        image = cv2.resize(image, (224, 224))
-        image = image.astype("float32")
-        image = self.preprocess_fn(image)
-        return image
+        if self.use_pretrained_model:
+            # Hugging Face models: the feature extractor applies its own
+            # resizing and normalization, so skip the torchvision transforms.
+            if image.shape[-1] == 3:  # Ensure 3 channels for RGB
+                image = cv2.resize(image, (2048, 2048))  # Adjust size as needed
+                image = np.expand_dims(image, axis=0)  # Add batch dimension
+                return image
+            else:
+                raise ValueError(
+                    f"Expected 3 channels (RGB) but got {image.shape[-1]} channels."
+                )
+        else:
+            transform = transforms.Compose(
+                [
+                    transforms.ToTensor(),
+                    transforms.Resize((2048, 2048)),  # Adjust size as needed
+                    transforms.Normalize(
+                        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
+                    ),
+                ]
+            )
+            image = transform(image)
+            image = image.unsqueeze(0).to(self.device)
+            return image
 
     def extract_image_embedding(self, image_path):
         image = cv2.imread(image_path)
         if image is not None:
             image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
             image = self.preprocess_image(image)
-            embedding = self.model.predict(np.expand_dims(image, axis=0))
-            return embedding.squeeze()
+            with torch.no_grad():
+                if self.use_pretrained_model:
+                    inputs = self.feature_extractor(
+                        images=image, return_tensors="pt"
+                    ).to(self.device)
+                    # For CNN checkpoints such as microsoft/resnet-50, the
+                    # last_hidden_state is a spatial feature map (e.g. 2048x7x7),
+                    # not a pooled vector.
+                    embedding = self.model(**inputs).last_hidden_state
+                    return embedding.cpu().numpy().squeeze()
+                else:
+                    embedding = self.model(image).squeeze()
+                    return embedding.cpu().numpy()
         return None
 
-    def process_image(self, image_id, file_path):
+    def process_image(self, file_path):
         embedding = self.extract_image_embedding(file_path)
         if embedding is not None:
-            print(f"Extracted embedding for image {image_id}: {embedding.shape}")
+            print(f"Extracted embedding for image {file_path}: {embedding.shape}")
         else:
-            print(f"Failed to extract embedding for image {image_id}")
+            print(f"Failed to extract embedding for image {file_path}")
         return embedding
 
 
 if __name__ == "__main__":
-    image_path = "path_to_your_image.jpg"
+    import os
+    import sys
 
-    # Pass the model name as a parameter
-    model_name = "microsoft/resnet-50"  # Example for Hugging Face model
-    extractor = EmbeddingExtractor(model_name=model_name)
+    if len(sys.argv) < 2:
+        print(f"Usage: {os.path.basename(__file__)} <image_path>")
+        sys.exit(1)
+
+    image_path = sys.argv[1]
 
-    # Process the image
-    extractor.process_image(1, image_path)
+    def test_model(model_name):
+        extractor = EmbeddingExtractor(model_name=model_name)
+        extractor.process_image(image_path)
+
+    # Pass the model name as a parameter
+    for model_name in [
+        "EfficientNetV2B0",  ## embedding.shape == (1280,)
+        "ResNet50",  ## embedding.shape == (2048,)
+        "microsoft/resnet-50",  # Example for a Hugging Face model
+    ]:
+        print(f"Testing model: {model_name}")
+        test_model(model_name)
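+
+    # Illustrative sketch (an assumption, not part of the original script):
+    # the embedding length must match the Milvus collection schema, and
+    # llmpa/db/milvus.py creates "image_embeddings" with dim 2048, which
+    # matches the ResNet50 backbone above. A quick sanity check could read:
+    #
+    #     emb = EmbeddingExtractor(model_name="ResNet50").process_image(image_path)
+    #     assert emb is not None and emb.shape == (2048,)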
diff --git a/llmpa/backends/local/models/video.py b/llmpa/backends/local/models/video.py
index 936b0f2..18efa78 100644
--- a/llmpa/backends/local/models/video.py
+++ b/llmpa/backends/local/models/video.py
@@ -3,11 +3,9 @@
 import torchvision.transforms as transforms
 import torchvision.models.video as models
 
-# import tensorflow as tf
 import cv2
 import numpy as np
 
-# from tensorflow.keras.applications import EfficientNetV2B0  # Replaceable with other models
-from transformers import TFAutoModel  # Unused; tensorflow was dropped from requirements.txt
-
-# from tensorflow.keras.models import load_model  # To load the converted X3D model
@@ -26,15 +24,6 @@ def __init__(
         self.model = self.load_model()
 
     def load_model(self):
-        ## if self.model_name == "EfficientNetV2B0":
-        ##     base_model = EfficientNetV2B0(include_top=False, pooling="avg")
-        ##     model = tf.keras.Model(inputs=base_model.input, outputs=base_model.output)
-        ## else if self.model_name == "x3d_model_tf":
-        ##     # Load the converted X3D model
-        ##     model = load_model(self.model_name)
-        ## else:
-        ##     # Load a model from Hugging Face if it's a supported video model
-        ##     model = TFAutoModel.from_pretrained(self.model_name)
-        # Load the pre-trained X3D model from torchvision
+        # Load the pre-trained X3D model; X3D ships with pytorchvideo (pinned
+        # in requirements.txt), not with torchvision.models.video.
         if self.model_name == "x3d_m":
-            model = models.video.x3d_x3d_m(pretrained=True)
+            from pytorchvideo.models.hub import x3d_m
+
+            model = x3d_m(pretrained=True)
@@ -53,10 +42,6 @@ def load_model(self):
         return model
 
     def preprocess_video_frames(self, frames):
-        ## Resize each frame to the input shape for the specific model
-        # frames = [cv2.resize(frame, (self.input_shape[0], self.input_shape[1])) for frame in frames]
-        # frames = np.array(frames).astype("float32") / 255.0  # Normalize to [0, 1]
-        # return frames
         # Resize and normalize each frame to the input shape for X3D (3D CNN models expect normalization)
         transform = transforms.Compose(
             [
@@ -90,9 +75,6 @@ def extract_video_embeddings(self, video_path):
         cap.release()
 
         if len(frames) > 0:
-            ## frames = self.preprocess_video_frames(frames)
-            ## embeddings = self.model.predict(np.expand_dims(frames, axis=0))  # Add batch dimension
-            ## return embeddings.squeeze()  # Return embedding as numpy array
             frames_tensor = (
                 self.preprocess_video_frames(frames).unsqueeze(0).to(self.device)
             )  # Add batch dimension
@@ -101,19 +83,25 @@
             return embeddings.squeeze().cpu().numpy()  # Convert to numpy array
         return None
 
-    def process_video(self, video_id, file_path):
+    def process_video(self, file_path):
         embeddings = self.extract_video_embeddings(file_path)
         if embeddings is not None:
-            print(f"Extracted embeddings for video {video_id}: {embeddings.shape}")
+            print(f"Extracted embeddings for video {file_path}: {embeddings.shape}")
         else:
-            print(f"Failed to extract embeddings for video {video_id}")
+            print(f"Failed to extract embeddings for video {file_path}")
         return embeddings
 
 
 if __name__ == "__main__":
-    video_path = "path_to_your_video.mp4"
-    ## extractor = EmbeddingExtractor(model_name="EfficientNetV2B0")  # Change model name here
-    extractor = EmbeddingExtractor(
-        model_name="x3d_m"
-    )  # You can change to x3d_s or x3d_l
-    extractor.process_video(1, video_path)
+    import os
+    import sys
+
+    if len(sys.argv) < 2:
+        print(f"Usage: {os.path.basename(__file__)} <video_path>")
+        sys.exit(1)
+
+    video_path = sys.argv[1]
+
+    def test_model(model_name):
+        extractor = EmbeddingExtractor(model_name=model_name)
+        extractor.process_video(video_path)
+
+    test_model("x3d_m")  # You can change to x3d_s or x3d_l
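+
+    # Illustrative note (an assumption based on the pytorchvideo model zoo):
+    # X3D variants expect different clip lengths and crop sizes (x3d_m is
+    # typically fed 16-frame clips at 224x224), so switching to x3d_s or
+    # x3d_l may also require adjusting preprocess_video_frames().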
diff --git a/llmpa/backends/localai.py b/llmpa/backends/localai.py
index dc2476d..082e8a4 100644
--- a/llmpa/backends/localai.py
+++ b/llmpa/backends/localai.py
@@ -10,17 +10,17 @@
 sys.path.insert(0, project_root)
 
 from clients.http import HttpClient
-from .base import BaseBackend
+from .base import BackendBase
 
 
-class Backend(BaseBackend, HttpClient):
+class Backend(BackendBase):
     def __init__(self, base_url: str, api_key=None, verify_ssl=True, timeout=10):
-        super(Backend, self).__init__(base_url, api_key, verify_ssl, timeout)
+        super(Backend, self).__init__()
+        # The HttpClient owns the timeout, so requests below do not pass it.
         self.client = HttpClient(base_url, api_key, verify_ssl, timeout)
 
     def list_available_models(self) -> Optional[list]:
         try:
-            response = self.client.get("/v1/models", timeout=self.timeout)
+            response = self.client.get("/v1/models")
             print(response.json())
             if not response:
                 return None
@@ -50,7 +50,6 @@ def generate(
             response = self.client.post(
                 "/v1/chat/completions",
                 json=payload,
-                timeout=self.timeout,
             )
             if not response:
                 return None
@@ -78,7 +77,6 @@ def embedding(self, text: str, model: str) -> Optional[List[float]]:
             response = self.client.post(
                 "/embeddings",
                 extra_headers=headers,
                 json=payload,
-                timeout=self.timeout,
             )
             if not response:
                 return None
diff --git a/llmpa/db/milvus.py b/llmpa/db/milvus.py
index bf87853..bd3eaa7 100644
--- a/llmpa/db/milvus.py
+++ b/llmpa/db/milvus.py
@@ -20,7 +20,11 @@ def list_collections(self):
 
     def create_collections(self):
         self.create_collection("video_embeddings", 1280)
-        self.create_collection("image_embeddings", 512)
+
+        ## embedding length of the image model:
+        # EfficientNetV2B0: 1280
+        # ResNet50: 2048
+        self.create_collection("image_embeddings", 2048)
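+        # Illustrative note, assuming the default backbones in
+        # backends/local/models/image.py: if the image backbone changes
+        # (e.g. back to EfficientNetV2B0), this dimension must track it,
+        # since Milvus rejects vectors that do not match the schema:
+        #
+        #     self.create_collection("image_embeddings", 1280)  # EfficientNetV2B0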
         self.create_collection("audio_embeddings", 1024)
         self.create_collection("text_embeddings", 768)
 
diff --git a/llmpa/fileparser/mimetype.py b/llmpa/fileparser/mimetype.py
index 7dd5070..40ded95 100644
--- a/llmpa/fileparser/mimetype.py
+++ b/llmpa/fileparser/mimetype.py
@@ -55,11 +55,13 @@ def detect(filepath: str, follow_symlinks: bool = True) -> str:
 
 
 def main():
+    import os
     import sys
 
     if len(sys.argv) < 2:
-        print("Usage: mimetype.py <filepath>")
+        print(f"Usage: {os.path.basename(__file__)} <filepath>")
         sys.exit(1)
+
     filepath = sys.argv[1]
     meme_type_real = _detect(filepath)
     meme_type = detect(filepath)
diff --git a/requirements.txt b/requirements.txt
index 7a878cc..dd529f8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
-torch==2.4.1  ## for fastai
-torchaudio==2.4.1
-torchvision==0.19.1
+torch==2.5.0
+torchaudio==2.5.0
+torchvision==0.20.0
 python-magic==0.4.27
 flask==3.0.3
 dash==2.18.1
@@ -17,21 +17,14 @@
 black==24.10.0
 bcrypt==4.2.0
 uvicorn==0.32.0
 asgiref==3.8.1
-requests[security]=2.32.3
-tensorflow[and-cuda]==2.17.0
+requests[security]==2.32.3
 transformers==4.45.2
 scikit-learn==1.5.2
-#fastai==2.7.17
-# bark==0.1.5
+# fastai==2.7.17  ## needs torch>=1.10,<2.5
+bark==0.1.5
 joblib==1.4.2
 dspy-ai==2.5.12
 pgvector==0.3.5
 pytorchvideo==0.1.5
-tensorrt==10.5.0
-tensorflow-addons==0.23.0
-tensorflow-hub==0.16.1
 opencv-python==4.10.0.84
 pymilvus==2.4.8
-tensorrt==10.5.0
-nvidia-tensorrt==99.0.0
-huggingface-hub==0.25.2