chore: update models of local backend to use torch, remove tensorflow
nuffin committed Oct 20, 2024
1 parent 406dd0f commit cfda35a
Showing 8 changed files with 120 additions and 89 deletions.
2 changes: 1 addition & 1 deletion llmpa/backends/base.py
@@ -3,7 +3,7 @@
 from typing import Optional, List


-class BaseBackend:
+class BackendBase:
     def get_model_info(self, model: str) -> Optional[dict]:
         raise NotImplementedError
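With the rename, downstream code subclasses BackendBase rather than BaseBackend. A minimal sketch of a conforming backend (the EchoBackend name is hypothetical, not part of this commit):

from typing import Optional

from llmpa.backends.base import BackendBase


class EchoBackend(BackendBase):
    """Hypothetical backend, shown only to illustrate the new base class name."""

    def get_model_info(self, model: str) -> Optional[dict]:
        # Override the NotImplementedError stub with static metadata.
        return {"model": model, "backend": "echo"}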
6 changes: 4 additions & 2 deletions llmpa/backends/local/models/__init__.py
@@ -1,2 +1,4 @@
-from . import image
-from . import video
+# from . import image
+# from . import video
+
+# __all__ = ["image", "video"]
120 changes: 82 additions & 38 deletions llmpa/backends/local/models/image.py
@@ -1,73 +1,117 @@
-import tensorflow as tf
 import cv2
 import numpy as np
-from transformers import TFAutoModel, AutoConfig
-from tensorflow.keras.applications import (
-    EfficientNetV2B0,
-    ResNet50,
-    preprocess_input as keras_preprocess_input,
-)
+import torch
+import torchvision.transforms as transforms
+from torchvision.models import resnet50, efficientnet_v2_s
+
+from transformers import AutoModel, AutoFeatureExtractor
+
+# from tensorflow.keras.applications import (
+#     EfficientNetV2B0,
+#     ResNet50,
+# )


 # EmbeddingExtractor Class with model name as a parameter
 class EmbeddingExtractor:
     def __init__(self, model_name="EfficientNetV2B0"):
         self.model_name = model_name
-        self.model, self.preprocess_fn = self.load_model()
+        self.use_pretrained_model = False
+        self.feature_extractor = None
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+        self.load_model()

     def load_model(self):
         if self.model_name == "EfficientNetV2B0":
-            base_model = EfficientNetV2B0(include_top=False, pooling="avg")
-            preprocess_fn = (
-                keras_preprocess_input  # Define custom preprocessing if needed
-            )
+            from torchvision.models import EfficientNet_V2_S_Weights
+            base_model = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.DEFAULT)
+            self.model = torch.nn.Sequential(*list(base_model.children())[:-1]).to(
+                self.device
+            )
         elif self.model_name == "ResNet50":
-            base_model = ResNet50(include_top=False, pooling="avg")
-            preprocess_fn = (
-                keras_preprocess_input  # Define custom preprocessing if needed
-            )
+            from torchvision.models import ResNet50_Weights
+            base_model = resnet50(weights=ResNet50_Weights.DEFAULT)
+            self.model = torch.nn.Sequential(*list(base_model.children())[:-1]).to(
+                self.device
+            )
         else:
-            config = AutoConfig.from_pretrained(self.model_name)
-            base_model = TFAutoModel.from_pretrained(self.model_name, config=config)
-            preprocess_fn = (
-                keras_preprocess_input  # Define custom preprocessing if needed
-            )
-
-        return (
-            tf.keras.Model(inputs=base_model.input, outputs=base_model.output),
-            preprocess_fn,
-        )
+            self.use_pretrained_model = True
+            self.feature_extractor = AutoFeatureExtractor.from_pretrained(
+                self.model_name
+            )
+            self.model = AutoModel.from_pretrained(self.model_name).to(self.device)

     def preprocess_image(self, image):
-        image = cv2.resize(image, (224, 224))
-        image = image.astype("float32")
-        image = self.preprocess_fn(image)
-        return image
+        if self.use_pretrained_model:
+            # If using Hugging Face models, do not apply torchvision transforms
+            if image.shape[-1] == 3:  # Ensure 3 channels for RGB
+                image = cv2.resize(image, (2048, 2048))  # Adjust size as needed
+                image = np.expand_dims(image, axis=0)  # Add batch dimension
+                return image
+            else:
+                raise ValueError(
+                    f"Expected 3 channels (RGB) but got {image.shape[-1]} channels."
+                )
+        else:
+            transform = transforms.Compose(
+                [
+                    transforms.ToTensor(),
+                    transforms.Resize((2048, 2048)),  # Adjust size as needed
+                    transforms.Normalize(
+                        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
+                    ),
+                ]
+            )
+            image = transform(image)
+            image = image.unsqueeze(0).to(self.device)
+            return image

     def extract_image_embedding(self, image_path):
         image = cv2.imread(image_path)
         if image is not None:
             image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
             image = self.preprocess_image(image)
-            embedding = self.model.predict(np.expand_dims(image, axis=0))
-            return embedding.squeeze()
+            with torch.no_grad():
+                if self.use_pretrained_model:
+                    inputs = self.feature_extractor(
+                        images=image, return_tensors="pt"
+                    ).to(self.device)
+                    embedding = self.model(**inputs).last_hidden_state
+                    return embedding.cpu().numpy().squeeze()
+                else:
+                    embedding = self.model(image).squeeze()
+                    return embedding.cpu().numpy()
         return None

-    def process_image(self, image_id, file_path):
+    def process_image(self, file_path):
         embedding = self.extract_image_embedding(file_path)
         if embedding is not None:
-            print(f"Extracted embedding for image {image_id}: {embedding.shape}")
+            print(f"Extracted embedding for image {file_path}: {embedding.shape}")
         else:
-            print(f"Failed to extract embedding for image {image_id}")
+            print(f"Failed to extract embedding for image {file_path}")
         return embedding


 if __name__ == "__main__":
-    image_path = "path_to_your_image.jpg"
-
-    # Pass the model name as a parameter
-    model_name = "microsoft/resnet-50"  # Example for Hugging Face model
-    extractor = EmbeddingExtractor(model_name=model_name)
-
-    # Process the image
-    extractor.process_image(1, image_path)
+    import os
+    import sys
+
+    if len(sys.argv) < 2:
+        print(f"Usage: {os.path.basename(__file__)} <filepath>")
+        sys.exit(1)
+
+    image_path = sys.argv[1]
+
+    def test_model(model_name):
+        extractor = EmbeddingExtractor(model_name=model_name)
+        extractor.process_image(image_path)
+
+    # Pass the model name as a parameter
+    for model_name in [
+        "EfficientNetV2B0",  ## embedding.shape == (1280,)
+        "ResNet50",  ## embedding.shape == (2048,)
+        "microsoft/resnet-50",  # Example for Hugging Face model
+    ]:
+        print(f"Testing model: {model_name}")
+        test_model(model_name)
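The `children()[:-1]` slice keeps every stage of the backbone up to and including the global average pool, so the truncated models emit pooled feature vectors rather than classifier logits. A quick shape check, as a sketch outside this commit (weights=None avoids a download; the 224x224 input size is illustrative):

import torch
from torchvision.models import efficientnet_v2_s, resnet50

for name, ctor, dim in [
    ("EfficientNetV2-S", efficientnet_v2_s, 1280),
    ("ResNet50", resnet50, 2048),
]:
    # Drop the final classifier; keep conv stages plus the average pool.
    backbone = torch.nn.Sequential(*list(ctor(weights=None).children())[:-1])
    backbone.eval()
    with torch.no_grad():
        out = backbone(torch.zeros(1, 3, 224, 224))
    assert out.squeeze().shape == (dim,), name  # (1280,) and (2048,), as noted above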
42 changes: 15 additions & 27 deletions llmpa/backends/local/models/video.py
@@ -3,11 +3,9 @@
 import torchvision.transforms as transforms
 import torchvision.models.video as models

-# import tensorflow as tf
 import cv2
 import numpy as np

-# from tensorflow.keras.applications import EfficientNetV2B0 # Replaceable with other models
 from transformers import TFAutoModel  # For Hugging Face models

 # from tensorflow.keras.models import load_model  # To load the converted X3D model
@@ -26,15 +24,6 @@ def __init__(
         self.model = self.load_model()

     def load_model(self):
-        ## if self.model_name == "EfficientNetV2B0":
-        ##     base_model = EfficientNetV2B0(include_top=False, pooling="avg")
-        ##     model = tf.keras.Model(inputs=base_model.input, outputs=base_model.output)
-        ## else if self.model_name == "x3d_model_tf":
-        ##     # Load the converted X3D model
-        ##     model = load_model(self.model_name)
-        ## else:
-        ##     # Load a model from Hugging Face if it's a supported video model
-        ##     model = TFAutoModel.from_pretrained(self.model_name)
         # Load the pre-trained X3D model from torchvision
         if self.model_name == "x3d_m":
             model = models.video.x3d_x3d_m(pretrained=True)
@@ -53,10 +42,6 @@ def load_model(self):
         return model

     def preprocess_video_frames(self, frames):
-        ## Resize each frame to the input shape for the specific model
-        # frames = [cv2.resize(frame, (self.input_shape[0], self.input_shape[1])) for frame in frames]
-        # frames = np.array(frames).astype("float32") / 255.0  # Normalize to [0, 1]
-        # return frames
         # Resize and normalize each frame to the input shape for X3D (3D CNN models expect normalization)
         transform = transforms.Compose(
             [
@@ -90,9 +75,6 @@ def extract_video_embeddings(self, video_path):
         cap.release()

         if len(frames) > 0:
-            ## frames = self.preprocess_video_frames(frames)
-            ## embeddings = self.model.predict(np.expand_dims(frames, axis=0))  # Add batch dimension
-            ## return embeddings.squeeze()  # Return embedding as numpy array
             frames_tensor = (
                 self.preprocess_video_frames(frames).unsqueeze(0).to(self.device)
             )  # Add batch dimension
@@ -101,19 +83,25 @@
             return embeddings.squeeze().cpu().numpy()  # Convert to numpy array
         return None

-    def process_video(self, video_id, file_path):
+    def process_video(self, file_path):
         embeddings = self.extract_video_embeddings(file_path)
         if embeddings is not None:
-            print(f"Extracted embeddings for video {video_id}: {embeddings.shape}")
+            print(f"Extracted embeddings for video {file_path}: {embeddings.shape}")
         else:
-            print(f"Failed to extract embeddings for video {video_id}")
+            print(f"Failed to extract embeddings for video {file_path}")
         return embeddings


 if __name__ == "__main__":
-    video_path = "path_to_your_video.mp4"
-    ## extractor = EmbeddingExtractor(model_name="EfficientNetV2B0")  # Change model name here
-    extractor = EmbeddingExtractor(
-        model_name="x3d_m"
-    )  # You can change to x3d_s or x3d_l
-    extractor.process_video(1, video_path)
+    import os
+    import sys
+
+    if len(sys.argv) < 2:
+        print(f"Usage: {os.path.basename(__file__)} <filepath>")
+        sys.exit(1)
+
+    video_path = sys.argv[1]
+
+    def test_model(model_name):
+        extractor = EmbeddingExtractor(model_name=model_name)
+        extractor.process_video(video_path)
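One caveat worth flagging: torchvision's video zoo ships R3D, MC3, and R(2+1)D variants but no X3D family, so the `models.video.x3d_x3d_m(pretrained=True)` call kept above will raise AttributeError at load time. Since pytorchvideo==0.1.5 is already pinned in requirements.txt, a hedged alternative sketch loads X3D through torch.hub instead:

import torch

# Loads X3D-M from the pytorchvideo hub (downloads weights on first use).
model = torch.hub.load("facebookresearch/pytorchvideo", "x3d_m", pretrained=True)
model = model.eval().to("cuda" if torch.cuda.is_available() else "cpu")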
10 changes: 4 additions & 6 deletions llmpa/backends/localai.py
@@ -10,17 +10,17 @@
 sys.path.insert(0, project_root)

 from clients.http import HttpClient
-from .base import BaseBackend
+from .base import BackendBase


-class Backend(BaseBackend, HttpClient):
+class Backend(BackendBase):
     def __init__(self, base_url: str, api_key=None, verify_ssl=True, timeout=10):
-        super(Backend, self).__init__(base_url, api_key, verify_ssl, timeout)
+        super(Backend, self).__init__()
+        self.client = HttpClient(base_url, api_key, verify_ssl, timeout)

     def list_available_models(self) -> Optional[list]:
         try:
-            response = self.client.get("/v1/models", timeout=self.timeout)
+            response = self.client.get("/v1/models")
             print(response.json())
             if not response:
                 return None
@@ -50,7 +50,6 @@ def generate(
         response = self.client.post(
             "/v1/chat/completions",
             json=payload,
-            timeout=self.timeout,
         )
         if not response:
             return None
@@ -78,7 +77,6 @@ def embedding(self, text: str, model: str) -> Optional[List[float]]:
             "/embeddings",
             extra_headers=headers,
             json=payload,
-            timeout=self.timeout,
         )
         if not response:
             return None
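With the switch from inheritance to composition, Backend now owns an HttpClient rather than being one, and the per-request timeout lives inside the wrapped client, which is why the explicit timeout= arguments disappear from the call sites. A usage sketch (the base URL is illustrative, not part of the commit):

from llmpa.backends.localai import Backend

backend = Backend("http://localhost:8080", api_key=None, verify_ssl=True, timeout=10)
models = backend.list_available_models()  # issues GET /v1/models via the wrapped client
print(models)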
6 changes: 5 additions & 1 deletion llmpa/db/milvus.py
@@ -20,7 +20,11 @@ def list_collections(self):

     def create_collections(self):
         self.create_collection("video_embeddings", 1280)
-        self.create_collection("image_embeddings", 512)
+
+        ## embedding length of image
+        #    EfficientNetV2B0: 1280
+        #    ResNet50: 2048
+        self.create_collection("image_embeddings", 2048)
         self.create_collection("audio_embeddings", 1024)
         self.create_collection("text_embeddings", 768)
Expand Down
4 changes: 3 additions & 1 deletion llmpa/fileparser/mimetype.py
@@ -55,11 +55,13 @@ def detect(filepath: str, follow_symlinks: bool = True) -> str:


 def main():
+    import os
     import sys

     if len(sys.argv) < 2:
-        print("Usage: mimetype.py <filepath>")
+        print(f"Usage: {os.path.basename(__file__)} <filepath>")
         sys.exit(1)

     filepath = sys.argv[1]
     meme_type_real = _detect(filepath)
     meme_type = detect(filepath)
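The same usage-string pattern now appears in the model scripts above. Calling the detector from Python rather than the CLI is just (the file path and return value are illustrative):

from llmpa.fileparser.mimetype import detect

print(detect("sample.mp4"))  # e.g. "video/mp4"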
19 changes: 6 additions & 13 deletions requirements.txt
@@ -1,6 +1,6 @@
-torch==2.4.1 ## for fastai
-torchaudio==2.4.1
-torchvision==0.19.1
+torch==2.5.0
+torchaudio==2.5.0
+torchvision==0.20.0
 python-magic==0.4.27
 flask==3.0.3
 dash==2.18.1
@@ -17,21 +17,14 @@ black==24.10.0
 bcrypt==4.2.0
 uvicorn==0.32.0
 asgiref==3.8.1
-requests[security]=2.32.3
-tensorflow[and-cuda]==2.17.0
+requests[security]==2.32.3
 transformers==4.45.2
 scikit-learn==1.5.2
-#fastai==2.7.17
-# bark==0.1.5
+# fastai==2.7.17 ## need torch<2.5 and >=1.10
+bark==0.1.5
 joblib==1.4.2
 dspy-ai==2.5.12
 pgvector==0.3.5
 pytorchvideo==0.1.5
-tensorrt==10.5.0
-tensorflow-addons==0.23.0
-tensorflow-hub==0.16.1
 opencv-python==4.10.0.84
 pymilvus==2.4.8
+tensorrt==10.5.0
+nvidia-tensorrt==99.0.0
 huggingface-hub==0.25.2
