Merge branch 'release/0.2.1'

pixano · Nov 13, 2023 · aaf6761 · aaf6761
2 parents 8ddec6d + 1887bc4
commit aaf6761
Show file tree

Hide file tree

Showing 6 changed files with 154 additions and 5 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,12 +6,22 @@ All notable changes to Pixano will be documented in this file.
 
 ## [Unreleased]
 
+
+
+## [0.2.1] - 2023-11-13
+
+### Added
+- Add CLIP model for **semantic search** on images
+
+
+
 ## [0.2.0] - 2023-10-26
 
 ### Changed
 - **Breaking:** Update models to the new **PixanoTypes** and **lancedb storage format** of Pixano 0.4.0
 
 
+
 ## [0.1.6] - 2023-07-10
 
 ### Added
@@ -81,7 +91,8 @@ All notable changes to Pixano will be documented in this file.
 
 
 
-[Unreleased]: https://github.com/pixano/pixano-inference/compare/v0.2.0...develop
+[Unreleased]: https://github.com/pixano/pixano-inference/compare/main...develop
+[0.2.1]: https://github.com/pixano/pixano-inference/compare/v0.2.0...v0.2.1
 [0.2.0]: https://github.com/pixano/pixano-inference/compare/v0.1.6...v0.2.0
 [0.1.6]: https://github.com/pixano/pixano-inference/compare/v0.1.5...v0.1.6
 [0.1.5]: https://github.com/pixano/pixano-inference/compare/v0.1.4...v0.1.5

diff --git a/pixano_inference/__version__.py b/pixano_inference/__version__.py
@@ -11,4 +11,4 @@
 #
 # http://www.cecill.info
 
-__version__ = "0.2.0"
+__version__ = "0.2.1"
diff --git a/pixano_inference/segment_anything/segment_anything.py b/pixano_inference/segment_anything/segment_anything.py
@@ -37,6 +37,7 @@ class SAM(InferenceModel):
         device (str): Model GPU or CPU device (e.g. "cuda", "cpu")
         description (str): Model description
         model (torch.nn.Module): SAM model
+        checkpoint_path (Path): Model checkpoint path
     """
 
     def __init__(

diff --git a/pixano_inference/transformers/__init__.py b/pixano_inference/transformers/__init__.py
@@ -0,0 +1,18 @@
+# @Copyright: CEA-LIST/DIASI/SIALV/LVA (2023)
+# @Author: CEA-LIST/DIASI/SIALV/LVA <[email protected]>
+# @License: CECILL-C
+#
+# This software is a collaborative computer program whose purpose is to
+# generate and explore labeled data for computer vision applications.
+# This software is governed by the CeCILL-C license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL-C
+# license as circulated by CEA, CNRS and INRIA at the following URL
+#
+# http://www.cecill.info
+
+from .clip import CLIP
+
+__all__ = [
+    "CLIP",  # CLIP model wrapper, re-exported from .clip
+]
diff --git a/pixano_inference/transformers/clip.py b/pixano_inference/transformers/clip.py
@@ -0,0 +1,118 @@
+# @Copyright: CEA-LIST/DIASI/SIALV/LVA (2023)
+# @Author: CEA-LIST/DIASI/SIALV/LVA <[email protected]>
+# @License: CECILL-C
+#
+# This software is a collaborative computer program whose purpose is to
+# generate and explore labeled data for computer vision applications.
+# This software is governed by the CeCILL-C license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL-C
+# license as circulated by CEA, CNRS and INRIA at the following URL
+#
+# http://www.cecill.info
+
+
+import numpy as np
+import pyarrow as pa
+import torch
+from pixano.core import Image
+from pixano.models import InferenceModel
+from transformers import CLIPModel, CLIPProcessor, CLIPTokenizerFast
+
+
+class CLIP(InferenceModel):
+    """CLIP: Connecting text and images
+
+    Wraps a HuggingFace Transformers CLIP checkpoint to embed dataset images
+    and text queries for semantic search.
+
+    Attributes:
+        name (str): Model name
+        id (str): Model ID
+        device (str): Model GPU or CPU device (e.g. "cuda", "cpu")
+        description (str): Model description
+        model (CLIPModel): CLIP model
+        processor (CLIPProcessor): CLIP processor
+        tokenizer (CLIPTokenizerFast): CLIP tokenizer
+        pretrained_model (str): Pretrained model name or path
+    """
+
+    def __init__(
+        self,
+        pretrained_model: str = "openai/clip-vit-base-patch32",
+        id: str = "",
+    ) -> None:
+        """Initialize model
+
+        Args:
+            pretrained_model (str): Pretrained model name or path
+            id (str, optional): Previously used ID, generate new ID if "". Defaults to "".
+        """
+        super().__init__(
+            name="CLIP",
+            id=id,
+            device="cpu",
+            description=f"From HuggingFace Transformers. CLIP: Connecting text and images. {pretrained_model}.",
+        )
+
+        # Model, image processor and text tokenizer all come from the same checkpoint
+        self.model = CLIPModel.from_pretrained(pretrained_model)
+        self.processor = CLIPProcessor.from_pretrained(pretrained_model)
+        self.tokenizer = CLIPTokenizerFast.from_pretrained(pretrained_model)
+
+        # Model name or path
+        self.pretrained_model = pretrained_model
+
+    def precompute_embeddings(
+        self,
+        batch: pa.RecordBatch,
+        views: list[str],
+        uri_prefix: str,
+    ) -> list[dict]:
+        """Embedding precomputing for a batch
+
+        Args:
+            batch (pa.RecordBatch): Input batch
+            views (list[str]): Dataset views
+            uri_prefix (str): URI prefix for media files
+
+        Returns:
+            list[dict]: Embedding rows, one per batch row, holding the row "id"
+                and one image feature vector per view
+        """
+        rows = [{"id": batch["id"][x].as_py()} for x in range(batch.num_rows)]
+
+        # Inference only: skip autograd bookkeeping
+        with torch.no_grad():
+            for view in views:
+                # Iterate manually
+                for x in range(batch.num_rows):
+                    # Preprocess image
+                    im = Image.from_dict(batch[view][x].as_py())
+                    im.uri_prefix = uri_prefix
+                    im = im.as_pillow()
+
+                    # Inference (a single image needs no padding option)
+                    inputs = self.processor(images=im, return_tensors="pt")
+                    image_features = self.model.get_image_features(**inputs)
+
+                    # Process model outputs: keep the only vector of the batch of one
+                    rows[x][view] = image_features.detach().numpy()[0]
+
+        return rows
+
+    def semantic_search(self, query: str) -> np.ndarray:
+        """Process semantic search query with CLIP
+
+        Args:
+            query (str): Search query text
+
+        Returns:
+            np.ndarray: Search query vector
+        """
+        inputs = self.tokenizer([query], padding=True, return_tensors="pt")
+        # Inference only: skip autograd bookkeeping
+        with torch.no_grad():
+            text_features = self.model.get_text_features(**inputs)
+
+        return text_features.detach().numpy()[0]
diff --git a/pyproject.toml b/pyproject.toml
@@ -23,13 +23,14 @@ classifiers = [
   "License :: CeCILL-C Free Software License Agreement (CECILL-C)",
 ]
 dependencies = [
-  "pixano >= 0.2.0",
+  "pixano ~= 0.4.0",
   "torch >= 2.0.0",
   "torchaudio >= 2.0.0",
   "torchvision >= 0.15.0",
   "tensorflow >= 2.12.0",
   "tensorflow-hub >= 0.13.0",
   "segment-anything@git+https://github.com/facebookresearch/segment-anything",
+  "transformers >= 4.33.0",
   "gitpython >= 3.1.30",
   "matplotlib >= 3.3",
   "psutil",
@@ -42,8 +43,8 @@ dependencies = [
 
 [project.optional-dependencies]
 documentation = [
-  "mkdocs-material ~= 9.3.0",
-  "mkdocstrings-python ~= 1.6.0",
+  "mkdocs-material ~= 9.4.0",
+  "mkdocstrings-python ~= 1.7.0",
   "mkdocs-gen-files ~= 0.5.0",
   "mkdocs-literate-nav ~= 0.6.0",
 ]