# LAVIS

[LAVIS](https://github.com/salesforce/LAVIS) is a Python deep learning library for LAnguage-and-VISion intelligence research and applications. It supports 10+ tasks such as retrieval, captioning, visual question answering (VQA), and multimodal classification. In this example we show how to use LAVIS on Lepton for image captioning, VQA, and feature extraction.

## Install the Lepton SDK

```shell
pip install leptonai
```
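
If you want to confirm which SDK version you got, here is a small sanity check (a sketch using the standard library's `importlib.metadata`; the distribution name `leptonai` comes from the install command above):

```python
# Print the installed leptonai version using only the standard library.
from importlib.metadata import version

print(version("leptonai"))
```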

## Launch the inference service locally

### Image Captioning

```shell
lep photon run -n caption -m caption.py
```

### Visual Question Answering (VQA)

```shell
lep photon run -n vqa -m vqa.py
```

### Feature Extraction

```shell
lep photon run -n extract-features -m extract-features.py
```

## Launch the inference service in the cloud

As with the other examples, you can easily run services on the Lepton Cloud Platform, e.g.:

```shell
lep photon create -n extract-features -m extract-features.py
lep photon push -n extract-features
lep photon run \
    -n extract-features \
    --resource-shape gpu.a10
```

You can visit [dashboard.lepton.ai](https://dashboard.lepton.ai/) to try out the model.

Note: by default, the server is protected via a token, so you won't be able to access the Gradio UI. This is by design to provide adequate security. If you want to make the UI public, you can either add the `--public` argument to `lep photon run`, or update the deployment with:

```shell
lep deployment update -n extract-features --public
```

## Client

Once the inference service is up (either locally or in the cloud), you can use the client to access it programmatically:

```python
from leptonai.client import Client, local, current

# Use this if you are running locally
client = Client(local())
# Or, if you are already logged in to your workspace via `lep login`
# and have launched the deployment:
# client = Client(current(), "extract-features")  # or "caption" for Image Captioning, or "vqa" for VQA
```
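
Besides URLs, the handlers also accept uploaded files (their `image` parameter is typed `Union[FileParam, str]` in the photon code below). A minimal sketch, assuming `FileParam` can wrap an open file object; the file path is made up for illustration:

```python
from leptonai.photon import FileParam

# Hypothetical local image; replace with a real path on your machine.
with open("cats.jpg", "rb") as f:
    caption = client.run(image=FileParam(f))
print(caption)
```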

### Image Captioning

```python
image = "http://images.cocodataset.org/val2017/000000039769.jpg"
caption = client.run(image=image)

print(caption)
```

```
a couple of cats laying on top of a pink couch
```

### Visual Question Answering (VQA)

```python
image = "http://images.cocodataset.org/val2017/000000039769.jpg"
question = "How many cats?"
answer = client.run(image=image, question=question)

print(answer)
```

```
2
```
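
The same call works for follow-up questions about the same image; a small sketch (the second question is made up for illustration):

```python
image = "http://images.cocodataset.org/val2017/000000039769.jpg"

# Ask a few questions about the same image.
for question in ["How many cats?", "What are the cats lying on?"]:
    print(question, "->", client.run(image=image, question=question))
```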

### Feature Extraction

```python
# image embedding
image = "http://images.cocodataset.org/val2017/000000039769.jpg"
features = client.run(image=image)

print(f"embedding dimensions: {len(features)} x {len(features[0])}")
```

```
embedding dimensions: 32 x 768
```

(The 32 rows correspond to the 32 learned query tokens of BLIP-2's Q-Former; text embeddings below have one 768-dimensional row per token.)

```python
# text embedding
text = "a large fountain spewing water into the air"
features = client.run(text=text)

print(f"embedding dimensions: {len(features)} x {len(features[0])}")
```

```
embedding dimensions: 12 x 768
```

```python
# multimodal embedding
image = "http://images.cocodataset.org/val2017/000000039769.jpg"
text = "two cats"
features = client.run(image=image, text=text)

print(f"embedding dimensions: {len(features)} x {len(features[0])}")
```

```
embedding dimensions: 32 x 768
```
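
If you want to compare embeddings client-side, one rough heuristic is to mean-pool the per-token embeddings and take a cosine similarity. A sketch using `numpy`; note that LAVIS itself scores image-text matches with separately projected embeddings (see the note after `extract-features.py` below), so treat this as illustrative only:

```python
import numpy as np

image = "http://images.cocodataset.org/val2017/000000039769.jpg"
text = "two cats"

# Mean-pool the per-token embeddings into single 768-d vectors.
img_vec = np.asarray(client.run(image=image)).mean(axis=0)
txt_vec = np.asarray(client.run(text=text)).mean(axis=0)

# Cosine similarity between the pooled vectors.
cos = img_vec @ txt_vec / (np.linalg.norm(img_vec) * np.linalg.norm(txt_vec))
print(f"cosine similarity: {cos:.3f}")
```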
## caption.py
```python
from io import BytesIO
from typing import Union

from leptonai.photon import Photon, FileParam, get_file_content


class CaptionPhoton(Photon):
    requirement_dependency = [
        "salesforce-lavis",
        "Pillow",
        "opencv-python!=4.8.0.76",
        "opencv-contrib-python!=4.8.0.76",
    ]

    def _get_img(self, param):
        from PIL import Image

        content = get_file_content(param)
        return Image.open(BytesIO(content)).convert("RGB")

    def init(self):
        import torch
        from lavis.models import load_model_and_preprocess

        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        # Here we choose the blip model; for other available models, please refer to:
        #
        #   from lavis.models import model_zoo
        #   print(model_zoo)
        #
        self.model_and_preprocess = load_model_and_preprocess(
            name="blip_caption",
            model_type="large_coco",
            is_eval=True,
            device=self.device,
        )

    @Photon.handler(
        example={"image": "http://images.cocodataset.org/val2017/000000039769.jpg"}
    )
    def run(self, image: Union[FileParam, str]) -> str:
        model, vis_processors, _ = self.model_and_preprocess

        image = self._get_img(image)
        image = vis_processors["eval"](image).unsqueeze(0).to(self.device)
        captions = model.generate({"image": image})
        return captions[0]
```
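
For a quick smoke test without starting a server, you can also drive the photon class directly. A minimal sketch, assuming the class can be instantiated without arguments and `init()` called by hand (the supported path is `lep photon run` as shown in the README above):

```python
# Hedged sketch: exercise CaptionPhoton in-process, without HTTP.
from caption import CaptionPhoton

photon = CaptionPhoton()
photon.init()  # downloads and loads the BLIP captioning model
print(photon.run(image="http://images.cocodataset.org/val2017/000000039769.jpg"))
```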
## extract-features.py
```python
from io import BytesIO
from typing import Union, Optional, List

from leptonai.photon import Photon, FileParam, get_file_content, HTTPException


class ExtractFeaturesPhoton(Photon):
    requirement_dependency = [
        "salesforce-lavis",
        "Pillow",
        "opencv-python!=4.8.0.76",
        "opencv-contrib-python!=4.8.0.76",
    ]

    def _get_img(self, param):
        from PIL import Image

        content = get_file_content(param)
        return Image.open(BytesIO(content)).convert("RGB")

    def init(self):
        import torch
        from lavis.models import load_model_and_preprocess

        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        # Here we choose the blip2 model; for other available models, please refer to:
        #
        #   from lavis.models import model_zoo
        #   print(model_zoo)
        #
        self.model_and_preprocess = load_model_and_preprocess(
            name="blip2_feature_extractor",
            model_type="pretrain",
            is_eval=True,
            device=self.device,
        )

    @Photon.handler(
        examples=[
            {"image": "http://images.cocodataset.org/val2017/000000039769.jpg"},
            {"text": "a large fountain spewing water into the air"},
            {
                "image": "http://images.cocodataset.org/val2017/000000039769.jpg",
                "text": "two cats",
            },
        ]
    )
    def run(
        self, image: Optional[Union[FileParam, str]] = None, text: Optional[str] = None
    ) -> List[List[float]]:
        model, vis_processors, txt_processors = self.model_and_preprocess

        if image is None and text is None:
            raise HTTPException(
                status_code=400, detail="Either image or text should be provided."
            )

        if image is not None:
            image = self._get_img(image)
            image = vis_processors["eval"](image).unsqueeze(0).to(self.device)
        if text is not None:
            text = txt_processors["eval"](text)

        if image is not None and text is None:
            # image embedding
            features = model.extract_features({"image": image}, mode="image")
            return features.image_embeds[0].tolist()
        elif image is None and text is not None:
            # text embedding
            features = model.extract_features({"text_input": [text]}, mode="text")
            return features.text_embeds[0].tolist()
        else:
            # multimodal embedding
            features = model.extract_features({"image": image, "text_input": [text]})
            return features.multimodal_embeds[0].tolist()
```
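
The handler above returns the full 768-dimensional per-token embeddings. LAVIS's feature extractors also expose lower-dimensional projected embeddings meant for image-text matching; here is a hedged sketch of an ITC-style similarity, reusing `model`, `image`, and `text` as prepared in `run` above (the `*_embeds_proj` attribute names follow LAVIS's documentation, so double-check them against your LAVIS version):

```python
# Hedged sketch: image-text similarity from projected embeddings.
feats_img = model.extract_features({"image": image}, mode="image")
feats_txt = model.extract_features({"text_input": [text]}, mode="text")

# image_embeds_proj: [1, 32, 256]; text_embeds_proj: [1, n_tokens, 256]
sim = (feats_img.image_embeds_proj @ feats_txt.text_embeds_proj[:, 0, :].t()).max()
print(f"image-text similarity: {sim.item():.3f}")
```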
## vqa.py
```python
from io import BytesIO
from typing import Union

from leptonai.photon import Photon, FileParam, get_file_content


class VQAPhoton(Photon):
    requirement_dependency = [
        "salesforce-lavis",
        "Pillow",
        "opencv-python!=4.8.0.76",
        "opencv-contrib-python!=4.8.0.76",
    ]

    def _get_img(self, param):
        from PIL import Image

        content = get_file_content(param)
        return Image.open(BytesIO(content)).convert("RGB")

    def init(self):
        import torch
        from lavis.models import load_model_and_preprocess

        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        # Here we choose the blip model; for other available models, please refer to:
        #
        #   from lavis.models import model_zoo
        #   print(model_zoo)
        #
        self.model_and_preprocess = load_model_and_preprocess(
            name="blip_vqa", model_type="vqav2", is_eval=True, device=self.device
        )

    @Photon.handler(
        example={
            "image": "http://images.cocodataset.org/val2017/000000039769.jpg",
            "question": "How many cats?",
        }
    )
    def run(self, image: Union[FileParam, str], question: str) -> str:
        model, vis_processors, txt_processors = self.model_and_preprocess
        image = self._get_img(image)
        image = vis_processors["eval"](image).unsqueeze(0).to(self.device)
        question = txt_processors["eval"](question)
        answers = model.predict_answers(
            samples={"image": image, "text_input": question},
            inference_method="generate",
        )
        return answers[0]
```
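
All three photons pin a specific checkpoint via `load_model_and_preprocess`. As the comments in each `init()` suggest, you can list the other models LAVIS ships before swapping in a different `name`/`model_type`:

```python
# Print the table of model names and types bundled with LAVIS.
from lavis.models import model_zoo

print(model_zoo)
```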