Add LAVIS (blip) example
bddppq committed Sep 14, 2023
1 parent f96f485 commit dfa3ac0
Showing 4 changed files with 314 additions and 0 deletions.

advanced/lavis/README.md (130 additions)

# LAVIS

[LAVIS](https://github.com/salesforce/LAVIS) is a Python deep learning library for LAnguage-and-VISion intelligence research and applications. It supports 10+ tasks, including retrieval, captioning, visual question answering (VQA), and multimodal classification. In this example we show how to use LAVIS for image captioning, VQA, and feature extraction on Lepton.
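
The photons in this example all use BLIP-family checkpoints. To browse the other architectures and checkpoint types that LAVIS ships with (as the comments in the photon code below suggest), you can print its model zoo; this assumes `salesforce-lavis` is installed in your local environment:

```python
from lavis.models import model_zoo

# Prints a table of the available architectures and their checkpoint types,
# e.g. the blip_caption / large_coco pair used by the captioning photon.
print(model_zoo)
```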

## Install the Lepton SDK
```shell
pip install leptonai
```

## Launch inference service locally

### Image Captioning

Run:
```shell
lep photon run -n caption -m caption.py
```

### Visual Question Answering (VQA)

Run:
```shell
lep photon run -n vqa -m vqa.py
```

### Feature Extraction

Run:
```shell
lep photon run -n extract-features -m extract-features.py
```

## Launch inference service in the cloud

As in the other examples, you can easily run the service on the Lepton Cloud Platform, e.g.:

```shell
lep photon create -n extract-features -m extract-features.py
lep photon push -n extract-features
lep photon run \
-n extract-features \
--resource-shape gpu.a10
```

You can visit [dashboard.lepton.ai](https://dashboard.lepton.ai/) to try out the model.

Note: by default, the server is protected by a token, so you won't be able to access the Gradio UI. This is by design, to provide adequate security. If you want to make the UI public, you can either add the `--public` argument to `lep photon run`, or update the deployment with:

```shell
lep deployment update -n extract-features --public
```

## Client

Once the inference service is up (either locally or in the cloud), you can use the client to access it programmatically:

```python
from leptonai.client import Client, local, current

# Use this if you are running locally
client = Client(local())
# Or, if you are already logged in to your workspace via `lep login`
# and have launched the deployment:
# client = Client(current(), "extract-features") # or "caption" for Image Captioning, or "vqa" for VQA
```

### Image Captioning
```python
image = "http://images.cocodataset.org/val2017/000000039769.jpg"
caption = client.run(image=image)
print(caption)
```

```
a couple of cats laying on top of a pink couch
```
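
The handlers accept either a URL string or an uploaded file. Here is a minimal sketch of sending a local image instead of a URL, assuming a hypothetical local file `cat.jpg`; `FileParam` comes from `leptonai.photon`, as in the photon code below:

```python
from leptonai.photon import FileParam

# Upload the image file itself instead of passing a URL.
caption = client.run(image=FileParam(open("cat.jpg", "rb")))
print(caption)
```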

### Visual Question Answering (VQA)

```python
image = "http://images.cocodataset.org/val2017/000000039769.jpg"
question = "How many cats?"
answer = client.run(image=image, question=question)
print(answer)
```

```
2
```

### Feature Extraction

```python
# image embedding
image = "http://images.cocodataset.org/val2017/000000039769.jpg"
features = client.run(image=image)
print(f"embedding dimensions: {len(features)} x {len(features[0])}")
```

```
embedding dimensions: 32 x 768
```

```python
# text embedding
text = "a large fountain spewing water into the air"
features = client.run(text=text)
print(f"embedding dimensions: {len(features)} x {len(features[0])}")
```

```
embedding dimensions: 12 x 768
```

```python
# multimodal embedding
image = "http://images.cocodataset.org/val2017/000000039769.jpg"
text = "two cats"
features = client.run(image=image, text=text)
print(f"embedding dimensions: {len(features)} x {len(features[0])}")
```

```
embedding dimensions: 32 x 768
```
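
The feature-extraction handler returns token-level embeddings: one 768-dimensional vector per query token (for images) or per text token. If you need a single fixed-size vector per input, for example to store it in a vector index, one simple option is to mean-pool over the tokens. A rough sketch, assuming `numpy` is installed:

```python
import numpy as np

features = client.run(text="a large fountain spewing water into the air")
tokens = np.array(features)   # shape: (num_tokens, 768)
pooled = tokens.mean(axis=0)  # shape: (768,)
print(pooled.shape)
```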

advanced/lavis/caption.py (51 additions)

from io import BytesIO
from typing import Union

from leptonai.photon import Photon, FileParam, get_file_content


class CaptionPhoton(Photon):
requirement_dependency = [
"salesforce-lavis",
"Pillow",
"opencv-python!=4.8.0.76",
"opencv-contrib-python!=4.8.0.76",
]

def _get_img(self, param):
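        # Decode the input (an uploaded FileParam or a URL string) into an RGB PIL image.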
from PIL import Image

content = get_file_content(param)
return Image.open(BytesIO(content)).convert("RGB")

def init(self):
import torch
from lavis.models import load_model_and_preprocess

if torch.cuda.is_available():
self.device = torch.device("cuda")
else:
self.device = torch.device("cpu")

        # Here we choose the BLIP captioning model; for other available models, please refer to:
#
# from lavis.models import model_zoo
# print(model_zoo)
#
self.model_and_preprocess = load_model_and_preprocess(
name="blip_caption",
model_type="large_coco",
is_eval=True,
device=self.device,
)

@Photon.handler(
example={"image": "http://images.cocodataset.org/val2017/000000039769.jpg"}
)
def run(self, image: Union[FileParam, str]) -> str:
model, vis_processors, _ = self.model_and_preprocess

image = self._get_img(image)
image = vis_processors["eval"](image).unsqueeze(0).to(self.device)
captions = model.generate({"image": image})
return captions[0]

advanced/lavis/extract-features.py (79 additions)

from io import BytesIO
from typing import Union, Optional, List

from leptonai.photon import Photon, FileParam, get_file_content, HTTPException


class ExtractFeaturesPhoton(Photon):
requirement_dependency = [
"salesforce-lavis",
"Pillow",
"opencv-python!=4.8.0.76",
"opencv-contrib-python!=4.8.0.76",
]

def _get_img(self, param):
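        # Decode the input (an uploaded FileParam or a URL string) into an RGB PIL image.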
from PIL import Image

content = get_file_content(param)
return Image.open(BytesIO(content)).convert("RGB")

def init(self):
import torch
from lavis.models import load_model_and_preprocess

if torch.cuda.is_available():
self.device = torch.device("cuda")
else:
self.device = torch.device("cpu")

        # Here we choose the BLIP-2 feature extractor; for other available models, please refer to:
#
# from lavis.models import model_zoo
# print(model_zoo)
#
self.model_and_preprocess = load_model_and_preprocess(
name="blip2_feature_extractor",
model_type="pretrain",
is_eval=True,
device=self.device,
)

@Photon.handler(
examples=[
{"image": "http://images.cocodataset.org/val2017/000000039769.jpg"},
{"text": "a large fountain spewing water into the air"},
{
"image": "http://images.cocodataset.org/val2017/000000039769.jpg",
"text": "two cats",
},
]
)
def run(
self, image: Optional[Union[FileParam, str]] = None, text: Optional[str] = None
    ) -> List[List[float]]:
model, vis_processors, txt_processors = self.model_and_preprocess

if image is None and text is None:
raise HTTPException(
status_code=400, detail="Either image or text should be provided."
)

if image is not None:
image = self._get_img(image)
image = vis_processors["eval"](image).unsqueeze(0).to(self.device)
if text is not None:
text = txt_processors["eval"](text)

if image is not None and text is None:
# image embedding
features = model.extract_features({"image": image}, mode="image")
return features.image_embeds[0].tolist()
elif image is None and text is not None:
# text embedding
features = model.extract_features({"text_input": [text]}, mode="text")
return features.text_embeds[0].tolist()
else:
# multimodal embedding
features = model.extract_features({"image": image, "text_input": [text]})
return features.multimodal_embeds[0].tolist()

advanced/lavis/vqa.py (54 additions)

from io import BytesIO
from typing import Union

from leptonai.photon import Photon, FileParam, get_file_content


class VQAPhoton(Photon):
requirement_dependency = [
"salesforce-lavis",
"Pillow",
"opencv-python!=4.8.0.76",
"opencv-contrib-python!=4.8.0.76",
]

def _get_img(self, param):
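        # Decode the input (an uploaded FileParam or a URL string) into an RGB PIL image.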
from PIL import Image

content = get_file_content(param)
return Image.open(BytesIO(content)).convert("RGB")

def init(self):
import torch
from lavis.models import load_model_and_preprocess

if torch.cuda.is_available():
self.device = torch.device("cuda")
else:
self.device = torch.device("cpu")

        # Here we choose the BLIP VQA model; for other available models, please refer to:
#
# from lavis.models import model_zoo
# print(model_zoo)
#
self.model_and_preprocess = load_model_and_preprocess(
name="blip_vqa", model_type="vqav2", is_eval=True, device=self.device
)

@Photon.handler(
example={
"image": "http://images.cocodataset.org/val2017/000000039769.jpg",
"question": "How many cats?",
}
)
def run(self, image: Union[FileParam, str], question: str) -> str:
model, vis_processors, txt_processors = self.model_and_preprocess
image = self._get_img(image)
image = vis_processors["eval"](image).unsqueeze(0).to(self.device)
question = txt_processors["eval"](question)
answers = model.predict_answers(
samples={"image": image, "text_input": question},
inference_method="generate",
)
return answers[0]
