diff --git a/Makefile b/Makefile
index 6e87a399a3..4b78f88edc 100644
--- a/Makefile
+++ b/Makefile
@@ -71,6 +71,11 @@ slow_tests_text_generation_example: test_installs
 slow_tests_image_to_text_example: test_installs
 	python -m pytest tests/test_image_to_text_example.py -v -s --token $(TOKEN)
 
+# Run visual question answering tests
+slow_tests_openclip_vqa_example: test_installs
+	python -m pip install -r examples/visual-question-answering/openclip_requirements.txt
+	python -m pytest tests/test_openclip_vqa.py
+
 slow_tests_fsdp: test_installs
 	python -m pytest tests/test_fsdp_examples.py -v -s --token $(TOKEN)
 
diff --git a/examples/visual-question-answering/README.md b/examples/visual-question-answering/README.md
index efbe1f2a92..ed4d2a8a66 100644
--- a/examples/visual-question-answering/README.md
+++ b/examples/visual-question-answering/README.md
@@ -16,10 +16,10 @@ limitations under the License.
 
 # Visual Question Answering Examples
 
-This directory contains a script that showcases how to use the Transformers pipeline API to run visual question answering task on HPUs.
-
 ## Single-HPU inference
 
+The `run_pipeline.py` script showcases how to use the Transformers pipeline API to run the visual question answering task on HPUs.
+
 ```bash
 python3 run_pipeline.py \
     --model_name_or_path Salesforce/blip-vqa-capfilt-large \
@@ -32,4 +32,37 @@ python3 run_pipeline.py \
 Models that have been validated:
   - [Salesforce/blip-vqa-base](https://huggingface.co/Salesforce/blip-vqa-base)
   - [dandelin/vilt-b32-finetuned-vqa](https://huggingface.co/dandelin/vilt-b32-finetuned-vqa)
-  - [Salesforce/blip-vqa-capfilt-large](https://huggingface.co/Salesforce/blip-vqa-capfilt-large)
\ No newline at end of file
+  - [Salesforce/blip-vqa-capfilt-large](https://huggingface.co/Salesforce/blip-vqa-capfilt-large)
+
+## OpenCLIP inference
+
+The `run_openclip_vqa.py` script can be used to run zero-shot image classification with [OpenCLIP models hosted on Hugging Face](https://huggingface.co/docs/hub/en/open_clip#using-openclip-at-hugging-face).
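+
+Under the hood, the script follows the standard OpenCLIP zero-shot classification flow (plus the Gaudi-specific handling it adds on top). As a rough, illustrative sketch of that flow only, where the model name, image URL and labels are taken from the examples below and are not the script's defaults:
+
+```python
+import torch
+from urllib.request import urlopen
+
+from open_clip import create_model_from_pretrained, get_tokenizer
+from PIL import Image
+
+model_id = "hf-hub:laion/CLIP-ViT-g-14-laion2B-s12B-b42K"
+model, preprocess = create_model_from_pretrained(model_id)
+tokenizer = get_tokenizer(model_id)
+model.eval()
+
+# One image and two candidate labels, as in the example command further down
+image = preprocess(Image.open(urlopen("http://images.cocodataset.org/val2017/000000039769.jpg"))).unsqueeze(0)
+texts = tokenizer(["this is a picture of a dog", "this is a picture of a cat"])
+
+with torch.no_grad():
+    # OpenCLIP returns image features, text features and the logit scale
+    image_features, text_features, logit_scale = model(image, texts)
+    probs = (logit_scale * image_features @ text_features.t()).softmax(dim=-1)
+print(probs)
+```
+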
+The requirements for `run_openclip_vqa.py` can be installed with `openclip_requirements.txt` as follows:
+
+```bash
+pip install -r openclip_requirements.txt
+```
+
+By default, the script runs the sample outlined in the [BiomedCLIP-PubMedBERT_256-vit_base_patch16_224 notebook](https://huggingface.co/microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224/blob/main/biomed_clip_example.ipynb), which can be run as follows:
+
+```bash
+python run_openclip_vqa.py \
+    --use_hpu_graphs \
+    --bf16
+```
+
+One can also run other OpenCLIP models by specifying the model name, classifier labels and image URL(s), like so:
+
+```bash
+python run_openclip_vqa.py \
+    --model_name_or_path laion/CLIP-ViT-g-14-laion2B-s12B-b42K \
+    --labels "a dog" "a cat" \
+    --image_path "http://images.cocodataset.org/val2017/000000039769.jpg" \
+    --use_hpu_graphs \
+    --bf16
+```
+
+Models that have been validated:
+  - [BiomedCLIP-PubMedBERT_256-vit_base_patch16_224](https://huggingface.co/microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224)
+  - [laion/CLIP-ViT-g-14-laion2B-s12B-b42K](https://huggingface.co/laion/CLIP-ViT-g-14-laion2B-s12B-b42K)
+  - [apple/DFN5B-CLIP-ViT-H-14](https://huggingface.co/apple/DFN5B-CLIP-ViT-H-14/tree/main)
\ No newline at end of file
diff --git a/examples/visual-question-answering/openclip_requirements.txt b/examples/visual-question-answering/openclip_requirements.txt
new file mode 100644
index 0000000000..c132e5eb90
--- /dev/null
+++ b/examples/visual-question-answering/openclip_requirements.txt
@@ -0,0 +1,3 @@
+open_clip_torch==2.23.0
+matplotlib
+
diff --git a/examples/visual-question-answering/run_openclip_vqa.py b/examples/visual-question-answering/run_openclip_vqa.py
new file mode 100644
index 0000000000..76b4159149
--- /dev/null
+++ b/examples/visual-question-answering/run_openclip_vqa.py
@@ -0,0 +1,232 @@
+# This script is based on https://huggingface.co/microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224/blob/main/biomed_clip_example.ipynb
+import argparse
+import json
+import logging
+import os
+import time
+from pathlib import Path
+from pprint import pprint
+from urllib.request import urlopen
+
+import matplotlib.pyplot as plt
+import numpy
+import torch
+from open_clip import create_model_from_pretrained, get_tokenizer
+from PIL import Image
+
+from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
+
+
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+    datefmt="%m/%d/%Y %H:%M:%S",
+    level=logging.INFO,
+)
+logger = logging.getLogger(__name__)
+
+DATASET_URL = "https://huggingface.co/microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224/resolve/main/example_data/biomed_image_classification_example_data/"
+LABELS = [
+    "adenocarcinoma histopathology",
+    "brain MRI",
+    "covid line chart",
+    "squamous cell carcinoma histopathology",
+    "immunohistochemistry histopathology",
+    "bone X-ray",
+    "chest X-ray",
+    "pie chart",
+    "hematoxylin and eosin histopathology",
+]
+
+TEST_IMGS = [
+    "squamous_cell_carcinoma_histopathology.jpeg",
+    "H_and_E_histopathology.jpg",
+    "bone_X-ray.jpg",
+    "adenocarcinoma_histopathology.jpg",
+    "covid_line_chart.png",
+    "IHC_histopathology.jpg",
+    "chest_X-ray.jpg",
+    "brain_MRI.jpg",
+    "pie_chart.png",
+]
+
+
+def plot_images_with_metadata(images: list, metadata: list, output_dir: str, plot_name: str) -> None:
+    num_images = len(images)
+    fig, axes = plt.subplots(nrows=num_images, ncols=1, figsize=(5, 5 * num_images))
+
+    for i, (img_path, meta) in enumerate(zip(images, metadata)):
+        img = Image.open(urlopen(img_path))
+        # plt.subplots returns a single Axes object when num_images == 1 and an array otherwise
+        if isinstance(axes, (list, numpy.ndarray)):
+            ax = axes[i]
+        else:
+            ax = axes
+        ax.imshow(img)
+        ax.axis("off")
+        ax.set_title(f"{meta['filename']}\n{meta['top_probs']}", fontsize=14)
+
+    plt.tight_layout()
+    plt.savefig(f"{output_dir}/{plot_name}.png")
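+
+
+# Note on scoring: CLIP-style zero-shot classification ranks the labels by the softmax of the
+# scaled similarity between the image embedding and each text embedding, i.e.
+# softmax(logit_scale * image_features @ text_features.T) along the label dimension,
+# which is what run_qa below computes.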
+def run_qa(model: torch.nn.Module, images: torch.Tensor, texts: torch.Tensor, device: torch.device) -> tuple:
+    with torch.no_grad():
+        image_features, text_features, logit_scale = model(images, texts)
+        logits = (logit_scale * image_features @ text_features.t()).detach().softmax(dim=-1)
+        sorted_indices = torch.argsort(logits, dim=-1, descending=True)
+    return sorted_indices, logits
+
+
+def postprocess(args: argparse.Namespace, sorted_indices: torch.Tensor, logits: torch.Tensor, topk: int) -> list:
+    logits = logits.float().cpu().numpy()
+    sorted_indices = sorted_indices.int().cpu().numpy()
+    metadata_list = []
+    for i, img in enumerate(args.image_path):
+        img_name = img.split("/")[-1]
+
+        top_probs = []
+        topk = len(args.labels) if topk == -1 else topk
+        for j in range(topk):
+            jth_index = sorted_indices[i][j]
+            top_probs.append(f"{args.labels[jth_index]}: {logits[i][jth_index] * 100:.1f}")
+
+        metadata = {"filename": img_name, "top_probs": "\n".join(top_probs)}
+        metadata_list.append(metadata)
+    return metadata_list
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--model_name_or_path",
+        default="microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224",
+        type=str,
+        help="Path to the pre-trained model",
+    )
+    parser.add_argument(
+        "--image_path",
+        default=[DATASET_URL + img for img in TEST_IMGS],
+        type=str,
+        nargs="*",
+        help='Path to image as input. Can be a single string (eg: --image_path "URL1"), or a list of space-separated strings (eg: --image_path "URL1" "URL2")',
+    )
+    parser.add_argument(
+        "--topk",
+        default=1,
+        type=int,
+        help="Report the top k label probabilities for each image (use -1 to report all labels).",
+    )
+    parser.add_argument(
+        "--prompt",
+        default="this is a picture of ",
+        type=str,
+        help='Prompt prepended to each label for classification (eg: --prompt "a photo of ")',
+    )
+    parser.add_argument(
+        "--labels",
+        default=LABELS,
+        type=str,
+        nargs="*",
+        help='Labels for classification. Can be a single string (eg: --labels "LABEL1"), or a list of space-separated strings (eg: --labels "LABEL1" "LABEL2")',
+    )
+    parser.add_argument(
+        "--use_hpu_graphs",
+        action="store_true",
+        help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.",
+    )
+    parser.add_argument(
+        "--bf16",
+        action="store_true",
+        help="Whether to run inference in bf16 precision.",
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=os.getcwd(),
+        type=str,
+        help="Output directory to store results in.",
+    )
+    parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations for benchmarking.")
+    parser.add_argument(
+        "--n_iterations", type=int, default=10, help="Number of inference iterations for benchmarking."
+    )
+    parser.add_argument("--plot_images", action="store_true", help="Plot images with metadata for verification")
+    parser.add_argument(
+        "--plot_name",
+        default="openclip_vqa_plot",
+        type=str,
+        help="Name of the plot generated with the image and corresponding top K results",
+    )
+    parser.add_argument(
+        "--print_result",
+        action="store_true",
+        help="Whether to print the zero-shot classification results.",
+    )
+
+    args = parser.parse_args()
+
+    adapt_transformers_to_gaudi()
+
+    precision = "fp32"
+    dtype = torch.float32
+    if args.bf16:
+        precision = "bf16"
+        dtype = torch.bfloat16
+
+    model, preprocess = create_model_from_pretrained(f"hf-hub:{args.model_name_or_path}", precision=precision)
+    tokenizer = get_tokenizer(f"hf-hub:{args.model_name_or_path}")
+
+    device = torch.device("hpu") if torch.hpu.is_available() else torch.device("cpu")
+    device_type = "hpu" if torch.hpu.is_available() else "cpu"
+
+    # Initialize model
+    if args.use_hpu_graphs:
+        from habana_frameworks.torch.hpu import wrap_in_hpu_graph
+
+        model = wrap_in_hpu_graph(model)
+    model = model.to(device)
+    model.eval()
+
+    images = torch.stack([preprocess(Image.open(urlopen(img))) for img in args.image_path]).to(device)
+    texts = tokenizer([args.prompt + l for l in args.labels]).to(device)
+
+    # Warm up
+    logger.info("Running warmup")
+    for i in range(args.warmup):
+        with torch.autocast(device_type=device_type, dtype=dtype, enabled=True):
+            _, _ = run_qa(model, images, texts, device=device)
+
+    logger.info("Running inference")
+    start = time.time()
+    for i in range(args.n_iterations):
+        with torch.autocast(device_type=device_type, dtype=dtype, enabled=True):
+            sorted_indices, logits = run_qa(model, images, texts, device=device)
+    end = time.time()
+
+    # Results and metrics
+    metadata_list = postprocess(args, sorted_indices, logits, args.topk)
+    if args.print_result:
+        logger.info("Results from the last iteration:")
+        pprint(metadata_list)
+    inference_time_per_iteration = (end - start) * 1000 / args.n_iterations
+    logger.info(f"Inference Time per iteration = {inference_time_per_iteration:.4}ms")
+    throughput = len(args.image_path) * args.n_iterations / (end - start)
+    logger.info(f"Throughput = {throughput:.4} images/s")
+
+    # Store results if necessary
+    if args.output_dir is not None:
+        output_dir = Path(args.output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        results = {"throughput": throughput, "inference time per iteration": inference_time_per_iteration}
+        with (output_dir / "results.json").open("w", encoding="utf-8") as f:
+            json.dump(results, f, ensure_ascii=False, indent=4)
+    if args.plot_images:
+        plot_images_with_metadata(args.image_path, metadata_list, args.output_dir, args.plot_name)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_openclip_vqa.py b/tests/test_openclip_vqa.py
new file mode 100644
index 0000000000..c0c3d38521
--- /dev/null
+++ b/tests/test_openclip_vqa.py
@@ -0,0 +1,81 @@
+import json
+import os
+import re
+import subprocess
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+import pytest
+
+from .test_examples import TIME_PERF_FACTOR
+
+
+if os.environ.get("GAUDI2_CI", "0") == "1":
+    # Gaudi2 CI baselines
+    MODELS_TO_TEST = {
+        "bf16": [
+            ("laion/CLIP-ViT-g-14-laion2B-s12B-b42K", 1472),
+            ("microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224", 1816),
+        ],
+    }
+else:
+    # Gaudi1 CI baselines
+    MODELS_TO_TEST = {
+        "bf16": [
+            ("laion/CLIP-ViT-g-14-laion2B-s12B-b42K", 550),
+            ("microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224", 1200),
+        ],
+    }
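+
+# The baselines above are reference throughputs in images/s for the respective CI machines;
+# each test below checks the measured throughput against its baseline with the tolerance
+# defined by TIME_PERF_FACTOR.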
+
+
+def _install_requirements():
+    PATH_TO_EXAMPLE_DIR = Path(__file__).resolve().parent.parent / "examples"
+    cmd_line = (
+        f"pip install -r {PATH_TO_EXAMPLE_DIR / 'visual-question-answering' / 'openclip_requirements.txt'}".split()
+    )
+    p = subprocess.Popen(cmd_line)
+    return_code = p.wait()
+    assert return_code == 0
+
+
+def _test_openclip_vqa(model_name: str, baseline: float):
+    _install_requirements()
+    command = ["python3"]
+    path_to_example_dir = Path(__file__).resolve().parent.parent / "examples"
+    env_variables = os.environ.copy()
+
+    command += [
+        f"{path_to_example_dir / 'visual-question-answering' / 'run_openclip_vqa.py'}",
+        f"--model_name_or_path {model_name}",
+        "--bf16",
+        "--use_hpu_graphs",
+    ]
+
+    with TemporaryDirectory() as tmp_dir:
+        command.append(f"--output_dir {tmp_dir}")
+        print(f"\n\nCommand to test: {' '.join(command)}\n")
+
+        pattern = re.compile(r"([\"\'].+?[\"\'])|\s")
+        command = [x for y in command for x in re.split(pattern, y) if x]
+
+        proc = subprocess.run(command, env=env_variables)
+
+        # Ensure the run finished without any issue
+        # Use try-except to avoid logging the token if used
+        try:
+            assert proc.returncode == 0
+        except AssertionError as e:
+            if "'--token', 'hf_" in e.args[0]:
+                e.args = (f"The following command failed:\n{' '.join(command[:-2])}",)
+            raise
+
+        with open(Path(tmp_dir) / "results.json") as fp:
+            results = json.load(fp)
+
+        # Ensure performance requirements (throughput) are met
+        assert results["throughput"] >= (2 - TIME_PERF_FACTOR) * baseline
+
+
+@pytest.mark.parametrize("model_name, baseline", MODELS_TO_TEST["bf16"])
+def test_openclip_vqa_bf16(model_name: str, baseline: float):
+    _test_openclip_vqa(model_name, baseline)
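+
+
+# Usage note: these tests are wired into the slow-test flow through the new Makefile target,
+# which installs the example requirements before running pytest:
+#   make slow_tests_openclip_vqa_example
+# They can also be launched directly (the test installs openclip_requirements.txt itself):
+#   python -m pytest tests/test_openclip_vqa.py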