Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support Vision Language Model in GenAI-Perf #756

Merged
merged 6 commits into from
Jul 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
185 changes: 172 additions & 13 deletions src/c++/perf_analyzer/genai-perf/genai_perf/llm_inputs/llm_inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,17 @@
from typing import Any, Dict, List, Optional, Tuple, cast

import requests
from genai_perf import utils
from genai_perf.constants import CNN_DAILY_MAIL, DEFAULT_INPUT_DATA_JSON, OPEN_ORCA
from genai_perf.exceptions import GenAIPerfException
from genai_perf.llm_inputs.synthetic_image_generator import (
ImageFormat,
SyntheticImageGenerator,
)
from genai_perf.llm_inputs.synthetic_prompt_generator import SyntheticPromptGenerator
from genai_perf.tokenizer import DEFAULT_TOKENIZER, Tokenizer, get_tokenizer
from genai_perf.utils import load_json_str
from PIL import Image
from requests import Response


Expand All @@ -43,6 +49,7 @@ class OutputFormat(Enum):
OPENAI_CHAT_COMPLETIONS = auto()
OPENAI_COMPLETIONS = auto()
OPENAI_EMBEDDINGS = auto()
OPENAI_VISION = auto()
RANKINGS = auto()
TENSORRTLLM = auto()
VLLM = auto()
Expand Down Expand Up @@ -75,6 +82,11 @@ class LlmInputs:
DEFAULT_OUTPUT_TOKENS_STDDEV = 0
DEFAULT_NUM_PROMPTS = 100

DEFAULT_IMAGE_WIDTH_MEAN = 100
DEFAULT_IMAGE_WIDTH_STDDEV = 0
DEFAULT_IMAGE_HEIGHT_MEAN = 100
DEFAULT_IMAGE_HEIGHT_STDDEV = 0

EMPTY_JSON_IN_VLLM_PA_FORMAT: Dict = {"data": []}
EMPTY_JSON_IN_TENSORRTLLM_PA_FORMAT: Dict = {"data": []}
EMPTY_JSON_IN_OPENAI_PA_FORMAT: Dict = {"data": []}
Expand All @@ -97,6 +109,11 @@ def create_llm_inputs(
output_tokens_deterministic: bool = False,
prompt_tokens_mean: int = DEFAULT_PROMPT_TOKENS_MEAN,
prompt_tokens_stddev: int = DEFAULT_PROMPT_TOKENS_STDDEV,
image_width_mean: int = DEFAULT_IMAGE_WIDTH_MEAN,
image_width_stddev: int = DEFAULT_IMAGE_WIDTH_STDDEV,
image_height_mean: int = DEFAULT_IMAGE_HEIGHT_MEAN,
image_height_stddev: int = DEFAULT_IMAGE_HEIGHT_STDDEV,
image_format: ImageFormat = ImageFormat.PNG,
random_seed: int = DEFAULT_RANDOM_SEED,
num_of_output_prompts: int = DEFAULT_NUM_PROMPTS,
add_model_name: bool = False,
Expand Down Expand Up @@ -139,6 +156,16 @@ def create_llm_inputs(
The standard deviation of the length of the output to generate. This is only used if output_tokens_mean is provided.
output_tokens_deterministic:
If true, the output tokens will set the minimum and maximum tokens to be equivalent.
image_width_mean:
The mean width of images when generating synthetic image data.
image_width_stddev:
The standard deviation of width of images when generating synthetic image data.
image_height_mean:
The mean height of images when generating synthetic image data.
image_height_stddev:
The standard deviation of height of images when generating synthetic image data.
image_format:
The compression format of the images.
batch_size:
The number of inputs per request (currently only used for the embeddings and rankings endpoints)
Expand Down Expand Up @@ -175,6 +202,11 @@ def create_llm_inputs(
prompt_tokens_mean,
prompt_tokens_stddev,
num_of_output_prompts,
image_width_mean,
image_width_stddev,
image_height_mean,
image_height_stddev,
image_format,
batch_size,
input_filename,
)
Expand Down Expand Up @@ -210,6 +242,11 @@ def get_generic_dataset_json(
prompt_tokens_mean: int,
prompt_tokens_stddev: int,
num_of_output_prompts: int,
image_width_mean: int,
image_width_stddev: int,
image_height_mean: int,
image_height_stddev: int,
image_format: ImageFormat,
batch_size: int,
input_filename: Optional[Path],
) -> Dict:
Expand All @@ -236,6 +273,16 @@ def get_generic_dataset_json(
The standard deviation of the length of the prompt to generate
num_of_output_prompts:
The number of synthetic output prompts to generate
image_width_mean:
The mean width of images when generating synthetic image data.
image_width_stddev:
The standard deviation of width of images when generating synthetic image data.
image_height_mean:
The mean height of images when generating synthetic image data.
image_height_stddev:
The standard deviation of height of images when generating synthetic image data.
image_format:
The compression format of the images.
batch_size:
The number of inputs per request (currently only used for the embeddings and rankings endpoints)
input_filename:
Expand Down Expand Up @@ -280,6 +327,12 @@ def get_generic_dataset_json(
)
else:
if input_type == PromptSource.DATASET:
# (TMA-1990) support VLM input from public dataset
if output_format == OutputFormat.OPENAI_VISION:
raise GenAIPerfException(
f"{OutputFormat.OPENAI_VISION.to_lowercase()} currently "
"does not support dataset as input."
)
dataset = cls._get_input_dataset_from_url(
dataset_name, starting_index, length
)
Expand All @@ -292,6 +345,12 @@ def get_generic_dataset_json(
prompt_tokens_mean,
prompt_tokens_stddev,
num_of_output_prompts,
image_width_mean,
image_width_stddev,
image_height_mean,
image_height_stddev,
image_format,
output_format,
)
generic_dataset_json = (
cls._convert_input_synthetic_or_file_dataset_to_generic_json(
Expand All @@ -301,6 +360,9 @@ def get_generic_dataset_json(
elif input_type == PromptSource.FILE:
input_filename = cast(Path, input_filename)
input_file_dataset = cls._get_input_dataset_from_file(input_filename)
input_file_dataset = cls._encode_images_in_input_dataset(
input_file_dataset
)
generic_dataset_json = (
cls._convert_input_synthetic_or_file_dataset_to_generic_json(
input_file_dataset
Expand All @@ -309,6 +371,14 @@ def get_generic_dataset_json(
else:
raise GenAIPerfException("Input source is not recognized.")

# When the generic_dataset_json contains multi-modal data (e.g. images),
# convert the format of the content to OpenAI multi-modal format:
# see https://platform.openai.com/docs/guides/vision
if output_format == OutputFormat.OPENAI_VISION:
generic_dataset_json = cls._convert_to_openai_multi_modal_content(
generic_dataset_json
)

return generic_dataset_json

@classmethod
Expand Down Expand Up @@ -405,17 +475,36 @@ def _get_input_dataset_from_synthetic(
prompt_tokens_mean: int,
prompt_tokens_stddev: int,
num_of_output_prompts: int,
image_width_mean: int,
image_width_stddev: int,
image_height_mean: int,
image_height_stddev: int,
image_format: ImageFormat,
output_format: OutputFormat,
) -> Dict[str, Any]:
dataset_json: Dict[str, Any] = {}
dataset_json["features"] = [{"name": "text_input"}]
dataset_json["rows"] = []
for _ in range(num_of_output_prompts):
row: Dict["str", Any] = {"row": {}}
synthetic_prompt = cls._create_synthetic_prompt(
tokenizer,
prompt_tokens_mean,
prompt_tokens_stddev,
)
dataset_json["rows"].append({"row": {"text_input": synthetic_prompt}})
row["row"]["text_input"] = synthetic_prompt

if output_format == OutputFormat.OPENAI_VISION:
synthetic_image = cls._create_synthetic_image(
image_width_mean=image_width_mean,
image_width_stddev=image_width_stddev,
image_height_mean=image_height_mean,
image_height_stddev=image_height_stddev,
image_format=image_format,
)
row["row"]["image"] = synthetic_image

dataset_json["rows"].append(row)

return dataset_json

Expand Down Expand Up @@ -497,29 +586,37 @@ def _add_rows_to_generic_json(
@classmethod
def _get_input_dataset_from_file(cls, input_filename: Path) -> Dict:
    """
    Reads the input prompts and images from a JSONL file and converts them
    into the required dataset format.

    Parameters
    ----------
    input_filename : Path
        The path to the input file containing the prompts and/or images in
        JSONL format.

    Returns
    -------
    Dict
        The dataset in the required format with the prompts and/or images
        read from the file.

    Raises
    ------
    FileNotFoundError
        If the input file does not exist (via verify_file).
    """
    cls.verify_file(input_filename)
    # prompts and images are parallel lists; an entry with no image
    # contributes an empty string to `images`.
    prompts, images = cls._get_prompts_from_input_file(input_filename)
    dataset_json: Dict[str, Any] = {}
    dataset_json["features"] = [{"name": "text_input"}]
    dataset_json["rows"] = []
    for prompt, image in zip(prompts, images):
        # Only attach an "image" key when an image was actually provided,
        # so text-only entries keep the plain text-input schema.
        content: Dict[str, Any] = {"text_input": prompt}
        if image:
            content["image"] = image
        dataset_json["rows"].append({"row": content})

    return dataset_json

@classmethod
def _get_prompts_from_input_file(
    cls, input_filename: Path
) -> Tuple[List[str], List[str]]:
    """
    Reads the input prompts and images from a JSONL file.

    Parameters
    ----------
    input_filename : Path
        The path to the input file containing the prompts (and optionally
        images) in JSONL format.

    Returns
    -------
    Tuple[List[str], List[str]]
        A list of prompts and a parallel list of images read from the file.
        Lines without an "image" field yield an empty string in the image
        list, keeping the two lists the same length.
    """
    prompts = []
    images = []
    with open(input_filename, mode="r", newline=None) as file:
        for line in file:
            # Skip blank lines so trailing newlines don't produce empty rows.
            if line.strip():
                # Parse each JSONL line once and pull both fields from it
                # (the previous code parsed the same line twice).
                data = load_json_str(line)
                prompts.append(data.get("text_input", "").strip())
                images.append(data.get("image", "").strip())
    return prompts, images

@classmethod
def verify_file(cls, input_filename: Path) -> None:
    """Raise FileNotFoundError unless *input_filename* refers to an existing path."""
    if input_filename.exists():
        return
    raise FileNotFoundError(f"The file '{input_filename}' does not exist.")

@classmethod
def _convert_to_openai_multi_modal_content(
cls, generic_dataset_json: Dict[str, List[Dict]]
) -> Dict[str, List[Dict]]:
"""
Converts to multi-modal content format of OpenAI Chat Completions API.
"""
for row in generic_dataset_json["rows"]:
if row["image"]:
row["text_input"] = [
{
"type": "text",
"text": row["text_input"],
},
{
"type": "image_url",
"image_url": {"url": row["image"]},
},
]

return generic_dataset_json

@classmethod
def _encode_images_in_input_dataset(cls, input_file_dataset: Dict) -> Dict:
    """
    Replaces each image file path in the dataset with a base64-encoded
    data URI (e.g. "data:image/png;base64,...") suitable for the OpenAI
    vision API. Rows without an "image" entry are left unchanged.

    Returns the same (mutated) input_file_dataset object.

    Raises
    ------
    GenAIPerfException
        If an image's format is not one of the supported ImageFormat values.
    """
    for row in input_file_dataset["rows"]:
        filename = row["row"].get("image")
        if filename:
            img = Image.open(filename)
            if img.format.lower() not in utils.get_enum_names(ImageFormat):
                # Include the offending file's path in the message so the
                # user can find the bad input (was a garbled placeholder).
                raise GenAIPerfException(
                    f"Unsupported image format '{img.format}' of "
                    f"the image '{filename}'."
                )

            img_base64 = utils.encode_image(img, img.format)
            payload = f"data:image/{img.format.lower()};base64,{img_base64}"
            row["row"]["image"] = payload

    return input_file_dataset

@classmethod
def _convert_generic_json_to_output_format(
cls,
Expand All @@ -559,7 +698,10 @@ def _convert_generic_json_to_output_format(
model_name: list = [],
model_selection_strategy: ModelSelectionStrategy = ModelSelectionStrategy.ROUND_ROBIN,
) -> Dict:
if output_format == OutputFormat.OPENAI_CHAT_COMPLETIONS:
if (
output_format == OutputFormat.OPENAI_CHAT_COMPLETIONS
or output_format == OutputFormat.OPENAI_VISION
):
output_json = cls._convert_generic_json_to_openai_chat_completions_format(
generic_dataset,
add_model_name,
Expand Down Expand Up @@ -1424,3 +1566,20 @@ def _create_synthetic_prompt(
return SyntheticPromptGenerator.create_synthetic_prompt(
tokenizer, prompt_tokens_mean, prompt_tokens_stddev
)

@classmethod
def _create_synthetic_image(
    cls,
    image_width_mean: int,
    image_width_stddev: int,
    image_height_mean: int,
    image_height_stddev: int,
    image_format: ImageFormat,
) -> str:
    """
    Generate a single synthetic image.

    Thin wrapper that forwards the width/height sampling parameters and the
    compression format to SyntheticImageGenerator.create_synthetic_image
    and returns its result unchanged.
    """
    generator_kwargs = {
        "image_width_mean": image_width_mean,
        "image_width_stddev": image_width_stddev,
        "image_height_mean": image_height_mean,
        "image_height_stddev": image_height_stddev,
        "image_format": image_format,
    }
    return SyntheticImageGenerator.create_synthetic_image(**generator_kwargs)
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Loading