diff --git a/.gitignore b/.gitignore index ceef6a5fba456..bb7e4d5b244a8 100644 --- a/.gitignore +++ b/.gitignore @@ -81,6 +81,8 @@ instance/ docs/_build/ docs/source/getting_started/examples/*.rst !**/*.template.rst +docs/source/getting_started/examples/*.md +!**/*.template.md # PyBuilder .pybuilder/ diff --git a/Dockerfile b/Dockerfile index 6226569e9d3b4..153bff9cf565f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ # to run the OpenAI compatible server. # Please update any changes made here to -# docs/source/dev/dockerfile/dockerfile.rst and +# docs/source/contributing/dockerfile/dockerfile.md and # docs/source/assets/dev/dockerfile-stages-dependency.png ARG CUDA_VERSION=12.4.1 diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index ca2da4cd66d2d..4859c8ac08bea 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -1,7 +1,7 @@ sphinx==6.2.1 sphinx-book-theme==1.0.1 sphinx-copybutton==0.5.2 -myst-parser==2.0.0 +myst-parser==3.0.1 sphinx-argparse==0.4.0 msgspec cloudpickle diff --git a/docs/source/automatic_prefix_caching/apc.md b/docs/source/automatic_prefix_caching/apc.md new file mode 100644 index 0000000000000..c0c141c5fb7ef --- /dev/null +++ b/docs/source/automatic_prefix_caching/apc.md @@ -0,0 +1,102 @@ +(apc)= + +# Introduction + +## What is Automatic Prefix Caching + +Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part. + +```{note} +Technical details on how vLLM implements APC are in the next page. +``` + +## Enabling APC in vLLM + +Set `enable_prefix_caching=True` in vLLM engine to enable APC. Here is an example: + +```python +import time +from vllm import LLM, SamplingParams + + +# A prompt containing a large markdown table. The table is randomly generated by GPT-4. 
+LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """ +| ID | Name | Age | Occupation | Country | Email | Phone Number | Address | +|-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------| +| 1 | John Doe | 29 | Engineer | USA | john.doe@example.com | 555-1234 | 123 Elm St, Springfield, IL | +| 2 | Jane Smith | 34 | Doctor | Canada | jane.smith@example.com | 555-5678 | 456 Oak St, Toronto, ON | +| 3 | Alice Johnson | 27 | Teacher | UK | alice.j@example.com | 555-8765 | 789 Pine St, London, UK | +| 4 | Bob Brown | 45 | Artist | Australia | bob.b@example.com | 555-4321 | 321 Maple St, Sydney, NSW | +| 5 | Carol White | 31 | Scientist | New Zealand | carol.w@example.com | 555-6789 | 654 Birch St, Wellington, NZ | +| 6 | Dave Green | 28 | Lawyer | Ireland | dave.g@example.com | 555-3456 | 987 Cedar St, Dublin, IE | +| 7 | Emma Black | 40 | Musician | USA | emma.b@example.com | 555-1111 | 246 Ash St, New York, NY | +| 8 | Frank Blue | 37 | Chef | Canada | frank.b@example.com | 555-2222 | 135 Spruce St, Vancouver, BC | +| 9 | Grace Yellow | 50 | Engineer | UK | grace.y@example.com | 555-3333 | 864 Fir St, Manchester, UK | +| 10 | Henry Violet | 32 | Artist | Australia | henry.v@example.com | 555-4444 | 753 Willow St, Melbourne, VIC| +| 11 | Irene Orange | 26 | Scientist | New Zealand | irene.o@example.com | 555-5555 | 912 Poplar St, Auckland, NZ | +| 12 | Jack Indigo | 38 | Teacher | Ireland | jack.i@example.com | 555-6666 | 159 Elm St, Cork, IE | +| 13 | Karen Red | 41 | Lawyer | USA | karen.r@example.com | 555-7777 | 357 Cedar St, Boston, MA | +| 14 | Leo Brown | 30 | Chef | Canada | leo.b@example.com | 555-8888 | 246 Oak St, Calgary, AB | +| 15 | Mia Green | 33 | Musician | UK | mia.g@example.com | 555-9999 | 975 Pine St, Edinburgh, UK | +| 16 | Noah Yellow | 29 | Doctor | Australia | 
noah.y@example.com | 555-0000 | 864 Birch St, Brisbane, QLD | +| 17 | Olivia Blue | 35 | Engineer | New Zealand | olivia.b@example.com | 555-1212 | 753 Maple St, Hamilton, NZ | +| 18 | Peter Black | 42 | Artist | Ireland | peter.b@example.com | 555-3434 | 912 Fir St, Limerick, IE | +| 19 | Quinn White | 28 | Scientist | USA | quinn.w@example.com | 555-5656 | 159 Willow St, Seattle, WA | +| 20 | Rachel Red | 31 | Teacher | Canada | rachel.r@example.com | 555-7878 | 357 Poplar St, Ottawa, ON | +| 21 | Steve Green | 44 | Lawyer | UK | steve.g@example.com | 555-9090 | 753 Elm St, Birmingham, UK | +| 22 | Tina Blue | 36 | Musician | Australia | tina.b@example.com | 555-1213 | 864 Cedar St, Perth, WA | +| 23 | Umar Black | 39 | Chef | New Zealand | umar.b@example.com | 555-3435 | 975 Spruce St, Christchurch, NZ| +| 24 | Victor Yellow | 43 | Engineer | Ireland | victor.y@example.com | 555-5657 | 246 Willow St, Galway, IE | +| 25 | Wendy Orange | 27 | Artist | USA | wendy.o@example.com | 555-7879 | 135 Elm St, Denver, CO | +| 26 | Xavier Green | 34 | Scientist | Canada | xavier.g@example.com | 555-9091 | 357 Oak St, Montreal, QC | +| 27 | Yara Red | 41 | Teacher | UK | yara.r@example.com | 555-1214 | 975 Pine St, Leeds, UK | +| 28 | Zack Blue | 30 | Lawyer | Australia | zack.b@example.com | 555-3436 | 135 Birch St, Adelaide, SA | +| 29 | Amy White | 33 | Musician | New Zealand | amy.w@example.com | 555-5658 | 159 Maple St, Wellington, NZ | +| 30 | Ben Black | 38 | Chef | Ireland | ben.b@example.com | 555-7870 | 246 Fir St, Waterford, IE | +""" + + +def get_generation_time(llm, sampling_params, prompts): + # time the generation + start_time = time.time() + output = llm.generate(prompts, sampling_params=sampling_params) + end_time = time.time() + # print the output and generation time + print(f"Output: {output[0].outputs[0].text}") + print(f"Generation time: {end_time - start_time} seconds.") + + +# set enable_prefix_caching=True to enable APC +llm = LLM( + 
model='lmsys/longchat-13b-16k', + enable_prefix_caching=True +) + +sampling_params = SamplingParams(temperature=0, max_tokens=100) + +# Querying the age of John Doe +get_generation_time( + llm, + sampling_params, + LONG_PROMPT + "Question: what is the age of John Doe? Your answer: The age of John Doe is ", +) + +# Querying the age of Zack Blue +# This query will be faster since vllm avoids computing the KV cache of LONG_PROMPT again. +get_generation_time( + llm, + sampling_params, + LONG_PROMPT + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ", +) +``` + +## Example workloads + +We describe two example workloads, where APC can provide huge performance benefit: + +- Long document query, where the user repeatedly queries the same long document (e.g. software manual or annual report) with different queries. In this case, instead of processing the long document again and again, APC allows vLLM to process this long document *only once*, and all future requests can avoid recomputing this long document by reusing its KV cache. This allows vLLM to serve future requests with much higher throughput and much lower latency. +- Multi-round conversation, where the user may chat with the application multiple times in the same chatting session. In this case, instead of processing the whole chatting history again and again, APC allows vLLM to reuse the processing results of the chat history across all future rounds of conversation, allowing vLLM to serve future requests with much higher throughput and much lower latency. + +## Limits + +APC in general does not reduce the performance of vLLM. With that being said, APC only reduces the time of processing the queries (the prefilling phase) and does not reduce the time of generating new tokens (the decoding phase). So APC does not bring performance gain when vLLM spends most of the time generating answers to the queries (e.g. 
when the length of the answer is long), or new queries do not share the same prefix with any of existing queries (so that the computation cannot be reused). diff --git a/docs/source/automatic_prefix_caching/apc.rst b/docs/source/automatic_prefix_caching/apc.rst deleted file mode 100644 index 0d70c74689bf9..0000000000000 --- a/docs/source/automatic_prefix_caching/apc.rst +++ /dev/null @@ -1,110 +0,0 @@ -.. _apc: - -Introduction -============ - -What is Automatic Prefix Caching --------------------------------- - -Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part. - - -.. note:: - - Technical details on how vLLM implements APC are in the next page. - - - -Enabling APC in vLLM --------------------- - -Set ``enable_prefix_caching=True`` in vLLM engine to enable APC. Here is an example: - -.. code-block:: python - - import time - from vllm import LLM, SamplingParams - - - # A prompt containing a large markdown table. The table is randomly generated by GPT-4. - LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. 
Here is a table as follows.\n# Table\n" + """ - | ID | Name | Age | Occupation | Country | Email | Phone Number | Address | - |-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------| - | 1 | John Doe | 29 | Engineer | USA | john.doe@example.com | 555-1234 | 123 Elm St, Springfield, IL | - | 2 | Jane Smith | 34 | Doctor | Canada | jane.smith@example.com | 555-5678 | 456 Oak St, Toronto, ON | - | 3 | Alice Johnson | 27 | Teacher | UK | alice.j@example.com | 555-8765 | 789 Pine St, London, UK | - | 4 | Bob Brown | 45 | Artist | Australia | bob.b@example.com | 555-4321 | 321 Maple St, Sydney, NSW | - | 5 | Carol White | 31 | Scientist | New Zealand | carol.w@example.com | 555-6789 | 654 Birch St, Wellington, NZ | - | 6 | Dave Green | 28 | Lawyer | Ireland | dave.g@example.com | 555-3456 | 987 Cedar St, Dublin, IE | - | 7 | Emma Black | 40 | Musician | USA | emma.b@example.com | 555-1111 | 246 Ash St, New York, NY | - | 8 | Frank Blue | 37 | Chef | Canada | frank.b@example.com | 555-2222 | 135 Spruce St, Vancouver, BC | - | 9 | Grace Yellow | 50 | Engineer | UK | grace.y@example.com | 555-3333 | 864 Fir St, Manchester, UK | - | 10 | Henry Violet | 32 | Artist | Australia | henry.v@example.com | 555-4444 | 753 Willow St, Melbourne, VIC| - | 11 | Irene Orange | 26 | Scientist | New Zealand | irene.o@example.com | 555-5555 | 912 Poplar St, Auckland, NZ | - | 12 | Jack Indigo | 38 | Teacher | Ireland | jack.i@example.com | 555-6666 | 159 Elm St, Cork, IE | - | 13 | Karen Red | 41 | Lawyer | USA | karen.r@example.com | 555-7777 | 357 Cedar St, Boston, MA | - | 14 | Leo Brown | 30 | Chef | Canada | leo.b@example.com | 555-8888 | 246 Oak St, Calgary, AB | - | 15 | Mia Green | 33 | Musician | UK | mia.g@example.com | 555-9999 | 975 Pine St, Edinburgh, UK | - | 16 | Noah Yellow | 29 | Doctor | Australia | noah.y@example.com | 555-0000 | 864 Birch St, Brisbane, QLD | - | 17 | Olivia Blue | 35 | Engineer 
| New Zealand | olivia.b@example.com | 555-1212 | 753 Maple St, Hamilton, NZ | - | 18 | Peter Black | 42 | Artist | Ireland | peter.b@example.com | 555-3434 | 912 Fir St, Limerick, IE | - | 19 | Quinn White | 28 | Scientist | USA | quinn.w@example.com | 555-5656 | 159 Willow St, Seattle, WA | - | 20 | Rachel Red | 31 | Teacher | Canada | rachel.r@example.com | 555-7878 | 357 Poplar St, Ottawa, ON | - | 21 | Steve Green | 44 | Lawyer | UK | steve.g@example.com | 555-9090 | 753 Elm St, Birmingham, UK | - | 22 | Tina Blue | 36 | Musician | Australia | tina.b@example.com | 555-1213 | 864 Cedar St, Perth, WA | - | 23 | Umar Black | 39 | Chef | New Zealand | umar.b@example.com | 555-3435 | 975 Spruce St, Christchurch, NZ| - | 24 | Victor Yellow | 43 | Engineer | Ireland | victor.y@example.com | 555-5657 | 246 Willow St, Galway, IE | - | 25 | Wendy Orange | 27 | Artist | USA | wendy.o@example.com | 555-7879 | 135 Elm St, Denver, CO | - | 26 | Xavier Green | 34 | Scientist | Canada | xavier.g@example.com | 555-9091 | 357 Oak St, Montreal, QC | - | 27 | Yara Red | 41 | Teacher | UK | yara.r@example.com | 555-1214 | 975 Pine St, Leeds, UK | - | 28 | Zack Blue | 30 | Lawyer | Australia | zack.b@example.com | 555-3436 | 135 Birch St, Adelaide, SA | - | 29 | Amy White | 33 | Musician | New Zealand | amy.w@example.com | 555-5658 | 159 Maple St, Wellington, NZ | - | 30 | Ben Black | 38 | Chef | Ireland | ben.b@example.com | 555-7870 | 246 Fir St, Waterford, IE | - """ - - - def get_generation_time(llm, sampling_params, prompts): - # time the generation - start_time = time.time() - output = llm.generate(prompts, sampling_params=sampling_params) - end_time = time.time() - # print the output and generation time - print(f"Output: {output[0].outputs[0].text}") - print(f"Generation time: {end_time - start_time} seconds.") - - - # set enable_prefix_caching=True to enable APC - llm = LLM( - model='lmsys/longchat-13b-16k', - enable_prefix_caching=True - ) - - sampling_params = 
SamplingParams(temperature=0, max_tokens=100) - - # Querying the age of John Doe - get_generation_time( - llm, - sampling_params, - LONG_PROMPT + "Question: what is the age of John Doe? Your answer: The age of John Doe is ", - ) - - # Querying the age of Zack Blue - # This query will be faster since vllm avoids computing the KV cache of LONG_PROMPT again. - get_generation_time( - llm, - sampling_params, - LONG_PROMPT + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ", - ) - -Example workloads ------------------ - -We describe two example workloads, where APC can provide huge performance benefit: - -- Long document query, where the user repeatedly queries the same long document (e.g. software manual or annual report) with different queries. In this case, instead of processing the long document again and again, APC allows vLLM to process this long document *only once*, and all future requests can avoid recomputing this long document by reusing its KV cache. This allows vLLM to serve future requests with much higher throughput and much lower latency. -- Multi-round conversation, where the user may chat with the application multiple times in the same chatting session. In this case, instead of processing the whole chatting history again and again, APC allows vLLM to reuse the processing results of the chat history across all future rounds of conversation, allowing vLLM to serve future requests with much higher throughput and much lower latency. - - -Limits ------- -APC in general does not reduce the performance of vLLM. With that being said, APC only reduces the time of processing the queries (the prefilling phase) and does not reduce the time of generating new tokens (the decoding phase). So APC does not bring performance gain when vLLM spends most of the time generating answers to the queries (e.g. 
when the length of the answer is long), or new queries do not share the same prefix with any of existing queries (so that the computation cannot be reused). diff --git a/docs/source/community/meetups.md b/docs/source/community/meetups.md new file mode 100644 index 0000000000000..43fa9ee616096 --- /dev/null +++ b/docs/source/community/meetups.md @@ -0,0 +1,15 @@ +(meetups)= + +# vLLM Meetups + +We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: + +- [The seventh vLLM meetup](https://lu.ma/h0qvrajz), with Snowflake, November 14th 2024. [[Slides]](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing) +- [The sixth vLLM meetup](https://lu.ma/87q3nvnh), with NVIDIA, September 9th 2024. [[Slides]](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing) +- [The fifth vLLM meetup](https://lu.ma/lp0gyjqr), with AWS, July 24th 2024. [[Slides]](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing) +- [The fourth vLLM meetup](https://lu.ma/agivllm), with Cloudflare and BentoML, June 11th 2024. [[Slides]](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing) +- [The third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/), with Roblox, April 2nd 2024. [[Slides]](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing) +- [The second vLLM meetup](https://lu.ma/ygxbpzhl), with IBM Research, January 31st 2024. 
[[Slides]](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing) [[Video (vLLM Update)]](https://youtu.be/Y0C-DUvEnZQ) [[Video (IBM Research & torch.compile)]](https://youtu.be/m0dMtFLI-dg) +- [The first vLLM meetup](https://lu.ma/first-vllm-meetup), with a16z, October 5th 2023. [[Slides]](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing) + +We are always looking for speakers and sponsors at San Francisco Bay Area and potentially other locations. If you are interested in speaking or sponsoring, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu). diff --git a/docs/source/community/meetups.rst b/docs/source/community/meetups.rst deleted file mode 100644 index c87f01aa263b3..0000000000000 --- a/docs/source/community/meetups.rst +++ /dev/null @@ -1,16 +0,0 @@ -.. _meetups: - -vLLM Meetups -============ - -We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: - -- `The seventh vLLM meetup <https://lu.ma/h0qvrajz>`__, with Snowflake, November 14th 2024. `[Slides] <https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing>`__ -- `The sixth vLLM meetup <https://lu.ma/87q3nvnh>`__, with NVIDIA, September 9th 2024. `[Slides] <https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing>`__ -- `The fifth vLLM meetup <https://lu.ma/lp0gyjqr>`__, with AWS, July 24th 2024. `[Slides] <https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing>`__ -- `The fourth vLLM meetup <https://lu.ma/agivllm>`__, with Cloudflare and BentoML, June 11th 2024. `[Slides] <https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing>`__ -- `The third vLLM meetup <https://robloxandvllmmeetup2024.splashthat.com/>`__, with Roblox, April 2nd 2024. `[Slides] <https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing>`__ -- `The second vLLM meetup <https://lu.ma/ygxbpzhl>`__, with IBM Research, January 31st 2024. `[Slides] <https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing>`__ `[Video (vLLM Update)] <https://youtu.be/Y0C-DUvEnZQ>`__ `[Video (IBM Research & torch.compile)] <https://youtu.be/m0dMtFLI-dg>`__ -- `The first vLLM meetup <https://lu.ma/first-vllm-meetup>`__, with a16z, October 5th 2023. `[Slides] <https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing>`__ - -We are always looking for speakers and sponsors at San Francisco Bay Area and potentially other locations. 
If you are interested in speaking or sponsoring, please contact us at `vllm-questions@lists.berkeley.edu <mailto:vllm-questions@lists.berkeley.edu>`__. diff --git a/docs/source/conf.py b/docs/source/conf.py index e9d9ac68c9560..6f1d1842fe686 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -51,7 +51,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns: List[str] = ["**/*.template.rst"] +exclude_patterns: List[str] = ["**/*.template.md"] # Exclude the prompt "$" when copying code copybutton_prompt_text = r"\$ " diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/source/contributing/dockerfile/dockerfile.md new file mode 100644 index 0000000000000..d72b99fe017b6 --- /dev/null +++ b/docs/source/contributing/dockerfile/dockerfile.md @@ -0,0 +1,50 @@ +# Dockerfile + +See [here](https://github.com/vllm-project/vllm/blob/main/Dockerfile) for the main Dockerfile to construct +the image for running an OpenAI compatible server with vLLM. More information about deploying with Docker can be found [here](https://docs.vllm.ai/en/stable/serving/deploying_with_docker.html). + +Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes: + +- All build stages +- The default build target (highlighted in grey) +- External images (with dashed borders) + +The edges of the build graph represent: + +- FROM ... dependencies (with a solid line and a full arrow head) + +- COPY --from=... dependencies (with a dashed line and an empty arrow head) + +- RUN --mount=(.\*)from=... 
dependencies (with a dotted line and an empty diamond arrow head) + + > ```{figure} ../../assets/dev/dockerfile-stages-dependency.png + > :align: center + > :alt: query + > :width: 100% + > ``` + > + > Made using: <https://github.com/patrickhoefler/dockerfilegraph> + > + > Commands to regenerate the build graph (make sure to run it **from the \`root\` directory of the vLLM repository** where the dockerfile is present): + > + > ```bash + > dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename Dockerfile + > ``` + > + > or in case you want to run it directly with the docker image: + > + > ```bash + > docker run \ + > --rm \ + > --user "$(id -u):$(id -g)" \ + > --workdir /workspace \ + > --volume "$(pwd)":/workspace \ + > ghcr.io/patrickhoefler/dockerfilegraph:alpine \ + > --output png \ + > --dpi 200 \ + > --max-label-length 50 \ + > --filename Dockerfile \ + > --legend + > ``` + > + > (To run it for a different file, you can pass in a different argument to the flag `--filename`.) diff --git a/docs/source/contributing/dockerfile/dockerfile.rst b/docs/source/contributing/dockerfile/dockerfile.rst deleted file mode 100644 index 9c17c27aa61bf..0000000000000 --- a/docs/source/contributing/dockerfile/dockerfile.rst +++ /dev/null @@ -1,50 +0,0 @@ -Dockerfile -==================== - -See `here <https://github.com/vllm-project/vllm/blob/main/Dockerfile>`__ for the main Dockerfile to construct -the image for running an OpenAI compatible server with vLLM. More information about deploying with Docker can be found `here <https://docs.vllm.ai/en/stable/serving/deploying_with_docker.html>`__. - -Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes: - -- All build stages -- The default build target (highlighted in grey) -- External images (with dashed borders) - -The edges of the build graph represent: - -- FROM ... dependencies (with a solid line and a full arrow head) -- COPY --from=... dependencies (with a dashed line and an empty arrow head) -- RUN --mount=(.*)from=... dependencies (with a dotted line and an empty diamond arrow head) - - .. 
figure:: ../../assets/dev/dockerfile-stages-dependency.png - :alt: query - :width: 100% - :align: center - - Made using: https://github.com/patrickhoefler/dockerfilegraph - - Commands to regenerate the build graph (make sure to run it **from the `root` directory of the vLLM repository** where the dockerfile is present): - - .. code:: bash - - dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename Dockerfile - - or in case you want to run it directly with the docker image: - - .. code:: bash - - docker run \ - --rm \ - --user "$(id -u):$(id -g)" \ - --workdir /workspace \ - --volume "$(pwd)":/workspace \ - ghcr.io/patrickhoefler/dockerfilegraph:alpine \ - --output png \ - --dpi 200 \ - --max-label-length 50 \ - --filename Dockerfile \ - --legend - - (To run it for a different file, you can pass in a different argument to the flag `--filename`.) - - \ No newline at end of file diff --git a/docs/source/contributing/overview.rst b/docs/source/contributing/overview.md similarity index 51% rename from docs/source/contributing/overview.rst rename to docs/source/contributing/overview.md index 4cea0afdaea74..53e8e78f08e72 100644 --- a/docs/source/contributing/overview.rst +++ b/docs/source/contributing/overview.md @@ -1,5 +1,4 @@ -Contributing to vLLM -===================== +# Contributing to vLLM Thank you for your interest in contributing to vLLM! Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. There are several ways you can contribute to the project: @@ -12,132 +11,121 @@ We also believe in the power of community support; thus, answering queries, offe Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository! -License -------- +## License -See `LICENSE `_. 
+See [LICENSE](https://github.com/vllm-project/vllm/tree/main/LICENSE). -Developing ----------- +## Developing -Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the `building from source `_ documentation for details. +Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the [building from source](https://docs.vllm.ai/en/latest/getting_started/installation.html#build-from-source) documentation for details. -Testing -------- +## Testing -.. code-block:: bash +```bash +pip install -r requirements-dev.txt - pip install -r requirements-dev.txt +# linting and formatting +bash format.sh +# Static type checking +mypy +# Unit tests +pytest tests/ +``` - # linting and formatting - bash format.sh - # Static type checking - mypy - # Unit tests - pytest tests/ +```{note} +Currently, the repository does not pass the `mypy` tests. +``` -.. note:: Currently, the repository does not pass the ``mypy`` tests. +# Contribution Guidelines -Contribution Guidelines -======================= +## Issues -Issues ------- +If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. -If you encounter a bug or have a feature request, please `search existing issues `_ first to see if it has already been reported. If not, please `file a new issue `_, providing as much relevant information as possible. +```{important} +If you discover a security vulnerability, please follow the instructions [here](https://github.com/vllm-project/vllm/tree/main/SECURITY.md#reporting-a-vulnerability). +``` -.. 
important:: - If you discover a security vulnerability, please follow the instructions `here `_. - -Pull Requests & Code Reviews ----------------------------- +## Pull Requests & Code Reviews Thank you for your contribution to vLLM! Before submitting the pull request, please ensure the PR meets the following criteria. This helps vLLM maintain the code quality and improve the efficiency of the review process. -DCO and Signed-off-by -^^^^^^^^^^^^^^^^^^^^^ +### DCO and Signed-off-by -When contributing changes to this project, you must agree to the `DCO `_. -Commits must include a ``Signed-off-by:`` header which certifies agreement with -the terms of the `DCO `_. +When contributing changes to this project, you must agree to the [DCO](https://github.com/vllm-project/vllm/tree/main/DCO). +Commits must include a `Signed-off-by:` header which certifies agreement with +the terms of the [DCO](https://github.com/vllm-project/vllm/tree/main/DCO). -Using ``-s`` with ``git commit`` will automatically add this header. +Using `-s` with `git commit` will automatically add this header. -PR Title and Classification -^^^^^^^^^^^^^^^^^^^^^^^^^^^ +### PR Title and Classification Only specific types of PRs will be reviewed. The PR title is prefixed appropriately to indicate the type of change. Please use one of the following: -- ``[Bugfix]`` for bug fixes. -- ``[CI/Build]`` for build or continuous integration improvements. -- ``[Doc]`` for documentation fixes and improvements. -- ``[Model]`` for adding a new model or improving an existing model. Model name +- `[Bugfix]` for bug fixes. +- `[CI/Build]` for build or continuous integration improvements. +- `[Doc]` for documentation fixes and improvements. +- `[Model]` for adding a new model or improving an existing model. Model name should appear in the title. -- ``[Frontend]`` For changes on the vLLM frontend (e.g., OpenAI API server, - ``LLM`` class, etc.) -- ``[Kernel]`` for changes affecting CUDA kernels or other compute kernels. 
-- ``[Core]`` for changes in the core vLLM logic (e.g., ``LLMEngine``, - ``AsyncLLMEngine``, ``Scheduler``, etc.) -- ``[Hardware][Vendor]`` for hardware-specific changes. Vendor name should - appear in the prefix (e.g., ``[Hardware][AMD]``). -- ``[Misc]`` for PRs that do not fit the above categories. Please use this +- `[Frontend]` For changes on the vLLM frontend (e.g., OpenAI API server, + `LLM` class, etc.) +- `[Kernel]` for changes affecting CUDA kernels or other compute kernels. +- `[Core]` for changes in the core vLLM logic (e.g., `LLMEngine`, + `AsyncLLMEngine`, `Scheduler`, etc.) +- `[Hardware][Vendor]` for hardware-specific changes. Vendor name should + appear in the prefix (e.g., `[Hardware][AMD]`). +- `[Misc]` for PRs that do not fit the above categories. Please use this sparingly. -.. note:: - If the PR spans more than one category, please include all relevant prefixes. +```{note} +If the PR spans more than one category, please include all relevant prefixes. +``` -Code Quality -^^^^^^^^^^^^ +### Code Quality The PR needs to meet the following code quality standards: -- We adhere to `Google Python style guide - `_ and `Google C++ style guide - `_. -- Pass all linter checks. Please use `format.sh - `_ to format your +- We adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html). +- Pass all linter checks. Please use [format.sh](https://github.com/vllm-project/vllm/blob/main/format.sh) to format your code. - The code needs to be well-documented to ensure future contributors can easily understand the code. - Include sufficient tests to ensure the project stays correct and robust. This includes both unit tests and integration tests. -- Please add documentation to ``docs/source/`` if the PR modifies the +- Please add documentation to `docs/source/` if the PR modifies the user-facing behaviors of vLLM. 
It helps vLLM users understand and utilize the new features or changes. -Adding or Changing Kernels -^^^^^^^^^^^^^^^^^^^^^^^^^^ +### Adding or Changing Kernels Each custom kernel needs a schema and one or more implementations to be registered with PyTorch. - Make sure custom ops are registered following PyTorch guidelines: - `Custom C++ and CUDA Operators `_ - and `The Custom Operators Manual `_. -- Custom operations that return ``Tensors`` require meta-functions. + [Custom C++ and CUDA Operators](https://pytorch.org/tutorials/advanced/cpp_custom_ops.html#cpp-custom-ops-tutorial) + and [The Custom Operators Manual](https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU). +- Custom operations that return `Tensors` require meta-functions. Meta-functions should be implemented and registered in Python so that dynamic dims can be handled automatically. See above documents for a description of meta-functions. -- Use `torch.library.opcheck() `_ +- Use [torch.library.opcheck()](https://pytorch.org/docs/stable/library.html#torch.library.opcheck) to test the function registration and meta-function for any registered ops. - See ``tests/kernels`` for examples. + See `tests/kernels` for examples. - When changing the C++ signature of an existing op, the schema must be updated to reflect the changes. - If a new custom type is needed, see the following document: - `Custom Class Support in PT2 `_. + [Custom Class Support in PT2](https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA). -Notes for Large Changes -^^^^^^^^^^^^^^^^^^^^^^^ +### Notes for Large Changes Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag -it with ``rfc-required`` and might not go through the PR. +it with `rfc-required` and might not go through the PR. 
-What to Expect for the Reviews -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +### What to Expect for the Reviews The goal of the vLLM team is to be a *transparent reviewing machine*. We would like to make the review process transparent and efficient and make sure no @@ -150,15 +138,14 @@ review process: - After the PR is assigned, the reviewer will provide status updates every 2-3 days. If the PR is not reviewed within 7 days, please feel free to ping the reviewer or the vLLM team. -- After the review, the reviewer will put an ``action-required`` label on the PR +- After the review, the reviewer will put an `action-required` label on the PR if there are changes required. The contributor should address the comments and ping the reviewer to re-review the PR. - Please respond to all comments within a reasonable time frame. If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion. -Thank You ---------- +## Thank You Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. All of your contributions help make vLLM a great tool and community for everyone! diff --git a/docs/source/contributing/profiling/profiling_index.md b/docs/source/contributing/profiling/profiling_index.md new file mode 100644 index 0000000000000..04e01da556231 --- /dev/null +++ b/docs/source/contributing/profiling/profiling_index.md @@ -0,0 +1,41 @@ +# Profiling vLLM + +We support tracing vLLM workers using the `torch.profiler` module. You can enable tracing by setting the `VLLM_TORCH_PROFILER_DIR` environment variable to the directory where you want to save the traces: `VLLM_TORCH_PROFILER_DIR=/mnt/traces/` + +The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set. + +When using `benchmarks/benchmark_serving.py`, you can enable profiling by passing the `--profile` flag. + +```{warning} +Only enable profiling in a development environment. 
+``` + +Traces can be visualized using <https://ui.perfetto.dev/>. + +```{tip} +Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly. +``` + +```{tip} +To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100. +Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes. +`export VLLM_RPC_TIMEOUT=1800000` +``` + +## Example commands and usage: + +### Offline Inference: + +Refer to [examples/offline_inference_with_profiler.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_with_profiler.py) for an example. + +### OpenAI Server: + +```bash +VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B +``` + +benchmark_serving.py: + +```bash +python benchmarks/benchmark_serving.py --backend vllm --model meta-llama/Meta-Llama-3-70B --dataset-name sharegpt --dataset-path sharegpt.json --profile --num-prompts 2 +``` diff --git a/docs/source/contributing/profiling/profiling_index.rst b/docs/source/contributing/profiling/profiling_index.rst deleted file mode 100644 index a422b1fcda521..0000000000000 --- a/docs/source/contributing/profiling/profiling_index.rst +++ /dev/null @@ -1,48 +0,0 @@ -============== -Profiling vLLM -============== - -We support tracing vLLM workers using the ``torch.profiler`` module. You can enable tracing by setting the ``VLLM_TORCH_PROFILER_DIR`` environment variable to the directory where you want to save the traces: ``VLLM_TORCH_PROFILER_DIR=/mnt/traces/`` - -The OpenAI server also needs to be started with the ``VLLM_TORCH_PROFILER_DIR`` environment variable set. - -When using ``benchmarks/benchmark_serving.py``, you can enable profiling by passing the ``--profile`` flag. - -..
warning:: - - Only enable profiling in a development environment. - - -Traces can be visualized using https://ui.perfetto.dev/. - -.. tip:: - - Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly. - -.. tip:: - - To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100. - Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes. - ``export VLLM_RPC_TIMEOUT=1800000`` - -Example commands and usage: -=========================== - -Offline Inference: ------------------- - -Refer to `examples/offline_inference_with_profiler.py `_ for an example. - - -OpenAI Server: --------------- - -.. code-block:: bash - - VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B - -benchmark_serving.py: - -.. code-block:: bash - - python benchmarks/benchmark_serving.py --backend vllm --model meta-llama/Meta-Llama-3-70B --dataset-name sharegpt --dataset-path sharegpt.json --profile --num-prompts 2 \ No newline at end of file diff --git a/docs/source/design/arch_overview.rst b/docs/source/design/arch_overview.md similarity index 54% rename from docs/source/design/arch_overview.rst rename to docs/source/design/arch_overview.md index bc3f509f0a66e..511bee20a91f4 100644 --- a/docs/source/design/arch_overview.rst +++ b/docs/source/design/arch_overview.md @@ -1,25 +1,24 @@ -.. _arch_overview: +(arch-overview)= -Architecture Overview -====================== +# Architecture Overview This document provides an overview of the vLLM architecture. -.. 
contents:: Table of Contents - :local: - :depth: 2 +```{contents} Table of Contents +:depth: 2 +:local: true +``` -Entrypoints ------------ +## Entrypoints vLLM provides a number of entrypoints for interacting with the system. The following diagram shows the relationship between them. -.. image:: /assets/design/arch_overview/entrypoints.excalidraw.png - :alt: Entrypoints Diagram +```{image} /assets/design/arch_overview/entrypoints.excalidraw.png +:alt: Entrypoints Diagram +``` -LLM Class -^^^^^^^^^ +### LLM Class The LLM class provides the primary Python interface for doing offline inference, which is interacting with a model without using a separate model inference @@ -27,75 +26,70 @@ server. Here is a sample of `LLM` class usage: -.. code-block:: python +```python +from vllm import LLM, SamplingParams - from vllm import LLM, SamplingParams +# Define a list of input prompts +prompts = [ + "Hello, my name is", + "The capital of France is", + "The largest ocean is", +] - # Define a list of input prompts - prompts = [ - "Hello, my name is", - "The capital of France is", - "The largest ocean is", - ] +# Define sampling parameters +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - # Define sampling parameters - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) +# Initialize the LLM engine with the OPT-125M model +llm = LLM(model="facebook/opt-125m") - # Initialize the LLM engine with the OPT-125M model - llm = LLM(model="facebook/opt-125m") +# Generate outputs for the input prompts +outputs = llm.generate(prompts, sampling_params) - # Generate outputs for the input prompts - outputs = llm.generate(prompts, sampling_params) +# Print the generated outputs +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` - # Print the generated outputs - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - 
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -More API details can be found in the :doc:`Offline Inference +More API details can be found in the {doc}`Offline Inference ` section of the API docs. -The code for the `LLM` class can be found in `vllm/entrypoints/llm.py -`_. +The code for the `LLM` class can be found in [vllm/entrypoints/llm.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py). -OpenAI-compatible API server -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +### OpenAI-compatible API server The second primary interface to vLLM is via its OpenAI-compatible API server. This server can be started using the `vllm serve` command. -.. code-block:: bash - - vllm serve +```bash +vllm serve <model> +``` -The code for the `vllm` CLI can be found in `vllm/scripts.py -`_. +The code for the `vllm` CLI can be found in [vllm/scripts.py](https://github.com/vllm-project/vllm/blob/main/vllm/scripts.py). Sometimes you may see the API server entrypoint used directly instead of via the `vllm` CLI command. For example: -.. code-block:: bash - - python -m vllm.entrypoints.openai.api_server --model +```bash +python -m vllm.entrypoints.openai.api_server --model <model> +``` -That code can be found in `vllm/entrypoints/openai/api_server.py -`_. +That code can be found in [vllm/entrypoints/openai/api_server.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/openai/api_server.py). -More details on the API server can be found in the :doc:`OpenAI Compatible +More details on the API server can be found in the {doc}`OpenAI Compatible Server ` document. -LLM Engine ----------- +## LLM Engine The `LLMEngine` and `AsyncLLMEngine` classes are central to the functioning of the vLLM system, handling model inference and asynchronous request processing. -..
image:: /assets/design/arch_overview/llm_engine.excalidraw.png - :alt: LLMEngine Diagram +```{image} /assets/design/arch_overview/llm_engine.excalidraw.png +:alt: LLMEngine Diagram +``` -LLMEngine -^^^^^^^^^ +### LLMEngine The `LLMEngine` class is the core component of the vLLM engine. It is responsible for receiving requests from clients and generating outputs from the @@ -105,21 +99,15 @@ processing. - **Input Processing**: Handles tokenization of input text using the specified tokenizer. - - **Scheduling**: Chooses which requests are processed in each step. - - **Model Execution**: Manages the execution of the language model, including distributed execution across multiple GPUs. - - **Output Processing**: Processes the outputs generated by the model, decoding the token IDs from a language model into human-readable text. -The code for `LLMEngine` can be found in `vllm/engine/llm_engine.py`_. - -.. _vllm/engine/llm_engine.py: https://github.com/vllm-project/vllm/tree/main/vllm/engine/llm_engine.py +The code for `LLMEngine` can be found in [vllm/engine/llm_engine.py]. -AsyncLLMEngine -^^^^^^^^^^^^^^ +### AsyncLLMEngine The `AsyncLLMEngine` class is an asynchronous wrapper for the `LLMEngine` class. It uses `asyncio` to create a background loop that continuously processes @@ -128,54 +116,46 @@ can handle multiple concurrent requests and stream outputs to clients. The OpenAI-compatible API server uses the `AsyncLLMEngine`. There is also a demo API server that serves as a simpler example in -`vllm/entrypoints/api_server.py`_. - -.. _vllm/entrypoints/api_server.py: https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/api_server.py +[vllm/entrypoints/api_server.py]. -The code for `AsyncLLMEngine` can be found in `vllm/engine/async_llm_engine.py`_. +The code for `AsyncLLMEngine` can be found in [vllm/engine/async_llm_engine.py]. -.. 
_vllm/engine/async_llm_engine.py: https://github.com/vllm-project/vllm/tree/main/vllm/engine/async_llm_engine.py - -Worker ------- +## Worker A worker is a process that runs the model inference. vLLM follows the common practice of using one process to control one accelerator device, such as GPUs. For example, if we use tensor parallelism of size 2 and pipeline parallelism of size 2, we will have 4 workers in total. Workers are identified by their -``rank`` and ``local_rank``. ``rank`` is used for global orchestration, while -``local_rank`` is mainly used for assigning the accelerator device and accessing +`rank` and `local_rank`. `rank` is used for global orchestration, while +`local_rank` is mainly used for assigning the accelerator device and accessing local resources such as the file system and shared memory. -Model Runner ------------- +## Model Runner Every worker has one model runner object, responsible for loading and running the model. Much of the model execution logic resides here, such as preparing input tensors and capturing cudagraphs. -Model ------ +## Model Every model runner object has one model object, which is the actual -``torch.nn.Module`` instance. See :ref:`huggingface_integration` for how various +`torch.nn.Module` instance. See [huggingface_integration](#huggingface-integration) for how various configurations affect the class we ultimately get. -Class Hierarchy ---------------- +## Class Hierarchy The following figure shows the class hierarchy of vLLM: - .. figure:: /assets/design/hierarchy.png - :alt: query - :width: 100% - :align: center +> ```{figure} /assets/design/hierarchy.png +> :align: center +> :alt: query +> :width: 100% +> ``` There are several important design choices behind this class hierarchy: -1. **Extensibility**: All classes in the hierarchy accept a configuration object -containing all the necessary information. The `VllmConfig -`__ +1\. 
**Extensibility**: All classes in the hierarchy accept a configuration object +containing all the necessary information. The [VllmConfig](https://github.com/vllm-project/vllm/blob/d1c6799b8870e513bf4f2305cbf6cda9fc3d773b/vllm/config.py#L2036) class is the main configuration object that is passed around. The class hierarchy is quite deep, and every class needs to read the configuration it is interested in. By encapsulating all configurations in one object, we can easily @@ -188,7 +168,7 @@ the `VllmConfig` class, and the model runner can access it directly. We don't need to change the constructor of the engine, worker, or model class to pass the new configuration option. -2. **Uniformity**: The model runner needs a unified interface to create and +2\. **Uniformity**: The model runner needs a unified interface to create and initialize the model. vLLM supports more than 50 types of popular open-source models. Each model has its own initialization logic. If the constructor signature varies with models, the model runner does not know how to call the @@ -200,46 +180,46 @@ of a vision model and a language model. By making the constructor uniform, we can easily create a vision model and a language model and compose them into a vision-language model. -.. note:: - - To support this change, all vLLM models' signatures have been updated to: - - .. code-block:: python - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - - To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one: - - .. 
code-block:: python - - class MyOldModel(nn.Module): - def __init__( - self, - config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: - ... - - from vllm.config import VllmConfig - class MyNewModel(MyOldModel): - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config - super().__init__(config, cache_config, quant_config, lora_config, prefix) - - if __version__ >= "0.6.4": - MyModel = MyNewModel - else: - MyModel = MyOldModel - - This way, the model can work with both old and new versions of vLLM. - -3. **Sharding and Quantization at Initialization**: Certain features require +````{note} +To support this change, all vLLM models' signatures have been updated to: + +```python +def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): +``` + +To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one: + +```python +class MyOldModel(nn.Module): + def __init__( + self, + config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + prefix: str = "", + ) -> None: + ... 
+ +from vllm.config import VllmConfig +class MyNewModel(MyOldModel): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + super().__init__(config, cache_config, quant_config, lora_config, prefix) + +if __version__ >= "0.6.4": + MyModel = MyNewModel +else: + MyModel = MyOldModel +``` + +This way, the model can work with both old and new versions of vLLM. +```` + +3\. **Sharding and Quantization at Initialization**: Certain features require changing the model weights. For example, tensor parallelism needs to shard the model weights, and quantization needs to quantize the model weights. There are two possible ways to implement this feature. One way is to change the model @@ -252,23 +232,27 @@ initialized, we need to load the full 810GB weights to every GPU and then shard the weights, leading to a huge memory overhead. Instead, if we shard the weights during the model initialization, every layer will only create a shard of the weights it needs, leading to a much smaller memory overhead. The same idea -applies to quantization. Note that we also add an additional argument ``prefix`` +applies to quantization. Note that we also add an additional argument `prefix` to the model's constructor so that the model can initialize itself differently based on the prefix. This is useful for non-uniform quantization, where -different parts of the model are quantized differently. The ``prefix`` is -usually an empty string for the top-level model and a string like ``"vision"`` -or ``"language"`` for the sub-models. In general, it matches the name of the +different parts of the model are quantized differently. The `prefix` is +usually an empty string for the top-level model and a string like `"vision"` +or `"language"` for the sub-models. 
In general, it matches the name of the module's state dict in the checkpoint file. One disadvantage of this design is that it is hard to write unit tests for individual components in vLLM because every component needs to be initialized by a complete config object. We solve this problem by providing a default initialization function that creates a default config object with all fields set -to ``None``. If the component we want to test only cares about a few fields in +to `None`. If the component we want to test only cares about a few fields in the config object, we can create a default config object and set the fields we care about. This way, we can test the component in isolation. Note that many tests in vLLM are end-to-end tests that test the whole system, so this is not a big problem. -In summary, the complete config object ``VllmConfig`` can be treated as an +In summary, the complete config object `VllmConfig` can be treated as an engine-level global state that is shared among all vLLM classes. + +[vllm/engine/async_llm_engine.py]: https://github.com/vllm-project/vllm/tree/main/vllm/engine/async_llm_engine.py +[vllm/engine/llm_engine.py]: https://github.com/vllm-project/vllm/tree/main/vllm/engine/llm_engine.py +[vllm/entrypoints/api_server.py]: https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/api_server.py diff --git a/docs/source/design/huggingface_integration.md b/docs/source/design/huggingface_integration.md new file mode 100644 index 0000000000000..99b4cb56424c6 --- /dev/null +++ b/docs/source/design/huggingface_integration.md @@ -0,0 +1,36 @@ +(huggingface-integration)= + +# Integration with HuggingFace + +This document describes how vLLM integrates with HuggingFace libraries. We will explain step by step what happens under the hood when we run `vllm serve`. + +Let's say we want to serve the popular QWen model by running `vllm serve Qwen/Qwen2-7B`. + +1. The `model` argument is `Qwen/Qwen2-7B`. 
vLLM determines whether this model exists by checking for the corresponding config file `config.json`. See this [code snippet](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L162-L182) for the implementation. Within this process: + + - If the `model` argument corresponds to an existing local path, vLLM will load the config file directly from this path. + - If the `model` argument is a HuggingFace model ID consisting of a username and model name, vLLM will first try to use the config file from the HuggingFace local cache, using the `model` argument as the model name and the `--revision` argument as the revision. See [their website](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhome) for more information on how the HuggingFace cache works. + - If the `model` argument is a HuggingFace model ID but it is not found in the cache, vLLM will download the config file from the HuggingFace model hub. Refer to [this function](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L91) for the implementation. The input arguments include the `model` argument as the model name, the `--revision` argument as the revision, and the environment variable `HF_TOKEN` as the token to access the model hub. In our case, vLLM will download the [config.json](https://huggingface.co/Qwen/Qwen2-7B/blob/main/config.json) file. + +2. After confirming the existence of the model, vLLM loads its config file and converts it into a dictionary. See this [code snippet](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L185-L186) for the implementation. + +3. 
Next, vLLM [inspects](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L189) the `model_type` field in the config dictionary to [generate](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L190-L216) the config object to use. There are some `model_type` values that vLLM directly supports; see [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L48) for the list. If the `model_type` is not in the list, vLLM will use [AutoConfig.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoConfig.from_pretrained) to load the config class, with `model`, `--revision`, and `--trust_remote_code` as the arguments. Please note that: + + - HuggingFace also has its own logic to determine the config class to use. It will again use the `model_type` field to search for the class name in the transformers library; see [here](https://github.com/huggingface/transformers/tree/main/src/transformers/models) for the list of supported models. If the `model_type` is not found, HuggingFace will use the `auto_map` field from the config JSON file to determine the class name. Specifically, it is the `AutoConfig` field under `auto_map`. See [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2.5/blob/main/config.json) for an example. + - The `AutoConfig` field under `auto_map` points to a module path in the model's repository. To create the config class, HuggingFace will import the module and use the `from_pretrained` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when `--trust_remote_code` is enabled. + +4. Subsequently, vLLM applies some historical patches to the config object.
These are mostly related to RoPE configuration; see [here](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/config.py#L244) for the implementation. + +5. Finally, vLLM can reach the model class we want to initialize. vLLM uses the `architectures` field in the config object to determine the model class to initialize, as it maintains the mapping from architecture name to model class in [its registry](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/model_executor/models/registry.py#L80). If the architecture name is not found in the registry, it means this model architecture is not supported by vLLM. For `Qwen/Qwen2-7B`, the `architectures` field is `["Qwen2ForCausalLM"]`, which corresponds to the `Qwen2ForCausalLM` class in [vLLM's code](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/model_executor/models/qwen2.py#L364). This class will initialize itself depending on various configs. + +Beyond that, there are two more things vLLM depends on HuggingFace for. + +1. **Tokenizer**: vLLM uses the tokenizer from HuggingFace to tokenize the input text. The tokenizer is loaded using [AutoTokenizer.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained) with the `model` argument as the model name and the `--revision` argument as the revision. It is also possible to use a tokenizer from another model by specifying the `--tokenizer` argument in the `vllm serve` command. Other relevant arguments are `--tokenizer-revision` and `--tokenizer-mode`. Please check HuggingFace's documentation for the meaning of these arguments. This part of the logic can be found in the [get_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L87) function. 
After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in [get_cached_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L24). + +2. **Model weight**: vLLM downloads the model weight from the HuggingFace model hub using the `model` argument as the model name and the `--revision` argument as the revision. vLLM provides the argument `--load-format` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass `--load-format dummy` to skip downloading the weights. Please note that: + + - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the [documentation](https://huggingface.co/docs/safetensors/en/index) for more information on the safetensors format. This part of the logic can be found [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/model_executor/model_loader/loader.py#L385). + +This completes the integration between vLLM and HuggingFace. + +In summary, vLLM reads the config file `config.json`, tokenizer, and model weight from the HuggingFace model hub or a local directory. It uses the config class from either vLLM, HuggingFace transformers, or loads the config class from the model's repository. diff --git a/docs/source/design/huggingface_integration.rst b/docs/source/design/huggingface_integration.rst deleted file mode 100644 index e6c1cea6001ea..0000000000000 --- a/docs/source/design/huggingface_integration.rst +++ /dev/null @@ -1,40 +0,0 @@ -.. _huggingface_integration: - -Integration with HuggingFace -=================================== - -This document describes how vLLM integrates with HuggingFace libraries.
We will explain step by step what happens under the hood when we run ``vllm serve``. - -Let's say we want to serve the popular QWen model by running ``vllm serve Qwen/Qwen2-7B``. - -1. The ``model`` argument is ``Qwen/Qwen2-7B``. vLLM determines whether this model exists by checking for the corresponding config file ``config.json``. See this `code snippet `__ for the implementation. Within this process: - - - If the ``model`` argument corresponds to an existing local path, vLLM will load the config file directly from this path. - - - If the ``model`` argument is a HuggingFace model ID consisting of a username and model name, vLLM will first try to use the config file from the HuggingFace local cache, using the ``model`` argument as the model name and the ``--revision`` argument as the revision. See `their website `__ for more information on how the HuggingFace cache works. - - - If the ``model`` argument is a HuggingFace model ID but it is not found in the cache, vLLM will download the config file from the HuggingFace model hub. Refer to `this function `__ for the implementation. The input arguments include the ``model`` argument as the model name, the ``--revision`` argument as the revision, and the environment variable ``HF_TOKEN`` as the token to access the model hub. In our case, vLLM will download the `config.json `__ file. - -2. After confirming the existence of the model, vLLM loads its config file and converts it into a dictionary. See this `code snippet `__ for the implementation. - -3. Next, vLLM `inspects `__ the ``model_type`` field in the config dictionary to `generate `__ the config object to use. There are some ``model_type`` values that vLLM directly supports; see `here `__ for the list. If the ``model_type`` is not in the list, vLLM will use `AutoConfig.from_pretrained `__ to load the config class, with ``model``, ``--revision``, and ``--trust_remote_code`` as the arguments. 
Please note that: - - - HuggingFace also has its own logic to determine the config class to use. It will again use the ``model_type`` field to search for the class name in the transformers library; see `here `__ for the list of supported models. If the ``model_type`` is not found, HuggingFace will use the ``auto_map`` field from the config JSON file to determine the class name. Specifically, it is the ``AutoConfig`` field under ``auto_map``. See `DeepSeek `__ for an example. - - - The ``AutoConfig`` field under ``auto_map`` points to a module path in the model's repository. To create the config class, HuggingFace will import the module and use the ``from_pretrained`` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when ``--trust_remote_code`` is enabled. - -4. Subsequently, vLLM applies some historical patches to the config object. These are mostly related to RoPE configuration; see `here `__ for the implementation. - -5. Finally, vLLM can reach the model class we want to initialize. vLLM uses the ``architectures`` field in the config object to determine the model class to initialize, as it maintains the mapping from architecture name to model class in `its registry `__. If the architecture name is not found in the registry, it means this model architecture is not supported by vLLM. For ``Qwen/Qwen2-7B``, the ``architectures`` field is ``["Qwen2ForCausalLM"]``, which corresponds to the ``Qwen2ForCausalLM`` class in `vLLM's code `__. This class will initialize itself depending on various configs. - -Beyond that, there are two more things vLLM depends on HuggingFace for. - -1. **Tokenizer**: vLLM uses the tokenizer from HuggingFace to tokenize the input text. The tokenizer is loaded using `AutoTokenizer.from_pretrained `__ with the ``model`` argument as the model name and the ``--revision`` argument as the revision. 
It is also possible to use a tokenizer from another model by specifying the ``--tokenizer`` argument in the ``vllm serve`` command. Other relevant arguments are ``--tokenizer-revision`` and ``--tokenizer-mode``. Please check HuggingFace's documentation for the meaning of these arguments. This part of the logic can be found in the `get_tokenizer `__ function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in `get_cached_tokenizer `__. - -2. **Model weight**: vLLM downloads the model weight from the HuggingFace model hub using the ``model`` argument as the model name and the ``--revision`` argument as the revision. vLLM provides the argument ``--load-format`` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass ``--load-format dummy`` to skip downloading the weights. - - - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the `documentation `__ for more information on the safetensors format. This part of the logic can be found `here `__. Please note that: - -This completes the integration between vLLM and HuggingFace. - -In summary, vLLM reads the config file ``config.json``, tokenizer, and model weight from the HuggingFace model hub or a local directory. It uses the config class from either vLLM, HuggingFace transformers, or loads the config class from the model's repository. diff --git a/docs/source/design/input_processing/input_processing_pipeline.md b/docs/source/design/input_processing/input_processing_pipeline.md new file mode 100644 index 0000000000000..bb16920e3d0c0 --- /dev/null +++ b/docs/source/design/input_processing/input_processing_pipeline.md @@ -0,0 +1,19 @@ +(input-processing-pipeline)= + +# Input Processing Pipeline + +1. 
Input data is passed to {class}`~vllm.LLMEngine` (or {class}`~vllm.AsyncLLMEngine`). + +2. Tokenize the data if necessary. + +3. Process the inputs using {meth}`INPUT_REGISTRY.process_input `. + + - For example, add placeholder tokens to reserve KV cache for multi-modal embeddings. + +4. Send the processed inputs to {class}`~vllm.executor.executor_base.ExecutorBase`. + +5. Distribute the inputs via {class}`~vllm.worker.worker_base.WorkerBase` to {class}`~vllm.worker.model_runner_base.ModelRunnerBase`. + +6. If the data contains multi-modal data, convert it into keyword arguments using {meth}`MULTIMODAL_REGISTRY.map_input `. + + - For example, convert a {class}`PIL.Image.Image` input to its pixel values for a vision model. diff --git a/docs/source/design/input_processing/input_processing_pipeline.rst b/docs/source/design/input_processing/input_processing_pipeline.rst deleted file mode 100644 index 48abec8f75286..0000000000000 --- a/docs/source/design/input_processing/input_processing_pipeline.rst +++ /dev/null @@ -1,20 +0,0 @@ -.. _input_processing_pipeline: - -Input Processing Pipeline -========================= - -1. Input data is passed to :class:`~vllm.LLMEngine` (or :class:`~vllm.AsyncLLMEngine`). - -2. Tokenize the data if necessary. - -3. Process the inputs using :meth:`INPUT_REGISTRY.process_input `. - - - For example, add placeholder tokens to reserve KV cache for multi-modal embeddings. - -4. Send the processed inputs to :class:`~vllm.executor.executor_base.ExecutorBase`. - -5. Distribute the inputs via :class:`~vllm.worker.worker_base.WorkerBase` to :class:`~vllm.worker.model_runner_base.ModelRunnerBase`. - -6. If the data contains multi-modal data, convert it into keyword arguments using :meth:`MULTIMODAL_REGISTRY.map_input `. - - - For example, convert a :class:`PIL.Image.Image` input to its pixel values for a vision model. 
diff --git a/docs/source/design/input_processing/model_inputs_index.md b/docs/source/design/input_processing/model_inputs_index.md new file mode 100644 index 0000000000000..cb415366e5a66 --- /dev/null +++ b/docs/source/design/input_processing/model_inputs_index.md @@ -0,0 +1,43 @@ +(input-processing)= + +# Input Processing + +```{eval-rst} +.. currentmodule:: vllm.inputs +``` + +Each model can override parts of vLLM's [input processing pipeline](#input-processing-pipeline) via +{data}`~vllm.inputs.INPUT_REGISTRY` and {data}`~vllm.multimodal.MULTIMODAL_REGISTRY`. + +Currently, this mechanism is only utilized in [multi-modal](#multi-modality) models for preprocessing multi-modal input +data in addition to input prompt, but it can be extended to text-only language models when needed. + +## Guides + +```{toctree} +:maxdepth: 1 + +input_processing_pipeline +``` + +## Module Contents + +### LLM Engine Inputs + +```{eval-rst} +.. autoclass:: vllm.inputs.DecoderOnlyInputs + :members: + :show-inheritance: +``` + +### Registry + +```{eval-rst} +.. autodata:: vllm.inputs.INPUT_REGISTRY +``` + +```{eval-rst} +.. automodule:: vllm.inputs.registry + :members: + :show-inheritance: +``` diff --git a/docs/source/design/input_processing/model_inputs_index.rst b/docs/source/design/input_processing/model_inputs_index.rst deleted file mode 100644 index f0ec1fea15ddb..0000000000000 --- a/docs/source/design/input_processing/model_inputs_index.rst +++ /dev/null @@ -1,39 +0,0 @@ -.. _input_processing: - -Input Processing -================ - -.. currentmodule:: vllm.inputs - -Each model can override parts of vLLM's :ref:`input processing pipeline ` via -:data:`~vllm.inputs.INPUT_REGISTRY` and :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`. - -Currently, this mechanism is only utilized in :ref:`multi-modal ` models for preprocessing multi-modal input -data in addition to input prompt, but it can be extended to text-only language models when needed. - -Guides -++++++ - -.. 
toctree:: - :maxdepth: 1 - - input_processing_pipeline - -Module Contents -+++++++++++++++ - -LLM Engine Inputs ------------------ - -.. autoclass:: vllm.inputs.DecoderOnlyInputs - :members: - :show-inheritance: - -Registry --------- - -.. autodata:: vllm.inputs.INPUT_REGISTRY - -.. automodule:: vllm.inputs.registry - :members: - :show-inheritance: diff --git a/docs/source/design/kernel/paged_attention.md b/docs/source/design/kernel/paged_attention.md new file mode 100644 index 0000000000000..c21985b36eb3a --- /dev/null +++ b/docs/source/design/kernel/paged_attention.md @@ -0,0 +1,527 @@ +# vLLM Paged Attention + +- Currently, vLLM utilizes its own implementation of a multi-head query + attention kernel (`csrc/attention/attention_kernels.cu`). + This kernel is designed to be compatible with + vLLM's paged KV caches, where the key and value cache are stored in + separate blocks (note that this block concept differs from the GPU + thread block. So in a later document, I will refer to vLLM paged + attention block as "block", while refer to GPU thread block as + "thread block"). +- To achieve high performance, this kernel relies on a specially + designed memory layout and access method, specifically when threads + read data from global memory to shared memory. The purpose of this + document is to provide a high-level explanation of the kernel + implementation step by step, aiding those who wish to learn about the + vLLM multi-head query attention kernel. After going through this + document, users will likely have a better understanding and feel easier + to follow the actual implementation. +- Please note that this document may not cover all details, such as how + to calculate the correct index for the corresponding data or the dot + multiplication implementation. However, after reading this document + and becoming familiar with the high-level logic flow, it should be + easier for you to read the actual code and understand the details. 
+ +## Inputs + +- The kernel function takes a list of arguments for the current thread + to perform its assigned work. The three most important arguments are + the input pointers `q`, `k_cache`, and `v_cache`, which point + to query, key, and value data on global memory that need to be read + and processed. The output pointer `out` points to global memory + where the result should be written. These four pointers actually + refer to multi-dimensional arrays, but each thread only accesses the + portion of data assigned to it. I have omitted all other runtime + parameters here for simplicity. + + ```cpp + template< + typename scalar_t, + int HEAD_SIZE, + int BLOCK_SIZE, + int NUM_THREADS, + int PARTITION_SIZE = 0> + __device__ void paged_attention_kernel( + ... // Other side args. + const scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, head_size] + const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] + const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] + const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] + ... // Other side args. + ) + ``` + +- There are also a list of template arguments above the function + signature that are determined during compilation time. `scalar_t` + represents the data type of the query, key, and value data elements, + such as FP16. `HEAD_SIZE` indicates the number of elements in each + head. `BLOCK_SIZE` refers to the number of tokens in each block. + `NUM_THREADS` denotes the number of threads in each thread block. + `PARTITION_SIZE` represents the number of tensor parallel GPUs (For + simplicity, we assume this is 0 and tensor parallel is disabled). + +- With these arguments, we need to perform a sequence of preparations. + This includes calculating the current head index, block index, and + other necessary variables. 
However, for now, we can ignore these + preparations and proceed directly to the actual calculations. It will + be easier to understand them once we grasp the entire flow. + +## Concepts + +- Just before we dive into the calculation flow, I want to describe a + few concepts that are needed for later sections. However, you may + skip this section and return later if you encounter any confusing + terminologies. +- **Sequence**: A sequence represents a client request. For example, + the data pointed to by `q` has a shape of + `[num_seqs, num_heads, head_size]`. That represents there are total + `num_seqs` of query sequence data are pointed by `q`. Since this + kernel is a single query attention kernel, each sequence only has one + query token. Hence, the `num_seqs` equals the total number of tokens + that are processed in the batch. +- **Context**: The context consists of the generated tokens from the + sequence. For instance, `["What", "is", "your"]` are the context + tokens, and the input query token is `"name"`. The model might + generate the token `"?"`. +- **Vec**: The vec is a list of elements that are fetched and + calculated together. For query and key data, the vec size + (`VEC_SIZE`) is determined so that each thread group can fetch and + calculate 16 bytes of data at a time. For value data, the vec size + (`V_VEC_SIZE`) is determined so that each thread can fetch and + calculate 16 bytes of data at a time. For example, if the + `scalar_t` is FP16 (2 bytes) and `THREAD_GROUP_SIZE` is 2, the + `VEC_SIZE` will be 4, while the `V_VEC_SIZE` will be 8. +- **Thread group**: The thread group is a small group of + threads(`THREAD_GROUP_SIZE`) that fetches and calculates one + query token and one key token at a time. Each thread handles only a + portion of the token data. The total number of elements processed by + one thread group is referred as `x`. 
For example, if the thread + group contains 2 threads and the head size is 8, then thread 0 + handles the query and key elements at index 0, 2, 4, 6, while thread + 1 handles the elements at index 1, 3, 5, 7. +- **Block**: The key and value cache data in vLLM are split into + blocks. Each block stores data for a fixed number(`BLOCK_SIZE`) + of tokens at one head. Each block may contain only a portion of the + whole context tokens. For example, if the block size is 16 and the + head size is 128, then for one head, one block can store 16 * 128 = + 2048 elements. +- **Warp**: A warp is a group of 32 threads(`WARP_SIZE`) that + execute simultaneously on a stream multiprocessor (SM). In this + kernel, each warp processes the calculation between one query token + and key tokens of one entire block at a time (it may process multiple + blocks in multiple iterations). For example, if there are 4 warps and + 6 blocks for one context, the assignment would be like warp 0 handles + the 0th, 4th blocks, warp 1 handles the 1st, 5th blocks, warp 2 + handles the 2nd block and warp 3 handles the 3rd block. +- **Thread block**: A thread block is a group of + threads(`NUM_THREADS`) that can access the same shared memory. + Each thread block contains multiple warps(`NUM_WARPS`), and in + this kernel, each thread block processes the calculation between one + query token and key tokens of a whole context. +- **Grid**: A grid is a collection of thread blocks and defines the + shape of the collection. In this kernel, the shape is + `(num_heads, num_seqs, max_num_partitions)`. Therefore, each thread + block only handles the calculation for one head, one sequence, and + one partition. + +## Query + +- This section will introduce how query data is stored in memory and + fetched by each thread. As mentioned above, each thread group fetches + one query token data, while each thread itself only handles a part of + one query token data. 
Within each warp, every thread group will fetch + the same query token data, but will multiply it with different key + token data. + + ```cpp + const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; + ``` + + ```{figure} ../../assets/kernel/query.png + :align: center + :alt: query + :width: 70% + + Query data of one token at one head + ``` + +- Each thread defines its own `q_ptr` which points to the assigned + query token data on global memory. For example, if `VEC_SIZE` is 4 + and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains + total of 128 elements divided into 128 / 4 = 32 vecs. + + ```{figure} ../../assets/kernel/q_vecs.png + :align: center + :alt: q_vecs + :width: 70% + + `q_vecs` for one thread group + ``` + + ```cpp + __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD]; + ``` + +- Next, we need to read the global memory data pointed to by `q_ptr` + into shared memory as `q_vecs`. It is important to note that each + vecs is assigned to a different row. For example, if the + `THREAD_GROUP_SIZE` is 2, thread 0 will handle the 0th row vecs, + while thread 1 handles the 1st row vecs. By reading the query data in + this way, neighboring threads like thread 0 and thread 1 can read + neighbor memory, achieving the memory coalescing to improve + performance. + +## Key + +- Similar to the "Query" section, this section introduces memory layout + and assignment for keys. While each thread group only handle one + query token one kernel run, it may handle multiple key tokens across + multiple iterations. Meanwhile, each warp will process multiple blocks + of key tokens in multiple iterations, ensuring that all context + tokens are processed by the entire thread group after the kernel run. + In this context, "handle" refers to performing the dot multiplication + between query data and key data. 
+ + ```cpp + const scalar_t* k_ptr = k_cache + physical_block_number * kv_block_stride + + kv_head_idx * kv_head_stride + + physical_block_offset * x; + ``` + +- Unlike to `q_ptr`, `k_ptr` in each thread will point to different + key token at different iterations. As shown above, that `k_ptr` + points to key token data based on `k_cache` at assigned block, + assigned head and assigned token. + + ```{figure} ../../assets/kernel/key.png + :align: center + :alt: key + :width: 70% + + Key data of all context tokens at one head + ``` + +- The diagram above illustrates the memory layout for key data. It + assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is + 8, `THREAD_GROUP_SIZE` is 2, and there are a total of 4 warps. Each + rectangle represents all the elements for one key token at one head, + which will be processed by one thread group. The left half shows the + total 16 blocks of key token data for warp 0, while the right half + represents the remaining key token data for other warps or + iterations. Inside each rectangle, there are a total 32 vecs (128 + elements for one token) that will be processed by 2 threads (one + thread group) separately. + + ```{figure} ../../assets/kernel/k_vecs.png + :align: center + :alt: k_vecs + :width: 70% + + `k_vecs` for one thread + ``` + + ```cpp + K_vec k_vecs[NUM_VECS_PER_THREAD] + ``` + +- Next, we need to read the key token data from `k_ptr` and store + them on register memory as `k_vecs`. We use register memory for + `k_vecs` because it will only be accessed by one thread once, + whereas `q_vecs` will be accessed by multiple threads multiple + times. Each `k_vecs` will contain multiple vectors for later + calculation. Each vec will be set at each inner iteration. The + assignment of vecs allows neighboring threads in a warp to read + neighboring memory together, which again promotes the memory + coalescing. For instance, thread 0 will read vec 0, while thread 1 + will read vec 1. 
In the next inner loop, thread 0 will read vec 2, + while thread 1 will read vec 3, and so on. + +- You may still be a little confused about the overall flow. Don't + worry, please keep reading the next "QK" section. It will illustrate + the query and key calculation flow in a clearer and higher-level + manner. + +## QK + +- As shown the pseudo code below, before the entire for loop block, we + fetch the query data for one token and store it in `q_vecs`. Then, + in the outer for loop, we iterate through different `k_ptrs` that + point to different tokens and prepare the `k_vecs` in the inner for + loop. Finally, we perform the dot multiplication between the + `q_vecs` and each `k_vecs`. + + ```cpp + q_vecs = ... + for ... { + k_ptr = ... + for ... { + k_vecs[i] = ... + } + ... + float qk = scale * Qk_dot::dot(q_vecs[thread_group_offset], k_vecs); + } + ``` + +- As mentioned before, for each thread, it only fetches part of the + query and key token data at a time. However, there will be a cross + thread group reduction happen in the `Qk_dot<>::dot` . So `qk` + returned here is not just between part of the query and key token dot + multiplication, but actually a full result between entire query and + key token data. + +- For example, if the value of `HEAD_SIZE` is 128 and + `THREAD_GROUP_SIZE` is 2, each thread's `k_vecs` will contain + total 64 elements. However, the returned `qk` is actually the + result of dot multiplication between 128 query elements and 128 key + elements. If you want to learn more about the details of the dot + multiplication and reduction, you may refer to the implementation of + `Qk_dot<>::dot`. However, for the sake of simplicity, I will not + cover it in this document. + +## Softmax + +- Next, we need to calculate the normalized softmax for all `qk`s, + as shown above, where each $x$ represents a `qk`. To do this, + we must obtain the reduced value of `qk_max`($m(x)$) and + the `exp_sum`($\ell(x)$) of all `qk`s. 
The reduction + should be performed across the entire thread block, encompassing + results between the query token and all context key tokens. + + ```{math} + :nowrap: true + + \begin{gather*} + m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\ + \quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)} + \end{gather*} + ``` + +### `qk_max` and `logits` + +- Just right after we get the `qk` result, we can set the temporary + `logits` result with `qk` (In the end, the `logits` should + store the normalized softmax result). Also we can compare and collect + the `qk_max` for all `qk`s that are calculated by current + thread group. + + ```cpp + if (thread_group_offset == 0) { + const bool mask = token_idx >= context_len; + logits[token_idx - start_token_idx] = mask ? 0.f : qk; + qk_max = mask ? qk_max : fmaxf(qk_max, qk); + } + ``` + +- Please note that the `logits` here is on shared memory, so each + thread group will set the fields for its own assigned context tokens. + Overall, the size of logits should be number of context tokens. + + ```cpp + for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) { + qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); + } + + if (lane == 0) { + red_smem[warp_idx] = qk_max; + } + ``` + +- Then we need to get the reduced `qk_max` across each warp. The main + idea is to make threads in warp to communicate with each other and + get the final max `qk` . + + ```cpp + for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); + } + qk_max = VLLM_SHFL_SYNC(qk_max, 0); + ``` + +- Finally, we can get the reduced `qk_max` from whole thread block by + compare the `qk_max` from all warps in this thread block. Then we + need to broadcast the final result to each thread. 
+ +### `exp_sum` + +- Similar to `qk_max`, we need to get the reduced sum value from the + entire thread block too. + + ```cpp + for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { + float val = __expf(logits[i] - qk_max); + logits[i] = val; + exp_sum += val; + } + ... + exp_sum = block_sum(&red_smem[NUM_WARPS], exp_sum); + ``` + +- Firstly, sum all exp values from each thread group, and meanwhile, + convert each entry of `logits` from `qk` to `exp(qk - qk_max)`. + Please note, the `qk_max` here is already the max `qk` across the + whole thread block. And then we can do reduction for `exp_sum` + across whole thread block just like the `qk_max`. + + ```cpp + const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f); + for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { + logits[i] *= inv_sum; + } + ``` + +- Finally, with the reduced `qk_max` and `exp_sum`, we can obtain + the final normalized softmax result as `logits`. This `logits` + variable will be used for dot multiplication with the value data in + later steps. Now, it should store the normalized softmax result of + `qk` for all assigned context tokens. + +## Value + +```{figure} ../../assets/kernel/value.png +:align: center +:alt: value +:width: 70% + +Value data of all context tokens at one head +``` + +```{figure} ../../assets/kernel/logits_vec.png +:align: center +:alt: logits_vec +:width: 50% + +`logits_vec` for one thread +``` + +```{figure} ../../assets/kernel/v_vec.png +:align: center +:alt: v_vec +:width: 70% + +List of `v_vec` for one thread +``` + +- Now we need to retrieve the value data and perform dot multiplication + with `logits`. Unlike query and key, there is no thread group + concept for value data. As shown in diagram, different from key token + memory layout, elements from the same column correspond to the same + value token. For one block of value data, there are `HEAD_SIZE` of + rows and `BLOCK_SIZE` of columns that are split into multiple + `v_vecs`. 
+ +- Each thread always fetches `V_VEC_SIZE` elements from the same + `V_VEC_SIZE` of tokens at a time. As a result, a single thread + retrieves multiple `v_vec`s from different rows and the same + columns through multiple inner iterations. For each `v_vec`, it + needs to be dot multiplied with the corresponding `logits_vec`, + which is also `V_VEC_SIZE` elements from `logits`. Overall, with + multiple inner iterations, each warp will process one block of value + tokens. And with multiple outer iterations, the whole context value + tokens are processd + + ```cpp + float accs[NUM_ROWS_PER_THREAD]; + for ... { // Iteration over different blocks. + logits_vec = ... + for ... { // Iteration over different rows. + v_vec = ... + ... + accs[i] += dot(logits_vec, v_vec); + } + } + ``` + +- As shown in the above pseudo code, in the outer loop, similar to + `k_ptr`, `logits_vec` iterates over different blocks and reads + `V_VEC_SIZE` elements from `logits`. In the inner loop, each + thread reads `V_VEC_SIZE` elements from the same tokens as a + `v_vec` and performs dot multiplication. It is important to note + that in each inner iteration, the thread fetches different head + position elements for the same tokens. The dot result is then + accumulated in `accs`. Therefore, each entry of `accs` is mapped + to a head position assigned to the current thread. + +- For example, if `BLOCK_SIZE` is 16 and `V_VEC_SIZE` is 8, each + thread fetches 8 value elements for 8 tokens at a time. Each element + is from different tokens at the same head position. If `HEAD_SIZE` + is 128 and `WARP_SIZE` is 32, for each inner loop, a warp needs to + fetch `WARP_SIZE * V_VEC_SIZE = 256` elements. This means there are + a total of 128 * 16 / 256 = 8 inner iterations for a warp to handle + a whole block of value tokens. And each `accs` in each thread + contains 8 elements that accumulated at 8 different head positions. 
+ For thread 0, the `accs` variable will have 8 elements, which + are the 0th, 32nd … 224th elements of a value head that are accumulated + from all 8 assigned tokens.
+ + ```cpp + for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { + const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; + if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) { + from_float(*(out_ptr + row_idx), accs[i]); + } + } + ``` + +- Finally, we need to iterate over different assigned head positions + and write out the corresponding accumulated result based on the + `out_ptr`. diff --git a/docs/source/design/kernel/paged_attention.rst b/docs/source/design/kernel/paged_attention.rst deleted file mode 100644 index ba4f7a2718158..0000000000000 --- a/docs/source/design/kernel/paged_attention.rst +++ /dev/null @@ -1,525 +0,0 @@ -vLLM Paged Attention -==================== - -- Currently, vLLM utilizes its own implementation of a multi-head query - attention kernel (``csrc/attention/attention_kernels.cu``). - This kernel is designed to be compatible with - vLLM's paged KV caches, where the key and value cache are stored in - separate blocks (note that this block concept differs from the GPU - thread block. So in a later document, I will refer to vLLM paged - attention block as "block", while refer to GPU thread block as - "thread block"). -- To achieve high performance, this kernel relies on a specially - designed memory layout and access method, specifically when threads - read data from global memory to shared memory. The purpose of this - document is to provide a high-level explanation of the kernel - implementation step by step, aiding those who wish to learn about the - vLLM multi-head query attention kernel. After going through this - document, users will likely have a better understanding and feel easier - to follow the actual implementation. -- Please note that this document may not cover all details, such as how - to calculate the correct index for the corresponding data or the dot - multiplication implementation. 
However, after reading this document - and becoming familiar with the high-level logic flow, it should be - easier for you to read the actual code and understand the details. - -Inputs ------- - -- The kernel function takes a list of arguments for the current thread - to perform its assigned work. The three most important arguments are - the input pointers ``q``, ``k_cache``, and ``v_cache``, which point - to query, key, and value data on global memory that need to be read - and processed. The output pointer ``out`` points to global memory - where the result should be written. These four pointers actually - refer to multi-dimensional arrays, but each thread only accesses the - portion of data assigned to it. I have omitted all other runtime - parameters here for simplicity. - - .. code:: cpp - - template< - typename scalar_t, - int HEAD_SIZE, - int BLOCK_SIZE, - int NUM_THREADS, - int PARTITION_SIZE = 0> - __device__ void paged_attention_kernel( - ... // Other side args. - const scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, head_size] - const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] - const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] - const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] - ... // Other side args. - ) - -- There are also a list of template arguments above the function - signature that are determined during compilation time. ``scalar_t`` - represents the data type of the query, key, and value data elements, - such as FP16. ``HEAD_SIZE`` indicates the number of elements in each - head. ``BLOCK_SIZE`` refers to the number of tokens in each block. - ``NUM_THREADS`` denotes the number of threads in each thread block. - ``PARTITION_SIZE`` represents the number of tensor parallel GPUs (For - simplicity, we assume this is 0 and tensor parallel is disabled). -- With these arguments, we need to perform a sequence of preparations. 
- This includes calculating the current head index, block index, and - other necessary variables. However, for now, we can ignore these - preparations and proceed directly to the actual calculations. It will - be easier to understand them once we grasp the entire flow. - -Concepts --------- - -- Just before we dive into the calculation flow, I want to describe a - few concepts that are needed for later sections. However, you may - skip this section and return later if you encounter any confusing - terminologies. -- **Sequence**: A sequence represents a client request. For example, - the data pointed to by ``q`` has a shape of - ``[num_seqs, num_heads, head_size]``. That represents there are total - ``num_seqs`` of query sequence data are pointed by ``q``. Since this - kernel is a single query attention kernel, each sequence only has one - query token. Hence, the ``num_seqs`` equals the total number of tokens - that are processed in the batch. -- **Context**: The context consists of the generated tokens from the - sequence. For instance, ``["What", "is", "your"]`` are the context - tokens, and the input query token is ``"name"``. The model might - generate the token ``"?"``. -- **Vec**: The vec is a list of elements that are fetched and - calculated together. For query and key data, the vec size - (``VEC_SIZE``) is determined so that each thread group can fetch and - calculate 16 bytes of data at a time. For value data, the vec size - (``V_VEC_SIZE``) is determined so that each thread can fetch and - calculate 16 bytes of data at a time. For example, if the - ``scalar_t`` is FP16 (2 bytes) and ``THREAD_GROUP_SIZE`` is 2, the - ``VEC_SIZE`` will be 4, while the ``V_VEC_SIZE`` will be 8. -- **Thread group**: The thread group is a small group of - threads(\ ``THREAD_GROUP_SIZE``) that fetches and calculates one - query token and one key token at a time. Each thread handles only a - portion of the token data. 
The total number of elements processed by - one thread group is referred as ``x``. For example, if the thread - group contains 2 threads and the head size is 8, then thread 0 - handles the query and key elements at index 0, 2, 4, 6, while thread - 1 handles the elements at index 1, 3, 5, 7. -- **Block**: The key and value cache data in vLLM are split into - blocks. Each block stores data for a fixed number(\ ``BLOCK_SIZE``) - of tokens at one head. Each block may contain only a portion of the - whole context tokens. For example, if the block size is 16 and the - head size is 128, then for one head, one block can store 16 \* 128 = - 2048 elements. -- **Warp**: A warp is a group of 32 threads(\ ``WARP_SIZE``) that - execute simultaneously on a stream multiprocessor (SM). In this - kernel, each warp processes the calculation between one query token - and key tokens of one entire block at a time (it may process multiple - blocks in multiple iterations). For example, if there are 4 warps and - 6 blocks for one context, the assignment would be like warp 0 handles - the 0th, 4th blocks, warp 1 handles the 1st, 5th blocks, warp 2 - handles the 2nd block and warp 3 handles the 3rd block. -- **Thread block**: A thread block is a group of - threads(\ ``NUM_THREADS``) that can access the same shared memory. - Each thread block contains multiple warps(\ ``NUM_WARPS``), and in - this kernel, each thread block processes the calculation between one - query token and key tokens of a whole context. -- **Grid**: A grid is a collection of thread blocks and defines the - shape of the collection. In this kernel, the shape is - ``(num_heads, num_seqs, max_num_partitions)``. Therefore, each thread - block only handles the calculation for one head, one sequence, and - one partition. - -Query ------ - -- This section will introduce how query data is stored in memory and - fetched by each thread. 
As mentioned above, each thread group fetches - one query token data, while each thread itself only handles a part of - one query token data. Within each warp, every thread group will fetch - the same query token data, but will multiply it with different key - token data. - - .. code:: cpp - - const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; - - .. figure:: ../../assets/kernel/query.png - :alt: query - :width: 70% - :align: center - - Query data of one token at one head - -- Each thread defines its own ``q_ptr`` which points to the assigned - query token data on global memory. For example, if ``VEC_SIZE`` is 4 - and ``HEAD_SIZE`` is 128, the ``q_ptr`` points to data that contains - total of 128 elements divided into 128 / 4 = 32 vecs. - - .. figure:: ../../assets/kernel/q_vecs.png - :alt: q_vecs - :width: 70% - :align: center - - ``q_vecs`` for one thread group - - .. code:: cpp - - __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD]; - -- Next, we need to read the global memory data pointed to by ``q_ptr`` - into shared memory as ``q_vecs``. It is important to note that each - vecs is assigned to a different row. For example, if the - ``THREAD_GROUP_SIZE`` is 2, thread 0 will handle the 0th row vecs, - while thread 1 handles the 1st row vecs. By reading the query data in - this way, neighboring threads like thread 0 and thread 1 can read - neighbor memory, achieving the memory coalescing to improve - performance. - -Key ---- - -- Similar to the "Query" section, this section introduces memory layout - and assignment for keys. While each thread group only handle one - query token one kernel run, it may handle multiple key tokens across - multiple iterations. Meanwhile, each warp will process multiple blocks - of key tokens in multiple iterations, ensuring that all context - tokens are processed by the entire thread group after the kernel run. 
- In this context, "handle" refers to performing the dot multiplication - between query data and key data. - - .. code:: cpp - - const scalar_t* k_ptr = k_cache + physical_block_number * kv_block_stride - + kv_head_idx * kv_head_stride - + physical_block_offset * x; - -- Unlike to ``q_ptr``, ``k_ptr`` in each thread will point to different - key token at different iterations. As shown above, that ``k_ptr`` - points to key token data based on ``k_cache`` at assigned block, - assigned head and assigned token. - - .. figure:: ../../assets/kernel/key.png - :alt: key - :width: 70% - :align: center - - Key data of all context tokens at one head - -- The diagram above illustrates the memory layout for key data. It - assumes that the ``BLOCK_SIZE`` is 16, ``HEAD_SIZE`` is 128, ``x`` is - 8, ``THREAD_GROUP_SIZE`` is 2, and there are a total of 4 warps. Each - rectangle represents all the elements for one key token at one head, - which will be processed by one thread group. The left half shows the - total 16 blocks of key token data for warp 0, while the right half - represents the remaining key token data for other warps or - iterations. Inside each rectangle, there are a total 32 vecs (128 - elements for one token) that will be processed by 2 threads (one - thread group) separately. - - .. figure:: ../../assets/kernel/k_vecs.png - :alt: k_vecs - :width: 70% - :align: center - - ``k_vecs`` for one thread - - .. code:: cpp - - K_vec k_vecs[NUM_VECS_PER_THREAD] - -- Next, we need to read the key token data from ``k_ptr`` and store - them on register memory as ``k_vecs``. We use register memory for - ``k_vecs`` because it will only be accessed by one thread once, - whereas ``q_vecs`` will be accessed by multiple threads multiple - times. Each ``k_vecs`` will contain multiple vectors for later - calculation. Each vec will be set at each inner iteration. 
The - assignment of vecs allows neighboring threads in a warp to read - neighboring memory together, which again promotes the memory - coalescing. For instance, thread 0 will read vec 0, while thread 1 - will read vec 1. In the next inner loop, thread 0 will read vec 2, - while thread 1 will read vec 3, and so on. -- You may still be a little confused about the overall flow. Don't - worry, please keep reading the next "QK" section. It will illustrate - the query and key calculation flow in a clearer and higher-level - manner. - -QK ---- - -- As shown the pseudo code below, before the entire for loop block, we - fetch the query data for one token and store it in ``q_vecs``. Then, - in the outer for loop, we iterate through different ``k_ptrs`` that - point to different tokens and prepare the ``k_vecs`` in the inner for - loop. Finally, we perform the dot multiplication between the - ``q_vecs`` and each ``k_vecs``. - - .. code:: cpp - - q_vecs = ... - for ... { - k_ptr = ... - for ... { - k_vecs[i] = ... - } - ... - float qk = scale * Qk_dot::dot(q_vecs[thread_group_offset], k_vecs); - } - -- As mentioned before, for each thread, it only fetches part of the - query and key token data at a time. However, there will be a cross - thread group reduction happen in the ``Qk_dot<>::dot`` . So ``qk`` - returned here is not just between part of the query and key token dot - multiplication, but actually a full result between entire query and - key token data. -- For example, if the value of ``HEAD_SIZE`` is 128 and - ``THREAD_GROUP_SIZE`` is 2, each thread's ``k_vecs`` will contain - total 64 elements. However, the returned ``qk`` is actually the - result of dot multiplication between 128 query elements and 128 key - elements. If you want to learn more about the details of the dot - multiplication and reduction, you may refer to the implementation of - ``Qk_dot<>::dot``. However, for the sake of simplicity, I will not - cover it in this document. 
- -Softmax -------- - -- Next, we need to calculate the normalized softmax for all ``qk``\ s, - as shown above, where each :math:`x` represents a ``qk``. To do this, - we must obtain the reduced value of ``qk_max``\ (:math:`m(x)`) and - the ``exp_sum``\ (:math:`\ell(x)`) of all ``qk``\ s. The reduction - should be performed across the entire thread block, encompassing - results between the query token and all context key tokens. - - .. math:: - :nowrap: - - \begin{gather*} - m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\ - \quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)} - \end{gather*} - -``qk_max`` and ``logits`` -~~~~~~~~~~~~~~~~~~~~~~~~~ - -- Just right after we get the ``qk`` result, we can set the temporary - ``logits`` result with ``qk`` (In the end, the ``logits`` should - store the normalized softmax result). Also we can compare and collect - the ``qk_max`` for all ``qk``\ s that are calculated by current - thread group. - - .. code:: cpp - - if (thread_group_offset == 0) { - const bool mask = token_idx >= context_len; - logits[token_idx - start_token_idx] = mask ? 0.f : qk; - qk_max = mask ? qk_max : fmaxf(qk_max, qk); - } - -- Please note that the ``logits`` here is on shared memory, so each - thread group will set the fields for its own assigned context tokens. - Overall, the size of logits should be number of context tokens. - - .. code:: cpp - - for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) { - qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); - } - - if (lane == 0) { - red_smem[warp_idx] = qk_max; - } - -- Then we need to get the reduced ``qk_max`` across each warp. The main - idea is to make threads in warp to communicate with each other and - get the final max ``qk`` . - - .. 
code:: cpp - - for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) { - qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask)); - } - qk_max = VLLM_SHFL_SYNC(qk_max, 0); - -- Finally, we can get the reduced ``qk_max`` from whole thread block by - compare the ``qk_max`` from all warps in this thread block. Then we - need to broadcast the final result to each thread. - -``exp_sum`` -~~~~~~~~~~~ - -- Similar to ``qk_max``, we need to get the reduced sum value from the - entire thread block too. - - .. code:: cpp - - for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { - float val = __expf(logits[i] - qk_max); - logits[i] = val; - exp_sum += val; - } - ... - exp_sum = block_sum(&red_smem[NUM_WARPS], exp_sum); - -- Firstly, sum all exp values from each thread group, and meanwhile, - convert each entry of ``logits`` from ``qk`` to ``exp(qk - qk_max)``. - Please note, the ``qk_max`` here is already the max ``qk`` across the - whole thread block. And then we can do reduction for ``exp_sum`` - across whole thread block just like the ``qk_max``. - - .. code:: cpp - - const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f); - for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) { - logits[i] *= inv_sum; - } - -- Finally, with the reduced ``qk_max`` and ``exp_sum``, we can obtain - the final normalized softmax result as ``logits``. This ``logits`` - variable will be used for dot multiplication with the value data in - later steps. Now, it should store the normalized softmax result of - ``qk`` for all assigned context tokens. - -Value ------ - -.. figure:: ../../assets/kernel/value.png - :alt: value - :width: 70% - :align: center - - Value data of all context tokens at one head - -.. figure:: ../../assets/kernel/logits_vec.png - :alt: logits_vec - :width: 50% - :align: center - - ``logits_vec`` for one thread - -.. 
figure:: ../../assets/kernel/v_vec.png - :alt: v_vec - :width: 70% - :align: center - - List of ``v_vec`` for one thread - -- Now we need to retrieve the value data and perform dot multiplication - with ``logits``. Unlike query and key, there is no thread group - concept for value data. As shown in diagram, different from key token - memory layout, elements from the same column correspond to the same - value token. For one block of value data, there are ``HEAD_SIZE`` of - rows and ``BLOCK_SIZE`` of columns that are split into multiple - ``v_vecs``. -- Each thread always fetches ``V_VEC_SIZE`` elements from the same - ``V_VEC_SIZE`` of tokens at a time. As a result, a single thread - retrieves multiple ``v_vec``\ s from different rows and the same - columns through multiple inner iterations. For each ``v_vec``, it - needs to be dot multiplied with the corresponding ``logits_vec``, - which is also ``V_VEC_SIZE`` elements from ``logits``. Overall, with - multiple inner iterations, each warp will process one block of value - tokens. And with multiple outer iterations, the whole context value - tokens are processd - - .. code:: cpp - - float accs[NUM_ROWS_PER_THREAD]; - for ... { // Iteration over different blocks. - logits_vec = ... - for ... { // Iteration over different rows. - v_vec = ... - ... - accs[i] += dot(logits_vec, v_vec); - } - } - -- As shown in the above pseudo code, in the outer loop, similar to - ``k_ptr``, ``logits_vec`` iterates over different blocks and reads - ``V_VEC_SIZE`` elements from ``logits``. In the inner loop, each - thread reads ``V_VEC_SIZE`` elements from the same tokens as a - ``v_vec`` and performs dot multiplication. It is important to note - that in each inner iteration, the thread fetches different head - position elements for the same tokens. The dot result is then - accumulated in ``accs``. Therefore, each entry of ``accs`` is mapped - to a head position assigned to the current thread. 
-- For example, if ``BLOCK_SIZE`` is 16 and ``V_VEC_SIZE`` is 8, each - thread fetches 8 value elements for 8 tokens at a time. Each element - is from different tokens at the same head position. If ``HEAD_SIZE`` - is 128 and ``WARP_SIZE`` is 32, for each inner loop, a warp needs to - fetch ``WARP_SIZE * V_VEC_SIZE = 256`` elements. This means there are - a total of 128 \* 16 / 256 = 8 inner iterations for a warp to handle - a whole block of value tokens. And each ``accs`` in each thread - contains 8 elements that accumulated at 8 different head positions. - For the thread 0, the ``accs`` variable will have 8 elements, which - are 0th, 32th … 224th elements of a value head that are accumulated - from all assigned 8 tokens. - -LV ---- -- Now, we need to perform reduction for ``accs`` within each warp. This - process allows each thread to accumulate the ``accs`` for the - assigned head positions of all tokens in one block. - - .. code:: cpp - - for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { - float acc = accs[i]; - for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) { - acc += VLLM_SHFL_XOR_SYNC(acc, mask); - } - accs[i] = acc; - } - -- Next, we perform reduction for ``accs`` across all warps, allowing - each thread to have the accumulation of ``accs`` for the assigned - head positions of all context tokens. Please note that each ``accs`` - in every thread only stores the accumulation for a portion of - elements of the entire head for all context tokens. However, overall, - all results for output have been calculated but are just stored in - different thread register memory. - - .. code:: cpp - - float* out_smem = reinterpret_cast(shared_mem); - for (int i = NUM_WARPS; i > 1; i /= 2) { - // Upper warps write to shared memory. - ... - float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE]; - for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { - ... - dst[row_idx] = accs[i]; - } - - // Lower warps update the output. 
- const float* src = &out_smem[warp_idx * HEAD_SIZE]; - for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { - ... - accs[i] += src[row_idx]; - } - - // Write out the accs. - } - -Output ------- - -- Now we can write all of calculated result from local register memory - to final output global memory. - - .. code:: cpp - - scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE - + head_idx * max_num_partitions * HEAD_SIZE - + partition_idx * HEAD_SIZE; - -- First, we need to define the ``out_ptr`` variable, which points to - the start address of the assigned sequence and assigned head. - - .. code:: cpp - - for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { - const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER; - if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) { - from_float(*(out_ptr + row_idx), accs[i]); - } - } - -- Finally, we need to iterate over different assigned head positions - and write out the corresponding accumulated result based on the - ``out_ptr``. diff --git a/docs/source/design/multimodal/adding_multimodal_plugin.md b/docs/source/design/multimodal/adding_multimodal_plugin.md new file mode 100644 index 0000000000000..bcccd284879bb --- /dev/null +++ b/docs/source/design/multimodal/adding_multimodal_plugin.md @@ -0,0 +1,16 @@ +(adding-multimodal-plugin)= + +# Adding a Multimodal Plugin + +This document teaches you how to add a new modality to vLLM. + +Each modality in vLLM is represented by a {class}`~vllm.multimodal.MultiModalPlugin` and registered to {data}`~vllm.multimodal.MULTIMODAL_REGISTRY`. +For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to {meth}`~vllm.multimodal.MultiModalRegistry.register_plugin`. + +The remainder of this document details how to define custom {class}`~vllm.multimodal.MultiModalPlugin` s. + +```{note} +This article is a work in progress. +``` + +% TODO: Add more instructions on how to add new plugins once embeddings is in. 
diff --git a/docs/source/design/multimodal/adding_multimodal_plugin.rst b/docs/source/design/multimodal/adding_multimodal_plugin.rst deleted file mode 100644 index b726138f840a3..0000000000000 --- a/docs/source/design/multimodal/adding_multimodal_plugin.rst +++ /dev/null @@ -1,17 +0,0 @@ -.. _adding_multimodal_plugin: - -Adding a Multimodal Plugin -========================== - -This document teaches you how to add a new modality to vLLM. - -Each modality in vLLM is represented by a :class:`~vllm.multimodal.MultiModalPlugin` and registered to :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`. -For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to :meth:`~vllm.multimodal.MultiModalRegistry.register_plugin`. - -The remainder of this document details how to define custom :class:`~vllm.multimodal.MultiModalPlugin` s. - -.. note:: - This article is a work in progress. - -.. - TODO: Add more instructions on how to add new plugins once embeddings is in. diff --git a/docs/source/design/multimodal/multimodal_index.rst b/docs/source/design/multimodal/multimodal_index.md similarity index 61% rename from docs/source/design/multimodal/multimodal_index.rst rename to docs/source/design/multimodal/multimodal_index.md index c6d47f90b62d5..88af07afc7018 100644 --- a/docs/source/design/multimodal/multimodal_index.rst +++ b/docs/source/design/multimodal/multimodal_index.md @@ -1,66 +1,83 @@ -.. _multi_modality: +(multi-modality)= -Multi-Modality -============== +# Multi-Modality +```{eval-rst} .. currentmodule:: vllm.multimodal - -vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package. +``` -Multi-modal inputs can be passed alongside text and token prompts to :ref:`supported models ` -via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptType`. +vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package. 
+ +Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models) +via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`. Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities -by following :ref:`this guide `. +by following [this guide](#adding-multimodal-plugin). -Looking to add your own multi-modal model? Please follow the instructions listed :ref:`here `. +Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs). -Guides -++++++ +## Guides -.. toctree:: - :maxdepth: 1 +```{toctree} +:maxdepth: 1 - adding_multimodal_plugin +adding_multimodal_plugin +``` -Module Contents -+++++++++++++++ +## Module Contents +```{eval-rst} .. automodule:: vllm.multimodal +``` -Registry --------- +### Registry +```{eval-rst} .. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY +``` +```{eval-rst} .. autoclass:: vllm.multimodal.MultiModalRegistry :members: :show-inheritance: +``` -Base Classes ------------- +### Base Classes +```{eval-rst} .. autodata:: vllm.multimodal.NestedTensors +``` +```{eval-rst} .. autodata:: vllm.multimodal.BatchedTensorInputs +``` +```{eval-rst} .. autoclass:: vllm.multimodal.MultiModalDataBuiltins :members: :show-inheritance: +``` +```{eval-rst} .. autodata:: vllm.multimodal.MultiModalDataDict +``` +```{eval-rst} .. autoclass:: vllm.multimodal.MultiModalKwargs :members: :show-inheritance: +``` +```{eval-rst} .. autoclass:: vllm.multimodal.MultiModalPlugin :members: :show-inheritance: +``` -Image Classes -------------- +### Image Classes +```{eval-rst} .. 
automodule:: vllm.multimodal.image :members: :show-inheritance: +``` diff --git a/docs/source/design/plugin_system.md b/docs/source/design/plugin_system.md new file mode 100644 index 0000000000000..79aff757518f2 --- /dev/null +++ b/docs/source/design/plugin_system.md @@ -0,0 +1,54 @@ +(plugin-system)= + +# vLLM's Plugin System + +The community frequently requests the ability to extend vLLM with custom features. To facilitate this, vLLM includes a plugin system that allows users to add custom features without modifying the vLLM codebase. This document explains how plugins work in vLLM and how to create a plugin for vLLM. + +## How Plugins Work in vLLM + +Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see [](#arch-overview)), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the [load_general_plugins](https://github.com/vllm-project/vllm/blob/c76ac49d266e27aa3fea84ef2df1f813d24c91c7/vllm/plugins/__init__.py#L16) function in the `vllm.plugins` module. This function is called for every process created by vLLM before it starts any work. + +## How vLLM Discovers Plugins + +vLLM's plugin system uses the standard Python `entry_points` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. 
An example of a plugin: + +```python +# inside `setup.py` file +from setuptools import setup + +setup(name='vllm_add_dummy_model', + version='0.1', + packages=['vllm_add_dummy_model'], + entry_points={ + 'vllm.general_plugins': + ["register_dummy_model = vllm_add_dummy_model:register"] + }) + +# inside `vllm_add_dummy_model.py` file +def register(): + from vllm import ModelRegistry + + if "MyLlava" not in ModelRegistry.get_supported_archs(): + ModelRegistry.register_model("MyLlava", + "vllm_add_dummy_model.my_llava:MyLlava") +``` + +For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html). + +Every plugin has three parts: + +1. **Plugin group**: The name of the entry point group. vLLM uses the entry point group `vllm.general_plugins` to register general plugins. This is the key of `entry_points` in the `setup.py` file. Always use `vllm.general_plugins` for vLLM's general plugins. +2. **Plugin name**: The name of the plugin. This is the part to the left of the `=` in each entry listed under the plugin group in `entry_points`. In the example above, the plugin name is `register_dummy_model`. Plugins can be filtered by their names using the `VLLM_PLUGINS` environment variable. To load only a specific plugin, set `VLLM_PLUGINS` to the plugin name. +3. **Plugin value**: The fully qualified name of the function to register in the plugin system. In the example above, the plugin value is `vllm_add_dummy_model:register`, which refers to a function named `register` in the `vllm_add_dummy_model` module. + +## What Can Plugins Do? + +Currently, the primary use case for plugins is to register custom, out-of-tree models into vLLM. This is done by calling `ModelRegistry.register_model` to register the model. In the future, the plugin system may be extended to support more features, such as swapping in custom implementations for certain classes in vLLM.
+ +## Guidelines for Writing Plugins + +- **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes. + +## Compatibility Guarantee + +vLLM guarantees the interface of documented plugins, such as `ModelRegistry.register_model`, will always be available for plugins to register models. However, it is the responsibility of plugin developers to ensure their plugins are compatible with the version of vLLM they are targeting. For example, `"vllm_add_dummy_model.my_llava:MyLlava"` should be compatible with the version of vLLM that the plugin targets. The interface for the model may change during vLLM's development. diff --git a/docs/source/design/plugin_system.rst b/docs/source/design/plugin_system.rst deleted file mode 100644 index 5a96cc8b3a464..0000000000000 --- a/docs/source/design/plugin_system.rst +++ /dev/null @@ -1,62 +0,0 @@ -.. _plugin_system: - -vLLM's Plugin System -==================== - -The community frequently requests the ability to extend vLLM with custom features. To facilitate this, vLLM includes a plugin system that allows users to add custom features without modifying the vLLM codebase. This document explains how plugins work in vLLM and how to create a plugin for vLLM. - -How Plugins Work in vLLM ------------------------- - -Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see :ref:`arch_overview`), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the `load_general_plugins `__ function in the ``vllm.plugins`` module. This function is called for every process created by vLLM before it starts any work. 
- -How vLLM Discovers Plugins --------------------------- - -vLLM's plugin system uses the standard Python ``entry_points`` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin: - -.. code-block:: python - - # inside `setup.py` file - from setuptools import setup - - setup(name='vllm_add_dummy_model', - version='0.1', - packages=['vllm_add_dummy_model'], - entry_points={ - 'vllm.general_plugins': - ["register_dummy_model = vllm_add_dummy_model:register"] - }) - - # inside `vllm_add_dummy_model.py` file - def register(): - from vllm import ModelRegistry - - if "MyLlava" not in ModelRegistry.get_supported_archs(): - ModelRegistry.register_model("MyLlava", - "vllm_add_dummy_model.my_llava:MyLlava") - -For more information on adding entry points to your package, please check the `official documentation `__. - -Every plugin has three parts: - -1. **Plugin group**: The name of the entry point group. vLLM uses the entry point group ``vllm.general_plugins`` to register general plugins. This is the key of ``entry_points`` in the ``setup.py`` file. Always use ``vllm.general_plugins`` for vLLM's general plugins. - -2. **Plugin name**: The name of the plugin. This is the value in the dictionary of the ``entry_points`` dictionary. In the example above, the plugin name is ``register_dummy_model``. Plugins can be filtered by their names using the ``VLLM_PLUGINS`` environment variable. To load only a specific plugin, set ``VLLM_PLUGINS`` to the plugin name. - -3. **Plugin value**: The fully qualified name of the function to register in the plugin system. In the example above, the plugin value is ``vllm_add_dummy_model:register``, which refers to a function named ``register`` in the ``vllm_add_dummy_model`` module. - -What Can Plugins Do? --------------------- - -Currently, the primary use case for plugins is to register custom, out-of-the-tree models into vLLM. 
This is done by calling ``ModelRegistry.register_model`` to register the model. In the future, the plugin system may be extended to support more features, such as swapping in custom implementations for certain classes in vLLM. - -Guidelines for Writing Plugins ------------------------------- - -- **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes. - -Compatibility Guarantee ------------------------ - -vLLM guarantees the interface of documented plugins, such as ``ModelRegistry.register_model``, will always be available for plugins to register models. However, it is the responsibility of plugin developers to ensure their plugins are compatible with the version of vLLM they are targeting. For example, ``"vllm_add_dummy_model.my_llava:MyLlava"`` should be compatible with the version of vLLM that the plugin targets. The interface for the model may change during vLLM's development. diff --git a/docs/source/dev/engine/async_llm_engine.rst b/docs/source/dev/engine/async_llm_engine.md similarity index 59% rename from docs/source/dev/engine/async_llm_engine.rst rename to docs/source/dev/engine/async_llm_engine.md index 93fc310cb543b..904feaa505164 100644 --- a/docs/source/dev/engine/async_llm_engine.rst +++ b/docs/source/dev/engine/async_llm_engine.md @@ -1,6 +1,7 @@ -AsyncLLMEngine -================================= +# AsyncLLMEngine +```{eval-rst} .. autoclass:: vllm.AsyncLLMEngine :members: :show-inheritance: +``` diff --git a/docs/source/dev/engine/engine_index.md b/docs/source/dev/engine/engine_index.md new file mode 100644 index 0000000000000..701cb95d3be33 --- /dev/null +++ b/docs/source/dev/engine/engine_index.md @@ -0,0 +1,17 @@ +# vLLM Engine + +```{eval-rst} +.. automodule:: vllm.engine +``` + +```{eval-rst} +.. 
currentmodule:: vllm.engine +``` + +```{toctree} +:caption: Engines +:maxdepth: 2 + +llm_engine +async_llm_engine +``` diff --git a/docs/source/dev/engine/engine_index.rst b/docs/source/dev/engine/engine_index.rst deleted file mode 100644 index ba9ae55ddea46..0000000000000 --- a/docs/source/dev/engine/engine_index.rst +++ /dev/null @@ -1,13 +0,0 @@ -vLLM Engine -================================= - -.. automodule:: vllm.engine -.. currentmodule:: vllm.engine - -.. toctree:: - :maxdepth: 2 - :caption: Engines - - llm_engine - async_llm_engine - diff --git a/docs/source/dev/engine/llm_engine.rst b/docs/source/dev/engine/llm_engine.md similarity index 60% rename from docs/source/dev/engine/llm_engine.rst rename to docs/source/dev/engine/llm_engine.md index 0b8c1e219d7c9..d6613ef5562dc 100644 --- a/docs/source/dev/engine/llm_engine.rst +++ b/docs/source/dev/engine/llm_engine.md @@ -1,6 +1,7 @@ -LLMEngine -================================= +# LLMEngine +```{eval-rst} .. autoclass:: vllm.LLMEngine :members: :show-inheritance: +``` diff --git a/docs/source/dev/offline_inference/llm.rst b/docs/source/dev/offline_inference/llm.md similarity index 67% rename from docs/source/dev/offline_inference/llm.rst rename to docs/source/dev/offline_inference/llm.md index 83ba1b6987c6d..9f129d5e41686 100644 --- a/docs/source/dev/offline_inference/llm.rst +++ b/docs/source/dev/offline_inference/llm.md @@ -1,6 +1,7 @@ -LLM Class -========= +# LLM Class +```{eval-rst} .. 
autoclass:: vllm.LLM :members: :show-inheritance: +``` diff --git a/docs/source/dev/offline_inference/llm_inputs.rst b/docs/source/dev/offline_inference/llm_inputs.md similarity index 78% rename from docs/source/dev/offline_inference/llm_inputs.rst rename to docs/source/dev/offline_inference/llm_inputs.md index 0d47281db485e..21f688a12c536 100644 --- a/docs/source/dev/offline_inference/llm_inputs.rst +++ b/docs/source/dev/offline_inference/llm_inputs.md @@ -1,14 +1,19 @@ -LLM Inputs -========== +# LLM Inputs +```{eval-rst} .. autodata:: vllm.inputs.PromptType +``` +```{eval-rst} .. autoclass:: vllm.inputs.TextPrompt :show-inheritance: :members: :member-order: bysource +``` +```{eval-rst} .. autoclass:: vllm.inputs.TokensPrompt :show-inheritance: :members: :member-order: bysource +``` diff --git a/docs/source/dev/offline_inference/offline_index.md b/docs/source/dev/offline_inference/offline_index.md new file mode 100644 index 0000000000000..318a02d8c78df --- /dev/null +++ b/docs/source/dev/offline_inference/offline_index.md @@ -0,0 +1,8 @@ +# Offline Inference + +```{toctree} +:maxdepth: 1 + +llm +llm_inputs +``` diff --git a/docs/source/dev/offline_inference/offline_index.rst b/docs/source/dev/offline_inference/offline_index.rst deleted file mode 100644 index 27dfb0e9df90e..0000000000000 --- a/docs/source/dev/offline_inference/offline_index.rst +++ /dev/null @@ -1,8 +0,0 @@ -Offline Inference -================================= - -.. toctree:: - :maxdepth: 1 - - llm - llm_inputs diff --git a/docs/source/dev/pooling_params.rst b/docs/source/dev/pooling_params.md similarity index 55% rename from docs/source/dev/pooling_params.rst rename to docs/source/dev/pooling_params.md index 334e0287aff09..74b2c57443e4b 100644 --- a/docs/source/dev/pooling_params.rst +++ b/docs/source/dev/pooling_params.md @@ -1,5 +1,6 @@ -Pooling Parameters -================== +# Pooling Parameters +```{eval-rst} .. 
autoclass:: vllm.PoolingParams :members: +``` diff --git a/docs/source/dev/sampling_params.rst b/docs/source/dev/sampling_params.md similarity index 55% rename from docs/source/dev/sampling_params.rst rename to docs/source/dev/sampling_params.md index f645941a6c022..bdc36af5153db 100644 --- a/docs/source/dev/sampling_params.rst +++ b/docs/source/dev/sampling_params.md @@ -1,5 +1,6 @@ -Sampling Parameters -=================== +# Sampling Parameters +```{eval-rst} .. autoclass:: vllm.SamplingParams :members: +``` diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py index 79b49a186236a..4c5a9d9c1da38 100644 --- a/docs/source/generate_examples.py +++ b/docs/source/generate_examples.py @@ -15,18 +15,12 @@ def fix_case(text: str) -> str: return text -def underline(title: str, character: str = "=") -> str: - return f"{title}\n{character * len(title)}" - - def generate_title(filename: str) -> str: # Turn filename into a title title = filename.replace("_", " ").title() # Handle acronyms and names title = fix_case(title) - # Underline title - title = underline(title) - return title + return f"# {title}" def generate_examples(): @@ -38,7 +32,7 @@ def generate_examples(): # Destination paths doc_dir = root_dir / "docs/source/getting_started/examples" - doc_paths = [doc_dir / f"{path.stem}.rst" for path in script_paths] + doc_paths = [doc_dir / f"{path.stem}.md" for path in script_paths] # Generate the example docs for each example script for script_path, doc_path in zip(script_paths, doc_paths): @@ -46,16 +40,16 @@ def generate_examples(): # Make script_path relative to doc_path and call it include_path include_path = '../../../..' / script_path.relative_to(root_dir) content = (f"{generate_title(doc_path.stem)}\n\n" - f"Source {script_url}.\n\n" - f".. 
literalinclude:: {include_path}\n" - " :language: python\n" - " :linenos:\n") + f"Source: <{script_url}>.\n\n" + f"```{{literalinclude}} {include_path}\n" + ":language: python\n" + ":linenos:\n```") with open(doc_path, "w+") as f: f.write(content) # Generate the toctree for the example scripts - with open(doc_dir / "examples_index.template.rst") as f: + with open(doc_dir / "examples_index.template.md") as f: examples_index = f.read() - with open(doc_dir / "examples_index.rst", "w+") as f: - example_docs = "\n ".join(path.stem for path in script_paths) + with open(doc_dir / "examples_index.md", "w+") as f: + example_docs = "\n".join(path.stem + ".md" for path in script_paths) f.write(examples_index.replace(r"%EXAMPLE_DOCS%", example_docs)) diff --git a/docs/source/getting_started/amd-installation.md b/docs/source/getting_started/amd-installation.md new file mode 100644 index 0000000000000..b9ccbd7d6c7fc --- /dev/null +++ b/docs/source/getting_started/amd-installation.md @@ -0,0 +1,163 @@ +(installation-rocm)= + +# Installation with ROCm + +vLLM supports AMD GPUs with ROCm 6.2. + +## Requirements + +- OS: Linux +- Python: 3.9 -- 3.12 +- GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100) +- ROCm 6.2 + +Installation options: + +1. [Build from source with docker](#build-from-source-docker-rocm) +2. [Build from source](#build-from-source-rocm) + +(build-from-source-docker-rocm)= + +## Option 1: Build from source with docker (recommended) + +You can build and install vLLM from source. + +First, build a docker image from [Dockerfile.rocm](https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm) and launch a docker container from the image. +It is important that the user kicks off the docker build using buildkit. 
Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon: + +```console +{ + "features": { + "buildkit": true + } +} +``` + +[Dockerfile.rocm](https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm) uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches. +It provides flexibility to customize the build of docker image using the following arguments: + +- `BASE_IMAGE`: specifies the base image used when running `docker build`, specifically the PyTorch on ROCm base image. +- `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For [Radeon RX 7900 series (gfx1100)](https://rocm.docs.amd.com/projects/radeon/en/latest/index.html), this should be set to 0 before flash-attention supports this target. +- `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942` +- `FA_BRANCH`: specifies the branch used to build the CK flash-attention in [ROCm's flash-attention repo](https://github.com/ROCmSoftwarePlatform/flash-attention). The default is `ae7928c` +- `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1. + +Their values can be passed in when running `docker build` with `--build-arg` options. + +To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default: + +```console +$ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm . +``` + +To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify `BUILD_FA` as below: + +```console +$ DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm . 
+``` + +To run the above docker image `vllm-rocm`, use the below command: + +```console +$ docker run -it \ + --network=host \ + --group-add=video \ + --ipc=host \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --device /dev/kfd \ + --device /dev/dri \ + -v <path/to/model>:/app/model \ + vllm-rocm \ + bash +``` + +Where the `<path/to/model>` is the location where the model is stored, for example, the weights for llama2 or llama3 models. + +(build-from-source-rocm)= + +## Option 2: Build from source + +0. Install prerequisites (skip if you are already in an environment/docker with the following installed): + +- [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html) +- [PyTorch](https://pytorch.org/) + +For installing PyTorch, you can start from a fresh docker image, e.g., `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`. + +Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/) + +1. Install [Triton flash attention for ROCm](https://github.com/ROCm/triton) + +Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md) + +```console +$ python3 -m pip install ninja cmake wheel pybind11 +$ pip uninstall -y triton +$ git clone https://github.com/OpenAI/triton.git +$ cd triton +$ git checkout e192dba +$ cd python +$ pip3 install . +$ cd ../.. +``` + +```{note} +- If you see an HTTP issue related to downloading packages while building triton, please try again as the HTTP error is intermittent. +``` + +2. 
Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention/tree/ck_tile) + +Install ROCm's flash attention (v2.5.9.post1) following the instructions from [ROCm/flash-attention](https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support) +Alternatively, wheels intended for vLLM use can be accessed under the releases. + +For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`. + +```console +$ git clone https://github.com/ROCm/flash-attention.git +$ cd flash-attention +$ git checkout 3cea2fb +$ git submodule update --init +$ GPU_ARCHS="gfx90a" python3 setup.py install +$ cd .. +``` + +```{note} +- You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) +``` + +3. Build vLLM. For example, vLLM on ROCM 6.2 can be built with the following steps: + +```bash +$ pip install --upgrade pip + +# Install PyTorch +$ pip uninstall torch -y +$ pip install --no-cache-dir --pre torch==2.6.0.dev20241024 --index-url https://download.pytorch.org/whl/nightly/rocm6.2 + +# Build & install AMD SMI +$ pip install /opt/rocm/share/amd_smi + +# Install dependencies +$ pip install --upgrade numba scipy huggingface-hub[cli] +$ pip install "numpy<2" +$ pip install -r requirements-rocm.txt + +# Build vLLM for MI210/MI250/MI300. +$ export PYTORCH_ROCM_ARCH="gfx90a;gfx942" +$ python3 setup.py develop +``` + +This may take 5-10 minutes. Currently, {code}`pip install .` does not work for ROCm installation. + +```{tip} +- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. +- Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. 
+- To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention. +- The ROCm version of PyTorch, ideally, should match the ROCm driver version. +``` + +```{tip} +- For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level. + For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization). +``` diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst deleted file mode 100644 index 27636d936270c..0000000000000 --- a/docs/source/getting_started/amd-installation.rst +++ /dev/null @@ -1,178 +0,0 @@ -.. _installation_rocm: - -Installation with ROCm -====================== - -vLLM supports AMD GPUs with ROCm 6.2. - -Requirements ------------- - -* OS: Linux -* Python: 3.9 -- 3.12 -* GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100) -* ROCm 6.2 - -Installation options: - -#. :ref:`Build from source with docker ` -#. :ref:`Build from source ` - -.. _build_from_source_docker_rocm: - -Option 1: Build from source with docker (recommended) ------------------------------------------------------ - -You can build and install vLLM from source. - -First, build a docker image from `Dockerfile.rocm `_ and launch a docker container from the image. -It is important that the user kicks off the docker build using buildkit. Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon: - -.. 
code-block:: console - - { - "features": { - "buildkit": true - } - } - - -`Dockerfile.rocm `_ uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches. -It provides flexibility to customize the build of docker image using the following arguments: - -* `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image. -* `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For `Radeon RX 7900 series (gfx1100) `_, this should be set to 0 before flash-attention supports this target. -* `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942` -* `FA_BRANCH`: specifies the branch used to build the CK flash-attention in `ROCm's flash-attention repo `_. The default is `ae7928c` -* `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1. - -Their values can be passed in when running ``docker build`` with ``--build-arg`` options. - - -To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default: - -.. code-block:: console - - $ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm . - -To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify ``BUILD_FA`` as below: - -.. code-block:: console - - $ DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm . - -To run the above docker image ``vllm-rocm``, use the below command: - -.. code-block:: console - - $ docker run -it \ - --network=host \ - --group-add=video \ - --ipc=host \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --device /dev/kfd \ - --device /dev/dri \ - -v :/app/model \ - vllm-rocm \ - bash - -Where the `` is the location where the model is stored, for example, the weights for llama2 or llama3 models. - - -.. 
_build_from_source_rocm: - -Option 2: Build from source ---------------------------- - -0. Install prerequisites (skip if you are already in an environment/docker with the following installed): - -- `ROCm `_ -- `PyTorch `_ - -For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`. - -Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch `Getting Started `_ - - -1. Install `Triton flash attention for ROCm `_ - -Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from `ROCm/triton `_ - - .. code-block:: console - - $ python3 -m pip install ninja cmake wheel pybind11 - $ pip uninstall -y triton - $ git clone https://github.com/OpenAI/triton.git - $ cd triton - $ git checkout e192dba - $ cd python - $ pip3 install . - $ cd ../.. - -.. note:: - - If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. - - -2. Optionally, if you choose to use CK flash attention, you can install `flash attention for ROCm `_ - - -Install ROCm's flash attention (v2.5.9.post1) following the instructions from `ROCm/flash-attention `_ -Alternatively, wheels intended for vLLM use can be accessed under the releases. - -For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`. -Note to get your gfx architecture, run `rocminfo |grep gfx`. - - .. code-block:: console - - $ git clone https://github.com/ROCm/flash-attention.git - $ cd flash-attention - $ git checkout 3cea2fb - $ git submodule update --init - $ GPU_ARCHS="gfx90a" python3 setup.py install - $ cd .. - -.. note:: - - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) - -3. Build vLLM. - - For example, vLLM on ROCM 6.2 can be built with the following steps: - - .. 
code-block:: console - - $ pip install --upgrade pip - - $ # Install PyTorch - $ pip uninstall torch -y - $ pip install --no-cache-dir --pre torch==2.6.0.dev20241024 --index-url https://download.pytorch.org/whl/nightly/rocm6.2 - - $ # Build & install AMD SMI - $ pip install /opt/rocm/share/amd_smi - - $ # Install dependencies - $ pip install --upgrade numba scipy huggingface-hub[cli] - $ pip install "numpy<2" - $ pip install -r requirements-rocm.txt - - $ # Build vLLM for MI210/MI250/MI300. - $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942" - $ python3 setup.py develop - - - This may take 5-10 minutes. Currently, :code:`pip install .` does not work for ROCm installation. - - -.. tip:: - - - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. - - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. - - To use CK flash-attention or PyTorch naive attention, please use this flag ``export VLLM_USE_TRITON_FLASH_ATTN=0`` to turn off triton flash attention. - - The ROCm version of PyTorch, ideally, should match the ROCm driver version. - - -.. tip:: - - For MI300x (gfx942) users, to achieve optimal performance, please refer to `MI300x tuning guide `_ for performance optimization and tuning tips on system and workflow level. - For vLLM, please refer to `vLLM performance optimization `_. - - diff --git a/docs/source/getting_started/arm-installation.md b/docs/source/getting_started/arm-installation.md new file mode 100644 index 0000000000000..de807e198b4f6 --- /dev/null +++ b/docs/source/getting_started/arm-installation.md @@ -0,0 +1,46 @@ +(installation-arm)= + +# Installation for ARM CPUs + +vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. 
For additional details on supported features, refer to the x86 platform documentation covering: + +- CPU backend inference capabilities +- Relevant runtime environment variables +- Performance optimization tips + +ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes. +Contents: + +1. [Requirements](#arm-backend-requirements) +2. [Quick Start with Dockerfile](#arm-backend-quick-start-dockerfile) +3. [Building from Source](#build-arm-backend-from-source) + +(arm-backend-requirements)= + +## Requirements + +- **Operating System**: Linux or macOS +- **Compiler**: gcc/g++ >= 12.3.0 (optional, but recommended) +- **Instruction Set Architecture (ISA)**: NEON support is required + +(arm-backend-quick-start-dockerfile)= + +## Quick Start with Dockerfile + +You can quickly set up vLLM on ARM using Docker: + +```console +$ docker build -f Dockerfile.arm -t vllm-cpu-env --shm-size=4g . +$ docker run -it \ + --rm \ + --network=host \ + --cpuset-cpus= \ + --cpuset-mems= \ + vllm-cpu-env +``` + +(build-arm-backend-from-source)= + +## Building from Source + +To build vLLM from source on Ubuntu 22.04 or other Linux distributions, follow a similar process as with x86. Testing has been conducted on AWS Graviton3 instances for compatibility. diff --git a/docs/source/getting_started/arm-installation.rst b/docs/source/getting_started/arm-installation.rst deleted file mode 100644 index 7b457df92c11d..0000000000000 --- a/docs/source/getting_started/arm-installation.rst +++ /dev/null @@ -1,50 +0,0 @@ -.. _installation_arm: - -Installation for ARM CPUs -========================= - -vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. 
For additional details on supported features, refer to the x86 platform documentation covering: - -* CPU backend inference capabilities -* Relevant runtime environment variables -* Performance optimization tips - -ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes. -Contents: - -1. :ref:`Requirements ` -2. :ref:`Quick Start with Dockerfile ` -3. :ref:`Building from Source ` - -.. _arm_backend_requirements: - -Requirements ------------- - -* **Operating System**: Linux or macOS -* **Compiler**: gcc/g++ >= 12.3.0 (optional, but recommended) -* **Instruction Set Architecture (ISA)**: NEON support is required - -.. _arm_backend_quick_start_dockerfile: - -Quick Start with Dockerfile ---------------------------- - -You can quickly set up vLLM on ARM using Docker: - -.. code-block:: console - - $ docker build -f Dockerfile.arm -t vllm-cpu-env --shm-size=4g . - $ docker run -it \ - --rm \ - --network=host \ - --cpuset-cpus= \ - --cpuset-mems= \ - vllm-cpu-env - -.. _build_arm_backend_from_source: - -Building from Source --------------------- - -To build vLLM from source on Ubuntu 22.04 or other Linux distributions, follow a similar process as with x86. Testing has been conducted on AWS Graviton3 instances for compatibility. diff --git a/docs/source/getting_started/cpu-installation.md b/docs/source/getting_started/cpu-installation.md new file mode 100644 index 0000000000000..4ab5437f091d5 --- /dev/null +++ b/docs/source/getting_started/cpu-installation.md @@ -0,0 +1,154 @@ +(installation-cpu)= + +# Installation with CPU + +vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features: + +- Tensor Parallel +- Model Quantization (`INT8 W8A8, AWQ`) +- Chunked-prefill +- Prefix-caching +- FP8-E5M2 KV-Caching (TODO) + +Table of contents: + +1. [Requirements](#cpu-backend-requirements) +2. 
[Quick start using Dockerfile](#cpu-backend-quick-start-dockerfile) +3. [Build from source](#build-cpu-backend-from-source) +4. [Related runtime environment variables](#env-intro) +5. [Intel Extension for PyTorch](#ipex-guidance) +6. [Performance tips](#cpu-backend-performance-tips) + +(cpu-backend-requirements)= + +## Requirements + +- OS: Linux +- Compiler: gcc/g++>=12.3.0 (optional, recommended) +- Instruction set architecture (ISA) requirement: AVX512 (optional, recommended) + +(cpu-backend-quick-start-dockerfile)= + +## Quick start using Dockerfile + +```console +$ docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g . +$ docker run -it \ + --rm \ + --network=host \ + --cpuset-cpus=<cpu-id-list, optional> \ + --cpuset-mems=<memory-node, optional> \ + vllm-cpu-env +``` + +(build-cpu-backend-from-source)= + +## Build from source + +- First, install the recommended compiler. We recommend using `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.04, you can run: + +```console +$ sudo apt-get update -y +$ sudo apt-get install -y gcc-12 g++-12 libnuma-dev +$ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 +``` + +- Second, install Python packages for vLLM CPU backend building: + +```console +$ pip install --upgrade pip +$ pip install "cmake>=3.26" wheel packaging ninja "setuptools-scm>=8" numpy +$ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu +``` + +- Finally, build and install vLLM CPU backend: + +```console +$ VLLM_TARGET_DEVICE=cpu python setup.py install +``` + +```{note} +- AVX512_BF16 is an extension ISA that provides native BF16 data type conversion and vector product instructions, which brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. 
+- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building. +``` + +(env-intro)= + +## Related runtime environment variables + +- `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. +- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. + +(ipex-guidance)= + +## Intel Extension for PyTorch + +- [Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware. + +(cpu-backend-performance-tips)= + +## Performance tips + +- We highly recommend to use TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.4, you can run: + +```console +$ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library +$ find / -name *libtcmalloc* # find the dynamic link library path +$ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD +$ python examples/offline_inference.py # run vLLM +``` + +- When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. 
For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP: + +```console +$ export VLLM_CPU_KVCACHE_SPACE=40 +$ export VLLM_CPU_OMP_THREADS_BIND=0-29 +$ vllm serve facebook/opt-125m +``` + +- If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND`. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores: + +```console +$ lscpu -e # check the mapping between logical CPU cores and physical CPU cores + +# The "CPU" column means the logical CPU core IDs, and the "CORE" column means the physical core IDs. On this platform, two logical cores are sharing one physical core. +CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ +0 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000 +1 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000 +2 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000 +3 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000 +4 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000 +5 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000 +6 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000 +7 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000 +8 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000 +9 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000 +10 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000 +11 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000 +12 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000 +13 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000 +14 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000 +15 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000 + +# On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15 +$ export VLLM_CPU_OMP_THREADS_BIND=0-7 +$ python examples/offline_inference.py +``` + +- If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using `VLLM_CPU_OMP_THREADS_BIND` to avoid cross NUMA node memory access. 
+ +## CPU Backend Considerations + +- The CPU backend significantly differs from the GPU backend since the vLLM architecture was originally optimized for GPU use. A number of optimizations are needed to enhance its performance. + +- Decouple the HTTP serving components from the inference components. In a GPU backend configuration, the HTTP serving and tokenization tasks operate on the CPU, while inference runs on the GPU, which typically does not pose a problem. However, in a CPU-based setup, the HTTP serving and tokenization can cause significant context switching and reduced cache efficiency. Therefore, it is strongly recommended to segregate these two components for improved performance. + +- On a CPU-based setup with NUMA enabled, the memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.md#non-uniform-memory-access-numa). For NUMA architecture, two optimizations are recommended: Tensor Parallel or Data Parallel. + + - Using Tensor Parallel for a latency-constrained deployment: following GPU backend design, Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With [TP feature on CPU](https://github.com/vllm-project/vllm/pull/6125) merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving: + + ```console + $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp + ``` + + - Using Data Parallel for maximum throughput: launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. 
Common solutions like [Nginx](../serving/deploying_with_nginx) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md). diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst deleted file mode 100644 index 649de1cd9b53c..0000000000000 --- a/docs/source/getting_started/cpu-installation.rst +++ /dev/null @@ -1,164 +0,0 @@ -.. _installation_cpu: - -Installation with CPU -======================== - -vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features: - -- Tensor Parallel -- Model Quantization (``INT8 W8A8, AWQ``) -- Chunked-prefill -- Prefix-caching -- FP8-E5M2 KV-Caching (TODO) - -Table of contents: - -#. :ref:`Requirements ` -#. :ref:`Quick start using Dockerfile ` -#. :ref:`Build from source ` -#. :ref:`Related runtime environment variables ` -#. :ref:`Intel Extension for PyTorch ` -#. :ref:`Performance tips ` - -.. _cpu_backend_requirements: - -Requirements ------------- - -* OS: Linux -* Compiler: gcc/g++>=12.3.0 (optional, recommended) -* Instruction set architecture (ISA) requirement: AVX512 (optional, recommended) - -.. _cpu_backend_quick_start_dockerfile: - -Quick start using Dockerfile ----------------------------- - -.. code-block:: console - - $ docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g . - $ docker run -it \ - --rm \ - --network=host \ - --cpuset-cpus= \ - --cpuset-mems= \ - vllm-cpu-env - -.. _build_cpu_backend_from_source: - -Build from source ------------------ - -- First, install recommended compiler. We recommend to use ``gcc/g++ >= 12.3.0`` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run: - -.. 
code-block:: console - - $ sudo apt-get update -y - $ sudo apt-get install -y gcc-12 g++-12 libnuma-dev - $ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 - -- Second, install Python packages for vLLM CPU backend building: - -.. code-block:: console - - $ pip install --upgrade pip - $ pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy - $ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu - -- Finally, build and install vLLM CPU backend: - -.. code-block:: console - - $ VLLM_TARGET_DEVICE=cpu python setup.py install - -.. note:: - - AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. - - - If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building. - -.. _env_intro: - -Related runtime environment variables -------------------------------------- - -- ``VLLM_CPU_KVCACHE_SPACE``: specify the KV Cache size (e.g, ``VLLM_CPU_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. - -- ``VLLM_CPU_OMP_THREADS_BIND``: specify the CPU cores dedicated to the OpenMP threads. For example, ``VLLM_CPU_OMP_THREADS_BIND=0-31`` means there will be 32 OpenMP threads bound on 0-31 CPU cores. ``VLLM_CPU_OMP_THREADS_BIND=0-31|32-63`` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. - -.. 
_ipex_guidance: - -Intel Extension for PyTorch ---------------------------- - -- `Intel Extension for PyTorch (IPEX) `_ extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware. - -.. _cpu_backend_performance_tips: - -Performance tips ------------------ - -- We highly recommend to use TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.4, you can run: - -.. code-block:: console - - $ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library - $ find / -name *libtcmalloc* # find the dynamic link library path - $ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD - $ python examples/offline_inference.py # run vLLM - -- When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP: - -.. code-block:: console - - $ export VLLM_CPU_KVCACHE_SPACE=40 - $ export VLLM_CPU_OMP_THREADS_BIND=0-29 - $ vllm serve facebook/opt-125m - -- If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using ``VLLM_CPU_OMP_THREADS_BIND``. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores: - -.. code-block:: console - - $ lscpu -e # check the mapping between logical CPU cores and physical CPU cores - - # The "CPU" column means the logical CPU core IDs, and the "CORE" column means the physical core IDs. On this platform, two logical cores are sharing one physical core. 
- CPU NODE SOCKET CORE L1d:L1i:L2:L3 ONLINE MAXMHZ MINMHZ MHZ - 0 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000 - 1 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000 - 2 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000 - 3 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000 - 4 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000 - 5 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000 - 6 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000 - 7 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000 - 8 0 0 0 0:0:0:0 yes 2401.0000 800.0000 800.000 - 9 0 0 1 1:1:1:0 yes 2401.0000 800.0000 800.000 - 10 0 0 2 2:2:2:0 yes 2401.0000 800.0000 800.000 - 11 0 0 3 3:3:3:0 yes 2401.0000 800.0000 800.000 - 12 0 0 4 4:4:4:0 yes 2401.0000 800.0000 800.000 - 13 0 0 5 5:5:5:0 yes 2401.0000 800.0000 800.000 - 14 0 0 6 6:6:6:0 yes 2401.0000 800.0000 800.000 - 15 0 0 7 7:7:7:0 yes 2401.0000 800.0000 800.000 - - # On this platform, it is recommend to only bind openMP threads on logical CPU cores 0-7 or 8-15 - $ export VLLM_CPU_OMP_THREADS_BIND=0-7 - $ python examples/offline_inference.py - -- If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using ``VLLM_CPU_OMP_THREADS_BIND`` to avoid cross NUMA node memory access. - -CPU Backend Considerations --------------------------- - -- The CPU backend significantly differs from the GPU backend since the vLLM architecture was originally optimized for GPU use. A number of optimizations are needed to enhance its performance. - -- Decouple the HTTP serving components from the inference components. In a GPU backend configuration, the HTTP serving and tokenization tasks operate on the CPU, while inference runs on the GPU, which typically does not pose a problem. However, in a CPU-based setup, the HTTP serving and tokenization can cause significant context switching and reduced cache efficiency. Therefore, it is strongly recommended to segregate these two components for improved performance. 
- -- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the `topology `_. For NUMA architecture, two optimizations are to recommended: Tensor Parallel or Data Parallel. - - * Using Tensor Parallel for a latency constraints deployment: following GPU backend design, a Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With `TP feature on CPU `_ merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving: - - .. code-block:: console - - $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp - - - * Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like `Nginx <../serving/deploying_with_nginx.html>`_ or HAProxy are recommended. Anyscale Ray project provides the feature on LLM `serving `_. Here is the example to setup a scalable LLM serving with `Ray Serve `_. \ No newline at end of file diff --git a/docs/source/getting_started/debugging.md b/docs/source/getting_started/debugging.md new file mode 100644 index 0000000000000..2f11c95ce0e77 --- /dev/null +++ b/docs/source/getting_started/debugging.md @@ -0,0 +1,199 @@ +(debugging)= + +# Debugging Tips + +This document outlines some debugging strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. 
+ +```{note} +Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated. +``` + +## Hangs downloading a model + +If the model isn't already downloaded to disk, vLLM will download it from the internet, which can take time depending on your internet connection. +It's recommended to download the model first using the [huggingface-cli](https://huggingface.co/docs/huggingface_hub/en/guides/cli) and then pass the local path to the model to vLLM. This way, you can isolate the issue. + +## Hangs loading a model from disk + +If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow. +It'd be better to store the model on a local disk. Additionally, have a look at the CPU memory usage: when the model is too large, it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory. + +```{note} +To isolate the model downloading and loading issue, you can use the `--load-format dummy` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck. +``` + +## Model is too large + +If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](https://docs.vllm.ai/en/latest/serving/distributed_serving.html#distributed-inference-and-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism).
You can convert the model checkpoint to a sharded checkpoint using [this example](https://docs.vllm.ai/en/latest/getting_started/examples/save_sharded_state.html). The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. + +## Enable more logging + +If other strategies don't solve the problem, it's likely that the vLLM instance is stuck somewhere. You can use the following environment variables to help debug the issue: + +- `export VLLM_LOGGING_LEVEL=DEBUG` to turn on more logging. +- `export CUDA_LAUNCH_BLOCKING=1` to identify which CUDA kernel is causing the problem. +- `export NCCL_DEBUG=TRACE` to turn on more logging for NCCL. +- `export VLLM_TRACE_FUNCTION=1` to record all function calls for inspection in the log files to tell which function crashes or hangs. + +## Incorrect network setup + +The vLLM instance may not be able to determine the correct IP address if you have a complicated network config. Look for a log line such as `DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl` and check that the IP address it shows is the correct one. +If it's not, override the IP address using the environment variable `export VLLM_HOST_IP=<your_ip_address>`. + +You might also need to set `export NCCL_SOCKET_IFNAME=<your_network_interface>` and `export GLOO_SOCKET_IFNAME=<your_network_interface>` to specify the network interface for the IP address. + +## Error near `self.graph.replay()` + +If vLLM crashes and the error trace captures it somewhere around `self.graph.replay()` in `vllm/worker/model_runner.py`, it is a CUDA error inside CUDAGraph. +To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the {class}`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error.
+ +## Incorrect hardware/driver + +If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly. + +```python +# Test PyTorch NCCL +import torch +import torch.distributed as dist +dist.init_process_group(backend="nccl") +local_rank = dist.get_rank() % torch.cuda.device_count() +torch.cuda.set_device(local_rank) +data = torch.FloatTensor([1,] * 128).to("cuda") +dist.all_reduce(data, op=dist.ReduceOp.SUM) +torch.cuda.synchronize() +value = data.mean().item() +world_size = dist.get_world_size() +assert value == world_size, f"Expected {world_size}, got {value}" + +print("PyTorch NCCL is successful!") + +# Test PyTorch GLOO +gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo") +cpu_data = torch.FloatTensor([1,] * 128) +dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group) +value = cpu_data.mean().item() +assert value == world_size, f"Expected {world_size}, got {value}" + +print("PyTorch GLOO is successful!") + +if world_size <= 1: + exit() + +# Test vLLM NCCL, with cuda graph +from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator + +pynccl = PyNcclCommunicator(group=gloo_group, device=local_rank) +# pynccl is enabled by default for 0.6.5+, +# but for 0.6.4 and below, we need to enable it manually. +# keep the code for backward compatibility because people +# prefer to read the latest documentation.
+pynccl.disabled = False + +s = torch.cuda.Stream() +with torch.cuda.stream(s): + data.fill_(1) + pynccl.all_reduce(data, stream=s) + value = data.mean().item() + assert value == world_size, f"Expected {world_size}, got {value}" + +print("vLLM NCCL is successful!") + +g = torch.cuda.CUDAGraph() +with torch.cuda.graph(cuda_graph=g, stream=s): + pynccl.all_reduce(data, stream=torch.cuda.current_stream()) + +data.fill_(1) +g.replay() +torch.cuda.current_stream().synchronize() +value = data.mean().item() +assert value == world_size, f"Expected {world_size}, got {value}" + +print("vLLM NCCL with cuda graph is successful!") + +dist.destroy_process_group(gloo_group) +dist.destroy_process_group() +``` + +If you are testing with a single node, adjust `--nproc-per-node` to the number of GPUs you want to use: + +```console +$ NCCL_DEBUG=TRACE torchrun --nproc-per-node=<number-of-GPUs> test.py +``` + +If you are testing with multiple nodes, adjust `--nproc-per-node` and `--nnodes` according to your setup and set `MASTER_ADDR` to the correct IP address of the master node, reachable from all nodes. Then, run: + +```console +$ NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py +``` + +If the script runs successfully, you should see a success message for each stage (for example, `PyTorch NCCL is successful!`), ending with `vLLM NCCL with cuda graph is successful!` when more than one process is used. + +If the test script hangs or crashes, usually it means the hardware/drivers are broken in some sense. You should try to contact your system administrator or hardware vendor for further assistance. As a common workaround, you can try to tune some NCCL environment variables, such as `export NCCL_P2P_DISABLE=1` to see if it helps. Please check [their documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html) for more information. Please only use these environment variables as a temporary workaround, as they might affect the performance of the system.
The best solution is still to fix the hardware/drivers so that the test script can run successfully. + +```{note} +A multi-node environment is more complicated than a single-node one. If you see errors such as `torch.distributed.DistNetworkError`, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments: + +- In the first node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py`. +- In the second node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py`. + +Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes. +``` + +## Python multiprocessing + +### `RuntimeError` Exception + +If you have seen a warning in your logs like this: + +```console +WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously + initialized. We must use the `spawn` multiprocessing start method. Setting + VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See + https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing + for more information. +``` + +or an error from Python that looks like this: + +```console +RuntimeError: + An attempt has been made to start a new process before the + current process has finished its bootstrapping phase. + + This probably means that you are not using fork to start your + child processes and you have forgotten to use the proper idiom + in the main module: + + if __name__ == '__main__': + freeze_support() + ... + + The "freeze_support()" line can be omitted if the program + is not going to be frozen to produce an executable. 
+ + To fix this issue, refer to the "Safe importing of main module" + section in https://docs.python.org/3/library/multiprocessing.html +``` + +then you must update your Python code to guard usage of `vllm` behind an `if +__name__ == '__main__':` block. For example, instead of this: + +```python +import vllm + +llm = vllm.LLM(...) +``` + +try this instead: + +```python +if __name__ == '__main__': + import vllm + + llm = vllm.LLM(...) +``` + +## Known Issues + +- In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000), which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](https://github.com/vllm-project/vllm/pull/6759). +- To circumvent an NCCL [bug](https://github.com/NVIDIA/nccl/issues/1234), all vLLM processes will set an environment variable `NCCL_CUMEM_ENABLE=0` to disable NCCL's `cuMem` allocator. It does not affect performance but only gives memory benefits. When external processes want to set up an NCCL connection with vLLM's processes, they should also set this environment variable; otherwise, an inconsistent environment setup will cause NCCL to hang or crash, as observed in the [RLHF integration](https://github.com/OpenRLHF/OpenRLHF/pull/604) and the [discussion](https://github.com/vllm-project/vllm/issues/5723#issuecomment-2554389656). diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst deleted file mode 100644 index b123960533816..0000000000000 --- a/docs/source/getting_started/debugging.rst +++ /dev/null @@ -1,203 +0,0 @@ -.. _debugging: - -=============== -Debugging Tips -=============== - -This document outlines some debugging strategies you can consider. If you think you've discovered a bug, please `search existing issues `_ first to see if it has already been reported.
If not, please `file a new issue `_, providing as much relevant information as possible. - -.. note:: - - Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated. - -Hangs downloading a model ----------------------------------------- -If the model isn't already downloaded to disk, vLLM will download it from the internet which can take time and depend on your internet connection. -It's recommended to download the model first using the `huggingface-cli `_ and passing the local path to the model to vLLM. This way, you can isolate the issue. - -Hangs loading a model from disk ----------------------------------------- -If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow. -It'd be better to store the model in a local disk. Additionally, have a look at the CPU memory usage, when the model is too large it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory. - -.. note:: - - To isolate the model downloading and loading issue, you can use the ``--load-format dummy`` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck. - -Model is too large ----------------------------------------- -If the model is too large to fit in a single GPU, you might want to `consider tensor parallelism `_ to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). 
You can convert the model checkpoint to a sharded checkpoint using `this example `_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. - -Enable more logging ----------------------------------------- -If other strategies don't solve the problem, it's likely that the vLLM instance is stuck somewhere. You can use the following environment variables to help debug the issue: - -- ``export VLLM_LOGGING_LEVEL=DEBUG`` to turn on more logging. -- ``export CUDA_LAUNCH_BLOCKING=1`` to identify which CUDA kernel is causing the problem. -- ``export NCCL_DEBUG=TRACE`` to turn on more logging for NCCL. -- ``export VLLM_TRACE_FUNCTION=1`` to record all function calls for inspection in the log files to tell which function crashes or hangs. - -Incorrect network setup ----------------------------------------- -The vLLM instance cannot get the correct IP address if you have a complicated network config. You can find a log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl`` and the IP address should be the correct one. -If it's not, override the IP address using the environment variable ``export VLLM_HOST_IP=``. - -You might also need to set ``export NCCL_SOCKET_IFNAME=`` and ``export GLOO_SOCKET_IFNAME=`` to specify the network interface for the IP address. - -Error near ``self.graph.replay()`` ----------------------------------------- -If vLLM crashes and the error trace captures it somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a CUDA error inside CUDAGraph. 
-To identify the particular CUDA operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the :class:`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error. - -Incorrect hardware/driver ----------------------------------------- -If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly. - -.. code-block:: python - - # Test PyTorch NCCL - import torch - import torch.distributed as dist - dist.init_process_group(backend="nccl") - local_rank = dist.get_rank() % torch.cuda.device_count() - torch.cuda.set_device(local_rank) - data = torch.FloatTensor([1,] * 128).to("cuda") - dist.all_reduce(data, op=dist.ReduceOp.SUM) - torch.cuda.synchronize() - value = data.mean().item() - world_size = dist.get_world_size() - assert value == world_size, f"Expected {world_size}, got {value}" - - print("PyTorch NCCL is successful!") - - # Test PyTorch GLOO - gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo") - cpu_data = torch.FloatTensor([1,] * 128) - dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group) - value = cpu_data.mean().item() - assert value == world_size, f"Expected {world_size}, got {value}" - - print("PyTorch GLOO is successful!") - - if world_size <= 1: - exit() - - # Test vLLM NCCL, with cuda graph - from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator - - pynccl = PyNcclCommunicator(group=gloo_group, device=local_rank) - # pynccl is enabled by default for 0.6.5+, - # but for 0.6.4 and below, we need to enable it manually. - # keep the code for backward compatibility when because people - # prefer to read the latest documentation. 
- pynccl.disabled = False - - s = torch.cuda.Stream() - with torch.cuda.stream(s): - data.fill_(1) - pynccl.all_reduce(data, stream=s) - value = data.mean().item() - assert value == world_size, f"Expected {world_size}, got {value}" - - print("vLLM NCCL is successful!") - - g = torch.cuda.CUDAGraph() - with torch.cuda.graph(cuda_graph=g, stream=s): - pynccl.all_reduce(data, stream=torch.cuda.current_stream()) - - data.fill_(1) - g.replay() - torch.cuda.current_stream().synchronize() - value = data.mean().item() - assert value == world_size, f"Expected {world_size}, got {value}" - - print("vLLM NCCL with cuda graph is successful!") - - dist.destroy_process_group(gloo_group) - dist.destroy_process_group() - -If you are testing with a single node, adjust ``--nproc-per-node`` to the number of GPUs you want to use: - -.. code-block:: console - - $ NCCL_DEBUG=TRACE torchrun --nproc-per-node= test.py - -If you are testing with multi-nodes, adjust ``--nproc-per-node`` and ``--nnodes`` according to your setup and set ``MASTER_ADDR`` to the correct IP address of the master node, reachable from all nodes. Then, run: - -.. code-block:: console - - $ NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py - -If the script runs successfully, you should see the message ``sanity check is successful!``. - -If the test script hangs or crashes, usually it means the hardware/drivers are broken in some sense. You should try to contact your system administrator or hardware vendor for further assistance. As a common workaround, you can try to tune some NCCL environment variables, such as ``export NCCL_P2P_DISABLE=1`` to see if it helps. Please check `their documentation `__ for more information. Please only use these environment variables as a temporary workaround, as they might affect the performance of the system. The best solution is still to fix the hardware/drivers so that the test script can run successfully. - -.. 
note:: - - A multi-node environment is more complicated than a single-node one. If you see errors such as ``torch.distributed.DistNetworkError``, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments: - - - In the first node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py``. - - In the second node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py``. - - Adjust ``--nproc-per-node``, ``--nnodes``, and ``--node-rank`` according to your setup, being sure to execute different commands (with different ``--node-rank``) on different nodes. - -Python multiprocessing ----------------------- - -`RuntimeError` Exception -^^^^^^^^^^^^^^^^^^^^^^^^ - -If you have seen a warning in your logs like this: - -.. code-block:: console - - WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously - initialized. We must use the `spawn` multiprocessing start method. Setting - VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See - https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing - for more information. - -or an error from Python that looks like this: - -.. code-block:: console - - RuntimeError: - An attempt has been made to start a new process before the - current process has finished its bootstrapping phase. - - This probably means that you are not using fork to start your - child processes and you have forgotten to use the proper idiom - in the main module: - - if __name__ == '__main__': - freeze_support() - ... - - The "freeze_support()" line can be omitted if the program - is not going to be frozen to produce an executable. 
- - To fix this issue, refer to the "Safe importing of main module" - section in https://docs.python.org/3/library/multiprocessing.html - -then you must update your Python code to guard usage of ``vllm`` behind a ``if -__name__ == '__main__':`` block. For example, instead of this: - -.. code-block:: python - - import vllm - - llm = vllm.LLM(...) - -try this instead: - -.. code-block:: python - - if __name__ == '__main__': - import vllm - - llm = vllm.LLM(...) - -Known Issues ----------------------------------------- -- In ``v0.5.2``, ``v0.5.3``, and ``v0.5.3.post1``, there is a bug caused by `zmq `_ , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of ``vllm`` to include the `fix `_. -- To circumvent a NCCL `bug `__ , all vLLM processes will set an environment variable ``NCCL_CUMEM_ENABLE=0`` to disable NCCL's ``cuMem`` allocator. It does not affect performance but only gives memory benefits. When external processes want to set up a NCCL connection with vLLM's processes, they should also set this environment variable, otherwise, inconsistent environment setup will cause NCCL to hang or crash, as observed in `the RLHF integration `__ and the `discussion `__ . 
diff --git a/docs/source/getting_started/examples/examples_index.template.md b/docs/source/getting_started/examples/examples_index.template.md new file mode 100644 index 0000000000000..de7a91c0ffa48 --- /dev/null +++ b/docs/source/getting_started/examples/examples_index.template.md @@ -0,0 +1,8 @@ +# Examples + +```{toctree} +:maxdepth: 1 +:caption: Scripts + +%EXAMPLE_DOCS% +``` \ No newline at end of file diff --git a/docs/source/getting_started/examples/examples_index.template.rst b/docs/source/getting_started/examples/examples_index.template.rst deleted file mode 100644 index 1b34cccbae15a..0000000000000 --- a/docs/source/getting_started/examples/examples_index.template.rst +++ /dev/null @@ -1,8 +0,0 @@ -Examples -================================= - -.. toctree:: - :maxdepth: 1 - :caption: Scripts - - %EXAMPLE_DOCS% diff --git a/docs/source/getting_started/gaudi-installation.md b/docs/source/getting_started/gaudi-installation.md new file mode 100644 index 0000000000000..170d7e49ba806 --- /dev/null +++ b/docs/source/getting_started/gaudi-installation.md @@ -0,0 +1,388 @@ +# Installation with Intel® Gaudi® AI Accelerators + +This README provides instructions on running vLLM with Intel Gaudi devices. + +## Requirements and Installation + +Please follow the instructions provided in the [Gaudi Installation +Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) +to set up the execution environment. To achieve the best performance, +please follow the methods outlined in the [Optimizing Training Platform +Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). + +### Requirements + +- OS: Ubuntu 22.04 LTS +- Python: 3.10 +- Intel Gaudi accelerator +- Intel Gaudi software version 1.18.0 + +### Quick start using Dockerfile + +```console +$ docker build -f Dockerfile.hpu -t vllm-hpu-env . 
+$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env +``` + +```{tip} +If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered. +``` + +### Build from source + +#### Environment verification + +To verify that the Intel Gaudi software was correctly installed, run: + +```console +$ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible +$ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed +$ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed +$ pip list | grep neural # verify that neural_compressor is installed +``` + +Refer to [Intel Gaudi Software Stack +Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade) +for more details. + +#### Run Docker Image + +It is highly recommended to use the latest Docker image from Intel Gaudi +vault. Refer to the [Intel Gaudi +documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) +for more details. 
+ +Use the following commands to run a Docker image: + +```console +$ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest +$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest +``` + +#### Build and Install vLLM + +To build and install vLLM from source, run: + +```console +$ git clone https://github.com/vllm-project/vllm.git +$ cd vllm +$ python setup.py develop +``` + +Currently, the latest features and performance optimizations are developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and we periodically upstream them to vLLM main repo. To install latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following: + +```console +$ git clone https://github.com/HabanaAI/vllm-fork.git +$ cd vllm-fork +$ git checkout habana_main +$ python setup.py develop +``` + +## Supported Features + +- [Offline batched + inference](https://docs.vllm.ai/en/latest/getting_started/quickstart.html#offline-batched-inference) +- Online inference via [OpenAI-Compatible + Server](https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server) +- HPU autodetection - no need to manually select device within vLLM +- Paged KV cache with algorithms enabled for Intel Gaudi accelerators +- Custom Intel Gaudi implementations of Paged Attention, KV cache ops, + prefill attention, Root Mean Square Layer Normalization, Rotary + Positional Encoding +- Tensor parallelism support for multi-card inference +- Inference with [HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) + for accelerating low-batch latency and throughput +- Attention with Linear Biases (ALiBi) + +## Unsupported Features + +- Beam search +- LoRA adapters +- Quantization +- Prefill 
chunking (mixed-batch inferencing) + +## Supported Configurations + +The following configurations have been validated to function with +Gaudi2 devices. Configurations that are not listed may or may not work. + +- [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- [meta-llama/Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b) + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B) + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- 
[meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling + +## Performance Tuning + +### Execution modes + +Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment variable), and `--enforce-eager` flag. + +```{eval-rst} +.. list-table:: vLLM execution modes + :widths: 25 25 50 + :header-rows: 1 + + * - ``PT_HPU_LAZY_MODE`` + - ``enforce_eager`` + - execution mode + * - 0 + - 0 + - torch.compile + * - 0 + - 1 + - PyTorch eager mode + * - 1 + - 0 + - HPU Graphs + * - 1 + - 1 + - PyTorch lazy mode +``` + +```{warning} +In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode. +``` + +### Bucketing mechanism + +Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. 
While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. +In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - `batch_size` and `sequence_length`. + +```{note} +Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase. +``` + +Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup: + +``` +INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] +INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] +INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] +INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 
1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] +``` + +`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, interval between `min` and `step` has special handling - `min` gets multiplied by consecutive powers of two, until `step` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes. + +Example (with ramp-up) + +``` +min = 2, step = 32, max = 64 +=> ramp_up = (2, 4, 8, 16) +=> stable = (32, 64) +=> buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64) +``` + +Example (without ramp-up) + +``` +min = 128, step = 128, max = 512 +=> ramp_up = () +=> stable = (128, 256, 384, 512) +=> buckets = ramp_up + stable => (128, 256, 384, 512) +``` + +In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket. + +```{warning} +If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such a scenario. +``` + +As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded and executed as `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). 
After prefill stage, it will be executed as `(4, 512)` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a `(2, 512)` bucket, or context length increases above 512 tokens, in which case it will become `(4, 640)` bucket. + +```{note} +Bucketing is transparent to a client - padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. +``` + +### Warmup + +Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup: + +``` +INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB +INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB +INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB +... +INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB +INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB +INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB +INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB +... +INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB +INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB +``` + +This example uses the same buckets as in *Bucketing mechanism* section. 
Each output line corresponds to execution of a single bucket. When a bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. + +```{tip} +Compiling all the buckets might take some time and can be turned off with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. +``` + +### HPU Graph capture + +[HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which needs to be taken into account when allocating KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management. + +When HPU Graphs are being used, they share the common memory pool ("usable memory") as KV cache, determined by `gpu_memory_utilization` flag (`0.9` by default). +Before KV cache gets allocated, model weights are loaded onto the device, and a forward pass of the model is executed on dummy data, to estimate memory usage. +Only after that, `gpu_memory_utilization` flag is utilized - at its default value, it will mark 90% of free device memory at that point as usable. +Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured. +Environment variable `VLLM_GRAPH_RESERVED_MEM` defines the ratio of memory reserved for HPU Graphs capture. 
+With its default value (`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for KV cache. +Environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (`VLLM_GRAPH_PROMPT_RATIO=0.3`), 30% of usable graph memory is reserved for prefill graphs, and 70% for decode graphs. +Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. `VLLM_GRAPH_PROMPT_RATIO=0.2` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs. + +```{note} +`gpu_memory_utilization` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, `gpu_memory_utilization` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory. +``` + +User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented: +- `max_bs` - graph capture queue will be sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. `(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1,256)`), default strategy for decode +- `min_tokens` - graph capture queue will be sorted in ascending order by the number of tokens each graph processes (`batch_size*sequence_length`), default strategy for prompt + +When there's a large number of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. 
When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by `max_bs` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in `min_tokens` strategy. + +```{note} +`VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up the entirety of usable prefill graph memory (usable graph memory * `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, next it will attempt to do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding reserved memory pool. The behavior of that mechanism can be observed in the example below. 
+``` + +Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): + +``` +INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] +INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] +INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] +INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] +INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) +INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used) +INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) +INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 
GiB/1007 GiB used) +INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache +INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0 +INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used) +INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB +... +INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB +INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) +INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB +... +INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB +INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB +... 
+INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB +INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB +INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB +INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB +INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB +INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)] +INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] +INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory +INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used) +``` + +### Recommended vLLM Parameters + +- We recommend running inference on Gaudi 2 with `block_size` of 128 + for BF16 data type. 
Using default values (16, 32) might lead to + sub-optimal performance due to Matrix Multiplication Engine + under-utilization (see [Gaudi + Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)). +- For max throughput on Llama 7B, we recommend running with batch size + of 128 or 256 and max context length of 2048 with HPU Graphs enabled. + If you encounter out-of-memory issues, see troubleshooting section. + +### Environment variables + +**Diagnostic and profiling knobs:** + +- `VLLM_PROFILER_ENABLED`: if `true`, high level profiler will be enabled. Resulting JSON traces can be viewed in [perfetto.habana.ai](https://perfetto.habana.ai/#!/viewer). Disabled by default. +- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION`: if `true`, will log graph compilations per each vLLM engine step, only when there was any - highly recommended to use alongside `PT_HPU_METRICS_GC_DETAILS=1`. Disabled by default. +- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: if `true`, will log graph compilations per each vLLM engine step, always, even if there were none. Disabled by default. +- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: if `true`, will log cpu fallbacks per each vLLM engine step, only when there was any. Disabled by default. +- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: if `true`, will log cpu fallbacks per each vLLM engine step, always, even if there were none. Disabled by default. 
+ +**Performance tuning knobs:** + +- `VLLM_SKIP_WARMUP`: if `true`, warmup will be skipped, `false` by default + +- `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated for HPUGraph capture, `0.1` by default + +- `VLLM_GRAPH_PROMPT_RATIO`: percentage of reserved graph memory dedicated for prompt graphs, `0.3` by default + +- `VLLM_GRAPH_PROMPT_STRATEGY`: strategy determining order of prompt graph capture, `min_tokens` or `max_bs`, `min_tokens` by default + +- `VLLM_GRAPH_DECODE_STRATEGY`: strategy determining order of decode graph capture, `min_tokens` or `max_bs`, `max_bs` by default + +- `VLLM_{phase}_{dim}_BUCKET_{param}` - collection of 12 environment variables configuring ranges of bucketing mechanism + + - `{phase}` is either `PROMPT` or `DECODE` + + - `{dim}` is either `BS`, `SEQ` or `BLOCK` + + - `{param}` is either `MIN`, `STEP` or `MAX` + + - Default values: + + - Prompt: + : - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` + - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` + - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `min(max_num_seqs, 64)` + - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size` + - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` + - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len` + - Decode: + : - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1` + - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` + - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs` + - sequence length min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size` + - sequence length step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size` + - sequence length max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)` + +Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: + +- `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used, if `1` PyTorch Lazy backend for Gaudi will 
be used, `1` is default +- `PT_HPU_ENABLE_LAZY_COLLECTIVES`: required to be `true` for tensor parallel inference with HPU Graphs + +## Troubleshooting: Tweaking HPU Graphs + +If you experience device out-of-memory issues or want to attempt +inference at higher batch sizes, try tweaking HPU Graphs by following +the below: + +- Tweak `gpu_memory_utilization` knob. It will decrease the + allocation of KV cache, leaving some headroom for capturing graphs + with larger batch size. By default `gpu_memory_utilization` is set + to 0.9. It attempts to allocate ~90% of HBM left for KV cache after + short profiling run. Note that decreasing reduces the number of KV + cache blocks you have available, and therefore reduces the effective + maximum number of tokens you can handle at a given time. +- If this method is not efficient, you can disable `HPUGraph` + completely. With HPU Graphs disabled, you are trading latency and + throughput at lower batches for potentially higher throughput on + higher batches. You can do that by adding `--enforce-eager` flag to + server (for online inference), or by passing `enforce_eager=True` + argument to LLM constructor (for offline inference). diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst deleted file mode 100644 index 249e08278ff8f..0000000000000 --- a/docs/source/getting_started/gaudi-installation.rst +++ /dev/null @@ -1,402 +0,0 @@ -Installation with Intel® Gaudi® AI Accelerators -=============================================== - -This README provides instructions on running vLLM with Intel Gaudi devices. - -Requirements and Installation ------------------------------ - -Please follow the instructions provided in the `Gaudi Installation -Guide `__ -to set up the execution environment. To achieve the best performance, -please follow the methods outlined in the `Optimizing Training Platform -Guide `__. 
- -Requirements -~~~~~~~~~~~~ - -- OS: Ubuntu 22.04 LTS -- Python: 3.10 -- Intel Gaudi accelerator -- Intel Gaudi software version 1.18.0 - - -Quick start using Dockerfile -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. code:: console - - $ docker build -f Dockerfile.hpu -t vllm-hpu-env . - $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env - - -.. tip:: - If you're observing the following error: ``docker: Error response from daemon: Unknown runtime specified habana.``, please refer to "Install Using Containers" section of `Intel Gaudi Software Stack and Driver Installation `__. Make sure you have ``habana-container-runtime`` package installed and that ``habana`` container runtime is registered. - - -Build from source -~~~~~~~~~~~~~~~~~ - -Environment verification -^^^^^^^^^^^^^^^^^^^^^^^^ - -To verify that the Intel Gaudi software was correctly installed, run: - -.. code:: console - - $ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible - $ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed - $ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed - $ pip list | grep neural # verify that neural_compressor is installed - -Refer to `Intel Gaudi Software Stack -Verification `__ -for more details. - -Run Docker Image -^^^^^^^^^^^^^^^^ - -It is highly recommended to use the latest Docker image from Intel Gaudi -vault. Refer to the `Intel Gaudi -documentation `__ -for more details. - -Use the following commands to run a Docker image: - -.. 
code:: console - - $ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest - $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest - -Build and Install vLLM -^^^^^^^^^^^^^^^^^^^^^^ - -To build and install vLLM from source, run: - -.. code:: console - - $ git clone https://github.com/vllm-project/vllm.git - $ cd vllm - $ python setup.py develop - - -Currently, the latest features and performance optimizations are developed in Gaudi's `vLLM-fork `__ and we periodically upstream them to vLLM main repo. To install latest `HabanaAI/vLLM-fork `__, run the following: - -.. code:: console - - $ git clone https://github.com/HabanaAI/vllm-fork.git - $ cd vllm-fork - $ git checkout habana_main - $ python setup.py develop - - -Supported Features ------------------- - -- `Offline batched - inference `__ -- Online inference via `OpenAI-Compatible - Server `__ -- HPU autodetection - no need to manually select device within vLLM -- Paged KV cache with algorithms enabled for Intel Gaudi accelerators -- Custom Intel Gaudi implementations of Paged Attention, KV cache ops, - prefill attention, Root Mean Square Layer Normalization, Rotary - Positional Encoding -- Tensor parallelism support for multi-card inference -- Inference with `HPU Graphs `__ - for accelerating low-batch latency and throughput -- Attention with Linear Biases (ALiBi) - -Unsupported Features --------------------- - -- Beam search -- LoRA adapters -- Quantization -- Prefill chunking (mixed-batch inferencing) - -Supported Configurations ------------------------- - -The following configurations have been validated to be function with -Gaudi2 devices. Configurations that are not listed may or may not work. 
- -- `meta-llama/Llama-2-7b `__ - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- `meta-llama/Llama-2-7b-chat-hf `__ - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- `meta-llama/Meta-Llama-3-8B `__ - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- `meta-llama/Meta-Llama-3-8B-Instruct `__ - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- `meta-llama/Meta-Llama-3.1-8B `__ - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- `meta-llama/Meta-Llama-3.1-8B-Instruct `__ - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- `meta-llama/Llama-2-70b `__ - with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling -- `meta-llama/Llama-2-70b-chat-hf `__ - with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling -- `meta-llama/Meta-Llama-3-70B `__ - with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling -- `meta-llama/Meta-Llama-3-70B-Instruct `__ - with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling -- `meta-llama/Meta-Llama-3.1-70B `__ - with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling -- `meta-llama/Meta-Llama-3.1-70B-Instruct `__ - with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling - -Performance Tuning ------------------- - -Execution modes -~~~~~~~~~~~~~~~ - -Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via ``PT_HPU_LAZY_MODE`` environment variable), and ``--enforce-eager`` flag. - -.. 
list-table:: vLLM execution modes - :widths: 25 25 50 - :header-rows: 1 - - * - ``PT_HPU_LAZY_MODE`` - - ``enforce_eager`` - - execution mode - * - 0 - - 0 - - torch.compile - * - 0 - - 1 - - PyTorch eager mode - * - 1 - - 0 - - HPU Graphs - * - 1 - - 1 - - PyTorch lazy mode - -.. warning:: - In 1.18.0, all modes utilizing ``PT_HPU_LAZY_MODE=0`` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode. - - -Bucketing mechanism -~~~~~~~~~~~~~~~~~~~ - -Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. `Intel Gaudi Graph Compiler `__ is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. -In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - ``batch_size`` and ``sequence_length``. - -.. note:: - Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase. - -Bucketing ranges are determined with 3 parameters - ``min``, ``step`` and ``max``. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup: - -.. 
code-block:: - - INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] - INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] - INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] - INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] - -``min`` determines the lowest value of the bucket. ``step`` determines the interval between buckets, and ``max`` determines the upper bound of the bucket. Furthermore, interval between ``min`` and ``step`` has special handling - ``min`` gets multiplied by consecutive powers of two, until ``step`` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes. - -Example (with ramp-up) - -.. code-block:: - - min = 2, step = 32, max = 64 - => ramp_up = (2, 4, 8, 16) - => stable = (32, 64) - => buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64) - -Example (without ramp-up) - -.. 
code-block:: - - min = 128, step = 128, max = 512 - => ramp_up = () - => stable = (128, 256, 384, 512) - => buckets = ramp_up + stable => (128, 256, 384, 512) - - -In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket. - -.. warning:: - If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario. - -As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as ``(4, 512)`` prefill bucket, as ``batch_size`` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as ``(4, 512)`` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a ``(2, 512)`` bucket, or context length increases above 512 tokens, in which case it will become ``(4, 640)`` bucket. - -.. note:: - Bucketing is transparent to a client - padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. - -Warmup -~~~~~~ - -Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. 
The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup: - -.. code-block:: - - INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB - INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB - INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB - ... - INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB - INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB - INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB - INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB - ... - INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB - INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB - -This example uses the same buckets as in *Bucketing mechanism* section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. - -.. tip:: - Compiling all the buckets might take some time and can be turned off with ``VLLM_SKIP_WARMUP=true`` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. 
- -HPU Graph capture -~~~~~~~~~~~~~~~~~ - -`HPU Graphs `__ are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which needs to be taken into account when allocating KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management. - - -When HPU Graphs are being used, they share the common memory pool ("usable memory") as KV cache, determined by ``gpu_memory_utilization`` flag (``0.9`` by default). -Before KV cache gets allocated, model weights are loaded onto the device, and a forward pass of the model is executed on dummy data, to estimate memory usage. -Only after that, ``gpu_memory_utilization`` flag is utilized - at its default value, will mark 90% of free device memory at that point as usable. -Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured. -Environment variable ``VLLM_GRAPH_RESERVED_MEM`` defines the ratio of memory reserved for HPU Graphs capture. -With its default value (``VLLM_GRAPH_RESERVED_MEM=0.1``), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for KV cache. -Environment variable ``VLLM_GRAPH_PROMPT_RATIO`` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (``VLLM_GRAPH_PROMPT_RATIO=0.3``), both stages have equal memory constraints. -Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. ``VLLM_GRAPH_PROMPT_RATIO=0.2`` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs. - -.. 
note:: - ``gpu_memory_utilization`` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, ``gpu_memory_utilization`` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory. - -User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented: -- ``max_bs`` - graph capture queue will sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. ``(64, 128)``, ``(64, 256)``, ``(32, 128)``, ``(32, 256)``, ``(1, 128)``, ``(1,256)``), default strategy for decode -- ``min_tokens`` - graph capture queue will be sorted in ascending order by the number of tokens each graph processes (``batch_size*sequence_length``), default strategy for prompt - -When there's large amount of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by ``max_bs`` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in ``min_tokens`` strategy. - - -.. note:: - ``VLLM_GRAPH_PROMPT_RATIO`` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). 
vLLM will first attempt to use up entirety of usable prefill graph memory (usable graph memory * ``VLLM_GRAPH_PROMPT_RATIO``) for capturing prefill HPU Graphs, next it will attempt do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding reserved memory pool. The behavior on that mechanism can be observed in the example below. - - -Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): - -.. code-block:: - - INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] - INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] - INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] - INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] - INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 
2.95 GiB of host memory (475.2 GiB/1007 GiB used) - INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used) - INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) - INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used) - INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache - INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0 - INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used) - INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB - ... - INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB - INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) - INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB - ... - INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB - INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB - ... 
- INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB - INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB - INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB - INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB - INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB - INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)] - INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] - INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory - INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used) - - -Recommended vLLM Parameters -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -- We recommend running inference on Gaudi 2 with ``block_size`` of 128 - for BF16 data type. 
Using default values (16, 32) might lead to - sub-optimal performance due to Matrix Multiplication Engine - under-utilization (see `Gaudi - Architecture `__). -- For max throughput on Llama 7B, we recommend running with batch size - of 128 or 256 and max context length of 2048 with HPU Graphs enabled. - If you encounter out-of-memory issues, see troubleshooting section. - -Environment variables -~~~~~~~~~~~~~~~~~~~~~ - -**Diagnostic and profiling knobs:** - -- ``VLLM_PROFILER_ENABLED``: if ``true``, high level profiler will be enabled. Resulting JSON traces can be viewed in `perfetto.habana.ai `__. Disabled by default. -- ``VLLM_HPU_LOG_STEP_GRAPH_COMPILATION``: if ``true``, will log graph compilations per each vLLM engine step, only when there was any - highly recommended to use alongside ``PT_HPU_METRICS_GC_DETAILS=1``. Disabled by default. -- ``VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL``: if ``true``, will log graph compilations per each vLLM engine step, always, even if there were none. Disabled by default. -- ``VLLM_HPU_LOG_STEP_CPU_FALLBACKS``: if ``true``, will log cpu fallbacks per each vLLM engine step, only when there was any. Disabled by default. -- ``VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL``: if ``true``, will log cpu fallbacks per each vLLM engine step, always, even if there were none. Disabled by default. 
- -**Performance tuning knobs:** - -- ``VLLM_SKIP_WARMUP``: if ``true``, warmup will be skipped, ``false`` by default -- ``VLLM_GRAPH_RESERVED_MEM``: percentage of memory dedicated for HPUGraph capture, ``0.1`` by default -- ``VLLM_GRAPH_PROMPT_RATIO``: percentage of reserved graph memory dedicated for prompt graphs, ``0.3`` by default -- ``VLLM_GRAPH_PROMPT_STRATEGY``: strategy determining order of prompt graph capture, ``min_tokens`` or ``max_bs``, ``min_tokens`` by default -- ``VLLM_GRAPH_DECODE_STRATEGY``: strategy determining order of decode graph capture, ``min_tokens`` or ``max_bs``, ``max_bs`` by default -- ``VLLM_{phase}_{dim}_BUCKET_{param}`` - collection of 12 environment variables configuring ranges of bucketing mechanism - - - ``{phase}`` is either ``PROMPT`` or ``DECODE`` - - ``{dim}`` is either ``BS``, ``SEQ`` or ``BLOCK`` - - ``{param}`` is either ``MIN``, ``STEP`` or ``MAX`` - - Default values: - - - Prompt: - - batch size min (``VLLM_PROMPT_BS_BUCKET_MIN``): ``1`` - - batch size step (``VLLM_PROMPT_BS_BUCKET_STEP``): ``min(max_num_seqs, 32)`` - - batch size max (``VLLM_PROMPT_BS_BUCKET_MAX``): ``min(max_num_seqs, 64)`` - - sequence length min (``VLLM_PROMPT_SEQ_BUCKET_MIN``): ``block_size`` - - sequence length step (``VLLM_PROMPT_SEQ_BUCKET_STEP``): ``block_size`` - - sequence length max (``VLLM_PROMPT_SEQ_BUCKET_MAX``): ``max_model_len`` - - - Decode: - - batch size min (``VLLM_DECODE_BS_BUCKET_MIN``): ``1`` - - batch size step (``VLLM_DECODE_BS_BUCKET_STEP``): ``min(max_num_seqs, 32)`` - - batch size max (``VLLM_DECODE_BS_BUCKET_MAX``): ``max_num_seqs`` - - sequence length min (``VLLM_DECODE_BLOCK_BUCKET_MIN``): ``block_size`` - - sequence length step (``VLLM_DECODE_BLOCK_BUCKET_STEP``): ``block_size`` - - sequence length max (``VLLM_DECODE_BLOCK_BUCKET_MAX``): ``max(128, (max_num_seqs*max_model_len)/block_size)`` - - -Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: - -- ``PT_HPU_LAZY_MODE``: if ``0``, 
PyTorch Eager backend for Gaudi will be used, if ``1`` PyTorch Lazy backend for Gaudi will be used, ``1`` is default -- ``PT_HPU_ENABLE_LAZY_COLLECTIVES``: required to be ``true`` for tensor parallel inference with HPU Graphs - -Troubleshooting: Tweaking HPU Graphs ------------------------------------- - -If you experience device out-of-memory issues or want to attempt -inference at higher batch sizes, try tweaking HPU Graphs by following -the below: - -- Tweak ``gpu_memory_utilization`` knob. It will decrease the - allocation of KV cache, leaving some headroom for capturing graphs - with larger batch size. By default ``gpu_memory_utilization`` is set - to 0.9. It attempts to allocate ~90% of HBM left for KV cache after - short profiling run. Note that decreasing reduces the number of KV - cache blocks you have available, and therefore reduces the effective - maximum number of tokens you can handle at a given time. - -- If this method is not efficient, you can disable ``HPUGraph`` - completely. With HPU Graphs disabled, you are trading latency and - throughput at lower batches for potentially higher throughput on - higher batches. You can do that by adding ``--enforce-eager`` flag to - server (for online inference), or by passing ``enforce_eager=True`` - argument to LLM constructor (for offline inference). diff --git a/docs/source/getting_started/installation.md b/docs/source/getting_started/installation.md new file mode 100644 index 0000000000000..8ca634f966a06 --- /dev/null +++ b/docs/source/getting_started/installation.md @@ -0,0 +1,199 @@ +(installation)= + +# Installation + +vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries. + +## Requirements + +- OS: Linux +- Python: 3.9 -- 3.12 +- GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) + +## Install released versions + +You can install vLLM using pip: + +```console +$ # (Recommended) Create a new conda environment. 
+$ conda create -n myenv python=3.12 -y +$ conda activate myenv + +$ # Install vLLM with CUDA 12.1. +$ pip install vllm +``` + +```{note} +Although we recommend using `conda` to create and manage Python environments, it is highly recommended to use `pip` to install vLLM. This is because `pip` can install `torch` with separate library packages like `NCCL`, while `conda` installs `torch` with statically linked `NCCL`. This can cause issues when vLLM tries to use `NCCL`. See [this issue](https://github.com/vllm-project/vllm/issues/8420) for more details. +``` + +````{note} +As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default. +We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions: + +```console +$ # Install vLLM with CUDA 11.8. +$ export VLLM_VERSION=0.6.1.post1 +$ export PYTHON_VERSION=310 +$ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 +``` + +In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations. + +Therefore, it is recommended to install vLLM with a **fresh new** conda environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions. +```` + +(install-the-latest-code)= + +## Install the latest code + +LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. 
To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since `v0.5.3`. You can download and install it with the following command: + +```console +$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl +``` + +If you want to access the wheels for previous commits, you can specify the commit hash in the URL: + +```console +$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch +$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl +``` + +Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. + +Another way to access the latest code is to use the docker images: + +```console +$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch +$ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT} +``` + +These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days. + +The latest code can contain bugs and may not be stable. Please use it with caution. 
+ +(build-from-source)= + +## Build from source + +(python-only-build)= + +### Python-only build (without compilation) + +If you only need to change Python code, you can build and install vLLM without compilation. Using `pip`'s [`--editable` flag](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs), changes you make to the code will be reflected when you run vLLM: + +```console +$ git clone https://github.com/vllm-project/vllm.git +$ cd vllm +$ VLLM_USE_PRECOMPILED=1 pip install --editable . +``` + +This will download the latest nightly wheel and use the compiled libraries from there in the install. + +The `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable can be used instead of `VLLM_USE_PRECOMPILED` to specify a custom path or URL to the wheel file. For example, to use the [0.6.1.post1 PyPi wheel](https://pypi.org/project/vllm/#files): + +```console +$ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl +$ pip install --editable . +``` + +You can find more information about vLLM's wheels [above](#install-the-latest-code). + +```{note} +There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors. +It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [the section above](#install-the-latest-code) for instructions on how to install a specified wheel. +``` + +### Full build (with compilation) + +If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes: + +```console +$ git clone https://github.com/vllm-project/vllm.git +$ cd vllm +$ pip install -e . +``` + +```{tip} +Building from source requires a lot of compilation. 
If you are building from source repeatedly, it's more efficient to cache the compilation results. + +For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` . +As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster. + +[sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments. +The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`. +``` + +#### Use an existing PyTorch installation + +There are scenarios where the PyTorch dependency cannot be easily installed via pip, e.g.: + +- Building vLLM with PyTorch nightly or a custom PyTorch build. +- Building vLLM with aarch64 and CUDA (GH200), where the PyTorch wheels are not available on PyPI. Currently, only the PyTorch nightly has wheels for aarch64 with CUDA. You can run `pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124` to [install PyTorch nightly](https://pytorch.org/get-started/locally/), and then build vLLM on top of it. + +To build vLLM using an existing PyTorch installation: + +```console +$ git clone https://github.com/vllm-project/vllm.git +$ cd vllm +$ python use_existing_torch.py +$ pip install -r requirements-build.txt +$ pip install -e . --no-build-isolation +``` + +#### Use the local cutlass for compilation + +Currently, before starting the build process, vLLM fetches cutlass code from GitHub. However, there may be scenarios where you want to use a local version of cutlass instead. +To achieve this, you can set the environment variable VLLM_CUTLASS_SRC_DIR to point to your local cutlass directory. 
+ +```console +$ git clone https://github.com/vllm-project/vllm.git +$ cd vllm +$ VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e . +``` + +#### Troubleshooting + +To avoid your system being overloaded, you can limit the number of compilation jobs +to be run simultaneously, via the environment variable `MAX_JOBS`. For example: + +```console +$ export MAX_JOBS=6 +$ pip install -e . +``` + +This is especially useful when you are building on less powerful machines. For example, when you use WSL it only [assigns 50% of the total memory by default](https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings), so using `export MAX_JOBS=1` can avoid compiling multiple files simultaneously and running out of memory. +A side effect is a much slower build process. + +Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. + +```console +$ # Use `--ipc=host` to make sure the shared memory is large enough. +$ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 +``` + +If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from [the official website](https://developer.nvidia.com/cuda-toolkit-archive). After installation, set the environment variable `CUDA_HOME` to the installation path of CUDA Toolkit, and make sure that the `nvcc` compiler is in your `PATH`, e.g.: + +```console +$ export CUDA_HOME=/usr/local/cuda +$ export PATH="${CUDA_HOME}/bin:$PATH" +``` + +Here is a sanity check to verify that the CUDA Toolkit is correctly installed: + +```console +$ nvcc --version # verify that nvcc is in your PATH +$ ${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME +``` + +### Unsupported OS build + +vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. 
The binaries will not be compiled and won't work on non-Linux systems. + +Simply disable the `VLLM_TARGET_DEVICE` environment variable before installing: + +```console +$ export VLLM_TARGET_DEVICE=empty +$ pip install -e . +``` diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst deleted file mode 100644 index 9b6cb0e80d60e..0000000000000 --- a/docs/source/getting_started/installation.rst +++ /dev/null @@ -1,214 +0,0 @@ -.. _installation: - -============ -Installation -============ - -vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries. - -Requirements -============ - -* OS: Linux -* Python: 3.9 -- 3.12 -* GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) - -Install released versions -========================= - -You can install vLLM using pip: - -.. code-block:: console - - $ # (Recommended) Create a new conda environment. - $ conda create -n myenv python=3.12 -y - $ conda activate myenv - - $ # Install vLLM with CUDA 12.1. - $ pip install vllm - -.. note:: - - Although we recommend using ``conda`` to create and manage Python environments, it is highly recommended to use ``pip`` to install vLLM. This is because ``pip`` can install ``torch`` with separate library packages like ``NCCL``, while ``conda`` installs ``torch`` with statically linked ``NCCL``. This can cause issues when vLLM tries to use ``NCCL``. See `this issue `_ for more details. - -.. note:: - - As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default. - We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions: - - .. code-block:: console - - $ # Install vLLM with CUDA 11.8. 
- $ export VLLM_VERSION=0.6.1.post1 - $ export PYTHON_VERSION=310 - $ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 - - In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations. - - Therefore, it is recommended to install vLLM with a **fresh new** conda environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions. - - -.. _install-the-latest-code: - -Install the latest code -======================= - -LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since ``v0.5.3``. You can download and install it with the following command: - -.. code-block:: console - - $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl - -If you want to access the wheels for previous commits, you can specify the commit hash in the URL: - -.. code-block:: console - - $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch - $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl - -Note that the wheels are built with Python 3.8 ABI (see `PEP 425 `_ for more details about ABI), so **they are compatible with Python 3.8 and later**. 
The version string in the wheel file name (``1.0.0.dev``) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. - -Another way to access the latest code is to use the docker images: - -.. code-block:: console - - $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch - $ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT} - -These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days. - -The latest code can contain bugs and may not be stable. Please use it with caution. - -.. _build_from_source: - -Build from source -================= - -.. _python-only-build: - -Python-only build (without compilation) ---------------------------------------- - -If you only need to change Python code, you can build and install vLLM without compilation. Using `pip's ``--editable`` flag `_, changes you make to the code will be reflected when you run vLLM: - -.. code-block:: console - - $ git clone https://github.com/vllm-project/vllm.git - $ cd vllm - $ VLLM_USE_PRECOMPILED=1 pip install --editable . - -This will download the latest nightly wheel and use the compiled libraries from there in the install. - -The ``VLLM_PRECOMPILED_WHEEL_LOCATION`` environment variable can be used instead of ``VLLM_USE_PRECOMPILED`` to specify a custom path or URL to the wheel file. For example, to use the `0.6.1.post1 PyPi wheel `_: - -.. code-block:: console - - $ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl - $ pip install --editable . 
- -You can find more information about vLLM's wheels `above <#install-the-latest-code>`_. - -.. note:: - - There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors. - It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to `the section above <#install-the-latest-code>`_ for instructions on how to install a specified wheel. - -Full build (with compilation) ------------------------------ - -If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes: - -.. code-block:: console - - $ git clone https://github.com/vllm-project/vllm.git - $ cd vllm - $ pip install -e . - -.. tip:: - - Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results. - - For example, you can install `ccache `_ using ``conda install ccache`` or ``apt install ccache`` . - As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster. - - `sccache `_ works similarly to ``ccache``, but has the capability to utilize caching in remote storage environments. - The following environment variables can be set to configure the vLLM ``sccache`` remote: ``SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1``. We also recommend setting ``SCCACHE_IDLE_TIMEOUT=0``. - - -Use an existing PyTorch installation -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -There are scenarios where the PyTorch dependency cannot be easily installed via pip, e.g.: - -* Building vLLM with PyTorch nightly or a custom PyTorch build. -* Building vLLM with aarch64 and CUDA (GH200), where the PyTorch wheels are not available on PyPI. 
Currently, only the PyTorch nightly has wheels for aarch64 with CUDA. You can run ``pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124`` to `install PyTorch nightly `_, and then build vLLM on top of it. - -To build vLLM using an existing PyTorch installation: - -.. code-block:: console - - $ git clone https://github.com/vllm-project/vllm.git - $ cd vllm - $ python use_existing_torch.py - $ pip install -r requirements-build.txt - $ pip install -e . --no-build-isolation - - -Use the local cutlass for compilation -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Currently, before starting the build process, vLLM fetches cutlass code from GitHub. However, there may be scenarios where you want to use a local version of cutlass instead. -To achieve this, you can set the environment variable VLLM_CUTLASS_SRC_DIR to point to your local cutlass directory. - -.. code-block:: console - - $ git clone https://github.com/vllm-project/vllm.git - $ cd vllm - $ VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e . - - -Troubleshooting -~~~~~~~~~~~~~~~ - -To avoid your system being overloaded, you can limit the number of compilation jobs -to be run simultaneously, via the environment variable ``MAX_JOBS``. For example: - -.. code-block:: console - - $ export MAX_JOBS=6 - $ pip install -e . - -This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default `_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory. -A side effect is a much slower build process. - -Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. - -.. code-block:: console - - $ # Use `--ipc=host` to make sure the shared memory is large enough. 
- $ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 - -If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from `the official website `_. After installation, set the environment variable ``CUDA_HOME`` to the installation path of CUDA Toolkit, and make sure that the ``nvcc`` compiler is in your ``PATH``, e.g.: - -.. code-block:: console - - $ export CUDA_HOME=/usr/local/cuda - $ export PATH="${CUDA_HOME}/bin:$PATH" - -Here is a sanity check to verify that the CUDA Toolkit is correctly installed: - -.. code-block:: console - - $ nvcc --version # verify that nvcc is in your PATH - $ ${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME - - -Unsupported OS build --------------------- - -vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems. - -Simply disable the ``VLLM_TARGET_DEVICE`` environment variable before installing: - -.. code-block:: console - - $ export VLLM_TARGET_DEVICE=empty - $ pip install -e . diff --git a/docs/source/getting_started/neuron-installation.md b/docs/source/getting_started/neuron-installation.md new file mode 100644 index 0000000000000..d6de5760cc82c --- /dev/null +++ b/docs/source/getting_started/neuron-installation.md @@ -0,0 +1,132 @@ +(installation-neuron)= + +# Installation with Neuron + +vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK with continuous batching. +Paged Attention and Chunked Prefill are currently in development and will be available soon. +Data types currently supported in Neuron SDK are FP16 and BF16. 
+ +## Requirements + +- OS: Linux +- Python: 3.9 -- 3.11 +- Accelerator: NeuronCore_v2 (in trn1/inf2 instances) +- Pytorch 2.0.1/2.1.1 +- AWS Neuron SDK 2.16/2.17 (Verified on python 3.8) + +Installation steps: + +- [Build from source](#build-from-source-neuron) + + - [Step 0. Launch Trn1/Inf2 instances](#launch-instances) + - [Step 1. Install drivers and tools](#install-drivers) + - [Step 2. Install transformers-neuronx and its dependencies](#install-tnx) + - [Step 3. Install vLLM from source](#install-vllm) + +(build-from-source-neuron)= + +```{note} +The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with vLLM >= 0.5.3. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. +``` + +## Build from source + +The following instructions are applicable to Neuron SDK 2.16 and beyond. + +(launch-instances)= + +### Step 0. Launch Trn1/Inf2 instances + +Here are the steps to launch trn1/inf2 instances, in order to install [PyTorch Neuron ("torch-neuronx") Setup on Ubuntu 22.04 LTS](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/pytorch/neuronx/ubuntu/torch-neuronx-ubuntu22.html). + +- Please follow the instructions at [launch an Amazon EC2 Instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-launch-instance) to launch an instance. When choosing the instance type at the EC2 console, please make sure to select the correct instance type. +- To get more information about instance sizes and pricing see: [Trn1 web page](https://aws.amazon.com/ec2/instance-types/trn1/), [Inf2 web page](https://aws.amazon.com/ec2/instance-types/inf2/) +- Select Ubuntu Server 22.04 LTS AMI +- When launching a Trn1/Inf2, please adjust your primary EBS volume size to a minimum of 512GB.
+- After launching the instance, follow the instructions in [Connect to your instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html) to connect to the instance + +(install-drivers)= + +### Step 1. Install drivers and tools + +The installation of drivers and tools wouldn't be necessary, if [Deep Learning AMI Neuron](https://docs.aws.amazon.com/dlami/latest/devguide/appendix-ami-release-notes.html) is installed. In case the drivers and tools are not installed on the operating system, follow the steps below: + +```console +# Configure Linux for Neuron repository updates +. /etc/os-release +sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <` - - - :ref:`Step 0. Launch Trn1/Inf2 instances ` - - :ref:`Step 1. Install drivers and tools ` - - :ref:`Step 2. Install transformers-neuronx and its dependencies ` - - :ref:`Step 3. Install vLLM from source ` - -.. _build_from_source_neuron: - -.. note:: - - The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with vLLM >= 0.5.3. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. - -Build from source ------------------ - -Following instructions are applicable to Neuron SDK 2.16 and beyond. - -.. _launch_instances: - -Step 0. Launch Trn1/Inf2 instances -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Here are the steps to launch trn1/inf2 instances, in order to install `PyTorch Neuron ("torch-neuronx") Setup on Ubuntu 22.04 LTS `_. - -- Please follow the instructions at `launch an Amazon EC2 Instance `_ to launch an instance. When choosing the instance type at the EC2 console, please make sure to select the correct instance type. 
-- To get more information about instances sizes and pricing see: `Trn1 web page `_, `Inf2 web page `_ -- Select Ubuntu Server 22.04 TLS AMI -- When launching a Trn1/Inf2, please adjust your primary EBS volume size to a minimum of 512GB. -- After launching the instance, follow the instructions in `Connect to your instance `_ to connect to the instance - -.. _install_drivers: - -Step 1. Install drivers and tools -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The installation of drivers and tools wouldn't be necessary, if `Deep Learning AMI Neuron `_ is installed. In case the drivers and tools are not installed on the operating system, follow the steps below: - -.. code-block:: console - - # Configure Linux for Neuron repository updates - . /etc/os-release - sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <`_ will be the backend to support inference on trn1/inf2 instances. -Follow the steps below to install transformer-neuronx package and its dependencies. - -.. code-block:: console - - # Install Python venv - sudo apt-get install -y python3.10-venv g++ - - # Create Python venv - python3.10 -m venv aws_neuron_venv_pytorch - - # Activate Python venv - source aws_neuron_venv_pytorch/bin/activate - - # Install Jupyter notebook kernel - pip install ipykernel - python3.10 -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)" - pip install jupyter notebook - pip install environment_kernels - - # Set pip repository pointing to the Neuron repository - python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com - - # Install wget, awscli - python -m pip install wget - python -m pip install awscli - - # Update Neuron Compiler and Framework - python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx - -.. _install_vllm: - -Step 3. 
Install vLLM from source -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Once neuronx-cc and transformers-neuronx packages are installed, we will be able to install vllm as follows: - -.. code-block:: console - - $ git clone https://github.com/vllm-project/vllm.git - $ cd vllm - $ pip install -U -r requirements-neuron.txt - $ VLLM_TARGET_DEVICE="neuron" pip install . - -If neuron packages are detected correctly in the installation process, ``vllm-0.3.0+neuron212`` will be installed. diff --git a/docs/source/getting_started/openvino-installation.md b/docs/source/getting_started/openvino-installation.md new file mode 100644 index 0000000000000..8b43c0a90447f --- /dev/null +++ b/docs/source/getting_started/openvino-installation.md @@ -0,0 +1,104 @@ +(installation-openvino)= + +# Installation with OpenVINO + +vLLM powered by OpenVINO supports all LLM models from {doc}`vLLM supported models list <../models/supported_models>` and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)). OpenVINO vLLM backend supports the following advanced vLLM features: + +- Prefix caching (`--enable-prefix-caching`) +- Chunked prefill (`--enable-chunked-prefill`) + +**Table of contents**: + +- [Requirements](#openvino-backend-requirements) +- [Quick start using Dockerfile](#openvino-backend-quick-start-dockerfile) +- [Build from source](#install-openvino-backend-from-source) +- [Performance tips](#openvino-backend-performance-tips) +- [Limitations](#openvino-backend-limitations) + +(openvino-backend-requirements)= + +## Requirements + +- OS: Linux +- Instruction set architecture (ISA) requirement: at least AVX2. + +(openvino-backend-quick-start-dockerfile)= + +## Quick start using Dockerfile + +```console +$ docker build -f Dockerfile.openvino -t vllm-openvino-env . 
+$ docker run -it --rm vllm-openvino-env +``` + +(install-openvino-backend-from-source)= + +## Install from source + +- First, install Python. For example, on Ubuntu 22.04, you can run: + + ```console + $ sudo apt-get update -y + $ sudo apt-get install python3 + ``` + +- Second, install the prerequisites for the vLLM OpenVINO backend installation: + + ```console + $ pip install --upgrade pip + $ pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu + ``` + +- Finally, install vLLM with OpenVINO backend: + + ```console + $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v . + ``` + +- [Optional] To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: [https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html). + +(openvino-backend-performance-tips)= + +## Performance tips + +### vLLM OpenVINO backend environment variables + +- `VLLM_OPENVINO_DEVICE` to specify which device to utilize for the inference. If there are multiple GPUs in the system, additional indexes can be used to choose the proper one (e.g., `VLLM_OPENVINO_DEVICE=GPU.1`). If the value is not specified, CPU device is used by default. +- `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON` to enable U8 weights compression during model loading stage. By default, compression is turned off. You can also export model with different compression techniques using `optimum-cli` and pass the exported folder as `<model_id>` + +### CPU performance tips + +CPU uses the following environment variables to control behavior: + +- `VLLM_OPENVINO_KVCACHE_SPACE` to specify the KV Cache size (e.g., `VLLM_OPENVINO_KVCACHE_SPACE=40` means 40 GB space for KV cache), a larger setting will allow vLLM to run more requests in parallel.
This parameter should be set based on the hardware configuration and memory management pattern of users. +- `VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8` to control KV cache precision. By default, FP16 / BF16 is used depending on platform. + +To enable better TPOT / TTFT latency, you can use vLLM's chunked prefill feature (`--enable-chunked-prefill`). Based on the experiments, the recommended batch size is `256` (`--max-num-batched-tokens`) + +OpenVINO best known configuration for CPU is: + +```console +$ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \ + python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256 +``` + +### GPU performance tips + +GPU device implements the logic for automatic detection of available GPU memory and, by default, tries to reserve as much memory as possible for the KV cache (taking into account `gpu_memory_utilization` option). However, this behavior can be overridden by explicitly specifying the desired amount of memory for the KV cache using `VLLM_OPENVINO_KVCACHE_SPACE` environment variable (e.g, `VLLM_OPENVINO_KVCACHE_SPACE=8` means 8 GB space for KV cache). + +Currently, the best performance using GPU can be achieved with the default vLLM execution parameters for models with quantized weights (8 and 4-bit integer data types are supported) and `preemption-mode=swap`. + +OpenVINO best known configuration for GPU is: + +```console +$ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \ + python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json +``` + +(openvino-backend-limitations)= + +## Limitations + +- LoRA serving is not supported. +- Only LLM models are currently supported. 
LLaVa and encoder-decoder models are not currently enabled in vLLM OpenVINO integration. +- Tensor and pipeline parallelism are not currently enabled in vLLM integration. diff --git a/docs/source/getting_started/openvino-installation.rst b/docs/source/getting_started/openvino-installation.rst deleted file mode 100644 index 5eeb7c78f7e51..0000000000000 --- a/docs/source/getting_started/openvino-installation.rst +++ /dev/null @@ -1,116 +0,0 @@ -.. _installation_openvino: - -Installation with OpenVINO -========================== - -vLLM powered by OpenVINO supports all LLM models from :doc:`vLLM supported models list <../models/supported_models>` and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs (`the list of supported GPUs `_). OpenVINO vLLM backend supports the following advanced vLLM features: - -- Prefix caching (``--enable-prefix-caching``) -- Chunked prefill (``--enable-chunked-prefill``) - -**Table of contents**: - -- :ref:`Requirements ` -- :ref:`Quick start using Dockerfile ` -- :ref:`Build from source ` -- :ref:`Performance tips ` -- :ref:`Limitations ` - -.. _openvino_backend_requirements: - -Requirements ------------- - -* OS: Linux -* Instruction set architecture (ISA) requirement: at least AVX2. - -.. _openvino_backend_quick_start_dockerfile: - -Quick start using Dockerfile ----------------------------- - -.. code-block:: console - - $ docker build -f Dockerfile.openvino -t vllm-openvino-env . - $ docker run -it --rm vllm-openvino-env - -.. _install_openvino_backend_from_source: - -Install from source -------------------- - -- First, install Python. For example, on Ubuntu 22.04, you can run: - - .. code-block:: console - - $ sudo apt-get update -y - $ sudo apt-get install python3 - -- Second, install prerequisites vLLM OpenVINO backend installation: - - .. 
code-block:: console - - $ pip install --upgrade pip - $ pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu - -- Finally, install vLLM with OpenVINO backend: - - .. code-block:: console - - $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v . - -- [Optional] To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: `https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html `_. - -.. _openvino_backend_performance_tips: - -Performance tips ----------------- - -vLLM OpenVINO backend environment variables -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -- ``VLLM_OPENVINO_DEVICE`` to specify which device utilize for the inference. If there are multiple GPUs in the system, additional indexes can be used to choose the proper one (e.g, ``VLLM_OPENVINO_DEVICE=GPU.1``). If the value is not specified, CPU device is used by default. - -- ``VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON`` to enable U8 weights compression during model loading stage. By default, compression is turned off. You can also export model with different compression techniques using `optimum-cli` and pass exported folder as `` - -CPU performance tips -~~~~~~~~~~~~~~~~~~~~ - -CPU uses the following environment variables to control behavior: - -- ``VLLM_OPENVINO_KVCACHE_SPACE`` to specify the KV Cache size (e.g, ``VLLM_OPENVINO_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. - -- ``VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8`` to control KV cache precision. By default, FP16 / BF16 is used depending on platform. - -To enable better TPOT / TTFT latency, you can use vLLM's chunked prefill feature (``--enable-chunked-prefill``). 
Based on the experiments, the recommended batch size is ``256`` (``--max-num-batched-tokens``) - -OpenVINO best known configuration for CPU is: - -.. code-block:: console - - $ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \ - python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256 - -GPU performance tips -~~~~~~~~~~~~~~~~~~~~ -GPU device implements the logic for automatic detection of available GPU memory and, by default, tries to reserve as much memory as possible for the KV cache (taking into account ``gpu_memory_utilization`` option). However, this behavior can be overridden by explicitly specifying the desired amount of memory for the KV cache using ``VLLM_OPENVINO_KVCACHE_SPACE`` environment variable (e.g, ``VLLM_OPENVINO_KVCACHE_SPACE=8`` means 8 GB space for KV cache). - -Currently, the best performance using GPU can be achieved with the default vLLM execution parameters for models with quantized weights (8 and 4-bit integer data types are supported) and `preemption-mode=swap`. - -OpenVINO best known configuration for GPU is: - -.. code-block:: console - - $ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \ - python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json - -.. _openvino_backend_limitations: - -Limitations ------------ - -- LoRA serving is not supported. - -- Only LLM models are currently supported. LLaVa and encoder-decoder models are not currently enabled in vLLM OpenVINO integration. - -- Tensor and pipeline parallelism are not currently enabled in vLLM integration. 
diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md new file mode 100644 index 0000000000000..e3508bce68c2d --- /dev/null +++ b/docs/source/getting_started/quickstart.md @@ -0,0 +1,174 @@ +(quickstart)= + +# Quickstart + +This guide will help you quickly get started with vLLM to: + +- [Run offline batched inference](#offline-batched-inference) +- [Run OpenAI-compatible inference](#openai-compatible-server) + +## Prerequisites + +- OS: Linux +- Python: 3.9 -- 3.12 +- GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) + +## Installation + +You can install vLLM using pip. It's recommended to use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. + +```console +$ conda create -n myenv python=3.10 -y +$ conda activate myenv +$ pip install vllm +``` + +Please refer to the {ref}`installation documentation <installation>` for more details on installing vLLM. + +(offline-batched-inference)= + +## Offline Batched Inference + +With vLLM installed, you can start generating texts for a list of input prompts (i.e. offline batch inferencing). The example script for this section can be found [here](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference.py). + +The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`: + +- {class}`~vllm.LLM` is the main class for running offline inference with vLLM engine. +- {class}`~vllm.SamplingParams` specifies the parameters for the sampling process. + +```python +from vllm import LLM, SamplingParams +``` + +The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`.
You can find more information about the sampling parameters [here](https://docs.vllm.ai/en/stable/dev/sampling_params.html). + +```python +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) +``` + +The {class}`~vllm.LLM` class initializes vLLM's engine and the [OPT-125M model](https://arxiv.org/abs/2205.01068) for offline inference. The list of supported models can be found [here](#supported-models). + +```python +llm = LLM(model="facebook/opt-125m") +``` + +```{note} +By default, vLLM downloads models from [HuggingFace](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine. +``` + +Now, the fun part! The outputs are generated using `llm.generate`. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of `RequestOutput` objects, which include all of the output tokens. + +```python +outputs = llm.generate(prompts, sampling_params) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +(openai-compatible-server)= + +## OpenAI-Compatible Server + +vLLM can be deployed as a server that implements the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API. +By default, it starts the server at `http://localhost:8000`. You can specify the address with `--host` and `--port` arguments. 
The server currently hosts one model at a time and implements endpoints such as [list models](https://platform.openai.com/docs/api-reference/models/list), [create chat completion](https://platform.openai.com/docs/api-reference/chat/completions/create), and [create completion](https://platform.openai.com/docs/api-reference/completions/create). + +Run the following command to start the vLLM server with the [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) model: + +```console +$ vllm serve Qwen/Qwen2.5-1.5B-Instruct +``` + +```{note} +By default, the server uses a predefined chat template stored in the tokenizer. You can learn about overriding it [here](https://github.com/vllm-project/vllm/blob/main/docs/source/serving/openai_compatible_server.md#chat-template). +``` + +This server can be queried in the same format as OpenAI API. For example, to list the models: + +```console +$ curl http://localhost:8000/v1/models +``` + +You can pass in the argument `--api-key` or environment variable `VLLM_API_KEY` to enable the server to check for an API key in the header. + +### OpenAI Completions API with vLLM + +Once your server is started, you can query the model with input prompts: + +```console +$ curl http://localhost:8000/v1/completions \ +$ -H "Content-Type: application/json" \ +$ -d '{ +$ "model": "Qwen/Qwen2.5-1.5B-Instruct", +$ "prompt": "San Francisco is a", +$ "max_tokens": 7, +$ "temperature": 0 +$ }' +``` + +Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` python package: + +```python +from openai import OpenAI + +# Modify OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) +completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct", + prompt="San Francisco is a") +print("Completion result:", completion) +``` + +A more detailed client example can be found [here](https://github.com/vllm-project/vllm/blob/main/examples/openai_completion_client.py). + +### OpenAI Chat Completions API with vLLM + +vLLM is designed to also support the OpenAI Chat Completions API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations. + +You can use the [create chat completion](https://platform.openai.com/docs/api-reference/chat/completions/create) endpoint to interact with the model: + +```console +$ curl http://localhost:8000/v1/chat/completions \ +$ -H "Content-Type: application/json" \ +$ -d '{ +$ "model": "Qwen/Qwen2.5-1.5B-Instruct", +$ "messages": [ +$ {"role": "system", "content": "You are a helpful assistant."}, +$ {"role": "user", "content": "Who won the world series in 2020?"} +$ ] +$ }' +``` + +Alternatively, you can use the `openai` python package: + +```python +from openai import OpenAI +# Set OpenAI's API key and API base to use vLLM's API server. 
+openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) + +chat_response = client.chat.completions.create( + model="Qwen/Qwen2.5-1.5B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Tell me a joke."}, + ] +) +print("Chat response:", chat_response) +``` diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst deleted file mode 100644 index 0c0491c860563..0000000000000 --- a/docs/source/getting_started/quickstart.rst +++ /dev/null @@ -1,181 +0,0 @@ -.. _quickstart: - -========== -Quickstart -========== - -This guide will help you quickly get started with vLLM to: - -* :ref:`Run offline batched inference ` -* :ref:`Run OpenAI-compatible inference ` - -Prerequisites --------------- -- OS: Linux -- Python: 3.9 -- 3.12 -- GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) - -Installation --------------- - -You can install vLLM using pip. It's recommended to use `conda `_ to create and manage Python environments. - -.. code-block:: console - - $ conda create -n myenv python=3.10 -y - $ conda activate myenv - $ pip install vllm - -Please refer to the :ref:`installation documentation ` for more details on installing vLLM. - -.. _offline_batched_inference: - -Offline Batched Inference -------------------------- - -With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). The example script for this section can be found `here `__. - -The first line of this example imports the classes :class:`~vllm.LLM` and :class:`~vllm.SamplingParams`: - -- :class:`~vllm.LLM` is the main class for running offline inference with vLLM engine. -- :class:`~vllm.SamplingParams` specifies the parameters for the sampling process. - -.. 
code-block:: python - - from vllm import LLM, SamplingParams - -The next section defines a list of input prompts and sampling parameters for text generation. The `sampling temperature `_ is set to ``0.8`` and the `nucleus sampling probability `_ is set to ``0.95``. You can find more information about the sampling parameters `here `__. - -.. code-block:: python - - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - -The :class:`~vllm.LLM` class initializes vLLM's engine and the `OPT-125M model `_ for offline inference. The list of supported models can be found :ref:`here `. - -.. code-block:: python - - llm = LLM(model="facebook/opt-125m") - -.. note:: - - By default, vLLM downloads models from `HuggingFace `_. If you would like to use models from `ModelScope `_, set the environment variable ``VLLM_USE_MODELSCOPE`` before initializing the engine. - -Now, the fun part! The outputs are generated using ``llm.generate``. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all of the output tokens. - -.. code-block:: python - - outputs = llm.generate(prompts, sampling_params) - - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -.. _openai_compatible_server: - -OpenAI-Compatible Server ------------------------- - -vLLM can be deployed as a server that implements the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API. -By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. 
The server currently hosts one model at a time and implements endpoints such as `list models `_, `create chat completion `_, and `create completion `_ endpoints. - -Run the following command to start the vLLM server with the `Qwen2.5-1.5B-Instruct `_ model: - -.. code-block:: console - - $ vllm serve Qwen/Qwen2.5-1.5B-Instruct - -.. note:: - - By default, the server uses a predefined chat template stored in the tokenizer. You can learn about overriding it `here `__. - -This server can be queried in the same format as OpenAI API. For example, to list the models: - -.. code-block:: console - - $ curl http://localhost:8000/v1/models - -You can pass in the argument ``--api-key`` or environment variable ``VLLM_API_KEY`` to enable the server to check for API key in the header. - -OpenAI Completions API with vLLM -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Once your server is started, you can query the model with input prompts: - -.. code-block:: console - - $ curl http://localhost:8000/v1/completions \ - $ -H "Content-Type: application/json" \ - $ -d '{ - $ "model": "Qwen/Qwen2.5-1.5B-Instruct", - $ "prompt": "San Francisco is a", - $ "max_tokens": 7, - $ "temperature": 0 - $ }' - -Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the ``openai`` python package: - -.. code-block:: python - - from openai import OpenAI - - # Modify OpenAI's API key and API base to use vLLM's API server. - openai_api_key = "EMPTY" - openai_api_base = "http://localhost:8000/v1" - client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, - ) - completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct", - prompt="San Francisco is a") - print("Completion result:", completion) - -A more detailed client example can be found `here `__. 
- -OpenAI Chat Completions API with vLLM -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -vLLM is designed to also support the OpenAI Chat Completions API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations. - -You can use the `create chat completion `_ endpoint to interact with the model: - -.. code-block:: console - - $ curl http://localhost:8000/v1/chat/completions \ - $ -H "Content-Type: application/json" \ - $ -d '{ - $ "model": "Qwen/Qwen2.5-1.5B-Instruct", - $ "messages": [ - $ {"role": "system", "content": "You are a helpful assistant."}, - $ {"role": "user", "content": "Who won the world series in 2020?"} - $ ] - $ }' - -Alternatively, you can use the ``openai`` python package: - -.. code-block:: python - - from openai import OpenAI - # Set OpenAI's API key and API base to use vLLM's API server. - openai_api_key = "EMPTY" - openai_api_base = "http://localhost:8000/v1" - - client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, - ) - - chat_response = client.chat.completions.create( - model="Qwen/Qwen2.5-1.5B-Instruct", - messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "Tell me a joke."}, - ] - ) - print("Chat response:", chat_response) diff --git a/docs/source/getting_started/tpu-installation.md b/docs/source/getting_started/tpu-installation.md new file mode 100644 index 0000000000000..f4916460026d1 --- /dev/null +++ b/docs/source/getting_started/tpu-installation.md @@ -0,0 +1,193 @@ +(installation-tpu)= + +# Installation with TPU + +Tensor Processing Units (TPUs) are Google's custom-developed application-specific +integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs +are available in different versions each with different hardware specifications. 
+For more information about TPUs, see [TPU System Architecture](https://cloud.google.com/tpu/docs/system-architecture-tpu-vm). +For more information on the TPU versions supported with vLLM, see: + +- [TPU v6e](https://cloud.google.com/tpu/docs/v6e) +- [TPU v5e](https://cloud.google.com/tpu/docs/v5e) +- [TPU v5p](https://cloud.google.com/tpu/docs/v5p) +- [TPU v4](https://cloud.google.com/tpu/docs/v4) + +These TPU versions allow you to configure the physical arrangements of the TPU +chips. This can improve throughput and networking performance. For more +information see: + +- [TPU v6e topologies](https://cloud.google.com/tpu/docs/v6e#configurations) +- [TPU v5e topologies](https://cloud.google.com/tpu/docs/v5e#tpu-v5e-config) +- [TPU v5p topologies](https://cloud.google.com/tpu/docs/v5p#tpu-v5p-config) +- [TPU v4 topologies](https://cloud.google.com/tpu/docs/v4#tpu-v4-config) + +In order for you to use Cloud TPUs you need to have TPU quota granted to your +Google Cloud Platform project. TPU quotas specify how many TPUs you can use in a +GCP project and are specified in terms of TPU version, the number of TPUs you +want to use, and quota type. For more information, see [TPU quota](https://cloud.google.com/tpu/docs/quota#tpu_quota). + +For TPU pricing information, see [Cloud TPU pricing](https://cloud.google.com/tpu/pricing). + +You may need additional persistent storage for your TPU VMs. For more +information, see [Storage options for Cloud TPU data](https://cloud.google.com/tpu/docs/storage-options). + +## Requirements + +- Google Cloud TPU VM +- TPU versions: v6e, v5e, v5p, v4 +- Python: 3.10 or newer + +### Provision Cloud TPUs + +You can provision Cloud TPUs using the [Cloud TPU API](https://cloud.google.com/tpu/docs/reference/rest) +or the [queued resources](https://cloud.google.com/tpu/docs/queued-resources) +API. This section shows how to create TPUs using the queued resource API.
For +more information about using the Cloud TPU API, see [Create a Cloud TPU using the Create Node API](https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#create-node-api). +Queued resources enable you to request Cloud TPU resources in a queued manner. +When you request queued resources, the request is added to a queue maintained by +the Cloud TPU service. When the requested resource becomes available, it's +assigned to your Google Cloud project for your immediate exclusive use. + +```{note} +In all of the following commands, replace the ALL CAPS parameter names with +appropriate values. See the parameter descriptions table for more information. +``` + +## Provision a Cloud TPU with the queued resource API + +Create a TPU v5e with 4 TPU chips: + +```console +gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \ +--node-id TPU_NAME \ +--project PROJECT_ID \ +--zone ZONE \ +--accelerator-type ACCELERATOR_TYPE \ +--runtime-version RUNTIME_VERSION \ +--service-account SERVICE_ACCOUNT +``` + +```{eval-rst} +.. list-table:: Parameter descriptions + :header-rows: 1 + + * - Parameter name + - Description + * - QUEUED_RESOURCE_ID + - The user-assigned ID of the queued resource request. + * - TPU_NAME + - The user-assigned name of the TPU which is created when the queued + resource request is allocated. + * - PROJECT_ID + - Your Google Cloud project + * - ZONE + - The GCP zone where you want to create your Cloud TPU. The value you use + depends on the version of TPUs you are using. For more information, see + `TPU regions and zones `_ + * - ACCELERATOR_TYPE + - The TPU version you want to use. Specify the TPU version, for example + `v5litepod-4` specifies a v5e TPU with 4 cores. For more information, + see `TPU versions `_. + * - RUNTIME_VERSION + - The TPU VM runtime version to use. For more information see `TPU VM images `_. + * - SERVICE_ACCOUNT + - The email address for your service account. 
You can find it in the IAM + Cloud Console under *Service Accounts*. For example: + `tpu-service-account@PROJECT_ID.iam.gserviceaccount.com` +``` + +Connect to your TPU using SSH: + +```bash +gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE +``` + +Install Miniconda: + +```bash +wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh +bash Miniconda3-latest-Linux-x86_64.sh +source ~/.bashrc +``` + +Create and activate a Conda environment for vLLM: + +```bash +conda create -n vllm python=3.10 -y +conda activate vllm +``` + +Clone the vLLM repository and go to the vLLM directory: + +```bash +git clone https://github.com/vllm-project/vllm.git && cd vllm +``` + +Uninstall the existing `torch` and `torch_xla` packages: + +```bash +pip uninstall torch torch-xla -y +``` + +Install build dependencies: + +```bash +pip install -r requirements-tpu.txt +sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev +``` + +Run the setup script: + +```bash +VLLM_TARGET_DEVICE="tpu" python setup.py develop +``` + +## Provision Cloud TPUs with GKE + +For more information about using TPUs with GKE, see +[https://cloud.google.com/kubernetes-engine/docs/how-to/tpus](https://cloud.google.com/kubernetes-engine/docs/how-to/tpus), +[https://cloud.google.com/kubernetes-engine/docs/concepts/tpus](https://cloud.google.com/kubernetes-engine/docs/concepts/tpus), and +[https://cloud.google.com/kubernetes-engine/docs/concepts/plan-tpus](https://cloud.google.com/kubernetes-engine/docs/concepts/plan-tpus). + +(build-docker-tpu)= + +## Build a docker image with {code}`Dockerfile.tpu` + +You can use [Dockerfile.tpu](https://github.com/vllm-project/vllm/blob/main/Dockerfile.tpu) +to build a Docker image with TPU support. + +```console +$ docker build -f Dockerfile.tpu -t vllm-tpu . +``` + +Run the Docker image with the following command: + +```console +$ # Make sure to add `--privileged --net host --shm-size=16G`. +$ docker run --privileged --net host --shm-size=16G -it vllm-tpu +``` + +```{note} +Since TPU relies on XLA which requires static shapes, vLLM bucketizes the +possible input shapes and compiles an XLA graph for each shape. The +compilation time may take 20~30 minutes in the first run.
However, the +compilation time reduces to ~5 minutes afterwards because the XLA graphs are +cached in the disk (in {code}`VLLM_XLA_CACHE_PATH` or {code}`~/.cache/vllm/xla_cache` by default). +``` + +````{tip} +If you encounter the following error: + +```console +from torch._C import * # noqa: F403 +ImportError: libopenblas.so.0: cannot open shared object file: No such +file or directory +``` + +Install OpenBLAS with the following command: + +```console +$ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev +``` +```` diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst deleted file mode 100644 index 22cc684a1c778..0000000000000 --- a/docs/source/getting_started/tpu-installation.rst +++ /dev/null @@ -1,200 +0,0 @@ -.. _installation_tpu: - -##################### -Installation with TPU -##################### - -Tensor Processing Units (TPUs) are Google's custom-developed application-specific -integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs -are available in different versions each with different hardware specifications. -For more information about TPUs, see `TPU System Architecture `_. -For more information on the TPU versions supported with vLLM, see: - -* `TPU v6e `_ -* `TPU v5e `_ -* `TPU v5p `_ -* `TPU v4 `_ - -These TPU versions allow you to configure the physical arrangements of the TPU -chips. This can improve throughput and networking performance. For more -information see: - -* `TPU v6e topologies `_ -* `TPU v5e topologies `_ -* `TPU v5p topologies `_ -* `TPU v4 topologies `_ - -In order for you to use Cloud TPUs you need to have TPU quota granted to your -Google Cloud Platform project. TPU quotas specify how many TPUs you can use in a -GPC project and are specified in terms of TPU version, the number of TPU you -want to use, and quota type. For more information, see `TPU quota `_. - -For TPU pricing information, see `Cloud TPU pricing `_. 
- -You may need additional persistent storage for your TPU VMs. For more -information, see `Storage options for Cloud TPU data `_. - -Requirements ------------- - -* Google Cloud TPU VM -* TPU versions: v6e, v5e, v5p, v4 -* Python: 3.10 or newer - -Provision Cloud TPUs -==================== - -You can provision Cloud TPUs using the `Cloud TPU API `_ -or the `queued resources `_ -API. This section shows how to create TPUs using the queued resource API. For -more information about using the Cloud TPU API, see `Create a Cloud TPU using the Create Node API `_. -Queued resources enable you to request Cloud TPU resources in a queued manner. -When you request queued resources, the request is added to a queue maintained by -the Cloud TPU service. When the requested resource becomes available, it's -assigned to your Google Cloud project for your immediate exclusive use. - -.. note:: - In all of the following commands, replace the ALL CAPS parameter names with - appropriate values. See the parameter descriptions table for more information. - -Provision a Cloud TPU with the queued resource API --------------------------------------------------- -Create a TPU v5e with 4 TPU chips: - -.. code-block:: console - - gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \ - --node-id TPU_NAME \ - --project PROJECT_ID \ - --zone ZONE \ - --accelerator-type ACCELERATOR_TYPE \ - --runtime-version RUNTIME_VERSION \ - --service-account SERVICE_ACCOUNT - - -.. list-table:: Parameter descriptions - :header-rows: 1 - - * - Parameter name - - Description - * - QUEUED_RESOURCE_ID - - The user-assigned ID of the queued resource request. - * - TPU_NAME - - The user-assigned name of the TPU which is created when the queued - resource request is allocated. - * - PROJECT_ID - - Your Google Cloud project - * - ZONE - - The GCP zone where you want to create your Cloud TPU. The value you use - depends on the version of TPUs you are using. 
For more information, see - `TPU regions and zones `_ - * - ACCELERATOR_TYPE - - The TPU version you want to use. Specify the TPU version, for example - `v5litepod-4` specifies a v5e TPU with 4 cores. For more information, - see `TPU versions `_. - * - RUNTIME_VERSION - - The TPU VM runtime version to use. For more information see `TPU VM images `_. - * - SERVICE_ACCOUNT - - The email address for your service account. You can find it in the IAM - Cloud Console under *Service Accounts*. For example: - `tpu-service-account@.iam.gserviceaccount.com` - -Connect to your TPU using SSH: - -.. code-block:: bash - - gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE - -Install Miniconda - -.. code-block:: bash - - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh - bash Miniconda3-latest-Linux-x86_64.sh - source ~/.bashrc - -Create and activate a Conda environment for vLLM: - -.. code-block:: bash - - conda create -n vllm python=3.10 -y - conda activate vllm - -Clone the vLLM repository and go to the vLLM directory: - -.. code-block:: bash - - git clone https://github.com/vllm-project/vllm.git && cd vllm - -Uninstall the existing `torch` and `torch_xla` packages: - -.. code-block:: bash - - pip uninstall torch torch-xla -y - -Install build dependencies: - -.. code-block:: bash - - pip install -r requirements-tpu.txt - sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev - -Run the setup script: - -.. code-block:: bash - - VLLM_TARGET_DEVICE="tpu" python setup.py develop - - -Provision Cloud TPUs with GKE ------------------------------ - -For more information about using TPUs with GKE, see -https://cloud.google.com/kubernetes-engine/docs/how-to/tpus -https://cloud.google.com/kubernetes-engine/docs/concepts/tpus -https://cloud.google.com/kubernetes-engine/docs/concepts/plan-tpus - -.. 
_build_docker_tpu: - -Build a docker image with :code:`Dockerfile.tpu` ------------------------------------------------- - -You can use `Dockerfile.tpu `_ -to build a Docker image with TPU support. - -.. code-block:: console - - $ docker build -f Dockerfile.tpu -t vllm-tpu . - -Run the Docker image with the following command: - -.. code-block:: console - - $ # Make sure to add `--privileged --net host --shm-size=16G`. - $ docker run --privileged --net host --shm-size=16G -it vllm-tpu - -.. note:: - - Since TPU relies on XLA which requires static shapes, vLLM bucketizes the - possible input shapes and compiles an XLA graph for each shape. The - compilation time may take 20~30 minutes in the first run. However, the - compilation time reduces to ~5 minutes afterwards because the XLA graphs are - cached in the disk (in :code:`VLLM_XLA_CACHE_PATH` or :code:`~/.cache/vllm/xla_cache` by default). - -.. tip:: - - If you encounter the following error: - - .. code-block:: console - - from torch._C import * # noqa: F403 - ImportError: libopenblas.so.0: cannot open shared object file: No such - file or directory - - - Install OpenBLAS with the following command: - - .. code-block:: console - - $ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev - diff --git a/docs/source/getting_started/xpu-installation.md b/docs/source/getting_started/xpu-installation.md new file mode 100644 index 0000000000000..5c57509aef2db --- /dev/null +++ b/docs/source/getting_started/xpu-installation.md @@ -0,0 +1,74 @@ +(installation-xpu)= + +# Installation with XPU + +vLLM initially supports basic model inferencing and serving on Intel GPU platform. + +Table of contents: + +1. [Requirements](#xpu-backend-requirements) +2. [Quick start using Dockerfile](#xpu-backend-quick-start-dockerfile) +3. 
[Build from source](#build-xpu-backend-from-source) + +(xpu-backend-requirements)= + +## Requirements + +- OS: Linux +- Supported Hardware: Intel Data Center GPU, Intel ARC GPU +- OneAPI requirements: oneAPI 2024.2 + +(xpu-backend-quick-start-dockerfile)= + +## Quick start using Dockerfile + +```console +$ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g . +$ docker run -it \ + --rm \ + --network=host \ + --device /dev/dri \ + -v /dev/dri/by-path:/dev/dri/by-path \ + vllm-xpu-env +``` + +(build-xpu-backend-from-source)= + +## Build from source + +- First, install the required driver and Intel oneAPI 2024.2 or later. +- Second, install Python packages for vLLM XPU backend building: + +```console +$ source /opt/intel/oneapi/setvars.sh +$ pip install --upgrade pip +$ pip install -v -r requirements-xpu.txt +``` + +- Finally, build and install vLLM XPU backend: + +```console +$ VLLM_TARGET_DEVICE=xpu python setup.py install +``` + +```{note} +- FP16 is the default data type in the current XPU backend. The BF16 data + type will be supported in the future. +``` + +## Distributed inference and serving + +The XPU platform supports tensor-parallel inference/serving and also supports pipeline parallelism as a beta feature for online serving. Ray is required as the distributed runtime backend. For example, a reference execution looks like the following: + +```console +$ python -m vllm.entrypoints.openai.api_server \ +$ --model=facebook/opt-13b \ +$ --dtype=bfloat16 \ +$ --device=xpu \ +$ --max_model_len=1024 \ +$ --distributed-executor-backend=ray \ +$ --pipeline-parallel-size=2 \ +$ -tp=8 +``` + +By default, a Ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equal to `parallel_config.world_size`. We recommend properly starting a Ray cluster before execution, referring to the helper [script](https://github.com/vllm-project/vllm/tree/main/examples/run_cluster.sh).
diff --git a/docs/source/getting_started/xpu-installation.rst b/docs/source/getting_started/xpu-installation.rst deleted file mode 100644 index b1868acbc84b0..0000000000000 --- a/docs/source/getting_started/xpu-installation.rst +++ /dev/null @@ -1,80 +0,0 @@ -.. _installation_xpu: - -Installation with XPU -======================== - -vLLM initially supports basic model inferencing and serving on Intel GPU platform. - -Table of contents: - -#. :ref:`Requirements ` -#. :ref:`Quick start using Dockerfile ` -#. :ref:`Build from source ` - -.. _xpu_backend_requirements: - -Requirements ------------- - -* OS: Linux -* Supported Hardware: Intel Data Center GPU, Intel ARC GPU -* OneAPI requirements: oneAPI 2024.2 - -.. _xpu_backend_quick_start_dockerfile: - -Quick start using Dockerfile ----------------------------- - -.. code-block:: console - - $ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g . - $ docker run -it \ - --rm \ - --network=host \ - --device /dev/dri \ - -v /dev/dri/by-path:/dev/dri/by-path \ - vllm-xpu-env - -.. _build_xpu_backend_from_source: - -Build from source ------------------ - -- First, install required driver and intel OneAPI 2024.2 or later. - -- Second, install Python packages for vLLM XPU backend building: - -.. code-block:: console - - $ source /opt/intel/oneapi/setvars.sh - $ pip install --upgrade pip - $ pip install -v -r requirements-xpu.txt - -- Finally, build and install vLLM XPU backend: - -.. code-block:: console - - $ VLLM_TARGET_DEVICE=xpu python setup.py install - -.. note:: - - FP16 is the default data type in the current XPU backend. The BF16 data - type will be supported in the future. - - -Distributed inference and serving ---------------------------------- - -XPU platform supports tensor-parallel inference/serving and also supports pipeline parallel as a beta feature for online serving. We requires Ray as the distributed runtime backend. For example, a reference execution likes following: - -.. 
code-block:: console - - $ python -m vllm.entrypoints.openai.api_server \ - $ --model=facebook/opt-13b \ - $ --dtype=bfloat16 \ - $ --device=xpu \ - $ --max_model_len=1024 \ - $ --distributed-executor-backend=ray \ - $ --pipeline-parallel-size=2 \ - $ -tp=8 - -By default, a ray instance will be launched automatically if no existing one is detected in system, with ``num-gpus`` equals to ``parallel_config.world_size``. We recommend properly starting a ray cluster before execution, referring helper `script `_. diff --git a/docs/source/index.md b/docs/source/index.md new file mode 100644 index 0000000000000..34f9c4caebe6f --- /dev/null +++ b/docs/source/index.md @@ -0,0 +1,200 @@ +# Welcome to vLLM! + +```{figure} ./assets/logos/vllm-logo-text-light.png +:align: center +:alt: vLLM +:class: no-scaled-link +:width: 60% +``` + +```{raw} html +

+Easy, fast, and cheap LLM serving for everyone + +

+ +

+ +Star +Watch +Fork +

+``` + +vLLM is a fast and easy-to-use library for LLM inference and serving. + +vLLM is fast with: + +- State-of-the-art serving throughput +- Efficient management of attention key and value memory with **PagedAttention** +- Continuous batching of incoming requests +- Fast model execution with CUDA/HIP graph +- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8 +- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer. +- Speculative decoding +- Chunked prefill + +vLLM is flexible and easy to use with: + +- Seamless integration with popular HuggingFace models +- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more +- Tensor parallelism and pipeline parallelism support for distributed inference +- Streaming outputs +- OpenAI-compatible API server +- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, PowerPC CPUs, TPU, and AWS Trainium and Inferentia Accelerators. +- Prefix caching support +- Multi-LoRA support + +For more information, check out the following: + +- [vLLM announcing blog post](https://vllm.ai) (intro to PagedAttention) +- [vLLM paper](https://arxiv.org/abs/2309.06180) (SOSP 2023) +- [How continuous batching enables 23x throughput in LLM inference while reducing p50 latency](https://www.anyscale.com/blog/continuous-batching-llm-inference) by Cade Daniel et al. +- [vLLM Meetups](#meetups).
+ +## Documentation + +```{toctree} +:caption: Getting Started +:maxdepth: 1 + +getting_started/installation +getting_started/amd-installation +getting_started/openvino-installation +getting_started/cpu-installation +getting_started/gaudi-installation +getting_started/arm-installation +getting_started/neuron-installation +getting_started/tpu-installation +getting_started/xpu-installation +getting_started/quickstart +getting_started/debugging +getting_started/examples/examples_index +``` + +```{toctree} +:caption: Serving +:maxdepth: 1 + +serving/openai_compatible_server +serving/deploying_with_docker +serving/deploying_with_k8s +serving/deploying_with_helm +serving/deploying_with_nginx +serving/distributed_serving +serving/metrics +serving/integrations +serving/tensorizer +serving/runai_model_streamer +``` + +```{toctree} +:caption: Models +:maxdepth: 1 + +models/supported_models +models/generative_models +models/pooling_models +models/adding_model +models/enabling_multimodal_inputs +``` + +```{toctree} +:caption: Usage +:maxdepth: 1 + +usage/lora +usage/multimodal_inputs +usage/tool_calling +usage/structured_outputs +usage/spec_decode +usage/compatibility_matrix +usage/performance +usage/faq +usage/engine_args +usage/env_vars +usage/usage_stats +usage/disagg_prefill +``` + +```{toctree} +:caption: Quantization +:maxdepth: 1 + +quantization/supported_hardware +quantization/auto_awq +quantization/bnb +quantization/gguf +quantization/int8 +quantization/fp8 +quantization/fp8_e5m2_kvcache +quantization/fp8_e4m3_kvcache +``` + +```{toctree} +:caption: Automatic Prefix Caching +:maxdepth: 1 + +automatic_prefix_caching/apc +automatic_prefix_caching/details +``` + +```{toctree} +:caption: Performance +:maxdepth: 1 + +performance/benchmarks +``` + +% Community: User community resources + +```{toctree} +:caption: Community +:maxdepth: 1 + +community/meetups +community/sponsors +``` + +% API Documentation: API reference aimed at vllm library usage + +```{toctree} +:caption: 
API Documentation +:maxdepth: 2 + +dev/sampling_params +dev/pooling_params +dev/offline_inference/offline_index +dev/engine/engine_index +``` + +% Design: docs about vLLM internals + +```{toctree} +:caption: Design +:maxdepth: 2 + +design/arch_overview +design/huggingface_integration +design/plugin_system +design/input_processing/model_inputs_index +design/kernel/paged_attention +design/multimodal/multimodal_index +design/multiprocessing +``` + +% For Developers: contributing to the vLLM project + +```{toctree} +:caption: For Developers +:maxdepth: 2 + +contributing/overview +contributing/profiling/profiling_index +contributing/dockerfile/dockerfile +``` + +# Indices and tables + +- {ref}`genindex` +- {ref}`modindex` diff --git a/docs/source/index.rst b/docs/source/index.rst deleted file mode 100644 index d812885aafea9..0000000000000 --- a/docs/source/index.rst +++ /dev/null @@ -1,194 +0,0 @@ -Welcome to vLLM! -================ - -.. figure:: ./assets/logos/vllm-logo-text-light.png - :width: 60% - :align: center - :alt: vLLM - :class: no-scaled-link - -.. raw:: html - -

- Easy, fast, and cheap LLM serving for everyone - -

- -

- - Star - Watch - Fork -

- - - -vLLM is a fast and easy-to-use library for LLM inference and serving. - -vLLM is fast with: - -* State-of-the-art serving throughput -* Efficient management of attention key and value memory with **PagedAttention** -* Continuous batching of incoming requests -* Fast model execution with CUDA/HIP graph -* Quantization: `GPTQ `_, `AWQ `_, INT4, INT8, and FP8 -* Optimized CUDA kernels, including integration with FlashAttention and FlashInfer. -* Speculative decoding -* Chunked prefill - -vLLM is flexible and easy to use with: - -* Seamless integration with popular HuggingFace models -* High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more -* Tensor parallelism and pipeline parallelism support for distributed inference -* Streaming outputs -* OpenAI-compatible API server -* Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, PowerPC CPUs, TPU, and AWS Trainium and Inferentia Accelerators. -* Prefix caching support -* Multi-lora support - -For more information, check out the following: - -* `vLLM announcing blog post `_ (intro to PagedAttention) -* `vLLM paper `_ (SOSP 2023) -* `How continuous batching enables 23x throughput in LLM inference while reducing p50 latency `_ by Cade Daniel et al. -* :ref:`vLLM Meetups `. - - -Documentation -------------- - -.. toctree:: - :maxdepth: 1 - :caption: Getting Started - - getting_started/installation - getting_started/amd-installation - getting_started/openvino-installation - getting_started/cpu-installation - getting_started/gaudi-installation - getting_started/arm-installation - getting_started/neuron-installation - getting_started/tpu-installation - getting_started/xpu-installation - getting_started/quickstart - getting_started/debugging - getting_started/examples/examples_index - -.. 
toctree:: - :maxdepth: 1 - :caption: Serving - - serving/openai_compatible_server - serving/deploying_with_docker - serving/deploying_with_k8s - serving/deploying_with_helm - serving/deploying_with_nginx - serving/distributed_serving - serving/metrics - serving/integrations - serving/tensorizer - serving/runai_model_streamer - -.. toctree:: - :maxdepth: 1 - :caption: Models - - models/supported_models - models/generative_models - models/pooling_models - models/adding_model - models/enabling_multimodal_inputs - -.. toctree:: - :maxdepth: 1 - :caption: Usage - - usage/lora - usage/multimodal_inputs - usage/tool_calling - usage/structured_outputs - usage/spec_decode - usage/compatibility_matrix - usage/performance - usage/faq - usage/engine_args - usage/env_vars - usage/usage_stats - usage/disagg_prefill - -.. toctree:: - :maxdepth: 1 - :caption: Quantization - - quantization/supported_hardware - quantization/auto_awq - quantization/bnb - quantization/gguf - quantization/int8 - quantization/fp8 - quantization/fp8_e5m2_kvcache - quantization/fp8_e4m3_kvcache - -.. toctree:: - :maxdepth: 1 - :caption: Automatic Prefix Caching - - automatic_prefix_caching/apc - automatic_prefix_caching/details - -.. toctree:: - :maxdepth: 1 - :caption: Performance - - performance/benchmarks - -.. Community: User community resources - -.. toctree:: - :maxdepth: 1 - :caption: Community - - community/meetups - community/sponsors - -.. API Documentation: API reference aimed at vllm library usage - -.. toctree:: - :maxdepth: 2 - :caption: API Documentation - - dev/sampling_params - dev/pooling_params - dev/offline_inference/offline_index - dev/engine/engine_index - -.. Design: docs about vLLM internals - -.. toctree:: - :maxdepth: 2 - :caption: Design - - design/arch_overview - design/huggingface_integration - design/plugin_system - design/input_processing/model_inputs_index - design/kernel/paged_attention - design/multimodal/multimodal_index - design/multiprocessing - -.. 
For Developers: contributing to the vLLM project - -.. toctree:: - :maxdepth: 2 - :caption: For Developers - - contributing/overview - contributing/profiling/profiling_index - contributing/dockerfile/dockerfile - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` diff --git a/docs/source/models/adding_model.md b/docs/source/models/adding_model.md new file mode 100644 index 0000000000000..3739873bb547b --- /dev/null +++ b/docs/source/models/adding_model.md @@ -0,0 +1,155 @@ +(adding-a-new-model)= + +# Adding a New Model + +This document provides a high-level guide on integrating a [HuggingFace Transformers](https://github.com/huggingface/transformers) model into vLLM. + +```{note} +The complexity of adding a new model depends heavily on the model's architecture. +The process is considerably more straightforward if the model shares a similar architecture with an existing model in vLLM. +However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. +``` + +```{note} +By default, vLLM models do not support multi-modal inputs. To enable multi-modal support, +please follow [this guide](#enabling-multimodal-inputs) after implementing the model here. +``` + +```{tip} +If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our [GitHub](https://github.com/vllm-project/vllm/issues) repository. +We will be happy to help you out! +``` + +## 0. Fork the vLLM repository + +Start by forking our [GitHub](https://github.com/vllm-project/vllm) repository and then [build it from source](#build-from-source). +This gives you the ability to modify the codebase and test your model. + +```{tip} +If you don't want to fork the repository and modify vLLM's codebase, please refer to the "Out-of-Tree Model Integration" section below. +``` + +## 1. 
Bring your model code + +Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the [vllm/model_executor/models](https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models) directory. +For instance, vLLM's [OPT model](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/opt.py) was adapted from the HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file. + +```{warning} +When copying the model code, make sure to review and adhere to the code's copyright and licensing terms. +``` + +## 2. Make your code compatible with vLLM + +To ensure compatibility with vLLM, your model must meet the following requirements: + +### Initialization Code + +All vLLM modules within the model must include a `prefix` argument in their constructor. This `prefix` is typically the full name of the module in the model's state dictionary and is crucial for: + +- Runtime support: vLLM's attention operators are registered in a model's state by their full names. Each attention operator must have a unique prefix as its layer name to avoid conflicts. +- Non-uniform quantization support: A quantized checkpoint can selectively quantize certain layers while keeping others in full precision. By providing the `prefix` during initialization, vLLM can match the current layer's `prefix` with the quantization configuration to determine if the layer should be initialized in quantized mode. 
+ +The initialization code should look like this: + +```python +from torch import nn +from vllm.config import VllmConfig +from vllm.attention import Attention + +class MyAttention(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.attn = Attention(prefix=f"{prefix}.attn") + +class MyDecoderLayer(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.self_attn = MyAttention(vllm_config, prefix=f"{prefix}.self_attn") + +class MyModel(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.layers = nn.ModuleList( + [MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)] + ) + +class MyModelForCausalLM(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + self.model = MyModel(vllm_config, prefix=f"{prefix}.model") +``` + +### Computation Code + +Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension. + +```python +def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, +) -> torch.Tensor: + ... +``` + +```{note} +Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. +If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. +``` + +For reference, check out the [LLAMA model](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llama.py). vLLM already supports a large number of models. 
It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out the [vLLM models](https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models) directory for more examples. + +## 3. (Optional) Implement tensor parallelism and quantization support + +If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it. +To do this, substitute your model's linear and embedding layers with their tensor-parallel versions. +For the embedding layer, you can simply replace {class}`torch.nn.Embedding` with {code}`VocabParallelEmbedding`. For the output LM head, you can use {code}`ParallelLMHead`. +When it comes to the linear layers, we provide the following options to parallelize them: + +- {code}`ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving. +- {code}`RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer. +- {code}`ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer. +- {code}`MergedColumnParallelLinear`: Column-parallel linear that merges multiple {code}`ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices. +- {code}`QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. 
When the number of key/value heads is less than the world size, this class replicates the key/value heads properly. This class handles the weight loading and replication of the weight matrices. + +Note that all the linear layers above take {code}`linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization. + +## 4. Implement the weight loading logic + +You now need to implement the {code}`load_weights` method in your {code}`*ForCausalLM` class. +This method should load the weights from the HuggingFace checkpoint file and assign them to the corresponding layers in your model. Specifically, for {code}`MergedColumnParallelLinear` and {code}`QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately. + +## 5. Register your model + +Finally, register your {code}`*ForCausalLM` class to the {code}`_VLLM_MODELS` in [vllm/model_executor/models/registry.py](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/registry.py). + +## 6. Out-of-Tree Model Integration + +You can integrate a model without modifying the vLLM codebase. Steps 2, 3, and 4 are still required, but you can skip steps 1 and 5. Instead, write a plugin to register your model. For a general introduction to the plugin system, see [plugin-system](#plugin-system). 
+ +To register the model, use the following code: + +```python +from vllm import ModelRegistry +from your_code import YourModelForCausalLM +ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) +``` + +If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like {code}`RuntimeError: Cannot re-initialize CUDA in forked subprocess`: + +```python +from vllm import ModelRegistry + +ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM") +``` + +```{important} +If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. +Read more about that [here](#enabling-multimodal-inputs). +``` + +```{note} +Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server. +``` diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst deleted file mode 100644 index df06d736ca86b..0000000000000 --- a/docs/source/models/adding_model.rst +++ /dev/null @@ -1,159 +0,0 @@ -.. _adding_a_new_model: - -Adding a New Model -================== - -This document provides a high-level guide on integrating a `HuggingFace Transformers `_ model into vLLM. - -.. note:: - The complexity of adding a new model depends heavily on the model's architecture. - The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM. - However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. - -.. note:: - By default, vLLM models do not support multi-modal inputs. To enable multi-modal support, - please follow :ref:`this guide ` after implementing the model here. - -.. 
tip:: - If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our `GitHub `_ repository. - We will be happy to help you out! - - -0. Fork the vLLM repository --------------------------------- - -Start by forking our `GitHub`_ repository and then :ref:`build it from source `. -This gives you the ability to modify the codebase and test your model. - -.. tip:: - If you don't want to fork the repository and modify vLLM's codebase, please refer to the "Out-of-Tree Model Integration" section below. - -1. Bring your model code ------------------------- - -Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the `vllm/model_executor/models `_ directory. -For instance, vLLM's `OPT model `_ was adapted from the HuggingFace's `modeling_opt.py `_ file. - -.. warning:: - When copying the model code, make sure to review and adhere to the code's copyright and licensing terms. - - -2. Make your code compatible with vLLM --------------------------------------- - -To ensure compatibility with vLLM, your model must meet the following requirements: - -Initialization Code -^^^^^^^^^^^^^^^^^^^ - -All vLLM modules within the model must include a ``prefix`` argument in their constructor. This ``prefix`` is typically the full name of the module in the model's state dictionary and is crucial for: - -* Runtime support: vLLM's attention operators are registered in a model's state by their full names. Each attention operator must have a unique prefix as its layer name to avoid conflicts. -* Non-uniform quantization support: A quantized checkpoint can selectively quantize certain layers while keeping others in full precision. By providing the ``prefix`` during initialization, vLLM can match the current layer's ``prefix`` with the quantization configuration to determine if the layer should be initialized in quantized mode. - -The initialization code should look like this: - -.. 
code-block:: python - - from torch import nn - from vllm.config import VllmConfig - from vllm.attention import Attention - - class MyAttention(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str): - super().__init__() - self.attn = Attention(prefix=f"{prefix}.attn") - - class MyDecoderLayer(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str): - super().__init__() - self.self_attn = MyAttention(prefix=f"{prefix}.self_attn") - - class MyModel(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str): - super().__init__() - self.layers = nn.ModuleList( - [MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)] - ) - - class MyModelForCausalLM(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - self.model = MyModel(vllm_config, prefix=f"{prefix}.model") - -Computation Code -^^^^^^^^^^^^^^^^ - -Rewrite the :meth:`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat ``input_ids`` and ``positions`` as flattened tensors with a single batch size dimension, without a max-sequence length dimension. - -.. code-block:: python - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - ) -> torch.Tensor: - ... - -.. note:: - Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. - If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. - -For reference, check out the `LLAMA model `__. vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out the `vLLM models `__ directory for more examples. - -3. 
(Optional) Implement tensor parallelism and quantization support -------------------------------------------------------------------- - -If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it. -To do this, substitute your model's linear and embedding layers with their tensor-parallel versions. -For the embedding layer, you can simply replace :class:`torch.nn.Embedding` with :code:`VocabParallelEmbedding`. For the output LM head, you can use :code:`ParallelLMHead`. -When it comes to the linear layers, we provide the following options to parallelize them: - -* :code:`ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving. -* :code:`RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer. -* :code:`ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer. -* :code:`MergedColumnParallelLinear`: Column-parallel linear that merges multiple :code:`ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices. -* :code:`QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When number of key/value heads are less than the world size, this class replicates the key/value heads properly. 
This class handles the weight loading and replication of the weight matrices. - -Note that all the linear layers above take :code:`linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization. - -4. Implement the weight loading logic -------------------------------------- - -You now need to implement the :code:`load_weights` method in your :code:`*ForCausalLM` class. -This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model. Specifically, for :code:`MergedColumnParallelLinear` and :code:`QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately. - -5. Register your model ----------------------- - -Finally, register your :code:`*ForCausalLM` class to the :code:`_VLLM_MODELS` in `vllm/model_executor/models/registry.py `_. - -6. Out-of-Tree Model Integration --------------------------------- - -You can integrate a model without modifying the vLLM codebase. Steps 2, 3, and 4 are still required, but you can skip steps 1 and 5. Instead, write a plugin to register your model. For general introduction of the plugin system, see :ref:`plugin_system`. - -To register the model, use the following code: - -.. code-block:: python - - from vllm import ModelRegistry - from your_code import YourModelForCausalLM - ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) - -If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like :code:`RuntimeError: Cannot re-initialize CUDA in forked subprocess`: - -.. code-block:: python - - from vllm import ModelRegistry - - ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM") - -.. 
important:: - If your model is a multimodal model, ensure the model class implements the :class:`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. - Read more about that :ref:`here `. - -.. note:: - Although you can directly put these code snippets in your script using ``vllm.LLM``, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server. diff --git a/docs/source/models/enabling_multimodal_inputs.md b/docs/source/models/enabling_multimodal_inputs.md new file mode 100644 index 0000000000000..2f93eb826fb1e --- /dev/null +++ b/docs/source/models/enabling_multimodal_inputs.md @@ -0,0 +1,143 @@ +(enabling-multimodal-inputs)= + +# Enabling Multimodal Inputs + +This document walks you through the steps to extend a vLLM model so that it accepts [multi-modal inputs](#multimodal-inputs). + +```{seealso} +[Adding a New Model](#adding-a-new-model) +``` + +## 1. Update the base vLLM model + +It is assumed that you have already implemented the model in vLLM according to [these steps](#adding-a-new-model). +Further update the model as follows: + +- Implement the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. + + ```diff + + from vllm.model_executor.models.interfaces import SupportsMultiModal + + - class YourModelForImage2Seq(nn.Module): + + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): + ``` + + ```{note} + The model class does not have to be named {code}`*ForCausalLM`. + Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples. 
+ ``` + +- If you haven't already done so, reserve a keyword parameter in {meth}`~torch.nn.Module.forward` + for each input tensor that corresponds to a multi-modal input, as shown in the following example: + + ```diff + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + + pixel_values: torch.Tensor, + ) -> SamplerOutput: + ``` + +## 2. Register input mappers + +For each modality type that the model accepts as input, decorate the model class with {meth}`MULTIMODAL_REGISTRY.register_input_mapper <vllm.multimodal.MultiModalRegistry.register_input_mapper>`. +This decorator accepts a function that maps multi-modal inputs to the keyword arguments you have previously defined in {meth}`~torch.nn.Module.forward`. + +```diff + from vllm.model_executor.models.interfaces import SupportsMultiModal ++ from vllm.multimodal import MULTIMODAL_REGISTRY + ++ @MULTIMODAL_REGISTRY.register_image_input_mapper() + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): +``` + +A default mapper is available for each modality in the core vLLM library. This input mapper will be used if you do not provide your own function. + +```{seealso} +[Input Processing Pipeline](#input-processing-pipeline) +``` + +## 3. Register maximum number of multi-modal tokens + +For each modality type that the model accepts as input, calculate the maximum possible number of tokens per data item +and register it via {meth}`MULTIMODAL_REGISTRY.register_max_multimodal_tokens <vllm.multimodal.MultiModalRegistry.register_max_multimodal_tokens>`. 
+ +```diff + from vllm.inputs import INPUT_REGISTRY + from vllm.model_executor.models.interfaces import SupportsMultiModal + from vllm.multimodal import MULTIMODAL_REGISTRY + + @MULTIMODAL_REGISTRY.register_image_input_mapper() ++ @MULTIMODAL_REGISTRY.register_max_image_tokens() + @INPUT_REGISTRY.register_dummy_data() + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): +``` + +Here are some examples: + +- Image inputs (static feature size): [LLaVA-1.5 Model](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava.py) +- Image inputs (dynamic feature size): [LLaVA-NeXT Model](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava_next.py) + +```{seealso} +[Input Processing Pipeline](#input-processing-pipeline) +``` + +## 4. (Optional) Register dummy data + +During startup, dummy data is passed to the vLLM model to allocate memory. This only consists of text input by default, which may not be applicable to multi-modal models. +In such cases, you can define your own dummy data by registering a factory method via {meth}`INPUT_REGISTRY.register_dummy_data `. + +```diff + from vllm.inputs import INPUT_REGISTRY + from vllm.model_executor.models.interfaces import SupportsMultiModal + from vllm.multimodal import MULTIMODAL_REGISTRY + + @MULTIMODAL_REGISTRY.register_image_input_mapper() + @MULTIMODAL_REGISTRY.register_max_image_tokens() ++ @INPUT_REGISTRY.register_dummy_data() + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): +``` + +```{note} +The dummy data should have the maximum possible number of multi-modal tokens, as described in the previous step. 
+``` + +Here are some examples: + +- Image inputs (static feature size): [LLaVA-1.5 Model](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava.py) +- Image inputs (dynamic feature size): [LLaVA-NeXT Model](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava_next.py) + +```{seealso} +[Input Processing Pipeline](#input-processing-pipeline) +``` + +## 5. (Optional) Register input processor + +Sometimes, there is a need to process inputs at the {class}`~vllm.LLMEngine` level before they are passed to the model executor. +This is often due to the fact that unlike implementations in HuggingFace Transformers, the reshaping and/or expansion of multi-modal embeddings needs to take place outside model's {meth}`~torch.nn.Module.forward` call. +You can register input processors via {meth}`INPUT_REGISTRY.register_input_processor `. + +```diff + from vllm.inputs import INPUT_REGISTRY + from vllm.model_executor.models.interfaces import SupportsMultiModal + from vllm.multimodal import MULTIMODAL_REGISTRY + + @MULTIMODAL_REGISTRY.register_image_input_mapper() + @MULTIMODAL_REGISTRY.register_max_image_tokens() + @INPUT_REGISTRY.register_dummy_data() ++ @INPUT_REGISTRY.register_input_processor() + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): +``` + +A common use case of input processors is inserting placeholder tokens to leverage the vLLM framework for attention mask generation. 
+Here are some examples: + +- Insert static number of image tokens: [LLaVA-1.5 Model](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava.py) +- Insert dynamic number of image tokens: [LLaVA-NeXT Model](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava_next.py) + +```{seealso} +[Input Processing Pipeline](#input-processing-pipeline) +``` diff --git a/docs/source/models/enabling_multimodal_inputs.rst b/docs/source/models/enabling_multimodal_inputs.rst deleted file mode 100644 index 5c1236e1a8972..0000000000000 --- a/docs/source/models/enabling_multimodal_inputs.rst +++ /dev/null @@ -1,147 +0,0 @@ -.. _enabling_multimodal_inputs: - -Enabling Multimodal Inputs -========================== - -This document walks you through the steps to extend a vLLM model so that it accepts :ref:`multi-modal inputs `. - -.. seealso:: - :ref:`adding_a_new_model` - - -1. Update the base vLLM model ------------------------------ - -It is assumed that you have already implemented the model in vLLM according to :ref:`these steps `. -Further update the model as follows: - -- Implement the :class:`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. - - .. code-block:: diff - - + from vllm.model_executor.models.interfaces import SupportsMultiModal - - - class YourModelForImage2Seq(nn.Module): - + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): - - .. note:: - The model class does not have to be named :code:`*ForCausalLM`. - Check out `the HuggingFace Transformers documentation `__ for some examples. - -- If you haven't already done so, reserve a keyword parameter in :meth:`~torch.nn.Module.forward` - for each input tensor that corresponds to a multi-modal input, as shown in the following example: - - .. 
code-block:: diff - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - + pixel_values: torch.Tensor, - ) -> SamplerOutput: - - -2. Register input mappers -------------------------- - -For each modality type that the model accepts as input, decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_input_mapper `. -This decorator accepts a function that maps multi-modal inputs to the keyword arguments you have previously defined in :meth:`~torch.nn.Module.forward`. - -.. code-block:: diff - - from vllm.model_executor.models.interfaces import SupportsMultiModal - + from vllm.multimodal import MULTIMODAL_REGISTRY - - + @MULTIMODAL_REGISTRY.register_image_input_mapper() - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): - -A default mapper is available for each modality in the core vLLM library. This input mapper will be used if you do not provide your own function. - -.. seealso:: - :ref:`input_processing_pipeline` - - -3. Register maximum number of multi-modal tokens ------------------------------------------------- - -For each modality type that the model accepts as input, calculate the maximum possible number of tokens per data item -and register it via :meth:`INPUT_REGISTRY.register_dummy_data `. - -.. code-block:: diff - - from vllm.inputs import INPUT_REGISTRY - from vllm.model_executor.models.interfaces import SupportsMultiModal - from vllm.multimodal import MULTIMODAL_REGISTRY - - @MULTIMODAL_REGISTRY.register_image_input_mapper() - + @MULTIMODAL_REGISTRY.register_max_image_tokens() - @INPUT_REGISTRY.register_dummy_data() - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): - -Here are some examples: - -- Image inputs (static feature size): `LLaVA-1.5 Model `__ -- Image inputs (dynamic feature size): `LLaVA-NeXT Model `__ - -.. seealso:: - :ref:`input_processing_pipeline` - - -4. 
(Optional) Register dummy data ---------------------------------- - -During startup, dummy data is passed to the vLLM model to allocate memory. This only consists of text input by default, which may not be applicable to multi-modal models. -In such cases, you can define your own dummy data by registering a factory method via :meth:`INPUT_REGISTRY.register_dummy_data `. - -.. code-block:: diff - - from vllm.inputs import INPUT_REGISTRY - from vllm.model_executor.models.interfaces import SupportsMultiModal - from vllm.multimodal import MULTIMODAL_REGISTRY - - @MULTIMODAL_REGISTRY.register_image_input_mapper() - @MULTIMODAL_REGISTRY.register_max_image_tokens() - + @INPUT_REGISTRY.register_dummy_data() - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): - -.. note:: - The dummy data should have the maximum possible number of multi-modal tokens, as described in the previous step. - -Here are some examples: - -- Image inputs (static feature size): `LLaVA-1.5 Model `__ -- Image inputs (dynamic feature size): `LLaVA-NeXT Model `__ - -.. seealso:: - :ref:`input_processing_pipeline` - - -5. (Optional) Register input processor --------------------------------------- - -Sometimes, there is a need to process inputs at the :class:`~vllm.LLMEngine` level before they are passed to the model executor. -This is often due to the fact that unlike implementations in HuggingFace Transformers, the reshaping and/or expansion of multi-modal embeddings needs to take place outside model's :meth:`~torch.nn.Module.forward` call. -You can register input processors via :meth:`INPUT_REGISTRY.register_input_processor `. - -.. 
code-block:: diff - - from vllm.inputs import INPUT_REGISTRY - from vllm.model_executor.models.interfaces import SupportsMultiModal - from vllm.multimodal import MULTIMODAL_REGISTRY - - @MULTIMODAL_REGISTRY.register_image_input_mapper() - @MULTIMODAL_REGISTRY.register_max_image_tokens() - @INPUT_REGISTRY.register_dummy_data() - + @INPUT_REGISTRY.register_input_processor() - class YourModelForImage2Seq(nn.Module, SupportsMultiModal): - -A common use case of input processors is inserting placeholder tokens to leverage the vLLM framework for attention mask generation. -Here are some examples: - -- Insert static number of image tokens: `LLaVA-1.5 Model `__ -- Insert dynamic number of image tokens: `LLaVA-NeXT Model `__ - -.. seealso:: - :ref:`input_processing_pipeline` diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md new file mode 100644 index 0000000000000..7aeaba855dcfb --- /dev/null +++ b/docs/source/models/generative_models.md @@ -0,0 +1,138 @@ +(generative-models)= + +# Generative Models + +vLLM provides first-class support for generative models, which covers most of LLMs. + +In vLLM, generative models implement the {class}`~vllm.model_executor.models.VllmModelForTextGeneration` interface. +Based on the final hidden states of the input, these models output log probabilities of the tokens to generate, +which are then passed through {class}`~vllm.model_executor.layers.Sampler` to obtain the final text. + +## Offline Inference + +The {class}`~vllm.LLM` class provides various methods for offline inference. +See [Engine Arguments](#engine-args) for a list of options when initializing the model. + +For generative models, the only supported {code}`task` option is {code}`"generate"`. +Usually, this is automatically inferred so you don't have to specify it. + +### `LLM.generate` + +The {class}`~vllm.LLM.generate` method is available to all generative models in vLLM. 
+It is similar to [its counterpart in HF Transformers](https://huggingface.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationMixin.generate), +except that tokenization and detokenization are also performed automatically. + +```python +llm = LLM(model="facebook/opt-125m") +outputs = llm.generate("Hello, my name is") + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +You can optionally control the language generation by passing {class}`~vllm.SamplingParams`. +For example, you can use greedy sampling by setting {code}`temperature=0`: + +```python +llm = LLM(model="facebook/opt-125m") +params = SamplingParams(temperature=0) +outputs = llm.generate("Hello, my name is", params) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +A code example can be found in [examples/offline_inference.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference.py). + +### `LLM.beam_search` + +The {class}`~vllm.LLM.beam_search` method implements [beam search](https://huggingface.co/docs/transformers/en/generation_strategies#beam-search-decoding) on top of {class}`~vllm.LLM.generate`. +For example, to search using 5 beams and output at most 50 tokens: + +```python +llm = LLM(model="facebook/opt-125m") +params = BeamSearchParams(beam_width=5, max_tokens=50) +outputs = llm.beam_search([{"prompt": "Hello, my name is"}], params) + +for output in outputs: + generated_text = output.sequences[0].text + print(f"Generated text: {generated_text!r}") +``` + +### `LLM.chat` + +The {class}`~vllm.LLM.chat` method implements chat functionality on top of {class}`~vllm.LLM.generate`. 
+In particular, it accepts input similar to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat) +and automatically applies the model's [chat template](https://huggingface.co/docs/transformers/en/chat_templating) to format the prompt. + +```{important} +In general, only instruction-tuned models have a chat template. +Base models may perform poorly as they are not trained to respond to the chat conversation. +``` + +```python +llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") +conversation = [ + { + "role": "system", + "content": "You are a helpful assistant" + }, + { + "role": "user", + "content": "Hello" + }, + { + "role": "assistant", + "content": "Hello! How can I assist you today?" + }, + { + "role": "user", + "content": "Write an essay about the importance of higher education.", + }, +] +outputs = llm.chat(conversation) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +A code example can be found in [examples/offline_inference_chat.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_chat.py). + +If the model doesn't have a chat template or you want to specify another one, +you can explicitly pass a chat template: + +```python +from vllm.entrypoints.chat_utils import load_chat_template + +# You can find a list of existing chat templates under `examples/` +custom_template = load_chat_template(chat_template="<path_to_template>") +print("Loaded chat template:", custom_template) + +outputs = llm.chat(conversation, chat_template=custom_template) +``` + +## Online Inference + +Our [OpenAI Compatible Server](../serving/openai_compatible_server.md) can be used for online inference. +Please click on the above link for more details on how to launch the server. + +### Completions API + +Our Completions API is similar to `LLM.generate` but only accepts text. 
+It is compatible with [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions) +so that you can use OpenAI client to interact with it. +A code example can be found in [examples/openai_completion_client.py](https://github.com/vllm-project/vllm/blob/main/examples/openai_completion_client.py). + +### Chat API + +Our Chat API is similar to `LLM.chat`, accepting both text and [multi-modal inputs](#multimodal-inputs). +It is compatible with [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat) +so that you can use OpenAI client to interact with it. +A code example can be found in [examples/openai_chat_completion_client.py](https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_client.py). diff --git a/docs/source/models/generative_models.rst b/docs/source/models/generative_models.rst deleted file mode 100644 index fb71185600863..0000000000000 --- a/docs/source/models/generative_models.rst +++ /dev/null @@ -1,146 +0,0 @@ -.. _generative_models: - -Generative Models -================= - -vLLM provides first-class support for generative models, which covers most of LLMs. - -In vLLM, generative models implement the :class:`~vllm.model_executor.models.VllmModelForTextGeneration` interface. -Based on the final hidden states of the input, these models output log probabilities of the tokens to generate, -which are then passed through :class:`~vllm.model_executor.layers.Sampler` to obtain the final text. - -Offline Inference ------------------ - -The :class:`~vllm.LLM` class provides various methods for offline inference. -See :ref:`Engine Arguments ` for a list of options when initializing the model. - -For generative models, the only supported :code:`task` option is :code:`"generate"`. -Usually, this is automatically inferred so you don't have to specify it. - -``LLM.generate`` -^^^^^^^^^^^^^^^^ - -The :class:`~vllm.LLM.generate` method is available to all generative models in vLLM. 
-It is similar to `its counterpart in HF Transformers `__, -except that tokenization and detokenization are also performed automatically. - -.. code-block:: python - - llm = LLM(model="facebook/opt-125m") - outputs = llm.generate("Hello, my name is") - - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -You can optionally control the language generation by passing :class:`~vllm.SamplingParams`. -For example, you can use greedy sampling by setting :code:`temperature=0`: - -.. code-block:: python - - llm = LLM(model="facebook/opt-125m") - params = SamplingParams(temperature=0) - outputs = llm.generate("Hello, my name is", params) - - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -A code example can be found in `examples/offline_inference.py `_. - -``LLM.beam_search`` -^^^^^^^^^^^^^^^^^^^ - -The :class:`~vllm.LLM.beam_search` method implements `beam search `__ on top of :class:`~vllm.LLM.generate`. -For example, to search using 5 beams and output at most 50 tokens: - -.. code-block:: python - - llm = LLM(model="facebook/opt-125m") - params = BeamSearchParams(beam_width=5, max_tokens=50) - outputs = llm.generate("Hello, my name is", params) - - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -``LLM.chat`` -^^^^^^^^^^^^ - -The :class:`~vllm.LLM.chat` method implements chat functionality on top of :class:`~vllm.LLM.generate`. -In particular, it accepts input similar to `OpenAI Chat Completions API `__ -and automatically applies the model's `chat template `__ to format the prompt. - -.. important:: - - In general, only instruction-tuned models have a chat template. 
- Base models may perform poorly as they are not trained to respond to the chat conversation. - -.. code-block:: python - - llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") - conversation = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": "Hello" - }, - { - "role": "assistant", - "content": "Hello! How can I assist you today?" - }, - { - "role": "user", - "content": "Write an essay about the importance of higher education.", - }, - ] - outputs = llm.chat(conversation) - - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -A code example can be found in `examples/offline_inference_chat.py `_. - -If the model doesn't have a chat template or you want to specify another one, -you can explicitly pass a chat template: - -.. code-block:: python - - from vllm.entrypoints.chat_utils import load_chat_template - - # You can find a list of existing chat templates under `examples/` - custom_template = load_chat_template(chat_template="") - print("Loaded chat template:", custom_template) - - outputs = llm.chat(conversation, chat_template=custom_template) - -Online Inference ----------------- - -Our `OpenAI Compatible Server <../serving/openai_compatible_server>`__ can be used for online inference. -Please click on the above link for more details on how to launch the server. - -Completions API -^^^^^^^^^^^^^^^ - -Our Completions API is similar to ``LLM.generate`` but only accepts text. -It is compatible with `OpenAI Completions API `__ -so that you can use OpenAI client to interact with it. -A code example can be found in `examples/openai_completion_client.py `_. - -Chat API -^^^^^^^^ - -Our Chat API is similar to ``LLM.chat``, accepting both text and :ref:`multi-modal inputs `. -It is compatible with `OpenAI Chat Completions API `__ -so that you can use OpenAI client to interact with it. 
-A code example can be found in `examples/openai_chat_completion_client.py `_. diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md new file mode 100644 index 0000000000000..20a7b8f33947d --- /dev/null +++ b/docs/source/models/pooling_models.md @@ -0,0 +1,127 @@ +(pooling-models)= + +# Pooling Models + +vLLM also supports pooling models, including embedding, reranking and reward models. + +In vLLM, pooling models implement the {class}`~vllm.model_executor.models.VllmModelForPooling` interface. +These models use a {class}`~vllm.model_executor.layers.Pooler` to extract the final hidden states of the input +before returning them. + +```{note} +We currently support pooling models primarily as a matter of convenience. +As shown in the [Compatibility Matrix](#compatibility-matrix), most vLLM features are not applicable to +pooling models as they only work on the generation or decode stage, so performance may not improve as much. +``` + +## Offline Inference + +The {class}`~vllm.LLM` class provides various methods for offline inference. +See [Engine Arguments](#engine-args) for a list of options when initializing the model. + +For pooling models, we support the following {code}`task` options: + +- Embedding ({code}`"embed"` / {code}`"embedding"`) +- Classification ({code}`"classify"`) +- Sentence Pair Scoring ({code}`"score"`) +- Reward Modeling ({code}`"reward"`) + +The selected task determines the default {class}`~vllm.model_executor.layers.Pooler` that is used: + +- Embedding: Extract only the hidden states corresponding to the last token, and apply normalization. +- Classification: Extract only the hidden states corresponding to the last token, and apply softmax. +- Sentence Pair Scoring: Extract only the hidden states corresponding to the last token, and apply softmax. +- Reward Modeling: Extract all of the hidden states and return them directly. 
+ +When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models, +we attempt to override the default pooler based on its Sentence Transformers configuration file ({code}`modules.json`). + +You can customize the model's pooling method via the {code}`override_pooler_config` option, +which takes priority over both the model's and Sentence Transformers's defaults. + +### `LLM.encode` + +The {class}`~vllm.LLM.encode` method is available to all pooling models in vLLM. +It returns the extracted hidden states directly, which is useful for reward models. + +```python +llm = LLM(model="Qwen/Qwen2.5-Math-RM-72B", task="reward") +(output,) = llm.encode("Hello, my name is") + +data = output.outputs.data +print(f"Data: {data!r}") +``` + +### `LLM.embed` + +The {class}`~vllm.LLM.embed` method outputs an embedding vector for each prompt. +It is primarily designed for embedding models. + +```python +llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed") +(output,) = llm.embed("Hello, my name is") + +embeds = output.outputs.embedding +print(f"Embeddings: {embeds!r} (size={len(embeds)})") +``` + +A code example can be found in [examples/offline_inference_embedding.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_embedding.py). + +### `LLM.classify` + +The {class}`~vllm.LLM.classify` method outputs a probability vector for each prompt. +It is primarily designed for classification models. + +```python +llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", task="classify") +(output,) = llm.classify("Hello, my name is") + +probs = output.outputs.probs +print(f"Class Probabilities: {probs!r} (size={len(probs)})") +``` + +A code example can be found in [examples/offline_inference_classification.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_classification.py). + +### `LLM.score` + +The {class}`~vllm.LLM.score` method outputs similarity scores between sentence pairs. 
+It is primarily designed for [cross-encoder models](https://www.sbert.net/examples/applications/cross-encoder/README.html). +These types of models serve as rerankers between candidate query-document pairs in RAG systems. + +```{note} +vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG. +To handle RAG at a higher level, you should use integration frameworks such as [LangChain](https://github.com/langchain-ai/langchain). +``` + +```python +llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score") +(output,) = llm.score("What is the capital of France?", + "The capital of Brazil is Brasilia.") + +score = output.outputs.score +print(f"Score: {score}") +``` + +A code example can be found in [examples/offline_inference_scoring.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_scoring.py). + +## Online Inference + +Our [OpenAI Compatible Server](../serving/openai_compatible_server.md) can be used for online inference. +Please click on the above link for more details on how to launch the server. + +### Embeddings API + +Our Embeddings API is similar to `LLM.embed`, accepting both text and [multi-modal inputs](#multimodal-inputs). + +The text-only API is compatible with [OpenAI Embeddings API](https://platform.openai.com/docs/api-reference/embeddings) +so that you can use OpenAI client to interact with it. +A code example can be found in [examples/openai_embedding_client.py](https://github.com/vllm-project/vllm/blob/main/examples/openai_embedding_client.py). + +The multi-modal API is an extension of the [OpenAI Embeddings API](https://platform.openai.com/docs/api-reference/embeddings) +that incorporates [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat), +so it is not part of the OpenAI standard. Please see [](#multimodal-inputs) for more details on how to use it. + +### Score API + +Our Score API is similar to `LLM.score`. 
+Please see [this page](#score-api) for more details on how to use it. diff --git a/docs/source/models/pooling_models.rst b/docs/source/models/pooling_models.rst deleted file mode 100644 index 4e67677a2767a..0000000000000 --- a/docs/source/models/pooling_models.rst +++ /dev/null @@ -1,136 +0,0 @@ -.. _pooling_models: - -Pooling Models -============== - -vLLM also supports pooling models, including embedding, reranking and reward models. - -In vLLM, pooling models implement the :class:`~vllm.model_executor.models.VllmModelForPooling` interface. -These models use a :class:`~vllm.model_executor.layers.Pooler` to extract the final hidden states of the input -before returning them. - -.. note:: - - We currently support pooling models primarily as a matter of convenience. - As shown in the :ref:`Compatibility Matrix `, most vLLM features are not applicable to - pooling models as they only work on the generation or decode stage, so performance may not improve as much. - -Offline Inference ------------------ - -The :class:`~vllm.LLM` class provides various methods for offline inference. -See :ref:`Engine Arguments ` for a list of options when initializing the model. - -For pooling models, we support the following :code:`task` options: - -- Embedding (:code:`"embed"` / :code:`"embedding"`) -- Classification (:code:`"classify"`) -- Sentence Pair Scoring (:code:`"score"`) -- Reward Modeling (:code:`"reward"`) - -The selected task determines the default :class:`~vllm.model_executor.layers.Pooler` that is used: - -- Embedding: Extract only the hidden states corresponding to the last token, and apply normalization. -- Classification: Extract only the hidden states corresponding to the last token, and apply softmax. -- Sentence Pair Scoring: Extract only the hidden states corresponding to the last token, and apply softmax. -- Reward Modeling: Extract all of the hidden states and return them directly. 
- -When loading `Sentence Transformers `__ models, -we attempt to override the default pooler based on its Sentence Transformers configuration file (:code:`modules.json`). - -You can customize the model's pooling method via the :code:`override_pooler_config` option, -which takes priority over both the model's and Sentence Transformers's defaults. - -``LLM.encode`` -^^^^^^^^^^^^^^ - -The :class:`~vllm.LLM.encode` method is available to all pooling models in vLLM. -It returns the extracted hidden states directly, which is useful for reward models. - -.. code-block:: python - - llm = LLM(model="Qwen/Qwen2.5-Math-RM-72B", task="reward") - (output,) = llm.encode("Hello, my name is") - - data = output.outputs.data - print(f"Data: {data!r}") - -``LLM.embed`` -^^^^^^^^^^^^^ - -The :class:`~vllm.LLM.embed` method outputs an embedding vector for each prompt. -It is primarily designed for embedding models. - -.. code-block:: python - - llm = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed") - (output,) = llm.embed("Hello, my name is") - - embeds = output.outputs.embedding - print(f"Embeddings: {embeds!r} (size={len(embeds)})") - -A code example can be found in `examples/offline_inference_embedding.py `_. - -``LLM.classify`` -^^^^^^^^^^^^^^^^ - -The :class:`~vllm.LLM.classify` method outputs a probability vector for each prompt. -It is primarily designed for classification models. - -.. code-block:: python - - llm = LLM(model="jason9693/Qwen2.5-1.5B-apeach", task="classify") - (output,) = llm.classify("Hello, my name is") - - probs = output.outputs.probs - print(f"Class Probabilities: {probs!r} (size={len(probs)})") - -A code example can be found in `examples/offline_inference_classification.py `_. - -``LLM.score`` -^^^^^^^^^^^^^ - -The :class:`~vllm.LLM.score` method outputs similarity scores between sentence pairs. -It is primarily designed for `cross-encoder models `__. -These types of models serve as rerankers between candidate query-document pairs in RAG systems. 
- -.. note:: - - vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG. - To handle RAG at a higher level, you should use integration frameworks such as `LangChain `_. - -.. code-block:: python - - llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score") - (output,) = llm.score("What is the capital of France?", - "The capital of Brazil is Brasilia.") - - score = output.outputs.score - print(f"Score: {score}") - -A code example can be found in `examples/offline_inference_scoring.py `_. - -Online Inference ----------------- - -Our `OpenAI Compatible Server <../serving/openai_compatible_server>`__ can be used for online inference. -Please click on the above link for more details on how to launch the server. - -Embeddings API -^^^^^^^^^^^^^^ - -Our Embeddings API is similar to ``LLM.embed``, accepting both text and :ref:`multi-modal inputs `. - -The text-only API is compatible with `OpenAI Embeddings API `__ -so that you can use OpenAI client to interact with it. -A code example can be found in `examples/openai_embedding_client.py `_. - -The multi-modal API is an extension of the `OpenAI Embeddings API `__ -that incorporates `OpenAI Chat Completions API `__, -so it is not part of the OpenAI standard. Please see :ref:`this page ` for more details on how to use it. - -Score API -^^^^^^^^^ - -Our Score API is similar to ``LLM.score``. -Please see `this page <../serving/openai_compatible_server.html#score-api-for-cross-encoder-models>`__ for more details on how to use it. diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.md similarity index 71% rename from docs/source/models/supported_models.rst rename to docs/source/models/supported_models.md index 488fcc7709c77..650293d864011 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.md @@ -1,84 +1,78 @@ -.. 
_supported_models: +(supported-models)= -Supported Models -================ +# Supported Models vLLM supports generative and pooling models across various tasks. -If a model supports more than one task, you can set the task via the :code:`--task` argument. +If a model supports more than one task, you can set the task via the {code}`--task` argument. For each task, we list the model architectures that have been implemented in vLLM. Alongside each architecture, we include some popular models that use it. -Loading a Model -^^^^^^^^^^^^^^^ +## Loading a Model -HuggingFace Hub -+++++++++++++++ +### HuggingFace Hub -By default, vLLM loads models from `HuggingFace (HF) Hub `_. +By default, vLLM loads models from [HuggingFace (HF) Hub](https://huggingface.co/models). -To determine whether a given model is supported, you can check the :code:`config.json` file inside the HF repository. -If the :code:`"architectures"` field contains a model architecture listed below, then it should be supported in theory. +To determine whether a given model is supported, you can check the {code}`config.json` file inside the HF repository. +If the {code}`"architectures"` field contains a model architecture listed below, then it should be supported in theory. -.. tip:: - The easiest way to check if your model is really supported at runtime is to run the program below: +````{tip} +The easiest way to check if your model is really supported at runtime is to run the program below: - .. 
code-block:: python +```python +from vllm import LLM - from vllm import LLM +# For generative models (task=generate) only +llm = LLM(model=..., task="generate") # Name or path of your model +output = llm.generate("Hello, my name is") +print(output) - # For generative models (task=generate) only - llm = LLM(model=..., task="generate") # Name or path of your model - output = llm.generate("Hello, my name is") - print(output) +# For pooling models (task={embed,classify,reward}) only +llm = LLM(model=..., task="embed") # Name or path of your model +output = llm.encode("Hello, my name is") +print(output) +``` - # For pooling models (task={embed,classify,reward}) only - llm = LLM(model=..., task="embed") # Name or path of your model - output = llm.encode("Hello, my name is") - print(output) +If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported. +```` - If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported. +Otherwise, please refer to [Adding a New Model](#adding-a-new-model) and [Enabling Multimodal Inputs](#enabling-multimodal-inputs) for instructions on how to implement your model in vLLM. +Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support. -Otherwise, please refer to :ref:`Adding a New Model ` and :ref:`Enabling Multimodal Inputs ` -for instructions on how to implement your model in vLLM. -Alternatively, you can `open an issue on GitHub `_ to request vLLM support. +### ModelScope -ModelScope -++++++++++ +To use models from [ModelScope](https://www.modelscope.cn) instead of HuggingFace Hub, set an environment variable: -To use models from `ModelScope `_ instead of HuggingFace Hub, set an environment variable: +```shell +$ export VLLM_USE_MODELSCOPE=True +``` -.. 
code-block:: shell +And use with {code}`trust_remote_code=True`. - $ export VLLM_USE_MODELSCOPE=True +```python +from vllm import LLM -And use with :code:`trust_remote_code=True`. +llm = LLM(model=..., revision=..., task=..., trust_remote_code=True) -.. code-block:: python +# For generative models (task=generate) only +output = llm.generate("Hello, my name is") +print(output) - from vllm import LLM +# For pooling models (task={embed,classify,reward}) only +output = llm.encode("Hello, my name is") +print(output) +``` - llm = LLM(model=..., revision=..., task=..., trust_remote_code=True) +## List of Text-only Language Models - # For generative models (task=generate) only - output = llm.generate("Hello, my name is") - print(output) +### Generative Models - # For pooling models (task={embed,classify,reward}) only - output = llm.encode("Hello, my name is") - print(output) +See [this page](#generative-models) for more information on how to use generative models. -List of Text-only Language Models -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Generative Models -+++++++++++++++++ - -See :ref:`this page ` for more information on how to use generative models. - -Text Generation (``--task generate``) -------------------------------------- +#### Text Generation (`--task generate`) +```{eval-rst} .. list-table:: :widths: 25 25 50 5 5 :header-rows: 1 @@ -86,8 +80,8 @@ Text Generation (``--task generate``) * - Architecture - Models - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` + - :ref:`LoRA ` + - :ref:`PP ` * - :code:`AquilaForCausalLM` - Aquila, Aquila2 - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc. @@ -111,8 +105,8 @@ Text Generation (``--task generate``) * - :code:`BartForConditionalGeneration` - BART - :code:`facebook/bart-base`, :code:`facebook/bart-large-cnn`, etc. - - - - + - + - * - :code:`ChatGLMModel` - ChatGLM - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc. 
@@ -136,12 +130,12 @@ Text Generation (``--task generate``) * - :code:`DeepseekForCausalLM` - DeepSeek - :code:`deepseek-ai/deepseek-llm-67b-base`, :code:`deepseek-ai/deepseek-llm-7b-chat` etc. - - + - - ✅︎ * - :code:`DeepseekV2ForCausalLM` - DeepSeek-V2 - :code:`deepseek-ai/DeepSeek-V2`, :code:`deepseek-ai/DeepSeek-V2-Chat` etc. - - + - - ✅︎ * - :code:`ExaoneForCausalLM` - EXAONE-3 @@ -316,7 +310,7 @@ Text Generation (``--task generate``) * - :code:`PersimmonForCausalLM` - Persimmon - :code:`adept/persimmon-8b-base`, :code:`adept/persimmon-8b-chat`, etc. - - + - - ✅︎ * - :code:`QWenLMHeadModel` - Qwen @@ -358,29 +352,32 @@ Text Generation (``--task generate``) - :code:`xverse/XVERSE-7B-Chat`, :code:`xverse/XVERSE-13B-Chat`, :code:`xverse/XVERSE-65B-Chat`, etc. - ✅︎ - ✅︎ +``` -.. note:: - Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. +```{note} +Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. +``` -Pooling Models -++++++++++++++ +### Pooling Models -See :ref:`this page ` for more information on how to use pooling models. +See [this page](#pooling-models) for more information on how to use pooling models. -.. important:: - Since some model architectures support both generative and pooling tasks, - you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. +```{important} +Since some model architectures support both generative and pooling tasks, +you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. +``` -Text Embedding (``--task embed``) ---------------------------------- +#### Text Embedding (`--task embed`) -Any text generation model can be converted into an embedding model by passing :code:`--task embed`. +Any text generation model can be converted into an embedding model by passing {code}`--task embed`. -.. 
note:: - To get the best results, you should use pooling models that are specifically trained as such. +```{note} +To get the best results, you should use pooling models that are specifically trained as such. +``` The following table lists those that are tested in vLLM. +```{eval-rst} .. list-table:: :widths: 25 25 50 5 5 :header-rows: 1 @@ -388,17 +385,17 @@ The following table lists those that are tested in vLLM. * - Architecture - Models - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` + - :ref:`LoRA ` + - :ref:`PP ` * - :code:`BertModel` - BERT-based - :code:`BAAI/bge-base-en-v1.5`, etc. - - - - + - + - * - :code:`Gemma2Model` - Gemma2-based - :code:`BAAI/bge-multilingual-gemma2`, etc. - - + - - ✅︎ * - :code:`GritLM` - GritLM @@ -418,28 +415,31 @@ The following table lists those that are tested in vLLM. * - :code:`RobertaModel`, :code:`RobertaForMaskedLM` - RoBERTa-based - :code:`sentence-transformers/all-roberta-large-v1`, :code:`sentence-transformers/all-roberta-large-v1`, etc. - - - - + - + - * - :code:`XLMRobertaModel` - XLM-RoBERTa-based - :code:`intfloat/multilingual-e5-large`, etc. - - - - + - + - +``` -.. note:: - :code:`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. - You should manually set mean pooling by passing :code:`--override-pooler-config '{"pooling_type": "MEAN"}'`. +```{note} +{code}`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. +You should manually set mean pooling by passing {code}`--override-pooler-config '{"pooling_type": "MEAN"}'`. +``` -.. note:: - Unlike base Qwen2, :code:`Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention. - You can set :code:`--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly. +```{note} +Unlike base Qwen2, {code}`Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention. +You can set {code}`--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly. 
- On the other hand, its 1.5B variant (:code:`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention - despite being described otherwise on its model card. +On the other hand, its 1.5B variant ({code}`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention +despite being described otherwise on its model card. +``` -Reward Modeling (``--task reward``) ------------------------------------ +#### Reward Modeling (`--task reward`) +```{eval-rst} .. list-table:: :widths: 25 25 50 5 5 :header-rows: 1 @@ -447,8 +447,8 @@ Reward Modeling (``--task reward``) * - Architecture - Models - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` + - :ref:`LoRA ` + - :ref:`PP ` * - :code:`LlamaForCausalLM` - Llama-based - :code:`peiyi9979/math-shepherd-mistral-7b-prm`, etc. @@ -459,14 +459,16 @@ Reward Modeling (``--task reward``) - :code:`Qwen/Qwen2.5-Math-RM-72B`, etc. - ✅︎ - ✅︎ +``` -.. important:: - For process-supervised reward models such as :code:`peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, - e.g.: :code:`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. +```{important} +For process-supervised reward models such as {code}`peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, +e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. +``` -Classification (``--task classify``) ------------------------------------- +#### Classification (`--task classify`) +```{eval-rst} .. list-table:: :widths: 25 25 50 5 5 :header-rows: 1 @@ -474,8 +476,8 @@ Classification (``--task classify``) * - Architecture - Models - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` + - :ref:`LoRA ` + - :ref:`PP ` * - :code:`JambaForSequenceClassification` - Jamba - :code:`ai21labs/Jamba-tiny-reward-dev`, etc. 
@@ -486,10 +488,11 @@ Classification (``--task classify``) - :code:`jason9693/Qwen2.5-1.5B-apeach`, etc. - ✅︎ - ✅︎ +``` -Sentence Pair Scoring (``--task score``) ----------------------------------------- +#### Sentence Pair Scoring (`--task score`) +```{eval-rst} .. list-table:: :widths: 25 25 50 5 5 :header-rows: 1 @@ -497,54 +500,53 @@ Sentence Pair Scoring (``--task score``) * - Architecture - Models - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` + - :ref:`LoRA ` + - :ref:`PP ` * - :code:`BertForSequenceClassification` - BERT-based - :code:`cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. - - - - + - + - * - :code:`RobertaForSequenceClassification` - RoBERTa-based - :code:`cross-encoder/quora-roberta-base`, etc. - - - - + - + - * - :code:`XLMRobertaForSequenceClassification` - XLM-RoBERTa-based - :code:`BAAI/bge-reranker-v2-m3`, etc. - - - - + - + - +``` -.. _supported_mm_models: +(supported-mm-models)= -List of Multimodal Language Models -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +## List of Multimodal Language Models The following modalities are supported depending on the model: -- **T**\ ext -- **I**\ mage -- **V**\ ideo -- **A**\ udio +- **T**ext +- **I**mage +- **V**ideo +- **A**udio -Any combination of modalities joined by :code:`+` are supported. +Any combination of modalities joined by {code}`+` are supported. -- e.g.: :code:`T + I` means that the model supports text-only, image-only, and text-with-image inputs. +- e.g.: {code}`T + I` means that the model supports text-only, image-only, and text-with-image inputs. -On the other hand, modalities separated by :code:`/` are mutually exclusive. +On the other hand, modalities separated by {code}`/` are mutually exclusive. -- e.g.: :code:`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs. +- e.g.: {code}`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs. 
-See :ref:`this page ` on how to pass multi-modal inputs to the model. +See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the model. -Generative Models -+++++++++++++++++ +### Generative Models -See :ref:`this page ` for more information on how to use generative models. +See [this page](#generative-models) for more information on how to use generative models. -Text Generation (``--task generate``) -------------------------------------- +#### Text Generation (`--task generate`) +```{eval-rst} .. list-table:: :widths: 25 25 15 20 5 5 5 :header-rows: 1 @@ -553,63 +555,63 @@ Text Generation (``--task generate``) - Models - Inputs - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` + - :ref:`LoRA ` + - :ref:`PP ` - V1 * - :code:`AriaForConditionalGeneration` - Aria - T + I - :code:`rhymes-ai/Aria` - - + - - ✅︎ - - + - * - :code:`Blip2ForConditionalGeneration` - BLIP-2 - T + I\ :sup:`E` - :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc. - - ✅︎ - - + - * - :code:`ChameleonForConditionalGeneration` - Chameleon - T + I - :code:`facebook/chameleon-7b` etc. - - + - - ✅︎ - - + - * - :code:`FuyuForCausalLM` - Fuyu - T + I - :code:`adept/fuyu-8b` etc. - - + - - ✅︎ - - + - * - :code:`ChatGLMModel` - GLM-4V - T + I - :code:`THUDM/glm-4v-9b` etc. - ✅︎ - ✅︎ - - + - * - :code:`H2OVLChatModel` - H2OVL - T + I\ :sup:`E+` - :code:`h2oai/h2ovl-mississippi-800m`, :code:`h2oai/h2ovl-mississippi-2b`, etc. - - + - - ✅︎ - - + - * - :code:`Idefics3ForConditionalGeneration` - Idefics3 - T + I - :code:`HuggingFaceM4/Idefics3-8B-Llama3` etc. - ✅︎ - - - + - * - :code:`InternVLChatModel` - InternVL 2.5, Mono-InternVL, InternVL 2.0 - T + I\ :sup:`E+` - :code:`OpenGVLab/InternVL2_5-4B`, :code:`OpenGVLab/Mono-InternVL-2B`, :code:`OpenGVLab/InternVL2-4B`, etc. 
- - + - - ✅︎ - ✅︎ * - :code:`LlavaForConditionalGeneration` @@ -625,28 +627,28 @@ Text Generation (``--task generate``) - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc. - - ✅︎ - - + - * - :code:`LlavaNextVideoForConditionalGeneration` - LLaVA-NeXT-Video - T + V - :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. - - ✅︎ - - + - * - :code:`LlavaOnevisionForConditionalGeneration` - LLaVA-Onevision - T + I\ :sup:`+` + V\ :sup:`+` - :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. - - ✅︎ - - + - * - :code:`MiniCPMV` - MiniCPM-V - T + I\ :sup:`E+` - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc. - ✅︎ - ✅︎ - - + - * - :code:`MllamaForConditionalGeneration` - Llama 3.2 - T + I\ :sup:`+` @@ -665,7 +667,7 @@ Text Generation (``--task generate``) - NVLM-D 1.0 - T + I\ :sup:`E+` - :code:`nvidia/NVLM-D-72B`, etc. - - + - - ✅︎ - ✅︎ * - :code:`PaliGemmaForConditionalGeneration` @@ -674,7 +676,7 @@ Text Generation (``--task generate``) - :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, :code:`google/paligemma2-3b-ft-docci-448`, etc. - - ✅︎ - - + - * - :code:`Phi3VForCausalLM` - Phi-3-Vision, Phi-3.5-Vision - T + I\ :sup:`E+` @@ -702,70 +704,79 @@ Text Generation (``--task generate``) - :code:`Qwen/Qwen2-Audio-7B-Instruct` - - ✅︎ - - + - * - :code:`Qwen2VLForConditionalGeneration` - Qwen2-VL - T + I\ :sup:`E+` + V\ :sup:`E+` - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc. - ✅︎ - ✅︎ - - + - * - :code:`UltravoxModel` - Ultravox - T + A\ :sup:`E+` - :code:`fixie-ai/ultravox-v0_3` - - ✅︎ - - - -| :sup:`E` Pre-computed embeddings can be inputted for this modality. -| :sup:`+` Multiple items can be inputted per text prompt for this modality. + - +``` -.. 
important:: - To enable multiple multi-modal items per text prompt, you have to set :code:`limit_mm_per_prompt` (offline inference) - or :code:`--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt: +```{eval-rst} +:sup:`E` Pre-computed embeddings can be inputted for this modality. - .. code-block:: python +:sup:`+` Multiple items can be inputted per text prompt for this modality. +``` - llm = LLM( - model="Qwen/Qwen2-VL-7B-Instruct", - limit_mm_per_prompt={"image": 4}, - ) +````{important} +To enable multiple multi-modal items per text prompt, you have to set {code}`limit_mm_per_prompt` (offline inference) +or {code}`--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt: - .. code-block:: bash +```python +llm = LLM( + model="Qwen/Qwen2-VL-7B-Instruct", + limit_mm_per_prompt={"image": 4}, +) +``` - vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4 +```bash +vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4 +``` +```` -.. note:: - vLLM currently only supports adding LoRA to the language backbone of multimodal models. +```{note} +vLLM currently only supports adding LoRA to the language backbone of multimodal models. +``` -.. note:: - To use :code:`TIGER-Lab/Mantis-8B-siglip-llama3`, you have to install their GitHub repo (:code:`pip install git+https://github.com/TIGER-AI-Lab/Mantis.git`) - and pass :code:`--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. +```{note} +To use {code}`TIGER-Lab/Mantis-8B-siglip-llama3`, you have to install their GitHub repo ({code}`pip install git+https://github.com/TIGER-AI-Lab/Mantis.git`) +and pass {code}`--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. +``` -.. note:: - The official :code:`openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now. 
- For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 +```{note} +The official {code}`openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork ({code}`HwwwH/MiniCPM-V-2`) for now. +For more details, please see: <https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630> +``` -Pooling Models -++++++++++++++ +### Pooling Models -See :ref:`this page ` for more information on how to use pooling models. +See [this page](#pooling-models) for more information on how to use pooling models. -.. important:: - Since some model architectures support both generative and pooling tasks, - you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. +```{important} +Since some model architectures support both generative and pooling tasks, +you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. +``` -Text Embedding (``--task embed``) ---------------------------------- +#### Text Embedding (`--task embed`) -Any text generation model can be converted into an embedding model by passing :code:`--task embed`. +Any text generation model can be converted into an embedding model by passing {code}`--task embed`. -.. note:: - To get the best results, you should use pooling models that are specifically trained as such. +```{note} +To get the best results, you should use pooling models that are specifically trained as such. +``` The following table lists those that are tested in vLLM. +```{eval-rst} .. list-table:: :widths: 25 25 15 25 5 5 :header-rows: 1 @@ -774,13 +785,13 @@ The following table lists those that are tested in vLLM. - Models - Inputs - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` + - :ref:`LoRA ` + - :ref:`PP ` * - :code:`LlavaNextForConditionalGeneration` - LLaVA-NeXT-based - T / I - :code:`royokong/e5-v` - - + - - ✅︎ * - :code:`Phi3VForCausalLM` - Phi-3-Vision-based @@ -792,27 +803,25 @@ The following table lists those that are tested in vLLM.
- Qwen2-VL-based - T + I - :code:`MrLight/dse-qwen2-2b-mrl-v1` - - + - - ✅︎ +``` ----- +______________________________________________________________________ -Model Support Policy -===================== +# Model Support Policy At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support: 1. **Community-Driven Support**: We encourage community contributions for adding new models. When a user requests support for a new model, we welcome pull requests (PRs) from the community. These contributions are evaluated primarily on the sensibility of the output they generate, rather than strict consistency with existing implementations such as those in transformers. **Call for contribution:** PRs coming directly from model vendors are greatly appreciated! - 2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results. -.. tip:: - When comparing the output of :code:`model.generate` from HuggingFace Transformers with the output of :code:`llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., `generation_config.json `__) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. 
+```{tip} +When comparing the output of {code}`model.generate` from HuggingFace Transformers with the output of {code}`llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. +``` 3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback. - 4. **Monitoring and Updates**: Users interested in specific models should monitor the commit history for those models (e.g., by tracking changes in the main/vllm/model_executor/models directory). This proactive approach helps users stay informed about updates and changes that may affect the models they use. - 5. **Selective Focus**: Our resources are primarily directed towards models with significant user interest and impact. Models that are less frequently used may receive less attention, and we rely on the community to play a more active role in their upkeep and improvement. Through this approach, vLLM fosters a collaborative environment where both the core development team and the broader community contribute to the robustness and diversity of the third-party models supported in our ecosystem. @@ -821,7 +830,7 @@ Note that, as an inference engine, vLLM does not introduce new models. 
Therefore We have the following levels of testing for models: -1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to `models tests `_ for the models that have passed this test. +1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to [models tests](https://github.com/vllm-project/vllm/blob/main/tests/models) for the models that have passed this test. 2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test. -3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to `functionality tests `_ and `examples `_ for the models that have passed this test. +3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to [functionality tests](https://github.com/vllm-project/vllm/tree/main/tests) and [examples](https://github.com/vllm-project/vllm/tree/main/examples) for the models that have passed this test. 4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category. 
diff --git a/docs/source/performance/benchmarks.md b/docs/source/performance/benchmarks.md new file mode 100644 index 0000000000000..50ef4a1f3b54d --- /dev/null +++ b/docs/source/performance/benchmarks.md @@ -0,0 +1,28 @@ +(benchmarks)= + +# Benchmark Suites + +vLLM contains two sets of benchmarks: + +- [Performance benchmarks](#performance-benchmarks) +- [Nightly benchmarks](#nightly-benchmarks) + +(performance-benchmarks)= + +## Performance Benchmarks + +The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the `perf-benchmarks` and `ready` labels, and when a PR is merged into vLLM. + +The latest performance results are hosted on the public [vLLM Performance Dashboard](https://perf.vllm.ai). + +More information on the performance benchmarks and their parameters can be found [here](https://github.com/vllm-project/vllm/blob/main/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md). + +(nightly-benchmarks)= + +## Nightly Benchmarks + +These compare vLLM's performance against alternatives (`tgi`, `trt-llm`, and `lmdeploy`) when there are major updates of vLLM (e.g., bumping up to a new version). They are primarily intended for consumers to evaluate when to choose vLLM over other options and are triggered on every commit with both the `perf-benchmarks` and `nightly-benchmarks` labels. + +The latest nightly benchmark results are shared in major release blog posts such as [vLLM v0.6.0](https://blog.vllm.ai/2024/09/05/perf-update.html). + +More information on the nightly benchmarks and their parameters can be found [here](https://github.com/vllm-project/vllm/blob/main/.buildkite/nightly-benchmarks/nightly-descriptions.md). 
diff --git a/docs/source/performance/benchmarks.rst b/docs/source/performance/benchmarks.rst deleted file mode 100644 index 6d4d7b544cb5d..0000000000000 --- a/docs/source/performance/benchmarks.rst +++ /dev/null @@ -1,33 +0,0 @@ -.. _benchmarks: - -================ -Benchmark Suites -================ - -vLLM contains two sets of benchmarks: - -+ :ref:`Performance benchmarks ` -+ :ref:`Nightly benchmarks ` - - -.. _performance_benchmarks: - -Performance Benchmarks ----------------------- - -The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the ``perf-benchmarks`` and ``ready`` labels, and when a PR is merged into vLLM. - -The latest performance results are hosted on the public `vLLM Performance Dashboard `_. - -More information on the performance benchmarks and their parameters can be found `here `__. - -.. _nightly_benchmarks: - -Nightly Benchmarks ------------------- - -These compare vLLM's performance against alternatives (``tgi``, ``trt-llm``, and ``lmdeploy``) when there are major updates of vLLM (e.g., bumping up to a new version). They are primarily intended for consumers to evaluate when to choose vLLM over other options and are triggered on every commit with both the ``perf-benchmarks`` and ``nightly-benchmarks`` labels. - -The latest nightly benchmark results are shared in major release blog posts such as `vLLM v0.6.0 `_. - -More information on the nightly benchmarks and their parameters can be found `here `__. \ No newline at end of file diff --git a/docs/source/quantization/auto_awq.md b/docs/source/quantization/auto_awq.md new file mode 100644 index 0000000000000..c02fbf0605a8c --- /dev/null +++ b/docs/source/quantization/auto_awq.md @@ -0,0 +1,78 @@ +(auto-awq)= + +# AutoAWQ + +```{warning} +Please note that AWQ support in vLLM is under-optimized at the moment. 
We would recommend using the unquantized version of the model for better +accuracy and higher throughput. Currently, you can use AWQ as a way to reduce memory footprint. As of now, it is more suitable for low latency +inference with a small number of concurrent requests. vLLM's AWQ implementation has lower throughput than the unquantized version. +``` + +To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ). +Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%. +The main benefits are lower latency and memory usage. + +You can quantize your own models by installing AutoAWQ or picking one of the [400+ models on Huggingface](https://huggingface.co/models?sort=trending&search=awq). + +```console +$ pip install autoawq +``` + +After installing AutoAWQ, you are ready to quantize a model. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`: + +```python +from awq import AutoAWQForCausalLM +from transformers import AutoTokenizer + +model_path = 'mistralai/Mistral-7B-Instruct-v0.2' +quant_path = 'mistral-instruct-v0.2-awq' +quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } + +# Load model +model = AutoAWQForCausalLM.from_pretrained( + model_path, **{"low_cpu_mem_usage": True, "use_cache": False} +) +tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + +# Quantize +model.quantize(tokenizer, quant_config=quant_config) + +# Save quantized model +model.save_quantized(quant_path) +tokenizer.save_pretrained(quant_path) + +print(f'Model is quantized and saved at "{quant_path}"') +``` + +To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command: + +```console +$ python examples/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq +``` + +AWQ models are also supported
directly through the LLM entrypoint: + +```python +from vllm import LLM, SamplingParams + +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +# Create an LLM. +llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ") +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.generate(prompts, sampling_params) +# Print the outputs. +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` diff --git a/docs/source/quantization/auto_awq.rst b/docs/source/quantization/auto_awq.rst deleted file mode 100644 index 8eb6fa2f4cbe1..0000000000000 --- a/docs/source/quantization/auto_awq.rst +++ /dev/null @@ -1,79 +0,0 @@ -.. _auto_awq: - -AutoAWQ -================== - -.. warning:: - - Please note that AWQ support in vLLM is under-optimized at the moment. We would recommend using the unquantized version of the model for better - accuracy and higher throughput. Currently, you can use AWQ as a way to reduce memory footprint. As of now, it is more suitable for low latency - inference with small number of concurrent requests. vLLM's AWQ implementation have lower throughput than unquantized version. - -To create a new 4-bit quantized model, you can leverage `AutoAWQ `_. -Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%. -The main benefits are lower latency and memory usage. - -You can quantize your own models by installing AutoAWQ or picking one of the `400+ models on Huggingface `_. - -.. code-block:: console - - $ pip install autoawq - -After installing AutoAWQ, you are ready to quantize a model. 
Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`: - -.. code-block:: python - - from awq import AutoAWQForCausalLM - from transformers import AutoTokenizer - - model_path = 'mistralai/Mistral-7B-Instruct-v0.2' - quant_path = 'mistral-instruct-v0.2-awq' - quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } - - # Load model - model = AutoAWQForCausalLM.from_pretrained( - model_path, **{"low_cpu_mem_usage": True, "use_cache": False} - ) - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - - # Quantize - model.quantize(tokenizer, quant_config=quant_config) - - # Save quantized model - model.save_quantized(quant_path) - tokenizer.save_pretrained(quant_path) - - print(f'Model is quantized and saved at "{quant_path}"') - -To run an AWQ model with vLLM, you can use `TheBloke/Llama-2-7b-Chat-AWQ `_ with the following command: - -.. code-block:: console - - $ python examples/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq - -AWQ models are also supported directly through the LLM entrypoint: - -.. code-block:: python - - from vllm import LLM, SamplingParams - - # Sample prompts. - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - # Create a sampling params object. - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - - # Create an LLM. - llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ") - # Generate texts from the prompts. The output is a list of RequestOutput objects - # that contain the prompt, generated text, and other information. - outputs = llm.generate(prompts, sampling_params) - # Print the outputs. 
- for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/docs/source/quantization/bnb.md b/docs/source/quantization/bnb.md new file mode 100644 index 0000000000000..8240eca1c7e03 --- /dev/null +++ b/docs/source/quantization/bnb.md @@ -0,0 +1,39 @@ +(bits-and-bytes)= + +# BitsAndBytes + +vLLM now supports [BitsAndBytes](https://github.com/TimDettmers/bitsandbytes) for more efficient model inference. +BitsAndBytes quantizes models to reduce memory usage and enhance performance without significantly sacrificing accuracy. +Compared to other quantization methods, BitsAndBytes eliminates the need for calibrating the quantized model with input data. + +Below are the steps to utilize BitsAndBytes with vLLM. + +```console +$ pip install "bitsandbytes>=0.45.0" +``` + +vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint. + +You can find bitsandbytes quantized models on <https://huggingface.co/models?other=bitsandbytes>. +And usually, these repositories have a config.json file that includes a quantization_config section. + +## Read quantized checkpoint. + +```python +from vllm import LLM +import torch +# unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint. +model_id = "unsloth/tinyllama-bnb-4bit" +llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ +quantization="bitsandbytes", load_format="bitsandbytes") +``` + +## Inflight quantization: load as 4bit quantization + +```python +from vllm import LLM +import torch +model_id = "huggyllama/llama-7b" +llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ +quantization="bitsandbytes", load_format="bitsandbytes") +``` diff --git a/docs/source/quantization/bnb.rst b/docs/source/quantization/bnb.rst deleted file mode 100644 index 84f805bb60c2a..0000000000000 --- a/docs/source/quantization/bnb.rst +++ /dev/null @@ -1,43 +0,0 @@ -..
_bits_and_bytes: - -BitsAndBytes -================== - -vLLM now supports `BitsAndBytes `_ for more efficient model inference. -BitsAndBytes quantizes models to reduce memory usage and enhance performance without significantly sacrificing accuracy. -Compared to other quantization methods, BitsAndBytes eliminates the need for calibrating the quantized model with input data. - -Below are the steps to utilize BitsAndBytes with vLLM. - -.. code-block:: console - - $ pip install bitsandbytes>=0.45.0 - -vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint. - -You can find bitsandbytes quantized models on https://huggingface.co/models?other=bitsandbytes. -And usually, these repositories have a config.json file that includes a quantization_config section. - -Read quantized checkpoint. --------------------------- - -.. code-block:: python - - from vllm import LLM - import torch - # unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint. - model_id = "unsloth/tinyllama-bnb-4bit" - llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ - quantization="bitsandbytes", load_format="bitsandbytes") - -Inflight quantization: load as 4bit quantization ------------------------------------------------- - -.. code-block:: python - - from vllm import LLM - import torch - model_id = "huggyllama/llama-7b" - llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ - quantization="bitsandbytes", load_format="bitsandbytes") - diff --git a/docs/source/quantization/fp8.md b/docs/source/quantization/fp8.md new file mode 100644 index 0000000000000..b2eda74fd1e3b --- /dev/null +++ b/docs/source/quantization/fp8.md @@ -0,0 +1,192 @@ +(fp8)= + +# FP8 W8A8 + +vLLM supports FP8 (8-bit floating point) weight and activation quantization using hardware acceleration on GPUs such as Nvidia H100 and AMD MI300x. +Currently, only Hopper and Ada Lovelace GPUs are officially supported for W8A8. 
+Ampere GPUs are supported for W8A16 (weight-only FP8) utilizing Marlin kernels. +Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy. + +Please visit the HF collection of [quantized FP8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/fp8-llms-for-vllm-666742ed2b78b7ac8df13127). + +The FP8 types typically supported in hardware have two distinct representations, each useful in different scenarios: + +- **E4M3**: Consists of 1 sign bit, 4 exponent bits, and 3 bits of mantissa. It can store values up to +/-448 and `nan`. +- **E5M2**: Consists of 1 sign bit, 5 exponent bits, and 2 bits of mantissa. It can store values up to +/-57344, +/- `inf`, and `nan`. The tradeoff for the increased dynamic range is lower precision of the stored values. + +```{note} +FP8 computation is supported on NVIDIA GPUs with compute capability >= 8.9 (Ada Lovelace, Hopper). +FP8 models will run on compute capability >= 8.0 (Ampere) as weight-only W8A16, utilizing FP8 Marlin. +``` + +## Quick Start with Online Dynamic Quantization + +Dynamic quantization of an original precision BF16/FP16 model to FP8 can be achieved with vLLM without any calibration data required. You can enable the feature by specifying `--quantization="fp8"` in the command line or setting `quantization="fp8"` in the LLM constructor. + +In this mode, all Linear modules (except for the final `lm_head`) have their weights quantized down to FP8_E4M3 precision with a per-tensor scale. Activations have their minimum and maximum values calculated during each forward pass to provide a dynamic per-tensor scale for high accuracy. As a result, latency improvements are limited in this mode.
+ +```python +from vllm import LLM +model = LLM("facebook/opt-125m", quantization="fp8") +# INFO 06-10 17:55:42 model_runner.py:157] Loading model weights took 0.1550 GB +result = model.generate("Hello, my name is") +``` + +```{warning} +Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model. +``` + +## Installation + +To produce performant FP8 quantized models with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library: + +```console +$ pip install llmcompressor +``` + +## Quantization Process + +The quantization process involves three main steps: + +1. Loading the model +2. Applying quantization +3. Evaluating accuracy in vLLM + +### 1. Loading the Model + +Use `SparseAutoModelForCausalLM`, which wraps `AutoModelForCausalLM`, for saving and loading quantized models: + +```python +from llmcompressor.transformers import SparseAutoModelForCausalLM +from transformers import AutoTokenizer + +MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" + +model = SparseAutoModelForCausalLM.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +``` + +### 2. Applying Quantization + +For FP8 quantization, we can recover accuracy with simple RTN quantization. We recommend targeting all `Linear` layers using the `FP8_DYNAMIC` scheme, which uses: + +- Static, per-channel quantization on the weights +- Dynamic, per-token quantization on the activations + +Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow. 
+ +```python +from llmcompressor.transformers import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier + +# Configure the simple PTQ quantization +recipe = QuantizationModifier( + targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]) + +# Apply the quantization algorithm. +oneshot(model=model, recipe=recipe) + +# Save the model. +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" +model.save_pretrained(SAVE_DIR) +tokenizer.save_pretrained(SAVE_DIR) +``` + +### 3. Evaluating Accuracy + +Install `vllm` and `lm-evaluation-harness`: + +```console +$ pip install vllm lm-eval==0.4.4 +``` + +Load and run the model in `vllm`: + +```python +from vllm import LLM +model = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic") +model.generate("Hello my name is") +``` + +Evaluate accuracy with `lm_eval` (for example on 250 samples of `gsm8k`): + +```{note} +Quantized models can be sensitive to the presence of the `bos` token. `lm_eval` does not add a `bos` token by default, so make sure to include the `add_bos_token=True` argument when running your evaluations. +``` + +```console +$ MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic +$ lm_eval \ + --model vllm \ + --model_args pretrained=$MODEL,add_bos_token=True \ + --tasks gsm8k --num_fewshot 5 --batch_size auto --limit 250 +``` + +Here's an example of the resulting scores: + +```text +|Tasks|Version| Filter |n-shot| Metric | |Value| |Stderr| +|-----|------:|----------------|-----:|-----------|---|----:|---|-----:| +|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.768|± |0.0268| +| | |strict-match | 5|exact_match|↑ |0.768|± |0.0268| +``` + +## Troubleshooting and Support + +If you encounter any issues or have feature requests, please open an issue on the `vllm-project/llm-compressor` GitHub repository. + +## Deprecated Flow + +```{note} +The following information is preserved for reference and search purposes. 
+The quantization method described below is deprecated in favor of the `llmcompressor` method described above. +``` + +For static per-tensor offline quantization to FP8, please install the [AutoFP8 library](https://github.com/neuralmagic/autofp8). + +```bash +git clone https://github.com/neuralmagic/AutoFP8.git +pip install -e AutoFP8 +``` + +This package introduces the `AutoFP8ForCausalLM` and `BaseQuantizeConfig` objects for managing how your model will be compressed. + +## Offline Quantization with Static Activation Scaling Factors + +You can use AutoFP8 with calibration data to produce per-tensor static scales for both the weights and activations by enabling the `activation_scheme="static"` argument. + +```python +from datasets import load_dataset +from transformers import AutoTokenizer +from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig + +pretrained_model_dir = "meta-llama/Meta-Llama-3-8B-Instruct" +quantized_model_dir = "Meta-Llama-3-8B-Instruct-FP8" + +tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True) +tokenizer.pad_token = tokenizer.eos_token + +# Load and tokenize 512 dataset samples for calibration of activation scales +ds = load_dataset("mgoin/ultrachat_2k", split="train_sft").select(range(512)) +examples = [tokenizer.apply_chat_template(batch["messages"], tokenize=False) for batch in ds] +examples = tokenizer(examples, padding=True, truncation=True, return_tensors="pt").to("cuda") + +# Define quantization config with static activation scales +quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="static") + +# Load the model, quantize, and save checkpoint +model = AutoFP8ForCausalLM.from_pretrained(pretrained_model_dir, quantize_config) +model.quantize(examples) +model.save_quantized(quantized_model_dir) +``` + +Your model checkpoint with quantized weights and activations should be available at `Meta-Llama-3-8B-Instruct-FP8/`. 
+Finally, you can load the quantized model checkpoint directly in vLLM. + +```python +from vllm import LLM +model = LLM(model="Meta-Llama-3-8B-Instruct-FP8/") +# INFO 06-10 21:15:41 model_runner.py:159] Loading model weights took 8.4596 GB +result = model.generate("Hello, my name is") +``` diff --git a/docs/source/quantization/fp8.rst b/docs/source/quantization/fp8.rst deleted file mode 100644 index 4dbf8e9d346e1..0000000000000 --- a/docs/source/quantization/fp8.rst +++ /dev/null @@ -1,204 +0,0 @@ -.. _fp8: - -FP8 W8A8 -================== - -vLLM supports FP8 (8-bit floating point) weight and activation quantization using hardware acceleration on GPUs such as Nvidia H100 and AMD MI300x. -Currently, only Hopper and Ada Lovelace GPUs are officially supported for W8A8. -Ampere GPUs are supported for W8A16 (weight-only FP8) utilizing Marlin kernels. -Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy. - -Please visit the HF collection of `quantized FP8 checkpoints of popular LLMs ready to use with vLLM `_. - -The FP8 types typically supported in hardware have two distinct representations, each useful in different scenarios: - -- **E4M3**: Consists of 1 sign bit, 4 exponent bits, and 3 bits of mantissa. It can store values up to +/-448 and ``nan``. -- **E5M2**: Consists of 1 sign bit, 5 exponent bits, and 2 bits of mantissa. It can store values up to +/-57344, +/- ``inf``, and ``nan``. The tradeoff for the increased dynamic range is lower precision of the stored values. - -.. note:: - - FP8 computation is supported on NVIDIA GPUs with compute capability > 8.9 (Ada Lovelace, Hopper). - FP8 models will run on compute capability > 8.0 (Ampere) as weight-only W8A16, utilizing FP8 Marlin. 
- -Quick Start with Online Dynamic Quantization --------------------------------------------- - -Dynamic quantization of an original precision BF16/FP16 model to FP8 can be achieved with vLLM without any calibration data required. You can enable the feature by specifying ``--quantization="fp8"`` in the command line or setting ``quantization="fp8"`` in the LLM constructor. - -In this mode, all Linear modules (except for the final ``lm_head``) have their weights quantized down to FP8_E4M3 precision with a per-tensor scale. Activations have their minimum and maximum values calculated during each forward pass to provide a dynamic per-tensor scale for high accuracy. As a result, latency improvements are limited in this mode. - -.. code-block:: python - - from vllm import LLM - model = LLM("facebook/opt-125m", quantization="fp8") - # INFO 06-10 17:55:42 model_runner.py:157] Loading model weights took 0.1550 GB - result = model.generate("Hello, my name is") - -.. warning:: - - Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model. - -Installation ------------- - -To produce performant FP8 quantized models with vLLM, you'll need to install the `llm-compressor `_ library: - -.. code-block:: console - - $ pip install llmcompressor - -Quantization Process --------------------- - -The quantization process involves three main steps: - -1. Loading the model -2. Applying quantization -3. Evaluating accuracy in vLLM - -1. Loading the Model -^^^^^^^^^^^^^^^^^^^^ - -Use ``SparseAutoModelForCausalLM``, which wraps ``AutoModelForCausalLM``, for saving and loading quantized models: - -.. 
code-block:: python - - from llmcompressor.transformers import SparseAutoModelForCausalLM - from transformers import AutoTokenizer - - MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" - - model = SparseAutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto") - tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -2. Applying Quantization -^^^^^^^^^^^^^^^^^^^^^^^^ - -For FP8 quantization, we can recover accuracy with simple RTN quantization. We recommend targeting all ``Linear`` layers using the ``FP8_DYNAMIC`` scheme, which uses: - -- Static, per-channel quantization on the weights -- Dynamic, per-token quantization on the activations - -Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow. - -.. code-block:: python - - from llmcompressor.transformers import oneshot - from llmcompressor.modifiers.quantization import QuantizationModifier - - # Configure the simple PTQ quantization - recipe = QuantizationModifier( - targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]) - - # Apply the quantization algorithm. - oneshot(model=model, recipe=recipe) - - # Save the model. - SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic" - model.save_pretrained(SAVE_DIR) - tokenizer.save_pretrained(SAVE_DIR) - -3. Evaluating Accuracy -^^^^^^^^^^^^^^^^^^^^^^ - -Install ``vllm`` and ``lm-evaluation-harness``: - -.. code-block:: console - - $ pip install vllm lm-eval==0.4.4 - -Load and run the model in ``vllm``: - -.. code-block:: python - - from vllm import LLM - model = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic") - model.generate("Hello my name is") - -Evaluate accuracy with ``lm_eval`` (for example on 250 samples of ``gsm8k``): - -.. note:: - - Quantized models can be sensitive to the presence of the ``bos`` token. 
``lm_eval`` does not add a ``bos`` token by default, so make sure to include the ``add_bos_token=True`` argument when running your evaluations. - -.. code-block:: console - - $ MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic - $ lm_eval \ - --model vllm \ - --model_args pretrained=$MODEL,add_bos_token=True \ - --tasks gsm8k --num_fewshot 5 --batch_size auto --limit 250 - -Here's an example of the resulting scores: - -.. code-block:: text - - |Tasks|Version| Filter |n-shot| Metric | |Value| |Stderr| - |-----|------:|----------------|-----:|-----------|---|----:|---|-----:| - |gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.768|± |0.0268| - | | |strict-match | 5|exact_match|↑ |0.768|± |0.0268| - -Troubleshooting and Support ---------------------------- - -If you encounter any issues or have feature requests, please open an issue on the ``vllm-project/llm-compressor`` GitHub repository. - - -Deprecated Flow ------------------- - -.. note:: - - The following information is preserved for reference and search purposes. - The quantization method described below is deprecated in favor of the ``llmcompressor`` method described above. - -For static per-tensor offline quantization to FP8, please install the `AutoFP8 library `_. - -.. code-block:: bash - - git clone https://github.com/neuralmagic/AutoFP8.git - pip install -e AutoFP8 - -This package introduces the ``AutoFP8ForCausalLM`` and ``BaseQuantizeConfig`` objects for managing how your model will be compressed. - -Offline Quantization with Static Activation Scaling Factors ------------------------------------------------------------ - -You can use AutoFP8 with calibration data to produce per-tensor static scales for both the weights and activations by enabling the ``activation_scheme="static"`` argument. - -.. 
code-block:: python - - from datasets import load_dataset - from transformers import AutoTokenizer - from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig - - pretrained_model_dir = "meta-llama/Meta-Llama-3-8B-Instruct" - quantized_model_dir = "Meta-Llama-3-8B-Instruct-FP8" - - tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True) - tokenizer.pad_token = tokenizer.eos_token - - # Load and tokenize 512 dataset samples for calibration of activation scales - ds = load_dataset("mgoin/ultrachat_2k", split="train_sft").select(range(512)) - examples = [tokenizer.apply_chat_template(batch["messages"], tokenize=False) for batch in ds] - examples = tokenizer(examples, padding=True, truncation=True, return_tensors="pt").to("cuda") - - # Define quantization config with static activation scales - quantize_config = BaseQuantizeConfig(quant_method="fp8", activation_scheme="static") - - # Load the model, quantize, and save checkpoint - model = AutoFP8ForCausalLM.from_pretrained(pretrained_model_dir, quantize_config) - model.quantize(examples) - model.save_quantized(quantized_model_dir) - -Your model checkpoint with quantized weights and activations should be available at ``Meta-Llama-3-8B-Instruct-FP8/``. -Finally, you can load the quantized model checkpoint directly in vLLM. - -.. code-block:: python - - from vllm import LLM - model = LLM(model="Meta-Llama-3-8B-Instruct-FP8/") - # INFO 06-10 21:15:41 model_runner.py:159] Loading model weights took 8.4596 GB - result = model.generate("Hello, my name is") - diff --git a/docs/source/quantization/fp8_e4m3_kvcache.md b/docs/source/quantization/fp8_e4m3_kvcache.md new file mode 100644 index 0000000000000..f200c722d1d42 --- /dev/null +++ b/docs/source/quantization/fp8_e4m3_kvcache.md @@ -0,0 +1,44 @@ +(fp8-e4m3-kvcache)= + +# FP8 E4M3 KV Cache + +Quantizing the KV cache to FP8 reduces its memory footprint. This increases the number of tokens that can be stored in the cache, +improving throughput. 
OCP (Open Compute Project www.opencompute.org) specifies two common 8-bit floating point data formats: E5M2 +(5 exponent bits and 2 mantissa bits) and E4M3FN (4 exponent bits and 3 mantissa bits), often shortened as E4M3. One benefit of +the E4M3 format over E5M2 is that floating point numbers are represented in higher precision. However, the small dynamic range of +FP8 E4M3 (±448.0 can be represented by E4M3FN; ±240.0 by the E4M3FNUZ variant) typically necessitates the use of a higher-precision (typically FP32) scaling factor alongside +each quantized tensor. For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling +factors of a finer granularity (e.g. per-channel). + +These scaling factors can be specified by passing an optional quantization param JSON to the LLM engine at load time. If +this JSON is not specified, scaling factors default to 1.0. These scaling factors are typically obtained when running an +unquantized model through a quantizer tool (e.g. AMD quantizer or NVIDIA AMMO). + +To install AMMO (AlgorithMic Model Optimization): + +```console +$ pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo +``` + +Studies have shown that FP8 E4M3 quantization typically only minimally degrades inference accuracy. The most recent silicon +offerings, e.g. AMD MI300, NVIDIA Hopper or later, support native hardware conversion to and from fp32, fp16, bf16, etc. +Thus, LLM inference is greatly accelerated with minimal accuracy loss. + +Here is an example of how to enable this feature: + +```python +# two float8_e4m3fn kv cache scaling factor files are provided under tests/fp8_kv, please refer to +# https://github.com/vllm-project/vllm/blob/main/examples/fp8/README.md to generate kv_cache_scales.json of your own. 
+ +from vllm import LLM, SamplingParams +sampling_params = SamplingParams(temperature=1.3, top_p=0.8) +llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", + kv_cache_dtype="fp8", + quantization_param_path="./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json") +prompt = "London is the capital of" +out = llm.generate(prompt, sampling_params)[0].outputs[0].text +print(out) + +# output w/ scaling factors: England, the United Kingdom, and one of the world's leading financial, +# output w/o scaling factors: England, located in the southeastern part of the country. It is known +``` diff --git a/docs/source/quantization/fp8_e4m3_kvcache.rst b/docs/source/quantization/fp8_e4m3_kvcache.rst deleted file mode 100644 index cc52d8f40af8f..0000000000000 --- a/docs/source/quantization/fp8_e4m3_kvcache.rst +++ /dev/null @@ -1,47 +0,0 @@ -.. _fp8_e4m3_kvcache: - -FP8 E4M3 KV Cache -================== - -Quantizing the KV cache to FP8 reduces its memory footprint. This increases the number of tokens that can be stored in the cache, -improving throughput. OCP (Open Compute Project www.opencompute.org) specifies two common 8-bit floating point data formats: E5M2 -(5 exponent bits and 2 mantissa bits) and E4M3FN (4 exponent bits and 3 mantissa bits), often shortened as E4M3. One benefit of -the E4M3 format over E5M2 is that floating point numbers are represented in higher precision. However, the small dynamic range of -FP8 E4M3 (±240.0 can be represented) typically necessitates the use of a higher-precision (typically FP32) scaling factor alongside -each quantized tensor. For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling -factors of a finer granularity (e.g. per-channel). - -These scaling factors can be specified by passing an optional quantization param JSON to the LLM engine at load time. If -this JSON is not specified, scaling factors default to 1.0. 
These scaling factors are typically obtained when running an -unquantized model through a quantizer tool (e.g. AMD quantizer or NVIDIA AMMO). - -To install AMMO (AlgorithMic Model Optimization): - -.. code-block:: console - - $ pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo - -Studies have shown that FP8 E4M3 quantization typically only minimally degrades inference accuracy. The most recent silicon -offerings e.g. AMD MI300, NVIDIA Hopper or later support native hardware conversion to and from fp32, fp16, bf16, etc. -Thus, LLM inference is greatly accelerated with minimal accuracy loss. - - -Here is an example of how to enable this feature: - -.. code-block:: python - - # two float8_e4m3fn kv cache scaling factor files are provided under tests/fp8_kv, please refer to - # https://github.com/vllm-project/vllm/blob/main/examples/fp8/README.md to generate kv_cache_scales.json of your own. - - from vllm import LLM, SamplingParams - sampling_params = SamplingParams(temperature=1.3, top_p=0.8) - llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", - kv_cache_dtype="fp8", - quantization_param_path="./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json") - prompt = "London is the capital of" - out = llm.generate(prompt, sampling_params)[0].outputs[0].text - print(out) - - # output w/ scaling factors: England, the United Kingdom, and one of the world's leading financial, - # output w/o scaling factors: England, located in the southeastern part of the country. It is known - diff --git a/docs/source/quantization/fp8_e5m2_kvcache.md b/docs/source/quantization/fp8_e5m2_kvcache.md new file mode 100644 index 0000000000000..3a81ab17f332f --- /dev/null +++ b/docs/source/quantization/fp8_e5m2_kvcache.md @@ -0,0 +1,31 @@ +(fp8-kv-cache)= + +# FP8 E5M2 KV Cache + +The int8/int4 quantization scheme requires additional scale GPU memory storage, which reduces the expected GPU memory benefits. 
+The FP8 data format retains 2~3 mantissa bits and can convert float/fp16/bfloat16 and fp8 to each other. + +Here is an example of how to enable this feature: + +```python +from vllm import LLM, SamplingParams +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) +# Create an LLM. +llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8") +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.generate(prompts, sampling_params) +# Print the outputs. +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` diff --git a/docs/source/quantization/fp8_e5m2_kvcache.rst b/docs/source/quantization/fp8_e5m2_kvcache.rst deleted file mode 100644 index b2d824427f786..0000000000000 --- a/docs/source/quantization/fp8_e5m2_kvcache.rst +++ /dev/null @@ -1,34 +0,0 @@ -.. _fp8_kv_cache: - -FP8 E5M2 KV Cache -================== - -The int8/int4 quantization scheme requires additional scale GPU memory storage, which reduces the expected GPU memory benefits. -The FP8 data format retains 2~3 mantissa bits and can convert float/fp16/bfloat16 and fp8 to each other. - -Here is an example of how to enable this feature: - -.. code-block:: python - - from vllm import LLM, SamplingParams - # Sample prompts. - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - # Create a sampling params object. - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - # Create an LLM. - llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8") - # Generate texts from the prompts. 
The output is a list of RequestOutput objects - # that contain the prompt, generated text, and other information. - outputs = llm.generate(prompts, sampling_params) - # Print the outputs. - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - - diff --git a/docs/source/quantization/gguf.md b/docs/source/quantization/gguf.md new file mode 100644 index 0000000000000..eebf11dfc1b2b --- /dev/null +++ b/docs/source/quantization/gguf.md @@ -0,0 +1,72 @@ +(gguf)= + +# GGUF + +```{warning} +Please note that GGUF support in vLLM is highly experimental and under-optimized at the moment, it might be incompatible with other features. Currently, you can use GGUF as a way to reduce memory footprint. If you encounter any issues, please report them to the vLLM team. +``` + +```{warning} +Currently, vllm only supports loading single-file GGUF models. If you have a multi-files GGUF model, you can use [gguf-split](https://github.com/ggerganov/llama.cpp/pull/6135) tool to merge them to a single-file model. +``` + +To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command: + +```console +$ wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf +$ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. +$ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 +``` + +You can also add `--tensor-parallel-size 2` to enable tensor parallelism inference with 2 GPUs: + +```console +$ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. 
+$ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2 +``` + +```{warning} +We recommend using the tokenizer from the base model instead of the GGUF model, because the tokenizer conversion from GGUF is time-consuming and unstable, especially for models with a large vocabulary. +``` + +You can also use the GGUF model directly through the LLM entrypoint: + +```python +from vllm import LLM, SamplingParams + +# In this script, we demonstrate how to pass input to the chat method: +conversation = [ + { + "role": "system", + "content": "You are a helpful assistant" + }, + { + "role": "user", + "content": "Hello" + }, + { + "role": "assistant", + "content": "Hello! How can I assist you today?" + }, + { + "role": "user", + "content": "Write an essay about the importance of higher education.", + }, +] + +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +# Create an LLM. +llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", + tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0") +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.chat(conversation, sampling_params) + +# Print the outputs. +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` diff --git a/docs/source/quantization/gguf.rst b/docs/source/quantization/gguf.rst deleted file mode 100644 index 9f00dc5563909..0000000000000 --- a/docs/source/quantization/gguf.rst +++ /dev/null @@ -1,73 +0,0 @@ -.. _gguf: - -GGUF -================== - -.. warning:: - - Please note that GGUF support in vLLM is highly experimental and under-optimized at the moment, it might be incompatible with other features. Currently, you can use GGUF as a way to reduce memory footprint. 
If you encounter any issues, please report them to the vLLM team. - -.. warning:: - - Currently, vllm only supports loading single-file GGUF models. If you have a multi-files GGUF model, you can use `gguf-split `_ tool to merge them to a single-file model. - -To run a GGUF model with vLLM, you can download and use the local GGUF model from `TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF `_ with the following command: - -.. code-block:: console - - $ wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf - $ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. - $ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 - -You can also add ``--tensor-parallel-size 2`` to enable tensor parallelism inference with 2 GPUs: - -.. code-block:: console - - $ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. - $ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2 - -.. warning:: - - We recommend using the tokenizer from base model instead of GGUF model. Because the tokenizer conversion from GGUF is time-consuming and unstable, especially for some models with large vocab size. - -You can also use the GGUF model directly through the LLM entrypoint: - -.. code-block:: python - - from vllm import LLM, SamplingParams - - # In this script, we demonstrate how to pass input to the chat method: - conversation = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": "Hello" - }, - { - "role": "assistant", - "content": "Hello! How can I assist you today?" - }, - { - "role": "user", - "content": "Write an essay about the importance of higher education.", - }, - ] - - # Create a sampling params object. 
- sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - - # Create an LLM. - llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", - tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0") - # Generate texts from the prompts. The output is a list of RequestOutput objects - # that contain the prompt, generated text, and other information. - outputs = llm.chat(conversation, sampling_params) - - # Print the outputs. - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/docs/source/quantization/int8.md b/docs/source/quantization/int8.md new file mode 100644 index 0000000000000..1ac50ba987dda --- /dev/null +++ b/docs/source/quantization/int8.md @@ -0,0 +1,136 @@ +(int8)= + +# INT8 W8A8 + +vLLM supports quantizing weights and activations to INT8 for memory savings and inference acceleration. +This quantization method is particularly useful for reducing model size while maintaining good performance. + +Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int8-llms-for-vllm-668ec32c049dca0369816415). + +```{note} +INT8 computation is supported on NVIDIA GPUs with compute capability >= 7.5 (Turing, Ampere, Ada Lovelace, Hopper). +``` + +## Prerequisites + +To use INT8 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library: + +```console +$ pip install llmcompressor +``` + +## Quantization Process + +The quantization process involves four main steps: + +1. Loading the model +2. Preparing calibration data +3. Applying quantization +4. Evaluating accuracy in vLLM + +### 1. 
Loading the Model + +Use `SparseAutoModelForCausalLM`, which wraps `AutoModelForCausalLM`, for saving and loading quantized models: + +```python +from llmcompressor.transformers import SparseAutoModelForCausalLM +from transformers import AutoTokenizer + +MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" +model = SparseAutoModelForCausalLM.from_pretrained( + MODEL_ID, device_map="auto", torch_dtype="auto", +) +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +``` + +### 2. Preparing Calibration Data + +When quantizing activations to INT8, you need sample data to estimate the activation scales. +It's best to use calibration data that closely matches your deployment data. +For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`: + +```python +from datasets import load_dataset + +NUM_CALIBRATION_SAMPLES = 512 +MAX_SEQUENCE_LENGTH = 2048 + +# Load and preprocess the dataset +ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") +ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) + +def preprocess(example): + return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)} +ds = ds.map(preprocess) + +def tokenize(sample): + return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False) +ds = ds.map(tokenize, remove_columns=ds.column_names) +``` + +### 3. 
Applying Quantization + +Now, apply the quantization algorithms: + +```python +from llmcompressor.transformers import oneshot +from llmcompressor.modifiers.quantization import GPTQModifier +from llmcompressor.modifiers.smoothquant import SmoothQuantModifier + +# Configure the quantization algorithms +recipe = [ + SmoothQuantModifier(smoothing_strength=0.8), + GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]), +] + +# Apply quantization +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, +) + +# Save the compressed model +SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) +``` + +This process creates a W8A8 model with weights and activations quantized to 8-bit integers. + +### 4. Evaluating Accuracy + +After quantization, you can load and run the model in vLLM: + +```python +from vllm import LLM +model = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token") +``` + +To evaluate accuracy, you can use `lm_eval`: + +```console +$ lm_eval --model vllm \ + --model_args pretrained="./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token",add_bos_token=true \ + --tasks gsm8k \ + --num_fewshot 5 \ + --limit 250 \ + --batch_size 'auto' +``` + +```{note} +Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations. 
+``` + +## Best Practices + +- Start with 512 samples for calibration data (increase if accuracy drops) +- Use a sequence length of 2048 as a starting point +- Employ the chat template or instruction template that the model was trained with +- If you've fine-tuned a model, consider using a sample of your training data for calibration + +## Troubleshooting and Support + +If you encounter any issues or have feature requests, please open an issue on the `vllm-project/llm-compressor` GitHub repository. diff --git a/docs/source/quantization/int8.rst b/docs/source/quantization/int8.rst deleted file mode 100644 index aa5b251becb1c..0000000000000 --- a/docs/source/quantization/int8.rst +++ /dev/null @@ -1,145 +0,0 @@ -.. _int8: - -INT8 W8A8 -================== - -vLLM supports quantizing weights and activations to INT8 for memory savings and inference acceleration. -This quantization method is particularly useful for reducing model size while maintaining good performance. - -Please visit the HF collection of `quantized INT8 checkpoints of popular LLMs ready to use with vLLM `_. - -.. note:: - - INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper). - -Prerequisites -------------- - -To use INT8 quantization with vLLM, you'll need to install the `llm-compressor `_ library: - -.. code-block:: console - - $ pip install llmcompressor - -Quantization Process --------------------- - -The quantization process involves four main steps: - -1. Loading the model -2. Preparing calibration data -3. Applying quantization -4. Evaluating accuracy in vLLM - -1. Loading the Model -^^^^^^^^^^^^^^^^^^^^ - -Use ``SparseAutoModelForCausalLM``, which wraps ``AutoModelForCausalLM``, for saving and loading quantized models: - -.. 
code-block:: python - - from llmcompressor.transformers import SparseAutoModelForCausalLM - from transformers import AutoTokenizer - - MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" - model = SparseAutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", - ) - tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - -2. Preparing Calibration Data -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -When quantizing activations to INT8, you need sample data to estimate the activation scales. -It's best to use calibration data that closely matches your deployment data. -For a general-purpose instruction-tuned model, you can use a dataset like ``ultrachat``: - -.. code-block:: python - - from datasets import load_dataset - - NUM_CALIBRATION_SAMPLES = 512 - MAX_SEQUENCE_LENGTH = 2048 - - # Load and preprocess the dataset - ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft") - ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) - - def preprocess(example): - return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)} - ds = ds.map(preprocess) - - def tokenize(sample): - return tokenizer(sample["text"], padding=False, max_length=MAX_SEQUENCE_LENGTH, truncation=True, add_special_tokens=False) - ds = ds.map(tokenize, remove_columns=ds.column_names) - -3. Applying Quantization -^^^^^^^^^^^^^^^^^^^^^^^^ - -Now, apply the quantization algorithms: - -.. 
code-block:: python - - from llmcompressor.transformers import oneshot - from llmcompressor.modifiers.quantization import GPTQModifier - from llmcompressor.modifiers.smoothquant import SmoothQuantModifier - - # Configure the quantization algorithms - recipe = [ - SmoothQuantModifier(smoothing_strength=0.8), - GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]), - ] - - # Apply quantization - oneshot( - model=model, - dataset=ds, - recipe=recipe, - max_seq_length=MAX_SEQUENCE_LENGTH, - num_calibration_samples=NUM_CALIBRATION_SAMPLES, - ) - - # Save the compressed model - SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic-Per-Token" - model.save_pretrained(SAVE_DIR, save_compressed=True) - tokenizer.save_pretrained(SAVE_DIR) - -This process creates a W8A8 model with weights and activations quantized to 8-bit integers. - -4. Evaluating Accuracy -^^^^^^^^^^^^^^^^^^^^^^ - -After quantization, you can load and run the model in vLLM: - -.. code-block:: python - - from vllm import LLM - model = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token") - -To evaluate accuracy, you can use ``lm_eval``: - -.. code-block:: console - - $ lm_eval --model vllm \ - --model_args pretrained="./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token",add_bos_token=true \ - --tasks gsm8k \ - --num_fewshot 5 \ - --limit 250 \ - --batch_size 'auto' - -.. note:: - - Quantized models can be sensitive to the presence of the ``bos`` token. Make sure to include the ``add_bos_token=True`` argument when running evaluations. 
- -Best Practices --------------- - -- Start with 512 samples for calibration data (increase if accuracy drops) -- Use a sequence length of 2048 as a starting point -- Employ the chat template or instruction template that the model was trained with -- If you've fine-tuned a model, consider using a sample of your training data for calibration - -Troubleshooting and Support ---------------------------- - -If you encounter any issues or have feature requests, please open an issue on the ``vllm-project/llm-compressor`` GitHub repository. diff --git a/docs/source/quantization/supported_hardware.rst b/docs/source/quantization/supported_hardware.md similarity index 84% rename from docs/source/quantization/supported_hardware.rst rename to docs/source/quantization/supported_hardware.md index 09f8e7112cf0c..d2160772a24cb 100644 --- a/docs/source/quantization/supported_hardware.rst +++ b/docs/source/quantization/supported_hardware.md @@ -1,132 +1,132 @@ -.. _supported_hardware_for_quantization: - -Supported Hardware for Quantization Kernels -=========================================== - -The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: - -.. 
list-table:: - :header-rows: 1 - :widths: 20 8 8 8 8 8 8 8 8 8 8 - - * - Implementation - - Volta - - Turing - - Ampere - - Ada - - Hopper - - AMD GPU - - Intel GPU - - x86 CPU - - AWS Inferentia - - Google TPU - * - AWQ - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - * - GPTQ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - * - Marlin (GPTQ/AWQ/FP8) - - ✗ - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - * - INT8 (W8A8) - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✅︎ - - ✗ - - ✗ - * - FP8 (W8A8) - - ✗ - - ✗ - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - * - AQLM - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - * - bitsandbytes - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - * - DeepSpeedFP - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - * - GGUF - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - -Notes: -^^^^^^ - -- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. -- "✅︎" indicates that the quantization method is supported on the specified hardware. -- "✗" indicates that the quantization method is not supported on the specified hardware. - -Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. - -For the most up-to-date information on hardware support and quantization methods, please check the `quantization directory `_ or consult with the vLLM development team. +(supported-hardware-for-quantization)= + +# Supported Hardware for Quantization Kernels + +The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: + +```{eval-rst} +.. 
list-table:: + :header-rows: 1 + :widths: 20 8 8 8 8 8 8 8 8 8 8 + + * - Implementation + - Volta + - Turing + - Ampere + - Ada + - Hopper + - AMD GPU + - Intel GPU + - x86 CPU + - AWS Inferentia + - Google TPU + * - AWQ + - ✗ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + * - GPTQ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + * - Marlin (GPTQ/AWQ/FP8) + - ✗ + - ✗ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + * - INT8 (W8A8) + - ✗ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✅︎ + - ✗ + - ✗ + * - FP8 (W8A8) + - ✗ + - ✗ + - ✗ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + * - AQLM + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + * - bitsandbytes + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + * - DeepSpeedFP + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + * - GGUF + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ +``` + +## Notes: + +- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. +- "✅︎" indicates that the quantization method is supported on the specified hardware. +- "✗" indicates that the quantization method is not supported on the specified hardware. + +Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. + +For the most up-to-date information on hardware support and quantization methods, please check the [quantization directory](https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization) or consult with the vLLM development team. 
diff --git a/docs/source/serving/deploying_with_bentoml.md b/docs/source/serving/deploying_with_bentoml.md new file mode 100644 index 0000000000000..dfa0de4f0f6d7 --- /dev/null +++ b/docs/source/serving/deploying_with_bentoml.md @@ -0,0 +1,7 @@ +(deploying-with-bentoml)= + +# Deploying with BentoML + +[BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes. + +For details, see the tutorial [vLLM inference in the BentoML documentation](https://docs.bentoml.com/en/latest/use-cases/large-language-models/vllm.html). diff --git a/docs/source/serving/deploying_with_bentoml.rst b/docs/source/serving/deploying_with_bentoml.rst deleted file mode 100644 index 4b9d19f5bdb72..0000000000000 --- a/docs/source/serving/deploying_with_bentoml.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. _deploying_with_bentoml: - -Deploying with BentoML -====================== - -`BentoML `_ allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-complicant image and deploy it on Kubernetes. - -For details, see the tutorial `vLLM inference in the BentoML documentation `_. \ No newline at end of file diff --git a/docs/source/serving/deploying_with_cerebrium.md b/docs/source/serving/deploying_with_cerebrium.md new file mode 100644 index 0000000000000..4863936236119 --- /dev/null +++ b/docs/source/serving/deploying_with_cerebrium.md @@ -0,0 +1,109 @@ +(deploying-with-cerebrium)= + +# Deploying with Cerebrium + +```{raw} html +

+ vLLM_plus_cerebrium +

+``` + +vLLM can be run on a cloud based GPU machine with [Cerebrium](https://www.cerebrium.ai/), a serverless AI infrastructure platform that makes it easier for companies to build and deploy AI based applications. + +To install the Cerebrium client, run: + +```console +$ pip install cerebrium +$ cerebrium login +``` + +Next, create your Cerebrium project, run: + +```console +$ cerebrium init vllm-project +``` + +Next, to install the required packages, add the following to your `cerebrium.toml`: + +```toml +[cerebrium.deployment] +docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04" + +[cerebrium.dependencies.pip] +vllm = "latest" +``` + +Next, let us add our code to handle inference for the LLM of your choice (`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your `main.py`: + +```python +from vllm import LLM, SamplingParams + +llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1") + +def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95): + + sampling_params = SamplingParams(temperature=temperature, top_p=top_p) + outputs = llm.generate(prompts, sampling_params) + + # Print the outputs. + results = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + results.append({"prompt": prompt, "generated_text": generated_text}) + + return {"results": results} +``` + +Then, run the following command to deploy it to the cloud: + +```console +$ cerebrium deploy +``` + +If successful, you should be returned a CURL command that you can call inference against. 
Just remember to end the URL with the function name you are calling (in our case `/run`): + +```console +curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ + -H 'Content-Type: application/json' \ + -H 'Authorization: ' \ + --data '{ + "prompts": [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is" + ] + }' +``` + +You should get a response like: + +```json +{ + "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262", + "result": { + "result": [ + { + "prompt": "Hello, my name is", + "generated_text": " Sarah, and I'm a teacher. I teach elementary school students. One of" + }, + { + "prompt": "The president of the United States is", + "generated_text": " elected every four years. This is a democratic system.\n\n5. What" + }, + { + "prompt": "The capital of France is", + "generated_text": " Paris.\n" + }, + { + "prompt": "The future of AI is", + "generated_text": " bright, but it's important to approach it with a balanced and nuanced perspective." + } + ] + }, + "run_time_ms": 152.53663063049316 +} +``` + +You now have an autoscaling endpoint where you only pay for the compute you use! diff --git a/docs/source/serving/deploying_with_cerebrium.rst b/docs/source/serving/deploying_with_cerebrium.rst deleted file mode 100644 index 9585b6ef5cb38..0000000000000 --- a/docs/source/serving/deploying_with_cerebrium.rst +++ /dev/null @@ -1,112 +0,0 @@ -.. _deploying_with_cerebrium: - -Deploying with Cerebrium -============================ - -.. raw:: html - -

- vLLM_plus_cerebrium -

- -vLLM can be run on a cloud based GPU machine with `Cerebrium `__, a serverless AI infrastructure platform that makes it easier for companies to build and deploy AI based applications. - -To install the Cerebrium client, run: - -.. code-block:: console - - $ pip install cerebrium - $ cerebrium login - -Next, create your Cerebrium project, run: - -.. code-block:: console - - $ cerebrium init vllm-project - -Next, to install the required packages, add the following to your cerebrium.toml: - -.. code-block:: toml - - [cerebrium.deployment] - docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04" - - [cerebrium.dependencies.pip] - vllm = "latest" - -Next, let us add our code to handle inference for the LLM of your choice(`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your main.py`: - -.. code-block:: python - - from vllm import LLM, SamplingParams - - llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1") - - def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95): - - sampling_params = SamplingParams(temperature=temperature, top_p=top_p) - outputs = llm.generate(prompts, sampling_params) - - # Print the outputs. - results = [] - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - results.append({"prompt": prompt, "generated_text": generated_text}) - - return {"results": results} - - -Then, run the following code to deploy it to the cloud - -.. code-block:: console - - $ cerebrium deploy - -If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case /run) - -.. 
code-block:: python - - curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ - -H 'Content-Type: application/json' \ - -H 'Authorization: ' \ - --data '{ - "prompts": [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is" - ] - }' - -You should get a response like: - -.. code-block:: python - - { - "run_id": "52911756-3066-9ae8-bcc9-d9129d1bd262", - "result": { - "result": [ - { - "prompt": "Hello, my name is", - "generated_text": " Sarah, and I'm a teacher. I teach elementary school students. One of" - }, - { - "prompt": "The president of the United States is", - "generated_text": " elected every four years. This is a democratic system.\n\n5. What" - }, - { - "prompt": "The capital of France is", - "generated_text": " Paris.\n" - }, - { - "prompt": "The future of AI is", - "generated_text": " bright, but it's important to approach it with a balanced and nuanced perspective." - } - ] - }, - "run_time_ms": 152.53663063049316 - } - -You now have an autoscaling endpoint where you only pay for the compute you use! - diff --git a/docs/source/serving/deploying_with_docker.md b/docs/source/serving/deploying_with_docker.md new file mode 100644 index 0000000000000..2d8ceed8cecfd --- /dev/null +++ b/docs/source/serving/deploying_with_docker.md @@ -0,0 +1,81 @@ +(deploying-with-docker)= + +# Deploying with Docker + +## Use vLLM's Official Docker Image + +vLLM offers an official Docker image for deployment. +The image can be used to run OpenAI compatible server and is available on Docker Hub as [vllm/vllm-openai](https://hub.docker.com/r/vllm/vllm-openai/tags). 
+ +```console +$ docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + --env "HUGGING_FACE_HUB_TOKEN=" \ + -p 8000:8000 \ + --ipc=host \ + vllm/vllm-openai:latest \ + --model mistralai/Mistral-7B-v0.1 +``` + +```{note} +You can either use the `ipc=host` flag or `--shm-size` flag to allow the +container to access the host's shared memory. vLLM uses PyTorch, which uses shared +memory to share data between processes under the hood, particularly for tensor parallel inference. +``` + +## Building vLLM's Docker Image from Source + +You can build and run vLLM from source via the provided [Dockerfile](https://github.com/vllm-project/vllm/blob/main/Dockerfile). To build vLLM: + +```console +$ # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 +$ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai +``` + +```{note} +By default vLLM will build for all GPU types for widest distribution. If you are just building for the +current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""` +for vLLM to find the current GPU type and build for that. +``` + +## Building for Arm64/aarch64 + +A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this requires the use +of PyTorch Nightly and should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64. + +```{note} +Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=` +flags to speed up build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefits. +Keep an eye on memory usage with parallel jobs as it can be substantial (see example below). +``` + +```console +# Example of building on Nvidia GH200 server. 
(Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB) +$ python3 use_existing_torch.py +$ DOCKER_BUILDKIT=1 docker build . \ + --target vllm-openai \ + --platform "linux/arm64" \ + -t vllm/vllm-gh200-openai:latest \ + --build-arg max_jobs=66 \ + --build-arg nvcc_threads=2 \ + --build-arg torch_cuda_arch_list="9.0+PTX" \ + --build-arg vllm_fa_cmake_gpu_arches="90-real" +``` + +## Use the custom-built vLLM Docker image + +To run vLLM with the custom-built Docker image: + +```console +$ docker run --runtime nvidia --gpus all \ + -v ~/.cache/huggingface:/root/.cache/huggingface \ + -p 8000:8000 \ + --env "HUGGING_FACE_HUB_TOKEN=" \ + vllm/vllm-openai +``` + +The argument `vllm/vllm-openai` specifies the image to run, and should be replaced with the name of the custom-built image (the `-t` tag from the build command). + +```{note} +**For version 0.4.1 and 0.4.2 only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. `/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` . +``` diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst deleted file mode 100644 index b64eef819cd2e..0000000000000 --- a/docs/source/serving/deploying_with_docker.rst +++ /dev/null @@ -1,88 +0,0 @@ -.. _deploying_with_docker: - -Deploying with Docker -============================ - -Use vLLM's Official Docker Image --------------------------------- - -vLLM offers an official Docker image for deployment. -The image can be used to run OpenAI compatible server and is available on Docker Hub as `vllm/vllm-openai `_. 
- -.. code-block:: console - - $ docker run --runtime nvidia --gpus all \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - --env "HUGGING_FACE_HUB_TOKEN=" \ - -p 8000:8000 \ - --ipc=host \ - vllm/vllm-openai:latest \ - --model mistralai/Mistral-7B-v0.1 - - -.. note:: - - You can either use the ``ipc=host`` flag or ``--shm-size`` flag to allow the - container to access the host's shared memory. vLLM uses PyTorch, which uses shared - memory to share data between processes under the hood, particularly for tensor parallel inference. - - -Building vLLM's Docker Image from Source ----------------------------------------- - -You can build and run vLLM from source via the provided `Dockerfile `_. To build vLLM: - -.. code-block:: console - - $ # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 - $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai - -.. note:: - - By default vLLM will build for all GPU types for widest distribution. If you are just building for the - current GPU type the machine is running on, you can add the argument ``--build-arg torch_cuda_arch_list=""`` - for vLLM to find the current GPU type and build for that. - -Building for Arm64/aarch64 --------------------------- - -A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this requires the use -of PyTorch Nightly and should be considered **experimental**. Using the flag ``--platform "linux/arm64"`` will attempt to build for arm64. - -.. note:: - - Multiple modules must be compiled, so this process can take a while. Recommend using ``--build-arg max_jobs=`` & ``--build-arg nvcc_threads=`` - flags to speed up build process. However, ensure your ``max_jobs`` is substantially larger than ``nvcc_threads`` to get the most benefits. - Keep an eye on memory usage with parallel jobs as it can be substantial (see example below). - -.. 
code-block:: console - - # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB) - $ python3 use_existing_torch.py - $ DOCKER_BUILDKIT=1 docker build . \ - --target vllm-openai \ - --platform "linux/arm64" \ - -t vllm/vllm-gh200-openai:latest \ - --build-arg max_jobs=66 \ - --build-arg nvcc_threads=2 \ - --build-arg torch_cuda_arch_list="9.0+PTX" \ - --build-arg vllm_fa_cmake_gpu_arches="90-real" - -Use the custom-built vLLM Docker image --------------------------------------- - -To run vLLM with the custom-built Docker image: - -.. code-block:: console - - $ docker run --runtime nvidia --gpus all \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - -p 8000:8000 \ - --env "HUGGING_FACE_HUB_TOKEN=" \ - vllm/vllm-openai - -The argument ``vllm/vllm-openai`` specifies the image to run, and should be replaced with the name of the custom-built image (the ``-t`` tag from the build command). - -.. note:: - - **For version 0.4.1 and 0.4.2 only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. ``/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable ``VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` . diff --git a/docs/source/serving/deploying_with_dstack.md b/docs/source/serving/deploying_with_dstack.md new file mode 100644 index 0000000000000..65ef1c0016208 --- /dev/null +++ b/docs/source/serving/deploying_with_dstack.md @@ -0,0 +1,102 @@ +(deploying-with-dstack)= + +# Deploying with dstack + +```{raw} html +

+ vLLM_plus_dstack +

+``` + +vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/), an open-source framework for running LLMs on any cloud. This tutorial assumes that you have already configured credentials, gateway, and GPU quotas on your cloud environment. + +To install the dstack client, run: + +```console +$ pip install "dstack[all]" +$ dstack server +``` + +Next, to configure your dstack project, run: + +```console +$ mkdir -p vllm-dstack +$ cd vllm-dstack +$ dstack init +``` + +Next, to provision a VM instance with the LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`: + +```yaml +type: service + +python: "3.11" +env: + - MODEL=NousResearch/Llama-2-7b-chat-hf +port: 8000 +resources: + gpu: 24GB +commands: + - pip install vllm + - vllm serve $MODEL --port 8000 +model: + format: openai + type: chat + name: NousResearch/Llama-2-7b-chat-hf +``` + +Then, run the following CLI command for provisioning: + +```console +$ dstack run . -f serve.dstack.yml + +⠸ Getting run plan... + Configuration serve.dstack.yml + Project deep-diver-main + User deep-diver + Min resources 2..xCPU, 8GB.., 1xGPU (24GB) + Max price - + Max duration - + Spot policy auto + Retry policy no + + # BACKEND REGION INSTANCE RESOURCES SPOT PRICE + 1 gcp us-central1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 + 2 gcp us-east1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 + 3 gcp us-west1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 + ... + Shown 3 of 193 offers, $5.876 max + +Continue? [y/n]: y +⠙ Submitting run... +⠏ Launching spicy-treefrog-1 (pulling) +spicy-treefrog-1 provisioning completed (running) +Service is published at ... 
+``` + +After the provisioning, you can interact with the model by using the OpenAI SDK: + +```python +from openai import OpenAI + +client = OpenAI( + base_url="https://gateway.", + api_key="" +) + +completion = client.chat.completions.create( + model="NousResearch/Llama-2-7b-chat-hf", + messages=[ + { + "role": "user", + "content": "Compose a poem that explains the concept of recursion in programming.", + } + ] +) + +print(completion.choices[0].message.content) +``` + +```{note} +dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision dstack `Task` instead of `Service`. The `Task` is for development purpose only. If you want to know more about hands-on materials how to serve vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm) +``` diff --git a/docs/source/serving/deploying_with_dstack.rst b/docs/source/serving/deploying_with_dstack.rst deleted file mode 100644 index e1eb45b225d9c..0000000000000 --- a/docs/source/serving/deploying_with_dstack.rst +++ /dev/null @@ -1,103 +0,0 @@ -.. _deploying_with_dstack: - -Deploying with dstack -============================ - -.. raw:: html - -

- vLLM_plus_dstack -

- -vLLM can be run on a cloud based GPU machine with `dstack `__, an open-source framework for running LLMs on any cloud. This tutorial assumes that you have already configured credentials, gateway, and GPU quotas on your cloud environment. - -To install dstack client, run: - -.. code-block:: console - - $ pip install "dstack[all] - $ dstack server - -Next, to configure your dstack project, run: - -.. code-block:: console - - $ mkdir -p vllm-dstack - $ cd vllm-dstack - $ dstack init - -Next, to provision a VM instance with LLM of your choice(`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`: - -.. code-block:: yaml - - type: service - - python: "3.11" - env: - - MODEL=NousResearch/Llama-2-7b-chat-hf - port: 8000 - resources: - gpu: 24GB - commands: - - pip install vllm - - vllm serve $MODEL --port 8000 - model: - format: openai - type: chat - name: NousResearch/Llama-2-7b-chat-hf - -Then, run the following CLI for provisioning: - -.. code-block:: console - - $ dstack run . -f serve.dstack.yml - - ⠸ Getting run plan... - Configuration serve.dstack.yml - Project deep-diver-main - User deep-diver - Min resources 2..xCPU, 8GB.., 1xGPU (24GB) - Max price - - Max duration - - Spot policy auto - Retry policy no - - # BACKEND REGION INSTANCE RESOURCES SPOT PRICE - 1 gcp us-central1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 - 2 gcp us-east1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 - 3 gcp us-west1 g2-standard-4 4xCPU, 16GB, 1xL4 (24GB), 100GB (disk) yes $0.223804 - ... - Shown 3 of 193 offers, $5.876 max - - Continue? [y/n]: y - ⠙ Submitting run... - ⠏ Launching spicy-treefrog-1 (pulling) - spicy-treefrog-1 provisioning completed (running) - Service is published at ... - -After the provisioning, you can interact with the model by using the OpenAI SDK: - -.. 
code-block:: python - - from openai import OpenAI - - client = OpenAI( - base_url="https://gateway.", - api_key="" - ) - - completion = client.chat.completions.create( - model="NousResearch/Llama-2-7b-chat-hf", - messages=[ - { - "role": "user", - "content": "Compose a poem that explains the concept of recursion in programming.", - } - ] - ) - - print(completion.choices[0].message.content) - -.. note:: - - dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision dstack `Task` instead of `Service`. The `Task` is for development purpose only. If you want to know more about hands-on materials how to serve vLLM using dstack, check out `this repository `__ diff --git a/docs/source/serving/deploying_with_helm.rst b/docs/source/serving/deploying_with_helm.md similarity index 88% rename from docs/source/serving/deploying_with_helm.rst rename to docs/source/serving/deploying_with_helm.md index d185a6951d7ec..3b26575827011 100644 --- a/docs/source/serving/deploying_with_helm.rst +++ b/docs/source/serving/deploying_with_helm.md @@ -1,7 +1,6 @@ -.. _deploying_with_helm: +(deploying-with-helm)= -Deploying with Helm -=================== +# Deploying with Helm A Helm chart to deploy vLLM for Kubernetes @@ -9,44 +8,42 @@ Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm install and documentation on architecture and values file. 
-Prerequisites -------------- +## Prerequisites + Before you begin, ensure that you have the following: - A running Kubernetes cluster -- NVIDIA Kubernetes Device Plugin (``k8s-device-plugin``): This can be found at `https://github.com/NVIDIA/k8s-device-plugin `__ +- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at [https://github.com/NVIDIA/k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin) - Available GPU resources in your cluster - S3 with the model which will be deployed -Installing the chart --------------------- - -To install the chart with the release name ``test-vllm``: - -.. code-block:: console +## Installing the chart - helm upgrade --install --create-namespace --namespace=ns-vllm test-vllm . -f values.yaml --set secrets.s3endpoint=$ACCESS_POINT --set secrets.s3bucketname=$BUCKET --set secrets.s3accesskeyid=$ACCESS_KEY --set secrets.s3accesskey=$SECRET_KEY +To install the chart with the release name `test-vllm`: -Uninstalling the Chart ----------------------- +```console +helm upgrade --install --create-namespace --namespace=ns-vllm test-vllm . -f values.yaml --set secrets.s3endpoint=$ACCESS_POINT --set secrets.s3bucketname=$BUCKET --set secrets.s3accesskeyid=$ACCESS_KEY --set secrets.s3accesskey=$SECRET_KEY +``` -To uninstall the ``test-vllm`` deployment: +## Uninstalling the Chart -.. code-block:: console +To uninstall the `test-vllm` deployment: - helm uninstall test-vllm --namespace=ns-vllm +```console +helm uninstall test-vllm --namespace=ns-vllm +``` The command removes all the Kubernetes components associated with the chart **including persistent volumes** and deletes the release. -Architecture ------------- +## Architecture -.. image:: architecture_helm_deployment.png +```{image} architecture_helm_deployment.png +``` -Values ------- +## Values +```{eval-rst} .. 
list-table:: Values :widths: 25 25 25 25 :header-rows: 1 @@ -251,3 +248,4 @@ Values - string - test - Release name +``` diff --git a/docs/source/serving/deploying_with_k8s.md b/docs/source/serving/deploying_with_k8s.md new file mode 100644 index 0000000000000..d27db826cd006 --- /dev/null +++ b/docs/source/serving/deploying_with_k8s.md @@ -0,0 +1,171 @@ +(deploying-with-k8s)= + +# Deploying with Kubernetes + +Using Kubernetes to deploy vLLM is a scalable and efficient way to serve machine learning models. This guide will walk you through the process of deploying vLLM with Kubernetes, including the necessary prerequisites, steps for deployment, and testing. + +## Prerequisites + +Before you begin, ensure that you have the following: + +- A running Kubernetes cluster +- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at `https://github.com/NVIDIA/k8s-device-plugin/` +- Available GPU resources in your cluster + +## Deployment Steps + +1. **Create a PVC , Secret and Deployment for vLLM** + +PVC is used to store the model cache and it is optional, you can use hostPath or other storage options + +```yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: mistral-7b + namespace: default +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi + storageClassName: default + volumeMode: Filesystem +``` + +Secret is optional and only required for accessing gated models, you can skip this step if you are not using gated models + +```yaml +apiVersion: v1 +kind: Secret +metadata: + name: hf-token-secret + namespace: default +type: Opaque +data: + token: "REPLACE_WITH_TOKEN" +``` + +Create a deployment file for vLLM to run the model server. 
The following example deploys the `Mistral-7B-Instruct-v0.3` model: + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: mistral-7b + namespace: default + labels: + app: mistral-7b +spec: + replicas: 1 + selector: + matchLabels: + app: mistral-7b + template: + metadata: + labels: + app: mistral-7b + spec: + volumes: + - name: cache-volume + persistentVolumeClaim: + claimName: mistral-7b + # vLLM needs to access the host's shared memory for tensor parallel inference. + - name: shm + emptyDir: + medium: Memory + sizeLimit: "2Gi" + containers: + - name: mistral-7b + image: vllm/vllm-openai:latest + command: ["/bin/sh", "-c"] + args: [ + "vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" + ] + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + ports: + - containerPort: 8000 + resources: + limits: + cpu: "10" + memory: 20G + nvidia.com/gpu: "1" + requests: + cpu: "2" + memory: 6G + nvidia.com/gpu: "1" + volumeMounts: + - mountPath: /root/.cache/huggingface + name: cache-volume + - name: shm + mountPath: /dev/shm + livenessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 60 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 60 + periodSeconds: 5 +``` + +2. **Create a Kubernetes Service for vLLM** + +Next, create a Kubernetes Service file to expose the `mistral-7b` deployment: + +```yaml +apiVersion: v1 +kind: Service +metadata: + name: mistral-7b + namespace: default +spec: + ports: + - name: http-mistral-7b + port: 80 + protocol: TCP + targetPort: 8000 + # The label selector should match the deployment labels & it is useful for prefix caching feature + selector: + app: mistral-7b + sessionAffinity: None + type: ClusterIP +``` + +3. 
**Deploy and Test** + +Apply the deployment and service configurations using `kubectl apply -f <filename>`: + +```console +kubectl apply -f deployment.yaml +kubectl apply -f service.yaml +``` + +To test the deployment, run the following `curl` command: + +```console +curl http://mistral-7b.default.svc.cluster.local/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "mistralai/Mistral-7B-Instruct-v0.3", + "prompt": "San Francisco is a", + "max_tokens": 7, + "temperature": 0 + }' +``` + +If the service is correctly deployed, you should receive a response from the vLLM model. + +## Conclusion + +Deploying vLLM with Kubernetes allows for efficient scaling and management of ML models leveraging GPU resources. By following the steps outlined above, you should be able to set up and test a vLLM deployment within your Kubernetes cluster. If you encounter any issues or have suggestions, please feel free to contribute to the documentation. diff --git a/docs/source/serving/deploying_with_k8s.rst b/docs/source/serving/deploying_with_k8s.rst deleted file mode 100644 index cc3606f0df851..0000000000000 --- a/docs/source/serving/deploying_with_k8s.rst +++ /dev/null @@ -1,175 +0,0 @@ -.. _deploying_with_k8s: - -Deploying with Kubernetes -========================== - -Using Kubernetes to deploy vLLM is a scalable and efficient way to serve machine learning models. This guide will walk you through the process of deploying vLLM with Kubernetes, including the necessary prerequisites, steps for deployment, and testing. - -Prerequisites -------------- -Before you begin, ensure that you have the following: - -- A running Kubernetes cluster -- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at `https://github.com/NVIDIA/k8s-device-plugin/` -- Available GPU resources in your cluster - -Deployment Steps ----------------- - -1. 
**Create a PVC , Secret and Deployment for vLLM** - - -PVC is used to store the model cache and it is optional, you can use hostPath or other storage options - -.. code-block:: yaml - - apiVersion: v1 - kind: PersistentVolumeClaim - metadata: - name: mistral-7b - namespace: default - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 50Gi - storageClassName: default - volumeMode: Filesystem - -Secret is optional and only required for accessing gated models, you can skip this step if you are not using gated models - -.. code-block:: yaml - - apiVersion: v1 - kind: Secret - metadata: - name: hf-token-secret - namespace: default - type: Opaque - data: - token: "REPLACE_WITH_TOKEN" - - -Create a deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model: - -.. code-block:: yaml - - apiVersion: apps/v1 - kind: Deployment - metadata: - name: mistral-7b - namespace: default - labels: - app: mistral-7b - spec: - replicas: 1 - selector: - matchLabels: - app: mistral-7b - template: - metadata: - labels: - app: mistral-7b - spec: - volumes: - - name: cache-volume - persistentVolumeClaim: - claimName: mistral-7b - # vLLM needs to access the host's shared memory for tensor parallel inference. 
- - name: shm - emptyDir: - medium: Memory - sizeLimit: "2Gi" - containers: - - name: mistral-7b - image: vllm/vllm-openai:latest - command: ["/bin/sh", "-c"] - args: [ - "vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" - ] - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - ports: - - containerPort: 8000 - resources: - limits: - cpu: "10" - memory: 20G - nvidia.com/gpu: "1" - requests: - cpu: "2" - memory: 6G - nvidia.com/gpu: "1" - volumeMounts: - - mountPath: /root/.cache/huggingface - name: cache-volume - - name: shm - mountPath: /dev/shm - livenessProbe: - httpGet: - path: /health - port: 8000 - initialDelaySeconds: 60 - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 8000 - initialDelaySeconds: 60 - periodSeconds: 5 - -2. **Create a Kubernetes Service for vLLM** - -Next, create a Kubernetes Service file to expose the `mistral-7b` deployment: - -.. code-block:: yaml - - apiVersion: v1 - kind: Service - metadata: - name: mistral-7b - namespace: default - spec: - ports: - - name: http-mistral-7b - port: 80 - protocol: TCP - targetPort: 8000 - # The label selector should match the deployment labels & it is useful for prefix caching feature - selector: - app: mistral-7b - sessionAffinity: None - type: ClusterIP - -3. **Deploy and Test** - -Apply the deployment and service configurations using ``kubectl apply -f ``: - -.. code-block:: console - - kubectl apply -f deployment.yaml - kubectl apply -f service.yaml - -To test the deployment, run the following ``curl`` command: - -.. 
code-block:: console - - curl http://mistral-7b.default.svc.cluster.local/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "mistralai/Mistral-7B-Instruct-v0.3", - "prompt": "San Francisco is a", - "max_tokens": 7, - "temperature": 0 - }' - -If the service is correctly deployed, you should receive a response from the vLLM model. - -Conclusion ----------- -Deploying vLLM with Kubernetes allows for efficient scaling and management of ML models leveraging GPU resources. By following the steps outlined above, you should be able to set up and test a vLLM deployment within your Kubernetes cluster. If you encounter any issues or have suggestions, please feel free to contribute to the documentation. diff --git a/docs/source/serving/deploying_with_kserve.md b/docs/source/serving/deploying_with_kserve.md new file mode 100644 index 0000000000000..feaeb5d0ec8a2 --- /dev/null +++ b/docs/source/serving/deploying_with_kserve.md @@ -0,0 +1,7 @@ +(deploying-with-kserve)= + +# Deploying with KServe + +vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving. + +Please see [this guide](https://kserve.github.io/website/latest/modelserving/v1beta1/llm/huggingface/) for more details on using vLLM with KServe. diff --git a/docs/source/serving/deploying_with_kserve.rst b/docs/source/serving/deploying_with_kserve.rst deleted file mode 100644 index 01d7ccc6e9300..0000000000000 --- a/docs/source/serving/deploying_with_kserve.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. _deploying_with_kserve: - -Deploying with KServe -============================ - -vLLM can be deployed with `KServe `_ on Kubernetes for highly scalable distributed model serving. - -Please see `this guide `_ for more details on using vLLM with KServe. 
diff --git a/docs/source/serving/deploying_with_kubeai.md b/docs/source/serving/deploying_with_kubeai.md new file mode 100644 index 0000000000000..3609d7e05acd3 --- /dev/null +++ b/docs/source/serving/deploying_with_kubeai.md @@ -0,0 +1,15 @@ +(deploying-with-kubeai)= + +# Deploying with KubeAI + +[KubeAI](https://github.com/substratusai/kubeai) is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies. + +Please see the Installation Guides for environment specific instructions: + +- [Any Kubernetes Cluster](https://www.kubeai.org/installation/any/) +- [EKS](https://www.kubeai.org/installation/eks/) +- [GKE](https://www.kubeai.org/installation/gke/) + +Once you have KubeAI installed, you can +[configure text generation models](https://www.kubeai.org/how-to/configure-text-generation-models/) +using vLLM. diff --git a/docs/source/serving/deploying_with_kubeai.rst b/docs/source/serving/deploying_with_kubeai.rst deleted file mode 100644 index ec3c065320fd9..0000000000000 --- a/docs/source/serving/deploying_with_kubeai.rst +++ /dev/null @@ -1,17 +0,0 @@ -.. _deploying_with_kubeai: - -Deploying with KubeAI -===================== - -`KubeAI `_ is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies. - - -Please see the Installation Guides for environment specific instructions: - -* `Any Kubernetes Cluster `_ -* `EKS `_ -* `GKE `_ - -Once you have KubeAI installed, you can -`configure text generation models `_ -using vLLM. 
\ No newline at end of file diff --git a/docs/source/serving/deploying_with_lws.md b/docs/source/serving/deploying_with_lws.md new file mode 100644 index 0000000000000..22bab419eaca3 --- /dev/null +++ b/docs/source/serving/deploying_with_lws.md @@ -0,0 +1,11 @@ +(deploying-with-lws)= + +# Deploying with LWS + +LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads. +A major use case is for multi-host/multi-node distributed inference. + +vLLM can be deployed with [LWS](https://github.com/kubernetes-sigs/lws) on Kubernetes for distributed model serving. + +Please see [this guide](https://github.com/kubernetes-sigs/lws/tree/main/docs/examples/vllm) for more details on +deploying vLLM on Kubernetes using LWS. diff --git a/docs/source/serving/deploying_with_lws.rst b/docs/source/serving/deploying_with_lws.rst deleted file mode 100644 index b63a432dde0d5..0000000000000 --- a/docs/source/serving/deploying_with_lws.rst +++ /dev/null @@ -1,12 +0,0 @@ -.. _deploying_with_lws: - -Deploying with LWS -============================ - -LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads. -A major use case is for multi-host/multi-node distributed inference. - -vLLM can be deployed with `LWS `_ on Kubernetes for distributed model serving. - -Please see `this guide `_ for more details on -deploying vLLM on Kubernetes using LWS. diff --git a/docs/source/serving/deploying_with_nginx.md b/docs/source/serving/deploying_with_nginx.md new file mode 100644 index 0000000000000..a1f00d8536465 --- /dev/null +++ b/docs/source/serving/deploying_with_nginx.md @@ -0,0 +1,133 @@ +(nginxloadbalancer)= + +# Deploying with Nginx Loadbalancer + +This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers. + +Table of contents: + +1. [Build Nginx Container](#nginxloadbalancer-nginx-build) +2. 
[Create Simple Nginx Config file](#nginxloadbalancer-nginx-conf) +3. [Build vLLM Container](#nginxloadbalancer-nginx-vllm-container) +4. [Create Docker Network](#nginxloadbalancer-nginx-docker-network) +5. [Launch vLLM Containers](#nginxloadbalancer-nginx-launch-container) +6. [Launch Nginx](#nginxloadbalancer-nginx-launch-nginx) +7. [Verify That vLLM Servers Are Ready](#nginxloadbalancer-nginx-verify-nginx) + +(nginxloadbalancer-nginx-build)= + +## Build Nginx Container + +This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory. + +```console +export vllm_root=`pwd` +``` + +Create a file named `Dockerfile.nginx`: + +```console +FROM nginx:latest +RUN rm /etc/nginx/conf.d/default.conf +EXPOSE 80 +CMD ["nginx", "-g", "daemon off;"] +``` + +Build the container: + +```console +docker build . -f Dockerfile.nginx --tag nginx-lb +``` + +(nginxloadbalancer-nginx-conf)= + +## Create Simple Nginx Config file + +Create a file named `nginx_conf/nginx.conf`. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another `server vllmN:8000 max_fails=3 fail_timeout=10000s;` entry to `upstream backend`. + +```console +upstream backend { + least_conn; + server vllm0:8000 max_fails=3 fail_timeout=10000s; + server vllm1:8000 max_fails=3 fail_timeout=10000s; +} +server { + listen 80; + location / { + proxy_pass http://backend; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } +} +``` + +(nginxloadbalancer-nginx-vllm-container)= + +## Build vLLM Container + +```console +cd $vllm_root +docker build -f Dockerfile . --tag vllm +``` + +If you are behind proxy, you can pass the proxy settings to the docker build command as shown below: + +```console +cd $vllm_root +docker build -f Dockerfile . 
--tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy +``` + +(nginxloadbalancer-nginx-docker-network)= + +## Create Docker Network + +```console +docker network create vllm_nginx +``` + +(nginxloadbalancer-nginx-launch-container)= + +## Launch vLLM Containers + +Notes: + +- If you have your HuggingFace models cached somewhere else, update `hf_cache_dir` below. +- If you don't have an existing HuggingFace cache you will want to start `vllm0` and wait for the model to complete downloading and the server to be ready. This will ensure that `vllm1` can leverage the model you just downloaded and it won't have to be downloaded again. +- The example below assumes the GPU backend is used. If you are using the CPU backend, remove `--gpus all` and add the `VLLM_CPU_KVCACHE_SPACE` and `VLLM_CPU_OMP_THREADS_BIND` environment variables to the docker run command. +- Adjust the model name that you want to use in your vLLM servers if you don't want to use `Llama-2-7b-chat-hf`. + +```console +mkdir -p ~/.cache/huggingface/hub/ +hf_cache_dir=~/.cache/huggingface/ +docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8081:8000 --name vllm0 vllm --model meta-llama/Llama-2-7b-chat-hf +docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm --model meta-llama/Llama-2-7b-chat-hf +``` + +```{note} +If you are behind a proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`. 
+``` + +(nginxloadbalancer-nginx-launch-nginx)= + +## Launch Nginx + +```console +docker run -itd -p 8000:80 --network vllm_nginx -v ./nginx_conf/:/etc/nginx/conf.d/ --name nginx-lb nginx-lb:latest +``` + +(nginxloadbalancer-nginx-verify-nginx)= + +## Verify That vLLM Servers Are Ready + +```console +docker logs vllm0 | grep Uvicorn +docker logs vllm1 | grep Uvicorn +``` + +Both outputs should look like this: + +```console +INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) +``` diff --git a/docs/source/serving/deploying_with_nginx.rst b/docs/source/serving/deploying_with_nginx.rst deleted file mode 100644 index b5dff02b6bae6..0000000000000 --- a/docs/source/serving/deploying_with_nginx.rst +++ /dev/null @@ -1,142 +0,0 @@ -.. _nginxloadbalancer: - -Deploying with Nginx Loadbalancer -================================= - -This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers. - -Table of contents: - -#. :ref:`Build Nginx Container ` -#. :ref:`Create Simple Nginx Config file ` -#. :ref:`Build vLLM Container ` -#. :ref:`Create Docker Network ` -#. :ref:`Launch vLLM Containers ` -#. :ref:`Launch Nginx ` -#. :ref:`Verify That vLLM Servers Are Ready ` - -.. _nginxloadbalancer_nginx_build: - -Build Nginx Container ---------------------- - -This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory. - -.. code-block:: console - - export vllm_root=`pwd` - -Create a file named ``Dockerfile.nginx``: - -.. code-block:: console - - FROM nginx:latest - RUN rm /etc/nginx/conf.d/default.conf - EXPOSE 80 - CMD ["nginx", "-g", "daemon off;"] - -Build the container: - -.. code-block:: console - - docker build . -f Dockerfile.nginx --tag nginx-lb - -.. _nginxloadbalancer_nginx_conf: - -Create Simple Nginx Config file -------------------------------- - -Create a file named ``nginx_conf/nginx.conf``. 
Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another ``server vllmN:8000 max_fails=3 fail_timeout=10000s;`` entry to ``upstream backend``. - -.. code-block:: console - - upstream backend { - least_conn; - server vllm0:8000 max_fails=3 fail_timeout=10000s; - server vllm1:8000 max_fails=3 fail_timeout=10000s; - } - server { - listen 80; - location / { - proxy_pass http://backend; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - } - } - -.. _nginxloadbalancer_nginx_vllm_container: - -Build vLLM Container --------------------- - -.. code-block:: console - - cd $vllm_root - docker build -f Dockerfile . --tag vllm - - -If you are behind proxy, you can pass the proxy settings to the docker build command as shown below: - -.. code-block:: console - - cd $vllm_root - docker build -f Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy - -.. _nginxloadbalancer_nginx_docker_network: - -Create Docker Network ---------------------- - -.. code-block:: console - - docker network create vllm_nginx - - -.. _nginxloadbalancer_nginx_launch_container: - -Launch vLLM Containers ----------------------- - -Notes: - -* If you have your HuggingFace models cached somewhere else, update ``hf_cache_dir`` below. -* If you don't have an existing HuggingFace cache you will want to start ``vllm0`` and wait for the model to complete downloading and the server to be ready. This will ensure that ``vllm1`` can leverage the model you just downloaded and it won't have to be downloaded again. -* The below example assumes GPU backend used. If you are using CPU backend, remove ``--gpus all``, add ``VLLM_CPU_KVCACHE_SPACE`` and ``VLLM_CPU_OMP_THREADS_BIND`` environment variables to the docker run command. 
-* Adjust the model name that you want to use in your vLLM servers if you don't want to use ``Llama-2-7b-chat-hf``. - -.. code-block:: console - - mkdir -p ~/.cache/huggingface/hub/ - hf_cache_dir=~/.cache/huggingface/ - docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8081:8000 --name vllm0 vllm --model meta-llama/Llama-2-7b-chat-hf - docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm --model meta-llama/Llama-2-7b-chat-hf - -.. note:: - If you are behind proxy, you can pass the proxy settings to the docker run command via ``-e http_proxy=$http_proxy -e https_proxy=$https_proxy``. - -.. _nginxloadbalancer_nginx_launch_nginx: - -Launch Nginx ------------- - -.. code-block:: console - - docker run -itd -p 8000:80 --network vllm_nginx -v ./nginx_conf/:/etc/nginx/conf.d/ --name nginx-lb nginx-lb:latest - -.. _nginxloadbalancer_nginx_verify_nginx: - -Verify That vLLM Servers Are Ready ----------------------------------- - -.. code-block:: console - - docker logs vllm0 | grep Uvicorn - docker logs vllm1 | grep Uvicorn - -Both outputs should look like this: - -.. code-block:: console - - INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) diff --git a/docs/source/serving/deploying_with_triton.md b/docs/source/serving/deploying_with_triton.md new file mode 100644 index 0000000000000..9b0a6f1d54ae8 --- /dev/null +++ b/docs/source/serving/deploying_with_triton.md @@ -0,0 +1,5 @@ +(deploying-with-triton)= + +# Deploying with NVIDIA Triton + +The [Triton Inference Server](https://github.com/triton-inference-server) hosts a tutorial demonstrating how to quickly deploy a simple [facebook/opt-125m](https://huggingface.co/facebook/opt-125m) model using vLLM. 
Please see [Deploying a vLLM model in Triton](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton) for more details. diff --git a/docs/source/serving/deploying_with_triton.rst b/docs/source/serving/deploying_with_triton.rst deleted file mode 100644 index 5ce7c3d03dd2d..0000000000000 --- a/docs/source/serving/deploying_with_triton.rst +++ /dev/null @@ -1,6 +0,0 @@ -.. _deploying_with_triton: - -Deploying with NVIDIA Triton -============================ - -The `Triton Inference Server `_ hosts a tutorial demonstrating how to quickly deploy a simple `facebook/opt-125m `_ model using vLLM. Please see `Deploying a vLLM model in Triton `_ for more details. diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md new file mode 100644 index 0000000000000..e0485d66c0a26 --- /dev/null +++ b/docs/source/serving/distributed_serving.md @@ -0,0 +1,105 @@ +(distributed-serving)= + +# Distributed Inference and Serving + +## How to decide the distributed inference strategy? + +Before going into the details of distributed inference and serving, let's first make it clear when to use distributed inference and what are the strategies available. The common practice is: + +- **Single GPU (no distributed inference)**: If your model fits in a single GPU, you probably don't need to use distributed inference. Just use the single GPU to run the inference. +- **Single-Node Multi-GPU (tensor parallel inference)**: If your model is too large to fit in a single GPU, but it can fit in a single node with multiple GPUs, you can use tensor parallelism. The tensor parallel size is the number of GPUs you want to use. For example, if you have 4 GPUs in a single node, you can set the tensor parallel size to 4. 
+- **Multi-Node Multi-GPU (tensor parallel plus pipeline parallel inference)**: If your model is too large to fit in a single node, you can use tensor parallel together with pipeline parallelism. The tensor parallel size is the number of GPUs you want to use in each node, and the pipeline parallel size is the number of nodes you want to use. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2. + +In short, you should increase the number of GPUs and the number of nodes until you have enough GPU memory to hold the model. The tensor parallel size should be the number of GPUs in each node, and the pipeline parallel size should be the number of nodes. + +After adding enough GPUs and nodes to hold the model, you can run vLLM first, which will print some logs like `# GPU blocks: 790`. Multiply the number by `16` (the block size), and you can get roughly the maximum number of tokens that can be served on the current configuration. If this number is not satisfying, e.g. you want higher throughput, you can further increase the number of GPUs or nodes, until the number of blocks is enough. + +```{note} +There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs. +``` + +## Details for Distributed Inference and Serving + +vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support [Megatron-LM's tensor parallel algorithm](https://arxiv.org/pdf/1909.08053.pdf). We manage the distributed runtime with either [Ray](https://github.com/ray-project/ray) or python native multiprocessing. 
Multiprocessing can be used when deploying on a single node; multi-node inference currently requires Ray. + +Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured {code}`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the {code}`LLM` class {code}`distributed_executor_backend` argument or {code}`--distributed-executor-backend` API server argument. Set it to {code}`mp` for multiprocessing or {code}`ray` for Ray. It's not required for Ray to be installed for the multiprocessing case. + +To run multi-GPU inference with the {code}`LLM` class, set the {code}`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs: + +```python +from vllm import LLM +llm = LLM("facebook/opt-13b", tensor_parallel_size=4) +output = llm.generate("San Francisco is a") +``` + +To run multi-GPU serving, pass in the {code}`--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs: + +```console +$ vllm serve facebook/opt-13b \ +$ --tensor-parallel-size 4 +``` + +You can also additionally specify {code}`--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism: + +```console +$ vllm serve gpt2 \ +$ --tensor-parallel-size 4 \ +$ --pipeline-parallel-size 2 +``` + +## Multi-Node Inference and Serving + +If a single node does not have enough GPUs to hold the model, you can run the model using multiple nodes. It is important to make sure the execution environment is the same on all nodes, including the model path and the Python environment. The recommended way is to use docker images to ensure the same environment, and hide the heterogeneity of the host machines via mapping them into the same docker configuration. 
+ +The first step is to start containers and organize them into a cluster. We have provided a helper [script](https://github.com/vllm-project/vllm/tree/main/examples/run_cluster.sh) to start the cluster. Please note, this script launches docker without administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools. For that purpose, the script can add `CAP_SYS_ADMIN` to the docker container by using the `--cap-add` option in the docker run command. + +Pick a node as the head node, and run the following command: + +```console +$ bash run_cluster.sh \ +$ vllm/vllm-openai \ +$ ip_of_head_node \ +$ --head \ +$ /path/to/the/huggingface/home/in/this/node +``` + +On the rest of the worker nodes, run the following command: + +```console +$ bash run_cluster.sh \ +$ vllm/vllm-openai \ +$ ip_of_head_node \ +$ --worker \ +$ /path/to/the/huggingface/home/in/this/node +``` + +Then you get a Ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. A common misunderstanding is to use the IP address of the worker node, which is not correct. + +Then, on any node, use `docker exec -it node /bin/bash` to enter the container, execute `ray status` to check the status of the Ray cluster. You should see the right number of nodes and GPUs. + +After that, on any node, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. 
For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2: + +```console +$ vllm serve /path/to/the/model/in/the/container \ +$ --tensor-parallel-size 8 \ +$ --pipeline-parallel-size 2 +``` + +You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 16: + +```console +$ vllm serve /path/to/the/model/in/the/container \ +$ --tensor-parallel-size 16 +``` + +To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with `NCCL_DEBUG=TRACE` environment variable set, e.g. `NCCL_DEBUG=TRACE vllm serve ...` and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find `[send] via NET/IB/GDRDMA` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient. + +```{warning} +After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script](https://docs.vllm.ai/en/latest/getting_started/debugging.html) for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. 
`NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See the [discussion](https://github.com/vllm-project/vllm/issues/6803) for more information. +``` + +```{warning} +Please make sure you downloaded the model to all the nodes (with the same path), or the model is downloaded to some distributed file system that is accessible by all nodes. + +When you use huggingface repo id to refer to the model, you should append your huggingface token to the `run_cluster.sh` script, e.g. `-e HF_TOKEN=`. The recommended way is to download the model first, and then use the path to refer to the model. +``` diff --git a/docs/source/serving/distributed_serving.rst b/docs/source/serving/distributed_serving.rst deleted file mode 100644 index b24ba53e59694..0000000000000 --- a/docs/source/serving/distributed_serving.rst +++ /dev/null @@ -1,107 +0,0 @@ -.. _distributed_serving: - -Distributed Inference and Serving -================================= - -How to decide the distributed inference strategy? -------------------------------------------------- - -Before going into the details of distributed inference and serving, let's first make it clear when to use distributed inference and what are the strategies available. The common practice is: - -- **Single GPU (no distributed inference)**: If your model fits in a single GPU, you probably don't need to use distributed inference. Just use the single GPU to run the inference. -- **Single-Node Multi-GPU (tensor parallel inference)**: If your model is too large to fit in a single GPU, but it can fit in a single node with multiple GPUs, you can use tensor parallelism. The tensor parallel size is the number of GPUs you want to use. For example, if you have 4 GPUs in a single node, you can set the tensor parallel size to 4. 
-- **Multi-Node Multi-GPU (tensor parallel plus pipeline parallel inference)**: If your model is too large to fit in a single node, you can use tensor parallel together with pipeline parallelism. The tensor parallel size is the number of GPUs you want to use in each node, and the pipeline parallel size is the number of nodes you want to use. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2. - -In short, you should increase the number of GPUs and the number of nodes until you have enough GPU memory to hold the model. The tensor parallel size should be the number of GPUs in each node, and the pipeline parallel size should be the number of nodes. - -After adding enough GPUs and nodes to hold the model, you can run vLLM first, which will print some logs like ``# GPU blocks: 790``. Multiply the number by ``16`` (the block size), and you can get roughly the maximum number of tokens that can be served on the current configuration. If this number is not satisfying, e.g. you want higher throughput, you can further increase the number of GPUs or nodes, until the number of blocks is enough. - -.. note:: - There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs. - -Details for Distributed Inference and Serving ----------------------------------------------- - -vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm `_. We manage the distributed runtime with either `Ray `_ or python native multiprocessing. 
Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray. - -Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured :code:`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the :code:`LLM` class :code:`distributed-executor-backend` argument or :code:`--distributed-executor-backend` API server argument. Set it to :code:`mp` for multiprocessing or :code:`ray` for Ray. It's not required for Ray to be installed for the multiprocessing case. - -To run multi-GPU inference with the :code:`LLM` class, set the :code:`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs: - -.. code-block:: python - - from vllm import LLM - llm = LLM("facebook/opt-13b", tensor_parallel_size=4) - output = llm.generate("San Franciso is a") - -To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs: - -.. code-block:: console - - $ vllm serve facebook/opt-13b \ - $ --tensor-parallel-size 4 - -You can also additionally specify :code:`--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism: - -.. code-block:: console - - $ vllm serve gpt2 \ - $ --tensor-parallel-size 4 \ - $ --pipeline-parallel-size 2 - -Multi-Node Inference and Serving --------------------------------- - -If a single node does not have enough GPUs to hold the model, you can run the model using multiple nodes. It is important to make sure the execution environment is the same on all nodes, including the model path, the Python environment. 
The recommended way is to use docker images to ensure the same environment, and hide the heterogeneity of the host machines via mapping them into the same docker configuration. - -The first step, is to start containers and organize them into a cluster. We have provided a helper `script `_ to start the cluster. Please note, this script launches docker without administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools. For that purpose, the script can have ``CAP_SYS_ADMIN`` to the docker container by using the ``--cap-add`` option in the docker run command. - -Pick a node as the head node, and run the following command: - -.. code-block:: console - - $ bash run_cluster.sh \ - $ vllm/vllm-openai \ - $ ip_of_head_node \ - $ --head \ - $ /path/to/the/huggingface/home/in/this/node - -On the rest of the worker nodes, run the following command: - -.. code-block:: console - - $ bash run_cluster.sh \ - $ vllm/vllm-openai \ - $ ip_of_head_node \ - $ --worker \ - $ /path/to/the/huggingface/home/in/this/node - -Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument ``ip_of_head_node`` should be the IP address of the head node, which is accessible by all the worker nodes. A common misunderstanding is to use the IP address of the worker node, which is not correct. - -Then, on any node, use ``docker exec -it node /bin/bash`` to enter the container, execute ``ray status`` to check the status of the Ray cluster. You should see the right number of nodes and GPUs. - -After that, on any node, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. 
For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2: - -.. code-block:: console - - $ vllm serve /path/to/the/model/in/the/container \ - $ --tensor-parallel-size 8 \ - $ --pipeline-parallel-size 2 - -You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 16: - -.. code-block:: console - - $ vllm serve /path/to/the/model/in/the/container \ - $ --tensor-parallel-size 16 - -To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like ``--privileged -e NCCL_IB_HCA=mlx5`` to the ``run_cluster.sh`` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with ``NCCL_DEBUG=TRACE`` environment variable set, e.g. ``NCCL_DEBUG=TRACE vllm serve ...`` and check the logs for the NCCL version and the network used. If you find ``[send] via NET/Socket`` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find ``[send] via NET/IB/GDRDMA`` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient. - -.. warning:: - After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the `sanity check script `_ for more information. If you need to set some environment variables for the communication configuration, you can append them to the ``run_cluster.sh`` script, e.g. ``-e NCCL_SOCKET_IFNAME=eth0``. Note that setting environment variables in the shell (e.g. 
``NCCL_SOCKET_IFNAME=eth0 vllm serve ...``) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See the `discussion `_ for more information. - -.. warning:: - - Please make sure you downloaded the model to all the nodes (with the same path), or the model is downloaded to some distributed file system that is accessible by all nodes. - - When you use huggingface repo id to refer to the model, you should append your huggingface token to the ``run_cluster.sh`` script, e.g. ``-e HF_TOKEN=``. The recommended way is to download the model first, and then use the path to refer to the model. diff --git a/docs/source/serving/integrations.md b/docs/source/serving/integrations.md new file mode 100644 index 0000000000000..d214c77254257 --- /dev/null +++ b/docs/source/serving/integrations.md @@ -0,0 +1,17 @@ +# Integrations + +```{toctree} +:maxdepth: 1 + +run_on_sky +deploying_with_kserve +deploying_with_kubeai +deploying_with_triton +deploying_with_bentoml +deploying_with_cerebrium +deploying_with_lws +deploying_with_dstack +serving_with_langchain +serving_with_llamaindex +serving_with_llamastack +``` diff --git a/docs/source/serving/integrations.rst b/docs/source/serving/integrations.rst deleted file mode 100644 index 0dd505a739863..0000000000000 --- a/docs/source/serving/integrations.rst +++ /dev/null @@ -1,17 +0,0 @@ -Integrations ------------- - -.. 
toctree:: - :maxdepth: 1 - - run_on_sky - deploying_with_kserve - deploying_with_kubeai - deploying_with_triton - deploying_with_bentoml - deploying_with_cerebrium - deploying_with_lws - deploying_with_dstack - serving_with_langchain - serving_with_llamaindex - serving_with_llamastack diff --git a/docs/source/serving/metrics.md b/docs/source/serving/metrics.md new file mode 100644 index 0000000000000..2dc78643f6d8f --- /dev/null +++ b/docs/source/serving/metrics.md @@ -0,0 +1,38 @@ +# Production Metrics + +vLLM exposes a number of metrics that can be used to monitor the health of the +system. These metrics are exposed via the `/metrics` endpoint on the vLLM +OpenAI compatible API server. + +You can start the server using Python, or using [Docker](deploying_with_docker.md): + +```console +$ vllm serve unsloth/Llama-3.2-1B-Instruct +``` + +Then query the endpoint to get the latest metrics from the server: + +```console +$ curl http://0.0.0.0:8000/metrics + +# HELP vllm:iteration_tokens_total Histogram of number of tokens per engine_step. +# TYPE vllm:iteration_tokens_total histogram +vllm:iteration_tokens_total_sum{model_name="unsloth/Llama-3.2-1B-Instruct"} 0.0 +vllm:iteration_tokens_total_bucket{le="1.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 +vllm:iteration_tokens_total_bucket{le="8.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 +vllm:iteration_tokens_total_bucket{le="16.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 +vllm:iteration_tokens_total_bucket{le="32.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 +vllm:iteration_tokens_total_bucket{le="64.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 +vllm:iteration_tokens_total_bucket{le="128.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 +vllm:iteration_tokens_total_bucket{le="256.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 +vllm:iteration_tokens_total_bucket{le="512.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 +... 
+``` + +The following metrics are exposed: + +```{literalinclude} ../../../vllm/engine/metrics.py +:end-before: end-metrics-definitions +:language: python +:start-after: begin-metrics-definitions +``` diff --git a/docs/source/serving/metrics.rst b/docs/source/serving/metrics.rst deleted file mode 100644 index 231111cd7b738..0000000000000 --- a/docs/source/serving/metrics.rst +++ /dev/null @@ -1,38 +0,0 @@ -Production Metrics -================== - -vLLM exposes a number of metrics that can be used to monitor the health of the -system. These metrics are exposed via the ``/metrics`` endpoint on the vLLM -OpenAI compatible API server. - -You can start the server using Python, or using [Docker](deploying_with_docker.rst): - -.. code-block:: console - - $ vllm serve unsloth/Llama-3.2-1B-Instruct - -Then query the endpoint to get the latest metrics from the server: - -.. code-block:: console - - $ curl http://0.0.0.0:8000/metrics - - # HELP vllm:iteration_tokens_total Histogram of number of tokens per engine_step. - # TYPE vllm:iteration_tokens_total histogram - vllm:iteration_tokens_total_sum{model_name="unsloth/Llama-3.2-1B-Instruct"} 0.0 - vllm:iteration_tokens_total_bucket{le="1.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 - vllm:iteration_tokens_total_bucket{le="8.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 - vllm:iteration_tokens_total_bucket{le="16.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 - vllm:iteration_tokens_total_bucket{le="32.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 - vllm:iteration_tokens_total_bucket{le="64.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 - vllm:iteration_tokens_total_bucket{le="128.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 - vllm:iteration_tokens_total_bucket{le="256.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 - vllm:iteration_tokens_total_bucket{le="512.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 - ... - -The following metrics are exposed: - -.. 
literalinclude:: ../../../vllm/engine/metrics.py - :language: python - :start-after: begin-metrics-definitions - :end-before: end-metrics-definitions diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 1bc8d32d2d161..934a7cea7b9cb 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -2,7 +2,7 @@ vLLM provides an HTTP server that implements OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API, and more! -You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](deploying_with_docker.rst): +You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](deploying_with_docker.md): ```bash vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123 ``` @@ -30,20 +30,20 @@ print(completion.choices[0].message) We currently support the following OpenAI APIs: - [Completions API](#completions-api) (`/v1/completions`) - - Only applicable to [text generation models](../models/generative_models.rst) (`--task generate`). + - Only applicable to [text generation models](../models/generative_models.md) (`--task generate`). - *Note: `suffix` parameter is not supported.* - [Chat Completions API](#chat-api) (`/v1/chat/completions`) - - Only applicable to [text generation models](../models/generative_models.rst) (`--task generate`) with a [chat template](#chat-template). + - Only applicable to [text generation models](../models/generative_models.md) (`--task generate`) with a [chat template](#chat-template). - *Note: `parallel_tool_calls` and `user` parameters are ignored.* - [Embeddings API](#embeddings-api) (`/v1/embeddings`) - - Only applicable to [embedding models](../models/pooling_models.rst) (`--task embed`). 
+ - Only applicable to [embedding models](../models/pooling_models.md) (`--task embed`). In addition, we have the following custom APIs: - [Tokenizer API](#tokenizer-api) (`/tokenize`, `/detokenize`) - Applicable to any model with a tokenizer. - [Score API](#score-api) (`/score`) - - Only applicable to [cross-encoder models](../models/pooling_models.rst) (`--task score`). + - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`). (chat-template)= ## Chat Template @@ -183,7 +183,7 @@ Refer to [OpenAI's API reference](https://platform.openai.com/docs/api-reference #### Extra parameters -The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. +The following [sampling parameters (click through to see documentation)](../dev/sampling_params.md) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -206,12 +206,12 @@ Refer to [OpenAI's API reference](https://platform.openai.com/docs/api-reference We support both [Vision](https://platform.openai.com/docs/guides/vision)- and [Audio](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in)-related parameters; -see our [Multimodal Inputs](../usage/multimodal_inputs.rst) guide for more information. +see our [Multimodal Inputs](../usage/multimodal_inputs.md) guide for more information. - *Note: `image_url.detail` parameter is not supported.* #### Extra parameters -The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. +The following [sampling parameters (click through to see documentation)](../dev/sampling_params.md) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -236,12 +236,12 @@ If the model has a [chat template](#chat-template), you can replace `inputs` wit which will be treated as a single prompt to the model. 
```{tip} -This enables multi-modal inputs to be passed to embedding models, see [this page](../usage/multimodal_inputs.rst) for details. +This enables multi-modal inputs to be passed to embedding models, see [this page](../usage/multimodal_inputs.md) for details. ``` #### Extra parameters -The following [pooling parameters (click through to see documentation)](../dev/pooling_params.rst) are supported. +The following [pooling parameters (click through to see documentation)](../dev/pooling_params.md) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python @@ -418,7 +418,7 @@ Response: #### Extra parameters -The following [pooling parameters (click through to see documentation)](../dev/pooling_params.rst) are supported. +The following [pooling parameters (click through to see documentation)](../dev/pooling_params.md) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python diff --git a/docs/source/serving/run_on_sky.md b/docs/source/serving/run_on_sky.md new file mode 100644 index 0000000000000..115873ae49292 --- /dev/null +++ b/docs/source/serving/run_on_sky.md @@ -0,0 +1,345 @@ +(on-cloud)= + +# Deploying and scaling up with SkyPilot + +```{raw} html +

+ vLLM +

+``` + +vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with [SkyPilot](https://github.com/skypilot-org/skypilot), an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in [SkyPilot AI gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html). + +## Prerequisites + +- Go to the [HuggingFace model page](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and request access to the model {code}`meta-llama/Meta-Llama-3-8B-Instruct`. +- Check that you have installed SkyPilot ([docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)). +- Check that {code}`sky check` shows clouds or Kubernetes are enabled. + +```console +pip install skypilot-nightly +sky check +``` + +## Run on a single instance + +See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm/serve.yaml). + +```yaml +resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + use_spot: True + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + +envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + +setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + pip install vllm==0.4.0.post1 + # Install Gradio for web UI. + pip install gradio openai + pip install flash-attn==2.5.7 + +run: | + conda activate vllm + echo 'Starting vllm api server...' + python -u -m vllm.entrypoints.openai.api_server \ + --port 8081 \ + --model $MODEL_NAME \ + --trust-remote-code \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + 2>&1 | tee api_server.log & + + echo 'Waiting for vllm api server to start...' + while ! 
`cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done + + echo 'Starting gradio server...' + git clone https://github.com/vllm-project/vllm.git || true + python vllm/examples/gradio_openai_chatbot_webserver.py \ + -m $MODEL_NAME \ + --port 8811 \ + --model-url http://localhost:8081/v1 \ + --stop-token-ids 128009,128001 +``` + +Start serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...): + +```console +HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN +``` + +Check the output of the command. There will be a shareable gradio link (like the last line of the following). Open it in your browser to use the LLaMA model to do the text completion. + +```console +(task, pid=7431) Running on public URL: https://.gradio.live +``` + +**Optional**: Serve the 70B model instead of the default 8B and use more GPUs: + +```console +HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct +``` + +## Scale up to multiple replicas + +SkyPilot can scale up the service to multiple service replicas with built-in autoscaling, load-balancing and fault-tolerance. You can do it by adding a `service` section to the YAML file. + +```yaml +service: + replicas: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_completion_tokens: 1 +``` + +```{raw} html +
+Click to see the full recipe YAML +``` + +```yaml +service: + replicas: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_completion_tokens: 1 + +resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + use_spot: True + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + +envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + +setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + pip install vllm==0.4.0.post1 + # Install Gradio for web UI. + pip install gradio openai + pip install flash-attn==2.5.7 + +run: | + conda activate vllm + echo 'Starting vllm api server...' + python -u -m vllm.entrypoints.openai.api_server \ + --port 8081 \ + --model $MODEL_NAME \ + --trust-remote-code \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + 2>&1 | tee api_server.log +``` + +```{raw} html +
+``` + +Start serving the Llama-3 8B model on multiple replicas: + +```console +HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN +``` + +Wait until the service is ready: + +```console +watch -n10 sky serve status vllm +``` + +```{raw} html +
+Example outputs: +``` + +```console +Services +NAME VERSION UPTIME STATUS REPLICAS ENDPOINT +vllm 1 35s READY 2/2 xx.yy.zz.100:30001 + +Service Replicas +SERVICE_NAME ID VERSION IP LAUNCHED RESOURCES STATUS REGION +vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 +vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 +``` + +```{raw} html +
+``` + +After the service is READY, you can find a single endpoint for the service and access the service with the endpoint: + +```console +ENDPOINT=$(sky serve status --endpoint 8081 vllm) +curl -L http://$ENDPOINT/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "Who are you?" + } + ], + "stop_token_ids": [128009, 128001] + }' +``` + +To enable autoscaling, you can replace `replicas` with the following configuration in `service`: + +```yaml +service: + replica_policy: + min_replicas: 2 + max_replicas: 4 + target_qps_per_replica: 2 +``` + +This will scale the service up when the QPS exceeds 2 for each replica. + +```{raw} html +
+Click to see the full recipe YAML +``` + +```yaml +service: + replica_policy: + min_replicas: 2 + max_replicas: 4 + target_qps_per_replica: 2 + # An actual request for readiness probe. + readiness_probe: + path: /v1/chat/completions + post_data: + model: $MODEL_NAME + messages: + - role: user + content: Hello! What is your name? + max_completion_tokens: 1 + +resources: + accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. + use_spot: True + disk_size: 512 # Ensure model checkpoints can fit. + disk_tier: best + ports: 8081 # Expose to internet traffic. + +envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + HF_TOKEN: # Change to your own huggingface token, or use --env to pass. + +setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + pip install vllm==0.4.0.post1 + # Install Gradio for web UI. + pip install gradio openai + pip install flash-attn==2.5.7 + +run: | + conda activate vllm + echo 'Starting vllm api server...' + python -u -m vllm.entrypoints.openai.api_server \ + --port 8081 \ + --model $MODEL_NAME \ + --trust-remote-code \ + --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ + 2>&1 | tee api_server.log +``` + +```{raw} html +
+``` + +To update the service with the new config: + +```console +HF_TOKEN="your-huggingface-token" sky serve update vllm serving.yaml --env HF_TOKEN +``` + +To stop the service: + +```console +sky serve down vllm +``` + +### **Optional**: Connect a GUI to the endpoint + +It is also possible to access the Llama-3 service with a separate GUI frontend, so the user requests sent to the GUI will be load-balanced across replicas. + +```{raw} html +
+Click to see the full GUI YAML +``` + +```yaml +envs: + MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct + ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm. + +resources: + cpus: 2 + +setup: | + conda create -n vllm python=3.10 -y + conda activate vllm + + # Install Gradio for web UI. + pip install gradio openai + +run: | + conda activate vllm + export PATH=$PATH:/sbin + + echo 'Starting gradio server...' + git clone https://github.com/vllm-project/vllm.git || true + python vllm/examples/gradio_openai_chatbot_webserver.py \ + -m $MODEL_NAME \ + --port 8811 \ + --model-url http://$ENDPOINT/v1 \ + --stop-token-ids 128009,128001 | tee ~/gradio.log +``` + +```{raw} html +
+``` + +1. Start the chat web UI: + +```console +sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm) +``` + +2. Then, we can access the GUI at the returned gradio link: + +```console +| INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live +``` diff --git a/docs/source/serving/run_on_sky.rst b/docs/source/serving/run_on_sky.rst deleted file mode 100644 index 227e6fd2a7818..0000000000000 --- a/docs/source/serving/run_on_sky.rst +++ /dev/null @@ -1,366 +0,0 @@ -.. _on_cloud: - -Deploying and scaling up with SkyPilot -================================================ - -.. raw:: html - -

- vLLM -

- -vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with `SkyPilot `__, an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in `SkyPilot AI gallery `__. - - -Prerequisites -------------- - -- Go to the `HuggingFace model page `__ and request access to the model :code:`meta-llama/Meta-Llama-3-8B-Instruct`. -- Check that you have installed SkyPilot (`docs `__). -- Check that :code:`sky check` shows clouds or Kubernetes are enabled. - -.. code-block:: console - - pip install skypilot-nightly - sky check - - -Run on a single instance ------------------------- - -See the vLLM SkyPilot YAML for serving, `serving.yaml `__. - -.. code-block:: yaml - - resources: - accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. - use_spot: True - disk_size: 512 # Ensure model checkpoints can fit. - disk_tier: best - ports: 8081 # Expose to internet traffic. - - envs: - MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct - HF_TOKEN: # Change to your own huggingface token, or use --env to pass. - - setup: | - conda create -n vllm python=3.10 -y - conda activate vllm - - pip install vllm==0.4.0.post1 - # Install Gradio for web UI. - pip install gradio openai - pip install flash-attn==2.5.7 - - run: | - conda activate vllm - echo 'Starting vllm api server...' - python -u -m vllm.entrypoints.openai.api_server \ - --port 8081 \ - --model $MODEL_NAME \ - --trust-remote-code \ - --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ - 2>&1 | tee api_server.log & - - echo 'Waiting for vllm api server to start...' - while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done - - echo 'Starting gradio server...' 
- git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/gradio_openai_chatbot_webserver.py \ - -m $MODEL_NAME \ - --port 8811 \ - --model-url http://localhost:8081/v1 \ - --stop-token-ids 128009,128001 - -Start the serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...): - -.. code-block:: console - - HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN - -Check the output of the command. There will be a shareable gradio link (like the last line of the following). Open it in your browser to use the LLaMA model to do the text completion. - -.. code-block:: console - - (task, pid=7431) Running on public URL: https://.gradio.live - -**Optional**: Serve the 70B model instead of the default 8B and use more GPU: - -.. code-block:: console - - HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct - - -Scale up to multiple replicas ------------------------------ - -SkyPilot can scale up the service to multiple service replicas with built-in autoscaling, load-balancing and fault-tolerance. You can do it by adding a services section to the YAML file. - -.. code-block:: yaml - - service: - replicas: 2 - # An actual request for readiness probe. - readiness_probe: - path: /v1/chat/completions - post_data: - model: $MODEL_NAME - messages: - - role: user - content: Hello! What is your name? - max_completion_tokens: 1 - -.. raw:: html - -
- Click to see the full recipe YAML - - -.. code-block:: yaml - - service: - replicas: 2 - # An actual request for readiness probe. - readiness_probe: - path: /v1/chat/completions - post_data: - model: $MODEL_NAME - messages: - - role: user - content: Hello! What is your name? - max_completion_tokens: 1 - - resources: - accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. - use_spot: True - disk_size: 512 # Ensure model checkpoints can fit. - disk_tier: best - ports: 8081 # Expose to internet traffic. - - envs: - MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct - HF_TOKEN: # Change to your own huggingface token, or use --env to pass. - - setup: | - conda create -n vllm python=3.10 -y - conda activate vllm - - pip install vllm==0.4.0.post1 - # Install Gradio for web UI. - pip install gradio openai - pip install flash-attn==2.5.7 - - run: | - conda activate vllm - echo 'Starting vllm api server...' - python -u -m vllm.entrypoints.openai.api_server \ - --port 8081 \ - --model $MODEL_NAME \ - --trust-remote-code \ - --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ - 2>&1 | tee api_server.log - -.. raw:: html - -
- -Start the serving the Llama-3 8B model on multiple replicas: - -.. code-block:: console - - HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN - - -Wait until the service is ready: - -.. code-block:: console - - watch -n10 sky serve status vllm - - -.. raw:: html - -
- Example outputs: - -.. code-block:: console - - Services - NAME VERSION UPTIME STATUS REPLICAS ENDPOINT - vllm 1 35s READY 2/2 xx.yy.zz.100:30001 - - Service Replicas - SERVICE_NAME ID VERSION IP LAUNCHED RESOURCES STATUS REGION - vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 - vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 - -.. raw:: html - -
- -After the service is READY, you can find a single endpoint for the service and access the service with the endpoint: - -.. code-block:: console - - ENDPOINT=$(sky serve status --endpoint 8081 vllm) - curl -L http://$ENDPOINT/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "meta-llama/Meta-Llama-3-8B-Instruct", - "messages": [ - { - "role": "system", - "content": "You are a helpful assistant." - }, - { - "role": "user", - "content": "Who are you?" - } - ], - "stop_token_ids": [128009, 128001] - }' - -To enable autoscaling, you could replace the `replicas` with the following configs in `service`: - -.. code-block:: yaml - - service: - replica_policy: - min_replicas: 2 - max_replicas: 4 - target_qps_per_replica: 2 - -This will scale the service up to when the QPS exceeds 2 for each replica. - - -.. raw:: html - -
- Click to see the full recipe YAML - - -.. code-block:: yaml - - service: - replica_policy: - min_replicas: 2 - max_replicas: 4 - target_qps_per_replica: 2 - # An actual request for readiness probe. - readiness_probe: - path: /v1/chat/completions - post_data: - model: $MODEL_NAME - messages: - - role: user - content: Hello! What is your name? - max_completion_tokens: 1 - - resources: - accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. - use_spot: True - disk_size: 512 # Ensure model checkpoints can fit. - disk_tier: best - ports: 8081 # Expose to internet traffic. - - envs: - MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct - HF_TOKEN: # Change to your own huggingface token, or use --env to pass. - - setup: | - conda create -n vllm python=3.10 -y - conda activate vllm - - pip install vllm==0.4.0.post1 - # Install Gradio for web UI. - pip install gradio openai - pip install flash-attn==2.5.7 - - run: | - conda activate vllm - echo 'Starting vllm api server...' - python -u -m vllm.entrypoints.openai.api_server \ - --port 8081 \ - --model $MODEL_NAME \ - --trust-remote-code \ - --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ - 2>&1 | tee api_server.log - - -.. raw:: html - -
- -To update the service with the new config: - -.. code-block:: console - - HF_TOKEN="your-huggingface-token" sky serve update vllm serving.yaml --env HF_TOKEN - - -To stop the service: - -.. code-block:: console - - sky serve down vllm - - -**Optional**: Connect a GUI to the endpoint -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - -It is also possible to access the Llama-3 service with a separate GUI frontend, so the user requests send to the GUI will be load-balanced across replicas. - -.. raw:: html - -
- Click to see the full GUI YAML - -.. code-block:: yaml - - envs: - MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct - ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm. - - resources: - cpus: 2 - - setup: | - conda create -n vllm python=3.10 -y - conda activate vllm - - # Install Gradio for web UI. - pip install gradio openai - - run: | - conda activate vllm - export PATH=$PATH:/sbin - - echo 'Starting gradio server...' - git clone https://github.com/vllm-project/vllm.git || true - python vllm/examples/gradio_openai_chatbot_webserver.py \ - -m $MODEL_NAME \ - --port 8811 \ - --model-url http://$ENDPOINT/v1 \ - --stop-token-ids 128009,128001 | tee ~/gradio.log - - -.. raw:: html - -
- -1. Start the chat web UI: - -.. code-block:: console - - sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm) - - -2. Then, we can access the GUI at the returned gradio link: - -.. code-block:: console - - | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live - - diff --git a/docs/source/serving/runai_model_streamer.md b/docs/source/serving/runai_model_streamer.md new file mode 100644 index 0000000000000..1b5756a95075a --- /dev/null +++ b/docs/source/serving/runai_model_streamer.md @@ -0,0 +1,53 @@ +(runai-model-streamer)= + +# Loading Models with Run:ai Model Streamer + +Run:ai Model Streamer is a library to read tensors concurrently while streaming them to GPU memory. +Further reading can be found in [Run:ai Model Streamer Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/README.md). + +vLLM supports loading weights in Safetensors format using the Run:ai Model Streamer. +You first need to install the vLLM RunAI optional dependency: + +```console +$ pip3 install vllm[runai] +``` + +To run it as an OpenAI-compatible server, add the `--load-format runai_streamer` flag: + +```console +$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer +``` + +To run a model from an AWS S3 object store, run: + +```console +$ vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer +``` + +To run a model from an S3-compatible object store, run: + +```console +$ RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 AWS_EC2_METADATA_DISABLED=true AWS_ENDPOINT_URL=https://storage.googleapis.com vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer +``` + +## Tunable parameters + +You can tune parameters using `--model-loader-extra-config`: + +You can tune `concurrency`, which controls the level of concurrency and the number of OS threads reading tensors from the file to the CPU buffer. +For reading from S3, it will be the number of client instances the host is opening to the S3 server.
+ +```console +$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}' +``` + +You can control the size of the CPU Memory buffer to which tensors are read from the file, and limit this size. +You can read further about CPU buffer memory limiting [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md#runai_streamer_memory_limit). + +```console +$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}' +``` + +```{note} +For further instructions about tunable parameters and additional parameters configurable through environment variables, read the [Environment Variables Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md). +``` diff --git a/docs/source/serving/runai_model_streamer.rst b/docs/source/serving/runai_model_streamer.rst deleted file mode 100644 index 459eb8677fb95..0000000000000 --- a/docs/source/serving/runai_model_streamer.rst +++ /dev/null @@ -1,53 +0,0 @@ -.. _runai_model_streamer: - -Loading Models with Run:ai Model Streamer -========================================= -Run:ai Model Streamer is a library to read tensors in concurrency, while streaming it to GPU memory. -Further reading can be found in `Run:ai Model Streamer Documentation `_. - -vLLM supports loading weights in Safetensors format using the Run:ai Model Streamer. -You first need to install vLLM RunAI optional dependency: - -.. code-block:: console - - $ pip3 install vllm[runai] - -To run it as an OpenAI-compatible server, add the `--load-format runai_streamer` flag: - -.. code-block:: console - - $ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer - -To run model from AWS S3 object store run: - -..
code-block:: console - - $ vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer - - -To run model from a S3 compatible object store run: - -.. code-block:: console - - $ RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 AWS_EC2_METADATA_DISABLED=true AWS_ENDPOINT_URL=https://storage.googleapis.com vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer - -Tunable parameters ------------------- -You can tune parameters using `--model-loader-extra-config`: - -You can tune `concurrency` that controls the level of concurrency and number of OS threads reading tensors from the file to the CPU buffer. -For reading from S3, it will be the number of client instances the host is opening to the S3 server. - - .. code-block:: console - - $ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}' - -You can controls the size of the CPU Memory buffer to which tensors are read from the file, and limit this size. -You can read further about CPU buffer memory limiting `here `_. - - .. code-block:: console - - $ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}' - -.. note:: - For further instructions about tunable parameters and additional parameters configurable through environment variables, read the `Environment Variables Documentation `_. diff --git a/docs/source/serving/serving_with_langchain.md b/docs/source/serving/serving_with_langchain.md new file mode 100644 index 0000000000000..96bd5943f3d64 --- /dev/null +++ b/docs/source/serving/serving_with_langchain.md @@ -0,0 +1,30 @@ +(run-on-langchain)= + +# Serving with Langchain + +vLLM is also available via [Langchain](https://github.com/langchain-ai/langchain) . + +To install langchain, run + +```console +$ pip install langchain langchain_community -q +``` + +To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`. 
+ +```python +from langchain_community.llms import VLLM + +llm = VLLM(model="mosaicml/mpt-7b", + trust_remote_code=True, # mandatory for hf models + max_new_tokens=128, + top_k=10, + top_p=0.95, + temperature=0.8, + # tensor_parallel_size=... # for distributed inference +) + +print(llm("What is the capital of France ?")) +``` + +Please refer to this [Tutorial](https://python.langchain.com/docs/integrations/llms/vllm) for more details. diff --git a/docs/source/serving/serving_with_langchain.rst b/docs/source/serving/serving_with_langchain.rst deleted file mode 100644 index 6440c8aad5986..0000000000000 --- a/docs/source/serving/serving_with_langchain.rst +++ /dev/null @@ -1,31 +0,0 @@ -.. _run_on_langchain: - -Serving with Langchain -============================ - -vLLM is also available via `Langchain `_ . - -To install langchain, run - -.. code-block:: console - - $ pip install langchain langchain_community -q - -To run inference on a single or multiple GPUs, use ``VLLM`` class from ``langchain``. - -.. code-block:: python - - from langchain_community.llms import VLLM - - llm = VLLM(model="mosaicml/mpt-7b", - trust_remote_code=True, # mandatory for hf models - max_new_tokens=128, - top_k=10, - top_p=0.95, - temperature=0.8, - # tensor_parallel_size=... # for distributed inference - ) - - print(llm("What is the capital of France ?")) - -Please refer to this `Tutorial `_ for more details. diff --git a/docs/source/serving/serving_with_llamaindex.md b/docs/source/serving/serving_with_llamaindex.md new file mode 100644 index 0000000000000..98859d8e3f828 --- /dev/null +++ b/docs/source/serving/serving_with_llamaindex.md @@ -0,0 +1,26 @@ +(run-on-llamaindex)= + +# Serving with llama_index + +vLLM is also available via [llama_index](https://github.com/run-llama/llama_index) . + +To install llamaindex, run + +```console +$ pip install llama-index-llms-vllm -q +``` + +To run inference on a single or multiple GPUs, use `Vllm` class from `llamaindex`. 
+ +```python +from llama_index.llms.vllm import Vllm + +llm = Vllm( + model="microsoft/Orca-2-7b", + tensor_parallel_size=4, + max_new_tokens=100, + vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5}, +) +``` + +Please refer to this [Tutorial](https://docs.llamaindex.ai/en/latest/examples/llm/vllm/) for more details. diff --git a/docs/source/serving/serving_with_llamaindex.rst b/docs/source/serving/serving_with_llamaindex.rst deleted file mode 100644 index 038e961344e47..0000000000000 --- a/docs/source/serving/serving_with_llamaindex.rst +++ /dev/null @@ -1,27 +0,0 @@ -.. _run_on_llamaindex: - -Serving with llama_index -============================ - -vLLM is also available via `llama_index `_ . - -To install llamaindex, run - -.. code-block:: console - - $ pip install llama-index-llms-vllm -q - -To run inference on a single or multiple GPUs, use ``Vllm`` class from ``llamaindex``. - -.. code-block:: python - - from llama_index.llms.vllm import Vllm - - llm = Vllm( - model="microsoft/Orca-2-7b", - tensor_parallel_size=4, - max_new_tokens=100, - vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5}, - ) - -Please refer to this `Tutorial `_ for more details. diff --git a/docs/source/serving/serving_with_llamastack.md b/docs/source/serving/serving_with_llamastack.md new file mode 100644 index 0000000000000..71dadca7ad47c --- /dev/null +++ b/docs/source/serving/serving_with_llamastack.md @@ -0,0 +1,38 @@ +(run-on-llamastack)= + +# Serving with Llama Stack + +vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-stack) . 
+ +To install Llama Stack, run + +```console +$ pip install llama-stack -q +``` + +## Inference using OpenAI Compatible API + +Then start Llama Stack server pointing to your vLLM server with the following configuration: + +```yaml +inference: + - provider_id: vllm0 + provider_type: remote::vllm + config: + url: http://127.0.0.1:8000 +``` + +Please refer to [this guide](https://llama-stack.readthedocs.io/en/latest/distributions/self_hosted_distro/remote-vllm.html) for more details on this remote vLLM provider. + +## Inference via Embedded vLLM + +An [inline vLLM provider](https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/inline/inference/vllm) +is also available. This is a sample configuration using that method: + +```yaml +inference: + - provider_type: vllm + config: + model: Llama3.1-8B-Instruct + tensor_parallel_size: 4 +``` diff --git a/docs/source/serving/serving_with_llamastack.rst b/docs/source/serving/serving_with_llamastack.rst deleted file mode 100644 index a2acd7b39f887..0000000000000 --- a/docs/source/serving/serving_with_llamastack.rst +++ /dev/null @@ -1,42 +0,0 @@ -.. _run_on_llamastack: - -Serving with Llama Stack -============================ - -vLLM is also available via `Llama Stack `_ . - -To install Llama Stack, run - -.. code-block:: console - - $ pip install llama-stack -q - -Inference using OpenAI Compatible API -------------------------------------- - -Then start Llama Stack server pointing to your vLLM server with the following configuration: - -.. code-block:: yaml - - inference: - - provider_id: vllm0 - provider_type: remote::vllm - config: - url: http://127.0.0.1:8000 - -Please refer to `this guide `_ for more details on this remote vLLM provider. - -Inference via Embedded vLLM ---------------------------- - -An `inline vLLM provider -`_ -is also available. This is a sample of configuration using that method: - -..
code-block:: yaml - - inference - - provider_type: vllm - config: - model: Llama3.1-8B-Instruct - tensor_parallel_size: 4 diff --git a/docs/source/serving/tensorizer.md b/docs/source/serving/tensorizer.md new file mode 100644 index 0000000000000..d3dd29d48f730 --- /dev/null +++ b/docs/source/serving/tensorizer.md @@ -0,0 +1,16 @@ +(tensorizer)= + +# Loading Models with CoreWeave's Tensorizer + +vLLM supports loading models with [CoreWeave's Tensorizer](https://docs.coreweave.com/coreweave-machine-learning-and-ai/inference/tensorizer). +vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or S3 endpoint can be deserialized +at runtime extremely quickly directly to the GPU, resulting in significantly +shorter Pod startup times and lower CPU memory usage. Tensor encryption is also supported. + +For more information on CoreWeave's Tensorizer, please refer to +[CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well as a general usage guide to using Tensorizer with vLLM, see +the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/tensorize_vllm_model.html). + +```{note} +Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. +``` diff --git a/docs/source/serving/tensorizer.rst b/docs/source/serving/tensorizer.rst deleted file mode 100644 index 96a93db94871b..0000000000000 --- a/docs/source/serving/tensorizer.rst +++ /dev/null @@ -1,15 +0,0 @@ -.. _tensorizer: - -Loading Models with CoreWeave's Tensorizer -========================================== -vLLM supports loading models with `CoreWeave's Tensorizer `_. -vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or S3 endpoint can be deserialized -at runtime extremely quickly directly to the GPU, resulting in significantly -shorter Pod startup times and CPU memory usage. Tensor encryption is also supported.
- -For more information on CoreWeave's Tensorizer, please refer to -`CoreWeave's Tensorizer documentation `_. For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see -the `vLLM example script `_. - -.. note:: - Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. diff --git a/docs/source/usage/compatibility_matrix.md b/docs/source/usage/compatibility_matrix.md new file mode 100644 index 0000000000000..763b49dac4f8a --- /dev/null +++ b/docs/source/usage/compatibility_matrix.md @@ -0,0 +1,468 @@ +(compatibility-matrix)= + +# Compatibility Matrix + +The tables below show mutually exclusive features and the support on some hardware. + +```{note} +Check the '✗' with links to see tracking issue for unsupported feature/hardware combination. +``` + +## Feature x Feature + +```{raw} html + +``` + +```{list-table} + :header-rows: 1 + :stub-columns: 1 + :widths: auto + + * - Feature + - [CP](#chunked-prefill) + - [APC](#apc) + - [LoRA](#lora-adapter) + - prmpt adptr + - [SD](#spec_decode) + - CUDA graph + - pooling + - enc-dec + - logP + - prmpt logP + - async output + - multi-step + - mm + - best-of + - beam-search + - guided dec + * - [CP](#chunked-prefill) + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + * - [APC](#apc) + - ✅ + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + * - [LoRA](#lora-adapter) + - [✗](https://github.com/vllm-project/vllm/pull/9057) + - ✅ + - + - + - + - + - + - + - + - + - + - + - + - + - + - + * - prmpt adptr + - ✅ + - ✅ + - ✅ + - + - + - + - + - + - + - + - + - + - + - + - + - + * - [SD](#spec_decode) + - ✅ + - ✅ + - ✗ + - ✅ + - + - + - + - + - + - + - + - + - + - + - + - + * - CUDA graph + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - + - + - + - + - + - + - + - + - + - + - + * - pooling + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + - + - + - + - + - + - + - + - + - + - + * - enc-dec + - ✗ + - 
[✗](https://github.com/vllm-project/vllm/issues/7366) + - ✗ + - ✗ + - [✗](https://github.com/vllm-project/vllm/issues/7366) + - ✅ + - ✅ + - + - + - + - + - + - + - + - + - + * - logP + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✅ + - + - + - + - + - + - + - + - + * - prmpt logP + - ✅ + - ✅ + - ✅ + - ✅ + - [✗](https://github.com/vllm-project/vllm/pull/8199) + - ✅ + - ✗ + - ✅ + - ✅ + - + - + - + - + - + - + - + * - async output + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✅ + - ✗ + - ✗ + - ✅ + - ✅ + - + - + - + - + - + - + * - multi-step + - ✗ + - ✅ + - ✗ + - ✅ + - ✗ + - ✅ + - ✗ + - ✗ + - ✅ + - [✗](https://github.com/vllm-project/vllm/issues/8198) + - ✅ + - + - + - + - + - + * - mm + - ✅ + - [✗](https://github.com/vllm-project/vllm/pull/8348) + - [✗](https://github.com/vllm-project/vllm/pull/7199) + - ? + - ? + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ? + - + - + - + - + * - best-of + - ✅ + - ✅ + - ✅ + - ✅ + - [✗](https://github.com/vllm-project/vllm/issues/6137) + - ✅ + - ✗ + - ✅ + - ✅ + - ✅ + - ? + - [✗](https://github.com/vllm-project/vllm/issues/7968) + - ✅ + - + - + - + * - beam-search + - ✅ + - ✅ + - ✅ + - ✅ + - [✗](https://github.com/vllm-project/vllm/issues/6137) + - ✅ + - ✗ + - ✅ + - ✅ + - ✅ + - ? + - [✗](https://github.com/vllm-project/vllm/issues/7968) + - ? + - ✅ + - + - + * - guided dec + - ✅ + - ✅ + - ? + - ? + - ✅ + - ✅ + - ✗ + - ? + - ✅ + - ✅ + - ✅ + - [✗](https://github.com/vllm-project/vllm/issues/9893) + - ?
+ - ✅ + - ✅ + - + +``` + +### Feature x Hardware + +```{list-table} + :header-rows: 1 + :stub-columns: 1 + :widths: auto + + * - Feature + - Volta + - Turing + - Ampere + - Ada + - Hopper + - CPU + - AMD + * - [CP](#chunked-prefill) + - [✗](https://github.com/vllm-project/vllm/issues/2729) + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - [APC](#apc) + - [✗](https://github.com/vllm-project/vllm/issues/3687) + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - [LoRA](#lora-adapter) + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - [✗](https://github.com/vllm-project/vllm/pull/4830) + - ✅ + * - prmpt adptr + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - [✗](https://github.com/vllm-project/vllm/issues/8475) + - ✅ + * - [SD](#spec_decode) + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - CUDA graph + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✅ + * - pooling + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ? + * - enc-dec + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + * - mm + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - logP + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - prmpt logP + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - async output + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ + - ✗ + * - multi-step + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - [✗](https://github.com/vllm-project/vllm/issues/8477) + - ✅ + * - best-of + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - beam-search + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + * - guided dec + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ +``` diff --git a/docs/source/usage/compatibility_matrix.rst b/docs/source/usage/compatibility_matrix.rst deleted file mode 100644 index 04dd72b1e3527..0000000000000 --- a/docs/source/usage/compatibility_matrix.rst +++ /dev/null @@ -1,468 +0,0 @@ -.. _compatibility_matrix: - -Compatibility Matrix -==================== - -The tables below show mutually exclusive features and the support on some hardware. - -.. note:: - - Check the '✗' with links to see tracking issue for unsupported feature/hardware combination. - -Feature x Feature ------------------ - - -.. 
raw:: html - - - -.. list-table:: - :header-rows: 1 - :widths: auto - - * - Feature - - :ref:`CP ` - - :ref:`APC ` - - :ref:`LoRA ` - - :abbr:`prmpt adptr (Prompt Adapter)` - - :ref:`SD ` - - CUDA graph - - :abbr:`pooling (Pooling Models)` - - :abbr:`enc-dec (Encoder-Decoder Models)` - - :abbr:`logP (Logprobs)` - - :abbr:`prmpt logP (Prompt Logprobs)` - - :abbr:`async output (Async Output Processing)` - - multi-step - - :abbr:`mm (Multimodal Inputs)` - - best-of - - beam-search - - :abbr:`guided dec (Guided Decoding)` - * - :ref:`CP ` - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - * - :ref:`APC ` - - ✅ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - * - :ref:`LoRA ` - - `✗ `__ - - ✅ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - * - :abbr:`prmpt adptr (Prompt Adapter)` - - ✅ - - ✅ - - ✅ - - - - - - - - - - - - - - - - - - - - - - - - - - - * - :ref:`SD ` - - ✅ - - ✅ - - ✗ - - ✅ - - - - - - - - - - - - - - - - - - - - - - - - - * - CUDA graph - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - - - - - - - - - - - - - - - - - - - - - - * - :abbr:`pooling (Pooling Models)` - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - - - - - - - - - - - - - - - - - - - - - * - :abbr:`enc-dec (Encoder-Decoder Models)` - - ✗ - - `✗ `__ - - ✗ - - ✗ - - `✗ `__ - - ✅ - - ✅ - - - - - - - - - - - - - - - - - - - * - :abbr:`logP (Logprobs)` - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✗ - - ✅ - - - - - - - - - - - - - - - - - * - :abbr:`prmpt logP (Prompt Logprobs)` - - ✅ - - ✅ - - ✅ - - ✅ - - `✗ `__ - - ✅ - - ✗ - - ✅ - - ✅ - - - - - - - - - - - - - - - * - :abbr:`async output (Async Output Processing)` - - ✅ - - ✅ - - ✅ - - ✅ - - ✗ - - ✅ - - ✗ - - ✗ - - ✅ - - ✅ - - - - - - - - - - - - - * - multi-step - - ✗ - - ✅ - - ✗ - - ✅ - - ✗ - - ✅ - - ✗ - - ✗ - - ✅ - - `✗ `__ - - ✅ - - - - - - - - - - - * - :abbr:`mm (Multimodal Inputs)` - - ✅ - - `✗ `__ - - `✗ `__ - - ? - - ? - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ? 
- - - - - - - - - * - best-of - - ✅ - - ✅ - - ✅ - - ✅ - - `✗ `__ - - ✅ - - ✗ - - ✅ - - ✅ - - ✅ - - ? - - `✗ `__ - - ✅ - - - - - - - * - beam-search - - ✅ - - ✅ - - ✅ - - ✅ - - `✗ `__ - - ✅ - - ✗ - - ✅ - - ✅ - - ✅ - - ? - - `✗ `__ - - ? - - ✅ - - - - - * - :abbr:`guided dec (Guided Decoding)` - - ✅ - - ✅ - - ? - - ? - - ✅ - - ✅ - - ✗ - - ? - - ✅ - - ✅ - - ✅ - - `✗ `__ - - ? - - ✅ - - ✅ - - - - -Feature x Hardware -^^^^^^^^^^^^^^^^^^ - -.. list-table:: - :header-rows: 1 - :widths: auto - - * - Feature - - Volta - - Turing - - Ampere - - Ada - - Hopper - - CPU - - AMD - * - :ref:`CP ` - - `✗ `__ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - :ref:`APC ` - - `✗ `__ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - :ref:`LoRA ` - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - `✗ `__ - - ✅ - * - :abbr:`prmpt adptr (Prompt Adapter)` - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - `✗ `__ - - ✅ - * - :ref:`SD ` - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - CUDA graph - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✗ - - ✅ - * - :abbr:`pooling (Pooling Models)` - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ? 
- * - :abbr:`enc-dec (Encoder-Decoder Models)` - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✗ - * - :abbr:`mm (Multimodal Inputs)` - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - :abbr:`logP (Logprobs)` - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - :abbr:`prmpt logP (Prompt Logprobs)` - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - :abbr:`async output (Async Output Processing)` - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✗ - - ✗ - * - multi-step - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - `✗ `__ - - ✅ - * - best-of - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - beam-search - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - :abbr:`guided dec (Guided Decoding)` - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ diff --git a/docs/source/usage/disagg_prefill.md b/docs/source/usage/disagg_prefill.md new file mode 100644 index 0000000000000..a61c00fad1e3c --- /dev/null +++ b/docs/source/usage/disagg_prefill.md @@ -0,0 +1,64 @@ +(disagg-prefill)= + +# Disaggregated prefilling (experimental) + +This page introduces you to the disaggregated prefilling feature in vLLM. This feature is experimental and subject to change. + +## Why disaggregated prefilling? + +Two main reasons: + +- **Tuning time-to-first-token (TTFT) and inter-token-latency (ITL) separately**. Disaggregated prefilling puts the prefill and decode phases of LLM inference inside different vLLM instances. This gives you the flexibility to assign different parallel strategies (e.g. `tp` and `pp`) to tune TTFT without affecting ITL, or to tune ITL without affecting TTFT. +- **Controlling tail ITL**. Without disaggregated prefilling, vLLM may insert some prefill jobs during the decoding of one request. This results in higher tail latency. Disaggregated prefilling helps you solve this issue and control tail ITL. Chunked prefill with a proper chunk size also can achieve the same goal, but in practice it's hard to figure out the correct chunk size value. So disaggregated prefilling is a much more reliable way to control tail ITL.
+ +```{note} +Disaggregated prefill DOES NOT improve throughput. +``` + +## Usage example + +Please refer to `examples/disaggregated_prefill.sh` for the example usage of disaggregated prefilling. + +## Benchmarks + +Please refer to `benchmarks/disagg_benchmarks/` for disaggregated prefilling benchmarks. + +## Development + +We implement disaggregated prefilling by running 2 vLLM instances. One for prefill (we call it prefill instance) and one for decode (we call it decode instance), and then use a connector to transfer the prefill KV caches and results from prefill instance to decode instance. + +All disaggregated prefilling implementation is under `vllm/distributed/kv_transfer`. + +Key abstractions for disaggregated prefilling: + +- **Connector**: Connector allows **kv consumer** to retrieve the KV caches of a batch of requests from **kv producer**. +- **LookupBuffer**: LookupBuffer provides two APIs: `insert` KV cache and `drop_select` KV cache. The semantics of `insert` and `drop_select` are similar to SQL, where `insert` inserts a KV cache into the buffer, and `drop_select` returns the KV cache that matches the given condition and drops it from the buffer. +- **Pipe**: A single-direction FIFO pipe for tensor transmission. It supports `send_tensor` and `recv_tensor`. + +```{note} +`insert` is a non-blocking operation but `drop_select` is a blocking operation. +``` + +Here is a figure illustrating how the above 3 abstractions are organized: + +```{image} /assets/usage/disagg_prefill/abstraction.jpg +:alt: Disaggregated prefilling abstractions +``` + +The workflow of disaggregated prefilling is as follows: + +```{image} /assets/usage/disagg_prefill/overview.jpg +:alt: Disaggregated prefilling workflow +``` + +The `buffer` corresponds to the `insert` API in LookupBuffer, and the `drop_select` corresponds to the `drop_select` API in LookupBuffer.
+ +## Third-party contributions + +Disaggregated prefilling is highly related to infrastructure, so vLLM relies on third-party connectors for production-level disaggregated prefilling (and the vLLM team will actively review and merge new PRs for third-party connectors). + +We recommend three implementation approaches: + +- **Fully-customized connector**: Implement your own `Connector`, and call third-party libraries to send and receive KV caches, and much more (like editing vLLM's model input to perform customized prefilling, etc). This approach gives you the most control, but at the risk of being incompatible with future vLLM versions. +- **Database-like connector**: Implement your own `LookupBuffer` and support the `insert` and `drop_select` APIs just like SQL. +- **Distributed P2P connector**: Implement your own `Pipe` and support the `send_tensor` and `recv_tensor` APIs, just like `torch.distributed`. diff --git a/docs/source/usage/disagg_prefill.rst b/docs/source/usage/disagg_prefill.rst deleted file mode 100644 index 9fe714b4fd856..0000000000000 --- a/docs/source/usage/disagg_prefill.rst +++ /dev/null @@ -1,69 +0,0 @@ -.. _disagg_prefill: - -Disaggregated prefilling (experimental) -======================================= - -This page introduces you the disaggregated prefilling feature in vLLM. This feature is experimental and subject to change. - -Why disaggregated prefilling? ------------------------------ - -Two main reasons: - -* **Tuning time-to-first-token (TTFT) and inter-token-latency (ITL) separately**. Disaggregated prefilling put prefill and decode phase of LLM inference inside different vLLM instances. This gives you the flexibility to assign different parallel strategies (e.g. ``tp`` and ``pp``) to tune TTFT without affecting ITL, or to tune ITL without affecting TTFT. -* **Controlling tail ITL**. Without disaggregated prefilling, vLLM may insert some prefill jobs during the decoding of one request. This results in higher tail latency.
Disaggregated prefilling helps you solve this issue and control tail ITL. Chunked prefill with a proper chunk size also can achieve the same goal, but in practice it's hard to figure out the correct chunk size value. So disaggregated prefilling is a much more reliable way to control tail ITL. - -.. note:: - Disaggregated prefill DOES NOT improve throughput. - -Usage example -------------- - -Please refer to ``examples/disaggregated_prefill.sh`` for the example usage of disaggregated prefilling. - - -Benchmarks ----------- - -Please refer to ``benchmarks/disagg_benchmarks/`` for disaggregated prefilling benchmarks. - - -Development ------------ - -We implement disaggregated prefilling by running 2 vLLM instances. One for prefill (we call it prefill instance) and one for decode (we call it decode instance), and then use a connector to transfer the prefill KV caches and results from prefill instance to decode instance. - -All disaggregated prefilling implementation is under ``vllm/distributed/kv_transfer``. - -Key abstractions for disaggregated prefilling: - -* **Connector**: Connector allows **kv consumer** to retrieve the KV caches of a batch of request from **kv producer**. -* **LookupBuffer**: LookupBuffer provides two API: ``insert`` KV cache and ``drop_select`` KV cache. The semantics of ``insert`` and ``drop_select`` are similar to SQL, where ``insert`` inserts a KV cache into the buffer, and ``drop_select`` returns the KV cache that matches the given condition and drop it from the buffer. -* **Pipe**: A single-direction FIFO pipe for tensor transmission. It supports ``send_tensor`` and ``recv_tensor``. - -.. note:: - ``insert`` is non-blocking operation but ``drop_select`` is blocking operation. - -Here is a figure illustrating how the above 3 abstractions are organized: - -.. image:: /assets/usage/disagg_prefill/abstraction.jpg - :alt: Disaggregated prefilling abstractions - -The workflow of disaggregated prefilling is as follows: - -.. 
image:: /assets/usage/disagg_prefill/overview.jpg - :alt: Disaggregated prefilling workflow - -The ``buffer`` corresponds to ``insert`` API in LookupBuffer, and the ``drop_select`` corresponds to ``drop_select`` API in LookupBuffer. - - -Third-party contributions -------------------------- - -Disaggregated prefilling is highly related to infrastructure, so vLLM relies on third-party connectors for production-level disaggregated prefilling (and vLLM team will actively review and merge new PRs for third-party connectors). - -We recommend three ways of implementations: - -* **Fully-customized connector**: Implement your own ``Connector``, and call third-party libraries to send and receive KV caches, and many many more (like editing vLLM's model input to perform customized prefilling, etc). This approach gives you the most control, but at the risk of being incompatible with future vLLM versions. -* **Database-like connector**: Implement your own ``LookupBuffer`` and support the ``insert`` and ``drop_select`` APIs just like SQL. -* **Distributed P2P connector**: Implement your own ``Pipe`` and support the ``send_tensor`` and ``recv_tensor`` APIs, just like `torch.distributed`. diff --git a/docs/source/usage/engine_args.rst b/docs/source/usage/engine_args.md similarity index 76% rename from docs/source/usage/engine_args.rst rename to docs/source/usage/engine_args.md index e7ce8cdcabe88..cd3c6a430b7fa 100644 --- a/docs/source/usage/engine_args.rst +++ b/docs/source/usage/engine_args.md @@ -1,23 +1,25 @@ -.. _engine_args: +(engine-args)= -Engine Arguments -================ +# Engine Arguments Below, you can find an explanation of every engine argument for vLLM: +```{eval-rst} .. argparse:: :module: vllm.engine.arg_utils :func: _engine_args_parser :prog: vllm serve :nodefaultconst: +``` -Async Engine Arguments ----------------------- +## Async Engine Arguments Below are the additional arguments related to the asynchronous engine: +```{eval-rst} .. 
argparse:: :module: vllm.engine.arg_utils :func: _async_engine_args_parser :prog: vllm serve - :nodefaultconst: \ No newline at end of file + :nodefaultconst: +``` diff --git a/docs/source/usage/env_vars.md b/docs/source/usage/env_vars.md new file mode 100644 index 0000000000000..f9b08077a03b4 --- /dev/null +++ b/docs/source/usage/env_vars.md @@ -0,0 +1,15 @@ +# Environment Variables + +vLLM uses the following environment variables to configure the system: + +```{warning} +Please note that `VLLM_PORT` and `VLLM_HOST_IP` set the port and ip for vLLM's **internal usage**. It is not the port and ip for the API server. If you use `--host $VLLM_HOST_IP` and `--port $VLLM_PORT` to start the API server, it will not work. + +All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables). +``` + +```{literalinclude} ../../../vllm/envs.py +:end-before: end-env-vars-definition +:language: python +:start-after: begin-env-vars-definition +``` diff --git a/docs/source/usage/env_vars.rst b/docs/source/usage/env_vars.rst deleted file mode 100644 index ff2259c0da3f1..0000000000000 --- a/docs/source/usage/env_vars.rst +++ /dev/null @@ -1,14 +0,0 @@ -Environment Variables -======================== - -vLLM uses the following environment variables to configure the system: - -.. warning:: - Please note that ``VLLM_PORT`` and ``VLLM_HOST_IP`` set the port and ip for vLLM's **internal usage**. It is not the port and ip for the API server. If you use ``--host $VLLM_HOST_IP`` and ``--port $VLLM_PORT`` to start the API server, it will not work. 
- - All environment variables used by vLLM are prefixed with ``VLLM_``. **Special care should be taken for Kubernetes users**: please do not name the service as ``vllm``, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because `Kubernetes sets environment variables for each service with the capitalized service name as the prefix `_. - -.. literalinclude:: ../../../vllm/envs.py - :language: python - :start-after: begin-env-vars-definition - :end-before: end-env-vars-definition diff --git a/docs/source/usage/faq.rst b/docs/source/usage/faq.md similarity index 61% rename from docs/source/usage/faq.rst rename to docs/source/usage/faq.md index d88da32092924..fde2954f10c59 100644 --- a/docs/source/usage/faq.rst +++ b/docs/source/usage/faq.md @@ -1,34 +1,33 @@ -.. _faq: +(faq)= -Frequently Asked Questions -=========================== +# Frequently Asked Questions - Q: How can I serve multiple models on a single port using the OpenAI API? +> Q: How can I serve multiple models on a single port using the OpenAI API? A: Assuming that you're referring to using OpenAI compatible server to serve multiple models at once, that is not currently supported, you can run multiple instances of the server (each serving a different model) at the same time, and have another layer to route the incoming request to the correct server accordingly. ----------------------------------------- +______________________________________________________________________ - Q: Which model to use for offline inference embedding? +> Q: Which model to use for offline inference embedding? -A: You can try `e5-mistral-7b-instruct `__ and `BAAI/bge-base-en-v1.5 `__; -more are listed :ref:`here `. +A: You can try [e5-mistral-7b-instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct) and [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5); +more are listed [here](#supported-models). 
-By extracting hidden states, vLLM can automatically convert text generation models like `Llama-3-8B `__, -`Mistral-7B-Instruct-v0.3 `__ into embedding models, +By extracting hidden states, vLLM can automatically convert text generation models like [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B), +[Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) into embedding models, but they are expected to be inferior to models that are specifically trained on embedding tasks. ----------------------------------------- +______________________________________________________________________ - Q: Can the output of a prompt vary across runs in vLLM? +> Q: Can the output of a prompt vary across runs in vLLM? A: Yes, it can. vLLM does not guarantee stable log probabilities (logprobs) for the output tokens. Variations in logprobs may occur due to -numerical instability in Torch operations or non-deterministic behavior in batched Torch operations when batching changes. For more details, -see the `Numerical Accuracy section `_. +numerical instability in Torch operations or non-deterministic behavior in batched Torch operations when batching changes. For more details, +see the [Numerical Accuracy section](https://pytorch.org/docs/stable/notes/numerical_accuracy.html#batched-computations-or-slice-computations). In vLLM, the same requests might be batched differently due to factors such as other concurrent requests, -changes in batch size, or batch expansion in speculative decoding. These batching variations, combined with numerical instability of Torch operations, -can lead to slightly different logit/logprob values at each step. Such differences can accumulate, potentially resulting in +changes in batch size, or batch expansion in speculative decoding. These batching variations, combined with numerical instability of Torch operations, +can lead to slightly different logit/logprob values at each step.
Such differences can accumulate, potentially resulting in different tokens being sampled. Once a different token is sampled, further divergence is likely. **Mitigation Strategies** diff --git a/docs/source/usage/lora.md b/docs/source/usage/lora.md new file mode 100644 index 0000000000000..e2ddde74aaa45 --- /dev/null +++ b/docs/source/usage/lora.md @@ -0,0 +1,215 @@ +(lora-adapter)= + +# LoRA Adapters + +This document shows you how to use [LoRA adapters](https://arxiv.org/abs/2106.09685) with vLLM on top of a base model. + +LoRA adapters can be used with any vLLM model that implements {class}`~vllm.model_executor.models.interfaces.SupportsLoRA`. + +Adapters can be efficiently served on a per request basis with minimal overhead. First we download the adapter(s) and save +them locally with + +```python +from huggingface_hub import snapshot_download + +sql_lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") +``` + +Then we instantiate the base model and pass in the `enable_lora=True` flag: + +```python +from vllm import LLM, SamplingParams +from vllm.lora.request import LoRARequest + +llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True) +``` + +We can now submit the prompts and call `llm.generate` with the `lora_request` parameter. The first parameter +of `LoRARequest` is a human identifiable name, the second parameter is a globally unique ID for the adapter and +the third parameter is the path to the LoRA adapter. 
+ +```python +sampling_params = SamplingParams( + temperature=0, + max_tokens=256, + stop=["[/assistant]"] +) + +prompts = [ + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", +] + +outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest("sql_adapter", 1, sql_lora_path) +) +``` + +Check out [examples/multilora_inference.py](https://github.com/vllm-project/vllm/blob/main/examples/multilora_inference.py) +for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. + +## Serving LoRA Adapters + +LoRA-adapted models can also be served with the OpenAI-compatible vLLM server. To do so, we use +`--lora-modules {name}={path} {name}={path}` to specify each LoRA module when we kick off the server: + +```bash +vllm serve meta-llama/Llama-2-7b-hf \ + --enable-lora \ + --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/ +``` + +```{note} +The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. Please check the latest commit ID in your environment to ensure you are using the correct one. +``` + +The server entrypoint accepts all other LoRA configuration parameters (`max_loras`, `max_lora_rank`, `max_cpu_loras`, +etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along +with its base model: + +```bash +curl localhost:8000/v1/models | jq .
+{ + "object": "list", + "data": [ + { + "id": "meta-llama/Llama-2-7b-hf", + "object": "model", + ... + }, + { + "id": "sql-lora", + "object": "model", + ... + } + ] +} +``` + +Requests can specify the LoRA adapter as if it were any other model via the `model` request parameter. The requests will be +processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other +LoRA adapter requests if they were provided and `max_loras` is set high enough). + +The following is an example request: + +```bash +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "sql-lora", + "prompt": "San Francisco is a", + "max_tokens": 7, + "temperature": 0 + }' | jq +``` + +## Dynamically serving LoRA Adapters + +In addition to serving LoRA adapters at server startup, the vLLM server now supports dynamically loading and unloading +LoRA adapters at runtime through dedicated API endpoints. This feature can be particularly useful when the flexibility +to change models on-the-fly is needed. + +Note: Enabling this feature in production environments is risky because users may then participate in model adapter management. + +To enable dynamic LoRA loading and unloading, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING` +is set to `True`. When this option is enabled, the API server will log a warning to indicate that dynamic loading is active. + +```bash +export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True +``` + +Loading a LoRA Adapter: + +To dynamically load a LoRA adapter, send a POST request to the `/v1/load_lora_adapter` endpoint with the necessary +details of the adapter to be loaded. The request payload should include the name and path to the LoRA adapter.
+ +Example request to load a LoRA adapter: + +```bash +curl -X POST http://localhost:8000/v1/load_lora_adapter \ +-H "Content-Type: application/json" \ +-d '{ + "lora_name": "sql_adapter", + "lora_path": "/path/to/sql-lora-adapter" +}' +``` + +Upon a successful request, the API will respond with a 200 OK status code. If an error occurs, such as if the adapter +cannot be found or loaded, an appropriate error message will be returned. + +Unloading a LoRA Adapter: + +To unload a LoRA adapter that has been previously loaded, send a POST request to the `/v1/unload_lora_adapter` endpoint +with the name or ID of the adapter to be unloaded. + +Example request to unload a LoRA adapter: + +```bash +curl -X POST http://localhost:8000/v1/unload_lora_adapter \ +-H "Content-Type: application/json" \ +-d '{ + "lora_name": "sql_adapter" +}' +``` + +## New format for `--lora-modules` + +In the previous version, users would provide LoRA modules via the following format, either as a key-value pair or in JSON format. For example: + +```bash +--lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/ +``` + +This would only include the `name` and `path` for each LoRA module, but did not provide a way to specify a `base_model_name`. +Now, you can specify a base_model_name alongside the name and path using JSON format. For example: + +```bash +--lora-modules '{"name": "sql-lora", "path": "/path/to/lora", "base_model_name": "meta-llama/Llama-2-7b"}' +``` + +To provide the backward compatibility support, you can still use the old key-value format (name=path), but the `base_model_name` will remain unspecified in that case. + +## Lora model lineage in model card + +The new format of `--lora-modules` is mainly to support the display of parent model information in the model card. 
Here's an explanation of how your current response supports this: + +- The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter. +- The `root` field points to the artifact location of the lora adapter. + +```bash +$ curl http://localhost:8000/v1/models + +{ + "object": "list", + "data": [ + { + "id": "meta-llama/Llama-2-7b-hf", + "object": "model", + "created": 1715644056, + "owned_by": "vllm", + "root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/", + "parent": null, + "permission": [ + { + ..... + } + ] + }, + { + "id": "sql-lora", + "object": "model", + "created": 1715644056, + "owned_by": "vllm", + "root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/", + "parent": meta-llama/Llama-2-7b-hf, + "permission": [ + { + .... + } + ] + } + ] +} +``` diff --git a/docs/source/usage/lora.rst b/docs/source/usage/lora.rst deleted file mode 100644 index c2c6fa2aebfaf..0000000000000 --- a/docs/source/usage/lora.rst +++ /dev/null @@ -1,225 +0,0 @@ -.. _lora: - -LoRA Adapters -============= - -This document shows you how to use `LoRA adapters `_ with vLLM on top of a base model. - -LoRA adapters can be used with any vLLM model that implements :class:`~vllm.model_executor.models.interfaces.SupportsLoRA`. - -Adapters can be efficiently served on a per request basis with minimal overhead. First we download the adapter(s) and save -them locally with - -.. code-block:: python - - from huggingface_hub import snapshot_download - - sql_lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") - - -Then we instantiate the base model and pass in the ``enable_lora=True`` flag: - -.. 
code-block:: python - - from vllm import LLM, SamplingParams - from vllm.lora.request import LoRARequest - - llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True) - - -We can now submit the prompts and call ``llm.generate`` with the ``lora_request`` parameter. The first parameter -of ``LoRARequest`` is a human identifiable name, the second parameter is a globally unique ID for the adapter and -the third parameter is the path to the LoRA adapter. - -.. code-block:: python - - sampling_params = SamplingParams( - temperature=0, - max_tokens=256, - stop=["[/assistant]"] - ) - - prompts = [ - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", - ] - - outputs = llm.generate( - prompts, - sampling_params, - lora_request=LoRARequest("sql_adapter", 1, sql_lora_path) - ) - - -Check out `examples/multilora_inference.py `_ -for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. - -Serving LoRA Adapters ---------------------- -LoRA adapted models can also be served with the Open-AI compatible vLLM server. To do so, we use -``--lora-modules {name}={path} {name}={path}`` to specify each LoRA module when we kickoff the server: - -.. code-block:: bash - - vllm serve meta-llama/Llama-2-7b-hf \ - --enable-lora \ - --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/ - -.. note:: - The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. 
Please check the latest commit ID in your environment to ensure you are using the correct one. - -The server entrypoint accepts all other LoRA configuration parameters (``max_loras``, ``max_lora_rank``, ``max_cpu_loras``, -etc.), which will apply to all forthcoming requests. Upon querying the ``/models`` endpoint, we should see our LoRA along -with its base model: - -.. code-block:: bash - - curl localhost:8000/v1/models | jq . - { - "object": "list", - "data": [ - { - "id": "meta-llama/Llama-2-7b-hf", - "object": "model", - ... - }, - { - "id": "sql-lora", - "object": "model", - ... - } - ] - } - -Requests can specify the LoRA adapter as if it were any other model via the ``model`` request parameter. The requests will be -processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other -LoRA adapter requests if they were provided and ``max_loras`` is set high enough). - -The following is an example request - -.. code-block:: bash - - curl http://localhost:8000/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "sql-lora", - "prompt": "San Francisco is a", - "max_tokens": 7, - "temperature": 0 - }' | jq - - -Dynamically serving LoRA Adapters ---------------------------------- - -In addition to serving LoRA adapters at server startup, the vLLM server now supports dynamically loading and unloading -LoRA adapters at runtime through dedicated API endpoints. This feature can be particularly useful when the flexibility -to change models on-the-fly is needed. - -Note: Enabling this feature in production environments is risky as user may participate model adapter management. - -To enable dynamic LoRA loading and unloading, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING` -is set to `True`. When this option is enabled, the API server will log a warning to indicate that dynamic loading is active. - -.. 
code-block:: bash - - export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True - - -Loading a LoRA Adapter: - -To dynamically load a LoRA adapter, send a POST request to the `/v1/load_lora_adapter` endpoint with the necessary -details of the adapter to be loaded. The request payload should include the name and path to the LoRA adapter. - -Example request to load a LoRA adapter: - -.. code-block:: bash - - curl -X POST http://localhost:8000/v1/load_lora_adapter \ - -H "Content-Type: application/json" \ - -d '{ - "lora_name": "sql_adapter", - "lora_path": "/path/to/sql-lora-adapter" - }' - -Upon a successful request, the API will respond with a 200 OK status code. If an error occurs, such as if the adapter -cannot be found or loaded, an appropriate error message will be returned. - -Unloading a LoRA Adapter: - -To unload a LoRA adapter that has been previously loaded, send a POST request to the `/v1/unload_lora_adapter` endpoint -with the name or ID of the adapter to be unloaded. - -Example request to unload a LoRA adapter: - -.. code-block:: bash - - curl -X POST http://localhost:8000/v1/unload_lora_adapter \ - -H "Content-Type: application/json" \ - -d '{ - "lora_name": "sql_adapter" - }' - - -New format for `--lora-modules` -------------------------------- - -In the previous version, users would provide LoRA modules via the following format, either as a key-value pair or in JSON format. For example: - -.. code-block:: bash - - --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/ - -This would only include the `name` and `path` for each LoRA module, but did not provide a way to specify a `base_model_name`. -Now, you can specify a base_model_name alongside the name and path using JSON format. For example: - -.. 
code-block:: bash - - --lora-modules '{"name": "sql-lora", "path": "/path/to/lora", "base_model_name": "meta-llama/Llama-2-7b"}' - -To provide the backward compatibility support, you can still use the old key-value format (name=path), but the `base_model_name` will remain unspecified in that case. - - -Lora model lineage in model card --------------------------------- - -The new format of `--lora-modules` is mainly to support the display of parent model information in the model card. Here's an explanation of how your current response supports this: - -- The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter. -- The `root` field points to the artifact location of the lora adapter. - -.. code-block:: bash - - $ curl http://localhost:8000/v1/models - - { - "object": "list", - "data": [ - { - "id": "meta-llama/Llama-2-7b-hf", - "object": "model", - "created": 1715644056, - "owned_by": "vllm", - "root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/", - "parent": null, - "permission": [ - { - ..... - } - ] - }, - { - "id": "sql-lora", - "object": "model", - "created": 1715644056, - "owned_by": "vllm", - "root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/", - "parent": meta-llama/Llama-2-7b-hf, - "permission": [ - { - .... - } - ] - } - ] - } diff --git a/docs/source/usage/multimodal_inputs.md b/docs/source/usage/multimodal_inputs.md new file mode 100644 index 0000000000000..b0c887398b1b7 --- /dev/null +++ b/docs/source/usage/multimodal_inputs.md @@ -0,0 +1,486 @@ +(multimodal-inputs)= + +# Multimodal Inputs + +This page teaches you how to pass multi-modal inputs to [multi-modal models](#supported-mm-models) in vLLM. + +```{note} +We are actively iterating on multi-modal support. 
See [this RFC](https://github.com/vllm-project/vllm/issues/4194) for upcoming changes, +and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests. +``` + +## Offline Inference + +To input multi-modal data, follow this schema in {class}`vllm.inputs.PromptType`: + +- `prompt`: The prompt should follow the format that is documented on HuggingFace. +- `multi_modal_data`: This is a dictionary that follows the schema defined in {class}`vllm.multimodal.MultiModalDataDict`. + +### Image + +You can pass a single image to the {code}`'image'` field of the multi-modal dictionary, as shown in the following examples: + +```python +llm = LLM(model="llava-hf/llava-1.5-7b-hf") + +# Refer to the HuggingFace repo for the correct format to use +prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:" + +# Load the image using PIL.Image +image = PIL.Image.open(...) + +# Single prompt inference +outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": {"image": image}, +}) + +for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + +# Batch inference +image_1 = PIL.Image.open(...) +image_2 = PIL.Image.open(...) +outputs = llm.generate( + [ + { + "prompt": "USER: <image>\nWhat is the content of this image?\nASSISTANT:", + "multi_modal_data": {"image": image_1}, + }, + { + "prompt": "USER: <image>\nWhat's the color of this image?\nASSISTANT:", + "multi_modal_data": {"image": image_2}, + } + ] +) + +for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) +``` + +A code example can be found in [examples/offline_inference_vision_language.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language.py).
+ +To substitute multiple images inside the same text prompt, you can pass in a list of images instead: + +```python +llm = LLM( + model="microsoft/Phi-3.5-vision-instruct", + trust_remote_code=True, # Required to load Phi-3.5-vision + max_model_len=4096, # Otherwise, it may not fit in smaller GPUs + limit_mm_per_prompt={"image": 2}, # The maximum number to accept +) + +# Refer to the HuggingFace repo for the correct format to use +prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n" + +# Load the images using PIL.Image +image1 = PIL.Image.open(...) +image2 = PIL.Image.open(...) + +outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": { + "image": [image1, image2] + }, +}) + +for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) +``` + +A code example can be found in [examples/offline_inference_vision_language_multi_image.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language_multi_image.py). + +Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos: + +```python +# Specify the maximum number of frames per video to be 4. This can be changed. +llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) + +# Create the request payload. +video_frames = ... # load your video making sure it only has the number of frames specified earlier. +message = { + "role": "user", + "content": [ + {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."}, + ], +} +for i in range(len(video_frames)): + base64_image = encode_image(video_frames[i]) # base64 encoding. + new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}} + message["content"].append(new_image) + +# Perform inference and log output. 
+outputs = llm.chat([message]) + +for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) +``` + +### Video + +You can pass a list of NumPy arrays directly to the {code}`'video'` field of the multi-modal dictionary +instead of using multi-image input. + +Please refer to [examples/offline_inference_vision_language.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language.py) for more details. + +### Audio + +You can pass a tuple {code}`(array, sampling_rate)` to the {code}`'audio'` field of the multi-modal dictionary. + +Please refer to [examples/offline_inference_audio_language.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_audio_language.py) for more details. + +### Embedding + +To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model, +pass a tensor of shape {code}`(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary. + +```python +# Inference with image embeddings as input +llm = LLM(model="llava-hf/llava-1.5-7b-hf") + +# Refer to the HuggingFace repo for the correct format to use +prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:" + +# Embeddings for single image +# torch.Tensor of shape (1, image_feature_size, hidden_size of LM) +image_embeds = torch.load(...) + +outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": {"image": image_embeds}, +}) + +for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) +``` + +For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings: + +```python +# Construct the prompt based on your model +prompt = ... + +# Embeddings for multiple images +# torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM) +image_embeds = torch.load(...)
+ +# Qwen2-VL +llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) +mm_data = { + "image": { + "image_embeds": image_embeds, + # image_grid_thw is needed to calculate positional encoding. + "image_grid_thw": torch.load(...), # torch.Tensor of shape (1, 3), + } +} + +# MiniCPM-V +llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4}) +mm_data = { + "image": { + "image_embeds": image_embeds, + # image_size_list is needed to calculate details of the sliced image. + "image_size_list": [image.size for image in images], # list of image sizes + } +} + +outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": mm_data, +}) + +for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) +``` + +## Online Inference + +Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). + +```{important} +A chat template is **required** to use Chat Completions API. + +Although most models come with a chat template, for others you have to define one yourself. +The chat template can be inferred based on the documentation on the model's HuggingFace repo. +For example, LLaVA-1.5 (`llava-hf/llava-1.5-7b-hf`) requires a chat template that can be found [here](https://github.com/vllm-project/vllm/blob/main/examples/template_llava.jinja). +``` + +### Image + +Image input is supported according to [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). +Here is a simple example using Phi-3.5-Vision. 
+ +First, launch the OpenAI-compatible server: + +```bash +vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ + --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2 +``` + +Then, you can use the OpenAI client as follows: + +```python +from openai import OpenAI + +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) + +# Single-image input inference +image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + +chat_response = client.chat.completions.create( + model="microsoft/Phi-3.5-vision-instruct", + messages=[{ + "role": "user", + "content": [ + # NOTE: The prompt formatting with the image token `<image>` is not needed + # since the prompt will be processed automatically by the API server. + {"type": "text", "text": "What’s in this image?"}, + {"type": "image_url", "image_url": {"url": image_url}}, + ], + }], +) +print("Chat completion output:", chat_response.choices[0].message.content) + +# Multi-image input inference +image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg" +image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" + +chat_response = client.chat.completions.create( + model="microsoft/Phi-3.5-vision-instruct", + messages=[{ + "role": "user", + "content": [ + {"type": "text", "text": "What are the animals in these images?"}, + {"type": "image_url", "image_url": {"url": image_url_duck}}, + {"type": "image_url", "image_url": {"url": image_url_lion}}, + ], + }], +) +print("Chat completion output:", chat_response.choices[0].message.content) +``` + +A full code example can be found in
[examples/openai_chat_completion_client_for_multimodal.py](https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_client_for_multimodal.py). + +```{tip} +Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine, +and pass the file path as `url` in the API request. +``` + +```{tip} +There is no need to place image placeholders in the text content of the API request - they are already represented by the image content. +In fact, you can place image placeholders in the middle of the text by interleaving text and image content. +``` + +````{note} +By default, the timeout for fetching images through HTTP URL is `5` seconds. +You can override this by setting the environment variable: + +```console +$ export VLLM_IMAGE_FETCH_TIMEOUT=<timeout> +``` +```` + +### Video + +Instead of {code}`image_url`, you can pass a video file via {code}`video_url`. + +You can use [these tests](https://github.com/vllm-project/vllm/blob/main/tests/entrypoints/openai/test_video.py) as reference. + +````{note} +By default, the timeout for fetching videos through HTTP URL is `30` seconds. +You can override this by setting the environment variable: + +```console +$ export VLLM_VIDEO_FETCH_TIMEOUT=<timeout> +``` +```` + +### Audio + +Audio input is supported according to [OpenAI Audio API](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in). +Here is a simple example using Ultravox-v0.3. 
+ +First, launch the OpenAI-compatible server: + +```bash +vllm serve fixie-ai/ultravox-v0_3 +``` + +Then, you can use the OpenAI client as follows: + +```python +import base64 +import requests +from openai import OpenAI +from vllm.assets.audio import AudioAsset + +def encode_base64_content_from_url(content_url: str) -> str: + """Encode a content retrieved from a remote url to base64 format.""" + + with requests.get(content_url) as response: + response.raise_for_status() + result = base64.b64encode(response.content).decode('utf-8') + + return result + +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) + +# Any format supported by librosa is supported +audio_url = AudioAsset("winning_call").url +audio_base64 = encode_base64_content_from_url(audio_url) + +chat_completion_from_base64 = client.chat.completions.create( + messages=[{ + "role": "user", + "content": [ + { + "type": "text", + "text": "What's in this audio?" + }, + { + "type": "input_audio", + "input_audio": { + "data": audio_base64, + "format": "wav" + }, + }, + ], + }], + model=model, + max_completion_tokens=64, +) + +result = chat_completion_from_base64.choices[0].message.content +print("Chat completion output from input audio:", result) +``` + +Alternatively, you can pass {code}`audio_url`, which is the audio counterpart of {code}`image_url` for image input: + +```python +chat_completion_from_url = client.chat.completions.create( + messages=[{ + "role": "user", + "content": [ + { + "type": "text", + "text": "What's in this audio?" 
+ }, + { + "type": "audio_url", + "audio_url": { + "url": audio_url + }, + }, + ], + }], + model=model, + max_completion_tokens=64, +) + +result = chat_completion_from_url.choices[0].message.content +print("Chat completion output from audio url:", result) +``` + +A full code example can be found in [examples/openai_chat_completion_client_for_multimodal.py](https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_client_for_multimodal.py). + +````{note} +By default, the timeout for fetching audios through HTTP URL is `10` seconds. +You can override this by setting the environment variable: + +```console +$ export VLLM_AUDIO_FETCH_TIMEOUT=<timeout> +``` +```` + +### Embedding + +vLLM's Embeddings API is a superset of OpenAI's [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings), +where a list of chat `messages` can be passed instead of batched `inputs`. This enables multi-modal inputs to be passed to embedding models. + +```{tip} +The schema of `messages` is exactly the same as in Chat Completions API. +You can refer to the above tutorials for more details on how to pass each type of multi-modal data. +``` + +Usually, embedding models do not expect chat-based input, so we need to use a custom chat template to format the text and images. +Refer to the examples below for illustration. + +Here is an end-to-end example using VLM2Vec. To serve the model: + +```bash +vllm serve TIGER-Lab/VLM2Vec-Full --task embed \ + --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja +``` + +```{important} +Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass `--task embed` +to run this model in embedding mode instead of text generation mode. + +The custom chat template is completely different from the original one for this model, +and can be found [here](https://github.com/vllm-project/vllm/blob/main/examples/template_vlm2vec.jinja). 
+``` + +Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library: + +```python +import requests + +image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + +response = requests.post( + "http://localhost:8000/v1/embeddings", + json={ + "model": "TIGER-Lab/VLM2Vec-Full", + "messages": [{ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "Represent the given image."}, + ], + }], + "encoding_format": "float", + }, +) +response.raise_for_status() +response_json = response.json() +print("Embedding output:", response_json["data"][0]["embedding"]) +``` + +Below is another example, this time using the `MrLight/dse-qwen2-2b-mrl-v1` model. + +```bash +vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embed \ + --trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja +``` + +```{important} +Like with VLM2Vec, we have to explicitly pass `--task embed`. + +Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled +by [this custom chat template](https://github.com/vllm-project/vllm/blob/main/examples/template_dse_qwen2_vl.jinja). +``` + +```{important} +Also important, `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code +example below for details. +``` + +A full code example can be found in [examples/openai_chat_embedding_client_for_multimodal.py](https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_embedding_client_for_multimodal.py). 
diff --git a/docs/source/usage/multimodal_inputs.rst b/docs/source/usage/multimodal_inputs.rst deleted file mode 100644 index 680382e457cc5..0000000000000 --- a/docs/source/usage/multimodal_inputs.rst +++ /dev/null @@ -1,492 +0,0 @@ -.. _multimodal_inputs: - -Multimodal Inputs -================= - -This page teaches you how to pass multi-modal inputs to :ref:`multi-modal models ` in vLLM. - -.. note:: - We are actively iterating on multi-modal support. See `this RFC `_ for upcoming changes, - and `open an issue on GitHub `_ if you have any feedback or feature requests. - -Offline Inference ------------------ - -To input multi-modal data, follow this schema in :class:`vllm.inputs.PromptType`: - -* ``prompt``: The prompt should follow the format that is documented on HuggingFace. -* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`vllm.multimodal.MultiModalDataDict`. - -Image -^^^^^ - -You can pass a single image to the :code:`'image'` field of the multi-modal dictionary, as shown in the following examples: - -.. code-block:: python - - llm = LLM(model="llava-hf/llava-1.5-7b-hf") - - # Refer to the HuggingFace repo for the correct format to use - prompt = "USER: \nWhat is the content of this image?\nASSISTANT:" - - # Load the image using PIL.Image - image = PIL.Image.open(...) - - # Single prompt inference - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": {"image": image}, - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - # Batch inference - image_1 = PIL.Image.open(...) - image_2 = PIL.Image.open(...) 
- outputs = llm.generate( - [ - { - "prompt": "USER: \nWhat is the content of this image?\nASSISTANT:", - "multi_modal_data": {"image": image_1}, - }, - { - "prompt": "USER: \nWhat's the color of this image?\nASSISTANT:", - "multi_modal_data": {"image": image_2}, - } - ] - ) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - -A code example can be found in `examples/offline_inference_vision_language.py `_. - -To substitute multiple images inside the same text prompt, you can pass in a list of images instead: - -.. code-block:: python - - llm = LLM( - model="microsoft/Phi-3.5-vision-instruct", - trust_remote_code=True, # Required to load Phi-3.5-vision - max_model_len=4096, # Otherwise, it may not fit in smaller GPUs - limit_mm_per_prompt={"image": 2}, # The maximum number to accept - ) - - # Refer to the HuggingFace repo for the correct format to use - prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n" - - # Load the images using PIL.Image - image1 = PIL.Image.open(...) - image2 = PIL.Image.open(...) - - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": { - "image": [image1, image2] - }, - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - -A code example can be found in `examples/offline_inference_vision_language_multi_image.py `_. - -Multi-image input can be extended to perform video captioning. We show this with `Qwen2-VL `_ as it supports videos: - -.. code-block:: python - - # Specify the maximum number of frames per video to be 4. This can be changed. - llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) - - # Create the request payload. - video_frames = ... # load your video making sure it only has the number of frames specified earlier. - message = { - "role": "user", - "content": [ - {"type": "text", "text": "Describe this set of frames. 
Consider the frames to be a part of the same video."}, - ], - } - for i in range(len(video_frames)): - base64_image = encode_image(video_frames[i]) # base64 encoding. - new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}} - message["content"].append(new_image) - - # Perform inference and log output. - outputs = llm.chat([message]) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - -Video -^^^^^ - -You can pass a list of NumPy arrays directly to the :code:`'video'` field of the multi-modal dictionary -instead of using multi-image input. - -Please refer to `examples/offline_inference_vision_language.py `_ for more details. - -Audio -^^^^^ - -You can pass a tuple :code:`(array, sampling_rate)` to the :code:`'audio'` field of the multi-modal dictionary. - -Please refer to `examples/offline_inference_audio_language.py `_ for more details. - -Embedding -^^^^^^^^^ - -To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model, -pass a tensor of shape :code:`(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary. - -.. code-block:: python - - # Inference with image embeddings as input - llm = LLM(model="llava-hf/llava-1.5-7b-hf") - - # Refer to the HuggingFace repo for the correct format to use - prompt = "USER: \nWhat is the content of this image?\nASSISTANT:" - - # Embeddings for single image - # torch.Tensor of shape (1, image_feature_size, hidden_size of LM) - image_embeds = torch.load(...) - - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": {"image": image_embeds}, - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - -For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings: - -.. code-block:: python - - # Construct the prompt based on your model - prompt = ... 
- - # Embeddings for multiple images - # torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM) - image_embeds = torch.load(...) - - # Qwen2-VL - llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) - mm_data = { - "image": { - "image_embeds": image_embeds, - # image_grid_thw is needed to calculate positional encoding. - "image_grid_thw": torch.load(...), # torch.Tensor of shape (1, 3), - } - } - - # MiniCPM-V - llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4}) - mm_data = { - "image": { - "image_embeds": image_embeds, - # image_size_list is needed to calculate details of the sliced image. - "image_size_list": [image.size for image in images], # list of image sizes - } - } - - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": mm_data, - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - -Online Inference ----------------- - -Our OpenAI-compatible server accepts multi-modal data via the `Chat Completions API `_. - -.. important:: - A chat template is **required** to use Chat Completions API. - - Although most models come with a chat template, for others you have to define one yourself. - The chat template can be inferred based on the documentation on the model's HuggingFace repo. - For example, LLaVA-1.5 (``llava-hf/llava-1.5-7b-hf``) requires a chat template that can be found `here `__. - -Image -^^^^^ - -Image input is supported according to `OpenAI Vision API `_. -Here is a simple example using Phi-3.5-Vision. - -First, launch the OpenAI-compatible server: - -.. code-block:: bash - - vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ - --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2 - -Then, you can use the OpenAI client as follows: - -.. 
code-block:: python - - from openai import OpenAI - - openai_api_key = "EMPTY" - openai_api_base = "http://localhost:8000/v1" - - client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, - ) - - # Single-image input inference - image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - - chat_response = client.chat.completions.create( - model="microsoft/Phi-3.5-vision-instruct", - messages=[{ - "role": "user", - "content": [ - # NOTE: The prompt formatting with the image token `` is not needed - # since the prompt will be processed automatically by the API server. - {"type": "text", "text": "What’s in this image?"}, - {"type": "image_url", "image_url": {"url": image_url}}, - ], - }], - ) - print("Chat completion output:", chat_response.choices[0].message.content) - - # Multi-image input inference - image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg" - image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" - - chat_response = client.chat.completions.create( - model="microsoft/Phi-3.5-vision-instruct", - messages=[{ - "role": "user", - "content": [ - {"type": "text", "text": "What are the animals in these images?"}, - {"type": "image_url", "image_url": {"url": image_url_duck}}, - {"type": "image_url", "image_url": {"url": image_url_lion}}, - ], - }], - ) - print("Chat completion output:", chat_response.choices[0].message.content) - -A full code example can be found in `examples/openai_chat_completion_client_for_multimodal.py `_. - -.. tip:: - Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via ``--allowed-local-media-path`` when launching the API server/engine, - and pass the file path as ``url`` in the API request. 
- -.. tip:: - There is no need to place image placeholders in the text content of the API request - they are already represented by the image content. - In fact, you can place image placeholders in the middle of the text by interleaving text and image content. - -.. note:: - - By default, the timeout for fetching images through HTTP URL is ``5`` seconds. - You can override this by setting the environment variable: - - .. code-block:: console - - $ export VLLM_IMAGE_FETCH_TIMEOUT= - -Video -^^^^^ - -Instead of :code:`image_url`, you can pass a video file via :code:`video_url`. - -You can use `these tests `_ as reference. - -.. note:: - - By default, the timeout for fetching videos through HTTP URL url is ``30`` seconds. - You can override this by setting the environment variable: - - .. code-block:: console - - $ export VLLM_VIDEO_FETCH_TIMEOUT= - -Audio -^^^^^ - -Audio input is supported according to `OpenAI Audio API `_. -Here is a simple example using Ultravox-v0.3. - -First, launch the OpenAI-compatible server: - -.. code-block:: bash - - vllm serve fixie-ai/ultravox-v0_3 - -Then, you can use the OpenAI client as follows: - -.. 
code-block:: python - - import base64 - import requests - from openai import OpenAI - from vllm.assets.audio import AudioAsset - - def encode_base64_content_from_url(content_url: str) -> str: - """Encode a content retrieved from a remote url to base64 format.""" - - with requests.get(content_url) as response: - response.raise_for_status() - result = base64.b64encode(response.content).decode('utf-8') - - return result - - openai_api_key = "EMPTY" - openai_api_base = "http://localhost:8000/v1" - - client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, - ) - - # Any format supported by librosa is supported - audio_url = AudioAsset("winning_call").url - audio_base64 = encode_base64_content_from_url(audio_url) - - chat_completion_from_base64 = client.chat.completions.create( - messages=[{ - "role": "user", - "content": [ - { - "type": "text", - "text": "What's in this audio?" - }, - { - "type": "input_audio", - "input_audio": { - "data": audio_base64, - "format": "wav" - }, - }, - ], - }], - model=model, - max_completion_tokens=64, - ) - - result = chat_completion_from_base64.choices[0].message.content - print("Chat completion output from input audio:", result) - -Alternatively, you can pass :code:`audio_url`, which is the audio counterpart of :code:`image_url` for image input: - -.. code-block:: python - - chat_completion_from_url = client.chat.completions.create( - messages=[{ - "role": "user", - "content": [ - { - "type": "text", - "text": "What's in this audio?" - }, - { - "type": "audio_url", - "audio_url": { - "url": audio_url - }, - }, - ], - }], - model=model, - max_completion_tokens=64, - ) - - result = chat_completion_from_url.choices[0].message.content - print("Chat completion output from audio url:", result) - -A full code example can be found in `examples/openai_chat_completion_client_for_multimodal.py `_. - -.. note:: - - By default, the timeout for fetching audios through HTTP URL is ``10`` seconds. 
- You can override this by setting the environment variable: - - .. code-block:: console - - $ export VLLM_AUDIO_FETCH_TIMEOUT= - -Embedding -^^^^^^^^^ - -vLLM's Embeddings API is a superset of OpenAI's `Embeddings API `_, -where a list of chat ``messages`` can be passed instead of batched ``inputs``. This enables multi-modal inputs to be passed to embedding models. - -.. tip:: - The schema of ``messages`` is exactly the same as in Chat Completions API. - You can refer to the above tutorials for more details on how to pass each type of multi-modal data. - -Usually, embedding models do not expect chat-based input, so we need to use a custom chat template to format the text and images. -Refer to the examples below for illustration. - -Here is an end-to-end example using VLM2Vec. To serve the model: - -.. code-block:: bash - - vllm serve TIGER-Lab/VLM2Vec-Full --task embed \ - --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja - -.. important:: - - Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass ``--task embed`` - to run this model in embedding mode instead of text generation mode. - - The custom chat template is completely different from the original one for this model, - and can be found `here `__. - -Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library: - -.. 
code-block:: python - - import requests - - image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - - response = requests.post( - "http://localhost:8000/v1/embeddings", - json={ - "model": "TIGER-Lab/VLM2Vec-Full", - "messages": [{ - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": image_url}}, - {"type": "text", "text": "Represent the given image."}, - ], - }], - "encoding_format": "float", - }, - ) - response.raise_for_status() - response_json = response.json() - print("Embedding output:", response_json["data"][0]["embedding"]) - -Below is another example, this time using the ``MrLight/dse-qwen2-2b-mrl-v1`` model. - -.. code-block:: bash - - vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embed \ - --trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja - -.. important:: - - Like with VLM2Vec, we have to explicitly pass ``--task embed``. - - Additionally, ``MrLight/dse-qwen2-2b-mrl-v1`` requires an EOS token for embeddings, which is handled - by `this custom chat template `__. - -.. important:: - - Also important, ``MrLight/dse-qwen2-2b-mrl-v1`` requires a placeholder image of the minimum image size for text query embeddings. See the full code - example below for details. - -A full code example can be found in `examples/openai_chat_embedding_client_for_multimodal.py `_. diff --git a/docs/source/usage/performance.rst b/docs/source/usage/performance.md similarity index 54% rename from docs/source/usage/performance.rst rename to docs/source/usage/performance.md index 23b5ab79a7378..f028e28627a9f 100644 --- a/docs/source/usage/performance.rst +++ b/docs/source/usage/performance.md @@ -1,16 +1,15 @@ -.. 
_performance: +(performance)= -Performance and Tuning -====================== +# Performance and Tuning + +## Preemption -Preemption ----------- Due to the auto-regressive nature of transformer architecture, there are times when KV cache space is insufficient to handle all batched requests. The vLLM can preempt requests to free up KV cache space for other requests. Preempted requests are recomputed when sufficient KV cache space becomes available again. When this occurs, the following warning is printed: ``` -WARNING 05-09 00:49:33 scheduler.py:1057] Sequence group 0 is preempted by PreemptionMode.SWAP mode because there is not enough KV cache space. This can affect the end-to-end performance. Increase gpu_memory_utilization or tensor_parallel_size to provide more KV cache memory. total_cumulative_preemption_cnt=1 +WARNING 05-09 00:49:33 scheduler.py:1057] Sequence group 0 is preempted by PreemptionMode.SWAP mode because there is not enough KV cache space. This can affect the end-to-end performance. Increase gpu_memory_utilization or tensor_parallel_size to provide more KV cache memory. total_cumulative_preemption_cnt=1 ``` While this mechanism ensures system robustness, preemption and recomputation can adversely affect end-to-end latency. @@ -22,44 +21,44 @@ If you frequently encounter preemptions from the vLLM engine, consider the follo You can also monitor the number of preemption requests through Prometheus metrics exposed by the vLLM. Additionally, you can log the cumulative number of preemption requests by setting disable_log_stats=False. -.. _chunked-prefill: +(chunked-prefill)= -Chunked Prefill ---------------- -vLLM supports an experimental feature chunked prefill. Chunked prefill allows to chunk large prefills into smaller chunks and batch them together with decode requests. +## Chunked Prefill -You can enable the feature by specifying ``--enable-chunked-prefill`` in the command line or setting ``enable_chunked_prefill=True`` in the LLM constructor. 
+vLLM supports an experimental feature, chunked prefill. Chunked prefill allows chunking large prefills into smaller chunks and batching them together with decode requests. -.. code-block:: python +You can enable the feature by specifying `--enable-chunked-prefill` in the command line or setting `enable_chunked_prefill=True` in the LLM constructor. - llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True) - # Set max_num_batched_tokens to tune performance. - # NOTE: 512 is the default max_num_batched_tokens for chunked prefill. - # llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True, max_num_batched_tokens=512) +```python +llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True) +# Set max_num_batched_tokens to tune performance. +# NOTE: 512 is the default max_num_batched_tokens for chunked prefill. +# llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True, max_num_batched_tokens=512) +``` By default, vLLM scheduler prioritizes prefills and doesn't batch prefill and decode to the same batch. This policy optimizes the TTFT (time to the first token), but incurs slower ITL (inter token latency) and inefficient GPU utilization. Once chunked prefill is enabled, the policy is changed to prioritize decode requests. It batches all pending decode requests to the batch before scheduling any prefill. -When there are available token_budget (``max_num_batched_tokens``), it schedules pending prefills. -If a last pending prefill request cannot fit into ``max_num_batched_tokens``, it chunks it. +When there are available token_budget (`max_num_batched_tokens`), it schedules pending prefills. +If a last pending prefill request cannot fit into `max_num_batched_tokens`, it chunks it. This policy has two benefits: - It improves ITL and generation decode because decode requests are prioritized. 
- It helps achieve better GPU utilization by locating compute-bound (prefill) and memory-bound (decode) requests to the same batch. -You can tune the performance by changing ``max_num_batched_tokens``. +You can tune the performance by changing `max_num_batched_tokens`. By default, it is set to 512, which has the best ITL on A100 in the initial benchmark (llama 70B and mixtral 8x22B). -Smaller ``max_num_batched_tokens`` achieves better ITL because there are fewer prefills interrupting decodes. -Higher ``max_num_batched_tokens`` achieves better TTFT as you can put more prefill to the batch. +Smaller `max_num_batched_tokens` achieves better ITL because there are fewer prefills interrupting decodes. +Higher `max_num_batched_tokens` achieves better TTFT as you can put more prefill to the batch. -- If ``max_num_batched_tokens`` is the same as ``max_model_len``, that's almost the equivalent to the default scheduling policy (except that it still prioritizes decodes). -- Note that the default value (512) of ``max_num_batched_tokens`` is optimized for ITL, and it may have lower throughput than the default scheduler. +- If `max_num_batched_tokens` is the same as `max_model_len`, that's almost the equivalent to the default scheduling policy (except that it still prioritizes decodes). +- Note that the default value (512) of `max_num_batched_tokens` is optimized for ITL, and it may have lower throughput than the default scheduler. -We recommend you set ``max_num_batched_tokens > 2048`` for throughput. +We recommend you set `max_num_batched_tokens > 2048` for throughput. -See related papers for more details (https://arxiv.org/pdf/2401.08671 or https://arxiv.org/pdf/2308.16369). +See related papers for more details (<https://arxiv.org/pdf/2401.08671> or <https://arxiv.org/pdf/2308.16369>). -Please try out this feature and let us know your feedback via GitHub issues! \ No newline at end of file +Please try out this feature and let us know your feedback via GitHub issues! 
diff --git a/docs/source/usage/spec_decode.md b/docs/source/usage/spec_decode.md new file mode 100644 index 0000000000000..77e35c437de30 --- /dev/null +++ b/docs/source/usage/spec_decode.md @@ -0,0 +1,205 @@ +(spec-decode)= + +# Speculative decoding + +```{warning} +Please note that speculative decoding in vLLM is not yet optimized and does +not usually yield inter-token latency reductions for all prompt datasets or sampling parameters. The work +to optimize it is ongoing and can be followed in [this issue.](https://github.com/vllm-project/vllm/issues/4630) +``` + +```{warning} +Currently, speculative decoding in vLLM is not compatible with pipeline parallelism. +``` + +This document shows how to use [Speculative Decoding](https://x.com/karpathy/status/1697318534555336961) with vLLM. +Speculative decoding is a technique which improves inter-token latency in memory-bound LLM inference. + +## Speculating with a draft model + +The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time. 
+ +```python +from vllm import LLM, SamplingParams + +prompts = [ + "The future of AI is", +] +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +llm = LLM( + model="facebook/opt-6.7b", + tensor_parallel_size=1, + speculative_model="facebook/opt-125m", + num_speculative_tokens=5, +) +outputs = llm.generate(prompts, sampling_params) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +To perform the same with an online mode launch the server: + +```bash +python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model facebook/opt-6.7b \ + --seed 42 -tp 1 --speculative_model facebook/opt-125m --use-v2-block-manager \ + --num_speculative_tokens 5 --gpu_memory_utilization 0.8 +``` + +Then use a client: + +```python +from openai import OpenAI + +# Modify OpenAI's API key and API base to use vLLM's API server. +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, +) + +models = client.models.list() +model = models.data[0].id + +# Completion API +stream = False +completion = client.completions.create( + model=model, + prompt="The future of AI is", + echo=False, + n=1, + stream=stream, +) + +print("Completion results:") +if stream: + for c in completion: + print(c) +else: + print(completion) +``` + +## Speculating by matching n-grams in the prompt + +The following code configures vLLM to use speculative decoding where proposals are generated by +matching n-grams in the prompt. 
For more information read [this thread.](https://x.com/joao_gante/status/1747322413006643259) + +```python +from vllm import LLM, SamplingParams + +prompts = [ + "The future of AI is", +] +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +llm = LLM( + model="facebook/opt-6.7b", + tensor_parallel_size=1, + speculative_model="[ngram]", + num_speculative_tokens=5, + ngram_prompt_lookup_max=4, +) +outputs = llm.generate(prompts, sampling_params) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +## Speculating using MLP speculators + +The following code configures vLLM to use speculative decoding where proposals are generated by +draft models that condition draft predictions on both context vectors and sampled tokens. +For more information see [this blog](https://pytorch.org/blog/hitchhikers-guide-speculative-decoding/) or +[this technical report](https://arxiv.org/abs/2404.19124). + +```python +from vllm import LLM, SamplingParams + +prompts = [ + "The future of AI is", +] +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +llm = LLM( + model="meta-llama/Meta-Llama-3.1-70B-Instruct", + tensor_parallel_size=4, + speculative_model="ibm-fms/llama3-70b-accelerator", + speculative_draft_tensor_parallel_size=1, +) +outputs = llm.generate(prompts, sampling_params) + +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +Note that these speculative models currently need to be run without tensor parallelism, although +it is possible to run the main model using tensor parallelism (see example above). Since the +speculative models are relatively small, we still see significant speedups. However, this +limitation will be fixed in a future release. 
+ +A variety of speculative models of this type are available on HF hub: + +- [llama-13b-accelerator](https://huggingface.co/ibm-fms/llama-13b-accelerator) +- [llama3-8b-accelerator](https://huggingface.co/ibm-fms/llama3-8b-accelerator) +- [codellama-34b-accelerator](https://huggingface.co/ibm-fms/codellama-34b-accelerator) +- [llama2-70b-accelerator](https://huggingface.co/ibm-fms/llama2-70b-accelerator) +- [llama3-70b-accelerator](https://huggingface.co/ibm-fms/llama3-70b-accelerator) +- [granite-3b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-3b-code-instruct-accelerator) +- [granite-8b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-8b-code-instruct-accelerator) +- [granite-7b-instruct-accelerator](https://huggingface.co/ibm-granite/granite-7b-instruct-accelerator) +- [granite-20b-code-instruct-accelerator](https://huggingface.co/ibm-granite/granite-20b-code-instruct-accelerator) + +## Lossless guarantees of Speculative Decoding + +In vLLM, speculative decoding aims to enhance inference efficiency while maintaining accuracy. This section addresses the lossless guarantees of +speculative decoding, breaking down the guarantees into three key areas: + +1. **Theoretical Losslessness** + \- Speculative decoding sampling is theoretically lossless up to the precision limits of hardware numerics. Floating-point errors might + cause slight variations in output distributions, as discussed + in [Accelerating Large Language Model Decoding with Speculative Sampling](https://arxiv.org/pdf/2302.01318) + +2. **Algorithmic Losslessness** + \- vLLM’s implementation of speculative decoding is algorithmically validated to be lossless. Key validation tests include: + + > - **Rejection Sampler Convergence**: Ensures that samples from vLLM’s rejection sampler align with the target + > distribution. 
[View Test Code](https://github.com/vllm-project/vllm/blob/47b65a550866c7ffbd076ecb74106714838ce7da/tests/samplers/test_rejection_sampler.py#L252) + > - **Greedy Sampling Equality**: Confirms that greedy sampling with speculative decoding matches greedy sampling + > without it. This verifies that vLLM's speculative decoding framework, when integrated with the vLLM forward pass and the vLLM rejection sampler, + > provides a lossless guarantee. Almost all of the tests in [this directory](https://github.com/vllm-project/vllm/tree/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e) + > verify this property using [this assertion implementation](https://github.com/vllm-project/vllm/blob/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e/conftest.py#L291) + +3. **vLLM Logprob Stability** + \- vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the + same request across runs. For more details, see the FAQ section + titled *Can the output of a prompt vary across runs in vLLM?* in the {ref}`FAQs `. + +**Conclusion** + +While vLLM strives to ensure losslessness in speculative decoding, variations in generated outputs with and without speculative decoding +can occur due to the following factors: + +- **Floating-Point Precision**: Differences in hardware numerical precision may lead to slight discrepancies in the output distribution. +- **Batch Size and Numerical Stability**: Changes in batch size may cause variations in logprobs and output probabilities, potentially + due to non-deterministic behavior in batched operations or numerical instability. + +**Mitigation Strategies** + +For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the {ref}`FAQs `.
+ +## Resources for vLLM contributors + +- [A Hacker's Guide to Speculative Decoding in vLLM](https://www.youtube.com/watch?v=9wNAgpX6z_4) +- [What is Lookahead Scheduling in vLLM?](https://docs.google.com/document/d/1Z9TvqzzBPnh5WHcRwjvK2UEeFeq5zMZb5mFE8jR0HCs/edit#heading=h.1fjfb0donq5a) +- [Information on batch expansion](https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit#heading=h.kk7dq05lc6q8) +- [Dynamic speculative decoding](https://github.com/vllm-project/vllm/issues/4565) diff --git a/docs/source/usage/spec_decode.rst b/docs/source/usage/spec_decode.rst deleted file mode 100644 index f1f1917f974bb..0000000000000 --- a/docs/source/usage/spec_decode.rst +++ /dev/null @@ -1,210 +0,0 @@ -.. _spec_decode: - -Speculative decoding -==================== - -.. warning:: - Please note that speculative decoding in vLLM is not yet optimized and does - not usually yield inter-token latency reductions for all prompt datasets or sampling parameters. The work - to optimize it is ongoing and can be followed in `this issue. `_ - -.. warning:: - Currently, speculative decoding in vLLM is not compatible with pipeline parallelism. - -This document shows how to use `Speculative Decoding `_ with vLLM. -Speculative decoding is a technique which improves inter-token latency in memory-bound LLM inference. - -Speculating with a draft model ------------------------------- - -The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time. - -.. 
code-block:: python - - from vllm import LLM, SamplingParams - - prompts = [ - "The future of AI is", - ] - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - - llm = LLM( - model="facebook/opt-6.7b", - tensor_parallel_size=1, - speculative_model="facebook/opt-125m", - num_speculative_tokens=5, - ) - outputs = llm.generate(prompts, sampling_params) - - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -To perform the same with an online mode launch the server: - -.. code-block:: bash - - python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model facebook/opt-6.7b \ - --seed 42 -tp 1 --speculative_model facebook/opt-125m --use-v2-block-manager \ - --num_speculative_tokens 5 --gpu_memory_utilization 0.8 - -Then use a client: - -.. code-block:: python - - from openai import OpenAI - - # Modify OpenAI's API key and API base to use vLLM's API server. - openai_api_key = "EMPTY" - openai_api_base = "http://localhost:8000/v1" - - client = OpenAI( - # defaults to os.environ.get("OPENAI_API_KEY") - api_key=openai_api_key, - base_url=openai_api_base, - ) - - models = client.models.list() - model = models.data[0].id - - # Completion API - stream = False - completion = client.completions.create( - model=model, - prompt="The future of AI is", - echo=False, - n=1, - stream=stream, - ) - - print("Completion results:") - if stream: - for c in completion: - print(c) - else: - print(completion) - -Speculating by matching n-grams in the prompt ---------------------------------------------- - -The following code configures vLLM to use speculative decoding where proposals are generated by -matching n-grams in the prompt. For more information read `this thread. `_ - -.. 
code-block:: python - - from vllm import LLM, SamplingParams - - prompts = [ - "The future of AI is", - ] - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - - llm = LLM( - model="facebook/opt-6.7b", - tensor_parallel_size=1, - speculative_model="[ngram]", - num_speculative_tokens=5, - ngram_prompt_lookup_max=4, - ) - outputs = llm.generate(prompts, sampling_params) - - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -Speculating using MLP speculators ---------------------------------- - -The following code configures vLLM to use speculative decoding where proposals are generated by -draft models that conditioning draft predictions on both context vectors and sampled tokens. -For more information see `this blog `_ or -`this technical report `_. - -.. code-block:: python - - from vllm import LLM, SamplingParams - - prompts = [ - "The future of AI is", - ] - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - - llm = LLM( - model="meta-llama/Meta-Llama-3.1-70B-Instruct", - tensor_parallel_size=4, - speculative_model="ibm-fms/llama3-70b-accelerator", - speculative_draft_tensor_parallel_size=1, - ) - outputs = llm.generate(prompts, sampling_params) - - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -Note that these speculative models currently need to be run without tensor parallelism, although -it is possible to run the main model using tensor parallelism (see example above). Since the -speculative models are relatively small, we still see significant speedups. However, this -limitation will be fixed in a future release. 
- -A variety of speculative models of this type are available on HF hub: - -* `llama-13b-accelerator `_ -* `llama3-8b-accelerator `_ -* `codellama-34b-accelerator `_ -* `llama2-70b-accelerator `_ -* `llama3-70b-accelerator `_ -* `granite-3b-code-instruct-accelerator `_ -* `granite-8b-code-instruct-accelerator `_ -* `granite-7b-instruct-accelerator `_ -* `granite-20b-code-instruct-accelerator `_ - -Lossless guarantees of Speculative Decoding -------------------------------------------- -In vLLM, speculative decoding aims to enhance inference efficiency while maintaining accuracy. This section addresses the lossless guarantees of -speculative decoding, breaking down the guarantees into three key areas: - -1. **Theoretical Losslessness** - - Speculative decoding sampling is theoretically lossless up to the precision limits of hardware numerics. Floating-point errors might - cause slight variations in output distributions, as discussed - in `Accelerating Large Language Model Decoding with Speculative Sampling `_ - -2. **Algorithmic Losslessness** - - vLLM’s implementation of speculative decoding is algorithmically validated to be lossless. Key validation tests include: - - - **Rejection Sampler Convergence**: Ensures that samples from vLLM’s rejection sampler align with the target - distribution. `View Test Code `_ - - - **Greedy Sampling Equality**: Confirms that greedy sampling with speculative decoding matches greedy sampling - without it. This verifies that vLLM's speculative decoding framework, when integrated with the vLLM forward pass and the vLLM rejection sampler, - provides a lossless guarantee. Almost all of the tests in `this directory `_ - verify this property using `this assertion implementation `_ - -3. **vLLM Logprob Stability** - - vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the - same request across runs. 
For more details, see the FAQ section - titled *Can the output of a prompt vary across runs in vLLM?* in the :ref:`FAQs `. - - -**Conclusion** - -While vLLM strives to ensure losslessness in speculative decoding, variations in generated outputs with and without speculative decoding -can occur due to following factors: - -- **Floating-Point Precision**: Differences in hardware numerical precision may lead to slight discrepancies in the output distribution. - -- **Batch Size and Numerical Stability**: Changes in batch size may cause variations in logprobs and output probabilities, potentially - due to non-deterministic behavior in batched operations or numerical instability. - -**Mitigation Strategies** - -For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the :ref:`FAQs `. - -Resources for vLLM contributors -------------------------------- -* `A Hacker's Guide to Speculative Decoding in vLLM `_ -* `What is Lookahead Scheduling in vLLM? `_ -* `Information on batch expansion `_ -* `Dynamic speculative decoding `_ diff --git a/docs/source/usage/structured_outputs.md b/docs/source/usage/structured_outputs.md new file mode 100644 index 0000000000000..14dd387743aac --- /dev/null +++ b/docs/source/usage/structured_outputs.md @@ -0,0 +1,260 @@ +(structured-outputs)= + +# Structured Outputs + +vLLM supports the generation of structured outputs using [outlines](https://github.com/dottxt-ai/outlines) or [lm-format-enforcer](https://github.com/noamgat/lm-format-enforcer) as backends for the guided decoding. +This document shows you some examples of the different options that are available to generate structured outputs. + +## Online Inference (OpenAI API) + +You can generate structured outputs using the OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API. 
+ +The following parameters are supported, which must be added as extra parameters: + +- `guided_choice`: the output will be exactly one of the choices. +- `guided_regex`: the output will follow the regex pattern. +- `guided_json`: the output will follow the JSON schema. +- `guided_grammar`: the output will follow the context-free grammar. +- `guided_whitespace_pattern`: used to override the default whitespace pattern for guided json decoding. +- `guided_decoding_backend`: used to select the guided decoding backend to use. + +You can see the complete list of supported parameters on the [OpenAI Compatible Server](../serving/openai_compatible_server.md) page. + +Now let's see an example for each of the cases, starting with the `guided_choice`, as it's the easiest one: + +```python +from openai import OpenAI +client = OpenAI( + base_url="http://localhost:8000/v1", + api_key="-", +) + +completion = client.chat.completions.create( + model="Qwen/Qwen2.5-3B-Instruct", + messages=[ + {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} + ], + extra_body={"guided_choice": ["positive", "negative"]}, +) +print(completion.choices[0].message.content) +``` + +The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template: + +```python +completion = client.chat.completions.create( + model="Qwen/Qwen2.5-3B-Instruct", + messages=[ + { + "role": "user", + "content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n", + } + ], + extra_body={"guided_regex": "\w+@\w+\.com\n", "stop": ["\n"]}, +) +print(completion.choices[0].message.content) +``` + +One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats.
+For this we can use the `guided_json` parameter in two different ways: + +- Using a [JSON Schema](https://json-schema.org/) directly +- Defining a [Pydantic model](https://docs.pydantic.dev/latest/) and then extracting the JSON Schema from it (which is normally an easier option). + +The next example shows how to use the `guided_json` parameter with a Pydantic model: + +```python +from pydantic import BaseModel +from enum import Enum + +class CarType(str, Enum): + sedan = "sedan" + suv = "SUV" + truck = "Truck" + coupe = "Coupe" + + +class CarDescription(BaseModel): + brand: str + model: str + car_type: CarType + + +json_schema = CarDescription.model_json_schema() + +completion = client.chat.completions.create( + model="Qwen/Qwen2.5-3B-Instruct", + messages=[ + { + "role": "user", + "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's", + } + ], + extra_body={"guided_json": json_schema}, +) +print(completion.choices[0].message.content) +``` + +```{tip} +While not strictly necessary, normally it's better to indicate in the prompt that a JSON needs to be generated, which fields it should contain, and how the LLM should fill them. +This can improve the results notably in most cases. +``` + +Finally we have the `guided_grammar`, which probably is the most difficult one to use but it's really powerful, as it allows us to define complete languages like SQL queries.
+It works by using a context free EBNF grammar, which for example we can use to define a specific format of simplified SQL queries, like in the example below: + +```python +simplified_sql_grammar = """ + ?start: select_statement + + ?select_statement: "SELECT " column_list " FROM " table_name + + ?column_list: column_name ("," column_name)* + + ?table_name: identifier + + ?column_name: identifier + + ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ +""" + +completion = client.chat.completions.create( + model="Qwen/Qwen2.5-3B-Instruct", + messages=[ + { + "role": "user", + "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.", + } + ], + extra_body={"guided_grammar": simplified_sql_grammar}, +) +print(completion.choices[0].message.content) +``` + +The complete code of the examples can be found on [examples/openai_chat_completion_structured_outputs.py](https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_structured_outputs.py). + +## Experimental Automatic Parsing (OpenAI API) + +This section covers the OpenAI beta wrapper over the `client.chat.completions.create()` method that provides richer integrations with Python specific types. + +At the time of writing (`openai==1.54.4`), this is a "beta" feature in the OpenAI client library. Code reference can be found [here](https://github.com/openai/openai-python/blob/52357cff50bee57ef442e94d78a0de38b4173fc2/src/openai/resources/beta/chat/completions.py#L100-L104). 
+ +For the following examples, vLLM was set up using `vllm serve meta-llama/Llama-3.1-8B-Instruct` + +Here is a simple example demonstrating how to get structured output using Pydantic models: + +```python +from pydantic import BaseModel +from openai import OpenAI + + +class Info(BaseModel): + name: str + age: int + + +client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy") +completion = client.beta.chat.completions.parse( + model="meta-llama/Llama-3.1-8B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "My name is Cameron, I'm 28. What's my name and age?"}, + ], + response_format=Info, + extra_body=dict(guided_decoding_backend="outlines"), +) + +message = completion.choices[0].message +print(message) +assert message.parsed +print("Name:", message.parsed.name) +print("Age:", message.parsed.age) +``` + +Output: + +```console +ParsedChatCompletionMessage[Info](content='{"name": "Cameron", "age": 28}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=Info(name='Cameron', age=28)) +Name: Cameron +Age: 28 +``` + +Here is a more complex example using nested Pydantic models to handle a step-by-step math solution: + +```python +from typing import List +from pydantic import BaseModel +from openai import OpenAI + + +class Step(BaseModel): + explanation: str + output: str + + +class MathResponse(BaseModel): + steps: List[Step] + final_answer: str + + +client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy") +completion = client.beta.chat.completions.parse( + model="meta-llama/Llama-3.1-8B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful expert math tutor."}, + {"role": "user", "content": "Solve 8x + 31 = 2."}, + ], + response_format=MathResponse, + extra_body=dict(guided_decoding_backend="outlines"), +) + +message = completion.choices[0].message +print(message) +assert message.parsed +for i, step in
enumerate(message.parsed.steps): + print(f"Step #{i}:", step) +print("Answer:", message.parsed.final_answer) +``` + +Output: + +```console +ParsedChatCompletionMessage[MathResponse](content='{ "steps": [{ "explanation": "First, let\'s isolate the term with the variable \'x\'. To do this, we\'ll subtract 31 from both sides of the equation.", "output": "8x + 31 - 31 = 2 - 31"}, { "explanation": "By subtracting 31 from both sides, we simplify the equation to 8x = -29.", "output": "8x = -29"}, { "explanation": "Next, let\'s isolate \'x\' by dividing both sides of the equation by 8.", "output": "8x / 8 = -29 / 8"}], "final_answer": "x = -29/8" }', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=MathResponse(steps=[Step(explanation="First, let's isolate the term with the variable 'x'. To do this, we'll subtract 31 from both sides of the equation.", output='8x + 31 - 31 = 2 - 31'), Step(explanation='By subtracting 31 from both sides, we simplify the equation to 8x = -29.', output='8x = -29'), Step(explanation="Next, let's isolate 'x' by dividing both sides of the equation by 8.", output='8x / 8 = -29 / 8')], final_answer='x = -29/8')) +Step #0: explanation="First, let's isolate the term with the variable 'x'. To do this, we'll subtract 31 from both sides of the equation." output='8x + 31 - 31 = 2 - 31' +Step #1: explanation='By subtracting 31 from both sides, we simplify the equation to 8x = -29.' output='8x = -29' +Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equation by 8." output='8x / 8 = -29 / 8' +Answer: x = -29/8 +``` + +## Offline Inference + +Offline inference allows for the same types of guided decoding. +To use it, we´ll need to configure the guided decoding using the class `GuidedDecodingParams` inside `SamplingParams`. 
+The main available options inside `GuidedDecodingParams` are: + +- `json` +- `regex` +- `choice` +- `grammar` +- `backend` +- `whitespace_pattern` + +These parameters can be used in the same way as the parameters from the Online Inference examples above. +One example for the usage of the `choice` parameter is shown below: + +```python +from vllm import LLM, SamplingParams +from vllm.sampling_params import GuidedDecodingParams + +llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct") + +guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"]) +sampling_params = SamplingParams(guided_decoding=guided_decoding_params) +outputs = llm.generate( + prompts="Classify this sentiment: vLLM is wonderful!", + sampling_params=sampling_params, +) +print(outputs[0].outputs[0].text) +``` + +A complete example with all options can be found in [examples/offline_inference_structured_outputs.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_structured_outputs.py). diff --git a/docs/source/usage/structured_outputs.rst b/docs/source/usage/structured_outputs.rst deleted file mode 100644 index 484e1f17d191e..0000000000000 --- a/docs/source/usage/structured_outputs.rst +++ /dev/null @@ -1,267 +0,0 @@ -.. _structured_outputs: - -Structured Outputs -================== - -vLLM supports the generation of structured outputs using `outlines `_ or `lm-format-enforcer `_ as backends for the guided decoding. -This document shows you some examples of the different options that are available to generate structured outputs. - - -Online Inference (OpenAI API) ------------------------------ - -You can generate structured outputs using the OpenAI's `Completions `_ and `Chat `_ API. - -The following parameters are supported, which must be added as extra parameters: - -- ``guided_choice``: the output will be exactly one of the choices. -- ``guided_regex``: the output will follow the regex pattern. -- ``guided_json``: the output will follow the JSON schema.
-- ``guided_grammar``: the output will follow the context free grammar. -- ``guided_whitespace_pattern``: used to override the default whitespace pattern for guided json decoding. -- ``guided_decoding_backend``: used to select the guided decoding backend to use. - -You can see the complete list of supported parameters on the `OpenAI Compatible Server `_ page. - -Now let´s see an example for each of the cases, starting with the ``guided_choice``, as it´s the easiest one: - -.. code-block:: python - - from openai import OpenAI - client = OpenAI( - base_url="http://localhost:8000/v1", - api_key="-", - ) - - completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", - messages=[ - {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} - ], - extra_body={"guided_choice": ["positive", "negative"]}, - ) - print(completion.choices[0].message.content) - - -The next example shows how to use the ``guided_regex``. The idea is to generate an email address, given a simple regex template: - -.. code-block:: python - - completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", - messages=[ - { - "role": "user", - "content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n", - } - ], - extra_body={"guided_regex": "\w+@\w+\.com\n", "stop": ["\n"]}, - ) - print(completion.choices[0].message.content) - -One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats. -For this we can use the ``guided_json`` parameter in two different ways: - -- Using directly a `JSON Schema `_ -- Defining a `Pydantic model `_ and then extracting the JSON Schema from it (which is normally an easier option). - -The next example shows how to use the ``guided_json`` parameter with a Pydantic model: - -.. 
code-block:: python - - from pydantic import BaseModel - from enum import Enum - - class CarType(str, Enum): - sedan = "sedan" - suv = "SUV" - truck = "Truck" - coupe = "Coupe" - - - class CarDescription(BaseModel): - brand: str - model: str - car_type: CarType - - - json_schema = CarDescription.model_json_schema() - - completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", - messages=[ - { - "role": "user", - "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's", - } - ], - extra_body={"guided_json": json_schema}, - ) - print(completion.choices[0].message.content) - -.. tip:: - While not strictly necessary, normally it´s better to indicate in the prompt that a JSON needs to be generated and which fields and how should the LLM fill them. - This can improve the results notably in most cases. - - -Finally we have the ``guided_grammar``, which probably is the most difficult one to use but it´s really powerful, as it allows us to define complete languages like SQL queries. -It works by using a context free EBNF grammar, which for example we can use to define a specific format of simplified SQL queries, like in the example below: - -.. code-block:: python - - simplified_sql_grammar = """ - ?start: select_statement - - ?select_statement: "SELECT " column_list " FROM " table_name - - ?column_list: column_name ("," column_name)* - - ?table_name: identifier - - ?column_name: identifier - - ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ - """ - - completion = client.chat.completions.create( - model="Qwen/Qwen2.5-3B-Instruct", - messages=[ - { - "role": "user", - "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.", - } - ], - extra_body={"guided_grammar": simplified_sql_grammar}, - ) - print(completion.choices[0].message.content) - -The complete code of the examples can be found on `examples/openai_chat_completion_structured_outputs.py `_. 
- -Experimental Automatic Parsing (OpenAI API) --------------------------------------------- - -This section covers the OpenAI beta wrapper over the ``client.chat.completions.create()`` method that provides richer integrations with Python specific types. - -At the time of writing (``openai==1.54.4``), this is a "beta" feature in the OpenAI client library. Code reference can be found `here `_. - -For the following examples, vLLM was setup using ``vllm serve meta-llama/Llama-3.1-8B-Instruct`` - -Here is a simple example demonstrating how to get structured output using Pydantic models: - -.. code-block:: python - - from pydantic import BaseModel - from openai import OpenAI - - - class Info(BaseModel): - name: str - age: int - - - client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy") - completion = client.beta.chat.completions.parse( - model="meta-llama/Llama-3.1-8B-Instruct", - messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "My name is Cameron, I'm 28. What's my name and age?"}, - ], - response_format=Info, - extra_body=dict(guided_decoding_backend="outlines"), - ) - - message = completion.choices[0].message - print(message) - assert message.parsed - print("Name:", message.parsed.name) - print("Age:", message.parsed.age) - -Output: - -.. code-block:: console - - ParsedChatCompletionMessage[Testing](content='{"name": "Cameron", "age": 28}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=Testing(name='Cameron', age=28)) - Name: Cameron - Age: 28 - - -Here is a more complex example using nested Pydantic models to handle a step-by-step math solution: - -.. 
code-block:: python - - from typing import List - from pydantic import BaseModel - from openai import OpenAI - - - class Step(BaseModel): - explanation: str - output: str - - - class MathResponse(BaseModel): - steps: List[Step] - final_answer: str - - - client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy") - completion = client.beta.chat.completions.parse( - model="meta-llama/Llama-3.1-8B-Instruct", - messages=[ - {"role": "system", "content": "You are a helpful expert math tutor."}, - {"role": "user", "content": "Solve 8x + 31 = 2."}, - ], - response_format=MathResponse, - extra_body=dict(guided_decoding_backend="outlines"), - ) - - message = completion.choices[0].message - print(message) - assert message.parsed - for i, step in enumerate(message.parsed.steps): - print(f"Step #{i}:", step) - print("Answer:", message.parsed.final_answer) - -Output: - -.. code-block:: console - - ParsedChatCompletionMessage[MathResponse](content='{ "steps": [{ "explanation": "First, let\'s isolate the term with the variable \'x\'. To do this, we\'ll subtract 31 from both sides of the equation.", "output": "8x + 31 - 31 = 2 - 31"}, { "explanation": "By subtracting 31 from both sides, we simplify the equation to 8x = -29.", "output": "8x = -29"}, { "explanation": "Next, let\'s isolate \'x\' by dividing both sides of the equation by 8.", "output": "8x / 8 = -29 / 8"}], "final_answer": "x = -29/8" }', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=MathResponse(steps=[Step(explanation="First, let's isolate the term with the variable 'x'. 
To do this, we'll subtract 31 from both sides of the equation.", output='8x + 31 - 31 = 2 - 31'), Step(explanation='By subtracting 31 from both sides, we simplify the equation to 8x = -29.', output='8x = -29'), Step(explanation="Next, let's isolate 'x' by dividing both sides of the equation by 8.", output='8x / 8 = -29 / 8')], final_answer='x = -29/8')) - Step #0: explanation="First, let's isolate the term with the variable 'x'. To do this, we'll subtract 31 from both sides of the equation." output='8x + 31 - 31 = 2 - 31' - Step #1: explanation='By subtracting 31 from both sides, we simplify the equation to 8x = -29.' output='8x = -29' - Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equation by 8." output='8x / 8 = -29 / 8' - Answer: x = -29/8 - -Offline Inference ------------------ - -Offline inference allows for the same types of guided decoding. -To use it, we´ll need to configure the guided decoding using the class ``GuidedDecodingParams`` inside ``SamplingParams``. -The main available options inside ``GuidedDecodingParams`` are: - -- ``json`` -- ``regex`` -- ``choice`` -- ``grammar`` -- ``backend`` -- ``whitespace_pattern`` - -These parameters can be used in the same way as the parameters from the Online Inference examples above. -One example for the usage of the ``choices`` parameter is shown below: - -.. code-block:: python - - from vllm import LLM, SamplingParams - from vllm.sampling_params import GuidedDecodingParams - - llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct") - - guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"]) - sampling_params = SamplingParams(guided_decoding=guided_decoding_params) - outputs = llm.generate( - prompts="Classify this sentiment: vLLM is wonderful!", - sampling_params=sampling_params, - ) - print(outputs[0].outputs[0].text) - -A complete example with all options can be found in `examples/offline_inference_structured_outputs.py `_. 
diff --git a/docs/source/usage/usage_stats.md b/docs/source/usage/usage_stats.md index a1e4b1c38acae..a7eb6144571a4 100644 --- a/docs/source/usage/usage_stats.md +++ b/docs/source/usage/usage_stats.md @@ -47,7 +47,7 @@ tail ~/.config/vllm/usage_stats.json ## Opt-out of Usage Stats Collection -You can opt-out of usage stats collection by setting the VLLM_NO_USAGE_STATS or DO_NOT_TRACK environment variable, or by creating a ~/.config/vllm/do_not_track file: +You can opt out of usage stats collection by setting the `VLLM_NO_USAGE_STATS` or `DO_NOT_TRACK` environment variable, or by creating a `~/.config/vllm/do_not_track` file: ```bash # Any of the following methods can disable usage stats collection diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 19daeb729ee61..480901f71047f 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -430,7 +430,7 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " diff --git a/vllm/config.py b/vllm/config.py index 643698f8bbec3..17602bda15c69 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -638,7 +638,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config, self.use_async_output_proc = False return - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid if not current_platform.is_async_output_supported(self.enforce_eager): logger.warning( @@ -658,7 +658,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config, if self.runner_type == "pooling": self.use_async_output_proc = False - 
# Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid if speculative_config: logger.warning("Async output processing is not supported with" @@ -2058,7 +2058,7 @@ def verify_with_model_config(self, model_config: ModelConfig): model_config.quantization) def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid if scheduler_config.chunked_prefill_enabled: logger.warning("LoRA with chunked prefill is still experimental " diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 997a952240ecb..21966d003c7ef 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1148,7 +1148,7 @@ def create_engine_config(self, disable_logprobs=self.disable_logprobs_during_spec_decoding, ) - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid if self.num_scheduler_steps > 1: if speculative_config is not None: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index e78b6f4d26758..39f59e55da1f7 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -133,7 +133,7 @@ class LLMEngine: and the :class:`AsyncLLMEngine` class wraps this class for online serving. The config arguments are derived from :class:`~vllm.EngineArgs`. (See - :ref:`engine_args`) + :ref:`engine-args`) Args: model_config: The configuration related to the LLM model. 
diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index a9b638ed02a1e..1c6f735f39e04 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -65,7 +65,7 @@ def process_prompt_logprob(self, seq_group: SequenceGroup, @staticmethod @functools.lru_cache def _log_prompt_logprob_unsupported_warning_once(): - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid logger.warning( "Prompt logprob is not supported by multi step workers. " diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 94d4a4d89adc9..830f54c6a8afa 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -115,7 +115,7 @@ class LLM: integer, it is used as the level of compilation optimization. If it is a dictionary, it can specify the full compilation configuration. **kwargs: Arguments for :class:`~vllm.EngineArgs`. (See - :ref:`engine_args`) + :ref:`engine-args`) Note: This class is intended to be used for offline inference. For online diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 2816b5c5c1f88..5495bc50ede83 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -22,7 +22,7 @@ class CPUExecutor(ExecutorBase): def _init_executor(self) -> None: assert self.device_config.device_type == "cpu" - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid assert self.lora_config is None, "cpu backend doesn't support LoRA" diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index d4402e77a3886..aaeecab7ffde1 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -13,7 +13,7 @@ to dispatch data processing according to the target model. 
See also: - :ref:`input_processing_pipeline` + :ref:`input-processing-pipeline` """ __all__ = [ diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index fb02627eb22bd..f3ec9d115c9ba 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -314,7 +314,7 @@ def dummy_data_for_profiling( The model is identified by ``model_config``. See also: - :ref:`enabling_multimodal_inputs` + :ref:`enabling-multimodal-inputs` Note: This should be called after @@ -391,7 +391,7 @@ def register_input_processor(self, processor: InputProcessor): happens before :meth:`~vllm.multimodal.MultiModalRegistry.map_input`. See also: - :ref:`input_processing_pipeline` + :ref:`input-processing-pipeline` """ def wrapper(model_cls: N) -> N: @@ -435,7 +435,7 @@ def process_input(self, model_config: "ModelConfig", The model is identified by ``model_config``. See also: - :ref:`input_processing_pipeline` + :ref:`input-processing-pipeline` """ # Avoid circular import from vllm.model_executor.model_loader import get_model_architecture diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 928c31a2f2843..9255e062e4870 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -11,7 +11,7 @@ dispatch data processing according to its modality and the target model. See also: - :ref:`input_processing_pipeline` + :ref:`input-processing-pipeline` """ __all__ = [ diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index fe77a4635f7d8..1e5a46946c6c0 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -50,7 +50,7 @@ class MultiModalPlugin(ABC): (i.e., the modality of the data). See also: - :ref:`adding_multimodal_plugin` + :ref:`adding-multimodal-plugin` """ def __init__(self) -> None: @@ -94,8 +94,8 @@ def register_input_mapper( If `None` is provided, then the default input mapper is used instead. 
See also: - - :ref:`input_processing_pipeline` - - :ref:`enabling_multimodal_inputs` + - :ref:`input-processing-pipeline` + - :ref:`enabling-multimodal-inputs` """ def wrapper(model_cls: N) -> N: @@ -130,8 +130,8 @@ def map_input( TypeError: If the data type is not supported. See also: - - :ref:`input_processing_pipeline` - - :ref:`enabling_multimodal_inputs` + - :ref:`input-processing-pipeline` + - :ref:`enabling-multimodal-inputs` """ # Avoid circular import @@ -190,7 +190,7 @@ def register_max_multimodal_tokens( If `None` is provided, then the default calculation is used instead. See also: - :ref:`enabling_multimodal_inputs` + :ref:`enabling-multimodal-inputs` """ def wrapper(model_cls: N) -> N: @@ -222,7 +222,7 @@ def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: The model is identified by ``model_config``. See also: - :ref:`enabling_multimodal_inputs` + :ref:`enabling-multimodal-inputs` """ # Avoid circular import from vllm.model_executor.model_loader import get_model_architecture diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 138cc6a44c11a..9ecae2c1ca2bf 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -75,7 +75,7 @@ class MultiModalDataBuiltins(TypedDict, total=False): This dictionary also accepts modality keys defined outside :class:`MultiModalDataBuiltins` as long as a customized plugin is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`. - Read more on that :ref:`here `. + Read more on that :ref:`here `. """ diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 6cd79d414c978..ded45a7184b5d 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -76,7 +76,7 @@ def register_plugin(self, plugin: MultiModalPlugin) -> None: Register a multi-modal plugin so it can be recognized by vLLM. 
See also: - :ref:`adding_multimodal_plugin` + :ref:`adding-multimodal-plugin` """ data_type_key = plugin.get_data_key() @@ -311,8 +311,8 @@ def register_processor( invoked to transform the data into a dictionary of model inputs. See also: - - :ref:`input_processing_pipeline` - - :ref:`enabling_multimodal_inputs` + - :ref:`input-processing-pipeline` + - :ref:`enabling-multimodal-inputs` """ def wrapper(model_cls: N) -> N: diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index d95a2b4cd5565..09bde9f065eaa 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -50,7 +50,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: import vllm.envs as envs from vllm.utils import GiB_bytes model_config = vllm_config.model_config - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid if not model_config.enforce_eager: logger.warning( diff --git a/vllm/scripts.py b/vllm/scripts.py index a51c21cfa29e7..42e1c639eda10 100644 --- a/vllm/scripts.py +++ b/vllm/scripts.py @@ -165,7 +165,7 @@ def main(): required=False, help="Read CLI options from a config file." 
"Must be a YAML with the following options:" - "https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#command-line-arguments-for-the-server" + "https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#cli-reference" ) serve_parser = make_arg_parser(serve_parser) serve_parser.set_defaults(dispatch_function=serve) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 2689802161987..de593113b938b 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -108,7 +108,7 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": return spec_decode_worker -# Reminder: Please update docs/source/usage/compatibility_matrix.rst +# Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid class SpecDecodeWorker(LoraNotSupportedWorkerBase): """Worker which implements speculative decoding. diff --git a/vllm/utils.py b/vllm/utils.py index 1b90eca1cd6cc..49e532540d7ee 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -51,7 +51,7 @@ # Exception strings for non-implemented encoder/decoder scenarios -# Reminder: Please update docs/source/usage/compatibility_matrix.rst +# Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid STR_NOT_IMPL_ENC_DEC_SWA = \ diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index f3d7c726a29f1..65d9bab0e2822 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -821,7 +821,7 @@ def _pythonize_sampler_output( for sgdx, (seq_group, sample_result) in enumerate(zip(seq_groups, samples_list)): - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid # (Check for Guided Decoding) if seq_group.sampling_params.logits_processors: diff --git 
a/vllm/worker/utils.py b/vllm/worker/utils.py index 5f71ec0c14df8..8f2d343440d3e 100644 --- a/vllm/worker/utils.py +++ b/vllm/worker/utils.py @@ -13,7 +13,7 @@ def assert_enc_dec_mr_supported_scenario( a supported scenario. ''' - # Reminder: Please update docs/source/usage/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid if enc_dec_mr.cache_config.enable_prefix_caching: