From 5733c5760e9bcf22cef04e5306abc26e7d8feee0 Mon Sep 17 00:00:00 2001 From: AllentDan Date: Fri, 30 Aug 2024 16:15:00 +0800 Subject: [PATCH 1/7] Add torchrun launching multiple api_server --- docs/en/llm/api_server.md | 27 +++++++++++++++++++++++++++ docs/zh_cn/llm/api_server.md | 26 ++++++++++++++++++++++++++ lmdeploy/serve/openai/api_server.py | 16 +++++++++++++++- 3 files changed, 68 insertions(+), 1 deletion(-) diff --git a/docs/en/llm/api_server.md b/docs/en/llm/api_server.md index 285b0e32ff..52a7007b9f 100644 --- a/docs/en/llm/api_server.md +++ b/docs/en/llm/api_server.md @@ -249,6 +249,33 @@ curl http://{server_ip}:{server_port}/v1/chat/interactive \ lmdeploy serve gradio api_server_url --server-name ${gradio_ui_ip} --server-port ${gradio_ui_port} ``` +## Launch multiple api servers + +Following is a possible way to launch multiple api servers through torchrun. Just create a python script with the following codes. +Launch the script through `torchrun --nproc_per_node 2 script.py InternLM/internlm2-chat-1_8b`. + +```python +from typing import List +import fire + +import os + +def main(model_path: str, + port: int = 23333): + local_rank = int(os.environ.get('LOCAL_RANK', -1)) + if isinstance(port, List): + assert len(port) == int(os.environ.get('WORLD_SIZE', -1)) + port = port[local_rank] + else: + port += local_rank*10 + command = f'CUDA_VISIBLE_DEVICES={local_rank} lmdeploy serve api_server {model_path} --server-port {port}' + os.system(command) + + +if __name__ == '__main__': + fire.Fire(main) +``` + ## FAQ 1. When user got `"finish_reason":"length"`, it means the session is too long to be continued. The session length can be diff --git a/docs/zh_cn/llm/api_server.md b/docs/zh_cn/llm/api_server.md index d6c0c42aef..ee4ceec5b8 100644 --- a/docs/zh_cn/llm/api_server.md +++ b/docs/zh_cn/llm/api_server.md @@ -258,6 +258,32 @@ curl http://{server_ip}:{server_port}/v1/chat/interactive \ }' ``` +## 同时启动多个 api_server + +下面是一个可以用 torchrun 启动的脚本。用下面的代码跑 torchrun: `torchrun --nproc_per_node 2 script.py InternLM/internlm2-chat-1_8b`. 
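For reference, with `--nproc_per_node 2` and the default port, each torchrun worker in the script below launches one `api_server` per GPU, and the `local_rank * 10` offset keeps the ports from colliding. A minimal sketch of the commands the two ranks end up executing (assuming two local GPUs and the default port 23333):

```shell
# rank 0
CUDA_VISIBLE_DEVICES=0 lmdeploy serve api_server InternLM/internlm2-chat-1_8b --server-port 23333
# rank 1 (23333 + 1 * 10)
CUDA_VISIBLE_DEVICES=1 lmdeploy serve api_server InternLM/internlm2-chat-1_8b --server-port 23343
```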
+ +```python +from typing import List +import fire + +import os + +def main(model_path: str, + port: int = 23333): + local_rank = int(os.environ.get('LOCAL_RANK', -1)) + if isinstance(port, List): + assert len(port) == int(os.environ.get('WORLD_SIZE', -1)) + port = port[local_rank] + else: + port += local_rank*10 + command = f'CUDA_VISIBLE_DEVICES={local_rank} lmdeploy serve api_server {model_path} --server-port {port}' + os.system(command) + + +if __name__ == '__main__': + fire.Fire(main) +``` + ## 接入 WebUI LMDeploy 提供 gradio 和 [OpenAOE](https://github.com/InternLM/OpenAOE) 两种方式,为 api_server 接入 WebUI。 diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index c434faf86f..c18eac6e2f 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -910,6 +910,20 @@ async def stream_results() -> AsyncGenerator[bytes, None]: return JSONResponse(ret) +def handle_torchrun(): + """To disable mmengine logging logic when using torchrun.""" + + def dummy_get_device_id(): + return 0 + + if int(os.environ.get('LOCAL_RANK', -1)) > 0: + from lmdeploy.vl.model.utils import _set_func + + # the replacement can't be recovered + _set_func('mmengine.logging.logger._get_device_id', + dummy_get_device_id) + + def serve(model_path: str, model_name: Optional[str] = None, backend: Literal['turbomind', 'pytorch'] = 'turbomind', @@ -986,8 +1000,8 @@ def serve(model_path: str, ssl_certfile = os.environ['SSL_CERTFILE'] http_or_https = 'https' + handle_torchrun() _, pipeline_class = get_task(model_path) - VariableInterface.async_engine = pipeline_class( model_path=model_path, model_name=model_name, From 08ae7fdd29c8f70e92c5ffe0b50ade3d5189e192 Mon Sep 17 00:00:00 2001 From: AllentDan Date: Wed, 11 Dec 2024 16:52:06 +0800 Subject: [PATCH 2/7] update with proxy --- docs/en/llm/api_server.md | 26 +++++++++++++++++++++----- docs/zh_cn/llm/api_server.md | 25 +++++++++++++++++++++---- 2 files changed, 42 insertions(+), 9 deletions(-) diff --git a/docs/en/llm/api_server.md b/docs/en/llm/api_server.md index 52a7007b9f..2c281da5ad 100644 --- a/docs/en/llm/api_server.md +++ b/docs/en/llm/api_server.md @@ -251,24 +251,40 @@ lmdeploy serve gradio api_server_url --server-name ${gradio_ui_ip} --server-port ## Launch multiple api servers -Following is a possible way to launch multiple api servers through torchrun. Just create a python script with the following codes. -Launch the script through `torchrun --nproc_per_node 2 script.py InternLM/internlm2-chat-1_8b`. +Following are two steps to launch multiple api servers through torchrun. Just create a python script with the following codes. + +1. Launch the proxy server through `lmdeploy serve proxy`. Get the correct proxy server url. +2. Launch the script through `torchrun --nproc_per_node 2 script.py InternLM/internlm2-chat-1_8b http://{proxy_node_name}:{proxy_node_port}`.**Note**: Please do not use `0.0.0.0:8000` here, instead, we input the real ip name, `11.25.34.55:8000` for example. 
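Concretely, the two steps might look like the following; this is a sketch that assumes the proxy node's reachable IP is `11.25.34.55`, the proxy listens on port `8000`, and the script below is saved as `script.py`:

```shell
# step 1: on the proxy node, bind the proxy to its real IP
lmdeploy serve proxy --server-name 11.25.34.55 --server-port 8000
# step 2: on the serving node, spawn one api_server per GPU and register them with the proxy
torchrun --nproc_per_node 2 script.py InternLM/internlm2-chat-1_8b http://11.25.34.55:8000
```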
```python +import os +import socket from typing import List + import fire -import os + +def get_host_ip(): + try: + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + s.connect(('8.8.8.8', 80)) + ip = s.getsockname()[0] + finally: + s.close() + return ip + def main(model_path: str, + proxy_url: str = 'http://0.0.0.0:8000', port: int = 23333): local_rank = int(os.environ.get('LOCAL_RANK', -1)) + local_ip = get_host_ip() if isinstance(port, List): assert len(port) == int(os.environ.get('WORLD_SIZE', -1)) port = port[local_rank] else: - port += local_rank*10 - command = f'CUDA_VISIBLE_DEVICES={local_rank} lmdeploy serve api_server {model_path} --server-port {port}' + port += local_rank * 10 + command = f'CUDA_VISIBLE_DEVICES={local_rank} lmdeploy serve api_server {model_path} --server-name {local_ip} --server-port {port} --proxy-url {proxy_url}' os.system(command) diff --git a/docs/zh_cn/llm/api_server.md b/docs/zh_cn/llm/api_server.md index ee4ceec5b8..2a415b5768 100644 --- a/docs/zh_cn/llm/api_server.md +++ b/docs/zh_cn/llm/api_server.md @@ -260,23 +260,40 @@ curl http://{server_ip}:{server_port}/v1/chat/interactive \ ## 同时启动多个 api_server -下面是一个可以用 torchrun 启动的脚本。用下面的代码跑 torchrun: `torchrun --nproc_per_node 2 script.py InternLM/internlm2-chat-1_8b`. +两步直接启动多机多卡服务。先用下面的代码创建一个启动脚本。然后: + +1. 启动代理服务 `lmdeploy serve proxy`。 +2. torchrun 启动脚本 `torchrun --nproc_per_node 2 script.py InternLM/internlm2-chat-1_8b http://{proxy_node_name}:{proxy_node_port}`. **注意**: 多级多卡不要用默认 url `0.0.0.0:8000`,我们需要输入真实ip对应的地址,如:`11.25.34.55:8000`。 ```python +import os +import socket from typing import List + import fire -import os + +def get_host_ip(): + try: + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + s.connect(('8.8.8.8', 80)) + ip = s.getsockname()[0] + finally: + s.close() + return ip + def main(model_path: str, + proxy_url: str = 'http://0.0.0.0:8000', port: int = 23333): local_rank = int(os.environ.get('LOCAL_RANK', -1)) + local_ip = get_host_ip() if isinstance(port, List): assert len(port) == int(os.environ.get('WORLD_SIZE', -1)) port = port[local_rank] else: - port += local_rank*10 - command = f'CUDA_VISIBLE_DEVICES={local_rank} lmdeploy serve api_server {model_path} --server-port {port}' + port += local_rank * 10 + command = f'CUDA_VISIBLE_DEVICES={local_rank} lmdeploy serve api_server {model_path} --server-name {local_ip} --server-port {port} --proxy-url {proxy_url}' os.system(command) From b1e30ef2c7cf004a2dd8078ec6f96948179f95f3 Mon Sep 17 00:00:00 2001 From: AllentDan Date: Wed, 18 Dec 2024 12:48:10 +0800 Subject: [PATCH 3/7] custom tp and backend --- docs/en/llm/api_server.md | 25 +++++++++++++------------ docs/zh_cn/llm/api_server.md | 25 +++++++++++++------------ 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/docs/en/llm/api_server.md b/docs/en/llm/api_server.md index 2c281da5ad..fa92d6bca6 100644 --- a/docs/en/llm/api_server.md +++ b/docs/en/llm/api_server.md @@ -259,11 +259,8 @@ Following are two steps to launch multiple api servers through torchrun. 
Just cr ```python import os import socket -from typing import List - +from typing import List, Literal import fire - - def get_host_ip(): try: s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) @@ -272,22 +269,26 @@ def get_host_ip(): finally: s.close() return ip - - def main(model_path: str, + tp: int=1, proxy_url: str = 'http://0.0.0.0:8000', - port: int = 23333): + port: int = 23333, + backend: Literal['turbomind', 'pytorch']='turbomind'): local_rank = int(os.environ.get('LOCAL_RANK', -1)) + world_size = int(os.environ.get('WORLD_SIZE', -1)) local_ip = get_host_ip() if isinstance(port, List): - assert len(port) == int(os.environ.get('WORLD_SIZE', -1)) + assert len(port) == world_size port = port[local_rank] else: port += local_rank * 10 - command = f'CUDA_VISIBLE_DEVICES={local_rank} lmdeploy serve api_server {model_path} --server-name {local_ip} --server-port {port} --proxy-url {proxy_url}' - os.system(command) - - + if (world_size-local_rank)%tp==0: + rank_list = ','.join([str(local_rank+i) for i in range(tp)]) + command = f'CUDA_VISIBLE_DEVICES={rank_list} lmdeploy serve api_server {model_path} '\ + f'--server-name {local_ip} --server-port {port} --tp {tp} '\ + f'--proxy-url {proxy_url} --backend {backend}' + print(f'running command: {command}') + os.system(command) if __name__ == '__main__': fire.Fire(main) ``` diff --git a/docs/zh_cn/llm/api_server.md b/docs/zh_cn/llm/api_server.md index 2a415b5768..40514691a6 100644 --- a/docs/zh_cn/llm/api_server.md +++ b/docs/zh_cn/llm/api_server.md @@ -268,11 +268,8 @@ curl http://{server_ip}:{server_port}/v1/chat/interactive \ ```python import os import socket -from typing import List - +from typing import List, Literal import fire - - def get_host_ip(): try: s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) @@ -281,22 +278,26 @@ def get_host_ip(): finally: s.close() return ip - - def main(model_path: str, + tp: int=1, proxy_url: str = 'http://0.0.0.0:8000', - port: int = 23333): + port: int = 23333, + backend: Literal['turbomind', 'pytorch']='turbomind'): local_rank = int(os.environ.get('LOCAL_RANK', -1)) + world_size = int(os.environ.get('WORLD_SIZE', -1)) local_ip = get_host_ip() if isinstance(port, List): - assert len(port) == int(os.environ.get('WORLD_SIZE', -1)) + assert len(port) == world_size port = port[local_rank] else: port += local_rank * 10 - command = f'CUDA_VISIBLE_DEVICES={local_rank} lmdeploy serve api_server {model_path} --server-name {local_ip} --server-port {port} --proxy-url {proxy_url}' - os.system(command) - - + if (world_size-local_rank)%tp==0: + rank_list = ','.join([str(local_rank+i) for i in range(tp)]) + command = f'CUDA_VISIBLE_DEVICES={rank_list} lmdeploy serve api_server {model_path} '\ + f'--server-name {local_ip} --server-port {port} --tp {tp} '\ + f'--proxy-url {proxy_url} --backend {backend}' + print(f'running command: {command}') + os.system(command) if __name__ == '__main__': fire.Fire(main) ``` From a40f43529c5d1eb12cc39fbcee52496d4e1200fa Mon Sep 17 00:00:00 2001 From: AllentDan Date: Wed, 18 Dec 2024 13:12:05 +0800 Subject: [PATCH 4/7] typo --- docs/zh_cn/llm/api_server.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/zh_cn/llm/api_server.md b/docs/zh_cn/llm/api_server.md index 40514691a6..e8371d18b4 100644 --- a/docs/zh_cn/llm/api_server.md +++ b/docs/zh_cn/llm/api_server.md @@ -263,7 +263,7 @@ curl http://{server_ip}:{server_port}/v1/chat/interactive \ 两步直接启动多机多卡服务。先用下面的代码创建一个启动脚本。然后: 1. 启动代理服务 `lmdeploy serve proxy`。 -2. 
torchrun 启动脚本 `torchrun --nproc_per_node 2 script.py InternLM/internlm2-chat-1_8b http://{proxy_node_name}:{proxy_node_port}`. **注意**: 多级多卡不要用默认 url `0.0.0.0:8000`,我们需要输入真实ip对应的地址,如:`11.25.34.55:8000`。 +2. torchrun 启动脚本 `torchrun --nproc_per_node 2 script.py InternLM/internlm2-chat-1_8b http://{proxy_node_name}:{proxy_node_port}`. **注意**: 多机多卡不要用默认 url `0.0.0.0:8000`,我们需要输入真实ip对应的地址,如:`11.25.34.55:8000`。 ```python import os From 48893af38e61b509bf8638ed1ed10818e4311dfa Mon Sep 17 00:00:00 2001 From: AllentDan Date: Mon, 23 Dec 2024 19:03:12 +0800 Subject: [PATCH 5/7] add an example --- docs/en/llm/api_server.md | 2 +- docs/zh_cn/llm/api_server.md | 33 ++++++++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/docs/en/llm/api_server.md b/docs/en/llm/api_server.md index fa92d6bca6..9828672d2f 100644 --- a/docs/en/llm/api_server.md +++ b/docs/en/llm/api_server.md @@ -254,7 +254,7 @@ lmdeploy serve gradio api_server_url --server-name ${gradio_ui_ip} --server-port Following are two steps to launch multiple api servers through torchrun. Just create a python script with the following codes. 1. Launch the proxy server through `lmdeploy serve proxy`. Get the correct proxy server url. -2. Launch the script through `torchrun --nproc_per_node 2 script.py InternLM/internlm2-chat-1_8b http://{proxy_node_name}:{proxy_node_port}`.**Note**: Please do not use `0.0.0.0:8000` here, instead, we input the real ip name, `11.25.34.55:8000` for example. +2. Launch the script through `torchrun --nproc_per_node 2 script.py InternLM/internlm2-chat-1_8b --proxy_url http://{proxy_node_name}:{proxy_node_port}`.**Note**: Please do not use `0.0.0.0:8000` here, instead, we input the real ip name, `11.25.34.55:8000` for example. ```python import os diff --git a/docs/zh_cn/llm/api_server.md b/docs/zh_cn/llm/api_server.md index e8371d18b4..48ba080c01 100644 --- a/docs/zh_cn/llm/api_server.md +++ b/docs/zh_cn/llm/api_server.md @@ -263,7 +263,7 @@ curl http://{server_ip}:{server_port}/v1/chat/interactive \ 两步直接启动多机多卡服务。先用下面的代码创建一个启动脚本。然后: 1. 启动代理服务 `lmdeploy serve proxy`。 -2. torchrun 启动脚本 `torchrun --nproc_per_node 2 script.py InternLM/internlm2-chat-1_8b http://{proxy_node_name}:{proxy_node_port}`. **注意**: 多机多卡不要用默认 url `0.0.0.0:8000`,我们需要输入真实ip对应的地址,如:`11.25.34.55:8000`。 +2. torchrun 启动脚本 `torchrun --nproc_per_node 2 script.py InternLM/internlm2-chat-1_8b --proxy_url http://{proxy_node_name}:{proxy_node_port}`. **注意**: 多机多卡不要用默认 url `0.0.0.0:8000`,我们需要输入真实ip对应的地址,如:`11.25.34.55:8000`。多机情况下,因为不需要子节点间的通信,所以并不需要用户指定 torchrun 的 `--nnodes` 等参数,只要能保证每个节点执行一次单节点的 torchrun 就行。 ```python import os @@ -302,6 +302,37 @@ if __name__ == '__main__': fire.Fire(main) ``` +### 示例 + +为了进一步展示如何在集群环境中使用多机多卡服务。下面提供一个在火山云的用例: + +```shell +#!/bin/bash +# 激活 conda 环境 +source /path/to/your/home/miniconda3/bin/activate /path/to/your/home/miniconda3/envs/your_env +export HOME=/path/to/your/home +# 获取主节点IP地址(假设 MLP_WORKER_0_HOST 是主节点的IP) +MASTER_IP=${MLP_WORKER_0_HOST} +# 检查是否为主节点 +if [ "${MLP_ROLE_INDEX}" -eq 0 ]; then + # 启动 lmdeploy serve proxy 并放入后台 + echo "Starting lmdeploy serve proxy on master node..." + PROXY_PORT=8000 + lmdeploy serve proxy --server-name ${MASTER_IP} --server-port ${PROXY_PORT} & +else + echo "Not starting lmdeploy serve proxy on worker node ${MLP_ROLE_INDEX}." 
+fi +# 启动 torchrun 并放入后台 +# 再次强调多机环境下并不需要传--nnodes 或者 --master-addr 等参数,相当于每个机器上执行一次单节点的 torchrun 即可。 +torchrun \ +--nproc_per_node=${MLP_WORKER_GPU} \ +/path/to/script.py \ +InternLM/internlm2-chat-1_8b 8 http://${MASTER_IP}:${PROXY_PORT} +# 打印主机的IP地址 +echo "Host IP addresses:" +hostname -I +``` + ## 接入 WebUI LMDeploy 提供 gradio 和 [OpenAOE](https://github.com/InternLM/OpenAOE) 两种方式,为 api_server 接入 WebUI。 From 7729484b743198fbc271bb64d2fbe20750b2f019 Mon Sep 17 00:00:00 2001 From: AllentDan Date: Tue, 24 Dec 2024 15:30:13 +0800 Subject: [PATCH 6/7] refine --- docs/zh_cn/llm/api_server.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/zh_cn/llm/api_server.md b/docs/zh_cn/llm/api_server.md index 48ba080c01..ee5e42e7ee 100644 --- a/docs/zh_cn/llm/api_server.md +++ b/docs/zh_cn/llm/api_server.md @@ -320,6 +320,7 @@ if [ "${MLP_ROLE_INDEX}" -eq 0 ]; then PROXY_PORT=8000 lmdeploy serve proxy --server-name ${MASTER_IP} --server-port ${PROXY_PORT} & else + # 这里我们默认调度平台同时启动了所有机器,否则要sleep一会,等待 proxy 启动成功 echo "Not starting lmdeploy serve proxy on worker node ${MLP_ROLE_INDEX}." fi # 启动 torchrun 并放入后台 From 52888adae7a925bde42496bbd44ab4050fe26734 Mon Sep 17 00:00:00 2001 From: AllentDan Date: Thu, 26 Dec 2024 14:12:15 +0800 Subject: [PATCH 7/7] format code snippet --- docs/en/llm/api_server.md | 15 +++++++++++---- docs/zh_cn/llm/api_server.md | 15 +++++++++++---- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/docs/en/llm/api_server.md b/docs/en/llm/api_server.md index 9828672d2f..274ec2ff25 100644 --- a/docs/en/llm/api_server.md +++ b/docs/en/llm/api_server.md @@ -260,7 +260,10 @@ Following are two steps to launch multiple api servers through torchrun. Just cr import os import socket from typing import List, Literal + import fire + + def get_host_ip(): try: s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) @@ -269,11 +272,13 @@ def get_host_ip(): finally: s.close() return ip + + def main(model_path: str, - tp: int=1, + tp: int = 1, proxy_url: str = 'http://0.0.0.0:8000', port: int = 23333, - backend: Literal['turbomind', 'pytorch']='turbomind'): + backend: Literal['turbomind', 'pytorch'] = 'turbomind'): local_rank = int(os.environ.get('LOCAL_RANK', -1)) world_size = int(os.environ.get('WORLD_SIZE', -1)) local_ip = get_host_ip() @@ -282,13 +287,15 @@ def main(model_path: str, port = port[local_rank] else: port += local_rank * 10 - if (world_size-local_rank)%tp==0: - rank_list = ','.join([str(local_rank+i) for i in range(tp)]) + if (world_size - local_rank) % tp == 0: + rank_list = ','.join([str(local_rank + i) for i in range(tp)]) command = f'CUDA_VISIBLE_DEVICES={rank_list} lmdeploy serve api_server {model_path} '\ f'--server-name {local_ip} --server-port {port} --tp {tp} '\ f'--proxy-url {proxy_url} --backend {backend}' print(f'running command: {command}') os.system(command) + + if __name__ == '__main__': fire.Fire(main) ``` diff --git a/docs/zh_cn/llm/api_server.md b/docs/zh_cn/llm/api_server.md index ee5e42e7ee..8bb91c619e 100644 --- a/docs/zh_cn/llm/api_server.md +++ b/docs/zh_cn/llm/api_server.md @@ -269,7 +269,10 @@ curl http://{server_ip}:{server_port}/v1/chat/interactive \ import os import socket from typing import List, Literal + import fire + + def get_host_ip(): try: s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) @@ -278,11 +281,13 @@ def get_host_ip(): finally: s.close() return ip + + def main(model_path: str, - tp: int=1, + tp: int = 1, proxy_url: str = 'http://0.0.0.0:8000', port: int = 23333, - backend: Literal['turbomind', 'pytorch']='turbomind'): + 
backend: Literal['turbomind', 'pytorch'] = 'turbomind'): local_rank = int(os.environ.get('LOCAL_RANK', -1)) world_size = int(os.environ.get('WORLD_SIZE', -1)) local_ip = get_host_ip() @@ -291,13 +296,15 @@ def main(model_path: str, port = port[local_rank] else: port += local_rank * 10 - if (world_size-local_rank)%tp==0: - rank_list = ','.join([str(local_rank+i) for i in range(tp)]) + if (world_size - local_rank) % tp == 0: + rank_list = ','.join([str(local_rank + i) for i in range(tp)]) command = f'CUDA_VISIBLE_DEVICES={rank_list} lmdeploy serve api_server {model_path} '\ f'--server-name {local_ip} --server-port {port} --tp {tp} '\ f'--proxy-url {proxy_url} --backend {backend}' print(f'running command: {command}') os.system(command) + + if __name__ == '__main__': fire.Fire(main) ```
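Once every api_server has registered with the proxy, clients can send requests to the proxy URL as if it were a single api_server. A quick sanity check might look like the sketch below; it assumes the proxy forwards the standard OpenAI-compatible routes and that `{model_name}` is an id returned by `/v1/models`:

```shell
# list the models served behind the proxy
curl http://{proxy_node_name}:{proxy_node_port}/v1/models
# send a chat request routed through the proxy
curl http://{proxy_node_name}:{proxy_node_port}/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "{model_name}", "messages": [{"role": "user", "content": "Hello"}]}'
```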