Torchrun launching multiple api_server (#2402)
* Add torchrun launching multiple api_server

* update with proxy

* custom tp and backend

* typo

* add an example

* refine

* format code snippet
AllentDan authored Dec 26, 2024
1 parent 191a7dd commit d9b8372
Showing 3 changed files with 149 additions and 1 deletion.
51 changes: 51 additions & 0 deletions docs/en/llm/api_server.md
@@ -249,6 +249,57 @@ curl http://{server_ip}:{server_port}/v1/chat/interactive \
lmdeploy serve gradio api_server_url --server-name ${gradio_ui_ip} --server-port ${gradio_ui_port}
```

## Launch multiple api servers

The following two steps launch multiple api servers through torchrun. Create a python script with the code shown below, then:

1. Launch the proxy server through `lmdeploy serve proxy` and note its URL (a minimal example follows this list).
2. Launch the script through `torchrun --nproc_per_node 2 script.py InternLM/internlm2-chat-1_8b --proxy_url http://{proxy_node_name}:{proxy_node_port}`. **Note**: do not use `0.0.0.0:8000` here; pass the real IP address of the node instead, e.g. `11.25.34.55:8000`.
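
For step 1, the proxy server can be started like this (an illustration; substitute the node's real IP and a free port):

```shell
lmdeploy serve proxy --server-name 11.25.34.55 --server-port 8000
```

The script used in step 2 is shown below.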

```python
import os
import socket
from typing import List, Literal

import fire


def get_host_ip():
    """Return this host's outbound IP address via a UDP socket probe."""
    try:
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        s.connect(('8.8.8.8', 80))
        ip = s.getsockname()[0]
    finally:
        s.close()
    return ip


def main(model_path: str,
         tp: int = 1,
         proxy_url: str = 'http://0.0.0.0:8000',
         port: int = 23333,
         backend: Literal['turbomind', 'pytorch'] = 'turbomind'):
    local_rank = int(os.environ.get('LOCAL_RANK', -1))
    world_size = int(os.environ.get('WORLD_SIZE', -1))
    local_ip = get_host_ip()
    if isinstance(port, List):
        # an explicit port was given for every rank
        assert len(port) == world_size
        port = port[local_rank]
    else:
        # otherwise derive a distinct port for each rank from the base port
        port += local_rank * 10
    if (world_size - local_rank) % tp == 0:
        # this rank leads a group of `tp` GPUs: launch one api_server on them
        rank_list = ','.join([str(local_rank + i) for i in range(tp)])
        command = f'CUDA_VISIBLE_DEVICES={rank_list} lmdeploy serve api_server {model_path} '\
                  f'--server-name {local_ip} --server-port {port} --tp {tp} '\
                  f'--proxy-url {proxy_url} --backend {backend}'
        print(f'running command: {command}')
        os.system(command)


if __name__ == '__main__':
    fire.Fire(main)
```
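
For example, assuming two 8-GPU nodes and a proxy already running at `http://11.25.34.55:8000`, each node could run the same command with a custom tensor-parallel size and backend (a sketch; adjust the values to your cluster):

```shell
torchrun --nproc_per_node 8 script.py InternLM/internlm2-chat-1_8b \
    --tp 4 \
    --backend pytorch \
    --proxy_url http://11.25.34.55:8000
```

With `--tp 4`, every group of 4 GPUs serves one api_server instance, so this launches two servers per node and registers all of them with the proxy.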

## FAQ

1. When a user gets `"finish_reason":"length"`, it means the session is too long to be continued. The session length can be
83 changes: 83 additions & 0 deletions docs/zh_cn/llm/api_server.md
@@ -258,6 +258,89 @@ curl http://{server_ip}:{server_port}/v1/chat/interactive \
}'
```

## Launch multiple api_server instances simultaneously

Launch a multi-node, multi-GPU service directly in two steps. First, create a launch script with the code below. Then:

1. Start the proxy server with `lmdeploy serve proxy`.
2. Launch the script through torchrun: `torchrun --nproc_per_node 2 script.py InternLM/internlm2-chat-1_8b --proxy_url http://{proxy_node_name}:{proxy_node_port}`. **Note**: for multi-node, multi-GPU deployments, do not use the default URL `0.0.0.0:8000`; enter the address of the real IP instead, e.g. `11.25.34.55:8000`. In the multi-node case, since no communication between worker nodes is required, there is no need to specify torchrun arguments such as `--nnodes`; it is enough to run a single-node torchrun once on each node.

```python
import os
import socket
from typing import List, Literal

import fire


def get_host_ip():
    """Return this host's outbound IP address via a UDP socket probe."""
    try:
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        s.connect(('8.8.8.8', 80))
        ip = s.getsockname()[0]
    finally:
        s.close()
    return ip


def main(model_path: str,
         tp: int = 1,
         proxy_url: str = 'http://0.0.0.0:8000',
         port: int = 23333,
         backend: Literal['turbomind', 'pytorch'] = 'turbomind'):
    local_rank = int(os.environ.get('LOCAL_RANK', -1))
    world_size = int(os.environ.get('WORLD_SIZE', -1))
    local_ip = get_host_ip()
    if isinstance(port, List):
        # an explicit port was given for every rank
        assert len(port) == world_size
        port = port[local_rank]
    else:
        # otherwise derive a distinct port for each rank from the base port
        port += local_rank * 10
    if (world_size - local_rank) % tp == 0:
        # this rank leads a group of `tp` GPUs: launch one api_server on them
        rank_list = ','.join([str(local_rank + i) for i in range(tp)])
        command = f'CUDA_VISIBLE_DEVICES={rank_list} lmdeploy serve api_server {model_path} '\
                  f'--server-name {local_ip} --server-port {port} --tp {tp} '\
                  f'--proxy-url {proxy_url} --backend {backend}'
        print(f'running command: {command}')
        os.system(command)


if __name__ == '__main__':
    fire.Fire(main)
```
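
If you need specific ports rather than the derived `port + local_rank * 10`, the `port` argument can also be a list with one entry per rank. A sketch (this assumes `fire` parses the quoted bracketed literal into a Python list):

```shell
torchrun --nproc_per_node 2 script.py InternLM/internlm2-chat-1_8b \
    --proxy_url http://11.25.34.55:8000 \
    --port "[23333, 24444]"
```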

### Example

To further show how to use the multi-node, multi-GPU service in a cluster environment, here is an example on the Volcano Engine cloud:

```shell
#!/bin/bash
# Activate the conda environment
source /path/to/your/home/miniconda3/bin/activate /path/to/your/home/miniconda3/envs/your_env
export HOME=/path/to/your/home
# Get the master node's IP address (assuming MLP_WORKER_0_HOST holds it)
MASTER_IP=${MLP_WORKER_0_HOST}
# Check whether this is the master node
if [ "${MLP_ROLE_INDEX}" -eq 0 ]; then
    # Start lmdeploy serve proxy in the background
    echo "Starting lmdeploy serve proxy on master node..."
    PROXY_PORT=8000
    lmdeploy serve proxy --server-name ${MASTER_IP} --server-port ${PROXY_PORT} &
else
    # Here we assume the scheduling platform starts all machines at the same time;
    # otherwise, sleep for a while until the proxy has started successfully
    echo "Not starting lmdeploy serve proxy on worker node ${MLP_ROLE_INDEX}."
fi
# Launch torchrun
# Once more: in the multi-node case there is no need to pass --nnodes or --master-addr;
# it is enough to run a single-node torchrun on each machine
torchrun \
    --nproc_per_node=${MLP_WORKER_GPU} \
    /path/to/script.py \
    InternLM/internlm2-chat-1_8b 8 http://${MASTER_IP}:${PROXY_PORT}
# Print the host's IP addresses
echo "Host IP addresses:"
hostname -I
```

## Connect to a WebUI

LMDeploy provides two ways, gradio and [OpenAOE](https://github.com/InternLM/OpenAOE), to connect a WebUI to the api_server.
16 changes: 15 additions & 1 deletion lmdeploy/serve/openai/api_server.py
@@ -946,6 +946,20 @@ async def stream_results() -> AsyncGenerator[bytes, None]:
    return JSONResponse(ret)


def handle_torchrun():
    """To disable mmengine logging logic when using torchrun."""

    def dummy_get_device_id():
        return 0

    if int(os.environ.get('LOCAL_RANK', -1)) > 0:
        from lmdeploy.vl.model.utils import _set_func

        # the replacement can't be recovered
        _set_func('mmengine.logging.logger._get_device_id',
                  dummy_get_device_id)


@router.on_event('startup')
async def startup_event():
    if VariableInterface.proxy_url is None:
@@ -1069,8 +1083,8 @@ def serve(model_path: str,
        ssl_certfile = os.environ['SSL_CERTFILE']
        http_or_https = 'https'

    handle_torchrun()
    _, pipeline_class = get_task(model_path)

    VariableInterface.async_engine = pipeline_class(
        model_path=model_path,
        model_name=model_name,
