Skip to content

Commit

Permalink
WIP: [CI/Build] simplify Dockerfile build for ARM64 / GH200 (#11212)
Browse files Browse the repository at this point in the history
Signed-off-by: drikster80 <ed.sealing@gmail.com>
Co-authored-by: drikster80 <ed.sealing@gmail.com>
  • Loading branch information
cennn and drikster80 authored Dec 16, 2024
1 parent 17138af commit b3b1526
Showing 5 changed files with 64 additions and 11 deletions.
40 changes: 32 additions & 8 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -11,6 +11,7 @@ ARG CUDA_VERSION=12.4.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12
ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive

# Install Python and other dependencies
@@ -46,9 +47,14 @@ WORKDIR /workspace
# install build and runtime dependencies
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-cuda.txt

RUN --mount=type=cache,target=/root/.cache/pip \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
python3 -m pip install -r requirements-cuda-arm64.txt; \
fi

# cuda arch list used by torch
# can be useful for both `dev` and `test`
@@ -63,13 +69,19 @@ ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}

#################### WHEEL BUILD IMAGE ####################
FROM base AS build
ARG TARGETPLATFORM

# install build dependencies
COPY requirements-build.txt requirements-build.txt

RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-build.txt

RUN --mount=type=cache,target=/root/.cache/pip \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
python3 -m pip install -r requirements-cuda-arm64.txt; \
fi

COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
@@ -134,15 +146,18 @@ COPY requirements-test.txt requirements-test.txt
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt

#################### DEV IMAGE ####################

#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12
WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM

COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt

RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
@@ -168,18 +183,25 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

# install vllm wheel first, so that torch etc will be installed
# Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
python3 -m pip install dist/*.whl --verbose

RUN --mount=type=cache,target=/root/.cache/pip \
. /etc/environment && \
python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
pip uninstall -y torch && \
python3 -m pip install -r requirements-cuda-arm64.txt; \
fi

RUN --mount=type=cache,target=/root/.cache/pip \
. /etc/environment && \
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \
fi
COPY examples examples
#################### vLLM installation IMAGE ####################


#################### TEST IMAGE ####################
# image to run unit testing suite
# note that this uses vllm installed by `pip`
@@ -209,7 +231,6 @@ COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
RUN mkdir test_docs
RUN mv docs test_docs/
RUN mv vllm test_docs/

#################### TEST IMAGE ####################

#################### OPENAI API SERVER ####################
@@ -218,8 +239,11 @@ FROM vllm-base AS vllm-openai

# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' timm==0.9.10

if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10'; \
else \
pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10'; \
fi
ENV VLLM_USAGE_SOURCE production-docker-image

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
26 changes: 26 additions & 0 deletions docs/source/serving/deploying_with_docker.rst
Original file line number Diff line number Diff line change
@@ -37,6 +37,32 @@ You can build and run vLLM from source via the provided `Dockerfile <https://git
current GPU type the machine is running on, you can add the argument ``--build-arg torch_cuda_arch_list=""``
for vLLM to find the current GPU type and build for that.

Building for Arm64/aarch64
--------------------------

A Docker image can be built for aarch64 systems such as the NVIDIA Grace Hopper (GH200). At the time of this writing, this requires
PyTorch Nightly and should be considered **experimental**. Passing the flag ``--platform "linux/arm64"`` to ``docker build`` will attempt to build for arm64.

.. note::

    Multiple modules must be compiled, so this process can take a while. We recommend using the ``--build-arg max_jobs=`` and ``--build-arg nvcc_threads=``
    flags to speed up the build process. However, ensure that ``max_jobs`` is substantially larger than ``nvcc_threads`` to get the most benefit.
    Keep an eye on memory usage with parallel jobs, as it can be substantial (see the example below).

.. code-block:: console

    # Example of building on an NVIDIA GH200 server. (Memory usage: ~12GB, Build time: ~1475s / ~25 min, Image size: 7.26GB)
    $ DOCKER_BUILDKIT=1 sudo docker build . \
        --target vllm-openai \
        --platform "linux/arm64" \
        -t vllm/vllm-gh200-openai:latest \
        --build-arg max_jobs=66 \
        --build-arg nvcc_threads=2 \
        --build-arg torch_cuda_arch_list="9.0+PTX" \
        --build-arg vllm_fa_cmake_gpu_arches="90-real"

To run vLLM:

2 changes: 1 addition & 1 deletion requirements-build.txt
Original file line number Diff line number Diff line change
@@ -4,6 +4,6 @@ ninja
packaging
setuptools>=61
setuptools-scm>=8
torch==2.5.1
torch==2.5.1; platform_machine != 'aarch64'
wheel
jinja2
3 changes: 3 additions & 0 deletions requirements-cuda-arm64.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
--index-url https://download.pytorch.org/whl/nightly/cu124
torchvision==0.22.0.dev20241215; platform_machine == 'aarch64'
torch==2.6.0.dev20241210+cu124; platform_machine == 'aarch64'
4 changes: 2 additions & 2 deletions requirements-cuda.txt
Original file line number Diff line number Diff line change
@@ -4,7 +4,7 @@
# Dependencies for NVIDIA GPUs
ray >= 2.9
nvidia-ml-py >= 12.560.30 # for pynvml package
torch == 2.5.1
torch == 2.5.1; platform_machine != 'aarch64'
# These must be updated alongside torch
torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
torchvision == 0.20.1; platform_machine != 'aarch64' # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.1

0 comments on commit b3b1526

Please sign in to comment.