# Please update any changes made here to
# docs/source/dev/dockerfile-ubi/dockerfile-ubi.rst
## Global Args #################################################################
ARG BASE_UBI_IMAGE_TAG=9.4-1134
ARG PYTHON_VERSION=3.11
# NOTE: This setting only has an effect when not using prebuilt-wheel kernels
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
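# Example build invocation (illustrative only; the image tag is an assumption,
# and the --build-arg values shown simply restate the defaults above):
#   docker build -f Dockerfile.ubi \
#     --build-arg PYTHON_VERSION=3.11 \
#     --build-arg TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" \
#     -t vllm-openai:ubi .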
## Base Layer ##################################################################
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base
WORKDIR /workspace
ENV LANG=C.UTF-8 \
    LC_ALL=C.UTF-8
# Some utils for dev purposes - tar required for kubectl cp
RUN microdnf install -y \
    which procps findutils tar vim \
    && microdnf clean all
## Python Installer ############################################################
FROM base as python-install
ARG PYTHON_VERSION
ARG MINIFORGE_VERSION=24.3.0-0
RUN curl -fsSL -o ~/miniforge3.sh "https://github.com/conda-forge/miniforge/releases/download/${MINIFORGE_VERSION}/Miniforge3-$(uname)-$(uname -m).sh" && \
    chmod +x ~/miniforge3.sh && \
    bash ~/miniforge3.sh -b -p /opt/conda && \
    source "/opt/conda/etc/profile.d/conda.sh" && \
    conda create -y -p /opt/vllm python=${PYTHON_VERSION} && \
    conda activate /opt/vllm && \
    rm ~/miniforge3.sh
# use of the /opt/vllm env requires:
# ENV PATH=/opt/vllm/bin/:$PATH
## CUDA Base ###################################################################
FROM base as cuda-base
# The Nvidia operator won't allow deploying on CUDA 12.0 hosts if
# this env var is set to 12.2.0, even though it's compatible
#ENV CUDA_VERSION=12.2.0 \
ENV CUDA_VERSION=12.0.0 \
    NV_CUDA_LIB_VERSION=12.2.0-1 \
    NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
    NV_CUDA_CUDART_VERSION=12.2.53-1 \
    NV_CUDA_COMPAT_VERSION=535.104.12
RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
    https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
RUN microdnf install -y \
    cuda-cudart-12-2-${NV_CUDA_CUDART_VERSION} \
    cuda-compat-12-2-${NV_CUDA_COMPAT_VERSION} \
    && microdnf clean all
ENV CUDA_HOME="/usr/local/cuda" \
    PATH="/usr/local/nvidia/bin:${CUDA_HOME}/bin:${PATH}" \
    LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
## CUDA Runtime ################################################################
FROM cuda-base as cuda-runtime
ENV NV_NVTX_VERSION=12.2.53-1 \
    NV_LIBNPP_VERSION=12.1.1.14-1 \
    NV_LIBCUBLAS_VERSION=12.2.1.16-1 \
    NV_LIBNCCL_PACKAGE_VERSION=2.18.5-1+cuda12.2
RUN microdnf install -y \
    cuda-libraries-12-2-${NV_CUDA_LIB_VERSION} \
    cuda-nvtx-12-2-${NV_NVTX_VERSION} \
    libnpp-12-2-${NV_LIBNPP_VERSION} \
    libcublas-12-2-${NV_LIBCUBLAS_VERSION} \
    libnccl-${NV_LIBNCCL_PACKAGE_VERSION} \
    && microdnf clean all
## CUDA Development ############################################################
FROM cuda-base as cuda-devel
ENV NV_CUDA_CUDART_DEV_VERSION=12.2.53-1 \
    NV_NVML_DEV_VERSION=12.2.81-1 \
    NV_LIBCUBLAS_DEV_VERSION=12.2.1.16-1 \
    NV_LIBNPP_DEV_VERSION=12.1.1.14-1 \
    NV_LIBNCCL_DEV_PACKAGE_VERSION=2.18.5-1+cuda12.2
RUN microdnf install -y \
    cuda-command-line-tools-12-2-${NV_CUDA_LIB_VERSION} \
    cuda-libraries-devel-12-2-${NV_CUDA_LIB_VERSION} \
    cuda-minimal-build-12-2-${NV_CUDA_LIB_VERSION} \
    cuda-cudart-devel-12-2-${NV_CUDA_CUDART_DEV_VERSION} \
    cuda-nvml-devel-12-2-${NV_NVML_DEV_VERSION} \
    libcublas-devel-12-2-${NV_LIBCUBLAS_DEV_VERSION} \
    libnpp-devel-12-2-${NV_LIBNPP_DEV_VERSION} \
    libnccl-devel-${NV_LIBNCCL_DEV_PACKAGE_VERSION} \
    && microdnf clean all
ENV LIBRARY_PATH="$CUDA_HOME/lib64/stubs"
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-12.2/compat/
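# A quick way to confirm the compat libraries are now on the default linker
# path (illustrative; not part of the build):
#   ldconfig -p | grep -E 'libcuda|cuda-12'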
## Python CUDA Base ############################################################
FROM cuda-devel as python-cuda-base
COPY --from=python-install --link /opt/vllm /opt/vllm
ENV PATH=/opt/vllm/bin/:$PATH
# install cuda and common dependencies
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
    pip3 install \
        -r requirements-cuda.txt
## Development #################################################################
FROM python-cuda-base AS dev
# install build and runtime dependencies
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
    --mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
    --mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \
    --mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \
    pip3 install \
        -r requirements-cuda.txt \
        -r requirements-dev.txt
## Proto Compilation ###########################################################
FROM python-install AS gen-protos
ENV PATH=/opt/vllm/bin/:$PATH
RUN microdnf install -y \
    make \
    findutils \
    && microdnf clean all
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=Makefile,target=Makefile \
    --mount=type=bind,source=proto,target=proto \
    make gen-protos
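# The generated files land under /workspace/vllm/entrypoints/grpc/pb and are
# copied into the build stage below (see the COPY --from=gen-protos step).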
## Extension Cache #############################################################
# Instead of compiling the extension artifacts on every build, copy them from a pre-built wheel.
# This might not work if the PyTorch and CUDA versions don't match!
FROM base as prebuilt-wheel
RUN microdnf install -y \
    unzip \
    && microdnf clean all
ARG PYTHON_VERSION
# The wheel version must match the CUDA and PyTorch versions used in this image;
# 0.4.3 is built for CUDA 12.1 and PyTorch 2.3.0
ARG VLLM_WHEEL_VERSION=0.4.3
RUN curl -Lo vllm.whl https://github.com/vllm-project/vllm/releases/download/v${VLLM_WHEEL_VERSION}/vllm-${VLLM_WHEEL_VERSION}-cp${PYTHON_VERSION//.}-cp${PYTHON_VERSION//.}-manylinux1_x86_64.whl \
    && unzip vllm.whl \
    && rm vllm.whl
# compiled extensions located at /workspace/vllm/*.so
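# Sanity check (illustrative; exact filenames depend on the wheel contents):
#   ls /workspace/vllm/*.so   # e.g. _C.cpython-311-x86_64-linux-gnu.so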
## Builder #####################################################################
FROM dev AS build
# install build dependencies
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
    pip install -r requirements-build.txt
# copy input files
COPY csrc csrc
COPY setup.py setup.py
COPY cmake cmake
COPY CMakeLists.txt CMakeLists.txt
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py
ARG TORCH_CUDA_ARCH_LIST
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=2
ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
# Set up the CUDA paths for the build. Ref: https://github.com/vllm-project/vllm/blob/main/.github/workflows/scripts/build.sh#L6-L8
ENV PATH=/usr/local/cuda/bin:$PATH
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
# Copy the entire directory before building wheel
COPY --link vllm vllm
# Comment out this section if building the *.so files from scratch
##################################################
# Copy the prebuilt *.so files
COPY --from=prebuilt-wheel --link /workspace/vllm/*.so /workspace/vllm/
ENV VLLM_USE_PRECOMPILED=1
##################################################
# Uncomment this section if building the *.so files from scratch
#RUN microdnf install -y git \
#    && microdnf clean all
##################################################
# Copy over the generated *.pb2 files
COPY --from=gen-protos --link /workspace/vllm/entrypoints/grpc/pb vllm/entrypoints/grpc/pb
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/pip \
    CMAKE_BUILD_TYPE=Release python3 setup.py bdist_wheel --dist-dir=dist
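# The build drops a wheel into ./dist; the filename is roughly (version and
# tags are illustrative, the exact string comes from setup.py):
#   dist/vllm-0.4.3-cp311-cp311-linux_x86_64.whl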
#################### libsodium Build IMAGE ####################
FROM base as libsodium-builder
RUN microdnf install -y gcc gzip \
    && microdnf clean all
WORKDIR /usr/src/libsodium
ARG LIBSODIUM_VERSION=1.0.19
RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM_VERSION}-RELEASE/libsodium-${LIBSODIUM_VERSION}.tar.gz \
    && tar -xzvf libsodium*.tar.gz \
    && rm -f libsodium*.tar.gz \
    && mv libsodium*/* ./
RUN ./configure && make && make check
## Release #####################################################################
# Note from the non-UBI Dockerfile:
# We used the base CUDA image because PyTorch installs its own CUDA libraries.
# However, pynccl depends on the CUDA libraries, so we had to switch to the runtime image.
# In the future it would be nice to get a container with PyTorch and CUDA without duplicating CUDA.
FROM cuda-runtime AS vllm-openai
WORKDIR /workspace
# Create release python environment
COPY --from=python-cuda-base --link /opt/vllm /opt/vllm
ENV PATH=/opt/vllm/bin/:$PATH
# install the vllm wheel first, so that torch etc. will be installed as its dependencies
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
    --mount=type=cache,target=/root/.cache/pip \
    pip install $(echo dist/*.whl)'[tensorizer]' --verbose
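# Note: the quoting above attaches the [tensorizer] extra to the expanded wheel
# path, so pip effectively sees dist/<wheel-name>.whl[tensorizer] and installs
# the wheel together with its tensorizer optional dependencies.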
# Install the vllm_nccl package, which is a bit quirky
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
    # The "install" (an NCCL download) happens in `setup.py`, so it only runs when the package is built...
    # Remove the already-installed package and the cached wheel
    pip uninstall -y vllm-nccl-cu12 \
    && pip cache remove vllm_nccl* \
    # install the version depended on by the vllm requirements
    && pip install vllm-nccl-cu12 -r requirements-cuda.txt \
    # The lib is downloaded to root's home directory... move it
    && mv ~/.config/vllm/nccl/cu12/libnccl.so.2* /usr/local/lib/libnccl.so.2
ENV VLLM_NCCL_SO_PATH=/usr/local/lib/libnccl.so.2
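# Sanity check (illustrative): the relocated library should now exist, e.g.
#   test -f "$VLLM_NCCL_SO_PATH"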
RUN --mount=type=cache,target=/root/.cache/pip \
    pip3 install \
        # additional dependencies for the TGIS gRPC server
        grpcio-tools==1.63.0 \
        # additional dependencies for openai api_server
        accelerate==0.30.0 \
        # hf_transfer for faster HF hub downloads
        hf_transfer==0.1.6
# Triton needs a C compiler at runtime to JIT-compile its kernels
RUN microdnf install -y gcc \
    && microdnf clean all
# patch triton (fix for #720)
COPY triton_patch/custom_cache_manager.py /opt/vllm/lib/python3.11/site-packages/triton/runtime/custom_cache_manager.py
# Install libsodium for Tensorizer encryption
RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
    cd /usr/src/libsodium \
    && make install
ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
ENV HF_HUB_OFFLINE=1 \
    PORT=8000 \
    GRPC_PORT=8033 \
    HOME=/home/vllm \
    VLLM_USAGE_SOURCE=production-docker-image \
    VLLM_WORKER_MULTIPROC_METHOD=fork \
    TRITON_CACHE_MANAGER="triton.runtime.custom_cache_manager:CustomCacheManager"
# setup non-root user for OpenShift
RUN microdnf install -y shadow-utils \
    && umask 002 \
    && useradd --uid 2000 --gid 0 vllm \
    && microdnf remove -y shadow-utils \
    && microdnf clean all \
    && chmod g+rwx $HOME /usr/src /workspace
COPY LICENSE /licenses/vllm.md
USER 2000
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
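# Example run (illustrative; the image tag, port mapping, and model mount are
# assumptions; HF_HUB_OFFLINE=1 above means models must be provided locally):
#   docker run --gpus all -p 8000:8000 \
#     -v /path/to/models:/models vllm-openai:ubi \
#     --model /models/my-model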