diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml
index 6abd0381..97b1618a 100644
--- a/.github/workflows/workflow_inference_gaudi2.yml
+++ b/.github/workflows/workflow_inference_gaudi2.yml
@@ -88,9 +88,9 @@ jobs:
           DF_SUFFIX=".gaudi2"
           TARGET=${{steps.target.outputs.target}}
           if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
-            dockerfile="dev/docker/Dockerfile.habana_vllm"
+            dockerfile="dev/docker/ci/Dockerfile.habana_vllm"
           else
-            dockerfile="dev/docker/Dockerfile.habana"
+            dockerfile="dev/docker/ci/Dockerfile.habana"
           fi
           docker build --build-arg CACHEBUST=1 -f ${dockerfile} -t ${TARGET}:habana .
           docker container prune -f
diff --git a/.github/workflows/workflow_test_benchmark.yml b/.github/workflows/workflow_test_benchmark.yml
index d737675c..ba57af94 100644
--- a/.github/workflows/workflow_test_benchmark.yml
+++ b/.github/workflows/workflow_test_benchmark.yml
@@ -68,7 +68,7 @@ jobs:
         run: |
           DF_SUFFIX=".vllm"
           TARGET=${{steps.target.outputs.target}}
-          docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=${{ inputs.http_proxy }} --build-arg https_proxy=${{ inputs.https_proxy }} -f dev/docker/Dockerfile${DF_SUFFIX} -t ${TARGET}:latest
+          docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=${{ inputs.http_proxy }} --build-arg https_proxy=${{ inputs.https_proxy }} -f dev/docker/ci/Dockerfile${DF_SUFFIX} -t ${TARGET}:latest
           docker container prune -f
           docker image prune -f
diff --git a/README.md b/README.md
index 4870bab4..b8f5c5ca 100644
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@ LLM-on-Ray's modular workflow structure is designed to comprehensively cater to
 
 ![llm-on-ray](./docs/assets/solution_technical_overview.png)
 
-## Getting Started
+## Getting Started Locally With Source Code
 This guide will assist you in setting up LLM-on-Ray on Intel CPU locally, covering the initial setup, finetuning models, and deploying them for serving.
 
 ### Setup
@@ -109,6 +109,70 @@ After deploying the model endpoint, you can access and test it by using the scri
 python examples/inference/api_server_simple/query_single.py --model_endpoint http://127.0.0.1:8000/gpt2
 ```
 
+## Getting Started With Docker
+This guide will assist you in setting up LLM-on-Ray with Docker.
+
+```bash
+git clone https://github.com/intel/llm-on-ray.git
+cd llm-on-ray
+```
+The Dockerfile for users is `dev/docker/Dockerfile.user`.
+
+#### 1. Source Docker Functions
+Detailed Docker parameters can be set in `dev/scripts/docker-functions.sh`.
+```bash
+source dev/scripts/docker-functions.sh
+```
+
+#### 2. Build Docker Image
+The default build uses CPU and DeepSpeed for LLM serving.
+```bash
+build_docker
+```
+
+Change the build_docker function's args for a different environment.
+
+Use vLLM for LLM serving.
+```bash
+build_docker vllm
+```
+
+Use IPEX-LLM for LLM serving.
+```bash
+build_docker ipex-llm
+```
+
+#### 3. Start Docker
+Change any settings in `dev/scripts/docker-functions.sh` as needed.
+
+Run the container on CPU serving the default gpt2 model.
+```bash
+start_docker
+```
+
+Run the container on CPU serving another supported model.
+```bash
+start_docker llama-2-7b-chat-hf
+```
+
+Run the container with a different environment and model via `start_docker {environment} {model}`, for example:
+```bash
+start_docker vllm llama-2-7b-chat-hf
+```
+
+#### 4. Start LLM-on-Ray
+The model serving port inside the Docker container is mapped to the local host.
+
+Using the requests library:
+```bash
+python examples/inference/api_server_openai/query_http_requests.py
+```
+
+Using the OpenAI SDK:
+```bash
+python examples/inference/api_server_openai/query_openai_sdk.py
+```
+
 ## Documents
 The following are detailed guidelines for pretraining, finetuning and serving LLMs in various computing environment.
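The query scripts above talk to an OpenAI-compatible endpoint, so the service can also be checked directly with curl. A minimal sketch, assuming the serve address `http://localhost:8000/v1` and the default `gpt2` model; adjust host, port and model name to your deployment:

```bash
# Minimal sketch: query the OpenAI-compatible endpoint directly with curl.
# Assumes the default address http://localhost:8000/v1 and the default gpt2 model.
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "gpt2",
        "messages": [{"role": "user", "content": "Hello!"}],
        "max_tokens": 32
      }'
```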
diff --git a/dev/docker/Dockerfile.user b/dev/docker/Dockerfile.user
new file mode 100644
index 00000000..18fa4ec3
--- /dev/null
+++ b/dev/docker/Dockerfile.user
@@ -0,0 +1,49 @@
+# syntax=docker/dockerfile:1
+FROM ubuntu:22.04
+
+# Define build arguments
+ARG DOCKER_NAME=default
+ARG PYPJ=default
+ENV LANG C.UTF-8
+
+WORKDIR /root/
+
+RUN --mount=type=cache,target=/var/cache/apt apt-get update -y \
+    && apt-get install -y build-essential cmake wget curl git vim htop ssh net-tools \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+ENV CONDA_DIR /opt/conda
+RUN wget --quiet https://github.com/conda-forge/miniforge/releases/download/23.3.1-1/Miniforge3-Linux-x86_64.sh -O ~/miniforge.sh && \
+    /bin/bash ~/miniforge.sh -b -p /opt/conda
+ENV PATH $CONDA_DIR/bin:$PATH
+
+# setup env
+SHELL ["/bin/bash", "--login", "-c"]
+
+RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \
+    unset -f conda && \
+    export PATH=$CONDA_DIR/bin/:${PATH} && \
+    mamba config --add channels intel && \
+    mamba install -y -c conda-forge python==3.9 gxx=12.3 gxx_linux-64=12.3 libxcrypt
+
+# Used to invalidate docker build cache with --build-arg CACHEBUST=$(date +%s)
+ARG CACHEBUST=1
+
+RUN git clone https://github.com/intel/llm-on-ray.git
+RUN if [ -d "llm-on-ray" ]; then echo "Clone successful"; else echo "Clone failed" && exit 1; fi
+WORKDIR /root/llm-on-ray
+
+
+RUN ls -la
+
+RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[${PYPJ}] --extra-index-url https://download.pytorch.org/whl/cpu \
+    --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
+
+# Use shell scripting to conditionally install packages
+RUN if [ "${DOCKER_NAME}" = ".cpu_and_deepspeed" ]; then ds_report && ./dev/scripts/install-oneapi.sh; fi
+RUN if [ "${DOCKER_NAME}" = ".ipex-llm" ]; then ./dev/scripts/install-oneapi.sh; fi
+RUN if [ "${DOCKER_NAME}" = ".vllm" ]; then ./dev/scripts/install-vllm-cpu.sh; fi
+
+
+ENTRYPOINT ["sh", "./dev/scripts/entrypoint_user.sh"]
diff --git a/dev/docker/README.md b/dev/docker/README.md
index 8224e8e5..0e80243b 100644
--- a/dev/docker/README.md
+++ b/dev/docker/README.md
@@ -1 +1,6 @@
-Dockerfiles for CI tests. There could be one Dockerfile with ARG declared to distinguish different pip extras. However, ARG will bust cache of 'pip install', which usually takes long time, when build docker image. Instead, we have two almost identical Dockerfiles here to improve CI efficiency.
+# Dockerfiles for Users
+
+* `Dockerfile.user` builds the llm-on-ray Docker image for running on Intel CPU.
+* `Dockerfile.habana` builds the llm-on-ray Docker image for running on the [Intel Gaudi AI accelerator](https://habana.ai/products/gaudi/).
+
+__NOTICE:__ Dockerfiles in `ci/` are for CI tests only and are not intended for daily use.
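`build_docker` in `dev/scripts/docker-functions.sh` is a thin wrapper around `docker build` with `Dockerfile.user` and the `DOCKER_NAME`/`PYPJ` build args declared above. A roughly equivalent manual invocation for the vLLM variant, as a sketch with proxy build args omitted:

```bash
# Rough manual equivalent of `build_docker vllm` (sketch; proxy args omitted).
docker build ./ \
  --build-arg CACHEBUST=1 \
  --build-arg DOCKER_NAME=.vllm \
  --build-arg PYPJ=vllm \
  -f dev/docker/Dockerfile.user \
  -t servingvllm:latest
```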
diff --git a/dev/docker/Dockerfile.cpu_and_deepspeed b/dev/docker/ci/Dockerfile.cpu_and_deepspeed
similarity index 98%
rename from dev/docker/Dockerfile.cpu_and_deepspeed
rename to dev/docker/ci/Dockerfile.cpu_and_deepspeed
index d4320a41..cf82ed0e 100644
--- a/dev/docker/Dockerfile.cpu_and_deepspeed
+++ b/dev/docker/ci/Dockerfile.cpu_and_deepspeed
@@ -39,4 +39,4 @@ RUN ds_report
 # Used to invalidate docker build cache with --build-arg CACHEBUST=$(date +%s)
 ARG CACHEBUST=1
 COPY ./dev/scripts/install-oneapi.sh /tmp
-RUN /tmp/install-oneapi.sh
+RUN /tmp/install-oneapi.sh
\ No newline at end of file
diff --git a/dev/docker/Dockerfile.cpu_and_deepspeed.pip_non_editable b/dev/docker/ci/Dockerfile.cpu_and_deepspeed.pip_non_editable
similarity index 97%
rename from dev/docker/Dockerfile.cpu_and_deepspeed.pip_non_editable
rename to dev/docker/ci/Dockerfile.cpu_and_deepspeed.pip_non_editable
index 1f4da7e9..ea9151b8 100644
--- a/dev/docker/Dockerfile.cpu_and_deepspeed.pip_non_editable
+++ b/dev/docker/ci/Dockerfile.cpu_and_deepspeed.pip_non_editable
@@ -36,4 +36,4 @@ RUN ds_report
 # Used to invalidate docker build cache with --build-arg CACHEBUST=$(date +%s)
 ARG CACHEBUST=1
 COPY ./dev/scripts/install-oneapi.sh /tmp
-RUN /tmp/install-oneapi.sh
+RUN /tmp/install-oneapi.sh
\ No newline at end of file
diff --git a/dev/docker/ci/Dockerfile.habana b/dev/docker/ci/Dockerfile.habana
new file mode 100644
index 00000000..efdddf6c
--- /dev/null
+++ b/dev/docker/ci/Dockerfile.habana
@@ -0,0 +1,32 @@
+FROM vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
+
+ENV LANG=en_US.UTF-8
+
+WORKDIR /root/llm-on-ray
+
+COPY ./pyproject.toml .
+COPY ./MANIFEST.in .
+
+# create llm_on_ray package directory to bypass the following 'pip install -e' command
+RUN mkdir ./llm_on_ray
+
+RUN pip install -e . && \
+    pip install --upgrade-strategy eager optimum[habana] && \
+    pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.1
+
+# Optional. Comment out if you are not using the UI
+COPY ./dev/scripts/install-ui.sh /tmp
+
+RUN /tmp/install-ui.sh
+
+RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
+    service ssh restart
+
+ENV no_proxy=localhost,127.0.0.1
+
+# Required by DeepSpeed
+ENV RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES=1
+
+ENV PT_HPU_LAZY_ACC_PAR_MODE=0
+
+ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
diff --git a/dev/docker/Dockerfile.habana_vllm b/dev/docker/ci/Dockerfile.habana_vllm
similarity index 100%
rename from dev/docker/Dockerfile.habana_vllm
rename to dev/docker/ci/Dockerfile.habana_vllm
diff --git a/dev/docker/Dockerfile.ipex-llm b/dev/docker/ci/Dockerfile.ipex-llm
similarity index 98%
rename from dev/docker/Dockerfile.ipex-llm
rename to dev/docker/ci/Dockerfile.ipex-llm
index 07c6b971..24418380 100644
--- a/dev/docker/Dockerfile.ipex-llm
+++ b/dev/docker/ci/Dockerfile.ipex-llm
@@ -37,4 +37,4 @@ RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[ipex-llm] --extr
 # Used to invalidate docker build cache with --build-arg CACHEBUST=$(date +%s)
 ARG CACHEBUST=1
 COPY ./dev/scripts/install-oneapi.sh /tmp
-RUN /tmp/install-oneapi.sh
+RUN /tmp/install-oneapi.sh
\ No newline at end of file
diff --git a/dev/docker/Dockerfile.tests_cpu b/dev/docker/ci/Dockerfile.tests_cpu
similarity index 100%
rename from dev/docker/Dockerfile.tests_cpu
rename to dev/docker/ci/Dockerfile.tests_cpu
diff --git a/dev/docker/Dockerfile.tests_cpu_and_deepspeed b/dev/docker/ci/Dockerfile.tests_cpu_and_deepspeed
similarity index 100%
rename from dev/docker/Dockerfile.tests_cpu_and_deepspeed
rename to dev/docker/ci/Dockerfile.tests_cpu_and_deepspeed
diff --git a/dev/docker/Dockerfile.vllm b/dev/docker/ci/Dockerfile.vllm
similarity index 99%
rename from dev/docker/Dockerfile.vllm
rename to dev/docker/ci/Dockerfile.vllm
index c4abc067..cd2da96c 100644
--- a/dev/docker/Dockerfile.vllm
+++ b/dev/docker/ci/Dockerfile.vllm
@@ -38,4 +38,4 @@ RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[cpu] --extra-ind
 # Install vllm-cpu
 # Activate base first for loading g++ envs ($CONDA_PREFIX/etc/conda/activate.d/*)
 RUN --mount=type=cache,target=/root/.cache/pip \
-    source /opt/conda/bin/activate base && ./install-vllm-cpu.sh
+    source /opt/conda/bin/activate base && ./install-vllm-cpu.sh
\ No newline at end of file
diff --git a/dev/scripts/ci-functions.sh b/dev/scripts/ci-functions.sh
index 2240a444..fde5f071 100644
--- a/dev/scripts/ci-functions.sh
+++ b/dev/scripts/ci-functions.sh
@@ -26,10 +26,10 @@ build_and_prune() {
     fi
 
     echo "Build Docker image and perform cleaning operation"
-    echo "docker build ./ ${docker_args[@]} -f dev/docker/Dockerfile${DF_SUFFIX} -t ${TARGET}:latest && yes | docker container prune && yes | docker image prune -f"
+    echo "docker build ./ ${docker_args[@]} -f dev/docker/ci/Dockerfile${DF_SUFFIX} -t ${TARGET}:latest && yes | docker container prune && yes | docker image prune -f"
 
     # Build Docker image and perform cleaning operation
-    docker build ./ "${docker_args[@]}" -f dev/docker/Dockerfile${DF_SUFFIX} -t ${TARGET}:latest && yes | docker container prune && yes
+    docker build ./ "${docker_args[@]}" -f dev/docker/ci/Dockerfile${DF_SUFFIX} -t ${TARGET}:latest && yes | docker container prune && yes
     docker image prune -f
 }
diff --git a/dev/scripts/docker-functions.sh b/dev/scripts/docker-functions.sh
new file mode 100644
index 00000000..a0397ac9
--- /dev/null
+++ b/dev/scripts/docker-functions.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+set -eo pipefail
+
+# If your model needs HF_TOKEN, modify "model_description.config.use_auth_token" in the model's config file, e.g. "llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml"
+# Mount your own llm-on-ray directory here
+code_checkout_path=$PWD
+# Mount your own huggingface cache path here
+model_cache_path=$HOME'/.cache/huggingface/hub'
+MODEL_CACHE_PATH_LOCAL='/root/.cache/huggingface/hub'
+CODE_CHECKOUT_PATH_LOCAL='/root/llm-on-ray'
+
+
+build_docker() {
+    local DOCKER_NAME=$1
+
+    docker_args=()
+    docker_args+=("--build-arg=CACHEBUST=1")
+    if [ "$DOCKER_NAME" == "vllm" ]; then
+        docker_args+=("--build-arg=DOCKER_NAME=".vllm"")
+        docker_args+=("--build-arg=PYPJ="vllm"")
+    elif [ "$DOCKER_NAME" == "ipex-llm" ]; then
+        docker_args+=("--build-arg=DOCKER_NAME=".ipex-llm"")
+        docker_args+=("--build-arg=PYPJ="ipex-llm"")
+    else
+        docker_args+=("--build-arg=DOCKER_NAME=".cpu_and_deepspeed"")
+        docker_args+=("--build-arg=PYPJ="cpu,deepspeed"")
+    fi
+
+    if [ -n "$http_proxy" ]; then
+        docker_args+=("--build-arg=http_proxy=$http_proxy")
+    fi
+
+    if [ -n "$https_proxy" ]; then
+        docker_args+=("--build-arg=https_proxy=$https_proxy")
+    fi
+
+
+    echo "Build Docker image and perform cleaning operation"
+    echo "docker build ./ ${docker_args[@]} -f dev/docker/Dockerfile.user -t serving${DOCKER_NAME}:latest"
+
+    # Build Docker image and perform cleaning operation
+    docker build ./ "${docker_args[@]}" -f dev/docker/Dockerfile.user -t serving${DOCKER_NAME}:latest
+
+}
+
+start_docker() {
+    local DOCKER_NAME=$1
+    local MODEL_NAME=$2
+
+    docker_args=()
+    docker_args+=("--name=serving${DOCKER_NAME}")
+    if [ -z "$MODEL_NAME" ]; then
+        echo "use default model"
+    else
+        docker_args+=("-e=model_name=${MODEL_NAME}")
+    fi
+
+    if [ -n "$http_proxy" ]; then
+        docker_args+=("-e=http_proxy=$http_proxy")
+    fi
+
+    if [ -n "$https_proxy" ]; then
+        docker_args+=("-e=https_proxy=$https_proxy")
+    fi
+
+    docker_args+=("-e=OPENAI_BASE_URL=${OPENAI_BASE_URL:-http://localhost:8000/v1}")
+    docker_args+=("-e=OPENAI_API_KEY=${OPENAI_API_KEY:-not_a_real_key}")
+
+    # If you need your local llm-on-ray repository or the huggingface model cache inside the container, keep the corresponding mounts below
+    docker_args+=("-v=$code_checkout_path:${CODE_CHECKOUT_PATH_LOCAL}")
+    docker_args+=("-v=${model_cache_path}:${MODEL_CACHE_PATH_LOCAL}")
+
+    echo "docker run -ti ${docker_args[@]} serving${DOCKER_NAME}:latest"
+    docker run -ti "${docker_args[@]}" serving${DOCKER_NAME}:latest
+}
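`start_docker` assembles a plain `docker run`: the model name, proxy settings and OpenAI variables are passed as environment variables, and the source tree plus the Hugging Face cache are mounted into the container. A roughly equivalent manual command for `start_docker vllm llama-2-7b-chat-hf`, as a sketch using the default paths from the script:

```bash
# Rough manual equivalent of `start_docker vllm llama-2-7b-chat-hf` (sketch).
docker run -ti \
  --name servingvllm \
  -e model_name=llama-2-7b-chat-hf \
  -e OPENAI_BASE_URL=http://localhost:8000/v1 \
  -e OPENAI_API_KEY=not_a_real_key \
  -v "$PWD:/root/llm-on-ray" \
  -v "$HOME/.cache/huggingface/hub:/root/.cache/huggingface/hub" \
  servingvllm:latest
```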
diff --git a/dev/scripts/entrypoint_user.sh b/dev/scripts/entrypoint_user.sh
new file mode 100644
index 00000000..ea298fa4
--- /dev/null
+++ b/dev/scripts/entrypoint_user.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+set -e
+
+# Default serve cmd
+if ! pgrep -f 'ray'; then
+    echo "Ray is not running. Starting Ray..."
+    # start Ray
+    ray start --head
+    echo "Ray started."
+else
+    echo "Ray is already running."
+fi
+# Prepare openai-related dependencies
+pip install "openai>=1.0"
+
+if [ -n "$model_name" ]; then
+    echo "Using User Model: $model_name"
+    llm_on_ray-serve --models $model_name --keep_serve_terminal
+else
+    echo "Using Default Model: gpt2"
+    llm_on_ray-serve --config_file llm_on_ray/inference/models/gpt2.yaml --keep_serve_terminal
+fi
+
+exec /bin/bash
\ No newline at end of file
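The entrypoint above starts a Ray head node and serves either the user-specified model or gpt2. If you need to restart serving or inspect the Ray cluster, you can attach to the running container; a minimal sketch, assuming a container named `servingvllm` started via `start_docker vllm`:

```bash
# Sketch: attach to the running container started by start_docker.
docker exec -it servingvllm bash

# Inside the container:
ray status    # confirm the Ray head started by the entrypoint is up
llm_on_ray-serve --config_file llm_on_ray/inference/models/gpt2.yaml --keep_serve_terminal
```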