diff --git a/llama.cpp-base/Dockerfile b/llama.cpp-base/Dockerfile index b67187b..793d273 100644 --- a/llama.cpp-base/Dockerfile +++ b/llama.cpp-base/Dockerfile @@ -22,7 +22,7 @@ RUN apt-get update && \ WORKDIR /app -RUN git clone https://github.com/ggerganov/llama.cpp.git . && git checkout b3135 +RUN git clone https://github.com/ggerganov/llama.cpp.git . && git checkout b3853 # Set nvcc architecture ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} diff --git a/llama.cpp-base/llama.cpp-base/Dockerfile b/llama.cpp-base/llama.cpp-base/Dockerfile deleted file mode 100644 index 15b2c6d..0000000 --- a/llama.cpp-base/llama.cpp-base/Dockerfile +++ /dev/null @@ -1,55 +0,0 @@ -ARG UBUNTU_VERSION=22.04 -# This needs to generally match the container host's environment. -# We use a slightly older version for greater compatibility -ARG CUDA_VERSION=12.3.1 -# CUDA build image -ARG CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} -# CUDA runtime image -ARG CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} -# CUDA base image (excludes cublas) -ARG CUDA_BASE_CONTAINER=nvidia/cuda:${CUDA_VERSION}-base-ubuntu${UBUNTU_VERSION} - -FROM ${CUDA_DEV_CONTAINER} as build - -# try more targetted build -# ARG CUDA_DOCKER_ARCH=all -# try targetind sm_80, the A100's version -# this supports all ampere and later GPUs -ARG CUDA_DOCKER_ARCH=sm_80 - -RUN apt-get update && \ - apt-get install -y build-essential git libcurl4-openssl-dev ccache - -WORKDIR /app - -RUN git clone https://github.com/ggerganov/llama.cpp.git . && git checkout b3853 - -# Set nvcc architecture -ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} -# Enable CUDA -ENV LLAMA_CUDA=1 -# Enable cURL -ENV LLAMA_CURL=1 -# Enable quantized kv -ENV LLAMA_CUDA_FA_ALL_QUANTS=1 - -RUN make -j 12 - -FROM ${CUDA_BASE_CONTAINER} as runtime - -RUN /bin/echo -e '#!/bin/bash\nDEBIAN_FRONTEND=noninteractive\napt-get update && apt-get install -y $@ && apt-get clean autoclean && apt-get autoremove --yes && rm -rf /var/lib/apt/lists/*' \ - > /usr/local/sbin/apt_install_clean.sh && \ - chmod a+x /usr/local/sbin/apt_install_clean.sh -RUN /bin/echo -e '#!/bin/bash\nDEBIAN_FRONTEND=noninteractive\napt-get update && apt-get remove -y $@ && apt-get clean autoclean && apt-get autoremove --yes && rm -rf /var/lib/apt/lists/*' \ - > /usr/local/sbin/apt_remove_clean.sh && \ - chmod a+x /usr/local/sbin/apt_remove_clean.sh - -# we need just CUDA and CUBLAS -# this saves ~1GB vs the -runtime image -RUN /usr/local/sbin/apt_install_clean.sh libcublas-12-3 libcurl4 libgomp1 - -# copy server and batched bench -RUN mkdir -p /llama.cpp -COPY --from=build /app/llama-server /app/llama-batched-bench /llama.cpp/ - -ENTRYPOINT [ "/llama.cpp/server" ]