diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml new file mode 100644 index 000000000..8e050404e --- /dev/null +++ b/.github/release-drafter.yml @@ -0,0 +1,26 @@ +categories: + - title: '🚀 Features' + labels: + - 'type: enhancement' + - 'type: epic' + - 'type: feature request' + - title: '🐛 Bug Fixes' + labels: + - 'type: bug' + - title: '🧰 Maintenance' + labels: + - 'type: chore' + - 'type: ci' + - title: '📖 Documentation' + labels: + - 'type: documentation' +change-template: '- $TITLE @$AUTHOR (#$NUMBER)' +change-title-escapes: '\<*_&' # You can add # and @ to disable mentions, and add ` to disable code blocks. +template: | + ## Changes + + $CHANGES + + ## Contributors + + $CONTRIBUTORS diff --git a/.github/runners/linux/Dockerfile.multi b/.github/runners/linux/Dockerfile.multi new file mode 100644 index 000000000..47989e687 --- /dev/null +++ b/.github/runners/linux/Dockerfile.multi @@ -0,0 +1,142 @@ +# Multi-stage Dockerfile +ARG BASE_IMAGE=nvcr.io/nvidia/pytorch +ARG BASE_TAG=24.02-py3 +ARG DEVEL_IMAGE=devel + +FROM ${BASE_IMAGE}:${BASE_TAG} as base + +# https://www.gnu.org/software/bash/manual/html_node/Bash-Startup-Files.html +# The default values come from `nvcr.io/nvidia/pytorch` +ENV BASH_ENV=${BASH_ENV:-/etc/bash.bashrc} +ENV ENV=${ENV:-/etc/shinit_v2} +SHELL ["/bin/bash", "-c"] + +FROM base as devel + +COPY docker/common/install_base.sh install_base.sh +RUN bash ./install_base.sh && rm install_base.sh + +COPY docker/common/install_cmake.sh install_cmake.sh +RUN bash ./install_cmake.sh && rm install_cmake.sh + +COPY docker/common/install_ccache.sh install_ccache.sh +RUN bash ./install_ccache.sh && rm install_ccache.sh + +# Download & install internal TRT release +ARG TRT_VER +ARG CUDA_VER +ARG CUDNN_VER +ARG NCCL_VER +ARG CUBLAS_VER +COPY docker/common/install_tensorrt.sh install_tensorrt.sh +RUN bash ./install_tensorrt.sh \ + --TRT_VER=${TRT_VER} \ + --CUDA_VER=${CUDA_VER} \ + --CUDNN_VER=${CUDNN_VER} \ + --NCCL_VER=${NCCL_VER} \ + --CUBLAS_VER=${CUBLAS_VER} && \ + rm install_tensorrt.sh + +# Install latest Polygraphy +COPY docker/common/install_polygraphy.sh install_polygraphy.sh +RUN bash ./install_polygraphy.sh && rm install_polygraphy.sh + +# Install mpi4py +COPY docker/common/install_mpi4py.sh install_mpi4py.sh +RUN bash ./install_mpi4py.sh && rm install_mpi4py.sh + +# Install PyTorch +ARG TORCH_INSTALL_TYPE="skip" +COPY docker/common/install_pytorch.sh install_pytorch.sh +RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh +COPY setup.py requirements.txt requirements-dev.txt ./ +
RUN pip install --no-cache-dir -r requirements-dev.txt + +RUN apt update && echo Y | apt install sudo cargo -y + +RUN cargo install sccache --locked + +ARG RUNNER_VERSION=2.317.0 + +ARG USER_ID=1000 +ARG USER_NAME=runner +ARG GROUP_ID=1000 +ARG GROUP_NAME=runner + +RUN (getent group ${GROUP_ID} || groupadd --gid ${GROUP_ID} ${GROUP_NAME}) && \ + (getent passwd ${USER_ID} || useradd --gid ${GROUP_ID} --uid ${USER_ID} --create-home --no-log-init --shell /bin/bash ${USER_NAME}) + + +RUN usermod -aG sudo ${USER_NAME} \ + && echo "%sudo ALL=(ALL:ALL) NOPASSWD:ALL" > /etc/sudoers \ + && echo "Defaults env_keep += \"DEBIAN_FRONTEND\"" >> /etc/sudoers + +ENV HOME=/home/runner + +# cd into the user directory, download and unzip the github actions runner +RUN cd /home/runner && mkdir actions-runner && cd actions-runner \ + && curl -O -L https://github.com/actions/runner/releases/download/v${RUNNER_VERSION}/actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz \ && 
tar xzf ./actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz + +RUN chown -R runner:runner /home/runner && /home/runner/actions-runner/bin/installdependencies.sh + +ADD docker/common/start.sh start.sh + +RUN chmod +x start.sh + +RUN sudo chmod +x /root/.cargo/bin/sccache + +USER runner + +ENV PATH=/root/.cargo/bin${PATH:+:${PATH}} + +ENTRYPOINT ["./start.sh"] + +FROM ${DEVEL_IMAGE} as wheel +WORKDIR /src/tensorrt_llm +COPY benchmarks benchmarks +COPY cpp cpp +COPY benchmarks benchmarks +COPY scripts scripts +COPY tensorrt_llm tensorrt_llm +COPY 3rdparty 3rdparty +COPY setup.py requirements.txt requirements-dev.txt ./ + +ARG BUILD_WHEEL_ARGS="--clean --trt_root /usr/local/tensorrt --python_bindings --benchmarks" +RUN python3 scripts/build_wheel.py ${BUILD_WHEEL_ARGS} + +FROM ${DEVEL_IMAGE} as release + +WORKDIR /app/tensorrt_llm +COPY --from=wheel /src/tensorrt_llm/build/tensorrt_llm*.whl . +RUN pip install tensorrt_llm*.whl --extra-index-url https://pypi.nvidia.com && \ + rm tensorrt_llm*.whl +COPY README.md ./ +COPY docs docs +COPY cpp/include include +RUN ln -sv $(python3 -c 'import site; print(f"{site.getsitepackages()[0]}/tensorrt_llm/libs")') lib && \ + test -f lib/libnvinfer_plugin_tensorrt_llm.so && \ + ln -sv lib/libnvinfer_plugin_tensorrt_llm.so lib/libnvinfer_plugin_tensorrt_llm.so.9 && \ + echo "/app/tensorrt_llm/lib" > /etc/ld.so.conf.d/tensorrt_llm.conf && \ + ldconfig +ARG SRC_DIR=/src/tensorrt_llm +COPY --from=wheel ${SRC_DIR}/benchmarks benchmarks +ARG CPP_BUILD_DIR=${SRC_DIR}/cpp/build +COPY --from=wheel \ + ${CPP_BUILD_DIR}/benchmarks/bertBenchmark \ + ${CPP_BUILD_DIR}/benchmarks/gptManagerBenchmark \ + ${CPP_BUILD_DIR}/benchmarks/gptSessionBenchmark \ + benchmarks/cpp/ +COPY examples examples +RUN chmod -R a+w examples && \ + rm -v \ + benchmarks/cpp/bertBenchmark.cpp \ + benchmarks/cpp/gptManagerBenchmark.cpp \ + benchmarks/cpp/gptSessionBenchmark.cpp \ + benchmarks/cpp/CMakeLists.txt +ARG GIT_COMMIT +ARG TRT_LLM_VER +ENV TRT_LLM_GIT_COMMIT=${GIT_COMMIT} \ + TRT_LLM_VERSION=${TRT_LLM_VER} + diff --git a/.github/runners/linux/start.sh b/.github/runners/linux/start.sh new file mode 100644 index 000000000..84d3c3dcd --- /dev/null +++ b/.github/runners/linux/start.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +RUNNER_REPO=$RUNNER_REPO +RUNNER_PAT=$RUNNER_PAT +RUNNER_GROUP=$RUNNER_GROUP +RUNNER_LABELS=$RUNNER_LABELS +RUNNER_NAME=$(hostname) + +cd /home/runner/actions-runner + +./config.sh --unattended --replace --url https://github.com/${RUNNER_REPO} --pat ${RUNNER_PAT} --name ${RUNNER_NAME} --runnergroup ${RUNNER_GROUP} --labels ${RUNNER_LABELS} --work /home/runner/actions-runner/_work + +cleanup() { + echo "Removing runner..." + ./config.sh remove --unattended --pat ${RUNNER_PAT} +} + +trap 'cleanup; exit 130' INT +trap 'cleanup; exit 143' TERM + +./run.sh & wait $! \ No newline at end of file diff --git a/.github/runners/windows/Dockerfile.window.runner b/.github/runners/windows/Dockerfile.window.runner new file mode 100644 index 000000000..b1cb5a7c9 --- /dev/null +++ b/.github/runners/windows/Dockerfile.window.runner @@ -0,0 +1,255 @@ +# https://learn.microsoft.com/en-us/visualstudio/install/build-tools-container?view=vs-2022 + +# Use the Windows Server Core 2019 image. +FROM mcr.microsoft.com/windows/servercore:ltsc2019 + +# Restore the default Windows shell for correct batch processing. 
+# (Used for VS Build Tools installation) +SHELL ["cmd", "/S", "/C"] + +# ----------------------------------------------------------------------------- + +# Install CUDA 12.2 + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + $ProgressPreference = 'SilentlyContinue'; \ + Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_537.13_windows.exe \ + -OutFile "cuda_installer.exe"; \ + Start-Process cuda_installer.exe -Wait -ArgumentList '-s'; \ + Remove-Item cuda_installer.exe -Force + +# ----------------------------------------------------------------------------- + +# Install Python 3.10.11 + +# Download and install Python +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + $ProgressPreference = 'SilentlyContinue'; \ + Invoke-WebRequest -Uri https://www.python.org/ftp/python/3.10.11/python-3.10.11-amd64.exe -OutFile python-3.10.11.exe ; \ + Start-Process python-3.10.11.exe -Wait -ArgumentList '/quiet InstallAllUsers=1 PrependPath=1' ; \ + Remove-Item python-3.10.11.exe -Force + +# Add python3 command +RUN powershell -Command \ + cp "\"C:\\\\Program Files\\\\Python310\\\\python.exe\" \"C:\\\\Program Files\\\\Python310\\\\python3.exe\"" + +# ----------------------------------------------------------------------------- + +# Install Microsoft MPI + +# The latest version is 10.1.3, but it requires you to get a temporary download +# link. +# https://learn.microsoft.com/en-us/message-passing-interface/microsoft-mpi-release-notes +# We use 10.1.1 which has a release on the GitHub page +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + $ProgressPreference = 'SilentlyContinue'; \ + Invoke-WebRequest -Uri https://github.com/microsoft/Microsoft-MPI/releases/download/v10.1.1/msmpisetup.exe \ + -OutFile "msmpisetup.exe"; \ + Start-Process .\msmpisetup.exe -Wait ; \ + Remove-Item msmpisetup.exe -Force + +# Add MPI binaries to Path +RUN setx Path "%Path%;C:\Program Files\Microsoft MPI\Bin" + +# Download the MSMPI SDK +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + $ProgressPreference = 'SilentlyContinue'; \ + Invoke-WebRequest -Uri https://github.com/microsoft/Microsoft-MPI/releases/download/v10.1.1/msmpisdk.msi \ + -OutFile "msmpisdk.msi"; \ + Start-Process msiexec.exe -Wait -ArgumentList '/I msmpisdk.msi /quiet'; \ + Remove-Item msmpisdk.msi -Force + +# ----------------------------------------------------------------------------- + +# Install CMake + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + $ProgressPreference = 'SilentlyContinue'; \ + Invoke-WebRequest -Uri https://github.com/Kitware/CMake/releases/download/v3.27.7/cmake-3.27.7-windows-x86_64.msi \ + -OutFile "cmake.msi"; \ + Start-Process msiexec.exe -Wait -ArgumentList '/I cmake.msi /quiet'; \ + Remove-Item cmake.msi -Force + +# Add CMake binaries to Path +RUN setx Path "%Path%;C:\Program Files\CMake\bin" + +# ----------------------------------------------------------------------------- + +# Install VS Build Tools + +RUN \ + # Download the Build Tools bootstrapper. + curl -SL --output vs_buildtools.exe https://aka.ms/vs/17/release/vs_buildtools.exe \ + \ + # Install Build Tools with the Microsoft.VisualStudio.Workload.AzureBuildTools workload, excluding workloads and components with known issues. 
+ && (start /w vs_buildtools.exe --quiet --wait --norestart --nocache \ + --installPath "%ProgramFiles(x86)%\Microsoft Visual Studio\2022\BuildTools" \ + --includeRecommended \ + --add Microsoft.VisualStudio.Workload.MSBuildTools \ + --add Microsoft.VisualStudio.Workload.VCTools \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.10240 \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.10586 \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.14393 \ + --remove Microsoft.VisualStudio.Component.Windows81SDK \ + || IF "%ERRORLEVEL%"=="3010" EXIT 0) \ + \ + # Cleanup + && del /q vs_buildtools.exe + +# ----------------------------------------------------------------------------- + +# Install Vim (can delete this but it's nice to have) + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + $ProgressPreference = 'SilentlyContinue'; \ + Invoke-WebRequest -Uri https://ftp.nluug.nl/pub/vim/pc/gvim90.exe \ + -OutFile "install_vim.exe"; \ + Start-Process install_vim.exe -Wait -ArgumentList '/S'; \ + Remove-Item install_vim.exe -Force + +# Add Vim binaries to Path +RUN setx Path "%Path%;C:\Program Files (x86)\Vim\vim90" + +# ----------------------------------------------------------------------------- + +# Install Chocolatey +# Chocolatey is a package manager for Windows +# I probably could've used it to install some of the above, but I didn't... + +# If you try to install Chocolatey 2.0.0, it fails on .NET Framework 4.8 installation +# https://stackoverflow.com/a/76470753 +ENV chocolateyVersion=1.4.0 + +# https://docs.chocolatey.org/en-us/choco/setup#install-with-cmd.exe +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + powershell.exe -NoProfile -InputFormat None -ExecutionPolicy Bypass \ + -Command "[System.Net.ServicePointManager]::SecurityProtocol = 3072; \ + iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))" && \ + SET "PATH=%PATH%;%ALLUSERSPROFILE%\chocolatey\bin" + +# ----------------------------------------------------------------------------- + +# Install Git via Chocolatey +RUN powershell -Command \ + choco install git -y + +# ----------------------------------------------------------------------------- + +# Install CUDA 11.8 NVTX +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + $ProgressPreference = 'SilentlyContinue'; \ + Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe \ + -OutFile cuda_11.8.0_windows_network.exe; \ + Invoke-WebRequest -Uri https://7-zip.org/a/7zr.exe \ + -OutFile 7zr.exe + +RUN \ + 7zr.exe e -i!"nsight_nvtx\nsight_nvtx\NVIDIA NVTX Installer.x86_64.Release.v1.21018621.Win64.msi" cuda_11.8.0_windows_network.exe &&\ + msiexec.exe /i "NVIDIA NVTX Installer.x86_64.Release.v1.21018621.Win64.msi" /norestart /quiet &&\ + del "NVIDIA NVTX Installer.x86_64.Release.v1.21018621.Win64.msi" &&\ + del 7zr.exe &&\ + del cuda_11.8.0_windows_network.exe + +# ----------------------------------------------------------------------------- + +# Create a working directory +WORKDIR "C:\\\\workspace" + +# ----------------------------------------------------------------------------- + +# Download and unzip TensorrRT 9.3.0.1 for TensorRT-LLM +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + $ProgressPreference = 'SilentlyContinue'; \ + Invoke-WebRequest -Uri 
https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/9.3.0/tensorrt-9.3.0.1.windows10.win10.cuda-12.2.llm.beta.zip \ + -OutFile TensorRT-9.3.0.1.zip; \ + Expand-Archive .\TensorRT-9.3.0.1.zip -DestinationPath .; \ + Move-Item -Path .\TensorRT-9.3.0.1.Windows10.win10.cuda-12.2.llm.beta\TensorRT-9.3.0.1 -Destination .; \ + Remove-Item TensorRT-9.3.0.1.Windows10.win10.cuda-12.2.llm.beta -Force; \ + Remove-Item TensorRT-9.3.0.1.zip -Force + +# Add TensorRT libs to Path +RUN setx Path "%Path%;C:\workspace\TensorRT-9.3.0.1\lib" + +# Install TensorRT Python wheel +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + pip install TensorRT-9.3.0.1\python\tensorrt-9.3.0.post12.dev1-cp310-none-win_amd64.whl + +# ----------------------------------------------------------------------------- + +# Download and unzip cuDNN 8.9.7.29 for TensorRT-LLM +# https://developer.nvidia.com/downloads/compute/cudnn/secure/8.9.7/local_installers/12.x/cudnn-windows-x86_64-8.9.7.29_cuda12-archive.zip +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + $ProgressPreference = 'SilentlyContinue'; \ + Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-8.9.7.29_cuda12-archive.zip \ + -OutFile cuDNN.zip; \ + Expand-Archive .\cuDNN.zip -DestinationPath .; \ + New-Item -Path cuDNN -ItemType Directory; \ + Move-Item -Path .\cudnn-windows-x86_64-8.9.7.29_cuda12-archive\* -Destination .\cuDNN; \ + Remove-Item cudnn-windows-x86_64-8.9.7.29_cuda12-archive -Force; \ + Remove-Item cuDNN.zip -Force + +# Add cuDNN libs and bin to Path. +RUN setx Path "%Path%;C:\workspace\cuDNN\lib;C:\workspace\cuDNN\bin;" + +# ----------------------------------------------------------------------------- + +# Define the entry point for the docker container. +# This entry point launches the 64-bit PowerShell developer shell. 
+# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA +ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"] + +# ----------------------------------------------------------------------------- +# COPY requirements-windows.txt C:\\workspace\\requirements-windows.txt +# COPY requirements-dev-windows.txt C:\\workspace\\requirements-dev-windows.txt +# RUN python3 -m pip --no-cache-dir install -r C:\workspace\requirements-dev-windows.txt +# RUN Remove-Item "C:\workspace\requirements-windows.txt" -Force +# RUN Remove-Item "C:\workspace\requirements-dev-windows.txt" -Force + +# The command below lets MSVC recognize the CUDA compiler +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + Copy-Item -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\visual_studio_integration\MSBuildExtensions\*' -Destination 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v170\BuildCustomizations' + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + Copy-Item -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\visual_studio_integration\MSBuildExtensions\*' -Destination 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v160\BuildCustomizations' + +RUN powershell -Command \ + choco install sccache make Ninja -y; + +# RUN [Environment]::SetEnvironmentVariable('Path', $Env:Path + ';C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools', [EnvironmentVariableTarget]::Machine) + +ADD ./requirements-dev-windows.txt ./requirements-dev-windows.txt +ADD ./requirements-windows.txt ./requirements-windows.txt + +RUN python3 -m pip install --no-cache-dir -r .\requirements-dev-windows.txt + +ARG RUNNER_VERSION=2.317.0 + +# Define the entry point for the docker container. +# This entry point launches the 64-bit PowerShell developer shell. 
+# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA +# ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"] + +RUN Invoke-WebRequest \ + -Uri https://github.com/actions/runner/releases/download/v$env:RUNNER_VERSION/actions-runner-win-x64-$env:RUNNER_VERSION.zip \ + -OutFile runner.zip; \ + Expand-Archive -Path ./runner.zip -DestinationPath ./actions-runner; \ + Remove-Item -Path .\runner.zip; + +ADD runner.ps1 ./runner.ps1 + +RUN powershell -Command New-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 -PropertyType DWORD -Force + +CMD ["powershell.exe", "-ExecutionPolicy", "Unrestricted", "-File", ".\\runner.ps1"] \ No newline at end of file diff --git a/.github/runners/windows/requirements-dev-windows.txt b/.github/runners/windows/requirements-dev-windows.txt new file mode 100644 index 000000000..76c840cae --- /dev/null +++ b/.github/runners/windows/requirements-dev-windows.txt @@ -0,0 +1,17 @@ +-r requirements-windows.txt +--extra-index-url https://download.pytorch.org/whl/cu121 +datasets +einops +graphviz +mypy +parameterized +pre-commit +pybind11 +pybind11-stubgen +pytest-cov +pytest-forked +pytest-xdist +pywin32 +rouge_score +cloudpickle +typing-extensions==4.8.0 diff --git a/.github/runners/windows/requirements-windows.txt b/.github/runners/windows/requirements-windows.txt new file mode 100644 index 000000000..0bbd1f26a --- /dev/null +++ b/.github/runners/windows/requirements-windows.txt @@ -0,0 +1,30 @@ +--extra-index-url https://pypi.nvidia.com +accelerate==0.25.0 +build +colored +cuda-python==12.3.0 +diffusers==0.15.0 +mpi4py +numpy +onnx>=1.12.0 +polygraphy +psutil +pynvml>=11.5.0 +pulp +pandas +h5py==3.10.0 +pywin32 +StrEnum +sentencepiece>=0.1.99 +# WAR the new posting of "nvidia-cudnn-cu12~=9.0". +# "tensorrt==9.3.0.post12.dev1" specifies "nvidia-cudnn-cu12" but actually requires "nvidia-cudnn-cu12~=8.9". 
+nvidia-cudnn-cu12~=8.9; platform_machine == "x86_64" +tensorrt==9.3.0.post12.dev1 +tokenizers>=0.14 +# Default torch is CPU-only on Windows, so need to specify a torch version with GPU support +torch==2.2.0+cu121 +transformers==4.38.2 +wheel +optimum +evaluate +janus diff --git a/.github/runners/windows/runner.ps1 b/.github/runners/windows/runner.ps1 new file mode 100644 index 000000000..7ffa45de5 --- /dev/null +++ b/.github/runners/windows/runner.ps1 @@ -0,0 +1,3 @@ +$runnerName = (hostname.exe).Trim() +.\actions-runner\config.cmd --unattended --replace --url https://github.com/${env:RUNNER_REPO} --pat $env:RUNNER_PAT --runnergroup $env:RUNNER_GROUP --labels $env:RUNNER_LABELS --work $env:RUNNER_WORKDIR --name $runnerName +.\actions-runner\run.cmd \ No newline at end of file diff --git a/.github/workflows/auto_close_inactive_issues.yml b/.github/workflows/auto_close_inactive_issues.yml new file mode 100644 index 000000000..423adefbb --- /dev/null +++ b/.github/workflows/auto_close_inactive_issues.yml @@ -0,0 +1,25 @@ +# Ref: https://docs.github.com/en/actions/managing-issues-and-pull-requests/closing-inactive-issues +name: Close inactive issues +on: + schedule: + - cron: "30 1 * * *" + +jobs: + stale: + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + steps: + - uses: actions/stale@v9 + with: + days-before-issue-stale: 30 + days-before-issue-close: 15 + stale-issue-label: "stale" + exempt-issue-labels: "" + stale-issue-message: "This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 15 days." + close-issue-message: "This issue was closed because it has been stalled for 15 days with no activity." + days-before-pr-stale: -1 + days-before-pr-close: -1 + repo-token: ${{ secrets.GITHUB_TOKEN }} + debug-only: true diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 000000000..f59d8363f --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,217 @@ +name: CI + +on: + push: + tags: ["v[0-9]+.[0-9]+.[0-9]+"] + paths: + [ + ".github/scripts/**", + ".github/workflows/build.yml", + "**/CMakeLists.txt", + "**/Makefile", + "**/*.h", + "**/*.hpp", + "**/*.c", + "**/*.cpp", + "**/*.cu", + "**/*.cc", + "**/*.cxx", + "llama.cpp", + "!docs/**", + "!.gitignore", + "!README.md", + ] + workflow_dispatch: + +jobs: + create-draft-release: + runs-on: ubuntu-latest + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') + outputs: + upload_url: ${{ steps.create_release.outputs.upload_url }} + version: ${{ steps.get_version.outputs.version }} + permissions: + contents: write + steps: + - name: Extract tag name without v prefix + id: get_version + run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_ENV && echo "::set-output name=version::${GITHUB_REF#refs/tags/v}" + env: + GITHUB_REF: ${{ github.ref }} + - name: Create Draft Release + id: create_release + uses: actions/create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ github.ref_name }} + release_name: "${{ env.VERSION }}" + draft: true + prerelease: false + + build-and-test: + runs-on: ${{ matrix.runs-on }} + needs: [create-draft-release] + timeout-minutes: 1440 + strategy: + matrix: + include: + - os: "windows" + name: "cuda-12-2" + runs-on: "windows-tensorrt-llm-cuda-12-2" + run-e2e: false + s3-key-prefix: "windows-tensorrt-llm-ccache" + sccache-conf-path: 'C:\sccache.conf' + - os: "linux" + name: "cuda-12-3" + runs-on: "linux-tensorrt-llm-cuda-12-3" 
+ run-e2e: false + s3-key-prefix: "linux-tensorrt-llm" + sccache-conf-path: '/tmp/sccache.conf' + permissions: + contents: write + steps: + - name: Clone + id: checkout + uses: actions/checkout@v3 + with: + submodules: recursive + lfs: true + + - name: Install choco on Windows + if: runner.os == 'Windows' + run: | + choco install make pkgconfiglite ccache awscli -y + + - name: create sccache.conf file Linux + if: runner.os == 'Linux' + run: | + echo "[cache.s3]" > ${{ matrix.sccache-conf-path }} + echo 'bucket = "${{ secrets.MINIO_BUCKET_NAME }}"' >> ${{ matrix.sccache-conf-path }} + echo 'endpoint = "${{ secrets.MINIO_ENDPOINT }}"' >> ${{ matrix.sccache-conf-path }} + echo 'key_prefix = "${{ matrix.s3-key-prefix }}"' >> ${{ matrix.sccache-conf-path }} + echo 'use_ssl = false' >> ${{ matrix.sccache-conf-path }} + echo 'server_side_encryption = false' >> ${{ matrix.sccache-conf-path }} + echo 'no_credentials = false' >> ${{ matrix.sccache-conf-path }} + + - name: Download ccache from s3 + continue-on-error: true + if: runner.os == 'Windows' + run: | + Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1" + refreshenv + aws s3 cp s3://${{ secrets.MINIO_BUCKET_NAME }}/${{ matrix.s3-key-prefix }} C:\Users\ContainerAdministrator\AppData\Local\ccache --recursive --endpoint ${{ secrets.MINIO_ENDPOINT }} + env: + AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}" + AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}" + AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}" + + - name: start sccache server for linux + if: runner.os == 'Linux' + working-directory: cpp + run: | + sccache --start-server + env: + SCCACHE_BUCKET: "${{ secrets.MINIO_BUCKET_NAME }}" + SCCACHE_REGION: "${{ secrets.MINIO_REGION }}" + SCCACHE_ENDPOINT: "${{ secrets.MINIO_ENDPOINT }}" + SCCACHE_S3_USE_SSL: "false" + SCCACHE_S3_SERVER_SIDE_ENCRYPTION: "false" + SCCACHE_S3_KEY_PREFIX: "${{ matrix.s3-key-prefix }}" + SCCACHE_LOG: "debug" + SCCACHE_CONF: '${{ matrix.sccache-conf-path }}' + AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}" + AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}" + SCCACHE_IDLE_TIMEOUT: "0" + + # - name: start sccache server for windows + # if: runner.os == 'Windows' + # run: | + # sccache --start-server + # env: + # SCCACHE_IDLE_TIMEOUT: "0" + + - name: Build Dependencies + working-directory: cpp + run: | + make build-deps + + - name: Build + working-directory: cpp + run: | + make build + + - name: Pre-package + working-directory: cpp + run: | + make pre-package + + - name: Package + working-directory: cpp + run: | + make package + + - name: Upload Artifact + uses: actions/upload-artifact@v2 + with: + name: cortex.tensorrt-llm-${{ matrix.os }}-${{ matrix.name }} + path: cpp/tensorrt_llm/cortex.tensorrt-llm/cortex.tensorrt-llm.tar.gz + + - uses: actions/upload-release-asset@v1.0.1 + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ needs.create-draft-release.outputs.upload_url }} + asset_path: cpp/tensorrt_llm/cortex.tensorrt-llm/cortex.tensorrt-llm.tar.gz + asset_name: cortex.tensorrt-llm-${{ needs.create-draft-release.outputs.version }}-${{ matrix.os }}-${{ matrix.name }}.tar.gz + asset_content_type: application/gzip + + - name: Clean + if: always() + continue-on-error: true + run: | + sccache --stop-server + rm ${{ matrix.sccache-conf-path }} + + - name: Upload ccache to s3 + continue-on-error: true + if: always() && runner.os == 'Windows' + run: | + 
Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1" + refreshenv + aws s3 cp C:\Users\ContainerAdministrator\AppData\Local\ccache s3://${{ secrets.MINIO_BUCKET_NAME }}/${{ matrix.s3-key-prefix }} --recursive --endpoint ${{ secrets.MINIO_ENDPOINT }} + env: + AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}" + AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}" + AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}" + update_release_draft: + needs: [build-and-test] + permissions: + # write permission is required to create a github release + contents: write + # write permission is required for autolabeler + # otherwise, read permission is required at least + pull-requests: write + runs-on: ubuntu-latest + steps: + # (Optional) GitHub Enterprise requires GHE_HOST variable set + #- name: Set GHE_HOST + # run: | + # echo "GHE_HOST=${GITHUB_SERVER_URL##https:\/\/}" >> $GITHUB_ENV + + # Drafts your next Release notes as Pull Requests are merged into "master" + - uses: release-drafter/release-drafter@v5 + # (Optional) specify config name to use, relative to .github/. Default: release-drafter.yml + # with: + # config-name: my-config.yml + # disable-autolabeler: true + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Clean + if: always() + continue-on-error: true + run: | + sccache --stop-server + rm ${{ matrix.sccache-conf-path }} \ No newline at end of file diff --git a/.github/workflows/quality-gate.yml b/.github/workflows/quality-gate.yml new file mode 100644 index 000000000..533a2f226 --- /dev/null +++ b/.github/workflows/quality-gate.yml @@ -0,0 +1,136 @@ +name: CI Quality Gate + +on: + pull_request: + types: [opened, synchronize, reopened] + workflow_dispatch: + +jobs: + build-and-test: + runs-on: ${{ matrix.runs-on }} + timeout-minutes: 1440 + strategy: + fail-fast: false + matrix: + include: + - os: "windows" + name: "cuda-12-2" + runs-on: "windows-tensorrt-llm-cuda-12-2" + run-e2e: false + s3-key-prefix: "windows-tensorrt-llm-ccache" + sccache-conf-path: 'C:\sccache.conf' + - os: "linux" + name: "cuda-12-3" + runs-on: "linux-tensorrt-llm-cuda-12-3" + run-e2e: false + s3-key-prefix: "linux-tensorrt-llm" + sccache-conf-path: '/tmp/sccache.conf' + permissions: + contents: write + steps: + - name: Clone + id: checkout + uses: actions/checkout@v3 + with: + submodules: recursive + lfs: true + + - name: Install choco on Windows + if: runner.os == 'Windows' + run: | + choco install make pkgconfiglite ccache awscli -y + + - name: create sccache.conf file Linux + if: runner.os == 'Linux' + run: | + echo "[cache.s3]" > ${{ matrix.sccache-conf-path }} + echo 'bucket = "${{ secrets.MINIO_BUCKET_NAME }}"' >> ${{ matrix.sccache-conf-path }} + echo 'endpoint = "${{ secrets.MINIO_ENDPOINT }}"' >> ${{ matrix.sccache-conf-path }} + echo 'key_prefix = "${{ matrix.s3-key-prefix }}"' >> ${{ matrix.sccache-conf-path }} + echo 'use_ssl = false' >> ${{ matrix.sccache-conf-path }} + echo 'server_side_encryption = false' >> ${{ matrix.sccache-conf-path }} + echo 'no_credentials = false' >> ${{ matrix.sccache-conf-path }} + + - name: Download ccache from s3 + continue-on-error: true + if: runner.os == 'Windows' + run: | + Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1" + refreshenv + aws s3 cp s3://${{ secrets.MINIO_BUCKET_NAME }}/${{ matrix.s3-key-prefix }} C:\Users\ContainerAdministrator\AppData\Local\ccache --recursive --endpoint ${{ secrets.MINIO_ENDPOINT }} + env: + AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}" + AWS_SECRET_ACCESS_KEY: 
"${{ secrets.MINIO_SECRET_ACCESS_KEY }}" + AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}" + + - name: start sccache server for linux + if: runner.os == 'Linux' + working-directory: cpp + run: | + sccache --start-server + env: + SCCACHE_BUCKET: "${{ secrets.MINIO_BUCKET_NAME }}" + SCCACHE_REGION: "${{ secrets.MINIO_REGION }}" + SCCACHE_ENDPOINT: "${{ secrets.MINIO_ENDPOINT }}" + SCCACHE_S3_USE_SSL: "false" + SCCACHE_S3_SERVER_SIDE_ENCRYPTION: "false" + SCCACHE_S3_KEY_PREFIX: "${{ matrix.s3-key-prefix }}" + SCCACHE_LOG: "debug" + SCCACHE_CONF: '${{ matrix.sccache-conf-path }}' + AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}" + AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}" + SCCACHE_IDLE_TIMEOUT: "0" + + # - name: start sccache server for windows + # if: runner.os == 'Windows' + # run: | + # sccache --start-server + # env: + # SCCACHE_IDLE_TIMEOUT: "0" + + - name: Build Dependencies + working-directory: cpp + run: | + make build-deps + + - name: Build + working-directory: cpp + run: | + make build + + - name: Pre-package + working-directory: cpp + run: | + make pre-package + + - name: Package + working-directory: cpp + run: | + make package + + - name: Upload Artifact + uses: actions/upload-artifact@v2 + with: + name: cortex.tensorrt-llm-${{ matrix.os }}-${{ matrix.name }} + path: cpp/tensorrt_llm/cortex.tensorrt-llm/cortex.tensorrt-llm.tar.gz + + - name: Clean + if: always() + continue-on-error: true + run: | + sccache --stop-server + rm ${{ matrix.sccache-conf-path }} + + - name: Upload ccache to s3 + continue-on-error: true + if: always() && runner.os == 'Windows' + run: | + Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1" + refreshenv + aws s3 cp C:\Users\ContainerAdministrator\AppData\Local\ccache s3://${{ secrets.MINIO_BUCKET_NAME }}/${{ matrix.s3-key-prefix }} --recursive --endpoint ${{ secrets.MINIO_ENDPOINT }} + env: + AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}" + AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}" + AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}" + + \ No newline at end of file diff --git a/cpp/Makefile b/cpp/Makefile new file mode 100644 index 000000000..fc469e2f6 --- /dev/null +++ b/cpp/Makefile @@ -0,0 +1,104 @@ +# Makefile for Cortex cortex.tensorrt-llm engine - Build, Lint, Test, and Clean +.PHONY: all build package run-e2e-test + +RUN_TESTS ?= false +CODE_SIGN ?= false +AZURE_KEY_VAULT_URI ?= xxxx +AZURE_CLIENT_ID ?= xxxx +AZURE_TENANT_ID ?= xxxx +AZURE_CLIENT_SECRET ?= xxxx +AZURE_CERT_NAME ?= xxxx + +# Default target, does nothing +all: + @echo "Specify a target to run" + +# Build the Cortex engine +build-deps: +ifeq ($(OS),Windows_NT) + @powershell -Command "cd tensorrt_llm/cortex.tensorrt-llm; cmake -S ./third-party -B ./build_deps/third-party -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE=Release -DCMAKE_OBJECT_PATH_MAX=500; cmake --build ./build_deps/third-party --config Release -j8;" +else + @cd tensorrt_llm/cortex.tensorrt-llm && cmake -S ./third-party -B ./build_deps/third-party -DCMAKE_BUILD_TYPE=Release -DCMAKE_OBJECT_PATH_MAX=500 && make -C ./build_deps/third-party -j 10 && rm -rf ./build_deps/third-party; +endif + +# Build the Cortex engine +build: +ifeq ($(OS),Windows_NT) + @powershell -Command "cd ..; python .\scripts\build_wheel.py -a '80-real;86-real;89-real' --trt_root 'C:\workspace\TensorRT-9.3.0.1\' -D 'BUILD_CORTEX_TENSORRT-LLM=ON' --use_ccache" + @powershell -Command "cd build; 
cmake .. -DCMAKE_CUDA_ARCHITECTURES='80-real;86-real;89-real' -DTRT_LIB_DIR='C:/workspace/TensorRT-9.3.0.1/lib' -DTRT_INCLUDE_DIR='C:/workspace/TensorRT-9.3.0.1/include' -DBUILD_CORTEX_TENSORRT-LLM=ON -DCMAKE_CUDA_COMPILER='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe' -DENABLE_MULTI_DEVICE=0 -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -G Ninja; cmake --build . --parallel 2 --config Release" +else + @mkdir -p build && cd build; \ + cmake .. -GNinja -DBUILD_CORTEX_TENSORRT-LLM=ON -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CUDA_COMPILER_LAUNCHER=sccache -DCMAKE_BUILD_TYPE='Release' -DBUILD_PYT='OFF' -DBUILD_PYBIND='OFF' -DNVTX_DISABLE='ON' -DCMAKE_CUDA_ARCHITECTURES='80-real;86-real;89-real' '-DENABLE_MULTI_DEVICE=0' '-DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc' -DBUILD_BENCHMARKS=OFF '-DBUILD_TESTS=OFF' -DTRT_LIB_DIR=/usr/local/tensorrt/lib -DTRT_INCLUDE_DIR=/usr/local/tensorrt/include; \ + cmake --build . --config Release; +endif + +# Prepackage the Cortex engine +pre-package: +ifeq ($(OS),Windows_NT) + @powershell -Command "cd tensorrt_llm\cortex.tensorrt-llm\; mkdir -p cortex.tensorrt-llm; cp ..\..\build\tensorrt_llm\cortex.tensorrt-llm\engine.dll cortex.tensorrt-llm\;" + @powershell -Command "cd tensorrt_llm\cortex.tensorrt-llm\; cp -Force C:\workspace\TensorRT-9.3.0.1\lib\nvinfer.dll cortex.tensorrt-llm\;" + @powershell -Command "cd tensorrt_llm\cortex.tensorrt-llm\; cp -Force C:\workspace\cuDNN\bin\cudnn64_8.dll cortex.tensorrt-llm\;" + @powershell -Command "cd tensorrt_llm\cortex.tensorrt-llm\; cp -Force C:\workspace\cuDNN\bin\cudnn_ops_infer64_8.dll cortex.tensorrt-llm\;" + @powershell -Command "cd tensorrt_llm\cortex.tensorrt-llm\; cp -Force ..\..\build\tensorrt_llm\plugins\nvinfer_plugin_tensorrt_llm.dll cortex.tensorrt-llm\;" + @powershell -Command "cd tensorrt_llm\cortex.tensorrt-llm\; cp -Force ..\..\build\tensorrt_llm\tensorrt_llm.dll cortex.tensorrt-llm\;" + @powershell -Command "cd tensorrt_llm\cortex.tensorrt-llm\; cp -Force C:\Windows\System32\msmpi.dll cortex.tensorrt-llm\;" + @powershell -Command "cd tensorrt_llm\cortex.tensorrt-llm\; cp -Force .\build_deps\_install\bin\zlib.dll cortex.tensorrt-llm\;" +else + cd ./tensorrt_llm/cortex.tensorrt-llm && \ + mkdir -p cortex.tensorrt-llm && \ + cp ../../build/tensorrt_llm/cortex.tensorrt-llm/libengine.$(shell uname | tr '[:upper:]' '[:lower:]' | sed 's/darwin/dylib/;s/linux/so/') cortex.tensorrt-llm && \ + cp /usr/local/tensorrt/targets/x86_64-linux-gnu/lib/libnvinfer.so.9 cortex.tensorrt-llm && \ + cp /usr/lib/x86_64-linux-gnu/libcudnn.so.8 cortex.tensorrt-llm && \ + cp /usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8 cortex.tensorrt-llm && \ + cp /home/runner/actions-runner/_work/cortex.tensorrt-llm/cortex.tensorrt-llm/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so.9 cortex.tensorrt-llm && \ + cp /home/runner/actions-runner/_work/cortex.tensorrt-llm/cortex.tensorrt-llm/cpp/build/tensorrt_llm/libtensorrt_llm.so cortex.tensorrt-llm && \ + cp /opt/hpcx/ompi/lib/libmpi.so.40 cortex.tensorrt-llm && \ + cp /usr/lib/x86_64-linux-gnu/libnccl.so cortex.tensorrt-llm +endif + +codesign: +ifeq ($(CODE_SIGN),false) + @echo "Skipping Code Sign" + @exit 0 +endif + +ifeq ($(OS),Windows_NT) + @powershell -Command "dotnet tool install --global AzureSignTool;" +endif + +package: +ifeq ($(OS),Windows_NT) + @powershell -Command "cd tensorrt_llm\cortex.tensorrt-llm\; 7z a -ttar temp.tar cortex.tensorrt-llm\*; 7z a -tgzip 
cortex.tensorrt-llm.tar.gz temp.tar;" +else + @cd tensorrt_llm/cortex.tensorrt-llm && \ + tar -czvf cortex.tensorrt-llm.tar.gz cortex.tensorrt-llm +endif + +run-e2e-test: +ifeq ($(RUN_TESTS),false) + @echo "Skipping tests" + @exit 0 +endif +ifeq ($(OS),Windows_NT) + @powershell -Command "echo hello" +else + echo "hello" +endif + +run-python-e2e-test: +ifeq ($(RUN_TESTS),false) + @echo "Skipping tests" + @exit 0 +endif +ifeq ($(OS),Windows_NT) + echo hello +else + echo hello +endif + +clean: +ifeq ($(OS),Windows_NT) + echo hello +else + echo "hello" +endif \ No newline at end of file diff --git a/cpp/tensorrt_llm/cortex.tensorrt-llm/CMakeLists.txt b/cpp/tensorrt_llm/cortex.tensorrt-llm/CMakeLists.txt index 5c5f25a25..0dde5a31b 100644 --- a/cpp/tensorrt_llm/cortex.tensorrt-llm/CMakeLists.txt +++ b/cpp/tensorrt_llm/cortex.tensorrt-llm/CMakeLists.txt @@ -15,13 +15,12 @@ # C++17 # engine init include(CheckIncludeFileCXX) - check_include_file_cxx(any HAS_ANY) check_include_file_cxx(string_view HAS_STRING_VIEW) check_include_file_cxx(coroutine HAS_COROUTINE) if(HAS_ANY - AND HAS_STRING_VIEW - AND HAS_COROUTINE) + AND HAS_STRING_VIEW + AND HAS_COROUTINE) set(CMAKE_CXX_STANDARD 20) elseif(HAS_ANY AND HAS_STRING_VIEW) set(CMAKE_CXX_STANDARD 17) @@ -29,10 +28,10 @@ else() set(CMAKE_CXX_STANDARD 14) endif() - -set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) +SET(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) + set(CMAKE_PREFIX_PATH ${CMAKE_CURRENT_SOURCE_DIR}/build_deps/_install) message(STATUS "Current Source Directory CORTEX: ${CMAKE_CURRENT_SOURCE_DIR}") @@ -41,9 +40,6 @@ message(STATUS "Current Cmake Prefix Path of CORTEX: ${CMAKE_PREFIX_PATH}") set(OPENSSL_USE_STATIC_LIBS TRUE) - -# Enable pkg-config support in CMake -find_package(PkgConfig REQUIRED) find_library(TRANTOR NAMES trantor HINTS "${CMAKE_PREFIX_PATH}/lib" @@ -53,9 +49,9 @@ find_library(JSONCPP HINTS "${CMAKE_PREFIX_PATH}/lib" ) -# Use pkg-config to find the SentencePiece library - if(NOT WIN32) # Linux + # Enable pkg-config support in CMake + find_package(PkgConfig REQUIRED) # Use pkg-config to find the SentencePiece library pkg_search_module(SENTENCEPIECE REQUIRED sentencepiece) else() # Windows @@ -77,8 +73,6 @@ add_custom_target(engine_proj) set(CXXOPTS_SRC_DIR ${PROJECT_SOURCE_DIR}/../3rdparty/cxxopts) add_subdirectory(${CXXOPTS_SRC_DIR} ${CMAKE_CURRENT_BINARY_DIR}/cxxopts) -# main -# add_executable(engine main.cc) add_library(engine SHARED src/tensorrt-llm_engine.cc) target_link_libraries( engine PUBLIC ${SHARED_TARGET} nvinfer_plugin_tensorrt_llm cxxopts::cxxopts sentencepiece PRIVATE ${JSONCPP} ${TRANTOR} ${CMAKE_THREAD_LIBS_INIT} ) diff --git a/cpp/tensorrt_llm/cortex.tensorrt-llm/Makefile b/cpp/tensorrt_llm/cortex.tensorrt-llm/Makefile index cbf60a948..1628262a3 100644 --- a/cpp/tensorrt_llm/cortex.tensorrt-llm/Makefile +++ b/cpp/tensorrt_llm/cortex.tensorrt-llm/Makefile @@ -34,21 +34,34 @@ endif package: ifeq ($(OS),Windows_NT) + @powershell -Command "mkdir -p cortex.tensorrt-llm; cp ..\..\build\tensorrt_llm\cortex.tensorrt-llm\engine.dll cortex.tensorrt-llm\;" + @powershell -Command "cp -Force C:\workspace\TensorRT-9.3.0.1\lib\nvinfer.dll cortex.tensorrt-llm\;" + @powershell -Command "cp -Force C:\workspace\cuDNN\bin\cudnn64_8.dll cortex.tensorrt-llm\;" + @powershell -Command "cp -Force C:\workspace\cuDNN\bin\cudnn_ops_infer64_8.dll cortex.tensorrt-llm\;" + @powershell -Command "cp -Force ..\..\build\tensorrt_llm\plugins\nvinfer_plugin_tensorrt_llm.dll cortex.tensorrt-llm\;" + @powershell 
-Command "cp -Force ..\..\build\tensorrt_llm\tensorrt_llm.dll cortex.tensorrt-llm\;" + @powershell -Command "cp -Force C:\Windows\System32\msmpi.dll cortex.tensorrt-llm\;" + @powershell -Command "cp -Force .\build_deps\_install\bin\zlib.dll cortex.tensorrt-llm\;" + @powershell -Command "7z a -ttar temp.tar cortex.tensorrt-llm\*; 7z a -tgzip cortex.tensorrt-llm.tar.gz temp.tar;" else @mkdir -p cortex.tensorrt-llm && \ cp ../../build/tensorrt_llm/cortex.tensorrt-llm/libengine.$(shell uname | tr '[:upper:]' '[:lower:]' | sed 's/darwin/dylib/;s/linux/so/') cortex.tensorrt-llm && \ - cp /usr/local/cuda-12.4/targets/x86_64-linux/lib/libcublas.so.12 cortex.tensorrt-llm && \ - cp /usr/local/cuda-12.4/targets/x86_64-linux/lib/libcublas.so.12.4.2.65 cortex.tensorrt-llm && \ - cp /usr/local/cuda-12.4/targets/x86_64-linux/lib/libcublasLt.so.12 cortex.tensorrt-llm && \ - cp /usr/local/cuda-12.4/targets/x86_64-linux/lib/libcublasLt.so.12.4.2.65 cortex.tensorrt-llm && \ - cp /usr/local/tensorrt/targets/x86_64-linux-gnu/lib/libnvinfer.so.10 cortex.tensorrt-llm && \ - cp /usr/local/tensorrt/targets/x86_64-linux-gnu/lib/libnvinfer.so.10.0.1 cortex.tensorrt-llm && \ - cp /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so cortex.tensorrt-llm && \ - cp /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so.10 cortex.tensorrt-llm && \ - cp /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libtensorrt_llm.so cortex.tensorrt-llm && \ - cp /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libtensorrt_llm_nvrtc_wrapper.so cortex.tensorrt-llm && \ + cp /usr/local/tensorrt/targets/x86_64-linux-gnu/lib/libnvinfer.so.9 cortex.tensorrt-llm && \ + cp /usr/local/tensorrt/targets/x86_64-linux-gnu/lib/libnvinfer.so.9.3.0 cortex.tensorrt-llm && \ + cp /usr/lib/x86_64-linux-gnu/libcudnn.so.8 cortex.tensorrt-llm && \ + cp /usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.7 cortex.tensorrt-llm && \ + cp /usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8 cortex.tensorrt-llm && \ + cp /usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.7 cortex.tensorrt-llm && \ + cp /home/runner/actions-runner/_work/cortex.tensorrt-llm/cortex.tensorrt-llm/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so cortex.tensorrt-llm && \ + cp /home/runner/actions-runner/_work/cortex.tensorrt-llm/cortex.tensorrt-llm/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so.9 cortex.tensorrt-llm && \ + cp /home/runner/actions-runner/_work/cortex.tensorrt-llm/cortex.tensorrt-llm/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so.9.3.0 cortex.tensorrt-llm && \ + cp /home/runner/actions-runner/_work/cortex.tensorrt-llm/cortex.tensorrt-llm/cpp/build/tensorrt_llm/libtensorrt_llm.so cortex.tensorrt-llm && \ + cp /opt/hpcx/ompi/lib/libmpi.so cortex.tensorrt-llm && \ + cp /opt/hpcx/ompi/lib/libmpi.so.40 cortex.tensorrt-llm && \ + cp /opt/hpcx/ompi/lib/libmpi.so.40.30.5 cortex.tensorrt-llm && \ + cp /usr/lib/x86_64-linux-gnu/libnccl.so cortex.tensorrt-llm && \ cp /usr/lib/x86_64-linux-gnu/libnccl.so.2 cortex.tensorrt-llm && \ - cp /usr/lib/x86_64-linux-gnu/libnccl.so.2.20.5 cortex.tensorrt-llm && \ + cp /usr/lib/x86_64-linux-gnu/libnccl.so.2.19.3 cortex.tensorrt-llm && \ tar -czvf cortex.tensorrt-llm.tar.gz cortex.tensorrt-llm endif diff --git a/cpp/tensorrt_llm/cortex.tensorrt-llm/base/cortex-common/cortextensorrtllmi.h b/cpp/tensorrt_llm/cortex.tensorrt-llm/base/cortex-common/cortextensorrtllmi.h deleted file mode 100644 index 
681d4d0cd..000000000 --- a/cpp/tensorrt_llm/cortex.tensorrt-llm/base/cortex-common/cortextensorrtllmi.h +++ /dev/null @@ -1,21 +0,0 @@ -#pragma once - -#include -#include - -#include "json/value.h" - -class CortexTensorrtLlmEngineI { - public: - virtual ~CortexTensorrtLlmEngineI() {} - - virtual void HandleChatCompletion( - std::shared_ptr jsonBody, - std::function&& callback) = 0; - virtual void LoadModel( - std::shared_ptr jsonBody, - std::function&& callback) = 0; - virtual void Destroy( - std::shared_ptr jsonBody, - std::function&& callback) = 0; -}; diff --git a/cpp/tensorrt_llm/cortex.tensorrt-llm/base/cortex-common/enginei.h b/cpp/tensorrt_llm/cortex.tensorrt-llm/base/cortex-common/enginei.h new file mode 100644 index 000000000..bcc0d81b7 --- /dev/null +++ b/cpp/tensorrt_llm/cortex.tensorrt-llm/base/cortex-common/enginei.h @@ -0,0 +1,44 @@ +#pragma once + +#include +#include + +#include "json/value.h" + +// Interface for inference engine. +// Note: only append new function to keep the compatibility. +class EngineI { + public: + virtual ~EngineI() {} + + virtual void HandleChatCompletion( + std::shared_ptr json_body, + std::function&& callback) = 0; + virtual void HandleEmbedding( + std::shared_ptr json_body, + std::function&& callback) = 0; + virtual void LoadModel( + std::shared_ptr json_body, + std::function&& callback) = 0; + virtual void UnloadModel( + std::shared_ptr json_body, + std::function&& callback) = 0; + virtual void GetModelStatus( + std::shared_ptr json_body, + std::function&& callback) = 0; + + // For backward compatible checking, add to list when we add more APIs + virtual bool IsSupported(const std::string& f) { + if (f == "HandleChatCompletion" || f == "HandleEmbedding" || + f == "UnloadModel" || f == "GetModelStatus" || + f == "GetModels") { + return true; + } + return false; + } + + // API to get running models. 
+ virtual void GetModels( + std::shared_ptr json_body, + std::function&& callback) = 0; +}; diff --git a/cpp/tensorrt_llm/cortex.tensorrt-llm/examples/server/server.h b/cpp/tensorrt_llm/cortex.tensorrt-llm/examples/server/server.h index 471725403..b38242d69 100644 --- a/cpp/tensorrt_llm/cortex.tensorrt-llm/examples/server/server.h +++ b/cpp/tensorrt_llm/cortex.tensorrt-llm/examples/server/server.h @@ -1,6 +1,6 @@ #pragma once -#include "cortex-common/cortextensorrtllmi.h" +#include "cortex-common/enginei.h" #include "dylib.h" #include @@ -11,7 +11,7 @@ class Server { public: Server() { dylib_ = std::make_unique("./engines/cortex.tensorrt-llm", "engine"); - auto func = dylib_->get_function("get_engine"); + auto func = dylib_->get_function("get_engine"); engine_ = func(); } @@ -23,7 +23,7 @@ class Server { public: std::unique_ptr dylib_; - CortexTensorrtLlmEngineI* engine_; + EngineI* engine_; struct SyncQueue { void push(std::pair&& p) { diff --git a/cpp/tensorrt_llm/cortex.tensorrt-llm/src/models/chat_completion_request.h b/cpp/tensorrt_llm/cortex.tensorrt-llm/src/models/chat_completion_request.h index 1c98871bd..bac53f10f 100644 --- a/cpp/tensorrt_llm/cortex.tensorrt-llm/src/models/chat_completion_request.h +++ b/cpp/tensorrt_llm/cortex.tensorrt-llm/src/models/chat_completion_request.h @@ -9,7 +9,6 @@ struct ChatCompletionRequest { float temperature = 0.00001f; float frequency_penalty = 1.3; float presence_penalty = 0; - std::string model_id = "default"; Json::Value messages = Json::Value(Json::arrayValue); Json::Value stop = Json::Value(Json::arrayValue); }; @@ -23,7 +22,6 @@ inline ChatCompletionRequest fromJson(std::shared_ptr json_body) { request.temperature = json_body->get("temperature", 0.00001f).asFloat(); request.frequency_penalty = json_body->get("frequency_penalty", 1.3).asFloat(); request.presence_penalty = json_body->get("presence_penalty", 0).asFloat(); - request.model_id = json_body->get("model_id", "default").asString(); request.messages = json_body->operator[]("messages"); request.stop = json_body->operator[]("stop"); } diff --git a/cpp/tensorrt_llm/cortex.tensorrt-llm/src/models/load_model_request.h b/cpp/tensorrt_llm/cortex.tensorrt-llm/src/models/load_model_request.h index ff305df6d..7658ef762 100644 --- a/cpp/tensorrt_llm/cortex.tensorrt-llm/src/models/load_model_request.h +++ b/cpp/tensorrt_llm/cortex.tensorrt-llm/src/models/load_model_request.h @@ -7,8 +7,7 @@ namespace tensorrtllm::model { struct LoadModelRequest { int ctx_len = 2048; int n_parallel = 1; - std::string model_id = "default"; - std::string engine_path; + std::string model_path; std::string user_prompt = "<|im_end|>\n<|im_start|>user\n"; std::string ai_prompt = "<|im_end|>\n<|im_start|>user\n"; std::string system_prompt = "<|im_end|>\n<|im_start|>user\n"; @@ -19,8 +18,7 @@ inline LoadModelRequest fromJson(std::shared_ptr json_body) { if (json_body) { request.ctx_len = json_body->get("ctx_len", 2048).asInt(); request.n_parallel = json_body->get("n_parallel", 1).asInt(); - request.model_id = json_body->get("model_id", "default").asString(); - request.engine_path = json_body->get("engine_path", "").asString(); + request.model_path = json_body->get("model_path", "").asString(); request.user_prompt = json_body->get("user_prompt", "<|im_end|>\n<|im_start|>user\n").asString(); request.ai_prompt = json_body->get("ai_prompt", "<|im_end|>\n<|im_start|>assistant\n").asString(); request.system_prompt = json_body->get("system_prompt", "<|im_start|>system\n").asString(); diff --git 
a/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.cc b/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.cc index 5ad402972..9d449769f 100644 --- a/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.cc +++ b/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.cc @@ -26,29 +26,12 @@ constexpr const int k400BadRequest = 400; constexpr const int k409Conflict = 409; constexpr const int k500InternalServerError = 500; -void RemoveId(std::vector& vec, int id) { - vec.erase(std::remove(vec.begin(), vec.end(), id), vec.end()); -} - TensorrtllmEngine::~TensorrtllmEngine() {} -void TensorrtllmEngine::LoadModel( - std::shared_ptr json_body, - std::function&& callback) { - - LoadModelImpl(model::fromJson(json_body), std::move(callback)); -} -void TensorrtllmEngine::HandleChatCompletion( - std::shared_ptr json_body, - std::function&& callback) { - - HandleChatCompletionImpl(inferences::fromJson(json_body), std::move(callback)); +void RemoveId(std::vector& vec, int id) { + vec.erase(std::remove(vec.begin(), vec.end(), id), vec.end()); } -// ####################### -// ### IMPLEMENTATION #### -// ####################### - bool HandleMatch(std::string const& rew_text, std::shared_ptr infer_state) { if (infer_state->IsComplete()) { return false; @@ -97,13 +80,17 @@ GenerationInput TensorrtllmEngine::CreateGenerationInput(std::vector in input_ids_host, ITensor::makeShape({batchSize, input_len}), MemoryType::kGPU); GenerationInput generation_input{0, 0, input_ids, input_lengths, model_config->usePackedInput()}; generation_input.stopWordsList = GetTensorChatMLStopWordList(); + + LOG_INFO << "Create generation input successfully"; return generation_input; } GenerationOutput TensorrtllmEngine::CreateGenerationOutput() { GenerationOutput generation_output { - gpt_session->getBufferManager().emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32), - gpt_session->getBufferManager().emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32)}; + gpt_session->getBufferManager().emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32), + gpt_session->getBufferManager().emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32) + }; + LOG_INFO << "Create generation input successfully"; return generation_output; } @@ -117,12 +104,14 @@ void InferenceThread( int outputLen) { // Input preparation + LOG_INFO << "Inference thread started"; GenerationInput generation_input = self->CreateGenerationInput(input_ids_host); GenerationOutput generation_output = self->CreateGenerationOutput(); // Define the callback to stream each generated token generation_output.onTokenGenerated = [&infer_state, input_len, outputLen, self, &generation_output]( GenerationOutput::TensorPtr const& output_ids, SizeType step, bool finished) { + LOG_INFO << "Generating tokenizer in thread"; // Assuming the shape of output_ids tensor is (1, 1, 160), where 160 is the number of tokens int output_length = output_ids->getShape().d[2]; // Get the length of output IDs based on the tensor shape // Copy output IDs from GPU to host for printing @@ -159,7 +148,49 @@ void InferenceThread( self->gpt_session->generate(generation_output, generation_input, sampling_config); } -void TensorrtllmEngine::HandleChatCompletionImpl(inferences::ChatCompletionRequest&& request, std::function&& callback) { +inline std::string GetModelId(const Json::Value& json_body) { + // First check if model exists in request + if (!json_body["model"].isNull()) { + return json_body["model"].asString(); + } else if (!json_body["model_alias"].isNull()) 
{ + return json_body["model_alias"].asString(); + } + + // We check model_path for loadmodel request + auto input = json_body["model_path"]; + if (!input.isNull()) { + auto s = input.asString(); + std::replace(s.begin(), s.end(), '\\', '/'); + auto const pos = s.find_last_of('/'); + return s.substr(pos + 1); + } + return {}; +} + +bool TensorrtllmEngine::CheckModelLoaded(std::function& callback) { + if (!model_loaded_) { + LOG_WARN << "Model is not loaded yet"; + Json::Value json_resp; + json_resp["message"] = + "Model has not been loaded, please load model into cortex.tensorrt-llm"; + Json::Value status; + status["is_done"] = false; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = k409Conflict; + callback(std::move(status), std::move(json_resp)); + return false; + } + return true; +} + +//######################### +//### ENGINE END POINTS ### +//######################### + + +void TensorrtllmEngine::HandleChatCompletion(std::shared_ptr json_body, std::function&& callback) { + inferences::ChatCompletionRequest request = inferences::fromJson(json_body); std::string formatted_input = pre_prompt; nlohmann::json data; // data["stream"] = completion.stream; @@ -214,8 +245,8 @@ void TensorrtllmEngine::HandleChatCompletionImpl(inferences::ChatCompletionReque std::thread inference_thread(InferenceThread, infer_state, input_ids_host, callback, this, sampling_config, input_len, outputLen); inference_thread.detach(); // Detach the thread to allow it to run independently - this->q = std::make_unique(1, request.model_id); - this->q->runTaskInQueue([cb = std::move(callback), infer_state]() { + q_->runTaskInQueue([cb = std::move(callback), infer_state]() { + LOG_INFO << "Preparing to run inference task queue..."; while (true) { // Continuously check if the queue is not empty std::unique_lock lock(infer_state->queue_mutex); // Lock the queue for exclusive access if (!infer_state->texts_to_stream.empty()) { @@ -256,9 +287,7 @@ void TensorrtllmEngine::HandleChatCompletionImpl(inferences::ChatCompletionReque status["is_stream"] = true; status["status_code"] = k200OK; cb(std::move(status), std::move(resp_data)); - continue;; - } - else { + } else { // If the queue is empty, release the lock and wait before trying again lock.unlock(); } @@ -269,33 +298,31 @@ void TensorrtllmEngine::HandleChatCompletionImpl(inferences::ChatCompletionReque return; }; -void TensorrtllmEngine::LoadModelImpl(model::LoadModelRequest&& request, std::function&& callback) { - std::filesystem::path const engine_dir = request.engine_path; +void TensorrtllmEngine::LoadModel(std::shared_ptr json_body, std::function&& callback) { + model::LoadModelRequest request = model::fromJson(json_body); + std::filesystem::path model_dir = request.model_path; int ctx_len = request.ctx_len; this->user_prompt = request.user_prompt; this->ai_prompt = request.ai_prompt; this->system_prompt = request.system_prompt; + this->model_id_ = GetModelId(*json_body); logger = std::make_shared(); logger->setLevel(nvinfer1::ILogger::Severity::kINFO); - // Fixed settings - std::string const model_name = "mistral"; initTrtLlmPlugins(logger.get()); - // Load model configuration - std::filesystem::path json_file_name = engine_dir / "config.json"; - std::filesystem::path tokenizerModelName = engine_dir / "tokenizer.json"; - cortex_tokenizer = std::make_unique(tokenizerModelName.string()); - LOG_INFO << "Loaded tokenizer"; + std::filesystem::path tokenizer_model_name = model_dir / "tokenizer.model"; + cortex_tokenizer = 
std::make_unique(tokenizer_model_name.string()); + LOG_INFO << "Loaded tokenizer from " << tokenizer_model_name.string(); - auto const json = GptJsonConfig::parse(json_file_name); + std::filesystem::path json_file_name = model_dir / "config.json"; + auto json = GptJsonConfig::parse(json_file_name); auto config = json.getModelConfig(); model_config = std::make_unique(config); - auto const worldConfig = WorldConfig::mpi(1, json.getTensorParallelism(), json.getPipelineParallelism()); - auto const enginePath = engine_dir / json.engineFilename(worldConfig, model_name); - LOG_INFO << "Engine Path : " << enginePath.string(); - auto const dtype = model_config->getDataType(); + auto world_config = WorldConfig::mpi(1, json.getTensorParallelism(), json.getPipelineParallelism()); + LOG_INFO << "Loaded config from " << json_file_name.string(); + // auto dtype = model_config->getDataType(); // Currently doing fixed session config session_config.maxBatchSize = batchSize; @@ -304,25 +331,102 @@ void TensorrtllmEngine::LoadModelImpl(model::LoadModelRequest&& request, std::fu session_config.cudaGraphMode = true; // Fixed for simplicity // Init gpt_session - gpt_session = std::make_unique(session_config, *model_config, worldConfig, enginePath.string(), logger); + auto model_path = model_dir / json.engineFilename(world_config, model_id_); + gpt_session = std::make_unique(session_config, *model_config, world_config, model_path.string(), logger); + + model_loaded_ = true; + if (q_ == nullptr) { + q_ = std::make_unique(1, model_id_); + } + // Model loaded successfully + LOG_INFO << "Model " << model_id_ << " loaded successfully from path " << model_path.string(); Json::Value json_resp; json_resp["message"] = "Model loaded successfully"; Json::Value status_resp; status_resp["status_code"] = k200OK; callback(std::move(status_resp), std::move(json_resp)); - LOG_INFO << "Model loaded successfully: " << model_name; return; }; -void TensorrtllmEngine::Destroy(std::shared_ptr json_body, std::function&& callback) { - LOG_INFO << "Program is exitting, goodbye!"; - exit(0); - return; -}; +void TensorrtllmEngine::UnloadModel(std::shared_ptr json_body, std::function&& callback) { + if (!CheckModelLoaded(callback)) { + LOG_WARN << "Model was not loaded"; + Json::Value json_resp; + json_resp["message"] = "Model was not loaded"; + Json::Value status; + status["status_code"] = k400BadRequest; + callback(std::move(status), std::move(json_resp)); + return; + } + + gpt_session.reset(); + cortex_tokenizer.reset(); + q_.reset(); + model_config.reset(); + logger.reset(); + model_loaded_ = false; + + Json::Value json_resp; + json_resp["message"] = "Model unloaded successfully"; + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = k200OK; + callback(std::move(status), std::move(json_resp)); + LOG_INFO << "Model unloaded successfully"; +} + +void TensorrtllmEngine::HandleEmbedding( std::shared_ptr json_body, std::function&& callback) { + LOG_WARN << "Engine does not support embedding yet"; + Json::Value json_resp; + json_resp["message"] = "Engine does not support embedding yet"; + Json::Value status; + status["status_code"] = k409Conflict; + callback(std::move(status), std::move(json_resp)); +} + +void TensorrtllmEngine::GetModelStatus(std::shared_ptr json_body, std::function&& callback) { + LOG_WARN << "Engine does not support get model status method yet"; + Json::Value json_resp; + json_resp["message"] = "Engine does not support get model status method
yet"; + Json::Value status; + status["status_code"] = k409Conflict; + callback(std::move(status), std::move(json_resp)); +} + +void TensorrtllmEngine::GetModels( + std::shared_ptr json_body, + std::function&& callback) { + Json::Value json_resp; + Json::Value model_array = Json::arrayValue; + + if (model_loaded_) { + Json::Value val; + val["id"] = model_id_; + val["engine"] = "cortex.tensorrt-llm"; + val["start_time"] = start_time_; + val["vram"] = "-"; + val["ram"] = "-"; + val["object"] = "model"; + model_array.append(val); + } + + json_resp["object"] = "list"; + json_resp["data"] = model_array; + + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = k200OK; + callback(std::move(status), std::move(json_resp)); + LOG_INFO << "Running models responded"; +} extern "C" { -CortexTensorrtLlmEngineI* get_engine() { +EngineI* get_engine() { return new TensorrtllmEngine(); } } diff --git a/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.h b/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.h index debcf5e61..cc971f7eb 100644 --- a/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.h +++ b/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.h @@ -8,7 +8,7 @@ #include #include "NvInfer.h" -#include "base/cortex-common/cortextensorrtllmi.h" +#include "base/cortex-common/enginei.h" #include "models/chat_completion_request.h" #include "models/load_model_request.h" #include "sentencepiece_processor.h" @@ -27,49 +27,9 @@ using namespace tensorrt_llm::runtime; -// class Tokenizer { -// private: -// sentencepiece::SentencePieceProcessor processor; - -// void ReplaceSubstring(std::string& base, const std::string& from, const std::string& to) { -// size_t start_pos = 0; -// while ((start_pos = base.find(from, start_pos)) != std::string::npos) { -// base.replace(start_pos, from.length(), to); -// start_pos += to.length(); -// } -// } - -// public: -// Tokenizer(const std::string& model_path) { -// auto status = processor.Load(model_path); -// if (!status.ok()) { -// std::cerr << status.ToString() << std::endl; -// } -// LOG_INFO << "Successully loaded the tokenizer"; -// } - -// std::string DecodeWithSpace(const int id) { -// std::string text = processor.IdToPiece(id); -// ReplaceSubstring(text, "▁", " "); -// return text; -// } - -// std::string Decode(const std::vector ids) { -// std::string text = processor.DecodeIds(ids); -// return text; -// } - -// std::vector Encode(const std::string& input) { -// std::vector ids; -// processor.Encode(input, &ids); -// return ids; -// } -// }; - class Tokenizer { private: - std::unordered_map vocab; - std::unordered_map id_to_token; + sentencepiece::SentencePieceProcessor processor; void ReplaceSubstring(std::string& base, const std::string& from, const std::string& to) { size_t start_pos = 0; @@ -80,89 +40,78 @@ class Tokenizer { } public: - Tokenizer(const std::string& json_path) { - // Load tokenizer.json - std::ifstream file(json_path); - if (!file.is_open()) { - throw std::runtime_error("Failed to open tokenizer JSON file"); - } - - nlohmann::json tokenizer_json; - file >> tokenizer_json; - - // Parse vocabulary - vocab = tokenizer_json["model"]["vocab"].get>(); - for (const auto& [key, value] : vocab) { - id_to_token[value] = key; + Tokenizer(const std::string& model_path) { + auto status = processor.Load(model_path); + if (!status.ok()) { + std::cerr << status.ToString() << std::endl; } - - LOG_INFO << "Successfully loaded the tokenizer 
from JSON"; + LOG_INFO << "Successully loaded the tokenizer"; } std::string DecodeWithSpace(const int id) { - std::string text = id_to_token[id]; + std::string text = processor.IdToPiece(id); ReplaceSubstring(text, "▁", " "); return text; } - std::string Decode(const std::vector& ids) { - std::string text; - for (int id : ids) { - text += id_to_token[id]; - } - ReplaceSubstring(text, "▁", " "); + std::string Decode(const std::vector ids) { + std::string text = processor.DecodeIds(ids); return text; } std::vector Encode(const std::string& input) { std::vector ids; - std::string word; - for (char ch : input) { - word += ch; - if (vocab.find(word) != vocab.end()) { - ids.push_back(vocab[word]); - word.clear(); - } - } + processor.Encode(input, &ids); return ids; } }; struct InferenceState { - int prev_pos{0}; - std::string prev_text; - bool is_finished; - std::queue texts_to_stream; - std::mutex queue_mutex; // Mutex to protect access to textsToStream - size_t stop_word_match_len = 0; - std::vector sequence{"<", "|", "im", "_", "end", "|", ">"}; - int token_gen_count = 0; - - void Reset() { - stop_word_match_len = 0; - prev_text = ""; - } + int prev_pos{0}; + std::string prev_text; + bool is_finished; + std::queue texts_to_stream; + std::mutex queue_mutex; // Mutex to protect access to textsToStream + size_t stop_word_match_len = 0; + std::vector sequence{"<", "|", "im", "_", "end", "|", ">"}; + int token_gen_count = 0; + + void Reset() { + stop_word_match_len = 0; + prev_text = ""; + } - bool IsComplete() const { - return stop_word_match_len >= sequence.size(); - } + bool IsComplete() const { + return stop_word_match_len >= sequence.size(); + } }; namespace tensorrtllm { -class TensorrtllmEngine : public CortexTensorrtLlmEngineI { +class TensorrtllmEngine : public EngineI { public: ~TensorrtllmEngine() final; // ### Interface ### + void HandleChatCompletion( + std::shared_ptr json_body, + std::function&& callback) final; + void HandleEmbedding( + std::shared_ptr json_body, + std::function&& callback) final; void LoadModel( std::shared_ptr json_body, std::function&& callback) final; - void HandleChatCompletion( + void UnloadModel( + std::shared_ptr json_body, + std::function&& callback) final; + void GetModelStatus( + std::shared_ptr json_body, + std::function&& callback) final; + + // API to get running models. 
+ void GetModels( std::shared_ptr json_body, std::function&& callback) final; - void Destroy( - std::shared_ptr jsonBody, - std::function&& callback) final; GenerationInput::TensorPtr GetTensorSingleStopWordList(int stopToken); GenerationInput CreateGenerationInput(std::vector inputIds); @@ -172,10 +121,10 @@ class TensorrtllmEngine : public CortexTensorrtLlmEngineI { std::unique_ptr gpt_session; std::unique_ptr cortex_tokenizer; - void LoadModelImpl(model::LoadModelRequest&& request, std::function&& callback); - void HandleChatCompletionImpl(inferences::ChatCompletionRequest&& request, std::function&& callback); private: - std::unique_ptr q; + bool CheckModelLoaded( + std::function& callback); + GptSession::Config session_config{1, 1, 1}; SamplingConfig sampling_config{1}; std::unique_ptr model_config; @@ -185,6 +134,10 @@ class TensorrtllmEngine : public CortexTensorrtLlmEngineI { std::string system_prompt; std::string pre_prompt; int batchSize = 1; + std::string model_id_; + uint64_t start_time_; + std::atomic model_loaded_; + std::unique_ptr q_; }; } // namespace inferences diff --git a/cpp/tensorrt_llm/cortex.tensorrt-llm/src/utils/tensorrt-llm_utils.h b/cpp/tensorrt_llm/cortex.tensorrt-llm/src/utils/tensorrt-llm_utils.h index 6aec47012..c17d06ec7 100644 --- a/cpp/tensorrt_llm/cortex.tensorrt-llm/src/utils/tensorrt-llm_utils.h +++ b/cpp/tensorrt_llm/cortex.tensorrt-llm/src/utils/tensorrt-llm_utils.h @@ -14,7 +14,6 @@ // Include platform-specific headers #ifdef _WIN32 #include -#include #else #include #endif