diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml new file mode 100644 index 000000000..8e050404e --- /dev/null +++ b/.github/release-drafter.yml @@ -0,0 +1,26 @@ +categories: + - title: '🚀 Features' + labels: + - 'type: enhancement' + - 'type: epic' + - 'type: feature request' + - title: '🐛 Bug Fixes' + labels: + - 'type: bug' + - title: '🧰 Maintenance' + labels: + - 'type: chore' + - 'type: ci' + - title: '📖 Documentation' + labels: + - 'type: documentation' +change-template: '- $TITLE @$AUTHOR (#$NUMBER)' +change-title-escapes: '\<*_&' # You can add # and @ to disable mentions, and add ` to disable code blocks. +template: | + ## Changes + + $CHANGES + + ## Contributors + + $CONTRIBUTORS diff --git a/.github/runners/linux/Dockerfile.multi b/.github/runners/linux/Dockerfile.multi new file mode 100644 index 000000000..47989e687 --- /dev/null +++ b/.github/runners/linux/Dockerfile.multi @@ -0,0 +1,142 @@ +# Multi-stage Dockerfile +ARG BASE_IMAGE=nvcr.io/nvidia/pytorch +ARG BASE_TAG=24.02-py3 +ARG DEVEL_IMAGE=devel + +FROM ${BASE_IMAGE}:${BASE_TAG} as base + +# https://www.gnu.org/software/bash/manual/html_node/Bash-Startup-Files.html +# The default values come from `nvcr.io/nvidia/pytorch` +ENV BASH_ENV=${BASH_ENV:-/etc/bash.bashrc} +ENV ENV=${ENV:-/etc/shinit_v2} +SHELL ["/bin/bash", "-c"] + +FROM base as devel + +COPY docker/common/install_base.sh install_base.sh +RUN bash ./install_base.sh && rm install_base.sh + +COPY docker/common/install_cmake.sh install_cmake.sh +RUN bash ./install_cmake.sh && rm install_cmake.sh + +COPY docker/common/install_ccache.sh install_ccache.sh +RUN bash ./install_ccache.sh && rm install_ccache.sh + +# Download & install internal TRT release +ARG TRT_VER +ARG CUDA_VER +ARG CUDNN_VER +ARG NCCL_VER +ARG CUBLAS_VER +COPY docker/common/install_tensorrt.sh install_tensorrt.sh +RUN bash ./install_tensorrt.sh \ + --TRT_VER=${TRT_VER} \ + --CUDA_VER=${CUDA_VER} \ + --CUDNN_VER=${CUDNN_VER} \ + --NCCL_VER=${NCCL_VER} \ + --CUBLAS_VER=${CUBLAS_VER} && \ + rm install_tensorrt.sh + +# Install latest Polygraphy +COPY docker/common/install_polygraphy.sh install_polygraphy.sh +RUN bash ./install_polygraphy.sh && rm install_polygraphy.sh + +# Install mpi4py +COPY docker/common/install_mpi4py.sh install_mpi4py.sh +RUN bash ./install_mpi4py.sh && rm install_mpi4py.sh + +# Install PyTorch +ARG TORCH_INSTALL_TYPE="skip" +COPY docker/common/install_pytorch.sh install_pytorch.sh +RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh +COPY setup.py requirements.txt requirements-dev.txt ./ +
RUN pip install --no-cache-dir -r requirements-dev.txt + +RUN apt update && echo Y | apt install sudo cargo -y + +RUN cargo install sccache --locked + +ARG RUNNER_VERSION=2.317.0 + +ARG USER_ID=1000 +ARG USER_NAME=runner +ARG GROUP_ID=1000 +ARG GROUP_NAME=runner + +RUN (getent group ${GROUP_ID} || groupadd --gid ${GROUP_ID} ${GROUP_NAME}) && \ + (getent passwd ${USER_ID} || useradd --gid ${GROUP_ID} --uid ${USER_ID} --create-home --no-log-init --shell /bin/bash ${USER_NAME}) + + +RUN usermod -aG sudo ${USER_NAME} \ + && echo "%sudo ALL=(ALL:ALL) NOPASSWD:ALL" > /etc/sudoers \ + && echo "Defaults env_keep += \"DEBIAN_FRONTEND\"" >> /etc/sudoers + +ENV HOME=/home/runner + +# cd into the user directory, download and unzip the github actions runner +RUN cd /home/runner && mkdir actions-runner && cd actions-runner \ + && curl -O -L https://github.com/actions/runner/releases/download/v${RUNNER_VERSION}/actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz \ && 
tar xzf ./actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz + +RUN chown -R runner:runner /home/runner && /home/runner/actions-runner/bin/installdependencies.sh + +ADD docker/common/start.sh start.sh + +RUN chmod +x start.sh + +RUN sudo chmod +x /root/.cargo/bin/sccache + +USER runner + +ENV PATH=/root/.cargo/bin${PATH:+:${PATH}} + +ENTRYPOINT ["./start.sh"] + +FROM ${DEVEL_IMAGE} as wheel +WORKDIR /src/tensorrt_llm +COPY benchmarks benchmarks +COPY cpp cpp +COPY benchmarks benchmarks +COPY scripts scripts +COPY tensorrt_llm tensorrt_llm +COPY 3rdparty 3rdparty +COPY setup.py requirements.txt requirements-dev.txt ./ + +ARG BUILD_WHEEL_ARGS="--clean --trt_root /usr/local/tensorrt --python_bindings --benchmarks" +RUN python3 scripts/build_wheel.py ${BUILD_WHEEL_ARGS} + +FROM ${DEVEL_IMAGE} as release + +WORKDIR /app/tensorrt_llm +COPY --from=wheel /src/tensorrt_llm/build/tensorrt_llm*.whl . +RUN pip install tensorrt_llm*.whl --extra-index-url https://pypi.nvidia.com && \ + rm tensorrt_llm*.whl +COPY README.md ./ +COPY docs docs +COPY cpp/include include +RUN ln -sv $(python3 -c 'import site; print(f"{site.getsitepackages()[0]}/tensorrt_llm/libs")') lib && \ + test -f lib/libnvinfer_plugin_tensorrt_llm.so && \ + ln -sv lib/libnvinfer_plugin_tensorrt_llm.so lib/libnvinfer_plugin_tensorrt_llm.so.9 && \ + echo "/app/tensorrt_llm/lib" > /etc/ld.so.conf.d/tensorrt_llm.conf && \ + ldconfig +ARG SRC_DIR=/src/tensorrt_llm +COPY --from=wheel ${SRC_DIR}/benchmarks benchmarks +ARG CPP_BUILD_DIR=${SRC_DIR}/cpp/build +COPY --from=wheel \ + ${CPP_BUILD_DIR}/benchmarks/bertBenchmark \ + ${CPP_BUILD_DIR}/benchmarks/gptManagerBenchmark \ + ${CPP_BUILD_DIR}/benchmarks/gptSessionBenchmark \ + benchmarks/cpp/ +COPY examples examples +RUN chmod -R a+w examples && \ + rm -v \ + benchmarks/cpp/bertBenchmark.cpp \ + benchmarks/cpp/gptManagerBenchmark.cpp \ + benchmarks/cpp/gptSessionBenchmark.cpp \ + benchmarks/cpp/CMakeLists.txt +ARG GIT_COMMIT +ARG TRT_LLM_VER +ENV TRT_LLM_GIT_COMMIT=${GIT_COMMIT} \ + TRT_LLM_VERSION=${TRT_LLM_VER} + diff --git a/.github/runners/linux/start.sh b/.github/runners/linux/start.sh new file mode 100644 index 000000000..84d3c3dcd --- /dev/null +++ b/.github/runners/linux/start.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +RUNNER_REPO=$RUNNER_REPO +RUNNER_PAT=$RUNNER_PAT +RUNNER_GROUP=$RUNNER_GROUP +RUNNER_LABELS=$RUNNER_LABELS +RUNNER_NAME=$(hostname) + +cd /home/runner/actions-runner + +./config.sh --unattended --replace --url https://github.com/${RUNNER_REPO} --pat ${RUNNER_PAT} --name ${RUNNER_NAME} --runnergroup ${RUNNER_GROUP} --labels ${RUNNER_LABELS} --work /home/runner/actions-runner/_work + +cleanup() { + echo "Removing runner..." + ./config.sh remove --unattended --pat ${RUNNER_PAT} +} + +trap 'cleanup; exit 130' INT +trap 'cleanup; exit 143' TERM + +./run.sh & wait $! \ No newline at end of file diff --git a/.github/runners/windows/Dockerfile.window.runner b/.github/runners/windows/Dockerfile.window.runner new file mode 100644 index 000000000..b1cb5a7c9 --- /dev/null +++ b/.github/runners/windows/Dockerfile.window.runner @@ -0,0 +1,255 @@ +# https://learn.microsoft.com/en-us/visualstudio/install/build-tools-container?view=vs-2022 + +# Use the Windows Server Core 2019 image. +FROM mcr.microsoft.com/windows/servercore:ltsc2019 + +# Restore the default Windows shell for correct batch processing. 
+# (Used for VS Build Tools installation) +SHELL ["cmd", "/S", "/C"] + +# ----------------------------------------------------------------------------- + +# Install CUDA 12.2 + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + $ProgressPreference = 'SilentlyContinue'; \ + Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_537.13_windows.exe \ + -OutFile "cuda_installer.exe"; \ + Start-Process cuda_installer.exe -Wait -ArgumentList '-s'; \ + Remove-Item cuda_installer.exe -Force + +# ----------------------------------------------------------------------------- + +# Install Python 3.10.11 + +# Download and install Python +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + $ProgressPreference = 'SilentlyContinue'; \ + Invoke-WebRequest -Uri https://www.python.org/ftp/python/3.10.11/python-3.10.11-amd64.exe -OutFile python-3.10.11.exe ; \ + Start-Process python-3.10.11.exe -Wait -ArgumentList '/quiet InstallAllUsers=1 PrependPath=1' ; \ + Remove-Item python-3.10.11.exe -Force + +# Add python3 command +RUN powershell -Command \ + cp "\"C:\\\\Program Files\\\\Python310\\\\python.exe\" \"C:\\\\Program Files\\\\Python310\\\\python3.exe\"" + +# ----------------------------------------------------------------------------- + +# Install Microsoft MPI + +# The latest version is 10.1.3, but it requires you to get a temporary download +# link. +# https://learn.microsoft.com/en-us/message-passing-interface/microsoft-mpi-release-notes +# We use 10.1.1 which has a release on the GitHub page +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + $ProgressPreference = 'SilentlyContinue'; \ + Invoke-WebRequest -Uri https://github.com/microsoft/Microsoft-MPI/releases/download/v10.1.1/msmpisetup.exe \ + -OutFile "msmpisetup.exe"; \ + Start-Process .\msmpisetup.exe -Wait ; \ + Remove-Item msmpisetup.exe -Force + +# Add MPI binaries to Path +RUN setx Path "%Path%;C:\Program Files\Microsoft MPI\Bin" + +# Download the MSMPI SDK +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + $ProgressPreference = 'SilentlyContinue'; \ + Invoke-WebRequest -Uri https://github.com/microsoft/Microsoft-MPI/releases/download/v10.1.1/msmpisdk.msi \ + -OutFile "msmpisdk.msi"; \ + Start-Process msiexec.exe -Wait -ArgumentList '/I msmpisdk.msi /quiet'; \ + Remove-Item msmpisdk.msi -Force + +# ----------------------------------------------------------------------------- + +# Install CMake + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + $ProgressPreference = 'SilentlyContinue'; \ + Invoke-WebRequest -Uri https://github.com/Kitware/CMake/releases/download/v3.27.7/cmake-3.27.7-windows-x86_64.msi \ + -OutFile "cmake.msi"; \ + Start-Process msiexec.exe -Wait -ArgumentList '/I cmake.msi /quiet'; \ + Remove-Item cmake.msi -Force + +# Add CMake binaries to Path +RUN setx Path "%Path%;C:\Program Files\CMake\bin" + +# ----------------------------------------------------------------------------- + +# Install VS Build Tools + +RUN \ + # Download the Build Tools bootstrapper. + curl -SL --output vs_buildtools.exe https://aka.ms/vs/17/release/vs_buildtools.exe \ + \ + # Install Build Tools with the Microsoft.VisualStudio.Workload.AzureBuildTools workload, excluding workloads and components with known issues. 
+ && (start /w vs_buildtools.exe --quiet --wait --norestart --nocache \ + --installPath "%ProgramFiles(x86)%\Microsoft Visual Studio\2022\BuildTools" \ + --includeRecommended \ + --add Microsoft.VisualStudio.Workload.MSBuildTools \ + --add Microsoft.VisualStudio.Workload.VCTools \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.10240 \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.10586 \ + --remove Microsoft.VisualStudio.Component.Windows10SDK.14393 \ + --remove Microsoft.VisualStudio.Component.Windows81SDK \ + || IF "%ERRORLEVEL%"=="3010" EXIT 0) \ + \ + # Cleanup + && del /q vs_buildtools.exe + +# ----------------------------------------------------------------------------- + +# Install Vim (can delete this but it's nice to have) + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + $ProgressPreference = 'SilentlyContinue'; \ + Invoke-WebRequest -Uri https://ftp.nluug.nl/pub/vim/pc/gvim90.exe \ + -OutFile "install_vim.exe"; \ + Start-Process install_vim.exe -Wait -ArgumentList '/S'; \ + Remove-Item install_vim.exe -Force + +# Add Vim binaries to Path +RUN setx Path "%Path%;C:\Program Files (x86)\Vim\vim90" + +# ----------------------------------------------------------------------------- + +# Install Chocolatey +# Chocolatey is a package manager for Windows +# I probably could've used it to install some of the above, but I didn't... + +# If you try to install Chocolatey 2.0.0, it fails on .NET Framework 4.8 installation +# https://stackoverflow.com/a/76470753 +ENV chocolateyVersion=1.4.0 + +# https://docs.chocolatey.org/en-us/choco/setup#install-with-cmd.exe +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + powershell.exe -NoProfile -InputFormat None -ExecutionPolicy Bypass \ + -Command "[System.Net.ServicePointManager]::SecurityProtocol = 3072; \ + iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1'))" && \ + SET "PATH=%PATH%;%ALLUSERSPROFILE%\chocolatey\bin" + +# ----------------------------------------------------------------------------- + +# Install Git via Chocolatey +RUN powershell -Command \ + choco install git -y + +# ----------------------------------------------------------------------------- + +# Install CUDA 11.8 NVTX +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + $ProgressPreference = 'SilentlyContinue'; \ + Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cuda/11.8.0/network_installers/cuda_11.8.0_windows_network.exe \ + -OutFile cuda_11.8.0_windows_network.exe; \ + Invoke-WebRequest -Uri https://7-zip.org/a/7zr.exe \ + -OutFile 7zr.exe + +RUN \ + 7zr.exe e -i!"nsight_nvtx\nsight_nvtx\NVIDIA NVTX Installer.x86_64.Release.v1.21018621.Win64.msi" cuda_11.8.0_windows_network.exe &&\ + msiexec.exe /i "NVIDIA NVTX Installer.x86_64.Release.v1.21018621.Win64.msi" /norestart /quiet &&\ + del "NVIDIA NVTX Installer.x86_64.Release.v1.21018621.Win64.msi" &&\ + del 7zr.exe &&\ + del cuda_11.8.0_windows_network.exe + +# ----------------------------------------------------------------------------- + +# Create a working directory +WORKDIR "C:\\\\workspace" + +# ----------------------------------------------------------------------------- + +# Download and unzip TensorrRT 9.3.0.1 for TensorRT-LLM +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + $ProgressPreference = 'SilentlyContinue'; \ + Invoke-WebRequest -Uri 
https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/9.3.0/tensorrt-9.3.0.1.windows10.win10.cuda-12.2.llm.beta.zip \ + -OutFile TensorRT-9.3.0.1.zip; \ + Expand-Archive .\TensorRT-9.3.0.1.zip -DestinationPath .; \ + Move-Item -Path .\TensorRT-9.3.0.1.Windows10.win10.cuda-12.2.llm.beta\TensorRT-9.3.0.1 -Destination .; \ + Remove-Item TensorRT-9.3.0.1.Windows10.win10.cuda-12.2.llm.beta -Force; \ + Remove-Item TensorRT-9.3.0.1.zip -Force + +# Add TensorRT libs to Path +RUN setx Path "%Path%;C:\workspace\TensorRT-9.3.0.1\lib" + +# Install TensorRT Python wheel +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + pip install TensorRT-9.3.0.1\python\tensorrt-9.3.0.post12.dev1-cp310-none-win_amd64.whl + +# ----------------------------------------------------------------------------- + +# Download and unzip cuDNN 8.9.7.29 for TensorRT-LLM +# https://developer.nvidia.com/downloads/compute/cudnn/secure/8.9.7/local_installers/12.x/cudnn-windows-x86_64-8.9.7.29_cuda12-archive.zip +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + $ProgressPreference = 'SilentlyContinue'; \ + Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-8.9.7.29_cuda12-archive.zip \ + -OutFile cuDNN.zip; \ + Expand-Archive .\cuDNN.zip -DestinationPath .; \ + New-Item -Path cuDNN -ItemType Directory; \ + Move-Item -Path .\cudnn-windows-x86_64-8.9.7.29_cuda12-archive\* -Destination .\cuDNN; \ + Remove-Item cudnn-windows-x86_64-8.9.7.29_cuda12-archive -Force; \ + Remove-Item cuDNN.zip -Force + +# Add cuDNN libs and bin to Path. +RUN setx Path "%Path%;C:\workspace\cuDNN\lib;C:\workspace\cuDNN\bin;" + +# ----------------------------------------------------------------------------- + +# Define the entry point for the docker container. +# This entry point launches the 64-bit PowerShell developer shell. 
+# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA +ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"] + +# ----------------------------------------------------------------------------- +# COPY requirements-windows.txt C:\\workspace\\requirements-windows.txt +# COPY requirements-dev-windows.txt C:\\workspace\\requirements-dev-windows.txt +# RUN python3 -m pip --no-cache-dir install -r C:\workspace\requirements-dev-windows.txt +# RUN Remove-Item "C:\workspace\requirements-windows.txt" -Force +# RUN Remove-Item "C:\workspace\requirements-dev-windows.txt" -Force + +# The command below lets MSVC recognize the CUDA compiler +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + Copy-Item -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\visual_studio_integration\MSBuildExtensions\*' -Destination 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v170\BuildCustomizations' + +RUN powershell -Command \ + $ErrorActionPreference = 'Stop'; \ + Copy-Item -Path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\visual_studio_integration\MSBuildExtensions\*' -Destination 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v160\BuildCustomizations' + +RUN powershell -Command \ + choco install sccache make Ninja -y; + +# RUN [Environment]::SetEnvironmentVariable('Path', $Env:Path + ';C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools', [EnvironmentVariableTarget]::Machine) + +ADD ./requirements-dev-windows.txt ./requirements-dev-windows.txt +ADD ./requirements-windows.txt ./requirements-windows.txt + +RUN python3 -m pip install --no-cache-dir -r .\requirements-dev-windows.txt + +ARG RUNNER_VERSION=2.317.0 + +# Define the entry point for the docker container. +# This entry point launches the 64-bit PowerShell developer shell. 
+# We need to launch with amd64 arch otherwise Powershell defaults to x86 32-bit build commands which don't jive with CUDA +# ENTRYPOINT ["C:\\Program Files (x86)\\Microsoft Visual Studio\\2022\\BuildTools\\Common7\\Tools\\VsDevCmd.bat", "-arch=amd64", "&&", "powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass"] + +RUN Invoke-WebRequest \ + -Uri https://github.com/actions/runner/releases/download/v$env:RUNNER_VERSION/actions-runner-win-x64-$env:RUNNER_VERSION.zip \ + -OutFile runner.zip; \ + Expand-Archive -Path ./runner.zip -DestinationPath ./actions-runner; \ + Remove-Item -Path .\runner.zip; + +ADD runner.ps1 ./runner.ps1 + +RUN powershell -Command New-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 -PropertyType DWORD -Force + +CMD ["powershell.exe", "-ExecutionPolicy", "Unrestricted", "-File", ".\\runner.ps1"] \ No newline at end of file diff --git a/.github/runners/windows/requirements-dev-windows.txt b/.github/runners/windows/requirements-dev-windows.txt new file mode 100644 index 000000000..76c840cae --- /dev/null +++ b/.github/runners/windows/requirements-dev-windows.txt @@ -0,0 +1,17 @@ +-r requirements-windows.txt +--extra-index-url https://download.pytorch.org/whl/cu121 +datasets +einops +graphviz +mypy +parameterized +pre-commit +pybind11 +pybind11-stubgen +pytest-cov +pytest-forked +pytest-xdist +pywin32 +rouge_score +cloudpickle +typing-extensions==4.8.0 diff --git a/.github/runners/windows/requirements-windows.txt b/.github/runners/windows/requirements-windows.txt new file mode 100644 index 000000000..0bbd1f26a --- /dev/null +++ b/.github/runners/windows/requirements-windows.txt @@ -0,0 +1,30 @@ +--extra-index-url https://pypi.nvidia.com +accelerate==0.25.0 +build +colored +cuda-python==12.3.0 +diffusers==0.15.0 +mpi4py +numpy +onnx>=1.12.0 +polygraphy +psutil +pynvml>=11.5.0 +pulp +pandas +h5py==3.10.0 +pywin32 +StrEnum +sentencepiece>=0.1.99 +# WAR the new posting of "nvidia-cudnn-cu12~=9.0". +# "tensorrt==9.3.0.post12.dev1" specifies "nvidia-cudnn-cu12" but actually requires "nvidia-cudnn-cu12~=8.9". 
+nvidia-cudnn-cu12~=8.9; platform_machine == "x86_64" +tensorrt==9.3.0.post12.dev1 +tokenizers>=0.14 +# Default torch is CPU-only on Windows, so need to specify a torch version with GPU support +torch==2.2.0+cu121 +transformers==4.38.2 +wheel +optimum +evaluate +janus diff --git a/.github/runners/windows/runner.ps1 b/.github/runners/windows/runner.ps1 new file mode 100644 index 000000000..7ffa45de5 --- /dev/null +++ b/.github/runners/windows/runner.ps1 @@ -0,0 +1,3 @@ +$runnerName = (hostname.exe).Trim() +.\actions-runner\config.cmd --unattended --replace --url https://github.com/${env:RUNNER_REPO} --pat $env:RUNNER_PAT --runnergroup $env:RUNNER_GROUP --labels $env:RUNNER_LABELS --work $env:RUNNER_WORKDIR --name $runnerName +.\actions-runner\run.cmd \ No newline at end of file diff --git a/.github/workflows/auto_close_inactive_issues.yml b/.github/workflows/auto_close_inactive_issues.yml new file mode 100644 index 000000000..423adefbb --- /dev/null +++ b/.github/workflows/auto_close_inactive_issues.yml @@ -0,0 +1,25 @@ +# Ref: https://docs.github.com/en/actions/managing-issues-and-pull-requests/closing-inactive-issues +name: Close inactive issues +on: + schedule: + - cron: "30 1 * * *" + +jobs: + stale: + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + steps: + - uses: actions/stale@v9 + with: + days-before-issue-stale: 30 + days-before-issue-close: 15 + stale-issue-label: "stale" + exempt-issue-labels: "" + stale-issue-message: "This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 15 days." + close-issue-message: "This issue was closed because it has been stalled for 15 days with no activity." + days-before-pr-stale: -1 + days-before-pr-close: -1 + repo-token: ${{ secrets.GITHUB_TOKEN }} + debug-only: true diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 000000000..f59d8363f --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,217 @@ +name: CI + +on: + push: + tags: ["v[0-9]+.[0-9]+.[0-9]+"] + paths: + [ + ".github/scripts/**", + ".github/workflows/build.yml", + "**/CMakeLists.txt", + "**/Makefile", + "**/*.h", + "**/*.hpp", + "**/*.c", + "**/*.cpp", + "**/*.cu", + "**/*.cc", + "**/*.cxx", + "llama.cpp", + "!docs/**", + "!.gitignore", + "!README.md", + ] + workflow_dispatch: + +jobs: + create-draft-release: + runs-on: ubuntu-latest + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') + outputs: + upload_url: ${{ steps.create_release.outputs.upload_url }} + version: ${{ steps.get_version.outputs.version }} + permissions: + contents: write + steps: + - name: Extract tag name without v prefix + id: get_version + run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_ENV && echo "::set-output name=version::${GITHUB_REF#refs/tags/v}" + env: + GITHUB_REF: ${{ github.ref }} + - name: Create Draft Release + id: create_release + uses: actions/create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ github.ref_name }} + release_name: "${{ env.VERSION }}" + draft: true + prerelease: false + + build-and-test: + runs-on: ${{ matrix.runs-on }} + needs: [create-draft-release] + timeout-minutes: 1440 + strategy: + matrix: + include: + - os: "windows" + name: "cuda-12-2" + runs-on: "windows-tensorrt-llm-cuda-12-2" + run-e2e: false + s3-key-prefix: "windows-tensorrt-llm-ccache" + sccache-conf-path: 'C:\sccache.conf' + - os: "linux" + name: "cuda-12-3" + runs-on: "linux-tensorrt-llm-cuda-12-3" 
+ run-e2e: false + s3-key-prefix: "linux-tensorrt-llm" + sccache-conf-path: '/tmp/sccache.conf' + permissions: + contents: write + steps: + - name: Clone + id: checkout + uses: actions/checkout@v3 + with: + submodules: recursive + lfs: true + + - name: Install choco on Windows + if: runner.os == 'Windows' + run: | + choco install make pkgconfiglite ccache awscli -y + + - name: create sccache.conf file Linux + if: runner.os == 'Linux' + run: | + echo "[cache.s3]" > ${{ matrix.sccache-conf-path }} + echo 'bucket = "${{ secrets.MINIO_BUCKET_NAME }}"' >> ${{ matrix.sccache-conf-path }} + echo 'endpoint = "${{ secrets.MINIO_ENDPOINT }}"' >> ${{ matrix.sccache-conf-path }} + echo 'key_prefix = "${{ matrix.s3-key-prefix }}"' >> ${{ matrix.sccache-conf-path }} + echo 'use_ssl = false' >> ${{ matrix.sccache-conf-path }} + echo 'server_side_encryption = false' >> ${{ matrix.sccache-conf-path }} + echo 'no_credentials = false' >> ${{ matrix.sccache-conf-path }} + + - name: Download ccache from s3 + continue-on-error: true + if: runner.os == 'Windows' + run: | + Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1" + refreshenv + aws s3 cp s3://${{ secrets.MINIO_BUCKET_NAME }}/${{ matrix.s3-key-prefix }} C:\Users\ContainerAdministrator\AppData\Local\ccache --recursive --endpoint ${{ secrets.MINIO_ENDPOINT }} + env: + AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}" + AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}" + AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}" + + - name: start sccache server for linux + if: runner.os == 'Linux' + working-directory: cpp + run: | + sccache --start-server + env: + SCCACHE_BUCKET: "${{ secrets.MINIO_BUCKET_NAME }}" + SCCACHE_REGION: "${{ secrets.MINIO_REGION }}" + SCCACHE_ENDPOINT: "${{ secrets.MINIO_ENDPOINT }}" + SCCACHE_S3_USE_SSL: "false" + SCCACHE_S3_SERVER_SIDE_ENCRYPTION: "false" + SCCACHE_S3_KEY_PREFIX: "${{ matrix.s3-key-prefix }}" + SCCACHE_LOG: "debug" + SCCACHE_CONF: '${{ matrix.sccache-conf-path }}' + AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}" + AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}" + SCCACHE_IDLE_TIMEOUT: "0" + + # - name: start sccache server for windows + # if: runner.os == 'Windows' + # run: | + # sccache --start-server + # env: + # SCCACHE_IDLE_TIMEOUT: "0" + + - name: Build Dependencies + working-directory: cpp + run: | + make build-deps + + - name: Build + working-directory: cpp + run: | + make build + + - name: Pre-package + working-directory: cpp + run: | + make pre-package + + - name: Package + working-directory: cpp + run: | + make package + + - name: Upload Artifact + uses: actions/upload-artifact@v2 + with: + name: cortex.tensorrt-llm-${{ matrix.os }}-${{ matrix.name }} + path: cpp/tensorrt_llm/cortex.tensorrt-llm/cortex.tensorrt-llm.tar.gz + + - uses: actions/upload-release-asset@v1.0.1 + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ needs.create-draft-release.outputs.upload_url }} + asset_path: cpp/tensorrt_llm/cortex.tensorrt-llm/cortex.tensorrt-llm.tar.gz + asset_name: cortex.tensorrt-llm-${{ needs.create-draft-release.outputs.version }}-${{ matrix.os }}-${{ matrix.name }}.tar.gz + asset_content_type: application/gzip + + - name: Clean + if: always() + continue-on-error: true + run: | + sccache --stop-server + rm ${{ matrix.sccache-conf-path }} + + - name: Upload ccache to s3 + continue-on-error: true + if: always() && runner.os == 'Windows' + run: | + 
Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1" + refreshenv + aws s3 cp C:\Users\ContainerAdministrator\AppData\Local\ccache s3://${{ secrets.MINIO_BUCKET_NAME }}/${{ matrix.s3-key-prefix }} --recursive --endpoint ${{ secrets.MINIO_ENDPOINT }} + env: + AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}" + AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}" + AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}" + update_release_draft: + needs: [build-and-test] + permissions: + # write permission is required to create a github release + contents: write + # write permission is required for autolabeler + # otherwise, read permission is required at least + pull-requests: write + runs-on: ubuntu-latest + steps: + # (Optional) GitHub Enterprise requires GHE_HOST variable set + #- name: Set GHE_HOST + # run: | + # echo "GHE_HOST=${GITHUB_SERVER_URL##https:\/\/}" >> $GITHUB_ENV + + # Drafts your next Release notes as Pull Requests are merged into "master" + - uses: release-drafter/release-drafter@v5 + # (Optional) specify config name to use, relative to .github/. Default: release-drafter.yml + # with: + # config-name: my-config.yml + # disable-autolabeler: true + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Clean + if: always() + continue-on-error: true + run: | + sccache --stop-server + rm ${{ matrix.sccache-conf-path }} \ No newline at end of file diff --git a/.github/workflows/quality-gate.yml b/.github/workflows/quality-gate.yml new file mode 100644 index 000000000..533a2f226 --- /dev/null +++ b/.github/workflows/quality-gate.yml @@ -0,0 +1,136 @@ +name: CI Quality Gate + +on: + pull_request: + types: [opened, synchronize, reopened] + workflow_dispatch: + +jobs: + build-and-test: + runs-on: ${{ matrix.runs-on }} + timeout-minutes: 1440 + strategy: + fail-fast: false + matrix: + include: + - os: "windows" + name: "cuda-12-2" + runs-on: "windows-tensorrt-llm-cuda-12-2" + run-e2e: false + s3-key-prefix: "windows-tensorrt-llm-ccache" + sccache-conf-path: 'C:\sccache.conf' + - os: "linux" + name: "cuda-12-3" + runs-on: "linux-tensorrt-llm-cuda-12-3" + run-e2e: false + s3-key-prefix: "linux-tensorrt-llm" + sccache-conf-path: '/tmp/sccache.conf' + permissions: + contents: write + steps: + - name: Clone + id: checkout + uses: actions/checkout@v3 + with: + submodules: recursive + lfs: true + + - name: Install choco on Windows + if: runner.os == 'Windows' + run: | + choco install make pkgconfiglite ccache awscli -y + + - name: create sccache.conf file Linux + if: runner.os == 'Linux' + run: | + echo "[cache.s3]" > ${{ matrix.sccache-conf-path }} + echo 'bucket = "${{ secrets.MINIO_BUCKET_NAME }}"' >> ${{ matrix.sccache-conf-path }} + echo 'endpoint = "${{ secrets.MINIO_ENDPOINT }}"' >> ${{ matrix.sccache-conf-path }} + echo 'key_prefix = "${{ matrix.s3-key-prefix }}"' >> ${{ matrix.sccache-conf-path }} + echo 'use_ssl = false' >> ${{ matrix.sccache-conf-path }} + echo 'server_side_encryption = false' >> ${{ matrix.sccache-conf-path }} + echo 'no_credentials = false' >> ${{ matrix.sccache-conf-path }} + + - name: Download ccache from s3 + continue-on-error: true + if: runner.os == 'Windows' + run: | + Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1" + refreshenv + aws s3 cp s3://${{ secrets.MINIO_BUCKET_NAME }}/${{ matrix.s3-key-prefix }} C:\Users\ContainerAdministrator\AppData\Local\ccache --recursive --endpoint ${{ secrets.MINIO_ENDPOINT }} + env: + AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}" + AWS_SECRET_ACCESS_KEY: 
"${{ secrets.MINIO_SECRET_ACCESS_KEY }}" + AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}" + + - name: start sccache server for linux + if: runner.os == 'Linux' + working-directory: cpp + run: | + sccache --start-server + env: + SCCACHE_BUCKET: "${{ secrets.MINIO_BUCKET_NAME }}" + SCCACHE_REGION: "${{ secrets.MINIO_REGION }}" + SCCACHE_ENDPOINT: "${{ secrets.MINIO_ENDPOINT }}" + SCCACHE_S3_USE_SSL: "false" + SCCACHE_S3_SERVER_SIDE_ENCRYPTION: "false" + SCCACHE_S3_KEY_PREFIX: "${{ matrix.s3-key-prefix }}" + SCCACHE_LOG: "debug" + SCCACHE_CONF: '${{ matrix.sccache-conf-path }}' + AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}" + AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}" + SCCACHE_IDLE_TIMEOUT: "0" + + # - name: start sccache server for windows + # if: runner.os == 'Windows' + # run: | + # sccache --start-server + # env: + # SCCACHE_IDLE_TIMEOUT: "0" + + - name: Build Dependencies + working-directory: cpp + run: | + make build-deps + + - name: Build + working-directory: cpp + run: | + make build + + - name: Pre-package + working-directory: cpp + run: | + make pre-package + + - name: Package + working-directory: cpp + run: | + make package + + - name: Upload Artifact + uses: actions/upload-artifact@v2 + with: + name: cortex.tensorrt-llm-${{ matrix.os }}-${{ matrix.name }} + path: cpp/tensorrt_llm/cortex.tensorrt-llm/cortex.tensorrt-llm.tar.gz + + - name: Clean + if: always() + continue-on-error: true + run: | + sccache --stop-server + rm ${{ matrix.sccache-conf-path }} + + - name: Upload ccache to s3 + continue-on-error: true + if: always() && runner.os == 'Windows' + run: | + Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1" + refreshenv + aws s3 cp C:\Users\ContainerAdministrator\AppData\Local\ccache s3://${{ secrets.MINIO_BUCKET_NAME }}/${{ matrix.s3-key-prefix }} --recursive --endpoint ${{ secrets.MINIO_ENDPOINT }} + env: + AWS_ACCESS_KEY_ID: "${{ secrets.MINIO_ACCESS_KEY_ID }}" + AWS_SECRET_ACCESS_KEY: "${{ secrets.MINIO_SECRET_ACCESS_KEY }}" + AWS_DEFAULT_REGION: "${{ secrets.MINIO_REGION }}" + + \ No newline at end of file diff --git a/cpp/Makefile b/cpp/Makefile new file mode 100644 index 000000000..fc469e2f6 --- /dev/null +++ b/cpp/Makefile @@ -0,0 +1,104 @@ +# Makefile for Cortex cortex.tensorrt-llm engine - Build, Lint, Test, and Clean +.PHONY: all build package run-e2e-test + +RUN_TESTS ?= false +CODE_SIGN ?= false +AZURE_KEY_VAULT_URI ?= xxxx +AZURE_CLIENT_ID ?= xxxx +AZURE_TENANT_ID ?= xxxx +AZURE_CLIENT_SECRET ?= xxxx +AZURE_CERT_NAME ?= xxxx + +# Default target, does nothing +all: + @echo "Specify a target to run" + +# Build the Cortex engine +build-deps: +ifeq ($(OS),Windows_NT) + @powershell -Command "cd tensorrt_llm/cortex.tensorrt-llm; cmake -S ./third-party -B ./build_deps/third-party -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -DCMAKE_BUILD_TYPE=Release -DCMAKE_OBJECT_PATH_MAX=500; cmake --build ./build_deps/third-party --config Release -j8;" +else + @cd tensorrt_llm/cortex.tensorrt-llm && cmake -S ./third-party -B ./build_deps/third-party -DCMAKE_BUILD_TYPE=Release -DCMAKE_OBJECT_PATH_MAX=500 && make -C ./build_deps/third-party -j 10 && rm -rf ./build_deps/third-party; +endif + +# Build the Cortex engine +build: +ifeq ($(OS),Windows_NT) + @powershell -Command "cd ..; python .\scripts\build_wheel.py -a '80-real;86-real;89-real' --trt_root 'C:\workspace\TensorRT-9.3.0.1\' -D 'BUILD_CORTEX_TENSORRT-LLM=ON' --use_ccache" + @powershell -Command "cd build; 
cmake .. -DCMAKE_CUDA_ARCHITECTURES='80-real;86-real;89-real' -DTRT_LIB_DIR='C:/workspace/TensorRT-9.3.0.1/lib' -DTRT_INCLUDE_DIR='C:/workspace/TensorRT-9.3.0.1/include' -DBUILD_CORTEX_TENSORRT-LLM=ON -DCMAKE_CUDA_COMPILER='C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.2/bin/nvcc.exe' -DENABLE_MULTI_DEVICE=0 -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache -G Ninja; cmake --build . --parallel 2 --config Release" +else + @mkdir -p build && cd build; \ + cmake .. -GNinja -DBUILD_CORTEX_TENSORRT-LLM=ON -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CUDA_COMPILER_LAUNCHER=sccache -DCMAKE_BUILD_TYPE='Release' -DBUILD_PYT='OFF' -DBUILD_PYBIND='OFF' -DNVTX_DISABLE='ON' -DCMAKE_CUDA_ARCHITECTURES='80-real;86-real;89-real' '-DENABLE_MULTI_DEVICE=0' '-DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc' -DBUILD_BENCHMARKS=OFF '-DBUILD_TESTS=OFF' -DTRT_LIB_DIR=/usr/local/tensorrt/lib -DTRT_INCLUDE_DIR=/usr/local/tensorrt/include; \ + cmake --build . --config Release; +endif + +# Prepackage the Cortex engine +pre-package: +ifeq ($(OS),Windows_NT) + @powershell -Command "cd tensorrt_llm\cortex.tensorrt-llm\; mkdir -p cortex.tensorrt-llm; cp ..\..\build\tensorrt_llm\cortex.tensorrt-llm\engine.dll cortex.tensorrt-llm\;" + @powershell -Command "cd tensorrt_llm\cortex.tensorrt-llm\; cp -Force C:\workspace\TensorRT-9.3.0.1\lib\nvinfer.dll cortex.tensorrt-llm\;" + @powershell -Command "cd tensorrt_llm\cortex.tensorrt-llm\; cp -Force C:\workspace\cuDNN\bin\cudnn64_8.dll cortex.tensorrt-llm\;" + @powershell -Command "cd tensorrt_llm\cortex.tensorrt-llm\; cp -Force C:\workspace\cuDNN\bin\cudnn_ops_infer64_8.dll cortex.tensorrt-llm\;" + @powershell -Command "cd tensorrt_llm\cortex.tensorrt-llm\; cp -Force ..\..\build\tensorrt_llm\plugins\nvinfer_plugin_tensorrt_llm.dll cortex.tensorrt-llm\;" + @powershell -Command "cd tensorrt_llm\cortex.tensorrt-llm\; cp -Force ..\..\build\tensorrt_llm\tensorrt_llm.dll cortex.tensorrt-llm\;" + @powershell -Command "cd tensorrt_llm\cortex.tensorrt-llm\; cp -Force C:\Windows\System32\msmpi.dll cortex.tensorrt-llm\;" + @powershell -Command "cd tensorrt_llm\cortex.tensorrt-llm\; cp -Force .\build_deps\_install\bin\zlib.dll cortex.tensorrt-llm\;" +else + cd ./tensorrt_llm/cortex.tensorrt-llm && \ + mkdir -p cortex.tensorrt-llm && \ + cp ../../build/tensorrt_llm/cortex.tensorrt-llm/libengine.$(shell uname | tr '[:upper:]' '[:lower:]' | sed 's/darwin/dylib/;s/linux/so/') cortex.tensorrt-llm && \ + cp /usr/local/tensorrt/targets/x86_64-linux-gnu/lib/libnvinfer.so.9 cortex.tensorrt-llm && \ + cp /usr/lib/x86_64-linux-gnu/libcudnn.so.8 cortex.tensorrt-llm && \ + cp /usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8 cortex.tensorrt-llm && \ + cp /home/runner/actions-runner/_work/cortex.tensorrt-llm/cortex.tensorrt-llm/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so.9 cortex.tensorrt-llm && \ + cp /home/runner/actions-runner/_work/cortex.tensorrt-llm/cortex.tensorrt-llm/cpp/build/tensorrt_llm/libtensorrt_llm.so cortex.tensorrt-llm && \ + cp /opt/hpcx/ompi/lib/libmpi.so.40 cortex.tensorrt-llm && \ + cp /usr/lib/x86_64-linux-gnu/libnccl.so cortex.tensorrt-llm +endif + +codesign: +ifeq ($(CODE_SIGN),false) + @echo "Skipping Code Sign" + @exit 0 +endif + +ifeq ($(OS),Windows_NT) + @powershell -Command "dotnet tool install --global AzureSignTool;" +endif + +package: +ifeq ($(OS),Windows_NT) + @powershell -Command "cd tensorrt_llm\cortex.tensorrt-llm\; 7z a -ttar temp.tar cortex.tensorrt-llm\*; 7z a -tgzip 
cortex.tensorrt-llm.tar.gz temp.tar;" +else + @cd tensorrt_llm/cortex.tensorrt-llm && \ + tar -czvf cortex.tensorrt-llm.tar.gz cortex.tensorrt-llm +endif + +run-e2e-test: +ifeq ($(RUN_TESTS),false) + @echo "Skipping tests" + @exit 0 +endif +ifeq ($(OS),Windows_NT) + @powershell -Command "echo hello" +else + echo "hello" +endif + +run-python-e2e-test: +ifeq ($(RUN_TESTS),false) + @echo "Skipping tests" + @exit 0 +endif +ifeq ($(OS),Windows_NT) + echo hello +else + echo hello +endif + +clean: +ifeq ($(OS),Windows_NT) + echo hello +else + echo "hello" +endif \ No newline at end of file diff --git a/cpp/tensorrt_llm/cortex.tensorrt-llm/CMakeLists.txt b/cpp/tensorrt_llm/cortex.tensorrt-llm/CMakeLists.txt index 5c5f25a25..0dde5a31b 100644 --- a/cpp/tensorrt_llm/cortex.tensorrt-llm/CMakeLists.txt +++ b/cpp/tensorrt_llm/cortex.tensorrt-llm/CMakeLists.txt @@ -15,13 +15,12 @@ # C++17 # engine init include(CheckIncludeFileCXX) - check_include_file_cxx(any HAS_ANY) check_include_file_cxx(string_view HAS_STRING_VIEW) check_include_file_cxx(coroutine HAS_COROUTINE) if(HAS_ANY - AND HAS_STRING_VIEW - AND HAS_COROUTINE) + AND HAS_STRING_VIEW + AND HAS_COROUTINE) set(CMAKE_CXX_STANDARD 20) elseif(HAS_ANY AND HAS_STRING_VIEW) set(CMAKE_CXX_STANDARD 17) @@ -29,10 +28,10 @@ else() set(CMAKE_CXX_STANDARD 14) endif() - -set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) +SET(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) + set(CMAKE_PREFIX_PATH ${CMAKE_CURRENT_SOURCE_DIR}/build_deps/_install) message(STATUS "Current Source Directory CORTEX: ${CMAKE_CURRENT_SOURCE_DIR}") @@ -41,9 +40,6 @@ message(STATUS "Current Cmake Prefix Path of CORTEX: ${CMAKE_PREFIX_PATH}") set(OPENSSL_USE_STATIC_LIBS TRUE) - -# Enable pkg-config support in CMake -find_package(PkgConfig REQUIRED) find_library(TRANTOR NAMES trantor HINTS "${CMAKE_PREFIX_PATH}/lib" @@ -53,9 +49,9 @@ find_library(JSONCPP HINTS "${CMAKE_PREFIX_PATH}/lib" ) -# Use pkg-config to find the SentencePiece library - if(NOT WIN32) # Linux + # Enable pkg-config support in CMake + find_package(PkgConfig REQUIRED) # Use pkg-config to find the SentencePiece library pkg_search_module(SENTENCEPIECE REQUIRED sentencepiece) else() # Windows @@ -77,8 +73,6 @@ add_custom_target(engine_proj) set(CXXOPTS_SRC_DIR ${PROJECT_SOURCE_DIR}/../3rdparty/cxxopts) add_subdirectory(${CXXOPTS_SRC_DIR} ${CMAKE_CURRENT_BINARY_DIR}/cxxopts) -# main -# add_executable(engine main.cc) add_library(engine SHARED src/tensorrt-llm_engine.cc) target_link_libraries( engine PUBLIC ${SHARED_TARGET} nvinfer_plugin_tensorrt_llm cxxopts::cxxopts sentencepiece PRIVATE ${JSONCPP} ${TRANTOR} ${CMAKE_THREAD_LIBS_INIT} ) diff --git a/cpp/tensorrt_llm/cortex.tensorrt-llm/Makefile b/cpp/tensorrt_llm/cortex.tensorrt-llm/Makefile index cbf60a948..1628262a3 100644 --- a/cpp/tensorrt_llm/cortex.tensorrt-llm/Makefile +++ b/cpp/tensorrt_llm/cortex.tensorrt-llm/Makefile @@ -34,21 +34,34 @@ endif package: ifeq ($(OS),Windows_NT) + @powershell -Command "mkdir -p cortex.tensorrt-llm; cp ..\..\build\tensorrt_llm\cortex.tensorrt-llm\engine.dll cortex.tensorrt-llm\;" + @powershell -Command "cp -Force C:\workspace\TensorRT-9.3.0.1\lib\nvinfer.dll cortex.tensorrt-llm\;" + @powershell -Command "cp -Force C:\workspace\cuDNN\bin\cudnn64_8.dll cortex.tensorrt-llm\;" + @powershell -Command "cp -Force C:\workspace\cuDNN\bin\cudnn_ops_infer64_8.dll cortex.tensorrt-llm\;" + @powershell -Command "cp -Force ..\..\build\tensorrt_llm\plugins\nvinfer_plugin_tensorrt_llm.dll cortex.tensorrt-llm\;" + @powershell 
-Command "cp -Force ..\..\build\tensorrt_llm\tensorrt_llm.dll cortex.tensorrt-llm\;" + @powershell -Command "cp -Force C:\Windows\System32\msmpi.dll cortex.tensorrt-llm\;" + @powershell -Command "cp -Force .\build_deps\_install\bin\zlib.dll cortex.tensorrt-llm\;" + @powershell -Command "7z a -ttar temp.tar cortex.tensorrt-llm\*; 7z a -tgzip cortex.tensorrt-llm.tar.gz temp.tar;" else @mkdir -p cortex.tensorrt-llm && \ cp ../../build/tensorrt_llm/cortex.tensorrt-llm/libengine.$(shell uname | tr '[:upper:]' '[:lower:]' | sed 's/darwin/dylib/;s/linux/so/') cortex.tensorrt-llm && \ - cp /usr/local/cuda-12.4/targets/x86_64-linux/lib/libcublas.so.12 cortex.tensorrt-llm && \ - cp /usr/local/cuda-12.4/targets/x86_64-linux/lib/libcublas.so.12.4.2.65 cortex.tensorrt-llm && \ - cp /usr/local/cuda-12.4/targets/x86_64-linux/lib/libcublasLt.so.12 cortex.tensorrt-llm && \ - cp /usr/local/cuda-12.4/targets/x86_64-linux/lib/libcublasLt.so.12.4.2.65 cortex.tensorrt-llm && \ - cp /usr/local/tensorrt/targets/x86_64-linux-gnu/lib/libnvinfer.so.10 cortex.tensorrt-llm && \ - cp /usr/local/tensorrt/targets/x86_64-linux-gnu/lib/libnvinfer.so.10.0.1 cortex.tensorrt-llm && \ - cp /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so cortex.tensorrt-llm && \ - cp /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so.10 cortex.tensorrt-llm && \ - cp /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libtensorrt_llm.so cortex.tensorrt-llm && \ - cp /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libtensorrt_llm_nvrtc_wrapper.so cortex.tensorrt-llm && \ + cp /usr/local/tensorrt/targets/x86_64-linux-gnu/lib/libnvinfer.so.9 cortex.tensorrt-llm && \ + cp /usr/local/tensorrt/targets/x86_64-linux-gnu/lib/libnvinfer.so.9.3.0 cortex.tensorrt-llm && \ + cp /usr/lib/x86_64-linux-gnu/libcudnn.so.8 cortex.tensorrt-llm && \ + cp /usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.7 cortex.tensorrt-llm && \ + cp /usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8 cortex.tensorrt-llm && \ + cp /usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.7 cortex.tensorrt-llm && \ + cp /home/runner/actions-runner/_work/cortex.tensorrt-llm/cortex.tensorrt-llm/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so cortex.tensorrt-llm && \ + cp /home/runner/actions-runner/_work/cortex.tensorrt-llm/cortex.tensorrt-llm/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so.9 cortex.tensorrt-llm && \ + cp /home/runner/actions-runner/_work/cortex.tensorrt-llm/cortex.tensorrt-llm/cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so.9.3.0 cortex.tensorrt-llm && \ + cp /home/runner/actions-runner/_work/cortex.tensorrt-llm/cortex.tensorrt-llm/cpp/build/tensorrt_llm/libtensorrt_llm.so cortex.tensorrt-llm && \ + cp /opt/hpcx/ompi/lib/libmpi.so cortex.tensorrt-llm && \ + cp /opt/hpcx/ompi/lib/libmpi.so.40 cortex.tensorrt-llm && \ + cp /opt/hpcx/ompi/lib/libmpi.so.40.30.5 cortex.tensorrt-llm && \ + cp /usr/lib/x86_64-linux-gnu/libnccl.so cortex.tensorrt-llm && \ cp /usr/lib/x86_64-linux-gnu/libnccl.so.2 cortex.tensorrt-llm && \ - cp /usr/lib/x86_64-linux-gnu/libnccl.so.2.20.5 cortex.tensorrt-llm && \ + cp /usr/lib/x86_64-linux-gnu/libnccl.so.2.19.3 cortex.tensorrt-llm && \ tar -czvf cortex.tensorrt-llm.tar.gz cortex.tensorrt-llm endif diff --git a/cpp/tensorrt_llm/cortex.tensorrt-llm/base/cortex-common/cortextensorrtllmi.h b/cpp/tensorrt_llm/cortex.tensorrt-llm/base/cortex-common/cortextensorrtllmi.h deleted file mode 100644 index 
681d4d0cd..000000000 --- a/cpp/tensorrt_llm/cortex.tensorrt-llm/base/cortex-common/cortextensorrtllmi.h +++ /dev/null @@ -1,21 +0,0 @@ -#pragma once - -#include -#include - -#include "json/value.h" - -class CortexTensorrtLlmEngineI { - public: - virtual ~CortexTensorrtLlmEngineI() {} - - virtual void HandleChatCompletion( - std::shared_ptr jsonBody, - std::function&& callback) = 0; - virtual void LoadModel( - std::shared_ptr jsonBody, - std::function&& callback) = 0; - virtual void Destroy( - std::shared_ptr jsonBody, - std::function&& callback) = 0; -}; diff --git a/cpp/tensorrt_llm/cortex.tensorrt-llm/base/cortex-common/enginei.h b/cpp/tensorrt_llm/cortex.tensorrt-llm/base/cortex-common/enginei.h new file mode 100644 index 000000000..bcc0d81b7 --- /dev/null +++ b/cpp/tensorrt_llm/cortex.tensorrt-llm/base/cortex-common/enginei.h @@ -0,0 +1,44 @@ +#pragma once + +#include +#include + +#include "json/value.h" + +// Interface for inference engine. +// Note: only append new function to keep the compatibility. +class EngineI { + public: + virtual ~EngineI() {} + + virtual void HandleChatCompletion( + std::shared_ptr json_body, + std::function&& callback) = 0; + virtual void HandleEmbedding( + std::shared_ptr json_body, + std::function&& callback) = 0; + virtual void LoadModel( + std::shared_ptr json_body, + std::function&& callback) = 0; + virtual void UnloadModel( + std::shared_ptr json_body, + std::function&& callback) = 0; + virtual void GetModelStatus( + std::shared_ptr json_body, + std::function&& callback) = 0; + + // For backward compatible checking, add to list when we add more APIs + virtual bool IsSupported(const std::string& f) { + if (f == "HandleChatCompletion" || f == "HandleEmbedding" || + f == "UnloadModel" || f == "GetModelStatus" || + f == "GetModels") { + return true; + } + return false; + } + + // API to get running models. 
+ virtual void GetModels( + std::shared_ptr json_body, + std::function&& callback) = 0; +}; diff --git a/cpp/tensorrt_llm/cortex.tensorrt-llm/examples/server/server.h b/cpp/tensorrt_llm/cortex.tensorrt-llm/examples/server/server.h index 471725403..b38242d69 100644 --- a/cpp/tensorrt_llm/cortex.tensorrt-llm/examples/server/server.h +++ b/cpp/tensorrt_llm/cortex.tensorrt-llm/examples/server/server.h @@ -1,6 +1,6 @@ #pragma once -#include "cortex-common/cortextensorrtllmi.h" +#include "cortex-common/enginei.h" #include "dylib.h" #include @@ -11,7 +11,7 @@ class Server { public: Server() { dylib_ = std::make_unique("./engines/cortex.tensorrt-llm", "engine"); - auto func = dylib_->get_function("get_engine"); + auto func = dylib_->get_function("get_engine"); engine_ = func(); } @@ -23,7 +23,7 @@ class Server { public: std::unique_ptr dylib_; - CortexTensorrtLlmEngineI* engine_; + EngineI* engine_; struct SyncQueue { void push(std::pair&& p) { diff --git a/cpp/tensorrt_llm/cortex.tensorrt-llm/src/models/chat_completion_request.h b/cpp/tensorrt_llm/cortex.tensorrt-llm/src/models/chat_completion_request.h index 1c98871bd..bac53f10f 100644 --- a/cpp/tensorrt_llm/cortex.tensorrt-llm/src/models/chat_completion_request.h +++ b/cpp/tensorrt_llm/cortex.tensorrt-llm/src/models/chat_completion_request.h @@ -9,7 +9,6 @@ struct ChatCompletionRequest { float temperature = 0.00001f; float frequency_penalty = 1.3; float presence_penalty = 0; - std::string model_id = "default"; Json::Value messages = Json::Value(Json::arrayValue); Json::Value stop = Json::Value(Json::arrayValue); }; @@ -23,7 +22,6 @@ inline ChatCompletionRequest fromJson(std::shared_ptr json_body) { request.temperature = json_body->get("temperature", 0.00001f).asFloat(); request.frequency_penalty = json_body->get("frequency_penalty", 1.3).asFloat(); request.presence_penalty = json_body->get("presence_penalty", 0).asFloat(); - request.model_id = json_body->get("model_id", "default").asString(); request.messages = json_body->operator[]("messages"); request.stop = json_body->operator[]("stop"); } diff --git a/cpp/tensorrt_llm/cortex.tensorrt-llm/src/models/load_model_request.h b/cpp/tensorrt_llm/cortex.tensorrt-llm/src/models/load_model_request.h index ff305df6d..7658ef762 100644 --- a/cpp/tensorrt_llm/cortex.tensorrt-llm/src/models/load_model_request.h +++ b/cpp/tensorrt_llm/cortex.tensorrt-llm/src/models/load_model_request.h @@ -7,8 +7,7 @@ namespace tensorrtllm::model { struct LoadModelRequest { int ctx_len = 2048; int n_parallel = 1; - std::string model_id = "default"; - std::string engine_path; + std::string model_path; std::string user_prompt = "<|im_end|>\n<|im_start|>user\n"; std::string ai_prompt = "<|im_end|>\n<|im_start|>user\n"; std::string system_prompt = "<|im_end|>\n<|im_start|>user\n"; @@ -19,8 +18,7 @@ inline LoadModelRequest fromJson(std::shared_ptr json_body) { if (json_body) { request.ctx_len = json_body->get("ctx_len", 2048).asInt(); request.n_parallel = json_body->get("n_parallel", 1).asInt(); - request.model_id = json_body->get("model_id", "default").asString(); - request.engine_path = json_body->get("engine_path", "").asString(); + request.model_path = json_body->get("model_path", "").asString(); request.user_prompt = json_body->get("user_prompt", "<|im_end|>\n<|im_start|>user\n").asString(); request.ai_prompt = json_body->get("ai_prompt", "<|im_end|>\n<|im_start|>assistant\n").asString(); request.system_prompt = json_body->get("system_prompt", "<|im_start|>system\n").asString(); diff --git 
a/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.cc b/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.cc index 5ad402972..9d449769f 100644 --- a/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.cc +++ b/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.cc @@ -26,29 +26,12 @@ constexpr const int k400BadRequest = 400; constexpr const int k409Conflict = 409; constexpr const int k500InternalServerError = 500; -void RemoveId(std::vector& vec, int id) { - vec.erase(std::remove(vec.begin(), vec.end(), id), vec.end()); -} - TensorrtllmEngine::~TensorrtllmEngine() {} -void TensorrtllmEngine::LoadModel( - std::shared_ptr json_body, - std::function&& callback) { - - LoadModelImpl(model::fromJson(json_body), std::move(callback)); -} -void TensorrtllmEngine::HandleChatCompletion( - std::shared_ptr json_body, - std::function&& callback) { - - HandleChatCompletionImpl(inferences::fromJson(json_body), std::move(callback)); +void RemoveId(std::vector& vec, int id) { + vec.erase(std::remove(vec.begin(), vec.end(), id), vec.end()); } -// ####################### -// ### IMPLEMENTATION #### -// ####################### - bool HandleMatch(std::string const& rew_text, std::shared_ptr infer_state) { if (infer_state->IsComplete()) { return false; @@ -97,13 +80,17 @@ GenerationInput TensorrtllmEngine::CreateGenerationInput(std::vector in input_ids_host, ITensor::makeShape({batchSize, input_len}), MemoryType::kGPU); GenerationInput generation_input{0, 0, input_ids, input_lengths, model_config->usePackedInput()}; generation_input.stopWordsList = GetTensorChatMLStopWordList(); + + LOG_INFO << "Create generation input successfully"; return generation_input; } GenerationOutput TensorrtllmEngine::CreateGenerationOutput() { GenerationOutput generation_output { - gpt_session->getBufferManager().emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32), - gpt_session->getBufferManager().emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32)}; + gpt_session->getBufferManager().emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32), + gpt_session->getBufferManager().emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32) + }; + LOG_INFO << "Create generation input successfully"; return generation_output; } @@ -117,12 +104,14 @@ void InferenceThread( int outputLen) { // Input preparation + LOG_INFO << "Inference thread started"; GenerationInput generation_input = self->CreateGenerationInput(input_ids_host); GenerationOutput generation_output = self->CreateGenerationOutput(); // Define the callback to stream each generated token generation_output.onTokenGenerated = [&infer_state, input_len, outputLen, self, &generation_output]( GenerationOutput::TensorPtr const& output_ids, SizeType step, bool finished) { + LOG_INFO << "Generating tokenizer in thread"; // Assuming the shape of output_ids tensor is (1, 1, 160), where 160 is the number of tokens int output_length = output_ids->getShape().d[2]; // Get the length of output IDs based on the tensor shape // Copy output IDs from GPU to host for printing @@ -159,7 +148,49 @@ void InferenceThread( self->gpt_session->generate(generation_output, generation_input, sampling_config); } -void TensorrtllmEngine::HandleChatCompletionImpl(inferences::ChatCompletionRequest&& request, std::function&& callback) { +inline std::string GetModelId(const Json::Value& json_body) { + // First check if model exists in request + if (!json_body["model"].isNull()) { + return json_body["model"].asString(); + } else if (!json_body["model_alias"].isNull()) 
{ + return json_body["model_alias"].asString(); + } + + // We check model_path for loadmodel request + auto input = json_body["model_path"]; + if (!input.isNull()) { + auto s = input.asString(); + std::replace(s.begin(), s.end(), '\\', '/'); + auto const pos = s.find_last_of('/'); + return s.substr(pos + 1); + } + return {}; +} + +bool TensorrtllmEngine::CheckModelLoaded(std::function& callback) { + if (!model_loaded_) { + LOG_WARN << "Model is not loaded yet"; + Json::Value json_resp; + json_resp["message"] = + "Model has not been loaded, please load model into cortex.tensorrt-llm"; + Json::Value status; + status["is_done"] = false; + status["has_error"] = true; + status["is_stream"] = false; + status["status_code"] = k409Conflict; + callback(std::move(status), std::move(json_resp)); + return false; + } + return true; +} + +//######################### +//### ENGINE END POINTS ### +//######################### + + +void TensorrtllmEngine::HandleChatCompletion(std::shared_ptr json_body, std::function&& callback) { + inferences::ChatCompletionRequest request = inferences::fromJson(json_body); std::string formatted_input = pre_prompt; nlohmann::json data; // data["stream"] = completion.stream; @@ -214,8 +245,8 @@ void TensorrtllmEngine::HandleChatCompletionImpl(inferences::ChatCompletionReque std::thread inference_thread(InferenceThread, infer_state, input_ids_host, callback, this, sampling_config, input_len, outputLen); inference_thread.detach(); // Detach the thread to allow it to run independently - this->q = std::make_unique(1, request.model_id); - this->q->runTaskInQueue([cb = std::move(callback), infer_state]() { + q_->runTaskInQueue([cb = std::move(callback), infer_state]() { + LOG_INFO << "Preparing to run inference task queue..."; while (true) { // Continuously check if the queue is not empty std::unique_lock lock(infer_state->queue_mutex); // Lock the queue for exclusive access if (!infer_state->texts_to_stream.empty()) { @@ -256,9 +287,7 @@ void TensorrtllmEngine::HandleChatCompletionImpl(inferences::ChatCompletionReque status["is_stream"] = true; status["status_code"] = k200OK; cb(std::move(status), std::move(resp_data)); - continue;; - } - else { + } else { // If the queue is empty, release the lock and wait before trying again lock.unlock(); } @@ -269,33 +298,31 @@ void TensorrtllmEngine::HandleChatCompletionImpl(inferences::ChatCompletionReque return; }; -void TensorrtllmEngine::LoadModelImpl(model::LoadModelRequest&& request, std::function&& callback) { - std::filesystem::path const engine_dir = request.engine_path; +void TensorrtllmEngine::LoadModel(std::shared_ptr json_body, std::function&& callback) { + model::LoadModelRequest request = model::fromJson(json_body); + std::filesystem::path model_dir = request.model_path; int ctx_len = request.ctx_len; this->user_prompt = request.user_prompt; this->ai_prompt = request.ai_prompt; this->system_prompt = request.system_prompt; + this->model_id_ = GetModelId(*json_body); logger = std::make_shared(); logger->setLevel(nvinfer1::ILogger::Severity::kINFO); - // Fixed settings - std::string const model_name = "mistral"; initTrtLlmPlugins(logger.get()); - // Load model configuration - std::filesystem::path json_file_name = engine_dir / "config.json"; - std::filesystem::path tokenizerModelName = engine_dir / "tokenizer.json"; - cortex_tokenizer = std::make_unique(tokenizerModelName.string()); - LOG_INFO << "Loaded tokenizer"; + std::filesystem::path tokenizer_model_name = model_dir / "tokenizer.model"; + cortex_tokenizer = 
std::make_unique(tokenizer_model_name.string()); + LOG_INFO << "Loaded tokenizer from " << tokenizer_model_name.string(); - auto const json = GptJsonConfig::parse(json_file_name); + std::filesystem::path json_file_name = model_dir / "config.json"; + auto json = GptJsonConfig::parse(json_file_name); auto config = json.getModelConfig(); model_config = std::make_unique(config); - auto const worldConfig = WorldConfig::mpi(1, json.getTensorParallelism(), json.getPipelineParallelism()); - auto const enginePath = engine_dir / json.engineFilename(worldConfig, model_name); - LOG_INFO << "Engine Path : " << enginePath.string(); - auto const dtype = model_config->getDataType(); + auto world_config = WorldConfig::mpi(1, json.getTensorParallelism(), json.getPipelineParallelism()); + LOG_INFO << "Loaded config from " << json_file_name.string(); + // auto dtype = model_config->getDataType(); // Currently doing fixed session config session_config.maxBatchSize = batchSize; @@ -304,25 +331,102 @@ void TensorrtllmEngine::LoadModelImpl(model::LoadModelRequest&& request, std::fu session_config.cudaGraphMode = true; // Fixed for simplicity // Init gpt_session - gpt_session = std::make_unique(session_config, *model_config, worldConfig, enginePath.string(), logger); + auto model_path = model_dir / json.engineFilename(world_config, model_id_); + gpt_session = std::make_unique(session_config, *model_config, world_config, model_path.string(), logger); + + model_loaded_ = true; + if (q_ == nullptr) { + q_ = std::make_unique(1, model_id_); + } + // Model loaded successfully + LOG_INFO << "Model " << model_id_ << " loaded successfully from path " << model_path.string(); Json::Value json_resp; json_resp["message"] = "Model loaded successfully"; Json::Value status_resp; status_resp["status_code"] = k200OK; callback(std::move(status_resp), std::move(json_resp)); - LOG_INFO << "Model loaded successfully: " << model_name; return; }; -void TensorrtllmEngine::Destroy(std::shared_ptr json_body, std::function&& callback) { - LOG_INFO << "Program is exitting, goodbye!"; - exit(0); - return; -}; +void TensorrtllmEngine::UnloadModel(std::shared_ptr json_body, std::function&& callback) { + if (!CheckModelLoaded(callback)) { + LOG_WARN << "Model was not loaded"; + Json::Value json_resp; + json_resp["message"] = "Model was not loaded"; + Json::Value status; + status["status_code"] = k400BadRequest; + callback(std::move(status), std::move(json_resp)); + return; + } + + gpt_session.reset(); + cortex_tokenizer.reset(); + q_.reset(); + model_config.reset(); + logger.reset(); + model_loaded_ = false; + + Json::Value json_resp; + json_resp["message"] = "Model unloaded successfully"; + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = k200OK; + callback(std::move(status), std::move(json_resp)); + LOG_INFO << "Model unloaded successfully"; +} + +void TensorrtllmEngine::HandleEmbedding( std::shared_ptr json_body, std::function&& callback) { + LOG_WARN << "Engine does not support embedding yet"; + Json::Value json_resp; + json_resp["message"] = "Engine does not support embedding yet"; + Json::Value status; + status["status_code"] = k409Conflict; + callback(std::move(status), std::move(json_resp)); +} + +void TensorrtllmEngine::GetModelStatus(std::shared_ptr json_body, std::function&& callback) { + LOG_WARN << "Engine does not support get model status method yet"; + Json::Value json_resp; + json_resp["message"] = "Engine does not support get model status method
yet"; + Json::Value status; + status["status_code"] = k409Conflict; + callback(std::move(status), std::move(json_resp)); +} + +void TensorrtllmEngine::GetModels( + std::shared_ptr json_body, + std::function&& callback) { + Json::Value json_resp; + Json::Value model_array = Json::arrayValue; + + if (model_loaded_) { + Json::Value val; + val["id"] = model_id_; + val["engine"] = "cortex.tensorrt-llm"; + val["start_time"] = start_time_; + val["vram"] = "-"; + val["ram"] = "-"; + val["object"] = "model"; + model_array.append(val); + } + + json_resp["object"] = "list"; + json_resp["data"] = model_array; + + Json::Value status; + status["is_done"] = true; + status["has_error"] = false; + status["is_stream"] = false; + status["status_code"] = k200OK; + callback(std::move(status), std::move(json_resp)); + LOG_INFO << "Running models responded"; +} extern "C" { -CortexTensorrtLlmEngineI* get_engine() { +EngineI* get_engine() { return new TensorrtllmEngine(); } } diff --git a/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.h b/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.h index debcf5e61..cc971f7eb 100644 --- a/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.h +++ b/cpp/tensorrt_llm/cortex.tensorrt-llm/src/tensorrt-llm_engine.h @@ -8,7 +8,7 @@ #include #include "NvInfer.h" -#include "base/cortex-common/cortextensorrtllmi.h" +#include "base/cortex-common/enginei.h" #include "models/chat_completion_request.h" #include "models/load_model_request.h" #include "sentencepiece_processor.h" @@ -27,49 +27,9 @@ using namespace tensorrt_llm::runtime; -// class Tokenizer { -// private: -// sentencepiece::SentencePieceProcessor processor; - -// void ReplaceSubstring(std::string& base, const std::string& from, const std::string& to) { -// size_t start_pos = 0; -// while ((start_pos = base.find(from, start_pos)) != std::string::npos) { -// base.replace(start_pos, from.length(), to); -// start_pos += to.length(); -// } -// } - -// public: -// Tokenizer(const std::string& model_path) { -// auto status = processor.Load(model_path); -// if (!status.ok()) { -// std::cerr << status.ToString() << std::endl; -// } -// LOG_INFO << "Successully loaded the tokenizer"; -// } - -// std::string DecodeWithSpace(const int id) { -// std::string text = processor.IdToPiece(id); -// ReplaceSubstring(text, "▁", " "); -// return text; -// } - -// std::string Decode(const std::vector ids) { -// std::string text = processor.DecodeIds(ids); -// return text; -// } - -// std::vector Encode(const std::string& input) { -// std::vector ids; -// processor.Encode(input, &ids); -// return ids; -// } -// }; - class Tokenizer { private: - std::unordered_map vocab; - std::unordered_map id_to_token; + sentencepiece::SentencePieceProcessor processor; void ReplaceSubstring(std::string& base, const std::string& from, const std::string& to) { size_t start_pos = 0; @@ -80,89 +40,78 @@ class Tokenizer { } public: - Tokenizer(const std::string& json_path) { - // Load tokenizer.json - std::ifstream file(json_path); - if (!file.is_open()) { - throw std::runtime_error("Failed to open tokenizer JSON file"); - } - - nlohmann::json tokenizer_json; - file >> tokenizer_json; - - // Parse vocabulary - vocab = tokenizer_json["model"]["vocab"].get>(); - for (const auto& [key, value] : vocab) { - id_to_token[value] = key; + Tokenizer(const std::string& model_path) { + auto status = processor.Load(model_path); + if (!status.ok()) { + std::cerr << status.ToString() << std::endl; } - - LOG_INFO << "Successfully loaded the tokenizer 
from JSON"; + LOG_INFO << "Successully loaded the tokenizer"; } std::string DecodeWithSpace(const int id) { - std::string text = id_to_token[id]; + std::string text = processor.IdToPiece(id); ReplaceSubstring(text, "▁", " "); return text; } - std::string Decode(const std::vector& ids) { - std::string text; - for (int id : ids) { - text += id_to_token[id]; - } - ReplaceSubstring(text, "▁", " "); + std::string Decode(const std::vector ids) { + std::string text = processor.DecodeIds(ids); return text; } std::vector Encode(const std::string& input) { std::vector ids; - std::string word; - for (char ch : input) { - word += ch; - if (vocab.find(word) != vocab.end()) { - ids.push_back(vocab[word]); - word.clear(); - } - } + processor.Encode(input, &ids); return ids; } }; struct InferenceState { - int prev_pos{0}; - std::string prev_text; - bool is_finished; - std::queue texts_to_stream; - std::mutex queue_mutex; // Mutex to protect access to textsToStream - size_t stop_word_match_len = 0; - std::vector sequence{"<", "|", "im", "_", "end", "|", ">"}; - int token_gen_count = 0; - - void Reset() { - stop_word_match_len = 0; - prev_text = ""; - } + int prev_pos{0}; + std::string prev_text; + bool is_finished; + std::queue texts_to_stream; + std::mutex queue_mutex; // Mutex to protect access to textsToStream + size_t stop_word_match_len = 0; + std::vector sequence{"<", "|", "im", "_", "end", "|", ">"}; + int token_gen_count = 0; + + void Reset() { + stop_word_match_len = 0; + prev_text = ""; + } - bool IsComplete() const { - return stop_word_match_len >= sequence.size(); - } + bool IsComplete() const { + return stop_word_match_len >= sequence.size(); + } }; namespace tensorrtllm { -class TensorrtllmEngine : public CortexTensorrtLlmEngineI { +class TensorrtllmEngine : public EngineI { public: ~TensorrtllmEngine() final; // ### Interface ### + void HandleChatCompletion( + std::shared_ptr json_body, + std::function&& callback) final; + void HandleEmbedding( + std::shared_ptr json_body, + std::function&& callback) final; void LoadModel( std::shared_ptr json_body, std::function&& callback) final; - void HandleChatCompletion( + void UnloadModel( + std::shared_ptr json_body, + std::function&& callback) final; + void GetModelStatus( + std::shared_ptr json_body, + std::function&& callback) final; + + // API to get running models. 
+ void GetModels( std::shared_ptr json_body, std::function&& callback) final; - void Destroy( - std::shared_ptr jsonBody, - std::function&& callback) final; GenerationInput::TensorPtr GetTensorSingleStopWordList(int stopToken); GenerationInput CreateGenerationInput(std::vector inputIds); @@ -172,10 +121,10 @@ class TensorrtllmEngine : public CortexTensorrtLlmEngineI { std::unique_ptr gpt_session; std::unique_ptr cortex_tokenizer; - void LoadModelImpl(model::LoadModelRequest&& request, std::function&& callback); - void HandleChatCompletionImpl(inferences::ChatCompletionRequest&& request, std::function&& callback); private: - std::unique_ptr q; + bool CheckModelLoaded( + std::function& callback); + GptSession::Config session_config{1, 1, 1}; SamplingConfig sampling_config{1}; std::unique_ptr model_config; @@ -185,6 +134,10 @@ class TensorrtllmEngine : public CortexTensorrtLlmEngineI { std::string system_prompt; std::string pre_prompt; int batchSize = 1; + std::string model_id_; + uint64_t start_time_; + std::atomic model_loaded_; + std::unique_ptr q_; }; } // namespace inferences diff --git a/cpp/tensorrt_llm/cortex.tensorrt-llm/src/utils/tensorrt-llm_utils.h b/cpp/tensorrt_llm/cortex.tensorrt-llm/src/utils/tensorrt-llm_utils.h index 6aec47012..c17d06ec7 100644 --- a/cpp/tensorrt_llm/cortex.tensorrt-llm/src/utils/tensorrt-llm_utils.h +++ b/cpp/tensorrt_llm/cortex.tensorrt-llm/src/utils/tensorrt-llm_utils.h @@ -14,7 +14,6 @@ // Include platform-specific headers #ifdef _WIN32 #include -#include #else #include #endif