From 66481c48dc8dad16bc460da231c616d4f176cf4d Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Tue, 3 Sep 2024 09:38:48 -0700 Subject: [PATCH 1/2] Multi-Node EKS Support --- .../1. Create_EKS_Cluster.md | 197 ++++++++ .../2. Configure_EKS_Cluster.md | 326 +++++++++++++ .../3. Deploy_Triton.md | 444 ++++++++++++++++++ .../EKS_Multinode_Triton_TRTLLM/README.md | 17 + .../eks_cluster_config.yaml | 52 ++ .../aws-efa-k8s-device-plugin/.helmignore | 23 + .../aws-efa-k8s-device-plugin/Chart.yaml | 9 + .../aws-efa-k8s-device-plugin/README.md | 38 ++ .../templates/NOTES.txt | 1 + .../templates/_helpers.tpl | 62 +++ .../templates/daemonset.yaml | 78 +++ .../aws-efa-k8s-device-plugin/values.yaml | 145 ++++++ .../multinode_helm_chart/chart/Chart.yaml | 20 + .../chart/example_values.yaml | 60 +++ .../chart/templates/NOTES.txt | 25 + .../chart/templates/deployment.yaml | 249 ++++++++++ .../chart/templates/hpa.yaml | 42 ++ .../chart/templates/pod-monitor.yaml | 34 ++ .../chart/templates/rbac.yaml | 78 +++ .../chart/templates/service.yaml | 44 ++ .../chart/values.schema.json | 282 +++++++++++ .../multinode_helm_chart/chart/values.yaml | 105 +++++ .../multinode_helm_chart/containers/README.md | 26 + .../multinode_helm_chart/containers/kubessh | 19 + .../multinode_helm_chart/containers/server.py | 279 +++++++++++ .../containers/triton_trt_llm.containerfile | 167 +++++++ .../multinode_helm_chart/gen_ai_perf.yaml | 18 + .../multinode_helm_chart/nccl_test.yaml | 96 ++++ .../nvidia_dcgm-exporter_values.yaml | 107 +++++ ...vidia_gpu-feature-discovery_daemonset.yaml | 87 ++++ .../multinode_helm_chart/setup_ssh_efs.yaml | 29 ++ .../triton-metrics_prometheus-rule.yaml | 38 ++ .../p5-trtllm-cluster-config.yaml | 70 +++ .../pvc/claim.yaml | 11 + .../EKS_Multinode_Triton_TRTLLM/pvc/pv.yaml | 15 + .../pvc/storageclass.yaml | 5 + 36 files changed, 3298 insertions(+) create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/1. Create_EKS_Cluster.md create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/2. Configure_EKS_Cluster.md create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/3. 
Deploy_Triton.md create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/README.md create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/eks_cluster_config.yaml create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/.helmignore create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/Chart.yaml create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/README.md create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/templates/NOTES.txt create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/templates/_helpers.tpl create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/templates/daemonset.yaml create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/values.yaml create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/Chart.yaml create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/example_values.yaml create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/NOTES.txt create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/deployment.yaml create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/hpa.yaml create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/pod-monitor.yaml create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/rbac.yaml create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/service.yaml create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/values.schema.json create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/values.yaml create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/containers/README.md create mode 100755 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/containers/kubessh create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/containers/server.py create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/containers/triton_trt_llm.containerfile create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/gen_ai_perf.yaml create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/nccl_test.yaml create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/nvidia_dcgm-exporter_values.yaml create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/nvidia_gpu-feature-discovery_daemonset.yaml create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/setup_ssh_efs.yaml create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/triton-metrics_prometheus-rule.yaml create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/p5-trtllm-cluster-config.yaml create mode 100644 
Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/pvc/claim.yaml create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/pvc/pv.yaml create mode 100644 Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/pvc/storageclass.yaml diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/1. Create_EKS_Cluster.md b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/1. Create_EKS_Cluster.md new file mode 100644 index 00000000..832a5d04 --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/1. Create_EKS_Cluster.md @@ -0,0 +1,197 @@ +# Steps to create EKS cluster with EFS + +## 1. Install CLIs + +### a. Install AWS CLI (steps [here](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)) + +``` +sudo apt install unzip +curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" +unzip awscliv2.zip +sudo ./aws/install +``` + +### b. Install Kubernetes CLI (steps [here](https://docs.aws.amazon.com/eks/latest/userguide/install-kubectl.html)) + +``` +curl -O https://s3.us-west-2.amazonaws.com/amazon-eks/1.30.0/2024-05-12/bin/linux/amd64/kubectl +chmod +x ./kubectl +mkdir -p $HOME/bin && cp ./kubectl $HOME/bin/kubectl && export PATH=$HOME/bin:$PATH +echo 'export PATH=$HOME/bin:$PATH' >> ~/.bashrc +``` + +### c. Install EKS CLI (steps [here](https://eksctl.io/installation/)) + +``` +ARCH=amd64 +PLATFORM=$(uname -s)_$ARCH +curl -sLO "https://github.com/eksctl-io/eksctl/releases/latest/download/eksctl_$PLATFORM.tar.gz" +curl -sL "https://github.com/eksctl-io/eksctl/releases/latest/download/eksctl_checksums.txt" | grep $PLATFORM | sha256sum --check +tar -xzf eksctl_$PLATFORM.tar.gz -C /tmp && rm eksctl_$PLATFORM.tar.gz +sudo mv /tmp/eksctl /usr/local/bin +``` + +### d. Install Helm CLI (steps [here](https://docs.aws.amazon.com/eks/latest/userguide/helm.html)) + +``` +curl https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 > get_helm.sh +chmod 700 get_helm.sh +./get_helm.sh +``` + +## 2. Create an EKS cluster + +In this example we create an EKS cluster consisting of two `g5.12xlarge` compute nodes, each with four NVIDIA A10G GPUs and `c5.2xlarge` CPU node as control plane. We also setup EFA between the compute nodes. + +### a. Configure AWS CLI + +``` +aws configure +``` + +### b. Create a config file for EKS cluster creation + +We have provided an example file here: [eks_cluster_config.yaml](./eks_cluster_config.yaml) + +``` +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig + +metadata: + name: wenhant-eks-cluster + version: "1.30" + region: us-east-1 + +availabilityZones: + - us-east-1a + - us-east-1b + - us-east-1c + - us-east-1d + - us-east-1e + - us-east-1f + +iam: + withOIDC: true + +managedNodeGroups: + - name: sys-nodes-2 + instanceType: c5.2xlarge + minSize: 1 + desiredCapacity: 1 + maxSize: 1 + availabilityZones: ["us-east-1a"] + iam: + withAddonPolicies: + imageBuilder: true + autoScaler: true + ebs: true + efs: true + awsLoadBalancerController: true + cloudWatch: true + albIngress: true + + - name: efa-compute-ng-2 + instanceType: g5.12xlarge + minSize: 1 + desiredCapacity: 1 + maxSize: 1 + volumeSize: 300 + efaEnabled: true + privateNetworking: true + availabilityZones: ["us-east-1a"] + iam: + withAddonPolicies: + imageBuilder: true + autoScaler: true + ebs: true + efs: true + awsLoadBalancerController: true + cloudWatch: true + albIngress: true +``` + +> [!NOTE] +> We set `minSize` and `desiredCapacity` to be 1 because AWS does not create your cluster successfully if no nodes are available. 
For example, if you specify `desiredCapacity` to be 2 but 2 nodes are not available, your cluster creation will fail due to a timeout even though there are no errors. The easiest way to avoid this is to create the cluster with 1 node and increase the number of nodes later in the EKS console. After you increase the number of nodes in your node groups, make sure the GPU nodes are in the same subnet. This is required for EFA to work. + +### c. Create the EKS cluster + +``` +eksctl create cluster -f eks_cluster_config.yaml +``` + +## 3. Create an EFS file system + +To enable multiple pods deployed to multiple nodes to load shards of the same model so that they can be used in coordination to serve inference requests too large to be loaded by a single GPU, we'll need a common, shared storage location. In Kubernetes, these common, shared storage locations are referred to as persistent volumes. Persistent volumes can be volume-mapped into any number of pods and then accessed by processes running inside those pods as if they were part of the pod's file system. We will be using EFS as the persistent volume. + +Additionally, we will need to create a persistent-volume claim which can be used to assign the persistent volume to a pod. + +### a. Create an IAM role + +Follow the steps to create an IAM role for your EFS file system: https://docs.aws.amazon.com/eks/latest/userguide/efs-csi.html#efs-create-iam-resources. This role will be used later when you install the EFS CSI Driver. + +### b. Install EFS CSI driver + +Install the EFS CSI Driver through the Amazon EKS add-on in the AWS console: https://docs.aws.amazon.com/eks/latest/userguide/efs-csi.html#efs-install-driver. Once it's done, check the Add-ons section in the EKS console; you should see the driver showing `Active` under Status. + +### c. Create EFS file system + +Follow the steps to create an EFS file system: https://github.com/kubernetes-sigs/aws-efs-csi-driver/blob/master/docs/efs-create-filesystem.md. Make sure you mount the subnets correctly in the last step. This will affect whether your nodes are able to access the created EFS file system. + +## 4. Test + +Follow the steps to check if your EFS file system is working properly with your nodes: https://github.com/kubernetes-sigs/aws-efs-csi-driver/tree/master/examples/kubernetes/multiple_pods. This test mounts your EFS file system on all of your available nodes and writes a text file to the file system. + +## 5. Create a PVC for the created EFS file system + +We have provided an example here: [pvc](./pvc/). This folder contains three files: `pv.yaml`, `claim.yaml`, and `storageclass.yaml`. Make sure you modify the `pv.yaml` file and change the `volumeHandle` value to your own EFS file system ID.
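+ +If you are not sure which file system ID to use, you can look it up with the AWS CLI. Below is a minimal sketch, assuming the CLI is configured for the same account and region as your cluster: + +``` +# List EFS file system IDs and names in the current region +aws efs describe-file-systems --query 'FileSystems[*].{ID:FileSystemId,Name:Name}' --output table +```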
+ +pv.yaml + +``` +apiVersion: v1 +kind: PersistentVolume +metadata: + name: efs-pv +spec: + capacity: + storage: 200Gi + volumeMode: Filesystem + accessModes: + - ReadWriteMany + persistentVolumeReclaimPolicy: Retain + storageClassName: efs-sc + csi: + driver: efs.csi.aws.com + volumeHandle: fs-0cf1f987d6f5af59c # Change to your own ID +``` + +claim.yaml + +``` +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: efs-claim +spec: + accessModes: + - ReadWriteMany + storageClassName: efs-sc + resources: + requests: + storage: 200Gi +``` + +storageclass.yaml + +``` +kind: StorageClass +apiVersion: storage.k8s.io/v1 +metadata: + name: efs-sc +provisioner: efs.csi.aws.com +``` + +Run the below command to deploy: + +``` +kubectl apply -f pvc/ +``` diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/2. Configure_EKS_Cluster.md b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/2. Configure_EKS_Cluster.md new file mode 100644 index 00000000..54617261 --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/2. Configure_EKS_Cluster.md @@ -0,0 +1,326 @@ +# Steps to set up cluster + +In this guide we will set up the Kubernetes cluster for the deployment of LLMs using Triton Server and TRT-LLM. +* +## 1. Add node label and taint + +As first step we will add node labels and taints + +* A node label of `nvidia.com/gpu=present` to more easily identify nodes with NVIDIA GPUs. +* A node taint of `nvidia.com/gpu=present:NoSchedule` to prevent non-GPU pods from being deployed to GPU nodes. + +Run the following command to get nodes: + +``` +kubectl get nodes +``` + +You should see output something similar to below: + +``` +NAME STATUS ROLES AGE VERSION +ip-192-168-117-30.ec2.internal Ready 3h10m v1.30.2-eks-1552ad0 +ip-192-168-127-31.ec2.internal Ready 155m v1.30.2-eks-1552ad0 +ip-192-168-26-106.ec2.internal Ready 3h23m v1.30.2-eks-1552ad0 +``` + +> [!Note] +> Here we have 3 nodes: 1 CPU node and 2 GPU nodes. You only need to apply labels and taints to GPU nodes. Note that because EFA is enabled on GPU nodes, they have to be in the same subnet in your VPC. Thus, their IP addresses are closer. In this case, the top 2 nodes are likely to be the GPU nodes. You can also run `kubectl describe node ` to verify. + +Run the following command to add label and taints: + +``` +kubectl label nodes ip-192-168-117-30.ec2.internal nvidia.com/gpu=present +kubectl label nodes ip-192-168-127-31.ec2.internal nvidia.com/gpu=present +kubectl taint nodes ip-192-168-117-30.ec2.internal nvidia.com/gpu=present:NoSchedule +kubectl taint nodes ip-192-168-127-31.ec2.internal nvidia.com/gpu=present:NoSchedule +``` + +Alternatively, you can add labels and taints in node groups under [EKS console](https://console.aws.amazon.com/eks/home). + +## 2. Install Kubernetes Node Feature Discovery service + +This allows for the deployment of a pod onto a node with a matching taint that we set above. + +``` +kubectl create namespace monitoring +helm repo add kube-nfd https://kubernetes-sigs.github.io/node-feature-discovery/charts && helm repo update +helm install -n kube-system node-feature-discovery kube-nfd/node-feature-discovery \ + --set nameOverride=node-feature-discovery \ + --set worker.tolerations[0].key=nvidia.com/gpu \ + --set worker.tolerations[0].operator=Exists \ + --set worker.tolerations[0].effect=NoSchedule +``` + +## 3. Install NVIDIA Device Plugin + +We are using NVIDIA Device Plugin here because the default EKS optimzied AMI (Amazon Linux 2) already has NVIDIA drivers pre-installed. 
If you would like to use the EKS Ubuntu AMI, which does not have the drivers pre-installed, you need to install the [NVIDIA GPU Operator](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/amazon-eks.html#nvidia-gpu-operator-with-amazon-eks) instead. + +``` +kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.15.0/deployments/static/nvidia-device-plugin.yml +``` + +## 4. Install NVIDIA GPU Feature Discovery service + +``` +cd multinode_helm_chart/ +kubectl apply -f nvidia_gpu-feature-discovery_daemonset.yaml +``` + +## 5. Install Prometheus Kubernetes Stack + +The Prometheus Kubernetes Stack installs the necessary components for metrics collection, including Prometheus, Kube-State-Metrics, and Grafana. + +``` +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts && helm repo update +helm install -n monitoring prometheus prometheus-community/kube-prometheus-stack \ + --set tolerations[0].key=nvidia.com/gpu \ + --set tolerations[0].operator=Exists \ + --set tolerations[0].effect=NoSchedule +``` + +## 6. Install NVIDIA DCGM Exporter + +This exporter allows us to collect GPU metrics through DCGM, which is the recommended way to monitor GPU status in our cluster. + +``` +helm repo add nvidia-dcgm https://nvidia.github.io/dcgm-exporter/helm-charts && helm repo update +helm install -n monitoring dcgm-exporter nvidia-dcgm/dcgm-exporter --values nvidia_dcgm-exporter_values.yaml +``` + +You can verify by showing the metrics collected by DCGM: + +``` +kubectl -n monitoring port-forward svc/dcgm-exporter 8080:9400 +``` + +In your local browser, you should be able to see the metrics at `localhost:8080`. + +## 7. Install Prometheus Adapter + +This allows the Triton metrics collected by the Prometheus server to be available to Kubernetes' Horizontal Pod Autoscaler service. + +``` +helm install -n monitoring prometheus-adapter prometheus-community/prometheus-adapter \ + --set metricsRelistInterval=6s \ + --set customLabels.monitoring=prometheus-adapter \ + --set customLabels.release=prometheus \ + --set prometheus.url=http://prometheus-kube-prometheus-prometheus \ + --set additionalLabels.release=prometheus +``` + +To verify that the Prometheus Adapter is working properly, run the following command: + +``` +kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1 +``` + +If the command fails, wait longer and retry. If the command fails for more than a few minutes, then the adapter is misconfigured and will require intervention. + +## 8. Install Prometheus rule for Triton metrics + +This generates custom metrics from a formula that uses the Triton metrics collected by Prometheus. One of the custom metrics is used by the Horizontal Pod Autoscaler (HPA). Users can modify this manifest to create their own custom metrics and set them in the HPA manifest. + +``` +kubectl apply -f triton-metrics_prometheus-rule.yaml +``` + +At this point, all metrics components should have been installed. All metrics, including Triton metrics, DCGM metrics, and custom metrics, should now be available to the Prometheus server. You can verify by showing all metrics in the Prometheus server: + +``` +kubectl -n monitoring port-forward svc/prometheus-kube-prometheus-prometheus 8080:9090 +``` + +In your local browser, you should be able to see all the metrics mentioned above at `localhost:8080`. + +## 9.
Install EFA Kubernetes Device Plugin + +Pull the EFA Kubernetes Device Plugin Helm chart: + +``` +helm repo add eks https://aws.github.io/eks-charts +helm pull eks/aws-efa-k8s-device-plugin --untar +``` + +Add tolerations in `aws-efa-k8s-device-plugin/values.yaml` at line 134 as shown below: + +``` +tolerations: +- key: nvidia.com/gpu + operator: Exists + effect: NoSchedule +``` + +Install the EFA Kubernetes Device Plugin Helm chart: + +``` +helm install aws-efa-k8s-device-plugin --namespace kube-system ./aws-efa-k8s-device-plugin/ +``` + +## 10. Install Cluster Autoscaler + +> [!Note] +> - The Autoscaler IAM add-on policy needs to be attached (already done if you used the example config to create the EKS cluster). +> - The Cluster Autoscaler won't exceed the maximum number of nodes that you set in your node group. So if you want to allow more nodes to be added to your node group by the Cluster Autoscaler, make sure you set the maximum number of nodes accordingly. +> - The Cluster Autoscaler only scales up the number of nodes when there are `unschedulable` pods. It also scales down when the additional nodes become "free". + +### a. Deploy the Cluster Autoscaler deployment + +``` +kubectl apply -f https://raw.githubusercontent.com/kubernetes/autoscaler/master/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-autodiscover.yaml +``` + +### b. Set image version + +Here we set the image version to `v1.30.2`. Make sure it matches your EKS cluster version. + +``` +kubectl -n kube-system set image deployment.apps/cluster-autoscaler cluster-autoscaler=registry.k8s.io/autoscaling/cluster-autoscaler:v1.30.2 +``` + +### c. Add the required safe-to-evict annotation to the deployment + +``` +kubectl -n kube-system annotate deployment.apps/cluster-autoscaler cluster-autoscaler.kubernetes.io/safe-to-evict="false" +``` + +### d. Edit the manifest file + +``` +kubectl -n kube-system edit deployment.apps/cluster-autoscaler + +# Change 1: Add your cluster name: +- --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/ + +# Change 2: Add the following two lines below the line above: +- --balance-similar-node-groups +- --skip-nodes-with-system-pods=false +``` + +## 11. Install the LeaderWorkerSet + +This allows us to use the LeaderWorkerSet API for our multi-node Triton deployment. + +``` +VERSION=v0.3.0 +kubectl apply --server-side -f https://github.com/kubernetes-sigs/lws/releases/download/$VERSION/manifests.yaml +``` + +## 12.
Verify installation + +List all the Pods: + +``` +kubectl get pods -A +``` + +``` +NAMESPACE NAME READY STATUS RESTARTS AGE +kube-system aws-node-55lp4 2/2 Running 0 3h11m +kube-system aws-node-lz7sm 2/2 Running 0 144m +kube-system aws-node-qr69w 2/2 Running 0 179m +kube-system cluster-autoscaler-7bc88498df-cjjrg 1/1 Running 0 3h35m +kube-system coredns-d9b6d6c7d-hlx8k 1/1 Running 0 3h35m +kube-system coredns-d9b6d6c7d-pgzl8 1/1 Running 0 3h20m +kube-system efa-aws-efa-k8s-device-plugin-6m7wl 1/1 Running 0 144m +kube-system efa-aws-efa-k8s-device-plugin-tz2j8 1/1 Running 0 179m +kube-system efs-csi-controller-7675bbb88-98rcz 3/3 Running 0 3h35m +kube-system efs-csi-controller-7675bbb88-vhwvq 3/3 Running 0 3h35m +kube-system efs-csi-node-b6ltd 3/3 Running 0 179m +kube-system efs-csi-node-cp229 3/3 Running 0 144m +kube-system efs-csi-node-z2r8v 3/3 Running 0 3h11m +kube-system gpu-feature-discovery-shmzv 1/1 Running 0 102m +kube-system gpu-feature-discovery-tpg4m 1/1 Running 0 102m +kube-system kube-proxy-8mf5m 1/1 Running 0 179m +kube-system kube-proxy-mp5x4 1/1 Running 0 3h11m +kube-system kube-proxy-wx8rq 1/1 Running 0 144m +kube-system node-feature-discovery-gc-7fd4d8b94f-668tz 1/1 Running 0 3h35m +kube-system node-feature-discovery-master-5d589d89b6-fm4dv 1/1 Running 0 3h35m +kube-system node-feature-discovery-worker-28njz 1/1 Running 0 144m +kube-system node-feature-discovery-worker-74vrx 1/1 Running 0 179m +kube-system node-feature-discovery-worker-cp2k4 1/1 Running 0 3h11m +kube-system nvidia-device-plugin-daemonset-5wmfq 1/1 Running 0 179m +kube-system nvidia-device-plugin-daemonset-btm97 1/1 Running 0 144m +kube-system nvidia-device-plugin-daemonset-wdv5t 1/1 Running 0 3h11m +lws-system lws-controller-manager-799c9c77bc-wk897 2/2 Running 0 3h35m +monitoring alertmanager-prometheus-kube-prometheus-alertmanager-0 2/2 Running 0 3h35m +monitoring dcgm-exporter-jmf5l 1/1 Running 0 102m +monitoring dcgm-exporter-r7f8n 1/1 Running 0 102m +monitoring prometheus-adapter-5447c4cc95-8db8g 1/1 Running 0 3h35m +monitoring prometheus-grafana-5f846bc55f-7dnsm 3/3 Running 0 3h35m +monitoring prometheus-kube-prometheus-operator-5464cbd4d5-svrn6 1/1 Running 0 3h35m +monitoring prometheus-kube-state-metrics-5749f84cb-m56c7 1/1 Running 0 3h35m +monitoring prometheus-prometheus-kube-prometheus-prometheus-0 2/2 Running 0 3h35m +monitoring prometheus-prometheus-node-exporter-dbm6m 1/1 Running 0 179m +monitoring prometheus-prometheus-node-exporter-jglc6 1/1 Running 0 3h11m +monitoring prometheus-prometheus-node-exporter-zghvb 1/1 Running 0 144m +``` + +List all the Deployments: + +``` +kubectl get deployments -A +``` + +``` +NAMESPACE NAME READY UP-TO-DATE AVAILABLE AGE +kube-system cluster-autoscaler 1/1 1 1 4h42m +kube-system coredns 2/2 2 2 42d +kube-system efs-csi-controller 2/2 2 2 42d +kube-system node-feature-discovery-gc 1/1 1 1 42d +kube-system node-feature-discovery-master 1/1 1 1 42d +lws-system lws-controller-manager 1/1 1 1 11d +monitoring prometheus-adapter 1/1 1 1 42d +monitoring prometheus-grafana 1/1 1 1 42d +monitoring prometheus-kube-prometheus-operator 1/1 1 1 42d +monitoring prometheus-kube-state-metrics 1/1 1 1 42d +``` + +List all the DaemonSets: + +``` +kubectl get ds -A +``` + +``` +NAMESPACE NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE NODE SELECTOR AGE +kube-system aws-node 3 3 3 3 3 42d +kube-system efa-aws-efa-k8s-device-plugin 2 2 2 2 2 14d +kube-system efs-csi-node 3 3 3 3 3 kubernetes.io/os=linux 42d +kube-system gpu-feature-discovery 2 2 2 2 2 18d +kube-system 
kube-proxy 3 3 3 3 3 42d +kube-system node-feature-discovery-worker 3 3 3 3 3 42d +kube-system nvidia-device-plugin-daemonset 3 3 3 3 3 18d +monitoring dcgm-exporter 2 2 2 2 2 nvidia.com/gpu=present 14d +monitoring prometheus-prometheus-node-exporter 3 3 3 3 3 kubernetes.io/os=linux 42d +``` + +List all the Services: + +``` +kubectl get services -A +``` + +``` +NAMESPACE NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +default kubernetes ClusterIP 10.100.0.1 443/TCP 42d +kube-system kube-dns ClusterIP 10.100.0.10 53/UDP,53/TCP 42d +kube-system prometheus-kube-prometheus-coredns ClusterIP None 9153/TCP 42d +kube-system prometheus-kube-prometheus-kube-controller-manager ClusterIP None 10257/TCP 42d +kube-system prometheus-kube-prometheus-kube-etcd ClusterIP None 2381/TCP 42d +kube-system prometheus-kube-prometheus-kube-proxy ClusterIP None 10249/TCP 42d +kube-system prometheus-kube-prometheus-kube-scheduler ClusterIP None 10259/TCP 42d +kube-system prometheus-kube-prometheus-kubelet ClusterIP None 10250/TCP,10255/TCP,4194/TCP 42d +lws-system lws-controller-manager-metrics-service ClusterIP 10.100.213.37 8443/TCP 11d +lws-system lws-webhook-service ClusterIP 10.100.103.158 443/TCP 11d +monitoring alertmanager-operated ClusterIP None 9093/TCP,9094/TCP,9094/UDP 42d +monitoring dcgm-exporter ClusterIP 10.100.62.240 9400/TCP 14d +monitoring prometheus-adapter ClusterIP 10.100.13.192 443/TCP 42d +monitoring prometheus-grafana ClusterIP 10.100.56.89 80/TCP 42d +monitoring prometheus-kube-prometheus-alertmanager ClusterIP 10.100.40.55 9093/TCP,8080/TCP 42d +monitoring prometheus-kube-prometheus-operator ClusterIP 10.100.232.224 443/TCP 42d +monitoring prometheus-kube-prometheus-prometheus ClusterIP 10.100.144.122 9090/TCP,8080/TCP 42d +monitoring prometheus-kube-state-metrics ClusterIP 10.100.194.231 8080/TCP 42d +monitoring prometheus-operated ClusterIP None 9090/TCP 42d +monitoring prometheus-prometheus-node-exporter ClusterIP 10.100.44.228 9100/TCP 42d +``` diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/3. Deploy_Triton.md b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/3. Deploy_Triton.md new file mode 100644 index 00000000..cfd1b5cc --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/3. Deploy_Triton.md @@ -0,0 +1,444 @@ +# Steps to deploy multi-node LLM using Triton + TRT-LLM on EKS cluster + +## 1. Build the custom container image and push it to Amazon ECR + +We need to build a custom image on top of Triton TRT-LLM NGC container to include the kubessh file, server.py, and other EFA libraries and will then push this image to Amazon ECR. You can take a look at the [Dockerfile here](https://github.com/Wenhan-Tan/EKS_Multinode_Triton_TRTLLM/blob/main/multinode_helm_chart/containers/triton_trt_llm.containerfile). + +``` +## AWS +export AWS_REGION=us-east-1 +export ACCOUNT=$(aws sts get-caller-identity --query Account --output text) + +## Docker Image +export REGISTRY=${ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com/ +export IMAGE=triton_trtllm_multinode +export TAG=":24.07" + +docker build \ + --file ./triton_trt_llm.containerfile \ + --rm \ + --tag ${REGISTRY}${IMAGE}${TAG} \ + . + +echo "Logging in to $REGISTRY ..." +aws ecr get-login-password | docker login --username AWS --password-stdin $REGISTRY + +# Create registry if it does not exist +REGISTRY_COUNT=$(aws ecr describe-repositories | grep ${IMAGE} | wc -l) +if [ "$REGISTRY_COUNT" == "0" ]; then + echo "" + echo "Creating repository ${IMAGE} ..." 
+ aws ecr create-repository --repository-name ${IMAGE} +fi + +# Push image +docker image push ${REGISTRY}${IMAGE}${TAG} +``` + +## 2. Set up the Triton model repository for LLM deployment + +To build the TRT-LLM engine and set up the Triton model repository inside the compute node, use the following steps: + +### a. Modify the `setup_ssh_efs.yaml` file + +We use the `setup_ssh_efs.yaml` file, which does "sleep infinity", to set up SSH access inside the compute node along with EFS. + +Adjust the following values: + +- `image`: change the image tag. The default is 24.07, which supports TRT-LLM v0.11.0 +- `nvidia.com/gpu`: set to the number of GPUs per node in your cluster, adjust in both the limits and requests section +- `claimName`: set to your EFS PVC name + +### b. SSH into compute node and build TRT-LLM engine + +Deploy the pod: + +``` +cd multinode_helm_chart/ +kubectl apply -f setup_ssh_efs.yaml +kubectl exec -it setup_ssh_efs -- bash +``` + +Clone the Triton TRT-LLM backend repository: + +``` +cd +git clone https://github.com/triton-inference-server/tensorrtllm_backend.git -b v0.11.0 +cd tensorrtllm_backend +git lfs install +git submodule update --init --recursive +``` + +Build a Llama3-8B engine with Tensor Parallelism=4, Pipeline Parallelism=2 to run on 2 nodes of g5.12xlarge (4 A10G GPUs each), for a total of 8 GPUs across 2 nodes. + +``` +cd tensorrtllm_backend/tensorrt_llm/examples/llama +git clone https://huggingface.co/meta-llama/Meta-Llama-3-8B + +python convert_checkpoint.py --model_dir ./Meta-Llama-3-8B \ + --output_dir ./converted_checkpoint \ + --dtype float16 \ + --tp_size 4 \ + --pp_size 2 + +trtllm-build --checkpoint_dir ./converted_checkpoint \ + --output_dir ./output_engines \ + --gemm_plugin float16 \ + --use_custom_all_reduce disable \ # only disable on non-NVLink machines like g5.12xlarge + --max_input_len 2048 \ + --max_output_len 2048 \ + --max_batch_size 4 +``` + +### c. Prepare the Triton model repository + +``` +cd /tensorrtllm_backend +mkdir triton_model_repo + +cp -r all_models/inflight_batcher_llm/ensemble triton_model_repo/ +cp -r all_models/inflight_batcher_llm/preprocessing triton_model_repo/ +cp -r all_models/inflight_batcher_llm/postprocessing triton_model_repo/ +cp -r all_models/inflight_batcher_llm/tensorrt_llm triton_model_repo/ + +python3 tools/fill_template.py -i triton_model_repo/preprocessing/config.pbtxt tokenizer_dir:,tokenizer_type:llama,triton_max_batch_size:4,preprocessing_instance_count:1 +python3 tools/fill_template.py -i triton_model_repo/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:4,decoupled_mode:True,max_beam_width:1,engine_dir:,,enable_kv_cache_reuse:False,batching_strategy:inflight_batching,max_queue_delay_microseconds:0 +python3 tools/fill_template.py -i triton_model_repo/postprocessing/config.pbtxt tokenizer_dir:,tokenizer_type:llama,triton_max_batch_size:4,postprocessing_instance_count:1 +python3 tools/fill_template.py -i triton_model_repo/ensemble/config.pbtxt triton_max_batch_size:4 +``` + +> [!Note] +> Be sure to substitute the correct values for `` and `` in the example above. Keep in mind that the tokenizer, the TRT-LLM engines, and the Triton model repository should be in shared file storage accessible to all of your nodes. They're required to launch your model in Triton. For example, if using AWS EFS, the values for `` and `` should correspond to the actual EFS mount path. This is determined by your persistent-volume claim and mount path in chart/templates/deployment.yaml.
Make sure that your nodes are able to access these files. + +## 3. Create `example_values.yaml` file for deployment + +Make sure you go over the provided `values.yaml` first to understand what each value represents. + +Below is the `example_values.yaml` file we use where `=/var/run/models`: + +``` +gpu: NVIDIA-A10G +gpuPerNode: 4 +persistentVolumeClaim: efs-claim + +tensorrtLLM: + parallelism: + tensor: 4 + pipeline: 2 + +triton: + image: + name: wenhant16/triton_trtllm_multinode:24.07.10 + # name: ${ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com/triton_trtllm_multinode:24.07 + resources: + cpu: 4 + memory: 64Gi + efa: 1 # If you don't want to enable EFA, set this to 0. + triton_model_repo_path: /var/run/models/tensorrtllm_backend/triton_model_repo + enable_nsys: false # Note if you send lots of requests, nsys report can be very large. + +logging: + tritonServer: + verbose: False + +autoscaling: + enable: true + replicas: + maximum: 2 + minimum: 1 + metric: + name: triton:queue_compute:ratio + value: 1 +``` + +## 4. Install the Helm chart + +``` +helm install multinode_deployment \ + --values ./chart/values.yaml \ + --values ./chart/example_values.yaml \ + ./chart/. +``` + +In this example, we are going to deploy Triton server on 2 nodes with 4 GPUs each. This will result in having 2 pods running in your cluster. Command `kubectl get pods` should output something similar to below: + +``` +NAME READY STATUS RESTARTS AGE +leaderworkerset-sample-0 1/1 Running 0 28m +leaderworkerset-sample-0-1 1/1 Running 0 28m +``` + +Use the following command to check Triton logs: + +``` +kubectl logs --follow leaderworkerset-sample-0 +``` + +You should output something similar to below: + +``` +I0717 23:01:28.501008 300 server.cc:674] ++----------------+---------+--------+ +| Model | Version | Status | ++----------------+---------+--------+ +| ensemble | 1 | READY | +| postprocessing | 1 | READY | +| preprocessing | 1 | READY | +| tensorrt_llm | 1 | READY | ++----------------+---------+--------+ + +I0717 23:01:28.501073 300 tritonserver.cc:2579] ++----------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Option | Value | ++----------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| server_id | rank0 | +| server_version | 2.47.0 | +| server_extensions | classification sequence model_repository model_repository(unload_dependents) schedule_policy model_configuration system_shared_memory cuda_shared_memory binary_tensor_data parameters statistics trace logging | +| model_repository_path[0] | /var/run/models/tensorrtllm_backend/triton_model_repo | +| model_control_mode | MODE_NONE | +| strict_model_config | 1 | +| model_config_name | | +| rate_limit | OFF | +| pinned_memory_pool_byte_size | 268435456 | +| cuda_memory_pool_byte_size{0} | 67108864 | +| min_supported_compute_capability | 6.0 | +| strict_readiness | 1 | +| exit_timeout | 30 | +| cache_enabled | 0 | ++----------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +I0717 23:01:28.502835 300 
grpc_server.cc:2463] "Started GRPCInferenceService at 0.0.0.0:8001" +I0717 23:01:28.503047 300 http_server.cc:4692] "Started HTTPService at 0.0.0.0:8000" +I0717 23:01:28.544321 300 http_server.cc:362] "Started Metrics Service at 0.0.0.0:8002" +``` + +> [!Note] +> You may run into an error of `the GPU number is incompatible with 8 gpusPerNode when MPI size is 8`. The root cause is starting from v0.11.0, TRT-LLM backend checks the gpusPerNode parameter in the `config.json` file inside the output engines folder. This parameter is set during engine build time. If the value is the not the same as the number of GPUs in your node, this assertion error shows up. To resolve this, simply change the value in the file to match the number of GPUs in your node. + +## 5. Send a Curl POST request for infernce + +In this AWS example, we can view the external IP address of Load Balancer by running `kubectl get services`. Note that we use `multinode_deployment` as helm chart installation name here. Your output should look something similar to below: + +``` +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +kubernetes ClusterIP 10.100.0.1 443/TCP 43d +leaderworkerset-sample ClusterIP None 54m +multinode_deployment LoadBalancer 10.100.44.170 a69c447a535104f088d2e924f5523d41-634913838.us-east-1.elb.amazonaws.com 8000:32120/TCP,8001:32263/TCP,8002:31957/TCP 54m +``` + +You can send a CURL request to the `ensemble` TRT-LLM Llama-3 model hosted in Triton Server with the following command: + +``` +curl -X POST a69c447a535104f088d2e924f5523d41-634913838.us-east-1.elb.amazonaws.com:8000/v2/models/ensemble/generate -d '{"text_input": "What is machine learning?", "max_tokens": 64, "bad_words": "", "stop_words": "", "pad_id": 2, "end_id": 2}' +``` + +You should output similar to below: + +``` +{"context_logits":0.0,"cum_log_probs":0.0,"generation_logits":0.0,"model_name":"ensemble","model_version":"1","output_log_probs":[0.0,0.0,0.0,0.0,0.0],"sequence_end":false,"sequence_id":0,"sequence_start":false,"text_output":" Machine learning is a branch of artificial intelligence that deals with the development of algorithms that allow computers to learn from data and make predictions or decisions without being explicitly programmed. Machine learning algorithms are used in a wide range of applications, including image recognition, natural language processing, and predictive analytics.\nWhat is the difference between machine learning and"} +``` + +> [!Note] +> You may run into an error of `Multiple tagged security groups found for instance i-*************`. The root cause is both EKS cluster security group and EFA security group are using the same tag of `kubernetes.io/cluster/wenhant-eks-cluster : owned`. This tag should only be attached to 1 security group, usually your main security group. To resolve this, simply delete the tag from the EFA security group. + +## 6. Test Horizontal Pod Autoscaler and Cluster Autoscaler + +To check HPA status, run: + +``` +kubectl get hpa multinode_deployment +``` + +You should output something similar to below: + +``` +NAME REFERENCE TARGETS MINPODS MAXPODS REPLICAS AGE +multinode_deployment LeaderWorkerSet/leaderworkerset-sample 0/1 1 2 1 66m +``` + +From the output above, the current metric value is 0 and the target value is 1. Note that in this example, our metric is a custom metric defined in Prometheus Rule. You can find more details in the [Install Prometheus rule for Triton metrics](Cluster_Setup_Steps.md#8-install-prometheus-rule-for-triton-metrics) step. 
When the current value exceeds 1, the HPA will start to create a new replica. We can either increase traffic by sending a large number of requests to the LoadBalancer, or manually increase the minimum number of replicas to let the HPA create the second replica. In this example, we are going to choose the latter and run the following command: + +``` +kubectl patch hpa multinode_deployment -p '{"spec":{"minReplicas": 2}}' +``` + +Your `kubectl get pods` command should output something similar to below: + +``` +NAME READY STATUS RESTARTS AGE +leaderworkerset-sample-0 1/1 Running 0 6h48m +leaderworkerset-sample-0-1 1/1 Running 0 6h48m +leaderworkerset-sample-1 0/1 Pending 0 13s +leaderworkerset-sample-1-1 0/1 Pending 0 13s +``` + +Here we can see that the second replica is created but is in `Pending` status. If you run `kubectl describe pod leaderworkerset-sample-1`, you should see events similar to below: + +``` +Events: + Type Reason Age From Message + ---- ------ ---- ---- ------- + Warning FailedScheduling 48s default-scheduler 0/3 nodes are available: 1 node(s) didn't match Pod's node affinity/selector, 2 Insufficient nvidia.com/gpu, 2 Insufficient vpc.amazonaws.com/efa. preemption: 0/3 nodes are available: 1 Preemption is not helpful for scheduling, 2 No preemption victims found for incoming pod. + Normal TriggeredScaleUp 15s cluster-autoscaler pod triggered scale-up: [{eks-efa-compute-ng-2-7ac8948c-e79a-9ad8-f27f-70bf073a9bfa 2->4 (max: 4)}] +``` + +The first event means that there are no available nodes to schedule any pods, which explains why the 2 new pods are in `Pending` status. The second event states that the Cluster Autoscaler detects that this pod is `unschedulable`, so it is going to increase the number of nodes in our cluster until the maximum is reached. You can find more details in the [Install Cluster Autoscaler](Cluster_Setup_Steps.md#10-install-cluster-autoscaler) step. This process can take some time depending on whether AWS has enough capacity available to add to your cluster. Eventually, the Cluster Autoscaler will add 2 more nodes to your node group so that the 2 `Pending` pods can be scheduled on them. Your `kubectl get nodes` and `kubectl get pods` commands should output something similar to below: + +``` +NAME STATUS ROLES AGE VERSION +ip-192-168-103-11.ec2.internal Ready 15m v1.30.2-eks-1552ad0 +ip-192-168-106-8.ec2.internal Ready 15m v1.30.2-eks-1552ad0 +ip-192-168-117-30.ec2.internal Ready 11h v1.30.2-eks-1552ad0 +ip-192-168-127-31.ec2.internal Ready 11h v1.30.2-eks-1552ad0 +ip-192-168-26-106.ec2.internal Ready 11h v1.30.2-eks-1552ad0 +``` + +``` +leaderworkerset-sample-0 1/1 Running 0 7h26m +leaderworkerset-sample-0-1 1/1 Running 0 7h26m +leaderworkerset-sample-1 1/1 Running 0 38m +leaderworkerset-sample-1-1 1/1 Running 0 38m +``` + +You can run the following command to change the minimum number of replicas back to 1: + +``` +kubectl patch hpa multinode_deployment -p '{"spec":{"minReplicas": 1}}' +``` + +The HPA will delete the second replica if the current metric value does not exceed the target value. The Cluster Autoscaler will also remove the 2 added nodes when it detects them as "free". + +## 7. Uninstall the Helm chart + +``` +helm uninstall +``` + +## 8. (Optional) NCCL Test + +To test whether EFA is working properly, we can run an NCCL test across nodes.
Make sure you modify the [nccl_test.yaml](./multinode_helm_chart/nccl_test.yaml) file and adjust the following values: + +- `slotsPerWorker`: set to the number of GPUs per node in your cluster +- `-np`: set to "number_of_worker_nodes" * "number_of_gpus_per_node" +- `-N`: set to number_of_gpus_per_node +- `Worker: replicas`: set to number of worker pods you would like the test to run on. This must be less than or eaqual to the number of nodes in your cluster +- `node.kubernetes.io/instance-type`: set to the instance type of the nodes in your cluster against which you would like the nccl test to be run +- `nvidia.com/gpu`: set to the number of GPUs per node in your cluster, adjust in both the limits and requests section +- `vpc.amazonaws.com/efa`: set to the number of EFA adapters per node in your cluster, adjust in both the limits and requests section + +Run the command below to deploy the MPI Operator which is required by the NCCL Test manifest: + +``` +kubectl apply --server-side -f https://raw.githubusercontent.com/kubeflow/mpi-operator/v0.5.0/deploy/v2beta1/mpi-operator.yaml +``` + +Run the command below to deploy NCCL test: + +``` +kubectl apply -f nccl_test.yaml +``` + +Note that the launcher pod will keep restarting until the connection is established with the worker pods. Run the command below to see the launcher pod logs: + +``` +kubectl logs -f $(kubectl get pods | grep launcher | cut -d ' ' -f 1) +``` + +You should output something similar to below (example of 2 x g5.12xlarge): + +``` +[1,0]:# out-of-place in-place +[1,0]:# size count type redop root time algbw busbw #wrong time algbw busbw #wrong +[1,0]:# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) +[1,0]: 8 2 float sum -1[1,0]: 99.10 0.00 0.00 0[1,0]: 100.6 0.00 0.00 0 +[1,0]: 16 4 float sum -1[1,0]: 103.4 0.00 0.00 0[1,0]: 102.5 0.00 0.00 0 +[1,0]: 32 8 float sum -1[1,0]: 103.5 0.00 0.00 0[1,0]: 102.5 0.00 0.00 0 +[1,0]: 64 16 float sum -1[1,0]: 103.6 0.00 0.00 0[1,0]: 102.3 0.00 0.00 0 +[1,0]: 128 32 float sum -1[1,0]: 103.8 0.00 0.00 0[1,0]: 103.1 0.00 0.00 0 +[1,0]: 256 64 float sum -1[1,0]: 103.9 0.00 0.00 0[1,0]: 103.3 0.00 0.00 0 +[1,0]: 512 128 float sum -1[1,0]: 104.3 0.00 0.01 0[1,0]: 102.9 0.00 0.01 0 +[1,0]: 1024 256 float sum -1[1,0]: 105.8 0.01 0.02 0[1,0]: 104.9 0.01 0.02 0 +[1,0]: 2048 512 float sum -1[1,0]: 116.4 0.02 0.03 0[1,0]: 115.5 0.02 0.03 0 +[1,0]: 4096 1024 float sum -1[1,0]: 120.4 0.03 0.06 0[1,0]: 119.0 0.03 0.06 0 +[1,0]: 8192 2048 float sum -1[1,0]: 134.2 0.06 0.11 0[1,0]: 134.6 0.06 0.11 0 +[1,0]: 16384 4096 float sum -1[1,0]: 147.9 0.11 0.19 0[1,0]: 147.3 0.11 0.19 0 +[1,0]: 32768 8192 float sum -1[1,0]: 182.3 0.18 0.31 0[1,0]: 183.1 0.18 0.31 0 +[1,0]: 65536 16384 float sum -1[1,0]: 194.6 0.34 0.59 0[1,0]: 193.5 0.34 0.59 0 +[1,0]: 131072 32768 float sum -1[1,0]: 267.5 0.49 0.86 0[1,0]: 266.3 0.49 0.86 0 +[1,0]: 262144 65536 float sum -1[1,0]: 495.7 0.53 0.93 0[1,0]: 496.6 0.53 0.92 0 +[1,0]: 524288 131072 float sum -1[1,0]: 746.2 0.70 1.23 0[1,0]: 736.2 0.71 1.25 0 +[1,0]: 1048576 262144 float sum -1[1,0]: 1337.1 0.78 1.37 0[1,0]: 1333.2 0.79 1.38 0 +[1,0]: 2097152 524288 float sum -1[1,0]: 2542.1 0.82 1.44 0[1,0]: 2540.8 0.83 1.44 0 +[1,0]: 4194304 1048576 float sum -1[1,0]: 3377.7 1.24 2.17 0[1,0]: 3381.8 1.24 2.17 0 +[1,0]: 8388608 2097152 float sum -1[1,0]: 5370.6 1.56 2.73 0[1,0]: 5363.3 1.56 2.74 0 +[1,0]: 16777216 4194304 float sum -1[1,0]: 9547.6 1.76 3.08 0[1,0]: 9578.5 1.75 3.07 0 +[1,0]: 33554432 8388608 float sum -1[1,0]: 17590 1.91 3.34 0[1,0]: 17605 1.91 3.34 0 +[1,0]: 
67108864 16777216 float sum -1[1,0]: 34096 1.97 3.44 0[1,0]: 34121 1.97 3.44 0 +[1,0]: 134217728 33554432 float sum -1[1,0]: 67100 2.00 3.50 0[1,0]: 67259 2.00 3.49 0 +[1,0]: 268435456 67108864 float sum -1[1,0]: 133445 2.01 3.52 0[1,0]: 133455 2.01 3.52 0 +[1,0]: 536870912 134217728 float sum -1[1,0]: 266505 2.01 3.53 0[1,0]: 266527 2.01 3.53 0 +[1,0]: 1073741824 268435456 float sum -1[1,0]: 536019 2.00 3.51 0[1,0]: 535942 2.00 3.51 0 +[1,0]: 2147483648 536870912 float sum -1[1,0]: 1079960 1.99 3.48 0[1,0]: 1079922 1.99 3.48 0 +[1,0]: 4294967296 1073741824 float sum -1[1,0]: 2271140 1.89 3.31 0[1,0]: 2268693 1.89 3.31 0 +[1,0]:# Out of bounds values : 0 OK +[1,0]:# Avg bus bandwidth : 1.42557 +``` + +## 9. (Optional) GenAI-Perf + +GenAI-Perf is a benchmarking tool for Triton server to measure latency and throughput of LLMs. We provide an example here. + +### a. Modify the `gen_ai_perf.yaml` file + +Adjust the following values: + +- `image`: change image tag. Default is 24.07 which supports TRT-LLM v0.11.0 +- `claimName`: set to your EFS pvc name + +### b. Run benchmark + +Run the below command to start a Triton server SDK container: + +``` +kubectl apply -f gen_ai_perf.yaml +kubectl exec -it gen-ai-perf -- bash +``` + +Run the below command to start benchmarking: + +``` +genai-perf \ + -m ensemble \ + --service-kind triton \ + --backend tensorrtllm \ + --num-prompts 100 \ + --random-seed 123 \ + --synthetic-input-tokens-mean 1024 \ + --synthetic-input-tokens-stddev 0 \ + --streaming \ + --output-tokens-mean 1024 \ + --output-tokens-stddev 0 \ + --output-tokens-mean-deterministic \ + --tokenizer hf-internal-testing/llama-tokenizer \ + --concurrency 1 \ + --measurement-interval 10000 \ + --url a69c447a535104f088d2e924f5523d41-634913838.us-east-1.elb.amazonaws.com:8001 \ + -- --request-count=10 +``` + +You should output something similar to below (example of Mixtral 8x7B on 2 x g5.12xlarge): + +``` + LLM Metrics +┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┓ +┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━┩ +│ Time to first token (ms) │ 675.08 │ 459.99 │ 2,473.49 │ 2,294.37 │ 682.23 │ 482.85 │ +│ Inter token latency (ms) │ 22.86 │ 19.98 │ 24.37 │ 24.32 │ 23.79 │ 23.41 │ +│ Request latency (ms) │ 29,906.05 │ 29,675.12 │ 31,814.10 │ 31,624.46 │ 29,917.75 │ 29,706.24 │ +│ Output sequence length │ 1,282.70 │ 1,200.00 │ 1,463.00 │ 1,448.24 │ 1,315.40 │ 1,291.75 │ +│ Input sequence length │ 1,024.00 │ 1,024.00 │ 1,024.00 │ 1,024.00 │ 1,024.00 │ 1,024.00 │ +└──────────────────────────┴───────────┴───────────┴───────────┴───────────┴───────────┴───────────┘ +Output token throughput (per sec): 42.89 +Request throughput (per sec): 0.03 +``` diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/README.md b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/README.md new file mode 100644 index 00000000..dc2c47b9 --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/README.md @@ -0,0 +1,17 @@ +# Multi-Node Triton + TRT-LLM Deployment on EKS + +This repository provides instructions for multi-node deployment of LLMs on EKS (Amazon Elastic Kubernetes Service). This includes instructions for building custom image to enable features like EFA, Helm chart and associated Python script. This deployment flow uses NVIDIA TensorRT-LLM as the inference engine and NVIDIA Triton Inference Server as the model server. 
+ +We have 1 pod per node, so the main challenge in deploying models that require multiple nodes is that one instance of the model spans multiple nodes and hence multiple pods. Consequently, the atomic unit that needs to be ready before requests can be served, as well as the unit that needs to be scaled, becomes a group of pods. This example shows how to get around these problems and provides code to set up the following: + + 1. **LeaderWorkerSet for launching Triton+TRT-LLM on groups of pods:** To launch Triton and TRT-LLM across nodes you use MPI to have one node launch TRT-LLM processes on all the nodes (including itself) that will make up one instance of the model. Doing this requires knowing the hostnames of all involved nodes. Consequently we need to spawn groups of pods and know which model instance group they belong to. To achieve this we use [LeaderWorkerSet](https://github.com/kubernetes-sigs/lws/tree/main), which lets us create "megapods" that consist of a group of pods - one leader pod and a specified number of worker pods - and provides pod labels identifying group membership. We configure the LeaderWorkerSet and launch Triton+TRT-LLM via MPI in [`deployment.yaml`](multinode_helm_chart/chart/templates/deployment.yaml) and [server.py](multinode_helm_chart/containers/server.py). + 2. **Gang Scheduling:** Gang scheduling simply means ensuring all pods that make up a model instance are ready before Triton+TRT-LLM is launched. We show how to use `kubessh` to achieve this in the `wait_for_workers` function of [server.py](multinode_helm_chart/containers/server.py). + 3. **Autoscaling:** By default the Horizontal Pod Autoscaler (HPA) scales individual pods, but LeaderWorkerSet makes it possible to scale each "megapod". However, since these are GPU workloads we don't want to use CPU and host memory usage for autoscaling. We show how to leverage the metrics Triton Server exposes through Prometheus and set up GPU utilization recording rules in [`triton-metrics_prometheus-rule.yaml`](multinode_helm_chart/triton-metrics_prometheus-rule.yaml). We also demonstrate how to properly set up PodMonitors and an HPA in [`pod-monitor.yaml`](multinode_helm_chart/chart/templates/pod-monitor.yaml) and [`hpa.yaml`](multinode_helm_chart/chart/templates/hpa.yaml) (the key is to only scrape metrics from the leader pods). Instructions for properly setting up Prometheus and exposing GPU metrics are found in [Configure EKS Cluster and Install Dependencies](https://github.com/Wenhan-Tan/EKS_Multinode_Triton_TRTLLM/blob/main/Cluster_Setup_Steps.md). To enable the deployment to dynamically add more nodes in response to the HPA, we also set up the [Cluster Autoscaler](https://github.com/Wenhan-Tan/EKS_Multinode_Triton_TRTLLM/blob/main/Cluster_Setup_Steps.md#10-install-cluster-autoscaler). + 4. **LoadBalancer Setup:** Although there are multiple pods in each instance of the model, only one pod within each group accepts requests. We show how to correctly set up a LoadBalancer Service to allow external clients to submit requests in [`service.yaml`](multinode_helm_chart/chart/templates/service.yaml). + + +## Setup and Installation + + 1. [Create EKS Cluster](https://github.com/Wenhan-Tan/EKS_Multinode_Triton_TRTLLM/blob/main/1.%20Create_EKS_Cluster.md) + 2. [Configure EKS Cluster](https://github.com/Wenhan-Tan/EKS_Multinode_Triton_TRTLLM/blob/main/2.%20Configure_EKS_Cluster.md) + 3.
[Deploy Triton](https://github.com/Wenhan-Tan/EKS_Multinode_Triton_TRTLLM/blob/main/3.%20Deploy_Triton.md) diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/eks_cluster_config.yaml b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/eks_cluster_config.yaml new file mode 100644 index 00000000..433d6d06 --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/eks_cluster_config.yaml @@ -0,0 +1,52 @@ +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig + +metadata: + name: wenhant-eks-cluster-east2 + version: "1.30" + region: us-east-2 + +availabilityZones: + - us-east-2a + - us-east-2b + - us-east-2c + +iam: + withOIDC: true + +managedNodeGroups: + - name: sys-nodes + instanceType: c5.2xlarge + minSize: 1 + desiredCapacity: 1 + maxSize: 1 + volumeSize: 80 + availabilityZones: ["us-east-2a"] + iam: + withAddonPolicies: + imageBuilder: true + autoScaler: true + ebs: true + efs: true + awsLoadBalancerController: true + cloudWatch: true + albIngress: true + + - name: efa-compute-ng + instanceType: g5.12xlarge + minSize: 1 + desiredCapacity: 1 + maxSize: 1 + volumeSize: 300 + efaEnabled: true + privateNetworking: true + availabilityZones: ["us-east-2a"] + iam: + withAddonPolicies: + imageBuilder: true + autoScaler: true + ebs: true + efs: true + awsLoadBalancerController: true + cloudWatch: true + albIngress: true diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/.helmignore b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/.helmignore new file mode 100644 index 00000000..0e8a0eb3 --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/Chart.yaml b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/Chart.yaml new file mode 100644 index 00000000..dd0ba019 --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/Chart.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +appVersion: v0.5.3 +description: A Helm chart for EFA device plugin. 
+home: https://github.com/aws/eks-charts +icon: https://raw.githubusercontent.com/aws/eks-charts/master/docs/logo/aws.png +name: aws-efa-k8s-device-plugin +sources: +- https://github.com/aws/eks-charts +version: v0.5.3 diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/README.md b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/README.md new file mode 100644 index 00000000..6a025e8e --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/README.md @@ -0,0 +1,38 @@ +# AWS EFA Kubernetes Device Plugin +This chart installs the AWS EFA Kubernetes Device Plugin daemonset + +## Prerequisites +- Helm v3 + +## Installing the Chart +First add the EKS repository to Helm: + +```shell +helm repo add eks https://aws.github.io/eks-charts +``` + +To install the chart with the release name `efa` in the `kube-system` namespace and default configuration: + +```shell +helm install efa ./aws-efa-k8s-device-plugin -n kube-system +``` + +# Configuration + +Paramter | Description | Default +--- | --- | --- +`image.repository` | EFA image repository | `602401143452.dkr.ecr.us-west-2.amazonaws.com/eks/aws-efa-k8s-device-plugin` +`image.tag` | EFA image tag | `v0.5.3` +`securityContext.allowPrivilegeEscalation` | Controls whether a process can gain more privilege than its parent process | `false` +`securityContext` | EFA plugin security context | `capabilities: drop: ["ALL"] runAsNonRoot: false` +`supportedInstanceLabels.keys` | Kubernetes key to interpret as instance type | `nodes.kubernetes.io/instance-type` +`supportedInstanceLabels.values` | List of instances which currently support EFA devices | `see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html#efa-instance-types` +`resources` | Resources for containers in pod | `requests.cpu: 10m requests.memory: 20Mi` +`nodeSelector` | Node labels for pod assignment | `{}` +`tolerations` | Optional deployment tolerations | `[]` +`additionalPodAnnotations` | Pod annotations to apply in addition to the default ones | `{}` +`additionalPodLabels` | Pod labels to apply in addition to the defualt ones | `{}` +`nameOverride` | Override the name of the chart | `""` +`fullnameOverride` | Override the full name of the chart | `""` +`imagePullSecrets` | Docker registry pull secret | `[]` + diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/templates/NOTES.txt b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/templates/NOTES.txt new file mode 100644 index 00000000..aa3293db --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/templates/NOTES.txt @@ -0,0 +1 @@ +EFA device plugin is installed, it can be requested as `vpc.amazonaws.com/efa` resource. \ No newline at end of file diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/templates/_helpers.tpl b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/templates/_helpers.tpl new file mode 100644 index 00000000..a454828a --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* +Expand the name of the chart. 
+*/}} +{{- define "aws-efa-k8s-device-plugin.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "aws-efa-k8s-device-plugin.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "aws-efa-k8s-device-plugin.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "aws-efa-k8s-device-plugin.labels" -}} +helm.sh/chart: {{ include "aws-efa-k8s-device-plugin.chart" . }} +{{ include "aws-efa-k8s-device-plugin.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "aws-efa-k8s-device-plugin.selectorLabels" -}} +app.kubernetes.io/name: {{ include "aws-efa-k8s-device-plugin.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "aws-efa-k8s-device-plugin.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "aws-efa-k8s-device-plugin.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/templates/daemonset.yaml b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/templates/daemonset.yaml new file mode 100644 index 00000000..6fad9cd4 --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/templates/daemonset.yaml @@ -0,0 +1,78 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: {{ include "aws-efa-k8s-device-plugin.fullname" . }} + labels: + {{- include "aws-efa-k8s-device-plugin.labels" . | nindent 4 }} +spec: + selector: + matchLabels: + name: {{ include "aws-efa-k8s-device-plugin.fullname" . }} + updateStrategy: + type: RollingUpdate + template: + metadata: + {{- if .Values.additionalPodAnnotations }} + annotations: + {{- with .Values.additionalPodAnnotations }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + labels: + name: {{ include "aws-efa-k8s-device-plugin.fullname" . }} + {{- with .Values.additionalPodLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + tolerations: + - key: CriticalAddonsOnly + operator: Exists + {{- with .Values.tolerations }} + {{- toYaml . | nindent 8 }} + {{- end }} + # Mark this pod as a critical add-on; when enabled, the critical add-on + # scheduler reserves resources for critical add-on pods so that they can + # be rescheduled after a failure. 
+ # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ + priorityClassName: "system-node-critical" + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + {{- range .Values.supportedInstanceLabels.keys }} + - matchExpressions: + - key: {{ . }} + operator: In + values: + {{- toYaml $.Values.supportedInstanceLabels.values | nindent 20 }} + {{- end }} + hostNetwork: true + containers: + - image: {{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }} + name: aws-efa-k8s-device-plugin + securityContext: + {{- toYaml .Values.securityContext | nindent 12}} + {{- with .Values.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + - name: infiniband-volume + mountPath: /dev/infiniband/ + volumes: + - name: device-plugin + hostPath: + path: /var/lib/kubelet/device-plugins + - name: infiniband-volume + hostPath: + path: /dev/infiniband/ \ No newline at end of file diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/values.yaml b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/values.yaml new file mode 100644 index 00000000..5fe7f434 --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/values.yaml @@ -0,0 +1,145 @@ +image: + repository: 602401143452.dkr.ecr.us-west-2.amazonaws.com/eks/aws-efa-k8s-device-plugin + # Overrides the image tag whose default is the chart appVersion. + tag: "v0.5.3" +securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + runAsNonRoot: false +supportedInstanceLabels: # EFA supported instances: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html#efa-instance-types + keys: + - "node.kubernetes.io/instance-type" + values: + - m5dn.24xlarge + - m5dn.metal + - m5n.24xlarge + - m5n.metal + - m5zn.12xlarge + - m5zn.metal + - m6a.48xlarge + - m6a.metal + - m6i.32xlarge + - m6i.metal + - m6id.32xlarge + - m6id.metal + - m6idn.32xlarge + - m6idn.metal + - m6in.32xlarge + - m6in.metal + - m7a.48xlarge + - m7a.metal-48xl + - m7g.16xlarge + - m7g.metal + - m7gd.16xlarge + - m7i.48xlarge + - m7i.metal-48xl + - c5n.9xlarge + - c5n.18xlarge + - c5n.metal + - c6a.48xlarge + - c6a.metal + - c6gn.16xlarge + - c6i.32xlarge + - c6i.metal + - c6id.32xlarge + - c6id.metal + - c6in.32xlarge + - c6in.metal + - c7a.48xlarge + - c7a.metal-48xl + - c7g.16xlarge + - c7g.metal + - c7gd.16xlarge + - c7gn.16xlarge + - c7i.48xlarge + - c7i.metal-48xl + - r5dn.24xlarge + - r5dn.metal + - r5n.24xlarge + - r5n.metal + - r6a.48xlarge + - r6a.metal + - r6i.32xlarge + - r6i.metal + - r6idn.32xlarge + - r6idn.metal + - r6in.32xlarge + - r6in.metal + - r6id.32xlarge + - r6id.metal + - r7a.48xlarge + - r7a.metal-48xl + - r7g.16xlarge + - r7g.metal + - r7gd.16xlarge + - r7i.48xlarge + - r7i.metal-48xl + - r7iz.32xlarge + - r7iz.metal-32xl + - x2idn.32xlarge + - x2idn.metal + - x2iedn.32xlarge + - x2iedn.metal + - x2iezn.12xlarge + - x2iezn.metal + - i3en.12xlarge + - i3en.24xlarge + - i3en.metal + - i4g.16xlarge + - i4i.32xlarge + - i4i.metal + - im4gn.16xlarge + - dl1.24xlarge + - dl2q.24xlarge + - g4dn.8xlarge + - g4dn.12xlarge + - g4dn.16xlarge + - g4dn.metal + - g5.8xlarge + - g5.12xlarge + - 
g5.16xlarge + - g5.24xlarge + - g5.48xlarge + - inf1.24xlarge + - p3dn.24xlarge + - p4d.24xlarge + - p4de.24xlarge + - p5.48xlarge + - trn1.32xlarge + - trn1n.32xlarge + - vt1.24xlarge + - hpc6a.48xlarge + - hpc6id.32xlarge + - hpc7a.12xlarge + - hpc7a.24xlarge + - hpc7a.48xlarge + - hpc7a.96xlarge + - hpc7g.4xlarge + - hpc7g.8xlarge + - hpc7g.16xlarge +resources: + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. + # limits: + # cpu: 100m + # memory: 128Mi + requests: + cpu: 10m + memory: 20Mi +nodeSelector: {} +# efa: present +tolerations: +- key: nvidia.com/gpu + operator: Exists + effect: NoSchedule +# - key: aws.amazon.com/efa +# operator: Exists +# effect: NoSchedule +additionalPodAnnotations: {} +additionalPodLabels: {} +nameOverride: "" +fullnameOverride: "" +imagePullSecrets: [] diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/Chart.yaml b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/Chart.yaml new file mode 100644 index 00000000..03e6d381 --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/Chart.yaml @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v2 +appVersion: 0.1.0 +description: Generative AI Multi-Node w/ Triton and TensorRT-LLM Guide/Tutorial +icon: https://www.nvidia.com/content/dam/en-zz/Solutions/about-nvidia/logo-and-brand/01-nvidia-logo-vert-500x200-2c50-d@2x.png +name: triton_trt-llm_multi-node_example +version: 0.1.0 diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/example_values.yaml b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/example_values.yaml new file mode 100644 index 00000000..a36dc2a0 --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/example_values.yaml @@ -0,0 +1,60 @@ + +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# See values.yaml for reference values. 
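+#
+# A typical install with this file might look like the following (the release name,
+# namespace, and chart path are placeholders -- adjust them to your environment;
+# see "3. Deploy_Triton.md" for the full deployment steps):
+#
+#   helm install multinode-deployment ./multinode_helm_chart/chart \
+#     --values ./multinode_helm_chart/chart/example_values.yaml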
+ +gpu: NVIDIA-A10G +# gpu: NVIDIA-A100-SXM4-40GB +gpuPerNode: 4 +persistentVolumeClaim: efs-claim-2 + +tensorrtLLM: + parallelism: + tensor: 4 + pipeline: 2 + +triton: + image: + name: 210086341041.dkr.ecr.us-west-2.amazonaws.com/triton_trtllm_multinode:24.08 + # name: 354625738399.dkr.ecr.us-east-1.amazonaws.com/wenhant_triton_trtllm_multinode:24.07.3 + resources: + cpu: 8 + memory: 32Gi + efa: 1 # If you don't want to enable EFA, set this to 0. + # triton_model_repo_path: /var/run/models/mixtral_8x7b_tp8_ep2_moetp4/triton_model_repo + # triton_model_repo_path: /var/run/models/llama3_8b_tp2_pp4/triton_model_repo + # triton_model_repo_path: /var/run/models/triton_repo_mixtral_8x7b_tp4_moeep2_moetp2_pp2_v11_a10g/triton_model_repo + # triton_model_repo_path: /var/run/models/triton_repo_mixtral_8x7b_tp4_moetp4_pp2_v11_a10g/triton_model_repo + # triton_model_repo_path: /var/run/models/triton_repo_mixtral_8x7b_tp4_moeep4_pp2_v11_a10g/triton_model_repo + # triton_model_repo_path: /var/run/models/triton_repo_mixtral_8x7b_pp8_v11_a10g/triton_model_repo + # triton_model_repo_path: /var/run/models/triton_repo_mixtral_8x7b_tp8_v11_a10g/triton_model_repo + # triton_model_repo_path: /var/run/models/triton_repo_mixtral_8x7b_tp4_pp2_v11_a10g/triton_model_repo + triton_model_repo_path: /var/run/models/tensorrtllm_backend/triton_model_repo + # triton_model_repo_path: /var/run/models/triton_repo_mixtral_8x22b_tp16_v11_a100/triton_model_repo + enable_nsys: false # Note if you send lots of requests, nsys report can be very large. + +logging: + tritonServer: + verbose: true + +autoscaling: + enable: true + replicas: + maximum: 2 + minimum: 1 + metric: + name: triton:queue_compute:ratio + value: 1 + diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/NOTES.txt b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/NOTES.txt new file mode 100644 index 00000000..431f844f --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/NOTES.txt @@ -0,0 +1,25 @@ +{{ $.Chart.Name }} ({{ $.Chart.Version }}) installation complete. + +Release Name: {{ $.Release.Name }} +Namespace: {{ $.Release.Namespace }} +Deployment Name: {{ $.Release.Name }} +{{- if not $.Values.kubernetes.noService }} +Service Name: {{ $.Release.Name }} +{{- end }} +{{- if $.Values.kubernetes.serviceAccount }} +ServiceAccount Name: {{ $.Release.Name }} +{{- end }} + +Helpful commands: + + $ helm status --namespace={{ $.Release.Namespace }} {{ $.Release.Name }} + $ helm get --namespace={{ $.Release.Namespace }} all {{ $.Release.Name }} + $ kubectl get --namespace={{ $.Release.Namespace }} --selector='app={{ $.Release.Name }}' deployments +,pods +{{- if not $.Values.kubernetes.noService -}} +,services +{{- end -}} +,podmonitors +{{- if $.Values.kubernetes.serviceAccount -}} +,serviceAccounts +{{- end -}} diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/deployment.yaml b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/deployment.yaml new file mode 100644 index 00000000..fb5c6a5b --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/deployment.yaml @@ -0,0 +1,249 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + name: leaderworkerset-sample +spec: + replicas: 1 + leaderWorkerTemplate: + size: 2 + leaderTemplate: + metadata: + name: {{ $.Release.Name }}-leader + labels: + app: {{ $.Release.Name }} + {{- with $.Values.kubernetes }} + {{- with .labels }} + {{ toYaml . | indent 4 }} + {{- end }} + {{- end }} + role: leader + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: nvidia.com/gpu + operator: Exists + - key: nvidia.com/gpu.product + operator: In + values: + - {{ required "Property '.gpu' is required." $.Values.gpu }} + containers: + - name: triton + command: + - python3 + - ./server.py + - leader + - --triton_model_repo_dir={{ $.Values.triton.triton_model_repo_path }} + - --namespace={{ $.Release.Namespace }} + - --pp={{ $.Values.tensorrtLLM.parallelism.pipeline }} + - --tp={{ $.Values.tensorrtLLM.parallelism.tensor }} + - --gpu_per_node={{ $.Values.gpuPerNode }} + - --stateful_set_group_key=$(GROUP_KEY) + {{- with $.Values.logging }} + {{- with .tritonServer }} + {{- if .useIso8601 }} + - --iso8601 + {{- end }} + {{- if .verbose }} + - --verbose + {{- end }} + {{- end }} + {{- end }} + {{- with $.Values.triton }} + {{ if .enable_nsys }} + - --enable_nsys + {{- end }} + {{- end }} + env: + - name: GROUP_KEY + valueFrom: + fieldRef: + fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/group-key'] + {{- with $.Values.logging }} + {{- with .tritonServer }} + {{- if .verbose }} + - name: NCCL_DEBUG + value: INFO + {{- end }} + {{- end }} + {{- end }} + image: {{ $.Values.triton.image.name }} + imagePullPolicy: IfNotPresent + livenessProbe: + failureThreshold: 15 + httpGet: + path: /v2/health/live + port: 8000 + initialDelaySeconds: 10 + periodSeconds: 2 + successThreshold: 1 + ports: + - containerPort: 8000 + name: http + - containerPort: 8001 + name: grpc + - containerPort: 8002 + name: metrics + readinessProbe: + failureThreshold: 15 + httpGet: + path: /v2/health/ready + port: 8000 + initialDelaySeconds: 15 + periodSeconds: 2 + successThreshold: 1 + resources: + limits: + cpu: {{ $.Values.triton.resources.cpu }} + ephemeral-storage: 1Gi + memory: {{ $.Values.triton.resources.memory }} + nvidia.com/gpu: {{ $.Values.gpuPerNode }} + vpc.amazonaws.com/efa: {{ $.Values.triton.resources.efa }} + requests: + cpu: {{ $.Values.triton.resources.cpu }} + ephemeral-storage: 1Gi + memory: {{ $.Values.triton.resources.memory }} + nvidia.com/gpu: {{ $.Values.gpuPerNode }} + vpc.amazonaws.com/efa: {{ $.Values.triton.resources.efa }} + startupProbe: + failureThreshold: 60 + httpGet: + path: /v2/health/ready + port: 8000 + initialDelaySeconds: 60 + periodSeconds: 15 + successThreshold: 1 + volumeMounts: + - mountPath: /var/run/models + name: model-repository + # readOnly: true + - mountPath: /dev/shm + name: dshm + {{- with $.Values }} + {{- with .pullSecrets }} + imagePullSecrets: + {{ toYaml . 
| indent 6 }} + {{- end }} + {{- end }} + # restartPolicy: Always + serviceAccountName: {{ $.Release.Name }} + terminationGracePeriodSeconds: 30 + tolerations: + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + {{- with $.Values.kubernetes }} + {{- with .tolerations }} + {{ toYaml . | indent 6 }} + {{- end }} + {{- end }} + volumes: + - name: model-repository + persistentVolumeClaim: + claimName: {{ $.Values.persistentVolumeClaim }} + # readOnly: false + - name: dshm + emptyDir: + medium: Memory + sizeLimit: 256Gi + workerTemplate: + metadata: + labels: + app: {{ $.Release.Name }} + {{- with $.Values.kubernetes }} + {{- with .labels }} + {{ toYaml . | indent 4 }} + {{- end }} + {{- end }} + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: nvidia.com/gpu + operator: Exists + - key: nvidia.com/gpu.product + operator: In + values: + - {{ required "Property '.gpu' is required." $.Values.gpu }} + containers: + - name: worker + command: + - python3 + - ./server.py + - worker + - --triton_model_repo_dir={{ $.Values.triton.triton_model_repo_path }} + env: + {{- with $.Values.logging }} + {{- with .tritonServer }} + {{- if .verbose }} + - name: NCCL_DEBUG + value: INFO + {{- end }} + {{- end }} + {{- end }} + image: {{ $.Values.triton.image.name }} + imagePullPolicy: IfNotPresent + resources: + limits: + cpu: {{ $.Values.triton.resources.cpu }} + ephemeral-storage: 4Gi + memory: {{ $.Values.triton.resources.memory }} + nvidia.com/gpu: {{ $.Values.gpuPerNode }} + vpc.amazonaws.com/efa: {{ $.Values.triton.resources.efa }} + requests: + cpu: {{ $.Values.triton.resources.cpu }} + ephemeral-storage: 4Gi + memory: {{ $.Values.triton.resources.memory }} + nvidia.com/gpu: {{ $.Values.gpuPerNode }} + vpc.amazonaws.com/efa: {{ $.Values.triton.resources.efa }} + volumeMounts: + - mountPath: /var/run/models + name: model-repository + # readOnly: true + - mountPath: /dev/shm + name: dshm + {{- with $.Values }} + {{- with .pullSecrets }} + imagePullSecrets: + {{ toYaml . | indent 6 }} + {{- end }} + {{- end }} + # restartPolicy: Always + serviceAccountName: {{ $.Release.Name }} + terminationGracePeriodSeconds: 30 + tolerations: + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + {{- with $.Values.kubernetes }} + {{- with .tolerations }} + {{ toYaml . | indent 6 }} + {{- end }} + {{- end }} + volumes: + - name: model-repository + persistentVolumeClaim: + claimName: {{ $.Values.persistentVolumeClaim }} + # readOnly: true + - name: dshm + emptyDir: + medium: Memory + sizeLimit: 256Gi diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/hpa.yaml b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/hpa.yaml new file mode 100644 index 00000000..3ddc98ad --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/hpa.yaml @@ -0,0 +1,42 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ $.Release.Name }} + labels: + app: {{ $.Release.Name }} + app.kubernetes.io/component: autoscaler + release: prometheus +{{- with $.Values.kubernetes }} +{{- with .labels }} +{{ toYaml . | indent 4 }} +{{- end }} +{{- end }} +spec: + maxReplicas: {{ $.Values.autoscaling.replicas.maximum }} + minReplicas: {{ $.Values.autoscaling.replicas.minimum }} + metrics: + - type: Pods + pods: + metric: + name: {{ $.Values.autoscaling.metric.name }} + target: + type: AverageValue + averageValue: {{ $.Values.autoscaling.metric.value }} + scaleTargetRef: + apiVersion: leaderworkerset.x-k8s.io/v1 + kind: LeaderWorkerSet + name: leaderworkerset-sample diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/pod-monitor.yaml b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/pod-monitor.yaml new file mode 100644 index 00000000..6563609b --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/pod-monitor.yaml @@ -0,0 +1,34 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: {{ $.Release.Name }} + labels: + app: {{ $.Release.Name }} + app.kubernetes.io/component: monitor + release: prometheus +{{- with $.Values.kubernetes }} +{{- with .labels }} +{{ toYaml . | indent 4 }} +{{- end }} +{{- end }} +spec: + selector: + matchLabels: + role: leader + podMetricsEndpoints: + - port: metrics + path: /metrics diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/rbac.yaml b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/rbac.yaml new file mode 100644 index 00000000..0dc61687 --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/rbac.yaml @@ -0,0 +1,78 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{- if not $.Values.kubernetes.service_account }} +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + labels: +{{- with $.Values.kubernetes }} +{{- with .labels }} +{{ toYaml . 
| indent 4 }} +{{- end }} +{{- end }} + name: {{ $.Release.Name }} +rules: +- apiGroups: + - '' + - apps + - batch + resources: + - deployments + - jobs + - pods + - pods/status + - services + verbs: + - get + - list +- apiGroups: [''] + resources: + - pods/exec + verbs: + - create + +--- + +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: +{{- with $.Values.kubernetes }} +{{- with .labels }} +{{ toYaml . | indent 4 }} +{{- end }} +{{- end }} + name: {{ $.Release.Name }} + +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + labels: +{{- with $.Values.kubernetes }} +{{- with .labels }} +{{ toYaml . | indent 4 }} +{{- end }} +{{- end }} + name: {{ $.Release.Name }} +subjects: +- kind: ServiceAccount + name: {{ $.Release.Name }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ $.Release.Name }} +{{- end }} diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/service.yaml b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/service.yaml new file mode 100644 index 00000000..31ed5db1 --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/templates/service.yaml @@ -0,0 +1,44 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{- if $.Values.kubernetes.noService }} +# Chart values optioned to not create a service. Service not created. +{{- else }} +apiVersion: v1 +kind: Service +metadata: + name: {{ $.Release.Name }} + labels: + app: {{ $.Release.Name }} + app.kubernetes.io/component: service +{{- with $.Values.kubernetes }} +{{- with .labels }} +{{ toYaml . | indent 4 }} +{{- end }} +{{- end }} +spec: + type: LoadBalancer + ports: + - name: http + port: 8000 + targetPort: http + - name: grpc + port: 8001 + targetPort: grpc + - name: metrics + port: 8002 + targetPort: metrics + selector: + role: leader +{{- end }} diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/values.schema.json b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/values.schema.json new file mode 100644 index 00000000..422781b0 --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/values.schema.json @@ -0,0 +1,282 @@ +{ + "$schema": "https://json-schema.org/draft-07/schema#", + "copyright": [ + "# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.", + "# NVIDIA CORPORATION and its licensors retain all intellectual property", + "# and proprietary rights in and to this software, related documentation", + "# and any modifications thereto. Any use, reproduction, disclosure or", + "# distribution of this software and related documentation without an express", + "# license agreement from NVIDIA CORPORATION is strictly prohibited." 
+ ], + "properties": { + "gpu": { + "description": "Value must match the node's `.metadata.labels.nvidia.com/gpu.product` label.", + "type": "string" + }, + "gpuPerNode": { + "description": "GPUs per node", + "type": "integer" + }, + "persistentVolumeClaim": { + "description": "Persistent volume claim where model content will be persisted.", + "type": "string" + }, + "pullSecret": { + "description": "Name of the secret to pull image.", + "oneOf": [ + { "type": "string" }, + { "type": "null" } + ] + }, + "tensorrtLlm": { + "description": "Configuration options related to the conversion of a non-optimized model into TensorRT format.", + "properties": { + "parallelism": { + "description": "TensorRT-LLM format, specifically if/how the model is partitioned for deployment to multiple GPUs.", + "oneOf": [ + { + "properties": { + "pipeline": { + "oneOf": [ + { + "minimum": 1, + "type": "integer" + }, + { "type": "null" } + ] + }, + "tensor": { + "oneOf": [ + { + "minimum": 1, + "type": "integer" + }, + { "type": "null" } + ] + } + }, + "type": "object" + }, + { "type": "null" } + ] + } + }, + "type": "object" + }, + "triton": { + "description": "Configuration options for Triton Server.", + "properties": { + "image": { + "description": "Configuration options related to the container image for Triton Server.", + "properties": { + "name": { + "description": "Name of the container image containing the version of Triton Server to be used.", + "type": "string" + } + }, + "required": [ "name" ], + "type": "object" + }, + "resources": { + "description": "Configuration options managing the resources assigned to individual Triton Server instances. ", + "oneOf": [ + { + "properties": { + "cpu": { + "description": "Number of logical CPU cores reserved for, and assigned to each instance of Triton Server.", + "oneOf": [ + { + "minimum": 1, + "type": "integer" + }, + { + "pattern": "^\\d+m$", + "type": "string" + }, + { "type": "null" } + ] + }, + "memory": { + "description": "Amount of CPU-visible system memory allocated to, and reserved for each instance of Triton Server.", + "oneOf": [ + { + "pattern": "^\\d+[GKMgkm]i$", + "type": "string" + }, + { "type": "null" } + ] + }, + "efa": { + "description": "Number of EFA adapters in your nodes. If you don't want to enable EFA, simply set it to 0.", + "type": "integer" + } + }, + "type": "object" + }, + { "type": "null" } + ] + }, + "triton_model_repo_path": { + "description": "Triton model repo path", + "type": "string" + }, + "enable_nsys": { + "description": "Enable profiling on Triton server. Note if you send lots of requests, nsys report can be very large.", + "type": "boolean" + } + }, + "required": [ + "image", + "triton_model_repo_path" + ], + "type": "object" + }, + "logging": { + "description": "Configuration options related to how various components generate logs.", + "properties": { + "tritonServer": { + "description": "Logging configuration options specific to Triton Server.", + "oneOf": [ + { + "properties": { + "useIso8601": { + "description": "When `true` Triton Server logs are formatted using the ISO8601 standard; otherwise Triton's default format will be used. 
", + "oneOf": [ + { "type": "boolean" }, + { "type": "null" } + ] + }, + "verbose": { + "description": "When `true` Triton Server uses verbose logging; otherwise standard logging is used.", + "oneOf": [ + { "type": "boolean" }, + { "type": "null" } + ] + } + }, + "type": "object" + }, + { "type": "null" } + ] + } + } + }, + "kubernetes": { + "description": "Configurations option related to the Kubernetes objects created by the chart.", + "oneOf": [ + { + "properties": { + "labels": { + "description": "Optional set of labels to be applied to created Kubernetes objects.", + "oneOf": [ + { "type": "object" }, + { "type": "null" } + ] + }, + "noService": { + "description": "When `false`, a service will not be created when the chart is installed; otherwise a service will be created.", + "oneOf": [ + { "type": "boolean" }, + { "type": "null" } + ] + }, + "tolerations": { + "description": "Tolerations applied to every pod deployed as part of this deployment.", + "oneOf": [ + { + "items": [ + { + "description": "Toleration applied to every pod deployed as part of this deployment.", + "type": "object" + }, + { "type": "null" } + ], + "type": "array" + }, + { "type": "null" } + ] + } + }, + "type": "object" + }, + { "type": "null" } + ] + } + }, + "autoscaling": { + "description": "Configuration options for automatic scaling of Triton Server deployments.", + "oneOf": [ + { + "properties": { + "enable": { + "description": "Determines if autoscaling is enabled for deployment or not.", + "oneOf": [ + { "type": "boolean" }, + { "type": "null" } + ] + }, + "metric": { + "description": "Metric used to determine autoscaling decisions.", + "oneOf": [ + { + "properties": { + "name": { + "description": "Name of the metric monitored.", + "oneOf": [ + { "type": "string" }, + { "type": "null" } + ] + }, + "value": { + "description": "Threshold or targeted value used to determine the number of replicas concurrently deployed." + } + }, + "type": "object" + }, + { "type": "null" } + ] + }, + "replicas": { + "description": "Controls the number of Triton Server replicas are deployed.", + "oneOf": [ + { + "properties": { + "maximum": { + "description": "Upper bound of the number of Triton Server replicas deployed concurrently.", + "oneOf": [ + { + "minimum": 1, + "type": "integer" + }, + { "type": "null" } + ] + }, + "minimum": { + "description": "Lower bound of the number of Triton Server replicas deployed concurrently.", + "oneOf": [ + { + "minimum": 1, + "type": "integer" + }, + { "type": "null" } + ] + } + }, + "type": "object" + }, + { "type": "null" } + ] + } + }, + "type": "object" + }, + { "type": "null" } + ] + }, + "required": [ + "gpu", + "gpuPerNode", + "persistentVolumeClaim", + "triton" + ] +} diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/values.yaml b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/values.yaml new file mode 100644 index 00000000..32a2360a --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/values.yaml @@ -0,0 +1,105 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Value must match the node's `.metadata.labels.nvidia.com/gpu.product` label. +# Run 'kubectl get nodes' to find node names. +# Run 'kubectl describe node ' to inspect a node's labels. +# Example values: NVIDIA-A100-SXM4-40GB, NVIDIA-A10G, Tesla-V100-SXM2-16GB, Tesla-T4 +gpu: # (required) + +# Example values: 1, 4, 8 +gpuPerNode: # (required) + +# Persistent volume claim where model content will be persisted. +# Expected to support read/write many access. +persistentVolumeClaim: # (required) + +# Name of the secret to pull image +pullSecret: # (optional) + +# Configuration options related to the AI model to be deployed. +tensorrtLLM: # (optional) + # TensorRT-LLM format, specifically if/how the model is partitioned for deployment to multiple GPUs. + parallelism: # (optional) + # Pipeline parallelism involves sharding the model (vertically) into chunks, where each chunk comprises a + # subset of layers that is executed on a separate device. + # The main limitation of this method is that, due to the sequential nature of the processing, some devices or + # layers may remain idle while waiting for the output. + pipeline: 1 # (default: 1) + # Tensor parallelism involves sharding (horizontally) individual layers of the model into smaller, + # independent blocks of computation that can be executed on different devices. + # Attention blocks and multi-layer perceptron (MLP) layers are major components of transformers that can take advantage of + # tensor parallelism. + # In multi-head attention blocks, each head or group of heads can be assigned to a different device so they can be computed + # independently and in parallel. + tensor: 1 # (default: 1) + +# Configuration options for Triton Server. +triton: # (required) + # Configuration options related to the container image for Triton Server. + image: # (required) + # Name of the container image containing the version of Triton Server to be used. + name: 210086341041.dkr.ecr.us-west-2.amazonaws.com/triton_trtllm_multinode:24.07 + # Configuration options managing the resources assigned to individual Triton Server instances. + resources: # (optional) + # Number of logical CPU cores reserved for, and assigned to each instance of Triton Server. + cpu: 4 # (default: 4) + # Amount of CPU-visible system memory allocated to, and reserved for each instance of Triton Server. + memory: 0 # (default: 32Gi) + # Amount of EFA adapters in your nodes. If you don't want to enable EFA, simply set it to 0. + efa: 1 # (default: 1) + triton_model_repo_path: # (required) + # Enable profiling on Triton server. Note if you send lots of requests, nsys report can be very large. + enable_nsys: false # (default: false) + +# Configuration options related to how various components generate logs. +logging: # (optional) + # Logging configuration options specific to Triton Server. + tritonServer: + # When `true` Triton Server logs are formatted using the ISO8601 standard; otherwise Triton's default format will be used. + useIso8601: false # (default: false) + # When `true` Triton Server uses verbose logging; otherwise standard logging is used. 
+    verbose: false # (default: false)
+
+# Configuration options related to the Kubernetes objects created by the chart.
+kubernetes: # (optional)
+  # Optional set of labels to be applied to created Kubernetes objects.
+  # These labels can be used for association with a preexisting service object.
+  labels: # (optional)
+  #  customLabel: exampleValue
+  # When `false`, a service will not be created when the chart is installed; otherwise a service will be created.
+  noService: false # (default: false)
+  # Name of the service account to use when deploying components.
+  # When not provided, a service account will be created.
+  serviceAccount: 0 # (optional)
+  # Tolerations applied to every pod deployed as part of this deployment.
+  # Template already includes `nvidia.com/gpu=present:NoSchedule`.
+  tolerations: # (optional)
+
+# Configuration options for automatic scaling of Triton Server deployments.
+autoscaling: # (optional)
+  # Determines if autoscaling is enabled for the deployment or not.
+  enable: true # (default: true)
+  # Controls the number of Triton Server replicas deployed.
+  replicas: # (optional)
+    # Upper bound of the number of Triton Server replicas deployed concurrently.
+    maximum: 2 # (default: 4)
+    # Lower bound of the number of Triton Server replicas deployed concurrently.
+    minimum: 1 # (default: 1)
+  # Metric used to determine autoscaling decisions.
+  metric: # (optional)
+    # Name of the metric monitored.
+    name: triton:queue_compute:ratio # (default: triton:queue_compute:ratio)
+    # Threshold or targeted value used to determine the number of replicas concurrently deployed.
+    value: 1 # (default: 1)
diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/containers/README.md b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/containers/README.md
new file mode 100644
index 00000000..1c8f1835
--- /dev/null
+++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/containers/README.md
@@ -0,0 +1,26 @@
+
+
+
+# Container Generation
+
+The files in this folder are used to create the custom container image for the multi-node Triton + TRT-LLM EKS deployment, including installation of the EFA components.
+
+Run the following command to create the container image, replacing `<image_name>:<tag>` with the image name and tag you intend to use (for example, your ECR repository URI):
+
+```bash
+docker build --file ./triton_trt_llm.containerfile --tag <image_name>:<tag> .
+```
diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/containers/kubessh b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/containers/kubessh
new file mode 100755
index 00000000..4eb88dab
--- /dev/null
+++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/containers/kubessh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
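+
+# kubessh is used by server.py as the mpirun remote-shell agent
+# (`-mca plm_rsh_agent kubessh`): instead of opening an SSH session to a host,
+# it runs the given command inside the named worker pod via `kubectl exec`.
+#
+#   usage: kubessh <pod-name> <command...>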
+ +pod=$1 +shift +kubectl exec $pod -- /bin/sh -c "$*" diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/containers/server.py b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/containers/server.py new file mode 100644 index 00000000..d14e74b1 --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/containers/server.py @@ -0,0 +1,279 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import shutil +import signal +import subprocess +import sys +import time + +ERROR_EXIT_DELAY = 15 +ERROR_CODE_FATAL = 255 +ERROR_CODE_USAGE = 253 +EXIT_SUCCESS = 0 +DELAY_BETWEEN_QUERIES = 2 + +def die(exit_code: int): + if exit_code is None: + exit_code = ERROR_CODE_FATAL + + write_error(f" Waiting {ERROR_EXIT_DELAY} second before exiting.") + # Delay the process' termination to provide a small window for administrators to capture the logs before it exits and restarts. + time.sleep(ERROR_EXIT_DELAY) + + exit(exit_code) + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument("mode", type=str, choices=["leader", "worker"]) + parser.add_argument("--triton_model_repo_dir", type=str, default=None,required=True,help="Directory that contains Triton Model Repo to be served") + parser.add_argument("--pp", type=int, default=1, help="Pipeline parallelism.") + parser.add_argument("--tp", type=int, default=1, help="Tensor parallelism.") + parser.add_argument("--iso8601", action="count", default=0) + parser.add_argument("--verbose", action="store_true", help="Enable verbose output") + parser.add_argument( + "--namespace", + type=str, + default="default", + help="Namespace of the Kubernetes deployment.", + ) + parser.add_argument( + "--gpu_per_node", + type=int, + help="How many gpus are in each pod/node (We launch one pod per node). 
Only required in leader mode.", + ) + parser.add_argument("--stateful_set_group_key",type=str,default=None,help="Value of leaderworkerset.sigs.k8s.io/group-key, Leader uses this to gang schedule and its only needed in leader mode") + parser.add_argument("--enable_nsys", action="store_true", help="Enable Triton server profiling") + + return parser.parse_args() + +def run_command(cmd_args: [str], omit_args: [int] = None): + command = "" + + for i, arg in enumerate(cmd_args): + command += " " + if omit_args is not None and i in omit_args: + command += "*****" + else: + command += arg + + write_output(f">{command}") + write_output(" ") + + return subprocess.call(cmd_args, stderr=sys.stderr, stdout=sys.stdout) + +def signal_handler(sig, frame): + write_output(f"Signal {sig} detected, quitting.") + exit(EXIT_SUCCESS) + +def wait_for_workers(num_total_pod: int, args): + if num_total_pod is None or num_total_pod <= 0: + raise RuntimeError("Argument `world_size` must be greater than zero.") + + write_output("Begin waiting for worker pods.") + + cmd_args = [ + "kubectl", + "get", + "pods", + "-n", + f"{args.namespace}", + "-l", + f"leaderworkerset.sigs.k8s.io/group-key={args.stateful_set_group_key}", + "--field-selector", + "status.phase=Running", + "-o", + "jsonpath='{.items[*].metadata.name}'", + ] + command = " ".join(cmd_args) + + workers = [] + + while len(workers) < num_total_pod: + time.sleep(DELAY_BETWEEN_QUERIES) + + if args.verbose: + write_output(f"> {command}") + + output = subprocess.check_output(cmd_args).decode("utf-8") + + if args.verbose: + write_output(output) + + output = output.strip("'") + + workers = output.split(" ") + + if len(workers) < num_total_pod: + write_output( + f"Waiting for worker pods, {len(workers)} of {num_total_pod} ready." + ) + else: + write_output(f"{len(workers)} of {num_total_pod} workers ready.") + + write_output(" ") + + if workers is not None and len(workers) > 1: + workers.sort() + + return workers + +def write_output(message: str): + print(message, file=sys.stdout, flush=True) + +def write_error(message: str): + print(message, file=sys.stderr, flush=True) + +def do_leader(args): + write_output(f"Server is assuming each node has {args.gpu_per_node} GPUs. To change this, use --gpu_per_node") + + world_size = args.tp * args.pp + + if world_size <= 0: + raise Exception( + "usage: Options --tp and --pp must both be equal to or greater than 1." + ) + + write_output(f"Executing Leader (world size: {world_size})") + + workers = wait_for_workers(world_size / args.gpu_per_node, args) + + if len(workers) != (world_size / args.gpu_per_node): + write_error(f"fatal: {len(workers)} found, expected {world_size / args.gpu_per_node}.") + die(ERROR_EXIT_DELAY) + + workers_with_mpi_slots = [worker + f":{args.gpu_per_node}" for worker in workers] + + if args.enable_nsys: + cmd_args = [ + "/var/run/models/nsight-systems-cli-DVS/bin/nsys", + "profile", + "--force-overwrite", + "true", + "-t", + "cuda,nvtx", + "--enable", + "efa_metrics", + "-o", + "/var/run/models/nsys_report", + "/opt/amazon/openmpi/bin/mpirun", + "--allow-run-as-root", + ] + else: + cmd_args = [ + "/opt/amazon/openmpi/bin/mpirun", + "--allow-run-as-root", + ] + + if args.verbose: + cmd_args += ["--debug-devel"] + + cmd_args += [ + "--report-bindings", + "-mca", + "plm_rsh_agent", + "kubessh", + "-np", + f"{world_size}", + "--host", + ",".join(workers_with_mpi_slots), + ] + + # Add per node command lines separated by ':'. 
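+    # (Each rank below is one tritonserver process in the same MPI job: rank 0, on the
+    # leader pod, keeps HTTP/GRPC and metrics enabled so it can serve requests and be
+    # scraped by Prometheus, while the remaining ranks disable those frontends and only
+    # load the tensorrt_llm model, acting purely as workers in the distributed run.)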
+ for i in range(world_size): + if i != 0: + cmd_args += [":"] + + cmd_args += [ + "-n", + "1", + "tritonserver", + "--allow-cpu-metrics=false", + "--allow-gpu-metrics=false", + "--disable-auto-complete-config", + f"--id=rank{i}", + "--model-load-thread-count=2", + f"--model-repository={args.triton_model_repo_dir}", + ] + + # Rank0 node needs to support metrics collection and web services. + if i == 0: + cmd_args += [ + "--allow-metrics=true", + "--metrics-interval-ms=1000", + ] + + if args.verbose: + cmd_args += ["--log-verbose=2"] + + if args.iso8601 > 0: + cmd_args += ["--log-format=ISO8601"] + + # Rank(N) nodes can disable metrics, web services, and logging. + else: + cmd_args += [ + "--allow-http=false", + "--allow-grpc=false", + "--allow-metrics=false", + "--model-control-mode=explicit", + "--load-model=tensorrt_llm", + "--log-info=false", + "--log-warning=false", + ] + + result = run_command(cmd_args) + + if result != 0: + die(result) + + exit(result) + +def do_worker(args): + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + + write_output("Worker paused awaiting SIGINT or SIGTERM.") + signal.pause() + +def main(): + write_output("Reporting system information.") + run_command(["whoami"]) + run_command(["cgget", "-n", "--values-only", "--variable memory.limit_in_bytes", "/"]) + run_command(["nvidia-smi"]) + + args = parse_arguments() + if args.triton_model_repo_dir is None: + raise Exception(f"--triton_model_repo_dir is required") + + if args.verbose: + write_output(f"Triton model repository is at:'{args.triton_model_repo_dir}'") + + if args.mode == "leader": + if args.gpu_per_node is None: + raise Exception("--gpu_per_node is required for leader mode") + if args.stateful_set_group_key is None: + raise Exception("--stateful_set_group_key is required for leader mode") + do_leader(args) + elif args.mode == "worker": + do_worker(args) + else: + write_error(f"usage: server.py [].") + write_error(f' Invalid mode ("{args.mode}") provided.') + write_error(f' Supported values are "init" or "exec".') + die(ERROR_CODE_USAGE) + +if __name__ == '__main__': + main() diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/containers/triton_trt_llm.containerfile b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/containers/triton_trt_llm.containerfile new file mode 100644 index 00000000..21ee6fd5 --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/containers/triton_trt_llm.containerfile @@ -0,0 +1,167 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +ARG BASE_CONTAINER_IMAGE=nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 +FROM ${BASE_CONTAINER_IMAGE} + +ENV EFA_INSTALLER_VERSION=1.33.0 +ENV AWS_OFI_NCCL_VERSION=1.9.2-aws +ENV NCCL_VERSION=2.21.5-1 +ENV NCCL_TESTS_VERSION=2.13.9 + +# Set a set of useful labels. 
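+# The "base" label records which Triton base image was used; it can be changed at build
+# time with, for example, --build-arg BASE_CONTAINER_IMAGE=<other Triton TRT-LLM image>
+# (hypothetical override, shown only as an illustration).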
+LABEL "base"="${BASE_CONTAINER_IMAGE}" +LABEL "role"="server" + +# Stop APT (Debian package manager) from complaining about interactivity. +ENV DEBIAN_FRONTEND=noninteractive +# Set additional environment values that make usage more pleasant. +ENV TERM=xterm-256color + +RUN apt update \ + && apt install --yes \ + apt-transport-https \ + ca-certificates \ + curl \ + gnupg \ + cgroup-tools \ + && rm -rf /var/lib/apt/lists/* + +# Install kubectl because server.py script depends on it. +# Step 1: acquire the Kubernetes APT GPG key. +RUN curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.30/deb/Release.key \ + | gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg \ + && chmod 644 /etc/apt/keyrings/kubernetes-apt-keyring.gpg + +# Step 2: Acquire the API sources list for Kubernetes. +RUN echo 'deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.30/deb/ /' \ + | tee /etc/apt/sources.list.d/kubernetes.list \ + && chmod 644 /etc/apt/sources.list.d/kubernetes.list + +# Step 3: Install kubectl. +RUN apt update \ + && apt install --yes \ + kubectl \ + && apt autoremove --yes \ + && apt purge --yes \ + && rm -rf /var/lib/apt/lists/* + +############################################### +#### For EFA and NCCL Test +RUN apt-get update -y +RUN apt-get remove -y --allow-change-held-packages \ + libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1 + +RUN rm -rf /opt/hpcx/ompi \ + && rm -rf /usr/local/mpi \ + && rm -fr /opt/hpcx/nccl_rdma_sharp_plugin \ + && ldconfig +ENV OPAL_PREFIX= +RUN apt-get install -y --allow-unauthenticated \ + git \ + gcc \ + vim \ + kmod \ + openssh-client \ + openssh-server \ + build-essential \ + curl \ + autoconf \ + libtool \ + gdb \ + automake \ + cmake \ + apt-utils \ + libhwloc-dev \ + aptitude && \ + apt autoremove -y + +RUN mkdir -p /var/run/sshd +RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ + echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ + sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config + +# NCCL +RUN apt-get remove -y libnccl2 libnccl-dev \ + && cd /tmp \ + && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ + && cd nccl \ + && make -j src.build BUILDDIR=/usr/local \ + NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90" \ + && rm -rf /tmp/nccl + +# EFA +RUN apt-get update && \ + apt-get install -y libhwloc-dev && \ + cd /tmp && \ + curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \ + tar -xf aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \ + cd aws-efa-installer && \ + ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify && \ + ldconfig && \ + rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/* + +# NCCL EFA Plugin +RUN mkdir -p /tmp && \ + cd /tmp && \ + curl -LO https://github.com/aws/aws-ofi-nccl/archive/refs/tags/v${AWS_OFI_NCCL_VERSION}.tar.gz && \ + tar -xzf /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \ + rm /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \ + mv aws-ofi-nccl-${AWS_OFI_NCCL_VERSION} aws-ofi-nccl && \ + cd /tmp/aws-ofi-nccl && \ + ./autogen.sh && \ + ./configure --prefix=/opt/amazon/efa \ + --with-libfabric=/opt/amazon/efa \ + --with-cuda=/usr/local/cuda \ + --enable-platform-aws \ + --with-mpi=/opt/amazon/openmpi && \ + make -j$(nproc) install && \ + rm -rf /tmp/aws-ofi/nccl + +# NCCL ENV setup +RUN echo "/usr/local/lib" >> /etc/ld.so.conf.d/local.conf && 
\ + echo "/opt/amazon/openmpi/lib" >> /etc/ld.so.conf.d/efa.conf && \ + ldconfig + +ENV OMPI_MCA_pml=^cm,ucx \ + OMPI_MCA_btl=tcp,self \ + OMPI_MCA_btl_tcp_if_exclude=lo,docker0 \ + OPAL_PREFIX=/opt/amazon/openmpi \ + NCCL_SOCKET_IFNAME=^docker,lo + +# NCCL-tests +RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \ + && cd /opt/nccl-tests \ + && git checkout v${NCCL_TESTS_VERSION} \ + && make MPI=1 \ + MPI_HOME=/opt/amazon/openmpi \ + CUDA_HOME=/usr/local/cuda \ + # nvcc to target p5 and p4 instances + NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90" + +############################################### + +# Set the active working directory. +WORKDIR /workspace + +# Copy kubessh script w/ executable permissions for everyone. +# This enables the script to be executed no matter the user the container is run as. +# This works around the issue of the file being non-executable when the container is build on a Windows host. +COPY --chmod=555 kubessh . +COPY server.py . + +RUN apt list --installed \ + && pip list --version + +ENTRYPOINT [ "/bin/bash" ] diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/gen_ai_perf.yaml b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/gen_ai_perf.yaml new file mode 100644 index 00000000..9e125767 --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/gen_ai_perf.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: Pod +metadata: + name: gen-ai-perf + labels: + app: gen-ai-perf +spec: + containers: + - name: triton + image: nvcr.io/nvidia/tritonserver:24.07-py3-sdk + command: ["sleep", "infinity"] + volumeMounts: + - mountPath: /var/run/models + name: model-repository + volumes: + - name: model-repository + persistentVolumeClaim: + claimName: efs-claim-2 diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/nccl_test.yaml b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/nccl_test.yaml new file mode 100644 index 00000000..4ef28119 --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/nccl_test.yaml @@ -0,0 +1,96 @@ +apiVersion: kubeflow.org/v2beta1 +kind: MPIJob +metadata: + name: test-nccl +spec: + runPolicy: + cleanPodPolicy: Running + backoffLimit: 20 + slotsPerWorker: 4 + mpiReplicaSpecs: + Launcher: + replicas: 1 + template: + spec: + restartPolicy: OnFailure + containers: + - image: 210086341041.dkr.ecr.us-west-2.amazonaws.com/triton_trtllm_multinode:24.08 + name: test-nccl-launcher + env: + - name: LD_LIBRARY_PATH + value: /opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/nvidia/lib:$LD_LIBRARY_PATH + - name: PATH + value: $PATH:/opt/amazon/efa/bin:/usr/bin + command: + - /opt/amazon/openmpi/bin/mpirun + - --allow-run-as-root + - --tag-output + - -np + - "8" + - -N + - "4" + - --bind-to + - none + - -x + - PATH + - -x + - LD_LIBRARY_PATH + - -x + - FI_PROVIDER=efa + - -x + - FI_EFA_FORK_SAFE=1 + - -x + - NCCL_DEBUG=INFO + - -x + - NCCL_BUFFSIZE=8388608 + - -x + - NCCL_P2P_NET_CHUNKSIZE=524288 + - --mca + - pml + - ^cm,ucx + - --mca + - btl + - tcp,self + - --mca + - btl_tcp_if_exclude + - lo,docker0,veth_def_agent + - /opt/nccl-tests/build/all_reduce_perf + - -b + - "4" + - -e + - "16G" + - -f + - "2" + - -g + - "1" + - -c + - "1" + - -n + - "100" + Worker: + replicas: 2 + template: + spec: + nodeSelector: 
+ node.kubernetes.io/instance-type: "g5.12xlarge" + containers: + - image: 210086341041.dkr.ecr.us-west-2.amazonaws.com/triton_trtllm_multinode:24.08 + name: test-nccl-worker + volumeMounts: + - name: shmem + mountPath: /dev/shm + resources: + limits: + nvidia.com/gpu: 4 + hugepages-2Mi: 5120Mi + vpc.amazonaws.com/efa: 1 + memory: 3200Mi + requests: + nvidia.com/gpu: 4 + hugepages-2Mi: 5120Mi + vpc.amazonaws.com/efa: 1 + memory: 3200Mi + volumes: + - name: shmem + hostPath: + path: /dev/shm diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/nvidia_dcgm-exporter_values.yaml b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/nvidia_dcgm-exporter_values.yaml new file mode 100644 index 00000000..30111dad --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/nvidia_dcgm-exporter_values.yaml @@ -0,0 +1,107 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# All values are defaults unless specified otherwise. + +image: + repository: nvcr.io/nvidia/k8s/dcgm-exporter + pullPolicy: IfNotPresent + tag: 3.3.5-3.4.1-ubuntu22.04 + +arguments: + # Reduces the delay between GPU metrics collection passes to 1 second. +- --collect-interval=1000 +- --collectors=/etc/dcgm-exporter/dcp-metrics-included.csv + # Required. Enables Kubernetes specific metric collection features. +- --kubernetes=true + +serviceAccount: + create: true + annotations: { } + name: + +rollingUpdate: + maxUnavailable: 1 + maxSurge: 0 + +podLabels: { } + +podAnnotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9400" + # Required by Prometheus Operator for proper metrics collection. + release: prometheus +podSecurityContext: { } + +securityContext: + # Enables advanced GPU metrics features. Optional. + privileged: true + runAsNonRoot: false + runAsUser: 0 + capabilities: + add: [ "SYS_ADMIN" ] + +service: + enable: true + type: ClusterIP + port: 9400 + address: ":9400" + annotations: + prometheus.io/port: "9400" + prometheus.io/scrape: "true" + release: prometheus + +resources: + # Sets proper resource utilization limits, and enables Kubernetes to manage the pod's resource consumption. + # All containers should have these. + limits: + cpu: 2 + memory: 1Gi + # Sets proper resource requirements, and enables Kubernetes to account for the pod's resource consumption. + # All containers should have these. + requests: + cpu: 1 + memory: 1Gi + +serviceMonitor: + enabled: true + # Reduces the delay between metric collection passes. + interval: 1s + honorLabels: false + additionalLabels: + # Useful for helping Prometheus identify metrics collectors. + monitoring: prometheus + # Required by Prometheus to identify metrics collectors. + release: prometheus + +nodeSelector: + # Ensures that the DCGM Exporter process is only deployed to nodes with GPUs. + nvidia.com/gpu: present + +tolerations: +# Enables the DCGM Exporter pods to be deployed to nodes with GPUs.
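+# This toleration matches nodes tainted with the nvidia.com/gpu key (for example, the taint added to GPU nodes during cluster setup), so the exporter pods are not blocked from scheduling there.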
+- key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + +affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + # Ensures that the DCGM Exporter process is only deployed to nodes with GPUs. + - key: nvidia.com/gpu + operator: Exists + +kubeletPath: "/var/lib/kubelet/pod-resources" diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/nvidia_gpu-feature-discovery_daemonset.yaml b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/nvidia_gpu-feature-discovery_daemonset.yaml new file mode 100644 index 00000000..02ac2cd8 --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/nvidia_gpu-feature-discovery_daemonset.yaml @@ -0,0 +1,87 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# In the document below, the version `0.8.2` of the gpu-feature-discovery container is used. +# It is always wise to check if a new version has been released and to use the latest available release when possible. +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: gpu-feature-discovery + namespace: kube-system + labels: + app.kubernetes.io/name: gpu-feature-discovery + app.kubernetes.io/version: 0.8.2 + app.kubernetes.io/part-of: nvidia-gpu +spec: + selector: + matchLabels: + app.kubernetes.io/name: gpu-feature-discovery + app.kubernetes.io/part-of: nvidia-gpu + template: + metadata: + labels: + app.kubernetes.io/name: gpu-feature-discovery + app.kubernetes.io/version: 0.8.2 + app.kubernetes.io/part-of: nvidia-gpu + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + # The following set of node selector match expressions restrict the nodes the service's pods + # can be deployed to, to nodes that meet one or more of the following criteria: + # * Nodes with NVIDIA PCIE devices attached (10DE is NVIDIA's PCI vendor ID). + # * Nodes with NVIDIA CPUs. + # * Nodes with NVIDIA GPUs. + nodeSelectorTerms: + - matchExpressions: + - key: feature.node.kubernetes.io/pci-10de.present + operator: In + values: + - "true" + - matchExpressions: + - key: feature.node.kubernetes.io/cpu-model.vendor_id + operator: In + values: + - "NVIDIA" + - matchExpressions: + - key: "nvidia.com/gpu" + operator: In + values: + - "true" + - present + containers: + - image: nvcr.io/nvidia/gpu-feature-discovery:v0.8.2 + name: gpu-feature-discovery + volumeMounts: + - name: output-dir + mountPath: "/etc/kubernetes/node-feature-discovery/features.d" + - name: host-sys + mountPath: "/sys" + env: + - name: MIG_STRATEGY + value: none + securityContext: + privileged: true + # Enables the service's pods to be deployed on nodes with GPUs.
+ tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + volumes: + - name: output-dir + hostPath: + path: "/etc/kubernetes/node-feature-discovery/features.d" + - name: host-sys + hostPath: + path: "/sys" diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/setup_ssh_efs.yaml b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/setup_ssh_efs.yaml new file mode 100644 index 00000000..0c0b22fe --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/setup_ssh_efs.yaml @@ -0,0 +1,29 @@ +apiVersion: v1 +kind: Pod +metadata: + name: setup-ssh-efs + labels: + app: setup-ssh-efs +spec: + containers: + - name: triton + image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 + command: ["sleep", "infinity"] + resources: + limits: + nvidia.com/gpu: 4 + requests: + nvidia.com/gpu: 4 + volumeMounts: + - mountPath: /var/run/models + name: model-repository + - mountPath: /dev/shm + name: dshm + volumes: + - name: model-repository + persistentVolumeClaim: + claimName: efs-claim-2 + - name: dshm + emptyDir: + medium: Memory + sizeLimit: 32Gi diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/triton-metrics_prometheus-rule.yaml b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/triton-metrics_prometheus-rule.yaml new file mode 100644 index 00000000..be91701a --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/triton-metrics_prometheus-rule.yaml @@ -0,0 +1,38 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: triton-metrics + labels: + app.kubernetes.io/component: autoscaler + release: prometheus +spec: + groups: + - name: autoscaling + interval: 6s + rules: + # Average number of microseconds inference requests take to compute after unqueueing (not including cache hits). + - expr: rate(nv_inference_compute_infer_duration_us[1m])/clamp_min(rate(nv_inference_request_success[1m]),1) + record: triton:compute_duration:average + # Average number of microseconds inference requests spend in queue before being processed (not including cache hits). + - expr: rate(nv_inference_queue_duration_us[1m])/clamp_min(rate(nv_inference_request_success[1m]),1) + record: triton:queue_duration:average + # Average number of microseconds inference requests take in total (not including cache hits). + - expr: rate(nv_inference_request_duration_us[1m])/clamp_min(rate(nv_inference_request_success[1m]),1) + record: triton:request_duration:average + # Ratio of the time inference requests spend in queue to the time they spend in compute (not including cache hits).
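+      # This queue-to-compute ratio is likely the custom metric the chart's HPA (chart/templates/hpa.yaml) scales on; a sustained value above 1 suggests requests wait in queue longer than they take to compute.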
+ - expr: rate(nv_inference_queue_duration_us[1m])/clamp_min(rate(nv_inference_compute_infer_duration_us[1m]),1) + record: triton:queue_compute:ratio diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/p5-trtllm-cluster-config.yaml b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/p5-trtllm-cluster-config.yaml new file mode 100644 index 00000000..3e523540 --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/p5-trtllm-cluster-config.yaml @@ -0,0 +1,70 @@ +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig + +metadata: + name: trtllm-inference-cluster + region: us-east-1 + version: "1.30" + +vpc: + id: $PLACEHOLDER_VPC_ID + subnets: + private: + us-east-1a: + id: $PLACEHOLDER_SUBNET_PRIVATE_1 + public: + us-east-1a: + id: $PLACEHOLDER_SUBNET_PUBLIC_1 + + clusterEndpoints: + privateAccess: true + publicAccess: true + +cloudwatch: + clusterLogging: + enableTypes: ["*"] + +iam: + withOIDC: true + + +managedNodeGroups: + - name: cpu-node-group + instanceType: c5.2xlarge + minSize: 0 + desiredCapacity: 0 + maxSize: 1 + iam: + withAddonPolicies: + imageBuilder: true + autoScaler: true + ebs: true + efs: true + awsLoadBalancerController: true + cloudWatch: true + albIngress: true + - name: gpu-compute-node-group + instanceType: p5.48xlarge + instancePrefix: trtllm-compute-node + privateNetworking: true + efaEnabled: true + minSize: 0 + desiredCapacity: 0 + maxSize: 2 + volumeSize: 500 + # comment out capacityReservation if you do not need ODCR + capacityReservation: + capacityReservationTarget: + capacityReservationID: "cr-xxxxxxxxxxxxxx" + iam: + withAddonPolicies: + imageBuilder: true + autoScaler: true + ebs: true + efs: true + awsLoadBalancerController: true + cloudWatch: true + albIngress: true + externalDNS: true + certManager: true + autoScaler: true diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/pvc/claim.yaml b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/pvc/claim.yaml new file mode 100644 index 00000000..c095ecf7 --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/pvc/claim.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: efs-claim-2 +spec: + accessModes: + - ReadWriteMany + storageClassName: efs-sc-1 + resources: + requests: + storage: 200Gi diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/pvc/pv.yaml b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/pvc/pv.yaml new file mode 100644 index 00000000..b5a7f809 --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/pvc/pv.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: efs-pv-2 +spec: + capacity: + storage: 200Gi + volumeMode: Filesystem + accessModes: + - ReadWriteMany + persistentVolumeReclaimPolicy: Retain + storageClassName: efs-sc-1 + csi: + driver: efs.csi.aws.com + volumeHandle: fs-0d5ec63b9f8ebb2db diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/pvc/storageclass.yaml b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/pvc/storageclass.yaml new file mode 100644 index 00000000..98f38d98 --- /dev/null +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/pvc/storageclass.yaml @@ -0,0 +1,5 @@ +kind: StorageClass +apiVersion: storage.k8s.io/v1 +metadata: + name: efs-sc-1 +provisioner: efs.csi.aws.com From 2a20922925728fcd7244079bcb975e0e9b9c412d Mon Sep 17 00:00:00 2001 From: Indrajit Bhosale Date: Tue, 8 Oct 2024 11:03:36 -0700 Subject: [PATCH 2/2] Pre-commit fixes --- .pre-commit-config.yaml | 2 +- .../2. Configure_EKS_Cluster.md | 8 ++-- .../3. 
Deploy_Triton.md | 14 +++--- .../EKS_Multinode_Triton_TRTLLM/README.md | 2 +- .../aws-efa-k8s-device-plugin/README.md | 4 +- .../chart/values.schema.json | 2 +- .../multinode_helm_chart/containers/README.md | 2 +- .../multinode_helm_chart/containers/server.py | 46 +++++++++++++++---- .../p5-trtllm-cluster-config.yaml | 10 ++-- 9 files changed, 60 insertions(+), 30 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 74570a59..57c5990a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -65,7 +65,7 @@ repos: - id: check-json - id: check-toml - id: check-yaml - exclude: ^Deployment/Kubernetes/[^/]+/chart/templates/.+$ + exclude: ^Deployment/Kubernetes/.+$ - id: check-shebang-scripts-are-executable - id: end-of-file-fixer types_or: [c, c++, cuda, proto, textproto, java, python] diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/2. Configure_EKS_Cluster.md b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/2. Configure_EKS_Cluster.md index 54617261..39df1e30 100644 --- a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/2. Configure_EKS_Cluster.md +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/2. Configure_EKS_Cluster.md @@ -1,7 +1,7 @@ # Steps to set up cluster -In this guide we will set up the Kubernetes cluster for the deployment of LLMs using Triton Server and TRT-LLM. -* +In this guide we will set up the Kubernetes cluster for the deployment of LLMs using Triton Server and TRT-LLM. +* ## 1. Add node label and taint As first step we will add node labels and taints @@ -98,7 +98,7 @@ In you local browser, you should be able to see metrics in `localhost:8080`. ## 7. Install Prometheus Adapter -This allows the Triton metrics collected by Prometheus server to be available to Kuberntes' Horizontal Pod Autoscaler service. +This allows the Triton metrics collected by Prometheus server to be available to Kubernetes' Horizontal Pod Autoscaler service. ``` helm install -n monitoring prometheus-adapter prometheus-community/prometheus-adapter \ @@ -125,7 +125,7 @@ This generates custom metrics from a formula that uses the Triton metrics collec kubectl apply -f triton-metrics_prometheus-rule.yaml ``` -At this point, all metrics components should have been installed. All metrics including Triton metrics, DCGM metrics, and custom metrics should be availble to Prometheus server now. You can verify by showing all metrics in Prometheus server: +At this point, all metrics components should have been installed. All metrics including Triton metrics, DCGM metrics, and custom metrics should be available to Prometheus server now. You can verify by showing all metrics in Prometheus server: ``` kubectl -n monitoring port-forward svc/prometheus-kube-prometheus-prometheus 8080:9090 diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/3. Deploy_Triton.md b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/3. Deploy_Triton.md index cfd1b5cc..eb7f7375 100644 --- a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/3. Deploy_Triton.md +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/3. Deploy_Triton.md @@ -87,7 +87,7 @@ trtllm-build --checkpoint_dir ./converted_checkpoint \ --use_custom_all_reduce disable \ # only disable on non-NVLink machines like g5.12xlarge --max_input_len 2048 \ --max_output_len 2048 \ - --max_batch_size 4 + --max_batch_size 4 ``` ### c. 
Prepare the Triton model repository @@ -108,7 +108,7 @@ python3 tools/fill_template.py -i triton_model_repo/ensemble/config.pbtxt triton ``` > [!Note] -> Be sure to substitute the correct values for `` and `` in the example above. Keep in mind that the tokenizer, the TRT-LLM engines, and the Triton model repository shoudl be in a shared file storage between your nodes. They're required to launch your model in Triton. For example, if using AWS EFS, the values for `` and `` should be respect to the actutal EFS mount path. This is determined by your persistent-volume claim and mount path in chart/templates/deployment.yaml. Make sure that your nodes are able to access these files. +> Be sure to substitute the correct values for `` and `` in the example above. Keep in mind that the tokenizer, the TRT-LLM engines, and the Triton model repository should be in a shared file storage between your nodes. They're required to launch your model in Triton. For example, if using AWS EFS, the values for `` and `` should be respect to the actutal EFS mount path. This is determined by your persistent-volume claim and mount path in chart/templates/deployment.yaml. Make sure that your nodes are able to access these files. ## 3. Create `example_values.yaml` file for deployment @@ -177,7 +177,7 @@ kubectl logs --follow leaderworkerset-sample-0 You should output something similar to below: ``` -I0717 23:01:28.501008 300 server.cc:674] +I0717 23:01:28.501008 300 server.cc:674] +----------------+---------+--------+ | Model | Version | Status | +----------------+---------+--------+ @@ -187,7 +187,7 @@ I0717 23:01:28.501008 300 server.cc:674] | tensorrt_llm | 1 | READY | +----------------+---------+--------+ -I0717 23:01:28.501073 300 tritonserver.cc:2579] +I0717 23:01:28.501073 300 tritonserver.cc:2579] +----------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Option | Value | +----------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ @@ -347,9 +347,9 @@ kubectl logs -f $(kubectl get pods | grep launcher | cut -d ' ' -f 1) You should output something similar to below (example of 2 x g5.12xlarge): ``` -[1,0]:# out-of-place in-place +[1,0]:# out-of-place in-place [1,0]:# size count type redop root time algbw busbw #wrong time algbw busbw #wrong -[1,0]:# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) +[1,0]:# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) [1,0]: 8 2 float sum -1[1,0]: 99.10 0.00 0.00 0[1,0]: 100.6 0.00 0.00 0 [1,0]: 16 4 float sum -1[1,0]: 103.4 0.00 0.00 0[1,0]: 102.5 0.00 0.00 0 [1,0]: 32 8 float sum -1[1,0]: 103.5 0.00 0.00 0[1,0]: 102.5 0.00 0.00 0 @@ -429,7 +429,7 @@ genai-perf \ You should output something similar to below (example of Mixtral 8x7B on 2 x g5.12xlarge): ``` - LLM Metrics + LLM Metrics ┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┓ ┃ Statistic ┃ avg ┃ min ┃ max ┃ p99 ┃ p90 ┃ p75 ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━┩ diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/README.md b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/README.md index dc2c47b9..a5db4283 100644 
--- a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/README.md +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/README.md @@ -6,7 +6,7 @@ We have 1 pod per node, so the main challenge in deploying models that require m 1. **LeaderWorkerSet for launching Triton+TRT-LLM on groups of pods:** To launch Triton and TRT-LLM across nodes you use MPI to have one node launch TRT-LLM processes on all the nodes (including itself) that will make up one instance of the model. Doing this requires knowing the hostnames of all involved nodes. Consequently we need to spawn groups of pods and know which model instance group they belong to. To achieve this we use [LeaderWorkerSet](https://github.com/kubernetes-sigs/lws/tree/main), which lets us create "megapods" that consist of a group of pods - one leader pod and a specified number of worker pods - and provides pod labels identifying group membership. We configure the LeaderWorkerSet and launch Triton+TRT-LLM via MPI in [`deployment.yaml`](multinode_helm_chart/chart/templates/deployment.yaml) and [server.py](multinode_helm_chart/containers/server.py). 2. **Gang Scheduling:** Gang scheduling simply means ensuring all pods that make up a model instance are ready before Triton+TRT-LLM is launched. We show how to use `kubessh` to achieve this in the `wait_for_workers` function of [server.py](multinode_helm_chart/containers/server.py). - 3. **Autoscaling:** By default the Horizontal Pod Autoscaler (HPA) scales individual pods, but LeaderWorkerSet makes it possible to scale each "megapod". However, since these are GPU workloads we don't want to use cpu and host memory usage for autoscaling. We show how to leverage the metrics Triton Server exposes through Prometheus and set up GPU utilization recording rules in [`triton-metrics_prometheus-rule.yaml`](multinode_helm_chart/triton-metrics_prometheus-rule.yaml). We also demonstrate how to properly set up PodMonitors and an HPA in [`pod-monitor.yaml`](multinode_helm_chart/chart/templates/pod-monitor.yaml) and [`hpa.yaml`](multinode_helm_chart/chart/templates/hpa.yaml) (the key is to only scrape metrics from the leader pods). Instructions for properly setting up Prometheus and exposing GPU metrics are found in [Configure EKS Cluster and Install Dependencies](https://github.com/Wenhan-Tan/EKS_Multinode_Triton_TRTLLM/blob/main/Cluster_Setup_Steps.md). To enable deployment to dynamically add more nodes in reponse to HPA, we also setup [Cluster Autoscaler](https://github.com/Wenhan-Tan/EKS_Multinode_Triton_TRTLLM/blob/main/Cluster_Setup_Steps.md#10-install-cluster-autoscaler) + 3. **Autoscaling:** By default the Horizontal Pod Autoscaler (HPA) scales individual pods, but LeaderWorkerSet makes it possible to scale each "megapod". However, since these are GPU workloads we don't want to use cpu and host memory usage for autoscaling. We show how to leverage the metrics Triton Server exposes through Prometheus and set up GPU utilization recording rules in [`triton-metrics_prometheus-rule.yaml`](multinode_helm_chart/triton-metrics_prometheus-rule.yaml). We also demonstrate how to properly set up PodMonitors and an HPA in [`pod-monitor.yaml`](multinode_helm_chart/chart/templates/pod-monitor.yaml) and [`hpa.yaml`](multinode_helm_chart/chart/templates/hpa.yaml) (the key is to only scrape metrics from the leader pods). 
Instructions for properly setting up Prometheus and exposing GPU metrics are found in [Configure EKS Cluster and Install Dependencies](https://github.com/Wenhan-Tan/EKS_Multinode_Triton_TRTLLM/blob/main/Cluster_Setup_Steps.md). To enable deployment to dynamically add more nodes in response to HPA, we also setup [Cluster Autoscaler](https://github.com/Wenhan-Tan/EKS_Multinode_Triton_TRTLLM/blob/main/Cluster_Setup_Steps.md#10-install-cluster-autoscaler) 4. **LoadBalancer Setup:** Although there are multiple pods in each instance of the model, only one pod within each group accepts requests. We show how to correctly set up a LoadBalancer Service to allow external clients to submit requests in [`service.yaml`](multinode_helm_chart/chart/templates/service.yaml) diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/README.md b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/README.md index 6a025e8e..574ef13b 100644 --- a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/README.md +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/aws-efa-k8s-device-plugin/README.md @@ -19,7 +19,7 @@ helm install efa ./aws-efa-k8s-device-plugin -n kube-system # Configuration -Paramter | Description | Default +Parameter | Description | Default --- | --- | --- `image.repository` | EFA image repository | `602401143452.dkr.ecr.us-west-2.amazonaws.com/eks/aws-efa-k8s-device-plugin` `image.tag` | EFA image tag | `v0.5.3` @@ -31,7 +31,7 @@ Paramter | Description | Default `nodeSelector` | Node labels for pod assignment | `{}` `tolerations` | Optional deployment tolerations | `[]` `additionalPodAnnotations` | Pod annotations to apply in addition to the default ones | `{}` -`additionalPodLabels` | Pod labels to apply in addition to the defualt ones | `{}` +`additionalPodLabels` | Pod labels to apply in addition to the default ones | `{}` `nameOverride` | Override the name of the chart | `""` `fullnameOverride` | Override the full name of the chart | `""` `imagePullSecrets` | Docker registry pull secret | `[]` diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/values.schema.json b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/values.schema.json index 422781b0..19120e48 100644 --- a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/values.schema.json +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/chart/values.schema.json @@ -127,7 +127,7 @@ }, "required": [ "image", - "triton_model_repo_path" + "triton_model_repo_path" ], "type": "object" }, diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/containers/README.md b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/containers/README.md index 1c8f1835..ab24bc39 100644 --- a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/containers/README.md +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/containers/README.md @@ -17,7 +17,7 @@ # Container Generation -The files in this folder are intended to be used to create the custom container image for multi-node Triton + TRT-LLM EKS deployment including installation of EFA componenets. 
+The files in this folder are intended to be used to create the custom container image for multi-node Triton + TRT-LLM EKS deployment including installation of EFA components. Run the following command to create the container image. diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/containers/server.py b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/containers/server.py index d14e74b1..e12b24d3 100644 --- a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/containers/server.py +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/multinode_helm_chart/containers/server.py @@ -26,6 +26,7 @@ EXIT_SUCCESS = 0 DELAY_BETWEEN_QUERIES = 2 + def die(exit_code: int): if exit_code is None: exit_code = ERROR_CODE_FATAL @@ -36,10 +37,17 @@ def die(exit_code: int): exit(exit_code) + def parse_arguments(): parser = argparse.ArgumentParser() parser.add_argument("mode", type=str, choices=["leader", "worker"]) - parser.add_argument("--triton_model_repo_dir", type=str, default=None,required=True,help="Directory that contains Triton Model Repo to be served") + parser.add_argument( + "--triton_model_repo_dir", + type=str, + default=None, + required=True, + help="Directory that contains Triton Model Repo to be served", + ) parser.add_argument("--pp", type=int, default=1, help="Pipeline parallelism.") parser.add_argument("--tp", type=int, default=1, help="Tensor parallelism.") parser.add_argument("--iso8601", action="count", default=0) @@ -55,11 +63,19 @@ def parse_arguments(): type=int, help="How many gpus are in each pod/node (We launch one pod per node). Only required in leader mode.", ) - parser.add_argument("--stateful_set_group_key",type=str,default=None,help="Value of leaderworkerset.sigs.k8s.io/group-key, Leader uses this to gang schedule and its only needed in leader mode") - parser.add_argument("--enable_nsys", action="store_true", help="Enable Triton server profiling") + parser.add_argument( + "--stateful_set_group_key", + type=str, + default=None, + help="Value of leaderworkerset.sigs.k8s.io/group-key, Leader uses this to gang schedule and its only needed in leader mode", + ) + parser.add_argument( + "--enable_nsys", action="store_true", help="Enable Triton server profiling" + ) return parser.parse_args() + def run_command(cmd_args: [str], omit_args: [int] = None): command = "" @@ -75,10 +91,12 @@ def run_command(cmd_args: [str], omit_args: [int] = None): return subprocess.call(cmd_args, stderr=sys.stderr, stdout=sys.stdout) + def signal_handler(sig, frame): write_output(f"Signal {sig} detected, quitting.") exit(EXIT_SUCCESS) + def wait_for_workers(num_total_pod: int, args): if num_total_pod is None or num_total_pod <= 0: raise RuntimeError("Argument `world_size` must be greater than zero.") @@ -131,14 +149,19 @@ def wait_for_workers(num_total_pod: int, args): return workers + def write_output(message: str): print(message, file=sys.stdout, flush=True) + def write_error(message: str): print(message, file=sys.stderr, flush=True) + def do_leader(args): - write_output(f"Server is assuming each node has {args.gpu_per_node} GPUs. To change this, use --gpu_per_node") + write_output( + f"Server is assuming each node has {args.gpu_per_node} GPUs. 
To change this, use --gpu_per_node" + ) world_size = args.tp * args.pp @@ -152,9 +175,11 @@ def do_leader(args): workers = wait_for_workers(world_size / args.gpu_per_node, args) if len(workers) != (world_size / args.gpu_per_node): - write_error(f"fatal: {len(workers)} found, expected {world_size / args.gpu_per_node}.") + write_error( + f"fatal: {len(workers)} found, expected {world_size / args.gpu_per_node}." + ) die(ERROR_EXIT_DELAY) - + workers_with_mpi_slots = [worker + f":{args.gpu_per_node}" for worker in workers] if args.enable_nsys: @@ -241,6 +266,7 @@ def do_leader(args): exit(result) + def do_worker(args): signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) @@ -248,10 +274,13 @@ def do_worker(args): write_output("Worker paused awaiting SIGINT or SIGTERM.") signal.pause() + def main(): write_output("Reporting system information.") run_command(["whoami"]) - run_command(["cgget", "-n", "--values-only", "--variable memory.limit_in_bytes", "/"]) + run_command( + ["cgget", "-n", "--values-only", "--variable memory.limit_in_bytes", "/"] + ) run_command(["nvidia-smi"]) args = parse_arguments() @@ -275,5 +304,6 @@ def main(): write_error(f' Supported values are "init" or "exec".') die(ERROR_CODE_USAGE) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/p5-trtllm-cluster-config.yaml b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/p5-trtllm-cluster-config.yaml index 3e523540..a724b26f 100644 --- a/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/p5-trtllm-cluster-config.yaml +++ b/Deployment/Kubernetes/EKS_Multinode_Triton_TRTLLM/p5-trtllm-cluster-config.yaml @@ -15,19 +15,19 @@ vpc: public: us-east-1a: id: $PLACEHOLDER_SUBNET_PUBLIC_1 - + clusterEndpoints: privateAccess: true publicAccess: true - + cloudwatch: clusterLogging: - enableTypes: ["*"] + enableTypes: ["*"] iam: withOIDC: true - + managedNodeGroups: - name: cpu-node-group instanceType: c5.2xlarge @@ -45,7 +45,7 @@ managedNodeGroups: albIngress: true - name: gpu-compute-node-group instanceType: p5.48xlarge - instancePrefix: trtllm-compute-node + instancePrefix: trtllm-compute-node privateNetworking: true efaEnabled: true minSize: 0