kaito-project · Fei-Guo · Oct 13, 2023 · Oct 11, 2023 · Oct 11, 2023 · Oct 13, 2023
diff --git a/presets/falcon/benchmark/README.md → pkg/presets/falcon/benchmark/README.md b/presets/falcon/benchmark/README.md → pkg/presets/falcon/benchmark/README.md
diff --git a/...s/falcon/benchmark/inference-benchmark.py → ...s/falcon/benchmark/inference-benchmark.py b/...s/falcon/benchmark/inference-benchmark.py → ...s/falcon/benchmark/inference-benchmark.py
diff --git a/presets/falcon/benchmark/plot.ipynb → pkg/presets/falcon/benchmark/plot.ipynb b/presets/falcon/benchmark/plot.ipynb → pkg/presets/falcon/benchmark/plot.ipynb
diff --git a/pkg/presets/k8s/convert_fill.sh b/pkg/presets/k8s/convert_fill.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+# Ensure the correct number of arguments are provided
+if [ "$#" -ne 1 ]; then
+    echo "Usage: $0 <acr-name.azurecr.io/repository-name:tag>"
+    exit 1
+fi
+
+INPUT=$1
+
+# Extract the ACR name, repository name, and tag from the input
+ACR_NAME=$(echo "$INPUT" | cut -d. -f1)
+REPO_SHORT_NAME=$(echo "$INPUT" | cut -d/ -f2 | cut -d: -f1)
+TAG=$(echo "$INPUT" | cut -d: -f2)
+
+# Construct full repository name
+FULL_REPO_NAME="$ACR_NAME.azurecr.io/$REPO_SHORT_NAME"
+
+# Fetch the digest using the 'az' CLI
+DIGEST=$(az acr repository show-manifests --name $ACR_NAME --repository $REPO_SHORT_NAME --query "[?tags[? @ == '$TAG']].digest" -o tsv)
+
+# Fetch the username and password
+USERNAME=$(az acr credential show --name $ACR_NAME --query "username" -o tsv)
+PASSWORD=$(az acr credential show --name $ACR_NAME --query "passwords[0].value" -o tsv)
+
+# Combine USERNAME:PASSWORD for the arg
+CREDENTIAL="$USERNAME:$PASSWORD"
+
+# Use sed to replace placeholders; we'll use a different delimiter (|) to avoid conflicts with potential special characters in the variables
+sed -e "s|<REPO_NAME>|${FULL_REPO_NAME}|g" -e "s|<DIGEST>|${DIGEST}|g" -e "s|<USERNAME>|${CREDENTIAL}|g" convert_template.yaml > convert_filled.yaml
+
+echo "Filled YAML saved as convert_filled.yaml"
diff --git a/pkg/presets/k8s/convert_template.yaml b/pkg/presets/k8s/convert_template.yaml
@@ -0,0 +1,31 @@
+# Pod that runs the ./convert binary from the local-convert image.
+# The <REPO_NAME>, <DIGEST> and <USERNAME> placeholders are substituted by
+# convert_fill.sh; <USERNAME> receives the combined "username:password".
+apiVersion: v1
+kind: Pod
+metadata:
+  namespace: default
+  name: convert-pod
+spec:
+  # NOTE(review): no restartPolicy is set, so the Pod default applies — confirm
+  # that is intended if ./convert is meant to run only once.
+  containers:
+    - name: convert-container
+      image: gkdpaz.azurecr.io/local-convert:v1.6
+      command:
+        - ./convert
+      args:
+        - --repository
+        - "<REPO_NAME>"
+        - --input-digest
+        - "<DIGEST>"
+        - --username
+        - "<USERNAME>"
+      resources:
+        # Requests equal limits for both CPU and ephemeral storage.
+        limits:
+          cpu: '1'
+          ephemeral-storage: 500Gi
+        requests:
+          cpu: '1'
+          ephemeral-storage: 500Gi
diff --git a/pkg/presets/k8s/falcon-7b-instruct/falcon-7b-instruct-service.yaml b/pkg/presets/k8s/falcon-7b-instruct/falcon-7b-instruct-service.yaml
@@ -0,0 +1,19 @@
+# LoadBalancer Service in front of pod falcon-7b-instruct-0.
+apiVersion: v1
+kind: Service
+metadata:
+  name: falcon-7b-instruct
+spec:
+  type: LoadBalancer
+  # Endpoints are published even before the pod passes its readiness probe.
+  publishNotReadyAddresses: true
+  # Matches only the pod named falcon-7b-instruct-0 that also carries
+  # the app=falcon label.
+  selector:
+    statefulset.kubernetes.io/pod-name: falcon-7b-instruct-0
+    app: falcon
+  ports:
+    # Service port 80 forwards to container port 5000.
+    - port: 80
+      targetPort: 5000
+      protocol: TCP
diff --git a/pkg/presets/k8s/falcon-7b-instruct/falcon-7b-instruct-statefulset.yaml b/pkg/presets/k8s/falcon-7b-instruct/falcon-7b-instruct-statefulset.yaml
@@ -0,0 +1,65 @@
+# Single-replica StatefulSet serving the falcon-7b-instruct image.
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: falcon-7b-instruct
+spec:
+  replicas: 1
+  podManagementPolicy: Parallel
+  # NOTE(review): the falcon-7b StatefulSet selects the same app=falcon label;
+  # confirm the two are never deployed into one namespace together.
+  selector:
+    matchLabels:
+      app: falcon
+  template:
+    metadata:
+      labels:
+        app: falcon
+    spec:
+      # The pod is pinned to the "on7binstruct" pool and tolerates the
+      # NoSchedule taints sku=gpu and nvidia.com/gpu.
+      nodeSelector:
+        pool: on7binstruct
+      tolerations:
+        - key: sku
+          operator: Equal
+          value: gpu
+          effect: NoSchedule
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
+      volumes:
+        # Memory-backed emptyDir mounted at /dev/shm in the container.
+        - name: dshm
+          emptyDir:
+            medium: Memory
+      containers:
+        - name: falcon-container
+          image: aimodelsregistry.azurecr.io/falcon-7b-instruct:latest
+          command:
+            - /bin/sh
+            - -c
+            - accelerate launch --config_file config.yaml --num_processes 1 --num_machines 1 --use_deepspeed --machine_rank 0 --gpu_ids all inference-api.py
+          # NOTE(review): GPU request/limit is commented out — confirm intended.
+          # resources:
+          #   limits:
+          #     nvidia.com/gpu: "1"
+          #   requests:
+          #     nvidia.com/gpu: "1"
+          volumeMounts:
+            - name: dshm
+              mountPath: /dev/shm
+          # Both probes issue GET /healthz on port 5000 every 10 seconds;
+          # readiness starts after 30 s, liveness after 600 s.
+          readinessProbe:
+            httpGet:
+              path: /healthz
+              port: 5000
+            initialDelaySeconds: 30
+            periodSeconds: 10
+          livenessProbe:
+            httpGet:
+              path: /healthz
+              port: 5000
+            initialDelaySeconds: 600  # 10 Min
+            periodSeconds: 10
diff --git a/pkg/presets/k8s/falcon-7b/falcon-7b-service.yaml b/pkg/presets/k8s/falcon-7b/falcon-7b-service.yaml
@@ -0,0 +1,19 @@
+# LoadBalancer Service in front of pod falcon-7b-0.
+apiVersion: v1
+kind: Service
+metadata:
+  name: falcon-7b
+spec:
+  type: LoadBalancer
+  # Endpoints are published even before the pod passes its readiness probe.
+  publishNotReadyAddresses: true
+  # Matches only the pod named falcon-7b-0 that also carries
+  # the app=falcon label.
+  selector:
+    statefulset.kubernetes.io/pod-name: falcon-7b-0
+    app: falcon
+  ports:
+    # Service port 80 forwards to container port 5000.
+    - port: 80
+      targetPort: 5000
+      protocol: TCP
diff --git a/pkg/presets/k8s/falcon-7b/falcon-7b-statefulset.yaml b/pkg/presets/k8s/falcon-7b/falcon-7b-statefulset.yaml
@@ -0,0 +1,65 @@
+# Single-replica StatefulSet serving the falcon-7b image.
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: falcon-7b
+spec:
+  replicas: 1
+  podManagementPolicy: Parallel
+  # NOTE(review): the falcon-7b-instruct StatefulSet selects the same
+  # app=falcon label; confirm the two never share a namespace.
+  selector:
+    matchLabels:
+      app: falcon
+  template:
+    metadata:
+      labels:
+        app: falcon
+    spec:
+      # The pod is pinned to the "falcon7b" pool and tolerates the
+      # NoSchedule taints sku=gpu and nvidia.com/gpu.
+      nodeSelector:
+        pool: falcon7b
+      tolerations:
+        - key: sku
+          operator: Equal
+          value: gpu
+          effect: NoSchedule
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
+      volumes:
+        # Memory-backed emptyDir mounted at /dev/shm in the container.
+        - name: dshm
+          emptyDir:
+            medium: Memory
+      containers:
+        - name: falcon-container
+          image: aimodelsregistry.azurecr.io/falcon-7b:latest
+          command:
+            - /bin/sh
+            - -c
+            - accelerate launch --config_file config.yaml --num_processes 1 --num_machines 1 --use_deepspeed --machine_rank 0 --gpu_ids all inference-api.py
+          # NOTE(review): GPU request/limit is commented out — confirm intended.
+          # resources:
+          #   limits:
+          #     nvidia.com/gpu: "1"
+          #   requests:
+          #     nvidia.com/gpu: "1"
+          volumeMounts:
+            - name: dshm
+              mountPath: /dev/shm
+          # Both probes issue GET /healthz on port 5000 every 10 seconds;
+          # readiness starts after 30 s, liveness after 600 s.
+          readinessProbe:
+            httpGet:
+              path: /healthz
+              port: 5000
+            initialDelaySeconds: 30
+            periodSeconds: 10
+          livenessProbe:
+            httpGet:
+              path: /healthz
+              port: 5000
+            initialDelaySeconds: 600  # 10 Min
+            periodSeconds: 10
diff --git a/pkg/presets/k8s/llama-2-13b-chat/llama-2-13b-chat-service.yaml b/pkg/presets/k8s/llama-2-13b-chat/llama-2-13b-chat-service.yaml
@@ -0,0 +1,26 @@
+# LoadBalancer Service in front of pod llama-2-13b-chat-0.
+apiVersion: v1
+kind: Service
+metadata:
+  name: llama-2-13b-chat
+spec:
+  type: LoadBalancer
+  # Endpoints are published even before the pod passes its readiness probe.
+  publishNotReadyAddresses: true
+  # Matches only the pod named llama-2-13b-chat-0 that also carries
+  # the app=llama label.
+  selector:
+    statefulset.kubernetes.io/pod-name: llama-2-13b-chat-0
+    app: llama
+  ports:
+    # Service port 80 forwards to container port 5000.
+    - name: http
+      port: 80
+      targetPort: 5000
+      protocol: TCP
+    # Same port number on both sides; 29500 is the master port the
+    # llama-2-13b-chat StatefulSet passes to torchrun.
+    - name: torchrun
+      port: 29500
+      targetPort: 29500
+      protocol: TCP
diff --git a/pkg/presets/k8s/llama-2-13b-chat/llama-2-13b-chat-statefulset.yaml b/pkg/presets/k8s/llama-2-13b-chat/llama-2-13b-chat-statefulset.yaml
@@ -0,0 +1,81 @@
+# Two-pod StatefulSet running the llama-2-13b-chat image under torchrun.
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: llama-2-13b-chat
+spec:
+  # Keep in step with "--nnodes 2" in the container command below.
+  replicas: 2
+  podManagementPolicy: Parallel
+  # NOTE(review): the llama-2-13b StatefulSet uses the same app=llama label;
+  # confirm the two are never deployed into one namespace together.
+  selector:
+    matchLabels:
+      app: llama
+  template:
+    metadata:
+      labels:
+        app: llama
+    spec:
+      nodeSelector:
+        pool: lama213bchat
+      tolerations:
+        - key: sku
+          operator: Equal
+          value: gpu
+          effect: NoSchedule
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
+      affinity:
+        # Hard rule: never schedule two app=llama pods onto the same node.
+        podAntiAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            - topologyKey: kubernetes.io/hostname
+              labelSelector:
+                matchExpressions:
+                  - key: app
+                    operator: In
+                    values:
+                      - llama
+      volumes:
+        # Memory-backed emptyDir mounted at /dev/shm in the container.
+        - name: dshm
+          emptyDir:
+            medium: Memory
+      containers:
+        - name: llama-container
+          image: aimodelsregistry.azurecr.io/llama-2-13b-chat:TAG_HERE # Placeholder that will be replaced
+          env:
+            - name: MASTER_ADDR
+              value: "MASTER_ADDR_HERE"  # Placeholder that will be replaced
+          command:
+            - /bin/sh
+            - -c
+            # NODE_RANK is the text after the last '-' in $HOSTNAME.
+            - |
+              echo "MASTER_ADDR: $MASTER_ADDR"
+              NODE_RANK=$(echo $HOSTNAME | grep -o '[^-]*$')
+              cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port 29500 web_example_chat_completion.py
+          resources:
+            requests:
+              nvidia.com/gpu: '1'
+            limits:
+              nvidia.com/gpu: '1'
+          volumeMounts:
+            - name: dshm
+              mountPath: /dev/shm
+          # Both probes issue GET /healthz on port 5000 every 10 seconds;
+          # readiness starts after 30 s, liveness after 600 s.
+          readinessProbe:
+            httpGet:
+              path: /healthz
+              port: 5000
+            initialDelaySeconds: 30
+            periodSeconds: 10
+          livenessProbe:
+            httpGet:
+              path: /healthz
+              port: 5000
+            initialDelaySeconds: 600  # 10 Min
+            periodSeconds: 10
diff --git a/pkg/presets/k8s/llama-2-13b/llama-2-13b-service.yaml b/pkg/presets/k8s/llama-2-13b/llama-2-13b-service.yaml
@@ -0,0 +1,26 @@
+# LoadBalancer Service in front of pod llama-2-13b-0.
+apiVersion: v1
+kind: Service
+metadata:
+  name: llama-2-13b
+spec:
+  type: LoadBalancer
+  # Endpoints are published even before the pod passes its readiness probe.
+  publishNotReadyAddresses: true
+  # Matches only the pod named llama-2-13b-0 that also carries
+  # the app=llama label.
+  selector:
+    statefulset.kubernetes.io/pod-name: llama-2-13b-0
+    app: llama
+  ports:
+    # Service port 80 forwards to container port 5000.
+    - name: http
+      port: 80
+      targetPort: 5000
+      protocol: TCP
+    # Same port number on both sides; 29500 is the master port the
+    # llama-2-13b StatefulSet passes to torchrun.
+    - name: torchrun
+      port: 29500
+      targetPort: 29500
+      protocol: TCP
diff --git a/pkg/presets/k8s/llama-2-13b/llama-2-13b-statefulset.yaml b/pkg/presets/k8s/llama-2-13b/llama-2-13b-statefulset.yaml
@@ -0,0 +1,84 @@
+# Two-pod StatefulSet running the llama-2-13b image under torchrun.
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: llama-2-13b
+spec:
+  # Keep in step with "--nnodes 2" in the container command below.
+  replicas: 2
+  # NOTE(review): the llama-2-13b-chat StatefulSet uses the same app=llama
+  # label; confirm the two are never deployed into one namespace together.
+  selector:
+    matchLabels:
+      app: llama
+  podManagementPolicy: Parallel
+  template:
+    metadata:
+      labels:
+        app: llama
+    spec:
+      affinity:
+        # Hard rule: never schedule two app=llama pods onto the same node.
+        podAntiAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+          - labelSelector:
+              matchExpressions:
+              - key: app
+                operator: In
+                values:
+                - llama
+            topologyKey: "kubernetes.io/hostname"
+      containers:
+        - name: llama-container
+          image: aimodelsregistry.azurecr.io/llama-2-13b:TAG_HERE # Placeholder that will be replaced
+          env:
+          - name: MASTER_ADDR
+            value: "MASTER_ADDR_HERE"  # Placeholder that will be replaced
+          command:
+            - /bin/sh
+            - -c
+            # NODE_RANK is the text after the last '-' in $HOSTNAME.
+            # torchrun options use the underscore spelling throughout
+            # (--master_addr/--master_port), matching --nproc_per_node and
+            # --node_rank on the same line and the llama-2-13b-chat manifest.
+            - |
+              echo "MASTER_ADDR: $MASTER_ADDR"
+              NODE_RANK=$(echo $HOSTNAME | grep -o '[^-]*$')
+              cd /workspace/llama/llama-2 && torchrun --nnodes 2 --nproc_per_node 1 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port 29500 web_example_text_completion.py
+          resources:
+            limits:
+              nvidia.com/gpu: "1"
+            requests:
+              nvidia.com/gpu: "1"
+          # Both probes issue GET /healthz on port 5000 every 10 seconds;
+          # liveness starts after 600 s, readiness after 30 s.
+          livenessProbe:
+            httpGet:
+              path: /healthz
+              port: 5000
+            initialDelaySeconds: 600 # 10 Min
+            periodSeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /healthz
+              port: 5000
+            initialDelaySeconds: 30
+            periodSeconds: 10
+          # Mounts the memory-backed "dshm" volume at /dev/shm.
+          volumeMounts:
+            - name: dshm
+              mountPath: /dev/shm
+      volumes:
+        - name: dshm
+          emptyDir:
+            medium: Memory
+      tolerations:
+        - effect: NoSchedule
+          key: sku
+          operator: Equal
+          value: gpu
+        - effect: NoSchedule
+          key: nvidia.com/gpu
+          operator: Exists
+      nodeSelector:
+        pool: llama213b