From e6e5bd9d6df2cd4212023ef96c127a4e3bfc7c95 Mon Sep 17 00:00:00 2001
From: jhM00n
Date: Sat, 18 May 2024 02:09:37 +0900
Subject: [PATCH 1/5] add deploy_llama_inference

---
 .../llama/Dockerfile.kubernetes_gpu        |  12 ++
 .../llama/kubernetes_app_llama2.py         | 128 ++++++++++++++++++
 .../llama/push_aws_ecr.sh.sample           |   7 +
 .../llama/requirements_kubernetes_gpu.txt  |  15 ++
 4 files changed, 162 insertions(+)
 create mode 100644 inference/template_code/llama/Dockerfile.kubernetes_gpu
 create mode 100644 inference/template_code/llama/kubernetes_app_llama2.py
 create mode 100644 inference/template_code/llama/push_aws_ecr.sh.sample
 create mode 100644 inference/template_code/llama/requirements_kubernetes_gpu.txt

diff --git a/inference/template_code/llama/Dockerfile.kubernetes_gpu b/inference/template_code/llama/Dockerfile.kubernetes_gpu
new file mode 100644
index 0000000000..8d58317e60
--- /dev/null
+++ b/inference/template_code/llama/Dockerfile.kubernetes_gpu
@@ -0,0 +1,12 @@
+FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime
+
+WORKDIR /app
+
+COPY requirements_kubernetes_gpu.txt /app/requirements.txt
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+COPY kubernetes_app_llama2.py /app/app.py
+
+CMD [ "python3", "/app/app.py" ]
+
+EXPOSE 8080
\ No newline at end of file
diff --git a/inference/template_code/llama/kubernetes_app_llama2.py b/inference/template_code/llama/kubernetes_app_llama2.py
new file mode 100644
index 0000000000..5371a6d38d
--- /dev/null
+++ b/inference/template_code/llama/kubernetes_app_llama2.py
@@ -0,0 +1,128 @@
+import os
+import requests
+import shutil
+import zipfile
+import torch
+from fastapi import FastAPI, Request
+import uvicorn
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+)
+from peft import PeftModel, LoraConfig, get_peft_model
+
+app = FastAPI()
+
+# Get the model S3 URL from an environment variable
+model_s3_url = os.getenv('MODEL_S3_URL')
+
+# Download the model
+model_download = requests.get(model_s3_url)
+model_filename = model_s3_url.split('/')[-1]
+model_temp_path = os.path.join('temp', model_filename)
+os.makedirs('temp', exist_ok=True)
+with open(model_temp_path, 'wb') as file:
+    file.write(model_download.content)
+
+# Remove any existing model directory and recreate it
+if os.path.exists('model'):
+    shutil.rmtree('model')
+os.makedirs('model')
+
+# Unzip the model archive
+with zipfile.ZipFile(model_temp_path, 'r') as zip_ref:
+    zip_ref.extractall('model')
+
+# Remove the temporary file and directory
+os.remove(model_temp_path)
+shutil.rmtree('temp')
+
+# Base model name (fixed)
+model_name = "NousResearch/Llama-2-7b-chat-hf"
+
+# Quantization configuration
+compute_dtype = getattr(torch, "float16")
+
+quant_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=compute_dtype,
+    bnb_4bit_use_double_quant=False,
+)
+
+# Load the base model (using the fixed model name)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    quantization_config=quant_config,
+    device_map={"": 0}
+)
+
+# Local model path
+model_path = '/app/model/model'
+
+# Load the tokenizer (from the local path)
+tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+
+# Load the PEFT adapter weights
+model = PeftModel.from_pretrained(model, model_path)
+
+# PEFT (LoRA) parameters
+peft_params = LoraConfig(
+    lora_alpha=16,
+    lora_dropout=0.1,
+    r=64,
+    bias="none",
+    task_type="CAUSAL_LM",
+)
+
+model = get_peft_model(model, peft_params)
+
+# Switch the model to evaluation mode
+model.eval()
+
+@app.get("/")
+async def healthcheck():
+    return {
+        "body": "healthy"
+    }
+
+@app.post("/{full_path:path}")
+async def inference(request: Request):
+    data = await request.json()
+    prompt = data.get('prompt', '')
+    try:
+        # Prepare the input tensors
+        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+    except Exception as e:
+        return {
+            "error": "Tokenization failed",
+            "message": str(e)
+        }
+
+    with torch.no_grad():
+        try:
+            # Generate text
+            outputs = model.generate(**inputs, max_length=1024)
+        except Exception as e:
+            return {
+                "error": "Inference failed",
+                "message": str(e)
+            }
+
+    try:
+        # Decode the generated text
+        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    except Exception as e:
+        return {
+            "error": "Decoding failed",
+            "message": str(e)
+        }
+
+    return {
+        "output": generated_text
+    }
+
+
+if __name__ == '__main__':
+    uvicorn.run(app, host="0.0.0.0", port=8080)
\ No newline at end of file
diff --git a/inference/template_code/llama/push_aws_ecr.sh.sample b/inference/template_code/llama/push_aws_ecr.sh.sample
new file mode 100644
index 0000000000..4a8702c532
--- /dev/null
+++ b/inference/template_code/llama/push_aws_ecr.sh.sample
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+ECR_URI=""
+
+aws ecr get-login-password --region ap-northeast-2 | docker login --username AWS --password-stdin $ECR_URI
+docker build -t $ECR_URI/llama2-inference:latest . -f Dockerfile.kubernetes_gpu
+docker push $ECR_URI/llama2-inference:latest
\ No newline at end of file
diff --git a/inference/template_code/llama/requirements_kubernetes_gpu.txt b/inference/template_code/llama/requirements_kubernetes_gpu.txt
new file mode 100644
index 0000000000..0c51191697
--- /dev/null
+++ b/inference/template_code/llama/requirements_kubernetes_gpu.txt
@@ -0,0 +1,15 @@
+torch==2.3.0
+fastapi==0.111.0
+uvicorn==0.29.0
+numpy==1.26.4
+requests==2.31.0
+bitsandbytes==0.40.2
+accelerate==0.21.0
+peft==0.4.0
+transformers==4.31.0
+datasets==2.19.1
+pandas==2.2.2
+pyarrow==16.0.0
+scipy==1.13.0
+tensorboardX==2.6.2.2
+xformers==0.0.26.post1
\ No newline at end of file

From 0bf845666dafac64166f55e7cbeed82d90d0a4a2 Mon Sep 17 00:00:00 2001
From: jhM00n
Date: Sat, 18 May 2024 02:10:13 +0900
Subject: [PATCH 2/5] add llama_inference_deploy

---
 automation/llama_inference_deploy/.gitignore       |   4 +
 automation/llama_inference_deploy/Dockerfile       |  13 ++
 .../llama_inference_deploy/IaC/.gitignore          |   1 +
 automation/llama_inference_deploy/IaC/main.tf      |  35 ++++
 .../llama_inference_deploy/IaC/var.tf.sample       |  19 ++
 automation/llama_inference_deploy/main.py          | 183 ++++++++++++++++++
 .../push_aws_ecr.sh.sample                         |   7 +
 7 files changed, 262 insertions(+)
 create mode 100644 automation/llama_inference_deploy/.gitignore
 create mode 100644 automation/llama_inference_deploy/Dockerfile
 create mode 100644 automation/llama_inference_deploy/IaC/.gitignore
 create mode 100644 automation/llama_inference_deploy/IaC/main.tf
 create mode 100644 automation/llama_inference_deploy/IaC/var.tf.sample
 create mode 100644 automation/llama_inference_deploy/main.py
 create mode 100644 automation/llama_inference_deploy/push_aws_ecr.sh.sample

diff --git a/automation/llama_inference_deploy/.gitignore b/automation/llama_inference_deploy/.gitignore
new file mode 100644
index 0000000000..ee1f4c4fb5
--- /dev/null
+++ b/automation/llama_inference_deploy/.gitignore
@@ -0,0 +1,4 @@
+push_aws_ecr.sh
+ecr_login.sh
+get_kubeconfig.sh
+*test*
\ No newline at end of file
diff --git a/automation/llama_inference_deploy/Dockerfile b/automation/llama_inference_deploy/Dockerfile
new file mode 100644
index 0000000000..d7c845cb6f
--- /dev/null
+++ b/automation/llama_inference_deploy/Dockerfile
@@ -0,0 +1,13 @@
+FROM public.ecr.aws/lambda/python:3.11
+
+RUN pip install awscli requests --no-cache-dir
+
+# x86_64
+RUN curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" \
+    && chmod +x ./kubectl
+
+COPY main.py ${LAMBDA_TASK_ROOT}
+
+RUN chmod +x /var/task
+
+CMD ["main.handler"]
\ No newline at end of file
diff --git a/automation/llama_inference_deploy/IaC/.gitignore b/automation/llama_inference_deploy/IaC/.gitignore
new file mode 100644
index 0000000000..4ee3ae2ee7
--- /dev/null
+++ b/automation/llama_inference_deploy/IaC/.gitignore
@@ -0,0 +1 @@
+var.tf
\ No newline at end of file
diff --git a/automation/llama_inference_deploy/IaC/main.tf b/automation/llama_inference_deploy/IaC/main.tf
new file mode 100644
index 0000000000..fb44f42dec
--- /dev/null
+++ b/automation/llama_inference_deploy/IaC/main.tf
@@ -0,0 +1,35 @@
+# Change prefix and container_repository to names that fit your deployment
+
+module "llama_inference_deploy" {
+  source = "github.com/kookmin-sw/capstone-2024-12//IaC/serverless_api_template"
+  prefix = "llama-inference-deploy"
+  container_registry = "694448341573.dkr.ecr.ap-northeast-2.amazonaws.com"
+  container_repository = "llama-inference-deploy"
+  container_image_tag = "latest"
+  lambda_ram_size = 2048
+  attach_s3_policy = true
+  attach_ec2_policy = true
+  attach_eks_policy = true
+  attach_ssm_readonly_policy = true
+  region_name = var.region
+  eks_cluster_name = var.eks_cluster_name
+  db_api_url = var.db_api_url
+}
+
+output "llama_inference_deploy_function_url" {
+  value = module.llama_inference_deploy.function_url
+}
+
+provider "aws" {
+  region = var.region
+  profile = var.awscli_profile
+}
+
+terraform {
+  backend "s3" {
+    bucket = "sskai-terraform-state"
+    key = "llama_inference_deploy/tf.state"
+    region = "ap-northeast-2"
+    encrypt = true
+  }
+}
diff --git a/automation/llama_inference_deploy/IaC/var.tf.sample b/automation/llama_inference_deploy/IaC/var.tf.sample
new file mode 100644
index 0000000000..758c8ab6c2
--- /dev/null
+++ b/automation/llama_inference_deploy/IaC/var.tf.sample
@@ -0,0 +1,19 @@
+variable "region" {
+  type = string
+  default = "ap-northeast-2"
+}
+
+variable "awscli_profile" {
+  type = string
+  default = ""
+}
+
+variable "eks_cluster_name" {
+  type = string
+  default = ""
+}
+
+variable "db_api_url" {
+  type = string
+  default = ""
+}
\ No newline at end of file
diff --git a/automation/llama_inference_deploy/main.py b/automation/llama_inference_deploy/main.py
new file mode 100644
index 0000000000..52c1181653
--- /dev/null
+++ b/automation/llama_inference_deploy/main.py
@@ -0,0 +1,183 @@
+import subprocess
+import requests
+import os
+import json
+import time
+
+kubectl = '/var/task/kubectl'
+kubeconfig = '/tmp/kubeconfig'
+
+eks_cluster_name = os.getenv('EKS_CLUSTER_NAME')
+region = os.getenv("REGION")
+db_api_url = os.getenv("DB_API_URL")
+ecr_uri = os.getenv("ECR_URI")
+
+# get eks cluster kubernetes configuration by aws cli
+result_get_kubeconfig = subprocess.run([
+    "aws", "eks", "update-kubeconfig",
+    "--name", eks_cluster_name,
+    "--region", region,
+    "--kubeconfig", kubeconfig
+])
+
+def generate_yaml(user_namespace, endpoint_uid, model_s3_url, node_pool_name, ram_size):
+    content = f"""---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: {user_namespace}
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  namespace: {user_namespace}
+  name: deployment-{endpoint_uid}
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: app-{endpoint_uid}
+  replicas: 2
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: app-{endpoint_uid}
+    spec:
+      containers:
+      - image: {ecr_uri}/llama2-inference:latest
+        imagePullPolicy: Always
+        name: app-{endpoint_uid}
+        ports:
+        - containerPort: 8080
+        env:
+        - name: MODEL_S3_URL
+          value: {model_s3_url}
+        resources:
+          requests:
+            cpu: 2000m
+            memory: 2000M
+            nvidia.com/gpu: 1
+          limits:
+            cpu: 2000m
+            memory: 2000M
+            nvidia.com/gpu: 1
+      nodeSelector:
+        karpenter.sh/nodepool: {node_pool_name}
+---
+apiVersion: v1
+kind: Service
+metadata:
+  namespace: {user_namespace}
+  name: service-{endpoint_uid}
+spec:
+  ports:
+    - port: 8080
+      targetPort: 8080
+      protocol: TCP
+  type: ClusterIP
+  selector:
+    app.kubernetes.io/name: app-{endpoint_uid}
+---
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  namespace: {user_namespace}
+  name: ingress-{endpoint_uid}
+  annotations:
+    alb.ingress.kubernetes.io/scheme: internet-facing
+    alb.ingress.kubernetes.io/target-type: ip
+    alb.ingress.kubernetes.io/group.name: "{user_namespace}"
+spec:
+  ingressClassName: alb
+  rules:
+    - http:
+        paths:
+        - path: /{endpoint_uid}
+          pathType: Prefix
+          backend:
+            service:
+              name: service-{endpoint_uid}
+              port:
+                number: 8080
+"""
+
+    filepath = f"/tmp/{endpoint_uid}.yaml"
+    with open(filepath, 'w') as f:
+        f.write(content)
+
+    return filepath
+
+def apply_yaml(user_namespace, endpoint_uid, model_s3_url, node_pool_name, ram_size):
+    filename = generate_yaml(user_namespace, endpoint_uid, model_s3_url, node_pool_name, ram_size)
+    result = subprocess.run([
+        kubectl, "apply", "-f", filename, "--kubeconfig", kubeconfig
+    ])
+    if result.returncode != 0: print("create resource returncode != 0")
+    return result.returncode
+
+def delete_resource(user_namespace, endpoint_uid):
+    deployment_name = f"deployment-{endpoint_uid}"
+    service_name = f"service-{endpoint_uid}"
+    ingress_name = f"ingress-{endpoint_uid}"
+    ingress_result = subprocess.run([
+        kubectl, "-n", user_namespace, "delete", "ingress", ingress_name, "--kubeconfig", kubeconfig
+    ])
+    service_result = subprocess.run([
+        kubectl, "-n", user_namespace, "delete", "service", service_name, "--kubeconfig", kubeconfig
+    ])
+    deployment_result = subprocess.run([
+        kubectl, "-n", user_namespace, "delete", "deployment", deployment_name, "--kubeconfig", kubeconfig
+    ])
+    result = 0
+    if ingress_result.returncode != 0 or service_result.returncode != 0 or deployment_result.returncode != 0:
+        result = 1
+        print("delete resource returncode != 0")
+    return result
+
+def handler(event, context):
+    body = json.loads(event.get("body", "{}"))
+    user_uid = body.get("user").lower()
+    endpoint_uid = body.get("uid").lower()
+    action = body.get("action")
+
+    if action == "create":
+        model_s3_url = body['model']['s3_url']
+        node_pool_name = body['model']['deployment_type']
+        ram_size = body['model']['max_used_ram']
+        result = apply_yaml(user_uid, endpoint_uid, model_s3_url, node_pool_name, ram_size)
+
+        cmd = "{} get ingress -A --kubeconfig {} | grep {}".format(kubectl, kubeconfig, endpoint_uid)
+        time.sleep(10)
+        endpoint_url = subprocess.run(cmd, capture_output=True, shell=True).stdout.decode('utf-8').strip().split()[4]
+        print(f"endpoint_url: {endpoint_url}")
+        update_data = {
+            "endpoint": f"http://{endpoint_url}/{endpoint_uid}"
+        }
+        response = requests.put(url=f"{db_api_url}/inferences/{endpoint_uid}", json=update_data)
+        if result == 0:
+            return {
+                'statusCode': 200,
+                'body': "complete create inference endpoint"
+            }
+        else:
+            return {
+                'statusCode': 500,
+                'body': "error with create inference endpoint"
+            }
+    elif action == "delete":
+        result = delete_resource(user_uid, endpoint_uid)
+        if result == 0:
+            requests.delete(url=f"{db_api_url}/inferences/{endpoint_uid}")
+            return {
+                'statusCode': 200,
+                'body': "complete delete inference deployment"
+            }
+        else:
+            return {
+                'statusCode': 500,
+                'body': "error with delete inference endpoint"
+            }
+    else:
+        return {
+            'statusCode': 500,
+            'body': "invalid action"
+        }
\ No newline at end of file
diff --git a/automation/llama_inference_deploy/push_aws_ecr.sh.sample b/automation/llama_inference_deploy/push_aws_ecr.sh.sample
new file mode 100644
index 0000000000..2c2b9d8f77
--- /dev/null
+++ b/automation/llama_inference_deploy/push_aws_ecr.sh.sample
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+ECR_URI=""
+
+aws ecr get-login-password --region ap-northeast-2 | docker login --username AWS --password-stdin $ECR_URI
+docker build -t $ECR_URI/llama-inference-deploy:latest .
+docker push $ECR_URI/llama-inference-deploy:latest
\ No newline at end of file

From a2d6d9fd6731c84c220b103285effb5314e8ac8c Mon Sep 17 00:00:00 2001
From: jhM00n
Date: Sat, 18 May 2024 02:11:03 +0900
Subject: [PATCH 3/5] fix typo

---
 automation/kubernetes_inference_deploy/main.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/automation/kubernetes_inference_deploy/main.py b/automation/kubernetes_inference_deploy/main.py
index 35ddd646ac..189c380443 100644
--- a/automation/kubernetes_inference_deploy/main.py
+++ b/automation/kubernetes_inference_deploy/main.py
@@ -2,6 +2,7 @@
 import requests
 import os
 import json
+import time
 
 kubectl = '/var/task/kubectl'
 kubeconfig = '/tmp/kubeconfig'
@@ -145,6 +146,7 @@ def handler(event, context):
         result = apply_yaml(user_uid, endpoint_uid, model_s3_url, node_pool_name, ram_size)
 
         cmd = "{} get ingress -A --kubeconfig {} | grep {}".format(kubectl, kubeconfig, endpoint_uid)
+        time.sleep(10)
         endpoint_url = subprocess.run(cmd, capture_output=True, shell=True).stdout.decode('utf-8').strip().split()[4]
         print(f"endpoint_url: {endpoint_url}")
         update_data = {

From 108154ec9033ab07b2c399b01d3ebfc9c569ee15 Mon Sep 17 00:00:00 2001
From: jhM00n
Date: Sun, 19 May 2024 00:41:11 +0900
Subject: [PATCH 4/5] fix max_len

---
 automation/deploy_streamlit/main.py                    | 4 ++--
 inference/template_code/llama/kubernetes_app_llama2.py | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/automation/deploy_streamlit/main.py b/automation/deploy_streamlit/main.py
index 945671c0f1..6f20b71017 100644
--- a/automation/deploy_streamlit/main.py
+++ b/automation/deploy_streamlit/main.py
@@ -84,7 +84,7 @@ def init_streamlit(user_namespace, endpoint_uid, endpoint_url, image_name, image
   annotations:
     alb.ingress.kubernetes.io/scheme: internet-facing
     alb.ingress.kubernetes.io/target-type: ip
-    alb.ingress.kubernetes.io/group.name: "streamlit-{user_namespace}"
+    alb.ingress.kubernetes.io/group.name: "{user_namespace}"
 spec:
   ingressClassName: alb
   rules:
@@ -155,7 +155,7 @@ def handler(event, context):
         # Inference endpoint address
         endpoint_url = body.get("endpoint_url")
         result = apply_yaml(user_uid, endpoint_uid, endpoint_url, image_name, image_py_name)
-        cmd = "{} get ingress -A --kubeconfig {} | grep {}".format(kubectl, kubeconfig, endpoint_uid)
+        cmd = "{} get ingress -A --kubeconfig {} | grep {} | grep streamlit".format(kubectl, kubeconfig, endpoint_uid)
         # Streamlit endpoint address
         streamlit_endpoint_url = subprocess.run(cmd, capture_output=True, shell=True).stdout.decode('utf-8').strip().split()[4]
         print(f"streamlit_endpoint_url: {streamlit_endpoint_url}/streamlit/{endpoint_uid}")
diff --git a/inference/template_code/llama/kubernetes_app_llama2.py b/inference/template_code/llama/kubernetes_app_llama2.py
index 5371a6d38d..0b424f27eb 100644
--- a/inference/template_code/llama/kubernetes_app_llama2.py
+++ b/inference/template_code/llama/kubernetes_app_llama2.py
@@ -91,6 +91,7 @@ async def healthcheck():
 async def inference(request: Request):
     data = await request.json()
     prompt = data.get('prompt', '')
+    max_gen_length = data.get('max_gen_len', 512)
     try:
         # Prepare the input tensors
         inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
@@ -103,7 +104,7 @@ async def inference(request: Request):
     with torch.no_grad():
         try:
             # Generate text
-            outputs = model.generate(**inputs, max_length=1024)
+            outputs = model.generate(**inputs, max_length=max_gen_length)
         except Exception as e:
             return {
                 "error": "Inference failed",

From 92ce3c916c1eac5d06ee2beb8022a3c558c8f105 Mon Sep 17 00:00:00 2001
From: jhM00n
Date: Sun, 19 May 2024 00:49:36 +0900
Subject: [PATCH 5/5] update cpu, mem, nodepool

---
 automation/llama_inference_deploy/main.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/automation/llama_inference_deploy/main.py b/automation/llama_inference_deploy/main.py
index 52c1181653..ccebc2917b 100644
--- a/automation/llama_inference_deploy/main.py
+++ b/automation/llama_inference_deploy/main.py
@@ -53,12 +53,12 @@ def generate_yaml(user_namespace, endpoint_uid, model_s3_url, node_pool_name, ra
           value: {model_s3_url}
         resources:
          requests:
-            cpu: 2000m
-            memory: 2000M
+            cpu: 1700m
+            memory: 3800M
             nvidia.com/gpu: 1
           limits:
-            cpu: 2000m
-            memory: 2000M
+            cpu: 1700m
+            memory: 3800M
             nvidia.com/gpu: 1
       nodeSelector:
         karpenter.sh/nodepool: {node_pool_name}
@@ -141,7 +141,7 @@ def handler(event, context):
 
     if action == "create":
         model_s3_url = body['model']['s3_url']
-        node_pool_name = body['model']['deployment_type']
+        node_pool_name = "nodepool-1"
         ram_size = body['model']['max_used_ram']
         result = apply_yaml(user_uid, endpoint_uid, model_s3_url, node_pool_name, ram_size)
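
Note: once this series is deployed, the FastAPI server added in kubernetes_app_llama2.py accepts a JSON POST with a "prompt" and an optional "max_gen_len" (defaulting to 512 after PATCH 4/5) and returns {"output": ...}. A minimal client sketch follows; the host and endpoint UID below are hypothetical placeholders, since the real URL is whatever "endpoint" value the deploy Lambda writes to the DB API after creation.

    import requests

    # Hypothetical placeholder: in practice, read the "endpoint" field that
    # the deploy Lambda PUTs to {db_api_url}/inferences/{endpoint_uid}.
    ENDPOINT = "http://<alb-hostname>/<endpoint-uid>"

    payload = {
        "prompt": "Summarize LoRA fine-tuning in two sentences.",
        "max_gen_len": 256,  # optional; the server falls back to 512
    }

    # Generation on a cold GPU node can take a while, so use a generous timeout.
    resp = requests.post(ENDPOINT, json=payload, timeout=300)
    resp.raise_for_status()
    print(resp.json().get("output"))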