Merge pull request #78 from kookmin-sw/jihun
Create Llama inference template code
mh3ong authored May 18, 2024
2 parents 5fe81da + 92ce3c9 commit 9f6a0da
Showing 13 changed files with 429 additions and 2 deletions.
4 changes: 2 additions & 2 deletions automation/deploy_streamlit/main.py
@@ -84,7 +84,7 @@ def init_streamlit(user_namespace, endpoint_uid, endpoint_url, image_name, image
   annotations:
     alb.ingress.kubernetes.io/scheme: internet-facing
     alb.ingress.kubernetes.io/target-type: ip
-    alb.ingress.kubernetes.io/group.name: "streamlit-{user_namespace}"
+    alb.ingress.kubernetes.io/group.name: "{user_namespace}"
 spec:
   ingressClassName: alb
   rules:
@@ -155,7 +155,7 @@ def handler(event, context):
         # inference endpoint URL
         endpoint_url = body.get("endpoint_url")
         result = apply_yaml(user_uid, endpoint_uid, endpoint_url, image_name, image_py_name)
-        cmd = "{} get ingress -A --kubeconfig {} | grep {}".format(kubectl, kubeconfig, endpoint_uid)
+        cmd = "{} get ingress -A --kubeconfig {} | grep {} | grep streamlit".format(kubectl, kubeconfig, endpoint_uid)
         # Streamlit endpoint URL
         streamlit_endpoint_url = subprocess.run(cmd, capture_output=True, shell=True).stdout.decode('utf-8').strip().split()[4]
         print(f"streamlit_endpoint_url: {streamlit_endpoint_url}/streamlit/{endpoint_uid}")
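
Taking split()[4] from grep output depends on kubectl's column layout. A hedged alternative, not part of this commit, reads the same information from kubectl's JSON output and applies the same two substring filters as the grep chain (get_streamlit_ingress_host is a hypothetical helper; kubectl and kubeconfig are the module globals defined in this file):

import json
import subprocess

def get_streamlit_ingress_host(endpoint_uid):
    # Structured equivalent of `kubectl get ingress -A | grep <uid> | grep streamlit`:
    # list ingresses as JSON, filter by name, read the ALB hostname from status.
    out = subprocess.run(
        [kubectl, "get", "ingress", "-A", "-o", "json", "--kubeconfig", kubeconfig],
        capture_output=True
    ).stdout
    for item in json.loads(out)["items"]:
        name = item["metadata"]["name"]
        if endpoint_uid in name and "streamlit" in name:
            for lb in item["status"].get("loadBalancer", {}).get("ingress", []):
                return lb.get("hostname") or lb.get("ip")
    return None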
2 changes: 2 additions & 0 deletions automation/kubernetes_inference_deploy/main.py
@@ -2,6 +2,7 @@
 import requests
 import os
 import json
+import time

 kubectl = '/var/task/kubectl'
 kubeconfig = '/tmp/kubeconfig'
@@ -145,6 +146,7 @@ def handler(event, context):
         result = apply_yaml(user_uid, endpoint_uid, model_s3_url, node_pool_name, ram_size)

         cmd = "{} get ingress -A --kubeconfig {} | grep {}".format(kubectl, kubeconfig, endpoint_uid)
+        time.sleep(10)
         endpoint_url = subprocess.run(cmd, capture_output=True, shell=True).stdout.decode('utf-8').strip().split()[4]
         print(f"endpoint_url: {endpoint_url}")
         update_data = {
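
A fixed ten-second sleep assumes the ALB controller publishes the ingress address within that window, which is not guaranteed. A hedged alternative to the new time.sleep(10), not part of this commit, polls until the ADDRESS column appears (wait_for_ingress_url is a hypothetical helper; kubectl and kubeconfig as defined in this file):

import subprocess
import time

def wait_for_ingress_url(endpoint_uid, timeout=120, interval=5):
    # Poll `kubectl get ingress -A` until the ADDRESS column (field 5)
    # is populated for the matching ingress, instead of sleeping once.
    cmd = "{} get ingress -A --kubeconfig {} | grep {}".format(kubectl, kubeconfig, endpoint_uid)
    deadline = time.time() + timeout
    while time.time() < deadline:
        fields = subprocess.run(cmd, capture_output=True, shell=True).stdout.decode('utf-8').strip().split()
        if len(fields) > 4 and fields[4] != "<none>":
            return fields[4]  # the load balancer hostname
        time.sleep(interval)
    raise TimeoutError(f"no ingress address for {endpoint_uid} after {timeout}s")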
4 changes: 4 additions & 0 deletions automation/llama_inference_deploy/.gitignore
@@ -0,0 +1,4 @@
push_aws_ecr.sh
ecr_login.sh
get_kubeconfig.sh
*test*
13 changes: 13 additions & 0 deletions automation/llama_inference_deploy/Dockerfile
@@ -0,0 +1,13 @@
FROM public.ecr.aws/lambda/python:3.11

RUN pip install awscli requests --no-cache-dir

# x86_64
RUN curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" \
  && chmod +x ./kubectl

COPY main.py ${LAMBDA_TASK_ROOT}

RUN chmod +x /var/task

CMD ["main.handler"]
1 change: 1 addition & 0 deletions automation/llama_inference_deploy/IaC/.gitignore
@@ -0,0 +1 @@
var.tf
35 changes: 35 additions & 0 deletions automation/llama_inference_deploy/IaC/main.tf
@@ -0,0 +1,35 @@
# Change prefix and container_repository to fit your deployment (the values below are generic names)

module "llama_inference_deploy" {
  source                     = "github.com/kookmin-sw/capstone-2024-12//IaC/serverless_api_template"
  prefix                     = "llama-inference-deploy"
  container_registry         = "694448341573.dkr.ecr.ap-northeast-2.amazonaws.com"
  container_repository       = "llama-inference-deploy"
  container_image_tag        = "latest"
  lambda_ram_size            = 2048
  attach_s3_policy           = true
  attach_ec2_policy          = true
  attach_eks_policy          = true
  attach_ssm_readonly_policy = true
  region_name                = var.region
  eks_cluster_name           = var.eks_cluster_name
  db_api_url                 = var.db_api_url
}

output "llama_inference_deploy_function_url" {
  value = module.llama_inference_deploy.function_url
}

provider "aws" {
  region  = var.region
  profile = var.awscli_profile
}

terraform {
  backend "s3" {
    bucket  = "sskai-terraform-state"
    key     = "llama_inference_deploy/tf.state"
    region  = "ap-northeast-2"
    encrypt = true
  }
}
19 changes: 19 additions & 0 deletions automation/llama_inference_deploy/IaC/var.tf.sample
@@ -0,0 +1,19 @@
variable "region" {
type = string
default = "ap-northeast-2"
}

variable "awscli_profile" {
type = string
default = ""
}

variable "eks_cluster_name" {
type = string
default = ""
}

variable "db_api_url" {
type = string
default = ""
}
183 changes: 183 additions & 0 deletions automation/llama_inference_deploy/main.py
@@ -0,0 +1,183 @@
import subprocess
import requests
import os
import json
import time

kubectl = '/var/task/kubectl'
kubeconfig = '/tmp/kubeconfig'

eks_cluster_name = os.getenv('EKS_CLUSTER_NAME')
region = os.getenv("REGION")
db_api_url = os.getenv("DB_API_URL")
ecr_uri = os.getenv("ECR_URI")

# get eks cluster kubernetes configuration by aws cli
result_get_kubeconfig = subprocess.run([
    "aws", "eks", "update-kubeconfig",
    "--name", eks_cluster_name,
    "--region", region,
    "--kubeconfig", kubeconfig
])

def generate_yaml(user_namespace, endpoint_uid, model_s3_url, node_pool_name, ram_size):
    content = f"""---
apiVersion: v1
kind: Namespace
metadata:
  name: {user_namespace}
---
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: {user_namespace}
  name: deployment-{endpoint_uid}
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: app-{endpoint_uid}
  replicas: 2
  template:
    metadata:
      labels:
        app.kubernetes.io/name: app-{endpoint_uid}
    spec:
      containers:
      - image: {ecr_uri}/llama2-inference:latest
        imagePullPolicy: Always
        name: app-{endpoint_uid}
        ports:
        - containerPort: 8080
        env:
        - name: MODEL_S3_URL
          value: {model_s3_url}
        resources:
          requests:
            cpu: 1700m
            memory: 3800M
            nvidia.com/gpu: 1
          limits:
            cpu: 1700m
            memory: 3800M
            nvidia.com/gpu: 1
      nodeSelector:
        karpenter.sh/nodepool: {node_pool_name}
---
apiVersion: v1
kind: Service
metadata:
  namespace: {user_namespace}
  name: service-{endpoint_uid}
spec:
  ports:
  - port: 8080
    targetPort: 8080
    protocol: TCP
  type: ClusterIP
  selector:
    app.kubernetes.io/name: app-{endpoint_uid}
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  namespace: {user_namespace}
  name: ingress-{endpoint_uid}
  annotations:
    alb.ingress.kubernetes.io/scheme: internet-facing
    alb.ingress.kubernetes.io/target-type: ip
    alb.ingress.kubernetes.io/group.name: "{user_namespace}"
spec:
  ingressClassName: alb
  rules:
  - http:
      paths:
      - path: /{endpoint_uid}
        pathType: Prefix
        backend:
          service:
            name: service-{endpoint_uid}
            port:
              number: 8080
"""

    filepath = f"/tmp/{endpoint_uid}.yaml"
    with open(filepath, 'w') as f:
        f.write(content)

    return filepath

def apply_yaml(user_namespace, endpoint_uid, model_s3_url, node_pool_name, ram_size):
    filename = generate_yaml(user_namespace, endpoint_uid, model_s3_url, node_pool_name, ram_size)
    result = subprocess.run([
        kubectl, "apply", "-f", filename, "--kubeconfig", kubeconfig
    ])
    if result.returncode != 0:
        print("create resource returncode != 0")
    return result.returncode

def delete_resource(user_namespace, endpoint_uid):
    deployment_name = f"deployment-{endpoint_uid}"
    service_name = f"service-{endpoint_uid}"
    ingress_name = f"ingress-{endpoint_uid}"
    ingress_result = subprocess.run([
        kubectl, "-n", user_namespace, "delete", "ingress", ingress_name, "--kubeconfig", kubeconfig
    ])
    service_result = subprocess.run([
        kubectl, "-n", user_namespace, "delete", "service", service_name, "--kubeconfig", kubeconfig
    ])
    deployment_result = subprocess.run([
        kubectl, "-n", user_namespace, "delete", "deployment", deployment_name, "--kubeconfig", kubeconfig
    ])
    result = 0
    if ingress_result.returncode != 0 or service_result.returncode != 0 or deployment_result.returncode != 0:
        result = 1
        print("delete resource returncode != 0")
    return result

def handler(event, context):
    body = json.loads(event.get("body", "{}"))
    user_uid = body.get("user").lower()
    endpoint_uid = body.get("uid").lower()
    action = body.get("action")

    if action == "create":
        model_s3_url = body['model']['s3_url']
        node_pool_name = "nodepool-1"
        ram_size = body['model']['max_used_ram']
        result = apply_yaml(user_uid, endpoint_uid, model_s3_url, node_pool_name, ram_size)

        cmd = "{} get ingress -A --kubeconfig {} | grep {}".format(kubectl, kubeconfig, endpoint_uid)
        time.sleep(10)
        endpoint_url = subprocess.run(cmd, capture_output=True, shell=True).stdout.decode('utf-8').strip().split()[4]
        print(f"endpoint_url: {endpoint_url}")
        update_data = {
            "endpoint": f"http://{endpoint_url}/{endpoint_uid}"
        }
        response = requests.put(url=f"{db_api_url}/inferences/{endpoint_uid}", json=update_data)
        if result == 0:
            return {
                'statusCode': 200,
                'body': "complete create inference endpoint"
            }
        else:
            return {
                'statusCode': 500,
                'body': "error with create inference endpoint"
            }
    elif action == "delete":
        result = delete_resource(user_uid, endpoint_uid)
        if result == 0:
            requests.delete(url=f"{db_api_url}/inferences/{endpoint_uid}")
            return {
                'statusCode': 200,
                'body': "complete delete inference deployment"
            }
        else:
            return {
                'statusCode': 500,
                'body': "error with delete inference endpoint"
            }
    else:
        return {
            'statusCode': 500,
            'body': "invalid action"
        }
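
For reference, a minimal local sketch of exercising this handler; the event fields mirror what the code above reads, and every value here is a placeholder, not data from this PR:

if __name__ == "__main__":
    # Hypothetical test event; field names match what handler() reads above.
    event = {
        "body": json.dumps({
            "user": "user-uid-placeholder",
            "uid": "endpoint-uid-placeholder",
            "action": "create",
            "model": {
                "s3_url": "s3://example-bucket/model.tar.gz",  # placeholder
                "max_used_ram": 3800
            }
        })
    }
    print(handler(event, None))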
7 changes: 7 additions & 0 deletions automation/llama_inference_deploy/push_aws_ecr.sh.sample
@@ -0,0 +1,7 @@
#!/bin/sh

ECR_URI=""

aws ecr get-login-password --region ap-northeast-2 | docker login --username AWS --password-stdin $ECR_URI
docker build -t $ECR_URI/llama-inference-deploy:latest .
docker push $ECR_URI/llama-inference-deploy:latest
12 changes: 12 additions & 0 deletions inference/template_code/llama/Dockerfile.kubernetes_gpu
@@ -0,0 +1,12 @@
FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime

WORKDIR /app

COPY requirements_kubernetes_gpu.txt /app/requirements.txt
RUN pip3 install --no-cache-dir -r requirements.txt

COPY kubernetes_app_llama2.py /app/app.py

CMD [ "python3", "/app/app.py" ]

EXPOSE 8080
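
Once the deployment is live, the model is reachable through the ALB at the path defined in the ingress above. A hedged client sketch follows; kubernetes_app_llama2.py is not part of this diff, so the request payload shape here is an assumption and the URL is a placeholder:

import requests

# URL format as stored by the deploy Lambda: http://{endpoint_url}/{endpoint_uid}
url = "http://example-alb.ap-northeast-2.elb.amazonaws.com/endpoint-uid-placeholder"
payload = {"prompt": "Hello, Llama!"}  # assumed schema; the app code is not shown in this diff
response = requests.post(url, json=payload, timeout=60)
print(response.status_code, response.text)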