Merge pull request #78 from kookmin-sw/jihun
Create Llama inference template code
mh3ong authored May 18, 2024
2 parents 5fe81da + 92ce3c9 commit 9f6a0da
Showing 13 changed files with 429 additions and 2 deletions.
4 changes: 2 additions & 2 deletions automation/deploy_streamlit/main.py
@@ -84,7 +84,7 @@ def init_streamlit(user_namespace, endpoint_uid, endpoint_url, image_name, image
   annotations:
     alb.ingress.kubernetes.io/scheme: internet-facing
     alb.ingress.kubernetes.io/target-type: ip
-    alb.ingress.kubernetes.io/group.name: "streamlit-{user_namespace}"
+    alb.ingress.kubernetes.io/group.name: "{user_namespace}"
 spec:
   ingressClassName: alb
   rules:
@@ -155,7 +155,7 @@ def handler(event, context):
         # inference endpoint URL
         endpoint_url = body.get("endpoint_url")
         result = apply_yaml(user_uid, endpoint_uid, endpoint_url, image_name, image_py_name)
-        cmd = "{} get ingress -A --kubeconfig {} | grep {}".format(kubectl, kubeconfig, endpoint_uid)
+        cmd = "{} get ingress -A --kubeconfig {} | grep {} | grep streamlit".format(kubectl, kubeconfig, endpoint_uid)
         # Streamlit endpoint URL
         streamlit_endpoint_url = subprocess.run(cmd, capture_output=True, shell=True).stdout.decode('utf-8').strip().split()[4]
         print(f"streamlit_endpoint_url: {streamlit_endpoint_url}/streamlit/{endpoint_uid}")
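
Taking split()[4] from grep output depends on kubectl's column layout. A hedged alternative, not part of this commit, reads the same information from kubectl's JSON output and applies the same two substring filters as the grep chain (get_streamlit_ingress_host is a hypothetical helper; kubectl and kubeconfig are the module globals defined in this file):

import json
import subprocess

def get_streamlit_ingress_host(endpoint_uid):
    # Structured equivalent of `kubectl get ingress -A | grep <uid> | grep streamlit`:
    # list ingresses as JSON, filter by name, read the ALB hostname from status.
    out = subprocess.run(
        [kubectl, "get", "ingress", "-A", "-o", "json", "--kubeconfig", kubeconfig],
        capture_output=True
    ).stdout
    for item in json.loads(out)["items"]:
        name = item["metadata"]["name"]
        if endpoint_uid in name and "streamlit" in name:
            for lb in item["status"].get("loadBalancer", {}).get("ingress", []):
                return lb.get("hostname") or lb.get("ip")
    return None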
2 changes: 2 additions & 0 deletions automation/kubernetes_inference_deploy/main.py
@@ -2,6 +2,7 @@
 import requests
 import os
 import json
+import time

 kubectl = '/var/task/kubectl'
 kubeconfig = '/tmp/kubeconfig'
@@ -145,6 +146,7 @@ def handler(event, context):
         result = apply_yaml(user_uid, endpoint_uid, model_s3_url, node_pool_name, ram_size)

         cmd = "{} get ingress -A --kubeconfig {} | grep {}".format(kubectl, kubeconfig, endpoint_uid)
+        time.sleep(10)
         endpoint_url = subprocess.run(cmd, capture_output=True, shell=True).stdout.decode('utf-8').strip().split()[4]
         print(f"endpoint_url: {endpoint_url}")
         update_data = {
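
A fixed ten-second sleep assumes the ALB controller publishes the ingress address within that window, which is not guaranteed. A hedged alternative to the new time.sleep(10), not part of this commit, polls until the ADDRESS column appears (wait_for_ingress_url is a hypothetical helper; kubectl and kubeconfig as defined in this file):

import subprocess
import time

def wait_for_ingress_url(endpoint_uid, timeout=120, interval=5):
    # Poll `kubectl get ingress -A` until the ADDRESS column (field 5)
    # is populated for the matching ingress, instead of sleeping once.
    cmd = "{} get ingress -A --kubeconfig {} | grep {}".format(kubectl, kubeconfig, endpoint_uid)
    deadline = time.time() + timeout
    while time.time() < deadline:
        fields = subprocess.run(cmd, capture_output=True, shell=True).stdout.decode('utf-8').strip().split()
        if len(fields) > 4 and fields[4] != "<none>":
            return fields[4]  # the load balancer hostname
        time.sleep(interval)
    raise TimeoutError(f"no ingress address for {endpoint_uid} after {timeout}s")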
4 changes: 4 additions & 0 deletions automation/llama_inference_deploy/.gitignore
@@ -0,0 +1,4 @@
push_aws_ecr.sh
ecr_login.sh
get_kubeconfig.sh
*test*
13 changes: 13 additions & 0 deletions automation/llama_inference_deploy/Dockerfile
@@ -0,0 +1,13 @@
FROM public.ecr.aws/lambda/python:3.11

RUN pip install awscli requests --no-cache-dir

# x86_64
RUN curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" \
  && chmod +x ./kubectl

COPY main.py ${LAMBDA_TASK_ROOT}

RUN chmod +x /var/task

CMD ["main.handler"]
1 change: 1 addition & 0 deletions automation/llama_inference_deploy/IaC/.gitignore
@@ -0,0 +1 @@
var.tf
35 changes: 35 additions & 0 deletions automation/llama_inference_deploy/IaC/main.tf
@@ -0,0 +1,35 @@
# Change prefix and container_repository to fit your deployment (the values below are generic names)

module "llama_inference_deploy" {
  source                     = "github.com/kookmin-sw/capstone-2024-12//IaC/serverless_api_template"
  prefix                     = "llama-inference-deploy"
  container_registry         = "694448341573.dkr.ecr.ap-northeast-2.amazonaws.com"
  container_repository       = "llama-inference-deploy"
  container_image_tag        = "latest"
  lambda_ram_size            = 2048
  attach_s3_policy           = true
  attach_ec2_policy          = true
  attach_eks_policy          = true
  attach_ssm_readonly_policy = true
  region_name                = var.region
  eks_cluster_name           = var.eks_cluster_name
  db_api_url                 = var.db_api_url
}

output "llama_inference_deploy_function_url" {
  value = module.llama_inference_deploy.function_url
}

provider "aws" {
  region  = var.region
  profile = var.awscli_profile
}

terraform {
  backend "s3" {
    bucket  = "sskai-terraform-state"
    key     = "llama_inference_deploy/tf.state"
    region  = "ap-northeast-2"
    encrypt = true
  }
}
19 changes: 19 additions & 0 deletions automation/llama_inference_deploy/IaC/var.tf.sample
@@ -0,0 +1,19 @@
variable "region" {
type = string
default = "ap-northeast-2"
}

variable "awscli_profile" {
type = string
default = ""
}

variable "eks_cluster_name" {
type = string
default = ""
}

variable "db_api_url" {
type = string
default = ""
}
183 changes: 183 additions & 0 deletions automation/llama_inference_deploy/main.py
@@ -0,0 +1,183 @@
import subprocess
import requests
import os
import json
import time

kubectl = '/var/task/kubectl'
kubeconfig = '/tmp/kubeconfig'

eks_cluster_name = os.getenv('EKS_CLUSTER_NAME')
region = os.getenv("REGION")
db_api_url = os.getenv("DB_API_URL")
ecr_uri = os.getenv("ECR_URI")

# get eks cluster kubernetes configuration by aws cli
result_get_kubeconfig = subprocess.run([
    "aws", "eks", "update-kubeconfig",
    "--name", eks_cluster_name,
    "--region", region,
    "--kubeconfig", kubeconfig
])

def generate_yaml(user_namespace, endpoint_uid, model_s3_url, node_pool_name, ram_size):
    content = f"""---
apiVersion: v1
kind: Namespace
metadata:
  name: {user_namespace}
---
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: {user_namespace}
  name: deployment-{endpoint_uid}
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: app-{endpoint_uid}
  replicas: 2
  template:
    metadata:
      labels:
        app.kubernetes.io/name: app-{endpoint_uid}
    spec:
      containers:
      - image: {ecr_uri}/llama2-inference:latest
        imagePullPolicy: Always
        name: app-{endpoint_uid}
        ports:
        - containerPort: 8080
        env:
        - name: MODEL_S3_URL
          value: {model_s3_url}
        resources:
          requests:
            cpu: 1700m
            memory: 3800M
            nvidia.com/gpu: 1
          limits:
            cpu: 1700m
            memory: 3800M
            nvidia.com/gpu: 1
      nodeSelector:
        karpenter.sh/nodepool: {node_pool_name}
---
apiVersion: v1
kind: Service
metadata:
  namespace: {user_namespace}
  name: service-{endpoint_uid}
spec:
  ports:
  - port: 8080
    targetPort: 8080
    protocol: TCP
  type: ClusterIP
  selector:
    app.kubernetes.io/name: app-{endpoint_uid}
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  namespace: {user_namespace}
  name: ingress-{endpoint_uid}
  annotations:
    alb.ingress.kubernetes.io/scheme: internet-facing
    alb.ingress.kubernetes.io/target-type: ip
    alb.ingress.kubernetes.io/group.name: "{user_namespace}"
spec:
  ingressClassName: alb
  rules:
  - http:
      paths:
      - path: /{endpoint_uid}
        pathType: Prefix
        backend:
          service:
            name: service-{endpoint_uid}
            port:
              number: 8080
"""

    filepath = f"/tmp/{endpoint_uid}.yaml"
    with open(filepath, 'w') as f:
        f.write(content)

    return filepath

def apply_yaml(user_namespace, endpoint_uid, model_s3_url, node_pool_name, ram_size):
    filename = generate_yaml(user_namespace, endpoint_uid, model_s3_url, node_pool_name, ram_size)
    result = subprocess.run([
        kubectl, "apply", "-f", filename, "--kubeconfig", kubeconfig
    ])
    if result.returncode != 0:
        print("create resource returncode != 0")
    return result.returncode

def delete_resource(user_namespace, endpoint_uid):
    deployment_name = f"deployment-{endpoint_uid}"
    service_name = f"service-{endpoint_uid}"
    ingress_name = f"ingress-{endpoint_uid}"
    ingress_result = subprocess.run([
        kubectl, "-n", user_namespace, "delete", "ingress", ingress_name, "--kubeconfig", kubeconfig
    ])
    service_result = subprocess.run([
        kubectl, "-n", user_namespace, "delete", "service", service_name, "--kubeconfig", kubeconfig
    ])
    deployment_result = subprocess.run([
        kubectl, "-n", user_namespace, "delete", "deployment", deployment_name, "--kubeconfig", kubeconfig
    ])
    result = 0
    if ingress_result.returncode != 0 or service_result.returncode != 0 or deployment_result.returncode != 0:
        result = 1
        print("delete resource returncode != 0")
    return result

def handler(event, context):
    body = json.loads(event.get("body", "{}"))
    user_uid = body.get("user").lower()
    endpoint_uid = body.get("uid").lower()
    action = body.get("action")

    if action == "create":
        model_s3_url = body['model']['s3_url']
        node_pool_name = "nodepool-1"
        ram_size = body['model']['max_used_ram']
        result = apply_yaml(user_uid, endpoint_uid, model_s3_url, node_pool_name, ram_size)

        cmd = "{} get ingress -A --kubeconfig {} | grep {}".format(kubectl, kubeconfig, endpoint_uid)
        time.sleep(10)
        endpoint_url = subprocess.run(cmd, capture_output=True, shell=True).stdout.decode('utf-8').strip().split()[4]
        print(f"endpoint_url: {endpoint_url}")
        update_data = {
            "endpoint": f"http://{endpoint_url}/{endpoint_uid}"
        }
        response = requests.put(url=f"{db_api_url}/inferences/{endpoint_uid}", json=update_data)
        if result == 0:
            return {
                'statusCode': 200,
                'body': "complete create inference endpoint"
            }
        else:
            return {
                'statusCode': 500,
                'body': "error with create inference endpoint"
            }
    elif action == "delete":
        result = delete_resource(user_uid, endpoint_uid)
        if result == 0:
            requests.delete(url=f"{db_api_url}/inferences/{endpoint_uid}")
            return {
                'statusCode': 200,
                'body': "complete delete inference deployment"
            }
        else:
            return {
                'statusCode': 500,
                'body': "error with delete inference endpoint"
            }
    else:
        return {
            'statusCode': 500,
            'body': "invalid action"
        }
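
For reference, a minimal local sketch of exercising this handler; the event fields mirror what the code above reads, and every value here is a placeholder, not data from this PR:

if __name__ == "__main__":
    # Hypothetical test event; field names match what handler() reads above.
    event = {
        "body": json.dumps({
            "user": "user-uid-placeholder",
            "uid": "endpoint-uid-placeholder",
            "action": "create",
            "model": {
                "s3_url": "s3://example-bucket/model.tar.gz",  # placeholder
                "max_used_ram": 3800
            }
        })
    }
    print(handler(event, None))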
7 changes: 7 additions & 0 deletions automation/llama_inference_deploy/push_aws_ecr.sh.sample
@@ -0,0 +1,7 @@
#!/bin/sh

ECR_URI=""

aws ecr get-login-password --region ap-northeast-2 | docker login --username AWS --password-stdin $ECR_URI
docker build -t $ECR_URI/llama-inference-deploy:latest .
docker push $ECR_URI/llama-inference-deploy:latest
12 changes: 12 additions & 0 deletions inference/template_code/llama/Dockerfile.kubernetes_gpu
@@ -0,0 +1,12 @@
FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime

WORKDIR /app

COPY requirements_kubernetes_gpu.txt /app/requirements.txt
RUN pip3 install --no-cache-dir -r requirements.txt

COPY kubernetes_app_llama2.py /app/app.py

CMD [ "python3", "/app/app.py" ]

EXPOSE 8080
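
Once the deployment is live, the model is reachable through the ALB at the path defined in the ingress above. A hedged client sketch follows; kubernetes_app_llama2.py is not part of this diff, so the request payload shape here is an assumption and the URL is a placeholder:

import requests

# URL format as stored by the deploy Lambda: http://{endpoint_url}/{endpoint_uid}
url = "http://example-alb.ap-northeast-2.elb.amazonaws.com/endpoint-uid-placeholder"
payload = {"prompt": "Hello, Llama!"}  # assumed schema; the app code is not shown in this diff
response = requests.post(url, json=payload, timeout=60)
print(response.status_code, response.text)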