From e6e5bd9d6df2cd4212023ef96c127a4e3bfc7c95 Mon Sep 17 00:00:00 2001
From: jhM00n
Date: Sat, 18 May 2024 02:09:37 +0900
Subject: [PATCH 1/5] add deploy_llama_inference

---
 .../llama/Dockerfile.kubernetes_gpu        |  12 ++
 .../llama/kubernetes_app_llama2.py         | 128 ++++++++++++++++++
 .../llama/push_aws_ecr.sh.sample           |   7 +
 .../llama/requirements_kubernetes_gpu.txt  |  15 ++
 4 files changed, 162 insertions(+)
 create mode 100644 inference/template_code/llama/Dockerfile.kubernetes_gpu
 create mode 100644 inference/template_code/llama/kubernetes_app_llama2.py
 create mode 100644 inference/template_code/llama/push_aws_ecr.sh.sample
 create mode 100644 inference/template_code/llama/requirements_kubernetes_gpu.txt

diff --git a/inference/template_code/llama/Dockerfile.kubernetes_gpu b/inference/template_code/llama/Dockerfile.kubernetes_gpu
new file mode 100644
index 0000000000..8d58317e60
--- /dev/null
+++ b/inference/template_code/llama/Dockerfile.kubernetes_gpu
@@ -0,0 +1,12 @@
+FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime
+
+WORKDIR /app
+
+COPY requirements_kubernetes_gpu.txt /app/requirements.txt
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+COPY kubernetes_app_llama2.py /app/app.py
+
+CMD [ "python3", "/app/app.py" ]
+
+EXPOSE 8080
\ No newline at end of file
diff --git a/inference/template_code/llama/kubernetes_app_llama2.py b/inference/template_code/llama/kubernetes_app_llama2.py
new file mode 100644
index 0000000000..5371a6d38d
--- /dev/null
+++ b/inference/template_code/llama/kubernetes_app_llama2.py
@@ -0,0 +1,128 @@
+import os
+import requests
+import shutil
+import zipfile
+import torch
+from fastapi import FastAPI, Request
+import uvicorn
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+)
+from peft import PeftModel, LoraConfig, get_peft_model
+
+app = FastAPI()
+
+# Get the model S3 URL from an environment variable
+model_s3_url = os.getenv('MODEL_S3_URL')
+
+# Download the model
+model_download = requests.get(model_s3_url)
+model_filename = model_s3_url.split('/')[-1]
+model_temp_path = os.path.join('temp', model_filename)
+os.makedirs('temp', exist_ok=True)
+with open(model_temp_path, 'wb') as file:
+    file.write(model_download.content)
+
+# Remove any existing model directory and recreate it
+if os.path.exists('model'):
+    shutil.rmtree('model')
+os.makedirs('model')
+
+# Unzip the model archive
+with zipfile.ZipFile(model_temp_path, 'r') as zip_ref:
+    zip_ref.extractall('model')
+
+# Remove the temporary file and directory
+os.remove(model_temp_path)
+shutil.rmtree('temp')
+
+# Base model name (fixed)
+model_name = "NousResearch/Llama-2-7b-chat-hf"
+
+# Quantization configuration
+compute_dtype = getattr(torch, "float16")
+
+quant_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=compute_dtype,
+    bnb_4bit_use_double_quant=False,
+)
+
+# Load the base model (using the fixed model name)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    quantization_config=quant_config,
+    device_map={"": 0}
+)
+
+# Local model path
+model_path = '/app/model/model'
+
+# Load the tokenizer (from the local path)
+tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+
+# Load the PEFT adapter weights
+model = PeftModel.from_pretrained(model, model_path)
+
+# PEFT (LoRA) parameters
+peft_params = LoraConfig(
+    lora_alpha=16,
+    lora_dropout=0.1,
+    r=64,
+    bias="none",
+    task_type="CAUSAL_LM",
+)
+
+model = get_peft_model(model, peft_params)
+
+# Switch the model to evaluation mode
+model.eval()
+
+@app.get("/")
+async def healthcheck():
+    return {
+        "body": "healthy"
+    }
+
+@app.post("/{full_path:path}")
+async def inference(request: Request):
+    data = await request.json()
+    prompt = data.get('prompt', '')
+    try:
+        # Prepare the input tensors
+        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+    except Exception as e:
+        return {
+            "error": "Tokenization failed",
+            "message": str(e)
+        }
+
+    with torch.no_grad():
+        try:
+            # Generate text
+            outputs = model.generate(**inputs, max_length=1024)
+        except Exception as e:
+            return {
+                "error": "Inference failed",
+                "message": str(e)
+            }
+
+    try:
+        # Decode the generated text
+        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    except Exception as e:
+        return {
+            "error": "Decoding failed",
+            "message": str(e)
+        }
+
+    return {
+        "output": generated_text
+    }
+
+
+if __name__ == '__main__':
+    uvicorn.run(app, host="0.0.0.0", port=8080)
\ No newline at end of file
diff --git a/inference/template_code/llama/push_aws_ecr.sh.sample b/inference/template_code/llama/push_aws_ecr.sh.sample
new file mode 100644
index 0000000000..4a8702c532
--- /dev/null
+++ b/inference/template_code/llama/push_aws_ecr.sh.sample
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+ECR_URI=""
+
+aws ecr get-login-password --region ap-northeast-2 | docker login --username AWS --password-stdin $ECR_URI
+docker build -t $ECR_URI/llama2-inference:latest . -f Dockerfile.kubernetes_gpu
+docker push $ECR_URI/llama2-inference:latest
\ No newline at end of file
diff --git a/inference/template_code/llama/requirements_kubernetes_gpu.txt b/inference/template_code/llama/requirements_kubernetes_gpu.txt
new file mode 100644
index 0000000000..0c51191697
--- /dev/null
+++ b/inference/template_code/llama/requirements_kubernetes_gpu.txt
@@ -0,0 +1,15 @@
+torch==2.3.0
+fastapi==0.111.0
+uvicorn==0.29.0
+numpy==1.26.4
+requests==2.31.0
+bitsandbytes==0.40.2
+accelerate==0.21.0
+peft==0.4.0
+transformers==4.31.0
+datasets==2.19.1
+pandas==2.2.2
+pyarrow==16.0.0
+scipy==1.13.0
+tensorboardX==2.6.2.2
+xformers==0.0.26.post1
\ No newline at end of file

From 0bf845666dafac64166f55e7cbeed82d90d0a4a2 Mon Sep 17 00:00:00 2001
From: jhM00n
Date: Sat, 18 May 2024 02:10:13 +0900
Subject: [PATCH 2/5] add llama_inference_deploy

---
 automation/llama_inference_deploy/.gitignore       |   4 +
 automation/llama_inference_deploy/Dockerfile       |  13 ++
 .../llama_inference_deploy/IaC/.gitignore          |   1 +
 automation/llama_inference_deploy/IaC/main.tf      |  35 ++++
 .../llama_inference_deploy/IaC/var.tf.sample       |  19 ++
 automation/llama_inference_deploy/main.py          | 183 ++++++++++++++++++
 .../push_aws_ecr.sh.sample                         |   7 +
 7 files changed, 262 insertions(+)
 create mode 100644 automation/llama_inference_deploy/.gitignore
 create mode 100644 automation/llama_inference_deploy/Dockerfile
 create mode 100644 automation/llama_inference_deploy/IaC/.gitignore
 create mode 100644 automation/llama_inference_deploy/IaC/main.tf
 create mode 100644 automation/llama_inference_deploy/IaC/var.tf.sample
 create mode 100644 automation/llama_inference_deploy/main.py
 create mode 100644 automation/llama_inference_deploy/push_aws_ecr.sh.sample

diff --git a/automation/llama_inference_deploy/.gitignore b/automation/llama_inference_deploy/.gitignore
new file mode 100644
index 0000000000..ee1f4c4fb5
--- /dev/null
+++ b/automation/llama_inference_deploy/.gitignore
@@ -0,0 +1,4 @@
+push_aws_ecr.sh
+ecr_login.sh
+get_kubeconfig.sh
+*test*
\ No newline at end of file
diff --git a/automation/llama_inference_deploy/Dockerfile b/automation/llama_inference_deploy/Dockerfile
new file mode 100644
index 0000000000..d7c845cb6f
--- /dev/null
+++ b/automation/llama_inference_deploy/Dockerfile
@@ -0,0 +1,13 @@
+FROM public.ecr.aws/lambda/python:3.11
+
+RUN pip install awscli requests --no-cache-dir
+
+# x86_64
+RUN curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" \
+    && chmod +x ./kubectl
+
+COPY main.py ${LAMBDA_TASK_ROOT}
+
+RUN chmod +x /var/task
+
+CMD ["main.handler"]
\ No newline at end of file
diff --git a/automation/llama_inference_deploy/IaC/.gitignore b/automation/llama_inference_deploy/IaC/.gitignore
new file mode 100644
index 0000000000..4ee3ae2ee7
--- /dev/null
+++ b/automation/llama_inference_deploy/IaC/.gitignore
@@ -0,0 +1 @@
+var.tf
\ No newline at end of file
diff --git a/automation/llama_inference_deploy/IaC/main.tf b/automation/llama_inference_deploy/IaC/main.tf
new file mode 100644
index 0000000000..fb44f42dec
--- /dev/null
+++ b/automation/llama_inference_deploy/IaC/main.tf
@@ -0,0 +1,35 @@
+# Change prefix and container_repository to names that fit your deployment
+
+module "llama_inference_deploy" {
+  source = "github.com/kookmin-sw/capstone-2024-12//IaC/serverless_api_template"
+  prefix = "llama-inference-deploy"
+  container_registry = "694448341573.dkr.ecr.ap-northeast-2.amazonaws.com"
+  container_repository = "llama-inference-deploy"
+  container_image_tag = "latest"
+  lambda_ram_size = 2048
+  attach_s3_policy = true
+  attach_ec2_policy = true
+  attach_eks_policy = true
+  attach_ssm_readonly_policy = true
+  region_name = var.region
+  eks_cluster_name = var.eks_cluster_name
+  db_api_url = var.db_api_url
+}
+
+output "llama_inference_deploy_function_url" {
+  value = module.llama_inference_deploy.function_url
+}
+
+provider "aws" {
+  region = var.region
+  profile = var.awscli_profile
+}
+
+terraform {
+  backend "s3" {
+    bucket = "sskai-terraform-state"
+    key = "llama_inference_deploy/tf.state"
+    region = "ap-northeast-2"
+    encrypt = true
+  }
+}
diff --git a/automation/llama_inference_deploy/IaC/var.tf.sample b/automation/llama_inference_deploy/IaC/var.tf.sample
new file mode 100644
index 0000000000..758c8ab6c2
--- /dev/null
+++ b/automation/llama_inference_deploy/IaC/var.tf.sample
@@ -0,0 +1,19 @@
+variable "region" {
+  type = string
+  default = "ap-northeast-2"
+}
+
+variable "awscli_profile" {
+  type = string
+  default = ""
+}
+
+variable "eks_cluster_name" {
+  type = string
+  default = ""
+}
+
+variable "db_api_url" {
+  type = string
+  default = ""
+}
\ No newline at end of file
diff --git a/automation/llama_inference_deploy/main.py b/automation/llama_inference_deploy/main.py
new file mode 100644
index 0000000000..52c1181653
--- /dev/null
+++ b/automation/llama_inference_deploy/main.py
@@ -0,0 +1,183 @@
+import subprocess
+import requests
+import os
+import json
+import time
+
+kubectl = '/var/task/kubectl'
+kubeconfig = '/tmp/kubeconfig'
+
+eks_cluster_name = os.getenv('EKS_CLUSTER_NAME')
+region = os.getenv("REGION")
+db_api_url = os.getenv("DB_API_URL")
+ecr_uri = os.getenv("ECR_URI")
+
+# get eks cluster kubernetes configuration by aws cli
+result_get_kubeconfig = subprocess.run([
+    "aws", "eks", "update-kubeconfig",
+    "--name", eks_cluster_name,
+    "--region", region,
+    "--kubeconfig", kubeconfig
+])
+
+def generate_yaml(user_namespace, endpoint_uid, model_s3_url, node_pool_name, ram_size):
+    content = f"""---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: {user_namespace}
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  namespace: {user_namespace}
+  name: deployment-{endpoint_uid}
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: app-{endpoint_uid}
+  replicas: 2
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: app-{endpoint_uid}
+    spec:
+      containers:
+      - image: {ecr_uri}/llama2-inference:latest
+        imagePullPolicy: Always
+        name: app-{endpoint_uid}
+        ports:
+        - containerPort: 8080
+        env:
+        - name: MODEL_S3_URL
+          value: {model_s3_url}
+        resources:
+          requests:
+            cpu: 2000m
+            memory: 2000M
+            nvidia.com/gpu: 1
+          limits:
+            cpu: 2000m
+            memory: 2000M
+            nvidia.com/gpu: 1
+      nodeSelector:
+        karpenter.sh/nodepool: {node_pool_name}
+---
+apiVersion: v1
+kind: Service
+metadata:
+  namespace: {user_namespace}
+  name: service-{endpoint_uid}
+spec:
+  ports:
+    - port: 8080
+      targetPort: 8080
+      protocol: TCP
+  type: ClusterIP
+  selector:
+    app.kubernetes.io/name: app-{endpoint_uid}
+---
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  namespace: {user_namespace}
+  name: ingress-{endpoint_uid}
+  annotations:
+    alb.ingress.kubernetes.io/scheme: internet-facing
+    alb.ingress.kubernetes.io/target-type: ip
+    alb.ingress.kubernetes.io/group.name: "{user_namespace}"
+spec:
+  ingressClassName: alb
+  rules:
+    - http:
+        paths:
+        - path: /{endpoint_uid}
+          pathType: Prefix
+          backend:
+            service:
+              name: service-{endpoint_uid}
+              port:
+                number: 8080
+"""
+
+    filepath = f"/tmp/{endpoint_uid}.yaml"
+    with open(filepath, 'w') as f:
+        f.write(content)
+
+    return filepath
+
+def apply_yaml(user_namespace, endpoint_uid, model_s3_url, node_pool_name, ram_size):
+    filename = generate_yaml(user_namespace, endpoint_uid, model_s3_url, node_pool_name, ram_size)
+    result = subprocess.run([
+        kubectl, "apply", "-f", filename, "--kubeconfig", kubeconfig
+    ])
+    if result.returncode != 0: print("create resource returncode != 0")
+    return result.returncode
+
+def delete_resource(user_namespace, endpoint_uid):
+    deployment_name = f"deployment-{endpoint_uid}"
+    service_name = f"service-{endpoint_uid}"
+    ingress_name = f"ingress-{endpoint_uid}"
+    ingress_result = subprocess.run([
+        kubectl, "-n", user_namespace, "delete", "ingress", ingress_name, "--kubeconfig", kubeconfig
+    ])
+    service_result = subprocess.run([
+        kubectl, "-n", user_namespace, "delete", "service", service_name, "--kubeconfig", kubeconfig
+    ])
+    deployment_result = subprocess.run([
+        kubectl, "-n", user_namespace, "delete", "deployment", deployment_name, "--kubeconfig", kubeconfig
+    ])
+    result = 0
+    if ingress_result.returncode != 0 or service_result.returncode != 0 or deployment_result.returncode != 0:
+        result = 1
+        print("delete resource returncode != 0")
+    return result
+
+def handler(event, context):
+    body = json.loads(event.get("body", "{}"))
+    user_uid = body.get("user").lower()
+    endpoint_uid = body.get("uid").lower()
+    action = body.get("action")
+
+    if action == "create":
+        model_s3_url = body['model']['s3_url']
+        node_pool_name = body['model']['deployment_type']
+        ram_size = body['model']['max_used_ram']
+        result = apply_yaml(user_uid, endpoint_uid, model_s3_url, node_pool_name, ram_size)
+
+        cmd = "{} get ingress -A --kubeconfig {} | grep {}".format(kubectl, kubeconfig, endpoint_uid)
+        time.sleep(10)
+        endpoint_url = subprocess.run(cmd, capture_output=True, shell=True).stdout.decode('utf-8').strip().split()[4]
+        print(f"endpoint_url: {endpoint_url}")
+        update_data = {
+            "endpoint": f"http://{endpoint_url}/{endpoint_uid}"
+        }
+        response = requests.put(url=f"{db_api_url}/inferences/{endpoint_uid}", json=update_data)
+        if result == 0:
+            return {
+                'statusCode': 200,
+                'body': "complete create inference endpoint"
+            }
+        else:
+            return {
+                'statusCode': 500,
+                'body': "error with create inference endpoint"
+            }
+    elif action == "delete":
+        result = delete_resource(user_uid, endpoint_uid)
+        if result == 0:
+            requests.delete(url=f"{db_api_url}/inferences/{endpoint_uid}")
+            return {
+                'statusCode': 200,
+                'body': "complete delete inference deployment"
+            }
+        else:
+            return {
+                'statusCode': 500,
+                'body': "error with delete inference endpoint"
+            }
+    else:
+        return {
+            'statusCode': 500,
+            'body': "invalid action"
+        }
\ No newline at end of file
diff --git a/automation/llama_inference_deploy/push_aws_ecr.sh.sample b/automation/llama_inference_deploy/push_aws_ecr.sh.sample
new file mode 100644
index 0000000000..2c2b9d8f77
--- /dev/null
+++ b/automation/llama_inference_deploy/push_aws_ecr.sh.sample
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+ECR_URI=""
+
+aws ecr get-login-password --region ap-northeast-2 | docker login --username AWS --password-stdin $ECR_URI
+docker build -t $ECR_URI/llama-inference-deploy:latest .
+docker push $ECR_URI/llama-inference-deploy:latest
\ No newline at end of file

From a2d6d9fd6731c84c220b103285effb5314e8ac8c Mon Sep 17 00:00:00 2001
From: jhM00n
Date: Sat, 18 May 2024 02:11:03 +0900
Subject: [PATCH 3/5] fix typo

---
 automation/kubernetes_inference_deploy/main.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/automation/kubernetes_inference_deploy/main.py b/automation/kubernetes_inference_deploy/main.py
index 35ddd646ac..189c380443 100644
--- a/automation/kubernetes_inference_deploy/main.py
+++ b/automation/kubernetes_inference_deploy/main.py
@@ -2,6 +2,7 @@
 import requests
 import os
 import json
+import time
 
 kubectl = '/var/task/kubectl'
 kubeconfig = '/tmp/kubeconfig'
@@ -145,6 +146,7 @@ def handler(event, context):
         result = apply_yaml(user_uid, endpoint_uid, model_s3_url, node_pool_name, ram_size)
 
         cmd = "{} get ingress -A --kubeconfig {} | grep {}".format(kubectl, kubeconfig, endpoint_uid)
+        time.sleep(10)
         endpoint_url = subprocess.run(cmd, capture_output=True, shell=True).stdout.decode('utf-8').strip().split()[4]
         print(f"endpoint_url: {endpoint_url}")
         update_data = {

From 108154ec9033ab07b2c399b01d3ebfc9c569ee15 Mon Sep 17 00:00:00 2001
From: jhM00n
Date: Sun, 19 May 2024 00:41:11 +0900
Subject: [PATCH 4/5] fix max_len

---
 automation/deploy_streamlit/main.py                    | 4 ++--
 inference/template_code/llama/kubernetes_app_llama2.py | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/automation/deploy_streamlit/main.py b/automation/deploy_streamlit/main.py
index 945671c0f1..6f20b71017 100644
--- a/automation/deploy_streamlit/main.py
+++ b/automation/deploy_streamlit/main.py
@@ -84,7 +84,7 @@ def init_streamlit(user_namespace, endpoint_uid, endpoint_url, image_name, image
   annotations:
     alb.ingress.kubernetes.io/scheme: internet-facing
     alb.ingress.kubernetes.io/target-type: ip
-    alb.ingress.kubernetes.io/group.name: "streamlit-{user_namespace}"
+    alb.ingress.kubernetes.io/group.name: "{user_namespace}"
 spec:
   ingressClassName: alb
   rules:
@@ -155,7 +155,7 @@ def handler(event, context):
         # Inference endpoint address
         endpoint_url = body.get("endpoint_url")
         result = apply_yaml(user_uid, endpoint_uid, endpoint_url, image_name, image_py_name)
-        cmd = "{} get ingress -A --kubeconfig {} | grep {}".format(kubectl, kubeconfig, endpoint_uid)
+        cmd = "{} get ingress -A --kubeconfig {} | grep {} | grep streamlit".format(kubectl, kubeconfig, endpoint_uid)
         # Streamlit endpoint address
         streamlit_endpoint_url = subprocess.run(cmd, capture_output=True, shell=True).stdout.decode('utf-8').strip().split()[4]
         print(f"streamlit_endpoint_url: {streamlit_endpoint_url}/streamlit/{endpoint_uid}")
diff --git a/inference/template_code/llama/kubernetes_app_llama2.py b/inference/template_code/llama/kubernetes_app_llama2.py
index 5371a6d38d..0b424f27eb 100644
--- a/inference/template_code/llama/kubernetes_app_llama2.py
+++ b/inference/template_code/llama/kubernetes_app_llama2.py
@@ -91,6 +91,7 @@ async def healthcheck():
 async def inference(request: Request):
     data = await request.json()
     prompt = data.get('prompt', '')
+    max_gen_length = data.get('max_gen_len', 512)
     try:
         # Prepare the input tensors
         inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
@@ -103,7 +104,7 @@ async def inference(request: Request):
     with torch.no_grad():
         try:
             # Generate text
-            outputs = model.generate(**inputs, max_length=1024)
+            outputs = model.generate(**inputs, max_length=max_gen_length)
         except Exception as e:
             return {
                 "error": "Inference failed",

From 92ce3c916c1eac5d06ee2beb8022a3c558c8f105 Mon Sep 17 00:00:00 2001
From: jhM00n
Date: Sun, 19 May 2024 00:49:36 +0900
Subject: [PATCH 5/5] update cpu, mem, nodepool

---
 automation/llama_inference_deploy/main.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/automation/llama_inference_deploy/main.py b/automation/llama_inference_deploy/main.py
index 52c1181653..ccebc2917b 100644
--- a/automation/llama_inference_deploy/main.py
+++ b/automation/llama_inference_deploy/main.py
@@ -53,12 +53,12 @@ def generate_yaml(user_namespace, endpoint_uid, model_s3_url, node_pool_name, ra
           value: {model_s3_url}
         resources:
          requests:
-            cpu: 2000m
-            memory: 2000M
+            cpu: 1700m
+            memory: 3800M
             nvidia.com/gpu: 1
           limits:
-            cpu: 2000m
-            memory: 2000M
+            cpu: 1700m
+            memory: 3800M
             nvidia.com/gpu: 1
       nodeSelector:
         karpenter.sh/nodepool: {node_pool_name}
@@ -141,7 +141,7 @@ def handler(event, context):
 
     if action == "create":
         model_s3_url = body['model']['s3_url']
-        node_pool_name = body['model']['deployment_type']
+        node_pool_name = "nodepool-1"
         ram_size = body['model']['max_used_ram']
         result = apply_yaml(user_uid, endpoint_uid, model_s3_url, node_pool_name, ram_size)
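
Note: once this series is deployed, the FastAPI server added in kubernetes_app_llama2.py accepts a JSON POST with a "prompt" and an optional "max_gen_len" (defaulting to 512 after PATCH 4/5) and returns {"output": ...}. A minimal client sketch follows; the host and endpoint UID below are hypothetical placeholders, since the real URL is whatever "endpoint" value the deploy Lambda writes to the DB API after creation.

    import requests

    # Hypothetical placeholder: in practice, read the "endpoint" field that
    # the deploy Lambda PUTs to {db_api_url}/inferences/{endpoint_uid}.
    ENDPOINT = "http://<alb-hostname>/<endpoint-uid>"

    payload = {
        "prompt": "Summarize LoRA fine-tuning in two sentences.",
        "max_gen_len": 256,  # optional; the server falls back to 512
    }

    # Generation on a cold GPU node can take a while, so use a generous timeout.
    resp = requests.post(ENDPOINT, json=payload, timeout=300)
    resp.raise_for_status()
    print(resp.json().get("output"))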