forked from sgl-project/sglang
-
Notifications
You must be signed in to change notification settings - Fork 1
/
k8s-sglang-service.yaml
76 lines (76 loc) · 1.94 KB
/
k8s-sglang-service.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
---
# RuntimeClass "nvidia": maps the runtime class name "nvidia" to the node
# container-runtime handler that is also called "nvidia". Pods opt in by
# setting `runtimeClassName: nvidia` in their pod spec.
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  # Name that pods reference through spec.runtimeClassName.
  name: nvidia
# Handler name as configured in the node's container runtime.
handler: nvidia
---
# Deployment: one SGLang server pod serving meta-llama/Llama-3.1-8B-Instruct
# on container port 30000.
#
# Prerequisites visible from this manifest:
#   - a RuntimeClass named "nvidia" (see runtimeClassName);
#   - a Secret "hf-token-secret" with key "token" that holds the Hugging Face
#     access token, for example:
#       kubectl create secret generic hf-token-secret \
#         --from-literal=token=YOUR_HF_TOKEN
#   - the directory /root/.cache/huggingface must already exist on the node
#     (hostPath type: Directory does not create it).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: meta-llama-31-8b-instruct-sglang
spec:
  replicas: 1
  strategy:
    # Recreate terminates the old pod before the new one is created.
    # NOTE(review): presumably chosen so two pods never compete for the
    # same GPU during a rollout - confirm.
    type: Recreate
  selector:
    matchLabels:
      app: meta-llama-31-8b-instruct-sglang
  template:
    metadata:
      labels:
        # `app` must match spec.selector.matchLabels above.
        app: meta-llama-31-8b-instruct-sglang
        model: meta-llama-31-8b-instruct
        engine: sglang
    spec:
      # Shares the node's IPC namespace with the pod.
      # NOTE(review): presumably needed for shared-memory use by the server;
      # confirm whether an emptyDir (medium: Memory) on /dev/shm would do.
      hostIPC: true
      restartPolicy: Always
      runtimeClassName: nvidia
      containers:
        - name: meta-llama-31-8b-instruct-sglang
          # NOTE(review): `latest` is a floating tag; pin a released version
          # for reproducible rollouts.
          image: docker.io/lmsysorg/sglang:latest
          # Alternatives: IfNotPresent or Never.
          imagePullPolicy: Always
          ports:
            - containerPort: 30000
          command: ["python3", "-m", "sglang.launch_server"]
          args:
            - "--model-path"
            - "meta-llama/Llama-3.1-8B-Instruct"
            - "--host"
            - "0.0.0.0"
            - "--port"
            - "30000"
          env:
            # The token is read from a Secret instead of being written into
            # this manifest, so it never ends up in version control.
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token-secret
                  key: token
          resources:
            limits:
              nvidia.com/gpu: 1
          volumeMounts:
            # Hugging Face cache from the node, mounted read-only: the
            # container cannot write downloads into it.
            # NOTE(review): this presumably requires the model files to be
            # present on the node beforehand - confirm, or drop readOnly.
            - name: hf-cache
              mountPath: /root/.cache/huggingface
              readOnly: true
            # Node's /etc/localtime, so the container uses the node's zone.
            - name: localtime
              mountPath: /etc/localtime
              readOnly: true
          # Gives the server up to 180 * 10 s = 30 min to come up. Liveness
          # and readiness checks only start once this probe has succeeded,
          # so a slow model load no longer gets the container killed.
          # Assumes /health answers only after the server is up - verify.
          startupProbe:
            httpGet:
              path: /health
              port: 30000
            periodSeconds: 10
            failureThreshold: 180
          # Keeps the pod out of the Service endpoints while /health fails.
          readinessProbe:
            httpGet:
              path: /health
              port: 30000
            periodSeconds: 10
          # Restarts the container when /health stops answering.
          livenessProbe:
            httpGet:
              path: /health
              port: 30000
            initialDelaySeconds: 30
            periodSeconds: 10
      volumes:
        - name: hf-cache
          hostPath:
            path: /root/.cache/huggingface
            # Directory: the path must already exist on the node.
            type: Directory
        - name: localtime
          hostPath:
            path: /etc/localtime
            type: File
---
# Service: load-balanced entry point for the pods labelled
# `app: meta-llama-31-8b-instruct-sglang`.
apiVersion: v1
kind: Service
metadata:
  name: meta-llama-31-8b-instruct-sglang
spec:
  type: LoadBalancer
  # Traffic is routed to every pod carrying this label.
  selector:
    app: meta-llama-31-8b-instruct-sglang
  ports:
    - protocol: TCP
      # Port the Service itself listens on.
      port: 30000
      # Container port the traffic is forwarded to.
      targetPort: 30000