-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaccelerate_job.json
86 lines (86 loc) · 3.49 KB
/
accelerate_job.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
{
  "displayName": "brittrock_hf_laion_bs1_16_real",
  "jobSpec": {
    "workerPoolSpecs": [
      {
        "machineSpec": {
          "machineType": "a3-megagpu-8g",
          "acceleratorType": "NVIDIA_H100_MEGA_80GB",
          "acceleratorCount": 8,
          "reservationAffinity": {
            "reservationAffinityType": "SPECIFIC_RESERVATION",
            "key": "compute.googleapis.com/reservation-name",
            "values": [
              "projects/disco-sector-292704/zones/us-east4-a/reservations/snap-a3-mega-reservation"
            ]
          }
        },
        "replicaCount": "1",
        "containerSpec": {
          "imageUri": "us-east5-docker.pkg.dev/google.com/vertex-training-dlexamples/nemo-sd-training-repository/sd-accelerate_train:latest",
          "command": [
            "sh",
            "-c"
          ],
          "args": [
            "git clone https://github.com/bvrockwell/hf-multi.git && chmod +x hf-multi/set_env.sh && ./hf-multi/set_env.sh"
          ],
          "env": [
            {
              "name": "LD_LIBRARY_PATH",
              "value": "/usr/local/nvidia/lib64"
            },
            {
              "name": "NODE_COUNT",
              "value": "2"
            },
            {
              "name": "ACC_CONFIG",
              "value": "/hf-multi/2host_config.yaml"
            }
          ]
        }
      },
      {
        "machineSpec": {
          "machineType": "a3-megagpu-8g",
          "acceleratorType": "NVIDIA_H100_MEGA_80GB",
          "acceleratorCount": 8,
          "reservationAffinity": {
            "reservationAffinityType": "SPECIFIC_RESERVATION",
            "key": "compute.googleapis.com/reservation-name",
            "values": [
              "projects/disco-sector-292704/zones/us-east4-a/reservations/snap-a3-mega-reservation"
            ]
          }
        },
        "replicaCount": "1",
        "containerSpec": {
          "imageUri": "us-east5-docker.pkg.dev/google.com/vertex-training-dlexamples/nemo-sd-training-repository/sd-accelerate_train:latest",
          "command": [
            "sh",
            "-c"
          ],
          "args": [
            "git clone https://github.com/bvrockwell/hf-multi.git && chmod +x hf-multi/set_env.sh && ./hf-multi/set_env.sh"
          ],
          "env": [
            {
              "name": "LD_LIBRARY_PATH",
              "value": "/usr/local/nvidia/lib64"
            },
            {
              "name": "NODE_COUNT",
              "value": "2"
            },
            {
              "name": "ACC_CONFIG",
              "value": "/hf-multi/2host_config.yaml"
            }
          ]
        }
      }
    ],
    "enableWebAccess": true
  }
}