# A unique identifier for the head node and workers of this cluster.
cluster_name: default

# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0

# The initial number of worker nodes to launch in addition to the head
# node. When the cluster is first brought up (or when it is refreshed with a
# subsequent `ray up`) this number of nodes will be started.
initial_workers: 0

# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 8

# The autoscaler will scale up the cluster to this target fraction of resource
# usage. For example, if a cluster of 10 nodes is 100% busy and
# target_utilization is 0.8, it would resize the cluster to 13. This fraction
# can be decreased to increase the aggressiveness of upscaling.
# This value must be less than 1.0 for scaling to happen.
target_utilization_fraction: 0.8
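# (Working through the example above: ceil(10 / 0.8) = 13 nodes.)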
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Cloud-provider specific configuration.
provider:
    type: gcp
    region: us-east1
    availability_zone: us-east1-b
    project_id: mlwork-216319
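# A hedged convenience step (assumes the Google Cloud SDK is installed and
# authenticated): point gcloud at the same project before running `ray up`.
#   gcloud config set project mlwork-216319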
# How Ray will authenticate with newly launched nodes.
auth:
    ssh_user: abisulco
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below. This requires that you have added the key into the
# project wide meta-data.
#    ssh_private_key: /path/to/your/key.pem
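# A minimal sketch of bringing your own key (the key path and comment below
# are illustrative, not part of this config):
#   ssh-keygen -t rsa -b 4096 -f ~/.ssh/ray-gcp.pem -C abisulco
# The public half must then be added to the project-wide SSH keys
# (Compute Engine -> Metadata -> SSH Keys in the GCP console).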
# Provider-specific config for the head node, e.g. instance type. For the
# full list of available fields, see
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
head_node:
    machineType: n1-standard-16
    guestAccelerators:
      - acceleratorType: projects/mlwork-216319/zones/us-east1-b/acceleratorTypes/nvidia-tesla-p100
        acceleratorCount: 1
    disks:
      - boot: true
        autoDelete: true
        type: PERSISTENT
        initializeParams:
          diskSizeGb: 50
          sourceImage: projects/mlwork-216319/global/images/head-rl-img2 # Ubuntu
    # GPU instances cannot be live-migrated, so host maintenance must be
    # set to TERMINATE.
    scheduling:
      - preemptible: false
        onHostMaintenance: TERMINATE
        automaticRestart: true
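# One way to confirm the accelerator type is offered in the chosen zone
# (assumes the Cloud SDK; the filter value is just this config's zone):
#   gcloud compute accelerator-types list --filter="zone:us-east1-b"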
# Additional options can be found in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
worker_nodes:
    machineType: n1-standard-16
    disks:
      - boot: true
        autoDelete: true
        type: PERSISTENT
        initializeParams:
          diskSizeGb: 50
          # See https://cloud.google.com/compute/docs/images for more images
          sourceImage: projects/mlwork-216319/global/images/worker-rl-img2 # Ubuntu
    # Run workers on preemptible instances by default.
    # Comment this out to use on-demand.
    scheduling:
      - preemptible: true
# List of shell commands to run to set up nodes.
setup_commands:
    - sudo apt update

# Custom commands that will be run on the head node after common setup.
head_setup_commands: []

# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
    - ray stop
    - >-
        ulimit -n 65536;
        ray start
        --head
        --redis-port=6379
        --object-manager-port=8076
        --autoscaling-config=~/ray_bootstrap_config.yaml
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
    - ray stop
    - >-
        ulimit -n 65536;
        ray start
        --redis-address=$RAY_HEAD_IP:6379
        --object-manager-port=8076
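
# Typical workflow with this file, using the standard Ray cluster launcher
# commands:
#   ray up ray-cluster.yaml      # create or update the cluster
#   ray attach ray-cluster.yaml  # SSH into the head node
#   ray down ray-cluster.yaml    # tear the cluster down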