# A unique identifier for the head node and workers of this cluster.
cluster_name: default

# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0

# The initial number of worker nodes to launch in addition to the head
# node. When the cluster is first brought up (or when it is refreshed with a
# subsequent `ray up`) this number of nodes will be started.
initial_workers: 0

# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 8

# The autoscaler will scale up the cluster to this target fraction of resource
# usage. For example, if a cluster of 10 nodes is 100% busy and
# target_utilization is 0.8, it would resize the cluster to 13. This fraction
# can be decreased to increase the aggressiveness of upscaling.
# This value must be less than 1.0 for scaling to happen.
target_utilization_fraction: 0.8
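# (Working through the example above: ceil(10 / 0.8) = 13 nodes.)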
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Cloud-provider specific configuration.
provider:
    type: gcp
    region: us-east1
    availability_zone: us-east1-b
    project_id: mlwork-216319
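# A hedged convenience step (assumes the Google Cloud SDK is installed and
# authenticated): point gcloud at the same project before running `ray up`.
#   gcloud config set project mlwork-216319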
# How Ray will authenticate with newly launched nodes.
auth:
    ssh_user: abisulco
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below. This requires that you have added the key into the
# project wide meta-data.
#    ssh_private_key: /path/to/your/key.pem
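# A minimal sketch of bringing your own key (the key path and comment below
# are illustrative, not part of this config):
#   ssh-keygen -t rsa -b 4096 -f ~/.ssh/ray-gcp.pem -C abisulco
# The public half must then be added to the project-wide SSH keys
# (Compute Engine -> Metadata -> SSH Keys in the GCP console).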
# Provider-specific config for the head node, e.g. instance type. For the
# full list of available fields, see
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
head_node:
    machineType: n1-standard-16
    guestAccelerators:
      - acceleratorType: projects/mlwork-216319/zones/us-east1-b/acceleratorTypes/nvidia-tesla-p100
        acceleratorCount: 1
    disks:
      - boot: true
        autoDelete: true
        type: PERSISTENT
        initializeParams:
          diskSizeGb: 50
          sourceImage: projects/mlwork-216319/global/images/head-rl-img2 # Ubuntu
    # GPU instances cannot be live-migrated, so host maintenance must be
    # set to TERMINATE.
    scheduling:
      - preemptible: false
        onHostMaintenance: TERMINATE
        automaticRestart: true
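# One way to confirm the accelerator type is offered in the chosen zone
# (assumes the Cloud SDK; the filter value is just this config's zone):
#   gcloud compute accelerator-types list --filter="zone:us-east1-b"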
# Additional options can be found in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
worker_nodes:
    machineType: n1-standard-16
    disks:
      - boot: true
        autoDelete: true
        type: PERSISTENT
        initializeParams:
          diskSizeGb: 50
          # See https://cloud.google.com/compute/docs/images for more images
          sourceImage: projects/mlwork-216319/global/images/worker-rl-img2 # Ubuntu
    # Run workers on preemptible instances by default.
    # Comment this out to use on-demand.
    scheduling:
      - preemptible: true
# List of shell commands to run to set up nodes.
setup_commands:
    - sudo apt update

# Custom commands that will be run on the head node after common setup.
head_setup_commands: []

# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
    - ray stop
    - >-
        ulimit -n 65536;
        ray start
        --head
        --redis-port=6379
        --object-manager-port=8076
        --autoscaling-config=~/ray_bootstrap_config.yaml
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
    - ray stop
    - >-
        ulimit -n 65536;
        ray start
        --redis-address=$RAY_HEAD_IP:6379
        --object-manager-port=8076
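
# Typical workflow with this file, using the standard Ray cluster launcher
# commands:
#   ray up ray-cluster.yaml      # create or update the cluster
#   ray attach ray-cluster.yaml  # SSH into the head node
#   ray down ray-cluster.yaml    # tear the cluster down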