Skip to content

Commit

Permalink
Let AWS Databricks automatically choose an Availability Zone (#11714)
Browse files Browse the repository at this point in the history
To address the AWS Databricks failure "InsufficientInstanceCapacity":
    Could not complete request as AWS does not currently have enough available capacity to fulfill your request for instance type g5.4xlarge.

    Our system will be working on provisioning additional capacity. You can currently get g5.4xlarge capacity by not specifying an Availability Zone in your request or choosing us-west-2a, us-west-2b.

Also, print the cluster ID to stdout so that a calling script can easily access it.

Signed-off-by: timl <[email protected]>
  • Loading branch information
NvTimLiu authored Nov 12, 2024
1 parent 894b636 commit 862dab0
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 3 deletions.
4 changes: 3 additions & 1 deletion jenkins/databricks/clusterutils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -85,6 +85,8 @@ def wait_for_cluster_start(workspace, clusterid, token, retries=20, printLoc=sys
if current_state in ['INTERNAL_ERROR', 'SKIPPED', 'TERMINATED'] or p >= 60:
if p >= retries:
print("Waited %d times already, stopping" % p)
# Output the cluster ID to stdout so a calling script can get it easily
print(clusterid, file=sys.stdout)
sys.exit(4)
p = p + 1
print("Done starting cluster", file=printLoc)
Expand Down
4 changes: 2 additions & 2 deletions jenkins/databricks/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,14 @@ def main():
sshkey = ''
cluster_name = 'CI-GPU-databricks-24.12.0-SNAPSHOT'
idletime = 240
runtime = '7.0.x-gpu-ml-scala2.12'
runtime = '13.3.x-gpu-ml-scala2.12'
num_workers = 1
worker_type = 'g4dn.xlarge'
driver_type = 'g4dn.xlarge'
cloud_provider = 'aws'
# comma separated init scripts in Databricks workspace, e.g. /foo,/bar,...
init_scripts = ''
aws_zone='us-west-2c'
aws_zone='auto'


try:
Expand Down

0 comments on commit 862dab0

Please sign in to comment.