forked from parallelworks/pw-cluster-automation
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstartClusters.py
executable file
·185 lines (139 loc) · 5.66 KB
/
startClusters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#!/usr/bin/env python3
"""
This script automatically connects to the Parallel Works gateway using the
user's API key, starts the clusters named in the first command-line argument
(a comma-separated list), waits for them to come up, and records their
addresses.

Critical files that must exist:
  $HOME/.ssh/pw_api.key - this file must contain the API key on its first and
                          only line (the PW_API_KEY environment variable, if
                          set, takes precedence over this file). Treat this
                          file as any secure file and place it in the .ssh
                          directory. Change permissions to mode 600.

New files created:
  $HOME/.hosts - this is a new file created every time the script is run.
                 Do not modify this file externally as any change will be
                 lost. If no active clusters exist, the file will be created
                 with one commented line. For the hosts to be recognized,
                 the HOSTALIASES environment variable must point to this
                 file (i.e. export HOSTALIASES=$HOME/.hosts).
"""
import subprocess
import json
import requests
import sys
import time
import os
from client import Client
# inputs: the platform host name is read from the environment and is mandatory
PW_PLATFORM_HOST = os.environ.get('PW_PLATFORM_HOST')
if PW_PLATFORM_HOST is None:
    print("No PW_PLATFORM_HOST environment variable found. Please set it to the Parallel Works platform host name. e.g. cloud.parallel.works")
    sys.exit(1)

# base URL passed to the API client
pw_url = f"https://{PW_PLATFORM_HOST}"
# specify the clusters to start and wait for activation
# (first command-line argument: a comma-separated list of cluster names)
if len(sys.argv) < 2:
    print("Usage:", os.path.basename(sys.argv[0]),
          "<cluster1>[,<cluster2>,...]")
    sys.exit(1)

# Strip stray whitespace, drop empty items (e.g. from a trailing comma) and
# remove duplicates while keeping the order given on the command line.
clusters_to_start = list(dict.fromkeys(
    name.strip() for name in sys.argv[1].split(',') if name.strip()))
if not clusters_to_start:
    print("No cluster names given. Please pass a comma-separated list of cluster names.")
    sys.exit(1)

print('\nStarting clusters:', clusters_to_start)
# The test ssh commands run after the clusters start require that your public
# key is added to the cluster configuration on Parallel Works.

# Per-user file locations
homedir = os.environ['HOME']
keyfile = f'{homedir}/.ssh/pw_api.key'
# The .hosts file is rewritten on every run
hostsfile = f'{homedir}/.hosts'

# First line of the user's .hosts file: a comment naming this script
cluster_hosts = ['# Generated Automatically ' + os.path.basename(__file__)]
# get my personal API key
# with the environment variable PW_API_KEY taking precedence
# over the file $HOME/.ssh/pw_api.key
api_key = os.environ.get('PW_API_KEY')
if api_key is None:
    try:
        # the key is expected on the first line of the key file
        with open(keyfile, "r") as f:
            api_key = f.readline().strip()
    except (OSError, UnicodeDecodeError):
        # a missing or unreadable key file is reported by the check below;
        # other exceptions (e.g. KeyboardInterrupt) are no longer swallowed
        api_key = None

if not api_key:
    print("No API key found. Please set the environment variable PW_API_KEY or create the file $HOME/.ssh/pw_api.key.")
    sys.exit(1)
# build the Parallel Works client from the platform URL and the API key
c = Client(pw_url, api_key)

# read the account username from the identity returned by the client
user = c.get_identity()['username']
print("\nRunning as user", user + '...')

# resources as reported by the client; searched by cluster name further down
my_clusters = c.get_resources()
# Names of the clusters that are already up and recorded in cluster_hosts.
# The list is created once, before the loop, so that clusters found running
# in earlier iterations are not forgotten when the next name is processed.
started = []

for cluster_name in clusters_to_start:
    print("\nChecking cluster status", cluster_name+"...")

    # find cluster_name among the resources fetched from the platform
    cluster = next(
        (item for item in my_clusters if item["name"] == cluster_name), None)
    if not cluster:
        print("No cluster found.")
        sys.exit(1)

    if cluster['status'] == "off":
        # resource is off: request a start; its address is collected later,
        # while waiting for the cluster to come up
        print("Starting cluster", cluster['name']+"...")
        time.sleep(0.2)
        print(c.start_resource(cluster['id']))
    else:
        # resource is not off: record "<name> <ip>" for the hosts file now
        print(cluster_name, "already running...")
        ip = cluster['controllerIp']
        entry = ' '.join([cluster['name'], ip])
        print(entry)
        cluster_hosts.append(entry)
        started.append(cluster['name'])
print("\nWaiting for", len(clusters_to_start), "cluster(s) to start...")

# last state printed per cluster, so a state is only printed when it changes
laststate = {}

# Poll the platform every 5 seconds until every requested cluster is recorded.
while True:
    current_state = c.get_resources()

    for cluster in current_state:
        name = cluster['name']
        # only consider requested clusters that are on and not yet recorded
        if name not in clusters_to_start or cluster['status'] != 'on':
            continue
        if name in started:
            continue

        state = cluster['state']
        if name not in laststate or laststate[name] != state:
            print(name, state)
            laststate[name] = state

        # the cluster is recorded once its state carries a master node value
        if 'masterNode' in state and state['masterNode'] is not None:
            ip = state['masterNode']
            entry = ' '.join([name, ip])
            print(entry)
            cluster_hosts.append(entry)
            started.append(name)

    # Compare names rather than list lengths so that a name repeated in
    # clusters_to_start cannot keep the loop waiting forever.
    if set(clusters_to_start).issubset(started):
        print('\nStarted all clusters... writing hosts file')
        break

    time.sleep(5)
# Generate the user's local .hosts file: the header comment followed by one
# "<name> <ip>" entry per line. The file is truncated and rewritten each run.
with open(hostsfile, 'w') as f:
    f.writelines(f"{line}\n" for line in cluster_hosts)

# The with-block has already closed the file, so no explicit close is needed
# and the contents are flushed before success is reported.
print('SUCCESS - the', hostsfile, 'was updated.')
# run example ssh command on each started cluster
print("\nRunning test ssh commands on the clusters...")
testcmd = "sinfo"
ssh_options = [
    "-o", "StrictHostKeyChecking=no",
    "-o", "UserKnownHostsFile=/dev/null",
]

# the first element of cluster_hosts is the comment header, not a host entry
for entry in cluster_hosts[1:]:
    name, ip = entry.split()[:2]

    # Pass an argument list instead of a shell string: user and ip come from
    # the platform API and must not be interpreted by a local shell.
    ssh_argv = ["ssh", *ssh_options, f"{user}@{ip}", testcmd]

    print("")
    print(name + ':', '"' + ' '.join(ssh_argv) + '"')

    # check_output raises CalledProcessError if ssh exits with a non-zero
    # status; stderr is merged into the captured output.
    out = subprocess.check_output(ssh_argv, stderr=subprocess.STDOUT)
    # sys.stdout.encoding can be None when stdout is replaced; fall back to
    # UTF-8 and never fail on undecodable bytes in the remote output.
    print(out.decode(sys.stdout.encoding or "utf-8", errors="replace"))