Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fixed various install issues #2

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
openrc.sh
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ To build your own Virtual cluster, starting on your localhost:
the NodeName and PartitionName line.
* If you'd like to change the default node size, the ```node_size=``` line
in ```slurm_resume.sh``` must be changed.
This should take values corresponding to instance sizes in Jetstream, like
"m1.small" or "m1.large". Be sure to edit the ```slurm.conf``` file to
reflect the number of CPUs available.
* If you'd like to enable any specific software, you should edit
```compute_build_base_img.yml```. The task named "install basic packages"
can be easily extended to install anything available from a yum
Expand Down
37 changes: 30 additions & 7 deletions compute_build_base_img.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
- hosts: localhost

vars:
compute_base_image: "JS-API-Featured-CentOS7-May-20-2019"
sec_group_global: "{{ clouds.tacc.auth.username }}-global-ssh"
sec_group_internal: "{{ clouds.tacc.auth.username }}-cluster-internal"
compute_base_image: "JS-API-Featured-CentOS7-Sep-09-2019"
sec_group_global: "{{ clouds.tacc.cluster_name }}-global-ssh"
sec_group_internal: "{{ clouds.tacc.cluster_name }}-cluster-internal"
compute_base_size: "m1.small"
network_name: "{{ clouds.tacc.auth.username }}-elastic-net"
JS_ssh_keyname: "{{ clouds.tacc.auth.username }}-{{ clouds.tacc.auth.project_name }}-slurm-key"
network_name: "{{ clouds.tacc.cluster_name }}-elastic-net"
JS_ssh_keyname: "{{ clouds.tacc.cluster_name }}-{{ clouds.tacc.auth.project_name }}-slurm-key"

vars_files:
- clouds.yaml
Expand All @@ -19,7 +19,7 @@
os_server:
timeout: 300
state: present
name: "compute-{{ clouds.tacc.auth.username }}-base-instance"
name: "compute-{{ clouds.tacc.cluster_name }}-base-instance"
cloud: "tacc"
image: "{{ compute_base_image }}"
key_name: "{{ JS_ssh_keyname }}"
Expand Down Expand Up @@ -63,6 +63,13 @@
- "openmpi-gnu-ohpc"
- "ohpc-slurm-client"
- "lmod-ohpc"
- "git"
- "bind-utils"
- "gcc"
- "wget"
- "jq"
- "nodejs"
- "singularity"

tasks:

Expand Down Expand Up @@ -96,6 +103,12 @@
state: present
lock_timeout: 300

- name: enable hostfs mount on singularity
replace:
path: /etc/singularity/singularity.conf
regexp: "^(mount hostfs = no)$"
replace: "mount hostfs = yes"

- name: fix slurm user uid
user:
name: slurm
Expand Down Expand Up @@ -191,6 +204,10 @@
- name: enable slurmd
service: name=slurmd enabled=yes

- name: create local user called "user"
user:
name: user

#cat /etc/systemd/system/multi-user.target.wants/slurmd.service
#[Unit]
#Description=Slurm node daemon
Expand Down Expand Up @@ -230,17 +247,23 @@
tasks:

# Resolved merge conflict: keep the HEAD variant. compute_take_snapshot.sh now
# receives the instance name as its first argument and sources openrc.sh
# itself, so the expect/password-prompt variant from the other branch is
# obsolete and has been dropped.
- name: create compute instance snapshot
  script: ./compute_take_snapshot.sh "compute-{{ clouds.tacc.cluster_name }}-base-instance"

- name: remove compute instance
os_server:
timeout: 200
state: absent
name: "compute-{{ clouds.tacc.auth.username }}-base-instance"
name: "compute-{{ clouds.tacc.cluster_name }}-base-instance"
cloud: "tacc"


7 changes: 5 additions & 2 deletions compute_take_snapshot.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@

source openrc.sh

compute_image="${OS_USERNAME}-compute-image-$(date +%m-%d-%Y)"
compute_instance="compute-${OS_USERNAME}-base-instance"
# The headnode's short hostname is used as the cluster name when tagging the
# snapshot image.
cluster_name=$(hostname -s)

# Instance to snapshot is passed in by the caller
# (see compute_build_base_img.yml); fail fast if it is missing instead of
# letting `openstack server stop` run with an empty name.
compute_instance=$1
if [[ -z "${compute_instance}" ]]; then
    echo "Usage: $0 <instance-name>" >&2
    exit 1
fi
compute_image="${cluster_name}-compute-image-$(date +%m-%d-%Y)"

openstack server stop "${compute_instance}"

Expand Down
83 changes: 35 additions & 48 deletions headnode_create.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,37 +9,38 @@ if [[ -z "$1" ]]; then
echo "NO SERVER NAME GIVEN! Please re-run with ./headnode_create.sh <server-name>"
exit
fi
server_name=$1

if [[ ! -e ${HOME}/.ssh/id_rsa.pub ]]; then
#This may be temporary... but seems fairly reasonable.
echo "NO KEY FOUND IN ${HOME}/.ssh/id_rsa.pub! - please create one and re-run!"
exit
fi

server_name=$1
source ./openrc.sh

# Defining a function here to check for quotas, and exit if this script will cause problems!
# also, storing 'quotas' in a global var, so we're not calling it every single time
quotas=$(openstack quota show)
quota_check ()
{
quota_name=$1
type_name=$2 #the name for a quota and the name for the thing itself are not the same
number_created=$3 #number of the thing that we'll create here.
# Check that creating ${number_created} more objects of ${type_name} will not
# exceed the quota row ${quota_name} reported by `openstack quota show`
# (cached in the global $quotas so we don't re-query per check).
# Returns 0 when there is room, 1 otherwise; callers run under `set -e`, so a
# failed check aborts the script.
quota_check () {
    quota_name=$1      # row label in `openstack quota show` output
    type_name=$2       # object name used by `openstack <type> list`
    number_created=$3  # how many of the thing this script is about to create

    current_num=$(openstack ${type_name} list -f value | wc -l)

    max_types=$(echo "${quotas}" | awk -v quota="${quota_name}" '$0 ~ quota {print $4}')

    # Fixed comparison: the old test `current < max + created` still passed
    # when usage was already at the quota ceiling. Room exists only when
    # current + created fits under the maximum. Also guard against an empty
    # $max_types (quota row not found) so the arithmetic doesn't blow up.
    if [[ -n "${max_types}" ]] && (( current_num + number_created <= max_types )); then
        return 0
    fi
    echo "quota check failed for ${quota_name}: have ${current_num}, want +${number_created}, max '${max_types}'" >&2
    return 1
}

set -x #show use which commands are executed
set -e #terminate as soon as any command fails

quota_check "secgroups" "security group" 1
quota_check "networks" "network" 1
Expand All @@ -49,28 +50,27 @@ quota_check "key-pairs" "keypair" 1
quota_check "instances" "server" 1

# Ensure that the correct private network/router/subnet exists
if [[ -z "$(openstack network list | grep ${OS_USERNAME}-elastic-net)" ]]; then
openstack network create ${OS_USERNAME}-elastic-net
openstack subnet create --network ${OS_USERNAME}-elastic-net --subnet-range 10.0.0.0/24 ${OS_USERNAME}-elastic-subnet1
if [[ -z "$(openstack network list | grep ${server_name}-elastic-net)" ]]; then
openstack network create ${server_name}-elastic-net
openstack subnet create --network ${server_name}-elastic-net --subnet-range 10.0.0.0/24 ${server_name}-elastic-subnet1
fi
##openstack subnet list
if [[ -z "$(openstack router list | grep ${OS_USERNAME}-elastic-router)" ]]; then
openstack router create ${OS_USERNAME}-elastic-router
openstack router add subnet ${OS_USERNAME}-elastic-router ${OS_USERNAME}-elastic-subnet1
openstack router set --external-gateway public ${OS_USERNAME}-elastic-router
if [[ -z "$(openstack router list | grep ${server_name}-elastic-router)" ]]; then
openstack router create ${server_name}-elastic-router
openstack router add subnet ${server_name}-elastic-router ${server_name}-elastic-subnet1
openstack router set --external-gateway public ${server_name}-elastic-router
fi
#openstack router show ${OS_USERNAME}-api-router

security_groups=$(openstack security group list -f value)
if [[ ! ("${security_groups}" =~ "${OS_USERNAME}-global-ssh") ]]; then
openstack security group create --description "ssh \& icmp enabled" ${OS_USERNAME}-global-ssh
openstack security group rule create --protocol tcp --dst-port 22:22 --remote-ip 0.0.0.0/0 ${OS_USERNAME}-global-ssh
openstack security group rule create --protocol icmp ${OS_USERNAME}-global-ssh
# Create the per-cluster SSH/ICMP security group if it doesn't exist yet.
if [[ ! ("${security_groups}" =~ "${server_name}-global-ssh") ]]; then
    # Note: the previous "ssh \& icmp enabled" kept a literal backslash in the
    # description — '&' needs no escaping inside double quotes.
    openstack security group create --description "ssh & icmp enabled" "${server_name}-global-ssh"
    openstack security group rule create --protocol tcp --dst-port 22:22 --remote-ip 0.0.0.0/0 "${server_name}-global-ssh"
    openstack security group rule create --protocol icmp "${server_name}-global-ssh"
fi
if [[ ! ("${security_groups}" =~ "${OS_USERNAME}-cluster-internal") ]]; then
openstack security group create --description "internal group for cluster" ${OS_USERNAME}-cluster-internal
openstack security group rule create --protocol tcp --dst-port 1:65535 --remote-ip 10.0.0.0/0 ${OS_USERNAME}-cluster-internal
openstack security group rule create --protocol icmp ${OS_USERNAME}-cluster-internal
# Create the intra-cluster security group if it doesn't exist yet.
if [[ ! ("${security_groups}" =~ "${server_name}-cluster-internal") ]]; then
    openstack security group create --description "internal group for cluster" "${server_name}-cluster-internal"
    # Fixed CIDR: 10.0.0.0/0 is a /0 mask (matches ALL addresses, opening every
    # TCP port to the world). Restrict to the /24 subnet actually created for
    # this cluster above (--subnet-range 10.0.0.0/24).
    openstack security group rule create --protocol tcp --dst-port 1:65535 --remote-ip 10.0.0.0/24 "${server_name}-cluster-internal"
    openstack security group rule create --protocol icmp "${server_name}-cluster-internal"
fi

#Check if ${HOME}/.ssh/id_rsa.pub exists in JS
Expand All @@ -79,33 +79,19 @@ if [[ -e ${HOME}/.ssh/id_rsa.pub ]]; then
fi
openstack_keys=$(openstack keypair list -f value)

home_key_in_OS=$(echo "${openstack_keys}" | awk -v mykey=${home_key_fingerprint} '$2 ~ mykey {print $1}')
home_key_in_OS=$(echo "${openstack_keys}" | awk -v mykey="${home_key_fingerprint}" '$2 ~ mykey {print $1}')

if [[ -n "${home_key_in_OS}" ]]; then
OS_keyname=${home_key_in_OS}
elif [[ -n $(echo "${openstack_keys}" | grep ${OS_USERNAME}-elastic-key) ]]; then
openstack keypair delete ${OS_USERNAME}-elastic-key
# This doesn't need to depend on the OS_PROJECT_NAME, as the slurm-key does, in install.sh and slurm_resume
openstack keypair create --public-key ${HOME}/.ssh/id_rsa.pub ${OS_USERNAME}-elastic-key
OS_keyname=${OS_USERNAME}-elastic-key
else
elif [[ -n $(echo "${openstack_keys}" | grep ${server_name}-elastic-key) ]]; then
openstack keypair delete ${server_name}-elastic-key
# This doesn't need to depend on the OS_PROJECT_NAME, as the slurm-key does, in install.sh and slurm_resume
openstack keypair create --public-key ${HOME}/.ssh/id_rsa.pub ${OS_USERNAME}-elastic-key
OS_keyname=${OS_USERNAME}-elastic-key
openstack keypair create --public-key ${HOME}/.ssh/id_rsa.pub ${server_name}-elastic-key
OS_keyname=${server_name}-elastic-key
fi

centos_base_image=$(openstack image list --status active | grep -iE "API-Featured-centos7-[[:alpha:]]{3,4}-[0-9]{2}-[0-9]{4}" | awk '{print $4}' | tail -n 1)

echo -e "openstack server create\
--user-data prevent-updates.ci \
--flavor m1.small \
--image ${centos_base_image} \
--key-name ${OS_keyname} \
--security-group ${OS_USERNAME}-global-ssh \
--security-group ${OS_USERNAME}-cluster-internal \
--nic net-id=${OS_USERNAME}-elastic-net \
${server_name}"

openstack server create \
--user-data prevent-updates.ci \
--flavor m1.small \
Expand All @@ -117,6 +103,7 @@ openstack server create \
${server_name}

public_ip=$(openstack floating ip create public | awk '/floating_ip_address/ {print $4}')

#For some reason there's a time issue here - adding a sleep command to allow network to become ready
sleep 10
openstack server add floating ip ${server_name} ${public_ip}
Expand Down
Loading