-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
127 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
# LSF Flux Cluster Job | ||
|
||
Use the `lsf.flux.sh` script to create a lightweight Flux job-management cluster of your own. | ||
The [Flux](https://flux-framework.readthedocs.io/en/latest/) cluster manages the resources | ||
allocated by `LSF` based on the job's resource requirements. | ||
|
||
## Usage | ||
|
||
Submit the script as an interactive job to get a shell with a Flux cluster deployed. | ||
```bash | ||
$ bsub -Is ./lsf.flux.sh | ||
``` | ||
|
||
## Example | ||
|
||
Create a Flux cluster in an `LSF` job with 2 nodes: 1 core on linux01 and 2 cores on linux02. | ||
```bash | ||
$ bsub -R '1*{affinity[core(1)]} + 1*{affinity[core(2)]}' -m 'linux01! linux02' -Is ./lsf.flux.sh | ||
Job <320> is submitted to default queue <interactive>. | ||
<<Waiting for dispatch ...>> | ||
<<Starting on linux01>> | ||
[info] generating Flux key for communication. | ||
[info] preparing configuration file for the Flux cluster. | ||
[info] starting Flux brokers on nodes linux02. | ||
[info] the Flux cluster with multiple nodes is starting in the job <320>. | ||
lsfadmin@linux01:~/shared/test$ flux resource list | ||
STATE NNODES NCORES NGPUS NODELIST | ||
free 2 3 0 linux01,linux02 | ||
allocated 0 0 0 | ||
down 0 0 0 | ||
lsfadmin@linux01:~/shared/test$ flux mini submit sleep 9999 | ||
ƒ27fJYEVm | ||
lsfadmin@linux01:~/shared/test$ flux mini submit sleep 9999 | ||
ƒ28LWmEnj | ||
lsfadmin@linux01:~/shared/test$ flux mini submit sleep 9999 | ||
ƒ28s2vTGT | ||
lsfadmin@linux01:~/shared/test$ flux mini submit sleep 9999 | ||
ƒ29MwgSzT | ||
lsfadmin@linux01:~/shared/test$ flux jobs | ||
JOBID USER NAME ST NTASKS NNODES RUNTIME NODELIST | ||
ƒ29MwgSzT lsfadmin sleep PD 1 - - - | ||
ƒ28s2vTGT lsfadmin sleep R 1 1 5.914s linux02 | ||
ƒ28LWmEnj lsfadmin sleep R 1 1 7.110s linux01 | ||
ƒ27fJYEVm lsfadmin sleep R 1 1 8.634s linux02 | ||
``` | ||
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
#!/bin/bash | ||
|
||
# Copyright International Business Machines Corp, 2021 | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
################################################################################ | ||
# | ||
# Configure the location of a shared directory. | ||
# It stores the Flux configuration files that are accessed by all Flux member nodes. | ||
################################################################################ | ||
|
||
# Shared directory that holds the per-job Flux configuration files.
# A non-empty LSF_FLUX_SHARE_DIR already present in the environment takes
# precedence; otherwise the default ${HOME}/shared/ is used.
LSF_FLUX_SHARE_DIR=${LSF_FLUX_SHARE_DIR:-${HOME}/shared/}

# LSB_JOBID names the per-job directory further down in this script. Only warn
# when it is missing (on stderr, so the notice is not mixed into regular
# output); the script keeps going as before.
if [[ -z "${LSB_JOBID:-}" ]]; then
  echo "[warning] this script should be run as a job in LSF." >&2
fi
|
||
#######################################
# Print the worker hosts of an allocation on one line, separated by spaces.
# The input lists one host name per allocated slot. Repeated names are
# collapsed wherever they occur (not only when adjacent), the original order
# is kept, and the first distinct host is dropped.
# Arguments: $1 - whitespace-separated host names (the LSB_HOSTS format)
# Outputs:   the remaining host names on stdout (empty when there are none)
#######################################
lsf_flux_worker_hosts() {
  tr -s '[:space:]' '\n' <<< "$1" \
    | awk 'NF && !seen[$0]++' \
    | sed 1d \
    | xargs
}

# NOTE(review): dropping the first host assumes it is the host this script
# runs on -- confirm against the LSF documentation for LSB_HOSTS.
WORKERS=$(lsf_flux_worker_hosts "${LSB_HOSTS:-}")
# No worker host means the whole allocation lives on this node: start a
# stand-alone Flux instance and end the script once it returns.
if [[ -z "${WORKERS}" ]]; then
  printf '%s\n' "[info] the Flux cluster with a single node is starting."
  flux start

  printf '%s\n' "[info] bye!"
  exit
fi
|
||
# A Flux cluster spanning several hosts: each job gets its own directory
# below the shared location, named after the LSF job ID.
LSF_FLUX_TOP=${LSF_FLUX_SHARE_DIR}/.lsf.flux/${LSB_JOBID}

echo "[info] generating Flux key for communication."
# Stop early when the directory or the key cannot be created: the
# configuration written afterwards points curve_cert at this key file.
if ! mkdir -p -- "${LSF_FLUX_TOP}"; then
  echo "[error] cannot create directory ${LSF_FLUX_TOP}." >&2
  exit 1
fi
if ! flux keygen "${LSF_FLUX_TOP}/key"; then
  echo "[error] cannot generate the Flux key ${LSF_FLUX_TOP}/key." >&2
  exit 1
fi
|
||
|
||
#######################################
# Print the Flux bootstrap configuration for the cluster.
# Arguments: $1    - path of the key file stored as curve_cert
#            $2    - name of the first host, which binds to TCP port 9001
#            $3... - names of the remaining hosts, one per argument
# Outputs:   the TOML document on stdout
#######################################
lsf_flux_cluster_config() {
  local cert=$1 head=$2 worker
  shift 2

  printf '[bootstrap]\n'
  printf 'curve_cert="%s"\n' "${cert}"
  printf 'hosts = [\n'
  printf '    { host = "%s", bind = "tcp://0.0.0.0:9001", connect = "tcp://%s:9001"},\n' \
    "${head}" "${head}"
  # NOTE(review): only the first host gets bind/connect URIs -- confirm this is
  # enough for the Flux overlay topology in use when many hosts take part.
  for worker in "$@"; do
    printf '    { host = "%s" },\n' "${worker}"
  done
  # end of the configure file
  printf ']\n'
}

echo "[info] preparing configuration file for the Flux cluster."
mkdir -p -- "${LSF_FLUX_TOP}/config/"

# add computing nodes to the flux cluster
IFS=' ' read -ra HOSTS <<< "${WORKERS}"

# The whole document is produced by one command and written with a single
# redirect instead of a truncating write followed by several appends.
lsf_flux_cluster_config "${LSF_FLUX_TOP}/key" "$(hostname)" "${HOSTS[@]}" \
  > "${LSF_FLUX_TOP}/config/cluster.toml"
|
||
|
||
#######################################
# Remove the per-job directory created for this cluster.
# Globals: LSF_FLUX_TOP (read), LSB_JOBID (read)
# Outputs: progress message on stdout, warning on stderr
#######################################
lsf_flux_cleanup() {
  echo "[info] cleaning generated files"
  # LSF_FLUX_TOP ends in the job ID; with an empty job ID it is the parent
  # directory shared by every job, so it is left alone.
  if [[ -z "${LSB_JOBID:-}" ]]; then
    echo "[warning] LSB_JOBID is empty; not removing ${LSF_FLUX_TOP:-}." >&2
    return 0
  fi
  # ':?' aborts instead of running rm with an empty path.
  rm -rf -- "${LSF_FLUX_TOP:?}"
}

# Run the cleanup as well when the script ends before reaching its last lines.
trap lsf_flux_cleanup EXIT

echo "[info] starting Flux brokers on nodes ${WORKERS}."
# A blaunch failure is reported but does not stop the script, so the local
# broker is still started as before.
if ! blaunch -no-wait -z "${WORKERS}" flux broker -c "${LSF_FLUX_TOP}/config/"; then
  echo "[warning] blaunch reported a failure while starting the remote brokers." >&2
fi

echo "[info] the Flux cluster with multiple nodes is starting in the job <${LSB_JOBID}>."
flux broker -c "${LSF_FLUX_TOP}/config/"

# Normal path: clean up now and disarm the trap so it does not run twice.
trap - EXIT
lsf_flux_cleanup

echo "[info] bye!"
|