diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000..f5707ea --- /dev/null +++ b/tests/README.md @@ -0,0 +1,31 @@ +# Tests for the datalad Slurm extension + +The following test scripts can be executed manually and should run correctly or produce errors that should be handled as errors. + +Since it needs to work on datalad repositories which are also git repositories, and because a working Slurm environment is required, this is not (yet) part of automated CI tests ... let's see later if this would be feasible via git CI anyway. + + + +## In general + +Each test should be run as: + +`./test_x.sh TARGET_DIR`, where `TARGET_DIR` is some (temporary) directory to store the test results. + +All tests will create their own temporary datalad repo inside `TARGET_DIR` and work inside that. They can be removed afterwards with `chmod -R u+w datalad-slurm-test*/; rm -Rf datalad-slurm-test*/` + +The `slurm_test*.template.sh` files need to be modified to match the local slurm environment. + +## Test 01 + +Test creating many job dirs with job scripts in them, then `datalad schedule` and run all jobs, wait until all run through, then `datalad finish` all jobs. + +This should run without any errors. + +## Test 02 + +Test creating many job dirs with job scripts in them like in Test 01. However, they have conflicting output directories so datalad should refuse to schedule some of them. + +This should produce some errors by datalad: +* The first bunch of jobs should run fine including a clean `datalad finish` +* The second bunch of jobs should not get scheduled because datalad sees the conflict and refuses to schedule them. 
diff --git a/tests/slurm_test01.template.sh b/tests/slurm_test01.template.sh
new file mode 100644
index 0000000..80397e6
--- /dev/null
+++ b/tests/slurm_test01.template.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+#SBATCH --job-name="DLtest01" # name of the job
+#SBATCH --partition=casus_genoa # partition to be used (defq, gpu or intel)
+#SBATCH -A casus
+#SBATCH --time=0:05:00 # walltime (up to 96 hours)
+#SBATCH --ntasks=1 # number of tasks
+#SBATCH --cpus-per-task=1 # number of CPUs per task
+#SBATCH --output=log.slurm-%j.out
+
+# Dummy job for the datalad Slurm tests: appends the numbers 1..50 to a
+# time-stamped text file (one per second), then writes a bzip2 copy of it.
+
+echo "started"
+
+OUTPUT="output_test_$(date -Is | tr -d ':').txt"
+
+# simulate some text output
+for i in {1..50}; do
+    echo "$i" | tee -a "$OUTPUT"
+    sleep 1
+done
+
+# simulate some binary output which will become an annex file
+bzip2 -k "$OUTPUT"
+
+echo "ended"
diff --git a/tests/test_01_many_jobs.sh b/tests/test_01_many_jobs.sh
new file mode 100755
index 0000000..5ac18dc
--- /dev/null
+++ b/tests/test_01_many_jobs.sh
@@ -0,0 +1,72 @@
+#!/usr/bin/env bash
+
+set -e # abort on errors
+
+# Test datalad 'schedule' and 'finish' functionality
+# - create some job dirs and job scripts and 'commit' them
+# - then 'datalad schedule' all jobs from their job dirs
+# - wait until all of them are finished, then run 'datalad finish'
+#
+# Usage: test_01_many_jobs.sh TARGET_DIR
+#   TARGET_DIR  directory in which the temporary test repo gets created
+#
+# Expected results: should run without any errors
+
+if [[ -z "${1:-}" ]]; then
+    echo "no temporary directory for tests given, abort" >&2
+    echo "" >&2
+    echo "... call as $0 TARGET_DIR" >&2
+    exit 1
+fi
+
+D=$1
+
+echo "start"
+
+B=$(dirname -- "$0")
+
+echo "from src dir $B"
+
+## create a test repo
+
+TESTDIR="$D/datalad-slurm-test-01_$(date -Is | tr -d ':')"
+
+datalad create -c text2git "$TESTDIR"
+
+
+### generic part for all the tests ending here, specific parts follow ###
+
+
+cp -- "$B/slurm_test01.template.sh" "$TESTDIR/"
+cd -- "$TESTDIR"
+
+TARGETS=({17..21})
+
+for i in "${TARGETS[@]}"; do
+    DIR="test_01_output_dir_$i"
+    mkdir -p -- "$DIR"
+    cp -- slurm_test01.template.sh "$DIR/slurm_test01.sh"
+done
+
+datalad save -m "add test job dirs and scripts"
+
+for i in "${TARGETS[@]}"; do
+    DIR="test_01_output_dir_$i"
+    # subshell keeps the directory change local to this one job
+    (cd -- "$DIR" && datalad schedule -o "$PWD" sbatch slurm_test01.sh)
+done
+
+# poll the queue until no job named DLtest01 is listed for this user
+while squeue -u "$USER" | grep -q "DLtest01"; do
+    echo " ... wait for jobs to finish"
+    sleep 60
+done
+
+datalad finish --list-open-jobs
+
+echo "finishing completed jobs:"
+datalad finish
+
+echo " ### git log in this repo ### "
+echo ""
+git log
diff --git a/tests/test_02_conflicting_jobs.sh b/tests/test_02_conflicting_jobs.sh
new file mode 100755
index 0000000..667eba0
--- /dev/null
+++ b/tests/test_02_conflicting_jobs.sh
@@ -0,0 +1,89 @@
+#!/usr/bin/env bash
+
+set +e # do NOT abort on errors
+
+# Test datalad 'schedule' and 'finish' functionality
+# - create some job dirs and job scripts and 'commit' them
+# - then 'datalad schedule' all jobs from their job dirs
+# - then 'datalad schedule' more jobs from the same set of job dirs
+# - wait until all of them are finished, then run 'datalad finish'
+#
+# Usage: test_02_conflicting_jobs.sh TARGET_DIR
+#   TARGET_DIR  directory in which the temporary test repo gets created
+#
+# Expected results: should handle the first set of jobs fine until the end,
+# but refuse to schedule the second set of jobs
+
+if [[ -z "${1:-}" ]]; then
+    echo "no temporary directory for tests given, abort" >&2
+    echo "" >&2
+    echo "... call as $0 TARGET_DIR" >&2
+    exit 1
+fi
+
+D=$1
+
+echo "start"
+
+B=$(dirname -- "$0")
+
+echo "from src dir $B"
+
+## create a test repo
+
+TESTDIR="$D/datalad-slurm-test-02_$(date -Is | tr -d ':')"
+
+# Errors are tolerated further down (set +e), but not during setup: without
+# the test repo the remaining datalad/git commands would run in the caller's cwd.
+datalad create -c text2git "$TESTDIR" || exit 1
+
+
+### generic part for all the tests ending here, specific parts follow ###
+
+
+cp -- "$B/slurm_test01.template.sh" "$TESTDIR/" || exit 1
+cd -- "$TESTDIR" || exit 1
+
+TARGETS=({17..21})
+
+for i in "${TARGETS[@]}"; do
+    DIR="test_02_output_dir_$i"
+    mkdir -p -- "$DIR"
+    cp -- slurm_test01.template.sh "$DIR/slurm_test01.sh"
+    cp -- slurm_test01.template.sh "$DIR/slurm_test02.sh"
+done
+
+datalad save -m "add test job dirs and scripts"
+
+echo " --> schedule some jobs"
+
+for i in "${TARGETS[@]}"; do
+    DIR="test_02_output_dir_$i"
+    # subshell: a failed cd cannot move the script out of the test repo
+    (cd -- "$DIR" && datalad schedule -o "$PWD" sbatch slurm_test01.sh)
+done
+
+sleep 5
+
+echo " --> now try to schedule conflicting jobs"
+
+for i in "${TARGETS[@]}"; do
+    DIR="test_02_output_dir_$i"
+    # same output dir as the jobs above, so datalad is expected to refuse
+    (cd -- "$DIR" && datalad schedule -o "$PWD" sbatch slurm_test02.sh)
+done
+
+# poll the queue until no job named DLtest01 is listed for this user
+while squeue -u "$USER" | grep -q "DLtest01"; do
+    echo " ... wait for jobs to finish"
+    sleep 60
+done
+
+datalad finish --list-open-jobs
+
+echo "finishing completed jobs:"
+datalad finish
+
+echo " ### git log in this repo ### "
+echo ""
+git log