diff --git a/bot/test.sh b/bot/test.sh
index 4984340e6e..04bff346cd 100755
--- a/bot/test.sh
+++ b/bot/test.sh
@@ -204,6 +204,9 @@ if [[ -z ${RESUME_DIR} ]]; then
 else
     TEST_STEP_ARGS+=("--resume" "${RESUME_DIR}")
 fi
+# Bind mount /sys/fs/cgroup so that we can determine the amount of memory available in our cgroup for
+# ReFrame configuration
+TEST_STEP_ARGS+=("--extra-bind-paths" "/sys/fs/cgroup:/hostsys/fs/cgroup:ro")
 
 # prepare arguments to test_suite.sh (specific to test step)
 declare -a TEST_SUITE_ARGS=()
diff --git a/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2022b.yml b/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2022b.yml
index 1bca410982..1699960669 100644
--- a/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2022b.yml
+++ b/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2022b.yml
@@ -13,3 +13,4 @@ easyconfigs:
         from-pr: 20379
   - ParaView-5.11.1-foss-2022b.eb
   - SEPP-4.5.1-foss-2022b.eb
+  - Valgrind-3.21.0-gompi-2022b.eb
diff --git a/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023a.yml b/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023a.yml
index 3f6978f36e..e2a6978e10 100644
--- a/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023a.yml
+++ b/easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023a.yml
@@ -55,6 +55,9 @@ easyconfigs:
   - PyTorch-2.1.2-foss-2023a-CUDA-12.1.1.eb:
       options:
         cuda-compute-capabilities: 6.0,6.1,7.0,7.5,8.0,8.6,8.9,9.0
+  - BLAST+-2.14.1-gompi-2023a.eb:
+      options:
+        from-pr: 20751
 # PyTorch-bundle-CUDA's dependencies without CUDA
   - librosa-0.10.1-foss-2023a.eb
   - NLTK-3.8.1-foss-2023a.eb
@@ -76,3 +79,7 @@ easyconfigs:
   - tensorboard-2.15.1-gfbf-2023a.eb
   - tqdm-4.66.1-GCCcore-12.3.0.eb
   - bx-python-0.10.0-foss-2023a.eb
+  # NOTE(review): BLAST+-2.14.1-gompi-2023a.eb is also listed earlier in this file with from-pr: 20751;
+  # confirm which PR is intended and drop the other entry
+  - BLAST+-2.14.1-gompi-2023a.eb:
+      options:
+        from-pr: 20784
diff --git a/eessi_container.sh b/eessi_container.sh
index e6bb13cbe7..6495ee5401 100755
--- a/eessi_container.sh
+++ b/eessi_container.sh
@@ -70,46 +70,48 @@ export EESSI_REPOS_CFG_FILE="${EESSI_REPOS_CFG_DIR}/repos.cfg"
 display_help() {
   echo "usage: $0 [OPTIONS] [[--] SCRIPT or COMMAND]"
   echo " OPTIONS:"
-  echo " -a | --access {ro,rw} - ro (read-only), rw (read & write) [default: ro]"
-  echo " -c | --container IMG - image file or URL defining the container to use"
-  echo " [default: docker://ghcr.io/eessi/build-node:debian11]"
-  echo " -f | --fakeroot - run the container with --fakeroot [default: false]"
-  echo " Note, currently this option is ignored."
-  echo " -g | --storage DIR - directory space on host machine (used for"
-  echo " temporary data) [default: 1. TMPDIR, 2. /tmp]"
-  echo " -h | --help - display this usage information [default: false]"
-  echo " -i | --host-injections - directory to link to for host_injections "
-  echo " [default: /..storage../opt-eessi]"
-  echo " -l | --list-repos - list available repository identifiers [default: false]"
-  echo " -m | --mode MODE - with MODE==shell (launch interactive shell) or"
-  echo " MODE==run (run a script or command) [default: shell]"
-  echo " -n | --nvidia MODE - configure the container to work with NVIDIA GPUs,"
-  echo " MODE==install for a CUDA installation, MODE==run to"
-  echo " attach a GPU, MODE==all for both [default: false]"
-  echo " -o | --lower-dirs DIRS - list of ':' separated directories that are used"
-  echo " in front of the default lower dir (CVMFS repo);"
-  echo " fuse-overlayfs will merge all lower directories;"
-  echo " the option can be used to make certain directories"
-  echo " in the CVMFS repo writable [default: none]"
-  echo " -r | --repository CFG - configuration file or identifier defining the"
-  echo " repository to use [default: EESSI via"
-  echo " default container, see --container]"
-  echo " -u | --resume DIR/TGZ - resume a previous run from a directory or tarball,"
-  echo " where DIR points to a previously used tmp directory"
-  echo " (check for output 'Using DIR as tmp ...' of a previous"
-  echo " run) and TGZ is the path to a tarball which is"
-  echo " unpacked the tmp dir stored on the local storage space"
-  echo " (see option --storage above) [default: not set]"
-  echo " -s | --save DIR/TGZ - save contents of tmp directory to a tarball in"
-  echo " directory DIR or provided with the fixed full path TGZ"
-  echo " when a directory is provided, the format of the"
-  echo " tarball's name will be {REPO_ID}-{TIMESTAMP}.tgz"
-  echo " [default: not set]"
-  echo " -v | --verbose - display more information [default: false]"
-  echo " -x | --http-proxy URL - provides URL for the env variable http_proxy"
-  echo " [default: not set]; uses env var \$http_proxy if set"
-  echo " -y | --https-proxy URL - provides URL for the env variable https_proxy"
-  echo " [default: not set]; uses env var \$https_proxy if set"
+  echo " -a | --access {ro,rw} - ro (read-only), rw (read & write) [default: ro]"
+  echo " -b | --extra-bind-paths - specify extra paths to be bound into the container."
+  echo " To specify multiple bind paths, separate by comma."
+  echo " Example: '/src:/dest:ro,/src2:/dest2:rw'"
+  echo " -c | --container IMG - image file or URL defining the container to use"
+  echo " [default: docker://ghcr.io/eessi/build-node:debian11]"
+  echo " -f | --fakeroot - run the container with --fakeroot [default: false]"
+  echo " -g | --storage DIR - directory space on host machine (used for"
+  echo " temporary data) [default: 1. TMPDIR, 2. /tmp]"
+  echo " -h | --help - display this usage information [default: false]"
+  echo " -i | --host-injections - directory to link to for host_injections "
+  echo " [default: /..storage../opt-eessi]"
+  echo " -l | --list-repos - list available repository identifiers [default: false]"
+  echo " -m | --mode MODE - with MODE==shell (launch interactive shell) or"
+  echo " MODE==run (run a script or command) [default: shell]"
+  echo " -n | --nvidia MODE - configure the container to work with NVIDIA GPUs,"
+  echo " MODE==install for a CUDA installation, MODE==run to"
+  echo " attach a GPU, MODE==all for both [default: false]"
+  echo " -o | --lower-dirs DIRS - list of ':' separated directories that are used"
+  echo " in front of the default lower dir (CVMFS repo);"
+  echo " fuse-overlayfs will merge all lower directories;"
+  echo " the option can be used to make certain directories"
+  echo " in the CVMFS repo writable [default: none]"
+  echo " -r | --repository CFG - configuration file or identifier defining the"
+  echo " repository to use [default: EESSI via"
+  echo " default container, see --container]"
+  echo " -u | --resume DIR/TGZ - resume a previous run from a directory or tarball,"
+  echo " where DIR points to a previously used tmp directory"
+  echo " (check for output 'Using DIR as tmp ...' of a previous"
+  echo " run) and TGZ is the path to a tarball which is"
+  echo " unpacked the tmp dir stored on the local storage space"
+  echo " (see option --storage above) [default: not set]"
+  echo " -s | --save DIR/TGZ - save contents of tmp directory to a tarball in"
+  echo " directory DIR or provided with the fixed full path TGZ"
+  echo " when a directory is provided, the format of the"
+  echo " tarball's name will be {REPO_ID}-{TIMESTAMP}.tgz"
+  echo " [default: not set]"
+  echo " -v | --verbose - display more information [default: false]"
+  echo " -x | --http-proxy URL - provides URL for the env variable http_proxy"
+  echo " [default: not set]; uses env var \$http_proxy if set"
+  echo " -y | --https-proxy URL - provides URL for the env variable https_proxy"
+  echo " [default: not set]; uses env var \$https_proxy if set"
   echo
   echo " If value for --mode is 'run', the SCRIPT/COMMAND provided is executed. If"
   echo " arguments to the script/command start with '-' or '--', use the flag terminator"
@@ -141,6 +143,10 @@ while [[ $# -gt 0 ]]; do
       ACCESS="$2"
       shift 2
       ;;
+    -b|--extra-bind-paths)
+      EXTRA_BIND_PATHS="$2"
+      shift 2
+      ;;
     -c|--container)
       CONTAINER="$2"
       shift 2
@@ -453,6 +459,9 @@ fi
 BIND_PATHS="${EESSI_CVMFS_VAR_LIB}:/var/lib/cvmfs,${EESSI_CVMFS_VAR_RUN}:/var/run/cvmfs,${HOST_INJECTIONS}:/opt/eessi"
 # provide a '/tmp' inside the container
 BIND_PATHS="${BIND_PATHS},${EESSI_TMPDIR}:${TMP_IN_CONTAINER}"
+if [[ -n "${EXTRA_BIND_PATHS}" ]]; then
+    BIND_PATHS="${BIND_PATHS},${EXTRA_BIND_PATHS}"
+fi
 
 [[ ${VERBOSE} -eq 1 ]] && echo "BIND_PATHS=${BIND_PATHS}"
 
diff --git a/reframe_config_bot.py.tmpl b/reframe_config_bot.py.tmpl
index 0cc3e9f530..607373767a 100644
--- a/reframe_config_bot.py.tmpl
+++ b/reframe_config_bot.py.tmpl
@@ -34,6 +34,11 @@ site_configuration = {
                             'options': ['--mem={size}'],
                         }
                     ],
+                    'extras': {
+                        # Make sure to round down, otherwise a job might ask for more mem than is available
+                        # per node
+                        'mem_per_node': __MEM_PER_NODE__,
+                    },
                     'max_jobs': 1
                 }
             ]
diff --git a/test_suite.sh b/test_suite.sh
index 2f304dd9bc..59407f49cb 100755
--- a/test_suite.sh
+++ b/test_suite.sh
@@ -135,7 +135,7 @@ export RFM_PREFIX=$PWD/reframe_runs
 echo "Configured reframe with the following environment variables:"
 env | grep "RFM_"
 
-# Inject correct CPU properties into the ReFrame config file
+# Inject correct CPU/memory properties into the ReFrame config file
 cpuinfo=$(lscpu)
 if [[ "${cpuinfo}" =~ CPU\(s\):[^0-9]*([0-9]+) ]]; then
     cpu_count=${BASH_REMATCH[1]}
@@ -165,11 +165,26 @@ if [[ "${cpuinfo}" =~ (Core\(s\) per socket:[^0-9]*([0-9]+)) ]]; then
 else
     fatal_error "Failed to get the number of cores per socket for the current test hardware with lscpu."
 fi
+# Determine the memory (in MiB, rounded down) that ReFrame may request per node.
+# Prefer the memory limit of the Slurm job's cgroup (bot/test.sh bind mounts /sys/fs/cgroup at
+# /hostsys/fs/cgroup); on systems where that file does not exist, fall back to the total
+# memory reported by /proc/meminfo instead of aborting the test step.
+cgroup_mem_limit_file="/hostsys/fs/cgroup/memory/slurm/uid_${UID}/job_${SLURM_JOB_ID}/memory.limit_in_bytes"
+if [[ -r "${cgroup_mem_limit_file}" ]] && cgroup_mem_bytes=$(<"${cgroup_mem_limit_file}"); then
+    # Convert to MiB (integer division rounds down)
+    cgroup_mem_mib=$((cgroup_mem_bytes/(1024*1024)))
+elif [[ "$(</proc/meminfo)" =~ MemTotal:[^0-9]*([0-9]+) ]]; then
+    # MemTotal is reported in kB; convert to MiB (integer division rounds down)
+    cgroup_mem_mib=$((BASH_REMATCH[1]/1024))
+else
+    fatal_error "Failed to get the memory limit from the current cgroup or from /proc/meminfo"
+fi
 cp ${RFM_CONFIG_FILE_TEMPLATE} ${RFM_CONFIG_FILES}
 sed -i "s/__NUM_CPUS__/${cpu_count}/g" $RFM_CONFIG_FILES
 sed -i "s/__NUM_SOCKETS__/${socket_count}/g" $RFM_CONFIG_FILES
 sed -i "s/__NUM_CPUS_PER_CORE__/${threads_per_core}/g" $RFM_CONFIG_FILES
 sed -i "s/__NUM_CPUS_PER_SOCKET__/${cores_per_socket}/g" $RFM_CONFIG_FILES
+sed -i "s/__MEM_PER_NODE__/${cgroup_mem_mib}/g" $RFM_CONFIG_FILES
 
 # Workaround for https://github.com/EESSI/software-layer/pull/467#issuecomment-1973341966
 export PSM3_DEVICES='self,shm' # this is enough, since we only run single node for now