Skip to content

Commit

Permalink
Merge branch 'nessi.no-2023.06' of github.com:NorESSI/software-layer …
Browse files Browse the repository at this point in the history
…into nessi-2023.06-Valgrind/3.21.0-gompi/2023b
  • Loading branch information
Richard Top committed Jun 17, 2024
2 parents eeade02 + b5cf785 commit 6794648
Show file tree
Hide file tree
Showing 6 changed files with 74 additions and 41 deletions.
3 changes: 3 additions & 0 deletions bot/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,9 @@ if [[ -z ${RESUME_DIR} ]]; then
else
TEST_STEP_ARGS+=("--resume" "${RESUME_DIR}")
fi
# Bind mount /sys/fs/cgroup so that we can determine the amount of memory available in our cgroup for
# the ReFrame configuration
TEST_STEP_ARGS+=("--extra-bind-paths" "/sys/fs/cgroup:/hostsys/fs/cgroup:ro")

# prepare arguments to test_suite.sh (specific to test step)
declare -a TEST_SUITE_ARGS=()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ easyconfigs:
from-pr: 20379
- ParaView-5.11.1-foss-2022b.eb
- SEPP-4.5.1-foss-2022b.eb
- Valgrind-3.21.0-gompi-2022b.eb
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ easyconfigs:
- PyTorch-2.1.2-foss-2023a-CUDA-12.1.1.eb:
options:
cuda-compute-capabilities: 6.0,6.1,7.0,7.5,8.0,8.6,8.9,9.0
- BLAST+-2.14.1-gompi-2023a.eb:
options:
from-pr: 20751
# PyTorch-bundle-CUDA's dependencies without CUDA
- librosa-0.10.1-foss-2023a.eb
- NLTK-3.8.1-foss-2023a.eb
Expand All @@ -76,3 +79,6 @@ easyconfigs:
- tensorboard-2.15.1-gfbf-2023a.eb
- tqdm-4.66.1-GCCcore-12.3.0.eb
- bx-python-0.10.0-foss-2023a.eb
# NOTE(review): BLAST+-2.14.1-gompi-2023a.eb is also listed earlier in this file with
# from-pr: 20751 — confirm which PR is intended and drop the duplicate entry
- BLAST+-2.14.1-gompi-2023a.eb:
options:
from-pr: 20784
89 changes: 49 additions & 40 deletions eessi_container.sh
Original file line number Diff line number Diff line change
Expand Up @@ -70,46 +70,48 @@ export EESSI_REPOS_CFG_FILE="${EESSI_REPOS_CFG_DIR}/repos.cfg"
display_help() {
echo "usage: $0 [OPTIONS] [[--] SCRIPT or COMMAND]"
echo " OPTIONS:"
echo " -a | --access {ro,rw} - ro (read-only), rw (read & write) [default: ro]"
echo " -c | --container IMG - image file or URL defining the container to use"
echo " [default: docker://ghcr.io/eessi/build-node:debian11]"
echo " -f | --fakeroot - run the container with --fakeroot [default: false]"
echo " Note, currently this option is ignored."
echo " -g | --storage DIR - directory space on host machine (used for"
echo " temporary data) [default: 1. TMPDIR, 2. /tmp]"
echo " -h | --help - display this usage information [default: false]"
echo " -i | --host-injections - directory to link to for host_injections "
echo " [default: /..storage../opt-eessi]"
echo " -l | --list-repos - list available repository identifiers [default: false]"
echo " -m | --mode MODE - with MODE==shell (launch interactive shell) or"
echo " MODE==run (run a script or command) [default: shell]"
echo " -n | --nvidia MODE - configure the container to work with NVIDIA GPUs,"
echo " MODE==install for a CUDA installation, MODE==run to"
echo " attach a GPU, MODE==all for both [default: false]"
echo " -o | --lower-dirs DIRS - list of ':' separated directories that are used"
echo " in front of the default lower dir (CVMFS repo);"
echo " fuse-overlayfs will merge all lower directories;"
echo " the option can be used to make certain directories"
echo " in the CVMFS repo writable [default: none]"
echo " -r | --repository CFG - configuration file or identifier defining the"
echo " repository to use [default: EESSI via"
echo " default container, see --container]"
echo " -u | --resume DIR/TGZ - resume a previous run from a directory or tarball,"
echo " where DIR points to a previously used tmp directory"
echo " (check for output 'Using DIR as tmp ...' of a previous"
echo " run) and TGZ is the path to a tarball which is"
echo " unpacked the tmp dir stored on the local storage space"
echo " (see option --storage above) [default: not set]"
echo " -s | --save DIR/TGZ - save contents of tmp directory to a tarball in"
echo " directory DIR or provided with the fixed full path TGZ"
echo " when a directory is provided, the format of the"
echo " tarball's name will be {REPO_ID}-{TIMESTAMP}.tgz"
echo " [default: not set]"
echo " -v | --verbose - display more information [default: false]"
echo " -x | --http-proxy URL - provides URL for the env variable http_proxy"
echo " [default: not set]; uses env var \$http_proxy if set"
echo " -y | --https-proxy URL - provides URL for the env variable https_proxy"
echo " [default: not set]; uses env var \$https_proxy if set"
echo " -a | --access {ro,rw} - ro (read-only), rw (read & write) [default: ro]"
echo " -b | --extra-bind-paths - specify extra paths to be bound into the container."
echo " To specify multiple bind paths, separate by comma."
echo " Example: '/src:/dest:ro,/src2:/dest2:rw'"
echo " -c | --container IMG - image file or URL defining the container to use"
echo " [default: docker://ghcr.io/eessi/build-node:debian11]"
echo " -f | --fakeroot - run the container with --fakeroot [default: false]"
echo " -g | --storage DIR - directory space on host machine (used for"
echo " temporary data) [default: 1. TMPDIR, 2. /tmp]"
echo " -h | --help - display this usage information [default: false]"
echo " -i | --host-injections - directory to link to for host_injections "
echo " [default: /..storage../opt-eessi]"
echo " -l | --list-repos - list available repository identifiers [default: false]"
echo " -m | --mode MODE - with MODE==shell (launch interactive shell) or"
echo " MODE==run (run a script or command) [default: shell]"
echo " -n | --nvidia MODE - configure the container to work with NVIDIA GPUs,"
echo " MODE==install for a CUDA installation, MODE==run to"
echo " attach a GPU, MODE==all for both [default: false]"
echo " -o | --lower-dirs DIRS - list of ':' separated directories that are used"
echo " in front of the default lower dir (CVMFS repo);"
echo " fuse-overlayfs will merge all lower directories;"
echo " the option can be used to make certain directories"
echo " in the CVMFS repo writable [default: none]"
echo " -r | --repository CFG - configuration file or identifier defining the"
echo " repository to use [default: EESSI via"
echo " default container, see --container]"
echo " -u | --resume DIR/TGZ - resume a previous run from a directory or tarball,"
echo " where DIR points to a previously used tmp directory"
echo " (check for output 'Using DIR as tmp ...' of a previous"
echo " run) and TGZ is the path to a tarball which is"
echo " unpacked the tmp dir stored on the local storage space"
echo " (see option --storage above) [default: not set]"
echo " -s | --save DIR/TGZ - save contents of tmp directory to a tarball in"
echo " directory DIR or provided with the fixed full path TGZ"
echo " when a directory is provided, the format of the"
echo " tarball's name will be {REPO_ID}-{TIMESTAMP}.tgz"
echo " [default: not set]"
echo " -v | --verbose - display more information [default: false]"
echo " -x | --http-proxy URL - provides URL for the env variable http_proxy"
echo " [default: not set]; uses env var \$http_proxy if set"
echo " -y | --https-proxy URL - provides URL for the env variable https_proxy"
echo " [default: not set]; uses env var \$https_proxy if set"
echo
echo " If value for --mode is 'run', the SCRIPT/COMMAND provided is executed. If"
echo " arguments to the script/command start with '-' or '--', use the flag terminator"
Expand Down Expand Up @@ -141,6 +143,10 @@ while [[ $# -gt 0 ]]; do
ACCESS="$2"
shift 2
;;
-b|--extra-bind-paths)
EXTRA_BIND_PATHS="$2"
shift 2
;;
-c|--container)
CONTAINER="$2"
shift 2
Expand Down Expand Up @@ -453,6 +459,9 @@ fi
# Assemble the comma-separated list of bind mounts handed to the container,
# starting with the CVMFS state directories and the host_injections directory
BIND_PATHS="${EESSI_CVMFS_VAR_LIB}:/var/lib/cvmfs"
BIND_PATHS+=",${EESSI_CVMFS_VAR_RUN}:/var/run/cvmfs"
BIND_PATHS+=",${HOST_INJECTIONS}:/opt/eessi"
# provide a '/tmp' inside the container
BIND_PATHS+=",${EESSI_TMPDIR}:${TMP_IN_CONTAINER}"
# append user-supplied bind paths (option -b | --extra-bind-paths), if any
if [[ -n "${EXTRA_BIND_PATHS}" ]]; then
  BIND_PATHS+=",${EXTRA_BIND_PATHS}"
fi

[[ ${VERBOSE} -eq 1 ]] && echo "BIND_PATHS=${BIND_PATHS}"

Expand Down
5 changes: 5 additions & 0 deletions reframe_config_bot.py.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ site_configuration = {
'options': ['--mem={size}'],
}
],
'extras': {
# Make sure to round down, otherwise a job might ask for more mem than is available
# per node
'mem_per_node': __MEM_PER_NODE__,
},
'max_jobs': 1
}
]
Expand Down
11 changes: 10 additions & 1 deletion test_suite.sh
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ export RFM_PREFIX=$PWD/reframe_runs
echo "Configured reframe with the following environment variables:"
env | grep "RFM_"

# Inject correct CPU properties into the ReFrame config file
# Inject correct CPU/memory properties into the ReFrame config file
cpuinfo=$(lscpu)
if [[ "${cpuinfo}" =~ CPU\(s\):[^0-9]*([0-9]+) ]]; then
cpu_count=${BASH_REMATCH[1]}
Expand Down Expand Up @@ -165,11 +165,20 @@ if [[ "${cpuinfo}" =~ (Core\(s\) per socket:[^0-9]*([0-9]+)) ]]; then
else
fatal_error "Failed to get the number of cores per socket for the current test hardware with lscpu."
fi
# Determine the memory limit (in MiB) of the cgroup of the current Slurm job.
# The limit is read from the cgroup hierarchy under /hostsys/fs/cgroup
# (presumably the host's /sys/fs/cgroup bind mounted into the container — verify
# against the caller's bind paths).
cgroup_mem_limit_file="/hostsys/fs/cgroup/memory/slurm/uid_${UID}/job_${SLURM_JOB_ID}/memory.limit_in_bytes"
if ! cgroup_mem_bytes=$(cat "${cgroup_mem_limit_file}"); then
  fatal_error "Failed to get the memory limit in bytes from the current cgroup"
elif [[ ! "${cgroup_mem_bytes}" =~ ^[0-9]+$ ]]; then
  # A non-numeric value would be interpreted as a variable name by the arithmetic
  # expansion below and silently evaluate to 0, so reject it explicitly
  fatal_error "Unexpected memory limit '${cgroup_mem_bytes}' read from ${cgroup_mem_limit_file}"
else
  # Convert to MiB (integer division rounds down)
  cgroup_mem_mib=$((cgroup_mem_bytes/(1024*1024)))
fi
# Create the ReFrame config file from its template and fill in the CPU properties
# detected above; expansions are quoted so that paths containing whitespace or
# glob characters are passed through unchanged
cp "${RFM_CONFIG_FILE_TEMPLATE}" "${RFM_CONFIG_FILES}"
sed -i \
  -e "s/__NUM_CPUS__/${cpu_count}/g" \
  -e "s/__NUM_SOCKETS__/${socket_count}/g" \
  -e "s/__NUM_CPUS_PER_CORE__/${threads_per_core}/g" \
  -e "s/__NUM_CPUS_PER_SOCKET__/${cores_per_socket}/g" \
  "${RFM_CONFIG_FILES}"
# NOTE: the substitution below is disabled because it does not work on local systems (it does work on AWS)
# sed -i "s/__MEM_PER_NODE__/${cgroup_mem_mib}/g" $RFM_CONFIG_FILES

# Workaround for https://github.com/EESSI/software-layer/pull/467#issuecomment-1973341966
export PSM3_DEVICES='self,shm' # this is enough, since we only run single node for now
Expand Down

0 comments on commit 6794648

Please sign in to comment.