From 095124210c23bbdc166908f82fcb6554720d41a4 Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Wed, 7 Aug 2024 13:32:57 +0100 Subject: [PATCH 1/6] Turn off higher priority MPI net devices --- ansible/roles/hpctests/templates/pingpong.sh.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/hpctests/templates/pingpong.sh.j2 b/ansible/roles/hpctests/templates/pingpong.sh.j2 index 4dc2eebd5..ae652e71c 100644 --- a/ansible/roles/hpctests/templates/pingpong.sh.j2 +++ b/ansible/roles/hpctests/templates/pingpong.sh.j2 @@ -16,4 +16,4 @@ echo UCX_NET_DEVICES: $UCX_NET_DEVICES module load {{ hpctests_pingpong_modules | join(' ' ) }} #srun --mpi=pmi2 IMB-MPI1 pingpong # doesn't work in ohpc v2.1 -mpirun IMB-MPI1 pingpong +mpirun -mca pml_ucx_tls any -mca pml_ucx_devices any IMB-MPI1 pingpong From 9bedf4c37e57753b1387323b7d3c65b713c6ff8f Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Wed, 7 Aug 2024 13:33:44 +0100 Subject: [PATCH 2/6] Update pingmatrix.sh.j2 --- ansible/roles/hpctests/templates/pingmatrix.sh.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/hpctests/templates/pingmatrix.sh.j2 b/ansible/roles/hpctests/templates/pingmatrix.sh.j2 index d886e9ac8..6498c671c 100644 --- a/ansible/roles/hpctests/templates/pingmatrix.sh.j2 +++ b/ansible/roles/hpctests/templates/pingmatrix.sh.j2 @@ -16,4 +16,4 @@ echo UCX_NET_DEVICES: $UCX_NET_DEVICES module load {{ hpctests_pingmatrix_modules | join(' ' ) }} mpicc -o nxnlatbw mpi_nxnlatbw.c -mpirun nxnlatbw +mpirun -mca pml_ucx_tls any -mca pml_ucx_devices any nxnlatbw From eba9883254d1ed968b1499c5687b94137853ff9d Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Wed, 7 Aug 2024 15:09:44 +0100 Subject: [PATCH 3/6] Update pingmatrix.sh.j2 --- ansible/roles/hpctests/templates/pingmatrix.sh.j2 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ansible/roles/hpctests/templates/pingmatrix.sh.j2 b/ansible/roles/hpctests/templates/pingmatrix.sh.j2 index 6498c671c..d91b41dc3 100644 --- a/ansible/roles/hpctests/templates/pingmatrix.sh.j2 +++ b/ansible/roles/hpctests/templates/pingmatrix.sh.j2 @@ -16,4 +16,8 @@ echo UCX_NET_DEVICES: $UCX_NET_DEVICES module load {{ hpctests_pingmatrix_modules | join(' ' ) }} mpicc -o nxnlatbw mpi_nxnlatbw.c + +{# mpirun flags force using UCX TCP transports, overriding higher #} +{# priority of OpenMPI btl/openib component, which is also using RDMA #} +{# https://wiki.stackhpc.com/s/985dae84-7bd8-4924-94b7-9629a7827100 #} mpirun -mca pml_ucx_tls any -mca pml_ucx_devices any nxnlatbw From edd730ee2b1366a41dcb3a95a827852a5c20dd2b Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Wed, 7 Aug 2024 15:10:08 +0100 Subject: [PATCH 4/6] Update pingpong.sh.j2 --- ansible/roles/hpctests/templates/pingpong.sh.j2 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ansible/roles/hpctests/templates/pingpong.sh.j2 b/ansible/roles/hpctests/templates/pingpong.sh.j2 index ae652e71c..ee1e8a99d 100644 --- a/ansible/roles/hpctests/templates/pingpong.sh.j2 +++ b/ansible/roles/hpctests/templates/pingpong.sh.j2 @@ -16,4 +16,8 @@ echo UCX_NET_DEVICES: $UCX_NET_DEVICES module load {{ hpctests_pingpong_modules | join(' ' ) }} #srun --mpi=pmi2 IMB-MPI1 pingpong # doesn't work in ohpc v2.1 + +{# mpirun flags force using UCX TCP transports, overriding higher #} +{# priority of OpenMPI btl/openib component, which is also using RDMA #} +{# https://wiki.stackhpc.com/s/985dae84-7bd8-4924-94b7-9629a7827100 #} mpirun -mca pml_ucx_tls any -mca pml_ucx_devices any IMB-MPI1 pingpong From d34687124058fe3e7fef984d98147eadf39601e2 Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Wed, 7 Aug 2024 15:19:17 +0100 Subject: [PATCH 5/6] Replace j2 comments with bash --- ansible/roles/hpctests/templates/pingmatrix.sh.j2 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ansible/roles/hpctests/templates/pingmatrix.sh.j2 b/ansible/roles/hpctests/templates/pingmatrix.sh.j2 index d91b41dc3..990018d85 100644 --- a/ansible/roles/hpctests/templates/pingmatrix.sh.j2 +++ b/ansible/roles/hpctests/templates/pingmatrix.sh.j2 @@ -17,7 +17,7 @@ module load {{ hpctests_pingmatrix_modules | join(' ' ) }} mpicc -o nxnlatbw mpi_nxnlatbw.c -{# mpirun flags force using UCX TCP transports, overriding higher #} -{# priority of OpenMPI btl/openib component, which is also using RDMA #} -{# https://wiki.stackhpc.com/s/985dae84-7bd8-4924-94b7-9629a7827100 #} +# mpirun flags force using UCX TCP transports, overriding higher +# priority of OpenMPI btl/openib component, which is also using RDMA +# https://wiki.stackhpc.com/s/985dae84-7bd8-4924-94b7-9629a7827100 mpirun -mca pml_ucx_tls any -mca pml_ucx_devices any nxnlatbw From 055a31cb3a008192ecf9d4101d1dab697afe9625 Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Wed, 7 Aug 2024 15:19:46 +0100 Subject: [PATCH 6/6] Update pingpong.sh.j2 --- ansible/roles/hpctests/templates/pingpong.sh.j2 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ansible/roles/hpctests/templates/pingpong.sh.j2 b/ansible/roles/hpctests/templates/pingpong.sh.j2 index ee1e8a99d..dad4499b1 100644 --- a/ansible/roles/hpctests/templates/pingpong.sh.j2 +++ b/ansible/roles/hpctests/templates/pingpong.sh.j2 @@ -17,7 +17,7 @@ module load {{ hpctests_pingpong_modules | join(' ' ) }} #srun --mpi=pmi2 IMB-MPI1 pingpong # doesn't work in ohpc v2.1 -{# mpirun flags force using UCX TCP transports, overriding higher #} -{# priority of OpenMPI btl/openib component, which is also using RDMA #} -{# https://wiki.stackhpc.com/s/985dae84-7bd8-4924-94b7-9629a7827100 #} +# mpirun flags force using UCX TCP transports, overriding higher +# priority of OpenMPI btl/openib component, which is also using RDMA +# https://wiki.stackhpc.com/s/985dae84-7bd8-4924-94b7-9629a7827100 mpirun -mca pml_ucx_tls any -mca pml_ucx_devices any IMB-MPI1 pingpong