Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CI move to ALPS (daint-mc -> eiger) #1192

Draft
wants to merge 13 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 18 additions & 18 deletions ci/.gitlab-ci.yml
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
include:
- local: 'ci/cpu/asan_ubsan_lsan.yml'
- local: 'ci/cpu/clang15_release_cxx20.yml'
- local: 'ci/cpu/clang15_release_stdexec.yml'
- local: 'ci/cpu/clang15_release.yml'
- local: 'ci/cpu/clang16_release.yml'
- local: 'ci/cpu/clang18_release.yml'
- local: 'ci/cpu/gcc11_release_stdexec.yml'
- local: 'ci/cpu/gcc11_debug_stdexec.yml'
- local: 'ci/cpu/gcc12_release_cxx20.yml'
- local: 'ci/cpu/gcc13_codecov.yml'
- local: 'ci/cpu/gcc13_release.yml'
- local: 'ci/cuda/gcc11_release.yml'
- local: 'ci/cuda/gcc11_release_scalapack.yml'
- local: 'ci/cuda/gcc11_codecov.yml'
- local: 'ci/cuda/gcc11_debug_scalapack.yml'
- local: 'ci/cuda/gcc13_release_stdexec.yml'
- local: 'ci/rocm/clang14_release.yml'
- local: 'ci/rocm/clang14_release_stdexec.yml'
# - local: 'ci/cpu/asan_ubsan_lsan.yml'
# - local: 'ci/cpu/clang15_release_cxx20.yml'
# - local: 'ci/cpu/clang15_release_stdexec.yml'
# - local: 'ci/cpu/clang15_release.yml'
# - local: 'ci/cpu/clang16_release.yml'
# - local: 'ci/cpu/clang18_release.yml'
# - local: 'ci/cpu/gcc11_release_stdexec.yml'
# - local: 'ci/cpu/gcc11_debug_stdexec.yml'
# - local: 'ci/cpu/gcc12_release_cxx20.yml'
# - local: 'ci/cpu/gcc13_codecov.yml'
# - local: 'ci/cpu/gcc13_release.yml'
- local: 'ci/cuda/gcc13_release.yml'
# - local: 'ci/cuda/gcc11_release_scalapack.yml'
# - local: 'ci/cuda/gcc11_codecov.yml'
# - local: 'ci/cuda/gcc11_debug_scalapack.yml'
# - local: 'ci/cuda/gcc13_release_stdexec.yml'
# - local: 'ci/rocm/clang14_release.yml'
# - local: 'ci/rocm/clang14_release_stdexec.yml'
94 changes: 72 additions & 22 deletions ci/common-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,9 @@ stages:
- build
- test

##
## BUILDS
##
## BUILD DEPS

.build_deps_common:
extends: .container-builder
.build_deps_common_base:
stage: build_deps
timeout: 6 hours
before_script:
Expand All @@ -24,10 +21,9 @@ stages:
- TAG_REPO=`find $SPACK_DLAF_REPO -type f -exec sha256sum {} \; | sha256sum - | head -c 16`
- TAG_ENVIRONMENT=`cat $SPACK_ENVIRONMENT $COMMON_SPACK_ENVIRONMENT | sha256sum | head -c 16`
- TAG=${TAG_IMAGE}-${TAG_APTGET}-${TAG_COMPILER}-MKL${USE_MKL}-${TAG_DOCKERFILE}-${TAG_SPACK}-${TAG_REPO}-${TAG_ENVIRONMENT}
- export PERSIST_IMAGE_NAME=$BUILD_IMAGE:$TAG
- echo "BUILD_IMAGE=$PERSIST_IMAGE_NAME" > build.env
- export PERSIST_IMAGE_NAME=$DEPS_IMAGE:$TAG
- echo "DEPS_IMAGE=$PERSIST_IMAGE_NAME" > build.env
- echo "USE_MKL=$USE_MKL" >> build.env
- echo "USE_ROCBLAS=$USE_ROCBLAS" >> build.env
- echo "USE_CODECOV=$USE_CODECOV" >> build.env
- 'echo "INFO: Building image $PERSIST_IMAGE_NAME"'
- 'echo "INFO: Using NUM_CORES_BUILD_DEPS=$NUM_CORES_BUILD_DEPS"'
Expand All @@ -39,7 +35,6 @@ stages:
SPACK_DLAF_REPO: ./spack
DOCKER_BUILD_ARGS: '[
"BASE_IMAGE",
"BUILDKIT_INLINE_CACHE=1",
"SPACK_SHA",
"EXTRA_APTGET",
"COMPILER",
Expand All @@ -57,51 +52,106 @@ stages:
EXTRA_APTGET: ""
CXXSTD: 17
USE_MKL: "OFF"
USE_ROCBLAS: "OFF"
COMMON_SPACK_ENVIRONMENT: ci/docker/common.yaml
USE_CODECOV: "false"

.build_common:
extends: .container-builder
# Hidden template (leading dot) for dependency-image build jobs.
# Combines the `.container-builder-cscs-zen2` template (defined outside this
# view) with the shared `.build_deps_common_base` settings.
.build_deps_common:
extends:
- .container-builder-cscs-zen2
- .build_deps_common_base
variables:
# Spack environment file shared by the builds that use this template.
COMMON_SPACK_ENVIRONMENT: ci/docker/common.yaml

# Hidden template for dependency-image build jobs on the gh200 builder.
# Same base settings as `.build_deps_common`, but it extends
# `.container-builder-cscs-gh200` and selects the gh200 common Spack file.
.build_deps_common_gh200:
extends:
- .container-builder-cscs-gh200
- .build_deps_common_base
variables:
COMMON_SPACK_ENVIRONMENT: ci/docker/common-gh200.yaml
# NOTE(review): reservation name is hard-coded here; confirm it is still
# required and valid on the target system.
SLURM_RESERVATION: "daint"
rasolca marked this conversation as resolved.
Show resolved Hide resolved

## BUILD DLAF

.build_common_base:
stage: build
timeout: 2 hours
before_script:
- 'echo $DOCKERHUB_TOKEN | podman login docker.io -u $DOCKERHUB_USERNAME --password-stdin'
- 'echo "INFO: Using NUM_CORES_BUILD_DLAF=$NUM_CORES_BUILD_DLAF"'
after_script:
- podman run -v $PWD/ci/ctest_to_gitlab.sh:/ctest_to_gitlab.sh $DEPLOY_IMAGE /ctest_to_gitlab.sh "$DEPLOY_IMAGE" "$USE_CODECOV" "$THREADS_PER_NODE" "$SLURM_CONSTRAINT" > pipeline.yml
- podman run -v $PWD/ci/ctest_to_gitlab.sh:/ctest_to_gitlab.sh $DLAF_IMAGE /ctest_to_gitlab.sh "$DLAF_IMAGE" "$USE_CODECOV" "$THREADS_MAX_PER_TASK" "$THREADS_PER_NODE" "$SLURM_CONSTRAINT" "$RUNNER" > pipeline.yml
variables:
PERSIST_IMAGE_NAME: $DEPLOY_IMAGE
PERSIST_IMAGE_NAME: $DLAF_IMAGE
DOCKER_BUILD_ARGS: '[
"BUILD_IMAGE",
"DEPLOY_BASE_IMAGE",
"EXTRA_APTGET_DEPLOY",
"DEPS_IMAGE",
"PIP_OPTS",
"USE_MKL",
"USE_ROCBLAS",
"NUM_PROCS=$NUM_CORES_BUILD_DLAF"
]'
# Default configuration variables;
# each can be overridden in the individual job configuration as needed.
DOCKERFILE: ci/docker/deploy.Dockerfile
DEPLOY_BASE_IMAGE: docker.io/ubuntu:24.04
EXTRA_APTGET_DEPLOY: ""
PIP_OPTS: ""
artifacts:
paths:
- pipeline.yml

# Hidden template for DLAF image build jobs.
# Combines the `.container-builder-cscs-zen2` template (defined outside this
# view) with the shared `.build_common_base` settings.
.build_common:
extends:
- .container-builder-cscs-zen2
- .build_common_base
variables:
# Preloads libSegFault from the x86_64 multiarch library directory.
# NOTE(review): presumably for crash backtraces; confirm the library exists
# in the image being used.
LD_PRELOAD: "/lib/x86_64-linux-gnu/libSegFault.so"


# Hidden template for DLAF image build jobs on the gh200 builder.
# Mirrors `.build_common`, but it extends `.container-builder-cscs-gh200` and
# points LD_PRELOAD at the aarch64 library directory.
.build_common_gh200:
extends:
- .container-builder-cscs-gh200
- .build_common_base
variables:
# NOTE(review): presumably for crash backtraces; confirm the library exists
# in the image being used.
LD_PRELOAD: "/lib/aarch64-linux-gnu/libSegFault.so"
# NOTE(review): reservation name is hard-coded here; confirm it is still
# required and valid on the target system.
SLURM_RESERVATION: "daint"

# Hidden variables-only template for the daint-mc target. Build jobs add it
# to their `extends` list next to a `.build_common*` template.
# The after_script in `.build_common_base` passes these values to
# ci/ctest_to_gitlab.sh when generating pipeline.yml.
.build_for_daint-mc:
variables:
# Name of the runner template handed to ctest_to_gitlab.sh.
RUNNER: ".container-runner-daint"
SLURM_CONSTRAINT: mc
THREADS_MAX_PER_TASK: 72
THREADS_PER_NODE: 72

# Hidden variables-only template for the daint-gpu target. It shares the
# runner name used by `.build_for_daint-mc`, but sets the `gpu` constraint and
# lower thread limits.
.build_for_daint-gpu:
variables:
# Name of the runner template handed to ctest_to_gitlab.sh.
RUNNER: ".container-runner-daint"
SLURM_CONSTRAINT: gpu
THREADS_MAX_PER_TASK: 24
THREADS_PER_NODE: 24

# Hidden variables-only template for the eiger target.
# Unlike the daint templates, the per-task thread cap (32) is smaller than the
# per-node thread count (256).
.build_for_eiger:
variables:
# Name of the runner template handed to ctest_to_gitlab.sh.
RUNNER: ".container-runner-eiger"
SLURM_CONSTRAINT: mc
THREADS_MAX_PER_TASK: 32
THREADS_PER_NODE: 256

# Hidden variables-only template for the ALPS gh200 target.
.build_for_alps_gh200:
variables:
# Name of the runner template handed to ctest_to_gitlab.sh.
RUNNER: ".container-runner-todi-gh200"
SLURM_CONSTRAINT: gpu
# 64 / 2 to avoid ranks on multiple sockets for RANK6
THREADS_MAX_PER_TASK: 32
# NOTE(review): same value as `.build_for_eiger`; confirm it matches the
# gh200 node layout.
THREADS_PER_NODE: 256

## RUN

# Hidden template for test-stage jobs that trigger a downstream pipeline.
# The pipeline to trigger is not set here; the concrete jobs that extend this
# template are expected to supply it.
.run_common:
stage: test
trigger:
# `depend` makes this job's status follow the triggered pipeline's result.
strategy: depend
forward:
# Forward pipeline variables to the triggered pipeline.
pipeline_variables: true

# Hidden template for test jobs on todi. Inherits the trigger settings from
# `.run_common` and adds SLURM and enroot variables.
.run_todi:
extends: .run_common
variables:
# NOTE(review): reservation name is hard-coded here; confirm it is still
# required and valid on the target system.
SLURM_RESERVATION: "daint"
SLURM_MPI: "pmi2"
# Workaround after update until hooks are fixed
# NOTE(review): this path is inside a user-specific scratch directory;
# confirm it is readable by the CI user and remove it once the hooks are fixed.
ENROOT_LIBRARY_PATH: /capstor/scratch/cscs/fmohamed/enrootlibn
10 changes: 4 additions & 6 deletions ci/cpu/asan_ubsan_lsan.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,18 @@ cpu asan ubsan lsan deps:
variables:
EXTRA_APTGET: "clang-18 libclang-rt-18-dev libomp-18-dev"
COMPILER: clang@18
USE_MKL: "ON"
SPACK_ENVIRONMENT: ci/docker/asan-ubsan-lsan.yaml
BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-asan-ubsan-lsan/build
USE_MKL: "ON"
DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-asan-ubsan-lsan/deps

cpu asan ubsan lsan build:
extends:
- .build_common
- .build_for_daint-mc
- .build_for_eiger
needs:
- cpu asan ubsan lsan deps
variables:
DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-asan-ubsan-lsan/deploy:$CI_COMMIT_SHA
# For symbolizing stacktraces with llvm-symbolizer
EXTRA_APTGET_DEPLOY: "llvm-18"
DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-asan-ubsan-lsan/dlaf:$CI_COMMIT_SHA

cpu asan ubsan lsan test:
extends: .run_common
Expand Down
8 changes: 4 additions & 4 deletions ci/cpu/clang15_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,18 @@ cpu clang15 release deps:
variables:
EXTRA_APTGET: "clang-15"
COMPILER: clang@15
USE_MKL: "ON"
SPACK_ENVIRONMENT: ci/docker/release-cpu-serial.yaml
BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-release/build
USE_MKL: "ON"
DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-release/deps

cpu clang15 release build:
extends:
- .build_common
- .build_for_daint-mc
- .build_for_eiger
needs:
- cpu clang15 release deps
variables:
DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-release/deploy:$CI_COMMIT_SHA
DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-release/dlaf:$CI_COMMIT_SHA

cpu clang15 release test:
extends: .run_common
Expand Down
6 changes: 3 additions & 3 deletions ci/cpu/clang15_release_cxx20.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,16 @@ cpu clang15 cxx20 release deps:
CXXSTD: 20
SPACK_ENVIRONMENT: ci/docker/release-cpu.yaml
USE_MKL: "ON"
BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-20-release/build
DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-20-release/deps

cpu clang15 cxx20 release build:
extends:
- .build_common
- .build_for_daint-mc
- .build_for_eiger
needs:
- cpu clang15 cxx20 release deps
variables:
DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-cxx20-release/deploy:$CI_COMMIT_SHA
DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-cxx20-release/dlaf:$CI_COMMIT_SHA

cpu clang15 cxx20 release test:
extends: .run_common
Expand Down
8 changes: 4 additions & 4 deletions ci/cpu/clang15_release_stdexec.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,18 @@ cpu clang15 stdexec release deps:
EXTRA_APTGET: "clang-15 libomp-15-dev"
COMPILER: clang@15
CXXSTD: 20
USE_MKL: "ON"
SPACK_ENVIRONMENT: ci/docker/release-cpu-stdexec.yaml
BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-stdexec-release/build
USE_MKL: "ON"
DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-stdexec-release/deps

cpu clang15 stdexec release build:
extends:
- .build_common
- .build_for_daint-mc
- .build_for_eiger
needs:
- cpu clang15 stdexec release deps
variables:
DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-stdexec-release/deploy:$CI_COMMIT_SHA
DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang15-stdexec-release/dlaf:$CI_COMMIT_SHA

cpu clang15 stdexec release test:
extends: .run_common
Expand Down
8 changes: 4 additions & 4 deletions ci/cpu/clang16_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,18 @@ cpu clang16 release deps:
variables:
EXTRA_APTGET: "clang-16 libomp-16-dev"
COMPILER: clang@16
USE_MKL: "ON"
SPACK_ENVIRONMENT: ci/docker/release-cpu.yaml
BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang16-release/build
USE_MKL: "ON"
DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang16-release/deps

cpu clang16 release build:
extends:
- .build_common
- .build_for_daint-mc
- .build_for_eiger
needs:
- cpu clang16 release deps
variables:
DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang16-release/deploy:$CI_COMMIT_SHA
DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang16-release/dlaf:$CI_COMMIT_SHA

cpu clang16 release test:
extends: .run_common
Expand Down
8 changes: 4 additions & 4 deletions ci/cpu/clang18_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,18 @@ cpu clang18 release deps:
variables:
EXTRA_APTGET: "clang-18 libomp-18-dev"
COMPILER: clang@18
USE_MKL: "ON"
SPACK_ENVIRONMENT: ci/docker/release-cpu.yaml
BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang18-release/build
USE_MKL: "ON"
DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang18-release/deps

cpu clang18 release build:
extends:
- .build_common
- .build_for_daint-mc
- .build_for_eiger
needs:
- cpu clang18 release deps
variables:
DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang18-release/deploy:$CI_COMMIT_SHA
DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-clang18-release/dlaf:$CI_COMMIT_SHA

cpu clang18 release test:
extends: .run_common
Expand Down
6 changes: 3 additions & 3 deletions ci/cpu/gcc11_debug_stdexec.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,16 @@ cpu gcc11 stdexec debug deps:
CXXSTD: 20
SPACK_ENVIRONMENT: ci/docker/debug-cpu-stdexec.yaml
USE_MKL: "ON"
BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc11-stdexec-debug/build
DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc11-stdexec-debug/deps

cpu gcc11 stdexec debug build:
extends:
- .build_common
- .build_for_daint-mc
- .build_for_eiger
needs:
- cpu gcc11 stdexec debug deps
variables:
DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc11-stdexec-debug/deploy:$CI_COMMIT_SHA
DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc11-stdexec-debug/dlaf:$CI_COMMIT_SHA

cpu gcc11 stdexec debug test:
extends: .run_common
Expand Down
8 changes: 4 additions & 4 deletions ci/cpu/gcc11_release_stdexec.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,18 @@ cpu gcc11 stdexec release deps:
EXTRA_APTGET: "gcc-11 g++-11 gfortran-11"
COMPILER: gcc@11
CXXSTD: 20
USE_MKL: "ON"
SPACK_ENVIRONMENT: ci/docker/release-cpu-stdexec.yaml
BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc11-stdexec-release/build
USE_MKL: "ON"
DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc11-stdexec-release/deps

cpu gcc11 stdexec release build:
extends:
- .build_common
- .build_for_daint-mc
- .build_for_eiger
needs:
- cpu gcc11 stdexec release deps
variables:
DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc11-stdexec-release/deploy:$CI_COMMIT_SHA
DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc11-stdexec-release/dlaf:$CI_COMMIT_SHA

cpu gcc11 stdexec release test:
extends: .run_common
Expand Down
6 changes: 3 additions & 3 deletions ci/cpu/gcc12_release_cxx20.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,16 @@ cpu gcc12 cxx20 release deps:
CXXSTD: 20
SPACK_ENVIRONMENT: ci/docker/release-cpu.yaml
USE_MKL: "ON"
BUILD_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc12-release/build
DEPS_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc12-release/deps

cpu gcc12 cxx20 release build:
extends:
- .build_common
- .build_for_daint-mc
- .build_for_eiger
needs:
- cpu gcc12 cxx20 release deps
variables:
DEPLOY_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc12-cxx20-release/deploy:$CI_COMMIT_SHA
DLAF_IMAGE: $CSCS_REGISTRY_PATH/cpu-gcc12-cxx20-release/dlaf:$CI_COMMIT_SHA

cpu gcc12 cxx20 release test:
extends: .run_common
Expand Down
Loading
Loading