From 9c4082f84b4e08a07157406daa514625abf5fc5c Mon Sep 17 00:00:00 2001 From: Harmen Stoppels Date: Thu, 21 Jan 2021 15:28:45 +0100 Subject: [PATCH 1/2] Add very rudimentary support for AMD GPUs - Bind mount /dev/dri and /dev/kfd in the rootfs - Add the amdgpu hook and install it by default - Enable amdgpu when --amdgpu is passed --- etc/CMakeLists.txt | 3 +- etc/templates/hooks.d/11-amdgpu-hook.json.in | 12 ++ src/cli/CommandRun.hpp | 4 + src/cli/test/test_CLI.cpp | 6 + src/common/Config.hpp | 1 + src/hooks/CMakeLists.txt | 1 + src/hooks/amdgpu/AmdGpuHook.cpp | 115 +++++++++++++++++++ src/hooks/amdgpu/AmdGpuHook.hpp | 49 ++++++++ src/hooks/amdgpu/CMakeLists.txt | 14 +++ src/hooks/amdgpu/main.cpp | 24 ++++ src/runtime/ConfigsMerger.cpp | 4 + 11 files changed, 232 insertions(+), 1 deletion(-) create mode 100644 etc/templates/hooks.d/11-amdgpu-hook.json.in create mode 100644 src/hooks/amdgpu/AmdGpuHook.cpp create mode 100644 src/hooks/amdgpu/AmdGpuHook.hpp create mode 100644 src/hooks/amdgpu/CMakeLists.txt create mode 100644 src/hooks/amdgpu/main.cpp diff --git a/etc/CMakeLists.txt b/etc/CMakeLists.txt index f2887bd1..9ad107a3 100644 --- a/etc/CMakeLists.txt +++ b/etc/CMakeLists.txt @@ -5,4 +5,5 @@ install(FILES sarus.schema.json hook.schema.json definitions.schema.json DESTINA install(DIRECTORY templates DESTINATION ${CMAKE_INSTALL_PREFIX}/etc) install(FILES templates/hooks.d/07-ssh-hook.json.in DESTINATION ${CMAKE_INSTALL_PREFIX}/etc/hooks.d) -install(FILES templates/hooks.d/09-slurm-global-sync-hook.json.in DESTINATION ${CMAKE_INSTALL_PREFIX}/etc/hooks.d) \ No newline at end of file +install(FILES templates/hooks.d/09-slurm-global-sync-hook.json.in DESTINATION ${CMAKE_INSTALL_PREFIX}/etc/hooks.d) +install(FILES templates/hooks.d/11-amdgpu-hook.json.in DESTINATION ${CMAKE_INSTALL_PREFIX}/etc/hooks.d) \ No newline at end of file diff --git a/etc/templates/hooks.d/11-amdgpu-hook.json.in b/etc/templates/hooks.d/11-amdgpu-hook.json.in new file mode 100644 index 
00000000..00d1181a --- /dev/null +++ b/etc/templates/hooks.d/11-amdgpu-hook.json.in @@ -0,0 +1,12 @@ +{ + "version": "1.0.0", + "hook": { + "path": "@INSTALL_PATH@/bin/amdgpu_hook" + }, + "when": { + "annotations": { + "^com.hooks.amdgpu.enabled$": "^true$" + } + }, + "stages": ["prestart"] +} diff --git a/src/cli/CommandRun.hpp b/src/cli/CommandRun.hpp index 6f99e9b5..cf4f00af 100644 --- a/src/cli/CommandRun.hpp +++ b/src/cli/CommandRun.hpp @@ -106,6 +106,7 @@ class CommandRun : public Command { boost::program_options::value>(&conf->commandRun.userMounts), "Mount custom directories into the container") ("mpi,m", "Enable MPI support. Implies '--glibc'") + ("amdgpu", "Enable AMD GPU support.") ("ssh", "Enable SSH in the container") ("tty,t", "Allocate a pseudo-TTY in the container") ("workdir,w", @@ -158,6 +159,9 @@ class CommandRun : public Command { if(values.count("mpi")) { conf->commandRun.useMPI = true; } + if(values.count("amdgpu")) { + conf->commandRun.enableAmdGpu = true; + } if(values.count("ssh")) { conf->commandRun.enableSSH = true; } diff --git a/src/cli/test/test_CLI.cpp b/src/cli/test/test_CLI.cpp index 96dbaa69..22adf494 100644 --- a/src/cli/test/test_CLI.cpp +++ b/src/cli/test/test_CLI.cpp @@ -206,6 +206,7 @@ TEST(CLITestGroup, generated_config_for_CommandRun) { CHECK_EQUAL(conf->commandRun.addInitProcess, false); CHECK_EQUAL(conf->commandRun.mounts.size(), 1); // 1 site mount + 0 user mount CHECK_EQUAL(conf->commandRun.useMPI, false); + CHECK_EQUAL(conf->commandRun.enableAmdGpu, false); CHECK_EQUAL(conf->commandRun.enableGlibcReplacement, 0); CHECK_EQUAL(conf->commandRun.enableSSH, false); CHECK_EQUAL(conf->commandRun.allocatePseudoTTY, false); @@ -247,6 +248,11 @@ TEST(CLITestGroup, generated_config_for_CommandRun) { conf = generateConfig({"run", "-m", "image"}); CHECK_EQUAL(conf->commandRun.useMPI, true); } + // amdgpu + { + auto conf = generateConfig({"run", "--amdgpu", "image"}); + CHECK_EQUAL(conf->commandRun.enableAmdGpu, true); + } // ssh { 
auto conf = generateConfig({"run", "--ssh", "image"}); diff --git a/src/common/Config.hpp b/src/common/Config.hpp index cee686a2..7460887a 100644 --- a/src/common/Config.hpp +++ b/src/common/Config.hpp @@ -74,6 +74,7 @@ class Config { bool useMPI = false; bool enableGlibcReplacement = false; bool enableSSH = false; + bool enableAmdGpu = false; }; boost::filesystem::path getImageFile() const; diff --git a/src/hooks/CMakeLists.txt b/src/hooks/CMakeLists.txt index ad0194d1..d0702f44 100644 --- a/src/hooks/CMakeLists.txt +++ b/src/hooks/CMakeLists.txt @@ -6,3 +6,4 @@ add_subdirectory(ssh) add_subdirectory(slurm_global_sync) add_subdirectory(timestamp) add_subdirectory(stdout_stderr_test) +add_subdirectory(amdgpu) \ No newline at end of file diff --git a/src/hooks/amdgpu/AmdGpuHook.cpp b/src/hooks/amdgpu/AmdGpuHook.cpp new file mode 100644 index 00000000..fb94d7b1 --- /dev/null +++ b/src/hooks/amdgpu/AmdGpuHook.cpp @@ -0,0 +1,115 @@ +/* + * Sarus + * + * Copyright (c) 2018-2020, ETH Zurich. All rights reserved. + * + * Please, refer to the LICENSE file in the root directory. 
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ */
+
+#include "AmdGpuHook.hpp"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include "common/Logger.hpp"
+#include "common/Utility.hpp"
+#include "hooks/common/Utility.hpp"
+#include "runtime/mount_utilities.hpp"
+
+namespace sarus {
+namespace hooks {
+namespace amdgpu {
+
+// Reads the container state from stdin, enters the namespaces of the container process and parses config.json.
+AmdGpuHook::AmdGpuHook() {
+    log("Initializing hook", sarus::common::LogLevel::INFO);
+    std::tie(bundleDir, pidOfContainer) = hooks::common::utility::parseStateOfContainerFromStdin();
+    sarus::hooks::common::utility::enterNamespacesOfProcess(pidOfContainer);
+    parseConfigJSONOfBundle();
+    log("Successfully initialized hook", sarus::common::LogLevel::INFO);
+}
+
+// Activates AMD GPU support by bind mounting the GPU device files into the rootfs of the container.
+void AmdGpuHook::activate() {
+    log("Activating AMD GPU support", sarus::common::LogLevel::INFO);
+    bindMountDevices();
+    log("Successfully activated AMD GPU support", sarus::common::LogLevel::INFO);
+}
+
+// Parses config.json in the bundle directory: applies the logging configuration (if available),
+// resolves the rootfs directory and reads the uid and gid of the container's user.
+void AmdGpuHook::parseConfigJSONOfBundle() {
+    log("Parsing bundle's config.json", sarus::common::LogLevel::INFO);
+
+    auto json = sarus::common::readJSON(bundleDir / "config.json");
+    hooks::common::utility::applyLoggingConfigIfAvailable(json);
+
+    // a relative root path is resolved against the bundle directory
+    auto root = boost::filesystem::path{ json["root"]["path"].GetString() };
+    rootfsDir = root.is_absolute() ? root : bundleDir / root;
+
+    // get uid + gid of user
+    uidOfUser = json["process"]["user"]["uid"].GetInt();
+    gidOfUser = json["process"]["user"]["gid"].GetInt();
+
+    log("Successfully parsed bundle's config.json", sarus::common::LogLevel::INFO);
+}
+
+// Bind mounts (MS_REC) the device files /dev/kfd and /dev/dri to the same locations below the rootfs.
+void AmdGpuHook::bindMountDevices() const {
+    log("Performing bind mounts", sarus::common::LogLevel::INFO);
+    for(const auto& device : {"/dev/kfd", "/dev/dri"}) {
+        const auto source = boost::filesystem::path{device};
+        // Append only the relative part of the source: depending on the Boost.Filesystem version, appending
+        // an absolute path replaces the left-hand side, and the destination would no longer be in the rootfs.
+        validatedBindMount(source, rootfsDir / source.relative_path(), MS_REC);
+    }
+    log("Successfully performed bind mounts", sarus::common::LogLevel::INFO);
+}
+
+// Bind mounts 'from' onto 'to' with the given mount flags. Source and destination are validated
+// beforehand with the identity (uid, gid) of the container's user; 'to' is created if necessary.
+void AmdGpuHook::validatedBindMount(const boost::filesystem::path& from, const boost::filesystem::path& to, unsigned long flags) const {
+    auto rootIdentity = sarus::common::UserIdentity{};
+    auto userIdentity = sarus::common::UserIdentity(uidOfUser, gidOfUser, {});
+
+    // Validate mount source is visible for user and destination is on allowed device
+    sarus::common::switchIdentity(userIdentity);
+    sarus::runtime::validateMountSource(from);
+    sarus::runtime::validateMountDestination(to, bundleDir, rootfsDir);
+    sarus::common::switchIdentity(rootIdentity);
+
+    // Create file or folder if necessary, after validation
+    if(boost::filesystem::is_directory(from)) {
+        sarus::common::createFoldersIfNecessary(to);
+    }
+    else {
+        sarus::common::createFileIfNecessary(to);
+    }
+    sarus::runtime::bindMount(from, to, flags);
+}
+
+// Logs a plain message through the Sarus logger, tagged as "AMD GPU hook".
+void AmdGpuHook::log(const std::string& message, sarus::common::LogLevel level) const {
+    sarus::common::Logger::getInstance().log(message, "AMD GPU hook", level);
+}
+
+// Logs a formatted message (converted with boost::format::str) through the Sarus logger.
+void AmdGpuHook::log(const boost::format& message, sarus::common::LogLevel level) const {
+    sarus::common::Logger::getInstance().log(message.str(), "AMD GPU hook", level);
+}
+
+}}} // namespace
diff --git a/src/hooks/amdgpu/AmdGpuHook.hpp b/src/hooks/amdgpu/AmdGpuHook.hpp
new file mode 100644
index 00000000..86803cfc
--- /dev/null
+++ b/src/hooks/amdgpu/AmdGpuHook.hpp
@@ -0,0 +1,49 @@
+/*
+ * Sarus
+ *
+ * Copyright (c) 2018-2020, ETH Zurich. All rights reserved.
+ *
+ * Please, refer to the LICENSE file in the root directory.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ */
+
+#ifndef sarus_hooks_amdgpu_AmdGpuHook_hpp
+#define sarus_hooks_amdgpu_AmdGpuHook_hpp
+
+#include
+#include
+#include
+#include
+#include
+
+#include "common/LogLevel.hpp"
+#include "common/PathHash.hpp"
+
+namespace sarus {
+namespace hooks {
+namespace amdgpu {
+
+// Hook that bind mounts the AMD GPU device files (/dev/kfd, /dev/dri) into the rootfs of the container.
+class AmdGpuHook {
+public:
+    AmdGpuHook();    // reads the container state from stdin and parses the bundle's config.json
+    void activate(); // performs the bind mounts
+
+private:
+    void parseConfigJSONOfBundle();
+    void bindMountDevices() const;
+    void validatedBindMount(const boost::filesystem::path& from, const boost::filesystem::path& to, unsigned long flags) const;
+    void log(const std::string& message, sarus::common::LogLevel level) const;
+    void log(const boost::format& message, sarus::common::LogLevel level) const;
+
+    boost::filesystem::path bundleDir; // bundle directory, taken from the container state
+    boost::filesystem::path rootfsDir; // root.path of config.json, resolved against bundleDir if relative
+    pid_t pidOfContainer{};            // scalars are value-initialized so they never hold indeterminate values
+    uid_t uidOfUser{};                 // process.user.uid of config.json
+    gid_t gidOfUser{};                 // process.user.gid of config.json
+};
+
+}}} // namespace
+
+#endif
diff --git a/src/hooks/amdgpu/CMakeLists.txt b/src/hooks/amdgpu/CMakeLists.txt
new file mode 100644
index 00000000..f38d6ec2
--- /dev/null
+++ b/src/hooks/amdgpu/CMakeLists.txt
@@ -0,0 +1,14 @@
+
+file(GLOB hooks_amdgpu_srcs "*.cpp" "*.c")
+list(REMOVE_ITEM hooks_amdgpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp)
+add_library(hooks_amdgpu_library STATIC ${hooks_amdgpu_srcs})
+target_link_libraries(hooks_amdgpu_library runtime_library hooks_common_library common_library)
+
+add_executable(amdgpu_hook "main.cpp")
+target_link_libraries(amdgpu_hook hooks_amdgpu_library)
+install(TARGETS amdgpu_hook DESTINATION ${CMAKE_INSTALL_PREFIX}/bin PERMISSIONS
+    OWNER_READ OWNER_WRITE OWNER_EXECUTE
+    GROUP_READ GROUP_EXECUTE
+    WORLD_READ WORLD_EXECUTE)
+
+# add_subdirectory(test)
\ No newline at end of file
diff --git a/src/hooks/amdgpu/main.cpp b/src/hooks/amdgpu/main.cpp
new file mode 100644
index 00000000..5a440a91
--- /dev/null
+++ b/src/hooks/amdgpu/main.cpp
@@ -0,0 +1,24 @@
+/*
+ * Sarus
+ *
+ * Copyright (c) 2018-2020, ETH Zurich. All rights reserved.
+ *
+ * Please, refer to the LICENSE file in the root directory.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ */
+
+#include "common/Error.hpp"
+#include "common/Logger.hpp"
+#include "hooks/common/Utility.hpp"
+#include "AmdGpuHook.hpp"
+
+// Runs the AMD GPU hook; a Sarus error is logged with its trace and reported as EXIT_FAILURE.
+int main(int argc, char* argv[]) {
+    try {
+        sarus::hooks::amdgpu::AmdGpuHook{}.activate();
+    } catch(const sarus::common::Error& e) {
+        sarus::common::Logger::getInstance().logErrorTrace(e, "AMD GPU hook");
+        return EXIT_FAILURE;
+    }
+} // reaching the end of main() is equivalent to 'return 0'
diff --git a/src/runtime/ConfigsMerger.cpp b/src/runtime/ConfigsMerger.cpp
index 51129f29..6bb30854 100644
--- a/src/runtime/ConfigsMerger.cpp
+++ b/src/runtime/ConfigsMerger.cpp
@@ -62,6 +62,10 @@ std::unordered_map ConfigsMerger::getBundleAnnotations
         annotations["com.hooks.ssh.enabled"] = "true";
     }
 
+    if(config->commandRun.enableAmdGpu) {
+        annotations["com.hooks.amdgpu.enabled"] = "true";
+    }
+
     using IntType = typename std::underlying_type::type;
     auto level = static_cast(common::Logger::getInstance().getLevel());
     annotations["com.hooks.logging.level"] = std::to_string(level);

From 0fa9c5e35deb8d8ef770ed80f1f0f7f551708ccb Mon Sep 17 00:00:00 2001
From: Harmen Stoppels
Date: Thu, 21 Jan 2021 15:29:44 +0100
Subject: [PATCH 2/2] Show the source path in error when mounting non-existing path

---
 src/runtime/mount_utilities.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/runtime/mount_utilities.cpp b/src/runtime/mount_utilities.cpp
index 117316fb..0e5a6b6b 100644
--- a/src/runtime/mount_utilities.cpp
+++ b/src/runtime/mount_utilities.cpp
@@ -32,7 +32,8 @@ void validateMountSource(const boost::filesystem::path& source) {
     utility::logMessage(boost::format("Validating mount source: %s") % source, common::LogLevel::DEBUG);
     // check that directory exists, i.e. is visible to user
     if (!boost::filesystem::exists(source)) {
-        SARUS_THROW_ERROR("mount source doesn't exist");
+        auto msg = boost::format("mount source %s does not exist") % source;
+        SARUS_THROW_ERROR(msg.str());
     }
     utility::logMessage(std::string("Mount source successfully validated"), common::LogLevel::DEBUG);
 }