Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AMD GPU support #20

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion etc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ install(FILES sarus.schema.json hook.schema.json definitions.schema.json DESTINA
install(DIRECTORY templates DESTINATION ${CMAKE_INSTALL_PREFIX}/etc)

install(FILES templates/hooks.d/07-ssh-hook.json.in DESTINATION ${CMAKE_INSTALL_PREFIX}/etc/hooks.d)
install(FILES templates/hooks.d/09-slurm-global-sync-hook.json.in DESTINATION ${CMAKE_INSTALL_PREFIX}/etc/hooks.d)
install(FILES templates/hooks.d/09-slurm-global-sync-hook.json.in DESTINATION ${CMAKE_INSTALL_PREFIX}/etc/hooks.d)
install(FILES templates/hooks.d/11-amdgpu-hook.json.in DESTINATION ${CMAKE_INSTALL_PREFIX}/etc/hooks.d)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would not add this hook to a default installation, mainly because it is targeted at very specific hardware, and therefore it should be explicitly chosen by the system administrator (like the MPI and NVIDIA hooks).
Another reason would be that at present we have no way to test it as part of the automated tests.

12 changes: 12 additions & 0 deletions etc/templates/hooks.d/11-amdgpu-hook.json.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"version": "1.0.0",
"hook": {
"path": "@INSTALL_PATH@/bin/amdgpu_hook"
},
"when": {
"annotations": {
"^com.hooks.amdgpu.enabled$": "^true$"
}
},
"stages": ["prestart"]
}
4 changes: 4 additions & 0 deletions src/cli/CommandRun.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ class CommandRun : public Command {
boost::program_options::value<std::vector<std::string>>(&conf->commandRun.userMounts),
"Mount custom directories into the container")
("mpi,m", "Enable MPI support. Implies '--glibc'")
("amdgpu", "Enable AMD GPU support.")
("ssh", "Enable SSH in the container")
("tty,t", "Allocate a pseudo-TTY in the container")
("workdir,w",
Expand Down Expand Up @@ -158,6 +159,9 @@ class CommandRun : public Command {
if(values.count("mpi")) {
conf->commandRun.useMPI = true;
}
if(values.count("amdgpu")) {
conf->commandRun.enableAmdGpu = true;
}
if(values.count("ssh")) {
conf->commandRun.enableSSH = true;
}
Expand Down
6 changes: 6 additions & 0 deletions src/cli/test/test_CLI.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ TEST(CLITestGroup, generated_config_for_CommandRun) {
CHECK_EQUAL(conf->commandRun.addInitProcess, false);
CHECK_EQUAL(conf->commandRun.mounts.size(), 1); // 1 site mount + 0 user mount
CHECK_EQUAL(conf->commandRun.useMPI, false);
CHECK_EQUAL(conf->commandRun.enableAmdGpu, false);
CHECK_EQUAL(conf->commandRun.enableGlibcReplacement, 0);
CHECK_EQUAL(conf->commandRun.enableSSH, false);
CHECK_EQUAL(conf->commandRun.allocatePseudoTTY, false);
Expand Down Expand Up @@ -247,6 +248,11 @@ TEST(CLITestGroup, generated_config_for_CommandRun) {
conf = generateConfig({"run", "-m", "image"});
CHECK_EQUAL(conf->commandRun.useMPI, true);
}
// amdgpu
{
auto conf = generateConfig({"run", "--amdgpu", "image"});
CHECK_EQUAL(conf->commandRun.enableAmdGpu, true);
}
// ssh
{
auto conf = generateConfig({"run", "--ssh", "image"});
Expand Down
1 change: 1 addition & 0 deletions src/common/Config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ class Config {
bool useMPI = false;
bool enableGlibcReplacement = false;
bool enableSSH = false;
bool enableAmdGpu = false;
};

boost::filesystem::path getImageFile() const;
Expand Down
1 change: 1 addition & 0 deletions src/hooks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ add_subdirectory(ssh)
add_subdirectory(slurm_global_sync)
add_subdirectory(timestamp)
add_subdirectory(stdout_stderr_test)
add_subdirectory(amdgpu)
115 changes: 115 additions & 0 deletions src/hooks/amdgpu/AmdGpuHook.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
/*
* Sarus
*
* Copyright (c) 2018-2020, ETH Zurich. All rights reserved.
*
* Please, refer to the LICENSE file in the root directory.
* SPDX-License-Identifier: BSD-3-Clause
*
*/

#include "AmdGpuHook.hpp"

#include <vector>
#include <fstream>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are all the headers here needed? For example, I don't think you need fstream, boost/regex, and possibly others.

#include <sstream>
#include <cstring>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

#include <boost/format.hpp>
#include <boost/regex.hpp>
#include <boost/algorithm/string.hpp>
#include <rapidjson/document.h>
#include <rapidjson/istreamwrapper.h>

#include "common/Logger.hpp"
#include "common/Utility.hpp"
#include "hooks/common/Utility.hpp"
#include "runtime/mount_utilities.hpp"

namespace sarus {
namespace hooks {
namespace amdgpu {

/**
 * Prepares the hook for activation.
 *
 * Reads the container state (bundle directory and container PID) from stdin,
 * calls enterNamespacesOfProcess() for that PID and then loads the settings
 * this hook needs from the bundle's config.json.
 */
AmdGpuHook::AmdGpuHook() {
    namespace hookUtility = sarus::hooks::common::utility;
    using sarus::common::LogLevel;

    log("Initializing hook", LogLevel::INFO);

    // The container state yields both the bundle location and the PID whose
    // namespaces are entered before anything else is done.
    std::tie(bundleDir, pidOfContainer) = hookUtility::parseStateOfContainerFromStdin();
    hookUtility::enterNamespacesOfProcess(pidOfContainer);

    // Needs bundleDir, hence must run after the state has been parsed.
    parseConfigJSONOfBundle();

    log("Successfully initialized hook", LogLevel::INFO);
}

void AmdGpuHook::activate() {
log("Activating AMD GPU support", sarus::common::LogLevel::INFO);

bindMountDevices();

log("Successfully activated AMD GPU support", sarus::common::LogLevel::INFO);
}

void AmdGpuHook::parseConfigJSONOfBundle() {
log("Parsing bundle's config.json", sarus::common::LogLevel::INFO);

auto json = sarus::common::readJSON(bundleDir / "config.json");

hooks::common::utility::applyLoggingConfigIfAvailable(json);

auto root = boost::filesystem::path{ json["root"]["path"].GetString() };
if(root.is_absolute()) {
rootfsDir = root;
}
else {
rootfsDir = bundleDir / root;
}

// get uid + gid of user
uidOfUser = json["process"]["user"]["uid"].GetInt();
gidOfUser = json["process"]["user"]["gid"].GetInt();

log("Successfully parsed bundle's config.json", sarus::common::LogLevel::INFO);
}

void AmdGpuHook::bindMountDevices() const {
log("Performing bind mounts", sarus::common::LogLevel::INFO);

for(const auto& mount : {"/dev/kfd", "/dev/dri"}) {
validatedBindMount(mount, rootfsDir / mount, MS_REC);
}

log("Successfully performed bind mounts", sarus::common::LogLevel::INFO);
}


/**
 * Validates a bind mount with the identity of the container user and, if the
 * validation passes, performs it with the hook's original identity.
 *
 * @param from   mount source
 * @param to     mount destination
 * @param flags  mount flags forwarded to sarus::runtime::bindMount()
 */
void AmdGpuHook::validatedBindMount(const boost::filesystem::path& from, const boost::filesystem::path& to, unsigned long flags) const {
    auto hookIdentity = sarus::common::UserIdentity{};
    auto containerUserIdentity = sarus::common::UserIdentity(uidOfUser, gidOfUser, {});

    // Run the checks as the container user: the source has to be visible to
    // that user and the destination has to be on an allowed device.
    sarus::common::switchIdentity(containerUserIdentity);
    sarus::runtime::validateMountSource(from);
    sarus::runtime::validateMountDestination(to, bundleDir, rootfsDir);
    sarus::common::switchIdentity(hookIdentity);

    // Only after a successful validation is the mount point created: a file
    // for anything that is not a directory, a folder hierarchy otherwise.
    const bool sourceIsDirectory = boost::filesystem::is_directory(from);
    if(!sourceIsDirectory) {
        sarus::common::createFileIfNecessary(to);
    }
    else {
        sarus::common::createFoldersIfNecessary(to);
    }

    sarus::runtime::bindMount(from, to, flags);
}

/**
 * Sends a message to the Sarus logger, tagged as coming from the AMD GPU hook.
 *
 * @param message  text to log
 * @param level    severity of the message
 */
void AmdGpuHook::log(const std::string& message, sarus::common::LogLevel level) const {
    auto& logger = sarus::common::Logger::getInstance();
    logger.log(message, "AMD GPU hook", level);
}

/**
 * Convenience overload for messages built with boost::format: the format is
 * rendered to a string and logged through the std::string overload.
 *
 * @param message  formatted message to log
 * @param level    severity of the message
 */
void AmdGpuHook::log(const boost::format& message, sarus::common::LogLevel level) const {
    const std::string renderedMessage = message.str();
    log(renderedMessage, level);
}

}}} // namespace
49 changes: 49 additions & 0 deletions src/hooks/amdgpu/AmdGpuHook.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/*
* Sarus
*
* Copyright (c) 2018-2020, ETH Zurich. All rights reserved.
*
* Please, refer to the LICENSE file in the root directory.
* SPDX-License-Identifier: BSD-3-Clause
*
*/

#ifndef sarus_hooks_amdgpu_AmdGpuHook_hpp
#define sarus_hooks_amdgpu_AmdGpuHook_hpp

#include <vector>
#include <unordered_map>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As pointed out for the .cpp file above, could you check whether all of these headers are actually used?

#include <boost/format.hpp>
#include <boost/filesystem.hpp>
#include <sys/types.h>

#include "common/LogLevel.hpp"
#include "common/PathHash.hpp"

namespace sarus {
namespace hooks {
namespace amdgpu {

/**
 * OCI hook that makes AMD GPUs available inside a container by bind mounting
 * /dev/kfd and /dev/dri into the container's root filesystem.
 *
 * Construction reads the container state from stdin and parses the bundle's
 * config.json; activate() then performs the bind mounts.
 */
class AmdGpuHook {
public:
    /** Reads the container state and the bundle configuration. */
    AmdGpuHook();

    /** Bind mounts the AMD GPU devices into the container. */
    void activate();

private:
    /** Extracts rootfs location, uid and gid from the bundle's config.json. */
    void parseConfigJSONOfBundle();

    /** Bind mounts every AMD GPU device entry into the container's rootfs. */
    void bindMountDevices() const;

    /** Validates a mount as the container user, then performs it. */
    void validatedBindMount(const boost::filesystem::path& from,
                            const boost::filesystem::path& to,
                            unsigned long flags) const;

    /** Logs a plain message tagged with this hook's name. */
    void log(const std::string& message, sarus::common::LogLevel level) const;

    /** Logs a boost::format message tagged with this hook's name. */
    void log(const boost::format& message, sarus::common::LogLevel level) const;

    boost::filesystem::path bundleDir;  // bundle directory, from the container state
    boost::filesystem::path rootfsDir;  // container root filesystem, from config.json
    pid_t pidOfContainer;               // PID of the container, from the container state
    uid_t uidOfUser;                    // uid of the container user, from config.json
    gid_t gidOfUser;                    // gid of the container user, from config.json
};

}}} // namespace

#endif
14 changes: 14 additions & 0 deletions src/hooks/amdgpu/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@

# Collect every C/C++ source of this directory for the hook's static library.
file(GLOB hooks_amdgpu_srcs "*.cpp" "*.c")
# main.cpp is compiled into the executable below, so keep it out of the library.
list(REMOVE_ITEM hooks_amdgpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp)
add_library(hooks_amdgpu_library STATIC ${hooks_amdgpu_srcs})
target_link_libraries(hooks_amdgpu_library runtime_library hooks_common_library common_library)

# The hook executable: main.cpp linked against the library defined above.
add_executable(amdgpu_hook "main.cpp")
target_link_libraries(amdgpu_hook hooks_amdgpu_library)
# Install the executable into <prefix>/bin, writable by the owner only and
# readable/executable by everyone.
install(TARGETS amdgpu_hook DESTINATION ${CMAKE_INSTALL_PREFIX}/bin PERMISSIONS
OWNER_READ OWNER_WRITE OWNER_EXECUTE
GROUP_READ GROUP_EXECUTE
WORLD_READ WORLD_EXECUTE)

# The test subdirectory is currently not part of the build.
# add_subdirectory(test)
24 changes: 24 additions & 0 deletions src/hooks/amdgpu/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
/*
* Sarus
*
* Copyright (c) 2018-2020, ETH Zurich. All rights reserved.
*
* Please, refer to the LICENSE file in the root directory.
* SPDX-License-Identifier: BSD-3-Clause
*
*/

#include "common/Error.hpp"
#include "common/Logger.hpp"
#include "hooks/common/Utility.hpp"
#include "AmdGpuHook.hpp"

int main(int argc, char* argv[]) {
try {
sarus::hooks::amdgpu::AmdGpuHook{}.activate();
} catch(const sarus::common::Error& e) {
sarus::common::Logger::getInstance().logErrorTrace(e, "AMD GPU hook");
exit(EXIT_FAILURE);
}
return 0;
}
4 changes: 4 additions & 0 deletions src/runtime/ConfigsMerger.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ std::unordered_map<std::string, std::string> ConfigsMerger::getBundleAnnotations
annotations["com.hooks.ssh.enabled"] = "true";
}

if(config->commandRun.enableAmdGpu) {
annotations["com.hooks.amdgpu.enabled"] = "true";
}

using IntType = typename std::underlying_type<common::LogLevel>::type;
auto level = static_cast<IntType>(common::Logger::getInstance().getLevel());
annotations["com.hooks.logging.level"] = std::to_string(level);
Expand Down
3 changes: 2 additions & 1 deletion src/runtime/mount_utilities.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ void validateMountSource(const boost::filesystem::path& source) {
utility::logMessage(boost::format("Validating mount source: %s") % source, common::LogLevel::DEBUG);
// check that directory exists, i.e. is visible to user
if (!boost::filesystem::exists(source)) {
SARUS_THROW_ERROR("mount source doesn't exist");
auto msg = boost::format("mount source %s does not exist") % source;
SARUS_THROW_ERROR(msg.str());
}
utility::logMessage(std::string("Mount source successfully validated"), common::LogLevel::DEBUG);
}
Expand Down