From 44dc3d00a3a8c7b4ca85722ee8dd8489bbb6c77e Mon Sep 17 00:00:00 2001 From: Gigon Bae Date: Thu, 26 Oct 2023 12:54:33 -0700 Subject: [PATCH] Apply pybind11 patch to avoid deadlock This applies the following patches to pybind11: - https://github.com/pybind/pybind11/pull/4857 - https://github.com/pybind/pybind11/pull/4877 to avoid deadlock when using pybind11 without importing numpy in multi-threaded environment. --- cpp/cmake/deps/pybind11.cmake | 1 + cpp/cmake/deps/pybind11_pr4857_4877.patch | 247 +++++++++++++++++++ python/cmake/deps/pybind11.cmake | 1 + python/cmake/deps/pybind11_pr4857_4877.patch | 247 +++++++++++++++++++ 4 files changed, 496 insertions(+) create mode 100644 cpp/cmake/deps/pybind11_pr4857_4877.patch create mode 100644 python/cmake/deps/pybind11_pr4857_4877.patch diff --git a/cpp/cmake/deps/pybind11.cmake b/cpp/cmake/deps/pybind11.cmake index 2bab5c8b4..e4ea6f583 100644 --- a/cpp/cmake/deps/pybind11.cmake +++ b/cpp/cmake/deps/pybind11.cmake @@ -19,6 +19,7 @@ if (NOT TARGET deps::pybind11) GIT_REPOSITORY https://github.com/pybind/pybind11.git GIT_TAG v2.11.1 GIT_SHALLOW TRUE + PATCH_COMMAND git apply "${CMAKE_CURRENT_LIST_DIR}/pybind11_pr4857_4877.patch" || true ) FetchContent_GetProperties(deps-pybind11) if (NOT deps-pybind11_POPULATED) diff --git a/cpp/cmake/deps/pybind11_pr4857_4877.patch b/cpp/cmake/deps/pybind11_pr4857_4877.patch new file mode 100644 index 000000000..d6abec392 --- /dev/null +++ b/cpp/cmake/deps/pybind11_pr4857_4877.patch @@ -0,0 +1,247 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 87ec103..f2f1d19 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -132,6 +132,7 @@ set(PYBIND11_HEADERS + include/pybind11/embed.h + include/pybind11/eval.h + include/pybind11/gil.h ++ include/pybind11/gil_safe_call_once.h + include/pybind11/iostream.h + include/pybind11/functional.h + include/pybind11/numpy.h +diff --git a/include/pybind11/detail/common.h b/include/pybind11/detail/common.h +index 31a54c7..8906c1f 100644 +--- a/include/pybind11/detail/common.h ++++ b/include/pybind11/detail/common.h +@@ -118,6 +118,14 @@ + # endif + #endif + ++#if defined(PYBIND11_CPP20) ++# define PYBIND11_CONSTINIT constinit ++# define PYBIND11_DTOR_CONSTEXPR constexpr ++#else ++# define PYBIND11_CONSTINIT ++# define PYBIND11_DTOR_CONSTEXPR ++#endif ++ + // Compiler version assertions + #if defined(__INTEL_COMPILER) + # if __INTEL_COMPILER < 1800 +diff --git a/include/pybind11/gil.h b/include/pybind11/gil.h +index 570a558..da22f48 100644 +--- a/include/pybind11/gil.h ++++ b/include/pybind11/gil.h +@@ -11,6 +11,8 @@ + + #include "detail/common.h" + ++#include ++ + #if defined(WITH_THREAD) && !defined(PYBIND11_SIMPLE_GIL_MANAGEMENT) + # include "detail/internals.h" + #endif +@@ -137,7 +139,9 @@ private: + + class gil_scoped_release { + public: ++ // PRECONDITION: The GIL must be held when this constructor is called. + explicit gil_scoped_release(bool disassoc = false) : disassoc(disassoc) { ++ assert(PyGILState_Check()); + // `get_internals()` must be called here unconditionally in order to initialize + // `internals.tstate` for subsequent `gil_scoped_acquire` calls. Otherwise, an + // initialization race could occur as multiple threads try `gil_scoped_acquire`. +@@ -201,7 +205,11 @@ class gil_scoped_release { + PyThreadState *state; + + public: +- gil_scoped_release() : state{PyEval_SaveThread()} {} ++ // PRECONDITION: The GIL must be held when this constructor is called. ++ gil_scoped_release() { ++ assert(PyGILState_Check()); ++ state = PyEval_SaveThread(); ++ } + gil_scoped_release(const gil_scoped_release &) = delete; + gil_scoped_release &operator=(const gil_scoped_release &) = delete; + ~gil_scoped_release() { PyEval_RestoreThread(state); } +diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h +new file mode 100644 +index 0000000..58b90b8 +--- /dev/null ++++ b/include/pybind11/gil_safe_call_once.h +@@ -0,0 +1,90 @@ ++// Copyright (c) 2023 The pybind Community. ++ ++ ++#include "detail/common.h" ++#include "gil.h" ++ ++#include ++#include ++ ++PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) ++ ++// Use the `gil_safe_call_once_and_store` class below instead of the naive ++// ++// static auto imported_obj = py::module_::import("module_name"); // BAD, DO NOT USE! ++// ++// which has two serious issues: ++// ++// 1. Py_DECREF() calls potentially after the Python interpreter was finalized already, and ++// 2. deadlocks in multi-threaded processes (because of missing lock ordering). ++// ++// The following alternative avoids both problems: ++// ++// PYBIND11_CONSTINIT static py::gil_safe_call_once_and_store storage; ++// auto &imported_obj = storage // Do NOT make this `static`! ++// .call_once_and_store_result([]() { ++// return py::module_::import("module_name"); ++// }) ++// .get_stored(); ++// ++// The parameter of `call_once_and_store_result()` must be callable. It can make ++// CPython API calls, and in particular, it can temporarily release the GIL. ++// ++// `T` can be any C++ type, it does not have to involve CPython API types. ++// ++// The behavior with regard to signals, e.g. `SIGINT` (`KeyboardInterrupt`), ++// is not ideal. If the main thread is the one to actually run the `Callable`, ++// then a `KeyboardInterrupt` will interrupt it if it is running normal Python ++// code. The situation is different if a non-main thread runs the ++// `Callable`, and then the main thread starts waiting for it to complete: ++// a `KeyboardInterrupt` will not interrupt the non-main thread, but it will ++// get processed only when it is the main thread's turn again and it is running ++// normal Python code. However, this will be unnoticeable for quick call-once ++// functions, which is usually the case. ++template ++class gil_safe_call_once_and_store { ++public: ++ // PRECONDITION: The GIL must be held when `call_once_and_store_result()` is called. ++ template ++ gil_safe_call_once_and_store &call_once_and_store_result(Callable &&fn) { ++ if (!is_initialized_) { // This read is guarded by the GIL. ++ // Multiple threads may enter here, because the GIL is released in the next line and ++ // CPython API calls in the `fn()` call below may release and reacquire the GIL. ++ gil_scoped_release gil_rel; // Needed to establish lock ordering. ++ std::call_once(once_flag_, [&] { ++ // Only one thread will ever enter here. ++ gil_scoped_acquire gil_acq; ++ ::new (storage_) T(fn()); // fn may release, but will reacquire, the GIL. ++ is_initialized_ = true; // This write is guarded by the GIL. ++ }); ++ // All threads will observe `is_initialized_` as true here. ++ } ++ // Intentionally not returning `T &` to ensure the calling code is self-documenting. ++ return *this; ++ } ++ ++ // This must only be called after `call_once_and_store_result()` was called. ++ T &get_stored() { ++ assert(is_initialized_); ++ PYBIND11_WARNING_PUSH ++#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5 ++ // Needed for gcc 4.8.5 ++ PYBIND11_WARNING_DISABLE_GCC("-Wstrict-aliasing") ++#endif ++ return *reinterpret_cast(storage_); ++ PYBIND11_WARNING_POP ++ } ++ ++ constexpr gil_safe_call_once_and_store() = default; ++ PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() = default; ++ ++private: ++ alignas(T) char storage_[sizeof(T)] = {}; ++ std::once_flag once_flag_ = {}; ++ bool is_initialized_ = false; ++ // The `is_initialized_`-`storage_` pair is very similar to `std::optional`, ++ // but the latter does not have the triviality properties of former, ++ // therefore `std::optional` is not a viable alternative here. ++}; ++ ++PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) +diff --git a/include/pybind11/numpy.h b/include/pybind11/numpy.h +index 36077ec..1217c8d 100644 +--- a/include/pybind11/numpy.h ++++ b/include/pybind11/numpy.h +@@ -10,7 +10,10 @@ + #pragma once + + #include "pybind11.h" ++#include "detail/common.h" + #include "complex.h" ++#include "gil_safe_call_once.h" ++#include "pytypes.h" + + #include + #include +@@ -120,6 +123,20 @@ inline numpy_internals &get_numpy_internals() { + return *ptr; + } + ++PYBIND11_NOINLINE module_ import_numpy_core_submodule(const char *submodule_name) { ++ module_ numpy = module_::import("numpy"); ++ str version_string = numpy.attr("__version__"); ++ ++ module_ numpy_lib = module_::import("numpy.lib"); ++ object numpy_version = numpy_lib.attr("NumpyVersion")(version_string); ++ int major_version = numpy_version.attr("major").cast(); ++ ++ /* `numpy.core` was renamed to `numpy._core` in NumPy 2.0 as it officially ++ became a private module. */ ++ std::string numpy_core_path = major_version >= 2 ? "numpy._core" : "numpy.core"; ++ return module_::import((numpy_core_path + "." + submodule_name).c_str()); ++} ++ + template + struct same_size { + template +@@ -192,8 +209,8 @@ struct npy_api { + }; + + static npy_api &get() { +- static npy_api api = lookup(); +- return api; ++ PYBIND11_CONSTINIT static gil_safe_call_once_and_store storage; ++ return storage.call_once_and_store_result(lookup).get_stored(); + } + + bool PyArray_Check_(PyObject *obj) const { +@@ -263,9 +280,13 @@ private: + }; + + static npy_api lookup() { +- module_ m = module_::import("numpy.core.multiarray"); ++ module_ m = detail::import_numpy_core_submodule("multiarray"); + auto c = m.attr("_ARRAY_API"); + void **api_ptr = (void **) PyCapsule_GetPointer(c.ptr(), nullptr); ++ if (api_ptr == nullptr) { ++ raise_from(PyExc_SystemError, "FAILURE obtaining numpy _ARRAY_API pointer."); ++ throw error_already_set(); ++ } + npy_api api; + #define DECL_NPY_API(Func) api.Func##_ = (decltype(api.Func##_)) api_ptr[API_##Func]; + DECL_NPY_API(PyArray_GetNDArrayCFeatureVersion); +@@ -625,13 +646,14 @@ public: + char flags() const { return detail::array_descriptor_proxy(m_ptr)->flags; } + + private: +- static object _dtype_from_pep3118() { +- static PyObject *obj = module_::import("numpy.core._internal") +- .attr("_dtype_from_pep3118") +- .cast() +- .release() +- .ptr(); +- return reinterpret_borrow(obj); ++ static object &_dtype_from_pep3118() { ++ PYBIND11_CONSTINIT static gil_safe_call_once_and_store storage; ++ return storage ++ .call_once_and_store_result([]() { ++ return detail::import_numpy_core_submodule("_internal") ++ .attr("_dtype_from_pep3118"); ++ }) ++ .get_stored(); + } + + dtype strip_padding(ssize_t itemsize) { diff --git a/python/cmake/deps/pybind11.cmake b/python/cmake/deps/pybind11.cmake index 7347ed651..ab0eab7f8 100644 --- a/python/cmake/deps/pybind11.cmake +++ b/python/cmake/deps/pybind11.cmake @@ -19,6 +19,7 @@ if (NOT TARGET deps::pybind11) GIT_REPOSITORY https://github.com/pybind/pybind11.git GIT_TAG v2.11.1 GIT_SHALLOW TRUE + PATCH_COMMAND git apply "${CMAKE_CURRENT_LIST_DIR}/pybind11_pr4857_4877.patch" || true ) FetchContent_GetProperties(deps-pybind11) if (NOT deps-pybind11_POPULATED) diff --git a/python/cmake/deps/pybind11_pr4857_4877.patch b/python/cmake/deps/pybind11_pr4857_4877.patch new file mode 100644 index 000000000..d6abec392 --- /dev/null +++ b/python/cmake/deps/pybind11_pr4857_4877.patch @@ -0,0 +1,247 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 87ec103..f2f1d19 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -132,6 +132,7 @@ set(PYBIND11_HEADERS + include/pybind11/embed.h + include/pybind11/eval.h + include/pybind11/gil.h ++ include/pybind11/gil_safe_call_once.h + include/pybind11/iostream.h + include/pybind11/functional.h + include/pybind11/numpy.h +diff --git a/include/pybind11/detail/common.h b/include/pybind11/detail/common.h +index 31a54c7..8906c1f 100644 +--- a/include/pybind11/detail/common.h ++++ b/include/pybind11/detail/common.h +@@ -118,6 +118,14 @@ + # endif + #endif + ++#if defined(PYBIND11_CPP20) ++# define PYBIND11_CONSTINIT constinit ++# define PYBIND11_DTOR_CONSTEXPR constexpr ++#else ++# define PYBIND11_CONSTINIT ++# define PYBIND11_DTOR_CONSTEXPR ++#endif ++ + // Compiler version assertions + #if defined(__INTEL_COMPILER) + # if __INTEL_COMPILER < 1800 +diff --git a/include/pybind11/gil.h b/include/pybind11/gil.h +index 570a558..da22f48 100644 +--- a/include/pybind11/gil.h ++++ b/include/pybind11/gil.h +@@ -11,6 +11,8 @@ + + #include "detail/common.h" + ++#include ++ + #if defined(WITH_THREAD) && !defined(PYBIND11_SIMPLE_GIL_MANAGEMENT) + # include "detail/internals.h" + #endif +@@ -137,7 +139,9 @@ private: + + class gil_scoped_release { + public: ++ // PRECONDITION: The GIL must be held when this constructor is called. + explicit gil_scoped_release(bool disassoc = false) : disassoc(disassoc) { ++ assert(PyGILState_Check()); + // `get_internals()` must be called here unconditionally in order to initialize + // `internals.tstate` for subsequent `gil_scoped_acquire` calls. Otherwise, an + // initialization race could occur as multiple threads try `gil_scoped_acquire`. +@@ -201,7 +205,11 @@ class gil_scoped_release { + PyThreadState *state; + + public: +- gil_scoped_release() : state{PyEval_SaveThread()} {} ++ // PRECONDITION: The GIL must be held when this constructor is called. ++ gil_scoped_release() { ++ assert(PyGILState_Check()); ++ state = PyEval_SaveThread(); ++ } + gil_scoped_release(const gil_scoped_release &) = delete; + gil_scoped_release &operator=(const gil_scoped_release &) = delete; + ~gil_scoped_release() { PyEval_RestoreThread(state); } +diff --git a/include/pybind11/gil_safe_call_once.h b/include/pybind11/gil_safe_call_once.h +new file mode 100644 +index 0000000..58b90b8 +--- /dev/null ++++ b/include/pybind11/gil_safe_call_once.h +@@ -0,0 +1,90 @@ ++// Copyright (c) 2023 The pybind Community. ++ ++ ++#include "detail/common.h" ++#include "gil.h" ++ ++#include ++#include ++ ++PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE) ++ ++// Use the `gil_safe_call_once_and_store` class below instead of the naive ++// ++// static auto imported_obj = py::module_::import("module_name"); // BAD, DO NOT USE! ++// ++// which has two serious issues: ++// ++// 1. Py_DECREF() calls potentially after the Python interpreter was finalized already, and ++// 2. deadlocks in multi-threaded processes (because of missing lock ordering). ++// ++// The following alternative avoids both problems: ++// ++// PYBIND11_CONSTINIT static py::gil_safe_call_once_and_store storage; ++// auto &imported_obj = storage // Do NOT make this `static`! ++// .call_once_and_store_result([]() { ++// return py::module_::import("module_name"); ++// }) ++// .get_stored(); ++// ++// The parameter of `call_once_and_store_result()` must be callable. It can make ++// CPython API calls, and in particular, it can temporarily release the GIL. ++// ++// `T` can be any C++ type, it does not have to involve CPython API types. ++// ++// The behavior with regard to signals, e.g. `SIGINT` (`KeyboardInterrupt`), ++// is not ideal. If the main thread is the one to actually run the `Callable`, ++// then a `KeyboardInterrupt` will interrupt it if it is running normal Python ++// code. The situation is different if a non-main thread runs the ++// `Callable`, and then the main thread starts waiting for it to complete: ++// a `KeyboardInterrupt` will not interrupt the non-main thread, but it will ++// get processed only when it is the main thread's turn again and it is running ++// normal Python code. However, this will be unnoticeable for quick call-once ++// functions, which is usually the case. ++template ++class gil_safe_call_once_and_store { ++public: ++ // PRECONDITION: The GIL must be held when `call_once_and_store_result()` is called. ++ template ++ gil_safe_call_once_and_store &call_once_and_store_result(Callable &&fn) { ++ if (!is_initialized_) { // This read is guarded by the GIL. ++ // Multiple threads may enter here, because the GIL is released in the next line and ++ // CPython API calls in the `fn()` call below may release and reacquire the GIL. ++ gil_scoped_release gil_rel; // Needed to establish lock ordering. ++ std::call_once(once_flag_, [&] { ++ // Only one thread will ever enter here. ++ gil_scoped_acquire gil_acq; ++ ::new (storage_) T(fn()); // fn may release, but will reacquire, the GIL. ++ is_initialized_ = true; // This write is guarded by the GIL. ++ }); ++ // All threads will observe `is_initialized_` as true here. ++ } ++ // Intentionally not returning `T &` to ensure the calling code is self-documenting. ++ return *this; ++ } ++ ++ // This must only be called after `call_once_and_store_result()` was called. ++ T &get_stored() { ++ assert(is_initialized_); ++ PYBIND11_WARNING_PUSH ++#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5 ++ // Needed for gcc 4.8.5 ++ PYBIND11_WARNING_DISABLE_GCC("-Wstrict-aliasing") ++#endif ++ return *reinterpret_cast(storage_); ++ PYBIND11_WARNING_POP ++ } ++ ++ constexpr gil_safe_call_once_and_store() = default; ++ PYBIND11_DTOR_CONSTEXPR ~gil_safe_call_once_and_store() = default; ++ ++private: ++ alignas(T) char storage_[sizeof(T)] = {}; ++ std::once_flag once_flag_ = {}; ++ bool is_initialized_ = false; ++ // The `is_initialized_`-`storage_` pair is very similar to `std::optional`, ++ // but the latter does not have the triviality properties of former, ++ // therefore `std::optional` is not a viable alternative here. ++}; ++ ++PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE) +diff --git a/include/pybind11/numpy.h b/include/pybind11/numpy.h +index 36077ec..1217c8d 100644 +--- a/include/pybind11/numpy.h ++++ b/include/pybind11/numpy.h +@@ -10,7 +10,10 @@ + #pragma once + + #include "pybind11.h" ++#include "detail/common.h" + #include "complex.h" ++#include "gil_safe_call_once.h" ++#include "pytypes.h" + + #include + #include +@@ -120,6 +123,20 @@ inline numpy_internals &get_numpy_internals() { + return *ptr; + } + ++PYBIND11_NOINLINE module_ import_numpy_core_submodule(const char *submodule_name) { ++ module_ numpy = module_::import("numpy"); ++ str version_string = numpy.attr("__version__"); ++ ++ module_ numpy_lib = module_::import("numpy.lib"); ++ object numpy_version = numpy_lib.attr("NumpyVersion")(version_string); ++ int major_version = numpy_version.attr("major").cast(); ++ ++ /* `numpy.core` was renamed to `numpy._core` in NumPy 2.0 as it officially ++ became a private module. */ ++ std::string numpy_core_path = major_version >= 2 ? "numpy._core" : "numpy.core"; ++ return module_::import((numpy_core_path + "." + submodule_name).c_str()); ++} ++ + template + struct same_size { + template +@@ -192,8 +209,8 @@ struct npy_api { + }; + + static npy_api &get() { +- static npy_api api = lookup(); +- return api; ++ PYBIND11_CONSTINIT static gil_safe_call_once_and_store storage; ++ return storage.call_once_and_store_result(lookup).get_stored(); + } + + bool PyArray_Check_(PyObject *obj) const { +@@ -263,9 +280,13 @@ private: + }; + + static npy_api lookup() { +- module_ m = module_::import("numpy.core.multiarray"); ++ module_ m = detail::import_numpy_core_submodule("multiarray"); + auto c = m.attr("_ARRAY_API"); + void **api_ptr = (void **) PyCapsule_GetPointer(c.ptr(), nullptr); ++ if (api_ptr == nullptr) { ++ raise_from(PyExc_SystemError, "FAILURE obtaining numpy _ARRAY_API pointer."); ++ throw error_already_set(); ++ } + npy_api api; + #define DECL_NPY_API(Func) api.Func##_ = (decltype(api.Func##_)) api_ptr[API_##Func]; + DECL_NPY_API(PyArray_GetNDArrayCFeatureVersion); +@@ -625,13 +646,14 @@ public: + char flags() const { return detail::array_descriptor_proxy(m_ptr)->flags; } + + private: +- static object _dtype_from_pep3118() { +- static PyObject *obj = module_::import("numpy.core._internal") +- .attr("_dtype_from_pep3118") +- .cast() +- .release() +- .ptr(); +- return reinterpret_borrow(obj); ++ static object &_dtype_from_pep3118() { ++ PYBIND11_CONSTINIT static gil_safe_call_once_and_store storage; ++ return storage ++ .call_once_and_store_result([]() { ++ return detail::import_numpy_core_submodule("_internal") ++ .attr("_dtype_from_pep3118"); ++ }) ++ .get_stored(); + } + + dtype strip_padding(ssize_t itemsize) {