From 77ba5bbd5177264d3b6311b5b33f06b46c8c3d2a Mon Sep 17 00:00:00 2001 From: Florian Engelhardt Date: Fri, 14 Jun 2024 14:53:09 +0200 Subject: [PATCH 1/6] remove mutex --- profiling/src/allocation.rs | 5 +++-- profiling/src/exception.rs | 4 ++-- profiling/src/lib.rs | 35 +++++++------------------------ profiling/src/pcntl.rs | 19 +++++++---------- profiling/src/profiling/mod.rs | 38 ++++++++++++++++++++++++++++++++-- profiling/src/timeline.rs | 27 ++++++++++-------------- profiling/src/wall_time.rs | 4 ++-- 7 files changed, 69 insertions(+), 63 deletions(-) diff --git a/profiling/src/allocation.rs b/profiling/src/allocation.rs index 8e3c25911b..c36307603b 100644 --- a/profiling/src/allocation.rs +++ b/profiling/src/allocation.rs @@ -2,7 +2,8 @@ use crate::bindings::{ self as zend, datadog_php_install_handler, datadog_php_zif_handler, ddog_php_prof_copy_long_into_zval, }; -use crate::{PROFILER, PROFILER_NAME, REQUEST_LOCALS}; +use crate::profiling::Profiler; +use crate::{PROFILER_NAME, REQUEST_LOCALS}; use lazy_static::lazy_static; use libc::{c_char, c_int, c_void, size_t}; use log::{debug, error, trace, warn}; @@ -93,7 +94,7 @@ impl AllocationProfilingStats { self.next_sampling_interval(); - if let Some(profiler) = PROFILER.lock().unwrap().as_ref() { + if let Some(profiler) = Profiler::get() { // Safety: execute_data was provided by the engine, and the profiler doesn't mutate it. unsafe { profiler.collect_allocations( diff --git a/profiling/src/exception.rs b/profiling/src/exception.rs index 71da753b06..5f35f3647d 100644 --- a/profiling/src/exception.rs +++ b/profiling/src/exception.rs @@ -1,5 +1,5 @@ +use crate::profiling::Profiler; use crate::zend::{self, zend_execute_data, zend_generator, zval, InternalFunctionHandler}; -use crate::PROFILER; use crate::REQUEST_LOCALS; use log::{error, info}; use rand::rngs::ThreadRng; @@ -83,7 +83,7 @@ impl ExceptionProfilingStats { self.next_sampling_interval(); - if let Some(profiler) = PROFILER.lock().unwrap().as_ref() { + if let Some(profiler) = Profiler::get() { // Safety: execute_data was provided by the engine, and the profiler doesn't mutate it. unsafe { profiler.collect_exception( diff --git a/profiling/src/lib.rs b/profiling/src/lib.rs index 32aaf3647d..3a796e6b8e 100644 --- a/profiling/src/lib.rs +++ b/profiling/src/lib.rs @@ -41,16 +41,12 @@ use std::ffi::CStr; use std::ops::Deref; use std::os::raw::c_int; use std::sync::atomic::{AtomicBool, AtomicU32, Ordering}; -use std::sync::{Arc, Mutex, Once}; +use std::sync::{Arc, Once}; use std::time::{Duration, Instant}; use uuid::Uuid; use crate::zend::datadog_sapi_globals_request_info; -/// The global profiler. Profiler gets made during the first rinit after an -/// minit, and is destroyed on mshutdown. -static PROFILER: Mutex> = Mutex::new(None); - /// Name of the profiling module and zend_extension. Must not contain any /// interior null bytes and must be null terminated. static PROFILER_NAME: &[u8] = b"datadog-profiling\0"; @@ -506,18 +502,7 @@ extern "C" fn rinit(_type: c_int, _module_number: c_int) -> ZendResult { return ZendResult::Success; } - // reminder: this cannot be done in minit because of Apache forking model - { - /* It would be nice if this could be cheaper. OnceCell would be cheaper - * but it doesn't quite fit the model, as going back to uninitialized - * requires either a &mut or .take(), and neither works for us (unless - * we go for unsafe, which is what we are trying to avoid). - */ - let mut profiler = PROFILER.lock().unwrap(); - if profiler.is_none() { - *profiler = Some(Profiler::new(system_settings)) - } - }; + Profiler::init(system_settings); if system_settings.profiling_enabled { REQUEST_LOCALS.with(|cell| { @@ -554,7 +539,7 @@ extern "C" fn rinit(_type: c_int, _module_number: c_int) -> ZendResult { return; } - if let Some(profiler) = PROFILER.lock().unwrap().as_ref() { + if let Some(profiler) = Profiler::get() { let interrupt = VmInterrupt { interrupt_count_ptr: &locals.interrupt_count as *const AtomicU32, engine_ptr: locals.vm_interrupt_addr, @@ -610,7 +595,7 @@ extern "C" fn rshutdown(_type: c_int, _module_number: c_int) -> ZendResult { // wall-time is not expected to ever be disabled, except in testing, // and we don't need to optimize for that. if system_settings.profiling_enabled { - if let Some(profiler) = PROFILER.lock().unwrap().as_ref() { + if let Some(profiler) = Profiler::get() { let interrupt = VmInterrupt { interrupt_count_ptr: &locals.interrupt_count, engine_ptr: locals.vm_interrupt_addr, @@ -820,10 +805,7 @@ extern "C" fn mshutdown(_type: c_int, _module_number: c_int) -> ZendResult { #[cfg(feature = "exception_profiling")] exception::exception_profiling_mshutdown(); - let mut profiler = PROFILER.lock().unwrap(); - if let Some(profiler) = profiler.as_mut() { - profiler.stop(Duration::from_secs(1)); - } + Profiler::stop(Duration::from_secs(1)); ZendResult::Success } @@ -857,10 +839,7 @@ extern "C" fn shutdown(_extension: *mut ZendExtension) { #[cfg(debug_assertions)] trace!("shutdown({:p})", _extension); - let mut profiler = PROFILER.lock().unwrap(); - if let Some(profiler) = profiler.take() { - profiler.shutdown(Duration::from_secs(2)); - } + Profiler::shutdown(Duration::from_secs(2)); // SAFETY: calling in shutdown before zai config is shutdown, and after // all configuration is done being accessed. Well... in the happy-path, @@ -889,7 +868,7 @@ fn notify_trace_finished(local_root_span_id: u64, span_type: Cow, resource: return; } - if let Some(profiler) = PROFILER.lock().unwrap().as_ref() { + if let Some(profiler) = Profiler::get() { let message = LocalRootSpanResourceMessage { local_root_span_id, resource: resource.into_owned(), diff --git a/profiling/src/pcntl.rs b/profiling/src/pcntl.rs index b91545b761..8a00de9ea9 100644 --- a/profiling/src/pcntl.rs +++ b/profiling/src/pcntl.rs @@ -3,10 +3,9 @@ use crate::bindings::{ datadog_php_install_handler, datadog_php_zif_handler, zend_execute_data, zend_long, zval, InternalFunctionHandler, }; -use crate::{config, Profiler, PROFILER}; +use crate::{config, Profiler}; use log::{error, warn}; use std::ffi::CStr; -use std::mem::{forget, swap}; use std::ptr; static mut PCNTL_FORK_HANDLER: InternalFunctionHandler = None; @@ -23,8 +22,8 @@ fn handle_pcntl_fork( * the threads while the fork is occurring, they cannot acquire locks * since this thread holds them, preventing a deadlock situation. */ - let mut profiler_lock = PROFILER.lock().unwrap(); - if let Some(profiler) = profiler_lock.as_ref() { + let profiler_lock = Profiler::get(); + if let Some(profiler) = profiler_lock { profiler.fork_prepare(); } @@ -45,7 +44,7 @@ fn handle_pcntl_fork( * to to prevent further profiling which could cause a crash/deadlock. */ - stop_and_forget_profiling(&mut profiler_lock); + stop_and_forget_profiling(); } else { // Safety: we checked it wasn't null above. let result: Result = unsafe { &mut *return_value }.try_into(); @@ -56,12 +55,12 @@ fn handle_pcntl_fork( name, r#type ); - stop_and_forget_profiling(&mut profiler_lock); + stop_and_forget_profiling(); } Ok(pid) => { // The child gets pid of 0. For now, stop profiling for safety. if pid == 0 { - stop_and_forget_profiling(&mut profiler_lock); + stop_and_forget_profiling(); /* When we fully support forking remember: * - Reset last_cpu_time and last_wall_time. @@ -80,7 +79,7 @@ fn handle_pcntl_fork( } } -fn stop_and_forget_profiling(maybe_profiler: &mut Option) { +fn stop_and_forget_profiling() { /* In a forking situation, the currently active profiler may not be valid * because it has join handles and other state shared by other threads, * and threads are not copied when the process is forked. @@ -91,9 +90,7 @@ fn stop_and_forget_profiling(maybe_profiler: &mut Option) { * and forgetting it, which avoids running the destructor. Yes, this will * most likely leak some small amount of memory. */ - let mut old_profiler = None; - swap(&mut *maybe_profiler, &mut old_profiler); - forget(old_profiler); + Profiler::kill(); alloc_prof_rshutdown(); diff --git a/profiling/src/profiling/mod.rs b/profiling/src/profiling/mod.rs index 242e9a7a00..994ecf1ee6 100644 --- a/profiling/src/profiling/mod.rs +++ b/profiling/src/profiling/mod.rs @@ -5,6 +5,7 @@ mod thread_utils; mod uploader; pub use interrupts::*; +use once_cell::sync::OnceCell; pub use sample_type_filter::*; pub use stack_walking::*; use uploader::*; @@ -30,6 +31,7 @@ use std::borrow::Cow; use std::collections::HashMap; use std::hash::Hash; use std::intrinsics::transmute; +use std::mem::forget; use std::num::NonZeroI64; use std::sync::atomic::{AtomicBool, AtomicU32, Ordering}; use std::sync::{Arc, Barrier}; @@ -56,6 +58,10 @@ pub const NO_TIMESTAMP: i64 = 0; // magnitude for the capacity. const UPLOAD_CHANNEL_CAPACITY: usize = 8; +/// The global profiler. Profiler gets made during the first rinit after an +/// minit, and is destroyed on mshutdown. +static mut PROFILER: OnceCell = OnceCell::new(); + /// Order this array this way: /// 1. Always enabled types. /// 2. On by default types. @@ -519,6 +525,17 @@ pub enum UploadMessage { const COW_EVAL: Cow = Cow::Borrowed("[eval]"); impl Profiler { + pub fn init(system_settings: &SystemSettings) { + let profiler = unsafe { PROFILER.get() }; + if profiler.is_none() { + let _ = unsafe { PROFILER.set(Profiler::new(system_settings)) }; + } + } + + pub fn get() -> Option<&'static Profiler> { + unsafe { PROFILER.get() } + } + pub fn new(system_settings: &SystemSettings) -> Self { let fork_barrier = Arc::new(Barrier::new(3)); let interrupt_manager = Arc::new(InterruptManager::new()); @@ -630,7 +647,13 @@ impl Profiler { /// Note that you must call [Profiler::shutdown] afterwards; it's two /// parts of the same operation. It's split so you (or other extensions) /// can do something while the other threads finish up. - pub fn stop(&mut self, timeout: Duration) { + pub fn stop(timeout: Duration) { + if let Some(profiler) = unsafe { PROFILER.get_mut() } { + profiler.join_and_drop_sender(timeout); + } + } + + pub fn join_and_drop_sender(&mut self, timeout: Duration) { debug!("Stopping profiler."); let sent = match self @@ -661,7 +684,13 @@ impl Profiler { /// Completes the shutdown process; to start it, call [Profiler::stop] /// before calling [Profiler::shutdown]. /// Note the timeout is per thread, and there may be multiple threads. - pub fn shutdown(self, timeout: Duration) { + pub fn shutdown(timeout: Duration) { + if let Some(profiler) = unsafe { PROFILER.take() } { + profiler.join_collector_and_uploader(timeout); + } + } + + pub fn join_collector_and_uploader(self, timeout: Duration) { if self.should_join.load(Ordering::SeqCst) { thread_utils::join_timeout( self.time_collector_handle, @@ -680,6 +709,11 @@ impl Profiler { } } + pub fn kill() { + let maybe_profiler = unsafe { PROFILER.take() }; + forget(maybe_profiler); + } + /// Collect a stack sample with elapsed wall time. Collects CPU time if /// it's enabled and available. pub fn collect_time(&self, execute_data: *mut zend_execute_data, interrupt_count: u32) { diff --git a/profiling/src/timeline.rs b/profiling/src/timeline.rs index f0e8928526..64055a9df7 100644 --- a/profiling/src/timeline.rs +++ b/profiling/src/timeline.rs @@ -1,8 +1,9 @@ +use crate::profiling::Profiler; use crate::zend::{ self, zai_str_from_zstr, zend_execute_data, zend_get_executed_filename_ex, zval, InternalFunctionHandler, }; -use crate::{PROFILER, REQUEST_LOCALS}; +use crate::REQUEST_LOCALS; use libc::c_char; use log::{error, trace, warn}; use std::cell::RefCell; @@ -80,16 +81,10 @@ fn try_sleeping_fn( // case it does, short-circuit the function. let now = SystemTime::now().duration_since(UNIX_EPOCH)?; - match PROFILER.lock() { - Ok(guard) => { - // If the profiler isn't there, it's probably just not enabled. - if let Some(profiler) = guard.as_ref() { - let now = now.as_nanos() as i64; - let duration = duration.as_nanos() as i64; - profiler.collect_idle(now, duration, state.as_str()) - } - } - Err(err) => anyhow::bail!("profiler mutex: {err:#}"), + if let Some(profiler) = Profiler::get() { + let now = now.as_nanos() as i64; + let duration = duration.as_nanos() as i64; + profiler.collect_idle(now, duration, state.as_str()) } Ok(()) } @@ -234,7 +229,7 @@ pub unsafe fn timeline_rinit() { return; } - if let Some(profiler) = PROFILER.lock().unwrap().as_ref() { + if let Some(profiler) = Profiler::get() { profiler.collect_idle( // Safety: checked for `is_err()` above SystemTime::now() @@ -293,7 +288,7 @@ pub(crate) unsafe fn timeline_mshutdown() { return; } - if let Some(profiler) = PROFILER.lock().unwrap().as_ref() { + if let Some(profiler) = Profiler::get() { profiler.collect_idle( // Safety: checked for `is_err()` above SystemTime::now() @@ -357,7 +352,7 @@ unsafe extern "C" fn ddog_php_prof_compile_string( duration.as_nanos(), ); - if let Some(profiler) = PROFILER.lock().unwrap().as_ref() { + if let Some(profiler) = Profiler::get() { profiler.collect_compile_string( // Safety: checked for `is_err()` above now.unwrap().as_nanos() as i64, @@ -423,7 +418,7 @@ unsafe extern "C" fn ddog_php_prof_compile_file( duration.as_nanos(), ); - if let Some(profiler) = PROFILER.lock().unwrap().as_ref() { + if let Some(profiler) = Profiler::get() { profiler.collect_compile_file( // Safety: checked for `is_err()` above now.unwrap().as_nanos() as i64, @@ -493,7 +488,7 @@ unsafe extern "C" fn ddog_php_prof_gc_collect_cycles() -> i32 { duration.as_nanos() ); - if let Some(profiler) = PROFILER.lock().unwrap().as_ref() { + if let Some(profiler) = Profiler::get() { cfg_if::cfg_if! { if #[cfg(php_gc_status)] { profiler.collect_garbage_collection( diff --git a/profiling/src/wall_time.rs b/profiling/src/wall_time.rs index 2635092b92..42f7277843 100644 --- a/profiling/src/wall_time.rs +++ b/profiling/src/wall_time.rs @@ -5,7 +5,7 @@ use crate::bindings::{ zend_execute_data, zend_execute_internal, zend_interrupt_function, zval, VmInterruptFn, ZEND_ACC_CALL_VIA_TRAMPOLINE, }; -use crate::{zend, PROFILER, REQUEST_LOCALS}; +use crate::{profiling::Profiler, zend, REQUEST_LOCALS}; use std::mem::MaybeUninit; use std::sync::atomic::Ordering; @@ -55,7 +55,7 @@ pub extern "C" fn ddog_php_prof_interrupt_function(execute_data: *mut zend_execu return; } - if let Some(profiler) = PROFILER.lock().unwrap().as_ref() { + if let Some(profiler) = Profiler::get() { // Safety: execute_data was provided by the engine, and the profiler doesn't mutate it. profiler.collect_time(execute_data, interrupt_count); } From a2ed7b7a641ea76d6a2ecd53f90634e0d136857b Mon Sep 17 00:00:00 2001 From: Florian Engelhardt Date: Fri, 14 Jun 2024 15:39:47 +0200 Subject: [PATCH 2/6] cleanup timeline handling --- profiling/src/timeline.rs | 37 +++++++++++++------------------------ 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/profiling/src/timeline.rs b/profiling/src/timeline.rs index 64055a9df7..596c887a14 100644 --- a/profiling/src/timeline.rs +++ b/profiling/src/timeline.rs @@ -5,7 +5,7 @@ use crate::zend::{ }; use crate::REQUEST_LOCALS; use libc::c_char; -use log::{error, trace, warn}; +use log::{error, trace}; use std::cell::RefCell; use std::ffi::CStr; use std::ptr; @@ -46,13 +46,12 @@ impl State { } } -#[inline] -fn try_sleeping_fn( +fn sleeping_fn( func: unsafe extern "C" fn(execute_data: *mut zend_execute_data, return_value: *mut zval), execute_data: *mut zend_execute_data, return_value: *mut zval, state: State, -) -> anyhow::Result<()> { +) { let timeline_enabled = REQUEST_LOCALS.with(|cell| { cell.try_borrow() .map(|locals| locals.system_settings().profiling_timeline_enabled) @@ -61,7 +60,7 @@ fn try_sleeping_fn( if !timeline_enabled { unsafe { func(execute_data, return_value) }; - return Ok(()); + return; } let start = Instant::now(); @@ -75,28 +74,18 @@ fn try_sleeping_fn( let duration = start.elapsed(); - // > Returns an Err if earlier is later than self, and the error contains - // > how far from self the time is. - // This shouldn't ever happen (now is always later than the epoch) but in - // case it does, short-circuit the function. - let now = SystemTime::now().duration_since(UNIX_EPOCH)?; + // This shouldn't ever happen (now is always later than the epoch) + let now = SystemTime::now().duration_since(UNIX_EPOCH); - if let Some(profiler) = Profiler::get() { - let now = now.as_nanos() as i64; - let duration = duration.as_nanos() as i64; - profiler.collect_idle(now, duration, state.as_str()) + if now.is_err() { + return; } - Ok(()) -} -fn sleeping_fn( - func: unsafe extern "C" fn(execute_data: *mut zend_execute_data, return_value: *mut zval), - execute_data: *mut zend_execute_data, - return_value: *mut zval, - state: State, -) { - if let Err(err) = try_sleeping_fn(func, execute_data, return_value, state) { - warn!("error creating profiling timeline sample for an internal function: {err:#}"); + if let Some(profiler) = Profiler::get() { + // Safety: `unwrap` can be unchecked, as we checked for `is_err()` + let now = unsafe { now.unwrap_unchecked().as_nanos() } as i64; + let duration = duration.as_nanos() as i64; + profiler.collect_idle(now, duration, state.as_str()); } } From afaeb3978e057a6eeeb9d82c5ebfada8d73954f9 Mon Sep 17 00:00:00 2001 From: Levi Morrison Date: Fri, 14 Jun 2024 19:14:56 -0600 Subject: [PATCH 3/6] refactor: to make safety conditions easier to see --- profiling/src/pcntl.rs | 162 +++++++++++++++++---------------- profiling/src/profiling/mod.rs | 62 +++++++++---- 2 files changed, 131 insertions(+), 93 deletions(-) diff --git a/profiling/src/pcntl.rs b/profiling/src/pcntl.rs index 8a00de9ea9..3f114c141d 100644 --- a/profiling/src/pcntl.rs +++ b/profiling/src/pcntl.rs @@ -4,6 +4,7 @@ use crate::bindings::{ InternalFunctionHandler, }; use crate::{config, Profiler}; +use ddcommon::cstr; use log::{error, warn}; use std::ffi::CStr; use std::ptr; @@ -12,84 +13,95 @@ static mut PCNTL_FORK_HANDLER: InternalFunctionHandler = None; static mut PCNTL_RFORK_HANDLER: InternalFunctionHandler = None; static mut PCNTL_FORKX_HANDLER: InternalFunctionHandler = None; -fn handle_pcntl_fork( - name: &str, +enum ForkId { + Child, + Parent, +} + +enum ForkError { + NullRetval, + ZvalType(u8), +} + +unsafe fn dispatch( handler: unsafe extern "C" fn(*mut zend_execute_data, *mut zval), execute_data: *mut zend_execute_data, return_value: *mut zval, -) { - /* Hold mutexes across the handler. If there are any spurious wakeups by - * the threads while the fork is occurring, they cannot acquire locks - * since this thread holds them, preventing a deadlock situation. - */ - let profiler_lock = Profiler::get(); - if let Some(profiler) = profiler_lock { - profiler.fork_prepare(); - } - +) -> Result { // Safety: we're calling the original handler with correct args. unsafe { handler(execute_data, return_value) }; if return_value.is_null() { // This _shouldn't_ ever be hit. - - if profiler_lock.is_some() { - error!( - "Failed to read return value of {}. A crash or deadlock may occur.", - name - ); - } - - /* We don't know if this is actually the parent or not, so do our best - * to to prevent further profiling which could cause a crash/deadlock. - */ - - stop_and_forget_profiling(); + Err(ForkError::NullRetval) } else { // Safety: we checked it wasn't null above. let result: Result = unsafe { &mut *return_value }.try_into(); match result { - Err(r#type) => { - warn!( - "Return type of {} was unexpected: {}. A crash or deadlock may occur.", - name, r#type - ); - - stop_and_forget_profiling(); - } + Err(r#type) => Err(ForkError::ZvalType(r#type)), Ok(pid) => { // The child gets pid of 0. For now, stop profiling for safety. if pid == 0 { - stop_and_forget_profiling(); - - /* When we fully support forking remember: - * - Reset last_cpu_time and last_wall_time. - * - Reset process_id and runtime-id tags. - */ + Ok(ForkId::Child) } else { - /* If it's negative, then no child process was made so we must be the parent. - * If it's positive then this is definitely the parent process. - */ - if let Some(profiler) = profiler_lock.as_ref() { - profiler.post_fork_parent(); - } + // If it's positive, then this is the parent process. + // If it's negative, then no child process was made, and + // this must be the parent. + Ok(ForkId::Parent) } } } } } -fn stop_and_forget_profiling() { - /* In a forking situation, the currently active profiler may not be valid - * because it has join handles and other state shared by other threads, - * and threads are not copied when the process is forked. - * Additionally, if we've hit certain other issues like not being able to - * determine the return type of the pcntl_fork function, we don't know if - * we're the parent or child. - * So, we throw away the current profiler by swapping it with a None, and - * and forgetting it, which avoids running the destructor. Yes, this will - * most likely leak some small amount of memory. - */ +#[cold] +#[inline(never)] +unsafe fn handle_fork( + handler: unsafe extern "C" fn(*mut zend_execute_data, *mut zval), + execute_data: *mut zend_execute_data, + return_value: *mut zval, +) { + if let Some(profiler) = Profiler::get() { + if let Err(err) = profiler.fork_prepare() { + error!("fork error: {err}"); + } + } + + match dispatch(handler, execute_data, return_value) { + Ok(ForkId::Parent) => { + if let Some(profiler) = Profiler::get() { + profiler.post_fork_parent(); + } + return; + } + Ok(ForkId::Child) => { /* fallthrough */ } + Err(ForkError::NullRetval) => { + // Skip error message if no profiler. + if Profiler::get().is_some() { + error!( + "Failed to read return value of forking function. A crash or deadlock may occur." + ); + } + // fallthrough + } + + Err(ForkError::ZvalType(r#type)) => { + // Skip error message if no profiler. + if Profiler::get().is_some() { + warn!( + "Return type of forking function was unexpected: {type}. A crash or deadlock may occur." + ); + } + // fallthrough + } + } + + // Disable the profiler because either: + // 1. This is the child, and we don't support this yet. + // 2. Something went wrong, and disable it to be safe. + + // SAFETY: This thread isn't using the PROFILER object, and there are no + // other threads. Profiler::kill(); alloc_prof_rshutdown(); @@ -97,57 +109,55 @@ fn stop_and_forget_profiling() { // Reset some global state to prevent further profiling and to not handle // any pending interrupts. // SAFETY: done after config is used to shut down other things, and in a - // thread-safe context. - unsafe { config::on_fork_in_child() }; + // thread-safe context. + config::on_fork_in_child(); } -unsafe extern "C" fn pcntl_fork(execute_data: *mut zend_execute_data, return_value: *mut zval) { +unsafe extern "C" fn fork(execute_data: *mut zend_execute_data, return_value: *mut zval) { // Safety: this function is only called if PCNTL_FORK_HANDLER was set. let handler = PCNTL_FORK_HANDLER.unwrap(); - handle_pcntl_fork("pcntl_fork", handler, execute_data, return_value); + handle_fork(handler, execute_data, return_value); } -unsafe extern "C" fn pcntl_rfork(execute_data: *mut zend_execute_data, return_value: *mut zval) { +unsafe extern "C" fn rfork(execute_data: *mut zend_execute_data, return_value: *mut zval) { // Safety: this function is only called if PCNTL_RFORK_HANDLER was set. let handler = PCNTL_RFORK_HANDLER.unwrap(); - handle_pcntl_fork("pcntl_rfork", handler, execute_data, return_value); + handle_fork(handler, execute_data, return_value); } -unsafe extern "C" fn pcntl_forkx(execute_data: *mut zend_execute_data, return_value: *mut zval) { +unsafe extern "C" fn forkx(execute_data: *mut zend_execute_data, return_value: *mut zval) { // Safety: this function is only called if PCNTL_FORKX_HANDLER was set. let handler = PCNTL_FORKX_HANDLER.unwrap(); - handle_pcntl_fork("pcntl_forkx", handler, execute_data, return_value); + handle_fork(handler, execute_data, return_value); } -// Safety: the provided slices are nul-terminated and don't contain any interior nul bytes. -const PCNTL_FORK: &CStr = unsafe { CStr::from_bytes_with_nul_unchecked(b"pcntl_fork\0") }; -const PCNTL_RFORK: &CStr = unsafe { CStr::from_bytes_with_nul_unchecked(b"pcntl_rfork\0") }; -const PCNTL_FORKX: &CStr = unsafe { CStr::from_bytes_with_nul_unchecked(b"pcntl_forkx\0") }; +const PCNTL_FORK: &CStr = cstr!("pcntl_fork"); +const PCNTL_RFORK: &CStr = cstr!("pcntl_rfork"); +const PCNTL_FORKX: &CStr = cstr!("pcntl_forkx"); /// # Safety /// Only call this from zend_extension's startup function. It's not designed /// to be called from anywhere else. pub(crate) unsafe fn startup() { - /* Some of these handlers exist only on recent PHP versions, but since - * datadog_php_install_handler fails gracefully on a missing function, we - * don't have to worry about creating Rust configurations for them. - * Safety: we can modify our own globals in the startup context. - */ + // Some of these handlers exist only on recent PHP versions, but since + // datadog_php_install_handler fails gracefully on a missing function, we + // don't have to worry about creating Rust configurations for them. + // Safety: we can modify our own globals in the startup context. let handlers = [ datadog_php_zif_handler::new( PCNTL_FORK, ptr::addr_of_mut!(PCNTL_FORK_HANDLER), - Some(pcntl_fork), + Some(fork), ), datadog_php_zif_handler::new( PCNTL_RFORK, ptr::addr_of_mut!(PCNTL_RFORK_HANDLER), - Some(pcntl_rfork), + Some(rfork), ), datadog_php_zif_handler::new( PCNTL_FORKX, ptr::addr_of_mut!(PCNTL_FORKX_HANDLER), - Some(pcntl_forkx), + Some(forkx), ), ]; diff --git a/profiling/src/profiling/mod.rs b/profiling/src/profiling/mod.rs index 994ecf1ee6..21895487bd 100644 --- a/profiling/src/profiling/mod.rs +++ b/profiling/src/profiling/mod.rs @@ -26,7 +26,7 @@ use datadog_profiling::api::{ }; use datadog_profiling::exporter::Tag; use datadog_profiling::internal::Profile as InternalProfile; -use log::{debug, error, info, trace, warn}; +use log::{debug, info, trace, warn}; use std::borrow::Cow; use std::collections::HashMap; use std::hash::Hash; @@ -524,6 +524,9 @@ pub enum UploadMessage { #[cfg(feature = "timeline")] const COW_EVAL: Cow = Cow::Borrowed("[eval]"); +const DDPROF_TIME: &str = "ddprof_time"; +const DDPROF_UPLOAD: &str = "ddprof_upload"; + impl Profiler { pub fn init(system_settings: &SystemSettings) { let profiler = unsafe { PROFILER.get() }; @@ -557,21 +560,19 @@ impl Profiler { Utc::now(), ); - let ddprof_time = "ddprof_time"; - let ddprof_upload = "ddprof_upload"; let sample_types_filter = SampleTypeFilter::new(system_settings); Profiler { fork_barrier, interrupt_manager, message_sender, upload_sender, - time_collector_handle: thread_utils::spawn(ddprof_time, move || { + time_collector_handle: thread_utils::spawn(DDPROF_TIME, move || { time_collector.run(); - trace!("thread {ddprof_time} complete, shutting down"); + trace!("thread {DDPROF_TIME} complete, shutting down"); }), - uploader_handle: thread_utils::spawn(ddprof_upload, move || { + uploader_handle: thread_utils::spawn(DDPROF_UPLOAD, move || { uploader.run(); - trace!("thread {ddprof_upload} complete, shutting down"); + trace!("thread {DDPROF_UPLOAD} complete, shutting down"); }), should_join: AtomicBool::new(true), sample_types_filter, @@ -601,22 +602,26 @@ impl Profiler { } /// Call before a fork, on the thread of the parent process that will fork. - pub fn fork_prepare(&self) { - // Send the message to the uploader first, as it has a longer worst- + pub fn fork_prepare(&self) -> anyhow::Result<()> { + // Send the message to the uploader first, as it has a longer worst // case time to wait. let uploader_result = self.upload_sender.send(UploadMessage::Pause); let profiler_result = self.message_sender.send(ProfilerMessage::Pause); - // todo: handle fails more gracefully, but it's tricky to sync 3 - // threads, any of which could have crashed or be delayed. This - // could also deadlock. match (uploader_result, profiler_result) { (Ok(_), Ok(_)) => { self.fork_barrier.wait(); + Ok(()) + } + (Err(err), Ok(_)) => { + anyhow::bail!("failed to prepare {DDPROF_UPLOAD} thread for forking: {err}") } - (_, _) => { - error!("failed to prepare the profiler for forking, a deadlock could occur") + (Ok(_), Err(err)) => { + anyhow::bail!("failed to prepare {DDPROF_TIME} thread for forking: {err}") } + (Err(_), Err(_)) => anyhow::bail!( + "failed to prepare both {DDPROF_UPLOAD} and {DDPROF_TIME} threads for forking" + ), } } @@ -709,9 +714,32 @@ impl Profiler { } } - pub fn kill() { - let maybe_profiler = unsafe { PROFILER.take() }; - forget(maybe_profiler); + /// Throws away the profiler and moves it to uninitialized. + /// + /// In a forking situation, the currently active profiler may not be valid + /// because it has join handles and other state shared by other threads, + /// and threads are not copied when the process is forked. + /// Additionally, if we've hit certain other issues like not being able to + /// determine the return type of the pcntl_fork function, we don't know if + /// we're the parent or child. + /// So, we throw away the current profiler and forget it, which avoids + /// running the destructor. Yes, this will leak some memory. + /// + /// # Safety + /// Must be called when no other thread is using the PROFILER object. That + /// includes this thread in some kind of recursive manner. + pub unsafe fn kill() { + // SAFETY: see this function's safety conditions. + if let Some(mut profiler) = PROFILER.take() { + // Drop some things to reduce memory. + profiler.interrupt_manager = Arc::new(InterruptManager::new()); + profiler.message_sender = crossbeam_channel::bounded(0).0; + profiler.upload_sender = crossbeam_channel::bounded(0).0; + + // But we're not 100% sure everything is safe to drop, notably the + // join handles, so we leak the rest. + forget(profiler) + } } /// Collect a stack sample with elapsed wall time. Collects CPU time if From 4a9cde44f04465e453c29a0ac3f213afbf52f07a Mon Sep 17 00:00:00 2001 From: Florian Engelhardt Date: Fri, 2 Aug 2024 10:14:06 +0200 Subject: [PATCH 4/6] guard `Profiler` init with a mutex --- profiling/src/profiling/mod.rs | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/profiling/src/profiling/mod.rs b/profiling/src/profiling/mod.rs index 21f391e7fa..07e9c01e98 100644 --- a/profiling/src/profiling/mod.rs +++ b/profiling/src/profiling/mod.rs @@ -34,7 +34,7 @@ use std::intrinsics::transmute; use std::mem::forget; use std::num::NonZeroI64; use std::sync::atomic::{AtomicBool, AtomicPtr, AtomicU32, Ordering}; -use std::sync::{Arc, Barrier}; +use std::sync::{Arc, Barrier, Mutex}; use std::thread::JoinHandle; use std::time::{Duration, Instant, SystemTime}; @@ -519,15 +519,34 @@ const COW_EVAL: Cow = Cow::Borrowed("[eval]"); const DDPROF_TIME: &str = "ddprof_time"; const DDPROF_UPLOAD: &str = "ddprof_upload"; +static PROFILER_INIT: Mutex<()> = Mutex::new(()); + impl Profiler { + /// Will initialize the `PROFILER` OnceCell and makes sure that only one thread will do so. pub fn init(system_settings: &mut SystemSettings) { - let profiler = unsafe { PROFILER.get() }; - if profiler.is_none() { + // Safety: there could be a race in which another thread already initializes `PROFILER`, + // this is guarded by the `PROFILER_INIT` mutex next. + if unsafe { PROFILER.get() }.is_some() { + return; + } + + // Waiting point for all threads but one. + let _lock = PROFILER_INIT.lock().unwrap(); + + // For one thread (presumably the first) this will evaluate to true, all other threads will + // evaluate this to false and skip initialization. Future calls to this method will trigger + // the guard clause above. + // Safety: only one thread will ever call `PROFILER::get()` while the OnceCell is not + // initialized. Other threads that saw an uninitialized `PROFILER` but came to late to + // acquire `PROFILER_INIT` mutex will get `Some` and not do anything with the OnceCell. + if unsafe { PROFILER.get() }.is_none() { let _ = unsafe { PROFILER.set(Profiler::new(system_settings)) }; } } pub fn get() -> Option<&'static Profiler> { + // Safety: Besides the `InterruptManager`, which guards adding new vm_interrupts with a + // mutex, non of the public functions do mutations to global data unsafe { PROFILER.get() } } @@ -681,6 +700,8 @@ impl Profiler { /// Completes the shutdown process; to start it, call [Profiler::stop] /// before calling [Profiler::shutdown]. /// Note the timeout is per thread, and there may be multiple threads. + /// + /// Safety: only safe to be called in `SHUTDOWN`/`MSHUTDOWN` phase pub fn shutdown(timeout: Duration) { if let Some(profiler) = unsafe { PROFILER.take() } { profiler.join_collector_and_uploader(timeout); From f83125993ac7b416d27f351d6ad7bb6b8485691c Mon Sep 17 00:00:00 2001 From: Levi Morrison Date: Thu, 15 Aug 2024 10:02:29 -0600 Subject: [PATCH 5/6] Simplify Profiler::init and add safety comments to PROFILER --- profiling/src/profiling/mod.rs | 39 +++++++++++++--------------------- 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/profiling/src/profiling/mod.rs b/profiling/src/profiling/mod.rs index 07e9c01e98..4972cf7251 100644 --- a/profiling/src/profiling/mod.rs +++ b/profiling/src/profiling/mod.rs @@ -5,7 +5,6 @@ mod thread_utils; mod uploader; pub use interrupts::*; -use once_cell::sync::OnceCell; pub use sample_type_filter::*; pub use stack_walking::*; use uploader::*; @@ -27,6 +26,7 @@ use datadog_profiling::api::{ use datadog_profiling::exporter::Tag; use datadog_profiling::internal::Profile as InternalProfile; use log::{debug, info, trace, warn}; +use once_cell::sync::OnceCell; use std::borrow::Cow; use std::collections::HashMap; use std::hash::Hash; @@ -34,7 +34,7 @@ use std::intrinsics::transmute; use std::mem::forget; use std::num::NonZeroI64; use std::sync::atomic::{AtomicBool, AtomicPtr, AtomicU32, Ordering}; -use std::sync::{Arc, Barrier, Mutex}; +use std::sync::{Arc, Barrier}; use std::thread::JoinHandle; use std::time::{Duration, Instant, SystemTime}; @@ -519,34 +519,19 @@ const COW_EVAL: Cow = Cow::Borrowed("[eval]"); const DDPROF_TIME: &str = "ddprof_time"; const DDPROF_UPLOAD: &str = "ddprof_upload"; -static PROFILER_INIT: Mutex<()> = Mutex::new(()); - impl Profiler { /// Will initialize the `PROFILER` OnceCell and makes sure that only one thread will do so. pub fn init(system_settings: &mut SystemSettings) { - // Safety: there could be a race in which another thread already initializes `PROFILER`, - // this is guarded by the `PROFILER_INIT` mutex next. - if unsafe { PROFILER.get() }.is_some() { - return; - } - - // Waiting point for all threads but one. - let _lock = PROFILER_INIT.lock().unwrap(); - - // For one thread (presumably the first) this will evaluate to true, all other threads will - // evaluate this to false and skip initialization. Future calls to this method will trigger - // the guard clause above. - // Safety: only one thread will ever call `PROFILER::get()` while the OnceCell is not - // initialized. Other threads that saw an uninitialized `PROFILER` but came to late to - // acquire `PROFILER_INIT` mutex will get `Some` and not do anything with the OnceCell. - if unsafe { PROFILER.get() }.is_none() { - let _ = unsafe { PROFILER.set(Profiler::new(system_settings)) }; - } + // SAFETY: the `get_or_init` access is a thread-safe API, and the + // PROFILER is not being mutated outside single-threaded phases such + // as minit/mshutdown. + unsafe { PROFILER.get_or_init(|| Profiler::new(system_settings)) }; } pub fn get() -> Option<&'static Profiler> { - // Safety: Besides the `InterruptManager`, which guards adding new vm_interrupts with a - // mutex, non of the public functions do mutations to global data + // SAFETY: the `get` access is a thread-safe API, and the PROFILER is + // not being mutated outside single-threaded phases such as minit and + // mshutdown. unsafe { PROFILER.get() } } @@ -664,6 +649,9 @@ impl Profiler { /// parts of the same operation. It's split so you (or other extensions) /// can do something while the other threads finish up. pub fn stop(timeout: Duration) { + // SAFETY: the `get_mut` access is a thread-safe API, and the PROFILER + // is not being mutated outside single-threaded phases such as minit + // and mshutdown. if let Some(profiler) = unsafe { PROFILER.get_mut() } { profiler.join_and_drop_sender(timeout); } @@ -703,6 +691,9 @@ impl Profiler { /// /// Safety: only safe to be called in `SHUTDOWN`/`MSHUTDOWN` phase pub fn shutdown(timeout: Duration) { + // SAFETY: the `take` access is a thread-safe API, and the PROFILER is + // not being mutated outside single-threaded phases such as minit and + // mshutdown. if let Some(profiler) = unsafe { PROFILER.take() } { profiler.join_collector_and_uploader(timeout); } From 48498a4c6f27ce21512a935560de2debccea6def Mon Sep 17 00:00:00 2001 From: Levi Morrison Date: Thu, 15 Aug 2024 10:06:31 -0600 Subject: [PATCH 6/6] refactor(profiling): extract SystemProfiler The global profiler (e.g. PROFILER) and the Profiler::* functions that worked on it were moved into a new struct SystemProfiler. --- profiling/src/allocation.rs | 4 +- profiling/src/exception.rs | 4 +- profiling/src/lib.rs | 14 ++-- profiling/src/pcntl.rs | 12 +-- profiling/src/profiling/mod.rs | 78 +------------------ profiling/src/profiling/system_profiler.rs | 87 ++++++++++++++++++++++ profiling/src/timeline.rs | 14 ++-- profiling/src/wall_time.rs | 4 +- 8 files changed, 115 insertions(+), 102 deletions(-) create mode 100644 profiling/src/profiling/system_profiler.rs diff --git a/profiling/src/allocation.rs b/profiling/src/allocation.rs index 8c05881a94..cea47f3464 100644 --- a/profiling/src/allocation.rs +++ b/profiling/src/allocation.rs @@ -2,7 +2,7 @@ use crate::bindings::{ self as zend, datadog_php_install_handler, datadog_php_zif_handler, ddog_php_prof_copy_long_into_zval, }; -use crate::profiling::Profiler; +use crate::profiling::SystemProfiler; use crate::{PROFILER_NAME, REQUEST_LOCALS}; use lazy_static::lazy_static; use libc::{c_char, c_int, c_void, size_t}; @@ -95,7 +95,7 @@ impl AllocationProfilingStats { self.next_sampling_interval(); - if let Some(profiler) = Profiler::get() { + if let Some(profiler) = SystemProfiler::get() { // Safety: execute_data was provided by the engine, and the profiler doesn't mutate it. unsafe { profiler.collect_allocations( diff --git a/profiling/src/exception.rs b/profiling/src/exception.rs index 5f35f3647d..78b4152d49 100644 --- a/profiling/src/exception.rs +++ b/profiling/src/exception.rs @@ -1,4 +1,4 @@ -use crate::profiling::Profiler; +use crate::profiling::SystemProfiler; use crate::zend::{self, zend_execute_data, zend_generator, zval, InternalFunctionHandler}; use crate::REQUEST_LOCALS; use log::{error, info}; @@ -83,7 +83,7 @@ impl ExceptionProfilingStats { self.next_sampling_interval(); - if let Some(profiler) = Profiler::get() { + if let Some(profiler) = SystemProfiler::get() { // Safety: execute_data was provided by the engine, and the profiler doesn't mutate it. unsafe { profiler.collect_exception( diff --git a/profiling/src/lib.rs b/profiling/src/lib.rs index 02cb66a8e8..c6c5b98411 100644 --- a/profiling/src/lib.rs +++ b/profiling/src/lib.rs @@ -33,7 +33,7 @@ use lazy_static::lazy_static; use libc::c_char; use log::{debug, error, info, trace, warn}; use once_cell::sync::{Lazy, OnceCell}; -use profiling::{LocalRootSpanResourceMessage, Profiler, VmInterrupt}; +use profiling::{LocalRootSpanResourceMessage, SystemProfiler, VmInterrupt}; use sapi::Sapi; use std::borrow::Cow; use std::cell::RefCell; @@ -504,7 +504,7 @@ extern "C" fn rinit(_type: c_int, _module_number: c_int) -> ZendResult { return ZendResult::Success; } - Profiler::init(system_settings); + SystemProfiler::init(system_settings); if system_settings.profiling_enabled { REQUEST_LOCALS.with(|cell| { @@ -541,7 +541,7 @@ extern "C" fn rinit(_type: c_int, _module_number: c_int) -> ZendResult { return; } - if let Some(profiler) = Profiler::get() { + if let Some(profiler) = SystemProfiler::get() { let interrupt = VmInterrupt { interrupt_count_ptr: &locals.interrupt_count as *const AtomicU32, engine_ptr: locals.vm_interrupt_addr, @@ -597,7 +597,7 @@ extern "C" fn rshutdown(_type: c_int, _module_number: c_int) -> ZendResult { // wall-time is not expected to ever be disabled, except in testing, // and we don't need to optimize for that. if system_settings.profiling_enabled { - if let Some(profiler) = Profiler::get() { + if let Some(profiler) = SystemProfiler::get() { let interrupt = VmInterrupt { interrupt_count_ptr: &locals.interrupt_count, engine_ptr: locals.vm_interrupt_addr, @@ -807,7 +807,7 @@ extern "C" fn mshutdown(_type: c_int, _module_number: c_int) -> ZendResult { #[cfg(feature = "exception_profiling")] exception::exception_profiling_mshutdown(); - Profiler::stop(Duration::from_secs(1)); + SystemProfiler::stop(Duration::from_secs(1)); ZendResult::Success } @@ -841,7 +841,7 @@ extern "C" fn shutdown(_extension: *mut ZendExtension) { #[cfg(debug_assertions)] trace!("shutdown({:p})", _extension); - Profiler::shutdown(Duration::from_secs(2)); + SystemProfiler::shutdown(Duration::from_secs(2)); // SAFETY: calling in shutdown before zai config is shutdown, and after // all configuration is done being accessed. Well... in the happy-path, @@ -870,7 +870,7 @@ fn notify_trace_finished(local_root_span_id: u64, span_type: Cow, resource: return; } - if let Some(profiler) = Profiler::get() { + if let Some(profiler) = SystemProfiler::get() { let message = LocalRootSpanResourceMessage { local_root_span_id, resource: resource.into_owned(), diff --git a/profiling/src/pcntl.rs b/profiling/src/pcntl.rs index d0e230fb4a..480b148c72 100644 --- a/profiling/src/pcntl.rs +++ b/profiling/src/pcntl.rs @@ -3,7 +3,7 @@ use crate::bindings::{ datadog_php_install_handler, datadog_php_zif_handler, zend_execute_data, zend_long, zval, InternalFunctionHandler, }; -use crate::{config, Profiler}; +use crate::{config, SystemProfiler}; use ddcommon::cstr; use log::{error, warn}; use std::ffi::CStr; @@ -61,13 +61,13 @@ unsafe fn handle_fork( // Hold mutexes across the handler. If there are any spurious wakeups by // the threads while the fork is occurring, they cannot acquire locks // since this thread holds them, preventing a deadlock situation. - if let Some(profiler) = Profiler::get() { + if let Some(profiler) = SystemProfiler::get() { let _ = profiler.fork_prepare(); } match dispatch(handler, execute_data, return_value) { Ok(ForkId::Parent) => { - if let Some(profiler) = Profiler::get() { + if let Some(profiler) = SystemProfiler::get() { profiler.post_fork_parent(); } return; @@ -75,7 +75,7 @@ unsafe fn handle_fork( Ok(ForkId::Child) => { /* fallthrough */ } Err(ForkError::NullRetval) => { // Skip error message if no profiler. - if Profiler::get().is_some() { + if SystemProfiler::get().is_some() { error!( "Failed to read return value of forking function. A crash or deadlock may occur." ); @@ -85,7 +85,7 @@ unsafe fn handle_fork( Err(ForkError::ZvalType(r#type)) => { // Skip error message if no profiler. - if Profiler::get().is_some() { + if SystemProfiler::get().is_some() { warn!( "Return type of forking function was unexpected: {type}. A crash or deadlock may occur." ); @@ -99,7 +99,7 @@ unsafe fn handle_fork( // 2. Something went wrong, and disable it to be safe. // And then leak the old profiler. Its drop method is not safe to run in // these situations. - Profiler::kill(); + SystemProfiler::kill(); alloc_prof_rshutdown(); diff --git a/profiling/src/profiling/mod.rs b/profiling/src/profiling/mod.rs index 4972cf7251..5aa465d32c 100644 --- a/profiling/src/profiling/mod.rs +++ b/profiling/src/profiling/mod.rs @@ -1,12 +1,14 @@ mod interrupts; mod sample_type_filter; pub mod stack_walking; +mod system_profiler; mod thread_utils; mod uploader; pub use interrupts::*; pub use sample_type_filter::*; pub use stack_walking::*; +pub use system_profiler::*; use uploader::*; #[cfg(all(php_has_fibers, not(test)))] @@ -26,12 +28,10 @@ use datadog_profiling::api::{ use datadog_profiling::exporter::Tag; use datadog_profiling::internal::Profile as InternalProfile; use log::{debug, info, trace, warn}; -use once_cell::sync::OnceCell; use std::borrow::Cow; use std::collections::HashMap; use std::hash::Hash; use std::intrinsics::transmute; -use std::mem::forget; use std::num::NonZeroI64; use std::sync::atomic::{AtomicBool, AtomicPtr, AtomicU32, Ordering}; use std::sync::{Arc, Barrier}; @@ -58,10 +58,6 @@ pub const NO_TIMESTAMP: i64 = 0; // magnitude for the capacity. const UPLOAD_CHANNEL_CAPACITY: usize = 8; -/// The global profiler. Profiler gets made during the first rinit after an -/// minit, and is destroyed on mshutdown. -static mut PROFILER: OnceCell = OnceCell::new(); - /// Order this array this way: /// 1. Always enabled types. /// 2. On by default types. @@ -520,21 +516,6 @@ const DDPROF_TIME: &str = "ddprof_time"; const DDPROF_UPLOAD: &str = "ddprof_upload"; impl Profiler { - /// Will initialize the `PROFILER` OnceCell and makes sure that only one thread will do so. - pub fn init(system_settings: &mut SystemSettings) { - // SAFETY: the `get_or_init` access is a thread-safe API, and the - // PROFILER is not being mutated outside single-threaded phases such - // as minit/mshutdown. - unsafe { PROFILER.get_or_init(|| Profiler::new(system_settings)) }; - } - - pub fn get() -> Option<&'static Profiler> { - // SAFETY: the `get` access is a thread-safe API, and the PROFILER is - // not being mutated outside single-threaded phases such as minit and - // mshutdown. - unsafe { PROFILER.get() } - } - pub fn new(system_settings: &mut SystemSettings) -> Self { let fork_barrier = Arc::new(Barrier::new(3)); let interrupt_manager = Arc::new(InterruptManager::new()); @@ -644,19 +625,6 @@ impl Profiler { .map_err(Box::new) } - /// Begins the shutdown process. To complete it, call [Profiler::shutdown]. - /// Note that you must call [Profiler::shutdown] afterwards; it's two - /// parts of the same operation. It's split so you (or other extensions) - /// can do something while the other threads finish up. - pub fn stop(timeout: Duration) { - // SAFETY: the `get_mut` access is a thread-safe API, and the PROFILER - // is not being mutated outside single-threaded phases such as minit - // and mshutdown. - if let Some(profiler) = unsafe { PROFILER.get_mut() } { - profiler.join_and_drop_sender(timeout); - } - } - pub fn join_and_drop_sender(&mut self, timeout: Duration) { debug!("Stopping profiler."); @@ -685,20 +653,6 @@ impl Profiler { std::mem::swap(&mut self.upload_sender, &mut empty_sender); } - /// Completes the shutdown process; to start it, call [Profiler::stop] - /// before calling [Profiler::shutdown]. - /// Note the timeout is per thread, and there may be multiple threads. - /// - /// Safety: only safe to be called in `SHUTDOWN`/`MSHUTDOWN` phase - pub fn shutdown(timeout: Duration) { - // SAFETY: the `take` access is a thread-safe API, and the PROFILER is - // not being mutated outside single-threaded phases such as minit and - // mshutdown. - if let Some(profiler) = unsafe { PROFILER.take() } { - profiler.join_collector_and_uploader(timeout); - } - } - pub fn join_collector_and_uploader(self, timeout: Duration) { if self.should_join.load(Ordering::SeqCst) { thread_utils::join_timeout( @@ -718,34 +672,6 @@ impl Profiler { } } - /// Throws away the profiler and moves it to uninitialized. - /// - /// In a forking situation, the currently active profiler may not be valid - /// because it has join handles and other state shared by other threads, - /// and threads are not copied when the process is forked. - /// Additionally, if we've hit certain other issues like not being able to - /// determine the return type of the pcntl_fork function, we don't know if - /// we're the parent or child. - /// So, we throw away the current profiler and forget it, which avoids - /// running the destructor. Yes, this will leak some memory. - /// - /// # Safety - /// Must be called when no other thread is using the PROFILER object. That - /// includes this thread in some kind of recursive manner. - pub unsafe fn kill() { - // SAFETY: see this function's safety conditions. - if let Some(mut profiler) = PROFILER.take() { - // Drop some things to reduce memory. - profiler.interrupt_manager = Arc::new(InterruptManager::new()); - profiler.message_sender = crossbeam_channel::bounded(0).0; - profiler.upload_sender = crossbeam_channel::bounded(0).0; - - // But we're not 100% sure everything is safe to drop, notably the - // join handles, so we leak the rest. - forget(profiler) - } - } - /// Collect a stack sample with elapsed wall time. Collects CPU time if /// it's enabled and available. pub fn collect_time(&self, execute_data: *mut zend_execute_data, interrupt_count: u32) { diff --git a/profiling/src/profiling/system_profiler.rs b/profiling/src/profiling/system_profiler.rs new file mode 100644 index 0000000000..da783f9d00 --- /dev/null +++ b/profiling/src/profiling/system_profiler.rs @@ -0,0 +1,87 @@ +use super::{InterruptManager, Profiler, SystemSettings}; +use once_cell::sync::OnceCell; +use std::mem::forget; +use std::sync::Arc; +use std::time::Duration; + +/// The global profiler. Profiler gets made during the first rinit after an +/// minit, and is destroyed on mshutdown. +static mut PROFILER: SystemProfiler = SystemProfiler::new(); + +pub struct SystemProfiler(OnceCell); + +impl SystemProfiler { + pub const fn new() -> Self { + SystemProfiler(OnceCell::new()) + } + + /// Initializes the system profiler safely by one thread. + pub fn init(system_settings: &mut SystemSettings) { + // SAFETY: the `get_or_init` access is a thread-safe API, and the + // PROFILER is not being mutated outside single-threaded phases such + // as minit and mshutdown. + unsafe { PROFILER.0.get_or_init(|| Profiler::new(system_settings)) }; + } + + pub fn get() -> Option<&'static Profiler> { + // SAFETY: the `get` access is a thread-safe API, and the PROFILER is + // not being mutated outside single-threaded phases such as minit and + // mshutdown. + unsafe { PROFILER.0.get() } + } + + /// Begins the shutdown process. To complete it, call [Profiler::shutdown]. + /// Note that you must call [Profiler::shutdown] afterward; it's two + /// parts of the same operation. It's split so you (or other extensions) + /// can do something while the other threads finish up. + pub fn stop(timeout: Duration) { + // SAFETY: the `get_mut` access is a thread-safe API, and the PROFILER + // is not being mutated outside single-threaded phases such as minit + // and mshutdown. + if let Some(profiler) = unsafe { PROFILER.0.get_mut() } { + profiler.join_and_drop_sender(timeout); + } + } + + /// Completes the shutdown process; to start it, call [Profiler::stop] + /// before calling [Profiler::shutdown]. + /// Note the timeout is per thread, and there may be multiple threads. + /// + /// Safety: only safe to be called in `SHUTDOWN`/`MSHUTDOWN` phase + pub fn shutdown(timeout: Duration) { + // SAFETY: the `take` access is a thread-safe API, and the PROFILER is + // not being mutated outside single-threaded phases such as minit and + // mshutdown. + if let Some(profiler) = unsafe { PROFILER.0.take() } { + profiler.join_collector_and_uploader(timeout); + } + } + + /// Throws away the profiler and moves it to uninitialized. + /// + /// In a forking situation, the currently active profiler may not be valid + /// because it has join handles and other state shared by other threads, + /// and threads are not copied when the process is forked. + /// Additionally, if we've hit certain other issues like not being able to + /// determine the return type of the pcntl_fork function, we don't know if + /// we're the parent or child. + /// So, we throw away the current profiler and forget it, which avoids + /// running the destructor. Yes, this will leak some memory. + /// + /// # Safety + /// Must be called when no other thread is using the PROFILER object. That + /// includes this thread in some kind of recursive manner. + pub unsafe fn kill() { + // SAFETY: see this function's safety conditions. + if let Some(mut profiler) = PROFILER.0.take() { + // Drop some things to reduce memory. + profiler.interrupt_manager = Arc::new(InterruptManager::new()); + profiler.message_sender = crossbeam_channel::bounded(0).0; + profiler.upload_sender = crossbeam_channel::bounded(0).0; + + // But we're not 100% sure everything is safe to drop, notably the + // join handles, so we leak the rest. + forget(profiler) + } + } +} diff --git a/profiling/src/timeline.rs b/profiling/src/timeline.rs index 596c887a14..79b7b4a47c 100644 --- a/profiling/src/timeline.rs +++ b/profiling/src/timeline.rs @@ -1,4 +1,4 @@ -use crate::profiling::Profiler; +use crate::profiling::SystemProfiler; use crate::zend::{ self, zai_str_from_zstr, zend_execute_data, zend_get_executed_filename_ex, zval, InternalFunctionHandler, @@ -81,7 +81,7 @@ fn sleeping_fn( return; } - if let Some(profiler) = Profiler::get() { + if let Some(profiler) = SystemProfiler::get() { // Safety: `unwrap` can be unchecked, as we checked for `is_err()` let now = unsafe { now.unwrap_unchecked().as_nanos() } as i64; let duration = duration.as_nanos() as i64; @@ -218,7 +218,7 @@ pub unsafe fn timeline_rinit() { return; } - if let Some(profiler) = Profiler::get() { + if let Some(profiler) = SystemProfiler::get() { profiler.collect_idle( // Safety: checked for `is_err()` above SystemTime::now() @@ -277,7 +277,7 @@ pub(crate) unsafe fn timeline_mshutdown() { return; } - if let Some(profiler) = Profiler::get() { + if let Some(profiler) = SystemProfiler::get() { profiler.collect_idle( // Safety: checked for `is_err()` above SystemTime::now() @@ -341,7 +341,7 @@ unsafe extern "C" fn ddog_php_prof_compile_string( duration.as_nanos(), ); - if let Some(profiler) = Profiler::get() { + if let Some(profiler) = SystemProfiler::get() { profiler.collect_compile_string( // Safety: checked for `is_err()` above now.unwrap().as_nanos() as i64, @@ -407,7 +407,7 @@ unsafe extern "C" fn ddog_php_prof_compile_file( duration.as_nanos(), ); - if let Some(profiler) = Profiler::get() { + if let Some(profiler) = SystemProfiler::get() { profiler.collect_compile_file( // Safety: checked for `is_err()` above now.unwrap().as_nanos() as i64, @@ -477,7 +477,7 @@ unsafe extern "C" fn ddog_php_prof_gc_collect_cycles() -> i32 { duration.as_nanos() ); - if let Some(profiler) = Profiler::get() { + if let Some(profiler) = SystemProfiler::get() { cfg_if::cfg_if! { if #[cfg(php_gc_status)] { profiler.collect_garbage_collection( diff --git a/profiling/src/wall_time.rs b/profiling/src/wall_time.rs index 42f7277843..f4711d7b59 100644 --- a/profiling/src/wall_time.rs +++ b/profiling/src/wall_time.rs @@ -5,7 +5,7 @@ use crate::bindings::{ zend_execute_data, zend_execute_internal, zend_interrupt_function, zval, VmInterruptFn, ZEND_ACC_CALL_VIA_TRAMPOLINE, }; -use crate::{profiling::Profiler, zend, REQUEST_LOCALS}; +use crate::{profiling::SystemProfiler, zend, REQUEST_LOCALS}; use std::mem::MaybeUninit; use std::sync::atomic::Ordering; @@ -55,7 +55,7 @@ pub extern "C" fn ddog_php_prof_interrupt_function(execute_data: *mut zend_execu return; } - if let Some(profiler) = Profiler::get() { + if let Some(profiler) = SystemProfiler::get() { // Safety: execute_data was provided by the engine, and the profiler doesn't mutate it. profiler.collect_time(execute_data, interrupt_count); }