From d984905879b75fdad9c242c3854c2f2ceb3a452f Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Tue, 17 Sep 2024 14:55:57 -0500 Subject: [PATCH] Single rank profiling (#288) * Make only local master do energy profiling. * Use ZES to query devices in order to get around affinity masks. * Use ZES for drivers as well. * set ZES * Update ze/tracer_ze_helpers.include.c Co-authored-by: Brice Videau * Update ze/tracer_ze_helpers.include.c Co-authored-by: Brice Videau * Update xprof/xprof.rb.in --------- Co-authored-by: Brice Videau Co-authored-by: Thomas Applencourt --- xprof/xprof.rb.in | 10 +++++--- ze/tracer_ze_helpers.include.c | 46 ++++++++++++++++------------------ 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/xprof/xprof.rb.in b/xprof/xprof.rb.in index a21587d0..509db769 100755 --- a/xprof/xprof.rb.in +++ b/xprof/xprof.rb.in @@ -345,8 +345,7 @@ end def sampling? return false unless OPTIONS[:sample] - - env_fetch_first('LTTNG_UST_SAMPLING_MASTER_ONLY', default: '0') == '0' || mpi_local_master? + env_fetch_first('LTTNG_UST_SAMPLING_MASTER_ONLY', default: '1') == '0' || mpi_local_master? end def env_tracers @@ -398,11 +397,16 @@ def env_tracers end # Sample + # Currently the same `so` does the tracing, and the sampling + # This means that if the local rank is not part of the `traced-ranks`, + # no sampling will be performed if sampling? 
LOGGER.debug('Sampling Enabled') h['LTTNG_UST_SAMPLING'] = 1 h['LTTNG_UST_SAMPLING_ENERGY'] = 1 - h['ZES_ENABLE_SYSMAN'] = 1 if OPTIONS[:'backend-names'].include?('ze') + # The current only reliable way to use zes api + # is to call zesInit and set ZES_ENABLE_SYSMAN to 0 + h['ZES_ENABLE_SYSMAN'] = 0 if OPTIONS[:'backend-names'].include?('ze') end backends = [] unless need_backend diff --git a/ze/tracer_ze_helpers.include.c b/ze/tracer_ze_helpers.include.c index 292e993f..164149a9 100644 --- a/ze/tracer_ze_helpers.include.c +++ b/ze/tracer_ze_helpers.include.c @@ -793,8 +793,8 @@ static int _sampling_freq_initialized = 0; static int _sampling_pwr_initialized = 0; static int _sampling_engines_initialized = 0; // Static handles to stay throughout the execution -static ze_driver_handle_t* _sampling_hDrivers = NULL; -static ze_device_handle_t** _sampling_hDevices = NULL; +static zes_driver_handle_t* _sampling_hDrivers = NULL; +static zes_device_handle_t** _sampling_hDevices = NULL; static zes_freq_handle_t*** _sampling_hFrequencies = NULL; static zes_pwr_handle_t*** _sampling_hPowers = NULL; static zes_engine_handle_t*** _sampling_engineHandles = NULL; @@ -909,58 +909,54 @@ static void intializeEngines() { static int initializeHandles() { ze_result_t res; - const char *e = getenv("ZES_ENABLE_SYSMAN"); - if (!(e && e[0] == '1')) { - fprintf(stderr,"ZES_ENABLE_SYSMAN needs to be set!\n"); - return -1; - } -#ifdef CALL_ZEINIT - res = zeInit(ZE_INIT_FLAG_GPU_ONLY); + res = ZES_INIT_PTR(0); if (res != ZE_RESULT_SUCCESS) { - _ZE_ERROR_MSG("zeInit", res); + _ZE_ERROR_MSG("ZES_INIT_PTR", res); return -1; } -#endif // Query driver _sampling_driverCount = 0; - res = ZE_DRIVER_GET_PTR(&_sampling_driverCount, NULL); + res = ZES_DRIVER_GET_PTR(&_sampling_driverCount, NULL); if (res != ZE_RESULT_SUCCESS) { - _ZE_ERROR_MSG("1st ZE_DRIVER_GET_PTR", res); + _ZE_ERROR_MSG("1st ZES_DRIVER_GET_PTR", res); return -1; } - _sampling_hDrivers = (ze_driver_handle_t*) 
calloc(_sampling_driverCount, sizeof(ze_driver_handle_t)); - res = ZE_DRIVER_GET_PTR(&_sampling_driverCount, _sampling_hDrivers); + _sampling_hDrivers = (zes_driver_handle_t*) calloc(_sampling_driverCount, sizeof(zes_driver_handle_t)); + res = ZES_DRIVER_GET_PTR(&_sampling_driverCount, _sampling_hDrivers); if (res != ZE_RESULT_SUCCESS) { - _ZE_ERROR_MSG("2nd ZE_DRIVER_GET_PTR", res); + _ZE_ERROR_MSG("2nd ZES_DRIVER_GET_PTR", res); return -1; } _sampling_deviceCount = (uint32_t*) calloc(_sampling_driverCount, sizeof(uint32_t)); _sampling_subDeviceCount = (uint32_t**) calloc(_sampling_driverCount, sizeof(uint32_t*)); - _sampling_hDevices = (ze_device_handle_t**) calloc(_sampling_driverCount, sizeof(ze_device_handle_t*)); + _sampling_hDevices = (zes_device_handle_t**) calloc(_sampling_driverCount, sizeof(zes_device_handle_t*)); // Query device count for (uint32_t driverIdx = 0; driverIdx < _sampling_driverCount; driverIdx++) { - res = ZE_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], NULL); + res = ZES_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], NULL); if (res != ZE_RESULT_SUCCESS || _sampling_deviceCount[driverIdx] == 0) { fprintf(stderr, "ERROR: No device found!\n"); - _ZE_ERROR_MSG("1st ZE_DEVICE_GET_PTR", res); + _ZE_ERROR_MSG("1st ZES_DEVICE_GET_PTR", res); return -1; } - _sampling_hDevices[driverIdx] = (ze_device_handle_t*) calloc(_sampling_deviceCount[driverIdx], sizeof(ze_device_handle_t)); - res = ZE_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], _sampling_hDevices[driverIdx]); + _sampling_hDevices[driverIdx] = (zes_device_handle_t*) calloc(_sampling_deviceCount[driverIdx], sizeof(zes_device_handle_t)); + res = ZES_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], _sampling_hDevices[driverIdx]); if (res != ZE_RESULT_SUCCESS) { - _ZE_ERROR_MSG("2nd ZE_DEVICE_GET_PTR", res); + _ZE_ERROR_MSG("2nd ZES_DEVICE_GET_PTR", res); 
free(_sampling_hDevices[driverIdx]); return -1; } //Get no sub-devices _sampling_subDeviceCount[driverIdx] = (uint32_t*) calloc(_sampling_deviceCount[driverIdx], sizeof(uint32_t)); for (uint32_t deviceIdx = 0; deviceIdx < _sampling_deviceCount[driverIdx]; deviceIdx++) { - res = ZE_DEVICE_GET_SUB_DEVICES_PTR(_sampling_hDevices[driverIdx][deviceIdx], &_sampling_subDeviceCount[driverIdx][deviceIdx], NULL); + zes_device_properties_t deviceProperties = {0}; + deviceProperties.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES; + res = ZES_DEVICE_GET_PROPERTIES_PTR(_sampling_hDevices[driverIdx][deviceIdx], &deviceProperties); if (res != ZE_RESULT_SUCCESS) { - _ZE_ERROR_MSG("ZE_DEVICE_GET_SUB_DEVICES_PTR", res); + _ZE_ERROR_MSG("ZES_DEVICE_GET_PROPERTIES_PTR", res); _sampling_subDeviceCount[driverIdx][deviceIdx] = 0; - } + } else + _sampling_subDeviceCount[driverIdx][deviceIdx] = deviceProperties.numSubdevices; if (_sampling_subDeviceCount[driverIdx][deviceIdx] == 0) { _sampling_subDeviceCount[driverIdx][deviceIdx] = 1; }