Skip to content

Commit

Permalink
Single rank profiling (#288)
Browse files Browse the repository at this point in the history
* Make only local master do energy profiling.

* Use ZES to query devices in order to get around affinity masks.

* Use ZES for drivers as well.

* set ZES

* Update ze/tracer_ze_helpers.include.c

Co-authored-by: Brice Videau <[email protected]>

* Update ze/tracer_ze_helpers.include.c

Co-authored-by: Brice Videau <[email protected]>

* Update xprof/xprof.rb.in

---------

Co-authored-by: Brice Videau <[email protected]>
Co-authored-by: Thomas Applencourt <[email protected]>
  • Loading branch information
3 people authored and Thomas Applencourt committed Oct 4, 2024
1 parent 6cd5eb6 commit d984905
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 28 deletions.
10 changes: 7 additions & 3 deletions xprof/xprof.rb.in
Original file line number Diff line number Diff line change
Expand Up @@ -345,8 +345,7 @@ end

def sampling?
return false unless OPTIONS[:sample]

env_fetch_first('LTTNG_UST_SAMPLING_MASTER_ONLY', default: '0') == '0' || mpi_local_master?
env_fetch_first('LTTNG_UST_SAMPLING_MASTER_ONLY', default: '1') == '0' || mpi_local_master?
end

def env_tracers
Expand Down Expand Up @@ -398,11 +397,16 @@ def env_tracers
end

# Sample
# Currently the same `so` does both the tracing and the sampling.
# This means that if the local rank is not part of the `traced-ranks`,
# no sampling will be performed.
if sampling?
LOGGER.debug('Sampling Enabled')
h['LTTNG_UST_SAMPLING'] = 1
h['LTTNG_UST_SAMPLING_ENERGY'] = 1
h['ZES_ENABLE_SYSMAN'] = 1 if OPTIONS[:'backend-names'].include?('ze')
# Currently, the only reliable way to use the ZES API
# is to call zesInit and set ZES_ENABLE_SYSMAN to 0
h['ZES_ENABLE_SYSMAN'] = 0 if OPTIONS[:'backend-names'].include?('ze')
end

backends = [] unless need_backend
Expand Down
46 changes: 21 additions & 25 deletions ze/tracer_ze_helpers.include.c
Original file line number Diff line number Diff line change
Expand Up @@ -793,8 +793,8 @@ static int _sampling_freq_initialized = 0;
static int _sampling_pwr_initialized = 0;
static int _sampling_engines_initialized = 0;
// Static handles to stay throughout the execution
static ze_driver_handle_t* _sampling_hDrivers = NULL;
static ze_device_handle_t** _sampling_hDevices = NULL;
static zes_driver_handle_t* _sampling_hDrivers = NULL;
static zes_device_handle_t** _sampling_hDevices = NULL;
static zes_freq_handle_t*** _sampling_hFrequencies = NULL;
static zes_pwr_handle_t*** _sampling_hPowers = NULL;
static zes_engine_handle_t*** _sampling_engineHandles = NULL;
Expand Down Expand Up @@ -909,58 +909,54 @@ static void intializeEngines() {

static int initializeHandles() {
ze_result_t res;
const char *e = getenv("ZES_ENABLE_SYSMAN");
if (!(e && e[0] == '1')) {
fprintf(stderr,"ZES_ENABLE_SYSMAN needs to be set!\n");
return -1;
}
#ifdef CALL_ZEINIT
res = zeInit(ZE_INIT_FLAG_GPU_ONLY);
res = ZES_INIT_PTR(0);
if (res != ZE_RESULT_SUCCESS) {
_ZE_ERROR_MSG("zeInit", res);
_ZE_ERROR_MSG("ZES_INIT_PTR", res);
return -1;
}
#endif

// Query driver
_sampling_driverCount = 0;
res = ZE_DRIVER_GET_PTR(&_sampling_driverCount, NULL);
res = ZES_DRIVER_GET_PTR(&_sampling_driverCount, NULL);
if (res != ZE_RESULT_SUCCESS) {
_ZE_ERROR_MSG("1st ZE_DRIVER_GET_PTR", res);
_ZE_ERROR_MSG("1st ZES_DRIVER_GET_PTR", res);
return -1;
}
_sampling_hDrivers = (ze_driver_handle_t*) calloc(_sampling_driverCount, sizeof(ze_driver_handle_t));
res = ZE_DRIVER_GET_PTR(&_sampling_driverCount, _sampling_hDrivers);
_sampling_hDrivers = (zes_driver_handle_t*) calloc(_sampling_driverCount, sizeof(zes_driver_handle_t));
res = ZES_DRIVER_GET_PTR(&_sampling_driverCount, _sampling_hDrivers);
if (res != ZE_RESULT_SUCCESS) {
_ZE_ERROR_MSG("2nd ZE_DRIVER_GET_PTR", res);
_ZE_ERROR_MSG("2nd ZES_DRIVER_GET_PTR", res);
return -1;
}
_sampling_deviceCount = (uint32_t*) calloc(_sampling_driverCount, sizeof(uint32_t));
_sampling_subDeviceCount = (uint32_t**) calloc(_sampling_driverCount, sizeof(uint32_t*));
_sampling_hDevices = (ze_device_handle_t**) calloc(_sampling_driverCount, sizeof(ze_device_handle_t*));
_sampling_hDevices = (zes_device_handle_t**) calloc(_sampling_driverCount, sizeof(zes_device_handle_t*));
// Query device count
for (uint32_t driverIdx = 0; driverIdx < _sampling_driverCount; driverIdx++) {
res = ZE_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], NULL);
res = ZES_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], NULL);
if (res != ZE_RESULT_SUCCESS || _sampling_deviceCount[driverIdx] == 0) {
fprintf(stderr, "ERROR: No device found!\n");
_ZE_ERROR_MSG("1st ZE_DEVICE_GET_PTR", res);
_ZE_ERROR_MSG("1st ZES_DEVICE_GET_PTR", res);
return -1;
}
_sampling_hDevices[driverIdx] = (ze_device_handle_t*) calloc(_sampling_deviceCount[driverIdx], sizeof(ze_device_handle_t));
res = ZE_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], _sampling_hDevices[driverIdx]);
_sampling_hDevices[driverIdx] = (zes_device_handle_t*) calloc(_sampling_deviceCount[driverIdx], sizeof(zes_device_handle_t));
res = ZES_DEVICE_GET_PTR(_sampling_hDrivers[driverIdx], &_sampling_deviceCount[driverIdx], _sampling_hDevices[driverIdx]);
if (res != ZE_RESULT_SUCCESS) {
_ZE_ERROR_MSG("2nd ZE_DEVICE_GET_PTR", res);
_ZE_ERROR_MSG("2nd ZES_DEVICE_GET_PTR", res);
free(_sampling_hDevices[driverIdx]);
return -1;
}
// Get the number of sub-devices
_sampling_subDeviceCount[driverIdx] = (uint32_t*) calloc(_sampling_deviceCount[driverIdx], sizeof(uint32_t));
for (uint32_t deviceIdx = 0; deviceIdx < _sampling_deviceCount[driverIdx]; deviceIdx++) {
res = ZE_DEVICE_GET_SUB_DEVICES_PTR(_sampling_hDevices[driverIdx][deviceIdx], &_sampling_subDeviceCount[driverIdx][deviceIdx], NULL);
zes_device_properties_t deviceProperties = {0};
deviceProperties.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES;
res = ZES_DEVICE_GET_PROPERTIES_PTR(_sampling_hDevices[driverIdx][deviceIdx], &deviceProperties);
if (res != ZE_RESULT_SUCCESS) {
_ZE_ERROR_MSG("ZE_DEVICE_GET_SUB_DEVICES_PTR", res);
_ZE_ERROR_MSG("ZES_DEVICE_GET_PROPERTIES_PTR", res);
_sampling_subDeviceCount[driverIdx][deviceIdx] = 0;
}
} else
_sampling_subDeviceCount[driverIdx][deviceIdx] = deviceProperties.numSubdevices;
if (_sampling_subDeviceCount[driverIdx][deviceIdx] == 0) {
_sampling_subDeviceCount[driverIdx][deviceIdx] = 1;
}
Expand Down

0 comments on commit d984905

Please sign in to comment.