From 798d7aba9092be1bc91f046305c636c0c8e59fee Mon Sep 17 00:00:00 2001 From: John Cortell Date: Fri, 6 Jan 2023 15:07:35 -0600 Subject: [PATCH 1/3] Error out of counter splitting if group max is zero A counter group having a counter max of zero is invalid and will ultimately result in a hang when we try to split counters into multiple passes. This is one of various scenarios that result in a hang during counter splitting; see https://github.com/GPUOpen-Tools/gpu_performance_api/issues/69 This fixes only that specific scenario. We now check that the group max isn't zero, and if it is, we give up trying to split a public counter's HW counters into multiple passes. We log an error, too. Again, this isn't a comprehensive fix for issue 69. There could be other cases of bad data that result in a hang. Issue 69 should be fixed with a pass cap limit to cover all cases. But this commit still adds value in that it flags the specific invalid GPU counter metadata in addition to avoiding the hang. Change-Id: I56d7d2043ba92c1b6088f0fdd68f5ec844e7b823 --- .../gpu_performance_api/gpu_perf_api_types.h | 1 + .../gpa_counter_scheduler_base.cc | 24 ++++++++++++++++--- .../gpa_split_counters_interfaces.h | 7 +++++- 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/include/gpu_performance_api/gpu_perf_api_types.h b/include/gpu_performance_api/gpu_perf_api_types.h index 84906087..4d2a0012 100644 --- a/include/gpu_performance_api/gpu_perf_api_types.h +++ b/include/gpu_performance_api/gpu_perf_api_types.h @@ -170,6 +170,7 @@ typedef enum kGpaStatusErrorLibAlreadyLoaded = -41, kGpaStatusErrorOtherSessionActive = -42, kGpaStatusErrorException = -43, + gGpaInvalidCounterGroupData = -44, kGpaStatusMin = kGpaStatusErrorException, kGpaStatusInternal = 256, ///< Status codes used internally within GPUPerfAPI. } GpaStatus; diff --git a/source/gpu_perf_api_counter_generator/gpa_counter_scheduler_base.cc b/source/gpu_perf_api_counter_generator/gpa_counter_scheduler_base.cc index 04734eed..2e504e9c 100644 --- a/source/gpu_perf_api_counter_generator/gpa_counter_scheduler_base.cc +++ b/source/gpu_perf_api_counter_generator/gpa_counter_scheduler_base.cc @@ -308,19 +308,37 @@ GpaStatus GpaCounterSchedulerBase::GetNumRequiredPasses(GpaUInt32* num_required_ // Add the HW groups max's. for (unsigned int i = 0; i < hw_counters->internal_counter_groups_.size(); ++i) { - max_counters_per_group.push_back(hw_counters->internal_counter_groups_[i].max_active_discrete_counters); + auto count = hw_counters->internal_counter_groups_[i].max_active_discrete_counters; + if (count == 0) + { + GPA_LOG_DEBUG_ERROR("ERROR: hardware counter group '%s' has zero for max-counters-per-group", hw_counters->internal_counter_groups_[i].name); + return gGpaInvalidCounterGroupData; + } + max_counters_per_group.push_back(count); } // Add the Additional groups max's. for (unsigned int i = 0; i < hw_counters->additional_group_count_; ++i) { - max_counters_per_group.push_back(hw_counters->additional_groups_[i].max_active_discrete_counters); + auto count = hw_counters->additional_groups_[i].max_active_discrete_counters; + if (count == 0) + { + GPA_LOG_DEBUG_ERROR("ERROR: hardware counter additional group '%s' has zero for max-counters-per-group", hw_counters->additional_groups_[i].name); + return gGpaInvalidCounterGroupData; + } + max_counters_per_group.push_back(count); } // TODO: properly handle software groups -- right now, this works because there is only ever a single group defined. if (sw_counters->group_count_ == 1) { - max_counters_per_group.push_back(DoGetNumSoftwareCounters()); + auto count = DoGetNumSoftwareCounters(); + if (count == 0) + { + GPA_LOG_DEBUG_ERROR("ERROR: software counter group has zero for max-counters-per-group"); + return gGpaInvalidCounterGroupData; + } + max_counters_per_group.push_back(count); } GpaCounterGroupAccessor accessor(hw_counters->internal_counter_groups_, diff --git a/source/gpu_perf_api_counter_generator/gpa_split_counters_interfaces.h b/source/gpu_perf_api_counter_generator/gpa_split_counters_interfaces.h index 2b0fa6e5..57c8bc58 100644 --- a/source/gpu_perf_api_counter_generator/gpa_split_counters_interfaces.h +++ b/source/gpu_perf_api_counter_generator/gpa_split_counters_interfaces.h @@ -16,10 +16,10 @@ #ifdef DEBUG_PUBLIC_COUNTER_SPLITTER #include -#include "gpu_perf_api_common/logging.h" #endif #include "gpu_perf_api_counter_generator/gpa_derived_counter.h" +#include "gpu_perf_api_common/logging.h" /// @brief Enum to represent the different SQ shader stages. enum GpaSqShaderStage @@ -390,6 +390,11 @@ class IGpaSplitCounters } unsigned int group_limit = max_counters_per_group[group_index]; + if (group_limit == 0) + { + GPA_LOG_DEBUG_ERROR("ERROR: group(%d) count limit is zero", group_index); + return false; + } return new_group_used_count <= group_limit; } From fe012c46e4a605340bd937850732748f7c708fb3 Mon Sep 17 00:00:00 2001 From: John Cortell Date: Tue, 17 Jan 2023 10:39:52 -0600 Subject: [PATCH 2/3] Address code review comments Fixed constant name and log statement --- include/gpu_performance_api/gpu_perf_api_types.h | 4 ++-- .../gpa_counter_scheduler_base.cc | 6 +++--- .../gpa_split_counters_interfaces.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/gpu_performance_api/gpu_perf_api_types.h b/include/gpu_performance_api/gpu_perf_api_types.h index 4d2a0012..7a290c05 100644 --- a/include/gpu_performance_api/gpu_perf_api_types.h +++ b/include/gpu_performance_api/gpu_perf_api_types.h @@ -170,8 +170,8 @@ typedef enum kGpaStatusErrorLibAlreadyLoaded = -41, kGpaStatusErrorOtherSessionActive = -42, kGpaStatusErrorException = -43, - gGpaInvalidCounterGroupData = -44, - kGpaStatusMin = kGpaStatusErrorException, + kGpaStatusErrorInvalidCounterGroupData = -44, + kGpaStatusMin = kGpaStatusErrorInvalidCounterGroupData, kGpaStatusInternal = 256, ///< Status codes used internally within GPUPerfAPI. } GpaStatus; diff --git a/source/gpu_perf_api_counter_generator/gpa_counter_scheduler_base.cc b/source/gpu_perf_api_counter_generator/gpa_counter_scheduler_base.cc index 2e504e9c..3ffaa2e8 100644 --- a/source/gpu_perf_api_counter_generator/gpa_counter_scheduler_base.cc +++ b/source/gpu_perf_api_counter_generator/gpa_counter_scheduler_base.cc @@ -312,7 +312,7 @@ GpaStatus GpaCounterSchedulerBase::GetNumRequiredPasses(GpaUInt32* num_required_ if (count == 0) { GPA_LOG_DEBUG_ERROR("ERROR: hardware counter group '%s' has zero for max-counters-per-group", hw_counters->internal_counter_groups_[i].name); - return gGpaInvalidCounterGroupData; + return kGpaStatusErrorInvalidCounterGroupData; } max_counters_per_group.push_back(count); } @@ -324,7 +324,7 @@ GpaStatus GpaCounterSchedulerBase::GetNumRequiredPasses(GpaUInt32* num_required_ if (count == 0) { GPA_LOG_DEBUG_ERROR("ERROR: hardware counter additional group '%s' has zero for max-counters-per-group", hw_counters->additional_groups_[i].name); - return gGpaInvalidCounterGroupData; + return kGpaStatusErrorInvalidCounterGroupData; } max_counters_per_group.push_back(count); } @@ -336,7 +336,7 @@ GpaStatus GpaCounterSchedulerBase::GetNumRequiredPasses(GpaUInt32* num_required_ if (count == 0) { GPA_LOG_DEBUG_ERROR("ERROR: software counter group has zero for max-counters-per-group"); - return gGpaInvalidCounterGroupData; + return kGpaStatusErrorInvalidCounterGroupData; } max_counters_per_group.push_back(count); } diff --git a/source/gpu_perf_api_counter_generator/gpa_split_counters_interfaces.h b/source/gpu_perf_api_counter_generator/gpa_split_counters_interfaces.h index 57c8bc58..b42359f4 100644 --- a/source/gpu_perf_api_counter_generator/gpa_split_counters_interfaces.h +++ b/source/gpu_perf_api_counter_generator/gpa_split_counters_interfaces.h @@ -392,7 +392,7 @@ class IGpaSplitCounters unsigned int group_limit = max_counters_per_group[group_index]; if (group_limit == 0) { - GPA_LOG_DEBUG_ERROR("ERROR: group(%d) count limit is zero", group_index); + GPA_LOG_DEBUG_ERROR("ERROR: group(%d) counter limit is zero", group_index); return false; } From 3118bfd0ab6924894cf486829a759d7567b431c4 Mon Sep 17 00:00:00 2001 From: John Cortell Date: Wed, 18 Jan 2023 09:56:28 -0600 Subject: [PATCH 3/3] Adjust error messages Also added missing GPA_ENUM_STRING_VAL for new error code --- source/gpu_perf_api_common/gpu_perf_api.cc | 3 ++- .../gpa_counter_scheduler_base.cc | 6 +++--- .../gpa_split_counters_interfaces.h | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/source/gpu_perf_api_common/gpu_perf_api.cc b/source/gpu_perf_api_common/gpu_perf_api.cc index ac13c6ae..641bb9da 100644 --- a/source/gpu_perf_api_common/gpu_perf_api.cc +++ b/source/gpu_perf_api_common/gpu_perf_api.cc @@ -1671,7 +1671,8 @@ static const char* kErrorString[] = { GPA_ENUM_STRING_VAL(kGpaStatusErrorTimeout, "GPA Error: Attempt to Retrieve Data Failed due to Timeout."), GPA_ENUM_STRING_VAL(kGpaStatusErrorLibAlreadyLoaded, "GPA Error: Library Is Already Loaded."), GPA_ENUM_STRING_VAL(kGpaStatusErrorOtherSessionActive, "GPA Error: Other Session Is Active."), - GPA_ENUM_STRING_VAL(kGpaStatusErrorException, "GPA Error: Exception Occurred.")}; + GPA_ENUM_STRING_VAL(kGpaStatusErrorException, "GPA Error: Exception Occurred."), + GPA_ENUM_STRING_VAL(kGpaStatusErrorInvalidCounterGroupData, "GPA Error: Counter group data is invalid.")}; /// Size of kErrorString array. static size_t kErrorStringSize = sizeof(kErrorString) / sizeof(const char*); diff --git a/source/gpu_perf_api_counter_generator/gpa_counter_scheduler_base.cc b/source/gpu_perf_api_counter_generator/gpa_counter_scheduler_base.cc index 3ffaa2e8..8712e06e 100644 --- a/source/gpu_perf_api_counter_generator/gpa_counter_scheduler_base.cc +++ b/source/gpu_perf_api_counter_generator/gpa_counter_scheduler_base.cc @@ -311,7 +311,7 @@ GpaStatus GpaCounterSchedulerBase::GetNumRequiredPasses(GpaUInt32* num_required_ auto count = hw_counters->internal_counter_groups_[i].max_active_discrete_counters; if (count == 0) { - GPA_LOG_DEBUG_ERROR("ERROR: hardware counter group '%s' has zero for max-counters-per-group", hw_counters->internal_counter_groups_[i].name); + GPA_LOG_DEBUG_ERROR("Hardware counter group '%s' has zero for max-counters-per-group.", hw_counters->internal_counter_groups_[i].name); return kGpaStatusErrorInvalidCounterGroupData; } max_counters_per_group.push_back(count); @@ -323,7 +323,7 @@ GpaStatus GpaCounterSchedulerBase::GetNumRequiredPasses(GpaUInt32* num_required_ auto count = hw_counters->additional_groups_[i].max_active_discrete_counters; if (count == 0) { - GPA_LOG_DEBUG_ERROR("ERROR: hardware counter additional group '%s' has zero for max-counters-per-group", hw_counters->additional_groups_[i].name); + GPA_LOG_DEBUG_ERROR("Hardware counter additional group '%s' has zero for max-counters-per-group.", hw_counters->additional_groups_[i].name); return kGpaStatusErrorInvalidCounterGroupData; } max_counters_per_group.push_back(count); @@ -335,7 +335,7 @@ GpaStatus GpaCounterSchedulerBase::GetNumRequiredPasses(GpaUInt32* num_required_ auto count = DoGetNumSoftwareCounters(); if (count == 0) { - GPA_LOG_DEBUG_ERROR("ERROR: software counter group has zero for max-counters-per-group"); + GPA_LOG_DEBUG_ERROR("Software counter group has zero for max-counters-per-group."); return kGpaStatusErrorInvalidCounterGroupData; } max_counters_per_group.push_back(count); diff --git a/source/gpu_perf_api_counter_generator/gpa_split_counters_interfaces.h b/source/gpu_perf_api_counter_generator/gpa_split_counters_interfaces.h index b42359f4..07c27b14 100644 --- a/source/gpu_perf_api_counter_generator/gpa_split_counters_interfaces.h +++ b/source/gpu_perf_api_counter_generator/gpa_split_counters_interfaces.h @@ -392,7 +392,7 @@ class IGpaSplitCounters unsigned int group_limit = max_counters_per_group[group_index]; if (group_limit == 0) { - GPA_LOG_DEBUG_ERROR("ERROR: group(%d) counter limit is zero", group_index); + GPA_LOG_DEBUG_ERROR("Group(%d) counter limit is zero.", group_index); return false; }