diff --git a/.devcontainer/make_devcontainers.sh b/.devcontainer/make_devcontainers.sh
index 64b92c0..f868cc1 100755
--- a/.devcontainer/make_devcontainers.sh
+++ b/.devcontainer/make_devcontainers.sh
@@ -111,6 +111,9 @@ mv "./temp_devcontainer.json" ${base_devcontainer_file}
 # Create an array to keep track of valid subdirectory names
 valid_subdirs=()
 
+# The img folder should not be removed:
+valid_subdirs+=("img")
+
 # For each unique combination
 for combination in $combinations; do
     cuda_version=$(echo "$combination" | jq -r '.cuda')
diff --git a/ci/build_nvbench.sh b/ci/build_nvbench.sh
index cc245d3..e9ba372 100755
--- a/ci/build_nvbench.sh
+++ b/ci/build_nvbench.sh
@@ -8,6 +8,23 @@ PRESET="nvbench-ci"
 
 CMAKE_OPTIONS=""
 
+function version_lt() {
+  local lhs="${1//v/}"
+  local rhs="${2//v/}"
+  # If the versions are equal, return false
+  [ "$lhs" = "$rhs" ] && return 1
+  # If the left-hand side is less than the right-hand side, return true
+  [  "$lhs" = `echo -e "$lhs\n$rhs" | sort -V | head -n1` ]
+}
+
+# If CUDA_COMPILER is nvcc and the version < 11.3, disable CUPTI
+if [[ "$CUDA_COMPILER" == *"nvcc"* ]]; then
+  CUDA_VERSION=$(nvcc --version | grep release | sed -r 's/.*release ([0-9.]+).*/\1/')
+  if version_lt "$CUDA_VERSION" "11.3"; then
+    CMAKE_OPTIONS+=" -DNVBench_ENABLE_CUPTI=OFF "
+  fi
+fi
+
 configure_and_build_preset "NVBench" "$PRESET" "$CMAKE_OPTIONS"
 
 print_time_summary
diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 9959473..563dac0 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -47,40 +47,40 @@ msvc2022: &msvc2022 { name: 'cl', version: '14.39', exe: 'cl++' }
 # Configurations that will run for every PR
 pull_request:
   nvcc:
-    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc7,     extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF'"}
-    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc8,     extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF'"}
-    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc9,     extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF'"}
-    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *llvm9,    extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF'"}
-    - {cuda: *cuda_prev_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc7}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc8}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc9}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc10}
-    - {cuda: *cuda_curr_min, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11}
-    - {cuda: *cuda_curr_min, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm9}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm10}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm11}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm12}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm13}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm14}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc7}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc8}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc9}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc10}
-    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11}
-    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12}
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc7     }
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc8     }
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc9     }
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *llvm9    }
+    - {cuda: *cuda_prev_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11    }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc7     }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc8     }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc9     }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc10    }
+    - {cuda: *cuda_curr_min, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11    }
+    - {cuda: *cuda_curr_min, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12    }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm9    }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm10   }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm11   }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm12   }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm13   }
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm14   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc7     }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc8     }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc9     }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc10    }
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11    }
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12    }
     # Fails to compile simple input on CTK12.4. Try to add later.
-    # {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc13}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm9}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm10}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm11}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm12}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm13}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm14}
-    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm15}
-    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm16}
-    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm17}
-    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm18,   extra_build_args: "-cmake-options '-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler'"}
-    - {cuda: *cuda_curr_max, os: 'windows2022', cpu: 'amd64', compiler: *msvc2019, extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF -DNVBench_ENABLE_NVML=OFF'"}
-    - {cuda: *cuda_curr_max, os: 'windows2022', cpu: 'amd64', compiler: *msvc2022, extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF -DNVBench_ENABLE_NVML=OFF'"}
+    # {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc13    }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm9    }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm10   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm11   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm12   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm13   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm14   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm15   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm16   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm17   }
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm18,  extra_build_args: "-cmake-options '-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler'"}
+    - {cuda: *cuda_curr_max, os: 'windows2022', cpu: 'amd64', compiler: *msvc2019 }
+    - {cuda: *cuda_curr_max, os: 'windows2022', cpu: 'amd64', compiler: *msvc2022 }
diff --git a/ci/windows/build_nvbench.ps1 b/ci/windows/build_nvbench.ps1
index 1ac8bd1..7240698 100644
--- a/ci/windows/build_nvbench.ps1
+++ b/ci/windows/build_nvbench.ps1
@@ -20,7 +20,7 @@ $CMAKE_OPTIONS = ""
 
 # Append any arguments pass in on the command line
 If($ARG_CMAKE_OPTIONS -ne "") {
-    $CMAKE_OPTIONS += "$ARG_CMAKE_OPTIONS"
+    $CMAKE_OPTIONS += " $ARG_CMAKE_OPTIONS"
 }
 
 configure_and_build_preset "NVBench" "$PRESET" "$CMAKE_OPTIONS"
diff --git a/ci/windows/test_nvbench.ps1 b/ci/windows/test_nvbench.ps1
index bcd9f2c..4ee5106 100644
--- a/ci/windows/test_nvbench.ps1
+++ b/ci/windows/test_nvbench.ps1
@@ -20,7 +20,7 @@ $CMAKE_OPTIONS = ""
 
 # Append any arguments pass in on the command line
 If($ARG_CMAKE_OPTIONS -ne "") {
-    $CMAKE_OPTIONS += "$ARG_CMAKE_OPTIONS"
+    $CMAKE_OPTIONS += " $ARG_CMAKE_OPTIONS"
 }
 
 configure_and_build_preset "NVBench" "$PRESET" "$CMAKE_OPTIONS"
diff --git a/cmake/NVBenchNVML.cmake b/cmake/NVBenchNVML.cmake
index f2aadbb..4b005f3 100644
--- a/cmake/NVBenchNVML.cmake
+++ b/cmake/NVBenchNVML.cmake
@@ -1,37 +1,43 @@
-# Since this file is installed, we need to make sure that the CUDAToolkit has
-# been found by consumers:
-if (NOT TARGET CUDA::toolkit)
-  find_package(CUDAToolkit REQUIRED)
-endif()
-
-if (WIN32)
-  # The CUDA:: targets currently don't provide dll locations through the
-  # `IMPORTED_LOCATION` property, nor are they marked as `SHARED` libraries
-  # (they're currently `UNKNOWN`). This prevents the `nvbench_setup_dep_dlls`
-  # CMake function from copying the dlls to the build / install directories.
-  # This is discussed in https://gitlab.kitware.com/cmake/cmake/-/issues/22845
-  # and the other CMake issues it links to.
-  #
-  # We create a nvbench-specific target that configures the nvml interface as
-  # described here:
-  # https://gitlab.kitware.com/cmake/cmake/-/issues/22845#note_1077538
-  #
-  # Use find_file instead of find_library, which would search for a .lib file.
-  # This is also nice because find_file searches recursively (find_library
-  # does not) and some versions of CTK nest nvml.dll several directories deep
-  # under C:\Windows\System32.
-  find_file(NVBench_NVML_DLL nvml.dll REQUIRED
-    DOC "The full path to nvml.dll. Usually somewhere under C:/Windows/System32."
-    PATHS "C:/Windows/System32"
-  )
-  mark_as_advanced(NVBench_NVML_DLL)
-  add_library(nvbench::nvml SHARED IMPORTED)
-  target_link_libraries(nvbench::nvml INTERFACE CUDA::toolkit)
-  set_target_properties(nvbench::nvml PROPERTIES
-    IMPORTED_LOCATION "${NVBench_NVML_DLL}"
-    IMPORTED_IMPLIB "${CUDA_nvml_LIBRARY}"
-  )
-else()
-  # Linux is much easier...
-  add_library(nvbench::nvml ALIAS CUDA::nvml)
-endif()
+# Since this file is installed, we need to make sure that the CUDAToolkit has
+# been found by consumers:
+if (NOT TARGET CUDA::toolkit)
+  find_package(CUDAToolkit REQUIRED)
+endif()
+
+if (WIN32)
+  # The CUDA:: targets currently don't provide dll locations through the
+  # `IMPORTED_LOCATION` property, nor are they marked as `SHARED` libraries
+  # (they're currently `UNKNOWN`). This prevents the `nvbench_setup_dep_dlls`
+  # CMake function from copying the dlls to the build / install directories.
+  # This is discussed in https://gitlab.kitware.com/cmake/cmake/-/issues/22845
+  # and the other CMake issues it links to.
+  #
+  # We create a nvbench-specific target that configures the nvml interface as
+  # described here:
+  # https://gitlab.kitware.com/cmake/cmake/-/issues/22845#note_1077538
+  #
+  # Use find_file instead of find_library, which would search for a .lib file.
+  # This is also nice because find_file searches recursively (find_library
+  # does not) and some versions of CTK nest nvml.dll several directories deep
+  # under C:\Windows\System32.
+  find_file(NVBench_NVML_DLL nvml.dll
+    DOC "The full path to nvml.dll. Usually somewhere under C:/Windows/System32."
+    PATHS "C:/Windows/System32"
+  )
+  mark_as_advanced(NVBench_NVML_DLL)
+endif()
+
+if (NVBench_NVML_DLL)
+  add_library(nvbench::nvml SHARED IMPORTED)
+  target_link_libraries(nvbench::nvml INTERFACE CUDA::toolkit)
+  set_target_properties(nvbench::nvml PROPERTIES
+    IMPORTED_LOCATION "${NVBench_NVML_DLL}"
+    IMPORTED_IMPLIB "${CUDA_nvml_LIBRARY}"
+  )
+elseif(TARGET CUDA::nvml)
+  add_library(nvbench::nvml ALIAS CUDA::nvml)
+else()
+  message(FATAL_ERROR "Could not find nvml.dll or CUDA::nvml target. "
+          "Set -DNVBench_ENABLE_NVML=OFF to disable NVML support "
+          "or set -DNVBench_NVML_DLL to the full path to nvml.dll on Windows.")
+endif()
diff --git a/nvbench/main.cuh b/nvbench/main.cuh
index bcdced5..cd809ba 100644
--- a/nvbench/main.cuh
+++ b/nvbench/main.cuh
@@ -28,14 +28,135 @@
 #include <cstdlib>
 #include <iostream>
 
+// Advanced users can rebuild NVBench's `main` function using the macros in this file, or replace
+// them with customized implementations.
+
+// Customization point, called before NVBench initialization.
+#ifndef NVBENCH_MAIN_INITIALIZE_CUSTOM_PRE
+#define NVBENCH_MAIN_INITIALIZE_CUSTOM_PRE(argc, argv) []() {}()
+#endif
+
+// Customization point, called after NVBench initialization.
+#ifndef NVBENCH_MAIN_INITIALIZE_CUSTOM_POST
+#define NVBENCH_MAIN_INITIALIZE_CUSTOM_POST(argc, argv) []() {}()
+#endif
+
+// Customization point, called before NVBench parsing. Update argc/argv if needed.
+// argc/argv are the usual command line arguments types. The ARGS version of this
+// macro is a bit more convenient.
+#ifndef NVBENCH_MAIN_CUSTOM_ARGC_ARGV_HANDLER
+#define NVBENCH_MAIN_CUSTOM_ARGC_ARGV_HANDLER(argc, argv) []() {}()
+#endif
+
+// Customization point, called before NVBench parsing. Update args if needed.
+// Args is a vector of strings, each element is an argument.
+#ifndef NVBENCH_MAIN_CUSTOM_ARGS_HANDLER
+#define NVBENCH_MAIN_CUSTOM_ARGS_HANDLER(args) []() {}()
+#endif
+
+// Customization point, called before NVBench parsing.
+#ifndef NVBENCH_MAIN_PARSE_CUSTOM_PRE
+#define NVBENCH_MAIN_PARSE_CUSTOM_PRE(parser, args) []() {}()
+#endif
+
+// Customization point, called after NVBench parsing.
+#ifndef NVBENCH_MAIN_PARSE_CUSTOM_POST
+#define NVBENCH_MAIN_PARSE_CUSTOM_POST(parser) []() {}()
+#endif
+
+// Customization point, called before NVBench finalization.
+#ifndef NVBENCH_MAIN_FINALIZE_CUSTOM_PRE
+#define NVBENCH_MAIN_FINALIZE_CUSTOM_PRE() []() {}()
+#endif
+
+// Customization point, called after NVBench finalization.
+#ifndef NVBENCH_MAIN_FINALIZE_CUSTOM_POST
+#define NVBENCH_MAIN_FINALIZE_CUSTOM_POST() []() {}()
+#endif
+
+// Customization point, use to catch addition exceptions.
+#ifndef NVBENCH_MAIN_CATCH_EXCEPTIONS_CUSTOM
+#define NVBENCH_MAIN_CATCH_EXCEPTIONS_CUSTOM
+#endif
+
+/************************************ Default implementation **************************************/
+
+#ifndef NVBENCH_MAIN
 #define NVBENCH_MAIN                                                                               \
-  int main(int argc, char const *const *argv)                                                      \
+  int main(int argc, char **argv)                                                                  \
   try                                                                                              \
   {                                                                                                \
     NVBENCH_MAIN_BODY(argc, argv);                                                                 \
-    NVBENCH_CUDA_CALL(cudaDeviceReset());                                                          \
     return 0;                                                                                      \
   }                                                                                                \
+  NVBENCH_MAIN_CATCH_EXCEPTIONS_CUSTOM                                                             \
+  NVBENCH_MAIN_CATCH_EXCEPTIONS
+#endif
+
+#ifndef NVBENCH_MAIN_BODY
+#define NVBENCH_MAIN_BODY(argc, argv)                                                              \
+  NVBENCH_MAIN_INITIALIZE(argc, argv);                                                             \
+  {                                                                                                \
+    NVBENCH_MAIN_PARSE(argc, argv);                                                                \
+                                                                                                   \
+    NVBENCH_MAIN_PRINT_PREAMBLE(parser);                                                           \
+    NVBENCH_MAIN_RUN_BENCHMARKS(parser);                                                           \
+    NVBENCH_MAIN_PRINT_EPILOGUE(parser);                                                           \
+                                                                                                   \
+    NVBENCH_MAIN_PRINT_RESULTS(parser);                                                            \
+  } /* Tear down parser before finalization */                                                     \
+  NVBENCH_MAIN_FINALIZE();                                                                         \
+  return 0;
+#endif
+
+#ifndef NVBENCH_MAIN_INITIALIZE
+#define NVBENCH_MAIN_INITIALIZE(argc, argv)                                                        \
+  { /* Open a scope to ensure that the inner initialize/finalize hooks clean up in order. */       \
+    NVBENCH_MAIN_INITIALIZE_CUSTOM_PRE(argc, argv);                                                \
+    nvbench::detail::main_initialize(argc, argv);                                                  \
+    { /* Open a scope to ensure that the inner initialize/finalize hooks clean up in order. */     \
+      NVBENCH_MAIN_INITIALIZE_CUSTOM_POST(argc, argv)
+#endif
+
+#ifndef NVBENCH_MAIN_PARSE
+#define NVBENCH_MAIN_PARSE(argc, argv)                                                             \
+  NVBENCH_MAIN_CUSTOM_ARGC_ARGV_HANDLER(argc, argv);                                               \
+  std::vector<std::string> args = nvbench::detail::main_convert_args(argc, argv);                  \
+  NVBENCH_MAIN_CUSTOM_ARGS_HANDLER(args);                                                          \
+  nvbench::option_parser parser;                                                                   \
+  NVBENCH_MAIN_PARSE_CUSTOM_PRE(parser, args);                                                     \
+  parser.parse(args);                                                                              \
+  NVBENCH_MAIN_PARSE_CUSTOM_POST(parser)
+#endif
+
+#ifndef NVBENCH_MAIN_PRINT_PREAMBLE
+#define NVBENCH_MAIN_PRINT_PREAMBLE(parser) nvbench::detail::main_print_preamble(parser)
+#endif
+
+#ifndef NVBENCH_MAIN_RUN_BENCHMARKS
+#define NVBENCH_MAIN_RUN_BENCHMARKS(parser) nvbench::detail::main_run_benchmarks(parser)
+#endif
+
+#ifndef NVBENCH_MAIN_PRINT_EPILOGUE
+#define NVBENCH_MAIN_PRINT_EPILOGUE(parser) nvbench::detail::main_print_epilogue(parser)
+#endif
+
+#ifndef NVBENCH_MAIN_PRINT_RESULTS
+#define NVBENCH_MAIN_PRINT_RESULTS(parser) nvbench::detail::main_print_results(parser)
+#endif
+
+#ifndef NVBENCH_MAIN_FINALIZE
+#define NVBENCH_MAIN_FINALIZE()                                                                    \
+  NVBENCH_MAIN_FINALIZE_CUSTOM_PRE();                                                              \
+  } /* Close a scope to ensure that the inner initialize/finalize hooks clean up in order. */      \
+  nvbench::detail::main_finalize();                                                                \
+  NVBENCH_MAIN_FINALIZE_CUSTOM_POST();                                                             \
+  } /* Close a scope to ensure that the inner initialize/finalize hooks clean up in order. */      \
+  []() {}()
+#endif
+
+#ifndef NVBENCH_MAIN_CATCH_EXCEPTIONS
+#define NVBENCH_MAIN_CATCH_EXCEPTIONS                                                              \
   catch (std::exception & e)                                                                       \
   {                                                                                                \
     std::cerr << "\nNVBench encountered an error:\n\n" << e.what() << "\n";                        \
@@ -46,56 +167,87 @@
     std::cerr << "\nNVBench encountered an unknown error.\n";                                      \
     return 1;                                                                                      \
   }
-
-#ifdef NVBENCH_HAS_CUPTI
-#define NVBENCH_INITIALIZE_DRIVER_API NVBENCH_DRIVER_API_CALL(cuInit(0))
-#else
-// clang-format off
-#define NVBENCH_INITIALIZE_DRIVER_API do {} while (false)
-// clang-format on
 #endif
 
-#define NVBENCH_MAIN_PARSE(argc, argv)                                                             \
-  nvbench::option_parser parser;                                                                   \
-  parser.parse(argc, argv)
+namespace nvbench::detail
+{
 
-// See NVIDIA/NVBench#136 for CUDA_MODULE_LOADING
+inline void set_env(const char *name, const char *value)
+{
 #ifdef _MSC_VER
-#define NVBENCH_INITIALIZE_CUDA_ENV _putenv_s("CUDA_MODULE_LOADING", "EAGER")
+  _putenv_s(name, value);
 #else
-#define NVBENCH_INITIALIZE_CUDA_ENV setenv("CUDA_MODULE_LOADING", "EAGER", 1)
+  setenv(name, value, 1);
 #endif
+}
 
-#define NVBENCH_INITIALIZE_BENCHMARKS()                                                            \
-  nvbench::benchmark_manager::get().initialize()
+inline void main_initialize(int, char **)
+{
+  // See NVIDIA/NVBench#136 for CUDA_MODULE_LOADING
+  set_env("CUDA_MODULE_LOADING", "EAGER");
 
-#define NVBENCH_MAIN_BODY(argc, argv)                                                              \
-  do                                                                                               \
-  {                                                                                                \
-    NVBENCH_INITIALIZE_CUDA_ENV;                                                                   \
-    NVBENCH_INITIALIZE_DRIVER_API;                                                                 \
-    NVBENCH_INITIALIZE_BENCHMARKS();                                                               \
-    NVBENCH_MAIN_PARSE(argc, argv);                                                                \
-    auto &printer = parser.get_printer();                                                          \
-                                                                                                   \
-    printer.print_device_info();                                                                   \
-    printer.print_log_preamble();                                                                  \
-    auto &benchmarks = parser.get_benchmarks();                                                    \
-                                                                                                   \
-    std::size_t total_states = 0;                                                                  \
-    for (auto &bench_ptr : benchmarks)                                                             \
-    {                                                                                              \
-      total_states += bench_ptr->get_config_count();                                               \
-    }                                                                                              \
-    printer.set_total_state_count(total_states);                                                   \
-                                                                                                   \
-    printer.set_completed_state_count(0);                                                          \
-    for (auto &bench_ptr : benchmarks)                                                             \
-    {                                                                                              \
-      bench_ptr->set_printer(printer);                                                             \
-      bench_ptr->run();                                                                            \
-      bench_ptr->clear_printer();                                                                  \
-    }                                                                                              \
-    printer.print_log_epilogue();                                                                  \
-    printer.print_benchmark_results(benchmarks);                                                   \
-  } while (false)
+  // Initialize CUDA driver API if needed:
+#ifdef NVBENCH_HAS_CUPTI
+  NVBENCH_DRIVER_API_CALL(cuInit(0));
+#endif
+
+  // Initialize the benchmarks *after* setting up the CUDA environment:
+  nvbench::benchmark_manager::get().initialize();
+}
+
+inline std::vector<std::string> main_convert_args(int argc, char const *const *argv)
+{
+  std::vector<std::string> args;
+  for (int i = 0; i < argc; ++i)
+  {
+    args.push_back(argv[i]);
+  }
+  return args;
+}
+
+inline void main_print_preamble(option_parser &parser)
+{
+  auto &printer = parser.get_printer();
+
+  printer.print_device_info();
+  printer.print_log_preamble();
+}
+
+inline void main_run_benchmarks(option_parser &parser)
+{
+  auto &printer    = parser.get_printer();
+  auto &benchmarks = parser.get_benchmarks();
+
+  std::size_t total_states = 0;
+  for (auto &bench_ptr : benchmarks)
+  {
+    total_states += bench_ptr->get_config_count();
+  }
+
+  printer.set_completed_state_count(0);
+  printer.set_total_state_count(total_states);
+
+  for (auto &bench_ptr : benchmarks)
+  {
+    bench_ptr->set_printer(printer);
+    bench_ptr->run();
+    bench_ptr->clear_printer();
+  }
+}
+
+inline void main_print_epilogue(option_parser &parser)
+{
+  auto &printer = parser.get_printer();
+  printer.print_log_epilogue();
+}
+
+inline void main_print_results(option_parser &parser)
+{
+  auto &printer    = parser.get_printer();
+  auto &benchmarks = parser.get_benchmarks();
+  printer.print_benchmark_results(benchmarks);
+}
+
+inline void main_finalize() { NVBENCH_CUDA_CALL(cudaDeviceReset()); }
+
+} // namespace nvbench::detail
diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt
index 1535878..4a03000 100644
--- a/testing/CMakeLists.txt
+++ b/testing/CMakeLists.txt
@@ -6,6 +6,9 @@ set(test_srcs
   cpu_timer.cu
   criterion_manager.cu
   criterion_params.cu
+  custom_main_custom_args.cu
+  custom_main_custom_exceptions.cu
+  custom_main_global_state_raii.cu
   enum_type_list.cu
   entropy_criterion.cu
   float64_axis.cu
@@ -24,7 +27,12 @@ set(test_srcs
   type_list.cu
 )
 
-# Metatarget for all examples:
+# Custom arguments:
+# CTest commands+args can't be modified after creation, so we need to rely on substitution.
+set(NVBench_TEST_ARGS_nvbench.test.custom_main_custom_args "--quiet" "--my-custom-arg" "--run-once" "-d" "0")
+set(NVBench_TEST_ARGS_nvbench.test.custom_main_custom_exceptions "--quiet" "--run-once" "-d" "0")
+
+# Metatarget for all tests:
 add_custom_target(nvbench.test.all)
 add_dependencies(nvbench.all nvbench.test.all)
 
@@ -36,10 +44,14 @@ foreach(test_src IN LISTS test_srcs)
   target_link_libraries(${test_name} PRIVATE nvbench::nvbench fmt)
   set_target_properties(${test_name} PROPERTIES COMPILE_FEATURES cuda_std_17)
   nvbench_config_target(${test_name})
-  add_test(NAME ${test_name} COMMAND "$<TARGET_FILE:${test_name}>")
+  add_test(NAME ${test_name} COMMAND "$<TARGET_FILE:${test_name}>" ${NVBench_TEST_ARGS_${test_name}})
 
   add_dependencies(nvbench.test.all ${test_name})
 endforeach()
 
+set_tests_properties(nvbench.test.custom_main_custom_exceptions PROPERTIES
+  PASS_REGULAR_EXPRESSION "Custom error detected: Expected exception thrown."
+)
+
 add_subdirectory(cmake)
 add_subdirectory(device)
diff --git a/testing/custom_main_custom_args.cu b/testing/custom_main_custom_args.cu
new file mode 100644
index 0000000..f7e331e
--- /dev/null
+++ b/testing/custom_main_custom_args.cu
@@ -0,0 +1,132 @@
+/*
+ *  Copyright 2024 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/nvbench.cuh>
+#include "nvbench/cuda_call.cuh"
+
+/******************************************************************************
+ * Install custom parser.
+ * sSee <nvbench/main.cuh> for more details.
+ ******************************************************************************/
+
+//
+// Step 1: Define a custom argument handler that accepts a vector of strings.
+//          - This handler should modify the vector in place to remove any custom
+//            arguments it handles. NVbench will then parse the remaining arguments.
+//          - The handler should also update any application state needed to handle
+//            the custom arguments.
+//
+
+// User code to handle a specific argument:
+void handle_my_custom_arg();
+
+// NVBench hook for modiifying the command line arguments before parsing:
+void custom_arg_handler(std::vector<std::string> &args)
+{
+  // Handle and remove "--my-custom-arg"
+  if (auto it = std::find(args.begin(), args.end(), "--my-custom-arg"); it != args.end())
+  {
+    handle_my_custom_arg();
+    args.erase(it);
+  }
+}
+
+//
+// Step 2: Install the custom argument handler.
+//         - This is done by defining a macro that invokes the custom argument handler.
+//
+
+// Install the custom argument handler:
+// Either define this before any NVBench headers are included, or undefine and redefine:
+#undef NVBENCH_MAIN_CUSTOM_ARGS_HANDLER
+#define NVBENCH_MAIN_CUSTOM_ARGS_HANDLER(args) custom_arg_handler(args)
+
+// Step 3: Define `main`
+//
+// After installing the custom argument handler, define the main function using:
+//
+// ```
+// NVBENCH_MAIN
+// ```
+//
+// Here, this is done at the end of this file.
+
+/******************************************************************************
+ * Unit test verification:
+ ******************************************************************************/
+
+// Track whether the args are found / handled.
+bool h_custom_arg_found             = false;
+bool h_handled_on_device            = false;
+__device__ bool d_custom_arg_found  = false;
+__device__ bool d_handled_on_device = false;
+
+// Copy host values to device:
+void copy_host_state_to_device()
+{
+  NVBENCH_CUDA_CALL(cudaMemcpyToSymbol(d_custom_arg_found, &h_custom_arg_found, sizeof(bool)));
+  NVBENCH_CUDA_CALL(cudaMemcpyToSymbol(d_handled_on_device, &h_handled_on_device, sizeof(bool)));
+}
+
+// Copy device values to host:
+void copy_device_state_to_host()
+{
+  NVBENCH_CUDA_CALL(cudaMemcpyFromSymbol(&h_custom_arg_found, d_custom_arg_found, sizeof(bool)));
+  NVBENCH_CUDA_CALL(cudaMemcpyFromSymbol(&h_handled_on_device, d_handled_on_device, sizeof(bool)));
+}
+
+void handle_my_custom_arg()
+{
+  h_custom_arg_found = true;
+  copy_host_state_to_device();
+}
+
+void verify()
+{
+  copy_device_state_to_host();
+  if (!h_custom_arg_found)
+  {
+    throw std::runtime_error("Custom argument not detected.");
+  }
+  if (!h_handled_on_device)
+  {
+    throw std::runtime_error("Custom argument not handled on device.");
+  }
+}
+
+// Install a verification check to ensure the custom argument was handled.
+// Use the `PRE` finalize hook to ensure we check device state before resetting the context.
+#undef NVBENCH_MAIN_FINALIZE_CUSTOM_PRE
+#define NVBENCH_MAIN_FINALIZE_CUSTOM_PRE() verify()
+
+// Simple kernel/benchmark to make sure that the handler can successfully modify CUDA state:
+__global__ void kernel()
+{
+  if (d_custom_arg_found)
+  {
+    d_handled_on_device = true;
+  }
+}
+void bench(nvbench::state &state)
+{
+  state.exec([](nvbench::launch &) { kernel<<<1, 1>>>(); });
+}
+NVBENCH_BENCH(bench);
+
+// Define the customized main function:
+NVBENCH_MAIN
diff --git a/testing/custom_main_custom_exceptions.cu b/testing/custom_main_custom_exceptions.cu
new file mode 100644
index 0000000..b1f9b9c
--- /dev/null
+++ b/testing/custom_main_custom_exceptions.cu
@@ -0,0 +1,64 @@
+/*
+ *  Copyright 2022 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/nvbench.cuh>
+
+#include <stdexcept>
+
+/******************************************************************************
+ * Install exception handler around the NVBench main body. This is used
+ * to print helpful information when a user exception is thrown before exiting.
+ *
+ * Note that this will **NOT** be used when a benchmark throws an exception.
+ * That will fail the benchmark and note the exception, and continue
+ * execution.
+ *
+ * This is used to catch exceptions in user extensions of NVBench, things like
+ * customized initialization, command line parsing, finalization, etc. See
+ * <nvbench/main.cuh> for more details.
+ ******************************************************************************/
+
+struct user_exception : public std::runtime_error
+{
+  user_exception()
+      : std::runtime_error("Expected exception thrown.")
+  {}
+};
+
+// User code to handle user exception:
+void handle_my_exception(user_exception &e)
+{
+  std::cerr << "Custom error detected: " << e.what() << std::endl;
+  std::exit(1);
+}
+
+// Install the exception handler around the NVBench main body.
+// NVBench will have sensible defaults for common exceptions following this if no terminating catch
+// block is defined.
+// Either define this before any NVBench headers are included, or undefine and redefine.
+#undef NVBENCH_MAIN_CATCH_EXCEPTIONS_CUSTOM
+#define NVBENCH_MAIN_CATCH_EXCEPTIONS_CUSTOM                                                       \
+  catch (user_exception & e) { handle_my_exception(e); }
+
+// For testing purposes, install a argument parser that throws:
+void really_robust_argument_parser(std::vector<std::string> &) { throw user_exception(); }
+#undef NVBENCH_MAIN_CUSTOM_ARGS_HANDLER
+#define NVBENCH_MAIN_CUSTOM_ARGS_HANDLER(args) really_robust_argument_parser(args);
+
+// Define the customized main function:
+NVBENCH_MAIN
diff --git a/testing/custom_main_global_state_raii.cu b/testing/custom_main_global_state_raii.cu
new file mode 100644
index 0000000..e3584ab
--- /dev/null
+++ b/testing/custom_main_global_state_raii.cu
@@ -0,0 +1,121 @@
+/*
+ *  Copyright 2024 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/nvbench.cuh>
+
+#include <algorithm>
+#include <cstdlib>
+#include <cstdio>
+
+/******************************************************************************
+ * Test having global state that is initialized and finalized via RAII.
+ *****************************************************************************/
+
+struct raii
+{
+  const char m_ref_data[6];
+  char *m_data;
+  bool m_cuda;
+
+  const char *m_outer_data;
+  bool m_outer_cuda;
+
+  explicit raii(bool cuda, char *outer_data = nullptr, bool outer_cuda = false)
+      : m_ref_data{'a', 'b', 'c', '1', '2', '3'}
+      , m_data(nullptr)
+      , m_cuda(cuda)
+      , m_outer_data(outer_data)
+      , m_outer_cuda(outer_cuda)
+  {
+    if (m_cuda)
+    {
+      printf("(%p) RAII test: allocating device memory\n", this);
+      NVBENCH_CUDA_CALL(cudaMalloc(&m_data, 6));
+      NVBENCH_CUDA_CALL(cudaMemcpy(m_data, m_ref_data, 6, cudaMemcpyHostToDevice));
+    }
+    else
+    {
+      printf("(%p) RAII test: allocating host memory\n", this);
+      m_data = new char[6];
+      std::copy(m_ref_data, m_ref_data + 6, m_data);
+    }
+  }
+
+  ~raii()
+  {
+    this->verify();
+    if (m_cuda)
+    {
+      printf("(%p) RAII test: invalidating device memory\n", this);
+      NVBENCH_CUDA_CALL(cudaMemset(m_data, 0, 6));
+      printf("(%p) RAII test: freeing device memory\n", this);
+      NVBENCH_CUDA_CALL(cudaFree(m_data));
+    }
+    else
+    {
+      printf("(%p) RAII test: invalidating host memory\n", this);
+      std::fill(m_data, m_data + 6, '\0');
+      printf("(%p) RAII test: freeing host memory\n", this);
+      delete[] m_data;
+    }
+  }
+
+  void verify() noexcept
+  {
+    printf("(%p) RAII test: verifying instance state\n", this);
+    this->verify(m_cuda, m_data);
+    if (m_outer_data)
+    {
+      printf("(%p) RAII test: verifying outer state\n", this);
+      this->verify(m_outer_cuda, m_outer_data);
+    }
+  }
+
+  void verify(bool cuda, const char *data) noexcept
+  {
+    if (cuda)
+    {
+      char test_data[6];
+      NVBENCH_CUDA_CALL(cudaMemcpy(test_data, data, 6, cudaMemcpyDeviceToHost));
+      if (strncmp(test_data, m_ref_data, 6) != 0)
+      {
+        printf("(%p) RAII test failed: device data mismatch\n", this);
+        std::exit(1);
+      }
+    }
+    else
+    {
+      if (strncmp(data, m_ref_data, 6) != 0)
+      {
+        printf("(%p) RAII test failed: host data mismatch\n", this);
+        std::exit(1);
+      }
+    }
+  }
+};
+
+// These will be destroyed in the opposite order in which they are created:
+
+#undef NVBENCH_MAIN_INITIALIZE_CUSTOM_PRE
+#define NVBENCH_MAIN_INITIALIZE_CUSTOM_PRE(argc, argv) raii raii_outer(false);
+
+#undef NVBENCH_MAIN_INITIALIZE_CUSTOM_POST
+#define NVBENCH_MAIN_INITIALIZE_CUSTOM_POST(argc, argv)                                            \
+  [[maybe_unused]] raii raii_inner(true, raii_outer.m_data, raii_outer.m_cuda);
+
+NVBENCH_MAIN