From 3916d91c9909ec3130d2c858febf734d8452681b Mon Sep 17 00:00:00 2001
From: Bernhard Stoeckner <bstoeckner@nvidia.com>
Date: Mon, 25 Sep 2023 18:42:44 +0200
Subject: [PATCH] 535.104.12

---
 CHANGELOG.md                                  |   2 +
 README.md                                     |  12 +-
 kernel-open/Kbuild                            |   2 +-
 kernel-open/conftest.sh                       |  20 +-
 kernel-open/nvidia-uvm/nvidia-uvm.Kbuild      |   2 +-
 kernel-open/nvidia-uvm/uvm_ats_sva.h          |  16 +-
 src/common/inc/nvBldVer.h                     |  16 +-
 src/common/inc/nvUnixVersion.h                |   2 +-
 .../inc/swref/published/hopper/gh100/dev_fb.h |  23 ++-
 .../swref/published/hopper/gh100/dev_fbpa.h   |  29 +++
 .../swref/published/hopper/gh100/dev_ltc.h    |  33 ++++
 .../swref/published/hopper/gh100/dev_nv_xpl.h |  52 ++++++
 .../published/hopper/gh100/dev_xtl_ep_pri.h   |   3 +
 .../swref/published/hopper/gh100/hwproject.h  |   6 +
 .../published/hopper/gh100/pri_nv_xal_ep.h    |  12 ++
 src/common/nvswitch/kernel/ls10/link_ls10.c   |  23 ++-
 src/common/nvswitch/kernel/nvswitch.c         |  54 ++++--
 src/common/sdk/nvidia/inc/nverror.h           |   3 +-
 src/nvidia/generated/g_gpu_nvoc.c             |  11 ++
 src/nvidia/generated/g_gpu_nvoc.h             |  13 ++
 src/nvidia/generated/g_kern_mem_sys_nvoc.c    |  22 +++
 src/nvidia/generated/g_kern_mem_sys_nvoc.h    |  26 +++
 src/nvidia/generated/g_nv_name_released.h     |   4 +
 .../kernel/gpu/arch/hopper/kern_gpu_gh100.c   |  23 +++
 src/nvidia/src/kernel/gpu/gpu.c               |  13 +-
 .../gpu/gsp/arch/hopper/kernel_gsp_gh100.c    |  24 ++-
 .../gpu/gsp/arch/turing/kernel_gsp_tu102.c    |   8 +-
 .../mem_sys/arch/hopper/kern_mem_sys_gh100.c  | 176 +++++++++++++++++-
 src/nvidia/src/kernel/rmapi/control.c         |   2 +-
 .../libraries/nvport/memory/memory_tracking.c |  12 ++
 version.mk                                    |   2 +-
 31 files changed, 582 insertions(+), 64 deletions(-)
 create mode 100644 src/common/inc/swref/published/hopper/gh100/dev_fbpa.h
 create mode 100644 src/common/inc/swref/published/hopper/gh100/dev_ltc.h
 create mode 100644 src/common/inc/swref/published/hopper/gh100/dev_nv_xpl.h

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 31922eda6d..2d8f3905df 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,8 @@
 
 ## Release 535 Entries
 
+### [535.104.12] 2023-09-25
+
 ### [535.104.05] 2023-08-22
 
 ### [535.98] 2023-08-08
diff --git a/README.md b/README.md
index e90a241fea..5282e80d59 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # NVIDIA Linux Open GPU Kernel Module Source
 
 This is the source release of the NVIDIA Linux open GPU kernel modules,
-version 535.104.05.
+version 535.104.12.
 
 
 ## How to Build
@@ -17,7 +17,7 @@ as root:
 
 Note that the kernel modules built here must be used with GSP
 firmware and user-space NVIDIA GPU driver components from a corresponding
-535.104.05 driver release.  This can be achieved by installing
+535.104.12 driver release.  This can be achieved by installing
 the NVIDIA GPU driver from the .run file using the `--no-kernel-modules`
 option.  E.g.,
 
@@ -180,7 +180,7 @@ software applications.
 ## Compatible GPUs
 
 The open-gpu-kernel-modules can be used on any Turing or later GPU
-(see the table below). However, in the 535.104.05 release,
+(see the table below). However, in the 535.104.12 release,
 GeForce and Workstation support is still considered alpha-quality.
 
 To enable use of the open kernel modules on GeForce and Workstation GPUs,
@@ -188,7 +188,7 @@ set the "NVreg_OpenRmEnableUnsupportedGpus" nvidia.ko kernel module
 parameter to 1. For more details, see the NVIDIA GPU driver end user
 README here:
 
-https://us.download.nvidia.com/XFree86/Linux-x86_64/535.104.05/README/kernel_open.html
+https://us.download.nvidia.com/XFree86/Linux-x86_64/535.104.12/README/kernel_open.html
 
 In the below table, if three IDs are listed, the first is the PCI Device 
 ID, the second is the PCI Subsystem Vendor ID, and the third is the PCI
@@ -856,6 +856,10 @@ Subsystem Device ID.
 | NVIDIA RTX 4000 SFF Ada Generation              | 27B0 103C 16FA |
 | NVIDIA RTX 4000 SFF Ada Generation              | 27B0 10DE 16FA |
 | NVIDIA RTX 4000 SFF Ada Generation              | 27B0 17AA 16FA |
+| NVIDIA RTX 4500 Ada Generation                  | 27B1 1028 180C |
+| NVIDIA RTX 4500 Ada Generation                  | 27B1 103C 180C |
+| NVIDIA RTX 4500 Ada Generation                  | 27B1 10DE 180C |
+| NVIDIA RTX 4500 Ada Generation                  | 27B1 17AA 180C |
 | NVIDIA RTX 4000 Ada Generation                  | 27B2 1028 181B |
 | NVIDIA RTX 4000 Ada Generation                  | 27B2 103C 181B |
 | NVIDIA RTX 4000 Ada Generation                  | 27B2 10DE 181B |
diff --git a/kernel-open/Kbuild b/kernel-open/Kbuild
index c8579124c0..99aa87740b 100644
--- a/kernel-open/Kbuild
+++ b/kernel-open/Kbuild
@@ -72,7 +72,7 @@ EXTRA_CFLAGS += -I$(src)/common/inc
 EXTRA_CFLAGS += -I$(src)
 EXTRA_CFLAGS += -Wall $(DEFINES) $(INCLUDES) -Wno-cast-qual -Wno-error -Wno-format-extra-args
 EXTRA_CFLAGS += -D__KERNEL__ -DMODULE -DNVRM
-EXTRA_CFLAGS += -DNV_VERSION_STRING=\"535.104.05\"
+EXTRA_CFLAGS += -DNV_VERSION_STRING=\"535.104.12\"
 
 ifneq ($(SYSSRCHOST1X),)
  EXTRA_CFLAGS += -I$(SYSSRCHOST1X)
diff --git a/kernel-open/conftest.sh b/kernel-open/conftest.sh
index 4e3e7593ae..c8395ed20c 100755
--- a/kernel-open/conftest.sh
+++ b/kernel-open/conftest.sh
@@ -5743,23 +5743,25 @@ compile_test() {
             compile_check_conftest "$CODE" "NV_IOASID_GET_PRESENT" "" "functions"
         ;;
 
-        mm_pasid_set)
+        mm_pasid_drop)
             #
-            # Determine if mm_pasid_set() function is present
+            # Determine if mm_pasid_drop() function is present
+            #
+            # Added by commit 701fac40384f ("iommu/sva: Assign a PASID to mm
+            # on PASID allocation and free it on mm exit") in v5.18.
+            # Moved to linux/iommu.h in commit cd3891158a77 ("iommu/sva: Move
+            # PASID helpers to sva code") in v6.4.
             #
-            # mm_pasid_set() function was added by commit
-            # 701fac40384f07197b106136012804c3cae0b3de (iommu/sva: Assign a
-            # PASID to mm on PASID allocation and free it on mm exit) in v5.18.
-            # (2022-02-15).
             CODE="
             #if defined(NV_LINUX_SCHED_MM_H_PRESENT)
             #include <linux/sched/mm.h>
             #endif
-            void conftest_mm_pasid_set(void) {
-                mm_pasid_set();
+            #include <linux/iommu.h>
+            void conftest_mm_pasid_drop(void) {
+                mm_pasid_drop();
             }"
 
-            compile_check_conftest "$CODE" "NV_MM_PASID_SET_PRESENT" "" "functions"
+            compile_check_conftest "$CODE" "NV_MM_PASID_DROP_PRESENT" "" "functions"
         ;;
 
         drm_crtc_state_has_no_vblank)
diff --git a/kernel-open/nvidia-uvm/nvidia-uvm.Kbuild b/kernel-open/nvidia-uvm/nvidia-uvm.Kbuild
index 73083929a8..bb29c3deb2 100644
--- a/kernel-open/nvidia-uvm/nvidia-uvm.Kbuild
+++ b/kernel-open/nvidia-uvm/nvidia-uvm.Kbuild
@@ -81,7 +81,7 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += set_memory_uc
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += set_pages_uc
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += ktime_get_raw_ts64
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += ioasid_get
-NV_CONFTEST_FUNCTION_COMPILE_TESTS += mm_pasid_set
+NV_CONFTEST_FUNCTION_COMPILE_TESTS += mm_pasid_drop
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += migrate_vma_setup
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += mmget_not_zero
 NV_CONFTEST_FUNCTION_COMPILE_TESTS += mmgrab
diff --git a/kernel-open/nvidia-uvm/uvm_ats_sva.h b/kernel-open/nvidia-uvm/uvm_ats_sva.h
index 120ecfbf51..7aecde30ea 100644
--- a/kernel-open/nvidia-uvm/uvm_ats_sva.h
+++ b/kernel-open/nvidia-uvm/uvm_ats_sva.h
@@ -32,19 +32,23 @@
 // For ATS support on aarch64, arm_smmu_sva_bind() is needed for
 // iommu_sva_bind_device() calls. Unfortunately, arm_smmu_sva_bind() is not
 // conftest-able. We instead look for the presence of ioasid_get() or
-// mm_pasid_set(). ioasid_get() was added in the same patch series as
-// arm_smmu_sva_bind() and removed in v6.0. mm_pasid_set() was added in the
+// mm_pasid_drop(). ioasid_get() was added in the same patch series as
+// arm_smmu_sva_bind() and removed in v6.0. mm_pasid_drop() was added in the
 // same patch as the removal of ioasid_get(). We assume the presence of
-// arm_smmu_sva_bind() if ioasid_get(v5.11 - v5.17) or mm_pasid_set(v5.18+) is
+// arm_smmu_sva_bind() if ioasid_get(v5.11 - v5.17) or mm_pasid_drop(v5.18+) is
 // present.
 //
 // arm_smmu_sva_bind() was added with commit
 // 32784a9562fb0518b12e9797ee2aec52214adf6f and ioasid_get() was added with
 // commit cb4789b0d19ff231ce9f73376a023341300aed96 (11/23/2020). Commit
 // 701fac40384f07197b106136012804c3cae0b3de (02/15/2022) removed ioasid_get()
-// and added mm_pasid_set().
-    #if UVM_CAN_USE_MMU_NOTIFIERS() && (defined(NV_IOASID_GET_PRESENT) || defined(NV_MM_PASID_SET_PRESENT))
-        #define UVM_ATS_SVA_SUPPORTED() 1
+// and added mm_pasid_drop().
+    #if UVM_CAN_USE_MMU_NOTIFIERS() && (defined(NV_IOASID_GET_PRESENT) || defined(NV_MM_PASID_DROP_PRESENT))
+        #if defined(CONFIG_IOMMU_SVA)
+            #define UVM_ATS_SVA_SUPPORTED() 1
+        #else
+            #define UVM_ATS_SVA_SUPPORTED() 0
+        #endif
     #else
         #define UVM_ATS_SVA_SUPPORTED() 0
     #endif
diff --git a/src/common/inc/nvBldVer.h b/src/common/inc/nvBldVer.h
index 95f49108f5..5bd52b772b 100644
--- a/src/common/inc/nvBldVer.h
+++ b/src/common/inc/nvBldVer.h
@@ -43,18 +43,18 @@
 #endif
 
 #if defined(NV_LINUX) || defined(NV_BSD) || defined(NV_SUNOS)
-#define NV_BUILD_BRANCH_VERSION         "rel/gpu_drv/r535/r537_13-260"
-#define NV_BUILD_CHANGELIST_NUM         (33206197)
+#define NV_BUILD_BRANCH_VERSION         "rel/gpu_drv/r535/r537_13-267"
+#define NV_BUILD_CHANGELIST_NUM         (33312039)
 #define NV_BUILD_TYPE                   "Official"
-#define NV_BUILD_NAME                   "rel/gpu_drv/r535/r537_13-260"
-#define NV_LAST_OFFICIAL_CHANGELIST_NUM (33206197)
+#define NV_BUILD_NAME                   "rel/gpu_drv/r535/r537_13-267"
+#define NV_LAST_OFFICIAL_CHANGELIST_NUM (33312039)
 
 #else     /* Windows builds */
-#define NV_BUILD_BRANCH_VERSION         "r537_13-1"
-#define NV_BUILD_CHANGELIST_NUM         (33194057)
+#define NV_BUILD_BRANCH_VERSION         "r537_13-7"
+#define NV_BUILD_CHANGELIST_NUM         (33274399)
 #define NV_BUILD_TYPE                   "Official"
-#define NV_BUILD_NAME                   "537.17"
-#define NV_LAST_OFFICIAL_CHANGELIST_NUM (33194057)
+#define NV_BUILD_NAME                   "537.39"
+#define NV_LAST_OFFICIAL_CHANGELIST_NUM (33274399)
 #define NV_BUILD_BRANCH_BASE_VERSION    R535
 #endif
 // End buildmeister python edited section
diff --git a/src/common/inc/nvUnixVersion.h b/src/common/inc/nvUnixVersion.h
index 29293d0114..bcfc1e198f 100644
--- a/src/common/inc/nvUnixVersion.h
+++ b/src/common/inc/nvUnixVersion.h
@@ -4,7 +4,7 @@
 #if defined(NV_LINUX) || defined(NV_BSD) || defined(NV_SUNOS) || defined(NV_VMWARE) || defined(NV_QNX) || defined(NV_INTEGRITY) || \
     (defined(RMCFG_FEATURE_PLATFORM_GSP) && RMCFG_FEATURE_PLATFORM_GSP == 1)
 
-#define NV_VERSION_STRING               "535.104.05"
+#define NV_VERSION_STRING               "535.104.12"
 
 #else
 
diff --git a/src/common/inc/swref/published/hopper/gh100/dev_fb.h b/src/common/inc/swref/published/hopper/gh100/dev_fb.h
index e40d088f00..b94b765694 100644
--- a/src/common/inc/swref/published/hopper/gh100/dev_fb.h
+++ b/src/common/inc/swref/published/hopper/gh100/dev_fb.h
@@ -20,7 +20,7 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
  */
- 
+
 #ifndef __gh100_dev_fb_h_
 #define __gh100_dev_fb_h_
 #define NV_PFB_NISO_FLUSH_SYSMEM_ADDR_SHIFT                       8 /*       */
@@ -29,4 +29,25 @@
 #define NV_PFB_FBHUB_PCIE_FLUSH_SYSMEM_ADDR_HI           0x00100A38 /* RW-4R */
 #define NV_PFB_FBHUB_PCIE_FLUSH_SYSMEM_ADDR_HI_ADR             31:0 /* RWIVF */
 #define NV_PFB_FBHUB_PCIE_FLUSH_SYSMEM_ADDR_HI_ADR_MASK  0x000FFFFF /* ----V */
+
+#define NV_PFB_PRI_MMU_L2TLB_ECC_UNCORRECTED_ERR_COUNT               0x00100E78 /* RW-4R */
+#define NV_PFB_PRI_MMU_L2TLB_ECC_UNCORRECTED_ERR_COUNT               0x00100E78 /* RW-4R */
+#define NV_PFB_PRI_MMU_L2TLB_ECC_UNCORRECTED_ERR_COUNT_TOTAL               15:0 /* RWEVF */
+#define NV_PFB_PRI_MMU_L2TLB_ECC_UNCORRECTED_ERR_COUNT_TOTAL_INIT             0 /* RWE-V */
+#define NV_PFB_PRI_MMU_L2TLB_ECC_UNCORRECTED_ERR_COUNT_UNIQUE             31:16 /* RWEVF */
+#define NV_PFB_PRI_MMU_L2TLB_ECC_UNCORRECTED_ERR_COUNT_UNIQUE_INIT            0 /* RWE-V */
+
+#define NV_PFB_PRI_MMU_HUBTLB_ECC_UNCORRECTED_ERR_COUNT               0x00100E8C /* RW-4R */
+#define NV_PFB_PRI_MMU_HUBTLB_ECC_UNCORRECTED_ERR_COUNT               0x00100E8C /* RW-4R */
+#define NV_PFB_PRI_MMU_HUBTLB_ECC_UNCORRECTED_ERR_COUNT_TOTAL               15:0 /* RWEVF */
+#define NV_PFB_PRI_MMU_HUBTLB_ECC_UNCORRECTED_ERR_COUNT_TOTAL_INIT             0 /* RWE-V */
+#define NV_PFB_PRI_MMU_HUBTLB_ECC_UNCORRECTED_ERR_COUNT_UNIQUE             31:16 /* RWEVF */
+#define NV_PFB_PRI_MMU_HUBTLB_ECC_UNCORRECTED_ERR_COUNT_UNIQUE_INIT            0 /* RWE-V */
+
+#define NV_PFB_PRI_MMU_FILLUNIT_ECC_UNCORRECTED_ERR_COUNT               0x00100EA0 /* RW-4R */
+#define NV_PFB_PRI_MMU_FILLUNIT_ECC_UNCORRECTED_ERR_COUNT               0x00100EA0 /* RW-4R */
+#define NV_PFB_PRI_MMU_FILLUNIT_ECC_UNCORRECTED_ERR_COUNT_TOTAL               15:0 /* RWEVF */
+#define NV_PFB_PRI_MMU_FILLUNIT_ECC_UNCORRECTED_ERR_COUNT_TOTAL_INIT             0 /* RWE-V */
+#define NV_PFB_PRI_MMU_FILLUNIT_ECC_UNCORRECTED_ERR_COUNT_UNIQUE             31:16 /* RWEVF */
+#define NV_PFB_PRI_MMU_FILLUNIT_ECC_UNCORRECTED_ERR_COUNT_UNIQUE_INIT            0 /* RWE-V */
 #endif // __gh100_dev_fb_h_
diff --git a/src/common/inc/swref/published/hopper/gh100/dev_fbpa.h b/src/common/inc/swref/published/hopper/gh100/dev_fbpa.h
new file mode 100644
index 0000000000..e98d154705
--- /dev/null
+++ b/src/common/inc/swref/published/hopper/gh100/dev_fbpa.h
@@ -0,0 +1,29 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __gh100_dev_fbpa_h_
+#define __gh100_dev_fbpa_h_
+
+#define NV_PFB_FBPA_0_ECC_DED_COUNT__SIZE_1               4 /*       */
+#define NV_PFB_FBPA_0_ECC_DED_COUNT(i)                   (0x009025A0+(i)*4) /* RW-4A */
+#endif // __gh100_dev_fbpa_h_
diff --git a/src/common/inc/swref/published/hopper/gh100/dev_ltc.h b/src/common/inc/swref/published/hopper/gh100/dev_ltc.h
new file mode 100644
index 0000000000..f1eec19372
--- /dev/null
+++ b/src/common/inc/swref/published/hopper/gh100/dev_ltc.h
@@ -0,0 +1,33 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __gh100_dev_ltc_h_
+#define __gh100_dev_ltc_h_
+
+#define NV_PLTCG_LTC0_LTS0_L2_CACHE_ECC_UNCORRECTED_ERR_COUNT                  0x001404f8 /* RW-4R */
+#define NV_PLTCG_LTC0_LTS0_L2_CACHE_ECC_UNCORRECTED_ERR_COUNT_TOTAL                  15:0 /* RWIVF */
+#define NV_PLTCG_LTC0_LTS0_L2_CACHE_ECC_UNCORRECTED_ERR_COUNT_TOTAL_INIT           0x0000 /* RWI-V */
+#define NV_PLTCG_LTC0_LTS0_L2_CACHE_ECC_UNCORRECTED_ERR_COUNT_UNIQUE                31:16 /* RWIVF */
+#define NV_PLTCG_LTC0_LTS0_L2_CACHE_ECC_UNCORRECTED_ERR_COUNT_UNIQUE_INIT          0x0000 /* RWI-V */
+
+#endif // __gh100_dev_ltc_h_
diff --git a/src/common/inc/swref/published/hopper/gh100/dev_nv_xpl.h b/src/common/inc/swref/published/hopper/gh100/dev_nv_xpl.h
new file mode 100644
index 0000000000..5eff7477b4
--- /dev/null
+++ b/src/common/inc/swref/published/hopper/gh100/dev_nv_xpl.h
@@ -0,0 +1,52 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __gh100_dev_nv_xpl_h_
+#define __gh100_dev_nv_xpl_h_
+#define NV_XPL_DL_ERR_COUNT_RBUF                                               0x00000a54 /* R--4R */
+#define NV_XPL_DL_ERR_COUNT_RBUF__PRIV_LEVEL_MASK                              0x00000b08 /*       */
+#define NV_XPL_DL_ERR_COUNT_RBUF_CORR_ERR                                            15:0 /* R-EVF */
+#define NV_XPL_DL_ERR_COUNT_RBUF_CORR_ERR_INIT                                     0x0000 /* R-E-V */
+#define NV_XPL_DL_ERR_COUNT_RBUF_UNCORR_ERR                                         31:16 /* R-EVF */
+#define NV_XPL_DL_ERR_COUNT_RBUF_UNCORR_ERR_INIT                                   0x0000 /* R-E-V */
+#define NV_XPL_DL_ERR_COUNT_SEQ_LUT                                            0x00000a58 /* R--4R */
+#define NV_XPL_DL_ERR_COUNT_SEQ_LUT__PRIV_LEVEL_MASK                           0x00000b08 /*       */
+#define NV_XPL_DL_ERR_COUNT_SEQ_LUT_CORR_ERR                                         15:0 /* R-EVF */
+#define NV_XPL_DL_ERR_COUNT_SEQ_LUT_CORR_ERR_INIT                                  0x0000 /* R-E-V */
+#define NV_XPL_DL_ERR_COUNT_SEQ_LUT_UNCORR_ERR                                      31:16 /* R-EVF */
+#define NV_XPL_DL_ERR_COUNT_SEQ_LUT_UNCORR_ERR_INIT                                0x0000 /* R-E-V */
+
+#define NV_XPL_DL_ERR_RESET                                                    0x00000a5c /* RW-4R */
+#define NV_XPL_DL_ERR_RESET_RBUF_CORR_ERR_COUNT                                       0:0 /* RWCVF */
+#define NV_XPL_DL_ERR_RESET_RBUF_CORR_ERR_COUNT_DONE                                  0x0 /* RWC-V */
+#define NV_XPL_DL_ERR_RESET_RBUF_CORR_ERR_COUNT_PENDING                               0x1 /* -W--T */
+#define NV_XPL_DL_ERR_RESET_SEQ_LUT_CORR_ERR_COUNT                                    1:1 /* RWCVF */
+#define NV_XPL_DL_ERR_RESET_SEQ_LUT_CORR_ERR_COUNT_DONE                               0x0 /* RWC-V */
+#define NV_XPL_DL_ERR_RESET_SEQ_LUT_CORR_ERR_COUNT_PENDING                            0x1 /* -W--T */
+#define NV_XPL_DL_ERR_RESET_RBUF_UNCORR_ERR_COUNT                                   16:16 /* RWCVF */
+#define NV_XPL_DL_ERR_RESET_RBUF_UNCORR_ERR_COUNT_DONE                                0x0 /* RWC-V */
+#define NV_XPL_DL_ERR_RESET_RBUF_UNCORR_ERR_COUNT_PENDING                             0x1 /* -W--T */
+#define NV_XPL_DL_ERR_RESET_SEQ_LUT_UNCORR_ERR_COUNT                                17:17 /* RWCVF */
+#define NV_XPL_DL_ERR_RESET_SEQ_LUT_UNCORR_ERR_COUNT_DONE                             0x0 /* RWC-V */
+#define NV_XPL_DL_ERR_RESET_SEQ_LUT_UNCORR_ERR_COUNT_PENDING                          0x1 /* -W--T */
+#endif // __gh100_dev_nv_xpl_h_
diff --git a/src/common/inc/swref/published/hopper/gh100/dev_xtl_ep_pri.h b/src/common/inc/swref/published/hopper/gh100/dev_xtl_ep_pri.h
index db96d0b6e6..eb475eef45 100644
--- a/src/common/inc/swref/published/hopper/gh100/dev_xtl_ep_pri.h
+++ b/src/common/inc/swref/published/hopper/gh100/dev_xtl_ep_pri.h
@@ -24,4 +24,7 @@
 #ifndef __gh100_dev_xtl_ep_pri_h__
 #define __gh100_dev_xtl_ep_pri_h__
 #define NV_EP_PCFGM                                                              0x92FFF:0x92000        /* RW--D */
+
+#define NV_XTL_EP_PRI_DED_ERROR_STATUS                                           0x0000043C    /* RW-4R */
+#define NV_XTL_EP_PRI_RAM_ERROR_INTR_STATUS                                      0x000003C8    /* RW-4R */
 #endif // __gh100_dev_xtl_ep_pri_h__
diff --git a/src/common/inc/swref/published/hopper/gh100/hwproject.h b/src/common/inc/swref/published/hopper/gh100/hwproject.h
index 4fda40527f..11bd952c16 100644
--- a/src/common/inc/swref/published/hopper/gh100/hwproject.h
+++ b/src/common/inc/swref/published/hopper/gh100/hwproject.h
@@ -21,3 +21,9 @@
  * DEALINGS IN THE SOFTWARE.
  */
 #define NV_CHIP_EXTENDED_SYSTEM_PHYSICAL_ADDRESS_BITS              52
+#define NV_LTC_PRI_STRIDE                            8192
+#define NV_LTS_PRI_STRIDE                             512
+#define NV_FBPA_PRI_STRIDE                      16384
+#define NV_SCAL_LITTER_NUM_FBPAS                       24
+#define NV_XPL_BASE_ADDRESS                    540672
+#define NV_XTL_BASE_ADDRESS                    593920
diff --git a/src/common/inc/swref/published/hopper/gh100/pri_nv_xal_ep.h b/src/common/inc/swref/published/hopper/gh100/pri_nv_xal_ep.h
index ff1576dbe8..897fd9623c 100644
--- a/src/common/inc/swref/published/hopper/gh100/pri_nv_xal_ep.h
+++ b/src/common/inc/swref/published/hopper/gh100/pri_nv_xal_ep.h
@@ -47,5 +47,17 @@
 #define NV_XAL_EP_INTR_0_PRI_RSP_TIMEOUT                                              3:3
 #define NV_XAL_EP_INTR_0_PRI_RSP_TIMEOUT_PENDING                                      0x1
 #define NV_XAL_EP_SCPM_PRI_DUMMY_DATA_PATTERN_INIT                             0xbadf0200
+
+#define NV_XAL_EP_REORDER_ECC_UNCORRECTED_ERR_COUNT                            0x0010f364 /* RW-4R */
+#define NV_XAL_EP_REORDER_ECC_UNCORRECTED_ERR_COUNT_TOTAL                            15:0 /* RWIUF */
+#define NV_XAL_EP_REORDER_ECC_UNCORRECTED_ERR_COUNT_TOTAL_INIT                     0x0000 /* RWI-V */
+#define NV_XAL_EP_REORDER_ECC_UNCORRECTED_ERR_COUNT_UNIQUE                          31:16 /* RWIUF */
+#define NV_XAL_EP_REORDER_ECC_UNCORRECTED_ERR_COUNT_UNIQUE_INIT                    0x0000 /* RWI-V */
+
+#define NV_XAL_EP_P2PREQ_ECC_UNCORRECTED_ERR_COUNT                             0x0010f37c /* RW-4R */
+#define NV_XAL_EP_P2PREQ_ECC_UNCORRECTED_ERR_COUNT_TOTAL                             15:0 /* RWIUF */
+#define NV_XAL_EP_P2PREQ_ECC_UNCORRECTED_ERR_COUNT_TOTAL_INIT                      0x0000 /* RWI-V */
+#define NV_XAL_EP_P2PREQ_ECC_UNCORRECTED_ERR_COUNT_UNIQUE                           31:16 /* RWIUF */
+#define NV_XAL_EP_P2PREQ_ECC_UNCORRECTED_ERR_COUNT_UNIQUE_INIT                     0x0000 /* RWI-V */
 #endif // __gh100_pri_nv_xal_ep_h__
 
diff --git a/src/common/nvswitch/kernel/ls10/link_ls10.c b/src/common/nvswitch/kernel/ls10/link_ls10.c
index bdcae93fa4..e973bd0b5e 100644
--- a/src/common/nvswitch/kernel/ls10/link_ls10.c
+++ b/src/common/nvswitch/kernel/ls10/link_ls10.c
@@ -1542,6 +1542,12 @@ nvswitch_reset_and_train_link_ls10
     nvswitch_execute_unilateral_link_shutdown_ls10(link);
     nvswitch_corelib_clear_link_state_ls10(link);
 
+    //
+    // When a link faults there could be a race between the driver requesting
+    // reset and MINION processing Emergency Shutdown. Minion will notify if
+    // such a collision happens and will deny the reset request, so try the
+    // request up to 3 times
+    //
     do
     {
         status = nvswitch_request_tl_link_state_ls10(link,
@@ -1597,15 +1603,18 @@ nvswitch_reset_and_train_link_ls10
             "%s: NvLink Reset has failed for link %d\n",
             __FUNCTION__, link->linkNumber);
 
-        // Re-register links.
-        status = nvlink_lib_register_link(device->nvlink_device, link);
-        if (status != NVL_SUCCESS)
-        {
-            nvswitch_destroy_link(link);
-            return status;
-        }
         return status;
     }
+
+    status = nvswitch_launch_ALI_link_training(device, link, NV_FALSE);
+    if (status != NVL_SUCCESS)
+    {
+        NVSWITCH_PRINT(device, ERROR,
+            "%s: NvLink failed to request ACTIVE for link %d\n",
+            __FUNCTION__, link->linkNumber);
+        return status;
+    }
+
     return NVL_SUCCESS;
 }
 
diff --git a/src/common/nvswitch/kernel/nvswitch.c b/src/common/nvswitch/kernel/nvswitch.c
index 02b68c1fc4..5a55e6947e 100644
--- a/src/common/nvswitch/kernel/nvswitch.c
+++ b/src/common/nvswitch/kernel/nvswitch.c
@@ -1345,7 +1345,6 @@ nvswitch_lib_initialize_device
     NvU8 link_num;
     nvlink_link *link = NULL;
     NvBool is_blacklisted_by_os = NV_FALSE;
-    NvU64 mode;
 
     if (!NVSWITCH_IS_DEVICE_ACCESSIBLE(device))
     {
@@ -1508,18 +1507,6 @@ nvswitch_lib_initialize_device
 
         nvswitch_reset_persistent_link_hw_state(device, link_num);
 
-        if(_nvswitch_corelib_get_dl_link_mode(link, &mode) != NVL_SUCCESS)
-        {
-            NVSWITCH_PRINT(device, ERROR, "%s: nvlipt_lnk_status: Failed to check link mode! LinkId %d\n",
-                        __FUNCTION__, link_num);
-        }
-        else if(mode == NVLINK_LINKSTATE_FAULT)
-        {
-            NVSWITCH_PRINT(device, INFO, "%s: retraining LinkId %d\n",
-                        __FUNCTION__, link_num);
-            nvswitch_reset_and_train_link(device, link);
-        }
-
     }
 
     retval = nvswitch_set_training_mode(device);
@@ -1623,6 +1610,10 @@ nvswitch_lib_post_init_device
 )
 {
     NvlStatus retval;
+    NvlStatus status;
+    NvU32     link_num;
+    NvU64     mode;
+    nvlink_link *link;
 
     if (!NVSWITCH_IS_DEVICE_INITIALIZED(device))
     {
@@ -1634,7 +1625,7 @@ nvswitch_lib_post_init_device
     {
         return retval;
     }
-    
+
     if (nvswitch_is_bios_supported(device))
     {
         retval = nvswitch_bios_get_image(device);
@@ -1670,6 +1661,41 @@ nvswitch_lib_post_init_device
         (void)nvswitch_launch_ALI(device);
     }
 
+    //
+    // There is an edge case where a hypervisor may not send the same number
+    // of resets to the switch and the GPUs, so try to re-train links that
+    // are in fault if possible
+    //
+    for (link_num=0; link_num < nvswitch_get_num_links(device); link_num++)
+    {
+        // Sanity check
+        if (!nvswitch_is_link_valid(device, link_num))
+        {
+            continue;
+        }
+
+        status = nvlink_lib_get_link(device->nvlink_device, link_num, &link);
+        if (status != NVL_SUCCESS)
+        {
+            NVSWITCH_PRINT(device, ERROR, "%s: Failed to get link for LinkId %d\n",
+                        __FUNCTION__, link_num);
+            continue;
+        }
+
+        // If the link is in fault then re-train
+        if(_nvswitch_corelib_get_dl_link_mode(link, &mode) != NVL_SUCCESS)
+        {
+            NVSWITCH_PRINT(device, ERROR, "%s: nvlipt_lnk_status: Failed to check link mode! LinkId %d\n",
+                        __FUNCTION__, link_num);
+        }
+        else if(mode == NVLINK_LINKSTATE_FAULT)
+        {
+            NVSWITCH_PRINT(device, INFO, "%s: retraining LinkId %d\n",
+                        __FUNCTION__, link_num);
+            nvswitch_reset_and_train_link(device, link);
+        }
+    }
+
     return NVL_SUCCESS;
 }
 
diff --git a/src/common/sdk/nvidia/inc/nverror.h b/src/common/sdk/nvidia/inc/nverror.h
index a04ecd4f0d..dc24149639 100644
--- a/src/common/sdk/nvidia/inc/nverror.h
+++ b/src/common/sdk/nvidia/inc/nverror.h
@@ -121,7 +121,8 @@
 #define NVLINK_FLA_PRIV_ERR                             (137)
 #define ROBUST_CHANNEL_DLA_ERROR                        (138)
 #define ROBUST_CHANNEL_FAST_PATH_ERROR                  (139)
-#define ROBUST_CHANNEL_LAST_ERROR                       (ROBUST_CHANNEL_FAST_PATH_ERROR)
+#define UNRECOVERABLE_ECC_ERROR_ESCAPE                  (140)
+#define ROBUST_CHANNEL_LAST_ERROR                       (UNRECOVERABLE_ECC_ERROR_ESCAPE)
 
 
 // Indexed CE reference
diff --git a/src/nvidia/generated/g_gpu_nvoc.c b/src/nvidia/generated/g_gpu_nvoc.c
index 2be874b0a0..29896f711e 100644
--- a/src/nvidia/generated/g_gpu_nvoc.c
+++ b/src/nvidia/generated/g_gpu_nvoc.c
@@ -492,6 +492,17 @@ static void __nvoc_init_funcTable_OBJGPU_1(OBJGPU *pThis) {
         pThis->__gpuWriteFunctionConfigRegEx__ = &gpuWriteFunctionConfigRegEx_GM107;
     }
 
+    // Hal function -- gpuReadVgpuConfigReg
+    if (( ((chipHal_HalVarIdx >> 5) == 1UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0x10000000UL) )) /* ChipHal: GH100 */ 
+    {
+        pThis->__gpuReadVgpuConfigReg__ = &gpuReadVgpuConfigReg_GH100;
+    }
+    // default
+    else
+    {
+        pThis->__gpuReadVgpuConfigReg__ = &gpuReadVgpuConfigReg_46f6a7;
+    }
+
     // Hal function -- gpuGetIdInfo
     if (( ((chipHal_HalVarIdx >> 5) == 1UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0x10000000UL) )) /* ChipHal: GH100 */ 
     {
diff --git a/src/nvidia/generated/g_gpu_nvoc.h b/src/nvidia/generated/g_gpu_nvoc.h
index 988a2957e4..c2f9bc7851 100644
--- a/src/nvidia/generated/g_gpu_nvoc.h
+++ b/src/nvidia/generated/g_gpu_nvoc.h
@@ -877,6 +877,7 @@ struct OBJGPU {
     NV_STATUS (*__gpuReadFunctionConfigReg__)(struct OBJGPU *, NvU32, NvU32, NvU32 *);
     NV_STATUS (*__gpuWriteFunctionConfigReg__)(struct OBJGPU *, NvU32, NvU32, NvU32);
     NV_STATUS (*__gpuWriteFunctionConfigRegEx__)(struct OBJGPU *, NvU32, NvU32, NvU32, THREAD_STATE_NODE *);
+    NV_STATUS (*__gpuReadVgpuConfigReg__)(struct OBJGPU *, NvU32, NvU32 *);
     void (*__gpuGetIdInfo__)(struct OBJGPU *);
     void (*__gpuHandleSanityCheckRegReadError__)(struct OBJGPU *, NvU32, NvU32);
     void (*__gpuHandleSecFault__)(struct OBJGPU *);
@@ -1427,6 +1428,8 @@ NV_STATUS __nvoc_objCreate_OBJGPU(OBJGPU**, Dynamic*, NvU32,
 #define gpuWriteFunctionConfigReg_HAL(pGpu, function, reg, data) gpuWriteFunctionConfigReg_DISPATCH(pGpu, function, reg, data)
 #define gpuWriteFunctionConfigRegEx(pGpu, function, reg, data, pThreadState) gpuWriteFunctionConfigRegEx_DISPATCH(pGpu, function, reg, data, pThreadState)
 #define gpuWriteFunctionConfigRegEx_HAL(pGpu, function, reg, data, pThreadState) gpuWriteFunctionConfigRegEx_DISPATCH(pGpu, function, reg, data, pThreadState)
+#define gpuReadVgpuConfigReg(pGpu, index, data) gpuReadVgpuConfigReg_DISPATCH(pGpu, index, data)
+#define gpuReadVgpuConfigReg_HAL(pGpu, index, data) gpuReadVgpuConfigReg_DISPATCH(pGpu, index, data)
 #define gpuGetIdInfo(pGpu) gpuGetIdInfo_DISPATCH(pGpu)
 #define gpuGetIdInfo_HAL(pGpu) gpuGetIdInfo_DISPATCH(pGpu)
 #define gpuHandleSanityCheckRegReadError(pGpu, addr, value) gpuHandleSanityCheckRegReadError_DISPATCH(pGpu, addr, value)
@@ -2970,6 +2973,16 @@ static inline NV_STATUS gpuWriteFunctionConfigRegEx_DISPATCH(struct OBJGPU *pGpu
     return pGpu->__gpuWriteFunctionConfigRegEx__(pGpu, function, reg, data, pThreadState);
 }
 
+NV_STATUS gpuReadVgpuConfigReg_GH100(struct OBJGPU *pGpu, NvU32 index, NvU32 *data); // GH100 variant; only declared in this header
+
+static inline NV_STATUS gpuReadVgpuConfigReg_46f6a7(struct OBJGPU *pGpu, NvU32 index, NvU32 *data) {
+    return NV_ERR_NOT_SUPPORTED; // stub: *data is never written, so callers must check the status
+}
+
+static inline NV_STATUS gpuReadVgpuConfigReg_DISPATCH(struct OBJGPU *pGpu, NvU32 index, NvU32 *data) {
+    return pGpu->__gpuReadVgpuConfigReg__(pGpu, index, data); // forwards to whichever variant the function table holds
+}
+
 void gpuGetIdInfo_GM107(struct OBJGPU *pGpu);
 
 void gpuGetIdInfo_GH100(struct OBJGPU *pGpu);
diff --git a/src/nvidia/generated/g_kern_mem_sys_nvoc.c b/src/nvidia/generated/g_kern_mem_sys_nvoc.c
index 9df9f5fce0..97b700f450 100644
--- a/src/nvidia/generated/g_kern_mem_sys_nvoc.c
+++ b/src/nvidia/generated/g_kern_mem_sys_nvoc.c
@@ -425,6 +425,28 @@ static void __nvoc_init_funcTable_KernelMemorySystem_1(KernelMemorySystem *pThis
         pThis->__kmemsysRemoveAllAtsPeers__ = &kmemsysRemoveAllAtsPeers_GV100;
     }
 
+    // Hal function -- kmemsysCheckEccCounts
+    if (( ((chipHal_HalVarIdx >> 5) == 1UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0x10000000UL) )) /* ChipHal: GH100 */ 
+    {
+        pThis->__kmemsysCheckEccCounts__ = &kmemsysCheckEccCounts_GH100;
+    }
+    // default
+    else
+    {
+        pThis->__kmemsysCheckEccCounts__ = &kmemsysCheckEccCounts_b3696a;
+    }
+
+    // Hal function -- kmemsysClearEccCounts
+    if (( ((chipHal_HalVarIdx >> 5) == 1UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0x10000000UL) )) /* ChipHal: GH100 */ 
+    {
+        pThis->__kmemsysClearEccCounts__ = &kmemsysClearEccCounts_GH100;
+    }
+    // default
+    else
+    {
+        pThis->__kmemsysClearEccCounts__ = &kmemsysClearEccCounts_56cd7a;
+    }
+
     pThis->__nvoc_base_OBJENGSTATE.__engstateConstructEngine__ = &__nvoc_thunk_KernelMemorySystem_engstateConstructEngine;
 
     pThis->__nvoc_base_OBJENGSTATE.__engstateStateInitLocked__ = &__nvoc_thunk_KernelMemorySystem_engstateStateInitLocked;
diff --git a/src/nvidia/generated/g_kern_mem_sys_nvoc.h b/src/nvidia/generated/g_kern_mem_sys_nvoc.h
index 68c61725d9..f5cdb3d468 100644
--- a/src/nvidia/generated/g_kern_mem_sys_nvoc.h
+++ b/src/nvidia/generated/g_kern_mem_sys_nvoc.h
@@ -222,6 +222,8 @@ struct KernelMemorySystem {
     void (*__kmemsysNumaRemoveAllMemory__)(OBJGPU *, struct KernelMemorySystem *);
     NV_STATUS (*__kmemsysSetupAllAtsPeers__)(OBJGPU *, struct KernelMemorySystem *);
     void (*__kmemsysRemoveAllAtsPeers__)(OBJGPU *, struct KernelMemorySystem *);
+    void (*__kmemsysCheckEccCounts__)(OBJGPU *, struct KernelMemorySystem *);
+    NV_STATUS (*__kmemsysClearEccCounts__)(OBJGPU *, struct KernelMemorySystem *);
     NV_STATUS (*__kmemsysStateLoad__)(POBJGPU, struct KernelMemorySystem *, NvU32);
     NV_STATUS (*__kmemsysStateUnload__)(POBJGPU, struct KernelMemorySystem *, NvU32);
     NV_STATUS (*__kmemsysStatePostUnload__)(POBJGPU, struct KernelMemorySystem *, NvU32);
@@ -323,6 +325,10 @@ NV_STATUS __nvoc_objCreate_KernelMemorySystem(KernelMemorySystem**, Dynamic*, Nv
 #define kmemsysSetupAllAtsPeers_HAL(pGpu, pKernelMemorySystem) kmemsysSetupAllAtsPeers_DISPATCH(pGpu, pKernelMemorySystem)
 #define kmemsysRemoveAllAtsPeers(pGpu, pKernelMemorySystem) kmemsysRemoveAllAtsPeers_DISPATCH(pGpu, pKernelMemorySystem)
 #define kmemsysRemoveAllAtsPeers_HAL(pGpu, pKernelMemorySystem) kmemsysRemoveAllAtsPeers_DISPATCH(pGpu, pKernelMemorySystem)
+#define kmemsysCheckEccCounts(pGpu, pKernelMemorySystem) kmemsysCheckEccCounts_DISPATCH(pGpu, pKernelMemorySystem)
+#define kmemsysCheckEccCounts_HAL(pGpu, pKernelMemorySystem) kmemsysCheckEccCounts_DISPATCH(pGpu, pKernelMemorySystem)
+#define kmemsysClearEccCounts(pGpu, pKernelMemorySystem) kmemsysClearEccCounts_DISPATCH(pGpu, pKernelMemorySystem)
+#define kmemsysClearEccCounts_HAL(pGpu, pKernelMemorySystem) kmemsysClearEccCounts_DISPATCH(pGpu, pKernelMemorySystem)
 #define kmemsysStateLoad(pGpu, pEngstate, arg0) kmemsysStateLoad_DISPATCH(pGpu, pEngstate, arg0)
 #define kmemsysStateUnload(pGpu, pEngstate, arg0) kmemsysStateUnload_DISPATCH(pGpu, pEngstate, arg0)
 #define kmemsysStatePostUnload(pGpu, pEngstate, arg0) kmemsysStatePostUnload_DISPATCH(pGpu, pEngstate, arg0)
@@ -733,6 +739,26 @@ static inline void kmemsysRemoveAllAtsPeers_DISPATCH(OBJGPU *pGpu, struct Kernel
     pKernelMemorySystem->__kmemsysRemoveAllAtsPeers__(pGpu, pKernelMemorySystem);
 }
 
+void kmemsysCheckEccCounts_GH100(OBJGPU *pGpu, struct KernelMemorySystem *pKernelMemorySystem); // GH100 variant; only declared in this header
+
+static inline void kmemsysCheckEccCounts_b3696a(OBJGPU *pGpu, struct KernelMemorySystem *pKernelMemorySystem) {
+    return; // stub: performs no ECC check
+}
+
+static inline void kmemsysCheckEccCounts_DISPATCH(OBJGPU *pGpu, struct KernelMemorySystem *pKernelMemorySystem) {
+    pKernelMemorySystem->__kmemsysCheckEccCounts__(pGpu, pKernelMemorySystem); // dereferences pKernelMemorySystem without a NULL check
+}
+
+NV_STATUS kmemsysClearEccCounts_GH100(OBJGPU *pGpu, struct KernelMemorySystem *pKernelMemorySystem); // GH100 variant; only declared in this header
+
+static inline NV_STATUS kmemsysClearEccCounts_56cd7a(OBJGPU *pGpu, struct KernelMemorySystem *pKernelMemorySystem) {
+    return NV_OK; // stub: nothing is cleared, success is reported
+}
+
+static inline NV_STATUS kmemsysClearEccCounts_DISPATCH(OBJGPU *pGpu, struct KernelMemorySystem *pKernelMemorySystem) {
+    return pKernelMemorySystem->__kmemsysClearEccCounts__(pGpu, pKernelMemorySystem); // dereferences pKernelMemorySystem without a NULL check
+}
+
 static inline NV_STATUS kmemsysStateLoad_DISPATCH(POBJGPU pGpu, struct KernelMemorySystem *pEngstate, NvU32 arg0) {
     return pEngstate->__kmemsysStateLoad__(pGpu, pEngstate, arg0);
 }
diff --git a/src/nvidia/generated/g_nv_name_released.h b/src/nvidia/generated/g_nv_name_released.h
index 6353f38681..26f9bd0a3d 100644
--- a/src/nvidia/generated/g_nv_name_released.h
+++ b/src/nvidia/generated/g_nv_name_released.h
@@ -1007,6 +1007,10 @@ static const CHIPS_RELEASED sChipsReleased[] = {
     { 0x27B0, 0x16fa, 0x103c, "NVIDIA RTX 4000 SFF Ada Generation" },
     { 0x27B0, 0x16fa, 0x10de, "NVIDIA RTX 4000 SFF Ada Generation" },
     { 0x27B0, 0x16fa, 0x17aa, "NVIDIA RTX 4000 SFF Ada Generation" },
+    { 0x27B1, 0x180c, 0x1028, "NVIDIA RTX 4500 Ada Generation" },
+    { 0x27B1, 0x180c, 0x103c, "NVIDIA RTX 4500 Ada Generation" },
+    { 0x27B1, 0x180c, 0x10de, "NVIDIA RTX 4500 Ada Generation" },
+    { 0x27B1, 0x180c, 0x17aa, "NVIDIA RTX 4500 Ada Generation" },
     { 0x27B2, 0x181b, 0x1028, "NVIDIA RTX 4000 Ada Generation" },
     { 0x27B2, 0x181b, 0x103c, "NVIDIA RTX 4000 Ada Generation" },
     { 0x27B2, 0x181b, 0x10de, "NVIDIA RTX 4000 Ada Generation" },
diff --git a/src/nvidia/src/kernel/gpu/arch/hopper/kern_gpu_gh100.c b/src/nvidia/src/kernel/gpu/arch/hopper/kern_gpu_gh100.c
index 681c0dc340..af04f7c88a 100644
--- a/src/nvidia/src/kernel/gpu/arch/hopper/kern_gpu_gh100.c
+++ b/src/nvidia/src/kernel/gpu/arch/hopper/kern_gpu_gh100.c
@@ -32,6 +32,7 @@
 #include "published/hopper/gh100/dev_pmc.h"
 #include "published/hopper/gh100/dev_xtl_ep_pcfg_gpu.h"
 #include "published/hopper/gh100/pri_nv_xal_ep.h"
+#include "published/hopper/gh100/dev_xtl_ep_pri.h"
 
 #include "ctrl/ctrl2080/ctrl2080mc.h"
 
@@ -77,6 +78,28 @@ gpuReadBusConfigReg_GH100
     return gpuReadBusConfigCycle(pGpu, index, pData);
 }
 
+/*!
+ * @brief Fetch a PCIe config-space dword through the NV_EP_PCFGM register window
+ *
+ * @param[in]  pGpu   GPU object the register read is issued against
+ * @param[in]  index  Config-space offset, added to DEVICE_BASE(NV_EP_PCFGM)
+ * @param[out] pData  Receives the 32-bit value returned by GPU_REG_RD32
+ *
+ * @returns    NV_OK unconditionally; the value read is not validated
+ */
+NV_STATUS
+gpuReadVgpuConfigReg_GH100
+(
+    OBJGPU    *pGpu,
+    NvU32      index,
+    NvU32     *pData
+)
+{
+    const NvU32 mirrorOffset = DEVICE_BASE(NV_EP_PCFGM) + index;
+    *pData = GPU_REG_RD32(pGpu, mirrorOffset);
+    return NV_OK;
+}
+
 /*!
  * @brief Get GPU ID based on PCIE config reads.
  * Also determine other properties of the PCIE capabilities.
diff --git a/src/nvidia/src/kernel/gpu/gpu.c b/src/nvidia/src/kernel/gpu/gpu.c
index a8d164fc5c..06189d40b8 100644
--- a/src/nvidia/src/kernel/gpu/gpu.c
+++ b/src/nvidia/src/kernel/gpu/gpu.c
@@ -4941,12 +4941,19 @@ gpuReadBusConfigCycle_IMPL
     NvU8  device   = gpuGetDevice(pGpu);
     NvU8  function = 0;
 
-    if (pGpu->hPci == NULL)
+    if (IS_PASSTHRU(pGpu))
     {
-        pGpu->hPci = osPciInitHandle(domain, bus, device, function, NULL, NULL);
+        return gpuReadVgpuConfigReg_HAL(pGpu, index, pData); // propagate failure: the default HAL stub never writes *pData
     }
+    else
+    {
+        if (pGpu->hPci == NULL)
+        {
+            pGpu->hPci = osPciInitHandle(domain, bus, device, function, NULL, NULL);
+        }
 
-    *pData = osPciReadDword(pGpu->hPci, index);
+        *pData = osPciReadDword(pGpu->hPci, index);
+    }
 
     return NV_OK;
 }
diff --git a/src/nvidia/src/kernel/gpu/gsp/arch/hopper/kernel_gsp_gh100.c b/src/nvidia/src/kernel/gpu/gsp/arch/hopper/kernel_gsp_gh100.c
index 1064e84af8..22371e4a0b 100644
--- a/src/nvidia/src/kernel/gpu/gsp/arch/hopper/kernel_gsp_gh100.c
+++ b/src/nvidia/src/kernel/gpu/gsp/arch/hopper/kernel_gsp_gh100.c
@@ -29,6 +29,7 @@
 #include "gpu/conf_compute/conf_compute.h"
 #include "gpu/fsp/kern_fsp.h"
 #include "gpu/gsp/kernel_gsp.h"
+#include "gpu/mem_sys/kern_mem_sys.h"
 #include "gsp/gspifpub.h"
 #include "vgpu/rpc.h"
 
@@ -523,6 +524,7 @@ kgspBootstrapRiscvOSEarly_GH100
 {
     KernelFalcon *pKernelFalcon = staticCast(pKernelGsp, KernelFalcon);
     KernelFsp *pKernelFsp = GPU_GET_KERNEL_FSP(pGpu);
+    KernelMemorySystem *pKernelMemorySystem = GPU_GET_KERNEL_MEMORY_SYSTEM(pGpu);
     NV_STATUS     status        = NV_OK;
 
     // Only for GSP client builds
@@ -532,8 +534,16 @@ kgspBootstrapRiscvOSEarly_GH100
         return NV_ERR_NOT_SUPPORTED;
     }
 
+    // Clear ECC errors before attempting to load GSP
+    status = kmemsysClearEccCounts_HAL(pGpu, pKernelMemorySystem);
+    if (status != NV_OK)
+    {
+        NV_PRINTF(LEVEL_ERROR, "Issue clearing ECC counts! Status:0x%x\n", status);
+    }
+
     // Setup the descriptors that GSP-FMC needs to boot GSP-RM
-    NV_ASSERT_OK_OR_RETURN(kgspSetupGspFmcArgs_HAL(pGpu, pKernelGsp, pGspFw));
+    NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
+            kgspSetupGspFmcArgs_HAL(pGpu, pKernelGsp, pGspFw), exit);
 
     kgspSetupLibosInitArgs(pGpu, pKernelGsp);
 
@@ -562,7 +572,8 @@ kgspBootstrapRiscvOSEarly_GH100
     {
         NV_PRINTF(LEVEL_NOTICE, "Starting to boot GSP via FSP.\n");
         pKernelFsp->setProperty(pKernelFsp, PDB_PROP_KFSP_GSP_MODE_GSPRM, NV_TRUE);
-        NV_ASSERT_OK_OR_RETURN(kfspSendBootCommands_HAL(pGpu, pKernelFsp));
+        NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR,
+                kfspSendBootCommands_HAL(pGpu, pKernelFsp), exit);
     }
     else
     {
@@ -585,7 +596,7 @@ kgspBootstrapRiscvOSEarly_GH100
                 kfspDumpDebugState_HAL(pGpu, pKernelFsp);
             }
 
-            return status;
+            goto exit;
         }
     }
 
@@ -606,7 +617,7 @@ kgspBootstrapRiscvOSEarly_GH100
                   kflcnRegRead_HAL(pGpu, pKernelFalcon, NV_PFALCON_FALCON_MAILBOX0));
         NV_PRINTF(LEVEL_ERROR, "NV_PGSP_FALCON_MAILBOX1 = 0x%x\n",
                   kflcnRegRead_HAL(pGpu, pKernelFalcon, NV_PFALCON_FALCON_MAILBOX1));
-        return status;
+        goto exit;
     }
 
     // Start polling for libos logs now that lockdown is released
@@ -640,6 +651,11 @@ kgspBootstrapRiscvOSEarly_GH100
     NV_PRINTF(LEVEL_INFO, "GSP FW RM ready.\n");
 
 exit:
+    // If GSP fails to boot, check if there's any DED error.
+    if (status != NV_OK)
+    {
+        kmemsysCheckEccCounts_HAL(pGpu, pKernelMemorySystem);
+    }
     NV_ASSERT(status == NV_OK);
 
     return status;
diff --git a/src/nvidia/src/kernel/gpu/gsp/arch/turing/kernel_gsp_tu102.c b/src/nvidia/src/kernel/gpu/gsp/arch/turing/kernel_gsp_tu102.c
index 11baf3fe7e..b5a7543714 100644
--- a/src/nvidia/src/kernel/gpu/gsp/arch/turing/kernel_gsp_tu102.c
+++ b/src/nvidia/src/kernel/gpu/gsp/arch/turing/kernel_gsp_tu102.c
@@ -799,7 +799,7 @@ kgspHealthCheck_TU102
             objDelete(pReport);
         }
 
-        return bHealthy;
+        goto exit_health_check;
     }
 
     NvU32 mb0 = GPU_REG_RD32(pGpu, NV_PGSP_MAILBOX(0));
@@ -845,6 +845,12 @@ kgspHealthCheck_TU102
                   "********************************************************************************\n");
     }
 
+exit_health_check:
+    if (!bHealthy)
+    {
+        KernelMemorySystem *pKernelMemorySystem = GPU_GET_KERNEL_MEMORY_SYSTEM(pGpu);
+        kmemsysCheckEccCounts_HAL(pGpu, pKernelMemorySystem);
+    }
     return bHealthy;
 }
 
diff --git a/src/nvidia/src/kernel/gpu/mem_sys/arch/hopper/kern_mem_sys_gh100.c b/src/nvidia/src/kernel/gpu/mem_sys/arch/hopper/kern_mem_sys_gh100.c
index ce11a8fa74..9bbbd7786b 100644
--- a/src/nvidia/src/kernel/gpu/mem_sys/arch/hopper/kern_mem_sys_gh100.c
+++ b/src/nvidia/src/kernel/gpu/mem_sys/arch/hopper/kern_mem_sys_gh100.c
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: MIT
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -23,15 +23,24 @@
 
 #include "core/core.h"
 #include "gpu/gpu.h"
+#include "nvtypes.h"
 #include "os/os.h"
 #include "kernel/gpu/mem_sys/kern_mem_sys.h"
 #include "gpu/mem_mgr/mem_desc.h"
 #include "gpu/bus/kern_bus.h"
+#include "kernel/gpu/intr/intr.h"
+#include "nverror.h"
 
 #include "published/hopper/gh100/dev_fb.h"
+#include "published/hopper/gh100/dev_ltc.h"
+#include "published/hopper/gh100/dev_fbpa.h"
 #include "published/hopper/gh100/dev_vm.h"
 #include "published/hopper/gh100/pri_nv_xal_ep.h"
 #include "published/hopper/gh100/dev_nv_xal_addendum.h"
+#include "published/hopper/gh100/dev_nv_xpl.h"
+#include "published/hopper/gh100/dev_xtl_ep_pri.h"
+#include "published/hopper/gh100/hwproject.h"
+#include "published/ampere/ga100/dev_fb.h"
 
 NV_STATUS
 kmemsysDoCacheOp_GH100
@@ -566,3 +575,168 @@ kmemsysSwizzIdToVmmuSegmentsRange_GH100
 
     return NV_OK;
 }
+/*!
+ * Read a 32-bit register and hide PRI errors from the caller
+ * (a value carrying the PRI error signature is reported as 0)
+ */
+static NvU32
+_kmemsysReadRegAndMaskPriError
+(
+    OBJGPU *pGpu,
+    NvU32 regAddr
+)
+{
+    const NvU32 rawValue = osGpuReadReg032(pGpu, regAddr);
+
+    //
+    // A readback that equals GPU_READ_PRI_ERROR_CODE once masked with
+    // GPU_READ_PRI_ERROR_MASK is treated as a PRI error, so 0 is handed
+    // back in its place; any other value is returned untouched.
+    //
+    return ((rawValue & GPU_READ_PRI_ERROR_MASK) != GPU_READ_PRI_ERROR_CODE) ? rawValue : 0;
+}
+/*!
+ * @brief Log UNRECOVERABLE_ECC_ERROR_ESCAPE if any uncorrectable-ECC count/status register read here (DRAM, LTC, MMU, PCIE) is non-zero or the FBHUB poison interrupt is pending.
+ *        This function is not floorsweeping-aware, so reads that return a PRI error are counted as 0.
+ * @param[in] pGpu                 GPU whose registers are read
+ * @param[in] pKernelMemorySystem  Not referenced by this implementation */
+void
+kmemsysCheckEccCounts_GH100
+(
+    OBJGPU *pGpu,
+    KernelMemorySystem *pKernelMemorySystem
+)
+{
+    NvU32 dramCount = 0;
+    NvU32 mmuCount = 0;
+    NvU32 ltcCount = 0;
+    NvU32 pcieCount = 0;
+    NvU32 regVal;
+    for (NvU32 i = 0; i < NV_SCAL_LITTER_NUM_FBPAS; i++)
+    {
+        for (NvU32 j = 0; j < NV_PFB_FBPA_0_ECC_DED_COUNT__SIZE_1; j++)
+        {
+            // DRAM count read (the whole register value is summed; no field extraction is applied)
+            dramCount += _kmemsysReadRegAndMaskPriError(pGpu, NV_PFB_FBPA_0_ECC_DED_COUNT(j) + (i * NV_FBPA_PRI_STRIDE));
+
+            // LTC count read (indexed with the same i/j as the DRAM read above)
+            regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_PLTCG_LTC0_LTS0_L2_CACHE_ECC_UNCORRECTED_ERR_COUNT +
+                    (i * NV_LTC_PRI_STRIDE) + (j * NV_LTS_PRI_STRIDE));
+            ltcCount += DRF_VAL(_PLTCG_LTC0_LTS0, _L2_CACHE_ECC, _UNCORRECTED_ERR_COUNT_UNIQUE, regVal);
+        }
+    }
+
+    // L2TLB
+    regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_PFB_PRI_MMU_L2TLB_ECC_UNCORRECTED_ERR_COUNT);
+    mmuCount += DRF_VAL(_PFB_PRI_MMU, _L2TLB_ECC, _UNCORRECTED_ERR_COUNT_UNIQUE, regVal);
+
+    // HUBTLB
+    regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_PFB_PRI_MMU_HUBTLB_ECC_UNCORRECTED_ERR_COUNT);
+    mmuCount += DRF_VAL(_PFB_PRI_MMU, _HUBTLB_ECC, _UNCORRECTED_ERR_COUNT_UNIQUE, regVal);
+
+    // FILLUNIT
+    regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_PFB_PRI_MMU_FILLUNIT_ECC_UNCORRECTED_ERR_COUNT);
+    mmuCount += DRF_VAL(_PFB_PRI_MMU, _FILLUNIT_ECC, _UNCORRECTED_ERR_COUNT_UNIQUE, regVal);
+
+    // PCIE RBUF
+    regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_XPL_BASE_ADDRESS + NV_XPL_DL_ERR_COUNT_RBUF);
+    pcieCount += DRF_VAL(_XPL_DL, _ERR_COUNT_RBUF, _UNCORR_ERR, regVal);
+
+    // PCIE SEQ_LUT
+    regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_XPL_BASE_ADDRESS + NV_XPL_DL_ERR_COUNT_SEQ_LUT);
+    pcieCount += DRF_VAL(_XPL_DL, _ERR_COUNT_SEQ_LUT, _UNCORR_ERR, regVal);
+
+    // PCIE RE ORDER
+    regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_XAL_EP_REORDER_ECC_UNCORRECTED_ERR_COUNT);
+    pcieCount += DRF_VAL(_XAL_EP, _REORDER_ECC, _UNCORRECTED_ERR_COUNT_UNIQUE, regVal);
+
+    // PCIE P2PREQ
+    regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_XAL_EP_P2PREQ_ECC_UNCORRECTED_ERR_COUNT);
+    pcieCount += DRF_VAL(_XAL_EP, _P2PREQ_ECC, _UNCORRECTED_ERR_COUNT_UNIQUE, regVal);
+
+    // PCIE XTL DED error status: any non-zero value counts as one error
+    regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_XTL_BASE_ADDRESS + NV_XTL_EP_PRI_DED_ERROR_STATUS);
+    if (regVal != 0)
+    {
+        pcieCount += 1;
+    }
+
+    // PCIE XTL RAM error interrupt status: any non-zero value counts as one error
+    regVal = _kmemsysReadRegAndMaskPriError(pGpu, NV_XTL_BASE_ADDRESS + NV_XTL_EP_PRI_RAM_ERROR_INTR_STATUS);
+    if (regVal != 0)
+    {
+        pcieCount += 1;
+    }
+
+    // Any non-zero count (OR-ed, so a wrapped sum cannot hide it) or a pending poison interrupt means an ECC error has occurred.
+    if (((dramCount | ltcCount | mmuCount | pcieCount) != 0) ||
+        intrIsVectorPending_HAL(pGpu, GPU_GET_INTR(pGpu), NV_PFB_FBHUB_POISON_INTR_VECTOR_HW_INIT, NULL))
+    {
+        nvErrorLog_va((void *)pGpu, UNRECOVERABLE_ECC_ERROR_ESCAPE,
+                      "An uncorrectable ECC error detected "
+                      "(possible firmware handling failure) "
+                      "DRAM:%u, LTC:%u, MMU:%u, PCIE:%u", dramCount, ltcCount, mmuCount, pcieCount);
+    }
+}
+
+/*!
+ * @brief  Clear the uncorrectable-ECC count/status registers written below (DRAM, LTC, MMU, XAL-EP, XTL-EP, XPL); pKernelMemorySystem is not referenced.
+ * @returns NV_OK, or the gpuCheckTimeout() status when the XPL counter reset is not reported _DONE in time */
+NV_STATUS
+kmemsysClearEccCounts_GH100
+(
+    OBJGPU *pGpu,
+    KernelMemorySystem *pKernelMemorySystem
+)
+{
+    NvU32 regVal = 0;
+    RMTIMEOUT timeout;
+    NV_STATUS status = NV_OK;
+
+    gpuClearFbhubPoisonIntrForBug2924523_HAL(pGpu); // presumably clears the FBHUB poison interrupt (per its name); helper is not visible here
+    // Zero the DRAM DED counters and the LTC/LTS uncorrected counters; both register sets are indexed with the same i/j.
+    for (NvU32 i = 0; i < NV_SCAL_LITTER_NUM_FBPAS; i++)
+    {
+        for (NvU32 j = 0; j < NV_PFB_FBPA_0_ECC_DED_COUNT__SIZE_1; j++)
+        {
+            osGpuWriteReg032(pGpu, NV_PFB_FBPA_0_ECC_DED_COUNT(j) + (i * NV_FBPA_PRI_STRIDE), 0);
+            osGpuWriteReg032(pGpu, NV_PLTCG_LTC0_LTS0_L2_CACHE_ECC_UNCORRECTED_ERR_COUNT + (i * NV_LTC_PRI_STRIDE) + (j * NV_LTS_PRI_STRIDE), 0);
+        }
+    }
+
+    // Reset MMU counts (L2TLB, HUBTLB, FILLUNIT) by writing 0
+    osGpuWriteReg032(pGpu, NV_PFB_PRI_MMU_L2TLB_ECC_UNCORRECTED_ERR_COUNT, 0);
+    osGpuWriteReg032(pGpu, NV_PFB_PRI_MMU_HUBTLB_ECC_UNCORRECTED_ERR_COUNT, 0);
+    osGpuWriteReg032(pGpu, NV_PFB_PRI_MMU_FILLUNIT_ECC_UNCORRECTED_ERR_COUNT, 0);
+
+    // Reset XAL-EP counts (REORDER, P2PREQ) by writing 0
+    osGpuWriteReg032(pGpu, NV_XAL_EP_REORDER_ECC_UNCORRECTED_ERR_COUNT, 0);
+    osGpuWriteReg032(pGpu, NV_XAL_EP_P2PREQ_ECC_UNCORRECTED_ERR_COUNT, 0);
+
+    // Reset XTL-EP status registers with an all-ones write. NOTE(review): presumably write-1-to-clear; confirm against the register manual.
+    osGpuWriteReg032(pGpu, NV_XTL_BASE_ADDRESS + NV_XTL_EP_PRI_DED_ERROR_STATUS, ~0);
+    osGpuWriteReg032(pGpu, NV_XTL_BASE_ADDRESS + NV_XTL_EP_PRI_RAM_ERROR_INTR_STATUS, ~0);
+
+    // Reset XPL-EP error counters: request the RBUF and SEQ_LUT resets by writing both fields as _PENDING
+    regVal = DRF_DEF(_XPL, _DL_ERR_RESET, _RBUF_UNCORR_ERR_COUNT, _PENDING) |
+             DRF_DEF(_XPL, _DL_ERR_RESET, _SEQ_LUT_UNCORR_ERR_COUNT, _PENDING);
+    osGpuWriteReg032(pGpu, NV_XPL_BASE_ADDRESS + NV_XPL_DL_ERR_RESET, regVal);
+
+    // Wait for the error counter reset to complete: poll until both fields read _DONE or the default timeout expires
+    gpuSetTimeout(pGpu, GPU_TIMEOUT_DEFAULT, &timeout, 0);
+    for (;;)
+    {
+        status = gpuCheckTimeout(pGpu, &timeout);
+
+        regVal = osGpuReadReg032(pGpu, NV_XPL_BASE_ADDRESS + NV_XPL_DL_ERR_RESET);
+
+        if (FLD_TEST_DRF(_XPL, _DL_ERR_RESET, _RBUF_UNCORR_ERR_COUNT, _DONE, regVal) &&
+            FLD_TEST_DRF(_XPL, _DL_ERR_RESET, _SEQ_LUT_UNCORR_ERR_COUNT, _DONE, regVal))
+            break;
+        // The timeout status sampled before the read is acted on only now, so a reset seen as _DONE on the last poll still succeeds.
+        if (status != NV_OK)
+            return status;
+    }
+
+    return NV_OK;
+}
diff --git a/src/nvidia/src/kernel/rmapi/control.c b/src/nvidia/src/kernel/rmapi/control.c
index 48c7ddc179..59c3bb20b8 100644
--- a/src/nvidia/src/kernel/rmapi/control.c
+++ b/src/nvidia/src/kernel/rmapi/control.c
@@ -1013,7 +1013,7 @@ _rmapiControlWithSecInfoTlsIRQL
     NV_STATUS           status;
     THREAD_STATE_NODE   threadState;
 
-    NvU8                stackAllocator[TLS_ISR_ALLOCATOR_SIZE];
+    NvU8                stackAllocator[2*TLS_ISR_ALLOCATOR_SIZE];
     PORT_MEM_ALLOCATOR* pIsrAllocator = portMemAllocatorCreateOnExistingBlock(stackAllocator, sizeof(stackAllocator));
     tlsIsrInit(pIsrAllocator);
 
diff --git a/src/nvidia/src/libraries/nvport/memory/memory_tracking.c b/src/nvidia/src/libraries/nvport/memory/memory_tracking.c
index 4ff892230a..d30f321c42 100644
--- a/src/nvidia/src/libraries/nvport/memory/memory_tracking.c
+++ b/src/nvidia/src/libraries/nvport/memory/memory_tracking.c
@@ -1444,6 +1444,14 @@ _portMemAllocatorCreateOnExistingBlock
     pAllocator->pTracking     = NULL; // No tracking for this allocator
     pAllocator->pImpl         = (PORT_MEM_ALLOCATOR_IMPL*)(pAllocator + 1);
 
+
+    //
+    // PORT_MEM_BITVECTOR (pAllocator->pImpl) and PORT_MEM_ALLOCATOR_TRACKING (pAllocator->pImpl->tracking)
+    // are used mutually exclusively.
+    // When pAllocator->pTracking is NULL, the data in pAllocator->pImpl->tracking is not used and
+    // pBitVector occupies the same memory location instead.
+    // When pAllocator->pImpl->tracking is in use, PORT_MEM_BITVECTOR is not used.
+    //
     pBitVector = (PORT_MEM_BITVECTOR*)(pAllocator->pImpl);
     pBitVector->pSpinlock = pSpinlock;
 
@@ -1544,6 +1552,10 @@ _portMemAllocatorAllocExistingWrapper
     {
         portSyncSpinlockRelease(pSpinlock);
     }
+    if (pMem == NULL)
+    {
+         PORT_MEM_PRINT_ERROR("Memory allocation failed.\n");
+    }
     return pMem;
 }
 
diff --git a/version.mk b/version.mk
index 6f8caf48f5..65f58f6dee 100644
--- a/version.mk
+++ b/version.mk
@@ -1,4 +1,4 @@
-NVIDIA_VERSION = 535.104.05
+NVIDIA_VERSION = 535.104.12
 
 # This file.
 VERSION_MK_FILE := $(lastword $(MAKEFILE_LIST))