535.129.03

NVIDIA · Oct 31, 2023 · e573018 · e573018
1 parent f59818b
commit e573018
Show file tree

Hide file tree

Showing 163 changed files with 85,969 additions and 84,472 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,8 @@
 
 ## Release 535 Entries
 
+### [535.129.03] 2023-10-31
+
 ### [535.113.01] 2023-09-21
 
 #### Fixed
@@ -50,6 +52,10 @@
 
 ## Release 525 Entries
 
+#### Fixed
+
+- Fix nvidia_p2p_get_pages(): Fix double-free in register-callback error path, [#557](https://github.com/NVIDIA/open-gpu-kernel-modules/pull/557) by @BrendanCunningham
+
 ### [525.116.04] 2023-05-09
 
 ### [525.116.03] 2023-04-25

diff --git a/README.md b/README.md
@@ -1,7 +1,7 @@
 # NVIDIA Linux Open GPU Kernel Module Source
 
 This is the source release of the NVIDIA Linux open GPU kernel modules,
-version 535.113.01.
+version 535.129.03.
 
 
 ## How to Build
@@ -17,7 +17,7 @@ as root:
 
 Note that the kernel modules built here must be used with GSP
 firmware and user-space NVIDIA GPU driver components from a corresponding
-535.113.01 driver release.  This can be achieved by installing
+535.129.03 driver release.  This can be achieved by installing
 the NVIDIA GPU driver from the .run file using the `--no-kernel-modules`
 option.  E.g.,
 
@@ -180,15 +180,15 @@ software applications.
 ## Compatible GPUs
 
 The open-gpu-kernel-modules can be used on any Turing or later GPU
-(see the table below). However, in the 535.113.01 release,
+(see the table below). However, in the 535.129.03 release,
 GeForce and Workstation support is still considered alpha-quality.
 
 To enable use of the open kernel modules on GeForce and Workstation GPUs,
 set the "NVreg_OpenRmEnableUnsupportedGpus" nvidia.ko kernel module
 parameter to 1. For more details, see the NVIDIA GPU driver end user
 README here:
 
-https://us.download.nvidia.com/XFree86/Linux-x86_64/535.113.01/README/kernel_open.html
+https://us.download.nvidia.com/XFree86/Linux-x86_64/535.129.03/README/kernel_open.html
 
 In the below table, if three IDs are listed, the first is the PCI Device 
 ID, the second is the PCI Subsystem Vendor ID, and the third is the PCI
@@ -658,13 +658,15 @@ Subsystem Device ID.
 | NVIDIA A100-SXM4-80GB                           | 20B2 10DE 147F |
 | NVIDIA A100-SXM4-80GB                           | 20B2 10DE 1622 |
 | NVIDIA A100-SXM4-80GB                           | 20B2 10DE 1623 |
+| NVIDIA PG509-210                                | 20B2 10DE 1625 |
 | NVIDIA A100-SXM-64GB                            | 20B3 10DE 14A7 |
 | NVIDIA A100-SXM-64GB                            | 20B3 10DE 14A8 |
 | NVIDIA A100 80GB PCIe                           | 20B5 10DE 1533 |
 | NVIDIA A100 80GB PCIe                           | 20B5 10DE 1642 |
 | NVIDIA PG506-232                                | 20B6 10DE 1492 |
 | NVIDIA A30                                      | 20B7 10DE 1532 |
 | NVIDIA A30                                      | 20B7 10DE 1804 |
+| NVIDIA A30                                      | 20B7 10DE 1852 |
 | NVIDIA A800-SXM4-40GB                           | 20BD 10DE 17F4 |
 | NVIDIA A100-PCIE-40GB                           | 20F1 10DE 145F |
 | NVIDIA A800-SXM4-80GB                           | 20F3 10DE 179B |
@@ -748,6 +750,8 @@ Subsystem Device ID.
 | NVIDIA H100 PCIe                                | 2331 10DE 1626 |
 | NVIDIA H100                                     | 2339 10DE 17FC |
 | NVIDIA H800 NVL                                 | 233A 10DE 183A |
+| GH200 120GB                                     | 2342 10DE 16EB |
+| GH200 480GB                                     | 2342 10DE 1809 |
 | NVIDIA GeForce RTX 3060 Ti                      | 2414           |
 | NVIDIA GeForce RTX 3080 Ti Laptop GPU           | 2420           |
 | NVIDIA RTX A5500 Laptop GPU                     | 2438           |

diff --git a/kernel-open/Kbuild b/kernel-open/Kbuild
@@ -72,7 +72,7 @@ EXTRA_CFLAGS += -I$(src)/common/inc
 EXTRA_CFLAGS += -I$(src)
 EXTRA_CFLAGS += -Wall $(DEFINES) $(INCLUDES) -Wno-cast-qual -Wno-error -Wno-format-extra-args
 EXTRA_CFLAGS += -D__KERNEL__ -DMODULE -DNVRM
-EXTRA_CFLAGS += -DNV_VERSION_STRING=\"535.113.01\"
+EXTRA_CFLAGS += -DNV_VERSION_STRING=\"535.129.03\"
 
 ifneq ($(SYSSRCHOST1X),)
  EXTRA_CFLAGS += -I$(SYSSRCHOST1X)
@@ -123,6 +123,9 @@ ifneq ($(wildcard /proc/sgi_uv),)
  EXTRA_CFLAGS += -DNV_CONFIG_X86_UV
 endif
 
+ifdef VGX_FORCE_VFIO_PCI_CORE
+ EXTRA_CFLAGS += -DNV_VGPU_FORCE_VFIO_PCI_CORE
+endif
 
 #
 # The conftest.sh script tests various aspects of the target kernel.

diff --git a/kernel-open/conftest.sh b/kernel-open/conftest.sh
@@ -4468,6 +4468,24 @@ compile_test() {
             compile_check_conftest "$CODE" "NV_MMU_NOTIFIER_OPS_HAS_INVALIDATE_RANGE" "" "types"
         ;;
 
+        mmu_notifier_ops_arch_invalidate_secondary_tlbs)
+            #
+            # Determine if the mmu_notifier_ops struct has the
+            # 'arch_invalidate_secondary_tlbs' member.
+            #
+            # struct mmu_notifier_ops.invalidate_range was renamed to
+            # arch_invalidate_secondary_tlbs by commit 1af5a8109904
+            # ("mmu_notifiers: rename invalidate_range notifier"), which is
+            # due to be added in v6.6.
+           CODE="
+            #include <linux/mmu_notifier.h>
+            int conftest_mmu_notifier_ops_arch_invalidate_secondary_tlbs(void) {
+                return offsetof(struct mmu_notifier_ops, arch_invalidate_secondary_tlbs);
+            }"
+
+            compile_check_conftest "$CODE" "NV_MMU_NOTIFIER_OPS_HAS_ARCH_INVALIDATE_SECONDARY_TLBS" "" "types"
+        ;;
+
         drm_format_num_planes)
             #
             # Determine if drm_format_num_planes() function is present.
@@ -6681,18 +6699,9 @@ case "$5" in
                 VFIO_PCI_CORE_PRESENT=1
             fi
 
-            # When this sanity check is run via nvidia-installer, it sets ARCH as aarch64.
-            # But, when it is run via Kbuild, ARCH is set as arm64
-            if [ "$ARCH" = "aarch64" ]; then
-                ARCH="arm64"
-            fi
-
             if [ "$VFIO_IOMMU_PRESENT" != "0" ] && [ "$KVM_PRESENT" != "0" ] ; then
-
-                # On x86_64, vGPU requires MDEV framework to be present.
-                # On aarch64, vGPU requires MDEV or vfio-pci-core framework to be present.
-                if ([ "$ARCH" = "arm64" ] && ([ "$VFIO_MDEV_PRESENT" != "0" ] || [ "$VFIO_PCI_CORE_PRESENT" != "0" ])) ||
-                   ([ "$ARCH" = "x86_64" ] && [ "$VFIO_MDEV_PRESENT" != "0" ];) then
+                # vGPU requires either MDEV or vfio-pci-core framework to be present.
+                if [ "$VFIO_MDEV_PRESENT" != "0" ] || [ "$VFIO_PCI_CORE_PRESENT" != "0" ]; then
                     exit 0
                 fi
             fi
@@ -6703,14 +6712,10 @@ case "$5" in
                 echo "CONFIG_VFIO_IOMMU_TYPE1";
             fi
 
-            if [ "$ARCH" = "arm64" ] && [ "$VFIO_MDEV_PRESENT" = "0" ] && [ "$VFIO_PCI_CORE_PRESENT" = "0" ]; then
+            if [ "$VFIO_MDEV_PRESENT" = "0" ] && [ "$VFIO_PCI_CORE_PRESENT" = "0" ]; then
                 echo "either CONFIG_VFIO_MDEV or CONFIG_VFIO_PCI_CORE";
             fi
 
-            if [ "$ARCH" = "x86_64" ] && [ "$VFIO_MDEV_PRESENT" = "0" ]; then
-                echo "CONFIG_VFIO_MDEV";
-            fi
-
             if [ "$KVM_PRESENT" = "0" ]; then
                 echo "CONFIG_KVM";
             fi

diff --git a/kernel-open/nvidia-peermem/nvidia-peermem.c b/kernel-open/nvidia-peermem/nvidia-peermem.c
@@ -53,7 +53,13 @@ static int peerdirect_support = NV_MEM_PEERDIRECT_SUPPORT_DEFAULT;
 module_param(peerdirect_support, int, S_IRUGO);
 MODULE_PARM_DESC(peerdirect_support, "Set level of support for Peer-direct, 0 [default] or 1 [legacy, for example MLNX_OFED 4.9 LTS]");
 
-#define peer_err(FMT, ARGS...) printk(KERN_ERR "nvidia-peermem" " %s:%d " FMT, __FUNCTION__, __LINE__, ## ARGS)
+
+#define peer_err(FMT, ARGS...) printk(KERN_ERR "nvidia-peermem" " %s:%d ERROR " FMT, __FUNCTION__, __LINE__, ## ARGS)
+#ifdef NV_MEM_DEBUG
+#define peer_trace(FMT, ARGS...) printk(KERN_DEBUG "nvidia-peermem" " %s:%d TRACE " FMT, __FUNCTION__, __LINE__, ## ARGS)
+#else
+#define peer_trace(FMT, ARGS...) do {} while (0)
+#endif
 
 #if defined(NV_MLNX_IB_PEER_MEM_SYMBOLS_PRESENT)
 
@@ -74,7 +80,10 @@ invalidate_peer_memory mem_invalidate_callback;
 static void *reg_handle = NULL;
 static void *reg_handle_nc = NULL;
 
+#define NV_MEM_CONTEXT_MAGIC ((u64)0xF1F4F1D0FEF0DAD0ULL)
+
 struct nv_mem_context {
+    u64 pad1;
     struct nvidia_p2p_page_table *page_table;
     struct nvidia_p2p_dma_mapping *dma_mapping;
     u64 core_context;
@@ -86,8 +95,22 @@ struct nv_mem_context {
     struct task_struct *callback_task;
     int sg_allocated;
     struct sg_table sg_head;
+    u64 pad2;
 };
 
+#define NV_MEM_CONTEXT_CHECK_OK(MC) ({                                  \
+    struct nv_mem_context *mc = (MC);                                   \
+    int rc = ((0 != mc) &&                                              \
+              (READ_ONCE(mc->pad1) == NV_MEM_CONTEXT_MAGIC) &&          \
+              (READ_ONCE(mc->pad2) == NV_MEM_CONTEXT_MAGIC));           \
+    if (!rc) {                                                          \
+        peer_trace("invalid nv_mem_context=%px pad1=%016llx pad2=%016llx\n", \
+                   mc,                                                  \
+                   mc?mc->pad1:0,                                       \
+                   mc?mc->pad2:0);                                      \
+    }                                                                   \
+    rc;                                                                 \
+})
 
 static void nv_get_p2p_free_callback(void *data)
 {
@@ -97,8 +120,9 @@ static void nv_get_p2p_free_callback(void *data)
     struct nvidia_p2p_dma_mapping *dma_mapping = NULL;
 
     __module_get(THIS_MODULE);
-    if (!nv_mem_context) {
-        peer_err("nv_get_p2p_free_callback -- invalid nv_mem_context\n");
+
+    if (!NV_MEM_CONTEXT_CHECK_OK(nv_mem_context)) {
+        peer_err("detected invalid context, skipping further processing\n");
         goto out;
     }
 
@@ -169,9 +193,11 @@ static int nv_mem_acquire(unsigned long addr, size_t size, void *peer_mem_privat
         /* Error case handled as not mine */
         return 0;
 
+    nv_mem_context->pad1 = NV_MEM_CONTEXT_MAGIC;
     nv_mem_context->page_virt_start = addr & GPU_PAGE_MASK;
     nv_mem_context->page_virt_end   = (addr + size + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK;
     nv_mem_context->mapped_size  = nv_mem_context->page_virt_end - nv_mem_context->page_virt_start;
+    nv_mem_context->pad2 = NV_MEM_CONTEXT_MAGIC;
 
     ret = nvidia_p2p_get_pages(0, 0, nv_mem_context->page_virt_start, nv_mem_context->mapped_size,
                                &nv_mem_context->page_table, nv_mem_dummy_callback, nv_mem_context);
@@ -195,6 +221,7 @@ static int nv_mem_acquire(unsigned long addr, size_t size, void *peer_mem_privat
     return 1;
 
 err:
+    memset(nv_mem_context, 0, sizeof(*nv_mem_context));
     kfree(nv_mem_context);
 
     /* Error case handled as not mine */
@@ -342,6 +369,7 @@ static void nv_mem_release(void *context)
         sg_free_table(&nv_mem_context->sg_head);
         nv_mem_context->sg_allocated = 0;
     }
+    memset(nv_mem_context, 0, sizeof(*nv_mem_context));
     kfree(nv_mem_context);
     module_put(THIS_MODULE);
     return;

diff --git a/kernel-open/nvidia-uvm/nvidia-uvm.Kbuild b/kernel-open/nvidia-uvm/nvidia-uvm.Kbuild
@@ -99,6 +99,7 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += kmem_cache_has_kobj_remove_work
 NV_CONFTEST_TYPE_COMPILE_TESTS += sysfs_slab_unlink
 NV_CONFTEST_TYPE_COMPILE_TESTS += vm_fault_t
 NV_CONFTEST_TYPE_COMPILE_TESTS += mmu_notifier_ops_invalidate_range
+NV_CONFTEST_TYPE_COMPILE_TESTS += mmu_notifier_ops_arch_invalidate_secondary_tlbs
 NV_CONFTEST_TYPE_COMPILE_TESTS += proc_ops
 NV_CONFTEST_TYPE_COMPILE_TESTS += timespec64
 NV_CONFTEST_TYPE_COMPILE_TESTS += mm_has_mmap_lock

diff --git a/kernel-open/nvidia-uvm/uvm.c b/kernel-open/nvidia-uvm/uvm.c
@@ -571,7 +571,6 @@ static void uvm_vm_open_managed_entry(struct vm_area_struct *vma)
 static void uvm_vm_close_managed(struct vm_area_struct *vma)
 {
     uvm_va_space_t *va_space = uvm_va_space_get(vma->vm_file);
-    uvm_processor_id_t gpu_id;
     bool make_zombie = false;
 
     if (current->mm != NULL)
@@ -606,12 +605,6 @@ static void uvm_vm_close_managed(struct vm_area_struct *vma)
 
     uvm_destroy_vma_managed(vma, make_zombie);
 
-    // Notify GPU address spaces that the fault buffer needs to be flushed to
-    // avoid finding stale entries that can be attributed to new VA ranges
-    // reallocated at the same address.
-    for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpu_va_spaces) {
-        uvm_processor_mask_set_atomic(&va_space->needs_fault_buffer_flush, gpu_id);
-    }
     uvm_va_space_up_write(va_space);
 
     if (current->mm != NULL)

diff --git a/kernel-open/nvidia-uvm/uvm_ada.c b/kernel-open/nvidia-uvm/uvm_ada.c
@@ -1,5 +1,5 @@
 /*******************************************************************************
-    Copyright (c) 2021 NVIDIA Corporation
+    Copyright (c) 2021-2023 NVIDIA Corporation
 
     Permission is hereby granted, free of charge, to any person obtaining a copy
     of this software and associated documentation files (the "Software"), to
@@ -94,4 +94,6 @@ void uvm_hal_ada_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
     parent_gpu->map_remap_larger_page_promotion = false;
 
     parent_gpu->plc_supported = true;
+
+    parent_gpu->no_ats_range_required = false;
 }
diff --git a/kernel-open/nvidia-uvm/uvm_ampere.c b/kernel-open/nvidia-uvm/uvm_ampere.c
@@ -101,4 +101,6 @@ void uvm_hal_ampere_arch_init_properties(uvm_parent_gpu_t *parent_gpu)
         parent_gpu->map_remap_larger_page_promotion = false;
 
     parent_gpu->plc_supported = true;
+
+    parent_gpu->no_ats_range_required = false;
 }
diff --git a/kernel-open/nvidia-uvm/uvm_ats_faults.c b/kernel-open/nvidia-uvm/uvm_ats_faults.c
@@ -107,10 +107,10 @@ static NV_STATUS service_ats_faults(uvm_gpu_va_space_t *gpu_va_space,
     return status;
 }
 
-static void flush_tlb_write_faults(uvm_gpu_va_space_t *gpu_va_space,
-                                   NvU64 addr,
-                                   size_t size,
-                                   uvm_fault_client_type_t client_type)
+static void flush_tlb_va_region(uvm_gpu_va_space_t *gpu_va_space,
+                                NvU64 addr,
+                                size_t size,
+                                uvm_fault_client_type_t client_type)
 {
     uvm_ats_fault_invalidate_t *ats_invalidate;
 
@@ -119,12 +119,12 @@ static void flush_tlb_write_faults(uvm_gpu_va_space_t *gpu_va_space,
     else
         ats_invalidate = &gpu_va_space->gpu->parent->fault_buffer_info.non_replayable.ats_invalidate;
 
-    if (!ats_invalidate->write_faults_in_batch) {
-        uvm_tlb_batch_begin(&gpu_va_space->page_tables, &ats_invalidate->write_faults_tlb_batch);
-        ats_invalidate->write_faults_in_batch = true;
+    if (!ats_invalidate->tlb_batch_pending) {
+        uvm_tlb_batch_begin(&gpu_va_space->page_tables, &ats_invalidate->tlb_batch);
+        ats_invalidate->tlb_batch_pending = true;
     }
 
-    uvm_tlb_batch_invalidate(&ats_invalidate->write_faults_tlb_batch, addr, size, PAGE_SIZE, UVM_MEMBAR_NONE);
+    uvm_tlb_batch_invalidate(&ats_invalidate->tlb_batch, addr, size, PAGE_SIZE, UVM_MEMBAR_NONE);
 }
 
 static void ats_batch_select_residency(uvm_gpu_va_space_t *gpu_va_space,
@@ -497,6 +497,7 @@ NV_STATUS uvm_ats_service_faults(uvm_gpu_va_space_t *gpu_va_space,
 
         if (vma->vm_flags & VM_WRITE) {
             uvm_page_mask_region_fill(faults_serviced_mask, subregion);
+            uvm_ats_smmu_invalidate_tlbs(gpu_va_space, start, length);
 
             // The Linux kernel never invalidates TLB entries on mapping
             // permission upgrade. This is a problem if the GPU has cached
@@ -507,7 +508,7 @@ NV_STATUS uvm_ats_service_faults(uvm_gpu_va_space_t *gpu_va_space,
             // infinite loop because we just forward the fault to the Linux
             // kernel and it will see that the permissions in the page table are
             // correct. Therefore, we flush TLB entries on ATS write faults.
-            flush_tlb_write_faults(gpu_va_space, start, length, client_type);
+            flush_tlb_va_region(gpu_va_space, start, length, client_type);
         }
         else {
             uvm_page_mask_region_fill(reads_serviced_mask, subregion);
@@ -530,6 +531,15 @@ NV_STATUS uvm_ats_service_faults(uvm_gpu_va_space_t *gpu_va_space,
             return status;
 
         uvm_page_mask_region_fill(faults_serviced_mask, subregion);
+
+        // As in the permission-upgrade scenario discussed above, the GPU
+        // will not re-fetch the entry if the PTE is invalid and the page
+        // size is 4K. To avoid an infinite faulting loop, explicitly
+        // invalidate the TLB for every new translation written, as is done
+        // in the permission-upgrade case.
+        if (PAGE_SIZE == UVM_PAGE_SIZE_4K)
+            flush_tlb_va_region(gpu_va_space, start, length, client_type);
+
     }
 
     return status;
@@ -564,7 +574,7 @@ NV_STATUS uvm_ats_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space,
     NV_STATUS status;
     uvm_push_t push;
 
-    if (!ats_invalidate->write_faults_in_batch)
+    if (!ats_invalidate->tlb_batch_pending)
         return NV_OK;
 
     UVM_ASSERT(gpu_va_space);
@@ -576,16 +586,15 @@ NV_STATUS uvm_ats_invalidate_tlbs(uvm_gpu_va_space_t *gpu_va_space,
                             "Invalidate ATS entries");
 
     if (status == NV_OK) {
-        uvm_tlb_batch_end(&ats_invalidate->write_faults_tlb_batch, &push, UVM_MEMBAR_NONE);
+        uvm_tlb_batch_end(&ats_invalidate->tlb_batch, &push, UVM_MEMBAR_NONE);
         uvm_push_end(&push);
 
         // Add this push to the GPU's tracker so that fault replays/clears can
         // wait on it
         status = uvm_tracker_add_push_safe(out_tracker, &push);
     }
 
-    ats_invalidate->write_faults_in_batch = false;
+    ats_invalidate->tlb_batch_pending = false;
 
     return status;
 }
-