diff --git a/README.md b/README.md index e65cf2eba..5925f5701 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # NVIDIA Linux Open GPU Kernel Module Source This is the source release of the NVIDIA Linux open GPU kernel modules, -version 550.107.02. +version 550.120. ## How to Build @@ -17,7 +17,7 @@ as root: Note that the kernel modules built here must be used with GSP firmware and user-space NVIDIA GPU driver components from a corresponding -550.107.02 driver release. This can be achieved by installing +550.120 driver release. This can be achieved by installing the NVIDIA GPU driver from the .run file using the `--no-kernel-modules` option. E.g., @@ -188,7 +188,7 @@ encountered specific to them. For details on feature support and limitations, see the NVIDIA GPU driver end user README here: -https://us.download.nvidia.com/XFree86/Linux-x86_64/550.107.02/README/kernel_open.html +https://us.download.nvidia.com/XFree86/Linux-x86_64/550.120/README/kernel_open.html For vGPU support, please refer to the README.vgpu packaged in the vGPU Host Package for more details. @@ -834,12 +834,10 @@ Subsystem Device ID. | NVIDIA GeForce RTX 2050 | 25AD | | NVIDIA RTX A1000 | 25B0 1028 1878 | | NVIDIA RTX A1000 | 25B0 103C 1878 | -| NVIDIA RTX A1000 | 25B0 103C 8D96 | | NVIDIA RTX A1000 | 25B0 10DE 1878 | | NVIDIA RTX A1000 | 25B0 17AA 1878 | | NVIDIA RTX A400 | 25B2 1028 1879 | | NVIDIA RTX A400 | 25B2 103C 1879 | -| NVIDIA RTX A400 | 25B2 103C 8D95 | | NVIDIA RTX A400 | 25B2 10DE 1879 | | NVIDIA RTX A400 | 25B2 17AA 1879 | | NVIDIA A16 | 25B6 10DE 14A9 | diff --git a/kernel-open/Kbuild b/kernel-open/Kbuild index 0ff2471cd..1eadec1c3 100644 --- a/kernel-open/Kbuild +++ b/kernel-open/Kbuild @@ -72,7 +72,7 @@ EXTRA_CFLAGS += -I$(src)/common/inc EXTRA_CFLAGS += -I$(src) EXTRA_CFLAGS += -Wall $(DEFINES) $(INCLUDES) -Wno-cast-qual -Wno-format-extra-args EXTRA_CFLAGS += -D__KERNEL__ -DMODULE -DNVRM -EXTRA_CFLAGS += -DNV_VERSION_STRING=\"550.107.02\" +EXTRA_CFLAGS += -DNV_VERSION_STRING=\"550.120\" ifneq ($(SYSSRCHOST1X),) EXTRA_CFLAGS += -I$(SYSSRCHOST1X) diff --git a/kernel-open/Makefile b/kernel-open/Makefile index cee56f106..baafdbfc9 100644 --- a/kernel-open/Makefile +++ b/kernel-open/Makefile @@ -28,7 +28,7 @@ else else KERNEL_UNAME ?= $(shell uname -r) KERNEL_MODLIB := /lib/modules/$(KERNEL_UNAME) - KERNEL_SOURCES := $(shell test -d $(KERNEL_MODLIB)/source && echo $(KERNEL_MODLIB)/source || echo $(KERNEL_MODLIB)/build) + KERNEL_SOURCES := $(shell ((test -d $(KERNEL_MODLIB)/source && echo $(KERNEL_MODLIB)/source) || (test -d $(KERNEL_MODLIB)/build/source && echo $(KERNEL_MODLIB)/build/source)) || echo $(KERNEL_MODLIB)/build) endif KERNEL_OUTPUT := $(KERNEL_SOURCES) @@ -42,7 +42,11 @@ else else KERNEL_UNAME ?= $(shell uname -r) KERNEL_MODLIB := /lib/modules/$(KERNEL_UNAME) - ifeq ($(KERNEL_SOURCES), $(KERNEL_MODLIB)/source) + # $(filter patter...,text) - Returns all whitespace-separated words in text that + # do match any of the pattern words, removing any words that do not match. + # Set the KERNEL_OUTPUT only if either $(KERNEL_MODLIB)/source or + # $(KERNEL_MODLIB)/build/source path matches the KERNEL_SOURCES. + ifneq ($(filter $(KERNEL_SOURCES),$(KERNEL_MODLIB)/source $(KERNEL_MODLIB)/build/source),) KERNEL_OUTPUT := $(KERNEL_MODLIB)/build KBUILD_PARAMS := KBUILD_OUTPUT=$(KERNEL_OUTPUT) endif diff --git a/kernel-open/common/inc/nv-linux.h b/kernel-open/common/inc/nv-linux.h index 734525df6..fa38289be 100644 --- a/kernel-open/common/inc/nv-linux.h +++ b/kernel-open/common/inc/nv-linux.h @@ -474,7 +474,9 @@ static inline void *nv_vmalloc(unsigned long size) void *ptr = __vmalloc(size, GFP_KERNEL); #endif if (ptr) + { NV_MEMDBG_ADD(ptr, size); + } return ptr; } @@ -492,7 +494,9 @@ static inline void *nv_ioremap(NvU64 phys, NvU64 size) void *ptr = ioremap(phys, size); #endif if (ptr) + { NV_MEMDBG_ADD(ptr, size); + } return ptr; } @@ -528,8 +532,9 @@ static inline void *nv_ioremap_cache(NvU64 phys, NvU64 size) #endif if (ptr) + { NV_MEMDBG_ADD(ptr, size); - + } return ptr; } @@ -545,8 +550,9 @@ static inline void *nv_ioremap_wc(NvU64 phys, NvU64 size) #endif if (ptr) + { NV_MEMDBG_ADD(ptr, size); - + } return ptr; } @@ -675,7 +681,9 @@ static inline NvUPtr nv_vmap(struct page **pages, NvU32 page_count, /* All memory cached in PPC64LE; can't honor 'cached' input. */ ptr = vmap(pages, page_count, VM_MAP, prot); if (ptr) + { NV_MEMDBG_ADD(ptr, page_count * PAGE_SIZE); + } return (NvUPtr)ptr; } diff --git a/kernel-open/common/inc/nv_uvm_interface.h b/kernel-open/common/inc/nv_uvm_interface.h index 037410345..f8ca948f1 100644 --- a/kernel-open/common/inc/nv_uvm_interface.h +++ b/kernel-open/common/inc/nv_uvm_interface.h @@ -592,6 +592,13 @@ void nvUvmInterfaceChannelDestroy(uvmGpuChannelHandle channel); Error codes: NV_ERR_GENERIC NV_ERR_NO_MEMORY + NV_ERR_INVALID_STATE + NV_ERR_NOT_SUPPORTED + NV_ERR_NOT_READY + NV_ERR_INVALID_LOCK_STATE + NV_ERR_INVALID_STATE + NV_ERR_NVSWITCH_FABRIC_NOT_READY + NV_ERR_NVSWITCH_FABRIC_FAILURE */ NV_STATUS nvUvmInterfaceQueryCaps(uvmGpuDeviceHandle device, UvmGpuCaps *caps); diff --git a/kernel-open/common/inc/nv_uvm_types.h b/kernel-open/common/inc/nv_uvm_types.h index dbf7fb6d7..67ac999c4 100644 --- a/kernel-open/common/inc/nv_uvm_types.h +++ b/kernel-open/common/inc/nv_uvm_types.h @@ -595,10 +595,8 @@ typedef struct UvmGpuClientInfo_tag typedef enum { - UVM_GPU_CONF_COMPUTE_MODE_NONE, - UVM_GPU_CONF_COMPUTE_MODE_APM, - UVM_GPU_CONF_COMPUTE_MODE_HCC, - UVM_GPU_CONF_COMPUTE_MODE_COUNT + UVM_GPU_CONF_COMPUTE_MODE_NONE = 0, + UVM_GPU_CONF_COMPUTE_MODE_HCC = 2 } UvmGpuConfComputeMode; typedef struct UvmGpuConfComputeCaps_tag diff --git a/kernel-open/common/inc/nvstatuscodes.h b/kernel-open/common/inc/nvstatuscodes.h index 552207f21..2a0444d22 100644 --- a/kernel-open/common/inc/nvstatuscodes.h +++ b/kernel-open/common/inc/nvstatuscodes.h @@ -152,6 +152,8 @@ NV_STATUS_CODE(NV_ERR_FABRIC_MANAGER_NOT_PRESENT, 0x0000007A, "Fabric Manag NV_STATUS_CODE(NV_ERR_ALREADY_SIGNALLED, 0x0000007B, "Semaphore Surface value already >= requested wait value") NV_STATUS_CODE(NV_ERR_QUEUE_TASK_SLOT_NOT_AVAILABLE, 0x0000007C, "PMU RPC error due to no queue slot available for this event") NV_STATUS_CODE(NV_ERR_KEY_ROTATION_IN_PROGRESS, 0x0000007D, "Operation not allowed as key rotation is in progress") +NV_STATUS_CODE(NV_ERR_NVSWITCH_FABRIC_NOT_READY, 0x00000081, "Nvswitch Fabric Status or Fabric Probe is not yet complete, caller needs to retry") +NV_STATUS_CODE(NV_ERR_NVSWITCH_FABRIC_FAILURE, 0x00000082, "Nvswitch Fabric Probe failed") // Warnings: NV_STATUS_CODE(NV_WARN_HOT_SWITCH, 0x00010001, "WARNING Hot switch") diff --git a/kernel-open/conftest.sh b/kernel-open/conftest.sh index 16b222d85..060f47736 100755 --- a/kernel-open/conftest.sh +++ b/kernel-open/conftest.sh @@ -6579,7 +6579,9 @@ compile_test() { # Determine whether drm_fbdev_generic_setup is present. # # Added by commit 9060d7f49376 ("drm/fb-helper: Finish the - # generic fbdev emulation") in v4.19. + # generic fbdev emulation") in v4.19. Removed by commit + # aae4682e5d66 ("drm/fbdev-generic: Convert to fbdev-ttm") + # in v6.11. # CODE=" #include @@ -6591,6 +6593,48 @@ compile_test() { }" compile_check_conftest "$CODE" "NV_DRM_FBDEV_GENERIC_SETUP_PRESENT" "" "functions" + ;; + + drm_fbdev_ttm_setup) + # + # Determine whether drm_fbdev_ttm_setup is present. + # + # Added by commit aae4682e5d66 ("drm/fbdev-generic: + # Convert to fbdev-ttm") in v6.11. + # + CODE=" + #include + #if defined(NV_DRM_DRM_FBDEV_TTM_H_PRESENT) + #include + #endif + void conftest_drm_fbdev_ttm_setup(void) { + drm_fbdev_ttm_setup(); + }" + + compile_check_conftest "$CODE" "NV_DRM_FBDEV_TTM_SETUP_PRESENT" "" "functions" + ;; + + drm_output_poll_changed) + # + # Determine whether drm_mode_config_funcs.output_poll_changed + # callback is present + # + # Removed by commit 446d0f4849b1 ("drm: Remove struct + # drm_mode_config_funcs.output_poll_changed") in v6.12. Hotplug + # event support is handled through the fbdev emulation interface + # going forward. + # + CODE=" + #if defined(NV_DRM_DRM_MODE_CONFIG_H_PRESENT) + #include + #else + #include + #endif + int conftest_drm_output_poll_changed_available(void) { + return offsetof(struct drm_mode_config_funcs, output_poll_changed); + }" + + compile_check_conftest "$CODE" "NV_DRM_OUTPUT_POLL_CHANGED_PRESENT" "" "types" ;; drm_aperture_remove_conflicting_pci_framebuffers) diff --git a/kernel-open/header-presence-tests.mk b/kernel-open/header-presence-tests.mk index 3265a6c54..7009cc37c 100644 --- a/kernel-open/header-presence-tests.mk +++ b/kernel-open/header-presence-tests.mk @@ -15,6 +15,7 @@ NV_HEADER_PRESENCE_TESTS = \ drm/drm_atomic_uapi.h \ drm/drm_drv.h \ drm/drm_fbdev_generic.h \ + drm/drm_fbdev_ttm.h \ drm/drm_framebuffer.h \ drm/drm_connector.h \ drm/drm_probe_helper.h \ diff --git a/kernel-open/nvidia-drm/nv-kthread-q.c b/kernel-open/nvidia-drm/nv-kthread-q.c index b49725b73..28d17d82c 100644 --- a/kernel-open/nvidia-drm/nv-kthread-q.c +++ b/kernel-open/nvidia-drm/nv-kthread-q.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2016 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2016-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a @@ -176,7 +176,7 @@ static struct task_struct *thread_create_on_node(int (*threadfn)(void *data), { unsigned i, j; - const static unsigned attempts = 3; + static const unsigned attempts = 3; struct task_struct *thread[3]; for (i = 0;; i++) { diff --git a/kernel-open/nvidia-drm/nvidia-drm-drv.c b/kernel-open/nvidia-drm/nvidia-drm-drv.c index 9ca0e0ebb..91c7cbfc0 100644 --- a/kernel-open/nvidia-drm/nvidia-drm-drv.c +++ b/kernel-open/nvidia-drm/nvidia-drm-drv.c @@ -64,12 +64,14 @@ #include #endif -#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE) +#if defined(NV_DRM_FBDEV_AVAILABLE) #include #include #endif -#if defined(NV_DRM_DRM_FBDEV_GENERIC_H_PRESENT) +#if defined(NV_DRM_DRM_FBDEV_TTM_H_PRESENT) +#include +#elif defined(NV_DRM_DRM_FBDEV_GENERIC_H_PRESENT) #include #endif @@ -124,6 +126,7 @@ static const char* nv_get_input_colorspace_name( #if defined(NV_DRM_ATOMIC_MODESET_AVAILABLE) +#if defined(NV_DRM_OUTPUT_POLL_CHANGED_PRESENT) static void nv_drm_output_poll_changed(struct drm_device *dev) { struct drm_connector *connector = NULL; @@ -167,6 +170,7 @@ static void nv_drm_output_poll_changed(struct drm_device *dev) nv_drm_connector_list_iter_end(&conn_iter); #endif } +#endif /* NV_DRM_OUTPUT_POLL_CHANGED_PRESENT */ static struct drm_framebuffer *nv_drm_framebuffer_create( struct drm_device *dev, @@ -204,7 +208,9 @@ static const struct drm_mode_config_funcs nv_mode_config_funcs = { .atomic_check = nv_drm_atomic_check, .atomic_commit = nv_drm_atomic_commit, + #if defined(NV_DRM_OUTPUT_POLL_CHANGED_PRESENT) .output_poll_changed = nv_drm_output_poll_changed, + #endif }; static void nv_drm_event_callback(const struct NvKmsKapiEvent *event) @@ -480,7 +486,7 @@ static int nv_drm_load(struct drm_device *dev, unsigned long flags) return -ENODEV; } -#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE) +#if defined(NV_DRM_FBDEV_AVAILABLE) /* * If fbdev is enabled, take modeset ownership now before other DRM clients * can take master (and thus NVKMS ownership). @@ -608,7 +614,7 @@ static void __nv_drm_unload(struct drm_device *dev) /* Release modeset ownership if fbdev is enabled */ -#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE) +#if defined(NV_DRM_FBDEV_AVAILABLE) if (nv_dev->hasFramebufferConsole) { drm_atomic_helper_shutdown(dev); nvKms->releaseOwnership(nv_dev->pDevice); @@ -1810,7 +1816,7 @@ void nv_drm_register_drm_device(const nv_gpu_info_t *gpu_info) goto failed_drm_register; } -#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE) +#if defined(NV_DRM_FBDEV_AVAILABLE) if (nv_drm_fbdev_module_param && drm_core_check_feature(dev, DRIVER_MODESET)) { @@ -1823,9 +1829,13 @@ void nv_drm_register_drm_device(const nv_gpu_info_t *gpu_info) drm_aperture_remove_conflicting_pci_framebuffers(pdev, nv_drm_driver.name); #endif } + #if defined(NV_DRM_FBDEV_TTM_AVAILABLE) + drm_fbdev_ttm_setup(dev, 32); + #elif defined(NV_DRM_FBDEV_GENERIC_AVAILABLE) drm_fbdev_generic_setup(dev, 32); + #endif } -#endif /* defined(NV_DRM_FBDEV_GENERIC_AVAILABLE) */ +#endif /* defined(NV_DRM_FBDEV_AVAILABLE) */ /* Add NVIDIA-DRM device into list */ @@ -1967,12 +1977,12 @@ void nv_drm_suspend_resume(NvBool suspend) if (suspend) { drm_kms_helper_poll_disable(dev); -#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE) +#if defined(NV_DRM_FBDEV_AVAILABLE) drm_fb_helper_set_suspend_unlocked(dev->fb_helper, 1); #endif drm_mode_config_reset(dev); } else { -#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE) +#if defined(NV_DRM_FBDEV_AVAILABLE) drm_fb_helper_set_suspend_unlocked(dev->fb_helper, 0); #endif drm_kms_helper_poll_enable(dev); diff --git a/kernel-open/nvidia-drm/nvidia-drm-linux.c b/kernel-open/nvidia-drm/nvidia-drm-linux.c index 8c59b7cb2..78429510c 100644 --- a/kernel-open/nvidia-drm/nvidia-drm-linux.c +++ b/kernel-open/nvidia-drm/nvidia-drm-linux.c @@ -34,7 +34,7 @@ MODULE_PARM_DESC( "Enable atomic kernel modesetting (1 = enable, 0 = disable (default))"); module_param_named(modeset, nv_drm_modeset_module_param, bool, 0400); -#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE) +#if defined(NV_DRM_FBDEV_AVAILABLE) MODULE_PARM_DESC( fbdev, "Create a framebuffer device (1 = enable, 0 = disable (default)) (EXPERIMENTAL)"); diff --git a/kernel-open/nvidia-drm/nvidia-drm-os-interface.h b/kernel-open/nvidia-drm/nvidia-drm-os-interface.h index 6f8cfea91..a6b0f947e 100644 --- a/kernel-open/nvidia-drm/nvidia-drm-os-interface.h +++ b/kernel-open/nvidia-drm/nvidia-drm-os-interface.h @@ -59,14 +59,20 @@ typedef struct nv_timer nv_drm_timer; #endif #if defined(NV_DRM_FBDEV_GENERIC_SETUP_PRESENT) && defined(NV_DRM_APERTURE_REMOVE_CONFLICTING_PCI_FRAMEBUFFERS_PRESENT) +#define NV_DRM_FBDEV_AVAILABLE #define NV_DRM_FBDEV_GENERIC_AVAILABLE #endif +#if defined(NV_DRM_FBDEV_TTM_SETUP_PRESENT) && defined(NV_DRM_APERTURE_REMOVE_CONFLICTING_PCI_FRAMEBUFFERS_PRESENT) +#define NV_DRM_FBDEV_AVAILABLE +#define NV_DRM_FBDEV_TTM_AVAILABLE +#endif + struct page; /* Set to true when the atomic modeset feature is enabled. */ extern bool nv_drm_modeset_module_param; -#if defined(NV_DRM_FBDEV_GENERIC_AVAILABLE) +#if defined(NV_DRM_FBDEV_AVAILABLE) /* Set to true when the nvidia-drm driver should install a framebuffer device */ extern bool nv_drm_fbdev_module_param; #endif diff --git a/kernel-open/nvidia-drm/nvidia-drm-sources.mk b/kernel-open/nvidia-drm/nvidia-drm-sources.mk index 9eea22857..fec0a871c 100644 --- a/kernel-open/nvidia-drm/nvidia-drm-sources.mk +++ b/kernel-open/nvidia-drm/nvidia-drm-sources.mk @@ -67,6 +67,7 @@ NV_CONFTEST_FUNCTION_COMPILE_TESTS += fence_set_error NV_CONFTEST_FUNCTION_COMPILE_TESTS += sync_file_get_fence NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_aperture_remove_conflicting_pci_framebuffers NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_fbdev_generic_setup +NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_fbdev_ttm_setup NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_connector_attach_hdr_output_metadata_property NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_helper_crtc_enable_color_mgmt NV_CONFTEST_FUNCTION_COMPILE_TESTS += drm_crtc_enable_color_mgmt @@ -129,3 +130,4 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += fence_ops_use_64bit_seqno NV_CONFTEST_TYPE_COMPILE_TESTS += drm_aperture_remove_conflicting_pci_framebuffers_has_driver_arg NV_CONFTEST_TYPE_COMPILE_TESTS += drm_mode_create_dp_colorspace_property_has_supported_colorspaces_arg NV_CONFTEST_TYPE_COMPILE_TESTS += drm_unlocked_ioctl_flag_present +NV_CONFTEST_TYPE_COMPILE_TESTS += drm_output_poll_changed diff --git a/kernel-open/nvidia-modeset/nv-kthread-q.c b/kernel-open/nvidia-modeset/nv-kthread-q.c index b49725b73..28d17d82c 100644 --- a/kernel-open/nvidia-modeset/nv-kthread-q.c +++ b/kernel-open/nvidia-modeset/nv-kthread-q.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2016 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2016-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a @@ -176,7 +176,7 @@ static struct task_struct *thread_create_on_node(int (*threadfn)(void *data), { unsigned i, j; - const static unsigned attempts = 3; + static const unsigned attempts = 3; struct task_struct *thread[3]; for (i = 0;; i++) { diff --git a/kernel-open/nvidia-modeset/nvidia-modeset-linux.c b/kernel-open/nvidia-modeset/nvidia-modeset-linux.c index f1f40c41c..39532ac42 100644 --- a/kernel-open/nvidia-modeset/nvidia-modeset-linux.c +++ b/kernel-open/nvidia-modeset/nvidia-modeset-linux.c @@ -1070,7 +1070,7 @@ static void nvkms_kapi_event_kthread_q_callback(void *arg) nvKmsKapiHandleEventQueueChange(device); } -struct nvkms_per_open *nvkms_open_common(enum NvKmsClientType type, +static struct nvkms_per_open *nvkms_open_common(enum NvKmsClientType type, struct NvKmsKapiDevice *device, int *status) { @@ -1122,7 +1122,7 @@ struct nvkms_per_open *nvkms_open_common(enum NvKmsClientType type, return NULL; } -void nvkms_close_pm_locked(struct nvkms_per_open *popen) +static void nvkms_close_pm_locked(struct nvkms_per_open *popen) { /* * Don't use down_interruptible(): we need to free resources @@ -1185,7 +1185,7 @@ static void nvkms_close_popen(struct nvkms_per_open *popen) } } -int nvkms_ioctl_common +static int nvkms_ioctl_common ( struct nvkms_per_open *popen, NvU32 cmd, NvU64 address, const size_t size diff --git a/kernel-open/nvidia-uvm/nv-kthread-q-selftest.c b/kernel-open/nvidia-uvm/nv-kthread-q-selftest.c index 88b70a4e8..83be19b18 100644 --- a/kernel-open/nvidia-uvm/nv-kthread-q-selftest.c +++ b/kernel-open/nvidia-uvm/nv-kthread-q-selftest.c @@ -1,5 +1,5 @@ /******************************************************************************* - Copyright (c) 2016 NVIDIA Corporation + Copyright (c) 2016-2024 NVIDIA Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -81,7 +81,7 @@ #define NUM_Q_ITEMS_IN_MULTITHREAD_TEST (NUM_TEST_Q_ITEMS * NUM_TEST_KTHREADS) // This exists in order to have a function to place a breakpoint on: -void on_nvq_assert(void) +static void on_nvq_assert(void) { (void)NULL; } diff --git a/kernel-open/nvidia-uvm/nv-kthread-q.c b/kernel-open/nvidia-uvm/nv-kthread-q.c index b49725b73..28d17d82c 100644 --- a/kernel-open/nvidia-uvm/nv-kthread-q.c +++ b/kernel-open/nvidia-uvm/nv-kthread-q.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2016 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2016-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a @@ -176,7 +176,7 @@ static struct task_struct *thread_create_on_node(int (*threadfn)(void *data), { unsigned i, j; - const static unsigned attempts = 3; + static const unsigned attempts = 3; struct task_struct *thread[3]; for (i = 0;; i++) { diff --git a/kernel-open/nvidia-uvm/uvm_channel.c b/kernel-open/nvidia-uvm/uvm_channel.c index e20a9bf37..16908254c 100644 --- a/kernel-open/nvidia-uvm/uvm_channel.c +++ b/kernel-open/nvidia-uvm/uvm_channel.c @@ -158,6 +158,12 @@ static NvU32 uvm_channel_update_progress_with_max(uvm_channel_t *channel, NvU64 completed_value = uvm_channel_update_completed_value(channel); + // LCIC channels don't use gpfifo entries after the static schedule is up. + // They can only have one entry active at a time so use the state of the + // tracking semaphore to represent progress. + if (uvm_channel_is_lcic(channel) && uvm_channel_manager_is_wlc_ready(channel->pool->manager)) + return uvm_gpu_tracking_semaphore_is_completed(&channel->tracking_sem) ? 0 : 1; + channel_pool_lock(channel->pool); // Completed value should never exceed the queued value @@ -397,18 +403,15 @@ static NV_STATUS channel_pool_rotate_key_locked(uvm_channel_pool_t *pool) uvm_assert_mutex_locked(&pool->conf_computing.key_rotation.mutex); uvm_for_each_channel_in_pool(channel, pool) { - NV_STATUS status = uvm_channel_wait(channel); + // WLC channels share CE with LCIC pushes and LCIC waits for + // WLC work to complete using WFI, so it's enough to wait + // for the latter one. + uvm_channel_t *wait_channel = uvm_channel_is_wlc(channel) ? uvm_channel_wlc_get_paired_lcic(channel) : channel; + + NV_STATUS status = uvm_channel_wait(wait_channel); if (status != NV_OK) return status; - if (uvm_channel_pool_is_wlc(pool)) { - uvm_spin_loop_t spin; - uvm_channel_t *lcic_channel = uvm_channel_wlc_get_paired_lcic(channel); - - // LCIC pushes don't exist as such. Rely on the tracking semaphore - // to determine completion, instead of uvm_channel_wait - UVM_SPIN_WHILE(!uvm_gpu_tracking_semaphore_is_completed(&lcic_channel->tracking_sem), &spin); - } } return uvm_conf_computing_rotate_pool_key(pool); @@ -1051,13 +1054,21 @@ static void internal_channel_submit_work_wlc(uvm_push_t *push) UvmCslIv *iv_cpu_addr = lcic_semaphore->conf_computing.ivs; uvm_gpu_semaphore_notifier_t *last_pushed_notifier; NvU32 iv_index; - uvm_spin_loop_t spin; + NV_STATUS status; void* auth_tag_cpu = get_channel_unprotected_sysmem_cpu(wlc_channel) + WLC_SYSMEM_PUSHBUFFER_AUTH_TAG_OFFSET; // Wait for the WLC/LCIC to be primed. This means that PUT == GET + 2 // and a WLC doorbell ring is enough to start work. - UVM_SPIN_WHILE(!uvm_gpu_tracking_semaphore_is_completed(&lcic_channel->tracking_sem), &spin); + status = uvm_channel_wait(lcic_channel); + if (status != NV_OK) { + UVM_ASSERT(uvm_global_get_status() != NV_OK); + + // If there's a global fatal error we can't communicate with the GPU + // and the below launch sequence doesn't work. + UVM_ERR_PRINT_NV_STATUS("Failed to wait for LCIC channel (%s) completion.", status, lcic_channel->name); + return; + } // Executing WLC adds an extra job to LCIC ++lcic_channel->tracking_sem.queued_value; @@ -1852,14 +1863,14 @@ static uvm_gpfifo_entry_t *uvm_channel_get_first_pending_entry(uvm_channel_t *ch NV_STATUS uvm_channel_get_status(uvm_channel_t *channel) { uvm_gpu_t *gpu; - NvNotification *errorNotifier; + NvNotification *error_notifier; if (uvm_channel_is_proxy(channel)) - errorNotifier = channel->proxy.channel_info.shadowErrorNotifier; + error_notifier = channel->proxy.channel_info.shadowErrorNotifier; else - errorNotifier = channel->channel_info.errorNotifier; + error_notifier = channel->channel_info.errorNotifier; - if (errorNotifier->status == 0) + if (error_notifier->status == 0) return NV_OK; // In case we hit a channel error, check the ECC error notifier as well so @@ -2986,16 +2997,18 @@ static NV_STATUS channel_manager_pick_ces(uvm_channel_manager_t *manager, unsign // Return the pool corresponding to the given CE index // -// This function cannot be used to access the proxy pool in SR-IOV heavy. +// Used to retrieve pools of type UVM_CHANNEL_POOL_TYPE_CE only. static uvm_channel_pool_t *channel_manager_ce_pool(uvm_channel_manager_t *manager, NvU32 ce) { - uvm_channel_pool_t *pool; + uvm_channel_pool_t *pool = uvm_channel_pool_first(manager, UVM_CHANNEL_POOL_TYPE_CE); + UVM_ASSERT(pool != NULL); UVM_ASSERT(test_bit(ce, manager->ce_mask)); - // The index of the pool associated with 'ce' is the number of usable CEs - // in [0, ce) - pool = manager->channel_pools + bitmap_weight(manager->ce_mask, ce); + // Pools of type UVM_CHANNEL_POOL_TYPE_CE are stored contiguously. The + // offset of the pool associated with 'ce' is the number of usable CEs in + // [0, ce). + pool += bitmap_weight(manager->ce_mask, ce); UVM_ASSERT(pool->pool_type == UVM_CHANNEL_POOL_TYPE_CE); UVM_ASSERT(pool->engine_index == ce); @@ -3009,6 +3022,8 @@ void uvm_channel_manager_set_p2p_ce(uvm_channel_manager_t *manager, uvm_gpu_t *p UVM_ASSERT(manager->gpu != peer); UVM_ASSERT(optimal_ce < UVM_COPY_ENGINE_COUNT_MAX); + UVM_ASSERT(manager->gpu->parent->peer_copy_mode != UVM_GPU_PEER_COPY_MODE_UNSUPPORTED); + UVM_ASSERT(peer->parent->peer_copy_mode != UVM_GPU_PEER_COPY_MODE_UNSUPPORTED); manager->pool_to_use.gpu_to_gpu[peer_gpu_index] = channel_manager_ce_pool(manager, optimal_ce); } @@ -3213,6 +3228,7 @@ static unsigned channel_manager_get_max_pools(uvm_channel_manager_t *manager) static NV_STATUS channel_manager_create_ce_pools(uvm_channel_manager_t *manager, unsigned *preferred_ce) { unsigned ce; + unsigned type; // A pool is created for each usable CE, even if it has not been selected as // the preferred CE for any type, because as more information is discovered @@ -3222,18 +3238,20 @@ static NV_STATUS channel_manager_create_ce_pools(uvm_channel_manager_t *manager, // usable. for_each_set_bit(ce, manager->ce_mask, UVM_COPY_ENGINE_COUNT_MAX) { NV_STATUS status; - unsigned type; uvm_channel_pool_t *pool = NULL; status = channel_pool_add(manager, UVM_CHANNEL_POOL_TYPE_CE, ce, &pool); if (status != NV_OK) return status; + } - for (type = 0; type < UVM_CHANNEL_TYPE_CE_COUNT; type++) { - // Set pool type if it hasn't been set before. - if (preferred_ce[type] == ce && manager->pool_to_use.default_for_type[type] == NULL) - manager->pool_to_use.default_for_type[type] = pool; - } + for (type = 0; type < UVM_CHANNEL_TYPE_CE_COUNT; type++) { + // Avoid overwriting previously set defaults. + if (manager->pool_to_use.default_for_type[type] != NULL) + continue; + + ce = preferred_ce[type]; + manager->pool_to_use.default_for_type[type] = channel_manager_ce_pool(manager, ce); } return NV_OK; @@ -3739,11 +3757,15 @@ static void channel_manager_stop_wlc(uvm_channel_manager_t *manager) NV_STATUS status; uvm_for_each_channel_in_pool(channel, lcic_pool) { - uvm_spin_loop_t spin; - // Wait for the WLC/LCIC to be primed. This means that PUT == GET + 2 // and a WLC doorbell ring is enough to start work. - UVM_SPIN_WHILE(!uvm_gpu_tracking_semaphore_is_completed(&channel->tracking_sem), &spin); + status = uvm_channel_wait(channel); + if (status != NV_OK) + UVM_ERR_PRINT_NV_STATUS("Failed to wait for LCIC channel (%s) completion", status, channel->name); + + // Continue on error and attempt to stop WLC below. This can lead to + // channel destruction with mismatched GET and PUT pointers. RM will + // print errors if that's the case, but channel destruction succeeeds. } status = uvm_push_begin(manager, UVM_CHANNEL_TYPE_SEC2, &push, "Stop WLC channels"); diff --git a/kernel-open/nvidia-uvm/uvm_gpu.c b/kernel-open/nvidia-uvm/uvm_gpu.c index dfff7e4d8..af811a050 100644 --- a/kernel-open/nvidia-uvm/uvm_gpu.c +++ b/kernel-open/nvidia-uvm/uvm_gpu.c @@ -2256,7 +2256,10 @@ static void set_optimal_p2p_write_ces(const UvmGpuP2PCapsParams *p2p_caps_params bool sorted; NvU32 ce0, ce1; - if (peer_caps->link_type < UVM_GPU_LINK_NVLINK_1) + UVM_ASSERT(peer_caps->ref_count); + UVM_ASSERT(gpu0->parent->peer_copy_mode == gpu1->parent->peer_copy_mode); + + if (gpu0->parent->peer_copy_mode == UVM_GPU_PEER_COPY_MODE_UNSUPPORTED) return; sorted = uvm_id_value(gpu0->id) < uvm_id_value(gpu1->id); @@ -2282,7 +2285,7 @@ static void set_optimal_p2p_write_ces(const UvmGpuP2PCapsParams *p2p_caps_params static int nv_procfs_read_gpu_peer_caps(struct seq_file *s, void *v) { if (!uvm_down_read_trylock(&g_uvm_global.pm.lock)) - return -EAGAIN; + return -EAGAIN; gpu_peer_caps_print((uvm_gpu_t **)s->private, s); diff --git a/kernel-open/nvidia-uvm/uvm_gpu.h b/kernel-open/nvidia-uvm/uvm_gpu.h index 09335483b..f4edd5eb7 100644 --- a/kernel-open/nvidia-uvm/uvm_gpu.h +++ b/kernel-open/nvidia-uvm/uvm_gpu.h @@ -962,6 +962,8 @@ struct uvm_parent_gpu_struct // Whether CE supports physical addressing mode for writes to vidmem bool ce_phys_vidmem_write_supported; + // Addressing mode(s) supported for CE transfers between this GPU and its + // peers: none, physical only, physical and virtual, etc. uvm_gpu_peer_copy_mode_t peer_copy_mode; // Virtualization mode of the GPU. diff --git a/kernel-open/nvidia-uvm/uvm_gpu_access_counters.c b/kernel-open/nvidia-uvm/uvm_gpu_access_counters.c index a24e405fa..4150ebaf8 100644 --- a/kernel-open/nvidia-uvm/uvm_gpu_access_counters.c +++ b/kernel-open/nvidia-uvm/uvm_gpu_access_counters.c @@ -684,7 +684,10 @@ static void access_counter_buffer_flush_locked(uvm_parent_gpu_t *parent_gpu, while (get != put) { // Wait until valid bit is set - UVM_SPIN_WHILE(!parent_gpu->access_counter_buffer_hal->entry_is_valid(parent_gpu, get), &spin); + UVM_SPIN_WHILE(!parent_gpu->access_counter_buffer_hal->entry_is_valid(parent_gpu, get), &spin) { + if (uvm_global_get_status() != NV_OK) + goto done; + } parent_gpu->access_counter_buffer_hal->entry_clear_valid(parent_gpu, get); ++get; @@ -692,6 +695,7 @@ static void access_counter_buffer_flush_locked(uvm_parent_gpu_t *parent_gpu, get = 0; } +done: write_get(parent_gpu, get); } @@ -817,12 +821,18 @@ static NvU32 fetch_access_counter_buffer_entries(uvm_gpu_t *gpu, (fetch_mode == NOTIFICATION_FETCH_MODE_ALL || notification_index < access_counters->max_batch_size)) { uvm_access_counter_buffer_entry_t *current_entry = ¬ification_cache[notification_index]; - // We cannot just wait for the last entry (the one pointed by put) to become valid, we have to do it - // individually since entries can be written out of order + // We cannot just wait for the last entry (the one pointed by put) to + // become valid, we have to do it individually since entries can be + // written out of order UVM_SPIN_WHILE(!gpu->parent->access_counter_buffer_hal->entry_is_valid(gpu->parent, get), &spin) { // We have some entry to work on. Let's do the rest later. if (fetch_mode != NOTIFICATION_FETCH_MODE_ALL && notification_index > 0) goto done; + + // There's no entry to work on and something has gone wrong. Ignore + // the rest. + if (uvm_global_get_status() != NV_OK) + goto done; } // Prevent later accesses being moved above the read of the valid bit diff --git a/kernel-open/nvidia-uvm/uvm_gpu_replayable_faults.c b/kernel-open/nvidia-uvm/uvm_gpu_replayable_faults.c index 9409a9bc7..d6bf7b304 100644 --- a/kernel-open/nvidia-uvm/uvm_gpu_replayable_faults.c +++ b/kernel-open/nvidia-uvm/uvm_gpu_replayable_faults.c @@ -631,7 +631,15 @@ static NV_STATUS fault_buffer_flush_locked(uvm_gpu_t *gpu, while (get != put) { // Wait until valid bit is set - UVM_SPIN_WHILE(!parent_gpu->fault_buffer_hal->entry_is_valid(parent_gpu, get), &spin); + UVM_SPIN_WHILE(!parent_gpu->fault_buffer_hal->entry_is_valid(parent_gpu, get), &spin) { + // Channels might be idle (e.g. in teardown) so check for errors + // actively. + status = uvm_channel_manager_check_errors(gpu->channel_manager); + if (status != NV_OK) { + write_get(parent_gpu, get); + return status; + } + } fault_buffer_skip_replayable_entry(parent_gpu, get); ++get; @@ -864,6 +872,10 @@ static NV_STATUS fetch_fault_buffer_entries(uvm_gpu_t *gpu, // We have some entry to work on. Let's do the rest later. if (fetch_mode == FAULT_FETCH_MODE_BATCH_READY && fault_index > 0) goto done; + + status = uvm_global_get_status(); + if (status != NV_OK) + goto done; } // Prevent later accesses being moved above the read of the valid bit diff --git a/kernel-open/nvidia-uvm/uvm_mmu.c b/kernel-open/nvidia-uvm/uvm_mmu.c index a66b23a2b..368a6e1a7 100644 --- a/kernel-open/nvidia-uvm/uvm_mmu.c +++ b/kernel-open/nvidia-uvm/uvm_mmu.c @@ -50,18 +50,18 @@ // because that type is normally associated with the LCE mapped to the most // PCEs. The higher bandwidth is beneficial when doing bulk operations such as // clearing PTEs, or initializing a page directory/table. -#define page_tree_begin_acquire(tree, tracker, push, format, ...) ({ \ - NV_STATUS status; \ - uvm_channel_manager_t *manager = (tree)->gpu->channel_manager; \ - \ - if (manager == NULL) \ - status = uvm_push_begin_fake((tree)->gpu, (push)); \ - else if (uvm_parent_gpu_is_virt_mode_sriov_heavy((tree)->gpu->parent)) \ - status = uvm_push_begin_acquire(manager, UVM_CHANNEL_TYPE_MEMOPS, (tracker), (push), (format), ##__VA_ARGS__); \ - else \ - status = uvm_push_begin_acquire(manager, UVM_CHANNEL_TYPE_GPU_INTERNAL, (tracker), (push), (format), ##__VA_ARGS__);\ - \ - status; \ +#define page_tree_begin_acquire(tree, tracker, push, format, ...) ({ \ + NV_STATUS __status; \ + uvm_channel_manager_t *__manager = (tree)->gpu->channel_manager; \ + \ + if (__manager == NULL) \ + __status = uvm_push_begin_fake((tree)->gpu, (push)); \ + else if (uvm_parent_gpu_is_virt_mode_sriov_heavy((tree)->gpu->parent)) \ + __status = uvm_push_begin_acquire(__manager, UVM_CHANNEL_TYPE_MEMOPS, (tracker), (push), (format), ##__VA_ARGS__); \ + else \ + __status = uvm_push_begin_acquire(__manager, UVM_CHANNEL_TYPE_GPU_INTERNAL, (tracker), (push), (format), ##__VA_ARGS__);\ + \ + __status; \ }) // Default location of page table allocations diff --git a/kernel-open/nvidia-uvm/uvm_pmm_test.c b/kernel-open/nvidia-uvm/uvm_pmm_test.c index bcc69733b..46758d8e3 100644 --- a/kernel-open/nvidia-uvm/uvm_pmm_test.c +++ b/kernel-open/nvidia-uvm/uvm_pmm_test.c @@ -1127,7 +1127,6 @@ static NV_STATUS test_pmm_reverse_map_many_blocks(uvm_gpu_t *gpu, uvm_va_space_t // incrementally. Therefore, the reverse translations will show them in // order. uvm_for_each_va_range_in(va_range, va_space, addr, addr + size - 1) { - uvm_va_block_t *va_block; for_each_va_block_in_va_range(va_range, va_block) { NvU32 num_va_block_pages = 0; diff --git a/kernel-open/nvidia-uvm/uvm_tracker_test.c b/kernel-open/nvidia-uvm/uvm_tracker_test.c index af490efa2..fd1d374c4 100644 --- a/kernel-open/nvidia-uvm/uvm_tracker_test.c +++ b/kernel-open/nvidia-uvm/uvm_tracker_test.c @@ -149,7 +149,7 @@ static NV_STATUS test_tracker_completion(uvm_va_space_t *va_space) static NV_STATUS test_tracker_basic(uvm_va_space_t *va_space) { uvm_gpu_t *gpu; - uvm_channel_t *channel; + uvm_channel_t *any_channel; uvm_tracker_t tracker; uvm_tracker_entry_t entry; NvU32 count = 0; @@ -159,15 +159,15 @@ static NV_STATUS test_tracker_basic(uvm_va_space_t *va_space) if (gpu == NULL) return NV_ERR_INVALID_STATE; - channel = uvm_channel_any(gpu->channel_manager); - if (channel == NULL) + any_channel = uvm_channel_any(gpu->channel_manager); + if (any_channel == NULL) return NV_ERR_INVALID_STATE; uvm_tracker_init(&tracker); TEST_CHECK_GOTO(assert_tracker_is_completed(&tracker) == NV_OK, done); // Some channel - entry.channel = channel; + entry.channel = any_channel; entry.value = 1; status = uvm_tracker_add_entry(&tracker, &entry); @@ -258,7 +258,7 @@ static NV_STATUS test_tracker_basic(uvm_va_space_t *va_space) static NV_STATUS test_tracker_overwrite(uvm_va_space_t *va_space) { uvm_gpu_t *gpu; - uvm_channel_t *channel; + uvm_channel_t *any_channel; uvm_tracker_t tracker, dup_tracker; uvm_tracker_entry_t entry; uvm_tracker_entry_t *entry_iter, *dup_entry_iter; @@ -270,15 +270,15 @@ static NV_STATUS test_tracker_overwrite(uvm_va_space_t *va_space) if (gpu == NULL) return NV_ERR_INVALID_STATE; - channel = uvm_channel_any(gpu->channel_manager); - if (channel == NULL) + any_channel = uvm_channel_any(gpu->channel_manager); + if (any_channel == NULL) return NV_ERR_INVALID_STATE; uvm_tracker_init(&tracker); TEST_CHECK_GOTO(assert_tracker_is_completed(&tracker) == NV_OK, done); // Some channel - entry.channel = channel; + entry.channel = any_channel; entry.value = 1; status = uvm_tracker_add_entry(&tracker, &entry); @@ -351,7 +351,7 @@ static NV_STATUS test_tracker_overwrite(uvm_va_space_t *va_space) static NV_STATUS test_tracker_add_tracker(uvm_va_space_t *va_space) { uvm_gpu_t *gpu; - uvm_channel_t *channel; + uvm_channel_t *any_channel; uvm_tracker_t tracker, dup_tracker; uvm_tracker_entry_t entry; uvm_tracker_entry_t *entry_iter, *dup_entry_iter; @@ -362,8 +362,8 @@ static NV_STATUS test_tracker_add_tracker(uvm_va_space_t *va_space) if (gpu == NULL) return NV_ERR_INVALID_STATE; - channel = uvm_channel_any(gpu->channel_manager); - if (channel == NULL) + any_channel = uvm_channel_any(gpu->channel_manager); + if (any_channel == NULL) return NV_ERR_INVALID_STATE; uvm_tracker_init(&tracker); @@ -371,7 +371,7 @@ static NV_STATUS test_tracker_add_tracker(uvm_va_space_t *va_space) TEST_CHECK_GOTO(assert_tracker_is_completed(&tracker) == NV_OK, done); // Some channel - entry.channel = channel; + entry.channel = any_channel; entry.value = 1; status = uvm_tracker_add_entry(&tracker, &entry); diff --git a/kernel-open/nvidia-uvm/uvm_va_block.c b/kernel-open/nvidia-uvm/uvm_va_block.c index ff274f1a0..c4ed9dd8e 100644 --- a/kernel-open/nvidia-uvm/uvm_va_block.c +++ b/kernel-open/nvidia-uvm/uvm_va_block.c @@ -3493,8 +3493,6 @@ static NV_STATUS block_copy_begin_push(uvm_va_block_t *va_block, } if (UVM_ID_IS_CPU(src_id) && UVM_ID_IS_CPU(dst_id)) { - uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); - gpu = uvm_va_space_find_first_gpu_attached_to_cpu_node(va_space, copy_state->src.nid); if (!gpu) gpu = uvm_va_space_find_first_gpu(va_space); @@ -4486,8 +4484,6 @@ static NV_STATUS block_copy_resident_pages_mask(uvm_va_block_t *block, uvm_processor_mask_copy(search_mask, src_processor_mask); for_each_closest_id(src_id, search_mask, dst_id, va_space) { - NV_STATUS status; - if (UVM_ID_IS_CPU(src_id)) { int nid; @@ -8939,13 +8935,13 @@ NV_STATUS uvm_va_block_revoke_prot(uvm_va_block_t *va_block, uvm_processor_mask_copy(resident_procs, &va_block->resident); for_each_closest_id(resident_id, resident_procs, gpu->id, va_space) { - NV_STATUS status = block_revoke_prot_gpu_to(va_block, - va_block_context, - gpu, - resident_id, - running_page_mask, - prot_to_revoke, - out_tracker); + status = block_revoke_prot_gpu_to(va_block, + va_block_context, + gpu, + resident_id, + running_page_mask, + prot_to_revoke, + out_tracker); if (status != NV_OK) break; @@ -12208,16 +12204,16 @@ NV_STATUS uvm_va_block_service_finish(uvm_processor_id_t processor_id, // Map pages that are thrashing if (service_context->thrashing_pin_count > 0) { - uvm_page_index_t page_index; + uvm_page_index_t pinned_page_index; - for_each_va_block_page_in_region_mask(page_index, + for_each_va_block_page_in_region_mask(pinned_page_index, &service_context->thrashing_pin_mask, service_context->region) { uvm_processor_mask_t *map_thrashing_processors = NULL; - NvU64 page_addr = uvm_va_block_cpu_page_address(va_block, page_index); + NvU64 page_addr = uvm_va_block_cpu_page_address(va_block, pinned_page_index); // Check protection type - if (!uvm_page_mask_test(caller_page_mask, page_index)) + if (!uvm_page_mask_test(caller_page_mask, pinned_page_index)) continue; map_thrashing_processors = uvm_perf_thrashing_get_thrashing_processors(va_block, page_addr); @@ -12226,7 +12222,7 @@ NV_STATUS uvm_va_block_service_finish(uvm_processor_id_t processor_id, service_context->block_context, new_residency, processor_id, - uvm_va_block_region_for_page(page_index), + uvm_va_block_region_for_page(pinned_page_index), caller_page_mask, new_prot, map_thrashing_processors); diff --git a/kernel-open/nvidia-uvm/uvm_va_block.h b/kernel-open/nvidia-uvm/uvm_va_block.h index 220df10b0..9cc5e7c09 100644 --- a/kernel-open/nvidia-uvm/uvm_va_block.h +++ b/kernel-open/nvidia-uvm/uvm_va_block.h @@ -2274,7 +2274,7 @@ NV_STATUS uvm_va_block_populate_page_cpu(uvm_va_block_t *va_block, // returns NV_ERR_MORE_PROCESSING_REQUIRED and this makes it clear that the // block's state is not locked across these calls. #define UVM_VA_BLOCK_LOCK_RETRY(va_block, block_retry, call) ({ \ - NV_STATUS status; \ + NV_STATUS __status; \ uvm_va_block_t *__block = (va_block); \ uvm_va_block_retry_t *__retry = (block_retry); \ \ @@ -2283,14 +2283,14 @@ NV_STATUS uvm_va_block_populate_page_cpu(uvm_va_block_t *va_block, uvm_mutex_lock(&__block->lock); \ \ do { \ - status = (call); \ - } while (status == NV_ERR_MORE_PROCESSING_REQUIRED); \ + __status = (call); \ + } while (__status == NV_ERR_MORE_PROCESSING_REQUIRED); \ \ uvm_mutex_unlock(&__block->lock); \ \ uvm_va_block_retry_deinit(__retry, __block); \ \ - status; \ + __status; \ }) // A helper macro for handling allocation-retry @@ -2305,7 +2305,7 @@ NV_STATUS uvm_va_block_populate_page_cpu(uvm_va_block_t *va_block, // to be already taken. Notably the block's lock might be unlocked and relocked // as part of the call. #define UVM_VA_BLOCK_RETRY_LOCKED(va_block, block_retry, call) ({ \ - NV_STATUS status; \ + NV_STATUS __status; \ uvm_va_block_t *__block = (va_block); \ uvm_va_block_retry_t *__retry = (block_retry); \ \ @@ -2314,12 +2314,12 @@ NV_STATUS uvm_va_block_populate_page_cpu(uvm_va_block_t *va_block, uvm_assert_mutex_locked(&__block->lock); \ \ do { \ - status = (call); \ - } while (status == NV_ERR_MORE_PROCESSING_REQUIRED); \ + __status = (call); \ + } while (__status == NV_ERR_MORE_PROCESSING_REQUIRED); \ \ uvm_va_block_retry_deinit(__retry, __block); \ \ - status; \ + __status; \ }) #endif // __UVM_VA_BLOCK_H__ diff --git a/kernel-open/nvidia/linux_nvswitch.c b/kernel-open/nvidia/linux_nvswitch.c index aa6571418..40cb0d782 100644 --- a/kernel-open/nvidia/linux_nvswitch.c +++ b/kernel-open/nvidia/linux_nvswitch.c @@ -31,6 +31,7 @@ #include "nvCpuUuid.h" #include "nv-time.h" #include "nvlink_caps.h" +#include "nvlink_proto.h" #include #include @@ -49,7 +50,7 @@ #include "ioctl_nvswitch.h" -const static struct +static const struct { NvlStatus status; int err; diff --git a/kernel-open/nvidia/nv-caps-imex.c b/kernel-open/nvidia/nv-caps-imex.c index b9fca3f0e..a8e61bd63 100644 --- a/kernel-open/nvidia/nv-caps-imex.c +++ b/kernel-open/nvidia/nv-caps-imex.c @@ -22,6 +22,7 @@ */ #include "nv-linux.h" +#include "nv-caps-imex.h" extern int NVreg_ImexChannelCount; diff --git a/kernel-open/nvidia/nv-caps.c b/kernel-open/nvidia/nv-caps.c index 6d2061007..89788fb70 100644 --- a/kernel-open/nvidia/nv-caps.c +++ b/kernel-open/nvidia/nv-caps.c @@ -267,7 +267,7 @@ static void nv_cap_procfs_exit(void) nv_cap_procfs_dir = NULL; } -int nv_cap_procfs_init(void) +static int nv_cap_procfs_init(void) { static struct proc_dir_entry *file_entry; diff --git a/kernel-open/nvidia/nv-dma.c b/kernel-open/nvidia/nv-dma.c index 1c2c609b3..f7861b2dc 100644 --- a/kernel-open/nvidia/nv-dma.c +++ b/kernel-open/nvidia/nv-dma.c @@ -290,7 +290,7 @@ void nv_destroy_dma_map_scatterlist(nv_dma_map_t *dma_map) os_free_mem(dma_map->mapping.discontig.submaps); } -void nv_load_dma_map_scatterlist( +static void nv_load_dma_map_scatterlist( nv_dma_map_t *dma_map, NvU64 *va_array ) @@ -486,7 +486,7 @@ NV_STATUS NV_API_CALL nv_dma_map_sgt( return status; } -NV_STATUS NV_API_CALL nv_dma_unmap_sgt( +static NV_STATUS NV_API_CALL nv_dma_unmap_sgt( nv_dma_device_t *dma_dev, void **priv ) diff --git a/kernel-open/nvidia/nv-ibmnpu.c b/kernel-open/nvidia/nv-ibmnpu.c index a541c0718..5a567f4dd 100644 --- a/kernel-open/nvidia/nv-ibmnpu.c +++ b/kernel-open/nvidia/nv-ibmnpu.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2017-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2017-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a @@ -25,9 +25,9 @@ * nv-ibmnpu.c - interface with the ibmnpu (IBM NVLink Processing Unit) "module" */ #include "nv-linux.h" +#include "nv-ibmnpu.h" #if defined(NVCPU_PPC64LE) -#include "nv-ibmnpu.h" #include "nv-rsync.h" /* diff --git a/kernel-open/nvidia/nv-kthread-q.c b/kernel-open/nvidia/nv-kthread-q.c index b49725b73..28d17d82c 100644 --- a/kernel-open/nvidia/nv-kthread-q.c +++ b/kernel-open/nvidia/nv-kthread-q.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2016 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2016-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a @@ -176,7 +176,7 @@ static struct task_struct *thread_create_on_node(int (*threadfn)(void *data), { unsigned i, j; - const static unsigned attempts = 3; + static const unsigned attempts = 3; struct task_struct *thread[3]; for (i = 0;; i++) { diff --git a/kernel-open/nvidia/nv-mmap.c b/kernel-open/nvidia/nv-mmap.c index 3db8b8075..8fd449604 100644 --- a/kernel-open/nvidia/nv-mmap.c +++ b/kernel-open/nvidia/nv-mmap.c @@ -368,7 +368,7 @@ int nv_encode_caching( return 0; } -int static nvidia_mmap_peer_io( +static int nvidia_mmap_peer_io( struct vm_area_struct *vma, nv_alloc_t *at, NvU64 page_index, @@ -389,7 +389,7 @@ int static nvidia_mmap_peer_io( return ret; } -int static nvidia_mmap_sysmem( +static int nvidia_mmap_sysmem( struct vm_area_struct *vma, nv_alloc_t *at, NvU64 page_index, diff --git a/kernel-open/nvidia/nv-procfs.c b/kernel-open/nvidia/nv-procfs.c index 63a0fe597..b8d7ce4ec 100644 --- a/kernel-open/nvidia/nv-procfs.c +++ b/kernel-open/nvidia/nv-procfs.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 1999-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1999-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a @@ -694,7 +694,7 @@ static nv_proc_ops_t nv_procfs_suspend_fops = { /* * Forwards error to nv_log_error which exposes data to vendor callback */ -void +static void exercise_error_forwarding_va( nv_state_t *nv, NvU32 err, diff --git a/kernel-open/nvidia/nv-report-err.c b/kernel-open/nvidia/nv-report-err.c index eec5af3e7..a3fa59548 100644 --- a/kernel-open/nvidia/nv-report-err.c +++ b/kernel-open/nvidia/nv-report-err.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2017 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2017-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a @@ -29,7 +29,7 @@ nv_report_error_cb_t nv_error_cb_handle = NULL; -int nv_register_error_cb(nv_report_error_cb_t report_error_cb) +int nvidia_register_error_cb(nv_report_error_cb_t report_error_cb) { if (report_error_cb == NULL) return -EINVAL; @@ -41,9 +41,9 @@ int nv_register_error_cb(nv_report_error_cb_t report_error_cb) return 0; } -EXPORT_SYMBOL(nv_register_error_cb); +EXPORT_SYMBOL(nvidia_register_error_cb); -int nv_unregister_error_cb(void) +int nvidia_unregister_error_cb(void) { if (nv_error_cb_handle == NULL) return -EPERM; @@ -52,9 +52,7 @@ int nv_unregister_error_cb(void) return 0; } -EXPORT_SYMBOL(nv_unregister_error_cb); - -struct pci_dev; +EXPORT_SYMBOL(nvidia_unregister_error_cb); void nv_report_error( struct pci_dev *dev, @@ -63,27 +61,17 @@ void nv_report_error( va_list ap ) { - va_list ap_copy; char *buffer; - int length = 0; - int status = NV_OK; + gfp_t gfp = NV_MAY_SLEEP() ? NV_GFP_NO_OOM : NV_GFP_ATOMIC; - if (nv_error_cb_handle != NULL) - { - va_copy(ap_copy, ap); - length = vsnprintf(NULL, 0, format, ap); - va_end(ap_copy); + if (nv_error_cb_handle == NULL) + return; + + buffer = kvasprintf(gfp, format, ap); - if (length > 0) - { - status = os_alloc_mem((void *)&buffer, (length + 1)*sizeof(char)); + if (buffer == NULL) + return; - if (status == NV_OK) - { - vsnprintf(buffer, length, format, ap); - nv_error_cb_handle(dev, error_number, buffer, length + 1); - os_free_mem(buffer); - } - } - } + nv_error_cb_handle(dev, error_number, buffer, strlen(buffer) + 1); + kfree(buffer); } diff --git a/kernel-open/nvidia/nv-report-err.h b/kernel-open/nvidia/nv-report-err.h index d48870921..815c659a0 100644 --- a/kernel-open/nvidia/nv-report-err.h +++ b/kernel-open/nvidia/nv-report-err.h @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2017 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2017-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a @@ -37,7 +37,7 @@ * @param[in] int * Length of error string. */ -typedef void (*nv_report_error_cb_t)(struct pci_dev *, uint32_t, char *, int); +typedef void (*nv_report_error_cb_t)(struct pci_dev *, uint32_t, char *, size_t); /* * @brief @@ -51,7 +51,7 @@ typedef void (*nv_report_error_cb_t)(struct pci_dev *, uint32_t, char *, int); * -EINVAL callback handle is NULL. * -EBUSY callback handle is already registered. */ -int nv_register_error_cb(nv_report_error_cb_t report_error_cb); +int nvidia_register_error_cb(nv_report_error_cb_t report_error_cb); /* * @brief @@ -61,6 +61,6 @@ int nv_register_error_cb(nv_report_error_cb_t report_error_cb); * 0 upon successful completion. * -EPERM unregister not permitted on NULL callback handle. */ -int nv_unregister_error_cb(void); +int nvidia_unregister_error_cb(void); #endif /* _NV_REPORT_ERR_H_ */ diff --git a/kernel-open/nvidia/nv.c b/kernel-open/nvidia/nv.c index 42804ed8d..f81da81de 100644 --- a/kernel-open/nvidia/nv.c +++ b/kernel-open/nvidia/nv.c @@ -1268,12 +1268,6 @@ static int validate_numa_start_state(nv_linux_state_t *nvl) return rc; } -NV_STATUS NV_API_CALL nv_get_num_dpaux_instances(nv_state_t *nv, NvU32 *num_instances) -{ - *num_instances = nv->num_dpaux_instance; - return NV_OK; -} - void NV_API_CALL nv_schedule_uvm_isr(nv_state_t *nv) { diff --git a/kernel-open/nvidia/nvlink_linux.c b/kernel-open/nvidia/nvlink_linux.c index 11e798d65..ade72e4ee 100644 --- a/kernel-open/nvidia/nvlink_linux.c +++ b/kernel-open/nvidia/nvlink_linux.c @@ -27,6 +27,7 @@ #include "nvlink_linux.h" #include "nvlink_errors.h" #include "nvlink_export.h" +#include "nvlink_proto.h" #include "nv-linux.h" #include "nv-procfs.h" #include "nv-time.h" diff --git a/kernel-open/nvidia/os-interface.c b/kernel-open/nvidia/os-interface.c index 16966d5c7..b612d9ea4 100644 --- a/kernel-open/nvidia/os-interface.c +++ b/kernel-open/nvidia/os-interface.c @@ -402,7 +402,7 @@ NvS32 NV_API_CALL os_string_compare(const char *str1, const char *str2) return strcmp(str1, str2); } -void *os_mem_copy_custom( +static void *os_mem_copy_custom( void *dstPtr, const void *srcPtr, NvU32 length diff --git a/src/common/inc/nvBldVer.h b/src/common/inc/nvBldVer.h index 8216c5030..c993fbb0d 100644 --- a/src/common/inc/nvBldVer.h +++ b/src/common/inc/nvBldVer.h @@ -36,25 +36,25 @@ // and then checked back in. You cannot make changes to these sections without // corresponding changes to the buildmeister script #ifndef NV_BUILD_BRANCH - #define NV_BUILD_BRANCH r552_86 + #define NV_BUILD_BRANCH r550_00 #endif #ifndef NV_PUBLIC_BRANCH - #define NV_PUBLIC_BRANCH r552_86 + #define NV_PUBLIC_BRANCH r550_00 #endif #if defined(NV_LINUX) || defined(NV_BSD) || defined(NV_SUNOS) -#define NV_BUILD_BRANCH_VERSION "rel/gpu_drv/r550/r552_86-355" -#define NV_BUILD_CHANGELIST_NUM (34618165) +#define NV_BUILD_BRANCH_VERSION "rel/gpu_drv/r550/r550_00-410" +#define NV_BUILD_CHANGELIST_NUM (34843164) #define NV_BUILD_TYPE "Official" -#define NV_BUILD_NAME "rel/gpu_drv/r550/r552_86-355" -#define NV_LAST_OFFICIAL_CHANGELIST_NUM (34618165) +#define NV_BUILD_NAME "rel/gpu_drv/r550/r550_00-410" +#define NV_LAST_OFFICIAL_CHANGELIST_NUM (34843164) #else /* Windows builds */ -#define NV_BUILD_BRANCH_VERSION "r552_86-1" -#define NV_BUILD_CHANGELIST_NUM (34615400) +#define NV_BUILD_BRANCH_VERSION "r550_00-390" +#define NV_BUILD_CHANGELIST_NUM (34843164) #define NV_BUILD_TYPE "Official" -#define NV_BUILD_NAME "552.87" -#define NV_LAST_OFFICIAL_CHANGELIST_NUM (34615400) +#define NV_BUILD_NAME "553.09" +#define NV_LAST_OFFICIAL_CHANGELIST_NUM (34843164) #define NV_BUILD_BRANCH_BASE_VERSION R550 #endif // End buildmeister python edited section diff --git a/src/common/inc/nvUnixVersion.h b/src/common/inc/nvUnixVersion.h index 94fa96119..296f8e100 100644 --- a/src/common/inc/nvUnixVersion.h +++ b/src/common/inc/nvUnixVersion.h @@ -4,7 +4,7 @@ #if defined(NV_LINUX) || defined(NV_BSD) || defined(NV_SUNOS) || defined(NV_VMWARE) || defined(NV_QNX) || defined(NV_INTEGRITY) || \ (defined(RMCFG_FEATURE_PLATFORM_GSP) && RMCFG_FEATURE_PLATFORM_GSP == 1) -#define NV_VERSION_STRING "550.107.02" +#define NV_VERSION_STRING "550.120" #else diff --git a/src/common/inc/swref/published/nvswitch/ls10/ptop_discovery_ip.h b/src/common/inc/swref/published/nvswitch/ls10/ptop_discovery_ip.h index 93d2c403a..862afd89a 100644 --- a/src/common/inc/swref/published/nvswitch/ls10/ptop_discovery_ip.h +++ b/src/common/inc/swref/published/nvswitch/ls10/ptop_discovery_ip.h @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2003-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2003-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a @@ -25,4 +25,5 @@ #define __ls10_ptop_discovery_ip_h__ /* This file is autogenerated. Do not edit */ #define NV_PTOP_UNICAST_SW_DEVICE_BASE_SAW_0 0x00028000 /* */ +#define NV_PTOP_UNICAST_SW_DEVICE_BASE_SOE_0 0x00840000 /* */ #endif // __ls10_ptop_discovery_ip_h__ diff --git a/src/common/nvswitch/common/inc/soe/soeiftnvl.h b/src/common/nvswitch/common/inc/soe/soeiftnvl.h index 64119908e..ae34b12fb 100644 --- a/src/common/nvswitch/common/inc/soe/soeiftnvl.h +++ b/src/common/nvswitch/common/inc/soe/soeiftnvl.h @@ -34,6 +34,78 @@ * Command Messages between driver and TNVL unit of SOE */ +#define RM_SOE_LIST_LS10_ONLY_ENGINES(_op) \ + _op(GIN) \ + _op(XAL) \ + _op(XAL_FUNC) \ + _op(XPL) \ + _op(XTL) \ + _op(XTL_CONFIG) \ + _op(UXL) \ + _op(GPU_PTOP) \ + _op(PMC) \ + _op(PBUS) \ + _op(ROM2) \ + _op(GPIO) \ + _op(FSP) \ + _op(SYSCTRL) \ + _op(CLKS_SYS) \ + _op(CLKS_SYSB) \ + _op(CLKS_P0) \ + _op(SAW_PM) \ + _op(PCIE_PM) \ + _op(PRT_PRI_HUB) \ + _op(PRT_PRI_RS_CTRL) \ + _op(SYS_PRI_HUB) \ + _op(SYS_PRI_RS_CTRL) \ + _op(SYSB_PRI_HUB) \ + _op(SYSB_PRI_RS_CTRL) \ + _op(PRI_MASTER_RS) \ + _op(PTIMER) \ + _op(CPR) \ + _op(TILEOUT) \ + +#define RM_SOE_LIST_ALL_ENGINES(_op) \ + _op(XVE) \ + _op(SAW) \ + _op(SOE) \ + _op(SMR) \ + \ + _op(NPG) \ + _op(NPORT) \ + \ + _op(NVLW) \ + _op(MINION) \ + _op(NVLIPT) \ + _op(NVLIPT_LNK) \ + _op(NVLTLC) \ + _op(NVLDL) \ + \ + _op(NXBAR) \ + _op(TILE) \ + \ + _op(NPG_PERFMON) \ + _op(NPORT_PERFMON) \ + \ + _op(NVLW_PERFMON) \ + +#define RM_SOE_ENGINE_ID_LIST(_eng) \ + RM_SOE_ENGINE_ID_##_eng, + +// +// ENGINE_IDs are the complete list of all engines that are supported on +// LS10 architecture(s) that may support them. Any one architecture may or +// may not understand how to operate on any one specific engine. +// Architectures that share a common ENGINE_ID are not guaranteed to have +// compatible manuals. +// +typedef enum rm_soe_engine_id +{ + RM_SOE_LIST_ALL_ENGINES(RM_SOE_ENGINE_ID_LIST) + RM_SOE_LIST_LS10_ONLY_ENGINES(RM_SOE_ENGINE_ID_LIST) + RM_SOE_ENGINE_ID_SIZE, +} RM_SOE_ENGINE_ID; + /*! * Commands offered by the SOE Tnvl Interface. */ @@ -47,6 +119,10 @@ enum * Issue pre-lock sequence */ RM_SOE_TNVL_CMD_ISSUE_PRE_LOCK_SEQUENCE = 0x1, + /* + * Issue engine write command + */ + RM_SOE_TNVL_CMD_ISSUE_ENGINE_WRITE = 0x2, }; /*! @@ -60,6 +136,17 @@ typedef struct NvU32 data; } RM_SOE_TNVL_CMD_REGISTER_WRITE; +typedef struct +{ + NvU8 cmdType; + RM_SOE_ENGINE_ID eng_id; + NvU32 eng_bcast; + NvU32 eng_instance; + NvU32 base; + NvU32 offset; + NvU32 data; +} RM_SOE_TNVL_CMD_ENGINE_WRITE; + typedef struct { NvU8 cmdType; @@ -69,8 +156,9 @@ typedef union { NvU8 cmdType; RM_SOE_TNVL_CMD_REGISTER_WRITE registerWrite; + RM_SOE_TNVL_CMD_ENGINE_WRITE engineWrite; RM_SOE_TNVL_CMD_PRE_LOCK_SEQUENCE preLockSequence; } RM_SOE_TNVL_CMD; -#endif // _SOEIFTNVL_H_ +#endif // _SOETNVL_H_ diff --git a/src/common/nvswitch/interface/ctrl_dev_nvswitch.h b/src/common/nvswitch/interface/ctrl_dev_nvswitch.h index b28a36f4d..3ba43589c 100644 --- a/src/common/nvswitch/interface/ctrl_dev_nvswitch.h +++ b/src/common/nvswitch/interface/ctrl_dev_nvswitch.h @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2018-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2018-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a @@ -831,6 +831,7 @@ typedef enum nvswitch_err_type NVSWITCH_ERR_HW_HOST_IO_FAILURE = 10007, NVSWITCH_ERR_HW_HOST_FIRMWARE_INITIALIZATION_FAILURE = 10008, NVSWITCH_ERR_HW_HOST_FIRMWARE_RECOVERY_MODE = 10009, + NVSWITCH_ERR_HW_HOST_TNVL_ERROR = 10010, NVSWITCH_ERR_HW_HOST_LAST, diff --git a/src/common/nvswitch/kernel/inc/haldef_nvswitch.h b/src/common/nvswitch/kernel/inc/haldef_nvswitch.h index 29af395e1..ede6a6a85 100644 --- a/src/common/nvswitch/kernel/inc/haldef_nvswitch.h +++ b/src/common/nvswitch/kernel/inc/haldef_nvswitch.h @@ -213,6 +213,7 @@ _op(NvU32, nvswitch_get_eng_count, (nvswitch_device *device, NVSWITCH_ENGINE_ID eng_id, NvU32 eng_bcast), _arch) \ _op(NvU32, nvswitch_eng_rd, (nvswitch_device *device, NVSWITCH_ENGINE_ID eng_id, NvU32 eng_bcast, NvU32 eng_instance, NvU32 offset), _arch) \ _op(void, nvswitch_eng_wr, (nvswitch_device *device, NVSWITCH_ENGINE_ID eng_id, NvU32 eng_bcast, NvU32 eng_instance, NvU32 offset, NvU32 data), _arch) \ + _op(void, nvswitch_reg_write_32, (nvswitch_device *device, NvU32 offset, NvU32 data), _arch) \ _op(NvU32, nvswitch_get_link_eng_inst, (nvswitch_device *device, NvU32 link_id, NVSWITCH_ENGINE_ID eng_id), _arch) \ _op(void *, nvswitch_alloc_chipdevice, (nvswitch_device *device), _arch) \ _op(NvlStatus, nvswitch_init_thermal, (nvswitch_device *device), _arch) \ diff --git a/src/common/nvswitch/kernel/inc/ls10/ls10.h b/src/common/nvswitch/kernel/inc/ls10/ls10.h index a69a02f8c..ff33903a8 100644 --- a/src/common/nvswitch/kernel/inc/ls10/ls10.h +++ b/src/common/nvswitch/kernel/inc/ls10/ls10.h @@ -189,8 +189,8 @@ #define SOE_VBIOS_VERSION_MASK 0xFF0000 #define SOE_VBIOS_REVLOCK_DISABLE_NPORT_FATAL_INTR 0x370000 #define SOE_VBIOS_REVLOCK_ISSUE_INGRESS_STOP 0x4C0000 -#define SOE_VBIOS_REVLOCK_ISSUE_REGISTER_WRITE 0x580000 -#define SOE_VBIOS_REVLOCK_TNVL_PRELOCK_COMMAND 0x600000 +#define SOE_VBIOS_REVLOCK_TNVL_PRELOCK_COMMAND 0x590000 +#define SOE_VBIOS_REVLOCK_SOE_PRI_CHECKS 0x610000 // LS10 Saved LED state #define ACCESS_LINK_LED_STATE CPLD_MACHXO3_ACCESS_LINK_LED_CTL_NVL_CABLE_LED @@ -1060,10 +1060,10 @@ NvlStatus nvswitch_tnvl_get_attestation_certificate_chain_ls10(nvswitch_device * NvlStatus nvswitch_tnvl_get_attestation_report_ls10(nvswitch_device *device, NVSWITCH_GET_ATTESTATION_REPORT_PARAMS *params); NvlStatus nvswitch_tnvl_send_fsp_lock_config_ls10(nvswitch_device *device); NvlStatus nvswitch_tnvl_get_status_ls10(nvswitch_device *device, NVSWITCH_GET_TNVL_STATUS_PARAMS *params); -void nvswitch_tnvl_reg_wr_32_ls10(nvswitch_device *device, NVSWITCH_ENGINE_ID eng_id, NvU32 eng_bcast, NvU32 eng_instance, NvU32 base_addr, NvU32 offset, NvU32 data); +void nvswitch_tnvl_eng_wr_32_ls10(nvswitch_device *device, NVSWITCH_ENGINE_ID eng_id, NvU32 eng_bcast, NvU32 eng_instance, NvU32 base_addr, NvU32 offset, NvU32 data); NvlStatus nvswitch_send_tnvl_prelock_cmd_ls10(nvswitch_device *device); void nvswitch_tnvl_disable_interrupts_ls10(nvswitch_device *device); - +void nvswitch_tnvl_reg_wr_32_ls10(nvswitch_device *device, NvU32 offset, NvU32 data); NvlStatus nvswitch_ctrl_get_soe_heartbeat_ls10(nvswitch_device *device, NVSWITCH_GET_SOE_HEARTBEAT_PARAMS *p); NvlStatus nvswitch_cci_enable_iobist_ls10(nvswitch_device *device, NvU32 linkNumber, NvBool bEnable); NvlStatus nvswitch_cci_initialization_sequence_ls10(nvswitch_device *device, NvU32 linkNumber); diff --git a/src/common/nvswitch/kernel/inc/ls10/soe_ls10.h b/src/common/nvswitch/kernel/inc/ls10/soe_ls10.h index dfe615af4..d3304b830 100644 --- a/src/common/nvswitch/kernel/inc/ls10/soe_ls10.h +++ b/src/common/nvswitch/kernel/inc/ls10/soe_ls10.h @@ -52,4 +52,5 @@ void nvswitch_soe_disable_nport_fatal_interrupts_ls10(nvswitch_device *devi NvU32 nportIntrEnable, NvU8 nportIntrType); NvlStatus nvswitch_soe_issue_ingress_stop_ls10(nvswitch_device *device, NvU32 nport, NvBool bStop); NvlStatus nvswitch_soe_reg_wr_32_ls10(nvswitch_device *device, NvU32 offset, NvU32 data); +NvlStatus nvswitch_soe_eng_wr_32_ls10(nvswitch_device *device, NVSWITCH_ENGINE_ID eng_id, NvU32 eng_bcast, NvU32 eng_instance, NvU32 base_addr, NvU32 offset, NvU32 data); #endif //_SOE_LS10_H_ diff --git a/src/common/nvswitch/kernel/inc/soe/bin/g_soeuc_lr10_dbg.h b/src/common/nvswitch/kernel/inc/soe/bin/g_soeuc_lr10_dbg.h index 4ede0b051..beb5679f5 100644 --- a/src/common/nvswitch/kernel/inc/soe/bin/g_soeuc_lr10_dbg.h +++ b/src/common/nvswitch/kernel/inc/soe/bin/g_soeuc_lr10_dbg.h @@ -272,8 +272,8 @@ const NvU32 soe_ucode_data_lr10_dbg[] = { 0xa6b0001d, 0x240cf409, 0x001da03e, 0x0049190f, 0x009ff711, 0x00f802f8, 0xb50294b6, 0x00f804b9, 0xb602af92, 0xb9bc0294, 0xf400f8f9, 0x82f9d430, 0x301590b4, 0xc1b027e1, 0x0ad1b00b, 0x94b6f4bd, 0x0c91b002, 0x900149fe, 0x9fa04499, 0x20079990, 0x0b99929f, 0x95b29fa0, 0xa0049992, 0x9297b29f, - 0x9fa00499, 0x0005ecdf, 0x90ffbf00, 0x4efe1499, 0xa0a6b201, 0x34ee909f, 0xb4b20209, 0x14bde9a0, - 0x34bd84bd, 0x001eef3e, 0x277e6ab2, 0x49bf001a, 0x4bfea2b2, 0x014cfe01, 0x9044bb90, 0x95f94bcc, + 0x9fa00499, 0x0005ecdf, 0x90ffbf00, 0x4efe1499, 0xa0a6b201, 0x34ee909f, 0xb4b20209, 0x84bde9a0, + 0x14bd34bd, 0x001eef3e, 0x277e6ab2, 0x49bf001a, 0x4bfea2b2, 0x014cfe01, 0x9044bb90, 0x95f94bcc, 0xb31100b4, 0x008e0209, 0x9e0309b3, 0x010db300, 0x499800a8, 0xb27cb201, 0xfe5bb22a, 0xdd90014d, 0x3295f938, 0x0be0b40c, 0xa53ed4bd, 0x5fbf001e, 0xf9a6e9bf, 0x34381bf4, 0xe89827b0, 0x987fbf01, 0xb03302e9, 0xb0b40a00, 0x90b9bc0c, 0x1bf4f9a6, 0x1444df1e, 0xf9180000, 0x0094330c, 0x90f1b206, @@ -2269,8 +2269,8 @@ const NvU32 soe_ucode_data_lr10_dbg[] = { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x69e9060c, 0xe6ca2d91, 0xac20edf2, 0xeafeafcc, 0x1de66f4b, 0x98838b38, 0xce342fcf, 0x31422bca, - 0x30867660, 0xbc4af25f, 0xbc09e1ed, 0xab87e0fc, 0x154ee848, 0x4d419617, 0xc10ab5e0, 0x5570cfeb, + 0x69e9060c, 0xe6ca2d91, 0xac20edf2, 0xeafeafcc, 0x294f2cc2, 0x883a9d68, 0x493e2990, 0xc8e27d59, + 0x30867660, 0xbc4af25f, 0xbc09e1ed, 0xab87e0fc, 0x8fc5fac6, 0xe1f366be, 0x1ec159bf, 0x352ff984, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, diff --git a/src/common/nvswitch/kernel/inc/soe/bin/g_soeuc_lr10_prd.h b/src/common/nvswitch/kernel/inc/soe/bin/g_soeuc_lr10_prd.h index 08194f282..e55f4f5df 100644 --- a/src/common/nvswitch/kernel/inc/soe/bin/g_soeuc_lr10_prd.h +++ b/src/common/nvswitch/kernel/inc/soe/bin/g_soeuc_lr10_prd.h @@ -272,8 +272,8 @@ const NvU32 soe_ucode_data_lr10_prd[] = { 0xa6b0001d, 0x240cf409, 0x001da03e, 0x0049190f, 0x009ff711, 0x00f802f8, 0xb50294b6, 0x00f804b9, 0xb602af92, 0xb9bc0294, 0xf400f8f9, 0x82f9d430, 0x301590b4, 0xc1b027e1, 0x0ad1b00b, 0x94b6f4bd, 0x0c91b002, 0x900149fe, 0x9fa04499, 0x20079990, 0x0b99929f, 0x95b29fa0, 0xa0049992, 0x9297b29f, - 0x9fa00499, 0x0005ecdf, 0x90ffbf00, 0x4efe1499, 0xa0a6b201, 0x34ee909f, 0xb4b20209, 0x14bde9a0, - 0x34bd84bd, 0x001eef3e, 0x277e6ab2, 0x49bf001a, 0x4bfea2b2, 0x014cfe01, 0x9044bb90, 0x95f94bcc, + 0x9fa00499, 0x0005ecdf, 0x90ffbf00, 0x4efe1499, 0xa0a6b201, 0x34ee909f, 0xb4b20209, 0x84bde9a0, + 0x14bd34bd, 0x001eef3e, 0x277e6ab2, 0x49bf001a, 0x4bfea2b2, 0x014cfe01, 0x9044bb90, 0x95f94bcc, 0xb31100b4, 0x008e0209, 0x9e0309b3, 0x010db300, 0x499800a8, 0xb27cb201, 0xfe5bb22a, 0xdd90014d, 0x3295f938, 0x0be0b40c, 0xa53ed4bd, 0x5fbf001e, 0xf9a6e9bf, 0x34381bf4, 0xe89827b0, 0x987fbf01, 0xb03302e9, 0xb0b40a00, 0x90b9bc0c, 0x1bf4f9a6, 0x1444df1e, 0xf9180000, 0x0094330c, 0x90f1b206, @@ -2269,8 +2269,8 @@ const NvU32 soe_ucode_data_lr10_prd[] = { 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x69e9060c, 0xe6ca2d91, 0xac20edf2, 0xeafeafcc, 0x1de66f4b, 0x98838b38, 0xce342fcf, 0x31422bca, - 0x30867660, 0xbc4af25f, 0xbc09e1ed, 0xab87e0fc, 0x154ee848, 0x4d419617, 0xc10ab5e0, 0x5570cfeb, + 0x69e9060c, 0xe6ca2d91, 0xac20edf2, 0xeafeafcc, 0x294f2cc2, 0x883a9d68, 0x493e2990, 0xc8e27d59, + 0x30867660, 0xbc4af25f, 0xbc09e1ed, 0xab87e0fc, 0x8fc5fac6, 0xe1f366be, 0x1ec159bf, 0x352ff984, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, diff --git a/src/common/nvswitch/kernel/lr10/link_lr10.c b/src/common/nvswitch/kernel/lr10/link_lr10.c index f3ddd5211..9912e1291 100644 --- a/src/common/nvswitch/kernel/lr10/link_lr10.c +++ b/src/common/nvswitch/kernel/lr10/link_lr10.c @@ -1329,6 +1329,13 @@ nvswitch_corelib_set_tl_link_mode_lr10 nvswitch_device *device = link->dev->pDevInfo; NvlStatus status = NVL_SUCCESS; + if (nvswitch_is_tnvl_mode_locked(device)) + { + NVSWITCH_PRINT(device, ERROR, + "%s(%d): Security locked\n", __FUNCTION__, __LINE__); + return NVL_ERR_INSUFFICIENT_PERMISSIONS; + } + if (!NVSWITCH_IS_LINK_ENG_VALID_LR10(device, NVLDL, link->linkNumber)) { NVSWITCH_PRINT(device, ERROR, @@ -1728,6 +1735,13 @@ nvswitch_corelib_set_rx_mode_lr10 NvlStatus status = NVL_SUCCESS; NvU32 delay_ns; + if (nvswitch_is_tnvl_mode_locked(device)) + { + NVSWITCH_PRINT(device, ERROR, + "%s(%d): Security locked\n", __FUNCTION__, __LINE__); + return NVL_ERR_INSUFFICIENT_PERMISSIONS; + } + if (!NVSWITCH_IS_LINK_ENG_VALID_LR10(device, NVLDL, link->linkNumber)) { NVSWITCH_PRINT(device, ERROR, @@ -1955,6 +1969,13 @@ nvswitch_corelib_set_rx_detect_lr10 NvlStatus status; nvswitch_device *device = link->dev->pDevInfo; + if (nvswitch_is_tnvl_mode_locked(device)) + { + NVSWITCH_PRINT(device, ERROR, + "%s(%d): Security locked\n", __FUNCTION__, __LINE__); + return NVL_ERR_INSUFFICIENT_PERMISSIONS; + } + if (nvswitch_does_link_need_termination_enabled(device, link)) { NVSWITCH_PRINT(device, INFO, @@ -2094,6 +2115,13 @@ nvswitch_request_tl_link_state_lr10 NvlStatus status = NVL_SUCCESS; NvU32 linkStatus; + if (nvswitch_is_tnvl_mode_locked(device)) + { + NVSWITCH_PRINT(device, ERROR, + "%s(%d): Security locked\n", __FUNCTION__, __LINE__); + return NVL_ERR_INSUFFICIENT_PERMISSIONS; + } + if (!NVSWITCH_IS_LINK_ENG_VALID_LR10(device, NVLIPT_LNK, link->linkNumber)) { NVSWITCH_PRINT(device, ERROR, diff --git a/src/common/nvswitch/kernel/lr10/lr10.c b/src/common/nvswitch/kernel/lr10/lr10.c index 30154ab37..b1a8cfeae 100644 --- a/src/common/nvswitch/kernel/lr10/lr10.c +++ b/src/common/nvswitch/kernel/lr10/lr10.c @@ -8204,6 +8204,26 @@ nvswitch_tnvl_disable_interrupts_lr10 return; } +void +nvswitch_reg_write_32_lr10 +( + nvswitch_device *device, + NvU32 offset, + NvU32 data +) +{ + if (device->nvlink_device->pciInfo.bars[0].pBar == NULL) + { + NVSWITCH_PRINT(device, ERROR, + "%s: register write failed at offset 0x%x\n", + __FUNCTION__, offset); + return; + } + + // Write the register + nvswitch_os_mem_write32((NvU8 *)device->nvlink_device->pciInfo.bars[0].pBar + offset, data); +} + // // This function auto creates the lr10 HAL connectivity from the NVSWITCH_INIT_HAL // macro in haldef_nvswitch.h diff --git a/src/common/nvswitch/kernel/ls10/link_ls10.c b/src/common/nvswitch/kernel/ls10/link_ls10.c index a7dcb066e..9d066a350 100644 --- a/src/common/nvswitch/kernel/ls10/link_ls10.c +++ b/src/common/nvswitch/kernel/ls10/link_ls10.c @@ -160,6 +160,13 @@ nvswitch_corelib_training_complete_ls10 { nvswitch_device *device = link->dev->pDevInfo; + if (nvswitch_is_tnvl_mode_locked(device)) + { + NVSWITCH_PRINT(device, ERROR, + "%s(%d): Security locked\n", __FUNCTION__, __LINE__); + return; // NVL_ERR_INSUFFICIENT_PERMISSIONS; + } + nvswitch_init_dlpl_interrupts(link); _nvswitch_configure_reserved_throughput_counters(link); @@ -265,6 +272,13 @@ nvswitch_corelib_set_tx_mode_ls10 NvU32 val; NvlStatus status = NVL_SUCCESS; + if (nvswitch_is_tnvl_mode_locked(device)) + { + NVSWITCH_PRINT(device, ERROR, + "%s(%d): Security locked\n", __FUNCTION__, __LINE__); + return NVL_ERR_INSUFFICIENT_PERMISSIONS; + } + if (!NVSWITCH_IS_LINK_ENG_VALID_LS10(device, NVLDL, link->linkNumber)) { NVSWITCH_PRINT(device, ERROR, @@ -352,6 +366,13 @@ nvswitch_corelib_set_dl_link_mode_ls10 NvBool keepPolling; NVSWITCH_TIMEOUT timeout; + if (nvswitch_is_tnvl_mode_locked(device)) + { + NVSWITCH_PRINT(device, ERROR, + "%s(%d): Security locked\n", __FUNCTION__, __LINE__); + return NVL_ERR_INSUFFICIENT_PERMISSIONS; + } + if (!NVSWITCH_IS_LINK_ENG_VALID_LS10(device, NVLDL, link->linkNumber)) { NVSWITCH_PRINT(device, ERROR, @@ -494,6 +515,13 @@ nvswitch_corelib_get_rx_detect_ls10 NvlStatus status; nvswitch_device *device = link->dev->pDevInfo; + if (nvswitch_is_tnvl_mode_locked(device)) + { + NVSWITCH_PRINT(device, ERROR, + "%s(%d): Security locked\n", __FUNCTION__, __LINE__); + return NVL_ERR_INSUFFICIENT_PERMISSIONS; + } + status = nvswitch_minion_get_rxdet_status_ls10(device, link->linkNumber); if (status != NVL_SUCCESS) @@ -590,13 +618,22 @@ nvswitch_corelib_get_tl_link_mode_ls10 { case NV_NVLIPT_LNK_CTRL_LINK_STATE_STATUS_CURRENTLINKSTATE_ACTIVE: - // If using ALI, ensure that the request to active completed - if (link->dev->enableALI) + if (nvswitch_is_tnvl_mode_locked(device)) { - status = nvswitch_wait_for_tl_request_ready_ls10(link); + NVSWITCH_PRINT(device, ERROR, + "%s(%d): Security locked\n", __FUNCTION__, __LINE__); + *mode = NVLINK_LINKSTATE_HS; } + else + { + // If using ALI, ensure that the request to active completed + if (link->dev->enableALI) + { + status = nvswitch_wait_for_tl_request_ready_ls10(link); + } - *mode = (status == NVL_SUCCESS) ? NVLINK_LINKSTATE_HS:NVLINK_LINKSTATE_OFF; + *mode = (status == NVL_SUCCESS) ? NVLINK_LINKSTATE_HS:NVLINK_LINKSTATE_OFF; + } break; case NV_NVLIPT_LNK_CTRL_LINK_STATE_STATUS_CURRENTLINKSTATE_L2: @@ -995,6 +1032,13 @@ nvswitch_launch_ALI_link_training_ls10 { NvlStatus status = NVL_SUCCESS; + if (nvswitch_is_tnvl_mode_locked(device)) + { + NVSWITCH_PRINT(device, ERROR, + "%s(%d): Security locked\n", __FUNCTION__, __LINE__); + return NVL_ERR_INSUFFICIENT_PERMISSIONS; + } + if ((link == NULL) || !NVSWITCH_IS_LINK_ENG_VALID_LS10(device, NVLIPT_LNK, link->linkNumber) || (link->linkNumber >= NVSWITCH_NVLINK_MAX_LINKS)) diff --git a/src/common/nvswitch/kernel/ls10/ls10.c b/src/common/nvswitch/kernel/ls10/ls10.c index 7b7ea7b60..4526e1ff1 100644 --- a/src/common/nvswitch/kernel/ls10/ls10.c +++ b/src/common/nvswitch/kernel/ls10/ls10.c @@ -4409,11 +4409,11 @@ nvswitch_eng_wr_ls10 if (nvswitch_is_tnvl_mode_enabled(device)) { - nvswitch_tnvl_reg_wr_32_ls10(device, eng_id, eng_bcast, eng_instance, base_addr, offset, data); + nvswitch_tnvl_eng_wr_32_ls10(device, eng_id, eng_bcast, eng_instance, base_addr, offset, data); } else { - nvswitch_reg_write_32(device, base_addr + offset, data); + nvswitch_reg_write_32(device, base_addr + offset, data); } #if defined(DEVELOP) || defined(DEBUG) || defined(NV_MODS) @@ -4431,6 +4431,33 @@ nvswitch_eng_wr_ls10 #endif //defined(DEVELOP) || defined(DEBUG) || defined(NV_MODS) } +void +nvswitch_reg_write_32_ls10 +( + nvswitch_device *device, + NvU32 offset, + NvU32 data +) +{ + if (device->nvlink_device->pciInfo.bars[0].pBar == NULL) + { + NVSWITCH_PRINT(device, ERROR, + "%s: register write failed at offset 0x%x\n", + __FUNCTION__, offset); + return; + } + + if (nvswitch_is_tnvl_mode_enabled(device)) + { + nvswitch_tnvl_reg_wr_32_ls10(device, offset, data); + } + else + { + // Write the register + nvswitch_os_mem_write32((NvU8 *)device->nvlink_device->pciInfo.bars[0].pBar + offset, data); + } +} + NvU32 nvswitch_get_link_eng_inst_ls10 ( diff --git a/src/common/nvswitch/kernel/ls10/soe_ls10.c b/src/common/nvswitch/kernel/ls10/soe_ls10.c index 24c901dfc..5be1d2e04 100644 --- a/src/common/nvswitch/kernel/ls10/soe_ls10.c +++ b/src/common/nvswitch/kernel/ls10/soe_ls10.c @@ -590,11 +590,19 @@ nvswitch_soe_reg_wr_32_ls10 return NVL_SUCCESS; // -NVL_ERR_NOT_SUPPORTED } + if (device->nvlink_device->pciInfo.bars[0].pBar == NULL) + { + NVSWITCH_PRINT(device, ERROR, + "%s: register write failed at offset 0x%x\n", + __FUNCTION__, offset); + return -NVL_IO_ERROR; + } + status = device->hal.nvswitch_ctrl_get_bios_info(device, ¶ms); if ((status != NVL_SUCCESS) || ((params.version & SOE_VBIOS_VERSION_MASK) < - SOE_VBIOS_REVLOCK_ISSUE_REGISTER_WRITE)) + SOE_VBIOS_REVLOCK_SOE_PRI_CHECKS)) { - nvswitch_reg_write_32(device, offset, data); + nvswitch_os_mem_write32((NvU8 *)device->nvlink_device->pciInfo.bars[0].pBar + offset, data); return NVL_SUCCESS; } @@ -629,6 +637,96 @@ nvswitch_soe_reg_wr_32_ls10 return NVL_SUCCESS; } +/* + * @Brief : Perform engine writes in SOE during TNVL + * + * @param[in] device + * @param[in] eng_id NVSWITCH_ENGINE_ID* + * @param[in] eng_bcast NVSWITCH_GET_ENG_DESC_TYPE* + * @param[in] eng_instance + * @param[in] base_addr + * @param[in] offset + * @param[in] data + */ +NvlStatus +nvswitch_soe_eng_wr_32_ls10 +( + nvswitch_device *device, + NVSWITCH_ENGINE_ID eng_id, + NvU32 eng_bcast, + NvU32 eng_instance, + NvU32 base_addr, + NvU32 offset, + NvU32 data +) +{ + FLCN *pFlcn; + NvU32 cmdSeqDesc = 0; + NV_STATUS status; + RM_FLCN_CMD_SOE cmd; + NVSWITCH_TIMEOUT timeout; + RM_SOE_TNVL_CMD_ENGINE_WRITE *pEngineWrite; + NVSWITCH_GET_BIOS_INFO_PARAMS params = { 0 }; + + if (!nvswitch_is_soe_supported(device)) + { + NVSWITCH_PRINT(device, INFO, + "%s: SOE is not supported\n", + __FUNCTION__); + return NVL_SUCCESS; // -NVL_ERR_NOT_SUPPORTED + } + + if (device->nvlink_device->pciInfo.bars[0].pBar == NULL) + { + NVSWITCH_PRINT(device, ERROR, + "%s: register write failed at offset 0x%x\n", + __FUNCTION__, offset); + return -NVL_IO_ERROR; + } + + status = device->hal.nvswitch_ctrl_get_bios_info(device, ¶ms); + if ((status != NVL_SUCCESS) || ((params.version & SOE_VBIOS_VERSION_MASK) < + SOE_VBIOS_REVLOCK_SOE_PRI_CHECKS)) + { + nvswitch_os_mem_write32((NvU8 *)device->nvlink_device->pciInfo.bars[0].pBar + base_addr + offset, data); + return NVL_SUCCESS; + } + + pFlcn = device->pSoe->pFlcn; + + nvswitch_os_memset(&cmd, 0, sizeof(cmd)); + + cmd.hdr.unitId = RM_SOE_UNIT_TNVL; + cmd.hdr.size = RM_SOE_CMD_SIZE(TNVL, ENGINE_WRITE); + + pEngineWrite = &cmd.cmd.tnvl.engineWrite; + pEngineWrite->cmdType = RM_SOE_TNVL_CMD_ISSUE_ENGINE_WRITE; + pEngineWrite->eng_id = eng_id; + pEngineWrite->eng_bcast = eng_bcast; + pEngineWrite->eng_instance = eng_instance; + pEngineWrite->base = base_addr; + pEngineWrite->offset = offset; + pEngineWrite->data = data; + + nvswitch_timeout_create(NVSWITCH_INTERVAL_5MSEC_IN_NS, &timeout); + status = flcnQueueCmdPostBlocking(device, pFlcn, + (PRM_FLCN_CMD)&cmd, + NULL, // pMsg + NULL, // pPayload + SOE_RM_CMDQ_LOG_ID, + &cmdSeqDesc, + &timeout); + if (status != NV_OK) + { + NVSWITCH_PRINT(device, ERROR, + "%s: Failed to send ENGINE_WRITE command to SOE, offset = 0x%x, data = 0x%x\n", + __FUNCTION__, offset, data); + return -NVL_ERR_GENERIC; + } + + return NVL_SUCCESS; +} + /* * @Brief : Init sequence for SOE FSP RISCV image * @@ -902,7 +1000,6 @@ _soeService_LS10 ) { NvBool bRecheckMsgQ = NV_FALSE; - NvBool bRecheckPrintQ = NV_FALSE; NvU32 clearBits = 0; NvU32 intrStatus; PFLCN pFlcn = ENG_GET_FLCN(pSoe); @@ -968,8 +1065,6 @@ _soeService_LS10 NVSWITCH_PRINT(device, INFO, "%s: Received a SWGEN1 interrupt\n", __FUNCTION__); - flcnDebugBufferDisplay_HAL(device, pFlcn); - bRecheckPrintQ = NV_TRUE; } // Clear any sources that were serviced and get the new status. @@ -1005,22 +1100,6 @@ _soeService_LS10 } } - // - // If we just processed a SWGEN1 interrupt (Debug Buffer interrupt), peek - // into the Debug Buffer and see if any text was missed the last time - // the buffer was displayed (above). If it is not empty, re-generate SWGEN1 - // (since it is now cleared) and exit. As long as an interrupt is pending, - // this function will be re-entered and the message(s) will be processed. - // - if (bRecheckPrintQ) - { - if (!flcnDebugBufferIsEmpty_HAL(device, pFlcn)) - { - flcnRegWrite_HAL(device, pFlcn, NV_PFALCON_FALCON_IRQSSET, - DRF_DEF(_PFALCON, _FALCON_IRQSSET, _SWGEN1, _SET)); - } - } - flcnIntrRetrigger_HAL(device, pFlcn); return intrStatus; diff --git a/src/common/nvswitch/kernel/ls10/sugen_ls10.c b/src/common/nvswitch/kernel/ls10/sugen_ls10.c index ddb46354d..822924998 100644 --- a/src/common/nvswitch/kernel/ls10/sugen_ls10.c +++ b/src/common/nvswitch/kernel/ls10/sugen_ls10.c @@ -936,6 +936,7 @@ nvswitch_nvs_top_prod_ls10 NVSWITCH_ENG_WR32(device, SYS_PRI_RS_CTRL, , 0, _PPRIV_RS_CTRL_SYS, _CG1, DRF_DEF(_PPRIV_RS_CTRL_SYS, _CG1, _SLCG, __PROD)); +#if 0 NVSWITCH_ENG_WR32(device, XAL, , 0, _XAL_EP, _CG, DRF_DEF(_XAL_EP, _CG, _IDLE_CG_DLY_CNT, __PROD) | DRF_DEF(_XAL_EP, _CG, _IDLE_CG_EN, __PROD) | @@ -961,7 +962,8 @@ nvswitch_nvs_top_prod_ls10 DRF_DEF(_XAL_EP, _CG1, _SLCG_TXMAP, __PROD) | DRF_DEF(_XAL_EP, _CG1, _SLCG_UNROLL_MEM, __PROD) | DRF_DEF(_XAL_EP, _CG1, _SLCG_UPARB, __PROD)); - +#endif //0 + NVSWITCH_ENG_WR32(device, XPL, , 0, _XPL, _PL_PAD_CTL_PRI_XPL_RXCLK_CG, DRF_DEF(_XPL, _PL_PAD_CTL_PRI_XPL_RXCLK_CG, _IDLE_CG_DLY_CNT, __PROD) | DRF_DEF(_XPL, _PL_PAD_CTL_PRI_XPL_RXCLK_CG, _IDLE_CG_EN, __PROD) | diff --git a/src/common/nvswitch/kernel/ls10/tnvl_ls10.c b/src/common/nvswitch/kernel/ls10/tnvl_ls10.c index f24b53478..cc14863f5 100644 --- a/src/common/nvswitch/kernel/ls10/tnvl_ls10.c +++ b/src/common/nvswitch/kernel/ls10/tnvl_ls10.c @@ -34,6 +34,10 @@ #include "nvswitch/ls10/dev_ctrl_ip_addendum.h" #include "nvswitch/ls10/dev_cpr_ip.h" #include "nvswitch/ls10/dev_npg_ip.h" +#include "nvswitch/ls10/dev_fsp_pri.h" +#include "nvswitch/ls10/dev_soe_ip.h" +#include "nvswitch/ls10/ptop_discovery_ip.h" +#include "nvswitch/ls10/dev_minion_ip.h" #include @@ -1058,7 +1062,7 @@ nvswitch_tnvl_get_status_ls10 } static NvBool -_nvswitch_reg_cpu_write_allow_list_ls10 +_nvswitch_tnvl_eng_wr_cpu_allow_list_ls10 ( nvswitch_device *device, NVSWITCH_ENGINE_ID eng_id, @@ -1091,6 +1095,15 @@ _nvswitch_reg_cpu_write_allow_list_ls10 return NV_TRUE; break; } + case NVSWITCH_ENGINE_ID_MINION: + { + if ((offset == NV_MINION_NVLINK_DL_STAT(0)) || + (offset == NV_MINION_NVLINK_DL_STAT(1)) || + (offset == NV_MINION_NVLINK_DL_STAT(2)) || + (offset == NV_MINION_NVLINK_DL_STAT(3))) + return NV_TRUE; + break; + } default : return NV_FALSE; } @@ -1099,7 +1112,7 @@ _nvswitch_reg_cpu_write_allow_list_ls10 } void -nvswitch_tnvl_reg_wr_32_ls10 +nvswitch_tnvl_eng_wr_32_ls10 ( nvswitch_device *device, NVSWITCH_ENGINE_ID eng_id, @@ -1110,45 +1123,124 @@ nvswitch_tnvl_reg_wr_32_ls10 NvU32 data ) { - if (!nvswitch_is_tnvl_mode_enabled(device)) + if (device->nvlink_device->pciInfo.bars[0].pBar == NULL) { NVSWITCH_PRINT(device, ERROR, - "%s: TNVL mode is not enabled\n", - __FUNCTION__); - NVSWITCH_ASSERT(0); + "%s: register write failed at offset 0x%x\n", + __FUNCTION__, offset); + return; + } + + if (!nvswitch_is_tnvl_mode_enabled(device)) + { + NVSWITCH_PRINT_SXID(device, NVSWITCH_ERR_HW_HOST_TNVL_ERROR, + "ENG reg-write failed. TNVL mode is not enabled\n"); + return; + } + + if (_nvswitch_tnvl_eng_wr_cpu_allow_list_ls10(device, eng_id, offset)) + { + nvswitch_os_mem_write32((NvU8 *)device->nvlink_device->pciInfo.bars[0].pBar + base_addr + offset, data); return; } if (nvswitch_is_tnvl_mode_locked(device)) { + NVSWITCH_PRINT_SXID(device, NVSWITCH_ERR_HW_HOST_TNVL_ERROR, + "TNVL ENG_WR failure - 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n", + eng_id, eng_instance, eng_bcast, base_addr, offset, data); + + NVSWITCH_PRINT_SXID(device, NVSWITCH_ERR_HW_HOST_TNVL_ERROR, + "TNVL mode is locked\n"); + return; + } + + if (nvswitch_soe_eng_wr_32_ls10(device, eng_id, eng_bcast, eng_instance, base_addr, offset, data) != NVL_SUCCESS) + { + NVSWITCH_PRINT_SXID(device, NVSWITCH_ERR_HW_HOST_TNVL_ERROR, + "TNVL ENG_WR failure - 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n", + eng_id, eng_instance, eng_bcast, base_addr, offset, data); + NVSWITCH_PRINT(device, ERROR, - "%s: TNVL mode is locked\n", - __FUNCTION__); + "%s: SOE ENG_WR failed for 0x%x[%d] %s @0x%08x+0x%06x = 0x%08x\n", + __FUNCTION__, + eng_id, eng_instance, + ( + (eng_bcast == NVSWITCH_GET_ENG_DESC_TYPE_UNICAST) ? "UC" : + (eng_bcast == NVSWITCH_GET_ENG_DESC_TYPE_BCAST) ? "BC" : + (eng_bcast == NVSWITCH_GET_ENG_DESC_TYPE_MULTICAST) ? "MC" : + "??" + ), + base_addr, offset, data); + } +} + +static NvBool +_nvswitch_tnvl_reg_wr_cpu_allow_list_ls10 +( + nvswitch_device *device, + NvU32 offset +) +{ + if ((offset >= DRF_BASE(NV_PFSP)) && + (offset <= DRF_EXTENT(NV_PFSP))) + { + return NV_TRUE; + } + + if ((offset >= NV_PTOP_UNICAST_SW_DEVICE_BASE_SOE_0 + DRF_BASE(NV_SOE)) && + (offset <= NV_PTOP_UNICAST_SW_DEVICE_BASE_SOE_0 + DRF_EXTENT(NV_SOE))) + { + return NV_TRUE; + } + + return NV_FALSE; +} + +void +nvswitch_tnvl_reg_wr_32_ls10 +( + nvswitch_device *device, + NvU32 offset, + NvU32 data +) +{ + if (device->nvlink_device->pciInfo.bars[0].pBar == NULL) + { + NVSWITCH_PRINT(device, ERROR, + "%s: register write failed at offset 0x%x\n", + __FUNCTION__, offset); NVSWITCH_ASSERT(0); return; } - if (_nvswitch_reg_cpu_write_allow_list_ls10(device, eng_id, offset)) + if (!nvswitch_is_tnvl_mode_enabled(device)) { - nvswitch_reg_write_32(device, base_addr + offset, data); + NVSWITCH_PRINT_SXID(device, NVSWITCH_ERR_HW_HOST_TNVL_ERROR, + "Reg-write failed. TNVL mode is not enabled\n"); + return; } - else + + if (_nvswitch_tnvl_reg_wr_cpu_allow_list_ls10(device, offset)) { - if (nvswitch_soe_reg_wr_32_ls10(device, base_addr + offset, data) != NVL_SUCCESS) - { - NVSWITCH_PRINT(device, ERROR, - "%s: SOE ENG_WR failed for 0x%x[%d] %s @0x%08x+0x%06x = 0x%08x\n", - __FUNCTION__, - eng_id, eng_instance, - ( - (eng_bcast == NVSWITCH_GET_ENG_DESC_TYPE_UNICAST) ? "UC" : - (eng_bcast == NVSWITCH_GET_ENG_DESC_TYPE_BCAST) ? "BC" : - (eng_bcast == NVSWITCH_GET_ENG_DESC_TYPE_MULTICAST) ? "MC" : - "??" - ), - base_addr, offset, data); - NVSWITCH_ASSERT(0); - } + nvswitch_os_mem_write32((NvU8 *)device->nvlink_device->pciInfo.bars[0].pBar + offset, data); + return; + } + + if (nvswitch_is_tnvl_mode_locked(device)) + { + NVSWITCH_PRINT_SXID(device, NVSWITCH_ERR_HW_HOST_TNVL_ERROR, + "TNVL REG_WR failure - 0x%08x, 0x%08x\n", offset, data); + + NVSWITCH_PRINT_SXID(device, NVSWITCH_ERR_HW_HOST_TNVL_ERROR, + "TNVL mode is locked\n"); + return; + } + + if (nvswitch_soe_reg_wr_32_ls10(device, offset, data) != NVL_SUCCESS) + { + NVSWITCH_PRINT_SXID(device, NVSWITCH_ERR_HW_HOST_TNVL_ERROR, + "TNVL REG_WR failure - 0x%08x, 0x%08x\n", offset, data); } } diff --git a/src/common/nvswitch/kernel/nvswitch.c b/src/common/nvswitch/kernel/nvswitch.c index f17ecf7a6..da3e637c7 100644 --- a/src/common/nvswitch/kernel/nvswitch.c +++ b/src/common/nvswitch/kernel/nvswitch.c @@ -4964,10 +4964,7 @@ nvswitch_reg_write_32 device->nvlink_device->pciInfo.bars[0].baseAddr, offset, data); #endif - // Write the register - nvswitch_os_mem_write32((NvU8 *)device->nvlink_device->pciInfo.bars[0].pBar + offset, data); - - return; + device->hal.nvswitch_reg_write_32(device, offset, data); } NvU64 @@ -6074,6 +6071,101 @@ _nvswitch_ctrl_set_device_tnvl_lock return status; } +/* + * Service ioctls supported when TNVL mode is locked + */ +NvlStatus +nvswitch_lib_ctrl_tnvl_lock_only +( + nvswitch_device *device, + NvU32 cmd, + void *params, + NvU64 size, + void *osPrivate +) +{ + NvlStatus retval; + NvU64 flags = 0; + + if (!NVSWITCH_IS_DEVICE_ACCESSIBLE(device) || params == NULL) + { + return -NVL_BAD_ARGS; + } + + flags = NVSWITCH_DEV_CMD_CHECK_ADMIN | NVSWITCH_DEV_CMD_CHECK_FM; + switch (cmd) + { + NVSWITCH_DEV_CMD_DISPATCH(CTRL_NVSWITCH_GET_INFOROM_VERSION, + _nvswitch_ctrl_get_inforom_version, + NVSWITCH_GET_INFOROM_VERSION_PARAMS); + NVSWITCH_DEV_CMD_DISPATCH_PRIVILEGED( + CTRL_NVSWITCH_GET_NVLINK_MAX_ERROR_RATES, + _nvswitch_ctrl_get_inforom_nvlink_max_correctable_error_rate, + NVSWITCH_GET_NVLINK_MAX_CORRECTABLE_ERROR_RATES_PARAMS, + osPrivate, flags); + NVSWITCH_DEV_CMD_DISPATCH_PRIVILEGED( + CTRL_NVSWITCH_GET_NVLINK_ERROR_COUNTS, + _nvswitch_ctrl_get_inforom_nvlink_errors, + NVSWITCH_GET_NVLINK_ERROR_COUNTS_PARAMS, + osPrivate, flags); + NVSWITCH_DEV_CMD_DISPATCH_PRIVILEGED( + CTRL_NVSWITCH_GET_ECC_ERROR_COUNTS, + _nvswitch_ctrl_get_inforom_ecc_errors, + NVSWITCH_GET_ECC_ERROR_COUNTS_PARAMS, + osPrivate, flags); + NVSWITCH_DEV_CMD_DISPATCH_PRIVILEGED( + CTRL_NVSWITCH_GET_SXIDS, + _nvswitch_ctrl_get_inforom_bbx_sxid, + NVSWITCH_GET_SXIDS_PARAMS, + osPrivate, flags); + NVSWITCH_DEV_CMD_DISPATCH_PRIVILEGED( + CTRL_NVSWITCH_GET_SYS_INFO, + _nvswitch_ctrl_get_inforom_bbx_sys_info, + NVSWITCH_GET_SYS_INFO_PARAMS, + osPrivate, flags); + NVSWITCH_DEV_CMD_DISPATCH_PRIVILEGED( + CTRL_NVSWITCH_GET_TIME_INFO, + _nvswitch_ctrl_get_inforom_bbx_time_info, + NVSWITCH_GET_TIME_INFO_PARAMS, + osPrivate, flags); + NVSWITCH_DEV_CMD_DISPATCH_PRIVILEGED( + CTRL_NVSWITCH_GET_TEMP_DATA, + _nvswitch_ctrl_get_inforom_bbx_temp_data, + NVSWITCH_GET_TEMP_DATA_PARAMS, + osPrivate, flags); + NVSWITCH_DEV_CMD_DISPATCH_PRIVILEGED( + CTRL_NVSWITCH_GET_TEMP_SAMPLES, + _nvswitch_ctrl_get_inforom_bbx_temp_samples, + NVSWITCH_GET_TEMP_SAMPLES_PARAMS, + osPrivate, flags); + NVSWITCH_DEV_CMD_DISPATCH_PRIVILEGED( + CTRL_NVSWITCH_GET_ATTESTATION_CERTIFICATE_CHAIN, + _nvswitch_ctrl_get_attestation_certificate_chain, + NVSWITCH_GET_ATTESTATION_CERTIFICATE_CHAIN_PARAMS, + osPrivate, flags); + NVSWITCH_DEV_CMD_DISPATCH_PRIVILEGED( + CTRL_NVSWITCH_GET_ATTESTATION_REPORT, + _nvswitch_ctrl_get_attestation_report, + NVSWITCH_GET_ATTESTATION_REPORT_PARAMS, + osPrivate, flags); + NVSWITCH_DEV_CMD_DISPATCH_PRIVILEGED( + CTRL_NVSWITCH_GET_TNVL_STATUS, + _nvswitch_ctrl_get_tnvl_status, + NVSWITCH_GET_TNVL_STATUS_PARAMS, + osPrivate, flags); + NVSWITCH_DEV_CMD_DISPATCH_PRIVILEGED( + CTRL_NVSWITCH_SET_FM_DRIVER_STATE, + nvswitch_ctrl_set_fm_driver_state, + NVSWITCH_SET_FM_DRIVER_STATE_PARAMS, + osPrivate, flags); + default: + nvswitch_os_print(NVSWITCH_DBG_LEVEL_INFO, "ioctl %x is not permitted when TNVL is locked\n", cmd); + return -NVL_ERR_INSUFFICIENT_PERMISSIONS; + } + + return retval; +} + NvlStatus nvswitch_lib_ctrl ( @@ -6087,6 +6179,11 @@ nvswitch_lib_ctrl NvlStatus retval; NvU64 flags = 0; + if (nvswitch_is_tnvl_mode_locked(device)) + { + return nvswitch_lib_ctrl_tnvl_lock_only(device, cmd, params, size, osPrivate); + } + if (!NVSWITCH_IS_DEVICE_ACCESSIBLE(device) || params == NULL) { return -NVL_BAD_ARGS; diff --git a/src/common/sdk/nvidia/inc/ctrl/ctrl0000/ctrl0000vgpu.h b/src/common/sdk/nvidia/inc/ctrl/ctrl0000/ctrl0000vgpu.h index 476e83067..29c893031 100644 --- a/src/common/sdk/nvidia/inc/ctrl/ctrl0000/ctrl0000vgpu.h +++ b/src/common/sdk/nvidia/inc/ctrl/ctrl0000/ctrl0000vgpu.h @@ -37,6 +37,15 @@ #include "class/cl0000.h" #include "nv_vgpu_types.h" +/* DRF macros for OBJGPU::gpuId */ +#define NV0000_BUSDEVICE_DOMAIN 31:16 +#define NV0000_BUSDEVICE_BUS 15:8 +#define NV0000_BUSDEVICE_DEVICE 7:0 + +#define GPU_32_BIT_ID_DECODE_DOMAIN(gpuId) (NvU16)DRF_VAL(0000, _BUSDEVICE, _DOMAIN, gpuId); +#define GPU_32_BIT_ID_DECODE_BUS(gpuId) (NvU8) DRF_VAL(0000, _BUSDEVICE, _BUS, gpuId); +#define GPU_32_BIT_ID_DECODE_DEVICE(gpuId) (NvU8) DRF_VAL(0000, _BUSDEVICE, _DEVICE, gpuId); + /* * NV0000_CTRL_CMD_VGPU_CREATE_DEVICE * diff --git a/src/common/sdk/nvidia/inc/nvstatuscodes.h b/src/common/sdk/nvidia/inc/nvstatuscodes.h index 552207f21..2a0444d22 100644 --- a/src/common/sdk/nvidia/inc/nvstatuscodes.h +++ b/src/common/sdk/nvidia/inc/nvstatuscodes.h @@ -152,6 +152,8 @@ NV_STATUS_CODE(NV_ERR_FABRIC_MANAGER_NOT_PRESENT, 0x0000007A, "Fabric Manag NV_STATUS_CODE(NV_ERR_ALREADY_SIGNALLED, 0x0000007B, "Semaphore Surface value already >= requested wait value") NV_STATUS_CODE(NV_ERR_QUEUE_TASK_SLOT_NOT_AVAILABLE, 0x0000007C, "PMU RPC error due to no queue slot available for this event") NV_STATUS_CODE(NV_ERR_KEY_ROTATION_IN_PROGRESS, 0x0000007D, "Operation not allowed as key rotation is in progress") +NV_STATUS_CODE(NV_ERR_NVSWITCH_FABRIC_NOT_READY, 0x00000081, "Nvswitch Fabric Status or Fabric Probe is not yet complete, caller needs to retry") +NV_STATUS_CODE(NV_ERR_NVSWITCH_FABRIC_FAILURE, 0x00000082, "Nvswitch Fabric Probe failed") // Warnings: NV_STATUS_CODE(NV_WARN_HOT_SWITCH, 0x00010001, "WARNING Hot switch") diff --git a/src/common/uproc/os/common/include/liblogdecode.h b/src/common/uproc/os/common/include/liblogdecode.h index 845f43e6a..97deaeabb 100644 --- a/src/common/uproc/os/common/include/liblogdecode.h +++ b/src/common/uproc/os/common/include/liblogdecode.h @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2019-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2019-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a @@ -187,6 +187,7 @@ void libosLogDestroy(LIBOS_LOG_DECODE *logDecode); void libosExtractLogs(LIBOS_LOG_DECODE *logDecode, NvBool bSyncNvLog); void libosPreserveLogs(LIBOS_LOG_DECODE *pLogDecode); +NvBool isLibosPreserveLogBufferFull(LIBOS_LOG_DECODE *pLogDecode, NvU32 gpuInstance); #ifdef __cplusplus } diff --git a/src/common/uproc/os/libos-v3.1.0/lib/liblogdecode.c b/src/common/uproc/os/libos-v3.1.0/lib/liblogdecode.c index 5114675d6..4d190a69e 100644 --- a/src/common/uproc/os/libos-v3.1.0/lib/liblogdecode.c +++ b/src/common/uproc/os/libos-v3.1.0/lib/liblogdecode.c @@ -1438,6 +1438,34 @@ void libosPreserveLogs(LIBOS_LOG_DECODE *pLogDecode) } } +NvBool isLibosPreserveLogBufferFull(LIBOS_LOG_DECODE *pLogDecode, NvU32 gpuInstance) +{ + NvU64 i = (NvU32)(pLogDecode->numLogBuffers); + NvU32 tag = LIBOS_LOG_NVLOG_BUFFER_TAG(pLogDecode->sourceName, i * 2); + NVLOG_BUFFER_HANDLE handle = 0; + NV_STATUS status = nvlogGetBufferHandleFromTag(tag, &handle); + + if (status != NV_OK) + { + return NV_FALSE; + } + + NVLOG_BUFFER *pNvLogBuffer = NvLogLogger.pBuffers[handle]; + if (pNvLogBuffer == NULL) + { + return NV_FALSE; + } + + if (FLD_TEST_DRF(LOG_BUFFER, _FLAGS, _PRESERVE, _YES, pNvLogBuffer->flags) && + DRF_VAL(LOG, _BUFFER_FLAGS, _GPU_INSTANCE, pNvLogBuffer->flags) == gpuInstance && + (pNvLogBuffer->pos >= pNvLogBuffer->size - NV_OFFSETOF(LIBOS_LOG_NVLOG_BUFFER, data) - sizeof(NvU64))) + { + return NV_TRUE; + } + + return NV_FALSE; +} + static NvBool findPreservedNvlogBuffer(NvU32 tag, NvU32 gpuInstance, NVLOG_BUFFER_HANDLE *pHandle) { NVLOG_BUFFER_HANDLE handle = 0; diff --git a/src/nvidia-modeset/src/nvkms-headsurface-ioctl.c b/src/nvidia-modeset/src/nvkms-headsurface-ioctl.c index 1777ce2cc..895624fc1 100644 --- a/src/nvidia-modeset/src/nvkms-headsurface-ioctl.c +++ b/src/nvidia-modeset/src/nvkms-headsurface-ioctl.c @@ -110,7 +110,7 @@ NvBool nvHsIoctlMoveCursor( { NVHsChannelEvoRec *pHsChannel; - if (apiHead > ARRAY_LEN(pDispEvo->pHsChannel)) { + if (apiHead >= ARRAY_LEN(pDispEvo->pHsChannel)) { return FALSE; } @@ -206,7 +206,7 @@ NvBool nvHsIoctlSetCursorImage( NVHsChannelEvoRec *pHsChannel; NVSurfaceEvoRec *pSurfaceEvo = NULL; - if (apiHead > ARRAY_LEN(pDispEvo->pHsChannel)) { + if (apiHead >= ARRAY_LEN(pDispEvo->pHsChannel)) { return FALSE; } diff --git a/src/nvidia-modeset/src/nvkms.c b/src/nvidia-modeset/src/nvkms.c index 50c46bd35..9e9b54abb 100644 --- a/src/nvidia-modeset/src/nvkms.c +++ b/src/nvidia-modeset/src/nvkms.c @@ -4186,6 +4186,7 @@ static NvBool SwitchMux( { struct NvKmsSwitchMuxParams *pParams = pParamsVoid; const struct NvKmsSwitchMuxRequest *r = &pParams->request; + struct NvKmsPerOpenDev *pOpenDev; NVDpyEvoPtr pDpyEvo; pDpyEvo = GetPerOpenDpy(pOpen, r->deviceHandle, r->dispHandle, r->dpyId); @@ -4193,7 +4194,12 @@ static NvBool SwitchMux( return FALSE; } - if (!nvKmsOpenDevHasSubOwnerPermissionOrBetter(GetPerOpenDev(pOpen, r->deviceHandle))) { + pOpenDev = GetPerOpenDev(pOpen, r->deviceHandle); + if (pOpenDev == NULL) { + return FALSE; + } + + if (!nvKmsOpenDevHasSubOwnerPermissionOrBetter(pOpenDev)) { return FALSE; } diff --git a/src/nvidia/arch/nvalloc/unix/src/os-hypervisor.c b/src/nvidia/arch/nvalloc/unix/src/os-hypervisor.c index 74e76202d..f93e9d338 100644 --- a/src/nvidia/arch/nvalloc/unix/src/os-hypervisor.c +++ b/src/nvidia/arch/nvalloc/unix/src/os-hypervisor.c @@ -45,6 +45,7 @@ #include "gpu/bus/kern_bus.h" #include // NV_PMC_BOOT_1_VGPU #include "nvdevid.h" +#include "ctrl/ctrl0000/ctrl0000vgpu.h" #include "g_vgpu_chip_flags.h" // vGPU device names @@ -845,9 +846,9 @@ void osWakeRemoveVgpu(NvU32 gpuId, NvU32 returnStatus) vgpu_vfio_info vgpu_info; vgpu_info.return_status = returnStatus; - vgpu_info.domain = gpuDecodeDomain(gpuId); - vgpu_info.bus = gpuDecodeBus(gpuId); - vgpu_info.device = gpuDecodeDevice(gpuId); + vgpu_info.domain = GPU_32_BIT_ID_DECODE_DOMAIN(gpuId); + vgpu_info.bus = GPU_32_BIT_ID_DECODE_BUS(gpuId); + vgpu_info.device = GPU_32_BIT_ID_DECODE_DEVICE(gpuId); os_call_vgpu_vfio((void *)&vgpu_info, CMD_VFIO_WAKE_REMOVE_GPU); } diff --git a/src/nvidia/arch/nvalloc/unix/src/osapi.c b/src/nvidia/arch/nvalloc/unix/src/osapi.c index 85b75f520..0fe0aecb9 100644 --- a/src/nvidia/arch/nvalloc/unix/src/osapi.c +++ b/src/nvidia/arch/nvalloc/unix/src/osapi.c @@ -481,6 +481,11 @@ static NV_STATUS allocate_os_event( status = NV_ERR_NO_MEMORY; goto done; } + new_event->hParent = hParent; + new_event->nvfp = nvfp; + new_event->fd = fd; + new_event->active = NV_TRUE; + new_event->refcount = 0; portSyncSpinlockAcquire(nv->event_spinlock); for (event = nv->event_list; event; event = event->next) @@ -501,12 +506,6 @@ static NV_STATUS allocate_os_event( done: if (status == NV_OK) { - new_event->hParent = hParent; - new_event->nvfp = nvfp; - new_event->fd = fd; - new_event->active = NV_TRUE; - new_event->refcount = 0; - nvfp->bCleanupRmapi = NV_TRUE; NV_PRINTF(LEVEL_INFO, "allocated OS event:\n"); diff --git a/src/nvidia/arch/nvalloc/unix/src/osinit.c b/src/nvidia/arch/nvalloc/unix/src/osinit.c index 55f7ca474..e3bb5b504 100644 --- a/src/nvidia/arch/nvalloc/unix/src/osinit.c +++ b/src/nvidia/arch/nvalloc/unix/src/osinit.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 1999-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1999-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a @@ -59,6 +59,7 @@ #include #include "liblogdecode.h" #include +#include #include #include @@ -378,6 +379,13 @@ osHandleGpuLost gpuSetDisconnectedProperties(pGpu); + if (IS_GSP_CLIENT(pGpu)) + { + // Notify all channels of the error so that UVM can fail gracefully + KernelGsp *pKernelGsp = GPU_GET_KERNEL_GSP(pGpu); + kgspRcAndNotifyAllChannels(pGpu, pKernelGsp, ROBUST_CHANNEL_GPU_HAS_FALLEN_OFF_THE_BUS, NV_FALSE); + } + // Trigger the OS's PCI recovery mechanism if (nv_pci_trigger_recovery(nv) != NV_OK) { diff --git a/src/nvidia/generated/g_conf_compute_nvoc.c b/src/nvidia/generated/g_conf_compute_nvoc.c index 790f9db2d..38d8a149d 100644 --- a/src/nvidia/generated/g_conf_compute_nvoc.c +++ b/src/nvidia/generated/g_conf_compute_nvoc.c @@ -436,42 +436,6 @@ static void __nvoc_init_funcTable_ConfidentialCompute_1(ConfidentialCompute *pTh } } - // Hal function -- confComputeEnableKeyRotationSupport - if (( ((rmVariantHal_HalVarIdx >> 5) == 0UL) && ((1UL << (rmVariantHal_HalVarIdx & 0x1f)) & 0x00000001UL) )) /* RmVariantHal: VF */ - { - pThis->__confComputeEnableKeyRotationSupport__ = &confComputeEnableKeyRotationSupport_56cd7a; - } - else - { - if (( ((chipHal_HalVarIdx >> 5) == 1UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0x10000000UL) )) /* ChipHal: GH100 */ - { - pThis->__confComputeEnableKeyRotationSupport__ = &confComputeEnableKeyRotationSupport_GH100; - } - // default - else - { - pThis->__confComputeEnableKeyRotationSupport__ = &confComputeEnableKeyRotationSupport_56cd7a; - } - } - - // Hal function -- confComputeEnableInternalKeyRotationSupport - if (( ((rmVariantHal_HalVarIdx >> 5) == 0UL) && ((1UL << (rmVariantHal_HalVarIdx & 0x1f)) & 0x00000001UL) )) /* RmVariantHal: VF */ - { - pThis->__confComputeEnableInternalKeyRotationSupport__ = &confComputeEnableInternalKeyRotationSupport_56cd7a; - } - else - { - if (( ((chipHal_HalVarIdx >> 5) == 1UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0x10000000UL) )) /* ChipHal: GH100 */ - { - pThis->__confComputeEnableInternalKeyRotationSupport__ = &confComputeEnableInternalKeyRotationSupport_GH100; - } - // default - else - { - pThis->__confComputeEnableInternalKeyRotationSupport__ = &confComputeEnableInternalKeyRotationSupport_56cd7a; - } - } - // Hal function -- confComputeIsDebugModeEnabled if (( ((chipHal_HalVarIdx >> 5) == 1UL) && ((1UL << (chipHal_HalVarIdx & 0x1f)) & 0x10000000UL) )) /* ChipHal: GH100 */ { diff --git a/src/nvidia/generated/g_conf_compute_nvoc.h b/src/nvidia/generated/g_conf_compute_nvoc.h index 536729570..780e372d1 100644 --- a/src/nvidia/generated/g_conf_compute_nvoc.h +++ b/src/nvidia/generated/g_conf_compute_nvoc.h @@ -117,8 +117,6 @@ struct ConfidentialCompute { NV_STATUS (*__confComputeTriggerKeyRotation__)(struct OBJGPU *, struct ConfidentialCompute *); void (*__confComputeGetKeyPairForKeySpace__)(struct OBJGPU *, struct ConfidentialCompute *, NvU32, NvBool, NvU32 *, NvU32 *); NV_STATUS (*__confComputeEnableKeyRotationCallback__)(struct OBJGPU *, struct ConfidentialCompute *, NvBool); - NV_STATUS (*__confComputeEnableKeyRotationSupport__)(struct OBJGPU *, struct ConfidentialCompute *); - NV_STATUS (*__confComputeEnableInternalKeyRotationSupport__)(struct OBJGPU *, struct ConfidentialCompute *); NvBool (*__confComputeIsDebugModeEnabled__)(struct OBJGPU *, struct ConfidentialCompute *); NvBool (*__confComputeIsGpuCcCapable__)(struct OBJGPU *, struct ConfidentialCompute *); NV_STATUS (*__confComputeEstablishSpdmSessionAndKeys__)(struct OBJGPU *, struct ConfidentialCompute *); @@ -272,10 +270,6 @@ NV_STATUS __nvoc_objCreate_ConfidentialCompute(ConfidentialCompute**, Dynamic*, #define confComputeGetKeyPairForKeySpace_HAL(pGpu, pConfCompute, arg0, arg1, arg2, arg3) confComputeGetKeyPairForKeySpace_DISPATCH(pGpu, pConfCompute, arg0, arg1, arg2, arg3) #define confComputeEnableKeyRotationCallback(pGpu, pConfCompute, bEnable) confComputeEnableKeyRotationCallback_DISPATCH(pGpu, pConfCompute, bEnable) #define confComputeEnableKeyRotationCallback_HAL(pGpu, pConfCompute, bEnable) confComputeEnableKeyRotationCallback_DISPATCH(pGpu, pConfCompute, bEnable) -#define confComputeEnableKeyRotationSupport(pGpu, pConfCompute) confComputeEnableKeyRotationSupport_DISPATCH(pGpu, pConfCompute) -#define confComputeEnableKeyRotationSupport_HAL(pGpu, pConfCompute) confComputeEnableKeyRotationSupport_DISPATCH(pGpu, pConfCompute) -#define confComputeEnableInternalKeyRotationSupport(pGpu, pConfCompute) confComputeEnableInternalKeyRotationSupport_DISPATCH(pGpu, pConfCompute) -#define confComputeEnableInternalKeyRotationSupport_HAL(pGpu, pConfCompute) confComputeEnableInternalKeyRotationSupport_DISPATCH(pGpu, pConfCompute) #define confComputeIsDebugModeEnabled(pGpu, pConfCompute) confComputeIsDebugModeEnabled_DISPATCH(pGpu, pConfCompute) #define confComputeIsDebugModeEnabled_HAL(pGpu, pConfCompute) confComputeIsDebugModeEnabled_DISPATCH(pGpu, pConfCompute) #define confComputeIsGpuCcCapable(pGpu, pConfCompute) confComputeIsGpuCcCapable_DISPATCH(pGpu, pConfCompute) @@ -551,26 +545,6 @@ static inline NV_STATUS confComputeEnableKeyRotationCallback_DISPATCH(struct OBJ return pConfCompute->__confComputeEnableKeyRotationCallback__(pGpu, pConfCompute, bEnable); } -NV_STATUS confComputeEnableKeyRotationSupport_GH100(struct OBJGPU *pGpu, struct ConfidentialCompute *pConfCompute); - -static inline NV_STATUS confComputeEnableKeyRotationSupport_56cd7a(struct OBJGPU *pGpu, struct ConfidentialCompute *pConfCompute) { - return NV_OK; -} - -static inline NV_STATUS confComputeEnableKeyRotationSupport_DISPATCH(struct OBJGPU *pGpu, struct ConfidentialCompute *pConfCompute) { - return pConfCompute->__confComputeEnableKeyRotationSupport__(pGpu, pConfCompute); -} - -NV_STATUS confComputeEnableInternalKeyRotationSupport_GH100(struct OBJGPU *pGpu, struct ConfidentialCompute *pConfCompute); - -static inline NV_STATUS confComputeEnableInternalKeyRotationSupport_56cd7a(struct OBJGPU *pGpu, struct ConfidentialCompute *pConfCompute) { - return NV_OK; -} - -static inline NV_STATUS confComputeEnableInternalKeyRotationSupport_DISPATCH(struct OBJGPU *pGpu, struct ConfidentialCompute *pConfCompute) { - return pConfCompute->__confComputeEnableInternalKeyRotationSupport__(pGpu, pConfCompute); -} - NvBool confComputeIsDebugModeEnabled_GH100(struct OBJGPU *pGpu, struct ConfidentialCompute *pConfCompute); static inline NvBool confComputeIsDebugModeEnabled_491d52(struct OBJGPU *pGpu, struct ConfidentialCompute *pConfCompute) { diff --git a/src/nvidia/generated/g_kernel_gsp_nvoc.h b/src/nvidia/generated/g_kernel_gsp_nvoc.h index 504f7e67c..dc136755d 100644 --- a/src/nvidia/generated/g_kernel_gsp_nvoc.h +++ b/src/nvidia/generated/g_kernel_gsp_nvoc.h @@ -7,7 +7,7 @@ extern "C" { #endif /* - * SPDX-FileCopyrightText: Copyright (c) 2017-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2017-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a @@ -360,7 +360,7 @@ struct KernelGsp { NvU64 (*__kgspGetMinWprHeapSizeMB__)(struct OBJGPU *, struct KernelGsp *); NvU64 (*__kgspGetMaxWprHeapSizeMB__)(struct OBJGPU *, struct KernelGsp *); NvU32 (*__kgspGetFwHeapParamOsCarveoutSize__)(struct OBJGPU *, struct KernelGsp *); - NV_STATUS (*__kgspInitVgpuPartitionLogging__)(struct OBJGPU *, struct KernelGsp *, NvU32, NvU64, NvU64, NvU64, NvU64); + NV_STATUS (*__kgspInitVgpuPartitionLogging__)(struct OBJGPU *, struct KernelGsp *, NvU32, NvU64, NvU64, NvU64, NvU64, NvBool *); NV_STATUS (*__kgspPreserveVgpuPartitionLogging__)(struct OBJGPU *, struct KernelGsp *, NvU32); NV_STATUS (*__kgspFreeVgpuPartitionLogging__)(struct OBJGPU *, struct KernelGsp *, NvU32); const char *(*__kgspGetSignatureSectionNamePrefix__)(struct OBJGPU *, struct KernelGsp *); @@ -579,8 +579,8 @@ NV_STATUS __nvoc_objCreate_KernelGsp(KernelGsp**, Dynamic*, NvU32); #define kgspGetMaxWprHeapSizeMB_HAL(pGpu, pKernelGsp) kgspGetMaxWprHeapSizeMB_DISPATCH(pGpu, pKernelGsp) #define kgspGetFwHeapParamOsCarveoutSize(pGpu, pKernelGsp) kgspGetFwHeapParamOsCarveoutSize_DISPATCH(pGpu, pKernelGsp) #define kgspGetFwHeapParamOsCarveoutSize_HAL(pGpu, pKernelGsp) kgspGetFwHeapParamOsCarveoutSize_DISPATCH(pGpu, pKernelGsp) -#define kgspInitVgpuPartitionLogging(pGpu, pKernelGsp, gfid, initTaskLogBUffOffset, initTaskLogBUffSize, vgpuTaskLogBUffOffset, vgpuTaskLogBuffSize) kgspInitVgpuPartitionLogging_DISPATCH(pGpu, pKernelGsp, gfid, initTaskLogBUffOffset, initTaskLogBUffSize, vgpuTaskLogBUffOffset, vgpuTaskLogBuffSize) -#define kgspInitVgpuPartitionLogging_HAL(pGpu, pKernelGsp, gfid, initTaskLogBUffOffset, initTaskLogBUffSize, vgpuTaskLogBUffOffset, vgpuTaskLogBuffSize) kgspInitVgpuPartitionLogging_DISPATCH(pGpu, pKernelGsp, gfid, initTaskLogBUffOffset, initTaskLogBUffSize, vgpuTaskLogBUffOffset, vgpuTaskLogBuffSize) +#define kgspInitVgpuPartitionLogging(pGpu, pKernelGsp, gfid, initTaskLogBUffOffset, initTaskLogBUffSize, vgpuTaskLogBUffOffset, vgpuTaskLogBuffSize, pPreserveLogBufferFull) kgspInitVgpuPartitionLogging_DISPATCH(pGpu, pKernelGsp, gfid, initTaskLogBUffOffset, initTaskLogBUffSize, vgpuTaskLogBUffOffset, vgpuTaskLogBuffSize, pPreserveLogBufferFull) +#define kgspInitVgpuPartitionLogging_HAL(pGpu, pKernelGsp, gfid, initTaskLogBUffOffset, initTaskLogBUffSize, vgpuTaskLogBUffOffset, vgpuTaskLogBuffSize, pPreserveLogBufferFull) kgspInitVgpuPartitionLogging_DISPATCH(pGpu, pKernelGsp, gfid, initTaskLogBUffOffset, initTaskLogBUffSize, vgpuTaskLogBUffOffset, vgpuTaskLogBuffSize, pPreserveLogBufferFull) #define kgspPreserveVgpuPartitionLogging(pGpu, pKernelGsp, gfid) kgspPreserveVgpuPartitionLogging_DISPATCH(pGpu, pKernelGsp, gfid) #define kgspPreserveVgpuPartitionLogging_HAL(pGpu, pKernelGsp, gfid) kgspPreserveVgpuPartitionLogging_DISPATCH(pGpu, pKernelGsp, gfid) #define kgspFreeVgpuPartitionLogging(pGpu, pKernelGsp, gfid) kgspFreeVgpuPartitionLogging_DISPATCH(pGpu, pKernelGsp, gfid) @@ -1170,14 +1170,14 @@ static inline NvU32 kgspGetFwHeapParamOsCarveoutSize_DISPATCH(struct OBJGPU *pGp return pKernelGsp->__kgspGetFwHeapParamOsCarveoutSize__(pGpu, pKernelGsp); } -static inline NV_STATUS kgspInitVgpuPartitionLogging_395e98(struct OBJGPU *pGpu, struct KernelGsp *pKernelGsp, NvU32 gfid, NvU64 initTaskLogBUffOffset, NvU64 initTaskLogBUffSize, NvU64 vgpuTaskLogBUffOffset, NvU64 vgpuTaskLogBuffSize) { +static inline NV_STATUS kgspInitVgpuPartitionLogging_395e98(struct OBJGPU *pGpu, struct KernelGsp *pKernelGsp, NvU32 gfid, NvU64 initTaskLogBUffOffset, NvU64 initTaskLogBUffSize, NvU64 vgpuTaskLogBUffOffset, NvU64 vgpuTaskLogBuffSize, NvBool *pPreserveLogBufferFull) { return NV_ERR_NOT_SUPPORTED; } -NV_STATUS kgspInitVgpuPartitionLogging_IMPL(struct OBJGPU *pGpu, struct KernelGsp *pKernelGsp, NvU32 gfid, NvU64 initTaskLogBUffOffset, NvU64 initTaskLogBUffSize, NvU64 vgpuTaskLogBUffOffset, NvU64 vgpuTaskLogBuffSize); +NV_STATUS kgspInitVgpuPartitionLogging_IMPL(struct OBJGPU *pGpu, struct KernelGsp *pKernelGsp, NvU32 gfid, NvU64 initTaskLogBUffOffset, NvU64 initTaskLogBUffSize, NvU64 vgpuTaskLogBUffOffset, NvU64 vgpuTaskLogBuffSize, NvBool *pPreserveLogBufferFull); -static inline NV_STATUS kgspInitVgpuPartitionLogging_DISPATCH(struct OBJGPU *pGpu, struct KernelGsp *pKernelGsp, NvU32 gfid, NvU64 initTaskLogBUffOffset, NvU64 initTaskLogBUffSize, NvU64 vgpuTaskLogBUffOffset, NvU64 vgpuTaskLogBuffSize) { - return pKernelGsp->__kgspInitVgpuPartitionLogging__(pGpu, pKernelGsp, gfid, initTaskLogBUffOffset, initTaskLogBUffSize, vgpuTaskLogBUffOffset, vgpuTaskLogBuffSize); +static inline NV_STATUS kgspInitVgpuPartitionLogging_DISPATCH(struct OBJGPU *pGpu, struct KernelGsp *pKernelGsp, NvU32 gfid, NvU64 initTaskLogBUffOffset, NvU64 initTaskLogBUffSize, NvU64 vgpuTaskLogBUffOffset, NvU64 vgpuTaskLogBuffSize, NvBool *pPreserveLogBufferFull) { + return pKernelGsp->__kgspInitVgpuPartitionLogging__(pGpu, pKernelGsp, gfid, initTaskLogBUffOffset, initTaskLogBUffSize, vgpuTaskLogBUffOffset, vgpuTaskLogBuffSize, pPreserveLogBufferFull); } static inline NV_STATUS kgspPreserveVgpuPartitionLogging_395e98(struct OBJGPU *pGpu, struct KernelGsp *pKernelGsp, NvU32 gfid) { @@ -1577,14 +1577,14 @@ static inline NV_STATUS kgspAllocateBooterUnloadUcodeImage(struct OBJGPU *pGpu, #define kgspAllocateBooterUnloadUcodeImage(pGpu, pKernelGsp, ppBooterUnloadUcode) kgspAllocateBooterUnloadUcodeImage_IMPL(pGpu, pKernelGsp, ppBooterUnloadUcode) #endif //__nvoc_kernel_gsp_h_disabled -void kgspRcAndNotifyAllUserChannels_IMPL(struct OBJGPU *pGpu, struct KernelGsp *pKernelGsp, NvU32 exceptType); +void kgspRcAndNotifyAllChannels_IMPL(struct OBJGPU *pGpu, struct KernelGsp *pKernelGsp, NvU32 exceptType, NvBool bSkipKernelChannels); #ifdef __nvoc_kernel_gsp_h_disabled -static inline void kgspRcAndNotifyAllUserChannels(struct OBJGPU *pGpu, struct KernelGsp *pKernelGsp, NvU32 exceptType) { +static inline void kgspRcAndNotifyAllChannels(struct OBJGPU *pGpu, struct KernelGsp *pKernelGsp, NvU32 exceptType, NvBool bSkipKernelChannels) { NV_ASSERT_FAILED_PRECOMP("KernelGsp was disabled!"); } #else //__nvoc_kernel_gsp_h_disabled -#define kgspRcAndNotifyAllUserChannels(pGpu, pKernelGsp, exceptType) kgspRcAndNotifyAllUserChannels_IMPL(pGpu, pKernelGsp, exceptType) +#define kgspRcAndNotifyAllChannels(pGpu, pKernelGsp, exceptType, bSkipKernelChannels) kgspRcAndNotifyAllChannels_IMPL(pGpu, pKernelGsp, exceptType, bSkipKernelChannels) #endif //__nvoc_kernel_gsp_h_disabled #undef PRIVATE_FIELD diff --git a/src/nvidia/generated/g_kernel_nvlink_nvoc.h b/src/nvidia/generated/g_kernel_nvlink_nvoc.h index 13abc3147..b370a2fa0 100644 --- a/src/nvidia/generated/g_kernel_nvlink_nvoc.h +++ b/src/nvidia/generated/g_kernel_nvlink_nvoc.h @@ -318,6 +318,7 @@ struct KernelNvlink { NvU32 PRIVATE_FIELD(deviceLockRefcount); char *PRIVATE_FIELD(driverName); char *PRIVATE_FIELD(deviceName); + NvU8 *PRIVATE_FIELD(pGidString); NvBool PRIVATE_FIELD(bVerifTrainingEnable); NvBool PRIVATE_FIELD(bL2Entry); NvBool PRIVATE_FIELD(bSkipLinkTraining); @@ -435,6 +436,7 @@ struct KernelNvlink_PRIVATE { NvU32 deviceLockRefcount; char *driverName; char *deviceName; + NvU8 *pGidString; NvBool bVerifTrainingEnable; NvBool bL2Entry; NvBool bSkipLinkTraining; diff --git a/src/nvidia/generated/g_nv_name_released.h b/src/nvidia/generated/g_nv_name_released.h index 4c2f0ac95..62d787909 100644 --- a/src/nvidia/generated/g_nv_name_released.h +++ b/src/nvidia/generated/g_nv_name_released.h @@ -981,12 +981,10 @@ static const CHIPS_RELEASED sChipsReleased[] = { { 0x25AD, 0x0000, 0x0000, "NVIDIA GeForce RTX 2050" }, { 0x25B0, 0x1878, 0x1028, "NVIDIA RTX A1000" }, { 0x25B0, 0x1878, 0x103c, "NVIDIA RTX A1000" }, - { 0x25B0, 0x8d96, 0x103c, "NVIDIA RTX A1000" }, { 0x25B0, 0x1878, 0x10de, "NVIDIA RTX A1000" }, { 0x25B0, 0x1878, 0x17aa, "NVIDIA RTX A1000" }, { 0x25B2, 0x1879, 0x1028, "NVIDIA RTX A400" }, { 0x25B2, 0x1879, 0x103c, "NVIDIA RTX A400" }, - { 0x25B2, 0x8d95, 0x103c, "NVIDIA RTX A400" }, { 0x25B2, 0x1879, 0x10de, "NVIDIA RTX A400" }, { 0x25B2, 0x1879, 0x17aa, "NVIDIA RTX A400" }, { 0x25B6, 0x14a9, 0x10de, "NVIDIA A16" }, diff --git a/src/nvidia/generated/g_rpc-structures.h b/src/nvidia/generated/g_rpc-structures.h index 8bab673e4..88ddd89d5 100644 --- a/src/nvidia/generated/g_rpc-structures.h +++ b/src/nvidia/generated/g_rpc-structures.h @@ -1254,6 +1254,7 @@ typedef struct rpc_rc_triggered_v17_02 { NvU32 nv2080EngineType; NvU32 chid; + NvU32 gfid; NvU32 exceptType; NvU32 scope; NvU16 partitionAttributionId; @@ -6786,6 +6787,13 @@ static vmiopd_fdesc_t vmiopd_fdesc_t_rpc_rc_triggered_v17_02[] = { .name = "chid" #endif }, + { + .vtype = vtype_NvU32, + .offset = NV_OFFSETOF(rpc_rc_triggered_v17_02, gfid), + #if (defined(DEBUG) || defined(DEVELOP)) + .name = "gfid" + #endif + }, { .vtype = vtype_NvU32, .offset = NV_OFFSETOF(rpc_rc_triggered_v17_02, exceptType), diff --git a/src/nvidia/generated/g_subdevice_diag_nvoc.h b/src/nvidia/generated/g_subdevice_diag_nvoc.h index 8057531c6..06fbc9407 100644 --- a/src/nvidia/generated/g_subdevice_diag_nvoc.h +++ b/src/nvidia/generated/g_subdevice_diag_nvoc.h @@ -7,7 +7,7 @@ extern "C" { #endif /* - * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a diff --git a/src/nvidia/inc/libraries/utils/nvprintf.h b/src/nvidia/inc/libraries/utils/nvprintf.h index f0ad60213..136879d01 100644 --- a/src/nvidia/inc/libraries/utils/nvprintf.h +++ b/src/nvidia/inc/libraries/utils/nvprintf.h @@ -351,7 +351,7 @@ void NVRM_PRINTF_FUNCTION(const char *file, // In MODS builds, we allow all printfs, but don't automatically include the // __FILE__ or __FUNCTION__ references. // -#if NV_PRINTF_STRINGS_ALLOWED && (!defined(NV_MODS) || defined(SIM_BUILD) || defined(DEBUG) || defined(NV_MODS_INTERNAL)) +#if NV_PRINTF_STRINGS_ALLOWED && (!defined(NV_MODS) || defined(SIM_BUILD) || defined(DEBUG) || defined(DEVELOP) || defined(NV_MODS_INTERNAL)) #define NV_FILE_STR __FILE__ #define NV_FILE __FILE__ #define NV_FILE_FMT "%s" diff --git a/src/nvidia/interface/nv_uvm_types.h b/src/nvidia/interface/nv_uvm_types.h index d834ba720..f4e9a7107 100644 --- a/src/nvidia/interface/nv_uvm_types.h +++ b/src/nvidia/interface/nv_uvm_types.h @@ -595,10 +595,8 @@ typedef struct UvmGpuClientInfo_tag typedef enum { - UVM_GPU_CONF_COMPUTE_MODE_NONE, - UVM_GPU_CONF_COMPUTE_MODE_APM, - UVM_GPU_CONF_COMPUTE_MODE_HCC, - UVM_GPU_CONF_COMPUTE_MODE_COUNT + UVM_GPU_CONF_COMPUTE_MODE_NONE = 0, + UVM_GPU_CONF_COMPUTE_MODE_HCC = 2 } UvmGpuConfComputeMode; typedef struct UvmGpuConfComputeCaps_tag diff --git a/src/nvidia/kernel/inc/nvpcf.h b/src/nvidia/kernel/inc/nvpcf.h index e017a91d7..9c43b89e5 100644 --- a/src/nvidia/kernel/inc/nvpcf.h +++ b/src/nvidia/kernel/inc/nvpcf.h @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2020-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a @@ -82,6 +82,7 @@ typedef struct #define NVPCF_CONTROLLER_STATIC_TABLE_VERSION_21 (0x21) #define NVPCF_CONTROLLER_STATIC_TABLE_VERSION_22 (0x22) #define NVPCF_CONTROLLER_STATIC_TABLE_VERSION_23 (0x23) +#define NVPCF_CONTROLLER_STATIC_TABLE_VERSION_24 (0x24) // format for 2.0 and 2.1 #define NVPCF_CONTROLLER_STATIC_TABLE_HEADER_V20_SIZE_05 (0x05U) diff --git a/src/nvidia/kernel/vgpu/nv/rpc.c b/src/nvidia/kernel/vgpu/nv/rpc.c index 7b4d9f558..158c179b4 100644 --- a/src/nvidia/kernel/vgpu/nv/rpc.c +++ b/src/nvidia/kernel/vgpu/nv/rpc.c @@ -623,11 +623,8 @@ static NV_STATUS _initSysmemPfnRing(OBJGPU *pGpu) KernelBus *pKernelBus = GPU_GET_KERNEL_BUS(pGpu); NvU32 memFlags = 0; - if (!pVGpu->bGspPlugin) - { - if (kbusIsPhysicalBar2InitPagetableEnabled(pKernelBus)) - memFlags = MEMDESC_FLAGS_CPU_ONLY; - } + if (kbusIsPhysicalBar2InitPagetableEnabled(pKernelBus)) + memFlags = MEMDESC_FLAGS_CPU_ONLY; status = _allocRpcMemDesc(pGpu, RM_PAGE_SIZE, diff --git a/src/nvidia/kernel/vgpu/nv/vgpu_util.c b/src/nvidia/kernel/vgpu/nv/vgpu_util.c index 913167488..c9a94b966 100644 --- a/src/nvidia/kernel/vgpu/nv/vgpu_util.c +++ b/src/nvidia/kernel/vgpu/nv/vgpu_util.c @@ -107,13 +107,10 @@ NV_STATUS vgpuAllocSysmemPfnBitMapNode(OBJGPU *pGpu, VGPU_SYSMEM_PFN_BITMAP_NODE OBJVGPU *pVGpu = GPU_GET_VGPU(pGpu); VGPU_GSP_SYSMEM_BITMAP_ROOT_NODE *sysmemBitmapRootNode = NULL; NvU32 memFlags = 0; - if (!pVGpu->bGspPlugin) - { - KernelBus *pKernelBus = GPU_GET_KERNEL_BUS(pGpu); + KernelBus *pKernelBus = GPU_GET_KERNEL_BUS(pGpu); - if (kbusIsPhysicalBar2InitPagetableEnabled(pKernelBus)) - memFlags = MEMDESC_FLAGS_CPU_ONLY; - } + if (kbusIsPhysicalBar2InitPagetableEnabled(pKernelBus)) + memFlags = MEMDESC_FLAGS_CPU_ONLY; if (index != listCount(&(vgpuSysmemPfnInfo.listVgpuSysmemPfnBitmapHead)) || node == NULL) { diff --git a/src/nvidia/src/kernel/gpu/ccu/kernel_ccu.c b/src/nvidia/src/kernel/gpu/ccu/kernel_ccu.c index 3d6dbd283..6dd8a8b77 100644 --- a/src/nvidia/src/kernel/gpu/ccu/kernel_ccu.c +++ b/src/nvidia/src/kernel/gpu/ccu/kernel_ccu.c @@ -65,9 +65,12 @@ _kccuAllocMemory { NV_STATUS status = NV_OK; MEMORY_DESCRIPTOR *pMemDesc = NULL; + NvU64 flags = MEMDESC_FLAGS_USER_READ_ONLY; NV_PRINTF(LEVEL_INFO, "KernelCcu: Allocate memory for class members and shared buffer\n"); + flags |= MEMDESC_FLAGS_ALLOC_IN_UNPROTECTED_MEMORY; + // Allocate memory & init the KernelCcu class members to store shared buffer info pKernelCcu->shrBuf[idx].pCounterDstInfo = portMemAllocNonPaged(sizeof(CCU_SHRBUF_INFO)); pKernelCcu->shrBuf[idx].pKernelMapInfo = portMemAllocNonPaged(sizeof(SHARED_BUFFER_MAP_INFO)); @@ -83,8 +86,7 @@ _kccuAllocMemory // Create a memory descriptor data structure for the shared buffer status = memdescCreate(&pKernelCcu->pMemDesc[idx], pGpu, shrBufSize, 0, NV_MEMORY_CONTIGUOUS, - ADDR_SYSMEM, NV_MEMORY_CACHED, - MEMDESC_FLAGS_USER_READ_ONLY); + ADDR_SYSMEM, NV_MEMORY_CACHED, flags); if (status != NV_OK) { NV_PRINTF(LEVEL_ERROR, "CCU memdescCreate failed for(%u) with status: 0x%x\n", idx, status); diff --git a/src/nvidia/src/kernel/gpu/conf_compute/arch/hopper/conf_compute_key_rotation_gh100.c b/src/nvidia/src/kernel/gpu/conf_compute/arch/hopper/conf_compute_key_rotation_gh100.c index d6019159a..07141c104 100644 --- a/src/nvidia/src/kernel/gpu/conf_compute/arch/hopper/conf_compute_key_rotation_gh100.c +++ b/src/nvidia/src/kernel/gpu/conf_compute/arch/hopper/conf_compute_key_rotation_gh100.c @@ -32,8 +32,6 @@ #include "nvrm_registry.h" #include "kernel/gpu/conf_compute/ccsl.h" -static void initKeyRotationRegistryOverrides(OBJGPU *pGpu, ConfidentialCompute *pConfCompute); -static void initInternalKeyRotationRegistryOverrides(OBJGPU *pGpu, ConfidentialCompute *pConfCompute); static void getKeyPairForKeySpace(NvU32 keySpace, NvBool bKernel, NvU32 *pGlobalH2DKey, NvU32 *pGlobalD2HKey); static NV_STATUS triggerKeyRotationByKeyPair(OBJGPU *pGpu, ConfidentialCompute *pConfCompute, NvU32 h2dKey, NvU32 d2hKey); static NV_STATUS calculateEncryptionStatsByKeyPair(OBJGPU *pGpu, ConfidentialCompute *pConfCompute, NvU32 h2dKey, NvU32 d2hKey); @@ -41,63 +39,6 @@ static NvBool isLowerThresholdCrossed(ConfidentialCompute *pConfCompute, NvU32 h static NvBool isUpperThresholdCrossed(ConfidentialCompute *pConfCompute, NvU32 h2dKey, NvU32 d2hKey); static NV_STATUS keyRotationTimeoutCallback(OBJGPU *pGpu, OBJTMR *pTmr, TMR_EVENT *pTmrEvent); -/*! - * Conditionally enables key rotation support - * - * @param[in] pGpu : OBJGPU Pointer - * @param[in] pConfCompute : ConfidentialCompute pointer - */ -NV_STATUS -confComputeEnableKeyRotationSupport_GH100 -( - OBJGPU *pGpu, - ConfidentialCompute *pConfCompute -) -{ - - if (pConfCompute->getProperty(pConfCompute, PDB_PROP_CONFCOMPUTE_ENABLED) && - pConfCompute->getProperty(pConfCompute, PDB_PROP_CONFCOMPUTE_CC_FEATURE_ENABLED)) - { - pConfCompute->setProperty(pConfCompute, PDB_PROP_CONFCOMPUTE_KEY_ROTATION_SUPPORTED, NV_TRUE); - initKeyRotationRegistryOverrides(pGpu, pConfCompute); - } - - return NV_OK; -} - -/*! - * Conditionally enables key rotation support for keys internal to RM - * - * @param[in] pGpu : OBJGPU Pointer - * @param[in] pConfCompute : ConfidentialCompute pointer - */ -NV_STATUS -confComputeEnableInternalKeyRotationSupport_GH100 -( - OBJGPU *pGpu, - ConfidentialCompute *pConfCompute -) -{ - pConfCompute->keyRotationInternalThreshold = KEY_ROTATION_DEFAULT_INTERNAL_THRESHOLD; - - // Check if we can even support internal key rotation - if (pConfCompute->getProperty(pConfCompute, PDB_PROP_CONFCOMPUTE_ENABLED) && - pConfCompute->getProperty(pConfCompute, PDB_PROP_CONFCOMPUTE_CC_FEATURE_ENABLED)) - { - // TODO CONFCOMP-1740: Default to enabling key rotation when supported - // pConfCompute->setProperty(pConfCompute, PDB_PROP_CONFCOMPUTE_INTERNAL_KEY_ROTATION_ENABLED, NV_TRUE); - initInternalKeyRotationRegistryOverrides(pGpu, pConfCompute); - } - - // If key rotation is disabled, ensure we set values accordingly - if (!pConfCompute->getProperty(pConfCompute, PDB_PROP_CONFCOMPUTE_INTERNAL_KEY_ROTATION_ENABLED)) - { - pConfCompute->keyRotationInternalThreshold = 0; - } - - return NV_OK; -} - /*! * Enables/disables key rotation by setting up the 1 sec callback for key rotation * @@ -515,142 +456,3 @@ getKeyPairForKeySpace(NvU32 keySpace, NvBool bKernel, NvU32 *pGlobalH2DKey, NvU3 *pGlobalH2DKey = CC_GKEYID_GEN(keySpace, localH2DKey); *pGlobalD2HKey = CC_GKEYID_GEN(keySpace, localD2HKey); } - -static void -initKeyRotationRegistryOverrides -( - OBJGPU *pGpu, - ConfidentialCompute *pConfCompute -) -{ - if (pConfCompute->getProperty(pConfCompute, PDB_PROP_CONFCOMPUTE_KEY_ROTATION_SUPPORTED)) - { - NvU32 data; - - if (osReadRegistryDword(pGpu, NV_REG_STR_RM_CONF_COMPUTE_KEY_ROTATION_THRESHOLD_DELTA, &data) == NV_OK) - { - if (data == 0) - { - NV_PRINTF(LEVEL_ERROR, "Illegal value for RmKeyRotationThresholdDelta.\n"); - NV_PRINTF(LEVEL_ERROR, "Cancelling override of threshold delta.\n"); - } - else - { - NV_PRINTF(LEVEL_INFO, "Setting key rotation threshold delta to %u.\n", data); - pConfCompute->keyRotationThresholdDelta = data; - NV_ASSERT_OK(confComputeSetKeyRotationThreshold(pConfCompute, pConfCompute->attackerAdvantage)); - } - } - else if (osReadRegistryDword(pGpu, NV_REG_STR_RM_CONF_COMPUTE_KEY_ROTATION_LOWER_THRESHOLD, &data) == NV_OK) - { - const NvU32 lowerThreshold = data; - - if (osReadRegistryDword(pGpu, NV_REG_STR_RM_CONF_COMPUTE_KEY_ROTATION_UPPER_THRESHOLD, &data) == NV_OK) - { - const NvU32 upperThreshold = data; - - if (upperThreshold > lowerThreshold) - { - NV_PRINTF(LEVEL_INFO, "Setting key rotation lower threshold to %u and upper threshold to %u.\n", - lowerThreshold, upperThreshold); - pConfCompute->keyRotationUpperThreshold = upperThreshold; - pConfCompute->keyRotationLowerThreshold = lowerThreshold; - } - else - { - NV_PRINTF(LEVEL_ERROR, "RmKeyRotationUpperThreshold must be greater than RmKeyRotationLowerThreshold.\n"); - NV_PRINTF(LEVEL_ERROR, "Cancelling override of upper and lower key rotation thresholds.\n"); - } - } - else - { - NV_PRINTF(LEVEL_ERROR, "RmKeyRotationUpperThreshold must be set if RmKeyRotationLowerThreshold is set.\n"); - NV_PRINTF(LEVEL_ERROR, "Cancelling override of upper and lower key rotation thresholds.\n"); - } - } - - if (osReadRegistryDword(pGpu, NV_REG_STR_RM_CONF_COMPUTE_KEY_ROTATION_TIMEOUT_IN_SEC, &data) == NV_OK) - { - const NvU32 minTimeout = 2; - - if (data >= minTimeout) - { - NV_PRINTF(LEVEL_INFO, "Setting key rotation user-mode timeout to %u seconds.\n", data); - pConfCompute->keyRotationTimeout = data; - } - else - { - NV_PRINTF(LEVEL_ERROR, "Key rotation user-mode timeout must be greater than or equal to %u.\n", minTimeout); - NV_PRINTF(LEVEL_ERROR, "Cancelling override of user-mode timeout.\n"); - } - } - - if (osReadRegistryDword(pGpu, NV_REG_STR_RM_CONF_COMPUTE_KEY_ROTATION, &data) == NV_OK) - { - if (FLD_TEST_DRF(_REG_STR, _RM_CONF_COMPUTE_KEY_ROTATION, _ENABLED, _YES, data)) - { - NV_PRINTF(LEVEL_INFO, "Confidential Compute key rotation enabled via regkey override.\n"); - pConfCompute->setProperty(pConfCompute, PDB_PROP_CONFCOMPUTE_KEY_ROTATION_ENABLED, NV_TRUE); - pConfCompute->keyRotationEnableMask = data; - } - else - { - NV_PRINTF(LEVEL_INFO, "Confidential Compute key rotation disabled via regkey override.\n"); - pConfCompute->setProperty(pConfCompute, PDB_PROP_CONFCOMPUTE_KEY_ROTATION_ENABLED, NV_FALSE); - pConfCompute->setProperty(pConfCompute, PDB_PROP_CONFCOMPUTE_KEY_ROTATION_SUPPORTED, NV_FALSE); - } - } - else - { - NV_PRINTF(LEVEL_INFO, "Confidential Compute key rotation is disabled.\n"); - pConfCompute->setProperty(pConfCompute, PDB_PROP_CONFCOMPUTE_KEY_ROTATION_ENABLED, NV_FALSE); - pConfCompute->setProperty(pConfCompute, PDB_PROP_CONFCOMPUTE_KEY_ROTATION_SUPPORTED, NV_FALSE); - } - } -} - -static void -initInternalKeyRotationRegistryOverrides -( - OBJGPU *pGpu, - ConfidentialCompute *pConfCompute -) -{ - NvU32 data = 0; - - if (pConfCompute->getProperty(pConfCompute, PDB_PROP_CONFCOMPUTE_MULTI_GPU_PROTECTED_PCIE_MODE_ENABLED)) - { - NV_PRINTF(LEVEL_INFO, "RM internal key rotation not supported for protected PCIe!\n"); - pConfCompute->setProperty(pConfCompute, PDB_PROP_CONFCOMPUTE_INTERNAL_KEY_ROTATION_ENABLED, NV_FALSE); - } - else if (osReadRegistryDword(pGpu, NV_REG_STR_RM_CONF_COMPUTE_KEY_ROTATION, &data) == NV_OK) - { - if (FLD_TEST_DRF(_REG_STR, _RM_CONF_COMPUTE_KEY_ROTATION, _INTERNAL_KEYS, _YES, data)) - { - NV_PRINTF(LEVEL_ERROR, "Enabling RM internal keys for Key Rotation by regkey override!\n"); - pConfCompute->setProperty(pConfCompute, PDB_PROP_CONFCOMPUTE_INTERNAL_KEY_ROTATION_ENABLED, NV_TRUE); - } - else - { - NV_PRINTF(LEVEL_ERROR, "Disabling RM internal keys for Key Rotation by regkey override!\n"); - pConfCompute->setProperty(pConfCompute, PDB_PROP_CONFCOMPUTE_INTERNAL_KEY_ROTATION_ENABLED, NV_FALSE); - } - } - - if (pConfCompute->getProperty(pConfCompute, PDB_PROP_CONFCOMPUTE_INTERNAL_KEY_ROTATION_ENABLED) && - (osReadRegistryDword(pGpu, NV_REG_STR_RM_CONF_COMPUTE_KEY_ROTATION_INTERNAL_THRESHOLD, &data) == NV_OK)) - { - const NvU32 internalThreshold = data; - if (internalThreshold < KEY_ROTATION_MINIMUM_INTERNAL_THRESHOLD) - { - NV_PRINTF(LEVEL_ERROR, "RmKeyRotationInternalThreshold must be higher than minimum of %u!\n", - KEY_ROTATION_MINIMUM_INTERNAL_THRESHOLD); - } - else - { - NV_PRINTF(LEVEL_INFO, "Setting internal key rotation threshold to %u.\n", - internalThreshold); - pConfCompute->keyRotationInternalThreshold = internalThreshold; - } - } -} diff --git a/src/nvidia/src/kernel/gpu/conf_compute/conf_compute.c b/src/nvidia/src/kernel/gpu/conf_compute/conf_compute.c index 329928bea..d8b795792 100644 --- a/src/nvidia/src/kernel/gpu/conf_compute/conf_compute.c +++ b/src/nvidia/src/kernel/gpu/conf_compute/conf_compute.c @@ -172,8 +172,7 @@ confComputeConstructEngine_IMPL(OBJGPU *pGpu, portMemSet(pConfCompute->aggregateStats, 0, sizeof(pConfCompute->aggregateStats)); portMemSet(pConfCompute->freedChannelAggregateStats, 0, sizeof(pConfCompute->freedChannelAggregateStats)); pConfCompute->keyRotationEnableMask = 0; - NV_ASSERT_OK_OR_RETURN(confComputeEnableKeyRotationSupport_HAL(pGpu, pConfCompute)); - NV_ASSERT_OK_OR_RETURN(confComputeEnableInternalKeyRotationSupport_HAL(pGpu, pConfCompute)); + pConfCompute->keyRotationInternalThreshold = 0; return NV_OK; } diff --git a/src/nvidia/src/kernel/gpu/fifo/kernel_fifo.c b/src/nvidia/src/kernel/gpu/fifo/kernel_fifo.c index 15e2498de..d1766b23c 100644 --- a/src/nvidia/src/kernel/gpu/fifo/kernel_fifo.c +++ b/src/nvidia/src/kernel/gpu/fifo/kernel_fifo.c @@ -1708,22 +1708,25 @@ kfifoGetChannelIterator_IMPL ) { portMemSet(pIt, 0, sizeof(*pIt)); - pIt->physicalChannelID = 0; - pIt->pFifoDataBlock = NULL; - pIt->runlistId = 0; - pIt->numRunlists = 1; - // Do we want to ierate all runlist channels if (runlistId == INVALID_RUNLIST_ID) { - if (kfifoIsPerRunlistChramEnabled(pKernelFifo)) - { - pIt->numRunlists = kfifoGetMaxNumRunlists_HAL(pGpu, pKernelFifo); - } + pIt->runlistId = 0; + + // Resulting iterator will iterate over constructed CHID_MGRs only + pIt->numRunlists = pKernelFifo->numChidMgrs; } else { pIt->runlistId = runlistId; + + // + // kfifoGetChidMgr() ignores the runlistId argument if per-runlist channel RAM is disabled. + // If there's no valid CHID_MGR for the given runlist ID, we can't iterate through the + // channels on the runlist, so we return an empty iterator instead. + // + CHID_MGR *pChidMgr = kfifoGetChidMgr(pGpu, pKernelFifo, pIt->runlistId); + pIt->numRunlists = (pChidMgr == NULL) ? 0 : 1; } } diff --git a/src/nvidia/src/kernel/gpu/gsp/arch/turing/kernel_gsp_tu102.c b/src/nvidia/src/kernel/gpu/gsp/arch/turing/kernel_gsp_tu102.c index 84e289842..a937b2291 100644 --- a/src/nvidia/src/kernel/gpu/gsp/arch/turing/kernel_gsp_tu102.c +++ b/src/nvidia/src/kernel/gpu/gsp/arch/turing/kernel_gsp_tu102.c @@ -912,7 +912,7 @@ kgspHealthCheck_TU102 if (bFirstFatal) { - kgspRcAndNotifyAllUserChannels(pGpu, pKernelGsp, GSP_ERROR); + kgspRcAndNotifyAllChannels(pGpu, pKernelGsp, GSP_ERROR, NV_TRUE); } gpuCheckEccCounts_HAL(pGpu); diff --git a/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c b/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c index 231f89ae9..d1d16589c 100644 --- a/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c +++ b/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c @@ -492,7 +492,7 @@ _kgspRpcRCTriggered RPC_PARAMS(rc_triggered, _v17_02); KernelRc *pKernelRc = GPU_GET_KERNEL_RC(pGpu); - KernelChannel *pKernelChannel; + KernelChannel *pKernelChannel = NULL; KernelFifo *pKernelFifo = GPU_GET_KERNEL_FIFO(pGpu); CHID_MGR *pChidMgr; NvU32 status = NV_OK; @@ -521,75 +521,20 @@ _kgspRpcRCTriggered if (status != NV_OK) return status; - pKernelChannel = kfifoChidMgrGetKernelChannel(pGpu, pKernelFifo, - pChidMgr, - rpc_params->chid); - NV_CHECK_OR_RETURN(LEVEL_ERROR, - pKernelChannel != NULL, - NV_ERR_INVALID_CHANNEL); - - // Add the RcDiag records we received from GSP-RM to our system wide journal + if (IS_GFID_PF(rpc_params->gfid)) { - OBJSYS *pSys = SYS_GET_INSTANCE(); - Journal *pRcDB = SYS_GET_RCDB(pSys); - RmClient *pClient; - - NvU32 recordSize = rcdbGetOcaRecordSizeWithHeader(pRcDB, RmRcDiagReport); - NvU32 rcDiagRecStart = pRcDB->RcErrRptNextIdx; - NvU32 rcDiagRecEnd; - NvU32 processId = 0; - NvU32 owner = RCDB_RCDIAG_DEFAULT_OWNER; - - pClient = dynamicCast(RES_GET_CLIENT(pKernelChannel), RmClient); - NV_ASSERT(pClient != NULL); - if (pClient != NULL) - processId = pClient->ProcID; - - for (NvU32 i = 0; i < rpc_params->rcJournalBufferSize / recordSize; i++) - { - RmRCCommonJournal_RECORD *pCommonRecord = - (RmRCCommonJournal_RECORD *)((NvU8*)&rpc_params->rcJournalBuffer + i * recordSize); - RmRcDiag_RECORD *pRcDiagRecord = - (RmRcDiag_RECORD *)&pCommonRecord[1]; - -#if defined(DEBUG) - NV_PRINTF(LEVEL_INFO, "%d: GPUTag=0x%x CPUTag=0x%llx timestamp=0x%llx stateMask=0x%llx\n", - i, pCommonRecord->GPUTag, pCommonRecord->CPUTag, pCommonRecord->timeStamp, - pCommonRecord->stateMask); - NV_PRINTF(LEVEL_INFO, " idx=%d timeStamp=0x%x type=0x%x flags=0x%x count=%d owner=0x%x processId=0x%x\n", - pRcDiagRecord->idx, pRcDiagRecord->timeStamp, pRcDiagRecord->type, pRcDiagRecord->flags, - pRcDiagRecord->count, pRcDiagRecord->owner, processId); - for (NvU32 j = 0; j < pRcDiagRecord->count; j++) - { - NV_PRINTF(LEVEL_INFO, " %d: offset=0x08%x tag=0x08%x value=0x08%x attribute=0x08%x\n", - j, pRcDiagRecord->data[j].offset, pRcDiagRecord->data[j].tag, - pRcDiagRecord->data[j].value, pRcDiagRecord->data[j].attribute); - } -#endif - if (rcdbAddRcDiagRecFromGsp(pGpu, pRcDB, pCommonRecord, pRcDiagRecord) == NULL) - { - NV_PRINTF(LEVEL_WARNING, "Lost RC diagnostic record coming from GPU%d GSP: type=0x%x stateMask=0x%llx\n", - gpuGetInstance(pGpu), pRcDiagRecord->type, pCommonRecord->stateMask); - } - } - - rcDiagRecEnd = pRcDB->RcErrRptNextIdx - 1; - - // Update records to have the correct PID associated with the channel - if (rcDiagRecStart != rcDiagRecEnd) - { - rcdbUpdateRcDiagRecContext(pRcDB, - rcDiagRecStart, - rcDiagRecEnd, - processId, - owner); - } + pKernelChannel = kfifoChidMgrGetKernelChannel(pGpu, pKernelFifo, + pChidMgr, + rpc_params->chid); + NV_CHECK_OR_RETURN(LEVEL_ERROR, + pKernelChannel != NULL, + NV_ERR_INVALID_CHANNEL); } bIsCcEnabled = gpuIsCCFeatureEnabled(pGpu); // With CC enabled, CPU-RM needs to write error notifiers - if (bIsCcEnabled) + if (bIsCcEnabled && pKernelChannel != NULL) { NV_ASSERT_OK_OR_RETURN(krcErrorSetNotifier(pGpu, pKernelRc, pKernelChannel, @@ -610,37 +555,42 @@ _kgspRpcRCTriggered * This function is called on critical FW crash to RC and notify an error code to * all user mode channels, allowing the user mode apps to fail deterministically. * - * @param[in] pGpu GPU object pointer - * @param[in] pKernelGsp KernelGsp object pointer - * @param[in] exceptType Error code to send to the RC notifiers + * @param[in] pGpu GPU object pointer + * @param[in] pKernelGsp KernelGsp object pointer + * @param[in] exceptType Error code to send to the RC notifiers + * @param[in] bSkipKernelChannels Don't RC and notify kernel channels * */ void -kgspRcAndNotifyAllUserChannels +kgspRcAndNotifyAllChannels_IMPL ( OBJGPU *pGpu, KernelGsp *pKernelGsp, - NvU32 exceptType + NvU32 exceptType, + NvBool bSkipKernelChannels ) { + // + // Note Bug 4503046: UVM currently attributes all errors as global and fails + // operations on all GPUs, in addition to the current failing GPU. Right now, the only + // case where we shouldn't skip kernel channels is when the GPU has fallen off the bus. + // + KernelRc *pKernelRc = GPU_GET_KERNEL_RC(pGpu); KernelChannel *pKernelChannel; KernelFifo *pKernelFifo = GPU_GET_KERNEL_FIFO(pGpu); CHANNEL_ITERATOR chanIt; RMTIMEOUT timeout; - NV_PRINTF(LEVEL_ERROR, "RC all user channels for critical error %d.\n", exceptType); + NV_PRINTF(LEVEL_ERROR, "RC all %schannels for critical error %d.\n", + bSkipKernelChannels ? MAKE_NV_PRINTF_STR("user ") : MAKE_NV_PRINTF_STR(""), + exceptType); - // Pass 1: halt all user channels. + // Pass 1: halt all channels. kfifoGetChannelIterator(pGpu, pKernelFifo, &chanIt, INVALID_RUNLIST_ID); while (kfifoGetNextKernelChannel(pGpu, pKernelFifo, &chanIt, &pKernelChannel) == NV_OK) { - // - // Kernel (uvm) channels are skipped to workaround nvbug 4503046, where - // uvm attributes all errors as global and fails operations on all GPUs, - // in addition to the current failing GPU. - // - if (kchannelCheckIsKernel(pKernelChannel)) + if (kchannelCheckIsKernel(pKernelChannel) && bSkipKernelChannels) { continue; } @@ -649,7 +599,7 @@ kgspRcAndNotifyAllUserChannels } // - // Pass 2: Wait for the halts to complete, and RC notify the user channels. + // Pass 2: Wait for the halts to complete, and RC notify the channels. // The channel halts require a preemption, which may not be able to complete // since the GSP is no longer servicing interrupts. Wait for up to the // default GPU timeout value for the preemptions to complete. @@ -658,26 +608,27 @@ kgspRcAndNotifyAllUserChannels kfifoGetChannelIterator(pGpu, pKernelFifo, &chanIt, INVALID_RUNLIST_ID); while (kfifoGetNextKernelChannel(pGpu, pKernelFifo, &chanIt, &pKernelChannel) == NV_OK) { - // Skip kernel (uvm) channels as only user channel halts are initiated above. - if (kchannelCheckIsKernel(pKernelChannel)) + if (kchannelCheckIsKernel(pKernelChannel) && bSkipKernelChannels) { continue; } kfifoCompleteChannelHalt(pGpu, pKernelFifo, pKernelChannel, &timeout); - NV_ASSERT_OK(krcErrorSetNotifier(pGpu, pKernelRc, - pKernelChannel, - exceptType, - kchannelGetEngineType(pKernelChannel), - RC_NOTIFIER_SCOPE_CHANNEL)); - - NV_ASSERT_OK(krcErrorSendEventNotifications_HAL(pGpu, pKernelRc, - pKernelChannel, - kchannelGetEngineType(pKernelChannel), - exceptType, - RC_NOTIFIER_SCOPE_CHANNEL, - 0)); + NV_ASSERT_OK( + krcErrorSetNotifier(pGpu, pKernelRc, + pKernelChannel, + exceptType, + kchannelGetEngineType(pKernelChannel), + RC_NOTIFIER_SCOPE_CHANNEL)); + + NV_ASSERT_OK( + krcErrorSendEventNotifications_HAL(pGpu, pKernelRc, + pKernelChannel, + kchannelGetEngineType(pKernelChannel), + exceptType, + RC_NOTIFIER_SCOPE_CHANNEL, + 0)); } } @@ -2290,7 +2241,8 @@ kgspInitVgpuPartitionLogging_IMPL NvU64 initTaskLogBUffOffset, NvU64 initTaskLogBUffSize, NvU64 vgpuTaskLogBUffOffset, - NvU64 vgpuTaskLogBuffSize + NvU64 vgpuTaskLogBuffSize, + NvBool *pPreserveLogBufferFull ) { struct @@ -2311,6 +2263,7 @@ kgspInitVgpuPartitionLogging_IMPL NV_STATUS nvStatus = NV_OK; RM_LIBOS_LOG_MEM *pTaskLog = NULL; char vm_string[8], sourceName[SOURCE_NAME_MAX_LENGTH]; + NvBool bPreserveLogBufferFull = NV_FALSE; if (gfid > MAX_PARTITIONS_WITH_GFID) { @@ -2328,6 +2281,11 @@ kgspInitVgpuPartitionLogging_IMPL // Setup logging for each task in vgpu partition for (NvU32 i = 0; i < NV_ARRAY_ELEMENTS(logInitValues); ++i) { + if (!bPreserveLogBufferFull) + { + bPreserveLogBufferFull = isLibosPreserveLogBufferFull(&pKernelGsp->logDecodeVgpuPartition[gfid - 1], pGpu->gpuInstance); + } + pTaskLog = &logInitValues[i].taskLogArr[gfid - 1]; NvP64 pVa = NvP64_NULL; @@ -2380,6 +2338,8 @@ kgspInitVgpuPartitionLogging_IMPL pKernelGsp->bHasVgpuLogs = NV_TRUE; + *pPreserveLogBufferFull = bPreserveLogBufferFull; + error_cleanup: if (pKernelGsp->pNvlogFlushMtx != NULL) portSyncMutexRelease(pKernelGsp->pNvlogFlushMtx); diff --git a/src/nvidia/src/kernel/gpu/nvlink/kernel_nvlinkcorelib.c b/src/nvidia/src/kernel/gpu/nvlink/kernel_nvlinkcorelib.c index 5606fc31f..91e9fa092 100644 --- a/src/nvidia/src/kernel/gpu/nvlink/kernel_nvlinkcorelib.c +++ b/src/nvidia/src/kernel/gpu/nvlink/kernel_nvlinkcorelib.c @@ -325,9 +325,12 @@ knvlinkCoreUpdateDeviceUUID_IMPL status = nvlink_lib_update_uuid_and_device_name(&devInfo, pGidString, pKernelNvlink->deviceName); - // Freeing pGidString here as it is malloc'd as part of gpuGetGidInfo_IMPL - if (pGidString != NULL) - portMemFree(pGidString); + // + // pGidString is malloc'd as part of gpuGetGidInfo_IMPL + // Store pGidString within pKernelNvlink so we can free it during + // knvlinkStatePostUnload to maintain alloc/free symmetry + // + pKernelNvlink->pGidString = pGidString; } #endif diff --git a/src/nvidia/src/kernel/gpu/nvlink/kernel_nvlinkstate.c b/src/nvidia/src/kernel/gpu/nvlink/kernel_nvlinkstate.c index e5aa8d163..98eeae344 100644 --- a/src/nvidia/src/kernel/gpu/nvlink/kernel_nvlinkstate.c +++ b/src/nvidia/src/kernel/gpu/nvlink/kernel_nvlinkstate.c @@ -1032,6 +1032,18 @@ _knvlinkPurgeState } } + // + // pGidString is allocated within knvlinkStatePostLoad -> knvlinkCoreUpdateDeviceUUID + // so need to free it during destruct + // Freeing it within knvlinkCoreRemoveDevice could create problems if + // AddDevice/RemoveDevice are used outside StateLoad/StatePostUnload/StateDestroy in the future + // + if (pKernelNvlink->pGidString) + { + portMemFree(pKernelNvlink->pGidString); + pKernelNvlink->pGidString = NULL; + } + _knvlinkPurgeState_end: #endif diff --git a/src/nvidia/src/kernel/gpu/rc/kernel_rc_notification.c b/src/nvidia/src/kernel/gpu/rc/kernel_rc_notification.c index 1e5790cf1..4668674d7 100644 --- a/src/nvidia/src/kernel/gpu/rc/kernel_rc_notification.c +++ b/src/nvidia/src/kernel/gpu/rc/kernel_rc_notification.c @@ -437,12 +437,14 @@ krcErrorSendEventNotifications_KERNEL ) { NV_ASSERT_OR_RETURN(!gpumgrGetBcEnabledStatus(pGpu), NV_ERR_INVALID_STATE); - NV_ASSERT_OR_RETURN(pKernelChannel != NULL, NV_ERR_INVALID_CHANNEL); - NV_ASSERT_OK_OR_RETURN( - krcErrorSendEventNotificationsCtxDma_HAL(pGpu, pKernelRc, - pKernelChannel, - scope)); + if (pKernelChannel != NULL) + { + NV_ASSERT_OK_OR_RETURN( + krcErrorSendEventNotificationsCtxDma_HAL(pGpu, pKernelRc, + pKernelChannel, + scope)); + } gpuNotifySubDeviceEvent(pGpu, NV2080_NOTIFIERS_RC_ERROR, diff --git a/src/nvidia/src/kernel/rmapi/client_resource.c b/src/nvidia/src/kernel/rmapi/client_resource.c index 483dca297..06029e2e8 100644 --- a/src/nvidia/src/kernel/rmapi/client_resource.c +++ b/src/nvidia/src/kernel/rmapi/client_resource.c @@ -2522,6 +2522,7 @@ _controllerParseStaticTable_v22 switch (header.version) { + case NVPCF_CONTROLLER_STATIC_TABLE_VERSION_24: case NVPCF_CONTROLLER_STATIC_TABLE_VERSION_23: case NVPCF_CONTROLLER_STATIC_TABLE_VERSION_22: { diff --git a/src/nvidia/src/kernel/rmapi/nv_gpu_ops.c b/src/nvidia/src/kernel/rmapi/nv_gpu_ops.c index 2456a64cb..a75e3d5c2 100644 --- a/src/nvidia/src/kernel/rmapi/nv_gpu_ops.c +++ b/src/nvidia/src/kernel/rmapi/nv_gpu_ops.c @@ -136,6 +136,8 @@ #include +#include "gpu/gpu_fabric_probe.h" + #define NV_GPU_OPS_NUM_GPFIFO_ENTRIES_DEFAULT 1024 #define NV_GPU_SMALL_PAGESIZE (4 * 1024) @@ -5930,8 +5932,6 @@ void nvGpuOpsMemoryFree(struct gpuAddressSpace *vaSpace, NvU64 pointer) portMemFree(memDesc); } - - NV_STATUS nvGpuOpsQueryCesCaps(struct gpuDevice *device, gpuCesCaps *cesCaps) { @@ -5961,6 +5961,99 @@ NV_STATUS nvGpuOpsQueryCesCaps(struct gpuDevice *device, return status; } +static NV_STATUS _convertSystemFabricStateToErrorCode +( + NV0000_CTRL_SYSTEM_GET_FABRIC_STATUS_PARAMS fabricParams +) +{ + switch (fabricParams.fabricStatus) + { + case NV0000_CTRL_GET_SYSTEM_FABRIC_STATUS_SKIP: + case NV0000_CTRL_GET_SYSTEM_FABRIC_STATUS_INITIALIZED: + return NV_OK; + + case NV0000_CTRL_GET_SYSTEM_FABRIC_STATUS_IN_PROGRESS: + case NV0000_CTRL_GET_SYSTEM_FABRIC_STATUS_UNINITIALIZED: + return NV_ERR_NVSWITCH_FABRIC_NOT_READY; + + default: + NV_PRINTF(LEVEL_ERROR, "Invalid Fabric State\n"); + return NV_ERR_INVALID_STATE; + } +} + +static NV_STATUS _convertGpuFabricProbeStateToErrorCode +( + NV2080_CTRL_CMD_GET_GPU_FABRIC_PROBE_INFO_PARAMS fabricProbeParams +) +{ + switch (fabricProbeParams.state) + { + case NV2080_CTRL_GPU_FABRIC_PROBE_STATE_UNSUPPORTED: + return NV_OK; + + case NV2080_CTRL_GPU_FABRIC_PROBE_STATE_IN_PROGRESS: + case NV2080_CTRL_GPU_FABRIC_PROBE_STATE_NOT_STARTED: + return NV_ERR_NVSWITCH_FABRIC_NOT_READY; + + case NV2080_CTRL_GPU_FABRIC_PROBE_STATE_COMPLETE: + { + // + // When state is NV2080_CTRL_GPU_FABRIC_PROBE_STATE_COMPLETE + // status has to be checked for probe response success/failure. + // + if (fabricProbeParams.status != NV_OK) + return NV_ERR_NVSWITCH_FABRIC_FAILURE; + else + return NV_OK; + } + + default: + NV_PRINTF(LEVEL_ERROR, "Invalid Fabric Probe State\n"); + return NV_ERR_INVALID_STATE; + } +} + +static NV_STATUS _gpuGetFabricStatus +( + struct gpuDevice *pDevice, + RM_API *pRmApi +) +{ + // When MIG is enabled, P2P is not supported, hence return early + if (pDevice->info.smcEnabled) + { + return NV_OK; + } + + if (isDeviceHopperPlus(pDevice)) + { + NV2080_CTRL_CMD_GET_GPU_FABRIC_PROBE_INFO_PARAMS fabricProbeParams = {0}; + + NV_ASSERT_OK_OR_RETURN(pRmApi->Control(pRmApi, + pDevice->session->handle, + pDevice->subhandle, + NV2080_CTRL_CMD_GET_GPU_FABRIC_PROBE_INFO, + &fabricProbeParams, + sizeof(fabricProbeParams))); + + return _convertGpuFabricProbeStateToErrorCode(fabricProbeParams); + } + else + { + NV0000_CTRL_SYSTEM_GET_FABRIC_STATUS_PARAMS fabricParams = {0}; + + NV_ASSERT_OK_OR_RETURN(pRmApi->Control(pRmApi, + pDevice->session->handle, + pDevice->session->handle, + NV0000_CTRL_CMD_SYSTEM_GET_FABRIC_STATUS, + &fabricParams, + sizeof(fabricParams))); + + return _convertSystemFabricStateToErrorCode(fabricParams); + } +} + NV_STATUS nvGpuOpsQueryCaps(struct gpuDevice *device, gpuCaps *caps) { NV_STATUS status; @@ -5994,6 +6087,8 @@ NV_STATUS nvGpuOpsQueryCaps(struct gpuDevice *device, gpuCaps *caps) caps->numaNodeId = infoParams.numaId; } + status = _gpuGetFabricStatus(device, pRmApi); + cleanup: _nvGpuOpsLocksRelease(&acquiredLocks); threadStateFree(&threadState, THREAD_STATE_FLAGS_NONE); @@ -6225,28 +6320,18 @@ nvGpuOpsQueryGpuConfidentialComputeCaps(NvHandle hClient, { NV_CONFIDENTIAL_COMPUTE_ALLOC_PARAMS confComputeAllocParams = {0}; NV_CONF_COMPUTE_CTRL_CMD_SYSTEM_GET_CAPABILITIES_PARAMS confComputeParams = {0}; - NV_CONF_COMPUTE_CTRL_CMD_GPU_GET_KEY_ROTATION_STATE_PARAMS keyRotationParams = {0}; RM_API *pRmApi = rmapiGetInterface(RMAPI_EXTERNAL_KERNEL); NvHandle hConfCompute = 0; NV_STATUS status = NV_OK; confComputeAllocParams.hClient = hClient; - status = pRmApi->Alloc(pRmApi, - hClient, - hClient, - &hConfCompute, - NV_CONFIDENTIAL_COMPUTE, - &confComputeAllocParams, - sizeof(confComputeAllocParams)); - if (status == NV_ERR_INVALID_CLASS) - { - pGpuConfComputeCaps->mode = UVM_GPU_CONF_COMPUTE_MODE_NONE; - return NV_OK; - } - else - { - NV_ASSERT_OK_OR_RETURN(status); - } + NV_ASSERT_OK_OR_RETURN(pRmApi->Alloc(pRmApi, + hClient, + hClient, + &hConfCompute, + NV_CONFIDENTIAL_COMPUTE, + &confComputeAllocParams, + sizeof(confComputeAllocParams))); NV_ASSERT_OK_OR_GOTO(status, pRmApi->Control(pRmApi, @@ -6259,27 +6344,45 @@ nvGpuOpsQueryGpuConfidentialComputeCaps(NvHandle hClient, if (confComputeParams.ccFeature == NV_CONF_COMPUTE_SYSTEM_FEATURE_APM_ENABLED) { - pGpuConfComputeCaps->mode = UVM_GPU_CONF_COMPUTE_MODE_APM; + NV_ASSERT_OK_OR_GOTO(status, NV_ERR_NOT_SUPPORTED, cleanup); } - else if (confComputeParams.ccFeature == NV_CONF_COMPUTE_SYSTEM_FEATURE_HCC_ENABLED) + // + // Although protected pcie uses the same HW features as HCC, we don't advertise + // PPCIe as a multi-gpu extension of HCC. This is because PPCIe does not meet + // the security bar of a full blown HCC solution. For PPCIe, we have traded off + // security for higher performance. Hence, RM does not report both HCC and PPCIe + // ON at the same time. Internally however we use the same code paths for HCC and + // PPCIe. + // + else if (confComputeParams.ccFeature == NV_CONF_COMPUTE_SYSTEM_FEATURE_HCC_ENABLED || + confComputeParams.multiGpuMode == NV_CONF_COMPUTE_SYSTEM_MULTI_GPU_MODE_PROTECTED_PCIE) { pGpuConfComputeCaps->mode = UVM_GPU_CONF_COMPUTE_MODE_HCC; } + else + { + pGpuConfComputeCaps->mode = UVM_GPU_CONF_COMPUTE_MODE_NONE; + } - keyRotationParams.hSubDevice = hSubdevice; - NV_ASSERT_OK_OR_GOTO(status, - pRmApi->Control(pRmApi, - hClient, - hConfCompute, - NV_CONF_COMPUTE_CTRL_CMD_GPU_GET_KEY_ROTATION_STATE, - &keyRotationParams, - sizeof(keyRotationParams)), - cleanup); - - if ((keyRotationParams.keyRotationState == NV_CONF_COMPUTE_CTRL_CMD_GPU_KEY_ROTATION_KERN_ENABLED) || - (keyRotationParams.keyRotationState == NV_CONF_COMPUTE_CTRL_CMD_GPU_KEY_ROTATION_BOTH_ENABLED)) + if (pGpuConfComputeCaps->mode != UVM_GPU_CONF_COMPUTE_MODE_NONE) { - pGpuConfComputeCaps->bKeyRotationEnabled = NV_TRUE; + NV_CONF_COMPUTE_CTRL_CMD_GPU_GET_KEY_ROTATION_STATE_PARAMS keyRotationParams = {0}; + + keyRotationParams.hSubDevice = hSubdevice; + NV_ASSERT_OK_OR_GOTO(status, + pRmApi->Control(pRmApi, + hClient, + hConfCompute, + NV_CONF_COMPUTE_CTRL_CMD_GPU_GET_KEY_ROTATION_STATE, + &keyRotationParams, + sizeof(keyRotationParams)), + cleanup); + + if ((keyRotationParams.keyRotationState == NV_CONF_COMPUTE_CTRL_CMD_GPU_KEY_ROTATION_KERN_ENABLED) || + (keyRotationParams.keyRotationState == NV_CONF_COMPUTE_CTRL_CMD_GPU_KEY_ROTATION_BOTH_ENABLED)) + { + pGpuConfComputeCaps->bKeyRotationEnabled = NV_TRUE; + } } cleanup: pRmApi->Free(pRmApi, hClient, hConfCompute); diff --git a/src/nvidia/src/kernel/virtualization/kernel_hostvgpudeviceapi.c b/src/nvidia/src/kernel/virtualization/kernel_hostvgpudeviceapi.c index bc3377198..cfb843e8a 100644 --- a/src/nvidia/src/kernel/virtualization/kernel_hostvgpudeviceapi.c +++ b/src/nvidia/src/kernel/virtualization/kernel_hostvgpudeviceapi.c @@ -27,7 +27,6 @@ #include "core/core.h" #include "core/locks.h" #include "os/os.h" -#include "virtualization/kernel_hostvgpudeviceapi.h" #include "dev_ctrl_defines.h" #include "mem_mgr/mem.h" #include "kernel/gpu/bif/kernel_bif.h" @@ -87,6 +86,7 @@ kernelhostvgpudeviceapiConstruct_IMPL RsShared *pShared; Device *pDevice; RsClient *pClient = NULL; + NvBool bPreserveLogBufferFull = NV_FALSE; // Forbid allocation of this class on Guest-RM // to avoid fuzzing this class in such cases. See bug 3529160. @@ -247,7 +247,8 @@ kernelhostvgpudeviceapiConstruct_IMPL pBootloadParams->initTaskLogBuffOffset, pBootloadParams->initTaskLogBuffSize, pBootloadParams->vgpuTaskLogBuffOffset, - pBootloadParams->vgpuTaskLogBuffSize), + pBootloadParams->vgpuTaskLogBuffSize, + &bPreserveLogBufferFull), done); } @@ -274,8 +275,11 @@ kernelhostvgpudeviceapiConstruct_IMPL NV2080_CTRL_CMD_VGPU_MGR_INTERNAL_BOOTLOAD_GSP_VGPU_PLUGIN_TASK, pBootloadParams, sizeof(*pBootloadParams)); - // Preserve any captured vGPU Partition logs - NV_ASSERT_OK(kgspPreserveVgpuPartitionLogging(pGpu, pKernelGsp, pAllocParams->gfid)); + if (!bPreserveLogBufferFull) + { + // Preserve any captured vGPU Partition logs + NV_ASSERT_OK(kgspPreserveVgpuPartitionLogging(pGpu, pKernelGsp, pAllocParams->gfid)); + } if (status != NV_OK) { diff --git a/version.mk b/version.mk index a275204dd..d6e3263d5 100644 --- a/version.mk +++ b/version.mk @@ -1,4 +1,4 @@ -NVIDIA_VERSION = 550.107.02 +NVIDIA_VERSION = 550.120 # This file. VERSION_MK_FILE := $(lastword $(MAKEFILE_LIST))