From f220d7e51f1e73e56d380c62a4b3b07b40758a5e Mon Sep 17 00:00:00 2001 From: Julie-CAO Date: Thu, 11 Jul 2024 15:23:16 +0800 Subject: [PATCH] Support vgpu automation test --- .../autoyast/dev_host_15.xml.ep | 2 +- lib/ipmi_backend_utils.pm | 4 +- products/sle/main.pm | 1 + tests/virt_autotest/vgpu.pm | 334 ++++++++++++++++++ 4 files changed, 338 insertions(+), 3 deletions(-) create mode 100644 tests/virt_autotest/vgpu.pm diff --git a/data/virt_autotest/host_unattended_installation_files/autoyast/dev_host_15.xml.ep b/data/virt_autotest/host_unattended_installation_files/autoyast/dev_host_15.xml.ep index e74b1aa9449d..e1c494ce0d1f 100644 --- a/data/virt_autotest/host_unattended_installation_files/autoyast/dev_host_15.xml.ep +++ b/data/virt_autotest/host_unattended_installation_files/autoyast/dev_host_15.xml.ep @@ -59,7 +59,7 @@ SERIALDEV='ttyS2', XEN_SERIAL_CONSOLE="com1=115200,8n1,0x3e8,5 console=com1" --> <%= defined $bmwqemu::vars{"XEN_SERIAL_CONSOLE"} ? $bmwqemu::vars{"XEN_SERIAL_CONSOLE"} : "console=com2,115200" %> vga=gfx-1024x768x16 loglvl=all guest_loglvl=all sync_console % } else { - splash=silent console=<%= $get_var->('SERIALDEV') %>,115200 console=tty loglevel=5 <%= $check_var->('AMD', '1') ? "" : "intel_iommu=on" %> <%= defined $bmwqemu::vars{"OPT_KERNEL_PARAMS"} ? $bmwqemu::vars{"OPT_KERNEL_PARAMS"} : "" %> + splash=silent console=<%= $get_var->('SERIALDEV') %>,115200 console=tty loglevel=5 <%= $get_var->('AMD') ? "" : "intel_iommu=on" %> <%= $get_var->('OPT_KERNEL_PARAMS') ? 
$get_var->('OPT_KERNEL_PARAMS') : "" %> % } default diff --git a/lib/ipmi_backend_utils.pm b/lib/ipmi_backend_utils.pm index 4fbf13394e76..a9800d0906dc 100644 --- a/lib/ipmi_backend_utils.pm +++ b/lib/ipmi_backend_utils.pm @@ -121,8 +121,8 @@ sub setup_console_in_grub { elsif (${virt_type} eq "kvm") { #enable Intel VT-d for SR-IOV test running on intel SUTs my $intel_option = ""; - if (get_var("ENABLE_SRIOV_NETWORK_CARD_PCI_PASSTHROUGH") && script_run("grep Intel /proc/cpuinfo") == 0) { - $intel_option = "intel_iommu=on"; + if (get_var("ENABLE_SRIOV_NETWORK_CARD_PCI_PASSTHROUGH") or get_var("ENABLE_VGPU")) { + $intel_option = "intel_iommu=on" if script_run("grep Intel /proc/cpuinfo") == 0; } $cmd diff --git a/products/sle/main.pm b/products/sle/main.pm index 070dcdef2ac3..cb1b7d60e5e6 100644 --- a/products/sle/main.pm +++ b/products/sle/main.pm @@ -620,6 +620,7 @@ sub load_virt_feature_tests { } loadtest "virt_autotest/xen_guest_irqbalance" if get_var("ENABLE_XEN_GUEST_IRQBALANCE"); loadtest "virt_autotest/sriov_network_card_pci_passthrough" if get_var("ENABLE_SRIOV_NETWORK_CARD_PCI_PASSTHROUGH"); + loadtest "virt_autotest/vgpu" if get_var("ENABLE_VGPU"); if (get_var('ENABLE_HOTPLUGGING')) { loadtest 'virtualization/universal/hotplugging_guest_preparation'; loadtest 'virtualization/universal/hotplugging_network_interfaces'; diff --git a/tests/virt_autotest/vgpu.pm b/tests/virt_autotest/vgpu.pm new file mode 100644 index 000000000000..5887a4a9d571 --- /dev/null +++ b/tests/virt_autotest/vgpu.pm @@ -0,0 +1,334 @@ +# SUSE's openQA tests +# Copyright 2020 SUSE LLC +# SPDX-License-Identifier: GPL-2.0-or-later +# +# Summary: A test to pass an NVIDIA vGPU to guest via libvirt. Only KVM hosts are supported. +# Test environment: one AMPERE GPU card, such A10, A30 etc., with SR-IOV capability and/or MIG features in host machine; +# A UEFI guest vm is defined in the host, and they are ssh accessible from host. 
# Test flow:
#   - install the NVIDIA vGPU manager on host and reboot
#   - create a GPU Instance and a Compute Instance if MIG feature is available
#   - create a vGPU device in host
#   - assign the vGPU device to guest vm
#   - install the vGPU driver in guest vm
#   - detach vGPU from guest and remove the vGPU
# Maintainer: Julie CAO

use base "virt_feature_test_base";
use strict;
use warnings;
use utils;
use testapi;
use virt_autotest::common;
use version_utils qw(is_sle);
use virt_autotest::utils;
use Utils::Backends qw(use_ssh_serial_console is_remote_backend);
use ipmi_backend_utils qw(reconnect_when_ssh_console_broken);

# Main test entry point: drives the whole vGPU life cycle on the host and
# on each defined guest (see "Test flow" above).
sub run_test {
    my $self = shift;

    # Use sol & ssh console explicitly in case the test module runs on an
    # already-installed host, where login_console.pm is not required any more
    select_console 'sol', await_console => 0;
    use_ssh_serial_console;

    # Skip gracefully unless the test prerequisites are fulfilled
    return unless is_host_ready_for_vgpu_test() eq 'yes';

    my $gpu_device = get_gpu_device();
    die "No NVIDIA AMPERE GPU card on the machine!" if $gpu_device eq '';

    # Clean up test logs
    my $log_dir = "/tmp/vgpu";
    script_run "[ -d $log_dir ] && rm -rf $log_dir; mkdir -p $log_dir";

    # Install NVIDIA vGPU manager
    my $vgpu_manager = get_required_var("VGPU_MANAGER_URL");
    install_vgpu_manager_and_reboot($vgpu_manager);

    # Enable SR-IOV VFs
    enable_sriov($gpu_device);

    # Create a CI for GPUs with MIG mode supported, such as A30/A100,
    # given MIG mode is enabled. Non-MIG mode will be handled when zen2 is available.
    if (get_var('GPU_MIG_MODE')) {
        enable_mig_mode();
        # Create a GI
        my ($gi_id, $gpu_id) = create_gpu_instance();
        # Create a CI
        my $ci_id = create_compute_instance($gi_id, $gpu_id);
    }

    # Create a vGPU device
    my $vgpu = create_vgpu($gpu_device, get_var('GPU_MIG_MODE'));

    save_original_guest_xmls();
    my $vgpu_grid_driver = get_required_var("VGPU_GRID_DRIVER_URL");
    foreach my $guest (keys %virt_autotest::common::guests) {
        record_info("Guest $guest");

        # Assign vGPU to guest
        my $gpu_slot_in_guest = "0x0a";
        check_guest_health($guest);    #may remove it later?
        assign_vgpu_to_guest($vgpu, $guest, $gpu_slot_in_guest);
        check_guest_health($guest);    #may remove it later?

        # Install vGPU grid driver in guest
        install_vgpu_grid_driver($guest, $vgpu_grid_driver);

        # Detach vGPU from guest
        detach_vgpu_from_guest($gpu_slot_in_guest, $guest);
        check_guest_health($guest);
    }

    # Remove vGPU device
    remove_vgpu($vgpu);

    # Disable GPU SR-IOV
    assert_script_run("/usr/lib/nvidia/sriov-manage -d $gpu_device");
    record_info("SRIOV disabled", $gpu_device);

    # Upload vgpu related logs (no logs so far)
    # upload_virt_logs($log_dir, "logs");

    # Redefine guests from their original configuration files
    restore_original_guests();
}

# Check whether the host fulfills the vGPU test prerequisites:
# KVM hypervisor on SLE 15-SP3 or newer, and IOMMU enabled on Intel machines.
# Returns 'yes' when supported, 'no' otherwise.
# (fix: the original returned bare undef on the unsupported path, which
# triggered an 'uninitialized value in string eq' warning at the caller)
sub is_host_ready_for_vgpu_test {
    if (is_sle('<15-SP3') or !is_kvm_host) {
        record_info("Host not supported!", "NVIDIA vGPU is only supported on KVM hosts with SLE15SP3 and greater!", result => 'softfail');
        return 'no';
    }
    # Check VT-d is enabled on Intel x86_64 machines
    if (script_run("grep -m 1 Intel /proc/cpuinfo") == 0) {
        assert_script_run "dmesg | grep -E \"DMAR:.*IOMMU enabled\"";
    }
    script_run("lsmod | grep -e vfio -e nvidia -e mdev");
    return 'yes';
}

# Look up the PCI slot of the first NVIDIA GPU with SR-IOV capability.
# Returns the PCI address (e.g. "3b:00.0") or an empty string when no
# suitable card is found (the caller compares the result against '').
sub get_gpu_device {
    # NVIDIA AMPERE GPU
    my $gpu_devices = script_output("lspci | grep NVIDIA | cut -d ' ' -f1");
    foreach my $device (split("\n", $gpu_devices)) {
        return $device if script_run("lspci -v -s $device | grep 'SR-IOV'") == 0;
    }
    # fix: return '' explicitly instead of falling off the end with undef
    return '';
}

# Download and install the NVIDIA vGPU manager on the host, reboot it and
# reconnect the consoles. Skips installation when the same driver version
# is already loaded.
# $driver_url - http URL of the .run installer on the file server
sub install_vgpu_manager_and_reboot {
    my $driver_url = shift;    # fix: local was misspelled ${dirver_url}
    die "The vGPU driver URL requires to be in 'http://...' format!" unless $driver_url =~ /http:\/\/.*\/(.*\.run)/;
    my $driver_file = $1;

    # Check if vGPU manager has been already loaded
    if (script_run("nvidia-smi") == 0) {
        my $driver_version = script_output("nvidia-smi -q | grep 'Driver Version' | grep -oE \"[0-9.]+\"");
        if ($driver_file =~ /$driver_version/) {
            record_info("Warning", "vGPU manager $driver_version has already been loaded!", result => 'softfail');
            return;
        }
    }
    # Install vGPU manager
    download_script($driver_file, script_url => $driver_url);
    zypper_call "in kernel-default-devel gcc";
    assert_script_run("modprobe vfio_pci_core") if is_sle('>=15-SP5');
    assert_script_run("./$driver_file -s");
    record_info("vGPU manager is installed successfully.");

    # Reboot host (it took 180+ seconds for ph052 to boot up with calltrace)
    script_run("grub2-once 0; reboot", die_on_timeout => 0);
    record_info("Host rebooting ...");
    select_console 'sol', await_console => 0;
    save_screenshot;
    reset_consoles();
    assert_screen([qw(sol-console-wait-typing-ret linux-login text-login)], 120);
    if (match_has_tag('sol-console-wait-typing-ret')) {
        send_key 'ret';
        # fix: needle tag was typo'd as 'inux-login' and could never match
        assert_screen([qw(linux-login text-login)], 120);
    }
    reset_consoles();
    select_console('root-ssh');

    # Verify the vGPU manager installed successfully
    assert_script_run("lspci -d 10de: -k");
    assert_script_run("lsmod | grep nvidia");
    assert_script_run("nvidia-smi");
    script_run("ls -ld /sys/class/m*");
}

# Enable SR-IOV virtual functions on the given GPU PCI device and verify
# the virtfn entries show up in sysfs.
sub enable_sriov {
    my $gpu = shift;
    assert_script_run("/usr/lib/nvidia/sriov-manage -e $gpu");
    script_run("dmesg | tail");
    assert_script_run("ls -l /sys/bus/pci/devices/0000:$gpu/ | grep virtfn");
    assert_script_run("lspci | grep NVIDIA");
    record_info("SR-IOV enabled");
}

# Turn on MIG (Multi-Instance GPU) mode on GPU 0 and verify it took effect.
sub enable_mig_mode {
    # Enable MIG mode
    assert_script_run("nvidia-smi -mig 1");
    # Verify MIG mode is enabled
    assert_script_run("nvidia-smi -i 0 --query-gpu=pci.bus_id,mig.mode.current --format=csv | grep 'Enabled'");
    record_info("MIG mode is enabled!");
}

# Create a GPU Instance by randomly picking a supported profile which still
# has free instances on this GPU device model.
# Returns ($gi_id, $gpu_id) parsed from the nvidia-smi output.
sub create_gpu_instance {

    # Only profiles with available instances are candidates ('0/N' means none free)
    script_run("nvidia-smi mig -lgip");
    my $avail_gi_profile_cmd = "nvidia-smi mig -lgip | grep MIG | grep -v '0/' | grep -v '+me'";
    die "No available GI profiles!" unless script_run($avail_gi_profile_cmd) == 0;
    my @gi_profile_ids = split '\n', script_output("$avail_gi_profile_cmd | awk '{ print \$5 }'");
    my $gi_profile = $gi_profile_ids[int(rand($#gi_profile_ids + 1))];
    my ($gi_id, $gpu_id);
    if (script_output("nvidia-smi mig -cgi $gi_profile") =~ /Successfully created GPU instance ID\s+(\d*) on GPU\s+(\d*)/) {
        ($gi_id, $gpu_id) = ($1, $2);
        assert_script_run("nvidia-smi mig -lgi");
    }
    else {
        die "Fail to create a GPU Instance!";
    }
    # fix: sigils were escaped ("\$gpu_id"), so the literal variable names
    # instead of the actual IDs ended up in the test log
    record_info("GI created", "GPU_ID: $gpu_id, GI_ID: $gi_id");

    return ($gi_id, $gpu_id);
}

# Create a Compute Instance which fully uses the given GPU Instance.
# $gi_id  - GPU instance ID returned by create_gpu_instance()
# $gpu_id - GPU ID (currently unused; kept for call-site compatibility)
# Returns the new CI id.
sub create_compute_instance {
    my $gi_id = shift;

    my $ci_id;
    if (script_output("nvidia-smi mig -cci -gi $gi_id") =~ /Successfully created compute instance ID\s+(\d*).*GPU instance ID\s+$gi_id/) {
        $ci_id = $1;
        assert_script_run("nvidia-smi mig -lci");
        script_run("nvidia-smi");
    }
    else {
        # fix: message previously said "GPU ID" for what is a GPU *instance* ID
        die "Fail to create a Compute Instance on GPU instance ID $gi_id";
    }
    # fix: un-escape the sigils so the real IDs are logged
    record_info("CI created", "GI_ID: $gi_id, CI_ID: $ci_id");
    return $ci_id;
}

# Create a vGPU mdev device on a random virtual function of the GPU.
# $gpu      - PCI slot of the physical GPU
# $mig_mode - truthy when the card runs in MIG mode (A30/A100); VF count
#             and the matching vGPU type names differ between the modes
# Returns the UUID of the created vGPU device.
sub create_vgpu {
    my ($gpu, $mig_mode) = @_;

    # Find available vgpu types
    my $vf_count = $mig_mode ? '8' : '32';
    assert_script_run("cd /sys/bus/pci/devices/0000:$gpu/virtfn" . int(rand($vf_count)) . "/mdev_supported_types");
    assert_script_run('for i in *; do echo $i $(cat $i/name) available instance: $(cat $i/avail*); done');
    # We have an NVIDIA A10 card with SR-IOV and an A30 with MIG mode;
    # they behave a little differently when creating a vGPU device
    my $vgpu_type = $mig_mode ? "A.*-.*-.*C(ME)?" : "A.*-.*[CQ]";
    # Find the vGPU types which still have available instances
    my @avail_types = split /\n/, script_output("for i in *; do [ `cat \$i/avail*` -ne 0 ] && sed -n '/ ${vgpu_type}\$/p' \$i/name; done | cut -d '/' -f1", proceed_on_failure => 1);
    die "No available vGPU types for GPU $gpu!" if @avail_types == 0;

    # Choose a random vgpu type and create a vGPU
    my $vgpu_type_name = $avail_types[int(rand($#avail_types + 1))];
    my $vgpu_type_id = script_output("grep -l '$vgpu_type_name' */name | cut -d '/' -f1");
    my $vgpu_id = script_output('uuidgen');
    assert_script_run("echo $vgpu_id > $vgpu_type_id/create");

    # Verify the vGPU was created successfully
    record_info("vGPU created", "$vgpu_type_name " . script_output("lsmdev"));
    assert_script_run("dmesg | tail");
    assert_script_run("nvidia-smi vgpu -q");

    return $vgpu_id;
}

# Attach the vGPU mdev device to a (shut off) UEFI guest and start it.
# $uuid - vGPU device UUID
# $vm   - guest name (must be a UEFI guest)
# $slot - PCI slot to use inside the guest, e.g. "0x0a"
sub assign_vgpu_to_guest {
    my ($uuid, $vm, $slot) = @_;

    # The UEFI guest created in virt test is ok for vGPU test
    die "vGPU only works on UEFI guests!" unless $vm =~ /uefi/i;
    assert_script_run("virsh shutdown $vm") unless script_output("virsh domstate $vm") eq "shut off";
    # Save guest xml to /tmp/$vm.xml, undefine the current definition and
    # re-define the guest with the modified xml
    my $vm_xml_file = "/tmp/$vm.xml";
    assert_script_run "virsh dumpxml --inactive $vm > $vm_xml_file";
    assert_script_run "virsh undefine $vm";

    # Add the vgpu device section to the guest configuration file
    # fix: the die message hard-coded '0x0a' instead of using $slot
    die "PCI slot '$slot' has already been used" if script_run("grep \"slot='$slot'\" $vm_xml_file") == 0;
    # NOTE(review): the XML payload below was reconstructed from the standard
    # libvirt mdev <hostdev> layout because the original markup was lost in
    # transit -- confirm against the original patch
    my $vgpu_xml_section = "<hostdev mode='subsystem' type='mdev' managed='no' model='vfio-pci' display='off'>\\\n"
      . "      <source>\\\n"
      . "        <address uuid='$uuid'/>\\\n"
      . "      </source>\\\n"
      . "      <address type='pci' domain='0x0000' bus='0x00' slot='$slot' function='0x0'/>\\\n"
      . "    </hostdev>";
    assert_script_run("sed -i \"/<devices>/a\\\\${vgpu_xml_section}\" $vm_xml_file");
    upload_logs($vm_xml_file);
    assert_script_run "virsh define $vm_xml_file";
    assert_script_run "virsh start $vm";
    wait_guest_online($vm);
    assert_script_run "ssh root\@$vm 'lspci | grep NVIDIA'";
    record_info("vGPU attached to $vm", script_output("ssh root\@$vm 'lspci | grep NVIDIA'"));
}

# Download and install the NVIDIA vGPU grid driver inside a guest.
# Skips installation when the same driver version is already loaded.
# $vm - guest name, $driver_url - http URL of the .run installer
sub install_vgpu_grid_driver {
    my ($vm, $driver_url) = @_;

    # Download drivers from the file server
    die "The vGPU driver URL requires to be in 'http://...' format!" unless $driver_url =~ /http:\/\/.*\/(.*\.run)/;
    my $driver_file = $1;

    # Check if the driver has been already installed
    if (script_run("ssh root\@$vm 'nvidia-smi'") == 0) {
        my $grid_version = script_output("ssh root\@$vm 'nvidia-smi -q | grep \"Driver Version\" | grep -oE \"[0-9.]+\"'");
        if ($driver_file =~ /$grid_version/) {
            record_info("Warning", "vGPU grid driver $grid_version has already been loaded!", result => 'softfail');
            return;
        }
    }

    # Install dependencies separately -- it is easier to locate problems this way
    download_script($driver_file, script_url => $driver_url, machine => $vm);
    assert_script_run "ssh root\@$vm 'zypper -n in kernel-default-devel'";
    assert_script_run "ssh root\@$vm 'zypper -n in libglvnd-devel'";
    # Install vGPU grid drivers without manual interactions
    assert_script_run("ssh root\@$vm './$driver_file -s'");
    assert_script_run("ssh root\@$vm 'nvidia-smi'");
    record_info("vGPU Grid driver is installed successfully in $vm");
    # Verify
    assert_script_run("ssh root\@$vm 'lsmod | grep -i nvidia'");
}

# Power off the guest, remove the vGPU hostdev from its definition with
# virt-xml, then boot it again.
# $slot - PCI slot the vGPU occupies in the guest, $vm - guest name
sub detach_vgpu_from_guest {
    my ($slot, $vm) = @_;
    assert_script_run("ssh root\@$vm 'poweroff'");
    script_retry("virsh domstate $vm | grep 'shut off'", timeout => 60, delay => 5, retry => 3, die => 0);
    assert_script_run("virt-xml $vm --remove-device --hostdev type=mdev,address.slot=$slot");
    assert_script_run("virsh start $vm");
    record_info("vGPU has been removed successfully from $vm");
    wait_guest_online($vm);
}

# Destroy the vGPU mdev device on the host via its sysfs 'remove' knob and
# verify it is gone.
sub remove_vgpu {
    my $uuid = shift;
    assert_script_run("cd `find /sys/devices -type d -name $uuid`");
    assert_script_run("echo '1' > remove");
    script_run('cd ../mdev_supported_types/; for i in *; do echo "$i" $(cat $i/name) available: $(cat $i/avail*); done');
    die "Fatal: vGPU device $uuid is still alive!" if script_run("lsmdev | grep $uuid") == 0;
    record_info("vGPU removed", "$uuid has been removed from the host");
}

# Collect the NVIDIA installer log on failure; reconnect the ssh console
# first when it appears to be broken.
sub post_fail_hook {
    my $self = shift;
    diag("Module vgpu post fail hook starts.");
    my $log_dir = "/tmp/vgpu";
    reconnect_when_ssh_console_broken unless defined(script_run("ls /var/log/nvidia-installer.log", die_on_timeout => 0));
    script_run("cp /var/log/nvidia-installer.log $log_dir");
    # TODO: decide whether guest network device status logs are needed here
    # save_network_device_status_logs($log_dir, $_, "post_fail_hook") foreach (keys %virt_autotest::common::guests);
    upload_virt_logs($log_dir, "vgpu");
    # $self->SUPER::post_fail_hook;
    # restore_original_guests();  # kept disabled for debugging
}

sub test_flags {
    # Continue subsequent tests in case the environment was restored
    return {fatal => 0};
}

1;