Skip to content

Commit

Permalink
Support vgpu automation test
Browse files Browse the repository at this point in the history
  • Loading branch information
Julie-CAO committed Jul 16, 2024
1 parent a2ce45e commit f220d7e
Show file tree
Hide file tree
Showing 4 changed files with 338 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
SERIALDEV='ttyS2', XEN_SERIAL_CONSOLE="com1=115200,8n1,0x3e8,5 console=com1" -->
<xen_kernel_append><%= defined $bmwqemu::vars{"XEN_SERIAL_CONSOLE"} ? $bmwqemu::vars{"XEN_SERIAL_CONSOLE"} : "console=com2,115200" %> vga=gfx-1024x768x16 loglvl=all guest_loglvl=all sync_console</xen_kernel_append>
% } else {
<append>splash=silent console=<%= $get_var->('SERIALDEV') %>,115200 console=tty loglevel=5 <%= $check_var->('AMD', '1') ? "" : "intel_iommu=on" %> <%= defined $bmwqemu::vars{"OPT_KERNEL_PARAMS"} ? $bmwqemu::vars{"OPT_KERNEL_PARAMS"} : "" %></append>
<append>splash=silent console=<%= $get_var->('SERIALDEV') %>,115200 console=tty loglevel=5 <%= $get_var->('AMD') ? "" : "intel_iommu=on" %> <%= $get_var->('OPT_KERNEL_PARAMS') ? $get_var->('OPT_KERNEL_PARAMS') : "" %></append>
% }
</global>
<loader_type>default</loader_type>
Expand Down
4 changes: 2 additions & 2 deletions lib/ipmi_backend_utils.pm
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,8 @@ sub setup_console_in_grub {
elsif (${virt_type} eq "kvm") {
#enable Intel VT-d for SR-IOV test running on intel SUTs
my $intel_option = "";
if (get_var("ENABLE_SRIOV_NETWORK_CARD_PCI_PASSTHROUGH") && script_run("grep Intel /proc/cpuinfo") == 0) {
$intel_option = "intel_iommu=on";
if (get_var("ENABLE_SRIOV_NETWORK_CARD_PCI_PASSTHROUGH") or get_var("ENABLE_VGPU")) {
$intel_option = "intel_iommu=on" if script_run("grep Intel /proc/cpuinfo") == 0;
}

$cmd
Expand Down
1 change: 1 addition & 0 deletions products/sle/main.pm
Original file line number Diff line number Diff line change
Expand Up @@ -620,6 +620,7 @@ sub load_virt_feature_tests {
}
loadtest "virt_autotest/xen_guest_irqbalance" if get_var("ENABLE_XEN_GUEST_IRQBALANCE");
loadtest "virt_autotest/sriov_network_card_pci_passthrough" if get_var("ENABLE_SRIOV_NETWORK_CARD_PCI_PASSTHROUGH");
loadtest "virt_autotest/vgpu" if get_var("ENABLE_VGPU");
if (get_var('ENABLE_HOTPLUGGING')) {
loadtest 'virtualization/universal/hotplugging_guest_preparation';
loadtest 'virtualization/universal/hotplugging_network_interfaces';
Expand Down
334 changes: 334 additions & 0 deletions tests/virt_autotest/vgpu.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,334 @@
# SUSE's openQA tests
# Copyright 2020 SUSE LLC
# SPDX-License-Identifier: GPL-2.0-or-later
#
# Summary: A test to pass an NVIDIA vGPU to a guest via libvirt. Only KVM hosts are supported.
# Test environment: one AMPERE GPU card, such as A10, A30, etc., with SR-IOV capability and/or MIG features on the host machine;
# A UEFI guest vm is defined on the host and is ssh-accessible from the host.
# Test flow:
# - install the NVIDIA vGPU manager on host and reboot
# - create a GPU Instance and a Compute Instance if MIG feature is available
# - create a vGPU device in host
# - assign the vGPU device to guest vm
# - install the vGPU driver in guest vm
# - detach vGPU from guest and remove the vGPU
# Maintainer: Julie CAO <[email protected]>

use base "virt_feature_test_base";
use strict;
use warnings;
use utils;
use testapi;
use virt_autotest::common;
use version_utils qw(is_sle);
use virt_autotest::utils;
use Utils::Backends qw(use_ssh_serial_console is_remote_backend);
use ipmi_backend_utils qw(reconnect_when_ssh_console_broken);

sub run_test {
    # Entry point: drive the full NVIDIA vGPU test flow on a KVM host —
    # install the vGPU manager, optionally set up MIG, create a vGPU device,
    # attach/detach it to every defined guest, then clean everything up.
    my $self = shift;

    # Use sol & ssh console explicitly in case the test module runs on an
    # already-installed host, thus no login_console.pm has been required any more
    select_console 'sol', await_console => 0;
    use_ssh_serial_console;

    # Bail out early unless the test prerequisites are fulfilled
    return unless is_host_ready_for_vgpu_test() eq 'yes';

    my $gpu_device = get_gpu_device();
    die "No NVIDIA AMPERE GPU card on the machine!" if $gpu_device eq '';

    # Clean up test logs from any previous run
    my $log_dir = "/tmp/vgpu";
    script_run "[ -d $log_dir ] && rm -rf $log_dir; mkdir -p $log_dir";

    # Install NVIDIA vGPU manager (the host driver); this reboots the host
    my $vgpu_manager = get_required_var("VGPU_MANAGER_URL");
    install_vgpu_manager_and_reboot($vgpu_manager);

    # Enable SR-IOV VFs on the GPU
    enable_sriov($gpu_device);

    # Create a CI for GPUs with MIG mode supported, such as A30/A100,
    # given MIG mode is enabled. Will handle non-MIG mode when zen2 is available
    if (get_var('GPU_MIG_MODE')) {
        enable_mig_mode();
        # Create a GI (GPU Instance)
        my ($gi_id, $gpu_id) = create_gpu_instance();
        # Create a CI (Compute Instance) on top of the GI
        my $ci_id = create_compute_instance($gi_id, $gpu_id);
    }

    # Create a vGPU (mdev) device
    my $vgpu = create_vgpu($gpu_device, get_var('GPU_MIG_MODE'));

    save_original_guest_xmls();
    my $vgpu_grid_driver = get_required_var("VGPU_GRID_DRIVER_URL");
    foreach my $guest (keys %virt_autotest::common::guests) {
        record_info("Guest $guest");

        # Assign vGPU to guest at a fixed PCI slot
        my $gpu_slot_in_guest = "0x0a";
        check_guest_health($guest);    #may remove it later?
        assign_vgpu_to_guest($vgpu, $guest, $gpu_slot_in_guest);
        check_guest_health($guest);    #may remove it later?

        # Install vGPU grid driver in guest
        install_vgpu_grid_driver($guest, $vgpu_grid_driver);

        # Detach vGPU from guest
        detach_vgpu_from_guest($gpu_slot_in_guest, $guest);
        check_guest_health($guest);
    }

    # Remove vGPU device from the host
    remove_vgpu($vgpu);

    # Disable GPU SR-IOV
    assert_script_run("/usr/lib/nvidia/sriov-manage -d $gpu_device");
    record_info("SRIOV disabled", $gpu_device);

    # Upload vgpu related logs (no logs collected so far)
    # upload_virt_logs($log_dir, "logs");

    # Redefine guests from their original configuration files
    restore_original_guests();
}

sub is_host_ready_for_vgpu_test {
    # Verify the host meets the prerequisites for NVIDIA vGPU testing.
    # Returns the string 'yes' when the host is usable, 'no' otherwise.
    # Callers compare the result with `eq 'yes'`, so never return undef
    # (a bare `return;` caused an uninitialized-value warning at the caller).
    if (is_sle('<15-SP3') or !is_kvm_host) {
        record_info("Host not supported!", "NVIDIA vGPU is only supported on KVM hosts with SLE15SP3 and greater!", result => 'softfail');
        return 'no';
    }
    # Check VT-d is supported and enabled on Intel x86_64 machines
    if (script_run("grep -m 1 Intel /proc/cpuinfo") == 0) {
        assert_script_run "dmesg | grep -E \"DMAR:.*IOMMU enabled\"";
    }
    # Informational only: show which vfio/nvidia/mdev modules are already loaded
    script_run("lsmod | grep -e vfio -e nvidia -e mdev");
    return 'yes';
}

sub get_gpu_device {
    # Locate the first NVIDIA GPU PCI device that advertises SR-IOV capability.
    # Returns the PCI address (e.g. '65:00.0'), or an empty string when no
    # suitable card is found — the caller checks the result with `eq ''`, and
    # previously the sub fell off the end returning an undefined value.
    my $gpu_devices = script_output("lspci | grep NVIDIA | cut -d ' ' -f1");
    foreach my $addr (split("\n", $gpu_devices)) {
        return $addr if script_run("lspci -v -s $addr | grep 'SR-IOV'") == 0;
    }
    return '';
}

sub install_vgpu_manager_and_reboot {
    # Download and install the NVIDIA vGPU manager (host driver), then reboot
    # the host so the new kernel modules get loaded.
    # $driver_url: http URL pointing at the NVIDIA '.run' installer.
    my $driver_url = shift;
    die "The vGPU driver URL requires to be in 'http://...' format!" unless $driver_url =~ /http:\/\/.*\/(.*\.run)/;
    my $driver_file = $1;

    # Skip installation when the very same vGPU manager version is already loaded
    if (script_run("nvidia-smi") == 0) {
        my $driver_version = script_output("nvidia-smi -q | grep 'Driver Version' | grep -oE \"[0-9.]+\"");
        if ($driver_file =~ /$driver_version/) {
            record_info("Warning", "vGPU manager $driver_version has already been loaded!", result => 'softfail');
            return;
        }
    }
    # Install vGPU manager (kernel headers and gcc are needed to build the modules)
    download_script($driver_file, script_url => $driver_url);
    zypper_call "in kernel-default-devel gcc";
    assert_script_run("modprobe vfio_pci_core") if is_sle('>=15-SP5');
    assert_script_run("./$driver_file -s");
    record_info("vGPU manager is installed successfully.");

    # Reboot host (it took 180+ seconds for ph052 to boot up with calltrace)
    script_run("grub2-once 0; reboot", die_on_timeout => 0);
    record_info("Host rebooting ...");
    select_console 'sol', await_console => 0;
    save_screenshot;
    reset_consoles();
    assert_screen([qw(sol-console-wait-typing-ret linux-login text-login)], 120);
    if (match_has_tag('sol-console-wait-typing-ret')) {
        send_key 'ret';
        # Fixed needle tag: was misspelled 'inux-login', which could never match
        assert_screen([qw(linux-login text-login)], 120);
    }
    reset_consoles();
    select_console('root-ssh');

    # Verify the vGPU manager installed successfully and its modules are loaded
    assert_script_run("lspci -d 10de: -k");
    assert_script_run("lsmod | grep nvidia");
    assert_script_run("nvidia-smi");
    script_run("ls -ld /sys/class/m*");
}

sub enable_sriov {
    # Turn on SR-IOV virtual functions for the given GPU PCI device.
    my $device = shift;

    assert_script_run("/usr/lib/nvidia/sriov-manage -e $device");
    # Dump the latest kernel messages for debugging purposes
    script_run("dmesg | tail");
    # The VFs appear as 'virtfn*' symlinks under the physical function's sysfs dir
    assert_script_run("ls -l /sys/bus/pci/devices/0000:$device/ | grep virtfn");
    assert_script_run("lspci | grep NVIDIA");
    record_info("SR-IOV enabled");
}

sub enable_mig_mode {
    # Switch the GPU into Multi-Instance GPU (MIG) mode and verify it took effect.
    assert_script_run("nvidia-smi -mig 1");
    # Verify MIG mode is reported as enabled on GPU 0
    assert_script_run("nvidia-smi -i 0 --query-gpu=pci.bus_id,mig.mode.current --format=csv | grep 'Enabled'");
    # Fixed typo in the recorded message ('enalbed' -> 'enabled')
    record_info("MIG mode is enabled!");
}

sub create_gpu_instance {
    # Create a GPU Instance (GI) by randomly picking a supported profile that
    # still has free instances on the GPU device.
    # Returns ($gi_id, $gpu_id) of the newly created instance; dies on failure.

    # Only profiles with available instances (not '0/N') and without the
    # media-extension variants ('+me') are candidates
    script_run("nvidia-smi mig -lgip");
    my $avail_gi_profile_cmd = "nvidia-smi mig -lgip | grep MIG | grep -v '0/' | grep -v '+me'";
    die "No available GI profiles!" unless script_run($avail_gi_profile_cmd) == 0;
    my @gi_profile_ids = split '\n', script_output("$avail_gi_profile_cmd | awk '{ print \$5 }'");
    # Pick one candidate profile at random
    my $gi_profile = $gi_profile_ids[int(rand($#gi_profile_ids + 1))];
    my ($gi_id, $gpu_id);
    if (script_output("nvidia-smi mig -cgi $gi_profile") =~ /Successfully created GPU instance ID\s+(\d*) on GPU\s+(\d*)/) {
        ($gi_id, $gpu_id) = ($1, $2);
        assert_script_run("nvidia-smi mig -lgi");
    }
    else {
        die "Fail to create a GPU Instance!";
    }
    # Interpolate the real ids: the sigils were escaped ("\$gpu_id") before,
    # which logged the literal variable names instead of their values
    record_info("GI created", "GPU_ID: $gpu_id, GI_ID: $gi_id");

    return ($gi_id, $gpu_id);
}

sub create_compute_instance {
    # Create a Compute Instance (CI) that fully uses the given GPU Instance.
    # $gi_id: the GPU Instance id to create the CI on (the caller also passes
    # a gpu id as a second argument, which is currently unused here).
    # Returns the new CI id; dies on failure.
    my $gi_id = shift;

    my $ci_id;
    if (script_output("nvidia-smi mig -cci -gi $gi_id") =~ /Successfully created compute instance ID\s+(\d*).*GPU instance ID\s+$gi_id/) {
        $ci_id = $1;
        assert_script_run("nvidia-smi mig -lci");
        script_run("nvidia-smi");
    }
    else {
        # $gi_id is a GPU *instance* id; the old message mislabeled it 'GPU ID'
        die "Fail to create a Compute Instance on GPU instance ID $gi_id";
    }
    # Interpolate the real ids: the sigils were escaped ("\$gi_id") before,
    # which logged the literal variable names instead of their values
    record_info("CI created", "GI_ID: $gi_id, CI_ID: $ci_id");
    return $ci_id;
}

sub create_vgpu {
    # Create a vGPU (mdev) device on a randomly chosen virtual function of
    # the given GPU, picking a random vGPU type that still has free instances.
    # $gpu: PCI address of the physical GPU
    # $mig_mode: truthy when the card runs in MIG mode (e.g. A30/A100)
    # Returns the UUID of the newly created mdev device.
    my ($gpu, $mig_mode) = @_;

    # MIG-capable cards expose 8 VFs; pure SR-IOV cards (e.g. A10) expose 32
    my $vf_count = $mig_mode ? '8' : '32';
    assert_script_run("cd /sys/bus/pci/devices/0000:$gpu/virtfn" . int(rand($vf_count)) . "/mdev_supported_types");
    assert_script_run('for i in *; do echo $i $(cat $i/name) available instance: $(cat $i/avail*); done');
    # A10 (SR-IOV) and A30 (MIG) name their vGPU types differently,
    # so match with a mode-specific pattern
    my $vgpu_type = $mig_mode ? "A.*-.*-.*C(ME)?" : "A.*-.*[CQ]";
    # Collect the type names which still have available instances
    my @avail_types = split /\n/, script_output("for i in *; do [ `cat \$i/avail*` -ne 0 ] && sed -n '/ ${vgpu_type}\$/p' \$i/name; done | cut -d '/' -f1", proceed_on_failure => 1);
    die "No available vGPU types for GPU $gpu!" unless @avail_types;

    # Pick a random available type and instantiate a vGPU of that type
    my $vgpu_type_name = $avail_types[int(rand(scalar @avail_types))];
    my $vgpu_type_id = script_output("grep -l '$vgpu_type_name' */name | cut -d '/' -f1");
    my $vgpu_id = script_output('uuidgen');
    # Writing a uuid to the type's 'create' attribute instantiates the mdev
    assert_script_run("echo $vgpu_id > $vgpu_type_id/create");

    # Verify the vGPU was created successfully
    record_info("vGPU created", "$vgpu_type_name " . script_output("lsmdev"));
    assert_script_run("dmesg | tail");
    assert_script_run("nvidia-smi vgpu -q");

    return $vgpu_id;
}

sub assign_vgpu_to_guest {
    # Attach an existing vGPU (mdev) device to a guest by adding a hostdev
    # section to its libvirt definition, then start the guest and verify the
    # GPU is visible inside it.
    # $uuid: mdev device uuid; $vm: guest name; $slot: guest PCI slot (e.g. '0x0a')
    my ($uuid, $vm, $slot) = @_;

    # The UEFI guest created in virt test is ok for vGPU test
    die "vGPU only works on UEFI guests!" unless $vm =~ /uefi/i;
    assert_script_run("virsh shutdown $vm") unless script_output("virsh domstate $vm") eq "shut off";
    # Save guest xml to /tmp/<vm>.xml, undefine the current definition and
    # re-define it with the vGPU hostdev added
    my $vm_xml_file = "/tmp/$vm.xml";
    assert_script_run "virsh dumpxml --inactive $vm > $vm_xml_file";
    assert_script_run "virsh undefine $vm";

    # Add the vgpu device section to the guest configuration file.
    # Report the actual $slot parameter (the message used to hard-code '0x0a').
    die "PCI slot '$slot' has already been used" if script_run("grep \"slot='$slot'\" $vm_xml_file") == 0;
    my $vgpu_xml_section = "<hostdev mode='subsystem' type='mdev' model='vfio-pci' display='off'>\\\n <source>\\\n <address uuid='$uuid'/>\\\n </source>\\\n <address type='pci' domain='0x0000' bus='0x00' slot='$slot' function='0x0'/>\\\n</hostdev>";

    assert_script_run("sed -i \"/<devices>/a\\\\${vgpu_xml_section}\" $vm_xml_file");
    upload_logs($vm_xml_file);
    assert_script_run "virsh define $vm_xml_file";
    assert_script_run "virsh start $vm";
    wait_guest_online($vm);
    assert_script_run "ssh root\@$vm 'lspci | grep NVIDIA'";
    record_info("vGPU attached to $vm", script_output("ssh root\@$vm 'lspci | grep NVIDIA'"));
}

sub install_vgpu_grid_driver {
    # Install the NVIDIA vGPU grid (guest) driver inside the given vm over ssh.
    # $vm: guest name, reachable as root via ssh
    # $driver_url: http URL of the NVIDIA '.run' installer on the fileserver
    my ($vm, $driver_url) = @_;

    # Derive the installer file name from the download URL
    die "The vGPU driver URL requires to be in 'http://...' format!" unless $driver_url =~ /http:\/\/.*\/(.*\.run)/;
    my $installer = $1;

    # Nothing to do when this very grid driver version is already loaded in the guest
    if (script_run("ssh root\@$vm 'nvidia-smi'") == 0) {
        my $loaded_version = script_output("ssh root\@$vm 'nvidia-smi -q | grep \"Driver Version\" | grep -oE \"[0-9.]+\"'");
        if ($installer =~ /$loaded_version/) {
            record_info("Warning", "vGPU grid driver $loaded_version has already been loaded!", result => 'softfail');
            return;
        }
    }

    # Install the dependencies separately — easier to locate problems this way
    download_script($installer, script_url => $driver_url, machine => $vm);
    assert_script_run "ssh root\@$vm 'zypper -n in kernel-default-devel'";
    assert_script_run "ssh root\@$vm 'zypper -n in libglvnd-devel'";
    # '-s' runs the NVIDIA installer silently, without manual interaction
    assert_script_run("ssh root\@$vm './$installer -s'");
    assert_script_run("ssh root\@$vm 'nvidia-smi'");
    record_info("vGPU Grid driver is installed successfully in $vm");
    # Verify the nvidia kernel modules are loaded in the guest
    assert_script_run("ssh root\@$vm 'lsmod | grep -i nvidia'");
}

sub detach_vgpu_from_guest {
    # Power off the guest, strip the vGPU hostdev at the given PCI slot from
    # its libvirt definition, then boot the guest again.
    my ($pci_slot, $guest) = @_;

    assert_script_run("ssh root\@$guest 'poweroff'");
    # Wait up to 3 retries for 'shut off'; die => 0 keeps going even on timeout
    script_retry("virsh domstate $guest | grep 'shut off'", timeout => 60, delay => 5, retry => 3, die => 0);
    assert_script_run("virt-xml $guest --remove-device --hostdev type=mdev,address.slot=$pci_slot");
    assert_script_run("virsh start $guest");
    record_info("vGPU has been removed successfully from $guest");
    wait_guest_online($guest);
}

sub remove_vgpu {
    # Destroy the mdev (vGPU) device identified by the given uuid on the host.
    my $mdev_uuid = shift;

    # The mdev device's sysfs directory is named after its uuid
    assert_script_run("cd `find /sys/devices -type d -name $mdev_uuid`");
    # Writing 1 to the 'remove' attribute tears the mdev device down
    assert_script_run("echo '1' > remove");
    script_run('cd ../mdev_supported_types/; for i in *; do echo "$i" $(cat $i/name) available: $(cat $i/avail*); done');
    die "Fatal: vGPU device $mdev_uuid is still alive!" if script_run("lsmdev | grep $mdev_uuid") == 0;
    record_info("vGPU removed", "$mdev_uuid has been removed from the host");
}

sub post_fail_hook {
    # On failure: recover the ssh console if needed, collect the NVIDIA
    # installer log and upload it under the vgpu log directory.
    my $self = shift;
    diag("Module vgpu post fail hook starts.");
    my $log_dir = "/tmp/vgpu";
    # NOTE(review): presumably script_run returns undef here when the ssh
    # console is broken/timed out, triggering the reconnect — confirm
    reconnect_when_ssh_console_broken unless defined(script_run("ls /var/log/nvidia-installer.log", die_on_timeout => 0));
    script_run("cp /var/log/nvidia-installer.log $log_dir");
    # TODO(julie): need the following lines??
    # save_network_device_status_logs($log_dir, $_, "post_fail_hook") foreach (keys %virt_autotest::common::guests);
    upload_virt_logs($log_dir, "vgpu");
    # Base-class hook and guest restore are intentionally disabled for debugging
    # $self->SUPER::post_fail_hook;
    # restore_original_guests(); julie: for debug, don't restore guests
}

sub test_flags {
    # Non-fatal module: subsequent tests continue even when this one fails
    # (e.g. after the environment has been restored)
    return {fatal => 0};
}

1;

0 comments on commit f220d7e

Please sign in to comment.