Skip to content

Commit

Permalink
Support vgpu automation test
Browse files Browse the repository at this point in the history
  • Loading branch information
Julie-CAO committed Jul 16, 2024
1 parent a2ce45e commit f220d7e
Show file tree
Hide file tree
Showing 4 changed files with 338 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
SERIALDEV='ttyS2', XEN_SERIAL_CONSOLE="com1=115200,8n1,0x3e8,5 console=com1" -->
<xen_kernel_append><%= defined $bmwqemu::vars{"XEN_SERIAL_CONSOLE"} ? $bmwqemu::vars{"XEN_SERIAL_CONSOLE"} : "console=com2,115200" %> vga=gfx-1024x768x16 loglvl=all guest_loglvl=all sync_console</xen_kernel_append>
% } else {
<append>splash=silent console=<%= $get_var->('SERIALDEV') %>,115200 console=tty loglevel=5 <%= $check_var->('AMD', '1') ? "" : "intel_iommu=on" %> <%= defined $bmwqemu::vars{"OPT_KERNEL_PARAMS"} ? $bmwqemu::vars{"OPT_KERNEL_PARAMS"} : "" %></append>
<append>splash=silent console=<%= $get_var->('SERIALDEV') %>,115200 console=tty loglevel=5 <%= $get_var->('AMD') ? "" : "intel_iommu=on" %> <%= $get_var->('OPT_KERNEL_PARAMS') ? $get_var->('OPT_KERNEL_PARAMS') : "" %></append>
% }
</global>
<loader_type>default</loader_type>
Expand Down
4 changes: 2 additions & 2 deletions lib/ipmi_backend_utils.pm
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,8 @@ sub setup_console_in_grub {
elsif (${virt_type} eq "kvm") {
#enable Intel VT-d for SR-IOV test running on intel SUTs
my $intel_option = "";
if (get_var("ENABLE_SRIOV_NETWORK_CARD_PCI_PASSTHROUGH") && script_run("grep Intel /proc/cpuinfo") == 0) {
$intel_option = "intel_iommu=on";
if (get_var("ENABLE_SRIOV_NETWORK_CARD_PCI_PASSTHROUGH") or get_var("ENABLE_VGPU")) {
$intel_option = "intel_iommu=on" if script_run("grep Intel /proc/cpuinfo") == 0;
}

$cmd
Expand Down
1 change: 1 addition & 0 deletions products/sle/main.pm
Original file line number Diff line number Diff line change
Expand Up @@ -620,6 +620,7 @@ sub load_virt_feature_tests {
}
loadtest "virt_autotest/xen_guest_irqbalance" if get_var("ENABLE_XEN_GUEST_IRQBALANCE");
loadtest "virt_autotest/sriov_network_card_pci_passthrough" if get_var("ENABLE_SRIOV_NETWORK_CARD_PCI_PASSTHROUGH");
loadtest "virt_autotest/vgpu" if get_var("ENABLE_VGPU");
if (get_var('ENABLE_HOTPLUGGING')) {
loadtest 'virtualization/universal/hotplugging_guest_preparation';
loadtest 'virtualization/universal/hotplugging_network_interfaces';
Expand Down
334 changes: 334 additions & 0 deletions tests/virt_autotest/vgpu.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,334 @@
# SUSE's openQA tests
# Copyright 2020 SUSE LLC
# SPDX-License-Identifier: GPL-2.0-or-later
#
# Summary: A test to pass an NVIDIA vGPU to a guest via libvirt. Only KVM hosts are supported.
# Test environment: one AMPERE GPU card, such as A10, A30, etc., with SR-IOV capability and/or MIG features on the host machine;
# A UEFI guest vm is defined on the host and is ssh-accessible from the host.
# Test flow:
# - install the NVIDIA vGPU manager on host and reboot
# - create a GPU Instance and a Compute Instance if MIG feature is available
# - create a vGPU device in host
# - assign the vGPU device to guest vm
# - install the vGPU driver in guest vm
# - detach vGPU from guest and remove the vGPU
# Maintainer: Julie CAO <[email protected]>

use base "virt_feature_test_base";
use strict;
use warnings;
use utils;
use testapi;
use virt_autotest::common;
use version_utils qw(is_sle);
use virt_autotest::utils;
use Utils::Backends qw(use_ssh_serial_console is_remote_backend);
use ipmi_backend_utils qw(reconnect_when_ssh_console_broken);

sub run_test {
    # Entry point: drive the full NVIDIA vGPU test flow on a KVM host —
    # install the vGPU manager, optionally set up MIG, create a vGPU device,
    # attach/detach it to every defined guest, then clean everything up.
    my $self = shift;

    # Use sol & ssh console explicitly in case the test module runs on an
    # already-installed host, thus no login_console.pm has been required any more
    select_console 'sol', await_console => 0;
    use_ssh_serial_console;

    # Bail out early unless the test prerequisites are fulfilled
    return unless is_host_ready_for_vgpu_test() eq 'yes';

    my $gpu_device = get_gpu_device();
    die "No NVIDIA AMPERE GPU card on the machine!" if $gpu_device eq '';

    # Clean up test logs from any previous run
    my $log_dir = "/tmp/vgpu";
    script_run "[ -d $log_dir ] && rm -rf $log_dir; mkdir -p $log_dir";

    # Install NVIDIA vGPU manager (the host driver); this reboots the host
    my $vgpu_manager = get_required_var("VGPU_MANAGER_URL");
    install_vgpu_manager_and_reboot($vgpu_manager);

    # Enable SR-IOV VFs on the GPU
    enable_sriov($gpu_device);

    # Create a CI for GPUs with MIG mode supported, such as A30/A100,
    # given MIG mode is enabled. Will handle non-MIG mode when zen2 is available
    if (get_var('GPU_MIG_MODE')) {
        enable_mig_mode();
        # Create a GI (GPU Instance)
        my ($gi_id, $gpu_id) = create_gpu_instance();
        # Create a CI (Compute Instance) on top of the GI
        my $ci_id = create_compute_instance($gi_id, $gpu_id);
    }

    # Create a vGPU (mdev) device
    my $vgpu = create_vgpu($gpu_device, get_var('GPU_MIG_MODE'));

    save_original_guest_xmls();
    my $vgpu_grid_driver = get_required_var("VGPU_GRID_DRIVER_URL");
    foreach my $guest (keys %virt_autotest::common::guests) {
        record_info("Guest $guest");

        # Assign vGPU to guest at a fixed PCI slot
        my $gpu_slot_in_guest = "0x0a";
        check_guest_health($guest);    #may remove it later?
        assign_vgpu_to_guest($vgpu, $guest, $gpu_slot_in_guest);
        check_guest_health($guest);    #may remove it later?

        # Install vGPU grid driver in guest
        install_vgpu_grid_driver($guest, $vgpu_grid_driver);

        # Detach vGPU from guest
        detach_vgpu_from_guest($gpu_slot_in_guest, $guest);
        check_guest_health($guest);
    }

    # Remove vGPU device from the host
    remove_vgpu($vgpu);

    # Disable GPU SR-IOV
    assert_script_run("/usr/lib/nvidia/sriov-manage -d $gpu_device");
    record_info("SRIOV disabled", $gpu_device);

    # Upload vgpu related logs (no logs collected so far)
    # upload_virt_logs($log_dir, "logs");

    # Redefine guests from their original configuration files
    restore_original_guests();
}

sub is_host_ready_for_vgpu_test {
    # Verify the host meets the prerequisites for NVIDIA vGPU testing.
    # Returns the string 'yes' when the host is usable, 'no' otherwise.
    # Callers compare the result with `eq 'yes'`, so never return undef
    # (a bare `return;` caused an uninitialized-value warning at the caller).
    if (is_sle('<15-SP3') or !is_kvm_host) {
        record_info("Host not supported!", "NVIDIA vGPU is only supported on KVM hosts with SLE15SP3 and greater!", result => 'softfail');
        return 'no';
    }
    # Check VT-d is supported and enabled on Intel x86_64 machines
    if (script_run("grep -m 1 Intel /proc/cpuinfo") == 0) {
        assert_script_run "dmesg | grep -E \"DMAR:.*IOMMU enabled\"";
    }
    # Informational only: show which vfio/nvidia/mdev modules are already loaded
    script_run("lsmod | grep -e vfio -e nvidia -e mdev");
    return 'yes';
}

sub get_gpu_device {
    # Locate the first NVIDIA GPU PCI device that advertises SR-IOV capability.
    # Returns the PCI address (e.g. '65:00.0'), or an empty string when no
    # suitable card is found — the caller checks the result with `eq ''`, and
    # previously the sub fell off the end returning an undefined value.
    my $gpu_devices = script_output("lspci | grep NVIDIA | cut -d ' ' -f1");
    foreach my $addr (split("\n", $gpu_devices)) {
        return $addr if script_run("lspci -v -s $addr | grep 'SR-IOV'") == 0;
    }
    return '';
}

sub install_vgpu_manager_and_reboot {
    # Download and install the NVIDIA vGPU manager (host driver), then reboot
    # the host so the new kernel modules get loaded.
    # $driver_url: http URL pointing at the NVIDIA '.run' installer.
    my $driver_url = shift;
    die "The vGPU driver URL requires to be in 'http://...' format!" unless $driver_url =~ /http:\/\/.*\/(.*\.run)/;
    my $driver_file = $1;

    # Skip installation when the very same vGPU manager version is already loaded
    if (script_run("nvidia-smi") == 0) {
        my $driver_version = script_output("nvidia-smi -q | grep 'Driver Version' | grep -oE \"[0-9.]+\"");
        if ($driver_file =~ /$driver_version/) {
            record_info("Warning", "vGPU manager $driver_version has already been loaded!", result => 'softfail');
            return;
        }
    }
    # Install vGPU manager (kernel headers and gcc are needed to build the modules)
    download_script($driver_file, script_url => $driver_url);
    zypper_call "in kernel-default-devel gcc";
    assert_script_run("modprobe vfio_pci_core") if is_sle('>=15-SP5');
    assert_script_run("./$driver_file -s");
    record_info("vGPU manager is installed successfully.");

    # Reboot host (it took 180+ seconds for ph052 to boot up with calltrace)
    script_run("grub2-once 0; reboot", die_on_timeout => 0);
    record_info("Host rebooting ...");
    select_console 'sol', await_console => 0;
    save_screenshot;
    reset_consoles();
    assert_screen([qw(sol-console-wait-typing-ret linux-login text-login)], 120);
    if (match_has_tag('sol-console-wait-typing-ret')) {
        send_key 'ret';
        # Fixed needle tag: was misspelled 'inux-login', which could never match
        assert_screen([qw(linux-login text-login)], 120);
    }
    reset_consoles();
    select_console('root-ssh');

    # Verify the vGPU manager installed successfully and its modules are loaded
    assert_script_run("lspci -d 10de: -k");
    assert_script_run("lsmod | grep nvidia");
    assert_script_run("nvidia-smi");
    script_run("ls -ld /sys/class/m*");
}

sub enable_sriov {
    # Turn on SR-IOV virtual functions for the given GPU PCI device.
    my $device = shift;

    assert_script_run("/usr/lib/nvidia/sriov-manage -e $device");
    # Dump the latest kernel messages for debugging purposes
    script_run("dmesg | tail");
    # The VFs appear as 'virtfn*' symlinks under the physical function's sysfs dir
    assert_script_run("ls -l /sys/bus/pci/devices/0000:$device/ | grep virtfn");
    assert_script_run("lspci | grep NVIDIA");
    record_info("SR-IOV enabled");
}

sub enable_mig_mode {
    # Switch the GPU into Multi-Instance GPU (MIG) mode and verify it took effect.
    assert_script_run("nvidia-smi -mig 1");
    # Verify MIG mode is reported as enabled on GPU 0
    assert_script_run("nvidia-smi -i 0 --query-gpu=pci.bus_id,mig.mode.current --format=csv | grep 'Enabled'");
    # Fixed typo in the recorded message ('enalbed' -> 'enabled')
    record_info("MIG mode is enabled!");
}

sub create_gpu_instance {
    # Create a GPU Instance (GI) by randomly picking a supported profile that
    # still has free instances on the GPU device.
    # Returns ($gi_id, $gpu_id) of the newly created instance; dies on failure.

    # Only profiles with available instances (not '0/N') and without the
    # media-extension variants ('+me') are candidates
    script_run("nvidia-smi mig -lgip");
    my $avail_gi_profile_cmd = "nvidia-smi mig -lgip | grep MIG | grep -v '0/' | grep -v '+me'";
    die "No available GI profiles!" unless script_run($avail_gi_profile_cmd) == 0;
    my @gi_profile_ids = split '\n', script_output("$avail_gi_profile_cmd | awk '{ print \$5 }'");
    # Pick one candidate profile at random
    my $gi_profile = $gi_profile_ids[int(rand($#gi_profile_ids + 1))];
    my ($gi_id, $gpu_id);
    if (script_output("nvidia-smi mig -cgi $gi_profile") =~ /Successfully created GPU instance ID\s+(\d*) on GPU\s+(\d*)/) {
        ($gi_id, $gpu_id) = ($1, $2);
        assert_script_run("nvidia-smi mig -lgi");
    }
    else {
        die "Fail to create a GPU Instance!";
    }
    # Interpolate the real ids: the sigils were escaped ("\$gpu_id") before,
    # which logged the literal variable names instead of their values
    record_info("GI created", "GPU_ID: $gpu_id, GI_ID: $gi_id");

    return ($gi_id, $gpu_id);
}

sub create_compute_instance {
    # Create a Compute Instance (CI) that fully uses the given GPU Instance.
    # $gi_id: the GPU Instance id to create the CI on (the caller also passes
    # a gpu id as a second argument, which is currently unused here).
    # Returns the new CI id; dies on failure.
    my $gi_id = shift;

    my $ci_id;
    if (script_output("nvidia-smi mig -cci -gi $gi_id") =~ /Successfully created compute instance ID\s+(\d*).*GPU instance ID\s+$gi_id/) {
        $ci_id = $1;
        assert_script_run("nvidia-smi mig -lci");
        script_run("nvidia-smi");
    }
    else {
        # $gi_id is a GPU *instance* id; the old message mislabeled it 'GPU ID'
        die "Fail to create a Compute Instance on GPU instance ID $gi_id";
    }
    # Interpolate the real ids: the sigils were escaped ("\$gi_id") before,
    # which logged the literal variable names instead of their values
    record_info("CI created", "GI_ID: $gi_id, CI_ID: $ci_id");
    return $ci_id;
}

sub create_vgpu {
    # Create a vGPU (mdev) device on a randomly chosen virtual function of
    # the given GPU, picking a random vGPU type that still has free instances.
    # $gpu: PCI address of the physical GPU
    # $mig_mode: truthy when the card runs in MIG mode (e.g. A30/A100)
    # Returns the UUID of the newly created mdev device.
    my ($gpu, $mig_mode) = @_;

    # MIG-capable cards expose 8 VFs; pure SR-IOV cards (e.g. A10) expose 32
    my $vf_count = $mig_mode ? '8' : '32';
    assert_script_run("cd /sys/bus/pci/devices/0000:$gpu/virtfn" . int(rand($vf_count)) . "/mdev_supported_types");
    assert_script_run('for i in *; do echo $i $(cat $i/name) available instance: $(cat $i/avail*); done');
    # A10 (SR-IOV) and A30 (MIG) name their vGPU types differently,
    # so match with a mode-specific pattern
    my $vgpu_type = $mig_mode ? "A.*-.*-.*C(ME)?" : "A.*-.*[CQ]";
    # Collect the type names which still have available instances
    my @avail_types = split /\n/, script_output("for i in *; do [ `cat \$i/avail*` -ne 0 ] && sed -n '/ ${vgpu_type}\$/p' \$i/name; done | cut -d '/' -f1", proceed_on_failure => 1);
    die "No available vGPU types for GPU $gpu!" unless @avail_types;

    # Pick a random available type and instantiate a vGPU of that type
    my $vgpu_type_name = $avail_types[int(rand(scalar @avail_types))];
    my $vgpu_type_id = script_output("grep -l '$vgpu_type_name' */name | cut -d '/' -f1");
    my $vgpu_id = script_output('uuidgen');
    # Writing a uuid to the type's 'create' attribute instantiates the mdev
    assert_script_run("echo $vgpu_id > $vgpu_type_id/create");

    # Verify the vGPU was created successfully
    record_info("vGPU created", "$vgpu_type_name " . script_output("lsmdev"));
    assert_script_run("dmesg | tail");
    assert_script_run("nvidia-smi vgpu -q");

    return $vgpu_id;
}

sub assign_vgpu_to_guest {
    # Attach an existing vGPU (mdev) device to a guest by adding a hostdev
    # section to its libvirt definition, then start the guest and verify the
    # GPU is visible inside it.
    # $uuid: mdev device uuid; $vm: guest name; $slot: guest PCI slot (e.g. '0x0a')
    my ($uuid, $vm, $slot) = @_;

    # The UEFI guest created in virt test is ok for vGPU test
    die "vGPU only works on UEFI guests!" unless $vm =~ /uefi/i;
    assert_script_run("virsh shutdown $vm") unless script_output("virsh domstate $vm") eq "shut off";
    # Save guest xml to /tmp/<vm>.xml, undefine the current definition and
    # re-define it with the vGPU hostdev added
    my $vm_xml_file = "/tmp/$vm.xml";
    assert_script_run "virsh dumpxml --inactive $vm > $vm_xml_file";
    assert_script_run "virsh undefine $vm";

    # Add the vgpu device section to the guest configuration file.
    # Report the actual $slot parameter (the message used to hard-code '0x0a').
    die "PCI slot '$slot' has already been used" if script_run("grep \"slot='$slot'\" $vm_xml_file") == 0;
    my $vgpu_xml_section = "<hostdev mode='subsystem' type='mdev' model='vfio-pci' display='off'>\\\n <source>\\\n <address uuid='$uuid'/>\\\n </source>\\\n <address type='pci' domain='0x0000' bus='0x00' slot='$slot' function='0x0'/>\\\n</hostdev>";

    assert_script_run("sed -i \"/<devices>/a\\\\${vgpu_xml_section}\" $vm_xml_file");
    upload_logs($vm_xml_file);
    assert_script_run "virsh define $vm_xml_file";
    assert_script_run "virsh start $vm";
    wait_guest_online($vm);
    assert_script_run "ssh root\@$vm 'lspci | grep NVIDIA'";
    record_info("vGPU attached to $vm", script_output("ssh root\@$vm 'lspci | grep NVIDIA'"));
}

sub install_vgpu_grid_driver {
    # Install the NVIDIA vGPU grid (guest) driver inside the given vm over ssh.
    # $vm: guest name, reachable as root via ssh
    # $driver_url: http URL of the NVIDIA '.run' installer on the fileserver
    my ($vm, $driver_url) = @_;

    # Derive the installer file name from the download URL
    die "The vGPU driver URL requires to be in 'http://...' format!" unless $driver_url =~ /http:\/\/.*\/(.*\.run)/;
    my $installer = $1;

    # Nothing to do when this very grid driver version is already loaded in the guest
    if (script_run("ssh root\@$vm 'nvidia-smi'") == 0) {
        my $loaded_version = script_output("ssh root\@$vm 'nvidia-smi -q | grep \"Driver Version\" | grep -oE \"[0-9.]+\"'");
        if ($installer =~ /$loaded_version/) {
            record_info("Warning", "vGPU grid driver $loaded_version has already been loaded!", result => 'softfail');
            return;
        }
    }

    # Install the dependencies separately — easier to locate problems this way
    download_script($installer, script_url => $driver_url, machine => $vm);
    assert_script_run "ssh root\@$vm 'zypper -n in kernel-default-devel'";
    assert_script_run "ssh root\@$vm 'zypper -n in libglvnd-devel'";
    # '-s' runs the NVIDIA installer silently, without manual interaction
    assert_script_run("ssh root\@$vm './$installer -s'");
    assert_script_run("ssh root\@$vm 'nvidia-smi'");
    record_info("vGPU Grid driver is installed successfully in $vm");
    # Verify the nvidia kernel modules are loaded in the guest
    assert_script_run("ssh root\@$vm 'lsmod | grep -i nvidia'");
}

sub detach_vgpu_from_guest {
    # Power off the guest, strip the vGPU hostdev at the given PCI slot from
    # its libvirt definition, then boot the guest again.
    my ($pci_slot, $guest) = @_;

    assert_script_run("ssh root\@$guest 'poweroff'");
    # Wait up to 3 retries for 'shut off'; die => 0 keeps going even on timeout
    script_retry("virsh domstate $guest | grep 'shut off'", timeout => 60, delay => 5, retry => 3, die => 0);
    assert_script_run("virt-xml $guest --remove-device --hostdev type=mdev,address.slot=$pci_slot");
    assert_script_run("virsh start $guest");
    record_info("vGPU has been removed successfully from $guest");
    wait_guest_online($guest);
}

sub remove_vgpu {
    # Destroy the mdev (vGPU) device identified by the given uuid on the host.
    my $mdev_uuid = shift;

    # The mdev device's sysfs directory is named after its uuid
    assert_script_run("cd `find /sys/devices -type d -name $mdev_uuid`");
    # Writing 1 to the 'remove' attribute tears the mdev device down
    assert_script_run("echo '1' > remove");
    script_run('cd ../mdev_supported_types/; for i in *; do echo "$i" $(cat $i/name) available: $(cat $i/avail*); done');
    die "Fatal: vGPU device $mdev_uuid is still alive!" if script_run("lsmdev | grep $mdev_uuid") == 0;
    record_info("vGPU removed", "$mdev_uuid has been removed from the host");
}

sub post_fail_hook {
    # On failure: recover the ssh console if needed, collect the NVIDIA
    # installer log and upload it under the vgpu log directory.
    my $self = shift;
    diag("Module vgpu post fail hook starts.");
    my $log_dir = "/tmp/vgpu";
    # NOTE(review): presumably script_run returns undef here when the ssh
    # console is broken/timed out, triggering the reconnect — confirm
    reconnect_when_ssh_console_broken unless defined(script_run("ls /var/log/nvidia-installer.log", die_on_timeout => 0));
    script_run("cp /var/log/nvidia-installer.log $log_dir");
    # TODO(julie): need the following lines??
    # save_network_device_status_logs($log_dir, $_, "post_fail_hook") foreach (keys %virt_autotest::common::guests);
    upload_virt_logs($log_dir, "vgpu");
    # Base-class hook and guest restore are intentionally disabled for debugging
    # $self->SUPER::post_fail_hook;
    # restore_original_guests(); julie: for debug, don't restore guests
}

sub test_flags {
    # Non-fatal module: subsequent tests continue even when this one fails
    # (e.g. after the environment has been restored)
    return {fatal => 0};
}

1;

0 comments on commit f220d7e

Please sign in to comment.