-
Notifications
You must be signed in to change notification settings - Fork 280
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
338 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,334 @@ | ||
# SUSE's openQA tests | ||
# Copyright 2020 SUSE LLC | ||
# SPDX-License-Identifier: GPL-2.0-or-later | ||
# | ||
# Summary: A test to pass an NVIDIA vGPU to guest via libvirt. Only KVM hosts are supported. | ||
# Test environment: one AMPERE GPU card, such as A10, A30, etc., with SR-IOV capability and/or MIG features in the host machine;
# A UEFI guest vm is defined in the host, and they are ssh accessible from host. | ||
# Test flow: | ||
# - install the NVIDIA vGPU manager on host and reboot | ||
# - create a GPU Instance and a Compute Instance if MIG feature is available | ||
# - create a vGPU device in host | ||
# - assign the vGPU device to guest vm | ||
# - install the vGPU driver in guest vm | ||
# - detach vGPU from guest and remove the vGPU | ||
# Maintainer: Julie CAO <[email protected]> | ||
|
||
use base "virt_feature_test_base"; | ||
use strict; | ||
use warnings; | ||
use utils; | ||
use testapi; | ||
use virt_autotest::common; | ||
use version_utils qw(is_sle); | ||
use virt_autotest::utils; | ||
use Utils::Backends qw(use_ssh_serial_console is_remote_backend); | ||
use ipmi_backend_utils qw(reconnect_when_ssh_console_broken); | ||
|
||
sub run_test {
    my $self = shift;

    # Switch to the sol & ssh consoles explicitly so this module also works
    # on an already-installed host where login_console.pm was never loaded
    select_console 'sol', await_console => 0;
    use_ssh_serial_console;

    # Bail out early unless the host fulfills the test prerequisites
    return unless is_host_ready_for_vgpu_test() eq 'yes';

    my $gpu_device = get_gpu_device();
    die "No NVIDIA AMPERE GPU card on the machine!" if $gpu_device eq '';

    # Start from a clean log directory
    my $log_dir = "/tmp/vgpu";
    script_run "[ -d $log_dir ] && rm -rf $log_dir; mkdir -p $log_dir";

    # Install the NVIDIA vGPU manager on the host (this reboots the machine)
    my $vgpu_manager = get_required_var("VGPU_MANAGER_URL");
    install_vgpu_manager_and_reboot($vgpu_manager);

    # Turn on the SR-IOV virtual functions of the GPU
    enable_sriov($gpu_device);

    # On MIG-capable cards (such as A30/A100) create a GPU Instance and a
    # Compute Instance first, provided MIG mode is requested.
    # Non-MIG mode will be handled once zen2 hardware is available.
    if (get_var('GPU_MIG_MODE')) {
        enable_mig_mode();
        # Create a GPU Instance (GI)
        my ($gi_id, $gpu_id) = create_gpu_instance();
        # Create a Compute Instance (CI) on top of it
        my $ci_id = create_compute_instance($gi_id, $gpu_id);
    }

    # Create a mediated vGPU device on the host
    my $vgpu = create_vgpu($gpu_device, get_var('GPU_MIG_MODE'));

    save_original_guest_xmls();
    my $vgpu_grid_driver = get_required_var("VGPU_GRID_DRIVER_URL");
    for my $guest (keys %virt_autotest::common::guests) {
        record_info("Guest $guest");

        # Assign the vGPU to the guest at a fixed PCI slot
        my $gpu_slot_in_guest = "0x0a";
        check_guest_health($guest);    # may remove it later?
        assign_vgpu_to_guest($vgpu, $guest, $gpu_slot_in_guest);
        check_guest_health($guest);    # may remove it later?

        # Install the vGPU grid driver inside the guest
        install_vgpu_grid_driver($guest, $vgpu_grid_driver);

        # Detach the vGPU from the guest again
        detach_vgpu_from_guest($gpu_slot_in_guest, $guest);
        check_guest_health($guest);
    }

    # Remove the vGPU device from the host
    remove_vgpu($vgpu);

    # Disable GPU SR-IOV
    assert_script_run("/usr/lib/nvidia/sriov-manage -d $gpu_device");
    record_info("SRIOV disabled", $gpu_device);

    # Upload vgpu related logs (none collected so far)
    # upload_virt_logs($log_dir, "logs");

    # Redefine the guests from their original configuration files
    restore_original_guests();
}
|
||
# Check the test prerequisites; returns the string 'yes' when the host is
# usable for the vGPU test, otherwise records a softfail and returns nothing.
sub is_host_ready_for_vgpu_test {
    # NVIDIA vGPU requires a KVM hypervisor on SLE 15-SP3 or newer
    unless (!is_sle('<15-SP3') and is_kvm_host) {
        record_info("Host not supported!", "NVIDIA vGPU is only supported on KVM hosts with SLE15SP3 and greater!", result => 'softfail');
        return;
    }
    # On Intel x86_64 machines VT-d (IOMMU) must be active
    if (script_run("grep -m 1 Intel /proc/cpuinfo") == 0) {
        assert_script_run "dmesg | grep -E \"DMAR:.*IOMMU enabled\"";
    }
    # Informational only: show whether vfio/nvidia/mdev modules are loaded
    script_run("lsmod | grep -e vfio -e nvidia -e mdev");
    return 'yes';
}
|
||
# Find an NVIDIA AMPERE GPU with SR-IOV capability on the host.
# Returns the PCI address of the first matching device, or an empty string
# when none is found (the caller compares the result against '').
sub get_gpu_device {
    my $gpu_devices = script_output("lspci | grep NVIDIA | cut -d ' ' -f1");
    foreach my $pci_addr (split("\n", $gpu_devices)) {
        return $pci_addr if script_run("lspci -v -s $pci_addr | grep 'SR-IOV'") == 0;
    }
    # Bug fix: previously fell off the end returning undef, which triggered
    # an 'uninitialized value' warning in the caller's `eq ''` comparison
    return '';
}
|
||
# Download and install the NVIDIA vGPU manager on the host, then reboot and
# verify the driver is loaded. Skips the installation when the same driver
# version is already active.
# Parameter: $driver_url - http URL of the .run installer file
sub install_vgpu_manager_and_reboot {
    my $driver_url = shift;    # fix: variable was misspelled 'dirver_url'
    die "The vGPU driver URL requires to be in 'http://...' format!" unless $driver_url =~ /http:\/\/.*\/(.*\.run)/;
    my $driver_file = $1;

    # Check if vGPU manager has been already loaded
    if (script_run("nvidia-smi") == 0) {
        my $driver_version = script_output("nvidia-smi -q | grep 'Driver Version' | grep -oE \"[0-9.]+\"");
        if ($driver_file =~ /$driver_version/) {
            record_info("Warning", "vGPU manager $driver_version has already been loaded!", result => 'softfail');
            return;
        }
    }
    # Install vGPU manager
    download_script($driver_file, script_url => $driver_url);
    zypper_call "in kernel-default-devel gcc";
    # vfio_pci_core is a separate module since 15-SP5 and needed by the installer
    assert_script_run("modprobe vfio_pci_core") if is_sle('>=15-SP5');
    assert_script_run("./$driver_file -s");
    record_info("vGPU manager is installed successfully.");

    # Reboot host (it took 180+ seconds for ph052 to boot up with calltrace)
    script_run("grub2-once 0; reboot", die_on_timeout => 0);
    record_info("Host rebooting ...");
    select_console 'sol', await_console => 0;
    save_screenshot;
    reset_consoles();
    assert_screen([qw(sol-console-wait-typing-ret linux-login text-login)], 120);
    if (match_has_tag('sol-console-wait-typing-ret')) {
        send_key 'ret';
        # Bug fix: needle name was misspelled 'inux-login'
        assert_screen([qw(linux-login text-login)], 120);
    }
    reset_consoles();
    select_console('root-ssh');

    # Verify vGPU manager installed successfully
    assert_script_run("lspci -d 10de: -k");
    assert_script_run("lsmod | grep nvidia");
    assert_script_run("nvidia-smi");
    script_run("ls -ld /sys/class/m*");
}
|
||
# Enable the SR-IOV virtual functions of the given GPU (PCI address).
sub enable_sriov {
    my $gpu = shift;
    # sriov-manage -e creates the virtual functions for this GPU
    assert_script_run("/usr/lib/nvidia/sriov-manage -e $gpu");
    script_run("dmesg | tail");
    # The virtfnN links must now exist below the PCI device directory
    assert_script_run("ls -l /sys/bus/pci/devices/0000:$gpu/ | grep virtfn");
    assert_script_run("lspci | grep NVIDIA");
    record_info("SR-IOV enabled");
}
|
||
# Enable Multi-Instance-GPU (MIG) mode on the card and verify it took effect.
sub enable_mig_mode {
    # Enable MIG mode
    assert_script_run("nvidia-smi -mig 1");
    # Verify MIG mode is enabled on GPU 0
    assert_script_run("nvidia-smi -i 0 --query-gpu=pci.bus_id,mig.mode.current --format=csv | grep 'Enabled'");
    # Bug fix: info message had a typo ('enalbed')
    record_info("Mig Mode is enabled!");
}
|
||
# Create a GPU Instance (GI) from a randomly chosen profile that still has
# free instances on this GPU model. Profiles with '0/x' free instances or
# with media extensions ('+me') are excluded.
# Returns the (GI id, GPU id) pair reported by nvidia-smi.
sub create_gpu_instance {
    script_run("nvidia-smi mig -lgip");
    my $avail_gi_profile_cmd = "nvidia-smi mig -lgip | grep MIG | grep -v '0/' | grep -v '+me'";
    die "No available GI profiles!" unless script_run($avail_gi_profile_cmd) == 0;
    my @gi_profile_ids = split '\n', script_output("$avail_gi_profile_cmd | awk '{ print \$5 }'");
    # Pick one of the candidate profiles at random
    my $gi_profile = $gi_profile_ids[int(rand(scalar @gi_profile_ids))];
    my ($gi_id, $gpu_id);
    if (script_output("nvidia-smi mig -cgi $gi_profile") =~ /Successfully created GPU instance ID\s+(\d*) on GPU\s+(\d*)/) {
        ($gi_id, $gpu_id) = ($1, $2);
        assert_script_run("nvidia-smi mig -lgi");
    }
    else {
        die "Fail to create a GPU Instance!";
    }
    # Bug fix: the ids were escaped (\$gpu_id) and logged literally instead
    # of interpolating the actual values
    record_info("GI created", "GPU_ID: $gpu_id, GI_ID: $gi_id");

    return ($gi_id, $gpu_id);
}
|
||
# Create a Compute Instance (CI) that fully occupies the given GPU Instance.
# Parameter: $gi_id - id of the GPU Instance to place the CI on
# Returns the CI id reported by nvidia-smi.
sub create_compute_instance {
    my $gi_id = shift;

    # Create a CI to fully use a GI
    my $ci_id;
    if (script_output("nvidia-smi mig -cci -gi $gi_id") =~ /Successfully created compute instance ID\s+(\d*).*GPU instance ID\s+$gi_id/) {
        $ci_id = $1;
        assert_script_run("nvidia-smi mig -lci");
        script_run("nvidia-smi");
    }
    else {
        # Bug fix: $gi_id is a GPU *instance* id, the old message called it a GPU id
        die "Fail to create a Compute Instance on GPU instance ID $gi_id";
    }
    # Bug fix: the ids were escaped (\$gi_id) and logged literally instead
    # of interpolating the actual values
    record_info("CI created", "GI_ID: $gi_id, CI_ID: $ci_id");
    return $ci_id;
}
|
||
# Create a mediated vGPU device on a random virtual function of the GPU.
# Parameters: $gpu      - PCI address of the physical GPU
#             $mig_mode - truthy when MIG mode is in use (A30-style cards)
# Returns the uuid of the newly created vGPU device.
sub create_vgpu {
    my ($gpu, $mig_mode) = @_;

    # A GPU exposes 32 VFs in plain SR-IOV mode but only 8 with MIG enabled
    my $vf_count = $mig_mode ? '8' : '32';
    assert_script_run("cd /sys/bus/pci/devices/0000:$gpu/virtfn" . int(rand($vf_count)) . "/mdev_supported_types");
    assert_script_run('for i in *; do echo $i $(cat $i/name) available instance: $(cat $i/avail*); done');
    # A10 (SR-IOV) and A30 (MIG) cards expose differently named vGPU types,
    # so the name pattern depends on the mode
    my $vgpu_type = $mig_mode ? "A.*-.*-.*C(ME)?" : "A.*-.*[CQ]";
    # Only types which still have available instances are candidates
    my @avail_types = split /\n/, script_output("for i in *; do [ `cat \$i/avail*` -ne 0 ] && sed -n '/ ${vgpu_type}\$/p' \$i/name; done | cut -d '/' -f1", proceed_on_failure => 1);
    # Idiom fix: plain @avail_types instead of the odd @{avail_types} form
    die "No available vGPU types for GPU $gpu!" if @avail_types == 0;

    # Choose a random vgpu type and create a vGPU
    # Idiom fix: plain array element access instead of ${avail_types [...]}
    my $vgpu_type_name = $avail_types[int(rand(scalar @avail_types))];
    my $vgpu_type_id = script_output("grep -l '${vgpu_type_name}' */name | cut -d '/' -f1");
    my $vgpu_id = script_output('uuidgen');
    # Writing a uuid to the 'create' node instantiates the mdev device
    assert_script_run("echo ${vgpu_id} > ${vgpu_type_id}/create");

    # Verify if vGPU created successfully
    record_info("vGPU created", "$vgpu_type_name " . script_output("lsmdev"));
    assert_script_run("dmesg | tail");
    assert_script_run("nvidia-smi vgpu -q");

    return $vgpu_id;
}
|
||
# Attach the mdev vGPU device to a (shut-off) UEFI guest at the given PCI slot
# by editing the guest XML, then boot the guest and verify the device shows up.
# Parameters: $uuid - uuid of the vGPU mdev device
#             $vm   - guest name as known to libvirt
#             $slot - PCI slot for the hostdev entry, e.g. '0x0a'
sub assign_vgpu_to_guest {
    my ($uuid, $vm, $slot) = @_;

    # The UEFI guest created in virt test is ok for vGPU test
    die "vGPU only works on UEFI guests!" unless $vm =~ /uefi/i;
    assert_script_run("virsh shutdown $vm") unless script_output("virsh domstate $vm") eq "shut off";
    # Save guest xml to /tmp/vm.xml, undefine the current one and define with the new xml
    my $vm_xml_file = "/tmp/$vm.xml";
    assert_script_run "virsh dumpxml --inactive $vm > $vm_xml_file";
    assert_script_run "virsh undefine $vm";

    # Add the vgpu device section to guest configuration file.
    # Bug fix: the error message hardcoded slot '0x0a' instead of using $slot
    die "PCI slot '$slot' has already been used" if script_run("grep \"slot='$slot'\" $vm_xml_file") == 0;
    my $vgpu_xml_section = "<hostdev mode='subsystem' type='mdev' model='vfio-pci' display='off'>\\\n <source>\\\n <address uuid='$uuid'/>\\\n </source>\\\n <address type='pci' domain='0x0000' bus='0x00' slot='$slot' function='0x0'/>\\\n</hostdev>";

    assert_script_run("sed -i \"/<devices>/a\\\\${vgpu_xml_section}\" $vm_xml_file");
    upload_logs($vm_xml_file);
    assert_script_run "virsh define $vm_xml_file";
    assert_script_run "virsh start $vm";
    wait_guest_online($vm);
    assert_script_run "ssh root\@$vm 'lspci | grep NVIDIA'";
    record_info("vGPU attached to $vm", script_output("ssh root\@$vm 'lspci | grep NVIDIA'"));
}
|
||
# Install the NVIDIA vGPU grid driver inside a guest over ssh, unless the
# same driver version is already loaded there.
# Parameters: $vm         - guest name (ssh reachable as root@$vm)
#             $driver_url - http URL of the .run installer file
sub install_vgpu_grid_driver {
    my ($vm, $driver_url) = @_;

    # Derive the installer file name from the download URL
    die "The vGPU driver URL requires to be in 'http://...' format!" unless $driver_url =~ /http:\/\/.*\/(.*\.run)/;
    my $driver_file = $1;

    # Nothing to do when this exact grid driver is already active in the guest
    if (script_run("ssh root\@$vm 'nvidia-smi'") == 0) {
        my $grid_version = script_output("ssh root\@$vm 'nvidia-smi -q | grep \"Driver Version\" | grep -oE \"[0-9.]+\"'");
        if ($driver_file =~ /$grid_version/) {
            record_info("Warning", "vGPU grid driver $grid_version has already been loaded!", result => 'softfail');
            return;
        }
    }

    # Install the dependencies separately; easier to pinpoint a failure this way
    download_script($driver_file, script_url => $driver_url, machine => $vm);
    assert_script_run "ssh root\@$vm 'zypper -n in kernel-default-devel'";
    assert_script_run "ssh root\@$vm 'zypper -n in libglvnd-devel'";
    # '-s' runs the NVIDIA installer silently without manual interaction
    assert_script_run("ssh root\@$vm './$driver_file -s'");
    assert_script_run("ssh root\@$vm 'nvidia-smi'");
    record_info("vGPU Grid driver is installed successfully in $vm");
    # Verify the kernel modules are loaded
    assert_script_run("ssh root\@$vm 'lsmod | grep -i nvidia'");
}
|
||
# Detach the vGPU hostdev sitting in the given PCI slot from a guest,
# then boot the guest again and wait for it to come back online.
sub detach_vgpu_from_guest {
    my ($slot, $vm) = @_;
    # Shut the guest down gracefully and wait until libvirt reports it off
    assert_script_run("ssh root\@$vm 'poweroff'");
    script_retry("virsh domstate $vm | grep 'shut off'", timeout => 60, delay => 5, retry => 3, die => 0);
    # Drop the mdev hostdev entry at the given slot from the guest definition
    assert_script_run("virt-xml $vm --remove-device --hostdev type=mdev,address.slot=$slot");
    assert_script_run("virsh start $vm");
    record_info("vGPU has been removed successfully from $vm");
    wait_guest_online($vm);
}
|
||
# Destroy the mdev vGPU device identified by its uuid and verify it is gone.
sub remove_vgpu {
    my $uuid = shift;
    # The mdev device directory under /sys/devices is named after the uuid
    assert_script_run("cd `find /sys/devices -type d -name $uuid`");
    # Writing 1 to the 'remove' node destroys the mediated device
    assert_script_run("echo '1' > remove");
    # Informational: list the remaining capacity per supported type
    script_run('cd ../mdev_supported_types/; for i in *; do echo "$i" $(cat $i/name) available: $(cat $i/avail*); done');
    die "Fatal: vGPU device $uuid is still alive!" if script_run("lsmdev | grep $uuid") == 0;
    record_info("vGPU removed", "$uuid has been removed from the host");
}
|
||
# Collect the NVIDIA installer log on failure and upload it.
sub post_fail_hook {
    my $self = shift;
    diag("Module vgpu post fail hook starts.");
    my $log_dir = "/tmp/vgpu";
    # The ssh console may have broken together with the host; re-establish
    # it before trying to read the installer log
    reconnect_when_ssh_console_broken unless defined(script_run("ls /var/log/nvidia-installer.log", die_on_timeout => 0));
    script_run("cp /var/log/nvidia-installer.log $log_dir");
    # TODO(julie): decide whether per-guest network device status logs are needed
    # save_network_device_status_logs($log_dir, $_, "post_fail_hook") foreach (keys %virt_autotest::common::guests);
    upload_virt_logs($log_dir, "vgpu");
    # Deliberately not calling SUPER::post_fail_hook or restore_original_guests()
    # so the failed setup is kept around for debugging
    # $self->SUPER::post_fail_hook;
    # restore_original_guests();
}
|
||
sub test_flags {
    # Non-fatal: the subsequent test modules continue even if this one fails
    return {fatal => 0};
}
|
||
1; |