Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
212 changes: 84 additions & 128 deletions libvirt/tests/src/guest_kernel_debugging/guest_kdump.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
import time
import threading
import logging as log

from virttest import utils_kdump
from virttest import utils_test
from virttest import virsh
from virttest.libvirt_xml import vm_xml
from aexpect.exceptions import ShellProcessTerminatedError
from aexpect.exceptions import ShellTimeoutError


# Using as lower capital is not the best way to do, but this is just a
Expand All @@ -32,84 +30,46 @@ def run(test, params, env):
vms = env.get_all_vms()
guest_stress = params.get("guest_stress", "no") == "yes"
guest_upstream_kernel = params.get("guest_upstream_kernel", "no") == "yes"
upstream_kernel_vmlinux = params.get("upstream_kernel_vmlinux")
crash_utility = params.get("crash_utility", "no") == "yes"
setup_kdump = params.get("setup_kdump", "yes") == "yes"
install_missing_packages = params.get("install_missing_packages", "yes") == "yes"
crashkernel_value = params.get("crashkernel_value", "1024M")
stress_time = params.get("stress_time", "30")
crash_dir = params.get("crash_dir", "/var/crash/")
debug_dir = params.get("debug_dir", "/home/")
dump_options = params.get("dump_options", "--memory-only --bypass-cache")
enable_sysrq_cmd = "echo 1 > /proc/sys/kernel/sysrq"
trigger_crash_cmd = "echo c > /proc/sysrq-trigger"

def check_kdump_service(vm):
"""
Check if kdump.service is running
Current supported Distros: rhel, fedora, ubuntu
Current supported Distros: rhel, fedora, ubuntu, sles, suse

param vm: vm object
returns: None
"""
# Set command based on different distros
logging.info("Checking for kdump.service in guest %s" % vm.name)
session = vm.wait_for_login(timeout=240)
distro_details = session.cmd("cat /etc/os-release").lower()
check_kdump_cmd = ""
if "fedora" in distro_details or "rhel" in distro_details:
check_kdump_cmd = "kdumpctl status"
elif "ubuntu" in distro_details:
check_kdump_cmd = "kdump-config status"
else:
test.cancel("Guest distro not supported")

# Check the kdump.service status
check_kdump_status, check_kdump = session.cmd_status_output(check_kdump_cmd)
if check_kdump_status:
logging.debug("Kdump service status: %s" % check_kdump)
test.error("Kdump service is not running in guest %s" % vm.name)
logging.info("Kdump service is up and running:\n%s" % check_kdump)
distro_info = utils_kdump.get_distro_info(session)
session.close()
if distro_info["name"] == "unknown":
test.cancel("Guest distro not supported")
utils_kdump.check_kdump_service(
vm,
distro_info,
crashkernel_value=crashkernel_value if setup_kdump else None,
test=test,
)

def get_vmcores(vm):
"""
Get vm-cores present in the crash directory

param vm: vm object
returns: list of vm-cores
"""
logging.info("Getting vmcores in the guest %s" % vm.name)
session = vm.wait_for_login(timeout=100)
get_vmcores_cmd = "ls " + crash_dir
vmcores = session.cmd(get_vmcores_cmd).split()
session.close()
return vmcores

def check_guest_status(vm):
"""
Check guest domstate. Guest should be running at all times.
Get vmcore file paths present in the crash directory.

param vm: vm object
returns:
1. 0 if guest is running
2. 1 if guest is not running
"""
logging.info("Checking domstate of guest %s" % vm.name)
if vm.state() != "running":
logging.debug("Domain is not running: %s" % vm.state())
return 1
return 0

def trigger_crash(vm, session):
returns: list of vmcore file paths
"""
Trigger sysrq triggered crash in guest

param vm: vm object
param session: session object
returns: None
"""
logging.info("Triggering sysrq crash in guest %s" % vm.name)
try:
session.cmd(trigger_crash_cmd)
except ShellProcessTerminatedError:
time.sleep(120)
session.close()
return utils_kdump.get_vmcores(vm, crash_dir=crash_dir, test=test)

def load_guest_stress(vms):
"""
Expand Down Expand Up @@ -152,67 +112,28 @@ def virsh_dump(failed_vms):
else:
logging.debug("Cannot dump %s as it is in shut off state" % vm.name)

def crash_utility_tool(vm, vmcore):
def crash_utility_tool(vm, vmcore_file):
"""
Check the working of crash utility tool to analyse the guest dump
Current supported Distros: rhel, fedora, ubuntu
Current supported Distros: rhel, fedora, ubuntu, sles, suse

param vm: vm object
param vm-core: guest vm-core file
param vmcore_file: guest vmcore file path
returns: None
"""
logging.info("Debugging %s vmcore using crash utility" % vm.name)
session = vm.wait_for_login(timeout=100)
debug_libraries = []
vmcore_file = crash_dir + vmcore + "/vmcore"
distro_details = session.cmd("cat /etc/os-release").lower()
guest_kernel = session.cmd("uname -r").strip()

# Get required debug libraries based on distros
if "fedora" in distro_details or "rhel" in distro_details:
debug_libraries = ["*kexec-tools*", "*elfutils*", "*crash*", "*kdump-utils*"]
if not guest_upstream_kernel:
debug_libraries.append("*kernel-debuginfo*")
elif "ubuntu" in distro_details:
debug_libraries = ["*linux-crashdump*", "*kdump-tools*", "*crash*", "*elfutils*"]
else:
test.cancel("Guest distro not supported")

# Check if required debug libraries are installed
not_installed_libs = set()
for library in debug_libraries:
get_library_cmd = "rpm -qa " + library
output = session.cmd(get_library_cmd).split()
if not output:
not_installed_libs.add(library)
if not_installed_libs:
test.error("Debug libraries not installed in %s: %s" % (vm.name, not_installed_libs))
session = vm.wait_for_login(timeout=240)
distro_info = utils_kdump.get_distro_info(session)
session.close()

# Get debug kernel location based on distros
if "fedora" in distro_details or "rhel" in distro_details:
vmlinux = "/usr/lib/debug/lib/modules/" + guest_kernel + "/vmlinux"
elif "ubuntu" in distro_details:
vmlinux = "/usr/lib/debug/boot/vmlinux-" + guest_kernel
else:
if distro_info["name"] == "unknown":
test.cancel("Guest distro not supported")
if guest_upstream_kernel:
vmlinux = params.get("upstream_kernel_vmlinux", vmlinux)

# Run the crash utility tool
session = vm.wait_for_login(timeout=100)
crash_cmd = f"crash {vmlinux} {vmcore_file}"
crash_log = ""
logging.debug("Crash command: %s", crash_cmd)
try:
session.cmd(crash_cmd)
except ShellTimeoutError:
crash_log = session.get_output()
session.close()

logging.debug("Crash utility output: %s" % crash_log)
if "PID" not in crash_log or "crash>" not in crash_log:
test.fail("Failed to debug %s vmcore using crash utiility" % vm.name)
utils_kdump.analyze_vmcore_with_crash(
vm,
distro_info,
vmcore_file,
upstream_kernel=guest_upstream_kernel,
vmlinux_path=upstream_kernel_vmlinux,
test=test,
)

# Declaring variables before starting test
failed_vms = set()
Expand All @@ -228,6 +149,30 @@ def crash_utility_tool(vm, vmcore):
vmxml.sync()
vm.start()

# Setup kdump packages/configuration before validation
if setup_kdump:
for vm in vms:
session = vm.wait_for_login(timeout=240)
distro_info = utils_kdump.get_distro_info(session)
session.close()
if distro_info["name"] == "unknown":
test.cancel("Guest distro not supported")
packages_installed = utils_kdump.ensure_kdump_packages(
vm,
distro_info,
install_missing=install_missing_packages,
test=test,
)
if packages_installed:
logging.info("Rebooting %s after kdump package installation", vm.name)
vm.reboot(session=None, timeout=240)
utils_kdump.configure_kdump(
vm,
distro_info,
crashkernel_value=crashkernel_value,
test=test,
)

# Check for kdump service if it is operational
for vm in vms:
check_kdump_service(vm)
Expand All @@ -246,29 +191,26 @@ def crash_utility_tool(vm, vmcore):
time.sleep(int(stress_time))

for vm in vms:
if check_guest_status(vm):
if utils_kdump.check_guest_status(vm):
failed_vms.add(vm.name)
virsh_dump_vms.add(vm)
if failed_vms:
virsh_dump(virsh_dump_vms)
test.fail("Guest %s not running after running stress" % failed_vms)

# Trigger crash in guests in parallel
kdump_threads = []
for vm in vms:
kdump_threads.append(
threading.Thread(target=trigger_crash, args=(vm, vm.wait_for_login(timeout=100)))
)
time.sleep(20)
for kdump_thread in kdump_threads:
kdump_thread.start()
for kdump_thread in kdump_threads:
kdump_thread.join()
utils_kdump.trigger_crash_parallel(
vms,
enable_sysrq_cmd=enable_sysrq_cmd,
trigger_crash_cmd=trigger_crash_cmd,
wait_time=120,
test=test,
)

# Check guest status after crash
for vm in vms:
try:
if check_guest_status(vm):
if utils_kdump.check_guest_status(vm):
raise Exception("Guest %s not running after triggering crash" % vm.name)
session = vm.wait_for_login(timeout=240)
logging.info("Able to login into %s" % vm.name)
Expand All @@ -283,23 +225,37 @@ def crash_utility_tool(vm, vmcore):

# Check for the vm-cores in guests after crash
post_vmcores = {}
new_vmcores = {}
for vm in vms:
post_vmcores[vm.name] = get_vmcores(vm)
logging.info("%s vmcores after crash: %s" % (vm.name, post_vmcores[vm.name]))
new_vmcores[vm.name] = sorted(
list(set(post_vmcores[vm.name]) - set(pre_vmcores[vm.name]))
)

# Check if vm-core got generated after crash in guests
for vm in vms:
if len(post_vmcores[vm.name]) <= len(pre_vmcores[vm.name]):
if not new_vmcores[vm.name]:
failed_vms.add(vm.name)
if failed_vms:
test.fail("vmcore not generated in %s" % failed_vms)

# Debug vm-core using crash utility tool
if crash_utility:
for vm in vms:
for vmcore_file in pre_vmcores[vm.name]:
post_vmcores[vm.name].remove(vmcore_file)
crash_utility_tool(vm, post_vmcores[vm.name][-1])
session = vm.wait_for_login(timeout=240)
distro_info = utils_kdump.get_distro_info(session)
session.close()
if distro_info["name"] == "unknown":
test.cancel("Guest distro not supported")
utils_kdump.ensure_crash_utility_packages(
vm,
distro_info,
install_missing=install_missing_packages,
upstream_kernel=guest_upstream_kernel,
test=test,
)
crash_utility_tool(vm, new_vmcores[vm.name][-1])

# Unload the stress app in guests
if guest_stress:
Expand Down
56 changes: 56 additions & 0 deletions libvirt/tests/src/libvirt_mem.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,60 @@ def restore_hugepages(page_size=4):
config.restore()
utils_libvirtd.libvirtd_restart()

def setup_vhost_max_mem_regions():
"""
Configure vhost max_mem_regions for SLES systems.
SLES has lower default values compared to other distros.
This is required for memory hotplug tests with max_slots.
Must be called when no VM is using vhost modules.
This function applies runtime configuration only.
For persistent configuration across reboots, manually create:
/etc/modprobe.d/vhost.conf with content: options vhost max_mem_regions=512
"""
try:
# Check if running on SLES
distro_check = process.run("cat /etc/os-release | grep -i sles",
shell=True, ignore_status=True)
if distro_check.exit_status == 0:
logging.info("SLES detected, configuring vhost max_mem_regions=512")
vm_check = process.run("virsh list --state-running --name",
shell=True, ignore_status=True)
if vm_check.exit_status != 0:
logging.warning("Unable to determine running VMs; skipping vhost configuration")
return
running_vms = [name for name in vm_check.stdout_text.splitlines() if name.strip()]
if running_vms:
logging.warning("Running VMs detected, skipping vhost configuration for safety:")
logging.warning("%s", "\n".join(running_vms))
return

logging.info("No running VMs detected, proceeding with vhost configuration")
process.run("modprobe -r vhost_net vhost_scsi vhost_vsock vhost",
shell=True, ignore_status=True)
result = process.run("modprobe vhost max_mem_regions=512",
shell=True, ignore_status=True)

if result.exit_status == 0:
logging.info("Successfully configured vhost max_mem_regions=512 at runtime")
verify_cmd = "cat /sys/module/vhost/parameters/max_mem_regions"
verify_result = process.run(verify_cmd, shell=True, ignore_status=True)
if verify_result.exit_status == 0:
current_value = verify_result.stdout_text.strip()
logging.info("Verified vhost max_mem_regions is set to: %s", current_value)
if current_value != "512":
test.cancel("vhost max_mem_regions verification failed: expected 512, got %s"
% current_value)
else:
logging.debug("Could not verify vhost max_mem_regions setting")
else:
test.cancel("Failed to configure vhost at runtime: %s" % result.stderr_text)
else:
logging.debug("Non-SLES system detected, skipping vhost configuration")

except Exception as e:
test.cancel("Error setting up vhost max_mem_regions: %s" % str(e))


def check_qemu_cmd(max_mem_rt, tg_size):
"""
Check qemu command line options.
Expand Down Expand Up @@ -528,6 +582,8 @@ def modify_domain_xml():
# Destroy domain first
if vm.is_alive():
vm.destroy(gracefully=False)
# Configure vhost max_mem_regions for SLES (VM is now stopped)
setup_vhost_max_mem_regions()
modify_domain_xml()
numa_info = utils_misc.NumaInfo()
logging.debug(numa_info.get_all_node_meminfo())
Expand Down