diff --git a/libvirt/tests/src/guest_kernel_debugging/guest_kdump.py b/libvirt/tests/src/guest_kernel_debugging/guest_kdump.py index c64e5a8e9b..ece42740e5 100644 --- a/libvirt/tests/src/guest_kernel_debugging/guest_kdump.py +++ b/libvirt/tests/src/guest_kernel_debugging/guest_kdump.py @@ -1,12 +1,10 @@ import time -import threading import logging as log +from virttest import utils_kdump from virttest import utils_test from virttest import virsh from virttest.libvirt_xml import vm_xml -from aexpect.exceptions import ShellProcessTerminatedError -from aexpect.exceptions import ShellTimeoutError # Using as lower capital is not the best way to do, but this is just a @@ -32,84 +30,46 @@ def run(test, params, env): vms = env.get_all_vms() guest_stress = params.get("guest_stress", "no") == "yes" guest_upstream_kernel = params.get("guest_upstream_kernel", "no") == "yes" + upstream_kernel_vmlinux = params.get("upstream_kernel_vmlinux") crash_utility = params.get("crash_utility", "no") == "yes" + setup_kdump = params.get("setup_kdump", "yes") == "yes" + install_missing_packages = params.get("install_missing_packages", "yes") == "yes" + crashkernel_value = params.get("crashkernel_value", "1024M") stress_time = params.get("stress_time", "30") crash_dir = params.get("crash_dir", "/var/crash/") debug_dir = params.get("debug_dir", "/home/") dump_options = params.get("dump_options", "--memory-only --bypass-cache") + enable_sysrq_cmd = "echo 1 > /proc/sys/kernel/sysrq" trigger_crash_cmd = "echo c > /proc/sysrq-trigger" def check_kdump_service(vm): """ Check if kdump.service is running - Current supported Distros: rhel, fedora, ubuntu + Current supported Distros: rhel, fedora, ubuntu, sles, suse param vm: vm object returns: None """ - # Set command based on different distros - logging.info("Checking for kdump.service in guest %s" % vm.name) session = vm.wait_for_login(timeout=240) - distro_details = session.cmd("cat /etc/os-release").lower() - check_kdump_cmd = "" - if "fedora" in distro_details or "rhel" in distro_details: - check_kdump_cmd = "kdumpctl status" - elif "ubuntu" in distro_details: - check_kdump_cmd = "kdump-config status" - else: - test.cancel("Guest distro not supported") - - # Check the kdump.service status - check_kdump_status, check_kdump = session.cmd_status_output(check_kdump_cmd) - if check_kdump_status: - logging.debug("Kdump service status: %s" % check_kdump) - test.error("Kdump service is not running in guest %s" % vm.name) - logging.info("Kdump service is up and running:\n%s" % check_kdump) + distro_info = utils_kdump.get_distro_info(session) session.close() + if distro_info["name"] == "unknown": + test.cancel("Guest distro not supported") + utils_kdump.check_kdump_service( + vm, + distro_info, + crashkernel_value=crashkernel_value if setup_kdump else None, + test=test, + ) def get_vmcores(vm): """ - Get vm-cores present in the crash directory - - param vm: vm object - returns: list of vm-cores - """ - logging.info("Getting vmcores in the guest %s" % vm.name) - session = vm.wait_for_login(timeout=100) - get_vmcores_cmd = "ls " + crash_dir - vmcores = session.cmd(get_vmcores_cmd).split() - session.close() - return vmcores - - def check_guest_status(vm): - """ - Check guest domstate. Guest should be running at all times. + Get vmcore file paths present in the crash directory. param vm: vm object - returns: - 1. 0 if guest is running - 2. 1 if guest is not running - """ - logging.info("Checking domstate of guest %s" % vm.name) - if vm.state() != "running": - logging.debug("Domain is not running: %s" % vm.state()) - return 1 - return 0 - - def trigger_crash(vm, session): + returns: list of vmcore file paths """ - Trigger sysrq triggered crash in guest - - param vm: vm object - param session: session object - returns: None - """ - logging.info("Triggering sysrq crash in guest %s" % vm.name) - try: - session.cmd(trigger_crash_cmd) - except ShellProcessTerminatedError: - time.sleep(120) - session.close() + return utils_kdump.get_vmcores(vm, crash_dir=crash_dir, test=test) def load_guest_stress(vms): """ @@ -152,67 +112,28 @@ def virsh_dump(failed_vms): else: logging.debug("Cannot dump %s as it is in shut off state" % vm.name) - def crash_utility_tool(vm, vmcore): + def crash_utility_tool(vm, vmcore_file): """ Check the working of crash utility tool to analyse the guest dump - Current supported Distros: rhel, fedora, ubuntu + Current supported Distros: rhel, fedora, ubuntu, sles, suse param vm: vm object - param vm-core: guest vm-core file + param vmcore_file: guest vmcore file path returns: None """ - logging.info("Debugging %s vmcore using crash utility" % vm.name) - session = vm.wait_for_login(timeout=100) - debug_libraries = [] - vmcore_file = crash_dir + vmcore + "/vmcore" - distro_details = session.cmd("cat /etc/os-release").lower() - guest_kernel = session.cmd("uname -r").strip() - - # Get required debug libraries based on distros - if "fedora" in distro_details or "rhel" in distro_details: - debug_libraries = ["*kexec-tools*", "*elfutils*", "*crash*", "*kdump-utils*"] - if not guest_upstream_kernel: - debug_libraries.append("*kernel-debuginfo*") - elif "ubuntu" in distro_details: - debug_libraries = ["*linux-crashdump*", "*kdump-tools*", "*crash*", "*elfutils*"] - else: - test.cancel("Guest distro not supported") - - # Check if required debug libraries are installed - not_installed_libs = set() - for library in debug_libraries: - get_library_cmd = "rpm -qa " + library - output = session.cmd(get_library_cmd).split() - if not output: - not_installed_libs.add(library) - if not_installed_libs: - test.error("Debug libraries not installed in %s: %s" % (vm.name, not_installed_libs)) + session = vm.wait_for_login(timeout=240) + distro_info = utils_kdump.get_distro_info(session) session.close() - - # Get debug kernel location based on distros - if "fedora" in distro_details or "rhel" in distro_details: - vmlinux = "/usr/lib/debug/lib/modules/" + guest_kernel + "/vmlinux" - elif "ubuntu" in distro_details: - vmlinux = "/usr/lib/debug/boot/vmlinux-" + guest_kernel - else: + if distro_info["name"] == "unknown": test.cancel("Guest distro not supported") - if guest_upstream_kernel: - vmlinux = params.get("upstream_kernel_vmlinux", vmlinux) - - # Run the crash utility tool - session = vm.wait_for_login(timeout=100) - crash_cmd = f"crash {vmlinux} {vmcore_file}" - crash_log = "" - logging.debug("Crash command: %s", crash_cmd) - try: - session.cmd(crash_cmd) - except ShellTimeoutError: - crash_log = session.get_output() - session.close() - - logging.debug("Crash utility output: %s" % crash_log) - if "PID" not in crash_log or "crash>" not in crash_log: - test.fail("Failed to debug %s vmcore using crash utiility" % vm.name) + utils_kdump.analyze_vmcore_with_crash( + vm, + distro_info, + vmcore_file, + upstream_kernel=guest_upstream_kernel, + vmlinux_path=upstream_kernel_vmlinux, + test=test, + ) # Declaring variables before starting test failed_vms = set() @@ -228,6 +149,30 @@ def crash_utility_tool(vm, vmcore): vmxml.sync() vm.start() + # Setup kdump packages/configuration before validation + if setup_kdump: + for vm in vms: + session = vm.wait_for_login(timeout=240) + distro_info = utils_kdump.get_distro_info(session) + session.close() + if distro_info["name"] == "unknown": + test.cancel("Guest distro not supported") + packages_installed = utils_kdump.ensure_kdump_packages( + vm, + distro_info, + install_missing=install_missing_packages, + test=test, + ) + if packages_installed: + logging.info("Rebooting %s after kdump package installation", vm.name) + vm.reboot(session=None, timeout=240) + utils_kdump.configure_kdump( + vm, + distro_info, + crashkernel_value=crashkernel_value, + test=test, + ) + # Check for kdump service if it is operational for vm in vms: check_kdump_service(vm) @@ -246,7 +191,7 @@ def crash_utility_tool(vm, vmcore): time.sleep(int(stress_time)) for vm in vms: - if check_guest_status(vm): + if utils_kdump.check_guest_status(vm): failed_vms.add(vm.name) virsh_dump_vms.add(vm) if failed_vms: @@ -254,21 +199,18 @@ def crash_utility_tool(vm, vmcore): test.fail("Guest %s not running after running stress" % failed_vms) # Trigger crash in guests in parallel - kdump_threads = [] - for vm in vms: - kdump_threads.append( - threading.Thread(target=trigger_crash, args=(vm, vm.wait_for_login(timeout=100))) - ) - time.sleep(20) - for kdump_thread in kdump_threads: - kdump_thread.start() - for kdump_thread in kdump_threads: - kdump_thread.join() + utils_kdump.trigger_crash_parallel( + vms, + enable_sysrq_cmd=enable_sysrq_cmd, + trigger_crash_cmd=trigger_crash_cmd, + wait_time=120, + test=test, + ) # Check guest status after crash for vm in vms: try: - if check_guest_status(vm): + if utils_kdump.check_guest_status(vm): raise Exception("Guest %s not running after triggering crash" % vm.name) session = vm.wait_for_login(timeout=240) logging.info("Able to login into %s" % vm.name) @@ -283,13 +225,17 @@ def crash_utility_tool(vm, vmcore): # Check for the vm-cores in guests after crash post_vmcores = {} + new_vmcores = {} for vm in vms: post_vmcores[vm.name] = get_vmcores(vm) logging.info("%s vmcores after crash: %s" % (vm.name, post_vmcores[vm.name])) + new_vmcores[vm.name] = sorted( + list(set(post_vmcores[vm.name]) - set(pre_vmcores[vm.name])) + ) # Check if vm-core got generated after crash in guests for vm in vms: - if len(post_vmcores[vm.name]) <= len(pre_vmcores[vm.name]): + if not new_vmcores[vm.name]: failed_vms.add(vm.name) if failed_vms: test.fail("vmcore not generated in %s" % failed_vms) @@ -297,9 +243,19 @@ def crash_utility_tool(vm, vmcore): # Debug vm-core using crash utility tool if crash_utility: for vm in vms: - for vmcore_file in pre_vmcores[vm.name]: - post_vmcores[vm.name].remove(vmcore_file) - crash_utility_tool(vm, post_vmcores[vm.name][-1]) + session = vm.wait_for_login(timeout=240) + distro_info = utils_kdump.get_distro_info(session) + session.close() + if distro_info["name"] == "unknown": + test.cancel("Guest distro not supported") + utils_kdump.ensure_crash_utility_packages( + vm, + distro_info, + install_missing=install_missing_packages, + upstream_kernel=guest_upstream_kernel, + test=test, + ) + crash_utility_tool(vm, new_vmcores[vm.name][-1]) # Unload the stress app in guests if guest_stress: diff --git a/libvirt/tests/src/libvirt_mem.py b/libvirt/tests/src/libvirt_mem.py index 64abd0ece1..1563358887 100644 --- a/libvirt/tests/src/libvirt_mem.py +++ b/libvirt/tests/src/libvirt_mem.py @@ -116,6 +116,60 @@ def restore_hugepages(page_size=4): config.restore() utils_libvirtd.libvirtd_restart() + def setup_vhost_max_mem_regions(): + """ + Configure vhost max_mem_regions for SLES systems. + SLES has lower default values compared to other distros. + This is required for memory hotplug tests with max_slots. + Must be called when no VM is using vhost modules. + This function applies runtime configuration only. + For persistent configuration across reboots, manually create: + /etc/modprobe.d/vhost.conf with content: options vhost max_mem_regions=512 + """ + try: + # Check if running on SLES + distro_check = process.run("cat /etc/os-release | grep -i sles", + shell=True, ignore_status=True) + if distro_check.exit_status == 0: + logging.info("SLES detected, configuring vhost max_mem_regions=512") + vm_check = process.run("virsh list --state-running --name", + shell=True, ignore_status=True) + if vm_check.exit_status != 0: + logging.warning("Unable to determine running VMs; skipping vhost configuration") + return + running_vms = [name for name in vm_check.stdout_text.splitlines() if name.strip()] + if running_vms: + logging.warning("Running VMs detected, skipping vhost configuration for safety:") + logging.warning("%s", "\n".join(running_vms)) + return + + logging.info("No running VMs detected, proceeding with vhost configuration") + process.run("modprobe -r vhost_net vhost_scsi vhost_vsock vhost", + shell=True, ignore_status=True) + result = process.run("modprobe vhost max_mem_regions=512", + shell=True, ignore_status=True) + + if result.exit_status == 0: + logging.info("Successfully configured vhost max_mem_regions=512 at runtime") + verify_cmd = "cat /sys/module/vhost/parameters/max_mem_regions" + verify_result = process.run(verify_cmd, shell=True, ignore_status=True) + if verify_result.exit_status == 0: + current_value = verify_result.stdout_text.strip() + logging.info("Verified vhost max_mem_regions is set to: %s", current_value) + if current_value != "512": + test.cancel("vhost max_mem_regions verification failed: expected 512, got %s" + % current_value) + else: + logging.debug("Could not verify vhost max_mem_regions setting") + else: + test.cancel("Failed to configure vhost at runtime: %s" % result.stderr_text) + else: + logging.debug("Non-SLES system detected, skipping vhost configuration") + + except Exception as e: + test.cancel("Error setting up vhost max_mem_regions: %s" % str(e)) + + def check_qemu_cmd(max_mem_rt, tg_size): """ Check qemu command line options. @@ -528,6 +582,8 @@ def modify_domain_xml(): # Destroy domain first if vm.is_alive(): vm.destroy(gracefully=False) + # Configure vhost max_mem_regions for SLES (VM is now stopped) + setup_vhost_max_mem_regions() modify_domain_xml() numa_info = utils_misc.NumaInfo() logging.debug(numa_info.get_all_node_meminfo())