From c03bb178c5630ad2260e8dd448a7a8cf9b25b554 Mon Sep 17 00:00:00 2001 From: Aleksei Sviridkin Date: Sat, 4 Jul 2026 14:11:25 +0300 Subject: [PATCH] fix(vgpu-device-manager): wait for host-installed vGPU Manager readiness The vgpu-manager-validation init container waited only for the vgpu-manager-ready status file, which the validator writes when the vGPU Manager is deployed as a container. When the vGPU Manager driver is pre-installed on the host (driver.enabled=false), the validator writes host-vgpu-manager-ready instead, so the init container blocked indefinitely on "waiting for NVIDIA vGPU Manager to be setup" and the vGPU Device Manager never started. Wait for either status file so the operand starts in both the container-managed and host-installed driver modes, resolving the existing TODO in the daemonset asset. Assisted-By: Claude Signed-off-by: Aleksei Sviridkin --- .../0600_daemonset.yaml | 8 ++-- controllers/object_controls_test.go | 40 +++++++++++++++++++ 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/assets/state-vgpu-device-manager/0600_daemonset.yaml b/assets/state-vgpu-device-manager/0600_daemonset.yaml index afe9faeeae..a72e8754d3 100644 --- a/assets/state-vgpu-device-manager/0600_daemonset.yaml +++ b/assets/state-vgpu-device-manager/0600_daemonset.yaml @@ -23,9 +23,11 @@ spec: - name: vgpu-manager-validation image: "FILLED BY THE OPERATOR" command: ['sh', '-c'] - # TODO: Account for pre-installed vGPU Manager. Currently validator - # creates a different status file when driver is pre-installed. - args: ["until [ -f /run/nvidia/validations/vgpu-manager-ready ]; do echo waiting for NVIDIA vGPU Manager to be setup; sleep 5; done"] + # The validator writes vgpu-manager-ready when the vGPU Manager is + # deployed as a container, and host-vgpu-manager-ready when the vGPU + # Manager driver is pre-installed on the host. Wait for either so this + # operand starts in both cases. + args: ["until [ -f /run/nvidia/validations/vgpu-manager-ready ] || [ -f /run/nvidia/validations/host-vgpu-manager-ready ]; do echo waiting for NVIDIA vGPU Manager to be setup; sleep 5; done"] securityContext: privileged: true volumeMounts: diff --git a/controllers/object_controls_test.go b/controllers/object_controls_test.go index 1f96ff5643..3231e26292 100644 --- a/controllers/object_controls_test.go +++ b/controllers/object_controls_test.go @@ -65,6 +65,7 @@ const ( devicePluginAssetsPath = "assets/state-device-plugin/" dcgmExporterAssetsPath = "assets/state-dcgm-exporter/" migManagerAssetsPath = "assets/state-mig-manager/" + vGPUDeviceManagerAssetsPath = "assets/state-vgpu-device-manager/" nfdNvidiaPCILabelKey = "feature.node.kubernetes.io/pci-10de.present" upgradedKernel = "5.4.135-generic" ) @@ -1254,6 +1255,45 @@ func TestVGPUManagerAssets(t *testing.T) { } } +// TestVGPUDeviceManagerReadinessGate verifies that the vGPU Device Manager's +// vgpu-manager-validation init container waits for the vGPU Manager readiness +// status file written in BOTH deployment modes: vgpu-manager-ready (vGPU +// Manager deployed as a container) and host-vgpu-manager-ready (vGPU Manager +// driver pre-installed on the host). The validator writes only the host- +// prefixed file when the driver is pre-installed, so a gate that waits for the +// container-managed file alone hangs indefinitely when driver.enabled=false. +func TestVGPUDeviceManagerReadinessGate(t *testing.T) { + manifestPath := filepath.Join(cfg.root, vGPUDeviceManagerAssetsPath, "0600_daemonset.yaml") + buffer, err := os.ReadFile(manifestPath) + require.NoError(t, err, "unable to read vGPU Device Manager daemonset asset") + + ds := appsv1.DaemonSet{} + ser := json.NewSerializerWithOptions(json.DefaultMetaFactory, scheme.Scheme, scheme.Scheme, + json.SerializerOptions{Yaml: true, Pretty: false, Strict: false}) + _, _, err = ser.Decode(buffer, nil, &ds) + require.NoError(t, err, "unable to decode vGPU Device Manager daemonset asset") + + var initCtr *corev1.Container + for i := range ds.Spec.Template.Spec.InitContainers { + if ds.Spec.Template.Spec.InitContainers[i].Name == "vgpu-manager-validation" { + initCtr = &ds.Spec.Template.Spec.InitContainers[i] + break + } + } + require.NotNil(t, initCtr, "vgpu-manager-validation init container not found") + + args := strings.Join(initCtr.Args, " ") + require.Contains(t, args, "/run/nvidia/validations/vgpu-manager-ready", + "readiness gate must wait for the container-managed vGPU Manager status file") + require.Contains(t, args, "/run/nvidia/validations/host-vgpu-manager-ready", + "readiness gate must also wait for the host-installed vGPU Manager status file (driver.enabled=false)") + // The two files must be combined with OR: the gate must pass when EITHER + // status file exists, since the validator only ever writes one of them. + // Guard against an accidental AND, which would re-break both modes. + require.Contains(t, args, "|| [ -f /run/nvidia/validations/host-vgpu-manager-ready", + "the two status files must be combined with OR, not AND, so the gate passes when either is present") +} + // getSandboxDevicePluginTestInput return a ClusterPolicy instance for a particular // device plugin test case. This function will grow as new test cases are added func getSandboxDevicePluginTestInput(testCase string) *gpuv1.ClusterPolicy {