diff --git a/assets/state-vgpu-device-manager/0600_daemonset.yaml b/assets/state-vgpu-device-manager/0600_daemonset.yaml index afe9faeeae..a72e8754d3 100644 --- a/assets/state-vgpu-device-manager/0600_daemonset.yaml +++ b/assets/state-vgpu-device-manager/0600_daemonset.yaml @@ -23,9 +23,11 @@ spec: - name: vgpu-manager-validation image: "FILLED BY THE OPERATOR" command: ['sh', '-c'] - # TODO: Account for pre-installed vGPU Manager. Currently validator - # creates a different status file when driver is pre-installed. - args: ["until [ -f /run/nvidia/validations/vgpu-manager-ready ]; do echo waiting for NVIDIA vGPU Manager to be setup; sleep 5; done"] + # The validator writes vgpu-manager-ready when the vGPU Manager is + # deployed as a container, and host-vgpu-manager-ready when the vGPU + # Manager driver is pre-installed on the host. Wait for either so this + # operand starts in both cases. + args: ["until [ -f /run/nvidia/validations/vgpu-manager-ready ] || [ -f /run/nvidia/validations/host-vgpu-manager-ready ]; do echo waiting for NVIDIA vGPU Manager to be setup; sleep 5; done"] securityContext: privileged: true volumeMounts: diff --git a/controllers/object_controls_test.go b/controllers/object_controls_test.go index 1f96ff5643..3231e26292 100644 --- a/controllers/object_controls_test.go +++ b/controllers/object_controls_test.go @@ -65,6 +65,7 @@ const ( devicePluginAssetsPath = "assets/state-device-plugin/" dcgmExporterAssetsPath = "assets/state-dcgm-exporter/" migManagerAssetsPath = "assets/state-mig-manager/" + vGPUDeviceManagerAssetsPath = "assets/state-vgpu-device-manager/" nfdNvidiaPCILabelKey = "feature.node.kubernetes.io/pci-10de.present" upgradedKernel = "5.4.135-generic" ) @@ -1254,6 +1255,45 @@ func TestVGPUManagerAssets(t *testing.T) { } } +// TestVGPUDeviceManagerReadinessGate verifies that the vGPU Device Manager's +// vgpu-manager-validation init container waits for the vGPU Manager readiness +// status file written in BOTH deployment modes: 
vgpu-manager-ready (vGPU +// Manager deployed as a container) and host-vgpu-manager-ready (vGPU Manager +// driver pre-installed on the host). The validator writes only the host- +// prefixed file when the driver is pre-installed, so a gate that waits for the +// container-managed file alone hangs indefinitely when driver.enabled=false. +func TestVGPUDeviceManagerReadinessGate(t *testing.T) { + manifestPath := filepath.Join(cfg.root, vGPUDeviceManagerAssetsPath, "0600_daemonset.yaml") + buffer, err := os.ReadFile(manifestPath) + require.NoError(t, err, "unable to read vGPU Device Manager daemonset asset") + + ds := appsv1.DaemonSet{} + ser := json.NewSerializerWithOptions(json.DefaultMetaFactory, scheme.Scheme, scheme.Scheme, + json.SerializerOptions{Yaml: true, Pretty: false, Strict: false}) + _, _, err = ser.Decode(buffer, nil, &ds) + require.NoError(t, err, "unable to decode vGPU Device Manager daemonset asset") + + var initCtr *corev1.Container + for i := range ds.Spec.Template.Spec.InitContainers { + if ds.Spec.Template.Spec.InitContainers[i].Name == "vgpu-manager-validation" { + initCtr = &ds.Spec.Template.Spec.InitContainers[i] + break + } + } + require.NotNil(t, initCtr, "vgpu-manager-validation init container not found") + + args := strings.Join(initCtr.Args, " ") + require.Contains(t, args, "/run/nvidia/validations/vgpu-manager-ready", + "readiness gate must wait for the container-managed vGPU Manager status file") + require.Contains(t, args, "/run/nvidia/validations/host-vgpu-manager-ready", + "readiness gate must also wait for the host-installed vGPU Manager status file (driver.enabled=false)") + // The two files must be combined with OR: the gate must pass when EITHER + // status file exists, since the validator only ever writes one of them. + // Guard against an accidental AND, which would hang the gate in both modes. 
+ require.Contains(t, args, "|| [ -f /run/nvidia/validations/host-vgpu-manager-ready", + "the two status files must be combined with OR, not AND, so the gate passes when either is present") +} + // getSandboxDevicePluginTestInput return a ClusterPolicy instance for a particular // device plugin test case. This function will grow as new test cases are added func getSandboxDevicePluginTestInput(testCase string) *gpuv1.ClusterPolicy {