Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions assets/state-vgpu-device-manager/0600_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,11 @@ spec:
- name: vgpu-manager-validation
image: "FILLED BY THE OPERATOR"
command: ['sh', '-c']
# TODO: Account for pre-installed vGPU Manager. Currently validator
# creates a different status file when driver is pre-installed.
args: ["until [ -f /run/nvidia/validations/vgpu-manager-ready ]; do echo waiting for NVIDIA vGPU Manager to be setup; sleep 5; done"]
# The validator writes vgpu-manager-ready when the vGPU Manager is
# deployed as a container, and host-vgpu-manager-ready when the vGPU
# Manager driver is pre-installed on the host. Wait for either so this
# operand starts in both cases.
args: ["until [ -f /run/nvidia/validations/vgpu-manager-ready ] || [ -f /run/nvidia/validations/host-vgpu-manager-ready ]; do echo waiting for NVIDIA vGPU Manager to be setup; sleep 5; done"]
securityContext:
privileged: true
volumeMounts:
Expand Down
40 changes: 40 additions & 0 deletions controllers/object_controls_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ const (
devicePluginAssetsPath = "assets/state-device-plugin/"
dcgmExporterAssetsPath = "assets/state-dcgm-exporter/"
migManagerAssetsPath = "assets/state-mig-manager/"
vGPUDeviceManagerAssetsPath = "assets/state-vgpu-device-manager/"
nfdNvidiaPCILabelKey = "feature.node.kubernetes.io/pci-10de.present"
upgradedKernel = "5.4.135-generic"
)
Expand Down Expand Up @@ -1254,6 +1255,45 @@ func TestVGPUManagerAssets(t *testing.T) {
}
}

// TestVGPUDeviceManagerReadinessGate loads the vGPU Device Manager daemonset
// asset and inspects the args of its vgpu-manager-validation init container.
//
// The init container is expected to block until the vGPU Manager reports
// ready. Two status files can signal that, depending on the deployment mode:
//   - vgpu-manager-ready: the vGPU Manager is deployed as a container.
//   - host-vgpu-manager-ready: the vGPU Manager driver is pre-installed on
//     the host (driver.enabled=false).
//
// Only one of the two files is written by the validator in a given mode, so
// the test requires that both paths are referenced and that they are joined
// with a shell OR rather than an AND.
func TestVGPUDeviceManagerReadinessGate(t *testing.T) {
	const (
		gateContainerName   = "vgpu-manager-validation"
		containerStatusFile = "/run/nvidia/validations/vgpu-manager-ready"
		hostStatusFile      = "/run/nvidia/validations/host-vgpu-manager-ready"
	)

	// Read the raw manifest from the assets directory.
	raw, err := os.ReadFile(filepath.Join(cfg.root, vGPUDeviceManagerAssetsPath, "0600_daemonset.yaml"))
	require.NoError(t, err, "unable to read vGPU Device Manager daemonset asset")

	// Decode the YAML manifest into a typed DaemonSet (non-strict, non-pretty).
	var daemonSet appsv1.DaemonSet
	decoder := json.NewSerializerWithOptions(
		json.DefaultMetaFactory,
		scheme.Scheme,
		scheme.Scheme,
		json.SerializerOptions{Yaml: true},
	)
	_, _, err = decoder.Decode(raw, nil, &daemonSet)
	require.NoError(t, err, "unable to decode vGPU Device Manager daemonset asset")

	// Locate the readiness-gate init container by name; keep a pointer to the
	// first match.
	candidates := daemonSet.Spec.Template.Spec.InitContainers
	var gate *corev1.Container
	for idx := range candidates {
		if candidates[idx].Name != gateContainerName {
			continue
		}
		gate = &candidates[idx]
		break
	}
	require.NotNil(t, gate, "vgpu-manager-validation init container not found")

	// Flatten the args so the shell snippet can be searched as a single string.
	script := strings.Join(gate.Args, " ")

	require.Contains(t, script, containerStatusFile,
		"readiness gate must wait for the container-managed vGPU Manager status file")
	require.Contains(t, script, hostStatusFile,
		"readiness gate must also wait for the host-installed vGPU Manager status file (driver.enabled=false)")

	// The host status file check must be introduced by "||": the gate has to
	// pass when EITHER file exists. An "&&" here would demand both files and
	// break both deployment modes.
	require.Contains(t, script, "|| [ -f "+hostStatusFile,
		"the two status files must be combined with OR, not AND, so the gate passes when either is present")
}

// getSandboxDevicePluginTestInput returns a ClusterPolicy instance for a particular
// device plugin test case. This function will grow as new test cases are added.
func getSandboxDevicePluginTestInput(testCase string) *gpuv1.ClusterPolicy {
Expand Down