Skip to content

Commit c2e5256

Browse files
authored
ci(gpu): add separate GPU test workflows (#773)
- add a dedicated `test-gpu.yml` workflow for validating the new shared NVIDIA GPU runner infrastructure
- keep the existing branch E2E and branch checks workflows unchanged for now
- add the repository-side `copy-pr-bot` configuration required for workflows that run on NVIDIA hosted workers
1 parent d7acfc1 commit c2e5256

3 files changed

Lines changed: 162 additions & 0 deletions

File tree

.github/copy-pr-bot.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
---
# Repository-side configuration for NVIDIA's copy-pr-bot.
# Required so that workflows targeting NVIDIA-hosted GPU runners can run
# against trusted `pull-request/*` mirror branches instead of fork refs.
enabled: true
# Do not mirror draft PRs automatically; only sync once a PR is marked ready.
auto_sync_draft: false
auto_sync_ready: true
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
---
# Reusable (workflow_call) E2E test workflow that runs the GPU test suite on
# NVIDIA-hosted GPU runners, one job per runner flavor in the matrix.
name: GPU E2E Test

on:
  workflow_call:
    inputs:
      image-tag:
        description: "Image tag to test (typically the commit SHA)"
        required: true
        type: string

permissions:
  contents: read
  packages: read

jobs:
  e2e-gpu:
    name: "E2E GPU (${{ matrix.name }})"
    runs-on: ${{ matrix.runner }}
    # WSL entry is experimental: its failures must not fail the workflow.
    continue-on-error: ${{ matrix.experimental }}
    timeout-minutes: 30
    strategy:
      # Let every runner flavor finish even if one fails.
      fail-fast: false
      matrix:
        include:
          - name: linux-arm64
            runner: linux-arm64-gpu-l4-latest-1
            cluster: e2e-gpu-arm64
            # Distinct host ports per matrix entry so clusters never collide.
            port: "8083"
            experimental: false
          - name: linux-amd64
            runner: linux-amd64-gpu-rtxpro6000-latest-1
            cluster: e2e-gpu-amd64
            port: "8084"
            experimental: false
          - name: wsl-amd64
            runner: wsl-amd64-gpu-rtxpro6000-latest-1
            cluster: e2e-gpu-wsl
            port: "8085"
            experimental: true
    container:
      image: ghcr.io/nvidia/openshell/ci:latest
      credentials:
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}
      # Privileged + host docker socket: the job drives the host Docker daemon
      # to bootstrap the test cluster from inside the CI container.
      options: --privileged
      volumes:
        - /var/run/docker.sock:/var/run/docker.sock
    env:
      MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      IMAGE_TAG: ${{ inputs.image-tag }}
      OPENSHELL_REGISTRY: ghcr.io/nvidia/openshell
      OPENSHELL_REGISTRY_HOST: ghcr.io
      OPENSHELL_REGISTRY_NAMESPACE: nvidia/openshell
      OPENSHELL_REGISTRY_USERNAME: ${{ github.actor }}
      OPENSHELL_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }}
      OPENSHELL_GATEWAY: ${{ matrix.cluster }}
    steps:
      - uses: actions/checkout@v4

      - name: Log in to GHCR
        run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin

      - name: Pull cluster image
        run: docker pull ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }}

      - name: Install Python dependencies and generate protobuf stubs
        run: uv sync --frozen && mise run --no-prepare python:proto

      - name: Bootstrap GPU cluster
        env:
          GATEWAY_HOST: host.docker.internal
          GATEWAY_PORT: ${{ matrix.port }}
          CLUSTER_NAME: ${{ matrix.cluster }}
          # Passes --gpu to the gateway bootstrap so the cluster comes up with GPU passthrough enabled.
          CLUSTER_GPU: "1"
          # Image was pulled above; skip rebuild/push inside the bootstrap task.
          SKIP_IMAGE_PUSH: "1"
          SKIP_CLUSTER_IMAGE_BUILD: "1"
          OPENSHELL_CLUSTER_IMAGE: ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }}
        run: mise run --no-prepare --skip-deps cluster

      - name: Run tests
        run: mise run --no-prepare --skip-deps e2e:python:gpu

.github/workflows/test-gpu.yml

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
---
# Top-level GPU test workflow. Triggered by copy-pr-bot mirror pushes to
# `pull-request/*` branches (or manually), it gates on PR metadata, builds the
# gateway/cluster images, then calls the reusable GPU E2E workflow.
name: GPU Test

on:
  push:
    branches:
      - "pull-request/[0-9]+"
  workflow_dispatch: {}
  # Add `schedule:` here when we want nightly coverage from the same workflow.

permissions:
  contents: read
  pull-requests: read
  packages: write

jobs:
  pr_metadata:
    name: Resolve PR metadata
    runs-on: ubuntu-latest
    outputs:
      should_run: ${{ steps.gate.outputs.should_run }}
    steps:
      # Resolves the originating PR for a copied pull-request/* push.
      # continue-on-error so the gate step can decide what a lookup
      # failure means instead of failing the whole job.
      - id: get_pr_info
        if: github.event_name == 'push'
        continue-on-error: true
        uses: nv-gha-runners/get-pr-info@main

      - id: gate
        shell: bash
        env:
          EVENT_NAME: ${{ github.event_name }}
          GITHUB_SHA_VALUE: ${{ github.sha }}
          GET_PR_INFO_OUTCOME: ${{ steps.get_pr_info.outcome }}
          PR_INFO: ${{ steps.get_pr_info.outputs.pr-info }}
        run: |
          # Non-push events (workflow_dispatch) always run.
          if [ "$EVENT_NAME" != "push" ]; then
            echo "should_run=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          # Could not resolve PR metadata: fail closed.
          if [ "$GET_PR_INFO_OUTCOME" != "success" ]; then
            echo "should_run=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          head_sha="$(jq -r '.head.sha' <<< "$PR_INFO")"
          has_gpu_label="$(jq -r '[.labels[].name] | index("test:e2e-gpu") != null' <<< "$PR_INFO")"

          # Only trust copied pull-request/* pushes that still match the PR head SHA
          # and are explicitly labeled for GPU coverage.
          if [ "$head_sha" = "$GITHUB_SHA_VALUE" ] && [ "$has_gpu_label" = "true" ]; then
            should_run=true
          else
            should_run=false
          fi

          echo "should_run=$should_run" >> "$GITHUB_OUTPUT"

  build-gateway:
    needs: [pr_metadata]
    if: needs.pr_metadata.outputs.should_run == 'true'
    uses: ./.github/workflows/docker-build.yml
    with:
      component: gateway

  build-cluster:
    needs: [pr_metadata]
    if: needs.pr_metadata.outputs.should_run == 'true'
    uses: ./.github/workflows/docker-build.yml
    with:
      component: cluster

  e2e-gpu:
    needs: [pr_metadata, build-gateway, build-cluster]
    if: needs.pr_metadata.outputs.should_run == 'true'
    uses: ./.github/workflows/e2e-gpu-test.yaml
    with:
      image-tag: ${{ github.sha }}

0 commit comments

Comments (0)