Skip to content

Commit c2e5256

Browse files
authored
ci(gpu): add separate GPU test workflows (#773)
- add a dedicated `test-gpu.yml` workflow for validating the new shared NVIDIA GPU runner infrastructure
- keep the existing branch E2E and branch checks workflows unchanged for now
- add the repository-side `copy-pr-bot` configuration required for workflows that run on NVIDIA hosted workers
1 parent d7acfc1 commit c2e5256

3 files changed

Lines changed: 162 additions & 0 deletions

File tree

.github/copy-pr-bot.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
---
# Repository-side configuration for NVIDIA's copy-pr-bot.
# Required so that workflows targeting NVIDIA-hosted GPU runners can run
# against trusted `pull-request/*` mirror branches instead of fork refs.
enabled: true
# Do not mirror draft PRs automatically; only sync once a PR is marked ready.
auto_sync_draft: false
auto_sync_ready: true
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
---
# Reusable (workflow_call) E2E test workflow that runs the GPU test suite on
# NVIDIA-hosted GPU runners, one job per runner flavor in the matrix.
name: GPU E2E Test

on:
  workflow_call:
    inputs:
      image-tag:
        description: "Image tag to test (typically the commit SHA)"
        required: true
        type: string

permissions:
  contents: read
  packages: read

jobs:
  e2e-gpu:
    name: "E2E GPU (${{ matrix.name }})"
    runs-on: ${{ matrix.runner }}
    # WSL entry is experimental: its failures must not fail the workflow.
    continue-on-error: ${{ matrix.experimental }}
    timeout-minutes: 30
    strategy:
      # Let every runner flavor finish even if one fails.
      fail-fast: false
      matrix:
        include:
          - name: linux-arm64
            runner: linux-arm64-gpu-l4-latest-1
            cluster: e2e-gpu-arm64
            # Distinct host ports per matrix entry so clusters never collide.
            port: "8083"
            experimental: false
          - name: linux-amd64
            runner: linux-amd64-gpu-rtxpro6000-latest-1
            cluster: e2e-gpu-amd64
            port: "8084"
            experimental: false
          - name: wsl-amd64
            runner: wsl-amd64-gpu-rtxpro6000-latest-1
            cluster: e2e-gpu-wsl
            port: "8085"
            experimental: true
    container:
      image: ghcr.io/nvidia/openshell/ci:latest
      credentials:
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}
      # Privileged + host docker socket: the job drives the host Docker daemon
      # to bootstrap the test cluster from inside the CI container.
      options: --privileged
      volumes:
        - /var/run/docker.sock:/var/run/docker.sock
    env:
      MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      IMAGE_TAG: ${{ inputs.image-tag }}
      OPENSHELL_REGISTRY: ghcr.io/nvidia/openshell
      OPENSHELL_REGISTRY_HOST: ghcr.io
      OPENSHELL_REGISTRY_NAMESPACE: nvidia/openshell
      OPENSHELL_REGISTRY_USERNAME: ${{ github.actor }}
      OPENSHELL_REGISTRY_PASSWORD: ${{ secrets.GITHUB_TOKEN }}
      OPENSHELL_GATEWAY: ${{ matrix.cluster }}
    steps:
      - uses: actions/checkout@v4

      - name: Log in to GHCR
        run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin

      - name: Pull cluster image
        run: docker pull ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }}

      - name: Install Python dependencies and generate protobuf stubs
        run: uv sync --frozen && mise run --no-prepare python:proto

      - name: Bootstrap GPU cluster
        env:
          GATEWAY_HOST: host.docker.internal
          GATEWAY_PORT: ${{ matrix.port }}
          CLUSTER_NAME: ${{ matrix.cluster }}
          # Passes --gpu to the gateway bootstrap so the cluster comes up with GPU passthrough enabled.
          CLUSTER_GPU: "1"
          # Image was pulled above; skip rebuild/push inside the bootstrap task.
          SKIP_IMAGE_PUSH: "1"
          SKIP_CLUSTER_IMAGE_BUILD: "1"
          OPENSHELL_CLUSTER_IMAGE: ghcr.io/nvidia/openshell/cluster:${{ inputs.image-tag }}
        run: mise run --no-prepare --skip-deps cluster

      - name: Run tests
        run: mise run --no-prepare --skip-deps e2e:python:gpu

.github/workflows/test-gpu.yml

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
---
# Top-level GPU test workflow. Triggered by copy-pr-bot mirror pushes to
# `pull-request/*` branches (or manually), it gates on PR metadata, builds the
# gateway/cluster images, then calls the reusable GPU E2E workflow.
name: GPU Test

on:
  push:
    branches:
      - "pull-request/[0-9]+"
  workflow_dispatch: {}
  # Add `schedule:` here when we want nightly coverage from the same workflow.

permissions:
  contents: read
  pull-requests: read
  packages: write

jobs:
  pr_metadata:
    name: Resolve PR metadata
    runs-on: ubuntu-latest
    outputs:
      should_run: ${{ steps.gate.outputs.should_run }}
    steps:
      # Resolves the originating PR for a copied pull-request/* push.
      # continue-on-error so the gate step can decide what a lookup
      # failure means instead of failing the whole job.
      - id: get_pr_info
        if: github.event_name == 'push'
        continue-on-error: true
        uses: nv-gha-runners/get-pr-info@main

      - id: gate
        shell: bash
        env:
          EVENT_NAME: ${{ github.event_name }}
          GITHUB_SHA_VALUE: ${{ github.sha }}
          GET_PR_INFO_OUTCOME: ${{ steps.get_pr_info.outcome }}
          PR_INFO: ${{ steps.get_pr_info.outputs.pr-info }}
        run: |
          # Non-push events (workflow_dispatch) always run.
          if [ "$EVENT_NAME" != "push" ]; then
            echo "should_run=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          # Could not resolve PR metadata: fail closed.
          if [ "$GET_PR_INFO_OUTCOME" != "success" ]; then
            echo "should_run=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          head_sha="$(jq -r '.head.sha' <<< "$PR_INFO")"
          has_gpu_label="$(jq -r '[.labels[].name] | index("test:e2e-gpu") != null' <<< "$PR_INFO")"

          # Only trust copied pull-request/* pushes that still match the PR head SHA
          # and are explicitly labeled for GPU coverage.
          if [ "$head_sha" = "$GITHUB_SHA_VALUE" ] && [ "$has_gpu_label" = "true" ]; then
            should_run=true
          else
            should_run=false
          fi

          echo "should_run=$should_run" >> "$GITHUB_OUTPUT"

  build-gateway:
    needs: [pr_metadata]
    if: needs.pr_metadata.outputs.should_run == 'true'
    uses: ./.github/workflows/docker-build.yml
    with:
      component: gateway

  build-cluster:
    needs: [pr_metadata]
    if: needs.pr_metadata.outputs.should_run == 'true'
    uses: ./.github/workflows/docker-build.yml
    with:
      component: cluster

  e2e-gpu:
    needs: [pr_metadata, build-gateway, build-cluster]
    if: needs.pr_metadata.outputs.should_run == 'true'
    uses: ./.github/workflows/e2e-gpu-test.yaml
    with:
      image-tag: ${{ github.sha }}

0 commit comments

Comments (0)