#!/usr/bin/env bash
# PR 7916: venv at .venvs/pr7916, PyTorch 2.8 + cu128, then repro on current branch vs master.
#
# Venv: reuses $VENV_DIR if bin/activate exists (no pip). --force-install always recreates.
# --skip-install reuses only and errors if the venv is missing.
#
# Usage: ./scripts/setup_pr7916.sh [--force-install] [--skip-install]
# Env: PR7916_VENV_DIR, PR7916_MAIN_REF (default master), PR7916_FORCE_INSTALL, PR7916_SKIP_INSTALL
#
set -euo pipefail

# Work from the repository root (the parent of this script's directory) so the
# relative paths used later resolve no matter where the script is invoked from.
ROOT="$(cd "$(dirname "$0")/.." && pwd)"
cd "$ROOT"

# Both settings fall back to their default when the variable is unset or empty.
MAIN_REF="${PR7916_MAIN_REF:-master}"
VENV_DIR="${PR7916_VENV_DIR:-$ROOT/.venvs/pr7916}"
# Presence of this file is what the rest of the script treats as "the venv exists".
VENV_SH="$VENV_DIR/bin/activate"

# Report whether a setting value means "enabled".
# Arguments: $1 - value to test (may be missing or empty)
# Returns:   0 for 1 / true / yes / on in any letter case, 1 for anything else
truthy() {
  case "${1:-}" in
    # Bracket patterns keep the match case-insensitive without relying on
    # ${var,,} or nocasematch, so "TRUE" or "Yes" from the environment work too.
    1 | [Tt][Rr][Uu][Ee] | [Yy][Ee][Ss] | [Oo][Nn]) return 0 ;;
    *) return 1 ;;
  esac
}

# Both switches start off, may be turned on by the environment, and may then be
# turned on (never off) by the command line.
force=0
skip_only=0
if truthy "${PR7916_FORCE_INSTALL:-}"; then
  force=1
fi
if truthy "${PR7916_SKIP_INSTALL:-}"; then
  skip_only=1
fi

# Consume every positional argument; anything unrecognised is a hard error.
while (( $# > 0 )); do
  case "$1" in
    --force-install)
      force=1
      ;;
    --skip-install)
      skip_only=1
      ;;
    *)
      echo "error: unknown argument: $1" >&2
      exit 1
      ;;
  esac
  shift
done

# Print the torch version and its CUDA build, then the path and version of the
# deepspeed package that the currently active python imports.
print_versions() {
  local -r probe="import torch, deepspeed; print('torch', torch.__version__, 'cuda', torch.version.cuda); print('deepspeed', deepspeed.__file__); print('deepspeed version', deepspeed.__version__)"
  python -c "$probe"
}

# Decide whether the venv has to be built from scratch.
# Globals:  force, skip_only, VENV_SH (all read, none written)
# Outputs:  prints 1 (wipe + create venv + pip installs) or 0 (activate the
#           existing venv only) on stdout; it does not set any variable itself.
# Precedence: a forced install wins over everything; otherwise a reuse-only
# request or an existing activate script both mean "reuse". Whether the venv
# really exists in the reuse-only case is left for the caller to check.
decide_full_setup() {
  if [[ "$force" -eq 1 ]]; then
    echo 1
  elif [[ "$skip_only" -eq 1 || -f "$VENV_SH" ]]; then
    echo 0
  else
    echo 1
  fi
}

# Create or reuse the virtualenv, activate it in the current shell, and print
# the versions that ended up importable.
# Globals:  VENV_DIR, VENV_SH (read); activation changes the shell environment
# Calls:    decide_full_setup (prints 1 = rebuild, 0 = reuse), print_versions
# Exits:    status 1 when reuse was chosen but no activate script exists
setup_venv() {
  local full
  full="$(decide_full_setup)"

  if [[ "$full" -eq 1 ]]; then
    echo "==> Creating venv at $VENV_DIR"
    # ':?' aborts instead of expanding to an empty path; '--' stops a leading
    # dash in the path from being read as an option.
    rm -rf -- "${VENV_DIR:?}"
    mkdir -p -- "$(dirname -- "$VENV_DIR")"
    python3 -m venv "$VENV_DIR"
  elif [[ -f "$VENV_SH" ]]; then
    echo "==> Reusing venv $VENV_DIR (use --force-install to reinstall)"
  else
    echo "error: no venv at $VENV_DIR (drop --skip-install or run once without it)" >&2
    exit 1
  fi

  # shellcheck source=/dev/null
  . "$VENV_SH"

  if [[ "$full" -eq 1 ]]; then
    # Advisory only: a different interpreter version warns but does not stop.
    if ! python -c 'import sys; sys.exit(0 if sys.version_info[:2] == (3, 11) else 1)'; then
      echo "Warning: expected Python 3.11; found $(python -V)" >&2
    fi
    # 'python -m pip' ties every install to the interpreter that was just
    # activated, and keeps working while pip replaces itself in the first step.
    python -m pip install -U pip setuptools wheel
    python -m pip install "torch==2.8.0" --index-url https://download.pytorch.org/whl/cu128
    python -m pip install -r requirements/requirements.txt
    python -m pip install -e .
    python -m pip install pytest
    # NOTE(review): if one of these installs fails, bin/activate already exists,
    # so a later run without --force-install reuses the half-installed venv.
  fi

  print_versions
}

# State consumed by the EXIT trap. It lives at global scope on purpose: the trap
# fires after run_repro_compare has returned, when that function's locals no
# longer exist (and referencing them would abort under 'set -u').
_PR7916_TMP_DIR=""     # scratch directory holding the repro copy
_PR7916_RETURN_REF=""  # ref to check out again; empty = nothing to restore
_PR7916_STASHED=0      # 1 only when this script really created a stash entry

# Return the work tree to where the script found it. Safe to call repeatedly.
# Returns: 1 if the checkout back fails (the stash is then left untouched so it
#          is not applied onto the wrong branch), 0 otherwise.
_pr7916_restore_tree() {
  if [[ -n "$_PR7916_RETURN_REF" ]]; then
    git checkout "$_PR7916_RETURN_REF" || return 1
    _PR7916_RETURN_REF=""
  fi
  if [[ "$_PR7916_STASHED" -eq 1 ]]; then
    _PR7916_STASHED=0
    git stash pop || echo "warning: stash pop failed — see git stash list" >&2
  fi
}

# EXIT handler: best-effort tree restore, then remove the scratch directory.
_pr7916_cleanup() {
  _pr7916_restore_tree || true
  if [[ -n "$_PR7916_TMP_DIR" ]]; then
    rm -rf -- "$_PR7916_TMP_DIR"
  fi
}

# Run the repro script on the current checkout (must succeed), then on
# $MAIN_REF (failure is expected and only reported), then switch back.
# Globals:  ROOT, MAIN_REF (read); _PR7916_* (written, for the EXIT trap)
# Side effects: installs an EXIT trap; may stash and later pop local changes;
#               checks out $MAIN_REF and then the starting ref again.
# Exits:    non-zero if the repro script is missing, the first run fails, or a
#           checkout fails.
run_repro_compare() {
  local repro_src="$ROOT/scripts/repro_pr7916.py"
  local repro_copy fix_ref main_ec stash_before stash_after
  local -a run=(torchrun --standalone --nproc_per_node=1)

  if [[ ! -f "$repro_src" ]]; then
    echo "error: missing $repro_src (need this file on the current branch)" >&2
    exit 1
  fi

  trap _pr7916_cleanup EXIT

  # The copy survives the branch switch. A scratch directory keeps the .py name
  # without depending on mktemp accepting a suffix after the X's.
  _PR7916_TMP_DIR="$(mktemp -d "${TMPDIR:-/tmp}/repro_pr7916.XXXXXX")"
  repro_copy="$_PR7916_TMP_DIR/repro_pr7916.py"
  cp -- "$repro_src" "$repro_copy"

  fix_ref="$(git rev-parse --abbrev-ref HEAD)"
  if [[ "$fix_ref" == "HEAD" ]]; then
    # Detached HEAD: remember the commit itself, since checking out the literal
    # name "HEAD" later would not bring us back here.
    fix_ref="$(git rev-parse HEAD)"
  fi

  echo ""
  echo "==> [1/2] Repro on $fix_ref (expect OK)"
  "${run[@]}" "$repro_copy"

  echo ""
  echo "==> [2/2] Repro on $MAIN_REF (expect setup_context RuntimeError on unfixed tree)"
  if [[ -n "$(git status --porcelain 2>/dev/null)" ]]; then
    echo "==> Stashing local changes for checkout..."
    # 'git stash push' succeeds without creating an entry when there is nothing
    # it can save (e.g. only untracked files). Compare refs/stash before and
    # after so we never pop a stash that was not ours.
    stash_before="$(git rev-parse -q --verify refs/stash || true)"
    git stash push -m "pr7916-setup: temp stash before main repro"
    stash_after="$(git rev-parse -q --verify refs/stash || true)"
    if [[ "$stash_before" != "$stash_after" ]]; then
      _PR7916_STASHED=1
    fi
  fi

  # From here on, any exit path puts the tree back via the EXIT trap.
  _PR7916_RETURN_REF="$fix_ref"
  if ! git checkout "$MAIN_REF"; then
    echo "error: checkout $MAIN_REF failed" >&2
    exit 1
  fi

  # A failure here is the expected outcome, so capture the status instead of
  # letting 'set -e' end the script.
  main_ec=0
  "${run[@]}" "$repro_copy" || main_ec=$?

  if [[ "$main_ec" -eq 0 ]]; then
    echo "warning: main-branch repro exited 0 (expected failure on unfixed tree)." >&2
  else
    echo "main-branch repro exited $main_ec (non-zero expected for unfixed tree)."
  fi

  echo ""
  echo "==> Restoring $fix_ref"
  _pr7916_restore_tree
}

# Entry point: prepare the environment first, then run the two-way comparison.
setup_venv
run_repro_compare

# Blank separator line followed by the activation hint.
printf '\n%s\n' "Done. Activate: source $VENV_DIR/bin/activate"