Skip to content

Commit 3121a7f

Browse files
committed
update reproduce script to remove duplicated code
Signed-off-by: Zhang <jianmusings@gmail.com>
1 parent e58ac18 commit 3121a7f

1 file changed

Lines changed: 111 additions & 100 deletions

File tree

scripts/setup_pr7916.sh

Lines changed: 111 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -1,129 +1,140 @@
11
#!/usr/bin/env bash
2-
# Create an isolated venv for PR 7916 repro at .venvs/pr7916 (repo root):
3-
# - Removes only that venv if it already exists (does not touch .venv or other envs)
4-
# - PyTorch 2.8.0+cu128 (CUDA 12.8)
5-
# - requirements from requirements/requirements.txt
6-
# - DeepSpeed editable install from the *current* checkout
7-
# - pytest
2+
# PR 7916: venv at .venvs/pr7916, PyTorch 2.8 + cu128, then repro on current branch vs master.
83
#
9-
# Then validates the fix by:
10-
# 1) Running the repro with the current branch (expect success + "OK" line)
11-
# 2) Checking out master and running the same repro script (expect original RuntimeError)
12-
# 3) Checking back to the branch you started on
4+
# Venv: reuses $VENV_DIR if bin/activate exists (no pip). --force-install always recreates.
5+
# --skip-install reuses only and errors if the venv is missing.
136
#
14-
# Usage (from repo root):
15-
# ./scripts/setup_pr7916.sh
16-
# ./scripts/setup_pr7916.sh --skip-install # reuse .venvs/pr7916, no pip/venv setup
17-
# PR7916_SKIP_INSTALL=1 ./scripts/setup_pr7916.sh
18-
#
19-
# Activate later:
20-
# source .venvs/pr7916/bin/activate
7+
# Usage: ./scripts/setup_pr7916.sh [--force-install] [--skip-install]
8+
# Env: PR7916_VENV_DIR, PR7916_MAIN_REF (default master), PR7916_FORCE_INSTALL, PR7916_SKIP_INSTALL
219
#
2210
# Strict mode: stop on command failure, on unset variables, and on a failure
# in any stage of a pipeline.
set -e -u -o pipefail

# The repository root is the parent of the directory holding this script;
# the rest of the script runs from there.
script_dir="$(dirname "$0")"
ROOT="$(cd "$script_dir/.." && pwd)"
cd "$ROOT"

# Venv location; PR7916_VENV_DIR overrides the default when set and non-empty.
VENV_DIR="$ROOT/.venvs/pr7916"
if [[ -n "${PR7916_VENV_DIR:-}" ]]; then
  VENV_DIR="$PR7916_VENV_DIR"
fi

# Ref used for the second (baseline) repro run; PR7916_MAIN_REF overrides it.
MAIN_REF="master"
if [[ -n "${PR7916_MAIN_REF:-}" ]]; then
  MAIN_REF="$PR7916_MAIN_REF"
fi

# Activation script of the venv; later code tests for this file to decide
# whether the venv already exists.
VENV_SH="${VENV_DIR}/bin/activate"
18+
19+
# Succeeds (status 0) only when $1 is exactly one of: 1, true, yes, on.
# Any other value, an empty string, or a missing argument yields status 1.
truthy() {
  local value="${1:-}"
  if [[ "$value" =~ ^(1|true|yes|on)$ ]]; then
    return 0
  fi
  return 1
}
20+
21+
# Option state. Environment variables are consulted first; command-line flags
# can then only switch an option on, never off.
#   force=1      -> always rebuild the venv
#   skip_only=1  -> reuse the venv only
force=0
skip_only=0
if truthy "${PR7916_FORCE_INSTALL:-}"; then
  force=1
fi
if truthy "${PR7916_SKIP_INSTALL:-}"; then
  skip_only=1
fi

# Consume every positional argument; anything unrecognised is a hard error.
while (( $# > 0 )); do
  if [[ "$1" == "--force-install" ]]; then
    force=1
  elif [[ "$1" == "--skip-install" ]]; then
    skip_only=1
  else
    echo "error: unknown argument: $1" >&2
    exit 1
  fi
  shift
done
3733

38-
if [[ "$SKIP_INSTALL" -eq 1 ]]; then
39-
echo "==> Skipping venv recreate and pip installs (reuse $VENV_DIR)"
40-
if [[ ! -f "$VENV_DIR/bin/activate" ]]; then
41-
echo "error: venv not found at $VENV_DIR — run once without --skip-install first." >&2
42-
exit 1
43-
fi
44-
# shellcheck source=/dev/null
45-
. "$VENV_DIR/bin/activate"
34+
# Print the torch version and its CUDA version, the path deepspeed was
# imported from, and the deepspeed version, using whichever `python` is
# first on PATH.
print_versions() {
  local report
  report="import torch, deepspeed
print('torch', torch.__version__, 'cuda', torch.version.cuda)
print('deepspeed', deepspeed.__file__)
print('deepspeed version', deepspeed.__version__)"
  python -c "$report"
}
37+
38+
# Prints 1 when a full setup is needed (wipe + venv + pip) and 0 when the
# existing venv should only be activated.
# Reads globals: force, skip_only, VENV_SH.
# Precedence: force wins over skip_only; with neither set, the presence of
# $VENV_SH decides.
decide_full_setup() {
  if [[ "$force" -eq 1 ]]; then
    echo 1
    return
  fi
  if [[ "$skip_only" -eq 1 || -f "$VENV_SH" ]]; then
    echo 0
    return
  fi
  echo 1
}
50+
51+
# Prepare and activate the venv at $VENV_DIR, then print the installed versions.
# When decide_full_setup prints 1, the venv directory is deleted, recreated
# with python3, and the packages below are installed with pip. Otherwise the
# existing venv is only activated.
# Reads globals: VENV_DIR, VENV_SH.
# Exits 1 when reuse was selected but $VENV_SH does not exist.
setup_venv() {
  local do_full
  do_full="$(decide_full_setup)"

  if [[ "$do_full" -ne 0 ]]; then
    echo "==> Creating venv at $VENV_DIR"
    rm -rf "$VENV_DIR"
    mkdir -p "$(dirname "$VENV_DIR")"
    python3 -m venv "$VENV_DIR"
  elif [[ -f "$VENV_SH" ]]; then
    echo "==> Reusing venv $VENV_DIR (use --force-install to reinstall)"
  else
    echo "error: no venv at $VENV_DIR (drop --skip-install or run once without it)" >&2
    exit 1
  fi

  # shellcheck source=/dev/null
  source "$VENV_SH"

  if [[ "$do_full" -eq 1 ]]; then
    # A Python version other than 3.11 only produces a warning; installation continues.
    if ! python -c 'import sys; assert sys.version_info[:2] == (3, 11), "Use Python 3.11 to match the bug report"'; then
      echo "Warning: expected Python 3.11; found $(python -V)" >&2
    fi
    pip install -U pip setuptools wheel
    # torch comes from the cu128 wheel index, so it is installed separately.
    pip install "torch==2.8.0" --index-url https://download.pytorch.org/whl/cu128
    pip install -r requirements/requirements.txt
    # Editable install of the current checkout.
    pip install -e .
    pip install pytest
  fi

  print_versions
}
6384

64-
pip install -r requirements/requirements.txt
85+
#######################################
# Run the repro script on the current checkout, then on $MAIN_REF, and
# finally return to the starting checkout (re-applying any changes that were
# stashed along the way).
#
# The repro script is copied to a temp file first so the same file can be run
# after the working tree has been switched to $MAIN_REF.
#
# Globals:   ROOT (read), MAIN_REF (read)
# Outputs:   progress lines on stdout; errors and warnings on stderr
# Returns:   exits 1 if the repro script is missing or $MAIN_REF cannot be
#            checked out. If the caller has `set -e` active, a failing run on
#            the starting checkout aborts the script as well. A non-zero exit
#            from the $MAIN_REF run is only reported.
# Side effects: installs an EXIT trap that deletes the temp copy.
#######################################
run_repro_compare() {
  local REPRO_SRC="$ROOT/scripts/repro_pr7916.py"
  local REPRO_TMP FIX_BRANCH cleanup_cmd stash_before stash_after
  local STASHED=0 MAIN_EC=0
  local -a run=(torchrun --standalone --nproc_per_node=1)

  if [[ ! -f "$REPRO_SRC" ]]; then
    echo "error: missing $REPRO_SRC (need this file on the current branch)" >&2
    exit 1
  fi

  REPRO_TMP="$(mktemp /tmp/repro_pr7916_XXXXXX.py)"
  # The EXIT trap normally fires after this function has returned, when the
  # local REPRO_TMP no longer exists. Bake the (shell-quoted) path into the
  # trap command now instead of expanding the variable at exit time.
  printf -v cleanup_cmd 'rm -f -- %q' "$REPRO_TMP"
  trap "$cleanup_cmd" EXIT
  cp "$REPRO_SRC" "$REPRO_TMP"

  FIX_BRANCH="$(git rev-parse --abbrev-ref HEAD)"
  if [[ "$FIX_BRANCH" == "HEAD" ]]; then
    # Detached HEAD: remember the commit itself so the restore step below
    # returns to it rather than checking out the literal "HEAD".
    FIX_BRANCH="$(git rev-parse HEAD)"
  fi

  echo ""
  echo "==> [1/2] Repro on $FIX_BRANCH (expect OK)"
  "${run[@]}" "$REPRO_TMP"

  echo ""
  echo "==> [2/2] Repro on $MAIN_REF (expect setup_context RuntimeError on unfixed tree)"
  if [[ -n "$(git status --porcelain 2>/dev/null)" ]]; then
    echo "==> Stashing local changes for checkout..."
    # `git stash push` succeeds without creating an entry when there is
    # nothing it can save (e.g. only untracked files). Compare refs/stash
    # before and after so a later pop never touches an unrelated older stash.
    stash_before="$(git rev-parse --quiet --verify refs/stash || true)"
    git stash push -m "pr7916-setup: temp stash before main repro"
    stash_after="$(git rev-parse --quiet --verify refs/stash || true)"
    if [[ "$stash_after" != "$stash_before" ]]; then
      STASHED=1
    fi
  fi

  if ! git checkout "$MAIN_REF"; then
    echo "error: checkout $MAIN_REF failed" >&2
    if [[ "$STASHED" -eq 1 ]]; then
      git stash pop || echo "warning: stash pop failed — see git stash list" >&2
    fi
    exit 1
  fi

  # A failure here is the expected outcome, so capture the status instead of
  # letting errexit abort the script.
  "${run[@]}" "$REPRO_TMP" || MAIN_EC=$?

  if [[ "$MAIN_EC" -eq 0 ]]; then
    echo "warning: main-branch repro exited 0 (expected failure on unfixed tree)." >&2
  else
    echo "main-branch repro exited $MAIN_EC (non-zero expected for unfixed tree)."
  fi

  echo ""
  echo "==> Restoring $FIX_BRANCH"
  git checkout "$FIX_BRANCH"
  if [[ "$STASHED" -eq 1 ]]; then
    git stash pop || echo "warning: stash pop failed — see git stash list" >&2
  fi
}
122135

123-
if [[ "$STASHED" -eq 1 ]]; then
124-
echo "==> Restoring stashed local changes..."
125-
git stash pop || echo "warning: stash pop failed (resolve manually with git stash list)" >&2
126-
fi
136+
# Entry point: prepare the environment, run both repro passes, then print how
# to activate the venv afterwards.
setup_venv
run_repro_compare

printf '\n'
printf '%s\n' "Done. Activate: source $VENV_DIR/bin/activate"

0 commit comments

Comments
 (0)