|
| 1 | +#!/bin/bash |
| 2 | + |
# Usage:
#   ./algorithmic-efficiency/scoring/utils/slurm/run_submission.sh \
#     --submission_path submissions_algorithms/submissions/self_tuning/schedule_free_adamw_v2
#
# Note: --dry_run is true by default, which caps each job at
# MAX_GLOBAL_STEPS=10. To perform a full run, explicitly pass --dry_run false.
| 9 | + |
# Fail fast: abort on command errors (-e), on expansion of unset variables
# (-u), and when any stage of a pipeline fails (pipefail), so a failing
# `yq ... | tr ...` is not hidden by the last stage succeeding.
set -euo pipefail
# Trace every command as it is executed.
set -x

# --- Global Variables ---
SUBMISSION_PATH=""   # value of --submission_path (required)
DRY_RUN=true         # value of --dry_run; a dry run is the default
MAX_GLOBAL_STEPS=10  # step cap forwarded to run_jobs.sh; cleared for full runs
SUBMISSION_NAME=""   # basename of SUBMISSION_PATH
RULESET=""           # "self" or "external", derived from submission_info.yml
FRAMEWORK=""         # "jax" or "pytorch", read from submission_info.yml
ARRAY_RANGE=""       # sbatch --array range of the form "0-<num_jobs - 1>"
| 21 | + |
# --- Helper Functions ---
| 23 | + |
#######################################
# Ensure a `yq` executable is available, downloading it if necessary.
# When `yq` is not resolvable, fetches the latest mikefarah/yq release binary
# for the current OS/architecture into $HOME/.local/bin and prepends that
# directory to PATH. Does nothing when `yq` is already available.
# Globals:   HOME (read), PATH (modified and exported after an install)
# Arguments: none
# Outputs:   progress messages to stdout, errors to stderr
# Returns:   0 on success; exits 1 if neither curl nor wget exists or the
#            download fails
#######################################
install_yq() {
  if command -v yq &> /dev/null; then
    return 0
  fi

  local bin_dir="$HOME/.local/bin"
  local target="$bin_dir/yq"
  # Download next to the final location and rename on success, so an
  # interrupted or failed transfer never leaves a broken "yq" behind.
  local partial="$target.download.$$"

  echo "yq not found. Attempting to install locally to $bin_dir..."
  mkdir -p "$bin_dir"

  # Declared separately from the assignments so a failing command
  # substitution is not masked by the exit status of `local`.
  local os arch
  os=$(uname | tr '[:upper:]' '[:lower:]')
  arch=$(uname -m)
  # Map kernel architecture names onto the names used in release assets.
  case "$arch" in
    x86_64) arch="amd64" ;;
    aarch64) arch="arm64" ;;
  esac

  local yq_url="https://github.com/mikefarah/yq/releases/latest/download/yq_${os}_${arch}"
  local download_ok=true
  if command -v curl &> /dev/null; then
    # -f makes curl fail on HTTP errors instead of saving the error page
    # as if it were the binary.
    curl -fL "$yq_url" -o "$partial" || download_ok=false
  elif command -v wget &> /dev/null; then
    wget "$yq_url" -O "$partial" || download_ok=false
  else
    echo "Error: Neither curl nor wget found. Please install yq manually: https://github.com/mikefarah/yq" >&2
    exit 1
  fi

  if [[ "$download_ok" != true || ! -s "$partial" ]]; then
    rm -f -- "$partial"
    echo "Error: Failed to download yq from $yq_url. Please install yq manually: https://github.com/mikefarah/yq" >&2
    exit 1
  fi

  chmod +x "$partial"
  mv -f -- "$partial" "$target"
  export PATH="$bin_dir:$PATH"
  echo "yq installed successfully to $bin_dir"
}
| 49 | + |
#######################################
# Abort the script unless the given command is available.
# Arguments: $1 - name of the command to look up with `command -v`
# Outputs:   an error message on stderr when the command is missing
# Returns:   0 if the command is found; exits 1 otherwise
#######################################
check_command() {
  # ${1-} keeps a call without arguments from tripping `set -u`; the empty
  # name then simply fails the lookup below.
  local cmd_name="${1-}"
  if ! command -v "$cmd_name" &> /dev/null; then
    echo "Error: $cmd_name could not be found. Please install it." >&2
    exit 1
  fi
}
| 56 | + |
#######################################
# Validate the working directory and required tools before doing any work.
# Requires the current directory to be $HOME and to contain both the
# 'algorithmic-efficiency' and 'submissions_algorithms' directories, then
# makes sure yq (installing it if needed) and jq are available.
# Globals:   PWD, HOME (read)
# Arguments: none
# Outputs:   error messages on stderr
# Returns:   0 when the environment is usable; exits 1 otherwise
#######################################
verify_environment() {
  # -ef compares the directories themselves rather than their spelling, so
  # an equivalent $HOME (trailing slash, symlinked path) is not rejected.
  if [[ ! "$PWD" -ef "$HOME" ]]; then
    {
      echo "Error: This script must be run from your home directory ($HOME)."
      echo "Expected directory structure:"
      echo " $HOME/"
      echo " ├── algorithmic-efficiency/"
      echo " └── submissions_algorithms/"
    } >&2
    exit 1
  fi

  if [[ ! -d "algorithmic-efficiency" || ! -d "submissions_algorithms" ]]; then
    {
      echo "Error: Required repositories not found in the current directory."
      echo "Please ensure both 'algorithmic-efficiency' and 'submissions_algorithms' are present in $HOME."
    } >&2
    exit 1
  fi

  install_yq
  check_command "jq"
}
| 76 | + |
#######################################
# Parse command-line flags into the script's global settings.
# Accepts "--submission_path VALUE" and "--dry_run VALUE" as well as the
# "--flag=VALUE" spelling. --dry_run is matched case-insensitively and must
# be "true" or "false".
# Globals:   SUBMISSION_PATH (written), DRY_RUN (written),
#            MAX_GLOBAL_STEPS (cleared when --dry_run is false)
# Arguments: the script's command-line arguments
# Outputs:   error messages on stderr
# Returns:   0 on success; exits 1 on an unknown flag, a flag without a
#            value, a missing --submission_path, or an invalid --dry_run
#######################################
parse_flags() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --submission_path | --dry_run)
        # Without this check `shift 2` fails on a trailing flag, which
        # either exits silently (set -e) or never consumes the argument.
        if [[ $# -lt 2 ]]; then
          echo "Error: $1 requires a value." >&2
          exit 1
        fi
        if [[ "$1" == "--submission_path" ]]; then
          SUBMISSION_PATH="$2"
        else
          DRY_RUN="$2"
        fi
        shift 2
        ;;
      --submission_path=*)
        SUBMISSION_PATH="${1#*=}"
        shift
        ;;
      --dry_run=*)
        DRY_RUN="${1#*=}"
        shift
        ;;
      *)
        echo "Unknown option $1" >&2
        exit 1
        ;;
    esac
  done

  if [[ -z "$SUBMISSION_PATH" ]]; then
    echo "Error: --submission_path is required." >&2
    exit 1
  fi

  # Normalise case so "False"/"TRUE" behave as intended, and reject anything
  # else instead of silently treating it as a dry run.
  DRY_RUN=$(printf '%s' "$DRY_RUN" | tr '[:upper:]' '[:lower:]')
  case "$DRY_RUN" in
    true) ;;
    false)
      # An empty value means no step limit is forwarded to the jobs.
      MAX_GLOBAL_STEPS=""
      ;;
    *)
      echo "Error: --dry_run must be 'true' or 'false' (got '$DRY_RUN')." >&2
      exit 1
      ;;
  esac
}
| 104 | + |
#######################################
# Read the submission's metadata and derive the run settings from it.
# Takes the submission name from the last component of SUBMISSION_PATH and
# reads the 'ruleset' and 'framework' keys of
# $SUBMISSION_PATH/submission_info.yml with yq (case-insensitively).
# Globals:   SUBMISSION_PATH, DRY_RUN, MAX_GLOBAL_STEPS (read);
#            SUBMISSION_NAME, RULESET, FRAMEWORK (written)
# Arguments: none
# Outputs:   a summary of the settings on stdout, errors on stderr
# Returns:   0 on success; exits 1 if the info file is missing, yq fails,
#            or either value is not recognised
#######################################
extract_submission_info() {
  SUBMISSION_NAME=$(basename "$SUBMISSION_PATH")
  local info_file="$SUBMISSION_PATH/submission_info.yml"

  if [[ ! -f "$info_file" ]]; then
    echo "Error: $info_file not found." >&2
    exit 1
  fi

  # Query yq on its own (not as part of a pipeline and not inside `local`)
  # so that a yq failure is detected instead of being hidden.
  local raw_ruleset raw_framework
  if ! raw_ruleset=$(yq eval '.ruleset' "$info_file"); then
    echo "Error: Failed to read 'ruleset' from $info_file." >&2
    exit 1
  fi
  if ! raw_framework=$(yq eval '.framework' "$info_file"); then
    echo "Error: Failed to read 'framework' from $info_file." >&2
    exit 1
  fi
  raw_ruleset=$(printf '%s' "$raw_ruleset" | tr '[:upper:]' '[:lower:]')
  FRAMEWORK=$(printf '%s' "$raw_framework" | tr '[:upper:]' '[:lower:]')

  # Parse ruleset by checking for substrings "self" or "external"
  if [[ "$raw_ruleset" == *"self"* ]]; then
    RULESET="self"
  elif [[ "$raw_ruleset" == *"external"* ]]; then
    RULESET="external"
  else
    echo "Error: Expected 'ruleset' in $info_file to contain 'self' or 'external' (got '$raw_ruleset')." >&2
    exit 1
  fi

  # Verify framework
  if [[ "$FRAMEWORK" != "jax" && "$FRAMEWORK" != "pytorch" ]]; then
    echo "Error: 'framework' in $info_file must be either 'jax' or 'pytorch' (got '$FRAMEWORK')." >&2
    exit 1
  fi

  echo "Submission Name: $SUBMISSION_NAME"
  echo "Ruleset: $RULESET"
  echo "Framework: $FRAMEWORK"
  echo "Dry Run: $DRY_RUN"
  echo "Max Global Steps: $MAX_GLOBAL_STEPS"
}
| 139 | + |
#######################################
# Generate the job configuration for the submission.
# Runs make_job_config.py inside the framework's AlgoPerf Docker image with
# the current directory mounted at /algorithmic-efficiency, then renames the
# config.json found in the current directory to "<SUBMISSION_NAME>.json".
# Globals:   DRY_RUN, FRAMEWORK, RULESET, SUBMISSION_PATH, SUBMISSION_NAME
#            (all read)
# Arguments: none
# Outputs:   whatever docker prints; errors on stderr
# Returns:   0 on success; exits 1 if docker fails or no config.json results
#######################################
generate_config() {
  local exp_prefix="submissions_a100_dry_run"
  if [[ "$DRY_RUN" == false ]]; then
    exp_prefix="submissions_a100"
  fi

  local image="europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/algoperf_${FRAMEWORK}_main:latest"
  # NOTE(review): make_job_config.py is assumed to write config.json into its
  # working directory (the mounted current directory) — inferred from the
  # rename below; confirm against that script.
  local generated="config.json"

  # Remove any leftover from an earlier run so a stale file can never be
  # mistaken for the output of this invocation.
  rm -f -- "$generated"

  if ! docker run \
    --rm \
    -v "$(pwd)":/algorithmic-efficiency \
    -w /algorithmic-efficiency \
    --entrypoint python \
    "$image" \
    algorithmic-efficiency/scoring/utils/slurm/make_job_config.py \
    --framework="$FRAMEWORK" \
    --tuning_ruleset="$RULESET" \
    --submission_path="$SUBMISSION_PATH/submission.py" \
    --experiment_dir="${exp_prefix}/$SUBMISSION_NAME"; then
    echo "Error: Failed to generate the job config with $image." >&2
    exit 1
  fi

  if [[ ! -f "$generated" ]]; then
    echo "Error: Expected $generated was not created in $(pwd)." >&2
    exit 1
  fi

  mv -- "$generated" "$SUBMISSION_NAME.json"
}
| 160 | + |
#######################################
# Compute the sbatch array range from the generated job config and create
# the directory used for job logs.
# The job count is the `jq 'length'` of "<SUBMISSION_NAME>.json".
# Globals:   SUBMISSION_NAME (read), ARRAY_RANGE (written)
# Arguments: none
# Outputs:   job count and array range on stdout, errors on stderr
# Returns:   0 on success; exits 1 if jq fails, prints something that is
#            not a non-negative integer, or reports zero jobs
#######################################
prepare_sbatch_array() {
  local config_file="$SUBMISSION_NAME.json"

  # Assign separately from `local` so a jq failure is not masked and is not
  # misreported as "No jobs found".
  local num_jobs
  if ! num_jobs=$(jq 'length' "$config_file"); then
    echo "Error: Failed to read the number of jobs from $config_file." >&2
    exit 1
  fi

  # Guard the arithmetic below against anything that is not a plain count.
  if [[ ! "$num_jobs" =~ ^[0-9]+$ ]]; then
    echo "Error: Unexpected job count '$num_jobs' in $config_file." >&2
    exit 1
  fi

  if [[ "$num_jobs" -eq 0 ]]; then
    echo "Error: No jobs found in $SUBMISSION_NAME.json." >&2
    exit 1
  fi

  # Array indices are zero-based, one per job.
  ARRAY_RANGE="0-$((num_jobs - 1))"
  echo "Number of jobs: $num_jobs"
  echo "Sbatch array range: $ARRAY_RANGE"

  mkdir -p "experiments/tests/$SUBMISSION_NAME"
}
| 174 | + |
#######################################
# Submit the job array to Slurm.
# Invokes sbatch on run_jobs.sh with the computed array range, per-task
# stdout/stderr files under experiments/tests/<SUBMISSION_NAME>, the absolute
# path of the generated config and the framework's Docker image.
# Globals:   ARRAY_RANGE, SUBMISSION_NAME, FRAMEWORK, MAX_GLOBAL_STEPS (read)
# Arguments: none
# Returns:   the exit status of sbatch
#######################################
run_sbatch() {
  local log_dir="experiments/tests/$SUBMISSION_NAME"
  local container_image="europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/algoperf_${FRAMEWORK}_main:latest"

  # Everything before the script path is an sbatch option; everything after
  # it is passed through to run_jobs.sh.
  local -a submit_args=(
    --array="$ARRAY_RANGE"
    --output="$log_dir/job_%A_%a.out"
    --error="$log_dir/job_%A_%a.err"
    "algorithmic-efficiency/scoring/utils/slurm/run_jobs.sh"
    --config_file "$(pwd)/$SUBMISSION_NAME.json"
    --image "$container_image"
  )

  # The step limit is forwarded only when one is set.
  [[ -z "$MAX_GLOBAL_STEPS" ]] || submit_args+=(--max_global_steps "$MAX_GLOBAL_STEPS")

  sbatch "${submit_args[@]}"
}
| 192 | + |
# --- Main ---
| 194 | + |
#######################################
# Run the whole submission workflow.
# Arguments: the script's command-line arguments
# Returns:   the status of the last step; earlier steps exit on failure
#######################################
main() {
  # Validate the arguments first: verify_environment may download yq, and a
  # mistyped invocation should fail before any such side effect happens.
  parse_flags "$@"
  verify_environment
  extract_submission_info
  generate_config
  prepare_sbatch_array
  run_sbatch
}
| 203 | + |
# Script entry point: forward all command-line arguments to main.
main "$@"
0 commit comments