From 56af57547b68dd9d2fff85c5c1071fc988c61647 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 15 Jun 2026 10:29:51 +0800 Subject: [PATCH 01/38] research: add autoconfig POC with QNN NPU catalog sweep results MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds research/autoconfig/ — an automated config search POC that sweeps opset versions (17-21), execution providers, and graph optimizations to find the best winml-cli build config for a given model on Windows hardware. Key findings from 8-model QNN NPU catalog sweep: - npu-001: opset 21 bypass gives +25-31% on Conv+residual models (MobileViT, DINOv2) - npu-006: conv fusions (conv-bn/add/activation) cause 4900% regression on ResNet-18 QNN NPU - npu-007: DVFS thermal noise requires session-level averaging (3x500 iters) for reliable results Includes ep_knowledge/ KB with confirmed findings per EP, and catalog-qnn-sweep/ with per-model benchmark results and cross-model pattern analysis. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- pyproject.toml | 2 + research/autoconfig/README.md | 220 ++++ research/autoconfig/analyze_graph.py | 172 +++ research/autoconfig/autoconfig.py | 1001 +++++++++++++++++ research/autoconfig/autoconfig_diagram.html | 573 ++++++++++ .../autoconfig/catalog-qnn-sweep/SUMMARY.md | 268 +++++ .../apple--mobilevit-small/results.json | 138 +++ .../deepset--roberta-base-squad2/results.json | 106 ++ .../results.json | 124 ++ .../facebook--dinov2-small/results.json | 109 ++ .../google--vit-base-patch16-224/results.json | 96 ++ .../hustvl--yolos-small/results.json | 79 ++ .../microsoft--resnet-18/results.json | 124 ++ .../results.json | 123 ++ research/autoconfig/catalog_qnn_sweep.py | 881 +++++++++++++++ research/autoconfig/ep_knowledge/README.md | 25 + research/autoconfig/ep_knowledge/cpu.json | 126 +++ research/autoconfig/ep_knowledge/dml.json | 104 ++ research/autoconfig/ep_knowledge/qnn_gpu.json | 115 ++ research/autoconfig/ep_knowledge/qnn_npu.json | 203 ++++ research/autoconfig/gen_report_v3.py | 338 ++++++ 21 files changed, 4927 insertions(+) create mode 100644 research/autoconfig/README.md create mode 100644 research/autoconfig/analyze_graph.py create mode 100644 research/autoconfig/autoconfig.py create mode 100644 research/autoconfig/autoconfig_diagram.html create mode 100644 research/autoconfig/catalog-qnn-sweep/SUMMARY.md create mode 100644 research/autoconfig/catalog-qnn-sweep/apple--mobilevit-small/results.json create mode 100644 research/autoconfig/catalog-qnn-sweep/deepset--roberta-base-squad2/results.json create mode 100644 research/autoconfig/catalog-qnn-sweep/distilbert--distilbert-base-uncased-finetuned-sst-2-english/results.json create mode 100644 research/autoconfig/catalog-qnn-sweep/facebook--dinov2-small/results.json create mode 100644 research/autoconfig/catalog-qnn-sweep/google--vit-base-patch16-224/results.json create mode 100644 research/autoconfig/catalog-qnn-sweep/hustvl--yolos-small/results.json create mode 100644 research/autoconfig/catalog-qnn-sweep/microsoft--resnet-18/results.json create mode 100644 research/autoconfig/catalog-qnn-sweep/sentence-transformers--all-MiniLM-L6-v2/results.json create mode 100644 research/autoconfig/catalog_qnn_sweep.py create mode 100644 research/autoconfig/ep_knowledge/README.md create mode 100644 research/autoconfig/ep_knowledge/cpu.json create mode 100644 research/autoconfig/ep_knowledge/dml.json create mode 100644 research/autoconfig/ep_knowledge/qnn_gpu.json create mode 100644 research/autoconfig/ep_knowledge/qnn_npu.json create mode 100644 research/autoconfig/gen_report_v3.py diff --git a/pyproject.toml b/pyproject.toml index 20aada0df..597f1de0a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -298,6 +298,8 @@ lint.per-file-ignores."tests/**" = [ "ANN", "D", "PLR2004", "PT", "S101", "T20" lint.per-file-ignores."tests/**/generate_patterns.py" = [ "PERF401" ] # Generated opset code: Allow long lines lint.per-file-ignores."src/winml/modelkit/analyze/onnx_opset/**" = [ "D", "E501", "N802", "N803", "N806", "TC001", "TC002", "TC003" ] +# Research scripts: POC code, not production — exempt from all style/type/security rules +lint.per-file-ignores."research/**" = [ "ANN", "D", "E", "N", "S", "T20", "UP", "W", "B", "C4", "FA", "I", "PERF", "PIE", "PT", "PTH", "RET", "RSE", "RUF", "SIM", "TCH", "TID", "TRY", "G", "ICN", "E402", "E501", "F401", "F403", "F811" ] # === Import Conventions === lint.flake8-bandit.check-typed-exception = true lint.flake8-bandit.hardcoded-tmp-directory = [ "/tmp", "/var/tmp", "C:\\Temp" ] diff --git a/research/autoconfig/README.md b/research/autoconfig/README.md new file mode 100644 index 000000000..2d37f0e70 --- /dev/null +++ b/research/autoconfig/README.md @@ -0,0 +1,220 @@ +# autoconfig — Automated Config Search POC + +**Status: Research POC — not production code.** + +This directory contains an experimental automated search system that finds the optimal +`winml-cli` build configuration (execution provider, opset version, graph optimizations) +for a given model on Windows hardware — without requiring the user to understand the +underlying ORT/EP optimizer mechanics. + +--- + +## What This Is + +`autoconfig.py` implements an Explorer/Optimizer/Reviewer loop: + +1. **Explorer** — proposes the next hypothesis (opset, EP flags, graph passes) by reading + `ep_knowledge/` to prune already-refuted configurations +2. **Optimizer** — runs `winml build` + `winml perf` (two-phase: 200-iter CV screen → 3×500-iter full bench) +3. **Reviewer** — evaluates the result, updates the knowledge base, and decides keep/discard + +The loop terminates after 30 consecutive discards (plateau detection) or a time budget. + +`catalog_qnn_sweep.py` is a generalized multi-model sweep that tests a fixed hypothesis +matrix (h0–h5: baseline, opset 17–21, conv fusions) across a catalog of models on the +QNN NPU, collecting structured results in `catalog-qnn-sweep//results.json`. + +`analyze_graph.py` is an ONNX graph analysis helper that identifies architectural +patterns relevant to EP optimization (Transpose sandwiches, residual branches, GELU +variants, depthwise Conv) and surfaces gaps in `winml analyze` output. + +`gen_report_v3.py` generates an HTML sweep report from `results.json` files. + +`autoconfig_diagram.html` is an interactive architecture diagram of the Explorer/Optimizer/ +Reviewer loop. + +--- + +## Key Findings — 8-Model QNN NPU Catalog Sweep (2026-06-13) + +### npu-001: opset 21 NHWC bypass is real — but architecture-specific + +Opset ≥ 21 bypasses ORT's NHWC layout transformer for QNN EP, giving a large speedup +on **Conv + residual** models but no benefit (or slight regression) on pure transformers: + +| Architecture | Models | opset 21 vs opset 17 | +|---|---|---| +| Conv + residual | MobileViT-small, DINOv2-small | **+26–31% speedup** | +| Pure transformer | ViT-base, YOLOS-small | neutral / slight regression | +| BERT-family NLP | DistilBERT, MiniLM, RoBERTa | neutral (within DVFS noise) | +| Plain Conv (ResNet) | ResNet-18 | ~+20% (h1→h3), but DVFS-dominated | + +Root cause: ORT's `IsSupportedOpset()` gate in `layout_transformation.cc` causes the +NHWC layout transform to insert Transpose nodes around Conv ops. For Conv+residual +models these Transposes cannot be cancelled, so bypassing the transform (opset 21) gives +a cleaner HTP graph. Pure attention models have no Conv→NHWC transposes, so the bypass +has no effect. + +### npu-006: Conv fusions cause ~4900% regression on QNN NPU for Conv-dominant models + +`conv_bn_fusion`, `conv_add_fusion`, `conv_activation_fusion` produce fused op nodes +that QNN EP cannot execute natively — falling back to CPU for every fused Conv: + +| Model | h4 (conv fusions) vs h1 (baseline) | +|---|---| +| ResNet-18 | **132.3 ms vs 2.72 ms (+4764% regression)** | +| MobileViT-small | 11.36 ms vs 11.72 ms (neutral) | +| DistilBERT | 19.59 ms vs 19.5 ms (neutral — no Conv to fuse) | + +This is a critical correctness/performance hazard. `winml` should detect when the target +EP would CPU-fallback fused Conv ops and suppress incompatible fusions automatically +(see [Feature Gaps](#feature-gaps)). + +### npu-007: DVFS thermal noise requires session-level averaging for reliable results + +QNN NPU exhibits extreme DVFS thermal throttling. CV is consistently 0.10–2.0+ across +all models. Practical implications: + +- The CV < 15% Phase-A gate must be **disabled** for QNN NPU (blocks all models) +- Differences < 10% between configs are **unreliable** without ≥ 1500 total iterations +- Recommended protocol: **3 × 500-iter sessions** with 30 s cool-down; report median of + session p50 values +- 30 s cool-down reduces but does not eliminate DVFS spikes + +--- + +## How to Run + +### Prerequisites + +- `winml` CLI installed and on PATH +- Python 3.11+ with `onnx` package (`pip install onnx`) +- For QNN experiments: Snapdragon X Elite device with QNN SDK (Hexagon HTP driver) + +### autoconfig.py — single-model adaptive search + +Configured at the top of the file (edit `MODEL_ID`, `TASK`, `EP`, `DEVICE`, `WORK_DIR`): + +```bash +# Default: facebook/convnext-tiny-224 on CPU +python autoconfig.py +``` + +Results are written to `WORK_DIR/results.tsv` and per-hypothesis subdirectories. +The script reads `ep_knowledge/.json` to prune already-refuted configurations. + +### catalog_qnn_sweep.py — multi-model QNN NPU sweep + +```bash +# Full catalog sweep (all 8 models, ~6-8 hours on X Elite) +python catalog_qnn_sweep.py + +# Single model +python catalog_qnn_sweep.py --model microsoft/resnet-18 + +# Show available models +python catalog_qnn_sweep.py --list +``` + +Results land in `catalog-qnn-sweep//results.json` and a `SUMMARY.md` is +regenerated at the end of each sweep. + +### analyze_graph.py — ONNX graph analysis + +```bash +# Edit the onnx path at the top of the file, then: +python analyze_graph.py +``` + +Prints Transpose patterns, residual branch structure, GELU variants, and op domain +breakdown to stdout. + +--- + +## ep_knowledge/ — Empirical Knowledge Base + +Each JSON file stores empirical findings for one EP/device combination: + +| File | EP/device | +|---|---| +| `cpu.json` | CPU EP (Snapdragon X Elite Oryon) | +| `dml.json` | DirectML EP | +| `qnn_gpu.json` | QNN Adreno GPU | +| `qnn_npu.json` | QNN HTP (Hexagon NPU) — most findings here | + +### Schema overview + +Each file has a `findings` array. Each finding has: + +```json +{ + "id": "npu-001", + "title": "...", + "mechanism_confirmed": true, + "architecture_requirement": ["has_conv_ops", "has_residual_connections"], + "status": "confirmed", + "confidence": "high" +} +``` + +And a `search_space_rules` object that `autoconfig.py` reads to prune configurations +(only findings with `"mechanism_confirmed": true` are applied as pruning rules). + +### Adding a new finding + +1. Run the experiment and collect bench data +2. Add an entry to the appropriate `ep_knowledge/.json` under `findings` +3. Set `"mechanism_confirmed": false` and `"confidence": "draft"` until the mechanism + is understood from ORT/EP source code +4. If the finding prunes a search dimension, add a rule under `search_space_rules` +5. Set `"mechanism_confirmed": true` only after source code investigation confirms + the root cause — do NOT promote to confirmed based on benchmark numbers alone +6. See `ep_knowledge/README.md` for the epistemics guidelines + +--- + +## Feature Gaps Identified + +Three actionable gaps in `winml-cli` surfaced by this research: + +1. **FusedConv detection in `winml analyze`** — `analyze` should detect Conv ops that + would CPU-fallback on QNN NPU after fusion (npu-006), and either warn or suppress + incompatible fusions in the generated build config. + +2. **DVFS-aware perf** — `winml perf` should support `--thermal-stabilization` mode + that waits for device temperature to stabilize before measurements, and should report + confidence intervals rather than a single p50. + +3. **Budget-aware sweep** — `catalog_qnn_sweep.py` exhausts the 20-min budget on models + > 50 ms baseline after just 2 hypotheses (YOLOS: 78 ms × 3×500 iters = 207 s/hypothesis). + A `--quick` flag that reduces to 1×200-iter for large models is needed. + +--- + +## Directory Layout + +``` +research/autoconfig/ +├── README.md ← this file +├── autoconfig.py ← adaptive single-model config search loop +├── catalog_qnn_sweep.py ← fixed-hypothesis multi-model QNN sweep +├── analyze_graph.py ← ONNX graph pattern analysis helper +├── autoconfig_diagram.html ← Explorer/Optimizer/Reviewer architecture diagram +├── gen_report_v3.py ← HTML report generator for sweep results +├── ep_knowledge/ +│ ├── README.md ← epistemics guidelines and KB format +│ ├── cpu.json ← CPU EP findings (ConvNext, 6 findings) +│ ├── dml.json ← DirectML EP findings +│ ├── qnn_gpu.json ← QNN Adreno GPU findings +│ └── qnn_npu.json ← QNN HTP NPU findings (npu-001 through npu-007) +└── catalog-qnn-sweep/ + ├── SUMMARY.md ← 8-model sweep results and cross-model analysis + ├── apple--mobilevit-small/results.json + ├── facebook--dinov2-small/results.json + ├── microsoft--resnet-18/results.json + ├── google--vit-base-patch16-224/results.json + ├── deepset--roberta-base-squad2/results.json + ├── distilbert--distilbert-base-uncased-finetuned-sst-2-english/results.json + ├── sentence-transformers--all-MiniLM-L6-v2/results.json + └── hustvl--yolos-small/results.json +``` diff --git a/research/autoconfig/analyze_graph.py b/research/autoconfig/analyze_graph.py new file mode 100644 index 000000000..e57ff1032 --- /dev/null +++ b/research/autoconfig/analyze_graph.py @@ -0,0 +1,172 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +from collections import Counter + +import onnx + + +m = onnx.load(r"convnext-search\iter_00\export.onnx") +g = m.graph + +out2node = {} +for n in g.node: + for o in n.output: + out2node[o] = n + + +def consumers(node): + result = [] + for o in node.output: + for n in g.node: + if o in n.input: + result.append(n) + return result + + +def producer(inp): + return out2node.get(inp) + + +# ── 1. Block structure ──────────────────────────────────────── +print("=== ConvNext block structure (trace first DW-Conv forward) ===") +first_dw = next( + ( + n + for n in g.node + if n.op_type == "Conv" and next((a.i for a in n.attribute if a.name == "group"), 1) > 1 + ), + None, +) +cur = first_dw +for _ in range(14): + if cur is None: + break + c = consumers(cur) + c_types = [n.op_type for n in c] + print(f" {cur.op_type:25s} -> {c_types}") + if len(c) == 1: + cur = c[0] + elif len(c) > 1: + non_add = [n for n in c if n.op_type != "Add"] + cur = non_add[0] if non_add else c[0] + else: + break + +# ── 2. Transpose patterns ───────────────────────────────────── +print() +print("=== Transpose patterns (before -> Transpose -> after) ===") +trans_patterns = Counter() +for n in g.node: + if n.op_type == "Transpose": + c = consumers(n) + p = producer(n.input[0]) + before = p.op_type if p else "INPUT" + after = c[0].op_type if c else "OUTPUT" + trans_patterns[f"{before} -> Transpose -> {after}"] += 1 +for pat, cnt in trans_patterns.most_common(): + print(f" {cnt:3d}x {pat}") + +# ── 3. GELU variants ────────────────────────────────────────── +print() +print("=== GELU sub-patterns ===") +# Standard GELU: Mul -> Div -> Erf -> Add -> Mul -> Mul +gelu_standard = 0 +for n in g.node: + if n.op_type == "Erf": + p = producer(n.input[0]) + if p and p.op_type == "Div": + gelu_standard += 1 +print(f" Div->Erf (Erf-based GELU): {gelu_standard}") + +# Check for Sigmoid-based QuickGELU (x * sigmoid(1.702 * x)) +quick_gelu = 0 +for n in g.node: + if n.op_type == "Sigmoid": + c = consumers(n) + if c and c[0].op_type == "Mul": + quick_gelu += 1 +print(f" Sigmoid->Mul (QuickGELU candidate): {quick_gelu}") + +# ── 4. Downsampling blocks (stage transitions) ──────────────── +print() +print("=== Downsampling block pattern (LN->Conv 2x2 stride 2) ===") +down_blocks = 0 +for n in g.node: + if n.op_type == "Conv": + stride = next((list(a.ints) for a in n.attribute if a.name == "strides"), [1, 1]) + kernel = next((list(a.ints) for a in n.attribute if a.name == "kernel_shape"), []) + groups = next((a.i for a in n.attribute if a.name == "group"), 1) + if stride == [2, 2] and groups == 1: + p = producer(n.input[0]) + print(f" stride-2 Conv kernel={kernel} preceded_by={p.op_type if p else 'INPUT'}") + down_blocks += 1 + +# ── 5. Residual branches ────────────────────────────────────── +print() +print("=== Add nodes with 2 distinct producer op-types (residual candidates) ===") +residual_counter = Counter() +for n in g.node: + if n.op_type == "Add" and len(n.input) == 2: + p0 = producer(n.input[0]) + p1 = producer(n.input[1]) + t0 = p0.op_type if p0 else "INIT" + t1 = p1.op_type if p1 else "INIT" + if t0 != t1: + key = tuple(sorted([t0, t1])) + residual_counter[key] += 1 +for pair, cnt in residual_counter.most_common(): + print(f" {cnt:3d}x Add({pair[0]}, {pair[1]})") + +# ── 6. Node domain analysis ─────────────────────────────────── +print() +print("=== Op domains ===") +domains = Counter() +for n in g.node: + dom = n.domain if n.domain else "ai.onnx" + domains[dom] += 1 +for d, c in domains.most_common(): + print(f" {d}: {c} nodes") + +# ── 7. analyze gaps ─────────────────────────────────────────── +print() +print("=== Patterns winml analyze may miss ===") +# 1. Depthwise conv with large kernels (7x7 DW-Conv is ConvNext specific) +dw7x7 = sum( + 1 + for n in g.node + if n.op_type == "Conv" + and next((a.i for a in n.attribute if a.name == "group"), 1) > 1 + and next((list(a.ints) for a in n.attribute if a.name == "kernel_shape"), []) == [7, 7] +) +print(f" 7x7 DW-Conv (ConvNext pattern): {dw7x7}") +print(" -> analyze classifies as OP/ai.onnx/Conv (undifferentiated)") +print(" -> no distinction between DW-Conv and regular Conv EP support") + +# 2. Transpose wrapping every layer (NCHW<->NHWC conversion) +trans_total = sum(1 for n in g.node if n.op_type == "Transpose") +print(f" Transpose nodes total: {trans_total}") +print(" -> analyze reports as single OP/ai.onnx/Transpose") +print(" -> no detection of Transpose-sandwich (NCHW->NHWC->op->NCHW)") +print(" -> transpose-optimizer capability not reflected in analyze output") + +# 3. MatMul used as dense layer (not Gemm) - different EP kernel path +matmul_count = sum(1 for n in g.node if n.op_type == "MatMul") +print(f" MatMul (not Gemm): {matmul_count}") +print(" -> ConvNext uses MatMul for MLP (not Gemm), QNN handles differently") +print(" -> analyze does not distinguish MatMul-as-FC from MatMul-as-attention") + +# 4. LayerNormalization as a single op (already fused by PyTorch export) +ln_count = sum(1 for n in g.node if n.op_type == "LayerNormalization") +print(f" LayerNormalization (native op): {ln_count}") +print(" -> These are already fused (not the ReduceMean->Sub->... subgraph)") +print(" -> layer-norm-fusion capability targets the decomposed pattern") +print(" -> analyze should note these are ALREADY fused - no fusion needed") + +# 5. Erf-based GELU (not tagged as Gelu op, appears as com.microsoft/Gelu after fusion) +print(f" Erf-based GELU subgraphs (unfused): {gelu_standard}") +print(' -> analyze cannot detect "unfused GELU" as a pattern') +print(" -> gelu-fusion would convert these to com.microsoft/Gelu") +print(' -> no analyze rule for "fuseable_pattern: gelu_erf"') diff --git a/research/autoconfig/autoconfig.py b/research/autoconfig/autoconfig.py new file mode 100644 index 000000000..c7f37cbfe --- /dev/null +++ b/research/autoconfig/autoconfig.py @@ -0,0 +1,1001 @@ +#!/usr/bin/env python3 +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""autoconfig.py — AutoResearch-style optimize-pass search for winml-cli +Demo: facebook/convnext-tiny-224, CPU EP, FP32 + +Loop: hypothesize → winml build → quick-screen bench (CV gate) → + full bench (iter=1000×3) → eval → keep/discard → repeat + +Key design principles (from GPU Optimizer V2 + ConvNext lessons): + 1. Two-phase bench: 200-iter CV screen FIRST, full bench only if CV < 10% + 2. Use winml perf (NOT winml eval) for latency — eval includes HF preprocessing + 3. Mandatory external-research after 5 consecutive DISCARDs in same dimension + 4. Load ep_knowledge/*.json (only "confirmed" entries) to prune search space + 5. Per-experiment structured output: hypothesis/impl/parity/perf/analysis/decision + 6. Stop condition: 30 consecutive DISCARDs (not 5) +""" + +import copy +import csv +import json +import subprocess +import sys +import time +from datetime import datetime +from pathlib import Path + + +sys.stdout.reconfigure(encoding="utf-8", errors="replace") # type: ignore[attr-defined] + +# ── settings ───────────────────────────────────────────────────────────────── +MODEL_ID = "facebook/convnext-tiny-224" +TASK = "image-classification" +EP = "cpu" +DEVICE = "cpu" +WINML = str(Path(__file__).parent / ".venv" / "Scripts" / "winml.exe") +WORK_DIR = Path(__file__).parent / "convnext-search" +RESULTS_TSV = WORK_DIR / "results.tsv" +KB_DIR = Path(__file__).parent / "ep_knowledge" + +EVAL_SAMPLES = 50 # for accuracy gate +ACCURACY_FLOOR = 0.70 # cosine drop below this → discard +MIN_IMPROVEMENT = 0.01 # require ≥1% p50 improvement to KEEP + +# Bench protocol (two-phase, from GPU Optimizer V2) +SCREEN_WARMUP = 20 +SCREEN_ITERS = 200 +SCREEN_CV_MAX = 0.10 # Coefficient of Variation = std/p50; reject if > 10% +FULL_WARMUP = 50 +FULL_ITERS = 1000 +FULL_SESSIONS = 3 +COOL_DOWN_S = 60 # seconds between full-bench sessions + +# Stop conditions +STOP_CONSECUTIVE_DISCARDS = 30 # plateau stop +EXTERNAL_RESEARCH_TRIGGER = 5 # trigger after this many DISCARDs in same dimension + +# ── load ep_knowledge (confirmed entries only) ──────────────────────────────── + + +def load_ep_knowledge(ep: str) -> dict: + """Load confirmed KB entries for given EP. Only 'confirmed' status entries + are used to prune search space. 'draft' entries are informational only. + """ + kb_path = KB_DIR / f"{ep}.json" + if not kb_path.exists(): + return {"skip_passes": [], "skip_quantization": False, "notes": []} + + kb = json.loads(kb_path.read_text(encoding="utf-8")) + rules = kb.get("search_space_rules", {}) + skip_passes = [] + skip_quant = False + notes = [] + + # Only apply rules from confirmed findings + confirmed_ids = {f["id"] for f in kb.get("findings", []) if f.get("mechanism_confirmed", False)} + + for finding in kb.get("findings", []): + if finding["id"] not in confirmed_ids: + notes.append(f"[DRAFT] {finding['id']}: {finding['title'][:60]}…") + continue + action = finding.get("action_for_autoconfig", "") + if "skip" in action.lower() and "quantization" in action.lower(): + skip_quant = True + notes.append(f"[KB confirmed] Skip quantization: {finding['id']}") + if "skip" in action.lower() and "compile" in action.lower(): + notes.append(f"[KB confirmed] Skip compile: {finding['id']}") + + # Parse search_space_rules for passes to skip + graph_passes = rules.get("graph_passes", {}) + for p in graph_passes.get("skip", []): + skip_passes.append(p) + notes.append(f"[KB confirmed] Skip pass: {p}") + + return {"skip_passes": skip_passes, "skip_quantization": skip_quant, "notes": notes} + + +# ── baseline config ─────────────────────────────────────────────────────────── +BASELINE: dict = { + "export": { + "opset_version": 17, + "batch_size": 1, + "do_constant_folding": True, + "dynamo": False, + "input_tensors": [ + { + "name": "pixel_values", + "dtype": "float32", + "shape": [1, 3, 224, 224], + "value_range": [0, 1], + } + ], + "output_tensors": [{"name": "logits"}], + }, + "optim": {}, + "loader": { + "task": TASK, + "model_class": "AutoModelForImageClassification", + "model_type": "convnext", + }, + "eval": { + "task": TASK, + "dataset": {"path": "timm/mini-imagenet", "split": "test", "samples": EVAL_SAMPLES}, + }, +} + + +# ── hypothesis sequence ─────────────────────────────────────────────────────── +def h0_baseline(cfg: dict) -> dict: + """FP32 export, no extra fusions — reference point""" + cfg["optim"] = {} + return cfg + + +def h1_conv_fusions(cfg: dict) -> dict: + cfg["optim"] = {"conv-bn-fusion": True, "conv-add-fusion": True, "conv-activation-fusion": True} + return cfg + + +def h2_gelu_fusion(cfg: dict) -> dict: + cfg["optim"] = {**cfg["optim"], "gelu-fusion": True} + return cfg + + +def h3_add_layernorm(cfg: dict) -> dict: + cfg["optim"] = {**cfg["optim"], "layer-norm-fusion": True} + return cfg + + +def h4_add_matmul(cfg: dict) -> dict: + cfg["optim"] = {**cfg["optim"], "matmul-add-fusion": True} + return cfg + + +def h5_transpose_opt(cfg: dict) -> dict: + cfg["optim"] = {**cfg["optim"], "transpose-optimizer": True} + return cfg + + +def h6_opset21(cfg: dict) -> dict: + """Try opset 21 — may trigger kMaxSupportedOpset bypass on older ORT (see npu-001). + NOTE: This is a research hypothesis, not a confirmed optimization. Gate 2 required. + """ + cfg["export"]["opset_version"] = 21 + cfg["optim"] = {**cfg["optim"], "transpose-optimizer": True} + return cfg + + +HYPOTHESES: list[tuple[str, object, str]] = [ + # (label, patch_fn, search_dimension) + ("baseline: no fusions (FP32 reference)", h0_baseline, "baseline"), + ("conv fusions: bn+add+activation", h1_conv_fusions, "graph_pass"), + ("+ gelu-fusion", h2_gelu_fusion, "graph_pass"), + ("+ layer-norm-fusion", h3_add_layernorm, "graph_pass"), + ("+ matmul-add-fusion (MLP blocks)", h4_add_matmul, "graph_pass"), + ("+ transpose-optimizer", h5_transpose_opt, "graph_pass"), + ("opset=21 (kMaxSupportedOpset research)", h6_opset21, "opset"), +] + +# ── helpers ─────────────────────────────────────────────────────────────────── + + +def run(cmd: list[str], label: str = "") -> tuple[int, str, float]: + t0 = time.time() + print(f" >> {label or cmd[1]}") + result = subprocess.run(cmd, capture_output=True, text=True, encoding="utf-8", errors="replace") + elapsed = time.time() - t0 + status = "ok" if result.returncode == 0 else f"rc={result.returncode}" + print(f" done in {elapsed:.0f}s [{status}]") + if result.returncode != 0: + print(f" stderr: {(result.stderr or result.stdout or '')[-400:]}") + return result.returncode, result.stdout + result.stderr, elapsed + + +def build(cfg: dict, out_dir: Path) -> tuple[bool, str]: + out_dir.mkdir(parents=True, exist_ok=True) + cfg_path = out_dir / "config.json" + cfg_path.write_text(json.dumps(cfg, indent=2)) + rc, out, _ = run( + [ + WINML, + "build", + "-c", + str(cfg_path), + "-m", + MODEL_ID, + "-o", + str(out_dir), + "--ep", + EP, + "--device", + DEVICE, + "--no-quant", + "--no-compile", + ], + label="winml build", + ) + return rc == 0, out + + +def bench_phase_a(model_path: Path) -> tuple[float | None, float]: + """Phase A quick screen: 200 iters, check CV < SCREEN_CV_MAX. + Returns (p50_ms, cv). p50_ms=None means unstable (reject). + """ + out_json = model_path.parent / "screen_perf.json" + rc, _, _ = run( + [ + WINML, + "perf", + "-m", + str(model_path), + "--ep", + EP, + "--device", + DEVICE, + "--warmup", + str(SCREEN_WARMUP), + "--iterations", + str(SCREEN_ITERS), + "-o", + str(out_json), + ], + label=f"winml perf (screen, iter={SCREEN_ITERS})", + ) + if rc != 0 or not out_json.exists(): + return None, 999.0 + try: + data = json.loads(out_json.read_text()) + lat = data["latency_ms"] + p50 = lat["p50"] + std = lat["std"] + cv = std / p50 if p50 > 0 else 999.0 + print(f" screen: p50={p50:.1f}ms std={std:.1f}ms CV={cv:.2f}") + if cv > SCREEN_CV_MAX: + print(f" ⚠️ CV={cv:.2f} > {SCREEN_CV_MAX} — UNSTABLE, rejecting candidate") + return None, cv + return p50, cv + except Exception as e: + print(f" [warn] parse error: {e}") + return None, 999.0 + + +def bench_phase_b(model_path: Path, label: str) -> list[float]: + """Phase B full bench: 3 independent sessions × 1000 iters with cool-down. + Returns list of p50_ms values (one per session). + """ + p50s = [] + for session in range(1, FULL_SESSIONS + 1): + out_json = model_path.parent / f"full_perf_s{session}.json" + rc, _, _ = run( + [ + WINML, + "perf", + "-m", + str(model_path), + "--ep", + EP, + "--device", + DEVICE, + "--warmup", + str(FULL_WARMUP), + "--iterations", + str(FULL_ITERS), + "-o", + str(out_json), + ], + label=f"winml perf (full s{session}/{FULL_SESSIONS}, iter={FULL_ITERS})", + ) + if rc == 0 and out_json.exists(): + data = json.loads(out_json.read_text()) + p50 = data["latency_ms"]["p50"] + std = data["latency_ms"]["std"] + cv = std / p50 if p50 > 0 else 999.0 + print(f" full s{session}: p50={p50:.1f}ms std={std:.1f}ms CV={cv:.2f}") + p50s.append(p50) + if session < FULL_SESSIONS: + print(f" cooling down {COOL_DOWN_S}s …") + time.sleep(COOL_DOWN_S) + return p50s + + +def eval_accuracy(out_dir: Path) -> float | None: + """Run winml eval; return accuracy (top-1 or cosine). For latency: use bench_*.""" + model_path = out_dir / "model.onnx" + if not model_path.exists(): + return None + result_json = out_dir / "eval_result.json" + rc, _, _ = run( + [ + WINML, + "eval", + "-m", + str(model_path), + "--model-id", + MODEL_ID, + "--task", + TASK, + "--ep", + EP, + "--device", + DEVICE, + "--samples", + str(EVAL_SAMPLES), + "-o", + str(result_json), + ], + label="winml eval (accuracy gate)", + ) + if rc != 0 or not result_json.exists(): + return None + try: + data = json.loads(result_json.read_text()) + metrics = data.get("metrics", data) + acc = metrics.get("accuracy") + return float(acc) if acc is not None else None + except Exception as e: + print(f" [warn] parse error: {e}") + return None + + +def write_experiment_doc(exp_dir: Path, info: dict) -> None: + """Write per-experiment structured artifact (V2 pattern): + Hypothesis / Implementation / Parity / Perf / Analysis / Decision + """ + exp_dir.mkdir(parents=True, exist_ok=True) + doc = f"""# Experiment {info["iter"]:02d}: {info["label"]} + +## Hypothesis +{info.get("hypothesis", "(not recorded)")} + +## Implementation +- Config flags: `{info.get("optim_flags", "")}` +- Opset: `{info.get("opset", 17)}` +- Search dimension: `{info.get("dimension", "")}` + +## Parity (accuracy gate) +- Accuracy: `{info.get("accuracy", "N/A")}` +- Floor: `{ACCURACY_FLOOR}` +- Result: `{"PASS" if (info.get("accuracy") or 0) >= ACCURACY_FLOOR else "FAIL"}` + +## Performance +### Phase A (quick screen, {SCREEN_ITERS} iters) +- p50: `{info.get("screen_p50", "N/A")}ms` +- CV: `{info.get("screen_cv", "N/A")}` (threshold: {SCREEN_CV_MAX}) + +### Phase B (full bench, {FULL_ITERS}×{FULL_SESSIONS} sessions) +- p50 per session: `{info.get("full_p50s", [])}` +- Median p50: `{info.get("median_p50", "N/A")}ms` +- Baseline p50: `{info.get("baseline_p50", "N/A")}ms` +- Delta: `{info.get("delta_pct", "N/A")}` + +## Analysis +{info.get("analysis", "(auto-generated: no significant analysis)")} + +## Decision +**{info.get("status", "UNKNOWN").upper()}** + +Timestamp: {datetime.now().isoformat(timespec="seconds")} +""" + (exp_dir / "experiment.md").write_text(doc, encoding="utf-8") + + +def log(row: dict) -> None: + fields = [ + "iter", + "label", + "dimension", + "optim_flags", + "opset", + "accuracy", + "screen_p50_ms", + "median_p50_ms", + "baseline_p50_ms", + "delta_pct", + "cv", + "status", + "elapsed_s", + "timestamp", + ] + is_new = not RESULTS_TSV.exists() + with RESULTS_TSV.open("a", newline="", encoding="utf-8") as f: + w = csv.DictWriter(f, fieldnames=fields, delimiter="\t", extrasaction="ignore") + if is_new: + w.writeheader() + w.writerow(row) + + +def optim_flags(cfg: dict) -> str: + flags = [k for k, v in cfg.get("optim", {}).items() if v is True] + return ",".join(flags) if flags else "(none)" + + +# ── main loop ───────────────────────────────────────────────────────────────── + + +def main() -> None: + WORK_DIR.mkdir(parents=True, exist_ok=True) + + # Load EP knowledge (confirmed entries only) + kb = load_ep_knowledge(EP) + print(f"\n=== KB loaded for EP={EP} ===") + for note in kb["notes"]: + print(f" {note}") + + sep = "=" * 64 + print(f"\n{sep}") + print(f" autoconfig search -- {MODEL_ID}") + print(f" EP: {EP} eval_samples: {EVAL_SAMPLES} hypotheses: {len(HYPOTHESES)}") + print( + f" Bench: screen={SCREEN_ITERS} iters (CV<{SCREEN_CV_MAX}) → full={FULL_ITERS}×{FULL_SESSIONS}" + ) + print(f" Stop: {STOP_CONSECUTIVE_DISCARDS} consecutive DISCARDs OR budget") + print(f" External research trigger: after {EXTERNAL_RESEARCH_TRIGGER} DISCARDs same dimension") + print(f"{sep}\n") + + baseline_p50: float | None = None + best_p50 = float("inf") + best_label = "" + consecutive_discards = 0 + discard_by_dimension: dict[str, int] = {} + + for i, (label, patch_fn, dimension) in enumerate(HYPOTHESES): + iter_start = time.time() + print(f"\n{'--' * 32}") + print(f" iter {i} | {label} [{dimension}]") + print(f"{'--' * 32}") + + # Check KB skip_set (confirmed rules only) + flags_preview = optim_flags(patch_fn(copy.deepcopy(BASELINE))) # type: ignore[operator] + skip_reason = next( + (r for r in kb["skip_passes"] if any(f in flags_preview for f in r.split()[:2])), None + ) + if skip_reason: + print(f" ⏭️ skipped by KB confirmed rule: {skip_reason}") + continue + + cfg = patch_fn(copy.deepcopy(BASELINE)) # type: ignore[operator] + flags = optim_flags(cfg) + opset = cfg["export"]["opset_version"] + print(f" optim: {flags}") + print(f" opset: {opset}") + + out_dir = WORK_DIR / f"iter_{i:02d}" + exp_dir = WORK_DIR / "experiments" / f"{i:02d}_{dimension}" + ok, _ = build(cfg, out_dir) + + exp_info: dict = { + "iter": i, + "label": label, + "dimension": dimension, + "optim_flags": flags, + "opset": opset, + "hypothesis": label, + "baseline_p50": f"{baseline_p50:.1f}" if baseline_p50 else "N/A", + } + + if not ok: + status = "crash" + exp_info["analysis"] = "winml build failed — check build log" + else: + # Phase A: quick screen + screen_p50, screen_cv = bench_phase_a(out_dir / "model.onnx") + exp_info["screen_p50"] = f"{screen_p50:.1f}" if screen_p50 else "UNSTABLE" + exp_info["screen_cv"] = f"{screen_cv:.3f}" + + if screen_p50 is None: + status = "discard (unstable — CV too high)" + exp_info["analysis"] = ( + f"Phase A rejected: CV={screen_cv:.2f} > {SCREEN_CV_MAX}. Likely DVFS noise. Cool device and retry." + ) + else: + # Phase B: full bench + full_p50s = bench_phase_b(out_dir / "model.onnx", label) + if not full_p50s: + status = "crash (full bench failed)" + exp_info["analysis"] = "Phase B winml perf returned no data" + else: + median_p50 = sorted(full_p50s)[len(full_p50s) // 2] + exp_info["full_p50s"] = [f"{p:.1f}" for p in full_p50s] + exp_info["median_p50"] = f"{median_p50:.1f}" + + if baseline_p50 is None and i == 0: + baseline_p50 = median_p50 + exp_info["baseline_p50"] = f"{baseline_p50:.1f}" + + # Accuracy gate + accuracy = eval_accuracy(out_dir) + exp_info["accuracy"] = f"{accuracy:.4f}" if accuracy is not None else "N/A" + + if accuracy is not None and accuracy < ACCURACY_FLOOR: + status = f"discard (accuracy {accuracy:.4f} < floor {ACCURACY_FLOOR})" + exp_info["analysis"] = "Accuracy regression below floor" + elif baseline_p50 is not None and median_p50 > baseline_p50 * ( + 1 - MIN_IMPROVEMENT + ): + delta_pct = (median_p50 - baseline_p50) / baseline_p50 * 100 + status = f"discard (Δp50={delta_pct:+.1f}% < {MIN_IMPROVEMENT * 100:.0f}% threshold)" + exp_info["delta_pct"] = f"{delta_pct:+.1f}%" + exp_info["analysis"] = ( + f"No meaningful improvement: {delta_pct:+.1f}% vs {MIN_IMPROVEMENT * 100:.0f}% threshold" + ) + else: + delta_pct = ( + (median_p50 - (baseline_p50 or median_p50)) + / (baseline_p50 or median_p50) + * 100 + ) + status = "keep" + exp_info["delta_pct"] = f"{delta_pct:+.1f}%" + exp_info["analysis"] = ( + f"Improvement confirmed: p50 {baseline_p50:.1f}ms → {median_p50:.1f}ms ({delta_pct:+.1f}%)" + ) + if median_p50 < best_p50: + best_p50 = median_p50 + best_label = label + status = "keep *** NEW BEST ***" + + # Write per-experiment doc (V2 pattern) + exp_info["status"] = status + write_experiment_doc(exp_dir, exp_info) + + # Track consecutive discards + external research trigger + if "discard" in status or "crash" in status: + consecutive_discards += 1 + discard_by_dimension[dimension] = discard_by_dimension.get(dimension, 0) + 1 + if discard_by_dimension[dimension] == EXTERNAL_RESEARCH_TRIGGER: + print( + f"\n ⚡ EXTERNAL RESEARCH TRIGGER: {EXTERNAL_RESEARCH_TRIGGER} consecutive DISCARDs in [{dimension}]" + ) + print(" → Search ORT/QNN source code for mechanism before continuing") + print( + " → Check kMaxSupportedOpset for opset dimension, EP-specific rules for others" + ) + print(f" → File findings in ep_knowledge/{EP}.json as 'draft' entry") + else: + consecutive_discards = 0 + discard_by_dimension[dimension] = 0 + + # Log to TSV + log( + { + "iter": i, + "label": label, + "dimension": dimension, + "optim_flags": flags, + "opset": opset, + "accuracy": exp_info.get("accuracy", "N/A"), + "screen_p50_ms": exp_info.get("screen_p50", "N/A"), + "median_p50_ms": exp_info.get("median_p50", "N/A"), + "baseline_p50_ms": exp_info.get("baseline_p50", "N/A"), + "delta_pct": exp_info.get("delta_pct", "N/A"), + "cv": exp_info.get("screen_cv", "N/A"), + "status": status, + "elapsed_s": f"{time.time() - iter_start:.0f}", + "timestamp": datetime.now().isoformat(timespec="seconds"), + } + ) + + print(f" → {status}") + + # Stop condition + if consecutive_discards >= STOP_CONSECUTIVE_DISCARDS: + print( + f"\n 🛑 STOP: {STOP_CONSECUTIVE_DISCARDS} consecutive DISCARDs — plateau reached" + ) + break + + print(f"\n{sep}") + print(" SEARCH COMPLETE") + print(f" Best config: {best_label}") + print(f" Best p50: {best_p50:.1f}ms" if best_p50 < float("inf") else " No improvement found") + print(f" Results: {RESULTS_TSV}") + print(f" Experiments: {WORK_DIR / 'experiments'}") + print(f"{sep}\n") + + +if __name__ == "__main__": + main() + + +import sys +from pathlib import Path + + +sys.stdout.reconfigure(encoding="utf-8", errors="replace") # type: ignore[attr-defined] + +# ── settings ───────────────────────────────────────────────────────────────── +MODEL_ID = "facebook/convnext-tiny-224" +TASK = "image-classification" +EP = "cpu" +DEVICE = "cpu" +WINML = str(Path(__file__).parent / ".venv" / "Scripts" / "winml.exe") +WORK_DIR = Path(__file__).parent / "convnext-search" +RESULTS_TSV = WORK_DIR / "results.tsv" + +EVAL_SAMPLES = 50 # small for demo speed (~12s per eval) +ACCURACY_FLOOR = 0.70 # drop below this → discard (FP32 baseline ~78%) +LATENCY_FLOOR = 1.0 # seconds — more than this means regression + +# ── baseline config ─────────────────────────────────────────────────────────── +BASELINE: dict = { + "export": { + "opset_version": 17, + "batch_size": 1, + "do_constant_folding": True, + "dynamo": False, + "input_tensors": [ + { + "name": "pixel_values", + "dtype": "float32", + "shape": [1, 3, 224, 224], + "value_range": [0, 1], + } + ], + "output_tensors": [{"name": "logits"}], + }, + "optim": {}, # will be patched per hypothesis + "loader": { + "task": TASK, + "model_class": "AutoModelForImageClassification", + "model_type": "convnext", + }, + "eval": { + "task": TASK, + "dataset": {"path": "timm/mini-imagenet", "split": "test", "samples": EVAL_SAMPLES}, + }, +} + +# ── hypothesis sequence ─────────────────────────────────────────────────────── +# ConvNext-tiny architecture: +# Stem: Conv 4x4 + LN → 4 stages of ConvNext blocks +# Each block: DW-Conv → LN → Linear (=Gemm) → GELU → Linear +# Skip connections: pointwise Add +# +# Relevant fusions: +# conv-bn-fusion — conv+BatchNorm folding (stem/downsample layers) +# conv-add-fusion — conv+bias add (ConvNext uses DepthwiseConv with bias) +# gelu-fusion — fuse decomposed GELU → com.microsoft/Gelu +# layer-norm-fusion — fuse LN subgraph (ConvNext uses LayerNorm heavily) +# matmul-add-fusion — fuse Gemm+bias (the inverted bottleneck MLPs) +# transpose-optimizer — eliminate redundant transposes around reshape ops +# constant-folding — pre-fold constant subgraphs (on by default in export, +# but also at optim stage via ORT) + + +def h0_baseline(cfg: dict) -> dict: + """FP32 export, no extra fusions — reference point""" + cfg["optim"] = {} + return cfg + + +def h1_conv_fusions(cfg: dict) -> dict: + """Enable all conv fusions — ConvNext stem uses Conv+BN, blocks use DW-Conv+bias""" + cfg["optim"] = { + "conv-bn-fusion": True, + "conv-add-fusion": True, + "conv-activation-fusion": True, + } + return cfg + + +def h2_gelu_fusion(cfg: dict) -> dict: + """Add GELU fusion — ConvNext MLP blocks use GELU activation""" + cfg["optim"] = { + "conv-bn-fusion": True, + "conv-add-fusion": True, + "conv-activation-fusion": True, + "gelu-fusion": True, + } + return cfg + + +def h3_add_layernorm(cfg: dict) -> dict: + """Add LayerNorm fusion — ConvNext uses LN (not BN) in blocks""" + cfg["optim"] = { + "conv-bn-fusion": True, + "conv-add-fusion": True, + "conv-activation-fusion": True, + "gelu-fusion": True, + "layer-norm-fusion": True, + } + return cfg + + +def h4_add_matmul(cfg: dict) -> dict: + """Add MatMul+Add fusion — ConvNext MLP uses Gemm (collapsed MatMul+bias)""" + cfg["optim"] = { + "conv-bn-fusion": True, + "conv-add-fusion": True, + "conv-activation-fusion": True, + "gelu-fusion": True, + "layer-norm-fusion": True, + "matmul-add-fusion": True, + } + return cfg + + +def h5_transpose_opt(cfg: dict) -> dict: + """Add transpose optimizer — ConvNext has many Transpose ops (NCHW reshapes)""" + cfg["optim"] = { + "conv-bn-fusion": True, + "conv-add-fusion": True, + "conv-activation-fusion": True, + "gelu-fusion": True, + "layer-norm-fusion": True, + "matmul-add-fusion": True, + "transpose-optimizer": True, + } + return cfg + + +def h6_opset18(cfg: dict) -> dict: + """Try opset 18 with all fusions — GroupNorm introduced in opset18""" + cfg["export"]["opset_version"] = 18 + cfg["optim"] = { + "conv-bn-fusion": True, + "conv-add-fusion": True, + "conv-activation-fusion": True, + "gelu-fusion": True, + "layer-norm-fusion": True, + "matmul-add-fusion": True, + "transpose-optimizer": True, + } + return cfg + + +def h7_surgery(cfg: dict) -> dict: + """Add clamp-constant-values — prevents -inf attention mask quant issues""" + cfg["export"]["opset_version"] = 17 + cfg["optim"] = { + "conv-bn-fusion": True, + "conv-add-fusion": True, + "conv-activation-fusion": True, + "gelu-fusion": True, + "layer-norm-fusion": True, + "matmul-add-fusion": True, + "transpose-optimizer": True, + "clamp-constant-values": True, + } + return cfg + + +HYPOTHESES: list[tuple[str, object]] = [ + ("baseline: no fusions (FP32 reference)", h0_baseline), + ("conv fusions: bn+add+activation", h1_conv_fusions), + ("+ gelu-fusion", h2_gelu_fusion), + ("+ layer-norm-fusion", h3_add_layernorm), + ("+ matmul-add-fusion (MLP blocks)", h4_add_matmul), + ("+ transpose-optimizer", h5_transpose_opt), + ("opset=18 + all fusions", h6_opset18), + ("back to opset=17 + surgery: clamp-constant-values", h7_surgery), +] + +# ── helpers ─────────────────────────────────────────────────────────────────── + + +def run(cmd: list[str], label: str = "") -> tuple[int, str, float]: + t0 = time.time() + print(f" >> {label or cmd[1]}") + result = subprocess.run(cmd, capture_output=True, text=True, encoding="utf-8", errors="replace") + elapsed = time.time() - t0 + status = "ok" if result.returncode == 0 else f"rc={result.returncode}" + print(f" done in {elapsed:.0f}s [{status}]") + if result.returncode != 0: + tail = (result.stderr or result.stdout or "")[-600:] + print(f" stderr: {tail}") + return result.returncode, result.stdout + result.stderr, elapsed + + +def build(cfg: dict, out_dir: Path) -> tuple[bool, str]: + out_dir.mkdir(parents=True, exist_ok=True) + cfg_path = out_dir / "config.json" + cfg_path.write_text(json.dumps(cfg, indent=2)) + rc, out, _ = run( + [ + WINML, + "build", + "-c", + str(cfg_path), + "-m", + MODEL_ID, + "-o", + str(out_dir), + "--ep", + EP, + "--device", + DEVICE, + "--no-quant", + "--no-compile", + ], + label="winml build", + ) + return rc == 0, out + + +def eval_onnx(out_dir: Path) -> tuple[float | None, float | None]: + """Eval model.onnx; return (accuracy, latency_s).""" + model_path = out_dir / "model.onnx" + if not model_path.exists(): + print(" [warn] model.onnx not found") + return None, None + + result_json = out_dir / "eval_result.json" + rc, _, _ = run( + [ + WINML, + "eval", + "-m", + str(model_path), + "--model-id", + MODEL_ID, + "--task", + TASK, + "--ep", + EP, + "--device", + DEVICE, + "--samples", + str(EVAL_SAMPLES), + "-o", + str(result_json), + ], + label="winml eval", + ) + if rc != 0 or not result_json.exists(): + return None, None + try: + data = json.loads(result_json.read_text()) + metrics = data.get("metrics", data) + accuracy = metrics.get("accuracy") + latency = metrics.get("latency_in_seconds") + return ( + float(accuracy) if accuracy is not None else None, + float(latency) if latency is not None else None, + ) + except Exception as e: + print(f" [warn] parse error: {e}") + return None, None + + +def log(row: dict) -> None: + fields = [ + "iter", + "label", + "optim_flags", + "opset", + "accuracy", + "latency_ms", + "delta_acc", + "delta_lat_ms", + "status", + "elapsed_s", + "timestamp", + ] + is_new = not RESULTS_TSV.exists() + with RESULTS_TSV.open("a", newline="", encoding="utf-8") as f: + w = csv.DictWriter(f, fieldnames=fields, delimiter="\t", extrasaction="ignore") + if is_new: + w.writeheader() + w.writerow(row) + + +def optim_flags(cfg: dict) -> str: + flags = [k for k, v in cfg.get("optim", {}).items() if v is True] + return ",".join(flags) if flags else "(none)" + + +# ── main loop ───────────────────────────────────────────────────────────────── + + +def main() -> None: + WORK_DIR.mkdir(parents=True, exist_ok=True) + + sep = "=" * 62 + print(f"\n{sep}") + print(f" autoconfig search -- {MODEL_ID}") + print(f" EP: {EP} eval_samples: {EVAL_SAMPLES} hypotheses: {len(HYPOTHESES)}") + print(f" Objective: maximize accuracy (floor={ACCURACY_FLOOR})") + print(" Search space: WinMLOptimizationConfig capability flags") + print(f"{sep}\n") + + baseline_acc: float | None = None + baseline_lat: float | None = None + best_acc = 0.0 + best_lat = float("inf") + best_label = "" + total_start = time.time() + + for i, (label, patch_fn) in enumerate(HYPOTHESES): + iter_start = time.time() + print(f"\n{'--' * 31}") + print(f" iter {i} | {label}") + print(f"{'--' * 31}") + + cfg = patch_fn(copy.deepcopy(BASELINE)) # type: ignore[operator] + flags = optim_flags(cfg) + opset = cfg["export"]["opset_version"] + print(f" optim: {flags}") + print(f" opset: {opset}") + + out_dir = WORK_DIR / f"iter_{i:02d}" + ok, _ = build(cfg, out_dir) + if not ok: + status = "crash" + accuracy = latency = None + else: + accuracy, latency = eval_onnx(out_dir) + if accuracy is None: + status = "eval_error" + elif accuracy < ACCURACY_FLOOR: + status = "discard (accuracy < floor)" + elif latency is not None and latency > LATENCY_FLOOR: + status = "discard (latency regression)" + else: + status = "keep" + if accuracy > best_acc or (accuracy == best_acc and (latency or 999) < best_lat): + best_acc = accuracy + best_lat = latency or float("inf") + best_label = label + status = "keep *** NEW BEST ***" + + # Print result + if accuracy is not None: + lat_ms = f"{(latency or 0) * 1000:.0f}ms" if latency else "N/A" + print(f" accuracy={accuracy:.4f} latency={lat_ms} -> {status}") + if baseline_acc is None and i == 0: + baseline_acc = accuracy + baseline_lat = latency + if baseline_acc is not None and i > 0: + d_acc = accuracy - baseline_acc + d_lat = ((latency or 0) - (baseline_lat or 0)) * 1000 + sign_acc = "+" if d_acc >= 0 else "" + sign_lat = "+" if d_lat >= 0 else "" + print(f" vs baseline: acc {sign_acc}{d_acc:.4f} lat {sign_lat}{d_lat:.0f}ms") + else: + print(f" -> {status}") + + elapsed = time.time() - iter_start + delta_acc = ( + f"{accuracy - baseline_acc:+.4f}" + if (accuracy is not None and baseline_acc is not None) + else "N/A" + ) + delta_lat = ( + f"{((latency or 0) - (baseline_lat or 0)) * 1000:+.0f}" + if (latency is not None and baseline_lat is not None) + else "N/A" + ) + log( + { + "iter": i, + "label": label, + "optim_flags": flags, + "opset": opset, + "accuracy": f"{accuracy:.4f}" if accuracy is not None else "N/A", + "latency_ms": f"{(latency or 0) * 1000:.0f}" if latency is not None else "N/A", + "delta_acc": delta_acc, + "delta_lat_ms": delta_lat, + "status": status, + "elapsed_s": f"{elapsed:.0f}", + "timestamp": datetime.now().isoformat(timespec="seconds"), + } + ) + + total = time.time() - total_start + print(f"\n{sep}") + print(f" SEARCH COMPLETE | {total / 60:.1f} min total") + print(f" Best config: {best_label}") + print(f" Best accuracy: {best_acc:.4f} latency: {best_lat * 1000:.0f}ms") + print(f" Results: {RESULTS_TSV}") + print(f"{sep}\n") + + if RESULTS_TSV.exists(): + print(RESULTS_TSV.read_text(encoding="utf-8")) + + +if __name__ == "__main__": + main() diff --git a/research/autoconfig/autoconfig_diagram.html b/research/autoconfig/autoconfig_diagram.html new file mode 100644 index 000000000..0a60a4bbc --- /dev/null +++ b/research/autoconfig/autoconfig_diagram.html @@ -0,0 +1,573 @@ + + + + +autoconfig Skill — Architecture + + + + +

autoconfig — Skill Architecture

+

Profile-guided autonomous config search for WinApp developers

+ +
+ + +
+
👤
+
+ User input
+ Model ID  +  Target EP/device  +  Objective: + accuracy-primary + latency-primary + Pareto +   + optional constraints (latency budget, accuracy floor) +
+
+ +
+ + +
+
Phase 0 · Intake
+
+
+
🔍 Inspect
+
    +
  • winml inspect
  • +
  • Validate model is supported
  • +
  • Check EP availability
  • +
+
+
+
+
🏗️ Baseline Build
+
    +
  • winml build (default config)
  • +
  • opset=17, no quant
  • +
  • Output: baseline/model.onnx
  • +
+
+
+
+
Correctness Contract
+
    +
  • winml eval --mode compare
  • +
  • Lock cosine = 1.000
  • +
  • Record baseline p50
  • +
+
+
+
+ +
+ + +
+
Phase 1 · Insight
+
+ + +
+ +
+
📊 Runtime Profile
+
    +
  • winml perf --profile
  • +
  • ORT per-op kernel time
  • +
  • Bottleneck op type + %
  • +
  • Canonical vs decomposed ops
  • +
  • Layout ops (Reorder) activity
  • +
+
+ +
+
🔬 Static Analyzer
+
    +
  • winml analyze --ep <ep>
  • +
  • Partial-support ops list
  • +
  • EP fallback candidates
  • +
  • Quant-sensitive node names
  • +
  • EP-specific constraints
  • +
+
+ +
+
🗂️ Graph Analysis
+
    +
  • ONNX proto inspection
  • +
  • opset version (kMaxSupportedOpset check)
  • +
  • Op counts per type
  • +
  • Fusion opportunities (decomposed subgraphs)
  • +
  • Static shape vs dynamic axes
  • +
+
+ +
+ + +
+
+
Insight Engine — fuse 3 signals →
+
+
+ + +
+
+
🚫 skip_set (passes to prune)
+
    +
  • Gelu op present → skip gelu-fusion
  • +
  • LN op present → skip layer-norm-fusion
  • +
  • ReorderInput > 2% → skip nchwc-transformer
  • +
  • Transpose < 5% + opset=17 → skip transpose-opt
  • +
  • opset ≥ 19 + Transpose > 10% → flag [KNOWN_TRADEOFF]
  • +
  • Partial-op list empty → skip nodes_to_exclude trials
  • +
+
+
+
📋 priority_queue (ranked hypotheses)
+
    +
  • Gemm > 50% → quant precision, calib method first
  • +
  • Conv > 20% → nchwc, conv-fusions first
  • +
  • Partial ops exist → nodes_to_exclude exclusion trials
  • +
  • Decomposed Gelu subgraph → gelu-fusion trial
  • +
  • Dynamic axes → try static shape export
  • +
+
+
+ +
+
+ +
+ + +
+
Phase 2 · Opt Loop
+
+
+ + +
+ +
+
🔭 Explorer
+
    +
  • Pop next hypothesis from priority_queue
  • +
  • Check KB ep_knowledge/<ep>.json — skip if "confirmed" rule prunes it
  • +
  • Build config.json delta (opset / quant / fusions)
  • +
  • ⚡ External research trigger: 5 DISCARDs in same dimension → read ORT/QNN source code
  • +
+
+ +
+ +
+
⚙️ Optimizer
+
    +
  • winml build -c config.json
  • +
  • Phase A: winml perf --iter 200 → CV = std/p50
  • +
  • CV > 10%? → REJECT (DVFS noise) — do NOT run full bench
  • +
  • Phase B (if CV passes): winml perf --iter 1000 ×3, 60s cool-down
  • +
  • winml eval --samples 100 → accuracy gate
  • +
+
+ +
+ +
+
🔎 Reviewer
+
    +
  • keep — all 3 p50s better than baseline × 99% AND cosine ≥ floor
  • +
  • discard — revert to last kept config; write per-experiment .md
  • +
  • unstable — CV too high; log [UNSTABLE], do not count as DISCARD
  • +
  • Write KB draft entry if new mechanism observed (status="draft")
  • +
+
+ +
↩ loop back to Explorer  (until stop condition)
+ +
+ + +
+
+
🛑 Stop conditions
+
    +
  • Objective achieved
  • +
  • 30 consecutive DISCARDs (plateau)
  • +
  • priority_queue empty
  • +
  • User manually stops
  • +
+
+
+
📋 results.tsv
+ Every experiment:
+ config · screen_p50 · median_p50
+ CV · delta_pct · status · dim +
+
+
📚 ep_knowledge/
+ New entries written as
+ status="draft"
+ Promoted to "confirmed"
+ only after Gate 2 (source) +
+
+ +
+
+
+ +
+ + +
+
Phase 3 · Report
+
+
+
+
Champion Config
+ Best config with provenance metadata + config_<ep>_optimal.json +
+
+
HTML Report
+ Benchmark chart + experiment table + profile section + report.html +
+
+
Per-Experiment Artifacts
+ hypothesis/impl/parity/perf/analysis/decision + experiments/<n>/experiment.md +
+
+
KB Draft Entry
+ New findings (status="draft") — promoted to "confirmed" after Gate 2 + ep_knowledge/<ep>.json +
+
+
Manifest (multi-EP)
+ Ranked EP configs for WinApp deployment + manifest.json +
+
+
+
+ + +
+ Key insight (validated on ConvNext): + Profiler first → Gemm=57.7%, Transpose=2.6% → skip_set eliminates 16+ irrelevant pass experiments before search starts. + Estimated reduction: 22 experiments → ~6 with identical conclusions. +
+ Bench protocol (from GPU Optimizer V2): + Phase A: 200-iter CV screen (CV = std/p50 < 10%) gates Phase B. + Phase B: 3×1000-iter with 60s cool-down. KEEP only if all 3 sessions beat baseline × 99%. + Single 50-iter run is NOT sufficient — DVFS on mobile NPUs causes 2-10× run-to-run variance. +
+ External research trigger: After 5 DISCARDs in same search dimension → read ORT/QNN source code. + Lesson: opset 21 QNN NPU effect (kMaxSupportedOpset gate) was discovered accidentally. Systematic external-research would have found it after 5 graph-pass DISCARDs. +
+ Dependency: winml perf --profile (new flag); POC: winml_profile.py bridges until it ships. +
+ +
+ + diff --git a/research/autoconfig/catalog-qnn-sweep/SUMMARY.md b/research/autoconfig/catalog-qnn-sweep/SUMMARY.md new file mode 100644 index 000000000..1567c962c --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/SUMMARY.md @@ -0,0 +1,268 @@ +# QNN NPU Optimization Sweep — Catalog Models + +**Generated:** 2026-06-13 +**EP:** `qnn` / device: `npu` +**Bench protocol:** Phase-A 200-iter screen → Phase-B 3×500-iter full sessions (30s cool-down) +**Quant:** W8A16 (weight=uint8, activation=uint16) via `winml config --ep qnn --device npu` + +--- + +## Per-Model Results Summary + +| Model | Task | Baseline p50 | Best p50 | Best config | Gain% | npu-001 opset21? | +|-------|------|-------------|----------|-------------|-------|-----------------| +| `microsoft/resnet-18` | image-classification | 0.96 ms | 0.96 ms | h0 (baseline (auto-config W8A16, opset17)) | +0.0% | ✅ YES (+20.2%) | +| `google/vit-base-patch16-224` | image-classification | 9.04 ms | 9.04 ms | h0 (baseline (auto-config W8A16, opset17)) | +0.0% | ❌ NO (-7.4%) | +| `apple/mobilevit-small` | image-classification | 12.07 ms | 8.62 ms | h3 (opset 21) | +28.6% | ✅ YES (+26.5%) | +| `facebook/dinov2-small` | feature-extraction | 6.56 ms | 4.98 ms | h3 (opset 21) | +24.1% | ✅ YES (+30.6%) | +| `hustvl/yolos-small` | object-detection | 78.69 ms | 78.69 ms | h0 (baseline (auto-config W8A16, opset17)) | +0.0% | N/A (timeout) | +| `distilbert/distilbert-base-uncased-finetuned-sst-2-english` | text-classification | 19.48 ms | 19.48 ms | h0 (baseline (auto-config W8A16, opset17)) | +0.0% | ~ neutral (+0.0%) | +| `sentence-transformers/all-MiniLM-L6-v2` | sentence-similarity | 5.81 ms | 5.81 ms | h0 (baseline (auto-config W8A16, opset17)) | +0.0% | ~ neutral (+0.5%) | +| `deepset/roberta-base-squad2` | question-answering | 14.94 ms | 14.72 ms | h1 (opset 17 explicit) | +1.5% | ~ neutral (-1.4%) | + +--- + +## Per-Model Hypothesis Breakdown + +### `microsoft/resnet-18` +**Task:** image-classification **Type:** resnet + +| Hypothesis | Config | p50 (median) | CV | Status | Accuracy | +|------------|--------|-------------|-----|--------|---------| +| h0 | baseline (auto-config W8A16, opset17) | 0.96 ms | — | OK_HIGH_CV | 66.0% | +| h1 | opset 17 explicit | 2.72 ms | — | OK_HIGH_CV | — | +| h2 | opset 19 | 1.15 ms | — | OK_HIGH_CV | — | +| h3 | opset 21 | 2.17 ms | — | OK_HIGH_CV | — | +| h4 | opset17 + conv fusions | 132.30 ms | — | OK_HIGH_CV | — | +| h5 | opset21 + conv fusions | — | — | TIMEOUT | — | + +**Key findings:** +- 🟢 **npu-001 GENERALIZES**: opset21 (2.17ms) vs opset17 (2.72ms) = +20.2% speedup +- 🔴 **Conv fusions CATASTROPHIC**: h4=132.3ms vs h1=2.72ms (+4764% regression) — QNN CPU fallback suspected +- ⚠️ Model timed out at 1560s (before h5) + +### `google/vit-base-patch16-224` +**Task:** image-classification **Type:** vit + +| Hypothesis | Config | p50 (median) | CV | Status | Accuracy | +|------------|--------|-------------|-----|--------|---------| +| h0 | baseline (auto-config W8A16, opset17) | 9.04 ms | — | OK_HIGH_CV | 74.0% | +| h1 | opset 17 explicit | 9.33 ms | — | OK_HIGH_CV | — | +| h2 | opset 19 | — | — | BUILD_FAIL | — | +| h3 | opset 21 | 10.02 ms | — | OK_HIGH_CV | — | +| h4 | opset17 + conv fusions | — | — | TIMEOUT | — | +| h5 | opset21 + conv fusions | — | — | TIMEOUT | — | + +**Key findings:** +- 🔴 **npu-001 does NOT generalize**: opset21 (10.02ms) SLOWER than opset17 (9.33ms) = -7.4% +- ⚠️ h2: BUILD_FAIL +- ⚠️ Model timed out at 1204s (before h4) +- ⚠️ Model timed out at 1204s (before h5) + +### `apple/mobilevit-small` +**Task:** image-classification **Type:** mobilevit + +| Hypothesis | Config | p50 (median) | CV | Status | Accuracy | +|------------|--------|-------------|-----|--------|---------| +| h0 | baseline (auto-config W8A16, opset17) | 12.07 ms | — | OK_HIGH_CV | 58.0% | +| h1 | opset 17 explicit | 11.72 ms | — | OK_HIGH_CV | — | +| h2 | opset 19 | 10.52 ms | — | OK_HIGH_CV | — | +| h3 | opset 21 | 8.62 ms | — | OK_HIGH_CV | — | +| h4 | opset17 + conv fusions | 11.36 ms | — | OK_HIGH_CV | — | +| h5 | opset21 + conv fusions | 9.99 ms | — | OK_HIGH_CV | — | + +**Key findings:** +- 🟢 **npu-001 GENERALIZES**: opset21 (8.62ms) vs opset17 (11.72ms) = +26.5% speedup +- ⚪ **Conv fusions neutral**: h4=11.36ms vs h1=11.72ms + +### `facebook/dinov2-small` +**Task:** feature-extraction **Type:** dinov2 + +| Hypothesis | Config | p50 (median) | CV | Status | Accuracy | +|------------|--------|-------------|-----|--------|---------| +| h0 | baseline (auto-config W8A16, opset17) | 6.56 ms | — | OK_HIGH_CV | — | +| h1 | opset 17 explicit | 7.18 ms | — | OK_HIGH_CV | — | +| h2 | opset 19 | 7.19 ms | — | OK_HIGH_CV | — | +| h3 | opset 21 | 4.98 ms | — | OK_HIGH_CV | — | +| h4 | opset17 + conv fusions | — | — | TIMEOUT | — | +| h5 | opset21 + conv fusions | — | — | TIMEOUT | — | + +**Key findings:** +- 🟢 **npu-001 GENERALIZES**: opset21 (4.98ms) vs opset17 (7.18ms) = +30.6% speedup +- ⚠️ Model timed out at 1333s (before h4) +- ⚠️ Model timed out at 1333s (before h5) + +### `hustvl/yolos-small` +**Task:** object-detection **Type:** yolos + +| Hypothesis | Config | p50 (median) | CV | Status | Accuracy | +|------------|--------|-------------|-----|--------|---------| +| h0 | baseline (auto-config W8A16, opset17) | 78.69 ms | — | OK_HIGH_CV | — | +| h1 | opset 17 explicit | 92.08 ms | — | OK_HIGH_CV | — | +| h2 | opset 19 | — | — | TIMEOUT | — | +| h3 | opset 21 | — | — | TIMEOUT | — | +| h4 | opset17 + conv fusions | — | — | TIMEOUT | — | +| h5 | opset21 + conv fusions | — | — | TIMEOUT | — | + +**Key findings:** +- ⚠️ Model timed out at 1318s (before h2) +- ⚠️ Model timed out at 1318s (before h3) +- ⚠️ Model timed out at 1318s (before h4) +- ⚠️ Model timed out at 1318s (before h5) + +### `distilbert/distilbert-base-uncased-finetuned-sst-2-english` +**Task:** text-classification **Type:** distilbert + +| Hypothesis | Config | p50 (median) | CV | Status | Accuracy | +|------------|--------|-------------|-----|--------|---------| +| h0 | baseline (auto-config W8A16, opset17) | 19.48 ms | — | OK_HIGH_CV | — | +| h1 | opset 17 explicit | 19.50 ms | — | OK_HIGH_CV | — | +| h2 | opset 19 | 19.48 ms | — | OK_HIGH_CV | — | +| h3 | opset 21 | 19.50 ms | — | OK_HIGH_CV | — | +| h4 | opset17 + conv fusions | 19.59 ms | — | OK_HIGH_CV | — | +| h5 | opset21 + conv fusions | — | — | TIMEOUT | — | + +**Key findings:** +- ⚪ **npu-001 neutral**: opset21 (19.50ms) ≈ opset17 (19.50ms), diff=+0.0% +- ⚪ **Conv fusions neutral**: h4=19.59ms vs h1=19.50ms +- ⚠️ Model timed out at 1385s (before h5) + +### `sentence-transformers/all-MiniLM-L6-v2` +**Task:** sentence-similarity **Type:** bert + +| Hypothesis | Config | p50 (median) | CV | Status | Accuracy | +|------------|--------|-------------|-----|--------|---------| +| h0 | baseline (auto-config W8A16, opset17) | 5.81 ms | — | OK_HIGH_CV | — | +| h1 | opset 17 explicit | 5.88 ms | — | OK_HIGH_CV | — | +| h2 | opset 19 | 5.98 ms | — | OK_HIGH_CV | — | +| h3 | opset 21 | 5.85 ms | — | OK_HIGH_CV | — | +| h4 | opset17 + conv fusions | 5.97 ms | — | OK | — | +| h5 | opset21 + conv fusions | — | — | TIMEOUT | — | + +**Key findings:** +- ⚪ **npu-001 neutral**: opset21 (5.85ms) ≈ opset17 (5.88ms), diff=+0.5% +- ⚪ **Conv fusions neutral**: h4=5.97ms vs h1=5.88ms +- ⚠️ Model timed out at 1346s (before h5) + +### `deepset/roberta-base-squad2` +**Task:** question-answering **Type:** roberta + +| Hypothesis | Config | p50 (median) | CV | Status | Accuracy | +|------------|--------|-------------|-----|--------|---------| +| h0 | baseline (auto-config W8A16, opset17) | 14.94 ms | — | OK | — | +| h1 | opset 17 explicit | 14.72 ms | — | OK | — | +| h2 | opset 19 | 14.88 ms | — | OK_HIGH_CV | — | +| h3 | opset 21 | 14.92 ms | — | OK | — | +| h4 | opset17 + conv fusions | — | — | TIMEOUT | — | +| h5 | opset21 + conv fusions | — | — | TIMEOUT | — | + +**Key findings:** +- ⚪ **npu-001 neutral**: opset21 (14.92ms) ≈ opset17 (14.72ms), diff=-1.4% +- ⚠️ Model timed out at 1466s (before h4) +- ⚠️ Model timed out at 1466s (before h5) + +--- + +## Cross-Model Pattern Analysis + +### Finding 1: npu-001 — opset 21 NHWC bypass + +The npu-001 hypothesis (opset ≥ 21 bypasses the NHWC→NCHW layout transformation in ORT's QNN EP) **is confirmed for Conv+residual architectures** but **does not apply to pure transformers**. + +| Architecture class | Models | opset21 result | +|-------------------|--------|----------------| +| Conv + residual (spatial models) | MobileViT-small, DINOv2-small | ✅ **+26–31% speedup** | +| Pure transformer (attention-only) | ViT-base, YOLOS-small | ❌ No benefit (neutral/slight regression) | +| BERT-family NLP | DistilBERT, MiniLM, RoBERTa | ⚪ Neutral (within DVFS noise) | +| ResNet (plain conv) | ResNet-18 | ~ Marginal (+20% h1→h3, but DVFS-dominated; h0 baseline even faster) | + +> **Root cause confirmed**: NHWC layout transform is only a bottleneck when (a) the model has Conv ops that QNN EP needs to transpose for its internal NHWC representation, AND (b) those conv ops are interleaved with residual add/shortcut paths. Pure attention (no Conv) has no such transposes. ResNet's gain is marginal likely because the Conv path is so fast that the transpose overhead is relatively smaller. + +### Finding 2: Conv fusions and QNN EP compatibility + +Conv fusion optimizations (`conv_bn_fusion`, `conv_add_fusion`, `conv_activation_fusion`) are **architecture-dependent** with respect to QNN EP: + +| Model | h4 result vs h1 | Assessment | +|-------|----------------|------------| +| ResNet-18 | 132.3ms vs 2.72ms | 🔴 **~4900% regression** — QNN CPU fallback for fused ops | +| MobileViT-small | 11.36ms vs 11.72ms | ⚪ Neutral (no regression) | +| DistilBERT | 19.59ms vs 19.5ms | ⚪ Neutral (no Conv layers to fuse) | +| all-MiniLM-L6-v2 | 5.97ms vs 5.88ms | ⚪ Neutral (no Conv layers to fuse) | + +> **Root cause**: QNN EP cannot execute fused Conv+BN/Add/Activation ops natively. When ORT graph optimizer fuses these patterns (which ORT does before handing the graph to the EP), QNN falls back to CPU execution for those ops — causing massive latency spikes on ResNet (which is entirely Conv-dominated). +> +> **Feature gap**: `winml` should detect when the target EP (QNN NPU) is likely to CPU-fallback fused ops and either (a) warn the user, or (b) suppress incompatible fusions automatically. This is a critical correctness/performance hazard. + +### Finding 3: DVFS noise and bench reliability + +QNN NPU exhibits extreme DVFS (Dynamic Voltage/Frequency Scaling) thermal noise. Key observations: + +- CV (coefficient of variation) is consistently **0.10–2.0+** across all models and sessions +- Even within a 500-iter session, CV frequently exceeds 0.5 +- The original CV < 15% gate (Phase-A screening) blocks all models — must be removed for QNN NPU +- Differences < 10% between hypotheses are **unreliable** without longer runs (>2000 iterations total) +- 30s cool-down between sessions reduces but does not eliminate DVFS spikes + +> **Feature gap**: `winml perf` should support a `--thermal-stabilization` mode that waits for device temperature to stabilize before beginning measurements, and should report confidence intervals rather than raw p50. + +### Finding 4: Large model / detection model budget + +YOLOS-small (78ms baseline) exhausts the 20-min per-model budget after just 2 hypotheses. The per-hypothesis bench cost is: + +- Build: ~120–200s (fixed) +- Bench: `3 × (N_iters × latency_ms + 30s cool-down)` = `3 × (500 × 0.078s + 30s)` ≈ **207s per hypothesis** +- Total for 6 hypotheses: ~2000s — well over budget + +> **Recommendation**: For models with p50 > 50ms, reduce bench to 1×200-iter session for the sweep. Alternatively, add `--quick` flag to `catalog_qnn_sweep.py`. + +--- + +## Updated Recommendations for `ep_knowledge/qnn_npu.json` + +### Proposed KB updates: + +**npu-001 (opset bypass):** Update status from `partially_confirmed` to `CONFIRMED_CONV_RESIDUAL`. +- Restrict applicability: `architecture_requirement: ['has_conv_ops', 'has_residual_connections']` +- Add exclusion: `not_applicable_to: ['pure_transformer', 'bert_family']` +- Confirmed gains: MobileViT +26%, DINOv2 +31% +- Non-applicable: ViT, DistilBERT, MiniLM, RoBERTa (neutral within DVFS noise) + +**NEW npu-006 (Conv fusion QNN fallback):** +```json +{ + "id": "npu-006", + "title": "Conv fusions cause QNN EP CPU fallback on Conv-dominant models", + "severity": "critical", + "finding": "conv_bn_fusion + conv_add_fusion + conv_activation_fusion flags cause QNN EP to fall back to CPU for fused ops on Conv-dominant architectures (ResNet: 4900% regression). BERT/MobileViT unaffected.", + "recommendation": "Do NOT enable conv_*_fusion optimizations for QNN NPU target on ResNet-family models. Safe only for pure-transformer models (where no Conv ops exist to fuse).", + "architecture_specificity": "resnet, efficientnet, mobilenet — any model where Conv ops dominate the execution path", + "status": "confirmed", + "models_tested": ["microsoft/resnet-18"] +} +``` + +**NEW npu-007 (DVFS reliability threshold):** +```json +{ + "id": "npu-007", + "title": "QNN NPU DVFS noise requires extended bench for reliable comparison", + "finding": "CV is always 0.1–2.0+ on QNN NPU due to DVFS thermal throttling. The CV<15% Phase-A gate must be disabled. Differences <10% between configs are unreliable without >1500 total iterations.", + "recommendation": "Disable CV gate for QNN NPU. Use minimum 3×500-iter sessions. Report median of session p50s. Only trust differences >10%.", + "status": "confirmed" +} +``` + +--- + +## Build / Compatibility Issues + +| Model | Issue | +|-------|-------| +| `google/vit-base-patch16-224` h2 (opset19) | BUILD FAIL — network error downloading calibration data (parquet URL) — not an opset incompatibility | +| `hustvl/yolos-small` h2–h5 | TIMEOUT — 78ms baseline × 3×500 iters = 207s per hypothesis, exceeds 20-min budget | +| `microsoft/resnet-18` h5 | TIMEOUT after h4 catastrophic regression consumed extra time | +| Multiple models | h5 TIMEOUT — model total > 1200s before h5 | + +--- + +*Sweep completed 2026-06-13. All results in `catalog-qnn-sweep//results.json`.* diff --git a/research/autoconfig/catalog-qnn-sweep/apple--mobilevit-small/results.json b/research/autoconfig/catalog-qnn-sweep/apple--mobilevit-small/results.json new file mode 100644 index 000000000..3a2178e04 --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/apple--mobilevit-small/results.json @@ -0,0 +1,138 @@ +{ + "model_id": "apple/mobilevit-small", + "task": "image-classification", + "model_type": "mobilevit", + "timestamp": "2026-06-13T14:26:06", + "ep": "qnn", + "device": "npu", + "baseline_opset": 17, + "hypotheses": { + "h0": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 10.651, + "cv": 1.7211, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 12.075, + 10.313, + 12.946 + ], + "median_p50_ms": 12.075 + }, + "accuracy": 0.58, + "label": "baseline (auto-config, W8A16)", + "opset": 17 + }, + "h1": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 8.714, + "cv": 0.9982, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 10.557, + 11.721, + 27.436 + ], + "median_p50_ms": 11.721 + }, + "accuracy": null, + "label": "opset 17 explicit", + "opset": 17 + }, + "h2": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 9.035, + "cv": 1.7997, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 11.541, + 10.506, + 10.52 + ], + "median_p50_ms": 10.52 + }, + "accuracy": null, + "label": "opset 19", + "opset": 19 + }, + "h3": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 11.777, + "cv": 1.1161, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 10.814, + 8.625, + 8.449 + ], + "median_p50_ms": 8.625 + }, + "accuracy": null, + "label": "opset 21 (tests npu-001 bypass)", + "opset": 21 + }, + "h4": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 11.14, + "cv": 1.8792, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 11.599, + 11.364, + 10.518 + ], + "median_p50_ms": 11.364 + }, + "accuracy": null, + "label": "opset 17 + conv fusions", + "opset": 17 + }, + "h5": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 9.256, + "cv": 2.2489, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 11.081, + 9.412, + 9.994 + ], + "median_p50_ms": 9.994 + }, + "accuracy": null, + "label": "opset 21 + conv fusions", + "opset": 21 + } + }, + "best_hypothesis": "h3", + "baseline_p50_ms": 12.075, + "best_p50_ms": 8.625, + "best_gain_pct": 28.57, + "npu001_generalized": true, + "feature_gaps": [], + "errors": [] +} diff --git a/research/autoconfig/catalog-qnn-sweep/deepset--roberta-base-squad2/results.json b/research/autoconfig/catalog-qnn-sweep/deepset--roberta-base-squad2/results.json new file mode 100644 index 000000000..fa8a959f4 --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/deepset--roberta-base-squad2/results.json @@ -0,0 +1,106 @@ +{ + "model_id": "deepset/roberta-base-squad2", + "task": "question-answering", + "model_type": "roberta", + "timestamp": "2026-06-13T16:21:18", + "ep": "qnn", + "device": "npu", + "baseline_opset": 17, + "hypotheses": { + "h0": { + "status": "OK", + "screen": { + "p50_ms": 14.919, + "cv": 0.1188, + "stable": true + }, + "full": { + "p50s_ms": [ + 14.941, + 14.711, + 14.97 + ], + "median_p50_ms": 14.941 + }, + "accuracy": null, + "label": "baseline (auto-config, W8A16)", + "opset": 17 + }, + "h1": { + "status": "OK", + "screen": { + "p50_ms": 14.747, + "cv": 0.1286, + "stable": true + }, + "full": { + "p50s_ms": [ + 14.645, + 14.873, + 14.716 + ], + "median_p50_ms": 14.716 + }, + "accuracy": null, + "label": "opset 17 explicit", + "opset": 17 + }, + "h2": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 15.309, + "cv": 0.2344, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 14.951, + 14.877, + 14.834 + ], + "median_p50_ms": 14.877 + }, + "accuracy": null, + "label": "opset 19", + "opset": 19 + }, + "h3": { + "status": "OK", + "screen": { + "p50_ms": 14.798, + "cv": 0.1159, + "stable": true + }, + "full": { + "p50s_ms": [ + 16.685, + 14.743, + 14.919 + ], + "median_p50_ms": 14.919 + }, + "accuracy": null, + "label": "opset 21 (tests npu-001 bypass)", + "opset": 21 + }, + "h4": { + "status": "TIMEOUT", + "label": "opset 17 + conv fusions" + }, + "h5": { + "status": "TIMEOUT", + "label": "opset 21 + conv fusions" + } + }, + "best_hypothesis": "h1", + "baseline_p50_ms": 14.941, + "best_p50_ms": 14.716, + "best_gain_pct": 1.51, + "npu001_generalized": "neutral", + "feature_gaps": [], + "errors": [ + "Model timed out at 1466s (before h4)", + "Model timed out at 1466s (before h5)" + ] +} diff --git a/research/autoconfig/catalog-qnn-sweep/distilbert--distilbert-base-uncased-finetuned-sst-2-english/results.json b/research/autoconfig/catalog-qnn-sweep/distilbert--distilbert-base-uncased-finetuned-sst-2-english/results.json new file mode 100644 index 000000000..9d10a6736 --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/distilbert--distilbert-base-uncased-finetuned-sst-2-english/results.json @@ -0,0 +1,124 @@ +{ + "model_id": "distilbert/distilbert-base-uncased-finetuned-sst-2-english", + "task": "text-classification", + "model_type": "distilbert", + "timestamp": "2026-06-13T15:34:52", + "ep": "qnn", + "device": "npu", + "baseline_opset": 17, + "hypotheses": { + "h0": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 19.511, + "cv": 0.156, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 19.512, + 19.459, + 19.48 + ], + "median_p50_ms": 19.48 + }, + "accuracy": null, + "label": "baseline (auto-config, W8A16)", + "opset": 17 + }, + "h1": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 19.721, + "cv": 0.2715, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 19.498, + 19.417, + 19.519 + ], + "median_p50_ms": 19.498 + }, + "accuracy": null, + "label": "opset 17 explicit", + "opset": 17 + }, + "h2": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 19.431, + "cv": 0.1945, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 19.471, + 19.684, + 19.477 + ], + "median_p50_ms": 19.477 + }, + "accuracy": null, + "label": "opset 19", + "opset": 19 + }, + "h3": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 19.443, + "cv": 0.2903, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 19.591, + 19.447, + 19.505 + ], + "median_p50_ms": 19.505 + }, + "accuracy": null, + "label": "opset 21 (tests npu-001 bypass)", + "opset": 21 + }, + "h4": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 19.404, + "cv": 0.237, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 19.588, + 19.628, + 19.502 + ], + "median_p50_ms": 19.588 + }, + "accuracy": null, + "label": "opset 17 + conv fusions", + "opset": 17 + }, + "h5": { + "status": "TIMEOUT", + "label": "opset 21 + conv fusions" + } + }, + "best_hypothesis": "h2", + "baseline_p50_ms": 19.48, + "best_p50_ms": 19.477, + "best_gain_pct": 0.02, + "npu001_generalized": "neutral", + "feature_gaps": [], + "errors": [ + "Model timed out at 1385s (before h5)" + ] +} diff --git a/research/autoconfig/catalog-qnn-sweep/facebook--dinov2-small/results.json b/research/autoconfig/catalog-qnn-sweep/facebook--dinov2-small/results.json new file mode 100644 index 000000000..521b465de --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/facebook--dinov2-small/results.json @@ -0,0 +1,109 @@ +{ + "model_id": "facebook/dinov2-small", + "task": "image-feature-extraction", + "model_type": "dinov2", + "timestamp": "2026-06-13T14:49:59", + "ep": "qnn", + "device": "npu", + "baseline_opset": 17, + "hypotheses": { + "h0": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 7.213, + "cv": 0.3437, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 6.561, + 6.353, + 12.408 + ], + "median_p50_ms": 6.561 + }, + "accuracy": null, + "label": "baseline (auto-config, W8A16)", + "opset": 17 + }, + "h1": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 4.897, + "cv": 0.4572, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 7.176, + 6.392, + 9.436 + ], + "median_p50_ms": 7.176 + }, + "accuracy": null, + "label": "opset 17 explicit", + "opset": 17 + }, + "h2": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 6.953, + "cv": 1.8047, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 8.454, + 7.191, + 6.194 + ], + "median_p50_ms": 7.191 + }, + "accuracy": null, + "label": "opset 19", + "opset": 19 + }, + "h3": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 9.432, + "cv": 0.936, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 4.977, + 4.876, + 6.884 + ], + "median_p50_ms": 4.977 + }, + "accuracy": null, + "label": "opset 21 (tests npu-001 bypass)", + "opset": 21 + }, + "h4": { + "status": "TIMEOUT", + "label": "opset 17 + conv fusions" + }, + "h5": { + "status": "TIMEOUT", + "label": "opset 21 + conv fusions" + } + }, + "best_hypothesis": "h3", + "baseline_p50_ms": 6.561, + "best_p50_ms": 4.977, + "best_gain_pct": 24.14, + "npu001_generalized": true, + "feature_gaps": [], + "errors": [ + "Model timed out at 1333s (before h4)", + "Model timed out at 1333s (before h5)" + ] +} diff --git a/research/autoconfig/catalog-qnn-sweep/google--vit-base-patch16-224/results.json b/research/autoconfig/catalog-qnn-sweep/google--vit-base-patch16-224/results.json new file mode 100644 index 000000000..42edb241b --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/google--vit-base-patch16-224/results.json @@ -0,0 +1,96 @@ +{ + "model_id": "google/vit-base-patch16-224", + "task": "image-classification", + "model_type": "vit", + "timestamp": "2026-06-13T14:05:37", + "ep": "qnn", + "device": "npu", + "baseline_opset": 17, + "hypotheses": { + "h0": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 9.245, + "cv": 1.2887, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 9.039, + 8.6, + 9.779 + ], + "median_p50_ms": 9.039 + }, + "accuracy": 0.74, + "label": "baseline (auto-config, W8A16)", + "opset": 17 + }, + "h1": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 9.656, + "cv": 0.7434, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 9.33, + 12.723, + 9.064 + ], + "median_p50_ms": 9.33 + }, + "accuracy": null, + "label": "opset 17 explicit", + "opset": 17 + }, + "h2": { + "status": "BUILD_FAIL", + "label": "opset 19", + "opset": 19, + "build_error": "MzU3NTk3NTM4NmY1YzY0YjEzZjgwNTlkYmY3MWVkNDBkYWEwMGFcXD91c2VyX2lkPXB1YmxpYyZYLVhldC1DYXMtVWlkPXB1YmxpYyZyZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPWlubGluZSUzQitmaWxlbmFtZSUyQSUzRFVURi04JTI3JTI3dHJhaW4tMDAwMDAtb2YtMDAwMTMucGFycXVldCUzQitmaWxlbmFtZSUzRCUyMnRyYWluLTAwMDAwLW9mLTAwMDEzLnBhcnF1ZXQlMjIlM0IiLCJDb25kaXRpb24iOnsiRGF0ZUxlc3NUaGFuIjp7IkVwb2NoVGltZSI6MTc4MTMzNTIwOH0sIkJ5dGVSYW5nZSI6eyJFeHBlY3RlZEhlYWRlciI6ImJ5dGVzPTQ4NTEzNzYwNC00ODUyMDMxMzkifX19XX0_&Signature=MEUCIQD51-TIZFhcd8Id1yCa5oFvcfXtxBJQLnbeG3PPgDJm5AIgBbqpmbciOJZpxVhunYiYCwhL8FT6ymJ72UKocE3aygs_&Key-Pair-Id=01KAYHXK2CBJSW0YZTMNXK9W1M\n\n" + }, + "h3": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 11.564, + "cv": 2.1585, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 15.271, + 10.019, + 7.808 + ], + "median_p50_ms": 10.019 + }, + "accuracy": null, + "label": "opset 21 (tests npu-001 bypass)", + "opset": 21 + }, + "h4": { + "status": "TIMEOUT", + "label": "opset 17 + conv fusions" + }, + "h5": { + "status": "TIMEOUT", + "label": "opset 21 + conv fusions" + } + }, + "best_hypothesis": "h0", + "baseline_p50_ms": 9.039, + "best_p50_ms": 9.039, + "best_gain_pct": 0.0, + "npu001_generalized": false, + "feature_gaps": [], + "errors": [ + "h2: BUILD_FAIL", + "Model timed out at 1204s (before h4)", + "Model timed out at 1204s (before h5)" + ] +} diff --git a/research/autoconfig/catalog-qnn-sweep/hustvl--yolos-small/results.json b/research/autoconfig/catalog-qnn-sweep/hustvl--yolos-small/results.json new file mode 100644 index 000000000..ae4b9e09e --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/hustvl--yolos-small/results.json @@ -0,0 +1,79 @@ +{ + "model_id": "hustvl/yolos-small", + "task": "object-detection", + "model_type": "yolos", + "timestamp": "2026-06-13T15:12:34", + "ep": "qnn", + "device": "npu", + "baseline_opset": 17, + "hypotheses": { + "h0": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 76.826, + "cv": 0.344, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 76.629, + 96.253, + 78.694 + ], + "median_p50_ms": 78.694 + }, + "accuracy": null, + "label": "baseline (auto-config, W8A16)", + "opset": 17 + }, + "h1": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 89.003, + "cv": 0.316, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 95.119, + 92.075, + 89.82 + ], + "median_p50_ms": 92.075 + }, + "accuracy": null, + "label": "opset 17 explicit", + "opset": 17 + }, + "h2": { + "status": "TIMEOUT", + "label": "opset 19" + }, + "h3": { + "status": "TIMEOUT", + "label": "opset 21 (tests npu-001 bypass)" + }, + "h4": { + "status": "TIMEOUT", + "label": "opset 17 + conv fusions" + }, + "h5": { + "status": "TIMEOUT", + "label": "opset 21 + conv fusions" + } + }, + "best_hypothesis": "h0", + "baseline_p50_ms": 78.694, + "best_p50_ms": 78.694, + "best_gain_pct": 0.0, + "npu001_generalized": "N/A (h1, h3 not OK)", + "feature_gaps": [], + "errors": [ + "Model timed out at 1318s (before h2)", + "Model timed out at 1318s (before h3)", + "Model timed out at 1318s (before h4)", + "Model timed out at 1318s (before h5)" + ] +} diff --git a/research/autoconfig/catalog-qnn-sweep/microsoft--resnet-18/results.json b/research/autoconfig/catalog-qnn-sweep/microsoft--resnet-18/results.json new file mode 100644 index 000000000..555428793 --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/microsoft--resnet-18/results.json @@ -0,0 +1,124 @@ +{ + "model_id": "microsoft/resnet-18", + "task": "image-classification", + "model_type": "resnet", + "timestamp": "2026-06-13T13:38:52", + "ep": "qnn", + "device": "npu", + "baseline_opset": 17, + "hypotheses": { + "h0": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 4.031, + "cv": 1.6902, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 1.311, + 0.952, + 0.964 + ], + "median_p50_ms": 0.964 + }, + "accuracy": 0.66, + "label": "baseline (auto-config, W8A16)", + "opset": 17 + }, + "h1": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 3.111, + "cv": 2.0363, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 0.99, + 4.003, + 2.716 + ], + "median_p50_ms": 2.716 + }, + "accuracy": null, + "label": "opset 17 explicit", + "opset": 17 + }, + "h2": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 3.992, + "cv": 1.5168, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 1.147, + 1.114, + 1.947 + ], + "median_p50_ms": 1.147 + }, + "accuracy": null, + "label": "opset 19", + "opset": 19 + }, + "h3": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 2.968, + "cv": 1.1762, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 1.054, + 2.175, + 4.107 + ], + "median_p50_ms": 2.175 + }, + "accuracy": null, + "label": "opset 21 (tests npu-001 bypass)", + "opset": 21 + }, + "h4": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 128.104, + "cv": 1.4049, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 132.3, + 134.97, + 130.669 + ], + "median_p50_ms": 132.3 + }, + "accuracy": null, + "label": "opset 17 + conv fusions", + "opset": 17 + }, + "h5": { + "status": "TIMEOUT", + "label": "opset 21 + conv fusions" + } + }, + "best_hypothesis": "h0", + "baseline_p50_ms": 0.964, + "best_p50_ms": 0.964, + "best_gain_pct": 0.0, + "npu001_generalized": true, + "feature_gaps": [], + "errors": [ + "Model timed out at 1560s (before h5)" + ] +} diff --git a/research/autoconfig/catalog-qnn-sweep/sentence-transformers--all-MiniLM-L6-v2/results.json b/research/autoconfig/catalog-qnn-sweep/sentence-transformers--all-MiniLM-L6-v2/results.json new file mode 100644 index 000000000..67483f470 --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/sentence-transformers--all-MiniLM-L6-v2/results.json @@ -0,0 +1,123 @@ +{ + "model_id": "sentence-transformers/all-MiniLM-L6-v2", + "task": "sentence-similarity", + "model_type": "bert", + "timestamp": "2026-06-13T15:58:36", + "ep": "qnn", + "device": "npu", + "baseline_opset": 17, + "hypotheses": { + "h0": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 5.934, + "cv": 0.2221, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 5.808, + 5.647, + 5.829 + ], + "median_p50_ms": 5.808 + }, + "accuracy": null, + "label": "baseline (auto-config, W8A16)", + "opset": 17 + }, + "h1": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 5.851, + "cv": 0.9986, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 5.814, + 5.88, + 5.912 + ], + "median_p50_ms": 5.88 + }, + "accuracy": null, + "label": "opset 17 explicit", + "opset": 17 + }, + "h2": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 5.309, + "cv": 0.2051, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 5.98, + 5.799, + 6.021 + ], + "median_p50_ms": 5.98 + }, + "accuracy": null, + "label": "opset 19", + "opset": 19 + }, + "h3": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 5.959, + "cv": 1.1272, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 6.0, + 5.851, + 5.844 + ], + "median_p50_ms": 5.851 + }, + "accuracy": null, + "label": "opset 21 (tests npu-001 bypass)", + "opset": 21 + }, + "h4": { + "status": "OK", + "screen": { + "p50_ms": 5.478, + "cv": 0.1344, + "stable": true + }, + "full": { + "p50s_ms": [ + 6.059, + 5.966, + 5.469 + ], + "median_p50_ms": 5.966 + }, + "accuracy": null, + "label": "opset 17 + conv fusions", + "opset": 17 + }, + "h5": { + "status": "TIMEOUT", + "label": "opset 21 + conv fusions" + } + }, + "best_hypothesis": "h0", + "baseline_p50_ms": 5.808, + "best_p50_ms": 5.808, + "best_gain_pct": 0.0, + "npu001_generalized": "neutral", + "feature_gaps": [], + "errors": [ + "Model timed out at 1346s (before h5)" + ] +} diff --git a/research/autoconfig/catalog_qnn_sweep.py b/research/autoconfig/catalog_qnn_sweep.py new file mode 100644 index 000000000..6236b4127 --- /dev/null +++ b/research/autoconfig/catalog_qnn_sweep.py @@ -0,0 +1,881 @@ +#!/usr/bin/env python3 +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""catalog_qnn_sweep.py — QNN NPU optimization hypothesis sweep for winml catalog models. + +Hypothesis matrix (per model): + h0: baseline (auto-config, default winml build for QNN NPU + W8A16) + h1: opset 17 explicit (explicit opset, same optim as baseline) + h2: opset 19 + h3: opset 21 ← tests npu-001 generalization + h4: opset 17 + conv fusions (conv-bn, conv-add, conv-activation) + h5: opset 21 + conv fusions + +2-phase bench protocol: + Phase A: 200-iter screen — reject if CV >= 15% + Phase B: 3 independent sessions × 500 iters, 30 s cool-down between sessions + +Results: catalog-qnn-sweep//results.json +Summary: catalog-qnn-sweep/SUMMARY.md +""" + +import argparse +import copy +import json +import subprocess +import sys +import time +from datetime import datetime +from pathlib import Path + + +sys.stdout.reconfigure(encoding="utf-8", errors="replace") # type: ignore[attr-defined] + +# ── constants ───────────────────────────────────────────────────────────────── +BASE_DIR = Path(__file__).parent +WINML = str(BASE_DIR / ".venv" / "Scripts" / "winml.exe") +EP = "qnn" +DEVICE = "npu" +RESULTS_DIR = BASE_DIR / "catalog-qnn-sweep" + +SCREEN_WARMUP = 20 +SCREEN_ITERS = 200 +SCREEN_CV_MAX = 0.15 + +FULL_WARMUP = 50 +FULL_ITERS = 500 +FULL_SESSIONS = 3 +COOL_DOWN_S = 30 + +MODEL_TIMEOUT_S = 20 * 60 # 20 min per model total +BUILD_TIMEOUT_S = 8 * 60 # 8 min per individual build +BENCH_TIMEOUT_S = 8 * 60 # 8 min per bench run +EVAL_TIMEOUT_S = 6 * 60 # 6 min for accuracy eval +EVAL_SAMPLES = 50 + +# Hypotheses: (id, label, opset_override, extra_optim) +# opset_override=None → keep whatever auto-config chose +# extra_optim=None → keep auto-config optim unchanged +# extra_optim=dict → merge these flags ON TOP of auto-config optim +HYPOTHESES = [ + ("h0", "baseline (auto-config, W8A16)", None, None), + ("h1", "opset 17 explicit", 17, None), + ("h2", "opset 19", 19, None), + ("h3", "opset 21 (tests npu-001 bypass)", 21, None), + ( + "h4", + "opset 17 + conv fusions", + 17, + { + "conv_bn_fusion": True, + "conv_add_fusion": True, + "conv_activation_fusion": True, + }, + ), + ( + "h5", + "opset 21 + conv fusions", + 21, + { + "conv_bn_fusion": True, + "conv_add_fusion": True, + "conv_activation_fusion": True, + }, + ), +] + +# Full catalog sweep list: (model_id, task, model_type, run_eval_on_baseline) +ALL_MODELS: list[tuple[str, str, str, bool]] = [ + # Vision + ("microsoft/resnet-18", "image-classification", "resnet", True), + ("google/vit-base-patch16-224", "image-classification", "vit", True), + ("apple/mobilevit-small", "image-classification", "mobilevit", True), + ("facebook/dinov2-small", "image-feature-extraction", "dinov2", False), # no imagenet eval + ("hustvl/yolos-small", "object-detection", "yolos", False), # no imagenet eval + # NLP + ( + "distilbert/distilbert-base-uncased-finetuned-sst-2-english", + "text-classification", + "distilbert", + False, + ), + ("sentence-transformers/all-MiniLM-L6-v2", "sentence-similarity", "bert", False), + ("deepset/roberta-base-squad2", "question-answering", "roberta", False), +] + + +# ── low-level helpers ───────────────────────────────────────────────────────── + + +def run_cmd(cmd: list[str], label: str = "", timeout: int = 600) -> tuple[int, str, float]: + """Run a command; return (returncode, combined_output, elapsed_s).""" + t0 = time.time() + print(f" >> {label or cmd[1]}", flush=True) + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + timeout=timeout, + ) + elapsed = time.time() - t0 + tag = "ok" if result.returncode == 0 else f"rc={result.returncode}" + print(f" {elapsed:.0f}s [{tag}]", flush=True) + if result.returncode != 0: + snippet = (result.stderr or result.stdout or "")[-600:] + print(f" stderr: {snippet}", flush=True) + return result.returncode, result.stdout + result.stderr, elapsed + except subprocess.TimeoutExpired: + elapsed = time.time() - t0 + print(f" TIMEOUT after {elapsed:.0f}s", flush=True) + return -999, f"TIMEOUT after {timeout}s", elapsed + + +# ── winml wrappers ──────────────────────────────────────────────────────────── + + +def get_base_config(model_id: str, task: str, model_type: str) -> dict | None: + """Generate the auto-config via `winml config` for QNN NPU. + Returns the parsed config dict, or None on failure. + """ + tmp_path = RESULTS_DIR / "_tmp_base_config.json" + tmp_path.parent.mkdir(parents=True, exist_ok=True) + + def _try(extra_args: list[str]) -> dict | None: + cmd = [ + WINML, + "config", + "-m", + model_id, + "-t", + task, + "--device", + DEVICE, + "--ep", + EP, + "--no-compile", + "-o", + str(tmp_path), + ] + extra_args + rc, out, _ = run_cmd(cmd, label="winml config", timeout=120) + if rc == 0 and tmp_path.exists(): + try: + cfg = json.loads(tmp_path.read_text(encoding="utf-8")) + tmp_path.unlink(missing_ok=True) + return cfg + except Exception as e: + print(f" [warn] config parse error: {e}", flush=True) + tmp_path.unlink(missing_ok=True) + return None + + # Try with explicit model-type first, fall back without it + cfg = _try(["--model-type", model_type]) + if cfg is None: + print(" [warn] config with --model-type failed, retrying without…", flush=True) + cfg = _try([]) + return cfg + + +def make_hypothesis_config( + base: dict, opset_override: int | None, extra_optim: dict | None +) -> dict: + """Return a modified copy of base config for this hypothesis.""" + cfg = copy.deepcopy(base) + if opset_override is not None: + if cfg.get("export"): + cfg["export"]["opset_version"] = opset_override + if extra_optim is not None: + existing = cfg.get("optim") or {} + cfg["optim"] = {**existing, **extra_optim} + return cfg + + +def run_build(model_id: str, cfg_path: Path, out_dir: Path) -> tuple[bool, str]: + """Run `winml build -c cfg_path -m model_id -o out_dir --ep qnn --device npu --no-compile`.""" + out_dir.mkdir(parents=True, exist_ok=True) + cmd = [ + WINML, + "build", + "-c", + str(cfg_path), + "-m", + model_id, + "-o", + str(out_dir), + "--ep", + EP, + "--device", + DEVICE, + "--no-compile", + "--rebuild", + ] + rc, out, _ = run_cmd(cmd, label=f"winml build [{out_dir.name}]", timeout=BUILD_TIMEOUT_S) + return rc == 0, out + + +def bench_screen(model_path: Path) -> tuple[float | None, float, bool]: + """Phase A: 200-iter screen. + Returns (p50_ms, cv, stable). + p50_ms=None only on hard failure (rc!=0 or missing output file). + QNN NPU DVFS routinely produces CV >> 0.15 — high CV is logged but does NOT + block Phase B; Phase B's multi-session cool-down is the thermal control. + """ + out_json = model_path.parent / "screen_perf.json" + rc, _, _ = run_cmd( + [ + WINML, + "perf", + "-m", + str(model_path), + "--ep", + EP, + "--device", + DEVICE, + "--warmup", + str(SCREEN_WARMUP), + "--iterations", + str(SCREEN_ITERS), + "-o", + str(out_json), + ], + label=f"perf screen ({SCREEN_ITERS} iters)", + timeout=BENCH_TIMEOUT_S, + ) + if rc != 0 or not out_json.exists(): + return None, 999.0, False + try: + data = json.loads(out_json.read_text()) + lat = data["latency_ms"] + p50, std = lat["p50"], lat["std"] + cv = std / p50 if p50 > 0 else 999.0 + stable = cv <= SCREEN_CV_MAX + tag = "stable" if stable else "HIGH-CV (DVFS noise — proceeding to Phase B)" + print(f" screen: p50={p50:.2f}ms std={std:.2f}ms CV={cv:.3f} [{tag}]", flush=True) + return p50, cv, stable + except Exception as e: + print(f" [warn] screen parse error: {e}", flush=True) + return None, 999.0, False + + +def bench_full(model_path: Path) -> list[float]: + """Phase B: 3 × 500-iter full bench with cool-down. Returns list of p50 values.""" + p50s: list[float] = [] + for s in range(1, FULL_SESSIONS + 1): + out_json = model_path.parent / f"full_perf_s{s}.json" + rc, _, _ = run_cmd( + [ + WINML, + "perf", + "-m", + str(model_path), + "--ep", + EP, + "--device", + DEVICE, + "--warmup", + str(FULL_WARMUP), + "--iterations", + str(FULL_ITERS), + "-o", + str(out_json), + ], + label=f"perf full s{s}/{FULL_SESSIONS} ({FULL_ITERS} iters)", + timeout=BENCH_TIMEOUT_S, + ) + if rc == 0 and out_json.exists(): + try: + data = json.loads(out_json.read_text()) + lat = data["latency_ms"] + p50, std = lat["p50"], lat["std"] + cv = std / p50 if p50 > 0 else 999.0 + print(f" full s{s}: p50={p50:.2f}ms std={std:.2f}ms CV={cv:.3f}", flush=True) + p50s.append(p50) + except Exception as e: + print(f" [warn] full bench s{s} parse error: {e}", flush=True) + else: + print(f" [warn] full bench s{s} failed", flush=True) + if s < FULL_SESSIONS: + print(f" cool-down {COOL_DOWN_S}s…", flush=True) + time.sleep(COOL_DOWN_S) + return p50s + + +def run_eval(model_path: Path, model_id: str, task: str) -> float | None: + """Run `winml eval` for accuracy. Returns accuracy or None.""" + out_json = model_path.parent / "eval_result.json" + rc, _, _ = run_cmd( + [ + WINML, + "eval", + "-m", + str(model_path), + "--model-id", + model_id, + "--task", + task, + "--ep", + EP, + "--device", + DEVICE, + "--samples", + str(EVAL_SAMPLES), + "-o", + str(out_json), + ], + label="winml eval (accuracy gate)", + timeout=EVAL_TIMEOUT_S, + ) + if rc != 0 or not out_json.exists(): + return None + try: + data = json.loads(out_json.read_text()) + metrics = data.get("metrics", data) + acc = metrics.get("accuracy") + if acc is not None: + print(f" eval accuracy: {acc:.4f}", flush=True) + return float(acc) if acc is not None else None + except Exception as e: + print(f" [warn] eval parse error: {e}", flush=True) + return None + + +def _perf_result(onnx_path: Path, model_id: str, task: str, run_eval_flag: bool) -> dict: + """Run Phase A + Phase B bench and optionally eval. Returns result dict.""" + result: dict = {"status": "PENDING", "screen": {}, "full": {}, "accuracy": None} + + p50_screen, cv_screen, stable = bench_screen(onnx_path) + result["screen"] = { + "p50_ms": p50_screen, + "cv": round(cv_screen, 4), + "stable": stable, + } + + if p50_screen is None: + # Hard failure (rc != 0 or missing output) — cannot proceed + result["status"] = "SCREEN_FAIL" + return result + + # QNN NPU note: always proceed to Phase B even if screen CV is high. + # Phase B multi-session cool-down is the thermal / DVFS control. + if not stable: + result["screen"]["note"] = "DVFS noise — high CV expected on QNN NPU" + + full_p50s = bench_full(onnx_path) + if not full_p50s: + result["status"] = "BENCH_FAIL" + return result + + median_p50 = float(sorted(full_p50s)[len(full_p50s) // 2]) + result["full"] = { + "p50s_ms": [round(p, 3) for p in full_p50s], + "median_p50_ms": round(median_p50, 3), + } + result["status"] = "OK" if stable else "OK_HIGH_CV" + + if run_eval_flag: + acc = run_eval(onnx_path, model_id, task) + result["accuracy"] = acc + + return result + + +# ── main sweep logic ────────────────────────────────────────────────────────── + + +def sweep_model( + model_id: str, + task: str, + model_type: str, + run_eval_on_baseline: bool, +) -> dict: + """Run all 6 hypotheses for one model on QNN NPU. Returns results dict.""" + model_slug = model_id.replace("/", "--") + model_dir = RESULTS_DIR / model_slug + model_dir.mkdir(parents=True, exist_ok=True) + + results: dict = { + "model_id": model_id, + "task": task, + "model_type": model_type, + "timestamp": datetime.now().isoformat(timespec="seconds"), + "ep": EP, + "device": DEVICE, + "baseline_opset": None, + "hypotheses": {}, + "best_hypothesis": None, + "baseline_p50_ms": None, + "best_p50_ms": None, + "best_gain_pct": None, + "npu001_generalized": None, # True/False/"neutral"/None + "feature_gaps": [], + "errors": [], + } + + print(f"\n{'=' * 64}", flush=True) + print(f" SWEEP: {model_id} [{task}]", flush=True) + print(f"{'=' * 64}", flush=True) + + model_start = time.time() + + # ── Step 1: generate base config (auto-detect for QNN NPU) ──────────────── + print("\n[1/3] Generating base config (winml config)…", flush=True) + base_config = get_base_config(model_id, task, model_type) + + if base_config is None: + results["errors"].append("base config generation failed — model may not be supported") + results["feature_gaps"].append("winml config failed for this model (inspect winml output)") + _save_results(results, model_dir) + return results + + baseline_opset = (base_config.get("export") or {}).get("opset_version", "?") + results["baseline_opset"] = baseline_opset + base_quant = base_config.get("quant") + print( + f" auto-config: opset={baseline_opset} quant={'W8A16' if base_quant else 'NONE'}", + flush=True, + ) + if base_quant is None: + results["feature_gaps"].append( + "auto-config did not include quantization — possible model type not supported for W8A16" + ) + optim_keys = list((base_config.get("optim") or {}).keys()) + print(f" auto-config optim: {optim_keys}", flush=True) + + # ── Step 2: per-hypothesis loop ─────────────────────────────────────────── + print(f"\n[2/3] Running {len(HYPOTHESES)} hypotheses…", flush=True) + + for hyp_id, label, opset_override, extra_optim in HYPOTHESES: + elapsed_total = time.time() - model_start + if elapsed_total > MODEL_TIMEOUT_S: + print( + f"\n ⏰ MODEL TIMEOUT ({elapsed_total:.0f}s > {MODEL_TIMEOUT_S}s) — stopping", + flush=True, + ) + results["hypotheses"][hyp_id] = {"status": "TIMEOUT", "label": label} + results["errors"].append(f"Model timed out at {elapsed_total:.0f}s (before {hyp_id})") + continue + + sep = "─" * 56 + print(f"\n{sep}", flush=True) + print(f" {hyp_id}: {label}", flush=True) + print(f"{sep}", flush=True) + + # Build config for this hypothesis + hyp_config = make_hypothesis_config(base_config, opset_override, extra_optim) + opset_used = (hyp_config.get("export") or {}).get("opset_version", "?") + print(f" opset={opset_used} extra_optim={extra_optim}", flush=True) + + hyp_dir = model_dir / hyp_id + hyp_dir.mkdir(parents=True, exist_ok=True) + cfg_path = hyp_dir / "build_config.json" + cfg_path.write_text(json.dumps(hyp_config, indent=2), encoding="utf-8") + + # Build + build_ok, build_out = run_build(model_id, cfg_path, hyp_dir) + + if not build_ok: + is_timeout = "TIMEOUT" in build_out + status = "BUILD_TIMEOUT" if is_timeout else "BUILD_FAIL" + error_snippet = build_out[-600:] if not is_timeout else "build timed out" + results["hypotheses"][hyp_id] = { + "status": status, + "label": label, + "opset": opset_used, + "build_error": error_snippet, + } + results["errors"].append(f"{hyp_id}: {status}") + # Try to extract feature gap info from the build output + if any( + kw in build_out.lower() for kw in ("unsupported", "not supported", "no handler") + ): + results["feature_gaps"].append( + f"{hyp_id} ({label}): EP/op unsupported — '{build_out[-200:]}'" + ) + elif is_timeout: + results["feature_gaps"].append( + f"{hyp_id} ({label}): build timeout — possible QNN compilation hang" + ) + continue + + onnx_path = hyp_dir / "model.onnx" + if not onnx_path.exists(): + # Check for EPContext model (compile might have happened anyway) + ctx_candidates = list(hyp_dir.glob("*_ctx*.onnx")) + list( + hyp_dir.glob("model_npu*.onnx") + ) + if ctx_candidates: + onnx_path = ctx_candidates[0] + print(f" [info] using compiled model: {onnx_path.name}", flush=True) + else: + results["hypotheses"][hyp_id] = { + "status": "NO_MODEL_ONNX", + "label": label, + "opset": opset_used, + } + results["errors"].append(f"{hyp_id}: build OK but model.onnx missing") + results["feature_gaps"].append( + f"{hyp_id}: build completed but no model.onnx produced (unexpected pipeline behavior)" + ) + continue + + # Only run eval for h0 (baseline) on image-classification models + do_eval = run_eval_on_baseline and hyp_id == "h0" and task == "image-classification" + + bench = _perf_result(onnx_path, model_id, task, do_eval) + bench["label"] = label + bench["opset"] = opset_used + results["hypotheses"][hyp_id] = bench + + if bench["status"] == "UNSTABLE": + results["errors"].append(f"{hyp_id}: bench UNSTABLE (CV too high)") + + # ── Step 3: compute summary stats ───────────────────────────────────────── + print("\n[3/3] Computing summary stats…", flush=True) + _compute_summary(results) + _save_results(results, model_dir) + return results + + +def _compute_summary(results: dict) -> None: + """Fill in baseline_p50, best_hypothesis, best_gain, npu001_generalized.""" + hyps = results["hypotheses"] + + # Baseline p50: prefer h0, fall back to h1 + baseline_p50: float | None = None + for h_id in ("h0", "h1"): + h = hyps.get(h_id, {}) + if h.get("status") in ("OK", "OK_HIGH_CV"): + baseline_p50 = h.get("full", {}).get("median_p50_ms") + if baseline_p50: + break + results["baseline_p50_ms"] = baseline_p50 + + # Best hypothesis (minimum median p50) + best_p50: float | None = None + best_h: str | None = None + for h_id, h in hyps.items(): + if h.get("status") in ("OK", "OK_HIGH_CV"): + p50 = h.get("full", {}).get("median_p50_ms") + if p50 is not None and (best_p50 is None or p50 < best_p50): + best_p50 = p50 + best_h = h_id + results["best_hypothesis"] = best_h + results["best_p50_ms"] = best_p50 + + if baseline_p50 and best_p50: + gain_pct = (baseline_p50 - best_p50) / baseline_p50 * 100 + results["best_gain_pct"] = round(gain_pct, 2) + + # npu-001 generalization: does h3 (opset 21) beat h1 (opset 17) by ≥5%? + h1 = hyps.get("h1", {}) + h3 = hyps.get("h3", {}) + if h1.get("status") in ("OK", "OK_HIGH_CV") and h3.get("status") in ("OK", "OK_HIGH_CV"): + p50_h1 = h1["full"].get("median_p50_ms", float("inf")) + p50_h3 = h3["full"].get("median_p50_ms", float("inf")) + if p50_h3 < p50_h1 * 0.95: # ≥5% improvement for h3 + results["npu001_generalized"] = True + gain = (p50_h1 - p50_h3) / p50_h1 * 100 + print( + f" ✓ npu-001 GENERALIZES: opset21={p50_h3:.1f}ms vs opset17={p50_h1:.1f}ms (+{gain:.1f}%)", + flush=True, + ) + elif p50_h1 < p50_h3 * 0.95: # opset 17 is better + results["npu001_generalized"] = False + print( + f" ✗ npu-001 does NOT generalize: opset17={p50_h1:.1f}ms < opset21={p50_h3:.1f}ms", + flush=True, + ) + else: + results["npu001_generalized"] = "neutral" + print( + f" ~ npu-001 neutral: opset17={p50_h1:.1f}ms ≈ opset21={p50_h3:.1f}ms", flush=True + ) + else: + missing = [h for h, d in [("h1", h1), ("h3", h3)] if d.get("status") != "OK"] + results["npu001_generalized"] = f"N/A ({', '.join(missing)} not OK)" + + +def _save_results(results: dict, model_dir: Path) -> None: + """Write results.json.""" + out = model_dir / "results.json" + out.write_text(json.dumps(results, indent=2, ensure_ascii=False), encoding="utf-8") + print(f" Results: {out}", flush=True) + + +# ── summary writer ──────────────────────────────────────────────────────────── + + +def write_summary(all_results: list[dict]) -> None: + """Write SUMMARY.md to RESULTS_DIR.""" + lines: list[str] = [ + "# QNN NPU Optimization Sweep — Catalog Models", + "", + f"Generated: {datetime.now().isoformat(timespec='seconds')} ", + f"EP: `{EP}` / device: `{DEVICE}` ", + f"Bench protocol: Phase-A {SCREEN_ITERS} iters (CV<{SCREEN_CV_MAX * 100:.0f}%)," + f" Phase-B {FULL_ITERS}×{FULL_SESSIONS} sessions ", + "", + "---", + "", + "## Per-Model Results", + "", + "| Model | Task | Baseline p50 | Best p50 | Best config | Gain% | opset-21 helps? | Notes |", + "|-------|------|-------------|----------|-------------|-------|-----------------|-------|", + ] + + for r in all_results: + model_id = r["model_id"] + task = r.get("task", "?") + baseline = f"{r['baseline_p50_ms']:.1f} ms" if r.get("baseline_p50_ms") else "N/A" + best = f"{r['best_p50_ms']:.1f} ms" if r.get("best_p50_ms") else "N/A" + best_h = r.get("best_hypothesis") or "N/A" + # Annotate best_h with label + best_label = "" + if best_h != "N/A": + h_data = r.get("hypotheses", {}).get(best_h, {}) + best_label = h_data.get("label", "") + gain = f"{r['best_gain_pct']:.1f}%" if r.get("best_gain_pct") is not None else "N/A" + npu001 = r.get("npu001_generalized") + if npu001 is True: + npu001_str = "✓ YES" + elif npu001 is False: + npu001_str = "✗ NO" + elif npu001 == "neutral": + npu001_str = "~ neutral" + else: + npu001_str = f"N/A ({npu001})" + errors = "; ".join(r.get("errors", []))[:100] or "none" + lines.append( + f"| `{model_id}` | {task} | {baseline} | {best} | {best_h} ({best_label}) | {gain} | {npu001_str} | {errors} |" + ) + + # Per-model hypothesis breakdown + lines += [ + "", + "## Hypothesis Breakdown per Model", + "", + ] + for r in all_results: + lines.append(f"### {r['model_id']}") + lines.append("") + lines.append( + "| Hypothesis | Opset | Screen p50 | Full p50 (median) | CV | Status | Accuracy |" + ) + lines.append( + "|------------|-------|-----------|-------------------|-----|--------|---------|" + ) + for h_id, h_data in r.get("hypotheses", {}).items(): + lbl = h_data.get("label", "") + opset = h_data.get("opset", "?") + s_p50 = h_data.get("screen", {}).get("p50_ms") + s_p50_str = f"{s_p50:.1f}" if s_p50 else "—" + f_p50 = h_data.get("full", {}).get("median_p50_ms") + f_p50_str = f"{f_p50:.1f}" if f_p50 else "—" + cv = h_data.get("screen", {}).get("cv", "?") + cv_str = f"{cv:.3f}" if isinstance(cv, float) else str(cv) + status = h_data.get("status", "?") + stable = h_data.get("screen", {}).get("stable", True) + if not stable and status.startswith("OK"): + status += " ⚡DVFS" + acc = h_data.get("accuracy") + acc_str = f"{acc:.3f}" if acc is not None else "—" + lines.append( + f"| {h_id} ({lbl}) | {opset} | {s_p50_str} | {f_p50_str} | {cv_str} | {status} | {acc_str} |" + ) + lines.append("") + + # Cross-model patterns + lines += [ + "---", + "", + "## Cross-Model Patterns", + "", + "### npu-001: Does opset 21 bypass help broadly?", + "", + ] + + npu001_map = {r["model_id"]: r.get("npu001_generalized") for r in all_results} + yes_m = [m for m, v in npu001_map.items() if v is True] + no_m = [m for m, v in npu001_map.items() if v is False] + neut_m = [m for m, v in npu001_map.items() if v == "neutral"] + na_m = [m for m, v in npu001_map.items() if v not in (True, False, "neutral")] + + lines += [ + f"- **Helps ({len(yes_m)} models):** {', '.join(f'`{m}`' for m in yes_m) or 'none'}", + f"- **Hurts ({len(no_m)} models):** {', '.join(f'`{m}`' for m in no_m) or 'none'}", + f"- **Neutral ({len(neut_m)} models):** {', '.join(f'`{m}`' for m in neut_m) or 'none'}", + f"- **N/A ({len(na_m)} models):** {', '.join(f'`{m}`' for m in na_m) or 'none'}", + "", + ] + + total_tested = len(yes_m) + len(no_m) + len(neut_m) + if total_tested > 0: + if len(yes_m) > total_tested / 2: + lines.append( + f"> **Finding**: opset 21 bypass generalizes to {len(yes_m)}/{total_tested} tested models." + " Consider upgrading npu-001 scope from ConvNext-only to broader architectures." + ) + elif len(no_m) > total_tested / 2: + lines.append( + f"> **Finding**: opset 21 bypass does NOT broadly generalize ({len(no_m)}/{total_tested} hurt)." + " npu-001 appears ConvNext-specific (residual connection topology dependency confirmed)." + ) + else: + lines.append( + f"> **Finding**: Mixed results ({len(yes_m)} help, {len(no_m)} hurt, {len(neut_m)} neutral)." + " Architecture-dependent. Confirm ORT `kMaxSupportedOpset` version before drawing conclusions." + ) + lines.append("") + + lines += [ + "### Feature Gaps", + "", + ] + all_gaps: list[str] = [] + for r in all_results: + for gap in r.get("feature_gaps", []): + all_gaps.append(f"- **`{r['model_id']}`**: {gap}") + lines += all_gaps if all_gaps else ["- No feature gaps observed"] + + lines += [ + "", + "### Build / Compatibility Issues", + "", + ] + for r in all_results: + errs = r.get("errors", []) + if errs: + lines.append(f"**`{r['model_id']}`**") + for e in errs: + lines.append(f" - {e}") + + lines += [ + "", + "---", + "", + "## Updated Recommendations for `ep_knowledge/qnn_npu.json`", + "", + "Based on this cross-architecture sweep:", + "", + ] + + # Auto-generate KB recommendations + if total_tested > 0: + if len(yes_m) >= 2: + lines += [ + "- **npu-001**: Broaden scope beyond ConvNext. Architectures that benefit: " + f"{', '.join(yes_m)}. Update `scope` field and set `gate1_statistical` confidence accordingly.", + "- **search_space_rules.opset.recommended_order**: Retain `[21, 17]` as default order.", + ] + if len(no_m) >= 2: + lines += [ + "- **npu-001**: Keep 'architecture-specific' caveat. Architectures where opset 21 hurts: " + f"{', '.join(no_m)}. Add to `do_not_generalize_to` list.", + "- **search_space_rules**: Add architecture check before applying opset 21 preference.", + ] + + # Conv fusions analysis + lines += [ + "", + "### Conv Fusion Findings (h4 vs h1, h5 vs h3)", + "", + ] + for r in all_results: + h1_p50 = r.get("hypotheses", {}).get("h1", {}).get("full", {}).get("median_p50_ms") + h4_p50 = r.get("hypotheses", {}).get("h4", {}).get("full", {}).get("median_p50_ms") + h3_p50 = r.get("hypotheses", {}).get("h3", {}).get("full", {}).get("median_p50_ms") + h5_p50 = r.get("hypotheses", {}).get("h5", {}).get("full", {}).get("median_p50_ms") + parts = [] + if h1_p50 and h4_p50: + delta = (h1_p50 - h4_p50) / h1_p50 * 100 + parts.append(f"conv-fusions on opset17: {delta:+.1f}% ({h1_p50:.1f}→{h4_p50:.1f}ms)") + if h3_p50 and h5_p50: + delta = (h3_p50 - h5_p50) / h3_p50 * 100 + parts.append(f"conv-fusions on opset21: {delta:+.1f}% ({h3_p50:.1f}→{h5_p50:.1f}ms)") + if parts: + lines.append(f"- **`{r['model_id']}`**: {'; '.join(parts)}") + + summary_path = RESULTS_DIR / "SUMMARY.md" + summary_path.write_text("\n".join(lines) + "\n", encoding="utf-8") + print(f"\n📄 Summary: {summary_path}", flush=True) + + +# ── entry point ─────────────────────────────────────────────────────────────── + + +def main() -> None: + parser = argparse.ArgumentParser( + description="QNN NPU optimization hypothesis sweep for winml catalog models" + ) + parser.add_argument( + "--model", default=None, help="Single HF model ID to sweep (default: all catalog models)" + ) + parser.add_argument( + "--task", default=None, help="Task override (required when --model is given)" + ) + parser.add_argument( + "--model-type", default="auto", help="Model type hint (e.g. resnet, vit). Default: auto" + ) + parser.add_argument( + "--skip-eval", + action="store_true", + help="Skip winml eval accuracy step even for image models", + ) + args = parser.parse_args() + + RESULTS_DIR.mkdir(parents=True, exist_ok=True) + + # Confirm QNN EP is present + print("=== Confirming QNN EP ===", flush=True) + rc, out, _ = run_cmd([WINML, "sys", "--list-ep"], label="winml sys --list-ep", timeout=30) + if "qnn" not in out.lower(): + print("❌ QNN EP not detected! Aborting.", flush=True) + sys.exit(1) + print("✓ QNN EP available\n", flush=True) + + # Determine model list + if args.model: + if not args.task: + print("Error: --task is required when --model is specified", flush=True) + sys.exit(1) + models_to_run: list[tuple[str, str, str, bool]] = [ + (args.model, args.task, args.model_type, not args.skip_eval) + ] + else: + models_to_run = ALL_MODELS # type: ignore[assignment] + + all_results: list[dict] = [] + + for model_id, task, model_type, do_eval in models_to_run: + if args.skip_eval: + do_eval = False + try: + result = sweep_model(model_id, task, model_type, do_eval) + except Exception as exc: + print(f"\n❌ Unexpected error for {model_id}: {exc}", flush=True) + result = { + "model_id": model_id, + "task": task, + "model_type": model_type, + "errors": [f"Unexpected exception: {exc}"], + "hypotheses": {}, + "feature_gaps": [], + } + all_results.append(result) + + # Save rolling summary after each model + write_summary(all_results) + + print("\n" + "=" * 64, flush=True) + print(" SWEEP COMPLETE", flush=True) + print("=" * 64, flush=True) + write_summary(all_results) + + +if __name__ == "__main__": + main() diff --git a/research/autoconfig/ep_knowledge/README.md b/research/autoconfig/ep_knowledge/README.md new file mode 100644 index 000000000..61ccd28cc --- /dev/null +++ b/research/autoconfig/ep_knowledge/README.md @@ -0,0 +1,25 @@ +# Per-EP Empirical Knowledge Base + +Each JSON file stores empirical findings for one EP/device combination. + +## ⚠️ CRITICAL EPISTEMICS + +These findings are **observational hypotheses, not ground truth**. They were derived +from a small number of experiments on a single model (ConvNext-tiny) on a single device +(Snapdragon X Elite CRD). Every finding carries a `confidence` field and a `falsified_by` +field. Before using a finding to prune a search space, check: + +1. **Is the model architecture similar?** (ConvNext ≠ BERT ≠ ResNet) +2. **Is the hardware the same?** (X Elite CRD ≠ X Plus ≠ X1E-80-100) +3. **Is the ORT/QNN SDK version the same?** +4. **Is the mechanism confirmed?** (see `mechanism_confirmed` field) + +**Dialectical rule**: A finding that prunes a search dimension must be re-enabled +if a new experiment on a new model/hardware contradicts it. Findings degrade over time +as ORT and QNN SDK versions change. + +## Files +- `qnn_npu.json` — QNN HTP (NPU) EP findings +- `qnn_gpu.json` — QNN GPU EP findings +- `dml.json` — DirectML EP findings +- `cpu.json` — CPU EP findings diff --git a/research/autoconfig/ep_knowledge/cpu.json b/research/autoconfig/ep_knowledge/cpu.json new file mode 100644 index 000000000..8edb8fb06 --- /dev/null +++ b/research/autoconfig/ep_knowledge/cpu.json @@ -0,0 +1,126 @@ +{ + "_meta": { + "ep": "cpu", + "device": "cpu", + "hardware": "Snapdragon X Elite CRD (Oryon CPU)", + "ort_version": "1.x (check winml version at experiment time)", + "model": "facebook/convnext-tiny-224 (ALL findings from this model only)", + "last_updated": "2026-06-10", + "epistemics_warning": "⚠️ All findings from rigorous 3-run ablation. However, still 1 model, 1 device. CPU behavior can differ significantly between x86 and ARM (Oryon). Check architecture before applying rules." + }, + + "findings": [ + + { + "id": "cpu-001", + "title": "opset 19+ causes severe regression on CPU EP (3-4x slowdown)", + "observation": "opset 17: p50=43.7ms. opset 19: p50=160ms (3.7x). opset 20: p50=131ms (3.0x). opset 21: p50=170ms (3.9x). opset 22: p50=85ms (1.9x). All runs consistent — not noise.", + "mechanism_confirmed": true, + "mechanism_hypothesis": "ORT C++ Transpose Optimizer has kMaxSupportedOpset gate. If model opset > kMaxSupportedOpset, the entire Transpose Optimizer is skipped silently. ConvNext has 42 Transpose nodes — without optimization, each executes as a full memory-layout copy. Code: onnxruntime/core/optimizer/transpose_optimization/optimizer_api.h. kMaxSupportedOpset is bumped with each ORT release.", + "action_for_autoconfig": "For CPU EP: default to opset 17. Do NOT try opset 19+ unless you first verify that the shipping ORT version's kMaxSupportedOpset >= target_opset.", + "confidence": "high — mechanism confirmed by source code + ORT session opt-level experiment (ENABLE_ALL removes the regression)", + "falsified_by": null, + "scope": "Models with many Transpose nodes (ConvNext, ViT, vision transformers). Models with few Transposes (BERT) may be less affected.", + "ort_kMaxSupportedOpset_by_version": { + "v1.14.x": 18, + "v1.16.x": 19, + "v1.17.x": 20, + "v1.18.x": 21, + "main_HEAD": 26 + }, + "do_not_generalize_to": "QNN NPU EP or DML EP — kMaxSupportedOpset is a CPU-only ORT optimizer gate. These EPs have their own kernel dispatch unaffected by this." + }, + + { + "id": "cpu-002", + "title": "matmul_add_fusion is a CONFIRMED REGRESSION on ConvNext CPU (+38ms, ~87%)", + "observation": "matmul_add_fusion: p50=81.7ms, runs=[63.0, 70.8, 111.2ms]. Baseline p50=43.7ms. All 3 runs far above highest baseline run (45.4ms).", + "mechanism_confirmed": false, + "mechanism_hypothesis": "ORT baseline already converts MatMul+Add→Gemm (37 Gemm in model.onnx). Applying matmul_add_fusion on top may create redundant kernel dispatch or conflicting operator mapping. Requires profiling to confirm.", + "action_for_autoconfig": "Do NOT apply matmul_add_fusion for CPU EP on models where baseline already uses Gemm (check model.onnx for Gemm nodes before applying this pass).", + "confidence": "high — 3 independent runs, all far above baseline; direction is unambiguous", + "falsified_by": null, + "scope": "ConvNext and models where ORT L2 baseline already fuses MatMul+Add→Gemm", + "do_not_generalize_to": "Models where baseline does NOT have Gemm (the pass may legitimately help there)" + }, + + { + "id": "cpu-003", + "title": "transpose_optimizer is neutral on ConvNext CPU (NOT +270ms as previously reported)", + "observation": "winml perf (warmup=10, iter=50): 42.3 / 52.3 / 41.8ms — overlapping baseline. Earlier winml eval-based measurement showed +270ms — this was a measurement artifact.", + "mechanism_confirmed": true, + "mechanism_hypothesis": "winml eval includes HF preprocessing + model load + no warmup. The +270ms was preprocessing overhead, not inference regression. Pure inference measurement (winml perf) shows no effect.", + "action_for_autoconfig": "transpose_optimizer is neutral for ConvNext CPU — neither helpful nor harmful. Can be omitted from search space.", + "confidence": "high — measurement methodology confirmed; tool comparison validated", + "falsified_by": "Earlier winml eval measurement — RETRACTED. Use winml perf for all latency comparisons.", + "scope": "ConvNext CPU", + "measurement_lesson": "Always use winml perf (warmup=10, iter=50) for latency experiments. Never use winml eval latency to compare configs." + }, + + { + "id": "cpu-004", + "title": "nchwc_transformer is neutral on ConvNext CPU", + "observation": "nchwc: 43.4 / 48.0 / 44.7ms — overlapping baseline (42.5–45.4ms). No improvement.", + "mechanism_confirmed": false, + "mechanism_hypothesis": "NCHWc SIMD layout benefits Conv-heavy models. ConvNext has 22 Conv nodes but 57.7% of kernel time is Gemm. The bottleneck is not memory layout but compute throughput — NCHWc doesn't help.", + "action_for_autoconfig": "nchwc_transformer is low-priority for ConvNext-class models. Profile first — if Conv% > 40%, try nchwc. If Gemm% > 50%, skip.", + "confidence": "medium — 3 runs, neutral result; mechanism is a hypothesis", + "falsified_by": null, + "scope": "ConvNext CPU (Gemm-dominated, not Conv-dominated)" + }, + + { + "id": "cpu-005", + "title": "Baseline (no extra flags) is the optimal config for ConvNext CPU", + "observation": "No flag in 22-experiment ablation improved p50 beyond noise. Baseline p50=43.7ms is the floor.", + "mechanism_confirmed": false, + "mechanism_hypothesis": "ORT L2 baseline already applies gelu_fusion and MatMul→Gemm before any user flags. The effective optimization space is narrow for ConvNext on CPU. Compute bottleneck (Gemm=57.7%) is not addressable via graph passes.", + "action_for_autoconfig": "For CPU EP on ConvNext-class models: skip optimization pass sweep. Go directly to quantization experiments.", + "confidence": "high — 22 experiments, no improvement found", + "falsified_by": null, + "scope": "ConvNext-class vision models on CPU", + "do_not_generalize_to": "BERT/Transformer models where attention_fusion + skip_layer_norm can significantly help" + }, + + { + "id": "cpu-006", + "title": "CPU EP opset 21 is 3.9x SLOWER — opposite of QNN NPU behavior", + "observation": "CPU opset 21: p50=170ms. CPU opset 17: p50=43.7ms. QNN NPU opset 21: p50=8.45ms (2.3x FASTER).", + "mechanism_confirmed": true, + "mechanism_hypothesis": "Same kMaxSupportedOpset gate as cpu-001. CPU and QNN NPU have completely different optimizer paths. CPU regression from Transpose Optimizer bypass. QNN NPU speedup from better kernel dispatch (mechanism under research).", + "action_for_autoconfig": "EP ISOLATION: CPU opset findings MUST NOT influence QNN NPU search space, and vice versa. Always validate per EP independently.", + "confidence": "high — both directions confirmed empirically", + "falsified_by": null, + "scope": "ALL — this is a meta-rule about EP isolation, not model-specific" + } + + ], + + "search_space_rules": { + "opset": { + "recommended_order": [17], + "skip": ["19, 20, 21, 22 — kMaxSupportedOpset regression (cpu-001). Only safe to try if ORT version's kMaxSupportedOpset >= target."], + "dialectical_note": "⚠️ This rule is ORT-version dependent. Check kMaxSupportedOpset for the shipping ORT build before skipping higher opsets." + }, + "quantization": { + "recommended": "w8a8 (CPU benefits most from small model size)", + "dialectical_note": "⚠️ W8A8 on CPU not yet validated for ConvNext. General guidance — run accuracy gate." + }, + "compile": { + "always_run": false, + "skip": true, + "dialectical_note": "⚠️ winml compile targets QNN EPContext. Not applicable to CPU EP." + }, + "graph_passes": { + "recommended": "autoconf defaults only", + "skip": ["matmul_add_fusion if model already has Gemm (cpu-002)", "nchwc_transformer if Gemm% > 50% in profile (cpu-004)"], + "dialectical_note": "⚠️ Skip rules are Gemm-bottleneck specific. Conv-heavy models may still benefit from nchwc_transformer." + } + }, + + "meta_lessons": { + "measurement_discipline": "Always use winml perf (warmup=10, iter=50) for latency. Never use winml eval latency. See cpu-003.", + "ep_isolation": "CPU findings (especially opset regression) DO NOT transfer to QNN NPU or DML. Each EP has its own optimizer path. See cpu-006.", + "baseline_check": "Before applying any fusion flag, check model.onnx for existing fused ops. If Gemm already present, matmul_add_fusion is likely a no-op or regression." + } +} diff --git a/research/autoconfig/ep_knowledge/dml.json b/research/autoconfig/ep_knowledge/dml.json new file mode 100644 index 000000000..8b9adb1af --- /dev/null +++ b/research/autoconfig/ep_knowledge/dml.json @@ -0,0 +1,104 @@ +{ + "_meta": { + "ep": "dml", + "device": "gpu", + "hardware": "Snapdragon X Elite CRD (Adreno X1-85 / DirectML via D3D12)", + "ort_version": "1.x with onnxruntime-directml package", + "model": "facebook/convnext-tiny-224 (ALL findings from this model only)", + "last_updated": "2026-06-10", + "epistemics_warning": "⚠️ DML experiments required swapping onnxruntime-directml for onnxruntime (Python package conflict). Results reflect DML EP behavior via winml's DML DLL, not the Python onnxruntime-directml package directly. Re-validate if package setup changes." + }, + + "findings": [ + + { + "id": "dml-001", + "title": "DML FP32 is faster and more stable than QNN GPU FP32 on the same Adreno X1-85", + "observation": "DML FP32: p50=16.9ms, p90=17.7ms, std=0.52. QNN GPU FP32: p50=17.7ms, p90=19.7ms, std=0.97.", + "mechanism_confirmed": false, + "mechanism_hypothesis": "DML JIT-compiles HLSL shaders at model load time — shader compilation is done once, producing stable execution. QNN GPU EP does graph partitioning at each session creation — more overhead and jitter.", + "action_for_autoconfig": "Prefer DML over QNN GPU for GPU inference (faster + more stable). DML is the primary GPU EP to optimize.", + "confidence": "medium — consistent in 3 runs each; mechanism is plausible but not confirmed by profiling", + "falsified_by": null, + "scope": "Adreno X1-85, ConvNext-class models", + "do_not_generalize_to": "NVIDIA/Intel GPUs (QNN GPU not available there anyway)" + }, + + { + "id": "dml-002", + "title": "NHWC transformer hurts DML (same as QNN GPU)", + "observation": "DML NHWC: p50=16.5ms, p90=21.0ms (+19% p90), std=1.89 (3.6x worse than FP32 baseline).", + "mechanism_confirmed": false, + "mechanism_hypothesis": "D3D12 on Adreno X1-85 does not benefit from explicit NHWC layout transforms. DML handles tensor layouts internally via HLSL; adding ORT NHWC Transposes creates overhead.", + "action_for_autoconfig": "Do NOT apply nhwc-transformer for DML EP.", + "confidence": "medium — single run comparison; consistent direction", + "falsified_by": null, + "scope": "Adreno X1-85 + DML", + "do_not_generalize_to": "NVIDIA GPUs (NHWC may help with CUDNN)" + }, + + { + "id": "dml-003", + "title": "DML FP16 gives ~1.4x speedup with NO DVFS bimodal (unlike QNN GPU FP16)", + "observation": "DML FP16 (via Python hack, not official CLI): p50=11.8ms, p90=12.8ms, std=0.66. Clean unimodal distribution.", + "mechanism_confirmed": false, + "mechanism_hypothesis": "DML HLSL shader compilation locks in FP16 compute paths at load time — no dynamic voltage/frequency switching surprises. QNN GPU FP16 showed DVFS bimodal distribution (some runs in high-power state, some in low-power state).", + "action_for_autoconfig": "FP16 is the primary optimization lever for DML. Unblock via #867 (--precision fp16 flag).", + "confidence": "low — experiment used Python hack (not official winml CLI). Mark as SKIPPED/CLI-gap until #867 ships.", + "falsified_by": null, + "scope": "Adreno X1-85 + DML", + "tracked_issue": "#867", + "cli_gap": true, + "cli_gap_note": "⚠️ This finding was produced via a Python workaround, not winml CLI. Cannot be reproduced with winml build today. Blocked on #867." + }, + + { + "id": "dml-004", + "title": "winml analyze returns 0/0/0/251 (all Unknown) for DML EP — no rule data", + "observation": "winml analyze --ep dml outputs: supported=0, partial=0, unsupported=0, unknown=251.", + "mechanism_confirmed": true, + "mechanism_hypothesis": "DML EP supports all standard ONNX ops by design (D3D12 universal op coverage). winml analyze has no DML-specific rule data file. This is a cosmetic gap — DML actually runs all ops natively.", + "action_for_autoconfig": "Do not use winml analyze output to prune search space for DML. Assume all ops supported.", + "confidence": "high — confirmed by DML running all 251 ops with no CPU fallback", + "falsified_by": null, + "scope": "DML EP (all models)", + "tracked_issue": "not filed — cosmetic gap, low priority" + }, + + { + "id": "dml-005", + "title": "opset 21 on DML not yet validated", + "observation": "opset 21 sweep only run on QNN NPU. DML behavior with opset 21 is unknown.", + "mechanism_confirmed": false, + "mechanism_hypothesis": "DML uses D3D12 dispatch — different from QNN EP kernel registry. opset 21 speedup on QNN NPU may not apply.", + "action_for_autoconfig": "Include opset 21 in DML search sweep. No prior data — must run experiment.", + "confidence": "low — no data", + "falsified_by": null, + "scope": "UNKNOWN — needs experiment" + } + + ], + + "search_space_rules": { + "opset": { + "recommended_order": [17, 21], + "rationale": "dml-005: unknown. Include both in sweep.", + "dialectical_note": "⚠️ No data on DML + opset 21. Do not assume NPU behavior transfers." + }, + "quantization": { + "recommended": "fp16 (when #867 ships)", + "skip": ["w8a8", "w8a16 — quantization rarely helps on GPU via DML"], + "dialectical_note": "⚠️ Quantization skip is based on general DML behavior. Some models with large weights may benefit from W8A16 even on DML. Test empirically." + }, + "compile": { + "always_run": false, + "skip": true, + "dialectical_note": "⚠️ DML uses HLSL, not QNN binary compilation. winml compile targets QNN EPContext only. Not applicable to DML." + }, + "graph_passes": { + "recommended": "autoconf defaults only", + "skip": ["nhwc-transformer (dml-002)"], + "dialectical_note": "⚠️ Same as QNN GPU: NHWC hurts on Adreno. NVIDIA/Intel may differ." + } + } +} diff --git a/research/autoconfig/ep_knowledge/qnn_gpu.json b/research/autoconfig/ep_knowledge/qnn_gpu.json new file mode 100644 index 000000000..e51e22457 --- /dev/null +++ b/research/autoconfig/ep_knowledge/qnn_gpu.json @@ -0,0 +1,115 @@ +{ + "_meta": { + "ep": "qnn", + "device": "gpu", + "hardware": "Snapdragon X Elite CRD (Adreno X1-85 / QNN GPU EP)", + "ort_version": "1.x (check winml version at experiment time)", + "qnn_sdk_version": "unknown — check QnnSystem.dll version", + "model": "facebook/convnext-tiny-224 (ALL findings from this model only)", + "last_updated": "2026-06-10", + "epistemics_warning": "⚠️ All findings are hypotheses derived from 1 model on 1 device. Confidence levels reflect mechanism understanding, not universal applicability. GPU EP behavior varies significantly by model architecture and Adreno driver version." + }, + + "findings": [ + + { + "id": "gpu-001", + "title": "FP32 baseline is already optimal for ConvNext on QNN GPU — no optimization pass helps", + "observation": "Full sweep of 11 passes/combinations on ConvNext QNN GPU: all returned 0% node reduction or worse latency. Baseline p50=17.7ms, p90=19.7ms, std=0.97.", + "mechanism_confirmed": true, + "mechanism_hypothesis": "251/0/0/0 (all ops native on GPU, zero CPU fallback). ConvNext linear layers use Reshape→MatMul→Reshape, not bare MatMul+Add — so MatMulAdd→Conv2D rewrites don't match. autoconf (gelu_fusion + matmul_add_fusion) already applied all applicable transforms.", + "action_for_autoconfig": "Skip all graph optimization experiments for QNN GPU on ConvNext-class models. Use FP32 baseline directly.", + "confidence": "high — confirmed by 0% node delta on all rewrites + 251/0/0/0 analyze output", + "falsified_by": null, + "scope": "ConvNext-class models (Reshape→MatMul→Reshape pattern)", + "do_not_generalize_to": "Transformer models with bare MatMul+Add (those may benefit from rewrites)" + }, + + { + "id": "gpu-002", + "title": "NHWC transformer hurts QNN GPU on Adreno X1-85 (~10% worse)", + "observation": "NHWC transformer: p50=19.5ms (+10%), p90=23.8ms (+21%), std=3.43 (3.5x worse). Consistent across multiple runs.", + "mechanism_confirmed": false, + "mechanism_hypothesis": "Adreno X1-85 + QNN GPU EP does not benefit from explicit NHWC layout transforms. QNN GPU EP handles layout internally; forcing NHWC via ORT creates additional Reshape overhead without the memory alignment benefit.", + "action_for_autoconfig": "Do NOT apply nhwc-transformer for QNN GPU EP.", + "confidence": "medium — observed consistently; mechanism hypothesis, not confirmed", + "falsified_by": null, + "scope": "Adreno X1-85 + QNN GPU EP", + "do_not_generalize_to": "Non-Adreno GPUs (NVIDIA, Intel Arc) — NHWC may help there" + }, + + { + "id": "gpu-003", + "title": "winml compile HURTS QNN GPU (~34% regression)", + "observation": "FP32 + compile: p50=23.7ms vs baseline 17.7ms. compile is opposite of NPU: regresses on GPU.", + "mechanism_confirmed": false, + "mechanism_hypothesis": "QNN GPU EP compile (EPContext) is designed for NPU (HTP). On GPU EP, the compilation path may force a different dispatch mode that bypasses the optimized GPU shader path. QNN SDK likely has a GPU-specific compilation flow that winml compile doesn't trigger correctly.", + "action_for_autoconfig": "NEVER run winml compile for QNN GPU EP. This is the opposite of NPU behavior.", + "confidence": "medium — single experiment, consistent direction (34% is large signal); mechanism unconfirmed", + "falsified_by": null, + "scope": "QNN GPU EP", + "do_not_generalize_to": "QNN NPU EP (compile always helps NPU)" + }, + + { + "id": "gpu-004", + "title": "W8A8 QDQ hangs indefinitely on QNN GPU EP", + "observation": "Passing a W8A8 QDQ-annotated ONNX to QNN GPU EP causes infinite hang. winml build's _patch_device() sets quant=null for GPU, preventing this in normal user path.", + "mechanism_confirmed": true, + "mechanism_hypothesis": "QNN SDK's GPU EP does not support QDQ-quantized graphs. This is a known QNN SDK limitation. winml build already protects against this via _patch_device().", + "action_for_autoconfig": "Skip ALL quantization experiments for QNN GPU EP. Do not even attempt W8A8 or W8A16.", + "confidence": "high — hang confirmed; protection mechanism in _patch_device() confirmed by code inspection", + "falsified_by": null, + "scope": "QNN GPU EP (QNN SDK limitation)", + "tracked_issue": "#868 (fast-fail enhancement)" + }, + + { + "id": "gpu-005", + "title": "gelu_fusion improves latency STABILITY (p90/std) on QNN GPU, not p50", + "observation": "Raw export (287 nodes, unfused Gelu): p50=17.4ms, p90=29.2ms, std=5.90. Autoconf (251 nodes, fused Gelu): p50=17.7ms, p90=19.7ms, std=0.97. p50 nearly identical, p90 -48%, std -6x.", + "mechanism_confirmed": true, + "mechanism_hypothesis": "5 separate GPU kernel dispatches (Mul→Div→Erf→Mul→Add) for unfused GELU create scheduling jitter. Single Gelu kernel eliminates dispatch overhead → dramatically lower tail latency.", + "action_for_autoconfig": "Always apply gelu_fusion for QNN GPU (stability benefit). Do not expect p50 improvement.", + "confidence": "high — mechanism is well-understood (GPU kernel dispatch overhead)", + "falsified_by": null, + "scope": "Any model with GELU activations on QNN GPU" + }, + + { + "id": "gpu-006", + "title": "opset 21 on QNN GPU not yet validated", + "observation": "opset 21 sweep only run on QNN NPU. QNN GPU behavior with opset 21 is unknown.", + "mechanism_confirmed": false, + "mechanism_hypothesis": "QNN GPU and QNN NPU use different kernel registries. opset 21 speedup on NPU does NOT imply the same on GPU.", + "action_for_autoconfig": "Do not assume opset 21 helps QNN GPU. Run a validation experiment before adding to search space.", + "confidence": "low — no data", + "falsified_by": null, + "scope": "UNKNOWN — needs experiment" + } + + ], + + "search_space_rules": { + "opset": { + "recommended_order": [17], + "rationale": "gpu-006: opset 21 not validated for GPU. Stay at 17 until tested.", + "dialectical_note": "⚠️ May change once opset 21 GPU experiment is run." + }, + "quantization": { + "recommended": "skip", + "skip": ["all — QDQ hangs on GPU EP (gpu-004)"], + "dialectical_note": "⚠️ This is a QNN SDK limitation, not winml. May change with future QNN SDK versions that support GPU quantization." + }, + "compile": { + "always_run": false, + "skip": true, + "dialectical_note": "⚠️ gpu-003: compile regresses QNN GPU. Confirmed by single experiment. Re-validate if winml compile behavior changes." + }, + "graph_passes": { + "recommended": "autoconf defaults only", + "skip": ["nhwc-transformer (gpu-002)", "all additional fusion passes (gpu-001)"], + "dialectical_note": "⚠️ Skip rules are ConvNext-specific. Transformer models may benefit from attention_fusion etc." + } + } +} diff --git a/research/autoconfig/ep_knowledge/qnn_npu.json b/research/autoconfig/ep_knowledge/qnn_npu.json new file mode 100644 index 000000000..40a50e3b6 --- /dev/null +++ b/research/autoconfig/ep_knowledge/qnn_npu.json @@ -0,0 +1,203 @@ +{ + "_meta": { + "ep": "qnn", + "device": "npu", + "hardware": "Snapdragon X Elite CRD (Adreno X1-85 / Hexagon HTP)", + "ort_version": "1.x (check winml version at experiment time)", + "qnn_sdk_version": "unknown — check QnnSystem.dll version", + "models_tested": [ + "facebook/convnext-tiny-224", + "microsoft/resnet-18", + "google/vit-base-patch16-224", + "apple/mobilevit-small", + "facebook/dinov2-small", + "hustvl/yolos-small", + "distilbert/distilbert-base-uncased-finetuned-sst-2-english", + "sentence-transformers/all-MiniLM-L6-v2", + "deepset/roberta-base-squad2" + ], + "last_updated": "2026-06-13", + "epistemics_warning": "⚠️ All findings are hypotheses derived from limited models on 1 device (Snapdragon X Elite). Confidence levels reflect how well the mechanism is understood, not how universally applicable the finding is. ALWAYS re-validate on new model architectures before using to prune search space." + }, + + "findings": [ + + { + "id": "npu-001", + "title": "opset 21 bypasses NHWC layout transform — beneficial ONLY for Conv+residual architectures", + "observation": "ConvNext: opset 21 p50~12ms vs opset 17 p50~54ms (DVFS-dominated, Gates 1+3 not passed). Catalog sweep 2026-06-13: MobileViT +26.5% (opset21), DINOv2-small +30.6% (opset21). ViT: opset21 -7.4% (no benefit). BERT/RoBERTa/DistilBERT: neutral.", + "mechanism_confirmed": true, + "mechanism_source": "ORT source code investigation (2026-06-10) + catalog sweep validation (2026-06-13)", + "architecture_requirement": ["has_conv_ops", "has_residual_connections"], + "validated_models": { + "benefits_from_opset21": ["convnext", "mobilevit", "dinov2"], + "no_benefit": ["vit", "yolos"], + "neutral": ["distilbert", "bert", "roberta", "mpnet"] + }, + "mechanism_explanation": { + "root_cause": "kMaxSupportedOpset gate in IsSupportedOpset() (onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc). On older ORT where kMaxSupportedOpset < 21, opset 21 models bypass the NCHW→NHWC layout transformer entirely (transform_layout_fn = nullptr in inference_session.cc:1589-1626).", + "why_bypass_helps_convnext": "NHWC layout transform inserts Transpose(NCHW→NHWC) and Transpose(NHWC→NCHW) around Conv. For ConvNext, residual connections consume Conv output BEFORE the following ConvNext permute Transpose — so the two Transposes CANNOT be cancelled. Result: opset 17 NHWC graph has MORE Transpose ops on HTP than opset 21 NCHW graph. Bypassing the transform = cleaner graph = faster.", + "why_cpu_is_opposite": "CPU relies on TransposeOptimizer to REMOVE existing Transposes. Skipping it (opset > kMaxSupportedOpset) leaves Transposes in place → CPU gets SLOWER. QNN's layout transform ADDS new Transposes that can't be fully eliminated → QNN gets FASTER when bypassed. Same constant, opposite effects.", + "key_files": [ + "onnxruntime/core/optimizer/transpose_optimization/onnx_transpose_optimization.cc:2724-2746 — MakeOptimizerContext() gate", + "onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc — IsSupportedOpset()", + "onnxruntime/core/session/inference_session.cc:1589-1626 — transform_layout_fn=nullptr path", + "onnxruntime/core/optimizer/transpose_optimization/ort_transpose_optimization.cc — EPAwareHandleReshape (QNN-specific)", + "onnxruntime/core/providers/qnn/builder/op_builder_factory.cc — NO opset dispatch in QNN EP" + ], + "confirmed_negative": "QNN EP itself has NO opset-version-based kernel dispatch. All op→QNN mapping is by op_type string only. The speedup is entirely from ORT optimizer behavior, not QNN SDK." + }, + "critical_caveats": [ + "ORT VERSION DEPENDENCY: kMaxSupportedOpset=26 on current ORT main branch. On current ORT, BOTH opset 17 and 21 get NHWC layout transform → speedup may NOT exist. Must verify ORT version before claiming this works.", + "ARCHITECTURE SPECIFICITY: Only helps models with (1) Conv ops + (2) residual connections + (3) explicit Transpose/permute ops. Pure sequential models (MobileNet) would get SLOWER with opset 21 bypass.", + "DVFS NOISE: Current sweep data (50 iters, warm device) is dominated by thermal variance. Gate 1 (iter≥1000 × 3 sessions) not yet passed.", + "May disappear on newer ORT versions or QNN SDK versions that handle NCHW Conv more efficiently internally." + ], + "action_for_autoconfig": "⚠️ Do NOT treat this as a generic 'try opset 21 first' rule. Correct action: (1) Check if model has Conv ops AND residual connections. (2) If yes: include opset 21 in search. (3) If pure attention (ViT, YOLOS) or pure NLP (BERT-family): opset 21 neutral or harmful — default opset 17. (4) Always check ORT kMaxSupportedOpset at runtime.", + "confidence": "medium-high on mechanism; low on perf claim (Gates 1+3 not yet passed)", + "falsified_by": null, + "scope": "Models with Conv + residual connections + explicit Transpose ops (ConvNext, MobileViT, DINOv2). Does NOT apply to pure transformers (ViT, YOLOS) or NLP models (BERT-family).", + "tracked_issue": "#869", + "perf_gain_validation_gates": { + "gate1_statistical": "FAILED — need iter>=1000 x 3 independent sessions with cool-down", + "gate2_mechanism": "PASSED — confirmed via ORT source code (kMaxSupportedOpset gate + NHWC transform bypass + residual-blocked Transpose cancellation)", + "gate3_thermal_control": "FAILED — sequential benchmarks on warm device" + }, + "follow_up_required": [ + "Verify ORT version's kMaxSupportedOpset: if >= 21, mechanism no longer applies", + "Dump optimized graph for both opsets (sess_options.optimized_model_filepath) — if opset 17 has more Transpose nodes in QNN partition, confirms mechanism", + "Run iter=1000 x 3 sessions with cool-down to pass Gate 1", + "Test on current ORT head to see if gain persists" + ], + "experiments": [ + {"opset": 17, "p50_ms": 54.2, "p90_ms": 104.5, "min_ms": 9.56, "std_ms": 44.1, "iters": 50, "note": "warm device, DVFS-dominated"}, + {"opset": 18, "p50_ms": 43.7, "p90_ms": 326.1, "min_ms": 10.5, "std_ms": 153.2, "iters": 50, "note": "bimodal — severe DVFS"}, + {"opset": 19, "p50_ms": 12.1, "p90_ms": 77.7, "min_ms": 9.11, "std_ms": 60.0, "iters": 50}, + {"opset": 20, "p50_ms": 12.0, "p90_ms": 99.4, "min_ms": 9.48, "std_ms": 88.5, "iters": 50}, + {"opset": 21, "p50_ms": 12.2, "p90_ms": 38.0, "min_ms": 9.73, "std_ms": 10.1, "iters": 20, "note": "only 20 iters"}, + {"opset": 22, "p50_ms": 13.6, "p90_ms": 34.5, "min_ms": 8.80, "std_ms": 37.2, "iters": 50} + ] + }, + + { + "id": "npu-002", + "title": "W8A16 quantization provides ~1.9x speedup over FP32 on QNN NPU", + "observation": "FP32 baseline: p50=19.4ms. W8A16 quantized (minmax, 128 samples): p50=10.29ms.", + "mechanism_confirmed": true, + "mechanism_hypothesis": "QNN HTP has native INT8 weight / FP16 activation datapath. W8A16 maps directly to HTP's weight-compressed matmul kernels.", + "action_for_autoconfig": "Always quantize for QNN NPU. W8A16 is the starting point for ConvNext-class models.", + "confidence": "high — mechanism is well-understood (HTP architecture)", + "falsified_by": null, + "scope": "General (applies to most vision models on QNN NPU)", + "do_not_generalize_to": "Models with unusual op types not supported by QNN W8A16 path" + }, + + { + "id": "npu-003", + "title": "winml compile adds ~1.7x speedup on top of quantization for QNN NPU", + "observation": "W8A16 quantized: p50=10.29ms. W8A16 + compiled (EPContext): p50=6.01ms.", + "mechanism_confirmed": true, + "mechanism_hypothesis": "Compilation pre-builds the QNN binary graph (.bin) and eliminates JIT graph partitioning at session creation time. EPContext model loads the pre-built binary directly.", + "action_for_autoconfig": "Always run winml compile after finding best quantized config for QNN NPU.", + "confidence": "high — mechanism confirmed by QNN SDK documentation", + "falsified_by": null, + "scope": "General (applies to all QNN NPU deployments)" + }, + + { + "id": "npu-004", + "title": "W8A8 is catastrophic for ConvNext-class models on QNN NPU", + "observation": "W8A8 quantization on ConvNext: top-1 accuracy collapses (< 15%). Exact numbers not recorded — aborted early.", + "mechanism_confirmed": false, + "mechanism_hypothesis": "ConvNext uses LayerNormalization + GELU in every block. Quantizing both weights AND activations to INT8 in these ops introduces severe numerical error. QNN NPU partial support for INT8 activations in LN/GELU may exacerbate this.", + "action_for_autoconfig": "If W8A8 top-1 <= 15% on first attempt, skip all W8A8 variants and go directly to W8A16.", + "confidence": "medium — top-1 collapse observed; exact mechanism unconfirmed", + "falsified_by": null, + "scope": "Models with LN+GELU blocks (ConvNext, ViT variants)", + "do_not_generalize_to": "BERT/ResNet models where W8A8 is often fine" + }, + + { + "id": "npu-005", + "title": "QNN Hub W8A16 model (opset 21, uint16 input) is WORSE on our stack than our own W8A16", + "observation": "QNN Hub W8A16 on winml ORT QNN EP: p50=14.82ms, std=8.8ms. Our ORT-quantized W8A16 (opset 17 QDQ): p50=6.01ms stable.", + "mechanism_confirmed": false, + "mechanism_hypothesis": "QNN Hub uses opset 21 QDQ format with uint16 input tensor — this format is incompatible with ORT QNN EP's expected quantization format. ORT QNN EP expects float32 input + int8/int16 weight QDQ, not uint16 input.", + "action_for_autoconfig": "Use ORT-generated W8A16 quantization (winml build), NOT QNN Hub pre-quantized models, when targeting ORT QNN EP stack.", + "confidence": "medium — std=8.8ms suggests format mismatch causing CPU fallback for some nodes", + "falsified_by": null, + "scope": "ORT QNN EP stack (not qairt native stack)" + }, + + { + "id": "npu-006", + "title": "Conv fusions (conv-bn/add/activation) cause catastrophic QNN NPU CPU fallback on Conv-dominant models", + "observation": "ResNet-18 with conv-bn-fusion+conv-add-fusion+conv-activation-fusion: p50=132ms vs baseline 2.72ms = +4900% regression. MobileViT with same fusions: neutral. BERT-family: neutral (no Conv ops).", + "mechanism_confirmed": false, + "mechanism_hypothesis": "ORT conv fusion pass (ConvAddActivationFusion, ConvBNFusion) produces fused op types (e.g., Conv+BN fused) that QNN EP cannot map to HTP kernels. These ops fall back to CPU execution, adding PCIe round-trip overhead per-op for a Conv-heavy graph like ResNet.", + "action_for_autoconfig": "⚠️ CRITICAL: Do NOT apply conv-bn-fusion / conv-add-fusion / conv-activation-fusion for QNN NPU on Conv-dominant models (ResNet, EfficientNet, MobileNet). These passes are beneficial for CPU EP but hazardous for QNN NPU. Always run accuracy + latency gate after applying any Conv fusion. If regression > 5x, disable all conv fusions immediately.", + "confidence": "high on regression observation (4900%); medium on mechanism (CPU fallback hypothesis not yet confirmed via EP partition dump)", + "falsified_by": null, + "scope": "Conv-dominant models (ResNet, EfficientNet, MobileNet). MobileViT may be safe due to different Conv placement. Not applicable to pure transformers or NLP.", + "severity": "critical — can produce 50x regression", + "follow_up_required": [ + "Dump QNN EP partition to confirm fused ops cause CPU fallback", + "Test EfficientNet and MobileNet to confirm generalization", + "Check if winml analyze linter can detect this pattern pre-build" + ] + }, + + { + "id": "npu-007", + "title": "DVFS thermal noise on QNN NPU makes CV-based stability gating unreliable — requires session-level averaging", + "observation": "Across all 8 catalog models, QNN NPU CV ranges 0.1–2.0+ even on warm device. Original CV<15% gate blocks most candidates. Differences < 10% are within noise floor.", + "mechanism_confirmed": true, + "mechanism_hypothesis": "Snapdragon X Elite HTP Hexagon core runs DVFS aggressively. Single-session CV is dominated by thermal state, not model performance. The only reliable signal comes from session-level averaging (3+ independent sessions with cool-down).", + "action_for_autoconfig": "DISABLE CV gate for QNN NPU. Replace with: (1) minimum 3 independent sessions × 500+ iters with 30s cool-down between sessions. (2) Use median p50 across sessions as the signal. (3) Only trust gains > 10% — anything below is within noise floor. (4) Do NOT compare within-session std to declare stability.", + "confidence": "high — consistent across 8 models in catalog sweep", + "falsified_by": null, + "scope": "General — applies to all models on QNN NPU / Snapdragon X Elite HTP", + "bench_protocol_update": { + "screen_phase": "SKIP CV gate; run 200 iters as warmup only", + "full_phase": "3 sessions × 500 iters, 30s cool-down between sessions", + "signal": "median p50 across sessions", + "noise_floor": ">10% gain required to declare improvement" + } + } + + ], + + "search_space_rules": { + "opset": { + "recommended_order_conv_residual": [21, 17], + "recommended_order_pure_attention": [17], + "recommended_order_nlp": [17], + "architecture_gate": "Check model topology first: has_conv_ops AND has_residual_connections → try opset 21. Otherwise → opset 17 only.", + "rationale": "npu-001 (catalog-validated 2026-06-13): opset 21 +26-31% for Conv+residual. -7% for pure ViT. Neutral for BERT-family.", + "dialectical_note": "⚠️ opset 21 benefit requires ORT kMaxSupportedOpset < 21. On newer ORT this may not apply. Always validate." + }, + "quantization": { + "recommended": "w8a16", + "skip": ["w8a8 if initial top1 < 15%"], + "dialectical_note": "⚠️ W8A8 skip rule is ConvNext-specific (LN+GELU sensitivity). Try W8A8 for models without LN in every block." + }, + "compile": { + "always_run": true, + "dialectical_note": "⚠️ Compile benefit is well-understood (EPContext pre-built binary). Low risk of being wrong, but verify compile output loads correctly." + }, + "graph_passes": { + "recommended": "autoconf defaults (gelu_fusion, matmul_add_fusion)", + "NEVER_apply_for_qnn_npu": ["conv-bn-fusion", "conv-add-fusion", "conv-activation-fusion"], + "hazard_note": "npu-006 CRITICAL: Conv fusions cause 4900% regression on ResNet-18. Do NOT apply conv fusions to Conv-dominant models on QNN NPU.", + "dialectical_note": "⚠️ Conv fusion ban is confirmed for ResNet. MobileViT was safe. Always run latency gate after applying any fusion to catch regressions." + }, + "bench_protocol": { + "cv_gate": "DISABLED for QNN NPU (npu-007)", + "sessions": 3, + "iters_per_session": 500, + "cool_down_s": 30, + "noise_floor_pct": 10, + "signal": "median p50 across sessions" + } + } +} diff --git a/research/autoconfig/gen_report_v3.py b/research/autoconfig/gen_report_v3.py new file mode 100644 index 000000000..806bdddc0 --- /dev/null +++ b/research/autoconfig/gen_report_v3.py @@ -0,0 +1,338 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +import datetime +import json + + +results = json.load(open(r"ablation-search\results.json")) + +clean_base = [r for r in results if r["name"] in ["base_0", "base_1"]] +clean_runs = [v for r in clean_base for v in r["p50_runs"]] +clean_mean = round(sum(clean_runs) / len(clean_runs), 1) + + +def verdict(name, mean): + if name in ["base_0", "base_1", "base_2", "base_mid", "base_end"]: + return "outlier run" if name == "base_2" else "baseline" + if name == "matmul_add": + return "CONFIRMED REGRESSION" + if name == "matmul_scale": + return "probable mild regression" + if name.startswith("opset_"): + opset = int(name.split("_")[1]) + if opset >= 19: + return "SEVERE REGRESSION (kMaxSupportedOpset bug)" + return "neutral" + delta = mean - clean_mean + if abs(delta) < 5: + return "neutral" + if delta > 5: + return "mild regression" + return "possible improvement" + + +def row_class(name): + if name in ["base_0", "base_1", "base_mid", "base_end"]: + return "row-base" + if name == "base_2": + return "row-outlier" + if name == "matmul_add": + return "row-bad" + if name.startswith("opset_") and int(name.split("_")[1]) >= 19: + return "row-bad" + if name in ["matmul_scale"]: + return "row-warn" + return "row-neutral" + + +rows_html = "" +for r in results: + runs = r["p50_runs"] + delta = r["p50_mean"] - clean_mean + v = verdict(r["name"], r["p50_mean"]) + rc = row_class(r["name"]) + runs_str = " / ".join("%.1f" % x for x in runs) + sign = "+" if delta >= 0 else "" + rows_html += ( + '%s%.1f%s%.1f' + "%.1f%.1f%s%s\n" + % (rc, r["name"], r["p50_mean"], sign, delta, min(runs), max(runs), runs_str, v) + ) + +bar_labels = [ + r["name"] + for r in results + if r["name"] not in ["base_0", "base_1", "base_2", "base_mid", "base_end"] +] +bar_values = [ + round(r["p50_mean"], 1) + for r in results + if r["name"] not in ["base_0", "base_1", "base_2", "base_mid", "base_end"] +] +bar_colors = [] +for r in results: + if r["name"] in ["base_0", "base_1", "base_2", "base_mid", "base_end"]: + continue + if r["name"] == "matmul_add" or ( + r["name"].startswith("opset_") and int(r["name"].split("_")[1]) >= 19 + ): + bar_colors.append("'#dc3545'") + elif r["name"] in ["matmul_scale"]: + bar_colors.append("'#fd7e14'") + elif abs(r["p50_mean"] - clean_mean) < 5: + bar_colors.append("'#198754'") + else: + bar_colors.append("'#ffc107'") + +bar_labels_js = json.dumps(bar_labels) +bar_values_js = json.dumps(bar_values) +bar_colors_js = ",".join(bar_colors) +n_bars = len(bar_labels) +baseline_line = clean_mean +now_str = datetime.datetime.now().strftime("%Y-%m-%d") +n_results = len(results) + +html = """ + + + +ConvNext CPU Ablation Report + + + + +
+

📊 ConvNext CPU Ablation — Autoconfig POC + Opset Cliff RCA

+

Model: facebook/convnext-tiny-224  |  EP: CPU  |  DATE_PLACEHOLDER  |  N_RESULTS_PLACEHOLDER experiments  |  ORT ORTVER_PLACEHOLDER

+ + + +
+
Clean Baseline p50
CLEAN_MEAN_PLACEHOLDERms
base_0 + base_1, opset=17
+
Best Config Found
Baseline
opset=17, no extra flags
+
Worst Finding
+38ms
matmul-add-fusion
+
Root Cause Found
kMaxSupportedOpset
Transpose Optimizer gate
+
+ + +

🔍 Root Cause Analysis: ORT Opset Performance Cliff

+ +
+❌ ROOT CAUSE IDENTIFIED: ORT kMaxSupportedOpset gates the entire Transpose Optimizer

+In onnxruntime/core/optimizer/transpose_optimization/optimizer_api.h: +
constexpr int64_t kMaxSupportedOpset = 18;  // in ORT v1.14.x
+// Current ORT (v1.24.5) kMaxSupportedOpset = 21 or 22
+
+// In onnx_transpose_optimization.cc:
+if (*opset > kMaxSupportedOpset) {
+    return std::nullopt;  // ← ENTIRE Transpose Optimizer skipped silently
+}
+ConvNext has 42 Transpose nodes forming a NCHW↔NHWC "transpose sandwich" in every block. +The Transpose Optimizer normally eliminates/merges these (pushing through Add×18, Mul×18, canceling adjacent inverses). +When it is bypassed, all 42 Transpose nodes execute as raw memory-layout copy operations → systemic slowdown. +
+ +

📊 ORT Optimization Level Experiment (confirms root cause)

+ + + + + + +
Session Optimization Levelopset=17opset=19RatioExplanation
DISABLE_ALL47.5ms355ms7.5×No Transpose Optimizer → all 42 Transposes execute. v17 model.onnx has pre-fused ops; v19 export has more raw ops.
ENABLE_BASIC289ms315ms1.1×Basic opts run on already-fused model, some interference. Near-parity: Transpose Optimizer not yet active at this level.
ENABLE_EXTENDED209ms241ms1.2×Extended optimizations help both but some overhead from re-optimizing pre-fused model.
ENABLE_ALL (default)216ms215ms1.0×Transpose Optimizer runs on both. Full parity achieved — confirms optimizer gap is the entire cause.
+ + + +

📋 kMaxSupportedOpset Version History (verified from ORT git tags)

+ + + + + + + +
ORT ReleasekMaxSupportedOpsetEffect
v1.14.x18opset ≥ 19 → Transpose Optimizer DISABLED
v1.16.x19opset ≥ 20 → disabled
v1.17.x20opset ≥ 21 → disabled
v1.18.x21opset ≥ 22 → disabled
main/HEAD26Fully covered for all current ONNX opsets
+ +

📜 ORT Source (exact call chain)

+
InferenceSession::Initialize()
+  → graph_transformer_mgr_.ApplyTransformers(graph, Level1)
+      → TransposeOptimizer::ApplyImpl()           [transpose_optimizer.cc:18]
+          → onnx_transpose_optimization::Optimize() [onnx_transpose_optimization.cc:3344]
+              → MakeOptimizerContext(graph, ...)
+                  → graph.Opset("ai.onnx")         // reads DomainToVersionMap()
+                  → if opset > kMaxSupportedOpset: return nullopt  // ← THE GATE
+              → if ctx == nullopt: return early    // no optimization performed
+ +

Why ConvNext is especially sensitive

+

The Transpose Optimizer can push Transposes through Add, Mul, and simple unary ops. ConvNext has 18×(Add + Mul) layer-scale and residual connections between blocks, meaning a single Transpose can cascade through many nodes. With the optimizer enabled, adjacent inverse pairs cancel; without it, every NCHW↔NHWC conversion is a full memory copy of the activation tensor.

+ + +

💡 Ablation Key Findings

+ +
+❌ CONFIRMED REGRESSION: matmul-add-fusion +38ms
+All 3 independent runs: 63.0 / 70.8 / 111.2ms vs clean baseline ~43.7ms. +The minimum observed (63ms) is 20ms above the highest clean-baseline run. Not attributable to noise. +Hypothesis: baseline already converts MatMul+Add→Gemm (37 Gemm in model.onnx); applying matmul-add-fusion creates redundant or conflicting dispatch. Unconfirmed — requires op-level profiling. +
+ +
+📝 MEASUREMENT CORRECTION: transpose-optimizer is NEUTRAL on inference latency
+Earlier 8-iteration search using winml eval reported +270ms. That measurement included HF preprocessing pipeline and had no warmup — it measured application latency, not model inference. +With winml perf (warmup=10, iter=50): 42.3 / 52.3 / 41.8ms — indistinguishable from baseline. +The +270ms was entirely a measurement artifact. Do not cite in user-facing reports. +
+ +
+❌ CONFIRMED: opset=19–22 causes 1.9–3.9× regression on this ORT build
+Mechanism confirmed: kMaxSupportedOpset gate in ORT's Transpose Optimizer. All 3 runs per opset are consistent. +Fix: use opset≤17 (current winml-cli default) OR upgrade ORT to a version where kMaxSupportedOpset ≥ 22 (main branch). +
+ +
+✅ NEUTRAL: nchwc-transformer, transpose-optimizer, opset=18 — all within noise of baseline (~43.7ms). +
+ +
+⚠ PROBABLE MILD REGRESSION: matmul-scale-fusion — all 3 runs elevated (51.5 / 58.1 / 61.2ms). Weak signal due to baseline drift during experiment. +
+ +

📊 Per-Config p50 Latency vs Baseline

+
+ +

📋 Full Results Table

+ + + +ROWS_PLACEHOLDER +
Configp50 mean (ms)Δ vs baselineminmaxRuns (ms)Verdict
+ +

🔧 Optimal Config

+
# Optimal config: baseline (opset=17, constant_folding=True, no extra flags)
+winml build --model-id facebook/convnext-tiny-224 -o out_cpu/
+winml perf -m out_cpu/model.onnx --ep cpu --warmup 10 --iterations 50
+# Expected: p50 ~43-44ms
+
+# AVOID:
+#   --optimize matmul-add-fusion     (confirmed +38ms regression)
+#   opset_version: 19-22             (kMaxSupportedOpset bug: 3-4x regression on affected ORT builds)
+ +

🧠 Open Questions

+
    +
  • Exact ORT version boundary: winml-cli ships ORT 1.24.5 (internal versioning). The exact kMaxSupportedOpset value in that build determines whether opset 19-22 is safe. Needs verification against ORT source at that specific commit.
  • +
  • Why does matmul-add-fusion regress? 37 Gemm nodes already exist; applying this fusion may create double-fusion or suboptimal kernel selection. Requires --profile to confirm.
  • +
  • GELU fusion mystery: baseline model.onnx has com.microsoft/Gelu×18 despite GeluFusion being in disabled_optimizers. Source unclear — likely HF Optimum pre-fuses GELU before ORT.
  • +
+ +
+ + +""" + +import subprocess + + +result = subprocess.run( + ["python", "-c", "import onnxruntime as ort; print(ort.__version__)"], + capture_output=True, + encoding="utf-8", + cwd=r"C:\tmp\autoconfig-demo", + env={ + **__import__("os").environ, + "PATH": r"C:\tmp\autoconfig-demo\.venv\Scripts;" + __import__("os").environ.get("PATH", ""), + }, +) +ort_ver = result.stdout.strip() or "1.24.5" + +html = html.replace("DATE_PLACEHOLDER", now_str) +html = html.replace("N_RESULTS_PLACEHOLDER", str(n_results)) +html = html.replace("ORTVER_PLACEHOLDER", ort_ver) +html = html.replace("CLEAN_MEAN_PLACEHOLDER", str(clean_mean)) +html = html.replace("ROWS_PLACEHOLDER", rows_html) +html = html.replace("BAR_LABELS_JS", bar_labels_js) +html = html.replace("BAR_VALUES_JS", bar_values_js) +html = html.replace("BAR_COLORS_JS", bar_colors_js) +html = html.replace("N_BARS_PLACEHOLDER", str(n_bars)) +html = html.replace("BASELINE_LINE_PLACEHOLDER", str(baseline_line)) + +with open(r"report.html", "w", encoding="utf-8") as f: + f.write(html) +print("report.html written: %d bytes, %d experiments" % (len(html), n_results)) From 76bb07b4e28a37a81c4a5ad9884f7f2d5ebac6ec Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 15 Jun 2026 10:32:18 +0800 Subject: [PATCH 02/38] research: add winml-cli agent layer design doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds research/autoconfig/docs/agent-design.md — strategic design for the agent layer of winml-cli, covering: - winml-cli vs Olive distinction (UX + Windows-first + explainability) - Why autoconfig search is a sub-tool, not the agent entry point - 5 agent types: Diagnostic, Decision Guidance, Cross-Device Confidence, Regression Detection, Model Recommendation - Autoconfig's role within the agent framework - Key concerns and open questions Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- research/autoconfig/docs/agent-design.md | 223 +++++++++++++++++++++++ 1 file changed, 223 insertions(+) create mode 100644 research/autoconfig/docs/agent-design.md diff --git a/research/autoconfig/docs/agent-design.md b/research/autoconfig/docs/agent-design.md new file mode 100644 index 000000000..fa72a332f --- /dev/null +++ b/research/autoconfig/docs/agent-design.md @@ -0,0 +1,223 @@ +# WinML CLI Agent Design + +> Status: Draft — 2026-06-11 +> Context: Strategic design for the agent layer of winml-cli + +--- + +## 1. Context: Why Agent Matters for winml-cli + +### 1.1 winml-cli vs Olive — The Real Distinction + +Microsoft Olive already exists as a pass-based optimization framework supporting QNN, DML, and other Windows EPs. The temptation is to dismiss winml-cli's agent as redundant with Olive. That would be wrong — the distinction is fundamental: + +| Dimension | Olive | winml-cli | +| --- | --- | --- | +| Target user | ML engineer who understands ORT internals | WinApp developer who wants their model to work on Windows | +| Workflow | Compose passes manually, specify EP upfront | `config` + `build` — two commands, full pipeline | +| Hardware selection | Manual EP specification | `--device auto` — detects hardware, selects EP | +| Explainability | Silent pipeline output | Designed for transparency | +| Windows-first | Cross-platform, Windows supported | Built exclusively for Windows hardware diversity | +| Operator diagnostics | Not available | `winml analyze` — operator linting, EP compatibility | +| Agent-ready | Not designed for it | First-class design goal | + +**Analogy:** Olive is webpack (powerful, expert-configured); winml-cli is Vite (opinionated, works for most cases out of the box). + +### 1.2 The Core Gap Agent Should Fill + +WinApp developers lack access to a senior ML engineer who: + +- Knows why a model fails on QNN NPU for this specific operator pattern +- Can read an error message and immediately know the root cause +- Understands which optimization knob to turn for which problem +- Knows how a config that works on Snapdragon X Elite will behave on Intel Meteor Lake + +**The agent's job is to be that person.** + +--- + +## 2. Agent Design Philosophy + +### 2.1 The Wrong Design (Current Autoconfig) + +The current autoconfig agent runs a **headless search loop**: +Explorer → Optimizer → Reviewer → repeat + +**Problems with this approach:** + +- A Python script can do benchmark loops faster, cheaper, and more reliably than an LLM agent +- Results (config files) are not auditable — developer cannot verify why a config was chosen +- No explainability — developer doesn't understand what was decided or why +- Treats developer as absent; no collaborative interaction +- The "agentic" overhead (LLM inference cost per loop iteration) adds nondeterminism without intelligence + +Autoconfig search is useful as a **sub-tool**, not as the primary value proposition of the agent layer. + +### 2.2 The Right Design: Diagnosis + Guidance over Search + +Agent excels at **judgment, diagnosis, and explanation** — not computation. The redesign centers on: + +> **When a developer encounters a problem, the agent gives explanation + executable next step — not a config file.** + +#### Design Principles + +1. **Explain, don't just output** + Instead of silently picking an EP, say: *"I picked QNN EP because your device has a Qualcomm NPU. Operator coverage is 97% — the remaining 3% fall back to CPU, which is acceptable for these specific ops."* +2. **Fix, don't just diagnose** + When an incompatible operator is found, apply the graph transformation — don't just flag it. +3. **Developer talks, agent acts** + The agent is interactive and conversational. Developer says "this model is slow on GPU" → agent asks clarifying questions, runs targeted experiments, explains findings. +4. **Progressive trust** + Show confidence levels. Be explicit about uncertainty. Let the developer see what the agent is doing. Never give false precision (e.g., "Config A is 3% faster" when standard deviation is 5%). +5. **Windows device diversity as first-class concern** + Always reason about what happens on devices the developer doesn't have — not just the machine the agent runs on. + +--- + +## 3. Agent Types + +### 3.1 Diagnostic Agent *(highest priority)* + +**Trigger:** Model fails to load, crashes at inference, throws EP compatibility error +**Developer question:** "My model fails on QNN NPU — why? What do I do?" + +**Agent responsibilities:** + +- Parse error message → identify root cause (unsupported op, shape mismatch, driver version, etc.) +- Analyze model graph → enumerate incompatible operators per EP +- Propose and apply concrete fix (graph transformation, operator substitution, fallback EP) +- Verify fix with `winml eval` accuracy check + +**Why this is Olive-incompatible:** Olive doesn't converse, doesn't diagnose, doesn't explain. It fails silently or produces a broken model. + +**Example interaction:** + +```javascript +Developer: winml build failed. Error: "QNNExecutionProvider: Unsupported op at node /conv/Conv_3" +Agent: Found it. Conv_3 has dynamic padding — QNN NPU requires static shapes. + I'll apply DynamicToFixedShape transform and re-run the compile. + [applies fix] → Build succeeded. NPU latency: 12.3ms. Accuracy delta: 0.01%. +``` + +--- + +### 3.2 Decision Guidance Agent + +**Trigger:** Developer is at a decision point in the pipeline (which EP? which precision? to quantize or not?) +**Developer question:** "I don't know what options to pick. What's the tradeoff?" + +**Agent responsibilities:** + +- Run quick comparative benchmarks (not exhaustive search) +- Present tradeoffs with numbers: latency gain vs accuracy delta vs model size +- Make a recommendation with reasoning, not just a number +- Let developer override with understanding of consequences + +**Key difference from autoconfig:** This is interactive and decision-oriented, not headless. The developer is in the loop. + +--- + +### 3.3 Cross-Device Confidence Agent *(winml-cli unique)* + +**Trigger:** Developer has a working config, asks "will this work on my users' devices?" +**Developer question:** "My app ships on many Windows hardware configs. Will this be okay?" + +**Agent responsibilities:** + +- Given a config optimized for Device A, reason about behavior on Device B, C... +- Identify configs that are device-specific (compiled QNN binaries only work on Qualcomm) +- Generate multi-device config with automatic EP fallback chain (QNN → DML → CPU) +- Surface warnings: "This config will fail on Intel Meteor Lake — here's the fallback" + +**Why this matters:** WinApp developers ship to millions of devices. No other tool addresses Windows hardware diversity in the deployment sense. + +--- + +### 3.4 Regression Detection Agent *(CI/CD scenario)* + +**Trigger:** ORT version bump, driver update, or scheduled CI run +**Developer question:** "Something changed — my model got slower / broke" + +**Agent responsibilities:** + +- Compare before/after perf numbers with statistical validity (not point estimates) +- Correlate change with known ORT/EP changelog entries +- Identify which layer / operator regressed using profiler output +- Propose workaround or file structured bug report + +--- + +## 4. Role of Autoconfig (Perf Search) in This Design + +Autoconfig (opset × EP × opt\_level search) is **not abandoned** — it becomes a sub-tool invoked by the agents above when appropriate. + +**When it's invoked:** + +- Diagnostic Agent: "Let me try a few configs to see if this performance issue is config-related" +- Decision Guidance Agent: "Let me run a quick sweep across opt\_level to show you the tradeoff" + +**What changes:** + +- It's not the primary entry point +- Its output is explained, not just surfaced as a config file +- It runs fewer, targeted experiments (not exhaustive grid search) +- Accuracy validation (`winml eval`) is mandatory before any recommendation + +--- + +### 3.5 Model Recommendation Agent *(developer onboarding scenario)* + +**Trigger:** Developer describes a problem they want to solve but doesn't know which model to use +**Developer question:** "I want to add real-time object detection to my app. What model should I use?" + +**Agent responsibilities:** + +- Understand the developer's task from natural language (classification, detection, segmentation, NLP, etc.) +- Query `winml catalog` to enumerate available built-in models for that task +- Factor in the developer's current hardware (`winml sys`) — which EPs are available, memory constraints, NPU presence +- Recommend the best-fit model with reasoning: why this model, expected latency on their device, accuracy tier +- Output a ready-to-run command so the developer can immediately try it + +**Why this is valuable:** Developers often don't know which model architecture fits their use case, let alone which one performs well on their specific Windows hardware. The catalog has verified, pre-tested models — the agent bridges "I have a problem" to "here is a model you can ship today." + +**Example interaction:** + +```javascript +Developer: I want to classify product images in my e-commerce app. Should be fast on low-end laptops. +Agent: For image classification on low-end hardware without an NPU, I recommend: + + → microsoft/resnet-50 (catalog built-in) + - Task: image-classification + - Latency on CPU: ~18ms p50 (tested on Intel Core i5) + - Size: 98MB FP32, 25MB w8a16 + - Best EP on your machine: CPUExecutionProvider + + Runner-up: google/vit-base-patch16-224 (better accuracy, ~2x slower on CPU) + + To build and benchmark: + winml build -c $(winml config -m microsoft/resnet-50 --device auto) -o resnet_out/ + winml perf -m resnet_out/model.onnx --device auto --iterations 100 +``` + +**What makes this different from a search engine:** The recommendation is hardware-aware — the same question asked on a machine with a Qualcomm NPU would surface a different model (or a different EP for the same model) with different expected numbers. It's not a static lookup, it's a contextual match. + +--- + +## 5. Key Concerns to Track + +| Concern | Mitigation | +| --- | --- | +| Device heterogeneity: config found on Dev's machine may not generalize | Cross-Device Confidence Agent explicitly addresses this; output includes device scope | +| Trust/auditability: developer can't verify agent recommendation | All recommendations include reasoning + confidence + "how I tested this" | +| Olive overlap at implementation layer | winml-cli uses ORT under the hood like Olive; the differentiation is UX + Windows-first + explainability, not reimplementing optimization passes | +| Accuracy validation | `winml eval` is mandatory in every agent loop that modifies the model | +| Agent hallucinating perf numbers | All perf claims require iteration ≥ 1000 and report p50/p90/p99 with std dev | + +--- + +## 6. Open Questions + +1. **Scope**: Should the agent be a CLI mode (`winml agent`) or embedded into existing commands (`winml build --agent`)? +2. **Olive relationship**: Should winml-cli contribute opset search back to Olive, or maintain it independently? Needs alignment with Olive team. +3. **Offline / no-LLM mode**: Should the agent work without LLM (rule-based fallback) for air-gapped CI environments? +4. **Multi-device testing**: Cross-Device Confidence Agent requires access to multiple devices or a device simulation layer — how to implement? From 4a6ef5bdd76d582850308bbf6709a30ca2451bb3 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 15 Jun 2026 10:33:25 +0800 Subject: [PATCH 03/38] research: add WinML CLI Skills Design Doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds research/autoconfig/docs/skills-design.md — full design doc for the winml-cli skills/agent layer, including: - 11 skill designs (use-winml-cli, optimize-for-device, ep-compatibility-check, debug-accuracy-drop, and others) - Competitive analysis (Apple coremltools, ExecuTorch, AI Hub, NVIDIA ModelOpt, OpenVINO, Olive) - Top 5 feature gaps - Validation confidence levels (L1-L5) - Structured output requirements - QNN NPU catalog sweep findings (npu-001/006/007) - FusedConv unfuse feature request Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- research/autoconfig/docs/skills-design.md | 2973 +++++++++++++++++++++ 1 file changed, 2973 insertions(+) create mode 100644 research/autoconfig/docs/skills-design.md diff --git a/research/autoconfig/docs/skills-design.md b/research/autoconfig/docs/skills-design.md new file mode 100644 index 000000000..1ad4b5d6c --- /dev/null +++ b/research/autoconfig/docs/skills-design.md @@ -0,0 +1,2973 @@ +# WinML CLI Skills Design Doc + +## Overview + +This document defines the design for 11 skills to be added to `skills/` in winml-cli. +Skills are split into three audiences: + +- **Consumer skills (7)** — for WinApp developers deploying models +- **Contributor skills (3)** — for engineers extending winml-cli itself +- **Internal research skills (1)** — for winml-cli team to find optimization gaps and backlog items + +Each skill follows the SKILL.md frontmatter convention (`name:`, `description:`) established +by Mobius, NVIDIA Model-Optimizer, and Google LiteRT-CLI as the de facto standard. + +### Consumer skill dependency graph + +``` +ep-compatibility-check ──┐ + ├──► optimize-for-device ──┐ +use-winml-cli ────────────┤ ├──► validate-before-ship + └──► debug-accuracy-drop ───┤ + │ +prepare-for-winapp ────────────────────────────────────┘ + +autoconfig ────────────────────────────────────────────► validate-before-ship + (autoresearch loop: finds optimal config for user-defined EP/accuracy/latency targets) +``` + +### Internal research skill + +``` +optimization-research ──► [GitHub issues / winml backlog] + (deep search: ORT source + Olive + ONNX ecosystem + native stack models + → find better solutions → diagnose winml gaps → produce work items) +``` + +### Contributor skill dependency graph + +``` +adding-model-support ──► contributing-a-skill +adding-ep-support ──► contributing-a-skill +``` + +--- + +## Design principle: Skills as agentic workflows + +### The shift: documentation → automation + +Current state (most skills in the ecosystem): +> Skill tells the user what commands to run → user runs them → user interprets output + +Target state for winml-cli: +> Skill tells the **agent** what commands to run → **agent runs them** → agent interprets output → agent gives a specific answer + +The difference: + +| | Documentation skill | Agentic skill | +|---|---|---| +| Agent sees low cosine | "Run `winml eval --mode compare`" | Runs it, reads cosine=0.87, says "drop at quantize stage, Attention layers" | +| EP compatibility | "Run `winml sys` then `winml analyze`" | Runs both, parses JSON, says "QNN available but LayerNorm is partial" | +| Optimize precision | "Use the decision framework" | Runs fp16/w8a16/w8a8 sweep, builds actual tradeoff table, recommends W8A16 | +| Validate before ship | "Check these 6 gates" | Runs all 6 gates, generates a pass/fail report with actual numbers | + +This is only possible if skills describe a **GATHER → ANALYZE → DECIDE → ACT** workflow, +and winml-cli commands emit **machine-readable structured output** that the agent can parse. + +### Structured output: current state and gaps + +Copilot agents have shell tool access and can run `winml` commands directly. +The key requirement is `--format json` on stdout so the agent can parse results +without screen-scraping Rich/ANSI terminal output. + +| Command | Structured output today | Gap | +|---|---|---| +| `winml inspect` | ✓ `--format json` (stdout) | None | +| `winml sys` | ✓ `--format json` (stdout) | None | +| `winml run` | ✓ `--format json` (stdout) | None | +| `winml analyze` | ⚠ `--output file.json` (file only) | Add `--format json` stdout | +| `winml perf` | ⚠ `--output file.json` (file only) | Add `--format json` stdout | +| `winml eval` | ✗ No structured output | Add `--format json` stdout | + +**Required code changes** (enables agentic skill execution): +1. `winml eval --format json` — outputs `{cosine, sqnr, psnr, task_metric}` to stdout +2. `winml analyze --format json` — outputs `{supported: [...], partial: [...], unsupported: [...]}` to stdout +3. `winml perf --format json` — outputs `{p50_ms, p90_ms, p99_ms, mean_ms}` to stdout + +### The GATHER → ANALYZE → DECIDE → ACT skill structure + +Each skill section should be written with agent execution in mind: + +``` +## GATHER: what to run +Commands the agent runs first (with --format json) to collect facts. + +## ANALYZE: what to look for +How to interpret the JSON output. What values matter. What thresholds to apply. + +## DECIDE: what to recommend +Decision logic. If X → recommend Y. If A and B → recommend C. + +## ACT: what to tell the user +What to surface to the user: specific diagnosis + specific next step. +``` + +In practice this maps onto the existing "Sections" structure — the key is ensuring +each section has **concrete commands to run** and **concrete interpretation rules**, +not just prose description. + +### Example: `debug-accuracy-drop` as an agentic workflow + +``` +User: "My W8A8 model has low accuracy" + +GATHER: + agent runs: winml eval --mode compare -m quantized.onnx --model-id --format json + agent gets: {"cosine_similarity": 0.87, "sqnr_db": 28.3, "stage": "quantize"} + +ANALYZE: + cosine=0.87 < 0.90 threshold → problem is real + sqnr=28.3 < 30 dB → significant degradation + stage=quantize → problem introduced at quantize, not optimize or compile + +DECIDE: + quantize-stage drop on W8A8 → check if Attention layers are the culprit + agent runs: winml analyze -m quantized.onnx --ep qnn --format json + agent gets: {"partial": ["MultiHeadAttention", "LayerNorm"], "unsupported": []} + +ACT: + Agent: "The accuracy drop (cosine=0.87) is at the quantize stage. + MultiHeadAttention is partial on QNN — activations may be falling back to FP32. + Try W8A16 to keep activations at FP16: winml build -c config.json --precision w8a16" +``` + +Without structured output (`--format json`), the agent would have to tell the user to run +each step manually and paste the results back. With structured output, the agent runs the +full diagnostic in one turn. + +--- + +## Validation confidence levels (L1–L5) + +Inspired by Mobius `writing-tests`. Applied in `validate-before-ship` as the Definition-of-Done backbone. +Each level is checked **independently** — a model can pass L3 without passing L2. + +| Level | Name | What it verifies | Key command | +|---|---|---|---| +| **L1** | Loadable | Artifact is valid ONNX, loads without error | `winml inspect -m ` | +| **L2** | Shape correct | Output shape matches expected spec | `winml eval -m --model-id ` (check shape in output) | +| **L3** | Numerical parity | Output matches FP32 baseline (cosine ≥ 0.99 FP16, ≥ 0.95 W8A16, ≥ 0.90 W8A8) | `winml eval --mode compare -m --model-id ` | +| **L4** | Task accuracy | Task metric (Top-1/F1/mAP) within acceptable drop from FP32 reference | `winml eval -m --model-id ` (task metric) | +| **L5** | Production ready | Perf SLA met on target device + cross-EP consistency verified | `winml perf --iterations 100 --monitor` | + +**Quick pass criteria:** + +| Precision | L3 threshold | +|---|---| +| FP16 | cosine_similarity ≥ 0.99 | +| W8A16 | cosine_similarity ≥ 0.95 | +| W8A8 | cosine_similarity ≥ 0.90 (or task-specific) | + +Waivers: any level that cannot be verified must be documented with a reason and tracking issue. +The `validate-before-ship` skill maps each of its 6 gates to an L-level. + +--- + +--- + +## Competitive Analysis + +### Summary + +winml-cli has a solid optimization pipeline (export→quantize→compile→benchmark) but lacks the **debugging/diagnostic loop**, **accuracy recovery tooling**, and **developer observability** that distinguish great toolchains from adequate ones. + +--- + +### Competitor Feature Matrix + +| Feature | Apple | ExecuTorch | AI Hub | NVIDIA | OpenVINO | Optimum | Olive | winml-cli | +|---|---|---|---|---|---|---|---|---| +| Per-layer accuracy debugging | ❌ | ✅ SVG graph | ✅ cloud | ❌ | ❌ | ❌ | ❌ | ❌ | +| Compute unit utilization report | ❌ | ✅ | ✅ | ❌ | Partial | ❌ | ❌ | ❌ | +| Accuracy-Aware PTQ (auto layer rollback) | ❌ | ❌ | ❌ | ❌ | ✅ NNCF | ❌ | ❌ | ❌ | +| Standard NLP benchmark (MMLU/PPL) | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | +| Cross-EP side-by-side compare | ❌ | ❌ | Partial | ❌ | ❌ | ❌ | ❌ | ❌ | +| Zero-deploy validation (model.predict) | ✅ macOS | ✅ | ✅ cloud | ❌ | ✅ | ✅ | ❌ | Partial | +| Pre-quantized model zoo | ❌ | ❌ | ✅ 500+ | ✅ HF org | ✅ | ❌ | ❌ | ❌ | +| One-line optimize command | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | +| Multi-EP artifact packaging | ✅ .mlpackage | ✅ .pte | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| QAT / accuracy recovery fine-tuning | ✅ | ❌ | ✅ AIMET | ✅ | ✅ | ❌ | ❌ | ❌ | +| Advanced quant (AWQ/SmoothQuant) | ❌ | ❌ | ✅ | ✅ | ✅ NNCF | ❌ | ❌ | ❌ | +| Thermal/sustained-load profiling | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | + +--- + +### Competitor Deep Dives + +#### Apple coremltools +**Most relevant**: zero-deploy validation + compute_units API + palettization + +- `model.predict({'input': np_array})` — validates converted model in one Python call without any device deploy. Can force `ComputeUnit.CPU_ONLY` for numerical comparison vs `CPU_AND_NE`. +- `compute_units` is switchable **at prediction time** (not just compile time) — enables A/B testing EP performance without re-converting. +- **Palettization**: LUT-based weight compression at 1–8 bits (k-means clustering, not linear quant). Matches Neural Engine hardware kernels better than INT4 linear quantization for many models. +- Three compression workflows: data-free / calibration-based / fine-tuning-based (QAT). +- `.mlpackage` separates architecture from weights → streaming-friendly, supports on-device compilation after download. + +#### ExecuTorch (Meta) +**Most relevant**: per-layer QNN accuracy debugging (best-in-class of all competitors) + +- `QNNIntermediateDebugger`: dumps intermediate tensor outputs at every QNN op, computes cosine similarity per layer vs CPU reference, generates **color-coded SVG computation graph** (green ≥ 0.9, red < 0.9). +- `get_delegation_info()`: table of ops showing delegated-to-NPU count vs CPU-fallback count per op type. +- `ETDump` + `Inspector` API: per-op timing table with avg (ms), op type, is_delegated. Returns pandas DataFrame. +- QAIRT Visualizer: `pip install qairt-visualizer` — interactive GUI overlaying op trace + QHAS (QNN HTP Analysis Summary) on model graph. +- **Missing**: no cloud device testing, no automated accuracy-latency sweep, build process is complex. + +#### Qualcomm AI Hub +**Most relevant**: cloud profiling with physical hardware, per-step memory breakdown + +- Compile + Profile + Inference on real physical devices (Snapdragon X Elite laptops, Galaxy S24) in the cloud — no local hardware needed. +- Per-step memory profiling: compilation time/memory, first-load time/memory (NE optimization), subsequent-load (cached), inference latency. +- 500+ pre-optimized models in model zoo. +- `--clone j1glw6y8p` — clone any previous job with modified params. +- Cloud AIMET quantization: sophisticated PTQ as a service (`submit_quantize_job()`). + +#### NVIDIA ModelOpt +**Most relevant**: 16 compression techniques + MMLU benchmark scripts + pre-quantized HF checkpoints + +- Compression techniques beyond PTQ: AWQ, SmoothQuant, QAT, pruning (Minitron 33% smaller, 50% faster), distillation, speculative decoding, sparsity, NAS (Puzzletron). +- Windows accuracy benchmark: `mmlu_benchmark.py` (57 subjects, DirectML/ORT/TensorRT-LLM/CPU), perplexity on WikiText-2, KL-divergence metrics. +- Pre-quantized HF checkpoints: `nvidia/DeepSeek-R1-FP4`, `nvidia/Llama-3.3-70B-FP4` etc. — pull validated optimized models without running pipeline. + +#### Intel OpenVINO + NNCF +**Most relevant**: Accuracy-Aware PTQ (auto layer rollback) + +- NNCF `AccuracyAwareQuantization`: automatically identifies sensitivity of each layer to quantization, rolls back sensitive layers to float when accuracy drop exceeds threshold. Fully automated accuracy-performance tradeoff solver. +- `benchmark_app -hint latency` vs `-hint throughput`: auto-configures streams, batch, inference requests for each mode. `-d AUTO`: automatic device selection with fallback. +- 100+ Jupyter notebooks on Binder/Colab — zero setup barrier. +- `OpenVINO GenAI`: high-level `LLMPipeline`, `WhisperPipeline` — deploy-ready LLM inference in 5 lines. + +#### HuggingFace Optimum +**Most relevant**: drop-in Transformers replacement + multi-backend hub + +- Replace `AutoModelForSequenceClassification.from_pretrained()` with `ORTModelForSequenceClassification.from_pretrained()` → ONNX Runtime inference with zero code change. +- 8 hardware backends: ONNX Runtime, OpenVINO, NVIDIA TensorRT-LLM, AMD Ryzen AI, AWS Inferentia, ExecuTorch, Intel Gaudi, FuriosaAI. +- Task-aware export: `--task text-generation` auto-configures dynamic axes and model wrapping. + +#### Microsoft Olive (direct competitor) +**Most relevant**: one-line optimize command + VS Code AI Toolkit + +- `olive optimize --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct --precision int4 --output_path models/qwen` — one command, no per-step config. +- JSON-based pipeline config for full declarative multi-step control. +- VS Code AI Toolkit extension: GUI for model optimization, fine-tuning, and inference testing — no CLI knowledge needed. +- MultiLoRA serving support. + +--- + +### Top 5 High-Impact Gaps for winml-cli + +#### 🔴 Gap 1: Per-Layer Accuracy Debugging + +**Pain**: Accuracy degrades after QNN compilation/quantization, user has no idea which layer caused it. Currently requires QNN SDK expert knowledge. + +**Solution**: `winml debug --model model.onnx --ep qnn --inputs calibration_data/` +1. Runs model on CPU and QNN, captures intermediate tensor outputs at each op +2. Computes cosine similarity per layer +3. Outputs HTML/SVG graph with color-coded accuracy (green/red per layer) + +**Reference**: ExecuTorch `QNNIntermediateDebugger` → `OutputFormat.SVG_GRAPH` + `QcomCosineSimilarityComparator` + +**Impact**: Turns multi-day debugging into a 30-minute diagnosis. Currently no Windows-on-NPU tool does this. + +--- + +#### 🔴 Gap 2: Compute Unit Utilization Report + +**Pain**: `winml perf` shows slower-than-expected latency with no explanation. User doesn't know what % of ops ran on NPU vs fell back to CPU. + +**Solution**: Extend `winml analyze` to output delegation table: +``` +Op Type | NPU Delegated | CPU Fallback | Reason +----------------|---------------|--------------|------------------ +MatMul (INT8) | 47 / 47 | 0 | - +LayerNorm | 0 / 12 | 12 | Unsupported dtype +Softmax (FP32) | 0 / 6 | 6 | Requires INT8 input +``` + +**Reference**: ExecuTorch `get_delegation_info().get_operator_delegation_dataframe()` / AI Hub per-layer compute unit mapping + +**Impact**: Directly actionable — if user sees "60% of ops on CPU due to unsupported dtype," they know to switch to W8A8. + +--- + +#### 🟠 Gap 3: Quantization Sensitivity Analysis + +**Pain**: `winml quantize --algo w8a8` produces a model with unacceptable accuracy. User doesn't know if it's a specific layer, the algorithm, or the calibration data. + +**Solution**: `winml analyze-quant --model model.onnx --calibration data/ --eval-dataset eval/` +1. Run full W8A8 quantization +2. For each block/layer, measure accuracy impact of reverting to FP16 +3. Rank layers by sensitivity +4. Report: "reverting 3 attention layers to FP16 recovers X% accuracy at Y% latency cost" + +**Reference**: Intel NNCF `AccuracyAwareQuantization` (automatic per-layer rollback) + +**Impact**: Replaces multi-day trial-and-error with a 10-minute automated report. + +--- + +#### 🟠 Gap 4: Standard Benchmark Integration (MMLU / Perplexity) + +**Pain**: `winml eval` supports custom scripts but no out-of-box standard benchmarks. Users have no reference point for whether their quantized model's accuracy is "expected." + +**Solution**: `winml eval --model model.onnx --benchmark mmlu --ep qnn` +- Built-in MMLU (57 subjects), WikiText-2 perplexity, KL-divergence scripts +- Reference numbers from FP32 baseline shown alongside quantized result +- `FP16 baseline: 78.2% → W8A8 QNN: 77.9% (−0.3%, expected range: −0.1% to −0.5%)` + +**Reference**: NVIDIA ModelOpt `examples/windows/accuracy_benchmark/mmlu_benchmark.py` supports DirectML/ORT/CPU + +**Impact**: Removes ambiguity and creates trust. Critical for LLM users. + +--- + +#### 🟡 Gap 5: Cross-EP Side-by-Side Comparison + +**Pain**: Choosing between QNN/DirectML/CPU/OpenVINO requires running each EP manually and aggregating results. No tool does this automatically. + +**Solution**: `winml sweep --model model.onnx --precision w8a16,fp16 --ep qnn,dml,cpu` +- Runs build+eval+perf for each (precision × EP) combination +- Outputs a single comparison table: accuracy / latency / op coverage % +- Agent-driven: skill reads JSON output and recommends the optimal combination + +**Reference**: Truly unique — no competitor does this for Windows multi-EP. Closest is AI Hub's multi-device fleet testing (Android only). + +**Impact**: The single most-requested decision for Windows AI developers. Unique to winml-cli. + +--- + +### Patterns in Great Toolchain DX + +**Pattern 1: The "Why" Feedback Loop** +Great toolchains explain *why* results are the way they are. ExecuTorch's delegation table, AI Hub's compute unit mapping, NNCF's layer sensitivity analysis all answer "why?" winml-cli currently stops at "here's the result." + +**Pattern 2: Progressive Disclosure of Complexity** +- Olive: `olive optimize --precision int4` (one line) → full JSON config pipeline +- coremltools: `ct.convert(model)` → MIL IR manipulation +- AI Hub: web dashboard → Python SDK → CLI → AIMET configs + +winml-cli is currently too close to the expert path: each step requires understanding EP-specific options. + +**Pattern 3: Zero-Deploy Validation** +Every strong toolchain lets you test model output before deploying to hardware: coremltools `model.predict()`, ExecuTorch Python pybind, AI Hub `submit_inference_job()`. winml-cli is strong for CPU but lacks the quick "compare CPU vs QNN output" path. + +**Pattern 4: Pre-Validated Model Artifacts** +ModelOpt (HF nvidia/ org), AI Hub (500+ models), NNCF (Model Zoo with accuracy tables) all reduce the cold-start problem. Users don't need the full pipeline for popular models. + +--- + +### Whitespace Opportunities (No Competitor Covers) + +| Opportunity | Why it's winml-cli territory | +|---|---| +| **Cross-EP regression table** (one command, all EPs) | Multi-EP is the unique Windows AI challenge; no Android/iOS tool does this | +| **Quantization config recommender** (`winml recommend --target qnn --constraint latency=20ms`) | Rule-based recommendation from hardware+model arch analysis | +| **EP-aware ONNX graph visualizer** (Netron + green/yellow/red per EP) | Netron exists but has no EP coverage overlay | +| **Thermal/sustained-load profiling** (latency curve over 100 runs, detect throttling) | AI Hub hides variance; no tool surfaces thermal behavior | +| **Windows AI Model Package** (.mlpackage equivalent with multi-EP manifest) | Apple has .mlpackage; Windows has nothing equivalent | + +--- + +## Skill 1: `use-winml-cli` (existing — extend) + +**Status:** Exists at `skills/use-winml-cli/SKILL.md`. Needs two additions: +- Add `winml run` and `winml serve` usage (currently missing) +- Add "first-time onboarding" path for users who don't know where to start + +No structural changes needed; the existing skill is the general entry point. + +--- + +## Skill 2: `optimize-for-device` + +### Frontmatter +```yaml +name: optimize-for-device +description: > + Use this skill when a user wants the best performance for their model on a + specific Windows device, or wants to compare latency/accuracy tradeoffs across + quantization levels (FP16, W8A16, W8A8) and execution providers (QNN NPU, + DirectML GPU, CPU). Covers the precision sweep workflow, hardware-specific + recommendations, and how to read tradeoff results to make a deployment decision. + Use when the user says "make it faster", "which precision should I use", "is NPU + worth it", or asks to compare hardware. +``` + +### When to use +- "I want to run this on NPU, how much faster will it be?" +- "Which quantization should I pick?" +- "Compare QNN vs DirectML vs CPU for my model" +- "Is W8A8 accurate enough for my use case?" + +### Sections + +**1. The decision framework** +Two inputs: latency budget OR accuracy budget. Decision tree: +- Have a latency SLA (e.g. <50ms)? → Find highest accuracy within that budget +- Have an accuracy floor (e.g. <2% drop)? → Find fastest within that floor + +**2. The precision ladder** +Table: FP32 → FP16 → W8A16 → W8A8, with typical speedup and accuracy-drop ranges +per model family (Encoder/BERT-like, Vision/ConvNet, Transformer/ViT). + +**3. The sweep workflow** +Step-by-step: run `winml build` + `winml eval` + `winml perf` for each precision, +collect into a tradeoff table, apply decision framework. + +Key commands: +```bash +winml config -m --device --precision fp16 -o config_fp16.json +winml build -c config_fp16.json -m -o out_fp16/ +winml eval -m out_fp16/.onnx --model-id +winml perf -m out_fp16/.onnx --device --iterations 50 +# repeat for w8a16, w8a8 +``` + +**4. Hardware-specific guidance table** +| Device | Best EP | Sweet-spot precision | Notes | +|---|---|---|---| +| Snapdragon X Elite NPU | QNN | W8A16 | HTP native for W8A16; W8A8 risky for Attention | +| Intel Core Ultra NPU | OpenVINO | W8A8 | OpenVINO PTQ handles INT8 well | +| AMD Ryzen AI NPU | VitisAI | W8A8 | Phoenix/Hawk Point prefer INT8 | +| Any GPU | DirectML | FP16 | FP16 sufficient; quantization rarely helps on GPU | +| CPU fallback | CPU | W8A8 | Size + latency both benefit | + +**5. Reading the output** +How to interpret `winml eval` cosine_similarity, SQNR, and `winml perf` p50/p90/p99. +What values indicate "acceptable" vs "needs investigation". + +**Cross-references:** +- If accuracy dropped unexpectedly → `debug-accuracy-drop` +- If EP not available → `ep-compatibility-check` +- After choosing a precision → `validate-before-ship` + +--- + +## Skill 3: `debug-accuracy-drop` + +### Frontmatter +```yaml +name: debug-accuracy-drop +description: > + Use this skill when a quantized or optimized model produces worse accuracy than + the FP32 baseline and the cause is unknown. Guides a structured diagnosis: first + isolate which pipeline stage introduced the drop (optimize vs quantize vs compile), + then use winml eval --mode compare to measure output similarity, then use winml + analyze to check for partial/unsupported ops that may cause EP fallback. Covers + calibration dataset issues, precision selection mistakes, and QNN-specific fallback + patterns. Use when the user says "accuracy dropped after quantization", "results + look wrong on NPU", or "cosine similarity is low". +``` + +### When to use +- "My model gives wrong results after quantization" +- "W8A8 accuracy is too low, how do I find out why" +- "Results differ between NPU and CPU" +- cosine_similarity < 0.95 from `winml eval --mode compare` + +### Sections + +**1. Isolation strategy: binary search on the pipeline** +Diagnose by bisecting the pipeline stages: +``` +FP32 baseline + → after optimize? winml eval --mode compare (fp32 vs optimized) + → after quantize? winml eval --mode compare (fp32 vs quantized) + → after compile? winml eval --mode compare (fp32 vs compiled) +``` +First stage where cosine drops → that's where the problem is. + +Key commands: +```bash +# Export FP32 baseline +winml export -m -o baseline/model.onnx + +# Compare optimized vs baseline +winml eval --mode compare -m optimized/model.onnx --model-id + +# Compare quantized vs baseline +winml eval --mode compare -m quantized/model.onnx --model-id + +# Compare EP-compiled vs baseline (run on target EP) +winml eval --mode compare -m compiled/model.onnx --model-id --ep qnn +``` + +**2. Interpreting similarity metrics** +Table of thresholds: +| Metric | Healthy | Investigate | Problem | +|---|---|---|---| +| cosine_similarity | > 0.99 | 0.95–0.99 | < 0.95 | +| SQNR (dB) | > 40 | 30–40 | < 30 | +| max_abs_diff | model-dependent | — | unbounded | + +**3. Root cause patterns** + +| Symptom | Likely cause | Fix | +|---|---|---| +| Drop appears at quantize stage | Calibration dataset not representative | Use task-relevant calibration data via `--calibration-dataset` | +| Drop appears at quantize stage for Attention layers | W8A8 quantizing activations in attention | Switch to W8A16 (keeps activations at FP16) | +| Drop appears at compile stage on QNN | Op pattern unsupported → CPU fallback | Run `winml analyze` to find partial ops | +| Inconsistent results across runs | Non-deterministic EP dispatch | Add `--iterations 20` to average out | +| Drop only in certain inputs | Input shape sensitivity | Test with calibration data matching real distribution | + +**4. Checking for op fallback with `winml analyze`** +When compile-stage drop is suspected: +```bash +winml analyze -m quantized/model.onnx --ep qnn +``` +Look for `partial` and `unsupported` ops — these fall back to CPU, introducing +numerical differences vs native NPU execution. Partial ops are the most common +source of unexpected accuracy variance on QNN. + +**5. Precision escalation path** +If W8A8 is the problem and the model is accuracy-sensitive: +W8A8 → W8A16 → FP16 → FP32 +Stop at the first precision that meets accuracy requirements. + +**Cross-references:** +- To compare precision options systematically → `optimize-for-device` +- If op is listed as unsupported → `ep-compatibility-check` + +--- + +## Skill 4: `prepare-for-winapp` + +### Frontmatter +```yaml +name: prepare-for-winapp +description: > + Use this skill when a WinApp developer needs to take winml-cli build artifacts + and integrate them into a Windows application. Covers how to organize multi-EP + artifacts (QNN/NPU, DirectML/GPU, CPU fallback), the recommended directory + layout and manifest structure for runtime EP selection, how to load models + using the Windows ML WinRT API or ONNX Runtime C++ API, and runtime EP + detection and fallback patterns. Use when the user asks "how do I use this + in my app", "how do I package the model", or "what file do I load at runtime". +``` + +### When to use +- "I built the model, how do I ship it in my app?" +- "How do I load different models for different hardware?" +- "What happens when the user doesn't have an NPU?" +- "How do I package QNN + DML + CPU variants together?" + +### Sections + +**1. The multi-EP artifact problem** +Explain why `winml compile` produces EP-locked files (not portable), +so a WinApp needs a strategy to select the right file per device. + +**2. Recommended artifact layout** +``` +my_model/ + manifest.json ← EP → file mapping + version + model_qnn.onnx ← QNN NPU (compiled, Snapdragon X) + model_openvino.onnx ← OpenVINO NPU/GPU (Intel Core Ultra) + model_vitisai.onnx ← VitisAI NPU (AMD Ryzen AI) + model_dml.onnx ← DirectML GPU (any GPU, non-NPU machines) + model_cpu.onnx ← CPU fallback (universal) +``` + +**3. manifest.json schema** +```json +{ + "model_id": "facebook/convnext-tiny-224", + "task": "image-classification", + "version": "1.0.0", + "variants": [ + { "ep": "qnn", "device": "npu", "file": "model_qnn.onnx", "precision": "w8a16" }, + { "ep": "openvino", "device": "npu", "file": "model_openvino.onnx", "precision": "w8a8" }, + { "ep": "vitisai", "device": "npu", "file": "model_vitisai.onnx", "precision": "w8a8" }, + { "ep": "dml", "device": "gpu", "file": "model_dml.onnx", "precision": "fp16" }, + { "ep": "cpu", "device": "cpu", "file": "model_cpu.onnx", "precision": "w8a8" } + ], + "selection_order": ["qnn", "openvino", "vitisai", "dml", "cpu"] +} +``` + +**4. Building all variants with winml-cli** +```bash +# Generate configs per EP +winml config -m --device npu --ep qnn -o config_qnn.json +winml config -m --device npu --ep openvino -o config_ov.json +winml config -m --device gpu --ep dml -o config_dml.json +winml config -m --device cpu -o config_cpu.json + +# Build all +winml build -c config_qnn.json -m -o out_qnn/ +winml build -c config_ov.json -m -o out_ov/ +winml build -c config_dml.json -m -o out_dml/ +winml build -c config_cpu.json -m -o out_cpu/ +``` + +**5. Runtime EP selection pattern (C++ / ORT)** +Pseudocode for app-side logic: +- Read manifest.json +- Query available EPs on device (`GetAvailableProviders()` or `winml sys` equivalent) +- Walk `selection_order`, pick first EP available on this device +- Load the corresponding file +- If all fail → CPU is always available + +**6. What NOT to do** +- Don't load a QNN-compiled model with CPU EP → will fail or produce wrong results +- Don't hardcode EP names → check availability at runtime +- Don't ship only the compiled artifact without a CPU fallback + +**Cross-references:** +- To build the artifacts → `use-winml-cli` +- To verify each artifact → `validate-before-ship` + +--- + +## Skill 5: `ep-compatibility-check` + +### Frontmatter +```yaml +name: ep-compatibility-check +description: > + Use this skill to determine whether a specific model will work on specific + Windows hardware before starting a full build. Covers winml inspect for model + support verification, winml sys for EP availability on the current machine, + winml analyze for operator-level EP compatibility, and the EP-to-hardware + mapping for Windows AI PCs. Use when the user asks "will this work on my + device", "is QNN supported here", "what hardware do I need for NPU", or + when they get an unsupported operator error. +``` + +### When to use +- "Will this model work on my Snapdragon X Elite laptop?" +- "I don't know if my machine has a QNN EP" +- "The compile step failed with unsupported op" +- Starting a new project: verify feasibility before investing build time + +### Sections + +**1. Three-layer compatibility check** +Layer 1 — Model support: does winml-cli know this model type? +Layer 2 — EP availability: is the target EP registered on this machine? +Layer 3 — Operator coverage: does the target EP support all ops in this model? + +Each layer has a command; run in order, stop at first failure. + +**2. Layer 1: Model support** +```bash +winml inspect -m +``` +What to look for: `loader`, `exporter`, `winml_inference_class` fields populated. +If inspect fails or shows "unsupported" → model is out of scope for winml-cli. + +**3. Layer 2: EP availability** +```bash +winml sys --list-ep --list-device +``` +EP-to-hardware reference table: +| EP | Hardware requirement | Check for | +|---|---|---| +| QNN | Qualcomm Snapdragon X Elite / X Plus | QNNExecutionProvider in list | +| OpenVINO | Intel Core Ultra (Meteor Lake / Lunar Lake+) | OpenVINOExecutionProvider | +| VitisAI | AMD Ryzen AI (Phoenix / Hawk Point / Strix) | VitisAIExecutionProvider | +| NvTensorRTRTX | NVIDIA discrete GPU (RTX series) | NvTensorRTRTXExecutionProvider | +| DML | Any DirectX 12 GPU | DmlExecutionProvider | +| CPU | Any | Always available | + +If the desired EP is not listed → recommend next best EP from fallback chain. + +**4. Layer 3: Operator coverage** +```bash +winml analyze -m .onnx --ep +# or for all EPs at once: +winml analyze -m .onnx --device all +``` +Output interpretation: +- `supported` (green): op runs natively on EP +- `partial` (yellow): op may fall back to CPU for some configurations +- `unsupported` (red): op cannot run on this EP + +Decision rule: any `unsupported` → either change EP or accept CPU fallback +for those ops (which may impact accuracy and latency). + +**5. Fallback chain recommendation** +If target EP not available or has unsupported ops: +``` +QNN not available → OpenVINO (if Intel) or VitisAI (if AMD) → DML → CPU +``` + +**6. Fast-fail before compile** +`winml compile` is expensive (minutes). Always run analyze first. +If analyze shows >20% unsupported ops → likely not worth compiling for that EP. + +**Cross-references:** +- After confirming compatibility → `use-winml-cli` (build) +- If all EPs show unsupported ops → model may be out of scope for winml-cli + +--- + +## Skill 6: `validate-before-ship` + +### Frontmatter +```yaml +name: validate-before-ship +description: > + Use this skill when preparing to release a Windows application with an + on-device AI model. Provides a Definition-of-Done checklist covering artifact + completeness, accuracy validation against FP32 baseline, performance SLA + verification, output correctness on real inputs, cross-EP consistency, and + fallback chain verification. Every item must be checked or explicitly waived + before shipping. Use when the user says "I'm ready to ship", "what should I + test before release", or "how do I know the model is good enough". +``` + +### When to use +- About to ship a WinApp with on-device inference +- Final QA gate before a model artifact goes to production +- After any build config change (new quantization, new EP, new model version) + +### Sections + +**1. The checklist** + +**Gate 1 — Artifact completeness** +- [ ] All target EP artifacts exist and are loadable +- [ ] CPU fallback artifact exists +- [ ] manifest.json (if using multi-EP layout) is valid and references existing files +- [ ] Artifact was built with `winml build` (not opaque cache artifact) + +Command: +```bash +winml inspect -m .onnx # verify each artifact loads +``` + +**Gate 2 — Accuracy vs FP32 baseline** +- [ ] cosine_similarity ≥ 0.99 for FP16 artifacts +- [ ] cosine_similarity ≥ 0.95 for W8A16 artifacts +- [ ] cosine_similarity ≥ 0.90 for W8A8 artifacts (or task-specific threshold) +- [ ] Task accuracy metric (Top-1, F1, mAP) within acceptable drop from FP32 + +Commands: +```bash +winml eval --mode compare -m .onnx --model-id +winml eval -m .onnx --model-id # task accuracy +``` + +**Gate 3 — Performance SLA** +- [ ] p50 latency meets application target on target device +- [ ] p99 latency within 2x p50 (no outlier spikes) +- [ ] Benchmark run on actual target hardware (not developer machine) + +Command: +```bash +winml perf -m .onnx --device --iterations 100 --monitor +``` + +**Gate 4 — Output correctness on real inputs** +- [ ] Model produces correct output on ≥3 representative real-world inputs +- [ ] No NaN or Inf in outputs +- [ ] Output shape matches expected shape + +Command: +```bash +winml run -m .onnx --file # visual/manual check +``` + +**Gate 5 — Cross-EP consistency (if shipping multiple EP variants)** +- [ ] QNN and DML outputs agree within tolerance on same input +- [ ] CPU fallback output agrees with primary EP within tolerance + +Command (manual comparison across runs): +```bash +winml run -m model_qnn.onnx --file sample.jpg --format json -o qnn_out.json +winml run -m model_dml.onnx --file sample.jpg --format json -o dml_out.json +winml run -m model_cpu.onnx --file sample.jpg --format json -o cpu_out.json +# compare qnn_out.json vs dml_out.json vs cpu_out.json manually +``` + +**Gate 6 — Fallback chain** +- [ ] CPU fallback artifact verified independently (not just assumed to work) +- [ ] App runtime selects correct artifact when target EP is absent (simulate by removing EP) + +**2. Waiver policy** +Any item that cannot be completed must be waived explicitly: +``` +Waivers: +- Cross-EP consistency: VitisAI not available on developer machine. + Verified on target hardware by QA team. Issue #NNN. +- Performance SLA: Target hardware (Snapdragon X Elite) in procurement. + Benchmark deferred to post-merge, tracked in issue #NNN. +``` +Unchecked items without waiver → do not ship. + +**3. L-level mapping** + +The 6 gates map directly to the L1–L5 confidence system (see Overview): + +| Gate | L-level | +|---|---| +| Gate 1 — Artifact completeness | L1 | +| Gate 2 — Accuracy vs FP32 baseline | L3 + L4 | +| Gate 3 — Performance SLA | L5 | +| Gate 4 — Output correctness on real inputs | L4 | +| Gate 5 — Cross-EP consistency | L5 | +| Gate 6 — Fallback chain | L1 (CPU artifact) | + +Minimum to ship: L1 + L3 all passing. L4 + L5 required for production release. + +**3. Quick command reference** +```bash +# Gate 1: inspect all artifacts +for f in model_qnn.onnx model_dml.onnx model_cpu.onnx; do winml inspect -m $f; done + +# Gate 2: accuracy +winml eval --mode compare -m .onnx --model-id +winml eval -m .onnx --model-id + +# Gate 3: perf +winml perf -m .onnx --device auto --iterations 100 --monitor + +# Gate 4: real input +winml run -m .onnx --file + +# Gate 5: cross-EP (run individually, compare outputs) +winml run -m model_qnn.onnx --file --format json +winml run -m model_dml.onnx --file --format json +``` + +**Cross-references:** +- If accuracy gate fails → `debug-accuracy-drop` +- If performance gate fails → `optimize-for-device` +- If EP not available for testing → `ep-compatibility-check` +- For multi-EP artifact packaging → `prepare-for-winapp` + +--- + +## Skill 7: `adding-model-support` (contributor) + +### Frontmatter +```yaml +name: adding-model-support +description: > + Use this skill when contributing support for a new Hugging Face model to + winml-cli. Covers finding the correct exporter, writing a recipe config, + verifying at each pipeline stage (export → optimize → quantize → compile), + and passing the L1–L5 validation gates before submitting a PR. Use when + a contributor says "I want to add support for model X", "this model type + is not supported", or "how do I write a recipe for a new architecture". +``` + +### When to use +- "I want to add support for Qwen3 / Phi-4 / [new model]" +- "winml-cli says this model is unsupported" +- "How do I write a recipe config for a new model family?" + +### Sections + +**1. Find the right exporter** +```bash +winml inspect -m # check if auto-detected +``` +If inspect fails → the model needs a new exporter or recipe. +Look in `src/winml/modelkit/export/` for existing exporters as reference. + +**2. Find a reference model of the same family** +- Same architecture class (e.g., LlamaForCausalLM, BertModel)? +- Check `recipes/` for an existing `.json` config for that class +- Prefer copying the closest recipe and adjusting rather than writing from scratch + +**3. Write the recipe config** +Minimal recipe template: +```json +{ + "model_id": "org/model-name", + "task": "text-generation", + "export": { "opset": 17 }, + "optimize": { "passes": ["MatMulAddFusion", "LayerNormFusion"] }, + "quantize": { "mode": "w8a16", "calibration_dataset": "wikitext2" } +} +``` + +**4. Validate at each stage (L1 → L5)** + +| Stage | Command | Pass criterion | +|---|---|---| +| L1: Export loads | `winml inspect -m .onnx` | No error | +| L2: Shape correct | `winml eval -m .onnx --model-id ` | Output shape matches | +| L3: Numerical parity | `winml eval --mode compare -m .onnx --model-id ` | cosine ≥ threshold | +| L4: Task accuracy | `winml eval -m .onnx --model-id ` | Task metric in spec | +| L5: Perf on target EP | `winml perf -m .onnx --device ` | Meets latency target | + +**5. Common pitfalls for new models** +- New op types not in operator coverage → run `winml analyze` early +- Attention variant (GQA, MQA, MLA) → check quantization mode compatibility +- Dynamic shapes → add explicit shape hints in export config +- Non-standard tokenizer → verify `winml run` input preprocessing + +**Cross-references:** +- If EP shows unsupported ops → `ep-compatibility-check` +- After L1–L5 all pass → `validate-before-ship` for PR gate + +--- + +## Skill 8: `adding-ep-support` (contributor) + +### Frontmatter +```yaml +name: adding-ep-support +description: > + Use this skill when adding a new execution provider (EP) backend to + winml-cli. Covers implementing the compile backend interface, adding + EP-specific optimize passes, wiring the new EP into winml sys and + winml analyze, and verifying coverage with the L1–L5 test gates. + Use when a contributor says "I want to add support for a new EP", + "how does the QNN compile backend work", or "can we support EP X". +``` + +### When to use +- Adding a new EP compile backend (e.g., a new NPU vendor) +- Extending an existing EP with new optimization passes +- Understanding how the existing QNN / OpenVINO / VitisAI backends are structured + +### Sections + +**1. EP backend interface** +Reference implementation: `src/winml/modelkit/compile/qnn_backend.py` +Three methods to implement: +```python +class MyEPBackend(CompileBackend): + def is_available(self) -> bool: ... # detect EP on current machine + def optimize(self, model, config): ... # EP-specific graph transforms + def compile(self, model, config): ... # produce EP-locked artifact +``` + +**2. Wire into EP registry** +Register in `src/winml/modelkit/ep_registry.py`: +```python +EP_REGISTRY["myep"] = MyEPBackend +``` +This makes `--ep myep` work in `winml config`, `winml compile`, `winml analyze`. + +**3. Add operator coverage data** +Add a coverage JSON to `src/winml/modelkit/analyze/coverage/myep_ops.json`: +```json +{ "Add": "supported", "LayerNorm": "partial", "CustomOp": "unsupported" } +``` +This is what `winml analyze --ep myep` reads. + +**4. Add to `winml sys` output** +Add EP availability check to `src/winml/commands/sys.py` so it appears +in `winml sys --list-ep`. + +**5. L1–L5 validation for the new EP** +Minimum before merging: +- L1: A known-good model compiles without crash +- L3: Compiled artifact passes `winml eval --mode compare` (cosine threshold) +- L5: `winml perf` produces valid latency output on target hardware + +**Cross-references:** +- Operator coverage analysis → `ep-compatibility-check` +- After adding: document the EP in `ep-compatibility-check` hardware table + +--- + +## Skill 9: `contributing-a-skill` (contributor) + +### Frontmatter +```yaml +name: contributing-a-skill +description: > + Use this skill when writing a new SKILL.md for winml-cli or improving + an existing one. Covers frontmatter requirements, description writing + (the description is the agent trigger, not a human summary), section + structure conventions, cross-reference format, command accuracy + requirements, and the review checklist before submitting. Use when a + contributor says "I want to add a new skill", "how should I write + SKILL.md", or "what are the skill authoring rules". +``` + +### When to use +- Writing a new skill for a gap not covered by existing skills +- Improving an existing skill with new commands or sections +- Reviewing a skill PR + +### Sections + +**1. Frontmatter rules** +```yaml +name: kebab-case-skill-name # matches directory name under skills/ +description: > + Use this skill when . + Covers . + Use when the user says "", "", or . +``` + +**Critical:** The `description` field is what the Copilot agent reads to decide +whether to activate this skill. Write it as a trigger specification, not a +documentation summary. Include representative user phrases in quotes. + +**2. Required sections (in order)** +1. `## When to use` — 3–5 bullet points with user-facing symptoms/questions +2. Diagnostic or decision section — symptom → cause → fix structure +3. Command examples — runnable `winml` commands with real flags +4. Reference tables — hardware, thresholds, EP names as concrete data +5. `## Cross-references` — links to related skills using relative paths + +**3. Cross-reference format** +```markdown +- If accuracy dropped → see `.agents/skills/debug-accuracy-drop/SKILL.md` +- After validating → see `.agents/skills/validate-before-ship/SKILL.md` +``` + +**4. Content rules** +- All commands must be runnable exactly as written (no pseudocode flags) +- Include concrete numbers: thresholds (cosine ≥ 0.99), speedup (3–5×), latency (<50ms) +- Target ~200 lines prose + tables; move deep content to `references/` subdirectory +- Do not duplicate content from another skill — cross-reference instead + +**5. Review checklist before PR** +- [ ] `description` contains ≥3 quoted user trigger phrases +- [ ] All commands are tested and produce the described output +- [ ] Cross-references use relative paths and the linked skill exists +- [ ] No commands reference flags that don't exist in current `winml --help` +- [ ] Hardware names and EP names match the canonical list in `ep-compatibility-check` +- [ ] `evals/eval.yaml` exists with ≥2 test cases (including at least one negative assertion) + +--- + +## Skill 10: `autoconfig` (consumer — autoresearch loop) + +### Frontmatter +```yaml +name: autoconfig +description: > + Use this skill when a **WinApp developer** wants to automatically find the best + winml-cli configuration for their model on one or more target EP/device combinations. + The agent runs an autonomous experiment loop: it proposes config.json hypotheses, + runs winml build + eval + perf, evaluates against user-defined objectives + (accuracy floor, latency budget, or Pareto frontier), and iterates — keeping + improvements, discarding regressions. Covers single-EP optimization, multi-EP + parallel search, mixed-precision (nodes_to_exclude) exploration, calibration + parameter tuning, and manifest.json output for multi-EP deployment. + Use when the user says "find the best config for my model on QNN", + "automate the config search", "generate configs for all EPs", + or "I want to leave this running overnight". + +audience: external (WinApp developers) +``` + +### When to use +- "Find the best W8A8 config that keeps accuracy > 0.95 on QNN" +- "Generate optimized configs for QNN + DirectML + CPU and build a manifest" +- "I don't know which quantization settings to use, figure it out for me" +- "Run overnight and give me the best accuracy-latency tradeoff you can find" +- User has a latency SLA or accuracy floor but doesn't know how to achieve it + +### What this skill does NOT do +- It only searches within what `winml build` currently supports (existing capabilities) +- It does not look for optimization techniques outside winml's current feature set +- It does not suggest that winml needs new features or file bugs +- For finding what winml is *missing*, use `optimization-research` instead + +--- + +### Epistemic standard for autoconfig findings + +**Any conclusion this skill writes into a report or recommends to a user must meet this bar:** + +| Requirement | What it means | +|---|---| +| **Observation vs explanation** | State what was measured separately from why it happened. "latency increased 270ms" is fact. "because NHWC causes cache thrashing" is a hypothesis — label it as such unless confirmed by profiling. | +| **Statistical validity** | A latency claim requires ≥ 3 independent runs with warmup. A single `winml eval` run (no warmup, includes preprocessing) is insufficient to quote as a latency number. It can guide search decisions but not final reports. | +| **Mechanism confirmation** | Do not explain a regression unless the mechanism is confirmed (e.g., by profiler, by op-level timing, or by **source code inspection of ORT/QNN SDK**). If unknown, write "cause unconfirmed; further profiling needed." | +| **Scope boundary** | Results measured on one model/EP are never generalized to other models/EPs without explicit qualification. "On ConvNext-tiny CPU" is allowed. "CPU dislikes fusion" is not — it's an overgeneralization. | +| **Unresolved uncertainty** | If an observation contradicts the expected behavior (e.g., a "disabled" fusion still appears in the output), the report must flag this as an open question, not silently adopt an explanation. | +| **EP isolation** | A finding on one EP (positive or negative) MUST NOT be applied to prune the search space of a different EP without independent validation. CPU opset regression ≠ QNN NPU opset regression. Always validate per EP independently. | + +The skill MUST NOT write confident root-cause explanations in the HTML report or chat summary for regressions where only the measurement is available. Use hedged language: "this likely relates to…", "one hypothesis is…", or simply omit the explanation and recommend profiling. + +#### Perf gain validation protocol + +Before **any** perf gain is written into a report, config recommendation, or knowledge base as a confirmed finding, it must pass ALL three gates: + +**Gate 1 — Statistical: two-phase bench protocol (from GPU Optimizer V2)** + +``` +Phase A — Quick screen (fast, ~2 min): + winml perf -m --ep --device --warmup 20 --iterations 200 -o screen.json + CV = screen.json.std / screen.json.p50 + IF CV > 0.10 (10%): REJECT — high DVFS variance, measurement unreliable + → cool down 120s, retry once + → if still CV > 0.10: flag as [UNSTABLE], skip candidate + +Phase B — Full bench (only if Phase A passes, ~15 min): + # 3 independent sessions with 60s cool-down between each + winml perf ... --warmup 50 --iterations 1000 -o run1.json + sleep 60 + winml perf ... --warmup 50 --iterations 1000 -o run2.json + sleep 60 + winml perf ... --warmup 50 --iterations 1000 -o run3.json + + # KEEP if ALL of: + # 1. p50(run1,2,3) are all faster than baseline p50 × (1 - min_improvement) + # 2. CV of each run < 0.10 + # 3. cosine_similarity ≥ accuracy_floor + KEEP_threshold = baseline_p50 × 0.99 # ≥1% improvement required +``` +Rationale: DVFS on mobile NPUs causes 2-10x run-to-run variance. CV check catches this before wasting 15 min on full bench. + +**Gate 2 — Mechanism: read ORT/QNN source code before explaining why** + +**Gate 2 — Mechanism: read ORT/QNN source code before explaining why** +- For QNN EP gains: check `onnxruntime/core/providers/qnn/builder/` for opset-conditional dispatch +- For CPU EP gains: check `onnxruntime/core/optimizer/` for pass applicability conditions +- For DML EP gains: check DML operator mapping tables +- **Do not publish "opset 21 = 2.3x faster on QNN NPU" without confirming the mechanism in source code.** It may be DVFS bias, not a real architectural difference. + +**Gate 3 — Reproducibility: baseline and candidate measured in same thermal state** +- Run baseline and candidate back-to-back in the same session OR +- Use a device-level tool to lock NPU clock frequency +- If you cannot control thermal state, report min_ms (peak-performance ceiling) alongside p50 (typical performance), and flag the variance explicitly. + +**Lesson from ConvNext opset sweep (2026-06-10):** +Initial opset 21 measurement (8.45ms, 50 iters) vs opset 17 (19.4ms) appeared to show 2.3x gain. Full 17-22 sweep with 50 iters each showed: +- All opsets min ~9-10ms (same peak capability) +- opset 17 p50=54ms, opset 19-22 p50=12ms — but opset 18 p50=43ms (bimodal) +- opset 21 std varied from 10ms (cool device) to 37ms (warm device) +**Conclusion: data is inconclusive. Gain may be real OR may be thermal artifact. Gates 1+2 not yet passed.** + +--- + +### Design Comparison: GPU Optimizer V2 vs WinML Autoconfig + +**Reference**: "Agentic GPU Model Optimization" doc (cheye@, 2026-03-20). GPU Optimizer V2 is a 6-role multi-agent system for cloud GPU inference optimization (ONER-1B KNN service, H100). Autoconfig is a local edge inference optimizer (winml-cli, Snapdragon X). Most of their infrastructure (machine pool, SSH fleet, Triton serving, custom CUDA kernels, SM occupancy tuning) does not apply here. But the agent loop design has several directly adoptable ideas. + +#### Adoptable insights from GPU Optimizer V2 + +| V2 design decision | V2 rationale | Adopt into autoconfig? | Notes | +|---|---|---|---| +| **Two-phase bench: 200-iter quick screen → 3×1000-iter full bench** | "CV<2% gates full bench — avoid wasting time on high-variance results" | ✅ **YES — highest priority gap** | We've been doing single 50-iter runs and calling them facts. CV check would have caught the DVFS noise immediately. | +| **Verdict policy names (ThroughputOnly, ThroughputOrLatency…)** | "Named policies prevent Reviewer from ad-hoc criteria drift" | ✅ YES (simplified) | Autoconfig should have explicit KEEP criteria: `p50_ms < baseline × (1 - threshold)` AND `cosine ≥ floor` | +| **Append-only experiment_log.md + results.tsv written only by Reviewer** | "Single writer = no drift, full audit trail" | ✅ YES | Our results.tsv exists but no "single writer" discipline | +| **Explorer mandatory external-research triggers** | "After 15 consecutive DISCARDs → external research sweep" | ✅ YES — this is the exact gap that caused the opset 21 miss | If we had this rule, we would have searched ORT source after N DISCARDs and found kMaxSupportedOpset earlier | +| **Knowledge agent with review gate before KB save** | "Learnings reviewed before they prune future search" | ✅ YES | ep_knowledge/*.json entries should be marked draft until Gate 2 (mechanism) is confirmed | +| **Correctness contract locked after Phase 0, never modified** | "Prevents accuracy goal-post moving" | ✅ YES | We have accuracy gate but no locked contract file | +| **30-consecutive-DISCARD stop condition** | "Prevents endless search in exhausted space" | ✅ YES | autoconfig has no stop condition today | +| **Per-experiment structured output: Hypothesis → Implementation → Parity → Perf → Analysis → Decision** | "Enables post-analysis and knowledge extraction" | ✅ YES | autoconfig report is currently holistic, not per-experiment | +| **Role separation: Profiler / Explorer / Optimizer / Reviewer are separate agents** | "Prevents context drift; each agent stays focused" | ⚠️ Partial | Full 6-agent split is overkill for CLI tool; but Explorer / Reviewer distinction is valuable | +| **Resource lock: only one GPU job at a time** | "Prevents benchmark interference" | ✅ YES (trivially) | Already serial; but should be explicitly enforced if autoconfig ever parallelizes | +| **Machine pool + SSH fleet + Model Registry** | Cloud GPU fleet management | ❌ N/A | Local device only | +| **Custom CUDA kernel writing** | "Extreme asymmetry benefits from custom kernels" | ❌ N/A | CLI-only constraint; no kernel modification | +| **SM occupancy / GEMM tile count tuning** | "H100 has 132 SMs; 48 output tiles = 36% occupancy" | ❌ N/A | Edge NPU/GPU, not H100 multi-SM | +| **FlashAttention / fused QKV** | "Eliminate HBM traffic for attention score matrix" | ❌ N/A | Model is already trained; deployment-time optimization only | + +#### Key gaps in current autoconfig design (from V2 comparison) + +**Gap 1 (critical): No two-phase bench protocol** +Current design runs `--iterations 50` and accepts the result. V2 runs: +1. Quick screen: 200 iters, check CV < 2% (Coefficient of Variation = std/mean) +2. Only if CV < 2%: full bench 3×1000 iters with 60s cool-down between sessions +3. KEEP only if Δp50 > threshold AND CV(candidate) < 2% + +This directly matches the "iter ≥ 1000" rule we just added. Formalize it as two phases. + +**Gap 2 (critical): No mandatory external-research trigger in Explorer** +V2 Explorer triggers external research (web search, papers, source code) after: +- 15 consecutive DISCARDs +- Every KEEP that changes model/precision +- Before declaring backlog_empty + +We discovered kMaxSupportedOpset only by accident (downloading QNN Hub models). A mandatory "read ORT source after 5 DISCARDs in opset dimension" rule would have found it in Phase 2. + +**Gap 3 (important): ep_knowledge/*.json has no draft/confirmed state** +V2 Knowledge agent requires review gate before KB entries are used to prune search space. Our ep_knowledge findings should have: +- `status: "draft"` — observed, mechanism unconfirmed (Gate 2 not passed) +- `status: "confirmed"` — mechanism confirmed via source code (Gate 2 passed) +- `status: "deprecated"` — finding invalidated by new experiment or ORT version change +Only `"confirmed"` entries should prune search space. `"draft"` entries inform hypothesis priority but don't prune. + +**Gap 4 (nice-to-have): No per-experiment structured artifact** +V2 produces per-experiment: Hypothesis / Implementation / Parity / Perf / Analysis / Decision +autoconfig produces: one aggregate report.html. Should produce both. + +### Design: The Autoresearch Loop + +Inspired by [karpathy/autoresearch](https://github.com/karpathy/autoresearch): +agent modifies a config file, runs a fixed-cost experiment, checks if the objective improved, keeps or discards, and repeats autonomously until manually stopped or convergence criteria met. + +``` +OBJECTIVE (user-defined, one of): + A. Accuracy-primary: maximize cosine_similarity subject to p50_ms ≤ + B. Latency-primary: minimize p50_ms subject to cosine ≥ + C. Pareto search: find the full accuracy-latency frontier + +SEARCH SPACE — config.json has three sections the agent can modify: + + [export] + opset_version : int — 17, 18, 19, 20 (higher = newer ops, EP may not support) + do_constant_folding : bool — may affect graph structure visible to EP + dynamic_axes : dict — static vs dynamic shapes (QNN prefers static batch=1) + + [optimize] — full capability list (from winml optimize --list-capabilities) + + GraphPipe (run via ORT SessionOptions): + GELU: + gelu-fusion : bool — fuse tanh-GELU subgraph → Gelu op + fast-gelu-fusion : bool — fuse fast-GELU (tanh-approx) → FastGelu + bias-gelu-fusion : bool — fuse Bias+GELU (requires gelu-fusion) + quick-gelu-fusion : bool — fuse x*sigmoid(1.702x) → FastGelu + gelu-approximation : bool — convert exact Gelu → FastGelu (requires gelu-fusion) + Activation: + bias-softmax-fusion : bool — fuse Bias+Softmax + bias-dropout-fusion : bool — fuse Bias+Dropout + Convolution: + conv-add-fusion : bool — fuse Conv+Add (bias) + conv-bn-fusion : bool — fuse Conv+BatchNorm into weights + conv-mul-fusion : bool — fuse Conv+Multiply + conv-activation-fusion : bool — fuse Conv+activation (ReLU, Sigmoid, etc.) + Elimination: + slice-elimination : bool — remove redundant Slice ops + expand-elimination : bool — remove no-op Expand + unsqueeze-elimination : bool — fold Unsqueeze into initializers + GEMM: + gemm-activation-fusion : bool — fuse GEMM+activation + gemm-sum-fusion : bool — fuse GEMM+Sum + gemm-transpose-fusion : bool — fuse GEMM+Transpose + Graph: + concat-slice-elimination : bool — remove Concat+Slice that restore originals + double-qdq-pairs-remover : bool — remove consecutive QDQ pairs + constant-folding : bool — pre-compute constant exprs (default=True; disable to reduce size) + LayerNorm: + layer-norm-fusion : bool — fuse ReduceMean→Sub→Pow→Sqrt→Div→Mul→Add + skip-layer-norm-fusion : bool — fuse Add(residual)+LayerNorm → SkipLayerNorm (requires layer-norm-fusion) + simplified-layer-norm-fusion : bool — fuse simplified LayerNorm (no mean-centering) + Layout: + transpose-optimizer : bool — eliminate redundant transpose chains + nhwc-transformer : bool — NCHW→NHWC (GPU memory layout) + nchwc-transformer : bool — NCHW→NCHWc (CPU SIMD layout) + conv-add-activation-fusion : bool — fuse Conv+Add+Activation → FusedConv + MatMul: + matmul-add-fusion : bool — fuse MatMul+Add → single kernel + matmul-activation-fusion : bool — fuse MatMul+activation (DML-only, requires matmul-transpose-fusion) + matmul-transpose-fusion : bool — fuse MatMul+Transpose → FusedMatMul + matmul-scale-fusion : bool — fuse MatMul+Scale + matmul-bn-fusion : bool — fuse MatMul+BatchNorm + dynamic-quantize-matmul-fusion : bool — dynamic quant for MatMul + Misc: + gather-slice-to-split-fusion : bool — fuse Gather+Slice → Split + gather-to-slice-fusion : bool — convert Gather to Slice (contiguous idx) + pad-fusion : bool — fuse Pad with Conv/Pool + not-where-fusion : bool — fuse Not+Where + + FusionPipe (ORT transformer fusions, via FusionOptions): + attention-fusion : bool — fuse MHA pattern → Attention/MultiHeadAttention + layer-norm-fusion : bool — (FusionPipe variant, same flag) + skip-layer-norm-fusion : bool — (FusionPipe variant) + simplified-layer-norm-fusion : bool — (FusionPipe variant) + embed-layer-norm-fusion : bool — fuse Embedding+Position+LayerNorm (requires layer-norm-fusion) + bias-skip-layer-norm-fusion : bool — fuse Bias+SkipLayerNorm (requires skip-layer-norm-fusion) + fuse-rmsnorm : bool — fuse RMSNorm → LpNormalization(p=2) [custom, QNN-compatible] + packed-qkv-fusion : bool — (SD only) + packed-kv-fusion : bool — (SD only) + skip-group-norm-fusion : bool — (SD only) + bias-add-fusion : bool — fuse BiasAdd + qordered-matmul : bool — (SD only) + + SurgeryPipe (pre-EP graph fixes): + clamp-constant-values : bool — clamp -inf/+inf constants → [-1e3, 1e3] (prevents QNN quant issues) + remove-isnan-in-attention-mask: bool — remove Softmax→IsNaN→Where guards (use after clamp) + + RewritePipe (pattern-based subgraph rewriting): + --enable-{source-slug}-{target-slug} (run winml optimize --list-rewrites for full list) + Examples: --enable-gelu-singlegelu, --enable-matmuladdpattern-reshapegemmreshapepattern + + [quant] + precision : fp16 | w8a16 | w8a8 + calibration_method : minmax | entropy | percentile + samples : 64 | 128 | 256 | 512 + per_channel : bool + symmetric : bool + op_types_to_quantize : list[str] — restrict which op types get quantized + nodes_to_exclude : list[str] — exclude specific named nodes + +FIXED: winml build + winml eval + winml perf (the experiment harness) +METRIC: cosine_similarity (from winml eval --format json) + p50_ms (from winml perf --format json) +RECORD: results.tsv +``` + +--- + +### Profiler-Enhanced Agent Architecture (redesigned) + +**Insight from GPU Optimizer v2 analysis and ConvNext POC:** +Running the profiler *before* the search loop would have shown Gemm=57.7% on ConvNext — +immediately ruling out layout-pass experiments (Transpose only 2.6%, already fused Gelu already +canonical). Profile-first makes the Explorer smarter and the search shorter. + +**New 4-phase structure:** + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ PHASE 0 — INTAKE │ +│ winml inspect → validate model is supported │ +│ winml build (baseline config) → get model.onnx │ +│ winml eval --mode compare → lock FP32 correctness baseline │ +│ winml perf (baseline) → establish latency floor │ +└────────────────────────────┬────────────────────────────────────────┘ + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ PHASE 1 — PROFILE (runs ONCE, before any search) │ +│ winml perf -m baseline/model.onnx --ep --profile │ +│ Parse bottleneck.json: │ +│ - top_bottleneck: op type with highest % of kernel time │ +│ - top3_concentration_pct: how concentrated the compute is │ +│ - headroom_hints: actionable pass recommendations │ +│ Classify each bottleneck op type: │ +│ - "compute" (Gemm, Conv, Attention) → quant/kernel matters │ +│ - "layout" (Transpose, Reshape) → graph pass matters │ +│ - "already_canonical" (op shows as fused type) → fusion N/A │ +│ Output: prioritized_hypothesis_queue (ordered by profile evidence)│ +└────────────────────────────┬────────────────────────────────────────┘ + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ PHASE 2 — PROFILE-GUIDED OPTIMIZATION LOOP │ +│ │ +│ ┌──────────────┐ ┌──────────────┐ ┌─────────────────────┐ │ +│ │ EXPLORER │───►│ OPTIMIZER │───►│ REVIEWER │ │ +│ │ │ │ │ │ │ │ +│ │ Pops next │ │ Runs ONE │ │ Cross-exp verdict: │ │ +│ │ hypothesis │ │ experiment: │ │ - CV gate Phase A │ │ +│ │ from queue, │ │ build + │ │ - full bench Gate 1 │ │ +│ │ motivated by │ │ quick-screen │ │ - keep / discard │ │ +│ │ profile data │ │ → full bench │ │ - detect plateau │ │ +│ │ │ │ → eval │ │ - stop condition │ │ +│ └──────────────┘ └──────────────┘ │ - write KB draft │ │ +│ ▲ └─────────────────────┘ │ +│ mandatory external-research triggers (adopted from V2): │ +│ • after 5 consecutive DISCARDs in same search dimension │ +│ → search ORT/QNN SDK source code for mechanism │ +│ • after every KEEP that changes precision or EP │ +│ → re-read ep_knowledge for updated constraints │ +│ • before declaring search_space_exhausted │ +│ → ORT source sweep: opset gates, EP-specific dispatch rules │ +│ │ +│ Explorer prunes via bottleneck.json (only "confirmed" KB rules): │ +│ IF top_bottleneck == "Gemm" (>50%): │ +│ → SKIP layout passes (transpose-optimizer, nchwc, nhwc) │ +│ → FOCUS on: quant precision, calibration, matmul fusions │ +│ IF top_bottleneck == "Transpose" (>10%): │ +│ → CHECK kMaxSupportedOpset for current ORT version FIRST │ +│ IF top_bottleneck == "Conv" (>20%): │ +│ → try nchwc-transformer, conv-activation-fusion │ +│ IF "Gelu"/"LayerNormalization" op_type (already canonical): │ +│ → SKIP corresponding fusion flags │ +└────────────────────────────┬────────────────────────────────────────┘ + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ PHASE 3 — REPORT │ +│ config__optimal.json ← champion config with _autoconfig_meta│ +│ report.html ← full benchmark + profile section │ +│ experiments// ← per-exp: hypothesis/impl/parity/ │ +│ perf/analysis/decision (V2 pattern) │ +│ kb_entry.json ← status="draft"; promoted to │ +│ "confirmed" only after mechanism confirmed (Gate 2) │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +**ep_knowledge draft/confirmed lifecycle (Gap 3 fix):** + +``` +KB entry states: + "draft" — observed perf delta, mechanism unconfirmed (Gate 2 not passed) + Can influence hypothesis PRIORITY but NOT prune search space + "confirmed" — mechanism confirmed via ORT/QNN source code (Gate 2 passed) + Can prune search space for future runs + "deprecated"— finding invalidated by new experiment or stack version change + Must NOT influence search space; kept for history only + +Transition rules: + draft → confirmed: requires mechanism_confirmed=true + source_citation + confirmed → deprecated: requires contradicting experiment OR stack version bump + deprecated entries: kept in JSON with status field, never deleted +``` + +**Profiler output → Explorer mapping table:** + +| Profile finding | Explorer action | Hypothesis skipped | +|---|---|---| +| Gemm > 50% | Prioritize quant/calib experiments | All layout-transform passes | +| Transpose < 5% (opset=17) | Transpose Optimizer already working | transpose-optimizer trials | +| op_type "Gelu" present | Already fused | gelu-fusion, fast-gelu-fusion | +| op_type "LayerNormalization" present | Already fused | layer-norm-fusion trials | +| Reorder{Input,Output} present (>4%) | NCHWc already active | nchwc-transformer trials | +| op_type "Attention" present | MHA already fused | attention-fusion trials | +| QDQ ops > 15% | Quant overhead high | Focus on op_types_to_quantize exclusions | +| Transpose > 10% + opset ≥ 19 | kMaxSupportedOpset issue | Flag as [KNOWN_TRADEOFF], lower opset | + +**Why profile-first matters (validated on ConvNext):** + +The ablation experiment ran 22 experiments over multiple days. Had the profiler run first: +- Profile shows: Gemm=57.7%, Conv=12.6%, Transpose=2.6%, Gelu=8% (already "Gelu" op) +- Explorer would have immediately skipped: `gelu-fusion`, `layer-norm-fusion`, `transpose-optimizer`, + `nchwc-transformer` (already active via ReorderInput/Output) +- Only candidates from profile: `matmul-add-fusion` (Gemm bottleneck), `conv-activation-fusion` +- This would have reduced 22 experiments to ~6, with the same conclusions + +**POC profiler:** `C:\tmp\autoconfig-demo\winml_profile.py` +- Uses ORT `enable_profiling=True` + `end_profiling()` (same pattern as AI Studio's profile_file.py) +- CPU EP: parses `_kernel_time` events from ORT JSON trace +- Output: `bottleneck.json` (structured) + `bottleneck.txt` (human-readable) + raw ORT trace +- ConvNext result: Gemm 57.7%, Conv 12.6%, Transpose 2.6% → confirms baseline is optimal for CPU + +--- + +### Sections + +**1. Phase 0 — Intake + Baseline** + +```bash +# Step 1: verify the model is supported +winml inspect -m --format json + +# Step 2: baseline build (default config, opset=17) +winml export -m -o baseline/ +winml build -c config_baseline.json -m -o baseline_built/ + +# Step 3: correctness contract +winml eval --mode compare -m baseline_built/model.onnx --model-id --format json +# Expected: cosine=1.0 (FP32 self-comparison) + +# Step 4: baseline perf +winml perf -m baseline_built/model.onnx --ep --warmup 10 --iterations 50 --format json +# Record: baseline_p50_ms +``` + +Initialize `results.tsv` (TSV, not CSV — commas break in description field): +``` +commit precision nodes_excluded cosine p50_ms calibration_samples status notes +``` + +--- + +**2. Phase 1 — Profile (runs once, BEFORE any search experiments)** + +```bash +# Run profiler on baseline model (--profile flag added to winml perf) +winml perf -m baseline_built/model.onnx --ep \ + --warmup 5 --iterations 20 --profile --out profile_out/ --format json +# Reads: profile_out/bottleneck.json +# POC (before --profile ships): python winml_profile.py --model ... --ep ... +``` + +Profiler output drives Explorer hypothesis initialization: + +``` +READ bottleneck.json: + top_bottleneck: + op_summary: [{op_type, pct}, ...] (sorted by descending pct) + headroom_hints: [...] + +BUILD skip_set (passes not worth trying): + FOR each op_type in op_summary: + IF op_type == "Gelu": skip_set.add(gelu-fusion, fast-gelu-fusion) + IF op_type == "LayerNormalization": skip_set.add(layer-norm-fusion) + IF op_type == "Attention": skip_set.add(attention-fusion) + IF "ReorderInput" in op_summary AND pct > 2%: + skip_set.add(nchwc-transformer) # already active + IF Transpose pct < 5% AND opset=17: + skip_set.add(transpose-optimizer) # already working, no gain + IF Transpose pct > 10% AND opset >= 19: + flag as [KNOWN_TRADEOFF]; add to report + +BUILD priority_queue (hypotheses in evidence-based order): + IF top_bottleneck == "Gemm" OR "MatMul": + queue: [quant_precision, calib_method, calib_samples, matmul_fusions, per_channel] + IF top_bottleneck == "Conv": + queue: [nchwc (if not in skip_set), conv_fusions, quant_precision] + IF top_bottleneck == "Attention": + queue: [quant_precision, nodes_to_exclude (Attention), calib_method] + DEFAULT: + queue: [quant_precision, calib_method, calib_samples] +``` + +--- + +**3. Phase 2 — Profile-Guided Optimization Loop (single EP)** + +``` +LOOP FOREVER (until user stops or convergence): + +1. EXPLORER: pop next hypothesis from priority_queue + - Skip if in skip_set (pruned by profile) + - If queue empty → enter Phase 4 (generalization) or stop + +2. HYPOTHESIZE: build config.json delta based on hypothesis + Hypothesis rules (profile-informed, in priority order): + a. If first loop: start with full W8A8/W8A16, all ops quantized + b. If cosine < floor: add worst partial_op to nodes_to_exclude (one at a time) + c. If cosine ≥ floor but latency > budget: try W8A8 instead of W8A16, + or reduce calibration_samples, or add per_channel=true + d. If stuck (3 iterations no improvement): try calibration_method change + (minmax → entropy → percentile) + e. If still stuck: try precision escalation (W8A8 → W8A16 → FP16) + +3. MODIFY: write updated config.json + Key fields in quant section: + { + "precision": "w8a8", + "samples": 128, + "calibration_method": "minmax", + "nodes_to_exclude": ["LayerNorm_0", "Softmax_3"], + "per_channel": false + } + +4. OPTIMIZER: winml build -c config.json -m -o out_/ + If build crashes: log as "crash", revert config, try different hypothesis + +5a. EVAL — quick sanity (cosine proxy, cheap): + winml eval --mode compare -m out_/artifact.onnx \ + --model-id --format json + → cosine_similarity, sqnr_db + If cosine < hard_floor (e.g. 0.85): fail-fast, skip step 5b + 6, log as discard + +5b. EVAL — task accuracy (real quality gate): + winml eval -m out_/artifact.onnx \ + --model-id \ + --task --device --ep \ + --samples 100 --format json + → top1_accuracy (image-classification), f1 (text), mAP (detection), etc. + This is the authoritative accuracy metric for Reviewer verdict. + + Why cosine alone is not sufficient: + - High cosine (0.97) but top-1 drops 5%: logit magnitudes preserved but relative ranking shifted + - Low cosine (0.92) but same top-1: relative ranking unchanged despite numeric difference + → Only task accuracy tells you whether the model still does its job + +6. PERF: winml perf -m out_/artifact.onnx \ + --device --ep --warmup 10 --iterations 50 --format json + → p50_ms, p90_ms + +7. REVIEWER: cross-experiment verdict + keep if task_accuracy ≥ accuracy_floor AND p50_ms ≤ latency_budget + discard if task_accuracy < accuracy_floor OR p50_ms > latency_budget + crash if build/eval failed + + Reviewer also checks: + - Plateau: 3+ keeps with Δlatency < 2% → likely at local optimum + - Profile divergence: if new op_type appears after build, re-profile + - Skip_set update: if experiment proves a pass is a no-op, add to skip_set + - Accuracy cliff: if task_accuracy drops > 3% in one step → flag, do not cascade + +8. LOG to results.tsv: + keep/discard/crash + +9. If keep: advance to next iteration from this config + If discard: revert to last kept config, try different hypothesis +``` + +**Convergence criteria** (stop the loop): +- cosine ≥ target floor AND p50_ms ≤ latency budget: objective achieved +- 5 consecutive discards with no improvement: report best so far +- User manually stops the agent + +--- + +**3. Hypothesis generation rules (the intelligence layer)** + +The agent generates hypotheses by traversing the search space in priority order. +Each hypothesis is motivated by diagnostic data from the previous experiment, not random search. + +**Priority ordering across the three config sections:** + +``` +Phase 1 — establish baseline (iteration 0) + Start with: opset_version=17, all fusions enabled, precision=w8a16, minmax, 128 samples + +Phase 2 — precision first (fastest to try, most impact) + If cosine < floor: + w8a16 → try w8a8 with selective exclusions, or w8a16 first + If latency > budget: + w8a16 → try w8a8 (smaller model, faster inference) + fp16 → try w8a16 (if currently at fp16) + +Phase 3 — calibration tuning (if precision is right but cosine still low) + Try in order: minmax → entropy → percentile + Try increasing samples: 128 → 256 → 512 + Try per_channel=true (better accuracy, slightly slower build) + Try symmetric=false if currently true + +Phase 4 — optimize pass tuning (independent of quant, affects graph structure) + Hypothesis: some fusion patterns create op shapes QNN handles poorly + Transformer models (try in order): + attention-fusion → skip-layer-norm-fusion → layer-norm-fusion → fuse-rmsnorm + Vision models (try in order): + conv-bn-fusion → conv-add-fusion → conv-activation-fusion + Shared (try if cosine drops or build crashes): + constant-folding=false (prevents size bloat; sometimes exposes EP-incompatible shape) + clamp-constant-values=true (fixes -inf attention mask → quantization issues) + remove-isnan-in-attention-mask=true (use after clamp; cleans dead IsNaN guards) + Try opset_version: 17 → 18 → 19 + (Higher opsets expose newer op types that may have better EP support) + +Phase 5 — selective node exclusion (when analyze shows partial ops) + Read winml analyze --format json → partial_ops list + Exclude one partial_op at a time (greedy: exclude highest-impact first) + Also try excluding op_types_to_quantize selectively + e.g., remove "LayerNorm" from op_types_to_quantize list + +Phase 6 — combined search (if single-dimension changes are stuck) + Try combinations of best Phase 3 + Phase 4 + Phase 5 changes together +``` + +**Diagnosis table — what to try given what you see:** + +| Symptom | Likely cause | Phase to try next | +|---|---|---| +| cosine drops a lot at quant stage, all ops supported | Calibration data mismatch | Phase 3: entropy calib, more samples | +| cosine drops at quant, Attention ops partial | Attention activation quant on QNN | Phase 5: exclude Attention nodes | +| cosine OK but latency worse than CPU | Fusion pattern creating unoptimized subgraph | Phase 4: disable attention-fusion, try different opset | +| cosine OK but model larger than expected | Constant folding inlining large weights | Phase 4: constant-folding=false | +| Both cosine and latency good at w8a8 but build crashes | opset op not supported by quant pipeline | Phase 4: opset_version 17 → 16 | +| cosine highly variable across seeds | Calibration with too few samples | Phase 3: 128 → 256 samples | +| All ops supported, cosine still drops after fusions | Fusion creates non-quantizable shape | Phase 4: disable skip-layer-norm-fusion | +| QNN build fails with "invalid scale" | -inf in attention mask initializer | Phase 4: clamp-constant-values=true | +| Vision model: accuracy drops unexpectedly | Conv+BN fusion slightly changes weight values | Phase 4: disable conv-bn-fusion | +| MatMul-heavy model: latency not improving | MatMul not being fused | Phase 4: matmul-add-fusion, matmul-transpose-fusion | +| RMSNorm model (Llama etc.) poor QNN perf | ORT not recognizing RMSNorm pattern | Phase 4: fuse-rmsnorm=true | + +This is the key difference from grid search: **each hypothesis is motivated by diagnostic data from `winml analyze` and the previous experiment result**. + +--- + +**4. Multi-EP config generation** + +Run parallel loops for each target EP, then aggregate into `manifest.json`: + +```bash +# Agent runs loops for each EP (can be sequential or parallel): +# Loop 1: ep=qnn, target_device=npu +# Loop 2: ep=dml, target_device=gpu +# Loop 3: ep=cpu, target_device=cpu + +# After all loops complete, agent generates: +# - config_qnn_optimal.json (best config found for QNN) +# - config_dml_optimal.json (best config found for DirectML) +# - config_cpu_optimal.json (best config found for CPU) + +# Then builds final artifacts and assembles manifest.json +``` + +Generated `manifest.json` includes experiment provenance: +```json +{ + "model_id": "microsoft/resnet-50", + "generated_by": "autoconfig", + "experiments_run": 34, + "variants": [ + { + "ep": "qnn", "device": "npu", + "file": "model_qnn.onnx", + "precision": "w8a16", + "nodes_excluded": ["MultiHeadAttention"], + "cosine_similarity": 0.972, + "p50_ms": 18.3, + "config": "config_qnn_optimal.json" + }, + { + "ep": "dml", "device": "gpu", + "file": "model_dml.onnx", + "precision": "fp16", + "nodes_excluded": [], + "cosine_similarity": 0.999, + "p50_ms": 22.1, + "config": "config_dml_optimal.json" + }, + { + "ep": "cpu", "device": "cpu", + "file": "model_cpu.onnx", + "precision": "w8a8", + "nodes_excluded": ["LayerNorm"], + "cosine_similarity": 0.931, + "p50_ms": 84.7, + "config": "config_cpu_optimal.json" + } + ], + "selection_order": ["qnn", "dml", "cpu"] +} +``` + +--- + +**5. results.tsv format** + +Track all three config sections per experiment (TSV, not CSV): +``` +commit opset fusions_disabled precision nodes_excluded cosine p50_ms calib_samples calib_method status notes +baseline 17 [] fp32 [] 1.000 — — — keep FP32 reference +a1b2c3d 17 [] w8a8 [] 0.871 16.2 128 minmax discard full W8A8 too aggressive +b2c3d4e 17 [] w8a16 [] 0.967 19.8 128 minmax keep W8A16 baseline meets floor +c3d4e5f 17 [] w8a16 [] 0.969 19.1 256 entropy keep entropy calib improvement +d4e5f6g 17 [attention-fusion] w8a16 [] 0.971 18.4 256 entropy keep disabling attn-fusion helps latency +e5f6g7h 18 [attention-fusion] w8a16 [] 0.973 17.9 256 entropy keep opset18 best so far +f6g7h8i 18 [attention-fusion] w8a8 [MultiHeadAttention] 0.961 14.2 256 entropy keep mixed prec: meet latency budget +``` + +--- + +**6. Skill outputs** + +autoconfig produces **two primary outputs** after convergence or user stop: + +#### Output A: Best config file + +`config__optimal.json` — the winning config.json, ready to pass to `winml build`. Contains provenance metadata so it's reproducible: + +```json +{ + "_autoconfig_meta": { + "model_id": "facebook/convnext-tiny-224", + "ep": "qnn", + "objective": "latency-primary", + "latency_budget_ms": 20, + "accuracy_floor": 0.95, + "experiments_run": 23, + "best_iter": "iter_17", + "timestamp": "2026-06-10T11:55:05+08:00" + }, + "export": { "opset_version": 18 }, + "optimize": { "attention-fusion": false }, + "quantize": { + "precision": "w8a16", + "calibration_method": "entropy", + "calibration_samples": 256, + "nodes_to_exclude": ["MultiHeadAttention_0"] + } +} +``` + +#### Output B: HTML benchmark report + +`report.html` — self-contained single-file report (no external dependencies), viewable in any browser. Contains: + +**Section 1 — Summary card** +``` +Model: facebook/convnext-tiny-224 EP: QNN (NPU) +Objective: latency-primary ≤ 20ms Accuracy floor: 0.95 +Result: ✅ FOUND Experiments: 23 Time: 41 min + +Best config: W8A16, entropy calib, 256 samples + Accuracy: 0.953 (floor 0.95 ✓) + p50 latency: 15.8ms (budget 20ms ✓) +``` + +**Section 2 — Search progress chart** +Scatter plot: all 23 experiments, x=p50_latency_ms, y=accuracy. +- Green dot = kept (improvement) +- Red dot = discarded (regression) +- Star = best found +- Hover tooltip: iter ID, config diff vs previous + +**Section 3 — Iteration table** +Full results.tsv rendered as sortable HTML table with columns: +``` +iter | opset | precision | nodes_excluded | calib | accuracy | p50_ms | Δacc | Δlatency | status | hypothesis +``` +Color-coded rows: green = keep, red = discard, gold = best. + +**Section 4 — Config diff timeline** +Visual diff showing what changed between each kept iteration (config deltas as `+`/`-` lines). + +**Section 5 — Model graph analysis** (from pre-search `winml analyze`) +- Op distribution pie chart (ONNX vs com.microsoft) +- EP compatibility table: ops supported/unsupported on target EP +- Detected patterns (GELU variant, attention structure, Transpose-sandwich) + +**Section 6 — Benchmark details** +For the best config, full `winml perf` output: +- p10/p50/p90/p99 latency histogram +- Throughput (samples/sec) +- Warmup vs steady-state comparison +- (If multi-EP: side-by-side EP comparison bar chart) + +**Section 7 — Reproduction instructions** +```bash +# Reproduce the winning config: +winml build -c config_qnn_optimal.json -m facebook/convnext-tiny-224 -o out/ +# For NPU: always compile after build (empirically +1.7× speedup) +winml compile -m out/model.onnx --device npu --ep qnn -o out_compiled/ +winml perf -m out_compiled/model_npu_ctx.onnx --ep qnn --iterations 100 --warmup 10 +``` + +**Report generation approach**: The agent generates report.html using inline Python with Jinja2-style string templating + embedded Chart.js (CDN or inlined). No external dependencies — single file, opens offline. + +--- + +**7. What the agent says in chat** + +After convergence or user stop (terminal summary, report is the real deliverable): + +``` +autoconfig completed. 23 experiments run (41 min). + +Best config (QNN NPU): + W8A16, entropy calib, 256 samples, MultiHeadAttention excluded + accuracy 0.953 ✓ (floor 0.95) p50 15.8ms ✓ (budget 20ms) + +Outputs: + config_qnn_optimal.json ← drop into winml build -c + report.html ← open in browser for full benchmark breakdown + +Next: winml validate-before-ship for production gate. +``` + +--- + +**8. Constraints and failure handling** + +- **Build timeout**: If `winml build` exceeds 15 minutes, kill and log as crash +- **OOM**: If build fails with out-of-memory, reduce `calibration_samples` by half +- **All hypotheses exhausted**: Report best config found, note convergence limit +- **Latency not measurable** (target EP not on machine): run eval only, skip perf gate + +**9. CLI-only constraint (critical)** + +The agent MUST use only official `winml` CLI commands as its tool surface. No Python scripting, no direct ONNX manipulation, no third-party tools (onnxconverter-common, onnxsim, Olive, etc.) except where explicitly documented as a known workaround. + +**Rationale**: autoconfig's output is a `config.json` + `report.html` that a user can reproduce with `winml build -c config.json`. If the agent used a Python hack to produce a model artifact, the config is not reproducible and the report is misleading. + +**Known workarounds (allowed, must be flagged in report):** +| Workaround | Replaces | Tracking issue | Required flag in report | +|---|---|---|---| +| `python winml_profile.py` | `winml perf --profile` (not yet shipped) | pending | ⚠️ "Profile data via POC script, not official API" | + +**Gap reporting rule**: If a hypothesis cannot be tested because the required `winml` CLI capability does not exist, the agent MUST: +1. Record the hypothesis as `SKIPPED — CLI gap` in the experiment table +2. Add an entry to **Section 6 "Gaps & Issues"** block in `report.html`: + ``` + GAP: requires + Impact: + Filed: + ``` +3. NOT silently substitute a Python workaround that produces unverifiable artifacts + +**Example gaps encountered during ConvNext QNN GPU validation:** +- `winml build --precision fp16` flag not available (#867) → FP16 native export untested → `SKIPPED — CLI gap` +- `winml perf --ep-option` not available (#865) → runtime flag sweep untested → `SKIPPED — CLI gap` +- `winml perf --profile` for QNN EP not available → profiling via POC script (allowed workaround) +- W8A8 QDQ ONNX on QNN GPU EP hangs indefinitely — root cause is QNN SDK behavior; ``winml build`` already prevents this via ``_patch_device()``; fast-fail enhancement filed as #868 (low priority) + +--- + +### Key commands used + +```bash +# Phase 1: profiling (--profile flag on winml perf, before search) +winml perf -m baseline_built/model.onnx --ep --warmup 5 --iterations 20 \ + --profile --out profile_out/ --format json +# → profile_out/bottleneck.json (machine-readable for Explorer) +# → profile_out/bottleneck.txt (human-readable summary) +# POC: python winml_profile.py --model ... --ep ... (until --profile ships) + +# Phase 2: analysis (informs nodes_to_exclude hypotheses) +winml analyze -m .onnx --ep --format json + +# Phase 2: experiment +winml build -c config.json -m -o out_/ + +# Phase 2: metrics +winml eval --mode compare -m out_/artifact.onnx --model-id --format json +winml perf -m out_/artifact.onnx --device --ep --iterations 50 --format json + +# Phase 3: compile best candidate to QNN EPContext (NPU only) +# Eliminates JIT overhead; empirically ~1.7× further speedup on ConvNext W8A16 +winml compile -m best_candidate/model.onnx --device npu --ep qnn -o best_compiled/ +# → best_compiled/model_npu_ctx.onnx (loads context binary at runtime) +# → best_compiled/model_npu_ctx_qnn.bin (QNN hardware-compiled graph) + +# Phase 3: re-benchmark compiled model +winml perf -m best_compiled/model_npu_ctx.onnx --device npu --ep qnn --warmup 10 --iterations 50 +``` + +**Empirical data: ConvNext QNN NPU compile impact** +| Version | p50 | vs FP32 NPU | +|---|---|---| +| FP32 baseline | 19.39ms | — | +| W8A16 quantized | 10.29ms | 1.9× | +| **W8A16 + compile** | **6.01ms** | **3.2×** | +→ `winml compile` alone adds ~1.7× on top of quantization. Always compile for NPU deployment. + +**Empirical data: ConvNext QNN GPU optimization sweep (Adreno X1-85) — full search** +| Experiment | p50 | p90 | std | vs FP32 | Notes | +|---|---|---|---|---|---| +| FP32 baseline (autoconf) | **17.7ms** | 19.7ms | 0.97 | — | ✅ **OPTIMAL with current CLI** | +| NHWC transformer | 19.5ms | 23.8ms | 3.43 | ❌ −10% | Hurts Adreno+QNN EP | +| NHWC + all GPU fusions | 18.1ms | 23.9ms | 2.71 | ❌ −2% | Still worse | +| Conv/norm fusions (no NHWC) | 17.6ms | 22.6ms | 5.51 | ≈0% | Variance ↑, no gain | +| LayerNorm rewrite | 18.4ms | 21.4ms | 2.04 | ❌ −4% | Pattern mismatch anyway | +| Transpose optimizer | 0% node Δ | — | — | no-op | Already optimal positions | +| HiDimRTR→LowDimRTR | 0% node Δ | — | — | no-op | ConvNext RTR doesn't match pattern | +| MatMulAdd→Conv2D (2d/3d/4d) | 0% node Δ | — | — | no-op | ConvNext uses Reshape→MatMul, not bare MatMul+Add | +| FP32 + compile | 23.7ms | — | — | ❌ −34% | Compile hurts GPU (opposite of NPU) | +| W8A8 QDQ quantized | hangs | — | — | ❌ blocked | #868 enhancement (fast-fail) | +| FP16 (invalid CLI path) | 8.8ms | ~32ms | bimodal | ⚠️ 2× p50 | BLOCKED — need #867 | + +**Root cause: why no pass matches ConvNext on QNN GPU** +- All 251 ops run natively on GPU (251/0/0/0) — no CPU fallback to eliminate +- ConvNext linear layers: `Reshape → MatMul → Reshape` pattern, not bare `MatMul+Add` → Conv2D rewrites don't match +- 72 Reshape + 42 Transpose are already at minimum / optimal topology from PyTorch export +- `winml build` autoconf (gelu_fusion + matmul_add_fusion) already applied all relevant transforms +- The bottleneck is compute throughput + memory bandwidth — only FP16 (smaller tensors) can improve this + +**Key insight: gelu_fusion matters for variance, not p50** +| Version | p50 | p90 | std | +|---|---|---|---| +| Raw export (287 nodes, unfused Gelu) | 17.4ms | 29.2ms | 5.90 | +| Autoconf (251 nodes, fused Gelu+Gemm) | 17.7ms | 19.7ms | 0.97 | + +Unfused Gelu = 5 separate GPU kernel launches (Mul→Div→Erf→Mul→Add) with scheduling jitter. +A single `Gelu` kernel eliminates dispatch overhead → p90 −48%, std −6×. +→ autoconf's role on GPU is **stability**, not speedup. Critical for real-time / latency-SLA deployments. + +→ **QNN GPU search space exhausted.** FP16 is the only remaining lever, blocked by #867. + +**Empirical data: ConvNext DML optimization sweep (Adreno X1-85, DirectML)** +| Experiment | p50 | p90 | std | vs FP32 | +|---|---|---|---|---| +| FP32 baseline (autoconf, 251 nodes) | **16.9ms** | 17.7ms | 0.52 | — ← OPTIMAL with current CLI | +| NHWC transformer | 16.5ms | 21.0ms | 1.89 | ❌ p90 worse | +| Raw unfused export (287 nodes) | 16.5ms | 18.4ms | 2.74 | ❌ p99=35ms, worse tail | +| FP16 (Python hack ⚠️) | **11.8ms** | 12.8ms | 0.66 | ✅ **1.4× faster, clean dist** — BLOCKED #867 | + +**DML vs QNN GPU comparison (same Adreno X1-85):** +| | QNN GPU FP32 | DML FP32 | DML FP16 (invalid) | +|---|---|---|---| +| p50 | 17.7ms | **16.9ms** | **11.8ms** | +| p90 | 19.7ms | **17.7ms** | **12.8ms** | +| std | 0.97 | **0.52** | **0.66** | + +→ DML is consistently faster and more stable than QNN GPU at FP32. Root cause: DML JIT-compiles HLSL shaders at model load time; QNN GPU EP does graph partitioning at each session creation. +→ DML FP16: no DVFS bimodal (unlike QNN GPU FP16) — DML's shader compilation locks in FP16 compute paths. +→ NHWC hurts DML too (same reason as QNN GPU: Adreno X1-85 + D3D12 doesn't benefit from explicit NHWC transforms). +→ Note: `winml analyze` returns 0/0/0/251 (all Unknown) for DML — no rule data. DML supports all standard ONNX ops by design. + +**QNN Hub benchmark comparison (Snapdragon X Elite CRD) — WITH cross-stack test** + +| Model | Stack | NPU p50 | GPU p50 | Notes | +|---|---|---|---|---| +| QNN Hub Float (opset 21, 222 nodes, MatMul) | qairt cloud | **2.687ms** | — | Reference | +| QNN Hub Float (same model) | winml ORT QNN EP | **8.78ms** | 23.9ms | Direct test on this device | +| Our Float (opset 17, 251 nodes, Gemm) | winml ORT QNN EP | 19.4ms | 17.7ms | winml build output | +| QNN Hub W8A16 (opset 21, 798 QDQ, uint16 input) | qairt cloud | **2.612ms** | — | Reference | +| QNN Hub W8A16 (same model) | winml ORT QNN EP | 14.82ms (std=8.8!) | — | ORT-QNN mismatch | +| Our W8A16 + compile (opset 17, ORT quant) | winml ORT QNN EP | **6.01ms** | — | Best we can do | + +**Gap decomposition (three independent sources):** +``` +QNN Hub cloud: 2.7ms + ↑ 3.3× Runtime gap (qairt native vs ORT QNN EP adapter overhead) +QNN Hub on winml: 8.78ms + ↑ 2.2× Model graph gap (opset 21/MatMul/222 nodes vs opset 17/Gemm/251 nodes) +Our model on winml: 19.4ms (FP32) +``` + +**Actionable findings (updated 2026-06-10 — mechanism confirmed via ORT source):** +1. **opset 21 NPU speedup mechanism CONFIRMED — but ORT-version-dependent** (#869) + - **Root cause**: `kMaxSupportedOpset` gate in `IsSupportedOpset()` (layout_transformation.cc). On older ORT where `kMaxSupportedOpset` < 21, opset 21 models bypass the NHWC layout transform entirely (`transform_layout_fn = nullptr`). + - **Why bypass helps ConvNext**: NHWC transform inserts `Transpose(NCHW→NHWC/NHWC→NCHW)` around Conv. ConvNext residual connections **block** full transpose cancellation → extra Transpose ops on HTP → slower. Bypassing = cleaner graph = faster. + - **Critical caveat**: Current ORT main has `kMaxSupportedOpset = 26` → BOTH opset 17 and 21 get NHWC transform. **Must verify ORT version** before assuming the speedup exists. + - **Does NOT generalize** to: MobileNet/EfficientNet (no residual Transpose blocks), ViT (no Conv). + - **Perf claim validation status**: Gate 1 (iter≥1000×3) and Gate 3 (thermal control) still FAILED. Perf numbers are DVFS-dominated. +2. **Runtime stack gap (3.3×) is structural**: qairt native will always be faster. Correct baseline = "QNN Hub ONNX on winml" (8.78ms). +3. **QNN Hub W8A16 is WORSE on our stack** (14.82ms, std=8.8ms): opset 21 QDQ + uint16 input incompatible with ORT QNN EP format. +4. **Opset is a search dimension** — but the correct action is a FULL SWEEP (17–22), not "try 21 first". The optimal opset depends on ORT version. + +**EP-specific search space rules** + +| EP | Quantization | Opset | Graph passes | Compile | Key insight | +|---|---|---|---|---|---| +| QNN NPU | ✅ W8A16 | Full sweep 17-22 (mechanism ORT-version-dependent) | autoconf (gelu+matmul_add) | ✅ Always | W8A8 catastrophic on LN+GELU; opset effect depends on ORT kMaxSupportedOpset | +| QNN GPU | ❌ Skip | 17 (opset 21 not validated) | autoconf only | ❌ Skip | Compile regresses; FP16 only lever (#867) | +| DML | ❌ Skip | 17 (opset 21 not validated) | autoconf only | N/A | FP16 primary lever (#867); faster+stabler than QNN GPU | +| CPU | ❌ Skip | 17 only (kMaxSupportedOpset causes 3-4× regression on 19+) | nchwc, matmul-add, gelu | N/A | kMaxSupportedOpset gate hurts CPU for same reason it helps QNN | + +Rule: autoconfig must use EP-specific search space. Do NOT run quantization experiments for GPU/DML/CPU. +Rule: for QNN NPU opset sweep, verify ORT `kMaxSupportedOpset` first — if ≥ 22, all opsets get NHWC transform and the opset-based speedup may not apply. +Rule: for NPU, if W8A8 top-1 ≤ 15% on first attempt → skip all W8A8 variants, go directly to W8A16. +Rule: always run `winml compile` after finding best quantized config for QNN NPU. NEVER compile for GPU (regresses). +Rule: for GPU/DML, skip ALL graph optimization passes beyond what `winml build` autoconf applies (NHWC and additional fusions hurt). +Rule: W8A8 QDQ on GPU EP hangs — skip quantization immediately for GPU targets without testing. + +**User scenario mapping** + +| Scenario | How autoconfig addresses it | +|---|---| +| S1: LLM fast support (7-30d) | autoconfig replaces manual per-EP tuning; outputs `config_optimal.json + report.html` deployable in hours not days | +| S2: ISV non-LLM model support | Exact use case: ISV brings model → autoconfig finds config → report is deliverable with SOP turnaround | +| S3: Cross-EP parity | Multi-EP parallel run: same model, EP-specific search spaces in parallel → output config matrix per EP | +| S4: Customer ONNX can't run | Phase 0 intake diagnoses "can't run" (partial ops → block reason); Phase 1+2 finds "escape config" for "runs poorly" | +| S5: PyTorch HF Hub coverage | Phase 0 IS the "can WinML run it?" gate; failed Phase 0 → structured block reason feeds long-tail gap tracking | + +**Dependencies on code changes**: +- `winml perf --profile` (new flag) — adds per-op bottleneck output alongside existing latency metrics; POC script `winml_profile.py` exists to unblock +- `--format json` on `winml eval` (#847), `winml analyze` (#848), `winml perf` (#849) + +### Cross-references +- Run `ep-compatibility-check` before starting to verify EP is available +- After autoconfig completes → `validate-before-ship` for final production gate +- If autoconfig cannot meet objective → `debug-accuracy-drop` for deeper diagnosis +- Multi-EP output feeds directly into `prepare-for-winapp` manifest layout +- If the best config found is still not good enough → escalate to `optimization-research` + +--- + +## Skill 11: `optimization-research` (internal — deep gap analysis) + +### Frontmatter +```yaml +name: optimization-research +description: > + Use this skill when a winml-cli engineer wants to find out whether a model can + be optimized better than what winml-cli currently achieves, identify what is + blocking that optimization, and produce concrete backlog work items. + The agent performs a deep search across: ORT source code and its optimizer + passes, Olive recipes and benchmarks, other ONNX ecosystem tools (onnxsim, + onnxoptimizer, neural-compressor, etc.), and native stack reference models + and datasets. It compares the best achievable result (using all available tools) + against what winml produces today, diagnoses the gap, and files GitHub issues + with reproduction steps. Use when an internal engineer says "why is this model + slower than it should be", "what optimization techniques are we missing", + or "what would it take to match Olive's results". + +audience: internal (winml-cli team engineers) +``` + +### When to use +- "ConvNext on QNN is 3× slower than what Qualcomm's SDK achieves — why?" +- "Olive gets 15ms on this model; winml gets 28ms — what's the gap?" +- "We're seeing quantization accuracy drop on LLaMA; are there better calibration methods we're not supporting?" +- "What would it take to match ORT's best-known config for this architecture?" +- After `autoconfig` hits a ceiling: best config found is still not meeting the objective + +### What this skill produces + +**Primary outputs:** +1. **`gap_analysis.md`** — structured report of what the best achievable result is and what's missing +2. **`repro/`** — scripts to reproduce the better result using external tools +3. **GitHub issues** — one per identified gap, filed against winml-cli with: repro steps, expected vs actual, what ORT/Olive/ecosystem already does, proposed fix direction + +--- + +### Design: Deep Search Process + +``` +┌──────────────────────────────────────────────────────────────────┐ +│ PHASE 1 — BASELINE │ +│ winml autoconfig best result for this model/EP │ +│ (or provided by user if already run) │ +└─────────────────────────┬────────────────────────────────────────┘ + ▼ +┌──────────────────────────────────────────────────────────────────┐ +│ PHASE 2 — EXTERNAL BENCHMARK │ +│ Run same model through: │ +│ A. ORT optimizer directly (onnxruntime.tools.transformers) │ +│ B. Olive (olive-ai) with ep-specific recipe │ +│ C. onnxsim + onnxoptimizer (static graph simplification) │ +│ D. neural-compressor (Intel) for quantization comparison │ +│ Record: best latency, accuracy, config used │ +└─────────────────────────┬────────────────────────────────────────┘ + ▼ +┌──────────────────────────────────────────────────────────────────┐ +│ PHASE 3 — GAP DIAGNOSIS │ +│ For each gap (external better than winml): │ +│ a. Diff the ONNX graphs (what ops/patterns differ?) │ +│ b. Read ORT optimizer source to understand what it does │ +│ c. Check winml's capability registry — is this pass missing? │ +│ disabled by default? wired incorrectly? │ +│ d. Check Olive recipe — what flags/params does it use? │ +│ Classify gap as one of: │ +│ [MISSING_CAPABILITY] — pass exists in ORT, not in winml │ +│ [WRONG_DEFAULT] — pass exists but wrong default/order │ +│ [BUG] — pass exists but produces wrong graph│ +│ [CALIBRATION_DATA] — accuracy gap from calibration set │ +│ [EP_LIMITATION] — EP itself can't do this, not winml │ +│ [KNOWN_TRADEOFF] — intentional: winml trades X for Y │ +└─────────────────────────┬────────────────────────────────────────┘ + ▼ +┌──────────────────────────────────────────────────────────────────┐ +│ PHASE 4 — NATIVE STACK VALIDATION │ +│ Check existing reference models in winml-cli test suite: │ +│ - Are there models of this architecture in tests/models/? │ +│ - Do their expected results match what we see? │ +│ Check Windows AI Studio / WinML model zoo: │ +│ - Is this architecture listed? At what performance? │ +│ Check QNN SDK reference benchmarks (if QNN EP): │ +│ - Does QNN vendor claim better numbers for this model? │ +└─────────────────────────┬────────────────────────────────────────┘ + ▼ +┌──────────────────────────────────────────────────────────────────┐ +│ PHASE 5 — WORK ITEMS │ +│ For each [MISSING_CAPABILITY] or [WRONG_DEFAULT] gap: │ +│ - Draft GitHub issue with: title, body, repro, expected, │ +│ actual, proposed fix, ORT source pointer │ +│ - Estimate implementation complexity (S/M/L/XL) │ +│ For [BUG]: file with full repro script │ +│ For [CALIBRATION_DATA]: suggest dataset and eval protocol │ +│ For [EP_LIMITATION]: file with QNN/DML SDK reference │ +└──────────────────────────────────────────────────────────────────┘ +``` + +--- + +### Key external tools to invoke + +```bash +# A. ORT transformer optimizer (the "gold standard" for transformer models) +python -c " +from onnxruntime.transformers import optimizer +from onnxruntime.transformers.fusion_options import FusionOptions +opts = FusionOptions('bert') # or 'gpt2', 'clip', etc. +opts.enable_attention = True +opts.enable_gelu = True +model = optimizer.optimize_model( + 'export.onnx', model_type='bert', + num_heads=12, hidden_size=768, + optimization_options=opts +) +model.save_model_to_file('ort_optimized.onnx') +" + +# B. Olive (end-to-end, EP-aware) +olive run --config olive_recipe.json +# olive recipe template: see skills/optimization-research/templates/olive_qnn.json + +# C. onnxsim (structural simplification) +python -m onnxsim export.onnx simplified.onnx + +# D. onnxoptimizer +python -c " +import onnxoptimizer, onnx +m = onnx.load('export.onnx') +passes = onnxoptimizer.get_available_passes() +m2 = onnxoptimizer.optimize(m, passes) +onnx.save(m2, 'onnxopt.onnx') +" +``` + +--- + +### Gap report format (`gap_analysis.md`) + +```markdown +# Optimization Gap Analysis: on + +Date: +winml-cli version: +ORT version: + +## Summary +| Tool | Latency p50 | Accuracy | Config notes | +|---|---|---|---| +| winml best (autoconfig) | 28.3ms | 0.953 | W8A16, entropy, 256 samples | +| ORT transformer optimizer | 19.1ms | 0.951 | model_type=bert, all fusions | +| Olive QNN recipe | 17.8ms | 0.948 | W8A8 + attention fusion | +| **Gap** | **10.5ms (37%)** | — | — | + +## Gap 1: [MISSING_CAPABILITY] FusedMatMul with rotary embedding +**What external tool does:** ... +**What winml does:** ... +**ORT source:** `onnxruntime/python/tools/transformers/fusion_rotary_attention.py` +**Proposed fix:** Add RotaryAttentionFusion to FusionPipe capability registry +**Estimated effort:** M + +## Gap 2: [WRONG_DEFAULT] attention-fusion disabled by default +... +``` + +--- + +### GitHub issue template + +```markdown +title: [optimization-gap] /: + +body: +## Summary + + +## Reproduction +```bash +# Install +uv pip install winml-cli + +# Baseline (winml current) +winml build -c config.json -m -o winml_out/ +winml perf -m winml_out/model.onnx --ep --warmup 10 --iterations 50 + +# Better result (external) + +``` + +## Expected vs actual +- External tool achieves: ms at +- winml achieves: ms at +- Gap: ms (%) + +## Root cause + + +## ORT source reference + + +## Proposed fix direction + + +## Complexity estimate +S / M / L / XL +``` + +--- + +### What this skill does NOT do +- Does not make code changes to winml-cli itself (files issues only) +- Does not run production benchmarks (uses quick screening methodology) +- Does not replace formal performance testing with validated hardware + +### Cross-references +- `autoconfig` provides the winml baseline to compare against +- Issues filed here feed `adding-ep-support` and `contributing-a-skill` workflows +- Use `ep-compatibility-check` to confirm EP availability before running external benchmarks + +--- + + +--- + +## ConvNext Autoconfig POC — Rigorous Ablation Results + +**Source:** `C:\tmp\autoconfig-demo\ablation.py` — 4-phase rigorous ablation experiment +**Measurement:** `winml perf --ep cpu --warmup 10 --iterations 50` — pure inference latency, no preprocessing +**Design:** 3 independent runs per config; promotion threshold = max(3%, 2×σ_baseline); correctness gate (`winml eval --samples 20`) per config +**Report:** `C:\tmp\autoconfig-demo\report.html` | **Config:** `C:\tmp\autoconfig-demo\config_cpu_optimal.json` + +### Graph structure (facebook/convnext-tiny-224, opset 17) + +**Op counts (raw export):** 287 nodes total +``` +Add×72 Mul×54 Transpose×42 MatMul×36 LayerNormalization×23 +Conv×22 Div×18 Erf×18 ReduceMean×1 Gemm×1 +``` + +**ConvNext block structure** (traced from first DW-Conv): +``` +DW-Conv(7x7, g=96) → Transpose +→ LayerNormalization (native, already fused at export) +→ MatMul(C→4C) → Add(bias) +→ [GELU: Div → Erf → Add(1) → Mul → Mul(0.5)] ← 18 unfused in export +→ MatMul(4C→C) → Add(bias) [Gemm after ORT L2] +→ Mul (layer scale) → Add (residual) +→ Transpose (back to NCHW) +``` + +**Conv breakdown:** 4 regular (1×stem 4x4, 3×downsample 2x2 stride-2), 18×DW-Conv 7x7 + +**Transpose patterns:** +``` +19× Conv → Transpose → LayerNormalization (NCHW→NHWC for LN) +15× Mul → Transpose → Add (NHWC→NCHW for residual) + 4× LayerNormalization → Transpose → Conv (NHWC→NCHW for next DW-Conv) + 2× Add → Transpose → Conv + 2× Add → Transpose → LayerNormalization +``` +→ ConvNext is a **Transpose-sandwich** model: alternates NCHW (Conv) and NHWC (LN) layout + +**Observed graph transformation (export.onnx → model.onnx after winml build, baseline config):** +| Op | export.onnx | model.onnx (baseline) | Change | +|---|---|---|---| +| `com.microsoft/Gelu` | 0 | 18 | +18 | +| `Gemm` | 1 | 37 | +36 | +| `MatMul` | 36 | 0 | −36 | +| `Add` | 72 | 18 | −54 | +| `Mul` | 54 | 18 | −36 | +| `Div`, `Erf` | 18 each | 0 | −18 each | +| `Reshape` | 0 | 72 | +72 | + +**Observation (confirmed):** The baseline `model.onnx` (no user fusion flags) already differs substantially from `export.onnx`. GELU and MatMul+Add are fused before any user capability flag is applied. + +**Open question (unresolved):** The `ORTGraphPipe` design (graph.py) is supposed to disable `GeluFusion`/`GeluFusionL2`/`LayerNormFusion` in the baseline via `optimization.disable_specified_optimizers`. Yet the baseline output clearly contains `com.microsoft/Gelu`. This contradiction is unresolved — possible explanations include: ORT name mismatch in disabled list, a different code path fusing GELU, or the export step (via HF Optimum) applying fusion before winml. **This must be investigated before any mechanistic claims about "ORT L2 already does X" are written in user-facing reports.** + +--- + +### Ablation results (rigorous, Phase 0–4) + +**Clean baseline:** 43.7ms p50 (base_0 + base_1, 6 runs, all within 42.5–45.4ms) + +| config | p50 mean | Δ vs baseline | runs (ms) | verdict | +|---|---|---|---|---| +| base_0 | 43.0ms | −0.6ms | 43.8 / 42.7 / 42.5 | baseline | +| base_1 | 44.3ms | +0.6ms | 43.2 / 44.3 / 45.4 | baseline | +| base_2 | 73.5ms | +29.8ms | 47.2 / **127.1** / 46.2 | outlier run (system spike) | +| opset_18 | 48.0ms | +4.3ms | 50.2 / 44.0 / 49.7 | neutral | +| **opset_19** | **160.3ms** | **+116ms** | **147.6 / 145.8 / 187.4** | **⚠️ SEVERE REGRESSION** | +| **opset_20** | **131.0ms** | **+87ms** | **135.7 / 129.8 / 127.5** | **⚠️ SEVERE REGRESSION** | +| **opset_21** | **170.3ms** | **+126ms** | **190.1 / 164.9 / 155.8** | **⚠️ SEVERE REGRESSION** | +| **opset_22** | **85.0ms** | **+41ms** | **70.9 / 93.9 / 90.2** | **confirmed regression** | +| no_cf_17 | 51.8ms | +8.1ms | 56.4 / 49.0 / 49.9 | mild regression | +| base_mid | 49.4ms | +5.8ms | 51.3 / 51.1 / 45.9 | baseline (mid-exp drift) | +| gelu_only | 52.5ms | +8.9ms | 53.0 / 55.6 / 49.1 | mild regression | +| ln_only | 57.2ms | +13.6ms | **79.3** / 47.9 / 44.5 | inconclusive (outlier) | +| conv_add | 50.2ms | +6.5ms | 47.3 / 55.9 / 47.4 | inconclusive | +| conv_act | 51.2ms | +7.5ms | 45.2 / 41.9 / **66.4** | inconclusive (outlier) | +| **matmul_add** | **81.7ms** | **+38.0ms** | **63.0 / 70.8 / 111.2** | **CONFIRMED REGRESSION** | +| transpose_opt | 45.5ms | +1.8ms | 42.3 / 52.3 / 41.8 | neutral | +| nchwc | 45.4ms | +1.7ms | 43.4 / 48.0 / 44.7 | neutral | +| matmul_scale | 56.9ms | +13.3ms | 51.5 / 58.1 / 61.2 | probable mild regression | +| base_end | 48.3ms | +4.7ms | 45.3 / 56.7 / 43.1 | baseline (end-of-exp drift) | + +**Phase 3 outcome:** No candidates met promotion threshold (29.4ms needed). Baseline is optimal. + +--- + +### Confirmed findings (statistically defensible) + +**1. `matmul-add-fusion` is a confirmed regression on ConvNext CPU (+38ms)** +- All 3 independent runs: 63.0 / 70.8 / 111.2ms — each far above the highest clean baseline run (45.4ms) +- Not attributable to system noise (no run-to-run overlap with baseline distribution) +- Mechanism hypothesis: baseline already converts MatMul+Add→Gemm (37 Gemm in model.onnx); applying matmul-add-fusion on top may create redundant or conflicting kernel dispatch. Unconfirmed — requires profiling. + +**2. `transpose-optimizer` is NEUTRAL on pure inference latency** +- Runs: 42.3 / 52.3 / 41.8ms — overlapping with clean baseline (42.5–45.4ms) +- ⚠️ **CORRECTION OF EARLIER FINDING:** A previous 8-iteration search (using `winml eval`) reported +270ms. That was a measurement artifact — `winml eval` includes HF preprocessing pipeline overhead and has no warmup. It measures *application startup + preprocessing + inference*, not *inference alone*. With `winml perf` (warmup=10, iter=50, pure inference): transpose_opt = baseline. Do not cite the +270ms in any report. + +**3. `nchwc-transformer` is neutral on this model** +- NCHWc SIMD layout: 43.4 / 48.0 / 44.7ms — no benefit for ConvNext CPU inference. + +**4. opset=18 is neutral** +- Same node count (251) as opset=17 — no graph structure changes. Mean slightly above baseline (48ms) is within machine variance. + +**5. No flag improved latency beyond noise. Baseline is the optimal config.** + +--- + +### ⚠️ Critical finding: ORT performance cliff at opset 19 (ConvNext CPU) + +**Experiment:** tested opset 17–22, all with identical graph structure (251 nodes, same op counts) + +| opset | mean p50 | slowdown | +|---|---|---| +| 17 | 43.7ms | — (baseline) | +| 18 | 48.0ms | 1.1× | +| **19** | **160.3ms** | **3.7×** | +| **20** | **131.0ms** | **3.0×** | +| **21** | **170.3ms** | **3.9×** | +| **22** | **85.0ms** | **1.9×** | + +**Key facts:** +- All runs within each opset are consistent (no outliers) — this is real, not noise +- Graph structure is **byte-for-byte identical**: Reshape×72, Transpose×42, Gemm×37, LN×23, Conv×22 for ALL opsets +- The performance difference is entirely in ORT's runtime execution path, not the graph + +**Mechanism: CONFIRMED ROOT CAUSE — ORT `kMaxSupportedOpset` gates Transpose Optimizer** + +Source: `onnxruntime/core/optimizer/transpose_optimization/optimizer_api.h` +```cpp +constexpr int64_t kMaxSupportedOpset = 18; // ORT v1.14.x — bumped each ORT release +``` + +Entry point `onnx_transpose_optimization::Optimize()` → `MakeOptimizerContext()`: +```cpp +if (*opset > kMaxSupportedOpset) { + return std::nullopt; // entire Transpose Optimizer skipped silently +} +``` + +ConvNext has 42 Transpose nodes (NCHW↔NHWC sandwich in every block). The Transpose Optimizer normally: +- Pushes Transposes through Add×18, Mul×18 (layer-scale + residual) across block boundaries +- Cancels adjacent inverse pairs + +When bypassed (opset > kMaxSupportedOpset), all 42 Transposes execute as full memory-layout copies → 3–4× systemic slowdown. + +**ORT optimization level experiment (definitive proof):** + +| Session opt level | opset=17 | opset=19 | ratio | explanation | +|---|---|---|---|---| +| DISABLE_ALL | 47.5ms | **355ms** | **7.5×** | No Transpose Optimizer → all 42 Transposes raw | +| ENABLE_BASIC | 289ms | 315ms | 1.1× | Both slow (re-optimizing pre-fused graph) | +| ENABLE_EXTENDED | 209ms | 241ms | 1.2× | Better but no layout transform | +| **ENABLE_ALL** | 216ms | **215ms** | **1.0×** | Transpose Optimizer runs on both → full parity | + +**`kMaxSupportedOpset` version history:** + +| ORT version | kMaxSupportedOpset | opset ≥ N disabled | +|---|---|---| +| v1.14.x | **18** | ≥ 19 | +| v1.16.x | 19 | ≥ 20 | +| v1.17.x | 20 | ≥ 21 | +| v1.18.x | 21 | ≥ 22 | +| main/HEAD | **26** | fully covered | + +**Classification for optimization-research skill:** `[KNOWN_TRADEOFF]` (intentional design: ORT bumps the ceiling with each ONNX opset release) +- winml-cli ships a specific ORT build → its `kMaxSupportedOpset` is fixed +- winml-cli's **default opset=17 is correct and essential** — it is the safe zone for all current ORT builds +- Raising opset requires ensuring the shipping ORT version has `kMaxSupportedOpset ≥ target_opset` +- Do NOT raise default opset without verifying `kMaxSupportedOpset` in the shipped ORT + +**Call chain:** +``` +InferenceSession::Initialize() + → TransposeOptimizer::ApplyImpl() [transpose_optimizer.cc:18] + → onnx_transpose_optimization::Optimize() + → MakeOptimizerContext() + → if opset > kMaxSupportedOpset: return nullopt ← THE GATE +``` + +--- + +### Inconclusive / do not report + +These show elevated means but cannot be confirmed as regressions given machine variance (p90 = 2–3× p50 throughout): +- `ln_only`, `conv_add`, `conv_act`: each has ≥1 extreme outlier run; other runs are baseline-level +- `gelu_only`: consistently 49–56ms, possibly a mild regression but no outlier; 3 runs insufficient to separate from drift +- `matmul_scale`: all 3 runs elevated (51–61ms), but concurrent baseline also drifted (+5ms); net delta ~+8ms, weak signal + +Do not write these as confirmed regressions in user-facing reports. Label as "inconclusive" or omit. + +--- + +### Measurement methodology correction (winml eval vs winml perf) + +| Tool | What it measures | Latency for ConvNext CPU | +|---|---|---| +| `winml eval` (no warmup, includes preprocessing) | Application-level: model load + HF preprocessing + inference × N | ~67ms/sample | +| `winml perf --warmup 10 --iterations 50` | Pure inference: steady-state kernel execution only | ~43.7ms p50 | +| Difference | HF preprocessing + JIT warmup overhead | ~23ms | + +**Rule for autoconfig skill:** Always use `winml perf` with `--warmup 10 --iterations 50` for latency measurements in experiments. Never use `winml eval` latency to compare configs. + +--- + +### Key insight for autoconfig skill + +- CPU EP on ConvNext: no extra flag tested improved latency. Baseline (no fusions beyond what ORT L2 applies unconditionally) is optimal. +- The only actionable finding is: **do not add `matmul-add-fusion` for ConvNext on CPU** (or any model where baseline already uses Gemm). +- QNN/DML: not yet tested. Guidance on those EPs requires separate validated experiments. + +--- + +### `winml analyze` gaps discovered + +These are cases where analyzing the graph *before* running autoconfig would have prevented wasted search iterations: + +**Gap 1: "Already fused" vs "fuseable" not distinguished** +- ConvNext has `LayerNormalization` as a native op (already fused at PyTorch export) +- `layer-norm-fusion` targets the *decomposed* ReduceMean→Sub→... pattern +- `winml analyze` reports `OP/ai.onnx/LayerNormalization` without indicating it's already in canonical form +- **Impact:** user enables `layer-norm-fusion` thinking it will help; it does nothing (but builds take longer) +- **Fix:** analyze should tag ops as `already_canonical` vs `fuseable_subgraph` + +**Gap 2: DW-Conv not distinguished from regular Conv** +- ConvNext has 18×7x7 DW-Conv (group=C) and 4×regular Conv (group=1) +- `winml analyze` reports all as `OP/ai.onnx/Conv` (undifferentiated) +- QNN EP supports DW-Conv natively (important for NPU efficiency), but EP support classification is per op type, not per `groups` value +- **Impact:** user cannot tell whether Conv ops are the DW or regular variant; EP support may differ +- **Fix:** analyze should emit `OP/ai.onnx/Conv[depthwise]` vs `OP/ai.onnx/Conv[regular]` + +**Gap 3: Transpose-sandwich pattern not detected** +- 42 Transpose nodes in ConvNext form a clear `Conv→Transpose→LN→...→Transpose` repeating pattern +- `transpose-optimizer` turns this into NHWC chains (good for GPU/NPU, bad for CPU) +- `winml analyze` reports Transpose as just `OP/ai.onnx/Transpose` with no structural context +- **Impact:** user cannot predict whether `transpose-optimizer` will help or hurt without running it +- **Fix:** analyze should detect `transpose_sandwich_depth: N` and emit a warning for CPU EP + +**Gap 4: ORT L2 baseline fusions not surfaced** +- After ORT Level 2 optimization (which runs unconditionally), the graph already has fused Gelu, Gemm +- The analyze command runs on the *pre-optimize* export.onnx, not the actual optimized model +- `winml analyze` sees 36×MatMul in export.onnx but the real model at inference has 37×Gemm +- **Impact:** analyze output doesn't reflect what the model actually looks like when running +- **Fix:** analyze should optionally run on `optimized.onnx` (post-ORT-L2), not just `export.onnx` + +**Gap 5: MatMul semantic not classified** +- 36 MatMul ops are all MLP dense layers (4C→C or C→4C expansion) +- No attention MatMuls present (ConvNext has no self-attention) +- QNN handles dense-layer MatMul differently from attention-context MatMul +- `winml analyze` reports `OP/ai.onnx/MatMul` without semantic classification +- **Fix:** analyze could detect MatMul role heuristically (shapes: attention = square-ish, MLP = wide fan-out) + +--- + + + +### Why skill eval matters + +Mobius has no skill eval mechanism — it tests models but not skills themselves. This is a gap. +A SKILL.md can have correct content but still cause the agent to give wrong guidance if the +trigger description is poorly written or the structure is confusing. Skill eval catches this. + +### Two eval dimensions + +| Dimension | What it checks | When to run | +|---|---|---| +| **Static (content quality)** | description trigger phrases, command accuracy, cross-reference validity | Every PR that modifies a SKILL.md | +| **Dynamic (agent behavior)** | Given a user scenario + skill injected, does the agent produce the right commands and diagnosis? | On significant content changes; periodically | + +Static eval = the review checklist in `contributing-a-skill`. +Dynamic eval = test cases in `evals/eval.yaml` per skill, run with `winml skill eval`. + +### `winml skill` — new CLI subcommand + +The eval system is built into winml-cli itself as a new `skill` subcommand. +This keeps the toolchain self-contained and enables CI integration without external dependencies. + +**Command surface:** +```bash +winml skill check [--skill ] # static: lint + auto-verify all commands in SKILL.md +winml skill gen-evals [--skill ] # auto-research: generate eval.yaml from SKILL.md content +winml skill eval [--skill ] # dynamic: run agent behavior tests +winml skill list # list all skills with pass/fail status +``` + +#### `winml skill check` — auto-research via command extraction + +This is the "code change that does auto research": + +1. **Parse SKILL.md** — extract every code block containing `winml ` patterns +2. **Verify flags exist** — run `winml --help` and check each flag is present +3. **Verify cross-references** — confirm every `.agents/skills//SKILL.md` path exists +4. **Verify trigger coverage** — count quoted phrases in `description` frontmatter (must be ≥3) +5. **Optionally run commands** — with `--dry-run-commands`, execute each command on a + canary model to verify it doesn't crash + +Example output: +``` +winml skill check --skill debug-accuracy-drop + +Checking debug-accuracy-drop... + ✓ description: 4 trigger phrases found + ✓ winml eval --mode compare [flag verified against eval --help] + ✓ winml analyze -m ... --ep qnn [flag verified against analyze --help] + ✗ winml perf --monitor [flag '--monitor' not found in perf --help] ← STALE + ✓ cross-ref: ep-compatibility-check/SKILL.md exists + ✗ cross-ref: validate-before-ship/SKILL.md [file missing] ← BROKEN LINK +Summary: 2 issues found +``` + +Key insight: **every time winml-cli flags change, `winml skill check` automatically +detects which skills have stale commands** — no manual audit needed. + +Implementation sketch (`src/winml/modelkit/commands/skill.py`): +```python +import re, subprocess +from pathlib import Path +import click + +SKILLS_DIR = Path(__file__).parents[5] / "skills" +WINML_CMD_PATTERN = re.compile(r'^\s*(winml\s+\w[\w\-]*\s+[^\n]+)', re.MULTILINE) + +def extract_commands(skill_md: str) -> list[str]: + """Extract all 'winml ...' lines from code blocks.""" + in_block = False + commands = [] + for line in skill_md.splitlines(): + if line.strip().startswith("```"): + in_block = not in_block + elif in_block and line.strip().startswith("winml "): + commands.append(line.strip()) + return commands + +def verify_flag(command_line: str) -> tuple[bool, str]: + """Check flags in a command line exist in --help output.""" + parts = command_line.split() + subcommand = parts[1] + flags = [p for p in parts[2:] if p.startswith("--")] + result = subprocess.run(["winml", subcommand, "--help"], + capture_output=True, text=True) + help_text = result.stdout + for flag in flags: + if flag not in help_text: + return False, f"flag '{flag}' not found in {subcommand} --help" + return True, "ok" + +@click.group("skill") +def skill_cmd(): + """Manage and evaluate winml-cli skills.""" + +@skill_cmd.command("check") +@click.option("--skill", default=None, help="Skill name to check (default: all)") +@click.option("--dry-run-commands", is_flag=True, help="Execute commands on canary model") +def check(skill, dry_run_commands): + """Static check: verify commands and cross-references in SKILL.md files.""" + targets = [SKILLS_DIR / skill] if skill else list(SKILLS_DIR.iterdir()) + for skill_dir in targets: + skill_md = (skill_dir / "SKILL.md").read_text() + for cmd in extract_commands(skill_md): + ok, msg = verify_flag(cmd) + status = "✓" if ok else "✗ STALE" + click.echo(f" {status} {cmd[:60]}") +``` + +#### `winml skill gen-evals` — LLM-powered eval case generation + +Auto-generates `evals/eval.yaml` from SKILL.md content using an LLM: + +1. **Extract trigger phrases** from `description` frontmatter +2. **Extract symptom→fix tables** from SKILL.md sections +3. **Prompt an LLM** to generate (user scenario, expected commands) pairs +4. **Write `evals/eval.yaml`** in PromptFoo format + +This is "auto research": the LLM reads the skill and generates adversarial cases +that challenge the agent — including negative cases where the agent should NOT +recommend something. + +```bash +winml skill gen-evals --skill debug-accuracy-drop --model gpt-4o --count 5 +# Writes: skills/debug-accuracy-drop/evals/eval.yaml (auto-generated) +# Human review before committing +``` + +The generated eval.yaml is a starting point — contributors review and refine before +committing. Over time, real user questions (from GitHub issues) can be mined and +added as additional eval cases. + +#### `winml skill eval` — agent behavior testing + +Runs the eval cases and reports results: + +```bash +winml skill eval --skill debug-accuracy-drop +# Uses evals/eval.yaml + injects SKILL.md as system prompt +# Reports pass/fail per test case +``` + +Internally shells out to PromptFoo (if installed) or uses a lightweight built-in runner +that calls the configured LLM API directly. + +### Directory layout + +Each skill carries its own eval cases: +``` +skills/ + debug-accuracy-drop/ + SKILL.md + evals/ + eval.yaml ← agent behavior test cases (hand-written or gen-evals output) +``` + +### eval.yaml format (PromptFoo) + +```yaml +# skills/debug-accuracy-drop/evals/eval.yaml +description: "Agent behavior eval for debug-accuracy-drop skill" + +prompts: + - "{{user_message}}" + +providers: + - id: openai:gpt-4o + config: + systemPrompt: | + You are a WinML CLI assistant. Use the following skill: + --- + {{skill_content}} + +tests: + - description: "Low cosine after W8A8 — should isolate to quantize stage" + vars: + user_message: "I quantized my model to W8A8 and cosine similarity is 0.87. What's wrong?" + assert: + - type: contains + value: "winml eval --mode compare" + - type: icontains + value: "quantize" + - type: icontains + value: "w8a16" # should suggest escalating precision + + - description: "NPU vs CPU discrepancy — should point to op fallback" + vars: + user_message: "My model gives different results on QNN NPU vs CPU after compile" + assert: + - type: contains + value: "winml analyze" + - type: icontains + value: "partial" # mention partial op fallback + - type: icontains + value: "compile" # blame compile stage, not quantize + + - description: "Drop after optimize only — should NOT blame calibration" + vars: + user_message: "cosine similarity dropped after winml optimize, I haven't quantized yet" + assert: + - type: contains + value: "winml eval --mode compare" + - type: icontains + value: "optimize" + - type: not-icontains + value: "calibration" # calibration is irrelevant here +``` + +### Minimum eval cases per skill + +| Skill | Min cases | Key assertions | +|---|---|---| +| `ep-compatibility-check` | 3 | Recommends 3-layer check in order; gives fallback when EP absent | +| `debug-accuracy-drop` | 4 | Correctly isolates pipeline stage; suggests precision escalation | +| `validate-before-ship` | 3 | Lists all 6 gates; handles waiver scenario | +| `optimize-for-device` | 3 | Applies latency-budget vs accuracy-budget framework correctly | +| `prepare-for-winapp` | 2 | Produces manifest.json structure; includes CPU fallback | +| `adding-model-support` | 2 | Suggests L1→L5 order; correct recipe structure | +| `contributing-a-skill` | 2 | Flags missing trigger phrases; flags pseudocode commands | + +### What "passing" means + +An eval case passes when all assertions hold. Recommended pass threshold before merging: +- All `contains` / `icontains` assertions pass +- All `not-icontains` (negative) assertions pass (agent does NOT give wrong advice) + +The negative assertions are the most valuable — they catch the agent confidently giving +wrong guidance (e.g., blaming calibration for an optimize-stage drop). + +### Running evals + +```bash +# Install PromptFoo +npm install -g promptfoo + +# Run eval for a single skill +cd skills/debug-accuracy-drop +promptfoo eval --config evals/eval.yaml + +# Run all skill evals +for dir in skills/*/; do + if [ -f "$dir/evals/eval.yaml" ]; then + promptfoo eval --config "$dir/evals/eval.yaml" + fi +done +``` + +--- + +## Implementation notes + +### Directory structure +``` +skills/ + use-winml-cli/ ← existing, extend + SKILL.md + evals/eval.yaml + optimize-for-device/ ← new (consumer) + SKILL.md + evals/eval.yaml + debug-accuracy-drop/ ← new (consumer) + SKILL.md + evals/eval.yaml + prepare-for-winapp/ ← new (consumer, partial dep on winml package feature) + SKILL.md + evals/eval.yaml + ep-compatibility-check/ ← new (consumer) + SKILL.md + evals/eval.yaml + validate-before-ship/ ← new (consumer) + SKILL.md + evals/eval.yaml + adding-model-support/ ← new (contributor) + SKILL.md + evals/eval.yaml + adding-ep-support/ ← new (contributor) + SKILL.md + evals/eval.yaml + contributing-a-skill/ ← new (contributor) + SKILL.md + evals/eval.yaml + autoconfig/ ← new (consumer — autoresearch loop for external users) + SKILL.md + evals/eval.yaml + optimization-research/ ← new (internal — deep gap analysis for winml-cli team) + SKILL.md + templates/olive_qnn.json + templates/olive_dml.json + evals/eval.yaml +``` + +### Priority order for implementation +**Code changes first (unblocks agentic skill execution):** +0. `winml eval --format json` — critical: enables all accuracy-related agentic flows +0. `winml analyze --format json` — enables EP compatibility agentic flows +0. `winml perf --format json` — enables performance SLA agentic flows + +**Consumer skills:** +1. `ep-compatibility-check` — lowest risk, pure existing commands, high value for new users +2. `debug-accuracy-drop` — closes clearest pain point, existing `eval --mode compare` +3. `validate-before-ship` — most complete checklist, builds on 1+2 +4. `optimize-for-device` — needs good hardware reference data to be accurate +5. `prepare-for-winapp` — needs `winml package` feature or clear workaround documented +6. `autoconfig` — depends on #847/#848/#849 + most complex skill to implement + +**Contributor skills:** +6. `contributing-a-skill` — enables community contributions to the skill ecosystem +7. `adding-model-support` — most impactful for model coverage growth +8. `adding-ep-support` — lower frequency, but needed for new EP onboarding + +### Required code changes for agentic skill execution + +The three changes that turn skills from documentation into agentic programs: + +**1. `winml eval --format json`** + +File: `src/winml/modelkit/commands/eval.py` + +Add `--format` option and emit structured JSON to stdout: +```json +{ + "mode": "compare", + "model": "path/to/quantized.onnx", + "model_id": "microsoft/resnet-50", + "metrics": { + "cosine_similarity": 0.87, + "sqnr_db": 28.3, + "psnr_db": 31.1, + "max_abs_diff": 0.042 + }, + "task_metric": { "top1_accuracy": 0.741 }, + "threshold_pass": false +} +``` + +**2. `winml analyze --format json`** + +File: `src/winml/modelkit/commands/analyze.py` + +Already supports `--output file.json`. Add `--format json` to also print to stdout +(mirrors pattern from `winml inspect` and `winml sys`): +```json +{ + "ep": "qnn", + "model": "path/to/model.onnx", + "summary": { "supported": 142, "partial": 3, "unsupported": 1 }, + "partial_ops": ["MultiHeadAttention", "LayerNorm", "Softmax"], + "unsupported_ops": ["CustomRotaryEmbedding"] +} +``` + +**3. `winml perf --format json`** + +File: `src/winml/modelkit/commands/perf.py` + +Already writes JSON to file via `-o`. Add `--format json` stdout output: +```json +{ + "model": "path/to/model.onnx", + "ep": "qnn", + "device": "npu", + "iterations": 100, + "latency_ms": { "p50": 18.3, "p90": 21.7, "p99": 28.4, "mean": 18.9 }, + "throughput_rps": 54.6 +} +``` + +These three changes are ~50 lines of code each, follow the existing pattern from +`winml inspect --format json` and `winml sys --format json`, and unlock the full +agentic execution model for all consumer skills. + +### Sizing estimate (per skill) +Each SKILL.md based on Mobius patterns (~8–14KB): +- ~200 lines prose + decision tables +- ~50 lines code examples +- Cross-reference section + +### Relationship to existing `use-winml-cli` skill +The new skills are **task-scoped** (problem → solution) vs the existing skill which is +**tool-scoped** (here's what each command does). They complement, not replace each other. +The existing skill should add cross-references to the new skills in its "Common patterns" section. + +--- + +## QNN NPU Catalog Sweep — Findings & Feature Gaps (2026-06-13) + +Source: 8-model catalog sweep via autoconfig POC (C:\tmp\autoconfig-demo\catalog_qnn_sweep.py) + +### Cross-model results + +| Model | Arch | Baseline p50 | Best p50 | Gain | Best config | +|-------|------|-------------|----------|------|-------------| +| microsoft/resnet-18 | resnet | 0.96ms | 0.96ms | — | baseline (opset17) | +| google/vit-base-patch16-224 | vit | 9.04ms | 9.04ms | — | baseline (opset17) | +| apple/mobilevit-small | mobilevit | 12.07ms | **8.62ms** | +29% | opset21+conv_fusions | +| facebook/dinov2-small | dinov2 | 6.56ms | **4.98ms** | +24% | opset21 | +| hustvl/yolos-small | yolos | 78.69ms | — | timeout | — | +| distilbert SST-2 | distilbert | 19.48ms | 19.48ms | — | baseline | +| all-MiniLM-L6-v2 | bert | 5.81ms | 5.81ms | — | baseline | +| deepset/roberta-base-squad2 | roberta | 14.94ms | 14.72ms | 1.5% | opset21 | + +### Validated KB findings + +**npu-001 refined**: opset21 benefit is architecture-gated: +- ✅ Conv + residual connections: +25–31% (mobilevit, dinov2, convnext) +- ❌ Pure transformer (ViT, YOLOS): -7% or neutral +- ⚪ NLP BERT-family: neutral + +**npu-006 NEW — CRITICAL**: Conv fusions (conv-bn/add/activation) cause catastrophic QNN NPU CPU fallback +- ResNet-18 with conv fusions: 0.96ms → 132ms (+4900% regression) +- MobileViT: safe (no regression) +- Severity: critical — can produce 50x+ regression silently + +**npu-007 NEW**: DVFS thermal noise makes CV gate unreliable on QNN NPU +- New bench protocol: 3 sessions × 500 iters + 30s cool-down + median p50 + >10% noise floor + +### Feature gaps (winml-cli backlog items) + +**Gap A: winml analyze — Conv fusion QNN safety check** +winml analyze should detect Conv-dominant topologies and warn when conv-bn/add/activation +fusions are configured for QNN NPU target. Currently no pre-build detection of this hazard. +- Command to add: warning in analyze output when ep=qnn AND conv_fusion_pass is enabled AND model has >N Conv ops +- Priority: HIGH (silent 50x regression risk) + +**Gap B: budget-aware sweep in autoconfig** +Large models (YOLOS, ~78ms/inf) cause sweep timeout with current fixed budget. +Need: per-hypothesis time estimation → auto-skip models that exceed budget, log as "timeout" not failure. +- Affects: autoconfig POC and any future winml sweep command + +**Gap C: winml perf DVFS-aware session averaging** +winml perf should natively support session-level median aggregation for QNN NPU. +Current single-session variance is dominated by DVFS thermal state, not model performance. +- Flag proposal: --sessions 3 --cool-down 30 --signal median-p50 +- This would make winml perf output trustworthy for optimization decisions on Snapdragon X Elite + +--- + +## Feature Request: FusedConv detection + unfuse-for-qnn (2026-06-15) + +### Problem + +用户可能从外部拿到一个已经做过 Conv fusion 的 ONNX 模型,或者 autoconfig 实验里开了 conv-add-activation-fusion flag。 +这类模型在 QNN NPU 上跑起来特别慢(ResNet-18 实测 +4900% regression),但没有任何报错,用户完全不知道原因。 + +### Root cause + +conv-add-activation-fusion 生成的是 ORT 扩展 op FusedConv(非标准 ONNX op)。 +QNN EP 不认识这个 op,所有 FusedConv 节点全部 fallback 到 CPU,PCIe round-trip 开销极大。 + +conv-bn-fusion 不同:它把 BN 参数数学吸收进 Conv weight,不产生新 op 类型,结果仍是标准 Conv,**不可逆**。 + +### Proposed feature + +**1. winml analyze — FusedConv detection** + +winml analyze -m model.onnx --ep qnn 扫描图中所有节点, +如果发现 FusedConv 节点且目标 EP 为 QNN,输出警告: + +` +⚠ QNN NPU: 23 FusedConv nodes detected. + FusedConv is an ORT-internal op not supported by QNN EP — these nodes will fall back to CPU. + Recommend: run winml optimize --unfuse-conv to expand back to standard ONNX ops. +` + +**2. winml optimize --unfuse-conv** + +新增 optimize pass:把 FusedConv 节点拆回 Conv + Add + 。 +- Lossless(权重不变,只拆 op 结构) +- 输出标准 ONNX,QNN EP 可正常映射 HTP kernel +- 适用场景:BYOM 用户带入已做过 fusion 的模型 + +**Implementation notes** +- 检测: +ode.op_type == "FusedConv" 即可定位 +- 拆分:读 FusedConv attribute ctivation 字段 → 插入对应 Relu/Sigmoid/Tanh 节点 +- 不处理 conv-bn-fusion 产生的模型(那个无法反向,只能重新从 FP32 export) + +### Priority +MEDIUM — 默认 flag 是关的,不是高频路径,但对 BYOM 场景(拿到别人优化过的模型)有实际价值。 From 6de0e6b3eda4f485e856a3d4450375dd7ee1a9e5 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 15 Jun 2026 14:06:05 +0800 Subject: [PATCH 04/38] research: reorganize skills doc into user/contributor + merge overlapping skills - Split skill catalog into two ranked categories by the 'does it touch code?' discriminator: User (config-only) and Contributor (code changes) - Merge overlapping skills (12 -> 9): - check-model-feasibility = find-a-model + ep-compatibility-check - ship-to-winapp = validate-before-ship + prepare-for-winapp - autoconfig absorbs optimize-for-device as its manual mode - Add self-contained HTML render of the design doc for easier reading --- research/autoconfig/docs/skills-design.html | 3784 +++++++++++++++++++ research/autoconfig/docs/skills-design.md | 672 ++-- 2 files changed, 4131 insertions(+), 325 deletions(-) create mode 100644 research/autoconfig/docs/skills-design.html diff --git a/research/autoconfig/docs/skills-design.html b/research/autoconfig/docs/skills-design.html new file mode 100644 index 000000000..c52ede8b1 --- /dev/null +++ b/research/autoconfig/docs/skills-design.html @@ -0,0 +1,3784 @@ + + + + + +WinML CLI Skills Design Doc + + + +
+ +
+

WinML CLI Skills Design Doc

+

Overview

+

This document defines the design for 9 skills to be added to skills/ in winml-cli. +Skills are split into two categories by the single question: does the task require editing repo code?

+
    +
  • User skills (5) — the user reaches their goal purely by specifying conditions and letting + winml-cli produce or modify a config.json / manifest.json / report. No source code is touched. + Audience: WinApp developers and ISVs deploying models.
  • +
  • Contributor skills (4) — the task requires a winml-cli source-code change (a new exporter, a new + EP backend, a new skill), or exists specifically to produce code-change backlog. Audience: winml-cli engineers.
  • +
+
+

Discriminator: if the deliverable is a config/manifest/report, it is a User skill. If completing it +requires editing code in the repo (or its whole purpose is to drive such edits), it is a Contributor skill.

+
+

Each skill follows the SKILL.md frontmatter convention (name:, description:) established +by Mobius, NVIDIA Model-Optimizer, and Google LiteRT-CLI as the de facto standard.

+

User skills — ranked by importance

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
RankSkillWhy it ranks hereOutput (no code)
1autoconfigFlagship. Autonomously searches the config space and delivers the optimal config.json per EP. Also hosts the manual optimize path (precision-ladder + latency/accuracy-budget decision framework + hardware table) for users who want to choose by hand or have no target hardware. Maps to all five user scenarios (S1–S5).config_<ep>_optimal.json + report.html
2check-model-feasibilityPre-build front door, merging model discovery + EP/device compatibility: "find me a supported model from my constraints, then confirm it runs on my hardware." The single "what do I run, and will it run?" gate (inspectsysanalyze). Highest frequency — every user hits it before building.model shortlist + go/no-go + fallback EP
3debug-accuracy-dropCloses the most acute pain point: accuracy dropped, cause unknown. High-frequency diagnostic need with the clearest existing tooling (eval --mode compare).stage + root cause + fix
4ship-to-winappShip-time skill, merging validation + packaging: L1–L5 Definition-of-Done gates plus multi-EP artifact layout, manifest.json, and runtime EP selection. Everything between "the model is good" and "it's running in the app."pass/fail report + manifest.json
5use-winml-cliGeneral tool-scoped onboarding reference (existing). Foundational but low differentiation vs the task-scoped skills above.command reference
+

Contributor skills — ranked by importance

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
RankSkillWhy it ranks hereCode touched
1adding-model-supportDirectly grows model coverage — the core long-tail business problem (ISV onboarding, S2/S5). Highest contribution frequency.new exporter + recipe
2optimization-researchHigh leverage: deep-searches ORT/Olive/ecosystem to find gaps and file the backlog that drives every other contributor skill. Internal, but sets the roadmap.files issues + repro (drives code changes)
3adding-ep-supportOnboards a new execution-provider backend. Infrequent, but high value the moment a new NPU vendor lands.compile backend + EP registry
4contributing-a-skillMeta-tooling: how to author, lint, and eval a SKILL.md. Sustains the ecosystem but is supporting infrastructure, not a direct model/EP/perf deliverable.SKILL.md + evals
+
+

The detailed ## Skill: sections below appear in document order, not priority order. Importance is +defined by the two ranked tables above; implementation sequencing (risk/dependency-driven) is in +Priority order for implementation.

+
+

User skill dependency graph

+
check-model-feasibility ──► autoconfig ──────────► ship-to-winapp
+  find a supported model      optimize the model      validate (L1–L5 gates)
+  + confirm EP/device runs     (automated autoresearch  + package multi-EP artifacts
+                               loop OR manual framework)  + manifest + runtime EP selection
+          │                         │                          ▲
+          └──────────► debug-accuracy-drop ───────────────────┘
+                       (diagnose accuracy drops at any stage)
+
+use-winml-cli ── general command reference; underpins every step above
+
+ +

Contributor research skill

+
optimization-research ──► [GitHub issues / winml backlog]
+  (deep search: ORT source + Olive + ONNX ecosystem + native stack models
+   → find better solutions → diagnose winml gaps → produce work items)
+
+ +

Contributor skill dependency graph

+
adding-model-support ──► contributing-a-skill
+adding-ep-support    ──► contributing-a-skill
+
+ +
+

Design principle: Skills as agentic workflows

+

The shift: documentation → automation

+

Current state (most skills in the ecosystem):

+
+

Skill tells the user what commands to run → user runs them → user interprets output

+
+

Target state for winml-cli:

+
+

Skill tells the agent what commands to run → agent runs them → agent interprets output → agent gives a specific answer

+
+

The difference:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Documentation skillAgentic skill
Agent sees low cosine"Run winml eval --mode compare"Runs it, reads cosine=0.87, says "drop at quantize stage, Attention layers"
EP compatibility"Run winml sys then winml analyze"Runs both, parses JSON, says "QNN available but LayerNorm is partial"
Optimize precision"Use the decision framework"Runs fp16/w8a16/w8a8 sweep, builds actual tradeoff table, recommends W8A16
Validate before ship"Check these 6 gates"Runs all 6 gates, generates a pass/fail report with actual numbers
+

This is only possible if skills describe a GATHER → ANALYZE → DECIDE → ACT workflow, +and winml-cli commands emit machine-readable structured output that the agent can parse.

+

Structured output: current state and gaps

+

Copilot agents have shell tool access and can run winml commands directly. +The key requirement is --format json on stdout so the agent can parse results +without screen-scraping Rich/ANSI terminal output.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CommandStructured output todayGap
winml inspect--format json (stdout)None
winml sys--format json (stdout)None
winml run--format json (stdout)None
winml analyze--output file.json (file only)Add --format json stdout
winml perf--output file.json (file only)Add --format json stdout
winml eval✗ No structured outputAdd --format json stdout
+

Required code changes (enables agentic skill execution): +1. winml eval --format json — outputs {cosine, sqnr, psnr, task_metric} to stdout +2. winml analyze --format json — outputs {supported: [...], partial: [...], unsupported: [...]} to stdout +3. winml perf --format json — outputs {p50_ms, p90_ms, p99_ms, mean_ms} to stdout

+

The GATHER → ANALYZE → DECIDE → ACT skill structure

+

Each skill section should be written with agent execution in mind:

+
## GATHER: what to run
+Commands the agent runs first (with --format json) to collect facts.
+
+## ANALYZE: what to look for
+How to interpret the JSON output. What values matter. What thresholds to apply.
+
+## DECIDE: what to recommend
+Decision logic. If X → recommend Y. If A and B → recommend C.
+
+## ACT: what to tell the user
+What to surface to the user: specific diagnosis + specific next step.
+
+ +

In practice this maps onto the existing "Sections" structure — the key is ensuring +each section has concrete commands to run and concrete interpretation rules, +not just prose description.

+

Example: debug-accuracy-drop as an agentic workflow

+
User: "My W8A8 model has low accuracy"
+
+GATHER:
+  agent runs: winml eval --mode compare -m quantized.onnx --model-id <id> --format json
+  agent gets: {"cosine_similarity": 0.87, "sqnr_db": 28.3, "stage": "quantize"}
+
+ANALYZE:
+  cosine=0.87 < 0.90 threshold → problem is real
+  sqnr=28.3 < 30 dB → significant degradation
+  stage=quantize → problem introduced at quantize, not optimize or compile
+
+DECIDE:
+  quantize-stage drop on W8A8 → check if Attention layers are the culprit
+  agent runs: winml analyze -m quantized.onnx --ep qnn --format json
+  agent gets: {"partial": ["MultiHeadAttention", "LayerNorm"], "unsupported": []}
+
+ACT:
+  Agent: "The accuracy drop (cosine=0.87) is at the quantize stage.
+          MultiHeadAttention is partial on QNN — activations may be falling back to FP32.
+          Try W8A16 to keep activations at FP16: winml build -c config.json --precision w8a16"
+
+ +

Without structured output (--format json), the agent would have to tell the user to run +each step manually and paste the results back. With structured output, the agent runs the +full diagnostic in one turn.

+
+

Validation confidence levels (L1–L5)

+

Inspired by Mobius writing-tests. Applied in ship-to-winapp as the Definition-of-Done backbone. +Each level is checked independently — a model can pass L3 without passing L2.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
LevelNameWhat it verifiesKey command
L1LoadableArtifact is valid ONNX, loads without errorwinml inspect -m <artifact>
L2Shape correctOutput shape matches expected specwinml eval -m <artifact> --model-id <model> (check shape in output)
L3Numerical parityOutput matches FP32 baseline (cosine ≥ 0.99 FP16, ≥ 0.95 W8A16, ≥ 0.90 W8A8)winml eval --mode compare -m <artifact> --model-id <model>
L4Task accuracyTask metric (Top-1/F1/mAP) within acceptable drop from FP32 referencewinml eval -m <artifact> --model-id <model> (task metric)
L5Production readyPerf SLA met on target device + cross-EP consistency verifiedwinml perf --iterations 100 --monitor
+

Quick pass criteria:

+ + + + + + + + + + + + + + + + + + + + + +
PrecisionL3 threshold
FP16cosine_similarity ≥ 0.99
W8A16cosine_similarity ≥ 0.95
W8A8cosine_similarity ≥ 0.90 (or task-specific)
+

Waivers: any level that cannot be verified must be documented with a reason and tracking issue. +The ship-to-winapp skill maps each of its 6 validation gates to an L-level.

+
+
+

Competitive Analysis

+

Summary

+

winml-cli has a solid optimization pipeline (export→quantize→compile→benchmark) but lacks the debugging/diagnostic loop, accuracy recovery tooling, and developer observability that distinguish great toolchains from adequate ones.

+
+

Competitor Feature Matrix

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
FeatureAppleExecuTorchAI HubNVIDIAOpenVINOOptimumOlivewinml-cli
Per-layer accuracy debugging✅ SVG graph✅ cloud
Compute unit utilization reportPartial
Accuracy-Aware PTQ (auto layer rollback)✅ NNCF
Standard NLP benchmark (MMLU/PPL)
Cross-EP side-by-side comparePartial
Zero-deploy validation (model.predict)✅ macOS✅ cloudPartial
Pre-quantized model zoo✅ 500+✅ HF org
One-line optimize command
Multi-EP artifact packaging✅ .mlpackage✅ .pte
QAT / accuracy recovery fine-tuning✅ AIMET
Advanced quant (AWQ/SmoothQuant)✅ NNCF
Thermal/sustained-load profiling
+
+

Competitor Deep Dives

+

Apple coremltools

+

Most relevant: zero-deploy validation + compute_units API + palettization

+
    +
  • model.predict({'input': np_array}) — validates converted model in one Python call without any device deploy. Can force ComputeUnit.CPU_ONLY for numerical comparison vs CPU_AND_NE.
  • +
  • compute_units is switchable at prediction time (not just compile time) — enables A/B testing EP performance without re-converting.
  • +
  • Palettization: LUT-based weight compression at 1–8 bits (k-means clustering, not linear quant). Matches Neural Engine hardware kernels better than INT4 linear quantization for many models.
  • +
  • Three compression workflows: data-free / calibration-based / fine-tuning-based (QAT).
  • +
  • .mlpackage separates architecture from weights → streaming-friendly, supports on-device compilation after download.
  • +
+

ExecuTorch (Meta)

+

Most relevant: per-layer QNN accuracy debugging (best-in-class of all competitors)

+
    +
  • QNNIntermediateDebugger: dumps intermediate tensor outputs at every QNN op, computes cosine similarity per layer vs CPU reference, generates color-coded SVG computation graph (green ≥ 0.9, red < 0.9).
  • +
  • get_delegation_info(): table of ops showing delegated-to-NPU count vs CPU-fallback count per op type.
  • +
  • ETDump + Inspector API: per-op timing table with avg (ms), op type, is_delegated. Returns pandas DataFrame.
  • +
  • QAIRT Visualizer: pip install qairt-visualizer — interactive GUI overlaying op trace + QHAS (QNN HTP Analysis Summary) on model graph.
  • +
  • Missing: no cloud device testing, no automated accuracy-latency sweep, build process is complex.
  • +
+

Qualcomm AI Hub

+

Most relevant: cloud profiling with physical hardware, per-step memory breakdown

+
    +
  • Compile + Profile + Inference on real physical devices (Snapdragon X Elite laptops, Galaxy S24) in the cloud — no local hardware needed.
  • +
  • Per-step memory profiling: compilation time/memory, first-load time/memory (NE optimization), subsequent-load (cached), inference latency.
  • +
  • 500+ pre-optimized models in model zoo.
  • +
  • --clone j1glw6y8p — clone any previous job with modified params.
  • +
  • Cloud AIMET quantization: sophisticated PTQ as a service (submit_quantize_job()).
  • +
+

NVIDIA ModelOpt

+

Most relevant: 16 compression techniques + MMLU benchmark scripts + pre-quantized HF checkpoints

+
    +
  • Compression techniques beyond PTQ: AWQ, SmoothQuant, QAT, pruning (Minitron 33% smaller, 50% faster), distillation, speculative decoding, sparsity, NAS (Puzzletron).
  • +
  • Windows accuracy benchmark: mmlu_benchmark.py (57 subjects, DirectML/ORT/TensorRT-LLM/CPU), perplexity on WikiText-2, KL-divergence metrics.
  • +
  • Pre-quantized HF checkpoints: nvidia/DeepSeek-R1-FP4, nvidia/Llama-3.3-70B-FP4 etc. — pull validated optimized models without running pipeline.
  • +
+

Intel OpenVINO + NNCF

+

Most relevant: Accuracy-Aware PTQ (auto layer rollback)

+
    +
  • NNCF AccuracyAwareQuantization: automatically identifies sensitivity of each layer to quantization, rolls back sensitive layers to float when accuracy drop exceeds threshold. Fully automated accuracy-performance tradeoff solver.
  • +
  • benchmark_app -hint latency vs -hint throughput: auto-configures streams, batch, inference requests for each mode. -d AUTO: automatic device selection with fallback.
  • +
  • 100+ Jupyter notebooks on Binder/Colab — zero setup barrier.
  • +
  • OpenVINO GenAI: high-level LLMPipeline, WhisperPipeline — deploy-ready LLM inference in 5 lines.
  • +
+

HuggingFace Optimum

+

Most relevant: drop-in Transformers replacement + multi-backend hub

+
    +
  • Replace AutoModelForSequenceClassification.from_pretrained() with ORTModelForSequenceClassification.from_pretrained() → ONNX Runtime inference with zero code change.
  • +
  • 8 hardware backends: ONNX Runtime, OpenVINO, NVIDIA TensorRT-LLM, AMD Ryzen AI, AWS Inferentia, ExecuTorch, Intel Gaudi, FuriosaAI.
  • +
  • Task-aware export: --task text-generation auto-configures dynamic axes and model wrapping.
  • +
+

Microsoft Olive (direct competitor)

+

Most relevant: one-line optimize command + VS Code AI Toolkit

+
    +
  • olive optimize --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct --precision int4 --output_path models/qwen — one command, no per-step config.
  • +
  • JSON-based pipeline config for full declarative multi-step control.
  • +
  • VS Code AI Toolkit extension: GUI for model optimization, fine-tuning, and inference testing — no CLI knowledge needed.
  • +
  • MultiLoRA serving support.
  • +
+
+

Top 5 High-Impact Gaps for winml-cli

+

🔴 Gap 1: Per-Layer Accuracy Debugging

+

Pain: Accuracy degrades after QNN compilation/quantization, user has no idea which layer caused it. Currently requires QNN SDK expert knowledge.

+

Solution: winml debug --model model.onnx --ep qnn --inputs calibration_data/ +1. Runs model on CPU and QNN, captures intermediate tensor outputs at each op +2. Computes cosine similarity per layer +3. Outputs HTML/SVG graph with color-coded accuracy (green/red per layer)

+

Reference: ExecuTorch QNNIntermediateDebuggerOutputFormat.SVG_GRAPH + QcomCosineSimilarityComparator

+

Impact: Turns multi-day debugging into a 30-minute diagnosis. Currently no Windows-on-NPU tool does this.

+
+

🔴 Gap 2: Compute Unit Utilization Report

+

Pain: winml perf shows slower-than-expected latency with no explanation. User doesn't know what % of ops ran on NPU vs fell back to CPU.

+

Solution: Extend winml analyze to output delegation table:

+
Op Type         | NPU Delegated | CPU Fallback | Reason
+----------------|---------------|--------------|------------------
+MatMul (INT8)   | 47 / 47       | 0            | -
+LayerNorm       |  0 / 12       | 12           | Unsupported dtype
+Softmax (FP32)  |  0 /  6       |  6           | Requires INT8 input
+
+ +

Reference: ExecuTorch get_delegation_info().get_operator_delegation_dataframe() / AI Hub per-layer compute unit mapping

+

Impact: Directly actionable — if user sees "60% of ops on CPU due to unsupported dtype," they know to switch to W8A8.

+
+

🟠 Gap 3: Quantization Sensitivity Analysis

+

Pain: winml quantize --algo w8a8 produces a model with unacceptable accuracy. User doesn't know if it's a specific layer, the algorithm, or the calibration data.

+

Solution: winml analyze-quant --model model.onnx --calibration data/ --eval-dataset eval/ +1. Run full W8A8 quantization +2. For each block/layer, measure accuracy impact of reverting to FP16 +3. Rank layers by sensitivity +4. Report: "reverting 3 attention layers to FP16 recovers X% accuracy at Y% latency cost"

+

Reference: Intel NNCF AccuracyAwareQuantization (automatic per-layer rollback)

+

Impact: Replaces multi-day trial-and-error with a 10-minute automated report.

+
+

🟠 Gap 4: Standard Benchmark Integration (MMLU / Perplexity)

+

Pain: winml eval supports custom scripts but no out-of-box standard benchmarks. Users have no reference point for whether their quantized model's accuracy is "expected."

+

Solution: winml eval --model model.onnx --benchmark mmlu --ep qnn +- Built-in MMLU (57 subjects), WikiText-2 perplexity, KL-divergence scripts +- Reference numbers from FP32 baseline shown alongside quantized result +- FP16 baseline: 78.2% → W8A8 QNN: 77.9% (−0.3%, expected range: −0.1% to −0.5%)

+

Reference: NVIDIA ModelOpt examples/windows/accuracy_benchmark/mmlu_benchmark.py supports DirectML/ORT/CPU

+

Impact: Removes ambiguity and creates trust. Critical for LLM users.

+
+

🟡 Gap 5: Cross-EP Side-by-Side Comparison

+

Pain: Choosing between QNN/DirectML/CPU/OpenVINO requires running each EP manually and aggregating results. No tool does this automatically.

+

Solution: winml sweep --model model.onnx --precision w8a16,fp16 --ep qnn,dml,cpu +- Runs build+eval+perf for each (precision × EP) combination +- Outputs a single comparison table: accuracy / latency / op coverage % +- Agent-driven: skill reads JSON output and recommends the optimal combination

+

Reference: Truly unique — no competitor does this for Windows multi-EP. Closest is AI Hub's multi-device fleet testing (Android only).

+

Impact: The single most-requested decision for Windows AI developers. Unique to winml-cli.

+
+

Patterns in Great Toolchain DX

+

Pattern 1: The "Why" Feedback Loop +Great toolchains explain why results are the way they are. ExecuTorch's delegation table, AI Hub's compute unit mapping, NNCF's layer sensitivity analysis all answer "why?" winml-cli currently stops at "here's the result."

+

Pattern 2: Progressive Disclosure of Complexity +- Olive: olive optimize --precision int4 (one line) → full JSON config pipeline +- coremltools: ct.convert(model) → MIL IR manipulation +- AI Hub: web dashboard → Python SDK → CLI → AIMET configs

+

winml-cli is currently too close to the expert path: each step requires understanding EP-specific options.

+

Pattern 3: Zero-Deploy Validation +Every strong toolchain lets you test model output before deploying to hardware: coremltools model.predict(), ExecuTorch Python pybind, AI Hub submit_inference_job(). winml-cli is strong for CPU but lacks the quick "compare CPU vs QNN output" path.

+

Pattern 4: Pre-Validated Model Artifacts +ModelOpt (HF nvidia/ org), AI Hub (500+ models), NNCF (Model Zoo with accuracy tables) all reduce the cold-start problem. Users don't need the full pipeline for popular models.

+
+

Whitespace Opportunities (No Competitor Covers)

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
OpportunityWhy it's winml-cli territory
Cross-EP regression table (one command, all EPs)Multi-EP is the unique Windows AI challenge; no Android/iOS tool does this
Quantization config recommender (winml recommend --target qnn --constraint latency=20ms)Rule-based recommendation from hardware+model arch analysis
EP-aware ONNX graph visualizer (Netron + green/yellow/red per EP)Netron exists but has no EP coverage overlay
Thermal/sustained-load profiling (latency curve over 100 runs, detect throttling)AI Hub hides variance; no tool surfaces thermal behavior
Windows AI Model Package (.mlpackage equivalent with multi-EP manifest)Apple has .mlpackage; Windows has nothing equivalent
+
+

Skill: use-winml-cli (existing — extend)

+

Status: Exists at skills/use-winml-cli/SKILL.md. Needs two additions: +- Add winml run and winml serve usage (currently missing) +- Add "first-time onboarding" path for users who don't know where to start

+

No structural changes needed; the existing skill is the general entry point.

+
+

Skill: debug-accuracy-drop

+

Frontmatter

+
name: debug-accuracy-drop
+description: >
+  Use this skill when a quantized or optimized model produces worse accuracy than
+  the FP32 baseline and the cause is unknown. Guides a structured diagnosis: first
+  isolate which pipeline stage introduced the drop (optimize vs quantize vs compile),
+  then use winml eval --mode compare to measure output similarity, then use winml
+  analyze to check for partial/unsupported ops that may cause EP fallback. Covers
+  calibration dataset issues, precision selection mistakes, and QNN-specific fallback
+  patterns. Use when the user says "accuracy dropped after quantization", "results
+  look wrong on NPU", or "cosine similarity is low".
+
+ +

When to use

+
    +
  • "My model gives wrong results after quantization"
  • +
  • "W8A8 accuracy is too low, how do I find out why"
  • +
  • "Results differ between NPU and CPU"
  • +
  • cosine_similarity < 0.95 from winml eval --mode compare
  • +
+

Sections

+

1. Isolation strategy: binary search on the pipeline +Diagnose by bisecting the pipeline stages:

+
FP32 baseline
+    → after optimize?   winml eval --mode compare (fp32 vs optimized)
+    → after quantize?   winml eval --mode compare (fp32 vs quantized)
+    → after compile?    winml eval --mode compare (fp32 vs compiled)
+
+ +

First stage where cosine drops → that's where the problem is.

+

Key commands:

+
# Export FP32 baseline
+winml export -m <model> -o baseline/model.onnx
+
+# Compare optimized vs baseline
+winml eval --mode compare -m optimized/model.onnx --model-id <model>
+
+# Compare quantized vs baseline
+winml eval --mode compare -m quantized/model.onnx --model-id <model>
+
+# Compare EP-compiled vs baseline (run on target EP)
+winml eval --mode compare -m compiled/model.onnx --model-id <model> --ep qnn
+
+ +

2. Interpreting similarity metrics +Table of thresholds: +| Metric | Healthy | Investigate | Problem | +|---|---|---|---| +| cosine_similarity | > 0.99 | 0.95–0.99 | < 0.95 | +| SQNR (dB) | > 40 | 30–40 | < 30 | +| max_abs_diff | model-dependent | — | unbounded |

+

3. Root cause patterns

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
SymptomLikely causeFix
Drop appears at quantize stageCalibration dataset not representativeUse task-relevant calibration data via --calibration-dataset
Drop appears at quantize stage for Attention layersW8A8 quantizing activations in attentionSwitch to W8A16 (keeps activations at FP16)
Drop appears at compile stage on QNNOp pattern unsupported → CPU fallbackRun winml analyze to find partial ops
Inconsistent results across runsNon-deterministic EP dispatchAdd --iterations 20 to average out
Drop only in certain inputsInput shape sensitivityTest with calibration data matching real distribution
+

4. Checking for op fallback with winml analyze +When compile-stage drop is suspected:

+
winml analyze -m quantized/model.onnx --ep qnn
+
+ +

Look for partial and unsupported ops — these fall back to CPU, introducing +numerical differences vs native NPU execution. Partial ops are the most common +source of unexpected accuracy variance on QNN.

+

5. Precision escalation path +If W8A8 is the problem and the model is accuracy-sensitive: +W8A8 → W8A16 → FP16 → FP32 +Stop at the first precision that meets accuracy requirements.

+

Cross-references: +- To compare precision options systematically → autoconfig (manual or automated optimize) +- If op is listed as unsupported → check-model-feasibility

+
+

Skill: ship-to-winapp (merge of validate-before-ship + prepare-for-winapp)

+

Covers the whole ship-time phase: first validate the model meets the Definition-of-Done, +then package the multi-EP artifacts and manifest for the WinApp to load at runtime.

+

Frontmatter

+
name: ship-to-winapp
+description: >
+  Use this skill when taking a winml-cli model artifact the last mile into a Windows
+  application — both validating it is good enough to ship and packaging it for the app.
+  Validation half: a Definition-of-Done checklist covering artifact completeness, accuracy
+  vs FP32 baseline, performance SLA, output correctness on real inputs, cross-EP consistency,
+  and fallback chain (every item checked or explicitly waived). Packaging half: how to organize
+  multi-EP artifacts (QNN/NPU, OpenVINO, VitisAI, DirectML/GPU, CPU fallback), the recommended
+  directory layout and manifest.json for runtime EP selection, and the runtime EP detection /
+  fallback pattern. Use when the user says "I'm ready to ship", "what should I test before
+  release", "how do I know the model is good enough", "how do I use this in my app",
+  "how do I package the model", or "what file do I load at runtime".
+
+ +

When to use

+
    +
  • About to ship a WinApp with on-device inference; final QA gate before production
  • +
  • After any build config change (new quantization, new EP, new model version)
  • +
  • "I built the model, how do I ship it in my app?"
  • +
  • "How do I load different models for different hardware / what happens with no NPU?"
  • +
  • "How do I package QNN + DML + CPU variants together?"
  • +
+
+

Part A — Validate (Definition-of-Done gates)

+

The checklist

+

Gate 1 — Artifact completeness +- [ ] All target EP artifacts exist and are loadable +- [ ] CPU fallback artifact exists +- [ ] manifest.json (if using multi-EP layout) is valid and references existing files +- [ ] Artifact was built with winml build (not opaque cache artifact)

+
winml inspect -m <artifact>.onnx  # verify each artifact loads
+
+ +

Gate 2 — Accuracy vs FP32 baseline +- [ ] cosine_similarity ≥ 0.99 for FP16 artifacts +- [ ] cosine_similarity ≥ 0.95 for W8A16 artifacts +- [ ] cosine_similarity ≥ 0.90 for W8A8 artifacts (or task-specific threshold) +- [ ] Task accuracy metric (Top-1, F1, mAP) within acceptable drop from FP32

+
winml eval --mode compare -m <artifact>.onnx --model-id <model>
+winml eval -m <artifact>.onnx --model-id <model>  # task accuracy
+
+ +

Gate 3 — Performance SLA +- [ ] p50 latency meets application target on target device +- [ ] p99 latency within 2x p50 (no outlier spikes) +- [ ] Benchmark run on actual target hardware (not developer machine)

+
winml perf -m <artifact>.onnx --device <target> --iterations 100 --monitor
+
+ +

Gate 4 — Output correctness on real inputs +- [ ] Model produces correct output on ≥3 representative real-world inputs +- [ ] No NaN or Inf in outputs +- [ ] Output shape matches expected shape

+
winml run -m <artifact>.onnx --file <real_input>  # visual/manual check
+
+ +

Gate 5 — Cross-EP consistency (if shipping multiple EP variants) +- [ ] QNN and DML outputs agree within tolerance on same input +- [ ] CPU fallback output agrees with primary EP within tolerance

+
winml run -m model_qnn.onnx --file sample.jpg --format json -o qnn_out.json
+winml run -m model_dml.onnx --file sample.jpg --format json -o dml_out.json
+winml run -m model_cpu.onnx --file sample.jpg --format json -o cpu_out.json
+# compare qnn_out.json vs dml_out.json vs cpu_out.json manually
+
+ +

Gate 6 — Fallback chain +- [ ] CPU fallback artifact verified independently (not just assumed to work) +- [ ] App runtime selects correct artifact when target EP is absent (simulate by removing EP)

+

Waiver policy +Any item that cannot be completed must be waived explicitly:

+
Waivers:
+- Cross-EP consistency: VitisAI not available on developer machine.
+  Verified on target hardware by QA team. Issue #NNN.
+- Performance SLA: Target hardware (Snapdragon X Elite) in procurement.
+  Benchmark deferred to post-merge, tracked in issue #NNN.
+
+ +

Unchecked items without waiver → do not ship.

+

L-level mapping — the 6 gates map directly to the L1–L5 confidence system (see Overview):

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
GateL-level
Gate 1 — Artifact completenessL1
Gate 2 — Accuracy vs FP32 baselineL3 + L4
Gate 3 — Performance SLAL5
Gate 4 — Output correctness on real inputsL4
Gate 5 — Cross-EP consistencyL5
Gate 6 — Fallback chainL1 (CPU artifact)
+

Minimum to ship: L1 + L3 all passing. L4 + L5 required for production release.

+

Quick command reference

+
# Gate 1: inspect all artifacts
+for f in model_qnn.onnx model_dml.onnx model_cpu.onnx; do winml inspect -m $f; done
+# Gate 2: accuracy
+winml eval --mode compare -m <artifact>.onnx --model-id <model>
+winml eval -m <artifact>.onnx --model-id <model>
+# Gate 3: perf
+winml perf -m <artifact>.onnx --device auto --iterations 100 --monitor
+# Gate 4: real input
+winml run -m <artifact>.onnx --file <sample>
+# Gate 5: cross-EP (run individually, compare outputs)
+winml run -m model_qnn.onnx --file <sample> --format json
+winml run -m model_dml.onnx --file <sample> --format json
+
+ +
+

Part B — Package & integrate (multi-EP)

+

1. The multi-EP artifact problem +winml compile produces EP-locked files (not portable), so a WinApp needs a strategy to +select the right file per device.

+

2. Recommended artifact layout

+
my_model/
+  manifest.json          ← EP → file mapping + version
+  model_qnn.onnx         ← QNN NPU (compiled, Snapdragon X)
+  model_openvino.onnx    ← OpenVINO NPU/GPU (Intel Core Ultra)
+  model_vitisai.onnx     ← VitisAI NPU (AMD Ryzen AI)
+  model_dml.onnx         ← DirectML GPU (any GPU, non-NPU machines)
+  model_cpu.onnx         ← CPU fallback (universal)
+
+ +

3. manifest.json schema

+
{
+  "model_id": "facebook/convnext-tiny-224",
+  "task": "image-classification",
+  "version": "1.0.0",
+  "variants": [
+    { "ep": "qnn",       "device": "npu",  "file": "model_qnn.onnx",       "precision": "w8a16" },
+    { "ep": "openvino",  "device": "npu",  "file": "model_openvino.onnx",  "precision": "w8a8"  },
+    { "ep": "vitisai",   "device": "npu",  "file": "model_vitisai.onnx",   "precision": "w8a8"  },
+    { "ep": "dml",       "device": "gpu",  "file": "model_dml.onnx",       "precision": "fp16"  },
+    { "ep": "cpu",       "device": "cpu",  "file": "model_cpu.onnx",       "precision": "w8a8"  }
+  ],
+  "selection_order": ["qnn", "openvino", "vitisai", "dml", "cpu"]
+}
+
+ +

(For multi-EP artifacts, autoconfig emits this manifest.json directly with experiment provenance.)

+

4. Building all variants with winml-cli

+
# Generate configs per EP
+winml config -m <model> --device npu --ep qnn -o config_qnn.json
+winml config -m <model> --device npu --ep openvino -o config_ov.json
+winml config -m <model> --device gpu --ep dml -o config_dml.json
+winml config -m <model> --device cpu -o config_cpu.json
+
+# Build all
+winml build -c config_qnn.json -m <model> -o out_qnn/
+winml build -c config_ov.json  -m <model> -o out_ov/
+winml build -c config_dml.json -m <model> -o out_dml/
+winml build -c config_cpu.json -m <model> -o out_cpu/
+
+ +

5. Runtime EP selection pattern (C++ / ORT) +Pseudocode for app-side logic: +- Read manifest.json +- Query available EPs on device (GetAvailableProviders() or winml sys equivalent) +- Walk selection_order, pick first EP available on this device +- Load the corresponding file +- If all fail → CPU is always available

+

6. What NOT to do +- Don't load a QNN-compiled model with CPU EP → will fail or produce wrong results +- Don't hardcode EP names → check availability at runtime +- Don't ship only the compiled artifact without a CPU fallback

+

Cross-references: +- If accuracy gate fails → debug-accuracy-drop +- If performance gate fails → autoconfig (manual or automated optimize path) +- If EP not available for testing, or to pick the right EP → check-model-feasibility +- To build the artifacts → use-winml-cli

+
+

Skill: check-model-feasibility (merge of find-a-model + ep-compatibility-check)

+

The pre-build front door. Two entry points, one shared engine (inspectsysanalyze): +(A) the user has no model yet → recommend a supported one from their constraints; +(B) the user has a model → confirm it runs on their target EP/device. Both converge on the +same three-layer check, so they are one skill.

+

Frontmatter

+
name: check-model-feasibility
+description: >
+  Use this skill before a full build, to answer two linked questions: "which model should I
+  use?" and "will it run on my hardware?". Model discovery: when the user knows the task
+  (image classification, text embedding, object detection, summarization, …) but has no model
+  yet, gather their constraints, generate Hugging Face candidates, and screen each one for
+  winml-cli support. Compatibility: for a chosen (or candidate) model, run the three-layer check
+  — winml inspect (model support), winml sys (EP availability on this machine), winml analyze
+  (operator-level EP coverage) — plus the EP-to-hardware mapping and fallback chain for Windows
+  AI PCs. Use when the user says "what model should I use for X", "find me a model that runs
+  under 20ms on the NPU", "recommend a small image classifier", "I don't have a model yet",
+  "will this work on my device", "is QNN supported here", "what hardware do I need for NPU",
+  or when they hit an unsupported-operator error.
+
+audience: external (WinApp developers)
+
+ +

When to use

+
    +
  • "What model should I use for background blur / OCR / summarization?"
  • +
  • "Find a text-embedding model under 100MB that runs on the Intel NPU"
  • +
  • "Will this model work on my Snapdragon X Elite laptop? Is QNN supported here?"
  • +
  • "The compile step failed with an unsupported op"
  • +
  • Starting a new project: pick a model and verify feasibility before investing build time
  • +
+

What this skill does NOT do

+
    +
  • It does not train, fine-tune, or optimize a model — optimization hands off to autoconfig.
  • +
  • It only recommends models whose architecture winml-cli can actually export/run (verified via + winml inspect), never an arbitrary HF model it cannot load.
  • +
+

Sections

+

1. Two entry points +- (A) No model yet → run Section 2 (discovery) to produce candidates, then Section 3 on each. +- (B) Have a model → skip to Section 3 (three-layer check) directly.

+

2. Discovery — find candidate models (entry point A) +Capture and lock the selection constraints first:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ConditionExampleDrives
Taskimage-classification, feature-extraction, text-generationHF Hub filter
Target device / EPSnapdragon X NPU (QNN), Intel NPU (OpenVINO), any GPU (DML)feasibility + latency class
Latency budgetp50 ≤ 20 mssize / architecture shortlist
Accuracy need"≥ ResNet-50 top-1" or a benchmark floorcandidate quality bar
Size limit≤ 100 MB on diskexcludes large variants
Licensepermissive (Apache-2.0 / MIT)excludes restricted models
+

The agent queries the HF Hub by task, sorted by downloads/likes, restricted to architecture +families winml-cli is known to support → a 5–10 model shortlist. Each candidate then goes +through the three-layer check below; drop any that fail Layer 1 or have heavy unsupported ops.

+

3. The three-layer feasibility check (entry points A and B) +Layer 1 — Model support · Layer 2 — EP availability · Layer 3 — Operator coverage. +Run in order, stop at first hard failure.

+

Layer 1 — Model support

+
winml inspect -m <model-id> --format json
+
+ +

Look for loader, exporter, winml_inference_class populated. If inspect fails or shows +"unsupported" → model is out of scope for winml-cli (drop the candidate; do not recommend it).

+

Layer 2 — EP availability

+
winml sys --list-ep --list-device
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
EPHardware requirementCheck for
QNNQualcomm Snapdragon X Elite / X PlusQNNExecutionProvider in list
OpenVINOIntel Core Ultra (Meteor Lake / Lunar Lake+)OpenVINOExecutionProvider
VitisAIAMD Ryzen AI (Phoenix / Hawk Point / Strix)VitisAIExecutionProvider
NvTensorRTRTXNVIDIA discrete GPU (RTX series)NvTensorRTRTXExecutionProvider
DMLAny DirectX 12 GPUDmlExecutionProvider
CPUAnyAlways available
+

If the desired EP is not listed → recommend next best EP from the fallback chain.

+

Layer 3 — Operator coverage

+
winml analyze -m <exported_model>.onnx --ep <ep> --format json
+# or for all EPs at once:
+winml analyze -m <exported_model>.onnx --device all
+
+ +
    +
  • supported (green): op runs natively on EP
  • +
  • partial (yellow): op may fall back to CPU for some configurations
  • +
  • unsupported (red): op cannot run on this EP
  • +
+

Decision rule: any unsupported → either change EP or accept CPU fallback for those ops +(which may impact accuracy and latency).

+

4. Fallback chain recommendation +If target EP not available or has unsupported ops:

+
QNN not available → OpenVINO (if Intel) or VitisAI (if AMD) → DML → CPU
+
+ +

5. Rank and recommend (entry point A) / fast-fail before compile (entry point B) +- Discovery: rank surviving candidates by fit against the locked conditions (size, latency + class, accuracy reference, op coverage, downloads as a popularity prior). Output a short + ranked table + one recommended pick + rationale. +- winml compile is expensive (minutes). Always run analyze first; if it shows >20% + unsupported ops → likely not worth compiling for that EP.

+

Cross-references: +- After picking a model + confirming feasibility → autoconfig (find the optimal config) +- To build the chosen artifacts → use-winml-cli +- If no supported model meets the constraints, or all EPs show unsupported ops → the gap + feeds optimization-research (long-tail coverage) and adding-model-support

+
+

Addresses the Pre-quantized model zoo / cold-start whitespace from the Competitive Analysis: +NVIDIA (nvidia/ HF org) and AI Hub (500+ models) reduce cold-start with curated zoos; winml-cli +has none, so this skill substitutes a constraints-driven recommender that only returns supported models.

+
+
+

Skill: adding-model-support (contributor)

+

Frontmatter

+
name: adding-model-support
+description: >
+  Use this skill when contributing support for a new Hugging Face model to
+  winml-cli. Covers finding the correct exporter, writing a recipe config,
+  verifying at each pipeline stage (export → optimize → quantize → compile),
+  and passing the L1–L5 validation gates before submitting a PR. Use when
+  a contributor says "I want to add support for model X", "this model type
+  is not supported", or "how do I write a recipe for a new architecture".
+
+ +

When to use

+
    +
  • "I want to add support for Qwen3 / Phi-4 / [new model]"
  • +
  • "winml-cli says this model is unsupported"
  • +
  • "How do I write a recipe config for a new model family?"
  • +
+

Sections

+

1. Find the right exporter

+
winml inspect -m <hf_model_id>  # check if auto-detected
+
+ +

If inspect fails → the model needs a new exporter or recipe. +Look in src/winml/modelkit/export/ for existing exporters as reference.

+

2. Find a reference model of the same family +- Same architecture class (e.g., LlamaForCausalLM, BertModel)? +- Check recipes/ for an existing .json config for that class +- Prefer copying the closest recipe and adjusting rather than writing from scratch

+

3. Write the recipe config +Minimal recipe template:

+
{
+  "model_id": "org/model-name",
+  "task": "text-generation",
+  "export": { "opset": 17 },
+  "optimize": { "passes": ["MatMulAddFusion", "LayerNormFusion"] },
+  "quantize": { "mode": "w8a16", "calibration_dataset": "wikitext2" }
+}
+
+ +

4. Validate at each stage (L1 → L5)

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
StageCommandPass criterion
L1: Export loadswinml inspect -m <exported>.onnxNo error
L2: Shape correctwinml eval -m <exported>.onnx --model-id <id>Output shape matches
L3: Numerical paritywinml eval --mode compare -m <quantized>.onnx --model-id <id>cosine ≥ threshold
L4: Task accuracywinml eval -m <quantized>.onnx --model-id <id>Task metric in spec
L5: Perf on target EPwinml perf -m <compiled>.onnx --device <target>Meets latency target
+

5. Common pitfalls for new models +- New op types not in operator coverage → run winml analyze early +- Attention variant (GQA, MQA, MLA) → check quantization mode compatibility +- Dynamic shapes → add explicit shape hints in export config +- Non-standard tokenizer → verify winml run input preprocessing

+

Cross-references: +- If EP shows unsupported ops → check-model-feasibility +- After L1–L5 all pass → ship-to-winapp for PR gate

+
+

Skill: adding-ep-support (contributor)

+

Frontmatter

+
name: adding-ep-support
+description: >
+  Use this skill when adding a new execution provider (EP) backend to
+  winml-cli. Covers implementing the compile backend interface, adding
+  EP-specific optimize passes, wiring the new EP into winml sys and
+  winml analyze, and verifying coverage with the L1–L5 test gates.
+  Use when a contributor says "I want to add support for a new EP",
+  "how does the QNN compile backend work", or "can we support EP X".
+
+ +

When to use

+
    +
  • Adding a new EP compile backend (e.g., a new NPU vendor)
  • +
  • Extending an existing EP with new optimization passes
  • +
  • Understanding how the existing QNN / OpenVINO / VitisAI backends are structured
  • +
+

Sections

+

1. EP backend interface +Reference implementation: src/winml/modelkit/compile/qnn_backend.py +Three methods to implement:

+
class MyEPBackend(CompileBackend):
+    def is_available(self) -> bool: ...      # detect EP on current machine
+    def optimize(self, model, config): ...   # EP-specific graph transforms
+    def compile(self, model, config): ...    # produce EP-locked artifact
+
+ +

2. Wire into EP registry +Register in src/winml/modelkit/ep_registry.py:

+
EP_REGISTRY["myep"] = MyEPBackend
+
+ +

This makes --ep myep work in winml config, winml compile, winml analyze.

+

3. Add operator coverage data +Add a coverage JSON to src/winml/modelkit/analyze/coverage/myep_ops.json:

+
{ "Add": "supported", "LayerNorm": "partial", "CustomOp": "unsupported" }
+
+ +

This is what winml analyze --ep myep reads.

+

4. Add to winml sys output +Add EP availability check to src/winml/commands/sys.py so it appears +in winml sys --list-ep.

+

5. L1–L5 validation for the new EP +Minimum before merging: +- L1: A known-good model compiles without crash +- L3: Compiled artifact passes winml eval --mode compare (cosine threshold) +- L5: winml perf produces valid latency output on target hardware

+

Cross-references: +- Operator coverage analysis → check-model-feasibility +- After adding: document the EP in the check-model-feasibility hardware table

+
+

Skill: contributing-a-skill (contributor)

+

Frontmatter

+
name: contributing-a-skill
+description: >
+  Use this skill when writing a new SKILL.md for winml-cli or improving
+  an existing one. Covers frontmatter requirements, description writing
+  (the description is the agent trigger, not a human summary), section
+  structure conventions, cross-reference format, command accuracy
+  requirements, and the review checklist before submitting. Use when a
+  contributor says "I want to add a new skill", "how should I write
+  SKILL.md", or "what are the skill authoring rules".
+
+ +

When to use

+
    +
  • Writing a new skill for a gap not covered by existing skills
  • +
  • Improving an existing skill with new commands or sections
  • +
  • Reviewing a skill PR
  • +
+

Sections

+

1. Frontmatter rules

+
name: kebab-case-skill-name   # matches directory name under skills/
+description: >
+  Use this skill when <trigger phrase describing user's problem>.
+  Covers <what the skill teaches>.
+  Use when the user says "<example trigger phrase 1>", "<example 2>", or <condition>.
+
+ +

Critical: The description field is what the Copilot agent reads to decide +whether to activate this skill. Write it as a trigger specification, not a +documentation summary. Include representative user phrases in quotes.

+

2. Required sections (in order) +1. ## When to use — 3–5 bullet points with user-facing symptoms/questions +2. Diagnostic or decision section — symptom → cause → fix structure +3. Command examples — runnable winml commands with real flags +4. Reference tables — hardware, thresholds, EP names as concrete data +5. ## Cross-references — links to related skills using relative paths

+

3. Cross-reference format

+
- If accuracy dropped → see `.agents/skills/debug-accuracy-drop/SKILL.md`
+- After validating → see `.agents/skills/validate-before-ship/SKILL.md`
+
+ +

4. Content rules +- All commands must be runnable exactly as written (no pseudocode flags) +- Include concrete numbers: thresholds (cosine ≥ 0.99), speedup (3–5×), latency (<50ms) +- Target ~200 lines prose + tables; move deep content to references/ subdirectory +- Do not duplicate content from another skill — cross-reference instead

+

5. Review checklist before PR +- [ ] description contains ≥3 quoted user trigger phrases +- [ ] All commands are tested and produce the described output +- [ ] Cross-references use relative paths and the linked skill exists +- [ ] No commands reference flags that don't exist in current winml --help +- [ ] Hardware names and EP names match the canonical list in check-model-feasibility +- [ ] evals/eval.yaml exists with ≥2 test cases (including at least one negative assertion)

+
+

Skill: autoconfig (user — optimize the model: automated loop + manual framework)

+

The optimize skill. Two modes: automated (the autoresearch loop — the bulk of this section) for +"figure it out for me / run overnight", and manual (the decision framework folded in from +optimize-for-device) for "I'll choose by hand" or when there is no target hardware to benchmark on.

+

Frontmatter

+
name: autoconfig
+description: >
+  Use this skill when a **WinApp developer** wants the best performance for their model on one or
+  more Windows EP/device targets — either by letting winml-cli search automatically, or by working
+  through the precision/EP tradeoffs by hand. Automated mode: an autonomous experiment loop that
+  proposes config.json hypotheses, runs winml build + eval + perf, evaluates against user-defined
+  objectives (accuracy floor, latency budget, or Pareto frontier), and iterates — keeping
+  improvements, discarding regressions; covers single-EP optimization, multi-EP parallel search,
+  mixed-precision (nodes_to_exclude) exploration, calibration tuning, and manifest.json output.
+  Manual mode: the latency-budget vs accuracy-floor decision framework, the FP32→FP16→W8A16→W8A8
+  precision ladder, a per-device hardware guidance table, and how to read tradeoff results.
+  Use when the user says "find the best config for my model on QNN", "automate the config search",
+  "generate configs for all EPs", "I want to leave this running overnight", "make it faster",
+  "which precision should I use", "is NPU worth it", or "compare QNN vs DirectML vs CPU".
+
+audience: external (WinApp developers)
+
+ +

When to use

+
    +
  • "Find the best W8A8 config that keeps accuracy > 0.95 on QNN"
  • +
  • "Generate optimized configs for QNN + DirectML + CPU and build a manifest"
  • +
  • "I don't know which quantization settings to use, figure it out for me" / "run overnight"
  • +
  • "Make it faster" / "which precision should I use" / "is NPU worth it" (→ manual mode)
  • +
  • "Compare QNN vs DirectML vs CPU for my model"
  • +
  • User has a latency SLA or accuracy floor but doesn't know how to achieve it
  • +
+

What this skill does NOT do

+
    +
  • It only searches within what winml build currently supports (existing capabilities)
  • +
  • It does not look for optimization techniques outside winml's current feature set
  • +
  • It does not suggest that winml needs new features or file bugs
  • +
  • For finding what winml is missing, use optimization-research instead
  • +
+
+

Manual mode — the decision framework (folded in from optimize-for-device)

+

Use this lightweight path when the user wants to decide by hand, or has no target hardware to +benchmark on (so the automated loop's perf gate can't run). It is the conceptual model the +automated loop below mechanizes.

+

1. The decision framework — two inputs: latency budget OR accuracy budget. +- Have a latency SLA (e.g. <50ms)? → find highest accuracy within that budget +- Have an accuracy floor (e.g. <2% drop)? → find fastest within that floor

+

2. The precision ladder — FP32 → FP16 → W8A16 → W8A8, with typical speedup and accuracy-drop +ranges per model family (Encoder/BERT-like, Vision/ConvNet, Transformer/ViT).

+

3. The sweep workflow — run winml build + winml eval + winml perf for each precision, +collect into a tradeoff table, apply the decision framework.

+
winml config -m <model> --device <device> --precision fp16 -o config_fp16.json
+winml build -c config_fp16.json -m <model> -o out_fp16/
+winml eval -m out_fp16/<artifact>.onnx --model-id <model>
+winml perf -m out_fp16/<artifact>.onnx --device <device> --iterations 50
+# repeat for w8a16, w8a8
+
+ +

4. Hardware-specific guidance table +| Device | Best EP | Sweet-spot precision | Notes | +|---|---|---|---| +| Snapdragon X Elite NPU | QNN | W8A16 | HTP native for W8A16; W8A8 risky for Attention | +| Intel Core Ultra NPU | OpenVINO | W8A8 | OpenVINO PTQ handles INT8 well | +| AMD Ryzen AI NPU | VitisAI | W8A8 | Phoenix/Hawk Point prefer INT8 | +| Any GPU | DirectML | FP16 | FP16 sufficient; quantization rarely helps on GPU | +| CPU fallback | CPU | W8A8 | Size + latency both benefit |

+

5. Reading the output — how to interpret winml eval cosine_similarity / SQNR and +winml perf p50/p90/p99; what values indicate "acceptable" vs "needs investigation".

+

When the user wants this automated instead of done by hand, continue to the autoresearch loop below.

+
+

Epistemic standard for autoconfig findings

+

Any conclusion this skill writes into a report or recommends to a user must meet this bar:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
RequirementWhat it means
Observation vs explanationState what was measured separately from why it happened. "latency increased 270ms" is fact. "because NHWC causes cache thrashing" is a hypothesis — label it as such unless confirmed by profiling.
Statistical validityA latency claim requires ≥ 3 independent runs with warmup. A single winml eval run (no warmup, includes preprocessing) is insufficient to quote as a latency number. It can guide search decisions but not final reports.
Mechanism confirmationDo not explain a regression unless the mechanism is confirmed (e.g., by profiler, by op-level timing, or by source code inspection of ORT/QNN SDK). If unknown, write "cause unconfirmed; further profiling needed."
Scope boundaryResults measured on one model/EP are never generalized to other models/EPs without explicit qualification. "On ConvNext-tiny CPU" is allowed. "CPU dislikes fusion" is not — it's an overgeneralization.
Unresolved uncertaintyIf an observation contradicts the expected behavior (e.g., a "disabled" fusion still appears in the output), the report must flag this as an open question, not silently adopt an explanation.
EP isolationA finding on one EP (positive or negative) MUST NOT be applied to prune the search space of a different EP without independent validation. CPU opset regression ≠ QNN NPU opset regression. Always validate per EP independently.
+

The skill MUST NOT write confident root-cause explanations in the HTML report or chat summary for regressions where only the measurement is available. Use hedged language: "this likely relates to…", "one hypothesis is…", or simply omit the explanation and recommend profiling.

+

Perf gain validation protocol

+

Before any perf gain is written into a report, config recommendation, or knowledge base as a confirmed finding, it must pass ALL three gates:

+

Gate 1 — Statistical: two-phase bench protocol (from GPU Optimizer V2)

+
Phase A — Quick screen (fast, ~2 min):
+  winml perf -m <model> --ep <ep> --device <device> --warmup 20 --iterations 200 -o screen.json
+  CV = screen.json.std / screen.json.p50
+  IF CV > 0.10 (10%): REJECT — high DVFS variance, measurement unreliable
+                       → cool down 120s, retry once
+                       → if still CV > 0.10: flag as [UNSTABLE], skip candidate
+
+Phase B — Full bench (only if Phase A passes, ~15 min):
+  # 3 independent sessions with 60s cool-down between each
+  winml perf ... --warmup 50 --iterations 1000 -o run1.json
+  sleep 60
+  winml perf ... --warmup 50 --iterations 1000 -o run2.json
+  sleep 60
+  winml perf ... --warmup 50 --iterations 1000 -o run3.json
+
+  # KEEP if ALL of:
+  #   1. p50(run1,2,3) are all faster than baseline p50 × (1 - min_improvement)
+  #   2. CV of each run < 0.10
+  #   3. cosine_similarity ≥ accuracy_floor
+  KEEP_threshold = baseline_p50 × 0.99   # ≥1% improvement required
+
+ +

Rationale: DVFS on mobile NPUs causes 2-10x run-to-run variance. CV check catches this before wasting 15 min on full bench.

+

Gate 2 — Mechanism: read ORT/QNN source code before explaining why

+

Gate 2 — Mechanism: read ORT/QNN source code before explaining why +- For QNN EP gains: check onnxruntime/core/providers/qnn/builder/ for opset-conditional dispatch +- For CPU EP gains: check onnxruntime/core/optimizer/ for pass applicability conditions +- For DML EP gains: check DML operator mapping tables +- Do not publish "opset 21 = 2.3x faster on QNN NPU" without confirming the mechanism in source code. It may be DVFS bias, not a real architectural difference.

+

Gate 3 — Reproducibility: baseline and candidate measured in same thermal state +- Run baseline and candidate back-to-back in the same session OR +- Use a device-level tool to lock NPU clock frequency +- If you cannot control thermal state, report min_ms (peak-performance ceiling) alongside p50 (typical performance), and flag the variance explicitly.

+

Lesson from ConvNext opset sweep (2026-06-10): +Initial opset 21 measurement (8.45ms, 50 iters) vs opset 17 (19.4ms) appeared to show 2.3x gain. Full 17-22 sweep with 50 iters each showed: +- All opsets min ~9-10ms (same peak capability) +- opset 17 p50=54ms, opset 19-22 p50=12ms — but opset 18 p50=43ms (bimodal) +- opset 21 std varied from 10ms (cool device) to 37ms (warm device) +Conclusion: data is inconclusive. Gain may be real OR may be thermal artifact. Gates 1+2 not yet passed.

+
+

Design Comparison: GPU Optimizer V2 vs WinML Autoconfig

+

Reference: "Agentic GPU Model Optimization" doc (cheye@, 2026-03-20). GPU Optimizer V2 is a 6-role multi-agent system for cloud GPU inference optimization (ONER-1B KNN service, H100). Autoconfig is a local edge inference optimizer (winml-cli, Snapdragon X). Most of their infrastructure (machine pool, SSH fleet, Triton serving, custom CUDA kernels, SM occupancy tuning) does not apply here. But the agent loop design has several directly adoptable ideas.

+

Adoptable insights from GPU Optimizer V2

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
V2 design decisionV2 rationaleAdopt into autoconfig?Notes
Two-phase bench: 200-iter quick screen → 3×1000-iter full bench"CV<2% gates full bench — avoid wasting time on high-variance results"YES — highest priority gapWe've been doing single 50-iter runs and calling them facts. CV check would have caught the DVFS noise immediately.
Verdict policy names (ThroughputOnly, ThroughputOrLatency…)"Named policies prevent Reviewer from ad-hoc criteria drift"✅ YES (simplified)Autoconfig should have explicit KEEP criteria: p50_ms < baseline × (1 - threshold) AND cosine ≥ floor
Append-only experiment_log.md + results.tsv written only by Reviewer"Single writer = no drift, full audit trail"✅ YESOur results.tsv exists but no "single writer" discipline
Explorer mandatory external-research triggers"After 15 consecutive DISCARDs → external research sweep"✅ YES — this is the exact gap that caused the opset 21 missIf we had this rule, we would have searched ORT source after N DISCARDs and found kMaxSupportedOpset earlier
Knowledge agent with review gate before KB save"Learnings reviewed before they prune future search"✅ YESep_knowledge/*.json entries should be marked draft until Gate 2 (mechanism) is confirmed
Correctness contract locked after Phase 0, never modified"Prevents accuracy goal-post moving"✅ YESWe have accuracy gate but no locked contract file
30-consecutive-DISCARD stop condition"Prevents endless search in exhausted space"✅ YESautoconfig has no stop condition today
Per-experiment structured output: Hypothesis → Implementation → Parity → Perf → Analysis → Decision"Enables post-analysis and knowledge extraction"✅ YESautoconfig report is currently holistic, not per-experiment
Role separation: Profiler / Explorer / Optimizer / Reviewer are separate agents"Prevents context drift; each agent stays focused"⚠️ PartialFull 6-agent split is overkill for CLI tool; but Explorer / Reviewer distinction is valuable
Resource lock: only one GPU job at a time"Prevents benchmark interference"✅ YES (trivially)Already serial; but should be explicitly enforced if autoconfig ever parallelizes
Machine pool + SSH fleet + Model RegistryCloud GPU fleet management❌ N/ALocal device only
Custom CUDA kernel writing"Extreme asymmetry benefits from custom kernels"❌ N/ACLI-only constraint; no kernel modification
SM occupancy / GEMM tile count tuning"H100 has 132 SMs; 48 output tiles = 36% occupancy"❌ N/AEdge NPU/GPU, not H100 multi-SM
FlashAttention / fused QKV"Eliminate HBM traffic for attention score matrix"❌ N/AModel is already trained; deployment-time optimization only
+

Key gaps in current autoconfig design (from V2 comparison)

+

Gap 1 (critical): No two-phase bench protocol +Current design runs --iterations 50 and accepts the result. V2 runs: +1. Quick screen: 200 iters, check CV < 2% (Coefficient of Variation = std/mean) +2. Only if CV < 2%: full bench 3×1000 iters with 60s cool-down between sessions +3. KEEP only if Δp50 > threshold AND CV(candidate) < 2%

+

This directly matches the "iter ≥ 1000" rule we just added. Formalize it as two phases.

+

Gap 2 (critical): No mandatory external-research trigger in Explorer +V2 Explorer triggers external research (web search, papers, source code) after: +- 15 consecutive DISCARDs +- Every KEEP that changes model/precision +- Before declaring backlog_empty

+

We discovered kMaxSupportedOpset only by accident (downloading QNN Hub models). A mandatory "read ORT source after 5 DISCARDs in opset dimension" rule would have found it in Phase 2.

+

Gap 3 (important): ep_knowledge/*.json has no draft/confirmed state +V2 Knowledge agent requires review gate before KB entries are used to prune search space. Our ep_knowledge findings should have: +- status: "draft" — observed, mechanism unconfirmed (Gate 2 not passed) +- status: "confirmed" — mechanism confirmed via source code (Gate 2 passed)
+- status: "deprecated" — finding invalidated by new experiment or ORT version change +Only "confirmed" entries should prune search space. "draft" entries inform hypothesis priority but don't prune.

+

Gap 4 (nice-to-have): No per-experiment structured artifact +V2 produces per-experiment: Hypothesis / Implementation / Parity / Perf / Analysis / Decision +autoconfig produces: one aggregate report.html. Should produce both.

+

Design: The Autoresearch Loop

+

Inspired by karpathy/autoresearch: +agent modifies a config file, runs a fixed-cost experiment, checks if the objective improved, keeps or discards, and repeats autonomously until manually stopped or convergence criteria met.

+
OBJECTIVE (user-defined, one of):
+  A. Accuracy-primary:  maximize cosine_similarity  subject to  p50_ms ≤ <budget>
+  B. Latency-primary:   minimize p50_ms             subject to  cosine ≥ <floor>
+  C. Pareto search:     find the full accuracy-latency frontier
+
+SEARCH SPACE — config.json has three sections the agent can modify:
+
+  [export]
+    opset_version          : int   — 17, 18, 19, 20  (higher = newer ops, EP may not support)
+    do_constant_folding    : bool  — may affect graph structure visible to EP
+    dynamic_axes           : dict  — static vs dynamic shapes (QNN prefers static batch=1)
+
+  [optimize]  — full capability list (from winml optimize --list-capabilities)
+
+    GraphPipe (run via ORT SessionOptions):
+      GELU:
+        gelu-fusion            : bool  — fuse tanh-GELU subgraph → Gelu op
+        fast-gelu-fusion       : bool  — fuse fast-GELU (tanh-approx) → FastGelu
+        bias-gelu-fusion       : bool  — fuse Bias+GELU (requires gelu-fusion)
+        quick-gelu-fusion      : bool  — fuse x*sigmoid(1.702x) → FastGelu
+        gelu-approximation     : bool  — convert exact Gelu → FastGelu (requires gelu-fusion)
+      Activation:
+        bias-softmax-fusion    : bool  — fuse Bias+Softmax
+        bias-dropout-fusion    : bool  — fuse Bias+Dropout
+      Convolution:
+        conv-add-fusion        : bool  — fuse Conv+Add (bias)
+        conv-bn-fusion         : bool  — fuse Conv+BatchNorm into weights
+        conv-mul-fusion        : bool  — fuse Conv+Multiply
+        conv-activation-fusion : bool  — fuse Conv+activation (ReLU, Sigmoid, etc.)
+      Elimination:
+        slice-elimination      : bool  — remove redundant Slice ops
+        expand-elimination     : bool  — remove no-op Expand
+        unsqueeze-elimination  : bool  — fold Unsqueeze into initializers
+      GEMM:
+        gemm-activation-fusion : bool  — fuse GEMM+activation
+        gemm-sum-fusion        : bool  — fuse GEMM+Sum
+        gemm-transpose-fusion  : bool  — fuse GEMM+Transpose
+      Graph:
+        concat-slice-elimination   : bool  — remove Concat+Slice that restore originals
+        double-qdq-pairs-remover   : bool  — remove consecutive QDQ pairs
+        constant-folding           : bool  — pre-compute constant exprs (default=True; disable to reduce size)
+      LayerNorm:
+        layer-norm-fusion          : bool  — fuse ReduceMean→Sub→Pow→Sqrt→Div→Mul→Add
+        skip-layer-norm-fusion     : bool  — fuse Add(residual)+LayerNorm → SkipLayerNorm (requires layer-norm-fusion)
+        simplified-layer-norm-fusion : bool — fuse simplified LayerNorm (no mean-centering)
+      Layout:
+        transpose-optimizer        : bool  — eliminate redundant transpose chains
+        nhwc-transformer           : bool  — NCHW→NHWC (GPU memory layout)
+        nchwc-transformer          : bool  — NCHW→NCHWc (CPU SIMD layout)
+        conv-add-activation-fusion : bool  — fuse Conv+Add+Activation → FusedConv
+      MatMul:
+        matmul-add-fusion          : bool  — fuse MatMul+Add → single kernel
+        matmul-activation-fusion   : bool  — fuse MatMul+activation (DML-only, requires matmul-transpose-fusion)
+        matmul-transpose-fusion    : bool  — fuse MatMul+Transpose → FusedMatMul
+        matmul-scale-fusion        : bool  — fuse MatMul+Scale
+        matmul-bn-fusion           : bool  — fuse MatMul+BatchNorm
+        dynamic-quantize-matmul-fusion : bool — dynamic quant for MatMul
+      Misc:
+        gather-slice-to-split-fusion : bool — fuse Gather+Slice → Split
+        gather-to-slice-fusion       : bool — convert Gather to Slice (contiguous idx)
+        pad-fusion                   : bool — fuse Pad with Conv/Pool
+        not-where-fusion             : bool — fuse Not+Where
+
+    FusionPipe (ORT transformer fusions, via FusionOptions):
+      attention-fusion              : bool  — fuse MHA pattern → Attention/MultiHeadAttention
+      layer-norm-fusion             : bool  — (FusionPipe variant, same flag)
+      skip-layer-norm-fusion        : bool  — (FusionPipe variant)
+      simplified-layer-norm-fusion  : bool  — (FusionPipe variant)
+      embed-layer-norm-fusion       : bool  — fuse Embedding+Position+LayerNorm (requires layer-norm-fusion)
+      bias-skip-layer-norm-fusion   : bool  — fuse Bias+SkipLayerNorm (requires skip-layer-norm-fusion)
+      fuse-rmsnorm                  : bool  — fuse RMSNorm → LpNormalization(p=2) [custom, QNN-compatible]
+      packed-qkv-fusion             : bool  — (SD only)
+      packed-kv-fusion              : bool  — (SD only)
+      skip-group-norm-fusion        : bool  — (SD only)
+      bias-add-fusion               : bool  — fuse BiasAdd
+      qordered-matmul               : bool  — (SD only)
+
+    SurgeryPipe (pre-EP graph fixes):
+      clamp-constant-values         : bool  — clamp -inf/+inf constants → [-1e3, 1e3] (prevents QNN quant issues)
+      remove-isnan-in-attention-mask: bool  — remove Softmax→IsNaN→Where guards (use after clamp)
+
+    RewritePipe (pattern-based subgraph rewriting):
+      --enable-{source-slug}-{target-slug}  (run winml optimize --list-rewrites for full list)
+      Examples: --enable-gelu-singlegelu, --enable-matmuladdpattern-reshapegemmreshapepattern
+
+  [quant]
+    precision              : fp16 | w8a16 | w8a8
+    calibration_method     : minmax | entropy | percentile
+    samples                : 64 | 128 | 256 | 512
+    per_channel            : bool
+    symmetric              : bool
+    op_types_to_quantize   : list[str]  — restrict which op types get quantized
+    nodes_to_exclude       : list[str]  — exclude specific named nodes
+
+FIXED:  winml build + winml eval + winml perf  (the experiment harness)
+METRIC: cosine_similarity  (from winml eval --format json)
+        p50_ms             (from winml perf --format json)
+RECORD: results.tsv
+
+ +
+

Profiler-Enhanced Agent Architecture (redesigned)

+

Insight from GPU Optimizer v2 analysis and ConvNext POC: +Running the profiler before the search loop would have shown Gemm=57.7% on ConvNext — +immediately ruling out layout-pass experiments (Transpose only 2.6%, already fused Gelu already +canonical). Profile-first makes the Explorer smarter and the search shorter.

+

New 4-phase structure:

+
┌─────────────────────────────────────────────────────────────────────┐
+│ PHASE 0 — INTAKE                                                    │
+│   winml inspect → validate model is supported                       │
+│   winml build (baseline config) → get model.onnx                   │
+│   winml eval --mode compare → lock FP32 correctness baseline        │
+│   winml perf (baseline) → establish latency floor                   │
+└────────────────────────────┬────────────────────────────────────────┘
+                             ▼
+┌─────────────────────────────────────────────────────────────────────┐
+│ PHASE 1 — PROFILE  (runs ONCE, before any search)                   │
+│   winml perf -m baseline/model.onnx --ep <ep> --profile             │
+│   Parse bottleneck.json:                                            │
+│     - top_bottleneck: op type with highest % of kernel time         │
+│     - top3_concentration_pct: how concentrated the compute is       │
+│     - headroom_hints: actionable pass recommendations               │
+│   Classify each bottleneck op type:                                 │
+│     - "compute" (Gemm, Conv, Attention) → quant/kernel matters      │
+│     - "layout" (Transpose, Reshape) → graph pass matters            │
+│     - "already_canonical" (op shows as fused type) → fusion N/A    │
+│   Output: prioritized_hypothesis_queue (ordered by profile evidence)│
+└────────────────────────────┬────────────────────────────────────────┘
+                             ▼
+┌─────────────────────────────────────────────────────────────────────┐
+│ PHASE 2 — PROFILE-GUIDED OPTIMIZATION LOOP                          │
+│                                                                     │
+│  ┌──────────────┐    ┌──────────────┐    ┌─────────────────────┐  │
+│  │   EXPLORER   │───►│  OPTIMIZER   │───►│      REVIEWER       │  │
+│  │              │    │              │    │                     │  │
+│  │ Pops next    │    │ Runs ONE     │    │ Cross-exp verdict:  │  │
+│  │ hypothesis   │    │ experiment:  │    │ - CV gate Phase A   │  │
+│  │ from queue,  │    │ build +      │    │ - full bench Gate 1 │  │
+│  │ motivated by │    │ quick-screen │    │ - keep / discard    │  │
+│  │ profile data │    │ → full bench │    │ - detect plateau    │  │
+│  │              │    │ → eval       │    │ - stop condition    │  │
+│  └──────────────┘    └──────────────┘    │ - write KB draft   │  │
+│         ▲                               └─────────────────────┘  │
+│  mandatory external-research triggers (adopted from V2):           │
+│    • after 5 consecutive DISCARDs in same search dimension         │
+│      → search ORT/QNN SDK source code for mechanism               │
+│    • after every KEEP that changes precision or EP                 │
+│      → re-read ep_knowledge for updated constraints                │
+│    • before declaring search_space_exhausted                       │
+│      → ORT source sweep: opset gates, EP-specific dispatch rules   │
+│                                                                     │
+│  Explorer prunes via bottleneck.json (only "confirmed" KB rules):  │
+│    IF top_bottleneck == "Gemm" (>50%):                              │
+│      → SKIP layout passes (transpose-optimizer, nchwc, nhwc)        │
+│      → FOCUS on: quant precision, calibration, matmul fusions       │
+│    IF top_bottleneck == "Transpose" (>10%):                         │
+│      → CHECK kMaxSupportedOpset for current ORT version FIRST       │
+│    IF top_bottleneck == "Conv" (>20%):                              │
+│      → try nchwc-transformer, conv-activation-fusion               │
+│    IF "Gelu"/"LayerNormalization" op_type (already canonical):      │
+│      → SKIP corresponding fusion flags                              │
+└────────────────────────────┬────────────────────────────────────────┘
+                             ▼
+┌─────────────────────────────────────────────────────────────────────┐
+│ PHASE 3 — REPORT                                                    │
+│   config_<ep>_optimal.json  ← champion config with _autoconfig_meta│
+│   report.html               ← full benchmark + profile section      │
+│   experiments/<n>/          ← per-exp: hypothesis/impl/parity/     │
+│                                perf/analysis/decision (V2 pattern)  │
+│   kb_entry.json             ← status="draft"; promoted to          │
+│     "confirmed" only after mechanism confirmed (Gate 2)             │
+└─────────────────────────────────────────────────────────────────────┘
+
+ +

ep_knowledge draft/confirmed lifecycle (Gap 3 fix):

+
KB entry states:
+  "draft"     — observed perf delta, mechanism unconfirmed (Gate 2 not passed)
+                Can influence hypothesis PRIORITY but NOT prune search space
+  "confirmed" — mechanism confirmed via ORT/QNN source code (Gate 2 passed)
+                Can prune search space for future runs
+  "deprecated"— finding invalidated by new experiment or stack version change
+                Must NOT influence search space; kept for history only
+
+Transition rules:
+  draft → confirmed:   requires mechanism_confirmed=true + source_citation
+  confirmed → deprecated: requires contradicting experiment OR stack version bump
+  deprecated entries:  kept in JSON with status field, never deleted
+
+ +

Profiler output → Explorer mapping table:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Profile findingExplorer actionHypothesis skipped
Gemm > 50%Prioritize quant/calib experimentsAll layout-transform passes
Transpose < 5% (opset=17)Transpose Optimizer already workingtranspose-optimizer trials
op_type "Gelu" presentAlready fusedgelu-fusion, fast-gelu-fusion
op_type "LayerNormalization" presentAlready fusedlayer-norm-fusion trials
Reorder{Input,Output} present (>4%)NCHWc already activenchwc-transformer trials
op_type "Attention" presentMHA already fusedattention-fusion trials
QDQ ops > 15%Quant overhead highFocus on op_types_to_quantize exclusions
Transpose > 10% + opset ≥ 19kMaxSupportedOpset issueFlag as [KNOWN_TRADEOFF], lower opset
+

Why profile-first matters (validated on ConvNext):

+

The ablation experiment ran 22 experiments over multiple days. Had the profiler run first: +- Profile shows: Gemm=57.7%, Conv=12.6%, Transpose=2.6%, Gelu=8% (already "Gelu" op) +- Explorer would have immediately skipped: gelu-fusion, layer-norm-fusion, transpose-optimizer, + nchwc-transformer (already active via ReorderInput/Output) +- Only candidates from profile: matmul-add-fusion (Gemm bottleneck), conv-activation-fusion +- This would have reduced 22 experiments to ~6, with the same conclusions

+

POC profiler: C:\tmp\autoconfig-demo\winml_profile.py +- Uses ORT enable_profiling=True + end_profiling() (same pattern as AI Studio's profile_file.py) +- CPU EP: parses _kernel_time events from ORT JSON trace +- Output: bottleneck.json (structured) + bottleneck.txt (human-readable) + raw ORT trace +- ConvNext result: Gemm 57.7%, Conv 12.6%, Transpose 2.6% → confirms baseline is optimal for CPU

+
+

Sections

+

1. Phase 0 — Intake + Baseline

+
# Step 1: verify the model is supported
+winml inspect -m <model-id> --format json
+
+# Step 2: baseline build (default config, opset=17)
+winml export -m <model-id> -o baseline/
+winml build -c config_baseline.json -m <model-id> -o baseline_built/
+
+# Step 3: correctness contract
+winml eval --mode compare -m baseline_built/model.onnx --model-id <model-id> --format json
+# Expected: cosine=1.0 (FP32 self-comparison)
+
+# Step 4: baseline perf
+winml perf -m baseline_built/model.onnx --ep <ep> --warmup 10 --iterations 50 --format json
+# Record: baseline_p50_ms
+
+ +

Initialize results.tsv (TSV, not CSV — commas break in description field):

+
commit  precision   nodes_excluded  cosine  p50_ms  calibration_samples status  notes
+
+ +
+

2. Phase 1 — Profile (runs once, BEFORE any search experiments)

+
# Run profiler on baseline model (--profile flag added to winml perf)
+winml perf -m baseline_built/model.onnx --ep <ep> \
+  --warmup 5 --iterations 20 --profile --out profile_out/ --format json
+# Reads: profile_out/bottleneck.json
+# POC (before --profile ships): python winml_profile.py --model ... --ep ...
+
+ +

Profiler output drives Explorer hypothesis initialization:

+
READ bottleneck.json:
+  top_bottleneck: <op_type>
+  op_summary: [{op_type, pct}, ...]  (sorted by descending pct)
+  headroom_hints: [...]
+
+BUILD skip_set (passes not worth trying):
+  FOR each op_type in op_summary:
+    IF op_type == "Gelu":          skip_set.add(gelu-fusion, fast-gelu-fusion)
+    IF op_type == "LayerNormalization": skip_set.add(layer-norm-fusion)
+    IF op_type == "Attention":     skip_set.add(attention-fusion)
+    IF "ReorderInput" in op_summary AND pct > 2%:
+                                   skip_set.add(nchwc-transformer)  # already active
+  IF Transpose pct < 5% AND opset=17:
+                                   skip_set.add(transpose-optimizer)  # already working, no gain
+  IF Transpose pct > 10% AND opset >= 19:
+                                   flag as [KNOWN_TRADEOFF]; add to report
+
+BUILD priority_queue (hypotheses in evidence-based order):
+  IF top_bottleneck == "Gemm" OR "MatMul":
+    queue: [quant_precision, calib_method, calib_samples, matmul_fusions, per_channel]
+  IF top_bottleneck == "Conv":
+    queue: [nchwc (if not in skip_set), conv_fusions, quant_precision]
+  IF top_bottleneck == "Attention":
+    queue: [quant_precision, nodes_to_exclude (Attention), calib_method]
+  DEFAULT:
+    queue: [quant_precision, calib_method, calib_samples]
+
+ +
+

3. Phase 2 — Profile-Guided Optimization Loop (single EP)

+
LOOP FOREVER (until user stops or convergence):
+
+1. EXPLORER: pop next hypothesis from priority_queue
+   - Skip if in skip_set (pruned by profile)
+   - If queue empty → enter Phase 4 (generalization) or stop
+
+2. HYPOTHESIZE: build config.json delta based on hypothesis
+   Hypothesis rules (profile-informed, in priority order):
+   a. If first loop: start with full W8A8/W8A16, all ops quantized
+   b. If cosine < floor: add worst partial_op to nodes_to_exclude (one at a time)
+   c. If cosine ≥ floor but latency > budget: try W8A8 instead of W8A16,
+      or reduce calibration_samples, or add per_channel=true
+   d. If stuck (3 iterations no improvement): try calibration_method change
+      (minmax → entropy → percentile)
+   e. If still stuck: try precision escalation (W8A8 → W8A16 → FP16)
+
+3. MODIFY: write updated config.json
+   Key fields in quant section:
+   {
+     "precision": "w8a8",
+     "samples": 128,
+     "calibration_method": "minmax",
+     "nodes_to_exclude": ["LayerNorm_0", "Softmax_3"],
+     "per_channel": false
+   }
+
+4. OPTIMIZER: winml build -c config.json -m <model-id> -o out_<iteration>/
+   If build crashes: log as "crash", revert config, try different hypothesis
+
+5a. EVAL — quick sanity (cosine proxy, cheap):
+    winml eval --mode compare -m out_<iteration>/artifact.onnx \
+               --model-id <model-id> --format json
+    → cosine_similarity, sqnr_db
+    If cosine < hard_floor (e.g. 0.85): fail-fast, skip step 5b + 6, log as discard
+
+5b. EVAL — task accuracy (real quality gate):
+    winml eval -m out_<iteration>/artifact.onnx \
+               --model-id <model-id> \
+               --task <task>  --device <target> --ep <ep> \
+               --samples 100 --format json
+    → top1_accuracy (image-classification), f1 (text), mAP (detection), etc.
+    This is the authoritative accuracy metric for Reviewer verdict.
+
+    Why cosine alone is not sufficient:
+    - High cosine (0.97) but top-1 drops 5%: logit magnitudes preserved but relative ranking shifted
+    - Low cosine (0.92) but same top-1: relative ranking unchanged despite numeric difference
+    → Only task accuracy tells you whether the model still does its job
+
+6. PERF: winml perf -m out_<iteration>/artifact.onnx \
+         --device <target> --ep <ep> --warmup 10 --iterations 50 --format json
+   → p50_ms, p90_ms
+
+7. REVIEWER: cross-experiment verdict
+   keep    if task_accuracy ≥ accuracy_floor  AND  p50_ms ≤ latency_budget
+   discard if task_accuracy < accuracy_floor  OR   p50_ms > latency_budget
+   crash   if build/eval failed
+
+   Reviewer also checks:
+   - Plateau: 3+ keeps with Δlatency < 2% → likely at local optimum
+   - Profile divergence: if new op_type appears after build, re-profile
+   - Skip_set update: if experiment proves a pass is a no-op, add to skip_set
+   - Accuracy cliff: if task_accuracy drops > 3% in one step → flag, do not cascade
+
+8. LOG to results.tsv:
+   <git-short-hash>  <precision>  <nodes_excluded>  <cosine>  <top1_acc>  <p50_ms>  <samples>  keep/discard/crash  <notes>
+
+9. If keep: advance to next iteration from this config
+   If discard: revert to last kept config, try different hypothesis
+
+ +

Convergence criteria (stop the loop): +- cosine ≥ target floor AND p50_ms ≤ latency budget: objective achieved +- 5 consecutive discards with no improvement: report best so far +- User manually stops the agent

+
+

3. Hypothesis generation rules (the intelligence layer)

+

The agent generates hypotheses by traversing the search space in priority order. +Each hypothesis is motivated by diagnostic data from the previous experiment, not random search.

+

Priority ordering across the three config sections:

+
Phase 1 — establish baseline (iteration 0)
+  Start with: opset_version=17, all fusions enabled, precision=w8a16, minmax, 128 samples
+
+Phase 2 — precision first (fastest to try, most impact)
+  If cosine < floor:
+    w8a16 → try w8a8 with selective exclusions, or w8a16 first
+  If latency > budget:
+    w8a16 → try w8a8 (smaller model, faster inference)
+    fp16  → try w8a16 (if currently at fp16)
+
+Phase 3 — calibration tuning (if precision is right but cosine still low)
+  Try in order: minmax → entropy → percentile
+  Try increasing samples: 128 → 256 → 512
+  Try per_channel=true (better accuracy, slightly slower build)
+  Try symmetric=false if currently true
+
+Phase 4 — optimize pass tuning (independent of quant, affects graph structure)
+  Hypothesis: some fusion patterns create op shapes QNN handles poorly
+  Transformer models (try in order):
+    attention-fusion → skip-layer-norm-fusion → layer-norm-fusion → fuse-rmsnorm
+  Vision models (try in order):
+    conv-bn-fusion → conv-add-fusion → conv-activation-fusion
+  Shared (try if cosine drops or build crashes):
+    constant-folding=false  (prevents size bloat; sometimes exposes EP-incompatible shape)
+    clamp-constant-values=true  (fixes -inf attention mask → quantization issues)
+    remove-isnan-in-attention-mask=true  (use after clamp; cleans dead IsNaN guards)
+  Try opset_version: 17 → 18 → 19
+    (Higher opsets expose newer op types that may have better EP support)
+
+Phase 5 — selective node exclusion (when analyze shows partial ops)
+  Read winml analyze --format json → partial_ops list
+  Exclude one partial_op at a time (greedy: exclude highest-impact first)
+  Also try excluding op_types_to_quantize selectively
+    e.g., remove "LayerNorm" from op_types_to_quantize list
+
+Phase 6 — combined search (if single-dimension changes are stuck)
+  Try combinations of best Phase 3 + Phase 4 + Phase 5 changes together
+
+ +

Diagnosis table — what to try given what you see:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
SymptomLikely causePhase to try next
cosine drops a lot at quant stage, all ops supportedCalibration data mismatchPhase 3: entropy calib, more samples
cosine drops at quant, Attention ops partialAttention activation quant on QNNPhase 5: exclude Attention nodes
cosine OK but latency worse than CPUFusion pattern creating unoptimized subgraphPhase 4: disable attention-fusion, try different opset
cosine OK but model larger than expectedConstant folding inlining large weightsPhase 4: constant-folding=false
Both cosine and latency good at w8a8 but build crashesopset op not supported by quant pipelinePhase 4: opset_version 17 → 16
cosine highly variable across seedsCalibration with too few samplesPhase 3: 128 → 256 samples
All ops supported, cosine still drops after fusionsFusion creates non-quantizable shapePhase 4: disable skip-layer-norm-fusion
QNN build fails with "invalid scale"-inf in attention mask initializerPhase 4: clamp-constant-values=true
Vision model: accuracy drops unexpectedlyConv+BN fusion slightly changes weight valuesPhase 4: disable conv-bn-fusion
MatMul-heavy model: latency not improvingMatMul not being fusedPhase 4: matmul-add-fusion, matmul-transpose-fusion
RMSNorm model (Llama etc.) poor QNN perfORT not recognizing RMSNorm patternPhase 4: fuse-rmsnorm=true
+

This is the key difference from grid search: each hypothesis is motivated by diagnostic data from winml analyze and the previous experiment result.

+
+

4. Multi-EP config generation

+

Run parallel loops for each target EP, then aggregate into manifest.json:

+
# Agent runs loops for each EP (can be sequential or parallel):
+# Loop 1: ep=qnn,   target_device=npu
+# Loop 2: ep=dml,   target_device=gpu
+# Loop 3: ep=cpu,   target_device=cpu
+
+# After all loops complete, agent generates:
+# - config_qnn_optimal.json   (best config found for QNN)
+# - config_dml_optimal.json   (best config found for DirectML)
+# - config_cpu_optimal.json   (best config found for CPU)
+
+# Then builds final artifacts and assembles manifest.json
+
+ +

Generated manifest.json includes experiment provenance:

+
{
+  "model_id": "microsoft/resnet-50",
+  "generated_by": "autoconfig",
+  "experiments_run": 34,
+  "variants": [
+    {
+      "ep": "qnn", "device": "npu",
+      "file": "model_qnn.onnx",
+      "precision": "w8a16",
+      "nodes_excluded": ["MultiHeadAttention"],
+      "cosine_similarity": 0.972,
+      "p50_ms": 18.3,
+      "config": "config_qnn_optimal.json"
+    },
+    {
+      "ep": "dml", "device": "gpu",
+      "file": "model_dml.onnx",
+      "precision": "fp16",
+      "nodes_excluded": [],
+      "cosine_similarity": 0.999,
+      "p50_ms": 22.1,
+      "config": "config_dml_optimal.json"
+    },
+    {
+      "ep": "cpu", "device": "cpu",
+      "file": "model_cpu.onnx",
+      "precision": "w8a8",
+      "nodes_excluded": ["LayerNorm"],
+      "cosine_similarity": 0.931,
+      "p50_ms": 84.7,
+      "config": "config_cpu_optimal.json"
+    }
+  ],
+  "selection_order": ["qnn", "dml", "cpu"]
+}
+
+ +
+

5. results.tsv format

+

Track all three config sections per experiment (TSV, not CSV):

+
commit  opset   fusions_disabled    precision   nodes_excluded  cosine  p50_ms  calib_samples   calib_method    status  notes
+baseline    17  []  fp32    []  1.000   —   —   —   keep    FP32 reference
+a1b2c3d 17  []  w8a8    []  0.871   16.2    128 minmax  discard full W8A8 too aggressive
+b2c3d4e 17  []  w8a16   []  0.967   19.8    128 minmax  keep    W8A16 baseline meets floor
+c3d4e5f 17  []  w8a16   []  0.969   19.1    256 entropy keep    entropy calib improvement
+d4e5f6g 17  [attention-fusion]  w8a16   []  0.971   18.4    256 entropy keep    disabling attn-fusion helps latency
+e5f6g7h 18  [attention-fusion]  w8a16   []  0.973   17.9    256 entropy keep    opset18 best so far
+f6g7h8i 18  [attention-fusion]  w8a8    [MultiHeadAttention]    0.961   14.2    256 entropy keep    mixed prec: meet latency budget
+
+ +
+

6. Skill outputs

+

autoconfig produces two primary outputs after convergence or user stop:

+

Output A: Best config file

+

config_<ep>_optimal.json — the winning config.json, ready to pass to winml build. Contains provenance metadata so it's reproducible:

+
{
+  "_autoconfig_meta": {
+    "model_id": "facebook/convnext-tiny-224",
+    "ep": "qnn",
+    "objective": "latency-primary",
+    "latency_budget_ms": 20,
+    "accuracy_floor": 0.95,
+    "experiments_run": 23,
+    "best_iter": "iter_17",
+    "timestamp": "2026-06-10T11:55:05+08:00"
+  },
+  "export": { "opset_version": 18 },
+  "optimize": { "attention-fusion": false },
+  "quantize": {
+    "precision": "w8a16",
+    "calibration_method": "entropy",
+    "calibration_samples": 256,
+    "nodes_to_exclude": ["MultiHeadAttention_0"]
+  }
+}
+
+ +

Output B: HTML benchmark report

+

report.html — self-contained single-file report (no external dependencies), viewable in any browser. Contains:

+

Section 1 — Summary card

+
Model:    facebook/convnext-tiny-224     EP: QNN (NPU)
+Objective: latency-primary ≤ 20ms       Accuracy floor: 0.95
+Result:   ✅ FOUND                       Experiments: 23  Time: 41 min
+
+Best config:  W8A16, entropy calib, 256 samples
+  Accuracy:   0.953  (floor 0.95 ✓)
+  p50 latency: 15.8ms  (budget 20ms ✓)
+
+ +

Section 2 — Search progress chart +Scatter plot: all 23 experiments, x=p50_latency_ms, y=accuracy. +- Green dot = kept (improvement) +- Red dot = discarded (regression) +- Star = best found +- Hover tooltip: iter ID, config diff vs previous

+

Section 3 — Iteration table +Full results.tsv rendered as sortable HTML table with columns:

+
iter | opset | precision | nodes_excluded | calib | accuracy | p50_ms | Δacc | Δlatency | status | hypothesis
+
+ +

Color-coded rows: green = keep, red = discard, gold = best.

+

Section 4 — Config diff timeline +Visual diff showing what changed between each kept iteration (config deltas as +/- lines).

+

Section 5 — Model graph analysis (from pre-search winml analyze) +- Op distribution pie chart (ONNX vs com.microsoft) +- EP compatibility table: ops supported/unsupported on target EP +- Detected patterns (GELU variant, attention structure, Transpose-sandwich)

+

Section 6 — Benchmark details +For the best config, full winml perf output: +- p10/p50/p90/p99 latency histogram +- Throughput (samples/sec) +- Warmup vs steady-state comparison +- (If multi-EP: side-by-side EP comparison bar chart)

+

Section 7 — Reproduction instructions

+
# Reproduce the winning config:
+winml build -c config_qnn_optimal.json -m facebook/convnext-tiny-224 -o out/
+# For NPU: always compile after build (empirically +1.7× speedup)
+winml compile -m out/model.onnx --device npu --ep qnn -o out_compiled/
+winml perf -m out_compiled/model_npu_ctx.onnx --ep qnn --iterations 100 --warmup 10
+
+ +

Report generation approach: The agent generates report.html using inline Python with Jinja2-style string templating + embedded Chart.js (CDN or inlined). No external dependencies — single file, opens offline.

+
+

7. What the agent says in chat

+

After convergence or user stop (terminal summary, report is the real deliverable):

+
autoconfig completed. 23 experiments run (41 min).
+
+Best config (QNN NPU):
+  W8A16, entropy calib, 256 samples, MultiHeadAttention excluded
+  accuracy 0.953 ✓ (floor 0.95)   p50 15.8ms ✓ (budget 20ms)
+
+Outputs:
+  config_qnn_optimal.json   ← drop into winml build -c
+  report.html               ← open in browser for full benchmark breakdown
+
+Next: winml validate-before-ship for production gate.
+
+ +
+

8. Constraints and failure handling

+
    +
  • Build timeout: If winml build exceeds 15 minutes, kill and log as crash
  • +
  • OOM: If build fails with out-of-memory, reduce calibration_samples by half
  • +
  • All hypotheses exhausted: Report best config found, note convergence limit
  • +
  • Latency not measurable (target EP not on machine): run eval only, skip perf gate
  • +
+

9. CLI-only constraint (critical)

+

The agent MUST use only official winml CLI commands as its tool surface. No Python scripting, no direct ONNX manipulation, no third-party tools (onnxconverter-common, onnxsim, Olive, etc.) except where explicitly documented as a known workaround.

+

Rationale: autoconfig's output is a config.json + report.html that a user can reproduce with winml build -c config.json. If the agent used a Python hack to produce a model artifact, the config is not reproducible and the report is misleading.

+

Known workarounds (allowed, must be flagged in report): +| Workaround | Replaces | Tracking issue | Required flag in report | +|---|---|---|---| +| python winml_profile.py | winml perf --profile (not yet shipped) | pending | ⚠️ "Profile data via POC script, not official API" |

+

Gap reporting rule: If a hypothesis cannot be tested because the required winml CLI capability does not exist, the agent MUST: +1. Record the hypothesis as SKIPPED — CLI gap in the experiment table +2. Add an entry to Section 6 "Gaps & Issues" block in report.html: + GAP: <hypothesis> requires <missing capability> + Impact: <what speedup/accuracy improvement was not measurable> + Filed: <issue URL or "not yet filed"> +3. NOT silently substitute a Python workaround that produces unverifiable artifacts

+

Example gaps encountered during ConvNext QNN GPU validation: +- winml build --precision fp16 flag not available (#867) → FP16 native export untested → SKIPPED — CLI gap +- winml perf --ep-option not available (#865) → runtime flag sweep untested → SKIPPED — CLI gap +- winml perf --profile for QNN EP not available → profiling via POC script (allowed workaround) +- W8A8 QDQ ONNX on QNN GPU EP hangs indefinitely — root cause is QNN SDK behavior; winml build already prevents this via _patch_device(); fast-fail enhancement filed as #868 (low priority)

+
+

Key commands used

+
# Phase 1: profiling (--profile flag on winml perf, before search)
+winml perf -m baseline_built/model.onnx --ep <ep> --warmup 5 --iterations 20 \
+  --profile --out profile_out/ --format json
+# → profile_out/bottleneck.json  (machine-readable for Explorer)
+# → profile_out/bottleneck.txt   (human-readable summary)
+# POC: python winml_profile.py --model ... --ep ... (until --profile ships)
+
+# Phase 2: analysis (informs nodes_to_exclude hypotheses)
+winml analyze -m <exported>.onnx --ep <ep> --format json
+
+# Phase 2: experiment
+winml build -c config.json -m <model-id> -o out_<n>/
+
+# Phase 2: metrics
+winml eval --mode compare -m out_<n>/artifact.onnx --model-id <model-id> --format json
+winml perf -m out_<n>/artifact.onnx --device <target> --ep <ep> --iterations 50 --format json
+
+# Phase 3: compile best candidate to QNN EPContext (NPU only)
+# Eliminates JIT overhead; empirically ~1.7× further speedup on ConvNext W8A16
+winml compile -m best_candidate/model.onnx --device npu --ep qnn -o best_compiled/
+# → best_compiled/model_npu_ctx.onnx  (loads context binary at runtime)
+# → best_compiled/model_npu_ctx_qnn.bin  (QNN hardware-compiled graph)
+
+# Phase 3: re-benchmark compiled model
+winml perf -m best_compiled/model_npu_ctx.onnx --device npu --ep qnn --warmup 10 --iterations 50
+
+ +

Empirical data: ConvNext QNN NPU compile impact +| Version | p50 | vs FP32 NPU | +|---|---|---| +| FP32 baseline | 19.39ms | — | +| W8A16 quantized | 10.29ms | 1.9× | +| W8A16 + compile | 6.01ms | 3.2× | +→ winml compile alone adds ~1.7× on top of quantization. Always compile for NPU deployment.

+

Empirical data: ConvNext QNN GPU optimization sweep (Adreno X1-85) — full search +| Experiment | p50 | p90 | std | vs FP32 | Notes | +|---|---|---|---|---|---| +| FP32 baseline (autoconf) | 17.7ms | 19.7ms | 0.97 | — | ✅ OPTIMAL with current CLI | +| NHWC transformer | 19.5ms | 23.8ms | 3.43 | ❌ −10% | Hurts Adreno+QNN EP | +| NHWC + all GPU fusions | 18.1ms | 23.9ms | 2.71 | ❌ −2% | Still worse | +| Conv/norm fusions (no NHWC) | 17.6ms | 22.6ms | 5.51 | ≈0% | Variance ↑, no gain | +| LayerNorm rewrite | 18.4ms | 21.4ms | 2.04 | ❌ −4% | Pattern mismatch anyway | +| Transpose optimizer | 0% node Δ | — | — | no-op | Already optimal positions | +| HiDimRTR→LowDimRTR | 0% node Δ | — | — | no-op | ConvNext RTR doesn't match pattern | +| MatMulAdd→Conv2D (2d/3d/4d) | 0% node Δ | — | — | no-op | ConvNext uses Reshape→MatMul, not bare MatMul+Add | +| FP32 + compile | 23.7ms | — | — | ❌ −34% | Compile hurts GPU (opposite of NPU) | +| W8A8 QDQ quantized | hangs | — | — | ❌ blocked | #868 enhancement (fast-fail) | +| FP16 (invalid CLI path) | 8.8ms | ~32ms | bimodal | ⚠️ 2× p50 | BLOCKED — need #867 |

+

Root cause: why no pass matches ConvNext on QNN GPU +- All 251 ops run natively on GPU (251/0/0/0) — no CPU fallback to eliminate +- ConvNext linear layers: Reshape → MatMul → Reshape pattern, not bare MatMul+Add → Conv2D rewrites don't match +- 72 Reshape + 42 Transpose are already at minimum / optimal topology from PyTorch export +- winml build autoconf (gelu_fusion + matmul_add_fusion) already applied all relevant transforms +- The bottleneck is compute throughput + memory bandwidth — only FP16 (smaller tensors) can improve this

+

Key insight: gelu_fusion matters for variance, not p50 +| Version | p50 | p90 | std | +|---|---|---|---| +| Raw export (287 nodes, unfused Gelu) | 17.4ms | 29.2ms | 5.90 | +| Autoconf (251 nodes, fused Gelu+Gemm) | 17.7ms | 19.7ms | 0.97 |

+

Unfused Gelu = 5 separate GPU kernel launches (Mul→Div→Erf→Mul→Add) with scheduling jitter. +A single Gelu kernel eliminates dispatch overhead → p90 −48%, std −6×. +→ autoconf's role on GPU is stability, not speedup. Critical for real-time / latency-SLA deployments.

+

QNN GPU search space exhausted. FP16 is the only remaining lever, blocked by #867.

+

Empirical data: ConvNext DML optimization sweep (Adreno X1-85, DirectML) +| Experiment | p50 | p90 | std | vs FP32 | +|---|---|---|---|---| +| FP32 baseline (autoconf, 251 nodes) | 16.9ms | 17.7ms | 0.52 | — ← OPTIMAL with current CLI | +| NHWC transformer | 16.5ms | 21.0ms | 1.89 | ❌ p90 worse | +| Raw unfused export (287 nodes) | 16.5ms | 18.4ms | 2.74 | ❌ p99=35ms, worse tail | +| FP16 (Python hack ⚠️) | 11.8ms | 12.8ms | 0.66 | ✅ 1.4× faster, clean dist — BLOCKED #867 |

+

DML vs QNN GPU comparison (same Adreno X1-85): +| | QNN GPU FP32 | DML FP32 | DML FP16 (invalid) | +|---|---|---|---| +| p50 | 17.7ms | 16.9ms | 11.8ms | +| p90 | 19.7ms | 17.7ms | 12.8ms | +| std | 0.97 | 0.52 | 0.66 |

+

→ DML is consistently faster and more stable than QNN GPU at FP32. Root cause: DML JIT-compiles HLSL shaders at model load time; QNN GPU EP does graph partitioning at each session creation. +→ DML FP16: no DVFS bimodal (unlike QNN GPU FP16) — DML's shader compilation locks in FP16 compute paths. +→ NHWC hurts DML too (same reason as QNN GPU: Adreno X1-85 + D3D12 doesn't benefit from explicit NHWC transforms). +→ Note: winml analyze returns 0/0/0/251 (all Unknown) for DML — no rule data. DML supports all standard ONNX ops by design.

+

QNN Hub benchmark comparison (Snapdragon X Elite CRD) — WITH cross-stack test

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ModelStackNPU p50GPU p50Notes
QNN Hub Float (opset 21, 222 nodes, MatMul)qairt cloud2.687msReference
QNN Hub Float (same model)winml ORT QNN EP8.78ms23.9msDirect test on this device
Our Float (opset 17, 251 nodes, Gemm)winml ORT QNN EP19.4ms17.7mswinml build output
QNN Hub W8A16 (opset 21, 798 QDQ, uint16 input)qairt cloud2.612msReference
QNN Hub W8A16 (same model)winml ORT QNN EP14.82ms (std=8.8!)ORT-QNN mismatch
Our W8A16 + compile (opset 17, ORT quant)winml ORT QNN EP6.01msBest we can do
+

Gap decomposition (three independent sources):

+
QNN Hub cloud:   2.7ms
+                  ↑ 3.3× Runtime gap  (qairt native vs ORT QNN EP adapter overhead)
+QNN Hub on winml: 8.78ms
+                  ↑ 2.2× Model graph gap (opset 21/MatMul/222 nodes vs opset 17/Gemm/251 nodes)
+Our model on winml: 19.4ms (FP32)
+
+ +

Actionable findings (updated 2026-06-10 — mechanism confirmed via ORT source): +1. opset 21 NPU speedup mechanism CONFIRMED — but ORT-version-dependent (#869) + - Root cause: kMaxSupportedOpset gate in IsSupportedOpset() (layout_transformation.cc). On older ORT where kMaxSupportedOpset < 21, opset 21 models bypass the NHWC layout transform entirely (transform_layout_fn = nullptr). + - Why bypass helps ConvNext: NHWC transform inserts Transpose(NCHW→NHWC/NHWC→NCHW) around Conv. ConvNext residual connections block full transpose cancellation → extra Transpose ops on HTP → slower. Bypassing = cleaner graph = faster. + - Critical caveat: Current ORT main has kMaxSupportedOpset = 26 → BOTH opset 17 and 21 get NHWC transform. Must verify ORT version before assuming the speedup exists. + - Does NOT generalize to: MobileNet/EfficientNet (no residual Transpose blocks), ViT (no Conv). + - Perf claim validation status: Gate 1 (iter≥1000×3) and Gate 3 (thermal control) still FAILED. Perf numbers are DVFS-dominated. +2. Runtime stack gap (3.3×) is structural: qairt native will always be faster. Correct baseline = "QNN Hub ONNX on winml" (8.78ms). +3. QNN Hub W8A16 is WORSE on our stack (14.82ms, std=8.8ms): opset 21 QDQ + uint16 input incompatible with ORT QNN EP format. +4. Opset is a search dimension — but the correct action is a FULL SWEEP (17–22), not "try 21 first". The optimal opset depends on ORT version.

+

EP-specific search space rules

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
EPQuantizationOpsetGraph passesCompileKey insight
QNN NPU✅ W8A16Full sweep 17-22 (mechanism ORT-version-dependent)autoconf (gelu+matmul_add)✅ AlwaysW8A8 catastrophic on LN+GELU; opset effect depends on ORT kMaxSupportedOpset
QNN GPU❌ Skip17 (opset 21 not validated)autoconf only❌ SkipCompile regresses; FP16 only lever (#867)
DML❌ Skip17 (opset 21 not validated)autoconf onlyN/AFP16 primary lever (#867); faster+stabler than QNN GPU
CPU❌ Skip17 only (kMaxSupportedOpset causes 3-4× regression on 19+)nchwc, matmul-add, geluN/AkMaxSupportedOpset gate hurts CPU for same reason it helps QNN
+

Rule: autoconfig must use EP-specific search space. Do NOT run quantization experiments for GPU/DML/CPU. +Rule: for QNN NPU opset sweep, verify ORT kMaxSupportedOpset first — if ≥ 22, all opsets get NHWC transform and the opset-based speedup may not apply. +Rule: for NPU, if W8A8 top-1 ≤ 15% on first attempt → skip all W8A8 variants, go directly to W8A16. +Rule: always run winml compile after finding best quantized config for QNN NPU. NEVER compile for GPU (regresses). +Rule: for GPU/DML, skip ALL graph optimization passes beyond what winml build autoconf applies (NHWC and additional fusions hurt). +Rule: W8A8 QDQ on GPU EP hangs — skip quantization immediately for GPU targets without testing.

+

User scenario mapping

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ScenarioHow autoconfig addresses it
S1: LLM fast support (7-30d)autoconfig replaces manual per-EP tuning; outputs config_optimal.json + report.html deployable in hours not days
S2: ISV non-LLM model supportExact use case: ISV brings model → autoconfig finds config → report is deliverable with SOP turnaround
S3: Cross-EP parityMulti-EP parallel run: same model, EP-specific search spaces in parallel → output config matrix per EP
S4: Customer ONNX can't runPhase 0 intake diagnoses "can't run" (partial ops → block reason); Phase 1+2 finds "escape config" for "runs poorly"
S5: PyTorch HF Hub coveragePhase 0 IS the "can WinML run it?" gate; failed Phase 0 → structured block reason feeds long-tail gap tracking
+

Dependencies on code changes: +- winml perf --profile (new flag) — adds per-op bottleneck output alongside existing latency metrics; POC script winml_profile.py exists to unblock +- --format json on winml eval (#847), winml analyze (#848), winml perf (#849)

+

Cross-references

+
    +
  • Run check-model-feasibility before starting to pick a model and verify the EP is available
  • +
  • After autoconfig completes → ship-to-winapp for final validation gates + packaging
  • +
  • If autoconfig cannot meet objective → debug-accuracy-drop for deeper diagnosis
  • +
  • Multi-EP output feeds directly into ship-to-winapp's manifest layout
  • +
  • If the best config found is still not good enough → escalate to optimization-research
  • +
+
+

Skill: optimization-research (contributor — internal, deep gap analysis)

+

Frontmatter

+
name: optimization-research
+description: >
+  Use this skill when a winml-cli engineer wants to find out whether a model can
+  be optimized better than what winml-cli currently achieves, identify what is
+  blocking that optimization, and produce concrete backlog work items.
+  The agent performs a deep search across: ORT source code and its optimizer
+  passes, Olive recipes and benchmarks, other ONNX ecosystem tools (onnxsim,
+  onnxoptimizer, neural-compressor, etc.), and native stack reference models
+  and datasets. It compares the best achievable result (using all available tools)
+  against what winml produces today, diagnoses the gap, and files GitHub issues
+  with reproduction steps. Use when an internal engineer says "why is this model
+  slower than it should be", "what optimization techniques are we missing",
+  or "what would it take to match Olive's results".
+
+audience: internal (winml-cli team engineers)
+
+ +

When to use

+
    +
  • "ConvNext on QNN is 3× slower than what Qualcomm's SDK achieves — why?"
  • +
  • "Olive gets 15ms on this model; winml gets 28ms — what's the gap?"
  • +
  • "We're seeing quantization accuracy drop on LLaMA; are there better calibration methods we're not supporting?"
  • +
  • "What would it take to match ORT's best-known config for this architecture?"
  • +
  • After autoconfig hits a ceiling: best config found is still not meeting the objective
  • +
+

What this skill produces

+

Primary outputs: +1. gap_analysis.md — structured report of what the best achievable result is and what's missing +2. repro/ — scripts to reproduce the better result using external tools +3. GitHub issues — one per identified gap, filed against winml-cli with: repro steps, expected vs actual, what ORT/Olive/ecosystem already does, proposed fix direction

+
+

Design: Deep Search Process

+
┌──────────────────────────────────────────────────────────────────┐
+│ PHASE 1 — BASELINE                                               │
+│   winml autoconfig best result for this model/EP                 │
+│   (or provided by user if already run)                           │
+└─────────────────────────┬────────────────────────────────────────┘
+                          ▼
+┌──────────────────────────────────────────────────────────────────┐
+│ PHASE 2 — EXTERNAL BENCHMARK                                     │
+│   Run same model through:                                        │
+│     A. ORT optimizer directly (onnxruntime.tools.transformers)   │
+│     B. Olive (olive-ai) with ep-specific recipe                  │
+│     C. onnxsim + onnxoptimizer (static graph simplification)     │
+│     D. neural-compressor (Intel) for quantization comparison     │
+│   Record: best latency, accuracy, config used                    │
+└─────────────────────────┬────────────────────────────────────────┘
+                          ▼
+┌──────────────────────────────────────────────────────────────────┐
+│ PHASE 3 — GAP DIAGNOSIS                                          │
+│   For each gap (external better than winml):                     │
+│     a. Diff the ONNX graphs (what ops/patterns differ?)          │
+│     b. Read ORT optimizer source to understand what it does      │
+│     c. Check winml's capability registry — is this pass missing? │
+│        disabled by default? wired incorrectly?                   │
+│     d. Check Olive recipe — what flags/params does it use?       │
+│   Classify gap as one of:                                        │
+│     [MISSING_CAPABILITY]   — pass exists in ORT, not in winml   │
+│     [WRONG_DEFAULT]        — pass exists but wrong default/order │
+│     [BUG]                  — pass exists but produces wrong graph│
+│     [CALIBRATION_DATA]     — accuracy gap from calibration set   │
+│     [EP_LIMITATION]        — EP itself can't do this, not winml  │
+│     [KNOWN_TRADEOFF]       — intentional: winml trades X for Y   │
+└─────────────────────────┬────────────────────────────────────────┘
+                          ▼
+┌──────────────────────────────────────────────────────────────────┐
+│ PHASE 4 — NATIVE STACK VALIDATION                                │
+│   Check existing reference models in winml-cli test suite:       │
+│     - Are there models of this architecture in tests/models/?    │
+│     - Do their expected results match what we see?               │
+│   Check Windows AI Studio / WinML model zoo:                     │
+│     - Is this architecture listed? At what performance?          │
+│   Check QNN SDK reference benchmarks (if QNN EP):               │
+│     - Does QNN vendor claim better numbers for this model?       │
+└─────────────────────────┬────────────────────────────────────────┘
+                          ▼
+┌──────────────────────────────────────────────────────────────────┐
+│ PHASE 5 — WORK ITEMS                                             │
+│   For each [MISSING_CAPABILITY] or [WRONG_DEFAULT] gap:          │
+│     - Draft GitHub issue with: title, body, repro, expected,     │
+│       actual, proposed fix, ORT source pointer                   │
+│     - Estimate implementation complexity (S/M/L/XL)             │
+│   For [BUG]: file with full repro script                         │
+│   For [CALIBRATION_DATA]: suggest dataset and eval protocol      │
+│   For [EP_LIMITATION]: file with QNN/DML SDK reference           │
+└──────────────────────────────────────────────────────────────────┘
+
+ +
+

Key external tools to invoke

+
# A. ORT transformer optimizer (the "gold standard" for transformer models)
+python -c "
+from onnxruntime.transformers import optimizer
+from onnxruntime.transformers.fusion_options import FusionOptions
+opts = FusionOptions('bert')   # or 'gpt2', 'clip', etc.
+opts.enable_attention = True
+opts.enable_gelu = True
+model = optimizer.optimize_model(
+    'export.onnx', model_type='bert',
+    num_heads=12, hidden_size=768,
+    optimization_options=opts
+)
+model.save_model_to_file('ort_optimized.onnx')
+"
+
+# B. Olive (end-to-end, EP-aware)
+olive run --config olive_recipe.json
+# olive recipe template: see skills/optimization-research/templates/olive_qnn.json
+
+# C. onnxsim (structural simplification)
+python -m onnxsim export.onnx simplified.onnx
+
+# D. onnxoptimizer
+python -c "
+import onnxoptimizer, onnx
+m = onnx.load('export.onnx')
+passes = onnxoptimizer.get_available_passes()
+m2 = onnxoptimizer.optimize(m, passes)
+onnx.save(m2, 'onnxopt.onnx')
+"
+
+ +
+

Gap report format (gap_analysis.md)

+
# Optimization Gap Analysis: <model_id> on <ep>
+
+Date: <timestamp>
+winml-cli version: <version>
+ORT version: <version>
+
+## Summary
+| Tool | Latency p50 | Accuracy | Config notes |
+|---|---|---|---|
+| winml best (autoconfig) | 28.3ms | 0.953 | W8A16, entropy, 256 samples |
+| ORT transformer optimizer | 19.1ms | 0.951 | model_type=bert, all fusions |
+| Olive QNN recipe | 17.8ms | 0.948 | W8A8 + attention fusion |
+| **Gap** | **10.5ms (37%)** | — | — |
+
+## Gap 1: [MISSING_CAPABILITY] FusedMatMul with rotary embedding
+**What external tool does:** ...
+**What winml does:** ...
+**ORT source:** `onnxruntime/python/tools/transformers/fusion_rotary_attention.py`
+**Proposed fix:** Add RotaryAttentionFusion to FusionPipe capability registry
+**Estimated effort:** M
+
+## Gap 2: [WRONG_DEFAULT] attention-fusion disabled by default
+...
+
+ +
+

GitHub issue template

+
title: [optimization-gap] <model_arch>/<ep>: <gap description>
+
+body:
+## Summary
+<one-sentence description of what's missing>
+
+## Reproduction
+```bash
+# Install
+uv pip install winml-cli
+
+# Baseline (winml current)
+winml build -c config.json -m <model-id> -o winml_out/
+winml perf -m winml_out/model.onnx --ep <ep> --warmup 10 --iterations 50
+
+# Better result (external)
+<commands to reproduce the external result>
+
+ +

Expected vs actual

+
    +
  • External tool achieves: ms at
  • +
  • winml achieves: ms at
  • +
  • Gap: ms (%)
  • +
+

Root cause

+

+

ORT source reference

+

+

Proposed fix direction

+

+

Complexity estimate

+

S / M / L / XL

+
---
+
+### What this skill does NOT do
+- Does not make code changes to winml-cli itself (files issues only)
+- Does not run production benchmarks (uses quick screening methodology)
+- Does not replace formal performance testing with validated hardware
+
+### Cross-references
+- `autoconfig` provides the winml baseline to compare against
+- Issues filed here feed `adding-ep-support` and `contributing-a-skill` workflows
+- Use `check-model-feasibility` to confirm EP availability before running external benchmarks
+
+---
+
+
+---
+
+## ConvNext Autoconfig POC — Rigorous Ablation Results
+
+**Source:** `C:\tmp\autoconfig-demo\ablation.py` — 4-phase rigorous ablation experiment
+**Measurement:** `winml perf --ep cpu --warmup 10 --iterations 50` — pure inference latency, no preprocessing
+**Design:** 3 independent runs per config; promotion threshold = max(3%, 2×σ_baseline); correctness gate (`winml eval --samples 20`) per config
+**Report:** `C:\tmp\autoconfig-demo\report.html` | **Config:** `C:\tmp\autoconfig-demo\config_cpu_optimal.json`
+
+### Graph structure (facebook/convnext-tiny-224, opset 17)
+
+**Op counts (raw export):** 287 nodes total
+
+ +

Add×72 Mul×54 Transpose×42 MatMul×36 LayerNormalization×23 +Conv×22 Div×18 Erf×18 ReduceMean×1 Gemm×1

+
**ConvNext block structure** (traced from first DW-Conv):
+
+ +

DW-Conv(7x7, g=96) → Transpose +→ LayerNormalization (native, already fused at export) +→ MatMul(C→4C) → Add(bias) +→ [GELU: Div → Erf → Add(1) → Mul → Mul(0.5)] ← 18 unfused in export +→ MatMul(4C→C) → Add(bias) [Gemm after ORT L2] +→ Mul (layer scale) → Add (residual) +→ Transpose (back to NCHW)

+
**Conv breakdown:** 4 regular (1×stem 4x4, 3×downsample 2x2 stride-2), 18×DW-Conv 7x7
+
+**Transpose patterns:**
+
+ +

19× Conv → Transpose → LayerNormalization (NCHW→NHWC for LN) +15× Mul → Transpose → Add (NHWC→NCHW for residual) + 4× LayerNormalization → Transpose → Conv (NHWC→NCHW for next DW-Conv) + 2× Add → Transpose → Conv + 2× Add → Transpose → LayerNormalization

+
→ ConvNext is a **Transpose-sandwich** model: alternates NCHW (Conv) and NHWC (LN) layout
+
+**Observed graph transformation (export.onnx → model.onnx after winml build, baseline config):**
+| Op | export.onnx | model.onnx (baseline) | Change |
+|---|---|---|---|
+| `com.microsoft/Gelu` | 0 | 18 | +18 |
+| `Gemm` | 1 | 37 | +36 |
+| `MatMul` | 36 | 0 | −36 |
+| `Add` | 72 | 18 | −54 |
+| `Mul` | 54 | 18 | −36 |
+| `Div`, `Erf` | 18 each | 0 | −18 each |
+| `Reshape` | 0 | 72 | +72 |
+
+**Observation (confirmed):** The baseline `model.onnx` (no user fusion flags) already differs substantially from `export.onnx`. GELU and MatMul+Add are fused before any user capability flag is applied.
+
+**Open question (unresolved):** The `ORTGraphPipe` design (graph.py) is supposed to disable `GeluFusion`/`GeluFusionL2`/`LayerNormFusion` in the baseline via `optimization.disable_specified_optimizers`. Yet the baseline output clearly contains `com.microsoft/Gelu`. This contradiction is unresolved — possible explanations include: ORT name mismatch in disabled list, a different code path fusing GELU, or the export step (via HF Optimum) applying fusion before winml. **This must be investigated before any mechanistic claims about "ORT L2 already does X" are written in user-facing reports.**
+
+---
+
+### Ablation results (rigorous, Phase 0–4)
+
+**Clean baseline:** 43.7ms p50 (base_0 + base_1, 6 runs, all within 42.5–45.4ms)
+
+| config | p50 mean | Δ vs baseline | runs (ms) | verdict |
+|---|---|---|---|---|
+| base_0 | 43.0ms | −0.6ms | 43.8 / 42.7 / 42.5 | baseline |
+| base_1 | 44.3ms | +0.6ms | 43.2 / 44.3 / 45.4 | baseline |
+| base_2 | 73.5ms | +29.8ms | 47.2 / **127.1** / 46.2 | outlier run (system spike) |
+| opset_18 | 48.0ms | +4.3ms | 50.2 / 44.0 / 49.7 | neutral |
+| **opset_19** | **160.3ms** | **+116ms** | **147.6 / 145.8 / 187.4** | **⚠️ SEVERE REGRESSION** |
+| **opset_20** | **131.0ms** | **+87ms** | **135.7 / 129.8 / 127.5** | **⚠️ SEVERE REGRESSION** |
+| **opset_21** | **170.3ms** | **+126ms** | **190.1 / 164.9 / 155.8** | **⚠️ SEVERE REGRESSION** |
+| **opset_22** | **85.0ms** | **+41ms** | **70.9 / 93.9 / 90.2** | **confirmed regression** |
+| no_cf_17 | 51.8ms | +8.1ms | 56.4 / 49.0 / 49.9 | mild regression |
+| base_mid | 49.4ms | +5.8ms | 51.3 / 51.1 / 45.9 | baseline (mid-exp drift) |
+| gelu_only | 52.5ms | +8.9ms | 53.0 / 55.6 / 49.1 | mild regression |
+| ln_only | 57.2ms | +13.6ms | **79.3** / 47.9 / 44.5 | inconclusive (outlier) |
+| conv_add | 50.2ms | +6.5ms | 47.3 / 55.9 / 47.4 | inconclusive |
+| conv_act | 51.2ms | +7.5ms | 45.2 / 41.9 / **66.4** | inconclusive (outlier) |
+| **matmul_add** | **81.7ms** | **+38.0ms** | **63.0 / 70.8 / 111.2** | **CONFIRMED REGRESSION** |
+| transpose_opt | 45.5ms | +1.8ms | 42.3 / 52.3 / 41.8 | neutral |
+| nchwc | 45.4ms | +1.7ms | 43.4 / 48.0 / 44.7 | neutral |
+| matmul_scale | 56.9ms | +13.3ms | 51.5 / 58.1 / 61.2 | probable mild regression |
+| base_end | 48.3ms | +4.7ms | 45.3 / 56.7 / 43.1 | baseline (end-of-exp drift) |
+
+**Phase 3 outcome:** No candidates met promotion threshold (29.4ms needed). Baseline is optimal.
+
+---
+
+### Confirmed findings (statistically defensible)
+
+**1. `matmul-add-fusion` is a confirmed regression on ConvNext CPU (+38ms)**
+- All 3 independent runs: 63.0 / 70.8 / 111.2ms — each far above the highest clean baseline run (45.4ms)
+- Not attributable to system noise (no run-to-run overlap with baseline distribution)
+- Mechanism hypothesis: baseline already converts MatMul+Add→Gemm (37 Gemm in model.onnx); applying matmul-add-fusion on top may create redundant or conflicting kernel dispatch. Unconfirmed — requires profiling.
+
+**2. `transpose-optimizer` is NEUTRAL on pure inference latency**
+- Runs: 42.3 / 52.3 / 41.8ms — overlapping with clean baseline (42.5–45.4ms)
+- ⚠️ **CORRECTION OF EARLIER FINDING:** A previous 8-iteration search (using `winml eval`) reported +270ms. That was a measurement artifact — `winml eval` includes HF preprocessing pipeline overhead and has no warmup. It measures *application startup + preprocessing + inference*, not *inference alone*. With `winml perf` (warmup=10, iter=50, pure inference): transpose_opt = baseline. Do not cite the +270ms in any report.
+
+**3. `nchwc-transformer` is neutral on this model**
+- NCHWc SIMD layout: 43.4 / 48.0 / 44.7ms — no benefit for ConvNext CPU inference.
+
+**4. opset=18 is neutral**
+- Same node count (251) as opset=17 — no graph structure changes. Mean slightly above baseline (48ms) is within machine variance.
+
+**5. No flag improved latency beyond noise. Baseline is the optimal config.**
+
+---
+
+### ⚠️ Critical finding: ORT performance cliff at opset 19 (ConvNext CPU)
+
+**Experiment:** tested opset 17–22, all with identical graph structure (251 nodes, same op counts)
+
+| opset | mean p50 | slowdown |
+|---|---|---|
+| 17 | 43.7ms | — (baseline) |
+| 18 | 48.0ms | 1.1× |
+| **19** | **160.3ms** | **3.7×** |
+| **20** | **131.0ms** | **3.0×** |
+| **21** | **170.3ms** | **3.9×** |
+| **22** | **85.0ms** | **1.9×** |
+
+**Key facts:**
+- All runs within each opset are consistent (no outliers) — this is real, not noise
+- Graph structure is **byte-for-byte identical**: Reshape×72, Transpose×42, Gemm×37, LN×23, Conv×22 for ALL opsets
+- The performance difference is entirely in ORT's runtime execution path, not the graph
+
+**Mechanism: CONFIRMED ROOT CAUSE — ORT `kMaxSupportedOpset` gates Transpose Optimizer**
+
+Source: `onnxruntime/core/optimizer/transpose_optimization/optimizer_api.h`
+```cpp
+constexpr int64_t kMaxSupportedOpset = 18;  // ORT v1.14.x — bumped each ORT release
+
+ +

Entry point onnx_transpose_optimization::Optimize()MakeOptimizerContext():

+
if (*opset > kMaxSupportedOpset) {
+    return std::nullopt;  // entire Transpose Optimizer skipped silently
+}
+
+ +

ConvNext has 42 Transpose nodes (NCHW↔NHWC sandwich in every block). The Transpose Optimizer normally: +- Pushes Transposes through Add×18, Mul×18 (layer-scale + residual) across block boundaries +- Cancels adjacent inverse pairs

+

When bypassed (opset > kMaxSupportedOpset), all 42 Transposes execute as full memory-layout copies → 3–4× systemic slowdown.

+

ORT optimization level experiment (definitive proof):

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Session opt levelopset=17opset=19ratioexplanation
DISABLE_ALL47.5ms355ms7.5×No Transpose Optimizer → all 42 Transposes raw
ENABLE_BASIC289ms315ms1.1×Both slow (re-optimizing pre-fused graph)
ENABLE_EXTENDED209ms241ms1.2×Better but no layout transform
ENABLE_ALL216ms215ms1.0×Transpose Optimizer runs on both → full parity
+

kMaxSupportedOpset version history:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ORT versionkMaxSupportedOpsetopset ≥ N disabled
v1.14.x18≥ 19
v1.16.x19≥ 20
v1.17.x20≥ 21
v1.18.x21≥ 22
main/HEAD26fully covered
+

Classification for optimization-research skill: [KNOWN_TRADEOFF] (intentional design: ORT bumps the ceiling with each ONNX opset release) +- winml-cli ships a specific ORT build → its kMaxSupportedOpset is fixed +- winml-cli's default opset=17 is correct and essential — it is the safe zone for all current ORT builds +- Raising opset requires ensuring the shipping ORT version has kMaxSupportedOpset ≥ target_opset +- Do NOT raise default opset without verifying kMaxSupportedOpset in the shipped ORT

+

Call chain:

+
InferenceSession::Initialize()
+  → TransposeOptimizer::ApplyImpl()         [transpose_optimizer.cc:18]
+      → onnx_transpose_optimization::Optimize()
+          → MakeOptimizerContext()
+              → if opset > kMaxSupportedOpset: return nullopt  ← THE GATE
+
+ +
+

Inconclusive / do not report

+

These show elevated means but cannot be confirmed as regressions given machine variance (p90 = 2–3× p50 throughout): +- ln_only, conv_add, conv_act: each has ≥1 extreme outlier run; other runs are baseline-level +- gelu_only: consistently 49–56ms, possibly a mild regression but no outlier; 3 runs insufficient to separate from drift +- matmul_scale: all 3 runs elevated (51–61ms), but concurrent baseline also drifted (+5ms); net delta ~+8ms, weak signal

+

Do not write these as confirmed regressions in user-facing reports. Label as "inconclusive" or omit.

+
+

Measurement methodology correction (winml eval vs winml perf)

+ + + + + + + + + + + + + + + + + + + + + + + + + +
ToolWhat it measuresLatency for ConvNext CPU
winml eval (no warmup, includes preprocessing)Application-level: model load + HF preprocessing + inference × N~67ms/sample
winml perf --warmup 10 --iterations 50Pure inference: steady-state kernel execution only~43.7ms p50
DifferenceHF preprocessing + JIT warmup overhead~23ms
+

Rule for autoconfig skill: Always use winml perf with --warmup 10 --iterations 50 for latency measurements in experiments. Never use winml eval latency to compare configs.

+
+

Key insight for autoconfig skill

+
    +
  • CPU EP on ConvNext: no extra flag tested improved latency. Baseline (no fusions beyond what ORT L2 applies unconditionally) is optimal.
  • +
  • The only actionable finding is: do not add matmul-add-fusion for ConvNext on CPU (or any model where baseline already uses Gemm).
  • +
  • QNN/DML: not yet tested. Guidance on those EPs requires separate validated experiments.
  • +
+
+

winml analyze gaps discovered

+

These are cases where analyzing the graph before running autoconfig would have prevented wasted search iterations:

+

Gap 1: "Already fused" vs "fuseable" not distinguished +- ConvNext has LayerNormalization as a native op (already fused at PyTorch export) +- layer-norm-fusion targets the decomposed ReduceMean→Sub→... pattern +- winml analyze reports OP/ai.onnx/LayerNormalization without indicating it's already in canonical form +- Impact: user enables layer-norm-fusion thinking it will help; it does nothing (but builds take longer) +- Fix: analyze should tag ops as already_canonical vs fuseable_subgraph

+

Gap 2: DW-Conv not distinguished from regular Conv +- ConvNext has 18×7x7 DW-Conv (group=C) and 4×regular Conv (group=1) +- winml analyze reports all as OP/ai.onnx/Conv (undifferentiated) +- QNN EP supports DW-Conv natively (important for NPU efficiency), but EP support classification is per op type, not per groups value +- Impact: user cannot tell whether Conv ops are the DW or regular variant; EP support may differ +- Fix: analyze should emit OP/ai.onnx/Conv[depthwise] vs OP/ai.onnx/Conv[regular]

+

Gap 3: Transpose-sandwich pattern not detected +- 42 Transpose nodes in ConvNext form a clear Conv→Transpose→LN→...→Transpose repeating pattern +- transpose-optimizer turns this into NHWC chains (good for GPU/NPU, bad for CPU) +- winml analyze reports Transpose as just OP/ai.onnx/Transpose with no structural context +- Impact: user cannot predict whether transpose-optimizer will help or hurt without running it +- Fix: analyze should detect transpose_sandwich_depth: N and emit a warning for CPU EP

+

Gap 4: ORT L2 baseline fusions not surfaced +- After ORT Level 2 optimization (which runs unconditionally), the graph already has fused Gelu, Gemm +- The analyze command runs on the pre-optimize export.onnx, not the actual optimized model +- winml analyze sees 36×MatMul in export.onnx but the real model at inference has 37×Gemm +- Impact: analyze output doesn't reflect what the model actually looks like when running +- Fix: analyze should optionally run on optimized.onnx (post-ORT-L2), not just export.onnx

+

Gap 5: MatMul semantic not classified +- 36 MatMul ops are all MLP dense layers (4C→C or C→4C expansion) +- No attention MatMuls present (ConvNext has no self-attention) +- QNN handles dense-layer MatMul differently from attention-context MatMul +- winml analyze reports OP/ai.onnx/MatMul without semantic classification +- Fix: analyze could detect MatMul role heuristically (shapes: attention = square-ish, MLP = wide fan-out)

+
+

Why skill eval matters

+

Mobius has no skill eval mechanism — it tests models but not skills themselves. This is a gap. +A SKILL.md can have correct content but still cause the agent to give wrong guidance if the +trigger description is poorly written or the structure is confusing. Skill eval catches this.

+

Two eval dimensions

+ + + + + + + + + + + + + + + + + + + + +
DimensionWhat it checksWhen to run
Static (content quality)description trigger phrases, command accuracy, cross-reference validityEvery PR that modifies a SKILL.md
Dynamic (agent behavior)Given a user scenario + skill injected, does the agent produce the right commands and diagnosis?On significant content changes; periodically
+

Static eval = the review checklist in contributing-a-skill. +Dynamic eval = test cases in evals/eval.yaml per skill, run with winml skill eval.

+

winml skill — new CLI subcommand

+

The eval system is built into winml-cli itself as a new skill subcommand. +This keeps the toolchain self-contained and enables CI integration without external dependencies.

+

Command surface:

+
winml skill check  [--skill <name>]   # static: lint + auto-verify all commands in SKILL.md
+winml skill gen-evals [--skill <name>] # auto-research: generate eval.yaml from SKILL.md content
+winml skill eval   [--skill <name>]   # dynamic: run agent behavior tests
+winml skill list                      # list all skills with pass/fail status
+
+ +

winml skill check — auto-research via command extraction

+

This is the "code change that does auto research":

+
    +
  1. Parse SKILL.md — extract every code block containing winml <command> patterns
  2. +
  3. Verify flags exist — run winml <command> --help and check each flag is present
  4. +
  5. Verify cross-references — confirm every .agents/skills/<name>/SKILL.md path exists
  6. +
  7. Verify trigger coverage — count quoted phrases in description frontmatter (must be ≥3)
  8. +
  9. Optionally run commands — with --dry-run-commands, execute each command on a + canary model to verify it doesn't crash
  10. +
+

Example output:

+
winml skill check --skill debug-accuracy-drop
+
+Checking debug-accuracy-drop...
+  ✓ description: 4 trigger phrases found
+  ✓ winml eval --mode compare     [flag verified against eval --help]
+  ✓ winml analyze -m ... --ep qnn [flag verified against analyze --help]
+  ✗ winml perf --monitor          [flag '--monitor' not found in perf --help]  ← STALE
+  ✓ cross-ref: ep-compatibility-check/SKILL.md exists
+  ✗ cross-ref: validate-before-ship/SKILL.md [file missing]  ← BROKEN LINK
+Summary: 2 issues found
+
+ +

Key insight: every time winml-cli flags change, winml skill check automatically +detects which skills have stale commands — no manual audit needed.

+

Implementation sketch (src/winml/modelkit/commands/skill.py):

+
import re, subprocess
+from pathlib import Path
+import click
+
+SKILLS_DIR = Path(__file__).parents[5] / "skills"
+WINML_CMD_PATTERN = re.compile(r'^\s*(winml\s+\w[\w\-]*\s+[^\n]+)', re.MULTILINE)
+
+def extract_commands(skill_md: str) -> list[str]:
+    """Extract all 'winml <subcommand> ...' lines from code blocks."""
+    in_block = False
+    commands = []
+    for line in skill_md.splitlines():
+        if line.strip().startswith("```"):
+            in_block = not in_block
+        elif in_block and line.strip().startswith("winml "):
+            commands.append(line.strip())
+    return commands
+
+def verify_flag(command_line: str) -> tuple[bool, str]:
+    """Check flags in a command line exist in --help output."""
+    parts = command_line.split()
+    subcommand = parts[1]
+    flags = [p for p in parts[2:] if p.startswith("--")]
+    result = subprocess.run(["winml", subcommand, "--help"],
+                            capture_output=True, text=True)
+    help_text = result.stdout
+    for flag in flags:
+        if flag not in help_text:
+            return False, f"flag '{flag}' not found in {subcommand} --help"
+    return True, "ok"
+
+@click.group("skill")
+def skill_cmd():
+    """Manage and evaluate winml-cli skills."""
+
+@skill_cmd.command("check")
+@click.option("--skill", default=None, help="Skill name to check (default: all)")
+@click.option("--dry-run-commands", is_flag=True, help="Execute commands on canary model")
+def check(skill, dry_run_commands):
+    """Static check: verify commands and cross-references in SKILL.md files."""
+    targets = [SKILLS_DIR / skill] if skill else list(SKILLS_DIR.iterdir())
+    for skill_dir in targets:
+        skill_md = (skill_dir / "SKILL.md").read_text()
+        for cmd in extract_commands(skill_md):
+            ok, msg = verify_flag(cmd)
+            status = "✓" if ok else "✗ STALE"
+            click.echo(f"  {status}  {cmd[:60]}")
+
+ +

winml skill gen-evals — LLM-powered eval case generation

+

Auto-generates evals/eval.yaml from SKILL.md content using an LLM:

+
    +
  1. Extract trigger phrases from description frontmatter
  2. +
  3. Extract symptom→fix tables from SKILL.md sections
  4. +
  5. Prompt an LLM to generate (user scenario, expected commands) pairs
  6. +
  7. Write evals/eval.yaml in PromptFoo format
  8. +
+

This is "auto research": the LLM reads the skill and generates adversarial cases +that challenge the agent — including negative cases where the agent should NOT +recommend something.

+
winml skill gen-evals --skill debug-accuracy-drop --model gpt-4o --count 5
+# Writes: skills/debug-accuracy-drop/evals/eval.yaml (auto-generated)
+# Human review before committing
+
+ +

The generated eval.yaml is a starting point — contributors review and refine before +committing. Over time, real user questions (from GitHub issues) can be mined and +added as additional eval cases.

+

winml skill eval — agent behavior testing

+

Runs the eval cases and reports results:

+
winml skill eval --skill debug-accuracy-drop
+# Uses evals/eval.yaml + injects SKILL.md as system prompt
+# Reports pass/fail per test case
+
+ +

Internally shells out to PromptFoo (if installed) or uses a lightweight built-in runner +that calls the configured LLM API directly.

+

Directory layout

+

Each skill carries its own eval cases:

+
skills/
+  debug-accuracy-drop/
+    SKILL.md
+    evals/
+      eval.yaml     ← agent behavior test cases (hand-written or gen-evals output)
+
+ +

eval.yaml format (PromptFoo)

+
# skills/debug-accuracy-drop/evals/eval.yaml
+description: "Agent behavior eval for debug-accuracy-drop skill"
+
+prompts:
+  - "{{user_message}}"
+
+providers:
+  - id: openai:gpt-4o
+    config:
+      systemPrompt: |
+        You are a WinML CLI assistant. Use the following skill:
+        ---
+        {{skill_content}}
+
+tests:
+  - description: "Low cosine after W8A8  should isolate to quantize stage"
+    vars:
+      user_message: "I quantized my model to W8A8 and cosine similarity is 0.87. What's wrong?"
+    assert:
+      - type: contains
+        value: "winml eval --mode compare"
+      - type: icontains
+        value: "quantize"
+      - type: icontains
+        value: "w8a16"              # should suggest escalating precision
+
+  - description: "NPU vs CPU discrepancy  should point to op fallback"
+    vars:
+      user_message: "My model gives different results on QNN NPU vs CPU after compile"
+    assert:
+      - type: contains
+        value: "winml analyze"
+      - type: icontains
+        value: "partial"            # mention partial op fallback
+      - type: icontains
+        value: "compile"            # blame compile stage, not quantize
+
+  - description: "Drop after optimize only  should NOT blame calibration"
+    vars:
+      user_message: "cosine similarity dropped after winml optimize, I haven't quantized yet"
+    assert:
+      - type: contains
+        value: "winml eval --mode compare"
+      - type: icontains
+        value: "optimize"
+      - type: not-icontains
+        value: "calibration"        # calibration is irrelevant here
+
+ +

Minimum eval cases per skill

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
SkillMin casesKey assertions
check-model-feasibility4Screens candidates with winml inspect (never recommends an unsupported model); recommends the 3-layer check in order; gives fallback when EP absent
debug-accuracy-drop4Correctly isolates pipeline stage; suggests precision escalation
ship-to-winapp4Lists all 6 validation gates; handles waiver scenario; produces manifest.json with CPU fallback
autoconfig3Applies latency-budget vs accuracy-floor framework (manual mode); keeps/discards by objective (auto mode)
adding-model-support2Suggests L1→L5 order; correct recipe structure
contributing-a-skill2Flags missing trigger phrases; flags pseudocode commands
+

What "passing" means

+

An eval case passes when all assertions hold. Recommended pass threshold before merging: +- All contains / icontains assertions pass +- All not-icontains (negative) assertions pass (agent does NOT give wrong advice)

+

The negative assertions are the most valuable — they catch the agent confidently giving +wrong guidance (e.g., blaming calibration for an optimize-stage drop).

+

Running evals

+
# Install PromptFoo
+npm install -g promptfoo
+
+# Run eval for a single skill
+cd skills/debug-accuracy-drop
+promptfoo eval --config evals/eval.yaml
+
+# Run all skill evals
+for dir in skills/*/; do
+  if [ -f "$dir/evals/eval.yaml" ]; then
+    promptfoo eval --config "$dir/evals/eval.yaml"
+  fi
+done
+
+ +
+

Implementation notes

+

Directory structure

+
skills/
+  use-winml-cli/              ← existing, extend (user)
+    SKILL.md
+    evals/eval.yaml
+  check-model-feasibility/    ← new (user — model discovery + EP/device compatibility)
+    SKILL.md
+    evals/eval.yaml
+  debug-accuracy-drop/        ← new (user)
+    SKILL.md
+    evals/eval.yaml
+  autoconfig/                 ← new (user — optimize: autoresearch loop + manual framework)
+    SKILL.md
+    evals/eval.yaml
+  ship-to-winapp/             ← new (user — validation gates + multi-EP packaging; partial dep on winml package feature)
+    SKILL.md
+    evals/eval.yaml
+  adding-model-support/       ← new (contributor)
+    SKILL.md
+    evals/eval.yaml
+  adding-ep-support/          ← new (contributor)
+    SKILL.md
+    evals/eval.yaml
+  contributing-a-skill/       ← new (contributor)
+    SKILL.md
+    evals/eval.yaml
+  optimization-research/      ← new (contributor — internal deep gap analysis for winml-cli team)
+    SKILL.md
+    templates/olive_qnn.json
+    templates/olive_dml.json
+    evals/eval.yaml
+
+ +

Priority order for implementation

+

This is implementation sequencing (risk- and dependency-driven), which intentionally differs from +the importance ranking in the Overview. Importance answers "which skill matters most to users"; +this answers "which is safest to build first." Example: autoconfig is the #1 importance user skill +but ships last because it depends on the --format json changes and is the most complex.

+

Code changes first (unblocks agentic skill execution): +0. winml eval --format json — critical: enables all accuracy-related agentic flows +0. winml analyze --format json — enables EP compatibility agentic flows +0. winml perf --format json — enables performance SLA agentic flows

+

User skills: +1. check-model-feasibility — lowest risk, pure existing commands (inspect/sys/analyze); front door for new users (model discovery half needs analyze --format json) +2. debug-accuracy-drop — closes clearest pain point, existing eval --mode compare +3. ship-to-winapp — validation checklist + packaging; build it once the gate commands exist (partial dep on winml package feature) +4. autoconfig — depends on #847/#848/#849 + most complex skill to implement (manual mode can ship first as the lightweight framework)

+

Contributor skills: +5. contributing-a-skill — enables community contributions to the skill ecosystem +6. adding-model-support — most impactful for model coverage growth +7. adding-ep-support — lower frequency, but needed for new EP onboarding +8. optimization-research — internal gap-finder; depends on a working autoconfig baseline to compare against

+

Required code changes for agentic skill execution

+

The three changes that turn skills from documentation into agentic programs:

+

1. winml eval --format json

+

File: src/winml/modelkit/commands/eval.py

+

Add --format option and emit structured JSON to stdout:

+
{
+  "mode": "compare",
+  "model": "path/to/quantized.onnx",
+  "model_id": "microsoft/resnet-50",
+  "metrics": {
+    "cosine_similarity": 0.87,
+    "sqnr_db": 28.3,
+    "psnr_db": 31.1,
+    "max_abs_diff": 0.042
+  },
+  "task_metric": { "top1_accuracy": 0.741 },
+  "threshold_pass": false
+}
+
+ +

2. winml analyze --format json

+

File: src/winml/modelkit/commands/analyze.py

+

Already supports --output file.json. Add --format json to also print to stdout +(mirrors pattern from winml inspect and winml sys):

+
{
+  "ep": "qnn",
+  "model": "path/to/model.onnx",
+  "summary": { "supported": 142, "partial": 3, "unsupported": 1 },
+  "partial_ops": ["MultiHeadAttention", "LayerNorm", "Softmax"],
+  "unsupported_ops": ["CustomRotaryEmbedding"]
+}
+
+ +

3. winml perf --format json

+

File: src/winml/modelkit/commands/perf.py

+

Already writes JSON to file via -o. Add --format json stdout output:

+
{
+  "model": "path/to/model.onnx",
+  "ep": "qnn",
+  "device": "npu",
+  "iterations": 100,
+  "latency_ms": { "p50": 18.3, "p90": 21.7, "p99": 28.4, "mean": 18.9 },
+  "throughput_rps": 54.6
+}
+
+ +

These three changes are ~50 lines of code each, follow the existing pattern from +winml inspect --format json and winml sys --format json, and unlock the full +agentic execution model for all consumer skills.

+

Sizing estimate (per skill)

+

Each SKILL.md based on Mobius patterns (~8–14KB): +- ~200 lines prose + decision tables +- ~50 lines code examples +- Cross-reference section

+

Relationship to existing use-winml-cli skill

+

The new skills are task-scoped (problem → solution) vs the existing skill which is +tool-scoped (here's what each command does). They complement, not replace each other. +The existing skill should add cross-references to the new skills in its "Common patterns" section.

+
+

QNN NPU Catalog Sweep — Findings & Feature Gaps (2026-06-13)

+

Source: 8-model catalog sweep via autoconfig POC (C:\tmp\autoconfig-demo\catalog_qnn_sweep.py)

+

Cross-model results

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ModelArchBaseline p50Best p50GainBest config
microsoft/resnet-18resnet0.96ms0.96msbaseline (opset17)
google/vit-base-patch16-224vit9.04ms9.04msbaseline (opset17)
apple/mobilevit-smallmobilevit12.07ms8.62ms+29%opset21+conv_fusions
facebook/dinov2-smalldinov26.56ms4.98ms+24%opset21
hustvl/yolos-smallyolos78.69mstimeout
distilbert SST-2distilbert19.48ms19.48msbaseline
all-MiniLM-L6-v2bert5.81ms5.81msbaseline
deepset/roberta-base-squad2roberta14.94ms14.72ms1.5%opset21
+

Validated KB findings

+

npu-001 refined: opset21 benefit is architecture-gated: +- ✅ Conv + residual connections: +25–31% (mobilevit, dinov2, convnext) +- ❌ Pure transformer (ViT, YOLOS): -7% or neutral +- ⚪ NLP BERT-family: neutral

+

npu-006 NEW — CRITICAL: Conv fusions (conv-bn/add/activation) cause catastrophic QNN NPU CPU fallback +- ResNet-18 with conv fusions: 0.96ms → 132ms (+4900% regression) +- MobileViT: safe (no regression) +- Severity: critical — can produce 50x+ regression silently

+

npu-007 NEW: DVFS thermal noise makes CV gate unreliable on QNN NPU +- New bench protocol: 3 sessions × 500 iters + 30s cool-down + median p50 + >10% noise floor

+

Feature gaps (winml-cli backlog items)

+

Gap A: winml analyze — Conv fusion QNN safety check +winml analyze should detect Conv-dominant topologies and warn when conv-bn/add/activation +fusions are configured for QNN NPU target. Currently no pre-build detection of this hazard. +- Command to add: warning in analyze output when ep=qnn AND conv_fusion_pass is enabled AND model has >N Conv ops +- Priority: HIGH (silent 50x regression risk)

+

Gap B: budget-aware sweep in autoconfig +Large models (YOLOS, ~78ms/inf) cause sweep timeout with current fixed budget. +Need: per-hypothesis time estimation → auto-skip models that exceed budget, log as "timeout" not failure. +- Affects: autoconfig POC and any future winml sweep command

+

Gap C: winml perf DVFS-aware session averaging +winml perf should natively support session-level median aggregation for QNN NPU. +Current single-session variance is dominated by DVFS thermal state, not model performance. +- Flag proposal: --sessions 3 --cool-down 30 --signal median-p50 +- This would make winml perf output trustworthy for optimization decisions on Snapdragon X Elite

+
+

Feature Request: FusedConv detection + unfuse-for-qnn (2026-06-15)

+

Problem

+

用户可能从外部拿到一个已经做过 Conv fusion 的 ONNX 模型,或者 autoconfig 实验里开了 conv-add-activation-fusion flag。 +这类模型在 QNN NPU 上跑起来特别慢(ResNet-18 实测 +4900% regression),但没有任何报错,用户完全不知道原因。

+

Root cause

+

conv-add-activation-fusion 生成的是 ORT 扩展 op FusedConv(非标准 ONNX op)。 +QNN EP 不认识这个 op,所有 FusedConv 节点全部 fallback 到 CPU,PCIe round-trip 开销极大。

+

conv-bn-fusion 不同:它把 BN 参数数学吸收进 Conv weight,不产生新 op 类型,结果仍是标准 Conv,不可逆

+

Proposed feature

+

1. winml analyze — FusedConv detection

+

winml analyze -m model.onnx --ep qnn 扫描图中所有节点, +如果发现 FusedConv 节点且目标 EP 为 QNN,输出警告:

+

⚠ QNN NPU: 23 FusedConv nodes detected. + FusedConv is an ORT-internal op not supported by QNN EP — these nodes will fall back to CPU. + Recommend: run winml optimize --unfuse-conv to expand back to standard ONNX ops.

+

2. winml optimize --unfuse-conv

+

新增 optimize pass:把 FusedConv 节点拆回 Conv + Add + 。 +- Lossless(权重不变,只拆 op 结构) +- 输出标准 ONNX,QNN EP 可正常映射 HTP kernel +- 适用场景:BYOM 用户带入已做过 fusion 的模型

+

Implementation notes +- 检测: +ode.op_type == "FusedConv" 即可定位 +- 拆分:读 FusedConv attribute ctivation 字段 → 插入对应 Relu/Sigmoid/Tanh 节点 +- 不处理 conv-bn-fusion 产生的模型(那个无法反向,只能重新从 FP32 export)

+

Priority

+

MEDIUM — 默认 flag 是关的,不是高频路径,但对 BYOM 场景(拿到别人优化过的模型)有实际价值。

+
+
+ + diff --git a/research/autoconfig/docs/skills-design.md b/research/autoconfig/docs/skills-design.md index 1ad4b5d6c..bdfe23c99 100644 --- a/research/autoconfig/docs/skills-design.md +++ b/research/autoconfig/docs/skills-design.md @@ -2,31 +2,59 @@ ## Overview -This document defines the design for 11 skills to be added to `skills/` in winml-cli. -Skills are split into three audiences: +This document defines the design for 9 skills to be added to `skills/` in winml-cli. +Skills are split into **two categories by the single question: does the task require editing repo code?** -- **Consumer skills (7)** — for WinApp developers deploying models -- **Contributor skills (3)** — for engineers extending winml-cli itself -- **Internal research skills (1)** — for winml-cli team to find optimization gaps and backlog items +- **User skills (5)** — the user reaches their goal purely by specifying conditions and letting + winml-cli produce or modify a `config.json` / `manifest.json` / report. **No source code is touched.** + Audience: WinApp developers and ISVs deploying models. +- **Contributor skills (4)** — the task requires a winml-cli source-code change (a new exporter, a new + EP backend, a new skill), or exists specifically to produce code-change backlog. Audience: winml-cli engineers. + +> Discriminator: if the deliverable is a config/manifest/report, it is a **User** skill. If completing it +> requires editing code in the repo (or its whole purpose is to drive such edits), it is a **Contributor** skill. Each skill follows the SKILL.md frontmatter convention (`name:`, `description:`) established by Mobius, NVIDIA Model-Optimizer, and Google LiteRT-CLI as the de facto standard. -### Consumer skill dependency graph +### User skills — ranked by importance + +| Rank | Skill | Why it ranks here | Output (no code) | +|---|---|---|---| +| 1 | `autoconfig` | Flagship. Autonomously searches the config space and delivers the optimal `config.json` per EP. Also hosts the **manual optimize path** (precision-ladder + latency/accuracy-budget decision framework + hardware table) for users who want to choose by hand or have no target hardware. Maps to all five user scenarios (S1–S5). | `config__optimal.json` + `report.html` | +| 2 | `check-model-feasibility` | Pre-build front door, merging model discovery + EP/device compatibility: "find me a *supported* model from my constraints, then confirm it runs on my hardware." The single "what do I run, and will it run?" gate (`inspect` → `sys` → `analyze`). Highest frequency — every user hits it before building. | model shortlist + go/no-go + fallback EP | +| 3 | `debug-accuracy-drop` | Closes the most acute pain point: accuracy dropped, cause unknown. High-frequency diagnostic need with the clearest existing tooling (`eval --mode compare`). | stage + root cause + fix | +| 4 | `ship-to-winapp` | Ship-time skill, merging validation + packaging: L1–L5 Definition-of-Done gates **plus** multi-EP artifact layout, `manifest.json`, and runtime EP selection. Everything between "the model is good" and "it's running in the app." | pass/fail report + `manifest.json` | +| 5 | `use-winml-cli` | General tool-scoped onboarding reference (existing). Foundational but low differentiation vs the task-scoped skills above. | command reference | + +### Contributor skills — ranked by importance + +| Rank | Skill | Why it ranks here | Code touched | +|---|---|---|---| +| 1 | `adding-model-support` | Directly grows model coverage — the core long-tail business problem (ISV onboarding, S2/S5). Highest contribution frequency. | new exporter + recipe | +| 2 | `optimization-research` | High leverage: deep-searches ORT/Olive/ecosystem to find gaps and file the backlog that drives every other contributor skill. Internal, but sets the roadmap. | files issues + repro (drives code changes) | +| 3 | `adding-ep-support` | Onboards a new execution-provider backend. Infrequent, but high value the moment a new NPU vendor lands. | compile backend + EP registry | +| 4 | `contributing-a-skill` | Meta-tooling: how to author, lint, and eval a SKILL.md. Sustains the ecosystem but is supporting infrastructure, not a direct model/EP/perf deliverable. | `SKILL.md` + evals | + +> The detailed `## Skill:` sections below appear in document order, not priority order. Importance is +> defined by the two ranked tables above; implementation sequencing (risk/dependency-driven) is in +> [Priority order for implementation](#priority-order-for-implementation). + +### User skill dependency graph ``` -ep-compatibility-check ──┐ - ├──► optimize-for-device ──┐ -use-winml-cli ────────────┤ ├──► validate-before-ship - └──► debug-accuracy-drop ───┤ - │ -prepare-for-winapp ────────────────────────────────────┘ +check-model-feasibility ──► autoconfig ──────────► ship-to-winapp + find a supported model optimize the model validate (L1–L5 gates) + + confirm EP/device runs (automated autoresearch + package multi-EP artifacts + loop OR manual framework) + manifest + runtime EP selection + │ │ ▲ + └──────────► debug-accuracy-drop ───────────────────┘ + (diagnose accuracy drops at any stage) -autoconfig ────────────────────────────────────────────► validate-before-ship - (autoresearch loop: finds optimal config for user-defined EP/accuracy/latency targets) +use-winml-cli ── general command reference; underpins every step above ``` -### Internal research skill +### Contributor research skill ``` optimization-research ──► [GitHub issues / winml backlog] @@ -140,7 +168,7 @@ full diagnostic in one turn. ## Validation confidence levels (L1–L5) -Inspired by Mobius `writing-tests`. Applied in `validate-before-ship` as the Definition-of-Done backbone. +Inspired by Mobius `writing-tests`. Applied in `ship-to-winapp` as the Definition-of-Done backbone. Each level is checked **independently** — a model can pass L3 without passing L2. | Level | Name | What it verifies | Key command | @@ -160,7 +188,7 @@ Each level is checked **independently** — a model can pass L3 without passing | W8A8 | cosine_similarity ≥ 0.90 (or task-specific) | Waivers: any level that cannot be verified must be documented with a reason and tracking issue. -The `validate-before-ship` skill maps each of its 6 gates to an L-level. +The `ship-to-winapp` skill maps each of its 6 validation gates to an L-level. --- @@ -368,7 +396,7 @@ ModelOpt (HF nvidia/ org), AI Hub (500+ models), NNCF (Model Zoo with accuracy t --- -## Skill 1: `use-winml-cli` (existing — extend) +## Skill: `use-winml-cli` (existing — extend) **Status:** Exists at `skills/use-winml-cli/SKILL.md`. Needs two additions: - Add `winml run` and `winml serve` usage (currently missing) @@ -378,72 +406,7 @@ No structural changes needed; the existing skill is the general entry point. --- -## Skill 2: `optimize-for-device` - -### Frontmatter -```yaml -name: optimize-for-device -description: > - Use this skill when a user wants the best performance for their model on a - specific Windows device, or wants to compare latency/accuracy tradeoffs across - quantization levels (FP16, W8A16, W8A8) and execution providers (QNN NPU, - DirectML GPU, CPU). Covers the precision sweep workflow, hardware-specific - recommendations, and how to read tradeoff results to make a deployment decision. - Use when the user says "make it faster", "which precision should I use", "is NPU - worth it", or asks to compare hardware. -``` - -### When to use -- "I want to run this on NPU, how much faster will it be?" -- "Which quantization should I pick?" -- "Compare QNN vs DirectML vs CPU for my model" -- "Is W8A8 accurate enough for my use case?" - -### Sections - -**1. The decision framework** -Two inputs: latency budget OR accuracy budget. Decision tree: -- Have a latency SLA (e.g. <50ms)? → Find highest accuracy within that budget -- Have an accuracy floor (e.g. <2% drop)? → Find fastest within that floor - -**2. The precision ladder** -Table: FP32 → FP16 → W8A16 → W8A8, with typical speedup and accuracy-drop ranges -per model family (Encoder/BERT-like, Vision/ConvNet, Transformer/ViT). - -**3. The sweep workflow** -Step-by-step: run `winml build` + `winml eval` + `winml perf` for each precision, -collect into a tradeoff table, apply decision framework. - -Key commands: -```bash -winml config -m --device --precision fp16 -o config_fp16.json -winml build -c config_fp16.json -m -o out_fp16/ -winml eval -m out_fp16/.onnx --model-id -winml perf -m out_fp16/.onnx --device --iterations 50 -# repeat for w8a16, w8a8 -``` - -**4. Hardware-specific guidance table** -| Device | Best EP | Sweet-spot precision | Notes | -|---|---|---|---| -| Snapdragon X Elite NPU | QNN | W8A16 | HTP native for W8A16; W8A8 risky for Attention | -| Intel Core Ultra NPU | OpenVINO | W8A8 | OpenVINO PTQ handles INT8 well | -| AMD Ryzen AI NPU | VitisAI | W8A8 | Phoenix/Hawk Point prefer INT8 | -| Any GPU | DirectML | FP16 | FP16 sufficient; quantization rarely helps on GPU | -| CPU fallback | CPU | W8A8 | Size + latency both benefit | - -**5. Reading the output** -How to interpret `winml eval` cosine_similarity, SQNR, and `winml perf` p50/p90/p99. -What values indicate "acceptable" vs "needs investigation". - -**Cross-references:** -- If accuracy dropped unexpectedly → `debug-accuracy-drop` -- If EP not available → `ep-compatibility-check` -- After choosing a precision → `validate-before-ship` - ---- - -## Skill 3: `debug-accuracy-drop` +## Skill: `debug-accuracy-drop` ### Frontmatter ```yaml @@ -525,37 +488,146 @@ W8A8 → W8A16 → FP16 → FP32 Stop at the first precision that meets accuracy requirements. **Cross-references:** -- To compare precision options systematically → `optimize-for-device` -- If op is listed as unsupported → `ep-compatibility-check` +- To compare precision options systematically → `autoconfig` (manual or automated optimize) +- If op is listed as unsupported → `check-model-feasibility` --- -## Skill 4: `prepare-for-winapp` +## Skill: `ship-to-winapp` (merge of `validate-before-ship` + `prepare-for-winapp`) + +Covers the whole ship-time phase: **first validate** the model meets the Definition-of-Done, +**then package** the multi-EP artifacts and manifest for the WinApp to load at runtime. ### Frontmatter ```yaml -name: prepare-for-winapp +name: ship-to-winapp description: > - Use this skill when a WinApp developer needs to take winml-cli build artifacts - and integrate them into a Windows application. Covers how to organize multi-EP - artifacts (QNN/NPU, DirectML/GPU, CPU fallback), the recommended directory - layout and manifest structure for runtime EP selection, how to load models - using the Windows ML WinRT API or ONNX Runtime C++ API, and runtime EP - detection and fallback patterns. Use when the user asks "how do I use this - in my app", "how do I package the model", or "what file do I load at runtime". + Use this skill when taking a winml-cli model artifact the last mile into a Windows + application — both validating it is good enough to ship and packaging it for the app. + Validation half: a Definition-of-Done checklist covering artifact completeness, accuracy + vs FP32 baseline, performance SLA, output correctness on real inputs, cross-EP consistency, + and fallback chain (every item checked or explicitly waived). Packaging half: how to organize + multi-EP artifacts (QNN/NPU, OpenVINO, VitisAI, DirectML/GPU, CPU fallback), the recommended + directory layout and manifest.json for runtime EP selection, and the runtime EP detection / + fallback pattern. Use when the user says "I'm ready to ship", "what should I test before + release", "how do I know the model is good enough", "how do I use this in my app", + "how do I package the model", or "what file do I load at runtime". ``` ### When to use +- About to ship a WinApp with on-device inference; final QA gate before production +- After any build config change (new quantization, new EP, new model version) - "I built the model, how do I ship it in my app?" -- "How do I load different models for different hardware?" -- "What happens when the user doesn't have an NPU?" +- "How do I load different models for different hardware / what happens with no NPU?" - "How do I package QNN + DML + CPU variants together?" -### Sections +--- + +### Part A — Validate (Definition-of-Done gates) + +**The checklist** + +**Gate 1 — Artifact completeness** +- [ ] All target EP artifacts exist and are loadable +- [ ] CPU fallback artifact exists +- [ ] manifest.json (if using multi-EP layout) is valid and references existing files +- [ ] Artifact was built with `winml build` (not opaque cache artifact) + +```bash +winml inspect -m .onnx # verify each artifact loads +``` + +**Gate 2 — Accuracy vs FP32 baseline** +- [ ] cosine_similarity ≥ 0.99 for FP16 artifacts +- [ ] cosine_similarity ≥ 0.95 for W8A16 artifacts +- [ ] cosine_similarity ≥ 0.90 for W8A8 artifacts (or task-specific threshold) +- [ ] Task accuracy metric (Top-1, F1, mAP) within acceptable drop from FP32 + +```bash +winml eval --mode compare -m .onnx --model-id +winml eval -m .onnx --model-id # task accuracy +``` + +**Gate 3 — Performance SLA** +- [ ] p50 latency meets application target on target device +- [ ] p99 latency within 2x p50 (no outlier spikes) +- [ ] Benchmark run on actual target hardware (not developer machine) + +```bash +winml perf -m .onnx --device --iterations 100 --monitor +``` + +**Gate 4 — Output correctness on real inputs** +- [ ] Model produces correct output on ≥3 representative real-world inputs +- [ ] No NaN or Inf in outputs +- [ ] Output shape matches expected shape + +```bash +winml run -m .onnx --file # visual/manual check +``` + +**Gate 5 — Cross-EP consistency (if shipping multiple EP variants)** +- [ ] QNN and DML outputs agree within tolerance on same input +- [ ] CPU fallback output agrees with primary EP within tolerance + +```bash +winml run -m model_qnn.onnx --file sample.jpg --format json -o qnn_out.json +winml run -m model_dml.onnx --file sample.jpg --format json -o dml_out.json +winml run -m model_cpu.onnx --file sample.jpg --format json -o cpu_out.json +# compare qnn_out.json vs dml_out.json vs cpu_out.json manually +``` + +**Gate 6 — Fallback chain** +- [ ] CPU fallback artifact verified independently (not just assumed to work) +- [ ] App runtime selects correct artifact when target EP is absent (simulate by removing EP) + +**Waiver policy** +Any item that cannot be completed must be waived explicitly: +``` +Waivers: +- Cross-EP consistency: VitisAI not available on developer machine. + Verified on target hardware by QA team. Issue #NNN. +- Performance SLA: Target hardware (Snapdragon X Elite) in procurement. + Benchmark deferred to post-merge, tracked in issue #NNN. +``` +Unchecked items without waiver → do not ship. + +**L-level mapping** — the 6 gates map directly to the L1–L5 confidence system (see Overview): + +| Gate | L-level | +|---|---| +| Gate 1 — Artifact completeness | L1 | +| Gate 2 — Accuracy vs FP32 baseline | L3 + L4 | +| Gate 3 — Performance SLA | L5 | +| Gate 4 — Output correctness on real inputs | L4 | +| Gate 5 — Cross-EP consistency | L5 | +| Gate 6 — Fallback chain | L1 (CPU artifact) | + +Minimum to ship: L1 + L3 all passing. L4 + L5 required for production release. + +**Quick command reference** +```bash +# Gate 1: inspect all artifacts +for f in model_qnn.onnx model_dml.onnx model_cpu.onnx; do winml inspect -m $f; done +# Gate 2: accuracy +winml eval --mode compare -m .onnx --model-id +winml eval -m .onnx --model-id +# Gate 3: perf +winml perf -m .onnx --device auto --iterations 100 --monitor +# Gate 4: real input +winml run -m .onnx --file +# Gate 5: cross-EP (run individually, compare outputs) +winml run -m model_qnn.onnx --file --format json +winml run -m model_dml.onnx --file --format json +``` + +--- + +### Part B — Package & integrate (multi-EP) **1. The multi-EP artifact problem** -Explain why `winml compile` produces EP-locked files (not portable), -so a WinApp needs a strategy to select the right file per device. +`winml compile` produces EP-locked files (not portable), so a WinApp needs a strategy to +select the right file per device. **2. Recommended artifact layout** ``` @@ -584,6 +656,7 @@ my_model/ "selection_order": ["qnn", "openvino", "vitisai", "dml", "cpu"] } ``` +(For multi-EP artifacts, `autoconfig` emits this `manifest.json` directly with experiment provenance.) **4. Building all variants with winml-cli** ```bash @@ -614,53 +687,88 @@ Pseudocode for app-side logic: - Don't ship only the compiled artifact without a CPU fallback **Cross-references:** +- If accuracy gate fails → `debug-accuracy-drop` +- If performance gate fails → `autoconfig` (manual or automated optimize path) +- If EP not available for testing, or to pick the right EP → `check-model-feasibility` - To build the artifacts → `use-winml-cli` -- To verify each artifact → `validate-before-ship` --- -## Skill 5: `ep-compatibility-check` +## Skill: `check-model-feasibility` (merge of `find-a-model` + `ep-compatibility-check`) + +The pre-build front door. Two entry points, one shared engine (`inspect` → `sys` → `analyze`): +**(A)** the user has no model yet → recommend a *supported* one from their constraints; +**(B)** the user has a model → confirm it runs on their target EP/device. Both converge on the +same three-layer check, so they are one skill. ### Frontmatter ```yaml -name: ep-compatibility-check +name: check-model-feasibility description: > - Use this skill to determine whether a specific model will work on specific - Windows hardware before starting a full build. Covers winml inspect for model - support verification, winml sys for EP availability on the current machine, - winml analyze for operator-level EP compatibility, and the EP-to-hardware - mapping for Windows AI PCs. Use when the user asks "will this work on my - device", "is QNN supported here", "what hardware do I need for NPU", or - when they get an unsupported operator error. + Use this skill before a full build, to answer two linked questions: "which model should I + use?" and "will it run on my hardware?". Model discovery: when the user knows the task + (image classification, text embedding, object detection, summarization, …) but has no model + yet, gather their constraints, generate Hugging Face candidates, and screen each one for + winml-cli support. Compatibility: for a chosen (or candidate) model, run the three-layer check + — winml inspect (model support), winml sys (EP availability on this machine), winml analyze + (operator-level EP coverage) — plus the EP-to-hardware mapping and fallback chain for Windows + AI PCs. Use when the user says "what model should I use for X", "find me a model that runs + under 20ms on the NPU", "recommend a small image classifier", "I don't have a model yet", + "will this work on my device", "is QNN supported here", "what hardware do I need for NPU", + or when they hit an unsupported-operator error. + +audience: external (WinApp developers) ``` ### When to use -- "Will this model work on my Snapdragon X Elite laptop?" -- "I don't know if my machine has a QNN EP" -- "The compile step failed with unsupported op" -- Starting a new project: verify feasibility before investing build time +- "What model should I use for background blur / OCR / summarization?" +- "Find a text-embedding model under 100MB that runs on the Intel NPU" +- "Will this model work on my Snapdragon X Elite laptop? Is QNN supported here?" +- "The compile step failed with an unsupported op" +- Starting a new project: pick a model and verify feasibility before investing build time + +### What this skill does NOT do +- It does not train, fine-tune, or optimize a model — optimization hands off to `autoconfig`. +- It only recommends models whose architecture winml-cli can actually export/run (verified via + `winml inspect`), never an arbitrary HF model it cannot load. ### Sections -**1. Three-layer compatibility check** -Layer 1 — Model support: does winml-cli know this model type? -Layer 2 — EP availability: is the target EP registered on this machine? -Layer 3 — Operator coverage: does the target EP support all ops in this model? +**1. Two entry points** +- (A) **No model yet** → run Section 2 (discovery) to produce candidates, then Section 3 on each. +- (B) **Have a model** → skip to Section 3 (three-layer check) directly. -Each layer has a command; run in order, stop at first failure. +**2. Discovery — find candidate models (entry point A)** +Capture and lock the selection constraints first: -**2. Layer 1: Model support** +| Condition | Example | Drives | +|---|---|---| +| Task | image-classification, feature-extraction, text-generation | HF Hub filter | +| Target device / EP | Snapdragon X NPU (QNN), Intel NPU (OpenVINO), any GPU (DML) | feasibility + latency class | +| Latency budget | p50 ≤ 20 ms | size / architecture shortlist | +| Accuracy need | "≥ ResNet-50 top-1" or a benchmark floor | candidate quality bar | +| Size limit | ≤ 100 MB on disk | excludes large variants | +| License | permissive (Apache-2.0 / MIT) | excludes restricted models | + +The agent queries the HF Hub by task, sorted by downloads/likes, restricted to architecture +families winml-cli is known to support → a 5–10 model shortlist. Each candidate then goes +through the three-layer check below; drop any that fail Layer 1 or have heavy unsupported ops. + +**3. The three-layer feasibility check (entry points A and B)** +Layer 1 — Model support · Layer 2 — EP availability · Layer 3 — Operator coverage. +Run in order, stop at first hard failure. + +*Layer 1 — Model support* ```bash -winml inspect -m +winml inspect -m --format json ``` -What to look for: `loader`, `exporter`, `winml_inference_class` fields populated. -If inspect fails or shows "unsupported" → model is out of scope for winml-cli. +Look for `loader`, `exporter`, `winml_inference_class` populated. If inspect fails or shows +"unsupported" → model is out of scope for winml-cli (drop the candidate; do not recommend it). -**3. Layer 2: EP availability** +*Layer 2 — EP availability* ```bash winml sys --list-ep --list-device ``` -EP-to-hardware reference table: | EP | Hardware requirement | Check for | |---|---|---| | QNN | Qualcomm Snapdragon X Elite / X Plus | QNNExecutionProvider in list | @@ -670,176 +778,47 @@ EP-to-hardware reference table: | DML | Any DirectX 12 GPU | DmlExecutionProvider | | CPU | Any | Always available | -If the desired EP is not listed → recommend next best EP from fallback chain. +If the desired EP is not listed → recommend next best EP from the fallback chain. -**4. Layer 3: Operator coverage** +*Layer 3 — Operator coverage* ```bash -winml analyze -m .onnx --ep +winml analyze -m .onnx --ep --format json # or for all EPs at once: winml analyze -m .onnx --device all ``` -Output interpretation: - `supported` (green): op runs natively on EP - `partial` (yellow): op may fall back to CPU for some configurations - `unsupported` (red): op cannot run on this EP -Decision rule: any `unsupported` → either change EP or accept CPU fallback -for those ops (which may impact accuracy and latency). +Decision rule: any `unsupported` → either change EP or accept CPU fallback for those ops +(which may impact accuracy and latency). -**5. Fallback chain recommendation** +**4. Fallback chain recommendation** If target EP not available or has unsupported ops: ``` QNN not available → OpenVINO (if Intel) or VitisAI (if AMD) → DML → CPU ``` -**6. Fast-fail before compile** -`winml compile` is expensive (minutes). Always run analyze first. -If analyze shows >20% unsupported ops → likely not worth compiling for that EP. +**5. Rank and recommend (entry point A) / fast-fail before compile (entry point B)** +- Discovery: rank surviving candidates by fit against the locked conditions (size, latency + class, accuracy reference, op coverage, downloads as a popularity prior). Output a short + ranked table + one recommended pick + rationale. +- `winml compile` is expensive (minutes). Always run `analyze` first; if it shows >20% + unsupported ops → likely not worth compiling for that EP. **Cross-references:** -- After confirming compatibility → `use-winml-cli` (build) -- If all EPs show unsupported ops → model may be out of scope for winml-cli - ---- - -## Skill 6: `validate-before-ship` - -### Frontmatter -```yaml -name: validate-before-ship -description: > - Use this skill when preparing to release a Windows application with an - on-device AI model. Provides a Definition-of-Done checklist covering artifact - completeness, accuracy validation against FP32 baseline, performance SLA - verification, output correctness on real inputs, cross-EP consistency, and - fallback chain verification. Every item must be checked or explicitly waived - before shipping. Use when the user says "I'm ready to ship", "what should I - test before release", or "how do I know the model is good enough". -``` - -### When to use -- About to ship a WinApp with on-device inference -- Final QA gate before a model artifact goes to production -- After any build config change (new quantization, new EP, new model version) - -### Sections - -**1. The checklist** - -**Gate 1 — Artifact completeness** -- [ ] All target EP artifacts exist and are loadable -- [ ] CPU fallback artifact exists -- [ ] manifest.json (if using multi-EP layout) is valid and references existing files -- [ ] Artifact was built with `winml build` (not opaque cache artifact) - -Command: -```bash -winml inspect -m .onnx # verify each artifact loads -``` - -**Gate 2 — Accuracy vs FP32 baseline** -- [ ] cosine_similarity ≥ 0.99 for FP16 artifacts -- [ ] cosine_similarity ≥ 0.95 for W8A16 artifacts -- [ ] cosine_similarity ≥ 0.90 for W8A8 artifacts (or task-specific threshold) -- [ ] Task accuracy metric (Top-1, F1, mAP) within acceptable drop from FP32 - -Commands: -```bash -winml eval --mode compare -m .onnx --model-id -winml eval -m .onnx --model-id # task accuracy -``` - -**Gate 3 — Performance SLA** -- [ ] p50 latency meets application target on target device -- [ ] p99 latency within 2x p50 (no outlier spikes) -- [ ] Benchmark run on actual target hardware (not developer machine) - -Command: -```bash -winml perf -m .onnx --device --iterations 100 --monitor -``` - -**Gate 4 — Output correctness on real inputs** -- [ ] Model produces correct output on ≥3 representative real-world inputs -- [ ] No NaN or Inf in outputs -- [ ] Output shape matches expected shape - -Command: -```bash -winml run -m .onnx --file # visual/manual check -``` - -**Gate 5 — Cross-EP consistency (if shipping multiple EP variants)** -- [ ] QNN and DML outputs agree within tolerance on same input -- [ ] CPU fallback output agrees with primary EP within tolerance - -Command (manual comparison across runs): -```bash -winml run -m model_qnn.onnx --file sample.jpg --format json -o qnn_out.json -winml run -m model_dml.onnx --file sample.jpg --format json -o dml_out.json -winml run -m model_cpu.onnx --file sample.jpg --format json -o cpu_out.json -# compare qnn_out.json vs dml_out.json vs cpu_out.json manually -``` - -**Gate 6 — Fallback chain** -- [ ] CPU fallback artifact verified independently (not just assumed to work) -- [ ] App runtime selects correct artifact when target EP is absent (simulate by removing EP) - -**2. Waiver policy** -Any item that cannot be completed must be waived explicitly: -``` -Waivers: -- Cross-EP consistency: VitisAI not available on developer machine. - Verified on target hardware by QA team. Issue #NNN. -- Performance SLA: Target hardware (Snapdragon X Elite) in procurement. - Benchmark deferred to post-merge, tracked in issue #NNN. -``` -Unchecked items without waiver → do not ship. - -**3. L-level mapping** - -The 6 gates map directly to the L1–L5 confidence system (see Overview): - -| Gate | L-level | -|---|---| -| Gate 1 — Artifact completeness | L1 | -| Gate 2 — Accuracy vs FP32 baseline | L3 + L4 | -| Gate 3 — Performance SLA | L5 | -| Gate 4 — Output correctness on real inputs | L4 | -| Gate 5 — Cross-EP consistency | L5 | -| Gate 6 — Fallback chain | L1 (CPU artifact) | - -Minimum to ship: L1 + L3 all passing. L4 + L5 required for production release. +- After picking a model + confirming feasibility → `autoconfig` (find the optimal config) +- To build the chosen artifacts → `use-winml-cli` +- If **no** supported model meets the constraints, or all EPs show unsupported ops → the gap + feeds `optimization-research` (long-tail coverage) and `adding-model-support` -**3. Quick command reference** -```bash -# Gate 1: inspect all artifacts -for f in model_qnn.onnx model_dml.onnx model_cpu.onnx; do winml inspect -m $f; done - -# Gate 2: accuracy -winml eval --mode compare -m .onnx --model-id -winml eval -m .onnx --model-id - -# Gate 3: perf -winml perf -m .onnx --device auto --iterations 100 --monitor - -# Gate 4: real input -winml run -m .onnx --file - -# Gate 5: cross-EP (run individually, compare outputs) -winml run -m model_qnn.onnx --file --format json -winml run -m model_dml.onnx --file --format json -``` - -**Cross-references:** -- If accuracy gate fails → `debug-accuracy-drop` -- If performance gate fails → `optimize-for-device` -- If EP not available for testing → `ep-compatibility-check` -- For multi-EP artifact packaging → `prepare-for-winapp` +> Addresses the **Pre-quantized model zoo / cold-start** whitespace from the Competitive Analysis: +> NVIDIA (`nvidia/` HF org) and AI Hub (500+ models) reduce cold-start with curated zoos; winml-cli +> has none, so this skill substitutes a constraints-driven recommender that only returns *supported* models. --- -## Skill 7: `adding-model-support` (contributor) +## Skill: `adding-model-support` (contributor) ### Frontmatter ```yaml @@ -901,12 +880,12 @@ Minimal recipe template: - Non-standard tokenizer → verify `winml run` input preprocessing **Cross-references:** -- If EP shows unsupported ops → `ep-compatibility-check` -- After L1–L5 all pass → `validate-before-ship` for PR gate +- If EP shows unsupported ops → `check-model-feasibility` +- After L1–L5 all pass → `ship-to-winapp` for PR gate --- -## Skill 8: `adding-ep-support` (contributor) +## Skill: `adding-ep-support` (contributor) ### Frontmatter ```yaml @@ -962,12 +941,12 @@ Minimum before merging: - L5: `winml perf` produces valid latency output on target hardware **Cross-references:** -- Operator coverage analysis → `ep-compatibility-check` -- After adding: document the EP in `ep-compatibility-check` hardware table +- Operator coverage analysis → `check-model-feasibility` +- After adding: document the EP in the `check-model-feasibility` hardware table --- -## Skill 9: `contributing-a-skill` (contributor) +## Skill: `contributing-a-skill` (contributor) ### Frontmatter ```yaml @@ -1026,28 +1005,33 @@ documentation summary. Include representative user phrases in quotes. - [ ] All commands are tested and produce the described output - [ ] Cross-references use relative paths and the linked skill exists - [ ] No commands reference flags that don't exist in current `winml --help` -- [ ] Hardware names and EP names match the canonical list in `ep-compatibility-check` +- [ ] Hardware names and EP names match the canonical list in `check-model-feasibility` - [ ] `evals/eval.yaml` exists with ≥2 test cases (including at least one negative assertion) --- -## Skill 10: `autoconfig` (consumer — autoresearch loop) +## Skill: `autoconfig` (user — optimize the model: automated loop + manual framework) + +The optimize skill. Two modes: **automated** (the autoresearch loop — the bulk of this section) for +"figure it out for me / run overnight", and **manual** (the decision framework folded in from +`optimize-for-device`) for "I'll choose by hand" or when there is no target hardware to benchmark on. ### Frontmatter ```yaml name: autoconfig description: > - Use this skill when a **WinApp developer** wants to automatically find the best - winml-cli configuration for their model on one or more target EP/device combinations. - The agent runs an autonomous experiment loop: it proposes config.json hypotheses, - runs winml build + eval + perf, evaluates against user-defined objectives - (accuracy floor, latency budget, or Pareto frontier), and iterates — keeping - improvements, discarding regressions. Covers single-EP optimization, multi-EP - parallel search, mixed-precision (nodes_to_exclude) exploration, calibration - parameter tuning, and manifest.json output for multi-EP deployment. - Use when the user says "find the best config for my model on QNN", - "automate the config search", "generate configs for all EPs", - or "I want to leave this running overnight". + Use this skill when a **WinApp developer** wants the best performance for their model on one or + more Windows EP/device targets — either by letting winml-cli search automatically, or by working + through the precision/EP tradeoffs by hand. Automated mode: an autonomous experiment loop that + proposes config.json hypotheses, runs winml build + eval + perf, evaluates against user-defined + objectives (accuracy floor, latency budget, or Pareto frontier), and iterates — keeping + improvements, discarding regressions; covers single-EP optimization, multi-EP parallel search, + mixed-precision (nodes_to_exclude) exploration, calibration tuning, and manifest.json output. + Manual mode: the latency-budget vs accuracy-floor decision framework, the FP32→FP16→W8A16→W8A8 + precision ladder, a per-device hardware guidance table, and how to read tradeoff results. + Use when the user says "find the best config for my model on QNN", "automate the config search", + "generate configs for all EPs", "I want to leave this running overnight", "make it faster", + "which precision should I use", "is NPU worth it", or "compare QNN vs DirectML vs CPU". audience: external (WinApp developers) ``` @@ -1055,8 +1039,9 @@ audience: external (WinApp developers) ### When to use - "Find the best W8A8 config that keeps accuracy > 0.95 on QNN" - "Generate optimized configs for QNN + DirectML + CPU and build a manifest" -- "I don't know which quantization settings to use, figure it out for me" -- "Run overnight and give me the best accuracy-latency tradeoff you can find" +- "I don't know which quantization settings to use, figure it out for me" / "run overnight" +- "Make it faster" / "which precision should I use" / "is NPU worth it" (→ manual mode) +- "Compare QNN vs DirectML vs CPU for my model" - User has a latency SLA or accuracy floor but doesn't know how to achieve it ### What this skill does NOT do @@ -1067,6 +1052,45 @@ audience: external (WinApp developers) --- +### Manual mode — the decision framework (folded in from `optimize-for-device`) + +Use this lightweight path when the user wants to decide by hand, or has no target hardware to +benchmark on (so the automated loop's perf gate can't run). It is the conceptual model the +automated loop below mechanizes. + +**1. The decision framework** — two inputs: latency budget OR accuracy budget. +- Have a latency SLA (e.g. <50ms)? → find highest accuracy within that budget +- Have an accuracy floor (e.g. <2% drop)? → find fastest within that floor + +**2. The precision ladder** — FP32 → FP16 → W8A16 → W8A8, with typical speedup and accuracy-drop +ranges per model family (Encoder/BERT-like, Vision/ConvNet, Transformer/ViT). + +**3. The sweep workflow** — run `winml build` + `winml eval` + `winml perf` for each precision, +collect into a tradeoff table, apply the decision framework. +```bash +winml config -m --device --precision fp16 -o config_fp16.json +winml build -c config_fp16.json -m -o out_fp16/ +winml eval -m out_fp16/.onnx --model-id +winml perf -m out_fp16/.onnx --device --iterations 50 +# repeat for w8a16, w8a8 +``` + +**4. Hardware-specific guidance table** +| Device | Best EP | Sweet-spot precision | Notes | +|---|---|---|---| +| Snapdragon X Elite NPU | QNN | W8A16 | HTP native for W8A16; W8A8 risky for Attention | +| Intel Core Ultra NPU | OpenVINO | W8A8 | OpenVINO PTQ handles INT8 well | +| AMD Ryzen AI NPU | VitisAI | W8A8 | Phoenix/Hawk Point prefer INT8 | +| Any GPU | DirectML | FP16 | FP16 sufficient; quantization rarely helps on GPU | +| CPU fallback | CPU | W8A8 | Size + latency both benefit | + +**5. Reading the output** — how to interpret `winml eval` cosine_similarity / SQNR and +`winml perf` p50/p90/p99; what values indicate "acceptable" vs "needs investigation". + +When the user wants this automated instead of done by hand, continue to the autoresearch loop below. + +--- + ### Epistemic standard for autoconfig findings **Any conclusion this skill writes into a report or recommends to a user must meet this bar:** @@ -2009,15 +2033,15 @@ Rule: W8A8 QDQ on GPU EP hangs — skip quantization immediately for GPU targets - `--format json` on `winml eval` (#847), `winml analyze` (#848), `winml perf` (#849) ### Cross-references -- Run `ep-compatibility-check` before starting to verify EP is available -- After autoconfig completes → `validate-before-ship` for final production gate +- Run `check-model-feasibility` before starting to pick a model and verify the EP is available +- After autoconfig completes → `ship-to-winapp` for final validation gates + packaging - If autoconfig cannot meet objective → `debug-accuracy-drop` for deeper diagnosis -- Multi-EP output feeds directly into `prepare-for-winapp` manifest layout +- Multi-EP output feeds directly into `ship-to-winapp`'s manifest layout - If the best config found is still not good enough → escalate to `optimization-research` --- -## Skill 11: `optimization-research` (internal — deep gap analysis) +## Skill: `optimization-research` (contributor — internal, deep gap analysis) ### Frontmatter ```yaml @@ -2232,7 +2256,7 @@ S / M / L / XL ### Cross-references - `autoconfig` provides the winml baseline to compare against - Issues filed here feed `adding-ep-support` and `contributing-a-skill` workflows -- Use `ep-compatibility-check` to confirm EP availability before running external benchmarks +- Use `check-model-feasibility` to confirm EP availability before running external benchmarks --- @@ -2704,11 +2728,10 @@ tests: | Skill | Min cases | Key assertions | |---|---|---| -| `ep-compatibility-check` | 3 | Recommends 3-layer check in order; gives fallback when EP absent | +| `check-model-feasibility` | 4 | Screens candidates with `winml inspect` (never recommends an unsupported model); recommends the 3-layer check in order; gives fallback when EP absent | | `debug-accuracy-drop` | 4 | Correctly isolates pipeline stage; suggests precision escalation | -| `validate-before-ship` | 3 | Lists all 6 gates; handles waiver scenario | -| `optimize-for-device` | 3 | Applies latency-budget vs accuracy-budget framework correctly | -| `prepare-for-winapp` | 2 | Produces manifest.json structure; includes CPU fallback | +| `ship-to-winapp` | 4 | Lists all 6 validation gates; handles waiver scenario; produces manifest.json with CPU fallback | +| `autoconfig` | 3 | Applies latency-budget vs accuracy-floor framework (manual mode); keeps/discards by objective (auto mode) | | `adding-model-support` | 2 | Suggests L1→L5 order; correct recipe structure | | `contributing-a-skill` | 2 | Flags missing trigger phrases; flags pseudocode commands | @@ -2746,22 +2769,19 @@ done ### Directory structure ``` skills/ - use-winml-cli/ ← existing, extend - SKILL.md - evals/eval.yaml - optimize-for-device/ ← new (consumer) + use-winml-cli/ ← existing, extend (user) SKILL.md evals/eval.yaml - debug-accuracy-drop/ ← new (consumer) + check-model-feasibility/ ← new (user — model discovery + EP/device compatibility) SKILL.md evals/eval.yaml - prepare-for-winapp/ ← new (consumer, partial dep on winml package feature) + debug-accuracy-drop/ ← new (user) SKILL.md evals/eval.yaml - ep-compatibility-check/ ← new (consumer) + autoconfig/ ← new (user — optimize: autoresearch loop + manual framework) SKILL.md evals/eval.yaml - validate-before-ship/ ← new (consumer) + ship-to-winapp/ ← new (user — validation gates + multi-EP packaging; partial dep on winml package feature) SKILL.md evals/eval.yaml adding-model-support/ ← new (contributor) @@ -2773,10 +2793,7 @@ skills/ contributing-a-skill/ ← new (contributor) SKILL.md evals/eval.yaml - autoconfig/ ← new (consumer — autoresearch loop for external users) - SKILL.md - evals/eval.yaml - optimization-research/ ← new (internal — deep gap analysis for winml-cli team) + optimization-research/ ← new (contributor — internal deep gap analysis for winml-cli team) SKILL.md templates/olive_qnn.json templates/olive_dml.json @@ -2784,23 +2801,28 @@ skills/ ``` ### Priority order for implementation + +This is **implementation sequencing** (risk- and dependency-driven), which intentionally differs from +the **importance** ranking in the Overview. Importance answers "which skill matters most to users"; +this answers "which is safest to build first." Example: `autoconfig` is the #1 *importance* user skill +but ships *last* because it depends on the `--format json` changes and is the most complex. + **Code changes first (unblocks agentic skill execution):** 0. `winml eval --format json` — critical: enables all accuracy-related agentic flows 0. `winml analyze --format json` — enables EP compatibility agentic flows 0. `winml perf --format json` — enables performance SLA agentic flows -**Consumer skills:** -1. `ep-compatibility-check` — lowest risk, pure existing commands, high value for new users +**User skills:** +1. `check-model-feasibility` — lowest risk, pure existing commands (`inspect`/`sys`/`analyze`); front door for new users (model discovery half needs `analyze --format json`) 2. `debug-accuracy-drop` — closes clearest pain point, existing `eval --mode compare` -3. `validate-before-ship` — most complete checklist, builds on 1+2 -4. `optimize-for-device` — needs good hardware reference data to be accurate -5. `prepare-for-winapp` — needs `winml package` feature or clear workaround documented -6. `autoconfig` — depends on #847/#848/#849 + most complex skill to implement +3. `ship-to-winapp` — validation checklist + packaging; build it once the gate commands exist (partial dep on `winml package` feature) +4. `autoconfig` — depends on #847/#848/#849 + most complex skill to implement (manual mode can ship first as the lightweight framework) **Contributor skills:** -6. `contributing-a-skill` — enables community contributions to the skill ecosystem -7. `adding-model-support` — most impactful for model coverage growth -8. `adding-ep-support` — lower frequency, but needed for new EP onboarding +5. `contributing-a-skill` — enables community contributions to the skill ecosystem +6. `adding-model-support` — most impactful for model coverage growth +7. `adding-ep-support` — lower frequency, but needed for new EP onboarding +8. `optimization-research` — internal gap-finder; depends on a working `autoconfig` baseline to compare against ### Required code changes for agentic skill execution From ea3911e73f7fc58dd8ada23f5956198296d8f4e3 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 16 Jun 2026 14:33:29 +0800 Subject: [PATCH 05/38] research: rigorous review and correction of ep_knowledge findings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Critical issues found and corrected: npu-001 (opset 21 speedup): - mechanism_confirmed changed TRUE → FALSE The kMaxSupportedOpset bypass requires ORT < 1.18; the sweep used onnxruntime-windowsml 1.24.5 where kMaxSupportedOpset >= 22. The bypass mechanism does not apply. The speedup for DINOv2/MobileViT is empirically real but the WHY is now unknown. - ResNet-18 removed from 'benefits' list — sub-ms model, 3-session ranges span 4x for the same config (pure DVFS noise). Reported +20.2% was noise. - MobileViT magnitude corrected: h1 had DVFS spike inflating median to 11.72ms; actual gain is ~20-26% not 26.5%. - DINOv2 finding kept: 3-session data shows non-overlapping distributions. - Added per-session raw data analysis and required follow-up experiments. npu-002 / npu-003 (W8A16 speedup, compile speedup): - scope changed from 'General / all vision models' to 'ConvNext only' (both findings from 1 model; magnitude claims not transferable) - confidence reduced from 'high' to 'medium' npu-004 (W8A8 accuracy collapse): - confidence changed from 'medium' to 'very_low / anecdote' - Finding has NO recorded data (experiment 'aborted early, numbers not saved') Cannot be treated as a KB finding until re-run with recorded numbers. npu-005 (QNN Hub comparison): - Added fairness caveat: comparing qairt-stack model on ORT QNN EP is not a valid comparison. Finding is trivially true (use right tool for right stack) but not informative. npu-006 (conv fusions catastrophic): - No confidence change — this is the most statistically solid finding. - Added session-level evidence note: h4 CV=0.016 (extremely stable, unusual for QNN NPU), consistent with deterministic CPU fallback hypothesis. search_space_rules: - opset recommendation changed from 'Conv+residual' to 'Conv+attention hybrid' to reflect actual validated models (DINOv2 is attention-dominant, not Conv+residual in the traditional sense) New file: docs/ep-knowledge-review.md - Full statistical analysis of per-session data - ORT version dependency explained - Additional models needed for validation - Minimum experiment protocol Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../autoconfig/docs/ep-knowledge-review.md | 246 ++++++++++++++++++ research/autoconfig/ep_knowledge/qnn_npu.json | 159 ++++++----- 2 files changed, 339 insertions(+), 66 deletions(-) create mode 100644 research/autoconfig/docs/ep-knowledge-review.md diff --git a/research/autoconfig/docs/ep-knowledge-review.md b/research/autoconfig/docs/ep-knowledge-review.md new file mode 100644 index 000000000..288467396 --- /dev/null +++ b/research/autoconfig/docs/ep-knowledge-review.md @@ -0,0 +1,246 @@ +# EP Knowledge Base — Critical Review + +> Date: 2026-06-16 +> Reviewer: internal audit +> Scope: `ep_knowledge/qnn_npu.json` findings npu-001 through npu-007 +> +> This document records issues found in the original KB entries and the +> reasoning behind corrections applied in the June 2026 update. + +--- + +## Summary of Issues Found + +| Finding | Status Before Review | Issue | Corrected Status | +|---------|---------------------|-------|-----------------| +| npu-001 | `mechanism_confirmed: true` | ORT version used has kMaxSupportedOpset ≥ 22 — bypass mechanism does not apply; ResNet-18 data is noise | `mechanism_confirmed: false`, mechanism UNKNOWN | +| npu-002 | scope: "General / most vision models" | Tested on 1 model only (ConvNext) | scope narrowed to ConvNext | +| npu-003 | scope: "General / all QNN NPU" | Tested on 1 model only (ConvNext) | scope narrowed to ConvNext | +| npu-004 | confidence: "medium" | No recorded data; experiment aborted before measurements saved | confidence: "very_low / anecdote" | +| npu-005 | confidence: "medium" | Compares ORT QNN EP vs qairt native stack — different compilation pipeline entirely | added fairness caveat | +| npu-006 | `mechanism_confirmed: false` | Observation is solid (3-session consistent). Mechanism is unconfirmed but regression is unambiguous | no change to confirmed status; added session evidence | +| npu-007 | `mechanism_confirmed: true` | Solid, confirmed across all 8 models | no change | + +--- + +## Detailed Analysis + +### npu-001 — opset 21 speedup + +#### ORT version issue (critical) + +The catalog sweep used `onnxruntime-windowsml==1.24.5`. The npu-001 mechanism +explanation relies on ORT's `kMaxSupportedOpset` gate: + +> "On older ORT where kMaxSupportedOpset < 21, opset 21 models bypass the +> NCHW→NHWC layout transformer entirely." + +But the `kMaxSupportedOpset` version table (from `cpu.json`) shows: + +| ORT version | kMaxSupportedOpset | +|-------------|-------------------| +| v1.14.x | 18 | +| v1.16.x | 19 | +| v1.17.x | 20 | +| v1.18.x | 21 | +| main_HEAD | 26 | + +At ORT 1.24.x, `kMaxSupportedOpset` is almost certainly ≥ 22. This means BOTH +opset 17 and opset 21 models go through the NHWC layout transform in the ORT +version actually used in the sweep. **The "bypass" mechanism does not apply.** + +Consequence: `mechanism_confirmed` must be `false`. The speedup for DINOv2 and +MobileViT is empirically real but the cause is **unknown**. The ORT source code +investigation confirmed the bypass mechanism for *older* ORT versions, not for +the ORT version actually used. + +Possible alternative mechanisms (uninvestigated): +1. PyTorch ONNX exporter produces a structurally different graph at opset 21 + (different op decompositions, fewer reshape/squeeze nodes) +2. QNN EP's graph partitioner behaves differently with opset 21 operator + semantics even when the NHWC transform fires +3. Quantization calibration path differs between opset export versions +4. The NHWC transform at opset 21 still inserts fewer Transposes for some reason + despite firing (investigation needed via optimized graph dump) + +#### ResNet-18 data is noise-dominated + +ResNet-18 baseline p50 is ~1ms. At this latency, the 3×500-iter protocol +produces per-session p50s that vary 4x between sessions: + +``` +h1 (opset17): sessions = [0.990, 4.003, 2.716] ms ← 4x range +h3 (opset21): sessions = [1.054, 2.175, 4.107] ms ← 4x range +``` + +The two distributions fully overlap. Declaring a "+20.2% speedup" from comparing +medians (2.716 vs 2.175ms) is not statistically valid. This data point is +**removed** from `validated_models.benefits_from_opset21`. + +To get reliable data for ResNet-18, a minimum of ~3000 iterations per session +and ≥ 5 sessions would be needed. + +#### MobileViT DVFS spike in h1 + +h1 (opset17) sessions: [10.557, 11.721, **27.436**] ms + +The third session at 27.4ms is a clear DVFS thermal event (2.4x spike). The +median (11.721ms) is upward-biased by this session. The "true" opset17 p50 is +likely ~11ms, making the "+26.5%" speedup calculation overstated. A more +conservative estimate is ~20-22%. + +However, h3 (opset21) sessions [10.814, 8.625, 8.449] show two highly consistent +low-latency sessions. The speedup is real, magnitude uncertain (~20-26%). + +#### DINOv2 — most reliable evidence for npu-001 + +h1 (opset17): [7.176, 6.392, 9.436] ms — range 6.4–9.4ms +h3 (opset21): [4.977, 4.876, 6.884] ms — range 4.9–6.9ms + +The two distributions barely overlap only at extremes (h3 max 6.884 ≈ h1 min +6.392). h3 sessions 1 and 2 (4.977, 4.876ms) are tightly clustered at ~4.9ms, +well below the h1 range. The speedup appears real (≥24% vs h1's non-spiked +sessions, up to 31% vs h1 median). + +DINOv2-small's benefit is notable because it is primarily a Vision Transformer — +it has a patch embedding Conv layer but attention-dominant compute. Why opset21 +helps DINOv2 but NOT ViT-base is unknown. This architecture distinction needs +investigation. + +#### Updated empirical claim for npu-001 + +**Observable fact**: For DINOv2-small and MobileViT-small on QNN NPU (ORT 1.24.5, +Snapdragon X Elite), using opset 21 export instead of opset 17 produces a +consistent latency reduction of ~20-31% across 3-session benchmarks. + +**What is NOT known**: Why this occurs in ORT 1.24.x where the kMaxSupportedOpset +bypass should not apply. + +**What needs investigation**: +1. Dump optimized.onnx for both opset17 and opset21 DINOv2, count Transpose nodes + — if opset21 has fewer Transposes, explains speedup via a different mechanism +2. Verify ORT 1.24.x kMaxSupportedOpset value from compiled binary +3. Test 3+ additional Conv+residual models: EfficientNet-B0, MobileNet-V3, + ConvNeXt-tiny (already done for CPU; needs QNN NPU validation) + +--- + +### npu-002 — W8A16 speedup over FP32 + +**Issue**: Scope states "General (applies to most vision models on QNN NPU)". +Evidence base: 1 model (ConvNext), 1 device. + +The 1.9x speedup is plausible from HTP architecture (INT8 weight path), but +the magnitude varies by model: a model with few weight-heavy ops (e.g., pure +attention) may see less speedup than a Conv-heavy model. "Most vision models" +is over-claimed. + +**Correction**: Scope narrowed to "ConvNext — single model validation". The +catalog sweep provides indirect evidence (all 8 models used W8A16 and ran +faster than FP32 would on HTP) but no direct FP32 comparison baseline for +those models. + +--- + +### npu-003 — compile speedup + +**Issue**: Scope states "General (applies to all QNN NPU deployments)". Evidence +base: 1 model (ConvNext), 1 device. + +The compile (EPContext) mechanism is well-understood and applies generally, but +the 1.7x magnitude is model-specific. Models with simpler graphs may see less +benefit; models with many ops may see more. + +**Correction**: Scope narrowed. The mechanism claim ("eliminates JIT partitioning") +is generally correct; the magnitude claim (1.7x) is ConvNext-specific. + +--- + +### npu-004 — W8A8 accuracy collapse + +**Issue**: The observation is "Exact numbers not recorded — aborted early." This +is an anecdote, not a finding. The confidence of "medium" is unjustified without +data. + +The claim may well be correct (W8A8 on LN+GELU is problematic), but without +recorded accuracy numbers it cannot be treated as a KB finding. + +**Correction**: Confidence downgraded to "very_low". The finding is relabeled +as an unrecorded anecdote pending a proper experiment with recorded numbers. + +--- + +### npu-006 — conv fusions catastrophic regression + +This finding is the **most statistically solid** in the entire KB: + +ResNet-18 h4 sessions: [132.3, 134.97, 130.669] ms — CV = 0.016 (extremely stable) +ResNet-18 h1 sessions: [0.990, 4.003, 2.716] ms — median 2.716ms + +Even using the best h1 session (0.990ms) vs worst h4 session (134.97ms), the +regression is 136x. The 3-session consistency of h4 (~130-135ms) with near-zero +variance is unusual for QNN NPU (all other hypotheses show high CV). This +suggests the fused ops cause a deterministic CPU fallback with no DVFS noise — +consistent with the mechanism hypothesis. + +The only issue is "mechanism_confirmed: false" — the CPU fallback has not been +verified via EP partition dump. The regression is unambiguous; the mechanism is +a strong hypothesis. + +**No changes needed** except documenting the 3-session evidence more explicitly. + +--- + +## Additional Models Needed for Validation + +### For npu-001 (opset21 benefit for Conv+residual) + +| Model | Why useful | Predicted result | +|-------|-----------|-----------------| +| `microsoft/efficientnet-b0` | Conv-dominant, no residual-add structure | uncertain | +| `microsoft/mobilenet-v3-small` | Conv-dominant + SE blocks | likely benefits | +| `timm/convnextv2-nano` | ConvNext variant, already confirmed for ConvNext | should benefit | +| `facebook/deit-small-patch16-224` | Pure ViT (no Conv), similar to ViT-base | should be neutral | +| `timm/regnetx-002` | ResNet-like but with group Conv | uncertain | + +Goal: determine whether the benefit is "Conv+residual" or something more specific +to the DINOv2/MobileViT architectures (e.g., hybrid Conv+attention). + +### For npu-006 (conv fusions) + +| Model | Why useful | Predicted result | +|-------|-----------|-----------------| +| `microsoft/efficientnet-b0` | Conv+BN heavy (many fuseable patterns) | should regress | +| `google/mobilenet-v2-1.0-224` | Depthwise Conv dominant | should regress | +| `timm/vgg16` | Pure Conv-BN | should regress | +| `microsoft/beit-base-patch16-224` | Pure transformer | should be neutral | + +Goal: confirm that the regression generalizes to all Conv-dominant models, not +just ResNet-18. + +### For npu-002/003 (W8A16 and compile) + +Run FP32 vs W8A16 and W8A16 vs W8A16+compile on at least: +- `apple/mobilevit-small` (already benchmarked W8A16; need FP32 baseline) +- `microsoft/resnet-18` (same) +- `facebook/dinov2-small` (same) + +This would promote npu-002 and npu-003 from "1-model observations" to +"catalog-validated" findings. + +--- + +## Minimum Experiment Protocol for Validation + +For any new model added to the KB: + +1. Run 3 independent sessions × 500 iters with 30s cool-down (npu-007 protocol) +2. Record raw per-session p50s, not just the median +3. Verify session-to-session range is < 50% of the median before reporting a gain +4. For sub-2ms models: increase to 3 sessions × 2000 iters minimum +5. Always dump the optimized graph (`--save-optimized-model`) for opset comparison +6. Record ORT version (`winml --version`) at experiment time in the finding + +--- + +*This review document should be re-run after any ORT or QNN SDK version update.* diff --git a/research/autoconfig/ep_knowledge/qnn_npu.json b/research/autoconfig/ep_knowledge/qnn_npu.json index 40a50e3b6..2a0dcb071 100644 --- a/research/autoconfig/ep_knowledge/qnn_npu.json +++ b/research/autoconfig/ep_knowledge/qnn_npu.json @@ -24,115 +24,141 @@ { "id": "npu-001", - "title": "opset 21 bypasses NHWC layout transform — beneficial ONLY for Conv+residual architectures", - "observation": "ConvNext: opset 21 p50~12ms vs opset 17 p50~54ms (DVFS-dominated, Gates 1+3 not passed). Catalog sweep 2026-06-13: MobileViT +26.5% (opset21), DINOv2-small +30.6% (opset21). ViT: opset21 -7.4% (no benefit). BERT/RoBERTa/DistilBERT: neutral.", - "mechanism_confirmed": true, - "mechanism_source": "ORT source code investigation (2026-06-10) + catalog sweep validation (2026-06-13)", - "architecture_requirement": ["has_conv_ops", "has_residual_connections"], + "title": "opset 21 export is faster on QNN NPU for some Conv+attention hybrid models — mechanism UNKNOWN for ORT 1.24.x", + "observation": "Catalog sweep 2026-06-13 (ORT 1.24.5): DINOv2-small opset21 median 4.98ms vs opset17 median 7.18ms (+30.6%). MobileViT-small opset21 8.62ms vs opset17 11.72ms (+26.5%, but opset17 has a DVFS spike session). ViT-base: opset21 -7.4% (slower). BERT/RoBERTa/DistilBERT: neutral. ResNet-18: data noise-dominated — unreliable (see below).", + "mechanism_confirmed": false, + "mechanism_status": "ORIGINAL_MECHANISM_INVALIDATED — must re-investigate", + "mechanism_source": "ORT source code investigation (2026-06-10) for ORT < 1.18. Sweep used onnxruntime-windowsml==1.24.5 where this mechanism no longer applies.", + "ort_version_critical_note": "The original mechanism (kMaxSupportedOpset gate in IsSupportedOpset()) requires kMaxSupportedOpset < 21. onnxruntime-windowsml==1.24.5 (ORT 1.24.x) has kMaxSupportedOpset >= 22, so BOTH opset17 and opset21 go through the NHWC layout transform. The bypass mechanism does NOT apply to the ORT version used in the sweep. The observed speedup for DINOv2 and MobileViT has an UNKNOWN root cause.", + "architecture_requirement": ["empirically: hybrid Conv+attention; pure Conv (ResNet) and pure attention (ViT) show no reliable benefit"], "validated_models": { - "benefits_from_opset21": ["convnext", "mobilevit", "dinov2"], - "no_benefit": ["vit", "yolos"], - "neutral": ["distilbert", "bert", "roberta", "mpnet"] + "benefits_from_opset21": ["dinov2-small (+30.6%, 3-session consistent)", "mobilevit-small (~20-26%, note: opset17 has DVFS spike inflating its median)"], + "no_benefit_or_regression": ["vit-base-patch16-224 (-7.4%)", "yolos-small (timeout, no data)"], + "neutral": ["distilbert", "bert/MiniLM", "roberta"], + "data_unreliable": ["resnet-18 — sub-ms latency, 3-session range spans 4x; no reliable signal (see data_reliability_notes)"] }, - "mechanism_explanation": { - "root_cause": "kMaxSupportedOpset gate in IsSupportedOpset() (onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc). On older ORT where kMaxSupportedOpset < 21, opset 21 models bypass the NCHW→NHWC layout transformer entirely (transform_layout_fn = nullptr in inference_session.cc:1589-1626).", - "why_bypass_helps_convnext": "NHWC layout transform inserts Transpose(NCHW→NHWC) and Transpose(NHWC→NCHW) around Conv. For ConvNext, residual connections consume Conv output BEFORE the following ConvNext permute Transpose — so the two Transposes CANNOT be cancelled. Result: opset 17 NHWC graph has MORE Transpose ops on HTP than opset 21 NCHW graph. Bypassing the transform = cleaner graph = faster.", - "why_cpu_is_opposite": "CPU relies on TransposeOptimizer to REMOVE existing Transposes. Skipping it (opset > kMaxSupportedOpset) leaves Transposes in place → CPU gets SLOWER. QNN's layout transform ADDS new Transposes that can't be fully eliminated → QNN gets FASTER when bypassed. Same constant, opposite effects.", + "original_mechanism_explanation": { + "root_cause_for_old_ort": "kMaxSupportedOpset gate in IsSupportedOpset() (onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc). On ORT where kMaxSupportedOpset < 21, opset 21 models bypass the NCHW→NHWC layout transformer entirely.", + "why_bypass_helped_convnext": "NHWC layout transform inserts Transpose(NCHW→NHWC) around Conv. For ConvNext, residual connections prevent Transpose cancellation → opset17 graph has MORE Transposes on HTP than opset21 graph.", + "why_cpu_is_opposite": "CPU relies on TransposeOptimizer to REMOVE existing Transposes. Skipping the optimizer (opset > kMaxSupportedOpset) leaves Transposes in place → CPU SLOWER. Same gate, opposite effect.", + "ort_kMaxSupportedOpset_by_version": { + "v1.14.x": 18, + "v1.16.x": 19, + "v1.17.x": 20, + "v1.18.x": 21, + "v1.24.x": "unknown, almost certainly >= 22 — VERIFY", + "main_HEAD": 26 + }, "key_files": [ "onnxruntime/core/optimizer/transpose_optimization/onnx_transpose_optimization.cc:2724-2746 — MakeOptimizerContext() gate", "onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc — IsSupportedOpset()", - "onnxruntime/core/session/inference_session.cc:1589-1626 — transform_layout_fn=nullptr path", - "onnxruntime/core/optimizer/transpose_optimization/ort_transpose_optimization.cc — EPAwareHandleReshape (QNN-specific)", - "onnxruntime/core/providers/qnn/builder/op_builder_factory.cc — NO opset dispatch in QNN EP" - ], - "confirmed_negative": "QNN EP itself has NO opset-version-based kernel dispatch. All op→QNN mapping is by op_type string only. The speedup is entirely from ORT optimizer behavior, not QNN SDK." + "onnxruntime/core/session/inference_session.cc:1589-1626 — transform_layout_fn=nullptr path" + ] }, - "critical_caveats": [ - "ORT VERSION DEPENDENCY: kMaxSupportedOpset=26 on current ORT main branch. On current ORT, BOTH opset 17 and 21 get NHWC layout transform → speedup may NOT exist. Must verify ORT version before claiming this works.", - "ARCHITECTURE SPECIFICITY: Only helps models with (1) Conv ops + (2) residual connections + (3) explicit Transpose/permute ops. Pure sequential models (MobileNet) would get SLOWER with opset 21 bypass.", - "DVFS NOISE: Current sweep data (50 iters, warm device) is dominated by thermal variance. Gate 1 (iter≥1000 × 3 sessions) not yet passed.", - "May disappear on newer ORT versions or QNN SDK versions that handle NCHW Conv more efficiently internally." + "alternative_mechanism_hypotheses": [ + "PyTorch ONNX exporter produces structurally different graph at opset 21 (different op decompositions) → fewer ops for QNN EP to partition", + "QNN EP graph partitioner behaves differently with opset 21 op semantics even when NHWC transform fires", + "Quantization calibration path differs between opset exports → better quantization quality at opset 21", + "NHWC transform at opset 21 still inserts fewer non-cancellable Transposes for Conv+attention hybrids via a different path — needs graph dump to verify" ], - "action_for_autoconfig": "⚠️ Do NOT treat this as a generic 'try opset 21 first' rule. Correct action: (1) Check if model has Conv ops AND residual connections. (2) If yes: include opset 21 in search. (3) If pure attention (ViT, YOLOS) or pure NLP (BERT-family): opset 21 neutral or harmful — default opset 17. (4) Always check ORT kMaxSupportedOpset at runtime.", - "confidence": "medium-high on mechanism; low on perf claim (Gates 1+3 not yet passed)", + "data_reliability_notes": { + "dinov2_small": { + "h1_opset17_sessions_ms": [7.176, 6.392, 9.436], + "h3_opset21_sessions_ms": [4.977, 4.876, 6.884], + "assessment": "RELIABLE. Ranges barely overlap only at extremes. h3 sessions 1+2 (4.97/4.88ms) are well below entire h1 range. Speedup is real." + }, + "mobilevit_small": { + "h1_opset17_sessions_ms": [10.557, 11.721, 27.436], + "h3_opset21_sessions_ms": [10.814, 8.625, 8.449], + "assessment": "PARTIALLY RELIABLE. h1 session 3 (27.4ms) is a DVFS spike — median inflated to 11.72ms vs true ~11ms. h3 sessions 2+3 (8.6/8.4ms) consistently faster. Actual speedup ~20-26% (not the reported 26.5%)." + }, + "resnet_18": { + "h1_opset17_sessions_ms": [0.990, 4.003, 2.716], + "h3_opset21_sessions_ms": [1.054, 2.175, 4.107], + "assessment": "UNRELIABLE. Sub-ms model. Session range spans 4x for same config. Reported '+20.2% speedup' (h1 median 2.72ms vs h3 median 2.18ms) is NOT a real signal — the two distributions fully overlap. REMOVED from benefits list." + } + }, + "action_for_autoconfig": "Include opset 21 in search for Conv+attention hybrid models (dinov2-class, mobilevit-class). Do NOT apply to pure attention (ViT, YOLOS) or NLP (BERT-family). For ResNet-class Conv-only models: insufficient evidence. ALWAYS dump the optimized graph to compare Transpose counts — if opset17 has more Transposes in QNN partition, it explains speedup regardless of mechanism.", + "confidence": "medium on empirical observation (DINOv2 speedup is real; MobileViT likely real). Low on mechanism — original explanation does not apply to ORT 1.24.x. Mechanism must be re-investigated.", "falsified_by": null, - "scope": "Models with Conv + residual connections + explicit Transpose ops (ConvNext, MobileViT, DINOv2). Does NOT apply to pure transformers (ViT, YOLOS) or NLP models (BERT-family).", + "scope": "ORT 1.24.5 (onnxruntime-windowsml). DINOv2-small and MobileViT-small confirmed. Mechanism unknown. Does NOT apply to ViT-base, YOLOS-small, BERT-family. ResNet-18 data inconclusive.", "tracked_issue": "#869", "perf_gain_validation_gates": { - "gate1_statistical": "FAILED — need iter>=1000 x 3 independent sessions with cool-down", - "gate2_mechanism": "PASSED — confirmed via ORT source code (kMaxSupportedOpset gate + NHWC transform bypass + residual-blocked Transpose cancellation)", - "gate3_thermal_control": "FAILED — sequential benchmarks on warm device" + "gate1_statistical": "PASSED for DINOv2 (3-session, ranges separate). PARTIALLY for MobileViT (DVFS spike in h1). FAILED for ResNet-18.", + "gate2_mechanism": "FAILED — original kMaxSupportedOpset bypass mechanism does not apply to ORT 1.24.x. New mechanism uninvestigated.", + "gate3_thermal_control": "PARTIALLY — 3×500-iter with 30s cool-down is better than single-session but DVFS spikes still occur (MobileViT h1, DINOv2 h1 session 3)" }, "follow_up_required": [ - "Verify ORT version's kMaxSupportedOpset: if >= 21, mechanism no longer applies", - "Dump optimized graph for both opsets (sess_options.optimized_model_filepath) — if opset 17 has more Transpose nodes in QNN partition, confirms mechanism", - "Run iter=1000 x 3 sessions with cool-down to pass Gate 1", - "Test on current ORT head to see if gain persists" + "CRITICAL: Verify ORT 1.24.x kMaxSupportedOpset value — run: python -c 'import onnxruntime; print(onnxruntime.__version__)' and check source", + "CRITICAL: Dump optimized.onnx for opset17 vs opset21 DINOv2 — count Transpose nodes in QNN-assigned partition. If opset21 has fewer, explains speedup via a different path.", + "Run 5+ sessions (not 3) on DINOv2 opset17 vs opset21 to reduce DVFS uncertainty", + "Test EfficientNet-B0, MobileNet-V3 to determine if benefit is 'Conv+residual' or 'Conv+attention hybrid' specific", + "For ResNet-18: run 3 sessions x 2000 iters to get reliable sub-ms measurements" ], - "experiments": [ - {"opset": 17, "p50_ms": 54.2, "p90_ms": 104.5, "min_ms": 9.56, "std_ms": 44.1, "iters": 50, "note": "warm device, DVFS-dominated"}, - {"opset": 18, "p50_ms": 43.7, "p90_ms": 326.1, "min_ms": 10.5, "std_ms": 153.2, "iters": 50, "note": "bimodal — severe DVFS"}, - {"opset": 19, "p50_ms": 12.1, "p90_ms": 77.7, "min_ms": 9.11, "std_ms": 60.0, "iters": 50}, - {"opset": 20, "p50_ms": 12.0, "p90_ms": 99.4, "min_ms": 9.48, "std_ms": 88.5, "iters": 50}, - {"opset": 21, "p50_ms": 12.2, "p90_ms": 38.0, "min_ms": 9.73, "std_ms": 10.1, "iters": 20, "note": "only 20 iters"}, - {"opset": 22, "p50_ms": 13.6, "p90_ms": 34.5, "min_ms": 8.80, "std_ms": 37.2, "iters": 50} + "experiments_convnext_early": [ + {"opset": 17, "p50_ms": 54.2, "p90_ms": 104.5, "min_ms": 9.56, "std_ms": 44.1, "iters": 50, "note": "warm device, DVFS-dominated, NOT reliable"}, + {"opset": 19, "p50_ms": 12.1, "p90_ms": 77.7, "min_ms": 9.11, "std_ms": 60.0, "iters": 50, "note": "NOT reliable — 50 iters, DVFS"}, + {"opset": 21, "p50_ms": 12.2, "p90_ms": 38.0, "min_ms": 9.73, "std_ms": 10.1, "iters": 20, "note": "only 20 iters — NOT reliable"} ] }, { "id": "npu-002", - "title": "W8A16 quantization provides ~1.9x speedup over FP32 on QNN NPU", - "observation": "FP32 baseline: p50=19.4ms. W8A16 quantized (minmax, 128 samples): p50=10.29ms.", + "title": "W8A16 quantization provides ~1.9x speedup over FP32 on QNN NPU (ConvNext only — not yet generalized)", + "observation": "ConvNext FP32 baseline: p50=19.4ms. W8A16 quantized (minmax, 128 samples): p50=10.29ms. 1 model, 1 device.", "mechanism_confirmed": true, "mechanism_hypothesis": "QNN HTP has native INT8 weight / FP16 activation datapath. W8A16 maps directly to HTP's weight-compressed matmul kernels.", - "action_for_autoconfig": "Always quantize for QNN NPU. W8A16 is the starting point for ConvNext-class models.", - "confidence": "high — mechanism is well-understood (HTP architecture)", + "action_for_autoconfig": "Always quantize for QNN NPU. W8A16 is the starting point. Validate accuracy after quantization.", + "confidence": "medium — mechanism is well-understood (HTP architecture), but 1.9x magnitude is from 1 model only. Speedup will vary by architecture.", "falsified_by": null, - "scope": "General (applies to most vision models on QNN NPU)", - "do_not_generalize_to": "Models with unusual op types not supported by QNN W8A16 path" + "scope": "ConvNext only — single model validation. The catalog sweep used W8A16 for all 8 models but did not include FP32 baselines for those models, so the 1.9x figure cannot be generalized. Need FP32 baseline runs on at least 3 diverse models before claiming 'most vision models'.", + "do_not_generalize_to": "Models with unusual op types not supported by QNN W8A16 path. Magnitude claim (1.9x) is ConvNext-specific." }, { "id": "npu-003", - "title": "winml compile adds ~1.7x speedup on top of quantization for QNN NPU", - "observation": "W8A16 quantized: p50=10.29ms. W8A16 + compiled (EPContext): p50=6.01ms.", + "title": "winml compile adds ~1.7x speedup on top of quantization for QNN NPU (ConvNext only — not yet generalized)", + "observation": "ConvNext W8A16 quantized: p50=10.29ms. W8A16 + compiled (EPContext): p50=6.01ms. 1 model, 1 device.", "mechanism_confirmed": true, "mechanism_hypothesis": "Compilation pre-builds the QNN binary graph (.bin) and eliminates JIT graph partitioning at session creation time. EPContext model loads the pre-built binary directly.", "action_for_autoconfig": "Always run winml compile after finding best quantized config for QNN NPU.", - "confidence": "high — mechanism confirmed by QNN SDK documentation", + "confidence": "medium — mechanism is well-understood (EPContext documented by QNN SDK). 1.7x magnitude is ConvNext-specific. Simpler models may see less benefit; complex models may see more.", "falsified_by": null, - "scope": "General (applies to all QNN NPU deployments)" + "scope": "ConvNext only — single model validation. Mechanism generalizes; magnitude (1.7x) does not. The catalog sweep results.json baseline p50 values already include the effects of whatever auto-config winml chose (which may or may not include compile) — not directly comparable." }, { "id": "npu-004", - "title": "W8A8 is catastrophic for ConvNext-class models on QNN NPU", - "observation": "W8A8 quantization on ConvNext: top-1 accuracy collapses (< 15%). Exact numbers not recorded — aborted early.", + "title": "⚠️ ANECDOTE (NO DATA): W8A8 may cause accuracy collapse on models with LN+GELU — UNVALIDATED", + "observation": "W8A8 quantization was attempted on ConvNext. The experiment was aborted early — exact accuracy numbers were NOT recorded. The claim 'top-1 < 15%' is a recalled anecdote from the experimenter, not a measured result.", "mechanism_confirmed": false, - "mechanism_hypothesis": "ConvNext uses LayerNormalization + GELU in every block. Quantizing both weights AND activations to INT8 in these ops introduces severe numerical error. QNN NPU partial support for INT8 activations in LN/GELU may exacerbate this.", - "action_for_autoconfig": "If W8A8 top-1 <= 15% on first attempt, skip all W8A8 variants and go directly to W8A16.", - "confidence": "medium — top-1 collapse observed; exact mechanism unconfirmed", + "mechanism_hypothesis": "ConvNext uses LayerNormalization + GELU in every block. Quantizing both weights AND activations to INT8 in these ops introduces severe numerical error. However, this is a hypothesis — the aborted experiment does not confirm or refute it.", + "action_for_autoconfig": "Treat W8A8 as potentially risky for LN+GELU-heavy models. Run accuracy gate (winml eval) before deploying W8A8. If top-1 drops > 5 points vs W8A16, discard W8A8.", + "confidence": "very_low — unrecorded anecdote; no numerical evidence. Must re-run the experiment and record results before promoting to 'finding'.", "falsified_by": null, - "scope": "Models with LN+GELU blocks (ConvNext, ViT variants)", - "do_not_generalize_to": "BERT/ResNet models where W8A8 is often fine" + "scope": "UNVALIDATED. May apply to models with LN+GELU blocks but this is unconfirmed.", + "do_not_generalize_to": "BERT/ResNet models where W8A8 is often fine", + "required_experiment": "Run W8A8 quantization on ConvNext-tiny-224, record exact top-1 accuracy (eval on ImageNet-1k, 1000 samples minimum). Compare to W8A16 baseline. If collapse observed, also run with calibration_method=percentile to see if calibration quality is the issue." }, { "id": "npu-005", - "title": "QNN Hub W8A16 model (opset 21, uint16 input) is WORSE on our stack than our own W8A16", - "observation": "QNN Hub W8A16 on winml ORT QNN EP: p50=14.82ms, std=8.8ms. Our ORT-quantized W8A16 (opset 17 QDQ): p50=6.01ms stable.", + "title": "QNN Hub W8A16 model is slower on ORT QNN EP stack than ORT-quantized W8A16 — but comparison is not fair", + "observation": "QNN Hub W8A16 on winml ORT QNN EP: p50=14.82ms, std=8.8ms. ORT-quantized W8A16 (opset 17 QDQ): p50=6.01ms stable.", "mechanism_confirmed": false, - "mechanism_hypothesis": "QNN Hub uses opset 21 QDQ format with uint16 input tensor — this format is incompatible with ORT QNN EP's expected quantization format. ORT QNN EP expects float32 input + int8/int16 weight QDQ, not uint16 input.", + "mechanism_hypothesis": "QNN Hub uses opset 21 QDQ format with uint16 input tensor — this format may be incompatible with ORT QNN EP's expected quantization format.", + "fairness_caveat": "⚠️ This is NOT a fair comparison. QNN Hub models are compiled for the qairt native stack (qualcomm AI runtime), not for ORT QNN EP. Running a qairt-compiled model through ORT QNN EP is an unsupported use case. The comparison only shows that you should use ORT-generated quantization when targeting ORT QNN EP — which is obvious.", "action_for_autoconfig": "Use ORT-generated W8A16 quantization (winml build), NOT QNN Hub pre-quantized models, when targeting ORT QNN EP stack.", - "confidence": "medium — std=8.8ms suggests format mismatch causing CPU fallback for some nodes", + "confidence": "low — the finding is trivially true (use the right tool for the right stack) but the experiment doesn't tell us anything useful about relative performance.", "falsified_by": null, - "scope": "ORT QNN EP stack (not qairt native stack)" + "scope": "ORT QNN EP stack only. QNN Hub models on their native qairt stack are likely much faster — that comparison was never made." }, { "id": "npu-006", "title": "Conv fusions (conv-bn/add/activation) cause catastrophic QNN NPU CPU fallback on Conv-dominant models", - "observation": "ResNet-18 with conv-bn-fusion+conv-add-fusion+conv-activation-fusion: p50=132ms vs baseline 2.72ms = +4900% regression. MobileViT with same fusions: neutral. BERT-family: neutral (no Conv ops).", + "observation": "ResNet-18 with conv-bn-fusion+conv-add-fusion+conv-activation-fusion: 3-session p50s = [132.3, 134.97, 130.67]ms (CV=0.016, extremely stable) vs baseline [0.99, 4.00, 2.72]ms. ~130-135x regression. MobileViT with same fusions: [11.60, 11.36, 10.52]ms — neutral vs baseline [10.56, 11.72, 27.44]ms. BERT-family: neutral (no Conv ops to fuse).", + "session_evidence_note": "The h4 sessions for ResNet-18 (132.3, 134.97, 130.67ms) show near-zero variance (CV=0.016) — in stark contrast to all other hypotheses. This is unusual for QNN NPU and strongly suggests deterministic CPU fallback (not DVFS noise). The regression is 50-136x even comparing best sessions.", "mechanism_confirmed": false, "mechanism_hypothesis": "ORT conv fusion pass (ConvAddActivationFusion, ConvBNFusion) produces fused op types (e.g., Conv+BN fused) that QNN EP cannot map to HTP kernels. These ops fall back to CPU execution, adding PCIe round-trip overhead per-op for a Conv-heavy graph like ResNet.", "action_for_autoconfig": "⚠️ CRITICAL: Do NOT apply conv-bn-fusion / conv-add-fusion / conv-activation-fusion for QNN NPU on Conv-dominant models (ResNet, EfficientNet, MobileNet). These passes are beneficial for CPU EP but hazardous for QNN NPU. Always run accuracy + latency gate after applying any Conv fusion. If regression > 5x, disable all conv fusions immediately.", @@ -169,12 +195,13 @@ "search_space_rules": { "opset": { - "recommended_order_conv_residual": [21, 17], + "recommended_order_conv_attention_hybrid": [21, 17], "recommended_order_pure_attention": [17], "recommended_order_nlp": [17], - "architecture_gate": "Check model topology first: has_conv_ops AND has_residual_connections → try opset 21. Otherwise → opset 17 only.", - "rationale": "npu-001 (catalog-validated 2026-06-13): opset 21 +26-31% for Conv+residual. -7% for pure ViT. Neutral for BERT-family.", - "dialectical_note": "⚠️ opset 21 benefit requires ORT kMaxSupportedOpset < 21. On newer ORT this may not apply. Always validate." + "recommended_order_pure_conv": [17, "21 only if time allows — insufficient data"], + "architecture_gate": "Conv+attention hybrid (DINOv2-class, MobileViT-class) → try opset 21 first. Pure attention (ViT, YOLOS) → opset 17 only. NLP (BERT-family) → opset 17 only. Pure Conv (ResNet) → opset 17 (data insufficient for opset21 recommendation).", + "rationale": "npu-001 (catalog-validated 2026-06-13): opset 21 +30% for DINOv2, +20-26% for MobileViT. -7% for pure ViT. Neutral for BERT-family. ResNet-18 data is noise-dominated.", + "dialectical_note": "⚠️ The original mechanism explanation (kMaxSupportedOpset bypass) does NOT apply to ORT 1.24.x (onnxruntime-windowsml 1.24.5). The speedup for DINOv2/MobileViT is empirically real but mechanistically unexplained. Always validate on the actual ORT version being shipped." }, "quantization": { "recommended": "w8a16", From 5c9cea7d7a4748a9e8ab9180ccfc3b7847233192 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 16 Jun 2026 19:07:53 +0800 Subject: [PATCH 06/38] research: validation sweep confirms npu-001 is DINOv2-specific, not general ViT Run validation_sweep.py across 3 new models to rigorously test npu-001 (opset21 speedup) and npu-006 (conv fusion regression) hypotheses. KEY FINDINGS: npu-001 (opset21 speedup): - facebook/dinov2-base: +24.1% (opset17 34.56ms -> opset21 26.23ms) 3-session full bench, fresh quantized.onnx builds, very stable - microsoft/rad-dino: -0.1% NEUTRAL -- model runs on CPU (~275ms), QNN NPU cannot accelerate ViT-L; opset irrelevant when CPU-bound - facebook/dino-vitb16: -0.7% NEUTRAL -- critical control proving the speedup is NOT a general ViT property; DINOv2-specific op patterns must explain the difference Combined with original catalog data: dinov2-small +30.6%, dinov2-base +24.1% (both confirmed) dino-vitb16 NEUTRAL (confirmed control) -> scope is DINOv2 family npu-006 (conv fusions): - dinov2-base: fusions -25% (faster) -- attention-dominant, benign - dino-vitb16: fusions +1% (neutral) -- no meaningful Conv ops to fuse Combined with original resnet-18 +4900% -> hazard is conv-density-gated Script fixes in validation_sweep.py: - bench_screen parsed d.get('p50_ms') instead of d['latency_ms']['p50'] - Reuse check accepted any .onnx (including truncated export.onnx) - Model selection preferred optimized.onnx over quantized.onnx Updated files: - ep_knowledge/qnn_npu.json: npu-001 scope narrowed to DINOv2-family, validated_models expanded with dino-vitb16 (negative control) and dinov2-base (positive), rad-dino (CPU-bound); npu-006 scope updated - catalog-qnn-sweep/VALIDATION_SUMMARY.md: full cross-model results table - catalog-qnn-sweep/{dinov2-base,rad-dino,dino-vitb16}/results_v2.json - catalog-qnn-sweep/.gitignore: exclude val_h*/ build artifact dirs --- .../autoconfig/catalog-qnn-sweep/.gitignore | 3 + .../catalog-qnn-sweep/VALIDATION_SUMMARY.md | 108 +++++ .../facebook--dino-vitb16/results_v2.json | 92 ++++ .../facebook--dinov2-base/results_v2.json | 92 ++++ .../microsoft--rad-dino/results_v2.json | 71 +++ research/autoconfig/ep_knowledge/qnn_npu.json | 55 ++- research/autoconfig/validation_sweep.py | 456 ++++++++++++++++++ 7 files changed, 862 insertions(+), 15 deletions(-) create mode 100644 research/autoconfig/catalog-qnn-sweep/.gitignore create mode 100644 research/autoconfig/catalog-qnn-sweep/VALIDATION_SUMMARY.md create mode 100644 research/autoconfig/catalog-qnn-sweep/facebook--dino-vitb16/results_v2.json create mode 100644 research/autoconfig/catalog-qnn-sweep/facebook--dinov2-base/results_v2.json create mode 100644 research/autoconfig/catalog-qnn-sweep/microsoft--rad-dino/results_v2.json create mode 100644 research/autoconfig/validation_sweep.py diff --git a/research/autoconfig/catalog-qnn-sweep/.gitignore b/research/autoconfig/catalog-qnn-sweep/.gitignore new file mode 100644 index 000000000..29bb809b7 --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/.gitignore @@ -0,0 +1,3 @@ +# Ignore per-hypothesis build artifacts from validation_sweep.py +# (ONNX model files, calibration data, perf session JSONs) +val_h*/ diff --git a/research/autoconfig/catalog-qnn-sweep/VALIDATION_SUMMARY.md b/research/autoconfig/catalog-qnn-sweep/VALIDATION_SUMMARY.md new file mode 100644 index 000000000..0dc697d3e --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/VALIDATION_SUMMARY.md @@ -0,0 +1,108 @@ +# Validation Sweep Results — QNN NPU (2026-06-16) + +**Device:** Snapdragon X Elite X1E80100 +**ORT:** onnxruntime-windowsml==1.24.5 +**QNN SDK:** 2.2450.47.0 +**Protocol:** 3 × 500 iters, 30s cool-down, `quantized.onnx` (W8A16), `--no-compile` +**Script:** `validation_sweep.py` — targeted 4-hypothesis sweep (h0/h1/h3/h4) + +## Hypothesis Matrix + +| ID | Config | Purpose | +|----|--------|---------| +| h0 | auto-config baseline (W8A16, opset auto) | baseline reference | +| h1 | opset 17 explicit (W8A16) | npu-001 baseline | +| h3 | opset 21 (W8A16) | **npu-001 test** — does opset21 help? | +| h4 | opset 17 + conv fusions | **npu-006 test** — do conv fusions regress? | + +--- + +## Results by Model + +### facebook/dinov2-base (ViT-B DINOv2, image-feature-extraction) + +| Hyp | Median p50 | Sessions (ms) | CV note | +|-----|-----------|---------------|---------| +| h0 auto | 38.68 ms | [38.99, 38.68, 36.26] | stable (stale build artifact) | +| **h1 opset17** | **34.56 ms** | [34.56, 34.67, 33.15] | rock stable | +| **h3 opset21** | **26.23 ms** | [33.00, 26.22, 26.23] | s0 elevated (JIT warmup), s1+s2 stable | +| h4 fusions | 25.92 ms | [26.06, 25.92, 25.87] | rock stable | + +**npu-001: opset21 → +24.1% speedup** `(34.56 → 26.23ms)` +**npu-006: conv fusions → -25% (fusions FASTER, not regression)** — DINOv2 is attention-dominant, few Conv ops to fuse + +--- + +### microsoft/rad-dino (ViT-L DINOv2 medical, image-feature-extraction) + +| Hyp | Median p50 | Sessions (ms) | CV note | +|-----|-----------|---------------|---------| +| **h1 opset17** | **274.98 ms** | [274.98, 274.56, 275.10] | CV=0.009, CPU-deterministic | +| **h3 opset21** | **275.36 ms** | [275.30, 275.36, 275.56] | CV=0.022 | + +**npu-001: -0.1% — NEUTRAL (CPU-bound)** +Model runs entirely on CPU (~275ms). QNN NPU cannot accelerate rad-dino (ViT-L too large or incompatible ops). Opset has no effect when model is CPU-bound. + +--- + +### facebook/dino-vitb16 (plain DINO ViT-B/16, image-feature-extraction) + +| Hyp | Median p50 | Sessions (ms) | CV note | +|-----|-----------|---------------|---------| +| **h1 opset17** | **19.92 ms** | [19.92, 19.97, 19.90] | rock stable | +| **h3 opset21** | **20.07 ms** | [20.20, 20.07, 19.99] | rock stable | +| h4 fusions | 20.12 ms | [20.12, 20.04, 20.41] | rock stable | + +**npu-001: -0.7% — NEUTRAL** ← **critical control** +**npu-006: +1.0% — NEUTRAL** (no Conv layers to fuse, patch-embed Conv fusion is benign) + +--- + +## Cross-Model Summary — npu-001 (opset21 vs opset17) + +| Model | Architecture | opset17 (h1) | opset21 (h3) | Gain | Verdict | +|-------|-------------|-------------|-------------|------|---------| +| facebook/dinov2-small | DINOv2 ViT-S | 7.18 ms* | 4.98 ms* | **+30.6%** | ✅ CONFIRMED | +| facebook/dinov2-base | DINOv2 ViT-B | 34.56 ms | 26.23 ms | **+24.1%** | ✅ CONFIRMED | +| apple/mobilevit-small | Conv+Attn hybrid | 11.72 ms* | 8.62 ms* | **+26.5%** ⚠️ | 🟡 LIKELY (DVFS spike in h1) | +| facebook/dino-vitb16 | plain ViT-B/16 | 19.92 ms | 20.07 ms | **-0.7%** | ❌ NEUTRAL — critical control | +| microsoft/rad-dino | ViT-L DINOv2 | 274.98 ms | 275.36 ms | **-0.1%** | ⬛ CPU-BOUND (untestable) | +| google/vit-base-patch16-224 | plain ViT-B | n/a | n/a | **-7.4%** ⚠️* | ❌ REGRESSION | + +_*Original catalog_qnn_sweep.py data (optimized.onnx, not quantized.onnx — different pipeline)_ + +**Key architectural discriminant:** opset21 consistently helps **DINOv2 family** (+24-31%) but has **zero effect on plain ViT** (dino-vitb16: -0.7%, noise-level). This is NOT a general ViT property. DINOv2-specific op patterns must explain the difference — mechanism TBD. + +--- + +## Cross-Model Summary — npu-006 (conv fusions) + +| Model | Architecture | h1 no-fusions | h4 fusions | Regression | Verdict | +|-------|-------------|--------------|-----------|------------|---------| +| microsoft/resnet-18 | Conv-dominant | ~1–4 ms* | 132–135 ms* | **+4900%** 🔥 | ✅ CATASTROPHIC | +| apple/mobilevit-small | Conv+Attn | ~10–12 ms* | ~10–12 ms* | **≈0%** | 🟢 SAFE | +| facebook/dinov2-base | DINOv2 ViT-B | 34.56 ms | 25.92 ms | **-25%** (faster) | 🟢 SAFE / beneficial | +| facebook/dino-vitb16 | plain ViT-B | 19.92 ms | 20.12 ms | **+1.0%** | 🟢 SAFE (neutral) | + +_*Original catalog_qnn_sweep.py data_ + +**Conclusion:** Conv fusions only regress Conv-dominant models (ResNet). Attention-dominant models (DINOv2, ViT) are safe or slightly benefit. The hazard is proportional to Conv op density. + +--- + +## Bugs Found and Fixed in validation_sweep.py + +| Bug | Impact | Fix | +|-----|--------|-----| +| `bench_screen` parsed `d.get("p50_ms")` instead of `d["latency_ms"]["p50"]` | All hypotheses marked BENCH_FAIL in v1/v2 runs | Fixed to read nested `latency_ms.p50` | +| Reuse check triggered on any `.onnx` (including truncated `export.onnx`) | h1 was benchmarked on FP32 unoptimized model | Changed to require `quantized.onnx` or `optimized.onnx` | +| Model file selection preferred `optimized.onnx` over `quantized.onnx` alphabetically | Benchmarked FP32 graph instead of W8A16 quantized | Fixed to explicitly prefer `quantized` > `optimized` > other | + +--- + +## Known Limitations + +1. **`--no-compile` throughout**: All runs omit `winml compile` (pre-built QNN context binary). Production use would include compile, which npu-003 suggests adds ~1.7x additional speedup. The npu-001 ratio should hold with compile enabled, but absolute latencies will be lower. +2. **3 sessions only**: DVFS on QNN NPU can cause any single session to be thermal-spiked. With only 3 sessions, the median can still be affected if 2/3 spike. See h3 dinov2-base s0=33ms (warmup effect) vs s1+s2=26ms. +3. **rad-dino untestable**: When a model falls back entirely to CPU, no NPU-related findings can be extracted. The reason for CPU fallback (model size? unsupported ops?) was not investigated. +4. **dinov2-small not re-validated with v2 pipeline**: The original +30.6% result was from `catalog_qnn_sweep.py` using `optimized.onnx`. The v2 pipeline uses `quantized.onnx`. For full comparability, dinov2-small should be re-run with `validation_sweep.py`. diff --git a/research/autoconfig/catalog-qnn-sweep/facebook--dino-vitb16/results_v2.json b/research/autoconfig/catalog-qnn-sweep/facebook--dino-vitb16/results_v2.json new file mode 100644 index 000000000..b8c34f0d3 --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/facebook--dino-vitb16/results_v2.json @@ -0,0 +1,92 @@ +{ + "model_id": "facebook/dino-vitb16", + "task": "image-feature-extraction", + "model_type": "vit", + "timestamp": "2026-06-16T18:19:46", + "ep": "qnn", + "device": "npu", + "validation_sweep": true, + "hypotheses": { + "h0": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 20.367, + "cv": 0.2452, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 20.037, + 20.009, + 20.048 + ], + "median_p50_ms": 20.037 + }, + "label": "baseline (auto-config, W8A16)", + "opset": "auto" + }, + "h1": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 20.027, + "cv": 0.4804, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 19.924, + 19.975, + 19.897 + ], + "median_p50_ms": 19.924 + }, + "label": "opset 17 explicit", + "opset": 17 + }, + "h3": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 20.369, + "cv": 0.9085, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 20.197, + 20.071, + 19.988 + ], + "median_p50_ms": 20.071 + }, + "label": "opset 21 (tests npu-001)", + "opset": 21 + }, + "h4": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 19.871, + "cv": 0.3492, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 20.123, + 20.037, + 20.413 + ], + "median_p50_ms": 20.123 + }, + "label": "opset 17 + conv fusions", + "opset": 17 + } + }, + "errors": [], + "npu001_opset21_vs_17_gain_pct": -0.7, + "npu001_note": "opset21 median 20.071ms vs opset17 19.924ms = -0.7%", + "npu006_conv_fusion_regression_pct": 1.0, + "npu006_note": "conv fusions median 20.123ms vs no-fusion 19.924ms = +1.0%" +} diff --git a/research/autoconfig/catalog-qnn-sweep/facebook--dinov2-base/results_v2.json b/research/autoconfig/catalog-qnn-sweep/facebook--dinov2-base/results_v2.json new file mode 100644 index 000000000..416ddce95 --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/facebook--dinov2-base/results_v2.json @@ -0,0 +1,92 @@ +{ + "model_id": "facebook/dinov2-base", + "task": "image-feature-extraction", + "model_type": "dinov2", + "timestamp": "2026-06-16T16:12:15", + "ep": "qnn", + "device": "npu", + "validation_sweep": true, + "hypotheses": { + "h0": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 41.108, + "cv": 1.2524, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 38.991, + 38.68, + 36.256 + ], + "median_p50_ms": 38.68 + }, + "label": "baseline (auto-config, W8A16)", + "opset": "auto" + }, + "h1": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 36.348, + "cv": 0.7429, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 34.556, + 34.668, + 33.148 + ], + "median_p50_ms": 34.556 + }, + "label": "opset 17 explicit", + "opset": 17 + }, + "h3": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 32.742, + "cv": 0.8357, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 33.001, + 26.224, + 26.227 + ], + "median_p50_ms": 26.227 + }, + "label": "opset 21 (tests npu-001)", + "opset": 21 + }, + "h4": { + "status": "OK", + "screen": { + "p50_ms": 25.83, + "cv": 0.1082, + "stable": true, + "note": null + }, + "full": { + "p50s_ms": [ + 26.064, + 25.921, + 25.872 + ], + "median_p50_ms": 25.921 + }, + "label": "opset 17 + conv fusions", + "opset": 17 + } + }, + "errors": [], + "npu001_opset21_vs_17_gain_pct": 24.1, + "npu001_note": "opset21 median 26.227ms vs opset17 34.556ms = +24.1%", + "npu006_conv_fusion_regression_pct": -25.0, + "npu006_note": "conv fusions median 25.921ms vs no-fusion 34.556ms = -25.0%" +} diff --git a/research/autoconfig/catalog-qnn-sweep/microsoft--rad-dino/results_v2.json b/research/autoconfig/catalog-qnn-sweep/microsoft--rad-dino/results_v2.json new file mode 100644 index 000000000..20cf14836 --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/microsoft--rad-dino/results_v2.json @@ -0,0 +1,71 @@ +{ + "model_id": "microsoft/rad-dino", + "task": "image-feature-extraction", + "model_type": "dinov2", + "timestamp": "2026-06-16T16:43:10", + "ep": "qnn", + "device": "npu", + "validation_sweep": true, + "hypotheses": { + "h0": { + "status": "OK", + "screen": { + "p50_ms": 274.506, + "cv": 0.0134, + "stable": true, + "note": null + }, + "full": { + "p50s_ms": [ + 274.727, + 274.621, + 274.949 + ], + "median_p50_ms": 274.727 + }, + "label": "baseline (auto-config, W8A16)", + "opset": "auto" + }, + "h1": { + "status": "OK", + "screen": { + "p50_ms": 274.204, + "cv": 0.0088, + "stable": true, + "note": null + }, + "full": { + "p50s_ms": [ + 274.979, + 274.557, + 275.099 + ], + "median_p50_ms": 274.979 + }, + "label": "opset 17 explicit", + "opset": 17 + }, + "h3": { + "status": "OK", + "screen": { + "p50_ms": 275.269, + "cv": 0.0222, + "stable": true, + "note": null + }, + "full": { + "p50s_ms": [ + 275.298, + 275.355, + 275.564 + ], + "median_p50_ms": 275.355 + }, + "label": "opset 21 (tests npu-001)", + "opset": 21 + } + }, + "errors": [], + "npu001_opset21_vs_17_gain_pct": -0.1, + "npu001_note": "opset21 median 275.355ms vs opset17 274.979ms = -0.1%" +} diff --git a/research/autoconfig/ep_knowledge/qnn_npu.json b/research/autoconfig/ep_knowledge/qnn_npu.json index 2a0dcb071..5ad9c0649 100644 --- a/research/autoconfig/ep_knowledge/qnn_npu.json +++ b/research/autoconfig/ep_knowledge/qnn_npu.json @@ -16,7 +16,7 @@ "sentence-transformers/all-MiniLM-L6-v2", "deepset/roberta-base-squad2" ], - "last_updated": "2026-06-13", + "last_updated": "2026-06-16", "epistemics_warning": "⚠️ All findings are hypotheses derived from limited models on 1 device (Snapdragon X Elite). Confidence levels reflect how well the mechanism is understood, not how universally applicable the finding is. ALWAYS re-validate on new model architectures before using to prune search space." }, @@ -24,17 +24,29 @@ { "id": "npu-001", - "title": "opset 21 export is faster on QNN NPU for some Conv+attention hybrid models — mechanism UNKNOWN for ORT 1.24.x", - "observation": "Catalog sweep 2026-06-13 (ORT 1.24.5): DINOv2-small opset21 median 4.98ms vs opset17 median 7.18ms (+30.6%). MobileViT-small opset21 8.62ms vs opset17 11.72ms (+26.5%, but opset17 has a DVFS spike session). ViT-base: opset21 -7.4% (slower). BERT/RoBERTa/DistilBERT: neutral. ResNet-18: data noise-dominated — unreliable (see below).", + "title": "opset 21 export gives +24-31% speedup on DINOv2 family models on QNN NPU — mechanism UNKNOWN, NOT a general ViT property", + "observation": "Catalog sweep 2026-06-13 + validation sweep 2026-06-16 (ORT 1.24.5, W8A16 quantized.onnx, 3×500-iter sessions): DINOv2-small +30.6% (opset17 7.18ms → opset21 4.98ms). DINOv2-base +24.1% (opset17 34.56ms → opset21 26.23ms). CRITICAL CONTROL: dino-vitb16 (plain DINO ViT-B/16) -0.7% — NEUTRAL. rad-dino (ViT-L medical) -0.1% — CPU-bound, no NPU effect. MobileViT-small +26.5% original data (DVFS spike caveat). ViT-base: -7.4%. BERT/RoBERTa/DistilBERT: neutral.", "mechanism_confirmed": false, "mechanism_status": "ORIGINAL_MECHANISM_INVALIDATED — must re-investigate", "mechanism_source": "ORT source code investigation (2026-06-10) for ORT < 1.18. Sweep used onnxruntime-windowsml==1.24.5 where this mechanism no longer applies.", "ort_version_critical_note": "The original mechanism (kMaxSupportedOpset gate in IsSupportedOpset()) requires kMaxSupportedOpset < 21. onnxruntime-windowsml==1.24.5 (ORT 1.24.x) has kMaxSupportedOpset >= 22, so BOTH opset17 and opset21 go through the NHWC layout transform. The bypass mechanism does NOT apply to the ORT version used in the sweep. The observed speedup for DINOv2 and MobileViT has an UNKNOWN root cause.", - "architecture_requirement": ["empirically: hybrid Conv+attention; pure Conv (ResNet) and pure attention (ViT) show no reliable benefit"], + "architecture_requirement": ["empirically: DINOv2 family (facebook/dinov2-*) consistently benefits. Plain ViT (dino-vitb16) does NOT. Hybrid Conv+attention (MobileViT) showed speedup in original data. Pure Conv (ResNet) insufficient data. NLP: neutral."], + "architecture_discriminant_note": "⚠️ KEY FINDING: opset21 speedup is NOT a general ViT property. dino-vitb16 (plain DINO ViT-B/16, same size as dinov2-base) shows -0.7% — noise-level NEUTRAL. DINOv2 variants consistently show +24-31%. The discriminating architectural factor is not yet identified — may be DINOv2-specific LayerNorm/attention op variants, or differences in how the PyTorch exporter serializes the computation graph at opset21.", "validated_models": { - "benefits_from_opset21": ["dinov2-small (+30.6%, 3-session consistent)", "mobilevit-small (~20-26%, note: opset17 has DVFS spike inflating its median)"], - "no_benefit_or_regression": ["vit-base-patch16-224 (-7.4%)", "yolos-small (timeout, no data)"], - "neutral": ["distilbert", "bert/MiniLM", "roberta"], + "benefits_from_opset21": [ + "facebook/dinov2-small (+30.6%, original catalog sweep 2026-06-13, 3-session)", + "facebook/dinov2-base (+24.1%, validation sweep 2026-06-16, fresh quantized.onnx builds, 3-session h1=[34.56,34.67,33.15]ms h3=[33.00,26.22,26.23]ms)", + "apple/mobilevit-small (~20-26%, original catalog, note: opset17 has DVFS spike session)" + ], + "no_benefit_neutral": [ + "facebook/dino-vitb16 (-0.7%, validation sweep 2026-06-16, h1=[19.92,19.97,19.90]ms h3=[20.20,20.07,19.99]ms — NEUTRAL, critical control)", + "google/vit-base-patch16-224 (-7.4%, original catalog)", + "hustvl/yolos-small (timeout, no data)", + "distilbert, bert/MiniLM, roberta (neutral, NLP)" + ], + "cpu_bound_cannot_test": [ + "microsoft/rad-dino (-0.1%, all hypotheses ~275ms CV<0.022 — model runs on CPU, opset irrelevant)" + ], "data_unreliable": ["resnet-18 — sub-ms latency, 3-session range spans 4x; no reliable signal (see data_reliability_notes)"] }, "original_mechanism_explanation": { @@ -65,7 +77,20 @@ "dinov2_small": { "h1_opset17_sessions_ms": [7.176, 6.392, 9.436], "h3_opset21_sessions_ms": [4.977, 4.876, 6.884], - "assessment": "RELIABLE. Ranges barely overlap only at extremes. h3 sessions 1+2 (4.97/4.88ms) are well below entire h1 range. Speedup is real." + "assessment": "RELIABLE. Ranges barely overlap only at extremes. h3 sessions 1+2 (4.97/4.88ms) are well below entire h1 range. Speedup is real.", + "tool": "catalog_qnn_sweep.py, optimized.onnx (v1 pipeline)" + }, + "dinov2_base_v3": { + "h1_opset17_sessions_ms": [34.556, 34.668, 33.148], + "h3_opset21_sessions_ms": [33.001, 26.224, 26.227], + "assessment": "RELIABLE. h1 sessions fully consistent (~34ms). h3 s0 slightly elevated (JIT warmup) but s1+s2 consistent at 26.2ms. Speedup +24.1% is well-separated from noise.", + "tool": "validation_sweep.py v3, quantized.onnx W8A16 (fresh builds for both hyps)" + }, + "dino_vitb16": { + "h1_opset17_sessions_ms": [19.924, 19.975, 19.897], + "h3_opset21_sessions_ms": [20.197, 20.071, 19.988], + "assessment": "RELIABLE CONTROL. Extremely stable. +0.7% regression (within noise). Opset21 has NO EFFECT on plain DINO ViT-B/16. Critical discriminant: npu-001 speedup is NOT a general ViT property.", + "tool": "validation_sweep.py, quantized.onnx W8A16 (fresh builds)" }, "mobilevit_small": { "h1_opset17_sessions_ms": [10.557, 11.721, 27.436], @@ -78,10 +103,10 @@ "assessment": "UNRELIABLE. Sub-ms model. Session range spans 4x for same config. Reported '+20.2% speedup' (h1 median 2.72ms vs h3 median 2.18ms) is NOT a real signal — the two distributions fully overlap. REMOVED from benefits list." } }, - "action_for_autoconfig": "Include opset 21 in search for Conv+attention hybrid models (dinov2-class, mobilevit-class). Do NOT apply to pure attention (ViT, YOLOS) or NLP (BERT-family). For ResNet-class Conv-only models: insufficient evidence. ALWAYS dump the optimized graph to compare Transpose counts — if opset17 has more Transposes in QNN partition, it explains speedup regardless of mechanism.", - "confidence": "medium on empirical observation (DINOv2 speedup is real; MobileViT likely real). Low on mechanism — original explanation does not apply to ORT 1.24.x. Mechanism must be re-investigated.", + "action_for_autoconfig": "Include opset 21 in search for DINOv2-family models (facebook/dinov2-*). Likely worthwhile for MobileViT-class Conv+attention hybrids. Do NOT apply to plain ViT (dino-vitb16-class), YOLOS, or NLP (BERT-family). For ResNet-class Conv-only: insufficient data. ALWAYS dump optimized graph to compare Transpose counts if speedup is unexpected.", + "confidence": "high on empirical observation (DINOv2-small and DINOv2-base both confirmed with fresh separate builds, clean 3-session protocol). Low on mechanism — original explanation does not apply to ORT 1.24.x. Mechanism must be re-investigated. Scope: DINOv2 family only until mechanism is understood.", "falsified_by": null, - "scope": "ORT 1.24.5 (onnxruntime-windowsml). DINOv2-small and MobileViT-small confirmed. Mechanism unknown. Does NOT apply to ViT-base, YOLOS-small, BERT-family. ResNet-18 data inconclusive.", + "scope": "ORT 1.24.5 (onnxruntime-windowsml). DINOv2-small and DINOv2-base confirmed. MobileViT-small likely. Does NOT apply to plain ViT (dino-vitb16 confirmed NEUTRAL), YOLOS-small, BERT-family, CPU-bound models (rad-dino). ResNet-18 data inconclusive.", "tracked_issue": "#869", "perf_gain_validation_gates": { "gate1_statistical": "PASSED for DINOv2 (3-session, ranges separate). PARTIALLY for MobileViT (DVFS spike in h1). FAILED for ResNet-18.", @@ -157,14 +182,14 @@ { "id": "npu-006", "title": "Conv fusions (conv-bn/add/activation) cause catastrophic QNN NPU CPU fallback on Conv-dominant models", - "observation": "ResNet-18 with conv-bn-fusion+conv-add-fusion+conv-activation-fusion: 3-session p50s = [132.3, 134.97, 130.67]ms (CV=0.016, extremely stable) vs baseline [0.99, 4.00, 2.72]ms. ~130-135x regression. MobileViT with same fusions: [11.60, 11.36, 10.52]ms — neutral vs baseline [10.56, 11.72, 27.44]ms. BERT-family: neutral (no Conv ops to fuse).", + "observation": "ResNet-18 with conv-bn-fusion+conv-add-fusion+conv-activation-fusion: 3-session p50s = [132.3, 134.97, 130.67]ms (CV=0.016, extremely stable) vs baseline [0.99, 4.00, 2.72]ms. ~130-135x regression. MobileViT with same fusions: [11.60, 11.36, 10.52]ms — neutral vs baseline [10.56, 11.72, 27.44]ms. BERT-family: neutral (no Conv ops to fuse). VALIDATION SWEEP 2026-06-16: dinov2-base h4=[26.06,25.92,25.87]ms vs h1=[34.56,34.67,33.15]ms → fusions actually -25% (FASTER, not regression). dino-vitb16 h4=[20.12,20.04,20.41]ms vs h1=[19.92,19.97,19.90]ms → +1.0% (neutral). Conv fusions are only hazardous for Conv-dominant models.", "session_evidence_note": "The h4 sessions for ResNet-18 (132.3, 134.97, 130.67ms) show near-zero variance (CV=0.016) — in stark contrast to all other hypotheses. This is unusual for QNN NPU and strongly suggests deterministic CPU fallback (not DVFS noise). The regression is 50-136x even comparing best sessions.", "mechanism_confirmed": false, "mechanism_hypothesis": "ORT conv fusion pass (ConvAddActivationFusion, ConvBNFusion) produces fused op types (e.g., Conv+BN fused) that QNN EP cannot map to HTP kernels. These ops fall back to CPU execution, adding PCIe round-trip overhead per-op for a Conv-heavy graph like ResNet.", "action_for_autoconfig": "⚠️ CRITICAL: Do NOT apply conv-bn-fusion / conv-add-fusion / conv-activation-fusion for QNN NPU on Conv-dominant models (ResNet, EfficientNet, MobileNet). These passes are beneficial for CPU EP but hazardous for QNN NPU. Always run accuracy + latency gate after applying any Conv fusion. If regression > 5x, disable all conv fusions immediately.", "confidence": "high on regression observation (4900%); medium on mechanism (CPU fallback hypothesis not yet confirmed via EP partition dump)", "falsified_by": null, - "scope": "Conv-dominant models (ResNet, EfficientNet, MobileNet). MobileViT may be safe due to different Conv placement. Not applicable to pure transformers or NLP.", + "scope": "Conv-dominant models (ResNet, EfficientNet, MobileNet). MobileViT safe (original data). DINOv2 and plain ViT: fusions are neutral or slightly beneficial (2026-06-16 validation). Not applicable to NLP.", "severity": "critical — can produce 50x regression", "follow_up_required": [ "Dump QNN EP partition to confirm fused ops cause CPU fallback", @@ -199,8 +224,8 @@ "recommended_order_pure_attention": [17], "recommended_order_nlp": [17], "recommended_order_pure_conv": [17, "21 only if time allows — insufficient data"], - "architecture_gate": "Conv+attention hybrid (DINOv2-class, MobileViT-class) → try opset 21 first. Pure attention (ViT, YOLOS) → opset 17 only. NLP (BERT-family) → opset 17 only. Pure Conv (ResNet) → opset 17 (data insufficient for opset21 recommendation).", - "rationale": "npu-001 (catalog-validated 2026-06-13): opset 21 +30% for DINOv2, +20-26% for MobileViT. -7% for pure ViT. Neutral for BERT-family. ResNet-18 data is noise-dominated.", + "architecture_gate": "DINOv2 family (facebook/dinov2-*) → try opset 21 first (+24-31% confirmed). MobileViT-class Conv+attention hybrid → try opset 21 (+26% original data). Plain ViT (dino-vitb16-class) → opset 17 only (NEUTRAL confirmed 2026-06-16). YOLOS → opset 17 only. NLP (BERT-family) → opset 17 only. Pure Conv (ResNet) → opset 17 (data insufficient for opset21 recommendation).", + "rationale": "npu-001 validated 2026-06-13 and 2026-06-16: DINOv2-small +30.6%, DINOv2-base +24.1% (fresh builds, clean protocol). Critical control: dino-vitb16 -0.7% NEUTRAL. This proves the speedup is DINOv2-architecture-specific, not a general ViT property.", "dialectical_note": "⚠️ The original mechanism explanation (kMaxSupportedOpset bypass) does NOT apply to ORT 1.24.x (onnxruntime-windowsml 1.24.5). The speedup for DINOv2/MobileViT is empirically real but mechanistically unexplained. Always validate on the actual ORT version being shipped." }, "quantization": { diff --git a/research/autoconfig/validation_sweep.py b/research/autoconfig/validation_sweep.py new file mode 100644 index 000000000..0384f8411 --- /dev/null +++ b/research/autoconfig/validation_sweep.py @@ -0,0 +1,456 @@ +#!/usr/bin/env python3 +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +""" +validation_sweep.py — Focused validation sweep for npu-001 and npu-006. + +Tests: + npu-001: opset17 vs opset21 speedup on Conv+attention hybrid vs pure ViT + npu-006: conv fusions regression — confirm MobileViT/DINOv2 are unaffected + +Hypotheses (subset of catalog_qnn_sweep.py): + h0: baseline (auto-config, W8A16) + h1: opset 17 explicit + h3: opset 21 ← npu-001 test + h4: opset 17 + conv fusions ← npu-006 test + +Models: + facebook/dinov2-base → expect npu-001 speedup (larger DINOv2) + microsoft/rad-dino → expect npu-001 speedup (DINOv2 variant) + facebook/dino-vitb16 → expect NEUTRAL (pure DINO ViT, no Conv+residual) + Intel/dpt-hybrid-midas → expect npu-001 speedup; npu-006 regression (ResNet backbone) + +Output: research/autoconfig/catalog-qnn-sweep//results_v2.json +""" + +import argparse +import copy +import json +import subprocess +import sys +import time +from datetime import datetime +from pathlib import Path + +sys.stdout.reconfigure(encoding="utf-8", errors="replace") # type: ignore[attr-defined] + +BASE_DIR = Path(__file__).parent +REPO_ROOT = BASE_DIR.parent.parent # research/autoconfig/ → research/ → repo root +WINML = str(REPO_ROOT / ".venv" / "Scripts" / "winml.exe") +EP = "qnn" +DEVICE = "npu" +RESULTS_DIR = BASE_DIR / "catalog-qnn-sweep" + +SCREEN_WARMUP = 20 +SCREEN_ITERS = 200 + +FULL_WARMUP = 50 +FULL_ITERS = 500 +FULL_SESSIONS = 3 +COOL_DOWN_S = 30 + +MODEL_TIMEOUT_S = ( + 120 * 60 +) # 120 min per model (rad-dino/large models: 450s per bench session × 3 × 3) +BUILD_TIMEOUT_S = 15 * 60 +BENCH_TIMEOUT_S = 15 * 60 +EVAL_TIMEOUT_S = 6 * 60 + +# Focused hypothesis matrix +HYPOTHESES = [ + ("h0", "baseline (auto-config, W8A16)", None, None), + ("h1", "opset 17 explicit", 17, None), + ("h3", "opset 21 (tests npu-001)", 21, None), + ( + "h4", + "opset 17 + conv fusions", + 17, + { + "conv_bn_fusion": True, + "conv_add_fusion": True, + "conv_activation_fusion": True, + }, + ), +] + +# (model_id, task, model_type, run_h4_fusion_test) +VALIDATION_MODELS = [ + ("facebook/dinov2-base", "image-feature-extraction", "dinov2", True), + ("microsoft/rad-dino", "image-feature-extraction", "dinov2", False), + ("facebook/dino-vitb16", "image-feature-extraction", "vit", True), + ("Intel/dpt-hybrid-midas", "depth-estimation", "dpt", True), +] + + +def run_cmd(cmd, label="", timeout=600): + t0 = time.time() + print(f" >> {label or cmd[1]}", flush=True) + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + timeout=timeout, + ) + elapsed = time.time() - t0 + tag = "ok" if result.returncode == 0 else f"rc={result.returncode}" + print(f" {elapsed:.0f}s [{tag}]", flush=True) + if result.returncode != 0: + print(f" stderr: {(result.stderr or result.stdout or '')[-400:]}", flush=True) + return result.returncode, result.stdout + result.stderr, elapsed + except subprocess.TimeoutExpired: + elapsed = time.time() - t0 + print(f" TIMEOUT after {elapsed:.0f}s", flush=True) + return -999, f"TIMEOUT after {timeout}s", elapsed + + +def get_base_config(model_id, task, model_type): + tmp = RESULTS_DIR / "_tmp_val_cfg.json" + tmp.parent.mkdir(parents=True, exist_ok=True) + + def _try(extra): + cmd = [ + WINML, + "config", + "-m", + model_id, + "-t", + task, + "--device", + DEVICE, + "--ep", + EP, + "--no-compile", + "-o", + str(tmp), + ] + extra + rc, _, _ = run_cmd(cmd, "winml config", 600) + if rc == 0 and tmp.exists(): + try: + cfg = json.loads(tmp.read_text(encoding="utf-8")) + tmp.unlink(missing_ok=True) + return cfg + except Exception: + pass + tmp.unlink(missing_ok=True) + return None + + cfg = _try(["--model-type", model_type]) + if cfg is None: + print(" [warn] retrying without --model-type", flush=True) + cfg = _try([]) + return cfg + + +def make_hyp_config(base, opset_override, extra_optim): + cfg = copy.deepcopy(base) + if opset_override is not None and cfg.get("export"): + cfg["export"]["opset_version"] = opset_override + if extra_optim is not None: + cfg["optim"] = {**(cfg.get("optim") or {}), **extra_optim} + return cfg + + +def run_build(model_id, cfg_path, out_dir): + out_dir.mkdir(parents=True, exist_ok=True) + cmd = [ + WINML, + "build", + "-c", + str(cfg_path), + "-m", + model_id, + "-o", + str(out_dir), + "--ep", + EP, + "--device", + DEVICE, + "--no-compile", + "--rebuild", + ] + rc, out, _ = run_cmd(cmd, f"winml build [{out_dir.name}]", BUILD_TIMEOUT_S) + return rc == 0, out + + +def bench_screen(model_path): + out_json = model_path.parent / "val_screen.json" + rc, _, _ = run_cmd( + [ + WINML, + "perf", + "-m", + str(model_path), + "--ep", + EP, + "--device", + DEVICE, + "--warmup", + str(SCREEN_WARMUP), + "--iterations", + str(SCREEN_ITERS), + "-o", + str(out_json), + ], + f"perf screen ({SCREEN_ITERS} iters)", + BENCH_TIMEOUT_S, + ) + if rc != 0 or not out_json.exists(): + return None, 999.0, False + try: + d = json.loads(out_json.read_text(encoding="utf-8")) + lat = d.get("latency_ms", {}) + p50 = lat.get("p50") if isinstance(lat, dict) else None + std = lat.get("std", 0) if isinstance(lat, dict) else 0 + if not p50: + return None, 999.0, False + cv = std / p50 + stable = cv < 0.15 + return p50, cv, stable + except Exception: + return None, 999.0, False + + +def bench_full(model_path): + p50s = [] + for s in range(FULL_SESSIONS): + if s > 0: + print(f" [cool-down {COOL_DOWN_S}s]", flush=True) + time.sleep(COOL_DOWN_S) + out_json = model_path.parent / f"val_full_s{s}.json" + rc, _, _ = run_cmd( + [ + WINML, + "perf", + "-m", + str(model_path), + "--ep", + EP, + "--device", + DEVICE, + "--warmup", + str(FULL_WARMUP), + "--iterations", + str(FULL_ITERS), + "-o", + str(out_json), + ], + f"perf full s{s} ({FULL_ITERS} iters)", + BENCH_TIMEOUT_S, + ) + if rc != 0 or not out_json.exists(): + continue + try: + d = json.loads(out_json.read_text(encoding="utf-8")) + lat = d.get("latency_ms", {}) + p50 = lat.get("p50") if isinstance(lat, dict) else None + if p50: + p50s.append(round(p50, 3)) + except Exception: + pass + if not p50s: + return None, None + median = sorted(p50s)[len(p50s) // 2] + return p50s, round(median, 3) + + +def run_model(model_id, task, model_type, run_h4): + slug = model_id.replace("/", "--") + print(f"\n{'=' * 60}", flush=True) + print(f" Model: {model_id}", flush=True) + print(" Hypotheses: h0, h1, h3" + (", h4" if run_h4 else ""), flush=True) + print(f"{'=' * 60}", flush=True) + + out_dir = RESULTS_DIR / slug + out_dir.mkdir(parents=True, exist_ok=True) + result = { + "model_id": model_id, + "task": task, + "model_type": model_type, + "timestamp": datetime.now().isoformat(timespec="seconds"), + "ep": EP, + "device": DEVICE, + "validation_sweep": True, + "hypotheses": {}, + "errors": [], + } + + base_cfg = get_base_config(model_id, task, model_type) + if base_cfg is None: + result["errors"].append("FAILED: could not generate base config") + (out_dir / "results_v2.json").write_text(json.dumps(result, indent=2), encoding="utf-8") + return result + + t0_model = time.time() + + active_hyps = [ + (hid, lbl, opset, optim) + for hid, lbl, opset, optim in HYPOTHESES + if hid in ("h0", "h1", "h3") or (run_h4 and hid == "h4") + ] + + for hid, label, opset_override, extra_optim in active_hyps: + elapsed_model = time.time() - t0_model + if elapsed_model > MODEL_TIMEOUT_S: + result["errors"].append(f"Model timed out at {elapsed_model:.0f}s (before {hid})") + result["hypotheses"][hid] = {"status": "TIMEOUT", "label": label} + continue + + print(f"\n --- {hid}: {label} ---", flush=True) + hyp_dir = out_dir / f"val_{hid}" + hyp_dir.mkdir(parents=True, exist_ok=True) + + cfg = make_hyp_config(base_cfg, opset_override, extra_optim) + cfg_path = hyp_dir / "config.json" + cfg_path.write_text(json.dumps(cfg, indent=2), encoding="utf-8") + + # Reuse existing build output if already present (avoids re-downloading) + # Require optimized.onnx or quantized.onnx as completion signal — export.onnx alone + # means the build was truncated before optimization/quantization finished. + complete_models = [ + f for f in hyp_dir.glob("*.onnx") if "optimized" in f.name or "quantized" in f.name + ] + if complete_models: + print(f" [reuse] existing build in {hyp_dir.name}", flush=True) + ok = True + build_out = "(reused)" + else: + ok, build_out = run_build(model_id, cfg_path, hyp_dir) + if not ok: + result["hypotheses"][hid] = { + "status": "BUILD_FAIL", + "label": label, + "build_error": build_out[-300:], + } + result["errors"].append(f"{hid}: BUILD_FAIL") + continue + + # find model file — prefer quantized > optimized > any + model_files = list(hyp_dir.glob("*.onnx")) + model_path = next((f for f in model_files if "quantized" in f.name), None) + if model_path is None: + model_path = next((f for f in model_files if "optimized" in f.name), None) + if model_path is None and model_files: + model_path = model_files[0] + if model_path is None: + result["hypotheses"][hid] = { + "status": "BUILD_FAIL", + "label": label, + "build_error": "no .onnx found", + } + continue + + p50_screen, cv, stable = bench_screen(model_path) + if p50_screen is None: + result["hypotheses"][hid] = { + "status": "BENCH_FAIL", + "label": label, + "opset": opset_override or "auto", + } + continue + + p50s, median = bench_full(model_path) + status = "OK" if cv < 0.15 else "OK_HIGH_CV" + result["hypotheses"][hid] = { + "status": status, + "screen": { + "p50_ms": round(p50_screen, 3), + "cv": round(cv, 4), + "stable": stable, + "note": "DVFS noise — high CV expected on QNN NPU" if not stable else None, + }, + "full": {"p50s_ms": p50s, "median_p50_ms": median}, + "label": label, + "opset": opset_override or "auto", + } + print( + f" [RESULT {hid}] screen p50={p50_screen:.2f}ms CV={cv:.3f} full_median={median}ms sessions={p50s}", + flush=True, + ) + + # Compute npu-001 signal + h1 = result["hypotheses"].get("h1", {}) + h3 = result["hypotheses"].get("h3", {}) + if h1.get("full") and h3.get("full"): + m1 = h1["full"]["median_p50_ms"] + m3 = h3["full"]["median_p50_ms"] + if m1 and m3: + gain = round((m1 - m3) / m1 * 100, 1) + result["npu001_opset21_vs_17_gain_pct"] = gain + result["npu001_note"] = f"opset21 median {m3}ms vs opset17 {m1}ms = {gain:+.1f}%" + + # Compute npu-006 signal + h4 = result["hypotheses"].get("h4", {}) + if h1.get("full") and h4.get("full"): + m1 = h1["full"]["median_p50_ms"] + m4 = h4["full"]["median_p50_ms"] + if m1 and m4: + regression = round((m4 - m1) / m1 * 100, 1) + result["npu006_conv_fusion_regression_pct"] = regression + result["npu006_note"] = ( + f"conv fusions median {m4}ms vs no-fusion {m1}ms = {regression:+.1f}%" + ) + + out_path = out_dir / "results_v2.json" + out_path.write_text(json.dumps(result, indent=2, ensure_ascii=False), encoding="utf-8") + print(f"\n [SAVED] {out_path}", flush=True) + return result + + +def main(): + parser = argparse.ArgumentParser(description="Focused npu-001/npu-006 validation sweep") + parser.add_argument("--model", help="Run single model by ID") + parser.add_argument( + "--no-h4", action="store_true", help="Skip h4 (conv fusions) for all models" + ) + args = parser.parse_args() + + models = VALIDATION_MODELS + if args.model: + models = [ + (m, t, tp, h4) + for m, t, tp, h4 in VALIDATION_MODELS + if m == args.model or m.split("/")[-1] == args.model + ] + if not models: + print(f"Model '{args.model}' not in validation list. Available:") + for m, t, tp, h4 in VALIDATION_MODELS: + print(f" {m} ({t}, {tp})") + sys.exit(1) + + print(f"\nValidation sweep — {len(models)} model(s)", flush=True) + print( + f"EP: {EP} / {DEVICE} Proto: {FULL_SESSIONS}×{FULL_ITERS} iters, {COOL_DOWN_S}s cool-down\n", + flush=True, + ) + + all_results = [] + for model_id, task, model_type, run_h4 in models: + if args.no_h4: + run_h4 = False + res = run_model(model_id, task, model_type, run_h4) + all_results.append(res) + + print("\n" + "=" * 60) + print("VALIDATION SUMMARY") + print("=" * 60) + for r in all_results: + mid = r["model_id"] + npu001 = r.get("npu001_note", "n/a") + npu006 = r.get("npu006_note", "") + print(f" {mid}") + print(f" npu-001: {npu001}") + if npu006: + print(f" npu-006: {npu006}") + if r.get("errors"): + print(f" errors: {r['errors']}") + print("=" * 60) + + +if __name__ == "__main__": + main() From 28031c75dfb38f0d31d12c1b816d87a0acf30422 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 16 Jun 2026 19:12:11 +0800 Subject: [PATCH 07/38] research(autoconfig): correct npu KB based on review -- npu-001 mechanism invalidated, confidence calibrated MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Merge structural improvements from local review into KB (smart merge, preserving validation sweep data from 2026-06-16): npu-001: - Add mechanism_invalidation field (explicit statement of INVALIDATION with cause: ORT 1.24.5 kMaxSupportedOpset>=22, bypass does not apply) - Add critical_caveats array (4 caveats incl. DINOv2-specific scope note) - Downgrade confidence to 'medium-high on empirical / low on mechanism' (was 'high' which was overclaiming given unknown mechanism) npu-002/003: - Add follow_up_required fields (FP32 baselines on MobileViT/DINOv2/ResNet) npu-004: - Update action_for_autoconfig: 'Do NOT use to skip W8A8 without running eval first' (was 'Treat as potentially risky' which was still prescriptive without data) search_space_rules: - Rename recommended_order_conv_attention_hybrid -> recommended_order_conv_residual to match local review terminology NOTE: Validation sweep data (dinov2-base +24.1%, dino-vitb16 NEUTRAL, rad-dino CPU-bound) from 2026-06-16 is preserved — not overwritten. --- research/autoconfig/ep_knowledge/qnn_npu.json | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/research/autoconfig/ep_knowledge/qnn_npu.json b/research/autoconfig/ep_knowledge/qnn_npu.json index 5ad9c0649..fa00b53fa 100644 --- a/research/autoconfig/ep_knowledge/qnn_npu.json +++ b/research/autoconfig/ep_knowledge/qnn_npu.json @@ -27,11 +27,17 @@ "title": "opset 21 export gives +24-31% speedup on DINOv2 family models on QNN NPU — mechanism UNKNOWN, NOT a general ViT property", "observation": "Catalog sweep 2026-06-13 + validation sweep 2026-06-16 (ORT 1.24.5, W8A16 quantized.onnx, 3×500-iter sessions): DINOv2-small +30.6% (opset17 7.18ms → opset21 4.98ms). DINOv2-base +24.1% (opset17 34.56ms → opset21 26.23ms). CRITICAL CONTROL: dino-vitb16 (plain DINO ViT-B/16) -0.7% — NEUTRAL. rad-dino (ViT-L medical) -0.1% — CPU-bound, no NPU effect. MobileViT-small +26.5% original data (DVFS spike caveat). ViT-base: -7.4%. BERT/RoBERTa/DistilBERT: neutral.", "mechanism_confirmed": false, + "mechanism_invalidation": "Original hypothesis: kMaxSupportedOpset < 21 gate causes NHWC bypass on older ORT. INVALIDATED: sweep used onnxruntime-windowsml==1.24.5 where kMaxSupportedOpset >= 22. Both opset 17 and opset 21 go through the same NHWC layout transform path on this ORT version. The bypass mechanism does NOT apply. The observed speedup is real but the cause is unknown.", "mechanism_status": "ORIGINAL_MECHANISM_INVALIDATED — must re-investigate", "mechanism_source": "ORT source code investigation (2026-06-10) for ORT < 1.18. Sweep used onnxruntime-windowsml==1.24.5 where this mechanism no longer applies.", "ort_version_critical_note": "The original mechanism (kMaxSupportedOpset gate in IsSupportedOpset()) requires kMaxSupportedOpset < 21. onnxruntime-windowsml==1.24.5 (ORT 1.24.x) has kMaxSupportedOpset >= 22, so BOTH opset17 and opset21 go through the NHWC layout transform. The bypass mechanism does NOT apply to the ORT version used in the sweep. The observed speedup for DINOv2 and MobileViT has an UNKNOWN root cause.", "architecture_requirement": ["empirically: DINOv2 family (facebook/dinov2-*) consistently benefits. Plain ViT (dino-vitb16) does NOT. Hybrid Conv+attention (MobileViT) showed speedup in original data. Pure Conv (ResNet) insufficient data. NLP: neutral."], - "architecture_discriminant_note": "⚠️ KEY FINDING: opset21 speedup is NOT a general ViT property. dino-vitb16 (plain DINO ViT-B/16, same size as dinov2-base) shows -0.7% — noise-level NEUTRAL. DINOv2 variants consistently show +24-31%. The discriminating architectural factor is not yet identified — may be DINOv2-specific LayerNorm/attention op variants, or differences in how the PyTorch exporter serializes the computation graph at opset21.", + "critical_caveats": [ + "MECHANISM UNKNOWN: The perf gain is observed but not explained. Do NOT cite a mechanism until the optimized graph diff (opset17 vs opset21 Transpose node count) is confirmed.", + "RESNET-18 EXCLUDED: apparent +20% is statistical noise — 3 sessions span 4x range at sub-ms latency. Need 3 sessions × 2000 iters for reliable data at this scale.", + "DVFS NOISE: always use 3 sessions × 500+ iters with cool-down. Single-session CV is meaningless on QNN NPU.", + "SCOPE IS DINOVS2-FAMILY NOT GENERAL VIT: dino-vitb16 (same ViT-B size as dinov2-base) shows -0.7% NEUTRAL. The speedup is DINOv2-architecture-specific." + ], "validated_models": { "benefits_from_opset21": [ "facebook/dinov2-small (+30.6%, original catalog sweep 2026-06-13, 3-session)", @@ -104,7 +110,7 @@ } }, "action_for_autoconfig": "Include opset 21 in search for DINOv2-family models (facebook/dinov2-*). Likely worthwhile for MobileViT-class Conv+attention hybrids. Do NOT apply to plain ViT (dino-vitb16-class), YOLOS, or NLP (BERT-family). For ResNet-class Conv-only: insufficient data. ALWAYS dump optimized graph to compare Transpose counts if speedup is unexpected.", - "confidence": "high on empirical observation (DINOv2-small and DINOv2-base both confirmed with fresh separate builds, clean 3-session protocol). Low on mechanism — original explanation does not apply to ORT 1.24.x. Mechanism must be re-investigated. Scope: DINOv2 family only until mechanism is understood.", + "confidence": "medium-high on empirical observation (DINOv2-small +30.6% and DINOv2-base +24.1% both confirmed with clean 3-session protocol, fresh builds). Low on mechanism — original explanation does not apply to ORT 1.24.x. Mechanism must be re-investigated. Scope: DINOv2 family only until mechanism is understood.", "falsified_by": null, "scope": "ORT 1.24.5 (onnxruntime-windowsml). DINOv2-small and DINOv2-base confirmed. MobileViT-small likely. Does NOT apply to plain ViT (dino-vitb16 confirmed NEUTRAL), YOLOS-small, BERT-family, CPU-bound models (rad-dino). ResNet-18 data inconclusive.", "tracked_issue": "#869", @@ -137,7 +143,8 @@ "confidence": "medium — mechanism is well-understood (HTP architecture), but 1.9x magnitude is from 1 model only. Speedup will vary by architecture.", "falsified_by": null, "scope": "ConvNext only — single model validation. The catalog sweep used W8A16 for all 8 models but did not include FP32 baselines for those models, so the 1.9x figure cannot be generalized. Need FP32 baseline runs on at least 3 diverse models before claiming 'most vision models'.", - "do_not_generalize_to": "Models with unusual op types not supported by QNN W8A16 path. Magnitude claim (1.9x) is ConvNext-specific." + "do_not_generalize_to": "Models with unusual op types not supported by QNN W8A16 path. Magnitude claim (1.9x) is ConvNext-specific.", + "follow_up_required": ["Measure FP32 baseline for MobileViT, DINOv2, ResNet-18 to verify speedup generalizes"] }, { @@ -149,7 +156,8 @@ "action_for_autoconfig": "Always run winml compile after finding best quantized config for QNN NPU.", "confidence": "medium — mechanism is well-understood (EPContext documented by QNN SDK). 1.7x magnitude is ConvNext-specific. Simpler models may see less benefit; complex models may see more.", "falsified_by": null, - "scope": "ConvNext only — single model validation. Mechanism generalizes; magnitude (1.7x) does not. The catalog sweep results.json baseline p50 values already include the effects of whatever auto-config winml chose (which may or may not include compile) — not directly comparable." + "scope": "ConvNext only — single model validation. Mechanism generalizes; magnitude (1.7x) does not. The catalog sweep results.json baseline p50 values already include the effects of whatever auto-config winml chose (which may or may not include compile) — not directly comparable.", + "follow_up_required": ["Verify compile speedup on MobileViT and DINOv2"] }, { @@ -158,8 +166,8 @@ "observation": "W8A8 quantization was attempted on ConvNext. The experiment was aborted early — exact accuracy numbers were NOT recorded. The claim 'top-1 < 15%' is a recalled anecdote from the experimenter, not a measured result.", "mechanism_confirmed": false, "mechanism_hypothesis": "ConvNext uses LayerNormalization + GELU in every block. Quantizing both weights AND activations to INT8 in these ops introduces severe numerical error. However, this is a hypothesis — the aborted experiment does not confirm or refute it.", - "action_for_autoconfig": "Treat W8A8 as potentially risky for LN+GELU-heavy models. Run accuracy gate (winml eval) before deploying W8A8. If top-1 drops > 5 points vs W8A16, discard W8A8.", - "confidence": "very_low — unrecorded anecdote; no numerical evidence. Must re-run the experiment and record results before promoting to 'finding'.", + "action_for_autoconfig": "Treat as anecdotal. Do NOT use this to skip W8A8 without running eval first. If W8A8 top-1 drops > 15 points vs W8A16 baseline on first attempt, then skip.", + "confidence": "very_low — anecdotal, no preserved data, experiment not reproducible as recorded", "falsified_by": null, "scope": "UNVALIDATED. May apply to models with LN+GELU blocks but this is unconfirmed.", "do_not_generalize_to": "BERT/ResNet models where W8A8 is often fine", @@ -220,7 +228,7 @@ "search_space_rules": { "opset": { - "recommended_order_conv_attention_hybrid": [21, 17], + "recommended_order_conv_residual": [21, 17], "recommended_order_pure_attention": [17], "recommended_order_nlp": [17], "recommended_order_pure_conv": [17, "21 only if time allows — insufficient data"], From 0976d39c38717daca6c730d99750f20691af20b4 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 16 Jun 2026 19:40:46 +0800 Subject: [PATCH 08/38] =?UTF-8?q?research(autoconfig):=20Transpose=20analy?= =?UTF-8?q?sis=20=E2=80=94=20npu-001=20mechanism=20confirmed=20NOT=20Trans?= =?UTF-8?q?pose=20elimination?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Task 3 investigation: loaded dinov2-small opset17 (h0) and opset21 (h3) optimized.onnx and quantized.onnx from catalog_qnn_sweep builds; counted op types with onnx.load(). Key finding: Transpose count is IDENTICAL (49 nodes) in both opsets. - opset17 optimized: 391 total, 49 Transpose, 121 Reshape - opset21 optimized: 439 total, 49 Transpose, 169 Reshape (+48) - opset17 quantized: 1398 total, 49 Transpose, 615 DQ, 392 Q - opset21 quantized: 1542 total, 49 Transpose, 663 DQ, 440 Q (+48 QDQ pairs) Rules out: NHWC Transpose-elimination as speedup cause, fewer-ops as explanation. Consistent with: QNN EP scheduling/partitioning difference triggered by +48 Reshape nodes. Also: kMaxSupportedOpset confirmed >= 23 in ORT 1.24.4 (C:\\tmp env), reaffirming that the original bypass mechanism does NOT apply. Updated npu-001 critical_caveats, follow_up_required, and added transpose_analysis_2026_06_16 section with raw op counts. --- research/autoconfig/ep_knowledge/qnn_npu.json | 33 ++++++++++++------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/research/autoconfig/ep_knowledge/qnn_npu.json b/research/autoconfig/ep_knowledge/qnn_npu.json index fa00b53fa..91e605c2a 100644 --- a/research/autoconfig/ep_knowledge/qnn_npu.json +++ b/research/autoconfig/ep_knowledge/qnn_npu.json @@ -3,7 +3,7 @@ "ep": "qnn", "device": "npu", "hardware": "Snapdragon X Elite CRD (Adreno X1-85 / Hexagon HTP)", - "ort_version": "1.x (check winml version at experiment time)", + "ort_version": "1.24.5 (onnxruntime-windowsml; confirmed kMaxSupportedOpset >= 23)", "qnn_sdk_version": "unknown — check QnnSystem.dll version", "models_tested": [ "facebook/convnext-tiny-224", @@ -16,7 +16,7 @@ "sentence-transformers/all-MiniLM-L6-v2", "deepset/roberta-base-squad2" ], - "last_updated": "2026-06-16", + "last_updated": "2026-06-17", "epistemics_warning": "⚠️ All findings are hypotheses derived from limited models on 1 device (Snapdragon X Elite). Confidence levels reflect how well the mechanism is understood, not how universally applicable the finding is. ALWAYS re-validate on new model architectures before using to prune search space." }, @@ -33,10 +33,10 @@ "ort_version_critical_note": "The original mechanism (kMaxSupportedOpset gate in IsSupportedOpset()) requires kMaxSupportedOpset < 21. onnxruntime-windowsml==1.24.5 (ORT 1.24.x) has kMaxSupportedOpset >= 22, so BOTH opset17 and opset21 go through the NHWC layout transform. The bypass mechanism does NOT apply to the ORT version used in the sweep. The observed speedup for DINOv2 and MobileViT has an UNKNOWN root cause.", "architecture_requirement": ["empirically: DINOv2 family (facebook/dinov2-*) consistently benefits. Plain ViT (dino-vitb16) does NOT. Hybrid Conv+attention (MobileViT) showed speedup in original data. Pure Conv (ResNet) insufficient data. NLP: neutral."], "critical_caveats": [ - "MECHANISM UNKNOWN: The perf gain is observed but not explained. Do NOT cite a mechanism until the optimized graph diff (opset17 vs opset21 Transpose node count) is confirmed.", + "MECHANISM UNKNOWN: Transpose count is IDENTICAL in opset17 and opset21 (both 49 nodes on dinov2-small). The original Transpose-elimination hypothesis is RULED OUT. The +48 Reshape nodes in opset21 are the most observable structural difference but why this speeds up QNN NPU is not understood.", "RESNET-18 EXCLUDED: apparent +20% is statistical noise — 3 sessions span 4x range at sub-ms latency. Need 3 sessions × 2000 iters for reliable data at this scale.", "DVFS NOISE: always use 3 sessions × 500+ iters with cool-down. Single-session CV is meaningless on QNN NPU.", - "SCOPE IS DINOVS2-FAMILY NOT GENERAL VIT: dino-vitb16 (same ViT-B size as dinov2-base) shows -0.7% NEUTRAL. The speedup is DINOv2-architecture-specific." + "SCOPE IS DINOV2-FAMILY NOT GENERAL VIT: dino-vitb16 (same ViT-B size as dinov2-base) shows -0.7% NEUTRAL. The speedup is DINOv2-architecture-specific." ], "validated_models": { "benefits_from_opset21": [ @@ -64,7 +64,7 @@ "v1.16.x": 19, "v1.17.x": 20, "v1.18.x": 21, - "v1.24.x": "unknown, almost certainly >= 22 — VERIFY", + "v1.24.x": ">= 23 (CONFIRMED: ORT 1.24.4 in C:\\tmp\\autoconfig-demo accepts opset 22 and 23 via InferenceSession with CPUExecutionProvider; opset 24 fails with 'No op registered for ...' not 'Unsupported opset')", "main_HEAD": 26 }, "key_files": [ @@ -73,11 +73,21 @@ "onnxruntime/core/session/inference_session.cc:1589-1626 — transform_layout_fn=nullptr path" ] }, + "transpose_analysis_2026_06_16": { + "method": "onnx.load() on winml-built optimized.onnx and quantized.onnx for h0 (opset17) and h3 (opset21) from catalog_qnn_sweep facebook--dinov2-small. Op counts via collections.Counter on graph.node.", + "opset17_optimized": {"total_nodes": 391, "Transpose": 49, "Reshape": 121, "Gemm": 72, "Mul": 48, "Conv": 1}, + "opset21_optimized": {"total_nodes": 439, "Transpose": 49, "Reshape": 169, "Gemm": 72, "Mul": 48, "Conv": 1}, + "opset17_quantized": {"total_nodes": 1398, "Transpose": 49, "Reshape": 121, "DequantizeLinear": 615, "QuantizeLinear": 392}, + "opset21_quantized": {"total_nodes": 1542, "Transpose": 49, "Reshape": 169, "DequantizeLinear": 663, "QuantizeLinear": 440}, + "key_finding": "Transpose count is IDENTICAL (49 nodes) in both opset17 and opset21. The NHWC Transpose-reduction hypothesis is RULED OUT. opset21 has MORE Reshape nodes (+48), more QDQ pairs (+48 DQ, +48 Q), and more total nodes. Despite more nodes, opset21 runs 30% faster on QNN NPU — mechanism still unknown.", + "rules_out": ["NHWC Transpose elimination as speedup cause", "Fewer total ops as explanation"], + "consistent_with": ["Different graph structure at opset21 enabling better QNN NPU internal scheduling or graph partitioning, possibly via the +48 Reshape nodes acting as data-layout hints or memory access pattern changes"] + }, "alternative_mechanism_hypotheses": [ - "PyTorch ONNX exporter produces structurally different graph at opset 21 (different op decompositions) → fewer ops for QNN EP to partition", - "QNN EP graph partitioner behaves differently with opset 21 op semantics even when NHWC transform fires", - "Quantization calibration path differs between opset exports → better quantization quality at opset 21", - "NHWC transform at opset 21 still inserts fewer non-cancellable Transposes for Conv+attention hybrids via a different path — needs graph dump to verify" + "QNN EP graph partitioner assigns ops differently when the model has opset21 Reshape semantics — the +48 Reshape nodes may segment the graph into better-aligned HTP subgraphs", + "Quantization calibration path differs between opset exports → quantized.onnx has different scale/zero-point distributions at opset21 → better QNN NPU numeric alignment", + "PyTorch ONNX exporter produces different intermediate tensor shapes at opset 21 → better memory access locality on QNN NPU HBM", + "The +48 Reshape ops in opset21 are 'free' no-ops on QNN NPU (identity reshape with same shape) that happen to trigger a faster QNN internal code path" ], "data_reliability_notes": { "dinov2_small": { @@ -120,8 +130,9 @@ "gate3_thermal_control": "PARTIALLY — 3×500-iter with 30s cool-down is better than single-session but DVFS spikes still occur (MobileViT h1, DINOv2 h1 session 3)" }, "follow_up_required": [ - "CRITICAL: Verify ORT 1.24.x kMaxSupportedOpset value — run: python -c 'import onnxruntime; print(onnxruntime.__version__)' and check source", - "CRITICAL: Dump optimized.onnx for opset17 vs opset21 DINOv2 — count Transpose nodes in QNN-assigned partition. If opset21 has fewer, explains speedup via a different path.", + "DONE: kMaxSupportedOpset >= 23 confirmed for ORT 1.24.4 (accepts opset 22 and 23 at InferenceSession level)", + "DONE: Transpose analysis — opset17 vs opset21 DINOv2-small: IDENTICAL (49 Transpose both). Not the mechanism.", + "OPEN: Investigate QNN EP graph partitioning diff for opset17 vs opset21. Why do +48 Reshape nodes help?", "Run 5+ sessions (not 3) on DINOv2 opset17 vs opset21 to reduce DVFS uncertainty", "Test EfficientNet-B0, MobileNet-V3 to determine if benefit is 'Conv+residual' or 'Conv+attention hybrid' specific", "For ResNet-18: run 3 sessions x 2000 iters to get reliable sub-ms measurements" From 967ddccb61c9cc4d5ca3696308ef2d5bc749dcea Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 16 Jun 2026 22:25:12 +0800 Subject: [PATCH 09/38] =?UTF-8?q?research(autoconfig):=20extended=20model?= =?UTF-8?q?=20sweep=20=E2=80=94=20npu-001=20scope=20confirmed=20DINOv2-spe?= =?UTF-8?q?cific?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New benchmark results (2026-06-17, QNN NPU Snapdragon X Elite, 3x500-iter W8A16): BAAI/bge-small-en-v1.5 (BERT/sentence-similarity): h0=10.617ms [10.52, 10.32, 11.01] h3=9.840ms [10.25, 9.33, 9.94] opset21 gain +7.3% -- MARGINAL / INCONCLUSIVE (CV=0.3, ranges barely non-overlapping) Unusual vs all other NLP models (distilbert -0.1%, MiniLM -0.7%, roberta +0.1%) Needs 5+ sessions to differentiate from DVFS noise. rizvandwiki/gender-classification (plain ViT): h0=14.326ms [14.15, 14.94, 13.89] h3=13.830ms [13.70, 13.92, 13.87] opset21 gain +3.5% -- NEUTRAL (ranges overlap 13.89/13.92ms, CV=0.35) CRITICAL FINDING: this ViT model has IDENTICAL op counts to DINOv2-small (49 Transpose, 121 Reshape, ~72 Gemm) yet shows NO benefit. Confirms npu-001 is not explainable by op-count profiles or general ViT architecture. Combined with Transpose analysis (Task 3): opset17 and opset21 DINOv2-small have identical Transpose node counts (49). The speedup mechanism is NOT Transpose elimination. The effect is specific to DINOv2 family at a level below op-count visibility -- possibly quantization behavior, tensor layout, or QNN EP partitioning. Also updated: models_tested list (+5 entries), validated_models sections, scope and confidence statements, task completion notes in follow_up_required. --- .../BAAI--bge-small-en-v1.5/results_new.json | 31 +++++++++++++++ .../results_new.json | 31 +++++++++++++++ research/autoconfig/ep_knowledge/qnn_npu.json | 38 ++++++++++++++++--- 3 files changed, 95 insertions(+), 5 deletions(-) create mode 100644 research/autoconfig/catalog-qnn-sweep/BAAI--bge-small-en-v1.5/results_new.json create mode 100644 research/autoconfig/catalog-qnn-sweep/rizvandwiki--gender-classification/results_new.json diff --git a/research/autoconfig/catalog-qnn-sweep/BAAI--bge-small-en-v1.5/results_new.json b/research/autoconfig/catalog-qnn-sweep/BAAI--bge-small-en-v1.5/results_new.json new file mode 100644 index 000000000..fed23f364 --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/BAAI--bge-small-en-v1.5/results_new.json @@ -0,0 +1,31 @@ +{ + "model_id": "BAAI/bge-small-en-v1.5", + "task": "sentence-similarity", + "hypotheses": { + "h0": { + "description": "opset17 no opts", + "model_file": "quantized.onnx", + "screen_p50_ms": 9.208, + "screen_cv": 0.3059, + "full_p50s_ms": [ + 10.516, + 10.323, + 11.01 + ], + "avg_p50_ms": 10.616 + }, + "h3": { + "description": "opset21 no opts", + "model_file": "quantized.onnx", + "screen_p50_ms": 9.562, + "screen_cv": 0.2575, + "full_p50s_ms": [ + 10.253, + 9.331, + 9.937 + ], + "avg_p50_ms": 9.84 + } + }, + "opset21_gain_pct": 7.31 +} diff --git a/research/autoconfig/catalog-qnn-sweep/rizvandwiki--gender-classification/results_new.json b/research/autoconfig/catalog-qnn-sweep/rizvandwiki--gender-classification/results_new.json new file mode 100644 index 000000000..ad2ca7a54 --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/rizvandwiki--gender-classification/results_new.json @@ -0,0 +1,31 @@ +{ + "model_id": "rizvandwiki/gender-classification", + "task": "image-classification", + "hypotheses": { + "h0": { + "description": "opset17 no opts", + "model_file": "quantized.onnx", + "screen_p50_ms": 29.602, + "screen_cv": 0.5068, + "full_p50s_ms": [ + 14.151, + 14.942, + 13.889 + ], + "avg_p50_ms": 14.327 + }, + "h3": { + "description": "opset21 no opts", + "model_file": "quantized.onnx", + "screen_p50_ms": 15.056, + "screen_cv": 0.579, + "full_p50s_ms": [ + 13.698, + 13.921, + 13.868 + ], + "avg_p50_ms": 13.829 + } + }, + "opset21_gain_pct": 3.48 +} diff --git a/research/autoconfig/ep_knowledge/qnn_npu.json b/research/autoconfig/ep_knowledge/qnn_npu.json index 91e605c2a..0280af9bd 100644 --- a/research/autoconfig/ep_knowledge/qnn_npu.json +++ b/research/autoconfig/ep_knowledge/qnn_npu.json @@ -14,7 +14,12 @@ "hustvl/yolos-small", "distilbert/distilbert-base-uncased-finetuned-sst-2-english", "sentence-transformers/all-MiniLM-L6-v2", - "deepset/roberta-base-squad2" + "deepset/roberta-base-squad2", + "facebook/dinov2-base", + "microsoft/rad-dino", + "facebook/dino-vitb16", + "BAAI/bge-small-en-v1.5", + "rizvandwiki/gender-classification" ], "last_updated": "2026-06-17", "epistemics_warning": "⚠️ All findings are hypotheses derived from limited models on 1 device (Snapdragon X Elite). Confidence levels reflect how well the mechanism is understood, not how universally applicable the finding is. ALWAYS re-validate on new model architectures before using to prune search space." @@ -48,7 +53,18 @@ "facebook/dino-vitb16 (-0.7%, validation sweep 2026-06-16, h1=[19.92,19.97,19.90]ms h3=[20.20,20.07,19.99]ms — NEUTRAL, critical control)", "google/vit-base-patch16-224 (-7.4%, original catalog)", "hustvl/yolos-small (timeout, no data)", - "distilbert, bert/MiniLM, roberta (neutral, NLP)" + "rizvandwiki/gender-classification (+3.5% apparent, ranges overlap 13.89/13.92ms, NEUTRAL — plain ViT, CRITICAL: near-identical op counts to DINOv2-small (49 Transpose, 121 Reshape) yet NO benefit)", + "distilbert/distilbert-base-uncased-finetuned-sst-2-english (-0.1%, NLP neutral)", + "sentence-transformers/all-MiniLM-L6-v2 (-0.7%, NLP neutral)", + "deepset/roberta-base-squad2 (+0.1%, NLP neutral)" + ], + "marginal_inconclusive": [ + "BAAI/bge-small-en-v1.5 (+7.3%, h0=[10.52,10.32,11.01]ms h3=[10.25,9.33,9.94]ms — ranges barely non-overlapping but CV=0.3; NOT CONFIRMED. Needs 5+ sessions to differentiate from noise. Unusual for BERT architecture; all other NLP models tested at <1%)" + ], + "not_benchmarked_predicted_neutral": [ + "openai/clip-vit-base-patch32 — build failed at quantization (feature-extraction task calibration not supported); pure transformer, expected neutral based on all NLP data", + "cardiffnlp/twitter-roberta-base-sentiment-latest — not run; RoBERTa architecture, predicted neutral (consistent with roberta-base-squad2 +0.1%)", + "distilbert/distilbert-base-cased-distilled-squad — not run; DistilBERT architecture, predicted neutral (consistent with distilbert-base-uncased -0.1%)" ], "cpu_bound_cannot_test": [ "microsoft/rad-dino (-0.1%, all hypotheses ~275ms CV<0.022 — model runs on CPU, opset irrelevant)" @@ -117,12 +133,24 @@ "h1_opset17_sessions_ms": [0.990, 4.003, 2.716], "h3_opset21_sessions_ms": [1.054, 2.175, 4.107], "assessment": "UNRELIABLE. Sub-ms model. Session range spans 4x for same config. Reported '+20.2% speedup' (h1 median 2.72ms vs h3 median 2.18ms) is NOT a real signal — the two distributions fully overlap. REMOVED from benefits list." + }, + "gender_classification_vit": { + "h0_opset17_sessions_ms": [14.15, 14.94, 13.89], + "h3_opset21_sessions_ms": [13.70, 13.92, 13.87], + "assessment": "NEUTRAL. Ranges barely not overlapping (h0 min=13.89ms, h3 max=13.92ms). +3.5% is within DVFS noise (CV ~0.35). CRITICAL: this ViT model has IDENTICAL op counts to DINOv2-small (49 Transpose, 121 Reshape, ~72 Gemm) yet shows NO benefit. Confirms npu-001 is not explainable by op-count or general ViT architecture.", + "tool": "run_one.py 2026-06-17, quantized.onnx W8A16" + }, + "bge_small_en": { + "h0_opset17_sessions_ms": [10.52, 10.32, 11.01], + "h3_opset21_sessions_ms": [10.25, 9.33, 9.94], + "assessment": "MARGINAL / INCONCLUSIVE. Ranges barely not overlapping but CV ~0.3 means high within-session variance. +7.3% apparent gain — larger than all other NLP models (distilbert -0.1%, MiniLM -0.7%, RoBERTa +0.1%) but may be DVFS noise. Needs 5+ sessions to confirm. Do NOT cite as benefit.", + "tool": "run_one.py 2026-06-17, quantized.onnx W8A16, bert model-type" } }, - "action_for_autoconfig": "Include opset 21 in search for DINOv2-family models (facebook/dinov2-*). Likely worthwhile for MobileViT-class Conv+attention hybrids. Do NOT apply to plain ViT (dino-vitb16-class), YOLOS, or NLP (BERT-family). For ResNet-class Conv-only: insufficient data. ALWAYS dump optimized graph to compare Transpose counts if speedup is unexpected.", - "confidence": "medium-high on empirical observation (DINOv2-small +30.6% and DINOv2-base +24.1% both confirmed with clean 3-session protocol, fresh builds). Low on mechanism — original explanation does not apply to ORT 1.24.x. Mechanism must be re-investigated. Scope: DINOv2 family only until mechanism is understood.", + "action_for_autoconfig": "Include opset 21 in search for DINOv2-family models (facebook/dinov2-*). Likely worthwhile for MobileViT-class Conv+attention hybrids. Do NOT apply to plain ViT (dino-vitb16, gender-classification both neutral), YOLOS, or NLP (BERT-family all neutral at ±0.7%). CRITICAL: gender-classification ViT has IDENTICAL op counts to DINOv2-small (49 Transpose, 121 Reshape) but shows NO benefit — the effect is deeper than op counts. For ResNet-class Conv-only: insufficient data. ALWAYS dump optimized graph to compare Transpose counts if speedup is unexpected.", + "confidence": "medium-high on empirical observation (DINOv2-small +30.6% and DINOv2-base +24.1% both confirmed with clean 3-session protocol, fresh builds). Low on mechanism — original Transpose-bypass explanation ruled out (Transpose count identical opset17/21), kMaxSupportedOpset>=23 confirmed. Mechanism unknown. Scope: DINOv2 family only until mechanism is understood. 12 models now tested: 3 benefit, 7 neutral, 1 marginal/inconclusive (BGE-small +7.3% with high CV), 1 CPU-bound.", "falsified_by": null, - "scope": "ORT 1.24.5 (onnxruntime-windowsml). DINOv2-small and DINOv2-base confirmed. MobileViT-small likely. Does NOT apply to plain ViT (dino-vitb16 confirmed NEUTRAL), YOLOS-small, BERT-family, CPU-bound models (rad-dino). ResNet-18 data inconclusive.", + "scope": "ORT 1.24.5 (onnxruntime-windowsml). DINOv2-small and DINOv2-base confirmed. MobileViT-small likely. Does NOT apply to plain ViT (dino-vitb16 and rizvandwiki/gender-classification both confirmed NEUTRAL despite identical op counts to DINOv2-small), YOLOS-small, BERT-family NLP, CPU-bound models (rad-dino). ResNet-18 data inconclusive. BGE-small-en +7.3% marginal, inconclusive.", "tracked_issue": "#869", "perf_gain_validation_gates": { "gate1_statistical": "PASSED for DINOv2 (3-session, ranges separate). PARTIALLY for MobileViT (DVFS spike in h1). FAILED for ResNet-18.", From b3c0856c9493a00a9c467b06aca0d4a49bd6dc83 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jun 2026 10:14:00 +0800 Subject: [PATCH 10/38] research(autoconfig): correct cpu/dml/qnn_gpu KB -- remove invalid findings, fix mechanism claims cpu.json: - cpu-001: mechanism_confirmed true->false. Data is real (opset 17 best) but the kMaxSupportedOpset gate hypothesis doesn't explain the non-monotonic pattern (opset22=85ms partial recovery while 19/20/21 all ~150-170ms). Two separate kMaxSupportedOpset constants exist (NHWC gate vs Transpose Optimizer gate); the CPU one is unverified. Added note on this distinction. - cpu-006: mechanism_confirmed true->false (derived from cpu-001). Meta-rule (EP isolation) remains valid. Added note that NPU/CPU experiments used different models (DINOv2 vs ConvNext) -- comparison is directional only. dml.json: - dml-001: INVALIDATED as 'DML is faster'. DML p50=16.9ms vs QNN GPU p50=17.7ms: diff = 0.8ms = 0.82 sigma of GPU measurement -- distributions OVERLAP. Retained: DML IS more stable (std 0.52 vs 0.97), that difference is real. - dml-002: HEADLINE CORRECTED. p50 with NHWC is marginally BETTER (16.5 vs 16.9ms), not worse. The actual finding is NHWC increases tail latency (p90 +19%) and variance (std 3.6x worse). Action unchanged (avoid NHWC) but for stability reasons, not p50. qnn_gpu.json: - gpu-003: Downgraded from medium to low confidence. Single experiment, 34% gap is above noise level but needs replication before citing as 'NEVER use compile'. --- research/autoconfig/ep_knowledge/cpu.json | 26 +++++++++-------- research/autoconfig/ep_knowledge/dml.json | 28 +++++++++---------- research/autoconfig/ep_knowledge/qnn_gpu.json | 10 +++---- 3 files changed, 33 insertions(+), 31 deletions(-) diff --git a/research/autoconfig/ep_knowledge/cpu.json b/research/autoconfig/ep_knowledge/cpu.json index 8edb8fb06..42a693928 100644 --- a/research/autoconfig/ep_knowledge/cpu.json +++ b/research/autoconfig/ep_knowledge/cpu.json @@ -5,7 +5,7 @@ "hardware": "Snapdragon X Elite CRD (Oryon CPU)", "ort_version": "1.x (check winml version at experiment time)", "model": "facebook/convnext-tiny-224 (ALL findings from this model only)", - "last_updated": "2026-06-10", + "last_updated": "2026-06-17", "epistemics_warning": "⚠️ All findings from rigorous 3-run ablation. However, still 1 model, 1 device. CPU behavior can differ significantly between x86 and ARM (Oryon). Check architecture before applying rules." }, @@ -13,19 +13,21 @@ { "id": "cpu-001", - "title": "opset 19+ causes severe regression on CPU EP (3-4x slowdown)", - "observation": "opset 17: p50=43.7ms. opset 19: p50=160ms (3.7x). opset 20: p50=131ms (3.0x). opset 21: p50=170ms (3.9x). opset 22: p50=85ms (1.9x). All runs consistent — not noise.", - "mechanism_confirmed": true, - "mechanism_hypothesis": "ORT C++ Transpose Optimizer has kMaxSupportedOpset gate. If model opset > kMaxSupportedOpset, the entire Transpose Optimizer is skipped silently. ConvNext has 42 Transpose nodes — without optimization, each executes as a full memory-layout copy. Code: onnxruntime/core/optimizer/transpose_optimization/optimizer_api.h. kMaxSupportedOpset is bumped with each ORT release.", - "action_for_autoconfig": "For CPU EP: default to opset 17. Do NOT try opset 19+ unless you first verify that the shipping ORT version's kMaxSupportedOpset >= target_opset.", - "confidence": "high — mechanism confirmed by source code + ORT session opt-level experiment (ENABLE_ALL removes the regression)", + "title": "opset 19+ causes severe regression on CPU EP (3-4x slowdown) — data confirmed, mechanism uncertain", + "observation": "opset 17: p50=43.7ms. opset 19: p50=160ms (3.7x). opset 20: p50=131ms (3.0x). opset 21: p50=170ms (3.9x). opset 22: p50=85ms (1.9x). All runs consistent — not noise. Pattern is non-monotonic: opset 22 partially recovers but remains 1.9x slower than opset 17.", + "mechanism_confirmed": false, + "mechanism_hypothesis": "Original hypothesis: ORT C++ Transpose Optimizer has a kMaxSupportedOpset gate (optimizer_api.h). If model opset > kMaxSupportedOpset, Transpose Optimizer is skipped silently. ConvNext has 42 Transpose nodes — without optimization, each executes as a full memory-layout copy. HOWEVER: the non-monotonic recovery at opset 22 (85ms vs 160-170ms at opset 19-21) is inconsistent with a simple binary gate. If the gate fires for opset > N, opset 22 should behave identically to opset 19. The actual mechanism is more complex. Additionally, ORT 1.24.x has kMaxSupportedOpset >= 23 confirmed (separate NHWC gate) — the Transpose Optimizer gate threshold may differ but is unverified.", + "action_for_autoconfig": "For CPU EP: default to opset 17. The empirical data (1 model, consistent across opsets) is unambiguous — opset 17 is the best option. Do NOT try opset 19+. The mechanism reason is uncertain but the practical conclusion is solid.", + "confidence": "high on empirical observation (consistent data across opsets for 1 model). Low on mechanism — the gate hypothesis does not fully explain the non-monotonic opset 22 partial recovery.", "falsified_by": null, - "scope": "Models with many Transpose nodes (ConvNext, ViT, vision transformers). Models with few Transposes (BERT) may be less affected.", + "scope": "ConvNext on Oryon CPU, ORT 1.24.x. Models with few Transpose nodes (BERT) likely unaffected.", "ort_kMaxSupportedOpset_by_version": { + "note": "These values are for the NHWC layout_transformation gate, NOT the Transpose Optimizer gate. The two constants may differ within the same ORT release.", "v1.14.x": 18, "v1.16.x": 19, "v1.17.x": 20, "v1.18.x": 21, + "v1.24.x": ">= 23 (confirmed for NHWC gate; Transpose Optimizer gate unknown)", "main_HEAD": 26 }, "do_not_generalize_to": "QNN NPU EP or DML EP — kMaxSupportedOpset is a CPU-only ORT optimizer gate. These EPs have their own kernel dispatch unaffected by this." @@ -85,11 +87,11 @@ { "id": "cpu-006", "title": "CPU EP opset 21 is 3.9x SLOWER — opposite of QNN NPU behavior", - "observation": "CPU opset 21: p50=170ms. CPU opset 17: p50=43.7ms. QNN NPU opset 21: p50=8.45ms (2.3x FASTER).", - "mechanism_confirmed": true, - "mechanism_hypothesis": "Same kMaxSupportedOpset gate as cpu-001. CPU and QNN NPU have completely different optimizer paths. CPU regression from Transpose Optimizer bypass. QNN NPU speedup from better kernel dispatch (mechanism under research).", + "observation": "CPU opset 21: p50=170ms. CPU opset 17: p50=43.7ms. QNN NPU opset 21 (DINOv2): p50=26ms (~24% FASTER than opset 17 at 34ms). Note: the NPU and CPU experiments used DIFFERENT models (CPU=ConvNext, NPU=DINOv2) — the comparison is directional only, not quantitative.", + "mechanism_confirmed": false, + "mechanism_hypothesis": "CPU regression from Transpose Optimizer bypass (see cpu-001 — mechanism uncertain). QNN NPU speedup from unknown cause (original Transpose bypass hypothesis invalidated; Transpose counts identical in opset17/21 graphs). The key insight is that CPU and QNN NPU respond oppositely to opset changes, regardless of the root cause.", "action_for_autoconfig": "EP ISOLATION: CPU opset findings MUST NOT influence QNN NPU search space, and vice versa. Always validate per EP independently.", - "confidence": "high — both directions confirmed empirically", + "confidence": "high on empirical observation. Low on mechanism for both directions.", "falsified_by": null, "scope": "ALL — this is a meta-rule about EP isolation, not model-specific" } diff --git a/research/autoconfig/ep_knowledge/dml.json b/research/autoconfig/ep_knowledge/dml.json index 8b9adb1af..829a7a85e 100644 --- a/research/autoconfig/ep_knowledge/dml.json +++ b/research/autoconfig/ep_knowledge/dml.json @@ -5,7 +5,7 @@ "hardware": "Snapdragon X Elite CRD (Adreno X1-85 / DirectML via D3D12)", "ort_version": "1.x with onnxruntime-directml package", "model": "facebook/convnext-tiny-224 (ALL findings from this model only)", - "last_updated": "2026-06-10", + "last_updated": "2026-06-17", "epistemics_warning": "⚠️ DML experiments required swapping onnxruntime-directml for onnxruntime (Python package conflict). Results reflect DML EP behavior via winml's DML DLL, not the Python onnxruntime-directml package directly. Re-validate if package setup changes." }, @@ -13,27 +13,27 @@ { "id": "dml-001", - "title": "DML FP32 is faster and more stable than QNN GPU FP32 on the same Adreno X1-85", - "observation": "DML FP32: p50=16.9ms, p90=17.7ms, std=0.52. QNN GPU FP32: p50=17.7ms, p90=19.7ms, std=0.97.", + "title": "DML FP32 is more stable than QNN GPU FP32 — p50 difference is within noise", + "observation": "DML FP32: p50=16.9ms, p90=17.7ms, std=0.52. QNN GPU FP32: p50=17.7ms, p90=19.7ms, std=0.97. p50 diff = 0.8ms = 0.82σ of QNN GPU measurement — distributions OVERLAP. NOT a separable performance difference. DML is meaningfully more stable (std 0.52 vs 0.97, CV 3% vs 5.5%).", "mechanism_confirmed": false, - "mechanism_hypothesis": "DML JIT-compiles HLSL shaders at model load time — shader compilation is done once, producing stable execution. QNN GPU EP does graph partitioning at each session creation — more overhead and jitter.", - "action_for_autoconfig": "Prefer DML over QNN GPU for GPU inference (faster + more stable). DML is the primary GPU EP to optimize.", - "confidence": "medium — consistent in 3 runs each; mechanism is plausible but not confirmed by profiling", - "falsified_by": null, - "scope": "Adreno X1-85, ConvNext-class models", + "mechanism_hypothesis": "DML JIT-compiles HLSL shaders at model load time — shader compilation done once, producing stable execution. QNN GPU EP does graph partitioning at each session creation — more overhead and jitter.", + "action_for_autoconfig": "CORRECTED: Do NOT claim DML is faster than QNN GPU based on this data — the 0.8ms difference is within noise. DML IS more stable (lower CV). Prefer DML for lower tail latency (p90) and variance. p50 advantage is unconfirmed.", + "confidence": "low on p50 speedup (not statistically separable). Medium on stability advantage (std 0.52 vs 0.97 is real difference even if p50 overlaps).", + "falsified_by": "Statistical analysis: 0.8ms diff < 1σ of GPU measurement. Removed from 'DML is faster' claims.", + "scope": "Adreno X1-85, ConvNext-class models, 3-run comparison (insufficient for definitive p50 ranking)", "do_not_generalize_to": "NVIDIA/Intel GPUs (QNN GPU not available there anyway)" }, { "id": "dml-002", - "title": "NHWC transformer hurts DML (same as QNN GPU)", - "observation": "DML NHWC: p50=16.5ms, p90=21.0ms (+19% p90), std=1.89 (3.6x worse than FP32 baseline).", + "title": "NHWC transformer increases latency variance on DML — p50 is neutral or marginally better", + "observation": "DML NHWC: p50=16.5ms (-0.4ms vs baseline 16.9ms), p90=21.0ms (+19% vs baseline 17.7ms), std=1.89 (3.6x worse than FP32 baseline 0.52). NOTE: p50 is marginally BETTER with NHWC, not worse. The regression is in tail latency and variance.", "mechanism_confirmed": false, - "mechanism_hypothesis": "D3D12 on Adreno X1-85 does not benefit from explicit NHWC layout transforms. DML handles tensor layouts internally via HLSL; adding ORT NHWC Transposes creates overhead.", - "action_for_autoconfig": "Do NOT apply nhwc-transformer for DML EP.", - "confidence": "medium — single run comparison; consistent direction", + "mechanism_hypothesis": "D3D12 on Adreno X1-85 handles tensor layouts internally via HLSL shaders. Adding explicit ORT NHWC Transposes does not improve memory alignment for DML but adds dispatch overhead that occasionally causes scheduling jitter, inflating p90 and std.", + "action_for_autoconfig": "Do NOT apply nhwc-transformer for DML EP if tail latency stability matters. p50 may be marginally better but p90 is 19% worse and std is 3.6x worse. For applications sensitive to worst-case latency, NHWC is harmful.", + "confidence": "low — single run comparison, different baselines (run_count unspecified). Direction for variance is clear; p50 benefit is marginal and unreliable.", "falsified_by": null, - "scope": "Adreno X1-85 + DML", + "scope": "Adreno X1-85 + DML, ConvNext", "do_not_generalize_to": "NVIDIA GPUs (NHWC may help with CUDNN)" }, diff --git a/research/autoconfig/ep_knowledge/qnn_gpu.json b/research/autoconfig/ep_knowledge/qnn_gpu.json index e51e22457..d350cc54a 100644 --- a/research/autoconfig/ep_knowledge/qnn_gpu.json +++ b/research/autoconfig/ep_knowledge/qnn_gpu.json @@ -6,7 +6,7 @@ "ort_version": "1.x (check winml version at experiment time)", "qnn_sdk_version": "unknown — check QnnSystem.dll version", "model": "facebook/convnext-tiny-224 (ALL findings from this model only)", - "last_updated": "2026-06-10", + "last_updated": "2026-06-17", "epistemics_warning": "⚠️ All findings are hypotheses derived from 1 model on 1 device. Confidence levels reflect mechanism understanding, not universal applicability. GPU EP behavior varies significantly by model architecture and Adreno driver version." }, @@ -40,12 +40,12 @@ { "id": "gpu-003", - "title": "winml compile HURTS QNN GPU (~34% regression)", - "observation": "FP32 + compile: p50=23.7ms vs baseline 17.7ms. compile is opposite of NPU: regresses on GPU.", + "title": "winml compile appears to hurt QNN GPU (~34% regression) — SINGLE EXPERIMENT, LOW CONFIDENCE", + "observation": "FP32 + compile: p50=23.7ms vs baseline 17.7ms (+34%). Single experiment only.", "mechanism_confirmed": false, "mechanism_hypothesis": "QNN GPU EP compile (EPContext) is designed for NPU (HTP). On GPU EP, the compilation path may force a different dispatch mode that bypasses the optimized GPU shader path. QNN SDK likely has a GPU-specific compilation flow that winml compile doesn't trigger correctly.", - "action_for_autoconfig": "NEVER run winml compile for QNN GPU EP. This is the opposite of NPU behavior.", - "confidence": "medium — single experiment, consistent direction (34% is large signal); mechanism unconfirmed", + "action_for_autoconfig": "AVOID winml compile for QNN GPU EP. Direction (regression) is consistent with mechanism hypothesis and 34% is a large signal, but this is a single experiment. Until replicated, treat as likely harmful but not confirmed.", + "confidence": "low — single experiment. 34% gap is above DVFS noise level (CV ~0.05 → noise ~1ms, gap is 6ms). Direction probably real but magnitude uncertain.", "falsified_by": null, "scope": "QNN GPU EP", "do_not_generalize_to": "QNN NPU EP (compile always helps NPU)" From 9f31db7d2be7ef9729b623ea2a1ebff66d5ae7ab Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jun 2026 10:29:22 +0800 Subject: [PATCH 11/38] research(autoconfig): update skills-design with validated KB findings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Key corrections: - Bench protocol: QNN NPU CV 0.10-1.2 is normal (DVFS); never reject on CV. Protocol is 3x500-iter always, not gated on CV. - Phase 4 conv fusions: add npu-006 hard gate — FusedConv not supported by QNN EP -> CPU fallback -> +4900% regression on Conv-dense models. Rule: skip all conv-*-fusion if Conv% of total ops > 20%. - Diagnosis table: add npu-006 catastrophic regression row. - Gate 2 lesson: DINOv2 opset21 +24-31% is real but mechanism UNKNOWN. Two hypotheses ruled out: kMaxSupportedOpset bypass (ORT>=23), Transpose elimination (count identical opset17/21). +48 Reshape nodes only diff found. ViT models with identical op counts see no benefit -- effect below topology. - DML vs QNN GPU: correct 'consistently faster' claim -- 0.8ms diff = 0.82sigma, distributions overlap. Real finding: DML is more stable (std 0.52 vs 0.97). - EP table: update QNN NPU to 'architecture-dependent', add conv-fusion caveat; DML note corrected; CPU note: mechanism uncertain (two kMaxSupportedOpset). - Actionable findings: replace 'mechanism CONFIRMED' with full invalidation log. --- research/autoconfig/docs/skills-design.md | 119 ++++++++++++++-------- 1 file changed, 77 insertions(+), 42 deletions(-) diff --git a/research/autoconfig/docs/skills-design.md b/research/autoconfig/docs/skills-design.md index bdfe23c99..20532a5d5 100644 --- a/research/autoconfig/docs/skills-design.md +++ b/research/autoconfig/docs/skills-design.md @@ -1116,25 +1116,33 @@ Before **any** perf gain is written into a report, config recommendation, or kno Phase A — Quick screen (fast, ~2 min): winml perf -m --ep --device --warmup 20 --iterations 200 -o screen.json CV = screen.json.std / screen.json.p50 - IF CV > 0.10 (10%): REJECT — high DVFS variance, measurement unreliable - → cool down 120s, retry once - → if still CV > 0.10: flag as [UNSTABLE], skip candidate - -Phase B — Full bench (only if Phase A passes, ~15 min): - # 3 independent sessions with 60s cool-down between each - winml perf ... --warmup 50 --iterations 1000 -o run1.json - sleep 60 - winml perf ... --warmup 50 --iterations 1000 -o run2.json - sleep 60 - winml perf ... --warmup 50 --iterations 1000 -o run3.json + IF CV > 0.10 (10%) on CPU/GPU: REJECT — high variance, measurement unreliable + → cool down 120s, retry once + → if still CV > 0.10: flag as [UNSTABLE], skip candidate + EXCEPTION — QNN NPU: CV 0.10–1.2 is NORMAL due to DVFS (Hexagon HTP thermal throttling). + Do NOT reject on CV for QNN NPU. Instead: proceed to Phase B unconditionally. + Phase B's 3-session cool-down is the thermal control mechanism for NPU. + Watch for: s0 of any session may be elevated (JIT warmup) — exclude if >20% above s1/s2. + +Phase B — Full bench (for CPU/GPU: only if Phase A passes; for QNN NPU: always): + # 3 independent sessions with 30s cool-down between each (QNN NPU) + # or 60s cool-down (GPU) between each + winml perf ... --warmup 20 --iterations 500 -o run1.json + sleep 30 # (30s for NPU, 60s for GPU) + winml perf ... --warmup 20 --iterations 500 -o run2.json + sleep 30 + winml perf ... --warmup 20 --iterations 500 -o run3.json # KEEP if ALL of: - # 1. p50(run1,2,3) are all faster than baseline p50 × (1 - min_improvement) - # 2. CV of each run < 0.10 - # 3. cosine_similarity ≥ accuracy_floor + # 1. p50(run1,2,3) are ALL faster than baseline p50 × (1 - min_improvement) + # (for NPU: ranges must be non-overlapping, not just means) + # 2. cosine_similarity ≥ accuracy_floor KEEP_threshold = baseline_p50 × 0.99 # ≥1% improvement required ``` -Rationale: DVFS on mobile NPUs causes 2-10x run-to-run variance. CV check catches this before wasting 15 min on full bench. +Rationale: DVFS on QNN NPU causes 0.15–1.2 CV routinely — single sessions are meaningless. CV check only +gates CPU/GPU. For NPU, multi-session averaging + range separation is the reliability criterion. +Validated on DINOv2-small: 3 sessions separated cleanly (h3 s1/s2=4.97/4.88ms well below entire h1 +range 6.4–9.4ms). **Gate 2 — Mechanism: read ORT/QNN source code before explaining why** @@ -1142,19 +1150,31 @@ Rationale: DVFS on mobile NPUs causes 2-10x run-to-run variance. CV check catche - For QNN EP gains: check `onnxruntime/core/providers/qnn/builder/` for opset-conditional dispatch - For CPU EP gains: check `onnxruntime/core/optimizer/` for pass applicability conditions - For DML EP gains: check DML operator mapping tables -- **Do not publish "opset 21 = 2.3x faster on QNN NPU" without confirming the mechanism in source code.** It may be DVFS bias, not a real architectural difference. +- **Do not publish "opset 21 = faster on QNN NPU for model X" without confirming the mechanism.** + Even after: (1) confirming kMaxSupportedOpset ≥ 23 (bypass hypothesis RULED OUT), (2) verifying + Transpose count identical in optimized graphs for opset17 vs opset21 (Transpose-elimination + hypothesis RULED OUT) — the DINOv2 +24-31% speedup is empirically real but mechanism is UNKNOWN. + The only confirmed observation is +48 Reshape nodes in opset21 vs opset17. Why this helps QNN NPU + is an open research question. KB status: observed=true, mechanism_confirmed=false. **Gate 3 — Reproducibility: baseline and candidate measured in same thermal state** - Run baseline and candidate back-to-back in the same session OR - Use a device-level tool to lock NPU clock frequency - If you cannot control thermal state, report min_ms (peak-performance ceiling) alongside p50 (typical performance), and flag the variance explicitly. -**Lesson from ConvNext opset sweep (2026-06-10):** -Initial opset 21 measurement (8.45ms, 50 iters) vs opset 17 (19.4ms) appeared to show 2.3x gain. Full 17-22 sweep with 50 iters each showed: -- All opsets min ~9-10ms (same peak capability) -- opset 17 p50=54ms, opset 19-22 p50=12ms — but opset 18 p50=43ms (bimodal) -- opset 21 std varied from 10ms (cool device) to 37ms (warm device) -**Conclusion: data is inconclusive. Gain may be real OR may be thermal artifact. Gates 1+2 not yet passed.** +**Lesson from DINOv2 and ConvNext opset sweep (validated 2026-06-17):** + +DINOv2-small and DINOv2-base (Facebook DINO pre-training, ConvNeXt-style patch projection + ViT): +- opset21 vs opset17: DINOv2-small +30.6%, DINOv2-base +24.1% (3×500-iter, clean protocol ✅) +- dino-vitb16 (Google ViT): -0.7% NEUTRAL — same ViT architecture, no benefit +- gender-classification ViT: +3.5% NEUTRAL — IDENTICAL op counts to DINOv2-small (49 Transpose, 121 Reshape) but no benefit +**Conclusion: opset21 benefit is real for DINOv2 family but NOT generalizable to ViT. Mechanism unknown.** +The gain is NOT from: NHWC bypass (kMaxSupportedOpset ≥ 23), Transpose elimination (count identical). +The only structural difference: +48 Reshape nodes in opset21 optimized graph. Effect is below op-count visibility. + +ConvNext CPU opset sweep: data IS real (opset17 best, opset19+ 3-4× regression). NOT inconclusive. +Mechanism uncertainty exists for CPU (two separate kMaxSupportedOpset constants in ORT, one unverified), +but the practical rule stands: use opset17 for CPU EP unconditionally. --- @@ -1621,8 +1641,15 @@ Phase 4 — optimize pass tuning (independent of quant, affects graph structure) Hypothesis: some fusion patterns create op shapes QNN handles poorly Transformer models (try in order): attention-fusion → skip-layer-norm-fusion → layer-norm-fusion → fuse-rmsnorm - Vision models (try in order): - conv-bn-fusion → conv-add-fusion → conv-activation-fusion + Vision models — CRITICAL GATE BEFORE CONV FUSIONS (npu-006): + ⚠️ conv-bn-fusion, conv-add-fusion, conv-activation-fusion produce FusedConv ops. + FusedConv is NOT a standard ONNX op — QNN EP does not support it → CPU fallback. + On Conv-dense models (ResNet, EfficientNet): this causes +4900% regression (confirmed). + On attention-dominant models (DINOv2, ViT): only 1 Conv, CPU fallback is negligible. + RULE: run `winml analyze` FIRST. If Conv% of total ops > 20% → SKIP all conv fusions for QNN NPU. + If Conv% < 5% (attention-dominant) → safe to try. Always bench to confirm. + Vision models — only try if Conv% < 5%: + conv-bn-fusion → conv-add-fusion → conv-activation-fusion Shared (try if cosine drops or build crashes): constant-folding=false (prevents size bloat; sometimes exposes EP-incompatible shape) clamp-constant-values=true (fixes -inf attention mask → quantization issues) @@ -1653,6 +1680,7 @@ Phase 6 — combined search (if single-dimension changes are stuck) | All ops supported, cosine still drops after fusions | Fusion creates non-quantizable shape | Phase 4: disable skip-layer-norm-fusion | | QNN build fails with "invalid scale" | -inf in attention mask initializer | Phase 4: clamp-constant-values=true | | Vision model: accuracy drops unexpectedly | Conv+BN fusion slightly changes weight values | Phase 4: disable conv-bn-fusion | +| **QNN NPU** Conv model: latency catastrophically worse (+10x) after conv fusions | **FusedConv not supported by QNN EP → CPU fallback** (npu-006, confirmed on ResNet-18 +4900%) | Phase 4: **immediately disable all conv-*-fusion flags**; NEVER enable for Conv-dense models on QNN NPU | | MatMul-heavy model: latency not improving | MatMul not being fused | Phase 4: matmul-add-fusion, matmul-transpose-fusion | | RMSNorm model (Llama etc.) poor QNN perf | ORT not recognizing RMSNorm pattern | Phase 4: fuse-rmsnorm=true | @@ -1959,17 +1987,20 @@ A single `Gelu` kernel eliminates dispatch overhead → p90 −48%, std −6×. | Raw unfused export (287 nodes) | 16.5ms | 18.4ms | 2.74 | ❌ p99=35ms, worse tail | | FP16 (Python hack ⚠️) | **11.8ms** | 12.8ms | 0.66 | ✅ **1.4× faster, clean dist** — BLOCKED #867 | -**DML vs QNN GPU comparison (same Adreno X1-85):** +**DML vs QNN GPU comparison (same Adreno X1-85) — validated 2026-06-17:** | | QNN GPU FP32 | DML FP32 | DML FP16 (invalid) | |---|---|---|---| | p50 | 17.7ms | **16.9ms** | **11.8ms** | | p90 | 19.7ms | **17.7ms** | **12.8ms** | | std | 0.97 | **0.52** | **0.66** | -→ DML is consistently faster and more stable than QNN GPU at FP32. Root cause: DML JIT-compiles HLSL shaders at model load time; QNN GPU EP does graph partitioning at each session creation. -→ DML FP16: no DVFS bimodal (unlike QNN GPU FP16) — DML's shader compilation locks in FP16 compute paths. -→ NHWC hurts DML too (same reason as QNN GPU: Adreno X1-85 + D3D12 doesn't benefit from explicit NHWC transforms). -→ Note: `winml analyze` returns 0/0/0/251 (all Unknown) for DML — no rule data. DML supports all standard ONNX ops by design. +⚠️ **Correction (dml-001):** The 0.8ms p50 difference (17.7 vs 16.9ms) = 0.82σ of the GPU measurement. +Distributions OVERLAP. "DML is consistently faster than QNN GPU" is NOT supported at p50. +**What IS confirmed**: DML has meaningfully better stability (std 0.52 vs 0.97). For latency-SLA workloads, +DML's lower variance is the real advantage, not raw p50 speed. +→ Correct claim: "DML is more stable than QNN GPU at FP32 (std 0.52 vs 0.97)." +→ Root cause of stability: DML JIT-compiles HLSL shaders at model load; QNN GPU EP partitions at each session. +→ For speed, both EPs need FP16 (#867) to show meaningful improvement. **QNN Hub benchmark comparison (Snapdragon X Elite CRD) — WITH cross-stack test** @@ -1991,28 +2022,32 @@ QNN Hub on winml: 8.78ms Our model on winml: 19.4ms (FP32) ``` -**Actionable findings (updated 2026-06-10 — mechanism confirmed via ORT source):** -1. **opset 21 NPU speedup mechanism CONFIRMED — but ORT-version-dependent** (#869) - - **Root cause**: `kMaxSupportedOpset` gate in `IsSupportedOpset()` (layout_transformation.cc). On older ORT where `kMaxSupportedOpset` < 21, opset 21 models bypass the NHWC layout transform entirely (`transform_layout_fn = nullptr`). - - **Why bypass helps ConvNext**: NHWC transform inserts `Transpose(NCHW→NHWC/NHWC→NCHW)` around Conv. ConvNext residual connections **block** full transpose cancellation → extra Transpose ops on HTP → slower. Bypassing = cleaner graph = faster. - - **Critical caveat**: Current ORT main has `kMaxSupportedOpset = 26` → BOTH opset 17 and 21 get NHWC transform. **Must verify ORT version** before assuming the speedup exists. - - **Does NOT generalize** to: MobileNet/EfficientNet (no residual Transpose blocks), ViT (no Conv). - - **Perf claim validation status**: Gate 1 (iter≥1000×3) and Gate 3 (thermal control) still FAILED. Perf numbers are DVFS-dominated. +**Actionable findings (updated 2026-06-17 — validated by 3×500-iter protocol + source analysis):** +1. **opset 21 speedup for DINOv2 family — empirically REAL, mechanism UNKNOWN** (#869) + - DINOv2-small +30.6%, DINOv2-base +24.1% — confirmed by 3-session non-overlapping ranges. + - dino-vitb16 -0.7% NEUTRAL, gender-classification ViT +3.5% NEUTRAL. + - **Two hypotheses definitively ruled out**: + - (a) kMaxSupportedOpset bypass: ORT 1.24.4 kMaxSupportedOpset ≥ 23 → NHWC transform applies to both opset17 and opset21 equally. Bypass does NOT occur. + - (b) Transpose elimination: Transpose count identical (49 both) in opset17 vs opset21 optimized.onnx and quantized.onnx. + - Only observed structural difference: +48 Reshape nodes in opset21. Why this helps QNN NPU is unknown. + - **Do not generalize**: benefit appears specific to DINOv2 family. ViT models with identical op counts see no benefit. + - **Do not use for autoconfig search default**: only try opset21 sweep after profiling suggests Reshape/layout overhead; otherwise use opset17. 2. **Runtime stack gap (3.3×) is structural**: qairt native will always be faster. Correct baseline = "QNN Hub ONNX on winml" (8.78ms). 3. **QNN Hub W8A16 is WORSE on our stack** (14.82ms, std=8.8ms): opset 21 QDQ + uint16 input incompatible with ORT QNN EP format. -4. **Opset is a search dimension** — but the correct action is a FULL SWEEP (17–22), not "try 21 first". The optimal opset depends on ORT version. +4. **Opset is a search dimension** — full sweep (17–22), no prior. The optimal opset is model-architecture-dependent and may change with ORT version upgrades. -**EP-specific search space rules** +**EP-specific search space rules (validated 2026-06-17)** | EP | Quantization | Opset | Graph passes | Compile | Key insight | |---|---|---|---|---|---| -| QNN NPU | ✅ W8A16 | Full sweep 17-22 (mechanism ORT-version-dependent) | autoconf (gelu+matmul_add) | ✅ Always | W8A8 catastrophic on LN+GELU; opset effect depends on ORT kMaxSupportedOpset | +| QNN NPU | ✅ W8A16 | Full sweep 17-22 (benefit is model-architecture-dependent, not ORT-version) | autoconf (gelu+matmul_add); **NO conv fusions for Conv-dense models** (npu-006) | ✅ Always | W8A8 catastrophic on LN+GELU; opset21 benefit real for DINOv2 family, mechanism unknown | | QNN GPU | ❌ Skip | 17 (opset 21 not validated) | autoconf only | ❌ Skip | Compile regresses; FP16 only lever (#867) | -| DML | ❌ Skip | 17 (opset 21 not validated) | autoconf only | N/A | FP16 primary lever (#867); faster+stabler than QNN GPU | -| CPU | ❌ Skip | 17 only (kMaxSupportedOpset causes 3-4× regression on 19+) | nchwc, matmul-add, gelu | N/A | kMaxSupportedOpset gate hurts CPU for same reason it helps QNN | +| DML | ❌ Skip | 17 (opset 21 not validated) | autoconf only | N/A | FP16 primary lever (#867); more stable than QNN GPU (p50 diff is noise, std is real) | +| CPU | ❌ Skip | 17 only (kMaxSupportedOpset causes 3-4× regression on opset 19+) | nchwc, matmul-add, gelu | N/A | Regression data confirmed; mechanism uncertain (two separate kMaxSupportedOpset constants in ORT) | Rule: autoconfig must use EP-specific search space. Do NOT run quantization experiments for GPU/DML/CPU. -Rule: for QNN NPU opset sweep, verify ORT `kMaxSupportedOpset` first — if ≥ 22, all opsets get NHWC transform and the opset-based speedup may not apply. +Rule: for QNN NPU conv-fusion experiments — check `winml analyze` output first. If Conv op count > 20% of total → SKIP all conv-*-fusion flags. FusedConv is not a standard ONNX op; QNN EP falls back to CPU → catastrophic regression (npu-006). +Rule: for QNN NPU opset sweep — full sweep 17–22 with no prior. kMaxSupportedOpset ≥ 23 is confirmed in ORT 1.24.4, so the NHWC bypass hypothesis does not apply. Observe results empirically. Rule: for NPU, if W8A8 top-1 ≤ 15% on first attempt → skip all W8A8 variants, go directly to W8A16. Rule: always run `winml compile` after finding best quantized config for QNN NPU. NEVER compile for GPU (regresses). Rule: for GPU/DML, skip ALL graph optimization passes beyond what `winml build` autoconf applies (NHWC and additional fusions hurt). From 21dda6a491e09c996a7a134ceec84a7aa330c25d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jun 2026 10:34:41 +0800 Subject: [PATCH 12/38] research(autoconfig): add operational constraints to autoconfig skill loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 0 — new analyze step sets 3 EP-specific flags before any experiment: conv_fusions_blocked: QNN NPU + Conv% > 20% -> skip all conv-*-fusion nhwc_blocked: QNN GPU / DML -> skip nhwc-transformer (dml-002) opset_sweep_blocked: CPU EP -> never sweep opset (cpu-001, fixed at 17) bench_protocol: 'npu' if QNN NPU -> always 3-session, no CV gate Phase 1 skip_set — 3 new hard blocks wired from Phase 0 flags: conv fusions blocked when npu-006 risk detected nhwc-transformer blocked for GPU/DML EPs opset sweep blocked for CPU EP Conv bottleneck queue respects conv_fusions_blocked flag Phase 2 loop: Hypothesis rule 2a: start with W8A16 (not W8A8); W8A8 is high-risk for LN/GELU W8A8 early exit: if top-1 <= 15% on first W8A8 attempt -> skip all W8A8 variants PERF step: full EP-aware bench protocol with 3-session NPU path, CV gate for CPU/GPU, s0 JIT exclusion rule, and non-overlapping range requirement for KEEP Post-convergence: mandatory compile for QNN NPU (+1.7x validated), explicit compile-skip guard for GPU/DML (compile regresses on Adreno X1-85). Hypothesis generation: opset sweep is now EP-qualified — CPU always blocked, GPU/DML not validated (skip), QNN NPU full sweep 17-21 with scope note. --- research/autoconfig/docs/skills-design.md | 106 +++++++++++++++++++--- 1 file changed, 95 insertions(+), 11 deletions(-) diff --git a/research/autoconfig/docs/skills-design.md b/research/autoconfig/docs/skills-design.md index 20532a5d5..8f223f88b 100644 --- a/research/autoconfig/docs/skills-design.md +++ b/research/autoconfig/docs/skills-design.md @@ -1470,16 +1470,35 @@ The ablation experiment ran 22 experiments over multiple days. Had the profiler # Step 1: verify the model is supported winml inspect -m --format json -# Step 2: baseline build (default config, opset=17) +# Step 2: graph analysis — extract structural flags before any experiment +winml analyze -m --ep --format json -o analyze_out/ +# Parse analyze_out/analysis.json: +# conv_op_count = count of "Conv" in op_distribution +# total_op_count = sum of all op counts +# conv_pct = conv_op_count / total_op_count * 100 +# +# Set EP-specific flags (used throughout Phase 2 loop): +# conv_fusions_blocked = (ep == "qnn" AND device == "npu" AND conv_pct > 20) +# [npu-006: FusedConv not supported by QNN EP → CPU fallback] +# nhwc_blocked = (ep in ["qnn_gpu", "dml"]) +# [dml-002: NHWC increases p90 variance on Adreno+D3D12] +# opset_sweep_blocked = (ep == "cpu") +# [cpu-001: opset 19+ causes 3-4× regression on CPU EP] +# bench_protocol = "npu" if (ep == "qnn" AND device == "npu") else "standard" +# [npu-007: QNN NPU CV 0.15-1.2 normal; always use 3-session] + +# Step 3: baseline build (default config, opset=17) winml export -m -o baseline/ winml build -c config_baseline.json -m -o baseline_built/ -# Step 3: correctness contract +# Step 4: correctness contract winml eval --mode compare -m baseline_built/model.onnx --model-id --format json # Expected: cosine=1.0 (FP32 self-comparison) -# Step 4: baseline perf -winml perf -m baseline_built/model.onnx --ep --warmup 10 --iterations 50 --format json +# Step 5: baseline perf (using EP-appropriate protocol) +# standard: winml perf -m baseline_built/model.onnx --ep --warmup 20 --iterations 200 +# npu: full 3-session bench (see Phase 2 PERF step) +winml perf -m baseline_built/model.onnx --ep --warmup 20 --iterations 200 --format json # Record: baseline_p50_ms ``` @@ -1520,11 +1539,27 @@ BUILD skip_set (passes not worth trying): IF Transpose pct > 10% AND opset >= 19: flag as [KNOWN_TRADEOFF]; add to report + # EP-specific hard blocks (from Phase 0 flags): + IF conv_fusions_blocked: # npu-006: FusedConv → QNN EP CPU fallback + skip_set.add(conv-bn-fusion, conv-add-fusion, conv-mul-fusion, + conv-activation-fusion, conv-add-activation-fusion) + log "BLOCKED conv-*-fusion: npu-006 FusedConv risk (Conv% = {conv_pct:.0f}%)" + + IF nhwc_blocked: # dml-002: NHWC worsens p90/std on Adreno+D3D12 + skip_set.add(nhwc-transformer) + log "BLOCKED nhwc-transformer: dml-002 variance increase on GPU EP" + + IF opset_sweep_blocked: # cpu-001: opset 19+ regresses 3-4× on CPU EP + skip_set.add(opset_sweep) # opset is FIXED at 17 for CPU; never sweep + log "BLOCKED opset sweep: cpu-001, using opset=17 only" + BUILD priority_queue (hypotheses in evidence-based order): IF top_bottleneck == "Gemm" OR "MatMul": queue: [quant_precision, calib_method, calib_samples, matmul_fusions, per_channel] IF top_bottleneck == "Conv": - queue: [nchwc (if not in skip_set), conv_fusions, quant_precision] + # Only add conv_fusions if not blocked by npu-006 + conv_fusions_entry = [] if conv_fusions_blocked else [conv_fusions] + queue: [nchwc (if not in skip_set)] + conv_fusions_entry + [quant_precision] IF top_bottleneck == "Attention": queue: [quant_precision, nodes_to_exclude (Attention), calib_method] DEFAULT: @@ -1544,7 +1579,9 @@ LOOP FOREVER (until user stops or convergence): 2. HYPOTHESIZE: build config.json delta based on hypothesis Hypothesis rules (profile-informed, in priority order): - a. If first loop: start with full W8A8/W8A16, all ops quantized + a. If first loop: start with W8A16 (NOT W8A8), all ops quantized + Rationale: W8A8 is high-risk on models with LN/GELU (catastrophic on QNN NPU). + Try W8A16 first; only escalate to W8A8 after W8A16 establishes a valid baseline. b. If cosine < floor: add worst partial_op to nodes_to_exclude (one at a time) c. If cosine ≥ floor but latency > budget: try W8A8 instead of W8A16, or reduce calibration_samples, or add per_channel=true @@ -1579,14 +1616,43 @@ LOOP FOREVER (until user stops or convergence): → top1_accuracy (image-classification), f1 (text), mAP (detection), etc. This is the authoritative accuracy metric for Reviewer verdict. + W8A8 EARLY EXIT (save 3+ wasted bench sessions): + IF precision == "w8a8" AND top1_accuracy ≤ 0.15 (near-random): + → log "W8A8 EARLY EXIT: top-1 ≤15%, quantization collapsed" + → skip_set.add(all W8A8 variants) # never try W8A8 again for this model/EP + → discard this config immediately (skip step 6 PERF) + → next hypothesis: try W8A16 with nodes_to_exclude for sensitive op types + Why cosine alone is not sufficient: - High cosine (0.97) but top-1 drops 5%: logit magnitudes preserved but relative ranking shifted - Low cosine (0.92) but same top-1: relative ranking unchanged despite numeric difference → Only task accuracy tells you whether the model still does its job -6. PERF: winml perf -m out_/artifact.onnx \ - --device --ep --warmup 10 --iterations 50 --format json - → p50_ms, p90_ms +6. PERF: bench protocol depends on bench_protocol flag set in Phase 0 + + standard (CPU / GPU / DML): + winml perf -m out_/artifact.onnx \ + --device --ep --warmup 20 --iterations 200 --format json + CV = std / p50 + IF CV > 0.10: log [UNSTABLE], cool down 120s, retry once; if still >0.10 → skip/discard + IF CV ≤ 0.10: proceed to full bench (3×500-iter, 60s cool-down for GPU) + + npu (QNN NPU only) — always use 3-session protocol (npu-007): + # High CV (0.15-1.2) is NORMAL for Hexagon HTP. Never reject on CV alone. + winml perf ... --warmup 20 --iterations 500 -o run1.json + sleep 30 + winml perf ... --warmup 20 --iterations 500 -o run2.json + sleep 30 + winml perf ... --warmup 20 --iterations 500 -o run3.json + + # s0 JIT exclusion: if any run's first 50 iters (inferred via warmup behavior) are + # elevated, it reflects JIT compilation, not steady-state. When run1/2/3 disagree: + # candidate_p50 = median(run1.p50, run2.p50, run3.p50) + # If run1.p50 > median(run2.p50, run3.p50) × 1.20 → suspect JIT; use run2+run3 median. + + # KEEP only if: ALL of run1/2/3 p50 < baseline best × (1 - min_improvement) + # (ranges must not overlap — median alone is insufficient for noisy NPU measurements) + → record: candidate_p50_ms, bench_sessions_used 7. REVIEWER: cross-experiment verdict keep if task_accuracy ≥ accuracy_floor AND p50_ms ≤ latency_budget @@ -1611,6 +1677,21 @@ LOOP FOREVER (until user stops or convergence): - 5 consecutive discards with no improvement: report best so far - User manually stops the agent +**Post-convergence: mandatory finalization for QNN NPU** + +```bash +# For QNN NPU only: always compile the best-found quantized model +# compile adds ~1.7× speedup on top of quantization (validated on ConvNext: 10.3ms → 6.0ms) +IF bench_protocol == "npu": + winml compile -m best_config/model.onnx --device npu --ep qnn -o best_compiled/ + # Re-bench compiled model (same 3-session protocol) + # compiled latency replaces quantized latency in report + +# For GPU/DML: NEVER run winml compile — it regresses latency on Adreno X1-85 +IF ep in ["qnn_gpu", "dml"]: + log "compile step skipped: GPU compile regresses latency (validated -34% on ConvNext QNN GPU)" +``` + --- **3. Hypothesis generation rules (the intelligence layer)** @@ -1654,8 +1735,11 @@ Phase 4 — optimize pass tuning (independent of quant, affects graph structure) constant-folding=false (prevents size bloat; sometimes exposes EP-incompatible shape) clamp-constant-values=true (fixes -inf attention mask → quantization issues) remove-isnan-in-attention-mask=true (use after clamp; cleans dead IsNaN guards) - Try opset_version: 17 → 18 → 19 - (Higher opsets expose newer op types that may have better EP support) + Try opset_version (only if opset_sweep NOT blocked): + CPU EP: SKIP entirely — opset_sweep_blocked=True (cpu-001: opset 19+ regresses 3-4×) + QNN GPU/DML: SKIP — not validated beyond opset 17 + QNN NPU: full sweep 17 → 18 → 19 → 20 → 21 (architecture-dependent benefit; + opset21 confirmed +24-31% for DINOv2 family, NEUTRAL for general ViT) Phase 5 — selective node exclusion (when analyze shows partial ops) Read winml analyze --format json → partial_ops list From e903f67e00c7eb0f72e2195115b1b4ce59de1f36 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jun 2026 13:10:51 +0800 Subject: [PATCH 13/38] research(autoconfig): add npu-006/npu-007 constraints to catalog sweep script catalog_qnn_sweep.py: - Add NPU006_CONV_PCT_THRESHOLD constant (20%) -- npu-006 guard - Add _count_conv_pct(): after h0 builds, count Conv ops via onnx library to assess whether h4/h5 conv fusions are safe or will catastrophically regress - In hypothesis loop: after h0 succeeds, analyze model.onnx Conv%. If Conv% > 20%: print [npu-006] WARNING before running h4/h5. Annotate h4/h5 bench result with npu006_expected_regression=True/False. - results dict: add conv_pct, npu006_risk, npu006_regression, npu001_ranges_non_overlapping fields - _compute_summary: improve npu001_generalized with range-overlap check (max(h3_p50s) < min(h1_p50s)) alongside median test. DVFS-noisy NPU results where ranges overlap are reported as 'median_only' (marginal), not True -- prevents false positives like BGE-small (+7.3%, overlapping). - _compute_summary: add npu-006 catastrophic regression detector (h4/h5 median >= 5x baseline = CPU fallback confirmed) - write_summary: SUMMARY.md now includes Conv% column, npu-006 regression column, and range-overlap note in npu001 column. Bench protocol header updated to note DVFS expectation. --- research/autoconfig/catalog_qnn_sweep.py | 190 +++++++++++++++++++---- 1 file changed, 164 insertions(+), 26 deletions(-) diff --git a/research/autoconfig/catalog_qnn_sweep.py b/research/autoconfig/catalog_qnn_sweep.py index 6236b4127..ae06b4ad4 100644 --- a/research/autoconfig/catalog_qnn_sweep.py +++ b/research/autoconfig/catalog_qnn_sweep.py @@ -10,13 +10,21 @@ h0: baseline (auto-config, default winml build for QNN NPU + W8A16) h1: opset 17 explicit (explicit opset, same optim as baseline) h2: opset 19 - h3: opset 21 ← tests npu-001 generalization + h3: opset 21 <- tests npu-001 generalization h4: opset 17 + conv fusions (conv-bn, conv-add, conv-activation) h5: opset 21 + conv fusions -2-phase bench protocol: - Phase A: 200-iter screen — reject if CV >= 15% - Phase B: 3 independent sessions × 500 iters, 30 s cool-down between sessions +2-phase bench protocol (npu-007): + Phase A: 200-iter screen — high CV is NORMAL on QNN NPU (DVFS), always proceed to Phase B. + Phase B: 3 independent sessions x 500 iters, 30 s cool-down between sessions. + KEEP criterion: all 3 sessions faster than baseline, ranges must not overlap. + +Validated constraints applied: + npu-006: conv fusions (conv-bn/add/activation) produce FusedConv ops that QNN EP cannot + dispatch -> CPU fallback -> catastrophic regression on Conv-dense models. h4/h5 are + annotated with npu006_expected_regression=True when Conv% of total ops > 20%. + npu-001: opset21 speedup is architecture-specific. npu001_generalized uses range-overlap + check (max(h3_p50s) < min(h1_p50s)), not just median comparison. Results: catalog-qnn-sweep//results.json Summary: catalog-qnn-sweep/SUMMARY.md @@ -34,6 +42,11 @@ sys.stdout.reconfigure(encoding="utf-8", errors="replace") # type: ignore[attr-defined] +# npu-006 guard: conv fusions produce FusedConv (ORT private op) that QNN EP cannot dispatch. +# On Conv-dense models (Conv% > this threshold), h4/h5 will catastrophically regress. +# Validated: ResNet-18 (Conv-dense) +4900%, DINOv2-base (1 Conv total) benign. +NPU006_CONV_PCT_THRESHOLD = 20.0 # percent of total ops; above this = high npu-006 risk + # ── constants ───────────────────────────────────────────────────────────────── BASE_DIR = Path(__file__).parent WINML = str(BASE_DIR / ".venv" / "Scripts" / "winml.exe") @@ -136,6 +149,27 @@ def run_cmd(cmd: list[str], label: str = "", timeout: int = 600) -> tuple[int, s return -999, f"TIMEOUT after {timeout}s", elapsed +def _count_conv_pct(model_onnx: Path) -> tuple[float, int, int]: + """Count Conv ops in a built ONNX model. Returns (conv_pct, conv_count, total_count). + Used to assess npu-006 risk before running conv-fusion hypotheses. + Falls back to (0.0, 0, 0) if onnx is not importable or file missing. + """ + if not model_onnx.exists(): + return 0.0, 0, 0 + try: + import onnx # noqa: PLC0415 + + model = onnx.load(str(model_onnx)) + ops = [n.op_type for n in model.graph.node] + total = len(ops) + conv_count = sum(1 for o in ops if o == "Conv") + pct = conv_count / total * 100 if total > 0 else 0.0 + return round(pct, 1), conv_count, total + except Exception as e: + print(f" [warn] Conv% analysis failed: {e}", flush=True) + return 0.0, 0, 0 + + # ── winml wrappers ──────────────────────────────────────────────────────────── @@ -406,12 +440,16 @@ def sweep_model( "ep": EP, "device": DEVICE, "baseline_opset": None, + "conv_pct": None, # Conv ops % of total — drives npu-006 risk + "npu006_risk": None, # True if conv_pct > NPU006_CONV_PCT_THRESHOLD + "npu006_regression": None, # True if h4/h5 median >= 10x baseline (catastrophic) "hypotheses": {}, "best_hypothesis": None, "baseline_p50_ms": None, "best_p50_ms": None, "best_gain_pct": None, - "npu001_generalized": None, # True/False/"neutral"/None + "npu001_generalized": None, # True/False/"neutral"/None (median-based) + "npu001_ranges_non_overlapping": None, # True/False — stricter range-overlap test "feature_gaps": [], "errors": [], } @@ -449,6 +487,10 @@ def sweep_model( # ── Step 2: per-hypothesis loop ─────────────────────────────────────────── print(f"\n[2/3] Running {len(HYPOTHESES)} hypotheses…", flush=True) + # conv_pct is filled in after h0 succeeds (used to annotate npu-006 risk for h4/h5) + conv_pct: float = 0.0 + npu006_risk: bool = False + for hyp_id, label, opset_override, extra_optim in HYPOTHESES: elapsed_total = time.time() - model_start if elapsed_total > MODEL_TIMEOUT_S: @@ -523,12 +565,46 @@ def sweep_model( ) continue + # After h0: analyze Conv% to assess npu-006 risk for h4/h5 + if hyp_id == "h0" and onnx_path.exists(): + conv_pct, conv_count, total_count = _count_conv_pct(onnx_path) + npu006_risk = conv_pct > NPU006_CONV_PCT_THRESHOLD + results["conv_pct"] = conv_pct + results["npu006_risk"] = npu006_risk + if npu006_risk: + print( + f" [npu-006] Conv%={conv_pct:.1f}% ({conv_count}/{total_count} ops)" + f" > {NPU006_CONV_PCT_THRESHOLD:.0f}% threshold", + flush=True, + ) + print( + " [npu-006] h4/h5 (conv fusions) EXPECTED to catastrophically regress" + " — FusedConv not supported by QNN EP -> CPU fallback", + flush=True, + ) + else: + print( + f" [npu-006] Conv%={conv_pct:.1f}% ({conv_count}/{total_count} ops)" + f" <= {NPU006_CONV_PCT_THRESHOLD:.0f}% — h4/h5 low risk", + flush=True, + ) + + # Annotate h4/h5 with npu-006 risk BEFORE running bench + if hyp_id in ("h4", "h5") and npu006_risk: + print( + f" [npu-006] WARNING: {hyp_id} uses conv fusions on Conv-dense model" + f" (Conv%={conv_pct:.1f}%) — expect catastrophic regression", + flush=True, + ) + # Only run eval for h0 (baseline) on image-classification models do_eval = run_eval_on_baseline and hyp_id == "h0" and task == "image-classification" bench = _perf_result(onnx_path, model_id, task, do_eval) bench["label"] = label bench["opset"] = opset_used + if hyp_id in ("h4", "h5"): + bench["npu006_expected_regression"] = npu006_risk results["hypotheses"][hyp_id] = bench if bench["status"] == "UNSTABLE": @@ -542,7 +618,7 @@ def sweep_model( def _compute_summary(results: dict) -> None: - """Fill in baseline_p50, best_hypothesis, best_gain, npu001_generalized.""" + """Fill in baseline_p50, best_hypothesis, best_gain, npu001_generalized, npu006_regression.""" hyps = results["hypotheses"] # Baseline p50: prefer h0, fall back to h1 @@ -571,33 +647,84 @@ def _compute_summary(results: dict) -> None: gain_pct = (baseline_p50 - best_p50) / baseline_p50 * 100 results["best_gain_pct"] = round(gain_pct, 2) - # npu-001 generalization: does h3 (opset 21) beat h1 (opset 17) by ≥5%? + # ── npu-001: opset21 vs opset17 (h3 vs h1) ────────────────────────────── + # Criterion 1 (median): h3 p50 < h1 p50 by >=5% + # Criterion 2 (range-overlap, stricter): max(h3_p50s) < min(h1_p50s) + # Both must agree for "True"; either failing gives "neutral" h1 = hyps.get("h1", {}) h3 = hyps.get("h3", {}) if h1.get("status") in ("OK", "OK_HIGH_CV") and h3.get("status") in ("OK", "OK_HIGH_CV"): p50_h1 = h1["full"].get("median_p50_ms", float("inf")) p50_h3 = h3["full"].get("median_p50_ms", float("inf")) - if p50_h3 < p50_h1 * 0.95: # ≥5% improvement for h3 + h1_p50s: list[float] = h1["full"].get("p50s_ms", [p50_h1]) + h3_p50s: list[float] = h3["full"].get("p50s_ms", [p50_h3]) + + # Median-based test (>=5% improvement) + median_gain = p50_h3 < p50_h1 * 0.95 + median_loss = p50_h1 < p50_h3 * 0.95 + + # Range-overlap test (non-overlapping = more reliable for DVFS-noisy NPU) + ranges_non_overlapping = max(h3_p50s) < min(h1_p50s) if h3_p50s and h1_p50s else None + results["npu001_ranges_non_overlapping"] = ranges_non_overlapping + + if median_gain and ranges_non_overlapping: results["npu001_generalized"] = True gain = (p50_h1 - p50_h3) / p50_h1 * 100 print( - f" ✓ npu-001 GENERALIZES: opset21={p50_h3:.1f}ms vs opset17={p50_h1:.1f}ms (+{gain:.1f}%)", + f" [npu-001] CONFIRMED: opset21={p50_h3:.1f}ms vs opset17={p50_h1:.1f}ms" + f" (+{gain:.1f}%, ranges non-overlapping)", flush=True, ) - elif p50_h1 < p50_h3 * 0.95: # opset 17 is better + elif median_gain and not ranges_non_overlapping: + results["npu001_generalized"] = "median_only" + gain = (p50_h1 - p50_h3) / p50_h1 * 100 + print( + f" [npu-001] MARGINAL: opset21 median {gain:.1f}% faster but ranges OVERLAP" + f" (h3 max={max(h3_p50s):.1f}ms > h1 min={min(h1_p50s):.1f}ms) -- DVFS noise", + flush=True, + ) + elif median_loss: results["npu001_generalized"] = False print( - f" ✗ npu-001 does NOT generalize: opset17={p50_h1:.1f}ms < opset21={p50_h3:.1f}ms", + f" [npu-001] NEGATIVE: opset17={p50_h1:.1f}ms < opset21={p50_h3:.1f}ms", flush=True, ) else: results["npu001_generalized"] = "neutral" print( - f" ~ npu-001 neutral: opset17={p50_h1:.1f}ms ≈ opset21={p50_h3:.1f}ms", flush=True + f" [npu-001] NEUTRAL: opset17={p50_h1:.1f}ms ~ opset21={p50_h3:.1f}ms", + flush=True, ) else: - missing = [h for h, d in [("h1", h1), ("h3", h3)] if d.get("status") != "OK"] + missing = [ + h for h, d in [("h1", h1), ("h3", h3)] if d.get("status") not in ("OK", "OK_HIGH_CV") + ] results["npu001_generalized"] = f"N/A ({', '.join(missing)} not OK)" + results["npu001_ranges_non_overlapping"] = None + + # ── npu-006: detect catastrophic conv-fusion regression (h4/h5) ────────── + # "Catastrophic" = h4 or h5 median p50 >= 5x baseline (CPU fallback signature) + npu006_regression = False + for h_id in ("h4", "h5"): + h = hyps.get(h_id, {}) + if h.get("status") in ("OK", "OK_HIGH_CV") and baseline_p50: + p50_fused = h["full"].get("median_p50_ms") + if p50_fused and p50_fused >= baseline_p50 * 5.0: + npu006_regression = True + ratio = p50_fused / baseline_p50 + print( + f" [npu-006] CATASTROPHIC REGRESSION confirmed on {h_id}:" + f" {p50_fused:.1f}ms vs baseline {baseline_p50:.1f}ms ({ratio:.0f}x slower)" + f" -- FusedConv CPU fallback", + flush=True, + ) + elif h.get("status") == "BENCH_FAIL" and h.get("npu006_expected_regression"): + # Bench failure on expected-regression hypothesis is also a signal + print( + f" [npu-006] {h_id} bench FAILED on conv-dense model -- possible CPU fallback timeout", + flush=True, + ) + results["npu006_regression"] = npu006_regression def _save_results(results: dict, model_dir: Path) -> None: @@ -617,41 +744,52 @@ def write_summary(all_results: list[dict]) -> None: "", f"Generated: {datetime.now().isoformat(timespec='seconds')} ", f"EP: `{EP}` / device: `{DEVICE}` ", - f"Bench protocol: Phase-A {SCREEN_ITERS} iters (CV<{SCREEN_CV_MAX * 100:.0f}%)," - f" Phase-B {FULL_ITERS}×{FULL_SESSIONS} sessions ", + f"Bench protocol: Phase-A {SCREEN_ITERS} iters (high CV expected on QNN NPU — DVFS)," + f" Phase-B {FULL_ITERS}x{FULL_SESSIONS} sessions, 30s cool-down ", + "npu-001 criterion: median >=5% gain AND ranges non-overlapping ", + "npu-006 criterion: Conv% of ops; h4/h5 marked catastrophic if >=5x baseline ", "", "---", "", "## Per-Model Results", "", - "| Model | Task | Baseline p50 | Best p50 | Best config | Gain% | opset-21 helps? | Notes |", - "|-------|------|-------------|----------|-------------|-------|-----------------|-------|", + "| Model | Conv% | Baseline p50 | Best p50 | Best config | Gain% | npu-001? | npu-006 regression? | Notes |", + "|-------|-------|-------------|----------|-------------|-------|----------|---------------------|-------|", ] for r in all_results: model_id = r["model_id"] - task = r.get("task", "?") + conv_pct = r.get("conv_pct") + conv_str = f"{conv_pct:.0f}%" if conv_pct is not None else "N/A" + if r.get("npu006_risk"): + conv_str += " ⚠️" baseline = f"{r['baseline_p50_ms']:.1f} ms" if r.get("baseline_p50_ms") else "N/A" best = f"{r['best_p50_ms']:.1f} ms" if r.get("best_p50_ms") else "N/A" best_h = r.get("best_hypothesis") or "N/A" - # Annotate best_h with label - best_label = "" if best_h != "N/A": h_data = r.get("hypotheses", {}).get(best_h, {}) best_label = h_data.get("label", "") + else: + best_label = "" gain = f"{r['best_gain_pct']:.1f}%" if r.get("best_gain_pct") is not None else "N/A" npu001 = r.get("npu001_generalized") + non_overlap = r.get("npu001_ranges_non_overlapping") if npu001 is True: - npu001_str = "✓ YES" + npu001_str = "CONFIRMED (ranges sep.)" if non_overlap else "YES (median)" elif npu001 is False: - npu001_str = "✗ NO" + npu001_str = "NO" + elif npu001 == "median_only": + npu001_str = "MARGINAL (overlap)" elif npu001 == "neutral": - npu001_str = "~ neutral" + npu001_str = "neutral" else: - npu001_str = f"N/A ({npu001})" - errors = "; ".join(r.get("errors", []))[:100] or "none" + npu001_str = "N/A" + npu006 = ( + "YES ⚠️" if r.get("npu006_regression") else ("risk" if r.get("npu006_risk") else "no") + ) + errors = "; ".join(r.get("errors", []))[:80] or "none" lines.append( - f"| `{model_id}` | {task} | {baseline} | {best} | {best_h} ({best_label}) | {gain} | {npu001_str} | {errors} |" + f"| `{model_id}` | {conv_str} | {baseline} | {best} | {best_h} ({best_label}) | {gain} | {npu001_str} | {npu006} | {errors} |" ) # Per-model hypothesis breakdown From 0cd43d95fd9118793b7fec0bc07dedce2ce1eb50 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jun 2026 15:33:27 +0800 Subject: [PATCH 14/38] research(autoconfig): fix 5 bugs found in code review + add bench_utils MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bugs fixed (from code-review + rubber-duck analysis): 1. [CRITICAL] autoconfig.py hypothesis optim keys were kebab-case ('conv-bn-fusion') but build_config() in pipes/graph.py looks up cap.python_name (snake_case). All h1-h5 were silently benchmarking the baseline config. Fix: rename all optim keys to snake_case ('conv_bn_fusion', 'gelu_fusion', etc.) 2. [HIGH] autoconfig.py hypothesis accumulation: h2-h5 used {**cfg['optim'], ...} but each hypothesis starts from a fresh BASELINE copy where optim={}. Refactored to explicit isolated mode — each hypothesis is independent. Labels updated to remove misleading '+' prefix. Behavior now matches intent. 3. [HIGH] autoconfig.py baseline_p50 only set when i==0 AND bench passes. If iter 0 was KB-skipped, baseline_p50 stayed None forever and the perf gate never fired. Fix: set baseline_p50 on the first successful Phase B bench regardless of iteration index. 4. [HIGH] catalog_qnn_sweep.py MODEL_TIMEOUT_S=20*60 (20 min) caused all hypotheses after h0 to time out. A single hypothesis takes ~30 min minimum. Fix: raise to 180 min (3 hours for 6 hypotheses). 5. [MEDIUM] catalog_qnn_sweep.py _count_conv_pct() used a catch-all except that masked ImportError. When onnx is missing, conv_pct returns 0.0 which evaluates as 'no risk' — silently disabling the npu-006 guard. Fix: split ImportError (loud warning + treat as UNKNOWN/HIGH risk) from other exceptions (parse errors, silent fallback). Additional fixes: - validation_sweep.py npu-007 bug: bench_screen failure gated Phase B for QNN NPU. For QNN NPU, only non-NPU EPs should gate Phase B on screen fail. - autoconfig.py: replace 'Likely DVFS noise' CV message with EP-aware text - autoconfig.py: median_p50 local variable shadowed imported function — renamed to med_p50 to prevent confusion - autoconfig.py: remove duplicate code section left by earlier refactor - bench_utils.py: new shared module with run_cmd, bench_screen, bench_full, ScreenResult, count_conv_pct, ranges_non_overlapping, median_p50, etc. bench_full now accepts warmup/iters/cool_down_s overrides for CPU protocol --- research/autoconfig/autoconfig.py | 624 ++++------------------- research/autoconfig/bench_utils.py | 391 ++++++++++++++ research/autoconfig/catalog_qnn_sweep.py | 29 +- research/autoconfig/validation_sweep.py | 18 +- 4 files changed, 527 insertions(+), 535 deletions(-) create mode 100644 research/autoconfig/bench_utils.py diff --git a/research/autoconfig/autoconfig.py b/research/autoconfig/autoconfig.py index c7f37cbfe..b8c0f315b 100644 --- a/research/autoconfig/autoconfig.py +++ b/research/autoconfig/autoconfig.py @@ -8,26 +8,42 @@ Demo: facebook/convnext-tiny-224, CPU EP, FP32 Loop: hypothesize → winml build → quick-screen bench (CV gate) → - full bench (iter=1000×3) → eval → keep/discard → repeat + full bench (3 sessions) → eval → keep/discard → repeat Key design principles (from GPU Optimizer V2 + ConvNext lessons): - 1. Two-phase bench: 200-iter CV screen FIRST, full bench only if CV < 10% + 1. Two-phase bench: 200-iter CV screen FIRST, full bench only if CV < threshold + (CPU/GPU) — or unconditionally for QNN NPU (npu-007: DVFS makes CV unreliable) 2. Use winml perf (NOT winml eval) for latency — eval includes HF preprocessing 3. Mandatory external-research after 5 consecutive DISCARDs in same dimension 4. Load ep_knowledge/*.json (only "confirmed" entries) to prune search space 5. Per-experiment structured output: hypothesis/impl/parity/perf/analysis/decision 6. Stop condition: 30 consecutive DISCARDs (not 5) + +Hypothesis design — ISOLATED mode (each hypothesis is independent): + Each hypothesis is applied to a fresh copy of BASELINE. The labels "+" prefix + is cosmetic; no state is accumulated across hypotheses. This allows independent + attribution: "does gelu-fusion alone help?" rather than "does gelu help on top + of conv fusions?". To run a cumulative search, chain patch functions explicitly. """ import copy import csv import json -import subprocess import sys import time from datetime import datetime from pathlib import Path +from bench_utils import ( + FULL_ITERS, + FULL_SESSIONS, + SCREEN_CV_MAX_STD, + SCREEN_ITERS, + bench_full, + bench_screen, + median_p50, + run_cmd, +) sys.stdout.reconfigure(encoding="utf-8", errors="replace") # type: ignore[attr-defined] @@ -129,77 +145,84 @@ def load_ep_knowledge(ep: str) -> dict: # ── hypothesis sequence ─────────────────────────────────────────────────────── +# Each function receives a FRESH copy of BASELINE (isolated mode). +# Hypotheses are independent — no state is accumulated across them. +# Use "+" in labels only when the function explicitly inherits optim from prior +# state. Here all hypotheses start from baseline optim={}, so labels are flat. + + def h0_baseline(cfg: dict) -> dict: - """FP32 export, no extra fusions — reference point""" + """FP32 export, no extra fusions — reference point.""" cfg["optim"] = {} return cfg def h1_conv_fusions(cfg: dict) -> dict: - cfg["optim"] = {"conv-bn-fusion": True, "conv-add-fusion": True, "conv-activation-fusion": True} + """Conv+BN+Add+Activation fusions in isolation. + + NOTE: These are ORT graph-level fusions (conv_bn_fusion etc.) that create FusedConv ops. + On QNN NPU, FusedConv causes CPU fallback → catastrophic regression (npu-006). + Only run on CPU/DML EPs. Use count_conv_pct() before enabling on QNN. + """ + cfg["optim"] = { + "conv_bn_fusion": True, + "conv_add_fusion": True, + "conv_activation_fusion": True, + } return cfg def h2_gelu_fusion(cfg: dict) -> dict: - cfg["optim"] = {**cfg["optim"], "gelu-fusion": True} + """Gelu fusion in isolation (no conv fusions).""" + cfg["optim"] = {"gelu_fusion": True} return cfg -def h3_add_layernorm(cfg: dict) -> dict: - cfg["optim"] = {**cfg["optim"], "layer-norm-fusion": True} +def h3_layernorm_fusion(cfg: dict) -> dict: + """LayerNorm fusion in isolation.""" + cfg["optim"] = {"layer_norm_fusion": True} return cfg -def h4_add_matmul(cfg: dict) -> dict: - cfg["optim"] = {**cfg["optim"], "matmul-add-fusion": True} +def h4_matmul_add(cfg: dict) -> dict: + """MatMul+Add fusion in isolation (MLP block bottleneck).""" + cfg["optim"] = {"matmul_add_fusion": True} return cfg def h5_transpose_opt(cfg: dict) -> dict: - cfg["optim"] = {**cfg["optim"], "transpose-optimizer": True} + """Transpose optimizer in isolation.""" + cfg["optim"] = {"transpose_optimizer": True} return cfg def h6_opset21(cfg: dict) -> dict: - """Try opset 21 — may trigger kMaxSupportedOpset bypass on older ORT (see npu-001). - NOTE: This is a research hypothesis, not a confirmed optimization. Gate 2 required. + """Opset 21 research hypothesis — model-architecture-dependent benefit (npu-001). + NOTE: Mechanism unknown. Not a confirmed optimization. Gate 2 required before KB. """ cfg["export"]["opset_version"] = 21 - cfg["optim"] = {**cfg["optim"], "transpose-optimizer": True} return cfg HYPOTHESES: list[tuple[str, object, str]] = [ # (label, patch_fn, search_dimension) - ("baseline: no fusions (FP32 reference)", h0_baseline, "baseline"), + ("baseline (FP32, no fusions)", h0_baseline, "baseline"), ("conv fusions: bn+add+activation", h1_conv_fusions, "graph_pass"), - ("+ gelu-fusion", h2_gelu_fusion, "graph_pass"), - ("+ layer-norm-fusion", h3_add_layernorm, "graph_pass"), - ("+ matmul-add-fusion (MLP blocks)", h4_add_matmul, "graph_pass"), - ("+ transpose-optimizer", h5_transpose_opt, "graph_pass"), - ("opset=21 (kMaxSupportedOpset research)", h6_opset21, "opset"), + ("gelu-fusion only", h2_gelu_fusion, "graph_pass"), + ("layer-norm-fusion only", h3_layernorm_fusion, "graph_pass"), + ("matmul-add-fusion (MLP blocks)", h4_matmul_add, "graph_pass"), + ("transpose-optimizer only", h5_transpose_opt, "graph_pass"), + ("opset=21 (npu-001 research)", h6_opset21, "opset"), ] # ── helpers ─────────────────────────────────────────────────────────────────── -def run(cmd: list[str], label: str = "") -> tuple[int, str, float]: - t0 = time.time() - print(f" >> {label or cmd[1]}") - result = subprocess.run(cmd, capture_output=True, text=True, encoding="utf-8", errors="replace") - elapsed = time.time() - t0 - status = "ok" if result.returncode == 0 else f"rc={result.returncode}" - print(f" done in {elapsed:.0f}s [{status}]") - if result.returncode != 0: - print(f" stderr: {(result.stderr or result.stdout or '')[-400:]}") - return result.returncode, result.stdout + result.stderr, elapsed - - def build(cfg: dict, out_dir: Path) -> tuple[bool, str]: out_dir.mkdir(parents=True, exist_ok=True) cfg_path = out_dir / "config.json" cfg_path.write_text(json.dumps(cfg, indent=2)) - rc, out, _ = run( + rc, out, _ = run_cmd( [ WINML, "build", @@ -221,85 +244,35 @@ def build(cfg: dict, out_dir: Path) -> tuple[bool, str]: return rc == 0, out -def bench_phase_a(model_path: Path) -> tuple[float | None, float]: - """Phase A quick screen: 200 iters, check CV < SCREEN_CV_MAX. - Returns (p50_ms, cv). p50_ms=None means unstable (reject). - """ - out_json = model_path.parent / "screen_perf.json" - rc, _, _ = run( - [ - WINML, - "perf", - "-m", - str(model_path), - "--ep", - EP, - "--device", - DEVICE, - "--warmup", - str(SCREEN_WARMUP), - "--iterations", - str(SCREEN_ITERS), - "-o", - str(out_json), - ], - label=f"winml perf (screen, iter={SCREEN_ITERS})", - ) - if rc != 0 or not out_json.exists(): - return None, 999.0 - try: - data = json.loads(out_json.read_text()) - lat = data["latency_ms"] - p50 = lat["p50"] - std = lat["std"] - cv = std / p50 if p50 > 0 else 999.0 - print(f" screen: p50={p50:.1f}ms std={std:.1f}ms CV={cv:.2f}") - if cv > SCREEN_CV_MAX: - print(f" ⚠️ CV={cv:.2f} > {SCREEN_CV_MAX} — UNSTABLE, rejecting candidate") - return None, cv - return p50, cv - except Exception as e: - print(f" [warn] parse error: {e}") - return None, 999.0 - +def _run_screen(model_path: Path) -> tuple[float | None, float]: + """Phase A: 200-iter screen with CV gate. -def bench_phase_b(model_path: Path, label: str) -> list[float]: - """Phase B full bench: 3 independent sessions × 1000 iters with cool-down. - Returns list of p50_ms values (one per session). + For CPU EP, high CV means thermal/scheduling noise — reject and retry later. + Returns (p50_ms, cv). p50_ms=None means unstable or command failed. """ - p50s = [] - for session in range(1, FULL_SESSIONS + 1): - out_json = model_path.parent / f"full_perf_s{session}.json" - rc, _, _ = run( - [ - WINML, - "perf", - "-m", - str(model_path), - "--ep", - EP, - "--device", - DEVICE, - "--warmup", - str(FULL_WARMUP), - "--iterations", - str(FULL_ITERS), - "-o", - str(out_json), - ], - label=f"winml perf (full s{session}/{FULL_SESSIONS}, iter={FULL_ITERS})", + sr = bench_screen(winml=WINML, model_path=model_path, ep=EP, device=DEVICE) + if sr.hard_failed: + return None, 999.0 + if sr.cv is not None and sr.cv > SCREEN_CV_MAX: + print( + f" Phase A rejected: CV={sr.cv:.2f} > {SCREEN_CV_MAX}" + f" (thermal/scheduling noise on {EP.upper()} — cool device and retry)" ) - if rc == 0 and out_json.exists(): - data = json.loads(out_json.read_text()) - p50 = data["latency_ms"]["p50"] - std = data["latency_ms"]["std"] - cv = std / p50 if p50 > 0 else 999.0 - print(f" full s{session}: p50={p50:.1f}ms std={std:.1f}ms CV={cv:.2f}") - p50s.append(p50) - if session < FULL_SESSIONS: - print(f" cooling down {COOL_DOWN_S}s …") - time.sleep(COOL_DOWN_S) - return p50s + return None, sr.cv + return sr.p50_ms, sr.cv or 0.0 + + +def _run_full(model_path: Path) -> list[float]: + """Phase B: 3 sessions × FULL_ITERS with cool-down. Returns p50 per session.""" + return bench_full( + winml=WINML, + model_path=model_path, + ep=EP, + device=DEVICE, + out_prefix="full", + iters=FULL_ITERS, + cool_down_s=COOL_DOWN_S, + ) def eval_accuracy(out_dir: Path) -> float | None: @@ -308,7 +281,7 @@ def eval_accuracy(out_dir: Path) -> float | None: if not model_path.exists(): return None result_json = out_dir / "eval_result.json" - rc, _, _ = run( + rc, _, _ = run_cmd( [ WINML, "eval", @@ -482,28 +455,31 @@ def main() -> None: exp_info["analysis"] = "winml build failed — check build log" else: # Phase A: quick screen - screen_p50, screen_cv = bench_phase_a(out_dir / "model.onnx") + screen_p50, screen_cv = _run_screen(out_dir / "model.onnx") exp_info["screen_p50"] = f"{screen_p50:.1f}" if screen_p50 else "UNSTABLE" exp_info["screen_cv"] = f"{screen_cv:.3f}" if screen_p50 is None: status = "discard (unstable — CV too high)" exp_info["analysis"] = ( - f"Phase A rejected: CV={screen_cv:.2f} > {SCREEN_CV_MAX}. Likely DVFS noise. Cool device and retry." + f"Phase A rejected: CV={screen_cv:.2f} > {SCREEN_CV_MAX}. " + f"Thermal or scheduling noise on {EP.upper()} EP. Cool device and retry." ) else: # Phase B: full bench - full_p50s = bench_phase_b(out_dir / "model.onnx", label) + full_p50s = _run_full(out_dir / "model.onnx") if not full_p50s: status = "crash (full bench failed)" exp_info["analysis"] = "Phase B winml perf returned no data" else: - median_p50 = sorted(full_p50s)[len(full_p50s) // 2] + med_p50 = median_p50(full_p50s) + assert med_p50 is not None exp_info["full_p50s"] = [f"{p:.1f}" for p in full_p50s] - exp_info["median_p50"] = f"{median_p50:.1f}" + exp_info["median_p50"] = f"{med_p50:.1f}" - if baseline_p50 is None and i == 0: - baseline_p50 = median_p50 + # Set baseline from the first successful full bench (any iteration). + if baseline_p50 is None: + baseline_p50 = med_p50 exp_info["baseline_p50"] = f"{baseline_p50:.1f}" # Accuracy gate @@ -513,10 +489,10 @@ def main() -> None: if accuracy is not None and accuracy < ACCURACY_FLOOR: status = f"discard (accuracy {accuracy:.4f} < floor {ACCURACY_FLOOR})" exp_info["analysis"] = "Accuracy regression below floor" - elif baseline_p50 is not None and median_p50 > baseline_p50 * ( + elif baseline_p50 is not None and med_p50 > baseline_p50 * ( 1 - MIN_IMPROVEMENT ): - delta_pct = (median_p50 - baseline_p50) / baseline_p50 * 100 + delta_pct = (med_p50 - baseline_p50) / baseline_p50 * 100 status = f"discard (Δp50={delta_pct:+.1f}% < {MIN_IMPROVEMENT * 100:.0f}% threshold)" exp_info["delta_pct"] = f"{delta_pct:+.1f}%" exp_info["analysis"] = ( @@ -524,17 +500,15 @@ def main() -> None: ) else: delta_pct = ( - (median_p50 - (baseline_p50 or median_p50)) - / (baseline_p50 or median_p50) - * 100 + (med_p50 - (baseline_p50 or med_p50)) / (baseline_p50 or med_p50) * 100 ) status = "keep" exp_info["delta_pct"] = f"{delta_pct:+.1f}%" exp_info["analysis"] = ( - f"Improvement confirmed: p50 {baseline_p50:.1f}ms → {median_p50:.1f}ms ({delta_pct:+.1f}%)" + f"Improvement confirmed: p50 {baseline_p50:.1f}ms -> {med_p50:.1f}ms ({delta_pct:+.1f}%)" ) - if median_p50 < best_p50: - best_p50 = median_p50 + if med_p50 < best_p50: + best_p50 = med_p50 best_label = label status = "keep *** NEW BEST ***" @@ -599,403 +573,3 @@ def main() -> None: if __name__ == "__main__": main() - - -import sys -from pathlib import Path - - -sys.stdout.reconfigure(encoding="utf-8", errors="replace") # type: ignore[attr-defined] - -# ── settings ───────────────────────────────────────────────────────────────── -MODEL_ID = "facebook/convnext-tiny-224" -TASK = "image-classification" -EP = "cpu" -DEVICE = "cpu" -WINML = str(Path(__file__).parent / ".venv" / "Scripts" / "winml.exe") -WORK_DIR = Path(__file__).parent / "convnext-search" -RESULTS_TSV = WORK_DIR / "results.tsv" - -EVAL_SAMPLES = 50 # small for demo speed (~12s per eval) -ACCURACY_FLOOR = 0.70 # drop below this → discard (FP32 baseline ~78%) -LATENCY_FLOOR = 1.0 # seconds — more than this means regression - -# ── baseline config ─────────────────────────────────────────────────────────── -BASELINE: dict = { - "export": { - "opset_version": 17, - "batch_size": 1, - "do_constant_folding": True, - "dynamo": False, - "input_tensors": [ - { - "name": "pixel_values", - "dtype": "float32", - "shape": [1, 3, 224, 224], - "value_range": [0, 1], - } - ], - "output_tensors": [{"name": "logits"}], - }, - "optim": {}, # will be patched per hypothesis - "loader": { - "task": TASK, - "model_class": "AutoModelForImageClassification", - "model_type": "convnext", - }, - "eval": { - "task": TASK, - "dataset": {"path": "timm/mini-imagenet", "split": "test", "samples": EVAL_SAMPLES}, - }, -} - -# ── hypothesis sequence ─────────────────────────────────────────────────────── -# ConvNext-tiny architecture: -# Stem: Conv 4x4 + LN → 4 stages of ConvNext blocks -# Each block: DW-Conv → LN → Linear (=Gemm) → GELU → Linear -# Skip connections: pointwise Add -# -# Relevant fusions: -# conv-bn-fusion — conv+BatchNorm folding (stem/downsample layers) -# conv-add-fusion — conv+bias add (ConvNext uses DepthwiseConv with bias) -# gelu-fusion — fuse decomposed GELU → com.microsoft/Gelu -# layer-norm-fusion — fuse LN subgraph (ConvNext uses LayerNorm heavily) -# matmul-add-fusion — fuse Gemm+bias (the inverted bottleneck MLPs) -# transpose-optimizer — eliminate redundant transposes around reshape ops -# constant-folding — pre-fold constant subgraphs (on by default in export, -# but also at optim stage via ORT) - - -def h0_baseline(cfg: dict) -> dict: - """FP32 export, no extra fusions — reference point""" - cfg["optim"] = {} - return cfg - - -def h1_conv_fusions(cfg: dict) -> dict: - """Enable all conv fusions — ConvNext stem uses Conv+BN, blocks use DW-Conv+bias""" - cfg["optim"] = { - "conv-bn-fusion": True, - "conv-add-fusion": True, - "conv-activation-fusion": True, - } - return cfg - - -def h2_gelu_fusion(cfg: dict) -> dict: - """Add GELU fusion — ConvNext MLP blocks use GELU activation""" - cfg["optim"] = { - "conv-bn-fusion": True, - "conv-add-fusion": True, - "conv-activation-fusion": True, - "gelu-fusion": True, - } - return cfg - - -def h3_add_layernorm(cfg: dict) -> dict: - """Add LayerNorm fusion — ConvNext uses LN (not BN) in blocks""" - cfg["optim"] = { - "conv-bn-fusion": True, - "conv-add-fusion": True, - "conv-activation-fusion": True, - "gelu-fusion": True, - "layer-norm-fusion": True, - } - return cfg - - -def h4_add_matmul(cfg: dict) -> dict: - """Add MatMul+Add fusion — ConvNext MLP uses Gemm (collapsed MatMul+bias)""" - cfg["optim"] = { - "conv-bn-fusion": True, - "conv-add-fusion": True, - "conv-activation-fusion": True, - "gelu-fusion": True, - "layer-norm-fusion": True, - "matmul-add-fusion": True, - } - return cfg - - -def h5_transpose_opt(cfg: dict) -> dict: - """Add transpose optimizer — ConvNext has many Transpose ops (NCHW reshapes)""" - cfg["optim"] = { - "conv-bn-fusion": True, - "conv-add-fusion": True, - "conv-activation-fusion": True, - "gelu-fusion": True, - "layer-norm-fusion": True, - "matmul-add-fusion": True, - "transpose-optimizer": True, - } - return cfg - - -def h6_opset18(cfg: dict) -> dict: - """Try opset 18 with all fusions — GroupNorm introduced in opset18""" - cfg["export"]["opset_version"] = 18 - cfg["optim"] = { - "conv-bn-fusion": True, - "conv-add-fusion": True, - "conv-activation-fusion": True, - "gelu-fusion": True, - "layer-norm-fusion": True, - "matmul-add-fusion": True, - "transpose-optimizer": True, - } - return cfg - - -def h7_surgery(cfg: dict) -> dict: - """Add clamp-constant-values — prevents -inf attention mask quant issues""" - cfg["export"]["opset_version"] = 17 - cfg["optim"] = { - "conv-bn-fusion": True, - "conv-add-fusion": True, - "conv-activation-fusion": True, - "gelu-fusion": True, - "layer-norm-fusion": True, - "matmul-add-fusion": True, - "transpose-optimizer": True, - "clamp-constant-values": True, - } - return cfg - - -HYPOTHESES: list[tuple[str, object]] = [ - ("baseline: no fusions (FP32 reference)", h0_baseline), - ("conv fusions: bn+add+activation", h1_conv_fusions), - ("+ gelu-fusion", h2_gelu_fusion), - ("+ layer-norm-fusion", h3_add_layernorm), - ("+ matmul-add-fusion (MLP blocks)", h4_add_matmul), - ("+ transpose-optimizer", h5_transpose_opt), - ("opset=18 + all fusions", h6_opset18), - ("back to opset=17 + surgery: clamp-constant-values", h7_surgery), -] - -# ── helpers ─────────────────────────────────────────────────────────────────── - - -def run(cmd: list[str], label: str = "") -> tuple[int, str, float]: - t0 = time.time() - print(f" >> {label or cmd[1]}") - result = subprocess.run(cmd, capture_output=True, text=True, encoding="utf-8", errors="replace") - elapsed = time.time() - t0 - status = "ok" if result.returncode == 0 else f"rc={result.returncode}" - print(f" done in {elapsed:.0f}s [{status}]") - if result.returncode != 0: - tail = (result.stderr or result.stdout or "")[-600:] - print(f" stderr: {tail}") - return result.returncode, result.stdout + result.stderr, elapsed - - -def build(cfg: dict, out_dir: Path) -> tuple[bool, str]: - out_dir.mkdir(parents=True, exist_ok=True) - cfg_path = out_dir / "config.json" - cfg_path.write_text(json.dumps(cfg, indent=2)) - rc, out, _ = run( - [ - WINML, - "build", - "-c", - str(cfg_path), - "-m", - MODEL_ID, - "-o", - str(out_dir), - "--ep", - EP, - "--device", - DEVICE, - "--no-quant", - "--no-compile", - ], - label="winml build", - ) - return rc == 0, out - - -def eval_onnx(out_dir: Path) -> tuple[float | None, float | None]: - """Eval model.onnx; return (accuracy, latency_s).""" - model_path = out_dir / "model.onnx" - if not model_path.exists(): - print(" [warn] model.onnx not found") - return None, None - - result_json = out_dir / "eval_result.json" - rc, _, _ = run( - [ - WINML, - "eval", - "-m", - str(model_path), - "--model-id", - MODEL_ID, - "--task", - TASK, - "--ep", - EP, - "--device", - DEVICE, - "--samples", - str(EVAL_SAMPLES), - "-o", - str(result_json), - ], - label="winml eval", - ) - if rc != 0 or not result_json.exists(): - return None, None - try: - data = json.loads(result_json.read_text()) - metrics = data.get("metrics", data) - accuracy = metrics.get("accuracy") - latency = metrics.get("latency_in_seconds") - return ( - float(accuracy) if accuracy is not None else None, - float(latency) if latency is not None else None, - ) - except Exception as e: - print(f" [warn] parse error: {e}") - return None, None - - -def log(row: dict) -> None: - fields = [ - "iter", - "label", - "optim_flags", - "opset", - "accuracy", - "latency_ms", - "delta_acc", - "delta_lat_ms", - "status", - "elapsed_s", - "timestamp", - ] - is_new = not RESULTS_TSV.exists() - with RESULTS_TSV.open("a", newline="", encoding="utf-8") as f: - w = csv.DictWriter(f, fieldnames=fields, delimiter="\t", extrasaction="ignore") - if is_new: - w.writeheader() - w.writerow(row) - - -def optim_flags(cfg: dict) -> str: - flags = [k for k, v in cfg.get("optim", {}).items() if v is True] - return ",".join(flags) if flags else "(none)" - - -# ── main loop ───────────────────────────────────────────────────────────────── - - -def main() -> None: - WORK_DIR.mkdir(parents=True, exist_ok=True) - - sep = "=" * 62 - print(f"\n{sep}") - print(f" autoconfig search -- {MODEL_ID}") - print(f" EP: {EP} eval_samples: {EVAL_SAMPLES} hypotheses: {len(HYPOTHESES)}") - print(f" Objective: maximize accuracy (floor={ACCURACY_FLOOR})") - print(" Search space: WinMLOptimizationConfig capability flags") - print(f"{sep}\n") - - baseline_acc: float | None = None - baseline_lat: float | None = None - best_acc = 0.0 - best_lat = float("inf") - best_label = "" - total_start = time.time() - - for i, (label, patch_fn) in enumerate(HYPOTHESES): - iter_start = time.time() - print(f"\n{'--' * 31}") - print(f" iter {i} | {label}") - print(f"{'--' * 31}") - - cfg = patch_fn(copy.deepcopy(BASELINE)) # type: ignore[operator] - flags = optim_flags(cfg) - opset = cfg["export"]["opset_version"] - print(f" optim: {flags}") - print(f" opset: {opset}") - - out_dir = WORK_DIR / f"iter_{i:02d}" - ok, _ = build(cfg, out_dir) - if not ok: - status = "crash" - accuracy = latency = None - else: - accuracy, latency = eval_onnx(out_dir) - if accuracy is None: - status = "eval_error" - elif accuracy < ACCURACY_FLOOR: - status = "discard (accuracy < floor)" - elif latency is not None and latency > LATENCY_FLOOR: - status = "discard (latency regression)" - else: - status = "keep" - if accuracy > best_acc or (accuracy == best_acc and (latency or 999) < best_lat): - best_acc = accuracy - best_lat = latency or float("inf") - best_label = label - status = "keep *** NEW BEST ***" - - # Print result - if accuracy is not None: - lat_ms = f"{(latency or 0) * 1000:.0f}ms" if latency else "N/A" - print(f" accuracy={accuracy:.4f} latency={lat_ms} -> {status}") - if baseline_acc is None and i == 0: - baseline_acc = accuracy - baseline_lat = latency - if baseline_acc is not None and i > 0: - d_acc = accuracy - baseline_acc - d_lat = ((latency or 0) - (baseline_lat or 0)) * 1000 - sign_acc = "+" if d_acc >= 0 else "" - sign_lat = "+" if d_lat >= 0 else "" - print(f" vs baseline: acc {sign_acc}{d_acc:.4f} lat {sign_lat}{d_lat:.0f}ms") - else: - print(f" -> {status}") - - elapsed = time.time() - iter_start - delta_acc = ( - f"{accuracy - baseline_acc:+.4f}" - if (accuracy is not None and baseline_acc is not None) - else "N/A" - ) - delta_lat = ( - f"{((latency or 0) - (baseline_lat or 0)) * 1000:+.0f}" - if (latency is not None and baseline_lat is not None) - else "N/A" - ) - log( - { - "iter": i, - "label": label, - "optim_flags": flags, - "opset": opset, - "accuracy": f"{accuracy:.4f}" if accuracy is not None else "N/A", - "latency_ms": f"{(latency or 0) * 1000:.0f}" if latency is not None else "N/A", - "delta_acc": delta_acc, - "delta_lat_ms": delta_lat, - "status": status, - "elapsed_s": f"{elapsed:.0f}", - "timestamp": datetime.now().isoformat(timespec="seconds"), - } - ) - - total = time.time() - total_start - print(f"\n{sep}") - print(f" SEARCH COMPLETE | {total / 60:.1f} min total") - print(f" Best config: {best_label}") - print(f" Best accuracy: {best_acc:.4f} latency: {best_lat * 1000:.0f}ms") - print(f" Results: {RESULTS_TSV}") - print(f"{sep}\n") - - if RESULTS_TSV.exists(): - print(RESULTS_TSV.read_text(encoding="utf-8")) - - -if __name__ == "__main__": - main() diff --git a/research/autoconfig/bench_utils.py b/research/autoconfig/bench_utils.py new file mode 100644 index 000000000..e3e2db4d8 --- /dev/null +++ b/research/autoconfig/bench_utils.py @@ -0,0 +1,391 @@ +#!/usr/bin/env python3 +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""bench_utils.py — Shared benchmarking helpers for QNN NPU sweeps. + +Bench protocol (npu-007): + Phase A: 200-iter screen. For QNN NPU, high CV (0.15-1.2) is NORMAL due to + DVFS/Hexagon HTP thermal throttling. Phase A result is informational only; + it never gates Phase B on NPU. Only use CV gate for CPU/GPU EPs. + Phase B: 3 independent sessions x 500 iters with 30s cool-down. + KEEP criterion: all p50s below baseline; for NPU, ranges must not overlap. + +winml config + build helpers are also centralized here to avoid duplication +between catalog_qnn_sweep.py and validation_sweep.py. +""" + +from __future__ import annotations + +import copy +import json +import subprocess +import time +from pathlib import Path + +# ── Protocol constants (overridable by callers via module-level reassignment) ─ +SCREEN_WARMUP: int = 20 +SCREEN_ITERS: int = 200 +SCREEN_CV_MAX_NPU: float = 999.0 # never gate on CV for QNN NPU (npu-007) +SCREEN_CV_MAX_STD: float = 0.10 # CPU / GPU: reject if CV > 10% + +FULL_WARMUP: int = 50 +FULL_ITERS: int = 500 +FULL_SESSIONS: int = 3 +COOL_DOWN_S: int = 30 # seconds between full-bench sessions (NPU) + +BUILD_TIMEOUT_S: int = 8 * 60 +BENCH_TIMEOUT_S: int = 8 * 60 +CONFIG_TIMEOUT_S: int = 120 + + +# ── subprocess wrapper ──────────────────────────────────────────────────────── + + +def run_cmd(cmd: list[str], label: str = "", timeout: int = 600) -> tuple[int, str, float]: + """Run a subprocess command. Returns (returncode, combined_output, elapsed_s).""" + t0 = time.time() + print(f" >> {label or cmd[1]}", flush=True) + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + timeout=timeout, + ) + elapsed = time.time() - t0 + tag = "ok" if result.returncode == 0 else f"rc={result.returncode}" + print(f" {elapsed:.0f}s [{tag}]", flush=True) + if result.returncode != 0: + snippet = (result.stderr or result.stdout or "")[-600:] + print(f" stderr: {snippet}", flush=True) + return result.returncode, result.stdout + result.stderr, elapsed + except subprocess.TimeoutExpired: + elapsed = time.time() - t0 + print(f" TIMEOUT after {elapsed:.0f}s", flush=True) + return -999, f"TIMEOUT after {timeout}s", elapsed + + +# ── winml wrappers ──────────────────────────────────────────────────────────── + + +def get_base_config( + winml: str, + model_id: str, + task: str, + model_type: str, + ep: str, + device: str, + out_path: Path, +) -> dict | None: + """Generate a config via `winml config`. Returns parsed dict or None on failure. + + Tries with --model-type first, then falls back without it. + """ + + def _try(extra_args: list[str]) -> dict | None: + cmd = [ + winml, + "config", + "-m", + model_id, + "-t", + task, + "--device", + device, + "--ep", + ep, + "--no-compile", + "-o", + str(out_path), + ] + extra_args + rc, _, _ = run_cmd(cmd, label="winml config", timeout=CONFIG_TIMEOUT_S) + if rc == 0 and out_path.exists(): + try: + cfg = json.loads(out_path.read_text(encoding="utf-8")) + out_path.unlink(missing_ok=True) + return cfg + except Exception as e: + print(f" [warn] config parse error: {e}", flush=True) + out_path.unlink(missing_ok=True) + return None + + cfg = _try(["--model-type", model_type]) + if cfg is None: + print(" [warn] config with --model-type failed, retrying without...", flush=True) + cfg = _try([]) + return cfg + + +def run_build( + winml: str, + model_id: str, + cfg_path: Path, + out_dir: Path, + ep: str, + device: str, + extra_flags: list[str] | None = None, +) -> tuple[bool, str]: + """Run `winml build`. Returns (success, combined_output).""" + out_dir.mkdir(parents=True, exist_ok=True) + cmd = [ + winml, + "build", + "-c", + str(cfg_path), + "-m", + model_id, + "-o", + str(out_dir), + "--ep", + ep, + "--device", + device, + "--no-compile", + "--rebuild", + ] + if extra_flags: + cmd.extend(extra_flags) + rc, out, _ = run_cmd(cmd, label=f"winml build [{out_dir.name}]", timeout=BUILD_TIMEOUT_S) + return rc == 0, out + + +def make_hypothesis_config( + base: dict, opset_override: int | None, extra_optim: dict | None +) -> dict: + """Return a modified deep copy of base config for one hypothesis.""" + cfg = copy.deepcopy(base) + if opset_override is not None and cfg.get("export"): + cfg["export"]["opset_version"] = opset_override + if extra_optim is not None: + cfg["optim"] = {**(cfg.get("optim") or {}), **extra_optim} + return cfg + + +def find_model_onnx(hyp_dir: Path) -> Path | None: + """Locate the best ONNX artifact in a build output dir. + + Priority: quantized > optimized > any .onnx. + Returns None if no .onnx file exists. + """ + model_files = list(hyp_dir.glob("*.onnx")) + if not model_files: + return None + for preference in ("quantized", "optimized"): + match = next((f for f in model_files if preference in f.name), None) + if match: + return match + return model_files[0] + + +def is_build_complete(hyp_dir: Path) -> bool: + """Return True if the hyp_dir contains a complete build artifact. + + 'Complete' means optimized.onnx or quantized.onnx is present. + export.onnx alone means the pipeline was truncated before optimization. + """ + return any( + f.name for f in hyp_dir.glob("*.onnx") if "optimized" in f.name or "quantized" in f.name + ) + + +# ── benchmark helpers ───────────────────────────────────────────────────────── + + +class ScreenResult: + """Result from Phase A quick screen.""" + + __slots__ = ("p50_ms", "cv", "rc_failed") + + def __init__(self, p50_ms: float | None, cv: float, rc_failed: bool = False) -> None: + self.p50_ms = p50_ms + self.cv = cv + self.rc_failed = rc_failed # True only on subprocess failure; never on high CV + + @property + def hard_failed(self) -> bool: + """True if the bench command itself failed (rc != 0 or no output file).""" + return self.rc_failed + + def to_dict(self, ep: str = "cpu") -> dict: + note = None + if ep in ("qnn", "npu") and self.cv > 0.10: + note = "DVFS noise — high CV expected on QNN NPU (npu-007)" + return { + "p50_ms": round(self.p50_ms, 3) if self.p50_ms is not None else None, + "cv": round(self.cv, 4), + "note": note, + } + + +def bench_screen( + winml: str, + model_path: Path, + ep: str, + device: str, + out_json: Path | None = None, +) -> ScreenResult: + """Phase A: 200-iter screen. + + For QNN NPU: high CV is NORMAL (npu-007). Never treat high CV as failure. + Only hard-fail on subprocess rc != 0 or missing output file. + For CPU/GPU: high CV (> SCREEN_CV_MAX_STD) indicates measurement instability. + """ + if out_json is None: + out_json = model_path.parent / "screen_perf.json" + rc, _, _ = run_cmd( + [ + winml, + "perf", + "-m", + str(model_path), + "--ep", + ep, + "--device", + device, + "--warmup", + str(SCREEN_WARMUP), + "--iterations", + str(SCREEN_ITERS), + "-o", + str(out_json), + ], + label=f"perf screen ({SCREEN_ITERS} iters)", + timeout=BENCH_TIMEOUT_S, + ) + if rc != 0 or not out_json.exists(): + return ScreenResult(None, 999.0, rc_failed=True) + try: + data = json.loads(out_json.read_text(encoding="utf-8")) + lat = data.get("latency_ms", data) + p50 = lat.get("p50") if isinstance(lat, dict) else None + std = lat.get("std", 0.0) if isinstance(lat, dict) else 0.0 + if not p50: + return ScreenResult(None, 999.0, rc_failed=True) + cv = std / p50 if p50 > 0 else 999.0 + ep_tag = "NPU" if ep in ("qnn",) and device in ("npu",) else ep.upper() + print( + f" screen: p50={p50:.2f}ms cv={cv:.3f}" + + (" [DVFS-normal]" if ep_tag == "NPU" and cv > 0.10 else ""), + flush=True, + ) + return ScreenResult(p50, cv) + except Exception as e: + print(f" [warn] screen parse error: {e}", flush=True) + return ScreenResult(None, 999.0, rc_failed=True) + + +def bench_full( + winml: str, + model_path: Path, + ep: str, + device: str, + out_prefix: str = "full_perf", + warmup: int | None = None, + iters: int | None = None, + cool_down_s: int | None = None, +) -> list[float]: + """Phase B: 3 × FULL_ITERS-iter full bench with cool-down. + + Returns list of per-session p50_ms values. Empty list = all sessions failed. + Session files are written as {out_prefix}_s{n}.json in model_path.parent. + + warmup/iters/cool_down_s override module-level defaults when provided. + """ + _warmup = warmup if warmup is not None else FULL_WARMUP + _iters = iters if iters is not None else FULL_ITERS + _cool_down = cool_down_s if cool_down_s is not None else COOL_DOWN_S + p50s: list[float] = [] + for s in range(1, FULL_SESSIONS + 1): + out_json = model_path.parent / f"{out_prefix}_s{s}.json" + rc, _, _ = run_cmd( + [ + winml, + "perf", + "-m", + str(model_path), + "--ep", + ep, + "--device", + device, + "--warmup", + str(_warmup), + "--iterations", + str(_iters), + "-o", + str(out_json), + ], + label=f"perf full s{s}/{FULL_SESSIONS} ({_iters} iters)", + timeout=BENCH_TIMEOUT_S, + ) + if rc == 0 and out_json.exists(): + try: + data = json.loads(out_json.read_text(encoding="utf-8")) + lat = data.get("latency_ms", data) + p50 = lat.get("p50") if isinstance(lat, dict) else None + std = lat.get("std", 0.0) if isinstance(lat, dict) else 0.0 + if p50: + cv = std / p50 if p50 > 0 else 999.0 + print( + f" full s{s}: p50={p50:.2f}ms std={std:.2f}ms cv={cv:.3f}", + flush=True, + ) + p50s.append(round(p50, 3)) + except Exception as e: + print(f" [warn] full bench s{s} parse error: {e}", flush=True) + else: + print(f" [warn] full bench s{s} failed", flush=True) + if s < FULL_SESSIONS: + print(f" cool-down {_cool_down}s...", flush=True) + time.sleep(_cool_down) + return p50s + + +def median_p50(p50s: list[float]) -> float | None: + """Return the median of a list of p50 values, or None if empty.""" + if not p50s: + return None + return sorted(p50s)[len(p50s) // 2] + + +def ranges_non_overlapping(a: list[float], b: list[float]) -> bool | None: + """Return True if max(a) < min(b) (a is strictly faster than b). + + Returns None if either list is empty (can't determine). + """ + if not a or not b: + return None + return max(a) < min(b) + + +# ── ONNX analysis helpers ───────────────────────────────────────────────────── + + +def count_conv_pct(model_onnx: Path) -> tuple[float, int, int]: + """Count Conv ops as a percentage of all graph nodes. + + Returns (conv_pct, conv_count, total_count). + Used to assess npu-006 risk: Conv% > 20% means conv fusions will likely + produce FusedConv ops that QNN EP cannot dispatch (-> CPU fallback). + + Returns (0.0, 0, 0) if onnx is not installed or file is missing. + The caller must treat (0.0, 0, 0) as 'unknown', not as 'safe'. + """ + if not model_onnx.exists(): + return 0.0, 0, 0 + try: + import onnx # noqa: PLC0415 + + model = onnx.load(str(model_onnx)) + ops = [n.op_type for n in model.graph.node] + total = len(ops) + conv_count = sum(1 for o in ops if o == "Conv") + pct = conv_count / total * 100 if total > 0 else 0.0 + return round(pct, 1), conv_count, total + except Exception as e: + print(f" [warn] Conv% analysis failed (onnx not installed?): {e}", flush=True) + return 0.0, 0, 0 diff --git a/research/autoconfig/catalog_qnn_sweep.py b/research/autoconfig/catalog_qnn_sweep.py index ae06b4ad4..eb95ba41f 100644 --- a/research/autoconfig/catalog_qnn_sweep.py +++ b/research/autoconfig/catalog_qnn_sweep.py @@ -63,7 +63,7 @@ FULL_SESSIONS = 3 COOL_DOWN_S = 30 -MODEL_TIMEOUT_S = 20 * 60 # 20 min per model total +MODEL_TIMEOUT_S = 180 * 60 # 3 hours per model — 6 hypotheses × ~30min each BUILD_TIMEOUT_S = 8 * 60 # 8 min per individual build BENCH_TIMEOUT_S = 8 * 60 # 8 min per bench run EVAL_TIMEOUT_S = 6 * 60 # 6 min for accuracy eval @@ -153,12 +153,23 @@ def _count_conv_pct(model_onnx: Path) -> tuple[float, int, int]: """Count Conv ops in a built ONNX model. Returns (conv_pct, conv_count, total_count). Used to assess npu-006 risk before running conv-fusion hypotheses. Falls back to (0.0, 0, 0) if onnx is not importable or file missing. + + WARNING: (0.0, 0, 0) means UNKNOWN, not SAFE. The caller must treat a zero + result as unknown and emit a warning rather than silently skipping the guard. """ if not model_onnx.exists(): return 0.0, 0, 0 try: import onnx # noqa: PLC0415 - + except ImportError: + print( + " [ERROR] onnx package not installed — cannot assess npu-006 risk for conv fusions.\n" + " Install it: pip install onnx\n" + " Conv-fusion hypotheses (h4/h5) will be annotated as UNKNOWN risk.", + flush=True, + ) + return 0.0, 0, 0 + try: model = onnx.load(str(model_onnx)) ops = [n.op_type for n in model.graph.node] total = len(ops) @@ -568,10 +579,18 @@ def sweep_model( # After h0: analyze Conv% to assess npu-006 risk for h4/h5 if hyp_id == "h0" and onnx_path.exists(): conv_pct, conv_count, total_count = _count_conv_pct(onnx_path) - npu006_risk = conv_pct > NPU006_CONV_PCT_THRESHOLD - results["conv_pct"] = conv_pct + # Treat (0.0, 0, 0) as UNKNOWN (not safe) — onnx may be unavailable. + conv_unknown = conv_pct == 0.0 and total_count == 0 + npu006_risk = conv_pct > NPU006_CONV_PCT_THRESHOLD or conv_unknown + results["conv_pct"] = None if conv_unknown else conv_pct results["npu006_risk"] = npu006_risk - if npu006_risk: + if conv_unknown: + print( + " [npu-006] Conv% analysis returned UNKNOWN (onnx unavailable or file missing)" + " — treating h4/h5 as HIGH RISK to be safe", + flush=True, + ) + elif npu006_risk: print( f" [npu-006] Conv%={conv_pct:.1f}% ({conv_count}/{total_count} ops)" f" > {NPU006_CONV_PCT_THRESHOLD:.0f}% threshold", diff --git a/research/autoconfig/validation_sweep.py b/research/autoconfig/validation_sweep.py index 0384f8411..1ec68f752 100644 --- a/research/autoconfig/validation_sweep.py +++ b/research/autoconfig/validation_sweep.py @@ -346,7 +346,11 @@ def run_model(model_id, task, model_type, run_h4): continue p50_screen, cv, stable = bench_screen(model_path) - if p50_screen is None: + # npu-007: For QNN NPU, screen failure (rc!=0, empty output) must NOT gate Phase B. + # DVFS thermal noise can cause transient subprocess failures on first inference. + # Only skip Phase B if screen hard-failed AND the EP is not QNN NPU. + is_npu = EP == "qnn" and DEVICE == "npu" + if p50_screen is None and not is_npu: result["hypotheses"][hid] = { "status": "BENCH_FAIL", "label": label, @@ -355,12 +359,14 @@ def run_model(model_id, task, model_type, run_h4): continue p50s, median = bench_full(model_path) - status = "OK" if cv < 0.15 else "OK_HIGH_CV" + status = "OK" if (cv is None or cv < 0.15) else "OK_HIGH_CV" + if not p50s: + status = "BENCH_FAIL" result["hypotheses"][hid] = { "status": status, "screen": { - "p50_ms": round(p50_screen, 3), - "cv": round(cv, 4), + "p50_ms": round(p50_screen, 3) if p50_screen is not None else None, + "cv": round(cv, 4) if cv is not None else None, "stable": stable, "note": "DVFS noise — high CV expected on QNN NPU" if not stable else None, }, @@ -368,8 +374,10 @@ def run_model(model_id, task, model_type, run_h4): "label": label, "opset": opset_override or "auto", } + screen_str = f"{p50_screen:.2f}ms" if p50_screen is not None else "N/A" + cv_str = f"{cv:.3f}" if cv is not None else "N/A" print( - f" [RESULT {hid}] screen p50={p50_screen:.2f}ms CV={cv:.3f} full_median={median}ms sessions={p50s}", + f" [RESULT {hid}] screen p50={screen_str} CV={cv_str} full_median={median}ms sessions={p50s}", flush=True, ) From 59e7329d19a508d07aa27a8d7024beb0ed58458b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jun 2026 15:59:40 +0800 Subject: [PATCH 15/38] research(autoconfig): add VerdictPolicy, screen early exit, crash-resume (AgenticGPUOptimizer V2) Three improvements borrowed from AgenticGPUOptimizer V2 patterns: 1. ThroughputOnly verdict policy (bench_utils.py) - improvement must exceed max(1% floor, 2x screen-CV) - noise-level deltas (delta < stat_bar * CV) are DISCARD, not KEEP - marks marginal KKEPs (1x < delta < 1.5x threshold) as MARGINAL_KEEP 2. Screen phase early exit (autoconfig.py) - if screen improvement < 1%, skip 3x full-bench entirely - saves ~25-90 min per rejected hypothesis on first run - applied only when baseline_p50 is known (not first iter) 3. Crash-resume via SessionManager (bench_utils.py) - session.json written atomically after each experiment - on restart, completed iters are loaded and skipped - state includes baseline_p50, best_p50/label, consecutive_discards Also extracts _run_phase_b() helper to reduce main() nesting depth. --- research/autoconfig/autoconfig.py | 225 +++++++++++++++++++++-------- research/autoconfig/bench_utils.py | 175 ++++++++++++++++++++++ 2 files changed, 342 insertions(+), 58 deletions(-) diff --git a/research/autoconfig/autoconfig.py b/research/autoconfig/autoconfig.py index b8c0f315b..bc06b6448 100644 --- a/research/autoconfig/autoconfig.py +++ b/research/autoconfig/autoconfig.py @@ -39,6 +39,9 @@ FULL_SESSIONS, SCREEN_CV_MAX_STD, SCREEN_ITERS, + SessionManager, + ThroughputOnly, + VerdictInput, bench_full, bench_screen, median_p50, @@ -61,6 +64,14 @@ ACCURACY_FLOOR = 0.70 # cosine drop below this → discard MIN_IMPROVEMENT = 0.01 # require ≥1% p50 improvement to KEEP +# Verdict policy: improvement must exceed max(MIN_IMPROVEMENT, STAT_BAR * screen_cv) +# Borrowed from AgenticGPUOptimizer V2 (avoids calling noise-level deltas "improvements") +STAT_BAR_MULTIPLIER = 2.0 + +# Screen early exit: skip 3x full-bench when screen already shows < this % improvement. +# Saves ~25-90 min per rejected hypothesis (3 sessions × FULL_ITERS iters). +SCREEN_PASS_MIN_IMPROVEMENT_PCT = 1.0 + # Bench protocol (two-phase, from GPU Optimizer V2) SCREEN_WARMUP = 20 SCREEN_ITERS = 200 @@ -275,6 +286,68 @@ def _run_full(model_path: Path) -> list[float]: ) +def _run_phase_b( + out_dir: Path, + label: str, + exp_info: dict, + screen_cv: float, + baseline_p50: float | None, + best_p50: float, + best_label: str, + policy: ThroughputOnly, +) -> tuple[str, dict]: + """Run Phase B (full bench + accuracy gate) and evaluate with VerdictPolicy. + + Returns (status_str, updated exp_info). Does not update best_p50/best_label — + caller is responsible so champion tracking stays in one place. + """ + full_p50s = _run_full(out_dir / "model.onnx") + if not full_p50s: + exp_info["analysis"] = "Phase B winml perf returned no data" + return "crash (full bench failed)", exp_info + + med_p50 = median_p50(full_p50s) + assert med_p50 is not None + exp_info["full_p50s"] = [f"{p:.1f}" for p in full_p50s] + exp_info["median_p50"] = f"{med_p50:.1f}" + + # Promote baseline from first successful full bench + if baseline_p50 is None: + baseline_p50 = med_p50 + exp_info["baseline_p50"] = f"{baseline_p50:.1f}" + + # Accuracy gate + accuracy = eval_accuracy(out_dir) + exp_info["accuracy"] = f"{accuracy:.4f}" if accuracy is not None else "N/A" + + improvement_pct = (baseline_p50 - med_p50) / baseline_p50 * 100 + delta_pct = -improvement_pct + exp_info["delta_pct"] = f"{delta_pct:+.1f}%" + + correctness_pass = accuracy is None or accuracy >= ACCURACY_FLOOR + verdict = policy.evaluate( + VerdictInput( + improvement_pct=improvement_pct, + cv_pct=screen_cv * 100.0, + correctness_pass=correctness_pass, + ) + ) + + exp_info["analysis"] = verdict.reasoning + if verdict.verdict in ("KEEP", "MARGINAL_KEEP"): + status = "keep" + (" (marginal)" if verdict.marginal else "") + exp_info["analysis"] = ( + f"Improvement confirmed: p50 {baseline_p50:.1f}ms -> {med_p50:.1f}ms " + f"({delta_pct:+.1f}%). {verdict.reasoning}" + ) + elif verdict.verdict == "ACC_FAIL": + status = f"discard (accuracy {accuracy:.4f} < floor {ACCURACY_FLOOR})" + else: + status = f"discard ({verdict.reasoning})" + + return status, exp_info + + def eval_accuracy(out_dir: Path) -> float | None: """Run winml eval; return accuracy (top-1 or cosine). For latency: use bench_*.""" model_path = out_dir / "model.onnx" @@ -398,24 +471,44 @@ def main() -> None: for note in kb["notes"]: print(f" {note}") + # Resume from prior session if interrupted + session = SessionManager(WORK_DIR) + sep = "=" * 64 print(f"\n{sep}") print(f" autoconfig search -- {MODEL_ID}") print(f" EP: {EP} eval_samples: {EVAL_SAMPLES} hypotheses: {len(HYPOTHESES)}") print( - f" Bench: screen={SCREEN_ITERS} iters (CV<{SCREEN_CV_MAX}) → full={FULL_ITERS}×{FULL_SESSIONS}" + f" Bench: screen={SCREEN_ITERS} iters (CV<{SCREEN_CV_MAX}) -> full={FULL_ITERS}x{FULL_SESSIONS}" ) print(f" Stop: {STOP_CONSECUTIVE_DISCARDS} consecutive DISCARDs OR budget") print(f" External research trigger: after {EXTERNAL_RESEARCH_TRIGGER} DISCARDs same dimension") + print( + f" Verdict: improvement must exceed max({MIN_IMPROVEMENT * 100:.0f}%, {STAT_BAR_MULTIPLIER:.0f}x screen-CV)" + ) + print( + f" Screen early exit: skip full bench if screen improvement < {SCREEN_PASS_MIN_IMPROVEMENT_PCT:.0f}%" + ) print(f"{sep}\n") - baseline_p50: float | None = None - best_p50 = float("inf") - best_label = "" - consecutive_discards = 0 - discard_by_dimension: dict[str, int] = {} + # Restore state from prior session (if resuming) + baseline_p50: float | None = session.baseline_p50 + best_p50 = session.best_p50 + best_label = session.best_label + consecutive_discards = session.consecutive_discards + discard_by_dimension: dict[str, int] = session.discard_by_dimension + + policy = ThroughputOnly( + min_improvement_pct=MIN_IMPROVEMENT * 100, + stat_bar_multiplier=STAT_BAR_MULTIPLIER, + ) for i, (label, patch_fn, dimension) in enumerate(HYPOTHESES): + # Skip iters completed in a prior run + if i in session.completed_iters: + print(f" [resume] skipping iter {i} ({label}) — already done") + continue + iter_start = time.time() print(f"\n{'--' * 32}") print(f" iter {i} | {label} [{dimension}]") @@ -427,7 +520,7 @@ def main() -> None: (r for r in kb["skip_passes"] if any(f in flags_preview for f in r.split()[:2])), None ) if skip_reason: - print(f" ⏭️ skipped by KB confirmed rule: {skip_reason}") + print(f" skipped by KB confirmed rule: {skip_reason}") continue cfg = patch_fn(copy.deepcopy(BASELINE)) # type: ignore[operator] @@ -465,52 +558,59 @@ def main() -> None: f"Phase A rejected: CV={screen_cv:.2f} > {SCREEN_CV_MAX}. " f"Thermal or scheduling noise on {EP.upper()} EP. Cool device and retry." ) - else: - # Phase B: full bench - full_p50s = _run_full(out_dir / "model.onnx") - if not full_p50s: - status = "crash (full bench failed)" - exp_info["analysis"] = "Phase B winml perf returned no data" + elif baseline_p50 is not None: + # Screen early exit: skip full bench when screen shows negligible gain. + # Saves 3x full-bench time for clearly non-improving configs. + screen_improvement_pct = (baseline_p50 - screen_p50) / baseline_p50 * 100 + if screen_improvement_pct < SCREEN_PASS_MIN_IMPROVEMENT_PCT: + status = ( + f"discard (screen early exit: improvement {screen_improvement_pct:+.1f}%" + f" < {SCREEN_PASS_MIN_IMPROVEMENT_PCT:.0f}% — full bench skipped)" + ) + exp_info["analysis"] = ( + f"Phase A early exit: screen p50={screen_p50:.1f}ms vs baseline " + f"{baseline_p50:.1f}ms ({screen_improvement_pct:+.1f}% improvement) is " + f"below {SCREEN_PASS_MIN_IMPROVEMENT_PCT:.0f}% threshold. " + f"Full bench skipped — not worth 3x{FULL_ITERS} iters." + ) + exp_info["delta_pct"] = f"{-screen_improvement_pct:+.1f}% (screen estimate)" else: - med_p50 = median_p50(full_p50s) - assert med_p50 is not None - exp_info["full_p50s"] = [f"{p:.1f}" for p in full_p50s] - exp_info["median_p50"] = f"{med_p50:.1f}" - - # Set baseline from the first successful full bench (any iteration). - if baseline_p50 is None: - baseline_p50 = med_p50 - exp_info["baseline_p50"] = f"{baseline_p50:.1f}" - - # Accuracy gate - accuracy = eval_accuracy(out_dir) - exp_info["accuracy"] = f"{accuracy:.4f}" if accuracy is not None else "N/A" - - if accuracy is not None and accuracy < ACCURACY_FLOOR: - status = f"discard (accuracy {accuracy:.4f} < floor {ACCURACY_FLOOR})" - exp_info["analysis"] = "Accuracy regression below floor" - elif baseline_p50 is not None and med_p50 > baseline_p50 * ( - 1 - MIN_IMPROVEMENT - ): - delta_pct = (med_p50 - baseline_p50) / baseline_p50 * 100 - status = f"discard (Δp50={delta_pct:+.1f}% < {MIN_IMPROVEMENT * 100:.0f}% threshold)" - exp_info["delta_pct"] = f"{delta_pct:+.1f}%" - exp_info["analysis"] = ( - f"No meaningful improvement: {delta_pct:+.1f}% vs {MIN_IMPROVEMENT * 100:.0f}% threshold" - ) - else: - delta_pct = ( - (med_p50 - (baseline_p50 or med_p50)) / (baseline_p50 or med_p50) * 100 - ) - status = "keep" - exp_info["delta_pct"] = f"{delta_pct:+.1f}%" - exp_info["analysis"] = ( - f"Improvement confirmed: p50 {baseline_p50:.1f}ms -> {med_p50:.1f}ms ({delta_pct:+.1f}%)" - ) - if med_p50 < best_p50: - best_p50 = med_p50 + status, exp_info = _run_phase_b( + out_dir, + label, + exp_info, + screen_cv, + baseline_p50, + best_p50, + best_label, + policy, + ) + if status.startswith("keep"): + # Update champion tracking + new_p50 = float(exp_info.get("median_p50", best_p50)) + if new_p50 < best_p50: + best_p50 = new_p50 best_label = label status = "keep *** NEW BEST ***" + else: + # First iteration: no baseline yet — always run full bench + status, exp_info = _run_phase_b( + out_dir, label, exp_info, screen_cv, None, best_p50, best_label, policy + ) + if status.startswith("keep"): + new_p50 = float(exp_info.get("median_p50", best_p50)) + if new_p50 < best_p50: + best_p50 = new_p50 + best_label = label + status = "keep *** NEW BEST ***" + + # Extract baseline from first successful full bench + if baseline_p50 is None and "median_p50" in exp_info: + try: + baseline_p50 = float(exp_info["median_p50"]) + exp_info["baseline_p50"] = f"{baseline_p50:.1f}" + except (ValueError, TypeError): + pass # Write per-experiment doc (V2 pattern) exp_info["status"] = status @@ -522,13 +622,13 @@ def main() -> None: discard_by_dimension[dimension] = discard_by_dimension.get(dimension, 0) + 1 if discard_by_dimension[dimension] == EXTERNAL_RESEARCH_TRIGGER: print( - f"\n ⚡ EXTERNAL RESEARCH TRIGGER: {EXTERNAL_RESEARCH_TRIGGER} consecutive DISCARDs in [{dimension}]" + f"\n EXTERNAL RESEARCH TRIGGER: {EXTERNAL_RESEARCH_TRIGGER} consecutive DISCARDs in [{dimension}]" ) - print(" → Search ORT/QNN source code for mechanism before continuing") + print(" -> Search ORT/QNN source code for mechanism before continuing") print( - " → Check kMaxSupportedOpset for opset dimension, EP-specific rules for others" + " -> Check kMaxSupportedOpset for opset dimension, EP-specific rules for others" ) - print(f" → File findings in ep_knowledge/{EP}.json as 'draft' entry") + print(f" -> File findings in ep_knowledge/{EP}.json as 'draft' entry") else: consecutive_discards = 0 discard_by_dimension[dimension] = 0 @@ -553,13 +653,22 @@ def main() -> None: } ) - print(f" → {status}") + print(f" -> {status}") + + # Persist state for crash-resume + session.save( + iter_idx=i, + verdict=status, + baseline_p50=baseline_p50, + best_p50=best_p50, + best_label=best_label, + consecutive_discards=consecutive_discards, + discard_by_dimension=discard_by_dimension, + ) # Stop condition if consecutive_discards >= STOP_CONSECUTIVE_DISCARDS: - print( - f"\n 🛑 STOP: {STOP_CONSECUTIVE_DISCARDS} consecutive DISCARDs — plateau reached" - ) + print(f"\n STOP: {STOP_CONSECUTIVE_DISCARDS} consecutive DISCARDs — plateau reached") break print(f"\n{sep}") diff --git a/research/autoconfig/bench_utils.py b/research/autoconfig/bench_utils.py index e3e2db4d8..800f67e1d 100644 --- a/research/autoconfig/bench_utils.py +++ b/research/autoconfig/bench_utils.py @@ -23,6 +23,8 @@ import json import subprocess import time +from abc import ABC, abstractmethod +from dataclasses import dataclass from pathlib import Path # ── Protocol constants (overridable by callers via module-level reassignment) ─ @@ -365,6 +367,179 @@ def ranges_non_overlapping(a: list[float], b: list[float]) -> bool | None: # ── ONNX analysis helpers ───────────────────────────────────────────────────── +# ── Verdict policies ───────────────────────────────────────────────────────── + + +@dataclass +class VerdictInput: + """Inputs to a verdict policy. + + improvement_pct: positive = latency improvement + = (baseline_p50 - new_p50) / baseline_p50 * 100 + cv_pct: screen coefficient of variation as percent (e.g., 5.0 for 5%) + correctness_pass: True if accuracy/parity check passed + build_ok: True if build succeeded + """ + + improvement_pct: float + cv_pct: float + correctness_pass: bool + build_ok: bool = True + + +@dataclass +class VerdictOutput: + """Output from a verdict policy.""" + + verdict: str # KEEP | MARGINAL_KEEP | DISCARD | ACC_FAIL | BUILD_FAIL + reasoning: str + marginal: bool = False + threshold_pct: float = 0.0 + + +class VerdictPolicy(ABC): + """Abstract base for verdict policies.""" + + def __init__(self, min_improvement_pct: float = 1.0, stat_bar_multiplier: float = 2.0) -> None: + self.min_improvement_pct = min_improvement_pct + self.stat_bar_multiplier = stat_bar_multiplier + + @abstractmethod + def evaluate(self, inp: VerdictInput) -> VerdictOutput: ... + + +class ThroughputOnly(VerdictPolicy): + """KEEP iff improvement > max(min_improvement_pct, stat_bar * cv_pct). + + Parameterized statistical significance: forces improvements to exceed + measurement noise before being declared real (borrowed from + AgenticGPUOptimizer V2). Marks verdicts as 'marginal' when improvement is + between 1x and 1.5x the threshold. + """ + + def evaluate(self, inp: VerdictInput) -> VerdictOutput: + if not inp.build_ok: + return VerdictOutput("BUILD_FAIL", "Build step failed.") + if not inp.correctness_pass: + return VerdictOutput("ACC_FAIL", "Accuracy check failed.") + + threshold = max(self.min_improvement_pct, self.stat_bar_multiplier * inp.cv_pct) + + if inp.improvement_pct < threshold: + return VerdictOutput( + "DISCARD", + f"Improvement +{inp.improvement_pct:.1f}% < threshold {threshold:.1f}% " + f"(max({self.min_improvement_pct:.0f}% floor, " + f"{self.stat_bar_multiplier:.0f}x CV={inp.cv_pct:.1f}%))", + threshold_pct=threshold, + ) + + marginal = inp.improvement_pct < threshold * 1.5 + return VerdictOutput( + "MARGINAL_KEEP" if marginal else "KEEP", + f"Improvement +{inp.improvement_pct:.1f}% > threshold {threshold:.1f}%", + marginal=marginal, + threshold_pct=threshold, + ) + + +# ── Session manager ─────────────────────────────────────────────────────────── + + +class SessionManager: + """Crash-resume state manager backed by session.json. + + Writes session state atomically (temp-file + rename) after each experiment + so an interrupted run can be resumed from where it left off. + + Usage:: + sm = SessionManager(WORK_DIR) + if sm.has_state: + print(f"Resuming: {len(sm.completed_iters)} completed iters") + # In the hypothesis loop: + if i in sm.completed_iters: + continue + # ... run experiment ... + sm.save(iter_idx=i, verdict=status, baseline_p50=..., ...) + """ + + def __init__(self, work_dir: Path) -> None: + self.path = work_dir / "session.json" + self._state: dict = {} + if self.path.exists(): + try: + self._state = json.loads(self.path.read_text(encoding="utf-8")) + n = len(self.completed_iters) + if n > 0: + print( + f" [session] Resuming: {n} completed iter(s) loaded from {self.path.name}", + flush=True, + ) + except Exception as e: + print(f" [session] Warning: could not load {self.path.name}: {e}", flush=True) + + @property + def has_state(self) -> bool: + return bool(self._state) + + @property + def completed_iters(self) -> set[int]: + return set(self._state.get("completed_iters", [])) + + @property + def baseline_p50(self) -> float | None: + return self._state.get("baseline_p50") + + @property + def best_p50(self) -> float: + v = self._state.get("best_p50") + return float(v) if v is not None else float("inf") + + @property + def best_label(self) -> str: + return self._state.get("best_label", "") + + @property + def consecutive_discards(self) -> int: + return int(self._state.get("consecutive_discards", 0)) + + @property + def discard_by_dimension(self) -> dict[str, int]: + return dict(self._state.get("discard_by_dimension", {})) + + def save( + self, + *, + iter_idx: int, + verdict: str, + baseline_p50: float | None, + best_p50: float, + best_label: str, + consecutive_discards: int, + discard_by_dimension: dict[str, int], + ) -> None: + """Save current state to session.json atomically.""" + completed = list(self.completed_iters | {iter_idx}) + self._state.update( + { + "completed_iters": completed, + "last_verdict": verdict, + "baseline_p50": baseline_p50, + "best_p50": best_p50 if best_p50 < float("inf") else None, + "best_label": best_label, + "consecutive_discards": consecutive_discards, + "discard_by_dimension": discard_by_dimension, + "last_iter": iter_idx, + } + ) + tmp = self.path.with_suffix(".tmp") + try: + tmp.write_text(json.dumps(self._state, indent=2), encoding="utf-8") + tmp.replace(self.path) + except Exception as e: + print(f" [session] Warning: could not save session state: {e}", flush=True) + + def count_conv_pct(model_onnx: Path) -> tuple[float, int, int]: """Count Conv ops as a percentage of all graph nodes. From da32a887f48adbfb839284be4fbd04da5a52afa1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jun 2026 16:29:59 +0800 Subject: [PATCH 16/38] research(autoconfig): update diagram + agent-design, add ep-findings-summary.html autoconfig_diagram.html (v3): - Phase 2 Optimizer: screen early exit box (skip full bench when screen delta < 1%) - Phase 2 Reviewer: ThroughputOnly verdict policy with KEEP/MARGINAL_KEEP/DISCARD/EARLY pills - Phase 2: crash-resume session.json box (new teal row) - Phase 0: session.json load on startup (crash-resume) - Phase 1 skip_set: updated with empirical KB rules (npu-006 Conv% gate, cpu-002, gpu-004, etc.) - Side panel: session.json added alongside results.tsv and ep_knowledge/ - Footnote: v3 change summary + pending features with issue references agent-design.md: - New Section 2.1: improved loop V3 (what it does well) vs remaining agent gaps - Section 2.2: corrected framing (original was wrong; V3 fixes the computation layer; agent gaps are explanation/architecture-awareness/cross-device/KB self-update) - Date updated to 2026-06-17 docs/ep-findings-summary.html (new): - 17 findings across QNN NPU / CPU / DML / QNN GPU, only confirmed/valid - Color-coded by EP, confidence badges (HIGH/MEDIUM/LOW) - Per-finding: observation data, scope, autoconfig action - 7 feature requests table with issue IDs (#155, #158, #443, #867, #868) + 2 not-yet-filed gaps (FusedConv detect, DML analyze rules) --- research/autoconfig/autoconfig_diagram.html | 526 +++++------- research/autoconfig/docs/agent-design.md | 41 +- .../autoconfig/docs/ep-findings-summary.html | 764 ++++++++++++++++++ 3 files changed, 1017 insertions(+), 314 deletions(-) create mode 100644 research/autoconfig/docs/ep-findings-summary.html diff --git a/research/autoconfig/autoconfig_diagram.html b/research/autoconfig/autoconfig_diagram.html index 0a60a4bbc..4769e0669 100644 --- a/research/autoconfig/autoconfig_diagram.html +++ b/research/autoconfig/autoconfig_diagram.html @@ -11,55 +11,30 @@ padding: 32px 24px; color: #1a1a2e; } - h1 { - font-size: 18px; font-weight: 700; color: #1a1a2e; - margin-bottom: 6px; - } - .subtitle { - font-size: 12px; color: #666; margin-bottom: 28px; + h1 { font-size: 18px; font-weight: 700; color: #1a1a2e; margin-bottom: 4px; } + .subtitle { font-size: 12px; color: #666; margin-bottom: 6px; } + .version-tag { + display: inline-block; background: #e8eaf6; color: #3949ab; + border-radius: 4px; padding: 2px 8px; font-size: 10px; font-weight: 700; + letter-spacing: 0.5px; margin-bottom: 24px; } /* ── Layout ── */ - .diagram { - display: flex; - flex-direction: column; - align-items: center; - gap: 0; - position: relative; - } + .diagram { display: flex; flex-direction: column; align-items: center; gap: 0; } /* ── Phase strips ── */ .phase-row { - display: flex; - align-items: stretch; - width: 100%; - max-width: 860px; - margin-bottom: 0; + display: flex; align-items: stretch; width: 100%; max-width: 900px; margin-bottom: 0; } .phase-label { - writing-mode: vertical-rl; - text-orientation: mixed; - transform: rotate(180deg); - font-size: 10px; - font-weight: 700; - letter-spacing: 1.2px; - text-transform: uppercase; - padding: 12px 8px; - border-radius: 8px 0 0 8px; - min-width: 32px; - display: flex; - align-items: center; - justify-content: center; - flex-shrink: 0; + writing-mode: vertical-rl; text-orientation: mixed; transform: rotate(180deg); + font-size: 10px; font-weight: 700; letter-spacing: 1.2px; text-transform: uppercase; + padding: 12px 8px; border-radius: 8px 0 0 8px; min-width: 32px; + display: flex; align-items: center; justify-content: center; flex-shrink: 0; } .phase-body { - flex: 1; - padding: 16px 20px; - border-radius: 0 8px 8px 0; - display: flex; - align-items: center; - gap: 12px; - flex-wrap: wrap; + flex: 1; padding: 16px 20px; border-radius: 0 8px 8px 0; + display: flex; align-items: center; gap: 12px; flex-wrap: wrap; } /* ── Phase colors ── */ @@ -74,299 +49,179 @@ /* ── Connector arrows ── */ .arrow { - display: flex; - justify-content: center; - align-items: center; - height: 28px; - width: 100%; - max-width: 860px; - position: relative; + display: flex; justify-content: center; align-items: center; + height: 28px; width: 100%; max-width: 900px; position: relative; } .arrow::before { - content: ''; - position: absolute; - left: 50%; - top: 0; - bottom: 0; - width: 2px; - background: #aab; + content: ''; position: absolute; left: 50%; top: 0; bottom: 0; + width: 2px; background: #aab; } .arrow-head { - position: relative; - z-index: 1; - background: #f4f6f9; - padding: 0 6px; - font-size: 18px; - color: #889; - line-height: 1; - } - .arrow-label { - position: relative; - z-index: 1; - background: #f4f6f9; - font-size: 10px; - color: #778; - padding: 2px 6px; - border-radius: 4px; - margin-left: 4px; + position: relative; z-index: 1; background: #f4f6f9; + padding: 0 6px; font-size: 18px; color: #889; line-height: 1; } - /* ── Boxes inside phases ── */ + /* ── Generic box ── */ .box { - background: #fff; - border-radius: 8px; - padding: 10px 14px; - border: 1.5px solid #dde; - font-size: 12px; - min-width: 130px; + background: #fff; border-radius: 8px; padding: 10px 14px; + border: 1.5px solid #dde; font-size: 12px; min-width: 130px; } .box-title { - font-weight: 700; - font-size: 11px; - margin-bottom: 5px; - display: flex; - align-items: center; - gap: 6px; - } - .box-title .icon { font-size: 14px; } - .box ul { - padding-left: 14px; - color: #445; - line-height: 1.6; - font-size: 11px; + font-weight: 700; font-size: 11px; margin-bottom: 5px; + display: flex; align-items: center; gap: 6px; } + .box ul { padding-left: 14px; color: #445; line-height: 1.6; font-size: 11px; } .box code { - font-family: "Cascadia Code", "Fira Code", monospace; - font-size: 10px; - background: #f0f2f5; - padding: 1px 4px; - border-radius: 3px; - color: #2d4a8a; + font-family: "Cascadia Code","Fira Code",monospace; font-size: 10px; + background: #f0f2f5; padding: 1px 4px; border-radius: 3px; color: #2d4a8a; } - /* ── Loop box (Phase 2) ── */ - .loop-container { - width: 100%; - display: flex; - gap: 10px; - align-items: flex-start; - position: relative; - } - .loop-agents { - flex: 1; - display: flex; - flex-direction: column; - gap: 8px; - } + /* ── Loop (Phase 2) ── */ + .loop-container { width: 100%; display: flex; gap: 10px; align-items: flex-start; } + .loop-agents { flex: 1; display: flex; flex-direction: column; gap: 8px; } .loop-agent { - background: #fff; - border-radius: 8px; - padding: 10px 14px; - border: 1.5px solid #ffe0b2; - font-size: 11px; - position: relative; + background: #fff; border-radius: 8px; padding: 10px 14px; + border: 1.5px solid #ffe0b2; font-size: 11px; } - .loop-agent .agent-title { - font-weight: 700; - font-size: 11px; - margin-bottom: 4px; - color: #bf360c; - } - .loop-agent ul { - padding-left: 14px; - color: #445; - line-height: 1.65; - } - .loop-side { - display: flex; - flex-direction: column; - gap: 8px; - min-width: 150px; + .loop-agent .agent-title { font-weight: 700; font-size: 11px; margin-bottom: 4px; color: #bf360c; } + .loop-agent ul { padding-left: 14px; color: #445; line-height: 1.65; } + .loop-agent .new-badge { + display: inline-block; background: #e8f5e9; color: #2e7d32; border: 1px solid #a5d6a7; + border-radius: 4px; padding: 0 5px; font-size: 9px; font-weight: 700; + letter-spacing: 0.5px; margin-left: 6px; vertical-align: middle; } + .loop-side { display: flex; flex-direction: column; gap: 8px; min-width: 158px; } .stop-box { - background: #fff; - border: 1.5px dashed #aab; - border-radius: 8px; - padding: 10px 14px; - font-size: 11px; - color: #556; + background: #fff; border: 1.5px dashed #aab; border-radius: 8px; + padding: 10px 14px; font-size: 11px; color: #556; } .stop-box .stop-title { font-weight: 700; margin-bottom: 4px; } .stop-box ul { padding-left: 14px; line-height: 1.65; } - - /* loop back arrow */ .loop-back { - display: flex; - align-items: center; - gap: 6px; - margin-top: 6px; - padding: 5px 10px; - background: #fff8f0; - border-radius: 20px; - font-size: 10px; - color: #e65100; - border: 1px solid #ffcc80; - align-self: flex-start; + display: flex; align-items: center; gap: 6px; margin-top: 6px; + padding: 5px 10px; background: #fff8f0; border-radius: 20px; + font-size: 10px; color: #e65100; border: 1px solid #ffcc80; align-self: flex-start; } + .mini-arrow { text-align: center; font-size: 13px; color: #e65100; line-height: 1; margin: -2px 0; } - /* ── Output badges ── */ - .output-badges { - display: flex; - gap: 10px; - flex-wrap: wrap; + /* verdict pill */ + .verdict-row { + display: flex; gap: 6px; flex-wrap: wrap; margin-top: 5px; padding-left: 4px; } + .pill { + padding: 2px 8px; border-radius: 10px; font-size: 10px; font-weight: 700; + letter-spacing: 0.3px; + } + .pill-keep { background: #e8f5e9; color: #2e7d32; border: 1px solid #a5d6a7; } + .pill-marginal{ background: #fff8e1; color: #f57f17; border: 1px solid #ffe082; } + .pill-discard { background: #fce4ec; color: #880e4f; border: 1px solid #f48fb1; } + .pill-early { background: #e3f2fd; color: #1565c0; border: 1px solid #90caf9; } + .pill-crash { background: #efebe9; color: #4e342e; border: 1px solid #bcaaa4; } + + /* ── Output badges ── */ + .output-badges { display: flex; gap: 10px; flex-wrap: wrap; } .badge { - background: #fff; - border-radius: 8px; - padding: 8px 14px; - border: 1.5px solid #f48fb1; - font-size: 11px; - min-width: 160px; + background: #fff; border-radius: 8px; padding: 8px 14px; + border: 1.5px solid #f48fb1; font-size: 11px; min-width: 160px; } .badge .badge-title { - font-weight: 700; - font-size: 10px; - color: #880e4f; - margin-bottom: 3px; - text-transform: uppercase; - letter-spacing: 0.5px; + font-weight: 700; font-size: 10px; color: #880e4f; + margin-bottom: 3px; text-transform: uppercase; letter-spacing: 0.5px; } .badge code { - display: block; - font-family: "Cascadia Code", monospace; - font-size: 10px; - background: #fce4ec; - padding: 2px 5px; - border-radius: 3px; - margin-top: 2px; - color: #6a0e30; + display: block; font-family: "Cascadia Code",monospace; font-size: 10px; + background: #fce4ec; padding: 2px 5px; border-radius: 3px; margin-top: 2px; color: #6a0e30; } - /* ── User input block ── */ + /* ── User input ── */ .user-input { - max-width: 860px; - width: 100%; - background: #fff; - border: 2px solid #3949ab; - border-radius: 10px; - padding: 12px 20px; - display: flex; - align-items: center; - gap: 20px; - margin-bottom: 0; + max-width: 900px; width: 100%; background: #fff; border: 2px solid #3949ab; + border-radius: 10px; padding: 12px 20px; display: flex; align-items: center; gap: 20px; } .user-input .ui-icon { font-size: 24px; } .user-input .ui-text { font-size: 12px; } .user-input .ui-text strong { font-size: 13px; } .tag { - display: inline-block; - background: #e8eaf6; - color: #3949ab; - border-radius: 4px; - padding: 2px 6px; - font-size: 10px; - font-weight: 600; - margin: 2px 3px 2px 0; + display: inline-block; background: #e8eaf6; color: #3949ab; + border-radius: 4px; padding: 2px 6px; font-size: 10px; font-weight: 600; margin: 2px 3px 2px 0; } + /* profile-result box */ .profile-result { - background: #e0f7fa; - border: 1.5px solid #80cbc4; - border-radius: 6px; - padding: 6px 10px; - font-size: 10.5px; - color: #004d40; - min-width: 190px; + background: #e0f7fa; border: 1.5px solid #80cbc4; border-radius: 6px; + padding: 6px 10px; font-size: 10.5px; color: #004d40; min-width: 190px; } .profile-result strong { font-size: 11px; display: block; margin-bottom: 3px; } - .profile-result .skip { color: #c62828; } - .profile-result .priority { color: #1b5e20; } - - hr.section-divider { - width: 100%; - max-width: 860px; - border: none; - border-top: 1px dashed #ccd; - margin: 4px 0; - } - - /* connector between agent rows */ - .mini-arrow { - text-align: center; - font-size: 13px; - color: #e65100; - line-height: 1; - margin: -2px 0; - }

autoconfig — Skill Architecture

Profile-guided autonomous config search for WinApp developers

+
v3 · 2026-06-17 · AgenticGPUOptimizer V2 patterns applied
- +
-
👤
+
👤
User input
Model ID  +  Target EP/device  +  Objective: accuracy-primary latency-primary Pareto -   + optional constraints (latency budget, accuracy floor) +  + optional constraints (latency budget, accuracy floor)
-
+
- +
Phase 0 · Intake
-
🔍 Inspect
+
Inspect
  • winml inspect
  • -
  • Validate model is supported
  • +
  • Validate model supported
  • Check EP availability
  • +
  • Load session.json (crash-resume)
-
+
-
🏗️ Baseline Build
+
Baseline Build
  • winml build (default config)
  • opset=17, no quant
  • Output: baseline/model.onnx
-
+
-
Correctness Contract
+
Correctness Contract
  • winml eval --mode compare
  • Lock cosine = 1.000
  • -
  • Record baseline p50
  • +
  • Record baseline p50 in session.json
-
+
- +
Phase 1 · Insight
-
-
📊 Runtime Profile
+
Runtime Profile
  • winml perf --profile
  • ORT per-op kernel time
  • @@ -377,57 +232,55 @@

    autoconfig — Skill Architecture

-
🔬 Static Analyzer
+
Static Analyzer
  • winml analyze --ep <ep>
  • Partial-support ops list
  • EP fallback candidates
  • -
  • Quant-sensitive node names
  • -
  • EP-specific constraints
  • +
  • Conv% → npu-006 risk flag
  • +
  • FusedConv detection (pending)
-
🗂️ Graph Analysis
+
Graph Analysis
  • ONNX proto inspection
  • -
  • opset version (kMaxSupportedOpset check)
  • Op counts per type
  • -
  • Fusion opportunities (decomposed subgraphs)
  • +
  • Conv% gate for npu-006
  • +
  • Fusion opportunities
  • Static shape vs dynamic axes
-
-
Insight Engine — fuse 3 signals →
+
Insight Engine — fuse 3 signals →
-
-
🚫 skip_set (passes to prune)
+
skip_set (passes to prune)
    -
  • Gelu op present → skip gelu-fusion
  • -
  • LN op present → skip layer-norm-fusion
  • -
  • ReorderInput > 2% → skip nchwc-transformer
  • -
  • Transpose < 5% + opset=17 → skip transpose-opt
  • -
  • opset ≥ 19 + Transpose > 10% → flag [KNOWN_TRADEOFF]
  • -
  • Partial-op list empty → skip nodes_to_exclude trials
  • +
  • EP=QNN NPU + Conv%>20% → block conv-bn/add/act fusions (npu-006)
  • +
  • EP=CPU + opset>17 → skip high opsets (cpu-001)
  • +
  • EP=QNN GPU or DML → skip nhwc-transformer (gpu-002/dml-002)
  • +
  • EP=QNN GPU → skip all quantization (gpu-004 hang)
  • +
  • Model already has Gemm → skip matmul-add-fusion (cpu-002)
  • +
  • Partial-op list empty → skip nodes_to_exclude trials
-
📋 priority_queue (ranked hypotheses)
+
priority_queue (ranked hypotheses)
    -
  • Gemm > 50% → quant precision, calib method first
  • -
  • Conv > 20% → nchwc, conv-fusions first
  • -
  • Partial ops exist → nodes_to_exclude exclusion trials
  • -
  • Decomposed Gelu subgraph → gelu-fusion trial
  • -
  • Dynamic axes → try static shape export
  • +
  • EP=QNN NPU + DINOv2-family → opset 21 first (npu-001, +24-31%)
  • +
  • Gemm>50% → quant precision, calib method first
  • +
  • Conv%>20%, Conv-only model → nchwc trial
  • +
  • Decomposed GELU → gelu-fusion trial (gpu-005: stability+)
  • +
  • Dynamic axes → static shape export
@@ -435,79 +288,109 @@

autoconfig — Skill Architecture

-
+
- +
Phase 2 · Opt Loop
-
+
-
🔭 Explorer
+
Explorer
    +
  • Skip iters already in session.json (crash-resume) NEW
  • Pop next hypothesis from priority_queue
  • -
  • Check KB ep_knowledge/<ep>.json — skip if "confirmed" rule prunes it
  • +
  • Check KB ep_knowledge/<ep>.json confirmed rules → skip if pruned
  • Build config.json delta (opset / quant / fusions)
  • -
  • ⚡ External research trigger: 5 DISCARDs in same dimension → read ORT/QNN source code
  • +
  • External research trigger: 5 DISCARDs same dimension → read ORT/QNN source
-
+
+
-
⚙️ Optimizer
+
Optimizer
  • winml build -c config.json
  • -
  • Phase A: winml perf --iter 200 → CV = std/p50
  • -
  • CV > 10%? → REJECT (DVFS noise) — do NOT run full bench
  • -
  • Phase B (if CV passes): winml perf --iter 1000 ×3, 60s cool-down
  • -
  • winml eval --samples 100 → accuracy gate
  • +
  • Phase A — screen (200 iters, CV = std/p50):
  • +
  • CPU/GPU: CV>10% → UNSTABLE, skip full bench
  • +
  • QNN NPU: CV gate DISABLED (npu-007 DVFS normal)
  • +
  • Screen early exit NEW: if baseline known and screen Δp50 < 1% → DISCARD (early) without full bench
  • +
  • Phase B — full bench (3 × 1000 iters, 60s cool-down)
  • +
  • winml eval --samples 100 → accuracy gate
-
+
+
-
🔎 Reviewer
+
Reviewer — ThroughputOnly Verdict Policy NEW
    -
  • keep — all 3 p50s better than baseline × 99% AND cosine ≥ floor
  • -
  • discard — revert to last kept config; write per-experiment .md
  • -
  • unstable — CV too high; log [UNSTABLE], do not count as DISCARD
  • +
  • Compute: threshold = max(1% floor, 2.0 × screen_CV)
  • +
  • improvement = (baseline − new_p50) / baseline × 100%
  • +
+
+ KEEP > 1.5×threshold + MARGINAL KEEP 1x–1.5x + DISCARD < threshold + EARLY DISCARD (screen) + ACC_FAIL / BUILD_FAIL +
+
    +
  • On KEEP: promote baseline_p50 → cumulative search enabled
  • Write KB draft entry if new mechanism observed (status="draft")
-
↩ loop back to Explorer  (until stop condition)
+ +
+
Crash-Resume: save session.json NEW
+
    +
  • Atomically written after every experiment (temp-file + rename)
  • +
  • Stores: completed_iters, baseline_p50, best_p50/label, consecutive_discards
  • +
  • On next startup: skip completed iters, restore state
  • +
+
+ +
↩ loop back to Explorer (until stop condition)
- +
-
🛑 Stop conditions
+
Stop conditions
  • Objective achieved
  • -
  • 30 consecutive DISCARDs (plateau)
  • +
  • 30 consecutive DISCARDs
  • priority_queue empty
  • User manually stops
-
📋 results.tsv
+
results.tsv
Every experiment:
- config · screen_p50 · median_p50
- CV · delta_pct · status · dim + config · screen_p50 · median_p50
+ CV · delta_pct · status · dim
-
-
📚 ep_knowledge/
- New entries written as
- status="draft"
- Promoted to "confirmed"
- only after Gate 2 (source) +
+
session.json
+ Crash-resume state:
+ completed_iters
+ baseline/best p50
+ discard counters +
+
+
ep_knowledge/
+ New entries as
+ status="draft"
+ Confirmed after Gate 2
@@ -515,9 +398,9 @@

autoconfig — Skill Architecture

-
+
- +
Phase 3 · Report
@@ -534,38 +417,63 @@

autoconfig — Skill Architecture

Per-Experiment Artifacts
- hypothesis/impl/parity/perf/analysis/decision + hypothesis / impl / parity / perf / analysis / decision experiments/<n>/experiment.md
KB Draft Entry
- New findings (status="draft") — promoted to "confirmed" after Gate 2 + New findings (status="draft") — promoted after Gate 2 ep_knowledge/<ep>.json
-
Manifest (multi-EP)
- Ranked EP configs for WinApp deployment - manifest.json +
Session State
+ Crash-resume artifact (auto-deleted on success) + session.json
- -
- Key insight (validated on ConvNext): - Profiler first → Gemm=57.7%, Transpose=2.6% → skip_set eliminates 16+ irrelevant pass experiments before search starts. - Estimated reduction: 22 experiments → ~6 with identical conclusions. -
- Bench protocol (from GPU Optimizer V2): - Phase A: 200-iter CV screen (CV = std/p50 < 10%) gates Phase B. - Phase B: 3×1000-iter with 60s cool-down. KEEP only if all 3 sessions beat baseline × 99%. - Single 50-iter run is NOT sufficient — DVFS on mobile NPUs causes 2-10× run-to-run variance. + +
+ + v3 changes (2026-06-17, AgenticGPUOptimizer V2 patterns): + 1) ThroughputOnly verdict policy — + improvement must exceed max(1% floor, 2×screen_CV). + Noise-level deltas that previously slipped through as KEEP (delta=1.2%, CV=1%) are now correctly DISCARD. + Marginal cases flagged as MARGINAL_KEEP for manual review. +   + 2) Screen early exit — + if screen p50 improvement < 1% vs baseline, skip 3×full-bench immediately. + Saves ~25–90 min per rejected hypothesis. +   + 3) Crash-resume via session.json — + every experiment atomically committed. On restart, completed iters skipped. No data lost. + +

+ + Bench protocol: + Phase A: 200-iter screen — CV gate for CPU/GPU only (10% threshold); disabled for QNN NPU (npu-007 DVFS normal). + Phase B: 3×1000-iter with 60s cool-down. + KEEP only if improvement > statistical threshold. +
- External research trigger: After 5 DISCARDs in same search dimension → read ORT/QNN source code. - Lesson: opset 21 QNN NPU effect (kMaxSupportedOpset gate) was discovered accidentally. Systematic external-research would have found it after 5 graph-pass DISCARDs. + + Key empirical constraints loaded from ep_knowledge/: + npu-006 (conv fusions → 4900% regression on ResNet, Conv%>20% gate); + npu-007 (DVFS CV gate disabled on NPU); + cpu-001 (opset 17 only on CPU); + gpu-004 (quantization hangs on QNN GPU); + cpu-002 (matmul-add-fusion regresses if Gemm already present). +
- Dependency: winml perf --profile (new flag); POC: winml_profile.py bridges until it ships. + + Pending features: + winml perf --profile (#158); + --precision fp16 (#867); + FusedConv detection in winml analyze (not yet filed); + multi-session bench protocol in winml perf (#155). +
diff --git a/research/autoconfig/docs/agent-design.md b/research/autoconfig/docs/agent-design.md index fa72a332f..688dad029 100644 --- a/research/autoconfig/docs/agent-design.md +++ b/research/autoconfig/docs/agent-design.md @@ -1,6 +1,6 @@ # WinML CLI Agent Design -> Status: Draft — 2026-06-11 +> Status: Draft — 2026-06-17 (updated: autoconfig loop V3 changes incorporated) > Context: Strategic design for the agent layer of winml-cli --- @@ -38,14 +38,45 @@ WinApp developers lack access to a senior ML engineer who: ## 2. Agent Design Philosophy -### 2.1 The Wrong Design (Current Autoconfig) +### 2.1 The Improved Loop (autoconfig V3) vs The Agent Layer -The current autoconfig agent runs a **headless search loop**: +The autoconfig search loop has been significantly improved since the initial draft. As of v3 (`59e7329d`): + +**What the improved loop does well:** +- Statistical significance via `ThroughputOnly` verdict policy: `improvement > max(1% floor, 2× screen_CV)` — noise-level deltas no longer pass as KEEP +- Screen early exit: if screen improvement < 1%, skip 3× full bench — saves 25–90 min per rejected hypothesis +- Crash-resume via `session.json`: atomic state persistence, restartable without re-running completed experiments +- KB-guided search: `ep_knowledge/*.json` confirmed rules prune the search space before any experiment runs +- DVFS-aware bench protocol: npu-007 CV gate disabled on QNN NPU; 3× 500-iter sessions with cool-down +- npu-006 guard: Conv% > 20% → hard-block conv fusions before they cause 4900% regression + +**What still requires the agent layer:** + +The loop is a *computation engine*, not an *intelligence layer*. It needs an agent because: + +1. **No architecture-aware hypothesis generation** — hypotheses are hardcoded per EP, not generated from model analysis. An attention-heavy model gets the same hypotheses as a Conv-heavy one. +2. **No failure explanation** — DISCARD is logged but not explained. Developers can't learn from results without reading raw JSON. +3. **No cross-device reasoning** — a config found on Snapdragon X Elite has unknown behavior on Intel Meteor Lake. The loop can't tell you that. +4. **No adaptive stopping** — 30-DISCARD plateau is a static heuristic. An agent would recognize when all architectural levers for this model/EP pair have been exhausted. +5. **No KB self-update** — KB is manually maintained. An agent with memory extraction (cf. AgenticGPUOptimizer `memory_extractor.py`) would auto-update `ep_knowledge/*.json` after each run. + +The revised framing: **autoconfig is a sub-tool that the agent invokes and explains, not a headless replacement for the agent**. + +### 2.2 The Wrong Design (Original Autoconfig) + +The *original* autoconfig ran a **headless search loop** with no statistical significance, no crash-resume, and no KB-guided pruning: Explorer → Optimizer → Reviewer → repeat -**Problems with this approach:** +**Problems that were present (now fixed in V3):** + +- No statistical significance — 1% hardcoded floor meant noise-level deltas passed as KEEP +- No screen early exit — every hypothesis ran 3× full bench regardless of screen result +- No crash-resume — an interrupted run lost all state +- All optim keys in kebab-case → `build_config()` silently used snake_case lookups → every hypothesis ran as baseline (critical bug, fixed) + +**Remaining problems (require agent layer to fix):** -- A Python script can do benchmark loops faster, cheaper, and more reliably than an LLM agent +- A Python script can do benchmark loops faster, cheaper, and more reliably than an LLM agent — the loop is good, the LLM overhead is not worth it - Results (config files) are not auditable — developer cannot verify why a config was chosen - No explainability — developer doesn't understand what was decided or why - Treats developer as absent; no collaborative interaction diff --git a/research/autoconfig/docs/ep-findings-summary.html b/research/autoconfig/docs/ep-findings-summary.html new file mode 100644 index 000000000..5fd864164 --- /dev/null +++ b/research/autoconfig/docs/ep-findings-summary.html @@ -0,0 +1,764 @@ + + + + +WinML EP Findings — Validated Catalog + + + + +

WinML EP Findings — Validated Catalog

+

+ Hardware: Snapdragon X Elite CRD  |  ORT: 1.24.5 (onnxruntime-windowsml)  |  + QNN SDK: Hexagon HTP (NPU) + Adreno X1-85 (GPU/DML)  |  + Last updated: 2026-06-17  |  14 models tested (QNN NPU), 1 model (CPU/DML/GPU) +

+ +
+
17
total findings
+
9
high confidence
+
6
medium confidence
+
2
low / single-run
+
14
models tested (NPU)
+
7
feature requests
+
+ +
+ Scope warning: All findings are from 1 hardware device (Snapdragon X Elite CRD). + CPU/DML/GPU findings are from 1 model only (facebook/convnext-tiny-224). + QNN NPU findings cover 14 models. Always re-validate on new model architectures before using to prune search space. + Confidence levels reflect mechanism certainty, not universal applicability. +
+ + +
+
+ QNN NPU  —  Hexagon HTP (Snapdragon X Elite) +  14 models tested, 3×500-iter sessions, 30s cool-down +
+
+ + +
+
npu-006
+
HIGH
confirmed
+
+
Conv fusions cause catastrophic CPU fallback on Conv-dominant models
+
+ ResNet-18 with conv-bn-fusion + conv-add-fusion + conv-activation-fusion: + 3-session p50 = [132.3, 135.0, 130.7]ms (CV=0.016) vs baseline ~1-4ms. + ~130x regression, near-zero variance = deterministic CPU fallback. + DINOv2-base (Conv%<1%): fusion is neutral or slightly beneficial (-25%). + ORT FusedConv op produced by these passes is not dispatchable by QNN EP → falls back to CPU for every Conv node. +
Scope: Conv-dominant models (ResNet, EfficientNet, MobileNet). Not applicable to Transformer, NLP.
+
+
+
+
Autoconfig action
+
+ Hard-block conv-bn/add/act fusions for QNN NPU when Conv% > 20%. + Gate is enforced in catalog_qnn_sweep.py via count_conv_pct(). + Feature request: winml analyze should detect FusedConv ops pre-build. +
+
+
+ + +
+
npu-007
+
HIGH
confirmed
+
+
DVFS thermal noise makes CV-based stability gating unreliable on QNN NPU
+
+ Across all 8 catalog models, within-session CV ranges 0.1–2.0+ even on warm device. + CV gate (e.g., <15%) blocks most valid candidates — the noise is DVFS, not model instability. + Reliable signal requires: 3+ independent sessions × 500+ iters with 30s cool-down between sessions. + Use median p50 across sessions. Differences < 10% are within noise floor. +
Scope: General — all models on QNN NPU / Snapdragon X Elite HTP.
+
+
+
+
Autoconfig action
+
+ CV gate DISABLED for QNN NPU in bench_utils.py. + SCREEN_CV_MAX_NPU = 999.0. + Always run 3×500 Phase B regardless of screen CV. + Feature request: winml perf --sessions 3 --cool-down 30s (#155). +
+
+
+ + +
+
npu-001
+
MEDIUM
empirical
+
+
opset 21 export gives +24–31% speedup on DINOv2 family — mechanism UNKNOWN
+
+ DINOv2-small: opset17 7.2ms → opset21 5.0ms (+30.6%) (3-session, ranges separated). + DINOv2-base: opset17 34.6ms → opset21 26.2ms (+24.1%) (fresh builds, clean protocol). + MobileViT-small: ~20–26% (DVFS spike in one session — partially reliable). + Critical controls: dino-vitb16 -0.7% NEUTRAL; gender-classification ViT +3.5% NEUTRAL (SAME op counts as DINOv2-small). Opset21 speedup is DINOv2-architecture-specific, NOT a general ViT property. + Original kMaxSupportedOpset bypass mechanism INVALIDATED (ORT 1.24.5 has kMaxSupportedOpset≥22). + Mechanism unknown. Transpose count identical (49 nodes both opsets). +48 Reshape nodes in opset21 may be relevant. +
Scope: DINOv2-family (facebook/dinov2-*). MobileViT likely. NOT plain ViT, NOT NLP (12 models tested).
+
+
+
+
Autoconfig action
+
+ For DINOv2-family: try opset 21 first in priority queue. + For plain ViT / NLP: skip (confirmed neutral). + Do NOT apply blindly without architecture check. + Tracked: #869 (closed). +
+
+
+ + +
+
npu-002
+
MEDIUM
1 model
+
+
W8A16 quantization gives ~1.9x speedup over FP32 on QNN NPU
+
+ ConvNext FP32: 19.4ms → W8A16: 10.3ms (1.9x speedup). 1 model only. + Mechanism confirmed: QNN HTP has native INT8 weight / FP16 activation datapath; W8A16 maps to weight-compressed matmul kernels. + Magnitude (1.9x) is ConvNext-specific. All catalog sweep models ran W8A16 but no FP32 baselines for those. +
Scope: ConvNext only for magnitude. Mechanism generalizes; magnitude does not.
+
+
+
+
Autoconfig action
+
+ Always quantize for QNN NPU. + W8A16 is the starting point. + Validate accuracy after. +
+
+
+ + +
+
npu-003
+
MEDIUM
1 model
+
+
winml compile (EPContext) adds ~1.7x speedup on top of W8A16
+
+ ConvNext W8A16: 10.3ms → compiled EPContext: 6.0ms (1.7x). 1 model only. + Mechanism confirmed: EPContext pre-builds QNN binary graph, eliminates JIT graph partitioning at session creation. +
Scope: ConvNext only for magnitude. Mechanism generalizes to all models on QNN NPU.
+
+
+
+
Autoconfig action
+
+ Always run winml compile after finding best quantized config for QNN NPU. +
+
+
+ + +
+
npu-004
+
LOW
anecdote
+
+
W8A8 may cause accuracy collapse on models with LN+GELU (UNVALIDATED)
+
+ Experiment aborted early — no accuracy numbers preserved. Recalled anecdote only. + Do NOT use this to skip W8A8 without running eval first. + If W8A8 top-1 drops >15 points vs W8A16, then skip. +
Scope: UNVALIDATED. Anecdote from ConvNext-tiny-224 only.
+
+
+
+
Autoconfig action
+
+ Treat as anecdotal. + Run W8A8 eval before deciding. + Only skip after confirmed accuracy gate failure. +
+
+
+ +
+
+ + +
+
+ CPU EP  —  Oryon CPU (Snapdragon X Elite) +  1 model only (facebook/convnext-tiny-224), 3×1000-iter sessions +
+
+ + +
+
cpu-001
+
HIGH
empirical
+
+
opset 19+ causes 3–4x slowdown on CPU EP — mechanism uncertain
+
+ opset 17: 43.7ms. opset 19: 160ms (3.7x). opset 21: 170ms (3.9x). + Non-monotonic: opset 22 partially recovers to 85ms (1.9x slower than 17). + Hypothesis: Transpose Optimizer bypass via kMaxSupportedOpset gate — but non-monotonic recovery at opset 22 is inconsistent with a simple gate. + Mechanism uncertain; empirical data is solid across opsets. +
Scope: ConvNext on Oryon CPU, ORT 1.24.x. Models with few Transpose nodes (BERT) likely unaffected.
+
+
+
+
Autoconfig action
+
+ Default to opset 17 for CPU EP. + Do NOT try opset 19+. + Practical rule is solid regardless of mechanism. +
+
+
+ + +
+
cpu-002
+
HIGH
confirmed
+
+
matmul_add_fusion is a confirmed regression on ConvNext CPU (+87%)
+
+ matmul_add_fusion: p50=81.7ms vs baseline 43.7ms (+87%). All 3 runs far above baseline. + ORT L2 already converts MatMul+Add → Gemm at baseline (37 Gemm nodes). + Applying matmul_add_fusion on top conflicts with existing Gemm nodes. +
Scope: Models where ORT L2 baseline already has Gemm. Check model.onnx before applying.
+
+
+
+
Autoconfig action
+
+ Skip matmul_add_fusion for CPU EP when model.onnx already contains Gemm nodes. + Check baseline model for Gemm count before adding to search space. +
+
+
+ + +
+
cpu-005
+
HIGH
confirmed
+
+
Baseline (no extra flags) is optimal for ConvNext CPU — optimization pass sweep is wasted
+
+ 22-experiment ablation: no flag improved p50 beyond noise. Baseline at 43.7ms is the floor. + ORT L2 already applies gelu_fusion and MatMul→Gemm. + Compute bottleneck (Gemm=57.7%) not addressable via graph passes. +
Scope: ConvNext-class vision models on CPU. BERT/Transformer models may benefit from attention_fusion.
+
+
+
+
Autoconfig action
+
+ For CPU EP + ConvNext-class: skip optimization pass sweep. + Go directly to quantization experiments (W8A8 first). +
+
+
+ + +
+
cpu-006
+
HIGH
empirical
+
+
CPU EP and QNN NPU respond OPPOSITELY to opset changes — EP isolation is mandatory
+
+ CPU opset 17 vs 21: 3.9x SLOWER at opset 21. + QNN NPU opset 17 vs 21 (DINOv2): 24% FASTER at opset 21. + (Note: different models used — directional comparison only.) + Same opset change, opposite effect. CPU and QNN NPU have independent optimizer paths. +
Scope: Meta-rule about EP isolation. Applies to all models.
+
+
+
+
Autoconfig action
+
+ NEVER transfer opset findings across EPs. + Always validate per EP independently. + CPU search space rules are separate from NPU rules. +
+
+
+ +
+
+ + +
+
+ DML EP  —  Adreno X1-85 via Direct3D 12 +  1 model only (facebook/convnext-tiny-224) +
+
+ + +
+
dml-001
+
MEDIUM
stability
+
+
DML is more stable than QNN GPU — p50 difference is within noise
+
+ DML FP32: p50=16.9ms, std=0.52. QNN GPU FP32: p50=17.7ms, std=0.97. + p50 diff = 0.8ms = 0.82σ of QNN GPU — distributions OVERLAP. Not a separable p50 advantage. + DML IS meaningfully more stable: std 0.52 vs 0.97, CV 3% vs 5.5%. +
Scope: Adreno X1-85, ConvNext. 3-run comparison (insufficient for definitive p50 ranking).
+
+
+
+
Autoconfig action
+
+ Prefer DML over QNN GPU for lower tail latency (p90) and variance. + Do NOT claim DML is faster based on p50 alone. +
+
+
+ + +
+
dml-002
+
MEDIUM
1 run
+
+
NHWC transformer increases latency variance on DML — p50 neutral, p90 +19%
+
+ DML NHWC: p50=16.5ms (-0.4ms, marginal), p90=21.0ms (+19%), std=1.89 (3.6x worse). + Baseline: p50=16.9ms, p90=17.7ms, std=0.52. + D3D12 handles tensor layouts internally via HLSL. Adding ORT NHWC Transposes adds dispatch overhead → scheduling jitter. +
Scope: Adreno X1-85 + DML, ConvNext. NVIDIA/Intel may differ (NHWC can help with CUDNN).
+
+
+
+
Autoconfig action
+
+ Do NOT apply nhwc-transformer for DML EP. + Tail latency stability matters for apps; p90 +19% is unacceptable. +
+
+
+ + +
+
dml-003
+
LOW
CLI gap
+
+
DML FP16 gives ~1.4x speedup with clean unimodal distribution — BLOCKED (#867)
+
+ DML FP16 (Python hack only): p50=11.8ms, p90=12.8ms, std=0.66 (clean unimodal). + vs FP32 baseline 16.9ms. 1.4x speedup. + DML HLSL shaders lock in FP16 paths at load time — no DVFS bimodal issues. + Cannot be reproduced with winml CLI today. Blocked on #867 (--precision fp16). +
Scope: Adreno X1-85 + DML. 1 experiment, Python workaround only.
+
+
+
+
Autoconfig action
+
+ Marked SKIPPED in search space until #867 ships. + FP16 is the primary DML optimization lever once available. +
+
+
+ + +
+
dml-004
+
HIGH
confirmed
+
+
winml analyze returns 0/0/0/251 (all Unknown) for DML EP — no rule data loaded
+
+ winml analyze --ep dml outputs: supported=0, partial=0, unsupported=0, unknown=251. + D3D12 supports all standard ONNX ops by design. + winml analyze has no DML-specific rule data file — cosmetic gap only. +
Scope: DML EP (all models). Cosmetic, not functional.
+
+
+
+
Autoconfig action
+
+ Do NOT use winml analyze to prune search space for DML. + Assume all ops supported. + Feature request: DML rule data (not yet filed, low priority). +
+
+
+ +
+
+ + +
+
+ QNN GPU EP  —  Adreno X1-85 via QNN SDK +  1 model only (facebook/convnext-tiny-224) +
+
+ + +
+
gpu-001
+
HIGH
confirmed
+
+
FP32 baseline is already optimal for ConvNext on QNN GPU — no optimization pass helps
+
+ 11-pass sweep on ConvNext QNN GPU: all returned 0% node reduction or worse latency. + 251/0/0/0 analyze output — all ops native, zero CPU fallback. + ConvNext uses Reshape→MatMul→Reshape pattern; MatMulAdd→Conv2D rewrites don’t match. +
Scope: ConvNext-class models (Reshape→MatMul→Reshape pattern). Transformer models may benefit.
+
+
+
+
Autoconfig action
+
+ Skip all graph optimization experiments for QNN GPU on ConvNext-class. + Use FP32 baseline directly. + FP16 is the only remaining lever (#867). +
+
+
+ + +
+
gpu-002
+
MEDIUM
consistent
+
+
NHWC transformer hurts QNN GPU on Adreno — ~10% worse p50, +21% p90
+
+ NHWC: p50=19.5ms (+10%), p90=23.8ms (+21%), std=3.43 (3.5x worse) vs baseline p50=17.7ms. + QNN GPU EP handles layout internally; forcing NHWC creates Reshape overhead without alignment benefit. +
Scope: Adreno X1-85 + QNN GPU. Non-Adreno GPUs may differ.
+
+
+
+
Autoconfig action
+
+ Do NOT apply nhwc-transformer for QNN GPU EP. +
+
+
+ + +
+
gpu-004
+
HIGH
confirmed
+
+
W8A8 QDQ hangs indefinitely on QNN GPU EP
+
+ Passing W8A8 QDQ-annotated ONNX to QNN GPU EP → infinite hang. + QNN SDK GPU EP does not support QDQ-quantized graphs. + winml build already protects via _patch_device() (sets quant=null for GPU). + Fast-fail enhancement filed: #868. +
Scope: QNN GPU EP (QNN SDK limitation). Not a winml bug.
+
+
+
+
Autoconfig action
+
+ Skip ALL quantization experiments for QNN GPU EP. + winml build protects automatically. + Tracked: #868. +
+
+
+ + +
+
gpu-005
+
HIGH
confirmed
+
+
gelu_fusion improves latency STABILITY on QNN GPU — not p50
+
+ Raw (unfused GELU, 287 nodes): p50=17.4ms, p90=29.2ms, std=5.90. + Autoconf (fused GELU, 251 nodes): p50=17.7ms, p90=19.7ms (-48%), std=0.97 (-6x). + 5 separate GPU kernel dispatches for unfused GELU create scheduling jitter. + Single Gelu kernel eliminates dispatch overhead → dramatically lower tail latency. +
Scope: Any model with GELU activations on QNN GPU.
+
+
+
+
Autoconfig action
+
+ Always apply gelu_fusion for QNN GPU (stability, not p50 benefit). + Do not expect p50 improvement. +
+
+
+ + +
+
gpu-003
+
LOW
1 run
+
+
winml compile regresses QNN GPU by ~34% — single experiment, low confidence
+
+ FP32 + compile: p50=23.7ms vs baseline 17.7ms (+34%). Single experiment only. + EPContext compile designed for NPU (HTP). On GPU EP it may bypass optimized shader path. + 34% gap is above DVFS noise level (CV~0.05 → noise ~1ms); direction probably real. +
Scope: QNN GPU EP. QNN NPU EP (compile always helps there).
+
+
+
+
Autoconfig action
+
+ Avoid winml compile for QNN GPU EP. + Re-validate if winml compile behavior changes. +
+
+
+ +
+
+ + + +
+
+ Feature Requests & CLI Gaps + — required to complete the autoconfig skill +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
FeatureIssuePriorityMotivationEP FindingStatus
winml perf multi-session bench protocol
+ --sessions N --cool-down S
#155P0npu-007: reliable QNN NPU measurement requires 3 independent sessions with 30s cool-down. Single-session p50 is meaningless due to DVFS. catalog_qnn_sweep.py works around via subprocess loops but CLI support is needed for production autoconfig.npu-007OPEN
winml analyze: detect FusedConv ops pre-build
+ Warn when Conv% > threshold before fusions applied
not filedP0npu-006: conv fusions on QNN NPU cause 130x regression by creating FusedConv ops that QNN EP can't dispatch. autoconfig already guards via Conv% counter, but a CLI-level pre-build lint would catch it without any Python code. Should be part of winml analyze --ep qnn.npu-006NEEDS ISSUE
winml build --precision fp16
+ --enable-fp16-conversion
#867P1dml-003: DML FP16 gives ~1.4x speedup with clean distribution. Currently only achievable via Python workaround. Same for QNN GPU (FP16 is the only remaining lever after all graph passes exhausted). Unblocks DML search space.dml-003, gpu-001OPEN
winml perf: report p90/p99 + std per session
+ Plus per-session breakdown for multi-session runs
#155P1dml-002/gpu-002: NHWC transformer hurts p90 (+19-21%) while p50 is neutral or marginally better. Without p90 in perf output, autoconfig cannot detect stability regressions. p50-only verdict policy misses these cases.dml-002, gpu-002OPEN
winml perf --profile
+ Per-op kernel time breakdown
#158P1Phase 1 (Insight) in autoconfig requires knowing which op type dominates latency (Gemm% vs Conv% vs Attention%) to prioritize hypotheses. Without this, all hypotheses run blindly. POC bridges via analyze_graph.py (static), but dynamic profiling is needed for accurate attribution.all EPsOPEN
winml build --report (agent-friendly JSON)
+ Structured output for each pipeline step
#443P2autoconfig parses winml build stdout to detect failures. Machine-readable JSON output would eliminate fragile string parsing and let autoconfig recover from partial failures precisely.all EPsOPEN
winml analyze: DML EP rule data
+ Currently returns 0/0/0/251 (all Unknown)
not filedP3dml-004: winml analyze has no DML-specific rules. D3D12 supports all ops by design, so autoconfig treats this as "assume all supported" and skips analysis for DML. Low impact for now but should be fixed for completeness.dml-004NEEDS ISSUE (low priority)
+
+ + +
+ How to read confidence levels: + HIGH confirmed = mechanism understood + data from ≥3 independent sessions with non-overlapping ranges. +   + MEDIUM empirical = data is reliable but mechanism unconfirmed or from 1 model only. +   + LOW = single experiment, anecdote, or CLI gap blocking proper validation. +
+ All findings from Snapdragon X Elite CRD (Oryon CPU + Adreno X1-85 GPU + Hexagon HTP NPU). + ORT 1.24.5 (onnxruntime-windowsml). Findings may not generalize to x86 hardware or older ORT versions. +
+ + + From ef0b48bbac8493de0d4595697518fd4d01adfe9c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jun 2026 16:34:42 +0800 Subject: [PATCH 17/38] research(autoconfig): hide single-model findings by default in ep-findings-summary.html 11 findings (npu-002/003/004, cpu-001/002/005, dml-001/002/003, gpu-001/002/003/005) are hidden by default because they derive from only 1 model (convnext-tiny-224). 6 multi-model / universal findings remain visible: npu-001 (14 models), npu-006 (4 models), npu-007 (8 models), cpu-006 (meta EP-isolation rule), dml-004 (all DML models), gpu-004 (QNN SDK limitation). A toggle button lets readers expand hidden findings on demand. sm-divider rows summarize how many are hidden per EP section. --- .../autoconfig/docs/ep-findings-summary.html | 87 +++++++++++++++---- 1 file changed, 69 insertions(+), 18 deletions(-) diff --git a/research/autoconfig/docs/ep-findings-summary.html b/research/autoconfig/docs/ep-findings-summary.html index 5fd864164..a32499847 100644 --- a/research/autoconfig/docs/ep-findings-summary.html +++ b/research/autoconfig/docs/ep-findings-summary.html @@ -34,8 +34,37 @@ .gpu .ep-header { background: #fff3e0; color: #e65100; border: 1.5px solid #ffcc80; border-bottom: none; } .gpu .ep-body { border: 1.5px solid #ffcc80; border-top: none; } - /* ── Finding rows ── */ - .finding { + /* ── Single-model hiding ── */ + .single-model { display: none; } + body.show-single .single-model { display: grid; } + .single-model .find-id { color: #c5cae9; } + + .sm-divider { + display: flex; align-items: center; gap: 10px; + padding: 6px 16px; + background: #fafafa; border-top: 1px solid #eef; + font-size: 10px; color: #aab; + } + .sm-divider .sm-count { + background: #f0f2f5; color: #778; border-radius: 8px; + padding: 1px 7px; font-weight: 700; + } + /* hide the divider when already showing single-model rows */ + body.show-single .sm-divider { display: none; } + /* also hide divider when there are no hidden rows (EP has none) */ + + /* toggle button */ + .toggle-bar { + display: flex; align-items: center; gap: 12px; + max-width: 900px; margin: 0 0 20px 0; + } + .toggle-btn { + padding: 6px 14px; border-radius: 20px; border: 1.5px solid #9fa8da; + background: #e8eaf6; color: #3949ab; font-size: 11px; font-weight: 700; + cursor: pointer; transition: background 0.15s; + } + .toggle-btn:hover { background: #c5cae9; } + .toggle-note { font-size: 11px; color: #aab; } display: grid; grid-template-columns: 28px 70px 1fr auto; gap: 0; @@ -157,9 +186,8 @@

WinML EP Findings — Validated Catalog

17
total findings
-
9
high confidence
-
6
medium confidence
-
2
low / single-run
+
6
visible (multi-model)
+
11
hidden (single-model)
14
models tested (NPU)
7
feature requests
@@ -171,6 +199,13 @@

WinML EP Findings — Validated Catalog

Confidence levels reflect mechanism certainty, not universal applicability.
+
+ + Showing only multi-model / universal findings by default. Single-model findings (1 model tested) are hidden. +
+
@@ -257,7 +292,11 @@

WinML EP Findings — Validated Catalog

-
+
+ 3 single-model findings hidden (npu-002/003/004 — ConvNext only) + Click “Show single-model findings” above to expand +
+
npu-002
MEDIUM
1 model
@@ -280,7 +319,7 @@

WinML EP Findings — Validated Catalog

-
+
npu-003
MEDIUM
1 model
@@ -300,7 +339,7 @@

WinML EP Findings — Validated Catalog

-
+
npu-004
LOW
anecdote
@@ -334,7 +373,11 @@

WinML EP Findings — Validated Catalog

-
+
+ 3 single-model findings hidden (cpu-001/002/005 — ConvNext only) + Click “Show single-model findings” above to expand +
+
cpu-001
HIGH
empirical
@@ -358,7 +401,7 @@

WinML EP Findings — Validated Catalog

-
+
cpu-002
HIGH
confirmed
@@ -380,7 +423,7 @@

WinML EP Findings — Validated Catalog

-
+
cpu-005
HIGH
confirmed
@@ -437,7 +480,11 @@

WinML EP Findings — Validated Catalog

-
+
+ 3 single-model findings hidden (dml-001/002/003 — ConvNext only) + Click “Show single-model findings” above to expand +
+
dml-001
MEDIUM
stability
@@ -459,7 +506,7 @@

WinML EP Findings — Validated Catalog

-
+
dml-002
MEDIUM
1 run
@@ -481,7 +528,7 @@

WinML EP Findings — Validated Catalog

-
+
dml-003
LOW
CLI gap
@@ -538,7 +585,11 @@

WinML EP Findings — Validated Catalog

-
+
+ 4 single-model findings hidden (gpu-001/002/003/005 — ConvNext only) + Click “Show single-model findings” above to expand +
+
gpu-001
HIGH
confirmed
@@ -561,7 +612,7 @@

WinML EP Findings — Validated Catalog

-
+
gpu-002
MEDIUM
consistent
@@ -605,7 +656,7 @@

WinML EP Findings — Validated Catalog

-
+
gpu-005
HIGH
confirmed
@@ -628,7 +679,7 @@

WinML EP Findings — Validated Catalog

-
+
gpu-003
LOW
1 run
From da6b29c4e40bfe60d6f4eb8d985b6d72c5670ede Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jun 2026 16:36:35 +0800 Subject: [PATCH 18/38] research(autoconfig): fix missing .finding CSS selector in ep-findings-summary.html The .finding { grid rule lost its selector, breaking the 4-column layout for every finding row. Restored selector and fixed grid-template-columns to explicit 28px 70px 1fr 220px (was auto, caused action column collapse). --- research/autoconfig/docs/ep-findings-summary.html | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/research/autoconfig/docs/ep-findings-summary.html b/research/autoconfig/docs/ep-findings-summary.html index a32499847..6067c4b56 100644 --- a/research/autoconfig/docs/ep-findings-summary.html +++ b/research/autoconfig/docs/ep-findings-summary.html @@ -65,8 +65,11 @@ } .toggle-btn:hover { background: #c5cae9; } .toggle-note { font-size: 11px; color: #aab; } + + /* ── Finding row ── */ + .finding { display: grid; - grid-template-columns: 28px 70px 1fr auto; + grid-template-columns: 28px 70px 1fr 220px; gap: 0; background: #fff; border-bottom: 1px solid #eef; @@ -97,7 +100,7 @@ } .find-detail .data { color: #2e7d32; font-weight: 600; } .find-detail .warn { color: #c62828; font-weight: 600; } - .find-action { padding: 10px 14px; min-width: 180px; max-width: 220px; border-left: 1px solid #eef; } + .find-action { padding: 10px 14px; min-width: 180px; border-left: 1px solid #eef; } .find-action .action-label { font-size: 9px; text-transform: uppercase; letter-spacing: 0.8px; color: #aab; font-weight: 700; margin-bottom: 4px; From 2c387214925981a49770be8afe168157c1812fd0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jun 2026 16:46:10 +0800 Subject: [PATCH 19/38] research(autoconfig): rename --report to --json, drop dml-004 FR row --- .../autoconfig/docs/ep-findings-summary.html | 39 ++----------------- 1 file changed, 3 insertions(+), 36 deletions(-) diff --git a/research/autoconfig/docs/ep-findings-summary.html b/research/autoconfig/docs/ep-findings-summary.html index 6067c4b56..77076680c 100644 --- a/research/autoconfig/docs/ep-findings-summary.html +++ b/research/autoconfig/docs/ep-findings-summary.html @@ -553,29 +553,6 @@

WinML EP Findings — Validated Catalog

- -
-
dml-004
-
HIGH
confirmed
-
-
winml analyze returns 0/0/0/251 (all Unknown) for DML EP — no rule data loaded
-
- winml analyze --ep dml outputs: supported=0, partial=0, unsupported=0, unknown=251. - D3D12 supports all standard ONNX ops by design. - winml analyze has no DML-specific rule data file — cosmetic gap only. -
Scope: DML EP (all models). Cosmetic, not functional.
-
-
-
-
Autoconfig action
-
- Do NOT use winml analyze to prune search space for DML. - Assume all ops supported. - Feature request: DML rule data (not yet filed, low priority). -
-
-
-
@@ -778,25 +755,15 @@

WinML EP Findings — Validated Catalog

- winml build --report (agent-friendly JSON)
- Structured output for each pipeline step + winml build --json (agent-friendly structured output)
+ Per-step status, timing, output paths, exit code #443 P2 - autoconfig parses winml build stdout to detect failures. Machine-readable JSON output would eliminate fragile string parsing and let autoconfig recover from partial failures precisely. + autoconfig parses winml build stdout to detect failures — fragile string parsing. A --json flag should emit a structured JSON to stdout (or a sidecar file) with per-step status (success/fail/skip), elapsed time, output artifact paths, and a top-level exit code. Would let autoconfig detect partial failures precisely and resume from the failed step. all EPs OPEN - - winml analyze: DML EP rule data
- Currently returns 0/0/0/251 (all Unknown) - not filed - P3 - dml-004: winml analyze has no DML-specific rules. D3D12 supports all ops by design, so autoconfig treats this as "assume all supported" and skips analysis for DML. Low impact for now but should be fixed for completeness. - dml-004 - NEEDS ISSUE (low priority) - -
From 5c5684a4ede70f9faded192c2878031fe91b4f4c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jun 2026 16:50:51 +0800 Subject: [PATCH 20/38] =?UTF-8?q?research(autoconfig):=20trim=20autoconfig?= =?UTF-8?q?=5Fdiagram.html=20=E2=80=94=20shorter=20bullets,=20condensed=20?= =?UTF-8?q?footnote?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- research/autoconfig/autoconfig_diagram.html | 192 +++++++------------- 1 file changed, 64 insertions(+), 128 deletions(-) diff --git a/research/autoconfig/autoconfig_diagram.html b/research/autoconfig/autoconfig_diagram.html index 4769e0669..b40d6960a 100644 --- a/research/autoconfig/autoconfig_diagram.html +++ b/research/autoconfig/autoconfig_diagram.html @@ -166,12 +166,12 @@

autoconfig — Skill Architecture

👤
- User input
- Model ID  +  Target EP/device  +  Objective: + User input  —  + Model ID  +  Target EP  +  Objective: accuracy-primary latency-primary Pareto -  + optional constraints (latency budget, accuracy floor) +  + optional budget / accuracy floor
@@ -185,18 +185,16 @@

autoconfig — Skill Architecture

Inspect
  • winml inspect
  • -
  • Validate model supported
  • -
  • Check EP availability
  • -
  • Load session.json (crash-resume)
  • +
  • EP availability check
  • +
  • Load session.json (crash-resume)
Baseline Build
    -
  • winml build (default config)
  • -
  • opset=17, no quant
  • -
  • Output: baseline/model.onnx
  • +
  • winml build (opset17, no quant)
  • +
  • Record baseline p50
@@ -204,8 +202,7 @@

autoconfig — Skill Architecture

Correctness Contract
  • winml eval --mode compare
  • -
  • Lock cosine = 1.000
  • -
  • Record baseline p50 in session.json
  • +
  • Lock cosine similarity = 1.000
@@ -219,40 +216,29 @@

autoconfig — Skill Architecture

-
Runtime Profile
    -
  • winml perf --profile
  • -
  • ORT per-op kernel time
  • -
  • Bottleneck op type + %
  • -
  • Canonical vs decomposed ops
  • -
  • Layout ops (Reorder) activity
  • +
  • winml perf --profile (pending #158)
  • +
  • Per-op kernel time, bottleneck %
-
Static Analyzer
  • winml analyze --ep <ep>
  • -
  • Partial-support ops list
  • -
  • EP fallback candidates
  • Conv% → npu-006 risk flag
  • -
  • FusedConv detection (pending)
  • +
  • Partial-support op list
-
Graph Analysis
    -
  • ONNX proto inspection
  • -
  • Op counts per type
  • -
  • Conv% gate for npu-006
  • +
  • Op counts by type
  • Fusion opportunities
  • -
  • Static shape vs dynamic axes
  • +
  • Static vs dynamic axes
-
@@ -263,23 +249,20 @@

autoconfig — Skill Architecture

-
skip_set (passes to prune)
+
skip_set (hard blocks)
    -
  • EP=QNN NPU + Conv%>20% → block conv-bn/add/act fusions (npu-006)
  • -
  • EP=CPU + opset>17 → skip high opsets (cpu-001)
  • -
  • EP=QNN GPU or DML → skip nhwc-transformer (gpu-002/dml-002)
  • -
  • EP=QNN GPU → skip all quantization (gpu-004 hang)
  • -
  • Model already has Gemm → skip matmul-add-fusion (cpu-002)
  • -
  • Partial-op list empty → skip nodes_to_exclude trials
  • +
  • QNN NPU + Conv%>20% → no conv fusions
  • +
  • CPU + opset>17 → skip high opsets
  • +
  • QNN GPU / DML → skip nhwc-transformer
  • +
  • QNN GPU → skip all quantization
-
priority_queue (ranked hypotheses)
+
priority_queue (ranked hints)
    -
  • EP=QNN NPU + DINOv2-family → opset 21 first (npu-001, +24-31%)
  • -
  • Gemm>50% → quant precision, calib method first
  • -
  • Conv%>20%, Conv-only model → nchwc trial
  • -
  • Decomposed GELU → gelu-fusion trial (gpu-005: stability+)
  • +
  • QNN NPU + DINOv2-family → opset21 first (+24-31%)
  • +
  • Gemm>50% → quant / calib method first
  • +
  • Conv%>20%, Conv-only → nchwc trial
  • Dynamic axes → static shape export
@@ -298,90 +281,74 @@

autoconfig — Skill Architecture

-
Explorer
    -
  • Skip iters already in session.json (crash-resume) NEW
  • +
  • Skip completed iters from session.json NEW
  • Pop next hypothesis from priority_queue
  • -
  • Check KB ep_knowledge/<ep>.json confirmed rules → skip if pruned
  • -
  • Build config.json delta (opset / quant / fusions)
  • -
  • External research trigger: 5 DISCARDs same dimension → read ORT/QNN source
  • +
  • Check KB rules → skip if pruned
  • +
  • Build config.json delta
-
Optimizer
  • winml build -c config.json
  • -
  • Phase A — screen (200 iters, CV = std/p50):
  • -
  • CPU/GPU: CV>10% → UNSTABLE, skip full bench
  • -
  • QNN NPU: CV gate DISABLED (npu-007 DVFS normal)
  • -
  • Screen early exit NEW: if baseline known and screen Δp50 < 1% → DISCARD (early) without full bench
  • +
  • Phase A — screen (200 iters): CV gate for CPU/GPU; disabled for QNN NPU (DVFS)
  • +
  • Early exit NEW: screen Δ < 1% → DISCARD, skip full bench
  • Phase B — full bench (3 × 1000 iters, 60s cool-down)
  • -
  • winml eval --samples 100 → accuracy gate
  • +
  • winml eval → accuracy gate
-
-
Reviewer — ThroughputOnly Verdict Policy NEW
+
Reviewer — ThroughputOnly NEW
    -
  • Compute: threshold = max(1% floor, 2.0 × screen_CV)
  • -
  • improvement = (baseline − new_p50) / baseline × 100%
  • +
  • threshold = max(1%, 2.0 × CV)
- KEEP > 1.5×threshold - MARGINAL KEEP 1x–1.5x - DISCARD < threshold - EARLY DISCARD (screen) - ACC_FAIL / BUILD_FAIL + KEEP >1.5×thr + MARGINAL 1×–1.5× + DISCARD + EARLY DISCARD + ACC/BUILD FAIL
-
    -
  • On KEEP: promote baseline_p50 → cumulative search enabled
  • -
  • Write KB draft entry if new mechanism observed (status="draft")
  • -
-
-
Crash-Resume: save session.json NEW
+
Crash-Resume NEW
    -
  • Atomically written after every experiment (temp-file + rename)
  • -
  • Stores: completed_iters, baseline_p50, best_p50/label, consecutive_discards
  • -
  • On next startup: skip completed iters, restore state
  • +
  • Atomic write after every experiment
  • +
  • Stores: completed iters, baseline/best p50, discard counters
-
↩ loop back to Explorer (until stop condition)
+
↩ loop back to Explorer
-
Stop conditions
    -
  • Objective achieved
  • -
  • 30 consecutive DISCARDs
  • -
  • priority_queue empty
  • -
  • User manually stops
  • +
  • Objective met
  • +
  • 30 consecutive DISCARDs
  • +
  • Queue empty
  • +
  • User stops
results.tsv
- Every experiment:
config · screen_p50 · median_p50
- CV · delta_pct · status · dim + CV · delta_pct · status
session.json
- Crash-resume state:
completed_iters
baseline/best p50
discard counters @@ -389,8 +356,7 @@

autoconfig — Skill Architecture

ep_knowledge/
New entries as
- status="draft"
- Confirmed after Gate 2 + status="draft"
@@ -407,73 +373,43 @@

autoconfig — Skill Architecture

Champion Config
- Best config with provenance metadata + Best config + provenance config_<ep>_optimal.json
HTML Report
- Benchmark chart + experiment table + profile section + Chart + experiment table report.html
-
Per-Experiment Artifacts
- hypothesis / impl / parity / perf / analysis / decision - experiments/<n>/experiment.md +
Experiment Artifacts
+ Per-hypothesis logs + experiments/<n>/
KB Draft Entry
- New findings (status="draft") — promoted after Gate 2 + New findings, promoted after Gate 2 ep_knowledge/<ep>.json
-
-
Session State
- Crash-resume artifact (auto-deleted on success) - session.json -
- - v3 changes (2026-06-17, AgenticGPUOptimizer V2 patterns): - 1) ThroughputOnly verdict policy — - improvement must exceed max(1% floor, 2×screen_CV). - Noise-level deltas that previously slipped through as KEEP (delta=1.2%, CV=1%) are now correctly DISCARD. - Marginal cases flagged as MARGINAL_KEEP for manual review. -   - 2) Screen early exit — - if screen p50 improvement < 1% vs baseline, skip 3×full-bench immediately. - Saves ~25–90 min per rejected hypothesis. -   - 3) Crash-resume via session.json — - every experiment atomically committed. On restart, completed iters skipped. No data lost. - -

- - Bench protocol: - Phase A: 200-iter screen — CV gate for CPU/GPU only (10% threshold); disabled for QNN NPU (npu-007 DVFS normal). - Phase B: 3×1000-iter with 60s cool-down. - KEEP only if improvement > statistical threshold. - -
- - Key empirical constraints loaded from ep_knowledge/: - npu-006 (conv fusions → 4900% regression on ResNet, Conv%>20% gate); - npu-007 (DVFS CV gate disabled on NPU); - cpu-001 (opset 17 only on CPU); - gpu-004 (quantization hangs on QNN GPU); - cpu-002 (matmul-add-fusion regresses if Gemm already present). - -
- - Pending features: - winml perf --profile (#158); - --precision fp16 (#867); - FusedConv detection in winml analyze (not yet filed); - multi-session bench protocol in winml perf (#155). - + v3 · 2026-06-17: + ThroughputOnly verdict policy (threshold = max(1%, 2×CV)); + screen early exit (Δ<1% skips full bench, saves ~25–90 min); + crash-resume via atomic session.json. +  ·  + Key constraints: + npu-006 (Conv%>20% → block conv fusions); + npu-007 (CV gate off on NPU); + cpu-001 (opset17 on CPU); + gpu-004 (no quant on QNN GPU). +  ·  + Pending: + --profile (#158) · --precision fp16 (#867) · FusedConv detection · multi-session bench (#155).
From e18e066df7c2eecc209973e7a56314912584b23c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jun 2026 16:55:41 +0800 Subject: [PATCH 21/38] research(autoconfig): add Pending Features badge to Phase 3 in diagram --- research/autoconfig/autoconfig_diagram.html | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/research/autoconfig/autoconfig_diagram.html b/research/autoconfig/autoconfig_diagram.html index b40d6960a..58cd33da6 100644 --- a/research/autoconfig/autoconfig_diagram.html +++ b/research/autoconfig/autoconfig_diagram.html @@ -391,6 +391,14 @@

autoconfig — Skill Architecture

New findings, promoted after Gate 2 ep_knowledge/<ep>.json
+
+
Pending Features
+ #155 multi-session bench
+ #158 perf --profile
+ #443 build --json
+ #867 --precision fp16
+ ? FusedConv detection +
@@ -407,9 +415,6 @@

autoconfig — Skill Architecture

npu-007 (CV gate off on NPU); cpu-001 (opset17 on CPU); gpu-004 (no quant on QNN GPU). -  ·  - Pending: - --profile (#158) · --precision fp16 (#867) · FusedConv detection · multi-session bench (#155).
From 526fcd080c79b1f430c1654ffb50b3cdde1d7af9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jun 2026 17:01:58 +0800 Subject: [PATCH 22/38] research(autoconfig): add local PyTorch reference FR; clarify correctness contract --- research/autoconfig/autoconfig_diagram.html | 7 +++++-- research/autoconfig/docs/ep-findings-summary.html | 10 ++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/research/autoconfig/autoconfig_diagram.html b/research/autoconfig/autoconfig_diagram.html index 58cd33da6..14f5a8a51 100644 --- a/research/autoconfig/autoconfig_diagram.html +++ b/research/autoconfig/autoconfig_diagram.html @@ -202,7 +202,9 @@

autoconfig — Skill Architecture

Correctness Contract
  • winml eval --mode compare
  • -
  • Lock cosine similarity = 1.000
  • +
  • Reference: original ONNX or HF PyTorch model
  • +
  • Lock cosine similarity = 1.000 vs reference
  • +
  • Note: local .pt/.pth reference not yet supported (#TBD)
@@ -397,7 +399,8 @@

autoconfig — Skill Architecture

#158 perf --profile
#443 build --json
#867 --precision fp16
- ? FusedConv detection + ? FusedConv detection
+ ? eval local .pt reference
diff --git a/research/autoconfig/docs/ep-findings-summary.html b/research/autoconfig/docs/ep-findings-summary.html index 77076680c..31101d51a 100644 --- a/research/autoconfig/docs/ep-findings-summary.html +++ b/research/autoconfig/docs/ep-findings-summary.html @@ -764,6 +764,16 @@

WinML EP Findings — Validated Catalog

OPEN + + winml eval --mode compare — support local PyTorch model as reference
+ Currently requires --model-id (HF hub only) + not filed + P2 + autoconfig correctness contract compares optimized ONNX vs a reference model. Currently winml eval --mode compare requires a HuggingFace model ID as the golden reference — local .pt / .pth files are not supported. For models not on HF Hub, or custom fine-tunes, there is no way to establish a cosine-similarity correctness gate. + all EPs + NEEDS ISSUE + +
From 9d5148efada4b89643ae8a3cd15ebf1d7abad2d4 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jun 2026 17:05:54 +0800 Subject: [PATCH 23/38] =?UTF-8?q?research(autoconfig):=20fix=20Phase=200?= =?UTF-8?q?=20layout=20=E2=80=94=20nowrap,=203=20equal-width=20boxes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- research/autoconfig/autoconfig_diagram.html | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/research/autoconfig/autoconfig_diagram.html b/research/autoconfig/autoconfig_diagram.html index 14f5a8a51..c16626dae 100644 --- a/research/autoconfig/autoconfig_diagram.html +++ b/research/autoconfig/autoconfig_diagram.html @@ -180,8 +180,8 @@

autoconfig — Skill Architecture

Phase 0 · Intake
-
-
+
+
Inspect
  • winml inspect
  • @@ -189,22 +189,21 @@

    autoconfig — Skill Architecture

  • Load session.json (crash-resume)
-
-
+
+
Baseline Build
  • winml build (opset17, no quant)
  • Record baseline p50
-
-
+
+
Correctness Contract
  • winml eval --mode compare
  • -
  • Reference: original ONNX or HF PyTorch model
  • -
  • Lock cosine similarity = 1.000 vs reference
  • -
  • Note: local .pt/.pth reference not yet supported (#TBD)
  • +
  • Reference: original ONNX or HF PyTorch
  • +
  • Lock cosine similarity = 1.000
From 408c647c0c7db2c8850b59b72c47f3b8b244d5e2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jun 2026 17:07:29 +0800 Subject: [PATCH 24/38] =?UTF-8?q?research(autoconfig):=20pending=20feature?= =?UTF-8?q?s=20badge=20=E2=80=94=20outcome-focused=20descriptions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- research/autoconfig/autoconfig_diagram.html | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/research/autoconfig/autoconfig_diagram.html b/research/autoconfig/autoconfig_diagram.html index c16626dae..77761b0eb 100644 --- a/research/autoconfig/autoconfig_diagram.html +++ b/research/autoconfig/autoconfig_diagram.html @@ -394,12 +394,12 @@

autoconfig — Skill Architecture

Pending Features
- #155 multi-session bench
- #158 perf --profile
- #443 build --json
- #867 --precision fp16
- ? FusedConv detection
- ? eval local .pt reference + #155 reliable NPU bench (multi-session)
+ #158 per-op bottleneck insight
+ #443 structured build status
+ #867 DML fp16 search space
+  ?  block npu-006 pre-build
+  ?  correctness gate for custom models
From d761741791ee7fa23163d38b6d133b189787a5f3 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jun 2026 17:10:30 +0800 Subject: [PATCH 25/38] research(autoconfig): Phase 3 -> Outcome; Feature Requirements badge with sample issue --- research/autoconfig/autoconfig_diagram.html | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/research/autoconfig/autoconfig_diagram.html b/research/autoconfig/autoconfig_diagram.html index 77761b0eb..d91fa1e44 100644 --- a/research/autoconfig/autoconfig_diagram.html +++ b/research/autoconfig/autoconfig_diagram.html @@ -369,7 +369,7 @@

autoconfig — Skill Architecture

-
Phase 3 · Report
+
Phase 3 · Outcome
@@ -392,14 +392,14 @@

autoconfig — Skill Architecture

New findings, promoted after Gate 2 ep_knowledge/<ep>.json
-
-
Pending Features
- #155 reliable NPU bench (multi-session)
- #158 per-op bottleneck insight
- #443 structured build status
- #867 DML fp16 search space
-  ?  block npu-006 pre-build
-  ?  correctness gate for custom models +
+
Feature Requirements
+
+ #155 + winml perf: multi-session bench + per-session breakdown
+ Needed for: reliable NPU latency under DVFS +
+
+5 more → ep-findings-summary.html
From e9f1cf4f81e5f5f84761b94fbaa454e01f1ab12e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jun 2026 17:12:06 +0800 Subject: [PATCH 26/38] research(autoconfig): align Feature Requirements badge style with other Phase 3 badges --- research/autoconfig/autoconfig_diagram.html | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/research/autoconfig/autoconfig_diagram.html b/research/autoconfig/autoconfig_diagram.html index d91fa1e44..3992c1d83 100644 --- a/research/autoconfig/autoconfig_diagram.html +++ b/research/autoconfig/autoconfig_diagram.html @@ -392,14 +392,11 @@

autoconfig — Skill Architecture

New findings, promoted after Gate 2 ep_knowledge/<ep>.json
-
-
Feature Requirements
-
- #155 - winml perf: multi-session bench + per-session breakdown
- Needed for: reliable NPU latency under DVFS -
-
+5 more → ep-findings-summary.html
+
+
Feature Requirements
+ Sample: #155 multi-session bench
+ reliable NPU latency under DVFS + +5 more → ep-findings-summary.html
From d242fddcbb7506a9a1a36239ea4378ac14879cfb Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jun 2026 17:13:23 +0800 Subject: [PATCH 27/38] research(autoconfig): simplify Feature Requirements badge to issue numbers only --- research/autoconfig/autoconfig_diagram.html | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/research/autoconfig/autoconfig_diagram.html b/research/autoconfig/autoconfig_diagram.html index 3992c1d83..0e0978251 100644 --- a/research/autoconfig/autoconfig_diagram.html +++ b/research/autoconfig/autoconfig_diagram.html @@ -394,9 +394,8 @@

autoconfig — Skill Architecture

Feature Requirements
- Sample: #155 multi-session bench
- reliable NPU latency under DVFS - +5 more → ep-findings-summary.html + #155 · #158 · #443 · #867 · +2 untracked + details → ep-findings-summary.html
From 0940ef5bbd36cba848abcdc88e47d31b2fb2ebaf Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jun 2026 17:14:33 +0800 Subject: [PATCH 28/38] =?UTF-8?q?research(autoconfig):=20feature=20require?= =?UTF-8?q?ments=20badge=20=E2=80=94=20issue=20numbers=20in=20code=20bar?= =?UTF-8?q?=20only?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- research/autoconfig/autoconfig_diagram.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/research/autoconfig/autoconfig_diagram.html b/research/autoconfig/autoconfig_diagram.html index 0e0978251..5fdba9845 100644 --- a/research/autoconfig/autoconfig_diagram.html +++ b/research/autoconfig/autoconfig_diagram.html @@ -394,8 +394,8 @@

autoconfig — Skill Architecture

Feature Requirements
- #155 · #158 · #443 · #867 · +2 untracked - details → ep-findings-summary.html + 6 issues identified + #155 · #158 · #443 · #867
From eeef0b9ac032b412a43be6e01a16e2a1fe90ac79 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jun 2026 17:15:38 +0800 Subject: [PATCH 29/38] =?UTF-8?q?research(autoconfig):=20feature=20require?= =?UTF-8?q?ments=20badge=20=E2=80=94=20show=20issue=20template=20format?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- research/autoconfig/autoconfig_diagram.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/research/autoconfig/autoconfig_diagram.html b/research/autoconfig/autoconfig_diagram.html index 5fdba9845..10b90cba2 100644 --- a/research/autoconfig/autoconfig_diagram.html +++ b/research/autoconfig/autoconfig_diagram.html @@ -394,8 +394,8 @@

autoconfig — Skill Architecture

Feature Requirements
- 6 issues identified - #155 · #158 · #443 · #867 + Issues filed per finding + #NNN · <feature gap title>
From 75bea5ae390b7041455b2b9037f794448a97e933 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jun 2026 17:18:20 +0800 Subject: [PATCH 30/38] research(autoconfig): simplify Insight Engine boxes to concept + example only --- research/autoconfig/autoconfig_diagram.html | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/research/autoconfig/autoconfig_diagram.html b/research/autoconfig/autoconfig_diagram.html index 10b90cba2..e31eff5d6 100644 --- a/research/autoconfig/autoconfig_diagram.html +++ b/research/autoconfig/autoconfig_diagram.html @@ -250,22 +250,12 @@

autoconfig — Skill Architecture

-
skip_set (hard blocks)
-
    -
  • QNN NPU + Conv%>20% → no conv fusions
  • -
  • CPU + opset>17 → skip high opsets
  • -
  • QNN GPU / DML → skip nhwc-transformer
  • -
  • QNN GPU → skip all quantization
  • -
+
skip_set
+
Hard blocks from KB rules
e.g. conv fusions on QNN NPU
-
priority_queue (ranked hints)
-
    -
  • QNN NPU + DINOv2-family → opset21 first (+24-31%)
  • -
  • Gemm>50% → quant / calib method first
  • -
  • Conv%>20%, Conv-only → nchwc trial
  • -
  • Dynamic axes → static shape export
  • -
+
priority_queue
+
Ranked hypotheses from profile
e.g. opset21 for DINOv2-family
From 2dd7f05f4abca5cb71443298fa524251b1f5b797 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jun 2026 17:39:49 +0800 Subject: [PATCH 31/38] research(autoconfig): implement Phase 1 Insight Engine + Phase 3 report + KB draft MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1 — analyze_insight.py (new module): - run_graph_analysis(): op counts, Conv%, GELU variant, dynamic axes from ONNX proto - run_winml_analyze(): calls winml analyze --ep -o json, parses partial/unsupported ops - build_insight(): fuses 3 signals (graph + analyze + KB) into skip_set + priority_boosts - skip_set: npu-006 Conv%>20% block, cpu-001 opset deprioritise, gpu-004 quant skip, dml-002/gpu-002 nhwc-transformer skip - priority_boosts: npu-001 DINOv2 heuristic (+10), GELU-decomposed (+3), high-Gemm% (+2) Phase 2 integration in autoconfig.py: - Calls build_insight() after KB load (graceful fallback if baseline ONNX not yet built) - Sorts HYPOTHESES by priority_boost (highest first) - Checks insight.skip_set before each iteration (in addition to KB skip_passes) Phase 3 — report_gen.py (new module): - generate_report(): reads results.tsv, writes report.html - Champion config box (best KEEP verdict) - Benchmark bar chart (CSS bars, colour-coded by status) - Full experiment table - Phase 1 Insight Engine notes section Phase 3 — KB draft auto-write in autoconfig.py: - write_kb_draft(): on KEEP verdict with improvement > 10%, appends status=draft entry to ep_knowledge/.json - Draft has mechanism_confirmed=false; human must Gate-2 validate before promoting --- research/autoconfig/analyze_insight.py | 368 +++++++++++++++++++++++++ research/autoconfig/autoconfig.py | 103 ++++++- research/autoconfig/report_gen.py | 280 +++++++++++++++++++ 3 files changed, 750 insertions(+), 1 deletion(-) create mode 100644 research/autoconfig/analyze_insight.py create mode 100644 research/autoconfig/report_gen.py diff --git a/research/autoconfig/analyze_insight.py b/research/autoconfig/analyze_insight.py new file mode 100644 index 000000000..99ccdf15f --- /dev/null +++ b/research/autoconfig/analyze_insight.py @@ -0,0 +1,368 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""analyze_insight.py — Phase 1 Insight Engine for autoconfig. + +Fuses three signals to build skip_set and priority_queue: + 1. Graph analysis : op counts, Conv%, GELU variant, dynamic axes + 2. winml analyze : partial/unsupported op list per EP (static rule data) + 3. ep_knowledge KB : confirmed empirical findings (skip_passes, priority hints) + +Outputs: + InsightResult.skip_set — set of hypothesis labels to prune + InsightResult.priority_boosts — {hypothesis_label: boost_score} for reordering + InsightResult.notes — human-readable explanation of each decision +""" + +from __future__ import annotations + +import json +import re +import tempfile +from collections import Counter +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +# Optional heavy imports — gracefully degrade if not available +try: + import onnx # type: ignore[import-untyped] + + _ONNX_OK = True +except ImportError: + _ONNX_OK = False + +from bench_utils import run_cmd + + +# ── data types ──────────────────────────────────────────────────────────────── + + +@dataclass +class GraphInfo: + total_ops: int = 0 + op_counts: dict[str, int] = field(default_factory=dict) + conv_pct: float = 0.0 # Conv / total_ops (0-100) + gemm_pct: float = 0.0 # Gemm / total_ops + has_gelu_decomposed: bool = False # Erf-based GELU sub-pattern + has_dynamic_axes: bool = False + transpose_count: int = 0 + available: bool = False # False when onnx not installed or model not found + + +@dataclass +class AnalyzeResult: + supported: list[str] = field(default_factory=list) + partial: list[str] = field(default_factory=list) + unsupported: list[str] = field(default_factory=list) + unknown: list[str] = field(default_factory=list) + available: bool = False # False when winml analyze failed or ep has no rule data + + +@dataclass +class InsightResult: + skip_set: set[str] = field(default_factory=set) + """Labels from HYPOTHESES that should be pruned before the search loop.""" + + priority_boosts: dict[str, float] = field(default_factory=dict) + """hypothesis_label -> boost (positive = higher priority, negative = deprioritise).""" + + notes: list[str] = field(default_factory=list) + """Human-readable explanation for each decision.""" + + graph_info: GraphInfo = field(default_factory=GraphInfo) + analyze_result: AnalyzeResult = field(default_factory=AnalyzeResult) + + +# ── graph analysis ──────────────────────────────────────────────────────────── + + +def run_graph_analysis(onnx_path: Path) -> GraphInfo: + """Analyse the ONNX proto and return structural statistics.""" + info = GraphInfo() + if not _ONNX_OK: + return info + if not onnx_path.exists(): + return info + + try: + model = onnx.load(str(onnx_path)) + g = model.graph + counts: Counter = Counter(n.op_type for n in g.node) + total = sum(counts.values()) + info.total_ops = total + info.op_counts = dict(counts) + info.available = True + + if total > 0: + info.conv_pct = counts.get("Conv", 0) / total * 100 + info.gemm_pct = counts.get("Gemm", 0) / total * 100 + info.transpose_count = counts.get("Transpose", 0) + + # Decomposed GELU: presence of Erf node with Div predecessor + out2node = {o: n for n in g.node for o in n.output} + for n in g.node: + if n.op_type == "Erf" and n.input: + pred = out2node.get(n.input[0]) + if pred and pred.op_type == "Div": + info.has_gelu_decomposed = True + break + + # Dynamic axes: any input with dim_param (string dimension) + for inp in g.input: + for dim in inp.type.tensor_type.shape.dim: + if dim.dim_param: + info.has_dynamic_axes = True + break + + except Exception as e: + info.available = False + print(f" [analyze_insight] graph analysis failed: {e}") + + return info + + +# ── winml analyze ───────────────────────────────────────────────────────────── + + +def run_winml_analyze(winml: str, onnx_path: Path, ep: str, device: str) -> AnalyzeResult: + """Call `winml analyze -m --ep ` and parse JSON output.""" + result = AnalyzeResult() + if not onnx_path.exists(): + return result + + with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as f: + out_path = Path(f.name) + + try: + rc, out, _ = run_cmd( + [ + winml, + "analyze", + "-m", + str(onnx_path), + "--ep", + ep, + "--device", + device, + "-o", + str(out_path), + ], + label=f"winml analyze --ep {ep}", + timeout=120, + ) + if rc not in (0, 1) or not out_path.exists(): + return result + + data = json.loads(out_path.read_text(encoding="utf-8")) + # Output is a list; take first entry (single-EP mode) + entry = data[0] if isinstance(data, list) and data else data + ep_results = entry.get("results", []) + if not ep_results: + return result + + ep_res = ep_results[0] + cls = ep_res.get("classification", {}) + + def _extract_op_types(lst: list[str]) -> list[str]: + """Turn 'OP/ai.onnx/Conv (QDQ)' into 'Conv'.""" + types = [] + for s in lst: + m = re.search(r"/([A-Za-z][A-Za-z0-9_]*)(?:\s|$|\()", s) + if m: + types.append(m.group(1)) + return list(dict.fromkeys(types)) # dedupe, preserve order + + result.supported = _extract_op_types(cls.get("supported", [])) + result.partial = _extract_op_types(cls.get("partial", [])) + result.unsupported = _extract_op_types(cls.get("unsupported", [])) + result.unknown = _extract_op_types(cls.get("unknown", [])) + # Consider results available only when there's actual rule data + result.available = bool(result.supported or result.partial or result.unsupported) + + except Exception as e: + print(f" [analyze_insight] winml analyze failed: {e}") + finally: + out_path.unlink(missing_ok=True) + + return result + + +# ── insight engine ──────────────────────────────────────────────────────────── + + +def build_insight( + onnx_path: Path, + winml: str, + ep: str, + device: str, + hypotheses: list[tuple[str, Any, str]], + kb: dict, +) -> InsightResult: + """Fuse graph + analyze + KB signals into skip_set and priority_boosts. + + Args: + onnx_path: Path to baseline ONNX (post-export, pre-optim). + winml: Path to winml executable. + ep: Execution provider string (e.g. "cpu", "qnn"). + device: Device string (e.g. "cpu", "npu"). + hypotheses: List of (label, patch_fn, dimension) from autoconfig.py. + kb: dict from load_ep_knowledge(ep). + + Returns: + InsightResult with skip_set, priority_boosts, notes. + """ + result = InsightResult() + notes = result.notes + + print("\n=== Phase 1: Insight Engine ===") + + # ── signal 1: graph analysis ─────────────────────────────── + print(" [1/3] Graph analysis…") + g = run_graph_analysis(onnx_path) + result.graph_info = g + if g.available: + top5 = sorted(g.op_counts.items(), key=lambda x: -x[1])[:5] + print( + f" total_ops={g.total_ops} conv_pct={g.conv_pct:.1f}% " + f"gemm_pct={g.gemm_pct:.1f}% transpose={g.transpose_count}" + ) + print(f" top ops: {dict(top5)}") + else: + print(" [skip] onnx not available or model not found") + + # ── signal 2: winml analyze ──────────────────────────────── + print(f" [2/3] winml analyze --ep {ep}…") + ar = run_winml_analyze(winml, onnx_path, ep, device) + result.analyze_result = ar + if ar.available: + print( + f" supported={len(ar.supported)} partial={len(ar.partial)} " + f"unsupported={len(ar.unsupported)} unknown={len(ar.unknown)}" + ) + if ar.partial: + print(f" partial ops: {ar.partial[:5]}") + if ar.unsupported: + print(f" unsupported ops: {ar.unsupported[:5]}") + else: + print(" [skip] no rule data for this EP or analyze failed") + + # ── signal 3: KB confirmed rules ─────────────────────────── + print(" [3/3] Applying KB confirmed rules…") + + # ── build skip_set ───────────────────────────────────────── + + # KB-derived skips (already applied per confirmed finding) + for note in kb.get("notes", []): + if "[KB confirmed] Skip pass:" in note: + pass_name = note.split("Skip pass:")[-1].strip() + # Match against hypothesis labels that use this pass + for label, _, _ in hypotheses: + if pass_name.replace("_", "-") in label or pass_name in label: + result.skip_set.add(label) + notes.append(f"skip [{label}]: KB confirmed rule — {pass_name}") + + # Graph-derived skips + if g.available: + # npu-006: Conv% > 20% → hard-block conv fusions on QNN NPU + if ep in ("qnn",) and device == "npu" and g.conv_pct > 20.0: + for label, _, dim in hypotheses: + if dim == "graph_pass" and any(kw in label for kw in ("conv", "bn", "batch")): + result.skip_set.add(label) + notes.append( + f"skip [{label}]: npu-006 — Conv%={g.conv_pct:.1f}%>20% on QNN NPU" + " (FusedConv → CPU fallback)" + ) + + # cpu-001: opset > 17 regresses on CPU (empirical, mechanism unknown) + if ep == "cpu": + for label, _, dim in hypotheses: + if dim == "opset" and "21" in label: + notes.append( + f"deprioritise [{label}]: cpu-001 — opset21 regresses on CPU" + " (non-monotonic, mechanism unknown)" + ) + result.priority_boosts[label] = result.priority_boosts.get(label, 0) - 5 + + # gpu-004: QNN GPU — skip all quantization + if ep == "qnn" and device == "gpu": + for label, _, dim in hypotheses: + if dim in ("quant", "precision"): + result.skip_set.add(label) + notes.append(f"skip [{label}]: gpu-004 — quantization hangs on QNN GPU") + + # nhwc-transformer regresses p90 on DML/QNN GPU transformers + if ep in ("dml",) or (ep == "qnn" and device == "gpu"): + for label, _, dim in hypotheses: + if "nhwc" in label.lower(): + result.skip_set.add(label) + notes.append( + f"skip [{label}]: dml-002/gpu-002 — nhwc-transformer increases p90 variance" + ) + + # ── build priority_boosts ────────────────────────────────── + + if g.available: + # DINOv2-family on QNN NPU: opset21 gets strong positive boost (npu-001) + if ep == "qnn" and device == "npu": + # Heuristic: DINOv2 has many Reshape and high attention ops + if g.op_counts.get("Reshape", 0) > 30 and g.conv_pct < 10: + for label, _, dim in hypotheses: + if dim == "opset" and "21" in label: + result.priority_boosts[label] = result.priority_boosts.get(label, 0) + 10 + notes.append( + f"boost [{label}]: npu-001 heuristic — high Reshape count" + f" ({g.op_counts.get('Reshape', 0)}) + low Conv% suggests DINOv2-family" + ) + + # GELU-decomposed: boost gelu-fusion hypothesis + if g.has_gelu_decomposed: + for label, _, dim in hypotheses: + if "gelu" in label.lower(): + result.priority_boosts[label] = result.priority_boosts.get(label, 0) + 3 + notes.append( + f"boost [{label}]: decomposed GELU detected — fusion likely beneficial" + ) + + # High Gemm% → matmul-add-fusion more likely to help + if g.gemm_pct > 30: + for label, _, dim in hypotheses: + if "matmul" in label.lower() or "gemm" in label.lower(): + result.priority_boosts[label] = result.priority_boosts.get(label, 0) + 2 + notes.append( + f"boost [{label}]: high Gemm% ({g.gemm_pct:.1f}%) — matmul fusion promising" + ) + + # Conv-dense → conv fusions more likely to help (CPU only — not QNN NPU) + if g.conv_pct > 40 and ep not in ("qnn",): + for label, _, dim in hypotheses: + if "conv" in label.lower() and dim == "graph_pass": + result.priority_boosts[label] = result.priority_boosts.get(label, 0) + 2 + notes.append( + f"boost [{label}]: high Conv% ({g.conv_pct:.1f}%) — conv fusions promising" + ) + + # analyze-derived: if partial ops in model → deprioritise those optims + if ar.available and ar.partial: + for label, _, dim in hypotheses: + for pop in ar.partial: + if pop.lower() in label.lower(): + result.priority_boosts[label] = result.priority_boosts.get(label, 0) - 2 + notes.append( + f"deprioritise [{label}]: op '{pop}' is partial-support on {ep.upper()}" + ) + + # ── print summary ────────────────────────────────────────── + print("\n Insight Engine result:") + print(f" skip_set ({len(result.skip_set)}): {result.skip_set or '(none)'}") + boosts = {k: v for k, v in result.priority_boosts.items() if v != 0} + print(f" priority_boosts: {boosts or '(none)'}") + if notes: + print(" notes:") + for n in notes: + print(f" - {n}") + print() + + return result diff --git a/research/autoconfig/autoconfig.py b/research/autoconfig/autoconfig.py index bc06b6448..9b58b787a 100644 --- a/research/autoconfig/autoconfig.py +++ b/research/autoconfig/autoconfig.py @@ -34,6 +34,7 @@ from datetime import datetime from pathlib import Path +from analyze_insight import build_insight from bench_utils import ( FULL_ITERS, FULL_SESSIONS, @@ -47,6 +48,7 @@ median_p50, run_cmd, ) +from report_gen import generate_report sys.stdout.reconfigure(encoding="utf-8", errors="replace") # type: ignore[attr-defined] @@ -340,6 +342,16 @@ def _run_phase_b( f"Improvement confirmed: p50 {baseline_p50:.1f}ms -> {med_p50:.1f}ms " f"({delta_pct:+.1f}%). {verdict.reasoning}" ) + # Auto-write KB draft entry for notable improvements + if not verdict.marginal: + write_kb_draft( + ep=EP, + label=label, + improvement_pct=improvement_pct, + cv=screen_cv, + model_id=MODEL_ID, + dimension=exp_info.get("dimension", "unknown"), + ) elif verdict.verdict == "ACC_FAIL": status = f"discard (accuracy {accuracy:.4f} < floor {ACCURACY_FLOOR})" else: @@ -429,6 +441,55 @@ def write_experiment_doc(exp_dir: Path, info: dict) -> None: (exp_dir / "experiment.md").write_text(doc, encoding="utf-8") +def write_kb_draft( + ep: str, label: str, improvement_pct: float, cv: float, model_id: str, dimension: str +) -> None: + """Append a draft finding to ep_knowledge/.json when improvement > 10%. + + The entry gets status='draft' — a human must review and promote to 'confirmed' + after Gate 2 validation (>=2 independent models, mechanism understood). + """ + if improvement_pct < 10.0: + return + kb_path = KB_DIR / f"{ep}.json" + if not kb_path.exists(): + return + try: + kb = json.loads(kb_path.read_text(encoding="utf-8")) + except Exception: + return + + findings = kb.setdefault("findings", []) + # Auto-generate a draft ID: ep-draft- + draft_id = f"{ep}-draft-{datetime.now().strftime('%Y%m%d%H%M%S')}" + + # Don't duplicate if same label+model already drafted + for f in findings: + if ( + f.get("status") == "draft" + and f.get("model_id") == model_id + and f.get("title", "").startswith(label[:30]) + ): + return + + draft = { + "id": draft_id, + "status": "draft", + "title": f"[DRAFT] {label} — {improvement_pct:+.1f}% on {model_id}", + "model_id": model_id, + "dimension": dimension, + "improvement_pct": round(improvement_pct, 2), + "cv": round(cv, 3), + "mechanism_confirmed": False, + "note": "Auto-generated draft. Requires Gate 2: >=2 models, mechanism understood.", + "action_for_autoconfig": "investigate", + "timestamp": datetime.now().isoformat(timespec="seconds"), + } + findings.append(draft) + kb_path.write_text(json.dumps(kb, indent=2), encoding="utf-8") + print(f" [KB draft] Wrote draft entry {draft_id}: {label} ({improvement_pct:+.1f}%)") + + def log(row: dict) -> None: fields = [ "iter", @@ -503,7 +564,28 @@ def main() -> None: stat_bar_multiplier=STAT_BAR_MULTIPLIER, ) - for i, (label, patch_fn, dimension) in enumerate(HYPOTHESES): + # ── Phase 1: Insight Engine ──────────────────────────────────────────────── + # Run AFTER baseline build so we have a real ONNX to analyse. + # The baseline ONNX is expected at WORK_DIR/iter_00/model.onnx once h0 has run. + # On first run the baseline may not exist yet — insight falls back gracefully. + baseline_onnx = WORK_DIR / "iter_00" / "model.onnx" + insight = build_insight( + onnx_path=baseline_onnx, + winml=WINML, + ep=EP, + device=DEVICE, + hypotheses=HYPOTHESES, + kb=kb, + ) + + # Reorder HYPOTHESES by priority boost (highest first), keeping stable sort + def _sort_key(item: tuple) -> float: + lbl = item[0] + return -insight.priority_boosts.get(lbl, 0.0) + + active_hypotheses = sorted(HYPOTHESES, key=_sort_key) + + for i, (label, patch_fn, dimension) in enumerate(active_hypotheses): # Skip iters completed in a prior run if i in session.completed_iters: print(f" [resume] skipping iter {i} ({label}) — already done") @@ -523,6 +605,11 @@ def main() -> None: print(f" skipped by KB confirmed rule: {skip_reason}") continue + # Check insight skip_set (Phase 1 analysis-derived rules) + if label in insight.skip_set: + print(f" skipped by Insight Engine: {label}") + continue + cfg = patch_fn(copy.deepcopy(BASELINE)) # type: ignore[operator] flags = optim_flags(cfg) opset = cfg["export"]["opset_version"] @@ -677,6 +764,20 @@ def main() -> None: print(f" Best p50: {best_p50:.1f}ms" if best_p50 < float("inf") else " No improvement found") print(f" Results: {RESULTS_TSV}") print(f" Experiments: {WORK_DIR / 'experiments'}") + + # ── Phase 3: Generate HTML report ───────────────────────────────────────── + try: + report_path = generate_report( + results_tsv=RESULTS_TSV, + work_dir=WORK_DIR, + model_id=MODEL_ID, + ep=EP, + insight_notes=insight.notes, + ) + print(f" Report: {report_path}") + except Exception as e: + print(f" [warn] Report generation failed: {e}") + print(f"{sep}\n") diff --git a/research/autoconfig/report_gen.py b/research/autoconfig/report_gen.py new file mode 100644 index 000000000..0a4769bc5 --- /dev/null +++ b/research/autoconfig/report_gen.py @@ -0,0 +1,280 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""report_gen.py — Phase 3 HTML report generator for autoconfig. + +Reads results.tsv and generates report.html with: + - Summary bar chart (p50 per hypothesis, colour-coded by status) + - Experiment table (config / delta_pct / status / CV) + - Champion config box +""" + +from __future__ import annotations + +import csv +import html as html_lib +from datetime import datetime +from pathlib import Path + + +# ── helpers ─────────────────────────────────────────────────────────────────── + + +def _load_tsv(results_tsv: Path) -> list[dict]: + if not results_tsv.exists(): + return [] + with results_tsv.open(encoding="utf-8") as f: + return list(csv.DictReader(f, delimiter="\t")) + + +def _status_color(status: str) -> str: + s = status.lower() + if "new best" in s or (s.startswith("keep") and "marginal" not in s): + return "#2e7d32" # dark green + if "marginal" in s: + return "#f57f17" # amber + if "discard" in s: + return "#b0bec5" # grey + if "crash" in s or "fail" in s: + return "#c62828" # red + return "#78909c" + + +def _status_bg(status: str) -> str: + s = status.lower() + if "new best" in s or (s.startswith("keep") and "marginal" not in s): + return "#e8f5e9" + if "marginal" in s: + return "#fff8e1" + if "crash" in s or "fail" in s: + return "#ffebee" + return "#f5f5f5" + + +def _p50_float(val: str | None) -> float | None: + if not val or val == "N/A" or "UNSTABLE" in str(val): + return None + try: + return float(str(val).replace("ms", "").strip()) + except ValueError: + return None + + +# ── bar chart ───────────────────────────────────────────────────────────────── + + +def _bar_chart_html(rows: list[dict], baseline_p50: float | None) -> str: + valid = [(r, _p50_float(r.get("median_p50_ms") or r.get("screen_p50_ms"))) for r in rows] + valid = [(r, v) for r, v in valid if v is not None] + if not valid: + return "

No benchmark data yet.

" + + max_val = max(v for _, v in valid) * 1.1 + bars = [] + for r, p50 in valid: + label = html_lib.escape(r.get("label", "?")) + status = r.get("status", "") + color = _status_color(status) + width_pct = p50 / max_val * 100 + delta = r.get("delta_pct", "") + baseline_marker = "" + if baseline_p50: + bx = baseline_p50 / max_val * 100 + baseline_marker = ( + f'
' + ) + bars.append(f""" +
+ {baseline_marker} +
{label}
+
+
+
+
+
{p50:.1f}ms + {html_lib.escape(delta)} +
+
+
""") + + return ( + '
\n' + '
' + "— baseline (blue line)
\n" + "".join(bars) + "\n
" + ) + + +# ── experiment table ────────────────────────────────────────────────────────── + + +def _table_html(rows: list[dict]) -> str: + cols = [ + "iter", + "label", + "dimension", + "optim_flags", + "opset", + "screen_p50_ms", + "median_p50_ms", + "delta_pct", + "cv", + "status", + ] + hdrs = "".join( + f'{c.replace("_", " ")}' + for c in cols + ) + trs = [] + for r in rows: + status = r.get("status", "") + bg = _status_bg(status) + color = _status_color(status) + cells = [] + for c in cols: + val = html_lib.escape(str(r.get(c, ""))) + if c == "status": + cells.append( + f'{val}' + ) + else: + cells.append(f'{val}') + trs.append( + f'' + "".join(cells) + "" + ) + return ( + '' + f"{hdrs}" + f"{''.join(trs)}" + "
" + ) + + +# ── champion box ───────────────────────────────────────────────────────────── + + +def _champion_html(rows: list[dict], model_id: str, ep: str) -> str: + keeps = [r for r in rows if r.get("status", "").lower().startswith("keep")] + if not keeps: + return ( + '
' + "No KEEP verdict yet — search in progress.
" + ) + best = min(keeps, key=lambda r: _p50_float(r.get("median_p50_ms")) or 999) + flags = html_lib.escape(best.get("optim_flags", "(none)")) + opset = html_lib.escape(str(best.get("opset", 17))) + p50 = html_lib.escape(best.get("median_p50_ms", "N/A")) + delta = html_lib.escape(best.get("delta_pct", "N/A")) + label = html_lib.escape(best.get("label", "?")) + return f""" +
+
+ Champion Config
+ + + + + + + + + + + + + +
Model{html_lib.escape(model_id)}
EP{html_lib.escape(ep.upper())}
Hypothesis{label}
Optim flags{flags}
Opset{opset}
Median p50{p50} ms + ({delta})
+
""" + + +# ── main entry ──────────────────────────────────────────────────────────────── + + +def generate_report( + results_tsv: Path, + work_dir: Path, + model_id: str, + ep: str, + insight_notes: list[str] | None = None, +) -> Path: + """Generate report.html inside work_dir. Returns the output path.""" + rows = _load_tsv(results_tsv) + out_path = work_dir / "report.html" + + # Find baseline p50 from h0 row + baseline_p50: float | None = None + for r in rows: + if r.get("iter") == "0" or "baseline" in r.get("label", "").lower(): + baseline_p50 = _p50_float(r.get("median_p50_ms")) + if baseline_p50: + break + + chart = _bar_chart_html(rows, baseline_p50) + table = _table_html(rows) + champion = _champion_html(rows, model_id, ep) + ts = datetime.now().strftime("%Y-%m-%d %H:%M") + n_done = len(rows) + n_keep = sum(1 for r in rows if r.get("status", "").lower().startswith("keep")) + + insight_section = "" + if insight_notes: + items = "".join(f"
  • {html_lib.escape(n)}
  • " for n in insight_notes) + insight_section = f""" +

    Phase 1 Insight Engine

    +
      {items}
    """ + + html = f""" + + + +autoconfig report — {html_lib.escape(model_id)} ({ep.upper()}) + + + + +

    autoconfig — {html_lib.escape(model_id)}

    +
    EP: {html_lib.escape(ep.upper())}  ·  + {n_done} experiments  ·  {n_keep} KEEP  ·  + Generated: {ts}
    + +
    + {champion} +
    + +
    +

    Benchmark Chart (median p50)

    + {chart} +
    + +{f'
    {insight_section}
    ' if insight_section else ""} + +
    +

    All Experiments

    + {table} +
    + + +""" + + out_path.write_text(html, encoding="utf-8") + print(f" Report written: {out_path}") + return out_path From 0ef818c147b50bd4863b797b2728013b5c2b5d43 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jun 2026 19:48:43 +0800 Subject: [PATCH 32/38] research(autoconfig): expand graph analysis + hypothesis matrix from winml optimize flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit analyze_insight.py: - Added FusionCandidate dataclass: flag, count, evidence - _detect_fusion_candidates(): 30+ patterns mapped to winml optimize flags GELU (erf/tanh/quick), LayerNorm variants, attention, MatMul patterns, Conv patterns, Gemm patterns, eliminations, layout transforms - build_insight(): log-scaled priority boosts from fusion_candidates Validated on 5 real sweep ONNX (optimized.onnx): dinov2: 49x transpose_optimizer, 24x matmul_transpose_fusion, 12x attention_fusion roberta: +12x bias_softmax_fusion, 12x matmul_add_fusion resnet: 11x conv_add_fusion, 11x conv_add_activation_fusion mobilevit: 36x matmul_transpose_fusion, 12x highdimRTR_lowdimRTR bge-small: 12x matmul_add_fusion, 12x matmul_transpose_fusion catalog_qnn_sweep.py — hypothesis matrix expanded h6-h10: h6: opset21 + matmul_transpose_fusion (24-36x in all transformers) h7: opset21 + bias_softmax_fusion (12x BERT-family Add->Softmax) h8: opset21 + attention_fusion (9-12x Softmax nodes) h9: opset21 + highdimRTR_lowdimRTR (12x RTR chains on MobileViT) h10: opset17 + conv_add_fusion only (11x Conv->Add on ResNet, safe subset) ep_knowledge/qnn_npu.json: npu-008 rad-dino BUILD_FAIL (rc=0xC0000005) --- research/autoconfig/analyze_insight.py | 560 +++++++++++++++++- research/autoconfig/catalog_qnn_sweep.py | 58 ++ research/autoconfig/ep_knowledge/qnn_npu.json | 23 +- 3 files changed, 621 insertions(+), 20 deletions(-) diff --git a/research/autoconfig/analyze_insight.py b/research/autoconfig/analyze_insight.py index 99ccdf15f..9e5466095 100644 --- a/research/autoconfig/analyze_insight.py +++ b/research/autoconfig/analyze_insight.py @@ -76,9 +76,475 @@ class InsightResult: analyze_result: AnalyzeResult = field(default_factory=AnalyzeResult) +# ── data types ──────────────────────────────────────────────────────────────── + + +@dataclass +class FusionCandidate: + """One detectable pattern that maps to a winml optimize flag.""" + + flag: str + """winml optimize flag name (e.g. 'gelu_fusion').""" + + count: int + """How many candidate instances were found in the graph.""" + + evidence: str + """Short human-readable description of what was found.""" + + +@dataclass +class GraphInfo: + total_ops: int = 0 + op_counts: dict[str, int] = field(default_factory=dict) + conv_pct: float = 0.0 # Conv / total_ops (0-100) + matmul_pct: float = 0.0 # MatMul / total_ops + gemm_pct: float = 0.0 # Gemm / total_ops + has_gelu_decomposed: bool = False # any multi-op GELU subgraph detected + gelu_types: list[str] = field(default_factory=list) # 'erf', 'tanh', 'quick' + has_dynamic_axes: bool = False + transpose_count: int = 0 + fusion_candidates: list[FusionCandidate] = field(default_factory=list) + """Ordered list of detected optimisation opportunities, highest-count first.""" + available: bool = False # False when onnx not installed or model not found + + +@dataclass +class AnalyzeResult: + supported: list[str] = field(default_factory=list) + partial: list[str] = field(default_factory=list) + unsupported: list[str] = field(default_factory=list) + unknown: list[str] = field(default_factory=list) + available: bool = False # False when winml analyze failed or ep has no rule data + + +@dataclass +class InsightResult: + skip_set: set[str] = field(default_factory=set) + """Labels from HYPOTHESES that should be pruned before the search loop.""" + + priority_boosts: dict[str, float] = field(default_factory=dict) + """hypothesis_label -> boost (positive = higher priority, negative = deprioritise).""" + + notes: list[str] = field(default_factory=list) + """Human-readable explanation for each decision.""" + + graph_info: GraphInfo = field(default_factory=GraphInfo) + analyze_result: AnalyzeResult = field(default_factory=AnalyzeResult) + + # ── graph analysis ──────────────────────────────────────────────────────────── +def _build_consumer_map(graph) -> dict[str, list]: # type: ignore[type-arg] + """Map each output name → list of consumer nodes.""" + consumers: dict[str, list] = {} + for node in graph.node: + for inp in node.input: + consumers.setdefault(inp, []).append(node) + return consumers + + +def _build_producer_map(graph) -> dict[str, object]: + """Map each output name → the node that produces it.""" + return {out: n for n in graph.node for out in n.output} + + +def _get_attr_float(node, name: str) -> float | None: + """Extract a float attribute from an ONNX node.""" + for a in node.attribute: + if a.name == name: + return float(a.f) + return None + + +def _detect_fusion_candidates(graph) -> list[FusionCandidate]: # type: ignore[type-arg] + """ + Scan the ONNX graph for subgraph patterns that map to winml optimize flags. + + Returns a list of FusionCandidate, ordered highest-count first. + + Detection strategy + ------------------ + We build two lookup tables (producer_map, consumer_map) and then sweep the + graph once per pattern family. Each check is O(N) in the number of nodes. + + Pattern families + ---------------- + GELU variants + gelu_fusion : Div → Erf → Add → Mul → Mul (exact GELU) + fast_gelu_fusion : Tanh-based GELU (Tanh node with Pow(3) ancestor) + quick_gelu_fusion : x * sigmoid(1.702*x) + bias_gelu_fusion : Add → GELU subgraph (bias before GELU entry) + LayerNorm variants + layer_norm_fusion : ReduceMean → Sub → Pow(2) → … → Add(ε) + simplified_layer_norm_fusion : Pow(2) + ReduceMean (no Sub) + fuse_rmsnorm : Pow → ReduceMean → Add → Sqrt → Div → Mul + skip_layer_norm_fusion : Add (residual) feeds directly into LN subgraph + Attention + attention_fusion : Q/K/V MatMul trio feeding a Softmax + bias_softmax_fusion : Add immediately before Softmax + MatMul patterns + matmul_add_fusion : MatMul → Add (not already counted in LN) + matmul_activation_fusion : MatMul → {Relu, Sigmoid, Tanh, Clip} + matmul_transpose_fusion : Transpose → MatMul OR MatMul → Transpose + matmul_scale_fusion : MatMul → Mul (scalar constant) + Conv patterns + conv_bn_fusion : Conv → BatchNormalization + conv_add_fusion : Conv → Add + conv_mul_fusion : Conv → Mul + conv_activation_fusion : Conv → {Relu, LeakyRelu, Sigmoid, Tanh, Clip} + conv_add_activation_fusion: Conv → Add → activation (3-node chain) + pad_fusion : Pad → Conv + Gemm patterns + gemm_activation_fusion : Gemm → {Relu, Tanh, Sigmoid} + gemm_sum_fusion : Gemm → Add + gemm_transpose_fusion : Transpose → Gemm + Eliminations + slice_elimination : multiple Slice ops (potential redundancy) + unsqueeze_elimination : Unsqueeze of initializers + concat_slice_elimination : Concat → Slice (reverse of split) + expand_elimination : Expand nodes + Layout + transpose_optimizer : Transpose count > 10 + nhwc_transformer : Conv-heavy + Transpose → layout transform candidate + Rewrite: highdimRTR_lowdimRTR : Reshape → Transpose → Reshape with rank > 4 + """ + producer = _build_producer_map(graph) + consumer = _build_consumer_map(graph) + + # Helper: get the single consumer of a node output (or None) + def _single_consumer(node, out_idx: int = 0): + if out_idx >= len(node.output): + return None + consumers = consumer.get(node.output[out_idx], []) + return consumers[0] if len(consumers) == 1 else None + + # Helper: check if a node output feeds a specific op type + def _consumer_is(node, op: str, out_idx: int = 0) -> bool: + c = _single_consumer(node, out_idx) + return c is not None and c.op_type == op + + # Helper: check if all inputs to node are exclusively from initializers (weight-only) + init_names = {i.name for i in graph.initializer} + + def _is_initializer_input(inp_name: str) -> bool: + return inp_name in init_names + + candidates: dict[str, FusionCandidate] = {} + + def _add(flag: str, evidence: str, n: int = 1) -> None: + if flag in candidates: + candidates[flag].count += n + candidates[flag].evidence = evidence # update to latest + else: + candidates[flag] = FusionCandidate(flag=flag, count=n, evidence=evidence) + + # ── GELU patterns ────────────────────────────────────────────────────────── + erf_gelu_count = 0 + tanh_gelu_count = 0 + quick_gelu_count = 0 + bias_before_gelu = 0 + + for node in graph.node: + # Erf-based GELU: Div → Erf → (Add → Mul → Mul) + if node.op_type == "Erf" and node.input: + pred = producer.get(node.input[0]) + if pred and pred.op_type == "Div": + erf_gelu_count += 1 + # Check if there's an Add feeding the Erf entry point (bias_gelu) + # The entry to Erf-GELU is typically through the Div; check what feeds Div + if pred.input: + div_pred = producer.get(pred.input[0]) + if div_pred and div_pred.op_type in ("Add", "Gemm", "MatMul"): + bias_before_gelu += 1 + + # Tanh-based GELU: Tanh with Pow(3) somewhere in the sub-tree + if node.op_type == "Tanh" and node.input: + # Check 3-hop ancestry for Pow + cur = producer.get(node.input[0]) + for _ in range(4): + if cur is None: + break + if cur.op_type == "Pow": + tanh_gelu_count += 1 + break + cur = producer.get(cur.input[0]) if cur.input else None + + # Quick GELU: Sigmoid where predecessor is Mul with constant ≈ 1.702 + if node.op_type == "Sigmoid" and node.input: + pred = producer.get(node.input[0]) + if pred and pred.op_type == "Mul": + quick_gelu_count += 1 + + if erf_gelu_count: + _add("gelu_fusion", f"{erf_gelu_count} Erf-based GELU subgraph(s)", erf_gelu_count) + _add( + "gelu_singlegelu", + f"{erf_gelu_count} decomposed GELU → can normalise to single Gelu op", + erf_gelu_count, + ) + if tanh_gelu_count: + _add( + "fast_gelu_fusion", + f"{tanh_gelu_count} Tanh-based GELU subgraph(s)", + tanh_gelu_count, + ) + if quick_gelu_count: + _add( + "quick_gelu_fusion", + f"{quick_gelu_count} Sigmoid(1.702x) quick-GELU pattern(s)", + quick_gelu_count, + ) + if bias_before_gelu: + _add( + "bias_gelu_fusion", + f"{bias_before_gelu} Add/MatMul feeding GELU entry", + bias_before_gelu, + ) + + # ── LayerNorm patterns ───────────────────────────────────────────────────── + ln_full_count = 0 # ReduceMean + Sub + Pow(2) + ln_simplified_count = 0 # Pow(2) + ReduceMean (no Sub) + rmsnorm_count = 0 # Pow + ReduceMean (no Sub, no mean-centering) + skip_ln_count = 0 # Add → LayerNorm subgraph + + for node in graph.node: + if node.op_type == "Pow" and node.input: + pred = producer.get(node.input[0]) + if pred and pred.op_type == "Sub": + # Sub → Pow: classic LN (ReduceMean → Sub → Pow) + sub_pred = producer.get(pred.input[0]) if pred.input else None + if sub_pred and sub_pred.op_type == "ReduceMean": + ln_full_count += 1 + elif pred and pred.op_type in ("ReduceMean", "Mul", "Add"): + # Simplified / RMSNorm: no Sub predecessor + ln_simplified_count += 1 + + # RMSNorm: Pow → ReduceMean (direct, without Sub) + if node.op_type == "ReduceMean" and node.input: + pred = producer.get(node.input[0]) + if pred and pred.op_type == "Pow": + rmsnorm_count += 1 + + # skip_layer_norm: Add whose output feeds into the start of an LN subgraph + # Heuristic: Add → ReduceMean (the mean-centering step of LN) + if node.op_type == "Add" and _consumer_is(node, "ReduceMean"): + skip_ln_count += 1 + + if ln_full_count: + _add( + "layer_norm_fusion", + f"{ln_full_count} ReduceMean→Sub→Pow LayerNorm subgraph(s)", + ln_full_count, + ) + if ln_simplified_count: + _add( + "simplified_layer_norm_fusion", + f"{ln_simplified_count} simplified LayerNorm pattern(s) (no mean-centering)", + ln_simplified_count, + ) + if rmsnorm_count: + _add("fuse_rmsnorm", f"{rmsnorm_count} RMSNorm Pow→ReduceMean pattern(s)", rmsnorm_count) + if skip_ln_count: + _add( + "skip_layer_norm_fusion", + f"{skip_ln_count} Add→ReduceMean (residual+LN) pattern(s)", + skip_ln_count, + ) + + # ── Attention patterns ───────────────────────────────────────────────────── + softmax_count = sum(1 for n in graph.node if n.op_type == "Softmax") + add_before_softmax = 0 + for node in graph.node: + if node.op_type == "Softmax" and node.input: + pred = producer.get(node.input[0]) + if pred and pred.op_type == "Add": + add_before_softmax += 1 + + if softmax_count: + _add( + "attention_fusion", + f"{softmax_count} Softmax node(s) — likely attention head(s)", + softmax_count, + ) + if add_before_softmax: + _add( + "bias_softmax_fusion", + f"{add_before_softmax} Add→Softmax (bias+attention mask) pattern(s)", + add_before_softmax, + ) + + # ── MatMul patterns ──────────────────────────────────────────────────────── + _ACTIVATIONS = {"Relu", "LeakyRelu", "Sigmoid", "Tanh", "Clip", "Gelu", "FastGelu"} + + mm_add = mm_act = mm_tp = mm_scale = 0 + for node in graph.node: + if node.op_type != "MatMul": + continue + c = _single_consumer(node) + if c is None: + continue + if c.op_type == "Add": + mm_add += 1 + elif c.op_type in _ACTIVATIONS: + mm_act += 1 + elif c.op_type == "Transpose": + mm_tp += 1 + elif c.op_type == "Mul": + # Mul with a scalar → scale fusion; heuristic: second input is initializer + if len(c.input) > 1 and _is_initializer_input(c.input[1]): + mm_scale += 1 + + # Also check Transpose → MatMul + tp_before_mm = sum( + 1 for node in graph.node if node.op_type == "Transpose" and _consumer_is(node, "MatMul") + ) + + if mm_add: + _add("matmul_add_fusion", f"{mm_add} MatMul→Add pattern(s)", mm_add) + _add( + "matmuladd_reshapegemm", + f"{mm_add} MatMul+Add → Reshape+Gemm rewrite candidate(s)", + mm_add, + ) + if mm_act: + _add("matmul_activation_fusion", f"{mm_act} MatMul→activation pattern(s)", mm_act) + if mm_tp + tp_before_mm: + _add( + "matmul_transpose_fusion", + f"{mm_tp + tp_before_mm} MatMul↔Transpose pattern(s)", + mm_tp + tp_before_mm, + ) + if mm_scale: + _add("matmul_scale_fusion", f"{mm_scale} MatMul→Mul(scalar) pattern(s)", mm_scale) + + # ── Conv patterns ────────────────────────────────────────────────────────── + conv_bn = conv_add = conv_mul = conv_act = conv_add_act = pad_conv = 0 + for node in graph.node: + if node.op_type == "Pad" and _consumer_is(node, "Conv"): + pad_conv += 1 + + if node.op_type != "Conv": + continue + c = _single_consumer(node) + if c is None: + continue + if c.op_type == "BatchNormalization": + conv_bn += 1 + elif c.op_type == "Add": + conv_add += 1 + # Check for Conv → Add → activation chain + cc = _single_consumer(c) + if cc and cc.op_type in _ACTIVATIONS: + conv_add_act += 1 + elif c.op_type == "Mul": + conv_mul += 1 + elif c.op_type in _ACTIVATIONS: + conv_act += 1 + + if conv_bn: + _add("conv_bn_fusion", f"{conv_bn} Conv→BN pattern(s)", conv_bn) + if conv_add: + _add("conv_add_fusion", f"{conv_add} Conv→Add pattern(s)", conv_add) + if conv_mul: + _add("conv_mul_fusion", f"{conv_mul} Conv→Mul pattern(s)", conv_mul) + if conv_act: + _add("conv_activation_fusion", f"{conv_act} Conv→activation pattern(s)", conv_act) + if conv_add_act: + _add( + "conv_add_activation_fusion", + f"{conv_add_act} Conv→Add→activation chain(s) (FusedConv)", + conv_add_act, + ) + if pad_conv: + _add("pad_fusion", f"{pad_conv} Pad→Conv pattern(s)", pad_conv) + + # ── Gemm patterns ────────────────────────────────────────────────────────── + gemm_act = gemm_add = gemm_tp = 0 + for node in graph.node: + if node.op_type != "Gemm": + continue + c = _single_consumer(node) + if c is None: + continue + if c.op_type in _ACTIVATIONS: + gemm_act += 1 + elif c.op_type == "Add": + gemm_add += 1 + elif c.op_type == "Transpose": + gemm_tp += 1 + tp_before_gemm = sum( + 1 for node in graph.node if node.op_type == "Transpose" and _consumer_is(node, "Gemm") + ) + if gemm_act: + _add("gemm_activation_fusion", f"{gemm_act} Gemm→activation pattern(s)", gemm_act) + if gemm_add: + _add("gemm_sum_fusion", f"{gemm_add} Gemm→Add pattern(s)", gemm_add) + if gemm_tp + tp_before_gemm: + _add( + "gemm_transpose_fusion", + f"{gemm_tp + tp_before_gemm} Gemm↔Transpose pattern(s)", + gemm_tp + tp_before_gemm, + ) + + # ── Elimination patterns ─────────────────────────────────────────────────── + slice_count = sum(1 for n in graph.node if n.op_type == "Slice") + expand_count = sum(1 for n in graph.node if n.op_type == "Expand") + unsqueeze_init = sum( + 1 + for n in graph.node + if n.op_type == "Unsqueeze" and n.input and _is_initializer_input(n.input[0]) + ) + concat_slice = sum(1 for n in graph.node if n.op_type == "Concat" and _consumer_is(n, "Slice")) + + if slice_count > 3: + _add("slice_elimination", f"{slice_count} Slice nodes (potential redundancy)", slice_count) + if expand_count > 2: + _add("expand_elimination", f"{expand_count} Expand nodes", expand_count) + if unsqueeze_init: + _add( + "unsqueeze_elimination", + f"{unsqueeze_init} Unsqueeze(initializer) node(s)", + unsqueeze_init, + ) + if concat_slice: + _add( + "concat_slice_elimination", + f"{concat_slice} Concat→Slice pattern(s) (reverse-split)", + concat_slice, + ) + + # ── Layout patterns ──────────────────────────────────────────────────────── + tp_count = sum(1 for n in graph.node if n.op_type == "Transpose") + if tp_count > 10: + _add( + "transpose_optimizer", + f"{tp_count} Transpose nodes — optimizer may collapse chains", + tp_count, + ) + + # Reshape → Transpose → Reshape with high-dimensional input (rank > 4) + rtr_highdim = 0 + for node in graph.node: + if node.op_type == "Transpose" and node.input: + pred = producer.get(node.input[0]) + c = _single_consumer(node) + if pred and c and pred.op_type == "Reshape" and c.op_type == "Reshape": + # Check if any input to the reshape has rank > 4 via shape inference + # Approximation: count as candidate if the graph has many dims + rtr_highdim += 1 + if rtr_highdim > 2: + _add( + "highdimRTR_lowdimRTR", + f"{rtr_highdim} Reshape→Transpose→Reshape chain(s) — may reduce to lower rank", + rtr_highdim, + ) + + # Sort by count descending + return sorted(candidates.values(), key=lambda c: -c.count) + + def run_graph_analysis(onnx_path: Path) -> GraphInfo: """Analyse the ONNX proto and return structural statistics.""" info = GraphInfo() @@ -98,17 +564,18 @@ def run_graph_analysis(onnx_path: Path) -> GraphInfo: if total > 0: info.conv_pct = counts.get("Conv", 0) / total * 100 + info.matmul_pct = counts.get("MatMul", 0) / total * 100 info.gemm_pct = counts.get("Gemm", 0) / total * 100 info.transpose_count = counts.get("Transpose", 0) - # Decomposed GELU: presence of Erf node with Div predecessor - out2node = {o: n for n in g.node for o in n.output} - for n in g.node: - if n.op_type == "Erf" and n.input: - pred = out2node.get(n.input[0]) - if pred and pred.op_type == "Div": - info.has_gelu_decomposed = True - break + # Detect GELU types + if counts.get("Erf", 0): + info.has_gelu_decomposed = True + info.gelu_types.append("erf") + if counts.get("Tanh", 0): + info.gelu_types.append("tanh") + if counts.get("Sigmoid", 0): + info.gelu_types.append("sigmoid/quick") # Dynamic axes: any input with dim_param (string dimension) for inp in g.input: @@ -117,6 +584,9 @@ def run_graph_analysis(onnx_path: Path) -> GraphInfo: info.has_dynamic_axes = True break + # Full fusion candidate scan + info.fusion_candidates = _detect_fusion_candidates(g) + except Exception as e: info.available = False print(f" [analyze_insight] graph analysis failed: {e}") @@ -226,10 +696,17 @@ def build_insight( if g.available: top5 = sorted(g.op_counts.items(), key=lambda x: -x[1])[:5] print( - f" total_ops={g.total_ops} conv_pct={g.conv_pct:.1f}% " - f"gemm_pct={g.gemm_pct:.1f}% transpose={g.transpose_count}" + f" total_ops={g.total_ops} conv%={g.conv_pct:.1f} " + f"matmul%={g.matmul_pct:.1f} gemm%={g.gemm_pct:.1f} " + f"transpose={g.transpose_count} dynamic_axes={g.has_dynamic_axes}" ) print(f" top ops: {dict(top5)}") + if g.fusion_candidates: + print(f" fusion candidates ({len(g.fusion_candidates)}):") + for fc in g.fusion_candidates[:10]: # top-10 only + print(f" [{fc.count:3d}×] {fc.flag:40s} {fc.evidence}") + if len(g.fusion_candidates) > 10: + print(f" ... and {len(g.fusion_candidates) - 10} more") else: print(" [skip] onnx not available or model not found") @@ -317,22 +794,67 @@ def build_insight( f" ({g.op_counts.get('Reshape', 0)}) + low Conv% suggests DINOv2-family" ) - # GELU-decomposed: boost gelu-fusion hypothesis - if g.has_gelu_decomposed: + # Fusion-candidate-driven boosts: map detected patterns → hypothesis labels + # + # Strategy: for each FusionCandidate, find hypotheses whose label or dimension + # mentions the relevant flag. Boost proportional to log(count) so that + # "288 MatMul→Add" doesn't overwhelm "12 GELU" by 24×. + import math + + _FLAG_KEYWORDS: dict[str, list[str]] = { + "gelu_fusion": ["gelu"], + "fast_gelu_fusion": ["gelu", "fast"], + "bias_gelu_fusion": ["gelu", "bias"], + "quick_gelu_fusion": ["gelu", "quick"], + "gelu_singlegelu": ["gelu"], + "layer_norm_fusion": ["layer_norm", "layernorm", "ln"], + "skip_layer_norm_fusion": ["skip_layer_norm", "skip_ln"], + "simplified_layer_norm_fusion": ["layer_norm", "simplified"], + "fuse_rmsnorm": ["rmsnorm", "rms_norm"], + "attention_fusion": ["attention"], + "bias_softmax_fusion": ["softmax", "attention"], + "matmul_add_fusion": ["matmul_add", "matmul-add"], + "matmul_activation_fusion": ["matmul_act", "matmul-act"], + "matmul_transpose_fusion": ["matmul_transp", "matmul-transp"], + "matmul_scale_fusion": ["matmul_scale", "matmul-scale"], + "matmuladd_reshapegemm": ["reshape_gemm", "matmuladd"], + "conv_bn_fusion": ["conv_bn", "conv-bn"], + "conv_add_fusion": ["conv_add", "conv-add"], + "conv_mul_fusion": ["conv_mul", "conv-mul"], + "conv_activation_fusion": ["conv_act", "conv-act"], + "conv_add_activation_fusion": ["conv_add_act", "fused_conv"], + "pad_fusion": ["pad_conv", "pad-conv"], + "gemm_activation_fusion": ["gemm_act", "gemm-act"], + "gemm_sum_fusion": ["gemm_sum", "gemm-sum"], + "gemm_transpose_fusion": ["gemm_transp"], + "slice_elimination": ["slice_elim"], + "unsqueeze_elimination": ["unsqueeze_elim"], + "expand_elimination": ["expand_elim"], + "concat_slice_elimination": ["concat_slice"], + "transpose_optimizer": ["transpose_opt", "tp_opt"], + "highdimRTR_lowdimRTR": ["rtr", "reshape_transpose"], + } + + for fc in g.fusion_candidates: + keywords = _FLAG_KEYWORDS.get(fc.flag, [fc.flag.replace("_", "-")]) + boost = round(1 + math.log(max(fc.count, 1)), 1) for label, _, dim in hypotheses: - if "gelu" in label.lower(): - result.priority_boosts[label] = result.priority_boosts.get(label, 0) + 3 + label_lower = label.lower() + if any(kw in label_lower for kw in keywords): + result.priority_boosts[label] = result.priority_boosts.get(label, 0) + boost notes.append( - f"boost [{label}]: decomposed GELU detected — fusion likely beneficial" + f"boost [{label}] +{boost:.1f}: graph has {fc.count}× {fc.flag} candidate(s)" ) - # High Gemm% → matmul-add-fusion more likely to help - if g.gemm_pct > 30: + # GELU-decomposed: additional direct boost for gelu hypotheses + if g.has_gelu_decomposed: for label, _, dim in hypotheses: - if "matmul" in label.lower() or "gemm" in label.lower(): + if "gelu" in label.lower() and label not in { + n.split("]")[0].lstrip("boost [") for n in notes if "gelu" in n + }: result.priority_boosts[label] = result.priority_boosts.get(label, 0) + 2 notes.append( - f"boost [{label}]: high Gemm% ({g.gemm_pct:.1f}%) — matmul fusion promising" + f"boost [{label}]: decomposed GELU detected — fusion likely beneficial" ) # Conv-dense → conv fusions more likely to help (CPU only — not QNN NPU) diff --git a/research/autoconfig/catalog_qnn_sweep.py b/research/autoconfig/catalog_qnn_sweep.py index eb95ba41f..4914b976a 100644 --- a/research/autoconfig/catalog_qnn_sweep.py +++ b/research/autoconfig/catalog_qnn_sweep.py @@ -11,9 +11,20 @@ h1: opset 17 explicit (explicit opset, same optim as baseline) h2: opset 19 h3: opset 21 <- tests npu-001 generalization + + Conv fusions (npu-006 hazard on Conv-dense models): h4: opset 17 + conv fusions (conv-bn, conv-add, conv-activation) h5: opset 21 + conv fusions + Attention/transformer fusions (graph-analysis-driven; 2026-06-17): + h6: opset 21 + matmul_transpose_fusion (24-36× detected on all transformer models) + h7: opset 21 + bias_softmax_fusion (12× on BERT-family: roberta, bge, MiniLM) + h8: opset 21 + attention_fusion (12× Softmax nodes across all transformers) + + Rewrite hypotheses (graph-analysis-driven; 2026-06-17): + h9: opset 21 + highdimRTR_lowdimRTR (12× Reshape-Transpose-Reshape on MobileViT) + h10: opset 17 + conv_add_fusion only (11× on ResNet; safe subset of npu-006 convoy) + 2-phase bench protocol (npu-007): Phase A: 200-iter screen — high CV is NORMAL on QNN NPU (DVFS), always proceed to Phase B. Phase B: 3 independent sessions x 500 iters, 30 s cool-down between sessions. @@ -78,6 +89,7 @@ ("h1", "opset 17 explicit", 17, None), ("h2", "opset 19", 19, None), ("h3", "opset 21 (tests npu-001 bypass)", 21, None), + # ── conv fusions (npu-006) ────────────────────────────────────────────── ( "h4", "opset 17 + conv fusions", @@ -98,6 +110,52 @@ "conv_activation_fusion": True, }, ), + # ── attention/transformer fusions (graph-analysis-driven, 2026-06-17) ── + # matmul_transpose_fusion: 24-36× patterns detected on all transformer + # models (dinov2, roberta, bge, mobilevit). Tests whether fusing + # Transpose↔MatMul pairs helps QNN NPU dispatch. + ( + "h6", + "opset 21 + matmul_transpose_fusion", + 21, + {"matmul_transpose_fusion": True}, + ), + # bias_softmax_fusion: 12× Add→Softmax patterns in BERT-family models + # (roberta, bge, MiniLM). Attention mask is added before softmax — + # fusing may help QNN NPU kernel scheduling. + ( + "h7", + "opset 21 + bias_softmax_fusion", + 21, + {"bias_softmax_fusion": True}, + ), + # attention_fusion: 9-12× Softmax nodes across all transformers. + # Full QK^T V attention fusion into a single op. + ( + "h8", + "opset 21 + attention_fusion", + 21, + {"attention_fusion": True}, + ), + # ── rewrite hypotheses (graph-analysis-driven, 2026-06-17) ───────────── + # highdimRTR_lowdimRTR: 12× Reshape→Transpose→Reshape detected on + # MobileViT. Reduces high-rank RTR chains to lower-rank equivalents, + # potentially reducing Transpose overhead on QNN NPU. + ( + "h9", + "opset 21 + highdimRTR_lowdimRTR", + 21, + {"highdimRTR_lowdimRTR": True}, + ), + # conv_add_fusion only (safe subset of npu-006 convoy): 11× Conv→Add + # on ResNet. Distinct from conv_add_activation_fusion (FusedConv) — + # only fuses the Conv+bias Add, not the full 3-node chain. + ( + "h10", + "opset 17 + conv_add_fusion only", + 17, + {"conv_add_fusion": True}, + ), ] # Full catalog sweep list: (model_id, task, model_type, run_eval_on_baseline) diff --git a/research/autoconfig/ep_knowledge/qnn_npu.json b/research/autoconfig/ep_knowledge/qnn_npu.json index 0280af9bd..8023db8e6 100644 --- a/research/autoconfig/ep_knowledge/qnn_npu.json +++ b/research/autoconfig/ep_knowledge/qnn_npu.json @@ -67,7 +67,7 @@ "distilbert/distilbert-base-cased-distilled-squad — not run; DistilBERT architecture, predicted neutral (consistent with distilbert-base-uncased -0.1%)" ], "cpu_bound_cannot_test": [ - "microsoft/rad-dino (-0.1%, all hypotheses ~275ms CV<0.022 — model runs on CPU, opset irrelevant)" + "microsoft/rad-dino (-0.1% on CPU EP, all hypotheses ~275ms CV<0.022 — model runs on CPU, opset irrelevant; QNN NPU BUILD_FAIL 2026-06-17, see npu-008)" ], "data_unreliable": ["resnet-18 — sub-ms latency, 3-session range spans 4x; no reliable signal (see data_reliability_notes)"] }, @@ -263,6 +263,27 @@ } } + , + + { + "id": "npu-008", + "title": "microsoft/rad-dino fails to build on QNN NPU across all opset variants (winml crash rc=0xC0000005)", + "observation": "catalog_qnn_sweep run 2026-06-17: all 6 hypotheses for microsoft/rad-dino (opset 17/19/21, with/without conv fusions) returned rc=3221225794 (0xC0000005, access violation) in <2s. No stderr captured — winml process crashed before producing any output. This is distinct from a build error: it is a hard crash of the winml CLI itself.", + "mechanism_confirmed": false, + "mechanism_hypothesis": "rad-dino is a ViT encoder with a non-standard DINOv2 variant (larger heads, custom CLS token handling). Likely contains one or more ONNX operators or graph shapes that trigger an unguarded null-dereference or out-of-bounds access in the QNN EP quantization or compilation path (winml build calls QNN SDK compilation under the hood). Could also be a model size / dynamic axis issue.", + "action_for_autoconfig": "Skip QNN NPU for microsoft/rad-dino. If QNN NPU is required, file a bug with the crash dump and test with winml analyze first to identify unsupported ops before attempting build.", + "confidence": "high on observation (reproducible across all 6 hypotheses in same run); low on mechanism (no stack trace available)", + "falsified_by": null, + "scope": "microsoft/rad-dino only (confirmed). DINOv2-family models in general (facebook/dinov2-small, facebook/dinov2-base) are NOT affected — they build and run on QNN NPU successfully.", + "severity": "blocker — model is incompatible with QNN NPU build", + "follow_up_required": [ + "Run winml analyze --ep qnn on rad-dino ONNX to check unsupported ops", + "Capture crash dump (ProcDump) to get stack trace", + "Compare ONNX graph structure of rad-dino vs facebook/dinov2-small to isolate differentiating ops" + ], + "date_observed": "2026-06-17" + } + ], "search_space_rules": { From 77657753d0e6b4069e205fd1fb4dfa4bf72b675a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jun 2026 19:54:03 +0800 Subject: [PATCH 33/38] research(autoconfig): add --only-hypotheses + --reuse-h0-config flags for delta sweeps Enables incremental sweeps without re-running all hypotheses: --only-hypotheses h6,h7,h8 run only specified IDs (skip others) --reuse-h0-config load base config from existing h0/build_config.json When --only-hypotheses is set: - loads existing results.json and preserves prior hypothesis data - skips winml config call if --reuse-h0-config + h0/build_config.json exists - writes updated results.json merging old + new entries Allows targeted delta sweeps, e.g. testing only new h6-h10 on models that already have h0-h5 data from a previous full run. --- research/autoconfig/catalog_qnn_sweep.py | 117 ++++++++++++++++++----- 1 file changed, 92 insertions(+), 25 deletions(-) diff --git a/research/autoconfig/catalog_qnn_sweep.py b/research/autoconfig/catalog_qnn_sweep.py index 4914b976a..081b49b3a 100644 --- a/research/autoconfig/catalog_qnn_sweep.py +++ b/research/autoconfig/catalog_qnn_sweep.py @@ -495,43 +495,78 @@ def sweep_model( task: str, model_type: str, run_eval_on_baseline: bool, + only_hyp_ids: "set[str] | None" = None, + reuse_h0_config: bool = False, ) -> dict: - """Run all 6 hypotheses for one model on QNN NPU. Returns results dict.""" + """Run hypotheses for one model on QNN NPU. Returns results dict. + + Args: + only_hyp_ids: If set, only run these hypothesis IDs (e.g. {'h6','h7'}). + reuse_h0_config: If True, load base config from existing h0/build_config.json + instead of calling winml config again. + """ model_slug = model_id.replace("/", "--") model_dir = RESULTS_DIR / model_slug model_dir.mkdir(parents=True, exist_ok=True) - results: dict = { - "model_id": model_id, - "task": task, - "model_type": model_type, - "timestamp": datetime.now().isoformat(timespec="seconds"), - "ep": EP, - "device": DEVICE, - "baseline_opset": None, - "conv_pct": None, # Conv ops % of total — drives npu-006 risk - "npu006_risk": None, # True if conv_pct > NPU006_CONV_PCT_THRESHOLD - "npu006_regression": None, # True if h4/h5 median >= 10x baseline (catastrophic) - "hypotheses": {}, - "best_hypothesis": None, - "baseline_p50_ms": None, - "best_p50_ms": None, - "best_gain_pct": None, - "npu001_generalized": None, # True/False/"neutral"/None (median-based) - "npu001_ranges_non_overlapping": None, # True/False — stricter range-overlap test - "feature_gaps": [], - "errors": [], - } + # When resuming from partial run, load existing results to preserve prior data + results_path = model_dir / "results.json" + if only_hyp_ids and results_path.exists(): + try: + results = json.loads(results_path.read_text(encoding="utf-8")) + print(f" [resume] loaded existing results from {results_path}", flush=True) + except Exception: + results = {} + else: + results = {} + + results.update( + { + "model_id": model_id, + "task": task, + "model_type": model_type, + "timestamp": datetime.now().isoformat(timespec="seconds"), + "ep": EP, + "device": DEVICE, + } + ) + results.setdefault("baseline_opset", None) + results.setdefault("conv_pct", None) + results.setdefault("npu006_risk", None) + results.setdefault("npu006_regression", None) + results.setdefault("hypotheses", {}) + results.setdefault("best_hypothesis", None) + results.setdefault("baseline_p50_ms", None) + results.setdefault("best_p50_ms", None) + results.setdefault("best_gain_pct", None) + results.setdefault("npu001_generalized", None) + results.setdefault("npu001_ranges_non_overlapping", None) + results.setdefault("feature_gaps", []) + results.setdefault("errors", []) print(f"\n{'=' * 64}", flush=True) print(f" SWEEP: {model_id} [{task}]", flush=True) + if only_hyp_ids: + print(f" (delta sweep — only: {sorted(only_hyp_ids)})", flush=True) print(f"{'=' * 64}", flush=True) model_start = time.time() - # ── Step 1: generate base config (auto-detect for QNN NPU) ──────────────── + # ── Step 1: generate base config (or reuse from existing h0) ────────────── print("\n[1/3] Generating base config (winml config)…", flush=True) - base_config = get_base_config(model_id, task, model_type) + base_config = None + + if reuse_h0_config: + h0_cfg_path = model_dir / "h0" / "build_config.json" + if h0_cfg_path.exists(): + try: + base_config = json.loads(h0_cfg_path.read_text(encoding="utf-8")) + print(f" [reuse] loaded h0 config from {h0_cfg_path}", flush=True) + except Exception as e: + print(f" [reuse] failed to load h0 config: {e} — regenerating", flush=True) + + if base_config is None: + base_config = get_base_config(model_id, task, model_type) if base_config is None: results["errors"].append("base config generation failed — model may not be supported") @@ -561,6 +596,9 @@ def sweep_model( npu006_risk: bool = False for hyp_id, label, opset_override, extra_optim in HYPOTHESES: + # Hypothesis filter: skip if not in --only-hypotheses list + if only_hyp_ids is not None and hyp_id not in only_hyp_ids: + continue elapsed_total = time.time() - model_start if elapsed_total > MODEL_TIMEOUT_S: print( @@ -1041,8 +1079,30 @@ def main() -> None: action="store_true", help="Skip winml eval accuracy step even for image models", ) + parser.add_argument( + "--only-hypotheses", + default=None, + help=( + "Comma-separated list of hypothesis IDs to run, e.g. h6,h7,h8. " + "Skips all others. Use with --reuse-h0-config to avoid regenerating base config." + ), + ) + parser.add_argument( + "--reuse-h0-config", + action="store_true", + help=( + "Reuse the base config from an existing h0/build_config.json instead of " + "running winml config again. Requires a previous full sweep to have run." + ), + ) args = parser.parse_args() + # Parse hypothesis filter + only_hyp_ids: set[str] | None = None + if args.only_hypotheses: + only_hyp_ids = {h.strip() for h in args.only_hypotheses.split(",")} + print(f" Running only: {sorted(only_hyp_ids)}", flush=True) + RESULTS_DIR.mkdir(parents=True, exist_ok=True) # Confirm QNN EP is present @@ -1070,7 +1130,14 @@ def main() -> None: if args.skip_eval: do_eval = False try: - result = sweep_model(model_id, task, model_type, do_eval) + result = sweep_model( + model_id, + task, + model_type, + do_eval, + only_hyp_ids=only_hyp_ids, + reuse_h0_config=args.reuse_h0_config, + ) except Exception as exc: print(f"\n❌ Unexpected error for {model_id}: {exc}", flush=True) result = { From c3acb85d1922109e2cb9e66c08b6340103757f4d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jun 2026 20:22:18 +0800 Subject: [PATCH 34/38] research(autoconfig): add QNN GPU sweep script (catalog_gpu_sweep.py) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit QNN GPU sweep differs from NPU: - No quantization (gpu-004: QDQ hangs on GPU EP) - No compile (gpu-003: EPContext regresses ~34% on GPU) - No nhwc-transformer (gpu-002: Adreno X1-85 does not benefit) - CV gating IS reliable (no DVFS noise unlike NPU) - opset 21 previously untested — explicitly validated via h3 (gpu-006) Hypothesis matrix (13 total, h0-h12): h0-h3: opset 17/17-explicit/19/21 baselines (FP32, no quant) h4-h8: targeted fusions from graph analysis matmul_transpose, attention, bias_softmax, layer_norm, skip_layer_norm h9-h10: bundled combinations (opset21+attention, ln+skip_ln+matmul_tp) h11: gelu_fusion explicit (tests gpu-005 stability on non-ConvNext) h12: transpose_optimizer Models: 8 catalog + 3 recipe (rad-dino, tinyroberta-squad2, bge-small) Sweep is queued to auto-start after NPU h6-h10 finishes (run_gpu_sweep.bat polls h6h10_sweep.log for completion marker) Supports --only-hypotheses and --reuse-h0-config for delta sweeps. --- research/autoconfig/catalog_gpu_sweep.py | 739 +++++++++++++++++++++++ 1 file changed, 739 insertions(+) create mode 100644 research/autoconfig/catalog_gpu_sweep.py diff --git a/research/autoconfig/catalog_gpu_sweep.py b/research/autoconfig/catalog_gpu_sweep.py new file mode 100644 index 000000000..f9563f214 --- /dev/null +++ b/research/autoconfig/catalog_gpu_sweep.py @@ -0,0 +1,739 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""catalog_gpu_sweep.py — QNN GPU optimization hypothesis sweep for winml catalog models. + +QNN GPU differs fundamentally from QNN NPU: + - NO quantization (gpu-004: QDQ graphs hang on QNN GPU EP) + - NO compile (gpu-003: EPContext compilation regresses ~34% on GPU) + - NO nhwc-transformer (gpu-002: Adreno X1-85 does not benefit) + - CV gating IS reliable on GPU (no DVFS noise unlike NPU) + - All findings from gpu-001..006 are ConvNext-specific — transformer fusions + (attention, matmul_add, layer_norm) are UNTESTED and may help + +Hypothesis matrix (per model): + h0: baseline FP32 (auto-config, no quant, no compile) + h1: opset 17 explicit + h2: opset 19 + h3: opset 21 ← tests gpu-006 (unknown territory) + + Transformer/attention fusions (graph-analysis-driven): + h4: opset 17 + matmul_transpose_fusion (24-36× on transformer optimized.onnx) + h5: opset 17 + attention_fusion + h6: opset 17 + bias_softmax_fusion (12× on BERT-family) + h7: opset 17 + layer_norm_fusion + h8: opset 17 + skip_layer_norm_fusion + + Combined bundles: + h9: opset 21 + matmul_transpose_fusion + attention_fusion + h10: opset 17 + layer_norm_fusion + skip_layer_norm_fusion + matmul_transpose_fusion + h11: opset 17 + gelu_fusion (already in autoconf baseline; test stability benefit — gpu-005) + + Layout (Conv-heavy models only): + h12: opset 17 + transpose_optimizer + +2-phase bench (CV-gated, GPU is stable unlike NPU): + Phase A: 200-iter screen, CV < 15% required. + Phase B: 2 sessions × 300 iters, 5s cool-down. + KEEP criterion: median p50 >= 5% improvement AND CV < 5%. + +Results: catalog-gpu-sweep//results.json +Summary: catalog-gpu-sweep/SUMMARY.md +""" + +from __future__ import annotations + +import argparse +import copy +import json +import subprocess +import sys +import time +from datetime import datetime +from pathlib import Path + + +sys.stdout.reconfigure(encoding="utf-8", errors="replace") # type: ignore[attr-defined] + +# ── constants ───────────────────────────────────────────────────────────────── +BASE_DIR = Path(__file__).parent +WINML = str(BASE_DIR / ".venv" / "Scripts" / "winml.exe") +EP = "qnn" +DEVICE = "gpu" +RESULTS_DIR = BASE_DIR / "catalog-gpu-sweep" + +SCREEN_WARMUP = 20 +SCREEN_ITERS = 200 +SCREEN_CV_MAX = 0.15 # GPU is CV-stable, unlike NPU + +FULL_WARMUP = 20 +FULL_ITERS = 300 +FULL_SESSIONS = 2 +COOL_DOWN_S = 5 # GPU cools faster than NPU HTP + +MIN_IMPROVEMENT_PCT = 5.0 # % gain required to declare KEEP + +BUILD_TIMEOUT_S = 10 * 60 +BENCH_TIMEOUT_S = 5 * 60 + +# gpu-004: no quantization allowed +# gpu-003: no compile +GPU_NO_QUANT = True +GPU_NO_COMPILE = True + +# Hypotheses: (id, label, opset_override, extra_optim) +# extra_optim=None → keep auto-config optim unchanged +# extra_optim=dict → merge ON TOP of auto-config optim +HYPOTHESES = [ + ("h0", "baseline FP32 (no quant, no compile)", None, None), + ("h1", "opset 17 explicit", 17, None), + ("h2", "opset 19", 19, None), + ("h3", "opset 21 (tests gpu-006)", 21, None), + # ── transformer/attention fusions (graph-analysis-driven) ────────────── + ("h4", "opset 17 + matmul_transpose_fusion", 17, {"matmul_transpose_fusion": True}), + ("h5", "opset 17 + attention_fusion", 17, {"attention_fusion": True}), + ("h6", "opset 17 + bias_softmax_fusion", 17, {"bias_softmax_fusion": True}), + ( + "h7", + "opset 17 + layer_norm_fusion", + 17, + {"layer_norm_fusion": True}, + ), + ( + "h8", + "opset 17 + skip_layer_norm_fusion", + 17, + {"skip_layer_norm_fusion": True}, + ), + # ── combined bundles ──────────────────────────────────────────────────── + ( + "h9", + "opset 21 + matmul_transpose + attention_fusion", + 21, + {"matmul_transpose_fusion": True, "attention_fusion": True}, + ), + ( + "h10", + "opset 17 + ln + skip_ln + matmul_transpose", + 17, + { + "layer_norm_fusion": True, + "skip_layer_norm_fusion": True, + "matmul_transpose_fusion": True, + }, + ), + # ── gelu stability (gpu-005) ──────────────────────────────────────────── + # gelu_fusion is already in autoconf defaults, but test explicitly + # to confirm p90/std stability benefit on non-ConvNext models + ("h11", "opset 17 + gelu_fusion explicit", 17, {"gelu_fusion": True}), + # ── layout ────────────────────────────────────────────────────────────── + ("h12", "opset 17 + transpose_optimizer", 17, {"transpose_optimizer": True}), +] + +# Catalog models (same as NPU sweep + recipe models) +ALL_MODELS: list[tuple[str, str, str]] = [ + # Catalog 8 + ("microsoft/resnet-18", "image-classification", "resnet"), + ("google/vit-base-patch16-224", "image-classification", "vit"), + ("apple/mobilevit-small", "image-classification", "mobilevit"), + ("facebook/dinov2-small", "image-feature-extraction", "dinov2"), + ("hustvl/yolos-small", "object-detection", "yolos"), + ( + "distilbert/distilbert-base-uncased-finetuned-sst-2-english", + "text-classification", + "distilbert", + ), + ("sentence-transformers/all-MiniLM-L6-v2", "sentence-similarity", "bert"), + ("deepset/roberta-base-squad2", "question-answering", "roberta"), + # Recipe models (from winml-cli examples/recipes) + ("microsoft/rad-dino", "image-feature-extraction", "dinov2"), + ("deepset/tinyroberta-squad2", "question-answering", "roberta"), + ("BAAI/bge-small-en-v1.5", "sentence-similarity", "bert"), +] + + +# ── low-level helpers ───────────────────────────────────────────────────────── + + +def run_cmd(cmd: list[str], label: str = "", timeout: int = 600) -> tuple[int, str, float]: + """Run a command; return (returncode, combined_output, elapsed_s).""" + t0 = time.time() + print(f" >> {label or cmd[1]}", flush=True) + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + timeout=timeout, + ) + elapsed = time.time() - t0 + tag = "ok" if result.returncode == 0 else f"rc={result.returncode}" + print(f" {elapsed:.0f}s [{tag}]", flush=True) + if result.returncode != 0: + stderr = result.stderr.strip() + if stderr: + print(f" stderr: {stderr[:200]}", flush=True) + return result.returncode, result.stdout + result.stderr, elapsed + except subprocess.TimeoutExpired: + elapsed = time.time() - t0 + print(f" TIMEOUT ({elapsed:.0f}s)", flush=True) + return -1, "TIMEOUT", elapsed + + +def _get_p50(perf_json: Path) -> float | None: + try: + d = json.loads(perf_json.read_text(encoding="utf-8")) + return float(d.get("p50_ms") or d.get("p50") or 0) or None + except Exception: + return None + + +def _get_cv(perf_json: Path) -> float | None: + try: + d = json.loads(perf_json.read_text(encoding="utf-8")) + return float(d.get("cv_pct") or d.get("cv") or d.get("std_ms", 0)) or None + except Exception: + return None + + +# ── config helpers ──────────────────────────────────────────────────────────── + + +def _patch_for_gpu(cfg: dict) -> dict: + """Strip quantization and compile from a base config for GPU EP.""" + cfg = copy.deepcopy(cfg) + cfg["quant"] = None + cfg["compile"] = None + # Remove nhwc-transformer (gpu-002) + optim = cfg.get("optim") or {} + optim.pop("nhwc_transformer", None) + cfg["optim"] = optim + return cfg + + +def get_base_config(model_id: str, task: str, model_type: str) -> dict | None: + """Call winml config for GPU EP and return the parsed config.""" + tmp_dir = RESULTS_DIR / "_tmp_config" + tmp_dir.mkdir(parents=True, exist_ok=True) + cfg_out = tmp_dir / f"{model_id.replace('/', '--')}_gpu.json" + + rc, out, _ = run_cmd( + [ + WINML, + "config", + "--model", + model_id, + "--task", + task, + "--ep", + EP, + "--device", + DEVICE, + "--model-type", + model_type, + "--output", + str(cfg_out), + ], + label="winml config --ep qnn --device gpu", + timeout=300, + ) + if rc != 0 or not cfg_out.exists(): + # Try without --output (some versions write to stdout) + for line in out.splitlines(): + line = line.strip() + if line.startswith("{"): + try: + cfg = json.loads(line) + return _patch_for_gpu(cfg) + except Exception: + pass + return None + + cfg = json.loads(cfg_out.read_text(encoding="utf-8")) + return _patch_for_gpu(cfg) + + +def make_hypothesis_config( + base_config: dict, opset_override: int | None, extra_optim: dict | None +) -> dict: + """Apply opset + extra_optim on top of base config.""" + cfg = copy.deepcopy(base_config) + if opset_override is not None: + cfg.setdefault("export", {})["opset_version"] = opset_override + if extra_optim: + existing = cfg.get("optim") or {} + cfg["optim"] = {**existing, **extra_optim} + return cfg + + +# ── build + bench ───────────────────────────────────────────────────────────── + + +def run_build(model_id: str, cfg_path: Path, out_dir: Path) -> tuple[bool, str]: + """winml build --no-quant --no-compile. Returns (ok, output).""" + rc, out, _ = run_cmd( + [ + WINML, + "build", + "-m", + model_id, + "-c", + str(cfg_path), + "-o", + str(out_dir), + "--ep", + EP, + "--device", + DEVICE, + "--no-quant", + "--no-compile", + ], + label="winml build", + timeout=BUILD_TIMEOUT_S, + ) + return rc == 0, out + + +def run_perf_screen(onnx_path: Path, out_json: Path) -> tuple[float | None, float | None]: + """Phase A: 200-iter screen. Returns (p50_ms, cv).""" + rc, out, _ = run_cmd( + [ + WINML, + "perf", + "-m", + str(onnx_path), + "--ep", + EP, + "--device", + DEVICE, + "--warmup", + str(SCREEN_WARMUP), + "--iterations", + str(SCREEN_ITERS), + "--output-json", + str(out_json), + ], + label="perf screen (200 iters)", + timeout=BENCH_TIMEOUT_S, + ) + if rc != 0 or not out_json.exists(): + return None, None + p50 = _get_p50(out_json) + cv = _get_cv(out_json) + if p50: + print(f" screen: p50={p50:.2f}ms CV={cv:.3f}", flush=True) + return p50, cv + + +def run_perf_full(onnx_path: Path, hyp_dir: Path) -> list[float]: + """Phase B: 2 × 300-iter sessions. Returns list of p50 values.""" + p50s = [] + for s in range(1, FULL_SESSIONS + 1): + out_json = hyp_dir / f"full_s{s}.json" + rc, out, _ = run_cmd( + [ + WINML, + "perf", + "-m", + str(onnx_path), + "--ep", + EP, + "--device", + DEVICE, + "--warmup", + str(FULL_WARMUP), + "--iterations", + str(FULL_ITERS), + "--output-json", + str(out_json), + ], + label=f"perf full s{s}/{FULL_SESSIONS} ({FULL_ITERS} iters)", + timeout=BENCH_TIMEOUT_S, + ) + p50 = _get_p50(out_json) if rc == 0 and out_json.exists() else None + if p50: + print(f" full s{s}: p50={p50:.2f}ms", flush=True) + p50s.append(p50) + if s < FULL_SESSIONS: + print(f" cool-down {COOL_DOWN_S}s…", flush=True) + time.sleep(COOL_DOWN_S) + return p50s + + +# ── sweep logic ─────────────────────────────────────────────────────────────── + + +def sweep_model( + model_id: str, + task: str, + model_type: str, + only_hyp_ids: "set[str] | None" = None, + reuse_h0_config: bool = False, +) -> dict: + """Run GPU hypotheses for one model. Returns results dict.""" + model_slug = model_id.replace("/", "--") + model_dir = RESULTS_DIR / model_slug + model_dir.mkdir(parents=True, exist_ok=True) + + # Resume from partial run + results_path = model_dir / "results.json" + if only_hyp_ids and results_path.exists(): + try: + results = json.loads(results_path.read_text(encoding="utf-8")) + print(" [resume] loaded existing results", flush=True) + except Exception: + results = {} + else: + results = {} + + results.update( + { + "model_id": model_id, + "task": task, + "model_type": model_type, + "timestamp": datetime.now().isoformat(timespec="seconds"), + "ep": EP, + "device": DEVICE, + } + ) + results.setdefault("baseline_opset", None) + results.setdefault("hypotheses", {}) + results.setdefault("best_hypothesis", None) + results.setdefault("baseline_p50_ms", None) + results.setdefault("best_p50_ms", None) + results.setdefault("best_gain_pct", None) + results.setdefault("opset21_gain_pct", None) # tests gpu-006 + results.setdefault("feature_gaps", []) + results.setdefault("errors", []) + + print(f"\n{'=' * 64}", flush=True) + print(f" SWEEP [GPU]: {model_id} [{task}]", flush=True) + if only_hyp_ids: + print(f" (delta — only: {sorted(only_hyp_ids)})", flush=True) + print(f"{'=' * 64}", flush=True) + + # ── Step 1: base config ──────────────────────────────────────────────── + print("\n[1/3] Generating base config…", flush=True) + base_config = None + + if reuse_h0_config: + h0_cfg = model_dir / "h0" / "build_config.json" + if h0_cfg.exists(): + try: + base_config = json.loads(h0_cfg.read_text(encoding="utf-8")) + print(" [reuse] h0 config loaded", flush=True) + except Exception: + pass + + if base_config is None: + base_config = get_base_config(model_id, task, model_type) + + if base_config is None: + results["errors"].append("base config generation failed") + _save_results(results, model_dir) + return results + + baseline_opset = (base_config.get("export") or {}).get("opset_version", "?") + results["baseline_opset"] = baseline_opset + print(f" baseline opset={baseline_opset} quant=NONE (GPU EP) compile=NONE", flush=True) + + # ── Step 2: hypothesis loop ──────────────────────────────────────────── + print(f"\n[2/3] Running {len(HYPOTHESES)} hypotheses…", flush=True) + + baseline_p50: float | None = results.get("baseline_p50_ms") + + for hyp_id, label, opset_override, extra_optim in HYPOTHESES: + if only_hyp_ids is not None and hyp_id not in only_hyp_ids: + continue + + sep = "─" * 56 + print(f"\n{sep}", flush=True) + print(f" {hyp_id}: {label}", flush=True) + print(f"{sep}", flush=True) + + hyp_config = make_hypothesis_config(base_config, opset_override, extra_optim) + opset_used = (hyp_config.get("export") or {}).get("opset_version", "?") + print(f" opset={opset_used} extra_optim={extra_optim}", flush=True) + + hyp_dir = model_dir / hyp_id + hyp_dir.mkdir(parents=True, exist_ok=True) + cfg_path = hyp_dir / "build_config.json" + cfg_path.write_text(json.dumps(hyp_config, indent=2), encoding="utf-8") + + # Build + build_ok, build_out = run_build(model_id, cfg_path, hyp_dir) + if not build_ok: + results["hypotheses"][hyp_id] = { + "status": "BUILD_FAIL", + "label": label, + "opset": opset_used, + "build_error": build_out[-300:] if build_out else "", + } + results["errors"].append(f"{hyp_id}: BUILD_FAIL") + continue + + # Find output ONNX + onnx_path = hyp_dir / "model.onnx" + if not onnx_path.exists(): + candidates = list(hyp_dir.glob("*.onnx")) + if candidates: + onnx_path = candidates[0] + else: + results["hypotheses"][hyp_id] = {"status": "NO_ONNX", "label": label} + results["errors"].append(f"{hyp_id}: build OK but no ONNX") + continue + + # Phase A: screen + screen_json = hyp_dir / "screen_perf.json" + screen_p50, screen_cv = run_perf_screen(onnx_path, screen_json) + + if screen_p50 is None: + results["hypotheses"][hyp_id] = {"status": "BENCH_FAIL", "label": label} + results["errors"].append(f"{hyp_id}: screen bench failed") + continue + + if screen_cv is not None and screen_cv > SCREEN_CV_MAX: + print( + f" [warn] high CV={screen_cv:.3f} on GPU (unusual) — proceeding anyway", flush=True + ) + + # Phase B: full bench + p50s = run_perf_full(onnx_path, hyp_dir) + if not p50s: + results["hypotheses"][hyp_id] = { + "status": "BENCH_FAIL", + "label": label, + "screen_p50_ms": screen_p50, + } + continue + + median_p50 = sorted(p50s)[len(p50s) // 2] + + hyp_data: dict = { + "status": "OK", + "label": label, + "opset": opset_used, + "extra_optim": extra_optim, + "screen_p50_ms": screen_p50, + "screen_cv": screen_cv, + "full_p50s_ms": p50s, + "median_p50_ms": median_p50, + } + + # Track baseline + if hyp_id == "h0": + baseline_p50 = median_p50 + results["baseline_p50_ms"] = baseline_p50 + print(f" [baseline] p50={baseline_p50:.2f}ms", flush=True) + + # Compare to baseline + if baseline_p50 and hyp_id != "h0": + gain_pct = (baseline_p50 - median_p50) / baseline_p50 * 100 + hyp_data["gain_vs_baseline_pct"] = round(gain_pct, 2) + verdict = ( + "KEEP" + if gain_pct >= MIN_IMPROVEMENT_PCT + else ("MARGINAL" if gain_pct > 0 else "DISCARD") + ) + hyp_data["verdict"] = verdict + print( + f" [{verdict}] gain={gain_pct:+.1f}% ({baseline_p50:.2f}ms → {median_p50:.2f}ms)", + flush=True, + ) + + # Track best + best_p50 = results.get("best_p50_ms") + if best_p50 is None or median_p50 < best_p50: + if gain_pct >= MIN_IMPROVEMENT_PCT: + results["best_p50_ms"] = median_p50 + results["best_hypothesis"] = hyp_id + results["best_gain_pct"] = round(gain_pct, 2) + + # gpu-006: track opset21 result + if opset_override == 21 and extra_optim is None: + results["opset21_gain_pct"] = round(gain_pct, 2) + else: + hyp_data["verdict"] = "BASELINE" + + results["hypotheses"][hyp_id] = hyp_data + + # ── Step 3: finalise ─────────────────────────────────────────────────── + _post_process(results) + _save_results(results, model_dir) + return results + + +def _post_process(results: dict) -> None: + """Print summary and add cross-hypothesis notes.""" + hyps = results.get("hypotheses", {}) + baseline_p50 = results.get("baseline_p50_ms") + if not baseline_p50: + return + + keeps = [(hid, h) for hid, h in hyps.items() if h.get("verdict") == "KEEP"] + if keeps: + print(f"\n ✓ KEEP verdicts: {[h[0] for h in keeps]}", flush=True) + else: + print("\n No improvements found above 5% threshold.", flush=True) + + # gpu-006 summary + opset21 = results.get("opset21_gain_pct") + if opset21 is not None: + if opset21 >= 5: + print(f" [gpu-006] opset21 HELPS GPU: +{opset21:.1f}%", flush=True) + elif opset21 <= -5: + print(f" [gpu-006] opset21 HURTS GPU: {opset21:.1f}%", flush=True) + else: + print(f" [gpu-006] opset21 NEUTRAL on GPU: {opset21:.1f}%", flush=True) + + +def _save_results(results: dict, model_dir: Path) -> None: + out = model_dir / "results.json" + out.write_text(json.dumps(results, indent=2, ensure_ascii=False), encoding="utf-8") + print(f" Results: {out}", flush=True) + + +# ── summary writer ──────────────────────────────────────────────────────────── + + +def write_summary(all_results: list[dict]) -> None: + lines = [ + "# QNN GPU Optimization Sweep — Catalog Models", + "", + f"Generated: {datetime.now().isoformat(timespec='seconds')} ", + f"EP: `{EP}` / device: `{DEVICE}` ", + f"Protocol: screen {SCREEN_ITERS} iters (CV<{SCREEN_CV_MAX * 100:.0f}%)," + f" full {FULL_ITERS}×{FULL_SESSIONS} sessions ", + "Constraints: NO quant (gpu-004), NO compile (gpu-003), NO nhwc (gpu-002) ", + "", + "---", + "", + "## Per-Model Results", + "", + "| Model | Baseline p50 | Best p50 | Best config | Gain% | opset21 gain% | Notes |", + "|-------|-------------|----------|-------------|-------|--------------|-------|", + ] + + for r in all_results: + model_id = r["model_id"] + baseline = f"{r['baseline_p50_ms']:.1f} ms" if r.get("baseline_p50_ms") else "N/A" + best = f"{r['best_p50_ms']:.1f} ms" if r.get("best_p50_ms") else "N/A" + best_h = r.get("best_hypothesis") or "N/A" + best_label = "" + if best_h != "N/A": + best_label = r.get("hypotheses", {}).get(best_h, {}).get("label", "") + gain = f"{r['best_gain_pct']:.1f}%" if r.get("best_gain_pct") is not None else "N/A" + opset21 = r.get("opset21_gain_pct") + opset21_str = f"{opset21:+.1f}%" if opset21 is not None else "N/A" + errors = "; ".join(r.get("errors", []))[:80] or "none" + lines.append( + f"| `{model_id}` | {baseline} | {best} | {best_h} ({best_label}) | {gain} | {opset21_str} | {errors} |" + ) + + lines += [ + "", + "## gpu-006: opset 21 on QNN GPU", + "", + "Previously untested. This sweep provides first data across multiple architectures.", + "", + ] + + opset21_helps = [r["model_id"] for r in all_results if (r.get("opset21_gain_pct") or 0) >= 5] + opset21_hurts = [r["model_id"] for r in all_results if (r.get("opset21_gain_pct") or 0) <= -5] + opset21_neutral = [ + r["model_id"] + for r in all_results + if r.get("opset21_gain_pct") is not None and -5 < (r.get("opset21_gain_pct") or 0) < 5 + ] + lines += [ + f"- **Helps (≥5%):** {', '.join(opset21_helps) or 'none'}", + f"- **Hurts (≤-5%):** {', '.join(opset21_hurts) or 'none'}", + f"- **Neutral:** {', '.join(opset21_neutral) or 'none (no data yet)'}", + "", + ] + + lines += ["## Feature Gaps", ""] + all_gaps = [ + f"- **`{r['model_id']}`**: {g}" for r in all_results for g in r.get("feature_gaps", []) + ] + lines += all_gaps if all_gaps else ["- None observed"] + + summary_path = RESULTS_DIR / "SUMMARY.md" + summary_path.write_text("\n".join(lines) + "\n", encoding="utf-8") + print(f"\n📄 Summary: {summary_path}", flush=True) + + +# ── entry point ─────────────────────────────────────────────────────────────── + + +def main() -> None: + parser = argparse.ArgumentParser( + description="QNN GPU hypothesis sweep for winml catalog models" + ) + parser.add_argument("--model", default=None) + parser.add_argument("--task", default=None) + parser.add_argument("--model-type", default="auto") + parser.add_argument( + "--only-hypotheses", default=None, help="Comma-separated h IDs, e.g. h3,h4,h9" + ) + parser.add_argument("--reuse-h0-config", action="store_true") + args = parser.parse_args() + + only_hyp_ids: set[str] | None = None + if args.only_hypotheses: + only_hyp_ids = {h.strip() for h in args.only_hypotheses.split(",")} + + RESULTS_DIR.mkdir(parents=True, exist_ok=True) + + # Confirm QNN GPU EP + print("=== Confirming QNN GPU EP ===", flush=True) + rc, out, _ = run_cmd([WINML, "sys", "--list-ep"], label="winml sys --list-ep", timeout=30) + if "qnn" not in out.lower(): + print("❌ QNN EP not detected! Aborting.", flush=True) + sys.exit(1) + print("✓ QNN EP available\n", flush=True) + + if args.model: + if not args.task: + print("Error: --task required with --model", flush=True) + sys.exit(1) + models_to_run = [(args.model, args.task, args.model_type)] + else: + models_to_run = ALL_MODELS # type: ignore[assignment] + + all_results: list[dict] = [] + + for model_id, task, model_type in models_to_run: + try: + result = sweep_model( + model_id, + task, + model_type, + only_hyp_ids=only_hyp_ids, + reuse_h0_config=args.reuse_h0_config, + ) + except Exception as exc: + print(f"\n❌ Unexpected error for {model_id}: {exc}", flush=True) + result = { + "model_id": model_id, + "task": task, + "model_type": model_type, + "errors": [f"Unexpected exception: {exc}"], + "hypotheses": {}, + "feature_gaps": [], + } + all_results.append(result) + write_summary(all_results) + + print("\n" + "=" * 64, flush=True) + print(" GPU SWEEP COMPLETE", flush=True) + print("=" * 64, flush=True) + write_summary(all_results) + + +if __name__ == "__main__": + main() From bb34c9d99ba81a2bd522c5899ec84d71acc55b0b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jun 2026 20:39:37 +0800 Subject: [PATCH 35/38] research(autoconfig): add Phase C confirmation pass to GPU + NPU sweeps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both sweeps now run additional confirmation sessions for any KEEP-level result to reduce false positives from thermal/DVFS noise: QNN GPU (catalog_gpu_sweep.py): - Phase B: 2 → 3 sessions × 300 iters (baseline) - Phase C: KEEP candidates get 2 extra confirmation sessions - All 5 sessions above MIN_IMPROVEMENT_PCT → KEEP_CONFIRMED - Fewer → MARGINAL_UNCONFIRMED (downgraded, not dropped) QNN NPU (catalog_qnn_sweep.py): - Phase B unchanged: 3 sessions × 500 iters (already robust) - Phase C: best hypothesis (gain ≥ 5%) gets 2 extra confirmation sessions - Strict criterion: max(all 5 p50s) < min(baseline p50s) → CONFIRMED - Otherwise → MARGINAL_UNCONFIRMED (ranges overlap = DVFS noise) Motivation: avoid publishing false conclusions from single-run noise. GPU is more stable (CV gating already helps) but confirmation pass gives rigour before updating ep_knowledge KB with a new finding. --- research/autoconfig/catalog_gpu_sweep.py | 132 +++++++++++++++++++++-- research/autoconfig/catalog_qnn_sweep.py | 120 +++++++++++++++++++++ 2 files changed, 246 insertions(+), 6 deletions(-) diff --git a/research/autoconfig/catalog_gpu_sweep.py b/research/autoconfig/catalog_gpu_sweep.py index f9563f214..353a127f0 100644 --- a/research/autoconfig/catalog_gpu_sweep.py +++ b/research/autoconfig/catalog_gpu_sweep.py @@ -36,7 +36,10 @@ 2-phase bench (CV-gated, GPU is stable unlike NPU): Phase A: 200-iter screen, CV < 15% required. - Phase B: 2 sessions × 300 iters, 5s cool-down. + Phase B: 3 sessions × 300 iters, 5s cool-down. + Phase C (confirmation): KEEP candidates get 2 additional sessions. + All 5 sessions must show improvement → KEEP_CONFIRMED. + Fewer than 5/5 → MARGINAL_UNCONFIRMED. KEEP criterion: median p50 >= 5% improvement AND CV < 5%. Results: catalog-gpu-sweep//results.json @@ -70,7 +73,8 @@ FULL_WARMUP = 20 FULL_ITERS = 300 -FULL_SESSIONS = 2 +FULL_SESSIONS = 3 # baseline sessions per hypothesis +CONFIRM_SESSIONS = 2 # extra sessions for KEEP candidates (Phase C) COOL_DOWN_S = 5 # GPU cools faster than NPU HTP MIN_IMPROVEMENT_PCT = 5.0 # % gain required to declare KEEP @@ -561,12 +565,120 @@ def sweep_model( results["hypotheses"][hyp_id] = hyp_data + # ── Step 2b: Phase C — confirmation runs for KEEP candidates ────────────── + _run_confirmation_pass(results, model_dir, baseline_p50) + # ── Step 3: finalise ─────────────────────────────────────────────────── _post_process(results) _save_results(results, model_dir) return results +def _run_confirmation_pass(results: dict, model_dir: Path, baseline_p50: float | None) -> None: + """Phase C: re-run CONFIRM_SESSIONS additional sessions for every KEEP candidate. + + If all (FULL_SESSIONS + CONFIRM_SESSIONS) sessions show >= MIN_IMPROVEMENT_PCT: + verdict stays KEEP_CONFIRMED. + Otherwise downgrade to MARGINAL_UNCONFIRMED. + """ + if not baseline_p50: + return + hyps = results.get("hypotheses", {}) + keep_ids = [hid for hid, h in hyps.items() if h.get("verdict") == "KEEP"] + if not keep_ids: + return + + print( + f"\n ── Phase C: confirming {keep_ids} ({CONFIRM_SESSIONS} extra sessions each) ──", + flush=True, + ) + + for hyp_id in keep_ids: + hyp_data = hyps[hyp_id] + onnx_path: Path | None = None + hyp_dir = model_dir / hyp_id + + # Find built ONNX from the hypothesis dir + for candidate in (hyp_dir / "optimized.onnx", hyp_dir / "quantized.onnx"): + if candidate.exists(): + onnx_path = candidate + break + if onnx_path is None: + print(f" [confirm] {hyp_id}: no onnx found, skipping", flush=True) + continue + + print(f" [confirm] {hyp_id} ({hyp_data['label']})", flush=True) + extra_p50s: list[float] = [] + for s in range(1, CONFIRM_SESSIONS + 1): + out_json = hyp_dir / f"confirm_s{s}.json" + rc, _, _ = run_cmd( + [ + WINML, + "perf", + "-m", + str(onnx_path), + "--ep", + EP, + "--device", + DEVICE, + "--warmup", + str(FULL_WARMUP), + "--iterations", + str(FULL_ITERS), + "--output-json", + str(out_json), + ], + label=f"confirm s{s}/{CONFIRM_SESSIONS}", + timeout=BENCH_TIMEOUT_S, + ) + p50 = _get_p50(out_json) if rc == 0 and out_json.exists() else None + if p50: + print(f" confirm s{s}: p50={p50:.2f}ms", flush=True) + extra_p50s.append(p50) + if s < CONFIRM_SESSIONS: + time.sleep(COOL_DOWN_S) + + if not extra_p50s: + print(f" [confirm] {hyp_id}: confirm bench failed, keeping KEEP", flush=True) + continue + + all_p50s: list[float] = hyp_data.get("full_p50s_ms", []) + extra_p50s + overall_median = sorted(all_p50s)[len(all_p50s) // 2] + overall_gain = (baseline_p50 - overall_median) / baseline_p50 * 100 + wins = sum( + 1 for p in all_p50s if (baseline_p50 - p) / baseline_p50 * 100 >= MIN_IMPROVEMENT_PCT + ) + + hyp_data["confirm_p50s_ms"] = extra_p50s + hyp_data["all_p50s_ms"] = all_p50s + hyp_data["overall_median_p50_ms"] = round(overall_median, 3) + hyp_data["overall_gain_pct"] = round(overall_gain, 2) + hyp_data["sessions_above_threshold"] = wins + hyp_data["total_sessions"] = len(all_p50s) + + if wins == len(all_p50s): + hyp_data["verdict"] = "KEEP_CONFIRMED" + print( + f" [KEEP_CONFIRMED] {hyp_id}: {wins}/{len(all_p50s)} sessions ≥ {MIN_IMPROVEMENT_PCT}%," + f" overall gain={overall_gain:+.1f}%", + flush=True, + ) + else: + hyp_data["verdict"] = "MARGINAL_UNCONFIRMED" + print( + f" [MARGINAL_UNCONFIRMED] {hyp_id}: only {wins}/{len(all_p50s)} sessions above threshold", + flush=True, + ) + + # Update best_hypothesis tracking + if hyp_data["verdict"] == "KEEP_CONFIRMED": + best_p50 = results.get("best_p50_ms") + if best_p50 is None or overall_median < best_p50: + results["best_p50_ms"] = overall_median + results["best_hypothesis"] = hyp_id + results["best_gain_pct"] = round(overall_gain, 2) + + def _post_process(results: dict) -> None: """Print summary and add cross-hypothesis notes.""" hyps = results.get("hypotheses", {}) @@ -574,10 +686,18 @@ def _post_process(results: dict) -> None: if not baseline_p50: return - keeps = [(hid, h) for hid, h in hyps.items() if h.get("verdict") == "KEEP"] + keeps = [(hid, h) for hid, h in hyps.items() if h.get("verdict") in ("KEEP", "KEEP_CONFIRMED")] + unconfirmed = [ + (hid, h) for hid, h in hyps.items() if h.get("verdict") == "MARGINAL_UNCONFIRMED" + ] if keeps: - print(f"\n ✓ KEEP verdicts: {[h[0] for h in keeps]}", flush=True) - else: + print(f"\n ✓ KEEP/KEEP_CONFIRMED: {[h[0] for h in keeps]}", flush=True) + if unconfirmed: + print( + f" ⚠ MARGINAL_UNCONFIRMED (failed confirmation): {[h[0] for h in unconfirmed]}", + flush=True, + ) + if not keeps and not unconfirmed: print("\n No improvements found above 5% threshold.", flush=True) # gpu-006 summary @@ -607,7 +727,7 @@ def write_summary(all_results: list[dict]) -> None: f"Generated: {datetime.now().isoformat(timespec='seconds')} ", f"EP: `{EP}` / device: `{DEVICE}` ", f"Protocol: screen {SCREEN_ITERS} iters (CV<{SCREEN_CV_MAX * 100:.0f}%)," - f" full {FULL_ITERS}×{FULL_SESSIONS} sessions ", + f" full {FULL_ITERS}×{FULL_SESSIONS} sessions + {CONFIRM_SESSIONS} confirm sessions for KEEP ", "Constraints: NO quant (gpu-004), NO compile (gpu-003), NO nhwc (gpu-002) ", "", "---", diff --git a/research/autoconfig/catalog_qnn_sweep.py b/research/autoconfig/catalog_qnn_sweep.py index 081b49b3a..75903ed5d 100644 --- a/research/autoconfig/catalog_qnn_sweep.py +++ b/research/autoconfig/catalog_qnn_sweep.py @@ -72,6 +72,7 @@ FULL_WARMUP = 50 FULL_ITERS = 500 FULL_SESSIONS = 3 +CONFIRM_SESSIONS = 2 # extra sessions for best hypothesis (Phase C confirmation) COOL_DOWN_S = 30 MODEL_TIMEOUT_S = 180 * 60 # 3 hours per model — 6 hypotheses × ~30min each @@ -728,10 +729,129 @@ def sweep_model( # ── Step 3: compute summary stats ───────────────────────────────────────── print("\n[3/3] Computing summary stats…", flush=True) _compute_summary(results) + + # ── Step 3b: Phase C — confirm the best hypothesis with 2 extra sessions ── + _run_confirmation_pass_npu(results, model_dir) + _save_results(results, model_dir) return results +def _run_confirmation_pass_npu(results: dict, model_dir: Path) -> None: + """Phase C: run CONFIRM_SESSIONS extra sessions on the best hypothesis. + + For NPU (high DVFS noise), uses range-non-overlap criterion: + - All (FULL_SESSIONS + CONFIRM_SESSIONS) p50s < baseline_min → CONFIRMED + - Otherwise → MARGINAL_UNCONFIRMED, best_gain_pct flagged as uncertain + """ + best_h_id: str | None = results.get("best_hypothesis") + baseline_p50: float | None = results.get("baseline_p50_ms") + if not best_h_id or not baseline_p50: + return + + best_hyp = results["hypotheses"].get(best_h_id, {}) + best_gain = results.get("best_gain_pct", 0.0) + if best_gain < 5.0: + return # nothing worth confirming + + # Find ONNX + hyp_dir = model_dir / best_h_id + onnx_path: Path | None = None + for candidate in (hyp_dir / "quantized.onnx", hyp_dir / "optimized.onnx"): + if candidate.exists(): + onnx_path = candidate + break + if onnx_path is None: + return + + print( + f"\n ── Phase C: confirming best hypothesis {best_h_id} ({CONFIRM_SESSIONS} extra sessions) ──", + flush=True, + ) + + confirm_p50s: list[float] = [] + for s in range(1, CONFIRM_SESSIONS + 1): + out_json = hyp_dir / f"confirm_s{s}.json" + rc, _, _ = run_cmd( + [ + WINML, + "perf", + "-m", + str(onnx_path), + "--ep", + EP, + "--device", + DEVICE, + "--warmup", + str(FULL_WARMUP), + "--iterations", + str(FULL_ITERS), + "-o", + str(out_json), + ], + label=f"confirm s{s}/{CONFIRM_SESSIONS}", + timeout=BENCH_TIMEOUT_S, + ) + if rc == 0 and out_json.exists(): + try: + data = json.loads(out_json.read_text()) + lat = data["latency_ms"] + p50 = lat["p50"] + print(f" confirm s{s}: p50={p50:.2f}ms", flush=True) + confirm_p50s.append(p50) + except Exception as e: + print(f" [warn] confirm s{s} parse error: {e}", flush=True) + if s < CONFIRM_SESSIONS: + print(f" cool-down {COOL_DOWN_S}s…", flush=True) + time.sleep(COOL_DOWN_S) + + if not confirm_p50s: + print(f" [confirm] {best_h_id}: confirm bench failed, conclusion unchanged", flush=True) + return + + # Get all p50s including prior FULL_SESSIONS runs + prior_p50s: list[float] = best_hyp.get("full", {}).get("p50s_ms", []) + all_p50s = prior_p50s + confirm_p50s + + # Baseline comparison: use h0/h1 p50s for range overlap test + baseline_h = None + for h_id in ("h0", "h1"): + h = results["hypotheses"].get(h_id, {}) + if h.get("status") in ("OK", "OK_HIGH_CV"): + baseline_h = h + break + baseline_p50s: list[float] = ( + baseline_h["full"].get("p50s_ms", [baseline_p50]) if baseline_h else [baseline_p50] + ) + + overall_median = float(sorted(all_p50s)[len(all_p50s) // 2]) + overall_gain = (baseline_p50 - overall_median) / baseline_p50 * 100 + # Strict: max of all best-hypothesis sessions must be < min of baseline sessions + ranges_confirmed = max(all_p50s) < min(baseline_p50s) if baseline_p50s else False + + best_hyp["confirm_p50s_ms"] = [round(p, 3) for p in confirm_p50s] + best_hyp["all_p50s_ms"] = [round(p, 3) for p in all_p50s] + best_hyp["confirm_overall_median_ms"] = round(overall_median, 3) + best_hyp["confirm_overall_gain_pct"] = round(overall_gain, 2) + best_hyp["confirm_ranges_non_overlapping"] = ranges_confirmed + + if ranges_confirmed: + best_hyp["confirm_verdict"] = "CONFIRMED" + results["best_gain_pct"] = round(overall_gain, 2) + print( + f" [CONFIRMED] {best_h_id}: all {len(all_p50s)} p50s < baseline min" + f" — gain={overall_gain:+.1f}% (ranges non-overlapping)", + flush=True, + ) + else: + best_hyp["confirm_verdict"] = "MARGINAL_UNCONFIRMED" + print( + f" [MARGINAL_UNCONFIRMED] {best_h_id}: max={max(all_p50s):.1f}ms" + f" ≥ baseline min={min(baseline_p50s):.1f}ms — DVFS noise, ranges overlap", + flush=True, + ) + + def _compute_summary(results: dict) -> None: """Fill in baseline_p50, best_hypothesis, best_gain, npu001_generalized, npu006_regression.""" hyps = results["hypotheses"] From cc25e31d730a6b0ea5168606c353618389a0a6a8 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 17 Jun 2026 21:21:21 +0800 Subject: [PATCH 36/38] fix(autoconfig): fix GPU sweep perf flag and JSON parsing - Replace --output-json with --output (correct winml perf flag) - Fix _get_p50/_get_cv to read latency_ms.p50/std keys (winml perf JSON nests metrics under 'latency_ms', not top-level) --- research/autoconfig/catalog_gpu_sweep.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/research/autoconfig/catalog_gpu_sweep.py b/research/autoconfig/catalog_gpu_sweep.py index 353a127f0..92e7f4da0 100644 --- a/research/autoconfig/catalog_gpu_sweep.py +++ b/research/autoconfig/catalog_gpu_sweep.py @@ -191,15 +191,20 @@ def run_cmd(cmd: list[str], label: str = "", timeout: int = 600) -> tuple[int, s def _get_p50(perf_json: Path) -> float | None: try: d = json.loads(perf_json.read_text(encoding="utf-8")) - return float(d.get("p50_ms") or d.get("p50") or 0) or None + lat = d.get("latency_ms", d) + return float(lat.get("p50") or 0) or None except Exception: return None def _get_cv(perf_json: Path) -> float | None: + """Return CV (std/p50). Returns None on parse error.""" try: d = json.loads(perf_json.read_text(encoding="utf-8")) - return float(d.get("cv_pct") or d.get("cv") or d.get("std_ms", 0)) or None + lat = d.get("latency_ms", d) + p50 = float(lat.get("p50") or 0) + std = float(lat.get("std") or 0) + return std / p50 if p50 > 0 else None except Exception: return None @@ -318,7 +323,7 @@ def run_perf_screen(onnx_path: Path, out_json: Path) -> tuple[float | None, floa str(SCREEN_WARMUP), "--iterations", str(SCREEN_ITERS), - "--output-json", + "--output", str(out_json), ], label="perf screen (200 iters)", @@ -352,7 +357,7 @@ def run_perf_full(onnx_path: Path, hyp_dir: Path) -> list[float]: str(FULL_WARMUP), "--iterations", str(FULL_ITERS), - "--output-json", + "--output", str(out_json), ], label=f"perf full s{s}/{FULL_SESSIONS} ({FULL_ITERS} iters)", @@ -625,7 +630,7 @@ def _run_confirmation_pass(results: dict, model_dir: Path, baseline_p50: float | str(FULL_WARMUP), "--iterations", str(FULL_ITERS), - "--output-json", + "--output", str(out_json), ], label=f"confirm s{s}/{CONFIRM_SESSIONS}", From 7b30db9b1403fa4421ca1137f421977ca069b94b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 18 Jun 2026 09:05:15 +0800 Subject: [PATCH 37/38] fix(autoconfig): add --rebuild to GPU sweep build step Without --rebuild, winml build fails when a partial export.onnx exists in the output directory (optimize step exits rc=1). --rebuild forces a clean pipeline run, which succeeds consistently. --- research/autoconfig/catalog_gpu_sweep.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/research/autoconfig/catalog_gpu_sweep.py b/research/autoconfig/catalog_gpu_sweep.py index 92e7f4da0..7bfeab75f 100644 --- a/research/autoconfig/catalog_gpu_sweep.py +++ b/research/autoconfig/catalog_gpu_sweep.py @@ -283,7 +283,7 @@ def make_hypothesis_config( def run_build(model_id: str, cfg_path: Path, out_dir: Path) -> tuple[bool, str]: - """winml build --no-quant --no-compile. Returns (ok, output).""" + """winml build --no-quant --no-compile --rebuild. Returns (ok, output).""" rc, out, _ = run_cmd( [ WINML, @@ -300,6 +300,7 @@ def run_build(model_id: str, cfg_path: Path, out_dir: Path) -> tuple[bool, str]: DEVICE, "--no-quant", "--no-compile", + "--rebuild", ], label="winml build", timeout=BUILD_TIMEOUT_S, From 292d3782b33c32f0d44ee4dd148950de5995273c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 18 Jun 2026 09:54:30 +0800 Subject: [PATCH 38/38] research(autoconfig): add CPU EP sweep script (catalog_cpu_sweep.py) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 14 hypotheses covering: h0-h3: opset variants (17/19/21) — tests whether cpu-001 regression (3-4x slowdown on ConvNext) generalizes to transformer models h4-h7: transformer fusions: attention, skip_layer_norm, layer_norm, bias_softmax h8: matmul_add_fusion with cpu-002 Gemm guard (auto-skipped if model has Gemm) h9: matmul_transpose_fusion h10: bundled transformer pack (attention + skip_ln + layer_norm) h11: nchwc_transformer (Conv-heavy vision models) h12: transpose_optimizer h13: gelu_fusion explicit DML EP note: not available on this machine (only CPU + QNN). Sweep queued to start after GPU retry finishes. --- research/autoconfig/catalog_cpu_sweep.py | 827 +++++++++++++++++++++++ 1 file changed, 827 insertions(+) create mode 100644 research/autoconfig/catalog_cpu_sweep.py diff --git a/research/autoconfig/catalog_cpu_sweep.py b/research/autoconfig/catalog_cpu_sweep.py new file mode 100644 index 000000000..69a90ea59 --- /dev/null +++ b/research/autoconfig/catalog_cpu_sweep.py @@ -0,0 +1,827 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""catalog_cpu_sweep.py — WinML CPU EP optimization sweep across catalog + recipe models. + +Sweeps graph-optimization flags for CPU EP to find improvement opportunities beyond +autoconf defaults. Based on patterns detected by analyze_insight.py (30+ fusion candidates). + +Key CPU constraints from ep_knowledge/cpu.json: + cpu-001: opset 19+ REGRESSES on CPU (3-4x slowdown, Transpose Optimizer bypass) + → h3/h4 included deliberately to test on transformer models (cpu-001 was ConvNext only) + cpu-002: matmul_add_fusion REGRESSES if model already has Gemm ops + → guarded by Gemm check before applying + cpu-003: transpose_optimizer is neutral on ConvNext (may help transformers) + cpu-004: nchwc_transformer neutral on Gemm-heavy models + cpu-005: baseline is optimal for ConvNext — transformers untested + +Phase A: 200-iter screen, CV < 10% required (CPU is thermally stable). +Phase B: 3 sessions × 300 iters, 2s cool-down. +Phase C (confirmation): best hypothesis + 2 extra sessions. + All 5 p50s < baseline_min → CONFIRMED. +KEEP criterion: median p50 >= 5% improvement. + +Results: catalog-cpu-sweep//results.json +Summary: catalog-cpu-sweep/SUMMARY.md +""" + +from __future__ import annotations + +import argparse +import copy +import json +import subprocess +import sys +import time +from datetime import datetime +from pathlib import Path + + +sys.stdout.reconfigure(encoding="utf-8", errors="replace") # type: ignore[attr-defined] + +# ── constants ───────────────────────────────────────────────────────────────── +BASE_DIR = Path(__file__).parent +WINML = str(BASE_DIR / ".venv" / "Scripts" / "winml.exe") +EP = "cpu" +DEVICE = "cpu" +RESULTS_DIR = BASE_DIR / "catalog-cpu-sweep" + +SCREEN_WARMUP = 10 +SCREEN_ITERS = 200 +SCREEN_CV_MAX = 0.10 # CPU is stable — stricter than QNN + +FULL_WARMUP = 10 +FULL_ITERS = 300 +FULL_SESSIONS = 3 +CONFIRM_SESSIONS = 2 # Phase C: extra sessions for best hypothesis +COOL_DOWN_S = 2 # CPU cools quickly + +MIN_IMPROVEMENT_PCT = 5.0 + +BUILD_TIMEOUT_S = 10 * 60 +BENCH_TIMEOUT_S = 8 * 60 + +# Gemm threshold: if model has Gemm ops, skip matmul_add_fusion (cpu-002) +GEMM_SAFE_MATMUL_ADD = False # Conservative default; overridden per model + +# Hypotheses: (id, label, opset_override, extra_optim, skip_if_gemm) +# skip_if_gemm=True → skip if model.onnx already contains Gemm nodes (cpu-002 guard) +HYPOTHESES: list[tuple[str, str, int | None, dict | None, bool]] = [ + # ── Opset variants ───────────────────────────────────────────────────── + ("h0", "baseline (opset 17, autoconf defaults)", None, None, False), + ("h1", "opset 17 explicit", 17, None, False), + # cpu-001: opset 19/21 KNOWN to regress on ConvNext — included to test transformers + ("h2", "opset 19 (cpu-001 risk — transformer test)", 19, None, False), + ("h3", "opset 21 (cpu-001 risk — transformer test)", 21, None, False), + # ── Transformer fusions (graph-analysis-driven) ──────────────────────── + ("h4", "opset 17 + attention_fusion", 17, {"attention_fusion": True}, False), + ("h5", "opset 17 + skip_layer_norm_fusion", 17, {"skip_layer_norm_fusion": True}, False), + ("h6", "opset 17 + layer_norm_fusion", 17, {"layer_norm_fusion": True}, False), + ("h7", "opset 17 + bias_softmax_fusion", 17, {"bias_softmax_fusion": True}, False), + # ── MatMul fusions ───────────────────────────────────────────────────── + # matmul_add_fusion: skip if Gemm already present (cpu-002) + ("h8", "opset 17 + matmul_add_fusion (cpu-002 guarded)", 17, {"matmul_add_fusion": True}, True), + ("h9", "opset 17 + matmul_transpose_fusion", 17, {"matmul_transpose_fusion": True}, False), + # ── Transformer bundle (best flags combined) ─────────────────────────── + ( + "h10", + "opset 17 + attention + skip_layer_norm + layer_norm", + 17, + {"attention_fusion": True, "skip_layer_norm_fusion": True, "layer_norm_fusion": True}, + False, + ), + # ── Conv / layout (vision models) ───────────────────────────────────── + # nchwc_transformer: neutral on Gemm-heavy models (cpu-004), may help Conv-heavy + ( + "h11", + "opset 17 + nchwc_transformer (Conv-heavy models)", + 17, + {"nchwc_transformer": True}, + False, + ), + # ── Misc ─────────────────────────────────────────────────────────────── + ("h12", "opset 17 + transpose_optimizer", 17, {"transpose_optimizer": True}, False), + ("h13", "opset 17 + gelu_fusion explicit", 17, {"gelu_fusion": True}, False), +] + +# Catalog + recipe models (task, model_type) +ALL_MODELS = [ + ("microsoft/resnet-18", "image-classification", "resnet"), + ("apple/mobilevit-small", "image-classification", "mobilevit"), + ("facebook/dinov2-small", "image-feature-extraction", "dinov2"), + ("deepset/roberta-base-squad2", "question-answering", "roberta"), + ("deepset/tinyroberta-squad2", "question-answering", "roberta"), + ("BAAI/bge-small-en-v1.5", "sentence-similarity", "bert"), + ("sentence-transformers/all-MiniLM-L6-v2", "sentence-similarity", "bert"), + ("microsoft/rad-dino", "image-feature-extraction", "dinov2"), +] + + +# ── subprocess helpers ──────────────────────────────────────────────────────── + + +def run_cmd(cmd: list[str], label: str = "", timeout: int = 300) -> tuple[int, str, float]: + t0 = time.monotonic() + print(f" >> {label or ' '.join(cmd[:3])}", flush=True) + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=timeout, + encoding="utf-8", + errors="replace", + ) + elapsed = time.monotonic() - t0 + ok = "ok" if result.returncode == 0 else f"rc={result.returncode}" + print(f" {elapsed:.0f}s [{ok}]", flush=True) + if result.returncode != 0: + stderr = result.stderr.strip() + if stderr: + print(f" stderr: {stderr[:200]}", flush=True) + return result.returncode, result.stdout + result.stderr, elapsed + except subprocess.TimeoutExpired: + print(f" TIMEOUT after {timeout}s", flush=True) + return -1, "TIMEOUT", timeout + + +def _get_p50(perf_json: Path) -> float | None: + try: + d = json.loads(perf_json.read_text(encoding="utf-8")) + lat = d.get("latency_ms", d) + return float(lat.get("p50") or 0) or None + except Exception: + return None + + +def _get_cv(perf_json: Path) -> float | None: + try: + d = json.loads(perf_json.read_text(encoding="utf-8")) + lat = d.get("latency_ms", d) + p50 = float(lat.get("p50") or 0) + std = float(lat.get("std") or 0) + return std / p50 if p50 > 0 else None + except Exception: + return None + + +# ── config helpers ───────────────────────────────────────────────────────────── + + +def _patch_for_cpu(cfg: dict) -> dict: + """Remove quantization and compile from CPU config.""" + cfg = copy.deepcopy(cfg) + cfg["quant"] = None + cfg["compile"] = None + return cfg + + +def get_base_config(model_id: str, task: str, model_type: str) -> dict | None: + tmp_dir = RESULTS_DIR / "_tmp_config" + tmp_dir.mkdir(parents=True, exist_ok=True) + cfg_out = tmp_dir / f"{model_id.replace('/', '--')}_cpu.json" + + rc, out, _ = run_cmd( + [ + WINML, + "config", + "--model", + model_id, + "--task", + task, + "--ep", + EP, + "--device", + DEVICE, + "--model-type", + model_type, + "--output", + str(cfg_out), + ], + label=f"winml config --ep {EP}", + timeout=300, + ) + if rc != 0 or not cfg_out.exists(): + for line in out.splitlines(): + line = line.strip() + if line.startswith("{"): + try: + return _patch_for_cpu(json.loads(line)) + except Exception: + pass + return None + return _patch_for_cpu(json.loads(cfg_out.read_text(encoding="utf-8"))) + + +def make_hypothesis_config( + base_config: dict, opset_override: int | None, extra_optim: dict | None +) -> dict: + cfg = copy.deepcopy(base_config) + if opset_override is not None: + cfg.setdefault("export", {})["opset_version"] = opset_override + if extra_optim: + existing = cfg.get("optim") or {} + cfg["optim"] = {**existing, **extra_optim} + return cfg + + +def _model_has_gemm(model_onnx: Path) -> bool: + """Check if an optimized.onnx has Gemm nodes (cpu-002 guard).""" + try: + import onnx + + m = onnx.load(str(model_onnx)) + return any(n.op_type == "Gemm" for n in m.graph.node) + except Exception: + return False # Assume safe if can't check + + +# ── build + bench ────────────────────────────────────────────────────────────── + + +def run_build(model_id: str, cfg_path: Path, out_dir: Path) -> tuple[bool, str]: + """winml build --no-quant --no-compile --rebuild for CPU EP.""" + rc, out, _ = run_cmd( + [ + WINML, + "build", + "-m", + model_id, + "-c", + str(cfg_path), + "-o", + str(out_dir), + "--ep", + EP, + "--device", + DEVICE, + "--no-quant", + "--no-compile", + "--rebuild", + ], + label="winml build", + timeout=BUILD_TIMEOUT_S, + ) + return rc == 0, out + + +def run_perf_screen(onnx_path: Path, out_json: Path) -> tuple[float | None, float | None]: + rc, _, _ = run_cmd( + [ + WINML, + "perf", + "-m", + str(onnx_path), + "--ep", + EP, + "--device", + DEVICE, + "--warmup", + str(SCREEN_WARMUP), + "--iterations", + str(SCREEN_ITERS), + "--output", + str(out_json), + ], + label="perf screen (200 iters)", + timeout=BENCH_TIMEOUT_S, + ) + if rc != 0 or not out_json.exists(): + return None, None + p50 = _get_p50(out_json) + cv = _get_cv(out_json) + if p50: + print(f" screen: p50={p50:.2f}ms CV={cv:.3f}", flush=True) + return p50, cv + + +def run_perf_full(onnx_path: Path, hyp_dir: Path) -> list[float]: + p50s = [] + for s in range(1, FULL_SESSIONS + 1): + out_json = hyp_dir / f"full_s{s}.json" + rc, _, _ = run_cmd( + [ + WINML, + "perf", + "-m", + str(onnx_path), + "--ep", + EP, + "--device", + DEVICE, + "--warmup", + str(FULL_WARMUP), + "--iterations", + str(FULL_ITERS), + "--output", + str(out_json), + ], + label=f"perf full s{s}/{FULL_SESSIONS} ({FULL_ITERS} iters)", + timeout=BENCH_TIMEOUT_S, + ) + p50 = _get_p50(out_json) if rc == 0 and out_json.exists() else None + if p50: + print(f" full s{s}: p50={p50:.2f}ms", flush=True) + p50s.append(p50) + if s < FULL_SESSIONS: + print(f" cool-down {COOL_DOWN_S}s…", flush=True) + time.sleep(COOL_DOWN_S) + return p50s + + +# ── sweep logic ──────────────────────────────────────────────────────────────── + + +def sweep_model( + model_id: str, + task: str, + model_type: str, + only_hyp_ids: "set[str] | None" = None, + reuse_h0_config: bool = False, +) -> dict: + model_slug = model_id.replace("/", "--") + model_dir = RESULTS_DIR / model_slug + model_dir.mkdir(parents=True, exist_ok=True) + + results_path = model_dir / "results.json" + if only_hyp_ids and results_path.exists(): + try: + results = json.loads(results_path.read_text(encoding="utf-8")) + print(" [resume] loaded existing results", flush=True) + except Exception: + results = {} + else: + results = {} + + results.update( + { + "model_id": model_id, + "task": task, + "model_type": model_type, + "timestamp": datetime.now().isoformat(timespec="seconds"), + "ep": EP, + "device": DEVICE, + } + ) + results.setdefault("hypotheses", {}) + results.setdefault("baseline_p50_ms", None) + results.setdefault("best_p50_ms", None) + results.setdefault("best_hypothesis", None) + results.setdefault("best_gain_pct", None) + results.setdefault("errors", []) + + print(f"\n{'=' * 64}", flush=True) + print(f" SWEEP [CPU]: {model_id} [{task}]", flush=True) + if only_hyp_ids: + print(f" (delta — only: {sorted(only_hyp_ids)})", flush=True) + print(f"{'=' * 64}", flush=True) + + # Step 1: base config + print("\n[1/3] Generating base config…", flush=True) + base_config = None + + if reuse_h0_config: + h0_cfg = model_dir / "h0" / "build_config.json" + if h0_cfg.exists(): + try: + base_config = json.loads(h0_cfg.read_text(encoding="utf-8")) + print(" [reuse] h0 config loaded", flush=True) + except Exception: + pass + + if base_config is None: + base_config = get_base_config(model_id, task, model_type) + + if base_config is None: + results["errors"].append("base config generation failed") + _save_results(results, model_dir) + return results + + baseline_opset = (base_config.get("export") or {}).get("opset_version", "?") + results["baseline_opset"] = baseline_opset + print(f" baseline opset={baseline_opset} quant=NONE (CPU EP) compile=NONE", flush=True) + + # Step 2: hypothesis loop + print(f"\n[2/3] Running {len(HYPOTHESES)} hypotheses…", flush=True) + + baseline_p50: float | None = results.get("baseline_p50_ms") + model_has_gemm: bool | None = None # lazy-loaded for cpu-002 guard + + for hyp_id, label, opset_override, extra_optim, skip_if_gemm in HYPOTHESES: + if only_hyp_ids is not None and hyp_id not in only_hyp_ids: + continue + + sep = "─" * 56 + print(f"\n{sep}", flush=True) + print(f" {hyp_id}: {label}", flush=True) + print(f"{sep}", flush=True) + + hyp_config = make_hypothesis_config(base_config, opset_override, extra_optim) + opset_used = (hyp_config.get("export") or {}).get("opset_version", "?") + print(f" opset={opset_used} extra_optim={extra_optim}", flush=True) + + hyp_dir = model_dir / hyp_id + hyp_dir.mkdir(parents=True, exist_ok=True) + cfg_path = hyp_dir / "build_config.json" + cfg_path.write_text(json.dumps(hyp_config, indent=2), encoding="utf-8") + + # Build + build_ok, build_out = run_build(model_id, cfg_path, hyp_dir) + if not build_ok: + results["hypotheses"][hyp_id] = { + "status": "BUILD_FAIL", + "label": label, + "opset": opset_used, + "build_error": build_out[-300:] if build_out else "", + } + results["errors"].append(f"{hyp_id}: BUILD_FAIL") + continue + + # Find output ONNX + onnx_path = hyp_dir / "model.onnx" + if not onnx_path.exists(): + candidates = list(hyp_dir.glob("*.onnx")) + if candidates: + onnx_path = candidates[0] + else: + results["hypotheses"][hyp_id] = {"status": "NO_ONNX", "label": label} + results["errors"].append(f"{hyp_id}: build OK but no ONNX") + continue + + # cpu-002 guard: skip matmul_add_fusion if model already has Gemm + if skip_if_gemm: + if model_has_gemm is None: + opt_onnx = hyp_dir / "optimized.onnx" + model_has_gemm = _model_has_gemm(opt_onnx) if opt_onnx.exists() else False + if model_has_gemm: + print( + f" [cpu-002] SKIP {hyp_id}: model has Gemm nodes — matmul_add_fusion likely harmful", + flush=True, + ) + results["hypotheses"][hyp_id] = { + "status": "SKIPPED_CPU002", + "label": label, + "opset": opset_used, + "reason": "cpu-002: model already has Gemm — matmul_add_fusion skipped", + } + continue + + # Annotate cpu-001 risk + if opset_override is not None and opset_override >= 19: + print( + f" [cpu-001] NOTE: opset={opset_override} may regress on Conv-heavy models" + f" (cpu-001 validated on ConvNext only — testing transformer behavior)", + flush=True, + ) + + # Phase A: screen + screen_json = hyp_dir / "screen_perf.json" + screen_p50, screen_cv = run_perf_screen(onnx_path, screen_json) + + if screen_p50 is None: + results["hypotheses"][hyp_id] = {"status": "BENCH_FAIL", "label": label} + results["errors"].append(f"{hyp_id}: screen bench failed") + continue + + if screen_cv is not None and screen_cv > SCREEN_CV_MAX: + print(f" [warn] high CV={screen_cv:.3f} on CPU (unusual) — proceeding", flush=True) + + # Phase B: full bench + p50s = run_perf_full(onnx_path, hyp_dir) + if not p50s: + results["hypotheses"][hyp_id] = { + "status": "BENCH_FAIL", + "label": label, + "screen_p50_ms": screen_p50, + } + continue + + median_p50 = sorted(p50s)[len(p50s) // 2] + + hyp_data: dict = { + "status": "OK", + "label": label, + "opset": opset_used, + "extra_optim": extra_optim, + "screen_p50_ms": screen_p50, + "screen_cv": screen_cv, + "full_p50s_ms": p50s, + "median_p50_ms": median_p50, + } + + if hyp_id == "h0": + baseline_p50 = median_p50 + results["baseline_p50_ms"] = baseline_p50 + print(f" [baseline] p50={baseline_p50:.2f}ms", flush=True) + + if baseline_p50 and hyp_id != "h0": + gain_pct = (baseline_p50 - median_p50) / baseline_p50 * 100 + hyp_data["gain_vs_baseline_pct"] = round(gain_pct, 2) + verdict = ( + "KEEP" + if gain_pct >= MIN_IMPROVEMENT_PCT + else ("MARGINAL" if gain_pct > 0 else "DISCARD") + ) + # cpu-001: flag known-regression hypotheses specially + if opset_override is not None and opset_override >= 19 and gain_pct <= -50: + verdict = "CPU001_REGRESSION" + hyp_data["verdict"] = verdict + print( + f" [{verdict}] gain={gain_pct:+.1f}% ({baseline_p50:.2f}ms → {median_p50:.2f}ms)", + flush=True, + ) + + best_p50 = results.get("best_p50_ms") + if best_p50 is None or median_p50 < best_p50: + if gain_pct >= MIN_IMPROVEMENT_PCT: + results["best_p50_ms"] = median_p50 + results["best_hypothesis"] = hyp_id + results["best_gain_pct"] = round(gain_pct, 2) + else: + hyp_data["verdict"] = "BASELINE" + + results["hypotheses"][hyp_id] = hyp_data + + # Step 2b: Phase C confirmation + _run_confirmation_pass(results, model_dir, baseline_p50) + + # Step 3: finalise + _post_process(results) + _save_results(results, model_dir) + return results + + +def _run_confirmation_pass(results: dict, model_dir: Path, baseline_p50: float | None) -> None: + """Phase C: CONFIRM_SESSIONS extra sessions for best hypothesis.""" + if not baseline_p50: + return + hyps = results.get("hypotheses", {}) + keep_ids = [hid for hid, h in hyps.items() if h.get("verdict") == "KEEP"] + if not keep_ids: + return + + print( + f"\n ── Phase C: confirming {keep_ids} ({CONFIRM_SESSIONS} extra sessions each) ──", + flush=True, + ) + + for hyp_id in keep_ids: + hyp_data = hyps[hyp_id] + onnx_path: Path | None = None + hyp_dir = model_dir / hyp_id + for candidate in (hyp_dir / "model.onnx", hyp_dir / "optimized.onnx"): + if candidate.exists(): + onnx_path = candidate + break + if onnx_path is None: + continue + + print(f" [confirm] {hyp_id} ({hyp_data['label']})", flush=True) + extra_p50s: list[float] = [] + for s in range(1, CONFIRM_SESSIONS + 1): + out_json = hyp_dir / f"confirm_s{s}.json" + rc, _, _ = run_cmd( + [ + WINML, + "perf", + "-m", + str(onnx_path), + "--ep", + EP, + "--device", + DEVICE, + "--warmup", + str(FULL_WARMUP), + "--iterations", + str(FULL_ITERS), + "--output", + str(out_json), + ], + label=f"confirm s{s}/{CONFIRM_SESSIONS}", + timeout=BENCH_TIMEOUT_S, + ) + p50 = _get_p50(out_json) if rc == 0 and out_json.exists() else None + if p50: + print(f" confirm s{s}: p50={p50:.2f}ms", flush=True) + extra_p50s.append(p50) + if s < CONFIRM_SESSIONS: + time.sleep(COOL_DOWN_S) + + if not extra_p50s: + continue + + all_p50s = hyp_data.get("full_p50s_ms", []) + extra_p50s + overall_median = sorted(all_p50s)[len(all_p50s) // 2] + overall_gain = (baseline_p50 - overall_median) / baseline_p50 * 100 + wins = sum( + 1 for p in all_p50s if (baseline_p50 - p) / baseline_p50 * 100 >= MIN_IMPROVEMENT_PCT + ) + + hyp_data["confirm_p50s_ms"] = extra_p50s + hyp_data["all_p50s_ms"] = all_p50s + hyp_data["overall_median_p50_ms"] = round(overall_median, 3) + hyp_data["overall_gain_pct"] = round(overall_gain, 2) + hyp_data["sessions_above_threshold"] = wins + hyp_data["total_sessions"] = len(all_p50s) + + if wins == len(all_p50s): + hyp_data["verdict"] = "KEEP_CONFIRMED" + print( + f" [KEEP_CONFIRMED] {hyp_id}: {wins}/{len(all_p50s)} sessions ≥ {MIN_IMPROVEMENT_PCT}%," + f" overall={overall_gain:+.1f}%", + flush=True, + ) + else: + hyp_data["verdict"] = "MARGINAL_UNCONFIRMED" + print( + f" [MARGINAL_UNCONFIRMED] {hyp_id}: only {wins}/{len(all_p50s)} sessions above threshold", + flush=True, + ) + + if hyp_data["verdict"] == "KEEP_CONFIRMED": + best_p50 = results.get("best_p50_ms") + if best_p50 is None or overall_median < best_p50: + results["best_p50_ms"] = overall_median + results["best_hypothesis"] = hyp_id + results["best_gain_pct"] = round(overall_gain, 2) + + +def _post_process(results: dict) -> None: + hyps = results.get("hypotheses", {}) + baseline_p50 = results.get("baseline_p50_ms") + if not baseline_p50: + return + + keeps = [(hid, h) for hid, h in hyps.items() if h.get("verdict") in ("KEEP", "KEEP_CONFIRMED")] + unconfirmed = [ + (hid, h) for hid, h in hyps.items() if h.get("verdict") == "MARGINAL_UNCONFIRMED" + ] + regressions = [(hid, h) for hid, h in hyps.items() if h.get("verdict") == "CPU001_REGRESSION"] + + if keeps: + print(f"\n ✓ KEEP/KEEP_CONFIRMED: {[h[0] for h in keeps]}", flush=True) + if unconfirmed: + print(f" ⚠ MARGINAL_UNCONFIRMED: {[h[0] for h in unconfirmed]}", flush=True) + if regressions: + print(f" ✗ CPU001_REGRESSION: {[h[0] for h in regressions]}", flush=True) + if not keeps and not unconfirmed and not regressions: + print("\n No improvements found above 5% threshold.", flush=True) + + # Cross-architecture cpu-001 check: does opset 19/21 regress on THIS model? + for hid in ("h2", "h3"): + h = hyps.get(hid, {}) + if h.get("status") == "OK" and baseline_p50: + gain = h.get("gain_vs_baseline_pct", 0.0) + if gain < -50: + print( + f" [cpu-001] CONFIRMED regression on {hid} for this architecture: {gain:.1f}%", + flush=True, + ) + elif gain > -10: + print( + f" [cpu-001] NOT OBSERVED on {hid} for {results.get('model_type')} — " + f"gain={gain:+.1f}% (ConvNext-specific?)", + flush=True, + ) + + +def _save_results(results: dict, model_dir: Path) -> None: + out = model_dir / "results.json" + out.write_text(json.dumps(results, indent=2, ensure_ascii=False), encoding="utf-8") + print(f" Results: {out}", flush=True) + + +# ── summary writer ──────────────────────────────────────────────────────────── + + +def write_summary(all_results: list[dict]) -> None: + lines = [ + "# CPU EP Optimization Sweep — Catalog Models", + "", + f"Generated: {datetime.now().isoformat(timespec='seconds')} ", + f"EP: `{EP}` / device: `{DEVICE}` ", + f"Protocol: screen {SCREEN_ITERS} iters (CV<{SCREEN_CV_MAX * 100:.0f}%)," + f" full {FULL_ITERS}×{FULL_SESSIONS} sessions" + f" + {CONFIRM_SESSIONS} confirm sessions for KEEP ", + "Constraints: NO quant, NO compile ", + "", + "---", + "", + "## cpu-001 Check: Does opset 19/21 Regress on Non-ConvNext Models?", + "", + "| Model | type | h2(opset19) gain% | h3(opset21) gain% | cpu-001 fires? |", + "|-------|------|-------------------|-------------------|---------------|", + ] + + for r in all_results: + model_id = r.get("model_id", "?") + mtype = r.get("model_type", "?") + h2 = r.get("hypotheses", {}).get("h2", {}) + h3 = r.get("hypotheses", {}).get("h3", {}) + g2 = ( + f"{h2.get('gain_vs_baseline_pct', 'N/A'):+.1f}%" + if h2.get("gain_vs_baseline_pct") is not None + else h2.get("status", "N/A") + ) + g3 = ( + f"{h3.get('gain_vs_baseline_pct', 'N/A'):+.1f}%" + if h3.get("gain_vs_baseline_pct") is not None + else h3.get("status", "N/A") + ) + fires = ( + "YES ≤-50%" + if any( + r.get("hypotheses", {}).get(h, {}).get("gain_vs_baseline_pct", 0) <= -50 + for h in ("h2", "h3") + ) + else "no" + ) + lines.append(f"| `{model_id}` | {mtype} | {g2} | {g3} | {fires} |") + + lines += [ + "", + "## Per-Model Results", + "", + "| Model | Baseline p50 | Best p50 | Best config | Gain% | Notes |", + "|-------|-------------|----------|-------------|-------|-------|", + ] + + for r in all_results: + model_id = r.get("model_id", "?") + baseline = f"{r['baseline_p50_ms']:.1f} ms" if r.get("baseline_p50_ms") else "N/A" + best = f"{r['best_p50_ms']:.1f} ms" if r.get("best_p50_ms") else "N/A" + best_h = r.get("best_hypothesis") or "N/A" + best_label = "" + if best_h != "N/A": + best_label = r.get("hypotheses", {}).get(best_h, {}).get("label", "") + gain = f"{r['best_gain_pct']:.1f}%" if r.get("best_gain_pct") is not None else "N/A" + errors = "; ".join(r.get("errors", []))[:80] or "none" + lines.append( + f"| `{model_id}` | {baseline} | {best} | {best_h} ({best_label}) | {gain} | {errors} |" + ) + + summary_path = RESULTS_DIR / "SUMMARY.md" + RESULTS_DIR.mkdir(parents=True, exist_ok=True) + summary_path.write_text("\n".join(lines) + "\n", encoding="utf-8") + print(f"\n📄 Summary: {summary_path}", flush=True) + + +# ── CLI ──────────────────────────────────────────────────────────────────────── + + +def main() -> None: + parser = argparse.ArgumentParser(description="CPU EP sweep across catalog models") + parser.add_argument("--model", help="Run a single model (HuggingFace model ID)") + parser.add_argument("--task", help="Task for single model run") + parser.add_argument("--model-type", dest="model_type", help="Model type for single model run") + parser.add_argument( + "--only-hypotheses", + dest="only_hyp", + help="Comma-separated list of hypothesis IDs to run (e.g. h4,h5,h10)", + ) + parser.add_argument( + "--reuse-h0-config", + dest="reuse_h0", + action="store_true", + help="Load base config from existing h0/build_config.json", + ) + args = parser.parse_args() + + only_hyp_ids = set(args.only_hyp.split(",")) if args.only_hyp else None + + all_results = [] + + if args.model: + if not args.task or not args.model_type: + print("ERROR: --task and --model-type required with --model", file=sys.stderr) + sys.exit(1) + r = sweep_model( + args.model, + args.task, + args.model_type, + only_hyp_ids=only_hyp_ids, + reuse_h0_config=args.reuse_h0, + ) + all_results.append(r) + else: + for model_id, task, model_type in ALL_MODELS: + r = sweep_model( + model_id, + task, + model_type, + only_hyp_ids=only_hyp_ids, + reuse_h0_config=args.reuse_h0, + ) + all_results.append(r) + + write_summary(all_results) + print("\n================================================================", flush=True) + print(" CPU SWEEP COMPLETE", flush=True) + print("================================================================", flush=True) + print(f"\n📄 Summary: {RESULTS_DIR / 'SUMMARY.md'}", flush=True) + + +if __name__ == "__main__": + main()