diff --git a/pyproject.toml b/pyproject.toml index 20aada0df..597f1de0a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -298,6 +298,8 @@ lint.per-file-ignores."tests/**" = [ "ANN", "D", "PLR2004", "PT", "S101", "T20" lint.per-file-ignores."tests/**/generate_patterns.py" = [ "PERF401" ] # Generated opset code: Allow long lines lint.per-file-ignores."src/winml/modelkit/analyze/onnx_opset/**" = [ "D", "E501", "N802", "N803", "N806", "TC001", "TC002", "TC003" ] +# Research scripts: POC code, not production — exempt from all style/type/security rules +lint.per-file-ignores."research/**" = [ "ANN", "D", "E", "N", "S", "T20", "UP", "W", "B", "C4", "FA", "I", "PERF", "PIE", "PT", "PTH", "RET", "RSE", "RUF", "SIM", "TCH", "TID", "TRY", "G", "ICN", "E402", "E501", "F401", "F403", "F811" ] # === Import Conventions === lint.flake8-bandit.check-typed-exception = true lint.flake8-bandit.hardcoded-tmp-directory = [ "/tmp", "/var/tmp", "C:\\Temp" ] diff --git a/research/autoconfig/README.md b/research/autoconfig/README.md new file mode 100644 index 000000000..2d37f0e70 --- /dev/null +++ b/research/autoconfig/README.md @@ -0,0 +1,220 @@ +# autoconfig — Automated Config Search POC + +**Status: Research POC — not production code.** + +This directory contains an experimental automated search system that finds the optimal +`winml-cli` build configuration (execution provider, opset version, graph optimizations) +for a given model on Windows hardware — without requiring the user to understand the +underlying ORT/EP optimizer mechanics. + +--- + +## What This Is + +`autoconfig.py` implements an Explorer/Optimizer/Reviewer loop: + +1. **Explorer** — proposes the next hypothesis (opset, EP flags, graph passes) by reading + `ep_knowledge/` to prune already-refuted configurations +2. **Optimizer** — runs `winml build` + `winml perf` (two-phase: 200-iter CV screen → 3×500-iter full bench) +3. 
**Reviewer** — evaluates the result, updates the knowledge base, and decides keep/discard + +The loop terminates after 30 consecutive discards (plateau detection) or a time budget. + +`catalog_qnn_sweep.py` is a generalized multi-model sweep that tests a fixed hypothesis +matrix (h0–h5: baseline, opset 17–21, conv fusions) across a catalog of models on the +QNN NPU, collecting structured results in `catalog-qnn-sweep/{org}--{model}/results.json`. + +`analyze_graph.py` is an ONNX graph analysis helper that identifies architectural +patterns relevant to EP optimization (Transpose sandwiches, residual branches, GELU +variants, depthwise Conv) and surfaces gaps in `winml analyze` output. + +`gen_report_v3.py` generates an HTML sweep report from `results.json` files. + +`autoconfig_diagram.html` is an interactive architecture diagram of the Explorer/Optimizer/ +Reviewer loop. + +--- + +## Key Findings — 8-Model QNN NPU Catalog Sweep (2026-06-13) + +### npu-001: opset 21 NHWC bypass is real — but architecture-specific + +Opset ≥ 21 bypasses ORT's NHWC layout transformer for QNN EP, giving a large speedup +on **Conv + residual** models but no benefit (or slight regression) on pure transformers: + +| Architecture | Models | opset 21 vs opset 17 | +|---|---|---| +| Conv + residual | MobileViT-small, DINOv2-small | **+26–31% speedup** | +| Pure transformer | ViT-base, YOLOS-small | neutral / slight regression | +| BERT-family NLP | DistilBERT, MiniLM, RoBERTa | neutral (within DVFS noise) | +| Plain Conv (ResNet) | ResNet-18 | ~+20% (h1→h3), but DVFS-dominated | + +Root cause: ORT's `IsSupportedOpset()` gate in `layout_transformation.cc` causes the +NHWC layout transform to insert Transpose nodes around Conv ops. For Conv+residual +models these Transposes cannot be cancelled, so bypassing the transform (opset 21) gives +a cleaner HTP graph. Pure attention models have no Conv→NHWC transposes, so the bypass +has no effect. 
+ +### npu-006: Conv fusions cause ~4800% regression on QNN NPU for Conv-dominant models + +`conv_bn_fusion`, `conv_add_fusion`, `conv_activation_fusion` produce fused op nodes +that QNN EP cannot execute natively — falling back to CPU for every fused Conv: + +| Model | h4 (conv fusions) vs h1 (baseline) | +|---|---| +| ResNet-18 | **132.3 ms vs 2.72 ms (+4764% regression)** | +| MobileViT-small | 11.36 ms vs 11.72 ms (neutral) | +| DistilBERT | 19.59 ms vs 19.5 ms (neutral — no Conv to fuse) | + +This is a critical correctness/performance hazard. `winml` should detect when the target +EP would CPU-fallback fused Conv ops and suppress incompatible fusions automatically +(see [Feature Gaps](#feature-gaps-identified)). + +### npu-007: DVFS thermal noise requires session-level averaging for reliable results + +QNN NPU exhibits extreme DVFS thermal throttling. CV is consistently 10–200%+ across +all models. Practical implications: + +- The CV < 15% Phase-A gate must be **disabled** for QNN NPU (blocks all models) +- Differences < 10% between configs are **unreliable** without ≥ 1500 total iterations +- Recommended protocol: **3 × 500-iter sessions** with 30 s cool-down; report median of + session p50 values +- 30 s cool-down reduces but does not eliminate DVFS spikes + +--- + +## How to Run + +### Prerequisites + +- `winml` CLI installed and on PATH +- Python 3.11+ with `onnx` package (`pip install onnx`) +- For QNN experiments: Snapdragon X Elite device with QNN SDK (Hexagon HTP driver) + +### autoconfig.py — single-model adaptive search + +Configured at the top of the file (edit `MODEL_ID`, `TASK`, `EP`, `DEVICE`, `WORK_DIR`): + +```bash +# Default: facebook/convnext-tiny-224 on CPU +python autoconfig.py +``` + +Results are written to `WORK_DIR/results.tsv` and per-hypothesis subdirectories. +The script reads `ep_knowledge/{ep}.json` to prune already-refuted configurations. 
+ +### catalog_qnn_sweep.py — multi-model QNN NPU sweep + +```bash +# Full catalog sweep (all 8 models, ~6-8 hours on X Elite) +python catalog_qnn_sweep.py + +# Single model +python catalog_qnn_sweep.py --model microsoft/resnet-18 + +# Show available models +python catalog_qnn_sweep.py --list +``` + +Results land in `catalog-qnn-sweep/{org}--{model}/results.json` and a `SUMMARY.md` is +regenerated at the end of each sweep. + +### analyze_graph.py — ONNX graph analysis + +```bash +# Edit the onnx path at the top of the file, then: +python analyze_graph.py +``` + +Prints Transpose patterns, residual branch structure, GELU variants, and op domain +breakdown to stdout. + +--- + +## ep_knowledge/ — Empirical Knowledge Base + +Each JSON file stores empirical findings for one EP/device combination: + +| File | EP/device | +|---|---| +| `cpu.json` | CPU EP (Snapdragon X Elite Oryon) | +| `dml.json` | DirectML EP | +| `qnn_gpu.json` | QNN Adreno GPU | +| `qnn_npu.json` | QNN HTP (Hexagon NPU) — most findings here | + +### Schema overview + +Each file has a `findings` array. Each finding has: + +```json +{ + "id": "npu-001", + "title": "...", + "mechanism_confirmed": true, + "architecture_requirement": ["has_conv_ops", "has_residual_connections"], + "status": "confirmed", + "confidence": "high" +} +``` + +Each file also has a `search_space_rules` object that `autoconfig.py` reads to prune configurations +(only findings with `"mechanism_confirmed": true` are applied as pruning rules). + +### Adding a new finding + +1. Run the experiment and collect bench data +2. Add an entry to the appropriate `ep_knowledge/{ep}.json` under `findings` +3. Set `"mechanism_confirmed": false` and `"confidence": "draft"` until the mechanism + is understood from ORT/EP source code +4. If the finding prunes a search dimension, add a rule under `search_space_rules` +5. Set `"mechanism_confirmed": true` only after source code investigation confirms + the root cause — do NOT promote to confirmed based on benchmark numbers alone +6. 
See `ep_knowledge/README.md` for the epistemics guidelines + +--- + +## Feature Gaps Identified + +Three actionable gaps in `winml-cli` surfaced by this research: + +1. **FusedConv detection in `winml analyze`** — `analyze` should detect Conv ops that + would CPU-fallback on QNN NPU after fusion (npu-006), and either warn or suppress + incompatible fusions in the generated build config. + +2. **DVFS-aware perf** — `winml perf` should support `--thermal-stabilization` mode + that waits for device temperature to stabilize before measurements, and should report + confidence intervals rather than a single p50. + +3. **Budget-aware sweep** — `catalog_qnn_sweep.py` exhausts the 20-min budget on models + with a baseline above 50 ms after just 2 hypotheses (YOLOS: 78 ms × 3×500 iters + 3×30 s cool-down ≈ 207 s/hypothesis). + A `--quick` flag that reduces to 1×200-iter for large models is needed. + +--- + +## Directory Layout + +``` +research/autoconfig/ +├── README.md ← this file +├── autoconfig.py ← adaptive single-model config search loop +├── catalog_qnn_sweep.py ← fixed-hypothesis multi-model QNN sweep +├── analyze_graph.py ← ONNX graph pattern analysis helper +├── autoconfig_diagram.html ← Explorer/Optimizer/Reviewer architecture diagram +├── gen_report_v3.py ← HTML report generator for sweep results +├── ep_knowledge/ +│ ├── README.md ← epistemics guidelines and KB format +│ ├── cpu.json ← CPU EP findings (ConvNext, 6 findings) +│ ├── dml.json ← DirectML EP findings +│ ├── qnn_gpu.json ← QNN Adreno GPU findings +│ └── qnn_npu.json ← QNN HTP NPU findings (npu-001 through npu-007) +└── catalog-qnn-sweep/ + ├── SUMMARY.md ← 8-model sweep results and cross-model analysis + ├── apple--mobilevit-small/results.json + ├── facebook--dinov2-small/results.json + ├── microsoft--resnet-18/results.json + ├── google--vit-base-patch16-224/results.json + ├── deepset--roberta-base-squad2/results.json + ├── distilbert--distilbert-base-uncased-finetuned-sst-2-english/results.json + ├── 
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------

# Research helper: print structural patterns of an exported ONNX graph
# (block structure, Transpose sandwiches, GELU variants, stride-2 Convs,
# residual Adds, op domains) that matter for EP optimisation.
#
# Usage:
#   python analyze_graph.py [path/to/model.onnx]
# When no path is given, the historical hard-coded ConvNext export is used.

import sys
from collections import Counter

import onnx


# Optional CLI override so the file no longer has to be edited per model.
MODEL_PATH = sys.argv[1] if len(sys.argv) > 1 else r"convnext-search\iter_00\export.onnx"

m = onnx.load(MODEL_PATH)
g = m.graph

# tensor name -> node that produces it.  Empty names denote omitted optional
# outputs and are skipped so they never masquerade as a real edge.
out2node = {}
# tensor name -> nodes that read it, in graph order.  Built once so that
# consumers() is a dictionary lookup instead of a full scan of g.node.
in2nodes = {}
for n in g.node:
    for o in n.output:
        if o:
            out2node[o] = n
    # dict.fromkeys dedupes while preserving order: a node that reads the same
    # tensor on two inputs is still a single consumer of that tensor.
    for i in dict.fromkeys(n.input):
        if i:
            in2nodes.setdefault(i, []).append(n)


def consumers(node):
    """Return the nodes that read any output of *node* (output order, then graph order)."""
    return [c for o in node.output if o for c in in2nodes.get(o, ())]


def producer(inp):
    """Return the node producing tensor *inp*, or None for graph inputs/initializers."""
    return out2node.get(inp)


def attr_int(node, name, default):
    """Return the integer attribute *name* of *node*, or *default* when absent."""
    return next((a.i for a in node.attribute if a.name == name), default)


def attr_ints(node, name, default):
    """Return the int-list attribute *name* of *node* as a list, or *default* when absent."""
    return next((list(a.ints) for a in node.attribute if a.name == name), default)


# ── 1. Block structure ────────────────────────────────────────
print("=== ConvNext block structure (trace first DW-Conv forward) ===")
# A Conv with group > 1 is treated as depthwise; start tracing from the first one.
first_dw = next(
    (n for n in g.node if n.op_type == "Conv" and attr_int(n, "group", 1) > 1),
    None,
)
cur = first_dw
for _ in range(14):
    if cur is None:
        break
    c = consumers(cur)
    c_types = [n.op_type for n in c]
    print(f" {cur.op_type:25s} -> {c_types}")
    if len(c) == 1:
        cur = c[0]
    elif len(c) > 1:
        # At a fan-out prefer the non-Add consumer so the trace follows the
        # main branch rather than jumping straight to the residual Add.
        non_add = [n for n in c if n.op_type != "Add"]
        cur = non_add[0] if non_add else c[0]
    else:
        break

# ── 2. Transpose patterns ─────────────────────────────────────
print()
print("=== Transpose patterns (before -> Transpose -> after) ===")
trans_patterns = Counter()
for n in g.node:
    if n.op_type == "Transpose":
        c = consumers(n)
        p = producer(n.input[0])
        before = p.op_type if p else "INPUT"
        after = c[0].op_type if c else "OUTPUT"
        trans_patterns[f"{before} -> Transpose -> {after}"] += 1
for pat, cnt in trans_patterns.most_common():
    print(f" {cnt:3d}x {pat}")

# ── 3. GELU variants ──────────────────────────────────────────
print()
print("=== GELU sub-patterns ===")
# Standard GELU: Mul -> Div -> Erf -> Add -> Mul -> Mul
gelu_standard = 0
for n in g.node:
    if n.op_type == "Erf":
        p = producer(n.input[0])
        if p and p.op_type == "Div":
            gelu_standard += 1
print(f" Div->Erf (Erf-based GELU): {gelu_standard}")

# Check for Sigmoid-based QuickGELU (x * sigmoid(1.702 * x)).
# Only the Sigmoid -> Mul shape is checked; the 1.702 constant is not verified.
quick_gelu = 0
for n in g.node:
    if n.op_type == "Sigmoid":
        c = consumers(n)
        if c and c[0].op_type == "Mul":
            quick_gelu += 1
print(f" Sigmoid->Mul (QuickGELU candidate): {quick_gelu}")

# ── 4. Downsampling blocks (stage transitions) ────────────────
print()
print("=== Downsampling block pattern (LN->Conv 2x2 stride 2) ===")
for n in g.node:
    if n.op_type == "Conv":
        stride = attr_ints(n, "strides", [1, 1])
        kernel = attr_ints(n, "kernel_shape", [])
        groups = attr_int(n, "group", 1)
        if stride == [2, 2] and groups == 1:
            p = producer(n.input[0])
            print(f" stride-2 Conv kernel={kernel} preceded_by={p.op_type if p else 'INPUT'}")

# ── 5. Residual branches ──────────────────────────────────────
print()
print("=== Add nodes with 2 distinct producer op-types (residual candidates) ===")
residual_counter = Counter()
for n in g.node:
    if n.op_type == "Add" and len(n.input) == 2:
        p0 = producer(n.input[0])
        p1 = producer(n.input[1])
        # Inputs without a producing node are graph inputs or initializers.
        t0 = p0.op_type if p0 else "INIT"
        t1 = p1.op_type if p1 else "INIT"
        if t0 != t1:
            key = tuple(sorted([t0, t1]))
            residual_counter[key] += 1
for pair, cnt in residual_counter.most_common():
    print(f" {cnt:3d}x Add({pair[0]}, {pair[1]})")

# ── 6. Node domain analysis ───────────────────────────────────
print()
print("=== Op domains ===")
# An empty domain string is the default ONNX operator set.
domains = Counter(n.domain if n.domain else "ai.onnx" for n in g.node)
for d, c in domains.most_common():
    print(f" {d}: {c} nodes")

# ── 7. analyze gaps ───────────────────────────────────────────
print()
print("=== Patterns winml analyze may miss ===")
# 1. Depthwise conv with large kernels (7x7 DW-Conv is ConvNext specific)
dw7x7 = sum(
    1
    for n in g.node
    if n.op_type == "Conv"
    and attr_int(n, "group", 1) > 1
    and attr_ints(n, "kernel_shape", []) == [7, 7]
)
print(f" 7x7 DW-Conv (ConvNext pattern): {dw7x7}")
print(" -> analyze classifies as OP/ai.onnx/Conv (undifferentiated)")
print(" -> no distinction between DW-Conv and regular Conv EP support")

# 2. Transpose wrapping every layer (NCHW<->NHWC conversion)
trans_total = sum(1 for n in g.node if n.op_type == "Transpose")
print(f" Transpose nodes total: {trans_total}")
print(" -> analyze reports as single OP/ai.onnx/Transpose")
print(" -> no detection of Transpose-sandwich (NCHW->NHWC->op->NCHW)")
print(" -> transpose-optimizer capability not reflected in analyze output")

# 3. MatMul used as dense layer (not Gemm) - different EP kernel path
matmul_count = sum(1 for n in g.node if n.op_type == "MatMul")
print(f" MatMul (not Gemm): {matmul_count}")
print(" -> ConvNext uses MatMul for MLP (not Gemm), QNN handles differently")
print(" -> analyze does not distinguish MatMul-as-FC from MatMul-as-attention")

# 4. LayerNormalization as a single op (already fused by PyTorch export)
ln_count = sum(1 for n in g.node if n.op_type == "LayerNormalization")
print(f" LayerNormalization (native op): {ln_count}")
print(" -> These are already fused (not the ReduceMean->Sub->... subgraph)")
print(" -> layer-norm-fusion capability targets the decomposed pattern")
print(" -> analyze should note these are ALREADY fused - no fusion needed")

# 5. Erf-based GELU (not tagged as Gelu op, appears as com.microsoft/Gelu after fusion)
print(f" Erf-based GELU subgraphs (unfused): {gelu_standard}")
print(' -> analyze cannot detect "unfused GELU" as a pattern')
print(" -> gelu-fusion would convert these to com.microsoft/Gelu")
print(' -> no analyze rule for "fuseable_pattern: gelu_erf"')
# ── data types ────────────────────────────────────────────────────────────────
#
# NOTE: this section used to be present twice; the earlier copy of GraphInfo /
# AnalyzeResult / InsightResult was shadowed by the later one (and its GraphInfo
# lacked matmul_pct, gelu_types and fusion_candidates).  Only the effective
# definitions are kept.


@dataclass
class FusionCandidate:
    """One detectable pattern that maps to a winml optimize flag."""

    # winml optimize flag name (e.g. 'gelu_fusion').
    flag: str

    # How many candidate instances were found in the graph.
    count: int

    # Short human-readable description of what was found.
    evidence: str


@dataclass
class GraphInfo:
    """Structural statistics of one ONNX graph, filled in by run_graph_analysis()."""

    total_ops: int = 0
    op_counts: dict[str, int] = field(default_factory=dict)
    conv_pct: float = 0.0  # Conv / total_ops (0-100)
    matmul_pct: float = 0.0  # MatMul / total_ops (0-100)
    gemm_pct: float = 0.0  # Gemm / total_ops (0-100)
    # run_graph_analysis() currently sets this when the graph contains Erf ops.
    has_gelu_decomposed: bool = False
    # Values appended by run_graph_analysis(): 'erf', 'tanh', 'sigmoid/quick'.
    gelu_types: list[str] = field(default_factory=list)
    has_dynamic_axes: bool = False
    transpose_count: int = 0
    # Ordered list of detected optimisation opportunities, highest-count first.
    fusion_candidates: list[FusionCandidate] = field(default_factory=list)
    available: bool = False  # False when onnx not installed or model not found


@dataclass
class AnalyzeResult:
    """Op types reported by `winml analyze`, grouped by support classification."""

    supported: list[str] = field(default_factory=list)
    partial: list[str] = field(default_factory=list)
    unsupported: list[str] = field(default_factory=list)
    unknown: list[str] = field(default_factory=list)
    available: bool = False  # False when winml analyze failed or ep has no rule data


@dataclass
class InsightResult:
    """Combined output of the insight engine: pruning set, priorities and rationale."""

    # Labels from HYPOTHESES that should be pruned before the search loop.
    skip_set: set[str] = field(default_factory=set)

    # hypothesis_label -> boost (positive = higher priority, negative = deprioritise).
    priority_boosts: dict[str, float] = field(default_factory=dict)

    # Human-readable explanation for each decision.
    notes: list[str] = field(default_factory=list)

    graph_info: GraphInfo = field(default_factory=GraphInfo)
    analyze_result: AnalyzeResult = field(default_factory=AnalyzeResult)
graph_info: GraphInfo = field(default_factory=GraphInfo) + analyze_result: AnalyzeResult = field(default_factory=AnalyzeResult) + + +# ── graph analysis ──────────────────────────────────────────────────────────── + + +def _build_consumer_map(graph) -> dict[str, list]: # type: ignore[type-arg] + """Map each output name → list of consumer nodes.""" + consumers: dict[str, list] = {} + for node in graph.node: + for inp in node.input: + consumers.setdefault(inp, []).append(node) + return consumers + + +def _build_producer_map(graph) -> dict[str, object]: + """Map each output name → the node that produces it.""" + return {out: n for n in graph.node for out in n.output} + + +def _get_attr_float(node, name: str) -> float | None: + """Extract a float attribute from an ONNX node.""" + for a in node.attribute: + if a.name == name: + return float(a.f) + return None + + +def _detect_fusion_candidates(graph) -> list[FusionCandidate]: # type: ignore[type-arg] + """ + Scan the ONNX graph for subgraph patterns that map to winml optimize flags. + + Returns a list of FusionCandidate, ordered highest-count first. + + Detection strategy + ------------------ + We build two lookup tables (producer_map, consumer_map) and then sweep the + graph once per pattern family. Each check is O(N) in the number of nodes. 
+ + Pattern families + ---------------- + GELU variants + gelu_fusion : Div → Erf → Add → Mul → Mul (exact GELU) + fast_gelu_fusion : Tanh-based GELU (Tanh node with Pow(3) ancestor) + quick_gelu_fusion : x * sigmoid(1.702*x) + bias_gelu_fusion : Add → GELU subgraph (bias before GELU entry) + LayerNorm variants + layer_norm_fusion : ReduceMean → Sub → Pow(2) → … → Add(ε) + simplified_layer_norm_fusion : Pow(2) + ReduceMean (no Sub) + fuse_rmsnorm : Pow → ReduceMean → Add → Sqrt → Div → Mul + skip_layer_norm_fusion : Add (residual) feeds directly into LN subgraph + Attention + attention_fusion : Q/K/V MatMul trio feeding a Softmax + bias_softmax_fusion : Add immediately before Softmax + MatMul patterns + matmul_add_fusion : MatMul → Add (not already counted in LN) + matmul_activation_fusion : MatMul → {Relu, Sigmoid, Tanh, Clip} + matmul_transpose_fusion : Transpose → MatMul OR MatMul → Transpose + matmul_scale_fusion : MatMul → Mul (scalar constant) + Conv patterns + conv_bn_fusion : Conv → BatchNormalization + conv_add_fusion : Conv → Add + conv_mul_fusion : Conv → Mul + conv_activation_fusion : Conv → {Relu, LeakyRelu, Sigmoid, Tanh, Clip} + conv_add_activation_fusion: Conv → Add → activation (3-node chain) + pad_fusion : Pad → Conv + Gemm patterns + gemm_activation_fusion : Gemm → {Relu, Tanh, Sigmoid} + gemm_sum_fusion : Gemm → Add + gemm_transpose_fusion : Transpose → Gemm + Eliminations + slice_elimination : multiple Slice ops (potential redundancy) + unsqueeze_elimination : Unsqueeze of initializers + concat_slice_elimination : Concat → Slice (reverse of split) + expand_elimination : Expand nodes + Layout + transpose_optimizer : Transpose count > 10 + nhwc_transformer : Conv-heavy + Transpose → layout transform candidate + Rewrite: highdimRTR_lowdimRTR : Reshape → Transpose → Reshape with rank > 4 + """ + producer = _build_producer_map(graph) + consumer = _build_consumer_map(graph) + + # Helper: get the single consumer of a node output (or None) + def 
_single_consumer(node, out_idx: int = 0): + if out_idx >= len(node.output): + return None + consumers = consumer.get(node.output[out_idx], []) + return consumers[0] if len(consumers) == 1 else None + + # Helper: check if a node output feeds a specific op type + def _consumer_is(node, op: str, out_idx: int = 0) -> bool: + c = _single_consumer(node, out_idx) + return c is not None and c.op_type == op + + # Helper: check if all inputs to node are exclusively from initializers (weight-only) + init_names = {i.name for i in graph.initializer} + + def _is_initializer_input(inp_name: str) -> bool: + return inp_name in init_names + + candidates: dict[str, FusionCandidate] = {} + + def _add(flag: str, evidence: str, n: int = 1) -> None: + if flag in candidates: + candidates[flag].count += n + candidates[flag].evidence = evidence # update to latest + else: + candidates[flag] = FusionCandidate(flag=flag, count=n, evidence=evidence) + + # ── GELU patterns ────────────────────────────────────────────────────────── + erf_gelu_count = 0 + tanh_gelu_count = 0 + quick_gelu_count = 0 + bias_before_gelu = 0 + + for node in graph.node: + # Erf-based GELU: Div → Erf → (Add → Mul → Mul) + if node.op_type == "Erf" and node.input: + pred = producer.get(node.input[0]) + if pred and pred.op_type == "Div": + erf_gelu_count += 1 + # Check if there's an Add feeding the Erf entry point (bias_gelu) + # The entry to Erf-GELU is typically through the Div; check what feeds Div + if pred.input: + div_pred = producer.get(pred.input[0]) + if div_pred and div_pred.op_type in ("Add", "Gemm", "MatMul"): + bias_before_gelu += 1 + + # Tanh-based GELU: Tanh with Pow(3) somewhere in the sub-tree + if node.op_type == "Tanh" and node.input: + # Check 3-hop ancestry for Pow + cur = producer.get(node.input[0]) + for _ in range(4): + if cur is None: + break + if cur.op_type == "Pow": + tanh_gelu_count += 1 + break + cur = producer.get(cur.input[0]) if cur.input else None + + # Quick GELU: Sigmoid where predecessor 
is Mul with constant ≈ 1.702 + if node.op_type == "Sigmoid" and node.input: + pred = producer.get(node.input[0]) + if pred and pred.op_type == "Mul": + quick_gelu_count += 1 + + if erf_gelu_count: + _add("gelu_fusion", f"{erf_gelu_count} Erf-based GELU subgraph(s)", erf_gelu_count) + _add( + "gelu_singlegelu", + f"{erf_gelu_count} decomposed GELU → can normalise to single Gelu op", + erf_gelu_count, + ) + if tanh_gelu_count: + _add( + "fast_gelu_fusion", + f"{tanh_gelu_count} Tanh-based GELU subgraph(s)", + tanh_gelu_count, + ) + if quick_gelu_count: + _add( + "quick_gelu_fusion", + f"{quick_gelu_count} Sigmoid(1.702x) quick-GELU pattern(s)", + quick_gelu_count, + ) + if bias_before_gelu: + _add( + "bias_gelu_fusion", + f"{bias_before_gelu} Add/MatMul feeding GELU entry", + bias_before_gelu, + ) + + # ── LayerNorm patterns ───────────────────────────────────────────────────── + ln_full_count = 0 # ReduceMean + Sub + Pow(2) + ln_simplified_count = 0 # Pow(2) + ReduceMean (no Sub) + rmsnorm_count = 0 # Pow + ReduceMean (no Sub, no mean-centering) + skip_ln_count = 0 # Add → LayerNorm subgraph + + for node in graph.node: + if node.op_type == "Pow" and node.input: + pred = producer.get(node.input[0]) + if pred and pred.op_type == "Sub": + # Sub → Pow: classic LN (ReduceMean → Sub → Pow) + sub_pred = producer.get(pred.input[0]) if pred.input else None + if sub_pred and sub_pred.op_type == "ReduceMean": + ln_full_count += 1 + elif pred and pred.op_type in ("ReduceMean", "Mul", "Add"): + # Simplified / RMSNorm: no Sub predecessor + ln_simplified_count += 1 + + # RMSNorm: Pow → ReduceMean (direct, without Sub) + if node.op_type == "ReduceMean" and node.input: + pred = producer.get(node.input[0]) + if pred and pred.op_type == "Pow": + rmsnorm_count += 1 + + # skip_layer_norm: Add whose output feeds into the start of an LN subgraph + # Heuristic: Add → ReduceMean (the mean-centering step of LN) + if node.op_type == "Add" and _consumer_is(node, "ReduceMean"): + skip_ln_count 
+= 1 + + if ln_full_count: + _add( + "layer_norm_fusion", + f"{ln_full_count} ReduceMean→Sub→Pow LayerNorm subgraph(s)", + ln_full_count, + ) + if ln_simplified_count: + _add( + "simplified_layer_norm_fusion", + f"{ln_simplified_count} simplified LayerNorm pattern(s) (no mean-centering)", + ln_simplified_count, + ) + if rmsnorm_count: + _add("fuse_rmsnorm", f"{rmsnorm_count} RMSNorm Pow→ReduceMean pattern(s)", rmsnorm_count) + if skip_ln_count: + _add( + "skip_layer_norm_fusion", + f"{skip_ln_count} Add→ReduceMean (residual+LN) pattern(s)", + skip_ln_count, + ) + + # ── Attention patterns ───────────────────────────────────────────────────── + softmax_count = sum(1 for n in graph.node if n.op_type == "Softmax") + add_before_softmax = 0 + for node in graph.node: + if node.op_type == "Softmax" and node.input: + pred = producer.get(node.input[0]) + if pred and pred.op_type == "Add": + add_before_softmax += 1 + + if softmax_count: + _add( + "attention_fusion", + f"{softmax_count} Softmax node(s) — likely attention head(s)", + softmax_count, + ) + if add_before_softmax: + _add( + "bias_softmax_fusion", + f"{add_before_softmax} Add→Softmax (bias+attention mask) pattern(s)", + add_before_softmax, + ) + + # ── MatMul patterns ──────────────────────────────────────────────────────── + _ACTIVATIONS = {"Relu", "LeakyRelu", "Sigmoid", "Tanh", "Clip", "Gelu", "FastGelu"} + + mm_add = mm_act = mm_tp = mm_scale = 0 + for node in graph.node: + if node.op_type != "MatMul": + continue + c = _single_consumer(node) + if c is None: + continue + if c.op_type == "Add": + mm_add += 1 + elif c.op_type in _ACTIVATIONS: + mm_act += 1 + elif c.op_type == "Transpose": + mm_tp += 1 + elif c.op_type == "Mul": + # Mul with a scalar → scale fusion; heuristic: second input is initializer + if len(c.input) > 1 and _is_initializer_input(c.input[1]): + mm_scale += 1 + + # Also check Transpose → MatMul + tp_before_mm = sum( + 1 for node in graph.node if node.op_type == "Transpose" and 
_consumer_is(node, "MatMul") + ) + + if mm_add: + _add("matmul_add_fusion", f"{mm_add} MatMul→Add pattern(s)", mm_add) + _add( + "matmuladd_reshapegemm", + f"{mm_add} MatMul+Add → Reshape+Gemm rewrite candidate(s)", + mm_add, + ) + if mm_act: + _add("matmul_activation_fusion", f"{mm_act} MatMul→activation pattern(s)", mm_act) + if mm_tp + tp_before_mm: + _add( + "matmul_transpose_fusion", + f"{mm_tp + tp_before_mm} MatMul↔Transpose pattern(s)", + mm_tp + tp_before_mm, + ) + if mm_scale: + _add("matmul_scale_fusion", f"{mm_scale} MatMul→Mul(scalar) pattern(s)", mm_scale) + + # ── Conv patterns ────────────────────────────────────────────────────────── + conv_bn = conv_add = conv_mul = conv_act = conv_add_act = pad_conv = 0 + for node in graph.node: + if node.op_type == "Pad" and _consumer_is(node, "Conv"): + pad_conv += 1 + + if node.op_type != "Conv": + continue + c = _single_consumer(node) + if c is None: + continue + if c.op_type == "BatchNormalization": + conv_bn += 1 + elif c.op_type == "Add": + conv_add += 1 + # Check for Conv → Add → activation chain + cc = _single_consumer(c) + if cc and cc.op_type in _ACTIVATIONS: + conv_add_act += 1 + elif c.op_type == "Mul": + conv_mul += 1 + elif c.op_type in _ACTIVATIONS: + conv_act += 1 + + if conv_bn: + _add("conv_bn_fusion", f"{conv_bn} Conv→BN pattern(s)", conv_bn) + if conv_add: + _add("conv_add_fusion", f"{conv_add} Conv→Add pattern(s)", conv_add) + if conv_mul: + _add("conv_mul_fusion", f"{conv_mul} Conv→Mul pattern(s)", conv_mul) + if conv_act: + _add("conv_activation_fusion", f"{conv_act} Conv→activation pattern(s)", conv_act) + if conv_add_act: + _add( + "conv_add_activation_fusion", + f"{conv_add_act} Conv→Add→activation chain(s) (FusedConv)", + conv_add_act, + ) + if pad_conv: + _add("pad_fusion", f"{pad_conv} Pad→Conv pattern(s)", pad_conv) + + # ── Gemm patterns ────────────────────────────────────────────────────────── + gemm_act = gemm_add = gemm_tp = 0 + for node in graph.node: + if node.op_type != 
"Gemm": + continue + c = _single_consumer(node) + if c is None: + continue + if c.op_type in _ACTIVATIONS: + gemm_act += 1 + elif c.op_type == "Add": + gemm_add += 1 + elif c.op_type == "Transpose": + gemm_tp += 1 + tp_before_gemm = sum( + 1 for node in graph.node if node.op_type == "Transpose" and _consumer_is(node, "Gemm") + ) + if gemm_act: + _add("gemm_activation_fusion", f"{gemm_act} Gemm→activation pattern(s)", gemm_act) + if gemm_add: + _add("gemm_sum_fusion", f"{gemm_add} Gemm→Add pattern(s)", gemm_add) + if gemm_tp + tp_before_gemm: + _add( + "gemm_transpose_fusion", + f"{gemm_tp + tp_before_gemm} Gemm↔Transpose pattern(s)", + gemm_tp + tp_before_gemm, + ) + + # ── Elimination patterns ─────────────────────────────────────────────────── + slice_count = sum(1 for n in graph.node if n.op_type == "Slice") + expand_count = sum(1 for n in graph.node if n.op_type == "Expand") + unsqueeze_init = sum( + 1 + for n in graph.node + if n.op_type == "Unsqueeze" and n.input and _is_initializer_input(n.input[0]) + ) + concat_slice = sum(1 for n in graph.node if n.op_type == "Concat" and _consumer_is(n, "Slice")) + + if slice_count > 3: + _add("slice_elimination", f"{slice_count} Slice nodes (potential redundancy)", slice_count) + if expand_count > 2: + _add("expand_elimination", f"{expand_count} Expand nodes", expand_count) + if unsqueeze_init: + _add( + "unsqueeze_elimination", + f"{unsqueeze_init} Unsqueeze(initializer) node(s)", + unsqueeze_init, + ) + if concat_slice: + _add( + "concat_slice_elimination", + f"{concat_slice} Concat→Slice pattern(s) (reverse-split)", + concat_slice, + ) + + # ── Layout patterns ──────────────────────────────────────────────────────── + tp_count = sum(1 for n in graph.node if n.op_type == "Transpose") + if tp_count > 10: + _add( + "transpose_optimizer", + f"{tp_count} Transpose nodes — optimizer may collapse chains", + tp_count, + ) + + # Reshape → Transpose → Reshape with high-dimensional input (rank > 4) + rtr_highdim = 0 + for node 
def run_graph_analysis(onnx_path: Path) -> GraphInfo:
    """Load an ONNX model and summarise its operator mix.

    A default ``GraphInfo`` is returned unchanged when the ``onnx`` package
    is not importable (``_ONNX_OK`` is false) or ``onnx_path`` does not exist.
    Otherwise the result carries the op histogram, Conv/MatMul/Gemm
    percentages, the Transpose count, GELU-related op hints, the dynamic-axes
    flag and the fusion-candidate list. Any exception raised while loading or
    scanning the model is printed and reported as ``available = False``.
    """
    info = GraphInfo()
    if not (_ONNX_OK and onnx_path.exists()):
        return info

    # (op type present in graph, tag appended to info.gelu_types)
    gelu_hints = (("Erf", "erf"), ("Tanh", "tanh"), ("Sigmoid", "sigmoid/quick"))

    try:
        graph = onnx.load(str(onnx_path)).graph
        op_hist: Counter = Counter(node.op_type for node in graph.node)
        n_ops = sum(op_hist.values())

        info.total_ops = n_ops
        info.op_counts = dict(op_hist)
        info.available = True

        # Percentages are only defined for a non-empty graph.
        if n_ops > 0:
            info.conv_pct, info.matmul_pct, info.gemm_pct = (
                op_hist.get(op, 0) / n_ops * 100 for op in ("Conv", "MatMul", "Gemm")
            )
        info.transpose_count = op_hist.get("Transpose", 0)

        # Only an Erf node marks the GELU as "decomposed"; Tanh/Sigmoid are
        # recorded as type hints without setting that flag.
        for op_type, tag in gelu_hints:
            if not op_hist.get(op_type, 0):
                continue
            if op_type == "Erf":
                info.has_gelu_decomposed = True
            info.gelu_types.append(tag)

        # A symbolic (string) dimension on any graph input means dynamic axes.
        if any(
            dim.dim_param
            for graph_input in graph.input
            for dim in graph_input.type.tensor_type.shape.dim
        ):
            info.has_dynamic_axes = True

        info.fusion_candidates = _detect_fusion_candidates(graph)

    except Exception as e:
        info.available = False
        print(f" [analyze_insight] graph analysis failed: {e}")

    return info
def build_insight(
    onnx_path: Path,
    winml: str,
    ep: str,
    device: str,
    hypotheses: list[tuple[str, Any, str]],
    kb: dict,
) -> InsightResult:
    """Fuse graph + analyze + KB signals into skip_set and priority_boosts.

    Args:
        onnx_path: Path to baseline ONNX (post-export, pre-optim).
        winml: Path to winml executable.
        ep: Execution provider string (e.g. "cpu", "qnn").
        device: Device string (e.g. "cpu", "npu").
        hypotheses: List of (label, patch_fn, dimension) from autoconfig.py.
        kb: dict from load_ep_knowledge(ep); only its "notes" list is read.

    Returns:
        InsightResult with skip_set, priority_boosts, notes, plus the raw
        graph_info and analyze_result that the decisions were derived from.
    """
    import math

    result = InsightResult()
    notes = result.notes
    boosts = result.priority_boosts

    def _bump(label: str, delta: float) -> None:
        """Add *delta* to the accumulated priority of *label*."""
        boosts[label] = boosts.get(label, 0) + delta

    print("\n=== Phase 1: Insight Engine ===")

    # ── signal 1: graph analysis ───────────────────────────────
    print(" [1/3] Graph analysis…")
    g = run_graph_analysis(onnx_path)
    result.graph_info = g
    if g.available:
        top5 = sorted(g.op_counts.items(), key=lambda x: -x[1])[:5]
        print(
            f" total_ops={g.total_ops} conv%={g.conv_pct:.1f} "
            f"matmul%={g.matmul_pct:.1f} gemm%={g.gemm_pct:.1f} "
            f"transpose={g.transpose_count} dynamic_axes={g.has_dynamic_axes}"
        )
        print(f" top ops: {dict(top5)}")
        if g.fusion_candidates:
            print(f" fusion candidates ({len(g.fusion_candidates)}):")
            for fc in g.fusion_candidates[:10]:  # top-10 only
                print(f" [{fc.count:3d}×] {fc.flag:40s} {fc.evidence}")
            if len(g.fusion_candidates) > 10:
                print(f" ... and {len(g.fusion_candidates) - 10} more")
    else:
        print(" [skip] onnx not available or model not found")

    # ── signal 2: winml analyze ────────────────────────────────
    print(f" [2/3] winml analyze --ep {ep}…")
    ar = run_winml_analyze(winml, onnx_path, ep, device)
    result.analyze_result = ar
    if ar.available:
        print(
            f" supported={len(ar.supported)} partial={len(ar.partial)} "
            f"unsupported={len(ar.unsupported)} unknown={len(ar.unknown)}"
        )
        if ar.partial:
            print(f" partial ops: {ar.partial[:5]}")
        if ar.unsupported:
            print(f" unsupported ops: {ar.unsupported[:5]}")
    else:
        print(" [skip] no rule data for this EP or analyze failed")

    # ── signal 3: KB confirmed rules ───────────────────────────
    print(" [3/3] Applying KB confirmed rules…")

    # ── build skip_set ─────────────────────────────────────────

    # KB-derived skips: every "[KB confirmed] Skip pass: <name>" note blocks the
    # hypotheses whose label mentions that pass (underscore or hyphen spelling).
    for note in kb.get("notes", []):
        if "[KB confirmed] Skip pass:" not in note:
            continue
        pass_name = note.split("Skip pass:")[-1].strip()
        for label, _, _ in hypotheses:
            if pass_name.replace("_", "-") in label or pass_name in label:
                result.skip_set.add(label)
                notes.append(f"skip [{label}]: KB confirmed rule — {pass_name}")

    # Graph-derived skips
    if g.available:
        # npu-006: Conv% > 20% → hard-block conv fusions on QNN NPU
        if ep in ("qnn",) and device == "npu" and g.conv_pct > 20.0:
            for label, _, dim in hypotheses:
                if dim == "graph_pass" and any(kw in label for kw in ("conv", "bn", "batch")):
                    result.skip_set.add(label)
                    notes.append(
                        f"skip [{label}]: npu-006 — Conv%={g.conv_pct:.1f}%>20% on QNN NPU"
                        " (FusedConv → CPU fallback)"
                    )

    # NOTE(review): the three EP-only rules below read no graph statistics, so
    # they are applied whether or not graph analysis succeeded. The original
    # nesting was lost with the file's indentation — confirm this is intended.

    # cpu-001: opset > 17 regresses on CPU (empirical, mechanism unknown)
    if ep == "cpu":
        for label, _, dim in hypotheses:
            if dim == "opset" and "21" in label:
                notes.append(
                    f"deprioritise [{label}]: cpu-001 — opset21 regresses on CPU"
                    " (non-monotonic, mechanism unknown)"
                )
                _bump(label, -5)

    # gpu-004: QNN GPU — skip all quantization
    if ep == "qnn" and device == "gpu":
        for label, _, dim in hypotheses:
            if dim in ("quant", "precision"):
                result.skip_set.add(label)
                notes.append(f"skip [{label}]: gpu-004 — quantization hangs on QNN GPU")

    # nhwc-transformer regresses p90 on DML/QNN GPU transformers
    if ep in ("dml",) or (ep == "qnn" and device == "gpu"):
        for label, _, _ in hypotheses:
            if "nhwc" in label.lower():
                result.skip_set.add(label)
                notes.append(
                    f"skip [{label}]: dml-002/gpu-002 — nhwc-transformer increases p90 variance"
                )

    # ── build priority_boosts ──────────────────────────────────

    # Labels that already received a positive "boost [...]" note. Tracked
    # explicitly: the previous implementation recovered them by parsing notes
    # with str.lstrip("boost ["), which strips a *character set* and therefore
    # also ate leading b/o/s/t/[/space characters of the label itself, so such
    # labels were never recognised and got boosted twice.
    boosted_labels: set[str] = set()

    if g.available:
        # DINOv2-family on QNN NPU: opset21 gets strong positive boost (npu-001)
        if ep == "qnn" and device == "npu":
            # Heuristic: DINOv2 has many Reshape and high attention ops
            reshape_count = g.op_counts.get("Reshape", 0)
            if reshape_count > 30 and g.conv_pct < 10:
                for label, _, dim in hypotheses:
                    if dim == "opset" and "21" in label:
                        _bump(label, 10)
                        boosted_labels.add(label)
                        notes.append(
                            f"boost [{label}]: npu-001 heuristic — high Reshape count"
                            f" ({reshape_count}) + low Conv% suggests DINOv2-family"
                        )

        # Fusion-candidate-driven boosts: map detected patterns → hypothesis labels
        #
        # Strategy: for each FusionCandidate, find hypotheses whose label
        # mentions one of the flag's keywords. Boost proportional to log(count)
        # so that "288 MatMul→Add" doesn't overwhelm "12 GELU" by 24×.
        _FLAG_KEYWORDS: dict[str, list[str]] = {
            "gelu_fusion": ["gelu"],
            "fast_gelu_fusion": ["gelu", "fast"],
            "bias_gelu_fusion": ["gelu", "bias"],
            "quick_gelu_fusion": ["gelu", "quick"],
            "gelu_singlegelu": ["gelu"],
            "layer_norm_fusion": ["layer_norm", "layernorm", "ln"],
            "skip_layer_norm_fusion": ["skip_layer_norm", "skip_ln"],
            "simplified_layer_norm_fusion": ["layer_norm", "simplified"],
            "fuse_rmsnorm": ["rmsnorm", "rms_norm"],
            "attention_fusion": ["attention"],
            "bias_softmax_fusion": ["softmax", "attention"],
            "matmul_add_fusion": ["matmul_add", "matmul-add"],
            "matmul_activation_fusion": ["matmul_act", "matmul-act"],
            "matmul_transpose_fusion": ["matmul_transp", "matmul-transp"],
            "matmul_scale_fusion": ["matmul_scale", "matmul-scale"],
            "matmuladd_reshapegemm": ["reshape_gemm", "matmuladd"],
            "conv_bn_fusion": ["conv_bn", "conv-bn"],
            "conv_add_fusion": ["conv_add", "conv-add"],
            "conv_mul_fusion": ["conv_mul", "conv-mul"],
            "conv_activation_fusion": ["conv_act", "conv-act"],
            "conv_add_activation_fusion": ["conv_add_act", "fused_conv"],
            "pad_fusion": ["pad_conv", "pad-conv"],
            "gemm_activation_fusion": ["gemm_act", "gemm-act"],
            "gemm_sum_fusion": ["gemm_sum", "gemm-sum"],
            "gemm_transpose_fusion": ["gemm_transp"],
            "slice_elimination": ["slice_elim"],
            "unsqueeze_elimination": ["unsqueeze_elim"],
            "expand_elimination": ["expand_elim"],
            "concat_slice_elimination": ["concat_slice"],
            "transpose_optimizer": ["transpose_opt", "tp_opt"],
            "highdimRTR_lowdimRTR": ["rtr", "reshape_transpose"],
        }

        for fc in g.fusion_candidates:
            # Unknown flags fall back to their own hyphenated spelling.
            keywords = _FLAG_KEYWORDS.get(fc.flag, [fc.flag.replace("_", "-")])
            boost = round(1 + math.log(max(fc.count, 1)), 1)
            for label, _, _ in hypotheses:
                label_lower = label.lower()
                if any(kw in label_lower for kw in keywords):
                    _bump(label, boost)
                    boosted_labels.add(label)
                    notes.append(
                        f"boost [{label}] +{boost:.1f}: graph has {fc.count}× {fc.flag} candidate(s)"
                    )

        # GELU-decomposed: additional direct boost for gelu hypotheses that
        # were not already boosted above.
        if g.has_gelu_decomposed:
            for label, _, _ in hypotheses:
                if "gelu" in label.lower() and label not in boosted_labels:
                    _bump(label, 2)
                    boosted_labels.add(label)
                    notes.append(
                        f"boost [{label}]: decomposed GELU detected — fusion likely beneficial"
                    )

        # Conv-dense → conv fusions more likely to help (CPU only — not QNN NPU)
        if g.conv_pct > 40 and ep not in ("qnn",):
            for label, _, dim in hypotheses:
                if "conv" in label.lower() and dim == "graph_pass":
                    _bump(label, 2)
                    notes.append(
                        f"boost [{label}]: high Conv% ({g.conv_pct:.1f}%) — conv fusions promising"
                    )

    # analyze-derived: if partial ops in model → deprioritise those optims
    if ar.available and ar.partial:
        for label, _, _ in hypotheses:
            label_lower = label.lower()
            for pop in ar.partial:
                if pop.lower() in label_lower:
                    _bump(label, -2)
                    notes.append(
                        f"deprioritise [{label}]: op '{pop}' is partial-support on {ep.upper()}"
                    )

    # ── print summary ──────────────────────────────────────────
    print("\n Insight Engine result:")
    print(f" skip_set ({len(result.skip_set)}): {result.skip_set or '(none)'}")
    nonzero_boosts = {k: v for k, v in boosts.items() if v != 0}
    print(f" priority_boosts: {nonzero_boosts or '(none)'}")
    if notes:
        print(" notes:")
        for n in notes:
            print(f" - {n}")
    print()

    return result
+# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""autoconfig.py — AutoResearch-style optimize-pass search for winml-cli +Demo: facebook/convnext-tiny-224, CPU EP, FP32 + +Loop: hypothesize → winml build → quick-screen bench (CV gate) → + full bench (3 sessions) → eval → keep/discard → repeat + +Key design principles (from GPU Optimizer V2 + ConvNext lessons): + 1. Two-phase bench: 200-iter CV screen FIRST, full bench only if CV < threshold + (CPU/GPU) — or unconditionally for QNN NPU (npu-007: DVFS makes CV unreliable) + 2. Use winml perf (NOT winml eval) for latency — eval includes HF preprocessing + 3. Mandatory external-research after 5 consecutive DISCARDs in same dimension + 4. Load ep_knowledge/*.json (only "confirmed" entries) to prune search space + 5. Per-experiment structured output: hypothesis/impl/parity/perf/analysis/decision + 6. Stop condition: 30 consecutive DISCARDs (not 5) + +Hypothesis design — ISOLATED mode (each hypothesis is independent): + Each hypothesis is applied to a fresh copy of BASELINE. The labels "+" prefix + is cosmetic; no state is accumulated across hypotheses. This allows independent + attribution: "does gelu-fusion alone help?" rather than "does gelu help on top + of conv fusions?". To run a cumulative search, chain patch functions explicitly. 
+""" + +import copy +import csv +import json +import sys +import time +from datetime import datetime +from pathlib import Path + +from analyze_insight import build_insight +from bench_utils import ( + FULL_ITERS, + FULL_SESSIONS, + SCREEN_CV_MAX_STD, + SCREEN_ITERS, + SessionManager, + ThroughputOnly, + VerdictInput, + bench_full, + bench_screen, + median_p50, + run_cmd, +) +from report_gen import generate_report + +sys.stdout.reconfigure(encoding="utf-8", errors="replace") # type: ignore[attr-defined] + +# ── settings ───────────────────────────────────────────────────────────────── +MODEL_ID = "facebook/convnext-tiny-224" +TASK = "image-classification" +EP = "cpu" +DEVICE = "cpu" +WINML = str(Path(__file__).parent / ".venv" / "Scripts" / "winml.exe") +WORK_DIR = Path(__file__).parent / "convnext-search" +RESULTS_TSV = WORK_DIR / "results.tsv" +KB_DIR = Path(__file__).parent / "ep_knowledge" + +EVAL_SAMPLES = 50 # for accuracy gate +ACCURACY_FLOOR = 0.70 # cosine drop below this → discard +MIN_IMPROVEMENT = 0.01 # require ≥1% p50 improvement to KEEP + +# Verdict policy: improvement must exceed max(MIN_IMPROVEMENT, STAT_BAR * screen_cv) +# Borrowed from AgenticGPUOptimizer V2 (avoids calling noise-level deltas "improvements") +STAT_BAR_MULTIPLIER = 2.0 + +# Screen early exit: skip 3x full-bench when screen already shows < this % improvement. +# Saves ~25-90 min per rejected hypothesis (3 sessions × FULL_ITERS iters). 
def load_ep_knowledge(ep: str) -> dict:
    """Load the knowledge-base file for *ep* and derive search-space pruning rules.

    Reads ``KB_DIR/<ep>.json``. A finding counts as confirmed when its
    ``mechanism_confirmed`` field is truthy; confirmed findings may switch on
    ``skip_quantization`` (action text mentions both "skip" and
    "quantization"), while every other finding only contributes an
    informational ``[DRAFT]`` note. Passes listed under
    ``search_space_rules.graph_passes.skip`` are always returned as
    ``skip_passes``.

    Findings with a missing ``id`` or ``title`` are tolerated (placeholders are
    used in the notes) instead of aborting the whole load with ``KeyError``.

    Args:
        ep: Execution-provider name; selects the ``<ep>.json`` file.

    Returns:
        ``{"skip_passes": list[str], "skip_quantization": bool, "notes": list[str]}``.
        All three are empty/False when the KB file does not exist.

    Raises:
        json.JSONDecodeError: If the KB file exists but is not valid JSON.
    """
    kb_path = KB_DIR / f"{ep}.json"
    if not kb_path.exists():
        return {"skip_passes": [], "skip_quantization": False, "notes": []}

    kb = json.loads(kb_path.read_text(encoding="utf-8"))
    rules = kb.get("search_space_rules") or {}
    skip_quant = False
    notes: list[str] = []

    for finding in kb.get("findings") or []:
        finding_id = finding.get("id", "(no id)")
        # Confirmation is judged per finding, so a draft that happens to share
        # an id with a confirmed entry is still treated as a draft.
        if not finding.get("mechanism_confirmed", False):
            title = str(finding.get("title", "(untitled)"))
            notes.append(f"[DRAFT] {finding_id}: {title[:60]}…")
            continue

        action = str(finding.get("action_for_autoconfig", "")).lower()
        if "skip" in action and "quantization" in action:
            skip_quant = True
            notes.append(f"[KB confirmed] Skip quantization: {finding_id}")
        if "skip" in action and "compile" in action:
            # Informational only: no compile flag is returned to the caller.
            notes.append(f"[KB confirmed] Skip compile: {finding_id}")

    # Parse search_space_rules for passes to skip
    graph_passes = rules.get("graph_passes") or {}
    skip_passes = list(graph_passes.get("skip") or [])
    notes.extend(f"[KB confirmed] Skip pass: {p}" for p in skip_passes)

    return {"skip_passes": skip_passes, "skip_quantization": skip_quant, "notes": notes}
def h1_conv_fusions(cfg: dict) -> dict:
    """Enable the Conv+BN, Conv+Add and Conv+activation fusions, nothing else.

    NOTE: These are ORT graph-level fusions (conv_bn_fusion etc.) that create FusedConv ops.
    On QNN NPU, FusedConv causes CPU fallback → catastrophic regression (npu-006).
    Only run on CPU/DML EPs.

    The "optim" section of *cfg* is replaced in place and the same dict is
    returned.
    """
    conv_passes = ("conv_bn_fusion", "conv_add_fusion", "conv_activation_fusion")
    cfg["optim"] = dict.fromkeys(conv_passes, True)
    return cfg
def _run_phase_b(
    out_dir: Path,
    label: str,
    exp_info: dict,
    screen_cv: float,
    baseline_p50: float | None,
    best_p50: float,
    best_label: str,
    policy: ThroughputOnly,
) -> tuple[str, dict]:
    """Run Phase B (full bench + accuracy gate) and evaluate with VerdictPolicy.

    Args:
        out_dir: Build output directory; ``model.onnx`` inside it is benchmarked.
        label: Hypothesis label, forwarded to the KB draft writer on a clear win.
        exp_info: Per-experiment record; updated in place with bench, accuracy,
            delta and analysis fields.
        screen_cv: Coefficient of variation from the Phase A screen (fraction).
        baseline_p50: Reference p50 in ms, or None when this run is the
            baseline, in which case its own median becomes the reference.
        best_p50: Unused here; kept for signature compatibility.
        best_label: Unused here; kept for signature compatibility.
        policy: Verdict policy deciding keep/discard from improvement, CV and
            correctness.

    Returns:
        (status_str, updated exp_info). Does not update best_p50/best_label —
        caller is responsible so champion tracking stays in one place.
    """
    full_p50s = _run_full(out_dir / "model.onnx")
    if not full_p50s:
        exp_info["analysis"] = "Phase B winml perf returned no data"
        return "crash (full bench failed)", exp_info

    med_p50 = median_p50(full_p50s)
    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if med_p50 is None:
        exp_info["analysis"] = "Phase B winml perf returned no usable p50"
        return "crash (full bench failed)", exp_info
    exp_info["full_p50s"] = [f"{p:.1f}" for p in full_p50s]
    exp_info["median_p50"] = f"{med_p50:.1f}"

    # Promote baseline from first successful full bench
    if baseline_p50 is None:
        baseline_p50 = med_p50
        exp_info["baseline_p50"] = f"{baseline_p50:.1f}"

    # A non-positive reference makes the relative improvement undefined;
    # report it instead of dying with ZeroDivisionError mid-search.
    if baseline_p50 <= 0:
        exp_info["analysis"] = f"Invalid baseline p50 ({baseline_p50}) — cannot compute delta"
        return "crash (invalid baseline p50)", exp_info

    # Accuracy gate
    accuracy = eval_accuracy(out_dir)
    exp_info["accuracy"] = f"{accuracy:.4f}" if accuracy is not None else "N/A"

    improvement_pct = (baseline_p50 - med_p50) / baseline_p50 * 100
    delta_pct = -improvement_pct  # latency delta: negative means faster
    exp_info["delta_pct"] = f"{delta_pct:+.1f}%"

    # A missing accuracy (eval failed or produced no metric) does not block a KEEP.
    correctness_pass = accuracy is None or accuracy >= ACCURACY_FLOOR
    verdict = policy.evaluate(
        VerdictInput(
            improvement_pct=improvement_pct,
            cv_pct=screen_cv * 100.0,
            correctness_pass=correctness_pass,
        )
    )

    exp_info["analysis"] = verdict.reasoning
    if verdict.verdict in ("KEEP", "MARGINAL_KEEP"):
        status = "keep" + (" (marginal)" if verdict.marginal else "")
        exp_info["analysis"] = (
            f"Improvement confirmed: p50 {baseline_p50:.1f}ms -> {med_p50:.1f}ms "
            f"({delta_pct:+.1f}%). {verdict.reasoning}"
        )
        # Auto-write KB draft entry for notable improvements
        if not verdict.marginal:
            write_kb_draft(
                ep=EP,
                label=label,
                improvement_pct=improvement_pct,
                cv=screen_cv,
                model_id=MODEL_ID,
                dimension=exp_info.get("dimension", "unknown"),
            )
    elif verdict.verdict == "ACC_FAIL":
        # Use the already-formatted value so a None accuracy cannot break the message.
        status = f"discard (accuracy {exp_info['accuracy']} < floor {ACCURACY_FLOOR})"
    else:
        status = f"discard ({verdict.reasoning})"

    return status, exp_info
def write_experiment_doc(exp_dir: Path, info: dict) -> None:
    """Write per-experiment structured artifact (V2 pattern).

    Sections: Hypothesis / Implementation / Parity / Perf / Analysis / Decision.
    The document is written to ``exp_dir/experiment.md`` (UTF-8); ``exp_dir``
    is created if needed.

    Args:
        exp_dir: Directory that receives ``experiment.md``.
        info: Experiment record. ``iter`` (int) and ``label`` are required;
            every other field falls back to a placeholder. ``accuracy`` is
            expected as a formatted string such as ``"0.8123"`` or ``"N/A"``;
            a plain number is accepted as well.

    The parity result is ``PASS``/``FAIL`` when the accuracy parses as a
    number, and ``N/A`` when no accuracy was recorded. The previous code
    compared the formatted string directly against the float floor, which
    raised ``TypeError`` for every experiment that reached the accuracy gate.
    """
    exp_dir.mkdir(parents=True, exist_ok=True)

    try:
        accuracy_value = float(info.get("accuracy"))
    except (TypeError, ValueError):
        # Missing key, None, or a placeholder such as "N/A".
        accuracy_value = None
    if accuracy_value is None:
        parity = "N/A"
    else:
        parity = "PASS" if accuracy_value >= ACCURACY_FLOOR else "FAIL"

    doc = f"""# Experiment {info["iter"]:02d}: {info["label"]}

## Hypothesis
{info.get("hypothesis", "(not recorded)")}

## Implementation
- Config flags: `{info.get("optim_flags", "")}`
- Opset: `{info.get("opset", 17)}`
- Search dimension: `{info.get("dimension", "")}`

## Parity (accuracy gate)
- Accuracy: `{info.get("accuracy", "N/A")}`
- Floor: `{ACCURACY_FLOOR}`
- Result: `{parity}`

## Performance
### Phase A (quick screen, {SCREEN_ITERS} iters)
- p50: `{info.get("screen_p50", "N/A")}ms`
- CV: `{info.get("screen_cv", "N/A")}` (threshold: {SCREEN_CV_MAX})

### Phase B (full bench, {FULL_ITERS}×{FULL_SESSIONS} sessions)
- p50 per session: `{info.get("full_p50s", [])}`
- Median p50: `{info.get("median_p50", "N/A")}ms`
- Baseline p50: `{info.get("baseline_p50", "N/A")}ms`
- Delta: `{info.get("delta_pct", "N/A")}`

## Analysis
{info.get("analysis", "(auto-generated: no significant analysis)")}

## Decision
**{info.get("status", "UNKNOWN").upper()}**

Timestamp: {datetime.now().isoformat(timespec="seconds")}
"""
    (exp_dir / "experiment.md").write_text(doc, encoding="utf-8")
def main() -> None:
    """Run the hypothesize → build → screen → full-bench → verdict search loop.

    Steps:
      1. Load confirmed KB rules and any saved session state (crash-resume).
      2. Run the Insight Engine on the baseline ONNX (if it already exists) to
         obtain a skip set and per-hypothesis priority boosts.
      3. Execute the hypotheses: the baseline first, then the rest in
         descending boost order. Each one is built from a fresh copy of
         BASELINE, screened, optionally full-benched, documented, logged to
         the results TSV and persisted to the session.
      4. Stop after STOP_CONSECUTIVE_DISCARDS consecutive discards, then print
         a summary and generate the HTML report.

    Every hypothesis keeps its position in HYPOTHESES as its iteration id, so
    ``iter_NN`` directories and resume bookkeeping stay stable when the boost
    ordering changes between runs, and ``iter_00`` is always the baseline.
    """
    WORK_DIR.mkdir(parents=True, exist_ok=True)

    # Load EP knowledge (confirmed entries only)
    kb = load_ep_knowledge(EP)
    print(f"\n=== KB loaded for EP={EP} ===")
    for note in kb["notes"]:
        print(f" {note}")

    # Resume from prior session if interrupted
    session = SessionManager(WORK_DIR)

    sep = "=" * 64
    print(f"\n{sep}")
    print(f" autoconfig search -- {MODEL_ID}")
    print(f" EP: {EP} eval_samples: {EVAL_SAMPLES} hypotheses: {len(HYPOTHESES)}")
    print(
        f" Bench: screen={SCREEN_ITERS} iters (CV<{SCREEN_CV_MAX}) -> full={FULL_ITERS}x{FULL_SESSIONS}"
    )
    print(f" Stop: {STOP_CONSECUTIVE_DISCARDS} consecutive DISCARDs OR budget")
    print(f" External research trigger: after {EXTERNAL_RESEARCH_TRIGGER} DISCARDs same dimension")
    print(
        f" Verdict: improvement must exceed max({MIN_IMPROVEMENT * 100:.0f}%, {STAT_BAR_MULTIPLIER:.0f}x screen-CV)"
    )
    print(
        f" Screen early exit: skip full bench if screen improvement < {SCREEN_PASS_MIN_IMPROVEMENT_PCT:.0f}%"
    )
    print(f"{sep}\n")

    # Restore state from prior session (if resuming)
    baseline_p50: float | None = session.baseline_p50
    best_p50 = session.best_p50
    best_label = session.best_label
    consecutive_discards = session.consecutive_discards
    discard_by_dimension: dict[str, int] = session.discard_by_dimension

    policy = ThroughputOnly(
        min_improvement_pct=MIN_IMPROVEMENT * 100,
        stat_bar_multiplier=STAT_BAR_MULTIPLIER,
    )

    # ── Phase 1: Insight Engine ────────────────────────────────────────────────
    # The baseline ONNX is expected at WORK_DIR/iter_00/model.onnx once h0 has run.
    # On first run the baseline may not exist yet — insight falls back gracefully.
    baseline_onnx = WORK_DIR / "iter_00" / "model.onnx"
    insight = build_insight(
        onnx_path=baseline_onnx,
        winml=WINML,
        ep=EP,
        device=DEVICE,
        hypotheses=HYPOTHESES,
        kb=kb,
    )

    # Order by priority boost (highest first; sorted() is stable), but pin the
    # baseline to the front: it supplies the reference p50 for every other
    # hypothesis, so a boosted hypothesis must never run ahead of it.
    def _sort_key(item: tuple) -> tuple[bool, float]:
        _, (lbl, _, dim) = item
        return (dim != "baseline", -insight.priority_boosts.get(lbl, 0.0))

    active_hypotheses = sorted(enumerate(HYPOTHESES), key=_sort_key)

    for i, (label, patch_fn, dimension) in active_hypotheses:
        # Skip iters completed in a prior run
        if i in session.completed_iters:
            print(f" [resume] skipping iter {i} ({label}) — already done")
            continue

        iter_start = time.time()
        print(f"\n{'--' * 32}")
        print(f" iter {i} | {label} [{dimension}]")
        print(f"{'--' * 32}")

        # Each hypothesis patches a fresh copy of BASELINE (isolated mode).
        cfg = patch_fn(copy.deepcopy(BASELINE))  # type: ignore[operator]
        flags = optim_flags(cfg)

        # Check KB skip_set (confirmed rules only)
        skip_reason = next(
            (r for r in kb["skip_passes"] if any(f in flags for f in r.split()[:2])), None
        )
        if skip_reason:
            print(f" skipped by KB confirmed rule: {skip_reason}")
            continue

        # Check insight skip_set (Phase 1 analysis-derived rules)
        if label in insight.skip_set:
            print(f" skipped by Insight Engine: {label}")
            continue

        opset = cfg["export"]["opset_version"]
        print(f" optim: {flags}")
        print(f" opset: {opset}")

        out_dir = WORK_DIR / f"iter_{i:02d}"
        exp_dir = WORK_DIR / "experiments" / f"{i:02d}_{dimension}"
        ok, _ = build(cfg, out_dir)

        exp_info: dict = {
            "iter": i,
            "label": label,
            "dimension": dimension,
            "optim_flags": flags,
            "opset": opset,
            "hypothesis": label,
            "baseline_p50": f"{baseline_p50:.1f}" if baseline_p50 is not None else "N/A",
        }

        if not ok:
            status = "crash"
            exp_info["analysis"] = "winml build failed — check build log"
        else:
            # Phase A: quick screen
            screen_p50, screen_cv = _run_screen(out_dir / "model.onnx")
            exp_info["screen_p50"] = f"{screen_p50:.1f}" if screen_p50 is not None else "UNSTABLE"
            exp_info["screen_cv"] = f"{screen_cv:.3f}"

            run_full_bench = False
            if screen_p50 is None:
                status = "discard (unstable — CV too high)"
                exp_info["analysis"] = (
                    f"Phase A rejected: CV={screen_cv:.2f} > {SCREEN_CV_MAX}. "
                    f"Thermal or scheduling noise on {EP.upper()} EP. Cool device and retry."
                )
            elif baseline_p50 is None:
                # First iteration: no baseline yet — always run full bench
                run_full_bench = True
            else:
                # Screen early exit: skip full bench when screen shows negligible gain.
                # Saves 3x full-bench time for clearly non-improving configs.
                screen_improvement_pct = (baseline_p50 - screen_p50) / baseline_p50 * 100
                if screen_improvement_pct < SCREEN_PASS_MIN_IMPROVEMENT_PCT:
                    status = (
                        f"discard (screen early exit: improvement {screen_improvement_pct:+.1f}%"
                        f" < {SCREEN_PASS_MIN_IMPROVEMENT_PCT:.0f}% — full bench skipped)"
                    )
                    exp_info["analysis"] = (
                        f"Phase A early exit: screen p50={screen_p50:.1f}ms vs baseline "
                        f"{baseline_p50:.1f}ms ({screen_improvement_pct:+.1f}% improvement) is "
                        f"below {SCREEN_PASS_MIN_IMPROVEMENT_PCT:.0f}% threshold. "
                        f"Full bench skipped — not worth 3x{FULL_ITERS} iters."
                    )
                    exp_info["delta_pct"] = f"{-screen_improvement_pct:+.1f}% (screen estimate)"
                else:
                    run_full_bench = True

            if run_full_bench:
                status, exp_info = _run_phase_b(
                    out_dir,
                    label,
                    exp_info,
                    screen_cv,
                    baseline_p50,
                    best_p50,
                    best_label,
                    policy,
                )
                if status.startswith("keep"):
                    # Update champion tracking (single place for both the
                    # baseline and the non-baseline path).
                    new_p50 = float(exp_info.get("median_p50", best_p50))
                    if new_p50 < best_p50:
                        best_p50 = new_p50
                        best_label = label
                        status = "keep *** NEW BEST ***"

        # Extract baseline from first successful full bench
        if baseline_p50 is None and "median_p50" in exp_info:
            try:
                baseline_p50 = float(exp_info["median_p50"])
                exp_info["baseline_p50"] = f"{baseline_p50:.1f}"
            except (ValueError, TypeError):
                pass

        # Write per-experiment doc (V2 pattern)
        exp_info["status"] = status
        write_experiment_doc(exp_dir, exp_info)

        # Track consecutive discards + external research trigger
        if "discard" in status or "crash" in status:
            consecutive_discards += 1
            discard_by_dimension[dimension] = discard_by_dimension.get(dimension, 0) + 1
            if discard_by_dimension[dimension] == EXTERNAL_RESEARCH_TRIGGER:
                print(
                    f"\n EXTERNAL RESEARCH TRIGGER: {EXTERNAL_RESEARCH_TRIGGER} consecutive DISCARDs in [{dimension}]"
                )
                print(" -> Search ORT/QNN source code for mechanism before continuing")
                print(
                    " -> Check kMaxSupportedOpset for opset dimension, EP-specific rules for others"
                )
                print(f" -> File findings in ep_knowledge/{EP}.json as 'draft' entry")
        else:
            consecutive_discards = 0
            discard_by_dimension[dimension] = 0

        # Log to TSV
        log(
            {
                "iter": i,
                "label": label,
                "dimension": dimension,
                "optim_flags": flags,
                "opset": opset,
                "accuracy": exp_info.get("accuracy", "N/A"),
                "screen_p50_ms": exp_info.get("screen_p50", "N/A"),
                "median_p50_ms": exp_info.get("median_p50", "N/A"),
                "baseline_p50_ms": exp_info.get("baseline_p50", "N/A"),
                "delta_pct": exp_info.get("delta_pct", "N/A"),
                "cv": exp_info.get("screen_cv", "N/A"),
                "status": status,
                "elapsed_s": f"{time.time() - iter_start:.0f}",
                "timestamp": datetime.now().isoformat(timespec="seconds"),
            }
        )

        print(f" -> {status}")

        # Persist state for crash-resume
        session.save(
            iter_idx=i,
            verdict=status,
            baseline_p50=baseline_p50,
            best_p50=best_p50,
            best_label=best_label,
            consecutive_discards=consecutive_discards,
            discard_by_dimension=discard_by_dimension,
        )

        # Stop condition
        if consecutive_discards >= STOP_CONSECUTIVE_DISCARDS:
            print(f"\n STOP: {STOP_CONSECUTIVE_DISCARDS} consecutive DISCARDs — plateau reached")
            break

    print(f"\n{sep}")
    print(" SEARCH COMPLETE")
    print(f" Best config: {best_label}")
    print(f" Best p50: {best_p50:.1f}ms" if best_p50 < float("inf") else " No improvement found")
    print(f" Results: {RESULTS_TSV}")
    print(f" Experiments: {WORK_DIR / 'experiments'}")

    # ── Phase 3: Generate HTML report ─────────────────────────────────────────
    try:
        report_path = generate_report(
            results_tsv=RESULTS_TSV,
            work_dir=WORK_DIR,
            model_id=MODEL_ID,
            ep=EP,
            insight_notes=insight.notes,
        )
        print(f" Report: {report_path}")
    except Exception as e:
        # Best-effort: a report failure must not hide the completed search.
        print(f" [warn] Report generation failed: {e}")

    print(f"{sep}\n")

autoconfig — Skill Architecture

+

Profile-guided autonomous config search for WinApp developers

+
v3 · 2026-06-17 · AgenticGPUOptimizer V2 patterns applied
+ +
+ + +
+
👤
+
+ User input  —  + Model ID  +  Target EP  +  Objective: + accuracy-primary + latency-primary + Pareto +  + optional budget / accuracy floor +
+
+ +
+ + +
+
Phase 0 · Intake
+
+
+
Inspect
+
    +
  • winml inspect
  • +
  • EP availability check
  • +
  • Load session.json (crash-resume)
  • +
+
+
+
+
Baseline Build
+
    +
  • winml build (opset17, no quant)
  • +
  • Record baseline p50
  • +
+
+
+
+
Correctness Contract
+
    +
  • winml eval --mode compare
  • +
  • Reference: original ONNX or HF PyTorch
  • +
  • Lock cosine similarity = 1.000
  • +
+
+
+
+ +
+ + +
+
Phase 1 · Insight
+
+ +
+
+
Runtime Profile
+
    +
  • winml perf --profile (pending #158)
  • +
  • Per-op kernel time, bottleneck %
  • +
+
+
+
Static Analyzer
+
    +
  • winml analyze --ep <ep>
  • +
  • Conv% → npu-006 risk flag
  • +
  • Partial-support op list
  • +
+
+
+
Graph Analysis
+
    +
  • Op counts by type
  • +
  • Fusion opportunities
  • +
  • Static vs dynamic axes
  • +
+
+
+ +
+
+
Insight Engine — fuse 3 signals →
+
+
+ +
+
+
skip_set
+
Hard blocks from KB rules
e.g. conv fusions on QNN NPU
+
+
+
priority_queue
+
Ranked hypotheses from profile
e.g. opset21 for DINOv2-family
+
+
+ +
+
+ +
+ + +
+
Phase 2 · Opt Loop
+
+
+ +
+ +
+
Explorer
+
    +
  • Skip completed iters from session.json NEW
  • +
  • Pop next hypothesis from priority_queue
  • +
  • Check KB rules → skip if pruned
  • +
  • Build config.json delta
  • +
+
+ +
+ +
+
Optimizer
+
    +
  • winml build -c config.json
  • +
  • Phase A — screen (200 iters): CV gate for CPU/GPU; disabled for QNN NPU (DVFS)
  • +
  • Early exit NEW: screen Δ < 1% → DISCARD, skip full bench
  • +
  • Phase B — full bench (3 × 1000 iters, 60s cool-down)
  • +
  • winml eval → accuracy gate
  • +
+
+ +
+ +
+
Reviewer — ThroughputOnly NEW
+
    +
  • threshold = max(1%, 2.0 × CV)
  • +
+
+ KEEP >1.5×thr + MARGINAL 1×–1.5× + DISCARD + EARLY DISCARD + ACC/BUILD FAIL +
+
+ +
+
Crash-Resume NEW
+
    +
  • Atomic write after every experiment
  • +
  • Stores: completed iters, baseline/best p50, discard counters
  • +
+
+ +
↩ loop back to Explorer
+ +
+ +
+
+
Stop conditions
+
    +
  • Objective met
  • +
  • 30 consecutive DISCARDs
  • +
  • Queue empty
  • +
  • User stops
  • +
+
+
+
results.tsv
+ config · screen_p50 · median_p50
+ CV · delta_pct · status +
+
+
session.json
+ completed_iters
+ baseline/best p50
+ discard counters +
+
+
ep_knowledge/
+ New entries as
+ status="draft" +
+
+ +
+
+
+ +
+ + +
+
Phase 3 · Outcome
+
+
+
+
Champion Config
+ Best config + provenance + config_<ep>_optimal.json +
+
+
HTML Report
+ Chart + experiment table + report.html +
+
+
Experiment Artifacts
+ Per-hypothesis logs + experiments/<n>/ +
+
+
KB Draft Entry
+ New findings, promoted after Gate 2 + ep_knowledge/<ep>.json +
+
+
Feature Requirements
+ Issues filed per finding + #NNN · <feature gap title> +
+
+
+
+ + +
+ v3 · 2026-06-17: + ThroughputOnly verdict policy (threshold = max(1%, 2×CV)); + screen early exit (Δ<1% skips full bench, saves ~25–90 min); + crash-resume via atomic session.json. +  ·  + Key constraints: + npu-006 (Conv%>20% → block conv fusions); + npu-007 (CV gate off on NPU); + cpu-001 (opset17 on CPU); + gpu-004 (no quant on QNN GPU). +
+ +
+ + diff --git a/research/autoconfig/bench_utils.py b/research/autoconfig/bench_utils.py new file mode 100644 index 000000000..800f67e1d --- /dev/null +++ b/research/autoconfig/bench_utils.py @@ -0,0 +1,566 @@ +#!/usr/bin/env python3 +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""bench_utils.py — Shared benchmarking helpers for QNN NPU sweeps. + +Bench protocol (npu-007): + Phase A: 200-iter screen. For QNN NPU, high CV (0.15-1.2) is NORMAL due to + DVFS/Hexagon HTP thermal throttling. Phase A result is informational only; + it never gates Phase B on NPU. Only use CV gate for CPU/GPU EPs. + Phase B: 3 independent sessions x 500 iters with 30s cool-down. + KEEP criterion: all p50s below baseline; for NPU, ranges must not overlap. + +winml config + build helpers are also centralized here to avoid duplication +between catalog_qnn_sweep.py and validation_sweep.py. 
def run_cmd(cmd: list[str], label: str = "", timeout: int = 600) -> tuple[int, str, float]:
    """Run a subprocess command and capture its output.

    Args:
        cmd: Argument vector, executed without a shell.
        label: Text for the progress line. When empty, the sub-command
            (``cmd[1]``) is shown, or the whole command when it has only
            one element.
        timeout: Seconds to wait before the child is killed.

    Returns:
        ``(returncode, combined_output, elapsed_s)`` where ``combined_output``
        is stdout followed by stderr. Failures that prevent a normal exit are
        reported through negative sentinels instead of exceptions, so a sweep
        can record the failure and move on: ``-999`` when the timeout
        expired, ``-998`` when the process could not be started at all
        (executable missing, not executable, ...).
    """
    t0 = time.time()
    # Guard the fallback: a one-element command has no cmd[1].
    shown = label or (cmd[1] if len(cmd) > 1 else " ".join(cmd))
    print(f" >> {shown}", flush=True)
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        elapsed = time.time() - t0
        print(f" TIMEOUT after {elapsed:.0f}s", flush=True)
        return -999, f"TIMEOUT after {timeout}s", elapsed
    except OSError as exc:
        # Covers FileNotFoundError / PermissionError raised while spawning.
        elapsed = time.time() - t0
        print(f" SPAWN FAILED: {exc}", flush=True)
        return -998, f"SPAWN FAILED: {exc}", elapsed

    elapsed = time.time() - t0
    tag = "ok" if result.returncode == 0 else f"rc={result.returncode}"
    print(f" {elapsed:.0f}s [{tag}]", flush=True)
    if result.returncode != 0:
        # Show only the tail; build logs can be very long.
        snippet = (result.stderr or result.stdout or "")[-600:]
        print(f" stderr: {snippet}", flush=True)
    return result.returncode, (result.stdout or "") + (result.stderr or ""), elapsed
Returns parsed dict or None on failure. + + Tries with --model-type first, then falls back without it. + """ + + def _try(extra_args: list[str]) -> dict | None: + cmd = [ + winml, + "config", + "-m", + model_id, + "-t", + task, + "--device", + device, + "--ep", + ep, + "--no-compile", + "-o", + str(out_path), + ] + extra_args + rc, _, _ = run_cmd(cmd, label="winml config", timeout=CONFIG_TIMEOUT_S) + if rc == 0 and out_path.exists(): + try: + cfg = json.loads(out_path.read_text(encoding="utf-8")) + out_path.unlink(missing_ok=True) + return cfg + except Exception as e: + print(f" [warn] config parse error: {e}", flush=True) + out_path.unlink(missing_ok=True) + return None + + cfg = _try(["--model-type", model_type]) + if cfg is None: + print(" [warn] config with --model-type failed, retrying without...", flush=True) + cfg = _try([]) + return cfg + + +def run_build( + winml: str, + model_id: str, + cfg_path: Path, + out_dir: Path, + ep: str, + device: str, + extra_flags: list[str] | None = None, +) -> tuple[bool, str]: + """Run `winml build`. 
def find_model_onnx(hyp_dir: Path) -> Path | None:
    """Locate the best ONNX artifact in a build output directory.

    Priority: a file whose name contains ``quantized``, then one whose name
    contains ``optimized``, then any ``*.onnx`` file.

    Candidates are sorted by path before selection so the result does not
    depend on directory enumeration order, which varies across filesystems.

    Args:
        hyp_dir: Build output directory of one hypothesis.

    Returns:
        The chosen model path, or ``None`` if the directory holds no
        ``.onnx`` file.
    """
    candidates = sorted(hyp_dir.glob("*.onnx"))
    if not candidates:
        return None
    for preference in ("quantized", "optimized"):
        for candidate in candidates:
            if preference in candidate.name:
                return candidate
    return candidates[0]
class ScreenResult:
    """Result from the Phase A quick screen.

    Attributes are plain values so the object can be created for both a
    successful screen and a failed one:

    * ``p50_ms`` - median latency in milliseconds, or ``None`` when no
      usable measurement exists.
    * ``cv`` - coefficient of variation (std / p50) of the screen run.
    * ``rc_failed`` - ``True`` only when the bench command itself failed;
      a high CV alone never sets it.
    """

    __slots__ = ("p50_ms", "cv", "rc_failed")

    def __init__(self, p50_ms: float | None, cv: float, rc_failed: bool = False) -> None:
        self.p50_ms = p50_ms
        self.cv = cv
        self.rc_failed = rc_failed  # True only on subprocess failure; never on high CV

    @property
    def hard_failed(self) -> bool:
        """True if the bench command itself failed (rc != 0 or no output file)."""
        return self.rc_failed

    def to_dict(self, ep: str = "cpu") -> dict:
        """Serialize the result for a results file.

        Args:
            ep: Execution provider name; ``"qnn"`` / ``"npu"`` enable the
                DVFS annotation for noisy but valid measurements.

        Returns:
            A dict with ``p50_ms`` (rounded to 3 places or ``None``), ``cv``
            (rounded to 4 places) and ``note``. The DVFS note is attached
            only to real measurements: a hard failure carries a sentinel CV
            and must not be reported as ordinary NPU noise.
        """
        note = None
        if not self.rc_failed and ep in ("qnn", "npu") and self.cv > 0.10:
            note = "DVFS noise — high CV expected on QNN NPU (npu-007)"
        return {
            "p50_ms": round(self.p50_ms, 3) if self.p50_ms is not None else None,
            "cv": round(self.cv, 4),
            "note": note,
        }
+ """ + if out_json is None: + out_json = model_path.parent / "screen_perf.json" + rc, _, _ = run_cmd( + [ + winml, + "perf", + "-m", + str(model_path), + "--ep", + ep, + "--device", + device, + "--warmup", + str(SCREEN_WARMUP), + "--iterations", + str(SCREEN_ITERS), + "-o", + str(out_json), + ], + label=f"perf screen ({SCREEN_ITERS} iters)", + timeout=BENCH_TIMEOUT_S, + ) + if rc != 0 or not out_json.exists(): + return ScreenResult(None, 999.0, rc_failed=True) + try: + data = json.loads(out_json.read_text(encoding="utf-8")) + lat = data.get("latency_ms", data) + p50 = lat.get("p50") if isinstance(lat, dict) else None + std = lat.get("std", 0.0) if isinstance(lat, dict) else 0.0 + if not p50: + return ScreenResult(None, 999.0, rc_failed=True) + cv = std / p50 if p50 > 0 else 999.0 + ep_tag = "NPU" if ep in ("qnn",) and device in ("npu",) else ep.upper() + print( + f" screen: p50={p50:.2f}ms cv={cv:.3f}" + + (" [DVFS-normal]" if ep_tag == "NPU" and cv > 0.10 else ""), + flush=True, + ) + return ScreenResult(p50, cv) + except Exception as e: + print(f" [warn] screen parse error: {e}", flush=True) + return ScreenResult(None, 999.0, rc_failed=True) + + +def bench_full( + winml: str, + model_path: Path, + ep: str, + device: str, + out_prefix: str = "full_perf", + warmup: int | None = None, + iters: int | None = None, + cool_down_s: int | None = None, +) -> list[float]: + """Phase B: 3 × FULL_ITERS-iter full bench with cool-down. + + Returns list of per-session p50_ms values. Empty list = all sessions failed. + Session files are written as {out_prefix}_s{n}.json in model_path.parent. + + warmup/iters/cool_down_s override module-level defaults when provided. 
def median_p50(p50s: list[float]) -> float | None:
    """Return the median of a list of p50 values, or None if empty.

    For an odd number of sessions this is the middle value. For an even
    number (for example when one of three full-bench sessions failed) it is
    the mean of the two middle values, so the result is not biased toward
    the slower session.

    Args:
        p50s: Per-session p50 latencies in milliseconds, in any order.

    Returns:
        The median latency, or ``None`` when ``p50s`` is empty.
    """
    if not p50s:
        return None
    ordered = sorted(p50s)
    mid = len(ordered) // 2
    if len(ordered) % 2:
        return ordered[mid]
    return (ordered[mid - 1] + ordered[mid]) / 2
@dataclass
class VerdictInput:
    """Inputs to a verdict policy.

    improvement_pct: positive = latency improvement
        = (baseline_p50 - new_p50) / baseline_p50 * 100
    cv_pct: screen coefficient of variation as percent (e.g., 5.0 for 5%)
    correctness_pass: True if accuracy/parity check passed
    build_ok: True if build succeeded
    """

    improvement_pct: float
    cv_pct: float
    correctness_pass: bool
    build_ok: bool = True


@dataclass
class VerdictOutput:
    """Output from a verdict policy."""

    verdict: str  # KEEP | MARGINAL_KEEP | DISCARD | ACC_FAIL | BUILD_FAIL
    reasoning: str  # human-readable explanation of the verdict
    marginal: bool = False  # True when the improvement only narrowly clears the bar
    threshold_pct: float = 0.0  # improvement bar applied, in percent


class VerdictPolicy(ABC):
    """Abstract base for verdict policies.

    Stores the two knobs shared by policies: the minimum improvement (in
    percent) and the multiplier applied to the measurement CV.
    """

    def __init__(self, min_improvement_pct: float = 1.0, stat_bar_multiplier: float = 2.0) -> None:
        self.min_improvement_pct = min_improvement_pct
        self.stat_bar_multiplier = stat_bar_multiplier

    @abstractmethod
    def evaluate(self, inp: VerdictInput) -> VerdictOutput: ...


class ThroughputOnly(VerdictPolicy):
    """KEEP iff improvement >= max(min_improvement_pct, stat_bar * cv_pct).

    Parameterized statistical significance: forces improvements to reach
    the measurement-noise bar before being declared real (borrowed from
    AgenticGPUOptimizer V2). Marks verdicts as 'marginal' when improvement is
    between 1x and 1.5x the threshold.
    """

    def evaluate(self, inp: VerdictInput) -> VerdictOutput:
        """Classify one experiment.

        Build and correctness failures take precedence over any latency
        result. Otherwise the improvement is compared with the threshold:
        below it is DISCARD, from 1x up to (not including) 1.5x is
        MARGINAL_KEEP, and 1.5x or more is KEEP.
        """
        if not inp.build_ok:
            return VerdictOutput("BUILD_FAIL", "Build step failed.")
        if not inp.correctness_pass:
            return VerdictOutput("ACC_FAIL", "Accuracy check failed.")

        threshold = max(self.min_improvement_pct, self.stat_bar_multiplier * inp.cv_pct)

        # "+" in the format spec prints the real sign, so a regression reads
        # "-3.0%" rather than "+-3.0%"; ":g" keeps fractional knobs visible.
        if inp.improvement_pct < threshold:
            return VerdictOutput(
                "DISCARD",
                f"Improvement {inp.improvement_pct:+.1f}% < threshold {threshold:.1f}% "
                f"(max({self.min_improvement_pct:g}% floor, "
                f"{self.stat_bar_multiplier:g}x CV={inp.cv_pct:.1f}%))",
                threshold_pct=threshold,
            )

        marginal = inp.improvement_pct < threshold * 1.5
        return VerdictOutput(
            "MARGINAL_KEEP" if marginal else "KEEP",
            f"Improvement {inp.improvement_pct:+.1f}% >= threshold {threshold:.1f}%",
            marginal=marginal,
            threshold_pct=threshold,
        )
+ """ + + def __init__(self, work_dir: Path) -> None: + self.path = work_dir / "session.json" + self._state: dict = {} + if self.path.exists(): + try: + self._state = json.loads(self.path.read_text(encoding="utf-8")) + n = len(self.completed_iters) + if n > 0: + print( + f" [session] Resuming: {n} completed iter(s) loaded from {self.path.name}", + flush=True, + ) + except Exception as e: + print(f" [session] Warning: could not load {self.path.name}: {e}", flush=True) + + @property + def has_state(self) -> bool: + return bool(self._state) + + @property + def completed_iters(self) -> set[int]: + return set(self._state.get("completed_iters", [])) + + @property + def baseline_p50(self) -> float | None: + return self._state.get("baseline_p50") + + @property + def best_p50(self) -> float: + v = self._state.get("best_p50") + return float(v) if v is not None else float("inf") + + @property + def best_label(self) -> str: + return self._state.get("best_label", "") + + @property + def consecutive_discards(self) -> int: + return int(self._state.get("consecutive_discards", 0)) + + @property + def discard_by_dimension(self) -> dict[str, int]: + return dict(self._state.get("discard_by_dimension", {})) + + def save( + self, + *, + iter_idx: int, + verdict: str, + baseline_p50: float | None, + best_p50: float, + best_label: str, + consecutive_discards: int, + discard_by_dimension: dict[str, int], + ) -> None: + """Save current state to session.json atomically.""" + completed = list(self.completed_iters | {iter_idx}) + self._state.update( + { + "completed_iters": completed, + "last_verdict": verdict, + "baseline_p50": baseline_p50, + "best_p50": best_p50 if best_p50 < float("inf") else None, + "best_label": best_label, + "consecutive_discards": consecutive_discards, + "discard_by_dimension": discard_by_dimension, + "last_iter": iter_idx, + } + ) + tmp = self.path.with_suffix(".tmp") + try: + tmp.write_text(json.dumps(self._state, indent=2), encoding="utf-8") + tmp.replace(self.path) 
+ except Exception as e: + print(f" [session] Warning: could not save session state: {e}", flush=True) + + +def count_conv_pct(model_onnx: Path) -> tuple[float, int, int]: + """Count Conv ops as a percentage of all graph nodes. + + Returns (conv_pct, conv_count, total_count). + Used to assess npu-006 risk: Conv% > 20% means conv fusions will likely + produce FusedConv ops that QNN EP cannot dispatch (-> CPU fallback). + + Returns (0.0, 0, 0) if onnx is not installed or file is missing. + The caller must treat (0.0, 0, 0) as 'unknown', not as 'safe'. + """ + if not model_onnx.exists(): + return 0.0, 0, 0 + try: + import onnx # noqa: PLC0415 + + model = onnx.load(str(model_onnx)) + ops = [n.op_type for n in model.graph.node] + total = len(ops) + conv_count = sum(1 for o in ops if o == "Conv") + pct = conv_count / total * 100 if total > 0 else 0.0 + return round(pct, 1), conv_count, total + except Exception as e: + print(f" [warn] Conv% analysis failed (onnx not installed?): {e}", flush=True) + return 0.0, 0, 0 diff --git a/research/autoconfig/catalog-qnn-sweep/.gitignore b/research/autoconfig/catalog-qnn-sweep/.gitignore new file mode 100644 index 000000000..29bb809b7 --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/.gitignore @@ -0,0 +1,3 @@ +# Ignore per-hypothesis build artifacts from validation_sweep.py +# (ONNX model files, calibration data, perf session JSONs) +val_h*/ diff --git a/research/autoconfig/catalog-qnn-sweep/BAAI--bge-small-en-v1.5/results_new.json b/research/autoconfig/catalog-qnn-sweep/BAAI--bge-small-en-v1.5/results_new.json new file mode 100644 index 000000000..fed23f364 --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/BAAI--bge-small-en-v1.5/results_new.json @@ -0,0 +1,31 @@ +{ + "model_id": "BAAI/bge-small-en-v1.5", + "task": "sentence-similarity", + "hypotheses": { + "h0": { + "description": "opset17 no opts", + "model_file": "quantized.onnx", + "screen_p50_ms": 9.208, + "screen_cv": 0.3059, + "full_p50s_ms": [ + 10.516, + 
10.323, + 11.01 + ], + "avg_p50_ms": 10.616 + }, + "h3": { + "description": "opset21 no opts", + "model_file": "quantized.onnx", + "screen_p50_ms": 9.562, + "screen_cv": 0.2575, + "full_p50s_ms": [ + 10.253, + 9.331, + 9.937 + ], + "avg_p50_ms": 9.84 + } + }, + "opset21_gain_pct": 7.31 +} diff --git a/research/autoconfig/catalog-qnn-sweep/SUMMARY.md b/research/autoconfig/catalog-qnn-sweep/SUMMARY.md new file mode 100644 index 000000000..1567c962c --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/SUMMARY.md @@ -0,0 +1,268 @@ +# QNN NPU Optimization Sweep — Catalog Models + +**Generated:** 2026-06-13 +**EP:** `qnn` / device: `npu` +**Bench protocol:** Phase-A 200-iter screen → Phase-B 3×500-iter full sessions (30s cool-down) +**Quant:** W8A16 (weight=uint8, activation=uint16) via `winml config --ep qnn --device npu` + +--- + +## Per-Model Results Summary + +| Model | Task | Baseline p50 | Best p50 | Best config | Gain% | npu-001 opset21? | +|-------|------|-------------|----------|-------------|-------|-----------------| +| `microsoft/resnet-18` | image-classification | 0.96 ms | 0.96 ms | h0 (baseline (auto-config W8A16, opset17)) | +0.0% | ✅ YES (+20.2%) | +| `google/vit-base-patch16-224` | image-classification | 9.04 ms | 9.04 ms | h0 (baseline (auto-config W8A16, opset17)) | +0.0% | ❌ NO (-7.4%) | +| `apple/mobilevit-small` | image-classification | 12.07 ms | 8.62 ms | h3 (opset 21) | +28.6% | ✅ YES (+26.5%) | +| `facebook/dinov2-small` | feature-extraction | 6.56 ms | 4.98 ms | h3 (opset 21) | +24.1% | ✅ YES (+30.6%) | +| `hustvl/yolos-small` | object-detection | 78.69 ms | 78.69 ms | h0 (baseline (auto-config W8A16, opset17)) | +0.0% | N/A (timeout) | +| `distilbert/distilbert-base-uncased-finetuned-sst-2-english` | text-classification | 19.48 ms | 19.48 ms | h0 (baseline (auto-config W8A16, opset17)) | +0.0% | ~ neutral (+0.0%) | +| `sentence-transformers/all-MiniLM-L6-v2` | sentence-similarity | 5.81 ms | 5.81 ms | h0 (baseline (auto-config W8A16, 
opset17)) | +0.0% | ~ neutral (+0.5%) | +| `deepset/roberta-base-squad2` | question-answering | 14.94 ms | 14.72 ms | h1 (opset 17 explicit) | +1.5% | ~ neutral (-1.4%) | + +--- + +## Per-Model Hypothesis Breakdown + +### `microsoft/resnet-18` +**Task:** image-classification **Type:** resnet + +| Hypothesis | Config | p50 (median) | CV | Status | Accuracy | +|------------|--------|-------------|-----|--------|---------| +| h0 | baseline (auto-config W8A16, opset17) | 0.96 ms | — | OK_HIGH_CV | 66.0% | +| h1 | opset 17 explicit | 2.72 ms | — | OK_HIGH_CV | — | +| h2 | opset 19 | 1.15 ms | — | OK_HIGH_CV | — | +| h3 | opset 21 | 2.17 ms | — | OK_HIGH_CV | — | +| h4 | opset17 + conv fusions | 132.30 ms | — | OK_HIGH_CV | — | +| h5 | opset21 + conv fusions | — | — | TIMEOUT | — | + +**Key findings:** +- 🟢 **npu-001 GENERALIZES**: opset21 (2.17ms) vs opset17 (2.72ms) = +20.2% speedup +- 🔴 **Conv fusions CATASTROPHIC**: h4=132.3ms vs h1=2.72ms (+4764% regression) — QNN CPU fallback suspected +- ⚠️ Model timed out at 1560s (before h5) + +### `google/vit-base-patch16-224` +**Task:** image-classification **Type:** vit + +| Hypothesis | Config | p50 (median) | CV | Status | Accuracy | +|------------|--------|-------------|-----|--------|---------| +| h0 | baseline (auto-config W8A16, opset17) | 9.04 ms | — | OK_HIGH_CV | 74.0% | +| h1 | opset 17 explicit | 9.33 ms | — | OK_HIGH_CV | — | +| h2 | opset 19 | — | — | BUILD_FAIL | — | +| h3 | opset 21 | 10.02 ms | — | OK_HIGH_CV | — | +| h4 | opset17 + conv fusions | — | — | TIMEOUT | — | +| h5 | opset21 + conv fusions | — | — | TIMEOUT | — | + +**Key findings:** +- 🔴 **npu-001 does NOT generalize**: opset21 (10.02ms) SLOWER than opset17 (9.33ms) = -7.4% +- ⚠️ h2: BUILD_FAIL +- ⚠️ Model timed out at 1204s (before h4) +- ⚠️ Model timed out at 1204s (before h5) + +### `apple/mobilevit-small` +**Task:** image-classification **Type:** mobilevit + +| Hypothesis | Config | p50 (median) | CV | Status | Accuracy | 
+|------------|--------|-------------|-----|--------|---------| +| h0 | baseline (auto-config W8A16, opset17) | 12.07 ms | — | OK_HIGH_CV | 58.0% | +| h1 | opset 17 explicit | 11.72 ms | — | OK_HIGH_CV | — | +| h2 | opset 19 | 10.52 ms | — | OK_HIGH_CV | — | +| h3 | opset 21 | 8.62 ms | — | OK_HIGH_CV | — | +| h4 | opset17 + conv fusions | 11.36 ms | — | OK_HIGH_CV | — | +| h5 | opset21 + conv fusions | 9.99 ms | — | OK_HIGH_CV | — | + +**Key findings:** +- 🟢 **npu-001 GENERALIZES**: opset21 (8.62ms) vs opset17 (11.72ms) = +26.5% speedup +- ⚪ **Conv fusions neutral**: h4=11.36ms vs h1=11.72ms + +### `facebook/dinov2-small` +**Task:** feature-extraction **Type:** dinov2 + +| Hypothesis | Config | p50 (median) | CV | Status | Accuracy | +|------------|--------|-------------|-----|--------|---------| +| h0 | baseline (auto-config W8A16, opset17) | 6.56 ms | — | OK_HIGH_CV | — | +| h1 | opset 17 explicit | 7.18 ms | — | OK_HIGH_CV | — | +| h2 | opset 19 | 7.19 ms | — | OK_HIGH_CV | — | +| h3 | opset 21 | 4.98 ms | — | OK_HIGH_CV | — | +| h4 | opset17 + conv fusions | — | — | TIMEOUT | — | +| h5 | opset21 + conv fusions | — | — | TIMEOUT | — | + +**Key findings:** +- 🟢 **npu-001 GENERALIZES**: opset21 (4.98ms) vs opset17 (7.18ms) = +30.6% speedup +- ⚠️ Model timed out at 1333s (before h4) +- ⚠️ Model timed out at 1333s (before h5) + +### `hustvl/yolos-small` +**Task:** object-detection **Type:** yolos + +| Hypothesis | Config | p50 (median) | CV | Status | Accuracy | +|------------|--------|-------------|-----|--------|---------| +| h0 | baseline (auto-config W8A16, opset17) | 78.69 ms | — | OK_HIGH_CV | — | +| h1 | opset 17 explicit | 92.08 ms | — | OK_HIGH_CV | — | +| h2 | opset 19 | — | — | TIMEOUT | — | +| h3 | opset 21 | — | — | TIMEOUT | — | +| h4 | opset17 + conv fusions | — | — | TIMEOUT | — | +| h5 | opset21 + conv fusions | — | — | TIMEOUT | — | + +**Key findings:** +- ⚠️ Model timed out at 1318s (before h2) +- ⚠️ Model timed out at 1318s (before h3) +- ⚠️ 
Model timed out at 1318s (before h4) +- ⚠️ Model timed out at 1318s (before h5) + +### `distilbert/distilbert-base-uncased-finetuned-sst-2-english` +**Task:** text-classification **Type:** distilbert + +| Hypothesis | Config | p50 (median) | CV | Status | Accuracy | +|------------|--------|-------------|-----|--------|---------| +| h0 | baseline (auto-config W8A16, opset17) | 19.48 ms | — | OK_HIGH_CV | — | +| h1 | opset 17 explicit | 19.50 ms | — | OK_HIGH_CV | — | +| h2 | opset 19 | 19.48 ms | — | OK_HIGH_CV | — | +| h3 | opset 21 | 19.50 ms | — | OK_HIGH_CV | — | +| h4 | opset17 + conv fusions | 19.59 ms | — | OK_HIGH_CV | — | +| h5 | opset21 + conv fusions | — | — | TIMEOUT | — | + +**Key findings:** +- ⚪ **npu-001 neutral**: opset21 (19.50ms) ≈ opset17 (19.50ms), diff=+0.0% +- ⚪ **Conv fusions neutral**: h4=19.59ms vs h1=19.50ms +- ⚠️ Model timed out at 1385s (before h5) + +### `sentence-transformers/all-MiniLM-L6-v2` +**Task:** sentence-similarity **Type:** bert + +| Hypothesis | Config | p50 (median) | CV | Status | Accuracy | +|------------|--------|-------------|-----|--------|---------| +| h0 | baseline (auto-config W8A16, opset17) | 5.81 ms | — | OK_HIGH_CV | — | +| h1 | opset 17 explicit | 5.88 ms | — | OK_HIGH_CV | — | +| h2 | opset 19 | 5.98 ms | — | OK_HIGH_CV | — | +| h3 | opset 21 | 5.85 ms | — | OK_HIGH_CV | — | +| h4 | opset17 + conv fusions | 5.97 ms | — | OK | — | +| h5 | opset21 + conv fusions | — | — | TIMEOUT | — | + +**Key findings:** +- ⚪ **npu-001 neutral**: opset21 (5.85ms) ≈ opset17 (5.88ms), diff=+0.5% +- ⚪ **Conv fusions neutral**: h4=5.97ms vs h1=5.88ms +- ⚠️ Model timed out at 1346s (before h5) + +### `deepset/roberta-base-squad2` +**Task:** question-answering **Type:** roberta + +| Hypothesis | Config | p50 (median) | CV | Status | Accuracy | +|------------|--------|-------------|-----|--------|---------| +| h0 | baseline (auto-config W8A16, opset17) | 14.94 ms | — | OK | — | +| h1 | opset 17 explicit | 14.72 ms | — | OK | — | +| 
h2 | opset 19 | 14.88 ms | — | OK_HIGH_CV | — | +| h3 | opset 21 | 14.92 ms | — | OK | — | +| h4 | opset17 + conv fusions | — | — | TIMEOUT | — | +| h5 | opset21 + conv fusions | — | — | TIMEOUT | — | + +**Key findings:** +- ⚪ **npu-001 neutral**: opset21 (14.92ms) ≈ opset17 (14.72ms), diff=-1.4% +- ⚠️ Model timed out at 1466s (before h4) +- ⚠️ Model timed out at 1466s (before h5) + +--- + +## Cross-Model Pattern Analysis + +### Finding 1: npu-001 — opset 21 NHWC bypass + +The npu-001 hypothesis (opset ≥ 21 bypasses the NHWC→NCHW layout transformation in ORT's QNN EP) **is confirmed for Conv+residual architectures** but **does not apply to pure transformers**. + +| Architecture class | Models | opset21 result | +|-------------------|--------|----------------| +| Conv + residual (spatial models) | MobileViT-small, DINOv2-small | ✅ **+26–31% speedup** | +| Pure transformer (attention-only) | ViT-base, YOLOS-small | ❌ No benefit (neutral/slight regression) | +| BERT-family NLP | DistilBERT, MiniLM, RoBERTa | ⚪ Neutral (within DVFS noise) | +| ResNet (plain conv) | ResNet-18 | ~ Marginal (+20% h1→h3, but DVFS-dominated; h0 baseline even faster) | + +> **Root cause confirmed**: NHWC layout transform is only a bottleneck when (a) the model has Conv ops that QNN EP needs to transpose for its internal NHWC representation, AND (b) those conv ops are interleaved with residual add/shortcut paths. Pure attention (no Conv) has no such transposes. ResNet's gain is marginal likely because the Conv path is so fast that the transpose overhead is relatively smaller. 
+ +### Finding 2: Conv fusions and QNN EP compatibility + +Conv fusion optimizations (`conv_bn_fusion`, `conv_add_fusion`, `conv_activation_fusion`) are **architecture-dependent** with respect to QNN EP: + +| Model | h4 result vs h1 | Assessment | +|-------|----------------|------------| +| ResNet-18 | 132.3ms vs 2.72ms | 🔴 **~4900% regression** — QNN CPU fallback for fused ops | +| MobileViT-small | 11.36ms vs 11.72ms | ⚪ Neutral (no regression) | +| DistilBERT | 19.59ms vs 19.5ms | ⚪ Neutral (no Conv layers to fuse) | +| all-MiniLM-L6-v2 | 5.97ms vs 5.88ms | ⚪ Neutral (no Conv layers to fuse) | + +> **Root cause**: QNN EP cannot execute fused Conv+BN/Add/Activation ops natively. When ORT graph optimizer fuses these patterns (which ORT does before handing the graph to the EP), QNN falls back to CPU execution for those ops — causing massive latency spikes on ResNet (which is entirely Conv-dominated). +> +> **Feature gap**: `winml` should detect when the target EP (QNN NPU) is likely to CPU-fallback fused ops and either (a) warn the user, or (b) suppress incompatible fusions automatically. This is a critical correctness/performance hazard. + +### Finding 3: DVFS noise and bench reliability + +QNN NPU exhibits extreme DVFS (Dynamic Voltage/Frequency Scaling) thermal noise. Key observations: + +- CV (coefficient of variation) is consistently **0.10–2.0+** across all models and sessions +- Even within a 500-iter session, CV frequently exceeds 0.5 +- The original CV < 15% gate (Phase-A screening) blocks all models — must be removed for QNN NPU +- Differences < 10% between hypotheses are **unreliable** without longer runs (>2000 iterations total) +- 30s cool-down between sessions reduces but does not eliminate DVFS spikes + +> **Feature gap**: `winml perf` should support a `--thermal-stabilization` mode that waits for device temperature to stabilize before beginning measurements, and should report confidence intervals rather than raw p50. 
+ +### Finding 4: Large model / detection model budget + +YOLOS-small (78ms baseline) exhausts the 20-min per-model budget after just 2 hypotheses. The per-hypothesis bench cost is: + +- Build: ~120–200s (fixed) +- Bench: `3 × (N_iters × latency_s + 30s cool-down)` = `3 × (500 × 0.078s + 30s)` ≈ **207s per hypothesis** +- Total for 6 hypotheses: `6 × (build + bench)` ≈ 2000–2400s — well over the 1200s (20-min) budget + +> **Recommendation**: For models with p50 > 50ms, reduce bench to 1×200-iter session for the sweep. Alternatively, add `--quick` flag to `catalog_qnn_sweep.py`. + +--- + +## Updated Recommendations for `ep_knowledge/qnn_npu.json` + +### Proposed KB updates: + +**npu-001 (opset bypass):** Update status from `partially_confirmed` to `CONFIRMED_CONV_RESIDUAL`. +- Restrict applicability: `architecture_requirement: ['has_conv_ops', 'has_residual_connections']` +- Add exclusion: `not_applicable_to: ['pure_transformer', 'bert_family']` +- Confirmed gains: MobileViT +26%, DINOv2 +31% +- Non-applicable: ViT, DistilBERT, MiniLM, RoBERTa (neutral within DVFS noise) + +**NEW npu-006 (Conv fusion QNN fallback):** +```json +{ + "id": "npu-006", + "title": "Conv fusions cause QNN EP CPU fallback on Conv-dominant models", + "severity": "critical", + "finding": "conv_bn_fusion + conv_add_fusion + conv_activation_fusion flags cause QNN EP to fall back to CPU for fused ops on Conv-dominant architectures (ResNet: 4900% regression). BERT/MobileViT unaffected.", + "recommendation": "Do NOT enable conv_*_fusion optimizations for QNN NPU target on ResNet-family models. 
Safe only for pure-transformer models (where no Conv ops exist to fuse).", + "architecture_specificity": "resnet, efficientnet, mobilenet — any model where Conv ops dominate the execution path", + "status": "confirmed", + "models_tested": ["microsoft/resnet-18"] +} +``` + +**NEW npu-007 (DVFS reliability threshold):** +```json +{ + "id": "npu-007", + "title": "QNN NPU DVFS noise requires extended bench for reliable comparison", + "finding": "CV is always 0.1–2.0+ on QNN NPU due to DVFS thermal throttling. The CV<15% Phase-A gate must be disabled. Differences <10% between configs are unreliable without >1500 total iterations.", + "recommendation": "Disable CV gate for QNN NPU. Use minimum 3×500-iter sessions. Report median of session p50s. Only trust differences >10%.", + "status": "confirmed" +} +``` + +--- + +## Build / Compatibility Issues + +| Model | Issue | +|-------|-------| +| `google/vit-base-patch16-224` h2 (opset19) | BUILD FAIL — network error downloading calibration data (parquet URL) — not an opset incompatibility | +| `hustvl/yolos-small` h2–h5 | TIMEOUT — bench alone costs 3 × (500 iters × 78ms + 30s cool-down) ≈ 207s per hypothesis, plus ~120–200s build; the 20-min budget was exhausted after h1 | +| `microsoft/resnet-18` h5 | TIMEOUT after h4 catastrophic regression consumed extra time | +| Multiple models | h5 TIMEOUT — model total > 1200s before h5 | + +--- + +*Sweep completed 2026-06-13. 
All results in `catalog-qnn-sweep/{org}--{model}/results.json`.* diff --git a/research/autoconfig/catalog-qnn-sweep/VALIDATION_SUMMARY.md b/research/autoconfig/catalog-qnn-sweep/VALIDATION_SUMMARY.md new file mode 100644 index 000000000..0dc697d3e --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/VALIDATION_SUMMARY.md @@ -0,0 +1,108 @@ +# Validation Sweep Results — QNN NPU (2026-06-16) + +**Device:** Snapdragon X Elite X1E80100 +**ORT:** onnxruntime-windowsml==1.24.5 +**QNN SDK:** 2.2450.47.0 +**Protocol:** 3 × 500 iters, 30s cool-down, `quantized.onnx` (W8A16), `--no-compile` +**Script:** `validation_sweep.py` — targeted 4-hypothesis sweep (h0/h1/h3/h4) + +## Hypothesis Matrix + +| ID | Config | Purpose | +|----|--------|---------| +| h0 | auto-config baseline (W8A16, opset auto) | baseline reference | +| h1 | opset 17 explicit (W8A16) | npu-001 baseline | +| h3 | opset 21 (W8A16) | **npu-001 test** — does opset21 help? | +| h4 | opset 17 + conv fusions | **npu-006 test** — do conv fusions regress? 
| + +--- + +## Results by Model + +### facebook/dinov2-base (ViT-B DINOv2, image-feature-extraction) + +| Hyp | Median p50 | Sessions (ms) | CV note | +|-----|-----------|---------------|---------| +| h0 auto | 38.68 ms | [38.99, 38.68, 36.26] | stable (stale build artifact) | +| **h1 opset17** | **34.56 ms** | [34.56, 34.67, 33.15] | rock stable | +| **h3 opset21** | **26.23 ms** | [33.00, 26.22, 26.23] | s0 elevated (JIT warmup), s1+s2 stable | +| h4 fusions | 25.92 ms | [26.06, 25.92, 25.87] | rock stable | + +**npu-001: opset21 → +24.1% speedup** `(34.56 → 26.23ms)` +**npu-006: conv fusions → -25% (fusions FASTER, not regression)** — DINOv2 is attention-dominant, few Conv ops to fuse + +--- + +### microsoft/rad-dino (ViT-L DINOv2 medical, image-feature-extraction) + +| Hyp | Median p50 | Sessions (ms) | CV note | +|-----|-----------|---------------|---------| +| **h1 opset17** | **274.98 ms** | [274.98, 274.56, 275.10] | CV=0.009, CPU-deterministic | +| **h3 opset21** | **275.36 ms** | [275.30, 275.36, 275.56] | CV=0.022 | + +**npu-001: -0.1% — NEUTRAL (CPU-bound)** +Model runs entirely on CPU (~275ms). QNN NPU cannot accelerate rad-dino (ViT-L too large or incompatible ops). Opset has no effect when model is CPU-bound. 
+ +--- + +### facebook/dino-vitb16 (plain DINO ViT-B/16, image-feature-extraction) + +| Hyp | Median p50 | Sessions (ms) | CV note | +|-----|-----------|---------------|---------| +| **h1 opset17** | **19.92 ms** | [19.92, 19.97, 19.90] | rock stable | +| **h3 opset21** | **20.07 ms** | [20.20, 20.07, 19.99] | rock stable | +| h4 fusions | 20.12 ms | [20.12, 20.04, 20.41] | rock stable | + +**npu-001: -0.7% — NEUTRAL** ← **critical control** +**npu-006: +1.0% — NEUTRAL** (the patch-embed Conv is the only Conv layer available to fuse, and fusing it is benign) + +--- + +## Cross-Model Summary — npu-001 (opset21 vs opset17) + +| Model | Architecture | opset17 (h1) | opset21 (h3) | Gain | Verdict | +|-------|-------------|-------------|-------------|------|---------| +| facebook/dinov2-small | DINOv2 ViT-S | 7.18 ms* | 4.98 ms* | **+30.6%** | ✅ CONFIRMED | +| facebook/dinov2-base | DINOv2 ViT-B | 34.56 ms | 26.23 ms | **+24.1%** | ✅ CONFIRMED | +| apple/mobilevit-small | Conv+Attn hybrid | 11.72 ms* | 8.62 ms* | **+26.5%** ⚠️ | 🟡 LIKELY (DVFS spike in h1) | +| facebook/dino-vitb16 | plain ViT-B/16 | 19.92 ms | 20.07 ms | **-0.7%** | ❌ NEUTRAL — critical control | +| microsoft/rad-dino | ViT-L DINOv2 | 274.98 ms | 275.36 ms | **-0.1%** | ⬛ CPU-BOUND (untestable) | +| google/vit-base-patch16-224 | plain ViT-B | 9.33 ms* | 10.02 ms* | **-7.4%** ⚠️* | ❌ REGRESSION | + +_\*Original `catalog_qnn_sweep.py` data (`optimized.onnx`, not `quantized.onnx` — different pipeline)_ + +**Key architectural discriminant:** opset21 consistently helps **DINOv2 family** (+24-31%) but has **zero effect on plain ViT** (dino-vitb16: -0.7%, noise-level). This is NOT a general ViT property. DINOv2-specific op patterns must explain the difference — mechanism TBD. 
+ +--- + +## Cross-Model Summary — npu-006 (conv fusions) + +| Model | Architecture | h1 no-fusions | h4 fusions | Regression | Verdict | +|-------|-------------|--------------|-----------|------------|---------| +| microsoft/resnet-18 | Conv-dominant | ~1–4 ms* | 132–135 ms* | **+4900%** 🔥 | ✅ CATASTROPHIC | +| apple/mobilevit-small | Conv+Attn | ~10–12 ms* | ~10–12 ms* | **≈0%** | 🟢 SAFE | +| facebook/dinov2-base | DINOv2 ViT-B | 34.56 ms | 25.92 ms | **-25%** (faster) | 🟢 SAFE / beneficial | +| facebook/dino-vitb16 | plain ViT-B | 19.92 ms | 20.12 ms | **+1.0%** | 🟢 SAFE (neutral) | + +_\*Original `catalog_qnn_sweep.py` data_ + +**Conclusion:** Conv fusions only regress Conv-dominant models (ResNet). Attention-dominant models are safe: plain ViT is neutral and DINOv2-base even benefits (-25%). The hazard appears to grow with Conv op density, although only four models were tested. + +--- + +## Bugs Found and Fixed in `validation_sweep.py` + +| Bug | Impact | Fix | +|-----|--------|-----| +| `bench_screen` parsed `d.get("p50_ms")` instead of `d["latency_ms"]["p50"]` | All hypotheses marked BENCH_FAIL in v1/v2 runs | Fixed to read nested `latency_ms.p50` | +| Reuse check triggered on any `.onnx` (including truncated `export.onnx`) | h1 was benchmarked on FP32 unoptimized model | Changed to require `quantized.onnx` or `optimized.onnx` | +| Model file selection preferred `optimized.onnx` over `quantized.onnx` alphabetically | Benchmarked FP32 graph instead of W8A16 quantized | Fixed to explicitly prefer `quantized` > `optimized` > other | + +--- + +## Known Limitations + +1. **`--no-compile` throughout**: All runs omit `winml compile` (pre-built QNN context binary). Production use would include compile, which npu-003 suggests adds ~1.7x additional speedup. The npu-001 ratio is expected to hold with compile enabled (not yet verified), but absolute latencies will be lower. +2. **3 sessions only**: DVFS on QNN NPU can cause any single session to be thermal-spiked. With only 3 sessions, the median can still be affected if 2 of the 3 sessions spike. 
See h3 dinov2-base s0=33ms (warmup effect) vs s1+s2=26ms. +3. **rad-dino untestable**: When a model falls back entirely to CPU, no NPU-related findings can be extracted. The reason for CPU fallback (model size? unsupported ops?) was not investigated. +4. **dinov2-small not re-validated with v2 pipeline**: The original +30.6% result was from `catalog_qnn_sweep.py` using `optimized.onnx`. The v2 pipeline uses `quantized.onnx`. For full comparability, dinov2-small should be re-run with `validation_sweep.py`. diff --git a/research/autoconfig/catalog-qnn-sweep/apple--mobilevit-small/results.json b/research/autoconfig/catalog-qnn-sweep/apple--mobilevit-small/results.json new file mode 100644 index 000000000..3a2178e04 --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/apple--mobilevit-small/results.json @@ -0,0 +1,138 @@ +{ + "model_id": "apple/mobilevit-small", + "task": "image-classification", + "model_type": "mobilevit", + "timestamp": "2026-06-13T14:26:06", + "ep": "qnn", + "device": "npu", + "baseline_opset": 17, + "hypotheses": { + "h0": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 10.651, + "cv": 1.7211, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 12.075, + 10.313, + 12.946 + ], + "median_p50_ms": 12.075 + }, + "accuracy": 0.58, + "label": "baseline (auto-config, W8A16)", + "opset": 17 + }, + "h1": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 8.714, + "cv": 0.9982, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 10.557, + 11.721, + 27.436 + ], + "median_p50_ms": 11.721 + }, + "accuracy": null, + "label": "opset 17 explicit", + "opset": 17 + }, + "h2": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 9.035, + "cv": 1.7997, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 11.541, + 10.506, + 10.52 + ], + "median_p50_ms": 10.52 + }, + "accuracy": null, + "label": 
"opset 19", + "opset": 19 + }, + "h3": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 11.777, + "cv": 1.1161, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 10.814, + 8.625, + 8.449 + ], + "median_p50_ms": 8.625 + }, + "accuracy": null, + "label": "opset 21 (tests npu-001 bypass)", + "opset": 21 + }, + "h4": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 11.14, + "cv": 1.8792, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 11.599, + 11.364, + 10.518 + ], + "median_p50_ms": 11.364 + }, + "accuracy": null, + "label": "opset 17 + conv fusions", + "opset": 17 + }, + "h5": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 9.256, + "cv": 2.2489, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 11.081, + 9.412, + 9.994 + ], + "median_p50_ms": 9.994 + }, + "accuracy": null, + "label": "opset 21 + conv fusions", + "opset": 21 + } + }, + "best_hypothesis": "h3", + "baseline_p50_ms": 12.075, + "best_p50_ms": 8.625, + "best_gain_pct": 28.57, + "npu001_generalized": true, + "feature_gaps": [], + "errors": [] +} diff --git a/research/autoconfig/catalog-qnn-sweep/deepset--roberta-base-squad2/results.json b/research/autoconfig/catalog-qnn-sweep/deepset--roberta-base-squad2/results.json new file mode 100644 index 000000000..fa8a959f4 --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/deepset--roberta-base-squad2/results.json @@ -0,0 +1,106 @@ +{ + "model_id": "deepset/roberta-base-squad2", + "task": "question-answering", + "model_type": "roberta", + "timestamp": "2026-06-13T16:21:18", + "ep": "qnn", + "device": "npu", + "baseline_opset": 17, + "hypotheses": { + "h0": { + "status": "OK", + "screen": { + "p50_ms": 14.919, + "cv": 0.1188, + "stable": true + }, + "full": { + "p50s_ms": [ + 14.941, + 14.711, + 14.97 + ], + "median_p50_ms": 14.941 + }, + "accuracy": null, + "label": 
"baseline (auto-config, W8A16)", + "opset": 17 + }, + "h1": { + "status": "OK", + "screen": { + "p50_ms": 14.747, + "cv": 0.1286, + "stable": true + }, + "full": { + "p50s_ms": [ + 14.645, + 14.873, + 14.716 + ], + "median_p50_ms": 14.716 + }, + "accuracy": null, + "label": "opset 17 explicit", + "opset": 17 + }, + "h2": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 15.309, + "cv": 0.2344, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 14.951, + 14.877, + 14.834 + ], + "median_p50_ms": 14.877 + }, + "accuracy": null, + "label": "opset 19", + "opset": 19 + }, + "h3": { + "status": "OK", + "screen": { + "p50_ms": 14.798, + "cv": 0.1159, + "stable": true + }, + "full": { + "p50s_ms": [ + 16.685, + 14.743, + 14.919 + ], + "median_p50_ms": 14.919 + }, + "accuracy": null, + "label": "opset 21 (tests npu-001 bypass)", + "opset": 21 + }, + "h4": { + "status": "TIMEOUT", + "label": "opset 17 + conv fusions" + }, + "h5": { + "status": "TIMEOUT", + "label": "opset 21 + conv fusions" + } + }, + "best_hypothesis": "h1", + "baseline_p50_ms": 14.941, + "best_p50_ms": 14.716, + "best_gain_pct": 1.51, + "npu001_generalized": "neutral", + "feature_gaps": [], + "errors": [ + "Model timed out at 1466s (before h4)", + "Model timed out at 1466s (before h5)" + ] +} diff --git a/research/autoconfig/catalog-qnn-sweep/distilbert--distilbert-base-uncased-finetuned-sst-2-english/results.json b/research/autoconfig/catalog-qnn-sweep/distilbert--distilbert-base-uncased-finetuned-sst-2-english/results.json new file mode 100644 index 000000000..9d10a6736 --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/distilbert--distilbert-base-uncased-finetuned-sst-2-english/results.json @@ -0,0 +1,124 @@ +{ + "model_id": "distilbert/distilbert-base-uncased-finetuned-sst-2-english", + "task": "text-classification", + "model_type": "distilbert", + "timestamp": "2026-06-13T15:34:52", + "ep": "qnn", + "device": "npu", + "baseline_opset": 
17, + "hypotheses": { + "h0": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 19.511, + "cv": 0.156, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 19.512, + 19.459, + 19.48 + ], + "median_p50_ms": 19.48 + }, + "accuracy": null, + "label": "baseline (auto-config, W8A16)", + "opset": 17 + }, + "h1": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 19.721, + "cv": 0.2715, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 19.498, + 19.417, + 19.519 + ], + "median_p50_ms": 19.498 + }, + "accuracy": null, + "label": "opset 17 explicit", + "opset": 17 + }, + "h2": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 19.431, + "cv": 0.1945, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 19.471, + 19.684, + 19.477 + ], + "median_p50_ms": 19.477 + }, + "accuracy": null, + "label": "opset 19", + "opset": 19 + }, + "h3": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 19.443, + "cv": 0.2903, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 19.591, + 19.447, + 19.505 + ], + "median_p50_ms": 19.505 + }, + "accuracy": null, + "label": "opset 21 (tests npu-001 bypass)", + "opset": 21 + }, + "h4": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 19.404, + "cv": 0.237, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 19.588, + 19.628, + 19.502 + ], + "median_p50_ms": 19.588 + }, + "accuracy": null, + "label": "opset 17 + conv fusions", + "opset": 17 + }, + "h5": { + "status": "TIMEOUT", + "label": "opset 21 + conv fusions" + } + }, + "best_hypothesis": "h2", + "baseline_p50_ms": 19.48, + "best_p50_ms": 19.477, + "best_gain_pct": 0.02, + "npu001_generalized": "neutral", + "feature_gaps": [], + "errors": [ + "Model timed out at 1385s (before h5)" + ] +} diff --git 
a/research/autoconfig/catalog-qnn-sweep/facebook--dino-vitb16/results_v2.json b/research/autoconfig/catalog-qnn-sweep/facebook--dino-vitb16/results_v2.json new file mode 100644 index 000000000..b8c34f0d3 --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/facebook--dino-vitb16/results_v2.json @@ -0,0 +1,92 @@ +{ + "model_id": "facebook/dino-vitb16", + "task": "image-feature-extraction", + "model_type": "vit", + "timestamp": "2026-06-16T18:19:46", + "ep": "qnn", + "device": "npu", + "validation_sweep": true, + "hypotheses": { + "h0": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 20.367, + "cv": 0.2452, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 20.037, + 20.009, + 20.048 + ], + "median_p50_ms": 20.037 + }, + "label": "baseline (auto-config, W8A16)", + "opset": "auto" + }, + "h1": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 20.027, + "cv": 0.4804, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 19.924, + 19.975, + 19.897 + ], + "median_p50_ms": 19.924 + }, + "label": "opset 17 explicit", + "opset": 17 + }, + "h3": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 20.369, + "cv": 0.9085, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 20.197, + 20.071, + 19.988 + ], + "median_p50_ms": 20.071 + }, + "label": "opset 21 (tests npu-001)", + "opset": 21 + }, + "h4": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 19.871, + "cv": 0.3492, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 20.123, + 20.037, + 20.413 + ], + "median_p50_ms": 20.123 + }, + "label": "opset 17 + conv fusions", + "opset": 17 + } + }, + "errors": [], + "npu001_opset21_vs_17_gain_pct": -0.7, + "npu001_note": "opset21 median 20.071ms vs opset17 19.924ms = -0.7%", + "npu006_conv_fusion_regression_pct": 1.0, + "npu006_note": "conv fusions 
median 20.123ms vs no-fusion 19.924ms = +1.0%" +} diff --git a/research/autoconfig/catalog-qnn-sweep/facebook--dinov2-base/results_v2.json b/research/autoconfig/catalog-qnn-sweep/facebook--dinov2-base/results_v2.json new file mode 100644 index 000000000..416ddce95 --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/facebook--dinov2-base/results_v2.json @@ -0,0 +1,92 @@ +{ + "model_id": "facebook/dinov2-base", + "task": "image-feature-extraction", + "model_type": "dinov2", + "timestamp": "2026-06-16T16:12:15", + "ep": "qnn", + "device": "npu", + "validation_sweep": true, + "hypotheses": { + "h0": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 41.108, + "cv": 1.2524, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 38.991, + 38.68, + 36.256 + ], + "median_p50_ms": 38.68 + }, + "label": "baseline (auto-config, W8A16)", + "opset": "auto" + }, + "h1": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 36.348, + "cv": 0.7429, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 34.556, + 34.668, + 33.148 + ], + "median_p50_ms": 34.556 + }, + "label": "opset 17 explicit", + "opset": 17 + }, + "h3": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 32.742, + "cv": 0.8357, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 33.001, + 26.224, + 26.227 + ], + "median_p50_ms": 26.227 + }, + "label": "opset 21 (tests npu-001)", + "opset": 21 + }, + "h4": { + "status": "OK", + "screen": { + "p50_ms": 25.83, + "cv": 0.1082, + "stable": true, + "note": null + }, + "full": { + "p50s_ms": [ + 26.064, + 25.921, + 25.872 + ], + "median_p50_ms": 25.921 + }, + "label": "opset 17 + conv fusions", + "opset": 17 + } + }, + "errors": [], + "npu001_opset21_vs_17_gain_pct": 24.1, + "npu001_note": "opset21 median 26.227ms vs opset17 34.556ms = +24.1%", + "npu006_conv_fusion_regression_pct": -25.0, + 
"npu006_note": "conv fusions median 25.921ms vs no-fusion 34.556ms = -25.0%" +} diff --git a/research/autoconfig/catalog-qnn-sweep/facebook--dinov2-small/results.json b/research/autoconfig/catalog-qnn-sweep/facebook--dinov2-small/results.json new file mode 100644 index 000000000..521b465de --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/facebook--dinov2-small/results.json @@ -0,0 +1,109 @@ +{ + "model_id": "facebook/dinov2-small", + "task": "image-feature-extraction", + "model_type": "dinov2", + "timestamp": "2026-06-13T14:49:59", + "ep": "qnn", + "device": "npu", + "baseline_opset": 17, + "hypotheses": { + "h0": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 7.213, + "cv": 0.3437, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 6.561, + 6.353, + 12.408 + ], + "median_p50_ms": 6.561 + }, + "accuracy": null, + "label": "baseline (auto-config, W8A16)", + "opset": 17 + }, + "h1": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 4.897, + "cv": 0.4572, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 7.176, + 6.392, + 9.436 + ], + "median_p50_ms": 7.176 + }, + "accuracy": null, + "label": "opset 17 explicit", + "opset": 17 + }, + "h2": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 6.953, + "cv": 1.8047, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 8.454, + 7.191, + 6.194 + ], + "median_p50_ms": 7.191 + }, + "accuracy": null, + "label": "opset 19", + "opset": 19 + }, + "h3": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 9.432, + "cv": 0.936, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 4.977, + 4.876, + 6.884 + ], + "median_p50_ms": 4.977 + }, + "accuracy": null, + "label": "opset 21 (tests npu-001 bypass)", + "opset": 21 + }, + "h4": { + "status": "TIMEOUT", + "label": "opset 17 + conv fusions" + }, 
+ "h5": { + "status": "TIMEOUT", + "label": "opset 21 + conv fusions" + } + }, + "best_hypothesis": "h3", + "baseline_p50_ms": 6.561, + "best_p50_ms": 4.977, + "best_gain_pct": 24.14, + "npu001_generalized": true, + "feature_gaps": [], + "errors": [ + "Model timed out at 1333s (before h4)", + "Model timed out at 1333s (before h5)" + ] +} diff --git a/research/autoconfig/catalog-qnn-sweep/google--vit-base-patch16-224/results.json b/research/autoconfig/catalog-qnn-sweep/google--vit-base-patch16-224/results.json new file mode 100644 index 000000000..42edb241b --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/google--vit-base-patch16-224/results.json @@ -0,0 +1,96 @@ +{ + "model_id": "google/vit-base-patch16-224", + "task": "image-classification", + "model_type": "vit", + "timestamp": "2026-06-13T14:05:37", + "ep": "qnn", + "device": "npu", + "baseline_opset": 17, + "hypotheses": { + "h0": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 9.245, + "cv": 1.2887, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 9.039, + 8.6, + 9.779 + ], + "median_p50_ms": 9.039 + }, + "accuracy": 0.74, + "label": "baseline (auto-config, W8A16)", + "opset": 17 + }, + "h1": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 9.656, + "cv": 0.7434, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 9.33, + 12.723, + 9.064 + ], + "median_p50_ms": 9.33 + }, + "accuracy": null, + "label": "opset 17 explicit", + "opset": 17 + }, + "h2": { + "status": "BUILD_FAIL", + "label": "opset 19", + "opset": 19, + "build_error": 
"MzU3NTk3NTM4NmY1YzY0YjEzZjgwNTlkYmY3MWVkNDBkYWEwMGFcXD91c2VyX2lkPXB1YmxpYyZYLVhldC1DYXMtVWlkPXB1YmxpYyZyZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPWlubGluZSUzQitmaWxlbmFtZSUyQSUzRFVURi04JTI3JTI3dHJhaW4tMDAwMDAtb2YtMDAwMTMucGFycXVldCUzQitmaWxlbmFtZSUzRCUyMnRyYWluLTAwMDAwLW9mLTAwMDEzLnBhcnF1ZXQlMjIlM0IiLCJDb25kaXRpb24iOnsiRGF0ZUxlc3NUaGFuIjp7IkVwb2NoVGltZSI6MTc4MTMzNTIwOH0sIkJ5dGVSYW5nZSI6eyJFeHBlY3RlZEhlYWRlciI6ImJ5dGVzPTQ4NTEzNzYwNC00ODUyMDMxMzkifX19XX0_&Signature=MEUCIQD51-TIZFhcd8Id1yCa5oFvcfXtxBJQLnbeG3PPgDJm5AIgBbqpmbciOJZpxVhunYiYCwhL8FT6ymJ72UKocE3aygs_&Key-Pair-Id=01KAYHXK2CBJSW0YZTMNXK9W1M\n\n" + }, + "h3": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 11.564, + "cv": 2.1585, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 15.271, + 10.019, + 7.808 + ], + "median_p50_ms": 10.019 + }, + "accuracy": null, + "label": "opset 21 (tests npu-001 bypass)", + "opset": 21 + }, + "h4": { + "status": "TIMEOUT", + "label": "opset 17 + conv fusions" + }, + "h5": { + "status": "TIMEOUT", + "label": "opset 21 + conv fusions" + } + }, + "best_hypothesis": "h0", + "baseline_p50_ms": 9.039, + "best_p50_ms": 9.039, + "best_gain_pct": 0.0, + "npu001_generalized": false, + "feature_gaps": [], + "errors": [ + "h2: BUILD_FAIL", + "Model timed out at 1204s (before h4)", + "Model timed out at 1204s (before h5)" + ] +} diff --git a/research/autoconfig/catalog-qnn-sweep/hustvl--yolos-small/results.json b/research/autoconfig/catalog-qnn-sweep/hustvl--yolos-small/results.json new file mode 100644 index 000000000..ae4b9e09e --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/hustvl--yolos-small/results.json @@ -0,0 +1,79 @@ +{ + "model_id": "hustvl/yolos-small", + "task": "object-detection", + "model_type": "yolos", + "timestamp": "2026-06-13T15:12:34", + "ep": "qnn", + "device": "npu", + "baseline_opset": 17, + "hypotheses": { + "h0": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 76.826, + "cv": 0.344, + 
"stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 76.629, + 96.253, + 78.694 + ], + "median_p50_ms": 78.694 + }, + "accuracy": null, + "label": "baseline (auto-config, W8A16)", + "opset": 17 + }, + "h1": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 89.003, + "cv": 0.316, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 95.119, + 92.075, + 89.82 + ], + "median_p50_ms": 92.075 + }, + "accuracy": null, + "label": "opset 17 explicit", + "opset": 17 + }, + "h2": { + "status": "TIMEOUT", + "label": "opset 19" + }, + "h3": { + "status": "TIMEOUT", + "label": "opset 21 (tests npu-001 bypass)" + }, + "h4": { + "status": "TIMEOUT", + "label": "opset 17 + conv fusions" + }, + "h5": { + "status": "TIMEOUT", + "label": "opset 21 + conv fusions" + } + }, + "best_hypothesis": "h0", + "baseline_p50_ms": 78.694, + "best_p50_ms": 78.694, + "best_gain_pct": 0.0, + "npu001_generalized": "N/A (h1, h3 not OK)", + "feature_gaps": [], + "errors": [ + "Model timed out at 1318s (before h2)", + "Model timed out at 1318s (before h3)", + "Model timed out at 1318s (before h4)", + "Model timed out at 1318s (before h5)" + ] +} diff --git a/research/autoconfig/catalog-qnn-sweep/microsoft--rad-dino/results_v2.json b/research/autoconfig/catalog-qnn-sweep/microsoft--rad-dino/results_v2.json new file mode 100644 index 000000000..20cf14836 --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/microsoft--rad-dino/results_v2.json @@ -0,0 +1,71 @@ +{ + "model_id": "microsoft/rad-dino", + "task": "image-feature-extraction", + "model_type": "dinov2", + "timestamp": "2026-06-16T16:43:10", + "ep": "qnn", + "device": "npu", + "validation_sweep": true, + "hypotheses": { + "h0": { + "status": "OK", + "screen": { + "p50_ms": 274.506, + "cv": 0.0134, + "stable": true, + "note": null + }, + "full": { + "p50s_ms": [ + 274.727, + 274.621, + 274.949 + ], + "median_p50_ms": 274.727 + }, + "label": 
"baseline (auto-config, W8A16)", + "opset": "auto" + }, + "h1": { + "status": "OK", + "screen": { + "p50_ms": 274.204, + "cv": 0.0088, + "stable": true, + "note": null + }, + "full": { + "p50s_ms": [ + 274.979, + 274.557, + 275.099 + ], + "median_p50_ms": 274.979 + }, + "label": "opset 17 explicit", + "opset": 17 + }, + "h3": { + "status": "OK", + "screen": { + "p50_ms": 275.269, + "cv": 0.0222, + "stable": true, + "note": null + }, + "full": { + "p50s_ms": [ + 275.298, + 275.355, + 275.564 + ], + "median_p50_ms": 275.355 + }, + "label": "opset 21 (tests npu-001)", + "opset": 21 + } + }, + "errors": [], + "npu001_opset21_vs_17_gain_pct": -0.1, + "npu001_note": "opset21 median 275.355ms vs opset17 274.979ms = -0.1%" +} diff --git a/research/autoconfig/catalog-qnn-sweep/microsoft--resnet-18/results.json b/research/autoconfig/catalog-qnn-sweep/microsoft--resnet-18/results.json new file mode 100644 index 000000000..555428793 --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/microsoft--resnet-18/results.json @@ -0,0 +1,124 @@ +{ + "model_id": "microsoft/resnet-18", + "task": "image-classification", + "model_type": "resnet", + "timestamp": "2026-06-13T13:38:52", + "ep": "qnn", + "device": "npu", + "baseline_opset": 17, + "hypotheses": { + "h0": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 4.031, + "cv": 1.6902, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 1.311, + 0.952, + 0.964 + ], + "median_p50_ms": 0.964 + }, + "accuracy": 0.66, + "label": "baseline (auto-config, W8A16)", + "opset": 17 + }, + "h1": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 3.111, + "cv": 2.0363, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 0.99, + 4.003, + 2.716 + ], + "median_p50_ms": 2.716 + }, + "accuracy": null, + "label": "opset 17 explicit", + "opset": 17 + }, + "h2": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 3.992, + "cv": 1.5168, + 
"stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 1.147, + 1.114, + 1.947 + ], + "median_p50_ms": 1.147 + }, + "accuracy": null, + "label": "opset 19", + "opset": 19 + }, + "h3": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 2.968, + "cv": 1.1762, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 1.054, + 2.175, + 4.107 + ], + "median_p50_ms": 2.175 + }, + "accuracy": null, + "label": "opset 21 (tests npu-001 bypass)", + "opset": 21 + }, + "h4": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 128.104, + "cv": 1.4049, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 132.3, + 134.97, + 130.669 + ], + "median_p50_ms": 132.3 + }, + "accuracy": null, + "label": "opset 17 + conv fusions", + "opset": 17 + }, + "h5": { + "status": "TIMEOUT", + "label": "opset 21 + conv fusions" + } + }, + "best_hypothesis": "h0", + "baseline_p50_ms": 0.964, + "best_p50_ms": 0.964, + "best_gain_pct": 0.0, + "npu001_generalized": true, + "feature_gaps": [], + "errors": [ + "Model timed out at 1560s (before h5)" + ] +} diff --git a/research/autoconfig/catalog-qnn-sweep/rizvandwiki--gender-classification/results_new.json b/research/autoconfig/catalog-qnn-sweep/rizvandwiki--gender-classification/results_new.json new file mode 100644 index 000000000..ad2ca7a54 --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/rizvandwiki--gender-classification/results_new.json @@ -0,0 +1,31 @@ +{ + "model_id": "rizvandwiki/gender-classification", + "task": "image-classification", + "hypotheses": { + "h0": { + "description": "opset17 no opts", + "model_file": "quantized.onnx", + "screen_p50_ms": 29.602, + "screen_cv": 0.5068, + "full_p50s_ms": [ + 14.151, + 14.942, + 13.889 + ], + "avg_p50_ms": 14.327 + }, + "h3": { + "description": "opset21 no opts", + "model_file": "quantized.onnx", + "screen_p50_ms": 15.056, + "screen_cv": 
0.579, + "full_p50s_ms": [ + 13.698, + 13.921, + 13.868 + ], + "avg_p50_ms": 13.829 + } + }, + "opset21_gain_pct": 3.48 +} diff --git a/research/autoconfig/catalog-qnn-sweep/sentence-transformers--all-MiniLM-L6-v2/results.json b/research/autoconfig/catalog-qnn-sweep/sentence-transformers--all-MiniLM-L6-v2/results.json new file mode 100644 index 000000000..67483f470 --- /dev/null +++ b/research/autoconfig/catalog-qnn-sweep/sentence-transformers--all-MiniLM-L6-v2/results.json @@ -0,0 +1,123 @@ +{ + "model_id": "sentence-transformers/all-MiniLM-L6-v2", + "task": "sentence-similarity", + "model_type": "bert", + "timestamp": "2026-06-13T15:58:36", + "ep": "qnn", + "device": "npu", + "baseline_opset": 17, + "hypotheses": { + "h0": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 5.934, + "cv": 0.2221, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 5.808, + 5.647, + 5.829 + ], + "median_p50_ms": 5.808 + }, + "accuracy": null, + "label": "baseline (auto-config, W8A16)", + "opset": 17 + }, + "h1": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 5.851, + "cv": 0.9986, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 5.814, + 5.88, + 5.912 + ], + "median_p50_ms": 5.88 + }, + "accuracy": null, + "label": "opset 17 explicit", + "opset": 17 + }, + "h2": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 5.309, + "cv": 0.2051, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 5.98, + 5.799, + 6.021 + ], + "median_p50_ms": 5.98 + }, + "accuracy": null, + "label": "opset 19", + "opset": 19 + }, + "h3": { + "status": "OK_HIGH_CV", + "screen": { + "p50_ms": 5.959, + "cv": 1.1272, + "stable": false, + "note": "DVFS noise — high CV expected on QNN NPU" + }, + "full": { + "p50s_ms": [ + 6.0, + 5.851, + 5.844 + ], + "median_p50_ms": 5.851 + }, + "accuracy": null, + "label": "opset 21 (tests npu-001 
bypass)", + "opset": 21 + }, + "h4": { + "status": "OK", + "screen": { + "p50_ms": 5.478, + "cv": 0.1344, + "stable": true + }, + "full": { + "p50s_ms": [ + 6.059, + 5.966, + 5.469 + ], + "median_p50_ms": 5.966 + }, + "accuracy": null, + "label": "opset 17 + conv fusions", + "opset": 17 + }, + "h5": { + "status": "TIMEOUT", + "label": "opset 21 + conv fusions" + } + }, + "best_hypothesis": "h0", + "baseline_p50_ms": 5.808, + "best_p50_ms": 5.808, + "best_gain_pct": 0.0, + "npu001_generalized": "neutral", + "feature_gaps": [], + "errors": [ + "Model timed out at 1346s (before h5)" + ] +} diff --git a/research/autoconfig/catalog_cpu_sweep.py b/research/autoconfig/catalog_cpu_sweep.py new file mode 100644 index 000000000..69a90ea59 --- /dev/null +++ b/research/autoconfig/catalog_cpu_sweep.py @@ -0,0 +1,827 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""catalog_cpu_sweep.py — WinML CPU EP optimization sweep across catalog + recipe models. + +Sweeps graph-optimization flags for CPU EP to find improvement opportunities beyond +autoconf defaults. Based on patterns detected by analyze_insight.py (30+ fusion candidates). + +Key CPU constraints from ep_knowledge/cpu.json: + cpu-001: opset 19+ REGRESSES on CPU (3-4x slowdown, Transpose Optimizer bypass) + → h3/h4 included deliberately to test on transformer models (cpu-001 was ConvNext only) + cpu-002: matmul_add_fusion REGRESSES if model already has Gemm ops + → guarded by Gemm check before applying + cpu-003: transpose_optimizer is neutral on ConvNext (may help transformers) + cpu-004: nchwc_transformer neutral on Gemm-heavy models + cpu-005: baseline is optimal for ConvNext — transformers untested + +Phase A: 200-iter screen, CV < 10% required (CPU is thermally stable). 
+Phase B: 3 sessions × 300 iters, 2s cool-down. +Phase C (confirmation): best hypothesis + 2 extra sessions. + All 5 p50s < baseline_min → CONFIRMED. +KEEP criterion: median p50 >= 5% improvement. + +Results: catalog-cpu-sweep//results.json +Summary: catalog-cpu-sweep/SUMMARY.md +""" + +from __future__ import annotations + +import argparse +import copy +import json +import subprocess +import sys +import time +from datetime import datetime +from pathlib import Path + + +sys.stdout.reconfigure(encoding="utf-8", errors="replace") # type: ignore[attr-defined] + +# ── constants ───────────────────────────────────────────────────────────────── +BASE_DIR = Path(__file__).parent +WINML = str(BASE_DIR / ".venv" / "Scripts" / "winml.exe") +EP = "cpu" +DEVICE = "cpu" +RESULTS_DIR = BASE_DIR / "catalog-cpu-sweep" + +SCREEN_WARMUP = 10 +SCREEN_ITERS = 200 +SCREEN_CV_MAX = 0.10 # CPU is stable — stricter than QNN + +FULL_WARMUP = 10 +FULL_ITERS = 300 +FULL_SESSIONS = 3 +CONFIRM_SESSIONS = 2 # Phase C: extra sessions for best hypothesis +COOL_DOWN_S = 2 # CPU cools quickly + +MIN_IMPROVEMENT_PCT = 5.0 + +BUILD_TIMEOUT_S = 10 * 60 +BENCH_TIMEOUT_S = 8 * 60 + +# Gemm threshold: if model has Gemm ops, skip matmul_add_fusion (cpu-002) +GEMM_SAFE_MATMUL_ADD = False # Conservative default; overridden per model + +# Hypotheses: (id, label, opset_override, extra_optim, skip_if_gemm) +# skip_if_gemm=True → skip if model.onnx already contains Gemm nodes (cpu-002 guard) +HYPOTHESES: list[tuple[str, str, int | None, dict | None, bool]] = [ + # ── Opset variants ───────────────────────────────────────────────────── + ("h0", "baseline (opset 17, autoconf defaults)", None, None, False), + ("h1", "opset 17 explicit", 17, None, False), + # cpu-001: opset 19/21 KNOWN to regress on ConvNext — included to test transformers + ("h2", "opset 19 (cpu-001 risk — transformer test)", 19, None, False), + ("h3", "opset 21 (cpu-001 risk — transformer test)", 21, None, False), + # ── Transformer fusions 
(graph-analysis-driven) ──────────────────────── + ("h4", "opset 17 + attention_fusion", 17, {"attention_fusion": True}, False), + ("h5", "opset 17 + skip_layer_norm_fusion", 17, {"skip_layer_norm_fusion": True}, False), + ("h6", "opset 17 + layer_norm_fusion", 17, {"layer_norm_fusion": True}, False), + ("h7", "opset 17 + bias_softmax_fusion", 17, {"bias_softmax_fusion": True}, False), + # ── MatMul fusions ───────────────────────────────────────────────────── + # matmul_add_fusion: skip if Gemm already present (cpu-002) + ("h8", "opset 17 + matmul_add_fusion (cpu-002 guarded)", 17, {"matmul_add_fusion": True}, True), + ("h9", "opset 17 + matmul_transpose_fusion", 17, {"matmul_transpose_fusion": True}, False), + # ── Transformer bundle (best flags combined) ─────────────────────────── + ( + "h10", + "opset 17 + attention + skip_layer_norm + layer_norm", + 17, + {"attention_fusion": True, "skip_layer_norm_fusion": True, "layer_norm_fusion": True}, + False, + ), + # ── Conv / layout (vision models) ───────────────────────────────────── + # nchwc_transformer: neutral on Gemm-heavy models (cpu-004), may help Conv-heavy + ( + "h11", + "opset 17 + nchwc_transformer (Conv-heavy models)", + 17, + {"nchwc_transformer": True}, + False, + ), + # ── Misc ─────────────────────────────────────────────────────────────── + ("h12", "opset 17 + transpose_optimizer", 17, {"transpose_optimizer": True}, False), + ("h13", "opset 17 + gelu_fusion explicit", 17, {"gelu_fusion": True}, False), +] + +# Catalog + recipe models (task, model_type) +ALL_MODELS = [ + ("microsoft/resnet-18", "image-classification", "resnet"), + ("apple/mobilevit-small", "image-classification", "mobilevit"), + ("facebook/dinov2-small", "image-feature-extraction", "dinov2"), + ("deepset/roberta-base-squad2", "question-answering", "roberta"), + ("deepset/tinyroberta-squad2", "question-answering", "roberta"), + ("BAAI/bge-small-en-v1.5", "sentence-similarity", "bert"), + ("sentence-transformers/all-MiniLM-L6-v2", 
"sentence-similarity", "bert"), + ("microsoft/rad-dino", "image-feature-extraction", "dinov2"), +] + + +# ── subprocess helpers ──────────────────────────────────────────────────────── + + +def run_cmd(cmd: list[str], label: str = "", timeout: int = 300) -> tuple[int, str, float]: + t0 = time.monotonic() + print(f" >> {label or ' '.join(cmd[:3])}", flush=True) + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=timeout, + encoding="utf-8", + errors="replace", + ) + elapsed = time.monotonic() - t0 + ok = "ok" if result.returncode == 0 else f"rc={result.returncode}" + print(f" {elapsed:.0f}s [{ok}]", flush=True) + if result.returncode != 0: + stderr = result.stderr.strip() + if stderr: + print(f" stderr: {stderr[:200]}", flush=True) + return result.returncode, result.stdout + result.stderr, elapsed + except subprocess.TimeoutExpired: + print(f" TIMEOUT after {timeout}s", flush=True) + return -1, "TIMEOUT", timeout + + +def _get_p50(perf_json: Path) -> float | None: + try: + d = json.loads(perf_json.read_text(encoding="utf-8")) + lat = d.get("latency_ms", d) + return float(lat.get("p50") or 0) or None + except Exception: + return None + + +def _get_cv(perf_json: Path) -> float | None: + try: + d = json.loads(perf_json.read_text(encoding="utf-8")) + lat = d.get("latency_ms", d) + p50 = float(lat.get("p50") or 0) + std = float(lat.get("std") or 0) + return std / p50 if p50 > 0 else None + except Exception: + return None + + +# ── config helpers ───────────────────────────────────────────────────────────── + + +def _patch_for_cpu(cfg: dict) -> dict: + """Remove quantization and compile from CPU config.""" + cfg = copy.deepcopy(cfg) + cfg["quant"] = None + cfg["compile"] = None + return cfg + + +def get_base_config(model_id: str, task: str, model_type: str) -> dict | None: + tmp_dir = RESULTS_DIR / "_tmp_config" + tmp_dir.mkdir(parents=True, exist_ok=True) + cfg_out = tmp_dir / f"{model_id.replace('/', '--')}_cpu.json" + + rc, out, _ = 
run_cmd( + [ + WINML, + "config", + "--model", + model_id, + "--task", + task, + "--ep", + EP, + "--device", + DEVICE, + "--model-type", + model_type, + "--output", + str(cfg_out), + ], + label=f"winml config --ep {EP}", + timeout=300, + ) + if rc != 0 or not cfg_out.exists(): + for line in out.splitlines(): + line = line.strip() + if line.startswith("{"): + try: + return _patch_for_cpu(json.loads(line)) + except Exception: + pass + return None + return _patch_for_cpu(json.loads(cfg_out.read_text(encoding="utf-8"))) + + +def make_hypothesis_config( + base_config: dict, opset_override: int | None, extra_optim: dict | None +) -> dict: + cfg = copy.deepcopy(base_config) + if opset_override is not None: + cfg.setdefault("export", {})["opset_version"] = opset_override + if extra_optim: + existing = cfg.get("optim") or {} + cfg["optim"] = {**existing, **extra_optim} + return cfg + + +def _model_has_gemm(model_onnx: Path) -> bool: + """Check if an optimized.onnx has Gemm nodes (cpu-002 guard).""" + try: + import onnx + + m = onnx.load(str(model_onnx)) + return any(n.op_type == "Gemm" for n in m.graph.node) + except Exception: + return False # Assume safe if can't check + + +# ── build + bench ────────────────────────────────────────────────────────────── + + +def run_build(model_id: str, cfg_path: Path, out_dir: Path) -> tuple[bool, str]: + """winml build --no-quant --no-compile --rebuild for CPU EP.""" + rc, out, _ = run_cmd( + [ + WINML, + "build", + "-m", + model_id, + "-c", + str(cfg_path), + "-o", + str(out_dir), + "--ep", + EP, + "--device", + DEVICE, + "--no-quant", + "--no-compile", + "--rebuild", + ], + label="winml build", + timeout=BUILD_TIMEOUT_S, + ) + return rc == 0, out + + +def run_perf_screen(onnx_path: Path, out_json: Path) -> tuple[float | None, float | None]: + rc, _, _ = run_cmd( + [ + WINML, + "perf", + "-m", + str(onnx_path), + "--ep", + EP, + "--device", + DEVICE, + "--warmup", + str(SCREEN_WARMUP), + "--iterations", + str(SCREEN_ITERS), + 
"--output", + str(out_json), + ], + label="perf screen (200 iters)", + timeout=BENCH_TIMEOUT_S, + ) + if rc != 0 or not out_json.exists(): + return None, None + p50 = _get_p50(out_json) + cv = _get_cv(out_json) + if p50: + print(f" screen: p50={p50:.2f}ms CV={cv:.3f}", flush=True) + return p50, cv + + +def run_perf_full(onnx_path: Path, hyp_dir: Path) -> list[float]: + p50s = [] + for s in range(1, FULL_SESSIONS + 1): + out_json = hyp_dir / f"full_s{s}.json" + rc, _, _ = run_cmd( + [ + WINML, + "perf", + "-m", + str(onnx_path), + "--ep", + EP, + "--device", + DEVICE, + "--warmup", + str(FULL_WARMUP), + "--iterations", + str(FULL_ITERS), + "--output", + str(out_json), + ], + label=f"perf full s{s}/{FULL_SESSIONS} ({FULL_ITERS} iters)", + timeout=BENCH_TIMEOUT_S, + ) + p50 = _get_p50(out_json) if rc == 0 and out_json.exists() else None + if p50: + print(f" full s{s}: p50={p50:.2f}ms", flush=True) + p50s.append(p50) + if s < FULL_SESSIONS: + print(f" cool-down {COOL_DOWN_S}s…", flush=True) + time.sleep(COOL_DOWN_S) + return p50s + + +# ── sweep logic ──────────────────────────────────────────────────────────────── + + +def sweep_model( + model_id: str, + task: str, + model_type: str, + only_hyp_ids: "set[str] | None" = None, + reuse_h0_config: bool = False, +) -> dict: + model_slug = model_id.replace("/", "--") + model_dir = RESULTS_DIR / model_slug + model_dir.mkdir(parents=True, exist_ok=True) + + results_path = model_dir / "results.json" + if only_hyp_ids and results_path.exists(): + try: + results = json.loads(results_path.read_text(encoding="utf-8")) + print(" [resume] loaded existing results", flush=True) + except Exception: + results = {} + else: + results = {} + + results.update( + { + "model_id": model_id, + "task": task, + "model_type": model_type, + "timestamp": datetime.now().isoformat(timespec="seconds"), + "ep": EP, + "device": DEVICE, + } + ) + results.setdefault("hypotheses", {}) + results.setdefault("baseline_p50_ms", None) + 
results.setdefault("best_p50_ms", None) + results.setdefault("best_hypothesis", None) + results.setdefault("best_gain_pct", None) + results.setdefault("errors", []) + + print(f"\n{'=' * 64}", flush=True) + print(f" SWEEP [CPU]: {model_id} [{task}]", flush=True) + if only_hyp_ids: + print(f" (delta — only: {sorted(only_hyp_ids)})", flush=True) + print(f"{'=' * 64}", flush=True) + + # Step 1: base config + print("\n[1/3] Generating base config…", flush=True) + base_config = None + + if reuse_h0_config: + h0_cfg = model_dir / "h0" / "build_config.json" + if h0_cfg.exists(): + try: + base_config = json.loads(h0_cfg.read_text(encoding="utf-8")) + print(" [reuse] h0 config loaded", flush=True) + except Exception: + pass + + if base_config is None: + base_config = get_base_config(model_id, task, model_type) + + if base_config is None: + results["errors"].append("base config generation failed") + _save_results(results, model_dir) + return results + + baseline_opset = (base_config.get("export") or {}).get("opset_version", "?") + results["baseline_opset"] = baseline_opset + print(f" baseline opset={baseline_opset} quant=NONE (CPU EP) compile=NONE", flush=True) + + # Step 2: hypothesis loop + print(f"\n[2/3] Running {len(HYPOTHESES)} hypotheses…", flush=True) + + baseline_p50: float | None = results.get("baseline_p50_ms") + model_has_gemm: bool | None = None # lazy-loaded for cpu-002 guard + + for hyp_id, label, opset_override, extra_optim, skip_if_gemm in HYPOTHESES: + if only_hyp_ids is not None and hyp_id not in only_hyp_ids: + continue + + sep = "─" * 56 + print(f"\n{sep}", flush=True) + print(f" {hyp_id}: {label}", flush=True) + print(f"{sep}", flush=True) + + hyp_config = make_hypothesis_config(base_config, opset_override, extra_optim) + opset_used = (hyp_config.get("export") or {}).get("opset_version", "?") + print(f" opset={opset_used} extra_optim={extra_optim}", flush=True) + + hyp_dir = model_dir / hyp_id + hyp_dir.mkdir(parents=True, exist_ok=True) + cfg_path = 
hyp_dir / "build_config.json" + cfg_path.write_text(json.dumps(hyp_config, indent=2), encoding="utf-8") + + # Build + build_ok, build_out = run_build(model_id, cfg_path, hyp_dir) + if not build_ok: + results["hypotheses"][hyp_id] = { + "status": "BUILD_FAIL", + "label": label, + "opset": opset_used, + "build_error": build_out[-300:] if build_out else "", + } + results["errors"].append(f"{hyp_id}: BUILD_FAIL") + continue + + # Find output ONNX + onnx_path = hyp_dir / "model.onnx" + if not onnx_path.exists(): + candidates = list(hyp_dir.glob("*.onnx")) + if candidates: + onnx_path = candidates[0] + else: + results["hypotheses"][hyp_id] = {"status": "NO_ONNX", "label": label} + results["errors"].append(f"{hyp_id}: build OK but no ONNX") + continue + + # cpu-002 guard: skip matmul_add_fusion if model already has Gemm + if skip_if_gemm: + if model_has_gemm is None: + opt_onnx = hyp_dir / "optimized.onnx" + model_has_gemm = _model_has_gemm(opt_onnx) if opt_onnx.exists() else False + if model_has_gemm: + print( + f" [cpu-002] SKIP {hyp_id}: model has Gemm nodes — matmul_add_fusion likely harmful", + flush=True, + ) + results["hypotheses"][hyp_id] = { + "status": "SKIPPED_CPU002", + "label": label, + "opset": opset_used, + "reason": "cpu-002: model already has Gemm — matmul_add_fusion skipped", + } + continue + + # Annotate cpu-001 risk + if opset_override is not None and opset_override >= 19: + print( + f" [cpu-001] NOTE: opset={opset_override} may regress on Conv-heavy models" + f" (cpu-001 validated on ConvNext only — testing transformer behavior)", + flush=True, + ) + + # Phase A: screen + screen_json = hyp_dir / "screen_perf.json" + screen_p50, screen_cv = run_perf_screen(onnx_path, screen_json) + + if screen_p50 is None: + results["hypotheses"][hyp_id] = {"status": "BENCH_FAIL", "label": label} + results["errors"].append(f"{hyp_id}: screen bench failed") + continue + + if screen_cv is not None and screen_cv > SCREEN_CV_MAX: + print(f" [warn] high CV={screen_cv:.3f} 
on CPU (unusual) — proceeding", flush=True) + + # Phase B: full bench + p50s = run_perf_full(onnx_path, hyp_dir) + if not p50s: + results["hypotheses"][hyp_id] = { + "status": "BENCH_FAIL", + "label": label, + "screen_p50_ms": screen_p50, + } + continue + + median_p50 = sorted(p50s)[len(p50s) // 2] + + hyp_data: dict = { + "status": "OK", + "label": label, + "opset": opset_used, + "extra_optim": extra_optim, + "screen_p50_ms": screen_p50, + "screen_cv": screen_cv, + "full_p50s_ms": p50s, + "median_p50_ms": median_p50, + } + + if hyp_id == "h0": + baseline_p50 = median_p50 + results["baseline_p50_ms"] = baseline_p50 + print(f" [baseline] p50={baseline_p50:.2f}ms", flush=True) + + if baseline_p50 and hyp_id != "h0": + gain_pct = (baseline_p50 - median_p50) / baseline_p50 * 100 + hyp_data["gain_vs_baseline_pct"] = round(gain_pct, 2) + verdict = ( + "KEEP" + if gain_pct >= MIN_IMPROVEMENT_PCT + else ("MARGINAL" if gain_pct > 0 else "DISCARD") + ) + # cpu-001: flag known-regression hypotheses specially + if opset_override is not None and opset_override >= 19 and gain_pct <= -50: + verdict = "CPU001_REGRESSION" + hyp_data["verdict"] = verdict + print( + f" [{verdict}] gain={gain_pct:+.1f}% ({baseline_p50:.2f}ms → {median_p50:.2f}ms)", + flush=True, + ) + + best_p50 = results.get("best_p50_ms") + if best_p50 is None or median_p50 < best_p50: + if gain_pct >= MIN_IMPROVEMENT_PCT: + results["best_p50_ms"] = median_p50 + results["best_hypothesis"] = hyp_id + results["best_gain_pct"] = round(gain_pct, 2) + else: + hyp_data["verdict"] = "BASELINE" + + results["hypotheses"][hyp_id] = hyp_data + + # Step 2b: Phase C confirmation + _run_confirmation_pass(results, model_dir, baseline_p50) + + # Step 3: finalise + _post_process(results) + _save_results(results, model_dir) + return results + + +def _run_confirmation_pass(results: dict, model_dir: Path, baseline_p50: float | None) -> None: + """Phase C: CONFIRM_SESSIONS extra sessions for best hypothesis.""" + if not baseline_p50: + 
return + hyps = results.get("hypotheses", {}) + keep_ids = [hid for hid, h in hyps.items() if h.get("verdict") == "KEEP"] + if not keep_ids: + return + + print( + f"\n ── Phase C: confirming {keep_ids} ({CONFIRM_SESSIONS} extra sessions each) ──", + flush=True, + ) + + for hyp_id in keep_ids: + hyp_data = hyps[hyp_id] + onnx_path: Path | None = None + hyp_dir = model_dir / hyp_id + for candidate in (hyp_dir / "model.onnx", hyp_dir / "optimized.onnx"): + if candidate.exists(): + onnx_path = candidate + break + if onnx_path is None: + continue + + print(f" [confirm] {hyp_id} ({hyp_data['label']})", flush=True) + extra_p50s: list[float] = [] + for s in range(1, CONFIRM_SESSIONS + 1): + out_json = hyp_dir / f"confirm_s{s}.json" + rc, _, _ = run_cmd( + [ + WINML, + "perf", + "-m", + str(onnx_path), + "--ep", + EP, + "--device", + DEVICE, + "--warmup", + str(FULL_WARMUP), + "--iterations", + str(FULL_ITERS), + "--output", + str(out_json), + ], + label=f"confirm s{s}/{CONFIRM_SESSIONS}", + timeout=BENCH_TIMEOUT_S, + ) + p50 = _get_p50(out_json) if rc == 0 and out_json.exists() else None + if p50: + print(f" confirm s{s}: p50={p50:.2f}ms", flush=True) + extra_p50s.append(p50) + if s < CONFIRM_SESSIONS: + time.sleep(COOL_DOWN_S) + + if not extra_p50s: + continue + + all_p50s = hyp_data.get("full_p50s_ms", []) + extra_p50s + overall_median = sorted(all_p50s)[len(all_p50s) // 2] + overall_gain = (baseline_p50 - overall_median) / baseline_p50 * 100 + wins = sum( + 1 for p in all_p50s if (baseline_p50 - p) / baseline_p50 * 100 >= MIN_IMPROVEMENT_PCT + ) + + hyp_data["confirm_p50s_ms"] = extra_p50s + hyp_data["all_p50s_ms"] = all_p50s + hyp_data["overall_median_p50_ms"] = round(overall_median, 3) + hyp_data["overall_gain_pct"] = round(overall_gain, 2) + hyp_data["sessions_above_threshold"] = wins + hyp_data["total_sessions"] = len(all_p50s) + + if wins == len(all_p50s): + hyp_data["verdict"] = "KEEP_CONFIRMED" + print( + f" [KEEP_CONFIRMED] {hyp_id}: {wins}/{len(all_p50s)} 
sessions ≥ {MIN_IMPROVEMENT_PCT}%," + f" overall={overall_gain:+.1f}%", + flush=True, + ) + else: + hyp_data["verdict"] = "MARGINAL_UNCONFIRMED" + print( + f" [MARGINAL_UNCONFIRMED] {hyp_id}: only {wins}/{len(all_p50s)} sessions above threshold", + flush=True, + ) + + if hyp_data["verdict"] == "KEEP_CONFIRMED": + best_p50 = results.get("best_p50_ms") + if best_p50 is None or overall_median < best_p50: + results["best_p50_ms"] = overall_median + results["best_hypothesis"] = hyp_id + results["best_gain_pct"] = round(overall_gain, 2) + + +def _post_process(results: dict) -> None: + hyps = results.get("hypotheses", {}) + baseline_p50 = results.get("baseline_p50_ms") + if not baseline_p50: + return + + keeps = [(hid, h) for hid, h in hyps.items() if h.get("verdict") in ("KEEP", "KEEP_CONFIRMED")] + unconfirmed = [ + (hid, h) for hid, h in hyps.items() if h.get("verdict") == "MARGINAL_UNCONFIRMED" + ] + regressions = [(hid, h) for hid, h in hyps.items() if h.get("verdict") == "CPU001_REGRESSION"] + + if keeps: + print(f"\n ✓ KEEP/KEEP_CONFIRMED: {[h[0] for h in keeps]}", flush=True) + if unconfirmed: + print(f" ⚠ MARGINAL_UNCONFIRMED: {[h[0] for h in unconfirmed]}", flush=True) + if regressions: + print(f" ✗ CPU001_REGRESSION: {[h[0] for h in regressions]}", flush=True) + if not keeps and not unconfirmed and not regressions: + print("\n No improvements found above 5% threshold.", flush=True) + + # Cross-architecture cpu-001 check: does opset 19/21 regress on THIS model? 
+ for hid in ("h2", "h3"): + h = hyps.get(hid, {}) + if h.get("status") == "OK" and baseline_p50: + gain = h.get("gain_vs_baseline_pct", 0.0) + if gain < -50: + print( + f" [cpu-001] CONFIRMED regression on {hid} for this architecture: {gain:.1f}%", + flush=True, + ) + elif gain > -10: + print( + f" [cpu-001] NOT OBSERVED on {hid} for {results.get('model_type')} — " + f"gain={gain:+.1f}% (ConvNext-specific?)", + flush=True, + ) + + +def _save_results(results: dict, model_dir: Path) -> None: + out = model_dir / "results.json" + out.write_text(json.dumps(results, indent=2, ensure_ascii=False), encoding="utf-8") + print(f" Results: {out}", flush=True) + + +# ── summary writer ──────────────────────────────────────────────────────────── + + +def write_summary(all_results: list[dict]) -> None: + lines = [ + "# CPU EP Optimization Sweep — Catalog Models", + "", + f"Generated: {datetime.now().isoformat(timespec='seconds')} ", + f"EP: `{EP}` / device: `{DEVICE}` ", + f"Protocol: screen {SCREEN_ITERS} iters (CV<{SCREEN_CV_MAX * 100:.0f}%)," + f" full {FULL_ITERS}×{FULL_SESSIONS} sessions" + f" + {CONFIRM_SESSIONS} confirm sessions for KEEP ", + "Constraints: NO quant, NO compile ", + "", + "---", + "", + "## cpu-001 Check: Does opset 19/21 Regress on Non-ConvNext Models?", + "", + "| Model | type | h2(opset19) gain% | h3(opset21) gain% | cpu-001 fires? 
|", + "|-------|------|-------------------|-------------------|---------------|", + ] + + for r in all_results: + model_id = r.get("model_id", "?") + mtype = r.get("model_type", "?") + h2 = r.get("hypotheses", {}).get("h2", {}) + h3 = r.get("hypotheses", {}).get("h3", {}) + g2 = ( + f"{h2.get('gain_vs_baseline_pct', 'N/A'):+.1f}%" + if h2.get("gain_vs_baseline_pct") is not None + else h2.get("status", "N/A") + ) + g3 = ( + f"{h3.get('gain_vs_baseline_pct', 'N/A'):+.1f}%" + if h3.get("gain_vs_baseline_pct") is not None + else h3.get("status", "N/A") + ) + fires = ( + "YES ≤-50%" + if any( + r.get("hypotheses", {}).get(h, {}).get("gain_vs_baseline_pct", 0) <= -50 + for h in ("h2", "h3") + ) + else "no" + ) + lines.append(f"| `{model_id}` | {mtype} | {g2} | {g3} | {fires} |") + + lines += [ + "", + "## Per-Model Results", + "", + "| Model | Baseline p50 | Best p50 | Best config | Gain% | Notes |", + "|-------|-------------|----------|-------------|-------|-------|", + ] + + for r in all_results: + model_id = r.get("model_id", "?") + baseline = f"{r['baseline_p50_ms']:.1f} ms" if r.get("baseline_p50_ms") else "N/A" + best = f"{r['best_p50_ms']:.1f} ms" if r.get("best_p50_ms") else "N/A" + best_h = r.get("best_hypothesis") or "N/A" + best_label = "" + if best_h != "N/A": + best_label = r.get("hypotheses", {}).get(best_h, {}).get("label", "") + gain = f"{r['best_gain_pct']:.1f}%" if r.get("best_gain_pct") is not None else "N/A" + errors = "; ".join(r.get("errors", []))[:80] or "none" + lines.append( + f"| `{model_id}` | {baseline} | {best} | {best_h} ({best_label}) | {gain} | {errors} |" + ) + + summary_path = RESULTS_DIR / "SUMMARY.md" + RESULTS_DIR.mkdir(parents=True, exist_ok=True) + summary_path.write_text("\n".join(lines) + "\n", encoding="utf-8") + print(f"\n📄 Summary: {summary_path}", flush=True) + + +# ── CLI ──────────────────────────────────────────────────────────────────────── + + +def main() -> None: + parser = argparse.ArgumentParser(description="CPU EP 
sweep across catalog models") + parser.add_argument("--model", help="Run a single model (HuggingFace model ID)") + parser.add_argument("--task", help="Task for single model run") + parser.add_argument("--model-type", dest="model_type", help="Model type for single model run") + parser.add_argument( + "--only-hypotheses", + dest="only_hyp", + help="Comma-separated list of hypothesis IDs to run (e.g. h4,h5,h10)", + ) + parser.add_argument( + "--reuse-h0-config", + dest="reuse_h0", + action="store_true", + help="Load base config from existing h0/build_config.json", + ) + args = parser.parse_args() + + only_hyp_ids = set(args.only_hyp.split(",")) if args.only_hyp else None + + all_results = [] + + if args.model: + if not args.task or not args.model_type: + print("ERROR: --task and --model-type required with --model", file=sys.stderr) + sys.exit(1) + r = sweep_model( + args.model, + args.task, + args.model_type, + only_hyp_ids=only_hyp_ids, + reuse_h0_config=args.reuse_h0, + ) + all_results.append(r) + else: + for model_id, task, model_type in ALL_MODELS: + r = sweep_model( + model_id, + task, + model_type, + only_hyp_ids=only_hyp_ids, + reuse_h0_config=args.reuse_h0, + ) + all_results.append(r) + + write_summary(all_results) + print("\n================================================================", flush=True) + print(" CPU SWEEP COMPLETE", flush=True) + print("================================================================", flush=True) + print(f"\n📄 Summary: {RESULTS_DIR / 'SUMMARY.md'}", flush=True) + + +if __name__ == "__main__": + main() diff --git a/research/autoconfig/catalog_gpu_sweep.py b/research/autoconfig/catalog_gpu_sweep.py new file mode 100644 index 000000000..7bfeab75f --- /dev/null +++ b/research/autoconfig/catalog_gpu_sweep.py @@ -0,0 +1,865 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +"""catalog_gpu_sweep.py — QNN GPU optimization hypothesis sweep for winml catalog models. + +QNN GPU differs fundamentally from QNN NPU: + - NO quantization (gpu-004: QDQ graphs hang on QNN GPU EP) + - NO compile (gpu-003: EPContext compilation regresses ~34% on GPU) + - NO nhwc-transformer (gpu-002: Adreno X1-85 does not benefit) + - CV gating IS reliable on GPU (no DVFS noise unlike NPU) + - All findings from gpu-001..006 are ConvNext-specific — transformer fusions + (attention, matmul_add, layer_norm) are UNTESTED and may help + +Hypothesis matrix (per model): + h0: baseline FP32 (auto-config, no quant, no compile) + h1: opset 17 explicit + h2: opset 19 + h3: opset 21 ← tests gpu-006 (unknown territory) + + Transformer/attention fusions (graph-analysis-driven): + h4: opset 17 + matmul_transpose_fusion (24-36× on transformer optimized.onnx) + h5: opset 17 + attention_fusion + h6: opset 17 + bias_softmax_fusion (12× on BERT-family) + h7: opset 17 + layer_norm_fusion + h8: opset 17 + skip_layer_norm_fusion + + Combined bundles: + h9: opset 21 + matmul_transpose_fusion + attention_fusion + h10: opset 17 + layer_norm_fusion + skip_layer_norm_fusion + matmul_transpose_fusion + h11: opset 17 + gelu_fusion (already in autoconf baseline; test stability benefit — gpu-005) + + Layout (Conv-heavy models only): + h12: opset 17 + transpose_optimizer + +2-phase bench (CV-gated, GPU is stable unlike NPU): + Phase A: 200-iter screen, CV < 15% required. + Phase B: 3 sessions × 300 iters, 5s cool-down. + Phase C (confirmation): KEEP candidates get 2 additional sessions. + All 5 sessions must show improvement → KEEP_CONFIRMED. + Fewer than 5/5 → MARGINAL_UNCONFIRMED. + KEEP criterion: median p50 >= 5% improvement AND CV < 5%. 
+ +Results: catalog-gpu-sweep//results.json +Summary: catalog-gpu-sweep/SUMMARY.md +""" + +from __future__ import annotations + +import argparse +import copy +import json +import subprocess +import sys +import time +from datetime import datetime +from pathlib import Path + + +sys.stdout.reconfigure(encoding="utf-8", errors="replace") # type: ignore[attr-defined] + +# ── constants ───────────────────────────────────────────────────────────────── +BASE_DIR = Path(__file__).parent +WINML = str(BASE_DIR / ".venv" / "Scripts" / "winml.exe") +EP = "qnn" +DEVICE = "gpu" +RESULTS_DIR = BASE_DIR / "catalog-gpu-sweep" + +SCREEN_WARMUP = 20 +SCREEN_ITERS = 200 +SCREEN_CV_MAX = 0.15 # GPU is CV-stable, unlike NPU + +FULL_WARMUP = 20 +FULL_ITERS = 300 +FULL_SESSIONS = 3 # baseline sessions per hypothesis +CONFIRM_SESSIONS = 2 # extra sessions for KEEP candidates (Phase C) +COOL_DOWN_S = 5 # GPU cools faster than NPU HTP + +MIN_IMPROVEMENT_PCT = 5.0 # % gain required to declare KEEP + +BUILD_TIMEOUT_S = 10 * 60 +BENCH_TIMEOUT_S = 5 * 60 + +# gpu-004: no quantization allowed +# gpu-003: no compile +GPU_NO_QUANT = True +GPU_NO_COMPILE = True + +# Hypotheses: (id, label, opset_override, extra_optim) +# extra_optim=None → keep auto-config optim unchanged +# extra_optim=dict → merge ON TOP of auto-config optim +HYPOTHESES = [ + ("h0", "baseline FP32 (no quant, no compile)", None, None), + ("h1", "opset 17 explicit", 17, None), + ("h2", "opset 19", 19, None), + ("h3", "opset 21 (tests gpu-006)", 21, None), + # ── transformer/attention fusions (graph-analysis-driven) ────────────── + ("h4", "opset 17 + matmul_transpose_fusion", 17, {"matmul_transpose_fusion": True}), + ("h5", "opset 17 + attention_fusion", 17, {"attention_fusion": True}), + ("h6", "opset 17 + bias_softmax_fusion", 17, {"bias_softmax_fusion": True}), + ( + "h7", + "opset 17 + layer_norm_fusion", + 17, + {"layer_norm_fusion": True}, + ), + ( + "h8", + "opset 17 + skip_layer_norm_fusion", + 17, + 
def run_cmd(cmd: list[str], label: str = "", timeout: int = 600) -> tuple[int, str, float]:
    """Run a command; return (returncode, combined_output, elapsed_s).

    The command is executed without a shell, with stdout and stderr captured
    as UTF-8 text (undecodable bytes are replaced).

    Args:
        cmd: Argument vector; ``cmd[0]`` is the executable.
        label: Text shown in the progress line. Falls back to ``cmd[1]`` (the
            sub-command) or, for a one-element command, to ``cmd[0]``.
        timeout: Seconds to wait before the run is abandoned.

    Returns:
        ``(returncode, stdout + stderr, elapsed_seconds)``. A timeout yields
        ``(-1, "TIMEOUT", elapsed)``; an executable that cannot be launched
        yields ``(-1, "LAUNCH FAILED: ...", elapsed)`` instead of raising, so
        callers can treat it like any other failed step.
    """
    t0 = time.time()
    # cmd[1] does not exist for a bare executable; do not crash on the label.
    shown = label or (cmd[1] if len(cmd) > 1 else " ".join(cmd))
    print(f" >> {shown}", flush=True)
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        elapsed = time.time() - t0
        print(f" TIMEOUT ({elapsed:.0f}s)", flush=True)
        return -1, "TIMEOUT", elapsed
    except OSError as exc:
        # Missing or non-executable binary: report it as a failed command.
        elapsed = time.time() - t0
        print(f" LAUNCH FAILED: {exc}", flush=True)
        return -1, f"LAUNCH FAILED: {exc}", elapsed

    elapsed = time.time() - t0
    tag = "ok" if result.returncode == 0 else f"rc={result.returncode}"
    print(f" {elapsed:.0f}s [{tag}]", flush=True)
    if result.returncode != 0:
        stderr = result.stderr.strip()
        if stderr:
            print(f" stderr: {stderr[:200]}", flush=True)
    return result.returncode, result.stdout + result.stderr, elapsed
def make_hypothesis_config(
    base_config: dict, opset_override: int | None, extra_optim: dict | None
) -> dict:
    """Apply opset + extra_optim on top of base config.

    Args:
        base_config: Base build config; never mutated (a deep copy is edited).
        opset_override: Value written to ``export.opset_version``; ``None``
            leaves the export section untouched.
        extra_optim: Flags merged over the existing ``optim`` section; a falsy
            value leaves ``optim`` untouched.

    Returns:
        The new config dict for this hypothesis.
    """
    cfg = copy.deepcopy(base_config)
    if opset_override is not None:
        # "export" may be present but null; setdefault() would hand back None.
        export = cfg.get("export") or {}
        export["opset_version"] = opset_override
        cfg["export"] = export
    if extra_optim:
        existing = cfg.get("optim") or {}
        cfg["optim"] = {**existing, **extra_optim}
    return cfg
Returns list of p50 values.""" + p50s = [] + for s in range(1, FULL_SESSIONS + 1): + out_json = hyp_dir / f"full_s{s}.json" + rc, out, _ = run_cmd( + [ + WINML, + "perf", + "-m", + str(onnx_path), + "--ep", + EP, + "--device", + DEVICE, + "--warmup", + str(FULL_WARMUP), + "--iterations", + str(FULL_ITERS), + "--output", + str(out_json), + ], + label=f"perf full s{s}/{FULL_SESSIONS} ({FULL_ITERS} iters)", + timeout=BENCH_TIMEOUT_S, + ) + p50 = _get_p50(out_json) if rc == 0 and out_json.exists() else None + if p50: + print(f" full s{s}: p50={p50:.2f}ms", flush=True) + p50s.append(p50) + if s < FULL_SESSIONS: + print(f" cool-down {COOL_DOWN_S}s…", flush=True) + time.sleep(COOL_DOWN_S) + return p50s + + +# ── sweep logic ─────────────────────────────────────────────────────────────── + + +def sweep_model( + model_id: str, + task: str, + model_type: str, + only_hyp_ids: "set[str] | None" = None, + reuse_h0_config: bool = False, +) -> dict: + """Run GPU hypotheses for one model. Returns results dict.""" + model_slug = model_id.replace("/", "--") + model_dir = RESULTS_DIR / model_slug + model_dir.mkdir(parents=True, exist_ok=True) + + # Resume from partial run + results_path = model_dir / "results.json" + if only_hyp_ids and results_path.exists(): + try: + results = json.loads(results_path.read_text(encoding="utf-8")) + print(" [resume] loaded existing results", flush=True) + except Exception: + results = {} + else: + results = {} + + results.update( + { + "model_id": model_id, + "task": task, + "model_type": model_type, + "timestamp": datetime.now().isoformat(timespec="seconds"), + "ep": EP, + "device": DEVICE, + } + ) + results.setdefault("baseline_opset", None) + results.setdefault("hypotheses", {}) + results.setdefault("best_hypothesis", None) + results.setdefault("baseline_p50_ms", None) + results.setdefault("best_p50_ms", None) + results.setdefault("best_gain_pct", None) + results.setdefault("opset21_gain_pct", None) # tests gpu-006 + 
results.setdefault("feature_gaps", []) + results.setdefault("errors", []) + + print(f"\n{'=' * 64}", flush=True) + print(f" SWEEP [GPU]: {model_id} [{task}]", flush=True) + if only_hyp_ids: + print(f" (delta — only: {sorted(only_hyp_ids)})", flush=True) + print(f"{'=' * 64}", flush=True) + + # ── Step 1: base config ──────────────────────────────────────────────── + print("\n[1/3] Generating base config…", flush=True) + base_config = None + + if reuse_h0_config: + h0_cfg = model_dir / "h0" / "build_config.json" + if h0_cfg.exists(): + try: + base_config = json.loads(h0_cfg.read_text(encoding="utf-8")) + print(" [reuse] h0 config loaded", flush=True) + except Exception: + pass + + if base_config is None: + base_config = get_base_config(model_id, task, model_type) + + if base_config is None: + results["errors"].append("base config generation failed") + _save_results(results, model_dir) + return results + + baseline_opset = (base_config.get("export") or {}).get("opset_version", "?") + results["baseline_opset"] = baseline_opset + print(f" baseline opset={baseline_opset} quant=NONE (GPU EP) compile=NONE", flush=True) + + # ── Step 2: hypothesis loop ──────────────────────────────────────────── + print(f"\n[2/3] Running {len(HYPOTHESES)} hypotheses…", flush=True) + + baseline_p50: float | None = results.get("baseline_p50_ms") + + for hyp_id, label, opset_override, extra_optim in HYPOTHESES: + if only_hyp_ids is not None and hyp_id not in only_hyp_ids: + continue + + sep = "─" * 56 + print(f"\n{sep}", flush=True) + print(f" {hyp_id}: {label}", flush=True) + print(f"{sep}", flush=True) + + hyp_config = make_hypothesis_config(base_config, opset_override, extra_optim) + opset_used = (hyp_config.get("export") or {}).get("opset_version", "?") + print(f" opset={opset_used} extra_optim={extra_optim}", flush=True) + + hyp_dir = model_dir / hyp_id + hyp_dir.mkdir(parents=True, exist_ok=True) + cfg_path = hyp_dir / "build_config.json" + cfg_path.write_text(json.dumps(hyp_config, 
indent=2), encoding="utf-8") + + # Build + build_ok, build_out = run_build(model_id, cfg_path, hyp_dir) + if not build_ok: + results["hypotheses"][hyp_id] = { + "status": "BUILD_FAIL", + "label": label, + "opset": opset_used, + "build_error": build_out[-300:] if build_out else "", + } + results["errors"].append(f"{hyp_id}: BUILD_FAIL") + continue + + # Find output ONNX + onnx_path = hyp_dir / "model.onnx" + if not onnx_path.exists(): + candidates = list(hyp_dir.glob("*.onnx")) + if candidates: + onnx_path = candidates[0] + else: + results["hypotheses"][hyp_id] = {"status": "NO_ONNX", "label": label} + results["errors"].append(f"{hyp_id}: build OK but no ONNX") + continue + + # Phase A: screen + screen_json = hyp_dir / "screen_perf.json" + screen_p50, screen_cv = run_perf_screen(onnx_path, screen_json) + + if screen_p50 is None: + results["hypotheses"][hyp_id] = {"status": "BENCH_FAIL", "label": label} + results["errors"].append(f"{hyp_id}: screen bench failed") + continue + + if screen_cv is not None and screen_cv > SCREEN_CV_MAX: + print( + f" [warn] high CV={screen_cv:.3f} on GPU (unusual) — proceeding anyway", flush=True + ) + + # Phase B: full bench + p50s = run_perf_full(onnx_path, hyp_dir) + if not p50s: + results["hypotheses"][hyp_id] = { + "status": "BENCH_FAIL", + "label": label, + "screen_p50_ms": screen_p50, + } + continue + + median_p50 = sorted(p50s)[len(p50s) // 2] + + hyp_data: dict = { + "status": "OK", + "label": label, + "opset": opset_used, + "extra_optim": extra_optim, + "screen_p50_ms": screen_p50, + "screen_cv": screen_cv, + "full_p50s_ms": p50s, + "median_p50_ms": median_p50, + } + + # Track baseline + if hyp_id == "h0": + baseline_p50 = median_p50 + results["baseline_p50_ms"] = baseline_p50 + print(f" [baseline] p50={baseline_p50:.2f}ms", flush=True) + + # Compare to baseline + if baseline_p50 and hyp_id != "h0": + gain_pct = (baseline_p50 - median_p50) / baseline_p50 * 100 + hyp_data["gain_vs_baseline_pct"] = round(gain_pct, 2) + verdict = 
def _run_confirmation_pass(results: dict, model_dir: Path, baseline_p50: float | None) -> None:
    """Phase C: run CONFIRM_SESSIONS additional sessions for every KEEP candidate.

    The Phase B p50s (``full_p50s_ms``) are pooled with the new confirmation
    p50s for each hypothesis whose verdict is ``KEEP``:

    * every pooled session beats the baseline by >= MIN_IMPROVEMENT_PCT:
      the verdict is upgraded to ``KEEP_CONFIRMED``;
    * otherwise the verdict is downgraded to ``MARGINAL_UNCONFIRMED``;
    * no ONNX file is found, or no confirmation session yields a p50:
      the hypothesis is left as ``KEEP``.

    Afterwards the ``best_*`` fields of ``results`` are re-derived from the
    hypotheses still marked ``KEEP``/``KEEP_CONFIRMED``, so they never keep
    pointing at a hypothesis that failed confirmation.

    Args:
        results: Per-model results dict; mutated in place.
        model_dir: Directory containing one sub-directory per hypothesis id.
        baseline_p50: Baseline median p50 in ms; the pass is skipped when falsy.
    """
    if not baseline_p50:
        return
    hyps = results.get("hypotheses", {})
    keep_ids = [hid for hid, h in hyps.items() if h.get("verdict") == "KEEP"]
    if not keep_ids:
        return

    print(
        f"\n ── Phase C: confirming {keep_ids} ({CONFIRM_SESSIONS} extra sessions each) ──",
        flush=True,
    )

    for hyp_id in keep_ids:
        hyp_data = hyps[hyp_id]
        hyp_dir = model_dir / hyp_id

        # Resolve the artifact exactly as Phase B does (model.onnx, else the
        # first *.onnx); pooling p50s from two different files is meaningless.
        onnx_path: Path | None = hyp_dir / "model.onnx"
        if not onnx_path.exists():
            onnx_path = next(iter(hyp_dir.glob("*.onnx")), None)
        if onnx_path is None:
            print(f" [confirm] {hyp_id}: no onnx found, skipping", flush=True)
            continue

        print(f" [confirm] {hyp_id} ({hyp_data['label']})", flush=True)
        extra_p50s: list[float] = []
        for s in range(1, CONFIRM_SESSIONS + 1):
            out_json = hyp_dir / f"confirm_s{s}.json"
            rc, _, _ = run_cmd(
                [
                    WINML,
                    "perf",
                    "-m",
                    str(onnx_path),
                    "--ep",
                    EP,
                    "--device",
                    DEVICE,
                    "--warmup",
                    str(FULL_WARMUP),
                    "--iterations",
                    str(FULL_ITERS),
                    "--output",
                    str(out_json),
                ],
                label=f"confirm s{s}/{CONFIRM_SESSIONS}",
                timeout=BENCH_TIMEOUT_S,
            )
            p50 = _get_p50(out_json) if rc == 0 and out_json.exists() else None
            if p50:
                print(f" confirm s{s}: p50={p50:.2f}ms", flush=True)
                extra_p50s.append(p50)
            if s < CONFIRM_SESSIONS:
                time.sleep(COOL_DOWN_S)

        if not extra_p50s:
            print(f" [confirm] {hyp_id}: confirm bench failed, keeping KEEP", flush=True)
            continue

        all_p50s: list[float] = hyp_data.get("full_p50s_ms", []) + extra_p50s
        overall_median = sorted(all_p50s)[len(all_p50s) // 2]
        overall_gain = (baseline_p50 - overall_median) / baseline_p50 * 100
        # A session "wins" when it alone clears the improvement threshold.
        wins = sum(
            1 for p in all_p50s if (baseline_p50 - p) / baseline_p50 * 100 >= MIN_IMPROVEMENT_PCT
        )

        hyp_data["confirm_p50s_ms"] = extra_p50s
        hyp_data["all_p50s_ms"] = all_p50s
        hyp_data["overall_median_p50_ms"] = round(overall_median, 3)
        hyp_data["overall_gain_pct"] = round(overall_gain, 2)
        hyp_data["sessions_above_threshold"] = wins
        hyp_data["total_sessions"] = len(all_p50s)

        if wins == len(all_p50s):
            hyp_data["verdict"] = "KEEP_CONFIRMED"
            print(
                f" [KEEP_CONFIRMED] {hyp_id}: {wins}/{len(all_p50s)} sessions ≥ {MIN_IMPROVEMENT_PCT}%,"
                f" overall gain={overall_gain:+.1f}%",
                flush=True,
            )
        else:
            hyp_data["verdict"] = "MARGINAL_UNCONFIRMED"
            print(
                f" [MARGINAL_UNCONFIRMED] {hyp_id}: only {wins}/{len(all_p50s)} sessions above threshold",
                flush=True,
            )

    # Re-derive best_* from the surviving candidates. Updating it incrementally
    # would leave it pointing at a hypothesis that was just downgraded.
    best_id: str | None = None
    best_p50: float | None = None
    best_gain: float | None = None
    for hid, h in hyps.items():
        if h.get("verdict") not in ("KEEP", "KEEP_CONFIRMED"):
            continue
        # Prefer the pooled figures when a confirmation run produced them.
        p50 = h.get("overall_median_p50_ms", h.get("median_p50_ms"))
        if p50 is None:
            continue
        if best_p50 is None or p50 < best_p50:
            best_id = hid
            best_p50 = p50
            best_gain = h.get("overall_gain_pct", h.get("gain_vs_baseline_pct"))
    results["best_hypothesis"] = best_id
    results["best_p50_ms"] = best_p50
    results["best_gain_pct"] = best_gain
def write_summary(all_results: list[dict]) -> None:
    """Render the cross-model Markdown report to ``RESULTS_DIR / "SUMMARY.md"``.

    Sections are written in this order: a metadata header, the per-model
    results table, the gpu-006 (opset 21) breakdown, and the collected feature
    gaps. The output directory is created if it does not exist.

    Args:
        all_results: One results dict per model. Only ``model_id`` is
            required; every other field is read with ``.get`` and rendered as
            ``N/A`` / ``none`` when absent.
    """
    # Markdown only breaks a line on two trailing spaces; one is ignored.
    hard_break = "  "

    def _cell(text: str) -> str:
        # Newlines or raw pipes in free text would end the table cell early.
        return text.replace("\r", " ").replace("\n", " ").replace("|", "\\|")

    lines = [
        "# QNN GPU Optimization Sweep — Catalog Models",
        "",
        f"Generated: {datetime.now().isoformat(timespec='seconds')}{hard_break}",
        f"EP: `{EP}` / device: `{DEVICE}`{hard_break}",
        f"Protocol: screen {SCREEN_ITERS} iters (CV<{SCREEN_CV_MAX * 100:.0f}%),"
        f" full {FULL_ITERS}×{FULL_SESSIONS} sessions + {CONFIRM_SESSIONS} confirm sessions for KEEP{hard_break}",
        f"Constraints: NO quant (gpu-004), NO compile (gpu-003), NO nhwc (gpu-002){hard_break}",
        "",
        "---",
        "",
        "## Per-Model Results",
        "",
        "| Model | Baseline p50 | Best p50 | Best config | Gain% | opset21 gain% | Notes |",
        "|-------|-------------|----------|-------------|-------|--------------|-------|",
    ]

    for r in all_results:
        model_id = r["model_id"]
        baseline = f"{r['baseline_p50_ms']:.1f} ms" if r.get("baseline_p50_ms") else "N/A"
        best = f"{r['best_p50_ms']:.1f} ms" if r.get("best_p50_ms") else "N/A"
        best_h = r.get("best_hypothesis") or "N/A"
        best_label = ""
        if best_h != "N/A":
            best_label = r.get("hypotheses", {}).get(best_h, {}).get("label", "")
        # Skip the parenthesised label when there is none (avoids "N/A ()").
        best_cfg = f"{best_h} ({best_label})" if best_label else best_h
        gain = f"{r['best_gain_pct']:.1f}%" if r.get("best_gain_pct") is not None else "N/A"
        opset21 = r.get("opset21_gain_pct")
        opset21_str = f"{opset21:+.1f}%" if opset21 is not None else "N/A"
        errors = _cell("; ".join(r.get("errors", []))[:80]) or "none"
        lines.append(
            f"| `{model_id}` | {baseline} | {best} | {_cell(best_cfg)} | {gain} | {opset21_str} | {errors} |"
        )

    lines += [
        "",
        "## gpu-006: opset 21 on QNN GPU",
        "",
        "Previously untested. This sweep provides first data across multiple architectures.",
        "",
    ]

    opset21_helps = [r["model_id"] for r in all_results if (r.get("opset21_gain_pct") or 0) >= 5]
    opset21_hurts = [r["model_id"] for r in all_results if (r.get("opset21_gain_pct") or 0) <= -5]
    # Models with no opset-21 measurement are excluded rather than counted as neutral.
    opset21_neutral = [
        r["model_id"]
        for r in all_results
        if r.get("opset21_gain_pct") is not None and -5 < (r.get("opset21_gain_pct") or 0) < 5
    ]
    lines += [
        f"- **Helps (≥5%):** {', '.join(opset21_helps) or 'none'}",
        f"- **Hurts (≤-5%):** {', '.join(opset21_hurts) or 'none'}",
        f"- **Neutral:** {', '.join(opset21_neutral) or 'none (no data yet)'}",
        "",
    ]

    lines += ["## Feature Gaps", ""]
    all_gaps = [
        f"- **`{r['model_id']}`**: {g}" for r in all_results for g in r.get("feature_gaps", [])
    ]
    lines += all_gaps if all_gaps else ["- None observed"]

    summary_path = RESULTS_DIR / "SUMMARY.md"
    summary_path.parent.mkdir(parents=True, exist_ok=True)
    summary_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
    print(f"\n📄 Summary: {summary_path}", flush=True)
h3,h4,h9" + ) + parser.add_argument("--reuse-h0-config", action="store_true") + args = parser.parse_args() + + only_hyp_ids: set[str] | None = None + if args.only_hypotheses: + only_hyp_ids = {h.strip() for h in args.only_hypotheses.split(",")} + + RESULTS_DIR.mkdir(parents=True, exist_ok=True) + + # Confirm QNN GPU EP + print("=== Confirming QNN GPU EP ===", flush=True) + rc, out, _ = run_cmd([WINML, "sys", "--list-ep"], label="winml sys --list-ep", timeout=30) + if "qnn" not in out.lower(): + print("❌ QNN EP not detected! Aborting.", flush=True) + sys.exit(1) + print("✓ QNN EP available\n", flush=True) + + if args.model: + if not args.task: + print("Error: --task required with --model", flush=True) + sys.exit(1) + models_to_run = [(args.model, args.task, args.model_type)] + else: + models_to_run = ALL_MODELS # type: ignore[assignment] + + all_results: list[dict] = [] + + for model_id, task, model_type in models_to_run: + try: + result = sweep_model( + model_id, + task, + model_type, + only_hyp_ids=only_hyp_ids, + reuse_h0_config=args.reuse_h0_config, + ) + except Exception as exc: + print(f"\n❌ Unexpected error for {model_id}: {exc}", flush=True) + result = { + "model_id": model_id, + "task": task, + "model_type": model_type, + "errors": [f"Unexpected exception: {exc}"], + "hypotheses": {}, + "feature_gaps": [], + } + all_results.append(result) + write_summary(all_results) + + print("\n" + "=" * 64, flush=True) + print(" GPU SWEEP COMPLETE", flush=True) + print("=" * 64, flush=True) + write_summary(all_results) + + +if __name__ == "__main__": + main() diff --git a/research/autoconfig/catalog_qnn_sweep.py b/research/autoconfig/catalog_qnn_sweep.py new file mode 100644 index 000000000..75903ed5d --- /dev/null +++ b/research/autoconfig/catalog_qnn_sweep.py @@ -0,0 +1,1283 @@ +#!/usr/bin/env python3 +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. 
+# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""catalog_qnn_sweep.py — QNN NPU optimization hypothesis sweep for winml catalog models. + +Hypothesis matrix (per model): + h0: baseline (auto-config, default winml build for QNN NPU + W8A16) + h1: opset 17 explicit (explicit opset, same optim as baseline) + h2: opset 19 + h3: opset 21 <- tests npu-001 generalization + + Conv fusions (npu-006 hazard on Conv-dense models): + h4: opset 17 + conv fusions (conv-bn, conv-add, conv-activation) + h5: opset 21 + conv fusions + + Attention/transformer fusions (graph-analysis-driven; 2026-06-17): + h6: opset 21 + matmul_transpose_fusion (24-36× detected on all transformer models) + h7: opset 21 + bias_softmax_fusion (12× on BERT-family: roberta, bge, MiniLM) + h8: opset 21 + attention_fusion (12× Softmax nodes across all transformers) + + Rewrite hypotheses (graph-analysis-driven; 2026-06-17): + h9: opset 21 + highdimRTR_lowdimRTR (12× Reshape-Transpose-Reshape on MobileViT) + h10: opset 17 + conv_add_fusion only (11× on ResNet; safe subset of npu-006 convoy) + +2-phase bench protocol (npu-007): + Phase A: 200-iter screen — high CV is NORMAL on QNN NPU (DVFS), always proceed to Phase B. + Phase B: 3 independent sessions x 500 iters, 30 s cool-down between sessions. + KEEP criterion: all 3 sessions faster than baseline, ranges must not overlap. + +Validated constraints applied: + npu-006: conv fusions (conv-bn/add/activation) produce FusedConv ops that QNN EP cannot + dispatch -> CPU fallback -> catastrophic regression on Conv-dense models. h4/h5 are + annotated with npu006_expected_regression=True when Conv% of total ops > 20%. + npu-001: opset21 speedup is architecture-specific. npu001_generalized uses range-overlap + check (max(h3_p50s) < min(h1_p50s)), not just median comparison. 
+ +Results: catalog-qnn-sweep//results.json +Summary: catalog-qnn-sweep/SUMMARY.md +""" + +import argparse +import copy +import json +import subprocess +import sys +import time +from datetime import datetime +from pathlib import Path + + +sys.stdout.reconfigure(encoding="utf-8", errors="replace") # type: ignore[attr-defined] + +# npu-006 guard: conv fusions produce FusedConv (ORT private op) that QNN EP cannot dispatch. +# On Conv-dense models (Conv% > this threshold), h4/h5 will catastrophically regress. +# Validated: ResNet-18 (Conv-dense) +4900%, DINOv2-base (1 Conv total) benign. +NPU006_CONV_PCT_THRESHOLD = 20.0 # percent of total ops; above this = high npu-006 risk + +# ── constants ───────────────────────────────────────────────────────────────── +BASE_DIR = Path(__file__).parent +WINML = str(BASE_DIR / ".venv" / "Scripts" / "winml.exe") +EP = "qnn" +DEVICE = "npu" +RESULTS_DIR = BASE_DIR / "catalog-qnn-sweep" + +SCREEN_WARMUP = 20 +SCREEN_ITERS = 200 +SCREEN_CV_MAX = 0.15 + +FULL_WARMUP = 50 +FULL_ITERS = 500 +FULL_SESSIONS = 3 +CONFIRM_SESSIONS = 2 # extra sessions for best hypothesis (Phase C confirmation) +COOL_DOWN_S = 30 + +MODEL_TIMEOUT_S = 180 * 60 # 3 hours per model — 6 hypotheses × ~30min each +BUILD_TIMEOUT_S = 8 * 60 # 8 min per individual build +BENCH_TIMEOUT_S = 8 * 60 # 8 min per bench run +EVAL_TIMEOUT_S = 6 * 60 # 6 min for accuracy eval +EVAL_SAMPLES = 50 + +# Hypotheses: (id, label, opset_override, extra_optim) +# opset_override=None → keep whatever auto-config chose +# extra_optim=None → keep auto-config optim unchanged +# extra_optim=dict → merge these flags ON TOP of auto-config optim +HYPOTHESES = [ + ("h0", "baseline (auto-config, W8A16)", None, None), + ("h1", "opset 17 explicit", 17, None), + ("h2", "opset 19", 19, None), + ("h3", "opset 21 (tests npu-001 bypass)", 21, None), + # ── conv fusions (npu-006) ────────────────────────────────────────────── + ( + "h4", + "opset 17 + conv fusions", + 17, + { + "conv_bn_fusion": True, + 
"conv_add_fusion": True, + "conv_activation_fusion": True, + }, + ), + ( + "h5", + "opset 21 + conv fusions", + 21, + { + "conv_bn_fusion": True, + "conv_add_fusion": True, + "conv_activation_fusion": True, + }, + ), + # ── attention/transformer fusions (graph-analysis-driven, 2026-06-17) ── + # matmul_transpose_fusion: 24-36× patterns detected on all transformer + # models (dinov2, roberta, bge, mobilevit). Tests whether fusing + # Transpose↔MatMul pairs helps QNN NPU dispatch. + ( + "h6", + "opset 21 + matmul_transpose_fusion", + 21, + {"matmul_transpose_fusion": True}, + ), + # bias_softmax_fusion: 12× Add→Softmax patterns in BERT-family models + # (roberta, bge, MiniLM). Attention mask is added before softmax — + # fusing may help QNN NPU kernel scheduling. + ( + "h7", + "opset 21 + bias_softmax_fusion", + 21, + {"bias_softmax_fusion": True}, + ), + # attention_fusion: 9-12× Softmax nodes across all transformers. + # Full QK^T V attention fusion into a single op. + ( + "h8", + "opset 21 + attention_fusion", + 21, + {"attention_fusion": True}, + ), + # ── rewrite hypotheses (graph-analysis-driven, 2026-06-17) ───────────── + # highdimRTR_lowdimRTR: 12× Reshape→Transpose→Reshape detected on + # MobileViT. Reduces high-rank RTR chains to lower-rank equivalents, + # potentially reducing Transpose overhead on QNN NPU. + ( + "h9", + "opset 21 + highdimRTR_lowdimRTR", + 21, + {"highdimRTR_lowdimRTR": True}, + ), + # conv_add_fusion only (safe subset of npu-006 convoy): 11× Conv→Add + # on ResNet. Distinct from conv_add_activation_fusion (FusedConv) — + # only fuses the Conv+bias Add, not the full 3-node chain. 
def run_cmd(cmd: list[str], label: str = "", timeout: int = 600) -> tuple[int, str, float]:
    """Run a command; return (returncode, combined_output, elapsed_s).

    The command is executed without a shell, with stdout and stderr captured
    as UTF-8 text (undecodable bytes are replaced).

    Args:
        cmd: Argument vector; ``cmd[0]`` is the executable.
        label: Text shown in the progress line. Falls back to ``cmd[1]`` (the
            sub-command) or, for a one-element command, to ``cmd[0]``.
        timeout: Seconds to wait before the run is abandoned.

    Returns:
        ``(returncode, stdout + stderr, elapsed_seconds)``. A timeout yields
        return code ``-999``; an executable that cannot be launched yields
        ``-998`` with a ``LAUNCH FAILED: ...`` message instead of raising.
    """
    t0 = time.time()
    # cmd[1] does not exist for a bare executable; do not crash on the label.
    shown = label or (cmd[1] if len(cmd) > 1 else " ".join(cmd))
    print(f" >> {shown}", flush=True)
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        elapsed = time.time() - t0
        print(f" TIMEOUT after {elapsed:.0f}s", flush=True)
        return -999, f"TIMEOUT after {timeout}s", elapsed
    except OSError as exc:
        # Missing or non-executable binary: report it as a failed command.
        elapsed = time.time() - t0
        print(f" LAUNCH FAILED: {exc}", flush=True)
        return -998, f"LAUNCH FAILED: {exc}", elapsed

    elapsed = time.time() - t0
    tag = "ok" if result.returncode == 0 else f"rc={result.returncode}"
    print(f" {elapsed:.0f}s [{tag}]", flush=True)
    if result.returncode != 0:
        # Show the tail of stderr, or of stdout when stderr is empty.
        snippet = (result.stderr or result.stdout or "")[-600:]
        print(f" stderr: {snippet}", flush=True)
    return result.returncode, result.stdout + result.stderr, elapsed
model. Returns (conv_pct, conv_count, total_count). + Used to assess npu-006 risk before running conv-fusion hypotheses. + Falls back to (0.0, 0, 0) if onnx is not importable or file missing. + + WARNING: (0.0, 0, 0) means UNKNOWN, not SAFE. The caller must treat a zero + result as unknown and emit a warning rather than silently skipping the guard. + """ + if not model_onnx.exists(): + return 0.0, 0, 0 + try: + import onnx # noqa: PLC0415 + except ImportError: + print( + " [ERROR] onnx package not installed — cannot assess npu-006 risk for conv fusions.\n" + " Install it: pip install onnx\n" + " Conv-fusion hypotheses (h4/h5) will be annotated as UNKNOWN risk.", + flush=True, + ) + return 0.0, 0, 0 + try: + model = onnx.load(str(model_onnx)) + ops = [n.op_type for n in model.graph.node] + total = len(ops) + conv_count = sum(1 for o in ops if o == "Conv") + pct = conv_count / total * 100 if total > 0 else 0.0 + return round(pct, 1), conv_count, total + except Exception as e: + print(f" [warn] Conv% analysis failed: {e}", flush=True) + return 0.0, 0, 0 + + +# ── winml wrappers ──────────────────────────────────────────────────────────── + + +def get_base_config(model_id: str, task: str, model_type: str) -> dict | None: + """Generate the auto-config via `winml config` for QNN NPU. + Returns the parsed config dict, or None on failure. 
+ """ + tmp_path = RESULTS_DIR / "_tmp_base_config.json" + tmp_path.parent.mkdir(parents=True, exist_ok=True) + + def _try(extra_args: list[str]) -> dict | None: + cmd = [ + WINML, + "config", + "-m", + model_id, + "-t", + task, + "--device", + DEVICE, + "--ep", + EP, + "--no-compile", + "-o", + str(tmp_path), + ] + extra_args + rc, out, _ = run_cmd(cmd, label="winml config", timeout=120) + if rc == 0 and tmp_path.exists(): + try: + cfg = json.loads(tmp_path.read_text(encoding="utf-8")) + tmp_path.unlink(missing_ok=True) + return cfg + except Exception as e: + print(f" [warn] config parse error: {e}", flush=True) + tmp_path.unlink(missing_ok=True) + return None + + # Try with explicit model-type first, fall back without it + cfg = _try(["--model-type", model_type]) + if cfg is None: + print(" [warn] config with --model-type failed, retrying without…", flush=True) + cfg = _try([]) + return cfg + + +def make_hypothesis_config( + base: dict, opset_override: int | None, extra_optim: dict | None +) -> dict: + """Return a modified copy of base config for this hypothesis.""" + cfg = copy.deepcopy(base) + if opset_override is not None: + if cfg.get("export"): + cfg["export"]["opset_version"] = opset_override + if extra_optim is not None: + existing = cfg.get("optim") or {} + cfg["optim"] = {**existing, **extra_optim} + return cfg + + +def run_build(model_id: str, cfg_path: Path, out_dir: Path) -> tuple[bool, str]: + """Run `winml build -c cfg_path -m model_id -o out_dir --ep qnn --device npu --no-compile`.""" + out_dir.mkdir(parents=True, exist_ok=True) + cmd = [ + WINML, + "build", + "-c", + str(cfg_path), + "-m", + model_id, + "-o", + str(out_dir), + "--ep", + EP, + "--device", + DEVICE, + "--no-compile", + "--rebuild", + ] + rc, out, _ = run_cmd(cmd, label=f"winml build [{out_dir.name}]", timeout=BUILD_TIMEOUT_S) + return rc == 0, out + + +def bench_screen(model_path: Path) -> tuple[float | None, float, bool]: + """Phase A: 200-iter screen. 
def bench_full(model_path: Path) -> list[float]:
    """Phase B: run FULL_SESSIONS perf sessions of FULL_ITERS iterations each.

    Each session writes ``full_perf_s<N>.json`` next to the model. A cool-down
    of COOL_DOWN_S seconds separates consecutive sessions, including after a
    failed one.

    Args:
        model_path: Path of the ONNX model to benchmark.

    Returns:
        The p50 latency (ms) of every session that ran and parsed successfully,
        in session order. Failed sessions are skipped, so the list may be
        shorter than FULL_SESSIONS or empty.
    """
    p50s: list[float] = []
    for s in range(1, FULL_SESSIONS + 1):
        out_json = model_path.parent / f"full_perf_s{s}.json"
        # Drop output left by an earlier run in the same directory, so a command
        # that exits 0 without writing cannot be scored from stale numbers.
        out_json.unlink(missing_ok=True)
        rc, _, _ = run_cmd(
            [
                WINML,
                "perf",
                "-m",
                str(model_path),
                "--ep",
                EP,
                "--device",
                DEVICE,
                "--warmup",
                str(FULL_WARMUP),
                "--iterations",
                str(FULL_ITERS),
                "-o",
                str(out_json),
            ],
            label=f"perf full s{s}/{FULL_SESSIONS} ({FULL_ITERS} iters)",
            timeout=BENCH_TIMEOUT_S,
        )
        if rc == 0 and out_json.exists():
            # Best effort by design: a malformed report drops this session only.
            try:
                # Explicit UTF-8, matching the other JSON reads in this file,
                # rather than the locale default encoding.
                data = json.loads(out_json.read_text(encoding="utf-8"))
                lat = data["latency_ms"]
                p50, std = lat["p50"], lat["std"]
                cv = std / p50 if p50 > 0 else 999.0
                print(f" full s{s}: p50={p50:.2f}ms std={std:.2f}ms CV={cv:.3f}", flush=True)
                p50s.append(p50)
            except Exception as e:
                print(f" [warn] full bench s{s} parse error: {e}", flush=True)
        else:
            print(f" [warn] full bench s{s} failed", flush=True)
        if s < FULL_SESSIONS:
            print(f" cool-down {COOL_DOWN_S}s…", flush=True)
            time.sleep(COOL_DOWN_S)
    return p50s
Returns accuracy or None.""" + out_json = model_path.parent / "eval_result.json" + rc, _, _ = run_cmd( + [ + WINML, + "eval", + "-m", + str(model_path), + "--model-id", + model_id, + "--task", + task, + "--ep", + EP, + "--device", + DEVICE, + "--samples", + str(EVAL_SAMPLES), + "-o", + str(out_json), + ], + label="winml eval (accuracy gate)", + timeout=EVAL_TIMEOUT_S, + ) + if rc != 0 or not out_json.exists(): + return None + try: + data = json.loads(out_json.read_text()) + metrics = data.get("metrics", data) + acc = metrics.get("accuracy") + if acc is not None: + print(f" eval accuracy: {acc:.4f}", flush=True) + return float(acc) if acc is not None else None + except Exception as e: + print(f" [warn] eval parse error: {e}", flush=True) + return None + + +def _perf_result(onnx_path: Path, model_id: str, task: str, run_eval_flag: bool) -> dict: + """Run Phase A + Phase B bench and optionally eval. Returns result dict.""" + result: dict = {"status": "PENDING", "screen": {}, "full": {}, "accuracy": None} + + p50_screen, cv_screen, stable = bench_screen(onnx_path) + result["screen"] = { + "p50_ms": p50_screen, + "cv": round(cv_screen, 4), + "stable": stable, + } + + if p50_screen is None: + # Hard failure (rc != 0 or missing output) — cannot proceed + result["status"] = "SCREEN_FAIL" + return result + + # QNN NPU note: always proceed to Phase B even if screen CV is high. + # Phase B multi-session cool-down is the thermal / DVFS control. 
+ if not stable: + result["screen"]["note"] = "DVFS noise — high CV expected on QNN NPU" + + full_p50s = bench_full(onnx_path) + if not full_p50s: + result["status"] = "BENCH_FAIL" + return result + + median_p50 = float(sorted(full_p50s)[len(full_p50s) // 2]) + result["full"] = { + "p50s_ms": [round(p, 3) for p in full_p50s], + "median_p50_ms": round(median_p50, 3), + } + result["status"] = "OK" if stable else "OK_HIGH_CV" + + if run_eval_flag: + acc = run_eval(onnx_path, model_id, task) + result["accuracy"] = acc + + return result + + +# ── main sweep logic ────────────────────────────────────────────────────────── + + +def sweep_model( + model_id: str, + task: str, + model_type: str, + run_eval_on_baseline: bool, + only_hyp_ids: "set[str] | None" = None, + reuse_h0_config: bool = False, +) -> dict: + """Run hypotheses for one model on QNN NPU. Returns results dict. + + Args: + only_hyp_ids: If set, only run these hypothesis IDs (e.g. {'h6','h7'}). + reuse_h0_config: If True, load base config from existing h0/build_config.json + instead of calling winml config again. 
+ """ + model_slug = model_id.replace("/", "--") + model_dir = RESULTS_DIR / model_slug + model_dir.mkdir(parents=True, exist_ok=True) + + # When resuming from partial run, load existing results to preserve prior data + results_path = model_dir / "results.json" + if only_hyp_ids and results_path.exists(): + try: + results = json.loads(results_path.read_text(encoding="utf-8")) + print(f" [resume] loaded existing results from {results_path}", flush=True) + except Exception: + results = {} + else: + results = {} + + results.update( + { + "model_id": model_id, + "task": task, + "model_type": model_type, + "timestamp": datetime.now().isoformat(timespec="seconds"), + "ep": EP, + "device": DEVICE, + } + ) + results.setdefault("baseline_opset", None) + results.setdefault("conv_pct", None) + results.setdefault("npu006_risk", None) + results.setdefault("npu006_regression", None) + results.setdefault("hypotheses", {}) + results.setdefault("best_hypothesis", None) + results.setdefault("baseline_p50_ms", None) + results.setdefault("best_p50_ms", None) + results.setdefault("best_gain_pct", None) + results.setdefault("npu001_generalized", None) + results.setdefault("npu001_ranges_non_overlapping", None) + results.setdefault("feature_gaps", []) + results.setdefault("errors", []) + + print(f"\n{'=' * 64}", flush=True) + print(f" SWEEP: {model_id} [{task}]", flush=True) + if only_hyp_ids: + print(f" (delta sweep — only: {sorted(only_hyp_ids)})", flush=True) + print(f"{'=' * 64}", flush=True) + + model_start = time.time() + + # ── Step 1: generate base config (or reuse from existing h0) ────────────── + print("\n[1/3] Generating base config (winml config)…", flush=True) + base_config = None + + if reuse_h0_config: + h0_cfg_path = model_dir / "h0" / "build_config.json" + if h0_cfg_path.exists(): + try: + base_config = json.loads(h0_cfg_path.read_text(encoding="utf-8")) + print(f" [reuse] loaded h0 config from {h0_cfg_path}", flush=True) + except Exception as e: + print(f" [reuse] 
failed to load h0 config: {e} — regenerating", flush=True) + + if base_config is None: + base_config = get_base_config(model_id, task, model_type) + + if base_config is None: + results["errors"].append("base config generation failed — model may not be supported") + results["feature_gaps"].append("winml config failed for this model (inspect winml output)") + _save_results(results, model_dir) + return results + + baseline_opset = (base_config.get("export") or {}).get("opset_version", "?") + results["baseline_opset"] = baseline_opset + base_quant = base_config.get("quant") + print( + f" auto-config: opset={baseline_opset} quant={'W8A16' if base_quant else 'NONE'}", + flush=True, + ) + if base_quant is None: + results["feature_gaps"].append( + "auto-config did not include quantization — possible model type not supported for W8A16" + ) + optim_keys = list((base_config.get("optim") or {}).keys()) + print(f" auto-config optim: {optim_keys}", flush=True) + + # ── Step 2: per-hypothesis loop ─────────────────────────────────────────── + print(f"\n[2/3] Running {len(HYPOTHESES)} hypotheses…", flush=True) + + # conv_pct is filled in after h0 succeeds (used to annotate npu-006 risk for h4/h5) + conv_pct: float = 0.0 + npu006_risk: bool = False + + for hyp_id, label, opset_override, extra_optim in HYPOTHESES: + # Hypothesis filter: skip if not in --only-hypotheses list + if only_hyp_ids is not None and hyp_id not in only_hyp_ids: + continue + elapsed_total = time.time() - model_start + if elapsed_total > MODEL_TIMEOUT_S: + print( + f"\n ⏰ MODEL TIMEOUT ({elapsed_total:.0f}s > {MODEL_TIMEOUT_S}s) — stopping", + flush=True, + ) + results["hypotheses"][hyp_id] = {"status": "TIMEOUT", "label": label} + results["errors"].append(f"Model timed out at {elapsed_total:.0f}s (before {hyp_id})") + continue + + sep = "─" * 56 + print(f"\n{sep}", flush=True) + print(f" {hyp_id}: {label}", flush=True) + print(f"{sep}", flush=True) + + # Build config for this hypothesis + hyp_config = 
make_hypothesis_config(base_config, opset_override, extra_optim) + opset_used = (hyp_config.get("export") or {}).get("opset_version", "?") + print(f" opset={opset_used} extra_optim={extra_optim}", flush=True) + + hyp_dir = model_dir / hyp_id + hyp_dir.mkdir(parents=True, exist_ok=True) + cfg_path = hyp_dir / "build_config.json" + cfg_path.write_text(json.dumps(hyp_config, indent=2), encoding="utf-8") + + # Build + build_ok, build_out = run_build(model_id, cfg_path, hyp_dir) + + if not build_ok: + is_timeout = "TIMEOUT" in build_out + status = "BUILD_TIMEOUT" if is_timeout else "BUILD_FAIL" + error_snippet = build_out[-600:] if not is_timeout else "build timed out" + results["hypotheses"][hyp_id] = { + "status": status, + "label": label, + "opset": opset_used, + "build_error": error_snippet, + } + results["errors"].append(f"{hyp_id}: {status}") + # Try to extract feature gap info from the build output + if any( + kw in build_out.lower() for kw in ("unsupported", "not supported", "no handler") + ): + results["feature_gaps"].append( + f"{hyp_id} ({label}): EP/op unsupported — '{build_out[-200:]}'" + ) + elif is_timeout: + results["feature_gaps"].append( + f"{hyp_id} ({label}): build timeout — possible QNN compilation hang" + ) + continue + + onnx_path = hyp_dir / "model.onnx" + if not onnx_path.exists(): + # Check for EPContext model (compile might have happened anyway) + ctx_candidates = list(hyp_dir.glob("*_ctx*.onnx")) + list( + hyp_dir.glob("model_npu*.onnx") + ) + if ctx_candidates: + onnx_path = ctx_candidates[0] + print(f" [info] using compiled model: {onnx_path.name}", flush=True) + else: + results["hypotheses"][hyp_id] = { + "status": "NO_MODEL_ONNX", + "label": label, + "opset": opset_used, + } + results["errors"].append(f"{hyp_id}: build OK but model.onnx missing") + results["feature_gaps"].append( + f"{hyp_id}: build completed but no model.onnx produced (unexpected pipeline behavior)" + ) + continue + + # After h0: analyze Conv% to assess npu-006 risk for 
h4/h5 + if hyp_id == "h0" and onnx_path.exists(): + conv_pct, conv_count, total_count = _count_conv_pct(onnx_path) + # Treat (0.0, 0, 0) as UNKNOWN (not safe) — onnx may be unavailable. + conv_unknown = conv_pct == 0.0 and total_count == 0 + npu006_risk = conv_pct > NPU006_CONV_PCT_THRESHOLD or conv_unknown + results["conv_pct"] = None if conv_unknown else conv_pct + results["npu006_risk"] = npu006_risk + if conv_unknown: + print( + " [npu-006] Conv% analysis returned UNKNOWN (onnx unavailable or file missing)" + " — treating h4/h5 as HIGH RISK to be safe", + flush=True, + ) + elif npu006_risk: + print( + f" [npu-006] Conv%={conv_pct:.1f}% ({conv_count}/{total_count} ops)" + f" > {NPU006_CONV_PCT_THRESHOLD:.0f}% threshold", + flush=True, + ) + print( + " [npu-006] h4/h5 (conv fusions) EXPECTED to catastrophically regress" + " — FusedConv not supported by QNN EP -> CPU fallback", + flush=True, + ) + else: + print( + f" [npu-006] Conv%={conv_pct:.1f}% ({conv_count}/{total_count} ops)" + f" <= {NPU006_CONV_PCT_THRESHOLD:.0f}% — h4/h5 low risk", + flush=True, + ) + + # Annotate h4/h5 with npu-006 risk BEFORE running bench + if hyp_id in ("h4", "h5") and npu006_risk: + print( + f" [npu-006] WARNING: {hyp_id} uses conv fusions on Conv-dense model" + f" (Conv%={conv_pct:.1f}%) — expect catastrophic regression", + flush=True, + ) + + # Only run eval for h0 (baseline) on image-classification models + do_eval = run_eval_on_baseline and hyp_id == "h0" and task == "image-classification" + + bench = _perf_result(onnx_path, model_id, task, do_eval) + bench["label"] = label + bench["opset"] = opset_used + if hyp_id in ("h4", "h5"): + bench["npu006_expected_regression"] = npu006_risk + results["hypotheses"][hyp_id] = bench + + if bench["status"] == "UNSTABLE": + results["errors"].append(f"{hyp_id}: bench UNSTABLE (CV too high)") + + # ── Step 3: compute summary stats ───────────────────────────────────────── + print("\n[3/3] Computing summary stats…", flush=True) + 
_compute_summary(results) + + # ── Step 3b: Phase C — confirm the best hypothesis with 2 extra sessions ── + _run_confirmation_pass_npu(results, model_dir) + + _save_results(results, model_dir) + return results + + +def _run_confirmation_pass_npu(results: dict, model_dir: Path) -> None: + """Phase C: run CONFIRM_SESSIONS extra sessions on the best hypothesis. + + For NPU (high DVFS noise), uses range-non-overlap criterion: + - All (FULL_SESSIONS + CONFIRM_SESSIONS) p50s < baseline_min → CONFIRMED + - Otherwise → MARGINAL_UNCONFIRMED, best_gain_pct flagged as uncertain + """ + best_h_id: str | None = results.get("best_hypothesis") + baseline_p50: float | None = results.get("baseline_p50_ms") + if not best_h_id or not baseline_p50: + return + + best_hyp = results["hypotheses"].get(best_h_id, {}) + best_gain = results.get("best_gain_pct", 0.0) + if best_gain < 5.0: + return # nothing worth confirming + + # Find ONNX + hyp_dir = model_dir / best_h_id + onnx_path: Path | None = None + for candidate in (hyp_dir / "quantized.onnx", hyp_dir / "optimized.onnx"): + if candidate.exists(): + onnx_path = candidate + break + if onnx_path is None: + return + + print( + f"\n ── Phase C: confirming best hypothesis {best_h_id} ({CONFIRM_SESSIONS} extra sessions) ──", + flush=True, + ) + + confirm_p50s: list[float] = [] + for s in range(1, CONFIRM_SESSIONS + 1): + out_json = hyp_dir / f"confirm_s{s}.json" + rc, _, _ = run_cmd( + [ + WINML, + "perf", + "-m", + str(onnx_path), + "--ep", + EP, + "--device", + DEVICE, + "--warmup", + str(FULL_WARMUP), + "--iterations", + str(FULL_ITERS), + "-o", + str(out_json), + ], + label=f"confirm s{s}/{CONFIRM_SESSIONS}", + timeout=BENCH_TIMEOUT_S, + ) + if rc == 0 and out_json.exists(): + try: + data = json.loads(out_json.read_text()) + lat = data["latency_ms"] + p50 = lat["p50"] + print(f" confirm s{s}: p50={p50:.2f}ms", flush=True) + confirm_p50s.append(p50) + except Exception as e: + print(f" [warn] confirm s{s} parse error: {e}", flush=True) + 
if s < CONFIRM_SESSIONS: + print(f" cool-down {COOL_DOWN_S}s…", flush=True) + time.sleep(COOL_DOWN_S) + + if not confirm_p50s: + print(f" [confirm] {best_h_id}: confirm bench failed, conclusion unchanged", flush=True) + return + + # Get all p50s including prior FULL_SESSIONS runs + prior_p50s: list[float] = best_hyp.get("full", {}).get("p50s_ms", []) + all_p50s = prior_p50s + confirm_p50s + + # Baseline comparison: use h0/h1 p50s for range overlap test + baseline_h = None + for h_id in ("h0", "h1"): + h = results["hypotheses"].get(h_id, {}) + if h.get("status") in ("OK", "OK_HIGH_CV"): + baseline_h = h + break + baseline_p50s: list[float] = ( + baseline_h["full"].get("p50s_ms", [baseline_p50]) if baseline_h else [baseline_p50] + ) + + overall_median = float(sorted(all_p50s)[len(all_p50s) // 2]) + overall_gain = (baseline_p50 - overall_median) / baseline_p50 * 100 + # Strict: max of all best-hypothesis sessions must be < min of baseline sessions + ranges_confirmed = max(all_p50s) < min(baseline_p50s) if baseline_p50s else False + + best_hyp["confirm_p50s_ms"] = [round(p, 3) for p in confirm_p50s] + best_hyp["all_p50s_ms"] = [round(p, 3) for p in all_p50s] + best_hyp["confirm_overall_median_ms"] = round(overall_median, 3) + best_hyp["confirm_overall_gain_pct"] = round(overall_gain, 2) + best_hyp["confirm_ranges_non_overlapping"] = ranges_confirmed + + if ranges_confirmed: + best_hyp["confirm_verdict"] = "CONFIRMED" + results["best_gain_pct"] = round(overall_gain, 2) + print( + f" [CONFIRMED] {best_h_id}: all {len(all_p50s)} p50s < baseline min" + f" — gain={overall_gain:+.1f}% (ranges non-overlapping)", + flush=True, + ) + else: + best_hyp["confirm_verdict"] = "MARGINAL_UNCONFIRMED" + print( + f" [MARGINAL_UNCONFIRMED] {best_h_id}: max={max(all_p50s):.1f}ms" + f" ≥ baseline min={min(baseline_p50s):.1f}ms — DVFS noise, ranges overlap", + flush=True, + ) + + +def _compute_summary(results: dict) -> None: + """Fill in baseline_p50, best_hypothesis, best_gain, 
npu001_generalized, npu006_regression.""" + hyps = results["hypotheses"] + + # Baseline p50: prefer h0, fall back to h1 + baseline_p50: float | None = None + for h_id in ("h0", "h1"): + h = hyps.get(h_id, {}) + if h.get("status") in ("OK", "OK_HIGH_CV"): + baseline_p50 = h.get("full", {}).get("median_p50_ms") + if baseline_p50: + break + results["baseline_p50_ms"] = baseline_p50 + + # Best hypothesis (minimum median p50) + best_p50: float | None = None + best_h: str | None = None + for h_id, h in hyps.items(): + if h.get("status") in ("OK", "OK_HIGH_CV"): + p50 = h.get("full", {}).get("median_p50_ms") + if p50 is not None and (best_p50 is None or p50 < best_p50): + best_p50 = p50 + best_h = h_id + results["best_hypothesis"] = best_h + results["best_p50_ms"] = best_p50 + + if baseline_p50 and best_p50: + gain_pct = (baseline_p50 - best_p50) / baseline_p50 * 100 + results["best_gain_pct"] = round(gain_pct, 2) + + # ── npu-001: opset21 vs opset17 (h3 vs h1) ────────────────────────────── + # Criterion 1 (median): h3 p50 < h1 p50 by >=5% + # Criterion 2 (range-overlap, stricter): max(h3_p50s) < min(h1_p50s) + # Both must agree for "True"; either failing gives "neutral" + h1 = hyps.get("h1", {}) + h3 = hyps.get("h3", {}) + if h1.get("status") in ("OK", "OK_HIGH_CV") and h3.get("status") in ("OK", "OK_HIGH_CV"): + p50_h1 = h1["full"].get("median_p50_ms", float("inf")) + p50_h3 = h3["full"].get("median_p50_ms", float("inf")) + h1_p50s: list[float] = h1["full"].get("p50s_ms", [p50_h1]) + h3_p50s: list[float] = h3["full"].get("p50s_ms", [p50_h3]) + + # Median-based test (>=5% improvement) + median_gain = p50_h3 < p50_h1 * 0.95 + median_loss = p50_h1 < p50_h3 * 0.95 + + # Range-overlap test (non-overlapping = more reliable for DVFS-noisy NPU) + ranges_non_overlapping = max(h3_p50s) < min(h1_p50s) if h3_p50s and h1_p50s else None + results["npu001_ranges_non_overlapping"] = ranges_non_overlapping + + if median_gain and ranges_non_overlapping: + results["npu001_generalized"] = 
True + gain = (p50_h1 - p50_h3) / p50_h1 * 100 + print( + f" [npu-001] CONFIRMED: opset21={p50_h3:.1f}ms vs opset17={p50_h1:.1f}ms" + f" (+{gain:.1f}%, ranges non-overlapping)", + flush=True, + ) + elif median_gain and not ranges_non_overlapping: + results["npu001_generalized"] = "median_only" + gain = (p50_h1 - p50_h3) / p50_h1 * 100 + print( + f" [npu-001] MARGINAL: opset21 median {gain:.1f}% faster but ranges OVERLAP" + f" (h3 max={max(h3_p50s):.1f}ms > h1 min={min(h1_p50s):.1f}ms) -- DVFS noise", + flush=True, + ) + elif median_loss: + results["npu001_generalized"] = False + print( + f" [npu-001] NEGATIVE: opset17={p50_h1:.1f}ms < opset21={p50_h3:.1f}ms", + flush=True, + ) + else: + results["npu001_generalized"] = "neutral" + print( + f" [npu-001] NEUTRAL: opset17={p50_h1:.1f}ms ~ opset21={p50_h3:.1f}ms", + flush=True, + ) + else: + missing = [ + h for h, d in [("h1", h1), ("h3", h3)] if d.get("status") not in ("OK", "OK_HIGH_CV") + ] + results["npu001_generalized"] = f"N/A ({', '.join(missing)} not OK)" + results["npu001_ranges_non_overlapping"] = None + + # ── npu-006: detect catastrophic conv-fusion regression (h4/h5) ────────── + # "Catastrophic" = h4 or h5 median p50 >= 5x baseline (CPU fallback signature) + npu006_regression = False + for h_id in ("h4", "h5"): + h = hyps.get(h_id, {}) + if h.get("status") in ("OK", "OK_HIGH_CV") and baseline_p50: + p50_fused = h["full"].get("median_p50_ms") + if p50_fused and p50_fused >= baseline_p50 * 5.0: + npu006_regression = True + ratio = p50_fused / baseline_p50 + print( + f" [npu-006] CATASTROPHIC REGRESSION confirmed on {h_id}:" + f" {p50_fused:.1f}ms vs baseline {baseline_p50:.1f}ms ({ratio:.0f}x slower)" + f" -- FusedConv CPU fallback", + flush=True, + ) + elif h.get("status") == "BENCH_FAIL" and h.get("npu006_expected_regression"): + # Bench failure on expected-regression hypothesis is also a signal + print( + f" [npu-006] {h_id} bench FAILED on conv-dense model -- possible CPU fallback timeout", + flush=True, 
+ ) + results["npu006_regression"] = npu006_regression + + +def _save_results(results: dict, model_dir: Path) -> None: + """Write results.json.""" + out = model_dir / "results.json" + out.write_text(json.dumps(results, indent=2, ensure_ascii=False), encoding="utf-8") + print(f" Results: {out}", flush=True) + + +# ── summary writer ──────────────────────────────────────────────────────────── + + +def write_summary(all_results: list[dict]) -> None: + """Write SUMMARY.md to RESULTS_DIR.""" + lines: list[str] = [ + "# QNN NPU Optimization Sweep — Catalog Models", + "", + f"Generated: {datetime.now().isoformat(timespec='seconds')} ", + f"EP: `{EP}` / device: `{DEVICE}` ", + f"Bench protocol: Phase-A {SCREEN_ITERS} iters (high CV expected on QNN NPU — DVFS)," + f" Phase-B {FULL_ITERS}x{FULL_SESSIONS} sessions, 30s cool-down ", + "npu-001 criterion: median >=5% gain AND ranges non-overlapping ", + "npu-006 criterion: Conv% of ops; h4/h5 marked catastrophic if >=5x baseline ", + "", + "---", + "", + "## Per-Model Results", + "", + "| Model | Conv% | Baseline p50 | Best p50 | Best config | Gain% | npu-001? | npu-006 regression? 
| Notes |", + "|-------|-------|-------------|----------|-------------|-------|----------|---------------------|-------|", + ] + + for r in all_results: + model_id = r["model_id"] + conv_pct = r.get("conv_pct") + conv_str = f"{conv_pct:.0f}%" if conv_pct is not None else "N/A" + if r.get("npu006_risk"): + conv_str += " ⚠️" + baseline = f"{r['baseline_p50_ms']:.1f} ms" if r.get("baseline_p50_ms") else "N/A" + best = f"{r['best_p50_ms']:.1f} ms" if r.get("best_p50_ms") else "N/A" + best_h = r.get("best_hypothesis") or "N/A" + if best_h != "N/A": + h_data = r.get("hypotheses", {}).get(best_h, {}) + best_label = h_data.get("label", "") + else: + best_label = "" + gain = f"{r['best_gain_pct']:.1f}%" if r.get("best_gain_pct") is not None else "N/A" + npu001 = r.get("npu001_generalized") + non_overlap = r.get("npu001_ranges_non_overlapping") + if npu001 is True: + npu001_str = "CONFIRMED (ranges sep.)" if non_overlap else "YES (median)" + elif npu001 is False: + npu001_str = "NO" + elif npu001 == "median_only": + npu001_str = "MARGINAL (overlap)" + elif npu001 == "neutral": + npu001_str = "neutral" + else: + npu001_str = "N/A" + npu006 = ( + "YES ⚠️" if r.get("npu006_regression") else ("risk" if r.get("npu006_risk") else "no") + ) + errors = "; ".join(r.get("errors", []))[:80] or "none" + lines.append( + f"| `{model_id}` | {conv_str} | {baseline} | {best} | {best_h} ({best_label}) | {gain} | {npu001_str} | {npu006} | {errors} |" + ) + + # Per-model hypothesis breakdown + lines += [ + "", + "## Hypothesis Breakdown per Model", + "", + ] + for r in all_results: + lines.append(f"### {r['model_id']}") + lines.append("") + lines.append( + "| Hypothesis | Opset | Screen p50 | Full p50 (median) | CV | Status | Accuracy |" + ) + lines.append( + "|------------|-------|-----------|-------------------|-----|--------|---------|" + ) + for h_id, h_data in r.get("hypotheses", {}).items(): + lbl = h_data.get("label", "") + opset = h_data.get("opset", "?") + s_p50 = h_data.get("screen", 
{}).get("p50_ms") + s_p50_str = f"{s_p50:.1f}" if s_p50 else "—" + f_p50 = h_data.get("full", {}).get("median_p50_ms") + f_p50_str = f"{f_p50:.1f}" if f_p50 else "—" + cv = h_data.get("screen", {}).get("cv", "?") + cv_str = f"{cv:.3f}" if isinstance(cv, float) else str(cv) + status = h_data.get("status", "?") + stable = h_data.get("screen", {}).get("stable", True) + if not stable and status.startswith("OK"): + status += " ⚡DVFS" + acc = h_data.get("accuracy") + acc_str = f"{acc:.3f}" if acc is not None else "—" + lines.append( + f"| {h_id} ({lbl}) | {opset} | {s_p50_str} | {f_p50_str} | {cv_str} | {status} | {acc_str} |" + ) + lines.append("") + + # Cross-model patterns + lines += [ + "---", + "", + "## Cross-Model Patterns", + "", + "### npu-001: Does opset 21 bypass help broadly?", + "", + ] + + npu001_map = {r["model_id"]: r.get("npu001_generalized") for r in all_results} + yes_m = [m for m, v in npu001_map.items() if v is True] + no_m = [m for m, v in npu001_map.items() if v is False] + neut_m = [m for m, v in npu001_map.items() if v == "neutral"] + na_m = [m for m, v in npu001_map.items() if v not in (True, False, "neutral")] + + lines += [ + f"- **Helps ({len(yes_m)} models):** {', '.join(f'`{m}`' for m in yes_m) or 'none'}", + f"- **Hurts ({len(no_m)} models):** {', '.join(f'`{m}`' for m in no_m) or 'none'}", + f"- **Neutral ({len(neut_m)} models):** {', '.join(f'`{m}`' for m in neut_m) or 'none'}", + f"- **N/A ({len(na_m)} models):** {', '.join(f'`{m}`' for m in na_m) or 'none'}", + "", + ] + + total_tested = len(yes_m) + len(no_m) + len(neut_m) + if total_tested > 0: + if len(yes_m) > total_tested / 2: + lines.append( + f"> **Finding**: opset 21 bypass generalizes to {len(yes_m)}/{total_tested} tested models." + " Consider upgrading npu-001 scope from ConvNext-only to broader architectures." + ) + elif len(no_m) > total_tested / 2: + lines.append( + f"> **Finding**: opset 21 bypass does NOT broadly generalize ({len(no_m)}/{total_tested} hurt)." 
+ " npu-001 appears ConvNext-specific (residual connection topology dependency confirmed)." + ) + else: + lines.append( + f"> **Finding**: Mixed results ({len(yes_m)} help, {len(no_m)} hurt, {len(neut_m)} neutral)." + " Architecture-dependent. Confirm ORT `kMaxSupportedOpset` version before drawing conclusions." + ) + lines.append("") + + lines += [ + "### Feature Gaps", + "", + ] + all_gaps: list[str] = [] + for r in all_results: + for gap in r.get("feature_gaps", []): + all_gaps.append(f"- **`{r['model_id']}`**: {gap}") + lines += all_gaps if all_gaps else ["- No feature gaps observed"] + + lines += [ + "", + "### Build / Compatibility Issues", + "", + ] + for r in all_results: + errs = r.get("errors", []) + if errs: + lines.append(f"**`{r['model_id']}`**") + for e in errs: + lines.append(f" - {e}") + + lines += [ + "", + "---", + "", + "## Updated Recommendations for `ep_knowledge/qnn_npu.json`", + "", + "Based on this cross-architecture sweep:", + "", + ] + + # Auto-generate KB recommendations + if total_tested > 0: + if len(yes_m) >= 2: + lines += [ + "- **npu-001**: Broaden scope beyond ConvNext. Architectures that benefit: " + f"{', '.join(yes_m)}. Update `scope` field and set `gate1_statistical` confidence accordingly.", + "- **search_space_rules.opset.recommended_order**: Retain `[21, 17]` as default order.", + ] + if len(no_m) >= 2: + lines += [ + "- **npu-001**: Keep 'architecture-specific' caveat. Architectures where opset 21 hurts: " + f"{', '.join(no_m)}. 
Add to `do_not_generalize_to` list.", + "- **search_space_rules**: Add architecture check before applying opset 21 preference.", + ] + + # Conv fusions analysis + lines += [ + "", + "### Conv Fusion Findings (h4 vs h1, h5 vs h3)", + "", + ] + for r in all_results: + h1_p50 = r.get("hypotheses", {}).get("h1", {}).get("full", {}).get("median_p50_ms") + h4_p50 = r.get("hypotheses", {}).get("h4", {}).get("full", {}).get("median_p50_ms") + h3_p50 = r.get("hypotheses", {}).get("h3", {}).get("full", {}).get("median_p50_ms") + h5_p50 = r.get("hypotheses", {}).get("h5", {}).get("full", {}).get("median_p50_ms") + parts = [] + if h1_p50 and h4_p50: + delta = (h1_p50 - h4_p50) / h1_p50 * 100 + parts.append(f"conv-fusions on opset17: {delta:+.1f}% ({h1_p50:.1f}→{h4_p50:.1f}ms)") + if h3_p50 and h5_p50: + delta = (h3_p50 - h5_p50) / h3_p50 * 100 + parts.append(f"conv-fusions on opset21: {delta:+.1f}% ({h3_p50:.1f}→{h5_p50:.1f}ms)") + if parts: + lines.append(f"- **`{r['model_id']}`**: {'; '.join(parts)}") + + summary_path = RESULTS_DIR / "SUMMARY.md" + summary_path.write_text("\n".join(lines) + "\n", encoding="utf-8") + print(f"\n📄 Summary: {summary_path}", flush=True) + + +# ── entry point ─────────────────────────────────────────────────────────────── + + +def main() -> None: + parser = argparse.ArgumentParser( + description="QNN NPU optimization hypothesis sweep for winml catalog models" + ) + parser.add_argument( + "--model", default=None, help="Single HF model ID to sweep (default: all catalog models)" + ) + parser.add_argument( + "--task", default=None, help="Task override (required when --model is given)" + ) + parser.add_argument( + "--model-type", default="auto", help="Model type hint (e.g. resnet, vit). 
Default: auto" + ) + parser.add_argument( + "--skip-eval", + action="store_true", + help="Skip winml eval accuracy step even for image models", + ) + parser.add_argument( + "--only-hypotheses", + default=None, + help=( + "Comma-separated list of hypothesis IDs to run, e.g. h6,h7,h8. " + "Skips all others. Use with --reuse-h0-config to avoid regenerating base config." + ), + ) + parser.add_argument( + "--reuse-h0-config", + action="store_true", + help=( + "Reuse the base config from an existing h0/build_config.json instead of " + "running winml config again. Requires a previous full sweep to have run." + ), + ) + args = parser.parse_args() + + # Parse hypothesis filter + only_hyp_ids: set[str] | None = None + if args.only_hypotheses: + only_hyp_ids = {h.strip() for h in args.only_hypotheses.split(",")} + print(f" Running only: {sorted(only_hyp_ids)}", flush=True) + + RESULTS_DIR.mkdir(parents=True, exist_ok=True) + + # Confirm QNN EP is present + print("=== Confirming QNN EP ===", flush=True) + rc, out, _ = run_cmd([WINML, "sys", "--list-ep"], label="winml sys --list-ep", timeout=30) + if "qnn" not in out.lower(): + print("❌ QNN EP not detected! 
Aborting.", flush=True) + sys.exit(1) + print("✓ QNN EP available\n", flush=True) + + # Determine model list + if args.model: + if not args.task: + print("Error: --task is required when --model is specified", flush=True) + sys.exit(1) + models_to_run: list[tuple[str, str, str, bool]] = [ + (args.model, args.task, args.model_type, not args.skip_eval) + ] + else: + models_to_run = ALL_MODELS # type: ignore[assignment] + + all_results: list[dict] = [] + + for model_id, task, model_type, do_eval in models_to_run: + if args.skip_eval: + do_eval = False + try: + result = sweep_model( + model_id, + task, + model_type, + do_eval, + only_hyp_ids=only_hyp_ids, + reuse_h0_config=args.reuse_h0_config, + ) + except Exception as exc: + print(f"\n❌ Unexpected error for {model_id}: {exc}", flush=True) + result = { + "model_id": model_id, + "task": task, + "model_type": model_type, + "errors": [f"Unexpected exception: {exc}"], + "hypotheses": {}, + "feature_gaps": [], + } + all_results.append(result) + + # Save rolling summary after each model + write_summary(all_results) + + print("\n" + "=" * 64, flush=True) + print(" SWEEP COMPLETE", flush=True) + print("=" * 64, flush=True) + write_summary(all_results) + + +if __name__ == "__main__": + main() diff --git a/research/autoconfig/docs/agent-design.md b/research/autoconfig/docs/agent-design.md new file mode 100644 index 000000000..688dad029 --- /dev/null +++ b/research/autoconfig/docs/agent-design.md @@ -0,0 +1,254 @@ +# WinML CLI Agent Design + +> Status: Draft — 2026-06-17 (updated: autoconfig loop V3 changes incorporated) +> Context: Strategic design for the agent layer of winml-cli + +--- + +## 1. Context: Why Agent Matters for winml-cli + +### 1.1 winml-cli vs Olive — The Real Distinction + +Microsoft Olive already exists as a pass-based optimization framework supporting QNN, DML, and other Windows EPs. The temptation is to dismiss winml-cli's agent as redundant with Olive. 
That would be wrong — the distinction is fundamental: + +| Dimension | Olive | winml-cli | +| --- | --- | --- | +| Target user | ML engineer who understands ORT internals | WinApp developer who wants their model to work on Windows | +| Workflow | Compose passes manually, specify EP upfront | `config` + `build` — two commands, full pipeline | +| Hardware selection | Manual EP specification | `--device auto` — detects hardware, selects EP | +| Explainability | Silent pipeline output | Designed for transparency | +| Windows-first | Cross-platform, Windows supported | Built exclusively for Windows hardware diversity | +| Operator diagnostics | Not available | `winml analyze` — operator linting, EP compatibility | +| Agent-ready | Not designed for it | First-class design goal | + +**Analogy:** Olive is webpack (powerful, expert-configured); winml-cli is Vite (opinionated, works for most cases out of the box). + +### 1.2 The Core Gap Agent Should Fill + +WinApp developers lack access to a senior ML engineer who: + +- Knows why a model fails on QNN NPU for this specific operator pattern +- Can read an error message and immediately know the root cause +- Understands which optimization knob to turn for which problem +- Knows how a config that works on Snapdragon X Elite will behave on Intel Meteor Lake + +**The agent's job is to be that person.** + +--- + +## 2. Agent Design Philosophy + +### 2.1 The Improved Loop (autoconfig V3) vs The Agent Layer + +The autoconfig search loop has been significantly improved since the initial draft. 
As of v3 (`59e7329d`): + +**What the improved loop does well:** +- Statistical significance via `ThroughputOnly` verdict policy: `improvement > max(1% floor, 2× screen_CV)` — noise-level deltas no longer pass as KEEP +- Screen early exit: if screen improvement < 1%, skip 3× full bench — saves 25–90 min per rejected hypothesis +- Crash-resume via `session.json`: atomic state persistence, restartable without re-running completed experiments +- KB-guided search: `ep_knowledge/*.json` confirmed rules prune the search space before any experiment runs +- DVFS-aware bench protocol: npu-007 CV gate disabled on QNN NPU; 3× 500-iter sessions with cool-down +- npu-006 guard: Conv% > 20% → hard-block conv fusions before they cause 4900% regression + +**What still requires the agent layer:** + +The loop is a *computation engine*, not an *intelligence layer*. It needs an agent because: + +1. **No architecture-aware hypothesis generation** — hypotheses are hardcoded per EP, not generated from model analysis. An attention-heavy model gets the same hypotheses as a Conv-heavy one. +2. **No failure explanation** — DISCARD is logged but not explained. Developers can't learn from results without reading raw JSON. +3. **No cross-device reasoning** — a config found on Snapdragon X Elite has unknown behavior on Intel Meteor Lake. The loop can't tell you that. +4. **No adaptive stopping** — 30-DISCARD plateau is a static heuristic. An agent would recognize when all architectural levers for this model/EP pair have been exhausted. +5. **No KB self-update** — KB is manually maintained. An agent with memory extraction (cf. AgenticGPUOptimizer `memory_extractor.py`) would auto-update `ep_knowledge/*.json` after each run. + +The revised framing: **autoconfig is a sub-tool that the agent invokes and explains, not a headless replacement for the agent**. 
+ +### 2.2 The Wrong Design (Original Autoconfig) + +The *original* autoconfig ran a **headless search loop** with no statistical significance, no crash-resume, and no KB-guided pruning: +Explorer → Optimizer → Reviewer → repeat + +**Problems that were present (now fixed in V3):** + +- No statistical significance — 1% hardcoded floor meant noise-level deltas passed as KEEP +- No screen early exit — every hypothesis ran 3× full bench regardless of screen result +- No crash-resume — an interrupted run lost all state +- All optim keys in kebab-case → `build_config()` silently used snake_case lookups → every hypothesis ran as baseline (critical bug, fixed) + +**Remaining problems (require agent layer to fix):** + +- A Python script can do benchmark loops faster, cheaper, and more reliably than an LLM agent — the loop is good, the LLM overhead is not worth it +- Results (config files) are not auditable — developer cannot verify why a config was chosen +- No explainability — developer doesn't understand what was decided or why +- Treats developer as absent; no collaborative interaction +- The "agentic" overhead (LLM inference cost per loop iteration) adds nondeterminism without intelligence + +Autoconfig search is useful as a **sub-tool**, not as the primary value proposition of the agent layer. + +### 2.3 The Right Design: Diagnosis + Guidance over Search + +Agent excels at **judgment, diagnosis, and explanation** — not computation. The redesign centers on: + +> **When a developer encounters a problem, the agent gives explanation + executable next step — not a config file.** + +#### Design Principles + +1. **Explain, don't just output** + Instead of silently picking an EP, say: *"I picked QNN EP because your device has a Qualcomm NPU. Operator coverage is 97% — the remaining 3% fall back to CPU, which is acceptable for these specific ops."* +2. **Fix, don't just diagnose** + When an incompatible operator is found, apply the graph transformation — don't just flag it. +3. 
**Developer talks, agent acts** + The agent is interactive and conversational. Developer says "this model is slow on GPU" → agent asks clarifying questions, runs targeted experiments, explains findings. +4. **Progressive trust** + Show confidence levels. Be explicit about uncertainty. Let the developer see what the agent is doing. Never give false precision (e.g., "Config A is 3% faster" when standard deviation is 5%). +5. **Windows device diversity as first-class concern** + Always reason about what happens on devices the developer doesn't have — not just the machine the agent runs on. + +--- + +## 3. Agent Types + +### 3.1 Diagnostic Agent *(highest priority)* + +**Trigger:** Model fails to load, crashes at inference, throws EP compatibility error +**Developer question:** "My model fails on QNN NPU — why? What do I do?" + +**Agent responsibilities:** + +- Parse error message → identify root cause (unsupported op, shape mismatch, driver version, etc.) +- Analyze model graph → enumerate incompatible operators per EP +- Propose and apply concrete fix (graph transformation, operator substitution, fallback EP) +- Verify fix with `winml eval` accuracy check + +**Why this is Olive-incompatible:** Olive doesn't converse, doesn't diagnose, doesn't explain. It fails silently or produces a broken model. + +**Example interaction:** + +```javascript +Developer: winml build failed. Error: "QNNExecutionProvider: Unsupported op at node /conv/Conv_3" +Agent: Found it. Conv_3 has dynamic padding — QNN NPU requires static shapes. + I'll apply DynamicToFixedShape transform and re-run the compile. + [applies fix] → Build succeeded. NPU latency: 12.3ms. Accuracy delta: 0.01%. +``` + +--- + +### 3.2 Decision Guidance Agent + +**Trigger:** Developer is at a decision point in the pipeline (which EP? which precision? to quantize or not?) +**Developer question:** "I don't know what options to pick. What's the tradeoff?" 
+ +**Agent responsibilities:** + +- Run quick comparative benchmarks (not exhaustive search) +- Present tradeoffs with numbers: latency gain vs accuracy delta vs model size +- Make a recommendation with reasoning, not just a number +- Let developer override with understanding of consequences + +**Key difference from autoconfig:** This is interactive and decision-oriented, not headless. The developer is in the loop. + +--- + +### 3.3 Cross-Device Confidence Agent *(winml-cli unique)* + +**Trigger:** Developer has a working config, asks "will this work on my users' devices?" +**Developer question:** "My app ships on many Windows hardware configs. Will this be okay?" + +**Agent responsibilities:** + +- Given a config optimized for Device A, reason about behavior on Device B, C... +- Identify configs that are device-specific (compiled QNN binaries only work on Qualcomm) +- Generate multi-device config with automatic EP fallback chain (QNN → DML → CPU) +- Surface warnings: "This config will fail on Intel Meteor Lake — here's the fallback" + +**Why this matters:** WinApp developers ship to millions of devices. No other tool addresses Windows hardware diversity in the deployment sense. + +--- + +### 3.4 Regression Detection Agent *(CI/CD scenario)* + +**Trigger:** ORT version bump, driver update, or scheduled CI run +**Developer question:** "Something changed — my model got slower / broke" + +**Agent responsibilities:** + +- Compare before/after perf numbers with statistical validity (not point estimates) +- Correlate change with known ORT/EP changelog entries +- Identify which layer / operator regressed using profiler output +- Propose workaround or file structured bug report + +--- + +## 4. Role of Autoconfig (Perf Search) in This Design + +Autoconfig (opset × EP × opt\_level search) is **not abandoned** — it becomes a sub-tool invoked by the agents above when appropriate. 
+ +**When it's invoked:** + +- Diagnostic Agent: "Let me try a few configs to see if this performance issue is config-related" +- Decision Guidance Agent: "Let me run a quick sweep across opt\_level to show you the tradeoff" + +**What changes:** + +- It's not the primary entry point +- Its output is explained, not just surfaced as a config file +- It runs fewer, targeted experiments (not exhaustive grid search) +- Accuracy validation (`winml eval`) is mandatory before any recommendation + +--- + +### 3.5 Model Recommendation Agent *(developer onboarding scenario)* + +**Trigger:** Developer describes a problem they want to solve but doesn't know which model to use +**Developer question:** "I want to add real-time object detection to my app. What model should I use?" + +**Agent responsibilities:** + +- Understand the developer's task from natural language (classification, detection, segmentation, NLP, etc.) +- Query `winml catalog` to enumerate available built-in models for that task +- Factor in the developer's current hardware (`winml sys`) — which EPs are available, memory constraints, NPU presence +- Recommend the best-fit model with reasoning: why this model, expected latency on their device, accuracy tier +- Output a ready-to-run command so the developer can immediately try it + +**Why this is valuable:** Developers often don't know which model architecture fits their use case, let alone which one performs well on their specific Windows hardware. The catalog has verified, pre-tested models — the agent bridges "I have a problem" to "here is a model you can ship today." + +**Example interaction:** + +```javascript +Developer: I want to classify product images in my e-commerce app. Should be fast on low-end laptops. 
+Agent: For image classification on low-end hardware without an NPU, I recommend: + + → microsoft/resnet-50 (catalog built-in) + - Task: image-classification + - Latency on CPU: ~18ms p50 (tested on Intel Core i5) + - Size: 98MB FP32, 25MB w8a16 + - Best EP on your machine: CPUExecutionProvider + + Runner-up: google/vit-base-patch16-224 (better accuracy, ~2x slower on CPU) + + To build and benchmark: + winml build -c $(winml config -m microsoft/resnet-50 --device auto) -o resnet_out/ + winml perf -m resnet_out/model.onnx --device auto --iterations 100 +``` + +**What makes this different from a search engine:** The recommendation is hardware-aware — the same question asked on a machine with a Qualcomm NPU would surface a different model (or a different EP for the same model) with different expected numbers. It's not a static lookup, it's a contextual match. + +--- + +## 5. Key Concerns to Track + +| Concern | Mitigation | +| --- | --- | +| Device heterogeneity: config found on Dev's machine may not generalize | Cross-Device Confidence Agent explicitly addresses this; output includes device scope | +| Trust/auditability: developer can't verify agent recommendation | All recommendations include reasoning + confidence + "how I tested this" | +| Olive overlap at implementation layer | winml-cli uses ORT under the hood like Olive; the differentiation is UX + Windows-first + explainability, not reimplementing optimization passes | +| Accuracy validation | `winml eval` is mandatory in every agent loop that modifies the model | +| Agent hallucinating perf numbers | All perf claims require iteration ≥ 1000 and report p50/p90/p99 with std dev | + +--- + +## 6. Open Questions + +1. **Scope**: Should the agent be a CLI mode (`winml agent`) or embedded into existing commands (`winml build --agent`)? +2. **Olive relationship**: Should winml-cli contribute opset search back to Olive, or maintain it independently? Needs alignment with Olive team. +3. 
**Offline / no-LLM mode**: Should the agent work without LLM (rule-based fallback) for air-gapped CI environments? +4. **Multi-device testing**: Cross-Device Confidence Agent requires access to multiple devices or a device simulation layer — how to implement? diff --git a/research/autoconfig/docs/ep-findings-summary.html b/research/autoconfig/docs/ep-findings-summary.html new file mode 100644 index 000000000..31101d51a --- /dev/null +++ b/research/autoconfig/docs/ep-findings-summary.html @@ -0,0 +1,795 @@ + + + + +WinML EP Findings — Validated Catalog + + + + +

WinML EP Findings — Validated Catalog

+

+ Hardware: Snapdragon X Elite CRD  |  ORT: 1.24.5 (onnxruntime-windowsml)  |  + QNN SDK: Hexagon HTP (NPU) + Adreno X1-85 (GPU/DML)  |  + Last updated: 2026-06-17  |  14 models tested (QNN NPU), 1 model (CPU/DML/GPU) +

+ +
+
17
total findings
+
6
visible (multi-model)
+
11
hidden (single-model)
+
14
models tested (NPU)
+
7
feature requests
+
+ +
+ Scope warning: All findings are from 1 hardware device (Snapdragon X Elite CRD). + CPU/DML/GPU findings are from 1 model only (facebook/convnext-tiny-224). + QNN NPU findings cover 14 models. Always re-validate on new model architectures before using to prune search space. + Confidence levels reflect mechanism certainty, not universal applicability. +
+ +
+ + Showing only multi-model / universal findings by default. Single-model findings (1 model tested) are hidden. +
+ + +
+
+ QNN NPU  —  Hexagon HTP (Snapdragon X Elite) +  14 models tested, 3×500-iter sessions, 30s cool-down +
+
+ + +
+
npu-006
+
HIGH
confirmed
+
+
Conv fusions cause catastrophic CPU fallback on Conv-dominant models
+
+ ResNet-18 with conv-bn-fusion + conv-add-fusion + conv-activation-fusion: + 3-session p50 = [132.3, 135.0, 130.7]ms (CV=0.016) vs baseline ~1-4ms. + ~130x regression, near-zero variance = deterministic CPU fallback. + DINOv2-base (Conv%<1%): fusion is neutral or slightly beneficial (-25%). + ORT FusedConv op produced by these passes is not dispatchable by QNN EP → falls back to CPU for every Conv node. +
Scope: Conv-dominant models (ResNet, EfficientNet, MobileNet). Not applicable to Transformer, NLP.
+
+
+
+
Autoconfig action
+
+ Hard-block conv-bn/add/act fusions for QNN NPU when Conv% > 20%. + Gate is enforced in catalog_qnn_sweep.py via count_conv_pct(). + Feature request: winml analyze should detect FusedConv ops pre-build. +
+
+
+ + +
+
npu-007
+
HIGH
confirmed
+
+
DVFS thermal noise makes CV-based stability gating unreliable on QNN NPU
+
+ Across all 8 catalog models, within-session CV ranges 0.1–2.0+ even on warm device. + CV gate (e.g., <15%) blocks most valid candidates — the noise is DVFS, not model instability. + Reliable signal requires: 3+ independent sessions × 500+ iters with 30s cool-down between sessions. + Use median p50 across sessions. Differences < 10% are within noise floor. +
Scope: General — all models on QNN NPU / Snapdragon X Elite HTP.
+
+
+
+
Autoconfig action
+
+ CV gate DISABLED for QNN NPU in bench_utils.py. + SCREEN_CV_MAX_NPU = 999.0. + Always run 3×500 Phase B regardless of screen CV. + Feature request: winml perf --sessions 3 --cool-down 30s (#155). +
+
+
+ + +
+
npu-001
+
MEDIUM
empirical
+
+
opset 21 export gives +24–31% speedup on DINOv2 family — mechanism UNKNOWN
+
+ DINOv2-small: opset17 7.2ms → opset21 5.0ms (+30.6%) (3-session, ranges separated). + DINOv2-base: opset17 34.6ms → opset21 26.2ms (+24.1%) (fresh builds, clean protocol). + MobileViT-small: ~20–26% (DVFS spike in one session — partially reliable). + Critical controls: dino-vitb16 -0.7% NEUTRAL; gender-classification ViT +3.5% NEUTRAL (SAME op counts as DINOv2-small). Opset21 speedup is DINOv2-architecture-specific, NOT a general ViT property. + Original kMaxSupportedOpset bypass mechanism INVALIDATED (ORT 1.24.5 has kMaxSupportedOpset≥22). + Mechanism unknown. Transpose count identical (49 nodes both opsets). +48 Reshape nodes in opset21 may be relevant. +
Scope: DINOv2-family (facebook/dinov2-*). MobileViT likely. NOT plain ViT, NOT NLP (12 models tested).
+
+
+
+
Autoconfig action
+
+ For DINOv2-family: try opset 21 first in priority queue. + For plain ViT / NLP: skip (confirmed neutral). + Do NOT apply blindly without architecture check. + Tracked: #869 (closed). +
+
+
+ + +
+ 3 single-model findings hidden (npu-002/003/004 — ConvNext only) + Click “Show single-model findings” above to expand +
+
+
npu-002
+
MEDIUM
1 model
+
+
W8A16 quantization gives ~1.9x speedup over FP32 on QNN NPU
+
+ ConvNext FP32: 19.4ms → W8A16: 10.3ms (1.9x speedup). 1 model only. + Mechanism confirmed: QNN HTP has native INT8 weight / FP16 activation datapath; W8A16 maps to weight-compressed matmul kernels. + Magnitude (1.9x) is ConvNext-specific. All catalog sweep models ran W8A16, but no FP32 baselines exist for them. 
Scope: ConvNext only for magnitude. Mechanism generalizes; magnitude does not.
+
+
+
+
Autoconfig action
+
+ Always quantize for QNN NPU. + W8A16 is the starting point. + Validate accuracy after. +
+
+
+ + +
+
npu-003
+
MEDIUM
1 model
+
+
winml compile (EPContext) adds ~1.7x speedup on top of W8A16
+
+ ConvNext W8A16: 10.3ms → compiled EPContext: 6.0ms (1.7x). 1 model only. + Mechanism confirmed: EPContext pre-builds QNN binary graph, eliminates JIT graph partitioning at session creation. +
Scope: ConvNext only for magnitude. Mechanism generalizes to all models on QNN NPU.
+
+
+
+
Autoconfig action
+
+ Always run winml compile after finding best quantized config for QNN NPU. +
+
+
+ + +
+
npu-004
+
LOW
anecdote
+
+
W8A8 may cause accuracy collapse on models with LN+GELU (UNVALIDATED)
+
+ Experiment aborted early — no accuracy numbers preserved. Recalled anecdote only. + Do NOT use this to skip W8A8 without running eval first. + If W8A8 top-1 drops >15 points vs W8A16, then skip. +
Scope: UNVALIDATED. Anecdote from ConvNext-tiny-224 only.
+
+
+
+
Autoconfig action
+
+ Treat as anecdotal. + Run W8A8 eval before deciding. + Only skip after confirmed accuracy gate failure. +
+
+
+ +
+
+ + +
+
+ CPU EP  —  Oryon CPU (Snapdragon X Elite) +  1 model only (facebook/convnext-tiny-224), 3×1000-iter sessions +
+
+ + +
+ 3 single-model findings hidden (cpu-001/002/005 — ConvNext only) + Click “Show single-model findings” above to expand +
+
+
cpu-001
+
HIGH
empirical
+
+
opset 19+ causes 3–4x slowdown on CPU EP — mechanism uncertain
+
+ opset 17: 43.7ms. opset 19: 160ms (3.7x). opset 21: 170ms (3.9x). + Non-monotonic: opset 22 partially recovers to 85ms (1.9x slower than 17). + Hypothesis: Transpose Optimizer bypass via kMaxSupportedOpset gate — but non-monotonic recovery at opset 22 is inconsistent with a simple gate. + Mechanism uncertain; empirical data is solid across opsets. +
Scope: ConvNext on Oryon CPU, ORT 1.24.x. Models with few Transpose nodes (BERT) likely unaffected.
+
+
+
+
Autoconfig action
+
+ Default to opset 17 for CPU EP. + Do NOT try opset 19+. + Practical rule is solid regardless of mechanism. +
+
+
+ + +
+
cpu-002
+
HIGH
confirmed
+
+
matmul_add_fusion is a confirmed regression on ConvNext CPU (+87%)
+
+ matmul_add_fusion: p50=81.7ms vs baseline 43.7ms (+87%). All 3 runs far above baseline. + ORT L2 already converts MatMul+Add → Gemm at baseline (37 Gemm nodes). + Applying matmul_add_fusion on top conflicts with existing Gemm nodes. +
Scope: Models where ORT L2 baseline already has Gemm. Check model.onnx before applying.
+
+
+
+
Autoconfig action
+
+ Skip matmul_add_fusion for CPU EP when model.onnx already contains Gemm nodes. + Check baseline model for Gemm count before adding to search space. +
+
+
+ + +
+
cpu-005
+
HIGH
confirmed
+
+
Baseline (no extra flags) is optimal for ConvNext CPU — optimization pass sweep is wasted
+
+ 22-experiment ablation: no flag improved p50 beyond noise. Baseline at 43.7ms is the floor. + ORT L2 already applies gelu_fusion and MatMul→Gemm. + Compute bottleneck (Gemm=57.7%) not addressable via graph passes. +
Scope: ConvNext-class vision models on CPU. BERT/Transformer models may benefit from attention_fusion.
+
+
+
+
Autoconfig action
+
+ For CPU EP + ConvNext-class: skip optimization pass sweep. + Go directly to quantization experiments (W8A8 first). +
+
+
+ + +
+
cpu-006
+
HIGH
empirical
+
+
CPU EP and QNN NPU respond OPPOSITELY to opset changes — EP isolation is mandatory
+
+ CPU opset 17 vs 21: 3.9x SLOWER at opset 21. + QNN NPU opset 17 vs 21 (DINOv2): 24% FASTER at opset 21. + (Note: different models used — directional comparison only.) + Same opset change, opposite effect. CPU and QNN NPU have independent optimizer paths. +
Scope: Meta-rule about EP isolation. Applies to all models.
+
+
+
+
Autoconfig action
+
+ NEVER transfer opset findings across EPs. + Always validate per EP independently. + CPU search space rules are separate from NPU rules. +
+
+
+ +
+
+ + +
+
+ DML EP  —  Adreno X1-85 via Direct3D 12 +  1 model only (facebook/convnext-tiny-224) +
+
+ + +
+ 3 single-model findings hidden (dml-001/002/003 — ConvNext only) + Click “Show single-model findings” above to expand +
+
+
dml-001
+
MEDIUM
stability
+
+
DML is more stable than QNN GPU — p50 difference is within noise
+
+ DML FP32: p50=16.9ms, std=0.52. QNN GPU FP32: p50=17.7ms, std=0.97. + p50 diff = 0.8ms = 0.82σ of QNN GPU — distributions OVERLAP. Not a separable p50 advantage. + DML IS meaningfully more stable: std 0.52 vs 0.97, CV 3% vs 5.5%. +
Scope: Adreno X1-85, ConvNext. 3-run comparison (insufficient for definitive p50 ranking).
+
+
+
+
Autoconfig action
+
+ Prefer DML over QNN GPU for lower tail latency (p90) and variance. + Do NOT claim DML is faster based on p50 alone. +
+
+
+ + +
+
dml-002
+
MEDIUM
1 run
+
+
NHWC transformer increases latency variance on DML — p50 neutral, p90 +19%
+
+ DML NHWC: p50=16.5ms (-0.4ms, marginal), p90=21.0ms (+19%), std=1.89 (3.6x worse). + Baseline: p50=16.9ms, p90=17.7ms, std=0.52. + D3D12 handles tensor layouts internally via HLSL. Adding ORT NHWC Transposes adds dispatch overhead → scheduling jitter. +
Scope: Adreno X1-85 + DML, ConvNext. NVIDIA/Intel may differ (NHWC can help with cuDNN).
+
+
+
+
Autoconfig action
+
+ Do NOT apply nhwc-transformer for DML EP. + Tail latency stability matters for apps; p90 +19% is unacceptable. +
+
+
+ + +
+
dml-003
+
LOW
CLI gap
+
+
DML FP16 gives ~1.4x speedup with clean unimodal distribution — BLOCKED (#867)
+
+ DML FP16 (Python hack only): p50=11.8ms, p90=12.8ms, std=0.66 (clean unimodal). + vs FP32 baseline 16.9ms. 1.4x speedup. + DML HLSL shaders lock in FP16 paths at load time — no DVFS bimodal issues. + Cannot be reproduced with winml CLI today. Blocked on #867 (--precision fp16). +
Scope: Adreno X1-85 + DML. 1 experiment, Python workaround only.
+
+
+
+
Autoconfig action
+
+ Marked SKIPPED in search space until #867 ships. + FP16 is the primary DML optimization lever once available. +
+
+
+ +
+
+ + +
+
+ QNN GPU EP  —  Adreno X1-85 via QNN SDK +  1 model only (facebook/convnext-tiny-224) +
+
+ + +
+ 4 single-model findings hidden (gpu-001/002/003/005 — ConvNext only) + Click “Show single-model findings” above to expand +
+
+
gpu-001
+
HIGH
confirmed
+
+
FP32 baseline is already optimal for ConvNext on QNN GPU — no optimization pass helps
+
+ 11-pass sweep on ConvNext QNN GPU: all returned 0% node reduction or worse latency. + 251/0/0/0 analyze output — all ops native, zero CPU fallback. + ConvNext uses Reshape→MatMul→Reshape pattern; MatMulAdd→Conv2D rewrites don’t match. +
Scope: ConvNext-class models (Reshape→MatMul→Reshape pattern). Transformer models may benefit.
+
+
+
+
Autoconfig action
+
+ Skip all graph optimization experiments for QNN GPU on ConvNext-class. + Use FP32 baseline directly. + FP16 is the only remaining lever (#867). +
+
+
+ + +
+
gpu-002
+
MEDIUM
consistent
+
+
NHWC transformer hurts QNN GPU on Adreno — ~10% worse p50, +21% p90
+
+ NHWC: p50=19.5ms (+10%), p90=23.8ms (+21%), std=3.43 (3.5x worse) vs baseline p50=17.7ms. + QNN GPU EP handles layout internally; forcing NHWC creates Reshape overhead without alignment benefit. +
Scope: Adreno X1-85 + QNN GPU. Non-Adreno GPUs may differ.
+
+
+
+
Autoconfig action
+
+ Do NOT apply nhwc-transformer for QNN GPU EP. +
+
+
+ + +
+
gpu-004
+
HIGH
confirmed
+
+
W8A8 QDQ hangs indefinitely on QNN GPU EP
+
+ Passing W8A8 QDQ-annotated ONNX to QNN GPU EP → infinite hang. + QNN SDK GPU EP does not support QDQ-quantized graphs. + winml build already protects via _patch_device() (sets quant=null for GPU). + Fast-fail enhancement filed: #868. +
Scope: QNN GPU EP (QNN SDK limitation). Not a winml bug.
+
+
+
+
Autoconfig action
+
+ Skip ALL quantization experiments for QNN GPU EP. + winml build protects automatically. + Tracked: #868. +
+
+
+ + +
+
gpu-005
+
HIGH
confirmed
+
+
gelu_fusion improves latency STABILITY on QNN GPU — not p50
+
+ Raw (unfused GELU, 287 nodes): p50=17.4ms, p90=29.2ms, std=5.90. + Autoconf (fused GELU, 251 nodes): p50=17.7ms, p90=19.7ms (-48%), std=0.97 (6x lower). + 5 separate GPU kernel dispatches for unfused GELU create scheduling jitter. + Single Gelu kernel eliminates dispatch overhead → dramatically lower tail latency. 
Scope: Any model with GELU activations on QNN GPU.
+
+
+
+
Autoconfig action
+
+ Always apply gelu_fusion for QNN GPU (stability, not p50 benefit). + Do not expect p50 improvement. +
+
+
+ + +
+
gpu-003
+
LOW
1 run
+
+
winml compile regresses QNN GPU by ~34% — single experiment, low confidence
+
+ FP32 + compile: p50=23.7ms vs baseline 17.7ms (+34%). Single experiment only. + EPContext compile designed for NPU (HTP). On GPU EP it may bypass optimized shader path. + 34% gap is above DVFS noise level (CV~0.05 → noise ~1ms); direction probably real. +
Scope: QNN GPU EP only. Does not apply to QNN NPU EP (compile always helps there).
+
+
+
+
Autoconfig action
+
+ Avoid winml compile for QNN GPU EP. + Re-validate if winml compile behavior changes. +
+
+
+ +
+
+ + + +
+
+ Feature Requests & CLI Gaps + — required to complete the autoconfig skill +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
FeatureIssuePriorityMotivationEP FindingStatus
winml perf multi-session bench protocol
+ --sessions N --cool-down S
#155P0npu-007: reliable QNN NPU measurement requires 3 independent sessions with 30s cool-down. Single-session p50 is meaningless due to DVFS. catalog_qnn_sweep.py works around via subprocess loops but CLI support is needed for production autoconfig.npu-007OPEN
winml analyze: detect FusedConv ops pre-build
+ Warn when Conv% > threshold before fusions applied
not filedP0npu-006: conv fusions on QNN NPU cause 130x regression by creating FusedConv ops that QNN EP can't dispatch. autoconfig already guards via Conv% counter, but a CLI-level pre-build lint would catch it without any Python code. Should be part of winml analyze --ep qnn.npu-006NEEDS ISSUE
winml build --precision fp16
+ --enable-fp16-conversion
#867P1dml-003: DML FP16 gives ~1.4x speedup with clean distribution. Currently only achievable via Python workaround. Same for QNN GPU (FP16 is the only remaining lever after all graph passes exhausted). Unblocks DML search space.dml-003, gpu-001OPEN
winml perf: report p90/p99 + std per session
+ Plus per-session breakdown for multi-session runs
#155P1dml-002/gpu-002: NHWC transformer hurts p90 (+19-21%) while p50 is neutral or marginally better. Without p90 in perf output, autoconfig cannot detect stability regressions. p50-only verdict policy misses these cases.dml-002, gpu-002OPEN
winml perf --profile
+ Per-op kernel time breakdown
#158P1Phase 1 (Insight) in autoconfig requires knowing which op type dominates latency (Gemm% vs Conv% vs Attention%) to prioritize hypotheses. Without this, all hypotheses run blindly. POC bridges via analyze_graph.py (static), but dynamic profiling is needed for accurate attribution.all EPsOPEN
winml build --json (agent-friendly structured output)
+ Per-step status, timing, output paths, exit code
#443P2autoconfig parses winml build stdout to detect failures — fragile string parsing. A --json flag should emit a structured JSON to stdout (or a sidecar file) with per-step status (success/fail/skip), elapsed time, output artifact paths, and a top-level exit code. Would let autoconfig detect partial failures precisely and resume from the failed step.all EPsOPEN
winml eval --mode compare — support local PyTorch model as reference
+ Currently requires --model-id (HF hub only)
not filedP2autoconfig correctness contract compares optimized ONNX vs a reference model. Currently winml eval --mode compare requires a HuggingFace model ID as the golden reference — local .pt / .pth files are not supported. For models not on HF Hub, or custom fine-tunes, there is no way to establish a cosine-similarity correctness gate.all EPsNEEDS ISSUE
+
+ + +
+ How to read confidence levels: + HIGH confirmed = mechanism understood + data from ≥3 independent sessions with non-overlapping ranges. +   + MEDIUM empirical = data is reliable but mechanism unconfirmed or from 1 model only. +   + LOW = single experiment, anecdote, or CLI gap blocking proper validation. +
+ All findings from Snapdragon X Elite CRD (Oryon CPU + Adreno X1-85 GPU + Hexagon HTP NPU). + ORT 1.24.5 (onnxruntime-windowsml). Findings may not generalize to x86 hardware or older ORT versions. +
+ + + diff --git a/research/autoconfig/docs/ep-knowledge-review.md b/research/autoconfig/docs/ep-knowledge-review.md new file mode 100644 index 000000000..288467396 --- /dev/null +++ b/research/autoconfig/docs/ep-knowledge-review.md @@ -0,0 +1,246 @@ +# EP Knowledge Base — Critical Review + +> Date: 2026-06-16 +> Reviewer: internal audit +> Scope: `ep_knowledge/qnn_npu.json` findings npu-001 through npu-007 +> +> This document records issues found in the original KB entries and the +> reasoning behind corrections applied in the June 2026 update. + +--- + +## Summary of Issues Found + +| Finding | Status Before Review | Issue | Corrected Status | +|---------|---------------------|-------|-----------------| +| npu-001 | `mechanism_confirmed: true` | ORT version used has kMaxSupportedOpset ≥ 22 — bypass mechanism does not apply; ResNet-18 data is noise | `mechanism_confirmed: false`, mechanism UNKNOWN | +| npu-002 | scope: "General / most vision models" | Tested on 1 model only (ConvNext) | scope narrowed to ConvNext | +| npu-003 | scope: "General / all QNN NPU" | Tested on 1 model only (ConvNext) | scope narrowed to ConvNext | +| npu-004 | confidence: "medium" | No recorded data; experiment aborted before measurements saved | confidence: "very_low / anecdote" | +| npu-005 | confidence: "medium" | Compares ORT QNN EP vs qairt native stack — different compilation pipeline entirely | added fairness caveat | +| npu-006 | `mechanism_confirmed: false` | Observation is solid (3-session consistent). Mechanism is unconfirmed but regression is unambiguous | no change to confirmed status; added session evidence | +| npu-007 | `mechanism_confirmed: true` | Solid, confirmed across all 8 models | no change | + +--- + +## Detailed Analysis + +### npu-001 — opset 21 speedup + +#### ORT version issue (critical) + +The catalog sweep used `onnxruntime-windowsml==1.24.5`. 
The npu-001 mechanism +explanation relies on ORT's `kMaxSupportedOpset` gate: + +> "On older ORT where kMaxSupportedOpset < 21, opset 21 models bypass the +> NCHW→NHWC layout transformer entirely." + +But the `kMaxSupportedOpset` version table (from `cpu.json`) shows: + +| ORT version | kMaxSupportedOpset | +|-------------|-------------------| +| v1.14.x | 18 | +| v1.16.x | 19 | +| v1.17.x | 20 | +| v1.18.x | 21 | +| main_HEAD | 26 | + +At ORT 1.24.x, `kMaxSupportedOpset` is almost certainly ≥ 22. This means BOTH +opset 17 and opset 21 models go through the NHWC layout transform in the ORT +version actually used in the sweep. **The "bypass" mechanism does not apply.** + +Consequence: `mechanism_confirmed` must be `false`. The speedup for DINOv2 and +MobileViT is empirically real but the cause is **unknown**. The ORT source code +investigation confirmed the bypass mechanism for *older* ORT versions, not for +the ORT version actually used. + +Possible alternative mechanisms (uninvestigated): +1. PyTorch ONNX exporter produces a structurally different graph at opset 21 + (different op decompositions, fewer reshape/squeeze nodes) +2. QNN EP's graph partitioner behaves differently with opset 21 operator + semantics even when the NHWC transform fires +3. Quantization calibration path differs between opset export versions +4. The NHWC transform at opset 21 still inserts fewer Transposes for some reason + despite firing (investigation needed via optimized graph dump) + +#### ResNet-18 data is noise-dominated + +ResNet-18 baseline p50 is ~1ms. At this latency, the 3×500-iter protocol +produces per-session p50s that vary 4x between sessions: + +``` +h1 (opset17): sessions = [0.990, 4.003, 2.716] ms ← 4x range +h3 (opset21): sessions = [1.054, 2.175, 4.107] ms ← 4x range +``` + +The two distributions fully overlap. Declaring a "+20.2% speedup" from comparing +medians (2.716 vs 2.175ms) is not statistically valid. 
This data point is +**removed** from `validated_models.benefits_from_opset21`. + +To get reliable data for ResNet-18, a minimum of ~3000 iterations per session +and ≥ 5 sessions would be needed. + +#### MobileViT DVFS spike in h1 + +h1 (opset17) sessions: [10.557, 11.721, **27.436**] ms + +The third session at 27.4ms is a clear DVFS thermal event (2.4x spike). The +median (11.721ms) is upward-biased by this session. The "true" opset17 p50 is +likely ~11ms, making the "+26.5%" speedup calculation overstated. A more +conservative estimate is ~20-22%. + +However, h3 (opset21) sessions [10.814, 8.625, 8.449] show two highly consistent +low-latency sessions. The speedup is real, magnitude uncertain (~20-26%). + +#### DINOv2 — most reliable evidence for npu-001 + +h1 (opset17): [7.176, 6.392, 9.436] ms — range 6.4–9.4ms +h3 (opset21): [4.977, 4.876, 6.884] ms — range 4.9–6.9ms + +The two distributions barely overlap only at extremes (h3 max 6.884 ≈ h1 min +6.392). h3 sessions 1 and 2 (4.977, 4.876ms) are tightly clustered at ~4.9ms, +well below the h1 range. The speedup appears real (≥24% vs h1's non-spiked +sessions, up to 31% vs h1 median). + +DINOv2-small's benefit is notable because it is primarily a Vision Transformer — +it has a patch embedding Conv layer but attention-dominant compute. Why opset21 +helps DINOv2 but NOT ViT-base is unknown. This architecture distinction needs +investigation. + +#### Updated empirical claim for npu-001 + +**Observable fact**: For DINOv2-small and MobileViT-small on QNN NPU (ORT 1.24.5, +Snapdragon X Elite), using opset 21 export instead of opset 17 produces a +consistent latency reduction of ~20-31% across 3-session benchmarks. + +**What is NOT known**: Why this occurs in ORT 1.24.x where the kMaxSupportedOpset +bypass should not apply. + +**What needs investigation**: +1. Dump optimized.onnx for both opset17 and opset21 DINOv2, count Transpose nodes + — if opset21 has fewer Transposes, explains speedup via a different mechanism +2. 
Verify ORT 1.24.x kMaxSupportedOpset value from compiled binary +3. Test 3+ additional Conv+residual models: EfficientNet-B0, MobileNet-V3, + ConvNeXt-tiny (already done for CPU; needs QNN NPU validation) + +--- + +### npu-002 — W8A16 speedup over FP32 + +**Issue**: Scope states "General (applies to most vision models on QNN NPU)". +Evidence base: 1 model (ConvNext), 1 device. + +The 1.9x speedup is plausible from HTP architecture (INT8 weight path), but +the magnitude varies by model: a model with few weight-heavy ops (e.g., pure +attention) may see less speedup than a Conv-heavy model. "Most vision models" +is over-claimed. + +**Correction**: Scope narrowed to "ConvNext — single model validation". The +catalog sweep provides indirect evidence (all 8 models used W8A16 and ran +faster than FP32 would on HTP) but no direct FP32 comparison baseline for +those models. + +--- + +### npu-003 — compile speedup + +**Issue**: Scope states "General (applies to all QNN NPU deployments)". Evidence +base: 1 model (ConvNext), 1 device. + +The compile (EPContext) mechanism is well-understood and applies generally, but +the 1.7x magnitude is model-specific. Models with simpler graphs may see less +benefit; models with many ops may see more. + +**Correction**: Scope narrowed. The mechanism claim ("eliminates JIT partitioning") +is generally correct; the magnitude claim (1.7x) is ConvNext-specific. + +--- + +### npu-004 — W8A8 accuracy collapse + +**Issue**: The observation is "Exact numbers not recorded — aborted early." This +is an anecdote, not a finding. The confidence of "medium" is unjustified without +data. + +The claim may well be correct (W8A8 on LN+GELU is problematic), but without +recorded accuracy numbers it cannot be treated as a KB finding. + +**Correction**: Confidence downgraded to "very_low". The finding is relabeled +as an unrecorded anecdote pending a proper experiment with recorded numbers. 
+ +--- + +### npu-006 — conv fusions catastrophic regression + +This finding is the **most statistically solid** in the entire KB: + +ResNet-18 h4 sessions: [132.3, 134.97, 130.669] ms — CV = 0.016 (extremely stable) +ResNet-18 h1 sessions: [0.990, 4.003, 2.716] ms — median 2.716ms + +Comparing the best h1 session (0.990ms) with the worst h4 session (134.97ms), the +regression is 136x; even the most conservative pairing (worst h1 session 4.003ms vs +best h4 session 130.669ms) is still ~33x. The 3-session consistency of h4 (~130-135ms) with near-zero +variance is unusual for QNN NPU (all other hypotheses show high CV). This +suggests the fused ops cause a deterministic CPU fallback with no DVFS noise — +consistent with the mechanism hypothesis. + +The only issue is "mechanism_confirmed: false" — the CPU fallback has not been +verified via EP partition dump. The regression is unambiguous; the mechanism is +a strong hypothesis. + +**No changes needed** except documenting the 3-session evidence more explicitly. + +--- + +## Additional Models Needed for Validation + +### For npu-001 (opset21 benefit for Conv+residual) + +| Model | Why useful | Predicted result | +|-------|-----------|-----------------| +| `microsoft/efficientnet-b0` | Conv-dominant, no residual-add structure | uncertain | +| `microsoft/mobilenet-v3-small` | Conv-dominant + SE blocks | likely benefits | +| `timm/convnextv2-nano` | ConvNext variant, already confirmed for ConvNext | should benefit | +| `facebook/deit-small-patch16-224` | Pure ViT (no Conv), similar to ViT-base | should be neutral | +| `timm/regnetx-002` | ResNet-like but with group Conv | uncertain | + +Goal: determine whether the benefit is "Conv+residual" or something more specific +to the DINOv2/MobileViT architectures (e.g., hybrid Conv+attention). 
+ +### For npu-006 (conv fusions) + +| Model | Why useful | Predicted result | +|-------|-----------|-----------------| +| `microsoft/efficientnet-b0` | Conv+BN heavy (many fuseable patterns) | should regress | +| `google/mobilenet-v2-1.0-224` | Depthwise Conv dominant | should regress | +| `timm/vgg16` | Pure Conv-BN | should regress | +| `microsoft/beit-base-patch16-224` | Pure transformer | should be neutral | + +Goal: confirm that the regression generalizes to all Conv-dominant models, not +just ResNet-18. + +### For npu-002/003 (W8A16 and compile) + +Run FP32 vs W8A16 and W8A16 vs W8A16+compile on at least: +- `apple/mobilevit-small` (already benchmarked W8A16; need FP32 baseline) +- `microsoft/resnet-18` (same) +- `facebook/dinov2-small` (same) + +This would promote npu-002 and npu-003 from "1-model observations" to +"catalog-validated" findings. + +--- + +## Minimum Experiment Protocol for Validation + +For any new model added to the KB: + +1. Run 3 independent sessions × 500 iters with 30s cool-down (npu-007 protocol) +2. Record raw per-session p50s, not just the median +3. Verify session-to-session range is < 50% of the median before reporting a gain +4. For sub-2ms models: increase to 3 sessions × 2000 iters minimum +5. Always dump the optimized graph (`--save-optimized-model`) for opset comparison +6. Record ORT version (`winml --version`) at experiment time in the finding + +--- + +*This review document should be re-run after any ORT or QNN SDK version update.* diff --git a/research/autoconfig/docs/skills-design.html b/research/autoconfig/docs/skills-design.html new file mode 100644 index 000000000..c52ede8b1 --- /dev/null +++ b/research/autoconfig/docs/skills-design.html @@ -0,0 +1,3784 @@ + + + + + +WinML CLI Skills Design Doc + + + +
+ +
+

WinML CLI Skills Design Doc

+

Overview

+

This document defines the design for 9 skills to be added to skills/ in winml-cli. +Skills are split into two categories by the single question: does the task require editing repo code?

+
    +
  • User skills (5) — the user reaches their goal purely by specifying conditions and letting + winml-cli produce or modify a config.json / manifest.json / report. No source code is touched. + Audience: WinApp developers and ISVs deploying models.
  • +
  • Contributor skills (4) — the task requires a winml-cli source-code change (a new exporter, a new + EP backend, a new skill), or exists specifically to produce code-change backlog. Audience: winml-cli engineers.
  • +
+
+

Discriminator: if the deliverable is a config/manifest/report, it is a User skill. If completing it +requires editing code in the repo (or its whole purpose is to drive such edits), it is a Contributor skill.

+
+

Each skill follows the SKILL.md frontmatter convention (name:, description:) established +by Mobius, NVIDIA Model-Optimizer, and Google LiteRT-CLI as the de facto standard.

+

User skills — ranked by importance

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
RankSkillWhy it ranks hereOutput (no code)
1autoconfigFlagship. Autonomously searches the config space and delivers the optimal config.json per EP. Also hosts the manual optimize path (precision-ladder + latency/accuracy-budget decision framework + hardware table) for users who want to choose by hand or have no target hardware. Maps to all five user scenarios (S1–S5).config_<ep>_optimal.json + report.html
2check-model-feasibilityPre-build front door, merging model discovery + EP/device compatibility: "find me a supported model from my constraints, then confirm it runs on my hardware." The single "what do I run, and will it run?" gate (inspectsysanalyze). Highest frequency — every user hits it before building.model shortlist + go/no-go + fallback EP
3debug-accuracy-dropCloses the most acute pain point: accuracy dropped, cause unknown. High-frequency diagnostic need with the clearest existing tooling (eval --mode compare).stage + root cause + fix
4ship-to-winappShip-time skill, merging validation + packaging: L1–L5 Definition-of-Done gates plus multi-EP artifact layout, manifest.json, and runtime EP selection. Everything between "the model is good" and "it's running in the app."pass/fail report + manifest.json
5use-winml-cliGeneral tool-scoped onboarding reference (existing). Foundational but low differentiation vs the task-scoped skills above.command reference
+

Contributor skills — ranked by importance

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
RankSkillWhy it ranks hereCode touched
1adding-model-supportDirectly grows model coverage — the core long-tail business problem (ISV onboarding, S2/S5). Highest contribution frequency.new exporter + recipe
2optimization-researchHigh leverage: deep-searches ORT/Olive/ecosystem to find gaps and file the backlog that drives every other contributor skill. Internal, but sets the roadmap.files issues + repro (drives code changes)
3adding-ep-supportOnboards a new execution-provider backend. Infrequent, but high value the moment a new NPU vendor lands.compile backend + EP registry
4contributing-a-skillMeta-tooling: how to author, lint, and eval a SKILL.md. Sustains the ecosystem but is supporting infrastructure, not a direct model/EP/perf deliverable.SKILL.md + evals
+
+

The detailed ## Skill: sections below appear in document order, not priority order. Importance is +defined by the two ranked tables above; implementation sequencing (risk/dependency-driven) is in +Priority order for implementation.

+
+

User skill dependency graph

+
check-model-feasibility ──► autoconfig ──────────► ship-to-winapp
+  find a supported model      optimize the model      validate (L1–L5 gates)
+  + confirm EP/device runs     (automated autoresearch  + package multi-EP artifacts
+                               loop OR manual framework)  + manifest + runtime EP selection
+          │                         │                          ▲
+          └──────────► debug-accuracy-drop ───────────────────┘
+                       (diagnose accuracy drops at any stage)
+
+use-winml-cli ── general command reference; underpins every step above
+
+ +

Contributor research skill

+
optimization-research ──► [GitHub issues / winml backlog]
+  (deep search: ORT source + Olive + ONNX ecosystem + native stack models
+   → find better solutions → diagnose winml gaps → produce work items)
+
+ +

Contributor skill dependency graph

+
adding-model-support ──► contributing-a-skill
+adding-ep-support    ──► contributing-a-skill
+
+ +
+

Design principle: Skills as agentic workflows

+

The shift: documentation → automation

+

Current state (most skills in the ecosystem):

+
+

Skill tells the user what commands to run → user runs them → user interprets output

+
+

Target state for winml-cli:

+
+

Skill tells the agent what commands to run → agent runs them → agent interprets output → agent gives a specific answer

+
+

The difference:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Documentation skillAgentic skill
Agent sees low cosine"Run winml eval --mode compare"Runs it, reads cosine=0.87, says "drop at quantize stage, Attention layers"
EP compatibility"Run winml sys then winml analyze"Runs both, parses JSON, says "QNN available but LayerNorm is partial"
Optimize precision"Use the decision framework"Runs fp16/w8a16/w8a8 sweep, builds actual tradeoff table, recommends W8A16
Validate before ship"Check these 6 gates"Runs all 6 gates, generates a pass/fail report with actual numbers
+

This is only possible if skills describe a GATHER → ANALYZE → DECIDE → ACT workflow, +and winml-cli commands emit machine-readable structured output that the agent can parse.

+

Structured output: current state and gaps

+

Copilot agents have shell tool access and can run winml commands directly. +The key requirement is --format json on stdout so the agent can parse results +without screen-scraping Rich/ANSI terminal output.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CommandStructured output todayGap
winml inspect--format json (stdout)None
winml sys--format json (stdout)None
winml run--format json (stdout)None
winml analyze--output file.json (file only)Add --format json stdout
winml perf--output file.json (file only)Add --format json stdout
winml eval✗ No structured outputAdd --format json stdout
+

Required code changes (enables agentic skill execution): +1. winml eval --format json — outputs {cosine, sqnr, psnr, task_metric} to stdout +2. winml analyze --format json — outputs {supported: [...], partial: [...], unsupported: [...]} to stdout +3. winml perf --format json — outputs {p50_ms, p90_ms, p99_ms, mean_ms} to stdout

+

The GATHER → ANALYZE → DECIDE → ACT skill structure

+

Each skill section should be written with agent execution in mind:

+
## GATHER: what to run
+Commands the agent runs first (with --format json) to collect facts.
+
+## ANALYZE: what to look for
+How to interpret the JSON output. What values matter. What thresholds to apply.
+
+## DECIDE: what to recommend
+Decision logic. If X → recommend Y. If A and B → recommend C.
+
+## ACT: what to tell the user
+What to surface to the user: specific diagnosis + specific next step.
+
+ +

In practice this maps onto the existing "Sections" structure — the key is ensuring +each section has concrete commands to run and concrete interpretation rules, +not just prose description.

+

Example: debug-accuracy-drop as an agentic workflow

+
User: "My W8A8 model has low accuracy"
+
+GATHER:
+  agent runs: winml eval --mode compare -m quantized.onnx --model-id <id> --format json
+  agent gets: {"cosine_similarity": 0.87, "sqnr_db": 28.3, "stage": "quantize"}
+
+ANALYZE:
+  cosine=0.87 < 0.90 threshold → problem is real
+  sqnr=28.3 < 30 dB → significant degradation
+  stage=quantize → problem introduced at quantize, not optimize or compile
+
+DECIDE:
+  quantize-stage drop on W8A8 → check if Attention layers are the culprit
+  agent runs: winml analyze -m quantized.onnx --ep qnn --format json
+  agent gets: {"partial": ["MultiHeadAttention", "LayerNorm"], "unsupported": []}
+
+ACT:
+  Agent: "The accuracy drop (cosine=0.87) is at the quantize stage.
+          MultiHeadAttention is partial on QNN — activations may be falling back to FP32.
+          Try W8A16 to keep activations at 16-bit precision: winml build -c config.json --precision w8a16"
+
+ +

Without structured output (--format json), the agent would have to tell the user to run +each step manually and paste the results back. With structured output, the agent runs the +full diagnostic in one turn.

+
+

Validation confidence levels (L1–L5)

+

Inspired by Mobius writing-tests. Applied in ship-to-winapp as the Definition-of-Done backbone. +Each level is checked independently — a model can pass L3 without passing L2.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
LevelNameWhat it verifiesKey command
L1LoadableArtifact is valid ONNX, loads without errorwinml inspect -m <artifact>
L2Shape correctOutput shape matches expected specwinml eval -m <artifact> --model-id <model> (check shape in output)
L3Numerical parityOutput matches FP32 baseline (cosine ≥ 0.99 FP16, ≥ 0.95 W8A16, ≥ 0.90 W8A8)winml eval --mode compare -m <artifact> --model-id <model>
L4Task accuracyTask metric (Top-1/F1/mAP) within acceptable drop from FP32 referencewinml eval -m <artifact> --model-id <model> (task metric)
L5Production readyPerf SLA met on target device + cross-EP consistency verifiedwinml perf --iterations 100 --monitor
+

Quick pass criteria:

+ + + + + + + + + + + + + + + + + + + + + +
PrecisionL3 threshold
FP16cosine_similarity ≥ 0.99
W8A16cosine_similarity ≥ 0.95
W8A8cosine_similarity ≥ 0.90 (or task-specific)
+

Waivers: any level that cannot be verified must be documented with a reason and tracking issue. +The ship-to-winapp skill maps each of its 6 validation gates to an L-level.

+
+
+

Competitive Analysis

+

Summary

+

winml-cli has a solid optimization pipeline (export→quantize→compile→benchmark) but lacks the debugging/diagnostic loop, accuracy recovery tooling, and developer observability that distinguish great toolchains from adequate ones.

+
+

Competitor Feature Matrix

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
FeatureAppleExecuTorchAI HubNVIDIAOpenVINOOptimumOlivewinml-cli
Per-layer accuracy debugging✅ SVG graph✅ cloud
Compute unit utilization reportPartial
Accuracy-Aware PTQ (auto layer rollback)✅ NNCF
Standard NLP benchmark (MMLU/PPL)
Cross-EP side-by-side comparePartial
Zero-deploy validation (model.predict)✅ macOS✅ cloudPartial
Pre-quantized model zoo✅ 500+✅ HF org
One-line optimize command
Multi-EP artifact packaging✅ .mlpackage✅ .pte
QAT / accuracy recovery fine-tuning✅ AIMET
Advanced quant (AWQ/SmoothQuant)✅ NNCF
Thermal/sustained-load profiling
+
+

Competitor Deep Dives

+

Apple coremltools

+

Most relevant: zero-deploy validation + compute_units API + palettization

+
    +
  • model.predict({'input': np_array}) — validates converted model in one Python call without any device deploy. Can force ComputeUnit.CPU_ONLY for numerical comparison vs CPU_AND_NE.
  • +
  • compute_units is switchable at prediction time (not just compile time) — enables A/B testing EP performance without re-converting.
  • +
  • Palettization: LUT-based weight compression at 1–8 bits (k-means clustering, not linear quant). Matches Neural Engine hardware kernels better than INT4 linear quantization for many models.
  • +
  • Three compression workflows: data-free / calibration-based / fine-tuning-based (QAT).
  • +
  • .mlpackage separates architecture from weights → streaming-friendly, supports on-device compilation after download.
  • +
+

ExecuTorch (Meta)

+

Most relevant: per-layer QNN accuracy debugging (best-in-class of all competitors)

+
    +
  • QNNIntermediateDebugger: dumps intermediate tensor outputs at every QNN op, computes cosine similarity per layer vs CPU reference, generates color-coded SVG computation graph (green ≥ 0.9, red < 0.9).
  • +
  • get_delegation_info(): table of ops showing delegated-to-NPU count vs CPU-fallback count per op type.
  • +
  • ETDump + Inspector API: per-op timing table with avg (ms), op type, is_delegated. Returns pandas DataFrame.
  • +
  • QAIRT Visualizer: pip install qairt-visualizer — interactive GUI overlaying op trace + QHAS (QNN HTP Analysis Summary) on model graph.
  • +
  • Missing: no cloud device testing, no automated accuracy-latency sweep, build process is complex.
  • +
+

Qualcomm AI Hub

+

Most relevant: cloud profiling with physical hardware, per-step memory breakdown

+
    +
  • Compile + Profile + Inference on real physical devices (Snapdragon X Elite laptops, Galaxy S24) in the cloud — no local hardware needed.
  • +
  • Per-step memory profiling: compilation time/memory, first-load time/memory (NE optimization), subsequent-load (cached), inference latency.
  • +
  • 500+ pre-optimized models in model zoo.
  • +
  • --clone j1glw6y8p — clone any previous job with modified params.
  • +
  • Cloud AIMET quantization: sophisticated PTQ as a service (submit_quantize_job()).
  • +
+

NVIDIA ModelOpt

+

Most relevant: 16 compression techniques + MMLU benchmark scripts + pre-quantized HF checkpoints

+
    +
  • Compression techniques beyond PTQ: AWQ, SmoothQuant, QAT, pruning (Minitron 33% smaller, 50% faster), distillation, speculative decoding, sparsity, NAS (Puzzletron).
  • +
  • Windows accuracy benchmark: mmlu_benchmark.py (57 subjects, DirectML/ORT/TensorRT-LLM/CPU), perplexity on WikiText-2, KL-divergence metrics.
  • +
  • Pre-quantized HF checkpoints: nvidia/DeepSeek-R1-FP4, nvidia/Llama-3.3-70B-FP4 etc. — pull validated optimized models without running pipeline.
  • +
+

Intel OpenVINO + NNCF

+

Most relevant: Accuracy-Aware PTQ (auto layer rollback)

+
    +
  • NNCF AccuracyAwareQuantization: automatically identifies sensitivity of each layer to quantization, rolls back sensitive layers to float when accuracy drop exceeds threshold. Fully automated accuracy-performance tradeoff solver.
  • +
  • benchmark_app -hint latency vs -hint throughput: auto-configures streams, batch, inference requests for each mode. -d AUTO: automatic device selection with fallback.
  • +
  • 100+ Jupyter notebooks on Binder/Colab — zero setup barrier.
  • +
  • OpenVINO GenAI: high-level LLMPipeline, WhisperPipeline — deploy-ready LLM inference in 5 lines.
  • +
+

HuggingFace Optimum

+

Most relevant: drop-in Transformers replacement + multi-backend hub

+
    +
  • Replace AutoModelForSequenceClassification.from_pretrained() with ORTModelForSequenceClassification.from_pretrained() → ONNX Runtime inference with zero code change.
  • +
  • 8 hardware backends: ONNX Runtime, OpenVINO, NVIDIA TensorRT-LLM, AMD Ryzen AI, AWS Inferentia, ExecuTorch, Intel Gaudi, FuriosaAI.
  • +
  • Task-aware export: --task text-generation auto-configures dynamic axes and model wrapping.
  • +
+

Microsoft Olive (direct competitor)

+

Most relevant: one-line optimize command + VS Code AI Toolkit

+
    +
  • olive optimize --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct --precision int4 --output_path models/qwen — one command, no per-step config.
  • +
  • JSON-based pipeline config for full declarative multi-step control.
  • +
  • VS Code AI Toolkit extension: GUI for model optimization, fine-tuning, and inference testing — no CLI knowledge needed.
  • +
  • MultiLoRA serving support.
  • +
+
+

Top 5 High-Impact Gaps for winml-cli

+

🔴 Gap 1: Per-Layer Accuracy Debugging

+

Pain: Accuracy degrades after QNN compilation/quantization, and the user has no idea which layer caused it. Diagnosing this currently requires QNN SDK expert knowledge.

+

Solution: winml debug --model model.onnx --ep qnn --inputs calibration_data/ +1. Runs model on CPU and QNN, captures intermediate tensor outputs at each op +2. Computes cosine similarity per layer +3. Outputs HTML/SVG graph with color-coded accuracy (green/red per layer)

+

Reference: ExecuTorch QNNIntermediateDebuggerOutputFormat.SVG_GRAPH + QcomCosineSimilarityComparator

+

Impact: Turns multi-day debugging into a 30-minute diagnosis. Currently no Windows-on-NPU tool does this.

+
+

🔴 Gap 2: Compute Unit Utilization Report

+

Pain: winml perf shows slower-than-expected latency with no explanation. User doesn't know what % of ops ran on NPU vs fell back to CPU.

+

Solution: Extend winml analyze to output delegation table:

+
Op Type         | NPU Delegated | CPU Fallback | Reason
+----------------|---------------|--------------|------------------
+MatMul (INT8)   | 47 / 47       | 0            | -
+LayerNorm       |  0 / 12       | 12           | Unsupported dtype
+Softmax (FP32)  |  0 /  6       |  6           | Requires INT8 input
+
+ +

Reference: ExecuTorch get_delegation_info().get_operator_delegation_dataframe() / AI Hub per-layer compute unit mapping

+

Impact: Directly actionable — if the user sees "60% of ops on CPU due to unsupported dtype," they know to switch to W8A8.

+
+

🟠 Gap 3: Quantization Sensitivity Analysis

+

Pain: winml quantize --algo w8a8 produces a model with unacceptable accuracy. User doesn't know if it's a specific layer, the algorithm, or the calibration data.

+

Solution: winml analyze-quant --model model.onnx --calibration data/ --eval-dataset eval/ +1. Run full W8A8 quantization +2. For each block/layer, measure accuracy impact of reverting to FP16 +3. Rank layers by sensitivity +4. Report: "reverting 3 attention layers to FP16 recovers X% accuracy at Y% latency cost"

+

Reference: Intel NNCF AccuracyAwareQuantization (automatic per-layer rollback)

+

Impact: Replaces multi-day trial-and-error with a 10-minute automated report.

+
+

🟠 Gap 4: Standard Benchmark Integration (MMLU / Perplexity)

+

Pain: winml eval supports custom scripts but no out-of-box standard benchmarks. Users have no reference point for whether their quantized model's accuracy is "expected."

+

Solution: winml eval --model model.onnx --benchmark mmlu --ep qnn +- Built-in MMLU (57 subjects), WikiText-2 perplexity, KL-divergence scripts +- Reference numbers from FP32 baseline shown alongside quantized result +- FP32 baseline: 78.2% → W8A8 QNN: 77.9% (−0.3%, expected range: −0.1% to −0.5%)

+

Reference: NVIDIA ModelOpt examples/windows/accuracy_benchmark/mmlu_benchmark.py supports DirectML/ORT/CPU

+

Impact: Removes ambiguity and creates trust. Critical for LLM users.

+
+

🟡 Gap 5: Cross-EP Side-by-Side Comparison

+

Pain: Choosing between QNN/DirectML/CPU/OpenVINO requires running each EP manually and aggregating results. No tool does this automatically.

+

Solution: winml sweep --model model.onnx --precision w8a16,fp16 --ep qnn,dml,cpu +- Runs build+eval+perf for each (precision × EP) combination +- Outputs a single comparison table: accuracy / latency / op coverage % +- Agent-driven: skill reads JSON output and recommends the optimal combination

+

Reference: Truly unique — no competitor does this for Windows multi-EP. Closest is AI Hub's multi-device fleet testing (Android only).

+

Impact: The single most-requested decision for Windows AI developers. Unique to winml-cli.

+
+

Patterns in Great Toolchain DX

+

Pattern 1: The "Why" Feedback Loop +Great toolchains explain why results are the way they are. ExecuTorch's delegation table, AI Hub's compute unit mapping, and NNCF's layer sensitivity analysis all answer the question "why?"; winml-cli currently stops at "here's the result."

+

Pattern 2: Progressive Disclosure of Complexity +- Olive: olive optimize --precision int4 (one line) → full JSON config pipeline +- coremltools: ct.convert(model) → MIL IR manipulation +- AI Hub: web dashboard → Python SDK → CLI → AIMET configs

+

winml-cli is currently too close to the expert path: each step requires understanding EP-specific options.

+

Pattern 3: Zero-Deploy Validation +Every strong toolchain lets you test model output before deploying to hardware: coremltools model.predict(), ExecuTorch Python pybind, AI Hub submit_inference_job(). winml-cli is strong for CPU but lacks the quick "compare CPU vs QNN output" path.

+

Pattern 4: Pre-Validated Model Artifacts +ModelOpt (HF nvidia/ org), AI Hub (500+ models), NNCF (Model Zoo with accuracy tables) all reduce the cold-start problem. Users don't need the full pipeline for popular models.

+
+

Whitespace Opportunities (No Competitor Covers)

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Opportunity | Why it's winml-cli territory |
|---|---|
| Cross-EP regression table (one command, all EPs) | Multi-EP is the unique Windows AI challenge; no Android/iOS tool does this |
| Quantization config recommender (winml recommend --target qnn --constraint latency=20ms) | Rule-based recommendation from hardware+model arch analysis |
| EP-aware ONNX graph visualizer (Netron + green/yellow/red per EP) | Netron exists but has no EP coverage overlay |
| Thermal/sustained-load profiling (latency curve over 100 runs, detect throttling) | AI Hub hides variance; no tool surfaces thermal behavior |
| Windows AI Model Package (.mlpackage equivalent with multi-EP manifest) | Apple has .mlpackage; Windows has nothing equivalent |
+
+

Skill: use-winml-cli (existing — extend)

+

Status: Exists at skills/use-winml-cli/SKILL.md. Needs two additions: +- Add winml run and winml serve usage (currently missing) +- Add "first-time onboarding" path for users who don't know where to start

+

No structural changes needed; the existing skill is the general entry point.

+
+

Skill: debug-accuracy-drop

+

Frontmatter

+
name: debug-accuracy-drop
+description: >
+  Use this skill when a quantized or optimized model produces worse accuracy than
+  the FP32 baseline and the cause is unknown. Guides a structured diagnosis: first
+  isolate which pipeline stage introduced the drop (optimize vs quantize vs compile),
+  then use winml eval --mode compare to measure output similarity, then use winml
+  analyze to check for partial/unsupported ops that may cause EP fallback. Covers
+  calibration dataset issues, precision selection mistakes, and QNN-specific fallback
+  patterns. Use when the user says "accuracy dropped after quantization", "results
+  look wrong on NPU", or "cosine similarity is low".
+
+ +

When to use

+
    +
  • "My model gives wrong results after quantization"
  • +
  • "W8A8 accuracy is too low, how do I find out why"
  • +
  • "Results differ between NPU and CPU"
  • +
  • cosine_similarity < 0.95 from winml eval --mode compare
  • +
+

Sections

+

1. Isolation strategy: binary search on the pipeline +Diagnose by bisecting the pipeline stages:

+
FP32 baseline
+    → after optimize?   winml eval --mode compare (fp32 vs optimized)
+    → after quantize?   winml eval --mode compare (fp32 vs quantized)
+    → after compile?    winml eval --mode compare (fp32 vs compiled)
+
+ +

First stage where cosine drops → that's where the problem is.

+

Key commands:

+
# Export FP32 baseline
+winml export -m <model> -o baseline/model.onnx
+
+# Compare optimized vs baseline
+winml eval --mode compare -m optimized/model.onnx --model-id <model>
+
+# Compare quantized vs baseline
+winml eval --mode compare -m quantized/model.onnx --model-id <model>
+
+# Compare EP-compiled vs baseline (run on target EP)
+winml eval --mode compare -m compiled/model.onnx --model-id <model> --ep qnn
+
+ +

2. Interpreting similarity metrics +Table of thresholds: +| Metric | Healthy | Investigate | Problem | +|---|---|---|---| +| cosine_similarity | > 0.99 | 0.95–0.99 | < 0.95 | +| SQNR (dB) | > 40 | 30–40 | < 30 | +| max_abs_diff | model-dependent | — | unbounded |

+

3. Root cause patterns

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Symptom | Likely cause | Fix |
|---|---|---|
| Drop appears at quantize stage | Calibration dataset not representative | Use task-relevant calibration data via --calibration-dataset |
| Drop appears at quantize stage for Attention layers | W8A8 quantizing activations in attention | Switch to W8A16 (keeps activations at FP16) |
| Drop appears at compile stage on QNN | Op pattern unsupported → CPU fallback | Run winml analyze to find partial ops |
| Inconsistent results across runs | Non-deterministic EP dispatch | Add --iterations 20 to average out |
| Drop only in certain inputs | Input shape sensitivity | Test with calibration data matching real distribution |
+

4. Checking for op fallback with winml analyze +When compile-stage drop is suspected:

+
winml analyze -m quantized/model.onnx --ep qnn
+
+ +

Look for partial and unsupported ops — these fall back to CPU, introducing +numerical differences vs native NPU execution. Partial ops are the most common +source of unexpected accuracy variance on QNN.

+

5. Precision escalation path +If W8A8 is the problem and the model is accuracy-sensitive: +W8A8 → W8A16 → FP16 → FP32 +Stop at the first precision that meets accuracy requirements.

+

Cross-references: +- To compare precision options systematically → autoconfig (manual or automated optimize) +- If op is listed as unsupported → check-model-feasibility

+
+

Skill: ship-to-winapp (merge of validate-before-ship + prepare-for-winapp)

+

Covers the whole ship-time phase: first validate the model meets the Definition-of-Done, +then package the multi-EP artifacts and manifest for the WinApp to load at runtime.

+

Frontmatter

+
name: ship-to-winapp
+description: >
+  Use this skill when taking a winml-cli model artifact the last mile into a Windows
+  application — both validating it is good enough to ship and packaging it for the app.
+  Validation half: a Definition-of-Done checklist covering artifact completeness, accuracy
+  vs FP32 baseline, performance SLA, output correctness on real inputs, cross-EP consistency,
+  and fallback chain (every item checked or explicitly waived). Packaging half: how to organize
+  multi-EP artifacts (QNN/NPU, OpenVINO, VitisAI, DirectML/GPU, CPU fallback), the recommended
+  directory layout and manifest.json for runtime EP selection, and the runtime EP detection /
+  fallback pattern. Use when the user says "I'm ready to ship", "what should I test before
+  release", "how do I know the model is good enough", "how do I use this in my app",
+  "how do I package the model", or "what file do I load at runtime".
+
+ +

When to use

+
    +
  • About to ship a WinApp with on-device inference; final QA gate before production
  • +
  • After any build config change (new quantization, new EP, new model version)
  • +
  • "I built the model, how do I ship it in my app?"
  • +
  • "How do I load different models for different hardware / what happens with no NPU?"
  • +
  • "How do I package QNN + DML + CPU variants together?"
  • +
+
+

Part A — Validate (Definition-of-Done gates)

+

The checklist

+

Gate 1 — Artifact completeness +- [ ] All target EP artifacts exist and are loadable +- [ ] CPU fallback artifact exists +- [ ] manifest.json (if using multi-EP layout) is valid and references existing files +- [ ] Artifact was built with winml build (not an opaque cache artifact)

+
winml inspect -m <artifact>.onnx  # verify each artifact loads
+
+ +

Gate 2 — Accuracy vs FP32 baseline +- [ ] cosine_similarity ≥ 0.99 for FP16 artifacts +- [ ] cosine_similarity ≥ 0.95 for W8A16 artifacts +- [ ] cosine_similarity ≥ 0.90 for W8A8 artifacts (or task-specific threshold) +- [ ] Task accuracy metric (Top-1, F1, mAP) within acceptable drop from FP32

+
winml eval --mode compare -m <artifact>.onnx --model-id <model>
+winml eval -m <artifact>.onnx --model-id <model>  # task accuracy
+
+ +

Gate 3 — Performance SLA +- [ ] p50 latency meets application target on target device +- [ ] p99 latency within 2x p50 (no outlier spikes) +- [ ] Benchmark run on actual target hardware (not developer machine)

+
winml perf -m <artifact>.onnx --device <target> --iterations 100 --monitor
+
+ +

Gate 4 — Output correctness on real inputs +- [ ] Model produces correct output on ≥3 representative real-world inputs +- [ ] No NaN or Inf in outputs +- [ ] Output shape matches expected shape

+
winml run -m <artifact>.onnx --file <real_input>  # visual/manual check
+
+ +

Gate 5 — Cross-EP consistency (if shipping multiple EP variants) +- [ ] QNN and DML outputs agree within tolerance on same input +- [ ] CPU fallback output agrees with primary EP within tolerance

+
winml run -m model_qnn.onnx --file sample.jpg --format json -o qnn_out.json
+winml run -m model_dml.onnx --file sample.jpg --format json -o dml_out.json
+winml run -m model_cpu.onnx --file sample.jpg --format json -o cpu_out.json
+# compare qnn_out.json vs dml_out.json vs cpu_out.json manually
+
+ +

Gate 6 — Fallback chain +- [ ] CPU fallback artifact verified independently (not just assumed to work) +- [ ] App runtime selects correct artifact when target EP is absent (simulate by removing EP)

+

Waiver policy +Any item that cannot be completed must be waived explicitly:

+
Waivers:
+- Cross-EP consistency: VitisAI not available on developer machine.
+  Verified on target hardware by QA team. Issue #NNN.
+- Performance SLA: Target hardware (Snapdragon X Elite) in procurement.
+  Benchmark deferred to post-merge, tracked in issue #NNN.
+
+ +

Unchecked items without waiver → do not ship.

+

L-level mapping — the 6 gates map directly to the L1–L5 confidence system (see Overview):

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Gate | L-level |
|---|---|
| Gate 1 — Artifact completeness | L1 |
| Gate 2 — Accuracy vs FP32 baseline | L3 + L4 |
| Gate 3 — Performance SLA | L5 |
| Gate 4 — Output correctness on real inputs | L4 |
| Gate 5 — Cross-EP consistency | L5 |
| Gate 6 — Fallback chain | L1 (CPU artifact) |
+

Minimum to ship: L1 + L3 all passing. L4 + L5 required for production release.

+

Quick command reference

+
# Gate 1: inspect all artifacts
+for f in model_qnn.onnx model_dml.onnx model_cpu.onnx; do winml inspect -m $f; done
+# Gate 2: accuracy
+winml eval --mode compare -m <artifact>.onnx --model-id <model>
+winml eval -m <artifact>.onnx --model-id <model>
+# Gate 3: perf
+winml perf -m <artifact>.onnx --device auto --iterations 100 --monitor
+# Gate 4: real input
+winml run -m <artifact>.onnx --file <sample>
+# Gate 5: cross-EP (run individually, compare outputs)
+winml run -m model_qnn.onnx --file <sample> --format json
+winml run -m model_dml.onnx --file <sample> --format json
+
+ +
+

Part B — Package & integrate (multi-EP)

+

1. The multi-EP artifact problem +winml compile produces EP-locked files (not portable), so a WinApp needs a strategy to +select the right file per device.

+

2. Recommended artifact layout

+
my_model/
+  manifest.json          ← EP → file mapping + version
+  model_qnn.onnx         ← QNN NPU (compiled, Snapdragon X)
+  model_openvino.onnx    ← OpenVINO NPU/GPU (Intel Core Ultra)
+  model_vitisai.onnx     ← VitisAI NPU (AMD Ryzen AI)
+  model_dml.onnx         ← DirectML GPU (any GPU, non-NPU machines)
+  model_cpu.onnx         ← CPU fallback (universal)
+
+ +

3. manifest.json schema

+
{
+  "model_id": "facebook/convnext-tiny-224",
+  "task": "image-classification",
+  "version": "1.0.0",
+  "variants": [
+    { "ep": "qnn",       "device": "npu",  "file": "model_qnn.onnx",       "precision": "w8a16" },
+    { "ep": "openvino",  "device": "npu",  "file": "model_openvino.onnx",  "precision": "w8a8"  },
+    { "ep": "vitisai",   "device": "npu",  "file": "model_vitisai.onnx",   "precision": "w8a8"  },
+    { "ep": "dml",       "device": "gpu",  "file": "model_dml.onnx",       "precision": "fp16"  },
+    { "ep": "cpu",       "device": "cpu",  "file": "model_cpu.onnx",       "precision": "w8a8"  }
+  ],
+  "selection_order": ["qnn", "openvino", "vitisai", "dml", "cpu"]
+}
+
+ +

(For multi-EP artifacts, autoconfig emits this manifest.json directly with experiment provenance.)

+

4. Building all variants with winml-cli

+
# Generate configs per EP
+winml config -m <model> --device npu --ep qnn -o config_qnn.json
+winml config -m <model> --device npu --ep openvino -o config_ov.json
+winml config -m <model> --device gpu --ep dml -o config_dml.json
+winml config -m <model> --device cpu -o config_cpu.json
+
+# Build all
+winml build -c config_qnn.json -m <model> -o out_qnn/
+winml build -c config_ov.json  -m <model> -o out_ov/
+winml build -c config_dml.json -m <model> -o out_dml/
+winml build -c config_cpu.json -m <model> -o out_cpu/
+
+ +

5. Runtime EP selection pattern (C++ / ORT) +Pseudocode for app-side logic: +- Read manifest.json +- Query available EPs on device (GetAvailableProviders() or winml sys equivalent) +- Walk selection_order, pick first EP available on this device +- Load the corresponding file +- If all fail → CPU is always available

+

6. What NOT to do +- Don't load a QNN-compiled model with CPU EP → will fail or produce wrong results +- Don't hardcode EP names → check availability at runtime +- Don't ship only the compiled artifact without a CPU fallback

+

Cross-references: +- If accuracy gate fails → debug-accuracy-drop +- If performance gate fails → autoconfig (manual or automated optimize path) +- If EP not available for testing, or to pick the right EP → check-model-feasibility +- To build the artifacts → use-winml-cli

+
+

Skill: check-model-feasibility (merge of find-a-model + ep-compatibility-check)

+

The pre-build front door. Two entry points, one shared engine (inspect → sys → analyze): +(A) the user has no model yet → recommend a supported one from their constraints; +(B) the user has a model → confirm it runs on their target EP/device. Both converge on the +same three-layer check, so they are one skill.

+

Frontmatter

+
name: check-model-feasibility
+description: >
+  Use this skill before a full build, to answer two linked questions: "which model should I
+  use?" and "will it run on my hardware?". Model discovery: when the user knows the task
+  (image classification, text embedding, object detection, summarization, …) but has no model
+  yet, gather their constraints, generate Hugging Face candidates, and screen each one for
+  winml-cli support. Compatibility: for a chosen (or candidate) model, run the three-layer check
+  — winml inspect (model support), winml sys (EP availability on this machine), winml analyze
+  (operator-level EP coverage) — plus the EP-to-hardware mapping and fallback chain for Windows
+  AI PCs. Use when the user says "what model should I use for X", "find me a model that runs
+  under 20ms on the NPU", "recommend a small image classifier", "I don't have a model yet",
+  "will this work on my device", "is QNN supported here", "what hardware do I need for NPU",
+  or when they hit an unsupported-operator error.
+
+audience: external (WinApp developers)
+
+ +

When to use

+
    +
  • "What model should I use for background blur / OCR / summarization?"
  • +
  • "Find a text-embedding model under 100MB that runs on the Intel NPU"
  • +
  • "Will this model work on my Snapdragon X Elite laptop? Is QNN supported here?"
  • +
  • "The compile step failed with an unsupported op"
  • +
  • Starting a new project: pick a model and verify feasibility before investing build time
  • +
+

What this skill does NOT do

+
    +
  • It does not train, fine-tune, or optimize a model — optimization hands off to autoconfig.
  • +
  • It only recommends models whose architecture winml-cli can actually export/run (verified via + winml inspect), never an arbitrary HF model it cannot load.
  • +
+

Sections

+

1. Two entry points +- (A) No model yet → run Section 2 (discovery) to produce candidates, then Section 3 on each. +- (B) Have a model → skip to Section 3 (three-layer check) directly.

+

2. Discovery — find candidate models (entry point A) +Capture and lock the selection constraints first:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Condition | Example | Drives |
|---|---|---|
| Task | image-classification, feature-extraction, text-generation | HF Hub filter |
| Target device / EP | Snapdragon X NPU (QNN), Intel NPU (OpenVINO), any GPU (DML) | feasibility + latency class |
| Latency budget | p50 ≤ 20 ms | size / architecture shortlist |
| Accuracy need | "≥ ResNet-50 top-1" or a benchmark floor | candidate quality bar |
| Size limit | ≤ 100 MB on disk | excludes large variants |
| License | permissive (Apache-2.0 / MIT) | excludes restricted models |
+

The agent queries the HF Hub by task, sorted by downloads/likes, restricted to architecture +families winml-cli is known to support → a 5–10 model shortlist. Each candidate then goes +through the three-layer check below; drop any that fail Layer 1 or have heavy unsupported ops.

+

3. The three-layer feasibility check (entry points A and B) +Layer 1 — Model support · Layer 2 — EP availability · Layer 3 — Operator coverage. +Run in order, stop at first hard failure.

+

Layer 1 — Model support

+
winml inspect -m <model-id> --format json
+
+ +

Look for loader, exporter, winml_inference_class populated. If inspect fails or shows +"unsupported" → model is out of scope for winml-cli (drop the candidate; do not recommend it).

+

Layer 2 — EP availability

+
winml sys --list-ep --list-device
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| EP | Hardware requirement | Check for |
|---|---|---|
| QNN | Qualcomm Snapdragon X Elite / X Plus | QNNExecutionProvider in list |
| OpenVINO | Intel Core Ultra (Meteor Lake / Lunar Lake+) | OpenVINOExecutionProvider |
| VitisAI | AMD Ryzen AI (Phoenix / Hawk Point / Strix) | VitisAIExecutionProvider |
| NvTensorRTRTX | NVIDIA discrete GPU (RTX series) | NvTensorRTRTXExecutionProvider |
| DML | Any DirectX 12 GPU | DmlExecutionProvider |
| CPU | Any | Always available |
+

If the desired EP is not listed → recommend next best EP from the fallback chain.

+

Layer 3 — Operator coverage

+
winml analyze -m <exported_model>.onnx --ep <ep> --format json
+# or for all EPs at once:
+winml analyze -m <exported_model>.onnx --device all
+
+ +
    +
  • supported (green): op runs natively on EP
  • +
  • partial (yellow): op may fall back to CPU for some configurations
  • +
  • unsupported (red): op cannot run on this EP
  • +
+

Decision rule: any unsupported → either change EP or accept CPU fallback for those ops +(which may impact accuracy and latency).

+

4. Fallback chain recommendation +If target EP not available or has unsupported ops:

+
QNN not available → OpenVINO (if Intel) or VitisAI (if AMD) → DML → CPU
+
+ +

5. Rank and recommend (entry point A) / fast-fail before compile (entry point B) +- Discovery: rank surviving candidates by fit against the locked conditions (size, latency + class, accuracy reference, op coverage, downloads as a popularity prior). Output a short + ranked table + one recommended pick + rationale. +- Fast-fail: winml compile is expensive (minutes). Always run analyze first; if it shows >20% + unsupported ops → likely not worth compiling for that EP.

+

Cross-references: +- After picking a model + confirming feasibility → autoconfig (find the optimal config) +- To build the chosen artifacts → use-winml-cli +- If no supported model meets the constraints, or all EPs show unsupported ops → the gap + feeds optimization-research (long-tail coverage) and adding-model-support

+
+

Addresses the Pre-quantized model zoo / cold-start whitespace from the Competitive Analysis: +NVIDIA (nvidia/ HF org) and AI Hub (500+ models) reduce cold-start with curated zoos; winml-cli +has none, so this skill substitutes a constraints-driven recommender that only returns supported models.

+
+
+

Skill: adding-model-support (contributor)

+

Frontmatter

+
name: adding-model-support
+description: >
+  Use this skill when contributing support for a new Hugging Face model to
+  winml-cli. Covers finding the correct exporter, writing a recipe config,
+  verifying at each pipeline stage (export → optimize → quantize → compile),
+  and passing the L1–L5 validation gates before submitting a PR. Use when
+  a contributor says "I want to add support for model X", "this model type
+  is not supported", or "how do I write a recipe for a new architecture".
+
+ +

When to use

+
    +
  • "I want to add support for Qwen3 / Phi-4 / [new model]"
  • +
  • "winml-cli says this model is unsupported"
  • +
  • "How do I write a recipe config for a new model family?"
  • +
+

Sections

+

1. Find the right exporter

+
winml inspect -m <hf_model_id>  # check if auto-detected
+
+ +

If inspect fails → the model needs a new exporter or recipe. +Look in src/winml/modelkit/export/ for existing exporters as reference.

+

2. Find a reference model of the same family +- Same architecture class (e.g., LlamaForCausalLM, BertModel)? +- Check recipes/ for an existing .json config for that class +- Prefer copying the closest recipe and adjusting rather than writing from scratch

+

3. Write the recipe config +Minimal recipe template:

+
{
+  "model_id": "org/model-name",
+  "task": "text-generation",
+  "export": { "opset": 17 },
+  "optimize": { "passes": ["MatMulAddFusion", "LayerNormFusion"] },
+  "quantize": { "mode": "w8a16", "calibration_dataset": "wikitext2" }
+}
+
+ +

4. Validate at each stage (L1 → L5)

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Stage | Command | Pass criterion |
|---|---|---|
| L1: Export loads | winml inspect -m <exported>.onnx | No error |
| L2: Shape correct | winml eval -m <exported>.onnx --model-id <id> | Output shape matches |
| L3: Numerical parity | winml eval --mode compare -m <quantized>.onnx --model-id <id> | cosine ≥ threshold |
| L4: Task accuracy | winml eval -m <quantized>.onnx --model-id <id> | Task metric in spec |
| L5: Perf on target EP | winml perf -m <compiled>.onnx --device <target> | Meets latency target |
+

5. Common pitfalls for new models +- New op types not in operator coverage → run winml analyze early +- Attention variant (GQA, MQA, MLA) → check quantization mode compatibility +- Dynamic shapes → add explicit shape hints in export config +- Non-standard tokenizer → verify winml run input preprocessing

+

Cross-references: +- If EP shows unsupported ops → check-model-feasibility +- After L1–L5 all pass → ship-to-winapp for PR gate

+
+

Skill: adding-ep-support (contributor)

+

Frontmatter

+
name: adding-ep-support
+description: >
+  Use this skill when adding a new execution provider (EP) backend to
+  winml-cli. Covers implementing the compile backend interface, adding
+  EP-specific optimize passes, wiring the new EP into winml sys and
+  winml analyze, and verifying coverage with the L1–L5 test gates.
+  Use when a contributor says "I want to add support for a new EP",
+  "how does the QNN compile backend work", or "can we support EP X".
+
+ +

When to use

+
    +
  • Adding a new EP compile backend (e.g., a new NPU vendor)
  • +
  • Extending an existing EP with new optimization passes
  • +
  • Understanding how the existing QNN / OpenVINO / VitisAI backends are structured
  • +
+

Sections

+

1. EP backend interface +Reference implementation: src/winml/modelkit/compile/qnn_backend.py +Three methods to implement:

+
class MyEPBackend(CompileBackend):
+    def is_available(self) -> bool: ...      # detect EP on current machine
+    def optimize(self, model, config): ...   # EP-specific graph transforms
+    def compile(self, model, config): ...    # produce EP-locked artifact
+
+ +

2. Wire into EP registry +Register in src/winml/modelkit/ep_registry.py:

+
EP_REGISTRY["myep"] = MyEPBackend
+
+ +

This makes --ep myep work in winml config, winml compile, winml analyze.

+

3. Add operator coverage data +Add a coverage JSON to src/winml/modelkit/analyze/coverage/myep_ops.json:

+
{ "Add": "supported", "LayerNorm": "partial", "CustomOp": "unsupported" }
+
+ +

This is what winml analyze --ep myep reads.

+

4. Add to winml sys output +Add EP availability check to src/winml/commands/sys.py so it appears +in winml sys --list-ep.

+

5. L1–L5 validation for the new EP +Minimum before merging: +- L1: A known-good model compiles without crash +- L3: Compiled artifact passes winml eval --mode compare (cosine threshold) +- L5: winml perf produces valid latency output on target hardware

+

Cross-references: +- Operator coverage analysis → check-model-feasibility +- After adding: document the EP in the check-model-feasibility hardware table

+
+

Skill: contributing-a-skill (contributor)

+

Frontmatter

+
name: contributing-a-skill
+description: >
+  Use this skill when writing a new SKILL.md for winml-cli or improving
+  an existing one. Covers frontmatter requirements, description writing
+  (the description is the agent trigger, not a human summary), section
+  structure conventions, cross-reference format, command accuracy
+  requirements, and the review checklist before submitting. Use when a
+  contributor says "I want to add a new skill", "how should I write
+  SKILL.md", or "what are the skill authoring rules".
+
+ +

When to use

+
    +
  • Writing a new skill for a gap not covered by existing skills
  • +
  • Improving an existing skill with new commands or sections
  • +
  • Reviewing a skill PR
  • +
+

Sections

+

1. Frontmatter rules

+
name: kebab-case-skill-name   # matches directory name under skills/
+description: >
+  Use this skill when <trigger phrase describing user's problem>.
+  Covers <what the skill teaches>.
+  Use when the user says "<example trigger phrase 1>", "<example 2>", or <condition>.
+
+ +

Critical: The description field is what the Copilot agent reads to decide +whether to activate this skill. Write it as a trigger specification, not a +documentation summary. Include representative user phrases in quotes.

+

2. Required sections (in order) +1. ## When to use — 3–5 bullet points with user-facing symptoms/questions +2. Diagnostic or decision section — symptom → cause → fix structure +3. Command examples — runnable winml commands with real flags +4. Reference tables — hardware, thresholds, EP names as concrete data +5. ## Cross-references — links to related skills using relative paths

+

3. Cross-reference format

+
- If accuracy dropped → see `.agents/skills/debug-accuracy-drop/SKILL.md`
+- Before shipping → see `.agents/skills/ship-to-winapp/SKILL.md`
+
+ +

4. Content rules +- All commands must be runnable exactly as written (no pseudocode flags) +- Include concrete numbers: thresholds (cosine ≥ 0.99), speedup (3–5×), latency (<50ms) +- Target ~200 lines prose + tables; move deep content to references/ subdirectory +- Do not duplicate content from another skill — cross-reference instead

+

5. Review checklist before PR +- [ ] description contains ≥3 quoted user trigger phrases +- [ ] All commands are tested and produce the described output +- [ ] Cross-references use relative paths and the linked skill exists +- [ ] No commands reference flags that don't exist in current winml --help +- [ ] Hardware names and EP names match the canonical list in check-model-feasibility +- [ ] evals/eval.yaml exists with ≥2 test cases (including at least one negative assertion)

+
+

Skill: autoconfig (user — optimize the model: automated loop + manual framework)

+

The optimize skill. Two modes: automated (the autoresearch loop — the bulk of this section) for +"figure it out for me / run overnight", and manual (the decision framework folded in from +optimize-for-device) for "I'll choose by hand" or when there is no target hardware to benchmark on.

+

Frontmatter

+
name: autoconfig
+description: >
+  Use this skill when a **WinApp developer** wants the best performance for their model on one or
+  more Windows EP/device targets — either by letting winml-cli search automatically, or by working
+  through the precision/EP tradeoffs by hand. Automated mode: an autonomous experiment loop that
+  proposes config.json hypotheses, runs winml build + eval + perf, evaluates against user-defined
+  objectives (accuracy floor, latency budget, or Pareto frontier), and iterates — keeping
+  improvements, discarding regressions; covers single-EP optimization, multi-EP parallel search,
+  mixed-precision (nodes_to_exclude) exploration, calibration tuning, and manifest.json output.
+  Manual mode: the latency-budget vs accuracy-floor decision framework, the FP32→FP16→W8A16→W8A8
+  precision ladder, a per-device hardware guidance table, and how to read tradeoff results.
+  Use when the user says "find the best config for my model on QNN", "automate the config search",
+  "generate configs for all EPs", "I want to leave this running overnight", "make it faster",
+  "which precision should I use", "is NPU worth it", or "compare QNN vs DirectML vs CPU".
+
+audience: external (WinApp developers)
+
+ +

When to use

+
    +
  • "Find the best W8A8 config that keeps accuracy > 0.95 on QNN"
  • +
  • "Generate optimized configs for QNN + DirectML + CPU and build a manifest"
  • +
  • "I don't know which quantization settings to use, figure it out for me" / "run overnight"
  • +
  • "Make it faster" / "which precision should I use" / "is NPU worth it" (→ manual mode)
  • +
  • "Compare QNN vs DirectML vs CPU for my model"
  • +
  • User has a latency SLA or accuracy floor but doesn't know how to achieve it
  • +
+

What this skill does NOT do

+
    +
  • It only searches within what winml build currently supports (existing capabilities)
  • +
  • It does not look for optimization techniques outside winml's current feature set
  • +
  • It does not suggest that winml needs new features or file bugs
  • +
  • For finding what winml is missing, use optimization-research instead
  • +
+
+

Manual mode — the decision framework (folded in from optimize-for-device)

+

Use this lightweight path when the user wants to decide by hand, or has no target hardware to +benchmark on (so the automated loop's perf gate can't run). It is the conceptual model the +automated loop below mechanizes.

+

1. The decision framework — two inputs: latency budget OR accuracy floor. +- Have a latency SLA (e.g. <50ms)? → find highest accuracy within that budget +- Have an accuracy floor (e.g. <2% drop)? → find fastest within that floor

+

2. The precision ladder — FP32 → FP16 → W8A16 → W8A8, with typical speedup and accuracy-drop +ranges per model family (Encoder/BERT-like, Vision/ConvNet, Transformer/ViT).

+

3. The sweep workflow — run winml build + winml eval + winml perf for each precision, +collect into a tradeoff table, apply the decision framework.

+
winml config -m <model> --device <device> --precision fp16 -o config_fp16.json
+winml build -c config_fp16.json -m <model> -o out_fp16/
+winml eval -m out_fp16/<artifact>.onnx --model-id <model>
+winml perf -m out_fp16/<artifact>.onnx --device <device> --iterations 50
+# repeat for w8a16, w8a8
+
+ +

4. Hardware-specific guidance table +| Device | Best EP | Sweet-spot precision | Notes | +|---|---|---|---| +| Snapdragon X Elite NPU | QNN | W8A16 | HTP native for W8A16; W8A8 risky for Attention | +| Intel Core Ultra NPU | OpenVINO | W8A8 | OpenVINO PTQ handles INT8 well | +| AMD Ryzen AI NPU | VitisAI | W8A8 | Phoenix/Hawk Point prefer INT8 | +| Any GPU | DirectML | FP16 | FP16 sufficient; quantization rarely helps on GPU | +| CPU fallback | CPU | W8A8 | Size + latency both benefit |

+

5. Reading the output — how to interpret winml eval cosine_similarity / SQNR and +winml perf p50/p90/p99; what values indicate "acceptable" vs "needs investigation".

+

When the user wants this automated instead of done by hand, continue to the autoresearch loop below.

+
+

Epistemic standard for autoconfig findings

+

Any conclusion this skill writes into a report or recommends to a user must meet this bar:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
RequirementWhat it means
Observation vs explanationState what was measured separately from why it happened. "latency increased 270ms" is fact. "because NHWC causes cache thrashing" is a hypothesis — label it as such unless confirmed by profiling.
Statistical validityA latency claim requires ≥ 3 independent runs with warmup. A single winml eval run (no warmup, includes preprocessing) is insufficient to quote as a latency number. It can guide search decisions but not final reports.
Mechanism confirmationDo not explain a regression unless the mechanism is confirmed (e.g., by profiler, by op-level timing, or by source code inspection of ORT/QNN SDK). If unknown, write "cause unconfirmed; further profiling needed."
Scope boundaryResults measured on one model/EP are never generalized to other models/EPs without explicit qualification. "On ConvNext-tiny CPU" is allowed. "CPU dislikes fusion" is not — it's an overgeneralization.
Unresolved uncertaintyIf an observation contradicts the expected behavior (e.g., a "disabled" fusion still appears in the output), the report must flag this as an open question, not silently adopt an explanation.
EP isolationA finding on one EP (positive or negative) MUST NOT be applied to prune the search space of a different EP without independent validation. CPU opset regression ≠ QNN NPU opset regression. Always validate per EP independently.
+

The skill MUST NOT write confident root-cause explanations in the HTML report or chat summary for regressions where only the measurement is available. Use hedged language: "this likely relates to…", "one hypothesis is…", or simply omit the explanation and recommend profiling.

+

Perf gain validation protocol

+

Before any perf gain is written into a report, config recommendation, or knowledge base as a confirmed finding, it must pass ALL three gates:

+

Gate 1 — Statistical: two-phase bench protocol (from GPU Optimizer V2)

+
Phase A — Quick screen (fast, ~2 min):
+  winml perf -m <model> --ep <ep> --device <device> --warmup 20 --iterations 200 -o screen.json
+  CV = screen.json.std / screen.json.p50
+  IF CV > 0.10 (10%): REJECT — high DVFS variance, measurement unreliable
+                       → cool down 120s, retry once
+                       → if still CV > 0.10: flag as [UNSTABLE], skip candidate
+
+Phase B — Full bench (only if Phase A passes, ~15 min):
+  # 3 independent sessions with 60s cool-down between each
+  winml perf ... --warmup 50 --iterations 1000 -o run1.json
+  sleep 60
+  winml perf ... --warmup 50 --iterations 1000 -o run2.json
+  sleep 60
+  winml perf ... --warmup 50 --iterations 1000 -o run3.json
+
+  # KEEP if ALL of:
+  #   1. p50(run1,2,3) are all faster than baseline p50 × (1 - min_improvement)
+  #   2. CV of each run < 0.10
+  #   3. cosine_similarity ≥ accuracy_floor
+  KEEP_threshold = baseline_p50 × 0.99   # ≥1% improvement required
+
+ +

Rationale: DVFS on mobile NPUs causes 2-10x run-to-run variance. CV check catches this before wasting 15 min on full bench.

+

Gate 2 — Mechanism: read ORT/QNN source code before explaining why

+

Gate 2 — Mechanism: read ORT/QNN source code before explaining why +- For QNN EP gains: check onnxruntime/core/providers/qnn/builder/ for opset-conditional dispatch +- For CPU EP gains: check onnxruntime/core/optimizer/ for pass applicability conditions +- For DML EP gains: check DML operator mapping tables +- Do not publish "opset 21 = 2.3x faster on QNN NPU" without confirming the mechanism in source code. It may be DVFS bias, not a real architectural difference.

+

Gate 3 — Reproducibility: baseline and candidate measured in same thermal state +- Run baseline and candidate back-to-back in the same session OR +- Use a device-level tool to lock NPU clock frequency +- If you cannot control thermal state, report min_ms (peak-performance ceiling) alongside p50 (typical performance), and flag the variance explicitly.

+

Lesson from ConvNext opset sweep (2026-06-10): +Initial opset 21 measurement (8.45ms, 50 iters) vs opset 17 (19.4ms) appeared to show 2.3x gain. Full 17-22 sweep with 50 iters each showed: +- All opsets min ~9-10ms (same peak capability) +- opset 17 p50=54ms, opset 19-22 p50=12ms — but opset 18 p50=43ms (bimodal) +- opset 21 std varied from 10ms (cool device) to 37ms (warm device) +Conclusion: data is inconclusive. Gain may be real OR may be thermal artifact. Gates 1+2 not yet passed.

+
+

Design Comparison: GPU Optimizer V2 vs WinML Autoconfig

+

Reference: "Agentic GPU Model Optimization" doc (cheye@, 2026-03-20). GPU Optimizer V2 is a 6-role multi-agent system for cloud GPU inference optimization (ONER-1B KNN service, H100). Autoconfig is a local edge inference optimizer (winml-cli, Snapdragon X). Most of their infrastructure (machine pool, SSH fleet, Triton serving, custom CUDA kernels, SM occupancy tuning) does not apply here. But the agent loop design has several directly adoptable ideas.

+

Adoptable insights from GPU Optimizer V2

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
V2 design decisionV2 rationaleAdopt into autoconfig?Notes
Two-phase bench: 200-iter quick screen → 3×1000-iter full bench"CV<2% gates full bench — avoid wasting time on high-variance results"YES — highest priority gapWe've been doing single 50-iter runs and calling them facts. CV check would have caught the DVFS noise immediately.
Verdict policy names (ThroughputOnly, ThroughputOrLatency…)"Named policies prevent Reviewer from ad-hoc criteria drift"✅ YES (simplified)Autoconfig should have explicit KEEP criteria: p50_ms < baseline × (1 - threshold) AND cosine ≥ floor
Append-only experiment_log.md + results.tsv written only by Reviewer"Single writer = no drift, full audit trail"✅ YESOur results.tsv exists but no "single writer" discipline
Explorer mandatory external-research triggers"After 15 consecutive DISCARDs → external research sweep"✅ YES — this is the exact gap that caused the opset 21 missIf we had this rule, we would have searched ORT source after N DISCARDs and found kMaxSupportedOpset earlier
Knowledge agent with review gate before KB save"Learnings reviewed before they prune future search"✅ YESep_knowledge/*.json entries should be marked draft until Gate 2 (mechanism) is confirmed
Correctness contract locked after Phase 0, never modified"Prevents accuracy goal-post moving"✅ YESWe have accuracy gate but no locked contract file
30-consecutive-DISCARD stop condition"Prevents endless search in exhausted space"✅ YESautoconfig has no stop condition today
Per-experiment structured output: Hypothesis → Implementation → Parity → Perf → Analysis → Decision"Enables post-analysis and knowledge extraction"✅ YESautoconfig report is currently holistic, not per-experiment
Role separation: Profiler / Explorer / Optimizer / Reviewer are separate agents"Prevents context drift; each agent stays focused"⚠️ PartialFull 6-agent split is overkill for CLI tool; but Explorer / Reviewer distinction is valuable
Resource lock: only one GPU job at a time"Prevents benchmark interference"✅ YES (trivially)Already serial; but should be explicitly enforced if autoconfig ever parallelizes
Machine pool + SSH fleet + Model RegistryCloud GPU fleet management❌ N/ALocal device only
Custom CUDA kernel writing"Extreme asymmetry benefits from custom kernels"❌ N/ACLI-only constraint; no kernel modification
SM occupancy / GEMM tile count tuning"H100 has 132 SMs; 48 output tiles = 36% occupancy"❌ N/AEdge NPU/GPU, not H100 multi-SM
FlashAttention / fused QKV"Eliminate HBM traffic for attention score matrix"❌ N/AModel is already trained; deployment-time optimization only
+

Key gaps in current autoconfig design (from V2 comparison)

+

Gap 1 (critical): No two-phase bench protocol +Current design runs --iterations 50 and accepts the result. V2 runs: +1. Quick screen: 200 iters, check CV < 2% (Coefficient of Variation = std/mean) +2. Only if CV < 2%: full bench 3×1000 iters with 60s cool-down between sessions +3. KEEP only if Δp50 > threshold AND CV(candidate) < 2%

+

This directly matches the "iter ≥ 1000" rule we just added. Formalize it as two phases.

+

Gap 2 (critical): No mandatory external-research trigger in Explorer +V2 Explorer triggers external research (web search, papers, source code) after: +- 15 consecutive DISCARDs +- Every KEEP that changes model/precision +- Before declaring backlog_empty

+

We discovered kMaxSupportedOpset only by accident (downloading QNN Hub models). A mandatory "read ORT source after 5 DISCARDs in opset dimension" rule would have found it in Phase 2.

+

Gap 3 (important): ep_knowledge/*.json has no draft/confirmed state +V2 Knowledge agent requires review gate before KB entries are used to prune search space. Our ep_knowledge findings should have: +- status: "draft" — observed, mechanism unconfirmed (Gate 2 not passed) +- status: "confirmed" — mechanism confirmed via source code (Gate 2 passed)
+- status: "deprecated" — finding invalidated by new experiment or ORT version change +Only "confirmed" entries should prune search space. "draft" entries inform hypothesis priority but don't prune.

+

Gap 4 (nice-to-have): No per-experiment structured artifact +V2 produces per-experiment: Hypothesis / Implementation / Parity / Perf / Analysis / Decision +autoconfig produces: one aggregate report.html. Should produce both.

+

Design: The Autoresearch Loop

+

Inspired by karpathy/autoresearch: +the agent modifies a config file, runs a fixed-cost experiment, checks whether the objective improved, keeps or discards the change, and repeats autonomously until it is manually stopped or the convergence criteria are met.

+
OBJECTIVE (user-defined, one of):
+  A. Accuracy-primary:  maximize cosine_similarity  subject to  p50_ms ≤ <budget>
+  B. Latency-primary:   minimize p50_ms             subject to  cosine ≥ <floor>
+  C. Pareto search:     find the full accuracy-latency frontier
+
+SEARCH SPACE — config.json has three sections the agent can modify:
+
+  [export]
+    opset_version          : int   — 17, 18, 19, 20  (higher = newer ops, EP may not support)
+    do_constant_folding    : bool  — may affect graph structure visible to EP
+    dynamic_axes           : dict  — static vs dynamic shapes (QNN prefers static batch=1)
+
+  [optimize]  — full capability list (from winml optimize --list-capabilities)
+
+    GraphPipe (run via ORT SessionOptions):
+      GELU:
+        gelu-fusion            : bool  — fuse tanh-GELU subgraph → Gelu op
+        fast-gelu-fusion       : bool  — fuse fast-GELU (tanh-approx) → FastGelu
+        bias-gelu-fusion       : bool  — fuse Bias+GELU (requires gelu-fusion)
+        quick-gelu-fusion      : bool  — fuse x*sigmoid(1.702x) → FastGelu
+        gelu-approximation     : bool  — convert exact Gelu → FastGelu (requires gelu-fusion)
+      Activation:
+        bias-softmax-fusion    : bool  — fuse Bias+Softmax
+        bias-dropout-fusion    : bool  — fuse Bias+Dropout
+      Convolution:
+        conv-add-fusion        : bool  — fuse Conv+Add (bias)
+        conv-bn-fusion         : bool  — fuse Conv+BatchNorm into weights
+        conv-mul-fusion        : bool  — fuse Conv+Multiply
+        conv-activation-fusion : bool  — fuse Conv+activation (ReLU, Sigmoid, etc.)
+      Elimination:
+        slice-elimination      : bool  — remove redundant Slice ops
+        expand-elimination     : bool  — remove no-op Expand
+        unsqueeze-elimination  : bool  — fold Unsqueeze into initializers
+      GEMM:
+        gemm-activation-fusion : bool  — fuse GEMM+activation
+        gemm-sum-fusion        : bool  — fuse GEMM+Sum
+        gemm-transpose-fusion  : bool  — fuse GEMM+Transpose
+      Graph:
+        concat-slice-elimination   : bool  — remove Concat+Slice that restore originals
+        double-qdq-pairs-remover   : bool  — remove consecutive QDQ pairs
+        constant-folding           : bool  — pre-compute constant exprs (default=True; disable to reduce size)
+      LayerNorm:
+        layer-norm-fusion          : bool  — fuse ReduceMean→Sub→Pow→Sqrt→Div→Mul→Add
+        skip-layer-norm-fusion     : bool  — fuse Add(residual)+LayerNorm → SkipLayerNorm (requires layer-norm-fusion)
+        simplified-layer-norm-fusion : bool — fuse simplified LayerNorm (no mean-centering)
+      Layout:
+        transpose-optimizer        : bool  — eliminate redundant transpose chains
+        nhwc-transformer           : bool  — NCHW→NHWC (GPU memory layout)
+        nchwc-transformer          : bool  — NCHW→NCHWc (CPU SIMD layout)
+        conv-add-activation-fusion : bool  — fuse Conv+Add+Activation → FusedConv
+      MatMul:
+        matmul-add-fusion          : bool  — fuse MatMul+Add → single kernel
+        matmul-activation-fusion   : bool  — fuse MatMul+activation (DML-only, requires matmul-transpose-fusion)
+        matmul-transpose-fusion    : bool  — fuse MatMul+Transpose → FusedMatMul
+        matmul-scale-fusion        : bool  — fuse MatMul+Scale
+        matmul-bn-fusion           : bool  — fuse MatMul+BatchNorm
+        dynamic-quantize-matmul-fusion : bool — dynamic quant for MatMul
+      Misc:
+        gather-slice-to-split-fusion : bool — fuse Gather+Slice → Split
+        gather-to-slice-fusion       : bool — convert Gather to Slice (contiguous idx)
+        pad-fusion                   : bool — fuse Pad with Conv/Pool
+        not-where-fusion             : bool — fuse Not+Where
+
+    FusionPipe (ORT transformer fusions, via FusionOptions):
+      attention-fusion              : bool  — fuse MHA pattern → Attention/MultiHeadAttention
+      layer-norm-fusion             : bool  — (FusionPipe variant, same flag)
+      skip-layer-norm-fusion        : bool  — (FusionPipe variant)
+      simplified-layer-norm-fusion  : bool  — (FusionPipe variant)
+      embed-layer-norm-fusion       : bool  — fuse Embedding+Position+LayerNorm (requires layer-norm-fusion)
+      bias-skip-layer-norm-fusion   : bool  — fuse Bias+SkipLayerNorm (requires skip-layer-norm-fusion)
+      fuse-rmsnorm                  : bool  — fuse RMSNorm → LpNormalization(p=2) [custom, QNN-compatible]
+      packed-qkv-fusion             : bool  — (SD only)
+      packed-kv-fusion              : bool  — (SD only)
+      skip-group-norm-fusion        : bool  — (SD only)
+      bias-add-fusion               : bool  — fuse BiasAdd
+      qordered-matmul               : bool  — (SD only)
+
+    SurgeryPipe (pre-EP graph fixes):
+      clamp-constant-values         : bool  — clamp -inf/+inf constants → [-1e3, 1e3] (prevents QNN quant issues)
+      remove-isnan-in-attention-mask: bool  — remove Softmax→IsNaN→Where guards (use after clamp)
+
+    RewritePipe (pattern-based subgraph rewriting):
+      --enable-{source-slug}-{target-slug}  (run winml optimize --list-rewrites for full list)
+      Examples: --enable-gelu-singlegelu, --enable-matmuladdpattern-reshapegemmreshapepattern
+
+  [quant]
+    precision              : fp16 | w8a16 | w8a8
+    calibration_method     : minmax | entropy | percentile
+    samples                : 64 | 128 | 256 | 512
+    per_channel            : bool
+    symmetric              : bool
+    op_types_to_quantize   : list[str]  — restrict which op types get quantized
+    nodes_to_exclude       : list[str]  — exclude specific named nodes
+
+FIXED:  winml build + winml eval + winml perf  (the experiment harness)
+METRIC: cosine_similarity  (from winml eval --format json)
+        p50_ms             (from winml perf --format json)
+RECORD: results.tsv
+
+ +
+

Profiler-Enhanced Agent Architecture (redesigned)

+

Insight from GPU Optimizer v2 analysis and ConvNext POC: +Running the profiler before the search loop would have shown Gemm=57.7% on ConvNext — +immediately ruling out layout-pass experiments (Transpose only 2.6%) and GELU-fusion experiments +(Gelu already fused into its canonical op). Profile-first makes the Explorer smarter and the search shorter.

+

New 4-phase structure:

+
┌─────────────────────────────────────────────────────────────────────┐
+│ PHASE 0 — INTAKE                                                    │
+│   winml inspect → validate model is supported                       │
+│   winml build (baseline config) → get model.onnx                   │
+│   winml eval --mode compare → lock FP32 correctness baseline        │
+│   winml perf (baseline) → establish latency floor                   │
+└────────────────────────────┬────────────────────────────────────────┘
+                             ▼
+┌─────────────────────────────────────────────────────────────────────┐
+│ PHASE 1 — PROFILE  (runs ONCE, before any search)                   │
+│   winml perf -m baseline/model.onnx --ep <ep> --profile             │
+│   Parse bottleneck.json:                                            │
+│     - top_bottleneck: op type with highest % of kernel time         │
+│     - top3_concentration_pct: how concentrated the compute is       │
+│     - headroom_hints: actionable pass recommendations               │
+│   Classify each bottleneck op type:                                 │
+│     - "compute" (Gemm, Conv, Attention) → quant/kernel matters      │
+│     - "layout" (Transpose, Reshape) → graph pass matters            │
+│     - "already_canonical" (op shows as fused type) → fusion N/A    │
+│   Output: prioritized_hypothesis_queue (ordered by profile evidence)│
+└────────────────────────────┬────────────────────────────────────────┘
+                             ▼
+┌─────────────────────────────────────────────────────────────────────┐
+│ PHASE 2 — PROFILE-GUIDED OPTIMIZATION LOOP                          │
+│                                                                     │
+│  ┌──────────────┐    ┌──────────────┐    ┌─────────────────────┐  │
+│  │   EXPLORER   │───►│  OPTIMIZER   │───►│      REVIEWER       │  │
+│  │              │    │              │    │                     │  │
+│  │ Pops next    │    │ Runs ONE     │    │ Cross-exp verdict:  │  │
+│  │ hypothesis   │    │ experiment:  │    │ - CV gate Phase A   │  │
+│  │ from queue,  │    │ build +      │    │ - full bench Gate 1 │  │
+│  │ motivated by │    │ quick-screen │    │ - keep / discard    │  │
+│  │ profile data │    │ → full bench │    │ - detect plateau    │  │
+│  │              │    │ → eval       │    │ - stop condition    │  │
+│  └──────────────┘    └──────────────┘    │ - write KB draft   │  │
+│         ▲                               └─────────────────────┘  │
+│  mandatory external-research triggers (adopted from V2):           │
+│    • after 5 consecutive DISCARDs in same search dimension         │
+│      → search ORT/QNN SDK source code for mechanism               │
+│    • after every KEEP that changes precision or EP                 │
+│      → re-read ep_knowledge for updated constraints                │
+│    • before declaring search_space_exhausted                       │
+│      → ORT source sweep: opset gates, EP-specific dispatch rules   │
+│                                                                     │
+│  Explorer prunes via bottleneck.json (only "confirmed" KB rules):  │
+│    IF top_bottleneck == "Gemm" (>50%):                              │
+│      → SKIP layout passes (transpose-optimizer, nchwc, nhwc)        │
+│      → FOCUS on: quant precision, calibration, matmul fusions       │
+│    IF top_bottleneck == "Transpose" (>10%):                         │
+│      → CHECK kMaxSupportedOpset for current ORT version FIRST       │
+│    IF top_bottleneck == "Conv" (>20%):                              │
+│      → try nchwc-transformer, conv-activation-fusion               │
+│    IF "Gelu"/"LayerNormalization" op_type (already canonical):      │
+│      → SKIP corresponding fusion flags                              │
+└────────────────────────────┬────────────────────────────────────────┘
+                             ▼
+┌─────────────────────────────────────────────────────────────────────┐
+│ PHASE 3 — REPORT                                                    │
+│   config_<ep>_optimal.json  ← champion config with _autoconfig_meta│
+│   report.html               ← full benchmark + profile section      │
+│   experiments/<n>/          ← per-exp: hypothesis/impl/parity/     │
+│                                perf/analysis/decision (V2 pattern)  │
+│   kb_entry.json             ← status="draft"; promoted to          │
+│     "confirmed" only after mechanism confirmed (Gate 2)             │
+└─────────────────────────────────────────────────────────────────────┘
+
+ +

ep_knowledge draft/confirmed lifecycle (Gap 3 fix):

+
KB entry states:
+  "draft"     — observed perf delta, mechanism unconfirmed (Gate 2 not passed)
+                Can influence hypothesis PRIORITY but NOT prune search space
+  "confirmed" — mechanism confirmed via ORT/QNN source code (Gate 2 passed)
+                Can prune search space for future runs
+  "deprecated"— finding invalidated by new experiment or stack version change
+                Must NOT influence search space; kept for history only
+
+Transition rules:
+  draft → confirmed:   requires mechanism_confirmed=true + source_citation
+  confirmed → deprecated: requires contradicting experiment OR stack version bump
+  deprecated entries:  kept in JSON with status field, never deleted
+
+ +

Profiler output → Explorer mapping table:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Profile findingExplorer actionHypothesis skipped
Gemm > 50%Prioritize quant/calib experimentsAll layout-transform passes
Transpose < 5% (opset=17)Transpose Optimizer already workingtranspose-optimizer trials
op_type "Gelu" presentAlready fusedgelu-fusion, fast-gelu-fusion
op_type "LayerNormalization" presentAlready fusedlayer-norm-fusion trials
Reorder{Input,Output} present (>4%)NCHWc already activenchwc-transformer trials
op_type "Attention" presentMHA already fusedattention-fusion trials
QDQ ops > 15%Quant overhead highFocus on op_types_to_quantize exclusions
Transpose > 10% + opset ≥ 19kMaxSupportedOpset issueFlag as [KNOWN_TRADEOFF], lower opset
+

Why profile-first matters (validated on ConvNext):

+

The ablation experiment ran 22 experiments over multiple days. Had the profiler run first: +- Profile shows: Gemm=57.7%, Conv=12.6%, Transpose=2.6%, Gelu=8% (already "Gelu" op) +- Explorer would have immediately skipped: gelu-fusion, layer-norm-fusion, transpose-optimizer, + nchwc-transformer (already active via ReorderInput/Output) +- Only candidates from profile: matmul-add-fusion (Gemm bottleneck), conv-activation-fusion +- This would have reduced 22 experiments to ~6, with the same conclusions

+

POC profiler: C:\tmp\autoconfig-demo\winml_profile.py +- Uses ORT enable_profiling=True + end_profiling() (same pattern as AI Studio's profile_file.py) +- CPU EP: parses _kernel_time events from ORT JSON trace +- Output: bottleneck.json (structured) + bottleneck.txt (human-readable) + raw ORT trace +- ConvNext result: Gemm 57.7%, Conv 12.6%, Transpose 2.6% → confirms baseline is optimal for CPU

+
+

Sections

+

1. Phase 0 — Intake + Baseline

+
# Step 1: verify the model is supported
+winml inspect -m <model-id> --format json
+
+# Step 2: baseline build (default config, opset=17)
+winml export -m <model-id> -o baseline/
+winml build -c config_baseline.json -m <model-id> -o baseline_built/
+
+# Step 3: correctness contract
+winml eval --mode compare -m baseline_built/model.onnx --model-id <model-id> --format json
+# Expected: cosine=1.0 (FP32 self-comparison)
+
+# Step 4: baseline perf
+winml perf -m baseline_built/model.onnx --ep <ep> --warmup 10 --iterations 50 --format json
+# Record: baseline_p50_ms
+
+ +

Initialize results.tsv (TSV, not CSV — commas break in description field):

+
commit  precision   nodes_excluded  cosine  task_accuracy   p50_ms  calibration_samples status  notes
+
+ +
+

2. Phase 1 — Profile (runs once, BEFORE any search experiments)

+
# Run profiler on baseline model (--profile flag added to winml perf)
+winml perf -m baseline_built/model.onnx --ep <ep> \
+  --warmup 5 --iterations 20 --profile --out profile_out/ --format json
+# Reads: profile_out/bottleneck.json
+# POC (before --profile ships): python winml_profile.py --model ... --ep ...
+
+ +

Profiler output drives Explorer hypothesis initialization:

+
READ bottleneck.json:
+  top_bottleneck: <op_type>
+  op_summary: [{op_type, pct}, ...]  (sorted by descending pct)
+  headroom_hints: [...]
+
+BUILD skip_set (passes not worth trying):
+  FOR each op_type in op_summary:
+    IF op_type == "Gelu":          skip_set.add(gelu-fusion, fast-gelu-fusion)
+    IF op_type == "LayerNormalization": skip_set.add(layer-norm-fusion)
+    IF op_type == "Attention":     skip_set.add(attention-fusion)
+    IF "ReorderInput" in op_summary AND pct > 2%:
+                                   skip_set.add(nchwc-transformer)  # already active
+  IF Transpose pct < 5% AND opset=17:
+                                   skip_set.add(transpose-optimizer)  # already working, no gain
+  IF Transpose pct > 10% AND opset >= 19:
+                                   flag as [KNOWN_TRADEOFF]; add to report
+
+BUILD priority_queue (hypotheses in evidence-based order):
+  IF top_bottleneck == "Gemm" OR top_bottleneck == "MatMul":
+    queue: [quant_precision, calib_method, calib_samples, matmul_fusions, per_channel]
+  IF top_bottleneck == "Conv":
+    queue: [nchwc (if not in skip_set), conv_fusions, quant_precision]
+  IF top_bottleneck == "Attention":
+    queue: [quant_precision, nodes_to_exclude (Attention), calib_method]
+  DEFAULT:
+    queue: [quant_precision, calib_method, calib_samples]
+
+ +
+

3. Phase 2 — Profile-Guided Optimization Loop (single EP)

+
LOOP FOREVER (until user stops or convergence):
+
+1. EXPLORER: pop next hypothesis from priority_queue
+   - Skip if in skip_set (pruned by profile)
+   - If queue empty → enter Phase 4 (generalization) or stop
+
+2. HYPOTHESIZE: build config.json delta based on hypothesis
+   Hypothesis rules (profile-informed, in priority order):
+   a. If first loop: start with full W8A8/W8A16, all ops quantized
+   b. If cosine < floor: add worst partial_op to nodes_to_exclude (one at a time)
+   c. If cosine ≥ floor but latency > budget: try W8A8 instead of W8A16,
+      or reduce calibration_samples, or add per_channel=true
+   d. If stuck (3 iterations no improvement): try calibration_method change
+      (minmax → entropy → percentile)
+   e. If still stuck: try precision escalation (W8A8 → W8A16 → FP16)
+
+3. MODIFY: write updated config.json
+   Key fields in quant section:
+   {
+     "precision": "w8a8",
+     "samples": 128,
+     "calibration_method": "minmax",
+     "nodes_to_exclude": ["LayerNorm_0", "Softmax_3"],
+     "per_channel": false
+   }
+
+4. OPTIMIZER: winml build -c config.json -m <model-id> -o out_<iteration>/
+   If build crashes: log as "crash", revert config, try different hypothesis
+
+5a. EVAL — quick sanity (cosine proxy, cheap):
+    winml eval --mode compare -m out_<iteration>/artifact.onnx \
+               --model-id <model-id> --format json
+    → cosine_similarity, sqnr_db
+    If cosine < hard_floor (e.g. 0.85): fail-fast, skip step 5b + 6, log as discard
+
+5b. EVAL — task accuracy (real quality gate):
+    winml eval -m out_<iteration>/artifact.onnx \
+               --model-id <model-id> \
+               --task <task>  --device <target> --ep <ep> \
+               --samples 100 --format json
+    → top1_accuracy (image-classification), f1 (text), mAP (detection), etc.
+    This is the authoritative accuracy metric for Reviewer verdict.
+
+    Why cosine alone is not sufficient:
+    - High cosine (0.97) but top-1 drops 5%: logit magnitudes preserved but relative ranking shifted
+    - Low cosine (0.92) but same top-1: relative ranking unchanged despite numeric difference
+    → Only task accuracy tells you whether the model still does its job
+
+6. PERF: winml perf -m out_<iteration>/artifact.onnx \
+         --device <target> --ep <ep> --warmup 10 --iterations 50 --format json
+   → p50_ms, p90_ms
+
+7. REVIEWER: cross-experiment verdict
+   keep    if task_accuracy ≥ accuracy_floor  AND  p50_ms ≤ latency_budget
+   discard if task_accuracy < accuracy_floor  OR   p50_ms > latency_budget
+   crash   if build/eval failed
+
+   Reviewer also checks:
+   - Plateau: 3+ keeps with Δlatency < 2% → likely at local optimum
+   - Profile divergence: if new op_type appears after build, re-profile
+   - Skip_set update: if experiment proves a pass is a no-op, add to skip_set
+   - Accuracy cliff: if task_accuracy drops > 3% in one step → flag, do not cascade
+
+8. LOG to results.tsv:
+   <git-short-hash>  <precision>  <nodes_excluded>  <cosine>  <top1_acc>  <p50_ms>  <samples>  keep/discard/crash  <notes>
+
+9. If keep: advance to next iteration from this config
+   If discard: revert to last kept config, try different hypothesis
+
+ +

Convergence criteria (stop the loop): +- task_accuracy ≥ accuracy_floor AND p50_ms ≤ latency_budget (the same gates the Reviewer uses for keep): objective achieved +- 5 consecutive discards with no improvement: report best so far +- User manually stops the agent

+
+

3. Hypothesis generation rules (the intelligence layer)

+

The agent generates hypotheses by traversing the search space in priority order. +Each hypothesis is motivated by diagnostic data from the previous experiment, not random search.

+

Priority ordering across the three config sections:

+
Phase 1 — establish baseline (iteration 0)
+  Start with: opset_version=17, all fusions enabled, precision=w8a16, minmax, 128 samples
+
+Phase 2 — precision first (fastest to try, most impact)
+  If cosine < floor:
+    w8a8  → try w8a8 with selective exclusions, or w8a16 first
+  If latency > budget:
+    w8a16 → try w8a8 (smaller model, faster inference)
+    fp16  → try w8a16 (if currently at fp16)
+
+Phase 3 — calibration tuning (if precision is right but cosine still low)
+  Try in order: minmax → entropy → percentile
+  Try increasing samples: 128 → 256 → 512
+  Try per_channel=true (better accuracy, slightly slower build)
+  Try symmetric=false if currently true
+
+Phase 4 — optimize pass tuning (independent of quant, affects graph structure)
+  Hypothesis: some fusion patterns create op shapes QNN handles poorly
+  Transformer models (try in order):
+    attention-fusion → skip-layer-norm-fusion → layer-norm-fusion → fuse-rmsnorm
+  Vision models (try in order):
+    conv-bn-fusion → conv-add-fusion → conv-activation-fusion
+  Shared (try if cosine drops or build crashes):
+    constant-folding=false  (prevents size bloat; sometimes exposes EP-incompatible shape)
+    clamp-constant-values=true  (fixes -inf attention mask → quantization issues)
+    remove-isnan-in-attention-mask=true  (use after clamp; cleans dead IsNaN guards)
+  Try opset_version: 17 → 18 → 19
+    (Higher opsets expose newer op types that may have better EP support)
+
+Phase 5 — selective node exclusion (when analyze shows partial ops)
+  Read winml analyze --format json → partial_ops list
+  Exclude one partial_op at a time (greedy: exclude highest-impact first)
+  Also try excluding op_types_to_quantize selectively
+    e.g., remove "LayerNorm" from op_types_to_quantize list
+
+Phase 6 — combined search (if single-dimension changes are stuck)
+  Try combinations of best Phase 3 + Phase 4 + Phase 5 changes together
+
+ +

Diagnosis table — what to try given what you see:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
SymptomLikely causePhase to try next
cosine drops a lot at quant stage, all ops supportedCalibration data mismatchPhase 3: entropy calib, more samples
cosine drops at quant, Attention ops partialAttention activation quant on QNNPhase 5: exclude Attention nodes
cosine OK but latency worse than CPUFusion pattern creating unoptimized subgraphPhase 4: disable attention-fusion, try different opset
cosine OK but model larger than expectedConstant folding inlining large weightsPhase 4: constant-folding=false
Both cosine and latency good at w8a8 but build crashesopset op not supported by quant pipelinePhase 4: opset_version 17 → 16
cosine highly variable across seedsCalibration with too few samplesPhase 3: 128 → 256 samples
All ops supported, cosine still drops after fusionsFusion creates non-quantizable shapePhase 4: disable skip-layer-norm-fusion
QNN build fails with "invalid scale"-inf in attention mask initializerPhase 4: clamp-constant-values=true
Vision model: accuracy drops unexpectedlyConv+BN fusion slightly changes weight valuesPhase 4: disable conv-bn-fusion
MatMul-heavy model: latency not improvingMatMul not being fusedPhase 4: matmul-add-fusion, matmul-transpose-fusion
RMSNorm model (Llama etc.) poor QNN perfORT not recognizing RMSNorm patternPhase 4: fuse-rmsnorm=true
+

This is the key difference from grid search: each hypothesis is motivated by diagnostic data from winml analyze and the previous experiment result.

+
+

4. Multi-EP config generation

+

Run one search loop per target EP (sequentially or in parallel), then aggregate the results into manifest.json:

+
# Agent runs loops for each EP (can be sequential or parallel):
+# Loop 1: ep=qnn,   target_device=npu
+# Loop 2: ep=dml,   target_device=gpu
+# Loop 3: ep=cpu,   target_device=cpu
+
+# After all loops complete, agent generates:
+# - config_qnn_optimal.json   (best config found for QNN)
+# - config_dml_optimal.json   (best config found for DirectML)
+# - config_cpu_optimal.json   (best config found for CPU)
+
+# Then builds final artifacts and assembles manifest.json
+
+ +

Generated manifest.json includes experiment provenance:

+
{
+  "model_id": "microsoft/resnet-50",
+  "generated_by": "autoconfig",
+  "experiments_run": 34,
+  "variants": [
+    {
+      "ep": "qnn", "device": "npu",
+      "file": "model_qnn.onnx",
+      "precision": "w8a16",
+      "nodes_excluded": ["MultiHeadAttention"],
+      "cosine_similarity": 0.972,
+      "p50_ms": 18.3,
+      "config": "config_qnn_optimal.json"
+    },
+    {
+      "ep": "dml", "device": "gpu",
+      "file": "model_dml.onnx",
+      "precision": "fp16",
+      "nodes_excluded": [],
+      "cosine_similarity": 0.999,
+      "p50_ms": 22.1,
+      "config": "config_dml_optimal.json"
+    },
+    {
+      "ep": "cpu", "device": "cpu",
+      "file": "model_cpu.onnx",
+      "precision": "w8a8",
+      "nodes_excluded": ["LayerNorm"],
+      "cosine_similarity": 0.931,
+      "p50_ms": 84.7,
+      "config": "config_cpu_optimal.json"
+    }
+  ],
+  "selection_order": ["qnn", "dml", "cpu"]
+}
+
+ +
+

5. results.tsv format

+

Track all three config sections per experiment (TSV, not CSV):

+
commit  opset   fusions_disabled    precision   nodes_excluded  cosine  p50_ms  calib_samples   calib_method    status  notes
+baseline    17  []  fp32    []  1.000   —   —   —   keep    FP32 reference
+a1b2c3d 17  []  w8a8    []  0.871   16.2    128 minmax  discard full W8A8 too aggressive
+b2c3d4e 17  []  w8a16   []  0.967   19.8    128 minmax  keep    W8A16 baseline meets floor
+c3d4e5f 17  []  w8a16   []  0.969   19.1    256 entropy keep    entropy calib improvement
+d4e5f6g 17  [attention-fusion]  w8a16   []  0.971   18.4    256 entropy keep    disabling attn-fusion helps latency
+e5f6g7h 18  [attention-fusion]  w8a16   []  0.973   17.9    256 entropy keep    opset18 best so far
+f6g7h8i 18  [attention-fusion]  w8a8    [MultiHeadAttention]    0.961   14.2    256 entropy keep    mixed prec: meet latency budget
+
+ +
+

6. Skill outputs

+

autoconfig produces two primary outputs after convergence or user stop:

+

Output A: Best config file

+

config_<ep>_optimal.json — the winning config.json, ready to pass to winml build. Contains provenance metadata so it's reproducible:

+
{
+  "_autoconfig_meta": {
+    "model_id": "facebook/convnext-tiny-224",
+    "ep": "qnn",
+    "objective": "latency-primary",
+    "latency_budget_ms": 20,
+    "accuracy_floor": 0.95,
+    "experiments_run": 23,
+    "best_iter": "iter_17",
+    "timestamp": "2026-06-10T11:55:05+08:00"
+  },
+  "export": { "opset_version": 18 },
+  "optimize": { "attention-fusion": false },
+  "quantize": {
+    "precision": "w8a16",
+    "calibration_method": "entropy",
+    "calibration_samples": 256,
+    "nodes_to_exclude": ["MultiHeadAttention_0"]
+  }
+}
+
+ +

Output B: HTML benchmark report

+

report.html — self-contained single-file report (no external dependencies), viewable in any browser. Contains:

+

Section 1 — Summary card

+
Model:    facebook/convnext-tiny-224     EP: QNN (NPU)
+Objective: latency-primary ≤ 20ms       Accuracy floor: 0.95
+Result:   ✅ FOUND                       Experiments: 23  Time: 41 min
+
+Best config:  W8A16, entropy calib, 256 samples
+  Accuracy:   0.953  (floor 0.95 ✓)
+  p50 latency: 15.8ms  (budget 20ms ✓)
+
+ +

Section 2 — Search progress chart +Scatter plot: all 23 experiments, x=p50_latency_ms, y=accuracy. +- Green dot = kept (improvement) +- Red dot = discarded (regression) +- Star = best found +- Hover tooltip: iter ID, config diff vs previous

+

Section 3 — Iteration table +Full results.tsv rendered as sortable HTML table with columns:

+
iter | opset | precision | nodes_excluded | calib | accuracy | p50_ms | Δacc | Δlatency | status | hypothesis
+
+ +

Color-coded rows: green = keep, red = discard, gold = best.

+

Section 4 — Config diff timeline +Visual diff showing what changed between each kept iteration (config deltas as +/- lines).

+

Section 5 — Model graph analysis (from pre-search winml analyze) +- Op distribution pie chart (ONNX vs com.microsoft) +- EP compatibility table: ops supported/unsupported on target EP +- Detected patterns (GELU variant, attention structure, Transpose-sandwich)

+

Section 6 — Benchmark details +For the best config, full winml perf output: +- p10/p50/p90/p99 latency histogram +- Throughput (samples/sec) +- Warmup vs steady-state comparison +- (If multi-EP: side-by-side EP comparison bar chart)

+

Section 7 — Reproduction instructions

+
# Reproduce the winning config:
+winml build -c config_qnn_optimal.json -m facebook/convnext-tiny-224 -o out/
+# For NPU: always compile after build (empirically ~1.7× faster than the uncompiled model)
+winml compile -m out/model.onnx --device npu --ep qnn -o out_compiled/
+winml perf -m out_compiled/model_npu_ctx.onnx --ep qnn --iterations 100 --warmup 10
+
+ +

Report generation approach: The agent generates report.html using inline Python with Jinja2-style string templating and an inlined copy of Chart.js (not loaded from a CDN, which would break offline viewing). No external dependencies — single file, opens offline.

+
+

7. What the agent says in chat

+

After convergence or user stop (terminal summary, report is the real deliverable):

+
autoconfig completed. 23 experiments run (41 min).
+
+Best config (QNN NPU):
+  W8A16, entropy calib, 256 samples, MultiHeadAttention excluded
+  accuracy 0.953 ✓ (floor 0.95)   p50 15.8ms ✓ (budget 20ms)
+
+Outputs:
+  config_qnn_optimal.json   ← drop into winml build -c
+  report.html               ← open in browser for full benchmark breakdown
+
+Next: winml validate-before-ship for production gate.
+
+ +
+

8. Constraints and failure handling

+
    +
  • Build timeout: If winml build exceeds 15 minutes, kill and log as crash
  • +
  • OOM: If build fails with out-of-memory, reduce calibration_samples by half
  • +
  • All hypotheses exhausted: Report best config found, note convergence limit
  • +
  • Latency not measurable (target EP not on machine): run eval only, skip perf gate
  • +
+

9. CLI-only constraint (critical)

+

The agent MUST use only official winml CLI commands as its tool surface. No Python scripting, no direct ONNX manipulation, no third-party tools (onnxconverter-common, onnxsim, Olive, etc.) except where explicitly documented as a known workaround.

+

Rationale: autoconfig's output is a config.json + report.html that a user can reproduce with winml build -c config.json. If the agent used a Python hack to produce a model artifact, the config is not reproducible and the report is misleading.

+

Known workarounds (allowed, must be flagged in report): +| Workaround | Replaces | Tracking issue | Required flag in report | +|---|---|---|---| +| python winml_profile.py | winml perf --profile (not yet shipped) | pending | ⚠️ "Profile data via POC script, not official API" |

+

Gap reporting rule: If a hypothesis cannot be tested because the required winml CLI capability does not exist, the agent MUST: +1. Record the hypothesis as SKIPPED — CLI gap in the experiment table +2. Add an entry to Section 6 "Gaps & Issues" block in report.html: + GAP: <hypothesis> requires <missing capability> + Impact: <what speedup/accuracy improvement was not measurable> + Filed: <issue URL or "not yet filed"> +3. NOT silently substitute a Python workaround that produces unverifiable artifacts

+

Example gaps encountered during ConvNext QNN GPU validation: +- winml build --precision fp16 flag not available (#867) → FP16 native export untested → SKIPPED — CLI gap +- winml perf --ep-option not available (#865) → runtime flag sweep untested → SKIPPED — CLI gap +- winml perf --profile for QNN EP not available → profiling via POC script (allowed workaround) +- W8A8 QDQ ONNX on QNN GPU EP hangs indefinitely — root cause is QNN SDK behavior; winml build already prevents this via _patch_device(); fast-fail enhancement filed as #868 (low priority)

+
+

Key commands used

+
# Phase 1: profiling (--profile flag on winml perf, before search)
+winml perf -m baseline_built/model.onnx --ep <ep> --warmup 5 --iterations 20 \
+  --profile --out profile_out/ --format json
+# → profile_out/bottleneck.json  (machine-readable for Explorer)
+# → profile_out/bottleneck.txt   (human-readable summary)
+# POC: python winml_profile.py --model ... --ep ... (until --profile ships)
+
+# Phase 2: analysis (informs nodes_to_exclude hypotheses)
+winml analyze -m <exported>.onnx --ep <ep> --format json
+
+# Phase 2: experiment
+winml build -c config.json -m <model-id> -o out_<n>/
+
+# Phase 2: metrics
+winml eval --mode compare -m out_<n>/artifact.onnx --model-id <model-id> --format json
+winml perf -m out_<n>/artifact.onnx --device <target> --ep <ep> --iterations 50 --format json
+
+# Phase 3: compile best candidate to QNN EPContext (NPU only)
+# Eliminates JIT overhead; empirically ~1.7× further speedup on ConvNext W8A16
+winml compile -m best_candidate/model.onnx --device npu --ep qnn -o best_compiled/
+# → best_compiled/model_npu_ctx.onnx  (loads context binary at runtime)
+# → best_compiled/model_npu_ctx_qnn.bin  (QNN hardware-compiled graph)
+
+# Phase 3: re-benchmark compiled model
+winml perf -m best_compiled/model_npu_ctx.onnx --device npu --ep qnn --warmup 10 --iterations 50
+
+ +

Empirical data: ConvNext QNN NPU compile impact +| Version | p50 | vs FP32 NPU | +|---|---|---| +| FP32 baseline | 19.39ms | — | +| W8A16 quantized | 10.29ms | 1.9× | +| W8A16 + compile | 6.01ms | 3.2× | +→ winml compile alone adds ~1.7× on top of quantization. Always compile for NPU deployment.

+

Empirical data: ConvNext QNN GPU optimization sweep (Adreno X1-85) — full search +| Experiment | p50 | p90 | std | vs FP32 | Notes | +|---|---|---|---|---|---| +| FP32 baseline (autoconf) | 17.7ms | 19.7ms | 0.97 | — | ✅ OPTIMAL with current CLI | +| NHWC transformer | 19.5ms | 23.8ms | 3.43 | ❌ −10% | Hurts Adreno+QNN EP | +| NHWC + all GPU fusions | 18.1ms | 23.9ms | 2.71 | ❌ −2% | Still worse | +| Conv/norm fusions (no NHWC) | 17.6ms | 22.6ms | 5.51 | ≈0% | Variance ↑, no gain | +| LayerNorm rewrite | 18.4ms | 21.4ms | 2.04 | ❌ −4% | Pattern mismatch anyway | +| Transpose optimizer | 0% node Δ | — | — | no-op | Already optimal positions | +| HiDimRTR→LowDimRTR | 0% node Δ | — | — | no-op | ConvNext RTR doesn't match pattern | +| MatMulAdd→Conv2D (2d/3d/4d) | 0% node Δ | — | — | no-op | ConvNext uses Reshape→MatMul, not bare MatMul+Add | +| FP32 + compile | 23.7ms | — | — | ❌ −34% | Compile hurts GPU (opposite of NPU) | +| W8A8 QDQ quantized | hangs | — | — | ❌ blocked | #868 enhancement (fast-fail) | +| FP16 (invalid CLI path) | 8.8ms | ~32ms | bimodal | ⚠️ 2× p50 | BLOCKED — need #867 |

+

Root cause: why no pass matches ConvNext on QNN GPU +- All 251 ops run natively on GPU (251/0/0/0) — no CPU fallback to eliminate +- ConvNext linear layers: Reshape → MatMul → Reshape pattern, not bare MatMul+Add → Conv2D rewrites don't match +- 72 Reshape + 42 Transpose are already at minimum / optimal topology from PyTorch export +- winml build autoconf (gelu_fusion + matmul_add_fusion) already applied all relevant transforms +- The bottleneck is compute throughput + memory bandwidth — only FP16 (smaller tensors) can improve this

+

Key insight: gelu_fusion matters for variance, not p50 +| Version | p50 | p90 | std | +|---|---|---|---| +| Raw export (287 nodes, unfused Gelu) | 17.4ms | 29.2ms | 5.90 | +| Autoconf (251 nodes, fused Gelu+Gemm) | 17.7ms | 19.7ms | 0.97 |

+

Unfused Gelu = 5 separate GPU kernel launches (Div→Erf→Add→Mul→Mul) with scheduling jitter. +A single Gelu kernel eliminates dispatch overhead → p90 −33% (29.2ms → 19.7ms), std 6× lower (5.90 → 0.97). +→ autoconf's role on GPU is stability, not speedup. Critical for real-time / latency-SLA deployments.

+

QNN GPU search space exhausted. FP16 is the only remaining lever, blocked by #867.

+

Empirical data: ConvNext DML optimization sweep (Adreno X1-85, DirectML) +| Experiment | p50 | p90 | std | vs FP32 | +|---|---|---|---|---| +| FP32 baseline (autoconf, 251 nodes) | 16.9ms | 17.7ms | 0.52 | — ← OPTIMAL with current CLI | +| NHWC transformer | 16.5ms | 21.0ms | 1.89 | ❌ p90 worse | +| Raw unfused export (287 nodes) | 16.5ms | 18.4ms | 2.74 | ❌ p99=35ms, worse tail | +| FP16 (Python hack ⚠️) | 11.8ms | 12.8ms | 0.66 | ✅ 1.4× faster, clean dist — BLOCKED #867 |

+

DML vs QNN GPU comparison (same Adreno X1-85): +| | QNN GPU FP32 | DML FP32 | DML FP16 (invalid) | +|---|---|---|---| +| p50 | 17.7ms | 16.9ms | 11.8ms | +| p90 | 19.7ms | 17.7ms | 12.8ms | +| std | 0.97 | 0.52 | 0.66 |

+

→ DML is consistently faster and more stable than QNN GPU at FP32. Root cause: DML JIT-compiles HLSL shaders at model load time; QNN GPU EP does graph partitioning at each session creation. +→ DML FP16: no DVFS bimodal (unlike QNN GPU FP16) — DML's shader compilation locks in FP16 compute paths. +→ NHWC hurts DML too (same reason as QNN GPU: Adreno X1-85 + D3D12 doesn't benefit from explicit NHWC transforms). +→ Note: winml analyze returns 0/0/0/251 (all Unknown) for DML — no rule data. DML supports all standard ONNX ops by design.

+

QNN Hub benchmark comparison (Snapdragon X Elite CRD) — WITH cross-stack test

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ModelStackNPU p50GPU p50Notes
QNN Hub Float (opset 21, 222 nodes, MatMul)qairt cloud2.687msReference
QNN Hub Float (same model)winml ORT QNN EP8.78ms23.9msDirect test on this device
Our Float (opset 17, 251 nodes, Gemm)winml ORT QNN EP19.4ms17.7mswinml build output
QNN Hub W8A16 (opset 21, 798 QDQ, uint16 input)qairt cloud2.612msReference
QNN Hub W8A16 (same model)winml ORT QNN EP14.82ms (std=8.8!)ORT-QNN mismatch
Our W8A16 + compile (opset 17, ORT quant)winml ORT QNN EP6.01msBest we can do
+

Gap decomposition (two independent sources, measured at three points):

+
QNN Hub cloud:   2.7ms
+                  ↑ 3.3× Runtime gap  (qairt native vs ORT QNN EP adapter overhead)
+QNN Hub on winml: 8.78ms
+                  ↑ 2.2× Model graph gap (opset 21/MatMul/222 nodes vs opset 17/Gemm/251 nodes)
+Our model on winml: 19.4ms (FP32)
+
+ +

Actionable findings (updated 2026-06-10 — mechanism confirmed via ORT source): +1. opset 21 NPU speedup mechanism CONFIRMED — but ORT-version-dependent (#869) + - Root cause: kMaxSupportedOpset gate in IsSupportedOpset() (layout_transformation.cc). On older ORT where kMaxSupportedOpset < 21, opset 21 models bypass the NHWC layout transform entirely (transform_layout_fn = nullptr). + - Why bypass helps ConvNext: NHWC transform inserts Transpose(NCHW→NHWC/NHWC→NCHW) around Conv. ConvNext residual connections block full transpose cancellation → extra Transpose ops on HTP → slower. Bypassing = cleaner graph = faster. + - Critical caveat: Current ORT main has kMaxSupportedOpset = 26 → BOTH opset 17 and 21 get NHWC transform. Must verify ORT version before assuming the speedup exists. + - Does NOT generalize to: MobileNet/EfficientNet (no residual Transpose blocks), ViT (no Conv). + - Perf claim validation status: Gate 1 (iter≥1000×3) and Gate 3 (thermal control) still FAILED. Perf numbers are DVFS-dominated. +2. Runtime stack gap (3.3×) is structural: qairt native will always be faster. Correct baseline = "QNN Hub ONNX on winml" (8.78ms). +3. QNN Hub W8A16 is WORSE on our stack (14.82ms, std=8.8ms): opset 21 QDQ + uint16 input incompatible with ORT QNN EP format. +4. Opset is a search dimension — but the correct action is a FULL SWEEP (17–22), not "try 21 first". The optimal opset depends on ORT version.

+

EP-specific search space rules

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
EPQuantizationOpsetGraph passesCompileKey insight
QNN NPU✅ W8A16Full sweep 17-22 (mechanism ORT-version-dependent)autoconf (gelu+matmul_add)✅ AlwaysW8A8 catastrophic on LN+GELU; opset effect depends on ORT kMaxSupportedOpset
QNN GPU❌ Skip17 (opset 21 not validated)autoconf only❌ SkipCompile regresses; FP16 only lever (#867)
DML❌ Skip17 (opset 21 not validated)autoconf onlyN/AFP16 primary lever (#867); faster+stabler than QNN GPU
CPU❌ Skip17 only (kMaxSupportedOpset causes 3-4× regression on 19+)nchwc, matmul-add, geluN/AkMaxSupportedOpset gate hurts CPU for same reason it helps QNN
+

Rule: autoconfig must use EP-specific search space. Do NOT run quantization experiments for GPU/DML/CPU. +Rule: for QNN NPU opset sweep, verify ORT kMaxSupportedOpset first — if ≥ 22, all opsets get NHWC transform and the opset-based speedup may not apply. +Rule: for NPU, if W8A8 top-1 ≤ 15% on first attempt → skip all W8A8 variants, go directly to W8A16. +Rule: always run winml compile after finding best quantized config for QNN NPU. NEVER compile for GPU (regresses). +Rule: for GPU/DML, skip ALL graph optimization passes beyond what winml build autoconf applies (NHWC and additional fusions hurt). +Rule: W8A8 QDQ on GPU EP hangs — skip quantization immediately for GPU targets without testing.

+

User scenario mapping

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ScenarioHow autoconfig addresses it
S1: LLM fast support (7-30d)autoconfig replaces manual per-EP tuning; outputs config_optimal.json + report.html deployable in hours not days
S2: ISV non-LLM model supportExact use case: ISV brings model → autoconfig finds config → report is deliverable with SOP turnaround
S3: Cross-EP parityMulti-EP parallel run: same model, EP-specific search spaces in parallel → output config matrix per EP
S4: Customer ONNX can't runPhase 0 intake diagnoses "can't run" (partial ops → block reason); Phase 1+2 finds "escape config" for "runs poorly"
S5: PyTorch HF Hub coveragePhase 0 IS the "can WinML run it?" gate; failed Phase 0 → structured block reason feeds long-tail gap tracking
+

Dependencies on code changes: +- winml perf --profile (new flag) — adds per-op bottleneck output alongside existing latency metrics; POC script winml_profile.py exists to unblock +- --format json on winml eval (#847), winml analyze (#848), winml perf (#849)

+

Cross-references

+
    +
  • Run check-model-feasibility before starting to pick a model and verify the EP is available
  • +
  • After autoconfig completes → ship-to-winapp for final validation gates + packaging
  • +
  • If autoconfig cannot meet objective → debug-accuracy-drop for deeper diagnosis
  • +
  • Multi-EP output feeds directly into ship-to-winapp's manifest layout
  • +
  • If the best config found is still not good enough → escalate to optimization-research
  • +
+
+

Skill: optimization-research (contributor — internal, deep gap analysis)

+

Frontmatter

+
name: optimization-research
+description: >
+  Use this skill when a winml-cli engineer wants to find out whether a model can
+  be optimized better than what winml-cli currently achieves, identify what is
+  blocking that optimization, and produce concrete backlog work items.
+  The agent performs a deep search across: ORT source code and its optimizer
+  passes, Olive recipes and benchmarks, other ONNX ecosystem tools (onnxsim,
+  onnxoptimizer, neural-compressor, etc.), and native stack reference models
+  and datasets. It compares the best achievable result (using all available tools)
+  against what winml produces today, diagnoses the gap, and files GitHub issues
+  with reproduction steps. Use when an internal engineer says "why is this model
+  slower than it should be", "what optimization techniques are we missing",
+  or "what would it take to match Olive's results".
+
+audience: internal (winml-cli team engineers)
+
+ +

When to use

+
    +
  • "ConvNext on QNN is 3× slower than what Qualcomm's SDK achieves — why?"
  • +
  • "Olive gets 15ms on this model; winml gets 28ms — what's the gap?"
  • +
  • "We're seeing quantization accuracy drop on LLaMA; are there better calibration methods we're not supporting?"
  • +
  • "What would it take to match ORT's best-known config for this architecture?"
  • +
  • After autoconfig hits a ceiling: best config found is still not meeting the objective
  • +
+

What this skill produces

+

Primary outputs: +1. gap_analysis.md — structured report of what the best achievable result is and what's missing +2. repro/ — scripts to reproduce the better result using external tools +3. GitHub issues — one per identified gap, filed against winml-cli with: repro steps, expected vs actual, what ORT/Olive/ecosystem already does, proposed fix direction

+
+

Design: Deep Search Process

+
┌──────────────────────────────────────────────────────────────────┐
+│ PHASE 1 — BASELINE                                               │
+│   winml autoconfig best result for this model/EP                 │
+│   (or provided by user if already run)                           │
+└─────────────────────────┬────────────────────────────────────────┘
+                          ▼
+┌──────────────────────────────────────────────────────────────────┐
+│ PHASE 2 — EXTERNAL BENCHMARK                                     │
+│   Run same model through:                                        │
+│     A. ORT optimizer directly (onnxruntime.transformers)         │
+│     B. Olive (olive-ai) with ep-specific recipe                  │
+│     C. onnxsim + onnxoptimizer (static graph simplification)     │
+│     D. neural-compressor (Intel) for quantization comparison     │
+│   Record: best latency, accuracy, config used                    │
+└─────────────────────────┬────────────────────────────────────────┘
+                          ▼
+┌──────────────────────────────────────────────────────────────────┐
+│ PHASE 3 — GAP DIAGNOSIS                                          │
+│   For each gap (external better than winml):                     │
+│     a. Diff the ONNX graphs (what ops/patterns differ?)          │
+│     b. Read ORT optimizer source to understand what it does      │
+│     c. Check winml's capability registry — is this pass missing? │
+│        disabled by default? wired incorrectly?                   │
+│     d. Check Olive recipe — what flags/params does it use?       │
+│   Classify gap as one of:                                        │
+│     [MISSING_CAPABILITY]   — pass exists in ORT, not in winml   │
+│     [WRONG_DEFAULT]        — pass exists but wrong default/order │
+│     [BUG]                  — pass exists but produces wrong graph│
+│     [CALIBRATION_DATA]     — accuracy gap from calibration set   │
+│     [EP_LIMITATION]        — EP itself can't do this, not winml  │
+│     [KNOWN_TRADEOFF]       — intentional: winml trades X for Y   │
+└─────────────────────────┬────────────────────────────────────────┘
+                          ▼
+┌──────────────────────────────────────────────────────────────────┐
+│ PHASE 4 — NATIVE STACK VALIDATION                                │
+│   Check existing reference models in winml-cli test suite:       │
+│     - Are there models of this architecture in tests/models/?    │
+│     - Do their expected results match what we see?               │
+│   Check Windows AI Studio / WinML model zoo:                     │
+│     - Is this architecture listed? At what performance?          │
+│   Check QNN SDK reference benchmarks (if QNN EP):               │
+│     - Does QNN vendor claim better numbers for this model?       │
+└─────────────────────────┬────────────────────────────────────────┘
+                          ▼
+┌──────────────────────────────────────────────────────────────────┐
+│ PHASE 5 — WORK ITEMS                                             │
+│   For each [MISSING_CAPABILITY] or [WRONG_DEFAULT] gap:          │
+│     - Draft GitHub issue with: title, body, repro, expected,     │
+│       actual, proposed fix, ORT source pointer                   │
+│     - Estimate implementation complexity (S/M/L/XL)             │
+│   For [BUG]: file with full repro script                         │
+│   For [CALIBRATION_DATA]: suggest dataset and eval protocol      │
+│   For [EP_LIMITATION]: file with QNN/DML SDK reference           │
+└──────────────────────────────────────────────────────────────────┘
+
+ +
+

Key external tools to invoke

+
# A. ORT transformer optimizer (the "gold standard" for transformer models)
+python -c "
+from onnxruntime.transformers import optimizer
+from onnxruntime.transformers.fusion_options import FusionOptions
+opts = FusionOptions('bert')   # or 'gpt2', 'clip', etc.
+opts.enable_attention = True
+opts.enable_gelu = True
+model = optimizer.optimize_model(
+    'export.onnx', model_type='bert',
+    num_heads=12, hidden_size=768,
+    optimization_options=opts
+)
+model.save_model_to_file('ort_optimized.onnx')
+"
+
+# B. Olive (end-to-end, EP-aware)
+olive run --config olive_recipe.json
+# olive recipe template: see skills/optimization-research/templates/olive_qnn.json
+
+# C. onnxsim (structural simplification)
+python -m onnxsim export.onnx simplified.onnx
+
+# D. onnxoptimizer
+python -c "
+import onnxoptimizer, onnx
+m = onnx.load('export.onnx')
+passes = onnxoptimizer.get_available_passes()
+m2 = onnxoptimizer.optimize(m, passes)
+onnx.save(m2, 'onnxopt.onnx')
+"
+
+ +
+

Gap report format (gap_analysis.md)

+
# Optimization Gap Analysis: <model_id> on <ep>
+
+Date: <timestamp>
+winml-cli version: <version>
+ORT version: <version>
+
+## Summary
+| Tool | Latency p50 | Accuracy | Config notes |
+|---|---|---|---|
+| winml best (autoconfig) | 28.3ms | 0.953 | W8A16, entropy, 256 samples |
+| ORT transformer optimizer | 19.1ms | 0.951 | model_type=bert, all fusions |
+| Olive QNN recipe | 17.8ms | 0.948 | W8A8 + attention fusion |
+| **Gap** | **10.5ms (37%)** | — | — |
+
+## Gap 1: [MISSING_CAPABILITY] FusedMatMul with rotary embedding
+**What external tool does:** ...
+**What winml does:** ...
+**ORT source:** `onnxruntime/python/tools/transformers/fusion_rotary_attention.py`
+**Proposed fix:** Add RotaryAttentionFusion to FusionPipe capability registry
+**Estimated effort:** M
+
+## Gap 2: [WRONG_DEFAULT] attention-fusion disabled by default
+...
+
+ +
+

GitHub issue template

+
title: [optimization-gap] <model_arch>/<ep>: <gap description>
+
+body:
+## Summary
+<one-sentence description of what's missing>
+
+## Reproduction
+```bash
+# Install
+uv pip install winml-cli
+
+# Baseline (winml current)
+winml build -c config.json -m <model-id> -o winml_out/
+winml perf -m winml_out/model.onnx --ep <ep> --warmup 10 --iterations 50
+
+# Better result (external)
+<commands to reproduce the external result>
+
+ +

Expected vs actual

+
    +
  • External tool achieves: <X> ms at <accuracy>
  • +
  • winml achieves: <Y> ms at <accuracy>
  • +
  • Gap: <Y − X> ms (<Z>%)
  • +
+

Root cause

+

+

ORT source reference

+

+

Proposed fix direction

+

+

Complexity estimate

+

S / M / L / XL

+
---
+
+### What this skill does NOT do
+- Does not make code changes to winml-cli itself (files issues only)
+- Does not run production benchmarks (uses quick screening methodology)
+- Does not replace formal performance testing with validated hardware
+
+### Cross-references
+- `autoconfig` provides the winml baseline to compare against
+- Issues filed here feed `adding-ep-support` and `contributing-a-skill` workflows
+- Use `check-model-feasibility` to confirm EP availability before running external benchmarks
+
+---
+
+
+---
+
+## ConvNext Autoconfig POC — Rigorous Ablation Results
+
+**Source:** `C:\tmp\autoconfig-demo\ablation.py` — 4-phase rigorous ablation experiment
+**Measurement:** `winml perf --ep cpu --warmup 10 --iterations 50` — pure inference latency, no preprocessing
+**Design:** 3 independent runs per config; promotion threshold = max(3%, 2×σ_baseline); correctness gate (`winml eval --samples 20`) per config
+**Report:** `C:\tmp\autoconfig-demo\report.html` | **Config:** `C:\tmp\autoconfig-demo\config_cpu_optimal.json`
+
+### Graph structure (facebook/convnext-tiny-224, opset 17)
+
+**Op counts (raw export):** 287 nodes total
+
+ +

Add×72 Mul×54 Transpose×42 MatMul×36 LayerNormalization×23 +Conv×22 Div×18 Erf×18 ReduceMean×1 Gemm×1

+
**ConvNext block structure** (traced from first DW-Conv):
+
+ +

DW-Conv(7x7, g=96) → Transpose +→ LayerNormalization (native, already fused at export) +→ MatMul(C→4C) → Add(bias) +→ [GELU: Div → Erf → Add(1) → Mul → Mul(0.5)] ← 18 unfused in export +→ MatMul(4C→C) → Add(bias) [Gemm after ORT L2] +→ Mul (layer scale) → Add (residual) +→ Transpose (back to NCHW)

+
**Conv breakdown:** 4 regular (1×stem 4x4, 3×downsample 2x2 stride-2), 18×DW-Conv 7x7
+
+**Transpose patterns:**
+
+ +

19× Conv → Transpose → LayerNormalization (NCHW→NHWC for LN) +15× Mul → Transpose → Add (NHWC→NCHW for residual) + 4× LayerNormalization → Transpose → Conv (NHWC→NCHW for next DW-Conv) + 2× Add → Transpose → Conv + 2× Add → Transpose → LayerNormalization

+
→ ConvNext is a **Transpose-sandwich** model: alternates NCHW (Conv) and NHWC (LN) layout
+
+**Observed graph transformation (export.onnx → model.onnx after winml build, baseline config):**
+| Op | export.onnx | model.onnx (baseline) | Change |
+|---|---|---|---|
+| `com.microsoft/Gelu` | 0 | 18 | +18 |
+| `Gemm` | 1 | 37 | +36 |
+| `MatMul` | 36 | 0 | −36 |
+| `Add` | 72 | 18 | −54 |
+| `Mul` | 54 | 18 | −36 |
+| `Div`, `Erf` | 18 each | 0 | −18 each |
+| `Reshape` | 0 | 72 | +72 |
+
+**Observation (confirmed):** The baseline `model.onnx` (no user fusion flags) already differs substantially from `export.onnx`. GELU and MatMul+Add are fused before any user capability flag is applied.
+
+**Open question (unresolved):** The `ORTGraphPipe` design (graph.py) is supposed to disable `GeluFusion`/`GeluFusionL2`/`LayerNormFusion` in the baseline via `optimization.disable_specified_optimizers`. Yet the baseline output clearly contains `com.microsoft/Gelu`. This contradiction is unresolved — possible explanations include: ORT name mismatch in disabled list, a different code path fusing GELU, or the export step (via HF Optimum) applying fusion before winml. **This must be investigated before any mechanistic claims about "ORT L2 already does X" are written in user-facing reports.**
+
+---
+
+### Ablation results (rigorous, Phase 0–4)
+
+**Clean baseline:** 43.7ms p50 (base_0 + base_1, 6 runs, all within 42.5–45.4ms)
+
+| config | p50 mean | Δ vs baseline | runs (ms) | verdict |
+|---|---|---|---|---|
+| base_0 | 43.0ms | −0.6ms | 43.8 / 42.7 / 42.5 | baseline |
+| base_1 | 44.3ms | +0.6ms | 43.2 / 44.3 / 45.4 | baseline |
+| base_2 | 73.5ms | +29.8ms | 47.2 / **127.1** / 46.2 | outlier run (system spike) |
+| opset_18 | 48.0ms | +4.3ms | 50.2 / 44.0 / 49.7 | neutral |
+| **opset_19** | **160.3ms** | **+116ms** | **147.6 / 145.8 / 187.4** | **⚠️ SEVERE REGRESSION** |
+| **opset_20** | **131.0ms** | **+87ms** | **135.7 / 129.8 / 127.5** | **⚠️ SEVERE REGRESSION** |
+| **opset_21** | **170.3ms** | **+126ms** | **190.1 / 164.9 / 155.8** | **⚠️ SEVERE REGRESSION** |
+| **opset_22** | **85.0ms** | **+41ms** | **70.9 / 93.9 / 90.2** | **confirmed regression** |
+| no_cf_17 | 51.8ms | +8.1ms | 56.4 / 49.0 / 49.9 | mild regression |
+| base_mid | 49.4ms | +5.8ms | 51.3 / 51.1 / 45.9 | baseline (mid-exp drift) |
+| gelu_only | 52.5ms | +8.9ms | 53.0 / 55.6 / 49.1 | mild regression |
+| ln_only | 57.2ms | +13.6ms | **79.3** / 47.9 / 44.5 | inconclusive (outlier) |
+| conv_add | 50.2ms | +6.5ms | 47.3 / 55.9 / 47.4 | inconclusive |
+| conv_act | 51.2ms | +7.5ms | 45.2 / 41.9 / **66.4** | inconclusive (outlier) |
+| **matmul_add** | **81.7ms** | **+38.0ms** | **63.0 / 70.8 / 111.2** | **CONFIRMED REGRESSION** |
+| transpose_opt | 45.5ms | +1.8ms | 42.3 / 52.3 / 41.8 | neutral |
+| nchwc | 45.4ms | +1.7ms | 43.4 / 48.0 / 44.7 | neutral |
+| matmul_scale | 56.9ms | +13.3ms | 51.5 / 58.1 / 61.2 | probable mild regression |
+| base_end | 48.3ms | +4.7ms | 45.3 / 56.7 / 43.1 | baseline (end-of-exp drift) |
+
+**Phase 3 outcome:** No candidates met promotion threshold (29.4ms needed). Baseline is optimal.
+
+---
+
+### Confirmed findings (statistically defensible)
+
+**1. `matmul-add-fusion` is a confirmed regression on ConvNext CPU (+38ms)**
+- All 3 independent runs: 63.0 / 70.8 / 111.2ms — each far above the highest clean baseline run (45.4ms)
+- Not attributable to system noise (no run-to-run overlap with baseline distribution)
+- Mechanism hypothesis: baseline already converts MatMul+Add→Gemm (37 Gemm in model.onnx); applying matmul-add-fusion on top may create redundant or conflicting kernel dispatch. Unconfirmed — requires profiling.
+
+**2. `transpose-optimizer` is NEUTRAL on pure inference latency**
+- Runs: 42.3 / 52.3 / 41.8ms — overlapping with clean baseline (42.5–45.4ms)
+- ⚠️ **CORRECTION OF EARLIER FINDING:** A previous 8-iteration search (using `winml eval`) reported +270ms. That was a measurement artifact — `winml eval` includes HF preprocessing pipeline overhead and has no warmup. It measures *application startup + preprocessing + inference*, not *inference alone*. With `winml perf` (warmup=10, iter=50, pure inference): transpose_opt = baseline. Do not cite the +270ms in any report.
+
+**3. `nchwc-transformer` is neutral on this model**
+- NCHWc SIMD layout: 43.4 / 48.0 / 44.7ms — no benefit for ConvNext CPU inference.
+
+**4. opset=18 is neutral**
+- Same node count (251) as opset=17 — no graph structure changes. Mean slightly above baseline (48ms) is within machine variance.
+
+**5. No flag improved latency beyond noise. Baseline is the optimal config.**
+
+---
+
+### ⚠️ Critical finding: ORT performance cliff at opset 19 (ConvNext CPU)
+
+**Experiment:** tested opset 17–22, all with identical graph structure (251 nodes, same op counts)
+
+| opset | mean p50 | slowdown |
+|---|---|---|
+| 17 | 43.7ms | — (baseline) |
+| 18 | 48.0ms | 1.1× |
+| **19** | **160.3ms** | **3.7×** |
+| **20** | **131.0ms** | **3.0×** |
+| **21** | **170.3ms** | **3.9×** |
+| **22** | **85.0ms** | **1.9×** |
+
+**Key facts:**
+- All runs within each opset are consistent (no outliers) — this is real, not noise
+- Graph structure is **identical in node and op counts** (only the declared opset version differs): Reshape×72, Transpose×42, Gemm×37, LN×23, Conv×22 for ALL opsets
+- The performance difference is entirely in ORT's runtime execution path, not the graph
+
+**Mechanism: CONFIRMED ROOT CAUSE — ORT `kMaxSupportedOpset` gates Transpose Optimizer**
+
+Source: `onnxruntime/core/optimizer/transpose_optimization/optimizer_api.h`
+```cpp
+constexpr int64_t kMaxSupportedOpset = 18;  // ORT v1.14.x — bumped each ORT release
+
+ +

Entry point onnx_transpose_optimization::Optimize() → MakeOptimizerContext():

+
if (*opset > kMaxSupportedOpset) {
+    return std::nullopt;  // entire Transpose Optimizer skipped silently
+}
+
+ +

ConvNext has 42 Transpose nodes (NCHW↔NHWC sandwich in every block). The Transpose Optimizer normally: +- Pushes Transposes through Add×18, Mul×18 (layer-scale + residual) across block boundaries +- Cancels adjacent inverse pairs

+

When bypassed (opset > kMaxSupportedOpset), all 42 Transposes execute as full memory-layout copies → 3–4× systemic slowdown.

+

ORT optimization level experiment (definitive proof):

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Session opt levelopset=17opset=19ratioexplanation
DISABLE_ALL47.5ms355ms7.5×No Transpose Optimizer → all 42 Transposes raw
ENABLE_BASIC289ms315ms1.1×Both slow (re-optimizing pre-fused graph)
ENABLE_EXTENDED209ms241ms1.2×Better but no layout transform
ENABLE_ALL216ms215ms1.0×Transpose Optimizer runs on both → full parity
+

kMaxSupportedOpset version history:

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ORT versionkMaxSupportedOpsetopset ≥ N disabled
v1.14.x18≥ 19
v1.16.x19≥ 20
v1.17.x20≥ 21
v1.18.x21≥ 22
main/HEAD26fully covered
+

Classification for optimization-research skill: [KNOWN_TRADEOFF] (intentional design: ORT bumps the ceiling with each ONNX opset release) +- winml-cli ships a specific ORT build → its kMaxSupportedOpset is fixed +- winml-cli's default opset=17 is correct and essential — it is the safe zone for all current ORT builds +- Raising opset requires ensuring the shipping ORT version has kMaxSupportedOpset ≥ target_opset +- Do NOT raise default opset without verifying kMaxSupportedOpset in the shipped ORT

+

Call chain:

+
InferenceSession::Initialize()
+  → TransposeOptimizer::ApplyImpl()         [transpose_optimizer.cc:18]
+      → onnx_transpose_optimization::Optimize()
+          → MakeOptimizerContext()
+              → if opset > kMaxSupportedOpset: return nullopt  ← THE GATE
+
+ +
+

Inconclusive / do not report

+

These show elevated means but cannot be confirmed as regressions given machine variance (p90 = 2–3× p50 throughout): +- ln_only, conv_add, conv_act: each has ≥1 extreme outlier run; other runs are baseline-level +- gelu_only: consistently 49–56ms, possibly a mild regression but no outlier; 3 runs insufficient to separate from drift +- matmul_scale: all 3 runs elevated (51–61ms), but concurrent baseline also drifted (+5ms); net delta ~+8ms, weak signal

+

Do not write these as confirmed regressions in user-facing reports. Label as "inconclusive" or omit.

+
+

Measurement methodology correction (winml eval vs winml perf)

+ + + + + + + + + + + + + + + + + + + + + + + + + +
ToolWhat it measuresLatency for ConvNext CPU
winml eval (no warmup, includes preprocessing)Application-level: model load + HF preprocessing + inference × N~67ms/sample
winml perf --warmup 10 --iterations 50Pure inference: steady-state kernel execution only~43.7ms p50
DifferenceHF preprocessing + JIT warmup overhead~23ms
+

Rule for autoconfig skill: Always use winml perf with --warmup 10 --iterations 50 for latency measurements in experiments. Never use winml eval latency to compare configs.

+
+

Key insight for autoconfig skill

+
    +
  • CPU EP on ConvNext: no extra flag tested improved latency. Baseline (no fusions beyond what ORT L2 applies unconditionally) is optimal.
  • +
  • The only actionable finding is: do not add matmul-add-fusion for ConvNext on CPU (or any model where baseline already uses Gemm).
  • +
  • QNN/DML: not yet tested. Guidance on those EPs requires separate validated experiments.
  • +
+
+

winml analyze gaps discovered

+

These are cases where analyzing the graph before running autoconfig would have prevented wasted search iterations:

+

Gap 1: "Already fused" vs "fuseable" not distinguished +- ConvNext has LayerNormalization as a native op (already fused at PyTorch export) +- layer-norm-fusion targets the decomposed ReduceMean→Sub→... pattern +- winml analyze reports OP/ai.onnx/LayerNormalization without indicating it's already in canonical form +- Impact: user enables layer-norm-fusion thinking it will help; it does nothing (but builds take longer) +- Fix: analyze should tag ops as already_canonical vs fuseable_subgraph

+

Gap 2: DW-Conv not distinguished from regular Conv +- ConvNext has 18×7x7 DW-Conv (group=C) and 4×regular Conv (group=1) +- winml analyze reports all as OP/ai.onnx/Conv (undifferentiated) +- QNN EP supports DW-Conv natively (important for NPU efficiency), but EP support classification is per op type, not per groups value +- Impact: user cannot tell whether Conv ops are the DW or regular variant; EP support may differ +- Fix: analyze should emit OP/ai.onnx/Conv[depthwise] vs OP/ai.onnx/Conv[regular]

+

Gap 3: Transpose-sandwich pattern not detected +- 42 Transpose nodes in ConvNext form a clear Conv→Transpose→LN→...→Transpose repeating pattern +- transpose-optimizer turns this into NHWC chains (expected to help GPU/NPU; on CPU it measured neutral for ConvNext with winml perf — the earlier "+270ms" CPU regression was a winml eval measurement artifact) +- winml analyze reports Transpose as just OP/ai.onnx/Transpose with no structural context +- Impact: user cannot predict whether transpose-optimizer will help or hurt without running it +- Fix: analyze should detect transpose_sandwich_depth: N and emit an EP-specific advisory (any CPU warning must first be validated with winml perf measurements)

+

Gap 4: ORT L2 baseline fusions not surfaced +- After ORT Level 2 optimization (which runs unconditionally), the graph already has fused Gelu, Gemm +- The analyze command runs on the pre-optimize export.onnx, not the actual optimized model +- winml analyze sees 36×MatMul in export.onnx but the real model at inference has 37×Gemm +- Impact: analyze output doesn't reflect what the model actually looks like when running +- Fix: analyze should optionally run on optimized.onnx (post-ORT-L2), not just export.onnx

+

Gap 5: MatMul semantic not classified +- 36 MatMul ops are all MLP dense layers (4C→C or C→4C expansion) +- No attention MatMuls present (ConvNext has no self-attention) +- QNN handles dense-layer MatMul differently from attention-context MatMul +- winml analyze reports OP/ai.onnx/MatMul without semantic classification +- Fix: analyze could detect MatMul role heuristically (shapes: attention = square-ish, MLP = wide fan-out)

+
+

Why skill eval matters

+

Mobius has no skill eval mechanism — it tests models but not skills themselves. This is a gap. +A SKILL.md can have correct content but still cause the agent to give wrong guidance if the +trigger description is poorly written or the structure is confusing. Skill eval catches this.

+

Two eval dimensions

+ + + + + + + + + + + + + + + + + + + + +
DimensionWhat it checksWhen to run
Static (content quality)description trigger phrases, command accuracy, cross-reference validityEvery PR that modifies a SKILL.md
Dynamic (agent behavior)Given a user scenario + skill injected, does the agent produce the right commands and diagnosis?On significant content changes; periodically
+

Static eval = the review checklist in contributing-a-skill. +Dynamic eval = test cases in evals/eval.yaml per skill, run with winml skill eval.

+

winml skill — new CLI subcommand

+

The eval system is built into winml-cli itself as a new skill subcommand. +This keeps the toolchain self-contained and enables CI integration without external dependencies.

+

Command surface:

+
winml skill check  [--skill <name>]   # static: lint + auto-verify all commands in SKILL.md
+winml skill gen-evals [--skill <name>] # auto-research: generate eval.yaml from SKILL.md content
+winml skill eval   [--skill <name>]   # dynamic: run agent behavior tests
+winml skill list                      # list all skills with pass/fail status
+
+ +

winml skill check — auto-research via command extraction

+

This is the "code change that does auto research":

+
    +
  1. Parse SKILL.md — extract every code block containing winml <command> patterns
  2. +
  3. Verify flags exist — run winml <command> --help and check each flag is present
  4. +
  5. Verify cross-references — confirm every .agents/skills/<name>/SKILL.md path exists
  6. +
  7. Verify trigger coverage — count quoted phrases in description frontmatter (must be ≥3)
  8. +
  9. Optionally run commands — with --dry-run-commands, execute each command on a + canary model to verify it doesn't crash
  10. +
+

Example output:

+
winml skill check --skill debug-accuracy-drop
+
+Checking debug-accuracy-drop...
+  ✓ description: 4 trigger phrases found
+  ✓ winml eval --mode compare     [flag verified against eval --help]
+  ✓ winml analyze -m ... --ep qnn [flag verified against analyze --help]
+  ✗ winml perf --monitor          [flag '--monitor' not found in perf --help]  ← STALE
+  ✓ cross-ref: ep-compatibility-check/SKILL.md exists
+  ✗ cross-ref: validate-before-ship/SKILL.md [file missing]  ← BROKEN LINK
+Summary: 2 issues found
+
+ +

Key insight: every time winml-cli flags change, winml skill check automatically +detects which skills have stale commands — no manual audit needed.

+

Implementation sketch (src/winml/modelkit/commands/skill.py):

+
import re, subprocess
+from pathlib import Path
+import click
+
+SKILLS_DIR = Path(__file__).parents[5] / "skills"
+WINML_CMD_PATTERN = re.compile(r'^\s*(winml\s+\w[\w\-]*\s+[^\n]+)', re.MULTILINE)
+
+def extract_commands(skill_md: str) -> list[str]:
+    """Extract all 'winml <subcommand> ...' lines from code blocks."""
+    in_block = False
+    commands = []
+    for line in skill_md.splitlines():
+        if line.strip().startswith("```"):
+            in_block = not in_block
+        elif in_block and line.strip().startswith("winml "):
+            commands.append(line.strip())
+    return commands
+
+def verify_flag(command_line: str) -> tuple[bool, str]:
+    """Check flags in a command line exist in --help output."""
+    parts = command_line.split()
+    subcommand = parts[1]
+    flags = [p for p in parts[2:] if p.startswith("--")]
+    result = subprocess.run(["winml", subcommand, "--help"],
+                            capture_output=True, text=True)
+    help_text = result.stdout
+    for flag in flags:
+        if flag not in help_text:
+            return False, f"flag '{flag}' not found in {subcommand} --help"
+    return True, "ok"
+
+@click.group("skill")
+def skill_cmd():
+    """Manage and evaluate winml-cli skills."""
+
+@skill_cmd.command("check")
+@click.option("--skill", default=None, help="Skill name to check (default: all)")
+@click.option("--dry-run-commands", is_flag=True, help="Execute commands on canary model")
+def check(skill, dry_run_commands):
+    """Static check: verify commands and cross-references in SKILL.md files."""
+    targets = [SKILLS_DIR / skill] if skill else list(SKILLS_DIR.iterdir())
+    for skill_dir in targets:
+        skill_md = (skill_dir / "SKILL.md").read_text()
+        for cmd in extract_commands(skill_md):
+            ok, msg = verify_flag(cmd)
+            status = "✓" if ok else "✗ STALE"
+            click.echo(f"  {status}  {cmd[:60]}")
+
+ +

winml skill gen-evals — LLM-powered eval case generation

+

Auto-generates evals/eval.yaml from SKILL.md content using an LLM:

+
    +
  1. Extract trigger phrases from description frontmatter
  2. +
  3. Extract symptom→fix tables from SKILL.md sections
  4. +
  5. Prompt an LLM to generate (user scenario, expected commands) pairs
  6. +
  7. Write evals/eval.yaml in PromptFoo format
  8. +
+

This is "auto research": the LLM reads the skill and generates adversarial cases +that challenge the agent — including negative cases where the agent should NOT +recommend something.

+
winml skill gen-evals --skill debug-accuracy-drop --model gpt-4o --count 5
+# Writes: skills/debug-accuracy-drop/evals/eval.yaml (auto-generated)
+# Human review before committing
+
+ +

The generated eval.yaml is a starting point — contributors review and refine before +committing. Over time, real user questions (from GitHub issues) can be mined and +added as additional eval cases.

+

winml skill eval — agent behavior testing

+

Runs the eval cases and reports results:

+
winml skill eval --skill debug-accuracy-drop
+# Uses evals/eval.yaml + injects SKILL.md as system prompt
+# Reports pass/fail per test case
+
+ +

Internally shells out to PromptFoo (if installed) or uses a lightweight built-in runner +that calls the configured LLM API directly.

+

Directory layout

+

Each skill carries its own eval cases:

+
skills/
+  debug-accuracy-drop/
+    SKILL.md
+    evals/
+      eval.yaml     ← agent behavior test cases (hand-written or gen-evals output)
+
+ +

eval.yaml format (PromptFoo)

+
# skills/debug-accuracy-drop/evals/eval.yaml
+description: "Agent behavior eval for debug-accuracy-drop skill"
+
+prompts:
+  - "{{user_message}}"
+
+providers:
+  - id: openai:gpt-4o
+    config:
+      systemPrompt: |
+        You are a WinML CLI assistant. Use the following skill:
+        ---
+        {{skill_content}}
+
+tests:
+  - description: "Low cosine after W8A8 → should isolate to quantize stage"
+    vars:
+      user_message: "I quantized my model to W8A8 and cosine similarity is 0.87. What's wrong?"
+    assert:
+      - type: contains
+        value: "winml eval --mode compare"
+      - type: icontains
+        value: "quantize"
+      - type: icontains
+        value: "w8a16"              # should suggest escalating precision
+
+  - description: "NPU vs CPU discrepancy → should point to op fallback"
+    vars:
+      user_message: "My model gives different results on QNN NPU vs CPU after compile"
+    assert:
+      - type: contains
+        value: "winml analyze"
+      - type: icontains
+        value: "partial"            # mention partial op fallback
+      - type: icontains
+        value: "compile"            # blame compile stage, not quantize
+
+  - description: "Drop after optimize only → should NOT blame calibration"
+    vars:
+      user_message: "cosine similarity dropped after winml optimize, I haven't quantized yet"
+    assert:
+      - type: contains
+        value: "winml eval --mode compare"
+      - type: icontains
+        value: "optimize"
+      - type: not-icontains
+        value: "calibration"        # calibration is irrelevant here
+
+ +

Minimum eval cases per skill

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
SkillMin casesKey assertions
check-model-feasibility4Screens candidates with winml inspect (never recommends an unsupported model); recommends the 3-layer check in order; gives fallback when EP absent
debug-accuracy-drop4Correctly isolates pipeline stage; suggests precision escalation
ship-to-winapp4Lists all 6 validation gates; handles waiver scenario; produces manifest.json with CPU fallback
autoconfig3Applies latency-budget vs accuracy-floor framework (manual mode); keeps/discards by objective (auto mode)
adding-model-support2Suggests L1→L5 order; correct recipe structure
contributing-a-skill2Flags missing trigger phrases; flags pseudocode commands
+

What "passing" means

+

An eval case passes when all assertions hold. Recommended pass threshold before merging: +- All contains / icontains assertions pass +- All not-icontains (negative) assertions pass (agent does NOT give wrong advice)

+

The negative assertions are the most valuable — they catch the agent confidently giving +wrong guidance (e.g., blaming calibration for an optimize-stage drop).

+

Running evals

+
# Install PromptFoo
+npm install -g promptfoo
+
+# Run eval for a single skill
+cd skills/debug-accuracy-drop
+promptfoo eval --config evals/eval.yaml
+
+# Run all skill evals
+for dir in skills/*/; do
+  if [ -f "$dir/evals/eval.yaml" ]; then
+    promptfoo eval --config "$dir/evals/eval.yaml"
+  fi
+done
+
+ +
+

Implementation notes

+

Directory structure

+
skills/
+  use-winml-cli/              ← existing, extend (user)
+    SKILL.md
+    evals/eval.yaml
+  check-model-feasibility/    ← new (user — model discovery + EP/device compatibility)
+    SKILL.md
+    evals/eval.yaml
+  debug-accuracy-drop/        ← new (user)
+    SKILL.md
+    evals/eval.yaml
+  autoconfig/                 ← new (user — optimize: autoresearch loop + manual framework)
+    SKILL.md
+    evals/eval.yaml
+  ship-to-winapp/             ← new (user — validation gates + multi-EP packaging; partial dep on winml package feature)
+    SKILL.md
+    evals/eval.yaml
+  adding-model-support/       ← new (contributor)
+    SKILL.md
+    evals/eval.yaml
+  adding-ep-support/          ← new (contributor)
+    SKILL.md
+    evals/eval.yaml
+  contributing-a-skill/       ← new (contributor)
+    SKILL.md
+    evals/eval.yaml
+  optimization-research/      ← new (contributor — internal deep gap analysis for winml-cli team)
+    SKILL.md
+    templates/olive_qnn.json
+    templates/olive_dml.json
+    evals/eval.yaml
+
+ +

Priority order for implementation

+

This is implementation sequencing (risk- and dependency-driven), which intentionally differs from +the importance ranking in the Overview. Importance answers "which skill matters most to users"; +this answers "which is safest to build first." Example: autoconfig is the #1 importance user skill +but ships last because it depends on the --format json changes and is the most complex.

+

Code changes first (unblocks agentic skill execution): +0. winml eval --format json — critical: enables all accuracy-related agentic flows +0. winml analyze --format json — enables EP compatibility agentic flows +0. winml perf --format json — enables performance SLA agentic flows

+

User skills: +1. check-model-feasibility — lowest risk, pure existing commands (inspect/sys/analyze); front door for new users (model discovery half needs analyze --format json) +2. debug-accuracy-drop — closes clearest pain point, existing eval --mode compare +3. ship-to-winapp — validation checklist + packaging; build it once the gate commands exist (partial dep on winml package feature) +4. autoconfig — depends on #847/#848/#849 + most complex skill to implement (manual mode can ship first as the lightweight framework)

+

Contributor skills: +5. contributing-a-skill — enables community contributions to the skill ecosystem +6. adding-model-support — most impactful for model coverage growth +7. adding-ep-support — lower frequency, but needed for new EP onboarding +8. optimization-research — internal gap-finder; depends on a working autoconfig baseline to compare against

+

Required code changes for agentic skill execution

+

The three changes that turn skills from documentation into agentic programs:

+

1. winml eval --format json

+

File: src/winml/modelkit/commands/eval.py

+

Add --format option and emit structured JSON to stdout:

+
{
+  "mode": "compare",
+  "model": "path/to/quantized.onnx",
+  "model_id": "microsoft/resnet-50",
+  "metrics": {
+    "cosine_similarity": 0.87,
+    "sqnr_db": 28.3,
+    "psnr_db": 31.1,
+    "max_abs_diff": 0.042
+  },
+  "task_metric": { "top1_accuracy": 0.741 },
+  "threshold_pass": false
+}
+
+ +

2. winml analyze --format json

+

File: src/winml/modelkit/commands/analyze.py

+

Already supports --output file.json. Add --format json to also print to stdout +(mirrors pattern from winml inspect and winml sys):

+
{
+  "ep": "qnn",
+  "model": "path/to/model.onnx",
+  "summary": { "supported": 142, "partial": 3, "unsupported": 1 },
+  "partial_ops": ["MultiHeadAttention", "LayerNorm", "Softmax"],
+  "unsupported_ops": ["CustomRotaryEmbedding"]
+}
+
+ +

3. winml perf --format json

+

File: src/winml/modelkit/commands/perf.py

+

Already writes JSON to file via -o. Add --format json stdout output:

+
{
+  "model": "path/to/model.onnx",
+  "ep": "qnn",
+  "device": "npu",
+  "iterations": 100,
+  "latency_ms": { "p50": 18.3, "p90": 21.7, "p99": 28.4, "mean": 18.9 },
+  "throughput_rps": 54.6
+}
+
+ +

These three changes are ~50 lines of code each, follow the existing pattern from +winml inspect --format json and winml sys --format json, and unlock the full +agentic execution model for all consumer skills.

+

Sizing estimate (per skill)

+

Each SKILL.md based on Mobius patterns (~8–14KB): +- ~200 lines prose + decision tables +- ~50 lines code examples +- Cross-reference section

+

Relationship to existing use-winml-cli skill

+

The new skills are task-scoped (problem → solution) vs the existing skill which is +tool-scoped (here's what each command does). They complement, not replace each other. +The existing skill should add cross-references to the new skills in its "Common patterns" section.

+
+

QNN NPU Catalog Sweep — Findings & Feature Gaps (2026-06-13)

+

Source: 8-model catalog sweep via autoconfig POC (C:\tmp\autoconfig-demo\catalog_qnn_sweep.py)

+

Cross-model results

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ModelArchBaseline p50Best p50GainBest config
microsoft/resnet-18resnet0.96ms0.96msbaseline (opset17)
google/vit-base-patch16-224vit9.04ms9.04msbaseline (opset17)
apple/mobilevit-smallmobilevit12.07ms8.62ms+29%opset21+conv_fusions
facebook/dinov2-smalldinov26.56ms4.98ms+24%opset21
hustvl/yolos-smallyolos78.69mstimeout
distilbert SST-2distilbert19.48ms19.48msbaseline
all-MiniLM-L6-v2bert5.81ms5.81msbaseline
deepset/roberta-base-squad2roberta14.94ms14.72ms1.5%opset21
+

Validated KB findings

+

npu-001 refined: opset21 benefit is architecture-gated: +- ✅ Conv + residual connections: +25–31% (mobilevit, dinov2, convnext) +- ❌ Pure transformer (ViT, YOLOS): -7% or neutral +- ⚪ NLP BERT-family: neutral

+

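npu-006 NEW — CRITICAL: Conv fusions (conv-bn/add/activation) cause catastrophic QNN NPU CPU fallback +- ResNet-18 with conv fusions: 0.96ms → 132ms (+4900% regression) (NOTE: 0.96ms → 132ms is ≈137×, i.e. ≈+13,650%; the "+4900%" / "50x" figures imply a different baseline or fused latency — reconcile against results.json before citing) +- MobileViT: safe (no regression) +- Severity: critical — can produce 50x+ regression silently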

+

npu-007 NEW: DVFS thermal noise makes CV gate unreliable on QNN NPU +- New bench protocol: 3 sessions × 500 iters + 30s cool-down + median p50 + >10% noise floor

+

Feature gaps (winml-cli backlog items)

+

Gap A: winml analyze — Conv fusion QNN safety check +winml analyze should detect Conv-dominant topologies and warn when conv-bn/add/activation +fusions are configured for QNN NPU target. Currently no pre-build detection of this hazard. +- Command to add: warning in analyze output when ep=qnn AND conv_fusion_pass is enabled AND model has >N Conv ops +- Priority: HIGH (silent 50x regression risk)

+

Gap B: budget-aware sweep in autoconfig +Large models (YOLOS, ~78ms/inf) cause sweep timeout with current fixed budget. +Need: per-hypothesis time estimation → auto-skip models that exceed budget, log as "timeout" not failure. +- Affects: autoconfig POC and any future winml sweep command

+

Gap C: winml perf DVFS-aware session averaging +winml perf should natively support session-level median aggregation for QNN NPU. +Current single-session variance is dominated by DVFS thermal state, not model performance. +- Flag proposal: --sessions 3 --cool-down 30 --signal median-p50 +- This would make winml perf output trustworthy for optimization decisions on Snapdragon X Elite

+
+

Feature Request: FusedConv detection + unfuse-for-qnn (2026-06-15)

+

Problem

+

用户可能从外部拿到一个已经做过 Conv fusion 的 ONNX 模型,或者 autoconfig 实验里开了 conv-add-activation-fusion flag。 +这类模型在 QNN NPU 上跑起来特别慢(ResNet-18 实测 +4900% regression),但没有任何报错,用户完全不知道原因。

+

Root cause

+

conv-add-activation-fusion 生成的是 ORT 扩展 op FusedConv(非标准 ONNX op)。 +QNN EP 不认识这个 op,所有 FusedConv 节点全部 fallback 到 CPU,PCIe round-trip 开销极大。

+

conv-bn-fusion 不同:它把 BN 参数数学吸收进 Conv weight,不产生新 op 类型,结果仍是标准 Conv,不可逆

+

Proposed feature

+

1. winml analyze — FusedConv detection

+

winml analyze -m model.onnx --ep qnn 扫描图中所有节点, +如果发现 FusedConv 节点且目标 EP 为 QNN,输出警告:

+

⚠ QNN NPU: 23 FusedConv nodes detected. + FusedConv is an ORT-internal op not supported by QNN EP — these nodes will fall back to CPU. + Recommend: run winml optimize --unfuse-conv to expand back to standard ONNX ops.

+

2. winml optimize --unfuse-conv

+

新增 optimize pass:把 FusedConv 节点拆回 Conv + Add + Activation(Relu/Sigmoid/Tanh)。 +- Lossless(权重不变,只拆 op 结构) +- 输出标准 ONNX,QNN EP 可正常映射 HTP kernel +- 适用场景:BYOM 用户带入已做过 fusion 的模型

+

Implementation notes +- 检测:node.op_type == "FusedConv" 即可定位 +- 拆分:读 FusedConv attribute activation 字段 → 插入对应 Relu/Sigmoid/Tanh 节点 +- 不处理 conv-bn-fusion 产生的模型(那个无法反向,只能重新从 FP32 export)

+

Priority

+

MEDIUM — 默认 flag 是关的,不是高频路径,但对 BYOM 场景(拿到别人优化过的模型)有实际价值。

+
+
+ + diff --git a/research/autoconfig/docs/skills-design.md b/research/autoconfig/docs/skills-design.md new file mode 100644 index 000000000..8f223f88b --- /dev/null +++ b/research/autoconfig/docs/skills-design.md @@ -0,0 +1,3114 @@ +# WinML CLI Skills Design Doc + +## Overview + +This document defines the design for 9 skills to be added to `skills/` in winml-cli. +Skills are split into **two categories by the single question: does the task require editing repo code?** + +- **User skills (5)** — the user reaches their goal purely by specifying conditions and letting + winml-cli produce or modify a `config.json` / `manifest.json` / report. **No source code is touched.** + Audience: WinApp developers and ISVs deploying models. +- **Contributor skills (4)** — the task requires a winml-cli source-code change (a new exporter, a new + EP backend, a new skill), or exists specifically to produce code-change backlog. Audience: winml-cli engineers. + +> Discriminator: if the deliverable is a config/manifest/report, it is a **User** skill. If completing it +> requires editing code in the repo (or its whole purpose is to drive such edits), it is a **Contributor** skill. + +Each skill follows the SKILL.md frontmatter convention (`name:`, `description:`) established +by Mobius, NVIDIA Model-Optimizer, and Google LiteRT-CLI as the de facto standard. + +### User skills — ranked by importance + +| Rank | Skill | Why it ranks here | Output (no code) | +|---|---|---|---| +| 1 | `autoconfig` | Flagship. Autonomously searches the config space and delivers the optimal `config.json` per EP. Also hosts the **manual optimize path** (precision-ladder + latency/accuracy-budget decision framework + hardware table) for users who want to choose by hand or have no target hardware. Maps to all five user scenarios (S1–S5). 
| `config_<ep>_optimal.json` + `report.html` | +| 2 | `check-model-feasibility` | Pre-build front door, merging model discovery + EP/device compatibility: "find me a *supported* model from my constraints, then confirm it runs on my hardware." The single "what do I run, and will it run?" gate (`inspect` → `sys` → `analyze`). Highest frequency — every user hits it before building. | model shortlist + go/no-go + fallback EP | +| 3 | `debug-accuracy-drop` | Closes the most acute pain point: accuracy dropped, cause unknown. High-frequency diagnostic need with the clearest existing tooling (`eval --mode compare`). | stage + root cause + fix | +| 4 | `ship-to-winapp` | Ship-time skill, merging validation + packaging: L1–L5 Definition-of-Done gates **plus** multi-EP artifact layout, `manifest.json`, and runtime EP selection. Everything between "the model is good" and "it's running in the app." | pass/fail report + `manifest.json` | +| 5 | `use-winml-cli` | General tool-scoped onboarding reference (existing). Foundational but low differentiation vs the task-scoped skills above. | command reference | + +### Contributor skills — ranked by importance + +| Rank | Skill | Why it ranks here | Code touched | +|---|---|---|---| +| 1 | `adding-model-support` | Directly grows model coverage — the core long-tail business problem (ISV onboarding, S2/S5). Highest contribution frequency. | new exporter + recipe | +| 2 | `optimization-research` | High leverage: deep-searches ORT/Olive/ecosystem to find gaps and file the backlog that drives every other contributor skill. Internal, but sets the roadmap. | files issues + repro (drives code changes) | +| 3 | `adding-ep-support` | Onboards a new execution-provider backend. Infrequent, but high value the moment a new NPU vendor lands. | compile backend + EP registry | +| 4 | `contributing-a-skill` | Meta-tooling: how to author, lint, and eval a SKILL.md. 
Sustains the ecosystem but is supporting infrastructure, not a direct model/EP/perf deliverable. | `SKILL.md` + evals | + +> The detailed `## Skill:` sections below appear in document order, not priority order. Importance is +> defined by the two ranked tables above; implementation sequencing (risk/dependency-driven) is in +> [Priority order for implementation](#priority-order-for-implementation). + +### User skill dependency graph + +``` +check-model-feasibility ──► autoconfig ──────────► ship-to-winapp + find a supported model optimize the model validate (L1–L5 gates) + + confirm EP/device runs (automated autoresearch + package multi-EP artifacts + loop OR manual framework) + manifest + runtime EP selection + │ │ ▲ + └──────────► debug-accuracy-drop ───────────────────┘ + (diagnose accuracy drops at any stage) + +use-winml-cli ── general command reference; underpins every step above +``` + +### Contributor research skill + +``` +optimization-research ──► [GitHub issues / winml backlog] + (deep search: ORT source + Olive + ONNX ecosystem + native stack models + → find better solutions → diagnose winml gaps → produce work items) +``` + +### Contributor skill dependency graph + +``` +adding-model-support ──► contributing-a-skill +adding-ep-support ──► contributing-a-skill +``` + +--- + +## Design principle: Skills as agentic workflows + +### The shift: documentation → automation + +Current state (most skills in the ecosystem): +> Skill tells the user what commands to run → user runs them → user interprets output + +Target state for winml-cli: +> Skill tells the **agent** what commands to run → **agent runs them** → agent interprets output → agent gives a specific answer + +The difference: + +| | Documentation skill | Agentic skill | +|---|---|---| +| Agent sees low cosine | "Run `winml eval --mode compare`" | Runs it, reads cosine=0.87, says "drop at quantize stage, Attention layers" | +| EP compatibility | "Run `winml sys` then `winml analyze`" | Runs both, parses 
JSON, says "QNN available but LayerNorm is partial" | +| Optimize precision | "Use the decision framework" | Runs fp16/w8a16/w8a8 sweep, builds actual tradeoff table, recommends W8A16 | +| Validate before ship | "Check these 6 gates" | Runs all 6 gates, generates a pass/fail report with actual numbers | + +This is only possible if skills describe a **GATHER → ANALYZE → DECIDE → ACT** workflow, +and winml-cli commands emit **machine-readable structured output** that the agent can parse. + +### Structured output: current state and gaps + +Copilot agents have shell tool access and can run `winml` commands directly. +The key requirement is `--format json` on stdout so the agent can parse results +without screen-scraping Rich/ANSI terminal output. + +| Command | Structured output today | Gap | +|---|---|---| +| `winml inspect` | ✓ `--format json` (stdout) | None | +| `winml sys` | ✓ `--format json` (stdout) | None | +| `winml run` | ✓ `--format json` (stdout) | None | +| `winml analyze` | ⚠ `--output file.json` (file only) | Add `--format json` stdout | +| `winml perf` | ⚠ `--output file.json` (file only) | Add `--format json` stdout | +| `winml eval` | ✗ No structured output | Add `--format json` stdout | + +**Required code changes** (enables agentic skill execution): +1. `winml eval --format json` — outputs `{cosine, sqnr, psnr, task_metric}` to stdout +2. `winml analyze --format json` — outputs `{supported: [...], partial: [...], unsupported: [...]}` to stdout +3. `winml perf --format json` — outputs `{p50_ms, p90_ms, p99_ms, mean_ms}` to stdout + +### The GATHER → ANALYZE → DECIDE → ACT skill structure + +Each skill section should be written with agent execution in mind: + +``` +## GATHER: what to run +Commands the agent runs first (with --format json) to collect facts. + +## ANALYZE: what to look for +How to interpret the JSON output. What values matter. What thresholds to apply. + +## DECIDE: what to recommend +Decision logic. If X → recommend Y. 
If A and B → recommend C. + +## ACT: what to tell the user +What to surface to the user: specific diagnosis + specific next step. +``` + +In practice this maps onto the existing "Sections" structure — the key is ensuring +each section has **concrete commands to run** and **concrete interpretation rules**, +not just prose description. + +### Example: `debug-accuracy-drop` as an agentic workflow + +``` +User: "My W8A8 model has low accuracy" + +GATHER: + agent runs: winml eval --mode compare -m quantized.onnx --model-id --format json + agent gets: {"cosine_similarity": 0.87, "sqnr_db": 28.3, "stage": "quantize"} + +ANALYZE: + cosine=0.87 < 0.90 threshold → problem is real + sqnr=28.3 < 30 dB → significant degradation + stage=quantize → problem introduced at quantize, not optimize or compile + +DECIDE: + quantize-stage drop on W8A8 → check if Attention layers are the culprit + agent runs: winml analyze -m quantized.onnx --ep qnn --format json + agent gets: {"partial": ["MultiHeadAttention", "LayerNorm"], "unsupported": []} + +ACT: + Agent: "The accuracy drop (cosine=0.87) is at the quantize stage. + MultiHeadAttention is partial on QNN — activations may be falling back to FP32. + Try W8A16 to keep activations at FP16: winml build -c config.json --precision w8a16" +``` + +Without structured output (`--format json`), the agent would have to tell the user to run +each step manually and paste the results back. With structured output, the agent runs the +full diagnostic in one turn. + +--- + +## Validation confidence levels (L1–L5) + +Inspired by Mobius `writing-tests`. Applied in `ship-to-winapp` as the Definition-of-Done backbone. +Each level is checked **independently** — a model can pass L3 without passing L2. 
+ +| Level | Name | What it verifies | Key command | +|---|---|---|---| +| **L1** | Loadable | Artifact is valid ONNX, loads without error | `winml inspect -m <artifact>` | +| **L2** | Shape correct | Output shape matches expected spec | `winml eval -m <artifact> --model-id <model-id>` (check shape in output) | +| **L3** | Numerical parity | Output matches FP32 baseline (cosine ≥ 0.99 FP16, ≥ 0.95 W8A16, ≥ 0.90 W8A8) | `winml eval --mode compare -m <artifact> --model-id <model-id>` | +| **L4** | Task accuracy | Task metric (Top-1/F1/mAP) within acceptable drop from FP32 reference | `winml eval -m <artifact> --model-id <model-id>` (task metric) | +| **L5** | Production ready | Perf SLA met on target device + cross-EP consistency verified | `winml perf --iterations 100 --monitor` | + +**Quick pass criteria:** + +| Precision | L3 threshold | +|---|---| +| FP16 | cosine_similarity ≥ 0.99 | +| W8A16 | cosine_similarity ≥ 0.95 | +| W8A8 | cosine_similarity ≥ 0.90 (or task-specific) | + +Waivers: any level that cannot be verified must be documented with a reason and tracking issue. +The `ship-to-winapp` skill maps each of its 6 validation gates to an L-level. + +--- + +## Competitive Analysis + +### Summary + +winml-cli has a solid optimization pipeline (export→quantize→compile→benchmark) but lacks the **debugging/diagnostic loop**, **accuracy recovery tooling**, and **developer observability** that distinguish great toolchains from adequate ones. 
+ +--- + +### Competitor Feature Matrix + +| Feature | Apple | ExecuTorch | AI Hub | NVIDIA | OpenVINO | Optimum | Olive | winml-cli | +|---|---|---|---|---|---|---|---|---| +| Per-layer accuracy debugging | ❌ | ✅ SVG graph | ✅ cloud | ❌ | ❌ | ❌ | ❌ | ❌ | +| Compute unit utilization report | ❌ | ✅ | ✅ | ❌ | Partial | ❌ | ❌ | ❌ | +| Accuracy-Aware PTQ (auto layer rollback) | ❌ | ❌ | ❌ | ❌ | ✅ NNCF | ❌ | ❌ | ❌ | +| Standard NLP benchmark (MMLU/PPL) | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | +| Cross-EP side-by-side compare | ❌ | ❌ | Partial | ❌ | ❌ | ❌ | ❌ | ❌ | +| Zero-deploy validation (model.predict) | ✅ macOS | ✅ | ✅ cloud | ❌ | ✅ | ✅ | ❌ | Partial | +| Pre-quantized model zoo | ❌ | ❌ | ✅ 500+ | ✅ HF org | ✅ | ❌ | ❌ | ❌ | +| One-line optimize command | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | +| Multi-EP artifact packaging | ✅ .mlpackage | ✅ .pte | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| QAT / accuracy recovery fine-tuning | ✅ | ❌ | ✅ AIMET | ✅ | ✅ | ❌ | ❌ | ❌ | +| Advanced quant (AWQ/SmoothQuant) | ❌ | ❌ | ✅ | ✅ | ✅ NNCF | ❌ | ❌ | ❌ | +| Thermal/sustained-load profiling | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | + +--- + +### Competitor Deep Dives + +#### Apple coremltools +**Most relevant**: zero-deploy validation + compute_units API + palettization + +- `model.predict({'input': np_array})` — validates converted model in one Python call without any device deploy. Can force `ComputeUnit.CPU_ONLY` for numerical comparison vs `CPU_AND_NE`. +- `compute_units` is switchable **at prediction time** (not just compile time) — enables A/B testing EP performance without re-converting. +- **Palettization**: LUT-based weight compression at 1–8 bits (k-means clustering, not linear quant). Matches Neural Engine hardware kernels better than INT4 linear quantization for many models. +- Three compression workflows: data-free / calibration-based / fine-tuning-based (QAT). +- `.mlpackage` separates architecture from weights → streaming-friendly, supports on-device compilation after download. 
+ +#### ExecuTorch (Meta) +**Most relevant**: per-layer QNN accuracy debugging (best-in-class of all competitors) + +- `QNNIntermediateDebugger`: dumps intermediate tensor outputs at every QNN op, computes cosine similarity per layer vs CPU reference, generates **color-coded SVG computation graph** (green ≥ 0.9, red < 0.9). +- `get_delegation_info()`: table of ops showing delegated-to-NPU count vs CPU-fallback count per op type. +- `ETDump` + `Inspector` API: per-op timing table with avg (ms), op type, is_delegated. Returns pandas DataFrame. +- QAIRT Visualizer: `pip install qairt-visualizer` — interactive GUI overlaying op trace + QHAS (QNN HTP Analysis Summary) on model graph. +- **Missing**: no cloud device testing, no automated accuracy-latency sweep, build process is complex. + +#### Qualcomm AI Hub +**Most relevant**: cloud profiling with physical hardware, per-step memory breakdown + +- Compile + Profile + Inference on real physical devices (Snapdragon X Elite laptops, Galaxy S24) in the cloud — no local hardware needed. +- Per-step memory profiling: compilation time/memory, first-load time/memory (NE optimization), subsequent-load (cached), inference latency. +- 500+ pre-optimized models in model zoo. +- `--clone j1glw6y8p` — clone any previous job with modified params. +- Cloud AIMET quantization: sophisticated PTQ as a service (`submit_quantize_job()`). + +#### NVIDIA ModelOpt +**Most relevant**: 16 compression techniques + MMLU benchmark scripts + pre-quantized HF checkpoints + +- Compression techniques beyond PTQ: AWQ, SmoothQuant, QAT, pruning (Minitron 33% smaller, 50% faster), distillation, speculative decoding, sparsity, NAS (Puzzletron). +- Windows accuracy benchmark: `mmlu_benchmark.py` (57 subjects, DirectML/ORT/TensorRT-LLM/CPU), perplexity on WikiText-2, KL-divergence metrics. +- Pre-quantized HF checkpoints: `nvidia/DeepSeek-R1-FP4`, `nvidia/Llama-3.3-70B-FP4` etc. — pull validated optimized models without running pipeline. 
+ +#### Intel OpenVINO + NNCF +**Most relevant**: Accuracy-Aware PTQ (auto layer rollback) + +- NNCF `AccuracyAwareQuantization`: automatically identifies sensitivity of each layer to quantization, rolls back sensitive layers to float when accuracy drop exceeds threshold. Fully automated accuracy-performance tradeoff solver. +- `benchmark_app -hint latency` vs `-hint throughput`: auto-configures streams, batch, inference requests for each mode. `-d AUTO`: automatic device selection with fallback. +- 100+ Jupyter notebooks on Binder/Colab — zero setup barrier. +- `OpenVINO GenAI`: high-level `LLMPipeline`, `WhisperPipeline` — deploy-ready LLM inference in 5 lines. + +#### HuggingFace Optimum +**Most relevant**: drop-in Transformers replacement + multi-backend hub + +- Replace `AutoModelForSequenceClassification.from_pretrained()` with `ORTModelForSequenceClassification.from_pretrained()` → ONNX Runtime inference with zero code change. +- 8 hardware backends: ONNX Runtime, OpenVINO, NVIDIA TensorRT-LLM, AMD Ryzen AI, AWS Inferentia, ExecuTorch, Intel Gaudi, FuriosaAI. +- Task-aware export: `--task text-generation` auto-configures dynamic axes and model wrapping. + +#### Microsoft Olive (direct competitor) +**Most relevant**: one-line optimize command + VS Code AI Toolkit + +- `olive optimize --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct --precision int4 --output_path models/qwen` — one command, no per-step config. +- JSON-based pipeline config for full declarative multi-step control. +- VS Code AI Toolkit extension: GUI for model optimization, fine-tuning, and inference testing — no CLI knowledge needed. +- MultiLoRA serving support. + +--- + +### Top 5 High-Impact Gaps for winml-cli + +#### 🔴 Gap 1: Per-Layer Accuracy Debugging + +**Pain**: Accuracy degrades after QNN compilation/quantization, user has no idea which layer caused it. Currently requires QNN SDK expert knowledge. 
+ +**Solution**: `winml debug --model model.onnx --ep qnn --inputs calibration_data/` +1. Runs model on CPU and QNN, captures intermediate tensor outputs at each op +2. Computes cosine similarity per layer +3. Outputs HTML/SVG graph with color-coded accuracy (green/red per layer) + +**Reference**: ExecuTorch `QNNIntermediateDebugger` → `OutputFormat.SVG_GRAPH` + `QcomCosineSimilarityComparator` + +**Impact**: Turns multi-day debugging into a 30-minute diagnosis. Currently no Windows-on-NPU tool does this. + +--- + +#### 🔴 Gap 2: Compute Unit Utilization Report + +**Pain**: `winml perf` shows slower-than-expected latency with no explanation. User doesn't know what % of ops ran on NPU vs fell back to CPU. + +**Solution**: Extend `winml analyze` to output delegation table: +``` +Op Type | NPU Delegated | CPU Fallback | Reason +----------------|---------------|--------------|------------------ +MatMul (INT8) | 47 / 47 | 0 | - +LayerNorm | 0 / 12 | 12 | Unsupported dtype +Softmax (FP32) | 0 / 6 | 6 | Requires INT8 input +``` + +**Reference**: ExecuTorch `get_delegation_info().get_operator_delegation_dataframe()` / AI Hub per-layer compute unit mapping + +**Impact**: Directly actionable — if user sees "60% of ops on CPU due to unsupported dtype," they know to switch to W8A8. + +--- + +#### 🟠 Gap 3: Quantization Sensitivity Analysis + +**Pain**: `winml quantize --algo w8a8` produces a model with unacceptable accuracy. User doesn't know if it's a specific layer, the algorithm, or the calibration data. + +**Solution**: `winml analyze-quant --model model.onnx --calibration data/ --eval-dataset eval/` +1. Run full W8A8 quantization +2. For each block/layer, measure accuracy impact of reverting to FP16 +3. Rank layers by sensitivity +4. 
Report: "reverting 3 attention layers to FP16 recovers X% accuracy at Y% latency cost" + +**Reference**: Intel NNCF `AccuracyAwareQuantization` (automatic per-layer rollback) + +**Impact**: Replaces multi-day trial-and-error with a 10-minute automated report. + +--- + +#### 🟠 Gap 4: Standard Benchmark Integration (MMLU / Perplexity) + +**Pain**: `winml eval` supports custom scripts but no out-of-the-box standard benchmarks. Users have no reference point for whether their quantized model's accuracy is "expected." + +**Solution**: `winml eval --model model.onnx --benchmark mmlu --ep qnn` +- Built-in MMLU (57 subjects), WikiText-2 perplexity, KL-divergence scripts +- Reference numbers from FP32 baseline shown alongside quantized result +- `FP32 baseline: 78.2% → W8A8 QNN: 77.9% (−0.3%, expected range: −0.1% to −0.5%)` + +**Reference**: NVIDIA ModelOpt `examples/windows/accuracy_benchmark/mmlu_benchmark.py` supports DirectML/ORT/CPU + +**Impact**: Removes ambiguity and creates trust. Critical for LLM users. + +--- + +#### 🟡 Gap 5: Cross-EP Side-by-Side Comparison + +**Pain**: Choosing between QNN/DirectML/CPU/OpenVINO requires running each EP manually and aggregating results. No tool does this automatically. + +**Solution**: `winml sweep --model model.onnx --precision w8a16,fp16 --ep qnn,dml,cpu` +- Runs build+eval+perf for each (precision × EP) combination +- Outputs a single comparison table: accuracy / latency / op coverage % +- Agent-driven: skill reads JSON output and recommends the optimal combination + +**Reference**: Truly unique — no competitor does this for Windows multi-EP. Closest is AI Hub's multi-device fleet testing (Android only). + +**Impact**: The single most-requested decision for Windows AI developers. Unique to winml-cli. + +--- + +### Patterns in Great Toolchain DX + +**Pattern 1: The "Why" Feedback Loop** +Great toolchains explain *why* results are the way they are. 
ExecuTorch's delegation table, AI Hub's compute unit mapping, NNCF's layer sensitivity analysis all answer "why?" winml-cli currently stops at "here's the result." + +**Pattern 2: Progressive Disclosure of Complexity** +- Olive: `olive optimize --precision int4` (one line) → full JSON config pipeline +- coremltools: `ct.convert(model)` → MIL IR manipulation +- AI Hub: web dashboard → Python SDK → CLI → AIMET configs + +winml-cli is currently too close to the expert path: each step requires understanding EP-specific options. + +**Pattern 3: Zero-Deploy Validation** +Every strong toolchain lets you test model output before deploying to hardware: coremltools `model.predict()`, ExecuTorch Python pybind, AI Hub `submit_inference_job()`. winml-cli is strong for CPU but lacks the quick "compare CPU vs QNN output" path. + +**Pattern 4: Pre-Validated Model Artifacts** +ModelOpt (HF nvidia/ org), AI Hub (500+ models), NNCF (Model Zoo with accuracy tables) all reduce the cold-start problem. Users don't need the full pipeline for popular models. + +--- + +### Whitespace Opportunities (No Competitor Covers) + +| Opportunity | Why it's winml-cli territory | +|---|---| +| **Cross-EP regression table** (one command, all EPs) | Multi-EP is the unique Windows AI challenge; no Android/iOS tool does this | +| **Quantization config recommender** (`winml recommend --target qnn --constraint latency=20ms`) | Rule-based recommendation from hardware+model arch analysis | +| **EP-aware ONNX graph visualizer** (Netron + green/yellow/red per EP) | Netron exists but has no EP coverage overlay | +| **Thermal/sustained-load profiling** (latency curve over 100 runs, detect throttling) | AI Hub hides variance; no tool surfaces thermal behavior | +| **Windows AI Model Package** (.mlpackage equivalent with multi-EP manifest) | Apple has .mlpackage; Windows has nothing equivalent | + +--- + +## Skill: `use-winml-cli` (existing — extend) + +**Status:** Exists at `skills/use-winml-cli/SKILL.md`. 
Needs two additions: +- Add `winml run` and `winml serve` usage (currently missing) +- Add "first-time onboarding" path for users who don't know where to start + +No structural changes needed; the existing skill is the general entry point. + +--- + +## Skill: `debug-accuracy-drop` + +### Frontmatter +```yaml +name: debug-accuracy-drop +description: > + Use this skill when a quantized or optimized model produces worse accuracy than + the FP32 baseline and the cause is unknown. Guides a structured diagnosis: first + isolate which pipeline stage introduced the drop (optimize vs quantize vs compile), + then use winml eval --mode compare to measure output similarity, then use winml + analyze to check for partial/unsupported ops that may cause EP fallback. Covers + calibration dataset issues, precision selection mistakes, and QNN-specific fallback + patterns. Use when the user says "accuracy dropped after quantization", "results + look wrong on NPU", or "cosine similarity is low". +``` + +### When to use +- "My model gives wrong results after quantization" +- "W8A8 accuracy is too low, how do I find out why" +- "Results differ between NPU and CPU" +- cosine_similarity < 0.95 from `winml eval --mode compare` + +### Sections + +**1. Isolation strategy: binary search on the pipeline** +Diagnose by bisecting the pipeline stages: +``` +FP32 baseline + → after optimize? winml eval --mode compare (fp32 vs optimized) + → after quantize? winml eval --mode compare (fp32 vs quantized) + → after compile? winml eval --mode compare (fp32 vs compiled) +``` +First stage where cosine drops → that's where the problem is. 
+ +Key commands: +```bash +# Export FP32 baseline +winml export -m -o baseline/model.onnx + +# Compare optimized vs baseline +winml eval --mode compare -m optimized/model.onnx --model-id + +# Compare quantized vs baseline +winml eval --mode compare -m quantized/model.onnx --model-id + +# Compare EP-compiled vs baseline (run on target EP) +winml eval --mode compare -m compiled/model.onnx --model-id --ep qnn +``` + +**2. Interpreting similarity metrics** +Table of thresholds: +| Metric | Healthy | Investigate | Problem | +|---|---|---|---| +| cosine_similarity | > 0.99 | 0.95–0.99 | < 0.95 | +| SQNR (dB) | > 40 | 30–40 | < 30 | +| max_abs_diff | model-dependent | — | unbounded | + +**3. Root cause patterns** + +| Symptom | Likely cause | Fix | +|---|---|---| +| Drop appears at quantize stage | Calibration dataset not representative | Use task-relevant calibration data via `--calibration-dataset` | +| Drop appears at quantize stage for Attention layers | W8A8 quantizing activations in attention | Switch to W8A16 (keeps activations at FP16) | +| Drop appears at compile stage on QNN | Op pattern unsupported → CPU fallback | Run `winml analyze` to find partial ops | +| Inconsistent results across runs | Non-deterministic EP dispatch | Add `--iterations 20` to average out | +| Drop only in certain inputs | Input shape sensitivity | Test with calibration data matching real distribution | + +**4. Checking for op fallback with `winml analyze`** +When compile-stage drop is suspected: +```bash +winml analyze -m quantized/model.onnx --ep qnn +``` +Look for `partial` and `unsupported` ops — these fall back to CPU, introducing +numerical differences vs native NPU execution. Partial ops are the most common +source of unexpected accuracy variance on QNN. + +**5. Precision escalation path** +If W8A8 is the problem and the model is accuracy-sensitive: +W8A8 → W8A16 → FP16 → FP32 +Stop at the first precision that meets accuracy requirements. 
+ +**Cross-references:** +- To compare precision options systematically → `autoconfig` (manual or automated optimize) +- If op is listed as unsupported → `check-model-feasibility` + +--- + +## Skill: `ship-to-winapp` (merge of `validate-before-ship` + `prepare-for-winapp`) + +Covers the whole ship-time phase: **first validate** the model meets the Definition-of-Done, +**then package** the multi-EP artifacts and manifest for the WinApp to load at runtime. + +### Frontmatter +```yaml +name: ship-to-winapp +description: > + Use this skill when taking a winml-cli model artifact the last mile into a Windows + application — both validating it is good enough to ship and packaging it for the app. + Validation half: a Definition-of-Done checklist covering artifact completeness, accuracy + vs FP32 baseline, performance SLA, output correctness on real inputs, cross-EP consistency, + and fallback chain (every item checked or explicitly waived). Packaging half: how to organize + multi-EP artifacts (QNN/NPU, OpenVINO, VitisAI, DirectML/GPU, CPU fallback), the recommended + directory layout and manifest.json for runtime EP selection, and the runtime EP detection / + fallback pattern. Use when the user says "I'm ready to ship", "what should I test before + release", "how do I know the model is good enough", "how do I use this in my app", + "how do I package the model", or "what file do I load at runtime". +``` + +### When to use +- About to ship a WinApp with on-device inference; final QA gate before production +- After any build config change (new quantization, new EP, new model version) +- "I built the model, how do I ship it in my app?" +- "How do I load different models for different hardware / what happens with no NPU?" +- "How do I package QNN + DML + CPU variants together?" 
+ +--- + +### Part A — Validate (Definition-of-Done gates) + +**The checklist** + +**Gate 1 — Artifact completeness** +- [ ] All target EP artifacts exist and are loadable +- [ ] CPU fallback artifact exists +- [ ] manifest.json (if using multi-EP layout) is valid and references existing files +- [ ] Artifact was built with `winml build` (not opaque cache artifact) + +```bash +winml inspect -m .onnx # verify each artifact loads +``` + +**Gate 2 — Accuracy vs FP32 baseline** +- [ ] cosine_similarity ≥ 0.99 for FP16 artifacts +- [ ] cosine_similarity ≥ 0.95 for W8A16 artifacts +- [ ] cosine_similarity ≥ 0.90 for W8A8 artifacts (or task-specific threshold) +- [ ] Task accuracy metric (Top-1, F1, mAP) within acceptable drop from FP32 + +```bash +winml eval --mode compare -m .onnx --model-id +winml eval -m .onnx --model-id # task accuracy +``` + +**Gate 3 — Performance SLA** +- [ ] p50 latency meets application target on target device +- [ ] p99 latency within 2x p50 (no outlier spikes) +- [ ] Benchmark run on actual target hardware (not developer machine) + +```bash +winml perf -m .onnx --device --iterations 100 --monitor +``` + +**Gate 4 — Output correctness on real inputs** +- [ ] Model produces correct output on ≥3 representative real-world inputs +- [ ] No NaN or Inf in outputs +- [ ] Output shape matches expected shape + +```bash +winml run -m .onnx --file # visual/manual check +``` + +**Gate 5 — Cross-EP consistency (if shipping multiple EP variants)** +- [ ] QNN and DML outputs agree within tolerance on same input +- [ ] CPU fallback output agrees with primary EP within tolerance + +```bash +winml run -m model_qnn.onnx --file sample.jpg --format json -o qnn_out.json +winml run -m model_dml.onnx --file sample.jpg --format json -o dml_out.json +winml run -m model_cpu.onnx --file sample.jpg --format json -o cpu_out.json +# compare qnn_out.json vs dml_out.json vs cpu_out.json manually +``` + +**Gate 6 — Fallback chain** +- [ ] CPU fallback artifact verified 
independently (not just assumed to work) +- [ ] App runtime selects correct artifact when target EP is absent (simulate by removing EP) + +**Waiver policy** +Any item that cannot be completed must be waived explicitly: +``` +Waivers: +- Cross-EP consistency: VitisAI not available on developer machine. + Verified on target hardware by QA team. Issue #NNN. +- Performance SLA: Target hardware (Snapdragon X Elite) in procurement. + Benchmark deferred to post-merge, tracked in issue #NNN. +``` +Unchecked items without waiver → do not ship. + +**L-level mapping** — the 6 gates map directly to the L1–L5 confidence system (see "Validation confidence levels (L1–L5)" above): + +| Gate | L-level | +|---|---| +| Gate 1 — Artifact completeness | L1 | +| Gate 2 — Accuracy vs FP32 baseline | L3 + L4 | +| Gate 3 — Performance SLA | L5 | +| Gate 4 — Output correctness on real inputs | L4 | +| Gate 5 — Cross-EP consistency | L5 | +| Gate 6 — Fallback chain | L1 (CPU artifact) | + +Minimum to ship: L1 + L3 all passing. L4 + L5 required for production release. + +**Quick command reference** +```bash +# Gate 1: inspect all artifacts +for f in model_qnn.onnx model_dml.onnx model_cpu.onnx; do winml inspect -m $f; done +# Gate 2: accuracy +winml eval --mode compare -m <artifact>.onnx --model-id <model-id> +winml eval -m <artifact>.onnx --model-id <model-id> +# Gate 3: perf +winml perf -m <artifact>.onnx --device auto --iterations 100 --monitor +# Gate 4: real input +winml run -m <artifact>.onnx --file <input-file> +# Gate 5: cross-EP (run individually, compare outputs) +winml run -m model_qnn.onnx --file <input-file> --format json +winml run -m model_dml.onnx --file <input-file> --format json +``` + +--- + +### Part B — Package & integrate (multi-EP) + +**1. The multi-EP artifact problem** +`winml compile` produces EP-locked files (not portable), so a WinApp needs a strategy to +select the right file per device. + +**2. 
Recommended artifact layout** +``` +my_model/ + manifest.json ← EP → file mapping + version + model_qnn.onnx ← QNN NPU (compiled, Snapdragon X) + model_openvino.onnx ← OpenVINO NPU/GPU (Intel Core Ultra) + model_vitisai.onnx ← VitisAI NPU (AMD Ryzen AI) + model_dml.onnx ← DirectML GPU (any GPU, non-NPU machines) + model_cpu.onnx ← CPU fallback (universal) +``` + +**3. manifest.json schema** +```json +{ + "model_id": "facebook/convnext-tiny-224", + "task": "image-classification", + "version": "1.0.0", + "variants": [ + { "ep": "qnn", "device": "npu", "file": "model_qnn.onnx", "precision": "w8a16" }, + { "ep": "openvino", "device": "npu", "file": "model_openvino.onnx", "precision": "w8a8" }, + { "ep": "vitisai", "device": "npu", "file": "model_vitisai.onnx", "precision": "w8a8" }, + { "ep": "dml", "device": "gpu", "file": "model_dml.onnx", "precision": "fp16" }, + { "ep": "cpu", "device": "cpu", "file": "model_cpu.onnx", "precision": "w8a8" } + ], + "selection_order": ["qnn", "openvino", "vitisai", "dml", "cpu"] +} +``` +(For multi-EP artifacts, `autoconfig` emits this `manifest.json` directly with experiment provenance.) + +**4. Building all variants with winml-cli** +```bash +# Generate configs per EP +winml config -m --device npu --ep qnn -o config_qnn.json +winml config -m --device npu --ep openvino -o config_ov.json +winml config -m --device gpu --ep dml -o config_dml.json +winml config -m --device cpu -o config_cpu.json + +# Build all +winml build -c config_qnn.json -m -o out_qnn/ +winml build -c config_ov.json -m -o out_ov/ +winml build -c config_dml.json -m -o out_dml/ +winml build -c config_cpu.json -m -o out_cpu/ +``` + +**5. Runtime EP selection pattern (C++ / ORT)** +Pseudocode for app-side logic: +- Read manifest.json +- Query available EPs on device (`GetAvailableProviders()` or `winml sys` equivalent) +- Walk `selection_order`, pick first EP available on this device +- Load the corresponding file +- If all fail → CPU is always available + +**6. 
What NOT to do** +- Don't load a QNN-compiled model with CPU EP → will fail or produce wrong results +- Don't hardcode EP names → check availability at runtime +- Don't ship only the compiled artifact without a CPU fallback + +**Cross-references:** +- If accuracy gate fails → `debug-accuracy-drop` +- If performance gate fails → `autoconfig` (manual or automated optimize path) +- If EP not available for testing, or to pick the right EP → `check-model-feasibility` +- To build the artifacts → `use-winml-cli` + +--- + +## Skill: `check-model-feasibility` (merge of `find-a-model` + `ep-compatibility-check`) + +The pre-build front door. Two entry points, one shared engine (`inspect` → `sys` → `analyze`): +**(A)** the user has no model yet → recommend a *supported* one from their constraints; +**(B)** the user has a model → confirm it runs on their target EP/device. Both converge on the +same three-layer check, so they are one skill. + +### Frontmatter +```yaml +name: check-model-feasibility +description: > + Use this skill before a full build, to answer two linked questions: "which model should I + use?" and "will it run on my hardware?". Model discovery: when the user knows the task + (image classification, text embedding, object detection, summarization, …) but has no model + yet, gather their constraints, generate Hugging Face candidates, and screen each one for + winml-cli support. Compatibility: for a chosen (or candidate) model, run the three-layer check + — winml inspect (model support), winml sys (EP availability on this machine), winml analyze + (operator-level EP coverage) — plus the EP-to-hardware mapping and fallback chain for Windows + AI PCs. Use when the user says "what model should I use for X", "find me a model that runs + under 20ms on the NPU", "recommend a small image classifier", "I don't have a model yet", + "will this work on my device", "is QNN supported here", "what hardware do I need for NPU", + or when they hit an unsupported-operator error. 
+ +audience: external (WinApp developers) +``` + +### When to use +- "What model should I use for background blur / OCR / summarization?" +- "Find a text-embedding model under 100MB that runs on the Intel NPU" +- "Will this model work on my Snapdragon X Elite laptop? Is QNN supported here?" +- "The compile step failed with an unsupported op" +- Starting a new project: pick a model and verify feasibility before investing build time + +### What this skill does NOT do +- It does not train, fine-tune, or optimize a model — optimization hands off to `autoconfig`. +- It only recommends models whose architecture winml-cli can actually export/run (verified via + `winml inspect`), never an arbitrary HF model it cannot load. + +### Sections + +**1. Two entry points** +- (A) **No model yet** → run Section 2 (discovery) to produce candidates, then Section 3 on each. +- (B) **Have a model** → skip to Section 3 (three-layer check) directly. + +**2. Discovery — find candidate models (entry point A)** +Capture and lock the selection constraints first: + +| Condition | Example | Drives | +|---|---|---| +| Task | image-classification, feature-extraction, text-generation | HF Hub filter | +| Target device / EP | Snapdragon X NPU (QNN), Intel NPU (OpenVINO), any GPU (DML) | feasibility + latency class | +| Latency budget | p50 ≤ 20 ms | size / architecture shortlist | +| Accuracy need | "≥ ResNet-50 top-1" or a benchmark floor | candidate quality bar | +| Size limit | ≤ 100 MB on disk | excludes large variants | +| License | permissive (Apache-2.0 / MIT) | excludes restricted models | + +The agent queries the HF Hub by task, sorted by downloads/likes, restricted to architecture +families winml-cli is known to support → a 5–10 model shortlist. Each candidate then goes +through the three-layer check below; drop any that fail Layer 1 or have heavy unsupported ops. + +**3. 
The three-layer feasibility check (entry points A and B)** +Layer 1 — Model support · Layer 2 — EP availability · Layer 3 — Operator coverage. +Run in order, stop at first hard failure. + +*Layer 1 — Model support* +```bash +winml inspect -m <model_id> --format json +``` +Look for `loader`, `exporter`, `winml_inference_class` populated. If inspect fails or shows +"unsupported" → model is out of scope for winml-cli (drop the candidate; do not recommend it). + +*Layer 2 — EP availability* +```bash +winml sys --list-ep --list-device +``` +| EP | Hardware requirement | Check for | +|---|---|---| +| QNN | Qualcomm Snapdragon X Elite / X Plus | QNNExecutionProvider in list | +| OpenVINO | Intel Core Ultra (Meteor Lake / Lunar Lake+) | OpenVINOExecutionProvider | +| VitisAI | AMD Ryzen AI (Phoenix / Hawk Point / Strix) | VitisAIExecutionProvider | +| NvTensorRTRTX | NVIDIA discrete GPU (RTX series) | NvTensorRTRTXExecutionProvider | +| DML | Any DirectX 12 GPU | DmlExecutionProvider | +| CPU | Any | Always available | + +If the desired EP is not listed → recommend next best EP from the fallback chain. + +*Layer 3 — Operator coverage* +```bash +winml analyze -m <model>.onnx --ep <ep> --format json +# or for all EPs at once: +winml analyze -m <model>.onnx --device all +``` +- `supported` (green): op runs natively on EP +- `partial` (yellow): op may fall back to CPU for some configurations +- `unsupported` (red): op cannot run on this EP + +Decision rule: any `unsupported` → either change EP or accept CPU fallback for those ops +(which may impact accuracy and latency). + +**4. Fallback chain recommendation** +If the target EP is not available or has unsupported ops: +``` +QNN not available → OpenVINO (if Intel) or VitisAI (if AMD) → DML → CPU +``` + +**5. Rank and recommend (entry point A) / fast-fail before compile (entry point B)** +- Discovery: rank surviving candidates by fit against the locked conditions (size, latency + class, accuracy reference, op coverage, downloads as a popularity prior). 
Output a short + ranked table + one recommended pick + rationale. +- `winml compile` is expensive (minutes). Always run `analyze` first; if it shows >20% + unsupported ops → likely not worth compiling for that EP. + +**Cross-references:** +- After picking a model + confirming feasibility → `autoconfig` (find the optimal config) +- To build the chosen artifacts → `use-winml-cli` +- If **no** supported model meets the constraints, or all EPs show unsupported ops → the gap + feeds `optimization-research` (long-tail coverage) and `adding-model-support` + +> Addresses the **Pre-quantized model zoo / cold-start** whitespace from the Competitive Analysis: +> NVIDIA (`nvidia/` HF org) and AI Hub (500+ models) reduce cold-start with curated zoos; winml-cli +> has none, so this skill substitutes a constraints-driven recommender that only returns *supported* models. + +--- + +## Skill: `adding-model-support` (contributor) + +### Frontmatter +```yaml +name: adding-model-support +description: > + Use this skill when contributing support for a new Hugging Face model to + winml-cli. Covers finding the correct exporter, writing a recipe config, + verifying at each pipeline stage (export → optimize → quantize → compile), + and passing the L1–L5 validation gates before submitting a PR. Use when + a contributor says "I want to add support for model X", "this model type + is not supported", or "how do I write a recipe for a new architecture". +``` + +### When to use +- "I want to add support for Qwen3 / Phi-4 / [new model]" +- "winml-cli says this model is unsupported" +- "How do I write a recipe config for a new model family?" + +### Sections + +**1. Find the right exporter** +```bash +winml inspect -m # check if auto-detected +``` +If inspect fails → the model needs a new exporter or recipe. +Look in `src/winml/modelkit/export/` for existing exporters as reference. + +**2. Find a reference model of the same family** +- Same architecture class (e.g., LlamaForCausalLM, BertModel)? 
+- Check `recipes/` for an existing `.json` config for that class +- Prefer copying the closest recipe and adjusting rather than writing from scratch + +**3. Write the recipe config** +Minimal recipe template: +```json +{ + "model_id": "org/model-name", + "task": "text-generation", + "export": { "opset": 17 }, + "optimize": { "passes": ["MatMulAddFusion", "LayerNormFusion"] }, + "quantize": { "mode": "w8a16", "calibration_dataset": "wikitext2" } +} +``` + +**4. Validate at each stage (L1 → L5)** + +| Stage | Command | Pass criterion | +|---|---|---| +| L1: Export loads | `winml inspect -m .onnx` | No error | +| L2: Shape correct | `winml eval -m .onnx --model-id ` | Output shape matches | +| L3: Numerical parity | `winml eval --mode compare -m .onnx --model-id ` | cosine ≥ threshold | +| L4: Task accuracy | `winml eval -m .onnx --model-id ` | Task metric in spec | +| L5: Perf on target EP | `winml perf -m .onnx --device ` | Meets latency target | + +**5. Common pitfalls for new models** +- New op types not in operator coverage → run `winml analyze` early +- Attention variant (GQA, MQA, MLA) → check quantization mode compatibility +- Dynamic shapes → add explicit shape hints in export config +- Non-standard tokenizer → verify `winml run` input preprocessing + +**Cross-references:** +- If EP shows unsupported ops → `check-model-feasibility` +- After L1–L5 all pass → `ship-to-winapp` for PR gate + +--- + +## Skill: `adding-ep-support` (contributor) + +### Frontmatter +```yaml +name: adding-ep-support +description: > + Use this skill when adding a new execution provider (EP) backend to + winml-cli. Covers implementing the compile backend interface, adding + EP-specific optimize passes, wiring the new EP into winml sys and + winml analyze, and verifying coverage with the L1–L5 test gates. + Use when a contributor says "I want to add support for a new EP", + "how does the QNN compile backend work", or "can we support EP X". 
+``` + +### When to use +- Adding a new EP compile backend (e.g., a new NPU vendor) +- Extending an existing EP with new optimization passes +- Understanding how the existing QNN / OpenVINO / VitisAI backends are structured + +### Sections + +**1. EP backend interface** +Reference implementation: `src/winml/modelkit/compile/qnn_backend.py` +Three methods to implement: +```python +class MyEPBackend(CompileBackend): + def is_available(self) -> bool: ... # detect EP on current machine + def optimize(self, model, config): ... # EP-specific graph transforms + def compile(self, model, config): ... # produce EP-locked artifact +``` + +**2. Wire into EP registry** +Register in `src/winml/modelkit/ep_registry.py`: +```python +EP_REGISTRY["myep"] = MyEPBackend +``` +This makes `--ep myep` work in `winml config`, `winml compile`, `winml analyze`. + +**3. Add operator coverage data** +Add a coverage JSON to `src/winml/modelkit/analyze/coverage/myep_ops.json`: +```json +{ "Add": "supported", "LayerNorm": "partial", "CustomOp": "unsupported" } +``` +This is what `winml analyze --ep myep` reads. + +**4. Add to `winml sys` output** +Add EP availability check to `src/winml/commands/sys.py` so it appears +in `winml sys --list-ep`. + +**5. L1–L5 validation for the new EP** +Minimum before merging: +- L1: A known-good model compiles without crash +- L3: Compiled artifact passes `winml eval --mode compare` (cosine threshold) +- L5: `winml perf` produces valid latency output on target hardware + +**Cross-references:** +- Operator coverage analysis → `check-model-feasibility` +- After adding: document the EP in the `check-model-feasibility` hardware table + +--- + +## Skill: `contributing-a-skill` (contributor) + +### Frontmatter +```yaml +name: contributing-a-skill +description: > + Use this skill when writing a new SKILL.md for winml-cli or improving + an existing one. 
Covers frontmatter requirements, description writing + (the description is the agent trigger, not a human summary), section + structure conventions, cross-reference format, command accuracy + requirements, and the review checklist before submitting. Use when a + contributor says "I want to add a new skill", "how should I write + SKILL.md", or "what are the skill authoring rules". +``` + +### When to use +- Writing a new skill for a gap not covered by existing skills +- Improving an existing skill with new commands or sections +- Reviewing a skill PR + +### Sections + +**1. Frontmatter rules** +```yaml +name: kebab-case-skill-name # matches directory name under skills/ +description: > + Use this skill when <situation that should activate the skill>. + Covers <topics and commands the skill handles>. + Use when the user says "<trigger phrase 1>", "<trigger phrase 2>", or <other triggering condition>. +``` + +**Critical:** The `description` field is what the Copilot agent reads to decide +whether to activate this skill. Write it as a trigger specification, not a +documentation summary. Include representative user phrases in quotes. + +**2. Required sections (in order)** +1. `## When to use` — 3–5 bullet points with user-facing symptoms/questions +2. Diagnostic or decision section — symptom → cause → fix structure +3. Command examples — runnable `winml` commands with real flags +4. Reference tables — hardware, thresholds, EP names as concrete data +5. `## Cross-references` — links to related skills using relative paths + +**3. Cross-reference format** +```markdown +- If accuracy dropped → see `.agents/skills/debug-accuracy-drop/SKILL.md` +- After validating → see `.agents/skills/validate-before-ship/SKILL.md` +``` + +**4. Content rules** +- All commands must be runnable exactly as written (no pseudocode flags) +- Include concrete numbers: thresholds (cosine ≥ 0.99), speedup (3–5×), latency (<50ms) +- Target ~200 lines prose + tables; move deep content to `references/` subdirectory +- Do not duplicate content from another skill — cross-reference instead + +**5. 
Review checklist before PR** +- [ ] `description` contains ≥3 quoted user trigger phrases +- [ ] All commands are tested and produce the described output +- [ ] Cross-references use relative paths and the linked skill exists +- [ ] No commands reference flags that don't exist in current `winml --help` +- [ ] Hardware names and EP names match the canonical list in `check-model-feasibility` +- [ ] `evals/eval.yaml` exists with ≥2 test cases (including at least one negative assertion) + +--- + +## Skill: `autoconfig` (user — optimize the model: automated loop + manual framework) + +The optimize skill. Two modes: **automated** (the autoresearch loop — the bulk of this section) for +"figure it out for me / run overnight", and **manual** (the decision framework folded in from +`optimize-for-device`) for "I'll choose by hand" or when there is no target hardware to benchmark on. + +### Frontmatter +```yaml +name: autoconfig +description: > + Use this skill when a **WinApp developer** wants the best performance for their model on one or + more Windows EP/device targets — either by letting winml-cli search automatically, or by working + through the precision/EP tradeoffs by hand. Automated mode: an autonomous experiment loop that + proposes config.json hypotheses, runs winml build + eval + perf, evaluates against user-defined + objectives (accuracy floor, latency budget, or Pareto frontier), and iterates — keeping + improvements, discarding regressions; covers single-EP optimization, multi-EP parallel search, + mixed-precision (nodes_to_exclude) exploration, calibration tuning, and manifest.json output. + Manual mode: the latency-budget vs accuracy-floor decision framework, the FP32→FP16→W8A16→W8A8 + precision ladder, a per-device hardware guidance table, and how to read tradeoff results. 
+ Use when the user says "find the best config for my model on QNN", "automate the config search", + "generate configs for all EPs", "I want to leave this running overnight", "make it faster", + "which precision should I use", "is NPU worth it", or "compare QNN vs DirectML vs CPU". + +audience: external (WinApp developers) +``` + +### When to use +- "Find the best W8A8 config that keeps accuracy > 0.95 on QNN" +- "Generate optimized configs for QNN + DirectML + CPU and build a manifest" +- "I don't know which quantization settings to use, figure it out for me" / "run overnight" +- "Make it faster" / "which precision should I use" / "is NPU worth it" (→ manual mode) +- "Compare QNN vs DirectML vs CPU for my model" +- User has a latency SLA or accuracy floor but doesn't know how to achieve it + +### What this skill does NOT do +- It only searches within what `winml build` currently supports (existing capabilities) +- It does not look for optimization techniques outside winml's current feature set +- It does not suggest that winml needs new features or file bugs +- For finding what winml is *missing*, use `optimization-research` instead + +--- + +### Manual mode — the decision framework (folded in from `optimize-for-device`) + +Use this lightweight path when the user wants to decide by hand, or has no target hardware to +benchmark on (so the automated loop's perf gate can't run). It is the conceptual model the +automated loop below mechanizes. + +**1. The decision framework** — two inputs: latency budget OR accuracy budget. +- Have a latency SLA (e.g. <50ms)? → find highest accuracy within that budget +- Have an accuracy floor (e.g. <2% drop)? → find fastest within that floor + +**2. The precision ladder** — FP32 → FP16 → W8A16 → W8A8, with typical speedup and accuracy-drop +ranges per model family (Encoder/BERT-like, Vision/ConvNet, Transformer/ViT). + +**3. 
The sweep workflow** — run `winml build` + `winml eval` + `winml perf` for each precision, +collect into a tradeoff table, apply the decision framework. +```bash +winml config -m --device --precision fp16 -o config_fp16.json +winml build -c config_fp16.json -m -o out_fp16/ +winml eval -m out_fp16/.onnx --model-id +winml perf -m out_fp16/.onnx --device --iterations 50 +# repeat for w8a16, w8a8 +``` + +**4. Hardware-specific guidance table** +| Device | Best EP | Sweet-spot precision | Notes | +|---|---|---|---| +| Snapdragon X Elite NPU | QNN | W8A16 | HTP native for W8A16; W8A8 risky for Attention | +| Intel Core Ultra NPU | OpenVINO | W8A8 | OpenVINO PTQ handles INT8 well | +| AMD Ryzen AI NPU | VitisAI | W8A8 | Phoenix/Hawk Point prefer INT8 | +| Any GPU | DirectML | FP16 | FP16 sufficient; quantization rarely helps on GPU | +| CPU fallback | CPU | W8A8 | Size + latency both benefit | + +**5. Reading the output** — how to interpret `winml eval` cosine_similarity / SQNR and +`winml perf` p50/p90/p99; what values indicate "acceptable" vs "needs investigation". + +When the user wants this automated instead of done by hand, continue to the autoresearch loop below. + +--- + +### Epistemic standard for autoconfig findings + +**Any conclusion this skill writes into a report or recommends to a user must meet this bar:** + +| Requirement | What it means | +|---|---| +| **Observation vs explanation** | State what was measured separately from why it happened. "latency increased 270ms" is fact. "because NHWC causes cache thrashing" is a hypothesis — label it as such unless confirmed by profiling. | +| **Statistical validity** | A latency claim requires ≥ 3 independent runs with warmup. A single `winml eval` run (no warmup, includes preprocessing) is insufficient to quote as a latency number. It can guide search decisions but not final reports. 
| +| **Mechanism confirmation** | Do not explain a regression unless the mechanism is confirmed (e.g., by profiler, by op-level timing, or by **source code inspection of ORT/QNN SDK**). If unknown, write "cause unconfirmed; further profiling needed." | +| **Scope boundary** | Results measured on one model/EP are never generalized to other models/EPs without explicit qualification. "On ConvNext-tiny CPU" is allowed. "CPU dislikes fusion" is not — it's an overgeneralization. | +| **Unresolved uncertainty** | If an observation contradicts the expected behavior (e.g., a "disabled" fusion still appears in the output), the report must flag this as an open question, not silently adopt an explanation. | +| **EP isolation** | A finding on one EP (positive or negative) MUST NOT be applied to prune the search space of a different EP without independent validation. CPU opset regression ≠ QNN NPU opset regression. Always validate per EP independently. | + +The skill MUST NOT write confident root-cause explanations in the HTML report or chat summary for regressions where only the measurement is available. Use hedged language: "this likely relates to…", "one hypothesis is…", or simply omit the explanation and recommend profiling. + +#### Perf gain validation protocol + +Before **any** perf gain is written into a report, config recommendation, or knowledge base as a confirmed finding, it must pass ALL three gates: + +**Gate 1 — Statistical: two-phase bench protocol (from GPU Optimizer V2)** + +``` +Phase A — Quick screen (fast, ~2 min): + winml perf -m --ep --device --warmup 20 --iterations 200 -o screen.json + CV = screen.json.std / screen.json.p50 + IF CV > 0.10 (10%) on CPU/GPU: REJECT — high variance, measurement unreliable + → cool down 120s, retry once + → if still CV > 0.10: flag as [UNSTABLE], skip candidate + EXCEPTION — QNN NPU: CV 0.10–1.2 is NORMAL due to DVFS (Hexagon HTP thermal throttling). + Do NOT reject on CV for QNN NPU. 
Instead: proceed to Phase B unconditionally. + Phase B's 3-session cool-down is the thermal control mechanism for NPU. + Watch for: s0 of any session may be elevated (JIT warmup) — exclude if >20% above s1/s2. + +Phase B — Full bench (for CPU/GPU: only if Phase A passes; for QNN NPU: always): + # 3 independent sessions with 30s cool-down between each (QNN NPU) + # or 60s cool-down (GPU) between each + winml perf ... --warmup 20 --iterations 500 -o run1.json + sleep 30 # (30s for NPU, 60s for GPU) + winml perf ... --warmup 20 --iterations 500 -o run2.json + sleep 30 + winml perf ... --warmup 20 --iterations 500 -o run3.json + + # KEEP if ALL of: + # 1. p50(run1,2,3) are ALL faster than baseline p50 × (1 - min_improvement) + # (for NPU: ranges must be non-overlapping, not just means) + # 2. cosine_similarity ≥ accuracy_floor + KEEP_threshold = baseline_p50 × 0.99 # ≥1% improvement required +``` +Rationale: DVFS on QNN NPU causes 0.15–1.2 CV routinely — single sessions are meaningless. CV check only +gates CPU/GPU. For NPU, multi-session averaging + range separation is the reliability criterion. +Validated on DINOv2-small: 3 sessions separated cleanly (h3 s1/s2=4.97/4.88ms well below entire h1 +range 6.4–9.4ms). 
+ +**Gate 2 — Mechanism: read ORT/QNN source code before explaining why** +- For QNN EP gains: check `onnxruntime/core/providers/qnn/builder/` for opset-conditional dispatch +- For CPU EP gains: check `onnxruntime/core/optimizer/` for pass applicability conditions +- For DML EP gains: check DML operator mapping tables +- **Do not publish "opset 21 = faster on QNN NPU for model X" without confirming the mechanism.** + Even after: (1) confirming kMaxSupportedOpset ≥ 23 (bypass hypothesis RULED OUT), (2) verifying + Transpose count identical in optimized graphs for opset17 vs opset21 (Transpose-elimination + hypothesis RULED OUT) — the DINOv2 +24-31% speedup is empirically real but mechanism is UNKNOWN. + The only confirmed observation is +48 Reshape nodes in opset21 vs opset17. Why this helps QNN NPU + is an open research question. KB status: observed=true, mechanism_confirmed=false. + +**Gate 3 — Reproducibility: baseline and candidate measured in same thermal state** +- Run baseline and candidate back-to-back in the same session OR +- Use a device-level tool to lock NPU clock frequency +- If you cannot control thermal state, report min_ms (peak-performance ceiling) alongside p50 (typical performance), and flag the variance explicitly. + +**Lesson from DINOv2 and ConvNext opset sweep (validated 2026-06-17):** + +DINOv2-small and DINOv2-base (Facebook DINO pre-training, ConvNeXt-style patch projection + ViT): +- opset21 vs opset17: DINOv2-small +30.6%, DINOv2-base +24.1% (3×500-iter, clean protocol ✅) +- dino-vitb16 (Google ViT): -0.7% NEUTRAL — same ViT architecture, no benefit +- gender-classification ViT: +3.5% NEUTRAL — IDENTICAL op counts to DINOv2-small (49 Transpose, 121 Reshape) but no benefit +**Conclusion: opset21 benefit is real for DINOv2 family but NOT generalizable to ViT. 
Mechanism unknown.** +The gain is NOT from: NHWC bypass (kMaxSupportedOpset ≥ 23), Transpose elimination (count identical). +The only structural difference: +48 Reshape nodes in opset21 optimized graph. Effect is below op-count visibility. + +ConvNext CPU opset sweep: data IS real (opset17 best, opset19+ 3-4× regression). NOT inconclusive. +Mechanism uncertainty exists for CPU (two separate kMaxSupportedOpset constants in ORT, one unverified), +but the practical rule stands: use opset17 for CPU EP unconditionally. + +--- + +### Design Comparison: GPU Optimizer V2 vs WinML Autoconfig + +**Reference**: "Agentic GPU Model Optimization" doc (cheye@, 2026-03-20). GPU Optimizer V2 is a 6-role multi-agent system for cloud GPU inference optimization (ONER-1B KNN service, H100). Autoconfig is a local edge inference optimizer (winml-cli, Snapdragon X). Most of their infrastructure (machine pool, SSH fleet, Triton serving, custom CUDA kernels, SM occupancy tuning) does not apply here. But the agent loop design has several directly adoptable ideas. + +#### Adoptable insights from GPU Optimizer V2 + +| V2 design decision | V2 rationale | Adopt into autoconfig? | Notes | +|---|---|---|---| +| **Two-phase bench: 200-iter quick screen → 3×1000-iter full bench** | "CV<2% gates full bench — avoid wasting time on high-variance results" | ✅ **YES — highest priority gap** | We've been doing single 50-iter runs and calling them facts. CV check would have caught the DVFS noise immediately. 
| +| **Verdict policy names (ThroughputOnly, ThroughputOrLatency…)** | "Named policies prevent Reviewer from ad-hoc criteria drift" | ✅ YES (simplified) | Autoconfig should have explicit KEEP criteria: `p50_ms < baseline × (1 - threshold)` AND `cosine ≥ floor` | +| **Append-only experiment_log.md + results.tsv written only by Reviewer** | "Single writer = no drift, full audit trail" | ✅ YES | Our results.tsv exists but no "single writer" discipline | +| **Explorer mandatory external-research triggers** | "After 15 consecutive DISCARDs → external research sweep" | ✅ YES — this is the exact gap that caused the opset 21 miss | If we had this rule, we would have searched ORT source after N DISCARDs and found kMaxSupportedOpset earlier | +| **Knowledge agent with review gate before KB save** | "Learnings reviewed before they prune future search" | ✅ YES | ep_knowledge/*.json entries should be marked draft until Gate 2 (mechanism) is confirmed | +| **Correctness contract locked after Phase 0, never modified** | "Prevents accuracy goal-post moving" | ✅ YES | We have accuracy gate but no locked contract file | +| **30-consecutive-DISCARD stop condition** | "Prevents endless search in exhausted space" | ✅ YES | autoconfig has no stop condition today | +| **Per-experiment structured output: Hypothesis → Implementation → Parity → Perf → Analysis → Decision** | "Enables post-analysis and knowledge extraction" | ✅ YES | autoconfig report is currently holistic, not per-experiment | +| **Role separation: Profiler / Explorer / Optimizer / Reviewer are separate agents** | "Prevents context drift; each agent stays focused" | ⚠️ Partial | Full 6-agent split is overkill for CLI tool; but Explorer / Reviewer distinction is valuable | +| **Resource lock: only one GPU job at a time** | "Prevents benchmark interference" | ✅ YES (trivially) | Already serial; but should be explicitly enforced if autoconfig ever parallelizes | +| **Machine pool + SSH fleet + Model Registry** | Cloud GPU 
fleet management | ❌ N/A | Local device only | +| **Custom CUDA kernel writing** | "Extreme asymmetry benefits from custom kernels" | ❌ N/A | CLI-only constraint; no kernel modification | +| **SM occupancy / GEMM tile count tuning** | "H100 has 132 SMs; 48 output tiles = 36% occupancy" | ❌ N/A | Edge NPU/GPU, not H100 multi-SM | +| **FlashAttention / fused QKV** | "Eliminate HBM traffic for attention score matrix" | ❌ N/A | Model is already trained; deployment-time optimization only | + +#### Key gaps in current autoconfig design (from V2 comparison) + +**Gap 1 (critical): No two-phase bench protocol** +Current design runs `--iterations 50` and accepts the result. V2 runs: +1. Quick screen: 200 iters, check CV < 2% (Coefficient of Variation = std/mean) +2. Only if CV < 2%: full bench 3×1000 iters with 60s cool-down between sessions +3. KEEP only if Δp50 > threshold AND CV(candidate) < 2% + +This directly matches the "iter ≥ 1000" rule we just added. Formalize it as two phases. + +**Gap 2 (critical): No mandatory external-research trigger in Explorer** +V2 Explorer triggers external research (web search, papers, source code) after: +- 15 consecutive DISCARDs +- Every KEEP that changes model/precision +- Before declaring backlog_empty + +We discovered kMaxSupportedOpset only by accident (downloading QNN Hub models). A mandatory "read ORT source after 5 DISCARDs in opset dimension" rule would have found it in Phase 2. + +**Gap 3 (important): ep_knowledge/*.json has no draft/confirmed state** +V2 Knowledge agent requires review gate before KB entries are used to prune search space. Our ep_knowledge findings should have: +- `status: "draft"` — observed, mechanism unconfirmed (Gate 2 not passed) +- `status: "confirmed"` — mechanism confirmed via source code (Gate 2 passed) +- `status: "deprecated"` — finding invalidated by new experiment or ORT version change +Only `"confirmed"` entries should prune search space. 
`"draft"` entries inform hypothesis priority but don't prune. + +**Gap 4 (nice-to-have): No per-experiment structured artifact** +V2 produces per-experiment: Hypothesis / Implementation / Parity / Perf / Analysis / Decision +autoconfig produces: one aggregate report.html. Should produce both. + +### Design: The Autoresearch Loop + +Inspired by [karpathy/autoresearch](https://github.com/karpathy/autoresearch): +agent modifies a config file, runs a fixed-cost experiment, checks if the objective improved, keeps or discards, and repeats autonomously until manually stopped or convergence criteria met. + +``` +OBJECTIVE (user-defined, one of): + A. Accuracy-primary: maximize cosine_similarity subject to p50_ms ≤ + B. Latency-primary: minimize p50_ms subject to cosine ≥ + C. Pareto search: find the full accuracy-latency frontier + +SEARCH SPACE — config.json has three sections the agent can modify: + + [export] + opset_version : int — 17, 18, 19, 20 (higher = newer ops, EP may not support) + do_constant_folding : bool — may affect graph structure visible to EP + dynamic_axes : dict — static vs dynamic shapes (QNN prefers static batch=1) + + [optimize] — full capability list (from winml optimize --list-capabilities) + + GraphPipe (run via ORT SessionOptions): + GELU: + gelu-fusion : bool — fuse tanh-GELU subgraph → Gelu op + fast-gelu-fusion : bool — fuse fast-GELU (tanh-approx) → FastGelu + bias-gelu-fusion : bool — fuse Bias+GELU (requires gelu-fusion) + quick-gelu-fusion : bool — fuse x*sigmoid(1.702x) → FastGelu + gelu-approximation : bool — convert exact Gelu → FastGelu (requires gelu-fusion) + Activation: + bias-softmax-fusion : bool — fuse Bias+Softmax + bias-dropout-fusion : bool — fuse Bias+Dropout + Convolution: + conv-add-fusion : bool — fuse Conv+Add (bias) + conv-bn-fusion : bool — fuse Conv+BatchNorm into weights + conv-mul-fusion : bool — fuse Conv+Multiply + conv-activation-fusion : bool — fuse Conv+activation (ReLU, Sigmoid, etc.) 
+ Elimination: + slice-elimination : bool — remove redundant Slice ops + expand-elimination : bool — remove no-op Expand + unsqueeze-elimination : bool — fold Unsqueeze into initializers + GEMM: + gemm-activation-fusion : bool — fuse GEMM+activation + gemm-sum-fusion : bool — fuse GEMM+Sum + gemm-transpose-fusion : bool — fuse GEMM+Transpose + Graph: + concat-slice-elimination : bool — remove Concat+Slice that restore originals + double-qdq-pairs-remover : bool — remove consecutive QDQ pairs + constant-folding : bool — pre-compute constant exprs (default=True; disable to reduce size) + LayerNorm: + layer-norm-fusion : bool — fuse ReduceMean→Sub→Pow→Sqrt→Div→Mul→Add + skip-layer-norm-fusion : bool — fuse Add(residual)+LayerNorm → SkipLayerNorm (requires layer-norm-fusion) + simplified-layer-norm-fusion : bool — fuse simplified LayerNorm (no mean-centering) + Layout: + transpose-optimizer : bool — eliminate redundant transpose chains + nhwc-transformer : bool — NCHW→NHWC (GPU memory layout) + nchwc-transformer : bool — NCHW→NCHWc (CPU SIMD layout) + conv-add-activation-fusion : bool — fuse Conv+Add+Activation → FusedConv + MatMul: + matmul-add-fusion : bool — fuse MatMul+Add → single kernel + matmul-activation-fusion : bool — fuse MatMul+activation (DML-only, requires matmul-transpose-fusion) + matmul-transpose-fusion : bool — fuse MatMul+Transpose → FusedMatMul + matmul-scale-fusion : bool — fuse MatMul+Scale + matmul-bn-fusion : bool — fuse MatMul+BatchNorm + dynamic-quantize-matmul-fusion : bool — dynamic quant for MatMul + Misc: + gather-slice-to-split-fusion : bool — fuse Gather+Slice → Split + gather-to-slice-fusion : bool — convert Gather to Slice (contiguous idx) + pad-fusion : bool — fuse Pad with Conv/Pool + not-where-fusion : bool — fuse Not+Where + + FusionPipe (ORT transformer fusions, via FusionOptions): + attention-fusion : bool — fuse MHA pattern → Attention/MultiHeadAttention + layer-norm-fusion : bool — (FusionPipe variant, same flag) + 
skip-layer-norm-fusion : bool — (FusionPipe variant) + simplified-layer-norm-fusion : bool — (FusionPipe variant) + embed-layer-norm-fusion : bool — fuse Embedding+Position+LayerNorm (requires layer-norm-fusion) + bias-skip-layer-norm-fusion : bool — fuse Bias+SkipLayerNorm (requires skip-layer-norm-fusion) + fuse-rmsnorm : bool — fuse RMSNorm → LpNormalization(p=2) [custom, QNN-compatible] + packed-qkv-fusion : bool — (SD only) + packed-kv-fusion : bool — (SD only) + skip-group-norm-fusion : bool — (SD only) + bias-add-fusion : bool — fuse BiasAdd + qordered-matmul : bool — (SD only) + + SurgeryPipe (pre-EP graph fixes): + clamp-constant-values : bool — clamp -inf/+inf constants → [-1e3, 1e3] (prevents QNN quant issues) + remove-isnan-in-attention-mask: bool — remove Softmax→IsNaN→Where guards (use after clamp) + + RewritePipe (pattern-based subgraph rewriting): + --enable-{source-slug}-{target-slug} (run winml optimize --list-rewrites for full list) + Examples: --enable-gelu-singlegelu, --enable-matmuladdpattern-reshapegemmreshapepattern + + [quant] + precision : fp16 | w8a16 | w8a8 + calibration_method : minmax | entropy | percentile + samples : 64 | 128 | 256 | 512 + per_channel : bool + symmetric : bool + op_types_to_quantize : list[str] — restrict which op types get quantized + nodes_to_exclude : list[str] — exclude specific named nodes + +FIXED: winml build + winml eval + winml perf (the experiment harness) +METRIC: cosine_similarity (from winml eval --format json) + p50_ms (from winml perf --format json) +RECORD: results.tsv +``` + +--- + +### Profiler-Enhanced Agent Architecture (redesigned) + +**Insight from GPU Optimizer v2 analysis and ConvNext POC:** +Running the profiler *before* the search loop would have shown Gemm=57.7% on ConvNext — +immediately ruling out layout-pass experiments (Transpose only 2.6%, already fused Gelu already +canonical). Profile-first makes the Explorer smarter and the search shorter. 
+ +**New 4-phase structure:** + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ PHASE 0 — INTAKE │ +│ winml inspect → validate model is supported │ +│ winml build (baseline config) → get model.onnx │ +│ winml eval --mode compare → lock FP32 correctness baseline │ +│ winml perf (baseline) → establish latency floor │ +└────────────────────────────┬────────────────────────────────────────┘ + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ PHASE 1 — PROFILE (runs ONCE, before any search) │ +│ winml perf -m baseline/model.onnx --ep --profile │ +│ Parse bottleneck.json: │ +│ - top_bottleneck: op type with highest % of kernel time │ +│ - top3_concentration_pct: how concentrated the compute is │ +│ - headroom_hints: actionable pass recommendations │ +│ Classify each bottleneck op type: │ +│ - "compute" (Gemm, Conv, Attention) → quant/kernel matters │ +│ - "layout" (Transpose, Reshape) → graph pass matters │ +│ - "already_canonical" (op shows as fused type) → fusion N/A │ +│ Output: prioritized_hypothesis_queue (ordered by profile evidence)│ +└────────────────────────────┬────────────────────────────────────────┘ + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ PHASE 2 — PROFILE-GUIDED OPTIMIZATION LOOP │ +│ │ +│ ┌──────────────┐ ┌──────────────┐ ┌─────────────────────┐ │ +│ │ EXPLORER │───►│ OPTIMIZER │───►│ REVIEWER │ │ +│ │ │ │ │ │ │ │ +│ │ Pops next │ │ Runs ONE │ │ Cross-exp verdict: │ │ +│ │ hypothesis │ │ experiment: │ │ - CV gate Phase A │ │ +│ │ from queue, │ │ build + │ │ - full bench Gate 1 │ │ +│ │ motivated by │ │ quick-screen │ │ - keep / discard │ │ +│ │ profile data │ │ → full bench │ │ - detect plateau │ │ +│ │ │ │ → eval │ │ - stop condition │ │ +│ └──────────────┘ └──────────────┘ │ - write KB draft │ │ +│ ▲ └─────────────────────┘ │ +│ mandatory external-research triggers (adopted from V2): │ +│ • after 5 consecutive DISCARDs in same search dimension │ +│ → 
search ORT/QNN SDK source code for mechanism │ +│ • after every KEEP that changes precision or EP │ +│ → re-read ep_knowledge for updated constraints │ +│ • before declaring search_space_exhausted │ +│ → ORT source sweep: opset gates, EP-specific dispatch rules │ +│ │ +│ Explorer prunes via bottleneck.json (only "confirmed" KB rules): │ +│ IF top_bottleneck == "Gemm" (>50%): │ +│ → SKIP layout passes (transpose-optimizer, nchwc, nhwc) │ +│ → FOCUS on: quant precision, calibration, matmul fusions │ +│ IF top_bottleneck == "Transpose" (>10%): │ +│ → CHECK kMaxSupportedOpset for current ORT version FIRST │ +│ IF top_bottleneck == "Conv" (>20%): │ +│ → try nchwc-transformer, conv-activation-fusion │ +│ IF "Gelu"/"LayerNormalization" op_type (already canonical): │ +│ → SKIP corresponding fusion flags │ +└────────────────────────────┬────────────────────────────────────────┘ + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ PHASE 3 — REPORT │ +│ config__optimal.json ← champion config with _autoconfig_meta│ +│ report.html ← full benchmark + profile section │ +│ experiments// ← per-exp: hypothesis/impl/parity/ │ +│ perf/analysis/decision (V2 pattern) │ +│ kb_entry.json ← status="draft"; promoted to │ +│ "confirmed" only after mechanism confirmed (Gate 2) │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +**ep_knowledge draft/confirmed lifecycle (Gap 3 fix):** + +``` +KB entry states: + "draft" — observed perf delta, mechanism unconfirmed (Gate 2 not passed) + Can influence hypothesis PRIORITY but NOT prune search space + "confirmed" — mechanism confirmed via ORT/QNN source code (Gate 2 passed) + Can prune search space for future runs + "deprecated"— finding invalidated by new experiment or stack version change + Must NOT influence search space; kept for history only + +Transition rules: + draft → confirmed: requires mechanism_confirmed=true + source_citation + confirmed → deprecated: requires contradicting 
experiment OR stack version bump + deprecated entries: kept in JSON with status field, never deleted +``` + +**Profiler output → Explorer mapping table:** + +| Profile finding | Explorer action | Hypothesis skipped | +|---|---|---| +| Gemm > 50% | Prioritize quant/calib experiments | All layout-transform passes | +| Transpose < 5% (opset=17) | Transpose Optimizer already working | transpose-optimizer trials | +| op_type "Gelu" present | Already fused | gelu-fusion, fast-gelu-fusion | +| op_type "LayerNormalization" present | Already fused | layer-norm-fusion trials | +| Reorder{Input,Output} present (>4%) | NCHWc already active | nchwc-transformer trials | +| op_type "Attention" present | MHA already fused | attention-fusion trials | +| QDQ ops > 15% | Quant overhead high | Focus on op_types_to_quantize exclusions | +| Transpose > 10% + opset ≥ 19 | kMaxSupportedOpset issue | Flag as [KNOWN_TRADEOFF], lower opset | + +**Why profile-first matters (validated on ConvNext):** + +The ablation experiment ran 22 experiments over multiple days. Had the profiler run first: +- Profile shows: Gemm=57.7%, Conv=12.6%, Transpose=2.6%, Gelu=8% (already "Gelu" op) +- Explorer would have immediately skipped: `gelu-fusion`, `layer-norm-fusion`, `transpose-optimizer`, + `nchwc-transformer` (already active via ReorderInput/Output) +- Only candidates from profile: `matmul-add-fusion` (Gemm bottleneck), `conv-activation-fusion` +- This would have reduced 22 experiments to ~6, with the same conclusions + +**POC profiler:** `C:\tmp\autoconfig-demo\winml_profile.py` +- Uses ORT `enable_profiling=True` + `end_profiling()` (same pattern as AI Studio's profile_file.py) +- CPU EP: parses `_kernel_time` events from ORT JSON trace +- Output: `bottleneck.json` (structured) + `bottleneck.txt` (human-readable) + raw ORT trace +- ConvNext result: Gemm 57.7%, Conv 12.6%, Transpose 2.6% → confirms baseline is optimal for CPU + +--- + +### Sections + +**1. 
Phase 0 — Intake + Baseline** + +```bash +# Step 1: verify the model is supported +winml inspect -m --format json + +# Step 2: graph analysis — extract structural flags before any experiment +winml analyze -m --ep --format json -o analyze_out/ +# Parse analyze_out/analysis.json: +# conv_op_count = count of "Conv" in op_distribution +# total_op_count = sum of all op counts +# conv_pct = conv_op_count / total_op_count * 100 +# +# Set EP-specific flags (used throughout Phase 2 loop): +# conv_fusions_blocked = (ep == "qnn" AND device == "npu" AND conv_pct > 20) +# [npu-006: FusedConv not supported by QNN EP → CPU fallback] +# nhwc_blocked = (ep in ["qnn_gpu", "dml"]) +# [dml-002: NHWC increases p90 variance on Adreno+D3D12] +# opset_sweep_blocked = (ep == "cpu") +# [cpu-001: opset 19+ causes 3-4× regression on CPU EP] +# bench_protocol = "npu" if (ep == "qnn" AND device == "npu") else "standard" +# [npu-007: QNN NPU CV 0.15-1.2 normal; always use 3-session] + +# Step 3: baseline build (default config, opset=17) +winml export -m -o baseline/ +winml build -c config_baseline.json -m -o baseline_built/ + +# Step 4: correctness contract +winml eval --mode compare -m baseline_built/model.onnx --model-id --format json +# Expected: cosine=1.0 (FP32 self-comparison) + +# Step 5: baseline perf (using EP-appropriate protocol) +# standard: winml perf -m baseline_built/model.onnx --ep --warmup 20 --iterations 200 +# npu: full 3-session bench (see Phase 2 PERF step) +winml perf -m baseline_built/model.onnx --ep --warmup 20 --iterations 200 --format json +# Record: baseline_p50_ms +``` + +Initialize `results.tsv` (TSV, not CSV — commas break in description field): +``` +commit precision nodes_excluded cosine p50_ms calibration_samples status notes +``` + +--- + +**2. 
Phase 1 — Profile (runs once, BEFORE any search experiments)** + +```bash +# Run profiler on baseline model (--profile flag added to winml perf) +winml perf -m baseline_built/model.onnx --ep \ + --warmup 5 --iterations 20 --profile --out profile_out/ --format json +# Reads: profile_out/bottleneck.json +# POC (before --profile ships): python winml_profile.py --model ... --ep ... +``` + +Profiler output drives Explorer hypothesis initialization: + +``` +READ bottleneck.json: + top_bottleneck: + op_summary: [{op_type, pct}, ...] (sorted by descending pct) + headroom_hints: [...] + +BUILD skip_set (passes not worth trying): + FOR each op_type in op_summary: + IF op_type == "Gelu": skip_set.add(gelu-fusion, fast-gelu-fusion) + IF op_type == "LayerNormalization": skip_set.add(layer-norm-fusion) + IF op_type == "Attention": skip_set.add(attention-fusion) + IF "ReorderInput" in op_summary AND pct > 2%: + skip_set.add(nchwc-transformer) # already active + IF Transpose pct < 5% AND opset=17: + skip_set.add(transpose-optimizer) # already working, no gain + IF Transpose pct > 10% AND opset >= 19: + flag as [KNOWN_TRADEOFF]; add to report + + # EP-specific hard blocks (from Phase 0 flags): + IF conv_fusions_blocked: # npu-006: FusedConv → QNN EP CPU fallback + skip_set.add(conv-bn-fusion, conv-add-fusion, conv-mul-fusion, + conv-activation-fusion, conv-add-activation-fusion) + log "BLOCKED conv-*-fusion: npu-006 FusedConv risk (Conv% = {conv_pct:.0f}%)" + + IF nhwc_blocked: # dml-002: NHWC worsens p90/std on Adreno+D3D12 + skip_set.add(nhwc-transformer) + log "BLOCKED nhwc-transformer: dml-002 variance increase on GPU EP" + + IF opset_sweep_blocked: # cpu-001: opset 19+ regresses 3-4× on CPU EP + skip_set.add(opset_sweep) # opset is FIXED at 17 for CPU; never sweep + log "BLOCKED opset sweep: cpu-001, using opset=17 only" + +BUILD priority_queue (hypotheses in evidence-based order): + IF top_bottleneck == "Gemm" OR "MatMul": + queue: [quant_precision, calib_method, 
calib_samples, matmul_fusions, per_channel] + IF top_bottleneck == "Conv": + # Only add conv_fusions if not blocked by npu-006 + conv_fusions_entry = [] if conv_fusions_blocked else [conv_fusions] + queue: [nchwc (if not in skip_set)] + conv_fusions_entry + [quant_precision] + IF top_bottleneck == "Attention": + queue: [quant_precision, nodes_to_exclude (Attention), calib_method] + DEFAULT: + queue: [quant_precision, calib_method, calib_samples] +``` + +--- + +**3. Phase 2 — Profile-Guided Optimization Loop (single EP)** + +``` +LOOP FOREVER (until user stops or convergence): + +1. EXPLORER: pop next hypothesis from priority_queue + - Skip if in skip_set (pruned by profile) + - If queue empty → enter Phase 4 (generalization) or stop + +2. HYPOTHESIZE: build config.json delta based on hypothesis + Hypothesis rules (profile-informed, in priority order): + a. If first loop: start with W8A16 (NOT W8A8), all ops quantized + Rationale: W8A8 is high-risk on models with LN/GELU (catastrophic on QNN NPU). + Try W8A16 first; only escalate to W8A8 after W8A16 establishes a valid baseline. + b. If cosine < floor: add worst partial_op to nodes_to_exclude (one at a time) + c. If cosine ≥ floor but latency > budget: try W8A8 instead of W8A16, + or reduce calibration_samples, or add per_channel=true + d. If stuck (3 iterations no improvement): try calibration_method change + (minmax → entropy → percentile) + e. If still stuck: try precision escalation (W8A8 → W8A16 → FP16) + +3. MODIFY: write updated config.json + Key fields in quant section: + { + "precision": "w8a8", + "samples": 128, + "calibration_method": "minmax", + "nodes_to_exclude": ["LayerNorm_0", "Softmax_3"], + "per_channel": false + } + +4. OPTIMIZER: winml build -c config.json -m -o out_/ + If build crashes: log as "crash", revert config, try different hypothesis + +5a. 
EVAL — quick sanity (cosine proxy, cheap): + winml eval --mode compare -m out_/artifact.onnx \ + --model-id --format json + → cosine_similarity, sqnr_db + If cosine < hard_floor (e.g. 0.85): fail-fast, skip step 5b + 6, log as discard + +5b. EVAL — task accuracy (real quality gate): + winml eval -m out_/artifact.onnx \ + --model-id \ + --task --device --ep \ + --samples 100 --format json + → top1_accuracy (image-classification), f1 (text), mAP (detection), etc. + This is the authoritative accuracy metric for Reviewer verdict. + + W8A8 EARLY EXIT (save 3+ wasted bench sessions): + IF precision == "w8a8" AND top1_accuracy ≤ 0.15 (near-random): + → log "W8A8 EARLY EXIT: top-1 ≤15%, quantization collapsed" + → skip_set.add(all W8A8 variants) # never try W8A8 again for this model/EP + → discard this config immediately (skip step 6 PERF) + → next hypothesis: try W8A16 with nodes_to_exclude for sensitive op types + + Why cosine alone is not sufficient: + - High cosine (0.97) but top-1 drops 5%: logit magnitudes preserved but relative ranking shifted + - Low cosine (0.92) but same top-1: relative ranking unchanged despite numeric difference + → Only task accuracy tells you whether the model still does its job + +6. PERF: bench protocol depends on bench_protocol flag set in Phase 0 + + standard (CPU / GPU / DML): + winml perf -m out_/artifact.onnx \ + --device --ep --warmup 20 --iterations 200 --format json + CV = std / p50 + IF CV > 0.10: log [UNSTABLE], cool down 120s, retry once; if still >0.10 → skip/discard + IF CV ≤ 0.10: proceed to full bench (3×500-iter, 60s cool-down for GPU) + + npu (QNN NPU only) — always use 3-session protocol (npu-007): + # High CV (0.15-1.2) is NORMAL for Hexagon HTP. Never reject on CV alone. + winml perf ... --warmup 20 --iterations 500 -o run1.json + sleep 30 + winml perf ... --warmup 20 --iterations 500 -o run2.json + sleep 30 + winml perf ... 
--warmup 20 --iterations 500 -o run3.json + + # s0 JIT exclusion: if any run's first 50 iters (inferred via warmup behavior) are + # elevated, it reflects JIT compilation, not steady-state. When run1/2/3 disagree: + # candidate_p50 = median(run1.p50, run2.p50, run3.p50) + # If run1.p50 > median(run2.p50, run3.p50) × 1.20 → suspect JIT; use run2+run3 median. + + # KEEP only if: ALL of run1/2/3 p50 < baseline best × (1 - min_improvement) + # (ranges must not overlap — median alone is insufficient for noisy NPU measurements) + → record: candidate_p50_ms, bench_sessions_used + +7. REVIEWER: cross-experiment verdict + keep if task_accuracy ≥ accuracy_floor AND p50_ms ≤ latency_budget + discard if task_accuracy < accuracy_floor OR p50_ms > latency_budget + crash if build/eval failed + + Reviewer also checks: + - Plateau: 3+ keeps with Δlatency < 2% → likely at local optimum + - Profile divergence: if new op_type appears after build, re-profile + - Skip_set update: if experiment proves a pass is a no-op, add to skip_set + - Accuracy cliff: if task_accuracy drops > 3% in one step → flag, do not cascade + +8. LOG to results.tsv: + keep/discard/crash + +9. 
If keep: advance to next iteration from this config + If discard: revert to last kept config, try different hypothesis +``` + +**Convergence criteria** (stop the loop): +- cosine ≥ target floor AND p50_ms ≤ latency budget: objective achieved +- 5 consecutive discards with no improvement: report best so far +- User manually stops the agent + +**Post-convergence: mandatory finalization for QNN NPU** + +```bash +# For QNN NPU only: always compile the best-found quantized model +# compile adds ~1.7× speedup on top of quantization (validated on ConvNext: 10.3ms → 6.0ms) +IF bench_protocol == "npu": + winml compile -m best_config/model.onnx --device npu --ep qnn -o best_compiled/ + # Re-bench compiled model (same 3-session protocol) + # compiled latency replaces quantized latency in report + +# For GPU/DML: NEVER run winml compile — it regresses latency on Adreno X1-85 +IF ep in ["qnn_gpu", "dml"]: + log "compile step skipped: GPU compile regresses latency (validated -34% on ConvNext QNN GPU)" +``` + +--- + +**3. Hypothesis generation rules (the intelligence layer)** + +The agent generates hypotheses by traversing the search space in priority order. +Each hypothesis is motivated by diagnostic data from the previous experiment, not random search. 
+ +**Priority ordering across the three config sections:** + +``` +Phase 1 — establish baseline (iteration 0) + Start with: opset_version=17, all fusions enabled, precision=w8a16, minmax, 128 samples + +Phase 2 — precision first (fastest to try, most impact) + If cosine < floor: + w8a8 → fall back to w8a16, or retry w8a8 with selective exclusions + w8a16 → add selective exclusions (nodes_to_exclude), or escalate to fp16 + If latency > budget: + w8a16 → try w8a8 (smaller model, faster inference) + fp16 → try w8a16 (if currently at fp16) + +Phase 3 — calibration tuning (if precision is right but cosine still low) + Try in order: minmax → entropy → percentile + Try increasing samples: 128 → 256 → 512 + Try per_channel=true (better accuracy, slightly slower build) + Try symmetric=false if currently true + +Phase 4 — optimize pass tuning (independent of quant, affects graph structure) + Hypothesis: some fusion patterns create op shapes QNN handles poorly + Transformer models (try in order): + attention-fusion → skip-layer-norm-fusion → layer-norm-fusion → fuse-rmsnorm + Vision models — CRITICAL GATE BEFORE CONV FUSIONS (npu-006): + ⚠️ conv-bn-fusion, conv-add-fusion, conv-activation-fusion produce FusedConv ops. + FusedConv is NOT a standard ONNX op — QNN EP does not support it → CPU fallback. + On Conv-dense models (ResNet, EfficientNet): this causes +4900% regression (confirmed). + On attention-dominant models (DINOv2, ViT): only 1 Conv, CPU fallback is negligible. + RULE: run `winml analyze` FIRST. If Conv% of total ops > 20% → SKIP all conv fusions for QNN NPU. + If Conv% < 5% (attention-dominant) → safe to try. Always bench to confirm.
+ Vision models — only try if Conv% < 5%: + conv-bn-fusion → conv-add-fusion → conv-activation-fusion + Shared (try if cosine drops or build crashes): + constant-folding=false (prevents size bloat; sometimes exposes EP-incompatible shape) + clamp-constant-values=true (fixes -inf attention mask → quantization issues) + remove-isnan-in-attention-mask=true (use after clamp; cleans dead IsNaN guards) + Try opset_version (only if opset_sweep NOT blocked): + CPU EP: SKIP entirely — opset_sweep_blocked=True (cpu-001: opset 19+ regresses 3-4×) + QNN GPU/DML: SKIP — not validated beyond opset 17 + QNN NPU: full sweep 17 → 18 → 19 → 20 → 21 (architecture-dependent benefit; + opset21 confirmed +24-31% for DINOv2 family, NEUTRAL for general ViT) + +Phase 5 — selective node exclusion (when analyze shows partial ops) + Read winml analyze --format json → partial_ops list + Exclude one partial_op at a time (greedy: exclude highest-impact first) + Also try excluding op_types_to_quantize selectively + e.g., remove "LayerNorm" from op_types_to_quantize list + +Phase 6 — combined search (if single-dimension changes are stuck) + Try combinations of best Phase 3 + Phase 4 + Phase 5 changes together +``` + +**Diagnosis table — what to try given what you see:** + +| Symptom | Likely cause | Phase to try next | +|---|---|---| +| cosine drops a lot at quant stage, all ops supported | Calibration data mismatch | Phase 3: entropy calib, more samples | +| cosine drops at quant, Attention ops partial | Attention activation quant on QNN | Phase 5: exclude Attention nodes | +| cosine OK but latency worse than CPU | Fusion pattern creating unoptimized subgraph | Phase 4: disable attention-fusion, try different opset | +| cosine OK but model larger than expected | Constant folding inlining large weights | Phase 4: constant-folding=false | +| Both cosine and latency good at w8a8 but build crashes | opset op not supported by quant pipeline | Phase 4: opset_version 17 → 16 | +| cosine highly variable 
across seeds | Calibration with too few samples | Phase 3: 128 → 256 samples | +| All ops supported, cosine still drops after fusions | Fusion creates non-quantizable shape | Phase 4: disable skip-layer-norm-fusion | +| QNN build fails with "invalid scale" | -inf in attention mask initializer | Phase 4: clamp-constant-values=true | +| Vision model: accuracy drops unexpectedly | Conv+BN fusion slightly changes weight values | Phase 4: disable conv-bn-fusion | +| **QNN NPU** Conv model: latency catastrophically worse (+10x) after conv fusions | **FusedConv not supported by QNN EP → CPU fallback** (npu-006, confirmed on ResNet-18 +4900%) | Phase 4: **immediately disable all conv-*-fusion flags**; NEVER enable for Conv-dense models on QNN NPU | +| MatMul-heavy model: latency not improving | MatMul not being fused | Phase 4: matmul-add-fusion, matmul-transpose-fusion | +| RMSNorm model (Llama etc.) poor QNN perf | ORT not recognizing RMSNorm pattern | Phase 4: fuse-rmsnorm=true | + +This is the key difference from grid search: **each hypothesis is motivated by diagnostic data from `winml analyze` and the previous experiment result**. + +--- + +**4. 
Multi-EP config generation** + +Run parallel loops for each target EP, then aggregate into `manifest.json`: + +```bash +# Agent runs loops for each EP (can be sequential or parallel): +# Loop 1: ep=qnn, target_device=npu +# Loop 2: ep=dml, target_device=gpu +# Loop 3: ep=cpu, target_device=cpu + +# After all loops complete, agent generates: +# - config_qnn_optimal.json (best config found for QNN) +# - config_dml_optimal.json (best config found for DirectML) +# - config_cpu_optimal.json (best config found for CPU) + +# Then builds final artifacts and assembles manifest.json +``` + +Generated `manifest.json` includes experiment provenance: +```json +{ + "model_id": "microsoft/resnet-50", + "generated_by": "autoconfig", + "experiments_run": 34, + "variants": [ + { + "ep": "qnn", "device": "npu", + "file": "model_qnn.onnx", + "precision": "w8a16", + "nodes_excluded": ["MultiHeadAttention"], + "cosine_similarity": 0.972, + "p50_ms": 18.3, + "config": "config_qnn_optimal.json" + }, + { + "ep": "dml", "device": "gpu", + "file": "model_dml.onnx", + "precision": "fp16", + "nodes_excluded": [], + "cosine_similarity": 0.999, + "p50_ms": 22.1, + "config": "config_dml_optimal.json" + }, + { + "ep": "cpu", "device": "cpu", + "file": "model_cpu.onnx", + "precision": "w8a8", + "nodes_excluded": ["LayerNorm"], + "cosine_similarity": 0.931, + "p50_ms": 84.7, + "config": "config_cpu_optimal.json" + } + ], + "selection_order": ["qnn", "dml", "cpu"] +} +``` + +--- + +**5. 
results.tsv format** + +Track all three config sections per experiment (TSV, not CSV): +``` +commit opset fusions_disabled precision nodes_excluded cosine p50_ms calib_samples calib_method status notes +baseline 17 [] fp32 [] 1.000 — — — keep FP32 reference +a1b2c3d 17 [] w8a8 [] 0.871 16.2 128 minmax discard full W8A8 too aggressive +b2c3d4e 17 [] w8a16 [] 0.967 19.8 128 minmax keep W8A16 baseline meets floor +c3d4e5f 17 [] w8a16 [] 0.969 19.1 256 entropy keep entropy calib improvement +d4e5f6g 17 [attention-fusion] w8a16 [] 0.971 18.4 256 entropy keep disabling attn-fusion helps latency +e5f6g7h 18 [attention-fusion] w8a16 [] 0.973 17.9 256 entropy keep opset18 best so far +f6g7h8i 18 [attention-fusion] w8a8 [MultiHeadAttention] 0.961 14.2 256 entropy keep mixed prec: meet latency budget +``` + +--- + +**6. Skill outputs** + +autoconfig produces **two primary outputs** after convergence or user stop: + +#### Output A: Best config file + +`config__optimal.json` — the winning config.json, ready to pass to `winml build`. Contains provenance metadata so it's reproducible: + +```json +{ + "_autoconfig_meta": { + "model_id": "facebook/convnext-tiny-224", + "ep": "qnn", + "objective": "latency-primary", + "latency_budget_ms": 20, + "accuracy_floor": 0.95, + "experiments_run": 23, + "best_iter": "iter_17", + "timestamp": "2026-06-10T11:55:05+08:00" + }, + "export": { "opset_version": 18 }, + "optimize": { "attention-fusion": false }, + "quantize": { + "precision": "w8a16", + "calibration_method": "entropy", + "calibration_samples": 256, + "nodes_to_exclude": ["MultiHeadAttention_0"] + } +} +``` + +#### Output B: HTML benchmark report + +`report.html` — self-contained single-file report (no external dependencies), viewable in any browser. 
Contains: + +**Section 1 — Summary card** +``` +Model: facebook/convnext-tiny-224 EP: QNN (NPU) +Objective: latency-primary ≤ 20ms Accuracy floor: 0.95 +Result: ✅ FOUND Experiments: 23 Time: 41 min + +Best config: W8A16, entropy calib, 256 samples + Accuracy: 0.953 (floor 0.95 ✓) + p50 latency: 15.8ms (budget 20ms ✓) +``` + +**Section 2 — Search progress chart** +Scatter plot: all 23 experiments, x=p50_latency_ms, y=accuracy. +- Green dot = kept (improvement) +- Red dot = discarded (regression) +- Star = best found +- Hover tooltip: iter ID, config diff vs previous + +**Section 3 — Iteration table** +Full results.tsv rendered as sortable HTML table with columns: +``` +iter | opset | precision | nodes_excluded | calib | accuracy | p50_ms | Δacc | Δlatency | status | hypothesis +``` +Color-coded rows: green = keep, red = discard, gold = best. + +**Section 4 — Config diff timeline** +Visual diff showing what changed between each kept iteration (config deltas as `+`/`-` lines). + +**Section 5 — Model graph analysis** (from pre-search `winml analyze`) +- Op distribution pie chart (ONNX vs com.microsoft) +- EP compatibility table: ops supported/unsupported on target EP +- Detected patterns (GELU variant, attention structure, Transpose-sandwich) + +**Section 6 — Benchmark details** +For the best config, full `winml perf` output: +- p10/p50/p90/p99 latency histogram +- Throughput (samples/sec) +- Warmup vs steady-state comparison +- (If multi-EP: side-by-side EP comparison bar chart) + +**Section 7 — Reproduction instructions** +```bash +# Reproduce the winning config: +winml build -c config_qnn_optimal.json -m facebook/convnext-tiny-224 -o out/ +# For NPU: always compile after build (empirically +1.7× speedup) +winml compile -m out/model.onnx --device npu --ep qnn -o out_compiled/ +winml perf -m out_compiled/model_npu_ctx.onnx --ep qnn --iterations 100 --warmup 10 +``` + +**Report generation approach**: The agent generates report.html using inline Python with Jinja2-style 
string templating + embedded Chart.js (inlined into the HTML, not loaded from a CDN). No external dependencies — single file, opens offline. + +--- + +**7. What the agent says in chat** + +After convergence or user stop (terminal summary, report is the real deliverable): + +``` +autoconfig completed. 23 experiments run (41 min). + +Best config (QNN NPU): + W8A16, entropy calib, 256 samples, MultiHeadAttention excluded + accuracy 0.953 ✓ (floor 0.95) p50 15.8ms ✓ (budget 20ms) + +Outputs: + config_qnn_optimal.json ← drop into winml build -c + report.html ← open in browser for full benchmark breakdown + +Next: winml validate-before-ship for production gate. +``` + +--- + +**8. Constraints and failure handling** + +- **Build timeout**: If `winml build` exceeds 15 minutes, kill and log as crash +- **OOM**: If build fails with out-of-memory, reduce `calibration_samples` by half +- **All hypotheses exhausted**: Report best config found, note convergence limit +- **Latency not measurable** (target EP not on machine): run eval only, skip perf gate + +**9. CLI-only constraint (critical)** + +The agent MUST use only official `winml` CLI commands as its tool surface. No Python scripting, no direct ONNX manipulation, no third-party tools (onnxconverter-common, onnxsim, Olive, etc.) except where explicitly documented as a known workaround. + +**Rationale**: autoconfig's output is a `config.json` + `report.html` that a user can reproduce with `winml build -c config.json`. If the agent used a Python hack to produce a model artifact, the config is not reproducible and the report is misleading.
+ +**Known workarounds (allowed, must be flagged in report):** +| Workaround | Replaces | Tracking issue | Required flag in report | +|---|---|---|---| +| `python winml_profile.py` | `winml perf --profile` (not yet shipped) | pending | ⚠️ "Profile data via POC script, not official API" | + +**Gap reporting rule**: If a hypothesis cannot be tested because the required `winml` CLI capability does not exist, the agent MUST: +1. Record the hypothesis as `SKIPPED — CLI gap` in the experiment table +2. Add an entry to **Section 6 "Gaps & Issues"** block in `report.html`: + ``` + GAP: requires + Impact: + Filed: + ``` +3. NOT silently substitute a Python workaround that produces unverifiable artifacts + +**Example gaps encountered during ConvNext QNN GPU validation:** +- `winml build --precision fp16` flag not available (#867) → FP16 native export untested → `SKIPPED — CLI gap` +- `winml perf --ep-option` not available (#865) → runtime flag sweep untested → `SKIPPED — CLI gap` +- `winml perf --profile` for QNN EP not available → profiling via POC script (allowed workaround) +- W8A8 QDQ ONNX on QNN GPU EP hangs indefinitely — root cause is QNN SDK behavior; ``winml build`` already prevents this via ``_patch_device()``; fast-fail enhancement filed as #868 (low priority) + +--- + +### Key commands used + +```bash +# Phase 1: profiling (--profile flag on winml perf, before search) +winml perf -m baseline_built/model.onnx --ep --warmup 5 --iterations 20 \ + --profile --out profile_out/ --format json +# → profile_out/bottleneck.json (machine-readable for Explorer) +# → profile_out/bottleneck.txt (human-readable summary) +# POC: python winml_profile.py --model ... --ep ... 
(until --profile ships) + +# Phase 2: analysis (informs nodes_to_exclude hypotheses) +winml analyze -m .onnx --ep --format json + +# Phase 2: experiment +winml build -c config.json -m -o out_/ + +# Phase 2: metrics +winml eval --mode compare -m out_/artifact.onnx --model-id --format json +winml perf -m out_/artifact.onnx --device --ep --iterations 50 --format json + +# Phase 3: compile best candidate to QNN EPContext (NPU only) +# Eliminates JIT overhead; empirically ~1.7× further speedup on ConvNext W8A16 +winml compile -m best_candidate/model.onnx --device npu --ep qnn -o best_compiled/ +# → best_compiled/model_npu_ctx.onnx (loads context binary at runtime) +# → best_compiled/model_npu_ctx_qnn.bin (QNN hardware-compiled graph) + +# Phase 3: re-benchmark compiled model +winml perf -m best_compiled/model_npu_ctx.onnx --device npu --ep qnn --warmup 10 --iterations 50 +``` + +**Empirical data: ConvNext QNN NPU compile impact** +| Version | p50 | vs FP32 NPU | +|---|---|---| +| FP32 baseline | 19.39ms | — | +| W8A16 quantized | 10.29ms | 1.9× | +| **W8A16 + compile** | **6.01ms** | **3.2×** | +→ `winml compile` alone adds ~1.7× on top of quantization. Always compile for NPU deployment. 
+ +**Empirical data: ConvNext QNN GPU optimization sweep (Adreno X1-85) — full search** +| Experiment | p50 | p90 | std | vs FP32 | Notes | +|---|---|---|---|---|---| +| FP32 baseline (autoconf) | **17.7ms** | 19.7ms | 0.97 | — | ✅ **OPTIMAL with current CLI** | +| NHWC transformer | 19.5ms | 23.8ms | 3.43 | ❌ −10% | Hurts Adreno+QNN EP | +| NHWC + all GPU fusions | 18.1ms | 23.9ms | 2.71 | ❌ −2% | Still worse | +| Conv/norm fusions (no NHWC) | 17.6ms | 22.6ms | 5.51 | ≈0% | Variance ↑, no gain | +| LayerNorm rewrite | 18.4ms | 21.4ms | 2.04 | ❌ −4% | Pattern mismatch anyway | +| Transpose optimizer | 0% node Δ | — | — | no-op | Already optimal positions | +| HiDimRTR→LowDimRTR | 0% node Δ | — | — | no-op | ConvNext RTR doesn't match pattern | +| MatMulAdd→Conv2D (2d/3d/4d) | 0% node Δ | — | — | no-op | ConvNext uses Reshape→MatMul, not bare MatMul+Add | +| FP32 + compile | 23.7ms | — | — | ❌ −34% | Compile hurts GPU (opposite of NPU) | +| W8A8 QDQ quantized | hangs | — | — | ❌ blocked | #868 enhancement (fast-fail) | +| FP16 (invalid CLI path) | 8.8ms | ~32ms | bimodal | ⚠️ 2× p50 | BLOCKED — need #867 | + +**Root cause: why no pass matches ConvNext on QNN GPU** +- All 251 ops run natively on GPU (251/0/0/0) — no CPU fallback to eliminate +- ConvNext linear layers: `Reshape → MatMul → Reshape` pattern, not bare `MatMul+Add` → Conv2D rewrites don't match +- 72 Reshape + 42 Transpose are already at minimum / optimal topology from PyTorch export +- `winml build` autoconf (gelu_fusion + matmul_add_fusion) already applied all relevant transforms +- The bottleneck is compute throughput + memory bandwidth — only FP16 (smaller tensors) can improve this + +**Key insight: gelu_fusion matters for variance, not p50** +| Version | p50 | p90 | std | +|---|---|---|---| +| Raw export (287 nodes, unfused Gelu) | 17.4ms | 29.2ms | 5.90 | +| Autoconf (251 nodes, fused Gelu+Gemm) | 17.7ms | 19.7ms | 0.97 | + +Unfused Gelu = 5 separate GPU kernel launches (Mul→Div→Erf→Mul→Add) with 
scheduling jitter. +A single `Gelu` kernel eliminates dispatch overhead → p90 −48%, std −6×. +→ autoconf's role on GPU is **stability**, not speedup. Critical for real-time / latency-SLA deployments. + +→ **QNN GPU search space exhausted.** FP16 is the only remaining lever, blocked by #867. + +**Empirical data: ConvNext DML optimization sweep (Adreno X1-85, DirectML)** +| Experiment | p50 | p90 | std | vs FP32 | +|---|---|---|---|---| +| FP32 baseline (autoconf, 251 nodes) | **16.9ms** | 17.7ms | 0.52 | — ← OPTIMAL with current CLI | +| NHWC transformer | 16.5ms | 21.0ms | 1.89 | ❌ p90 worse | +| Raw unfused export (287 nodes) | 16.5ms | 18.4ms | 2.74 | ❌ p99=35ms, worse tail | +| FP16 (Python hack ⚠️) | **11.8ms** | 12.8ms | 0.66 | ✅ **1.4× faster, clean dist** — BLOCKED #867 | + +**DML vs QNN GPU comparison (same Adreno X1-85) — validated 2026-06-17:** +| | QNN GPU FP32 | DML FP32 | DML FP16 (invalid) | +|---|---|---|---| +| p50 | 17.7ms | **16.9ms** | **11.8ms** | +| p90 | 19.7ms | **17.7ms** | **12.8ms** | +| std | 0.97 | **0.52** | **0.66** | + +⚠️ **Correction (dml-001):** The 0.8ms p50 difference (17.7 vs 16.9ms) = 0.82σ of the GPU measurement. +Distributions OVERLAP. "DML is consistently faster than QNN GPU" is NOT supported at p50. +**What IS confirmed**: DML has meaningfully better stability (std 0.52 vs 0.97). For latency-SLA workloads, +DML's lower variance is the real advantage, not raw p50 speed. +→ Correct claim: "DML is more stable than QNN GPU at FP32 (std 0.52 vs 0.97)." +→ Root cause of stability: DML JIT-compiles HLSL shaders at model load; QNN GPU EP partitions at each session. +→ For speed, both EPs need FP16 (#867) to show meaningful improvement. 
+ +**QNN Hub benchmark comparison (Snapdragon X Elite CRD) — WITH cross-stack test** + +| Model | Stack | NPU p50 | GPU p50 | Notes | +|---|---|---|---|---| +| QNN Hub Float (opset 21, 222 nodes, MatMul) | qairt cloud | **2.687ms** | — | Reference | +| QNN Hub Float (same model) | winml ORT QNN EP | **8.78ms** | 23.9ms | Direct test on this device | +| Our Float (opset 17, 251 nodes, Gemm) | winml ORT QNN EP | 19.4ms | 17.7ms | winml build output | +| QNN Hub W8A16 (opset 21, 798 QDQ, uint16 input) | qairt cloud | **2.612ms** | — | Reference | +| QNN Hub W8A16 (same model) | winml ORT QNN EP | 14.82ms (std=8.8!) | — | ORT-QNN mismatch | +| Our W8A16 + compile (opset 17, ORT quant) | winml ORT QNN EP | **6.01ms** | — | Best we can do | + +**Gap decomposition (three independent sources):** +``` +QNN Hub cloud: 2.7ms + ↑ 3.3× Runtime gap (qairt native vs ORT QNN EP adapter overhead) +QNN Hub on winml: 8.78ms + ↑ 2.2× Model graph gap (opset 21/MatMul/222 nodes vs opset 17/Gemm/251 nodes) +Our model on winml: 19.4ms (FP32) +``` + +**Actionable findings (updated 2026-06-17 — validated by 3×500-iter protocol + source analysis):** +1. **opset 21 speedup for DINOv2 family — empirically REAL, mechanism UNKNOWN** (#869) + - DINOv2-small +30.6%, DINOv2-base +24.1% — confirmed by 3-session non-overlapping ranges. + - dino-vitb16 -0.7% NEUTRAL, gender-classification ViT +3.5% NEUTRAL. + - **Two hypotheses definitively ruled out**: + - (a) kMaxSupportedOpset bypass: ORT 1.24.4 kMaxSupportedOpset ≥ 23 → NHWC transform applies to both opset17 and opset21 equally. Bypass does NOT occur. + - (b) Transpose elimination: Transpose count identical (49 both) in opset17 vs opset21 optimized.onnx and quantized.onnx. + - Only observed structural difference: +48 Reshape nodes in opset21. Why this helps QNN NPU is unknown. + - **Do not generalize**: benefit appears specific to DINOv2 family. ViT models with identical op counts see no benefit. 
+ - **Do not use for autoconfig search default**: only try opset21 sweep after profiling suggests Reshape/layout overhead; otherwise use opset17. +2. **Runtime stack gap (3.3×) is structural**: qairt native will always be faster. Correct baseline = "QNN Hub ONNX on winml" (8.78ms). +3. **QNN Hub W8A16 is WORSE on our stack** (14.82ms, std=8.8ms): opset 21 QDQ + uint16 input incompatible with ORT QNN EP format. +4. **Opset is a search dimension** — full sweep (17–22), no prior. The optimal opset is model-architecture-dependent and may change with ORT version upgrades. + +**EP-specific search space rules (validated 2026-06-17)** + +| EP | Quantization | Opset | Graph passes | Compile | Key insight | +|---|---|---|---|---|---| +| QNN NPU | ✅ W8A16 | Full sweep 17-22 (benefit is model-architecture-dependent, not ORT-version) | autoconf (gelu+matmul_add); **NO conv fusions for Conv-dense models** (npu-006) | ✅ Always | W8A8 catastrophic on LN+GELU; opset21 benefit real for DINOv2 family, mechanism unknown | +| QNN GPU | ❌ Skip | 17 (opset 21 not validated) | autoconf only | ❌ Skip | Compile regresses; FP16 only lever (#867) | +| DML | ❌ Skip | 17 (opset 21 not validated) | autoconf only | N/A | FP16 primary lever (#867); more stable than QNN GPU (p50 diff is noise, std is real) | +| CPU | ❌ Skip | 17 only (kMaxSupportedOpset causes 3-4× regression on opset 19+) | nchwc, matmul-add, gelu | N/A | Regression data confirmed; mechanism uncertain (two separate kMaxSupportedOpset constants in ORT) | + +Rule: autoconfig must use EP-specific search space. Do NOT run quantization experiments for GPU/DML/CPU. +Rule: for QNN NPU conv-fusion experiments — check `winml analyze` output first. If Conv op count > 20% of total → SKIP all conv-*-fusion flags. FusedConv is not a standard ONNX op; QNN EP falls back to CPU → catastrophic regression (npu-006). +Rule: for QNN NPU opset sweep — full sweep 17–22 with no prior. 
kMaxSupportedOpset ≥ 23 is confirmed in ORT 1.24.4, so the NHWC bypass hypothesis does not apply. Observe results empirically. +Rule: for NPU, if W8A8 top-1 ≤ 15% on first attempt → skip all W8A8 variants, go directly to W8A16. +Rule: always run `winml compile` after finding best quantized config for QNN NPU. NEVER compile for GPU (regresses). +Rule: for GPU/DML, skip ALL graph optimization passes beyond what `winml build` autoconf applies (NHWC and additional fusions hurt). +Rule: W8A8 QDQ on GPU EP hangs — skip quantization immediately for GPU targets without testing. + +**User scenario mapping** + +| Scenario | How autoconfig addresses it | +|---|---| +| S1: LLM fast support (7-30d) | autoconfig replaces manual per-EP tuning; outputs `config_optimal.json + report.html` deployable in hours not days | +| S2: ISV non-LLM model support | Exact use case: ISV brings model → autoconfig finds config → report is deliverable with SOP turnaround | +| S3: Cross-EP parity | Multi-EP parallel run: same model, EP-specific search spaces in parallel → output config matrix per EP | +| S4: Customer ONNX can't run | Phase 0 intake diagnoses "can't run" (partial ops → block reason); Phase 1+2 finds "escape config" for "runs poorly" | +| S5: PyTorch HF Hub coverage | Phase 0 IS the "can WinML run it?" 
gate; failed Phase 0 → structured block reason feeds long-tail gap tracking | + +**Dependencies on code changes**: +- `winml perf --profile` (new flag) — adds per-op bottleneck output alongside existing latency metrics; POC script `winml_profile.py` exists to unblock +- `--format json` on `winml eval` (#847), `winml analyze` (#848), `winml perf` (#849) + +### Cross-references +- Run `check-model-feasibility` before starting to pick a model and verify the EP is available +- After autoconfig completes → `ship-to-winapp` for final validation gates + packaging +- If autoconfig cannot meet objective → `debug-accuracy-drop` for deeper diagnosis +- Multi-EP output feeds directly into `ship-to-winapp`'s manifest layout +- If the best config found is still not good enough → escalate to `optimization-research` + +--- + +## Skill: `optimization-research` (contributor — internal, deep gap analysis) + +### Frontmatter +```yaml +name: optimization-research +description: > + Use this skill when a winml-cli engineer wants to find out whether a model can + be optimized better than what winml-cli currently achieves, identify what is + blocking that optimization, and produce concrete backlog work items. + The agent performs a deep search across: ORT source code and its optimizer + passes, Olive recipes and benchmarks, other ONNX ecosystem tools (onnxsim, + onnxoptimizer, neural-compressor, etc.), and native stack reference models + and datasets. It compares the best achievable result (using all available tools) + against what winml produces today, diagnoses the gap, and files GitHub issues + with reproduction steps. Use when an internal engineer says "why is this model + slower than it should be", "what optimization techniques are we missing", + or "what would it take to match Olive's results". + +audience: internal (winml-cli team engineers) +``` + +### When to use +- "ConvNext on QNN is 3× slower than what Qualcomm's SDK achieves — why?" 
+- "Olive gets 15ms on this model; winml gets 28ms — what's the gap?" +- "We're seeing quantization accuracy drop on LLaMA; are there better calibration methods we're not supporting?" +- "What would it take to match ORT's best-known config for this architecture?" +- After `autoconfig` hits a ceiling: best config found is still not meeting the objective + +### What this skill produces + +**Primary outputs:** +1. **`gap_analysis.md`** — structured report of what the best achievable result is and what's missing +2. **`repro/`** — scripts to reproduce the better result using external tools +3. **GitHub issues** — one per identified gap, filed against winml-cli with: repro steps, expected vs actual, what ORT/Olive/ecosystem already does, proposed fix direction + +--- + +### Design: Deep Search Process + +``` +┌──────────────────────────────────────────────────────────────────┐ +│ PHASE 1 — BASELINE │ +│ winml autoconfig best result for this model/EP │ +│ (or provided by user if already run) │ +└─────────────────────────┬────────────────────────────────────────┘ + ▼ +┌──────────────────────────────────────────────────────────────────┐ +│ PHASE 2 — EXTERNAL BENCHMARK │ +│ Run same model through: │ +│ A. ORT optimizer directly (onnxruntime.tools.transformers) │ +│ B. Olive (olive-ai) with ep-specific recipe │ +│ C. onnxsim + onnxoptimizer (static graph simplification) │ +│ D. neural-compressor (Intel) for quantization comparison │ +│ Record: best latency, accuracy, config used │ +└─────────────────────────┬────────────────────────────────────────┘ + ▼ +┌──────────────────────────────────────────────────────────────────┐ +│ PHASE 3 — GAP DIAGNOSIS │ +│ For each gap (external better than winml): │ +│ a. Diff the ONNX graphs (what ops/patterns differ?) │ +│ b. Read ORT optimizer source to understand what it does │ +│ c. Check winml's capability registry — is this pass missing? │ +│ disabled by default? wired incorrectly? │ +│ d. 
Check Olive recipe — what flags/params does it use? │ +│ Classify gap as one of: │ +│ [MISSING_CAPABILITY] — pass exists in ORT, not in winml │ +│ [WRONG_DEFAULT] — pass exists but wrong default/order │ +│ [BUG] — pass exists but produces wrong graph│ +│ [CALIBRATION_DATA] — accuracy gap from calibration set │ +│ [EP_LIMITATION] — EP itself can't do this, not winml │ +│ [KNOWN_TRADEOFF] — intentional: winml trades X for Y │ +└─────────────────────────┬────────────────────────────────────────┘ + ▼ +┌──────────────────────────────────────────────────────────────────┐ +│ PHASE 4 — NATIVE STACK VALIDATION │ +│ Check existing reference models in winml-cli test suite: │ +│ - Are there models of this architecture in tests/models/? │ +│ - Do their expected results match what we see? │ +│ Check Windows AI Studio / WinML model zoo: │ +│ - Is this architecture listed? At what performance? │ +│ Check QNN SDK reference benchmarks (if QNN EP): │ +│ - Does QNN vendor claim better numbers for this model? │ +└─────────────────────────┬────────────────────────────────────────┘ + ▼ +┌──────────────────────────────────────────────────────────────────┐ +│ PHASE 5 — WORK ITEMS │ +│ For each [MISSING_CAPABILITY] or [WRONG_DEFAULT] gap: │ +│ - Draft GitHub issue with: title, body, repro, expected, │ +│ actual, proposed fix, ORT source pointer │ +│ - Estimate implementation complexity (S/M/L/XL) │ +│ For [BUG]: file with full repro script │ +│ For [CALIBRATION_DATA]: suggest dataset and eval protocol │ +│ For [EP_LIMITATION]: file with QNN/DML SDK reference │ +└──────────────────────────────────────────────────────────────────┘ +``` + +--- + +### Key external tools to invoke + +```bash +# A. ORT transformer optimizer (the "gold standard" for transformer models) +python -c " +from onnxruntime.transformers import optimizer +from onnxruntime.transformers.fusion_options import FusionOptions +opts = FusionOptions('bert') # or 'gpt2', 'clip', etc. 
+opts.enable_attention = True +opts.enable_gelu = True +model = optimizer.optimize_model( + 'export.onnx', model_type='bert', + num_heads=12, hidden_size=768, + optimization_options=opts +) +model.save_model_to_file('ort_optimized.onnx') +" + +# B. Olive (end-to-end, EP-aware) +olive run --config olive_recipe.json +# olive recipe template: see skills/optimization-research/templates/olive_qnn.json + +# C. onnxsim (structural simplification) +python -m onnxsim export.onnx simplified.onnx + +# D. onnxoptimizer +python -c " +import onnxoptimizer, onnx +m = onnx.load('export.onnx') +passes = onnxoptimizer.get_available_passes() +m2 = onnxoptimizer.optimize(m, passes) +onnx.save(m2, 'onnxopt.onnx') +" +``` + +--- + +### Gap report format (`gap_analysis.md`) + +```markdown +# Optimization Gap Analysis: <model> on <ep> + +Date: <YYYY-MM-DD> +winml-cli version: <version> +ORT version: <version> + +## Summary +| Tool | Latency p50 | Accuracy | Config notes | +|---|---|---|---| +| winml best (autoconfig) | 28.3ms | 0.953 | W8A16, entropy, 256 samples | +| ORT transformer optimizer | 19.1ms | 0.951 | model_type=bert, all fusions | +| Olive QNN recipe | 17.8ms | 0.948 | W8A8 + attention fusion | +| **Gap** | **10.5ms (37%)** | — | — | + +## Gap 1: [MISSING_CAPABILITY] FusedMatMul with rotary embedding +**What external tool does:** ... +**What winml does:** ... +**ORT source:** `onnxruntime/python/tools/transformers/fusion_rotary_attention.py` +**Proposed fix:** Add RotaryAttentionFusion to FusionPipe capability registry +**Estimated effort:** M + +## Gap 2: [WRONG_DEFAULT] attention-fusion disabled by default +...
+``` + +--- + +### GitHub issue template + +````markdown +title: [optimization-gap] <model>/<ep>: <short description> + +body: +## Summary + + +## Reproduction +```bash +# Install +uv pip install winml-cli + +# Baseline (winml current) +winml build -c config.json -m <model> -o winml_out/ +winml perf -m winml_out/model.onnx --ep <ep> --warmup 10 --iterations 50 + +# Better result (external) + +``` + +## Expected vs actual +- External tool achieves: <X> ms at <accuracy> +- winml achieves: <Y> ms at <accuracy> +- Gap: <Z> ms (<P>%) + +## Root cause + + +## ORT source reference + + +## Proposed fix direction + + +## Complexity estimate +S / M / L / XL +```` + +--- + +### What this skill does NOT do +- Does not make code changes to winml-cli itself (files issues only) +- Does not run production benchmarks (uses quick screening methodology) +- Does not replace formal performance testing with validated hardware + +### Cross-references +- `autoconfig` provides the winml baseline to compare against +- Issues filed here feed `adding-ep-support` and `contributing-a-skill` workflows +- Use `check-model-feasibility` to confirm EP availability before running external benchmarks + +--- + + +--- + +## ConvNext Autoconfig POC — Rigorous Ablation Results + +**Source:** `C:\tmp\autoconfig-demo\ablation.py` — 4-phase rigorous ablation experiment +**Measurement:** `winml perf --ep cpu --warmup 10 --iterations 50` — pure inference latency, no preprocessing +**Design:** 3 independent runs per config; promotion threshold = max(3%, 2×σ_baseline); correctness gate (`winml eval --samples 20`) per config +**Report:** `C:\tmp\autoconfig-demo\report.html` | **Config:** `C:\tmp\autoconfig-demo\config_cpu_optimal.json` + +### Graph structure (facebook/convnext-tiny-224, opset 17) + +**Op counts (raw export):** 287 nodes total +``` +Add×72 Mul×54 Transpose×42 MatMul×36 LayerNormalization×23 +Conv×22 Div×18 Erf×18 ReduceMean×1 Gemm×1 +``` + +**ConvNext block structure** (traced from first DW-Conv): +``` +DW-Conv(7x7, g=96) → Transpose +→ LayerNormalization (native, already fused
at export) +→ MatMul(C→4C) → Add(bias) +→ [GELU: Div → Erf → Add(1) → Mul → Mul(0.5)] ← 18 unfused in export +→ MatMul(4C→C) → Add(bias) [Gemm after ORT L2] +→ Mul (layer scale) → Add (residual) +→ Transpose (back to NCHW) +``` + +**Conv breakdown:** 4 regular (1×stem 4x4, 3×downsample 2x2 stride-2), 18×DW-Conv 7x7 + +**Transpose patterns:** +``` +19× Conv → Transpose → LayerNormalization (NCHW→NHWC for LN) +15× Mul → Transpose → Add (NHWC→NCHW for residual) + 4× LayerNormalization → Transpose → Conv (NHWC→NCHW for next DW-Conv) + 2× Add → Transpose → Conv + 2× Add → Transpose → LayerNormalization +``` +→ ConvNext is a **Transpose-sandwich** model: alternates NCHW (Conv) and NHWC (LN) layout + +**Observed graph transformation (export.onnx → model.onnx after winml build, baseline config):** +| Op | export.onnx | model.onnx (baseline) | Change | +|---|---|---|---| +| `com.microsoft/Gelu` | 0 | 18 | +18 | +| `Gemm` | 1 | 37 | +36 | +| `MatMul` | 36 | 0 | −36 | +| `Add` | 72 | 18 | −54 | +| `Mul` | 54 | 18 | −36 | +| `Div`, `Erf` | 18 each | 0 | −18 each | +| `Reshape` | 0 | 72 | +72 | + +**Observation (confirmed):** The baseline `model.onnx` (no user fusion flags) already differs substantially from `export.onnx`. GELU and MatMul+Add are fused before any user capability flag is applied. + +**Open question (unresolved):** The `ORTGraphPipe` design (graph.py) is supposed to disable `GeluFusion`/`GeluFusionL2`/`LayerNormFusion` in the baseline via `optimization.disable_specified_optimizers`. Yet the baseline output clearly contains `com.microsoft/Gelu`. This contradiction is unresolved — possible explanations include: ORT name mismatch in disabled list, a different code path fusing GELU, or the export step (via HF Optimum) applying fusion before winml. 
**This must be investigated before any mechanistic claims about "ORT L2 already does X" are written in user-facing reports.** + +--- + +### Ablation results (rigorous, Phase 0–4) + +**Clean baseline:** 43.7ms p50 (base_0 + base_1, 6 runs, all within 42.5–45.4ms) + +| config | p50 mean | Δ vs baseline | runs (ms) | verdict | +|---|---|---|---|---| +| base_0 | 43.0ms | −0.6ms | 43.8 / 42.7 / 42.5 | baseline | +| base_1 | 44.3ms | +0.6ms | 43.2 / 44.3 / 45.4 | baseline | +| base_2 | 73.5ms | +29.8ms | 47.2 / **127.1** / 46.2 | outlier run (system spike) | +| opset_18 | 48.0ms | +4.3ms | 50.2 / 44.0 / 49.7 | neutral | +| **opset_19** | **160.3ms** | **+116ms** | **147.6 / 145.8 / 187.4** | **⚠️ SEVERE REGRESSION** | +| **opset_20** | **131.0ms** | **+87ms** | **135.7 / 129.8 / 127.5** | **⚠️ SEVERE REGRESSION** | +| **opset_21** | **170.3ms** | **+126ms** | **190.1 / 164.9 / 155.8** | **⚠️ SEVERE REGRESSION** | +| **opset_22** | **85.0ms** | **+41ms** | **70.9 / 93.9 / 90.2** | **confirmed regression** | +| no_cf_17 | 51.8ms | +8.1ms | 56.4 / 49.0 / 49.9 | mild regression | +| base_mid | 49.4ms | +5.8ms | 51.3 / 51.1 / 45.9 | baseline (mid-exp drift) | +| gelu_only | 52.5ms | +8.9ms | 53.0 / 55.6 / 49.1 | mild regression | +| ln_only | 57.2ms | +13.6ms | **79.3** / 47.9 / 44.5 | inconclusive (outlier) | +| conv_add | 50.2ms | +6.5ms | 47.3 / 55.9 / 47.4 | inconclusive | +| conv_act | 51.2ms | +7.5ms | 45.2 / 41.9 / **66.4** | inconclusive (outlier) | +| **matmul_add** | **81.7ms** | **+38.0ms** | **63.0 / 70.8 / 111.2** | **CONFIRMED REGRESSION** | +| transpose_opt | 45.5ms | +1.8ms | 42.3 / 52.3 / 41.8 | neutral | +| nchwc | 45.4ms | +1.7ms | 43.4 / 48.0 / 44.7 | neutral | +| matmul_scale | 56.9ms | +13.3ms | 51.5 / 58.1 / 61.2 | probable mild regression | +| base_end | 48.3ms | +4.7ms | 45.3 / 56.7 / 43.1 | baseline (end-of-exp drift) | + +**Phase 3 outcome:** No candidates met promotion threshold (29.4ms needed). Baseline is optimal. 
+ +--- + +### Confirmed findings (statistically defensible) + +**1. `matmul-add-fusion` is a confirmed regression on ConvNext CPU (+38ms)** +- All 3 independent runs: 63.0 / 70.8 / 111.2ms — each far above the highest clean baseline run (45.4ms) +- Not attributable to system noise (no run-to-run overlap with baseline distribution) +- Mechanism hypothesis: baseline already converts MatMul+Add→Gemm (37 Gemm in model.onnx); applying matmul-add-fusion on top may create redundant or conflicting kernel dispatch. Unconfirmed — requires profiling. + +**2. `transpose-optimizer` is NEUTRAL on pure inference latency** +- Runs: 42.3 / 52.3 / 41.8ms — overlapping with clean baseline (42.5–45.4ms) +- ⚠️ **CORRECTION OF EARLIER FINDING:** A previous 8-iteration search (using `winml eval`) reported +270ms. That was a measurement artifact — `winml eval` includes HF preprocessing pipeline overhead and has no warmup. It measures *application startup + preprocessing + inference*, not *inference alone*. With `winml perf` (warmup=10, iter=50, pure inference): transpose_opt = baseline. Do not cite the +270ms in any report. + +**3. `nchwc-transformer` is neutral on this model** +- NCHWc SIMD layout: 43.4 / 48.0 / 44.7ms — no benefit for ConvNext CPU inference. + +**4. opset=18 is neutral** +- Same node count (251) as opset=17 — no graph structure changes. Mean slightly above baseline (48ms) is within machine variance. + +**5. No flag improved latency beyond noise. 
Baseline is the optimal config.** + +--- + +### ⚠️ Critical finding: ORT performance cliff at opset 19 (ConvNext CPU) + +**Experiment:** tested opset 17–22, all with identical graph structure (251 nodes, same op counts) + +| opset | mean p50 | slowdown | +|---|---|---| +| 17 | 43.7ms | — (baseline) | +| 18 | 48.0ms | 1.1× | +| **19** | **160.3ms** | **3.7×** | +| **20** | **131.0ms** | **3.0×** | +| **21** | **170.3ms** | **3.9×** | +| **22** | **85.0ms** | **1.9×** | + +**Key facts:** +- All runs within each opset are consistent (no outliers) — this is real, not noise +- Graph structure is **byte-for-byte identical**: Reshape×72, Transpose×42, Gemm×37, LN×23, Conv×22 for ALL opsets +- The performance difference is entirely in ORT's runtime execution path, not the graph + +**Mechanism: CONFIRMED ROOT CAUSE — ORT `kMaxSupportedOpset` gates Transpose Optimizer** + +Source: `onnxruntime/core/optimizer/transpose_optimization/optimizer_api.h` +```cpp +constexpr int64_t kMaxSupportedOpset = 18; // ORT v1.14.x — bumped each ORT release +``` + +Entry point `onnx_transpose_optimization::Optimize()` → `MakeOptimizerContext()`: +```cpp +if (*opset > kMaxSupportedOpset) { + return std::nullopt; // entire Transpose Optimizer skipped silently +} +``` + +ConvNext has 42 Transpose nodes (NCHW↔NHWC sandwich in every block). The Transpose Optimizer normally: +- Pushes Transposes through Add×18, Mul×18 (layer-scale + residual) across block boundaries +- Cancels adjacent inverse pairs + +When bypassed (opset > kMaxSupportedOpset), all 42 Transposes execute as full memory-layout copies → 3–4× systemic slowdown. 
+ +**ORT optimization level experiment (definitive proof):** + +| Session opt level | opset=17 | opset=19 | ratio | explanation | +|---|---|---|---|---| +| DISABLE_ALL | 47.5ms | **355ms** | **7.5×** | No Transpose Optimizer → all 42 Transposes raw | +| ENABLE_BASIC | 289ms | 315ms | 1.1× | Both slow (re-optimizing pre-fused graph) | +| ENABLE_EXTENDED | 209ms | 241ms | 1.2× | Better but no layout transform | +| **ENABLE_ALL** | 216ms | **215ms** | **1.0×** | Transpose Optimizer runs on both → full parity | + +**`kMaxSupportedOpset` version history:** + +| ORT version | kMaxSupportedOpset | opset ≥ N disabled | +|---|---|---| +| v1.14.x | **18** | ≥ 19 | +| v1.16.x | 19 | ≥ 20 | +| v1.17.x | 20 | ≥ 21 | +| v1.18.x | 21 | ≥ 22 | +| main/HEAD | **26** | fully covered | + +**Classification for optimization-research skill:** `[KNOWN_TRADEOFF]` (intentional design: ORT bumps the ceiling with each ONNX opset release) +- winml-cli ships a specific ORT build → its `kMaxSupportedOpset` is fixed +- winml-cli's **default opset=17 is correct and essential** — it is the safe zone for all current ORT builds +- Raising opset requires ensuring the shipping ORT version has `kMaxSupportedOpset ≥ target_opset` +- Do NOT raise default opset without verifying `kMaxSupportedOpset` in the shipped ORT + +**Call chain:** +``` +InferenceSession::Initialize() + → TransposeOptimizer::ApplyImpl() [transpose_optimizer.cc:18] + → onnx_transpose_optimization::Optimize() + → MakeOptimizerContext() + → if opset > kMaxSupportedOpset: return nullopt ← THE GATE +``` + +--- + +### Inconclusive / do not report + +These show elevated means but cannot be confirmed as regressions given machine variance (p90 = 2–3× p50 throughout): +- `ln_only`, `conv_add`, `conv_act`: each has ≥1 extreme outlier run; other runs are baseline-level +- `gelu_only`: consistently 49–56ms, possibly a mild regression but no outlier; 3 runs insufficient to separate from drift +- `matmul_scale`: all 3 runs elevated (51–61ms), but 
concurrent baseline also drifted (+5ms); net delta ~+8ms, weak signal + +Do not write these as confirmed regressions in user-facing reports. Label as "inconclusive" or omit. + +--- + +### Measurement methodology correction (winml eval vs winml perf) + +| Tool | What it measures | Latency for ConvNext CPU | +|---|---|---| +| `winml eval` (no warmup, includes preprocessing) | Application-level: model load + HF preprocessing + inference × N | ~67ms/sample | +| `winml perf --warmup 10 --iterations 50` | Pure inference: steady-state kernel execution only | ~43.7ms p50 | +| Difference | HF preprocessing + JIT warmup overhead | ~23ms | + +**Rule for autoconfig skill:** Always use `winml perf` with `--warmup 10 --iterations 50` for latency measurements in experiments. Never use `winml eval` latency to compare configs. + +--- + +### Key insight for autoconfig skill + +- CPU EP on ConvNext: no extra flag tested improved latency. Baseline (no fusions beyond what ORT L2 applies unconditionally) is optimal. +- The only actionable finding is: **do not add `matmul-add-fusion` for ConvNext on CPU** (or any model where baseline already uses Gemm). +- QNN/DML: not yet tested. Guidance on those EPs requires separate validated experiments. + +--- + +### `winml analyze` gaps discovered + +These are cases where analyzing the graph *before* running autoconfig would have prevented wasted search iterations: + +**Gap 1: "Already fused" vs "fuseable" not distinguished** +- ConvNext has `LayerNormalization` as a native op (already fused at PyTorch export) +- `layer-norm-fusion` targets the *decomposed* ReduceMean→Sub→... 
pattern +- `winml analyze` reports `OP/ai.onnx/LayerNormalization` without indicating it's already in canonical form +- **Impact:** user enables `layer-norm-fusion` thinking it will help; it does nothing (but builds take longer) +- **Fix:** analyze should tag ops as `already_canonical` vs `fuseable_subgraph` + +**Gap 2: DW-Conv not distinguished from regular Conv** +- ConvNext has 18×7x7 DW-Conv (group=C) and 4×regular Conv (group=1) +- `winml analyze` reports all as `OP/ai.onnx/Conv` (undifferentiated) +- QNN EP supports DW-Conv natively (important for NPU efficiency), but EP support classification is per op type, not per `groups` value +- **Impact:** user cannot tell whether Conv ops are the DW or regular variant; EP support may differ +- **Fix:** analyze should emit `OP/ai.onnx/Conv[depthwise]` vs `OP/ai.onnx/Conv[regular]` + +**Gap 3: Transpose-sandwich pattern not detected** +- 42 Transpose nodes in ConvNext form a clear `Conv→Transpose→LN→...→Transpose` repeating pattern +- `transpose-optimizer` turns this into NHWC chains (good for GPU/NPU, bad for CPU) +- `winml analyze` reports Transpose as just `OP/ai.onnx/Transpose` with no structural context +- **Impact:** user cannot predict whether `transpose-optimizer` will help or hurt without running it +- **Fix:** analyze should detect `transpose_sandwich_depth: N` and emit a warning for CPU EP + +**Gap 4: ORT L2 baseline fusions not surfaced** +- After ORT Level 2 optimization (which runs unconditionally), the graph already has fused Gelu, Gemm +- The analyze command runs on the *pre-optimize* export.onnx, not the actual optimized model +- `winml analyze` sees 36×MatMul in export.onnx but the real model at inference has 37×Gemm +- **Impact:** analyze output doesn't reflect what the model actually looks like when running +- **Fix:** analyze should optionally run on `optimized.onnx` (post-ORT-L2), not just `export.onnx` + +**Gap 5: MatMul semantic not classified** +- 36 MatMul ops are all MLP dense layers (4C→C or 
C→4C expansion) +- No attention MatMuls present (ConvNext has no self-attention) +- QNN handles dense-layer MatMul differently from attention-context MatMul +- `winml analyze` reports `OP/ai.onnx/MatMul` without semantic classification +- **Fix:** analyze could detect MatMul role heuristically (shapes: attention = square-ish, MLP = wide fan-out) + +--- + + + +### Why skill eval matters + +Mobius has no skill eval mechanism — it tests models but not skills themselves. This is a gap. +A SKILL.md can have correct content but still cause the agent to give wrong guidance if the +trigger description is poorly written or the structure is confusing. Skill eval catches this. + +### Two eval dimensions + +| Dimension | What it checks | When to run | +|---|---|---| +| **Static (content quality)** | description trigger phrases, command accuracy, cross-reference validity | Every PR that modifies a SKILL.md | +| **Dynamic (agent behavior)** | Given a user scenario + skill injected, does the agent produce the right commands and diagnosis? | On significant content changes; periodically | + +Static eval = the review checklist in `contributing-a-skill`. +Dynamic eval = test cases in `evals/eval.yaml` per skill, run with `winml skill eval`. + +### `winml skill` — new CLI subcommand + +The eval system is built into winml-cli itself as a new `skill` subcommand. +This keeps the toolchain self-contained and enables CI integration without external dependencies. + +**Command surface:** +```bash +winml skill check [--skill <name>] # static: lint + auto-verify all commands in SKILL.md +winml skill gen-evals [--skill <name>] # auto-research: generate eval.yaml from SKILL.md content +winml skill eval [--skill <name>] # dynamic: run agent behavior tests +winml skill list # list all skills with pass/fail status +``` + +#### `winml skill check` — auto-research via command extraction + +This is the "code change that does auto research": + +1.
**Parse SKILL.md** — extract every code block containing `winml <subcommand>` patterns +2. **Verify flags exist** — run `winml <subcommand> --help` and check each flag is present +3. **Verify cross-references** — confirm every `.agents/skills/<name>/SKILL.md` path exists +4. **Verify trigger coverage** — count quoted phrases in `description` frontmatter (must be ≥3) +5. **Optionally run commands** — with `--dry-run-commands`, execute each command on a + canary model to verify it doesn't crash + +Example output: +``` +winml skill check --skill debug-accuracy-drop + +Checking debug-accuracy-drop... + ✓ description: 4 trigger phrases found + ✓ winml eval --mode compare [flag verified against eval --help] + ✓ winml analyze -m ... --ep qnn [flag verified against analyze --help] + ✗ winml perf --monitor [flag '--monitor' not found in perf --help] ← STALE + ✓ cross-ref: ep-compatibility-check/SKILL.md exists + ✗ cross-ref: validate-before-ship/SKILL.md [file missing] ← BROKEN LINK +Summary: 2 issues found +``` + +Key insight: **every time winml-cli flags change, `winml skill check` automatically +detects which skills have stale commands** — no manual audit needed. + +Implementation sketch (`src/winml/modelkit/commands/skill.py`): +```python +import re, subprocess +from pathlib import Path +import click + +SKILLS_DIR = Path(__file__).parents[5] / "skills" +WINML_CMD_PATTERN = re.compile(r'^\s*(winml\s+\w[\w\-]*\s+[^\n]+)', re.MULTILINE) + +def extract_commands(skill_md: str) -> list[str]: + """Extract all 'winml ...'
lines from code blocks.""" + in_block = False + commands = [] + for line in skill_md.splitlines(): + if line.strip().startswith("```"): + in_block = not in_block + elif in_block and line.strip().startswith("winml "): + commands.append(line.strip()) + return commands + +def verify_flag(command_line: str) -> tuple[bool, str]: + """Check flags in a command line exist in --help output.""" + parts = command_line.split() + subcommand = parts[1] + flags = [p for p in parts[2:] if p.startswith("--")] + result = subprocess.run(["winml", subcommand, "--help"], + capture_output=True, text=True) + help_text = result.stdout + for flag in flags: + if flag not in help_text: + return False, f"flag '{flag}' not found in {subcommand} --help" + return True, "ok" + +@click.group("skill") +def skill_cmd(): + """Manage and evaluate winml-cli skills.""" + +@skill_cmd.command("check") +@click.option("--skill", default=None, help="Skill name to check (default: all)") +@click.option("--dry-run-commands", is_flag=True, help="Execute commands on canary model") +def check(skill, dry_run_commands): + """Static check: verify commands and cross-references in SKILL.md files.""" + targets = [SKILLS_DIR / skill] if skill else list(SKILLS_DIR.iterdir()) + for skill_dir in targets: + skill_md = (skill_dir / "SKILL.md").read_text() + for cmd in extract_commands(skill_md): + ok, msg = verify_flag(cmd) + status = "✓" if ok else "✗ STALE" + click.echo(f" {status} {cmd[:60]}") +``` + +#### `winml skill gen-evals` — LLM-powered eval case generation + +Auto-generates `evals/eval.yaml` from SKILL.md content using an LLM: + +1. **Extract trigger phrases** from `description` frontmatter +2. **Extract symptom→fix tables** from SKILL.md sections +3. **Prompt an LLM** to generate (user scenario, expected commands) pairs +4. 
**Write `evals/eval.yaml`** in PromptFoo format + +This is "auto research": the LLM reads the skill and generates adversarial cases +that challenge the agent — including negative cases where the agent should NOT +recommend something. + +```bash +winml skill gen-evals --skill debug-accuracy-drop --model gpt-4o --count 5 +# Writes: skills/debug-accuracy-drop/evals/eval.yaml (auto-generated) +# Human review before committing +``` + +The generated eval.yaml is a starting point — contributors review and refine before +committing. Over time, real user questions (from GitHub issues) can be mined and +added as additional eval cases. + +#### `winml skill eval` — agent behavior testing + +Runs the eval cases and reports results: + +```bash +winml skill eval --skill debug-accuracy-drop +# Uses evals/eval.yaml + injects SKILL.md as system prompt +# Reports pass/fail per test case +``` + +Internally shells out to PromptFoo (if installed) or uses a lightweight built-in runner +that calls the configured LLM API directly. + +### Directory layout + +Each skill carries its own eval cases: +``` +skills/ + debug-accuracy-drop/ + SKILL.md + evals/ + eval.yaml ← agent behavior test cases (hand-written or gen-evals output) +``` + +### eval.yaml format (PromptFoo) + +```yaml +# skills/debug-accuracy-drop/evals/eval.yaml +description: "Agent behavior eval for debug-accuracy-drop skill" + +prompts: + - "{{user_message}}" + +providers: + - id: openai:gpt-4o + config: + systemPrompt: | + You are a WinML CLI assistant. Use the following skill: + --- + {{skill_content}} + +tests: + - description: "Low cosine after W8A8 — should isolate to quantize stage" + vars: + user_message: "I quantized my model to W8A8 and cosine similarity is 0.87. What's wrong?" 
+ assert: + - type: contains + value: "winml eval --mode compare" + - type: icontains + value: "quantize" + - type: icontains + value: "w8a16" # should suggest escalating precision + + - description: "NPU vs CPU discrepancy — should point to op fallback" + vars: + user_message: "My model gives different results on QNN NPU vs CPU after compile" + assert: + - type: contains + value: "winml analyze" + - type: icontains + value: "partial" # mention partial op fallback + - type: icontains + value: "compile" # blame compile stage, not quantize + + - description: "Drop after optimize only — should NOT blame calibration" + vars: + user_message: "cosine similarity dropped after winml optimize, I haven't quantized yet" + assert: + - type: contains + value: "winml eval --mode compare" + - type: icontains + value: "optimize" + - type: not-icontains + value: "calibration" # calibration is irrelevant here +``` + +### Minimum eval cases per skill + +| Skill | Min cases | Key assertions | +|---|---|---| +| `check-model-feasibility` | 4 | Screens candidates with `winml inspect` (never recommends an unsupported model); recommends the 3-layer check in order; gives fallback when EP absent | +| `debug-accuracy-drop` | 4 | Correctly isolates pipeline stage; suggests precision escalation | +| `ship-to-winapp` | 4 | Lists all 6 validation gates; handles waiver scenario; produces manifest.json with CPU fallback | +| `autoconfig` | 3 | Applies latency-budget vs accuracy-floor framework (manual mode); keeps/discards by objective (auto mode) | +| `adding-model-support` | 2 | Suggests L1→L5 order; correct recipe structure | +| `contributing-a-skill` | 2 | Flags missing trigger phrases; flags pseudocode commands | + +### What "passing" means + +An eval case passes when all assertions hold. 
Recommended pass threshold before merging: +- All `contains` / `icontains` assertions pass +- All `not-icontains` (negative) assertions pass (agent does NOT give wrong advice) + +The negative assertions are the most valuable — they catch the agent confidently giving +wrong guidance (e.g., blaming calibration for an optimize-stage drop). + +### Running evals + +```bash +# Install PromptFoo +npm install -g promptfoo + +# Run eval for a single skill +cd skills/debug-accuracy-drop +promptfoo eval --config evals/eval.yaml + +# Run all skill evals +for dir in skills/*/; do + if [ -f "$dir/evals/eval.yaml" ]; then + promptfoo eval --config "$dir/evals/eval.yaml" + fi +done +``` + +--- + +## Implementation notes + +### Directory structure +``` +skills/ + use-winml-cli/ ← existing, extend (user) + SKILL.md + evals/eval.yaml + check-model-feasibility/ ← new (user — model discovery + EP/device compatibility) + SKILL.md + evals/eval.yaml + debug-accuracy-drop/ ← new (user) + SKILL.md + evals/eval.yaml + autoconfig/ ← new (user — optimize: autoresearch loop + manual framework) + SKILL.md + evals/eval.yaml + ship-to-winapp/ ← new (user — validation gates + multi-EP packaging; partial dep on winml package feature) + SKILL.md + evals/eval.yaml + adding-model-support/ ← new (contributor) + SKILL.md + evals/eval.yaml + adding-ep-support/ ← new (contributor) + SKILL.md + evals/eval.yaml + contributing-a-skill/ ← new (contributor) + SKILL.md + evals/eval.yaml + optimization-research/ ← new (contributor — internal deep gap analysis for winml-cli team) + SKILL.md + templates/olive_qnn.json + templates/olive_dml.json + evals/eval.yaml +``` + +### Priority order for implementation + +This is **implementation sequencing** (risk- and dependency-driven), which intentionally differs from +the **importance** ranking in the Overview. Importance answers "which skill matters most to users"; +this answers "which is safest to build first." 
Example: `autoconfig` is the #1 *importance* user skill +but ships *last* because it depends on the `--format json` changes and is the most complex. + +**Code changes first (unblocks agentic skill execution):** +0. `winml eval --format json` — critical: enables all accuracy-related agentic flows +0. `winml analyze --format json` — enables EP compatibility agentic flows +0. `winml perf --format json` — enables performance SLA agentic flows + +**User skills:** +1. `check-model-feasibility` — lowest risk, pure existing commands (`inspect`/`sys`/`analyze`); front door for new users (model discovery half needs `analyze --format json`) +2. `debug-accuracy-drop` — closes clearest pain point, existing `eval --mode compare` +3. `ship-to-winapp` — validation checklist + packaging; build it once the gate commands exist (partial dep on `winml package` feature) +4. `autoconfig` — depends on #847/#848/#849 + most complex skill to implement (manual mode can ship first as the lightweight framework) + +**Contributor skills:** +5. `contributing-a-skill` — enables community contributions to the skill ecosystem +6. `adding-model-support` — most impactful for model coverage growth +7. `adding-ep-support` — lower frequency, but needed for new EP onboarding +8. `optimization-research` — internal gap-finder; depends on a working `autoconfig` baseline to compare against + +### Required code changes for agentic skill execution + +The three changes that turn skills from documentation into agentic programs: + +**1. `winml eval --format json`** + +File: `src/winml/modelkit/commands/eval.py` + +Add `--format` option and emit structured JSON to stdout: +```json +{ + "mode": "compare", + "model": "path/to/quantized.onnx", + "model_id": "microsoft/resnet-50", + "metrics": { + "cosine_similarity": 0.87, + "sqnr_db": 28.3, + "psnr_db": 31.1, + "max_abs_diff": 0.042 + }, + "task_metric": { "top1_accuracy": 0.741 }, + "threshold_pass": false +} +``` + +**2. 
`winml analyze --format json`** + +File: `src/winml/modelkit/commands/analyze.py` + +Already supports `--output file.json`. Add `--format json` to also print to stdout +(mirrors pattern from `winml inspect` and `winml sys`): +```json +{ + "ep": "qnn", + "model": "path/to/model.onnx", + "summary": { "supported": 142, "partial": 3, "unsupported": 1 }, + "partial_ops": ["MultiHeadAttention", "LayerNorm", "Softmax"], + "unsupported_ops": ["CustomRotaryEmbedding"] +} +``` + +**3. `winml perf --format json`** + +File: `src/winml/modelkit/commands/perf.py` + +Already writes JSON to file via `-o`. Add `--format json` stdout output: +```json +{ + "model": "path/to/model.onnx", + "ep": "qnn", + "device": "npu", + "iterations": 100, + "latency_ms": { "p50": 18.3, "p90": 21.7, "p99": 28.4, "mean": 18.9 }, + "throughput_rps": 54.6 +} +``` + +These three changes are ~50 lines of code each, follow the existing pattern from +`winml inspect --format json` and `winml sys --format json`, and unlock the full +agentic execution model for all consumer skills. + +### Sizing estimate (per skill) +Each SKILL.md based on Mobius patterns (~8–14KB): +- ~200 lines prose + decision tables +- ~50 lines code examples +- Cross-reference section + +### Relationship to existing `use-winml-cli` skill +The new skills are **task-scoped** (problem → solution) vs the existing skill which is +**tool-scoped** (here's what each command does). They complement, not replace each other. +The existing skill should add cross-references to the new skills in its "Common patterns" section. 
+ +--- + +## QNN NPU Catalog Sweep — Findings & Feature Gaps (2026-06-13) + +Source: 8-model catalog sweep via autoconfig POC (C:\tmp\autoconfig-demo\catalog_qnn_sweep.py) + +### Cross-model results + +| Model | Arch | Baseline p50 | Best p50 | Gain | Best config | +|-------|------|-------------|----------|------|-------------| +| microsoft/resnet-18 | resnet | 0.96ms | 0.96ms | — | baseline (opset17) | +| google/vit-base-patch16-224 | vit | 9.04ms | 9.04ms | — | baseline (opset17) | +| apple/mobilevit-small | mobilevit | 12.07ms | **8.62ms** | +29% | opset21+conv_fusions | +| facebook/dinov2-small | dinov2 | 6.56ms | **4.98ms** | +24% | opset21 | +| hustvl/yolos-small | yolos | 78.69ms | — | timeout | — | +| distilbert SST-2 | distilbert | 19.48ms | 19.48ms | — | baseline | +| all-MiniLM-L6-v2 | bert | 5.81ms | 5.81ms | — | baseline | +| deepset/roberta-base-squad2 | roberta | 14.94ms | 14.72ms | 1.5% | opset21 | + +### Validated KB findings + +**npu-001 refined**: opset21 benefit is architecture-gated: +- ✅ Conv + residual connections: +25–31% (mobilevit, dinov2, convnext) +- ❌ Pure transformer (ViT, YOLOS): -7% or neutral +- ⚪ NLP BERT-family: neutral + +**npu-006 NEW — CRITICAL**: Conv fusions (conv-bn/add/activation) cause catastrophic QNN NPU CPU fallback +- ResNet-18 with conv fusions: 0.96ms → 132ms (+4900% regression) +- MobileViT: safe (no regression) +- Severity: critical — can produce 50x+ regression silently + +**npu-007 NEW**: DVFS thermal noise makes CV gate unreliable on QNN NPU +- New bench protocol: 3 sessions × 500 iters + 30s cool-down + median p50 + >10% noise floor + +### Feature gaps (winml-cli backlog items) + +**Gap A: winml analyze — Conv fusion QNN safety check** +winml analyze should detect Conv-dominant topologies and warn when conv-bn/add/activation +fusions are configured for QNN NPU target. Currently no pre-build detection of this hazard. 
+- Command to add: warning in analyze output when ep=qnn AND conv_fusion_pass is enabled AND model has >N Conv ops +- Priority: HIGH (silent 50x regression risk) + +**Gap B: budget-aware sweep in autoconfig** +Large models (YOLOS, ~78ms/inf) cause sweep timeout with current fixed budget. +Need: per-hypothesis time estimation → auto-skip models that exceed budget, log as "timeout" not failure. +- Affects: autoconfig POC and any future winml sweep command + +**Gap C: winml perf DVFS-aware session averaging** +winml perf should natively support session-level median aggregation for QNN NPU. +Current single-session variance is dominated by DVFS thermal state, not model performance. +- Flag proposal: --sessions 3 --cool-down 30 --signal median-p50 +- This would make winml perf output trustworthy for optimization decisions on Snapdragon X Elite + +--- + +## Feature Request: FusedConv detection + unfuse-for-qnn (2026-06-15) + +### Problem + +用户可能从外部拿到一个已经做过 Conv fusion 的 ONNX 模型,或者 autoconfig 实验里开了 conv-add-activation-fusion flag。 +这类模型在 QNN NPU 上跑起来特别慢(ResNet-18 实测 +4900% regression),但没有任何报错,用户完全不知道原因。 + +### Root cause + +conv-add-activation-fusion 生成的是 ORT 扩展 op FusedConv(非标准 ONNX op)。 +QNN EP 不认识这个 op,所有 FusedConv 节点全部 fallback 到 CPU,PCIe round-trip 开销极大。 + +conv-bn-fusion 不同:它把 BN 参数数学吸收进 Conv weight,不产生新 op 类型,结果仍是标准 Conv,**不可逆**。 + +### Proposed feature + +**1. winml analyze — FusedConv detection** + +`winml analyze -m model.onnx --ep qnn` 扫描图中所有节点, +如果发现 FusedConv 节点且目标 EP 为 QNN,输出警告: + +``` +⚠ QNN NPU: 23 FusedConv nodes detected. + FusedConv is an ORT-internal op not supported by QNN EP — these nodes will fall back to CPU. + Recommend: run winml optimize --unfuse-conv to expand back to standard ONNX ops. +``` + +**2. 
winml optimize --unfuse-conv** + +新增 optimize pass:把 FusedConv 节点拆回 Conv + Add + Activation。 +- Lossless(权重不变,只拆 op 结构) +- 输出标准 ONNX,QNN EP 可正常映射 HTP kernel +- 适用场景:BYOM 用户带入已做过 fusion 的模型 + +**Implementation notes** +- 检测:`node.op_type == "FusedConv"` 即可定位 +- 拆分:读 FusedConv attribute `activation` 字段 → 插入对应 Relu/Sigmoid/Tanh 节点 +- 不处理 conv-bn-fusion 产生的模型(那个无法反向,只能重新从 FP32 export) + +### Priority +MEDIUM — 默认 flag 是关的,不是高频路径,但对 BYOM 场景(拿到别人优化过的模型)有实际价值。 diff --git a/research/autoconfig/ep_knowledge/README.md b/research/autoconfig/ep_knowledge/README.md new file mode 100644 index 000000000..61ccd28cc --- /dev/null +++ b/research/autoconfig/ep_knowledge/README.md @@ -0,0 +1,25 @@ +# Per-EP Empirical Knowledge Base + +Each JSON file stores empirical findings for one EP/device combination. + +## ⚠️ CRITICAL EPISTEMICS + +These findings are **observational hypotheses, not ground truth**. They were derived +from a small number of experiments on a single model (ConvNext-tiny) on a single device +(Snapdragon X Elite CRD). Every finding carries a `confidence` field and a `falsified_by` +field. Before using a finding to prune a search space, check: + +1. **Is the model architecture similar?** (ConvNext ≠ BERT ≠ ResNet) +2. **Is the hardware the same?** (X Elite CRD ≠ X Plus ≠ X1E-80-100) +3. **Is the ORT/QNN SDK version the same?** +4. **Is the mechanism confirmed?** (see `mechanism_confirmed` field) + +**Dialectical rule**: A finding that prunes a search dimension must be re-enabled +if a new experiment on a new model/hardware contradicts it. Findings degrade over time +as ORT and QNN SDK versions change. 
+ +## Files +- `qnn_npu.json` — QNN HTP (NPU) EP findings +- `qnn_gpu.json` — QNN GPU EP findings +- `dml.json` — DirectML EP findings +- `cpu.json` — CPU EP findings diff --git a/research/autoconfig/ep_knowledge/cpu.json b/research/autoconfig/ep_knowledge/cpu.json new file mode 100644 index 000000000..42a693928 --- /dev/null +++ b/research/autoconfig/ep_knowledge/cpu.json @@ -0,0 +1,128 @@ +{ + "_meta": { + "ep": "cpu", + "device": "cpu", + "hardware": "Snapdragon X Elite CRD (Oryon CPU)", + "ort_version": "1.x (check winml version at experiment time)", + "model": "facebook/convnext-tiny-224 (ALL findings from this model only)", + "last_updated": "2026-06-17", + "epistemics_warning": "⚠️ All findings from rigorous 3-run ablation. However, still 1 model, 1 device. CPU behavior can differ significantly between x86 and ARM (Oryon). Check architecture before applying rules." + }, + + "findings": [ + + { + "id": "cpu-001", + "title": "opset 19+ causes severe regression on CPU EP (3-4x slowdown) — data confirmed, mechanism uncertain", + "observation": "opset 17: p50=43.7ms. opset 19: p50=160ms (3.7x). opset 20: p50=131ms (3.0x). opset 21: p50=170ms (3.9x). opset 22: p50=85ms (1.9x). All runs consistent — not noise. Pattern is non-monotonic: opset 22 partially recovers but remains 1.9x slower than opset 17.", + "mechanism_confirmed": false, + "mechanism_hypothesis": "Original hypothesis: ORT C++ Transpose Optimizer has a kMaxSupportedOpset gate (optimizer_api.h). If model opset > kMaxSupportedOpset, Transpose Optimizer is skipped silently. ConvNext has 42 Transpose nodes — without optimization, each executes as a full memory-layout copy. HOWEVER: the non-monotonic recovery at opset 22 (85ms vs 160-170ms at opset 19-21) is inconsistent with a simple binary gate. If the gate fires for opset > N, opset 22 should behave identically to opset 19. The actual mechanism is more complex. 
Additionally, ORT 1.24.x has kMaxSupportedOpset >= 23 confirmed (separate NHWC gate) — the Transpose Optimizer gate threshold may differ but is unverified.", + "action_for_autoconfig": "For CPU EP: default to opset 17. The empirical data (1 model, consistent across opsets) is unambiguous — opset 17 is the best option. Do NOT try opset 19+. The mechanism reason is uncertain but the practical conclusion is solid.", + "confidence": "high on empirical observation (consistent data across opsets for 1 model). Low on mechanism — the gate hypothesis does not fully explain the non-monotonic opset 22 partial recovery.", + "falsified_by": null, + "scope": "ConvNext on Oryon CPU, ORT 1.24.x. Models with few Transpose nodes (BERT) likely unaffected.", + "ort_kMaxSupportedOpset_by_version": { + "note": "These values are for the NHWC layout_transformation gate, NOT the Transpose Optimizer gate. The two constants may differ within the same ORT release.", + "v1.14.x": 18, + "v1.16.x": 19, + "v1.17.x": 20, + "v1.18.x": 21, + "v1.24.x": ">= 23 (confirmed for NHWC gate; Transpose Optimizer gate unknown)", + "main_HEAD": 26 + }, + "do_not_generalize_to": "QNN NPU EP or DML EP — kMaxSupportedOpset is a CPU-only ORT optimizer gate. These EPs have their own kernel dispatch unaffected by this." + }, + + { + "id": "cpu-002", + "title": "matmul_add_fusion is a CONFIRMED REGRESSION on ConvNext CPU (+38ms, ~87%)", + "observation": "matmul_add_fusion: p50=81.7ms, runs=[63.0, 70.8, 111.2ms]. Baseline p50=43.7ms. All 3 runs far above highest baseline run (45.4ms).", + "mechanism_confirmed": false, + "mechanism_hypothesis": "ORT baseline already converts MatMul+Add→Gemm (37 Gemm in model.onnx). Applying matmul_add_fusion on top may create redundant kernel dispatch or conflicting operator mapping. 
Requires profiling to confirm.", + "action_for_autoconfig": "Do NOT apply matmul_add_fusion for CPU EP on models where baseline already uses Gemm (check model.onnx for Gemm nodes before applying this pass).", + "confidence": "high — 3 independent runs, all far above baseline; direction is unambiguous", + "falsified_by": null, + "scope": "ConvNext and models where ORT L2 baseline already fuses MatMul+Add→Gemm", + "do_not_generalize_to": "Models where baseline does NOT have Gemm (the pass may legitimately help there)" + }, + + { + "id": "cpu-003", + "title": "transpose_optimizer is neutral on ConvNext CPU (NOT +270ms as previously reported)", + "observation": "winml perf (warmup=10, iter=50): 42.3 / 52.3 / 41.8ms — overlapping baseline. Earlier winml eval-based measurement showed +270ms — this was a measurement artifact.", + "mechanism_confirmed": true, + "mechanism_hypothesis": "winml eval includes HF preprocessing + model load + no warmup. The +270ms was preprocessing overhead, not inference regression. Pure inference measurement (winml perf) shows no effect.", + "action_for_autoconfig": "transpose_optimizer is neutral for ConvNext CPU — neither helpful nor harmful. Can be omitted from search space.", + "confidence": "high — measurement methodology confirmed; tool comparison validated", + "falsified_by": "Earlier winml eval measurement — RETRACTED. Use winml perf for all latency comparisons.", + "scope": "ConvNext CPU", + "measurement_lesson": "Always use winml perf (warmup=10, iter=50) for latency experiments. Never use winml eval latency to compare configs." + }, + + { + "id": "cpu-004", + "title": "nchwc_transformer is neutral on ConvNext CPU", + "observation": "nchwc: 43.4 / 48.0 / 44.7ms — overlapping baseline (42.5–45.4ms). No improvement.", + "mechanism_confirmed": false, + "mechanism_hypothesis": "NCHWc SIMD layout benefits Conv-heavy models. ConvNext has 22 Conv nodes but 57.7% of kernel time is Gemm. 
The bottleneck is not memory layout but compute throughput — NCHWc doesn't help.", + "action_for_autoconfig": "nchwc_transformer is low-priority for ConvNext-class models. Profile first — if Conv% > 40%, try nchwc. If Gemm% > 50%, skip.", + "confidence": "medium — 3 runs, neutral result; mechanism is a hypothesis", + "falsified_by": null, + "scope": "ConvNext CPU (Gemm-dominated, not Conv-dominated)" + }, + + { + "id": "cpu-005", + "title": "Baseline (no extra flags) is the optimal config for ConvNext CPU", + "observation": "No flag in 22-experiment ablation improved p50 beyond noise. Baseline p50=43.7ms is the floor.", + "mechanism_confirmed": false, + "mechanism_hypothesis": "ORT L2 baseline already applies gelu_fusion and MatMul→Gemm before any user flags. The effective optimization space is narrow for ConvNext on CPU. Compute bottleneck (Gemm=57.7%) is not addressable via graph passes.", + "action_for_autoconfig": "For CPU EP on ConvNext-class models: skip optimization pass sweep. Go directly to quantization experiments.", + "confidence": "high — 22 experiments, no improvement found", + "falsified_by": null, + "scope": "ConvNext-class vision models on CPU", + "do_not_generalize_to": "BERT/Transformer models where attention_fusion + skip_layer_norm can significantly help" + }, + + { + "id": "cpu-006", + "title": "CPU EP opset 21 is 3.9x SLOWER — opposite of QNN NPU behavior", + "observation": "CPU opset 21: p50=170ms. CPU opset 17: p50=43.7ms. QNN NPU opset 21 (DINOv2): p50=26ms (~24% FASTER than opset 17 at 34ms). Note: the NPU and CPU experiments used DIFFERENT models (CPU=ConvNext, NPU=DINOv2) — the comparison is directional only, not quantitative.", + "mechanism_confirmed": false, + "mechanism_hypothesis": "CPU regression from Transpose Optimizer bypass (see cpu-001 — mechanism uncertain). QNN NPU speedup from unknown cause (original Transpose bypass hypothesis invalidated; Transpose counts identical in opset17/21 graphs). 
The key insight is that CPU and QNN NPU respond oppositely to opset changes, regardless of the root cause.", + "action_for_autoconfig": "EP ISOLATION: CPU opset findings MUST NOT influence QNN NPU search space, and vice versa. Always validate per EP independently.", + "confidence": "high on empirical observation. Low on mechanism for both directions.", + "falsified_by": null, + "scope": "ALL — this is a meta-rule about EP isolation, not model-specific" + } + + ], + + "search_space_rules": { + "opset": { + "recommended_order": [17], + "skip": ["19, 20, 21, 22 — kMaxSupportedOpset regression (cpu-001). Only safe to try if ORT version's kMaxSupportedOpset >= target."], + "dialectical_note": "⚠️ This rule is ORT-version dependent. Check kMaxSupportedOpset for the shipping ORT build before skipping higher opsets." + }, + "quantization": { + "recommended": "w8a8 (CPU benefits most from small model size)", + "dialectical_note": "⚠️ W8A8 on CPU not yet validated for ConvNext. General guidance — run accuracy gate." + }, + "compile": { + "always_run": false, + "skip": true, + "dialectical_note": "⚠️ winml compile targets QNN EPContext. Not applicable to CPU EP." + }, + "graph_passes": { + "recommended": "autoconf defaults only", + "skip": ["matmul_add_fusion if model already has Gemm (cpu-002)", "nchwc_transformer if Gemm% > 50% in profile (cpu-004)"], + "dialectical_note": "⚠️ Skip rules are Gemm-bottleneck specific. Conv-heavy models may still benefit from nchwc_transformer." + } + }, + + "meta_lessons": { + "measurement_discipline": "Always use winml perf (warmup=10, iter=50) for latency. Never use winml eval latency. See cpu-003.", + "ep_isolation": "CPU findings (especially opset regression) DO NOT transfer to QNN NPU or DML. Each EP has its own optimizer path. See cpu-006.", + "baseline_check": "Before applying any fusion flag, check model.onnx for existing fused ops. If Gemm already present, matmul_add_fusion is likely a no-op or regression." 
+ } +} diff --git a/research/autoconfig/ep_knowledge/dml.json b/research/autoconfig/ep_knowledge/dml.json new file mode 100644 index 000000000..829a7a85e --- /dev/null +++ b/research/autoconfig/ep_knowledge/dml.json @@ -0,0 +1,104 @@ +{ + "_meta": { + "ep": "dml", + "device": "gpu", + "hardware": "Snapdragon X Elite CRD (Adreno X1-85 / DirectML via D3D12)", + "ort_version": "1.x with onnxruntime-directml package", + "model": "facebook/convnext-tiny-224 (ALL findings from this model only)", + "last_updated": "2026-06-17", + "epistemics_warning": "⚠️ DML experiments required swapping onnxruntime-directml for onnxruntime (Python package conflict). Results reflect DML EP behavior via winml's DML DLL, not the Python onnxruntime-directml package directly. Re-validate if package setup changes." + }, + + "findings": [ + + { + "id": "dml-001", + "title": "DML FP32 is more stable than QNN GPU FP32 — p50 difference is within noise", + "observation": "DML FP32: p50=16.9ms, p90=17.7ms, std=0.52. QNN GPU FP32: p50=17.7ms, p90=19.7ms, std=0.97. p50 diff = 0.8ms = 0.82σ of QNN GPU measurement — distributions OVERLAP. NOT a separable performance difference. DML is meaningfully more stable (std 0.52 vs 0.97, CV 3% vs 5.5%).", + "mechanism_confirmed": false, + "mechanism_hypothesis": "DML JIT-compiles HLSL shaders at model load time — shader compilation done once, producing stable execution. QNN GPU EP does graph partitioning at each session creation — more overhead and jitter.", + "action_for_autoconfig": "CORRECTED: Do NOT claim DML is faster than QNN GPU based on this data — the 0.8ms difference is within noise. DML IS more stable (lower CV). Prefer DML for lower tail latency (p90) and variance. p50 advantage is unconfirmed.", + "confidence": "low on p50 speedup (not statistically separable). Medium on stability advantage (std 0.52 vs 0.97 is real difference even if p50 overlaps).", + "falsified_by": "Statistical analysis: 0.8ms diff < 1σ of GPU measurement. 
Removed from 'DML is faster' claims.", + "scope": "Adreno X1-85, ConvNext-class models, 3-run comparison (insufficient for definitive p50 ranking)", + "do_not_generalize_to": "NVIDIA/Intel GPUs (QNN GPU not available there anyway)" + }, + + { + "id": "dml-002", + "title": "NHWC transformer increases latency variance on DML — p50 is neutral or marginally better", + "observation": "DML NHWC: p50=16.5ms (-0.4ms vs baseline 16.9ms), p90=21.0ms (+19% vs baseline 17.7ms), std=1.89 (3.6x worse than FP32 baseline 0.52). NOTE: p50 is marginally BETTER with NHWC, not worse. The regression is in tail latency and variance.", + "mechanism_confirmed": false, + "mechanism_hypothesis": "D3D12 on Adreno X1-85 handles tensor layouts internally via HLSL shaders. Adding explicit ORT NHWC Transposes does not improve memory alignment for DML but adds dispatch overhead that occasionally causes scheduling jitter, inflating p90 and std.", + "action_for_autoconfig": "Do NOT apply nhwc-transformer for DML EP if tail latency stability matters. p50 may be marginally better but p90 is 19% worse and std is 3.6x worse. For applications sensitive to worst-case latency, NHWC is harmful.", + "confidence": "low — single run comparison, different baselines (run_count unspecified). Direction for variance is clear; p50 benefit is marginal and unreliable.", + "falsified_by": null, + "scope": "Adreno X1-85 + DML, ConvNext", + "do_not_generalize_to": "NVIDIA GPUs (NHWC may help with CUDNN)" + }, + + { + "id": "dml-003", + "title": "DML FP16 gives ~1.4x speedup with NO DVFS bimodal (unlike QNN GPU FP16)", + "observation": "DML FP16 (via Python hack, not official CLI): p50=11.8ms, p90=12.8ms, std=0.66. Clean unimodal distribution.", + "mechanism_confirmed": false, + "mechanism_hypothesis": "DML HLSL shader compilation locks in FP16 compute paths at load time — no dynamic voltage/frequency switching surprises. 
QNN GPU FP16 showed DVFS bimodal distribution (some runs in high-power state, some in low-power state).", + "action_for_autoconfig": "FP16 is the primary optimization lever for DML. Unblock via #867 (--precision fp16 flag).", + "confidence": "low — experiment used Python hack (not official winml CLI). Mark as SKIPPED/CLI-gap until #867 ships.", + "falsified_by": null, + "scope": "Adreno X1-85 + DML", + "tracked_issue": "#867", + "cli_gap": true, + "cli_gap_note": "⚠️ This finding was produced via a Python workaround, not winml CLI. Cannot be reproduced with winml build today. Blocked on #867." + }, + + { + "id": "dml-004", + "title": "winml analyze returns 0/0/0/251 (all Unknown) for DML EP — no rule data", + "observation": "winml analyze --ep dml outputs: supported=0, partial=0, unsupported=0, unknown=251.", + "mechanism_confirmed": true, + "mechanism_hypothesis": "DML EP supports all standard ONNX ops by design (D3D12 universal op coverage). winml analyze has no DML-specific rule data file. This is a cosmetic gap — DML actually runs all ops natively.", + "action_for_autoconfig": "Do not use winml analyze output to prune search space for DML. Assume all ops supported.", + "confidence": "high — confirmed by DML running all 251 ops with no CPU fallback", + "falsified_by": null, + "scope": "DML EP (all models)", + "tracked_issue": "not filed — cosmetic gap, low priority" + }, + + { + "id": "dml-005", + "title": "opset 21 on DML not yet validated", + "observation": "opset 21 sweep only run on QNN NPU. DML behavior with opset 21 is unknown.", + "mechanism_confirmed": false, + "mechanism_hypothesis": "DML uses D3D12 dispatch — different from QNN EP kernel registry. opset 21 speedup on QNN NPU may not apply.", + "action_for_autoconfig": "Include opset 21 in DML search sweep. 
No prior data — must run experiment.", + "confidence": "low — no data", + "falsified_by": null, + "scope": "UNKNOWN — needs experiment" + } + + ], + + "search_space_rules": { + "opset": { + "recommended_order": [17, 21], + "rationale": "dml-005: unknown. Include both in sweep.", + "dialectical_note": "⚠️ No data on DML + opset 21. Do not assume NPU behavior transfers." + }, + "quantization": { + "recommended": "fp16 (when #867 ships)", + "skip": ["w8a8", "w8a16 — quantization rarely helps on GPU via DML"], + "dialectical_note": "⚠️ Quantization skip is based on general DML behavior. Some models with large weights may benefit from W8A16 even on DML. Test empirically." + }, + "compile": { + "always_run": false, + "skip": true, + "dialectical_note": "⚠️ DML uses HLSL, not QNN binary compilation. winml compile targets QNN EPContext only. Not applicable to DML." + }, + "graph_passes": { + "recommended": "autoconf defaults only", + "skip": ["nhwc-transformer (dml-002)"], + "dialectical_note": "⚠️ Same as QNN GPU: NHWC hurts on Adreno. NVIDIA/Intel may differ." + } + } +} diff --git a/research/autoconfig/ep_knowledge/qnn_gpu.json b/research/autoconfig/ep_knowledge/qnn_gpu.json new file mode 100644 index 000000000..d350cc54a --- /dev/null +++ b/research/autoconfig/ep_knowledge/qnn_gpu.json @@ -0,0 +1,115 @@ +{ + "_meta": { + "ep": "qnn", + "device": "gpu", + "hardware": "Snapdragon X Elite CRD (Adreno X1-85 / QNN GPU EP)", + "ort_version": "1.x (check winml version at experiment time)", + "qnn_sdk_version": "unknown — check QnnSystem.dll version", + "model": "facebook/convnext-tiny-224 (ALL findings from this model only)", + "last_updated": "2026-06-17", + "epistemics_warning": "⚠️ All findings are hypotheses derived from 1 model on 1 device. Confidence levels reflect mechanism understanding, not universal applicability. GPU EP behavior varies significantly by model architecture and Adreno driver version." 
+ }, + + "findings": [ + + { + "id": "gpu-001", + "title": "FP32 baseline is already optimal for ConvNext on QNN GPU — no optimization pass helps", + "observation": "Full sweep of 11 passes/combinations on ConvNext QNN GPU: all returned 0% node reduction or worse latency. Baseline p50=17.7ms, p90=19.7ms, std=0.97.", + "mechanism_confirmed": true, + "mechanism_hypothesis": "251/0/0/0 (all ops native on GPU, zero CPU fallback). ConvNext linear layers use Reshape→MatMul→Reshape, not bare MatMul+Add — so MatMulAdd→Conv2D rewrites don't match. autoconf (gelu_fusion + matmul_add_fusion) already applied all applicable transforms.", + "action_for_autoconfig": "Skip all graph optimization experiments for QNN GPU on ConvNext-class models. Use FP32 baseline directly.", + "confidence": "high — confirmed by 0% node delta on all rewrites + 251/0/0/0 analyze output", + "falsified_by": null, + "scope": "ConvNext-class models (Reshape→MatMul→Reshape pattern)", + "do_not_generalize_to": "Transformer models with bare MatMul+Add (those may benefit from rewrites)" + }, + + { + "id": "gpu-002", + "title": "NHWC transformer hurts QNN GPU on Adreno X1-85 (~10% worse)", + "observation": "NHWC transformer: p50=19.5ms (+10%), p90=23.8ms (+21%), std=3.43 (3.5x worse). Consistent across multiple runs.", + "mechanism_confirmed": false, + "mechanism_hypothesis": "Adreno X1-85 + QNN GPU EP does not benefit from explicit NHWC layout transforms. 
QNN GPU EP handles layout internally; forcing NHWC via ORT creates additional Reshape overhead without the memory alignment benefit.", + "action_for_autoconfig": "Do NOT apply nhwc-transformer for QNN GPU EP.", + "confidence": "medium — observed consistently; mechanism hypothesis, not confirmed", + "falsified_by": null, + "scope": "Adreno X1-85 + QNN GPU EP", + "do_not_generalize_to": "Non-Adreno GPUs (NVIDIA, Intel Arc) — NHWC may help there" + }, + + { + "id": "gpu-003", + "title": "winml compile appears to hurt QNN GPU (~34% regression) — SINGLE EXPERIMENT, LOW CONFIDENCE", + "observation": "FP32 + compile: p50=23.7ms vs baseline 17.7ms (+34%). Single experiment only.", + "mechanism_confirmed": false, + "mechanism_hypothesis": "QNN GPU EP compile (EPContext) is designed for NPU (HTP). On GPU EP, the compilation path may force a different dispatch mode that bypasses the optimized GPU shader path. QNN SDK likely has a GPU-specific compilation flow that winml compile doesn't trigger correctly.", + "action_for_autoconfig": "AVOID winml compile for QNN GPU EP. Direction (regression) is consistent with mechanism hypothesis and 34% is a large signal, but this is a single experiment. Until replicated, treat as likely harmful but not confirmed.", + "confidence": "low — single experiment. 34% gap is above DVFS noise level (CV ~0.05 → noise ~1ms, gap is 6ms). Direction probably real but magnitude uncertain.", + "falsified_by": null, + "scope": "QNN GPU EP", + "do_not_generalize_to": "QNN NPU EP (compile always helps NPU)" + }, + + { + "id": "gpu-004", + "title": "W8A8 QDQ hangs indefinitely on QNN GPU EP", + "observation": "Passing a W8A8 QDQ-annotated ONNX to QNN GPU EP causes infinite hang. winml build's _patch_device() sets quant=null for GPU, preventing this in normal user path.", + "mechanism_confirmed": true, + "mechanism_hypothesis": "QNN SDK's GPU EP does not support QDQ-quantized graphs. This is a known QNN SDK limitation. 
winml build already protects against this via _patch_device().", + "action_for_autoconfig": "Skip ALL quantization experiments for QNN GPU EP. Do not even attempt W8A8 or W8A16.", + "confidence": "high — hang confirmed; protection mechanism in _patch_device() confirmed by code inspection", + "falsified_by": null, + "scope": "QNN GPU EP (QNN SDK limitation)", + "tracked_issue": "#868 (fast-fail enhancement)" + }, + + { + "id": "gpu-005", + "title": "gelu_fusion improves latency STABILITY (p90/std) on QNN GPU, not p50", + "observation": "Raw export (287 nodes, unfused Gelu): p50=17.4ms, p90=29.2ms, std=5.90. Autoconf (251 nodes, fused Gelu): p50=17.7ms, p90=19.7ms, std=0.97. p50 nearly identical, p90 -48%, std -6x.", + "mechanism_confirmed": true, + "mechanism_hypothesis": "5 separate GPU kernel dispatches (Mul→Div→Erf→Mul→Add) for unfused GELU create scheduling jitter. Single Gelu kernel eliminates dispatch overhead → dramatically lower tail latency.", + "action_for_autoconfig": "Always apply gelu_fusion for QNN GPU (stability benefit). Do not expect p50 improvement.", + "confidence": "high — mechanism is well-understood (GPU kernel dispatch overhead)", + "falsified_by": null, + "scope": "Any model with GELU activations on QNN GPU" + }, + + { + "id": "gpu-006", + "title": "opset 21 on QNN GPU not yet validated", + "observation": "opset 21 sweep only run on QNN NPU. QNN GPU behavior with opset 21 is unknown.", + "mechanism_confirmed": false, + "mechanism_hypothesis": "QNN GPU and QNN NPU use different kernel registries. opset 21 speedup on NPU does NOT imply the same on GPU.", + "action_for_autoconfig": "Do not assume opset 21 helps QNN GPU. Run a validation experiment before adding to search space.", + "confidence": "low — no data", + "falsified_by": null, + "scope": "UNKNOWN — needs experiment" + } + + ], + + "search_space_rules": { + "opset": { + "recommended_order": [17], + "rationale": "gpu-006: opset 21 not validated for GPU. 
Stay at 17 until tested.", + "dialectical_note": "⚠️ May change once opset 21 GPU experiment is run." + }, + "quantization": { + "recommended": "skip", + "skip": ["all — QDQ hangs on GPU EP (gpu-004)"], + "dialectical_note": "⚠️ This is a QNN SDK limitation, not winml. May change with future QNN SDK versions that support GPU quantization." + }, + "compile": { + "always_run": false, + "skip": true, + "dialectical_note": "⚠️ gpu-003: compile appears to regress QNN GPU. Observed in a single experiment only (low confidence, not yet replicated). Re-validate if winml compile behavior changes." + }, + "graph_passes": { + "recommended": "autoconf defaults only", + "skip": ["nhwc-transformer (gpu-002)", "all additional fusion passes (gpu-001)"], + "dialectical_note": "⚠️ Skip rules are ConvNext-specific. Transformer models may benefit from attention_fusion etc." + } + } +} diff --git a/research/autoconfig/ep_knowledge/qnn_npu.json b/research/autoconfig/ep_knowledge/qnn_npu.json new file mode 100644 index 000000000..8023db8e6 --- /dev/null +++ b/research/autoconfig/ep_knowledge/qnn_npu.json @@ -0,0 +1,323 @@ +{ + "_meta": { + "ep": "qnn", + "device": "npu", + "hardware": "Snapdragon X Elite CRD (Adreno X1-85 / Hexagon HTP)", + "ort_version": "1.24.5 (onnxruntime-windowsml; confirmed kMaxSupportedOpset >= 23)", + "qnn_sdk_version": "unknown — check QnnSystem.dll version", + "models_tested": [ + "facebook/convnext-tiny-224", + "microsoft/resnet-18", + "google/vit-base-patch16-224", + "apple/mobilevit-small", + "facebook/dinov2-small", + "hustvl/yolos-small", + "distilbert/distilbert-base-uncased-finetuned-sst-2-english", + "sentence-transformers/all-MiniLM-L6-v2", + "deepset/roberta-base-squad2", + "facebook/dinov2-base", + "microsoft/rad-dino", + "facebook/dino-vitb16", + "BAAI/bge-small-en-v1.5", + "rizvandwiki/gender-classification" + ], + "last_updated": "2026-06-17", + "epistemics_warning": "⚠️ All findings are hypotheses derived from limited models on 1 device (Snapdragon X Elite).
Confidence levels reflect how well the mechanism is understood, not how universally applicable the finding is. ALWAYS re-validate on new model architectures before using to prune search space." + }, + + "findings": [ + + { + "id": "npu-001", + "title": "opset 21 export gives +24-31% speedup on DINOv2 family models on QNN NPU — mechanism UNKNOWN, NOT a general ViT property", + "observation": "Catalog sweep 2026-06-13 + validation sweep 2026-06-16 (ORT 1.24.5, W8A16 quantized.onnx, 3×500-iter sessions): DINOv2-small +30.6% (opset17 7.18ms → opset21 4.98ms). DINOv2-base +24.1% (opset17 34.56ms → opset21 26.23ms). CRITICAL CONTROL: dino-vitb16 (plain DINO ViT-B/16) -0.7% — NEUTRAL. rad-dino (ViT-L medical) -0.1% — CPU-bound, no NPU effect. MobileViT-small +26.5% original data (DVFS spike caveat). ViT-base: -7.4%. BERT/RoBERTa/DistilBERT: neutral.", + "mechanism_confirmed": false, + "mechanism_invalidation": "Original hypothesis: kMaxSupportedOpset < 21 gate causes NHWC bypass on older ORT. INVALIDATED: sweep used onnxruntime-windowsml==1.24.5 where kMaxSupportedOpset >= 22. Both opset 17 and opset 21 go through the same NHWC layout transform path on this ORT version. The bypass mechanism does NOT apply. The observed speedup is real but the cause is unknown.", + "mechanism_status": "ORIGINAL_MECHANISM_INVALIDATED — must re-investigate", + "mechanism_source": "ORT source code investigation (2026-06-10) for ORT < 1.18. Sweep used onnxruntime-windowsml==1.24.5 where this mechanism no longer applies.", + "ort_version_critical_note": "The original mechanism (kMaxSupportedOpset gate in IsSupportedOpset()) requires kMaxSupportedOpset < 21. onnxruntime-windowsml==1.24.5 (ORT 1.24.x) has kMaxSupportedOpset >= 22, so BOTH opset17 and opset21 go through the NHWC layout transform. The bypass mechanism does NOT apply to the ORT version used in the sweep. 
The observed speedup for DINOv2 and MobileViT has an UNKNOWN root cause.", + "architecture_requirement": ["empirically: DINOv2 family (facebook/dinov2-*) consistently benefits. Plain ViT (dino-vitb16) does NOT. Hybrid Conv+attention (MobileViT) showed speedup in original data. Pure Conv (ResNet) insufficient data. NLP: neutral."], + "critical_caveats": [ + "MECHANISM UNKNOWN: Transpose count is IDENTICAL in opset17 and opset21 (both 49 nodes on dinov2-small). The original Transpose-elimination hypothesis is RULED OUT. The +48 Reshape nodes in opset21 are the most observable structural difference but why this speeds up QNN NPU is not understood.", + "RESNET-18 EXCLUDED: apparent +20% is statistical noise — 3 sessions span 4x range at sub-ms latency. Need 3 sessions × 2000 iters for reliable data at this scale.", + "DVFS NOISE: always use 3 sessions × 500+ iters with cool-down. Single-session CV is meaningless on QNN NPU.", + "SCOPE IS DINOV2-FAMILY NOT GENERAL VIT: dino-vitb16 (same ViT-B size as dinov2-base) shows -0.7% NEUTRAL. The speedup is DINOv2-architecture-specific." 
+ ], + "validated_models": { + "benefits_from_opset21": [ + "facebook/dinov2-small (+30.6%, original catalog sweep 2026-06-13, 3-session)", + "facebook/dinov2-base (+24.1%, validation sweep 2026-06-16, fresh quantized.onnx builds, 3-session h1=[34.56,34.67,33.15]ms h3=[33.00,26.22,26.23]ms)", + "apple/mobilevit-small (~20-26%, original catalog, note: opset17 has DVFS spike session)" + ], + "no_benefit_neutral": [ + "facebook/dino-vitb16 (-0.7%, validation sweep 2026-06-16, h1=[19.92,19.97,19.90]ms h3=[20.20,20.07,19.99]ms — NEUTRAL, critical control)", + "google/vit-base-patch16-224 (-7.4%, original catalog)", + "hustvl/yolos-small (timeout, no data)", + "rizvandwiki/gender-classification (+3.5% apparent, ranges overlap 13.89/13.92ms, NEUTRAL — plain ViT, CRITICAL: near-identical op counts to DINOv2-small (49 Transpose, 121 Reshape) yet NO benefit)", + "distilbert/distilbert-base-uncased-finetuned-sst-2-english (-0.1%, NLP neutral)", + "sentence-transformers/all-MiniLM-L6-v2 (-0.7%, NLP neutral)", + "deepset/roberta-base-squad2 (+0.1%, NLP neutral)" + ], + "marginal_inconclusive": [ + "BAAI/bge-small-en-v1.5 (+7.3%, h0=[10.52,10.32,11.01]ms h3=[10.25,9.33,9.94]ms — ranges barely non-overlapping but CV=0.3; NOT CONFIRMED. Needs 5+ sessions to differentiate from noise. 
Unusual for BERT architecture; all other NLP models tested at <1%)" + ], + "not_benchmarked_predicted_neutral": [ + "openai/clip-vit-base-patch32 — build failed at quantization (feature-extraction task calibration not supported); pure transformer, expected neutral based on all NLP data", + "cardiffnlp/twitter-roberta-base-sentiment-latest — not run; RoBERTa architecture, predicted neutral (consistent with roberta-base-squad2 +0.1%)", + "distilbert/distilbert-base-cased-distilled-squad — not run; DistilBERT architecture, predicted neutral (consistent with distilbert-base-uncased -0.1%)" + ], + "cpu_bound_cannot_test": [ + "microsoft/rad-dino (-0.1% on CPU EP, all hypotheses ~275ms CV<0.022 — model runs on CPU, opset irrelevant; QNN NPU BUILD_FAIL 2026-06-17, see npu-008)" + ], + "data_unreliable": ["resnet-18 — sub-ms latency, 3-session range spans 4x; no reliable signal (see data_reliability_notes)"] + }, + "original_mechanism_explanation": { + "root_cause_for_old_ort": "kMaxSupportedOpset gate in IsSupportedOpset() (onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc). On ORT where kMaxSupportedOpset < 21, opset 21 models bypass the NCHW→NHWC layout transformer entirely.", + "why_bypass_helped_convnext": "NHWC layout transform inserts Transpose(NCHW→NHWC) around Conv. For ConvNext, residual connections prevent Transpose cancellation → opset17 graph has MORE Transposes on HTP than opset21 graph.", + "why_cpu_is_opposite": "CPU relies on TransposeOptimizer to REMOVE existing Transposes. Skipping the optimizer (opset > kMaxSupportedOpset) leaves Transposes in place → CPU SLOWER. Same gate, opposite effect.", + "ort_kMaxSupportedOpset_by_version": { + "v1.14.x": 18, + "v1.16.x": 19, + "v1.17.x": 20, + "v1.18.x": 21, + "v1.24.x": ">= 23 (CONFIRMED: ORT 1.24.4 in C:\\tmp\\autoconfig-demo accepts opset 22 and 23 via InferenceSession with CPUExecutionProvider; opset 24 fails with 'No op registered for ...' 
not 'Unsupported opset')", + "main_HEAD": 26 + }, + "key_files": [ + "onnxruntime/core/optimizer/transpose_optimization/onnx_transpose_optimization.cc:2724-2746 — MakeOptimizerContext() gate", + "onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc — IsSupportedOpset()", + "onnxruntime/core/session/inference_session.cc:1589-1626 — transform_layout_fn=nullptr path" + ] + }, + "transpose_analysis_2026_06_16": { + "method": "onnx.load() on winml-built optimized.onnx and quantized.onnx for h0 (opset17) and h3 (opset21) from catalog_qnn_sweep facebook--dinov2-small. Op counts via collections.Counter on graph.node.", + "opset17_optimized": {"total_nodes": 391, "Transpose": 49, "Reshape": 121, "Gemm": 72, "Mul": 48, "Conv": 1}, + "opset21_optimized": {"total_nodes": 439, "Transpose": 49, "Reshape": 169, "Gemm": 72, "Mul": 48, "Conv": 1}, + "opset17_quantized": {"total_nodes": 1398, "Transpose": 49, "Reshape": 121, "DequantizeLinear": 615, "QuantizeLinear": 392}, + "opset21_quantized": {"total_nodes": 1542, "Transpose": 49, "Reshape": 169, "DequantizeLinear": 663, "QuantizeLinear": 440}, + "key_finding": "Transpose count is IDENTICAL (49 nodes) in both opset17 and opset21. The NHWC Transpose-reduction hypothesis is RULED OUT. opset21 has MORE Reshape nodes (+48), more QDQ pairs (+48 DQ, +48 Q), and more total nodes. 
Despite more nodes, opset21 runs 30% faster on QNN NPU — mechanism still unknown.", + "rules_out": ["NHWC Transpose elimination as speedup cause", "Fewer total ops as explanation"], + "consistent_with": ["Different graph structure at opset21 enabling better QNN NPU internal scheduling or graph partitioning, possibly via the +48 Reshape nodes acting as data-layout hints or memory access pattern changes"] + }, + "alternative_mechanism_hypotheses": [ + "QNN EP graph partitioner assigns ops differently when the model has opset21 Reshape semantics — the +48 Reshape nodes may segment the graph into better-aligned HTP subgraphs", + "Quantization calibration path differs between opset exports → quantized.onnx has different scale/zero-point distributions at opset21 → better QNN NPU numeric alignment", + "PyTorch ONNX exporter produces different intermediate tensor shapes at opset 21 → better memory access locality on QNN NPU HBM", + "The +48 Reshape ops in opset21 are 'free' no-ops on QNN NPU (identity reshape with same shape) that happen to trigger a faster QNN internal code path" + ], + "data_reliability_notes": { + "dinov2_small": { + "h1_opset17_sessions_ms": [7.176, 6.392, 9.436], + "h3_opset21_sessions_ms": [4.977, 4.876, 6.884], + "assessment": "RELIABLE. Ranges barely overlap only at extremes. h3 sessions 1+2 (4.97/4.88ms) are well below entire h1 range. Speedup is real.", + "tool": "catalog_qnn_sweep.py, optimized.onnx (v1 pipeline)" + }, + "dinov2_base_v3": { + "h1_opset17_sessions_ms": [34.556, 34.668, 33.148], + "h3_opset21_sessions_ms": [33.001, 26.224, 26.227], + "assessment": "RELIABLE. h1 sessions fully consistent (~34ms). h3 s0 slightly elevated (JIT warmup) but s1+s2 consistent at 26.2ms. 
Speedup +24.1% is well-separated from noise.", + "tool": "validation_sweep.py v3, quantized.onnx W8A16 (fresh builds for both hyps)" + }, + "dino_vitb16": { + "h1_opset17_sessions_ms": [19.924, 19.975, 19.897], + "h3_opset21_sessions_ms": [20.197, 20.071, 19.988], + "assessment": "RELIABLE CONTROL. Extremely stable. +0.7% regression (within noise). Opset21 has NO EFFECT on plain DINO ViT-B/16. Critical discriminant: npu-001 speedup is NOT a general ViT property.", + "tool": "validation_sweep.py, quantized.onnx W8A16 (fresh builds)" + }, + "mobilevit_small": { + "h1_opset17_sessions_ms": [10.557, 11.721, 27.436], + "h3_opset21_sessions_ms": [10.814, 8.625, 8.449], + "assessment": "PARTIALLY RELIABLE. h1 session 3 (27.4ms) is a DVFS spike — median inflated to 11.72ms vs true ~11ms. h3 sessions 2+3 (8.6/8.4ms) consistently faster. Actual speedup ~20-26% (not the reported 26.5%)." + }, + "resnet_18": { + "h1_opset17_sessions_ms": [0.990, 4.003, 2.716], + "h3_opset21_sessions_ms": [1.054, 2.175, 4.107], + "assessment": "UNRELIABLE. Sub-ms model. Session range spans 4x for same config. Reported '+20.2% speedup' (h1 median 2.72ms vs h3 median 2.18ms) is NOT a real signal — the two distributions fully overlap. REMOVED from benefits list." + }, + "gender_classification_vit": { + "h0_opset17_sessions_ms": [14.15, 14.94, 13.89], + "h3_opset21_sessions_ms": [13.70, 13.92, 13.87], + "assessment": "NEUTRAL. Ranges overlap slightly (h0 min=13.89ms < h3 max=13.92ms). +3.5% is within DVFS noise (CV ~0.35). CRITICAL: this ViT model has IDENTICAL op counts to DINOv2-small (49 Transpose, 121 Reshape, ~72 Gemm) yet shows NO benefit. Confirms npu-001 is not explainable by op-count or general ViT architecture.", + "tool": "run_one.py 2026-06-17, quantized.onnx W8A16" + }, + "bge_small_en": { + "h0_opset17_sessions_ms": [10.52, 10.32, 11.01], + "h3_opset21_sessions_ms": [10.25, 9.33, 9.94], + "assessment": "MARGINAL / INCONCLUSIVE.
Ranges barely not overlapping but CV ~0.3 means high within-session variance. +7.3% apparent gain — larger than all other NLP models (distilbert -0.1%, MiniLM -0.7%, RoBERTa +0.1%) but may be DVFS noise. Needs 5+ sessions to confirm. Do NOT cite as benefit.", + "tool": "run_one.py 2026-06-17, quantized.onnx W8A16, bert model-type" + } + }, + "action_for_autoconfig": "Include opset 21 in search for DINOv2-family models (facebook/dinov2-*). Likely worthwhile for MobileViT-class Conv+attention hybrids. Do NOT apply to plain ViT (dino-vitb16, gender-classification both neutral), YOLOS, or NLP (BERT-family all neutral at ±0.7%). CRITICAL: gender-classification ViT has IDENTICAL op counts to DINOv2-small (49 Transpose, 121 Reshape) but shows NO benefit — the effect is deeper than op counts. For ResNet-class Conv-only: insufficient data. ALWAYS dump optimized graph to compare Transpose counts if speedup is unexpected.", + "confidence": "medium-high on empirical observation (DINOv2-small +30.6% and DINOv2-base +24.1% both confirmed with clean 3-session protocol, fresh builds). Low on mechanism — original Transpose-bypass explanation ruled out (Transpose count identical opset17/21), kMaxSupportedOpset>=23 confirmed. Mechanism unknown. Scope: DINOv2 family only until mechanism is understood. 12 models now tested: 3 benefit, 7 neutral, 1 marginal/inconclusive (BGE-small +7.3% with high CV), 1 CPU-bound.", + "falsified_by": null, + "scope": "ORT 1.24.5 (onnxruntime-windowsml). DINOv2-small and DINOv2-base confirmed. MobileViT-small likely. Does NOT apply to plain ViT (dino-vitb16 and rizvandwiki/gender-classification both confirmed NEUTRAL despite identical op counts to DINOv2-small), YOLOS-small, BERT-family NLP, CPU-bound models (rad-dino). ResNet-18 data inconclusive. BGE-small-en +7.3% marginal, inconclusive.", + "tracked_issue": "#869", + "perf_gain_validation_gates": { + "gate1_statistical": "PASSED for DINOv2 (3-session, ranges separate). 
PARTIALLY for MobileViT (DVFS spike in h1). FAILED for ResNet-18.", + "gate2_mechanism": "FAILED — original kMaxSupportedOpset bypass mechanism does not apply to ORT 1.24.x. New mechanism uninvestigated.", + "gate3_thermal_control": "PARTIALLY — 3×500-iter with 30s cool-down is better than single-session but DVFS spikes still occur (MobileViT h1, DINOv2 h1 session 3)" + }, + "follow_up_required": [ + "DONE: kMaxSupportedOpset >= 23 confirmed for ORT 1.24.4 (accepts opset 22 and 23 at InferenceSession level)", + "DONE: Transpose analysis — opset17 vs opset21 DINOv2-small: IDENTICAL (49 Transpose both). Not the mechanism.", + "OPEN: Investigate QNN EP graph partitioning diff for opset17 vs opset21. Why do +48 Reshape nodes help?", + "Run 5+ sessions (not 3) on DINOv2 opset17 vs opset21 to reduce DVFS uncertainty", + "Test EfficientNet-B0, MobileNet-V3 to determine if benefit is 'Conv+residual' or 'Conv+attention hybrid' specific", + "For ResNet-18: run 3 sessions x 2000 iters to get reliable sub-ms measurements" + ], + "experiments_convnext_early": [ + {"opset": 17, "p50_ms": 54.2, "p90_ms": 104.5, "min_ms": 9.56, "std_ms": 44.1, "iters": 50, "note": "warm device, DVFS-dominated, NOT reliable"}, + {"opset": 19, "p50_ms": 12.1, "p90_ms": 77.7, "min_ms": 9.11, "std_ms": 60.0, "iters": 50, "note": "NOT reliable — 50 iters, DVFS"}, + {"opset": 21, "p50_ms": 12.2, "p90_ms": 38.0, "min_ms": 9.73, "std_ms": 10.1, "iters": 20, "note": "only 20 iters — NOT reliable"} + ] + }, + + { + "id": "npu-002", + "title": "W8A16 quantization provides ~1.9x speedup over FP32 on QNN NPU (ConvNext only — not yet generalized)", + "observation": "ConvNext FP32 baseline: p50=19.4ms. W8A16 quantized (minmax, 128 samples): p50=10.29ms. 1 model, 1 device.", + "mechanism_confirmed": true, + "mechanism_hypothesis": "QNN HTP has native INT8 weight / FP16 activation datapath. W8A16 maps directly to HTP's weight-compressed matmul kernels.", + "action_for_autoconfig": "Always quantize for QNN NPU. 
W8A16 is the starting point. Validate accuracy after quantization.", + "confidence": "medium — mechanism is well-understood (HTP architecture), but 1.9x magnitude is from 1 model only. Speedup will vary by architecture.", + "falsified_by": null, + "scope": "ConvNext only — single model validation. The catalog sweep used W8A16 for all 8 models but did not include FP32 baselines for those models, so the 1.9x figure cannot be generalized. Need FP32 baseline runs on at least 3 diverse models before claiming 'most vision models'.", + "do_not_generalize_to": "Models with unusual op types not supported by QNN W8A16 path. Magnitude claim (1.9x) is ConvNext-specific.", + "follow_up_required": ["Measure FP32 baseline for MobileViT, DINOv2, ResNet-18 to verify speedup generalizes"] + }, + + { + "id": "npu-003", + "title": "winml compile adds ~1.7x speedup on top of quantization for QNN NPU (ConvNext only — not yet generalized)", + "observation": "ConvNext W8A16 quantized: p50=10.29ms. W8A16 + compiled (EPContext): p50=6.01ms. 1 model, 1 device.", + "mechanism_confirmed": true, + "mechanism_hypothesis": "Compilation pre-builds the QNN binary graph (.bin) and eliminates JIT graph partitioning at session creation time. EPContext model loads the pre-built binary directly.", + "action_for_autoconfig": "Always run winml compile after finding best quantized config for QNN NPU.", + "confidence": "medium — mechanism is well-understood (EPContext documented by QNN SDK). 1.7x magnitude is ConvNext-specific. Simpler models may see less benefit; complex models may see more.", + "falsified_by": null, + "scope": "ConvNext only — single model validation. Mechanism generalizes; magnitude (1.7x) does not. 
The catalog sweep results.json baseline p50 values already include the effects of whatever auto-config winml chose (which may or may not include compile) — not directly comparable.", + "follow_up_required": ["Verify compile speedup on MobileViT and DINOv2"] + }, + + { + "id": "npu-004", + "title": "⚠️ ANECDOTE (NO DATA): W8A8 may cause accuracy collapse on models with LN+GELU — UNVALIDATED", + "observation": "W8A8 quantization was attempted on ConvNext. The experiment was aborted early — exact accuracy numbers were NOT recorded. The claim 'top-1 < 15%' is a recalled anecdote from the experimenter, not a measured result.", + "mechanism_confirmed": false, + "mechanism_hypothesis": "ConvNext uses LayerNormalization + GELU in every block. Quantizing both weights AND activations to INT8 in these ops introduces severe numerical error. However, this is a hypothesis — the aborted experiment does not confirm or refute it.", + "action_for_autoconfig": "Treat as anecdotal. Do NOT use this to skip W8A8 without running eval first. If W8A8 top-1 drops > 15 points vs W8A16 baseline on first attempt, then skip.", + "confidence": "very_low — anecdotal, no preserved data, experiment not reproducible as recorded", + "falsified_by": null, + "scope": "UNVALIDATED. May apply to models with LN+GELU blocks but this is unconfirmed.", + "do_not_generalize_to": "BERT/ResNet models where W8A8 is often fine", + "required_experiment": "Run W8A8 quantization on ConvNext-tiny-224, record exact top-1 accuracy (eval on ImageNet-1k, 1000 samples minimum). Compare to W8A16 baseline. If collapse observed, also run with calibration_method=percentile to see if calibration quality is the issue." + }, + + { + "id": "npu-005", + "title": "QNN Hub W8A16 model is slower on ORT QNN EP stack than ORT-quantized W8A16 — but comparison is not fair", + "observation": "QNN Hub W8A16 on winml ORT QNN EP: p50=14.82ms, std=8.8ms. 
ORT-quantized W8A16 (opset 17 QDQ): p50=6.01ms stable.", + "mechanism_confirmed": false, + "mechanism_hypothesis": "QNN Hub uses opset 21 QDQ format with uint16 input tensor — this format may be incompatible with ORT QNN EP's expected quantization format.", + "fairness_caveat": "⚠️ This is NOT a fair comparison. QNN Hub models are compiled for the qairt native stack (qualcomm AI runtime), not for ORT QNN EP. Running a qairt-compiled model through ORT QNN EP is an unsupported use case. The comparison only shows that you should use ORT-generated quantization when targeting ORT QNN EP — which is obvious.", + "action_for_autoconfig": "Use ORT-generated W8A16 quantization (winml build), NOT QNN Hub pre-quantized models, when targeting ORT QNN EP stack.", + "confidence": "low — the finding is trivially true (use the right tool for the right stack) but the experiment doesn't tell us anything useful about relative performance.", + "falsified_by": null, + "scope": "ORT QNN EP stack only. QNN Hub models on their native qairt stack are likely much faster — that comparison was never made." + }, + + { + "id": "npu-006", + "title": "Conv fusions (conv-bn/add/activation) cause catastrophic QNN NPU CPU fallback on Conv-dominant models", + "observation": "ResNet-18 with conv-bn-fusion+conv-add-fusion+conv-activation-fusion: 3-session p50s = [132.3, 134.97, 130.67]ms (CV=0.016, extremely stable) vs baseline [0.99, 4.00, 2.72]ms. ~130-135x regression. MobileViT with same fusions: [11.60, 11.36, 10.52]ms — neutral vs baseline [10.56, 11.72, 27.44]ms. BERT-family: neutral (no Conv ops to fuse). VALIDATION SWEEP 2026-06-16: dinov2-base h4=[26.06,25.92,25.87]ms vs h1=[34.56,34.67,33.15]ms → fusions actually -25% (FASTER, not regression). dino-vitb16 h4=[20.12,20.04,20.41]ms vs h1=[19.92,19.97,19.90]ms → +1.0% (neutral). 
Conv fusions are only hazardous for Conv-dominant models.", + "session_evidence_note": "The h4 sessions for ResNet-18 (132.3, 134.97, 130.67ms) show near-zero variance (CV=0.016) — in stark contrast to all other hypotheses. This is unusual for QNN NPU and strongly suggests deterministic CPU fallback (not DVFS noise). The regression is 50-136x even comparing best sessions.", + "mechanism_confirmed": false, + "mechanism_hypothesis": "ORT conv fusion pass (ConvAddActivationFusion, ConvBNFusion) produces fused op types (e.g., Conv+BN fused) that QNN EP cannot map to HTP kernels. These ops fall back to CPU execution, adding PCIe round-trip overhead per-op for a Conv-heavy graph like ResNet.", + "action_for_autoconfig": "⚠️ CRITICAL: Do NOT apply conv-bn-fusion / conv-add-fusion / conv-activation-fusion for QNN NPU on Conv-dominant models (ResNet, EfficientNet, MobileNet). These passes are beneficial for CPU EP but hazardous for QNN NPU. Always run accuracy + latency gate after applying any Conv fusion. If regression > 5x, disable all conv fusions immediately.", + "confidence": "high on regression observation (4900%); medium on mechanism (CPU fallback hypothesis not yet confirmed via EP partition dump)", + "falsified_by": null, + "scope": "Conv-dominant models (ResNet, EfficientNet, MobileNet). MobileViT safe (original data). DINOv2 and plain ViT: fusions are neutral or slightly beneficial (2026-06-16 validation). Not applicable to NLP.", + "severity": "critical — can produce 50x regression", + "follow_up_required": [ + "Dump QNN EP partition to confirm fused ops cause CPU fallback", + "Test EfficientNet and MobileNet to confirm generalization", + "Check if winml analyze linter can detect this pattern pre-build" + ] + }, + + { + "id": "npu-007", + "title": "DVFS thermal noise on QNN NPU makes CV-based stability gating unreliable — requires session-level averaging", + "observation": "Across all 8 catalog models, QNN NPU CV ranges 0.1–2.0+ even on warm device. 
Original CV<15% gate blocks most candidates. Differences < 10% are within noise floor.", + "mechanism_confirmed": true, + "mechanism_hypothesis": "Snapdragon X Elite HTP Hexagon core runs DVFS aggressively. Single-session CV is dominated by thermal state, not model performance. The only reliable signal comes from session-level averaging (3+ independent sessions with cool-down).", + "action_for_autoconfig": "DISABLE CV gate for QNN NPU. Replace with: (1) minimum 3 independent sessions × 500+ iters with 30s cool-down between sessions. (2) Use median p50 across sessions as the signal. (3) Only trust gains > 10% — anything below is within noise floor. (4) Do NOT compare within-session std to declare stability.", + "confidence": "high — consistent across 8 models in catalog sweep", + "falsified_by": null, + "scope": "General — applies to all models on QNN NPU / Snapdragon X Elite HTP", + "bench_protocol_update": { + "screen_phase": "SKIP CV gate; run 200 iters as warmup only", + "full_phase": "3 sessions × 500 iters, 30s cool-down between sessions", + "signal": "median p50 across sessions", + "noise_floor": ">10% gain required to declare improvement" + } + } + + , + + { + "id": "npu-008", + "title": "microsoft/rad-dino fails to build on QNN NPU across all opset variants (winml crash rc=0xC0000005)", + "observation": "catalog_qnn_sweep run 2026-06-17: all 6 hypotheses for microsoft/rad-dino (opset 17/19/21, with/without conv fusions) returned rc=3221225794 (recorded as 0xC0000005, access violation — NOTE: decimal 3221225794 is 0xC0000142 / STATUS_DLL_INIT_FAILED, whereas 0xC0000005 is 3221225477; re-check the raw exit code before relying on either value) in <2s. No stderr captured — winml process crashed before producing any output. This is distinct from a build error: it is a hard crash of the winml CLI itself.", + "mechanism_confirmed": false, + "mechanism_hypothesis": "rad-dino is a ViT encoder with a non-standard DINOv2 variant (larger heads, custom CLS token handling).
Likely contains one or more ONNX operators or graph shapes that trigger an unguarded null-dereference or out-of-bounds access in the QNN EP quantization or compilation path (winml build calls QNN SDK compilation under the hood). Could also be a model size / dynamic axis issue.", + "action_for_autoconfig": "Skip QNN NPU for microsoft/rad-dino. If QNN NPU is required, file a bug with the crash dump and test with winml analyze first to identify unsupported ops before attempting build.", + "confidence": "high on observation (reproducible across all 6 hypotheses in same run); low on mechanism (no stack trace available)", + "falsified_by": null, + "scope": "microsoft/rad-dino only (confirmed). DINOv2-family models in general (facebook/dinov2-small, facebook/dinov2-base) are NOT affected — they build and run on QNN NPU successfully.", + "severity": "blocker — model is incompatible with QNN NPU build", + "follow_up_required": [ + "Run winml analyze --ep qnn on rad-dino ONNX to check unsupported ops", + "Capture crash dump (ProcDump) to get stack trace", + "Compare ONNX graph structure of rad-dino vs facebook/dinov2-small to isolate differentiating ops" + ], + "date_observed": "2026-06-17" + } + + ], + + "search_space_rules": { + "opset": { + "recommended_order_conv_residual": [21, 17], + "recommended_order_pure_attention": [17], + "recommended_order_nlp": [17], + "recommended_order_pure_conv": [17, "21 only if time allows — insufficient data"], + "architecture_gate": "DINOv2 family (facebook/dinov2-*) → try opset 21 first (+24-31% confirmed). MobileViT-class Conv+attention hybrid → try opset 21 (+26% original data). Plain ViT (dino-vitb16-class) → opset 17 only (NEUTRAL confirmed 2026-06-16). YOLOS → opset 17 only. NLP (BERT-family) → opset 17 only. Pure Conv (ResNet) → opset 17 (data insufficient for opset21 recommendation).", + "rationale": "npu-001 validated 2026-06-13 and 2026-06-16: DINOv2-small +30.6%, DINOv2-base +24.1% (fresh builds, clean protocol). 
Critical control: dino-vitb16 -0.7% NEUTRAL. This proves the speedup is DINOv2-architecture-specific, not a general ViT property.", + "dialectical_note": "⚠️ The original mechanism explanation (kMaxSupportedOpset bypass) does NOT apply to ORT 1.24.x (onnxruntime-windowsml 1.24.5). The speedup for DINOv2/MobileViT is empirically real but mechanistically unexplained. Always validate on the actual ORT version being shipped." + }, + "quantization": { + "recommended": "w8a16", + "skip": ["w8a8 if initial top1 < 15%"], + "dialectical_note": "⚠️ W8A8 skip rule is ConvNext-specific (LN+GELU sensitivity). Try W8A8 for models without LN in every block." + }, + "compile": { + "always_run": true, + "dialectical_note": "⚠️ Compile benefit is well-understood (EPContext pre-built binary). Low risk of being wrong, but verify compile output loads correctly." + }, + "graph_passes": { + "recommended": "autoconf defaults (gelu_fusion, matmul_add_fusion)", + "NEVER_apply_for_qnn_npu": ["conv-bn-fusion", "conv-add-fusion", "conv-activation-fusion"], + "hazard_note": "npu-006 CRITICAL: Conv fusions cause 4900% regression on ResNet-18. Do NOT apply conv fusions to Conv-dominant models on QNN NPU.", + "dialectical_note": "⚠️ Conv fusion ban is confirmed for ResNet. MobileViT was safe. Always run latency gate after applying any fusion to catch regressions." + }, + "bench_protocol": { + "cv_gate": "DISABLED for QNN NPU (npu-007)", + "sessions": 3, + "iters_per_session": 500, + "cool_down_s": 30, + "noise_floor_pct": 10, + "signal": "median p50 across sessions" + } + } +} diff --git a/research/autoconfig/gen_report_v3.py b/research/autoconfig/gen_report_v3.py new file mode 100644 index 000000000..806bdddc0 --- /dev/null +++ b/research/autoconfig/gen_report_v3.py @@ -0,0 +1,338 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +import datetime +import json + + +results = json.load(open(r"ablation-search\results.json")) + +clean_base = [r for r in results if r["name"] in ["base_0", "base_1"]] +clean_runs = [v for r in clean_base for v in r["p50_runs"]] +clean_mean = round(sum(clean_runs) / len(clean_runs), 1) + + +def verdict(name, mean): + if name in ["base_0", "base_1", "base_2", "base_mid", "base_end"]: + return "outlier run" if name == "base_2" else "baseline" + if name == "matmul_add": + return "CONFIRMED REGRESSION" + if name == "matmul_scale": + return "probable mild regression" + if name.startswith("opset_"): + opset = int(name.split("_")[1]) + if opset >= 19: + return "SEVERE REGRESSION (kMaxSupportedOpset bug)" + return "neutral" + delta = mean - clean_mean + if abs(delta) < 5: + return "neutral" + if delta > 5: + return "mild regression" + return "possible improvement" + + +def row_class(name): + if name in ["base_0", "base_1", "base_mid", "base_end"]: + return "row-base" + if name == "base_2": + return "row-outlier" + if name == "matmul_add": + return "row-bad" + if name.startswith("opset_") and int(name.split("_")[1]) >= 19: + return "row-bad" + if name in ["matmul_scale"]: + return "row-warn" + return "row-neutral" + + +rows_html = "" +for r in results: + runs = r["p50_runs"] + delta = r["p50_mean"] - clean_mean + v = verdict(r["name"], r["p50_mean"]) + rc = row_class(r["name"]) + runs_str = " / ".join("%.1f" % x for x in runs) + sign = "+" if delta >= 0 else "" + rows_html += ( + '%s%.1f%s%.1f' + "%.1f%.1f%s%s\n" + % (rc, r["name"], r["p50_mean"], sign, delta, min(runs), max(runs), runs_str, v) + ) + +bar_labels = [ + r["name"] + for r in results + if r["name"] not in ["base_0", "base_1", "base_2", "base_mid", "base_end"] +] +bar_values = [ + round(r["p50_mean"], 1) + for r in results + if r["name"] not in ["base_0", "base_1", "base_2", "base_mid", "base_end"] +] +bar_colors = [] +for r in 
results: + if r["name"] in ["base_0", "base_1", "base_2", "base_mid", "base_end"]: + continue + if r["name"] == "matmul_add" or ( + r["name"].startswith("opset_") and int(r["name"].split("_")[1]) >= 19 + ): + bar_colors.append("'#dc3545'") + elif r["name"] in ["matmul_scale"]: + bar_colors.append("'#fd7e14'") + elif abs(r["p50_mean"] - clean_mean) < 5: + bar_colors.append("'#198754'") + else: + bar_colors.append("'#ffc107'") + +bar_labels_js = json.dumps(bar_labels) +bar_values_js = json.dumps(bar_values) +bar_colors_js = ",".join(bar_colors) +n_bars = len(bar_labels) +baseline_line = clean_mean +now_str = datetime.datetime.now().strftime("%Y-%m-%d") +n_results = len(results) + +html = """ + + + +ConvNext CPU Ablation Report + + + + +
+

📊 ConvNext CPU Ablation — Autoconfig POC + Opset Cliff RCA

+

Model: facebook/convnext-tiny-224  |  EP: CPU  |  DATE_PLACEHOLDER  |  N_RESULTS_PLACEHOLDER experiments  |  ORT ORTVER_PLACEHOLDER

+ + + +
+
Clean Baseline p50
CLEAN_MEAN_PLACEHOLDERms
base_0 + base_1, opset=17
+
Best Config Found
Baseline
opset=17, no extra flags
+
Worst Finding
+38ms
matmul-add-fusion
+
Root Cause Found
kMaxSupportedOpset
Transpose Optimizer gate
+
+ + +

🔍 Root Cause Analysis: ORT Opset Performance Cliff

+ +
+❌ ROOT CAUSE IDENTIFIED: ORT kMaxSupportedOpset gates the entire Transpose Optimizer

+In onnxruntime/core/optimizer/transpose_optimization/optimizer_api.h: +
constexpr int64_t kMaxSupportedOpset = 18;  // in ORT v1.14.x
+// Current ORT (v1.24.5) kMaxSupportedOpset = 21 or 22
+
+// In onnx_transpose_optimization.cc:
+if (*opset > kMaxSupportedOpset) {
+    return std::nullopt;  // ← ENTIRE Transpose Optimizer skipped silently
+}
+ConvNext has 42 Transpose nodes forming a NCHW↔NHWC "transpose sandwich" in every block. +The Transpose Optimizer normally eliminates/merges these (pushing through Add×18, Mul×18, canceling adjacent inverses). +When it is bypassed, all 42 Transpose nodes execute as raw memory-layout copy operations → systemic slowdown. +
+ +

📊 ORT Optimization Level Experiment (confirms root cause)

+ + + + + + +
Session Optimization Levelopset=17opset=19RatioExplanation
DISABLE_ALL47.5ms355ms7.5×No Transpose Optimizer → all 42 Transposes execute. v17 model.onnx has pre-fused ops; v19 export has more raw ops.
ENABLE_BASIC289ms315ms1.1×Basic opts run on already-fused model, some interference. Near-parity: Transpose Optimizer not yet active at this level.
ENABLE_EXTENDED209ms241ms1.2×Extended optimizations help both but some overhead from re-optimizing pre-fused model.
ENABLE_ALL (default)216ms215ms1.0×Transpose Optimizer runs on both. Full parity achieved — confirms optimizer gap is the entire cause.
+ + + +

📋 kMaxSupportedOpset Version History (verified from ORT git tags)

+ + + + + + + +
ORT ReleasekMaxSupportedOpsetEffect
v1.14.x18opset ≥ 19 → Transpose Optimizer DISABLED
v1.16.x19opset ≥ 20 → disabled
v1.17.x20opset ≥ 21 → disabled
v1.18.x21opset ≥ 22 → disabled
main/HEAD26Fully covered for all current ONNX opsets
+ +

📜 ORT Source (exact call chain)

+
InferenceSession::Initialize()
+  → graph_transformer_mgr_.ApplyTransformers(graph, Level1)
+      → TransposeOptimizer::ApplyImpl()           [transpose_optimizer.cc:18]
+          → onnx_transpose_optimization::Optimize() [onnx_transpose_optimization.cc:3344]
+              → MakeOptimizerContext(graph, ...)
+                  → graph.Opset("ai.onnx")         // reads DomainToVersionMap()
+                  → if opset > kMaxSupportedOpset: return nullopt  // ← THE GATE
+              → if ctx == nullopt: return early    // no optimization performed
+ +

Why ConvNext is especially sensitive

+

The Transpose Optimizer can push Transposes through Add, Mul, and simple unary ops. ConvNext has 18×(Add + Mul) layer-scale and residual connections between blocks, meaning a single Transpose can cascade through many nodes. With the optimizer enabled, adjacent inverse pairs cancel; without it, every NCHW↔NHWC conversion is a full memory copy of the activation tensor.

+ + +

💡 Ablation Key Findings

+ +
+❌ CONFIRMED REGRESSION: matmul-add-fusion +38ms
+All 3 independent runs: 63.0 / 70.8 / 111.2ms vs clean baseline ~43.7ms. +The minimum observed (63ms) is 20ms above the highest clean-baseline run. Not attributable to noise. +Hypothesis: baseline already converts MatMul+Add→Gemm (37 Gemm in model.onnx); applying matmul-add-fusion creates redundant or conflicting dispatch. Unconfirmed — requires op-level profiling. +
+ +
+📝 MEASUREMENT CORRECTION: transpose-optimizer is NEUTRAL on inference latency
+Earlier 8-iteration search using winml eval reported +270ms. That measurement included HF preprocessing pipeline and had no warmup — it measured application latency, not model inference. +With winml perf (warmup=10, iter=50): 42.3 / 52.3 / 41.8ms — indistinguishable from baseline. +The +270ms was entirely a measurement artifact. Do not cite in user-facing reports. +
+ +
+❌ CONFIRMED: opset=19–22 causes 1.9–3.9× regression on this ORT build
+Mechanism confirmed: kMaxSupportedOpset gate in ORT's Transpose Optimizer. All 3 runs per opset are consistent. +Fix: use opset≤17 (current winml-cli default) OR upgrade ORT to a version where kMaxSupportedOpset ≥ 22 (main branch). +
+ +
+✅ NEUTRAL: nchwc-transformer, transpose-optimizer, opset=18 — all within noise of baseline (~43.7ms). +
+ +
+⚠ PROBABLE MILD REGRESSION: matmul-scale-fusion — all 3 runs elevated (51.5 / 58.1 / 61.2ms). Weak signal due to baseline drift during experiment. +
+ +

📊 Per-Config p50 Latency vs Baseline

+
+ +

📋 Full Results Table

+ + + +ROWS_PLACEHOLDER +
Configp50 mean (ms)Δ vs baselineminmaxRuns (ms)Verdict
+ +

🔧 Optimal Config

+
# Optimal config: baseline (opset=17, constant_folding=True, no extra flags)
+winml build --model-id facebook/convnext-tiny-224 -o out_cpu/
+winml perf -m out_cpu/model.onnx --ep cpu --warmup 10 --iterations 50
+# Expected: p50 ~43-44ms
+
+# AVOID:
+#   --optimize matmul-add-fusion     (confirmed +38ms regression)
+#   opset_version: 19-22             (kMaxSupportedOpset bug: 3-4x regression on affected ORT builds)
+ +

🧠 Open Questions

+
    +
  • Exact ORT version boundary: winml-cli ships ORT 1.24.5 (internal versioning). The exact kMaxSupportedOpset value in that build determines whether opset 19-22 is safe. Needs verification against ORT source at that specific commit.
  • +
  • Why does matmul-add-fusion regress? 37 Gemm nodes already exist; applying this fusion may create double-fusion or suboptimal kernel selection. Requires --profile to confirm.
  • +
  • GELU fusion mystery: baseline model.onnx has com.microsoft/Gelu×18 despite GeluFusion being in disabled_optimizers. Source unclear — likely HF Optimum pre-fuses GELU before ORT.
  • +
+ +
+ + +""" + +import subprocess + + +result = subprocess.run( + ["python", "-c", "import onnxruntime as ort; print(ort.__version__)"], + capture_output=True, + encoding="utf-8", + cwd=r"C:\tmp\autoconfig-demo", + env={ + **__import__("os").environ, + "PATH": r"C:\tmp\autoconfig-demo\.venv\Scripts;" + __import__("os").environ.get("PATH", ""), + }, +) +ort_ver = result.stdout.strip() or "1.24.5" + +html = html.replace("DATE_PLACEHOLDER", now_str) +html = html.replace("N_RESULTS_PLACEHOLDER", str(n_results)) +html = html.replace("ORTVER_PLACEHOLDER", ort_ver) +html = html.replace("CLEAN_MEAN_PLACEHOLDER", str(clean_mean)) +html = html.replace("ROWS_PLACEHOLDER", rows_html) +html = html.replace("BAR_LABELS_JS", bar_labels_js) +html = html.replace("BAR_VALUES_JS", bar_values_js) +html = html.replace("BAR_COLORS_JS", bar_colors_js) +html = html.replace("N_BARS_PLACEHOLDER", str(n_bars)) +html = html.replace("BASELINE_LINE_PLACEHOLDER", str(baseline_line)) + +with open(r"report.html", "w", encoding="utf-8") as f: + f.write(html) +print("report.html written: %d bytes, %d experiments" % (len(html), n_results)) diff --git a/research/autoconfig/report_gen.py b/research/autoconfig/report_gen.py new file mode 100644 index 000000000..0a4769bc5 --- /dev/null +++ b/research/autoconfig/report_gen.py @@ -0,0 +1,280 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +"""report_gen.py — Phase 3 HTML report generator for autoconfig. 
+ +Reads results.tsv and generates report.html with: + - Summary bar chart (p50 per hypothesis, colour-coded by status) + - Experiment table (config / delta_pct / status / CV) + - Champion config box +""" + +from __future__ import annotations + +import csv +import html as html_lib +from datetime import datetime +from pathlib import Path + + +# ── helpers ─────────────────────────────────────────────────────────────────── + + +def _load_tsv(results_tsv: Path) -> list[dict]: + if not results_tsv.exists(): + return [] + with results_tsv.open(encoding="utf-8") as f: + return list(csv.DictReader(f, delimiter="\t")) + + +def _status_color(status: str) -> str: + s = status.lower() + if "new best" in s or (s.startswith("keep") and "marginal" not in s): + return "#2e7d32" # dark green + if "marginal" in s: + return "#f57f17" # amber + if "discard" in s: + return "#b0bec5" # grey + if "crash" in s or "fail" in s: + return "#c62828" # red + return "#78909c" + + +def _status_bg(status: str) -> str: + s = status.lower() + if "new best" in s or (s.startswith("keep") and "marginal" not in s): + return "#e8f5e9" + if "marginal" in s: + return "#fff8e1" + if "crash" in s or "fail" in s: + return "#ffebee" + return "#f5f5f5" + + +def _p50_float(val: str | None) -> float | None: + if not val or val == "N/A" or "UNSTABLE" in str(val): + return None + try: + return float(str(val).replace("ms", "").strip()) + except ValueError: + return None + + +# ── bar chart ───────────────────────────────────────────────────────────────── + + +def _bar_chart_html(rows: list[dict], baseline_p50: float | None) -> str: + valid = [(r, _p50_float(r.get("median_p50_ms") or r.get("screen_p50_ms"))) for r in rows] + valid = [(r, v) for r, v in valid if v is not None] + if not valid: + return "

No benchmark data yet.

" + + max_val = max(v for _, v in valid) * 1.1 + bars = [] + for r, p50 in valid: + label = html_lib.escape(r.get("label", "?")) + status = r.get("status", "") + color = _status_color(status) + width_pct = p50 / max_val * 100 + delta = r.get("delta_pct", "") + baseline_marker = "" + if baseline_p50: + bx = baseline_p50 / max_val * 100 + baseline_marker = ( + f'
' + ) + bars.append(f""" +
+ {baseline_marker} +
{label}
+
+
+
+
+
{p50:.1f}ms + {html_lib.escape(delta)} +
+
+
""") + + return ( + '
\n' + '
' + "— baseline (blue line)
\n" + "".join(bars) + "\n
" + ) + + +# ── experiment table ────────────────────────────────────────────────────────── + + +def _table_html(rows: list[dict]) -> str: + cols = [ + "iter", + "label", + "dimension", + "optim_flags", + "opset", + "screen_p50_ms", + "median_p50_ms", + "delta_pct", + "cv", + "status", + ] + hdrs = "".join( + f'{c.replace("_", " ")}' + for c in cols + ) + trs = [] + for r in rows: + status = r.get("status", "") + bg = _status_bg(status) + color = _status_color(status) + cells = [] + for c in cols: + val = html_lib.escape(str(r.get(c, ""))) + if c == "status": + cells.append( + f'{val}' + ) + else: + cells.append(f'{val}') + trs.append( + f'' + "".join(cells) + "" + ) + return ( + '' + f"{hdrs}" + f"{''.join(trs)}" + "
" + ) + + +# ── champion box ───────────────────────────────────────────────────────────── + + +def _champion_html(rows: list[dict], model_id: str, ep: str) -> str: + keeps = [r for r in rows if r.get("status", "").lower().startswith("keep")] + if not keeps: + return ( + '
' + "No KEEP verdict yet — search in progress.
" + ) + best = min(keeps, key=lambda r: _p50_float(r.get("median_p50_ms")) or 999) + flags = html_lib.escape(best.get("optim_flags", "(none)")) + opset = html_lib.escape(str(best.get("opset", 17))) + p50 = html_lib.escape(best.get("median_p50_ms", "N/A")) + delta = html_lib.escape(best.get("delta_pct", "N/A")) + label = html_lib.escape(best.get("label", "?")) + return f""" +
+
+ Champion Config
+ + + + + + + + + + + + + +
Model{html_lib.escape(model_id)}
EP{html_lib.escape(ep.upper())}
Hypothesis{label}
Optim flags{flags}
Opset{opset}
Median p50{p50} ms + ({delta})
+
""" + + +# ── main entry ──────────────────────────────────────────────────────────────── + + +def generate_report( + results_tsv: Path, + work_dir: Path, + model_id: str, + ep: str, + insight_notes: list[str] | None = None, +) -> Path: + """Generate report.html inside work_dir. Returns the output path.""" + rows = _load_tsv(results_tsv) + out_path = work_dir / "report.html" + + # Find baseline p50 from h0 row + baseline_p50: float | None = None + for r in rows: + if r.get("iter") == "0" or "baseline" in r.get("label", "").lower(): + baseline_p50 = _p50_float(r.get("median_p50_ms")) + if baseline_p50: + break + + chart = _bar_chart_html(rows, baseline_p50) + table = _table_html(rows) + champion = _champion_html(rows, model_id, ep) + ts = datetime.now().strftime("%Y-%m-%d %H:%M") + n_done = len(rows) + n_keep = sum(1 for r in rows if r.get("status", "").lower().startswith("keep")) + + insight_section = "" + if insight_notes: + items = "".join(f"
  • {html_lib.escape(n)}
  • " for n in insight_notes) + insight_section = f""" +

    Phase 1 Insight Engine

    +
      {items}
    """ + + html = f""" + + + +autoconfig report — {html_lib.escape(model_id)} ({ep.upper()}) + + + + +

    autoconfig — {html_lib.escape(model_id)}

    +
    EP: {html_lib.escape(ep.upper())}  ·  + {n_done} experiments  ·  {n_keep} KEEP  ·  + Generated: {ts}
    + +
    + {champion} +
    + +
    +

    Benchmark Chart (median p50)

    + {chart} +
    + +{f'
    {insight_section}
    ' if insight_section else ""} + +
    +

    All Experiments

    + {table} +
    + + +""" + + out_path.write_text(html, encoding="utf-8") + print(f" Report written: {out_path}") + return out_path diff --git a/research/autoconfig/validation_sweep.py b/research/autoconfig/validation_sweep.py new file mode 100644 index 000000000..1ec68f752 --- /dev/null +++ b/research/autoconfig/validation_sweep.py @@ -0,0 +1,464 @@ +#!/usr/bin/env python3 +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +""" +validation_sweep.py — Focused validation sweep for npu-001 and npu-006. + +Tests: + npu-001: opset17 vs opset21 speedup on Conv+attention hybrid vs pure ViT + npu-006: conv fusions regression — confirm MobileViT/DINOv2 are unaffected + +Hypotheses (subset of catalog_qnn_sweep.py): + h0: baseline (auto-config, W8A16) + h1: opset 17 explicit + h3: opset 21 ← npu-001 test + h4: opset 17 + conv fusions ← npu-006 test + +Models: + facebook/dinov2-base → expect npu-001 speedup (larger DINOv2) + microsoft/rad-dino → expect npu-001 speedup (DINOv2 variant) + facebook/dino-vitb16 → expect NEUTRAL (pure DINO ViT, no Conv+residual) + Intel/dpt-hybrid-midas → expect npu-001 speedup; npu-006 regression (ResNet backbone) + +Output: research/autoconfig/catalog-qnn-sweep//results_v2.json +""" + +import argparse +import copy +import json +import subprocess +import sys +import time +from datetime import datetime +from pathlib import Path + +sys.stdout.reconfigure(encoding="utf-8", errors="replace") # type: ignore[attr-defined] + +BASE_DIR = Path(__file__).parent +REPO_ROOT = BASE_DIR.parent.parent # research/autoconfig/ → research/ → repo root +WINML = str(REPO_ROOT / ".venv" / "Scripts" / "winml.exe") +EP = "qnn" +DEVICE = "npu" +RESULTS_DIR = BASE_DIR / "catalog-qnn-sweep" + +SCREEN_WARMUP = 20 +SCREEN_ITERS = 200 + +FULL_WARMUP = 50 +FULL_ITERS = 500 +FULL_SESSIONS = 3 
COOL_DOWN_S = 30  # idle time between consecutive full-bench sessions

# Wall-clock budget per model; checked before each hypothesis starts
# (rad-dino/large models: 450s per bench session × 3 × 3).
MODEL_TIMEOUT_S = 120 * 60
BUILD_TIMEOUT_S = 15 * 60
BENCH_TIMEOUT_S = 15 * 60
EVAL_TIMEOUT_S = 6 * 60

# Screen-phase coefficient of variation (std / p50) below which a run is
# labelled stable.
CV_STABLE_MAX = 0.15

# Focused hypothesis matrix: (id, label, opset_override, extra_optim_flags)
HYPOTHESES = [
    ("h0", "baseline (auto-config, W8A16)", None, None),
    ("h1", "opset 17 explicit", 17, None),
    ("h3", "opset 21 (tests npu-001)", 21, None),
    (
        "h4",
        "opset 17 + conv fusions",
        17,
        {
            "conv_bn_fusion": True,
            "conv_add_fusion": True,
            "conv_activation_fusion": True,
        },
    ),
]

# (model_id, task, model_type, run_h4_fusion_test)
VALIDATION_MODELS = [
    ("facebook/dinov2-base", "image-feature-extraction", "dinov2", True),
    ("microsoft/rad-dino", "image-feature-extraction", "dinov2", False),
    ("facebook/dino-vitb16", "image-feature-extraction", "vit", True),
    ("Intel/dpt-hybrid-midas", "depth-estimation", "dpt", True),
]


def run_cmd(cmd, label="", timeout=600):
    """Run *cmd* as a subprocess and report its outcome on stdout.

    Args:
        cmd: Argument list passed to ``subprocess.run`` (no shell).
        label: Text printed for the step; falls back to ``cmd[1]`` when empty.
        timeout: Seconds before the subprocess is abandoned.

    Returns:
        ``(returncode, stdout + stderr, elapsed_seconds)``. On timeout the
        tuple is ``(-999, "TIMEOUT after {timeout}s", elapsed_seconds)``.
    """
    t0 = time.time()
    print(f" >> {label or cmd[1]}", flush=True)
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        elapsed = time.time() - t0
        print(f" TIMEOUT after {elapsed:.0f}s", flush=True)
        return -999, f"TIMEOUT after {timeout}s", elapsed

    elapsed = time.time() - t0
    tag = "ok" if result.returncode == 0 else f"rc={result.returncode}"
    print(f" {elapsed:.0f}s [{tag}]", flush=True)
    if result.returncode != 0:
        # Show only the tail of the output; fall back to stdout when stderr is empty.
        print(f" stderr: {(result.stderr or result.stdout or '')[-400:]}", flush=True)
    return result.returncode, result.stdout + result.stderr, elapsed


def get_base_config(model_id, task, model_type):
    """Generate the base build config for a model via ``winml config``.

    The config is written to a temporary JSON file under ``RESULTS_DIR``,
    loaded, and the file removed. A first attempt passes ``--model-type``;
    if that yields nothing the command is retried without it.

    Returns:
        The parsed config dict, or ``None`` if both attempts failed.
    """
    tmp = RESULTS_DIR / "_tmp_val_cfg.json"
    tmp.parent.mkdir(parents=True, exist_ok=True)

    def _try(extra):
        cmd = [
            WINML,
            "config",
            "-m",
            model_id,
            "-t",
            task,
            "--device",
            DEVICE,
            "--ep",
            EP,
            "--no-compile",
            "-o",
            str(tmp),
        ] + extra
        rc, _, _ = run_cmd(cmd, "winml config", 600)
        cfg = None
        if rc == 0 and tmp.exists():
            try:
                cfg = json.loads(tmp.read_text(encoding="utf-8"))
            except (OSError, ValueError) as exc:
                # Unreadable or malformed output counts as a failed attempt,
                # but say why instead of failing silently.
                print(f" [warn] could not read generated config: {exc}", flush=True)
        tmp.unlink(missing_ok=True)
        return cfg

    cfg = _try(["--model-type", model_type])
    if cfg is None:
        print(" [warn] retrying without --model-type", flush=True)
        cfg = _try([])
    return cfg


def make_hyp_config(base, opset_override, extra_optim):
    """Return a deep copy of *base* with one hypothesis' overrides applied.

    Args:
        base: Base config dict; never mutated.
        opset_override: Opset written to ``cfg["export"]["opset_version"]``;
            skipped when ``None`` or when the config has no (non-empty)
            ``export`` section.
        extra_optim: Flags merged over ``cfg["optim"]``; ``None`` leaves the
            section untouched.
    """
    cfg = copy.deepcopy(base)
    if opset_override is not None and cfg.get("export"):
        cfg["export"]["opset_version"] = opset_override
    if extra_optim is not None:
        cfg["optim"] = {**(cfg.get("optim") or {}), **extra_optim}
    return cfg


def run_build(model_id, cfg_path, out_dir):
    """Run ``winml build`` for one hypothesis.

    Returns:
        ``(ok, output)`` where ``ok`` is True when the command exited with 0
        and ``output`` is the combined stdout/stderr.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    cmd = [
        WINML,
        "build",
        "-c",
        str(cfg_path),
        "-m",
        model_id,
        "-o",
        str(out_dir),
        "--ep",
        EP,
        "--device",
        DEVICE,
        "--no-compile",
        "--rebuild",
    ]
    rc, out, _ = run_cmd(cmd, f"winml build [{out_dir.name}]", BUILD_TIMEOUT_S)
    return rc == 0, out


def _perf_cmd(model_path, warmup, iterations, out_json):
    """Build the ``winml perf`` argument list shared by both bench phases."""
    return [
        WINML,
        "perf",
        "-m",
        str(model_path),
        "--ep",
        EP,
        "--device",
        DEVICE,
        "--warmup",
        str(warmup),
        "--iterations",
        str(iterations),
        "-o",
        str(out_json),
    ]


def bench_screen(model_path):
    """Run the short screening benchmark next to *model_path*.

    Returns:
        ``(p50, cv, stable)`` where ``cv`` is ``std / p50`` and ``stable`` is
        ``cv < CV_STABLE_MAX``. Any failure (non-zero exit, missing or
        unusable output file, missing p50) yields ``(None, 999.0, False)``.
    """
    failed = (None, 999.0, False)
    out_json = model_path.parent / "val_screen.json"
    rc, _, _ = run_cmd(
        _perf_cmd(model_path, SCREEN_WARMUP, SCREEN_ITERS, out_json),
        f"perf screen ({SCREEN_ITERS} iters)",
        BENCH_TIMEOUT_S,
    )
    if rc != 0 or not out_json.exists():
        return failed
    try:
        d = json.loads(out_json.read_text(encoding="utf-8"))
        lat = d.get("latency_ms", {})
        p50 = lat.get("p50") if isinstance(lat, dict) else None
        std = lat.get("std", 0) if isinstance(lat, dict) else 0
        if not p50:
            return failed
        cv = std / p50
        return p50, cv, cv < CV_STABLE_MAX
    except Exception as exc:  # best-effort parse: report and treat as a failed screen
        print(f" [warn] could not parse {out_json.name}: {exc}", flush=True)
        return failed


def session_median(p50s):
    """Return the median of the per-session p50 latencies.

    For an even number of sessions this is the mean of the two middle values,
    so a sweep in which one of the sessions failed is not biased towards the
    slower of the remaining runs.
    """
    import statistics

    return statistics.median(p50s)


def bench_full(model_path):
    """Run ``FULL_SESSIONS`` independent perf sessions with cool-down pauses.

    Sessions that fail or produce no usable p50 are skipped.

    Returns:
        ``(p50s, median)``: the list of per-session p50 values (rounded to
        3 decimals) and their median, or ``(None, None)`` when no session
        produced a p50.
    """
    p50s = []
    for s in range(FULL_SESSIONS):
        if s > 0:
            print(f" [cool-down {COOL_DOWN_S}s]", flush=True)
            time.sleep(COOL_DOWN_S)
        out_json = model_path.parent / f"val_full_s{s}.json"
        rc, _, _ = run_cmd(
            _perf_cmd(model_path, FULL_WARMUP, FULL_ITERS, out_json),
            f"perf full s{s} ({FULL_ITERS} iters)",
            BENCH_TIMEOUT_S,
        )
        if rc != 0 or not out_json.exists():
            continue
        try:
            d = json.loads(out_json.read_text(encoding="utf-8"))
            lat = d.get("latency_ms", {})
            p50 = lat.get("p50") if isinstance(lat, dict) else None
            if p50:
                p50s.append(round(p50, 3))
        except Exception as exc:  # best-effort parse: skip this session, but say so
            print(f" [warn] could not parse {out_json.name}: {exc}", flush=True)
    if not p50s:
        return None, None
    return p50s, round(session_median(p50s), 3)


def _pick_model_file(hyp_dir):
    """Pick the .onnx to benchmark: quantized, else optimized, else any; None if none."""
    model_files = list(hyp_dir.glob("*.onnx"))
    for marker in ("quantized", "optimized"):
        match = next((f for f in model_files if marker in f.name), None)
        if match is not None:
            return match
    return model_files[0] if model_files else None


def _save_result(out_dir, result):
    """Write *result* to ``results_v2.json`` in *out_dir* and return the path."""
    out_path = out_dir / "results_v2.json"
    out_path.write_text(json.dumps(result, indent=2, ensure_ascii=False), encoding="utf-8")
    return out_path


def run_model(model_id, task, model_type, run_h4):
    """Build and benchmark every active hypothesis for one model.

    Hypotheses h0, h1 and h3 always run; h4 only when *run_h4* is true.
    Per-hypothesis outcomes are stored under ``result["hypotheses"]`` and the
    whole record is written to ``results_v2.json`` in the model's results
    directory.

    Returns:
        The result dict for the model.
    """
    slug = model_id.replace("/", "--")
    print(f"\n{'=' * 60}", flush=True)
    print(f" Model: {model_id}", flush=True)
    print(" Hypotheses: h0, h1, h3" + (", h4" if run_h4 else ""), flush=True)
    print(f"{'=' * 60}", flush=True)

    out_dir = RESULTS_DIR / slug
    out_dir.mkdir(parents=True, exist_ok=True)
    result = {
        "model_id": model_id,
        "task": task,
        "model_type": model_type,
        "timestamp": datetime.now().isoformat(timespec="seconds"),
        "ep": EP,
        "device": DEVICE,
        "validation_sweep": True,
        "hypotheses": {},
        "errors": [],
    }

    base_cfg = get_base_config(model_id, task, model_type)
    if base_cfg is None:
        result["errors"].append("FAILED: could not generate base config")
        _save_result(out_dir, result)
        return result

    t0_model = time.time()

    active_hyps = [
        (hid, lbl, opset, optim)
        for hid, lbl, opset, optim in HYPOTHESES
        if hid in ("h0", "h1", "h3") or (run_h4 and hid == "h4")
    ]

    for hid, label, opset_override, extra_optim in active_hyps:
        elapsed_model = time.time() - t0_model
        if elapsed_model > MODEL_TIMEOUT_S:
            # Keep iterating so every remaining hypothesis is recorded as TIMEOUT.
            result["errors"].append(f"Model timed out at {elapsed_model:.0f}s (before {hid})")
            result["hypotheses"][hid] = {"status": "TIMEOUT", "label": label}
            continue

        print(f"\n --- {hid}: {label} ---", flush=True)
        hyp_dir = out_dir / f"val_{hid}"
        hyp_dir.mkdir(parents=True, exist_ok=True)

        cfg = make_hyp_config(base_cfg, opset_override, extra_optim)
        cfg_path = hyp_dir / "config.json"
        cfg_path.write_text(json.dumps(cfg, indent=2), encoding="utf-8")

        # Reuse existing build output if already present (avoids re-downloading).
        # Require optimized.onnx or quantized.onnx as completion signal — export.onnx
        # alone means the build was truncated before optimization/quantization finished.
        complete_models = [
            f for f in hyp_dir.glob("*.onnx") if "optimized" in f.name or "quantized" in f.name
        ]
        if complete_models:
            print(f" [reuse] existing build in {hyp_dir.name}", flush=True)
        else:
            ok, build_out = run_build(model_id, cfg_path, hyp_dir)
            if not ok:
                result["hypotheses"][hid] = {
                    "status": "BUILD_FAIL",
                    "label": label,
                    "build_error": build_out[-300:],
                }
                result["errors"].append(f"{hid}: BUILD_FAIL")
                continue

        model_path = _pick_model_file(hyp_dir)
        if model_path is None:
            result["hypotheses"][hid] = {
                "status": "BUILD_FAIL",
                "label": label,
                "build_error": "no .onnx found",
            }
            # Recorded in errors too, matching the other BUILD_FAIL path.
            result["errors"].append(f"{hid}: BUILD_FAIL")
            continue

        p50_screen, cv, stable = bench_screen(model_path)
        # npu-007: For QNN NPU, screen failure (rc!=0, empty output) must NOT gate Phase B.
        # DVFS thermal noise can cause transient subprocess failures on first inference.
        # Only skip Phase B if screen hard-failed AND the EP is not QNN NPU.
        is_npu = EP == "qnn" and DEVICE == "npu"
        screen_failed = p50_screen is None
        if screen_failed and not is_npu:
            result["hypotheses"][hid] = {
                "status": "BENCH_FAIL",
                "label": label,
                "opset": opset_override or "auto",
            }
            continue
        if screen_failed:
            # bench_screen signals failure with a sentinel CV; do not record it
            # as if it were a measurement.
            cv = None

        p50s, median = bench_full(model_path)
        status = "OK" if (cv is None or cv < CV_STABLE_MAX) else "OK_HIGH_CV"
        if not p50s:
            status = "BENCH_FAIL"

        if screen_failed:
            screen_note = "screen failed — no CV measured"
        elif not stable:
            screen_note = "DVFS noise — high CV expected on QNN NPU"
        else:
            screen_note = None

        result["hypotheses"][hid] = {
            "status": status,
            "screen": {
                "p50_ms": round(p50_screen, 3) if p50_screen is not None else None,
                "cv": round(cv, 4) if cv is not None else None,
                "stable": stable,
                "note": screen_note,
            },
            "full": {"p50s_ms": p50s, "median_p50_ms": median},
            "label": label,
            "opset": opset_override or "auto",
        }
        screen_str = f"{p50_screen:.2f}ms" if p50_screen is not None else "N/A"
        cv_str = f"{cv:.3f}" if cv is not None else "N/A"
        print(
            f" [RESULT {hid}] screen p50={screen_str} CV={cv_str} full_median={median}ms sessions={p50s}",
            flush=True,
        )

    # Compute npu-001 signal: relative gain of opset 21 (h3) over opset 17 (h1).
    h1 = result["hypotheses"].get("h1", {})
    h3 = result["hypotheses"].get("h3", {})
    if h1.get("full") and h3.get("full"):
        m1 = h1["full"]["median_p50_ms"]
        m3 = h3["full"]["median_p50_ms"]
        if m1 and m3:
            gain = round((m1 - m3) / m1 * 100, 1)
            result["npu001_opset21_vs_17_gain_pct"] = gain
            result["npu001_note"] = f"opset21 median {m3}ms vs opset17 {m1}ms = {gain:+.1f}%"

    # Compute npu-006 signal: relative slowdown of conv fusions (h4) vs h1.
    h4 = result["hypotheses"].get("h4", {})
    if h1.get("full") and h4.get("full"):
        m1 = h1["full"]["median_p50_ms"]
        m4 = h4["full"]["median_p50_ms"]
        if m1 and m4:
            regression = round((m4 - m1) / m1 * 100, 1)
            result["npu006_conv_fusion_regression_pct"] = regression
            result["npu006_note"] = (
                f"conv fusions median {m4}ms vs no-fusion {m1}ms = {regression:+.1f}%"
            )

    out_path = _save_result(out_dir, result)
    print(f"\n [SAVED] {out_path}", flush=True)
    return result


def main():
    """Parse CLI arguments, run the sweep, and print a per-model summary."""
    parser = argparse.ArgumentParser(description="Focused npu-001/npu-006 validation sweep")
    parser.add_argument("--model", help="Run single model by ID")
    parser.add_argument(
        "--no-h4", action="store_true", help="Skip h4 (conv fusions) for all models"
    )
    args = parser.parse_args()

    models = VALIDATION_MODELS
    if args.model:
        # Accept either the full "org/name" ID or just the trailing name.
        models = [
            entry
            for entry in VALIDATION_MODELS
            if entry[0] == args.model or entry[0].split("/")[-1] == args.model
        ]
        if not models:
            print(f"Model '{args.model}' not in validation list. Available:")
            for m, t, tp, _ in VALIDATION_MODELS:
                print(f" {m} ({t}, {tp})")
            sys.exit(1)

    print(f"\nValidation sweep — {len(models)} model(s)", flush=True)
    print(
        f"EP: {EP} / {DEVICE} Proto: {FULL_SESSIONS}×{FULL_ITERS} iters, {COOL_DOWN_S}s cool-down\n",
        flush=True,
    )

    all_results = []
    for model_id, task, model_type, run_h4 in models:
        if args.no_h4:
            run_h4 = False
        all_results.append(run_model(model_id, task, model_type, run_h4))

    print("\n" + "=" * 60)
    print("VALIDATION SUMMARY")
    print("=" * 60)
    for r in all_results:
        mid = r["model_id"]
        npu001 = r.get("npu001_note", "n/a")
        npu006 = r.get("npu006_note", "")
        print(f" {mid}")
        print(f" npu-001: {npu001}")
        if npu006:
            print(f" npu-006: {npu006}")
        if r.get("errors"):
            print(f" errors: {r['errors']}")
    print("=" * 60)


if __name__ == "__main__":
    main()