microsoft · JacksonWeber · Jun 16, 2026 · Jun 15, 2026 · Jun 15, 2026
diff --git a/.github/workflows/performance-comment.yml b/.github/workflows/performance-comment.yml
@@ -0,0 +1,121 @@
+name: Performance Comment
+
+# Posts the perf comparison as a sticky PR comment.
+#
+# This runs in the *base* repository's context via `workflow_run`, which
+# means it has a writable `GITHUB_TOKEN` even for pull requests opened
+# from forks. The `Performance` workflow itself runs in the (potentially
+# untrusted) PR context and only produces an artifact — it has no PR
+# write permissions.
+#
+# Security model:
+#   * The artifact from the PR-context workflow is treated as untrusted
+#     input. We only consume the JSON benchmark outputs (`baseline.json` /
+#     `candidate.json`) and regenerate the comparison markdown here using
+#     the base repository's checked-out `comparePerf.mjs` so the markdown
+#     posted under the writable token is never attacker-supplied.
+#   * The PR number is resolved authoritatively from the `workflow_run`
+#     event payload (or a GitHub API lookup keyed off the workflow's
+#     untamperable `head_sha`), never from artifact contents. This
+#     prevents a malicious PR from writing a different PR number into the
+#     artifact and tricking us into commenting on an unrelated PR.
+
+on:
+  workflow_run:
+    workflows: ["Performance"]
+    types:
+      - completed
+
+permissions:
+  contents: read
+  pull-requests: write
+  actions: read
+
+jobs:
+  comment:
+    name: Post sticky PR comment
+    runs-on: ubuntu-latest
+    if: github.event.workflow_run.event == 'pull_request'
+
+    steps:
+      - name: Check out base repository (trusted)
+        uses: actions/checkout@v4
+
+      - uses: actions/setup-node@v4
+        with:
+          node-version: '22.x'
+
+      - name: Download perf-results artifact
+        id: download
+        # Don't fail this job if the upstream Performance run was cancelled
+        # or failed before producing the artifact — we just skip posting.
+        continue-on-error: true
+        uses: actions/download-artifact@v4
+        with:
+          name: perf-results
+          path: perf-results
+          run-id: ${{ github.event.workflow_run.id }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Resolve PR number
+        id: pr
+        if: steps.download.outcome == 'success'
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const wr = context.payload.workflow_run;
+            // Prefer the PR list attached to the workflow_run event payload
+            // (populated for same-repo PRs). Fork PRs leave this empty, so
+            // fall back to an authoritative API lookup keyed off the
+            // workflow_run's head_sha, which the triggering PR cannot forge.
+            let prNumber = wr.pull_requests && wr.pull_requests[0] && wr.pull_requests[0].number;
+            if (!prNumber) {
+              const { data: prs } = await github.rest.repos.listPullRequestsAssociatedWithCommit({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                commit_sha: wr.head_sha,
+              });
+              const match = prs.find((p) => p.state === "open");
+              prNumber = match && match.number;
+            }
+            if (!Number.isInteger(prNumber) || prNumber <= 0) {
+              core.info("Could not determine PR number from workflow_run event; skipping comment.");
+              core.setOutput("number", "");
+              return;
+            }
+            core.setOutput("number", String(prNumber));
+
+      - name: Regenerate report from JSON (trusted code)
+        id: report
+        if: steps.pr.outputs.number != ''
+        run: |
+          set -euo pipefail
+          if [ ! -s perf-results/baseline.json ] || [ ! -s perf-results/candidate.json ]; then
+            echo "baseline.json or candidate.json missing from artifact; skipping comment." >&2
+            echo "found=false" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+
+          # Use the base repo's comparePerf.mjs (trusted) to render the
+          # markdown from the PR-supplied JSON data, so we never post
+          # attacker-supplied markdown under the base repo's writable token.
+          node test/performanceTests/comparePerf.mjs \
+            perf-results/baseline.json \
+            perf-results/candidate.json \
+            report.md \
+            || true
+
+          if [ -s report.md ]; then
+            echo "found=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "found=false" >> "$GITHUB_OUTPUT"
+            echo "report.md was not produced; skipping comment." >&2
+          fi
+
+      - name: Post sticky PR comment
+        if: steps.pr.outputs.number != '' && steps.report.outputs.found == 'true'
+        uses: marocchino/sticky-pull-request-comment@v2
+        with:
+          header: perf-regression
+          number: ${{ steps.pr.outputs.number }}
+          path: report.md
diff --git a/.github/workflows/performance.yml b/.github/workflows/performance.yml
@@ -8,17 +8,16 @@ on:
 concurrency:
   group: perf-${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
-
+  
 permissions:
   contents: read
-  pull-requests: write
 
 jobs:
   perf-regression:
     runs-on: ubuntu-latest
     # Headroom for two full benchmark runs (candidate + baseline) plus npm
     # installs and tarball packing. Each benchmark run is bounded by
-    # samples * scenarios * (warmup + duration + per-child init).
+    # scenarios * (warmup + samples * duration + per-child init).
     timeout-minutes: 40
     env:
       NODE_VERSION: '22.x'
@@ -103,7 +102,9 @@ jobs:
             --duration "$PERF_DURATION" \
             --warmup "$PERF_WARMUP"
 
-      # ---- Compare and publish ----
+      # ---- Compare for gating; the artifact is consumed by the companion
+      # `performance-comment.yml` workflow which posts the PR comment under
+      # the base repository's trusted token. ----
       - name: Compare results
         id: compare
         working-directory: pr/test/performanceTests
@@ -115,7 +116,7 @@ jobs:
             "$GITHUB_WORKSPACE/perf-comparison.md"
           echo "exit=$?" >> "$GITHUB_OUTPUT"
 
-      - name: Upload raw results
+      - name: Upload perf results artifact
         if: always()
         uses: actions/upload-artifact@v4
         with:
@@ -124,14 +125,7 @@ jobs:
             baseline.json
             candidate.json
             perf-comparison.md
-
-      - name: Comment on PR (best-effort)
-        if: always() && github.event.pull_request.head.repo.full_name == github.repository
-        uses: marocchino/sticky-pull-request-comment@v2
-        continue-on-error: true
-        with:
-          header: perf-regression
-          path: perf-comparison.md
+          if-no-files-found: warn
 
       - name: Fail job on gating regression
         if: steps.compare.outputs.exit != '0'

diff --git a/test/performanceTests/README.md b/test/performanceTests/README.md
@@ -42,5 +42,12 @@ repo, so they are never used for gate-fail decisions.
 the base branch as tarballs, installs each in turn under the PR's perf harness,
 runs the benchmark suite, and fails the job (blocking merge when set as a
 required check) if any gating scenario regresses beyond
-`PERF_REGRESSION_THRESHOLD` percent (default 15%). A sticky comment with the
-full comparison table is posted to the PR.
+`PERF_REGRESSION_THRESHOLD` percent (default 15%). All benchmark results are
+uploaded as the `perf-results` artifact.
+
+`.github/workflows/performance-comment.yml` then runs in the base repository's
+trusted context via `workflow_run`, downloads the artifact, regenerates the
+comparison markdown from the JSON inputs using the base branch's own
+`comparePerf.mjs` (so attacker-supplied markdown is never posted under the
+base repo's writable token), and posts a sticky PR comment. This split is
+what allows fork PRs to receive a perf-comparison comment.
diff --git a/test/performanceTests/bench.mjs b/test/performanceTests/bench.mjs
@@ -8,26 +8,41 @@
  * process makes JSON output and result capture deterministic, and a fresh
  * Node child per scenario keeps OpenTelemetry global state isolated.
  *
+ * All requested samples for the scenario are taken inside this single
+ * process — warmup runs once, then `--samples` timed durations are
+ * collected back-to-back. Per-sample Node startup and module-load cost
+ * (importing `applicationinsights` and calling `useAzureMonitor()`)
+ * dominate wall-clock time on cold processes, so amortising them across
+ * samples is the single biggest perf-CI speedup. OpenTelemetry global
+ * state is still isolated *between scenarios* because runBenchmarks.mjs
+ * spawns a fresh child per scenario.
+ *
  * Usage:
- *   node bench.mjs --scenario <Name> --duration <sec> --warmup <sec> --out <file>
+ *   node bench.mjs --scenario <Name> --duration <sec> --warmup <sec>
+ *                  --samples <N> --out <file>
  */
 
 import { writeFileSync } from "node:fs";
 
 function parseArgs(argv) {
-    const a = { duration: 8, warmup: 2 };
+    const a = { duration: 8, warmup: 2, samples: 1 };
     for (let i = 0; i < argv.length; i++) {
         const k = argv[i];
         const v = () => argv[++i];
         if (k === "--scenario") a.scenario = v();
         else if (k === "--duration") a.duration = Number(v());
         else if (k === "--warmup") a.warmup = Number(v());
+        else if (k === "--samples") a.samples = Number(v());
         else if (k === "--out") a.out = v();
     }
     if (!a.scenario || !a.out) {
         console.error("Required: --scenario <Name> --out <file>");
         process.exit(2);
     }
+    if (!Number.isInteger(a.samples) || a.samples < 1) {
+        console.error("--samples must be a positive integer");
+        process.exit(2);
+    }
     return a;
 }
 
@@ -73,34 +88,37 @@ async function main() {
     }
     const instance = new Cls();
 
-    // Warmup (not counted)
+    // Warmup runs once for the whole process, not per sample. The JIT and
+    // hot caches we're warming up persist across the subsequent timed
+    // samples in this same V8 instance.
     if (args.warmup > 0) {
         await runLoop(instance, args.warmup * 1000);
     }
 
-    const startWall = Date.now();
-    const ops = await runLoop(instance, args.duration * 1000);
-    const elapsedMs = Date.now() - startWall;
-    const opsPerSec = (ops / elapsedMs) * 1000;
+    const samples = [];
+    for (let i = 0; i < args.samples; i++) {
+        const startWall = Date.now();
+        const ops = await runLoop(instance, args.duration * 1000);
+        const elapsedMs = Date.now() - startWall;
+        const opsPerSec = (ops / elapsedMs) * 1000;
+        samples.push({ opsPerSec, ops, elapsedMs });
+        console.log(
+            `[bench] ${args.scenario} sample ${i + 1}/${args.samples}: ${ops} ops in ${elapsedMs}ms => ${opsPerSec.toFixed(0)} ops/s`,
+        );
+    }
 
     writeFileSync(
         args.out,
         JSON.stringify(
             {
                 scenario: args.scenario,
-                opsPerSec,
-                ops,
-                elapsedMs,
+                samples,
                 timestamp: new Date().toISOString(),
             },
             null,
             2,
         ),
     );
-
-    console.log(
-        `[bench] ${args.scenario}: ${ops} ops in ${elapsedMs}ms => ${opsPerSec.toFixed(0)} ops/s`,
-    );
 }
 
 main().catch((err) => {

diff --git a/test/performanceTests/runBenchmarks.mjs b/test/performanceTests/runBenchmarks.mjs
@@ -4,8 +4,7 @@
 
 /*
  * Runs each scenario via bench.mjs in a fresh Node child process to avoid
- * OpenTelemetry global-state contamination across scenarios. Each scenario
- * is sampled N times; median ops/s is recorded.
+ * OpenTelemetry global-state contamination across scenarios. Median ops/s is recorded.
  *
  * Usage:
  *   node runBenchmarks.mjs --out results.json [--samples 5] [--duration 8]
@@ -75,6 +74,8 @@ function runOne(scenario, args, outPath) {
             String(args.duration),
             "--warmup",
             String(args.warmup),
+            "--samples",
+            String(args.samples),
             "--out",
             outPath,
         ],
@@ -95,10 +96,16 @@ function runOne(scenario, args, outPath) {
         throw new Error(`Scenario ${scenario} failed with exit code ${child.status}`);
     }
     const result = JSON.parse(readFileSync(outPath, "utf8"));
-    if (!isFinite(result.opsPerSec)) {
-        throw new Error(`Scenario ${scenario} produced no ops/s value`);
+    if (!Array.isArray(result.samples) || result.samples.length === 0) {
+        throw new Error(`Scenario ${scenario} produced no samples`);
     }
-    return result.opsPerSec;
+    const ops = result.samples.map((s) => s.opsPerSec);
+    for (const v of ops) {
+        if (!isFinite(v)) {
+            throw new Error(`Scenario ${scenario} produced a non-finite ops/s value`);
+        }
+    }
+    return ops;
 }
 
 function main() {
@@ -112,13 +119,11 @@ function main() {
 
     try {
         for (const scenario of selected) {
-            const samples = [];
-            for (let i = 0; i < args.samples; i++) {
-                const out = join(tmp, `${scenario.name}-${i}.json`);
-                console.log(`\n[run] ${scenario.name} sample ${i + 1}/${args.samples}`);
-                const ops = runOne(scenario.name, args, out);
-                samples.push(ops);
-            }
+            const out = join(tmp, `${scenario.name}.json`);
+            console.log(
+                `\n[run] ${scenario.name}: ${args.samples} sample(s) x ${args.duration}s (warmup ${args.warmup}s)`,
+            );
+            const samples = runOne(scenario.name, args, out);
             results.push({
                 name: scenario.name,
                 tier: scenario.tier,