Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 121 additions & 0 deletions .github/workflows/performance-comment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
name: Performance Comment

# Posts the perf comparison as a sticky PR comment.
#
# This runs in the *base* repository's context via `workflow_run`, which
# means it has a writable `GITHUB_TOKEN` even for pull requests opened
# from forks. The `Performance` workflow itself runs in the (potentially
# untrusted) PR context and only produces an artifact — it has no PR
# write permissions.
#
# Security model:
# * The artifact from the PR-context workflow is treated as untrusted
# input. We only consume the JSON benchmark outputs (`baseline.json` /
# `candidate.json`) and regenerate the comparison markdown here using
# the base repository's checked-out `comparePerf.mjs` so the markdown
# posted under the writable token is never attacker-supplied.
# * The PR number is resolved authoritatively from the `workflow_run`
# event payload (or a GitHub API lookup keyed off the workflow's
# untamperable `head_sha`), never from artifact contents. This
# prevents a malicious PR from writing a different PR number into the
# artifact and tricking us into commenting on an unrelated PR.

on:
workflow_run:
workflows: ["Performance"]
types:
- completed

permissions:
contents: read
pull-requests: write
actions: read

jobs:
comment:
name: Post sticky PR comment
runs-on: ubuntu-latest
if: github.event.workflow_run.event == 'pull_request'

steps:
- name: Check out base repository (trusted)
uses: actions/checkout@v4

- uses: actions/setup-node@v4
with:
node-version: '22.x'

- name: Download perf-results artifact
id: download
# Don't fail this job if the upstream Performance run was cancelled
# or failed before producing the artifact — we just skip posting.
continue-on-error: true
uses: actions/download-artifact@v4
with:
name: perf-results
path: perf-results
run-id: ${{ github.event.workflow_run.id }}
github-token: ${{ secrets.GITHUB_TOKEN }}

- name: Resolve PR number
id: pr
if: steps.download.outcome == 'success'
uses: actions/github-script@v7
with:
script: |
const wr = context.payload.workflow_run;
// Prefer the PR list attached to the workflow_run event payload
// (populated for same-repo PRs). Fork PRs leave this empty, so
// fall back to an authoritative API lookup keyed off the
// workflow_run's head_sha, which the triggering PR cannot forge.
let prNumber = wr.pull_requests && wr.pull_requests[0] && wr.pull_requests[0].number;
if (!prNumber) {
const { data: prs } = await github.rest.repos.listPullRequestsAssociatedWithCommit({
owner: context.repo.owner,
repo: context.repo.repo,
commit_sha: wr.head_sha,
});
const match = prs.find((p) => p.state === "open");
prNumber = match && match.number;
}
if (!Number.isInteger(prNumber) || prNumber <= 0) {
core.info("Could not determine PR number from workflow_run event; skipping comment.");
core.setOutput("number", "");
return;
}
core.setOutput("number", String(prNumber));

- name: Regenerate report from JSON (trusted code)
id: report
if: steps.pr.outputs.number != ''
run: |
set -euo pipefail
if [ ! -s perf-results/baseline.json ] || [ ! -s perf-results/candidate.json ]; then
echo "baseline.json or candidate.json missing from artifact; skipping comment." >&2
echo "found=false" >> "$GITHUB_OUTPUT"
exit 0
fi

# Use the base repo's comparePerf.mjs (trusted) to render the
# markdown from the PR-supplied JSON data, so we never post
# attacker-supplied markdown under the base repo's writable token.
node test/performanceTests/comparePerf.mjs \
perf-results/baseline.json \
perf-results/candidate.json \
report.md \
|| true

if [ -s report.md ]; then
echo "found=true" >> "$GITHUB_OUTPUT"
else
echo "found=false" >> "$GITHUB_OUTPUT"
echo "report.md was not produced; skipping comment." >&2
fi

- name: Post sticky PR comment
if: steps.pr.outputs.number != '' && steps.report.outputs.found == 'true'
uses: marocchino/sticky-pull-request-comment@v2
with:
header: perf-regression
number: ${{ steps.pr.outputs.number }}
path: report.md
20 changes: 7 additions & 13 deletions .github/workflows/performance.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,16 @@ on:
concurrency:
group: perf-${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

permissions:
contents: read
pull-requests: write

jobs:
perf-regression:
runs-on: ubuntu-latest
# Headroom for two full benchmark runs (candidate + baseline) plus npm
# installs and tarball packing. Each benchmark run is bounded by
# samples * scenarios * (warmup + duration + per-child init).
# scenarios * (warmup + samples * duration + per-child init).
timeout-minutes: 40
env:
NODE_VERSION: '22.x'
Expand Down Expand Up @@ -103,7 +102,9 @@ jobs:
--duration "$PERF_DURATION" \
--warmup "$PERF_WARMUP"

# ---- Compare and publish ----
# ---- Compare for gating; the artifact is consumed by the companion
# `performance-comment.yml` workflow which posts the PR comment under
# the base repository's trusted token. ----
- name: Compare results
id: compare
working-directory: pr/test/performanceTests
Expand All @@ -115,7 +116,7 @@ jobs:
"$GITHUB_WORKSPACE/perf-comparison.md"
echo "exit=$?" >> "$GITHUB_OUTPUT"

- name: Upload raw results
- name: Upload perf results artifact
if: always()
uses: actions/upload-artifact@v4
with:
Expand All @@ -124,14 +125,7 @@ jobs:
baseline.json
candidate.json
perf-comparison.md

- name: Comment on PR (best-effort)
if: always() && github.event.pull_request.head.repo.full_name == github.repository
uses: marocchino/sticky-pull-request-comment@v2
continue-on-error: true
with:
header: perf-regression
path: perf-comparison.md
if-no-files-found: warn

- name: Fail job on gating regression
if: steps.compare.outputs.exit != '0'
Expand Down
11 changes: 9 additions & 2 deletions test/performanceTests/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,5 +42,12 @@ repo, so they are never used for gate-fail decisions.
the base branch as tarballs, installs each in turn under the PR's perf harness,
runs the benchmark suite, and fails the job (blocking merge when set as a
required check) if any gating scenario regresses beyond
`PERF_REGRESSION_THRESHOLD` percent (default 15%). A sticky comment with the
full comparison table is posted to the PR.
`PERF_REGRESSION_THRESHOLD` percent (default 15%). All benchmark results are
uploaded as the `perf-results` artifact.

`.github/workflows/performance-comment.yml` then runs in the base repository's
trusted context via `workflow_run`, downloads the artifact, regenerates the
comparison markdown from the JSON inputs using the base branch's own
`comparePerf.mjs` (so attacker-supplied markdown is never posted under the
base repo's writable token), and posts a sticky PR comment. This split is
what allows fork PRs to receive a perf-comparison comment.
46 changes: 32 additions & 14 deletions test/performanceTests/bench.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -8,26 +8,41 @@
* process makes JSON output and result capture deterministic, and a fresh
* Node child per scenario keeps OpenTelemetry global state isolated.
*
* All requested samples for the scenario are taken inside this single
* process — warmup runs once, then `--samples` timed durations are
* collected back-to-back. Per-sample Node startup and module-load cost
* (importing `applicationinsights` and calling `useAzureMonitor()`)
* dominate wall-clock time on cold processes, so amortising them across
* samples is the single biggest perf-CI speedup. OpenTelemetry global
* state is still isolated *between scenarios* because runBenchmarks.mjs
* spawns a fresh child per scenario.
*
* Usage:
* node bench.mjs --scenario <Name> --duration <sec> --warmup <sec> --out <file>
* node bench.mjs --scenario <Name> --duration <sec> --warmup <sec>
* --samples <N> --out <file>
*/

import { writeFileSync } from "node:fs";

function parseArgs(argv) {
const a = { duration: 8, warmup: 2 };
const a = { duration: 8, warmup: 2, samples: 1 };
for (let i = 0; i < argv.length; i++) {
const k = argv[i];
const v = () => argv[++i];
if (k === "--scenario") a.scenario = v();
else if (k === "--duration") a.duration = Number(v());
else if (k === "--warmup") a.warmup = Number(v());
else if (k === "--samples") a.samples = Number(v());
else if (k === "--out") a.out = v();
}
if (!a.scenario || !a.out) {
console.error("Required: --scenario <Name> --out <file>");
process.exit(2);
}
if (!Number.isInteger(a.samples) || a.samples < 1) {
console.error("--samples must be a positive integer");
process.exit(2);
}
return a;
}

Expand Down Expand Up @@ -73,34 +88,37 @@ async function main() {
}
const instance = new Cls();

// Warmup (not counted)
// Warmup runs once for the whole process, not per sample. The JIT and
// hot caches we're warming up persist across the subsequent timed
// samples in this same V8 instance.
if (args.warmup > 0) {
await runLoop(instance, args.warmup * 1000);
}

const startWall = Date.now();
const ops = await runLoop(instance, args.duration * 1000);
const elapsedMs = Date.now() - startWall;
const opsPerSec = (ops / elapsedMs) * 1000;
const samples = [];
for (let i = 0; i < args.samples; i++) {
const startWall = Date.now();
const ops = await runLoop(instance, args.duration * 1000);
const elapsedMs = Date.now() - startWall;
const opsPerSec = (ops / elapsedMs) * 1000;
samples.push({ opsPerSec, ops, elapsedMs });
console.log(
`[bench] ${args.scenario} sample ${i + 1}/${args.samples}: ${ops} ops in ${elapsedMs}ms => ${opsPerSec.toFixed(0)} ops/s`,
);
}

writeFileSync(
args.out,
JSON.stringify(
{
scenario: args.scenario,
opsPerSec,
ops,
elapsedMs,
samples,
timestamp: new Date().toISOString(),
},
null,
2,
),
);

console.log(
`[bench] ${args.scenario}: ${ops} ops in ${elapsedMs}ms => ${opsPerSec.toFixed(0)} ops/s`,
);
}

main().catch((err) => {
Expand Down
29 changes: 17 additions & 12 deletions test/performanceTests/runBenchmarks.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@

/*
* Runs each scenario via bench.mjs in a fresh Node child process to avoid
* OpenTelemetry global-state contamination across scenarios. Each scenario
* is sampled N times; median ops/s is recorded.
* OpenTelemetry global-state contamination across scenarios. Median ops/s is recorded.
*
* Usage:
* node runBenchmarks.mjs --out results.json [--samples 5] [--duration 8]
Expand Down Expand Up @@ -75,6 +74,8 @@ function runOne(scenario, args, outPath) {
String(args.duration),
"--warmup",
String(args.warmup),
"--samples",
String(args.samples),
"--out",
outPath,
],
Expand All @@ -95,10 +96,16 @@ function runOne(scenario, args, outPath) {
throw new Error(`Scenario ${scenario} failed with exit code ${child.status}`);
}
const result = JSON.parse(readFileSync(outPath, "utf8"));
if (!isFinite(result.opsPerSec)) {
throw new Error(`Scenario ${scenario} produced no ops/s value`);
if (!Array.isArray(result.samples) || result.samples.length === 0) {
throw new Error(`Scenario ${scenario} produced no samples`);
}
return result.opsPerSec;
const ops = result.samples.map((s) => s.opsPerSec);
for (const v of ops) {
if (!isFinite(v)) {
throw new Error(`Scenario ${scenario} produced a non-finite ops/s value`);
}
}
return ops;
}

function main() {
Expand All @@ -112,13 +119,11 @@ function main() {

try {
for (const scenario of selected) {
const samples = [];
for (let i = 0; i < args.samples; i++) {
const out = join(tmp, `${scenario.name}-${i}.json`);
console.log(`\n[run] ${scenario.name} sample ${i + 1}/${args.samples}`);
const ops = runOne(scenario.name, args, out);
samples.push(ops);
}
const out = join(tmp, `${scenario.name}.json`);
console.log(
`\n[run] ${scenario.name}: ${args.samples} sample(s) x ${args.duration}s (warmup ${args.warmup}s)`,
);
const samples = runOne(scenario.name, args, out);
results.push({
name: scenario.name,
tier: scenario.tier,
Expand Down
Loading