From a82f9803c3bc98bda15cfc8de87bf17c0847ae5e Mon Sep 17 00:00:00 2001 From: tmathern <60901087+tmathern@users.noreply.github.com> Date: Wed, 1 Jul 2026 18:44:55 -0700 Subject: [PATCH 1/5] fix: CPU bench --- .github/workflows/cpu-benchmark.yml | 67 +++ .github/workflows/memory-benchmark.yml | 2 +- .gitignore | 6 +- Makefile | 45 +- requirements-dev.txt | 1 - tests/benchmark.py | 147 ------ .../python-3.10-slim-perf-Dockerfile | 4 +- .../python-3.12-slim-perf-Dockerfile | 4 +- .../Dockerfiles/ubuntu-22.04-perf-Dockerfile | 4 +- .../Dockerfiles/ubuntu-24.04-perf-Dockerfile | 4 +- tests/perf/README.md | 268 +---------- tests/perf/cpu/README.md | 92 ++++ tests/perf/cpu/__init__.py | 1 + tests/perf/cpu/baseline.json | 225 +++++++++ tests/perf/{ => cpu}/reports/.gitkeep | 0 tests/perf/cpu/run_profile.py | 443 ++++++++++++++++++ tests/perf/memory/README.md | 265 +++++++++++ tests/perf/memory/__init__.py | 1 + tests/perf/{ => memory}/baseline.json | 0 tests/perf/memory/reports/.gitkeep | 0 tests/perf/{ => memory}/run_profile.py | 7 +- tests/perf/scenarios.py | 8 + 22 files changed, 1171 insertions(+), 423 deletions(-) create mode 100644 .github/workflows/cpu-benchmark.yml delete mode 100644 tests/benchmark.py create mode 100644 tests/perf/cpu/README.md create mode 100644 tests/perf/cpu/__init__.py create mode 100644 tests/perf/cpu/baseline.json rename tests/perf/{ => cpu}/reports/.gitkeep (100%) create mode 100644 tests/perf/cpu/run_profile.py create mode 100644 tests/perf/memory/README.md create mode 100644 tests/perf/memory/__init__.py rename tests/perf/{ => memory}/baseline.json (100%) create mode 100644 tests/perf/memory/reports/.gitkeep rename tests/perf/{ => memory}/run_profile.py (98%) diff --git a/.github/workflows/cpu-benchmark.yml b/.github/workflows/cpu-benchmark.yml new file mode 100644 index 00000000..197e7da8 --- /dev/null +++ b/.github/workflows/cpu-benchmark.yml @@ -0,0 +1,67 @@ +name: Python SDK py-spy CPU benchmark + +on: + pull_request: + types: + - 
opened + - reopened + - synchronize + - labeled + +permissions: + contents: read + +jobs: + # Timing and profiling run as parallel jobs on separate runners so py-spy + # sampling never contends with the timed run. + cpu-timing: + name: CPU benchmark timing + # Needs to match the arch the baseline was generated on. + runs-on: ubuntu-24.04-arm + if: | + contains(github.event.pull_request.labels.*.name, 'check-cpu-benchmark') && + ( + github.event.pull_request.author_association == 'COLLABORATOR' || + github.event.pull_request.author_association == 'MEMBER' || + github.event.pull_request.author_association == 'OWNER' + ) + steps: + - uses: actions/checkout@v4 + + # Build the perf image (shared with the memory benchmark). + - name: Build perf image + run: make perf-image-rebuild + + # Uses the Dockerfile environment for repeatable runs. Report-only: + # baseline deltas land in this job's step summary, exit code stays 0. + - name: Run CPU timing benchmark + run: make cpu-bench CPU_MODE=timing + + cpu-profile: + name: CPU benchmark flamegraphs + runs-on: ubuntu-24.04-arm + if: | + contains(github.event.pull_request.labels.*.name, 'check-cpu-benchmark') && + ( + github.event.pull_request.author_association == 'COLLABORATOR' || + github.event.pull_request.author_association == 'MEMBER' || + github.event.pull_request.author_association == 'OWNER' + ) + steps: + - uses: actions/checkout@v4 + + - name: Build perf image + run: make perf-image-rebuild + + - name: Run py-spy CPU profiles + run: make cpu-bench CPU_MODE=profile + + - name: Upload CPU flamegraphs + if: always() + uses: actions/upload-artifact@v4 + with: + name: pyspy-cpu-flamegraphs + path: | + tests/perf/cpu/reports/*.svg + tests/perf/cpu/reports/*.speedscope.json + if-no-files-found: warn diff --git a/.github/workflows/memory-benchmark.yml b/.github/workflows/memory-benchmark.yml index 992587b6..c54a8418 100644 --- a/.github/workflows/memory-benchmark.yml +++ b/.github/workflows/memory-benchmark.yml @@ -40,5 
+40,5 @@ jobs: uses: actions/upload-artifact@v4 with: name: memray-flamegraphs - path: tests/perf/reports/*.html + path: tests/perf/memory/reports/*.html if-no-files-found: warn diff --git a/.gitignore b/.gitignore index 147e8357..757c0d11 100644 --- a/.gitignore +++ b/.gitignore @@ -124,5 +124,7 @@ src/c2pa/libs/ !tests/fixtures/*.key # Memory profiling reports -tests/perf/reports/*.html -tests/perf/reports/*.bin +tests/perf/memory/reports/*.html +tests/perf/memory/reports/*.bin +tests/perf/cpu/reports/*.svg +tests/perf/cpu/reports/*.speedscope.json diff --git a/Makefile b/Makefile index e1762040..2d873e36 100644 --- a/Makefile +++ b/Makefile @@ -62,10 +62,6 @@ test: $(PYTHON) ./tests/test_unit_tests.py $(PYTHON) ./tests/test_unit_tests_threaded.py -# Runs benchmarks in the venv -benchmark: - $(PYTHON) -m pytest tests/benchmark.py -v - # Tests building and installing a local wheel package # Downloads required artifacts, builds the wheel, installs it, and verifies the installation test-local-wheel-build: @@ -141,11 +137,17 @@ build-from-source: docs: $(PYTHON) scripts/generate_api_docs.py -# Memory profiling with memray (runs in Docker, reports go to tests/perf/reports/) -# More details for usage are in tests/perf/README.md +# Performance benchmarks (run in Docker): +# - memory (memray): reports go to tests/perf/memory/reports/, docs in tests/perf/memory/README.md +# - cpu (py-spy): reports go to tests/perf/cpu/reports/, docs in tests/perf/cpu/README.md PERF_ENV ?= python-3.12-slim MEMRAY_ITERATIONS ?= 100 MEMRAY_THRESHOLD ?= 1.1 +CPU_ITERATIONS ?= 100 +CPU_THRESHOLD ?= 1.25 +CPU_MODE ?= all +CPU_REPEATS ?= 0 +PYSPY_RATE ?= 100 SCENARIO ?= SCENARIO_ARG := $(if $(SCENARIO),--scenario $(SCENARIO),) # In CI, use en vars to write the report to the job run @@ -156,23 +158,36 @@ GH_SUMMARY_MOUNT := $(if $(GITHUB_STEP_SUMMARY),-v $(GITHUB_STEP_SUMMARY):$(GITH # change (use perf-image-rebuild for that). 
.PHONY: perf-image perf-image: - @docker image inspect c2pa-memray-$(PERF_ENV) >/dev/null 2>&1 || \ - docker build -f tests/perf/Dockerfiles/$(PERF_ENV)-perf-Dockerfile -t c2pa-memray-$(PERF_ENV) . + @docker image inspect c2pa-perf-$(PERF_ENV) >/dev/null 2>&1 || \ + docker build -f tests/perf/Dockerfiles/$(PERF_ENV)-perf-Dockerfile -t c2pa-perf-$(PERF_ENV) . -# Force a clean rebuild of the memray perf Docker image +# Force a clean rebuild of the perf Docker image (shared by memory and cpu benchmarks) .PHONY: perf-image-rebuild perf-image-rebuild: - docker build --no-cache --pull -f tests/perf/Dockerfiles/$(PERF_ENV)-perf-Dockerfile -t c2pa-memray-$(PERF_ENV) . + docker build --no-cache --pull -f tests/perf/Dockerfiles/$(PERF_ENV)-perf-Dockerfile -t c2pa-perf-$(PERF_ENV) . # Runs memory benchmarks. Pre-requisite: Docker image built using `make perf-image-rebuild`. .PHONY: memory-use-bench memory-use-bench: - docker run --rm -v $(PWD):/workspace $(GH_SUMMARY_MOUNT) -e PYTHONPATH=/workspace/src -e PERF_ENV=$(PERF_ENV) -e MEMRAY_ITERATIONS=$(MEMRAY_ITERATIONS) -e MEMRAY_THRESHOLD=$(MEMRAY_THRESHOLD) -e GITHUB_TOKEN -e GITHUB_STEP_SUMMARY c2pa-memray-$(PERF_ENV) python -m tests.perf.run_profile $(SCENARIO_ARG) $(PERF_ARGS) + docker run --rm -v $(PWD):/workspace $(GH_SUMMARY_MOUNT) -e PYTHONPATH=/workspace/src -e PERF_ENV=$(PERF_ENV) -e MEMRAY_ITERATIONS=$(MEMRAY_ITERATIONS) -e MEMRAY_THRESHOLD=$(MEMRAY_THRESHOLD) -e GITHUB_TOKEN -e GITHUB_STEP_SUMMARY c2pa-perf-$(PERF_ENV) python -m tests.perf.memory.run_profile $(SCENARIO_ARG) $(PERF_ARGS) @echo "" - @echo "Reports written to tests/perf/reports/" - @echo "Open tests/perf/reports/-{peak,leaks,temporary}.html in a browser" + @echo "Reports written to tests/perf/memory/reports/" + @echo "Open tests/perf/memory/reports/-{peak,leaks,temporary}.html in a browser" .PHONY: clean-memory-perf-reports clean-memory-perf-reports: - rm -f tests/perf/reports/*.html tests/perf/reports/*.bin - @echo "Cleared tests/perf/reports/" + rm 
-f tests/perf/memory/reports/*.html tests/perf/memory/reports/*.bin + @echo "Cleared tests/perf/memory/reports/" + +# Runs CPU benchmarks (timing metrics+py-spy flamegraphs). +# Pre-requisite: Docker image built using `make perf-image-rebuild`. +.PHONY: cpu-bench +cpu-bench: + docker run --rm --cap-add SYS_PTRACE --security-opt seccomp=unconfined -v $(PWD):/workspace $(GH_SUMMARY_MOUNT) -e PYTHONPATH=/workspace/src -e PERF_ENV=$(PERF_ENV) -e CPU_ITERATIONS=$(CPU_ITERATIONS) -e CPU_THRESHOLD=$(CPU_THRESHOLD) -e CPU_REPEATS=$(CPU_REPEATS) -e PERF_DISABLE_TSA -e PYSPY_RATE=$(PYSPY_RATE) -e PYSPY_FORMAT -e GITHUB_TOKEN -e GITHUB_STEP_SUMMARY c2pa-perf-$(PERF_ENV) python -m tests.perf.cpu.run_profile --mode $(CPU_MODE) $(SCENARIO_ARG) $(PERF_ARGS) + @echo "" + @echo "Reports written to tests/perf/cpu/reports/" + +.PHONY: clean-cpu-perf-reports +clean-cpu-perf-reports: + rm -f tests/perf/cpu/reports/*.svg tests/perf/cpu/reports/*.speedscope.json + @echo "Cleared tests/perf/cpu/reports/" diff --git a/requirements-dev.txt b/requirements-dev.txt index ae6c7a61..083439e7 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -6,7 +6,6 @@ toml==0.10.2 # For reading pyproject.toml files # Testing dependencies pytest>=8.1.0 -pytest-benchmark>=5.1.0 # for downloading the library artifacts requests>=2.0.0 diff --git a/tests/benchmark.py b/tests/benchmark.py deleted file mode 100644 index c576c272..00000000 --- a/tests/benchmark.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright 2025 Adobe. All rights reserved. -# This file is licensed to you under the Apache License, -# Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0) -# or the MIT license (http://opensource.org/licenses/MIT), -# at your option. - -# Unless required by applicable law or agreed to in writing, -# this software is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR REPRESENTATIONS OF ANY KIND, either express or -# implied. 
See the LICENSE-MIT and LICENSE-APACHE files for the -# specific language governing permissions and limitations under -# each license. - -import os -import io -import json -import shutil -from c2pa import Reader, Builder, Signer, C2paSigningAlg, C2paSignerInfo - -PROJECT_PATH = os.getcwd() - -# Test paths -test_path = os.path.join(PROJECT_PATH, "tests", "fixtures", "C.jpg") -temp_dir = os.path.join(PROJECT_PATH, "tests", "temp") -output_path = os.path.join(temp_dir, "python_out.jpg") - -# Ensure temp directory exists -os.makedirs(temp_dir, exist_ok=True) - -manifestDefinition = { - "claim_generator": "python_test", - "claim_generator_info": [{ - "name": "python_test", - "version": "0.0.1", - }], - "format": "image/jpeg", - "title": "Python Test Image", - "ingredients": [], - "assertions": [ - { - "label": "c2pa.actions", - "data": { - "actions": [ - { - "action": "c2pa.created", - "digitalSourceType": "http://cv.iptc.org/newscodes/digitalsourcetype/digitalCreation" - } - ] - } - } - ] -} - -# Load private key and certificates -private_key = open("tests/fixtures/ps256.pem", "rb").read() -certs = open("tests/fixtures/ps256.pub", "rb").read() - -# Create a local Ps256 signer with certs and a timestamp server -signer_info = C2paSignerInfo( - alg=b"ps256", - sign_cert=certs, - private_key=private_key, - ta_url=b"http://timestamp.digicert.com" -) -signer = Signer.from_info(signer_info) -builder = Builder(manifestDefinition) - -# Load source image -source = open(test_path, "rb").read() - -# Run the benchmark: python -m pytest tests/benchmark.py -v - - -def test_files_read(): - """Benchmark reading a C2PA asset from a file.""" - with open(test_path, "rb") as f: - reader = Reader("image/jpeg", f) - result = reader.json() - reader.close() - assert result is not None - # Parse the JSON string into a dictionary - result_dict = json.loads(result) - # Additional assertions to verify the structure of the result - assert "active_manifest" in result_dict - assert "manifests" in 
result_dict - assert "validation_state" in result_dict - assert result_dict["validation_state"] == "Valid" - - -def test_streams_read(): - """Benchmark reading a C2PA asset from a stream.""" - with open(test_path, "rb") as file: - source = file.read() - reader = Reader("image/jpeg", io.BytesIO(source)) - result = reader.json() - reader.close() - assert result is not None - # Parse the JSON string into a dictionary - result_dict = json.loads(result) - # Additional assertions to verify the structure of the result - assert "active_manifest" in result_dict - assert "manifests" in result_dict - assert "validation_state" in result_dict - assert result_dict["validation_state"] == "Valid" - - -def test_files_build(): - """Benchmark building a C2PA asset from a file.""" - # Delete the output file if it exists - if os.path.exists(output_path): - os.remove(output_path) - with open(test_path, "rb") as source_file: - with open(output_path, "w+b") as dest_file: - builder.sign(signer, "image/jpeg", source_file, dest_file) - - -def test_streams_build(): - """Benchmark building a C2PA asset from a stream.""" - output = io.BytesIO(bytearray()) - with open(test_path, "rb") as source_file: - builder.sign(signer, "image/jpeg", source_file, output) - - -def test_files_reading(benchmark): - """Benchmark file-based reading.""" - benchmark(test_files_read) - - -def test_streams_reading(benchmark): - """Benchmark stream-based reading.""" - benchmark(test_streams_read) - - -def test_files_builder_signer_benchmark(benchmark): - """Benchmark file-based building.""" - benchmark(test_files_build) - - -def test_streams_builder_benchmark(benchmark): - """Benchmark stream-based building.""" - benchmark(test_streams_build) - - -def teardown_module(module): - """Clean up temporary files after all tests.""" - if os.path.exists(temp_dir): - shutil.rmtree(temp_dir) diff --git a/tests/perf/Dockerfiles/python-3.10-slim-perf-Dockerfile b/tests/perf/Dockerfiles/python-3.10-slim-perf-Dockerfile index 
100a082f..55b3a89e 100644 --- a/tests/perf/Dockerfiles/python-3.10-slim-perf-Dockerfile +++ b/tests/perf/Dockerfiles/python-3.10-slim-perf-Dockerfile @@ -13,10 +13,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ COPY requirements.txt requirements-dev.txt ./ RUN pip install --no-cache-dir -r requirements.txt -r requirements-dev.txt -RUN pip install --no-cache-dir memray==1.19.3 +RUN pip install --no-cache-dir memray==1.19.3 py-spy==0.4.2 COPY tests/perf/entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh ENTRYPOINT ["/entrypoint.sh"] -CMD ["python", "-m", "tests.perf.run_profile"] +CMD ["python", "-m", "tests.perf.memory.run_profile"] diff --git a/tests/perf/Dockerfiles/python-3.12-slim-perf-Dockerfile b/tests/perf/Dockerfiles/python-3.12-slim-perf-Dockerfile index 03968dbc..0691a126 100644 --- a/tests/perf/Dockerfiles/python-3.12-slim-perf-Dockerfile +++ b/tests/perf/Dockerfiles/python-3.12-slim-perf-Dockerfile @@ -13,10 +13,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ COPY requirements.txt requirements-dev.txt ./ RUN pip install --no-cache-dir -r requirements.txt -r requirements-dev.txt -RUN pip install --no-cache-dir memray==1.19.3 +RUN pip install --no-cache-dir memray==1.19.3 py-spy==0.4.2 COPY tests/perf/entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh ENTRYPOINT ["/entrypoint.sh"] -CMD ["python", "-m", "tests.perf.run_profile"] +CMD ["python", "-m", "tests.perf.memory.run_profile"] diff --git a/tests/perf/Dockerfiles/ubuntu-22.04-perf-Dockerfile b/tests/perf/Dockerfiles/ubuntu-22.04-perf-Dockerfile index 649422ac..969d2fd5 100644 --- a/tests/perf/Dockerfiles/ubuntu-22.04-perf-Dockerfile +++ b/tests/perf/Dockerfiles/ubuntu-22.04-perf-Dockerfile @@ -22,10 +22,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ COPY requirements.txt ./ RUN pip3 install --no-cache-dir -r requirements.txt -RUN pip3 install --no-cache-dir memray==1.19.3 requests==2.34.2 +RUN pip3 install 
--no-cache-dir memray==1.19.3 py-spy==0.4.2 requests==2.34.2 COPY tests/perf/entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh ENTRYPOINT ["/entrypoint.sh"] -CMD ["python", "-m", "tests.perf.run_profile"] +CMD ["python", "-m", "tests.perf.memory.run_profile"] diff --git a/tests/perf/Dockerfiles/ubuntu-24.04-perf-Dockerfile b/tests/perf/Dockerfiles/ubuntu-24.04-perf-Dockerfile index 0fd3a523..8e8134a1 100644 --- a/tests/perf/Dockerfiles/ubuntu-24.04-perf-Dockerfile +++ b/tests/perf/Dockerfiles/ubuntu-24.04-perf-Dockerfile @@ -22,10 +22,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ COPY requirements.txt ./ RUN pip3 install --no-cache-dir --break-system-packages -r requirements.txt -RUN pip3 install --no-cache-dir --break-system-packages memray==1.19.3 requests==2.34.2 +RUN pip3 install --no-cache-dir --break-system-packages memray==1.19.3 py-spy==0.4.2 requests==2.34.2 COPY tests/perf/entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh ENTRYPOINT ["/entrypoint.sh"] -CMD ["python", "-m", "tests.perf.run_profile"] +CMD ["python", "-m", "tests.perf.memory.run_profile"] diff --git a/tests/perf/README.md b/tests/perf/README.md index 1e2baf41..55c6c35f 100644 --- a/tests/perf/README.md +++ b/tests/perf/README.md @@ -1,265 +1,41 @@ -# Memory profiling framework +# Performance benchmarks -Uses [memray](https://github.com/bloomberg/memray) to track peak memory, allocation patterns, -and memory leaks across c2pa-python SDK operations. +## Overview -## Files +Two benchmark frameworks share the scenario set and Docker environments in this folder: -| File | Purpose | -| --- | --- | -| `scenarios.py` | Functions that exercise each profiling scenario. Imported by `run_profile.py`. | -| `run_profile.py` | Memory performance/usage analysis. Runs each scenario under `memray`, generates HTML reports, reads metrics, and compares against `baseline.json`. | -| `Dockerfiles/` | One Dockerfile per target environment. 
Selected via `PERF_ENV` at `make` time when running the memory analysis. | -| `entrypoint.sh` | Container entrypoint. Downloads the Linux native `libc2pa_c.so` at startup into the volume-mounted workspace so it sticks around even through the `-v` mount. | -| `reports/` | Generated HTML reports (gitignored). Three files per scenario: `-peak.html` (peak/high-water view), `-leaks.html` (leak view), and `-temporary.html` (temporary-allocations view). | +- `scenarios.py`: the scenario functions and the `SCENARIOS` registry. +- `Dockerfiles/`: one image per target environment (selected with `PERF_ENV`), containing both memray and py-spy. +- `entrypoint.sh`: container entrypoint that adds the Linux native library used by the wheel at startup. -## Scenarios - -Each scenario loops multiple times so leaks accumulate and become visible in the leaks flamegraph and the memory use graph (defaults to 100). Change the count of iterations when running by setting the `MEMRAY_ITERATIONS` variable (the Makefile forwards it into the container): - -```bash -make memory-use-bench MEMRAY_ITERATIONS=1000 -``` - -Most scenarios use the Context API: they build a `Context` once and reuse it across iterations, so its settings are parsed a single time. The jpeg and png cases also keep a `_legacy` variant that builds the `Reader`/`Builder` without a `Context`, which re-reads the thread-local settings on each construction. Running a pair (for example `builder_sign_jpeg_legacy` and `builder_sign_jpeg_with_context`) compares the two paths. - -The `builder_sign_{jpeg,png}_parallel_*` scenarios build one `Context` and share it across 10 threads that sign concurrently, each with its own streams and `Builder`. The name encodes two axes. `split` divides the iteration budget across the threads, so total work matches a single-threaded scenario; `full` runs the full loop on each of the 10 threads, so total work is 10x (use these with `SCENARIO=` rather than the whole suite). 
`pool` runs the threads through a `ThreadPoolExecutor`; `barrier` starts all 10 at once with a `threading.Barrier`. - -## Environments - -Select the target environment with `PERF_ENV` (default: `python-3.12-slim`): - -| `PERF_ENV` value | Base image | Python | Native symbols | +| Folder | Tool | Measures | Docs | | --- | --- | --- | --- | -| `python-3.12-slim` | `python:3.12-slim` | 3.12 | interpreter frames unresolved | -| `python-3.10-slim` | `python:3.10-slim` | 3.10 | interpreter frames unresolved | -| `ubuntu-22.04` | `ubuntu:22.04` | 3.10 (apt default) | resolved (`python3-dbg`) | -| `ubuntu-24.04` | `ubuntu:24.04` | 3.12 (apt default) | resolved (`python3-dbg`) | - -The slim images run a source-built `/usr/local/bin/python` that ships stripped, and Debian's `python3-dbg` targets a different binary (build-id mismatch), so memray cannot resolve the interpreter's native (C) frames there. You will see a "No debug information was found for the Python interpreter" warning, and native traces may lack file names and line numbers. The ubuntu images install `python3-dbg` for the matching apt interpreter, so their native flamegraphs are fully symbolized. Use an `ubuntu-*` `PERF_ENV` when you need resolved native traces. 
- -## Running (via Docker) - -```bash -# First run (if there is no baseline.json): establishes baseline.json -make memory-use-bench - -# Subsequent runs: compares against baseline, fails if >10% regression -make memory-use-bench - -# Refresh baseline after an intentional memory change -make memory-use-bench PERF_ARGS=--update-baseline - -# Run against a different runner environment -make memory-use-bench PERF_ENV=ubuntu-24.04 - -# Run a single scenario instead of the whole suite -make memory-use-bench SCENARIO=builder_sign_gif - -# Refresh just one scenario's baseline entry (others are preserved) -make memory-use-bench SCENARIO=builder_sign_gif PERF_ARGS=--update-baseline - -# Remove all generated HTML reports -make clean-memory-perf-reports -``` - -The trailing `VAR=value` arguments (e.g. `PERF_ENV=ubuntu-24.04`, `PERF_ARGS=--update-baseline`) are `make` variable overrides, not shell env vars. `make` parses `word=value` argument as a variable assignment. Each overrides a `?=` default in the Makefile, and the recipe interpolates them into the `docker build`/`docker run` commands. See [Configuration](#configuration) for the full list and what each forwards to. - -Reports are written to `tests/perf/reports/` on the local machine. Three HTML files per scenario, one per suffix (described below). Open any in a browser. After a run, the run also reports if the scenarios were or were not all within baseline threshold (baseline +10% memory use tolerance). - -## Running in CI - -The `.github/workflows/memory-benchmark.yml` workflow runs the Docker-based benchmarks on a PR, but only when the PR has the `check-memory-benchmark` label. 
This runs `make memory-use-bench`, so: +| [`memory/`](memory/) | [memray](https://github.com/bloomberg/memray) | peak memory, leaks, allocations | [memory/README.md](memory/README.md) | +| [`cpu/`](cpu/) | [py-spy](https://github.com/benfred/py-spy) | wall/CPU time, CPU flamegraphs | [cpu/README.md](cpu/README.md) | -- A regression (peak or leaked > baseline +10%) makes the benchmark job exit non-zero. -- A values report table is written to the job's Step Summary. -- All three flamegraph HTML views per scenario are uploaded as the `memray-flamegraphs` artifact. +## Why two frameworks? -The gate only acts as regression test once a `tests/perf/baseline.json` is committed on the branch. Without one, `run_profile.py` treats the run as baseline creation (exits 0, no gating). +Each framework focuses on different indicators: -## Report views +- **`memory/`** tracks memory usage, not time: peak RSS, leaks, temporary-allocation churn. Catches a change that holds more memory at once or leaks with iteration count, even if it runs just as fast. +- **`cpu/`** tracks time, not memory usage: wall/CPU seconds plus a flamegraph of where cycles go. Catches a slowdown and the call site causing it, even if memory use is unchanged. -Each scenario produces three [memray flamegraphs](https://bloomberg.github.io/memray/flamegraph.html). All three are flamegraphs of the same run. They differ only in which allocations they count. +Both run inside the Docker perf image (`../Dockerfiles/`), with a fixed Python version, fixed OS, fixed dependency set, and no other host processes competing for CPU/memory. That isolation is why they carry a committed `baseline.json`: same environment every run, so a delta means the code changed, not that the host did. -### `-peak.html`: peak/high-water view +## Quickstart -What it shows: allocations that were simultaneously alive at the moment the process used the most memory (the high-water mark). 
- -Why it's useful: tells you what drives the largest memory footprint, the working set you must hold at once. Consult this view when you care about peak RSS or OOM headroom. - -How to read it: the widest frames are the biggest contributors to peak. Walk up a wide column to the top frame to find the call site holding that memory at the high-water instant. - -### `-leaks.html`: leak view - -What it shows: memory that was allocated but never freed before tracking stopped (`memray --leaks`). - -Why it's useful: finds memory leaks, meaning memory that grows with work done. It is never zero, because one-time static setup (the native `libc2pa_c` library loading global structures that live for the whole process) shows as "never freed." A real leak is one that scales with iterations. Profile at `MEMRAY_ITERATIONS=100` and `=1000` and compare: flat means static overhead, growing means a leak. See [Why is leaked_bytes not zero?](#why-is-leaked_bytes-not-zero). - -How to read it: a wide frame here is unfreed memory. If its width grows when you raise the iteration count, that top frame is the leaking call site. - -### `-temporary.html`: temporary-allocations view - -What it shows: short-lived churn, meaning memory allocated and then freed almost immediately (memray's threshold: freed before more than one other allocation happens). - -Why it's useful: temporary allocations are not leaks, since the memory is returned, but high allocation and free turnover costs CPU and can fragment the heap. This view surfaces hot per-call churn that the peak and leak views hide, because those objects are freed between iterations and so barely register at the high-water mark. Use it when a loop allocates too much. - -How to read it: wide frames are the biggest sources of throwaway allocations. The view may be sparse or empty for a scenario that does little churn, which is itself a valid result. See [Temporary allocations](#temporary-allocations). 
- -The temporary view is the heaviest to render: memray holds every allocation and free to decide which are short-lived. On a very large capture (a long run, a high `MEMRAY_ITERATIONS`, or a churn-heavy scenario) the render can run out of memory and fail. The run does not abort in that case; it records what failed and keeps going. See [Troubleshooting](#troubleshooting). - -## Running without Docker (if memray is supported and installed locally) - -```bash -pip install memray -python -m tests.perf.run_profile -``` - -Run a single scenario (useful for generating data for one operation without the full suite): +Building the Docker images is a pre-requisite to run the benchmarks: ```bash -python -m tests.perf.run_profile --scenario builder_sign_gif +make perf-image-rebuild ``` -With `--update-baseline`, a single-scenario run only rewrites that scenario's entry in `baseline.json`; the other scenarios' entries are preserved. +To run the benchmarks: ```bash -python -m tests.perf.run_profile --scenario builder_sign_gif --update-baseline -``` - -## Configuration - -With `make memory-use-bench VAR=value` you set the **`make` variable** and the Makefile forwards it as shown in the "Forwarded as" column. Running `run_profile.py` without Docker, you set the **env var** (or pass the CLI arg) directly. - -| `make` variable | Forwarded as | Default | Description | -| --- | --- | --- | --- | -| `PERF_ENV` | `PERF_ENV` env var | `python-3.12-slim` | Target environment; selects the Dockerfile, tags report filenames (`--.html`), recorded in `baseline.json` `_meta`. See [Environments](#environments). | -| `MEMRAY_ITERATIONS` | `MEMRAY_ITERATIONS` env var | `100` | Loop count per scenario. | -| `MEMRAY_THRESHOLD` | `MEMRAY_THRESHOLD` env var | `1.1` | Regression multiplier (1.1 = 10% tolerance). | -| `SCENARIO` | `--scenario` CLI arg | _(all)_ | Run a single scenario (e.g. `SCENARIO=builder_sign_jpeg`). 
| -| `PERF_ARGS` | passed straight through | _(none)_ | Extra `run_profile.py` args (e.g. `PERF_ARGS=--update-baseline`). | - -`PERF_SCENARIO` is an additional env var, but internal: the runner sets it per scenario so the loop can label its progress. Not user-configurable. - -Example to override iteration count: - -```bash -make memory-use-bench MEMRAY_ITERATIONS=1000 -``` - -## Reading baseline.json - -`baseline.json` is committed to the repo and reports following data for each scenario: - -```json -{ - "_meta": { - "memray_version": "1.19.3", - "python_version": "3.12.13", - "c2pa_native_version": "c2pa-v0.85.0", - "iterations": 100, - "perf_env": "python-3.12-slim", - "arch": "x86_64" - }, - "scenario_name": { - "peak_bytes": 62914560, - "leaked_bytes": 3271766, - "total_allocations": 12840 - }, - ... -} -``` - -The `_meta` block records which toolchain produced the baseline so the numbers are reproducible. It is provenance only and is never compared against. The regression check only looks at the per-scenario entries. - -| `_meta` field | Meaning | -| --- | --- | -| `memray_version` | memray version that generated the metrics | -| `python_version` | Python version that ran the test framework | -| `c2pa_native_version` | native `libc2pa_c` version (from `c2pa-native-version.txt`) | -| `iterations` | `MEMRAY_ITERATIONS` used for the run | -| `perf_env` | `PERF_ENV` (target environment) | -| `arch` | machine architecture (`platform.machine()`) | - -`peak_bytes`, `total_allocations` and the `arch`/`python`/`memray` versions are all environment-sensitive: a baseline is most meaningful when compared against a run from the same `_meta`. - -`peak_bytes` is the highest amount of memory in use at any single point during the scenario. - -`leaked_bytes` is memory that was allocated during the run but never freed before the process exited. Static allocations persist, since there are one-time loads such as the native library. 
- -`total_allocations` is the total number of individual memory allocation calls made. - -### Why is leaked_bytes not zero? - -You might expect the baseline to show `leaked_bytes: 0`. In practice it never does. When the c2pa native library (`libc2pa_c.so`) is first loaded, Rust sets up global data structures designed to live for the entire lifetime of the process. They get cleaned up when the process exits, which is after memray stops watching, so memray sees them as "never freed" even though they are not leaking. - -A memory leak grows proportionally with work done. If you sign 50 images and get 3.2 MB leaked, then sign 1000 images and still get 3.2 MB leaked, that 3.2 MB is static one-time overhead rather than a leak, since it does not grow with the work that ran. If signing 1000 images gave you 64 MB leaked, that would be a leak, as the leaked memory grows with the work executed. - -The baseline captures this expected static overhead. Future runs compare against it: if `leaked_bytes` grows beyond the baseline by more than 10%, the run fails. - -The framework runs `gc.collect()` twice after the scenario finishes, while memray is still tracking. Without that sweep, objects sitting in not-yet-collected reference cycles would be counted in `leaked_bytes` and the number would depend on garbage collector timing rather than on actual leaks. With it, `leaked_bytes` means memory that is still allocated even though nothing in Python can reach it: true leaks plus the one-time static overhead described above. - -### How to confirm no leak exists? - -Run with a higher iteration count than default (100) and compare: - -```bash -make memory-use-bench MEMRAY_ITERATIONS=1000 PERF_ARGS=--update-baseline -``` - -If `leaked_bytes` stays flat compared to a baseline run or in a larger run (more iterations), there is no leak. If it scales with iterations, open `tests/perf/reports/-leaks.html` in a browser to see which function is responsible. 
- -### Reading the "Resident set size over time" graph (why memory looks like it climbs) - -The "Resident set size over time" plot (chart icon, top-right of the report) draws two lines. "Resident size" (RSS) is every page the OS counts as resident: interpreter and pages the allocator holds but has not returned. "Heap size" is only the live tracked allocations. - -On the parallel scenarios the RSS line steps up and stays high. The threads each hold their own source, output, and `Builder` live at once, so RSS rises to cover that combined working set (the steps line up with the moments all threads overlap). The allocator then keeps those arena pages for reuse instead of returning them, so RSS plateaus at the high-water mark. - -Judge leaks by the heap line. The heap rises early and then settles or falls, the same shape as the single-threaded baseline. A within-run heap rise is not by itself proof of a leak (the allocator high-water can climb and settle within a bounded run). - -### Temporary allocations - -`-temporary.html` shows temporary allocations, meaning memory that is allocated and then freed almost immediately (memray's threshold is one allocation: a block is temporary if it is freed before more than one other allocation happens). The memory is returned, so these are not leaks, but they are churn: high allocation and free turnover that costs CPU and can fragment the heap. A scenario doing lots of short-lived work can show heavy temporary allocations while `leaked_bytes` stays flat. - -### When to update the baseline - -Update `baseline.json` after any intentional change that affects memory use: - -```bash -make memory-use-bench PERF_ARGS=--update-baseline -``` - -Commit the updated `baseline.json` alongside the code change, so it becomes the new reference to compare against. - -## Troubleshooting - -### A flamegraph render fails with `exit -9` - -You may see a message like `flamegraph render failed for reader_mp4-...-temporary.html (killed (likely OOM))`. 
The `-9` is SIGKILL: the operating system's out-of-memory killer terminated the `memray flamegraph` subprocess. The temporary view is the heaviest to render, and on a large capture (a long run, a high `MEMRAY_ITERATIONS`, or a churn-heavy scenario such as `reader_mp4`) it can exhaust available memory. - -The run does not abort. The capture and the metrics (`peak_bytes`, `leaked_bytes`, `total_allocations`) are read separately and are still recorded, the baseline is still written, and the run lists every failed render at the end. Only the HTML render is missing, and you have two ways to regenerate it. - -#### Option A: rerun the one scenario - -A single-scenario run renders one capture at a time with nothing else resident, so it often fits where the full suite did not: - -```bash -make memory-use-bench SCENARIO=reader_mp4 -``` - -If it still runs out of memory, lower the iteration count to shrink the capture: - -```bash -make memory-use-bench SCENARIO=reader_mp4 MEMRAY_ITERATIONS=20 -``` - -A lower iteration count makes that scenario's absolute allocation numbers no longer directly comparable to a full 100-iteration run. - -#### Option B: re-render the kept capture (no re-profiling) - -When a render fails, the run keeps that scenario's capture as `reports/-.bin`. Re-render just the failed view from that file with a higher temporary-allocation threshold, which cuts how much memray holds in memory so the render fits. 
This uses the original run's data, so the result stays comparable to the rest of the run: +# memory benchmark +make memory-use-bench -```bash -python3 -m memray flamegraph reports/reader_mp4-python-3.12-slim.bin \ - -o reports/reader_mp4-python-3.12-slim-temporary.html \ - --temporary-allocations --temporary-allocation-threshold=10 --force + # cpu benchmark +make cpu-bench ``` diff --git a/tests/perf/cpu/README.md b/tests/perf/cpu/README.md new file mode 100644 index 00000000..4d788b76 --- /dev/null +++ b/tests/perf/cpu/README.md @@ -0,0 +1,92 @@ +# CPU profiling framework + +Uses [py-spy](https://github.com/benfred/py-spy) to profile where CPU time goes across c2pa-python operations, plus plain timing measurements to track wall/CPU time per scenario against a baseline. + +## Files + +| File | Purpose | +| --- | --- | +| `../scenarios.py` | Functions that exercise each profiling scenario. Imported by `run_profile.py`. | +| `run_profile.py` | CPU analysis. Times each scenario in a plain child process, renders a py-spy flamegraph per scenario, and compares timings against `baseline.json`. | +| `baseline.json` | Committed reference timings (`_meta` provenance block + per-scenario `wall_seconds`, `cpu_seconds`, `children_cpu_seconds`). | +| `../Dockerfiles/` | One Dockerfile per target environment (shared with the memory benchmark). Selected via `PERF_ENV` at `make` time. | +| `../entrypoint.sh` | Container entrypoint (shared). Downloads the Linux native `libc2pa_c.so` at startup. | +| `reports/` | Generated profiles (gitignored). One file per scenario: `-cpu.svg` (or `.speedscope.json` with `PYSPY_FORMAT=speedscope`). | + +## Approach: two passes per scenario + +1. Timing pass: the scenario runs in a plain child process with no profiler attached. 
The child measures three metrics around the scenario call only, excluding interpreter startup, and hands the result back as JSON: `wall_seconds` (`time.perf_counter`), `cpu_seconds` (`time.process_time`, process-wide so the thread-pool scenarios count all threads), and `children_cpu_seconds` (`resource.getrusage(RUSAGE_CHILDREN)`, the CPU burned in forked children, which `process_time` cannot see; relevant for the `fork_*` scenarios). These numbers feed the baseline comparison. +2. Profile pass: the scenario runs again under `py-spy record` to produce a flamegraph. Profile numbers never feed the baseline, since sampling adds rate-dependent overhead that would bake profiler cost into the timings. + +Scenario children run with `PERF_DISABLE_TSA=1` by default, so signing scenarios skip the network round-trip to the timestamp authority that every `sign` call otherwise makes. Timings then measure code, not network latency. Pass `PERF_DISABLE_TSA=0` to restore the TSA call. The memory benchmark is unaffected either way; it keeps TSA on. + +When a scenario's first timing pass finishes in under 1 second, the harness runs 4 more passes and records the median of each metric, because single-shot timing at that scale is mostly jitter. Set `CPU_REPEATS=N` to force a fixed pass count instead. + +Locally the default `--mode all` runs both passes back to back. In CI they run as two parallel jobs on separate runners (`--mode timing` / `--mode profile`), so sampling never contends with the timed run and wall-clock cost stays at one pass. + +## CI/CD report + +Timings on shared CI runners are noisy (±10-20% wall-clock swing is normal). The harness is therefore report-only: baseline deltas are printed and shown in the CI step summary, and rows over the threshold get a `drift` status, but the run always exits 0.
+ +## Running + +The benchmarks run inside Docker and need the perf Docker image: + +```bash +make perf-image-rebuild # once, or after dependency changes +``` + +Once the image is built, run the benchmarks: + +```bash +# all scenarios, timing + flamegraphs +make cpu-bench + +# run only one scenario +make cpu-bench SCENARIO=builder_sign_gif + +# timings only +make cpu-bench CPU_MODE=timing + +# flamegraphs only +make cpu-bench CPU_MODE=profile + +# update baselines +make cpu-bench PERF_ARGS=--update-baseline +``` + +The `cpu-bench` target runs the container with `--cap-add SYS_PTRACE --security-opt seccomp=unconfined`. `py-spy` samples the child process via ptrace and `process_vm_readv`, which the default container security profile blocks. + +## Variables + +| Variable | Default | Meaning | +| --- | --- | --- | +| `CPU_ITERATIONS` | `100` | Loop count per scenario (the Makefile forwards it into the container). | +| `CPU_THRESHOLD` | `1.25` | Drift multiplier vs baseline (1.25 = +25%) used for the `drift` status. | +| `CPU_MODE` | `all` | `all`, `timing`, or `profile` (maps to `--mode`). | +| `CPU_REPEATS` | `0` | Fixed timing pass count per scenario, median recorded. `0` = adaptive: 1 pass, extended to 5 when the first pass runs under 1 s. | +| `PERF_DISABLE_TSA` | `1` (set by the harness) | Skip the timestamp-authority network call during signing. Set `0` to restore it. | +| `PYSPY_RATE` | `100` | Sampling rate in Hz. Raise (up to ~500) if flamegraphs for fast scenarios look sparse. | +| `PYSPY_FORMAT` | `flamegraph` | `flamegraph` writes self-contained SVGs; `speedscope` writes JSON for [speedscope.app](https://www.speedscope.app/). | +| `PERF_ENV` | `python-3.12-slim` | Which Docker environment to use (see `../Dockerfiles/`). | +| `SCENARIO` | unset | Run a single scenario. | +| `PERF_ARGS` | unset | Extra args for `run_profile.py`, e.g. `--update-baseline`. | + +## Interpreting the flamegraphs + +Profiles show Python frames only. 
Time spent inside the Rust `libc2pa_c` library is attributed to the Python frame that made the FFI call. Fast scenarios at the default 100 Hz can produce thin profiles; raise `PYSPY_RATE` or `CPU_ITERATIONS` for more samples. + +## Updating the baseline + +Run with `--update-baseline` on. Single-scenario updates merge into the existing file and warn if the environment (`_meta`) does not match the other entries. + +Without a committed `baseline.json`, a run creates one and reports no deltas. + +## CI + +`.github/workflows/cpu-benchmark.yml` runs on PRs labeled `check-cpu-benchmark`, on `ubuntu-24.04-arm` to match the baseline arch. + +Two parallel jobs run for this workflow: + +- `cpu-timing` measures timing metrics; the baseline comparison table lands in the job's step summary. +- `cpu-profile` renders the py-spy flamegraphs and uploads them as the `pyspy-cpu-flamegraphs` artifact. diff --git a/tests/perf/cpu/__init__.py b/tests/perf/cpu/__init__.py new file mode 100644 index 00000000..17f51950 --- /dev/null +++ b/tests/perf/cpu/__init__.py @@ -0,0 +1 @@ +# Empty placeholder file to facilitate imports diff --git a/tests/perf/cpu/baseline.json b/tests/perf/cpu/baseline.json new file mode 100644 index 00000000..e28d017b --- /dev/null +++ b/tests/perf/cpu/baseline.json @@ -0,0 +1,225 @@ +{ + "_meta": { + "pyspy_version": "0.4.2", + "python_version": "3.12.13", + "c2pa_native_version": "c2pa-v0.89.0", + "iterations": 100, + "perf_env": "python-3.12-slim", + "arch": "aarch64" + }, + "reader_jpeg_legacy": { + "wall_seconds": 0.2208, + "cpu_seconds": 0.2163, + "children_cpu_seconds": 0.0 + }, + "reader_jpeg_with_context": { + "wall_seconds": 0.2208, + "cpu_seconds": 0.2165, + "children_cpu_seconds": 0.0 + }, + "reader_manifest_data_context": { + "wall_seconds": 0.0957, + "cpu_seconds": 0.0968, + "children_cpu_seconds": 0.0 + }, + "reader_mp4": { + "wall_seconds": 0.4446, + "cpu_seconds": 0.4392, + "children_cpu_seconds": 0.0 + }, + "reader_wav": { + "wall_seconds": 
0.1413, + "cpu_seconds": 0.1272, + "children_cpu_seconds": 0.0 + }, + "builder_sign_jpeg_legacy": { + "wall_seconds": 1.4077, + "cpu_seconds": 1.4082, + "children_cpu_seconds": 0.0 + }, + "builder_sign_jpeg_with_context": { + "wall_seconds": 1.3963, + "cpu_seconds": 1.3971, + "children_cpu_seconds": 0.0 + }, + "builder_sign_png_legacy": { + "wall_seconds": 1.8313, + "cpu_seconds": 1.8313, + "children_cpu_seconds": 0.0 + }, + "builder_sign_png_with_context": { + "wall_seconds": 1.7863, + "cpu_seconds": 1.7869, + "children_cpu_seconds": 0.0 + }, + "builder_sign_jpeg_parallel_split_pool": { + "wall_seconds": 0.3109, + "cpu_seconds": 2.0177, + "children_cpu_seconds": 0.0 + }, + "builder_sign_jpeg_parallel_split_barrier": { + "wall_seconds": 0.3068, + "cpu_seconds": 2.0781, + "children_cpu_seconds": 0.0 + }, + "builder_sign_png_parallel_split_pool": { + "wall_seconds": 0.6139, + "cpu_seconds": 2.8088, + "children_cpu_seconds": 0.0 + }, + "builder_sign_png_parallel_split_barrier": { + "wall_seconds": 0.6298, + "cpu_seconds": 2.8269, + "children_cpu_seconds": 0.0 + }, + "builder_sign_gif": { + "wall_seconds": 34.1958, + "cpu_seconds": 34.1939, + "children_cpu_seconds": 0.0 + }, + "builder_sign_heic": { + "wall_seconds": 0.1297, + "cpu_seconds": 0.1358, + "children_cpu_seconds": 0.0 + }, + "builder_sign_m4a": { + "wall_seconds": 0.3892, + "cpu_seconds": 0.395, + "children_cpu_seconds": 0.0 + }, + "builder_sign_webp": { + "wall_seconds": 2.0344, + "cpu_seconds": 2.0337, + "children_cpu_seconds": 0.0 + }, + "builder_sign_avi": { + "wall_seconds": 1.6515, + "cpu_seconds": 1.6517, + "children_cpu_seconds": 0.0 + }, + "builder_sign_mp4": { + "wall_seconds": 0.212, + "cpu_seconds": 0.2177, + "children_cpu_seconds": 0.0 + }, + "builder_sign_tiff": { + "wall_seconds": 3.422, + "cpu_seconds": 3.4215, + "children_cpu_seconds": 0.0 + }, + "builder_sign_jpeg_parent_of": { + "wall_seconds": 3.7998, + "cpu_seconds": 3.8144, + "children_cpu_seconds": 0.0 + }, + 
"builder_sign_jpeg_component_of": { + "wall_seconds": 3.8204, + "cpu_seconds": 3.8368, + "children_cpu_seconds": 0.0 + }, + "builder_sign_jpeg_parent_and_component": { + "wall_seconds": 8.2377, + "cpu_seconds": 5.8484, + "children_cpu_seconds": 0.0 + }, + "builder_sign_jpeg_parent_and_component_mixed_mime": { + "wall_seconds": 5.512, + "cpu_seconds": 5.5281, + "children_cpu_seconds": 0.0 + }, + "builder_sign_jpeg_two_components_same_mime": { + "wall_seconds": 8.0985, + "cpu_seconds": 5.8208, + "children_cpu_seconds": 0.0 + }, + "builder_sign_jpeg_two_components_mixed_mime": { + "wall_seconds": 5.5014, + "cpu_seconds": 5.5182, + "children_cpu_seconds": 0.0 + }, + "builder_sign_jpeg_archive_roundtrip": { + "wall_seconds": 3.8633, + "cpu_seconds": 3.8827, + "children_cpu_seconds": 0.0 + }, + "builder_to_archive_with_ingredient": { + "wall_seconds": 2.3895, + "cpu_seconds": 2.4024, + "children_cpu_seconds": 0.0 + }, + "builder_sign_jpeg_archive_roundtrip_ingredient_in_archive": { + "wall_seconds": 4.1713, + "cpu_seconds": 4.2044, + "children_cpu_seconds": 0.0 + }, + "builder_write_ingredient_archive": { + "wall_seconds": 2.3881, + "cpu_seconds": 2.401, + "children_cpu_seconds": 0.0 + }, + "builder_sign_jpeg_add_ingredient_from_archive": { + "wall_seconds": 1.7888, + "cpu_seconds": 1.8066, + "children_cpu_seconds": 0.0 + }, + "builder_ingredient_archive_roundtrip": { + "wall_seconds": 4.1485, + "cpu_seconds": 4.181, + "children_cpu_seconds": 0.0 + }, + "builder_sign_jpeg_two_ingredient_archives": { + "wall_seconds": 1.8626, + "cpu_seconds": 1.8852, + "children_cpu_seconds": 0.0 + }, + "reader_error_no_manifest": { + "wall_seconds": 0.0096, + "cpu_seconds": 0.0079, + "children_cpu_seconds": 0.0 + }, + "builder_error_invalid_manifest": { + "wall_seconds": 0.0028, + "cpu_seconds": 0.0016, + "children_cpu_seconds": 0.0 + }, + "reader_string_apis": { + "wall_seconds": 0.3259, + "cpu_seconds": 0.3286, + "children_cpu_seconds": 0.0 + }, + "fork_reader_collect": { + 
"wall_seconds": 0.559, + "cpu_seconds": 0.2634, + "children_cpu_seconds": 0.2941 + }, + "fork_contended_mutex": { + "wall_seconds": 7.2526, + "cpu_seconds": 24.1507, + "children_cpu_seconds": 2.1624 + }, + "fork_thread_local_orphan": { + "wall_seconds": 0.577, + "cpu_seconds": 0.2764, + "children_cpu_seconds": 0.3 + }, + "fork_gc_cycle": { + "wall_seconds": 0.694, + "cpu_seconds": 0.3994, + "children_cpu_seconds": 0.2928 + }, + "fork_parent_frees_after_fork": { + "wall_seconds": 4.5677, + "cpu_seconds": 4.4903, + "children_cpu_seconds": 0.038 + }, + "fork_child_sys_exit": { + "wall_seconds": 1.0942, + "cpu_seconds": 0.28, + "children_cpu_seconds": 0.8111 + }, + "fork_stream_cleanup": { + "wall_seconds": 0.2973, + "cpu_seconds": 0.02, + "children_cpu_seconds": 0.2834 + } +} \ No newline at end of file diff --git a/tests/perf/reports/.gitkeep b/tests/perf/cpu/reports/.gitkeep similarity index 100% rename from tests/perf/reports/.gitkeep rename to tests/perf/cpu/reports/.gitkeep diff --git a/tests/perf/cpu/run_profile.py b/tests/perf/cpu/run_profile.py new file mode 100644 index 00000000..f5f38c0a --- /dev/null +++ b/tests/perf/cpu/run_profile.py @@ -0,0 +1,443 @@ +#!/usr/bin/env python3 +# Copyright 2026 Adobe. All rights reserved. +# This file is licensed to you under the Apache License, +# Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +# or the MIT license (http://opensource.org/licenses/MIT), +# at your option. + +""" +CPU profiling harness using py-spy. + +For each scenario in scenarios.SCENARIOS this script runs up to two passes: +- Timing pass: runs the scenario in a plain child process and measures + wall_seconds (time.perf_counter) and cpu_seconds (time.process_time, + process-wide so thread-pool scenarios count all threads) around the + scenario call only, excluding interpreter startup. No profiler is + attached, so the numbers are free of sampling overhead. 
+- Profile pass: re-runs the scenario under `py-spy record` to produce a + flamegraph (SVG by default, speedscope JSON via PYSPY_FORMAT). Profile + numbers never feed the baseline; the artifact is diagnostic only. + +Results are compared against baseline.json (created on first run). The +comparison is REPORT-ONLY: over-threshold drift is printed and highlighted +in the CI step summary, but never fails the run. CPU timings on shared +runners are too noisy to gate CI; the memory benchmark is the gate. + +Usage: + python -m tests.perf.cpu.run_profile [--update-baseline] + [--scenario NAME] [--mode {all,timing,profile}] + +--mode exists so CI can run the two passes as parallel jobs on separate +runners: `timing` is unpolluted by py-spy CPU contention, `profile` only +produces flamegraphs. + +Environment variables: +- CPU_ITERATIONS: number of times each scenario loops (default: 100) +- CPU_THRESHOLD: drift multiplier, e.g. 1.25 for +25% (default: 1.25) +- CPU_REPEATS: fixed number of timing passes per scenario, median recorded + (default: adaptive — 1 pass, extended to 5 when the first pass finishes + under 1 second, where single-shot timing is mostly jitter) +- PERF_DISABLE_TSA: forwarded to scenario children; defaults to 1 here so + sign timings measure code rather than the network round-trip to the + timestamp authority. Pass PERF_DISABLE_TSA=0 to restore the TSA call. 
+- PYSPY_RATE: py-spy sampling rate in Hz (default: 100) +- PYSPY_FORMAT: 'flamegraph' (SVG, default) or 'speedscope' +""" + +import argparse +import json +import os +import shutil +import subprocess +import sys +import tempfile +from pathlib import Path + +import platform + +# Scenario name list +from tests.perf.scenarios import SCENARIO_NAMES + +HERE = Path(__file__).parent +REPO_ROOT = HERE.parent.parent.parent +REPORTS_DIR = HERE / "reports" +BASELINE_FILE = HERE / "baseline.json" + +ITERATIONS = int(os.environ.get("CPU_ITERATIONS", "100")) +THRESHOLD = float(os.environ.get("CPU_THRESHOLD", "1.25")) +REPEATS = int(os.environ.get("CPU_REPEATS", "0")) # 0 = adaptive +PYSPY_RATE = int(os.environ.get("PYSPY_RATE", "100")) +PYSPY_FORMAT = os.environ.get("PYSPY_FORMAT", "flamegraph") +PERF_ENV = os.environ.get("PERF_ENV", "") + +# Scenarios whose first pass finishes under this many seconds get re-run to +# _FAST_REPEATS total passes (median recorded): single-shot timing there is +# mostly jitter. A CPU_REPEATS value overrides the adaptive rule. +_FAST_WALL_SECONDS = 1.0 +_FAST_REPEATS = 5 + +# Metrics compared against the baseline. cpu_seconds is the primary signal +# (stable under runner load); wall_seconds also catches blocking/sleep +# regressions that never burn CPU. children_cpu_seconds (CPU burned in +# forked children, invisible to process_time) is reported but not part of +# the gate list. +_METRICS = ("wall_seconds", "cpu_seconds") +_ALL_METRICS = ("wall_seconds", "cpu_seconds", "children_cpu_seconds") +# Drift on children_cpu_seconds is only meaningful when the baseline value +# is non-trivial; below this, percent deltas are pure noise. +_CHILDREN_CPU_MIN_BASE = 0.01 + + +def _scenario_script(name: str, timing_json: Path | None = None) -> str: + """Child-process source that runs one scenario. + + When timing_json is given, the scenario call is bracketed with + perf_counter/process_time and the metrics are written to that file as + JSON. 
Writing to a file rather than stdout keeps scenario prints from + corrupting the metrics. + """ + body = f""" +import sys +sys.path.insert(0, "{REPO_ROOT}") +sys.path.insert(0, "{REPO_ROOT / 'src'}") +from tests.perf.scenarios import SCENARIOS +""" + if timing_json is None: + body += f""" +SCENARIOS["{name}"]({ITERATIONS}) +""" + else: + body += f""" +import json, resource, time +children_start = resource.getrusage(resource.RUSAGE_CHILDREN) +wall_start = time.perf_counter() +cpu_start = time.process_time() +SCENARIOS["{name}"]({ITERATIONS}) +wall = time.perf_counter() - wall_start +cpu = time.process_time() - cpu_start +children_end = resource.getrusage(resource.RUSAGE_CHILDREN) +children_cpu = ( + children_end.ru_utime - children_start.ru_utime + + children_end.ru_stime - children_start.ru_stime +) +with open("{timing_json}", "w") as fh: + json.dump({{"wall_seconds": wall, "cpu_seconds": cpu, + "children_cpu_seconds": children_cpu}}, fh) +""" + return body + + +def _child_env(name: str) -> dict: + """Environment for scenario child processes. + + TSA is disabled by default so sign timings measure code, not the network + round-trip to the timestamp authority; an explicit PERF_DISABLE_TSA=0 + from the caller wins. 
+ """ + env = {**os.environ, "PERF_SCENARIO": name} + env.setdefault("PERF_DISABLE_TSA", "1") + return env + + +def _run_timing_pass(name: str) -> dict: + """Run one scenario in a plain child process and read its timing metrics.""" + with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as tmp: + timing_json = Path(tmp.name) + try: + cmd = [sys.executable, "-c", _scenario_script(name, timing_json)] + result = subprocess.run(cmd, text=True, env=_child_env(name)) + if result.returncode != 0: + print(f" timing run failed for {name} (exit {result.returncode})", file=sys.stderr) + sys.exit(1) + metrics = json.loads(timing_json.read_text()) + finally: + timing_json.unlink(missing_ok=True) + return {k: round(v, 4) for k, v in metrics.items()} + + +def _median(values: list[float]) -> float: + ordered = sorted(values) + n = len(ordered) + mid = n // 2 + if n % 2: + return ordered[mid] + return (ordered[mid - 1] + ordered[mid]) / 2 + + +def _run_timing(name: str) -> dict: + """Run the timing pass, repeating fast scenarios, and record the median. + + A CPU_REPEATS value fixes the pass count; otherwise one pass, extended to + _FAST_REPEATS when the first pass finishes under _FAST_WALL_SECONDS. + """ + passes = [_run_timing_pass(name)] + if REPEATS > 0: + target = REPEATS + elif passes[0]["wall_seconds"] < _FAST_WALL_SECONDS: + target = _FAST_REPEATS + print(f" fast scenario, repeating ({target} passes, median)...", flush=True) + else: + target = 1 + while len(passes) < target: + passes.append(_run_timing_pass(name)) + if len(passes) == 1: + return passes[0] + return { + metric: round(_median([p[metric] for p in passes]), 4) + for metric in _ALL_METRICS + } + + +def _run_pyspy_pass(name: str, out_path: Path) -> bool: + """Re-run one scenario under py-spy record to produce a flamegraph. + + Launch mode (py-spy is the parent of the profiled process) avoids + pid-attach races and satisfies Yama ptrace_scope without host changes. 
+ A failed render does not abort the run: the timing metrics are recorded + separately and still good. + """ + pyspy = shutil.which("py-spy") + if pyspy is None: + print(f" py-spy not found on PATH; skipping profile for {name}", file=sys.stderr) + return False + cmd = [ + pyspy, "record", + "-o", str(out_path), + "--format", PYSPY_FORMAT, + "--rate", str(PYSPY_RATE), + "--subprocesses", + "--", sys.executable, "-c", _scenario_script(name), + ] + print(f" py-spy record ({PYSPY_FORMAT}, {PYSPY_RATE} Hz)...", flush=True) + result = subprocess.run(cmd, text=True, env=_child_env(name)) + if result.returncode != 0: + print(f" py-spy record failed for {name} (exit {result.returncode})", file=sys.stderr) + return False + return True + + +def _pyspy_version() -> str: + try: + out = subprocess.run(["py-spy", "--version"], capture_output=True, text=True) + # "py-spy 0.4.2" -> "0.4.2" + return out.stdout.strip().split()[-1] if out.returncode == 0 else "" + except OSError: + return "" + + +def _build_meta() -> dict: + """Provenance for the baseline: which toolchain produced these numbers. + Recorded so a committed baseline is reproducible under same conditions. + """ + native_version = "" + try: + native_version = (REPO_ROOT / "c2pa-native-version.txt").read_text().strip() + except OSError: + pass + return { + "pyspy_version": _pyspy_version(), + "python_version": platform.python_version(), + "c2pa_native_version": native_version, + "iterations": ITERATIONS, + "perf_env": PERF_ENV, + "arch": platform.machine(), + } + + +def _fmt_secs(s: float) -> str: + if s < 1: + return f"{s * 1000:.1f} ms" + return f"{s:.3f} s" + + +def _delta_pct(current: float, base: float) -> str: + """Signed percentage change vs baseline, or '-' when no baseline.""" + if not base: + return "-" + return f"{(current - base) / base * 100:+.1f}%" + + +def _write_github_summary(results: dict, baseline: dict) -> None: + """Append a values table to $GITHUB_STEP_SUMMARY when running in CI. 
+ """ + summary_path = os.environ.get("GITHUB_STEP_SUMMARY") + if not summary_path or not results: + return + + lines = [ + "## CPU benchmark (py-spy)", + "", + f"Iterations: {ITERATIONS} · drift threshold: +{(THRESHOLD - 1) * 100:.0f}%" + f" · report-only" + f"{f' · env: {PERF_ENV}' if PERF_ENV else ''}", + "", + "| scenario | wall | cpu | cpu/iter | child cpu | wall Δ% | cpu Δ% | status |", + "|----------|------|-----|----------|-----------|---------|--------|--------|", + ] + for name, m in results.items(): + b = baseline.get(name, {}) if baseline else {} + wall_base = b.get("wall_seconds", 0) + cpu_base = b.get("cpu_seconds", 0) + over = ( + (wall_base and m["wall_seconds"] > wall_base * THRESHOLD) + or (cpu_base and m["cpu_seconds"] > cpu_base * THRESHOLD) + ) + status = "drift" if over else "ok" + lines.append( + f"| {name} | {_fmt_secs(m['wall_seconds'])} " + f"| {_fmt_secs(m['cpu_seconds'])} " + f"| {_fmt_secs(m['cpu_seconds'] / ITERATIONS)} " + f"| {_fmt_secs(m.get('children_cpu_seconds', 0))} " + f"| {_delta_pct(m['wall_seconds'], wall_base)} " + f"| {_delta_pct(m['cpu_seconds'], cpu_base)} | {status} |" + ) + lines.append("") + + with open(summary_path, "a", encoding="utf-8") as fh: + fh.write("\n".join(lines) + "\n") + + +def main() -> None: + parser = argparse.ArgumentParser(description="c2pa-python CPU profiler") + parser.add_argument( + "--update-baseline", + action="store_true", + help="Overwrite baseline.json with current measurements and exit 0", + ) + parser.add_argument( + "--scenario", + choices=SCENARIO_NAMES, + default=None, + help="Run a single scenario instead of all of them. With --update-baseline, " + "only that scenario's entry in baseline.json is updated; the rest are kept.", + ) + parser.add_argument( + "--mode", + choices=("all", "timing", "profile"), + default="all", + help="'timing' measures metrics only, 'profile' renders py-spy flamegraphs " + "only, 'all' does both. 
CI runs the two as parallel jobs on separate " + "runners so sampling never contends with the timed run.", + ) + args = parser.parse_args() + + run_timing = args.mode in ("all", "timing") + run_profile = args.mode in ("all", "profile") + + if args.update_baseline and not run_timing: + parser.error("--update-baseline requires a mode that measures timing") + + scenarios_to_run = (args.scenario,) if args.scenario else SCENARIO_NAMES + + REPORTS_DIR.mkdir(parents=True, exist_ok=True) + + # prior_baseline: the existing file, always loaded so a single-scenario + # update can preserve the other scenarios' entries when it rewrites the file. + prior_baseline: dict = {} + + # baseline: the subset used for the drift comparison below, which is + # suppressed when --update-baseline is set (because we are re-baselining). + if BASELINE_FILE.exists(): + prior_baseline = json.loads(BASELINE_FILE.read_text()) + baseline: dict = {} if args.update_baseline else prior_baseline + + results: dict = {} + render_failures: list[str] = [] + + ext = ".svg" if PYSPY_FORMAT == "flamegraph" else ".speedscope.json" + total = len(scenarios_to_run) + for idx, name in enumerate(scenarios_to_run, 1): + print(f"\n=== [{idx}/{total}] {name} (iterations={ITERATIONS}) ===") + env_tag = f"-{PERF_ENV}" if PERF_ENV else "" + + if run_timing: + print(" timing...", flush=True) + metrics = _run_timing(name) + results[name] = metrics + print(f" wall: {_fmt_secs(metrics['wall_seconds'])}" + f" ({_fmt_secs(metrics['wall_seconds'] / ITERATIONS)}/iter)") + print(f" cpu: {_fmt_secs(metrics['cpu_seconds'])}" + f" ({_fmt_secs(metrics['cpu_seconds'] / ITERATIONS)}/iter)") + children_cpu = metrics.get("children_cpu_seconds", 0) + if children_cpu >= _CHILDREN_CPU_MIN_BASE: + print(f" child cpu: {_fmt_secs(children_cpu)}") + + if baseline and name in baseline: + b = baseline[name] + # children_cpu_seconds joins the drift check only when the + # baseline value is non-trivial (percent deltas on ~0 bases + # are pure noise). 
+ checked: list[str] = list(_METRICS) + if b.get("children_cpu_seconds", 0) >= _CHILDREN_CPU_MIN_BASE: + checked.append("children_cpu_seconds") + for metric in checked: + current = metrics.get(metric, 0) + base = b.get(metric, 0) + limit = base * THRESHOLD + if current <= limit: + continue + diff_pct = (current - base) / base * 100 if base else float("inf") + print( + f" drift note: {name}.{metric}: {_fmt_secs(current)} " + f"> baseline {_fmt_secs(base)}" + f" (+{diff_pct:.1f}%, threshold {(THRESHOLD-1)*100:.0f}%)", + flush=True, + ) + + if run_profile: + out_path = REPORTS_DIR / f"{name}{env_tag}-cpu{ext}" + if _run_pyspy_pass(name, out_path): + print(f" cpu profile: {out_path}") + else: + render_failures.append(name) + + if run_timing and (args.update_baseline or not prior_baseline): + # When running a single scenario, merge its result into the existing + # baseline so the other scenarios' entries are preserved. A full run + # replaces the file wholesale. + if args.scenario and prior_baseline: + output = dict(prior_baseline) + else: + output = {} + new_meta = _build_meta() + # On a single-scenario merge the new entry must come from the same + # toolchain as the entries it is being merged next to, or the numbers + # are not comparable. Warn if _meta would change (e.g. wrong PERF_ENV, + # iteration count, or native version) instead of silently overwriting it. 
+ if args.scenario and prior_baseline: + old_meta = prior_baseline.get("_meta", {}) + if old_meta and old_meta != new_meta: + diffs = sorted( + set(old_meta) | set(new_meta), + key=str, + ) + changed = [ + f"{k}: {old_meta.get(k)!r} -> {new_meta.get(k)!r}" + for k in diffs if old_meta.get(k) != new_meta.get(k) + ] + print( + "\nWARNING: this run's environment differs from the existing " + "baseline's _meta; the merged entry will NOT be comparable to " + "the other scenarios:\n " + "\n ".join(changed), + file=sys.stderr, + ) + output["_meta"] = new_meta + output.update(results) + BASELINE_FILE.write_text(json.dumps(output, indent=2)) + verb = "Updated" if prior_baseline else "Created" + print(f"\n{verb} baseline: {BASELINE_FILE}") + + # Emit the report table to the PR's Step Summary in CI. + _write_github_summary(results, baseline) + + if render_failures: + print("\nPY-SPY PROFILES FAILED (timing metrics still recorded):", file=sys.stderr) + for name in render_failures: + print(f" {name}", file=sys.stderr) + + if run_timing: + print("\nDone. Baseline comparison is report-only; drift never fails the run.") + + +if __name__ == "__main__": + main() diff --git a/tests/perf/memory/README.md b/tests/perf/memory/README.md new file mode 100644 index 00000000..3c1bd318 --- /dev/null +++ b/tests/perf/memory/README.md @@ -0,0 +1,265 @@ +# Memory profiling framework + +Uses [memray](https://github.com/bloomberg/memray) to track peak memory, allocation patterns, +and memory leaks across c2pa-python SDK operations. + +## Files + +| File | Purpose | +| --- | --- | +| `../scenarios.py` | Functions that exercise each profiling scenario. Imported by `run_profile.py`. | +| `run_profile.py` | Memory performance/usage analysis. Runs each scenario under `memray`, generates HTML reports, reads metrics, and compares against `baseline.json`. | +| `../Dockerfiles/` | One Dockerfile per target environment (shared with the CPU benchmark). Selected via `PERF_ENV` at `make` time. 
| +| `../entrypoint.sh` | Container entrypoint (shared). Downloads the Linux native `libc2pa_c.so` at startup into the volume-mounted workspace so it sticks around even through the `-v` mount. | +| `reports/` | Generated HTML reports (gitignored). Three files per scenario: `-peak.html` (peak/high-water view), `-leaks.html` (leak view), and `-temporary.html` (temporary-allocations view). | + +## Scenarios + +Each scenario loops multiple times so leaks accumulate and become visible in the leaks flamegraph and the memory use graph (defaults to 100). Change the count of iterations when running by setting the `MEMRAY_ITERATIONS` variable (the Makefile forwards it into the container): + +```bash +make memory-use-bench MEMRAY_ITERATIONS=1000 +``` + +Most scenarios use the Context API: they build a `Context` once and reuse it across iterations, so its settings are parsed a single time. The jpeg and png cases also keep a `_legacy` variant that builds the `Reader`/`Builder` without a `Context`, which re-reads the thread-local settings on each construction. Running a pair (for example `builder_sign_jpeg_legacy` and `builder_sign_jpeg_with_context`) compares the two paths. + +The `builder_sign_{jpeg,png}_parallel_*` scenarios build one `Context` and share it across 10 threads that sign concurrently, each with its own streams and `Builder`. The name encodes two axes. `split` divides the iteration budget across the threads, so total work matches a single-threaded scenario; `full` runs the full loop on each of the 10 threads, so total work is 10x (use these with `SCENARIO=` rather than the whole suite). `pool` runs the threads through a `ThreadPoolExecutor`; `barrier` starts all 10 at once with a `threading.Barrier`. 
+ +## Environments + +Select the target environment with `PERF_ENV` (default: `python-3.12-slim`): + +| `PERF_ENV` value | Base image | Python | Native symbols | +| --- | --- | --- | --- | +| `python-3.12-slim` | `python:3.12-slim` | 3.12 | interpreter frames unresolved | +| `python-3.10-slim` | `python:3.10-slim` | 3.10 | interpreter frames unresolved | +| `ubuntu-22.04` | `ubuntu:22.04` | 3.10 (apt default) | resolved (`python3-dbg`) | +| `ubuntu-24.04` | `ubuntu:24.04` | 3.12 (apt default) | resolved (`python3-dbg`) | + +The slim images run a source-built `/usr/local/bin/python` that ships stripped, and Debian's `python3-dbg` targets a different binary (build-id mismatch), so memray cannot resolve the interpreter's native (C) frames there. You will see a "No debug information was found for the Python interpreter" warning, and native traces may lack file names and line numbers. The ubuntu images install `python3-dbg` for the matching apt interpreter, so their native flamegraphs are fully symbolized. Use an `ubuntu-*` `PERF_ENV` when you need resolved native traces. + +## Running (via Docker) + +```bash +# First run (if there is no baseline.json): establishes baseline.json +make memory-use-bench + +# Subsequent runs: compares against baseline, fails if >10% regression +make memory-use-bench + +# Refresh baseline after an intentional memory change +make memory-use-bench PERF_ARGS=--update-baseline + +# Run against a different runner environment +make memory-use-bench PERF_ENV=ubuntu-24.04 + +# Run a single scenario instead of the whole suite +make memory-use-bench SCENARIO=builder_sign_gif + +# Refresh just one scenario's baseline entry (others are preserved) +make memory-use-bench SCENARIO=builder_sign_gif PERF_ARGS=--update-baseline + +# Remove all generated HTML reports +make clean-memory-perf-reports +``` + +The trailing `VAR=value` arguments (e.g. `PERF_ENV=ubuntu-24.04`, `PERF_ARGS=--update-baseline`) are `make` variable overrides, not shell env vars. 
`make` parses each `word=value` argument as a variable assignment. Each overrides a `?=` default in the Makefile, and the recipe interpolates them into the `docker build`/`docker run` commands. See [Configuration](#configuration) for the full list and what each forwards to. + +Reports are written to `tests/perf/memory/reports/` on the local machine. Three HTML files per scenario, one per suffix (described below). Open any in a browser. After a run, the output also reports whether all scenarios were within the baseline threshold (baseline +10% memory use tolerance). + +## Running in CI + +The `.github/workflows/memory-benchmark.yml` workflow runs the Docker-based benchmarks on a PR, but only when the PR has the `check-memory-benchmark` label. This runs `make memory-use-bench`, so: + +- A regression (peak or leaked > baseline +10%) makes the benchmark job exit non-zero. +- A values report table is written to the job's Step Summary. +- All three flamegraph HTML views per scenario are uploaded as the `memray-flamegraphs` artifact. + +The gate only acts as a regression test once a `tests/perf/memory/baseline.json` is committed on the branch. Without one, `run_profile.py` treats the run as baseline creation (exits 0, no gating). + +## Report views + +Each scenario produces three [memray flamegraphs](https://bloomberg.github.io/memray/flamegraph.html). All three are flamegraphs of the same run. They differ only in which allocations they count. + +### `-peak.html`: peak/high-water view + +What it shows: allocations that were simultaneously alive at the moment the process used the most memory (the high-water mark). + +Why it's useful: tells you what drives the largest memory footprint, the working set you must hold at once. Consult this view when you care about peak RSS or OOM headroom. + +How to read it: the widest frames are the biggest contributors to peak. Walk up a wide column to the top frame to find the call site holding that memory at the high-water instant.
+ +### `-leaks.html`: leak view + +What it shows: memory that was allocated but never freed before tracking stopped (`memray --leaks`). + +Why it's useful: finds memory leaks, meaning memory that grows with work done. It is never zero, because one-time static setup (the native `libc2pa_c` library loading global structures that live for the whole process) shows as "never freed." A real leak is one that scales with iterations. Profile at `MEMRAY_ITERATIONS=100` and `=1000` and compare: flat means static overhead, growing means a leak. See [Why is leaked_bytes not zero?](#why-is-leaked_bytes-not-zero). + +How to read it: a wide frame here is unfreed memory. If its width grows when you raise the iteration count, that top frame is the leaking call site. + +### `-temporary.html`: temporary-allocations view + +What it shows: short-lived churn, meaning memory allocated and then freed almost immediately (memray's threshold: freed before more than one other allocation happens). + +Why it's useful: temporary allocations are not leaks, since the memory is returned, but high allocation and free turnover costs CPU and can fragment the heap. This view surfaces hot per-call churn that the peak and leak views hide, because those objects are freed between iterations and so barely register at the high-water mark. Use it when a loop allocates too much. + +How to read it: wide frames are the biggest sources of throwaway allocations. The view may be sparse or empty for a scenario that does little churn, which is itself a valid result. See [Temporary allocations](#temporary-allocations). + +The temporary view is the heaviest to render: memray holds every allocation and free to decide which are short-lived. On a very large capture (a long run, a high `MEMRAY_ITERATIONS`, or a churn-heavy scenario) the render can run out of memory and fail. The run does not abort in that case; it records what failed and keeps going. See [Troubleshooting](#troubleshooting). 
+ +## Running without Docker (if memray is supported and installed locally) + +```bash +pip install memray +python -m tests.perf.memory.run_profile +``` + +Run a single scenario (useful for generating data for one operation without the full suite): + +```bash +python -m tests.perf.memory.run_profile --scenario builder_sign_gif +``` + +With `--update-baseline`, a single-scenario run only rewrites that scenario's entry in `baseline.json`; the other scenarios' entries are preserved. + +```bash +python -m tests.perf.memory.run_profile --scenario builder_sign_gif --update-baseline +``` + +## Configuration + +With `make memory-use-bench VAR=value` you set the **`make` variable** and the Makefile forwards it as shown in the "Forwarded as" column. Running `run_profile.py` without Docker, you set the **env var** (or pass the CLI arg) directly. + +| `make` variable | Forwarded as | Default | Description | +| --- | --- | --- | --- | +| `PERF_ENV` | `PERF_ENV` env var | `python-3.12-slim` | Target environment; selects the Dockerfile, tags report filenames (`--.html`), recorded in `baseline.json` `_meta`. See [Environments](#environments). | +| `MEMRAY_ITERATIONS` | `MEMRAY_ITERATIONS` env var | `100` | Loop count per scenario. | +| `MEMRAY_THRESHOLD` | `MEMRAY_THRESHOLD` env var | `1.1` | Regression multiplier (1.1 = 10% tolerance). | +| `SCENARIO` | `--scenario` CLI arg | _(all)_ | Run a single scenario (e.g. `SCENARIO=builder_sign_jpeg`). | +| `PERF_ARGS` | passed straight through | _(none)_ | Extra `run_profile.py` args (e.g. `PERF_ARGS=--update-baseline`). | + +`PERF_SCENARIO` is an additional env var, but internal: the runner sets it per scenario so the loop can label its progress. Not user-configurable. 
+ +Example to override iteration count: + +```bash +make memory-use-bench MEMRAY_ITERATIONS=1000 +``` + +## Reading baseline.json + +`baseline.json` is committed to the repo and records the following data for each scenario: + +```json +{ + "_meta": { + "memray_version": "1.19.3", + "python_version": "3.12.13", + "c2pa_native_version": "c2pa-v0.85.0", + "iterations": 100, + "perf_env": "python-3.12-slim", + "arch": "x86_64" + }, + "scenario_name": { + "peak_bytes": 62914560, + "leaked_bytes": 3271766, + "total_allocations": 12840 + }, + ... +} +``` + +The `_meta` block records which toolchain produced the baseline so the numbers are reproducible. It is provenance only and is never compared against. The regression check only looks at the per-scenario entries. + +| `_meta` field | Meaning | +| --- | --- | +| `memray_version` | memray version that generated the metrics | +| `python_version` | Python version that ran the test framework | +| `c2pa_native_version` | native `libc2pa_c` version (from `c2pa-native-version.txt`) | +| `iterations` | `MEMRAY_ITERATIONS` used for the run | +| `perf_env` | `PERF_ENV` (target environment) | +| `arch` | machine architecture (`platform.machine()`) | + +`peak_bytes` and `total_allocations` are environment-sensitive (they vary with the architecture, Python version, and memray version): a baseline is most meaningful when compared against a run with the same `_meta`. + +`peak_bytes` is the highest amount of memory in use at any single point during the scenario. + +`leaked_bytes` is memory that was allocated during the run but never freed before memray stopped tracking. One-time static allocations, such as those made when the native library loads, persist and are counted here. + +`total_allocations` is the total number of individual memory allocation calls made. + +### Why is leaked_bytes not zero? + +You might expect the baseline to show `leaked_bytes: 0`. In practice it never does.
When the c2pa native library (`libc2pa_c.so`) is first loaded, Rust sets up global data structures designed to live for the entire lifetime of the process. They get cleaned up when the process exits, which is after memray stops watching, so memray sees them as "never freed" even though they are not leaking. + +A memory leak grows proportionally with work done. If you sign 50 images and get 3.2 MB leaked, then sign 1000 images and still get 3.2 MB leaked, that 3.2 MB is static one-time overhead rather than a leak, since it does not grow with the work that ran. If signing 1000 images gave you 64 MB leaked, that would be a leak, as the leaked memory grows with the work executed. + +The baseline captures this expected static overhead. Future runs compare against it: if `leaked_bytes` grows beyond the baseline by more than 10%, the run fails. + +The framework runs `gc.collect()` twice after the scenario finishes, while memray is still tracking. Without that sweep, objects sitting in not-yet-collected reference cycles would be counted in `leaked_bytes` and the number would depend on garbage collector timing rather than on actual leaks. With it, `leaked_bytes` means memory that is still allocated even though nothing in Python can reach it: true leaks plus the one-time static overhead described above. + +### How to confirm no leak exists? + +Run with a higher iteration count than default (100) and compare: + +```bash +make memory-use-bench MEMRAY_ITERATIONS=1000 PERF_ARGS=--update-baseline +``` + +If `leaked_bytes` stays flat compared to a baseline run or in a larger run (more iterations), there is no leak. If it scales with iterations, open `tests/perf/memory/reports/-leaks.html` in a browser to see which function is responsible. + +### Reading the "Resident set size over time" graph (why memory looks like it climbs) + +The "Resident set size over time" plot (chart icon, top-right of the report) draws two lines. 
"Resident size" (RSS) is every page the OS counts as resident: interpreter and pages the allocator holds but has not returned. "Heap size" is only the live tracked allocations. + +On the parallel scenarios the RSS line steps up and stays high. The threads each hold their own source, output, and `Builder` live at once, so RSS rises to cover that combined working set (the steps line up with the moments all threads overlap). The allocator then keeps those arena pages for reuse instead of returning them, so RSS plateaus at the high-water mark. + +Judge leaks by the heap line. The heap rises early and then settles or falls, the same shape as the single-threaded baseline. A within-run heap rise is not by itself proof of a leak (the allocator high-water can climb and settle within a bounded run). + +### Temporary allocations + +`-temporary.html` shows temporary allocations, meaning memory that is allocated and then freed almost immediately (memray's threshold is one allocation: a block is temporary if it is freed before more than one other allocation happens). The memory is returned, so these are not leaks, but they are churn: high allocation and free turnover that costs CPU and can fragment the heap. A scenario doing lots of short-lived work can show heavy temporary allocations while `leaked_bytes` stays flat. + +### When to update the baseline + +Update `baseline.json` after any intentional change that affects memory use: + +```bash +make memory-use-bench PERF_ARGS=--update-baseline +``` + +Commit the updated `baseline.json` alongside the code change, so it becomes the new reference to compare against. + +## Troubleshooting + +### A flamegraph render fails with `exit -9` + +You may see a message like `flamegraph render failed for reader_mp4-...-temporary.html (killed (likely OOM))`. The `-9` is SIGKILL: the operating system's out-of-memory killer terminated the `memray flamegraph` subprocess. 
The temporary view is the heaviest to render, and on a large capture (a long run, a high `MEMRAY_ITERATIONS`, or a churn-heavy scenario such as `reader_mp4`) it can exhaust available memory. + +The run does not abort. The capture and the metrics (`peak_bytes`, `leaked_bytes`, `total_allocations`) are read separately and are still recorded, the baseline is still written, and the run lists every failed render at the end. Only the HTML render is missing, and you have two ways to regenerate it. + +#### Option A: rerun the one scenario + +A single-scenario run renders one capture at a time with nothing else resident, so it often fits where the full suite did not: + +```bash +make memory-use-bench SCENARIO=reader_mp4 +``` + +If it still runs out of memory, lower the iteration count to shrink the capture: + +```bash +make memory-use-bench SCENARIO=reader_mp4 MEMRAY_ITERATIONS=20 +``` + +A lower iteration count makes that scenario's absolute allocation numbers no longer directly comparable to a full 100-iteration run. + +#### Option B: re-render the kept capture (no re-profiling) + +When a render fails, the run keeps that scenario's capture as `reports/-.bin`. Re-render just the failed view from that file with a higher temporary-allocation threshold, which cuts how much memray holds in memory so the render fits. 
This uses the original run's data, so the result stays comparable to the rest of the run: + +```bash +python3 -m memray flamegraph reports/reader_mp4-python-3.12-slim.bin \ + -o reports/reader_mp4-python-3.12-slim-temporary.html \ + --temporary-allocations --temporary-allocation-threshold=10 --force +``` diff --git a/tests/perf/memory/__init__.py b/tests/perf/memory/__init__.py new file mode 100644 index 00000000..17f51950 --- /dev/null +++ b/tests/perf/memory/__init__.py @@ -0,0 +1 @@ +# Empty placeholder file to facilitate imports diff --git a/tests/perf/baseline.json b/tests/perf/memory/baseline.json similarity index 100% rename from tests/perf/baseline.json rename to tests/perf/memory/baseline.json diff --git a/tests/perf/memory/reports/.gitkeep b/tests/perf/memory/reports/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/tests/perf/run_profile.py b/tests/perf/memory/run_profile.py similarity index 98% rename from tests/perf/run_profile.py rename to tests/perf/memory/run_profile.py index 16640fac..8930e609 100644 --- a/tests/perf/run_profile.py +++ b/tests/perf/memory/run_profile.py @@ -20,7 +20,7 @@ scenarios, so it is informational, not a gate. 
Usage: - python -m tests.perf.run_profile [--update-baseline] + python -m tests.perf.memory.run_profile [--update-baseline] Environment variables: - MEMRAY_ITERATIONS: number of times each scenario loops (default: 100) @@ -44,6 +44,7 @@ from tests.perf.scenarios import SCENARIO_NAMES HERE = Path(__file__).parent +REPO_ROOT = HERE.parent.parent.parent REPORTS_DIR = HERE / "reports" BASELINE_FILE = HERE / "baseline.json" @@ -54,7 +55,7 @@ def _run_scenario_under_memray(name: str, bin_path: Path) -> None: """Spawn a subprocess that runs one scenario under memray --native.""" - repo_root = HERE.parent.parent + repo_root = REPO_ROOT script = f""" import sys sys.path.insert(0, "{repo_root}") @@ -156,7 +157,7 @@ def _build_meta() -> dict: """ native_version = "" try: - native_version = (HERE.parent.parent / "c2pa-native-version.txt").read_text().strip() + native_version = (REPO_ROOT / "c2pa-native-version.txt").read_text().strip() except OSError: pass return { diff --git a/tests/perf/scenarios.py b/tests/perf/scenarios.py index 0e367fb0..974425b6 100644 --- a/tests/perf/scenarios.py +++ b/tests/perf/scenarios.py @@ -104,6 +104,14 @@ def _make_signer() -> Signer: private_key=key, ta_url=b"http://timestamp.digicert.com", ) + # Each sign call blocks on a round-trip to the timestamp authority. The + # CPU harness sets PERF_DISABLE_TSA=1 so its timings measure code, not + # network; the memory harness leaves it unset (TSA on). The native lib + # treats a NULL ta_url as "no timestamping" (an empty string is rejected + # as an invalid URL), and the C2paSignerInfo constructor only accepts + # str/bytes, so the field is nulled after construction. 
+ if os.environ.get("PERF_DISABLE_TSA") == "1": + info.ta_url = None return Signer.from_info(info) From c2707ea41345090b2fcdd8de4ec48424acd5a180 Mon Sep 17 00:00:00 2001 From: tmathern <60901087+tmathern@users.noreply.github.com> Date: Wed, 1 Jul 2026 19:02:52 -0700 Subject: [PATCH 2/5] fix: cleanup workflow --- .github/workflows/cpu-benchmark.yml | 29 ----------------- tests/perf/cpu/README.md | 50 +++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 29 deletions(-) diff --git a/.github/workflows/cpu-benchmark.yml b/.github/workflows/cpu-benchmark.yml index 197e7da8..c9e6c3f1 100644 --- a/.github/workflows/cpu-benchmark.yml +++ b/.github/workflows/cpu-benchmark.yml @@ -36,32 +36,3 @@ jobs: # baseline deltas land in this job's step summary, exit code stays 0. - name: Run CPU timing benchmark run: make cpu-bench CPU_MODE=timing - - cpu-profile: - name: CPU benchmark flamegraphs - runs-on: ubuntu-24.04-arm - if: | - contains(github.event.pull_request.labels.*.name, 'check-cpu-benchmark') && - ( - github.event.pull_request.author_association == 'COLLABORATOR' || - github.event.pull_request.author_association == 'MEMBER' || - github.event.pull_request.author_association == 'OWNER' - ) - steps: - - uses: actions/checkout@v4 - - - name: Build perf image - run: make perf-image-rebuild - - - name: Run py-spy CPU profiles - run: make cpu-bench CPU_MODE=profile - - - name: Upload CPU flamegraphs - if: always() - uses: actions/upload-artifact@v4 - with: - name: pyspy-cpu-flamegraphs - path: | - tests/perf/cpu/reports/*.svg - tests/perf/cpu/reports/*.speedscope.json - if-no-files-found: warn diff --git a/tests/perf/cpu/README.md b/tests/perf/cpu/README.md index 4d788b76..52b74a63 100644 --- a/tests/perf/cpu/README.md +++ b/tests/perf/cpu/README.md @@ -82,6 +82,56 @@ Run with `--update-baseline` on. Single-scenario updates merge into the existing Without a committed `baseline.json`, a run creates one and reports no deltas. 
+## Reading the CI/CD report + +The step summary table compares each scenario's current metrics against `baseline.json`, showing deltas (Δ) as percentages. Rows exceeding the drift threshold (default +25%) are marked with `drift` status in the rightmost column. + +### Table columns + +| Column | Meaning | +| --- | --- | +| `scenario` | Scenario name. | +| `wall` | Wall-clock time (seconds), across all iterations. | +| `cpu` | CPU time (seconds), process-wide; excludes I/O, sleep, time in child processes. | +| `cpu/iter` | Per-iteration CPU cost (cpu ÷ iterations). | +| `child cpu` | CPU burned in forked child processes (`fork_*` scenarios). Parent CPU time does not include this. | +| `wall Δ%` | Wall-clock drift: `(current - baseline) / baseline × 100`. Exceeds threshold → marked `drift`. | +| `cpu Δ%` | CPU time drift (same calculation). Informational only; does not trigger drift. | +| `status` | `ok` if within threshold; `drift` if over +25%. Informational; never fails the run. | + +### Common sources of large drifts (>25%) + +**Hardware mismatch**: Baseline generated on aarch64 ARM CPU; new run on x86_64 or different CPU generation. Check baseline `_meta.arch` (e.g., `aarch64`); if it doesn't match your runner's `platform.machine()`, re-baseline. + +**Native library version mismatch**: `_meta.c2pa_native_version` changed (e.g., c2pa-v0.89.0 → c2pa-v0.90.0). Native library performance can shift between releases, especially for asset hashing (GIF parsing, ingredient re-encoding). Re-baseline with `make cpu-bench PERF_ARGS=--update-baseline`. + +**CI runner contention**: Shared runners see ±10–20% wall-clock variance as normal. Burst load (other jobs, system activity) can push drifts to 50–80%. A single run with high drift is often temporary; re-run to confirm. + +**TSA re-enabled**: By default, CPU runs disable timestamp-authority network calls (`PERF_DISABLE_TSA=1`). 
If re-enabled (`PERF_DISABLE_TSA=0`), signing scenarios add unpredictable network latency (typically +5–10 seconds per run, high variance). This looks like a CPU regression but is external latency. + +### When drifts indicate a real problem + +**Both `cpu_seconds` and `wall_seconds` drift together (same direction/magnitude)**, and the environment matches the baseline (`_meta.arch` = `platform.machine()`, native lib version unchanged) is a possible real CPU regression. Compare the scenario's logic against recent code changes. + +**Only `wall_seconds` drifts; `cpu_seconds` is stable** is due to I/O contention or CI runner load, not a CPU regression. + +**Only `cpu_seconds` drifts; `wall_seconds` is stable** should be rare, likely a measurement artifact or GC timing variation in a single run. + +### When to update the baseline + +**Native library upgrade**: `c2pa-native-version.txt` changed. + +**Environment change**: Different `PERF_ENV`, CPU architecture, or Python version. + +**Parameter change**: Modified `CPU_ITERATIONS` (e.g., from 100 to 200). + +Command: +```bash +make cpu-bench PERF_ARGS=--update-baseline +``` + +This rewrites `baseline.json` with current metrics and metadata (`_meta`). Commit the updated baseline alongside your code changes. + ## CI `.github/workflows/cpu-benchmark.yml` runs on PRs labeled `check-cpu-benchmark`, on `ubuntu-24.04-arm` to match the baseline arch. 
From 3beed42ab51d6907c32e7908c5fd9a6d42834238 Mon Sep 17 00:00:00 2001 From: tmathern <60901087+tmathern@users.noreply.github.com> Date: Wed, 1 Jul 2026 19:26:14 -0700 Subject: [PATCH 3/5] fix: re-baseline --- tests/perf/cpu/baseline.json | 180 +++++++++++++++++------------------ 1 file changed, 90 insertions(+), 90 deletions(-) diff --git a/tests/perf/cpu/baseline.json b/tests/perf/cpu/baseline.json index e28d017b..bfdaec7d 100644 --- a/tests/perf/cpu/baseline.json +++ b/tests/perf/cpu/baseline.json @@ -8,168 +8,168 @@ "arch": "aarch64" }, "reader_jpeg_legacy": { - "wall_seconds": 0.2208, - "cpu_seconds": 0.2163, + "wall_seconds": 0.2247, + "cpu_seconds": 0.2198, "children_cpu_seconds": 0.0 }, "reader_jpeg_with_context": { - "wall_seconds": 0.2208, - "cpu_seconds": 0.2165, + "wall_seconds": 0.2264, + "cpu_seconds": 0.2216, "children_cpu_seconds": 0.0 }, "reader_manifest_data_context": { - "wall_seconds": 0.0957, - "cpu_seconds": 0.0968, + "wall_seconds": 0.0965, + "cpu_seconds": 0.0977, "children_cpu_seconds": 0.0 }, "reader_mp4": { - "wall_seconds": 0.4446, - "cpu_seconds": 0.4392, + "wall_seconds": 0.4531, + "cpu_seconds": 0.4476, "children_cpu_seconds": 0.0 }, "reader_wav": { - "wall_seconds": 0.1413, - "cpu_seconds": 0.1272, + "wall_seconds": 0.1409, + "cpu_seconds": 0.1266, "children_cpu_seconds": 0.0 }, "builder_sign_jpeg_legacy": { - "wall_seconds": 1.4077, - "cpu_seconds": 1.4082, + "wall_seconds": 1.4478, + "cpu_seconds": 1.4483, "children_cpu_seconds": 0.0 }, "builder_sign_jpeg_with_context": { - "wall_seconds": 1.3963, - "cpu_seconds": 1.3971, + "wall_seconds": 1.4488, + "cpu_seconds": 1.4493, "children_cpu_seconds": 0.0 }, "builder_sign_png_legacy": { - "wall_seconds": 1.8313, - "cpu_seconds": 1.8313, + "wall_seconds": 1.8338, + "cpu_seconds": 1.8343, "children_cpu_seconds": 0.0 }, "builder_sign_png_with_context": { - "wall_seconds": 1.7863, - "cpu_seconds": 1.7869, + "wall_seconds": 1.8371, + "cpu_seconds": 1.8377, "children_cpu_seconds": 0.0 }, 
"builder_sign_jpeg_parallel_split_pool": { - "wall_seconds": 0.3109, - "cpu_seconds": 2.0177, + "wall_seconds": 0.3059, + "cpu_seconds": 2.0522, "children_cpu_seconds": 0.0 }, "builder_sign_jpeg_parallel_split_barrier": { - "wall_seconds": 0.3068, - "cpu_seconds": 2.0781, + "wall_seconds": 0.3052, + "cpu_seconds": 2.06, "children_cpu_seconds": 0.0 }, "builder_sign_png_parallel_split_pool": { - "wall_seconds": 0.6139, - "cpu_seconds": 2.8088, + "wall_seconds": 0.6215, + "cpu_seconds": 2.8177, "children_cpu_seconds": 0.0 }, "builder_sign_png_parallel_split_barrier": { - "wall_seconds": 0.6298, - "cpu_seconds": 2.8269, + "wall_seconds": 0.627, + "cpu_seconds": 2.8222, "children_cpu_seconds": 0.0 }, "builder_sign_gif": { - "wall_seconds": 34.1958, - "cpu_seconds": 34.1939, + "wall_seconds": 34.1829, + "cpu_seconds": 34.1822, "children_cpu_seconds": 0.0 }, "builder_sign_heic": { - "wall_seconds": 0.1297, - "cpu_seconds": 0.1358, + "wall_seconds": 0.1322, + "cpu_seconds": 0.138, "children_cpu_seconds": 0.0 }, "builder_sign_m4a": { - "wall_seconds": 0.3892, - "cpu_seconds": 0.395, + "wall_seconds": 0.4129, + "cpu_seconds": 0.4182, "children_cpu_seconds": 0.0 }, "builder_sign_webp": { - "wall_seconds": 2.0344, - "cpu_seconds": 2.0337, + "wall_seconds": 2.0369, + "cpu_seconds": 2.0369, "children_cpu_seconds": 0.0 }, "builder_sign_avi": { - "wall_seconds": 1.6515, - "cpu_seconds": 1.6517, + "wall_seconds": 1.6338, + "cpu_seconds": 1.6347, "children_cpu_seconds": 0.0 }, "builder_sign_mp4": { - "wall_seconds": 0.212, - "cpu_seconds": 0.2177, + "wall_seconds": 0.2142, + "cpu_seconds": 0.2207, "children_cpu_seconds": 0.0 }, "builder_sign_tiff": { - "wall_seconds": 3.422, - "cpu_seconds": 3.4215, + "wall_seconds": 3.4209, + "cpu_seconds": 3.4216, "children_cpu_seconds": 0.0 }, "builder_sign_jpeg_parent_of": { - "wall_seconds": 3.7998, - "cpu_seconds": 3.8144, + "wall_seconds": 3.8125, + "cpu_seconds": 3.8276, "children_cpu_seconds": 0.0 }, "builder_sign_jpeg_component_of": { - 
"wall_seconds": 3.8204, - "cpu_seconds": 3.8368, + "wall_seconds": 3.8186, + "cpu_seconds": 3.8342, "children_cpu_seconds": 0.0 }, "builder_sign_jpeg_parent_and_component": { - "wall_seconds": 8.2377, - "cpu_seconds": 5.8484, + "wall_seconds": 7.295, + "cpu_seconds": 5.8927, "children_cpu_seconds": 0.0 }, "builder_sign_jpeg_parent_and_component_mixed_mime": { - "wall_seconds": 5.512, - "cpu_seconds": 5.5281, + "wall_seconds": 5.5439, + "cpu_seconds": 5.5601, "children_cpu_seconds": 0.0 }, "builder_sign_jpeg_two_components_same_mime": { - "wall_seconds": 8.0985, - "cpu_seconds": 5.8208, + "wall_seconds": 7.2159, + "cpu_seconds": 5.8835, "children_cpu_seconds": 0.0 }, "builder_sign_jpeg_two_components_mixed_mime": { - "wall_seconds": 5.5014, - "cpu_seconds": 5.5182, + "wall_seconds": 5.5449, + "cpu_seconds": 5.5602, "children_cpu_seconds": 0.0 }, "builder_sign_jpeg_archive_roundtrip": { - "wall_seconds": 3.8633, - "cpu_seconds": 3.8827, + "wall_seconds": 3.8909, + "cpu_seconds": 3.9107, "children_cpu_seconds": 0.0 }, "builder_to_archive_with_ingredient": { - "wall_seconds": 2.3895, - "cpu_seconds": 2.4024, + "wall_seconds": 2.4093, + "cpu_seconds": 2.4227, "children_cpu_seconds": 0.0 }, "builder_sign_jpeg_archive_roundtrip_ingredient_in_archive": { - "wall_seconds": 4.1713, - "cpu_seconds": 4.2044, + "wall_seconds": 4.2106, + "cpu_seconds": 4.2444, "children_cpu_seconds": 0.0 }, "builder_write_ingredient_archive": { - "wall_seconds": 2.3881, - "cpu_seconds": 2.401, + "wall_seconds": 2.408, + "cpu_seconds": 2.4205, "children_cpu_seconds": 0.0 }, "builder_sign_jpeg_add_ingredient_from_archive": { - "wall_seconds": 1.7888, - "cpu_seconds": 1.8066, + "wall_seconds": 1.799, + "cpu_seconds": 1.8156, "children_cpu_seconds": 0.0 }, "builder_ingredient_archive_roundtrip": { - "wall_seconds": 4.1485, - "cpu_seconds": 4.181, + "wall_seconds": 4.1928, + "cpu_seconds": 4.2245, "children_cpu_seconds": 0.0 }, "builder_sign_jpeg_two_ingredient_archives": { - "wall_seconds": 1.8626, 
- "cpu_seconds": 1.8852, + "wall_seconds": 1.8996, + "cpu_seconds": 1.922, "children_cpu_seconds": 0.0 }, "reader_error_no_manifest": { @@ -178,48 +178,48 @@ "children_cpu_seconds": 0.0 }, "builder_error_invalid_manifest": { - "wall_seconds": 0.0028, + "wall_seconds": 0.0029, "cpu_seconds": 0.0016, "children_cpu_seconds": 0.0 }, "reader_string_apis": { - "wall_seconds": 0.3259, - "cpu_seconds": 0.3286, + "wall_seconds": 0.3305, + "cpu_seconds": 0.3332, "children_cpu_seconds": 0.0 }, "fork_reader_collect": { - "wall_seconds": 0.559, - "cpu_seconds": 0.2634, - "children_cpu_seconds": 0.2941 + "wall_seconds": 0.5732, + "cpu_seconds": 0.2697, + "children_cpu_seconds": 0.302 }, "fork_contended_mutex": { - "wall_seconds": 7.2526, - "cpu_seconds": 24.1507, - "children_cpu_seconds": 2.1624 + "wall_seconds": 7.4983, + "cpu_seconds": 24.5089, + "children_cpu_seconds": 2.213 }, "fork_thread_local_orphan": { - "wall_seconds": 0.577, - "cpu_seconds": 0.2764, - "children_cpu_seconds": 0.3 + "wall_seconds": 0.5863, + "cpu_seconds": 0.2787, + "children_cpu_seconds": 0.3075 }, "fork_gc_cycle": { - "wall_seconds": 0.694, - "cpu_seconds": 0.3994, - "children_cpu_seconds": 0.2928 + "wall_seconds": 0.7053, + "cpu_seconds": 0.4032, + "children_cpu_seconds": 0.3003 }, "fork_parent_frees_after_fork": { - "wall_seconds": 4.5677, - "cpu_seconds": 4.4903, - "children_cpu_seconds": 0.038 + "wall_seconds": 4.5946, + "cpu_seconds": 4.5223, + "children_cpu_seconds": 0.0401 }, "fork_child_sys_exit": { - "wall_seconds": 1.0942, - "cpu_seconds": 0.28, - "children_cpu_seconds": 0.8111 + "wall_seconds": 1.1108, + "cpu_seconds": 0.2825, + "children_cpu_seconds": 0.8249 }, "fork_stream_cleanup": { - "wall_seconds": 0.2973, - "cpu_seconds": 0.02, - "children_cpu_seconds": 0.2834 + "wall_seconds": 0.3052, + "cpu_seconds": 0.0209, + "children_cpu_seconds": 0.2905 } } \ No newline at end of file From 7a7c9ce33073c905a1968cc05c99c081ae2c373f Mon Sep 17 00:00:00 2001 From: tmathern 
<60901087+tmathern@users.noreply.github.com> Date: Wed, 1 Jul 2026 19:45:26 -0700 Subject: [PATCH 4/5] fix: clean up bench --- Makefile | 3 +- tests/perf/README.md | 2 +- tests/perf/cpu/README.md | 76 ++---------- tests/perf/cpu/baseline.json | 225 ---------------------------------- tests/perf/cpu/run_profile.py | 170 +++---------------------- 5 files changed, 26 insertions(+), 450 deletions(-) delete mode 100644 tests/perf/cpu/baseline.json diff --git a/Makefile b/Makefile index 2d873e36..e75caf7c 100644 --- a/Makefile +++ b/Makefile @@ -144,7 +144,6 @@ PERF_ENV ?= python-3.12-slim MEMRAY_ITERATIONS ?= 100 MEMRAY_THRESHOLD ?= 1.1 CPU_ITERATIONS ?= 100 -CPU_THRESHOLD ?= 1.25 CPU_MODE ?= all CPU_REPEATS ?= 0 PYSPY_RATE ?= 100 @@ -183,7 +182,7 @@ clean-memory-perf-reports: # Pre-requisite: Docker image built using `make perf-image-rebuild`. .PHONY: cpu-bench cpu-bench: - docker run --rm --cap-add SYS_PTRACE --security-opt seccomp=unconfined -v $(PWD):/workspace $(GH_SUMMARY_MOUNT) -e PYTHONPATH=/workspace/src -e PERF_ENV=$(PERF_ENV) -e CPU_ITERATIONS=$(CPU_ITERATIONS) -e CPU_THRESHOLD=$(CPU_THRESHOLD) -e CPU_REPEATS=$(CPU_REPEATS) -e PERF_DISABLE_TSA -e PYSPY_RATE=$(PYSPY_RATE) -e PYSPY_FORMAT -e GITHUB_TOKEN -e GITHUB_STEP_SUMMARY c2pa-perf-$(PERF_ENV) python -m tests.perf.cpu.run_profile --mode $(CPU_MODE) $(SCENARIO_ARG) $(PERF_ARGS) + docker run --rm --cap-add SYS_PTRACE --security-opt seccomp=unconfined -v $(PWD):/workspace $(GH_SUMMARY_MOUNT) -e PYTHONPATH=/workspace/src -e PERF_ENV=$(PERF_ENV) -e CPU_ITERATIONS=$(CPU_ITERATIONS) -e CPU_REPEATS=$(CPU_REPEATS) -e PERF_DISABLE_TSA -e PYSPY_RATE=$(PYSPY_RATE) -e PYSPY_FORMAT -e GITHUB_TOKEN -e GITHUB_STEP_SUMMARY c2pa-perf-$(PERF_ENV) python -m tests.perf.cpu.run_profile --mode $(CPU_MODE) $(SCENARIO_ARG) $(PERF_ARGS) @echo "" @echo "Reports written to tests/perf/cpu/reports/" diff --git a/tests/perf/README.md b/tests/perf/README.md index 55c6c35f..365a84a9 100644 --- a/tests/perf/README.md +++ 
b/tests/perf/README.md @@ -20,7 +20,7 @@ Each framework focuses on different indicators: - **`memory/`** tracks memory usage, not time: peak RSS, leaks, temporary-allocation churn. Catches a change that holds more memory at once or leaks with iteration count, even if it runs just as fast. - **`cpu/`** tracks time, not memory usage: wall/CPU seconds plus a flamegraph of where cycles go. Catches a slowdown and the call site causing it, even if memory use is unchanged. -Both run inside the Docker perf image (`../Dockerfiles/`), with a fixed Python version, fixed OS, fixed dependency set, and no other host processes competing for CPU/memory. That isolation is why they carry a committed `baseline.json`: same environment every run, so a delta means the code changed, not that the host did. +Both run inside the Docker perf image (`../Dockerfiles/`), with a fixed Python version, fixed OS, and fixed dependency set. `memory/` carries a committed `baseline.json` and gates CI on it: memory measurements don't depend on host CPU allocation, so a delta reliably means the code changed. `cpu/` does not: CI runs on shared/burstable runner vCPUs, so timings vary with host load in a way memory doesn't, and no committed baseline is meaningful there — `cpu/` reports raw numbers only. See [cpu/README.md](cpu/README.md#reading-the-cicd-report) for details. ## Quickstart diff --git a/tests/perf/cpu/README.md b/tests/perf/cpu/README.md index 52b74a63..5a15a4f7 100644 --- a/tests/perf/cpu/README.md +++ b/tests/perf/cpu/README.md @@ -1,22 +1,21 @@ # CPU profiling framework -Uses [py-spy](https://github.com/benfred/py-spy) to profile where CPU time goes across c2pa-python operations, plus plain timing measurements to track wall/CPU time per scenario against a baseline. +Uses [py-spy](https://github.com/benfred/py-spy) to profile where CPU time goes across c2pa-python operations, plus plain timing measurements to report wall/CPU time per scenario.
## Files | File | Purpose | | --- | --- | | `../scenarios.py` | Functions that exercise each profiling scenario. Imported by `run_profile.py`. | -| `run_profile.py` | CPU analysis. Times each scenario in a plain child process, renders a py-spy flamegraph per scenario, and compares timings against `baseline.json`. | -| `baseline.json` | Committed reference timings (`_meta` provenance block + per-scenario `wall_seconds`, `cpu_seconds`, `children_cpu_seconds`). | +| `run_profile.py` | CPU analysis. Times each scenario in a plain child process and renders a py-spy flamegraph per scenario. | | `../Dockerfiles/` | One Dockerfile per target environment (shared with the memory benchmark). Selected via `PERF_ENV` at `make` time. | | `../entrypoint.sh` | Container entrypoint (shared). Downloads the Linux native `libc2pa_c.so` at startup. | | `reports/` | Generated profiles (gitignored). One file per scenario: `-cpu.svg` (or `.speedscope.json` with `PYSPY_FORMAT=speedscope`). | ## Approach: two passes per scenario -1. Timing pass: the scenario runs in a plain child process with no profiler attached. The child measures three metrics around the scenario call only, excluding interpreter startup, and hands the result back as JSON: `wall_seconds` (`time.perf_counter`), `cpu_seconds` (`time.process_time`, process-wide so the thread-pool scenarios count all threads), and `children_cpu_seconds` (`resource.getrusage(RUSAGE_CHILDREN)`, the CPU burned in forked children, which `process_time` cannot see; relevant for the `fork_*` scenarios). These numbers feed the baseline comparison. -2. Profile pass: the scenario runs again under `py-spy record` to produce a flamegraph. Profile numbers never feed the baseline, since sampling adds rate-dependent overhead that would bake profiler cost into the timings. +1. Timing pass: the scenario runs in a plain child process with no profiler attached. 
The child measures three metrics around the scenario call only, excluding interpreter startup, and hands the result back as JSON: `wall_seconds` (`time.perf_counter`), `cpu_seconds` (`time.process_time`, process-wide so the thread-pool scenarios count all threads), and `children_cpu_seconds` (`resource.getrusage(RUSAGE_CHILDREN)`, the CPU burned in forked children, which `process_time` cannot see; relevant for the `fork_*` scenarios). +2. Profile pass: the scenario runs again under `py-spy record` to produce a flamegraph. This is diagnostic only, since sampling adds rate-dependent overhead that would bake profiler cost into the timings. Scenario children run with `PERF_DISABLE_TSA=1` by default, so signing scenarios skip the network round-trip to the timestamp authority that every `sign` call otherwise makes. Timings then measure code, not network latency. Pass `PERF_DISABLE_TSA=0` to restore the TSA call. The memory benchmark is unaffected either way; it keeps TSA on. @@ -24,10 +23,6 @@ When a scenario's first timing pass finishes in under 1 second, the harness runs Locally the default `--mode all` runs both passes back to back. In CI they run as two parallel jobs on separate runners (`--mode timing` / `--mode profile`), so sampling never contends with the timed run and wall-clock cost stays at one pass. -## CI/CD report - -Timings on shared CI runners are noisy (±10-20% wall-clock swing is normal). The harness therefore ships report-only: baseline deltas are printed and shown in the CI step summary: rows over the threshold get a `drift` status, but the run always exits 0. - ## Running The benchmarks run inside Docker and need the perf Docker image: @@ -50,9 +45,6 @@ make cpu-bench CPU_MODE=timing # flamegraphs only make cpu-bench CPU_MODE=profile - -# update baselines -make cpu-bench PERF_ARGS=--update-baseline ``` The `cpu-bench` target runs the container with `--cap-add SYS_PTRACE --security-opt seccomp=unconfined`. 
`py-spy` samples the child process via ptrace and `process_vm_readv`, which the default container security profile blocks. @@ -62,7 +54,6 @@ The `cpu-bench` target runs the container with `--cap-add SYS_PTRACE --security- | Variable | Default | Meaning | | --- | --- | --- | | `CPU_ITERATIONS` | `100` | Loop count per scenario (the Makefile forwards it into the container). | -| `CPU_THRESHOLD` | `1.25` | Drift multiplier vs baseline (1.25 = +25%) used for the `drift` status. | | `CPU_MODE` | `all` | `all`, `timing`, or `profile` (maps to `--mode`). | | `CPU_REPEATS` | `0` | Fixed timing pass count per scenario, median recorded. `0` = adaptive: 1 pass, extended to 5 when the first pass runs under 1 s. | | `PERF_DISABLE_TSA` | `1` (set by the harness) | Skip the timestamp-authority network call during signing. Set `0` to restore it. | @@ -70,21 +61,17 @@ The `cpu-bench` target runs the container with `--cap-add SYS_PTRACE --security- | `PYSPY_FORMAT` | `flamegraph` | `flamegraph` writes self-contained SVGs; `speedscope` writes JSON for [speedscope.app](https://www.speedscope.app/). | | `PERF_ENV` | `python-3.12-slim` | Which Docker environment to use (see `../Dockerfiles/`). | | `SCENARIO` | unset | Run a single scenario. | -| `PERF_ARGS` | unset | Extra args for `run_profile.py`, e.g. `--update-baseline`. | +| `PERF_ARGS` | unset | Extra args for `run_profile.py`. | ## Interpreting the flamegraphs -Profiles show Python frames only. Time spent inside the Rust `libc2pa_c` library is attributed to the Python frame that made the FFI call. Fast scenarios at the default 100 Hz can produce thin profiles; raise `PYSPY_RATE` or `CPU_ITERATIONS` for more samples. - -## Updating the baseline - -Run with `--update-baseline` on. Single-scenario updates merge into the existing file and warn if the environment (`_meta`) does not match the other entries. - -Without a committed `baseline.json`, a run creates one and reports no deltas. +Profiles show Python frames only. 
Time spent inside the Rust `libc2pa_c` library is attributed to the Python frame that made the FFI call. Fast scenarios can produce thin profiles: raising `PYSPY_RATE` or `CPU_ITERATIONS` will lead to getting more samples. ## Reading the CI/CD report -The step summary table compares each scenario's current metrics against `baseline.json`, showing deltas (Δ) as percentages. Rows exceeding the drift threshold (default +25%) are marked with `drift` status in the rightmost column. +`.github/workflows/cpu-benchmark.yml` runs on PRs labeled `check-cpu-benchmark`, on `ubuntu-24.04-arm`. `cpu-timing` measures timing metrics; the raw timing table lands in the job's step summary. + +The step summary table reports each scenario's raw timing: `wall`, `cpu`, `cpu/iter`, and `child cpu`. There is no baseline comparison or drift status. ### Table columns @@ -95,48 +82,3 @@ The step summary table compares each scenario's current metrics against `baselin | `cpu` | CPU time (seconds), process-wide; excludes I/O, sleep, time in child processes. | | `cpu/iter` | Per-iteration CPU cost (cpu ÷ iterations). | | `child cpu` | CPU burned in forked child processes (`fork_*` scenarios). Parent CPU time does not include this. | -| `wall Δ%` | Wall-clock drift: `(current - baseline) / baseline × 100`. Exceeds threshold → marked `drift`. | -| `cpu Δ%` | CPU time drift (same calculation). Informational only; does not trigger drift. | -| `status` | `ok` if within threshold; `drift` if over +25%. Informational; never fails the run. | - -### Common sources of large drifts (>25%) - -**Hardware mismatch**: Baseline generated on aarch64 ARM CPU; new run on x86_64 or different CPU generation. Check baseline `_meta.arch` (e.g., `aarch64`); if it doesn't match your runner's `platform.machine()`, re-baseline. - -**Native library version mismatch**: `_meta.c2pa_native_version` changed (e.g., c2pa-v0.89.0 → c2pa-v0.90.0). 
Native library performance can shift between releases, especially for asset hashing (GIF parsing, ingredient re-encoding). Re-baseline with `make cpu-bench PERF_ARGS=--update-baseline`. - -**CI runner contention**: Shared runners see ±10–20% wall-clock variance as normal. Burst load (other jobs, system activity) can push drifts to 50–80%. A single run with high drift is often temporary; re-run to confirm. - -**TSA re-enabled**: By default, CPU runs disable timestamp-authority network calls (`PERF_DISABLE_TSA=1`). If re-enabled (`PERF_DISABLE_TSA=0`), signing scenarios add unpredictable network latency (typically +5–10 seconds per run, high variance). This looks like a CPU regression but is external latency. - -### When drifts indicate a real problem - -**Both `cpu_seconds` and `wall_seconds` drift together (same direction/magnitude)**, and the environment matches the baseline (`_meta.arch` = `platform.machine()`, native lib version unchanged) is a possible real CPU regression. Compare the scenario's logic against recent code changes. - -**Only `wall_seconds` drifts; `cpu_seconds` is stable** is due to I/O contention or CI runner load, not a CPU regression. - -**Only `cpu_seconds` drifts; `wall_seconds` is stable** should be rare, likely a measurement artifact or GC timing variation in a single run. - -### When to update the baseline - -**Native library upgrade**: `c2pa-native-version.txt` changed. - -**Environment change**: Different `PERF_ENV`, CPU architecture, or Python version. - -**Parameter change**: Modified `CPU_ITERATIONS` (e.g., from 100 to 200). - -Command: -```bash -make cpu-bench PERF_ARGS=--update-baseline -``` - -This rewrites `baseline.json` with current metrics and metadata (`_meta`). Commit the updated baseline alongside your code changes. - -## CI - -`.github/workflows/cpu-benchmark.yml` runs on PRs labeled `check-cpu-benchmark`, on `ubuntu-24.04-arm` to match the baseline arch. 
- -Two parallel jobs run for this workflow: - -- `cpu-timing` measures timing metrics; the baseline comparison table lands in the job's step summary. -- `cpu-profile` renders the py-spy flamegraphs and uploads them as the `pyspy-cpu-flamegraphs` artifact. diff --git a/tests/perf/cpu/baseline.json b/tests/perf/cpu/baseline.json deleted file mode 100644 index bfdaec7d..00000000 --- a/tests/perf/cpu/baseline.json +++ /dev/null @@ -1,225 +0,0 @@ -{ - "_meta": { - "pyspy_version": "0.4.2", - "python_version": "3.12.13", - "c2pa_native_version": "c2pa-v0.89.0", - "iterations": 100, - "perf_env": "python-3.12-slim", - "arch": "aarch64" - }, - "reader_jpeg_legacy": { - "wall_seconds": 0.2247, - "cpu_seconds": 0.2198, - "children_cpu_seconds": 0.0 - }, - "reader_jpeg_with_context": { - "wall_seconds": 0.2264, - "cpu_seconds": 0.2216, - "children_cpu_seconds": 0.0 - }, - "reader_manifest_data_context": { - "wall_seconds": 0.0965, - "cpu_seconds": 0.0977, - "children_cpu_seconds": 0.0 - }, - "reader_mp4": { - "wall_seconds": 0.4531, - "cpu_seconds": 0.4476, - "children_cpu_seconds": 0.0 - }, - "reader_wav": { - "wall_seconds": 0.1409, - "cpu_seconds": 0.1266, - "children_cpu_seconds": 0.0 - }, - "builder_sign_jpeg_legacy": { - "wall_seconds": 1.4478, - "cpu_seconds": 1.4483, - "children_cpu_seconds": 0.0 - }, - "builder_sign_jpeg_with_context": { - "wall_seconds": 1.4488, - "cpu_seconds": 1.4493, - "children_cpu_seconds": 0.0 - }, - "builder_sign_png_legacy": { - "wall_seconds": 1.8338, - "cpu_seconds": 1.8343, - "children_cpu_seconds": 0.0 - }, - "builder_sign_png_with_context": { - "wall_seconds": 1.8371, - "cpu_seconds": 1.8377, - "children_cpu_seconds": 0.0 - }, - "builder_sign_jpeg_parallel_split_pool": { - "wall_seconds": 0.3059, - "cpu_seconds": 2.0522, - "children_cpu_seconds": 0.0 - }, - "builder_sign_jpeg_parallel_split_barrier": { - "wall_seconds": 0.3052, - "cpu_seconds": 2.06, - "children_cpu_seconds": 0.0 - }, - "builder_sign_png_parallel_split_pool": { - 
"wall_seconds": 0.6215, - "cpu_seconds": 2.8177, - "children_cpu_seconds": 0.0 - }, - "builder_sign_png_parallel_split_barrier": { - "wall_seconds": 0.627, - "cpu_seconds": 2.8222, - "children_cpu_seconds": 0.0 - }, - "builder_sign_gif": { - "wall_seconds": 34.1829, - "cpu_seconds": 34.1822, - "children_cpu_seconds": 0.0 - }, - "builder_sign_heic": { - "wall_seconds": 0.1322, - "cpu_seconds": 0.138, - "children_cpu_seconds": 0.0 - }, - "builder_sign_m4a": { - "wall_seconds": 0.4129, - "cpu_seconds": 0.4182, - "children_cpu_seconds": 0.0 - }, - "builder_sign_webp": { - "wall_seconds": 2.0369, - "cpu_seconds": 2.0369, - "children_cpu_seconds": 0.0 - }, - "builder_sign_avi": { - "wall_seconds": 1.6338, - "cpu_seconds": 1.6347, - "children_cpu_seconds": 0.0 - }, - "builder_sign_mp4": { - "wall_seconds": 0.2142, - "cpu_seconds": 0.2207, - "children_cpu_seconds": 0.0 - }, - "builder_sign_tiff": { - "wall_seconds": 3.4209, - "cpu_seconds": 3.4216, - "children_cpu_seconds": 0.0 - }, - "builder_sign_jpeg_parent_of": { - "wall_seconds": 3.8125, - "cpu_seconds": 3.8276, - "children_cpu_seconds": 0.0 - }, - "builder_sign_jpeg_component_of": { - "wall_seconds": 3.8186, - "cpu_seconds": 3.8342, - "children_cpu_seconds": 0.0 - }, - "builder_sign_jpeg_parent_and_component": { - "wall_seconds": 7.295, - "cpu_seconds": 5.8927, - "children_cpu_seconds": 0.0 - }, - "builder_sign_jpeg_parent_and_component_mixed_mime": { - "wall_seconds": 5.5439, - "cpu_seconds": 5.5601, - "children_cpu_seconds": 0.0 - }, - "builder_sign_jpeg_two_components_same_mime": { - "wall_seconds": 7.2159, - "cpu_seconds": 5.8835, - "children_cpu_seconds": 0.0 - }, - "builder_sign_jpeg_two_components_mixed_mime": { - "wall_seconds": 5.5449, - "cpu_seconds": 5.5602, - "children_cpu_seconds": 0.0 - }, - "builder_sign_jpeg_archive_roundtrip": { - "wall_seconds": 3.8909, - "cpu_seconds": 3.9107, - "children_cpu_seconds": 0.0 - }, - "builder_to_archive_with_ingredient": { - "wall_seconds": 2.4093, - "cpu_seconds": 
2.4227, - "children_cpu_seconds": 0.0 - }, - "builder_sign_jpeg_archive_roundtrip_ingredient_in_archive": { - "wall_seconds": 4.2106, - "cpu_seconds": 4.2444, - "children_cpu_seconds": 0.0 - }, - "builder_write_ingredient_archive": { - "wall_seconds": 2.408, - "cpu_seconds": 2.4205, - "children_cpu_seconds": 0.0 - }, - "builder_sign_jpeg_add_ingredient_from_archive": { - "wall_seconds": 1.799, - "cpu_seconds": 1.8156, - "children_cpu_seconds": 0.0 - }, - "builder_ingredient_archive_roundtrip": { - "wall_seconds": 4.1928, - "cpu_seconds": 4.2245, - "children_cpu_seconds": 0.0 - }, - "builder_sign_jpeg_two_ingredient_archives": { - "wall_seconds": 1.8996, - "cpu_seconds": 1.922, - "children_cpu_seconds": 0.0 - }, - "reader_error_no_manifest": { - "wall_seconds": 0.0096, - "cpu_seconds": 0.0079, - "children_cpu_seconds": 0.0 - }, - "builder_error_invalid_manifest": { - "wall_seconds": 0.0029, - "cpu_seconds": 0.0016, - "children_cpu_seconds": 0.0 - }, - "reader_string_apis": { - "wall_seconds": 0.3305, - "cpu_seconds": 0.3332, - "children_cpu_seconds": 0.0 - }, - "fork_reader_collect": { - "wall_seconds": 0.5732, - "cpu_seconds": 0.2697, - "children_cpu_seconds": 0.302 - }, - "fork_contended_mutex": { - "wall_seconds": 7.4983, - "cpu_seconds": 24.5089, - "children_cpu_seconds": 2.213 - }, - "fork_thread_local_orphan": { - "wall_seconds": 0.5863, - "cpu_seconds": 0.2787, - "children_cpu_seconds": 0.3075 - }, - "fork_gc_cycle": { - "wall_seconds": 0.7053, - "cpu_seconds": 0.4032, - "children_cpu_seconds": 0.3003 - }, - "fork_parent_frees_after_fork": { - "wall_seconds": 4.5946, - "cpu_seconds": 4.5223, - "children_cpu_seconds": 0.0401 - }, - "fork_child_sys_exit": { - "wall_seconds": 1.1108, - "cpu_seconds": 0.2825, - "children_cpu_seconds": 0.8249 - }, - "fork_stream_cleanup": { - "wall_seconds": 0.3052, - "cpu_seconds": 0.0209, - "children_cpu_seconds": 0.2905 - } -} \ No newline at end of file diff --git a/tests/perf/cpu/run_profile.py b/tests/perf/cpu/run_profile.py 
index f5f38c0a..24f5e3be 100644 --- a/tests/perf/cpu/run_profile.py +++ b/tests/perf/cpu/run_profile.py @@ -15,25 +15,18 @@ scenario call only, excluding interpreter startup. No profiler is attached, so the numbers are free of sampling overhead. - Profile pass: re-runs the scenario under `py-spy record` to produce a - flamegraph (SVG by default, speedscope JSON via PYSPY_FORMAT). Profile - numbers never feed the baseline; the artifact is diagnostic only. - -Results are compared against baseline.json (created on first run). The -comparison is REPORT-ONLY: over-threshold drift is printed and highlighted -in the CI step summary, but never fails the run. CPU timings on shared -runners are too noisy to gate CI; the memory benchmark is the gate. + flamegraph (SVG by default, speedscope JSON via PYSPY_FORMAT). The + artifact is diagnostic only. Usage: - python -m tests.perf.cpu.run_profile [--update-baseline] - [--scenario NAME] [--mode {all,timing,profile}] + python -m tests.perf.cpu.run_profile [--scenario NAME] [--mode {all,timing,profile}] ---mode exists so CI can run the two passes as parallel jobs on separate -runners: `timing` is unpolluted by py-spy CPU contention, `profile` only +--mode exists so CI could run the two passes as parallel jobs on separate +runners: `timing` is unaffected by py-spy CPU contention, `profile` only produces flamegraphs. Environment variables: - CPU_ITERATIONS: number of times each scenario loops (default: 100) -- CPU_THRESHOLD: drift multiplier, e.g. 
1.25 for +25% (default: 1.25) - CPU_REPEATS: fixed number of timing passes per scenario, median recorded (default: adaptive — 1 pass, extended to 5 when the first pass finishes under 1 second, where single-shot timing is mostly jitter) @@ -53,18 +46,14 @@ import tempfile from pathlib import Path -import platform - # Scenario name list from tests.perf.scenarios import SCENARIO_NAMES HERE = Path(__file__).parent REPO_ROOT = HERE.parent.parent.parent REPORTS_DIR = HERE / "reports" -BASELINE_FILE = HERE / "baseline.json" ITERATIONS = int(os.environ.get("CPU_ITERATIONS", "100")) -THRESHOLD = float(os.environ.get("CPU_THRESHOLD", "1.25")) REPEATS = int(os.environ.get("CPU_REPEATS", "0")) # 0 = adaptive PYSPY_RATE = int(os.environ.get("PYSPY_RATE", "100")) PYSPY_FORMAT = os.environ.get("PYSPY_FORMAT", "flamegraph") @@ -76,15 +65,9 @@ _FAST_WALL_SECONDS = 1.0 _FAST_REPEATS = 5 -# Metrics compared against the baseline. cpu_seconds is the primary signal -# (stable under runner load); wall_seconds also catches blocking/sleep -# regressions that never burn CPU. children_cpu_seconds (CPU burned in -# forked children, invisible to process_time) is reported but not part of -# the gate list. -_METRICS = ("wall_seconds", "cpu_seconds") _ALL_METRICS = ("wall_seconds", "cpu_seconds", "children_cpu_seconds") -# Drift on children_cpu_seconds is only meaningful when the baseline value -# is non-trivial; below this, percent deltas are pure noise. +# Below this, child-cpu is not worth printing; percent-scale noise on a +# ~0 base isn't informative. 
_CHILDREN_CPU_MIN_BASE = 0.01 @@ -216,48 +199,13 @@ def _run_pyspy_pass(name: str, out_path: Path) -> bool: return True -def _pyspy_version() -> str: - try: - out = subprocess.run(["py-spy", "--version"], capture_output=True, text=True) - # "py-spy 0.4.2" -> "0.4.2" - return out.stdout.strip().split()[-1] if out.returncode == 0 else "" - except OSError: - return "" - - -def _build_meta() -> dict: - """Provenance for the baseline: which toolchain produced these numbers. - Recorded so a committed baseline is reproducible under same conditions. - """ - native_version = "" - try: - native_version = (REPO_ROOT / "c2pa-native-version.txt").read_text().strip() - except OSError: - pass - return { - "pyspy_version": _pyspy_version(), - "python_version": platform.python_version(), - "c2pa_native_version": native_version, - "iterations": ITERATIONS, - "perf_env": PERF_ENV, - "arch": platform.machine(), - } - - def _fmt_secs(s: float) -> str: if s < 1: return f"{s * 1000:.1f} ms" return f"{s:.3f} s" -def _delta_pct(current: float, base: float) -> str: - """Signed percentage change vs baseline, or '-' when no baseline.""" - if not base: - return "-" - return f"{(current - base) / base * 100:+.1f}%" - - -def _write_github_summary(results: dict, baseline: dict) -> None: +def _write_github_summary(results: dict) -> None: """Append a values table to $GITHUB_STEP_SUMMARY when running in CI. 
""" summary_path = os.environ.get("GITHUB_STEP_SUMMARY") @@ -267,29 +215,18 @@ def _write_github_summary(results: dict, baseline: dict) -> None: lines = [ "## CPU benchmark (py-spy)", "", - f"Iterations: {ITERATIONS} · drift threshold: +{(THRESHOLD - 1) * 100:.0f}%" - f" · report-only" + f"Iterations: {ITERATIONS} · report-only" f"{f' · env: {PERF_ENV}' if PERF_ENV else ''}", "", - "| scenario | wall | cpu | cpu/iter | child cpu | wall Δ% | cpu Δ% | status |", - "|----------|------|-----|----------|-----------|---------|--------|--------|", + "| scenario | wall | cpu | cpu/iter | child cpu |", + "|----------|------|-----|----------|-----------|", ] for name, m in results.items(): - b = baseline.get(name, {}) if baseline else {} - wall_base = b.get("wall_seconds", 0) - cpu_base = b.get("cpu_seconds", 0) - over = ( - (wall_base and m["wall_seconds"] > wall_base * THRESHOLD) - or (cpu_base and m["cpu_seconds"] > cpu_base * THRESHOLD) - ) - status = "drift" if over else "ok" lines.append( f"| {name} | {_fmt_secs(m['wall_seconds'])} " f"| {_fmt_secs(m['cpu_seconds'])} " f"| {_fmt_secs(m['cpu_seconds'] / ITERATIONS)} " - f"| {_fmt_secs(m.get('children_cpu_seconds', 0))} " - f"| {_delta_pct(m['wall_seconds'], wall_base)} " - f"| {_delta_pct(m['cpu_seconds'], cpu_base)} | {status} |" + f"| {_fmt_secs(m.get('children_cpu_seconds', 0))} |" ) lines.append("") @@ -299,17 +236,11 @@ def _write_github_summary(results: dict, baseline: dict) -> None: def main() -> None: parser = argparse.ArgumentParser(description="c2pa-python CPU profiler") - parser.add_argument( - "--update-baseline", - action="store_true", - help="Overwrite baseline.json with current measurements and exit 0", - ) parser.add_argument( "--scenario", choices=SCENARIO_NAMES, default=None, - help="Run a single scenario instead of all of them. 
With --update-baseline, " - "only that scenario's entry in baseline.json is updated; the rest are kept.", + help="Run a single scenario instead of all of them.", ) parser.add_argument( "--mode", @@ -324,23 +255,10 @@ def main() -> None: run_timing = args.mode in ("all", "timing") run_profile = args.mode in ("all", "profile") - if args.update_baseline and not run_timing: - parser.error("--update-baseline requires a mode that measures timing") - scenarios_to_run = (args.scenario,) if args.scenario else SCENARIO_NAMES REPORTS_DIR.mkdir(parents=True, exist_ok=True) - # prior_baseline: the existing file, always loaded so a single-scenario - # update can preserve the other scenarios' entries when it rewrites the file. - prior_baseline: dict = {} - - # baseline: the subset used for the drift comparison below, which is - # suppressed when --update-baseline is set (because we are re-baselining). - if BASELINE_FILE.exists(): - prior_baseline = json.loads(BASELINE_FILE.read_text()) - baseline: dict = {} if args.update_baseline else prior_baseline - results: dict = {} render_failures: list[str] = [] @@ -362,28 +280,6 @@ def main() -> None: if children_cpu >= _CHILDREN_CPU_MIN_BASE: print(f" child cpu: {_fmt_secs(children_cpu)}") - if baseline and name in baseline: - b = baseline[name] - # children_cpu_seconds joins the drift check only when the - # baseline value is non-trivial (percent deltas on ~0 bases - # are pure noise). 
- checked: list[str] = list(_METRICS) - if b.get("children_cpu_seconds", 0) >= _CHILDREN_CPU_MIN_BASE: - checked.append("children_cpu_seconds") - for metric in checked: - current = metrics.get(metric, 0) - base = b.get(metric, 0) - limit = base * THRESHOLD - if current <= limit: - continue - diff_pct = (current - base) / base * 100 if base else float("inf") - print( - f" drift note: {name}.{metric}: {_fmt_secs(current)} " - f"> baseline {_fmt_secs(base)}" - f" (+{diff_pct:.1f}%, threshold {(THRESHOLD-1)*100:.0f}%)", - flush=True, - ) - if run_profile: out_path = REPORTS_DIR / f"{name}{env_tag}-cpu{ext}" if _run_pyspy_pass(name, out_path): @@ -391,44 +287,8 @@ def main() -> None: else: render_failures.append(name) - if run_timing and (args.update_baseline or not prior_baseline): - # When running a single scenario, merge its result into the existing - # baseline so the other scenarios' entries are preserved. A full run - # replaces the file wholesale. - if args.scenario and prior_baseline: - output = dict(prior_baseline) - else: - output = {} - new_meta = _build_meta() - # On a single-scenario merge the new entry must come from the same - # toolchain as the entries it is being merged next to, or the numbers - # are not comparable. Warn if _meta would change (e.g. wrong PERF_ENV, - # iteration count, or native version) instead of silently overwriting it. 
- if args.scenario and prior_baseline: - old_meta = prior_baseline.get("_meta", {}) - if old_meta and old_meta != new_meta: - diffs = sorted( - set(old_meta) | set(new_meta), - key=str, - ) - changed = [ - f"{k}: {old_meta.get(k)!r} -> {new_meta.get(k)!r}" - for k in diffs if old_meta.get(k) != new_meta.get(k) - ] - print( - "\nWARNING: this run's environment differs from the existing " - "baseline's _meta; the merged entry will NOT be comparable to " - "the other scenarios:\n " + "\n ".join(changed), - file=sys.stderr, - ) - output["_meta"] = new_meta - output.update(results) - BASELINE_FILE.write_text(json.dumps(output, indent=2)) - verb = "Updated" if prior_baseline else "Created" - print(f"\n{verb} baseline: {BASELINE_FILE}") - # Emit the report table to the PR's Step Summary in CI. - _write_github_summary(results, baseline) + _write_github_summary(results) if render_failures: print("\nPY-SPY PROFILES FAILED (timing metrics still recorded):", file=sys.stderr) @@ -436,7 +296,7 @@ def main() -> None: print(f" {name}", file=sys.stderr) if run_timing: - print("\nDone. Baseline comparison is report-only; drift never fails the run.") + print("\nDone. Timings are report-only and never fail the run.") if __name__ == "__main__": From 9711f4fbda54e68d0e109dbe2cd87720f11ab638 Mon Sep 17 00:00:00 2001 From: tmathern <60901087+tmathern@users.noreply.github.com> Date: Wed, 1 Jul 2026 19:45:50 -0700 Subject: [PATCH 5/5] fix: clean up bench --- tests/perf/cpu/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/perf/cpu/README.md b/tests/perf/cpu/README.md index 5a15a4f7..2aadfb90 100644 --- a/tests/perf/cpu/README.md +++ b/tests/perf/cpu/README.md @@ -65,7 +65,7 @@ The `cpu-bench` target runs the container with `--cap-add SYS_PTRACE --security- ## Interpreting the flamegraphs -Profiles show Python frames only. Time spent inside the Rust `libc2pa_c` library is attributed to the Python frame that made the FFI call. 
Fast scenarios can produce thin profiles: raising `PYSPY_RATE` or `CPU_ITERATIONS` will lead to getting more samples. +Profiles show Python frames only. Time spent inside the Rust `libc2pa_c` library is attributed to the Python frame that made the C FFI call. Fast scenarios can produce thin profiles; raise `PYSPY_RATE` or `CPU_ITERATIONS` to collect more samples. ## Reading the CI/CD report