diff --git a/.gitignore b/.gitignore index d58ae5a..3a6094b 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,10 @@ __pycache__/ /benchmarks/ /.watchdog-cm-work/ +# Watchdog runtime metrics written by tick (not test golden fixtures) +**/status.prom +**/status.prom.tmp + # Local CM / demo scratch (not part of the repo) /input-*-output-*.bin docs/live-demo.md diff --git a/docs/watchdog/README.md b/docs/watchdog/README.md index efa4912..772aff2 100644 --- a/docs/watchdog/README.md +++ b/docs/watchdog/README.md @@ -108,12 +108,13 @@ Lua modules: - `sequencer_reader.lua`: sequencer HTTP client (`GET /finalized_state/inclusion_block`, `GET /finalized_state`). - `compare.lua`: raw byte comparison. - `checkpoint.lua`: manifest-backed checkpoint persistence (`head.json` pointer). -- `state.lua`: persisted `config.json` and single-run state lock. +- `state.lua`: persisted `config.json`, atomic file writes, single-run state lock. +- `metrics.lua`: Prometheus textfile (`status.prom`) built and written each tick. - `retry.lua`: bounded retry helper used by the runtime. - `runner.lua`: one compare cycle — cheap `/finalized_state/inclusion_block` poll, then (when finalized advanced) L1 fetch, CM replay, SSZ compare, checkpoint write. -- `main.lua`: dispatches `init` and `tick`; `tick` exits `0`/`1`/`2`. +- `main.lua`: dispatches `init` and `tick`; `tick` exits `0`/`1`/`2` and writes `status.prom`. The L1 reader follows the Rust partition strategy from `sequencer/src/l1/partition.rs`: if an RPC provider rejects a large range, the @@ -152,6 +153,7 @@ inputs. state_dir/ config.json head.json + status.prom # Prometheus textfile from the last tick (see Metrics below) run.lock # advisory lock handle; file existence is not lock state checkpoints/ 00000000000001234567/ @@ -195,18 +197,64 @@ host scheduling should provide the same non-overlap guarantee. Each tick: then inspects with query `state`. 5. 
Byte-compares the SSZ report against `GET /finalized_state`; on match writes a new checkpoint, on mismatch emits a `watchdog_event` and exits `2`. +6. Atomically writes `$CARTESI_WATCHDOG_STATE_DIR/status.prom` (or + `CARTESI_WATCHDOG_METRICS_FILE`) before exit. Runtime knobs: - `CARTESI_WATCHDOG_BLOCKCHAIN_HTTP_ENDPOINT`: current L1 JSON-RPC endpoint for tick. +- `CARTESI_WATCHDOG_BLOCKCHAIN_ID`: optional chain id label persisted at `init` for `status.prom`. +- `CARTESI_WATCHDOG_METRICS_FILE`: optional override for the Prometheus textfile path (default `$CARTESI_WATCHDOG_STATE_DIR/status.prom`). - `CARTESI_WATCHDOG_RETRY_ATTEMPTS`: bounded retry attempts per run, default `3`. - `CARTESI_WATCHDOG_RETRY_DELAY_SEC`: delay between retry attempts, default `5`. +## Metrics (`status.prom`) + +Each `tick` writes a [Prometheus textfile](https://github.com/prometheus/node_exporter#textfile-collector) +before exiting. Operators scrape or push it from their side — the watchdog does +not run an HTTP server. + +| Exit code | `state` label | Meaning | +|-----------|---------------|---------| +| `0` | `ok` | Compare passed, or idle (finalized unchanged) | +| `1` | `warning` | Transient failure after retries | +| `2` | `failed` | Deterministic divergence | + +Gauges (labels `chain`, `app_address` on every series): + +- `cartesi_watchdog_status{state="ok|warning|failed"}` — exactly one series is `1` +- `cartesi_watchdog_last_tick_unix_seconds` +- `cartesi_watchdog_exit_code` +- `cartesi_watchdog_divergence_info{kind}` — only on exit `2` + +Set `CARTESI_WATCHDOG_BLOCKCHAIN_ID` at `init` for the `chain` label (defaults to +`unknown`). Golden fixtures: [`tests/fixtures/watchdog_status_ok.prom`](../../tests/fixtures/watchdog_status_ok.prom), +[`tests/fixtures/watchdog_status_failed.prom`](../../tests/fixtures/watchdog_status_failed.prom). 
+ +Example after a clean tick (`# HELP` / `# TYPE` comment lines omitted): + +```prometheus +cartesi_watchdog_status{app_address="0x4CE...",chain="11155111",state="ok"} 1 +cartesi_watchdog_status{app_address="0x4CE...",chain="11155111",state="warning"} 0 +cartesi_watchdog_status{app_address="0x4CE...",chain="11155111",state="failed"} 0 +cartesi_watchdog_last_tick_unix_seconds{app_address="0x4CE...",chain="11155111"} 1717420800 +cartesi_watchdog_exit_code{app_address="0x4CE...",chain="11155111"} 0 +``` + +Example Prometheus alert (pull or push gateway — operator choice): + +```promql +cartesi_watchdog_status{state="failed"} == 1 +``` + +Divergence playbook: **notify only**; manual intervention (see +[`operator-deployment.md`](operator-deployment.md)). + ## Local Tests | Command | What it exercises | |---------|-------------------| -| `just test-watchdog` | Lua unit tests (fake HTTP/RPC/CM; no live chain) | +| `just test-watchdog` | Lua unit tests (fake HTTP/RPC/CM; includes `status.prom` golden fixtures) | | `just test-watchdog-e2e` | Real CM: advance, inspect; optional live compare if `CARTESI_WATCHDOG_E2E_SEQUENCER_URL` set | | `just test-watchdog-compare-harness` | **Full E2E**: Anvil + devnet sequencer + `/finalized_state` + CM inspect + Lua `init`/`tick` | | `just test-rollups-e2e` | All rollups e2e scenarios; includes watchdog genesis/non-genesis compare plus `watchdog_non_genesis_divergence_test` (needs Sepolia CM image) | diff --git a/docs/watchdog/design-notes.md b/docs/watchdog/design-notes.md index 3a67303..4c3f587 100644 --- a/docs/watchdog/design-notes.md +++ b/docs/watchdog/design-notes.md @@ -81,6 +81,7 @@ fsync support. 
state/ config.json head.json + status.prom # last tick metrics (Prometheus textfile) run.lock # advisory lock handle in the production container checkpoints/ 00000000000000000042/ diff --git a/docs/watchdog/getting-started.md b/docs/watchdog/getting-started.md index 02bf3d7..5b9a06d 100644 --- a/docs/watchdog/getting-started.md +++ b/docs/watchdog/getting-started.md @@ -132,7 +132,7 @@ export CARTESI_WATCHDOG_LUA_DEPS=.deps/lua Success: exit **0**. If finalized has advanced, stderr ends in `compare pass complete`; if it has not, the tick exits idle after the cheap poll. -Exit codes from `sequencer-watchdog tick`: **0** clean (or idle — finalized unchanged), **1** transient failure (RPC/CM/network after retries), **2** deterministic divergence (`watchdog_event` emitted on stderr before exit). +Exit codes from `sequencer-watchdog tick`: **0** clean (or idle — finalized unchanged), **1** transient failure (RPC/CM/network after retries), **2** deterministic divergence (`watchdog_event` emitted on stderr before exit). Each tick writes `$CARTESI_WATCHDOG_STATE_DIR/status.prom` — see [`README.md` — Metrics](README.md#metrics-statusprom). The watchdog tick runs **one cycle per process and exits** — re-run it on a timer/cron for continuous monitoring. When `inclusion_block` has not advanced since the watchdog checkpoint, the cycle **skips** L1/CM work (idle-cheap) and exits 0. `sequencer-watchdog` takes a non-blocking `flock`; production schedulers should @@ -164,6 +164,8 @@ Full operator runbook: **[`operator-deployment.md`](operator-deployment.md)**. |----------|----------|-------------| | `CARTESI_WATCHDOG_SEQUENCER_URL` | yes | e.g. 
`http://127.0.0.1:54321` | | `CARTESI_WATCHDOG_BLOCKCHAIN_HTTP_ENDPOINT` | tick | Current L1 JSON-RPC; not persisted by `init` | +| `CARTESI_WATCHDOG_BLOCKCHAIN_ID` | init | Optional chain id label for `status.prom` | +| `CARTESI_WATCHDOG_METRICS_FILE` | tick | Optional override for Prometheus textfile path | | `CARTESI_WATCHDOG_CONTRACTS_INPUT_BOX_ADDRESS` | yes | InputBox contract | | `CARTESI_WATCHDOG_APP_ADDRESS` | yes | Rollup application contract | | `CARTESI_WATCHDOG_STATE_DIR` | yes | Persistent watchdog state (`config.json`, `head.json`, checkpoints) | diff --git a/docs/watchdog/operator-deployment.md b/docs/watchdog/operator-deployment.md index a254a70..1a4d0d4 100644 --- a/docs/watchdog/operator-deployment.md +++ b/docs/watchdog/operator-deployment.md @@ -157,6 +157,8 @@ Today `WalletApp::default()` / `WalletConfig::sepolia()` align with Sepolia stag | `CARTESI_WATCHDOG_STATE_DIR` | Persistent volume on watchdog host | | `CARTESI_WATCHDOG_CM_SNAPSHOT_DIR` | Bootstrap CM snapshot (`init` only) | | `CARTESI_WATCHDOG_CM_SNAPSHOT_SAFE_BLOCK` | L1 block that bootstrap snapshot represents (= finalized `inclusion_block` at bootstrap) | +| `CARTESI_WATCHDOG_BLOCKCHAIN_ID` | Chain id label for `status.prom` metrics (optional; defaults to `unknown`) | +| `CARTESI_WATCHDOG_METRICS_FILE` | Override path for the Prometheus textfile written by each `tick` | | `CARTESI_WATCHDOG_LUA_DEPS` | `.deps/lua` | The sequencer discovers and pins `input_box_address` at startup; use the same values as `CARTESI_SEQUENCER_BLOCKCHAIN_HTTP_ENDPOINT` / `CARTESI_SEQUENCER_APP_ADDRESS` configuration. @@ -181,11 +183,53 @@ sequencer-watchdog init After init, schedule `tick`; tick will fail if `head.json` is missing. +Each `tick` atomically writes a Prometheus textfile to +`$CARTESI_WATCHDOG_STATE_DIR/status.prom` (override with +`CARTESI_WATCHDOG_METRICS_FILE`). Operators can scrape or push it from their +side. 
Gauges: + +- `cartesi_watchdog_status{chain,app_address,state="ok|warning|failed"}` — `1` on the active state +- `cartesi_watchdog_last_tick_unix_seconds{chain,app_address}` +- `cartesi_watchdog_exit_code{chain,app_address}` +- `cartesi_watchdog_divergence_info{chain,app_address,kind}` — present on exit `2` + +Set `CARTESI_WATCHDOG_BLOCKCHAIN_ID` at `init` so `chain` is labeled (defaults to +`unknown` when omitted). Divergence playbook: notify only; manual intervention. + +Example `status.prom` after a successful tick (`# HELP` / `# TYPE` comment lines omitted): + +```prometheus +cartesi_watchdog_status{app_address="0x4CE...",chain="11155111",state="ok"} 1 +cartesi_watchdog_status{app_address="0x4CE...",chain="11155111",state="warning"} 0 +cartesi_watchdog_status{app_address="0x4CE...",chain="11155111",state="failed"} 0 +cartesi_watchdog_last_tick_unix_seconds{app_address="0x4CE...",chain="11155111"} 1717420800 +cartesi_watchdog_exit_code{app_address="0x4CE...",chain="11155111"} 0 +``` + +On divergence (exit `2`), `state="failed"` is `1` and +`cartesi_watchdog_divergence_info{kind="state_mismatch"}` (or +`inclusion_block_regressed`) is present. Example alert: + +```promql +cartesi_watchdog_status{state="failed"} == 1 +``` + +Cron + push pattern (operator pushes to Prometheus after each tick): + +```bash +#!/bin/sh +set -eu +sequencer-watchdog tick || true # exit code still written to status.prom +# push $CARTESI_WATCHDOG_STATE_DIR/status.prom via your exporter +``` + ### 6. Run tick The watchdog runs **one tick per process, then exits** — there is no daemon -loop. Run it once as a smoke check, then schedule it (systemd timer / k8s -CronJob) and alert on the exit code: +loop. Run it once as a smoke check, then schedule it (cron, systemd timer, k8s +CronJob). Alert on `$CARTESI_WATCHDOG_STATE_DIR/status.prom` (preferred for +Prometheus push/pull) or on the process exit code. If the process is killed +mid-tick, `status.prom` keeps the last completed value until the next run, so also alert when `cartesi_watchdog_last_tick_unix_seconds` goes stale. 
```bash sequencer-watchdog tick # exit 0 = clean/idle, 1 = transient, 2 = divergence diff --git a/docs/watchdog/staging-drills.md b/docs/watchdog/staging-drills.md index 7f73cc5..fc7064b 100644 --- a/docs/watchdog/staging-drills.md +++ b/docs/watchdog/staging-drills.md @@ -16,7 +16,7 @@ This document covers staging and manual verification beyond the devnet tutorial. - JSON is pure Lua (`watchdog/third_party/json.lua`); no cjson compile step - Staging or local sequencer reachable at `CARTESI_WATCHDOG_SEQUENCER_URL` - L1 RPC + InputBox + app addresses matching that deployment -- Log collection for `watchdog_event` lines and process exit codes +- Log collection for `watchdog_event` lines, process exit codes, and `status.prom` ## Drill 1 — Divergence signal (synthetic mismatch, no CM) @@ -30,7 +30,8 @@ CARTESI_WATCHDOG_LUA_DEPS=.deps/lua lua watchdog/tests/drill_divergence.lua # ``` Expected: `main.lua` emits a structured `watchdog_event` with `kind=state_mismatch` and -non-zero `mismatch_offset`, then the drill process exits with code `2`. +non-zero `mismatch_offset`, then the drill process exits with code `2` and writes +`status.prom` with `state="failed"`. Unit coverage: `just test-watchdog` (`runner returns state mismatch payload`). @@ -64,6 +65,8 @@ export CARTESI_WATCHDOG_LUA_DEPS=.deps/lua ``` Expected: exit **0**; the tick may exit idle if the finalized block is unchanged. +`$CARTESI_WATCHDOG_STATE_DIR/status.prom` should show `state="ok"` and +`cartesi_watchdog_exit_code ... 0`. The harness path also proves byte-identical **devnet** genesis SSZ on sequencer `/finalized_state` and CM inspect (same bytes as `wallet_snapshot::encode(WalletConfig::devnet())`; the `.hex` fixture is for Sepolia `default()` — do not use it as the devnet golden). 
@@ -86,7 +89,35 @@ Exit codes from `sequencer-watchdog tick`: |------|---------| | `0` | Compare cycle completed — clean, or idle when finalized is unchanged | | `1` | Transient error after retries (RPC, CM, network) | -| `2` | Deterministic divergence — `watchdog_event` on stderr with `{kind, previous_safe_block, sequencer_inclusion_block, mismatch_offset?}` | +| `2` | Deterministic divergence — `watchdog_event` on stderr with `{kind, previous_safe_block, sequencer_inclusion_block, mismatch_offset?}`; `status.prom` has `state="failed"` | + +Each tick also writes `$CARTESI_WATCHDOG_STATE_DIR/status.prom` before exit. See +[`README.md` — Metrics](README.md#metrics-statusprom) for gauge names and alert +examples. + +## Drill 4 — Metrics file (synthetic divergence) + +Verifies `status.prom` is written on divergence without a live sequencer. + +```bash +just watchdog-lua-deps +dir=$(mktemp -d) +export CARTESI_WATCHDOG_STATE_DIR="$dir" +export CARTESI_WATCHDOG_BLOCKCHAIN_ID=31337 +# init once (needs CM snapshot env — reuse Drill 2 exports), then: +CARTESI_WATCHDOG_LUA_DEPS=.deps/lua lua watchdog/tests/drill_divergence.lua || true +cat "$dir/status.prom" +``` + +Or run the unit tests (includes golden fixture checks): + +```bash +just test-watchdog +``` + +Expected after Drill 1: `cartesi_watchdog_status{...,state="failed"} 1` and +`cartesi_watchdog_divergence_info{...,kind="state_mismatch"} 1` in +`$CARTESI_WATCHDOG_STATE_DIR/status.prom`. ## Triage checklist diff --git a/tests/fixtures/watchdog_status_failed.prom b/tests/fixtures/watchdog_status_failed.prom new file mode 100644 index 0000000..bb55680 --- /dev/null +++ b/tests/fixtures/watchdog_status_failed.prom @@ -0,0 +1,14 @@ +# HELP cartesi_watchdog_status Current watchdog compare state (1 = active). 
+# TYPE cartesi_watchdog_status gauge +cartesi_watchdog_status{app_address="0xdeadbeef",chain="31337",state="ok"} 0 +cartesi_watchdog_status{app_address="0xdeadbeef",chain="31337",state="warning"} 0 +cartesi_watchdog_status{app_address="0xdeadbeef",chain="31337",state="failed"} 1 +# HELP cartesi_watchdog_last_tick_unix_seconds Unix time of the last completed tick. +# TYPE cartesi_watchdog_last_tick_unix_seconds gauge +cartesi_watchdog_last_tick_unix_seconds{app_address="0xdeadbeef",chain="31337"} 1710000000 +# HELP cartesi_watchdog_exit_code Process exit code from the last tick. +# TYPE cartesi_watchdog_exit_code gauge +cartesi_watchdog_exit_code{app_address="0xdeadbeef",chain="31337"} 2 +# HELP cartesi_watchdog_divergence_info Divergence kind from the last tick (1 = present). +# TYPE cartesi_watchdog_divergence_info gauge +cartesi_watchdog_divergence_info{app_address="0xdeadbeef",chain="31337",kind="state_mismatch"} 1 diff --git a/tests/fixtures/watchdog_status_ok.prom b/tests/fixtures/watchdog_status_ok.prom new file mode 100644 index 0000000..51dddbd --- /dev/null +++ b/tests/fixtures/watchdog_status_ok.prom @@ -0,0 +1,11 @@ +# HELP cartesi_watchdog_status Current watchdog compare state (1 = active). +# TYPE cartesi_watchdog_status gauge +cartesi_watchdog_status{app_address="0x4CE633CA71071818cD73187765ee60F696dae083",chain="11155111",state="ok"} 1 +cartesi_watchdog_status{app_address="0x4CE633CA71071818cD73187765ee60F696dae083",chain="11155111",state="warning"} 0 +cartesi_watchdog_status{app_address="0x4CE633CA71071818cD73187765ee60F696dae083",chain="11155111",state="failed"} 0 +# HELP cartesi_watchdog_last_tick_unix_seconds Unix time of the last completed tick. +# TYPE cartesi_watchdog_last_tick_unix_seconds gauge +cartesi_watchdog_last_tick_unix_seconds{app_address="0x4CE633CA71071818cD73187765ee60F696dae083",chain="11155111"} 1710000000 +# HELP cartesi_watchdog_exit_code Process exit code from the last tick. 
+# TYPE cartesi_watchdog_exit_code gauge +cartesi_watchdog_exit_code{app_address="0x4CE633CA71071818cD73187765ee60F696dae083",chain="11155111"} 0 diff --git a/watchdog/config.lua b/watchdog/config.lua index f5404ea..54b24d0 100644 --- a/watchdog/config.lua +++ b/watchdog/config.lua @@ -81,6 +81,8 @@ function config.load_init(env) env ), cm_image_hash = env.CARTESI_WATCHDOG_CM_IMAGE_HASH, + blockchain_id = env.CARTESI_WATCHDOG_BLOCKCHAIN_ID, + metrics_file = env.CARTESI_WATCHDOG_METRICS_FILE, retry_attempts = optional_number("CARTESI_WATCHDOG_RETRY_ATTEMPTS", 3, env), retry_delay_sec = optional_number("CARTESI_WATCHDOG_RETRY_DELAY_SEC", 5, env), long_block_range_error_codes = split_csv( @@ -116,6 +118,7 @@ function config.persisted(cfg) app_address = cfg.app_address, input_added_topic = cfg.input_added_topic, cm_image_hash = cfg.cm_image_hash, + blockchain_id = cfg.blockchain_id, retry_attempts = cfg.retry_attempts, retry_delay_sec = cfg.retry_delay_sec, long_block_range_error_codes = cfg.long_block_range_error_codes, @@ -136,6 +139,8 @@ function config.from_persisted(state_dir, data, env) app_address = required_field(data, "app_address"), input_added_topic = data.input_added_topic, cm_image_hash = data.cm_image_hash, + blockchain_id = data.blockchain_id, + metrics_file = env.CARTESI_WATCHDOG_METRICS_FILE, retry_attempts = optional_field_number(data, "retry_attempts", 3), retry_delay_sec = optional_field_number(data, "retry_delay_sec", 5), long_block_range_error_codes = data.long_block_range_error_codes or {}, diff --git a/watchdog/main.lua b/watchdog/main.lua index 9b079ae..3a73627 100644 --- a/watchdog/main.lua +++ b/watchdog/main.lua @@ -9,6 +9,7 @@ local http_mod = require("watchdog.http") local json_mod = require("watchdog.json") local jsonrpc = require("watchdog.jsonrpc") local machine_cartesi = require("watchdog.machine_cartesi") +local metrics = require("watchdog.metrics") local retry = require("watchdog.retry") local runner = require("watchdog.runner") 
local sequencer_reader = require("watchdog.sequencer_reader") @@ -52,6 +53,28 @@ local function emit_watchdog_event(json, payload, deps) io.stderr:write("watchdog_event " .. json.encode(payload) .. "\n") end +local function divergence_kind_from_payload(payload) + if type(payload) == "table" and type(payload.kind) == "string" then + return payload.kind + end + return nil +end + +local function write_tick_metrics(cfg, exit_code, payload, env) -- best-effort: must never raise or change the tick exit code + local called, ok, err = pcall(metrics.write_tick_status, { -- pcall: resolve_path can error() and build_prom asserts + cfg = cfg, + env = env or os.getenv, + exit_code = exit_code, + chain_id = cfg and cfg.blockchain_id or nil, + app_address = cfg and cfg.app_address or nil, + divergence_kind = divergence_kind_from_payload(payload), + timestamp = os.time(), + }) + if not (called and ok) then -- either a raised error (message in `ok`) or a returned nil, err + io.stderr:write("watchdog metrics write failed: " .. tostring(called and err or ok) .. "\n") + end +end + local function default_machine_deps(cfg) return { machine = load_machine(cfg), @@ -166,7 +189,10 @@ local function load_or_error(loader) return value end -local function exit_for_result(command, exit_code, err) +local function exit_for_result(command, exit_code, err, cfg, env) + if command == "tick" then + write_tick_metrics(cfg, exit_code, err, env) + end if exit_code == EXIT_DIVERGENCE then os.exit(EXIT_DIVERGENCE) end @@ -178,9 +204,13 @@ local function exit_for_result(command, exit_code, err) end -- One compare cycle per `tick` process. Infra (systemd timer / k8s CronJob) --- schedules re-runs and reacts to the exit code; the watchdog itself does not loop. -local function main(argv) +-- schedules re-runs and reacts to status.prom / the exit code; the watchdog +-- itself does not loop. 
+local function main(argv, opts) argv = argv or arg or {} + opts = opts or {} + local injected_env = opts.env + local injected_deps = opts.deps prepend_deps_cpath() local command = argv[1] @@ -206,21 +236,23 @@ local function main(argv) if command == "tick" then local cfg, cfg_err = load_or_error(function() - return load_tick_config() + return load_tick_config(injected_env) end) if not cfg then io.stderr:write("watchdog tick failed: " .. tostring(cfg_err) .. "\n") + write_tick_metrics(nil, EXIT_TRANSIENT, cfg_err, injected_env) os.exit(EXIT_TRANSIENT) end - local ok, exit_code, err = pcall(run_tick, cfg) + local ok, exit_code, err = pcall(run_tick, cfg, injected_deps) if not ok then io.stderr:write("watchdog tick failed: " .. tostring(exit_code) .. "\n") + write_tick_metrics(cfg, EXIT_TRANSIENT, exit_code, injected_env) os.exit(EXIT_TRANSIENT) end if not exit_code then exit_code = EXIT_TRANSIENT end - exit_for_result(command, exit_code, err) + exit_for_result(command, exit_code, err, cfg, injected_env) end io.stderr:write(usage() .. 
"\n") @@ -246,4 +278,5 @@ return { EXIT_OK = EXIT_OK, EXIT_TRANSIENT = EXIT_TRANSIENT, EXIT_DIVERGENCE = EXIT_DIVERGENCE, + write_tick_metrics = write_tick_metrics, } diff --git a/watchdog/metrics.lua b/watchdog/metrics.lua new file mode 100644 index 0000000..8569c7b --- /dev/null +++ b/watchdog/metrics.lua @@ -0,0 +1,155 @@ +-- (c) Cartesi and individual authors (see AUTHORS) +-- SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +local state = require("watchdog.state") + +local metrics = {} + +metrics.STATUS_FILE = "status.prom" +metrics.METRIC_STATUS = "cartesi_watchdog_status" +metrics.METRIC_LAST_TICK = "cartesi_watchdog_last_tick_unix_seconds" +metrics.METRIC_EXIT_CODE = "cartesi_watchdog_exit_code" + +local function normalize_env(env) + if env == nil then + return os.getenv + end + if type(env) == "function" then + return env + end + return function(name) + return env[name] + end +end + +local function escape_label(value) + value = tostring(value) + value = value:gsub("\\", "\\\\") + value = value:gsub('"', '\\"') + value = value:gsub("\n", "\\n") + return value +end + +local function base_labels(opts) + local labels = {} + local chain = opts.chain_id + if chain == nil or chain == "" then + chain = "unknown" + end + labels.chain = tostring(chain) + + local app_address = opts.app_address + if app_address == nil or app_address == "" then + app_address = "unknown" + end + labels.app_address = tostring(app_address) + return labels +end + +local function label_string(labels) + local keys = {} + for key in pairs(labels) do + table.insert(keys, key) + end + table.sort(keys) + + local parts = {} + for _, key in ipairs(keys) do + table.insert(parts, key .. '="' .. escape_label(labels[key]) .. '"') + end + return "{" .. table.concat(parts, ",") .. "}" +end + +local function gauge_line(name, labels, value) + return name .. label_string(labels) .. " " .. 
tostring(value) +end + +function metrics.state_for_exit_code(exit_code) + if exit_code == 2 then + return "failed" + end + if exit_code == 1 then + return "warning" + end + return "ok" +end + +function metrics.resolve_path(cfg, env) + local getenv = normalize_env(env) + local configured = getenv("CARTESI_WATCHDOG_METRICS_FILE") + if configured ~= nil and configured ~= "" then + return configured + end + if cfg and cfg.metrics_file and cfg.metrics_file ~= "" then + return cfg.metrics_file + end + if cfg and cfg.state_dir and cfg.state_dir ~= "" then + return cfg.state_dir .. "/" .. metrics.STATUS_FILE + end + local state_dir = getenv("CARTESI_WATCHDOG_STATE_DIR") + if state_dir == nil or state_dir == "" then + error("CARTESI_WATCHDOG_STATE_DIR is required") + end + return state_dir .. "/" .. metrics.STATUS_FILE +end + +function metrics.build_prom(opts) + assert(type(opts) == "table", "opts is required") + assert(type(opts.exit_code) == "number", "exit_code is required") + + local labels = base_labels(opts) + local active_state = metrics.state_for_exit_code(opts.exit_code) + local timestamp = opts.timestamp or os.time() + + local lines = { + "# HELP " .. metrics.METRIC_STATUS .. " Current watchdog compare state (1 = active).", + "# TYPE " .. metrics.METRIC_STATUS .. " gauge", + gauge_line(metrics.METRIC_STATUS, { + chain = labels.chain, + app_address = labels.app_address, + state = "ok", + }, active_state == "ok" and 1 or 0), + gauge_line(metrics.METRIC_STATUS, { + chain = labels.chain, + app_address = labels.app_address, + state = "warning", + }, active_state == "warning" and 1 or 0), + gauge_line(metrics.METRIC_STATUS, { + chain = labels.chain, + app_address = labels.app_address, + state = "failed", + }, active_state == "failed" and 1 or 0), + "# HELP " .. metrics.METRIC_LAST_TICK .. " Unix time of the last completed tick.", + "# TYPE " .. metrics.METRIC_LAST_TICK .. 
" gauge", + gauge_line(metrics.METRIC_LAST_TICK, { + chain = labels.chain, + app_address = labels.app_address, + }, timestamp), + "# HELP " .. metrics.METRIC_EXIT_CODE .. " Process exit code from the last tick.", + "# TYPE " .. metrics.METRIC_EXIT_CODE .. " gauge", + gauge_line(metrics.METRIC_EXIT_CODE, { + chain = labels.chain, + app_address = labels.app_address, + }, opts.exit_code), + } + + if opts.divergence_kind and opts.divergence_kind ~= "" then + table.insert(lines, "# HELP cartesi_watchdog_divergence_info Divergence kind from the last tick (1 = present).") + table.insert(lines, "# TYPE cartesi_watchdog_divergence_info gauge") + table.insert(lines, gauge_line("cartesi_watchdog_divergence_info", { + chain = labels.chain, + app_address = labels.app_address, + kind = opts.divergence_kind, + }, 1)) + end + + return table.concat(lines, "\n") .. "\n" +end + +function metrics.write_tick_status(opts) + local path = metrics.resolve_path(opts.cfg, opts.env) + local body = metrics.build_prom(opts) + return state.write_file_atomic(path, body) +end + +return metrics diff --git a/watchdog/state.lua b/watchdog/state.lua index ae17c48..d4c58bc 100644 --- a/watchdog/state.lua +++ b/watchdog/state.lua @@ -53,6 +53,30 @@ function state.ensure_dir(dir) return mkdir_p(dir) end +function state.write_file_atomic(path, data) + assert(type(path) == "string" and path ~= "", "path is required") + assert(type(data) == "string", "data must be a string") + + local dir = path:match("^(.*)/[^/]+$") + if dir and dir ~= "" then + local ok, err = state.ensure_dir(dir) + if not ok then + return nil, err + end + end + + local tmp = path .. 
".tmp" + local ok, err = write_all(tmp, data) + if not ok then + return nil, err + end + ok, err = os.rename(tmp, path) + if not ok then + return nil, err + end + return true +end + function state.write_json_atomic(dir, name, value, json) assert(type(dir) == "string" and dir ~= "", "state dir is required") assert(type(name) == "string" and name ~= "", "file name is required") diff --git a/watchdog/tests/drill_divergence.lua b/watchdog/tests/drill_divergence.lua index ad4b69c..12ad4e5 100644 --- a/watchdog/tests/drill_divergence.lua +++ b/watchdog/tests/drill_divergence.lua @@ -18,12 +18,14 @@ local main_mod = require("watchdog.main") local log = dofile("watchdog/tests/e2e_log.lua") local function fake_cfg() + local state_dir = os.getenv("CARTESI_WATCHDOG_STATE_DIR") or "/tmp/watchdog-drill" return { - state_dir = "/tmp/watchdog-drill", + state_dir = state_dir, cm_snapshot_dir = "/tmp/genesis-snapshot", cm_snapshot_safe_block = 0, input_box_address = "0xinputbox", app_address = "0x1111111111111111111111111111111111111111", + blockchain_id = os.getenv("CARTESI_WATCHDOG_BLOCKCHAIN_ID") or "31337", input_added_topic = "0xtopic", long_block_range_error_codes = require("watchdog.l1_reader").DEFAULT_LONG_BLOCK_RANGE_ERROR_CODES, retry_attempts = 1, @@ -105,4 +107,27 @@ log.pass( "divergence-signal-drill", string.format("main.lua emitted state_mismatch; mismatch_offset=%s", tostring(captured_event.mismatch_offset)) ) + +local cfg = fake_cfg() +main_mod.write_tick_metrics(cfg, exit_code, err, { + CARTESI_WATCHDOG_STATE_DIR = cfg.state_dir, +}) +local prom_path = cfg.state_dir .. "/status.prom" +local prom_file = io.open(prom_path, "rb") +if not prom_file then + log.fail("divergence-signal-drill", "expected status.prom at " .. 
prom_path) + os.exit(1) +end +local prom_body = prom_file:read("*a") +prom_file:close() +if prom_body:find('state="failed"} 1', 1, true) == nil then + log.fail("divergence-signal-drill", "status.prom missing failed state") + os.exit(1) +end +if prom_body:find('kind="state_mismatch"} 1', 1, true) == nil then + log.fail("divergence-signal-drill", "status.prom missing divergence kind") + os.exit(1) +end +log.pass("divergence-signal-drill", "status.prom written with failed state") + os.exit(main_mod.EXIT_DIVERGENCE) diff --git a/watchdog/tests/run.lua b/watchdog/tests/run.lua index d2fadb6..f8efabd 100644 --- a/watchdog/tests/run.lua +++ b/watchdog/tests/run.lua @@ -10,6 +10,7 @@ local config = require("watchdog.config") local jsonrpc = require("watchdog.jsonrpc") local l1_reader = require("watchdog.l1_reader") local main_mod = require("watchdog.main") +local metrics = require("watchdog.metrics") local retry = require("watchdog.retry") local runner = require("watchdog.runner") local sequencer_reader = require("watchdog.sequencer_reader") @@ -27,6 +28,26 @@ local function assert_eq(actual, expected) end end +local TEST_OS_EXIT = "__TEST_OS_EXIT__" + +local function capture_os_exit(fn) + local captured = nil + local original_exit = os.exit + os.exit = function(code) + captured = code or 0 + error(TEST_OS_EXIT) + end + local ok, err = pcall(fn) + os.exit = original_exit + if not ok then + if tostring(err):find(TEST_OS_EXIT, 1, true) then + return captured, nil + end + return nil, err + end + return captured, nil +end + test("raw compare fails byte-different JSON", function() local ok, offset = compare.raw_equal('{"a":1}', '{ "a": 1 }') assert_eq(ok, false) @@ -466,6 +487,7 @@ local function fake_cfg() cm_snapshot_safe_block = 0, input_box_address = "0xinputbox", app_address = "0x1111111111111111111111111111111111111111", + blockchain_id = "31337", input_added_topic = "0xtopic", long_block_range_error_codes = l1_reader.DEFAULT_LONG_BLOCK_RANGE_ERROR_CODES, retry_attempts = 
1, @@ -514,6 +536,255 @@ local function fake_machine(inspect_state) return machine end +--[[ Reads the whole file at `path` in binary mode and returns its contents; raises an error naming the path when the file cannot be opened. ]] local function load_fixture(path) + local file, err = io.open(path, "rb") + if not file then + error("open " .. path .. ": " .. tostring(err)) + end + local body = file:read("*a") + file:close() + return body +end + +--[[ Runs init, then main({ "tick" }) with stubbed deps in which the checkpoint safe_block and the finalized inclusion_block are both 5. Expects EXIT_OK and a status.prom in the state dir carrying state="ok" and exit code 0. ]] test("main tick writes status.prom through exit path", function() + local dir = os.tmpname() + os.remove(dir) + + local cfg = fake_cfg() + cfg.state_dir = dir + local init_result, init_err = main_mod.run_init(cfg, { machine = fake_machine("{}") }) + assert(init_result, init_err) + + local tick_env = { + CARTESI_WATCHDOG_STATE_DIR = dir, + CARTESI_WATCHDOG_BLOCKCHAIN_HTTP_ENDPOINT = "http://tick-rpc", + } + local tick_deps = { + checkpoint = { + load = function(_state_dir) + return { snapshot_dir = "/tmp/snapshot", safe_block = 5 } + end, + }, + sequencer = { + get_finalized_inclusion_block = function() + return { inclusion_block = 5, l2_tx_index = 0 } + end, + }, + machine = fake_machine("{}"), + } + + local exit_code, run_err = capture_os_exit(function() + main_mod.main({ "tick" }, { deps = tick_deps, env = tick_env }) + end) + assert(run_err == nil, tostring(run_err)) + assert_eq(exit_code, main_mod.EXIT_OK) + + local file = assert(io.open(dir .. 
"/status.prom", "rb")) + local body = file:read("*a") + file:close() + assert(body:find('state="ok"} 1', 1, true) ~= nil, body) + assert(body:find('cartesi_watchdog_exit_code{app_address="0x1111111111111111111111111111111111111111",chain="31337"} 0', 1, true) ~= nil, body) +end) + +--[[ build_prom output for exit code 0 must equal tests/fixtures/watchdog_status_ok.prom exactly; a fixed timestamp is passed in. ]] test("metrics prom matches golden ok fixture", function() + local body = metrics.build_prom({ + exit_code = 0, + chain_id = "11155111", + app_address = "0x4CE633CA71071818cD73187765ee60F696dae083", + timestamp = 1710000000, + }) + assert_eq(body, load_fixture("tests/fixtures/watchdog_status_ok.prom")) +end) + +--[[ build_prom output for exit code 2 with divergence_kind "state_mismatch" must equal tests/fixtures/watchdog_status_failed.prom exactly. ]] test("metrics prom matches golden failed fixture", function() + local body = metrics.build_prom({ + exit_code = 2, + chain_id = "31337", + app_address = "0xdeadbeef", + divergence_kind = "state_mismatch", + timestamp = 1710000000, + }) + assert_eq(body, load_fixture("tests/fixtures/watchdog_status_failed.prom")) +end) + +--[[ Exit code 1 must produce state="warning" with value 1 and no cartesi_watchdog_divergence_info series. ]] test("metrics prom marks warning state without divergence info", function() + local body = metrics.build_prom({ + exit_code = 1, + chain_id = "31337", + app_address = "0xdeadbeef", + timestamp = 1710000000, + }) + + assert(body:find('state="warning"} 1', 1, true) ~= nil, body) + assert(body:find("cartesi_watchdog_divergence_info", 1, true) == nil, body) +end) + +--[[ CARTESI_WATCHDOG_METRICS_FILE in the env table must override the state-dir default path. 
]] test("metrics resolve_path honors custom metrics file env", function() + local path = metrics.resolve_path({ state_dir = "/var/lib/watchdog" }, { + CARTESI_WATCHDOG_METRICS_FILE = "/tmp/custom.prom", + }) + assert_eq(path, "/tmp/custom.prom") +end) + +--[[ With an empty env table the resolved path must be <state_dir>/status.prom. ]] test("metrics resolve_path defaults to state dir status prom", function() + local path = metrics.resolve_path({ state_dir = "/var/lib/watchdog" }, {}) + assert_eq(path, "/var/lib/watchdog/status.prom") +end) + +--[[ When chain_id and app_address are omitted, both labels must read "unknown". ]] test("metrics prom uses unknown labels when chain and app are missing", function() + local body = metrics.build_prom({ + exit_code = 0, + timestamp = 1710000000, + }) + assert(body:find('chain="unknown"', 1, true) ~= nil, body) + 
assert(body:find('app_address="unknown"', 1, true) ~= nil, body) +end) + +--[[ A double quote and a backslash inside label values must appear backslash-escaped in the output. ]] test("metrics prom escapes label special characters", function() + local body = metrics.build_prom({ + exit_code = 2, + chain_id = '31"337', + app_address = '0x\\addr', + divergence_kind = "state_mismatch", + timestamp = 1710000000, + }) + assert(body:find('chain="31\\"337"', 1, true) ~= nil, body) + assert(body:find('app_address="0x\\\\addr"', 1, true) ~= nil, body) +end) + +--[[ After write_tick_status succeeds, status.prom must exist with the ok state and status.prom.tmp must not be openable. NOTE(review): only the absence of the tmp file is checked; atomicity itself is not observed by this test. ]] test("metrics write is atomic and leaves no tmp file", function() + local dir = os.tmpname() + os.remove(dir) + + local prom_path = dir .. "/status.prom" + local ok, err = metrics.write_tick_status({ + cfg = { state_dir = dir, app_address = "0xabc", blockchain_id = "1" }, + exit_code = 0, + timestamp = 1710000000, + }) + assert(ok, err) + + local tmp = io.open(prom_path .. ".tmp", "rb") + assert_eq(tmp, nil) + local file = assert(io.open(prom_path, "rb")) + local body = file:read("*a") + file:close() + assert(body:find('state="ok"} 1', 1, true) ~= nil, body) +end) + +--[[ The checkpoint safe_block equals the finalized inclusion_block (5), and run_compare_cycle is expected to return EXIT_OK with payload.skipped set. write_tick_metrics must then record state="ok" and no divergence series. 
]] test("successful idle compare writes ok status prom", function() + local dir = os.tmpname() + os.remove(dir) + + local cfg = fake_cfg() + cfg.state_dir = dir + + local result, err = main_mod.run_init(cfg, { machine = fake_machine("{}") }) + assert(result, err) + + local exit_code, payload = main_mod.run_compare_cycle(cfg, { + checkpoint = { + load = function(_state_dir) + return { snapshot_dir = "/tmp/snapshot", safe_block = 5 } + end, + }, + sequencer = { + get_finalized_inclusion_block = function() + return { inclusion_block = 5, l2_tx_index = 0 } + end, + }, + machine = fake_machine("{}"), + }) + assert_eq(exit_code, main_mod.EXIT_OK) + assert(payload.skipped, "expected idle skip") + + main_mod.write_tick_metrics(cfg, exit_code, payload, { + CARTESI_WATCHDOG_STATE_DIR = dir, + }) + + local file = assert(io.open(dir .. 
"/status.prom", "rb")) + local body = file:read("*a") + file:close() + assert(body:find('state="ok"} 1', 1, true) ~= nil, body) + assert(body:find("cartesi_watchdog_divergence_info", 1, true) == nil, body) +end) + +--[[ Exit code mapping: 0 -> "ok", 1 -> "warning", 2 -> "failed". ]] test("metrics maps exit codes to status states", function() + assert_eq(metrics.state_for_exit_code(0), "ok") + assert_eq(metrics.state_for_exit_code(1), "warning") + assert_eq(metrics.state_for_exit_code(2), "failed") +end) + +--[[ Checks the series lines emitted for exit code 2 with their complete label sets: status failed=1 and ok=0, last tick timestamp, exit code, and divergence info with the kind label. ]] test("metrics prom file marks failed state on divergence", function() + local body = metrics.build_prom({ + exit_code = 2, + chain_id = "31337", + app_address = "0xdeadbeef", + divergence_kind = "state_mismatch", + timestamp = 1710000000, + }) + + assert(body:find('cartesi_watchdog_status{app_address="0xdeadbeef",chain="31337",state="failed"} 1', 1, true) ~= nil, body) + assert(body:find('cartesi_watchdog_status{app_address="0xdeadbeef",chain="31337",state="ok"} 0', 1, true) ~= nil, body) + assert(body:find('cartesi_watchdog_last_tick_unix_seconds{app_address="0xdeadbeef",chain="31337"} 1710000000', 1, true) ~= nil, body) + assert(body:find('cartesi_watchdog_exit_code{app_address="0xdeadbeef",chain="31337"} 2', 1, true) ~= nil, body) + assert(body:find('cartesi_watchdog_divergence_info{app_address="0xdeadbeef",chain="31337",kind="state_mismatch"} 1', 1, true) ~= nil, body) +end) + +--[[ The finalized inclusion_block (2) is ahead of the checkpoint safe_block (1); the sequencer stub reports state '{"a":1}' while the fake machine is built with '{ "a": 1 }'. The cycle is expected to end in EXIT_DIVERGENCE with kind "state_mismatch", after which write_tick_metrics must record the failed state and the divergence kind. 
]] test("tick writes status.prom into watchdog state dir", function() + local dir = os.tmpname() + os.remove(dir) + + local cfg = fake_cfg() + cfg.state_dir = dir + cfg.blockchain_id = "31337" + + local result, err = main_mod.run_init(cfg, { machine = fake_machine("{}") }) + assert(result, err) + + local exit_code, payload = main_mod.run_compare_cycle(cfg, { + checkpoint = { + load = function(_state_dir) + return { snapshot_dir = "/tmp/snapshot", safe_block = 1 } + end, + }, + sequencer = { + get_finalized_inclusion_block = function() + return { inclusion_block = 2, l2_tx_index = 0 } + end, + get_finalized_state = function() + return { + inclusion_block = 
2, + l2_tx_index = 0, + state = '{"a":1}', + } + end, + }, + fetch_inputs = function(from_block, to_block) + assert_eq(from_block, 2) + assert_eq(to_block, 2) + return {} + end, + machine = fake_machine('{ "a": 1 }'), + }) + assert_eq(exit_code, main_mod.EXIT_DIVERGENCE) + assert(type(payload) == "table", "expected divergence payload") + assert_eq(payload.kind, "state_mismatch") + + main_mod.write_tick_metrics(cfg, exit_code, payload, { + CARTESI_WATCHDOG_STATE_DIR = dir, + }) + + local prom_path = dir .. "/status.prom" + local file, open_err = io.open(prom_path, "rb") + assert(file, open_err) + local body = file:read("*a") + file:close() + + assert(body:find('state="failed"} 1', 1, true) ~= nil, body) + assert(body:find('kind="state_mismatch"} 1', 1, true) ~= nil, body) +end) + test("init stores bootstrap snapshot as watchdog head", function() local dir = os.tmpname() os.remove(dir) @@ -539,6 +810,7 @@ test("init stores bootstrap snapshot as watchdog head", function() assert(persisted, cfg_err) assert_eq(persisted.sequencer_url, "http://sequencer") assert_eq(persisted.l1_rpc_url, nil) + assert_eq(persisted.blockchain_id, "31337") local tick_cfg = main_mod.load_tick_config({ CARTESI_WATCHDOG_STATE_DIR = dir, @@ -547,6 +819,8 @@ test("init stores bootstrap snapshot as watchdog head", function() assert_eq(tick_cfg.state_dir, dir) assert_eq(tick_cfg.sequencer_url, "http://sequencer") assert_eq(tick_cfg.l1_rpc_url, "http://tick-rpc") + assert_eq(tick_cfg.blockchain_id, "31337") + assert_eq(tick_cfg.app_address, "0x1111111111111111111111111111111111111111") end) test("tick config requires current RPC URL outside persisted state", function()