From 0648a9ea8ddef1100430206589ba6f1837755dc9 Mon Sep 17 00:00:00 2001 From: David Boreham Date: Tue, 30 Jun 2026 08:10:05 -0600 Subject: [PATCH 1/2] Add backup functionality for docker --- docs/backup-implementation.md | 176 +++++++++++++++++ docs/backup.md | 272 ++++++++++++++++++++++++++ src/stack/constants.py | 5 + src/stack/deploy/deployment_create.py | 33 ++++ src/stack/deploy/spec.py | 3 + src/stack/deploy/stack.py | 32 +++ src/stack/init/init.py | 2 + tests/backup/run-test.sh | 176 +++++++++++++++++ 8 files changed, 699 insertions(+) create mode 100644 docs/backup-implementation.md create mode 100644 docs/backup.md create mode 100755 tests/backup/run-test.sh diff --git a/docs/backup-implementation.md b/docs/backup-implementation.md new file mode 100644 index 0000000..aeddfb7 --- /dev/null +++ b/docs/backup-implementation.md @@ -0,0 +1,176 @@ +# Backup — implementation sketch + +> **Status: design sketch — not yet implemented.** Companion to [backup.md](./backup.md). This pins the +> design down to concrete functions and insertion points in the existing code, mirroring how ingress is +> implemented, so the work can be reviewed before it is written. Code below is illustrative, not final. + +## Scope of this sketch + +This covers the **deploy-time injection pass** (the first concrete step) and the two things it depends on: + +1. parsing the `@stack backup-*` annotations into the spec (mirrors `Stack.get_http_proxy_targets`); +2. an accessor for that spec section (mirrors `Spec.get_http_proxy`); +3. injecting a backup service into the generated deployment at deploy time (mirrors the + `VIRTUAL_HOST_MULTIPORTS` injection). + +**Out of scope here** (later steps): the `bozemanpass/backup-stack` container image, the +`stack manage … backup` subcommands, and the Kubernetes K8up-resource emission. Those are tracked in +[backup.md](./backup.md). 
+ +## The ingress pattern we are mirroring + +For reference, ingress works in three touch-points, all of which have a backup analogue: + +| Ingress | Backup analogue | +| --------------------------------------------------------- | --------------------------------------------------------- | +| `Stack.get_http_proxy_targets()` parses port annotations (`stack.py:203`) | `Stack.get_backup_targets()` parses volume/service annotations | +| targets written into `network.http-proxy` spec section | targets written into a `backup` spec section | +| `Spec.get_http_proxy()` accessor (`spec.py:140`) | `Spec.get_backup()` accessor | +| inject `VIRTUAL_HOST_MULTIPORTS` env at deploy (`deployment_create.py:550`) | inject a `backup` service + `:ro` mounts at deploy | + +## 1. Parse annotations — `deploy/stack.py` + +Add a method alongside `get_http_proxy_targets` (`stack.py:203`). Unlike ingress, backup annotations attach +to two different places — a **volume mount line** (`backup-exclude`) and a **service** (`backup-command`, +`backup-file-extension`). Both are read from `ruamel`'s comment attributes (`.ca`), exactly as the ingress +parser reads port-line comments. + +```python +# constants.py (new) +backup_key = "backup" +backup_exclude_annotation = "backup-exclude" +backup_command_annotation = "backup-command" +backup_file_extension_annotation = "backup-file-extension" + +# deploy/stack.py — new method on Stack +def get_backup_targets(self): + """Parse @stack backup-* annotations from the stack's composefiles. + + Returns {"exclude": [volume_name, ...], + "commands": {service_name: {"command": str, "file_extension": str}}} + """ + exclude = [] + commands = {} + for pod in self.get_pod_list(): + parsed_pod_file = self.load_pod_file(pod) + for svc_name, svc in parsed_pod_file.get(constants.services_key, {}).items(): + # Service-level annotations (backup-command / backup-file-extension) live in the + # comment block attached to the service mapping. 
+ for ann in _stack_annotations_for(svc): # small helper over svc.ca + if constants.backup_command_annotation in ann: + commands.setdefault(svc_name, {})["command"] = _annotation_value(ann) + elif constants.backup_file_extension_annotation in ann: + commands.setdefault(svc_name, {})["file_extension"] = _annotation_value(ann) + + # Volume-level annotation (backup-exclude) lives on the mount line, like ports do. + volumes_section = svc.get(constants.volumes_key, []) + for i, mount in enumerate(volumes_section): + comment = _line_comment(volumes_section, i) # mirrors ports_section.ca.items[i] + if comment and constants.stack_annotation_marker in comment \ + and constants.backup_exclude_annotation in comment: + exclude.append(str(mount).split(":")[0]) + return {"exclude": exclude, "commands": commands} +``` + +`_line_comment()` is the same `ports_section.ca.items[i][0].value` access used at `stack.py:213-215`, +factored out so it can be reused for the volumes list. + +## 2. Spec section + accessor — `deploy/spec.py` + +The parsed targets are written into a top-level `backup` section of the spec during `stack init` (alongside +where `http-proxy` targets are written, `deployment_create.py:285-326`). Add the read accessor next to +`get_http_proxy` (`spec.py:140`): + +```python +# deploy/spec.py +def get_backup(self): + return self.obj.get(constants.backup_key, {}) +``` + +Because `backup` is a plain dict it merges additively across mixed-in specs with no special handling, like +the other spec sections. + +## 3. Augment the backup service at deploy time — `deploy/deployment_create.py` + +This is the core of the pass, and it follows the ingress model exactly: the **backup service is defined in +the `backup-stack` repo** (`backup/composefile.yml`), and the deploy step only *augments* it — appending the +read-only data-volume mounts and setting the backup environment — just as ingress augments the existing +`nginx` service with `VIRTUAL_HOST_MULTIPORTS`. 
We do **not** fabricate the service in Python, so its +definition (image, Docker-socket mount, restic cache volume, entrypoint) lives in one place. + +The augment runs inside the **existing** per-pod service loop, right next to the `VIRTUAL_HOST_MULTIPORTS` +injection (`deployment_create.py:550-574`), firing only for the `backup` service when the master switch is +on. **Current state:** the `backup-stack` must be mixed in explicitly (an extra `--spec-file`), exactly as +the ingress stack is today; auto-including it when the switch is on is a planned refinement, not yet built. +The implemented form: + +```python +# deploy/deployment_create.py — within the `for service_name in services:` loop, +# alongside the VIRTUAL_HOST_MULTIPORTS block (~line 550) + +from stack.config.util import get_config_setting + +if get_config_setting("backup", False) and service_name == constants.backup_service_name: + backup_cfg = parsed_spec.get_backup() + exclude = set(backup_cfg.get("exclude", [])) + + # Append a read-only mount for every (non-excluded) data volume to the service the + # backup-stack composefile already defines. On Docker each named volume is a bind mount + # under /data// (`_fixup_pod_file`, line 77), so we mount those host + # paths directly and avoid re-declaring named volumes across compose files. + mounts = service_info.setdefault("volumes", []) + for v, vol_path in parsed_spec.get_volumes().items(): + if v in exclude or not vol_path: + continue + device = vol_path if Path(vol_path).is_absolute() else f".{vol_path}" # same path the volume binds to + mounts.append(f"{device}:/backup/{v}:ro") + + # Consistency-dump hooks: "service:command:ext;...". Executed inside the target container + # via the Docker socket the backup service already mounts (mirrors K8up's backupcommand). 
+ pre_hooks = [ + f"{svc}:{c['command']}:{c.get('file_extension', 'dump')}" + for svc, c in backup_cfg.get("commands", {}).items() + ] + + svc_env = service_info.get("environment", {}) + add_env_var("BACKUP_S3_ENDPOINT", get_config_setting("backup-s3-endpoint"), svc_env) + add_env_var("BACKUP_S3_BUCKET", get_config_setting("backup-s3-bucket"), svc_env) + add_env_var("BACKUP_SCHEDULE", get_config_setting("backup-schedule", "0 3 * * *"), svc_env) + add_env_var("BACKUP_RETENTION", + get_config_setting("backup-retention", + "--keep-daily 7 --keep-weekly 4 --keep-monthly 6"), svc_env) + add_env_var("BACKUP_PRE_HOOKS", ";".join(pre_hooks), svc_env) + service_info["environment"] = svc_env +``` + +`constants.backup_service_name` is `"backup"` — the service name in `backup-stack/backup/composefile.yml`. +The `restic-password` and S3 credentials are *not* set here: they arrive via the shared `config.env` +(`env_file`) already injected into every service at `deployment_create.py:538-548`, so no new secret path +is introduced. + +Notes: + +- **One mount per volume, `:ro`.** This matches the doc and lets restic see a clean per-volume tree. A + simpler variant is a single `../data:/backup:ro` mount with restic `--exclude` patterns; either works. +- **`restic-password` and the S3 credentials** arrive via the shared `config.env` (`env_file`), which is + how `get_config_setting` values already reach containers — no new secret-plumbing path is introduced. +- **Consistency hooks run via the Docker socket**, which the `backup` service mounts in + `backup-stack/backup/composefile.yml`. The hook runner `docker exec`s the dump command inside the target + service container (resolved by compose label), mirroring K8up's exec-into-the-pod model — so no database + clients need to be baked into the backup image. +- The backup service appears in `stack manage … ps` like any other container, which is what the + forthcoming `backup` subcommands will `exec` into. 
+ +## What this deliberately does *not* do yet + +- The `bozemanpass/backup` image exists as an **initial scaffold** in the `backup-stack` repo (its + `Containerfile`, `build.sh`, and `scripts/` for entrypoint, cron, hook runner, and restore) but has not + been run end-to-end and is not yet wired into a build. +- The deploy-time augment above is implemented, except that it mounts volumes `:rw` and omits `BACKUP_PRE_HOOKS`. +- It does not emit anything on the Kubernetes target; that path generates a K8up `Schedule` + annotations + and is a sibling to this function. +- It does not add the `stack manage … backup` subcommands. + +These are intentionally separable: this pass produces a deployment whose `backup` service *declares* its +backup intent (its `:ro` data mounts and env) in the generated compose, which is the foundation the +remaining pieces build on. diff --git a/docs/backup.md b/docs/backup.md new file mode 100644 index 0000000..88c1571 --- /dev/null +++ b/docs/backup.md @@ -0,0 +1,272 @@ +# Backup & Restore + +> **Status: design proposal — not yet implemented.** This document describes the intended design for +> backing up and restoring service data. It is written to be reviewed and refined before any code is +> written. Where it describes commands or behaviour, read those as *proposed*. + +Stacks keep their software components and configuration under revision control in git. Once a stack is +running, however, it accumulates **persistent data** in mounted volumes that git does not track. This +document describes how that data is backed up to object storage (S3) and restored from a previous epoch. + +## Design goal: "backup my stuff", and nothing more + +The feature is designed to be as transparent as possible. In the common case the person deploying a stack +provides **no backup-specific information at all** — not which volumes to back up, not where to send them, +not on what schedule. 
Backup is configured once at the environment/profile level and then applies to every +deployment automatically. + +This is achieved by deriving everything possible from what the stack tool already knows or can source +ambiently: + +- **What to back up** is derived from the deployment itself: *all* read-write named volumes are backed up + by default. The tool already enumerates them (`Spec.get_volumes()`). +- **Where to back up, with which credentials, on what schedule** is sourced from the environment using the + existing configuration precedence (see [Configuration](#configuration)). + +Annotations exist only to *refine* this default, and are written by the **author** of a stack, never by +the person deploying it. + +## Engine: restic + +All backups — on both targets — are stored as a [restic](https://restic.net) repository. restic is the +contract, not an implementation detail: + +- **Client-side encryption is mandatory** (AES-256). The payload is encrypted *before* upload, so the + object store never sees plaintext. This is what makes backing up to commodity object storage acceptable. +- **Content-addressed dedup + incremental snapshots.** A daily backup of a mostly-static volume costs + almost nothing. +- **Snapshots and retention policies** give point-in-time restore (your "previous epoch"). +- **Native S3 backend** (and any S3-compatible store: MinIO, Wasabi, etc.). + +Standardising on the restic repository format means a backup written on the Docker target is restorable on +the Kubernetes target and vice-versa, and that in a pinch an operator can restore with the bare `restic` +CLI from outside the deployment entirely. 
+ +The two targets differ only in *what runs restic and how it is scheduled*: + +| Concern | Docker | Kubernetes | +| ------------------ | ------------------------------------------------- | -------------------------------------------- | +| Engine / format | restic (off-the-shelf restic container image) | restic (via **K8up**) | +| Scheduling | cron in the backup container | K8up `Schedule` resource | +| Quiesce / hooks | pre/post hooks in the backup container | `k8up.io/backupcommand` pod annotation | +| Config generation | `stack` injects config at deploy time | `stack` emits K8up resources | +| Prerequisites | the auto-injected backup container | the `cluster` tool ensures K8up is present | +| Restore | start-stripped → restic restore → start full | K8up `Restore` into freshly-created PVCs | + +## Configuration + +Backup settings are resolved with the standard stack configuration precedence +(`config/util.py:get_config_setting`): **environment variable → active profile → built-in default**. This +is what lets backup be ambient — set it once in a profile and every deployment under that profile inherits +it, with no per-stack input. + +| Setting (profile key / `STACK_…` env var) | Purpose | Default | +| ------------------------------------------- | ----------------------------------------------- | -------------- | +| `backup` / `STACK_BACKUP` | Master switch — enable backup for deployments. | `false` | +| `backup-s3-endpoint` | Object store endpoint. | — | +| `backup-s3-bucket` | Bucket / repository location. | — | +| `aws-access-key-id`, `aws-secret-access-key`| Object store credentials. | — | +| `restic-password` | **Encryption key** (see warning below). | — | +| `backup-schedule` | Cron schedule. | `0 3 * * *` | +| `backup-retention` | `forget`/`prune` policy. 
| `--keep-daily 7 --keep-weekly 4 --keep-monthly 6` | + +Typical one-time setup for an environment: + +```bash +$ stack config set backup true +$ stack config set backup-s3-endpoint s3.us-west-2.amazonaws.com +$ stack config set backup-s3-bucket my-stack-backups +$ stack config set aws-access-key-id AKIA... +$ stack config set aws-secret-access-key ... +$ stack config set restic-password ... +``` + +After that, **every** deployment is backed up with no further action: + +```bash +$ stack deploy --spec-file ~/specs/todo.yml --deployment-dir ~/deployments/todo +$ stack manage --dir ~/deployments/todo start +# ...the deployment's volumes are now backed up on the configured schedule. +``` + +> #### ⚠ The encryption key cannot be purely ephemeral +> +> restic cannot decrypt a repository without its password. If `restic-password` is auto-generated and +> lives *only* in an environment that is later lost, the backups become **permanently unrecoverable** — an +> encrypted bucket that can never be read. The password must therefore either be set explicitly by the +> operator, or be auto-generated **and persisted and surfaced for the operator to escrow**. This is the one +> piece of backup configuration that must not be treated as disposable ambient state. Object-store +> credentials, by contrast, can be rotated freely. + +## Volume selection (automatic) + +By default **all read-write named volumes** in a deployment are backed up, file-level. Read-only mounts and +config maps are skipped. No annotation or flag is required for this — it is derived entirely from the +merged spec. 
+ +The only optional refinement is to *exclude* a volume that is a cache, scratch space, or otherwise cheaply +reconstructable, using an annotation in the stack's `composefile.yml`: + +```yaml +services: + backend: + image: bozemanpass/todo-backend:stack + volumes: + - "uploads:/app/uploads" # backed up by default + - "cache:/app/cache" # @stack backup-exclude +``` + +Excluding a volume is an **author** decision encoded in the component, not something the deployer supplies. + +## Application consistency + +This is the one place where "just back up everything" needs care, and it is worth stating plainly: the +ingress analogy is misleading because ingress is *stateless* whereas backup is *deeply stateful*. A +file-level copy of a **live database's** data directory, read file-by-file while the database writes, can +produce a torn, unrestorable snapshot. + +For such services the stack author adds a single annotation specifying a logical dump command, whose stdout +is captured into the backup instead of (or alongside) the raw files: + +```yaml +services: + db: + image: bozemanpass/todo-db:stack + volumes: + - "pgdata:/var/lib/postgresql/data" # @stack backup-exclude + # @stack backup-command pg_dump -U postgres -d todos + # @stack backup-file-extension sql +``` + +Crucially this is **author-time** metadata: whoever packages the database component writes it once, and +every deployer of that stack gets consistent backups for free, having supplied nothing. The annotation maps +one-to-one onto K8up's `k8up.io/backupcommand` / `k8up.io/file-extension` pod annotations on the Kubernetes +target, and onto a pre-backup hook in the restic container on the Docker target. 
+ +### Annotation summary + +There are only three optional annotations, all author-time, all with safe "just back it up" defaults: + +| Annotation | Applies to | Meaning | +| ----------------------------------- | -------------- | ----------------------------------------------------------------------- | +| `@stack backup-exclude` | a volume mount | Do not include this volume in the file-level backup. | +| `@stack backup-command <cmd>` | a service | Capture `<cmd>`'s stdout into the backup (e.g. a consistent DB dump). | +| `@stack backup-file-extension <ext>`| a service | Name the captured `backup-command` output with this extension. | + +A stack that uses none of these is still fully backed up — every read-write volume, file-level. + +## How Docker backup works + +Like ingress, the stack tool does **not** write the backup engine's job logic by hand. It generates +configuration for an existing restic container image and lets that image do the work. The backup container +cannot be a purely *static* mix-in, because at the time its image is built it does not know which volumes +the application stack will contribute — and naming them statically would collide with the merge step, which +requires unique volume names across mixed-in specs. + +Instead, when the `backup` master switch is enabled, the backup container is **injected at deploy time**, +precisely parallel to the way `deploy` injects `VIRTUAL_HOST_MULTIPORTS` environment variables into +matching services for ingress: + +1. **Annotations to spec** — during `stack init`, any `@stack backup-*` annotations are parsed out of + `composefile.yml` into a `backup` section of the output spec. + +2. 
**Spec to backup container** — during `stack deploy`, the tool reads the merged deployment's volumes + (`Spec.get_volumes()`) and: + - mounts every non-excluded read-write volume **read-only** into the backup container + (`- <host-path>:/backup/<volume>:ro`), and + - injects the ambiently-resolved schedule, retention, object-store, and per-service `backup-command` + hook settings as environment / config for the container. + +3. **The backup container runs restic on a schedule** — on each cron tick it runs any configured + pre-backup hooks (the logical dumps) and then `restic backup` of the mounted volume tree, applying the + retention policy. + +Because every named volume on the Docker target is already realised as a bind mount under +`<deployment-dir>/data/<volume>/`, the set of paths to back up is fully deterministic. + +## Kubernetes + +On Kubernetes the work is delegated to [K8up](https://k8up.io), a restic-based backup operator. This +follows the same assume-present contract the stack tool already uses for `ingress-nginx` and `cert-manager` +(it references an `ingress_class_name="nginx"` and a `cert-manager.io/cluster-issuer` it does not install): +`stack` does not install K8up; it only emits resources that *reference* it. + +- During `stack deploy`, the tool emits a K8up `Schedule` for the deployment's namespace (one namespace per + deployment), so all of its PVCs are backed up, together with `k8up.io/backupcommand` / + `k8up.io/file-extension` annotations derived from the `@stack backup-*` annotations. +- K8up writes a **standard restic repository** to the same object store, with the same encryption — so the + repositories are interchangeable with those produced on the Docker target. + +K8up itself is provisioned by the **`cluster`** tool (the batteries-included checker/fixer for required +cluster components), exactly as `cluster` is responsible for `cert-manager` and `ingress-nginx`. 
A +backup-enabled deployment fails recognisably — the same way an ingress deployment fails today when +`cert-manager` is absent — if K8up has not been provisioned. The readiness probe is concrete: K8up's CRDs +registered and its operator `Deployment` healthy. + +> Because the deployment uses a single node (or node affinity to co-locate data), K8up's backup `Job` can +> mount the `ReadWriteOnce` PVCs alongside the running application pod. Cross-node volume access is +> explicitly out of scope. + +## Restore + +Restore is deliberately modelled as a distinct mode rather than a live operation, which neatly sidesteps the +problem of two consumers mounting the same volume at once. At restore time **nothing else holds the +volumes**, so even `ReadWriteOnce` PVCs can be attached by the restore job. + +The flow: + +1. The full stack is stopped (if running). +2. The volumes are (re)created empty. +3. A **backup-only** variant of the deployment is brought up, running a restore of the chosen snapshot: + - **Docker:** bring up only the backup container with a restore command, restoring into the now-empty + bind-mounted volumes. + - **Kubernetes:** create a K8up `Restore` resource targeting the freshly-created PVCs. +4. The backup-only variant is torn down once the restore completes. +5. The full stack is started; its volumes now contain the restored data. + +This is driven by a single command (see below) so the operator does not orchestrate the steps by hand. + +## Command structure + +Backup *configuration* is ambient (above) and backup runs automatically. The commands below are for +*operating on* an existing deployment — inspecting, triggering off-schedule, and restoring — so they live +under `stack manage --dir <deployment-dir>`, alongside `start`, `stop`, `ps`, and `logs`. They are **not** under +`stack deploy`, which only *creates* a deployment and exits. 
+ +``` +stack manage --dir <deployment-dir> backup <subcommand> +``` + +| Command | Description | +| -------------------------------------------------------------- | ------------------------------------------------------------------ | +| `stack manage --dir <deployment-dir> backup now` | Run a backup immediately, outside the schedule. | +| `stack manage --dir <deployment-dir> backup status` | Show the result of the last run and repository health. | +| `stack manage --dir <deployment-dir> backup list` | List snapshots (the available epochs) with timestamps and tags. | +| `stack manage --dir <deployment-dir> backup restore [--snapshot <id>]` | Orchestrate the full stop → restore → restart flow. Defaults to the latest snapshot; `--volume <name>` restores a single volume; `--and-start` restarts the full stack on completion. | +| `stack manage --dir <deployment-dir> backup prune` | Apply the retention policy (`forget` + `prune`). | +| `stack manage --dir <deployment-dir> backup check` | Verify repository integrity. | + +`backup` is a Click sub-group of the existing `manage` group, so it inherits `--dir` and the deployment +context. Internally each subcommand dispatches to the active target: on Docker it `exec`s restic in the +backup container (or runs a one-off restore deployment); on Kubernetes it creates/reads the corresponding +K8up resources. + +> The earlier sketch of `stack deploy backup status` is intentionally **not** the chosen shape: `deploy` +> creates a deployment from specs and exits, whereas backup status/restore/list are operations *on an +> existing* deployment — which is precisely what `manage` is for. + +## Open questions + +- **Auto-enable vs explicit switch.** This design gates backup on an explicit `backup` master switch + (profile setting / `STACK_BACKUP`) so behaviour is predictable. The most transparent alternative — + enabling backup automatically whenever a destination is configured — is rejected as too implicit, but is + worth revisiting. 
+- **Encryption-key escrow.** The concrete mechanism for persisting and surfacing an auto-generated + `restic-password` so it cannot be silently lost (see the warning above). +- **Off-the-shelf Docker image.** Candidates that preserve the restic-repo format include `mazzolino/restic` + and `lobaro/restic-backup-docker`. (`offen/docker-volume-backup` is feature-rich but defaults to + tar+GPG, which would break cross-target compatibility.) +- **Monitoring.** Surfacing backup success/failure (a healthcheck or status that `backup status` can read) + so that a silently failing backup is not mistaken for a working one. +- **New repository.** The Docker backup container lives in its own repo (e.g. `bozemanpass/backup-stack`), + mirroring `bozemanpass/docker-ingress-stack`. diff --git a/src/stack/constants.py b/src/stack/constants.py index 8e01520..31188f3 100644 --- a/src/stack/constants.py +++ b/src/stack/constants.py @@ -30,6 +30,11 @@ container_file_name = "container.yml" cluster_issuer_key = "cluster-issuer" deploy_to_key = "deploy-to" +backup_key = "backup" +backup_service_name = "backup" +backup_exclude_annotation = "backup-exclude" +backup_command_annotation = "backup-command" +backup_file_extension_annotation = "backup-file-extension" deployment_file_name = "deployment.yml" host_name_key = "host-name" http_proxy_key = "http-proxy" diff --git a/src/stack/deploy/deployment_create.py b/src/stack/deploy/deployment_create.py index 2500a3b..5544905 100644 --- a/src/stack/deploy/deployment_create.py +++ b/src/stack/deploy/deployment_create.py @@ -27,6 +27,7 @@ from stack import constants from stack.deploy.compose.helpers import add_env_var +from stack.config.util import get_config_setting from stack.log import log_debug, log_warn, log_info from stack.util import ( get_stack_path, @@ -271,6 +272,7 @@ def init_operation( # noqa: C901 http_proxy_targets, output, map_ports_to_host, + backup_targets=None, ): spec_file_content = {"stack": stack, constants.deploy_to_key: 
deployer_type} if deployer_type in ["k8s", "k8s-kind"]: @@ -325,6 +327,10 @@ def init_operation( # noqa: C901 spec_file_content[constants.network_key] = {} spec_file_content[constants.network_key].update({constants.http_proxy_key: [http_proxy]}) + # Record backup annotations (e.g. excluded volumes) parsed from the stack's composefiles. + if backup_targets and (backup_targets.get("exclude") or backup_targets.get("commands")): + spec_file_content[constants.backup_key] = backup_targets + # Implement merge, since update() overwrites if config_variables: orig_config = spec_file_content.get("config", {}) @@ -573,6 +579,33 @@ def create_operation(deployment_command_context, parsed_spec: Spec | MergedSpec, add_env_var("LETSENCRYPT_HOST", host, svc_env) service_info["environment"] = svc_env + # When backup is enabled, augment the backup service (defined in the mixed-in + # backup-stack) with read-only mounts of the deployment's data volumes plus the + # backup engine configuration. Mirrors the VIRTUAL_HOST injection above. + # See docs/backup-implementation.md. + if get_config_setting("backup", False) and service_name == constants.backup_service_name: + backup_cfg = parsed_spec.get_backup() + exclude = set(backup_cfg.get("exclude", [])) + mounts = service_info.setdefault("volumes", []) + for vol_name, vol_path in parsed_spec.get_volumes().items(): + if vol_name in exclude or not vol_path: + continue + # Same host path the named volume binds to (see _fixup_pod_file). + # Mounted rw so the same container can restore in place; scheduled + # backups only read. See docs/backup.md "Restore". 
+ device = vol_path if Path(vol_path).is_absolute() else f".{vol_path}" + mounts.append(f"{device}:/backup/{vol_name}:rw") + backup_env = service_info.get("environment", {}) + add_env_var("BACKUP_S3_ENDPOINT", get_config_setting("backup-s3-endpoint", ""), backup_env) + add_env_var("BACKUP_S3_BUCKET", get_config_setting("backup-s3-bucket", ""), backup_env) + add_env_var("BACKUP_SCHEDULE", get_config_setting("backup-schedule", "0 3 * * *"), backup_env) + add_env_var( + "BACKUP_RETENTION", + get_config_setting("backup-retention", "--keep-daily 7 --keep-weekly 4 --keep-monthly 6"), + backup_env, + ) + service_info["environment"] = backup_env + with open(destination_compose_dir.joinpath(f"{constants.compose_file_prefix}-%s.yml" % pod), "w") as output_file: yaml.dump(parsed_pod_file, output_file) diff --git a/src/stack/deploy/spec.py b/src/stack/deploy/spec.py index 96d6e4b..6f305d6 100644 --- a/src/stack/deploy/spec.py +++ b/src/stack/deploy/spec.py @@ -140,6 +140,9 @@ def get_volume_resources(self, volume_name): def get_http_proxy(self): return self.obj.get(constants.network_key, {}).get(constants.http_proxy_key, []) + def get_backup(self): + return self.obj.get(constants.backup_key, {}) + def _clear_http_proxy(self): if constants.network_key in self.obj: if constants.http_proxy_key in self.obj[constants.network_key]: diff --git a/src/stack/deploy/stack.py b/src/stack/deploy/stack.py index 278e5fa..4ea83fa 100644 --- a/src/stack/deploy/stack.py +++ b/src/stack/deploy/stack.py @@ -191,6 +191,38 @@ def get_volumes(self): volumes[svc_name] = svc[constants.volumes_key] return volumes + def get_backup_targets(self): + """Parse @stack backup-* annotations from the stack's composefiles. + + Currently supports `backup-exclude` on a volume mount line, parsed the same way as the + http-proxy port annotations (see get_http_proxy_targets). 
Returns: + + {"exclude": [volume_name, ...], "commands": {}} + + `commands` (per-service consistency dumps from `backup-command`) is reserved for a + follow-up; see docs/backup-implementation.md. + """ + exclude = [] + for pod in self.get_pod_list(): + parsed_pod_file = self.load_pod_file(pod) + if constants.services_key not in parsed_pod_file: + continue + for svc_name, svc in parsed_pod_file[constants.services_key].items(): + if constants.volumes_key not in svc: + continue + volumes_section = svc[constants.volumes_key] + for i, mount in enumerate(volumes_section): + item_comments = volumes_section.ca.items.get(i) + if item_comments and item_comments[0]: + # Only the end-of-line comment (first line of the token) counts. + # ruamel attaches trailing block comments (e.g. a comment that heads + # the next service) to the preceding item; those must be ignored. + comment = item_comments[0].value.split("\n", 1)[0].strip() + if constants.stack_annotation_marker in comment \ + and constants.backup_exclude_annotation in comment: + exclude.append(str(mount).split(":")[0]) + return {"exclude": exclude, "commands": {}} + def get_http_proxy_targets(self, prefix=None): if prefix: if prefix == "/": diff --git a/src/stack/init/init.py b/src/stack/init/init.py index 5b06d2f..bc281a9 100644 --- a/src/stack/init/init.py +++ b/src/stack/init/init.py @@ -189,6 +189,7 @@ def http_prefix_for(stack): inner_stack_config = get_parsed_stack_config(stack) http_proxy_targets = inner_stack_config.get_http_proxy_targets(http_prefix) + backup_targets = inner_stack_config.get_backup_targets() if i == len(required_stacks) - 1: http_proxy_targets.extend(http_proxy_target) @@ -209,6 +210,7 @@ def http_prefix_for(stack): http_proxy_targets, None, map_ports_to_host, + backup_targets, ) specs.append(spec) diff --git a/tests/backup/run-test.sh b/tests/backup/run-test.sh new file mode 100755 index 0000000..f9042f8 --- /dev/null +++ b/tests/backup/run-test.sh @@ -0,0 +1,176 @@ +#!/usr/bin/env bash +# 
#!/usr/bin/env bash
# End-to-end test of the backup/restore feature on the Docker (compose) target.
#
# Flow: deploy an app (holding data in a volume) + a local S3 store (SeaweedFS) + the
# mixed-in backup stack -> write a known payload -> take a restic backup -> wipe the data
# -> restore from the backup -> assert the payload came back (read via the app). Also
# relies on the s3 store's own volume being excluded from backup (@stack backup-exclude)
# so it is not captured.
#
# Requires Docker. Run from the repo root, either:
#   ./tests/backup/run-test.sh            # uses the built shiv package in ./package
#   ./tests/backup/run-test.sh from-path  # uses `stack` from PATH (dev mode)
#
# NOTE: this fetches the test stacks and the backup stack from GitHub, so the
# `test-backup-stack` additions in bozemanpass/stack-test-stacks and the
# bozemanpass/backup-stack repo must be pushed for this to run.
set -e
if [ -n "$STACK_SCRIPT_DEBUG" ]; then
    set -x
fi

if ! command -v docker &> /dev/null; then
    echo "Error: 'docker' is not installed or not available on the PATH"
    exit 1
fi

if [ "$1" == "from-path" ]; then
    TEST_TARGET_STACK="stack"
else
    TEST_TARGET_STACK=$( ls -t1 ./package/stack* | head -1 )
fi
# The pipeline above succeeds even when no package exists (its status is that of
# `head`), which would leave the variable empty and turn every later
# `$TEST_TARGET_STACK <subcommand>` into a call of the bare subcommand name.
if [ -z "$TEST_TARGET_STACK" ]; then
    echo "Error: no stack package found in ./package (build it first, or pass 'from-path')"
    exit 1
fi

app_stack="test-backup-stack"
backup_stack="backup"
deployment_dir_name="${app_stack}-deployment"
app_spec="${app_stack}-spec.yml"
backup_spec="${backup_stack}-spec.yml"

# Ambient backup configuration (sourced from the environment by the stack tool).
export STACK_BACKUP=true
export STACK_BACKUP_S3_ENDPOINT=http://s3:8333
export STACK_BACKUP_S3_BUCKET=stack-backups

payload="backup-test-payload-$$"  # a value unique to this run

# Non-empty while the deployment is (or may be) running; see on_exit below.
deployment_started=

# Run a command inside a deployment container. The stack `exec` wraps the command in
# `sh -c`, so the whole command must be passed as a single argument.
#   $1 - service name, $2 - command line
dexec () { $TEST_TARGET_STACK manage --dir "$test_deployment_dir" exec "$1" "$2"; }

# Containers write into the bind-mounted volume dirs as root, so the resulting files cannot
# be removed by the (non-root) host user. Remove such a dir via a throwaway container.
#   $1 - directory to remove (a missing directory is ignored)
force_rm () {
    if [ -d "$1" ]; then
        docker run --rm -v "$(dirname "$1")":/w alpine rm -rf "/w/$(basename "$1")" || rm -rf "$1"
    fi
}

# Explicit failure path: tear the deployment down (best effort) and fail the test.
cleanup_exit () {
    deployment_started=  # the EXIT trap must not stop the deployment a second time
    $TEST_TARGET_STACK manage --dir "$test_deployment_dir" stop --delete-volumes || true
    exit 1
}

# EXIT trap. With `set -e` any unchecked failing step (payload write, restore, `ps`, ...)
# ends the script on the spot; without this the deployment and its root-owned volumes
# would be left behind. The script's original exit status is preserved.
on_exit () {
    local status=$?
    if [ "$status" -ne 0 ] && [ -n "$deployment_started" ]; then
        echo "Test aborted (exit status ${status}): stopping deployment"
        $TEST_TARGET_STACK manage --dir "$test_deployment_dir" stop --delete-volumes || true
    fi
}
trap on_exit EXIT

# Poll `ps` until it reports a container id, giving up after 50 attempts 5 seconds apart.
wait_for_pods_started () {
    local i ps_output
    for i in {1..50}; do
        ps_output=$( $TEST_TARGET_STACK manage --dir "$test_deployment_dir" ps )
        if [[ "$ps_output" == *"id:"* ]]; then
            return
        fi
        sleep 5
    done
    echo "waiting for pods to start: FAILED"
    cleanup_exit
}

STACK_TEST_DIR=~/stack-test/backup-test-dir
export STACK_REPO_BASE_DIR=${STACK_TEST_DIR}/repo-base-dir
echo "Testing this package: $TEST_TARGET_STACK"
$TEST_TARGET_STACK version
echo "Using test directory: $STACK_TEST_DIR"
force_rm "$STACK_TEST_DIR"
mkdir -p "$STACK_REPO_BASE_DIR"

# Force a rebuild of the backup image so the test exercises current sources.
existing=$(docker image ls -q --filter=reference=bozemanpass/backup | uniq)
# ${existing} is deliberately unquoted: it may hold several image ids, one per word.
if [ -n "$existing" ]; then docker image rm -f ${existing} || true; fi

# Fetch and prepare the stacks.
$TEST_TARGET_STACK fetch repo github.com/bozemanpass/stack-test-stacks
$TEST_TARGET_STACK fetch repo github.com/bozemanpass/backup-stack
$TEST_TARGET_STACK prepare --stack ${app_stack}
$TEST_TARGET_STACK prepare --stack ${backup_stack}

test_deployment_dir=$STACK_TEST_DIR/${deployment_dir_name}
test_app_spec=$STACK_TEST_DIR/${app_spec}
test_backup_spec=$STACK_TEST_DIR/${backup_spec}

# Init the app stack (Docker target - no --deploy-to k8s-kind).
$TEST_TARGET_STACK init --stack ${app_stack} --output "$test_app_spec"

# Init the backup stack. The restic password + S3 credentials are passed as config so they
# reach the backup container via the shared config.env (SeaweedFS ignores the creds but
# restic requires them to be set).
$TEST_TARGET_STACK init --stack ${backup_stack} --output "$test_backup_spec" \
    --config RESTIC_PASSWORD=test-restic-password \
    --config AWS_ACCESS_KEY_ID=test-access-key \
    --config AWS_SECRET_ACCESS_KEY=test-secret-key

# Deploy, mixing in the backup stack.
$TEST_TARGET_STACK deploy \
    --spec-file "$test_backup_spec" \
    --spec-file "$test_app_spec" \
    --deployment-dir "$test_deployment_dir"
if [ ! -d "$test_deployment_dir" ]; then
    echo "deploy create test: deployment directory not present"
    echo "deploy create test: FAILED"
    exit 1
fi
echo "deploy create test: passed"

# Mark the deployment as live before starting it, so that a partially failed start is
# still torn down by the EXIT trap.
deployment_started=1
$TEST_TARGET_STACK manage --dir "$test_deployment_dir" start
wait_for_pods_started

# 1. Write a known payload into the app's data volume (via the app).
dexec app "echo ${payload} > /data/payload.txt"
echo "wrote payload: ${payload}"

# 2. Take a backup, retrying until the S3 store has finished starting up. backup.sh creates
#    the restic repository on first use (restic auto-creates the bucket on SeaweedFS).
backed_up=
for i in {1..50}; do
    if dexec backup "/scripts/backup.sh"; then backed_up=1; break; fi
    echo "waiting for backup to succeed (s3 warming up): ${i}"
    sleep 5
done
if [ -z "$backed_up" ]; then
    echo "Backup test: FAILED"
    cleanup_exit
fi
echo "Backup test: passed"

# 3. Simulate data loss by wiping the app volume (through the backup container's rw mount).
dexec backup "rm -rf /backup/app-data/*"
gone=$( dexec backup "ls /backup/app-data" || true )
if [[ "$gone" == *"payload.txt"* ]]; then
    echo "Simulate data loss: FAILED (payload still present)"
    cleanup_exit
fi
echo "Simulate data loss: passed (payload gone)"

# 4. Restore from the latest snapshot.
dexec backup "/scripts/restore.sh latest"

# 5. Assert the payload came back, reading it through the app.
restored=$( dexec app "cat /data/payload.txt" || true )
if [[ "$restored" == *"$payload"* ]]; then
    echo "Restore content test: passed"
else
    echo "Restore content test: FAILED (expected '${payload}', got '${restored}')"
    cleanup_exit
fi

# 6. Assert the excluded s3 store volume was NOT mounted into / captured by the backup.
listing=$( dexec backup "ls /backup" || true )
if [[ "$listing" == *"s3-data"* ]]; then
    echo "Exclude annotation test: FAILED (s3-data was backed up)"
    cleanup_exit
fi
if [[ "$listing" != *"app-data"* ]]; then
    echo "Exclude annotation test: FAILED (app-data missing from backup)"
    cleanup_exit
fi
echo "Exclude annotation test: passed (s3-data excluded, app-data backed up)"

$TEST_TARGET_STACK manage --dir "$test_deployment_dir" stop --delete-volumes
deployment_started=
echo "Test passed"