diff --git a/CHANGELOG.md b/CHANGELOG.md index b190553fd..f7847dea0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Change Log +## [Unreleased] + +### Added + +- `run-task` now accepts an opt-in `--{project}-checkout-bundle=<path>` option to write a `.tar.zst` archive of a checkout (excluding `.git`/`.hg`) after checkout, so a decision task can publish its checkout as an artifact consumable downstream via `fetches:` + ## [24.1.0] - 2026-06-04 ### Added diff --git a/docs/howto/use-fetches.rst b/docs/howto/use-fetches.rst index d7e727b83..3a377573c 100644 --- a/docs/howto/use-fetches.rst +++ b/docs/howto/use-fetches.rst @@ -105,3 +105,47 @@ There are a few differences from the earlier ``build`` examples here: It is not possible to configure the ``dest`` or ``extract`` values when using ``fetch`` or ``toolchain`` kinds. + +Publishing a Checkout as a Fetch +-------------------------------- + +Re-cloning a large repository in every task can be slow. To avoid this, a task +that already performs a checkout (such as the decision task) can publish a +compressed snapshot of its checkout as an artifact, which downstream tasks then +consume via the ``fetches`` mechanism described above. + +The ``run-task`` script accepts an opt-in ``--{project}-checkout-bundle=<path>`` +option (one per repository, mirroring ``--{project}-checkout=<path>``). When +set, ``run-task`` writes a ``.tar.zst`` archive of the checkout to ``<path>`` +immediately after the checkout completes. The archive contains only the +working-tree files: the ``.git`` and ``.hg`` metadata directories are excluded, +so no history is included. The checkout directory's basename is used as the +archive's top-level prefix. If archiving fails, ``run-task`` exits non-zero +rather than producing a broken artifact. + +For example, point the bundle at a path inside the directory the worker exposes +as artifacts (commonly ``UPLOAD_DIR``). 
The producing task must also declare a +matching artifact in its payload; writing the file alone does not publish it: + +.. code-block:: text + + run-task --vcs-checkout=/builds/worker/checkouts/src \ + --vcs-checkout-bundle=$UPLOAD_DIR/src.tar.zst -- ... + +Because the artifact is a ``.tar.zst`` (the same format produced by +``fetch-content``'s ``git-checkout-archive``), a downstream task can fetch and +extract it like any other artifact. As with the ``build.zip`` example above, the +artifact path is relative to the producing task's artifact prefix (which +defaults to ``public/build``): + +.. code-block:: yaml + + test: + dependencies: + decision: decision + fetches: + decision: + - artifact: src.tar.zst + extract: true + +The checkout will be extracted into the ``$TASK_WORKDIR/fetches`` directory. diff --git a/src/taskgraph/run-task/run-task b/src/taskgraph/run-task/run-task index deca3a691..4a0bc9754 100755 --- a/src/taskgraph/run-task/run-task +++ b/src/taskgraph/run-task/run-task @@ -916,11 +916,17 @@ def add_vcs_arguments(parser, project, name): action="store_true", help=f"Use shallow clone for {name}", ) + parser.add_argument( + f"--{project}-checkout-bundle", + help=f"If set, write a .tar.zst archive of the {name} checkout " + f"(excluding .git/.hg) to this path after checkout", + ) def collect_vcs_options(args, project, name): checkout = getattr(args, f"{project}_checkout") shallow_clone = getattr(args, f"{project}_shallow_clone") + checkout_bundle = getattr(args, f"{project}_checkout_bundle") env_prefix = project.upper() @@ -938,6 +944,8 @@ def collect_vcs_options(args, project, name): # Expand ~ in some paths. 
if checkout: checkout = os.path.abspath(os.path.expanduser(checkout)) + if checkout_bundle: + checkout_bundle = os.path.abspath(os.path.expanduser(checkout_bundle)) if store_path: store_path = os.path.abspath(os.path.expanduser(store_path)) @@ -956,6 +964,7 @@ def collect_vcs_options(args, project, name): "name": name, "env-prefix": env_prefix, "checkout": checkout, + "checkout-bundle": checkout_bundle, "base-repo": base_repo, "base-rev": base_rev, "head-repo": head_repo, @@ -1034,6 +1043,79 @@ def vcs_checkout_from_args(options): os.environ["{}_HEAD_REV".format(options["env-prefix"])] = revision +def create_checkout_bundle(checkout_dir, dest_path): + """Write a compressed archive of a checkout's working tree. + + Produces a ``.tar.zst`` archive of the files in ``checkout_dir``, excluding + the ``.git`` and ``.hg`` vcs metadata directories (i.e. no history). The + basename of ``checkout_dir`` is used as the archive's top-level prefix. + + This matches the ``.tar.zst`` format produced by fetch-content's + ``git_checkout_archive``, so the resulting artifact is consumable downstream + via the standard ``fetches:`` mechanism. Raises on failure so a broken + artifact is never left behind. + """ + import zstandard # noqa: PLC0415 + + checkout_dir = Path(checkout_dir) + dest_path = Path(dest_path) + + if dest_path.suffixes[-2:] != [".tar", ".zst"]: + raise ValueError( + f"checkout bundle destination must end in .tar.zst, got: {dest_path}" + ) + + dest_path.parent.mkdir(parents=True, exist_ok=True) + + prefix = checkout_dir.name + print_line( + b"bundle", + b"creating checkout bundle of %s at %s\n" + % (str(checkout_dir).encode("utf-8"), str(dest_path).encode("utf-8")), + ) + + # Archive the checkout from its parent so the top-level entry is the + # checkout's basename, excluding vcs metadata so no history is included. 
+ proc = subprocess.Popen( + [ + "tar", + "cf", + "-", + "--exclude=.git", + "--exclude=.hg", + "-C", + str(checkout_dir.parent), + prefix, + ], + stdout=subprocess.PIPE, + ) + + # Write to a temporary path and rename on success so a partial or broken + # archive is never left at dest_path. + tmp_path = dest_path.with_name(f"{dest_path.name}.tmp") + try: + with tmp_path.open("wb") as out: + zstandard.ZstdCompressor().copy_stream(proc.stdout, out) + except BaseException: + proc.kill() + proc.wait() + tmp_path.unlink(missing_ok=True) + raise + finally: + proc.stdout.close() + + ret = proc.wait() + if ret != 0: + tmp_path.unlink(missing_ok=True) + raise RuntimeError(f"tar exited with code {ret} while creating checkout bundle") + + tmp_path.rename(dest_path) + print_line( + b"bundle", + b"finished creating checkout bundle at %s\n" % str(dest_path).encode("utf-8"), + ) + + def install_pip_requirements(repositories): """Install pip requirements files from specified repositories, if necessary.""" requirements = [ @@ -1317,6 +1399,13 @@ def main(args): for repo in repositories: vcs_checkout_from_args(repo) + # Optionally publish each checkout as a compressed bundle so a decision + # task can expose it as an artifact for downstream tasks to fetch instead + # of re-cloning. Runs as the worker user, before the task command. + for repo in repositories: + if repo.get("checkout") and repo.get("checkout-bundle"): + create_checkout_bundle(repo["checkout"], repo["checkout-bundle"]) + # Convert certain well known environment variables to absolute paths. 
for k in [ "CARGO_HOME", diff --git a/test/test_scripts_run_task.py b/test/test_scripts_run_task.py index b2393721c..52a904634 100644 --- a/test/test_scripts_run_task.py +++ b/test/test_scripts_run_task.py @@ -1,9 +1,11 @@ import functools +import io import os import site import stat import subprocess import sys +import tarfile import tempfile from argparse import Namespace from importlib.machinery import SourceFileLoader @@ -11,6 +13,7 @@ from unittest.mock import Mock import pytest +import zstandard import taskgraph from taskgraph.util.caches import CACHES @@ -179,6 +182,16 @@ def test_install_pip_requirements_with_uv( {"shallow-clone": True}, id="git_with_shallow_clone", ), + pytest.param( + {"myrepo_checkout_bundle": "checkout.tar.zst"}, + { + "REPOSITORY_TYPE": "git", + "HEAD_REPOSITORY": "https://github.com/test/repo.git", + "HEAD_REV": "abc123", + }, + {}, + id="git_with_checkout_bundle", + ), ], ) def test_collect_vcs_options( @@ -197,6 +210,7 @@ def test_collect_vcs_options( args.setdefault(f"{name}_checkout", checkout) args.setdefault(f"{name}_shallow_clone", False) + args.setdefault(f"{name}_checkout_bundle", None) args = Namespace(**args) result = run_task_mod.collect_vcs_options(args, name, name) @@ -205,6 +219,7 @@ def test_collect_vcs_options( "base-repo": env.get("BASE_REPOSITORY"), "base-rev": env.get("BASE_REV"), "checkout": os.path.join(os.getcwd(), "checkout"), + "checkout-bundle": None, "env-prefix": name.upper(), "head-repo": env.get("HEAD_REPOSITORY"), "name": name, @@ -222,10 +237,80 @@ def test_collect_vcs_options( expected["checkout"], env.get("PIP_REQUIREMENTS") ) + bundle = getattr(args, f"{name}_checkout_bundle") + if bundle: + # Derived independently (like the ``checkout`` expectation above) so the + # abspath handling in collect_vcs_options is load-bearing, not mirrored. 
+ expected["checkout-bundle"] = os.path.join(os.getcwd(), bundle) + expected.update(extra_expected) assert result == expected +def test_create_checkout_bundle(run_task_mod, tmp_path): + # Build a fake checkout containing vcs metadata (including a nested .git, as + # produced by submodules/sub-checkouts) plus real working files. + checkout = tmp_path / "myco" + (checkout / ".git").mkdir(parents=True) + (checkout / ".git" / "config").write_text("[core]\n") + (checkout / ".hg").mkdir() + (checkout / ".hg" / "hgrc").write_text("[paths]\n") + (checkout / "sub" / ".git").mkdir(parents=True) + (checkout / "sub" / ".git" / "config").write_text("[core]\n") + (checkout / "file1.txt").write_text("hello") + (checkout / "sub" / "file2.txt").write_text("world") + (checkout / ".gitignore").write_text("*.pyc\n") + + dest = tmp_path / "out.tar.zst" + run_task_mod.create_checkout_bundle(str(checkout), str(dest)) + + assert dest.exists() + # No leftover temporary file. + assert not dest.with_name(f"{dest.name}.tmp").exists() + + # The archive should be valid zstd + tar. Decompress fully so it can be read + # in random-access mode (to also verify file contents survive intact). + dctx = zstandard.ZstdDecompressor() + with dest.open("rb") as fh: + tar_bytes = dctx.stream_reader(fh).read() + with tarfile.open(fileobj=io.BytesIO(tar_bytes), mode="r:") as tf: + members = tf.getnames() + assert tf.extractfile("myco/file1.txt").read() == b"hello" + assert tf.extractfile("myco/sub/file2.txt").read() == b"world" + + # Regular files are present under the checkout's basename prefix. + assert "myco/file1.txt" in members + assert "myco/sub/file2.txt" in members + # A dotfile that merely starts with ".git" is not excluded. + assert "myco/.gitignore" in members + # vcs metadata directories are excluded at any depth (no history). 
+ assert not any(".git" in m.split("/") or ".hg" in m.split("/") for m in members), ( + members + ) + + +def test_create_checkout_bundle_rejects_bad_suffix(run_task_mod, tmp_path): + checkout = tmp_path / "myco" + checkout.mkdir() + (checkout / "file1.txt").write_text("hello") + + with pytest.raises(ValueError): + run_task_mod.create_checkout_bundle(str(checkout), str(tmp_path / "out.tar.gz")) + + +def test_create_checkout_bundle_fails_fast(run_task_mod, tmp_path): + # tar exits nonzero for a checkout dir that does not exist; the helper must + # raise and leave neither the destination nor the temporary file behind. + dest = tmp_path / "out.tar.zst" + missing = tmp_path / "does-not-exist" + + with pytest.raises(RuntimeError): + run_task_mod.create_checkout_bundle(str(missing), str(dest)) + + assert not dest.exists() + assert not dest.with_name(f"{dest.name}.tmp").exists() + + def test_remove_directory(monkeypatch, run_task_mod): _tempdir = tempfile.TemporaryDirectory() assert os.path.isdir(_tempdir.name) is True @@ -638,3 +723,36 @@ def test_main_abspath_environment(mocker, run_main): assert env.get("MOZ_UV_HOME") == "/builds/worker/dir/uv" for key in envvars: assert env[key] == "/builds/worker/file" + + +@nowin +def test_main_runs_checkout_bundle(mocker, run_main, run_task_mod, tmp_path): + # Drive main() to verify the bundle step is opt-in (default OFF) and wired to + # create_checkout_bundle once per eligible repo. Stub out the real checkout + # and archiving; just record the bundle invocations. + mocker.patch.object(run_task_mod, "vcs_checkout_from_args", lambda repo: None) + + calls = [] + mocker.patch.object( + run_task_mod, + "create_checkout_bundle", + lambda checkout_dir, dest_path: calls.append((checkout_dir, dest_path)), + ) + + checkout = tmp_path / "checkouts" / "vcs" + bundle = tmp_path / "artifacts" / "src.tar.zst" + + # Default OFF: without --vcs-checkout-bundle, the bundle step is skipped. 
+ result, _ = run_main(extra_args=[f"--vcs-checkout={checkout}"]) + assert result == 0 + assert calls == [] + + # Opt-in: the bundle step runs once with the (abspath'd) checkout and dest. + result, _ = run_main( + extra_args=[ + f"--vcs-checkout={checkout}", + f"--vcs-checkout-bundle={bundle}", + ] + ) + assert result == 0 + assert calls == [(os.path.abspath(str(checkout)), os.path.abspath(str(bundle)))]