Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Change Log

## [Unreleased]

### Added

- `run-task` now accepts an opt-in `--{project}-checkout-bundle=<dest>` option to write a `.tar.zst` archive of a checkout (excluding `.git`/`.hg`) after checkout, so a decision task can publish its checkout as an artifact consumable downstream via `fetches:`

## [24.1.0] - 2026-06-04

### Added
Expand Down
44 changes: 44 additions & 0 deletions docs/howto/use-fetches.rst
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,47 @@ There are a few differences from the earlier ``build`` examples here:

It is not possible to configure the ``dest`` or ``extract`` values when using
``fetch`` or ``toolchain`` kinds.

Publishing a Checkout as a Fetch
--------------------------------

Re-cloning a large repository in every task can be slow. To avoid this, a task
that already performs a checkout (such as the decision task) can publish a
compressed snapshot of its checkout as an artifact, which downstream tasks then
consume via the ``fetches`` mechanism described above.

The ``run-task`` script accepts an opt-in ``--{project}-checkout-bundle=<dest>``
option (one per repository, mirroring ``--{project}-checkout=<path>``). When
set, ``run-task`` writes a ``.tar.zst`` archive of the checkout to ``<dest>``
immediately after the checkout completes. ``<dest>`` must end in ``.tar.zst``;
any other suffix is rejected. The archive contains only the working-tree files:
the ``.git`` and ``.hg`` metadata directories are excluded, so no history is
included. The checkout directory's basename is used as the archive's top-level
prefix. If archiving fails, ``run-task`` exits non-zero rather than producing a
broken artifact.

For example, point the bundle at a path inside the directory the worker exposes
as artifacts (commonly ``UPLOAD_DIR``). The producing task must also declare a
matching artifact in its payload; writing the file alone does not publish it:

.. code-block:: text

   run-task --vcs-checkout=/builds/worker/checkouts/src \
       --vcs-checkout-bundle=$UPLOAD_DIR/src.tar.zst -- ...

Because the artifact is a ``.tar.zst`` (the same format produced by
``fetch-content``'s ``git-checkout-archive``), a downstream task can fetch and
extract it like any other artifact. As with the ``build.zip`` example above, the
artifact path is relative to the producing task's artifact prefix (which
defaults to ``public/build``):

.. code-block:: yaml

   test:
     dependencies:
       decision: decision
     fetches:
       decision:
         - artifact: src.tar.zst
           extract: true

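The archive will be extracted into the ``$TASK_WORKDIR/fetches`` directory.
Because the archive's top-level prefix is the checkout directory's basename,
the checkout from the example above ends up in ``$TASK_WORKDIR/fetches/src``.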
89 changes: 89 additions & 0 deletions src/taskgraph/run-task/run-task
Original file line number Diff line number Diff line change
Expand Up @@ -916,11 +916,17 @@ def add_vcs_arguments(parser, project, name):
action="store_true",
help=f"Use shallow clone for {name}",
)
parser.add_argument(
f"--{project}-checkout-bundle",
help=f"If set, write a .tar.zst archive of the {name} checkout "
f"(excluding .git/.hg) to this path after checkout",
)


def collect_vcs_options(args, project, name):
checkout = getattr(args, f"{project}_checkout")
shallow_clone = getattr(args, f"{project}_shallow_clone")
checkout_bundle = getattr(args, f"{project}_checkout_bundle")

env_prefix = project.upper()

Expand All @@ -938,6 +944,8 @@ def collect_vcs_options(args, project, name):
# Expand ~ in some paths.
if checkout:
checkout = os.path.abspath(os.path.expanduser(checkout))
if checkout_bundle:
checkout_bundle = os.path.abspath(os.path.expanduser(checkout_bundle))
if store_path:
store_path = os.path.abspath(os.path.expanduser(store_path))

Expand All @@ -956,6 +964,7 @@ def collect_vcs_options(args, project, name):
"name": name,
"env-prefix": env_prefix,
"checkout": checkout,
"checkout-bundle": checkout_bundle,
"base-repo": base_repo,
"base-rev": base_rev,
"head-repo": head_repo,
Expand Down Expand Up @@ -1034,6 +1043,79 @@ def vcs_checkout_from_args(options):
os.environ["{}_HEAD_REV".format(options["env-prefix"])] = revision


def create_checkout_bundle(checkout_dir, dest_path):
    """Write a compressed archive of a checkout's working tree.

    Produces a ``.tar.zst`` archive of the files in ``checkout_dir``, excluding
    the ``.git`` and ``.hg`` vcs metadata directories (i.e. no history). The
    basename of ``checkout_dir`` is used as the archive's top-level prefix.

    This matches the ``.tar.zst`` format produced by fetch-content's
    ``git_checkout_archive``, so the resulting artifact is consumable downstream
    via the standard ``fetches:`` mechanism.

    The archive is streamed from ``tar`` through a zstd compressor into
    ``<dest_path>.tmp`` and only moved onto ``dest_path`` once ``tar`` has
    exited successfully. On any failure the temporary file is removed and the
    error is re-raised, so a partial or broken artifact is never left behind.

    NOTE: ``dest_path`` should live outside ``checkout_dir``; everything under
    the checkout other than ``.git``/``.hg`` is handed to ``tar``, which would
    include the in-progress temporary file.

    Args:
        checkout_dir: Path of the checkout to archive.
        dest_path: Path the archive is written to. Must end in ``.tar.zst``.
            Missing parent directories are created. An existing file at this
            path is overwritten.

    Raises:
        ValueError: If ``dest_path`` does not end in ``.tar.zst``.
        RuntimeError: If ``tar`` exits with a non-zero status.
    """
    import zstandard  # noqa: PLC0415

    checkout_dir = Path(checkout_dir)
    dest_path = Path(dest_path)

    if dest_path.suffixes[-2:] != [".tar", ".zst"]:
        raise ValueError(
            f"checkout bundle destination must end in .tar.zst, got: {dest_path}"
        )

    dest_path.parent.mkdir(parents=True, exist_ok=True)

    prefix = checkout_dir.name
    print_line(
        b"bundle",
        b"creating checkout bundle of %s at %s\n"
        % (str(checkout_dir).encode("utf-8"), str(dest_path).encode("utf-8")),
    )

    # Archive the checkout from its parent so the top-level entry is the
    # checkout's basename, excluding vcs metadata so no history is included.
    proc = subprocess.Popen(
        [
            "tar",
            "cf",
            "-",
            "--exclude=.git",
            "--exclude=.hg",
            "-C",
            str(checkout_dir.parent),
            prefix,
        ],
        stdout=subprocess.PIPE,
    )

    # Write to a temporary path and move it into place on success so a partial
    # or broken archive is never left at dest_path.
    tmp_path = dest_path.with_name(f"{dest_path.name}.tmp")
    try:
        try:
            with tmp_path.open("wb") as out:
                zstandard.ZstdCompressor().copy_stream(proc.stdout, out)
        except BaseException:
            # Compression was interrupted; don't leave tar blocked on a pipe
            # nobody is reading from.
            proc.kill()
            proc.wait()
            raise
        finally:
            proc.stdout.close()

        ret = proc.wait()
        if ret != 0:
            raise RuntimeError(
                f"tar exited with code {ret} while creating checkout bundle"
            )

        # replace() rather than rename(): rename() raises FileExistsError on
        # Windows when dest_path already exists, whereas replace() overwrites
        # on every platform.
        tmp_path.replace(dest_path)
    except BaseException:
        # Single cleanup point for every failure above, including a failed
        # final move.
        tmp_path.unlink(missing_ok=True)
        raise

    print_line(
        b"bundle",
        b"finished creating checkout bundle at %s\n" % str(dest_path).encode("utf-8"),
    )


def install_pip_requirements(repositories):
"""Install pip requirements files from specified repositories, if necessary."""
requirements = [
Expand Down Expand Up @@ -1317,6 +1399,13 @@ def main(args):
for repo in repositories:
vcs_checkout_from_args(repo)

# Optionally publish each checkout as a compressed bundle so a decision
# task can expose it as an artifact for downstream tasks to fetch instead
# of re-cloning. Runs as the worker user, before the task command.
for repo in repositories:
if repo.get("checkout") and repo.get("checkout-bundle"):
create_checkout_bundle(repo["checkout"], repo["checkout-bundle"])

# Convert certain well known environment variables to absolute paths.
for k in [
"CARGO_HOME",
Expand Down
118 changes: 118 additions & 0 deletions test/test_scripts_run_task.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
import functools
import io
import os
import site
import stat
import subprocess
import sys
import tarfile
import tempfile
from argparse import Namespace
from importlib.machinery import SourceFileLoader
from importlib.util import module_from_spec, spec_from_loader
from unittest.mock import Mock

import pytest
import zstandard

import taskgraph
from taskgraph.util.caches import CACHES
Expand Down Expand Up @@ -179,6 +182,16 @@ def test_install_pip_requirements_with_uv(
{"shallow-clone": True},
id="git_with_shallow_clone",
),
pytest.param(
{"myrepo_checkout_bundle": "checkout.tar.zst"},
{
"REPOSITORY_TYPE": "git",
"HEAD_REPOSITORY": "https://github.com/test/repo.git",
"HEAD_REV": "abc123",
},
{},
id="git_with_checkout_bundle",
),
],
)
def test_collect_vcs_options(
Expand All @@ -197,6 +210,7 @@ def test_collect_vcs_options(

args.setdefault(f"{name}_checkout", checkout)
args.setdefault(f"{name}_shallow_clone", False)
args.setdefault(f"{name}_checkout_bundle", None)
args = Namespace(**args)

result = run_task_mod.collect_vcs_options(args, name, name)
Expand All @@ -205,6 +219,7 @@ def test_collect_vcs_options(
"base-repo": env.get("BASE_REPOSITORY"),
"base-rev": env.get("BASE_REV"),
"checkout": os.path.join(os.getcwd(), "checkout"),
"checkout-bundle": None,
"env-prefix": name.upper(),
"head-repo": env.get("HEAD_REPOSITORY"),
"name": name,
Expand All @@ -222,10 +237,80 @@ def test_collect_vcs_options(
expected["checkout"], env.get("PIP_REQUIREMENTS")
)

bundle = getattr(args, f"{name}_checkout_bundle")
if bundle:
# Derived independently (like the ``checkout`` expectation above) so the
# abspath handling in collect_vcs_options is load-bearing, not mirrored.
expected["checkout-bundle"] = os.path.join(os.getcwd(), bundle)

expected.update(extra_expected)
assert result == expected


def test_create_checkout_bundle(run_task_mod, tmp_path):
    """The bundle holds the working tree under the checkout's basename, minus vcs metadata."""
    # Fake checkout: vcs metadata (including a nested .git, as produced by
    # submodules/sub-checkouts) alongside real working files.
    checkout = tmp_path / "myco"
    layout = {
        ".git/config": "[core]\n",
        ".hg/hgrc": "[paths]\n",
        "sub/.git/config": "[core]\n",
        "file1.txt": "hello",
        "sub/file2.txt": "world",
        ".gitignore": "*.pyc\n",
    }
    for relative, text in layout.items():
        target = checkout / relative
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(text)

    dest = tmp_path / "out.tar.zst"
    run_task_mod.create_checkout_bundle(str(checkout), str(dest))

    assert dest.exists()
    # The temporary file used while writing must be gone.
    leftover = dest.with_name(f"{dest.name}.tmp")
    assert not leftover.exists()

    # The output must be valid zstd wrapping a valid tar. Decompress it fully
    # so tarfile can open it in random-access mode and file contents can be
    # checked as well as names.
    with dest.open("rb") as fh:
        tar_bytes = zstandard.ZstdDecompressor().stream_reader(fh).read()
    with tarfile.open(fileobj=io.BytesIO(tar_bytes), mode="r:") as tf:
        members = tf.getnames()
        contents = {
            name: tf.extractfile(name).read()
            for name in ("myco/file1.txt", "myco/sub/file2.txt")
        }
    assert contents == {
        "myco/file1.txt": b"hello",
        "myco/sub/file2.txt": b"world",
    }

    # Regular files sit under the checkout's basename prefix, and a dotfile
    # that merely starts with ".git" is kept.
    for expected in ("myco/file1.txt", "myco/sub/file2.txt", "myco/.gitignore"):
        assert expected in members

    # vcs metadata directories are excluded at any depth (no history).
    vcs_dirs = {".git", ".hg"}
    offenders = [m for m in members if vcs_dirs.intersection(m.split("/"))]
    assert not offenders, members


def test_create_checkout_bundle_rejects_bad_suffix(run_task_mod, tmp_path):
    """A destination that does not end in ``.tar.zst`` raises ValueError."""
    source_dir = tmp_path / "myco"
    source_dir.mkdir()
    source_dir.joinpath("file1.txt").write_text("hello")

    wrong_suffix = tmp_path / "out.tar.gz"
    with pytest.raises(ValueError):
        run_task_mod.create_checkout_bundle(str(source_dir), str(wrong_suffix))


def test_create_checkout_bundle_fails_fast(run_task_mod, tmp_path):
    """A failing tar run raises and leaves no destination or temporary file."""
    # tar exits nonzero when asked to archive a checkout dir that is absent.
    absent_checkout = tmp_path / "does-not-exist"
    bundle = tmp_path / "out.tar.zst"

    with pytest.raises(RuntimeError):
        run_task_mod.create_checkout_bundle(str(absent_checkout), str(bundle))

    candidates = (bundle, bundle.with_name(f"{bundle.name}.tmp"))
    leftovers = [path for path in candidates if path.exists()]
    assert leftovers == []


def test_remove_directory(monkeypatch, run_task_mod):
_tempdir = tempfile.TemporaryDirectory()
assert os.path.isdir(_tempdir.name) is True
Expand Down Expand Up @@ -638,3 +723,36 @@ def test_main_abspath_environment(mocker, run_main):
assert env.get("MOZ_UV_HOME") == "/builds/worker/dir/uv"
for key in envvars:
assert env[key] == "/builds/worker/file"


@nowin
def test_main_runs_checkout_bundle(mocker, run_main, run_task_mod, tmp_path):
    """main() bundles a checkout only when --vcs-checkout-bundle is passed.

    The real checkout and the real archiving are both stubbed out; the test
    only records how create_checkout_bundle gets invoked.
    """
    recorded = []

    def skip_checkout(repo):
        return None

    def record_bundle(checkout_dir, dest_path):
        recorded.append((checkout_dir, dest_path))

    mocker.patch.object(run_task_mod, "vcs_checkout_from_args", skip_checkout)
    mocker.patch.object(run_task_mod, "create_checkout_bundle", record_bundle)

    checkout = tmp_path / "checkouts" / "vcs"
    bundle = tmp_path / "artifacts" / "src.tar.zst"
    checkout_arg = f"--vcs-checkout={checkout}"
    bundle_arg = f"--vcs-checkout-bundle={bundle}"

    # Default OFF: with only the checkout option, no bundle is created.
    result, _ = run_main(extra_args=[checkout_arg])
    assert result == 0
    assert recorded == []

    # Opt-in: exactly one call, with the (abspath'd) checkout and destination.
    result, _ = run_main(extra_args=[checkout_arg, bundle_arg])
    assert result == 0
    expected_call = (os.path.abspath(str(checkout)), os.path.abspath(str(bundle)))
    assert recorded == [expected_call]