|
| 1 | +#!/usr/bin/env python3 |
| 2 | +""" |
| 3 | +Export a directory from a Git repository using treeless + sparse clone. |
| 4 | +
|
| 5 | +Workflow: |
| 6 | +1. Clone repository with --filter=tree:0 --sparse --no-checkout. |
| 7 | +2. Configure sparse-checkout for the requested directory. |
| 8 | +3. Checkout the requested ref (or default branch). |
| 9 | +4. Copy only the requested directory contents to output. |
| 10 | +5. Always remove .git from the output. |
| 11 | +""" |
| 12 | + |
| 13 | +from __future__ import annotations |
| 14 | + |
| 15 | +import argparse |
| 16 | +import os |
| 17 | +import shutil |
| 18 | +import subprocess |
| 19 | +import sys |
| 20 | +import tempfile |
| 21 | +import time |
| 22 | +import urllib.parse |
| 23 | +from pathlib import Path |
| 24 | + |
| 25 | + |
class GitExportError(RuntimeError):
    """Export failure whose message is meant to be shown to the user."""
| 28 | + |
| 29 | + |
def info(message: str) -> None:
    """Print a progress line to stdout with the tool prefix, flushing at once."""
    line = "[git-export] {}".format(message)
    print(line, flush=True)
| 32 | + |
| 33 | + |
def run_git(git_bin: str, args: list[str], cwd: Path | None = None, verbose: bool = False) -> None:
    """Run a single git command and fail loudly if it exits non-zero.

    Args:
        git_bin: Name or path of the git executable.
        args: Arguments placed after the executable.
        cwd: Directory to run the command in; the process's current
            directory is used when this is None.
        verbose: When true, echo the command and its working directory
            before running it.

    Raises:
        GitExportError: The command exited non-zero. The message carries
            git's stderr, falling back to its stdout and then to the
            exception text.
    """
    command = [git_bin] + list(args)
    run_dir = str(cwd) if cwd else None

    if verbose:
        shown_dir = run_dir if run_dir is not None else os.getcwd()
        print("+ (cwd={}) {}".format(shown_dir, " ".join(command)))

    try:
        # Output is captured (not streamed) so it can be attached to the error.
        subprocess.run(command, cwd=run_dir, check=True, text=True, capture_output=True)
    except subprocess.CalledProcessError as exc:
        captured = ((exc.stderr or "").strip(), (exc.stdout or "").strip())
        detail = next((text for text in captured if text), str(exc))
        raise GitExportError(
            "git command failed: " + " ".join(command) + "\n" + detail
        ) from exc
| 46 | + |
| 47 | + |
def normalize_source_path(path: str) -> str:
    """Return *path* as a clean, repository-relative, slash-separated path.

    Surrounding whitespace is stripped, and empty and "." segments are
    dropped (which also removes leading, trailing and doubled slashes).

    Args:
        path: Directory path as supplied by the user or parsed from a URL.

    Returns:
        The remaining segments joined with "/".

    Raises:
        GitExportError: If the path contains a ".." segment, or if nothing
            is left after normalization (e.g. "", "/", "." or "./").
    """
    segments = [seg for seg in path.strip().split("/") if seg not in ("", ".")]
    if ".." in segments:
        raise GitExportError("Source path must not contain '..'")
    # Check emptiness only after dropping "." segments; otherwise an input
    # like "." would slip through and be returned as an empty path.
    if not segments:
        raise GitExportError("Source path must not be empty")
    return "/".join(segments)
| 56 | + |
| 57 | + |
def parse_github_directory_url(url: str) -> tuple[str, str, str | None]:
    """
    Parse a GitHub directory URL into (repo_url, source_path, ref).

    Supported examples:
    - https://github.com/org/repo/lang/ruby
    - https://github.com/org/repo/tree/main/lang/ruby
    - https://github.com/org/repo/blob/main/lang/ruby

    Path segments after owner/repo are percent-decoded, so a directory such
    as "my%20dir" is returned as "my dir". The host is compared
    case-insensitively.

    Limitation: in tree/blob URLs the ref is taken to be the single path
    segment after "tree"/"blob". A ref written with literal slashes
    (e.g. "feature/x") is therefore split, with the remainder treated as
    part of the directory path; callers must supply such refs separately.

    Returns:
        A tuple of the canonical https clone URL (always ending in ".git"),
        the normalized source path, and the ref (None when the URL names
        no ref).

    Raises:
        GitExportError: If the URL is not an http(s) github.com URL, has no
            directory path after owner/repo, is a tree/blob URL without
            both ref and path, or the path fails normalization.
    """
    parsed = urllib.parse.urlparse(url)
    # urlparse lower-cases the scheme but not the host, so do that here.
    host = parsed.netloc.lower()
    if parsed.scheme not in ("http", "https") or host not in ("github.com", "www.github.com"):
        raise GitExportError(f"Not a supported GitHub URL: {url}")

    parts = [p for p in parsed.path.split("/") if p]
    if len(parts) < 3:
        raise GitExportError(
            "GitHub URL must include a directory path after owner/repo "
            f"(got: {url})"
        )

    owner, repo = parts[0], parts[1]
    if repo.endswith(".git"):
        repo = repo[:-4]

    # Decode percent-escapes so the path matches the names in the checkout.
    rest = [urllib.parse.unquote(p) for p in parts[2:]]
    ref: str | None = None

    if rest[0] in ("tree", "blob"):
        if len(rest) < 3:
            raise GitExportError(
                "tree/blob URLs must include ref and directory path, "
                f"got: {url}"
            )
        ref = rest[1]
        source = "/".join(rest[2:])
    else:
        source = "/".join(rest)

    repo_url = f"https://github.com/{owner}/{repo}.git"
    # normalize_source_path also rejects any ".." produced by decoding.
    return repo_url, normalize_source_path(source), ref
| 100 | + |
| 101 | + |
def prepare_output_dir(output_dir: Path, force: bool) -> None:
    """Make sure *output_dir* exists as an empty directory.

    Args:
        output_dir: Destination path for the export.
        force: When true, whatever currently occupies the path (file,
            symlink or directory tree) is removed first.

    Raises:
        GitExportError: If the path is already occupied and *force* is false.
    """
    # Path.exists() follows symlinks, so a dangling symlink reports False
    # even though the name is taken; check is_symlink() as well.
    occupied = output_dir.is_symlink() or output_dir.exists()
    if occupied:
        if not force:
            raise GitExportError(
                f"Output path already exists: {output_dir} (use --force to overwrite)"
            )
        # Only a real directory needs a recursive delete. Symlinks (even to
        # directories) and every other file type are removed with unlink().
        if output_dir.is_symlink() or not output_dir.is_dir():
            output_dir.unlink()
        else:
            shutil.rmtree(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
| 113 | + |
| 114 | + |
def copy_entry(src: Path, dst: Path) -> None:
    """Copy one filesystem entry from *src* to *dst* without following symlinks.

    Directories are copied recursively (merging into an existing
    directory), regular files are copied with their metadata, and
    symlinks are recreated as symlinks with the same target text,
    replacing anything already present at *dst*.
    """
    if not src.is_symlink():
        if src.is_dir():
            shutil.copytree(src, dst, symlinks=True, dirs_exist_ok=True)
        else:
            shutil.copy2(src, dst, follow_symlinks=False)
        return

    link_target = os.readlink(src)
    # os.symlink refuses to overwrite, so clear the destination first.
    if dst.is_symlink() or dst.exists():
        if dst.is_symlink() or not dst.is_dir():
            dst.unlink()
        else:
            shutil.rmtree(dst)
    os.symlink(link_target, dst)
| 129 | + |
| 130 | + |
def export_directory(
    repo_url: str,
    source_path: str,
    output_dir: Path,
    ref: str | None,
    depth: int,
    force: bool,
    git_bin: str,
    verbose: bool,
) -> None:
    """Export one directory of a Git repository into *output_dir*.

    The repository is cloned into a temporary directory with
    ``--depth``, ``--filter=tree:0``, ``--sparse`` and ``--no-checkout``,
    sparse-checkout is restricted to *source_path*, the requested ref (or
    the clone's HEAD) is checked out, and the children of the source
    directory are copied into *output_dir*. The temporary clone is removed
    when the function returns or raises.

    Args:
        repo_url: URL handed to ``git clone``.
        source_path: Directory inside the repository; it is normalized
            before use.
        output_dir: Destination directory; resolved to an absolute path.
        ref: Ref to fetch from ``origin`` and check out detached; a falsy
            value means "whatever the clone checked out by default".
        depth: Value passed to ``--depth`` for both clone and fetch.
        force: Passed to prepare_output_dir to allow replacing an existing
            output path.
        git_bin: Git executable name or path.
        verbose: When true, git commands are echoed before running.

    Raises:
        GitExportError: If the ref starts with "-", a git command fails, the
            source path is not a directory after checkout or resolves
            outside the checkout, or the output path exists without *force*.
    """
    start_total = time.perf_counter()
    source_path = normalize_source_path(source_path)
    output_dir = output_dir.resolve()

    # A ref starting with "-" would be parsed by `git fetch` as an option.
    if ref and ref.startswith("-"):
        raise GitExportError(f"Ref must not start with '-': {ref}")

    info(f"Repository: {repo_url}")
    info(f"Source path: {source_path}")
    info(f"Ref: {ref or 'default branch'}")
    info(f"Output: {output_dir}")

    with tempfile.TemporaryDirectory(prefix="git-export-") as temp_dir:
        work_dir = Path(temp_dir)
        clone_dir = work_dir / "repo"

        info("Step 1/6: cloning repository (treeless + sparse, no checkout)")
        step_start = time.perf_counter()
        run_git(
            git_bin,
            [
                "clone",
                "--depth",
                str(depth),
                "--filter=tree:0",
                "--sparse",
                "--no-checkout",
                # End of options: the URL must never be read as a flag.
                "--",
                repo_url,
                str(clone_dir),
            ],
            verbose=verbose,
        )
        info(f"Step 1/6 complete in {time.perf_counter() - step_start:.1f}s")

        info("Step 2/6: configuring sparse checkout")
        step_start = time.perf_counter()
        run_git(git_bin, ["sparse-checkout", "init", "--cone"], cwd=clone_dir, verbose=verbose)
        run_git(git_bin, ["sparse-checkout", "set", "--", source_path], cwd=clone_dir, verbose=verbose)
        info(f"Step 2/6 complete in {time.perf_counter() - step_start:.1f}s")

        info("Step 3/6: checking out requested ref/path")
        step_start = time.perf_counter()
        if ref:
            # Fetch the ref explicitly and check out FETCH_HEAD detached.
            run_git(
                git_bin,
                ["fetch", "--depth", str(depth), "origin", ref],
                cwd=clone_dir,
                verbose=verbose,
            )
            run_git(git_bin, ["checkout", "--detach", "FETCH_HEAD"], cwd=clone_dir, verbose=verbose)
        else:
            run_git(git_bin, ["checkout"], cwd=clone_dir, verbose=verbose)
        info(f"Step 3/6 complete in {time.perf_counter() - step_start:.1f}s")

        info("Step 4/6: validating source directory")
        source_dir = clone_dir / source_path
        # is_dir() is already False for a missing path.
        if not source_dir.is_dir():
            raise GitExportError(
                f"Source directory not found after checkout: {source_path}\n"
                f"Repository: {repo_url}\n"
                f"Ref: {ref or 'default branch'}"
            )
        # is_dir() follows symlinks; refuse a path that leads out of the
        # checkout so only repository content is ever exported.
        try:
            source_dir.resolve().relative_to(clone_dir.resolve())
        except ValueError:
            raise GitExportError(
                f"Source path resolves outside the repository checkout: {source_path}"
            ) from None
        info("Step 4/6 complete")

        # The output path is only touched after the source is known good.
        info("Step 5/6: preparing output directory")
        step_start = time.perf_counter()
        prepare_output_dir(output_dir, force=force)
        info(f"Step 5/6 complete in {time.perf_counter() - step_start:.1f}s")

        info("Step 6/6: copying exported files")
        step_start = time.perf_counter()
        # Materialize the listing so progress can show "[i/total]".
        children = list(source_dir.iterdir())
        total_children = len(children)
        if not children:
            info("Source directory is empty")
        for idx, child in enumerate(children, start=1):
            info(f" - [{idx}/{total_children}] {child.name}")
            copy_entry(child, output_dir / child.name)
        info(f"Step 6/6 complete in {time.perf_counter() - step_start:.1f}s")

        # Explicitly ensure .git is never left in output.
        info("Finalizing export (removing .git if present)")
        shutil.rmtree(output_dir / ".git", ignore_errors=True)
        info(f"All done in {time.perf_counter() - start_total:.1f}s")
| 222 | + |
| 223 | + |
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point.

    Two invocation shapes are accepted:
    - ``input output``: *input* is a GitHub directory URL, or a repository
      URL when ``--source`` is given.
    - ``repo source output`` (legacy): three positional arguments.

    An explicit ``--ref`` takes precedence over a ref parsed from the URL.

    Args:
        argv: Argument list to parse; None makes argparse read
            ``sys.argv[1:]``.

    Returns:
        0 on success, 1 when the export fails, 2 when ``--depth`` is
        less than 1. argparse itself exits via SystemExit on usage errors.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Export one directory from a huge Git repository using treeless + sparse clone."
        )
    )
    parser.add_argument(
        "input",
        help=(
            "Either a repository URL (legacy mode) or a full GitHub directory URL, "
            "e.g. https://github.com/apache/avro/lang/ruby"
        ),
    )
    parser.add_argument(
        "arg2",
        help=(
            "In URL mode: destination output directory. "
            "In legacy mode: source directory path."
        ),
    )
    parser.add_argument(
        "arg3",
        nargs="?",
        help="Legacy mode only: destination output directory.",
    )
    parser.add_argument(
        "--source",
        help=(
            "Source directory path when using 2-arg mode with a repository URL input."
        ),
    )
    parser.add_argument(
        "--ref",
        "-r",
        help="Branch/tag/ref to export (default: repository default branch)",
    )
    parser.add_argument(
        "--depth",
        type=int,
        default=1,
        help="Fetch depth for clone/fetch (default: 1)",
    )
    parser.add_argument(
        "--force",
        "-f",
        action="store_true",
        help="Overwrite output directory if it already exists",
    )
    parser.add_argument(
        "--git-bin",
        default="git",
        help="Git binary path/name (default: git)",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Print git commands while running",
    )

    args = parser.parse_args(argv)

    if args.depth < 1:
        print("Error: --depth must be >= 1", file=sys.stderr)
        return 2

    try:
        parsed_ref: str | None = None
        if args.arg3 is not None:
            # Legacy mode: repo source output
            repo_url = args.input
            source_path = args.arg2
            output_path = args.arg3
        else:
            # URL mode: input output
            output_path = args.arg2
            if args.source:
                repo_url = args.input
                source_path = args.source
            else:
                repo_url, source_path, parsed_ref = parse_github_directory_url(args.input)

        export_directory(
            repo_url=repo_url,
            source_path=source_path,
            output_dir=Path(output_path),
            ref=args.ref or parsed_ref,
            depth=args.depth,
            force=args.force,
            git_bin=args.git_bin,
            verbose=args.verbose,
        )
    except GitExportError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except FileNotFoundError as e:
        # Only blame the git binary when it really cannot be located; other
        # missing-file errors (e.g. during the copy) are reported as-is.
        if shutil.which(args.git_bin) is None:
            print(f"Error: unable to execute git binary '{args.git_bin}': {e}", file=sys.stderr)
        else:
            print(f"Error: {e}", file=sys.stderr)
        return 1
    except OSError as e:
        # Filesystem failures (permissions, disk full, ...) are reported as
        # a normal error rather than a traceback.
        print(f"Error: {e}", file=sys.stderr)
        return 1

    print(f"Export complete: {Path(output_path).resolve()}")
    return 0
| 325 | + |
| 326 | + |
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    sys.exit(main())
0 commit comments