Skip to content

Commit 479ceea

Browse files
committed
feat(git-export): add script for efficient directory export from git repos
1 parent c21cdb3 commit 479ceea

1 file changed

Lines changed: 328 additions & 0 deletions

File tree

git-export.py

Lines changed: 328 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,328 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Export a directory from a Git repository using treeless + sparse clone.
4+
5+
Workflow:
6+
1. Clone repository with --filter=tree:0 --sparse --no-checkout.
7+
2. Configure sparse-checkout for the requested directory.
8+
3. Checkout the requested ref (or default branch).
9+
4. Copy only the requested directory contents to output.
10+
5. Always remove .git from the output.
11+
"""
12+
13+
from __future__ import annotations
14+
15+
import argparse
16+
import os
17+
import shutil
18+
import subprocess
19+
import sys
20+
import tempfile
21+
import time
22+
import urllib.parse
23+
from pathlib import Path
24+
25+
26+
class GitExportError(RuntimeError):
    """Signals that an export could not be completed.

    Raised for invalid user input (bad URL or source path), for an output
    path that already exists, and for git commands that exit with a failure
    status. The CLI entry point reports it as a one-line error.
    """
28+
29+
30+
def info(message: str) -> None:
    """Print a progress line to stdout, tagged with the tool name.

    The stream is flushed on every call so progress stays visible when
    stdout is piped or buffered.
    """
    line = "[git-export] {}".format(message)
    print(line, flush=True)
32+
33+
34+
def run_git(git_bin: str, args: list[str], cwd: Path | None = None, verbose: bool = False) -> None:
    """Run one git command and turn a non-zero exit into GitExportError.

    Args:
        git_bin: Name or path of the git executable.
        args: Arguments placed after the executable.
        cwd: Directory to run in; the current directory is used when omitted.
        verbose: When true, echo the command and its working directory first.

    Raises:
        GitExportError: The command exited with a failure status. The message
            carries stderr, falling back to stdout, then to the exception text.

    The command's output is captured rather than shown, so nothing from git
    is printed on success.
    """
    command = [git_bin, *args]
    rendered = " ".join(command)
    workdir = str(cwd) if cwd else None

    if verbose:
        print(f"+ (cwd={workdir or os.getcwd()}) {rendered}")

    try:
        subprocess.run(command, cwd=workdir, check=True, text=True, capture_output=True)
    except subprocess.CalledProcessError as failure:
        captured = ((failure.stderr or "").strip(), (failure.stdout or "").strip())
        # First non-empty stream wins; otherwise describe the exception itself.
        detail = next((text for text in captured if text), str(failure))
        raise GitExportError(f"git command failed: {rendered}\n{detail}") from failure
46+
47+
48+
def normalize_source_path(path: str) -> str:
    """Return ``path`` as a clean, repository-relative POSIX path.

    Surrounding whitespace, leading/trailing slashes, empty segments and
    ``.`` segments are dropped; the remaining segments are joined with ``/``.

    Args:
        path: Directory path inside the repository, as typed by the user.

    Returns:
        The normalized path, guaranteed to be non-empty.

    Raises:
        GitExportError: The path is empty after normalization (including
            inputs such as ``"."`` or ``"./"``), or it contains a ``..``
            segment.
    """
    parts = [p for p in path.strip().strip("/").split("/") if p not in ("", ".")]
    # Check emptiness AFTER filtering: "." or "./." would otherwise normalize
    # to "" and silently select the repository root instead of a directory.
    if not parts:
        raise GitExportError("Source path must not be empty")
    if ".." in parts:
        raise GitExportError("Source path must not contain '..'")
    return "/".join(parts)
56+
57+
58+
def parse_github_directory_url(url: str) -> tuple[str, str, str | None]:
    """
    Parse a GitHub directory URL into (repo_url, source_path, ref).

    Supported examples:
    - https://github.com/org/repo/lang/ruby
    - https://github.com/org/repo/tree/main/lang/ruby
    - https://github.com/org/repo/blob/main/lang/ruby

    The host is matched case-insensitively, and percent-encoded characters in
    the ref and directory segments (e.g. ``my%20dir``) are decoded so they
    match the names found in the checked-out tree.

    In tree/blob URLs the single segment after ``tree``/``blob`` is taken as
    the ref, so refs that themselves contain ``/`` cannot be expressed this
    way.

    Returns:
        A tuple of the HTTPS clone URL, the normalized source directory, and
        the ref from the URL (``None`` when the URL names no ref).

    Raises:
        GitExportError: The URL is not an http(s) github.com URL, lacks a
            directory path, or the directory path fails normalization.
    """
    parsed = urllib.parse.urlparse(url)
    # Host names are case-insensitive; urlparse already lower-cases the scheme.
    host = parsed.netloc.lower()
    if parsed.scheme not in ("http", "https") or host not in ("github.com", "www.github.com"):
        raise GitExportError(f"Not a supported GitHub URL: {url}")

    parts = [p for p in parsed.path.split("/") if p]
    if len(parts) < 3:
        raise GitExportError(
            "GitHub URL must include a directory path after owner/repo "
            f"(got: {url})"
        )

    owner = parts[0]
    repo = parts[1]
    if repo.endswith(".git"):
        repo = repo[:-4]

    # Browser URLs percent-encode spaces and non-ASCII characters; decode them
    # so the path matches the real directory names in the repository.
    rest = [urllib.parse.unquote(p) for p in parts[2:]]
    ref: str | None = None
    source: str

    if rest[0] in ("tree", "blob"):
        if len(rest) < 3:
            raise GitExportError(
                "tree/blob URLs must include ref and directory path, "
                f"got: {url}"
            )
        ref = rest[1]
        source = "/".join(rest[2:])
    else:
        source = "/".join(rest)

    repo_url = f"https://github.com/{owner}/{repo}.git"
    # normalize_source_path also rejects any ".." that decoding revealed.
    return repo_url, normalize_source_path(source), ref
100+
101+
102+
def prepare_output_dir(output_dir: Path, force: bool) -> None:
    """Ensure ``output_dir`` exists as a fresh, empty directory.

    Args:
        output_dir: Destination path for the export.
        force: When true, whatever currently occupies ``output_dir`` is
            deleted first; when false, an occupied path is an error.

    Raises:
        GitExportError: Something already exists at ``output_dir`` and
            ``force`` is false.
    """
    # Path.exists() follows symlinks, so a dangling symlink reports False even
    # though the name is taken and mkdir() would fail; test for it explicitly.
    occupied = output_dir.exists() or output_dir.is_symlink()
    if occupied:
        if not force:
            raise GitExportError(
                f"Output path already exists: {output_dir} (use --force to overwrite)"
            )
        if output_dir.is_dir() and not output_dir.is_symlink():
            shutil.rmtree(output_dir)
        else:
            # Files and symlinks (even ones pointing at a directory) are
            # unlinked; a symlink's target is never touched.
            output_dir.unlink()
    output_dir.mkdir(parents=True, exist_ok=True)
113+
114+
115+
def copy_entry(src: Path, dst: Path) -> None:
    """Copy one directory entry from ``src`` to ``dst``.

    Directories are copied recursively (merging into an existing directory
    and keeping inner symlinks as symlinks), regular files are copied with
    their metadata, and symlinks are recreated as links with the same target
    text, replacing anything already at ``dst``.
    """
    if not src.is_symlink():
        if src.is_dir():
            shutil.copytree(src, dst, symlinks=True, dirs_exist_ok=True)
        else:
            shutil.copy2(src, dst, follow_symlinks=False)
        return

    link_target = os.readlink(src)
    # is_symlink() is needed too: exists() is False for a dangling link.
    if dst.is_symlink() or dst.exists():
        is_real_directory = dst.is_dir() and not dst.is_symlink()
        if is_real_directory:
            shutil.rmtree(dst)
        else:
            dst.unlink()
    os.symlink(link_target, dst)
129+
130+
131+
def export_directory(
    repo_url: str,
    source_path: str,
    output_dir: Path,
    ref: str | None,
    depth: int,
    force: bool,
    git_bin: str,
    verbose: bool,
) -> None:
    """Export a single directory of a git repository into ``output_dir``.

    The repository is cloned into a temporary directory (shallow, treeless,
    sparse, without checkout), sparse-checkout is narrowed to ``source_path``,
    the requested ref (or the clone's default HEAD) is checked out, and the
    top-level entries of the directory are copied into ``output_dir``.
    Progress is reported through ``info``.

    Args:
        repo_url: URL handed to ``git clone``.
        source_path: Directory inside the repository; normalized before use.
        output_dir: Destination directory; resolved to an absolute path.
        ref: Ref to fetch and check out detached, or ``None`` for the default.
        depth: Value passed to ``--depth`` for clone and fetch.
        force: Whether an existing ``output_dir`` may be replaced.
        git_bin: Git executable name or path.
        verbose: Whether each git command is echoed before it runs.

    Raises:
        GitExportError: ``source_path`` is invalid, a git command fails, the
            directory is missing after checkout, or the output path exists
            and ``force`` is false.
    """
    overall_started = time.perf_counter()
    source_path = normalize_source_path(source_path)
    output_dir = output_dir.resolve()
    ref_label = ref or "default branch"

    def seconds_since(started: float) -> str:
        # Render elapsed wall time in the "1.2s" form used by progress lines.
        return f"{time.perf_counter() - started:.1f}s"

    def git(*git_args: str, cwd: Path | None = None) -> None:
        # Thin wrapper so each call site only spells out the git arguments.
        run_git(git_bin, list(git_args), cwd=cwd, verbose=verbose)

    for banner in (
        f"Repository: {repo_url}",
        f"Source path: {source_path}",
        f"Ref: {ref_label}",
        f"Output: {output_dir}",
    ):
        info(banner)

    with tempfile.TemporaryDirectory(prefix="git-export-") as scratch:
        clone_dir = Path(scratch) / "repo"

        info("Step 1/6: cloning repository (treeless + sparse, no checkout)")
        started = time.perf_counter()
        git(
            "clone",
            "--depth",
            str(depth),
            "--filter=tree:0",
            "--sparse",
            "--no-checkout",
            repo_url,
            str(clone_dir),
        )
        info(f"Step 1/6 complete in {seconds_since(started)}")

        info("Step 2/6: configuring sparse checkout")
        started = time.perf_counter()
        git("sparse-checkout", "init", "--cone", cwd=clone_dir)
        git("sparse-checkout", "set", "--", source_path, cwd=clone_dir)
        info(f"Step 2/6 complete in {seconds_since(started)}")

        info("Step 3/6: checking out requested ref/path")
        started = time.perf_counter()
        if not ref:
            git("checkout", cwd=clone_dir)
        else:
            # Fetch the ref explicitly: the shallow clone only carries the
            # remote's default HEAD.
            git("fetch", "--depth", str(depth), "origin", ref, cwd=clone_dir)
            git("checkout", "--detach", "FETCH_HEAD", cwd=clone_dir)
        info(f"Step 3/6 complete in {seconds_since(started)}")

        info("Step 4/6: validating source directory")
        source_dir = clone_dir / source_path
        if not (source_dir.exists() and source_dir.is_dir()):
            raise GitExportError(
                f"Source directory not found after checkout: {source_path}\n"
                f"Repository: {repo_url}\n"
                f"Ref: {ref_label}"
            )
        info("Step 4/6 complete")

        # The output is only touched after the source is known to exist, so a
        # failed export never destroys an existing output directory.
        info("Step 5/6: preparing output directory")
        started = time.perf_counter()
        prepare_output_dir(output_dir, force=force)
        info(f"Step 5/6 complete in {seconds_since(started)}")

        info("Step 6/6: copying exported files")
        started = time.perf_counter()
        entries = list(source_dir.iterdir())
        entry_count = len(entries)
        if not entries:
            info("Source directory is empty")
        for position, entry in enumerate(entries, start=1):
            info(f"  - [{position}/{entry_count}] {entry.name}")
            copy_entry(entry, output_dir / entry.name)
        info(f"Step 6/6 complete in {seconds_since(started)}")

        # Explicitly ensure .git is never left in output.
        info("Finalizing export (removing .git if present)")
        shutil.rmtree(output_dir / ".git", ignore_errors=True)
        info(f"All done in {seconds_since(overall_started)}")
222+
223+
224+
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point.

    Two invocation shapes are accepted:

    * URL mode: ``INPUT OUTPUT`` where INPUT is a GitHub directory URL, or a
      plain repository URL combined with ``--source``.
    * Legacy mode: ``REPO SOURCE OUTPUT`` (three positionals).

    An explicit ``--ref`` takes precedence over a ref parsed from the URL.

    Args:
        argv: Argument list to parse, without the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``.

    Returns:
        0 on success, 1 when the export fails, 2 for an invalid ``--depth``.
        Argparse itself exits the process on malformed arguments.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Export one directory from a huge Git repository using treeless + sparse clone."
        )
    )
    parser.add_argument(
        "input",
        help=(
            "Either a repository URL (legacy mode) or a full GitHub directory URL, "
            "e.g. https://github.com/apache/avro/lang/ruby"
        ),
    )
    parser.add_argument(
        "arg2",
        help=(
            "In URL mode: destination output directory. "
            "In legacy mode: source directory path."
        ),
    )
    parser.add_argument(
        "arg3",
        nargs="?",
        help="Legacy mode only: destination output directory.",
    )
    parser.add_argument(
        "--source",
        help=(
            "Source directory path when using 2-arg mode with a repository URL input."
        ),
    )
    parser.add_argument(
        "--ref",
        "-r",
        help="Branch/tag/ref to export (default: repository default branch)",
    )
    parser.add_argument(
        "--depth",
        type=int,
        default=1,
        help="Fetch depth for clone/fetch (default: 1)",
    )
    parser.add_argument(
        "--force",
        "-f",
        action="store_true",
        help="Overwrite output directory if it already exists",
    )
    parser.add_argument(
        "--git-bin",
        default="git",
        help="Git binary path/name (default: git)",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Print git commands while running",
    )

    args = parser.parse_args(argv)

    if args.depth < 1:
        print("Error: --depth must be >= 1", file=sys.stderr)
        return 2

    try:
        parsed_ref: str | None = None
        if args.arg3 is not None:
            # Legacy mode: repo source output
            repo_url = args.input
            source_path = args.arg2
            output_path = args.arg3
        else:
            # URL mode: input output
            output_path = args.arg2
            if args.source:
                repo_url = args.input
                source_path = args.source
            else:
                repo_url, source_path, parsed_ref = parse_github_directory_url(args.input)

        export_directory(
            repo_url=repo_url,
            source_path=source_path,
            output_dir=Path(output_path),
            ref=args.ref or parsed_ref,
            depth=args.depth,
            force=args.force,
            git_bin=args.git_bin,
            verbose=args.verbose,
        )
    except GitExportError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except FileNotFoundError as e:
        print(f"Error: unable to execute git binary '{args.git_bin}': {e}", file=sys.stderr)
        return 1
    except OSError as e:
        # Other filesystem/process failures (permissions, disk full, ...) are
        # reported as a one-line error instead of an unhandled traceback.
        print(f"Error: {e}", file=sys.stderr)
        return 1

    print(f"Export complete: {Path(output_path).resolve()}")
    return 0
325+
326+
327+
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())

0 commit comments

Comments
 (0)