Skip to content

Commit 9499e92

Browse files
committed
Add support for loading docs from a GIT repo.
1 parent 1fd2512 commit 9499e92

10 files changed

Lines changed: 894 additions & 8 deletions

File tree

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
- [pgEdge Document Loader Quickstart](docs/quickstart.md)
1313
- Using pgEdge Document Loader
1414
- [Using Document Loader](docs/usage.md)
15+
- [Using Git Repository Sources](docs/git-sources.md)
1516
- [Using Custom Metadata Columns](docs/metadata.md)
1617
- [Updating a Document](docs/updating.md)
1718
- [Managing Authentication](docs/authentication.md)
@@ -35,9 +36,10 @@ The pgEdge Document Loader automatically converts documents (HTML, Markdown, reS
3536
**Features**
3637

3738
- **Multiple Format Support**: HTML, Markdown, reStructuredText, and DocBook SGML/XML
39+
- **Git Repository Support**: Clone and process docs directly from Git repositories
3840
- **Automatic Conversion**: All formats converted to Markdown
3941
- **Metadata Extraction**: Titles, filenames, timestamps
40-
- **Flexible Input**: Single file, directory, or glob patterns (including `**` recursive matching)
42+
- **Flexible Input**: Single file, directory, glob patterns (including `**` recursive matching), or Git repository URL
4143
- **Database Flexibility**: Configurable column mappings
4244
- **Custom Metadata Columns**: Add fixed values to custom columns for every row
4345
- **Update Mode**: Update existing rows or insert new ones

cmd/pgedge-docloader/main.go

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919
"github.com/pgedge/pgedge-docloader/internal/config"
2020
"github.com/pgedge/pgedge-docloader/internal/converter"
2121
"github.com/pgedge/pgedge-docloader/internal/database"
22+
"github.com/pgedge/pgedge-docloader/internal/gitsource"
2223
"github.com/pgedge/pgedge-docloader/internal/processor"
2324
"github.com/pgedge/pgedge-docloader/internal/types"
2425
)
@@ -51,10 +52,19 @@ func init() {
5152
// Configuration file
5253
rootCmd.Flags().StringP("config", "c", "", "Path to configuration file")
5354

54-
// Source configuration
55+
// Source configuration - Local
5556
rootCmd.Flags().StringP("source", "s", "", "Source file, directory, or glob pattern")
5657
rootCmd.Flags().Bool("strip-path", false, "Strip path from filename, keeping only the base name")
5758

59+
// Source configuration - Git (mutually exclusive with --source)
60+
rootCmd.Flags().String("git-url", "", "Git repository URL to clone and process")
61+
rootCmd.Flags().String("git-branch", "", "Git branch to checkout (default: repository default)")
62+
rootCmd.Flags().String("git-tag", "", "Git tag to checkout (mutually exclusive with --git-branch)")
63+
rootCmd.Flags().String("git-doc-path", "", "Path within repository to process (supports glob patterns)")
64+
rootCmd.Flags().String("git-clone-dir", "", "Directory to store cloned repositories (default: temp directory)")
65+
rootCmd.Flags().Bool("git-keep-clone", false, "Keep cloned repository after processing")
66+
rootCmd.Flags().Bool("git-skip-fetch", false, "Skip git fetch if repository already exists")
67+
5868
// Database connection
5969
rootCmd.Flags().String("db-host", "localhost", "Database host")
6070
rootCmd.Flags().Int("db-port", 5432, "Database port")
@@ -113,9 +123,30 @@ func run(cmd *cobra.Command, args []string) error {
113123
return fmt.Errorf("failed to load configuration: %w", err)
114124
}
115125

126+
// Determine source path
127+
var sourcePath string
128+
var gitSource *gitsource.GitSource
129+
130+
if cfg.GitURL != "" {
131+
// Git source
132+
gitSource, err = gitsource.New(cfg)
133+
if err != nil {
134+
return fmt.Errorf("failed to setup git source: %w", err)
135+
}
136+
defer func() {
137+
if cleanupErr := gitSource.Cleanup(); cleanupErr != nil {
138+
fmt.Fprintf(os.Stderr, "Warning: cleanup failed: %v\n", cleanupErr)
139+
}
140+
}()
141+
sourcePath = gitSource.GetSourcePath()
142+
} else {
143+
// Local source
144+
sourcePath = cfg.Source
145+
}
146+
116147
// Process files
117-
fmt.Printf("Processing files from: %s\n", cfg.Source)
118-
documents, stats, err := processor.ProcessFiles(cfg.Source, cfg.StripPath)
148+
fmt.Printf("Processing files from: %s\n", sourcePath)
149+
documents, stats, err := processor.ProcessFiles(sourcePath, cfg.StripPath)
119150
if err != nil {
120151
return fmt.Errorf("failed to process files: %w", err)
121152
}

docs/changelog.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,26 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to
77
[Semantic Versioning](https://semver.org/spec/v2.0.0.html).
88

9+
## [Unreleased]
10+
11+
### Added
12+
13+
- **Git repository source support**: Clone and process documentation directly
14+
from Git repositories as an alternative to local files
15+
16+
- `--git-url` option to specify repository URL (mutually exclusive with
17+
`--source`)
18+
- `--git-branch` option to check out a specific branch
19+
- `--git-tag` option to check out a specific tag (mutually exclusive with
20+
`--git-branch`)
21+
- `--git-doc-path` option to specify path within repository (supports glob
22+
patterns)
23+
- `--git-clone-dir` option for persistent clone directory
24+
- `--git-keep-clone` option to preserve cloned repository after processing
25+
- `--git-skip-fetch` option to skip fetch for existing clones
26+
- Automatic cleanup of temporary clone directories
27+
- Support for both HTTPS and SSH repository URLs
28+
929
## [1.0.0-beta1] - 2025-12-15
1030

1131
### Changed

docs/git-sources.md

Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
# Using Git Repository Sources
2+
3+
As an alternative to local files, pgEdge Document Loader can clone and process
4+
documentation directly from Git repositories. This is useful for:
5+
6+
- Loading documentation from remote repositories without manual cloning
7+
- Processing specific branches or tags (e.g., versioned documentation)
8+
- Automated pipelines that fetch and load docs from source control
9+
10+
## Git Source Options
11+
12+
| Option | Required | Description |
13+
|------------------|----------|--------------------------------------------------|
14+
| `--git-url` | Yes* | Git repository URL to clone |
15+
| `--git-branch` | No | Branch to check out (default: repository default) |
16+
| `--git-tag` | No | Tag to check out (mutually exclusive with branch) |
17+
| `--git-doc-path` | No | Path within repository to process |
18+
| `--git-clone-dir` | No | Directory to store cloned repositories |
19+
| `--git-keep-clone` | No | Keep cloned repository after processing |
20+
| `--git-skip-fetch` | No | Skip fetch if repository already exists |
21+
22+
*Either `--source` or `--git-url` is required, but not both.
23+
24+
## Basic Usage
25+
26+
Clone a repository and process all supported files from the root:
27+
28+
```bash
29+
pgedge-docloader \
30+
--git-url https://github.com/org/docs-repo.git \
31+
--db-host localhost \
32+
--db-name mydb \
33+
--db-user myuser \
34+
--db-table documents \
35+
--col-doc-content content \
36+
--col-file-name filename
37+
```
38+
39+
## Processing a Specific Directory
40+
41+
Use `--git-doc-path` to process files from a specific directory within the
42+
repository:
43+
44+
```bash
45+
pgedge-docloader \
46+
--git-url https://github.com/org/project.git \
47+
--git-doc-path docs/api \
48+
--db-host localhost \
49+
--db-name mydb \
50+
--db-user myuser \
51+
--db-table documents \
52+
--col-doc-content content
53+
```
54+
55+
The `--git-doc-path` option supports glob patterns:
56+
57+
```bash
58+
# Process only markdown files in the docs directory
59+
pgedge-docloader \
60+
--git-url https://github.com/org/project.git \
61+
--git-doc-path "docs/**/*.md" \
62+
--config config.yml
63+
```
64+
65+
## Working with Branches and Tags
66+
67+
### Check Out a Specific Branch
68+
69+
```bash
70+
pgedge-docloader \
71+
--git-url https://github.com/org/docs.git \
72+
--git-branch main \
73+
--git-doc-path docs \
74+
--config config.yml
75+
```
76+
77+
### Check Out a Specific Tag
78+
79+
Use tags for versioned documentation:
80+
81+
```bash
82+
pgedge-docloader \
83+
--git-url https://github.com/org/project.git \
84+
--git-tag v2.0.0 \
85+
--git-doc-path docs \
86+
--set-column version="2.0.0" \
87+
--config config.yml
88+
```
89+
90+
!!! note
91+
92+
`--git-branch` and `--git-tag` are mutually exclusive. You cannot specify
93+
both options at the same time.
94+
95+
## Persistent Clone Directory
96+
97+
By default, repositories are cloned to a temporary directory and removed after
98+
processing. For repeated runs, you can specify a persistent clone directory:
99+
100+
```bash
101+
pgedge-docloader \
102+
--git-url https://github.com/org/docs.git \
103+
--git-clone-dir /var/cache/docloader/repos \
104+
--git-keep-clone \
105+
--config config.yml
106+
```
107+
108+
On subsequent runs with `--git-skip-fetch`, the tool will reuse the existing
109+
clone without fetching updates:
110+
111+
```bash
112+
pgedge-docloader \
113+
--git-url https://github.com/org/docs.git \
114+
--git-clone-dir /var/cache/docloader/repos \
115+
--git-keep-clone \
116+
--git-skip-fetch \
117+
--config config.yml
118+
```
119+
120+
## Configuration File Example
121+
122+
Git source options can also be specified in a configuration file:
123+
124+
```yaml
125+
# Git source configuration
126+
git-url: https://github.com/org/docs-repo.git
127+
git-branch: main
128+
git-doc-path: docs
129+
git-clone-dir: /var/cache/docloader/repos
130+
git-keep-clone: true
131+
132+
# Database configuration
133+
db-host: localhost
134+
db-name: mydb
135+
db-user: myuser
136+
db-table: documents
137+
138+
# Column mappings
139+
col-doc-content: content
140+
col-file-name: filename
141+
col-doc-title: title
142+
143+
# Custom metadata
144+
custom-columns:
145+
source: "git-repo"
146+
project: "my-project"
147+
```
148+
149+
Then run with:
150+
151+
```bash
152+
pgedge-docloader --config config.yml
153+
```
154+
155+
## Authentication
156+
157+
### HTTPS URLs
158+
159+
For public repositories, use the HTTPS URL directly:
160+
161+
```bash
162+
--git-url https://github.com/org/public-repo.git
163+
```
164+
165+
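For private repositories, you can use a personal access token in the URL. Be aware that a token passed on the command line may be exposed through shell history and process listings, so avoid this form on shared systems: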
166+
167+
```bash
168+
--git-url https://TOKEN@github.com/org/private-repo.git
169+
```
170+
171+
Or configure Git credential helpers before running the tool.
172+
173+
### SSH URLs
174+
175+
For SSH authentication, ensure your SSH keys are configured:
176+
177+
```bash
178+
--git-url git@github.com:org/repo.git
179+
```
180+
181+
## Error Handling
182+
183+
The tool will fail with a clear error message if:
184+
185+
- Git is not installed on the system
186+
- The repository URL is invalid or inaccessible
187+
- The specified branch or tag does not exist
188+
- The path given by `--git-doc-path` does not exist in the repository
189+
190+
## Best Practices
191+
192+
1. **Use tags for versioned docs**: When loading documentation for specific
193+
software versions, use `--git-tag` to ensure consistency.
194+
195+
2. **Cache clones for repeated runs**: Use `--git-clone-dir` and
196+
`--git-keep-clone` to avoid re-cloning on every run.
197+
198+
3. **Use `--git-skip-fetch` carefully**: Only skip fetching when you're sure
199+
the local clone is up-to-date.
200+
201+
4. **Set version metadata**: Use `--set-column` to add version information
202+
when processing tagged releases:
203+
204+
```bash
205+
--git-tag v1.2.3 --set-column version="1.2.3"
206+
```

internal/config/config.go

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,19 @@ func Load(cmd *cobra.Command) (*types.Config, error) {
4949
}
5050

5151
// Load configuration values (CLI flags override config file)
52+
// Local source configuration
5253
cfg.Source = viper.GetString("source")
5354
cfg.StripPath = viper.GetBool("strip-path")
5455

56+
// Git source configuration
57+
cfg.GitURL = viper.GetString("git-url")
58+
cfg.GitBranch = viper.GetString("git-branch")
59+
cfg.GitTag = viper.GetString("git-tag")
60+
cfg.GitDocPath = viper.GetString("git-doc-path")
61+
cfg.GitCloneDir = viper.GetString("git-clone-dir")
62+
cfg.GitKeepClone = viper.GetBool("git-keep-clone")
63+
cfg.GitSkipFetch = viper.GetBool("git-skip-fetch")
64+
5565
cfg.DBHost = viper.GetString("db-host")
5666
cfg.DBPort = viper.GetInt("db-port")
5767
cfg.DBName = viper.GetString("db-name")
@@ -104,6 +114,7 @@ func Load(cmd *cobra.Command) (*types.Config, error) {
104114
if cfg.ConfigFile != "" {
105115
configDir := filepath.Dir(cfg.ConfigFile)
106116
cfg.Source = resolvePath(cfg.Source, configDir)
117+
cfg.GitCloneDir = resolvePath(cfg.GitCloneDir, configDir)
107118
cfg.DBSSLCert = resolvePath(cfg.DBSSLCert, configDir)
108119
cfg.DBSSLKey = resolvePath(cfg.DBSSLKey, configDir)
109120
cfg.DBSSLRoot = resolvePath(cfg.DBSSLRoot, configDir)
@@ -198,8 +209,19 @@ func readPgPass() (string, error) {
198209

199210
// validate validates the configuration
200211
func validate(cfg *types.Config) error {
201-
if cfg.Source == "" {
202-
return fmt.Errorf("source path is required")
212+
// Source validation: either local source or git-url, but not both
213+
if cfg.Source == "" && cfg.GitURL == "" {
214+
return fmt.Errorf("either --source or --git-url is required")
215+
}
216+
if cfg.Source != "" && cfg.GitURL != "" {
217+
return fmt.Errorf("--source and --git-url are mutually exclusive")
218+
}
219+
220+
// Git-specific validation
221+
if cfg.GitURL != "" {
222+
if cfg.GitBranch != "" && cfg.GitTag != "" {
223+
return fmt.Errorf("--git-branch and --git-tag are mutually exclusive")
224+
}
203225
}
204226

205227
if cfg.DBHost == "" {

0 commit comments

Comments
 (0)