Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 83 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# CI workflow: checks formatting, import ordering, lint, build, tests and
# module tidiness for pushes and pull requests targeting main/development.
name: CI

on:
  push:
    branches: [main, development]
  pull_request:
    branches: [main, development]
  # Allows the workflow to be started manually.
  workflow_dispatch:

jobs:
  build:
    name: 'Build & Test'
    runs-on: ubuntu-latest
    # The job only reads repository contents.
    permissions:
      contents: read
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          # Fetch the full history rather than a shallow clone.
          fetch-depth: 0

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          # Quoted so YAML keeps the version as a string.
          go-version: '1.22'
          check-latest: true
          cache: true

      - name: Install dependencies
        run: go mod download

      # Formatter, import sorter and linter used by the steps below.
      - name: Install tools
        run: |
          go install mvdan.cc/gofumpt@latest
          go install github.com/daixiang0/gci@latest
          go install github.com/golangci/golangci-lint/cmd/golangci-lint@latest

      # Fails when gofumpt reports any diff, then prints that diff.
      - name: Check formatting (gofumpt)
        run: |
          if [ -n "$(gofumpt -d .)" ]; then
            echo "❌ Code is not formatted. Run: gofumpt -w ."
            gofumpt -d .
            exit 1
          fi
          echo "✅ Code formatting is clean"

      # Fails when gci reports any import-ordering diff
      # (sections: standard, default, project prefix).
      - name: Check imports (gci)
        run: |
          if [ -n "$(gci diff -s standard -s default -s 'prefix(github.com/mindfiredigital/DeepScanBot)' .)" ]; then
            echo "❌ Imports are not sorted. Run: gci write -s standard -s default -s 'prefix(github.com/mindfiredigital/DeepScanBot)' ."
            gci diff -s standard -s default -s 'prefix(github.com/mindfiredigital/DeepScanBot)' .
            exit 1
          fi
          echo "✅ Imports are clean"

      - name: Run golangci-lint
        run: |
          golangci-lint run --timeout 5m ./...
          echo "✅ Lint passed"

      - name: Build
        run: |
          go build -o deepscanbot ./apps/cli
          echo "✅ Build successful"

      # -count=1 disables the test result cache.
      - name: Run tests
        run: |
          go test -v -count=1 ./...
          echo "✅ All tests passed"

      # Re-runs `go mod tidy` and fails if it changes go.mod or go.sum.
      - name: Verify tidy
        run: |
          go mod tidy
          if [ -n "$(git diff --name-only go.mod go.sum)" ]; then
            echo "❌ go.mod/go.sum are not tidy. Run: go mod tidy"
            git diff go.mod go.sum
            exit 1
          fi
          echo "✅ go.mod is tidy"
70 changes: 70 additions & 0 deletions .github/workflows/release-docs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Builds the Docusaurus site in apps/docs and publishes the build output
# to the gh-pages branch on every push to main (or on manual dispatch).
name: Release docs Workflow

on:
  push:
    branches:
      - main
  workflow_dispatch:

jobs:
  build:
    name: 'Build Docusaurus Documentation'
    runs-on: ubuntu-latest
    # Write access is needed to push the gh-pages branch.
    permissions:
      contents: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          # Full history/refs so an existing gh-pages branch can be checked out.
          fetch-depth: 0

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: 24

      - name: Install dependencies
        run: |
          cd apps/docs
          npm ci

      - name: Build Docusaurus
        run: |
          cd apps/docs
          npm run build

      - name: Set Git user
        run: |
          git config --local user.email "github-actions@github.com"
          git config --local user.name "GitHub Actions"

      - name: Deploy to gh-pages branch
        run: |
          # Store the build artifacts. Copying "build/." instead of "build/*"
          # also picks up hidden entries (e.g. .nojekyll), which a "*" glob
          # silently skips.
          mkdir -p /tmp/docusaurus-build
          cp -r apps/docs/build/. /tmp/docusaurus-build/

          # Stash any changes to prevent checkout conflicts
          git stash push --include-untracked || true

          # Switch to gh-pages branch, creating it if it doesn't exist
          git checkout gh-pages || git checkout -b gh-pages

          # Remove all existing files, including hidden ones, but keep the
          # repository metadata in .git. "rm -rf *" would leave dotfiles
          # (such as .github or .gitignore) behind on the published branch.
          find . -mindepth 1 -maxdepth 1 ! -name .git -exec rm -rf {} +

          # Copy the build artifacts (hidden entries included)
          cp -r /tmp/docusaurus-build/. .

          # Check if there are any changes before committing
          if [[ -n "$(git status --porcelain)" ]]; then
            git add . -f
            git commit -m "chore(docs): update documentation build" --no-verify
            git push origin gh-pages --force
            echo "✅ Documentation deployed to gh-pages"
          else
            echo "No changes to commit"
          fi

          # Switch back to original branch
          git checkout -
68 changes: 68 additions & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Release workflow: tests the module, cross-compiles the CLI for five
# OS/architecture targets, generates SHA-256 checksums and publishes a
# GitHub release for v* tags.
name: Release

on:
  push:
    tags:
      - 'v*'
  # Manual runs are allowed; see the guard on the "Create Release" step.
  workflow_dispatch:

permissions:
  contents: write
  packages: write

jobs:
  # NOTE(review): the job id says "goreleaser" but the steps below build with
  # plain `go build`; the id is kept unchanged in case anything references it.
  goreleaser:
    name: Release Binaries
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.22'
          check-latest: true
          cache: true

      - name: Install dependencies
        run: go mod download

      - name: Run tests
        run: go test -v -count=1 ./...

      # Cross-compile ./apps/cli into dist/ for each supported target.
      - name: Build binaries
        run: |
          mkdir -p dist
          GOOS=linux GOARCH=amd64 go build -o dist/deepscanbot-linux-amd64 ./apps/cli
          GOOS=linux GOARCH=arm64 go build -o dist/deepscanbot-linux-arm64 ./apps/cli
          GOOS=darwin GOARCH=amd64 go build -o dist/deepscanbot-darwin-amd64 ./apps/cli
          GOOS=darwin GOARCH=arm64 go build -o dist/deepscanbot-darwin-arm64 ./apps/cli
          GOOS=windows GOARCH=amd64 go build -o dist/deepscanbot-windows-amd64.exe ./apps/cli
          echo "✅ Binaries built"

      - name: Generate checksums
        run: |
          cd dist
          sha256sum * > checksums.txt
          echo "✅ Checksums generated"

      # Publish only when the run was started from a v* tag. On a manual
      # (workflow_dispatch) run from a branch, github.ref_name is the branch
      # name, so without this guard a release and tag named after the branch
      # would be created. Branch runs therefore stop after building and act
      # as a dry run.
      - name: Create Release
        if: startsWith(github.ref, 'refs/tags/v')
        uses: softprops/action-gh-release@v2
        with:
          name: Release ${{ github.ref_name }}
          tag_name: ${{ github.ref_name }}
          body_path: CHANGELOG.md
          draft: false
          prerelease: false
          files: |
            dist/deepscanbot-linux-amd64
            dist/deepscanbot-linux-arm64
            dist/deepscanbot-darwin-amd64
            dist/deepscanbot-darwin-arm64
            dist/deepscanbot-windows-amd64.exe
            dist/checksums.txt
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
scan-results-goodreads.json
scan-results-*.json
scan-results-*.txt
deepscanbot
solved.txt

Expand Down
3 changes: 2 additions & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ By participating in this project, you agree to abide by the [CODE_OF_CONDUCT.md]

5. Verify the installation:
```bash
./deepscanbot -h
go run ./apps/cli -h
```

### Project Structure
Expand Down Expand Up @@ -193,6 +193,7 @@ Use conventional commits format:
### Commit Messages

Follow the [Conventional Commits](https://www.conventionalcommits.org/) specification. Use `cocogitto` or a custom commit hook to validate commit messages:

```bash
# Using cocogitto
cog commit
Expand Down
32 changes: 16 additions & 16 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,11 @@ cd DeepScanBot
# Install dependencies
go mod download

# Build the crawler
go build -o deepscanbot
# Build the crawler (optional)
go build -o deepscanbot ./apps/cli

# Or run directly
go run main.go -url <starting_url> [options]
# Or run directly (no build required)
go run ./apps/cli -url <starting_url> [options]
```

## Usage
Expand Down Expand Up @@ -76,31 +76,31 @@ go run main.go -url <starting_url> [options]
#### 1. Basic Crawl

```bash
deepscanbot -url https://example.com -depth 2
go run ./apps/cli -url https://example.com -depth 2
```

Crawls `https://example.com` up to 2 levels deep and outputs results to `crawler_results.txt`.

#### 2. JSON Output with Details

```bash
deepscanbot -url https://example.com -depth 3 -json -s -u -output my_results
go run ./apps/cli -url https://example.com -depth 3 -json -s -u -output my_results
```

Outputs JSON to `my_results.json` with URL source tracking and deduplication.

#### 3. Crawl with Retry and Delay

```bash
deepscanbot -url https://docs.example.com -depth 2 -retries 3 -retry-backoff 2s -delay 1s -host-concurrency 1
go run ./apps/cli -url https://docs.example.com -depth 2 -retries 3 -retry-backoff 2s -delay 1s -host-concurrency 1
```

Retries failed requests up to 3 times with exponential backoff, waits 1 second between requests to the same host, and allows only 1 concurrent request per host.

#### 4. Cross-Domain Crawl with Sitemap

```bash
deepscanbot -url https://example.com -depth 3 -cross-domain -sitemap -concurrency 10 -host-concurrency 2 -json
go run ./apps/cli -url https://example.com -depth 3 -cross-domain -sitemap -concurrency 10 -host-concurrency 2 -json
```

Discovers URLs from sitemap.xml, follows links to any domain, with 10 total workers and 2 per host.
Expand All @@ -109,26 +109,26 @@ Discovers URLs from sitemap.xml, follows links to any domain, with 10 total work

```bash
# First run (interrupted)
deepscanbot -url https://example.com -depth 3 -json -output my_results
go run ./apps/cli -url https://example.com -depth 3 -json -output my_results

# Resume
deepscanbot -url https://example.com -depth 3 -json -output my_results -resume
go run ./apps/cli -url https://example.com -depth 3 -json -output my_results -resume
```

Loads existing results from `my_results.json` and skips already-crawled URLs.

#### 6. Crawl Goodreads with Rate-Limit Handling

```bash
deepscanbot -url https://www.goodreads.com -depth 2 -delay 2s -retries 5 -retry-backoff 2s -concurrency 2 -host-concurrency 1 -json -output goodreads_results
go run ./apps/cli -url https://www.goodreads.com -depth 2 -delay 2s -retries 5 -retry-backoff 2s -concurrency 2 -host-concurrency 1 -json -output goodreads_results
```

Uses a 2-second politeness delay, 5 retries with exponential backoff, and limited concurrency to handle Goodreads rate limits gracefully.

#### 7. Crawl PDF and Images

```bash
deepscanbot -url https://example.com -depth 2 -content-types "text/html,application/pdf,image/jpeg,image/png" -json
go run ./apps/cli -url https://example.com -depth 2 -content-types "text/html,application/pdf,image/jpeg,image/png" -json
```

Downloads HTML, PDF, JPEG, and PNG files while still parsing HTML for links.
Expand Down Expand Up @@ -250,10 +250,10 @@ The JSON report contains a detailed summary and two URL lists:
Text output shows one URL per line with optional metadata in brackets:

```
[https://example.com [status=200] [result=passed]
[https://example.com/about [status=200] [result=passed]
[https://example.com/not-found [status=404] [result=failed] [error=bad status code: 404]
[https://external.com/page [result=skipped] [skipped=outside domain scope]
[https://example.com] [status=200] [result=passed]
[https://example.com/about] [status=200] [result=passed]
[https://example.com/not-found] [status=404] [result=failed] [error=bad status code: 404]
[https://external.com/page] [result=skipped] [skipped=outside domain scope]

// With -s flag:
[href] https://example.com
Expand Down
11 changes: 7 additions & 4 deletions apps/cli/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ func main() {
uniqueUrls := flag.Bool("u", false, "Ensure unique URLs")
concurrency := flag.Int("concurrency", 0, "Maximum concurrent requests; 0 uses available CPU capacity")
hostConcurrency := flag.Int("host-concurrency", 0, "Maximum concurrent requests per host; 0 uses -concurrency")
contentTypes := flag.String("content-types", "text/html", "Comma-separated MIME types to download, e.g. text/html,application/pdf,image/jpeg")
contentTypes := flag.String("content-types", "text/html", "Comma-separated MIME types to download (quote the list), e.g. \"text/html,application/pdf,image/jpeg\"")
output := flag.String("output", "crawler_results", "Output filename without an extension")
ignoreRobots := flag.Bool("ignore-robots", false, "Ignore robots.txt crawl restrictions")
crossDomain := flag.Bool("cross-domain", false, "Follow links to hosts other than the starting URL")
Expand Down Expand Up @@ -126,9 +126,12 @@ func validateStartURL(rawURL string) (string, error) {
func parseContentTypes(value string) []string {
var contentTypes []string

for _, contentType := range strings.Split(value, ",") {
if contentType = strings.TrimSpace(contentType); contentType != "" {
contentTypes = append(contentTypes, contentType)
// Support both comma-separated and space-separated values
for _, part := range strings.FieldsFunc(value, func(r rune) bool {
return r == ',' || r == ' '
}) {
if part = strings.TrimSpace(part); part != "" {
contentTypes = append(contentTypes, part)
}
}

Expand Down
11 changes: 9 additions & 2 deletions apps/cli/tests/crawler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,15 @@ import (
)

func TestCrawlerStartReturnsResultsWithoutWritingFiles(t *testing.T) {
//nolint:govet // testing.Chdir requires Go 1.24+, using available version
t.Chdir(t.TempDir())
origDir, _ := os.Getwd()

tmpDir := t.TempDir()

if err := os.Chdir(tmpDir); err != nil {
t.Fatalf("chdir to temp dir: %v", err)
}

t.Cleanup(func() { _ = os.Chdir(origDir) })

server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
Expand Down
Loading
Loading