From 8dc4f7046c7efa976df327cbbcd1a10b07a75c32 Mon Sep 17 00:00:00 2001 From: Anuj Singh Date: Thu, 25 Jun 2026 12:57:15 +0530 Subject: [PATCH 1/8] fix: support space-separated content-types and improve help text --- apps/cli/main.go | 11 +- scan-results-quotes.json | 498 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 505 insertions(+), 4 deletions(-) create mode 100644 scan-results-quotes.json diff --git a/apps/cli/main.go b/apps/cli/main.go index 8302721..40389e6 100644 --- a/apps/cli/main.go +++ b/apps/cli/main.go @@ -28,7 +28,7 @@ func main() { uniqueUrls := flag.Bool("u", false, "Ensure unique URLs") concurrency := flag.Int("concurrency", 0, "Maximum concurrent requests; 0 uses available CPU capacity") hostConcurrency := flag.Int("host-concurrency", 0, "Maximum concurrent requests per host; 0 uses -concurrency") - contentTypes := flag.String("content-types", "text/html", "Comma-separated MIME types to download, e.g. text/html,application/pdf,image/jpeg") + contentTypes := flag.String("content-types", "text/html", "Comma-separated MIME types to download (quote the list), e.g. \"text/html,application/pdf,image/jpeg\"") output := flag.String("output", "crawler_results", "Output filename without an extension") ignoreRobots := flag.Bool("ignore-robots", false, "Ignore robots.txt crawl restrictions") crossDomain := flag.Bool("cross-domain", false, "Follow links to hosts other than the starting URL") @@ -126,9 +126,12 @@ func validateStartURL(rawURL string) (string, error) { func parseContentTypes(value string) []string { var contentTypes []string - for _, contentType := range strings.Split(value, ",") { - if contentType = strings.TrimSpace(contentType); contentType != "" { - contentTypes = append(contentTypes, contentType) + // Support both comma-separated and space-separated values + for _, part := range strings.FieldsFunc(value, func(r rune) bool { + return r == ',' || r == ' ' + }) { + if part = strings.TrimSpace(part); part != "" { + contentTypes = append(contentTypes, part) } } diff --git a/scan-results-quotes.json b/scan-results-quotes.json new file mode 100644 index 0000000..0c7c838 --- /dev/null +++ b/scan-results-quotes.json @@ -0,0 +1,498 @@ +{ + "start_url": "https://quotes.toscrape.com/", + "output_file": "scan-results-quotes.json", + "started_at": "2026-06-25T12:32:47.768829989+05:30", + "finished_at": "2026-06-25T12:32:54.419092497+05:30", + "duration_ms": 6650, + "summary": { + "total": 52, + "passed": 47, + "failed": 0, + "skipped": 5, + "discovered": 0, + "skipped_by_robots": 0, + "skipped_by_domain": 2, + "skipped_by_duplicate": 1, + "skipped_by_content_type": 2, + "skipped_by_depth": 0, + "skipped_by_other": 0, + "retried_requests": 0, + "max_depth": 1, + "urls_by_status_code": { + "200": 49 + }, + "skipped_by_reason": { + "content type not allowed": 2, + "duplicate": 1, + "outside domain scope": 2 + } + }, + "urls": [ + { + "url": "https://quotes.toscrape.com/", + "source": "href", + "depth": 0, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/miracle/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/author/J-K-Rowling", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/classic/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/love/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/life/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/friendship/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/misattributed-eleanor-roosevelt/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/failure/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/page/2/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/author/Eleanor-Roosevelt", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/world/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/humor/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/inspirational/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/humor/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/simile/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/value/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/author/Albert-Einstein", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/change/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/success/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/obvious/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/author/Andre-Gide", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/choices/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/live/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/inspirational/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/life/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/books/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/simile/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/love/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/aliteracy/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/books/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/thinking/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/edison/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/friends/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/author/Thomas-A-Edison", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/paraphrased/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/login", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/author/Steve-Martin", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/author/Marilyn-Monroe", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/miracles/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/deep-thoughts/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/truth/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/author/Jane-Austen", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/abilities/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/be-yourself/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/reading/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/adulthood/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + } + ], + "skipped": [ + { + "url": "https://www.zyte.com", + "source": "href", + "depth": 1, + "result": "skipped", + "skipped_reason": "outside domain scope" + }, + { + "url": "https://www.goodreads.com/quotes", + "source": "href", + "depth": 1, + "result": "skipped", + "skipped_reason": "outside domain scope" + }, + { + "url": "https://quotes.toscrape.com/", + "source": "href", + "depth": 1, + "result": "skipped", + "skipped_reason": "duplicate" + }, + { + "url": "https://quotes.toscrape.com/static/bootstrap.min.css", + "source": "link", + "depth": 1, + "status_code": 200, + "content_type": "text/css; charset=utf-8", + "result": "skipped", + "skipped_reason": "content type not allowed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/static/main.css", + "source": "link", + "depth": 1, + "status_code": 200, + "content_type": "text/css; charset=utf-8", + "result": "skipped", + "skipped_reason": "content type not allowed", + "attempts": 1 + } + ] +} \ No newline at end of file From 35e92e483dbf714370151cb40157c9ca81a60b78 Mon Sep 17 00:00:00 2001 From: Anuj Singh Date: Thu, 25 Jun 2026 13:20:47 +0530 Subject: [PATCH 2/8] docs: use go run ./apps/cli for CI-compatible commands - Replace all ./deepscanbot references with go run ./apps/cli - Update .gitignore to exclude all scan-results-* files - Docs build verified successful --- .gitignore | 3 +- CONTRIBUTING.md | 2 +- apps/docs/docs/features.mdx | 30 +++++----- apps/docs/docs/installation.mdx | 10 ++-- apps/docs/docs/usage.mdx | 30 +++++----- scan-results-quotes.json | 100 ++++++++++++++++---------------- 6 files changed, 88 insertions(+), 87 deletions(-) diff --git a/.gitignore b/.gitignore index 9fb6e3f..7e42ad5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ -scan-results-goodreads.json +scan-results-*.json +scan-results-*.txt deepscanbot solved.txt diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index cb6a41b..eb1e8f1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -69,7 +69,7 @@ By participating in this project, you agree to abide by the [CODE_OF_CONDUCT.md] 5. Verify the installation: ```bash - ./deepscanbot -h + go run ./apps/cli -h ``` ### Project Structure diff --git a/apps/docs/docs/features.mdx b/apps/docs/docs/features.mdx index 25fe55a..de0d212 100644 --- a/apps/docs/docs/features.mdx +++ b/apps/docs/docs/features.mdx @@ -13,7 +13,7 @@ Multi-threaded architecture with configurable concurrency limits for optimal per ```bash # Set concurrency to 10 -./deepscanbot -url https://example.com -concurrency 10 +go run ./apps/cli -url https://example.com -concurrency 10 ``` ### Configurable Crawl Depth @@ -21,10 +21,10 @@ Control how deep the crawler explores linked pages. ```bash # Single page only -./deepscanbot -url https://example.com -depth 0 +go run ./apps/cli -url https://example.com -depth 0 # Full site (3 levels deep) -./deepscanbot -url https://example.com -depth 3 +go run ./apps/cli -url https://example.com -depth 3 ``` ### Robots.txt Compliance @@ -32,7 +32,7 @@ Respects robots.txt rules with an option to ignore them. ```bash # Ignore robots.txt -./deepscanbot -url https://example.com -ignore-robots +go run ./apps/cli -url https://example.com -ignore-robots ``` ### Retry Logic @@ -40,7 +40,7 @@ Built-in retry mechanism with exponential backoff for transient failures. ```bash # Retry up to 3 times with 2 second backoff -./deepscanbot -url https://example.com -retries 3 -retry-backoff 2s +go run ./apps/cli -url https://example.com -retries 3 -retry-backoff 2s ``` ## Advanced Features @@ -50,24 +50,24 @@ Filter crawled pages by MIME types and size limits. ```bash # Only HTML and PDF files -./deepscanbot -url https://example.com -content-types "text/html,application/pdf" +go run ./apps/cli -url https://example.com -content-types "text/html,application/pdf" # Limit page size to 1MB -./deepscanbot -url https://example.com -size 1024 +go run ./apps/cli -url https://example.com -size 1024 ``` ### Proxy Support Route traffic through HTTP/HTTPS proxies. ```bash -./deepscanbot -url https://example.com -proxy http://127.0.0.1:8080 +go run ./apps/cli -url https://example.com -proxy http://127.0.0.1:8080 ``` ### Sitemap Discovery Automatically discover and crawl URLs from sitemap.xml. ```bash -./deepscanbot -url https://example.com -sitemap +go run ./apps/cli -url https://example.com -sitemap ``` ### Resume Mode @@ -75,17 +75,17 @@ Resume interrupted crawls without recrawling already visited URLs. ```bash # First crawl -./deepscanbot -url https://example.com -output my-crawl +go run ./apps/cli -url https://example.com -output my-crawl # Resume -./deepscanbot -url https://example.com -resume -output my-crawl +go run ./apps/cli -url https://example.com -resume -output my-crawl ``` ### Cross-Domain Crawling Optionally follow links to external domains. ```bash -./deepscanbot -url https://example.com -cross-domain +go run ./apps/cli -url https://example.com -cross-domain ``` ### Multiple Output Formats @@ -93,10 +93,10 @@ Export results in JSON or plain text format. ```bash # Text output (default) -./deepscanbot -url https://example.com -output results +go run ./apps/cli -url https://example.com -output results # JSON output -./deepscanbot -url https://example.com -json -output results +go run ./apps/cli -url https://example.com -json -output results ``` ### Politely Crawl @@ -104,7 +104,7 @@ Configure delay between requests to avoid overwhelming servers. ```bash # Wait 1 second between requests -./deepscanbot -url https://example.com -delay 1s +go run ./apps/cli -url https://example.com -delay 1s ``` ## All CLI Flags diff --git a/apps/docs/docs/installation.mdx b/apps/docs/docs/installation.mdx index c139c84..1327b66 100644 --- a/apps/docs/docs/installation.mdx +++ b/apps/docs/docs/installation.mdx @@ -47,8 +47,8 @@ lefthook install ## Verify Installation ```bash -# Test the CLI -./deepscanbot -h +# Test the CLI (using go run - no build required) +go run ./apps/cli -h # Run tests go test ./... @@ -61,13 +61,13 @@ golangci-lint run ./... ```bash # Crawl a website -./deepscanbot -url https://example.com -depth 2 +go run ./apps/cli -url https://example.com -depth 2 # Output as JSON -./deepscanbot -url https://example.com -depth 2 -json +go run ./apps/cli -url https://example.com -depth 2 -json # With specific content types -./deepscanbot -url https://example.com -depth 2 -content-types "text/html,application/pdf" +go run ./apps/cli -url https://example.com -depth 2 -content-types "text/html,application/pdf" ``` ## Next Steps diff --git a/apps/docs/docs/usage.mdx b/apps/docs/docs/usage.mdx index 55f58c8..72c868b 100644 --- a/apps/docs/docs/usage.mdx +++ b/apps/docs/docs/usage.mdx @@ -11,7 +11,7 @@ Learn how to use DeepScanBot effectively for your web crawling needs. The simplest way to crawl a website: ```bash -./deepscanbot -url https://example.com +go run ./apps/cli -url https://example.com ``` This will: @@ -26,7 +26,7 @@ This will: Crawl only the specified URL without following links: ```bash -./deepscanbot -url https://example.com -depth 0 +go run ./apps/cli -url https://example.com -depth 0 ``` ### Full Site Crawl @@ -34,7 +34,7 @@ Crawl only the specified URL without following links: Crawl an entire site with greater depth: ```bash -./deepscanbot -url https://example.com -depth 5 -concurrency 20 +go run ./apps/cli -url https://example.com -depth 5 -concurrency 20 ``` ### JSON Output @@ -42,7 +42,7 @@ Crawl an entire site with greater depth: Export results in JSON format for programmatic processing: ```bash -./deepscanbot -url https://example.com -json -output results +go run ./apps/cli -url https://example.com -json -output results ``` ### Crawl with Proxy @@ -50,7 +50,7 @@ Export results in JSON format for programmatic processing: Route traffic through a proxy server: ```bash -./deepscanbot -url https://example.com -proxy http://127.0.0.1:8080 +go run ./apps/cli -url https://example.com -proxy http://127.0.0.1:8080 ``` ### Content-Type Filtering @@ -59,13 +59,13 @@ Only download specific file types: ```bash # Only HTML pages -./deepscanbot -url https://example.com -content-types "text/html" +go run ./apps/cli -url https://example.com -content-types "text/html" # HTML and PDF documents -./deepscanbot -url https://example.com -content-types "text/html,application/pdf" +go run ./apps/cli -url https://example.com -content-types "text/html,application/pdf" # All content types -./deepscanbot -url https://example.com -content-types "*/*" +go run ./apps/cli -url https://example.com -content-types "*/*" ``` ### Unique URLs Only @@ -73,7 +73,7 @@ Only download specific file types: Avoid processing duplicate URLs: ```bash -./deepscanbot -url https://example.com -u +go run ./apps/cli -url https://example.com -u ``` ### Resume a Crawl @@ -82,10 +82,10 @@ Stop and resume a crawl without recrawling: ```bash # Initial crawl -./deepscanbot -url https://example.com -depth 3 -output my-crawl +go run ./apps/cli -url https://example.com -depth 3 -output my-crawl # Resume (skips already crawled URLs) -./deepscanbot -url https://example.com -depth 3 -resume -output my-crawl +go run ./apps/cli -url https://example.com -depth 3 -resume -output my-crawl ``` ### Politely Crawl @@ -93,7 +93,7 @@ Stop and resume a crawl without recrawling: Add delays to avoid overwhelming servers: ```bash -./deepscanbot -url https://example.com -delay 500ms -host-concurrency 2 +go run ./apps/cli -url https://example.com -delay 500ms -host-concurrency 2 ``` ## Output Formats @@ -154,16 +154,16 @@ https://example.com/admin [result=skipped] [skipped=disallowed by robots.txt] ### Crawl Documentation Site ```bash -./deepscanbot -url https://docs.example.com -depth 3 -sitemap -json -output docs-crawl +go run ./apps/cli -url https://docs.example.com -depth 3 -sitemap -json -output docs-crawl ``` ### Monitor a Website ```bash -./deepscanbot -url https://example.com -depth 1 -u -output monitor -dr +go run ./apps/cli -url https://example.com -depth 1 -u -output monitor -dr ``` ### Scrape PDF Documents ```bash -./deepscanbot -url https://example.com -depth 2 -content-types "application/pdf" -size 10240 -output pdfs \ No newline at end of file +go run ./apps/cli -url https://example.com -depth 2 -content-types "application/pdf" -size 10240 -output pdfs \ No newline at end of file diff --git a/scan-results-quotes.json b/scan-results-quotes.json index 0c7c838..943fc63 100644 --- a/scan-results-quotes.json +++ b/scan-results-quotes.json @@ -1,9 +1,9 @@ { "start_url": "https://quotes.toscrape.com/", "output_file": "scan-results-quotes.json", - "started_at": "2026-06-25T12:32:47.768829989+05:30", - "finished_at": "2026-06-25T12:32:54.419092497+05:30", - "duration_ms": 6650, + "started_at": "2026-06-25T13:10:29.169600358+05:30", + "finished_at": "2026-06-25T13:10:36.216894869+05:30", + "duration_ms": 7047, "summary": { "total": 52, "passed": 47, @@ -38,7 +38,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/miracle/page/1/", + "url": "https://quotes.toscrape.com/tag/friends/", "source": "href", "depth": 1, "status_code": 200, @@ -47,7 +47,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/author/J-K-Rowling", + "url": "https://quotes.toscrape.com/tag/success/page/1/", "source": "href", "depth": 1, "status_code": 200, @@ -56,7 +56,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/classic/page/1/", + "url": "https://quotes.toscrape.com/tag/love/page/1/", "source": "href", "depth": 1, "status_code": 200, @@ -65,7 +65,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/love/page/1/", + "url": "https://quotes.toscrape.com/tag/miracles/page/1/", "source": "href", "depth": 1, "status_code": 200, @@ -74,7 +74,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/life/", + "url": "https://quotes.toscrape.com/tag/reading/", "source": "href", "depth": 1, "status_code": 200, @@ -83,7 +83,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/friendship/", + "url": "https://quotes.toscrape.com/tag/inspirational/", "source": "href", "depth": 1, "status_code": 200, @@ -92,7 +92,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/misattributed-eleanor-roosevelt/page/1/", + "url": "https://quotes.toscrape.com/tag/obvious/page/1/", "source": "href", "depth": 1, "status_code": 200, @@ -101,7 +101,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/failure/page/1/", + "url": "https://quotes.toscrape.com/tag/simile/", "source": "href", "depth": 1, "status_code": 200, @@ -110,7 +110,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/page/2/", + "url": "https://quotes.toscrape.com/author/Thomas-A-Edison", "source": "href", "depth": 1, "status_code": 200, @@ -119,7 +119,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/author/Eleanor-Roosevelt", + "url": "https://quotes.toscrape.com/tag/thinking/page/1/", "source": "href", "depth": 1, "status_code": 200, @@ -128,7 +128,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/world/page/1/", + "url": "https://quotes.toscrape.com/tag/classic/page/1/", "source": "href", "depth": 1, "status_code": 200, @@ -137,7 +137,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/humor/", + "url": "https://quotes.toscrape.com/tag/adulthood/page/1/", "source": "href", "depth": 1, "status_code": 200, @@ -146,7 +146,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/inspirational/", + "url": "https://quotes.toscrape.com/tag/misattributed-eleanor-roosevelt/page/1/", "source": "href", "depth": 1, "status_code": 200, @@ -155,7 +155,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/humor/page/1/", + "url": "https://quotes.toscrape.com/author/Steve-Martin", "source": "href", "depth": 1, "status_code": 200, @@ -164,7 +164,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/simile/", + "url": "https://quotes.toscrape.com/tag/paraphrased/page/1/", "source": "href", "depth": 1, "status_code": 200, @@ -173,7 +173,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/value/page/1/", + "url": "https://quotes.toscrape.com/tag/truth/", "source": "href", "depth": 1, "status_code": 200, @@ -182,7 +182,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/author/Albert-Einstein", + "url": "https://quotes.toscrape.com/author/Eleanor-Roosevelt", "source": "href", "depth": 1, "status_code": 200, @@ -191,7 +191,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/change/page/1/", + "url": "https://quotes.toscrape.com/page/2/", "source": "href", "depth": 1, "status_code": 200, @@ -200,7 +200,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/success/page/1/", + "url": "https://quotes.toscrape.com/tag/books/page/1/", "source": "href", "depth": 1, "status_code": 200, @@ -209,7 +209,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/obvious/page/1/", + "url": "https://quotes.toscrape.com/login", "source": "href", "depth": 1, "status_code": 200, @@ -218,7 +218,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/author/Andre-Gide", + "url": "https://quotes.toscrape.com/tag/miracle/page/1/", "source": "href", "depth": 1, "status_code": 200, @@ -227,7 +227,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/choices/page/1/", + "url": "https://quotes.toscrape.com/tag/value/page/1/", "source": "href", "depth": 1, "status_code": 200, @@ -236,7 +236,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/live/page/1/", + "url": "https://quotes.toscrape.com/tag/world/page/1/", "source": "href", "depth": 1, "status_code": 200, @@ -245,7 +245,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/inspirational/page/1/", + "url": "https://quotes.toscrape.com/tag/failure/page/1/", "source": "href", "depth": 1, "status_code": 200, @@ -254,7 +254,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/life/page/1/", + "url": "https://quotes.toscrape.com/author/Marilyn-Monroe", "source": "href", "depth": 1, "status_code": 200, @@ -263,7 +263,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/books/", + "url": "https://quotes.toscrape.com/tag/humor/", "source": "href", "depth": 1, "status_code": 200, @@ -272,7 +272,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/simile/page/1/", + "url": "https://quotes.toscrape.com/tag/humor/page/1/", "source": "href", "depth": 1, "status_code": 200, @@ -290,7 +290,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/aliteracy/page/1/", + "url": "https://quotes.toscrape.com/tag/friendship/", "source": "href", "depth": 1, "status_code": 200, @@ -299,7 +299,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/books/page/1/", + "url": "https://quotes.toscrape.com/tag/choices/page/1/", "source": "href", "depth": 1, "status_code": 200, @@ -308,7 +308,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/thinking/page/1/", + "url": "https://quotes.toscrape.com/tag/be-yourself/page/1/", "source": "href", "depth": 1, "status_code": 200, @@ -317,7 +317,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/edison/page/1/", + "url": "https://quotes.toscrape.com/tag/life/page/1/", "source": "href", "depth": 1, "status_code": 200, @@ -326,7 +326,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/friends/", + "url": "https://quotes.toscrape.com/author/Jane-Austen", "source": "href", "depth": 1, "status_code": 200, @@ -335,7 +335,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/author/Thomas-A-Edison", + "url": "https://quotes.toscrape.com/author/Andre-Gide", "source": "href", "depth": 1, "status_code": 200, @@ -344,7 +344,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/paraphrased/page/1/", + "url": "https://quotes.toscrape.com/tag/simile/page/1/", "source": "href", "depth": 1, "status_code": 200, @@ -353,7 +353,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/login", + "url": "https://quotes.toscrape.com/tag/life/", "source": "href", "depth": 1, "status_code": 200, @@ -362,7 +362,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/author/Steve-Martin", + "url": "https://quotes.toscrape.com/author/Albert-Einstein", "source": "href", "depth": 1, "status_code": 200, @@ -371,7 +371,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/author/Marilyn-Monroe", + "url": "https://quotes.toscrape.com/tag/abilities/page/1/", "source": "href", "depth": 1, "status_code": 200, @@ -380,7 +380,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/miracles/page/1/", + "url": "https://quotes.toscrape.com/tag/aliteracy/page/1/", "source": "href", "depth": 1, "status_code": 200, @@ -389,7 +389,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/deep-thoughts/page/1/", + "url": "https://quotes.toscrape.com/tag/books/", "source": "href", "depth": 1, "status_code": 200, @@ -398,7 +398,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/truth/", + "url": "https://quotes.toscrape.com/tag/deep-thoughts/page/1/", "source": "href", "depth": 1, "status_code": 200, @@ -407,7 +407,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/author/Jane-Austen", + "url": "https://quotes.toscrape.com/tag/change/page/1/", "source": "href", "depth": 1, "status_code": 200, @@ -416,7 +416,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/abilities/page/1/", + "url": "https://quotes.toscrape.com/tag/live/page/1/", "source": "href", "depth": 1, "status_code": 200, @@ -425,7 +425,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/be-yourself/page/1/", + "url": "https://quotes.toscrape.com/tag/edison/page/1/", "source": "href", "depth": 1, "status_code": 200, @@ -434,7 +434,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/reading/", + "url": "https://quotes.toscrape.com/author/J-K-Rowling", "source": "href", "depth": 1, "status_code": 200, @@ -443,7 +443,7 @@ "attempts": 1 }, { - "url": "https://quotes.toscrape.com/tag/adulthood/page/1/", + "url": "https://quotes.toscrape.com/tag/inspirational/page/1/", "source": "href", "depth": 1, "status_code": 200, @@ -454,14 +454,14 @@ ], "skipped": [ { - "url": "https://www.zyte.com", + "url": "https://www.goodreads.com/quotes", "source": "href", "depth": 1, "result": "skipped", "skipped_reason": "outside domain scope" }, { - "url": "https://www.goodreads.com/quotes", + "url": "https://www.zyte.com", "source": "href", "depth": 1, "result": "skipped", From e6f351bade78cd4f54a9d7a845ba9bc4ff7d41fb Mon Sep 17 00:00:00 2001 From: Anuj Singh Date: Thu, 25 Jun 2026 13:25:42 +0530 Subject: [PATCH 3/8] docs: update README with go run ./apps/cli commands --- README.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 9b8edb9..33ce156 100644 --- a/README.md +++ b/README.md @@ -35,11 +35,11 @@ cd DeepScanBot # Install dependencies go mod download -# Build the crawler -go build -o deepscanbot +# Build the crawler (optional) +go build -o deepscanbot ./apps/cli -# Or run directly -go run main.go -url [options] +# Or run directly (no build required) +go run ./apps/cli -url [options] ``` ## Usage @@ -76,7 +76,7 @@ go run main.go -url [options] #### 1. Basic Crawl ```bash -deepscanbot -url https://example.com -depth 2 +go run ./apps/cli -url https://example.com -depth 2 ``` Crawls `https://example.com` up to 2 levels deep and outputs results to `crawler_results.txt`. @@ -84,7 +84,7 @@ Crawls `https://example.com` up to 2 levels deep and outputs results to `crawler #### 2. JSON Output with Details ```bash -deepscanbot -url https://example.com -depth 3 -json -s -u -output my_results +go run ./apps/cli -url https://example.com -depth 3 -json -s -u -output my_results ``` Outputs JSON to `my_results.json` with URL source tracking and deduplication. @@ -92,7 +92,7 @@ Outputs JSON to `my_results.json` with URL source tracking and deduplication. #### 3. Crawl with Retry and Delay ```bash -deepscanbot -url https://docs.example.com -depth 2 -retries 3 -retry-backoff 2s -delay 1s -host-concurrency 1 +go run ./apps/cli -url https://docs.example.com -depth 2 -retries 3 -retry-backoff 2s -delay 1s -host-concurrency 1 ``` Retries failed requests up to 3 times with exponential backoff, waits 1 second between requests to the same host, and allows only 1 concurrent request per host. @@ -100,7 +100,7 @@ Retries failed requests up to 3 times with exponential backoff, waits 1 second b #### 4. Cross-Domain Crawl with Sitemap ```bash -deepscanbot -url https://example.com -depth 3 -cross-domain -sitemap -concurrency 10 -host-concurrency 2 -json +go run ./apps/cli -url https://example.com -depth 3 -cross-domain -sitemap -concurrency 10 -host-concurrency 2 -json ``` Discovers URLs from sitemap.xml, follows links to any domain, with 10 total workers and 2 per host. @@ -109,10 +109,10 @@ Discovers URLs from sitemap.xml, follows links to any domain, with 10 total work ```bash # First run (interrupted) -deepscanbot -url https://example.com -depth 3 -json -output my_results +go run ./apps/cli -url https://example.com -depth 3 -json -output my_results # Resume -deepscanbot -url https://example.com -depth 3 -json -output my_results -resume +go run ./apps/cli -url https://example.com -depth 3 -json -output my_results -resume ``` Loaded existing results from `my_results.json` and skips already-crawled URLs. @@ -120,7 +120,7 @@ Loaded existing results from `my_results.json` and skips already-crawled URLs. #### 6. Crawl Goodreads with Rate-Limit Handling ```bash -deepscanbot -url https://www.goodreads.com -depth 2 -delay 2s -retries 5 -retry-backoff 2s -concurrency 2 -host-concurrency 1 -json -output goodreads_results +go run ./apps/cli -url https://www.goodreads.com -depth 2 -delay 2s -retries 5 -retry-backoff 2s -concurrency 2 -host-concurrency 1 -json -output goodreads_results ``` Uses 2-second politeness delay, 5 retries with exponential backoff, limited concurrency to handle Goodreads rate limits gracefully. @@ -128,7 +128,7 @@ Uses 2-second politeness delay, 5 retries with exponential backoff, limited conc #### 7. Crawl PDF and Images ```bash -deepscanbot -url https://example.com -depth 2 -content-types "text/html,application/pdf,image/jpeg,image/png" -json +go run ./apps/cli -url https://example.com -depth 2 -content-types "text/html,application/pdf,image/jpeg,image/png" -json ``` Downloads HTML, PDF, JPEG, and PNG files while still parsing HTML for links. From 1f4a9a22da1968db28454ea61409f96ba7e89b76 Mon Sep 17 00:00:00 2001 From: Anuj Singh Date: Thu, 25 Jun 2026 13:40:31 +0530 Subject: [PATCH 4/8] feat: add GitHub Actions CI/CD workflows - ci.yml: Run on all pushes/PRs. Checks: formatting (gofumpt), imports (gci), lint (golangci-lint), build, tests, go mod tidy - release.yml: Trigger on v* tags. Builds binaries for linux (amd64/arm64), darwin (amd64/arm64), windows (amd64). Generates checksums and creates GitHub Release - release-docs.yml: Trigger on main branch pushes. Builds Docusaurus documentation and deploys to gh-pages branch --- .github/workflows/ci.yml | 82 ++++++++++++++++++++++++++++++ .github/workflows/release-docs.yml | 70 +++++++++++++++++++++++++ .github/workflows/release.yml | 68 +++++++++++++++++++++++++ CONTRIBUTING.md | 1 + README.md | 8 +-- 5 files changed, 225 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/release-docs.yml create mode 100644 .github/workflows/release.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..9f670d5 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,82 @@ +name: CI + +on: + push: + branches: + - "**" + pull_request: + branches: + - main + - development + workflow_dispatch: + +jobs: + build: + name: 'Build & Test' + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.22' + check-latest: true + cache: true + + - name: Install dependencies + run: go mod download + + - name: Install tools + run: | + go install mvdan.cc/gofumpt@latest + go install github.com/daixiang0/gci@latest + go install github.com/golangci/golangci-lint/cmd/golangci-lint@latest + + - name: Check formatting (gofumpt) + run: | + if [ -n "$(gofumpt -d .)" ]; then + echo "❌ Code is not formatted. Run: gofumpt -w ." + gofumpt -d . + exit 1 + fi + echo "✅ Code formatting is clean" + + - name: Check imports (gci) + run: | + if [ -n "$(gci diff -s standard -s default -s 'prefix(github.com/mindfiredigital/DeepScanBot)' .)" ]; then + echo "❌ Imports are not sorted. Run: gci write -s standard -s default -s 'prefix(github.com/mindfiredigital/DeepScanBot)' ." + gci diff -s standard -s default -s 'prefix(github.com/mindfiredigital/DeepScanBot)' . + exit 1 + fi + echo "✅ Imports are clean" + + - name: Run golangci-lint + run: | + golangci-lint run --timeout 5m ./... + echo "✅ Lint passed" + + - name: Build + run: | + go build -o deepscanbot ./apps/cli + echo "✅ Build successful" + + - name: Run tests + run: | + go test -v -count=1 ./... + echo "✅ All tests passed" + + - name: Verify tidy + run: | + go mod tidy + if [ -n "$(git diff --name-only go.mod go.sum)" ]; then + echo "❌ go.mod/go.sum are not tidy. Run: go mod tidy" + git diff go.mod go.sum + exit 1 + fi + echo "✅ go.mod is tidy" \ No newline at end of file diff --git a/.github/workflows/release-docs.yml b/.github/workflows/release-docs.yml new file mode 100644 index 0000000..6804869 --- /dev/null +++ b/.github/workflows/release-docs.yml @@ -0,0 +1,70 @@ +name: Release docs Workflow + +on: + push: + branches: + - main + workflow_dispatch: + +jobs: + build: + name: 'Build Docusaurus Documentation' + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: 24 + + - name: Install dependencies + run: | + cd apps/docs + npm ci + + - name: Build Docusaurus + run: | + cd apps/docs + npm run build + + - name: Set Git user + run: | + git config --local user.email "github-actions@github.com" + git config --local user.name "GitHub Actions" + + - name: Deploy to gh-pages branch + run: | + # Store the build artifacts + mkdir -p /tmp/docusaurus-build + cp -r apps/docs/build/* /tmp/docusaurus-build/ + + # Stash any changes to prevent checkout conflicts + git stash push --include-untracked || true + + # Switch to gh-pages branch, creating it if it doesn't exist + git checkout gh-pages || git checkout -b gh-pages + + # Remove all existing files + rm -rf * + + # Copy the build artifacts + cp -r /tmp/docusaurus-build/* . + + # Check if there are any changes before committing + if [[ -n "$(git status --porcelain)" ]]; then + git add . -f + git commit -m "chore(docs): update documentation build" --no-verify + git push origin gh-pages --force + echo "✅ Documentation deployed to gh-pages" + else + echo "No changes to commit" + fi + + # Switch back to original branch + git checkout - \ No newline at end of file diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..0b6a980 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,68 @@ +name: Release + +on: + push: + tags: + - 'v*' + workflow_dispatch: + +permissions: + contents: write + packages: write + +jobs: + goreleaser: + name: Release Binaries + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.22' + check-latest: true + cache: true + + - name: Install dependencies + run: go mod download + + - name: Run tests + run: go test -v -count=1 ./... + + - name: Build binaries + run: | + mkdir -p dist + GOOS=linux GOARCH=amd64 go build -o dist/deepscanbot-linux-amd64 ./apps/cli + GOOS=linux GOARCH=arm64 go build -o dist/deepscanbot-linux-arm64 ./apps/cli + GOOS=darwin GOARCH=amd64 go build -o dist/deepscanbot-darwin-amd64 ./apps/cli + GOOS=darwin GOARCH=arm64 go build -o dist/deepscanbot-darwin-arm64 ./apps/cli + GOOS=windows GOARCH=amd64 go build -o dist/deepscanbot-windows-amd64.exe ./apps/cli + echo "✅ Binaries built" + + - name: Generate checksums + run: | + cd dist + sha256sum * > checksums.txt + echo "✅ Checksums generated" + + - name: Create Release + uses: softprops/action-gh-release@v2 + with: + name: Release ${{ github.ref_name }} + tag_name: ${{ github.ref_name }} + body_path: CHANGELOG.md + draft: false + prerelease: false + files: | + dist/deepscanbot-linux-amd64 + dist/deepscanbot-linux-arm64 + dist/deepscanbot-darwin-amd64 + dist/deepscanbot-darwin-arm64 + dist/deepscanbot-windows-amd64.exe + dist/checksums.txt + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index eb1e8f1..b96b2de 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -193,6 +193,7 @@ Use conventional commits format: ### Commit Messages Follow the [Conventional Commits](https://www.conventionalcommits.org/) specification. Use `cocogitto` or a custom commit hook to validate commit messages: + ```bash # Using cocogitto cog commit diff --git a/README.md b/README.md index 33ce156..351c873 100644 --- a/README.md +++ b/README.md @@ -250,10 +250,10 @@ The JSON report contains a detailed summary and two URL lists: Text output shows one URL per line with optional metadata in brackets: ``` -[https://example.com [status=200] [result=passed] -[https://example.com/about [status=200] [result=passed] -[https://example.com/not-found [status=404] [result=failed] [error=bad status code: 404] -[https://external.com/page [result=skipped] [skipped=outside domain scope] +[https://example.com] [status=200] [result=passed] +[https://example.com/about] [status=200] [result=passed] +[https://example.com/not-found] [status=404] [result=failed] [error=bad status code: 404] +[https://external.com/page] [result=skipped] [skipped=outside domain scope] // With -s flag: [href] https://example.com From 4d3e5f1739673e1f5a8fde90366f09f677a1ca1e Mon Sep 17 00:00:00 2001 From: Anuj Singh Date: Thu, 25 Jun 2026 14:06:15 +0530 Subject: [PATCH 5/8] fix: replace t.Chdir with os.Chdir + t.Cleanup for Go 1.22 compatibility t.Chdir requires Go 1.24+, but CI runner uses Go 1.22. Replaced with os.Chdir + t.Cleanup pattern to fix CI pipeline. --- apps/cli/tests/crawler_test.go | 8 ++++++-- apps/cli/tests/storage_test.go | 16 ++++++++++++---- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/apps/cli/tests/crawler_test.go b/apps/cli/tests/crawler_test.go index 8a1c5aa..d77eb55 100644 --- a/apps/cli/tests/crawler_test.go +++ b/apps/cli/tests/crawler_test.go @@ -15,8 +15,12 @@ import ( ) func TestCrawlerStartReturnsResultsWithoutWritingFiles(t *testing.T) { - //nolint:govet // testing.Chdir requires Go 1.24+, using available version - t.Chdir(t.TempDir()) + origDir, _ := os.Getwd() + tmpDir := t.TempDir() + if err := os.Chdir(tmpDir); err != nil { + t.Fatalf("chdir to temp dir: %v", err) + } + t.Cleanup(func() { _ = os.Chdir(origDir) }) server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/html") diff --git a/apps/cli/tests/storage_test.go b/apps/cli/tests/storage_test.go index d61e4b1..2f0d724 100644 --- a/apps/cli/tests/storage_test.go +++ b/apps/cli/tests/storage_test.go @@ -14,8 +14,12 @@ import ( func TestTextOutputIsTruncatedForEachStorageInstance(t *testing.T) { const filename = "crawler_results.txt" - //nolint:govet // testing.Chdir requires Go 1.24+, using available version - t.Chdir(t.TempDir()) + origDir, _ := os.Getwd() + tmpDir := t.TempDir() + if err := os.Chdir(tmpDir); err != nil { + t.Fatalf("chdir to temp dir: %v", err) + } + t.Cleanup(func() { _ = os.Chdir(origDir) }) if err := os.WriteFile(filename, []byte("result from a previous crawl\n"), 0o644); err != nil { t.Fatalf("seed previous output: %v", err) @@ -41,8 +45,12 @@ func TestTextOutputIsTruncatedForEachStorageInstance(t *testing.T) { func TestTextOutputIsFlushedOnClose(t *testing.T) { const filename = "crawler_results.txt" - //nolint:govet // testing.Chdir requires Go 1.24+, using available version - t.Chdir(t.TempDir()) + origDir, _ := os.Getwd() + tmpDir := t.TempDir() + if err := os.Chdir(tmpDir); err != nil { + t.Fatalf("chdir to temp dir: %v", err) + } + t.Cleanup(func() { _ = os.Chdir(origDir) }) pageStorage := storage.NewPageStorage() pageStorage.StoreContent("https://example.com/one") From 13ca7de7028d393ef31842baa2de1fcc81fdce23 Mon Sep 17 00:00:00 2001 From: Anuj Singh Date: Thu, 25 Jun 2026 14:10:23 +0530 Subject: [PATCH 6/8] fix: add blank lines to satisfy wsl linter in test files --- apps/cli/tests/crawler_test.go | 3 +++ apps/cli/tests/storage_test.go | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/apps/cli/tests/crawler_test.go b/apps/cli/tests/crawler_test.go index d77eb55..1c5cda3 100644 --- a/apps/cli/tests/crawler_test.go +++ b/apps/cli/tests/crawler_test.go @@ -16,10 +16,13 @@ import ( func TestCrawlerStartReturnsResultsWithoutWritingFiles(t *testing.T) { origDir, _ := os.Getwd() + tmpDir := t.TempDir() + if err := os.Chdir(tmpDir); err != nil { t.Fatalf("chdir to temp dir: %v", err) } + t.Cleanup(func() { _ = os.Chdir(origDir) }) server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { diff --git a/apps/cli/tests/storage_test.go b/apps/cli/tests/storage_test.go index 2f0d724..780d461 100644 --- a/apps/cli/tests/storage_test.go +++ b/apps/cli/tests/storage_test.go @@ -15,10 +15,13 @@ func TestTextOutputIsTruncatedForEachStorageInstance(t *testing.T) { const filename = "crawler_results.txt" origDir, _ := os.Getwd() + tmpDir := t.TempDir() + if err := os.Chdir(tmpDir); err != nil { t.Fatalf("chdir to temp dir: %v", err) } + t.Cleanup(func() { _ = os.Chdir(origDir) }) if err := os.WriteFile(filename, []byte("result from a previous crawl\n"), 0o644); err != nil { @@ -46,10 +49,13 @@ func TestTextOutputIsFlushedOnClose(t *testing.T) { const filename = "crawler_results.txt" origDir, _ := os.Getwd() + tmpDir := t.TempDir() + if err := os.Chdir(tmpDir); err != nil { t.Fatalf("chdir to temp dir: %v", err) } + t.Cleanup(func() { _ = os.Chdir(origDir) }) pageStorage := storage.NewPageStorage() From 45ec3e118a8c70ebca156af83467fcf428a0f2f8 Mon Sep 17 00:00:00 2001 From: Anuj Singh Date: Thu, 25 Jun 2026 14:12:29 +0530 Subject: [PATCH 7/8] ci: restrict CI to main/development branches only --- .github/workflows/ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9f670d5..73457d5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -3,7 +3,8 @@ name: CI on: push: branches: - - "**" + - main + - development pull_request: branches: - main From 3be3cca73d1415e856a6e5d1129602d7b6852e4d Mon Sep 17 00:00:00 2001 From: Anuj Singh Date: Thu, 25 Jun 2026 19:01:45 +0530 Subject: [PATCH 8/8] feat(cli): changed the branch actions --- .github/workflows/ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 73457d5..c1fa7a9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,7 +13,7 @@ on: jobs: build: - name: 'Build & Test' + name: "Build & Test" runs-on: ubuntu-latest permissions: contents: read @@ -26,7 +26,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v5 with: - go-version: '1.22' + go-version: "1.22" check-latest: true cache: true @@ -80,4 +80,4 @@ jobs: git diff go.mod go.sum exit 1 fi - echo "✅ go.mod is tidy" \ No newline at end of file + echo "✅ go.mod is tidy"