diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..c1fa7a9 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,83 @@ +name: CI + +on: + push: + branches: + - main + - development + pull_request: + branches: + - main + - development + workflow_dispatch: + +jobs: + build: + name: "Build & Test" + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: "1.22" + check-latest: true + cache: true + + - name: Install dependencies + run: go mod download + + - name: Install tools + run: | + go install mvdan.cc/gofumpt@latest + go install github.com/daixiang0/gci@latest + go install github.com/golangci/golangci-lint/cmd/golangci-lint@latest + + - name: Check formatting (gofumpt) + run: | + if [ -n "$(gofumpt -d .)" ]; then + echo "❌ Code is not formatted. Run: gofumpt -w ." + gofumpt -d . + exit 1 + fi + echo "✅ Code formatting is clean" + + - name: Check imports (gci) + run: | + if [ -n "$(gci diff -s standard -s default -s 'prefix(github.com/mindfiredigital/DeepScanBot)' .)" ]; then + echo "❌ Imports are not sorted. Run: gci write -s standard -s default -s 'prefix(github.com/mindfiredigital/DeepScanBot)' ." + gci diff -s standard -s default -s 'prefix(github.com/mindfiredigital/DeepScanBot)' . + exit 1 + fi + echo "✅ Imports are clean" + + - name: Run golangci-lint + run: | + golangci-lint run --timeout 5m ./... + echo "✅ Lint passed" + + - name: Build + run: | + go build -o deepscanbot ./apps/cli + echo "✅ Build successful" + + - name: Run tests + run: | + go test -v -count=1 ./... + echo "✅ All tests passed" + + - name: Verify tidy + run: | + go mod tidy + if [ -n "$(git diff --name-only go.mod go.sum)" ]; then + echo "❌ go.mod/go.sum are not tidy. 
Run: go mod tidy" + git diff go.mod go.sum + exit 1 + fi + echo "✅ go.mod is tidy" diff --git a/.github/workflows/release-docs.yml b/.github/workflows/release-docs.yml new file mode 100644 index 0000000..6804869 --- /dev/null +++ b/.github/workflows/release-docs.yml @@ -0,0 +1,70 @@ +name: Release docs Workflow + +on: + push: + branches: + - main + workflow_dispatch: + +jobs: + build: + name: 'Build Docusaurus Documentation' + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: 24 + + - name: Install dependencies + run: | + cd apps/docs + npm ci + + - name: Build Docusaurus + run: | + cd apps/docs + npm run build + + - name: Set Git user + run: | + git config --local user.email "github-actions@github.com" + git config --local user.name "GitHub Actions" + + - name: Deploy to gh-pages branch + run: | + # Store the build artifacts ("build/." also copies dotfiles such as .nojekyll, which "build/*" skips) + mkdir -p /tmp/docusaurus-build + cp -r apps/docs/build/. /tmp/docusaurus-build/ + + # Stash any changes to prevent checkout conflicts + git stash push --include-untracked || true + + # Switch to gh-pages branch, creating it if it doesn't exist + git checkout gh-pages || git checkout -b gh-pages + + # Remove all existing files + rm -rf * + + # Copy the build artifacts, including dotfiles + cp -r /tmp/docusaurus-build/. . + + # Check if there are any changes before committing + if [[ -n "$(git status --porcelain)" ]]; then + git add . 
-f + git commit -m "chore(docs): update documentation build" --no-verify + git push origin gh-pages --force + echo "✅ Documentation deployed to gh-pages" + else + echo "No changes to commit" + fi + + # Switch back to original branch + git checkout - \ No newline at end of file diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..0b6a980 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,70 @@ +name: Release + +on: + push: + tags: + - 'v*' + workflow_dispatch: + +permissions: + contents: write + packages: write + +jobs: + goreleaser: + name: Release Binaries + runs-on: ubuntu-latest + # Publish only when the ref is a version tag; a manual run from a branch would otherwise create a release tagged with the branch name + if: startsWith(github.ref, 'refs/tags/v') + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.22' + check-latest: true + cache: true + + - name: Install dependencies + run: go mod download + + - name: Run tests + run: go test -v -count=1 ./... 
+ + - name: Build binaries + run: | + mkdir -p dist + GOOS=linux GOARCH=amd64 go build -o dist/deepscanbot-linux-amd64 ./apps/cli + GOOS=linux GOARCH=arm64 go build -o dist/deepscanbot-linux-arm64 ./apps/cli + GOOS=darwin GOARCH=amd64 go build -o dist/deepscanbot-darwin-amd64 ./apps/cli + GOOS=darwin GOARCH=arm64 go build -o dist/deepscanbot-darwin-arm64 ./apps/cli + GOOS=windows GOARCH=amd64 go build -o dist/deepscanbot-windows-amd64.exe ./apps/cli + echo "✅ Binaries built" + + - name: Generate checksums + run: | + cd dist + sha256sum * > checksums.txt + echo "✅ Checksums generated" + + - name: Create Release + uses: softprops/action-gh-release@v2 + with: + name: Release ${{ github.ref_name }} + tag_name: ${{ github.ref_name }} + body_path: CHANGELOG.md + draft: false + prerelease: false + files: | + dist/deepscanbot-linux-amd64 + dist/deepscanbot-linux-arm64 + dist/deepscanbot-darwin-amd64 + dist/deepscanbot-darwin-arm64 + dist/deepscanbot-windows-amd64.exe + dist/checksums.txt + env: + GITHUB_TOKEN: ${{ 
secrets.GITHUB_TOKEN }} \ No newline at end of file diff --git a/.gitignore b/.gitignore index 9fb6e3f..7e42ad5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ -scan-results-goodreads.json +scan-results-*.json +scan-results-*.txt deepscanbot solved.txt diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index cb6a41b..b96b2de 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -69,7 +69,7 @@ By participating in this project, you agree to abide by the [CODE_OF_CONDUCT.md] 5. Verify the installation: ```bash - ./deepscanbot -h + go run ./apps/cli -h ``` ### Project Structure @@ -193,6 +193,7 @@ Use conventional commits format: ### Commit Messages Follow the [Conventional Commits](https://www.conventionalcommits.org/) specification. Use `cocogitto` or a custom commit hook to validate commit messages: + ```bash # Using cocogitto cog commit diff --git a/README.md b/README.md index 9b8edb9..351c873 100644 --- a/README.md +++ b/README.md @@ -35,11 +35,11 @@ cd DeepScanBot # Install dependencies go mod download -# Build the crawler -go build -o deepscanbot +# Build the crawler (optional) +go build -o deepscanbot ./apps/cli -# Or run directly -go run main.go -url [options] +# Or run directly (no build required) +go run ./apps/cli -url [options] ``` ## Usage @@ -76,7 +76,7 @@ go run main.go -url [options] #### 1. Basic Crawl ```bash -deepscanbot -url https://example.com -depth 2 +go run ./apps/cli -url https://example.com -depth 2 ``` Crawls `https://example.com` up to 2 levels deep and outputs results to `crawler_results.txt`. @@ -84,7 +84,7 @@ Crawls `https://example.com` up to 2 levels deep and outputs results to `crawler #### 2. JSON Output with Details ```bash -deepscanbot -url https://example.com -depth 3 -json -s -u -output my_results +go run ./apps/cli -url https://example.com -depth 3 -json -s -u -output my_results ``` Outputs JSON to `my_results.json` with URL source tracking and deduplication. 
@@ -92,7 +92,7 @@ Outputs JSON to `my_results.json` with URL source tracking and deduplication. #### 3. Crawl with Retry and Delay ```bash -deepscanbot -url https://docs.example.com -depth 2 -retries 3 -retry-backoff 2s -delay 1s -host-concurrency 1 +go run ./apps/cli -url https://docs.example.com -depth 2 -retries 3 -retry-backoff 2s -delay 1s -host-concurrency 1 ``` Retries failed requests up to 3 times with exponential backoff, waits 1 second between requests to the same host, and allows only 1 concurrent request per host. @@ -100,7 +100,7 @@ Retries failed requests up to 3 times with exponential backoff, waits 1 second b #### 4. Cross-Domain Crawl with Sitemap ```bash -deepscanbot -url https://example.com -depth 3 -cross-domain -sitemap -concurrency 10 -host-concurrency 2 -json +go run ./apps/cli -url https://example.com -depth 3 -cross-domain -sitemap -concurrency 10 -host-concurrency 2 -json ``` Discovers URLs from sitemap.xml, follows links to any domain, with 10 total workers and 2 per host. @@ -109,10 +109,10 @@ Discovers URLs from sitemap.xml, follows links to any domain, with 10 total work ```bash # First run (interrupted) -deepscanbot -url https://example.com -depth 3 -json -output my_results +go run ./apps/cli -url https://example.com -depth 3 -json -output my_results # Resume -deepscanbot -url https://example.com -depth 3 -json -output my_results -resume +go run ./apps/cli -url https://example.com -depth 3 -json -output my_results -resume ``` Loaded existing results from `my_results.json` and skips already-crawled URLs. @@ -120,7 +120,7 @@ Loaded existing results from `my_results.json` and skips already-crawled URLs. #### 6. 
Crawl Goodreads with Rate-Limit Handling ```bash -deepscanbot -url https://www.goodreads.com -depth 2 -delay 2s -retries 5 -retry-backoff 2s -concurrency 2 -host-concurrency 1 -json -output goodreads_results +go run ./apps/cli -url https://www.goodreads.com -depth 2 -delay 2s -retries 5 -retry-backoff 2s -concurrency 2 -host-concurrency 1 -json -output goodreads_results ``` Uses 2-second politeness delay, 5 retries with exponential backoff, limited concurrency to handle Goodreads rate limits gracefully. @@ -128,7 +128,7 @@ Uses 2-second politeness delay, 5 retries with exponential backoff, limited conc #### 7. Crawl PDF and Images ```bash -deepscanbot -url https://example.com -depth 2 -content-types "text/html,application/pdf,image/jpeg,image/png" -json +go run ./apps/cli -url https://example.com -depth 2 -content-types "text/html,application/pdf,image/jpeg,image/png" -json ``` Downloads HTML, PDF, JPEG, and PNG files while still parsing HTML for links. @@ -250,10 +250,10 @@ The JSON report contains a detailed summary and two URL lists: Text output shows one URL per line with optional metadata in brackets: ``` -[https://example.com [status=200] [result=passed] -[https://example.com/about [status=200] [result=passed] -[https://example.com/not-found [status=404] [result=failed] [error=bad status code: 404] -[https://external.com/page [result=skipped] [skipped=outside domain scope] +[https://example.com] [status=200] [result=passed] +[https://example.com/about] [status=200] [result=passed] +[https://example.com/not-found] [status=404] [result=failed] [error=bad status code: 404] +[https://external.com/page] [result=skipped] [skipped=outside domain scope] // With -s flag: [href] https://example.com diff --git a/apps/cli/main.go b/apps/cli/main.go index 8302721..40389e6 100644 --- a/apps/cli/main.go +++ b/apps/cli/main.go @@ -28,7 +28,7 @@ func main() { uniqueUrls := flag.Bool("u", false, "Ensure unique URLs") concurrency := flag.Int("concurrency", 0, "Maximum 
concurrent requests; 0 uses available CPU capacity") hostConcurrency := flag.Int("host-concurrency", 0, "Maximum concurrent requests per host; 0 uses -concurrency") - contentTypes := flag.String("content-types", "text/html", "Comma-separated MIME types to download, e.g. text/html,application/pdf,image/jpeg") + contentTypes := flag.String("content-types", "text/html", "Comma-separated MIME types to download (quote the list), e.g. \"text/html,application/pdf,image/jpeg\"") output := flag.String("output", "crawler_results", "Output filename without an extension") ignoreRobots := flag.Bool("ignore-robots", false, "Ignore robots.txt crawl restrictions") crossDomain := flag.Bool("cross-domain", false, "Follow links to hosts other than the starting URL") @@ -126,9 +126,12 @@ func validateStartURL(rawURL string) (string, error) { func parseContentTypes(value string) []string { var contentTypes []string - for _, contentType := range strings.Split(value, ",") { - if contentType = strings.TrimSpace(contentType); contentType != "" { - contentTypes = append(contentTypes, contentType) + // Support both comma-separated and space-separated values + for _, part := range strings.FieldsFunc(value, func(r rune) bool { + return r == ',' || r == ' ' + }) { + if part = strings.TrimSpace(part); part != "" { + contentTypes = append(contentTypes, part) } } diff --git a/apps/cli/tests/crawler_test.go b/apps/cli/tests/crawler_test.go index 8a1c5aa..1c5cda3 100644 --- a/apps/cli/tests/crawler_test.go +++ b/apps/cli/tests/crawler_test.go @@ -15,8 +15,15 @@ import ( ) func TestCrawlerStartReturnsResultsWithoutWritingFiles(t *testing.T) { - //nolint:govet // testing.Chdir requires Go 1.24+, using available version - t.Chdir(t.TempDir()) + origDir, _ := os.Getwd() + + tmpDir := t.TempDir() + + if err := os.Chdir(tmpDir); err != nil { + t.Fatalf("chdir to temp dir: %v", err) + } + + t.Cleanup(func() { _ = os.Chdir(origDir) }) server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, 
r *http.Request) { w.Header().Set("Content-Type", "text/html") diff --git a/apps/cli/tests/storage_test.go b/apps/cli/tests/storage_test.go index d61e4b1..780d461 100644 --- a/apps/cli/tests/storage_test.go +++ b/apps/cli/tests/storage_test.go @@ -14,8 +14,15 @@ import ( func TestTextOutputIsTruncatedForEachStorageInstance(t *testing.T) { const filename = "crawler_results.txt" - //nolint:govet // testing.Chdir requires Go 1.24+, using available version - t.Chdir(t.TempDir()) + origDir, _ := os.Getwd() + + tmpDir := t.TempDir() + + if err := os.Chdir(tmpDir); err != nil { + t.Fatalf("chdir to temp dir: %v", err) + } + + t.Cleanup(func() { _ = os.Chdir(origDir) }) if err := os.WriteFile(filename, []byte("result from a previous crawl\n"), 0o644); err != nil { t.Fatalf("seed previous output: %v", err) @@ -41,8 +48,15 @@ func TestTextOutputIsTruncatedForEachStorageInstance(t *testing.T) { func TestTextOutputIsFlushedOnClose(t *testing.T) { const filename = "crawler_results.txt" - //nolint:govet // testing.Chdir requires Go 1.24+, using available version - t.Chdir(t.TempDir()) + origDir, _ := os.Getwd() + + tmpDir := t.TempDir() + + if err := os.Chdir(tmpDir); err != nil { + t.Fatalf("chdir to temp dir: %v", err) + } + + t.Cleanup(func() { _ = os.Chdir(origDir) }) pageStorage := storage.NewPageStorage() pageStorage.StoreContent("https://example.com/one") diff --git a/apps/docs/docs/features.mdx b/apps/docs/docs/features.mdx index 25fe55a..de0d212 100644 --- a/apps/docs/docs/features.mdx +++ b/apps/docs/docs/features.mdx @@ -13,7 +13,7 @@ Multi-threaded architecture with configurable concurrency limits for optimal per ```bash # Set concurrency to 10 -./deepscanbot -url https://example.com -concurrency 10 +go run ./apps/cli -url https://example.com -concurrency 10 ``` ### Configurable Crawl Depth @@ -21,10 +21,10 @@ Control how deep the crawler explores linked pages. 
```bash # Single page only -./deepscanbot -url https://example.com -depth 0 +go run ./apps/cli -url https://example.com -depth 0 # Full site (3 levels deep) -./deepscanbot -url https://example.com -depth 3 +go run ./apps/cli -url https://example.com -depth 3 ``` ### Robots.txt Compliance @@ -32,7 +32,7 @@ Respects robots.txt rules with an option to ignore them. ```bash # Ignore robots.txt -./deepscanbot -url https://example.com -ignore-robots +go run ./apps/cli -url https://example.com -ignore-robots ``` ### Retry Logic @@ -40,7 +40,7 @@ Built-in retry mechanism with exponential backoff for transient failures. ```bash # Retry up to 3 times with 2 second backoff -./deepscanbot -url https://example.com -retries 3 -retry-backoff 2s +go run ./apps/cli -url https://example.com -retries 3 -retry-backoff 2s ``` ## Advanced Features @@ -50,24 +50,24 @@ Filter crawled pages by MIME types and size limits. ```bash # Only HTML and PDF files -./deepscanbot -url https://example.com -content-types "text/html,application/pdf" +go run ./apps/cli -url https://example.com -content-types "text/html,application/pdf" # Limit page size to 1MB -./deepscanbot -url https://example.com -size 1024 +go run ./apps/cli -url https://example.com -size 1024 ``` ### Proxy Support Route traffic through HTTP/HTTPS proxies. ```bash -./deepscanbot -url https://example.com -proxy http://127.0.0.1:8080 +go run ./apps/cli -url https://example.com -proxy http://127.0.0.1:8080 ``` ### Sitemap Discovery Automatically discover and crawl URLs from sitemap.xml. ```bash -./deepscanbot -url https://example.com -sitemap +go run ./apps/cli -url https://example.com -sitemap ``` ### Resume Mode @@ -75,17 +75,17 @@ Resume interrupted crawls without recrawling already visited URLs. 
```bash # First crawl -./deepscanbot -url https://example.com -output my-crawl +go run ./apps/cli -url https://example.com -output my-crawl # Resume -./deepscanbot -url https://example.com -resume -output my-crawl +go run ./apps/cli -url https://example.com -resume -output my-crawl ``` ### Cross-Domain Crawling Optionally follow links to external domains. ```bash -./deepscanbot -url https://example.com -cross-domain +go run ./apps/cli -url https://example.com -cross-domain ``` ### Multiple Output Formats @@ -93,10 +93,10 @@ Export results in JSON or plain text format. ```bash # Text output (default) -./deepscanbot -url https://example.com -output results +go run ./apps/cli -url https://example.com -output results # JSON output -./deepscanbot -url https://example.com -json -output results +go run ./apps/cli -url https://example.com -json -output results ``` ### Politely Crawl @@ -104,7 +104,7 @@ Configure delay between requests to avoid overwhelming servers. ```bash # Wait 1 second between requests -./deepscanbot -url https://example.com -delay 1s +go run ./apps/cli -url https://example.com -delay 1s ``` ## All CLI Flags diff --git a/apps/docs/docs/installation.mdx b/apps/docs/docs/installation.mdx index c139c84..1327b66 100644 --- a/apps/docs/docs/installation.mdx +++ b/apps/docs/docs/installation.mdx @@ -47,8 +47,8 @@ lefthook install ## Verify Installation ```bash -# Test the CLI -./deepscanbot -h +# Test the CLI (using go run - no build required) +go run ./apps/cli -h # Run tests go test ./... @@ -61,13 +61,13 @@ golangci-lint run ./... 
```bash # Crawl a website -./deepscanbot -url https://example.com -depth 2 +go run ./apps/cli -url https://example.com -depth 2 # Output as JSON -./deepscanbot -url https://example.com -depth 2 -json +go run ./apps/cli -url https://example.com -depth 2 -json # With specific content types -./deepscanbot -url https://example.com -depth 2 -content-types "text/html,application/pdf" +go run ./apps/cli -url https://example.com -depth 2 -content-types "text/html,application/pdf" ``` ## Next Steps diff --git a/apps/docs/docs/usage.mdx b/apps/docs/docs/usage.mdx index 55f58c8..72c868b 100644 --- a/apps/docs/docs/usage.mdx +++ b/apps/docs/docs/usage.mdx @@ -11,7 +11,7 @@ Learn how to use DeepScanBot effectively for your web crawling needs. The simplest way to crawl a website: ```bash -./deepscanbot -url https://example.com +go run ./apps/cli -url https://example.com ``` This will: @@ -26,7 +26,7 @@ This will: Crawl only the specified URL without following links: ```bash -./deepscanbot -url https://example.com -depth 0 +go run ./apps/cli -url https://example.com -depth 0 ``` ### Full Site Crawl @@ -34,7 +34,7 @@ Crawl only the specified URL without following links: Crawl an entire site with greater depth: ```bash -./deepscanbot -url https://example.com -depth 5 -concurrency 20 +go run ./apps/cli -url https://example.com -depth 5 -concurrency 20 ``` ### JSON Output @@ -42,7 +42,7 @@ Crawl an entire site with greater depth: Export results in JSON format for programmatic processing: ```bash -./deepscanbot -url https://example.com -json -output results +go run ./apps/cli -url https://example.com -json -output results ``` ### Crawl with Proxy @@ -50,7 +50,7 @@ Export results in JSON format for programmatic processing: Route traffic through a proxy server: ```bash -./deepscanbot -url https://example.com -proxy http://127.0.0.1:8080 +go run ./apps/cli -url https://example.com -proxy http://127.0.0.1:8080 ``` ### Content-Type Filtering @@ -59,13 +59,13 @@ Only download specific file 
types: ```bash # Only HTML pages -./deepscanbot -url https://example.com -content-types "text/html" +go run ./apps/cli -url https://example.com -content-types "text/html" # HTML and PDF documents -./deepscanbot -url https://example.com -content-types "text/html,application/pdf" +go run ./apps/cli -url https://example.com -content-types "text/html,application/pdf" # All content types -./deepscanbot -url https://example.com -content-types "*/*" +go run ./apps/cli -url https://example.com -content-types "*/*" ``` ### Unique URLs Only @@ -73,7 +73,7 @@ Only download specific file types: Avoid processing duplicate URLs: ```bash -./deepscanbot -url https://example.com -u +go run ./apps/cli -url https://example.com -u ``` ### Resume a Crawl @@ -82,10 +82,10 @@ Stop and resume a crawl without recrawling: ```bash # Initial crawl -./deepscanbot -url https://example.com -depth 3 -output my-crawl +go run ./apps/cli -url https://example.com -depth 3 -output my-crawl # Resume (skips already crawled URLs) -./deepscanbot -url https://example.com -depth 3 -resume -output my-crawl +go run ./apps/cli -url https://example.com -depth 3 -resume -output my-crawl ``` ### Politely Crawl @@ -93,7 +93,7 @@ Stop and resume a crawl without recrawling: Add delays to avoid overwhelming servers: ```bash -./deepscanbot -url https://example.com -delay 500ms -host-concurrency 2 +go run ./apps/cli -url https://example.com -delay 500ms -host-concurrency 2 ``` ## Output Formats @@ -154,16 +154,16 @@ https://example.com/admin [result=skipped] [skipped=disallowed by robots.txt] ### Crawl Documentation Site ```bash -./deepscanbot -url https://docs.example.com -depth 3 -sitemap -json -output docs-crawl +go run ./apps/cli -url https://docs.example.com -depth 3 -sitemap -json -output docs-crawl ``` ### Monitor a Website ```bash -./deepscanbot -url https://example.com -depth 1 -u -output monitor -dr +go run ./apps/cli -url https://example.com -depth 1 -u -output monitor -dr ``` ### Scrape PDF Documents 
```bash -./deepscanbot -url https://example.com -depth 2 -content-types "application/pdf" -size 10240 -output pdfs \ No newline at end of file +go run ./apps/cli -url https://example.com -depth 2 -content-types "application/pdf" -size 10240 -output pdfs \ No newline at end of file diff --git a/scan-results-quotes.json b/scan-results-quotes.json new file mode 100644 index 0000000..943fc63 --- /dev/null +++ b/scan-results-quotes.json @@ -0,0 +1,498 @@ +{ + "start_url": "https://quotes.toscrape.com/", + "output_file": "scan-results-quotes.json", + "started_at": "2026-06-25T13:10:29.169600358+05:30", + "finished_at": "2026-06-25T13:10:36.216894869+05:30", + "duration_ms": 7047, + "summary": { + "total": 52, + "passed": 47, + "failed": 0, + "skipped": 5, + "discovered": 0, + "skipped_by_robots": 0, + "skipped_by_domain": 2, + "skipped_by_duplicate": 1, + "skipped_by_content_type": 2, + "skipped_by_depth": 0, + "skipped_by_other": 0, + "retried_requests": 0, + "max_depth": 1, + "urls_by_status_code": { + "200": 49 + }, + "skipped_by_reason": { + "content type not allowed": 2, + "duplicate": 1, + "outside domain scope": 2 + } + }, + "urls": [ + { + "url": "https://quotes.toscrape.com/", + "source": "href", + "depth": 0, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/friends/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/success/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/love/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": 
"https://quotes.toscrape.com/tag/miracles/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/reading/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/inspirational/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/obvious/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/simile/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/author/Thomas-A-Edison", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/thinking/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/classic/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/adulthood/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/misattributed-eleanor-roosevelt/page/1/", + "source": "href", + "depth": 1, + "status_code": 
200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/author/Steve-Martin", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/paraphrased/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/truth/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/author/Eleanor-Roosevelt", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/page/2/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/books/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/login", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/miracle/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/value/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": 
"https://quotes.toscrape.com/tag/world/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/failure/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/author/Marilyn-Monroe", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/humor/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/humor/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/love/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/friendship/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/choices/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/be-yourself/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/life/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; 
charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/author/Jane-Austen", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/author/Andre-Gide", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/simile/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/life/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/author/Albert-Einstein", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/abilities/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/aliteracy/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/books/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/deep-thoughts/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/change/page/1/", + 
"source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/live/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/edison/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/author/J-K-Rowling", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/tag/inspirational/page/1/", + "source": "href", + "depth": 1, + "status_code": 200, + "content_type": "text/html; charset=utf-8", + "result": "passed", + "attempts": 1 + } + ], + "skipped": [ + { + "url": "https://www.goodreads.com/quotes", + "source": "href", + "depth": 1, + "result": "skipped", + "skipped_reason": "outside domain scope" + }, + { + "url": "https://www.zyte.com", + "source": "href", + "depth": 1, + "result": "skipped", + "skipped_reason": "outside domain scope" + }, + { + "url": "https://quotes.toscrape.com/", + "source": "href", + "depth": 1, + "result": "skipped", + "skipped_reason": "duplicate" + }, + { + "url": "https://quotes.toscrape.com/static/bootstrap.min.css", + "source": "link", + "depth": 1, + "status_code": 200, + "content_type": "text/css; charset=utf-8", + "result": "skipped", + "skipped_reason": "content type not allowed", + "attempts": 1 + }, + { + "url": "https://quotes.toscrape.com/static/main.css", + "source": "link", + "depth": 1, + "status_code": 200, + "content_type": "text/css; charset=utf-8", + "result": "skipped", + "skipped_reason": "content type not allowed", + "attempts": 1 + } + ] +} \ No newline at end of file