Skip to content

Commit 5da8ea2

Browse files
author
tal123ph
committed
Add Wikipedia data source: script and README documentation
1 parent 19249f8 commit 5da8ea2

2 files changed

Lines changed: 135 additions & 0 deletions

File tree

scripts/1-fetch/wikipedia_fetch.py

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
# scripts/1-fetch/wikipedia_fetch.py
2+
3+
import requests
4+
from typing import Dict
5+
6+
WIKI_API = "https://en.wikipedia.org/w/api.php"
7+
8+
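# Send a descriptive User-Agent with every request to avoid HTTP 403 errors.
# TODO: replace the YOUR_USERNAME placeholder below with the real repository owner.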
9+
HEADERS = {
10+
"User-Agent": "QuantifyingCommonsBot/1.0 (https://github.com/YOUR_USERNAME/quantifying)"
11+
}
12+
13+
14+
def get_site_statistics() -> Dict[str, int]:
    """
    Fetch general statistics from Wikipedia.

    Sends a ``siteinfo``/``statistics`` query to ``WIKI_API`` and extracts
    the counters listed below. Any failure (network/HTTP error, a body that
    is not valid JSON, or a body lacking the expected ``query.statistics``
    structure, e.g. an API error payload) is reported on stdout and a
    zero-filled dictionary is returned instead of raising.

    Returns:
        dict: Dictionary containing:
            - articles: number of articles
            - pages: total number of pages
            - edits: total number of edits
            - users: total number of users
            - images: total number of images
    """
    # Single source of truth for the keys of the returned dictionary.
    fields = ("articles", "pages", "edits", "users", "images")
    params = {
        "action": "query",
        "meta": "siteinfo",
        "siprop": "statistics",
        "format": "json",
    }
    try:
        response = requests.get(
            WIKI_API, params=params, headers=HEADERS, timeout=10
        )
        response.raise_for_status()
        stats = response.json()["query"]["statistics"]
        # Counters missing from the response default to 0.
        return {name: stats.get(name, 0) for name in fields}
    except requests.RequestException as e:
        print(f"Error fetching Wikipedia site statistics: {e}")
    except (ValueError, KeyError, TypeError, AttributeError) as e:
        # Body was not JSON or did not have the expected shape.
        print(f"Error fetching Wikipedia site statistics: {e!r}")
    return dict.fromkeys(fields, 0)
46+
47+
48+
def search_articles_count(keyword: str) -> int:
    """
    Count the number of Wikipedia search hits for a keyword.

    Sends a ``list=search`` query to ``WIKI_API`` and returns the
    ``totalhits`` value reported in ``query.searchinfo``. Any failure
    (network/HTTP error, a body that is not valid JSON, or a body lacking
    the expected structure, e.g. an API error payload) is reported on
    stdout and 0 is returned instead of raising.

    Args:
        keyword (str): Keyword or phrase to search for. It is passed to the
            search API unchanged.
            NOTE(review): an unquoted multi-word keyword is presumably not
            matched as an exact phrase by the search backend -- confirm
            against the MediaWiki search documentation.

    Returns:
        int: Total number of search hits/articles, or 0 on failure.
    """
    params = {
        "action": "query",
        "list": "search",
        "srsearch": keyword,
        # Only the hit count is used, so request the smallest result page
        # and skip the other search metadata.
        "srlimit": 1,
        "srinfo": "totalhits",
        "format": "json",
    }
    try:
        response = requests.get(
            WIKI_API, params=params, headers=HEADERS, timeout=10
        )
        response.raise_for_status()
        return response.json()["query"]["searchinfo"]["totalhits"]
    except requests.RequestException as e:
        print(f"Error searching Wikipedia articles for '{keyword}': {e}")
    except (ValueError, KeyError, TypeError) as e:
        # Body was not JSON or did not have the expected shape.
        print(f"Error searching Wikipedia articles for '{keyword}': {e!r}")
    return 0
71+
72+
73+
def fetch_cc_related_statistics() -> Dict[str, int]:
    """
    Collect Wikipedia search-hit counts for Creative Commons terms.

    Each term is looked up with ``search_articles_count``, one request per
    term, in the order listed below.

    Returns:
        dict: Maps each search term to the count returned for it.
    """
    license_terms = (
        "Creative Commons",
        "CC BY",
        "CC BY-SA",
        "CC BY-ND",
        "CC BY-NC",
        "CC BY-NC-SA",
        "CC BY-NC-ND",
    )
    return {term: search_articles_count(term) for term in license_terms}
93+
94+
95+
if __name__ == "__main__":
    # Demo run: print the site-wide counters, then one line per CC term.
    print("Wikipedia Site Statistics:")
    site_stats = get_site_statistics()
    print(site_stats)

    print("\nCreative Commons Related Articles Count:")
    for term, hits in fetch_cc_related_statistics().items():
        print(f"{term}: {hits}")
103+

sources.md

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,3 +174,35 @@ adjust video parameters, and obtain search results.
174174
- API key required
175175
- Query limit: depends on the type and number of requests
176176
- Data available through JSON format
177+
178+
## 📖 Wikipedia Data Source
179+
180+
Quantifying now supports fetching data from Wikipedia as an additional source alongside GitHub and Google Custom Search.
181+
182+
### Available Statistics
183+
184+
- **Number of articles** – Total articles on Wikipedia.
185+
- **Number of pages** – Total pages, including non-article pages.
186+
- **Number of edits** – Total edits across Wikipedia.
187+
- **Number of users** – Total registered users.
188+
- **Number of images** – Total uploaded images.
189+
- **Keyword-based counts** – Number of articles referencing specific Creative Commons licenses or keywords.
190+
191+
### Example Usage
192+
193+
```python
194+
from scripts.wikipedia_fetch import get_site_statistics, search_articles_count, fetch_cc_related_statistics
195+
196+
# General Wikipedia statistics
197+
stats = get_site_statistics()
198+
print("Wikipedia Site Stats:", stats)
199+
200+
# Count articles containing a specific keyword
201+
cc_articles = search_articles_count("Creative Commons")
202+
print("Articles with 'Creative Commons':", cc_articles)
203+
204+
# Fetch counts for various Creative Commons licenses
205+
cc_stats = fetch_cc_related_statistics()
206+
for license_name, count in cc_stats.items():
207+
print(f"{license_name}: {count}")
208+

0 commit comments

Comments
 (0)