Skip to content

Commit 05cfc07

Browse files
author
tal123ph
committed
Add Wikipedia as a new data source
1 parent 5da8ea2 commit 05cfc07

2 files changed

Lines changed: 45 additions & 61 deletions

File tree

scripts/1-fetch/wikipedia_fetch.py

Lines changed: 44 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
# scripts/wikipedia_fetch.py
2-
31
import requests
42
from typing import Dict
53

@@ -18,86 +16,72 @@ def get_site_statistics() -> Dict[str, int]:
1816
Returns:
1917
dict: Dictionary containing:
2018
- articles: number of articles
21-
- pages: total number of pages
22-
- edits: total number of edits
23-
- users: total number of users
24-
- images: total number of images
19+
- pages: number of pages
20+
- edits: number of edits
21+
- users: number of users
22+
- images: number of images
2523
"""
2624
params = {
2725
"action": "query",
2826
"meta": "siteinfo",
29-
"siprop": "statistics",
27+
"siprop": "statistics|rightsinfo",
3028
"format": "json"
3129
}
32-
try:
33-
response = requests.get(WIKI_API, params=params, headers=HEADERS, timeout=10)
34-
response.raise_for_status()
35-
stats = response.json()['query']['statistics']
36-
return {
37-
"articles": stats.get("articles", 0),
38-
"pages": stats.get("pages", 0),
39-
"edits": stats.get("edits", 0),
40-
"users": stats.get("users", 0),
41-
"images": stats.get("images", 0)
42-
}
43-
except requests.RequestException as e:
44-
print(f"Error fetching Wikipedia site statistics: {e}")
45-
return {"articles": 0, "pages": 0, "edits": 0, "users": 0, "images": 0}
46-
47-
48-
def search_articles_count(keyword: str) -> int:
30+
31+
response = requests.get(WIKI_API, headers=HEADERS, params=params)
32+
response.raise_for_status()
33+
data = response.json()
34+
35+
stats = data.get("query", {}).get("statistics", {})
36+
37+
return {
38+
"articles": stats.get("articles", 0),
39+
"pages": stats.get("pages", 0),
40+
"edits": stats.get("edits", 0),
41+
"users": stats.get("users", 0),
42+
"images": stats.get("images", 0)
43+
}
44+
45+
46+
def search_articles_by_license(license_keyword: str, limit: int = 10) -> Dict[str, int]:
4947
"""
50-
Count the number of Wikipedia articles containing a specific keyword.
48+
Search Wikipedia articles containing a specific Creative Commons license keyword.
5149
5250
Args:
53-
keyword (str): Keyword or phrase to search for.
51+
license_keyword (str): e.g., "CC BY-SA 4.0"
52+
limit (int): Maximum number of search results to fetch
5453
5554
Returns:
56-
int: Total number of search hits/articles.
55+
dict: Dictionary with "count" (number of results returned, at most limit) and "sample_titles" (titles of those results)
5756
"""
5857
params = {
5958
"action": "query",
6059
"list": "search",
61-
"srsearch": keyword,
60+
"srsearch": license_keyword,
61+
"srlimit": limit,
6262
"format": "json"
6363
}
64-
try:
65-
response = requests.get(WIKI_API, params=params, headers=HEADERS, timeout=10)
66-
response.raise_for_status()
67-
return response.json()['query']['searchinfo']['totalhits']
68-
except requests.RequestException as e:
69-
print(f"Error searching Wikipedia articles for '{keyword}': {e}")
70-
return 0
7164

65+
response = requests.get(WIKI_API, headers=HEADERS, params=params)
66+
response.raise_for_status()
67+
data = response.json()
7268

73-
def fetch_cc_related_statistics() -> Dict[str, int]:
74-
"""
75-
Fetch statistics related to Creative Commons on Wikipedia.
76-
77-
Returns:
78-
dict: Dictionary containing counts of articles referencing CC licenses.
79-
"""
80-
keywords = [
81-
"Creative Commons",
82-
"CC BY",
83-
"CC BY-SA",
84-
"CC BY-ND",
85-
"CC BY-NC",
86-
"CC BY-NC-SA",
87-
"CC BY-NC-ND"
88-
]
89-
results = {}
90-
for kw in keywords:
91-
results[kw] = search_articles_count(kw)
92-
return results
69+
search_results = data.get("query", {}).get("search", [])
70+
return {
71+
"count": len(search_results),
72+
"sample_titles": [item["title"] for item in search_results]
73+
}
9374

9475

9576
if __name__ == "__main__":
9677
print("Wikipedia Site Statistics:")
97-
print(get_site_statistics())
78+
stats = get_site_statistics()
79+
print(stats)
80+
81+
license_query = "Creative Commons"
82+
print(f"\nArticles mentioning '{license_query}':")
83+
results = search_articles_by_license(license_query, limit=5)
84+
print(results)
85+
9886

99-
print("\nCreative Commons Related Articles Count:")
100-
cc_stats = fetch_cc_related_statistics()
101-
for k, v in cc_stats.items():
102-
print(f"{k}: {v}")
10387

scripts/shared.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
# Third-party
77
from git import InvalidGitRepositoryError, NoSuchPathError, Repo
8-
from pandas import PeriodIndex
8+
from pandas import
99

1010

1111
class QuantifyingException(Exception):

0 commit comments

Comments
 (0)