1- # scripts/wikipedia_fetch.py
2-
31import requests
42from typing import Dict
53
@@ -18,86 +16,72 @@ def get_site_statistics() -> Dict[str, int]:
1816 Returns:
1917 dict: Dictionary containing:
2018 - articles: number of articles
21- - pages: total number of pages
22- - edits: total number of edits
23- - users: total number of users
24- - images: total number of images
19+ - pages: number of pages
20+ - edits: number of edits
21+ - users: number of users
22+ - images: number of images
2523 """
2624 params = {
2725 "action" : "query" ,
2826 "meta" : "siteinfo" ,
29- "siprop" : "statistics" ,
27+ "siprop" : "statistics|rightsinfo " ,
3028 "format" : "json"
3129 }
32- try :
33- response = requests .get (WIKI_API , params = params , headers = HEADERS , timeout = 10 )
34- response .raise_for_status ()
35- stats = response .json ()[ 'query' ][ 'statistics' ]
36- return {
37- "articles" : stats .get ("articles " , 0 ),
38- "pages" : stats . get ( "pages" , 0 ),
39- "edits" : stats . get ( "edits" , 0 ),
40- "users " : stats .get ("users " , 0 ),
41- "images " : stats .get ("images " , 0 )
42- }
43- except requests . RequestException as e :
44- print ( f"Error fetching Wikipedia site statistics: { e } " )
45- return { "articles" : 0 , "pages" : 0 , "edits" : 0 , "users" : 0 , "images" : 0 }
46-
47-
48- def search_articles_count ( keyword : str ) -> int :
30+
31+ response = requests .get (WIKI_API , headers = HEADERS , params = params )
32+ response .raise_for_status ()
33+ data = response .json ()
34+
35+ stats = data . get ( "query" , {}) .get ("statistics " , {})
36+
37+ return {
38+ "articles " : stats .get ("articles " , 0 ),
39+ "pages " : stats .get ("pages " , 0 ),
40+ "edits" : stats . get ( "edits" , 0 ),
41+ "users" : stats . get ( "users" , 0 ),
42+ "images" : stats . get ( "images" , 0 )
43+ }
44+
45+
46+ def search_articles_by_license ( license_keyword : str , limit : int = 10 ) -> Dict [ str , int ] :
4947 """
50- Count the number of Wikipedia articles containing a specific keyword.
48+ Search Wikipedia articles containing a specific Creative Commons license keyword.
5149
5250 Args:
53- keyword (str): Keyword or phrase to search for.
51+ license_keyword (str): e.g., "CC BY-SA 4.0"
52+ limit (int): Number of results to fetch
5453
5554 Returns:
56- int: Total number of search hits/ articles.
55+ dict: Dictionary with count and sample articles
5756 """
5857 params = {
5958 "action" : "query" ,
6059 "list" : "search" ,
61- "srsearch" : keyword ,
60+ "srsearch" : license_keyword ,
61+ "srlimit" : limit ,
6262 "format" : "json"
6363 }
64- try :
65- response = requests .get (WIKI_API , params = params , headers = HEADERS , timeout = 10 )
66- response .raise_for_status ()
67- return response .json ()['query' ]['searchinfo' ]['totalhits' ]
68- except requests .RequestException as e :
69- print (f"Error searching Wikipedia articles for '{ keyword } ': { e } " )
70- return 0
7164
65+ response = requests .get (WIKI_API , headers = HEADERS , params = params )
66+ response .raise_for_status ()
67+ data = response .json ()
7268
73- def fetch_cc_related_statistics () -> Dict [str , int ]:
74- """
75- Fetch statistics related to Creative Commons on Wikipedia.
76-
77- Returns:
78- dict: Dictionary containing counts of articles referencing CC licenses.
79- """
80- keywords = [
81- "Creative Commons" ,
82- "CC BY" ,
83- "CC BY-SA" ,
84- "CC BY-ND" ,
85- "CC BY-NC" ,
86- "CC BY-NC-SA" ,
87- "CC BY-NC-ND"
88- ]
89- results = {}
90- for kw in keywords :
91- results [kw ] = search_articles_count (kw )
92- return results
69+ search_results = data .get ("query" , {}).get ("search" , [])
70+ return {
71+ "count" : len (search_results ),
72+ "sample_titles" : [item ["title" ] for item in search_results ]
73+ }
9374
9475
9576if __name__ == "__main__" :
9677 print ("Wikipedia Site Statistics:" )
97- print (get_site_statistics ())
78+ stats = get_site_statistics ()
79+ print (stats )
80+
81+ license_query = "Creative Commons"
82+ print (f"\n Articles mentioning '{ license_query } ':" )
83+ results = search_articles_by_license (license_query , limit = 5 )
84+ print (results )
85+
9886
99- print ("\n Creative Commons Related Articles Count:" )
100- cc_stats = fetch_cc_related_statistics ()
101- for k , v in cc_stats .items ():
102- print (f"{ k } : { v } " )
10387
0 commit comments