1010import sys
1111import textwrap
1212import traceback
13- import urllib .parse
14- import random
13+
1514# Third-party
1615import requests
1716from pygments import highlight
2221
2322# Add parent directory so shared can be imported
2423sys .path .append (os .path .join (os .path .dirname (__file__ ), ".." ))
25- WIKI_BASE_URL = "https://en.wikipedia.org/w/api.php"
2624# First-party/Local
2725import shared # noqa: E402
2826
2927# Setup
3028LOGGER , PATHS = shared .setup (__file__ )
31- FILE1_COUNT = os .path .join (PATHS ["data_phase" ], "wiki_1_count.csv" )
32- HEADER1_COUNT = ["PLAN_INDEX" , "TOOL_IDENTIFIER" , "COUNT" ]
29+ FILE_LANGUAGES = os .path .join (
30+ PATHS ["data_phase" ], "wikipedia_count_by_languages.csv"
31+ )
32+ HEADER_LANGUAGES = ["LANGUAGE_CODE" , "LANGUAGE_NAME" , "COUNT" ]
3333QUARTER = os .path .basename (PATHS ["data_quarter" ])
34+ WIKIPEDIA_BASE_URL = "https://en.wikipedia.org/w/api.php"
3435WIKIPEDIA_RETRY_STATUS_FORCELIST = [
3536 408 , # Request Timeout
3637 422 , # Unprocessable Content (Validation failed, or endpoint spammed)
4142 504 , # Gateway Timeout
4243]
4344
45+
4446def parse_arguments ():
4547 """
4648 Parse command-line options, returns parsed argument namespace.
@@ -57,16 +59,12 @@ def parse_arguments():
5759 action = "store_true" ,
5860 help = "Enable git actions (fetch, merge, add, commit, and push)" ,
5961 )
60- parser .add_argument (
61- "--dev" ,
62- action = "store_true" ,
63- help = "Development mode: avoid hitting API (generate fake data)" ,
64- )
6562 args = parser .parse_args ()
6663 if not args .enable_save and args .enable_git :
6764 parser .error ("--enable-git requires --enable-save" )
6865 return args
6966
67+
7068def get_requests_session ():
7169 max_retries = Retry (
7270 total = 5 ,
@@ -75,75 +73,101 @@ def get_requests_session():
7573 )
7674 session = requests .Session ()
7775 session .mount ("https://" , HTTPAdapter (max_retries = max_retries ))
78- session .headers .update ({"User-Agent" : "quantifying-wikipedia-fetch/1.0 (contact@example.com)" })
76+ session .headers .update (
77+ {"User-Agent" : "quantifying-wikipedia-fetch/1.0 (contact@example.com)" }
78+ )
7979 return session
8080
8181
def write_data(args, tool_data):
    """
    Write per-language Wikipedia article counts to FILE_LANGUAGES as CSV.

    Parameters:
        args: parsed argument namespace; data is only written when
            args.enable_save is set (otherwise this is a no-op).
        tool_data: list of dicts keyed by HEADER_LANGUAGES
            ("LANGUAGE_CODE", "LANGUAGE_NAME", "COUNT").

    Returns:
        args unchanged (returned for call-chaining consistency with the
        other steps invoked from main()).
    """
    if not args.enable_save:
        return args
    LOGGER.info("Saving fetched data")
    os.makedirs(PATHS["data_phase"], exist_ok=True)

    # NOTE: codec name must be exactly "utf-8" — a stray trailing space
    # ("utf-8 ") is not a valid encoding and raises LookupError on open().
    with open(FILE_LANGUAGES, "w", newline="", encoding="utf-8") as file_obj:
        writer = csv.DictWriter(
            file_obj, fieldnames=HEADER_LANGUAGES, dialect="unix"
        )
        writer.writeheader()
        # writerows replaces the manual per-row loop.
        writer.writerows(tool_data)
    return args
9796
9897
def query_wikipedia_languages(session):
    """
    Fetch article counts from every language Wikipedia.

    First queries the Wikimedia sitematrix API for the list of language
    Wikipedias, then queries each wiki's own siteinfo/statistics API
    endpoint for its article count.

    Parameters:
        session: a requests.Session (retry-configured) used for all HTTP
            calls.

    Returns:
        list of dicts with keys "LANGUAGE_CODE", "LANGUAGE_NAME", and
        "COUNT" (one entry per language wiki successfully queried).
    """
    LOGGER.info("Fetching article counts from all language Wikipedias")
    tool_data = []

    # Get all language wikipedias from the sitematrix.
    site_matrix_url = "https://meta.wikimedia.org/w/api.php"
    params = {"action": "sitematrix", "format": "json"}
    r = session.get(site_matrix_url, params=params, timeout=30)
    # Fail fast on an HTTP error here: without the sitematrix there is
    # nothing to iterate (consistent with the per-wiki requests below).
    r.raise_for_status()
    data = r.json()["sitematrix"]

    langs = []
    for key, val in data.items():
        # Numeric keys are language entries; non-numeric keys ("count",
        # "specials") are sitematrix metadata and are skipped.
        if key.isdigit():
            lang_code = val.get("code")
            lang_name = val.get("name")
            for site in val.get("site", []):
                if "wikipedia.org" in site["url"]:
                    langs.append(
                        {
                            "lang": lang_code,
                            "name": lang_name,
                            # BUGFIX: keep the site URL — the loop below
                            # reads site["url"] to build each wiki's API
                            # endpoint; omitting it raised KeyError.
                            "url": site["url"],
                        }
                    )

    # For each language wikipedia, fetch statistics.
    for site in langs:
        base_url = f"{site['url']}/w/api.php"
        params = {
            "action": "query",
            "meta": "siteinfo",
            "siprop": "statistics",
            "format": "json",
        }
        try:
            r = session.get(base_url, params=params, timeout=30)
            r.raise_for_status()
            data = r.json()
            stats = data["query"]["statistics"]

            article_count = stats.get("articles", 0)

            tool_data.append(
                {
                    "LANGUAGE_CODE": site["lang"],
                    "LANGUAGE_NAME": site["name"],
                    "COUNT": article_count,
                }
            )
            LOGGER.info(f"{site['lang']} ({site['name']}): {article_count}")

        except Exception as e:
            # Best-effort: one failing wiki must not abort the whole run.
            LOGGER.warning(
                f"Failed to fetch for {site['lang']} ({site['name']}): {e}"
            )

    return tool_data
137154
155+
def main():
    """Entry point: fetch per-language Wikipedia data, save it, and
    optionally commit and push it via the shared git helpers."""
    args = parse_arguments()
    shared.paths_log(LOGGER, PATHS)
    shared.git_fetch_and_merge(args, PATHS["repo"])

    session = get_requests_session()
    tool_data = query_wikipedia_languages(session)
    args = write_data(args, tool_data)

    commit_message = f"Add and commit new Wikipedia data for {QUARTER}"
    args = shared.git_add_and_commit(
        args,
        PATHS["repo"],
        PATHS["data_quarter"],
        commit_message,
    )
    shared.git_push_changes(args, PATHS["repo"])
146169
170+
147171if __name__ == "__main__" :
148172 try :
149173 main ()
@@ -170,4 +194,4 @@ def main():
170194 " " ,
171195 )
172196 LOGGER .critical (f"(1) Unhandled exception:\n { traceback_formatted } " )
173- sys .exit (1 )
197+ sys .exit (1 )