1+ #!/usr/bin/env python
2+ """
3+ Fetch CC Legal Tool usage from Wikipedia API.
4+ """
5+
6+ # Standard library
7+ import argparse
8+ import csv
9+ import os
10+ import sys
11+ import textwrap
12+ import traceback
13+ import random
14+ # Third-party
15+ import requests
16+ from pygments import highlight
17+ from pygments .formatters import TerminalFormatter
18+ from pygments .lexers import PythonTracebackLexer
19+ from requests .adapters import HTTPAdapter
20+ from urllib3 .util .retry import Retry
21+
# Make the parent directory importable so the shared module can be found
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
# First-party/Local
import shared  # noqa: E402

# Setup: shared.setup() provides the logger and the project path map
LOGGER, PATHS = shared.setup(__file__)
FILE1_COUNT = os.path.join(PATHS["data_phase"], "wiki_1_count.csv")
HEADER1_COUNT = ["PLAN_INDEX", "TOOL_IDENTIFIER", "COUNT"]
QUARTER = os.path.basename(PATHS["data_quarter"])
WIKI_BASE_URL = "https://en.wikipedia.org/w/api.php"
# HTTP statuses considered transient and therefore safe to retry
WIKIPEDIA_RETRY_STATUS_FORCELIST = [
    408,  # Request Timeout
    422,  # Unprocessable Content (Validation failed, or endpoint spammed)
    429,  # Too Many Requests
    500,  # Internal Server Error
    502,  # Bad Gateway
    503,  # Service Unavailable
    504,  # Gateway Timeout
]
42+
def parse_arguments():
    """
    Parse command-line options, returns parsed argument namespace.
    """
    LOGGER.info("Parsing command-line options")
    parser = argparse.ArgumentParser(description=__doc__)
    # All options are simple boolean flags; declare them table-style.
    flag_specs = [
        ("--enable-save", "Enable saving results"),
        (
            "--enable-git",
            "Enable git actions (fetch, merge, add, commit, and push)",
        ),
        ("--dev", "Development mode: avoid hitting API (generate fake data)"),
    ]
    for flag, help_text in flag_specs:
        parser.add_argument(flag, action="store_true", help=help_text)
    args = parser.parse_args()
    # Committing without saving makes no sense; reject the combination.
    if args.enable_git and not args.enable_save:
        parser.error("--enable-git requires --enable-save")
    return args
68+
def get_requests_session():
    """
    Build a requests.Session that retries transient HTTP failures.

    Returns:
        requests.Session: session with an HTTPAdapter (retry + backoff)
        mounted for https:// and a descriptive User-Agent header.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=10,
        status_forcelist=WIKIPEDIA_RETRY_STATUS_FORCELIST,
    )
    adapter = HTTPAdapter(max_retries=retry_policy)
    session = requests.Session()
    session.mount("https://", adapter)
    user_agent = "quantifying-wikipedia-fetch/1.0 (contact@example.com)"
    session.headers.update({"User-Agent": user_agent})
    return session
79+
80+
def write_data(args, tool_data):
    """
    Write the fetched tool rows to FILE1_COUNT as a CSV file.

    No-op unless --enable-save was given.

    Args:
        args: parsed argument namespace (reads args.enable_save).
        tool_data: list of dicts keyed by the HEADER1_COUNT column names.

    Returns:
        The (unmodified) args namespace, so main() can chain calls.
    """
    if not args.enable_save:
        return args

    # Create data directory for this phase
    os.makedirs(PATHS["data_phase"], exist_ok=True)

    # encoding="utf-8" avoids platform-dependent default encodings;
    # newline="" is required by the csv module.
    with open(FILE1_COUNT, "w", encoding="utf-8", newline="") as file_obj:
        writer = csv.DictWriter(
            file_obj, fieldnames=HEADER1_COUNT, dialect="unix"
        )
        writer.writeheader()
        writer.writerows(tool_data)
    return args
96+
97+
def query_wikipedia(args, session):
    """
    Fetch the site-wide license (rightsinfo) and article count from the
    Wikipedia API.

    Args:
        args: parsed argument namespace (reads args.dev).
        session: requests.Session configured with retries.

    Returns:
        list[dict]: rows matching HEADER1_COUNT
        (PLAN_INDEX, TOOL_IDENTIFIER, COUNT).

    Raises:
        shared.QuantifyingException: on any requests-level error.
    """
    LOGGER.info("Fetching Wikipedia rightsinfo + article count")
    tool_data = []

    try:
        if args.dev:
            # Development mode: fabricate plausible data without
            # hitting the live API.
            license_name = "Creative Commons (DEV)"
            article_count = random.randint(100000, 5000000)
        else:
            params = {
                "action": "query",
                "meta": "siteinfo",
                "siprop": "general|statistics|rightsinfo",
                "format": "json",
            }
            r = session.get(WIKI_BASE_URL, params=params, timeout=30)
            r.raise_for_status()
            data = r.json()

            stats = data["query"]["statistics"]
            rights = data["query"]["rightsinfo"]

            license_name = rights.get("text", "")
            article_count = stats.get("articles", 0)

        tool_data.append(
            {
                # Store the license name directly (the previous f-string
                # wrapper added a stray trailing space to the CSV value).
                "PLAN_INDEX": 1,
                "TOOL_IDENTIFIER": license_name,
                "COUNT": article_count,
            }
        )

        LOGGER.info(f"License: {license_name} -> Articles: {article_count}")

    except requests.RequestException as e:
        LOGGER.error(f"Request error while fetching Wikipedia rightsinfo: {e}")
        # Chain the original exception so the root cause is preserved.
        raise shared.QuantifyingException(f"Request error: {e}", 1) from e

    return tool_data
137+
def main():
    """Entry point: fetch Wikipedia data, optionally save and commit it."""
    args = parse_arguments()
    shared.paths_log(LOGGER, PATHS)
    shared.git_fetch_and_merge(args, PATHS["repo"])
    session = get_requests_session()
    tool_data = query_wikipedia(args, session)
    args = write_data(args, tool_data)
    commit_message = f"Add and commit new Wikipedia data for {QUARTER}"
    args = shared.git_add_and_commit(
        args, PATHS["repo"], PATHS["data_quarter"], commit_message
    )
    shared.git_push_changes(args, PATHS["repo"])
146+
if __name__ == "__main__":
    try:
        main()
    except shared.QuantifyingException as e:
        # Exit code 0 is an expected early-out; anything else is an error.
        log = LOGGER.info if e.exit_code == 0 else LOGGER.error
        log(e.message)
        sys.exit(e.exit_code)
    except SystemExit as e:
        if e.code != 0:
            LOGGER.error(f"System exit with code: {e.code}")
        sys.exit(e.code)
    except KeyboardInterrupt:
        LOGGER.info("(130) Halted via KeyboardInterrupt.")
        sys.exit(130)
    except Exception:
        # Pretty-print the traceback (syntax-highlighted, indented)
        # before exiting with a generic failure code.
        formatted = textwrap.indent(
            highlight(
                traceback.format_exc(),
                PythonTracebackLexer(),
                TerminalFormatter(),
            ),
            "    ",
        )
        LOGGER.critical(f"(1) Unhandled exception:\n{formatted}")
        sys.exit(1)