Skip to content

Commit 1b9d968

Browse files
committed
Added Wikipedia as a data source
1 parent 19249f8 commit 1b9d968

2 files changed

Lines changed: 346 additions & 0 deletions

File tree

scripts/1-fetch/Wikipedia_fetch.py

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
#!/usr/bin/env python
2+
"""
3+
Fetch CC Legal Tool usage from Wikipedia API.
4+
"""
5+
6+
# Standard library
7+
import argparse
8+
import csv
9+
import os
10+
import sys
11+
import textwrap
12+
import traceback
13+
import random
14+
# Third-party
15+
import requests
16+
from pygments import highlight
17+
from pygments.formatters import TerminalFormatter
18+
from pygments.lexers import PythonTracebackLexer
19+
from requests.adapters import HTTPAdapter
20+
from urllib3.util.retry import Retry
21+
22+
# Add parent directory so shared can be imported
23+
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
24+
# Wikipedia MediaWiki Action API endpoint (English Wikipedia only)
WIKI_BASE_URL = "https://en.wikipedia.org/w/api.php"
# First-party/Local
import shared  # noqa: E402

# Setup
# Shared logger and canonical repo/data paths for this phase script
LOGGER, PATHS = shared.setup(__file__)
# Output CSV written by write_data()
FILE1_COUNT = os.path.join(PATHS["data_phase"], "wiki_1_count.csv")
# Column order for FILE1_COUNT rows
HEADER1_COUNT = ["PLAN_INDEX", "TOOL_IDENTIFIER", "COUNT"]
# Quarter label (basename of the data_quarter directory), used in the
# git commit message
QUARTER = os.path.basename(PATHS["data_quarter"])
# HTTP status codes treated as transient and retried by the session's
# urllib3 Retry policy (see get_requests_session)
WIKIPEDIA_RETRY_STATUS_FORCELIST = [
    408,  # Request Timeout
    422,  # Unprocessable Content (Validation failed, or endpoint spammed)
    429,  # Too Many Requests
    500,  # Internal Server Error
    502,  # Bad Gateway
    503,  # Service Unavailable
    504,  # Gateway Timeout
]
42+
43+
def parse_arguments():
    """
    Parse command-line options and return the parsed argument namespace.

    Errors out (via parser.error) when --enable-git is given without
    --enable-save, since git actions require saved results.
    """
    LOGGER.info("Parsing command-line options")
    parser = argparse.ArgumentParser(description=__doc__)
    # All options are simple boolean flags; register them uniformly.
    flag_specs = (
        ("--enable-save", "Enable saving results"),
        (
            "--enable-git",
            "Enable git actions (fetch, merge, add, commit, and push)",
        ),
        (
            "--dev",
            "Development mode: avoid hitting API (generate fake data)",
        ),
    )
    for flag, help_text in flag_specs:
        parser.add_argument(flag, action="store_true", help=help_text)
    args = parser.parse_args()
    if args.enable_git and not args.enable_save:
        parser.error("--enable-git requires --enable-save")
    return args
68+
69+
def get_requests_session():
    """
    Build a requests.Session with retry/backoff for transient HTTP
    errors and an identifying User-Agent header.
    """
    session = requests.Session()
    retry_strategy = Retry(
        total=5,
        backoff_factor=10,
        status_forcelist=WIKIPEDIA_RETRY_STATUS_FORCELIST,
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("https://", adapter)
    user_agent = "quantifying-wikipedia-fetch/1.0 (contact@example.com)"
    session.headers.update({"User-Agent": user_agent})
    return session
79+
80+
81+
def write_data(args, tool_data):
    """
    Write fetched rows to FILE1_COUNT as a unix-dialect CSV.

    No-op unless --enable-save was given. Returns the (unmodified)
    argument namespace, matching the other phase scripts' convention.
    """
    if not args.enable_save:
        return args

    # Ensure this phase's data directory exists before opening the file
    os.makedirs(PATHS["data_phase"], exist_ok=True)

    with open(FILE1_COUNT, "w", newline="") as file_obj:
        writer = csv.DictWriter(
            file_obj, fieldnames=HEADER1_COUNT, dialect="unix"
        )
        writer.writeheader()
        writer.writerows(tool_data)
    return args
96+
97+
98+
def query_wikipedia(args, session):
    """
    Fetch the site-wide license (rightsinfo) and article count
    (statistics) from the Wikipedia siteinfo API.

    Returns a single-row list of dicts keyed per HEADER1_COUNT.
    In --dev mode, fake data is generated instead of calling the API.

    Raises shared.QuantifyingException (exit code 1) on request
    failure or an unexpected API response shape.
    """
    LOGGER.info("Fetching Wikipedia rightsinfo + article count")
    tool_data = []

    try:
        if args.dev:
            license_name = "Creative Commons (DEV)"
            article_count = random.randint(100000, 5000000)
        else:
            params = {
                "action": "query",
                "meta": "siteinfo",
                "siprop": "general|statistics|rightsinfo",
                "format": "json",
            }
            r = session.get(WIKI_BASE_URL, params=params, timeout=30)
            r.raise_for_status()
            data = r.json()

            stats = data["query"]["statistics"]
            rights = data["query"]["rightsinfo"]

            license_name = rights.get("text", "")
            article_count = stats.get("articles", 0)

        tool_data.append(
            {
                "PLAN_INDEX": 1,
                "TOOL_IDENTIFIER": license_name,
                "COUNT": article_count,
            }
        )

        LOGGER.info(f"License: {license_name} -> Articles: {article_count}")

    except requests.RequestException as e:
        LOGGER.error(f"Request error while fetching Wikipedia rightsinfo: {e}")
        # Chain the original exception so the root cause is preserved
        raise shared.QuantifyingException(f"Request error: {e}", 1) from e
    except (KeyError, ValueError) as e:
        # r.json() raises ValueError on a non-JSON body; the ["query"]
        # lookups raise KeyError if the API response shape changes.
        LOGGER.error(f"Unexpected Wikipedia API response: {e}")
        raise shared.QuantifyingException(
            f"Unexpected Wikipedia API response: {e}", 1
        ) from e

    return tool_data
137+
138+
def main():
    """
    Entry point: parse options, sync the repo, fetch Wikipedia data,
    save it, then commit and push (as enabled by the flags).
    """
    args = parse_arguments()
    shared.paths_log(LOGGER, PATHS)
    shared.git_fetch_and_merge(args, PATHS["repo"])
    session = get_requests_session()
    tool_data = query_wikipedia(args, session)
    args = write_data(args, tool_data)
    commit_message = f"Add and commit new Wikipedia data for {QUARTER}"
    args = shared.git_add_and_commit(
        args, PATHS["repo"], PATHS["data_quarter"], commit_message
    )
    shared.git_push_changes(args, PATHS["repo"])
146+
147+
if __name__ == "__main__":
    try:
        main()
    except shared.QuantifyingException as e:
        # exit_code 0 means a benign early stop; anything else is an error
        log = LOGGER.info if e.exit_code == 0 else LOGGER.error
        log(e.message)
        sys.exit(e.exit_code)
    except SystemExit as e:
        if e.code != 0:
            LOGGER.error(f"System exit with code: {e.code}")
        sys.exit(e.code)
    except KeyboardInterrupt:
        LOGGER.info("(130) Halted via KeyboardInterrupt.")
        sys.exit(130)
    except Exception:
        # Syntax-highlight and indent the traceback so it stands out
        # in the logs
        traceback_formatted = textwrap.indent(
            highlight(
                traceback.format_exc(),
                PythonTracebackLexer(),
                TerminalFormatter(),
            ),
            "    ",
        )
        LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
        sys.exit(1)

scripts/1-fetch/wiki_fetch.py

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
#!/usr/bin/env python
2+
"""
3+
Fetch CC Legal Tool usage from Wikipedia API.
4+
"""
5+
6+
# Standard library
7+
import argparse
8+
import csv
9+
import os
10+
import sys
11+
import textwrap
12+
import traceback
13+
import random
14+
# Third-party
15+
import requests
16+
from pygments import highlight
17+
from pygments.formatters import TerminalFormatter
18+
from pygments.lexers import PythonTracebackLexer
19+
from requests.adapters import HTTPAdapter
20+
from urllib3.util.retry import Retry
21+
22+
# Add parent directory so shared can be imported
23+
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
24+
# Wikipedia MediaWiki Action API endpoint (English Wikipedia only)
WIKI_BASE_URL = "https://en.wikipedia.org/w/api.php"
# First-party/Local
import shared  # noqa: E402

# Setup
# Shared logger and canonical repo/data paths for this phase script
LOGGER, PATHS = shared.setup(__file__)
# Output CSV written by write_data()
FILE1_COUNT = os.path.join(PATHS["data_phase"], "wiki_1_count.csv")
# Column order for FILE1_COUNT rows
HEADER1_COUNT = ["PLAN_INDEX", "TOOL_IDENTIFIER", "COUNT"]
# Quarter label (basename of the data_quarter directory), used in the
# git commit message
QUARTER = os.path.basename(PATHS["data_quarter"])
# HTTP status codes treated as transient and retried by the session's
# urllib3 Retry policy (see get_requests_session)
WIKIPEDIA_RETRY_STATUS_FORCELIST = [
    408,  # Request Timeout
    422,  # Unprocessable Content (Validation failed, or endpoint spammed)
    429,  # Too Many Requests
    500,  # Internal Server Error
    502,  # Bad Gateway
    503,  # Service Unavailable
    504,  # Gateway Timeout
]
42+
43+
def parse_arguments():
    """
    Parse command-line options and return the parsed argument namespace.

    Errors out (via parser.error) when --enable-git is given without
    --enable-save, since git actions require saved results.
    """
    LOGGER.info("Parsing command-line options")
    parser = argparse.ArgumentParser(description=__doc__)
    # All options are simple boolean flags; register them uniformly.
    flag_specs = (
        ("--enable-save", "Enable saving results"),
        (
            "--enable-git",
            "Enable git actions (fetch, merge, add, commit, and push)",
        ),
        (
            "--dev",
            "Development mode: avoid hitting API (generate fake data)",
        ),
    )
    for flag, help_text in flag_specs:
        parser.add_argument(flag, action="store_true", help=help_text)
    args = parser.parse_args()
    if args.enable_git and not args.enable_save:
        parser.error("--enable-git requires --enable-save")
    return args
68+
69+
def get_requests_session():
    """
    Build a requests.Session with retry/backoff for transient HTTP
    errors and an identifying User-Agent header.
    """
    session = requests.Session()
    retry_strategy = Retry(
        total=5,
        backoff_factor=10,
        status_forcelist=WIKIPEDIA_RETRY_STATUS_FORCELIST,
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("https://", adapter)
    user_agent = "quantifying-wikipedia-fetch/1.0 (contact@example.com)"
    session.headers.update({"User-Agent": user_agent})
    return session
79+
80+
81+
def write_data(args, tool_data):
    """
    Write fetched rows to FILE1_COUNT as a unix-dialect CSV.

    No-op unless --enable-save was given. Returns the (unmodified)
    argument namespace, matching the other phase scripts' convention.
    """
    if not args.enable_save:
        return args

    # Ensure this phase's data directory exists before opening the file
    os.makedirs(PATHS["data_phase"], exist_ok=True)

    with open(FILE1_COUNT, "w", newline="") as file_obj:
        writer = csv.DictWriter(
            file_obj, fieldnames=HEADER1_COUNT, dialect="unix"
        )
        writer.writeheader()
        writer.writerows(tool_data)
    return args
96+
97+
98+
def query_wikipedia(args, session):
    """
    Fetch the site-wide license (rightsinfo) and article count
    (statistics) from the Wikipedia siteinfo API.

    Returns a single-row list of dicts keyed per HEADER1_COUNT.
    In --dev mode, fake data is generated instead of calling the API.

    Raises shared.QuantifyingException (exit code 1) on request
    failure or an unexpected API response shape.
    """
    LOGGER.info("Fetching Wikipedia rightsinfo + article count")
    tool_data = []

    try:
        if args.dev:
            license_name = "Creative Commons (DEV)"
            article_count = random.randint(100000, 5000000)
        else:
            params = {
                "action": "query",
                "meta": "siteinfo",
                "siprop": "general|statistics|rightsinfo",
                "format": "json",
            }
            r = session.get(WIKI_BASE_URL, params=params, timeout=30)
            r.raise_for_status()
            data = r.json()

            stats = data["query"]["statistics"]
            rights = data["query"]["rightsinfo"]

            license_name = rights.get("text", "")
            article_count = stats.get("articles", 0)

        tool_data.append(
            {
                "PLAN_INDEX": 1,
                "TOOL_IDENTIFIER": license_name,
                "COUNT": article_count,
            }
        )

        LOGGER.info(f"License: {license_name} -> Articles: {article_count}")

    except requests.RequestException as e:
        LOGGER.error(f"Request error while fetching Wikipedia rightsinfo: {e}")
        # Chain the original exception so the root cause is preserved
        raise shared.QuantifyingException(f"Request error: {e}", 1) from e
    except (KeyError, ValueError) as e:
        # r.json() raises ValueError on a non-JSON body; the ["query"]
        # lookups raise KeyError if the API response shape changes.
        LOGGER.error(f"Unexpected Wikipedia API response: {e}")
        raise shared.QuantifyingException(
            f"Unexpected Wikipedia API response: {e}", 1
        ) from e

    return tool_data
137+
138+
def main():
    """
    Entry point: parse options, sync the repo, fetch Wikipedia data,
    save it, then commit and push (as enabled by the flags).
    """
    args = parse_arguments()
    shared.paths_log(LOGGER, PATHS)
    shared.git_fetch_and_merge(args, PATHS["repo"])
    session = get_requests_session()
    tool_data = query_wikipedia(args, session)
    args = write_data(args, tool_data)
    commit_message = f"Add and commit new Wikipedia data for {QUARTER}"
    args = shared.git_add_and_commit(
        args, PATHS["repo"], PATHS["data_quarter"], commit_message
    )
    shared.git_push_changes(args, PATHS["repo"])
146+
147+
if __name__ == "__main__":
    try:
        main()
    except shared.QuantifyingException as e:
        # exit_code 0 means a benign early stop; anything else is an error
        log = LOGGER.info if e.exit_code == 0 else LOGGER.error
        log(e.message)
        sys.exit(e.exit_code)
    except SystemExit as e:
        if e.code != 0:
            LOGGER.error(f"System exit with code: {e.code}")
        sys.exit(e.code)
    except KeyboardInterrupt:
        LOGGER.info("(130) Halted via KeyboardInterrupt.")
        sys.exit(130)
    except Exception:
        # Syntax-highlight and indent the traceback so it stands out
        # in the logs
        traceback_formatted = textwrap.indent(
            highlight(
                traceback.format_exc(),
                PythonTracebackLexer(),
                TerminalFormatter(),
            ),
            "    ",
        )
        LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
        sys.exit(1)

0 commit comments

Comments
 (0)