Skip to content

Commit 4e493fd

Browse files
authored
Merge branch 'main' into openverse-fetch
2 parents 6b98c9d + ee9d97c commit 4e493fd

4 files changed

Lines changed: 218 additions & 6 deletions

File tree

scripts/1-fetch/gcs_fetch.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,7 @@ def query_gcs(args, service, last_completed_plan_index, plan):
259259
initial_delay *= 2 # Exponential backoff
260260
else:
261261
LOGGER.error(f"Error fetching results: {e}")
262+
break
262263
if success:
263264
append_data(args, plan_row, index, count)
264265
else:

scripts/1-fetch/wikipedia_fetch.py

Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
#!/usr/bin/env python
"""
Fetch CC Legal Tool usage from Wikipedia API.
"""
# Standard library
import argparse
import csv
import os
import sys
import textwrap
import traceback

# Third-party
import requests
from pygments import highlight
from pygments.formatters import TerminalFormatter
from pygments.lexers import PythonTracebackLexer
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Add parent directory so shared can be imported
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

# First-party/Local
import shared  # noqa: E402

# Setup: module logger and project path mapping come from the shared helper
LOGGER, PATHS = shared.setup(__file__)

# Constants
# CSV output file for per-language Wikipedia article counts
FILE_LANGUAGES = os.path.join(
    PATHS["data_phase"], "wikipedia_count_by_languages.csv"
)
# Column order for the languages CSV (DictWriter fieldnames)
HEADER_LANGUAGES = [
    "LANGUAGE_CODE",
    "LANGUAGE_NAME_EN",
    "LANGUAGE_NAME",
    "COUNT",
]
# Quarter label (directory basename), used in the git commit message
QUARTER = os.path.basename(PATHS["data_quarter"])
# NOTE(review): WIKIPEDIA_BASE_URL appears unused in this script — confirm
WIKIPEDIA_BASE_URL = "https://en.wikipedia.org/w/api.php"
# Meta-wiki endpoint used to enumerate all language Wikipedias (sitematrix)
WIKIPEDIA_MATRIX_URL = "https://meta.wikimedia.org/w/api.php"
def parse_arguments():
    """
    Parse command-line options, returns parsed argument namespace.

    Errors out (via argparse) when --enable-git is requested without
    --enable-save, since git actions operate on the saved results.
    """
    LOGGER.info("Parsing command-line options")
    arg_parser = argparse.ArgumentParser(description=__doc__)
    # Both options are simple boolean flags; register them uniformly
    flag_specs = (
        ("--enable-save", "Enable saving results"),
        (
            "--enable-git",
            "Enable git actions (fetch, merge, add, commit, and push)",
        ),
    )
    for flag, help_text in flag_specs:
        arg_parser.add_argument(flag, action="store_true", help=help_text)
    parsed = arg_parser.parse_args()
    if parsed.enable_git and not parsed.enable_save:
        arg_parser.error("--enable-git requires --enable-save")
    return parsed
def get_requests_session():
    """
    Build a requests.Session configured with retry/backoff for HTTPS and
    the project User-Agent header.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=10,
        status_forcelist=shared.STATUS_FORCELIST,
    )
    adapter = HTTPAdapter(max_retries=retry_policy)
    session = requests.Session()
    # Retries apply to all HTTPS requests made through this session
    session.mount("https://", adapter)
    session.headers.update({"User-Agent": shared.USER_AGENT})
    return session
def write_data(args, tool_data):
    """
    Write the fetched language rows to FILE_LANGUAGES as CSV.

    No-op (returns args unchanged) unless --enable-save was given.
    """
    if not args.enable_save:
        return args

    LOGGER.info("Saving fetched data")
    os.makedirs(PATHS["data_phase"], exist_ok=True)

    with open(FILE_LANGUAGES, "w", newline="", encoding="utf-8") as file_obj:
        csv_writer = csv.DictWriter(
            file_obj, fieldnames=HEADER_LANGUAGES, dialect="unix"
        )
        csv_writer.writeheader()
        csv_writer.writerows(tool_data)
    return args
def query_wikipedia_languages(session):
    """
    Fetch per-language Wikipedia article counts.

    Uses the Meta-Wiki sitematrix API to enumerate all language
    Wikipedias, then queries each wiki's siteinfo statistics endpoint.

    Parameters:
        session: requests.Session used for all HTTP calls.

    Returns:
        list[dict]: rows keyed LANGUAGE_CODE, LANGUAGE_NAME,
        LANGUAGE_NAME_EN, and COUNT; wikis with 0 articles or failed
        requests are skipped (with a log message).
    """
    LOGGER.info("Fetching article counts from all language Wikipedias")
    tool_data = []

    # Gets all language wikipedias
    params = {"action": "sitematrix", "format": "json", "uselang": "en"}
    r = session.get(WIKIPEDIA_MATRIX_URL, params=params, timeout=30)
    # Fail fast with an HTTP error rather than a confusing KeyError below
    r.raise_for_status()
    data = r.json()["sitematrix"]

    languages = []
    for key, val in data.items():
        # The sitematrix mixes dict entries with scalars (e.g. "count")
        if not isinstance(val, dict):
            continue
        # Numeric keys are the per-language entries
        if key.isdigit():
            language_code = val.get("code")
            language_name = val.get("name")
            language_name_en = val.get("localname")
            for site in val.get("site", []):
                # Keep only Wikipedia sites (skip Wiktionary, Wikibooks, ...)
                if "wikipedia.org" in site["url"]:
                    languages.append(
                        {
                            "code": language_code,
                            "name": language_name,
                            "name_en": language_name_en,
                            "url": site["url"],
                        }
                    )
    # For each language wikipedia, fetch statistics.
    for site in languages:
        language_code = site["code"]
        language_name = site["name"]
        language_name_en = site["name_en"]

        # Build the display label before the network call so it is always
        # bound (and current) when referenced in the except clause below
        language_display = f"{language_code}"
        if language_name_en:
            language_display = f"{language_display} {language_name_en}"
        if language_name:
            language_display = f"{language_display} ({language_name})"

        base_url = f"{site['url']}/w/api.php"
        params = {
            "action": "query",
            "meta": "siteinfo",
            "siprop": "statistics",
            "format": "json",
        }
        try:
            r = session.get(base_url, params=params, timeout=30)
            r.raise_for_status()
            data = r.json()
            stats = data["query"]["statistics"]
            article_count = stats.get("articles", 0)

            if article_count == 0:
                LOGGER.warning(f"Skipping {language_display} with 0 articles")
                continue
            tool_data.append(
                {
                    "LANGUAGE_CODE": language_code,
                    "LANGUAGE_NAME": language_name,
                    "LANGUAGE_NAME_EN": language_name_en,
                    "COUNT": article_count,
                }
            )
            LOGGER.info(f"{language_display}: {article_count}")

        except Exception as e:
            # Best-effort: a single wiki failing must not abort the run
            LOGGER.warning(f"Failed to fetch for {language_display}: {e}")

    return tool_data
def main():
    """
    Entry point: parse options, fetch Wikipedia counts, save, and push.
    """
    args = parse_arguments()
    shared.paths_log(LOGGER, PATHS)
    repo_path = PATHS["repo"]
    shared.git_fetch_and_merge(args, repo_path)

    session = get_requests_session()
    tool_data = query_wikipedia_languages(session)
    args = write_data(args, tool_data)

    commit_message = f"Add and commit new Wikipedia data for {QUARTER}"
    args = shared.git_add_and_commit(
        args,
        repo_path,
        PATHS["data_quarter"],
        commit_message,
    )
    shared.git_push_changes(args, repo_path)
if __name__ == "__main__":
    try:
        main()
    except shared.QuantifyingException as e:
        # Exit code 0 signals an expected early stop, not an error
        log = LOGGER.info if e.exit_code == 0 else LOGGER.error
        log(e.message)
        sys.exit(e.exit_code)
    except SystemExit as e:
        if e.code != 0:
            LOGGER.error(f"System exit with code: {e.code}")
        sys.exit(e.code)
    except KeyboardInterrupt:
        LOGGER.info("(130) Halted via KeyboardInterrupt.")
        sys.exit(130)
    except Exception:
        # Colorize the traceback for terminal output, then indent it
        colored_traceback = highlight(
            traceback.format_exc(),
            PythonTracebackLexer(),
            TerminalFormatter(),
        )
        traceback_formatted = textwrap.indent(colored_traceback, "    ")
        LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
        sys.exit(1)

scripts/plot.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def annotate_ylabels(ax, data, data_label, colors):
2121
# defaults: ytick.major.size + ytick.major.pad
2222
indent = -1 * (ytick.get_tick_padding() + ytick.get_pad())
2323
for index, row in data.iterrows():
24-
if c > len(colors):
24+
if c >= len(colors):
2525
c = 0
2626

2727
# annotate totals

scripts/shared.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,17 +8,17 @@
88
from pandas import PeriodIndex
99

1010
# Constants
11-
RETRY_STATUS_FORCELIST = [
11+
STATUS_FORCELIST = [
1212
408, # Request Timeout
13-
422, # Unprocessable Content (Validation failed, or endpoint spammed)
13+
422, # Unprocessable Content (Validation failed, endpoint spammed, etc.)
1414
429, # Too Many Requests
1515
500, # Internal Server Error
1616
502, # Bad Gateway
1717
503, # Service Unavailable
1818
504, # Gateway Timeout
1919
]
2020
USER_AGENT = (
21-
"QuantifyingTheCommons/1.0"
21+
"QuantifyingTheCommons/1.0 "
2222
"(https://github.com/creativecommons/quantifying)"
2323
)
2424

@@ -235,8 +235,11 @@ def update_readme(
235235
entry_start_index = lines.index(entry_start_line)
236236
entry_end_index = lines.index(entry_end_line)
237237
# Include any trailing empty/whitespace-only lines
238-
while not lines[entry_end_index + 1].strip():
239-
entry_end_index += 1
238+
while entry_end_index + 1 < len(lines):
239+
if not lines[entry_end_index + 1].strip():
240+
entry_end_index += 1
241+
else:
242+
break
240243
# Initialize variables if entry is not present
241244
else:
242245
entry_start_index = None

0 commit comments

Comments
 (0)