Skip to content

Commit 87bc6d4

Browse files
committed
Added count by language wikipedias
1 parent 5ad95f4 commit 87bc6d4

3 files changed

Lines changed: 70 additions & 219 deletions

File tree

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -6,7 +6,7 @@ default_language_version:
66
repos:
77

88
- repo: https://github.com/pre-commit/pre-commit-hooks
9-
rev: v4.6.0
9+
rev: v6.0.0
1010
hooks:
1111
- id: check-added-large-files
1212
- id: check-ast

scripts/1-fetch/Wikipedia_fetch.py

Lines changed: 69 additions & 45 deletions
Original file line number · Diff line number · Diff line change
@@ -10,8 +10,7 @@
1010
import sys
1111
import textwrap
1212
import traceback
13-
import urllib.parse
14-
import random
13+
1514
# Third-party
1615
import requests
1716
from pygments import highlight
@@ -22,15 +21,17 @@
2221

2322
# Add parent directory so shared can be imported
2423
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
25-
WIKI_BASE_URL = "https://en.wikipedia.org/w/api.php"
2624
# First-party/Local
2725
import shared # noqa: E402
2826

2927
# Setup
3028
LOGGER, PATHS = shared.setup(__file__)
31-
FILE1_COUNT = os.path.join(PATHS["data_phase"], "wiki_1_count.csv")
32-
HEADER1_COUNT = ["PLAN_INDEX", "TOOL_IDENTIFIER", "COUNT"]
29+
FILE_LANGUAGES = os.path.join(
30+
PATHS["data_phase"], "wikipedia_count_by_languages.csv"
31+
)
32+
HEADER_LANGUAGES = ["LANGUAGE_CODE", "LANGUAGE_NAME", "COUNT"]
3333
QUARTER = os.path.basename(PATHS["data_quarter"])
34+
WIKIPEDIA_BASE_URL = "https://en.wikipedia.org/w/api.php"
3435
WIKIPEDIA_RETRY_STATUS_FORCELIST = [
3536
408, # Request Timeout
3637
422, # Unprocessable Content (Validation failed, or endpoint spammed)
@@ -41,6 +42,7 @@
4142
504, # Gateway Timeout
4243
]
4344

45+
4446
def parse_arguments():
4547
"""
4648
Parse command-line options, returns parsed argument namespace.
@@ -57,16 +59,12 @@ def parse_arguments():
5759
action="store_true",
5860
help="Enable git actions (fetch, merge, add, commit, and push)",
5961
)
60-
parser.add_argument(
61-
"--dev",
62-
action="store_true",
63-
help="Development mode: avoid hitting API (generate fake data)",
64-
)
6562
args = parser.parse_args()
6663
if not args.enable_save and args.enable_git:
6764
parser.error("--enable-git requires --enable-save")
6865
return args
6966

67+
7068
def get_requests_session():
7169
max_retries = Retry(
7270
total=5,
@@ -75,75 +73,101 @@ def get_requests_session():
7573
)
7674
session = requests.Session()
7775
session.mount("https://", HTTPAdapter(max_retries=max_retries))
78-
session.headers.update({"User-Agent": "quantifying-wikipedia-fetch/1.0 (contact@example.com)"})
76+
session.headers.update(
77+
{"User-Agent": "quantifying-wikipedia-fetch/1.0 (contact@example.com)"}
78+
)
7979
return session
8080

8181

8282
def write_data(args, tool_data):
8383
if not args.enable_save:
8484
return args
85-
86-
# Create data directory for this phase
85+
LOGGER.info("Saving fetched data")
8786
os.makedirs(PATHS["data_phase"], exist_ok=True)
8887

89-
with open(FILE1_COUNT, "w", newline="") as file_obj:
88+
with open(FILE_LANGUAGES, "w", newline="", encoding="utf-8") as file_obj:
9089
writer = csv.DictWriter(
91-
file_obj, fieldnames=HEADER1_COUNT, dialect="unix"
90+
file_obj, fieldnames=HEADER_LANGUAGES, dialect="unix"
9291
)
9392
writer.writeheader()
9493
for row in tool_data:
9594
writer.writerow(row)
9695
return args
9796

9897

99-
def query_wikipedia(args, session):
100-
LOGGER.info("Beginning to fetch results from Wikipedia API")
98+
def query_wikipedia_languages(session):
99+
LOGGER.info("Fetching article counts from all language Wikipedias")
101100
tool_data = []
102101

103-
try:
104-
if args.dev:
105-
license_name = "Creative Commons Attribution-ShareAlike 4.0"
106-
article_count = random.randint(100000, 5000000)
107-
else:
108-
params = {
109-
"action": "query",
110-
"meta": "siteinfo",
111-
"siprop": "general|statistics|rightsinfo",
112-
"format": "json",
113-
}
114-
r = session.get(WIKI_BASE_URL, params=params, timeout=30)
102+
# Get all language wikipedias
103+
site_matrix_url = "https://meta.wikimedia.org/w/api.php"
104+
params = {"action": "sitematrix", "format": "json"}
105+
r = session.get(site_matrix_url, params=params, timeout=30)
106+
data = r.json()["sitematrix"]
107+
108+
langs = []
109+
for key, val in data.items():
110+
if key.isdigit():
111+
lang_code = val.get("code")
112+
lang_name = val.get("name")
113+
for site in val.get("site", []):
114+
if "wikipedia.org" in site["url"]:
115+
langs.append(
116+
{
117+
"lang": lang_code,
118+
"name": lang_name,
119+
}
120+
)
121+
122+
# For each language wikipedia, fetch statistics.
123+
for site in langs:
124+
base_url = f"{site['url']}/w/api.php"
125+
params = {
126+
"action": "query",
127+
"meta": "siteinfo",
128+
"siprop": "statistics",
129+
"format": "json",
130+
}
131+
try:
132+
r = session.get(base_url, params=params, timeout=30)
115133
r.raise_for_status()
116134
data = r.json()
117-
118135
stats = data["query"]["statistics"]
119-
rights = data["query"]["rightsinfo"]
120136

121-
license_name = rights.get("text", "")
122137
article_count = stats.get("articles", 0)
123138

124-
tool_data.append({
125-
"PLAN_INDEX": 1,
126-
"TOOL_IDENTIFIER": f"{license_name}",
127-
"COUNT": article_count
128-
})
129-
130-
LOGGER.info(f"License: {license_name} -> Articles: {article_count}")
139+
tool_data.append(
140+
{
141+
"LANGUAGE_CODE": site["lang"],
142+
"LANGUAGE_NAME": site["name"],
143+
"COUNT": article_count,
144+
}
145+
)
146+
LOGGER.info(f"{site['lang']} ({site['name']}): {article_count}")
131147

132-
except requests.RequestException as e:
133-
LOGGER.error(f"Request error while fetching Wikipedia rightsinfo: {e}")
134-
raise shared.QuantifyingException(f"Request error: {e}", 1)
148+
except Exception as e:
149+
LOGGER.warning(
150+
f"Failed to fetch for {site['lang']} ({site['name']}): {e}"
151+
)
135152

136153
return tool_data
137154

155+
138156
def main():
139157
args = parse_arguments()
140158
shared.paths_log(LOGGER, PATHS)
141159
shared.git_fetch_and_merge(args, PATHS["repo"])
142-
tool_data = query_wikipedia(args, get_requests_session())
160+
tool_data = query_wikipedia_languages(get_requests_session())
143161
args = write_data(args, tool_data)
144-
args = shared.git_add_and_commit(args, PATHS["repo"], PATHS["data_quarter"], f"Add and commit new Wikipedia data for {QUARTER}")
162+
args = shared.git_add_and_commit(
163+
args,
164+
PATHS["repo"],
165+
PATHS["data_quarter"],
166+
f"Add and commit new Wikipedia data for {QUARTER}",
167+
)
145168
shared.git_push_changes(args, PATHS["repo"])
146169

170+
147171
if __name__ == "__main__":
148172
try:
149173
main()
@@ -170,4 +194,4 @@ def main():
170194
" ",
171195
)
172196
LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
173-
sys.exit(1)
197+
sys.exit(1)

scripts/1-fetch/wiki_fetch.py

Lines changed: 0 additions & 173 deletions
This file was deleted.

0 commit comments

Comments (0)