Skip to content

Commit d6b35f2

Browse files
committed
fetch sources from /stats/ endpoint
1 parent e6e552c commit d6b35f2

1 file changed

Lines changed: 56 additions & 18 deletions

File tree

scripts/1-fetch/openverse_fetch.py

Lines changed: 56 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
returns a maximum of ~240 result count
88
per source-license combination, this
99
script currently provides approximate counts.
10-
It does not include vide pagination or license_version
10+
It does not include pagination or license_version
1111
breakdown.
1212
"""
1313

@@ -18,6 +18,7 @@
1818
import sys
1919
import textwrap
2020
import traceback
21+
import urllib
2122

2223
# Third-party
2324
import requests
@@ -36,19 +37,16 @@
3637
# Setup
3738
LOGGER, PATHS = shared.setup(__file__)
3839

39-
LOGGER.info("Starting Openverse Fetch Script...")
40-
41-
4240
# Constants
4341
FILE_PATH = os.path.join(PATHS["data_phase"], "openverse_fetch.csv")
42+
MEDIA_TYPES = ["audio", "images"]
43+
OPENVERSE_BASE_URL = "https://api.openverse.org/v1"
4444
OPENVERSE_FIELDS = [
4545
"SOURCE",
4646
"MEDIA_TYPE",
4747
"LICENSE",
4848
"MEDIA_COUNT",
4949
]
50-
OPENVERSE_BASE_URL = "https://api.openverse.org/v1"
51-
MEDIA_TYPES = ["audio", "images"]
5250

5351

5452
def parse_arguments():
@@ -89,21 +87,55 @@ def get_requests_session():
8987

9088
def get_all_sources_and_licenses(session, media_type):
9189
"""
92-
Fetch all available sources and licenses for a given media_type.
90+
Fetch all available sources for a given media_type.
9391
"""
9492
LOGGER.info(f"Fetching all sources and licenses for {media_type}")
95-
sources = set()
96-
licenses = set()
97-
url = f"{OPENVERSE_BASE_URL}/{media_type}/?format=json"
93+
url = f"{OPENVERSE_BASE_URL}/{media_type}/stats/?format=json"
94+
# encoded_nc_sampling = urllib.parse.quote("nc-sampling+", safe="")
95+
# encoded_sampling = urllib.parse.quote("sampling+", safe="")
96+
licenses = [
97+
"by",
98+
"by-nc",
99+
"by-nc-nd",
100+
"by-nc-sa",
101+
"by-nd",
102+
"by-sa",
103+
"cc0",
104+
"nc-sampling+",
105+
"pdm",
106+
"sampling+",
107+
]
98108
try:
99109
response = session.get(url)
100110
response.raise_for_status()
101-
records = response.json().get("results", [])
102-
for record in records:
103-
sources.add(record.get("source", ""))
104-
licenses.add(record.get("license", ""))
105-
return list(sources), list(licenses)
106-
except requests.HTTPError as e:
111+
records = response.json()
112+
raw_sources = sorted(
113+
[
114+
record["source_name"]
115+
for record in records
116+
if "source_name" in record
117+
]
118+
)
119+
"""
120+
Verify that each source listed by the /stats/ endpoint is
121+
actually indexed in Openverse's catalog before using it.
122+
"""
123+
valid_sources = set()
124+
for source in raw_sources:
125+
new_response = session.get(
126+
f"{OPENVERSE_BASE_URL}/{media_type}/?"
127+
f"source={source}&format=json"
128+
)
129+
if new_response.status_code == 200:
130+
valid_sources.add(source)
131+
else:
132+
LOGGER.info(
133+
f"Skipping source {source}:"
134+
f" not available in /{media_type}/ endpoint"
135+
)
136+
LOGGER.info(f"Found {len(valid_sources)} sources for {media_type}")
137+
return valid_sources, set(licenses)
138+
except (requests.HTTPError, requests.RequestException) as e:
107139
LOGGER.error(f"Failed to fetch sources and licenses: {e}")
108140
raise shared.QuantifyingException(
109141
f"Failed to fetch sources and licenses: {e}"
@@ -112,7 +144,9 @@ def get_all_sources_and_licenses(session, media_type):
112144

113145
def query_openverse(session):
114146
"""
115-
Fetch records from Openverse API.
147+
Fetch the available sources for each
148+
media_type and query each one against
149+
Openverse's standard list of licenses.
116150
"""
117151
tally = {}
118152
for media_type in MEDIA_TYPES:
@@ -122,7 +156,9 @@ def query_openverse(session):
122156
for license in licenses:
123157
url = (
124158
f"{OPENVERSE_BASE_URL}/{media_type}/?"
125-
f"source={source}&license={license}"
159+
# encode the license
160+
f"source={source}&"
161+
f"license={urllib.parse.quote(license, safe='')}"
126162
"&format=json&page=1"
127163
)
128164
LOGGER.info(f"Target URL: {url}")
@@ -176,7 +212,9 @@ def write_data(args, data):
176212
def main():
177213
args = parse_arguments()
178214
session = get_requests_session()
215+
LOGGER.info("Starting Openverse Fetch Script...")
179216
records = query_openverse(session)
217+
LOGGER.info(f"CHECKING: {records[0]}")
180218
write_data(args, records)
181219
LOGGER.info(f"Fetched {len(records)} unique Openverse records")
182220

0 commit comments

Comments
 (0)