Skip to content

Commit b06b7af

Browse files
committed
removed media with result_count=0
1 parent 1623c6a commit b06b7af

1 file changed

Lines changed: 37 additions & 24 deletions

File tree

scripts/1-fetch/openverse_fetch.py

Lines changed: 37 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@
4444
OPENVERSE_FIELDS = [
4545
"SOURCE",
4646
"MEDIA_TYPE",
47-
"LICENSE",
47+
"TOOL_IDENTIFIER",
4848
"MEDIA_COUNT",
4949
]
5050

@@ -92,7 +92,7 @@ def get_all_sources_and_licenses(session, media_type):
9292
LOGGER.info(f"Fetching all sources for {media_type}")
9393
url = f"{OPENVERSE_BASE_URL}/{media_type}/stats/?format=json"
9494
# Standard /stats/ license
95-
licenses = [
95+
OPENVERSE_LEGAL_TOOLS = [
9696
"by",
9797
"by-nc",
9898
"by-nc-nd",
@@ -128,16 +128,15 @@ def get_all_sources_and_licenses(session, media_type):
128128
if new_response.status_code == 200:
129129
valid_sources.add(source)
130130
else:
131-
LOGGER.info(
131+
LOGGER.warning(
132132
f"Skipping source {source}: "
133133
f"not available in /{media_type}/ endpoint"
134134
)
135135
LOGGER.info(f"Found {len(valid_sources)} sources for {media_type}")
136-
return valid_sources, set(licenses)
136+
return valid_sources, set(OPENVERSE_LEGAL_TOOLS)
137137
except (requests.HTTPError, requests.RequestException) as e:
138-
LOGGER.error(f"Failed to fetch sources and licenses: {e}")
139138
raise shared.QuantifyingException(
140-
f"Failed to fetch sources and licenses: {e}"
139+
f"Failed to fetch sources and licenses: {e}", exit_code=1
141140
)
142141

143142

@@ -152,11 +151,12 @@ def query_openverse(session):
152151
sources, licenses = get_all_sources_and_licenses(session, media_type)
153152
for source in sources:
154153
for license in licenses:
154+
# encode the license to escape '+', e.g. sampling+
155+
encoded_license = urllib.parse.quote(license, safe="")
155156
url = (
156157
f"{OPENVERSE_BASE_URL}/{media_type}/?"
157-
# encode the license
158158
f"source={source}&"
159-
f"license={urllib.parse.quote(license, safe='')}"
159+
f"license={encoded_license}"
160160
"&format=json&page=1"
161161
)
162162
LOGGER.info(f"Target URL: {url}")
@@ -171,26 +171,39 @@ def query_openverse(session):
171171
response.raise_for_status()
172172
data = response.json()
173173
count = data.get("result_count", 0)
174-
key = (source, media_type, license)
175-
tally[key] = count
174+
# Skip (source x license) with result_count = 0
175+
if count > 0:
176+
key = (source, media_type, license)
177+
tally[key] = count
178+
else:
179+
LOGGER.warning(
180+
f"Skipping {source}, {license}: count is 0"
181+
)
176182
except (requests.HTTPError, requests.RequestException) as e:
177-
LOGGER.error(f"Openverse fetch failed: {e}")
178183
raise shared.QuantifyingException(
179-
f"Openverse fetch failed: {e}"
184+
f"Openverse fetch failed: {e}", exit_code=1
180185
)
181-
# Convert tally dictionary to a list of dicts for writing
182186
LOGGER.info("Aggregating the data")
183-
aggregate = [
184-
{
185-
OPENVERSE_FIELDS[0].lower(): field[0], # SOURCE
186-
OPENVERSE_FIELDS[1].lower(): field[1], # MEDIA_TYPE
187-
OPENVERSE_FIELDS[2].lower(): (
188-
f"{'cc ' + field[2] if field[2] not in ['pdm', 'cc0'] else field[2]}" # noqa: E501
189-
), # LICENSE
190-
OPENVERSE_FIELDS[3].lower(): count, # MEDIA_COUNT
191-
}
192-
for field, count in tally.items()
193-
]
187+
aggregate = []
188+
for field, count in tally.items():
189+
source_name = field[0]
190+
media_type_name = field[1]
191+
license_code = field[2]
192+
# Prepend the "cc " prefix, except for 'pdm' and 'cc0'
193+
if license_code not in ["pdm", "cc0"]:
194+
tool_identifier = f"cc {license_code}"
195+
else:
196+
tool_identifier = license_code
197+
aggregate.append(
198+
{
199+
OPENVERSE_FIELDS[0].lower(): source_name, # SOURCE
200+
OPENVERSE_FIELDS[1].lower(): media_type_name, # MEDIA_TYPE
201+
OPENVERSE_FIELDS[
202+
2
203+
].lower(): tool_identifier, # LEGAL_TOOL_IDENTIFIER
204+
OPENVERSE_FIELDS[3].lower(): count, # MEDIA_COUNT
205+
}
206+
)
194207
return aggregate
195208

196209

0 commit comments

Comments
 (0)