77 returns a maximum of ~240 result count
88 per source-license combination, this
99 script currently provides approximate counts.
10- It does not include vide pagination or license_version
10+ It does not include pagination or license_version
1111 breakdown.
1212"""
1313
1818import sys
1919import textwrap
2020import traceback
21+ import urllib
2122
2223# Third-party
2324import requests
3637# Setup
3738LOGGER , PATHS = shared .setup (__file__ )
3839
39- LOGGER .info ("Starting Openverse Fetch Script..." )
40-
41-
4240# Constants
4341FILE_PATH = os .path .join (PATHS ["data_phase" ], "openverse_fetch.csv" )
42+ MEDIA_TYPES = ["audio" , "images" ]
43+ OPENVERSE_BASE_URL = "https://api.openverse.org/v1"
4444OPENVERSE_FIELDS = [
4545 "SOURCE" ,
4646 "MEDIA_TYPE" ,
4747 "LICENSE" ,
4848 "MEDIA_COUNT" ,
4949]
50- OPENVERSE_BASE_URL = "https://api.openverse.org/v1"
51- MEDIA_TYPES = ["audio" , "images" ]
5250
5351
5452def parse_arguments ():
@@ -89,21 +87,55 @@ def get_requests_session():
8987
9088def get_all_sources_and_licenses (session , media_type ):
9189 """
92- Fetch all available sources and licenses for a given media_type.
90+ Fetch all available sources for a given media_type.
9391 """
9492 LOGGER .info (f"Fetching all sources and licenses for { media_type } " )
95- sources = set ()
96- licenses = set ()
97- url = f"{ OPENVERSE_BASE_URL } /{ media_type } /?format=json"
93+ url = f"{ OPENVERSE_BASE_URL } /{ media_type } /stats/?format=json"
94+ # encoded_nc_sampling = urllib.parse.quote("nc-sampling+", safe="")
95+ # encoded_sampling = urllib.parse.quote("sampling+", safe="")
96+ licenses = [
97+ "by" ,
98+ "by-nc" ,
99+ "by-nc-nd" ,
100+ "by-nc-sa" ,
101+ "by-nd" ,
102+ "by-sa" ,
103+ "cc0" ,
104+ "nc-sampling+" ,
105+ "pdm" ,
106+ "sampling+" ,
107+ ]
98108 try :
99109 response = session .get (url )
100110 response .raise_for_status ()
101- records = response .json ().get ("results" , [])
102- for record in records :
103- sources .add (record .get ("source" , "" ))
104- licenses .add (record .get ("license" , "" ))
105- return list (sources ), list (licenses )
106- except requests .HTTPError as e :
111+ records = response .json ()
112+ raw_sources = sorted (
113+ [
114+ record ["source_name" ]
115+ for record in records
116+ if "source_name" in record
117+ ]
118+ )
119+ """
120+ To ensure the sources in /stats/ endpoints are indexed in
121+ Openverse's catalog.
122+ """
123+ valid_sources = set ()
124+ for source in raw_sources :
125+ new_response = session .get (
126+ f"{ OPENVERSE_BASE_URL } /{ media_type } /?"
127+ f"source={ source } &format=json"
128+ )
129+ if new_response .status_code == 200 :
130+ valid_sources .add (source )
131+ else :
132+ LOGGER .info (
133+ f"Skipping source { source } :"
134+ f" not available in /{ media_type } / endpoint"
135+ )
136+ LOGGER .info (f"Found { len (valid_sources )} sources for { media_type } " )
137+ return valid_sources , set (licenses )
138+ except (requests .HTTPError , requests .RequestException ) as e :
107139 LOGGER .error (f"Failed to fetch sources and licenses: { e } " )
108140 raise shared .QuantifyingException (
109141 f"Failed to fetch sources and licenses: { e } "
@@ -112,7 +144,9 @@ def get_all_sources_and_licenses(session, media_type):
112144
113145def query_openverse (session ):
114146 """
115- Fetch records from Openverse API.
147+ Fetch available sources given the
148+ media_type and use standard list
149+ of Openverse's standard licenses.
116150 """
117151 tally = {}
118152 for media_type in MEDIA_TYPES :
@@ -122,7 +156,9 @@ def query_openverse(session):
122156 for license in licenses :
123157 url = (
124158 f"{ OPENVERSE_BASE_URL } /{ media_type } /?"
125- f"source={ source } &license={ license } "
159+ # encode the license
160+ f"source={ source } &"
161+ f"license={ urllib .parse .quote (license , safe = '' )} "
126162 "&format=json&page=1"
127163 )
128164 LOGGER .info (f"Target URL: { url } " )
@@ -176,7 +212,9 @@ def write_data(args, data):
def main():
    """
    Run the Openverse fetch: parse arguments, query the API, write results.

    Logs a start message, passes a requests session to query_openverse(),
    logs the first returned record as a sanity check, hands the records to
    write_data(), and finally logs how many records were fetched.
    """
    args = parse_arguments()
    session = get_requests_session()
    LOGGER.info("Starting Openverse Fetch Script...")
    records = query_openverse(session)
    # Only peek at the first record when there is one: indexing an empty
    # result would raise and abort the run before write_data() and the
    # summary log below get a chance to execute.
    # NOTE(review): assumes query_openverse() returns an indexable sequence
    # (e.g. a list) -- confirm against its return statement.
    if records:
        LOGGER.info(f"CHECKING: {records[0]}")
    write_data(args, records)
    LOGGER.info(f"Fetched {len(records)} unique Openverse records")
182220
0 commit comments