@@ -8,18 +8,36 @@ In the future the same will be used to:
88- Store PyPI index query results as facts in the MODULE.bazel.lock file
99"""
1010
11- def pypi_cache (store = None ):
11+ load (":version_from_filename.bzl" , "version_from_filename" )
12+
13+ # This value should be changed whenever the storage format changes.
14+ # Changing it simply means the information cached in the lockfile has to be
15+ # recomputed.
16+ _FACT_VERSION = "v1"
17+
18+ def pypi_cache (mctx = None , store = None ):
1219 """The cache for PyPI index queries.
1320
1421 Currently the key is of the following structure:
15- (url, real_url)
22+ (url, real_url, versions)
23+
24+ Args:
25+ mctx: The module context
26+ store: The in-memory store, should implement dict interface for get and setdefault
27+
28+ Returns:
29+ A cache struct
1630 """
31+ mcache = memory_cache (store )
32+ fcache = facts_cache (getattr (mctx , "facts" , None ))
1733
1834 # buildifier: disable=uninitialized
1935 self = struct (
20- _store = store or {},
36+ _mcache = mcache ,
37+ _facts = fcache ,
2138 setdefault = lambda key , parsed_result : _pypi_cache_setdefault (self , key , parsed_result ),
2239 get = lambda key : _pypi_cache_get (self , key ),
40+ get_facts = lambda : _pypi_cache_get_facts (self ),
2341 )
2442
2543 # buildifier: enable=uninitialized
@@ -40,7 +58,14 @@ def _pypi_cache_setdefault(self, key, parsed_result):
4058 Returns:
4159 The `parse_result`.
4260 """
43- return self ._store .setdefault (key , parsed_result )
61+ index_url , real_url , versions = key
62+ self ._mcache .setdefault (real_url , parsed_result )
63+ if not versions or not self ._facts :
64+ return parsed_result
65+
66+ # Filter the packages to only what is needed before writing to the facts cache
67+ filtered = _filter_packages (parsed_result , versions )
68+ return self ._facts .setdefault (index_url , filtered )
4469
4570def _pypi_cache_get (self , key ):
4671 """Return the parsed result from the cache.
@@ -52,4 +77,204 @@ def _pypi_cache_get(self, key):
5277 Returns:
5378 The {type}`struct` or `None` based on if the result is in the cache or not.
5479 """
55- return self ._store .get (key )
80+ index_url , real_url , versions = key
81+
82+ # When retrieving from memory cache, filter down to only what is needed. If the
83+ # cache is empty, we will attempt to read from facts, however, reading from memory
84+ # first allows us to not parse the contents of the lock file that may add up.
85+ cached = _filter_packages (self ._mcache .get (real_url ), versions )
86+ if not self ._facts :
87+ return cached
88+
89+ if not cached and versions :
90+ # Could not get from in-memory, read from lockfile facts
91+ cached = self ._facts .get (index_url , versions )
92+
93+ return cached
94+
def _pypi_cache_get_facts(self):
    """Return the facts collected by the facts cache.

    Args:
        self: the cache struct, its `_facts` attribute is either `None` or a
            struct with a `facts` attribute.

    Returns:
        The `facts` attribute of the facts cache, or an empty dict if there is
        no facts cache.
    """
    return self._facts.facts if self._facts else {}
100+
def memory_cache(cache = None):
    """In-memory SimpleAPI cache used to avoid repeating index queries.

    The cache functions are keyed by the `real_url` on purpose, because that
    gives the best possible cache hit rate.

    Args:
        cache: the dict-like storage to keep values in; a fresh dict is used
            when `None` is passed.

    Returns:
        struct with 2 methods, `get` and `setdefault`, both delegating to the
        underlying storage.
    """

    # Compare against None explicitly so that an empty store passed in by the
    # caller is still the one being used.
    store = {} if cache == None else cache

    return struct(
        get = lambda real_url: store.get(real_url),
        setdefault = lambda real_url, value: store.setdefault(real_url, value),
    )
120+
def _filter_packages(dists, requested_versions):
    """Keep only the distributions that belong to the requested versions.

    Args:
        dists: `None` or a struct with `sdists` and `whls` dicts, both keyed by
            sha256 with values that have a `version` attribute.
        requested_versions: the collection of versions to keep.

    Returns:
        `dists` unchanged if it is `None` or no versions are requested, `None`
        if nothing matches, otherwise a new struct with the filtered `whls`
        and `sdists` and a `sha256s_by_version` dict of sorted sha256 lists.
    """
    if dists == None or not requested_versions:
        return dists

    kept_sdists = {}
    kept_whls = {}
    shas_for_version = {}

    # sdists are visited before whls so that the versions show up in
    # `sha256s_by_version` in the same order as before.
    for candidates, kept in [(dists.sdists, kept_sdists), (dists.whls, kept_whls)]:
        for sha256, dist in candidates.items():
            if dist.version in requested_versions:
                kept[sha256] = dist
                shas_for_version.setdefault(dist.version, []).append(sha256)

    if not (kept_whls or kept_sdists):
        # TODO @aignas 2026-03-08: add logging
        #print("WARN: no dists matched for versions {}".format(requested_versions))
        return None

    return struct(
        whls = kept_whls,
        sdists = kept_sdists,
        sha256s_by_version = {
            version: sorted(shas)
            for version, shas in shas_for_version.items()
        },
    )
156+
def facts_cache(known_facts, facts_version = _FACT_VERSION):
    """Create the cache that is backed by lock file facts.

    Values are stored as facts, so the main thing to keep in mind is that the
    real_url must not be used here in case it contains credentials (e.g. is of
    form `https://<username>:<password>@<host>`).

    Args:
        known_facts: An opaque object coming from {obj}`module_ctx.facts`.
        facts_version: {type}`str` the version of the facts schema, used for short-circuiting.

    Returns:
        `None` if `known_facts` is `None`, otherwise a struct that has:
        * `get` method for getting values from the facts cache.
        * `setdefault` method for setting values in the cache.
        * `known_facts` attribute holding the `known_facts` argument as passed in.
        * `facts` attribute that should be passed to the {obj}`module_ctx.extension_metadata` to persist facts.
    """
    if known_facts == None:
        return None

    # Starts out empty and is filled in by `get` and `setdefault` below.
    new_facts = {}

    read = lambda index_url, versions: _get_from_facts(
        new_facts,
        known_facts,
        index_url,
        versions,
        facts_version,
    )
    write = lambda url, value: _store_facts(new_facts, facts_version, url, value)

    return struct(
        get = read,
        setdefault = write,
        known_facts = known_facts,
        facts = new_facts,
    )
190+
def _get_from_facts(facts, known_facts, index_url, requested_versions, facts_version):
    """Rebuild the parsed index result for one distribution from known facts.

    Args:
        facts: {type}`dict` the facts collected in this run; a successfully
            rebuilt result is recorded in it via `_store_facts`.
        known_facts: the previously stored facts, only read through `get`.
        index_url: {type}`str` the index URL of form `<root_url>/<distribution>`,
            trailing slashes are ignored.
        requested_versions: the versions that all have to be present in the
            known facts for the result to be usable.
        facts_version: {type}`str` the only facts schema version that can be parsed.

    Returns:
        A {type}`struct` with `whls`, `sdists` and `sha256s_by_version`, or
        `None` if the known facts have a different version, contain nothing
        for the requested versions or do not cover all of them.
    """
    if known_facts.get("fact_version") != facts_version:
        # cannot trust known facts, different version that we know how to parse
        return None

    root_url, _, distribution = index_url.rstrip("/").rpartition("/")
    distribution = distribution.rstrip("/")
    root_url = root_url.rstrip("/")

    # Resolve the tables for this distribution once instead of for every dist.
    known_hashes = known_facts.get("dist_hashes", {}).get(root_url, {}).get(distribution, {})
    known_filenames = known_facts.get("dist_filenames", {}).get(root_url, {}).get(distribution, {})
    known_yanked = known_facts.get("dist_yanked", {}).get(root_url, {}).get(distribution, {})

    known_sources = {}
    retrieved_versions = {}

    for url, sha256 in known_hashes.items():
        # `_store_facts` keys `dist_filenames` by the dist URL, so that is the
        # key to read it back with. The sha256 lookup is kept only as a
        # fallback in case some facts are keyed that way.
        filename = known_filenames.get(url) or known_filenames.get(sha256)
        if not filename:
            _, _, filename = url.rpartition("/")

        version = version_from_filename(filename)
        if version not in requested_versions:
            # TODO @aignas 2026-01-21: do the check by requested shas at some point
            # We don't have sufficient info in the lock file, need to call the API
            continue

        retrieved_versions[version] = True

        if filename.endswith(".whl"):
            dists = known_sources.setdefault("whls", {})
        else:
            dists = known_sources.setdefault("sdists", {})

        known_sources.setdefault("sha256s_by_version", {}).setdefault(version, []).append(sha256)

        dists.setdefault(sha256, struct(
            sha256 = sha256,
            filename = filename,
            version = version,
            metadata_url = "",
            metadata_sha256 = "",
            url = url,
            yanked = known_yanked.get(sha256),
        ))

    if not known_sources:
        # We found nothing in facts
        return None

    # If the results are incomplete, then return None, so that we can fetch
    # sources from the internet again. Checking membership instead of comparing
    # lengths means a version requested more than once is not seen as missing.
    for version in requested_versions:
        if version not in retrieved_versions:
            return None

    output = struct(
        whls = known_sources.get("whls", {}),
        sdists = known_sources.get("sdists", {}),
        sha256s_by_version = {
            k: sorted(v)
            for k, v in known_sources.get("sha256s_by_version", {}).items()
        },
    )

    # Persist these facts for the next run because we have used them.
    return _store_facts(facts, facts_version, index_url, output)
255+
def _store_facts(facts, fact_version, index_url, value):
    """Store values as facts in the lock file.

    The main idea is to ensure that the lock file is small and it is only
    storing what we would need to fetch from the internet. Any derivative
    information we can get from this that can be achieved using pure Starlark
    functions should be done in Starlark.

    Args:
        facts: {type}`dict` the facts to add to, modified in place.
        fact_version: {type}`str` the schema version recorded under `fact_version`.
        index_url: {type}`str` the index URL of form `<root_url>/<distribution>`,
            trailing slashes are ignored.
        value: a struct with `sdists` and `whls` dicts keyed by sha256, each
            dist having `url`, `filename` and `yanked` attributes.

    Returns:
        The `value` as passed in.
    """
    if not value:
        return value

    facts["fact_version"] = fact_version

    root_url, _, distribution = index_url.rstrip("/").rpartition("/")
    distribution = distribution.rstrip("/")
    root_url = root_url.rstrip("/")

    all_dists = value.sdists | value.whls
    for sha256, dist in all_dists.items():
        # The tables are created only when there is something to put in them,
        # so that no empty entries end up in the facts.
        hashes = facts.setdefault("dist_hashes", {}).setdefault(root_url, {}).setdefault(distribution, {})
        hashes.setdefault(dist.url, sha256)

        # The filename is only recorded when it is not the tail of the URL.
        if not dist.url.endswith(dist.filename):
            filenames = facts.setdefault("dist_filenames", {}).setdefault(root_url, {}).setdefault(distribution, {})
            filenames.setdefault(dist.url, dist.filename)

        if dist.yanked != None:
            yanked = facts.setdefault("dist_yanked", {}).setdefault(root_url, {}).setdefault(distribution, {})
            yanked.setdefault(sha256, dist.yanked)

    return value
0 commit comments