Skip to content

Commit 3ed161f

Browse files
aignasrickeylev
andauthored
feat(pypi): store PyPI results as facts v2 (#3654)
This PR adds the functionality needed to write the data that we find useful in the SimpleAPI responses to the lock file, i.e. the extension will no longer connect to the network if it can find the necessary information in the lock file. Supersedes #3559 Fixes #2731 --------- Co-authored-by: Richard Levasseur <rlevasseur@google.com>
1 parent 798df3f commit 3ed161f

11 files changed

Lines changed: 537 additions & 38 deletions

File tree

CHANGELOG.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,10 @@ Other changes:
7777

7878
{#v0-0-0-added}
7979
### Added
80-
* Nothing added.
80+
* (pypi) Write SimpleAPI contents to the `MODULE.bazel.lock` file if using
81+
{obj}`experimental_index_url` which should speed up consecutive initializations and should no
82+
longer require network access if the cache is hydrated.
83+
Implements [#2731](https://github.com/bazel-contrib/rules_python/issues/2731).
8184

8285
{#v1-9-0}
8386
## [1.9.0] - 2026-02-21

python/private/pypi/BUILD.bazel

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,9 @@ bzl_library(
359359
bzl_library(
360360
name = "pypi_cache_bzl",
361361
srcs = ["pypi_cache.bzl"],
362+
deps = [
363+
":version_from_filename_bzl",
364+
],
362365
)
363366

364367
bzl_library(

python/private/pypi/extension.bzl

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ You cannot use both the additive_build_content and additive_build_content_file a
225225
# dict[str repo, HubBuilder]
226226
# See `hub_builder.bzl%hub_builder()` for `HubBuilder`
227227
pip_hub_map = {}
228-
simpleapi_cache = pypi_cache()
228+
simpleapi_cache = pypi_cache(mctx = module_ctx)
229229

230230
for mod in module_ctx.modules:
231231
for pip_attr in mod.tags.parse:
@@ -293,6 +293,7 @@ You cannot use both the additive_build_content and additive_build_content_file a
293293
config = config,
294294
exposed_packages = exposed_packages,
295295
extra_aliases = extra_aliases,
296+
facts = simpleapi_cache.get_facts(),
296297
hub_group_map = hub_group_map,
297298
hub_whl_map = hub_whl_map,
298299
whl_libraries = whl_libraries,
@@ -372,7 +373,11 @@ def _pip_impl(module_ctx):
372373
module_ctx: module contents
373374
"""
374375

375-
mods = parse_modules(module_ctx, enable_pipstar = rp_config.enable_pipstar, enable_pipstar_extract = rp_config.enable_pipstar and rp_config.bazel_8_or_later)
376+
mods = parse_modules(
377+
module_ctx,
378+
enable_pipstar = rp_config.enable_pipstar,
379+
enable_pipstar_extract = rp_config.enable_pipstar and rp_config.bazel_8_or_later,
380+
)
376381

377382
# Build all of the wheel modifications if the tag class is called.
378383
_whl_mods_impl(mods.whl_mods)
@@ -394,9 +399,15 @@ def _pip_impl(module_ctx):
394399
groups = mods.hub_group_map.get(hub_name),
395400
)
396401

397-
return module_ctx.extension_metadata(
398-
reproducible = True,
399-
)
402+
# The code is smart enough not to return facts if we don't support the mechanism for that.
403+
# Hence we should not pass it to the metadata
404+
if mods.facts:
405+
return module_ctx.extension_metadata(
406+
reproducible = True,
407+
facts = mods.facts,
408+
)
409+
else:
410+
return module_ctx.extension_metadata(reproducible = True)
400411

401412
_default_attrs = {
402413
"arch_name": attr.string(

python/private/pypi/hub_builder.bzl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -395,11 +395,11 @@ def _set_get_index_urls(self, pip_attr):
395395
index_url = pip_attr.experimental_index_url,
396396
extra_index_urls = pip_attr.experimental_extra_index_urls or [],
397397
index_url_overrides = pip_attr.experimental_index_url_overrides or {},
398-
sources = [
399-
d
400-
for d in distributions
398+
sources = {
399+
d: versions
400+
for d, versions in distributions.items()
401401
if _use_downloader(self, python_version, d)
402-
],
402+
},
403403
envsubst = pip_attr.envsubst,
404404
# Auth related info
405405
netrc = pip_attr.netrc,

python/private/pypi/parse_requirements.bzl

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def parse_requirements(
5353
os, arch combinations.
5454
extra_pip_args (string list): Extra pip arguments to perform extra validations and to
5555
be joined with args found in files.
56-
get_index_urls: Callable[[ctx, list[str]], dict], a callable to get all
56+
get_index_urls: Callable[[ctx, dict[str, list[str]]], dict], a callable to get all
5757
of the distribution URLs from a PyPI index. Accepts ctx and
5858
distribution names to query.
5959
evaluate_markers: A function to use to evaluate the requirements.
@@ -170,15 +170,17 @@ def parse_requirements(
170170

171171
index_urls = {}
172172
if get_index_urls:
173+
distributions = {}
174+
for reqs in requirements_by_platform.values():
175+
for req in reqs.values():
176+
if req.srcs.url:
177+
continue
178+
179+
distributions.setdefault(req.distribution, []).append(req.srcs.version)
180+
173181
index_urls = get_index_urls(
174182
ctx,
175-
# Use list({}) as a way to have a set
176-
list({
177-
req.distribution: None
178-
for reqs in requirements_by_platform.values()
179-
for req in reqs.values()
180-
if not req.srcs.url
181-
}),
183+
distributions,
182184
)
183185

184186
ret = []

python/private/pypi/pypi_cache.bzl

Lines changed: 230 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,36 @@ In the future the same will be used to:
88
- Store PyPI index query results as facts in the MODULE.bazel.lock file
99
"""
1010

11-
def pypi_cache(store = None):
11+
load(":version_from_filename.bzl", "version_from_filename")
12+
13+
# This value should be changed whenever the storage format changes.
14+
# Changing it simply means the information cached in the lockfile has to be
15+
# recomputed.
16+
_FACT_VERSION = "v1"
17+
18+
def pypi_cache(mctx = None, store = None):
1219
"""The cache for PyPI index queries.
1320
1421
Currently the key is of the following structure:
15-
(url, real_url)
22+
(url, real_url, versions)
23+
24+
Args:
25+
mctx: The module context
26+
store: The in-memory store, should implement dict interface for get and setdefault
27+
28+
Returns:
29+
A cache struct
1630
"""
31+
mcache = memory_cache(store)
32+
fcache = facts_cache(getattr(mctx, "facts", None))
1733

1834
# buildifier: disable=uninitialized
1935
self = struct(
20-
_store = store or {},
36+
_mcache = mcache,
37+
_facts = fcache,
2138
setdefault = lambda key, parsed_result: _pypi_cache_setdefault(self, key, parsed_result),
2239
get = lambda key: _pypi_cache_get(self, key),
40+
get_facts = lambda: _pypi_cache_get_facts(self),
2341
)
2442

2543
# buildifier: enable=uninitialized
@@ -40,7 +58,14 @@ def _pypi_cache_setdefault(self, key, parsed_result):
4058
Returns:
4159
The `parse_result`.
4260
"""
43-
return self._store.setdefault(key, parsed_result)
61+
index_url, real_url, versions = key
62+
self._mcache.setdefault(real_url, parsed_result)
63+
if not versions or not self._facts:
64+
return parsed_result
65+
66+
# Filter the packages to only what is needed before writing to the facts cache
67+
filtered = _filter_packages(parsed_result, versions)
68+
return self._facts.setdefault(index_url, filtered)
4469

4570
def _pypi_cache_get(self, key):
4671
"""Return the parsed result from the cache.
@@ -52,4 +77,204 @@ def _pypi_cache_get(self, key):
5277
Returns:
5378
The {type}`struct` or `None` based on if the result is in the cache or not.
5479
"""
55-
return self._store.get(key)
80+
index_url, real_url, versions = key
81+
82+
# When retrieving from memory cache, filter down to only what is needed. If the
83+
# cache is empty, we will attempt to read from facts, however, reading from memory
84+
# first allows us to not parse the contents of the lock file that may add up.
85+
cached = _filter_packages(self._mcache.get(real_url), versions)
86+
if not self._facts:
87+
return cached
88+
89+
if not cached and versions:
90+
# Could not get from in-memory, read from lockfile facts
91+
cached = self._facts.get(index_url, versions)
92+
93+
return cached
94+
95+
def _pypi_cache_get_facts(self):
    """Return the facts collected so far so that they can be persisted.

    Args:
        self: the cache struct; its `_facts` attribute is either a facts cache
            or a falsy value when no facts cache is available.

    Returns:
        The `facts` dict of the facts cache, or an empty dict when there is no
        facts cache.
    """
    fcache = self._facts
    return fcache.facts if fcache else {}
100+
101+
def memory_cache(cache = None):
    """In-memory SimpleAPI cache used for making fewer calls.

    The entries are keyed by the `real_url` on purpose in order to get the
    best possible cache hits.

    Args:
        cache: the dict to store things in memory; a fresh dict is created
            when `None` is passed.

    Returns:
        struct with 2 methods, `get` and `setdefault`, both delegating to the
        underlying dict.
    """
    store = {} if cache == None else cache

    return struct(
        setdefault = lambda real_url, value: store.setdefault(real_url, value),
        get = lambda real_url: store.get(real_url),
    )
120+
121+
def _filter_packages(dists, requested_versions):
    """Narrow a parsed index result down to the requested versions.

    Args:
        dists: struct with `sdists` and `whls` dicts keyed by sha256 whose
            values have a `version` attribute, or `None`.
        requested_versions: the versions to keep. When empty, no filtering
            is done.

    Returns:
        `dists` as-is when it is `None` or when no versions are requested,
        `None` when no distribution matches the requested versions, otherwise
        a new struct with the `whls`, `sdists` and `sha256s_by_version` fields
        containing only the matching distributions.
    """
    if not requested_versions or dists == None:
        return dists

    kept_sdists = {
        sha256: dist
        for sha256, dist in dists.sdists.items()
        if dist.version in requested_versions
    }
    kept_whls = {
        sha256: dist
        for sha256, dist in dists.whls.items()
        if dist.version in requested_versions
    }

    if not kept_sdists and not kept_whls:
        # TODO @aignas 2026-03-08: add logging to warn that no dists matched
        # the requested versions.
        return None

    # Group the hashes by version, visiting the sdists before the wheels so
    # that the versions are first seen in the same order as before.
    hashes = {}
    for kept in (kept_sdists, kept_whls):
        for sha256, dist in kept.items():
            hashes.setdefault(dist.version, []).append(sha256)

    return struct(
        whls = kept_whls,
        sdists = kept_sdists,
        sha256s_by_version = {
            version: sorted(shas)
            for version, shas in hashes.items()
        },
    )
156+
157+
def facts_cache(known_facts, facts_version = _FACT_VERSION):
    """Create the cache that is backed by the lock file facts.

    The main thing to keep in mind is that facts should never be keyed by the
    `real_url`, because it may contain credentials (e.g. be of the form
    `https://<username>:<password>@<host>`).

    Args:
        known_facts: An opaque object coming from {obj}`module_ctx.facts`.
        facts_version: {type}`str` the version of the facts schema, used for
            short-circuiting.

    Returns:
        `None` if `known_facts` is `None`, otherwise a struct that has:
        * `get` method for getting values from the facts cache.
        * `setdefault` method for setting values in the cache.
        * `known_facts` attribute holding the value that was passed in.
        * `facts` attribute that should be passed to the
          {obj}`module_ctx.extension_metadata` to persist facts.
    """
    if known_facts == None:
        return None

    # The facts written during this run; shared by both closures below.
    new_facts = {}

    return struct(
        known_facts = known_facts,
        facts = new_facts,
        setdefault = lambda url, value: _store_facts(new_facts, facts_version, url, value),
        get = lambda index_url, versions: _get_from_facts(
            new_facts,
            known_facts,
            index_url,
            versions,
            facts_version,
        ),
    )
190+
191+
def _get_from_facts(facts, known_facts, index_url, requested_versions, facts_version):
    """Rebuild the parsed index result of a distribution from the lock file facts.

    Args:
        facts: {type}`dict` the facts to persist for the next run; it is
            updated when the lookup succeeds.
        known_facts: the facts read from the lock file, accessed through the
            dict `get` interface.
        index_url: {type}`str` the index URL of the distribution, i.e.
            `<root_url>/<distribution>`, with or without a trailing slash.
        requested_versions: the versions that are needed; duplicates are
            allowed.
        facts_version: {type}`str` the facts schema version that we know how
            to parse.

    Returns:
        A struct with `whls`, `sdists` and `sha256s_by_version` fields, or
        `None` if the facts have a different schema version or do not cover
        every requested version, in which case the index has to be queried.
    """
    if known_facts.get("fact_version") != facts_version:
        # Cannot trust the known facts, they have a different version than
        # the one we know how to parse.
        return None

    root_url, _, distribution = index_url.rstrip("/").rpartition("/")
    distribution = distribution.rstrip("/")
    root_url = root_url.rstrip("/")

    # These lookups do not depend on the loop variable, so do them only once.
    dist_hashes = known_facts.get("dist_hashes", {}).get(root_url, {}).get(distribution, {})
    dist_filenames = known_facts.get("dist_filenames", {}).get(root_url, {}).get(distribution, {})
    dist_yanked = known_facts.get("dist_yanked", {}).get(root_url, {}).get(distribution, {})

    # The same version may be requested more than once (e.g. once per target
    # platform), so deduplicate before checking the completeness of the result.
    wanted_versions = {version: None for version in requested_versions}

    whls = {}
    sdists = {}
    sha256s_by_version = {}

    for url, sha256 in dist_hashes.items():
        # `_store_facts` keys the `dist_filenames` by the URL of the
        # distribution, hence the lookup has to be done by the URL as well.
        filename = dist_filenames.get(url)
        if not filename:
            _, _, filename = url.rpartition("/")

        version = version_from_filename(filename)
        if version not in wanted_versions:
            # TODO @aignas 2026-01-21: do the check by requested shas at some point
            continue

        sha256s_by_version.setdefault(version, []).append(sha256)

        dists = whls if filename.endswith(".whl") else sdists
        dists.setdefault(sha256, struct(
            sha256 = sha256,
            filename = filename,
            version = version,
            metadata_url = "",
            metadata_sha256 = "",
            url = url,
            yanked = dist_yanked.get(sha256),
        ))

    if not sha256s_by_version:
        # We found nothing in facts
        return None

    if len(wanted_versions) != len(sha256s_by_version):
        # If the results are incomplete, then return None, so that we can fetch sources from the
        # internet again.
        return None

    output = struct(
        whls = whls,
        sdists = sdists,
        sha256s_by_version = {
            version: sorted(shas)
            for version, shas in sha256s_by_version.items()
        },
    )

    # Persist these facts for the next run because we have used them.
    return _store_facts(facts, facts_version, index_url, output)
255+
256+
def _store_facts(facts, fact_version, index_url, value):
    """Record the parsed index result as facts for the lock file.

    The lock file should stay small and only contain what would otherwise
    have to be fetched from the internet. Anything that can be derived from
    the stored values with pure Starlark functions is recomputed when reading
    the facts back.

    Args:
        facts: {type}`dict` the facts to update in place.
        fact_version: {type}`str` the schema version to stamp the facts with.
        index_url: {type}`str` the index URL of the distribution, i.e.
            `<root_url>/<distribution>`, with or without a trailing slash.
        value: struct with `sdists` and `whls` dicts keyed by sha256, or a
            falsy value, in which case nothing is stored.

    Returns:
        The `value` as passed in.
    """
    if not value:
        return value

    facts["fact_version"] = fact_version

    root_url, _, distribution = index_url.rstrip("/").rpartition("/")
    distribution = distribution.rstrip("/")
    root_url = root_url.rstrip("/")

    for sha256, dist in (value.sdists | value.whls).items():
        hashes = facts.setdefault("dist_hashes", {}).setdefault(root_url, {}).setdefault(distribution, {})
        hashes.setdefault(dist.url, sha256)

        # The filename is only stored when it cannot be derived from the URL.
        if not dist.url.endswith(dist.filename):
            filenames = facts.setdefault("dist_filenames", {}).setdefault(root_url, {}).setdefault(distribution, {})
            filenames.setdefault(dist.url, dist.filename)

        # Only store the yanked value when the index has provided one.
        if dist.yanked != None:
            yanked = facts.setdefault("dist_yanked", {}).setdefault(root_url, {}).setdefault(distribution, {})
            yanked.setdefault(sha256, dist.yanked)

    return value

0 commit comments

Comments
 (0)