diff --git a/Cargo.lock b/Cargo.lock
index 0519ab1..cf86a79 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1229,6 +1229,7 @@ version = "0.4.0"
 dependencies = [
  "base64",
  "chrono",
+ "futures",
  "multistore",
  "quick-xml 0.37.5",
  "rand 0.8.5",
diff --git a/crates/core/src/middleware.rs b/crates/core/src/middleware.rs
index 3a3ab3a..00d68e6 100644
--- a/crates/core/src/middleware.rs
+++ b/crates/core/src/middleware.rs
@@ -290,7 +290,7 @@ mod tests {
             identity: &IDENTITY,
             operation: &OPERATION,
             bucket_config: Some(Cow::Borrowed(&*BUCKET_CONFIG)),
-            headers: &*HEADERS,
+            headers: &HEADERS,
             source_ip: None,
             request_id: "test-request-id",
             list_rewrite: None,
diff --git a/crates/oidc-provider/Cargo.toml b/crates/oidc-provider/Cargo.toml
index 3eeec9f..82addee 100644
--- a/crates/oidc-provider/Cargo.toml
+++ b/crates/oidc-provider/Cargo.toml
@@ -22,6 +22,8 @@ rsa.workspace = true
 sha2.workspace = true
 tracing.workspace = true
 uuid.workspace = true
+# Per-key async lock for single-flight credential refresh (see `cache.rs`).
+futures.workspace = true
 
 [dev-dependencies]
 tokio = { workspace = true, features = ["rt", "macros"] }
diff --git a/crates/oidc-provider/src/cache.rs b/crates/oidc-provider/src/cache.rs
index e684f52..6dcb5ee 100644
--- a/crates/oidc-provider/src/cache.rs
+++ b/crates/oidc-provider/src/cache.rs
@@ -1,97 +1,227 @@
-//! TTL credential cache.
+//! Credential cache with single-flight refresh.
 //!
-//! Caches [`BackendCredentials`] by key, evicting entries that are within a
-//! safety margin of expiration. This avoids redundant STS calls when the
-//! same backend is accessed repeatedly within a short window.
+//! Caches [`BackendCredentials`] by key so the proxy doesn't re-mint and
+//! re-exchange on every request. Beyond a plain TTL cache it:
+//!
+//! - **serves while fresh** — returns a cached value directly while it is
+//!   comfortably valid,
+//! - **proactively refreshes** — once a value is within [`REFRESH_LEAD_SECS`]
+//!   of expiry, the next access re-mints it, so a credential is never handed
+//!   out about to expire mid-request, and
+//! - **single-flights** — while one caller is minting for a key, concurrent
+//!   callers for that *same* key await the in-flight result instead of each
+//!   launching their own exchange. A cold-cache burst collapses to one STS call.
+//!
+//! The fetch happens through a caller-supplied closure ([`CredentialCache::get_or_fetch`]), so
+//! the cache never needs to know how credentials are minted, and a runtime can
+//! layer an additional cache tier (e.g. the Cloudflare Cache API) inside the
+//! closure. See `docs/architecture/caching.md`.
 
 use std::collections::HashMap;
+use std::future::Future;
 use std::sync::{Arc, Mutex};
 
 use chrono::{Duration, Utc};
+use futures::lock::Mutex as AsyncMutex;
 
 use crate::BackendCredentials;
 
-/// Safety margin before expiration — credentials are considered expired
-/// this many seconds before their actual `expires_at`.
-const EXPIRY_MARGIN_SECS: i64 = 60;
+/// Refresh a cached credential once it is within this many seconds of expiry,
+/// so it is never handed out about to expire mid-request.
+const REFRESH_LEAD_SECS: i64 = 60;
+
+/// One async-locked slot per key. The per-key [`AsyncMutex`] is what serializes
+/// (single-flights) refreshes; the value is shared via `Arc`.
+type Slot = Arc<AsyncMutex<Option<Arc<BackendCredentials>>>>;
 
-/// Thread-safe TTL cache for cloud credentials.
+/// Thread-safe credential cache with proactive refresh and single-flight.
 ///
-/// `Clone` shares the same underlying store (the entries map is behind an
-/// `Arc`), so a cloned [`OidcCredentialProvider`](crate::OidcCredentialProvider)
-/// keeps hitting the same cache — letting a runtime hold the provider in a
+/// `Clone` shares the same underlying store (the slot map is behind an `Arc`),
+/// so a cloned [`OidcCredentialProvider`](crate::OidcCredentialProvider) keeps
+/// hitting the same cache — letting a runtime hold the provider in a
 /// shared/`static` slot and reuse it across requests instead of re-minting and
 /// re-exchanging every time.
 #[derive(Clone, Default)]
 pub struct CredentialCache {
-    entries: Arc<Mutex<HashMap<String, Arc<BackendCredentials>>>>,
+    /// One slot per key. The outer `Mutex` only guards insertion into the map
+    /// and is never held across an `.await`; the per-key [`AsyncMutex`] inside
+    /// each [`Slot`] is what single-flights refreshes.
+    slots: Arc<Mutex<HashMap<String, Slot>>>,
 }
 
 impl CredentialCache {
     /// Create an empty credential cache.
     pub fn new() -> Self {
         Self {
-            entries: Arc::new(Mutex::new(HashMap::new())),
+            slots: Arc::new(Mutex::new(HashMap::new())),
         }
     }
 
-    /// Retrieve cached credentials if they are still valid.
-    pub fn get(&self, key: &str) -> Option<Arc<BackendCredentials>> {
-        let entries = self.entries.lock().unwrap();
-        if let Some(creds) = entries.get(key) {
-            let margin = Duration::seconds(EXPIRY_MARGIN_SECS);
-            if creds.expiration > Utc::now() + margin {
-                return Some(creds.clone());
+    /// Return cached credentials for `key` if still fresh, otherwise run `fetch`
+    /// (single-flighted) to obtain and cache new ones.
+    ///
+    /// A cached value is fresh while `now < expiration - REFRESH_LEAD_SECS`.
+    ///
+    /// Single-flight: while one caller runs `fetch` for a key, concurrent callers
+    /// for that key await the per-key lock and then reuse the value it cached.
+    /// If that `fetch` returns `Err`, nothing is cached: the error goes only to
+    /// its caller, and the next waiter runs its own `fetch` under the lock.
+    pub async fn get_or_fetch<F, Fut, E>(
+        &self,
+        key: &str,
+        fetch: F,
+    ) -> Result<Arc<BackendCredentials>, E>
+    where
+        F: FnOnce() -> Fut,
+        Fut: Future<Output = Result<Arc<BackendCredentials>, E>>,
+    {
+        let slot = self.slot(key);
+        let mut guard = slot.lock().await;
+
+        if let Some(creds) = guard.as_ref() {
+            if is_fresh(creds) {
+                return Ok(creds.clone());
             }
         }
-        None
+
+        let fresh = fetch().await?;
+        *guard = Some(fresh.clone());
+        Ok(fresh)
     }
 
-    /// Store credentials in the cache.
-    pub fn put(&self, key: String, creds: Arc<BackendCredentials>) {
-        let mut entries = self.entries.lock().unwrap();
-        entries.insert(key, creds);
+    fn slot(&self, key: &str) -> Slot {
+        self.slots
+            .lock()
+            .expect("credential cache mutex poisoned")
+            .entry(key.to_string())
+            .or_insert_with(|| Arc::new(AsyncMutex::new(None)))
+            .clone()
     }
 }
 
+/// A credential is fresh while it is more than [`REFRESH_LEAD_SECS`] from expiry.
+fn is_fresh(creds: &BackendCredentials) -> bool {
+    creds.expiration > Utc::now() + Duration::seconds(REFRESH_LEAD_SECS)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
+    use std::sync::atomic::{AtomicUsize, Ordering};
 
-    fn make_creds(expires_in_secs: i64) -> BackendCredentials {
-        BackendCredentials {
+    fn creds(expires_in_secs: i64) -> Arc<BackendCredentials> {
+        Arc::new(BackendCredentials {
             access_key_id: "AKID".into(),
             secret_access_key: "secret".into(),
             session_token: "token".into(),
             expiration: Utc::now() + Duration::seconds(expires_in_secs),
-        }
+        })
     }
 
-    #[test]
-    fn cache_returns_valid_entry() {
+    #[tokio::test]
+    async fn fetches_on_miss() {
         let cache = CredentialCache::new();
-        let creds = Arc::new(make_creds(600));
-        cache.put("role-a".into(), creds.clone());
-
-        let got = cache.get("role-a");
-        assert!(got.is_some());
-        assert_eq!(got.unwrap().access_key_id, "AKID");
+        let got = cache
+            .get_or_fetch("k", || async { Ok::<_, ()>(creds(600)) })
+            .await
+            .unwrap();
+        assert_eq!(got.access_key_id, "AKID");
     }
 
-    #[test]
-    fn cache_evicts_expired_entry() {
+    #[tokio::test]
+    async fn reuses_while_fresh() {
         let cache = CredentialCache::new();
-        // Expires in 30 seconds — within the 60-second margin
-        let creds = Arc::new(make_creds(30));
-        cache.put("role-b".into(), creds);
+        cache
+            .get_or_fetch("k", || async { Ok::<_, ()>(creds(600)) })
+            .await
+            .unwrap();
+        // Well outside the 60s refresh lead → must not re-fetch.
+        let got = cache
+            .get_or_fetch::<_, _, ()>("k", || async {
+                panic!("must not fetch while cached creds are fresh")
+            })
+            .await
+            .unwrap();
+        assert_eq!(got.access_key_id, "AKID");
+    }
 
-        let got = cache.get("role-b");
-        assert!(got.is_none());
+    #[tokio::test]
+    async fn refreshes_within_lead_window() {
+        let cache = CredentialCache::new();
+        // Expires in 30s — inside the 60s refresh lead → due for refresh.
+        cache
+            .get_or_fetch("k", || async { Ok::<_, ()>(creds(30)) })
+            .await
+            .unwrap();
+        let got = cache
+            .get_or_fetch("k", || async {
+                Ok::<_, ()>(Arc::new(BackendCredentials {
+                    access_key_id: "REFRESHED".into(),
+                    secret_access_key: "secret".into(),
+                    session_token: "token".into(),
+                    expiration: Utc::now() + Duration::hours(1),
+                }))
+            })
+            .await
+            .unwrap();
+        assert_eq!(got.access_key_id, "REFRESHED");
     }
 
-    #[test]
-    fn cache_miss_for_unknown_key() {
+    #[tokio::test]
+    async fn keys_are_isolated() {
         let cache = CredentialCache::new();
-        assert!(cache.get("unknown").is_none());
+        cache
+            .get_or_fetch("a", || async { Ok::<_, ()>(creds(600)) })
+            .await
+            .unwrap();
+        // A different key is a miss → fetches.
+        let mut fetched = false;
+        cache
+            .get_or_fetch("b", || async {
+                fetched = true;
+                Ok::<_, ()>(creds(600))
+            })
+            .await
+            .unwrap();
+        assert!(fetched);
+    }
+
+    #[tokio::test]
+    async fn single_flights_concurrent_fetches() {
+        let cache = Arc::new(CredentialCache::new());
+        let calls = Arc::new(AtomicUsize::new(0));
+
+        let one = {
+            let cache = cache.clone();
+            let calls = calls.clone();
+            async move {
+                cache
+                    .get_or_fetch("k", || async {
+                        calls.fetch_add(1, Ordering::SeqCst);
+                        // Yield while holding the per-key lock so the sibling
+                        // future contends for it — exercising single-flight.
+                        tokio::task::yield_now().await;
+                        Ok::<_, ()>(creds(600))
+                    })
+                    .await
+            }
+        };
+        let two = {
+            let cache = cache.clone();
+            let calls = calls.clone();
+            async move {
+                cache
+                    .get_or_fetch("k", || async {
+                        calls.fetch_add(1, Ordering::SeqCst);
+                        Ok::<_, ()>(creds(600))
+                    })
+                    .await
+            }
+        };
+
+        let (a, b) = tokio::join!(one, two);
+        a.unwrap();
+        b.unwrap();
+        assert_eq!(calls.load(Ordering::SeqCst), 1, "fetch should run once");
     }
 }
diff --git a/crates/oidc-provider/src/jwks.rs b/crates/oidc-provider/src/jwks.rs
index f916d2b..357e73c 100644
--- a/crates/oidc-provider/src/jwks.rs
+++ b/crates/oidc-provider/src/jwks.rs
@@ -54,7 +54,7 @@ mod tests {
         assert_eq!(key["use"], "sig");
         assert_eq!(key["kid"], "my-kid");
         assert!(key["n"].as_str().unwrap().len() > 10);
-        assert!(key["e"].as_str().unwrap().len() > 0);
+        assert!(!key["e"].as_str().unwrap().is_empty());
     }
 
     #[test]
diff --git a/crates/oidc-provider/src/lib.rs b/crates/oidc-provider/src/lib.rs
index a383a00..2555063 100644
--- a/crates/oidc-provider/src/lib.rs
+++ b/crates/oidc-provider/src/lib.rs
@@ -89,8 +89,12 @@ impl<H: HttpExchange> OidcCredentialProvider<H> {
     /// Get credentials for a backend, using cached values when available.
     ///
     /// `exchange` describes how to trade the self-signed JWT for cloud
-    /// credentials (AWS, Azure, GCP). `cache_key` identifies the backend
-    /// for caching purposes (e.g. the role ARN).
+    /// credentials (AWS, Azure, GCP). `cache_key` identifies the backend for
+    /// caching purposes (e.g. the role ARN).
+    ///
+    /// Concurrent calls for the same `cache_key` are single-flighted: only one
+    /// JWT mint + exchange runs, and the rest await its result. A cached value
+    /// is reused until it nears expiry, then proactively re-minted.
     pub async fn get_credentials<E: CredentialExchange<H>>(
         &self,
         cache_key: &str,
@@ -98,24 +102,16 @@ impl<H: HttpExchange> OidcCredentialProvider<H> {
         subject: &str,
         extra_claims: &[(&str, &str)],
     ) -> Result<Arc<BackendCredentials>, OidcProviderError> {
-        // Check cache first
-        if let Some(creds) = self.cache.get(cache_key) {
-            return Ok(creds);
-        }
-
-        // Mint a JWT
-        let token = self
-            .signer
-            .sign(subject, &self.issuer, &self.audience, extra_claims)?;
-
-        // Exchange it for cloud credentials
-        let creds: BackendCredentials = exchange.exchange(&self.http, &token).await?;
-        let creds = Arc::new(creds);
-
-        // Cache
-        self.cache.put(cache_key.to_string(), creds.clone());
-
-        Ok(creds)
+        self.cache
+            .get_or_fetch(cache_key, || async {
+                // Cache miss (or due for refresh): mint a JWT and exchange it.
+                let token =
+                    self.signer
+                        .sign(subject, &self.issuer, &self.audience, extra_claims)?;
+                let creds: BackendCredentials = exchange.exchange(&self.http, &token).await?;
+                Ok(Arc::new(creds))
+            })
+            .await
     }
 
     /// Access the underlying signer (e.g. for JWKS generation).
diff --git a/docs/.vitepress/config.ts b/docs/.vitepress/config.ts
index 2c38670..6ba8de4 100644
--- a/docs/.vitepress/config.ts
+++ b/docs/.vitepress/config.ts
@@ -75,6 +75,10 @@ const adminSidebar = [
         text: "Multi-Runtime Design",
         link: "/architecture/multi-runtime",
       },
+      {
+        text: "Caching",
+        link: "/architecture/caching",
+      },
     ],
   },
   {
diff --git a/docs/architecture/caching.md b/docs/architecture/caching.md
new file mode 100644
index 0000000..6b219fc
--- /dev/null
+++ b/docs/architecture/caching.md
@@ -0,0 +1,97 @@
+# Caching
+
+Multistore mints and fetches several kinds of short-lived data on the hot path — backend credentials, signing keys, config lookups. Re-doing that work on every request would add latency and hammer upstream services (STS, identity providers, config stores). This page covers what is cached, the credential cache, and — most importantly — how caching behaves differently on each runtime, with best practices for deploying it safely on Cloudflare Workers.
+
+## What gets cached
+
+| Cache | Crate | What it holds | Layer |
+|-------|-------|---------------|-------|
+| Credential cache | `multistore-oidc-provider` | Short-lived backend/cloud credentials, keyed by credential identity | Outbound auth |
+| JWKS cache | `multistore-sts` | Identity providers' public verification keys | Inbound auth |
+| Config provider cache | example code (`CachedProvider`) | Bucket/role/credential config lookups | Configuration |
+
+These are independent layers — they protect different upstreams. This page focuses on the **credential cache**, since it is the most performance-sensitive and the most subtle to deploy correctly across runtimes. See [Caching Providers](/configuration/providers/cached) for config-lookup caching and [Backend Auth](/auth/backend-auth) for where credentials come from.
+
+## The credential cache
+
+[`multistore-oidc-provider`](/auth/backend-auth#oidc-backend-auth) caches the cloud credentials it exchanges for in an in-memory `CredentialCache` (its `cache` module), keyed by the backend's credential identity (e.g. the IAM role ARN). It is an internal detail of the provider — callers get its benefit transparently through `OidcCredentialProvider::get_credentials`.
+
+On a miss the cache runs a caller-supplied `fetch` closure (the JWT mint + STS exchange) and stores the result:
+
+```rust
+let creds = cache
+    .get_or_fetch(role_arn, || async { mint_via_sts().await })
+    .await?;
+```
+
+It gives you three behaviours:
+
+- **Serve-while-fresh** — a cached value is returned directly while it is comfortably valid.
+- **Proactive refresh** — once a value is within its *refresh lead* (60s) of expiry, the next access re-mints it, so a credential is never handed out about to expire mid-request.
+- **Single-flight** — while one caller is minting for a key, concurrent callers for that *same* key await the in-flight result instead of each launching their own mint. This collapses a cold-cache burst into a single upstream call.
+
+Because the cache calls *your* fetch closure on a miss, you can layer additional cache tiers (e.g. the Cloudflare Cache API) *inside* the closure without the cache ever depending on a runtime — see [Layering an external tier](#layering-an-external-tier).
+
+## Runtime caveats
+
+A credential cache is only as useful as the lifetime of the thing holding it. The same `CredentialCache` behaves very differently depending on the runtime and on where you construct it.
+
+> [!IMPORTANT]
+> An in-memory cache only helps across requests if it lives in **persistent scope** (constructed once and reused), not rebuilt inside the per-request handler. If the provider holding the cache is created fresh on every request, every request starts with an empty cache and the cache does nothing.
+
+| Tier | Scope | Survives | Use for |
+|------|-------|----------|---------|
+| In-memory (`CredentialCache`) | Per-process (native) / **per-isolate** (Workers) | While the process/isolate is warm | The default; single-flight + proactive refresh |
+| Cloudflare Cache API | **Per-colo** (data center) | Isolate cold starts within a colo | Sharing mints across isolates in one location |
+| Workers KV | Global, eventually consistent | Everything (writes can take ~60s or more to propagate) | Cross-colo sharing of short-lived creds |
+| Durable Objects | Global, single owner per key | Everything | True cross-isolate single-flight |
+
+### Native (server) runtime
+
+The server runtime is a long-lived multi-threaded process. Construct the provider (and thus its `CredentialCache`) **once at startup** and share it across requests. The in-memory cache is then global to the process: one mint per credential lifetime, and single-flight collapses concurrent requests. This is the simple, fully-effective case.
+
+### Cloudflare Workers runtime
+
+Workers run in V8 **isolates**, not per-request containers. Global/module-scope state persists across requests handled by the same warm isolate — but:
+
+- The cache is **per-isolate**, and Cloudflare runs many isolates across many colos. With _N_ live isolates you get up to _N_ independent mints per credential lifetime, not one.
+- Isolates cold-start empty and are evicted under memory pressure or when idle.
+- Single-flight only collapses concurrency *within* one isolate.
+
+Even so, this is a large win: a warm isolate serving thousands of requests for the same bucket reuses one credential instead of minting per request. To get *any* cross-request benefit, hoist the provider into module scope (e.g. a `OnceCell`) rather than rebuilding it inside the `fetch` handler.
+
+For sharing beyond a single isolate, layer an external tier.
+
+## Layering an external tier
+
+The Cloudflare Cache API is **colo-local**: shared across all isolates in one data center and surviving isolate cold-starts there. It is the cheapest way to stop every fresh isolate in a busy colo from re-minting. Because `get_or_fetch` calls your closure on a miss, the external tier lives *inside* the closure — keeping the cache itself free of any runtime dependency:
+
+```text
+request
+  └─ L1: in-memory CredentialCache  (per-isolate, single-flight, proactive refresh)
+       └─ on miss, the fetch closure does:
+            L2: Cache API            (colo-local, shared across isolates in the colo)
+                 └─ on miss, origin:  STS / token exchange (mint)
+                      └─ write back to L2
+```
+
+This same shape works with Workers KV (global) as an L3, or Durable Objects when you need *global* single-flight (one DO instance per key serialises the mint across all isolates).
+
+### Best practices for an external credential cache
+
+> [!WARNING]
+> An external cache value is a usable credential at rest. Treat it as a secret.
+
+- **Use a synthetic, non-routable cache key.** Namespace it under a host you control (e.g. `https://creds.internal/v1/<hash>`) so a client can never `fetch` credentials straight out of the cache.
+- **Encrypt the stored value.** The proxy already holds a signing key; encrypting at rest means a leaked cache entry is not directly usable.
+- **Keep TTLs short** and aligned with the credential lifetime — these are already short-lived credentials; do not extend their reach.
+- **Align the external TTL with the in-memory refresh lead.** Set the external entry's `max-age` to `remaining_lifetime − refresh_lead`. Otherwise, once the in-memory layer enters its refresh window, every access reads the still-present-but-stale value back from the external tier and keeps serving a near-expiry credential, never minting a fresh one until the external entry itself expires.
+- **Write back without blocking the response** (e.g. `ctx.waitUntil(...)` on Workers) so populating the cache never adds latency.
+- **Don't rely on presence.** External caches evict early; always re-check the embedded expiry rather than trusting that a hit is fresh.
+
+## See also
+
+- [Multi-Runtime Design](/architecture/multi-runtime) — why the cache is runtime-agnostic
+- [Backend Auth](/auth/backend-auth) — what the credential cache stores and where it's minted
+- [Cloudflare Workers Deployment](/deployment/cloudflare-workers) — deploying the Workers runtime
+- [Caching Providers](/configuration/providers/cached) — caching config/credential *lookups* (a separate layer)
diff --git a/docs/auth/backend-auth.md b/docs/auth/backend-auth.md
index 59df70e..cf3d5ad 100644
--- a/docs/auth/backend-auth.md
+++ b/docs/auth/backend-auth.md
@@ -232,11 +232,14 @@ On subsequent requests, cached credentials are reused until they expire.
 When using OIDC backend auth, the proxy caches temporary credentials to avoid calling the cloud provider's STS on every request. Credentials are:
 
 - Keyed by the IAM role ARN
-- Automatically refreshed when they expire
-- Shared across concurrent requests to the same bucket
+- **Proactively refreshed** shortly before they expire, so a credential is never handed out about to expire mid-request
+- **Single-flighted** across concurrent requests to the same bucket — only one token exchange runs while the rest await its result
 
 This means the first request to an OIDC-backed bucket incurs a small latency cost for the credential exchange, but subsequent requests use cached credentials until they expire.
 
+> [!NOTE]
+> How effective this cache is depends on the runtime — an in-memory cache is per-process on the server but per-isolate on Cloudflare Workers. See [Caching](/architecture/caching) for the cross-runtime details and best practices (including layering the Cloudflare Cache API).
+
 ## Choosing Between Static and OIDC
 
 | | Static Credentials | OIDC Backend Auth |