From a074887e1ec3588fad3a80894bbf6a11a093bbc7 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 11 Jun 2026 16:13:07 +0200 Subject: [PATCH 1/2] Dataset-type HF repos in materialize_dataset_source Certified data releases can live in dataset-type repos (e.g. policyengine/populace-us); the core helper assumes model-type. Retry with repo_type=dataset before surfacing the original failure. Verified: hf://policyengine/populace-us/populace_us_2024.h5 resolves and downloads through ensure_datasets with this fallback. --- .../provenance/dataset_sources.py | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/src/policyengine/provenance/dataset_sources.py b/src/policyengine/provenance/dataset_sources.py index 27d729b2..dfbdb311 100644 --- a/src/policyengine/provenance/dataset_sources.py +++ b/src/policyengine/provenance/dataset_sources.py @@ -96,10 +96,24 @@ def materialize_dataset_source( ) reference = parse_hf_uri(dataset_source) - return download_huggingface_dataset( - reference.repo_id, - reference.path, - version=_select_version(reference.version, version), - ) + try: + return download_huggingface_dataset( + reference.repo_id, + reference.path, + version=_select_version(reference.version, version), + ) + except Exception: + # The core helper assumes a model-type repo; certified data + # releases may live in dataset-type repos (e.g. + # policyengine/populace-us). Retry with the dataset repo type + # before surfacing the original failure. + from huggingface_hub import hf_hub_download + + return hf_hub_download( + repo_id=reference.repo_id, + repo_type="dataset", + filename=reference.path, + revision=_select_version(reference.version, version), + ) return dataset_source From c5354236fb488d699612dd7a3e288169c2d75bf4 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 11 Jun 2026 17:36:45 +0200 Subject: [PATCH 2/2] Add changelog fragment --- changelog.d/populace-dataset-repo-support.added.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/populace-dataset-repo-support.added.md diff --git a/changelog.d/populace-dataset-repo-support.added.md b/changelog.d/populace-dataset-repo-support.added.md new file mode 100644 index 00000000..ed46c783 --- /dev/null +++ b/changelog.d/populace-dataset-repo-support.added.md @@ -0,0 +1 @@ +Support dataset-type Hugging Face repos in dataset materialization (retry with repo_type=dataset before surfacing the original failure).