diff --git a/changelog.d/populace-dataset-repo-support.added.md b/changelog.d/populace-dataset-repo-support.added.md new file mode 100644 index 00000000..ed46c783 --- /dev/null +++ b/changelog.d/populace-dataset-repo-support.added.md @@ -0,0 +1 @@ +Support dataset-type Hugging Face repos in dataset materialization: if the default (model-type) download fails, the download is retried with `repo_type="dataset"` before an error is raised. diff --git a/src/policyengine/provenance/dataset_sources.py b/src/policyengine/provenance/dataset_sources.py index 27d729b2..dfbdb311 100644 --- a/src/policyengine/provenance/dataset_sources.py +++ b/src/policyengine/provenance/dataset_sources.py @@ -96,10 +96,24 @@ def materialize_dataset_source( ) reference = parse_hf_uri(dataset_source) - return download_huggingface_dataset( - reference.repo_id, - reference.path, - version=_select_version(reference.version, version), - ) + try: + return download_huggingface_dataset( + reference.repo_id, + reference.path, + version=_select_version(reference.version, version), + ) + except Exception: + # The core helper assumes a model-type repo; certified data + # releases may live in dataset-type repos (e.g. + # policyengine/populace-us). Retry with the dataset repo type + # before surfacing the original failure. + from huggingface_hub import hf_hub_download + + return hf_hub_download( + repo_id=reference.repo_id, + repo_type="dataset", + filename=reference.path, + revision=_select_version(reference.version, version), + ) return dataset_source