Skip to content

Commit ef88b5c

Browse files
Tests and warnings for issue 399 (nan in table columns) (#1050)
tests and warnings for issue 399 (nan in table columns)
1 parent 65a281e commit ef88b5c

2 files changed

Lines changed: 57 additions & 0 deletions

File tree

src/spatialdata/models/models.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1061,6 +1061,21 @@ def _validate_table_annotation_metadata(self, data: AnnData) -> None:
10611061
if len(set(expected_regions).symmetric_difference(set(found_regions))) > 0:
10621062
raise ValueError(f"Regions in the AnnData object and `{attr[self.REGION_KEY_KEY]}` do not match.")
10631063

1064+
# Warning for object/string columns with NaN in region_key or instance_key
1065+
instance_key = attr[self.INSTANCE_KEY]
1066+
region_key = attr[self.REGION_KEY_KEY]
1067+
for key_name, key_value in [("region_key", region_key), ("instance_key", instance_key)]:
1068+
if key_value in data.obs:
1069+
col = data.obs[key_value]
1070+
col_dtype = col.dtype
1071+
if (col_dtype == "object" or pd.api.types.is_string_dtype(col_dtype)) and col.isna().any():
1072+
logger.warning(
1073+
f"The {key_name} column '{key_value}' is of {col_dtype} type and contains NaN values. "
1074+
"After writing and reading with AnnData, NaN values may (depending on the AnnData version) "
1075+
"be converted to strings. This may cause issues when matching instances across read/write "
1076+
"cycles."
1077+
)
1078+
10641079
def validate(
10651080
self,
10661081
data: AnnData,

tests/io/test_readwrite.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1065,3 +1065,45 @@ def test_read_sdata(tmp_path: Path, points: SpatialData) -> None:
10651065
assert_spatial_data_objects_are_identical(sdata_from_path, sdata_from_str)
10661066
assert_spatial_data_objects_are_identical(sdata_from_path, sdata_from_upath)
10671067
assert_spatial_data_objects_are_identical(sdata_from_path, sdata_from_zarr_group)
1068+
1069+
1070+
def test_sdata_with_nan_in_obs() -> None:
    """Test that a table with mixed string/NaN values in obs survives a write/read cycle.

    Regression test for https://github.com/scverse/spatialdata/issues/399
    Previously this raised TypeError: expected unicode string, found nan.
    Now the write succeeds. Depending on the AnnData version, a NaN in an
    object-dtype column either comes back as a missing value or is converted
    to the string "nan"; both outcomes are accepted here.
    """
    from spatialdata.models import TableModel

    obs = pd.DataFrame(
        {
            "region": ["region1", "region2"],
            "instance": [0, 0],
            "column_only_region1": ["string", np.nan],
            "column_only_region2": [np.nan, 3],
        }
    )
    table = TableModel.parse(
        AnnData(obs=obs),
        region_key="region",
        instance_key="instance",
        region=["region1", "region2"],
    )
    sdata = SpatialData(tables={"table": table})

    # Check for missingness with pd.isna instead of `is np.nan`: an identity
    # comparison only passes when pandas hands back the exact np.nan singleton
    # and fails for any other missing-value sentinel (pd.NA, None, another NaN).
    assert pd.isna(sdata["table"].obs["column_only_region1"].iloc[1])
    assert np.isnan(sdata["table"].obs["column_only_region2"].iloc[0])

    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, "data.zarr")
        sdata.write(path)

        sdata2 = SpatialData.read(path)
        obs_read = sdata2["table"].obs
        assert "column_only_region1" in obs_read.columns
        assert obs_read["column_only_region1"].iloc[0] == "string"
        assert obs_read["column_only_region2"].iloc[1] == 3

        # After the round-trip the NaN in the object-dtype column is either still
        # missing or has become the string "nan", depending on the AnnData version.
        # pd.isna is evaluated first because `pd.NA == "nan"` has no truth value.
        roundtripped = obs_read["column_only_region1"].iloc[1]
        assert pd.isna(roundtripped) or roundtripped == "nan"
        assert np.isnan(obs_read["column_only_region2"].iloc[0])

0 commit comments

Comments
 (0)