@@ -830,3 +830,31 @@ def test_warning_on_large_chunks():
830830 assert len (w ) == 1 , "Warning should be raised for large chunk size"
831831 assert issubclass (w [- 1 ].category , UserWarning )
832832 assert "Detected chunks larger than:" in str (w [- 1 ].message )
833+
834+
def test_categories_on_partitioned_dataframe(sdata_blobs: SpatialData):
    """Check that ``PointsModel.parse`` keeps a categorical column intact on a heavily partitioned frame.

    A random ``genes`` column is added to the blobs points, and the same data is parsed twice with
    200 partitions: once from a pandas ``DataFrame`` and once from a dask ``DataFrame`` whose column
    was cast to ``category`` after partitioning. Both results must carry the same values and the same
    set of categories as the pandas reference column.

    The final assertions pin two currently observed discrepancies of the dask path (categories dtype
    and categories order) so that a change in either one is noticed.
    """
    n_partitions = 200

    df = sdata_blobs["blobs_points"].compute()
    df["genes"] = RNG.choice([f"gene_{i} " for i in range(200)], len(df))

    # The dask frame is created while "genes" still holds plain strings; the cast to categorical is
    # applied on the already partitioned dask column.
    ddf = dd.from_pandas(df, npartitions=n_partitions)
    ddf["genes"] = ddf["genes"].astype("category")

    df["genes"] = df["genes"].astype("category")
    df_parsed = PointsModel.parse(df, npartitions=n_partitions)
    ddf_parsed = PointsModel.parse(ddf, npartitions=n_partitions)

    # Materialize each parsed column exactly once; every `.compute()` re-executes the dask graph.
    expected = df["genes"]
    genes_from_pandas = df_parsed["genes"].compute()
    genes_from_dask = ddf_parsed["genes"].compute()

    expected_categories = expected.cat.categories
    dask_categories = genes_from_dask.cat.categories

    # pandas input: values and categories are preserved exactly
    assert expected.equals(genes_from_pandas)
    assert expected_categories.equals(genes_from_pandas.cat.categories)

    # dask input: values match element-wise, categories match as a set
    assert np.array_equal(expected.to_numpy(), genes_from_dask.to_numpy())
    assert set(expected_categories.tolist()) == set(dask_categories.tolist())

    # Two behaviors to investigate later/report to dask (they originate in dask).
    # TODO: df['genes'].cat.categories has dtype 'object', while ddf_parsed['genes'].compute().cat.categories
    #  has dtype 'string'; this problem should disappear after pandas 3.0 is released.
    assert expected_categories.dtype == "object"
    assert dask_categories.dtype == "string"

    # TODO: the order of the categories is not preserved.
    assert expected_categories.tolist() != dask_categories.tolist()
0 commit comments