Skip to content

Commit 7775ae1

Browse files
committed
reorganized functions, added new function to prepare data for 2d histogram plotting
1 parent ce0e0c0 commit 7775ae1

2 files changed

Lines changed: 211 additions & 134 deletions

File tree

coderdata/utils/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1-
from .stats import summarize_response_metric
1+
from .stats import summarize_response_metric
2+
from .stats import prepare_2d_hist_data

coderdata/utils/stats.py

Lines changed: 209 additions & 133 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,126 @@
1616
import seaborn as sns
1717

1818

19+
def plot_2d_respones_metric(
        data: DatasetLoader,
        metric1: str,
        metric2: str,
        **kwargs: dict
        ) -> None:
    """
    Draws a 2D histogram (joint plot) of two dose response metrics.

    The `experiments` table of the dataset is reshaped via
    `prepare_2d_hist_data` so that each requested metric becomes its own
    column. The result is passed to `seaborn.jointplot` with
    ``kind="hist"``; `metric2` is plotted on the x-axis and `metric1`
    on the y-axis.

    Parameters
    ----------
    data : DatasetLoader
        A full CoderData object of a dataset. Its `experiments`
        attribute is used as the data source.
    metric1 : str
        Name of the dose response metric shown on the y-axis.
    metric2 : str
        Name of the dose response metric shown on the x-axis.
    **kwargs : dict, optional
        Additional keyword arguments that can be passed to the function
        - joint_bins : int, default=50 - number of bins of the joint
          (2D) histogram
        - marginal_bins : int, default=50 - number of bins of the
          marginal histograms

    Returns
    -------
    None
    """

    # `prepare_2d_hist_data` groups a DataFrame by metric and expects
    # the metric names as a list, hence the `experiments` table is
    # handed over and both metrics are bundled. `dict.fromkeys` removes
    # a duplicate (metric1 == metric2) while preserving order, which
    # avoids suffixed columns from merging a metric with itself.
    data_plot = prepare_2d_hist_data(
        data=data.experiments,
        metrics=list(dict.fromkeys([metric1, metric2])),
        )

    # `dict.get` only accepts its default positionally
    joint_bins = kwargs.get('joint_bins', 50)
    marginal_bins = kwargs.get('marginal_bins', 50)

    sns.jointplot(
        data=data_plot,
        x=metric2,
        y=metric1,
        kind="hist",
        joint_kws=dict(bins=joint_bins),
        marginal_kws=dict(bins=marginal_bins)
        )
43+
44+
def plot_response_metric(
        data: DatasetLoader,
        metric: str='auc',
        ax: Axes=None,
        **kwargs: dict
        ) -> None:
    """
    Plots the distribution of one dose response metric as a histogram.

    The values of the chosen metric are taken from the `experiments`
    table of the dataset and drawn with `seaborn.histplot`. When an
    axes object created via `matplotlib.pyplot.subplot` or
    `matplotlib.pyplot.subplots` is passed in, the histogram is drawn
    onto that axes object.

    Parameters
    ----------
    data : DatasetLoader
        A full CoderData object of a dataset
    metric : str, default='auc'
        Name of the dose response metric that should be plotted
    ax : matplotlib.axes.Axes, default=None
        Optional `Axes` object to draw on. Useful when a multipanel
        figure has been set up beforehand with
        `matplotlib.pyplot.subplots`; the panel at the given location
        is then populated with the generated plot.
    **kwargs : dict, optional
        Additional keyword arguments that can be passed to the function
        - bins : int, default=10 - sets the number of bins; passed to
          `seaborn.histplot`
        - title : str, default=None - sets the title of the axes
        - kde : bool, default=False - adds a kernel density estimate
          plot into the histogram

    Returns
    -------
    None

    Example
    -------
    In a Jupyter Notebook environment the following snippet can be used
    to display a histogram detailing the distribution of drug response
    AUC measures in the beataml dataset.

    >>> import coderdata as cd
    >>> beataml = cd.DataLoader('beataml')
    >>> cd.plot_response_metric(data=beataml, metric='auc', bins=10)

    For generating multipanel plots we can make use of matplotlib and
    the `ax` parameter of this function. Furthermore, other features /
    parameters of the created figure can be changed (e.g. the title of
    the figure via `suptitle()`). Finally it can be saved.

    >>> import coderdata as cd
    >>> import matplotlib.pyplot as plt
    >>> beataml = cd.DataLoader('beataml')
    >>> fig, axs = plt.subplots(ncols=2, figsize=(10, 5))
    >>> plot_response_metric(
    ...     data=beataml,
    ...     metric='auc',
    ...     bins=10,
    ...     ax=axs[0]
    ... )
    >>> plot_response_metric(
    ...     data=beataml,
    ...     metric='aac',
    ...     bins=10,
    ...     ax=axs[1]
    ... )
    >>> fig.set_layout_engine('tight')
    >>> fig.suptitle('Distribution of drug response values')
    >>> fig.savefig('figure.png')
    """

    # optional plot settings with their fallbacks
    n_bins = kwargs.get('bins', 10)
    axes_title = kwargs.get('title', None)
    show_kde = kwargs.get('kde', False)

    # values of the requested metric from the experiments table
    by_metric = data.experiments.groupby('dose_response_metric')
    values = by_metric.get_group(metric)['dose_response_value']

    sns.set_theme(palette='colorblind')
    hist_ax = sns.histplot(data=values, kde=show_kde, bins=n_bins, ax=ax)
    hist_ax.set_xlabel(metric)
    hist_ax.set_title(axes_title)
138+
19139
def split_experiments_by_study(data: DatasetLoader) -> dict:
20140
"""
21141
Splits the CoderData object into multiple smaller CoderData objects
@@ -58,6 +178,53 @@ def split_experiments_by_study(data: DatasetLoader) -> dict:
58178
return df_ret
59179

60180

181+
def summarize_response_metric(data: DatasetLoader) -> pd.DataFrame:
    """
    Computes summary statistics of the dose response values, one row
    per dose response metric.

    The `experiments` table of the CoderData object is grouped by
    `dose_response_metric` and `describe()` is applied to
    `dose_response_value`, yielding count, mean, standard deviation,
    minimum, 25-, 50- and 75-percentile as well as maximum for every
    metric present in `experiments`.

    Parameters
    ----------
    data : coderdata.DatasetLoader
        A full CoderData object of a dataset

    Returns
    -------
    pandas.DataFrame
        A `pandas.DataFrame` indexed by dose response metric that
        contains the basic statistics listed above.

    Example
    -------

    The Example assumes that a dataset with the prefix 'beataml' has
    been downloaded previously. See also ``coderdata.download()``

    >>> import coderdata as cd
    >>> beataml = cd.DataLoader('beataml')
    >>> summary_stats = summarize_response_metric(data=beataml)
    >>> summary_stats
                            count          mean           std
    dose_response_metric
    aac                   23378.0  3.028061e-01  1.821265e-01  ...
    auc                   23378.0  6.971939e-01  1.821265e-01  ...
    dss                   23378.0  3.218484e-01  5.733492e-01  ...
    ...                       ...           ...           ...  ...
    """
    experiments = data.experiments
    # one series of response values per metric
    values_by_metric = (
        experiments.groupby('dose_response_metric')['dose_response_value']
    )
    return values_by_metric.describe()
226+
227+
61228
def _filter(
62229
data: DatasetLoader,
63230
sample_ids: list,
@@ -150,143 +317,52 @@ def _filter(
150317

151318
return data_ret
152319

153-
def summarize_response_metric(data: DatasetLoader) -> pd.DataFrame:
154-
"""
155-
Helper function to extract basic statistics for the `experiments`
156-
object in a CoderData object. Uses `pandas.DataFrame.describe()`
157-
internally to generate count, mean, standard deviation, minimum,
158-
25-, 50- and 75-percentile as well as maximum for
159-
`dose_response_value` for each `dose_response_metric` present in
160-
`experiments`.
161-
162-
Parameters
163-
----------
164-
data : coderdata.DatasetLoader
165-
A full CoderData object of a dataset
166-
167-
Returns
168-
-------
169-
pandas.DataFrame
170-
A `pandas.DataFrame` containing basic statistics for each
171-
dose response metric.
172-
173-
Example
174-
-------
175-
176-
The Example assumes that a dataset with the prefix 'beataml' has
177-
been downloaded previously. See also ``coderdata.download()``
178320

179-
>>> import coderdata as cd
180-
>>> beataml = cd.DataLoader('beataml')
181-
>>> summary_stats = summarize_response_metric(data=beataml)
182-
>>> summary_stats
183-
count mean std
184-
dose_response_metric
185-
aac 23378.0 3.028061e-01 1.821265e-01 ...
186-
auc 23378.0 6.971939e-01 1.821265e-01 ...
187-
dss 23378.0 3.218484e-01 5.733492e-01 ...
188-
... ... ... ... ...
189-
"""
190-
df_ret = (
191-
data.experiments # get experiments DF
192-
.groupby('dose_response_metric') # grouping by metric
193-
['dose_response_value'] # value to summarize
194-
.describe() # get count, mean, std, etc.
195-
)
196-
197-
return df_ret
198-
199-
200-
def plot_response_metric(
201-
data: DatasetLoader,
202-
metric: str='auc',
203-
ax: Axes=None,
204-
**kwargs: dict
205-
) -> None:
206-
"""
207-
Creates a histogram detailing the distribution of dose response
208-
values for a given dose respones metric.
209-
210-
If used in conjunction with `matplotlib.pyplot.subplot` or
211-
`matplotlib.pyplot.subplots` and the axes object is passed to the
212-
function, the function populates the axes object with the generated
213-
plot.
321+
def prepare_2d_hist_data(
        data: pd.DataFrame,
        metrics: list[str]=None,
        r2: float=None,
        ) -> pd.DataFrame:
    """
    Reshapes a long-format dose response table into a wide table with
    one column per requested metric, e.g. as input for 2D histograms.

    `data` is grouped by `dose_response_metric`. For every requested
    metric the `dose_response_value` column is renamed to the metric
    name, and the per-metric tables are inner-joined on
    `improve_drug_id`, `improve_sample_id`, `time` and `study`. Rows
    that lack a value for any of the requested metrics are therefore
    not part of the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format table. The function accesses the columns
        `dose_response_metric`, `dose_response_value`, `source`,
        `time_unit`, `improve_drug_id`, `improve_sample_id`, `time`
        and `study`.
    metrics : list[str], default=None
        Names of the metrics to turn into columns. If ``None``, the
        metrics "aac", "auc", "dss", "fit_auc", "fit_ec50",
        "fit_ec50se", "fit_einf", "fit_hs", "fit_ic50" and "fit_r2"
        are used.
    r2 : float, default=None
        Optional threshold. If given, only rows whose "fit_r2" value is
        strictly greater than `r2` are kept. The helper column used for
        filtering is removed again before returning.

    Returns
    -------
    pandas.DataFrame
        Wide table with the `source` and `time_unit` columns of the
        first metric, the join keys and one column per metric. `data`
        itself is not modified.

    Raises
    ------
    ValueError
        If `metrics` is empty.
    KeyError
        If a requested metric (or "fit_r2" when `r2` is set) does not
        occur in `dose_response_metric`.
    """

    # default is resolved per call; a list literal in the signature
    # would be shared (and mutable) across calls
    if metrics is None:
        metrics = [
            "aac", "auc", "dss",
            "fit_auc", "fit_ec50", "fit_ec50se",
            "fit_einf", "fit_hs", "fit_ic50",
            "fit_r2",
            ]
    else:
        metrics = list(metrics)
    if not metrics:
        raise ValueError("`metrics` must contain at least one metric name")

    merge_keys = ["improve_drug_id", "improve_sample_id", "time", "study"]
    # columns that would otherwise collide when joining further metrics
    redundant = ['source', 'time_unit', 'dose_response_metric']
    metric_groups = data.groupby('dose_response_metric')

    def _metric_table(metric, column, drop):
        # `rename` and `drop` return new frames, so neither `data` nor
        # the group is modified and no explicit copy is required
        return (
            metric_groups.get_group(metric)
            .rename(columns={"dose_response_value": column})
            .drop(columns=drop)
            )

    # fetched up front so that a missing "fit_r2" group fails early
    r2_ = None
    if r2 is not None:
        r2_ = _metric_table("fit_r2", "r2_thresh", redundant)

    # the first metric keeps `source` and `time_unit` for the result
    d_ret = _metric_table(metrics[0], metrics[0], ["dose_response_metric"])

    for metric in metrics[1:]:
        d_ret = d_ret.merge(
            _metric_table(metric, metric, redundant),
            on=merge_keys,
            )

    if r2_ is not None:
        d_ret = d_ret.merge(r2_, on=merge_keys)
        # non-inplace drop: the filtered frame is a selection of the
        # merged frame and should not be mutated in place
        d_ret = (
            d_ret[d_ret["r2_thresh"] > float(r2)]
            .drop(columns=["r2_thresh"])
            )

    return d_ret

0 commit comments

Comments
 (0)