Skip to content

Commit ce0e0c0

Browse files
committed
added functionality to split a CoderData object into smaller CoderData objects according to the drug response studies recorded in the experiments table
1 parent 8cf3f1e commit ce0e0c0

1 file changed

Lines changed: 140 additions & 0 deletions

File tree

coderdata/utils/stats.py

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,153 @@
33
contained in a CoderData Object.
44
"""
55

6+
7+
from copy import deepcopy
8+
9+
import numpy as np
10+
611
from coderdata import DatasetLoader
712
import pandas as pd
813

914
import matplotlib.pyplot as plt
1015
from matplotlib.axes import Axes
1116
import seaborn as sns
1217

18+
19+
def split_experiments_by_study(data: DatasetLoader) -> dict:
    """
    Partition a CoderData object by drug response study.

    The ``.experiments`` table of `data` is grouped by its ``study``
    column. For every study the improve sample ids and improve drug
    ids occurring in that group are collected and handed to
    ``_filter`` together with the study name, which produces the
    reduced CoderData object for that study.

    Parameters
    ----------
    data : DatasetLoader
        The CoderData object containing the data set loaded into memory
        via ``coderdata.DatasetLoader()``.

    Returns
    -------
    dict
        A dictionary dict[study, data]. Each key `study` is one of the
        values found in the ``study`` column of ``data.experiments``;
        each value `data` is the CoderData object returned by
        ``_filter`` for that study.
    """

    def _unique_ids(frame, column):
        # sorted, de-duplicated values of a single column as a list
        return list(np.unique(frame[column].values))

    # one entry per group of the 'study' column; every entry is built
    # from the ids that occur in the rows of that group only
    return {
        study_name: _filter(
            data=data,
            sample_ids=_unique_ids(study_rows, 'improve_sample_id'),
            drug_ids=_unique_ids(study_rows, 'improve_drug_id'),
            study=study_name,
        )
        for study_name, study_rows in data.experiments.groupby('study')
    }
59+
60+
61+
def _filter(
    data: DatasetLoader,
    sample_ids: list,
    drug_ids: list,
    study: str = None,
) -> DatasetLoader:
    """
    Helper function to filter down the CoderData object(s) to create
    independent more concise CoderData objects for further processing.
    This can be either splitting a dataset according to the different
    drug response studies (e.g. the broad_sanger dataset) or if small
    subsets need to be extracted (e.g. training / testing splits for
    machine learning).

    Parameters
    ----------
    data : DatasetLoader
        Contains a full CoderData object imported/loaded via
        ``coderdata.DatasetLoader``. It is not modified; all filtering
        happens on a deep copy.
    sample_ids : list
        A list of improve_sample_id[s] that the CoderData object should
        be filtered to
    drug_ids : list
        A list of improve_drug_id[s] that the CoderData object should
        be filtered to
    study : str, default = None
        The drug response study that the ``experiments`` table should
        be filtered to. If ``None``, the ``experiments`` table is
        instead reduced to the rows whose ``improve_sample_id`` is in
        `sample_ids` and whose ``improve_drug_id`` is in `drug_ids`.

    Returns
    -------
    DatasetLoader
        The filtered CoderData object (a deep copy of `data` with the
        individual tables reduced)

    Notes
    -----

    Different data types of the CoderData object are going to be
    filtered using either the improve_sample_id or the improve_drug_id.

    - copy_number -> reduce based on ``improve_sample_id``
    - drugs -> reduce based on ``improve_drug_id``
    - experiments -> reduce based on ``study`` if given, otherwise
      based on ``improve_sample_id`` and ``improve_drug_id``
    - mutations -> reduce based on ``improve_sample_id``
    - proteomics -> reduce based on ``improve_sample_id``
    - samples -> reduce based on ``improve_sample_id``
    - transcriptomics -> reduce based on ``improve_sample_id``

    Tables that are empty are left untouched.
    """

    # creating a deep copy of the CoderData object such that any
    # further operations on the object are not changing the original
    # object / data
    data_ret = deepcopy(data)

    # tables that are keyed by the improve sample id
    sample_tables = (
        'copy_number',
        'mutations',
        'proteomics',
        'samples',
        'transcriptomics',
    )
    for table_name in sample_tables:
        table = getattr(data_ret, table_name)
        # empty tables may lack the id column entirely, so skip them
        if not table.empty:
            setattr(
                data_ret,
                table_name,
                table[table['improve_sample_id'].isin(sample_ids)],
            )

    drugs = data_ret.drugs
    if not drugs.empty:
        data_ret.drugs = drugs[drugs['improve_drug_id'].isin(drug_ids)]

    experiments = data_ret.experiments
    if not experiments.empty:
        if study is not None:
            keep = experiments['study'] == study
        else:
            # comparing the 'study' column against None matches no row
            # and would silently empty the table; without a study the
            # table is reduced by the requested ids instead so that it
            # stays consistent with the other filtered tables
            keep = (
                experiments['improve_sample_id'].isin(sample_ids)
                & experiments['improve_drug_id'].isin(drug_ids)
            )
        data_ret.experiments = experiments[keep]
    # TODO: do we also need to split the gene table?

    return data_ret
152+
13153
def summarize_response_metric(data: DatasetLoader) -> pd.DataFrame:
14154
"""
15155
Helper function to extract basic statistics for the `experiments`

0 commit comments

Comments
 (0)