@@ -135,48 +135,6 @@ def plot_response_metric(
135135 p .set_title (title_ )
136136
137137
def split_experiments_by_study(data: cd.Dataset) -> dict:
    """
    Break a CoderData object up into one smaller object per `study`.

    The rows of ``data.experiments`` are grouped by their ``study``
    value. For every group the unique ``improve_sample_id`` and
    ``improve_drug_id`` values are collected and passed, together with
    the study name, to ``_filter`` to build the per-study object.

    Parameters
    ----------
    data : cd.Dataset
        The CoderData object to split. Its ``.experiments`` table must
        provide the columns ``study``, ``improve_sample_id`` and
        ``improve_drug_id``.

    Returns
    -------
    dict
        Mapping ``study -> data`` where each key is a study name found
        in ``data.experiments`` and each value is the object returned
        by ``_filter`` for that study.
    """

    per_study = {}

    # one iteration per distinct value in the 'study' column
    for study_name, study_rows in data.experiments.groupby('study'):
        # restrict the data set to the samples / drugs that were
        # actually measured in this study
        per_study[study_name] = _filter(
            data=data,
            sample_ids=list(
                np.unique(study_rows['improve_sample_id'].values)
            ),
            drug_ids=list(
                np.unique(study_rows['improve_drug_id'].values)
            ),
            study=study_name,
        )

    return per_study
178-
179-
180138def summarize_response_metric (data : cd .Dataset ) -> pd .DataFrame :
181139 """
182140 Helper function to extract basic statistics for the `experiments`
@@ -224,99 +182,6 @@ def summarize_response_metric(data: cd.Dataset) -> pd.DataFrame:
224182 return df_ret
225183
226184
def _filter(
    data: cd.Dataset,
    sample_ids: list,
    drug_ids: list,
    study: str = None,
) -> cd.Dataset:
    """
    Helper function to filter down a CoderData object to create an
    independent, more concise CoderData object for further processing.
    This can be either splitting a dataset according to the different
    drug response studies (e.g. the broad_sanger dataset) or extracting
    small subsets (e.g. training / testing splits for machine
    learning).

    Parameters
    ----------
    data : cd.Dataset
        The CoderData object to filter. It is not modified; all
        filtering happens on a deep copy.
    sample_ids : list
        A list of improve_sample_id[s] that the CoderData object should
        be filtered to
    drug_ids : list
        A list of improve_drug_id[s] that the CoderData object should
        be filtered to
    study : str, default = None
        The drug response study that the ``experiments`` table should
        be filtered to. If ``None`` the ``experiments`` table is left
        untouched (no study filter is applied).

    Returns
    -------
    cd.Dataset
        The filtered copy of the CoderData object

    Notes
    -----

    Different data types of the CoderData object are filtered using
    either the improve_sample_id or the improve_drug_id. Empty tables
    are skipped.

    - copy_number -> reduce based on ``improve_sample_id``
    - drugs -> reduce based on ``improve_drug_id``
    - mutations -> reduce based on ``improve_sample_id``
    - proteomics -> reduce based on ``improve_sample_id``
    - samples -> reduce based on ``improve_sample_id``
    - transcriptomics -> reduce based on ``improve_sample_id``
    - experiments -> reduce based on ``study`` (only if ``study`` is
      given)

    """

    # creating a deep copy of the CoderData object such that any
    # further operations on the object are not changing the original
    # object / data
    data_ret = deepcopy(data)

    # (attribute, id column, ids to keep) for every table that is
    # reduced by membership of its id column
    id_filters = (
        ('copy_number', 'improve_sample_id', sample_ids),
        ('drugs', 'improve_drug_id', drug_ids),
        ('mutations', 'improve_sample_id', sample_ids),
        ('proteomics', 'improve_sample_id', sample_ids),
        ('samples', 'improve_sample_id', sample_ids),
        ('transcriptomics', 'improve_sample_id', sample_ids),
    )
    for table_name, id_column, keep_ids in id_filters:
        table = getattr(data_ret, table_name)
        # empty tables may not even carry the id column -> skip them
        if not table.empty:
            setattr(
                data_ret,
                table_name,
                table[table[id_column].isin(keep_ids)],
            )

    # only restrict the experiments to a study if one was requested;
    # comparing the column against ``None`` matches no row and would
    # silently drop all experiments
    if study is not None and not data_ret.experiments.empty:
        data_ret.experiments = data_ret.experiments[
            data_ret.experiments['study'] == study
        ]
    # TODO: do we also need to split the gene table?

    return data_ret
318-
319-
320185def _prepare_2d_hist_data (
321186 data : pd .DataFrame ,
322187 metrics : list [str ]= [
0 commit comments