1+ """
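Defines the ``Dataset`` and ``Split`` classes together with helpers for
formatting, splitting and loading the data tables of a dataset.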
3+
4+ """
15
26from __future__ import annotations
37
1115import numpy as np
1216from numpy .random import RandomState
1317import pandas as pd
18+ # import polars as pl
1419
1520from sklearn .model_selection import GroupShuffleSplit
1621from sklearn .model_selection import ShuffleSplit
@@ -27,6 +32,25 @@ class Split:
2732
2833class Dataset :
2934
35+ data_format_params = {
36+ "samples" : (
37+ "improve_sample_id" , "cancer_type" , "model_type" , "common_name" ,
38+ "other_id" , "other_names" , "id_source" , "species"
39+ ),
40+ "transcriptomics" : (
41+ "improve_sample_id" , "entrez_id" , "transcriptomics"
42+ ),
43+ "proteomics" : ("improve_sample_id" , "entrez_id" , "proteomics" ),
44+ "mutations" : ("improve_sample_id" , "entrez_id" , "mutation" ),
45+ "copy_number" : ("improve_sample_id" , "entrez_id" , "copy_number" ),
46+ "methylation" : ("improve_sample_id" , "entrez_id" , "methylation" ),
47+ "experiments" : (
48+ "improve_sample_id" , "improve_drug_id" , "dose_response_value"
49+ ),
50+ "drugs" : ("improve_drug_id" , "chem_name" , "isoSMILES" ),
51+ "genes" : ("entrez_id" , "gene_symbol" , "other_id" )
52+ }
53+
3054 def __init__ (
3155 self ,
3256 name : str = None ,
@@ -49,6 +73,35 @@ def __init__(
4973
5074 Each attribute will be a Pandas DataFrame corresponding to a file with the dataset prefix.
5175 Attributes include transcriptomics, proteomics, mutations, etc.
76+
77+ Parameters
78+ ----------
79+ name : str
80+ The name of the dataset that is stored in the object
        transcriptomics : pd.DataFrame, optional
            Transcriptomics measurements, by default None
        proteomics : pd.DataFrame, optional
            Proteomics measurements, by default None
        mutations : pd.DataFrame, optional
            Mutation records, by default None
        copy_number : pd.DataFrame, optional
            Copy number values, by default None
        samples : pd.DataFrame, optional
            Sample metadata (e.g. cancer type, model type, species),
            by default None
        drugs : pd.DataFrame, optional
            Drug metadata (e.g. chemical name, SMILES), by default None
        mirna : pd.DataFrame, optional
            miRNA data, by default None
        experiments : pd.DataFrame, optional
            Drug response experiments (dose response values per sample
            and drug), by default None
        methylation : pd.DataFrame, optional
            Methylation measurements, by default None
        metabolomics : pd.DataFrame, optional
            Metabolomics measurements, by default None
        genes : pd.DataFrame, optional
            Gene identifier table (Entrez ID, gene symbol, other IDs),
            by default None
        full : pd.DataFrame, optional
            Presumably a combined table of the above (TODO: confirm the
            exact contents), by default None
52105 """
53106
54107 self .name = name
@@ -72,6 +125,11 @@ def __init__(
72125 # ----------------------------
73126
74127
128+ @property
129+ def data_format_params (self ):
130+ return self ._data_format_params
131+
132+
75133 @property
76134 def name (self ):
77135 return self ._name
@@ -451,15 +509,23 @@ def format(
451509 raise ValueError (
452510 f"'{ data_type } ' attribute of Dataset cannot be 'None'"
453511 )
512+ copy_call = kwargs .get ('copy_call' , False )
454513
455- # TODO: add way to extract copy_call
456514 ret = pd .pivot_table (
457515 data = data .copy_number ,
458516 index = 'entrez_id' ,
459517 columns = 'improve_sample_id' ,
460518 values = 'copy_number' ,
461519 aggfunc = 'mean' ,
462520 )
521+ if copy_call :
522+ ret = ret .apply (
523+ pd .cut ,
524+ bins = [0 , 0.5210507 , 0.7311832 , 1.214125 , 1.422233 , 2 ],
525+ labels = ['deep del' , 'het loss' , 'diploid' , 'gain' , 'amp' ],
526+ include_lowest = True
527+ )
528+
463529
464530 elif data_type == "proteomics" :
465531 if data .proteomics is None :
@@ -920,6 +986,7 @@ def train_test_validate(
920986 return Split (data_train , data_test , data_val )
921987
922988
989+
923990def _load_file (file_path : Path ) -> pd .DataFrame :
924991 if file_path .suffix == '.gz' :
925992 return pd .read_csv (
0 commit comments