@@ -57,13 +57,13 @@ def download_experiments_data(synID:str , save_path:str = None, synToken:str = N
5757
5858
5959
60+ ### Parse Data Function
6061def parse_experiments_excel_sheets (first_file_path , second_file_path ):
6162 # read in the excel files
62- first_exp_excel = pd .ExcelFile (open (first_file_path , 'rb' ))
63+ first_exp_excel = pd .ExcelFile (open (first_experiments_path , 'rb' ))
6364 first_experiments_dict = pd .read_excel (first_exp_excel , sheet_name = None , header = None )
64- rest_exp_excel = pd .ExcelFile (open (second_file_path , 'rb' ))
65+ rest_exp_excel = pd .ExcelFile (open (rest_experiments_path , 'rb' ))
6566 rest_experiments_dict = pd .read_excel (rest_exp_excel , sheet_name = None , header = None )
66- # use for loops to interate through the dictionaries, melt the df's into longer df's instead of matrices, and then concat
6767 list_of_exp_excels = [first_experiments_dict ,rest_experiments_dict ]
6868 full_df_list = []
6969 for dictionary in list_of_exp_excels :
@@ -76,13 +76,30 @@ def parse_experiments_excel_sheets(first_file_path, second_file_path):
7676 conc_indexes = conc_indexes + [one_sample_df .index [- 1 ]+ 1 ]
7777 for index in range (0 ,(len (conc_indexes )- 1 )):
7878 one_conc_df = one_sample_df .loc [conc_indexes [index ]:(conc_indexes [(index + 1 )]- 1 )]
79+ # print("length before melt is:", len(one_conc_df))
80+ # print("end index is ",(conc_indexes[(index+1)]-1))
7981 one_conc_df .columns = one_conc_df .iloc [0 ]
8082 one_conc_df = one_conc_df [1 :]
8183 one_conc_df = pd .melt (one_conc_df , id_vars = ['concentration' ], value_vars = one_conc_df .columns [one_conc_df .columns != 'concentration' ])
82- one_conc_df = one_conc_df .rename (columns = {"concentration" :"drug_id" ,one_conc_df .columns [1 ]:"concentration" ,"value" :"count" })
84+ one_conc_df = one_conc_df .rename (columns = {"concentration" :"drug_id" ,one_conc_df .columns [1 ]:"DOSE" ,"value" :"count" })
85+ one_conc_df = one_conc_df .astype ({"DOSE" : 'float' })
86+ one_conc_df = one_conc_df .reset_index (drop = True )
87+ # now convert all counts to growth rates
88+ for drug in one_conc_df ['drug_id' ].unique ():
89+ # print("the drug name is",drug)
90+ # print("the mean of the 0's is ",one_conc_df[(one_conc_df['drug_id'] == drug) & (one_conc_df['DOSE'] == 0)]['count'].mean())
91+ mean_of_zeros = one_conc_df [(one_conc_df ['drug_id' ] == drug ) & (one_conc_df ['DOSE' ] == 0 )]['count' ].mean ()
92+ one_conc_df .loc [one_conc_df ['drug_id' ] == drug , 'GROWTH' ] = (one_conc_df [(one_conc_df ['drug_id' ] == drug )]['count' ]/ mean_of_zeros )* 100
93+ # print("sample is ", experiment_key)
94+ # print("index is ",index)
95+ # print("length of df is", len(one_conc_df))
96+ # print("number of unique drugs is", one_conc_df['drug_id'].nunique())
8397 list_of_dfs .append (one_conc_df )
98+ # print(list_of_dfs)
8499 elongated_df = pd .concat (list_of_dfs )
85100 elongated_df ['sample_name' ] = experiment_key
101+ # print(experiment_key)
102+ # print(elongated_df['drug_id'].nunique())
86103 list_of_finished_dfs .append (elongated_df )
87104 full_experiments_df = pd .concat (list_of_finished_dfs )
88105 full_df_list .append (full_experiments_df )
@@ -93,7 +110,7 @@ def parse_experiments_excel_sheets(first_file_path, second_file_path):
93110def merge_improve_samples_drugs (experiment_data :pd .DataFrame , samples_data_path :str , drugs_info_path :str , improve_drugs_path :str ):
94111 # read in data
95112 experiments_df = experiment_data
96- improve_sample_df = pd .read_csv (samples_data_path , sep = ' \t ' )
113+ improve_sample_df = pd .read_csv (samples_data_path )
97114 improve_drug_df = pd .read_csv (improve_drugs_path , sep = '\t ' )
98115 druginfo_df = pd .read_excel (drugs_info_path )
99116 # merging improve drug id's
@@ -113,4 +130,46 @@ def merge_improve_samples_drugs(experiment_data:pd.DataFrame, samples_data_path:
113130 all_merged = all_merged [['study' ,'time' ,'DOSE' ,'GROWTH' ,'Drug' ,'improve_sample_id' ,'time_unit' ,'source' ]]
114131 all_merged = all_merged .dropna ()
115132
116- return (all_merged )
133+ return (all_merged )
134+
135+
136+
if __name__ == "__main__":
    # Command-line driver: optionally download the two experiments excel
    # files from Synapse, then parse + merge them into a curve-fitting TSV.
    parser = argparse.ArgumentParser(description='###')

    # arguments for what data to process
    parser.add_argument('-D', '--Download', action='store_true', default=False, help='Download experiments data.')
    parser.add_argument('-t', '--Token', type=str, default=None, help='Synapse Token')
    parser.add_argument('-E', '--Experiment', action='store_true', default=False, help='Create experiments data.')
    parser.add_argument('-s', '--Samples', type=str, default=None, help='Path to samples file.')
    parser.add_argument('-d', '--Drugs', type=str, default=None, help='Path to drugs file')

    args = parser.parse_args()

    ###########################

    # NOTE(review): first_experiments_path / rest_experiments_path are kept at
    # module level on purpose — parse_experiments_excel_sheets() appears to read
    # them as globals (see its diff), so do not wrap this block in a main().
    if args.Download:
        if args.Token is None:
            # FIX: corrected typos in the original message ("synpase", "tocken")
            print("No synapse download token was provided. Cannot download data.")
            exit()
        else:
            print("Downloading Files from Synapse.")
            # download experiments data from synapse, which are split into 2 excel files
            first_experiments_path = download_experiments_data(synID="syn66401301", save_path="/tmp/", synToken=args.Token)
            rest_experiments_path = download_experiments_data(synID="syn66401302", save_path="/tmp/", synToken=args.Token)
    if args.Experiment:
        if args.Samples is None:
            print("No path to samples file detected. Cannot generate experiment data.")
            exit()
        if args.Drugs is None:
            print("No path to drugs file detected. Cannot generate experiment data.")
            exit()
        # FIX: running -E without -D previously crashed with a NameError because
        # the downloaded-file paths were never bound; fail with a clear message.
        if not args.Download:
            print("The -E/--Experiment step needs the downloaded excel files; also pass -D/--Download (with -t).")
            exit()
        else:
            print("Parsing experiments excel sheets")
            parsed_experiments_data = parse_experiments_excel_sheets(first_experiments_path, rest_experiments_path)
            print("Generating experiments data.")
            experiments_df = merge_improve_samples_drugs(experiment_data=parsed_experiments_data, samples_data_path=args.Samples, improve_drugs_path=args.Drugs, drugs_info_path="/tmp/4_Drug_information.xlsx")
            output_path = "/tmp/liverpdo_experiments_for_curve_fitting.tsv"
            # FIX: corrected typo in the original message ("sucessfully")
            print("Experiments data successfully generated. Saving tsv to {}".format(output_path))
            experiments_df.to_csv(output_path, sep='\t')
0 commit comments