@@ -60,9 +60,9 @@ def download_experiments_data(synID:str , save_path:str = None, synToken:str = N
6060### Parse Data Function
6161def parse_experiments_excel_sheets (first_file_path , second_file_path ):
6262 # read in the excel files
63- first_exp_excel = pd .ExcelFile (open (first_experiments_path , 'rb' ))
63+ first_exp_excel = pd .ExcelFile (open (first_file_path , 'rb' ))
6464 first_experiments_dict = pd .read_excel (first_exp_excel , sheet_name = None , header = None )
65- rest_exp_excel = pd .ExcelFile (open (rest_experiments_path , 'rb' ))
65+ rest_exp_excel = pd .ExcelFile (open (second_file_path , 'rb' ))
6666 rest_experiments_dict = pd .read_excel (rest_exp_excel , sheet_name = None , header = None )
6767 list_of_exp_excels = [first_experiments_dict ,rest_experiments_dict ]
6868 full_df_list = []
@@ -128,7 +128,20 @@ def merge_improve_samples_drugs(experiment_data:pd.DataFrame, samples_data_path:
128128 all_merged ['source' ] = "synapse"
129129 all_merged = all_merged .drop (columns = {'drug_id' ,'count' , 'sample_name' ,'Catalogue' ,'chem_name' ,'other_id' ,'Drug' })
130130 all_merged = all_merged .rename (columns = {'improve_drug_id' :'Drug' })
131- all_merged = all_merged .astype ({'improve_sample_id' :'int' })
131+
132+ # identify rows where improve_sample_id is NaN or non-finite
133+ all_merged ['improve_sample_id' ] = pd .to_numeric (all_merged ['improve_sample_id' ], errors = 'coerce' )
134+ bad_mask = all_merged ['improve_sample_id' ].isna () | np .isinf (all_merged ['improve_sample_id' ])
135+
136+ print (f"Rows before dropping bad improve_sample_id: { len (all_merged )} " )
137+ if bad_mask .any ():
138+ print (f"{ bad_mask .sum ()} rows with missing/non-finite improve_sample_id will be dropped" )
139+ # drop and report after
140+ all_merged = all_merged .loc [~ bad_mask ].copy ()
141+ print (f"Rows after dropping: { len (all_merged )} " )
142+
143+ # now safe to cast
144+ all_merged ['improve_sample_id' ] = all_merged ['improve_sample_id' ].astype (int )
132145 all_merged = all_merged [['study' ,'time' ,'DOSE' ,'GROWTH' ,'Drug' ,'improve_sample_id' ,'time_unit' ,'source' ]]
133146 all_merged = all_merged .dropna () # drop na's bc that will also cause issues in curve fitting
134147
0 commit comments