44import math
55import argparse
66import synapseclient
7+ import re
78
89
def remove_zero_between_letter_and_digit(text):
    """
    Strip a '0' that sits directly between a letter and a digit.

    Used to normalise identifiers before merging drug ids and sample ids
    into the experiments dataframe (for example 'HCCO01' becomes 'HCCO1').
    Only one '0' is removed per letter/digit pair, so 'A001' becomes 'A01'.
    """
    # Group 1: a single ASCII letter (upper or lower case).
    # Literal '0': the character that gets discarded.
    # Group 2: the single digit that follows the '0'.
    letter_zero_digit = re.compile(r'([a-zA-Z])0(\d)')
    # Re-emitting only the two captured groups drops the '0' between them.
    return letter_zero_digit.sub(r'\1\2', text)
23+
924def download_experiments_data (synID :str , save_path :str = None , synToken :str = None ):
1025 """
1126 Download omics data from Synapse at synapseID syn66401303. Requires a synapse token, which requires you to make a Synapse account
@@ -72,4 +87,30 @@ def parse_experiments_excel_sheets(first_file_path, second_file_path):
7287 full_experiments_df = pd .concat (list_of_finished_dfs )
7388 full_df_list .append (full_experiments_df )
7489 experiments_df = pd .concat (full_df_list )
75- return (experiments_df )
90+ return (experiments_df )
91+
92+
def merge_improve_samples_drugs(experiment_data: pd.DataFrame, samples_data_path: str, drugs_info_path: str, improve_drugs_path: str):
    """
    Attach IMPROVE drug ids and IMPROVE sample ids to the experiment measurements.

    Args:
        experiment_data: experiments dataframe. Must contain the columns
            'drug_id', 'sample_name', 'count', 'DOSE' and 'GROWTH'.
        samples_data_path: path to a tab-separated samples file with the
            columns 'other_id' and 'improve_sample_id'.
        drugs_info_path: path to an Excel file with the columns
            'Catalogue' and 'Drug'.
        improve_drugs_path: path to a tab-separated drugs file with the
            columns 'improve_drug_id' and 'chem_name'.

    Returns:
        A dataframe with the columns 'study', 'time', 'DOSE', 'GROWTH',
        'Drug' (holding the improve_drug_id), 'improve_sample_id',
        'time_unit' and 'source'. Rows with a missing value in any of these
        columns (for example an unmatched sample) are dropped.
    """
    # read in data (both IMPROVE files are tab-separated)
    improve_sample_df = pd.read_csv(samples_data_path, sep='\t')
    improve_drug_df = pd.read_csv(improve_drugs_path, sep='\t')
    druginfo_df = pd.read_excel(drugs_info_path)

    # Exact duplicate rows in a lookup table would multiply the experiment
    # rows during the merge, so each lookup is de-duplicated first.
    catalogue_lookup = druginfo_df[['Catalogue', 'Drug']].drop_duplicates()
    drug_id_lookup = improve_drug_df[['improve_drug_id', 'chem_name']].drop_duplicates()
    sample_id_lookup = improve_sample_df[['other_id', 'improve_sample_id']].drop_duplicates()

    # merging improve drug id's: catalogue number -> drug name -> improve_drug_id
    drugnames_merged = pd.merge(experiment_data, catalogue_lookup, how='inner', left_on="drug_id", right_on="Catalogue")
    # NOTE(review): only the 'Drug' side is lower-cased; this assumes
    # 'chem_name' is already lower case in the IMPROVE drugs file - confirm.
    drugnames_merged['Drug'] = drugnames_merged['Drug'].str.lower()
    drugids_merged = pd.merge(drugnames_merged, drug_id_lookup, how='inner', left_on="Drug", right_on="chem_name")

    # merging improve sample id's
    # need to apply this function bc some of the naming conventions for the
    # sample names are inconsistent (ex: HCCO01 and HCCO1)
    drugids_merged['sample_name'] = drugids_merged['sample_name'].apply(remove_zero_between_letter_and_digit)
    all_merged = pd.merge(drugids_merged, sample_id_lookup, how='left', left_on="sample_name", right_on="other_id")

    # now do some formatting
    all_merged['time'] = 72
    all_merged['time_unit'] = "hours"
    all_merged['study'] = "LiverPDO"
    all_merged['source'] = "synapse"
    # The drug-name column 'Drug' has to go before 'improve_drug_id' is
    # renamed to 'Drug', otherwise there would be two 'Drug' columns.
    all_merged = all_merged.drop(columns=['drug_id', 'count', 'sample_name', 'Catalogue', 'chem_name', 'other_id', 'Drug'])
    all_merged = all_merged.rename(columns={'improve_drug_id': 'Drug'})
    all_merged = all_merged[['study', 'time', 'DOSE', 'GROWTH', 'Drug', 'improve_sample_id', 'time_unit', 'source']]
    # The sample merge is a left join, so unmatched samples have a missing
    # improve_sample_id and are removed here.
    all_merged = all_merged.dropna()

    return all_merged
0 commit comments