Skip to content

Commit 19c1c9b

Browse files
adding final function to experiments py
1 parent 321baeb commit 19c1c9b

1 file changed

Lines changed: 42 additions & 1 deletion

File tree

build/liverpdo/04-experiments-liverpdo.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,23 @@
44
import math
55
import argparse
66
import synapseclient
7+
import re
78

89

10+
def remove_zero_between_letter_and_digit(text):
    """
    Remove a '0' that sits directly between a letter and a digit.

    We use this when merging drug ids and sample ids into the experiments df,
    because sample names are written inconsistently (ex: HCCO01 and HCCO1).

    Only an ASCII letter (a-z, A-Z) counts as the preceding character, and only
    one '0' is removed per letter/digit pair, so "A001" becomes "A01".

    Non-string values (for example None, or the float NaN pandas uses for a
    missing cell) are returned unchanged instead of raising TypeError, so the
    function can be applied to a column that contains missing sample names.
    """
    if not isinstance(text, str):
        return text
    # Group 1 is the letter before the '0' and group 2 is the digit after it;
    # the replacement r'\1\2' glues the two groups back together, which
    # effectively removes the '0' in between.
    return re.sub(r'([a-zA-Z])0(\d)', r'\1\2', text)
23+
924
def download_experiments_data(synID:str , save_path:str = None, synToken:str = None):
1025
"""
1126
Download omics data from Synapse at synapseID syn66401303. Requires a synapse token, which requires you to make a Synapse account
@@ -72,4 +87,30 @@ def parse_experiments_excel_sheets(first_file_path, second_file_path):
7287
full_experiments_df = pd.concat(list_of_finished_dfs)
7388
full_df_list.append(full_experiments_df)
7489
experiments_df = pd.concat(full_df_list)
75-
return(experiments_df)
90+
return(experiments_df)
91+
92+
93+
def merge_improve_samples_drugs(experiment_data:pd.DataFrame, samples_data_path:str, drugs_info_path:str, improve_drugs_path:str):
    """
    Attach improve drug ids and improve sample ids to the experiments table
    and reshape it into the final experiments layout.

    Parameters
    ----------
    experiment_data : pd.DataFrame
        Experiments table. The code below uses its 'drug_id', 'sample_name',
        'count', 'DOSE' and 'GROWTH' columns.
    samples_data_path : str
        Path to a tab-separated file with 'other_id' and 'improve_sample_id' columns.
    drugs_info_path : str
        Path to an Excel file with 'Catalogue' and 'Drug' columns.
    improve_drugs_path : str
        Path to a tab-separated file with 'improve_drug_id' and 'chem_name' columns.

    Returns
    -------
    pd.DataFrame
        Columns 'study', 'time', 'DOSE', 'GROWTH', 'Drug', 'improve_sample_id',
        'time_unit', 'source' (in that order), where 'Drug' holds the improve
        drug id. Rows with any missing value are removed.
    """
    # read in the three lookup tables
    sample_lookup = pd.read_csv(samples_data_path, sep='\t')
    improve_drug_lookup = pd.read_csv(improve_drugs_path, sep='\t')
    catalogue_lookup = pd.read_excel(drugs_info_path)

    # drug_id -> drug name through the catalogue number. The inner join drops
    # experiment rows whose drug_id has no 'Catalogue' entry.
    with_drug_names = pd.merge(
        experiment_data,
        catalogue_lookup[['Catalogue', 'Drug']],
        how='inner',
        left_on="drug_id",
        right_on="Catalogue",
    )

    # drug name -> improve_drug_id. Only the 'Drug' side is lower-cased here.
    # NOTE(review): presumably 'chem_name' is already lower case in the improve
    # drugs file - confirm, otherwise mixed-case names are silently dropped.
    with_drug_names['Drug'] = with_drug_names['Drug'].str.lower()
    with_drug_ids = with_drug_names.merge(
        improve_drug_lookup[['improve_drug_id', 'chem_name']],
        how='inner',
        left_on="Drug",
        right_on="chem_name",
    )

    # Sample names are written inconsistently (ex: HCCO01 and HCCO1), so they
    # are normalised before being matched against 'other_id'.
    with_drug_ids['sample_name'] = with_drug_ids['sample_name'].apply(remove_zero_between_letter_and_digit)

    # Left join: rows without a matching sample keep a missing
    # improve_sample_id and are removed by the dropna() at the end.
    # NOTE(review): repeated ('other_id', 'improve_sample_id') pairs in the
    # samples file would repeat experiment rows here - verify the file.
    merged = with_drug_ids.merge(
        sample_lookup[['other_id', 'improve_sample_id']],
        how='left',
        left_on="sample_name",
        right_on="other_id",
    )

    # constant columns shared by every row of this dataset
    merged = merged.assign(time=72, time_unit="hours", study="LiverPDO", source="synapse")

    # Drop the helper/join columns, then expose the improve drug id as 'Drug'.
    helper_columns = ['drug_id', 'count', 'sample_name', 'Catalogue', 'chem_name', 'other_id', 'Drug']
    output_columns = ['study', 'time', 'DOSE', 'GROWTH', 'Drug', 'improve_sample_id', 'time_unit', 'source']
    formatted = (
        merged
        .drop(columns=helper_columns)
        .rename(columns={'improve_drug_id': 'Drug'})
    )
    return formatted[output_columns].dropna()

0 commit comments

Comments
 (0)