Skip to content

Commit 321baeb

Browse files
added download and excel data parsing to file
1 parent ba74a5b commit 321baeb

1 file changed

Lines changed: 75 additions & 0 deletions

File tree

Lines changed: 75 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,75 @@
1+
import pandas as pd
2+
import numpy as np
3+
import os
4+
import math
5+
import argparse
6+
import synapseclient
7+
8+
9+
def download_experiments_data(synID: str, save_path: str = None, synToken: str = None) -> str:
    """
    Download a file from Synapse and return the path of the local copy.

    Logging in uses a Synapse Personal Access Token, which requires a Synapse
    account. More information here:
    https://help.synapse.org/docs/Managing-Your-Account.2055405596.html#ManagingYourAccount-PersonalAccessTokens

    Parameters
    ----------
    synID : string
        SynapseID of the entity to download. Required; there is no default.

    save_path : string, optional
        Local path where the downloaded file will be saved. Passed to
        ``syn.get`` as ``downloadLocation``.
        NOTE(review): when left as None, synapseclient presumably falls back
        to its own cache location -- confirm against the synapseclient docs.

    synToken : string, optional
        Synapse Personal Access Token of the user. Passed to ``syn.login`` as
        ``authToken``.
        NOTE(review): when left as None, synapseclient presumably looks for
        locally stored credentials -- confirm against the synapseclient docs.

    Returns
    -------
    experiments_filepath : string
        Path to downloaded file

    """
    syn = synapseclient.Synapse()
    syn.login(authToken=synToken)

    # Obtain a pointer and download the data
    downloaded_data = syn.get(entity=synID, downloadLocation=save_path)

    # Get the path to the local copy of the data file
    experiments_filepath = downloaded_data.path

    return experiments_filepath
42+
43+
44+
45+
def _melt_experiment_sheet(sheet_df, sample_name):
    """Reshape one sheet of stacked concentration matrices into a long table.

    A sheet (read with ``header=None``) holds one or more blocks. Each block
    starts with a header row whose first cell is "concentration" (or blank)
    and whose remaining cells become the ``concentration`` values; each
    following row starts with an identifier (``drug_id``) followed by the
    values stored as ``count``.

    Raises
    ------
    ValueError
        If the sheet contains no header row at all.
    """
    # Many sheets leave the header cell blank instead of writing
    # "concentration"; treat every blank first-column cell as that marker.
    sheet_df = sheet_df.fillna(value={0: "concentration"})

    # Row labels of the header rows; each one opens a new block.
    header_rows = sheet_df.index[sheet_df[0] == "concentration"].to_list()
    if not header_rows:
        raise ValueError(
            f"Sheet {sample_name!r} has no 'concentration' header row to parse"
        )

    # Sentinel one past the last row so the final block has an end boundary.
    bounds = header_rows + [sheet_df.index[-1] + 1]

    long_blocks = []
    for start, stop in zip(bounds, bounds[1:]):
        # .loc slicing is inclusive, hence stop - 1. Copy so that relabelling
        # the columns never touches the sheet-level frame.
        block = sheet_df.loc[start:stop - 1].copy()
        block.columns = block.iloc[0]  # promote the header row to column labels
        block = block[1:]              # drop the header row from the data
        melted = pd.melt(
            block,
            id_vars=["concentration"],
            value_vars=block.columns[block.columns != "concentration"],
        )
        # After the melt, the id column (labelled "concentration" by the
        # header cell) actually holds the row identifiers, and the melted
        # variable column (melted.columns[1], whose name pandas derives from
        # the header row's label) holds the concentrations.
        melted = melted.rename(
            columns={
                "concentration": "drug_id",
                melted.columns[1]: "concentration",
                "value": "count",
            }
        )
        long_blocks.append(melted)

    elongated_df = pd.concat(long_blocks)
    elongated_df["sample_name"] = sample_name
    return elongated_df


def parse_experiments_excel_sheets(first_file_path, second_file_path):
    """Parse two experiment Excel workbooks into one long-format DataFrame.

    Every sheet of both workbooks is read without a header, split into its
    concentration blocks, melted into long format and tagged with the sheet
    name. Empty sheets are skipped.

    Parameters
    ----------
    first_file_path : path-like
        Path to the first Excel workbook.

    second_file_path : path-like
        Path to the second Excel workbook.

    Returns
    -------
    experiments_df : pandas.DataFrame
        Columns ``drug_id``, ``concentration``, ``count`` and ``sample_name``
        (the sheet name). The index is not reset, so index values repeat
        across blocks. An empty frame with these columns is returned when no
        sheet contains data.

    Raises
    ------
    ValueError
        If a non-empty sheet contains no header row.
    """
    finished_dfs = []
    for file_path in (first_file_path, second_file_path):
        # Read inside a context manager so the file handle is always closed.
        with open(file_path, "rb") as handle:
            sheets = pd.read_excel(handle, sheet_name=None, header=None)

        # Iterate through the sheets, melting each matrix-style sheet into a
        # longer frame before concatenating everything.
        for experiment_key, one_sample_df in sheets.items():
            if one_sample_df.empty:
                # A blank worksheet has no column 0 and nothing to parse.
                continue
            finished_dfs.append(_melt_experiment_sheet(one_sample_df, experiment_key))

    if not finished_dfs:
        return pd.DataFrame(columns=["drug_id", "concentration", "count", "sample_name"])

    experiments_df = pd.concat(finished_dfs)
    return experiments_df

0 commit comments

Comments
 (0)