Skip to content

Commit 23d8a8e

Browse files
committed
Create 00_createSarcPDOSampleFile.py
Add a script for SarcPDO that downloads the genetic and transcriptomic sample sheets, combines their sample IDs, and merges and formats them according to the LinkML schema.
1 parent 37ac86d commit 23d8a8e

1 file changed

Lines changed: 121 additions & 0 deletions

File tree

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
import synapseclient
2+
import pandas as pd
3+
import numpy as np
4+
import argparse
5+
import os
6+
7+
def download_and_format_genetic_samples(synLoginObject):
    """
    Download and format samples that have genetic data in the Sarcoma PDO project via Synapse.

    Parameters
    ----------
    synLoginObject : synapseclient.Synapse
        an object generated by a call to syn.login(PAT); only its
        ``tableQuery`` method is used here

    Returns
    -------
    pd.DataFrame
        one row per row of the genetic sample sheet, with the columns
        other_id, common_name, other_id_source, other_names, cancer_type,
        species and model_type (in that order)
    """
    # download genetic sample sheet (15 rows according to the original author's note)
    geneticSampleSheet = synLoginObject.tableQuery("select * from syn61894699")
    geneticSampleDF = geneticSampleSheet.asDataFrame()

    # subset and rename to the schema's column names
    genetic_samples = geneticSampleDF[['Sample_ID', 'Diagnosis']].rename(
        columns={'Sample_ID': 'common_name', 'Diagnosis': 'cancer_type'}
    )

    # append "_Tumor" to be more similar to RNAseq other_id's, then turn "_2"
    # into "-2"; regex=False makes the literal match explicit because the
    # default of `regex` differs between pandas versions
    other_id = genetic_samples['common_name'].astype(str) + '_Tumor'
    genetic_samples['other_id'] = other_id.str.replace("_2", "-2", regex=False)

    # constant columns
    genetic_samples['species'] = 'Homo sapiens(Human)'
    genetic_samples['other_id_source'] = 'Synapse'
    # NOTE(review): every genetic sample is labelled as tumor - confirm this is correct
    genetic_samples['model_type'] = 'tumor'
    # intentionally empty column
    genetic_samples['other_names'] = ''

    # re-order columns
    column_order = [
        'other_id', 'common_name', 'other_id_source', 'other_names',
        'cancer_type', 'species', 'model_type',
    ]
    return genetic_samples[column_order]
39+
40+
41+
def download_and_format_rna_samples(synLoginObject):
    """
    Download and format samples that have RNAseq data in the Sarcoma PDO project via Synapse.

    Parameters
    ----------
    synLoginObject : synapseclient.Synapse
        an object generated by a call to syn.login(PAT); only its
        ``tableQuery`` method is used here

    Returns
    -------
    pd.DataFrame
        one row per "R1" entry of the RNA sample sheet, with the columns
        other_id, common_name, other_id_source, other_names, cancer_type,
        species and model_type (in that order)
    """
    # download rna sample sheet (64 rows, 32 unique samples according to the
    # original author's note)
    rnaSampleSheet = synLoginObject.tableQuery("select * from syn61894657")
    rnaSampleDF = rnaSampleSheet.asDataFrame()

    # select and rename columns
    rna_samples = rnaSampleDF[
        ['Sample_Name', 'individualID', 'diagnosis', 'experimentalCondition']
    ].rename(columns={
        'Sample_Name': 'other_id',
        'individualID': 'common_name',
        'diagnosis': 'cancer_type',
        'experimentalCondition': 'model_type',
    })

    # subset to only include one of the paired-end IDs; na=False drops rows
    # with a missing Sample_Name instead of failing on a NaN mask, and
    # .copy() lets the columns below be assigned without a
    # SettingWithCopyWarning
    is_read_one = rna_samples['other_id'].str.contains("R1", regex=False, na=False)
    rna_samples = rna_samples[is_read_one].copy()

    # trim 'Sample_Name' down to sample ID and model type by dropping the
    # last 16 characters
    # NOTE(review): assumes the trailing suffix is always exactly 16
    # characters long - confirm against the sample sheet
    rna_samples['other_id'] = rna_samples['other_id'].str.slice(0, -16)

    # constant columns
    rna_samples['species'] = 'Homo sapiens(Human)'
    rna_samples['other_id_source'] = 'Synapse'
    rna_samples['other_names'] = ''

    # re-order columns
    rna_samples = rna_samples[[
        'other_id', 'common_name', 'other_id_source', 'other_names',
        'cancer_type', 'species', 'model_type',
    ]]

    # harmonise diagnosis names with the ones used in the genetic sample sheet
    cancer_type_names = {
        # replace abbreviation (MPNST) with full name
        'MPNST': 'Malignant peripheral nerve sheath tumor',
        # sarc00095_tumor duplicated, Ewing Sarcoma (in RNA) vs. CIC-rearranged sarcoma in DNA
        # NOTE(review): this relabels every 'Ewing sarcoma' row, not only
        # that sample - confirm no other Ewing sarcoma samples are expected
        'Ewing sarcoma': 'CIC-rearranged sarcoma',
        # Sarc0101, 0137 - change name in RNA to full 'Dedifferentiated liposarcoma'
        'Dediff Liposarcoma': 'Dedifferentiated liposarcoma',
        # Sarc 0120 - change name in RNA to full 'Well-differentiated liposarcoma'
        'Well-diff Liposarcoma': 'Well-differentiated liposarcoma',
    }
    for short_name, full_name in cancer_type_names.items():
        rna_samples.loc[rna_samples['cancer_type'] == short_name, 'cancer_type'] = full_name

    # clean up 'model_type': use the first "_"-separated token, unless that
    # token is the 'Thawed' prefix, in which case use the second token.
    # .str[1] yields NaN when there is no second token, so this also works
    # when no value contains an underscore at all.
    tokens = rna_samples['model_type'].str.split("_")
    first_token = tokens.str[0]
    second_token = tokens.str[1]
    model_type = first_token.where(first_token != "Thawed", second_token)
    rna_samples['model_type'] = model_type.str.lower()

    return rna_samples
86+
87+
#def generate_samples_file(prev_samples_path):
88+
89+
# if prev_samples_path == "":
90+
#maxval = 0
91+
# else:
92+
# maxval = max(pd.read_csv(prev_samples_path).improve_sample_id)
93+
94+
def get_previous_max_improve_id(prev_samples_path):
    """
    Return the largest improve_sample_id found in a previous samples file.

    Parameters
    ----------
    prev_samples_path : str
        path to a CSV file with an 'improve_sample_id' column; an empty
        string (the command-line default) or None means there is no
        previous file

    Returns
    -------
    int
        the maximum existing improve_sample_id, or 0 when no previous file
        is given or the file holds no ids, so that new ids start at 1
    """
    if not prev_samples_path:
        return 0
    previous_ids = pd.read_csv(prev_samples_path)['improve_sample_id'].dropna()
    if previous_ids.empty:
        return 0
    return int(previous_ids.max())


def main():
    """Download both Sarcoma PDO sample sheets, merge them and write a single sample file."""
    parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of sample files for the Sarcoma PDO project into a single samplesheet")
    parser.add_argument('-t', '--token', type=str, help='Synapse Token')
    parser.add_argument("-p", '--prevSamples', nargs="?", type=str, default="", const="", help="Use this to provide previous sample file, will run sample file generation")
    args = parser.parse_args()
    # The parsed arguments are deliberately not printed: they contain the
    # Synapse access token.

    print("Logging into Synapse")
    synObject = synapseclient.login(authToken=args.token)

    rnaTable = download_and_format_rna_samples(synObject)
    geneticTable = download_and_format_genetic_samples(synObject)
    # With no `on` given, merge joins on every column the two tables share;
    # the outer join keeps rows that appear in only one of them.
    merged = rnaTable.merge(geneticTable, how='outer')

    # Continue numbering after the highest id already in use (0 when no
    # previous sample file was provided).
    prev_max_improve_id = get_previous_max_improve_id(args.prevSamples)
    merged['improve_sample_id'] = range(prev_max_improve_id + 1, prev_max_improve_id + len(merged) + 1)

    merged.to_csv('~/Downloads/sarcpdo_samples.csv', index=False)

    # validate with: linkml validate -s coderdata/schema/coderdata.yaml ~/Downloads/sarcpdo_samples.csv


if __name__ == "__main__":
    main()
119+
120+
121+

0 commit comments

Comments
 (0)