1+ import pandas as pd
2+ import synapseclient
3+ import numpy as np
4+ import argparse
5+ import os
6+
def get_complete_novartispdx_sample_sheet(
    synObject,
    parent_id='syn66275995',
    excluded_ids=('syn66276102', 'syn66276098', 'syn66477971'),
):
    """Build the combined Novartis PDX sample sheet from Synapse files.

    Lists the file children of ``parent_id``, skips every ID in
    ``excluded_ids``, reads each remaining file as CSV from the local path
    returned by ``synObject.get`` and stacks the rows into one table.

    Args:
        synObject: Object exposing ``getChildren(parent=..., includeTypes=...)``
            (yielding mappings with an ``'id'`` key) and ``get(id)`` (returning
            an object with a ``path`` attribute), e.g. a logged-in
            ``synapseclient`` session.
        parent_id: Synapse ID of the container whose files are read.
        excluded_ids: Synapse IDs to leave out (the defaults are the drug
            info files). IDs that are not present in the container are
            simply ignored.

    Returns:
        A new ``pandas.DataFrame`` with the columns ``other_id``,
        ``common_name``, ``other_id_source``, ``other_names``,
        ``cancer_type``, ``species`` and ``model_type``.

    Raises:
        ValueError: If no sample files remain after applying the exclusions.
        KeyError: If the combined data lacks ``Sample ID``,
            ``MODEL_ORIGINATOR_ID`` or ``CANCER_HISTOLOGY``.
    """
    children = synObject.getChildren(parent=parent_id, includeTypes=['file'])

    # Filter instead of list.remove() so an excluded ID that is absent from
    # the container does not abort the whole run.
    skip = set(excluded_ids)
    syn_ids = [child['id'] for child in children if child['id'] not in skip]
    if not syn_ids:
        raise ValueError(
            f"No sample files found under Synapse container {parent_id!r}"
        )

    # Read everything first and concatenate once; concatenating inside the
    # loop re-copies the accumulated rows on every iteration.
    frames = [pd.read_csv(synObject.get(syn_id).path) for syn_id in syn_ids]
    combined = pd.concat(frames, ignore_index=True)

    # cancer_type is the lower-cased CANCER_HISTOLOGY with its leading
    # whitespace-delimited token removed. extract() yields a missing value
    # for entries without whitespace, rather than failing when no row in the
    # whole column can be split.
    cancer_type = (
        combined['CANCER_HISTOLOGY']
        .str.lower()
        .str.extract(r"(?s)^\S*\s(.*)", expand=False)
    )

    # Build a fresh frame (not a slice of `combined`) so callers can add
    # columns to the result without a SettingWithCopyWarning.
    finalsamplesheet = pd.DataFrame({
        'other_id': combined['Sample ID'],
        'common_name': combined['MODEL_ORIGINATOR_ID'],
        'other_id_source': 'Synapse',
        'other_names': '',
        'cancer_type': cancer_type,
        'species': "Homo Sapiens(human)",
        'model_type': 'patient derived xenograft',
    })
    return finalsamplesheet
33+
def main():
    """Download the Novartis PDX sample files and write one sample sheet.

    Parses the command line, logs into Synapse with the supplied token,
    builds the combined sample sheet, numbers the rows with
    ``improve_sample_id`` values continuing after the largest ID found in
    the optional previous sample file, and writes the result to
    ``/tmp/novartispdx_samples.csv``.
    """
    parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of sample files for the Novartis PDX data into a single samplesheet")
    parser.add_argument('-t', '--token', type=str, help='Synapse Token')
    parser.add_argument("-p", '--prevSamples', nargs="?", type=str, default="", const="", help="Use this to provide previous sample file, will run sample file generation")
    args = parser.parse_args()

    print("Logging into Synapse")
    synObject = synapseclient.login(authToken=args.token)

    samplesheet = get_complete_novartispdx_sample_sheet(synObject)

    if args.prevSamples:
        # Series.max() skips missing values; the builtin max() does not and
        # also raises on an empty column. Fall back to 0 when the previous
        # file holds no usable IDs, and cast so range() gets a plain int.
        prev_max = pd.read_csv(args.prevSamples)['improve_sample_id'].max()
        prev_max_improve_id = 0 if pd.isna(prev_max) else int(prev_max)
    else:
        prev_max_improve_id = 0

    # assign() returns a new frame, so the new column never lands on a slice
    # of another DataFrame (no SettingWithCopyWarning).
    first_id = prev_max_improve_id + 1
    samplesheet = samplesheet.assign(
        improve_sample_id=range(first_id, first_id + samplesheet.shape[0])
    )

    samplesheet.to_csv('/tmp/novartispdx_samples.csv', index=False)


if __name__ == "__main__":
    main()
58+
59+
0 commit comments