@@ -39,10 +39,50 @@ def parseCNVFile(fpath, sampid, genes):
3939 newdat = newdat [['improve_sample_id' ,'entrez_id' ,'copy_number' ,'source' ,'study' ]]
4040 newdat ['copy_call' ] = [get_copy_call (a ) for a in newdat ['copy_number' ]]
4141 return newdat
42-
43-
42+
43+
# Lookup table from SnpEff effect strings (the SNPEFF_EFFECT column read by
# parseMutFile) to the label written to the variant_classification column.
# Compound effects (joined with '+') are listed explicitly; an effect string
# that is not a key here has no classification.
mutmap = {
    # NOTE: the two CODON_CHANGE_PLUS_* entries are not a great mapping.
    'CODON_CHANGE_PLUS_CODON_DELETION': 'In_Frame_Del',
    'CODON_CHANGE_PLUS_CODON_INSERTION': 'In_Frame_Ins',
    'CODON_DELETION': 'In_Frame_Del',
    'CODON_INSERTION': 'In_Frame_Ins',
    'DOWNSTREAM': "3'Flank",
    'FRAME_SHIFT': 'Frameshift_Variant',
    'FRAME_SHIFT+SPLICE_SITE_ACCEPTOR+SPLICE_SITE_REGION+INTRON': 'Frameshift_Variant',
    'FRAME_SHIFT+SPLICE_SITE_REGION': 'Frameshift_Variant',
    'INTERGENIC': 'IGR',
    'INTRON': 'Intron',
    'NON_SYNONYMOUS_CODING': 'Missense_Mutation',
    'NON_SYNONYMOUS_CODING+SPLICE_SITE_REGION': 'Missense_Mutation',
    'SPLICE_SITE_ACCEPTOR+INTRON': 'Splice_Site',
    'SPLICE_SITE_DONOR+INTRON': 'Splice_Site',
    'SPLICE_SITE_REGION+INTRON': 'Splice_Site',
    'SPLICE_SITE_REGION+NON_CODING_EXON_VARIANT': 'Splice_Site',
    'SPLICE_SITE_REGION+SYNONYMOUS_CODING': 'Silent',
    'START_GAINED+UTR_5_PRIME': 'Start_Codon_Ins',
    'STOP_GAINED': 'Stop_Codon_Ins',
    'STOP_GAINED+CODON_CHANGE_PLUS_CODON_INSERTION': 'Stop_Codon_Ins',
    'SYNONYMOUS_CODING': 'Silent',
    'UPSTREAM': "5'Flank",
    'UTR_3_PRIME': "3'UTR",
    'UTR_5_PRIME': "5'UTR",
}
69+
def parseMutFile(fpath, sampid, genes):
    '''
    Reshape one SnpEff-annotated mutation table into the columns:
    improve_sample_id, entrez_id, source, study, mutation, variant_classification

    fpath: path to a tab-delimited file that contains the columns
        SNPEFF_GENE_NAME, SNPEFF_EFFECT and SNPEFF_CDS_CHANGE.
    sampid: value written to the improve_sample_id column of every row.
    genes: data frame joined on gene_symbol; it must also supply the
        entrez_id column selected for the output.

    Rows are dropped when the CDS change is missing, when the effect has no
    entry in mutmap, or when the gene symbol has no match in genes.
    Returns a de-duplicated data frame with no missing values.
    '''
    snpeff_cols = ['SNPEFF_GENE_NAME', 'SNPEFF_EFFECT', 'SNPEFF_CDS_CHANGE']
    # Separator is a single tab; a multi-character separator would be treated
    # as a regular expression and would not split a regular TSV.
    mutfile = pd.read_csv(fpath, sep='\t')[snpeff_cols]
    # List form of `subset` is accepted by every pandas version.
    mutfile = mutfile.dropna(subset=['SNPEFF_CDS_CHANGE'])
    # Rename by name rather than by position so the mapping is explicit.
    mutfile = mutfile.rename(columns={'SNPEFF_GENE_NAME': 'gene_symbol',
                                      'SNPEFF_CDS_CHANGE': 'mutation'})

    # Translate the SnpEff effect through mutmap; effects without an entry
    # become NaN and are removed, which matches an inner join on the effect.
    mutfile = mutfile.assign(variant_classification=mutfile['SNPEFF_EFFECT'].map(mutmap))
    mutfile = mutfile.dropna(subset=['variant_classification'])

    # Inner join: only mutations whose gene symbol is present in genes survive.
    fullfile = pd.merge(mutfile, genes, on='gene_symbol')
    fullfile['improve_sample_id'] = sampid
    fullfile['source'] = 'TiriacEtAl'
    fullfile['study'] = 'pancpdo'
    fullfile = fullfile[['improve_sample_id', 'entrez_id', 'source', 'study',
                         'mutation', 'variant_classification']]
    # The gene join can repeat a mutation once per matching genes row.
    return fullfile.dropna().drop_duplicates()
4686
4787def main ():
4888 parser = argparse .ArgumentParser (description = 'Script that collects WES and CNV data from Synapse for Coderdata' )
@@ -83,12 +123,24 @@ def main():
83123 newcnv .to_csv ('/tmp/pancpdo_copy_number.csv.gz' ,compression = 'gzip' ,index = False )
84124
85125 if args .mutation :
86- wes = sc .tableQuery (' select * from syn64608378 where parentId== syn64608263' ).asDataFrame ()
126+ wes = sc .tableQuery (" select * from syn64608378 where parentId=' syn64608263'" ).asDataFrame ()
87127 alldats = []
88128 ##go through and get every mutation file
89129 for index ,row in wes .iterrows ():
130+ sname = row ['name' ].split ('--' )[0 ]
90131 sid = row .id
91- sname = row ['name' ]
92-
132+ print (sid ,sname )
133+ if sname in set (samps .other_id ):
134+ sampid = samps .loc [samps .other_id == sname ]['improve_sample_id' ].values [0 ]
135+ else :
136+ print ('Missing sample id for ' + sname )
137+ continue
138+ path = sc .get (sid ).path
139+ sampid = samps .loc [samps .other_id == sname ]['improve_sample_id' ].values [0 ]
140+ res = parseMutFile (path ,sampid , genes )
141+ alldats .append (res )
142+ newmut = pd .concat (alldats )
143+ newmut .to_csv ("/tmp/pancpdo_mutations.csv.gz" ,compression = 'gzip' ,index = False )
144+
93145if __name__ == '__main__' :
94146 main ()
0 commit comments