@@ -100,7 +100,13 @@ def main():
100100 genes = pd .read_csv (args .genes )
101101
102102 sc = synapseclient .login (args .token )
103-
103+ ##to double check identifiers, we use transcriptomics data since that determines what samples were sequenced
104+ trans = pd .read_csv ('/tmp/pancpdo_transcriptomics.csv.gz' )
105+ tsamps = samps [samps .improve_sample_id .isin (trans .improve_sample_id )]
106+ print (samps .shape )
107+ print (tsamps .shape )
108+
109+
104110 missingsamples = []
105111 if args .copy :
106112 ##query synapse view for files
@@ -112,11 +118,17 @@ def main():
112118 sname = row ['name' ].split ('--' )[0 ]
113119 print (sid ,sname )
114120 path = sc .get (sid ).path
115- if sname in set (samps .other_id ):
121+ if sname in set (tsamps .other_id ):
122+ print (sname + ' in transcriptomics, using that id' )
123+ sampid = tsamps .loc [tsamps .other_id == sname ]['improve_sample_id' ].values [0 ]
124+ missingsamples .append ('copy,trans,' + sname )
125+ elif sname in set (samps .other_id ):
126+ print (sname + ' in samples but not transcriptomics, using other id' )
116127 sampid = samps .loc [samps .other_id == sname ]['improve_sample_id' ].values [0 ]
128+ missingsamples .append ("copy,notrans," + sname )
117129 else :
118- print ('Missing sample id for ' + sname )
119- missingsamples .append ('copy,' + sname )
130+ print ('Missing sample id for ' + sname , ' skipping for now' )
131+ missingsamples .append ('copy,missed, ' + sname )
120132 continue
121133 sampid = samps .loc [samps .other_id == sname ]['improve_sample_id' ].values [0 ]
122134 res = parseCNVFile (path ,sampid , genes )
@@ -132,8 +144,14 @@ def main():
132144 sname = row ['name' ].split ('--' )[0 ]
133145 sid = row .id
134146 print (sid ,sname )
135- if sname in set (samps .other_id ):
147+ if sname in set (tsamps .other_id ):
148+ print (sname + ' in transcriptomics, using that id' )
149+ sampid = tsamps .loc [tsamps .other_id == sname ]['improve_sample_id' ].values [0 ]
150+ missingsamples .append ('mutation,trans,' + sname )
151+ elif sname in set (samps .other_id ):
152+ print (sname + ' in samples but not transcriptomics, using other id' )
136153 sampid = samps .loc [samps .other_id == sname ]['improve_sample_id' ].values [0 ]
154+ missingsamples .append ('mutation,notrans,' + sname )
137155 else :
138156 print ('Missing sample id for ' + sname )
139157 missingsamples .append ('mutation,' + sname )
@@ -144,6 +162,6 @@ def main():
144162 alldats .append (res )
145163 newmut = pd .concat (alldats )
146164 newmut .to_csv ("/tmp/pancpdo_mutations.csv.gz" ,compression = 'gzip' ,index = False )
147- missingsamples .to_csv ('missing.csv' )
165+ pd . DataFrame ( missingsamples ) .to_csv ('missing.csv' , index = False , quoting = None , header = False )
148166if __name__ == '__main__' :
149167 main ()
0 commit comments