@@ -129,7 +129,7 @@ def buildTumorSampleTable(sample_names, cancer_type, samples, maxval):
129129 samples = samples .reset_index (drop = True )
130130 return samples , maxval
131131
132- def formatMutData (df , dtype , ctype , samp_names , source , samples ):
132+ def formatMutData (df , dtype , ctype , samp_names , source , genes , samples ):
133133 '''
134134 Formats mutational data.
135135 '''
@@ -159,6 +159,10 @@ def formatMutData(df, dtype, ctype, samp_names, source, samples):
159159 'Mutation' : 'mutation'
160160 })
161161 blongdf = blongdf [['improve_sample_id' , 'entrez_id' , 'mutation' , 'variant_classification' , 'source' , 'study' ]]
162+
163+ # Drop rows whose entrez_id is not present in the genes table (genes['entrez_id']).
164+ valid = set (genes ['entrez_id' ].astype (int ))
165+ blongdf = blongdf [blongdf .entrez_id .isin (valid )]
162166 return blongdf
163167
164168
@@ -366,7 +370,7 @@ def main():
366370 df .dropna (how = 'all' , axis = 0 , inplace = True )
367371 print (cancertype + ' ' + dtype )
368372 if dtype == 'somatic_mutation' :
369- fdf = formatMutData (df , 'mutation' , cancertype , tumor_samps , all_sources [dtype ], samples )
373+ fdf = formatMutData (df , 'mutation' , cancertype , tumor_samps , all_sources [dtype ], genes , samples )
370374 fdf = fdf .reset_index (drop = True )
371375 dtype_key = 'mutations'
372376 elif dtype == 'CNV' :
@@ -393,6 +397,7 @@ def main():
393397 print (df .to_string ())
394398 df ['entrez_id' ] = df ['entrez_id' ].fillna (0 )
395399 df ['entrez_id' ] = df ['entrez_id' ].astype (int )
400+ df = df [df .entrez_id != 0 ]
396401 df .to_csv ("/tmp/" + "cptac_" + dtype_key + '.csv.gz' , sep = ',' , index = False , compression = 'gzip' )
397402
398403if __name__ == '__main__' :
0 commit comments