@@ -128,34 +128,34 @@ def map_mutations(mutation_data, improve_id_data, entrez_data):
128128 entrez_data = pd .read_csv (entrez_data )
129129
130130 # create mutation names using chr, position, etc
131- mutations_df .columns = mutations_df .iloc [0 ]
132- mutations_df = mutations_df .drop ([0 ], axis = 0 )
133- mutations_df ['mutation' ] = "g." + mutations_df ['Chromosome' ] + ":" + np .where (mutations_df ['Start_Position' ] == mutations_df ['End_Position' ], mutations_df ['Start_Position' ].astype (str ), mutations_df ['Start_Position' ].astype (str ) + "_" + mutations_df ['End_Position' ].astype (str ))
134- for index , row in mutations_df .iterrows ():
135- if mutations_df .at [index ,'Variant_Classification' ].__contains__ ("Ins" ):
136- mutations_df .at [index ,'mutation' ] = mutations_df .at [index ,'mutation' ] + "ins" + mutations_df .at [index ,'Tumor_Seq_Allele2' ]
137- elif mutations_df .at [index ,'Variant_Classification' ].__contains__ ("Del" ):
138- mutations_df .at [index ,'mutation' ] = mutations_df .at [index ,'mutation' ] + "del" + mutations_df .at [index ,'Tumor_Seq_Allele1' ]
131+ mutation_data .columns = mutation_data .iloc [0 ]
132+ mutation_data = mutation_data .drop ([0 ], axis = 0 )
133+ mutation_data ['mutation' ] = "g." + mutation_data ['Chromosome' ] + ":" + np .where (mutation_data ['Start_Position' ] == mutation_data ['End_Position' ], mutation_data ['Start_Position' ].astype (str ), mutation_data ['Start_Position' ].astype (str ) + "_" + mutation_data ['End_Position' ].astype (str ))
134+ for index , row in mutation_data .iterrows ():
135+ if mutation_data .at [index ,'Variant_Classification' ].__contains__ ("Ins" ):
136+ mutation_data .at [index ,'mutation' ] = mutation_data .at [index ,'mutation' ] + "ins" + mutation_data .at [index ,'Tumor_Seq_Allele2' ]
137+ elif mutation_data .at [index ,'Variant_Classification' ].__contains__ ("Del" ):
138+ mutation_data .at [index ,'mutation' ] = mutation_data .at [index ,'mutation' ] + "del" + mutation_data .at [index ,'Tumor_Seq_Allele1' ]
139139 else :
140- mutations_df .at [index ,'mutation' ] = mutations_df .at [index ,'mutation' ] + mutations_df .at [index ,'Tumor_Seq_Allele1' ] + ">" + mutations_df .at [index ,'Tumor_Seq_Allele2' ]
140+ mutation_data .at [index ,'mutation' ] = mutation_data .at [index ,'mutation' ] + mutation_data .at [index ,'Tumor_Seq_Allele1' ] + ">" + mutation_data .at [index ,'Tumor_Seq_Allele2' ]
141141
142142 # map columns in mutations data to their improved id
143- sample_mutations_df = pd .merge (mutations_df , samples_df [['other_id' ,'improve_sample_id' ]], how = 'inner' , left_on = "Tumor_Sample_Barcode" , right_on = "other_id" )
143+ sample_mutation_data = pd .merge (mutation_data , improve_id_data [['other_id' ,'improve_sample_id' ]], how = 'inner' , left_on = "Tumor_Sample_Barcode" , right_on = "other_id" )
144144
145145 # the data's variant classification matches scheme well, except "Non-coding_Transcript". let's change those to RNA
146- sample_entrez_mutations_df = pd .merge (sample_mutations_df , entrez_df [['entrez_id' ,'other_id' ]], how = 'left' , left_on = "Hugo_Symbol" , right_on = "other_id" ) # merge with our entrez database to see if we have additional matches
146+ sample_entrez_mutation_data = pd .merge (sample_mutation_data , entrez_data [['entrez_id' ,'other_id' ]], how = 'left' , left_on = "Hugo_Symbol" , right_on = "other_id" ) # merge with our entrez database to see if we have additional matches
147147
148148 # clean up column names and data types
149- columns_to_drop = set (sample_entrez_mutations_df .columns ) - set (['entrez_id' ,'mutation' ,'Variant_Classification' ,'improve_sample_id' ])
150- mapped_mutations_df = sample_entrez_mutations_df .drop (columns = columns_to_drop )
151- mapped_mutations_df = mapped_mutations_df .rename (columns = {'Variant_Classification' :'variant_classification' })
152- mapped_mutations_df ['source' ] = "Synapse"
153- mapped_mutations_df ['study' ] = "liverpdo"
154- mapped_mutations_df = mapped_mutations_df .astype ({'entrez_id' :'int' ,'improve_sample_id' :'int' })
155- mapped_mutations_df = mapped_mutations_df .drop_duplicates ()
156- mapped_mutations_df = mapped_mutations_df [['entrez_id' ,'mutation' ,'variant_classification' ,'improve_sample_id' ,'study' ,'source' ]]
149+ columns_to_drop = set (sample_entrez_mutation_data .columns ) - set (['entrez_id' ,'mutation' ,'Variant_Classification' ,'improve_sample_id' ])
150+ mapped_mutation_data = sample_entrez_mutation_data .drop (columns = columns_to_drop )
151+ mapped_mutation_data = mapped_mutation_data .rename (columns = {'Variant_Classification' :'variant_classification' })
152+ mapped_mutation_data ['source' ] = "Synapse"
153+ mapped_mutation_data ['study' ] = "liverpdo"
154+ mapped_mutation_data = mapped_mutation_data .astype ({'entrez_id' :'int' ,'improve_sample_id' :'int' })
155+ mapped_mutation_data = mapped_mutation_data .drop_duplicates ()
156+ mapped_mutation_data = mapped_mutation_data [['entrez_id' ,'mutation' ,'variant_classification' ,'improve_sample_id' ,'study' ,'source' ]]
157157
158- return (mapped_mutations_df )
158+ return (mapped_mutation_data )
159159
160160
161161def get_copy_call (a ):
@@ -235,8 +235,9 @@ def map_transcriptomics(transciptomics_data, improve_id_data, entrez_data):
235235 entrez_data = pd .read_csv (entrez_data )
236236
237237 # first, convert genes, which are in ensembl id's to gene names
238+ transciptomics_data = transciptomics_data .rename (columns = {'Unnamed: 0' : 'stable_id' })
238239 mg = mygene .MyGeneInfo ()
239- ensembl_ids = transciptomics_data . iloc [:, 0 ].values
240+ ensembl_ids = transciptomics_data [ 'stable_id' ].values
240241 gene_info_list = mg .getgenes (ensembl_ids , fields = 'symbol' )
241242 gene_df = pd .DataFrame .from_dict (gene_info_list )
242243 for_tpm = pd .merge (transciptomics_data , gene_df [['query' ,'symbol' ]], how = 'inner' , left_on = "stable_id" , right_on = "query" )
@@ -266,6 +267,7 @@ def map_transcriptomics(transciptomics_data, improve_id_data, entrez_data):
266267 mapped_transcriptomics_df = mapped_transcriptomics_df .drop (columns = ['stable_id' ,'variable' ,'other_id_x' ,'other_id_y' ])
267268 mapped_transcriptomics_df ['source' ] = "Synapse"
268269 mapped_transcriptomics_df ['study' ] = "liverpdo"
270+ mapped_transcriptomics_df = mapped_transcriptomics_df .dropna ()
269271 mapped_transcriptomics_df = mapped_transcriptomics_df .astype ({'entrez_id' :'int' ,'improve_sample_id' :'int' })
270272 mapped_transcriptomics_df = mapped_transcriptomics_df [['entrez_id' ,'transcriptomics' ,'improve_sample_id' ,'source' ,'study' ]]
271273
@@ -297,10 +299,10 @@ def map_transcriptomics(transciptomics_data, improve_id_data, entrez_data):
297299 # Download and parse rnaseq data
298300 rnaseq_df = download_parse_rna_data (synID = "syn68327513" , synToken = args .token , save_path = "/tmp/" )
299301 # Download rest of omics data
300- mutations_df , copynum_df , proteomics_df = download_parse_omics_data (synID = "syn66401303" , synToken = args .token , save_path = "/tmp/" )
302+ mutation_data , copynum_df , proteomics_df = download_parse_omics_data (synID = "syn66401303" , synToken = args .token , save_path = "/tmp/" )
301303 # Save mutation and copy number data into csv format
302304 rnaseq_df .to_csv ("/tmp/raw_rnaseq_data.csv" )
303- mutations_df .to_csv ("/tmp/raw_mutation_data.csv" )
305+ mutation_data .to_csv ("/tmp/raw_mutation_data.csv" )
304306 copynum_df .to_csv ("/tmp/raw_copynum_data.csv" )
305307 proteomics_df .to_csv ("/tmp/raw_proteomics_data.csv" )
306308
0 commit comments