@@ -209,12 +209,12 @@ def process_datasets(args):
209209 #-------------------------------------------------------------------
210210
211211
212- split_data_sets (
213- args = args ,
214- data_sets = data_sets ,
215- data_sets_info = data_sets_info ,
216- response_data = response_data
217- )
212+ # split_data_sets(
213+ # args=args,
214+ # data_sets=data_sets,
215+ # data_sets_info=data_sets_info,
216+ # response_data=response_data
217+ # )
218218
219219 #-------------------------------------------------------------------
220220 # getting common / reference gene symbols
@@ -439,6 +439,48 @@ def process_datasets(args):
439439 )
440440
441441
442+ #-------------------------------------------------------------------
443+ # create mordred table
444+ #-------------------------------------------------------------------
445+
446+ dfs_to_merge = {}
447+ for data_set in data_sets :
448+ if (data_sets [data_set ].experiments is not None
449+ and data_sets [data_set ].drug_descriptors is not None
450+ ):
451+ df_tmp = data_sets [data_set ].format (data_type = 'drug_descriptor' , shape = 'wide' )
452+ df_tmp = df_tmp .drop (columns = ['morgan fingerprint' ]).add_prefix ('mordred.' )
453+ dfs_to_merge [data_set ] = df_tmp
454+
455+ concat_drugs = pd .concat (dfs_to_merge .values ())
456+ concat_drugs = concat_drugs .replace ({'False' : '0' , 'True' : '1' })
457+ cols = concat_drugs .columns
458+ concat_drugs [cols ] = concat_drugs [cols ].apply (pd .to_numeric , errors = 'coerce' )
459+ out_df = concat_drugs .reset_index ()
460+ out_df = out_df .fillna (0 ).round (4 ).drop_duplicates (subset = ['improve_drug_id' ], keep = 'first' )
461+
462+ if args .EXCL_DRUGS_LIST is not None :
463+ logger .info (
464+ f"Removing all chemical compounds with ids: '{args.EXCL_DRUGS_LIST}'"
465+ )
466+ out_df = out_df [~ out_df ['improve_drug_id' ].isin (args .EXCL_DRUGS_LIST )]
467+
468+ out_df .rename (
469+ columns = {'improve_drug_id' : 'improve_chem_id' },
470+ inplace = True ,
471+ )
472+
473+ outfile_path = args .WORKDIR .joinpath (
474+ "data_out" ,
475+ "x_data" ,
476+ "drug_mordred.tsv"
477+ )
478+ out_df .to_csv (
479+ path_or_buf = outfile_path ,
480+ sep='\t',
481+ index = False ,
482+ )
483+
442484 #-------------------------------------------------------------------
443485 # create mutation count table
444486 #-------------------------------------------------------------------
@@ -514,6 +556,7 @@ def process_datasets(args):
514556 index = False
515557 )
516558
559+
517560def split_data_sets (
518561 args : dict ,
519562 data_sets : dict ,
0 commit comments