11'''
2- gets nci60 data from 10/2023 release
2+ gets nci60 data from 10/2024 release
33
44'''
55
1212from urllib import request
1313
1414conc_data = 'https://wiki.nci.nih.gov/download/attachments/147193864/DOSERESP.zip?version=11&modificationDate=1712351454136&api=v2'
15+ ## OCT 2024 release (version=13): this assignment overrides the conc_data URL set just above
16+ conc_data = 'https://wiki.nci.nih.gov/download/attachments/147193864/DOSERESP.zip?version=13&modificationDate=1727922354561&api=v2'
17+
1518cancelled = 'https://wiki.nci.nih.gov/download/attachments/147193864/DOSERESP_Cancelled.csv?version=1&modificationDate=1660871847000&api=v2&download=true'
1619
1720def main ():
1821
1922 parser = argparse .ArgumentParser ()
2023 parser .add_argument ('--sampleFile' ,dest = 'samplefile' ,default = None ,help = 'DepMap sample file' )
2124 parser .add_argument ('--drugFile' ,dest = 'dfile' ,default = None ,help = 'Drug database' )
25+
2226
2327 opts = parser .parse_args ()
2428
@@ -31,7 +35,7 @@ def main():
3135 samples = pl .read_csv (samplefile ,quote_char = '"' )
3236 drugs = pl .read_csv (drugfile ,separator = '\t ' ,quote_char = '"' )
3337
34- dose_resp = pl .read_csv ("DOSERESP.csv" ,quote_char = '"' ,infer_schema_length = 10000000 )
38+ dose_resp = pl .read_csv ("DOSERESP.csv" ,quote_char = '"' ,infer_schema_length = 10000000 , ignore_errors = True )
3539
3640 ##update drug mapping
3741 drugmapping = pl .DataFrame (
@@ -47,7 +51,10 @@ def main():
4751 ###update sample mapping
4852 on = samples [['other_names' ,'improve_sample_id' ]]
4953 on .columns = ['common_name' ,'improve_sample_id' ]
50-
54+
55+ #there should be 71 cell lines, but there are 163.
56+ # 82 map to the 'other_names'
57+ # 81 map to neither
5158 sampmapping = pl .concat ([on [['common_name' ,'improve_sample_id' ]],samples [['common_name' ,'improve_sample_id' ]]])
5259
5360 sampmapping = sampmapping .unique ()
@@ -64,44 +71,50 @@ def main():
6471
6572
6673 ##now we can merge all the data into the dose response data frame
67- merged = dose_resp [['AVERAGE_PTC' ,'CONCENTRATION' ,'CELL_NAME' ,'EXPID' ,'NSC' ]].join (sampmapping ,on = 'CELL_NAME' ,how = 'left' )
74+ merged = dose_resp [['AVERAGE_PTC' ,'CONCENTRATION_UNIT' , ' CONCENTRATION' ,'CELL_NAME' ,'EXPID' ,'NSC' ]].join (sampmapping ,on = 'CELL_NAME' ,how = 'left' )
6875 merged = merged .join (timemapping ,on = 'EXPID' ,how = 'left' )
6976
7178 ##clean up missing samples
7178 nonulls = merged .filter (pl .col ('improve_sample_id' ).is_not_null ())
7279
7380 nulls = merged .filter (pl .col ('improve_sample_id' ).is_null ())
7481
75- newnames = pl .DataFrame (
76- {
77- 'new_name' : [re .split (r' |\(|/' , a )[0 ] for a in nulls ['CELL_NAME' ]],
78- 'CELL_NAME' :nulls ['CELL_NAME' ]
79- }
80- )
81- newnames = newnames .unique ()
82+ # newnames = pl.DataFrame(
83+ # {
84+ # 'new_name':[re.split(' |\(|\/',a)[0] for a in nulls['CELL_NAME']],
85+ # 'CELL_NAME':nulls['CELL_NAME']
86+ # }
87+ # )
88+ # newnames = newnames.unique()
89+
8290
83- fixed = nulls [['AVERAGE_PTC' ,'CONCENTRATION' ,'CELL_NAME' ,'EXPID' ,'NSC' ,'time' ,'time_unit' ]].join (newnames ,on = 'CELL_NAME' ,how = 'left' )
84- fixed .columns = ['AVERAGE_PTC' ,'CONCENTRATION' ,'old_CELL_NAME' ,'EXPID' ,'NSC' ,'time' ,'time_unit' ,'CELL_NAME' ]
85- fixed = fixed .join (sampmapping ,on = 'CELL_NAME' ,how = 'left' )[['AVERAGE_PTC' ,'CONCENTRATION' ,'old_CELL_NAME' ,'EXPID' ,'NSC' ,'improve_sample_id' ,'time' ,'time_unit' ]]
86- fixed .columns = ['AVERAGE_PTC' ,'CONCENTRATION' ,'CELL_NAME' ,'EXPID' ,'NSC' ,'improve_sample_id' ,'time' ,'time_unit' ]
87- fixed = fixed .filter (pl .col ('improve_sample_id' ).is_not_null ())
91+ # fixed = nulls[['AVERAGE_PTC','CONCENTRATION_UNIT ','CONCENTRATION','CELL_NAME','EXPID','NSC','time','time_unit']].join(newnames,on='CELL_NAME',how='left')
92+ # merged .columns = ['AVERAGE_PTC','CONCENTRATION_UNIT ','CONCENTRATION','old_CELL_NAME','EXPID','NSC','time','time_unit','CELL_NAME']
93+ # fixed = merged .join(sampmapping,on='CELL_NAME',how='left')[['AVERAGE_PTC','CONCENTRATION_UNIT ','CONCENTRATION','old_CELL_NAME','EXPID','NSC','improve_sample_id','time','time_unit']]
94+ # fixed.columns = ['AVERAGE_PTC','CONCENTRATION_UNIT ','CONCENTRATION','CELL_NAME','EXPID','NSC','improve_sample_id','time','time_unit']
95+ # fixed = fixed.filter(pl.col('improve_sample_id').is_not_null())
8896
89- merged = pl .concat ([nonulls ,fixed ])
97+ merged = nonulls # pl.concat([nonulls,fixed])
9098
9199 ###we get a few more results added, but still missing a bunch
92100 merged = merged .join (drugmapping ,on = 'NSC' ,how = 'left' )
93101 nulldrugs = merged .filter (pl .col ('improve_drug_id' ).is_null ())
94102 nonulls = merged .filter (pl .col ('improve_drug_id' ).is_not_null ())
103+
104+ ###now keep only the concentrations reported in Moles ('M') (some are in uM, all are log10); they are converted to uM when building the final table
105+ ##some are provided as molecular weights ('v') or other ('s') and we can't compare
106+ molar = merged .filter (pl .col ('CONCENTRATION_UNIT' )== 'M' )
107+
95108 finaldf = pl .DataFrame (
96109 {
97- 'source' :['NCI60_24 ' for a in nonulls ['improve_drug_id' ]], ##2024 build
98- 'improve_sample_id' :nonulls ['improve_sample_id' ],
99- 'Drug' :nonulls ['improve_drug_id' ],
100- 'study' :['NCI60' for a in nonulls ['improve_drug_id' ]],
101- 'time' :nonulls ['time' ],
102- 'time_unit' :nonulls ['time_unit' ],
103- 'DOSE' : [10 ** a for a in nonulls ['CONCENTRATION' ]],
104- 'GROWTH' :nonulls ['AVERAGE_PTC' ]
110+ 'source' :['NCI60 ' for a in molar ['improve_drug_id' ]], ##2024 build
111+ 'improve_sample_id' :molar ['improve_sample_id' ],
112+ 'Drug' :molar ['improve_drug_id' ],
113+ 'study' : molar [ 'EXPID' ], # ['NCI60' for a in nonulls['improve_drug_id']],
114+ 'time' :molar ['time' ],
115+ 'time_unit' :molar ['time_unit' ],
116+ 'DOSE' : [( 10 ** a ) * 1000000 for a in molar ['CONCENTRATION' ]], ##move from molar to uM to match pharmacoDB
117+ 'GROWTH' :molar ['AVERAGE_PTC' ]
105118 }
106119 )
107120 ##write to file
0 commit comments