Skip to content

Commit 09fb9e5

Browse files
committed
Updated mapping scripts to include all datasets and removed cptac from the defaults. Removed many print statements so that debugging the full build would be easier.
1 parent 797f37c commit 09fb9e5

7 files changed

Lines changed: 19 additions & 91 deletions

File tree

build/hcmi/02-getHCMIData.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -581,14 +581,14 @@ def align_to_schema(data, data_type, chunksize=7500,samples_path='/tmp/hcmi_samp
581581

582582
# Process in chunks
583583
merged_data = pl.DataFrame()
584-
print(f"merged_data:\n {merged_data}")
584+
# print(f"merged_data:\n {merged_data}")
585585

586586
for i in range(0, len(data), chunksize):
587587
chunk = data[i:i + chunksize]
588588
if data_type == "mutations":
589589
chunk = chunk.rename({"Variant_Classification": "variant_classification"})
590590
chunk = chunk.select(selected_columns)
591-
print(f"chunk: \n{chunk}")
591+
# print(f"chunk: \n{chunk}")
592592
merged_chunk = samples.join(chunk, left_on='other_names', right_on='aliquot_id', how='inner')
593593
merged_chunk = merged_chunk.drop(["aliquot_id", "other_names"])
594594

build/pancpdo/02-getPancPDOData.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -415,8 +415,8 @@ def map_and_combine(dataframe_list, data_type, metadata, entrez_map_file):
415415
df_metadata = pl.DataFrame(metadata_dict)
416416

417417
# Merge the metadata DataFrame with the final dataframe based on 'file_id'
418-
print(df_metadata)
419-
print(final_dataframe)
418+
# print(df_metadata)
419+
# print(final_dataframe)
420420
final_dataframe = final_dataframe.join(df_metadata, on='file_id', how='left')
421421

422422
return final_dataframe
@@ -540,14 +540,14 @@ def align_to_schema(data, data_type, chunksize=7500,samples_path='/tmp/hcmi_samp
540540

541541
# Process in chunks
542542
merged_data = pl.DataFrame()
543-
print(f"merged_data:\n {merged_data}")
543+
# print(f"merged_data:\n {merged_data}")
544544

545545
for i in range(0, len(data), chunksize):
546546
chunk = data[i:i + chunksize]
547547
if data_type == "mutations":
548548
chunk = chunk.rename({"Variant_Classification": "variant_classification"})
549549
chunk = chunk.select(selected_columns)
550-
print(f"chunk: \n{chunk}")
550+
# print(f"chunk: \n{chunk}")
551551
merged_chunk = samples.join(chunk, left_on='other_names', right_on='aliquot_id', how='inner')
552552
merged_chunk = merged_chunk.drop(["aliquot_id", "other_names"])
553553

build/utils/build_drug_desc.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -76,17 +76,17 @@ def main():
7676

7777
cores = multiprocessing.cpu_count()
7878
ncors = cores-1
79-
print("Running with "+str(ncors)+' out of '+str(cores)+' processors')
80-
print('Adding drug table for '+args.drugtable)
79+
# print("Running with "+str(ncors)+' out of '+str(cores)+' processors')
80+
# print('Adding drug table for '+args.drugtable)
8181
tab = pd.read_csv(args.drugtable,sep='\t')
8282

8383
cansmiles = [a for a in set(tab.canSMILES) if str(a)!='nan']
8484
# isosmiles = list(set(tab.isoSMILES))
8585
morgs = smiles_to_fingerprint(cansmiles)
8686

8787
ids = pd.DataFrame(tab[['improve_drug_id','canSMILES']]).drop_duplicates()
88-
print("IDS columns:", ids.columns.tolist())
89-
print("MORGS columns:", morgs.columns.tolist())
88+
# print("IDS columns:", ids.columns.tolist())
89+
# print("MORGS columns:", morgs.columns.tolist())
9090
id_morg = ids.rename({"canSMILES":'smile'},axis=1).merge(morgs)[['improve_drug_id','structural_descriptor','descriptor_value']]
9191

9292
mords = smiles_to_mordred(cansmiles,nproc=ncors)
@@ -105,8 +105,8 @@ def main():
105105
full['improve_drug_id'] = full['improve_drug_id'].astype(str).str.strip()
106106
mask = full['improve_drug_id'].str.match(r'^SMI_\d+$')
107107
n_dropped = (~mask).sum()
108-
if n_dropped:
109-
print(f"Dropping {n_dropped} malformed improve_drug_id rows.")
108+
# if n_dropped:
109+
# print(f"Dropping {n_dropped} malformed improve_drug_id rows.")
110110
full = full[mask].copy()
111111

112112

build/utils/pubchem_retrieval.py

Lines changed: 2 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -205,79 +205,6 @@ def timeout_handler(signum, frame):
205205
should_continue = False
206206

207207

208-
# def update_dataframe_and_write_tsv(unique_names, output_filename="drugs.tsv", ignore_chems="ignore_chems.txt",
209-
# batch_size=1, isname=True, time_limit=48 * 60 * 60):
210-
# """
211-
# Updates the data frame with drug information and writes it to a TSV file.
212-
213-
# Parameters:
214-
# - unique_names (iterable): List of unique compound names or CIDs.
215-
# - output_filename (str): File path to the output TSV file.
216-
# - ignore_chems (str): File path to log ignored compounds.
217-
# - batch_size (int): Number of compounds to process in each batch.
218-
# - isname (bool): True if unique_names are names, False if they're CIDs.
219-
# - time_limit (int): Time limit for the script in seconds. This is a remnant of the GitHub Action CI.
220-
221-
# Returns:
222-
# - None
223-
# """
224-
# global should_continue, existing_synonyms, existing_pubchemids
225-
# signal.signal(signal.SIGALRM, timeout_handler)
226-
# signal.alarm(time_limit)
227-
# print(f'Starting with {len(unique_names)} unique drug names/IDs')
228-
229-
# try:
230-
# print(f'Reading existing data from {output_filename}')
231-
# read_existing_data(output_filename)
232-
# if isname:
233-
# unique_names = set([str(name).lower() for name in unique_names if not pd.isna(name)])
234-
# unique_names = set(unique_names) - set(existing_synonyms)
235-
# print(f'Looking at {len(unique_names)} names')
236-
# else:
237-
# unique_names = set([str(name) for name in unique_names if not pd.isna(name)])
238-
# unique_names = set(unique_names) - set(existing_pubchemids)
239-
# print(f'Looking at {len(unique_names)} IDs')
240-
# ignore_chem_set = set()
241-
# if os.path.exists(ignore_chems):
242-
# with open(ignore_chems, 'r') as file:
243-
# for line in file:
244-
# ignore_chem_set.add(line.strip())
245-
# unique_names = list(set(unique_names) - ignore_chem_set)
246-
247-
# print(f"{len(unique_names)} Drugs to search")
248-
# for i in range(0, len(unique_names), batch_size):
249-
# if not should_continue:
250-
# break
251-
# if unique_names[i] in existing_synonyms or unique_names[i] in existing_pubchemids:
252-
# continue
253-
254-
# batch = unique_names[i:i + batch_size]
255-
# data = fetch_data_for_batch(batch, ignore_chems, isname)
256-
# if data:
257-
# file_exists = os.path.isfile(output_filename)
258-
# mode = 'a' if file_exists else 'w'
259-
# with open(output_filename, mode) as f:
260-
# if not file_exists:
261-
# f.write("improve_drug_id\tchem_name\tpubchem_id\tcanSMILES\tInChIKey\tformula\tweight\n")
262-
# for entry in data:
263-
# f.write(f"{entry['improve_drug_id']}\t{entry['name']}\t{entry.get('CID', '')}\t"
264-
# f"{entry['SMILES']}\t{entry['InChIKey']}\t"
265-
# f"{entry['MolecularFormula']}\t{entry['MolecularWeight']}\n")
266-
267-
# with open(ignore_chems, "a") as ig_f:
268-
# for entry in data:
269-
# if isname:
270-
# ig_f.write(f"{entry['name']}\n")
271-
# else:
272-
# ig_f.write(f"{entry.get('CID', '')}\n")
273-
274-
# except Exception as e:
275-
# print(f"An unexpected error occurred: {e}")
276-
# finally:
277-
# signal.alarm(0)
278-
279-
280-
281208

282209

283210
def _load_prev_drugs_union(prevDrugFilepath: str) -> pd.DataFrame:
@@ -497,8 +424,8 @@ def update_dataframe_and_write_tsv(unique_names,
497424
nums_comb = pd.to_numeric(extracted_comb, errors="coerce")
498425
if not nums_comb.empty:
499426
new_ids = set(combined.loc[nums_comb > previous_max, "improve_drug_id"])
500-
if new_ids:
501-
print(f"Newly assigned improve_drug_id(s): {new_ids}")
427+
# if new_ids:
428+
# print(f"Newly assigned improve_drug_id(s): {new_ids}")
502429

503430
# --- 9) union and filter final DataFrame by improve_drug_id(s) ---
504431
keep_ids = hit_ids.union(new_ids)

scripts/align_drug_descriptors.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,8 @@ def rewrite_files(files, ref):
6565
key = (row['improve_drug_id'], row['structural_descriptor'])
6666
correct = ref.get(key)
6767
if correct is not None and row['descriptor_value'] != correct:
68-
print(f"Fixing {key} in {os.path.basename(fp)}: "
69-
f"{row['descriptor_value']} to {correct}")
68+
# print(f"Fixing {key} in {os.path.basename(fp)}: "
69+
# f"{row['descriptor_value']} to {correct}")
7070
row['descriptor_value'] = correct
7171
changed = True
7272
writer.writerow(row)

scripts/map_improve_drug_ids.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -369,7 +369,7 @@ def main():
369369
help='Build date in YYYY-MM-DD. Default=now.')
370370
parser.add_argument('--version', required=True,
371371
help='Build version. Must be unique per build.')
372-
parser.add_argument('--datasets', default='gdscv1,ccle,ctrpv2,fimm,gcsi,gdscv2,nci60,prism,beataml,mpnst,mpnstpdx,pancpdo,bladderpdo,sarcpdo',
372+
parser.add_argument('--datasets', default='gdscv1,ccle,ctrpv2,fimm,gcsi,gdscv2,nci60,prism,beataml,pancpdo,bladderpdo,sarcpdo,liverpdo,novartispdx,mpnst',
373373
help='Comma-separated list of datasets.')
374374
parser.add_argument('--local_dir', default='data',
375375
help='Directory containing TSV files.')
@@ -378,6 +378,7 @@ def main():
378378
parser.add_argument('--input_files', nargs='+',
379379
help='List of input files to process. If specified, only these files will be processed.')
380380
args = parser.parse_args()
381+
381382

382383
# Set build_date
383384
build_date = args.build_date or datetime.utcnow().strftime("%Y-%m-%d")

scripts/map_improve_sample_ids.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -412,7 +412,7 @@ def main():
412412
help='Build date in YYYY-MM-DD. Default=now.')
413413
parser.add_argument('--version', required=True,
414414
help='Build version. Must be unique per build.')
415-
parser.add_argument('--datasets', default='ccle,ctrpv2,fimm,gcsi,gdscv1,gdscv2,nci60,prism,hcmi,beataml,cptac,mpnst,mpnstpdx,pancpdo,bladderpdo,sarcpdo',
415+
parser.add_argument('--datasets', default='ccle,ctrpv2,fimm,gcsi,gdscv1,gdscv2,nci60,prism,hcmi,beataml,cptac,pancpdo,bladderpdo,sarcpdo,liverpdo,novartispdx,mpnst',
416416
help='Comma-separated list of datasets, e.g., beataml,ccle')
417417
parser.add_argument('--local_dir', default='data',
418418
help='Directory containing all CSV/TSV files.')

0 commit comments

Comments (0)