Skip to content

Commit 02b65aa

Browse files
committed
Added pre-split script to remove problematic drugs
1 parent 0054cab commit 02b65aa

4 files changed

Lines changed: 51 additions & 4 deletions

File tree

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
import gc
2+
import polars as pl
3+
4+
5+
6+
def main(
    drugs_path="broad_sanger_drugs.tsv",
    experiments_path="broad_sanger_experiments.tsv",
    problem_drugs=None,
):
    """Remove problematic drugs from the drug and experiment tables.

    Intended to run before the data is split into per-dataset files.
    Both tables are read as tab-separated files, filtered, and written
    back to the same paths (the inputs are overwritten in place).

    A drug is considered problematic when its ``chem_name`` matches one of
    ``problem_drugs`` case-insensitively. Every row that shares an
    ``improve_drug_id`` with such a match is dropped from both tables, so
    rows carrying the same ID under a different ``chem_name`` go as well.

    Args:
        drugs_path: Path of the tab-separated drugs table. It must have
            ``chem_name`` and ``improve_drug_id`` columns.
        experiments_path: Path of the tab-separated experiments table. It
            must have an ``improve_drug_id`` column.
        problem_drugs: Iterable of chemical names to remove. ``None``
            selects the built-in list below.
    """
    if problem_drugs is None:
        problem_drugs = (
            "brd-k03911514",
            "brd-k07442505",
            "brd-k13185470",
            "brd-k16130065",
            "brd-k20514654",
            "brd-k27188169",
            "brd-k55473186",
            "yl54",
            "brd-k58730230",
            "brd-k79669418",
            "brd-k99584050",
        )

    # Lowercase the requested names as well as the column values, so the
    # match stays case-insensitive even if a caller passes mixed-case names.
    wanted_names = sorted({name.lower() for name in problem_drugs})

    # Load both tables up front, before either file is overwritten.
    all_drugs = pl.read_csv(drugs_path, separator="\t")
    all_experiments = pl.read_csv(experiments_path, separator="\t")

    # Collect the IDs of every drug whose name is on the removal list.
    name_matches = pl.col("chem_name").str.to_lowercase().is_in(wanted_names)
    removed_ids = (
        all_drugs.filter(name_matches)["improve_drug_id"].unique().to_list()
    )

    # The same exclusion applies to both tables: drop any row whose
    # improve_drug_id belongs to a removed drug.
    not_removed = ~pl.col("improve_drug_id").is_in(removed_ids)
    all_drugs = all_drugs.filter(not_removed)
    all_experiments = all_experiments.filter(not_removed)

    all_drugs.write_csv(drugs_path, separator="\t")
    all_experiments.write_csv(experiments_path, separator="\t")


if __name__ == "__main__":
    main()

build/broad_sanger/05_separate_datasets.py renamed to build/broad_sanger/05b_separate_datasets.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55

66
def main():
7-
87
datasets_to_process = ["CCLE", "CTRPv2", "PRISM", "GDSCv1", "GDSCv2", "FIMM", "gCSI", "NCI60"]
98
omics_datatypes = ["transcriptomics","proteomics", "copy_number","mutations"] # csv
109
samples_datatypes = ["samples"] #csv

build/broad_sanger/build_misc.sh

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,12 @@ set -euo pipefail
44
trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
55

66
cp /tmp/broad_sanger* .
7-
echo "Running 05_separate_datasets.py..."
8-
/opt/venv/bin/python 05_separate_datasets.py
7+
8+
echo "Running 05a_remove_problem_drugs.py..."
9+
/opt/venv/bin/python 05a_remove_problem_drugs.py
10+
11+
echo "Running 05b_separate_datasets.py..."
12+
/opt/venv/bin/python 05b_separate_datasets.py
913

1014
echo "Removing broad_sanger* files..."
1115
rm broad_sanger*

build/docker/Dockerfile.broad_sanger_omics

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@ ADD build/broad_sanger/build_samples.sh ./
3434
ADD build/broad_sanger/build_omics.sh ./
3535
ADD build/utils/* ./
3636
ADD build/broad_sanger/build_misc.sh ./
37-
ADD build/broad_sanger/05_separate_datasets.py ./
37+
ADD build/broad_sanger/05a_remove_problem_drugs.py ./
38+
ADD build/broad_sanger/05b_separate_datasets.py ./
3839

3940
ADD build/broad_sanger/requirements.txt .
4041
ADD build/broad_sanger/omics_requirements.r .

0 commit comments

Comments
 (0)