Skip to content

Commit b3ed31f

Browse files
authored
Merge pull request #410 from PNNL-CompBio/multi_bug_fix
Multiple fixes for data inconsistencies across all datasets.
2 parents 546c900 + 4388930 commit b3ed31f

14 files changed

Lines changed: 65 additions & 23 deletions

build/beatAML/GetBeatAML.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -665,6 +665,7 @@ def generate_drug_list(drug_map_path,drug_path):
665665
print(improve_map_file)
666666
t_df = map_and_combine(t_df, "transcriptomics", args.genes, improve_map_file, sample_mapping_file)
667667
t_df = t_df[t_df.entrez_id.notna()]
668+
t_df = t_df[t_df.entrez_id != 0]
668669
t_df = t_df[["improve_sample_id","transcriptomics","entrez_id","source","study"]].drop_duplicates()
669670
t_df.to_csv("/tmp/beataml_transcriptomics.csv.gz",index=False,compression='gzip')
670671

@@ -676,14 +677,15 @@ def generate_drug_list(drug_map_path,drug_path):
676677
p_df = pd.melt(p_df, id_vars=['Protein'], var_name='id', value_name='proteomics')
677678
p_df = map_and_combine(p_df, "proteomics", args.genes, improve_map_file, proteomics_map)
678679
p_df = p_df[["improve_sample_id","proteomics","entrez_id","source","study"]]
680+
p_df = p_df[p_df.entrez_id != 0]
679681
p_df.to_csv("/tmp/beataml_proteomics.csv.gz",index=False,compression='gzip')
680682

681683
# New Mutation Data
682684
print("Starting Mutation Data")
683685
m_df = pd.read_csv(mutations_file, sep = '\t')
684-
685686
m_df = map_and_combine(m_df, "mutations", args.genes,improve_map_file, mutation_map_file)
686687
m_df = m_df[["improve_sample_id","mutation", "entrez_id","variant_classification","source","study"]]
688+
m_df = m_df[m_df.entrez_id != 0]
687689
m_df.to_csv("/tmp/beataml_mutations.csv.gz",index=False,compression='gzip')
688690

689691
if args.exp:

build/bladderpdo/00_createBladderPDOSampleFile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def _parse_model_type(sample_id):
3131
if "_xenoorganoid" in low:
3232
return "xenograft derived organoid"
3333
if "_organoid" in low:
34-
return "organoid"
34+
return "patient derived organoid"
3535
if "_xenograft" in low:
3636
return "patient derived xenograft"
3737
if "_parental" in low:

build/bladderpdo/01_createBladderPDOOmicsFiles.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -104,8 +104,11 @@ def get_bladder_pdo_mutations(synObject, samples, genes):
104104
final_mutations = merged_mutations_renamed[['entrez_id', "mutation", "variant_classification", "improve_sample_id"]]
105105
final_mutations['study'] = "Lee etal 2018 Bladder PDOs"
106106
final_mutations = final_mutations.dropna(subset=["entrez_id"])
107-
final_mutations["improve_sample_id"] = final_mutations["improve_sample_id"].astype(int)
108-
final_mutations["entrez_id"] = final_mutations["entrez_id"].astype(int)
107+
final_mutations["improve_sample_id"] = final_mutations["improve_sample_id"].astype(int)
108+
# Drop rows whose entrez_id is missing (NaN) or equal to zero.
109+
final_mutations = final_mutations.dropna(subset=["entrez_id"])
110+
final_mutations["entrez_id"] = final_mutations["entrez_id"].astype(int)
111+
final_mutations = final_mutations[final_mutations["entrez_id"] != 0]
109112
return final_mutations
110113

111114
def get_bladder_pdo_copynumber(synObject, samples, genes):
@@ -124,7 +127,12 @@ def get_bladder_pdo_copynumber(synObject, samples, genes):
124127
final_copynumber['study'] = "Lee etal 2018 Bladder PDOs"
125128
final_copynumber = final_copynumber.dropna(subset=["entrez_id"])
126129
final_copynumber["improve_sample_id"] = final_copynumber["improve_sample_id"].astype(int)
127-
final_copynumber["entrez_id"] = final_copynumber["entrez_id"].astype(int)
130+
#Drop genes that don't map to genes.csv
131+
valid_entrez = set(genes['entrez_id'].astype(int))
132+
final_copynumber = final_copynumber[
133+
final_copynumber['entrez_id'].isin(valid_entrez)
134+
]
135+
final_copynumber["entrez_id"] = final_copynumber["entrez_id"].astype(int)
128136
return final_copynumber
129137

130138

build/broad_sanger/02-broadSangerOmics.R

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,7 @@ depmap_files<-function(fi,value){
405405

406406
res<-exp_file|>
407407
mutate(entrez_id=as.numeric(EntrezGeneID))|>
408+
filter(entrez_id %in% genes$entrez_id) |>
408409
left_join(as.data.frame(depmap_vtab))
409410

410411
##now many variants are missing???
@@ -442,7 +443,8 @@ depmap_files<-function(fi,value){
442443
print("wide to long")
443444
res = tidyr::pivot_longer(data=exp_file,cols=c(2:ncol(exp_file)),
444445
names_to='gene_entrez',values_to='transcriptomics',
445-
values_transform=list(expression=as.numeric))
446+
values_transform=list(transcriptomics=as.numeric))|>
447+
dplyr::mutate(transcriptomics = 2^transcriptomics - 1)
446448
colnames(res)[1]<-'other_id'
447449

448450
print('fixing gene names')

build/broad_sanger/05b_separate_datasets.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@ def main():
4040
# Extract information to separate out datasets
4141
exp_improve_sample_ids = exp["improve_sample_id"].unique().to_list()
4242
exp_improve_drug_ids = exp["improve_drug_id"].unique().to_list()
43+
44+
#Ensure that the improve_sample_id column is in integer form.
45+
exp = exp.with_column(pl.col("improve_sample_id").cast(pl.Float64).cast(pl.Int64))
4346

4447
# Write Filtered Experiments File to TSV. Then delete it from memory.
4548
exp_filename_out = f"/tmp/{dataset}_experiments.tsv".lower()

build/cptac/getCptacData.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ def buildTumorSampleTable(sample_names, cancer_type, samples, maxval):
129129
samples = samples.reset_index(drop=True)
130130
return samples, maxval
131131

132-
def formatMutData(df, dtype, ctype, samp_names, source, samples):
132+
def formatMutData(df, dtype, ctype, samp_names, source, genes, samples):
133133
'''
134134
Formats mutational data.
135135
'''
@@ -159,6 +159,10 @@ def formatMutData(df, dtype, ctype, samp_names, source, samples):
159159
'Mutation': 'mutation'
160160
})
161161
blongdf = blongdf[['improve_sample_id', 'entrez_id', 'mutation', 'variant_classification', 'source', 'study']]
162+
163+
#Ensure that genes that don't map to genes_file are dropped.
164+
valid = set(genes['entrez_id'].astype(int))
165+
blongdf = blongdf[blongdf.entrez_id.isin(valid)]
162166
return blongdf
163167

164168

@@ -366,7 +370,7 @@ def main():
366370
df.dropna(how='all', axis=0, inplace=True)
367371
print(cancertype + ' ' + dtype)
368372
if dtype == 'somatic_mutation':
369-
fdf = formatMutData(df, 'mutation', cancertype, tumor_samps, all_sources[dtype], samples)
373+
fdf = formatMutData(df, 'mutation', cancertype, tumor_samps, all_sources[dtype], genes, samples)
370374
fdf = fdf.reset_index(drop=True)
371375
dtype_key = 'mutations'
372376
elif dtype == 'CNV':
@@ -393,6 +397,7 @@ def main():
393397
print(df.to_string())
394398
df['entrez_id'] = df['entrez_id'].fillna(0)
395399
df['entrez_id'] = df['entrez_id'].astype(int)
400+
df = df[df.entrez_id != 0]
396401
df.to_csv("/tmp/" + "cptac_" + dtype_key + '.csv.gz', sep=',', index=False, compression='gzip')
397402

398403
if __name__ == '__main__':

build/crcpdo/01-samples-crcpdo.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -118,13 +118,13 @@ def generate_sample_file(sequencing_data_path:str = None, prev_samples_path:str
118118
for index, row in samples_df.iterrows():
119119
if "Tumor-Organoid" in samples_df.loc[index, 'other_id']:
120120
samples_df.loc[index, 'common_name'] = samples_df.loc[index, 'common_name'] + "T-O"
121-
samples_df.loc[index, 'model_type'] = "organoid"
121+
samples_df.loc[index, 'model_type'] = "patient derived organoid"
122122
if "Tumor-Biopsy" in samples_df.loc[index, 'other_id']:
123123
samples_df.loc[index, 'common_name'] = samples_df.loc[index, 'common_name'] + "T-B"
124-
samples_df.loc[index, 'model_type'] = "ex vivo"
124+
samples_df.loc[index, 'model_type'] = "tumor"
125125
if "Normal-Organoid" in samples_df.loc[index, 'other_id']:
126126
samples_df.loc[index, 'common_name'] = samples_df.loc[index, 'common_name'] + "N-O"
127-
samples_df.loc[index, 'model_type'] = "organoid"
127+
samples_df.loc[index, 'model_type'] = "patient derived organoid"
128128
samples_df['other_id_source'] = "vandeWetering_2015"
129129
samples_df['cancer_type'] = "Colorectal Carcinoma"
130130
samples_df['species'] = "Homo sapiens (Human)"

build/hcmi/01-createHCMISamplesFile.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,19 +22,19 @@ def align_to_linkml_schema(input_df):
2222
-------
2323
pd.DataFrame
2424
A copy of the input DataFrame with the 'model_type' column values mapped to
25-
a set of predefined categories ('tumor', 'organoid', 'cell line').
25+
a set of predefined categories ('tumor', 'patient derived organoid', 'cell line').
2626
The mapping is designed to align the DataFrame with the LinkML schema requirements.
2727
"""
2828

2929
mapping_dict = {
3030
'Solid Tissue': 'tumor',
31-
'3D Organoid': 'organoid',
31+
'3D Organoid': 'patient derived organoid',
3232
'Peripheral Blood Components NOS': 'tumor',
3333
'Buffy Coat': np.nan,
3434
None: np.nan,
3535
'Peripheral Whole Blood': 'tumor',
3636
'Adherent Cell Line': 'cell line',
37-
'3D Neurosphere': 'organoid',
37+
'3D Neurosphere': 'patient derived organoid',
3838
'2D Modified Conditionally Reprogrammed Cells': 'cell line',
3939
'Pleural Effusion': np.nan,
4040
'Human Original Cells': 'cell line',
@@ -50,6 +50,9 @@ def align_to_linkml_schema(input_df):
5050
input_df.dropna(subset=['model_type'], inplace=True)
5151
input_df = input_df.sort_values(by='improve_sample_id')
5252

53+
# Treat a missing or empty cancer_type as normal tissue.
54+
input_df['cancer_type'] = input_df['cancer_type'].replace('', np.nan)
55+
input_df['cancer_type'] = input_df['cancer_type'].fillna('Normal Tissue')
5356
return input_df
5457

5558
def download_from_github(raw_url, save_path):

build/hcmi/02-getHCMIData.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -402,7 +402,7 @@ def map_and_combine(dataframe_list, data_type, metadata, entrez_map_file):
402402

403403
# Load mapping files using Polars
404404
genes = pl.read_csv(entrez_map_file) # Map gene_name to entrez_id
405-
405+
valid_entrez = genes["entrez_id"].cast(pl.Int64).unique().to_list()
406406
# Process each dataframe based on its data_type
407407
while dataframe_list:
408408
df = dataframe_list.pop()
@@ -428,8 +428,16 @@ def map_and_combine(dataframe_list, data_type, metadata, entrez_map_file):
428428
mapped_df = mapped_df.select(['entrez_id', 'mutation', 'Variant_Classification', 'file_id'])
429429
mapped_df = mapped_df.with_columns([pl.lit('GDC').alias('source'),
430430
pl.lit('HCMI').alias('study')])
431-
mapped_df = mapped_df.with_columns(mapped_df["entrez_id"].cast(str))
432-
431+
mapped_df = mapped_df.with_columns([
432+
pl.col("entrez_id").cast(pl.Int64),
433+
pl.lit('GDC' ).alias('source'),
434+
pl.lit('HCMI').alias('study'),
435+
])
436+
#drop genes not in genes file.
437+
mapped_df = mapped_df.filter(
438+
(pl.col("entrez_id") != 0) &
439+
pl.col("entrez_id").is_in(valid_entrez)
440+
)
433441
final_dataframe = pl.concat([final_dataframe, mapped_df])
434442
del df, mapped_df
435443
gc.collect()

build/mpnst/00_sample_gen.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ sampTable<-manifest|>
5555

5656
##third, generate a sample for the MTs if they were generated
5757
pdxmt<-subset(sampTable,!is.na(MicroTissueDrugFolder))
58-
pdxmt$model_type=rep('organoid',nrow(pdxmt))
58+
pdxmt$model_type=rep('xenograft derived organoid',nrow(pdxmt))
5959
print(pdxmt)
6060

6161
main<-rbind(sampTable,pdxmt)|>

0 commit comments

Comments
 (0)