Merge pull request #410 from PNNL-CompBio/multi_bug_fix

sgosline · web-flow · commit b3ed31fb9f07 · 2025-07-07T10:15:44.000-07:00
Multiple fixes for data inconsistencies across all datasets.
diff --git a/build/beatAML/GetBeatAML.py b/build/beatAML/GetBeatAML.py
@@ -665,6 +665,7 @@ def generate_drug_list(drug_map_path,drug_path):
             print(improve_map_file)
             t_df = map_and_combine(t_df, "transcriptomics", args.genes, improve_map_file, sample_mapping_file)
             t_df = t_df[t_df.entrez_id.notna()]
+            t_df = t_df[t_df.entrez_id != 0]
             t_df = t_df[["improve_sample_id","transcriptomics","entrez_id","source","study"]].drop_duplicates()
             t_df.to_csv("/tmp/beataml_transcriptomics.csv.gz",index=False,compression='gzip')
 
@@ -676,14 +677,15 @@ def generate_drug_list(drug_map_path,drug_path):
             p_df = pd.melt(p_df, id_vars=['Protein'], var_name='id', value_name='proteomics')
             p_df = map_and_combine(p_df, "proteomics", args.genes, improve_map_file, proteomics_map)
             p_df = p_df[["improve_sample_id","proteomics","entrez_id","source","study"]]
+            p_df = p_df[p_df.entrez_id != 0]
             p_df.to_csv("/tmp/beataml_proteomics.csv.gz",index=False,compression='gzip')
         
             # New Mutation Data
             print("Starting Mutation Data")
             m_df = pd.read_csv(mutations_file, sep = '\t')
-            
             m_df = map_and_combine(m_df, "mutations", args.genes,improve_map_file, mutation_map_file)
             m_df = m_df[["improve_sample_id","mutation", "entrez_id","variant_classification","source","study"]]
+            m_df = m_df[m_df.entrez_id != 0]
             m_df.to_csv("/tmp/beataml_mutations.csv.gz",index=False,compression='gzip')
         
     if args.exp:
diff --git a/build/bladderpdo/00_createBladderPDOSampleFile.py b/build/bladderpdo/00_createBladderPDOSampleFile.py
@@ -31,7 +31,7 @@ def _parse_model_type(sample_id):
     if "_xenoorganoid" in low:
         return "xenograft derived organoid"
     if "_organoid" in low:
-        return "organoid"
+        return "patient derived organoid"
     if "_xenograft" in low:
         return "patient derived xenograft"
     if "_parental" in low:
diff --git a/build/bladderpdo/01_createBladderPDOOmicsFiles.py b/build/bladderpdo/01_createBladderPDOOmicsFiles.py
@@ -104,8 +104,11 @@ def get_bladder_pdo_mutations(synObject, samples, genes):
     final_mutations = merged_mutations_renamed[['entrez_id', "mutation", "variant_classification", "improve_sample_id"]]
     final_mutations['study'] = "Lee etal 2018 Bladder PDOs"
     final_mutations = final_mutations.dropna(subset=["entrez_id"])
-    final_mutations["improve_sample_id"] = final_mutations["improve_sample_id"].astype(int)
-    final_mutations["entrez_id"]         = final_mutations["entrez_id"].astype(int)
+    final_mutations["improve_sample_id"] = final_mutations["improve_sample_id"].astype(int)    
+    # Drop rows whose entrez_id is missing (NaN) or equal to zero.
+    final_mutations = final_mutations.dropna(subset=["entrez_id"])
+    final_mutations["entrez_id"] = final_mutations["entrez_id"].astype(int)
+    final_mutations = final_mutations[final_mutations["entrez_id"] != 0]
     return final_mutations
 
 def get_bladder_pdo_copynumber(synObject, samples, genes):
@@ -124,7 +127,12 @@ def get_bladder_pdo_copynumber(synObject, samples, genes):
     final_copynumber['study'] = "Lee etal 2018 Bladder PDOs"
     final_copynumber = final_copynumber.dropna(subset=["entrez_id"])
     final_copynumber["improve_sample_id"] = final_copynumber["improve_sample_id"].astype(int)
-    final_copynumber["entrez_id"]         = final_copynumber["entrez_id"].astype(int)
+    # Keep only rows whose entrez_id appears in the supplied genes table.
+    valid_entrez = set(genes['entrez_id'].astype(int))
+    final_copynumber = final_copynumber[
+        final_copynumber['entrez_id'].isin(valid_entrez)
+    ]
+    final_copynumber["entrez_id"] = final_copynumber["entrez_id"].astype(int)
     return final_copynumber
 
 
diff --git a/build/broad_sanger/02-broadSangerOmics.R b/build/broad_sanger/02-broadSangerOmics.R
@@ -405,6 +405,7 @@ depmap_files<-function(fi,value){
 
         res<-exp_file|>
           mutate(entrez_id=as.numeric(EntrezGeneID))|>
+            filter(entrez_id %in% genes$entrez_id) |>
             left_join(as.data.frame(depmap_vtab))
 
               ##now many variants are missing???
@@ -442,7 +443,8 @@ depmap_files<-function(fi,value){
         print("wide to long")
         res = tidyr::pivot_longer(data=exp_file,cols=c(2:ncol(exp_file)),
                                   names_to='gene_entrez',values_to='transcriptomics',
-                                  values_transform=list(expression=as.numeric))
+                                  values_transform=list(transcriptomics=as.numeric))|>
+                                  dplyr::mutate(transcriptomics = 2^transcriptomics - 1)
         colnames(res)[1]<-'other_id'
 
         print('fixing gene names')
diff --git a/build/broad_sanger/05b_separate_datasets.py b/build/broad_sanger/05b_separate_datasets.py
@@ -40,6 +40,9 @@ def main():
         # Extract information to separate out datasets
         exp_improve_sample_ids = exp["improve_sample_id"].unique().to_list()
         exp_improve_drug_ids = exp["improve_drug_id"].unique().to_list()
+        
+        #Ensure that the improve_sample_id column is in integer form.
+        exp = exp.with_column(pl.col("improve_sample_id").cast(pl.Float64).cast(pl.Int64))
 
         # Write Filtered Experiments File to TSV. Then delete it from memory.
         exp_filename_out = f"/tmp/{dataset}_experiments.tsv".lower()
diff --git a/build/cptac/getCptacData.py b/build/cptac/getCptacData.py
@@ -129,7 +129,7 @@ def buildTumorSampleTable(sample_names, cancer_type, samples, maxval):
     samples = samples.reset_index(drop=True)
     return samples, maxval
 
-def formatMutData(df, dtype, ctype, samp_names, source, samples):
+def formatMutData(df, dtype, ctype, samp_names, source, genes, samples):
     '''
     Formats mutational data.
     '''
@@ -159,6 +159,10 @@ def formatMutData(df, dtype, ctype, samp_names, source, samples):
         'Mutation': 'mutation'
     })
     blongdf = blongdf[['improve_sample_id', 'entrez_id', 'mutation', 'variant_classification', 'source', 'study']]
+
+    # Drop rows whose entrez_id is not present in the supplied genes table.
+    valid = set(genes['entrez_id'].astype(int))
+    blongdf = blongdf[blongdf.entrez_id.isin(valid)]
     return blongdf
 
 
@@ -366,7 +370,7 @@ def main():
                 df.dropna(how='all', axis=0, inplace=True)
                 print(cancertype + ' ' + dtype)
                 if dtype == 'somatic_mutation':
-                    fdf = formatMutData(df, 'mutation', cancertype, tumor_samps, all_sources[dtype], samples)
+                    fdf = formatMutData(df, 'mutation', cancertype, tumor_samps, all_sources[dtype], genes, samples)
                     fdf = fdf.reset_index(drop=True)
                     dtype_key = 'mutations'
                 elif dtype == 'CNV':
@@ -393,6 +397,7 @@ def main():
             print(df.to_string())
             df['entrez_id'] = df['entrez_id'].fillna(0)
             df['entrez_id'] = df['entrez_id'].astype(int)
+            df = df[df.entrez_id != 0]
             df.to_csv("/tmp/" + "cptac_" + dtype_key + '.csv.gz', sep=',', index=False, compression='gzip')
 
 if __name__ == '__main__':
diff --git a/build/crcpdo/01-samples-crcpdo.py b/build/crcpdo/01-samples-crcpdo.py
@@ -118,13 +118,13 @@ def generate_sample_file(sequencing_data_path:str = None, prev_samples_path:str
     for index, row in samples_df.iterrows():
         if "Tumor-Organoid" in samples_df.loc[index, 'other_id']:
             samples_df.loc[index, 'common_name'] = samples_df.loc[index, 'common_name'] + "T-O"
-            samples_df.loc[index, 'model_type'] = "organoid"
+            samples_df.loc[index, 'model_type'] = "patient derived organoid"
         if "Tumor-Biopsy" in samples_df.loc[index, 'other_id']:
             samples_df.loc[index, 'common_name'] = samples_df.loc[index, 'common_name'] + "T-B"
-            samples_df.loc[index, 'model_type'] = "ex vivo"
+            samples_df.loc[index, 'model_type'] = "tumor"
         if "Normal-Organoid" in samples_df.loc[index, 'other_id']:
             samples_df.loc[index, 'common_name'] = samples_df.loc[index, 'common_name'] + "N-O"
-            samples_df.loc[index, 'model_type'] = "organoid"
+            samples_df.loc[index, 'model_type'] = "patient derived organoid"
     samples_df['other_id_source'] = "vandeWetering_2015"
     samples_df['cancer_type'] = "Colorectal Carcinoma"
     samples_df['species'] = "Homo sapiens (Human)"
diff --git a/build/hcmi/01-createHCMISamplesFile.py b/build/hcmi/01-createHCMISamplesFile.py
@@ -22,19 +22,19 @@ def align_to_linkml_schema(input_df):
     -------
     pd.DataFrame
         A copy of the input DataFrame with the 'model_type' column values mapped to 
-        a set of predefined categories ('tumor', 'organoid', 'cell line'). 
+        a set of predefined categories ('tumor', 'patient derived organoid', 'cell line'). 
         The mapping is designed to align the DataFrame with the LinkML schema requirements.
     """
     
     mapping_dict = {
     'Solid Tissue': 'tumor',
-    '3D Organoid': 'organoid',
+    '3D Organoid': 'patient derived organoid',
     'Peripheral Blood Components NOS': 'tumor',
     'Buffy Coat': np.nan,
      None: np.nan,
     'Peripheral Whole Blood': 'tumor',
     'Adherent Cell Line': 'cell line',
-    '3D Neurosphere': 'organoid',
+    '3D Neurosphere': 'patient derived organoid',
     '2D Modified Conditionally Reprogrammed Cells': 'cell line',
     'Pleural Effusion': np.nan,
     'Human Original Cells': 'cell line',
@@ -50,6 +50,9 @@ def align_to_linkml_schema(input_df):
     input_df.dropna(subset=['model_type'], inplace=True)
     input_df = input_df.sort_values(by='improve_sample_id')
     
+    # Treat an empty or missing cancer_type as 'Normal Tissue'.
+    input_df['cancer_type'] = input_df['cancer_type'].replace('', np.nan)
+    input_df['cancer_type'] = input_df['cancer_type'].fillna('Normal Tissue')
     return input_df
 
 def download_from_github(raw_url, save_path):
diff --git a/build/hcmi/02-getHCMIData.py b/build/hcmi/02-getHCMIData.py
@@ -402,7 +402,7 @@ def map_and_combine(dataframe_list, data_type, metadata, entrez_map_file):
 
     # Load mapping files using Polars
     genes = pl.read_csv(entrez_map_file)  # Map gene_name to entrez_id
-
+    valid_entrez = genes["entrez_id"].cast(pl.Int64).unique().to_list()
     # Process each dataframe based on its data_type
     while dataframe_list:
         df = dataframe_list.pop()
@@ -428,8 +428,16 @@ def map_and_combine(dataframe_list, data_type, metadata, entrez_map_file):
             mapped_df = mapped_df.select(['entrez_id', 'mutation', 'Variant_Classification', 'file_id'])
             mapped_df = mapped_df.with_columns([pl.lit('GDC').alias('source'),
                                                pl.lit('HCMI').alias('study')])
-            mapped_df = mapped_df.with_columns(mapped_df["entrez_id"].cast(str))
-
+            mapped_df = mapped_df.with_columns([
+                pl.col("entrez_id").cast(pl.Int64),
+                pl.lit('GDC' ).alias('source'),
+                pl.lit('HCMI').alias('study'),
+            ])
+            # Drop rows whose entrez_id is 0 or absent from the gene mapping file.
+            mapped_df = mapped_df.filter(
+                (pl.col("entrez_id") != 0) &
+                pl.col("entrez_id").is_in(valid_entrez)
+            )
         final_dataframe = pl.concat([final_dataframe, mapped_df])
         del df, mapped_df
         gc.collect()
diff --git a/build/mpnst/00_sample_gen.R b/build/mpnst/00_sample_gen.R
@@ -55,7 +55,7 @@ sampTable<-manifest|>
 
 ##third, generate a sample for the MTs if they were generated
 pdxmt<-subset(sampTable,!is.na(MicroTissueDrugFolder))
-pdxmt$model_type=rep('organoid',nrow(pdxmt))
+pdxmt$model_type=rep('xenograft derived organoid',nrow(pdxmt))
 print(pdxmt)
 
 main<-rbind(sampTable,pdxmt)|>
diff --git a/build/mpnst/01_mpnst_get_omics.R b/build/mpnst/01_mpnst_get_omics.R
@@ -34,7 +34,7 @@ samples_df <- fread(patients)|>
 
 pdx_samps<-subset(samples_df,model_type=='patient derived xenograft')
 tumor_samps<-subset(samples_df,model_type=='tumor')
-mt_samps<-subset(samples_df,model_type=='organoid')
+mt_samps<-subset(samples_df,model_type=='xenograft derived organoid')
 
 ##now get the manifest from synapse
 manifest<-synapser::synTableQuery("select * from syn53503360")$asDataFrame()|>
diff --git a/build/pancpdo/01-createPancPDOSamplesFile.py b/build/pancpdo/01-createPancPDOSamplesFile.py
@@ -50,13 +50,13 @@ def align_to_linkml_schema(input_df):
     
     mapping_dict = {
     'Solid Tissue': 'tumor',
-    '3D Organoid': 'organoid',
+    '3D Organoid': 'patient derived organoid',
     'Peripheral Blood Components NOS': 'tumor',
     'Buffy Coat': np.nan,
      None: np.nan,
     'Peripheral Whole Blood': 'tumor',
     'Adherent Cell Line': 'cell line',
-    '3D Neurosphere': 'organoid',
+    '3D Neurosphere': 'patient derived organoid',
     '2D Modified Conditionally Reprogrammed Cells': 'cell line',
     'Pleural Effusion': np.nan,
     'Human Original Cells': 'cell line',
@@ -301,6 +301,10 @@ def filter_and_subset_data(df, maxval, mapfile):
     if not missing_ids.empty:
         print("\nWarning: Some samples could not be assigned an 'improve_sample_id'.")
         print(missing_ids)
+        
+    # Missing cancer type indicates that it is normal tissue.
+    longtab['cancer_type'] = longtab['cancer_type'].replace('', np.nan)
+    longtab['cancer_type'] = longtab['cancer_type'].fillna('Normal Tissue')
     return longtab
 
 def main():
diff --git a/build/sarcpdo/00_createSarcPDOSampleFile.py b/build/sarcpdo/00_createSarcPDOSampleFile.py
@@ -86,10 +86,12 @@ def download_and_format_rna_samples(synLoginObject):
     rna_samples['model_type'] = modeltypeDF[0]
     # add rows by hand for SARC0139_1 that are missing from sample sheet but present in rnaseq data
     addrow1 = {'other_id' : 'SARC0139_1_Tumor', 'common_name':'SARC0139_1', 'other_id_source' : 'Synapse', 'other_names':'', "cancer_type" : "Leiomyosarcoma", 'species':"Homo sapiens(Human)", 'model_type':'tumor'}
-    addrow2 = {'other_id' : 'SARC0139_1_Organoid', 'common_name':'SARC0139_1', 'other_id_source' : 'Synapse', 'other_names':'', "cancer_type" : "Leiomyosarcoma", 'species':"Homo sapiens(Human)", 'model_type':'organoid'}
+    addrow2 = {'other_id' : 'SARC0139_1_Organoid', 'common_name':'SARC0139_1', 'other_id_source' : 'Synapse', 'other_names':'', "cancer_type" : "Leiomyosarcoma", 'species':"Homo sapiens(Human)", 'model_type':'patient derived organoid'}
     rna_samples.loc[len(rna_samples)] = addrow1
     rna_samples.loc[len(rna_samples)] = addrow2
-
+    
+    rna_samples.loc[rna_samples['model_type'] == 'organoid', 'model_type'] = 'patient derived organoid'
+    
     return rna_samples
 
     
diff --git a/build/sarcpdo/01_createSarcPDOOmicsFiles.py b/build/sarcpdo/01_createSarcPDOOmicsFiles.py
@@ -40,6 +40,8 @@ def download_and_format_transcriptomic(synLoginObject, genesTable, samplesTable)
     final = melted_joined_renamed[['entrez_id', 'improve_sample_id', 'transcriptomics', 'source', 'study']]
     #dropduplicates (see a few lines above - should be down here)
     final = final.drop_duplicates()
+    # make sure entrez id is in int format.
+    final['entrez_id'] = final['entrez_id'].astype(int)
     return final
 
 def download_and_format_genomic_mutation(synLoginObject, genesTable, samplesTable):
@@ -79,6 +81,9 @@ def download_and_format_genomic_mutation(synLoginObject, genesTable, samplesTabl
     mutationData =mutationData.rename({"Name": "mutation"}, axis=1)
     # drop duplicates
     mutationData = mutationData.drop_duplicates()
+    # make sure entrez_id is in integer format
+    mutationData['entrez_id'] = mutationData['entrez_id'].astype(int)
+
     return mutationData