|
3 | 3 | import os |
4 | 4 | import math |
5 | 5 | import argparse |
| 6 | +import synapseclient |
6 | 7 |
|
7 | 8 |
|
8 | 9 | def get_copy_call(a): |
@@ -194,6 +195,65 @@ def map_transcriptomics_novPDX(transcriptomics_data, improve_id_data, entrez_dat |
194 | 195 | return(sample_entrez_transcriptomics_df) |
195 | 196 |
|
196 | 197 |
|
| 198 | + |
def map_mutations_novPDX(mutation_data, improve_id_data, entrez_data):
    """
    Maps mutations data to improve sample id's and formats it for output.

    Parameters
    ----------
    mutation_data : pd.DataFrame OR string
        Pandas dataframe object with mutations data OR path to csv with mutations data.
        Must contain the columns Sample, Gene, Entrez, Category and Details.

    improve_id_data : pd.DataFrame OR string
        Pandas dataframe object with improve id data OR path to csv with improve id data.
        Must contain the columns common_name and improve_sample_id.
        A dataframe passed in here is not modified.

    entrez_data : pd.DataFrame OR string
        Pandas dataframe object with entrez gene data OR path to csv with entrez gene data.
        It is loaded for interface compatibility but is not otherwise used by the mapping:
        rows are dropped based on a missing Entrez value in mutation_data only.

    Returns
    -------
    mutations_final : pd.DataFrame
        A DataFrame containing the mapped mutations data with columns: entrez_id, mutation,
        variant_classification, improve_sample_id, source, study (plus any extra columns
        that were present in mutation_data).

    """
    # read in data
    if not isinstance(mutation_data, pd.DataFrame):
        mutation_data = pd.read_csv(mutation_data)

    if not isinstance(improve_id_data, pd.DataFrame):
        improve_id_data = pd.read_csv(improve_id_data)

    if not isinstance(entrez_data, pd.DataFrame):
        entrez_data = pd.read_csv(entrez_data)

    # include only rows that are mutations (data had both cn and mutations).
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the caller's data.
    mutation_categories = ["MutNovel", "MutKnownFunctional", "MutLikelyFunctional"]
    is_mutation = mutation_data['Category'].isin(mutation_categories)
    mutations_only_df = mutation_data[is_mutation].copy()

    # turn details column into mutation column (text before the first comma).
    # .str[0] instead of expand=True + iloc so an empty selection does not raise.
    mutations_only_df['mutation'] = mutations_only_df['Details'].str.split(",").str[0]

    # create variant classifications with information that we have.
    # Start from the fallback label and overwrite; the missense rule is applied
    # last so it wins when both patterns match. na=False keeps the masks boolean
    # when a mutation value is missing.
    mutations_only_df['variant_classification'] = "Undetermined"
    has_dash = mutations_only_df['mutation'].str.contains("-", regex=False, na=False)
    mutations_only_df.loc[has_dash, 'variant_classification'] = "Frame_Shift_Del"
    is_missense = mutations_only_df['mutation'].str.contains(
        r'[A-Za-z]\d+[A-Za-z]$', regex=True, na=False
    )
    mutations_only_df.loc[is_missense, 'variant_classification'] = "Missense_Mutation"

    # rows without an entrez id cannot be mapped to a gene, so get rid of those rows
    mutations_only_df = mutations_only_df[mutations_only_df['Entrez'].notna()]

    # merge improve sample names; sample names in the mutation data lack the
    # "NIBR" text found in common_name. assign() builds the merge key on a new
    # frame so the caller's improve_id_data is left untouched.
    id_lookup = improve_id_data.assign(
        to_merge=improve_id_data['common_name'].str.replace("NIBR", "", regex=False)
    )[['to_merge', 'improve_sample_id']]
    mutations_final = pd.merge(
        mutations_only_df,
        id_lookup,
        how='inner',
        left_on='Sample',
        right_on='to_merge',
    )

    # clean up column names and data types
    mutations_final = mutations_final.rename(columns={'Entrez': 'entrez_id'})
    mutations_final = mutations_final.drop(columns=['Sample', 'Gene', 'Category', 'Details', 'to_merge'])
    mutations_final['source'] = "CPDM"
    mutations_final['study'] = "novartispdx"
    mutations_final = mutations_final.astype({'entrez_id': 'int'})

    return mutations_final
| 256 | + |
197 | 257 | if __name__ == "__main__": |
198 | 258 | print('in main') |
199 | 259 | parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of omics data files for the Bladder PDO project") |
@@ -243,7 +303,7 @@ def map_transcriptomics_novPDX(transcriptomics_data, improve_id_data, entrez_dat |
243 | 303 | exit() |
244 | 304 | else: |
245 | 305 | print("Starting mutations data.") |
246 | | - mutation_df = map_mutations(mutation_data = "/tmp/mutation_data.csv", improve_id_data = "/tmp/novartispdx_samples.csv", entrez_data = "/tmp/genes.csv") |
| 306 | + mutation_df = map_mutations_novPDX(mutation_data = "/tmp/mutation_data.csv", improve_id_data = "/tmp/novartispdx_samples.csv", entrez_data = "/tmp/genes.csv") |
247 | 307 | mutation_df.to_csv("/tmp/crcpdo_mutations.csv", index=False) |
248 | 308 |
|
249 | 309 | if args.copy_number: |
|
0 commit comments