Merge branch 'novartisPDX-samples' of https://github.com/PNNL-CompBio/coderdata into novartisPDX

alexandriai168 · alexandriai168 · commit 8cd287c21a68 · 2025-05-09T14:33:53.000-07:00
pulling from ruby's sample branch
diff --git a/build/novartispdx/01-samples-novartispdx.py b/build/novartispdx/01-samples-novartispdx.py
@@ -0,0 +1,59 @@
+import pandas as pd
+import synapseclient
+import numpy as np
+import argparse
+import os
+
+def get_complete_novartispdx_sample_sheet(synObject):
+
+    files = list(synObject.getChildren(parent='syn66275995', includeTypes=['file']))
+
+    synIDs = [item['id'] for item in files]
+    # leave off synIDs for drug info
+    synIDs.remove('syn66276102')
+    synIDs.remove('syn66276098')
+    synIDs.remove("syn66477971")
+    # create empty dataframe
+    allsamplesheet = pd.DataFrame()
+    # iterate through IDs and concatenate
+    for id in synIDs:
+        curr = synObject.get(id)
+        currdf = pd.read_csv(curr.path)
+        allsamplesheet = pd.concat([allsamplesheet, currdf], ignore_index=True)
+    # rename columns and reformat cancer type from CANCER_HISTOLOGY column
+    allsamplesheet['other_id'] = allsamplesheet['Sample ID']
+    allsamplesheet['common_name'] = allsamplesheet['MODEL_ORIGINATOR_ID']
+    allsamplesheet['cancer_type'] = allsamplesheet['CANCER_HISTOLOGY'].str.lower().str.split(pat="^[^\s]*\s", expand=True)[1]
+    allsamplesheet['species'] = "Homo Sapiens(human)"
+    allsamplesheet['model_type'] = 'patient derived xenograft'
+    allsamplesheet['other_id_source'] = 'Synapse'
+    allsamplesheet['other_names'] = ''
+    finalsamplesheet = allsamplesheet[['other_id', 'common_name', 'other_id_source', 'other_names', 'cancer_type', 'species', 'model_type']]
+    return finalsamplesheet
+
+if __name__ == "__main__":
+    
+    parser = argparse.ArgumentParser(description="This script handles downloading, processing and formatting of sample files for the Novartis PDX data into a single samplesheet")
+    
+    parser.add_argument('-t', '--token', type=str, help='Synapse Token')
+
+    parser.add_argument("-p", '--prevSamples', nargs="?", type=str, default ="", const  = "", help = "Use this to provide previous sample file, will run sample file generation")
+
+    args = parser.parse_args()
+   
+    print("Logging into Synapse")
+    PAT = args.token
+    synObject = synapseclient.login(authToken=PAT)
+
+    samplesheet = get_complete_novartispdx_sample_sheet(synObject)
+
+    if (args.prevSamples):
+        prev_max_improve_id = max(pd.read_csv(args.prevSamples).improve_sample_id)
+    else: 
+        prev_max_improve_id = 0
+
+    samplesheet['improve_sample_id'] = range(prev_max_improve_id+1, prev_max_improve_id+samplesheet.shape[0]+1) 
+
+    samplesheet.to_csv('/tmp/novartispdx_samples.csv', index=False)
+
+        
diff --git a/build/novartispdx/build_samples.sh b/build/novartispdx/build_samples.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+# Usage: build_samples.sh [previous_samples.csv] -- SYNAPSE_AUTH_TOKEN must be set in the environment.
+set -euo pipefail
+trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
+# ${1:-} keeps `set -u` from aborting when no previous sample file is given; quotes stop word splitting.
+echo "Running 01-samples-novartispdx.py with token and previous sample file ${1:-}"
+python3 01-samples-novartispdx.py --token "$SYNAPSE_AUTH_TOKEN" -p "${1:-}"