import pandas as pd
import requests
import os
import argparse
import numpy as np

def align_to_linkml_schema(input_df):
    """
    Maps the 'model_type' column of the input DataFrame to a set of predefined categories
    according to a specified mapping dictionary. This alignment is intended to ensure
    the DataFrame's 'model_type' values conform to a schema compatible with the LinkML model.

    Parameters
    ----------
    input_df : pd.DataFrame
        The input DataFrame containing a 'model_type' column with values to be mapped
        according to the predefined categories.

    Returns
    -------
    pd.DataFrame
        A copy of the input DataFrame with the 'model_type' column values mapped to
        a set of predefined categories ('tumor', 'organoid', 'cell line').
        The mapping is designed to align the DataFrame with the LinkML schema requirements.
    """

    mapping_dict = {
        'Solid Tissue': 'tumor',
        '3D Organoid': 'organoid',
        'Peripheral Blood Components NOS': 'tumor',
        'Buffy Coat': np.nan,
        None: np.nan,
        'Peripheral Whole Blood': 'tumor',
        'Adherent Cell Line': 'cell line',
        '3D Neurosphere': 'organoid',
        '2D Modified Conditionally Reprogrammed Cells': 'cell line',
        'Pleural Effusion': np.nan,
        'Human Original Cells': 'cell line',
        'Not Reported': np.nan,
        'Mixed Adherent Suspension': 'cell line',
        'Cell': 'cell line',
        'Saliva': np.nan
    }

    # Work on a copy so the caller's DataFrame is not modified in place.
    input_df = input_df.copy()

    # Apply mapping; all samples are assumed to be human.
    input_df['species'] = 'Homo sapiens (Human)'
    input_df['model_type'] = input_df['model_type'].map(mapping_dict)
    input_df.dropna(subset=['model_type'], inplace=True)
    input_df = input_df.sort_values(by='improve_sample_id')

    return input_df

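# Illustrative sketch of the mapping above (hypothetical two-row frame): a 'Solid Tissue'
# row becomes 'tumor', while a 'Saliva' row maps to NaN and is dropped, e.g.
#   align_to_linkml_schema(pd.DataFrame({'model_type': ['Solid Tissue', 'Saliva'],
#                                        'improve_sample_id': [1, 2]}))
# returns a single-row frame whose model_type is 'tumor'.
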
def download_from_github(raw_url, save_path):
    """
    Download a file from a raw GitHub URL and save it to a local path.

    Parameters
    ----------
    raw_url : string
        The raw GitHub URL to download the file from.

    save_path : string
        Local path where the downloaded file will be saved.

    Returns
    -------
    None
    """

    response = requests.get(raw_url)
    # Fail early if the download did not succeed rather than writing an error page to disk.
    response.raise_for_status()
    with open(save_path, 'wb') as f:
        f.write(response.content)

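# A hedged usage sketch, reusing the manifest URL that main() keeps commented out:
#   download_from_github(
#       "https://raw.githubusercontent.com/PNNL-CompBio/candleDataProcessing/hcmi_update/pancpdo/full_manifest.txt",
#       "full_manifest.txt")
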
def extract_uuids_from_manifest(manifest_data):
    """
    Extract UUIDs from the provided manifest data.

    Takes a manifest file generated from the GDC portal (or manually) and parses it, collecting UUIDs.

    Parameters
    ----------
    manifest_data : string
        File path to the manifest file.

    Returns
    -------
    list
        List of UUIDs.
    """
    with open(manifest_data, 'r') as f:
        lines = f.readlines()[1:]  # Skip header
        return [line.split("\t")[0] for line in lines]

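# The manifest is assumed to be the tab-separated file exported from the GDC portal, with a
# header row and the file UUID in the first column, roughly:
#
#   id          filename        md5       size    state
#   0001ae06-.. something.tsv   <md5sum>  12345   released
#
# Only the first column is used above; the remaining columns are ignored.
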
def fetch_metadata_for_samples(uuids):
    """
    Fetch metadata for given UUIDs.

    This function makes a POST request to the GDC API files endpoint to fetch relevant metadata for the provided UUIDs.

    Parameters
    ----------
    uuids : list
        List of UUIDs.

    Returns
    -------
    dict
        JSON response data from the GDC API.
    """

    endpoint = "https://api.gdc.cancer.gov/files"

    filters_content = {
        "field": "files.file_id",
        "value": uuids
    }

    payload = {
        "filters": {
            "op": "in",
            "content": filters_content
        },
        "fields": (
            "cases.sample_ids,"
            "cases.case_id,"
            "cases.submitter_id,"
            "cases.annotations.case_submitter_id,"
            "cases.samples.sample_id,"
            "cases.samples.portions.analytes.aliquots.aliquot_id,"
            "cases.samples.sample_type,"
            "cases.diagnoses.submitter_id,"
            "cases.diagnoses.diagnosis_id,"
            "cases.diagnoses.classification_of_tumor,"
            "cases.diagnoses.tissue_or_organ_of_origin,"
            "cases.diagnoses.primary_diagnosis,"
            "cases.diagnoses.treatments.treatment_id,"  # retrieved but ignored for now
            "cases.diagnoses.treatments.submitter_id,"  # retrieved but ignored for now
            "cases.samples.tumor_descriptor,"
            "cases.samples.composition"
        ),
        "format": "JSON",
        "size": str(len(uuids))
    }

    response = requests.post(endpoint, json=payload)
    return response.json()

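# Shape of the JSON consumed below (abridged; inferred from the fields requested above):
#   data -> hits[] -> {id, cases[]}
#   cases[]     -> {case_id, submitter_id, samples[], diagnoses[]}
#   samples[]   -> {sample_id, sample_type, composition, portions[] -> analytes[] -> aliquots[] -> {aliquot_id}}
#   diagnoses[] -> {submitter_id, tissue_or_organ_of_origin, primary_diagnosis, classification_of_tumor}
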
def extract_data(data):
    """
    Convert the JSON data returned by the GDC API into a pandas DataFrame.

    Parameters
    ----------
    data : dict
        JSON data from the GDC portal.

    Returns
    -------
    pd.DataFrame
    """
    extracted = []
    for hit in data['data']['hits']:
        for case in hit['cases']:
            for idx, sample in enumerate(case['samples']):
                for portion in sample['portions']:
                    for analyte in portion['analytes']:
                        for aliquot in analyte['aliquots']:
                            # Pair each sample with the diagnosis at the same index, when one exists.
                            if idx < len(case['diagnoses']):
                                diagnosis = case['diagnoses'][idx]
                                extracted.append({
                                    'entry_id': hit['id'],
                                    'case_uuid': case['case_id'],
                                    'case_id': case['submitter_id'],
                                    'tissue_or_organ_of_origin': diagnosis['tissue_or_organ_of_origin'],
                                    'primary_diagnosis': diagnosis['primary_diagnosis'],
                                    'diagnosis_id': diagnosis['submitter_id'],
                                    'tumor_classification': diagnosis['classification_of_tumor'],
                                    'sample_id': sample['sample_id'],
                                    'sample_type': sample['sample_type'],
                                    #'tumor_descriptor': sample.get('tumor_descriptor', None),
                                    'composition': sample.get('composition', None),
                                    'id': aliquot['aliquot_id']
                                })
    return pd.DataFrame(extracted)

def filter_and_subset_data(df, maxval, mapfile):
    """
    Filter and subset the data, then assign improve_sample_id at the end.

    Parameters
    ----------
    df : pd.DataFrame
        A tidied pandas DataFrame containing the full samples table.
    maxval : int
        The maximum value of improve_sample_id from previous samples, used to continue numbering.
    mapfile : str
        File path to the mapping file that maps primary diagnosis and tissue of origin to common cancer types.

    Returns
    -------
    pd.DataFrame
        The processed DataFrame ready for further use.
    """
    # Remove duplicates based on all columns except 'id'
    duplicates_mask = df.drop('id', axis=1).duplicated(keep='first')
    cmap = pd.read_csv(mapfile, encoding='ISO-8859-1')
    filt = df[~duplicates_mask]
    filt = filt.drop_duplicates()

    # Merge with the cancer type mapping file
    filt = pd.merge(
        filt,
        cmap,
        right_on=['tissue_or_organ_of_origin', 'primary_diagnosis'],
        left_on=['tissue_or_organ_of_origin', 'primary_diagnosis'],
        how='left'
    )

    # Rename columns to match the schema
    filt = filt.rename(
        columns={
            "composition": "model_type",
            "case_id": "common_name",
            "id": "other_names"
        }
    )

    # Melt the dataframe to create 'other_id' and 'other_id_source'
    longtab = pd.melt(
        filt,
        id_vars=['common_name', 'other_names', 'model_type', 'cancer_type'],
        value_vars=['diagnosis_id', 'tumor_classification', 'sample_type']
    )
    longtab = longtab.rename(columns={'variable': 'other_id_source', 'value': 'other_id'}).drop_duplicates()

    # Handle missing 'other_names'
    missing_other_names = longtab[longtab['other_names'].isnull()]
    if not missing_other_names.empty:
        print("Warning: Some samples have missing 'other_names' (aliquot_id). These samples will be excluded.")
        print(missing_other_names)
        longtab = longtab.dropna(subset=['other_names'])

    # Convert 'other_names' to string to ensure consistency
    longtab['other_names'] = longtab['other_names'].astype(str)

    # Reassign 'improve_sample_id's at the end
    unique_other_names = longtab['other_names'].unique()
    print("Number of unique 'other_names' after filtering:", len(unique_other_names))

    # Create a new mapping
    mapping = pd.DataFrame({
        'other_names': unique_other_names,
        'improve_sample_id': range(int(maxval) + 1, int(maxval) + len(unique_other_names) + 1)
    })

    # Merge the mapping back into 'longtab'
    longtab = pd.merge(longtab, mapping, on='other_names', how='left')

    # Debugging: Check longtab after reassigning IDs
    print("\nlongtab columns after reassigning 'improve_sample_id':", longtab.columns)
    print("longtab head after reassigning IDs:")
    print(longtab.head())

    # Verify that all 'improve_sample_id's are assigned
    missing_ids = longtab[longtab['improve_sample_id'].isnull()]
    if not missing_ids.empty:
        print("\nWarning: Some samples could not be assigned an 'improve_sample_id'.")
        print(missing_ids)
    return longtab

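# After the melt step above, the returned table is in long format, one row per
# (sample, identifier type), with columns roughly
#   common_name | other_names | model_type | cancer_type | other_id_source | other_id | improve_sample_id
# where other_id_source is one of 'diagnosis_id', 'tumor_classification', or 'sample_type'.
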
def main():
    """
    Retrieve and process PANCPDO (Human Cancer Models Initiative) samples metadata from GDC (Genomic Data Commons),
    and create a samples.csv file for the schema.

    This function automates the workflow of:
    1. Downloading a manifest file from the GitHub repository.
    2. Extracting UUIDs (Universally Unique Identifiers) from the manifest.
    3. Fetching the metadata for the samples corresponding to the UUIDs from the GDC API via a POST request.
    4. Structuring the fetched metadata into a pandas DataFrame.
    5. Filtering and subsetting the DataFrame to align with the schema.
    6. Writing the processed DataFrame to a CSV file.

    Notes
    -----
    The GDC API is publicly accessible, so no authentication is required.

    To Run
    ------
    python createPANCPDOSamplesFile.py
    python createPANCPDOSamplesFile.py --prevSamples <previous_samples.csv> --mapfile pancpdo_cancer_types.csv

    Output
    ------
    A local CSV file named '/tmp/pancpdo_samples.csv' containing the processed metadata.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--prevSamples', dest='prev_samps', nargs='?', type=str, default='', const='',
                        help='Previous sample file')
    parser.add_argument('--mapfile', dest='map', default='pancpdo_cancer_types.csv',
                        help='Mapping to common_cancer from primary_diagnosis and tissue_or_organ_of_origin')

    args = parser.parse_args()
    manifest_path = "full_manifest.txt"
    #manifest_url = "https://raw.githubusercontent.com/PNNL-CompBio/candleDataProcessing/hcmi_update/pancpdo/full_manifest.txt"
    #download_from_github(manifest_url, manifest_path)
    uuids = extract_uuids_from_manifest(manifest_path)
    metadata = fetch_metadata_for_samples(uuids)
    df = extract_data(metadata)

    if args.prev_samps is None or args.prev_samps == '':
        print("No previous samples file was found. PANCPDO data will not align with other datasets. Use ONLY for testing purposes.")
        maxval = 0
    else:
        print("Previous samples file provided. Running PANCPDO sample file generation.")
        maxval = max(pd.read_csv(args.prev_samps).improve_sample_id)

    output = filter_and_subset_data(df, maxval, args.map)
    aligned = align_to_linkml_schema(output)
    print(aligned)
    aligned.to_csv("/tmp/pancpdo_samples.csv", index=False)


if __name__ == '__main__':
    main()