Skip to content

Commit 982395e

Browse files
committed
Added ability to pull pancpdo samples from GDC
Opening branch as part of fix for #106. At the moment it is still missing drug/experimental data.
1 parent d0dc5ed commit 982395e

22 files changed

Lines changed: 2296 additions & 3 deletions

File tree

build/docker/Dockerfile.pancpdo

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
FROM python:3.9

WORKDIR /usr/src/app

# Copy the pancpdo pipeline scripts, the GDC manifest, shell wrappers,
# and the cancer-type mapping file into the image.
COPY build/pancpdo/01-createPancPDOSamplesFile.py .
COPY build/pancpdo/02-getPancPDOData.py .
COPY build/pancpdo/full_manifest.txt .
COPY build/pancpdo/requirements.txt .
COPY build/pancpdo/*sh ./
COPY build/pancpdo/pancpdo_cancer_types.csv ./


# Set MPLCONFIGDIR to a writable directory
ENV MPLCONFIGDIR=/app/tmp/matplotlib
RUN mkdir -p /app/tmp/matplotlib

RUN pip install --no-cache-dir -r requirements.txt
18+

build/docker/docker-compose.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,14 @@ services:
2626
HTTPS_PROXY: ${HTTPS_PROXY}
2727
platform: linux/amd64
2828
image: hcmi:latest
29+
  # Builder image for the pancpdo sample/data extraction scripts;
  # mirrors the structure of the hcmi service above.
  pancpdo:
    build:
      context: ../../
      dockerfile: build/docker/Dockerfile.pancpdo
      args:
        HTTPS_PROXY: ${HTTPS_PROXY}
    platform: linux/amd64
    image: pancpdo:latest
2937

3038
beataml:
3139
build:

build/hcmi/02-getHCMIData.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,19 @@ def download_tool(url):
3636
filename = wget.download(url)
3737
files_before = os.listdir()
3838
# shutil.unpack_archive(filename)
39-
39+
##there are two files to unpack
40+
print('Unpacking platform-specific path')
41+
shutil.unpack_archive(os.path.basename(url))
42+
#This is just set for AWS to debug. This will have to be mapped to OS. They changed their file structure. This should be updated.
43+
print('Unpacking secondary zip')
44+
fnames={
45+
'Darwin':"gdc-client_2.3_OSX_x64.zip",
46+
'Linux':"gdc-client_2.3_Ubuntu_x64.zip",
47+
'Windows':"gdc-client_2.3_Windows_x64.zip"
48+
}
49+
shutil.unpack_archive(fnames[platform.system()])
4050
#This is just set for AWS to debug. This will have to be mapped to OS. They changed their file structure. This should be updated.
41-
shutil.unpack_archive("gdc-client_2.3_Ubuntu_x64.zip")
51+
# shutil.unpack_archive("gdc-client_2.3_Ubuntu_x64.zip")
4252
if not os.path.exists('gdc-client'):
4353
raise FileNotFoundError("gdc-client executable not found after extraction.")
4454
# Ensure 'gdc-client' is executable
@@ -683,4 +693,4 @@ def main():
683693

684694
if __name__ == "__main__":
685695
main()
686-
696+
Lines changed: 327 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,327 @@
1+
import pandas as pd
2+
import requests
3+
import os
4+
import argparse
5+
import numpy as np
6+
7+
8+
9+
def align_to_linkml_schema(input_df):
    """
    Map the 'model_type' column of the input DataFrame to a set of predefined
    categories according to a specified mapping dictionary. This alignment is
    intended to ensure the DataFrame's 'model_type' values conform to a schema
    compatible with the LinkML model.

    Rows whose 'model_type' has no mapping are dropped, a constant 'species'
    column is added, and the result is sorted by 'improve_sample_id'.

    Parameters
    ----------
    input_df : pd.DataFrame
        The input DataFrame containing 'model_type' and 'improve_sample_id'
        columns.

    Returns
    -------
    pd.DataFrame
        A new DataFrame (the caller's frame is not modified) with the
        'model_type' column values mapped to a set of predefined categories
        ('tumor', 'organoid', 'cell line'), unmappable rows removed, and rows
        sorted by 'improve_sample_id'.
    """
    mapping_dict = {
        'Solid Tissue': 'tumor',
        '3D Organoid': 'organoid',
        'Peripheral Blood Components NOS': 'tumor',
        'Buffy Coat': np.nan,
        None: np.nan,
        'Peripheral Whole Blood': 'tumor',
        'Adherent Cell Line': 'cell line',
        '3D Neurosphere': 'organoid',
        '2D Modified Conditionally Reprogrammed Cells': 'cell line',
        'Pleural Effusion': np.nan,
        'Human Original Cells': 'cell line',
        'Not Reported': np.nan,
        'Mixed Adherent Suspension': 'cell line',
        'Cell': 'cell line',
        'Saliva': np.nan
    }

    # Work on a copy so the caller's DataFrame is not mutated — the original
    # implementation modified input_df in place while documenting a copy.
    df = input_df.copy()
    # NOTE(review): assumes all GDC samples here are human — confirm upstream.
    df['species'] = 'Homo sapiens (Human)'
    # Values absent from mapping_dict map to NaN and are dropped below.
    df['model_type'] = df['model_type'].map(mapping_dict)
    df = df.dropna(subset=['model_type'])
    return df.sort_values(by='improve_sample_id')
55+
def download_from_github(raw_url, save_path):
    """
    Download a file from a raw GitHub URL and save it to a local path.

    Parameters
    ----------
    raw_url : string
        The raw GitHub URL to download the file from.

    save_path : string
        Local path where the downloaded file will be saved.

    Returns
    -------
    None

    Raises
    ------
    requests.HTTPError
        If the server responds with a non-success status code.
    """
    # timeout prevents an unresponsive host from hanging the pipeline;
    # raise_for_status avoids silently writing an error page (e.g. a 404
    # body) to disk as if it were the requested file.
    response = requests.get(raw_url, timeout=60)
    response.raise_for_status()
    with open(save_path, 'wb') as f:
        f.write(response.content)
    return
76+
77+
def extract_uuids_from_manifest(manifest_data):
    """
    Extract UUIDs from the provided manifest data.

    Reads a manifest file generated from the GDC portal (or manually) and
    collects the UUID from the first tab-separated field of every row after
    the header.

    Parameters
    ----------
    manifest_data : string
        File path to the manifest file.

    Returns
    -------
    list
        List of UUID strings, one per data row.
    """
    with open(manifest_data, 'r') as handle:
        rows = handle.readlines()
    # First row is the header; the UUID is the first tab-separated column.
    return [row.split("\t")[0] for row in rows[1:]]
95+
96+
97+
def fetch_metadata_for_samples(uuids):
    """
    Fetch metadata for given UUIDs.

    This function makes a POST request to the public GDC API /files endpoint
    to fetch case/sample/diagnosis metadata for the provided file UUIDs.

    Parameters
    ----------
    uuids : list
        List of GDC file UUIDs.

    Returns
    -------
    dict
        Parsed JSON response data.

    Raises
    ------
    requests.HTTPError
        If the GDC API responds with a non-success status code.
    """

    endpoint = "https://api.gdc.cancer.gov/files"

    # Match any file whose file_id appears in the supplied UUID list.
    filters_content = {
        "field": "files.file_id",
        "value": uuids
    }

    payload = {
        "filters": {
            "op": "in",
            "content": filters_content
        },
        "fields": (
            "cases.sample_ids,"
            "cases.case_id,"
            "cases.submitter_id,"
            "cases.annotations.case_submitter_id,"
            "cases.samples.sample_id,"
            "cases.samples.portions.analytes.aliquots.aliquot_id,"
            "cases.samples.sample_type,"
            "cases.diagnoses.submitter_id,"
            "cases.diagnoses.diagnosis_id,"
            "cases.diagnoses.classification_of_tumor,"
            "cases.diagnoses.tissue_or_organ_of_origin,"
            "cases.diagnoses.primary_diagnosis,"
            "cases.diagnoses.treatments.treatment_id,"##getting these but ignoring for now
            "cases.diagnoses.treatments.submitter_id," ##getting these but ignoring for now
            "cases.samples.tumor_descriptor,"
            "cases.samples.composition"
        ),
        "format": "JSON",
        # Request as many records as UUIDs so nothing is lost to pagination.
        "size": str(len(uuids))
    }

    # timeout keeps an unresponsive API from hanging the pipeline forever;
    # raise_for_status surfaces HTTP errors instead of parsing an error body.
    response = requests.post(endpoint, json=payload, timeout=300)
    response.raise_for_status()
    return response.json()
150+
151+
152+
def extract_data(data):
    """
    Flatten GDC API JSON data into a pandas DataFrame.

    Walks hits -> cases -> samples -> portions -> analytes -> aliquots and
    emits one row per aliquot. Each sample is paired with the diagnosis at
    the same positional index; samples with no matching diagnosis produce
    no rows.

    Parameters
    ----------
    data : dict
        JSON data returned by the GDC portal /files endpoint.

    Returns
    -------
    pd.DataFrame
        One row per aliquot with entry, case, diagnosis, and sample fields.
    """
    rows = []
    for hit in data['data']['hits']:
        for case in hit['cases']:
            for idx, sample in enumerate(case['samples']):
                for portion in sample['portions']:
                    for analyte in portion['analytes']:
                        for aliquot in analyte['aliquots']:
                            # Pair sample idx with diagnosis idx; skip when
                            # there are fewer diagnoses than samples.
                            if idx >= len(case['diagnoses']):
                                continue
                            diagnosis = case['diagnoses'][idx]
                            rows.append({
                                'entry_id': hit['id'],
                                'case_uuid': case['case_id'],
                                'case_id': case['submitter_id'],
                                'tissue_or_organ_of_origin': diagnosis['tissue_or_organ_of_origin'],
                                'primary_diagnosis': diagnosis['primary_diagnosis'],
                                'diagnosis_id': diagnosis['submitter_id'],
                                'tumor_classification': diagnosis['classification_of_tumor'],
                                'sample_id': sample['sample_id'],
                                'sample_type': sample['sample_type'],
                                'composition': sample.get('composition', None),
                                'id': aliquot['aliquot_id'],
                            })
    return pd.DataFrame(rows)
190+
191+
def filter_and_subset_data(df, maxval, mapfile):
    """
    Filter and subset the data, then assign improve_sample_id at the end.

    Pipeline: de-duplicate, merge in common cancer types, rename columns to
    the schema, melt identifier columns into other_id/other_id_source pairs,
    drop rows lacking an aliquot id, and finally number each unique aliquot
    with a fresh improve_sample_id continuing from maxval.

    Parameters
    ----------
    df : pd.DataFrame
        A tidied pandas DataFrame containing the full samples table.
    maxval : int
        The maximum value of improve_sample_id from previous samples, used to continue numbering.
    mapfile : str
        File path to the mapping file that maps primary diagnosis and tissue of origin to common cancer types.

    Returns
    -------
    pd.DataFrame
        The processed DataFrame ready for further use.
    """
    # Remove duplicates based on all columns except 'id' — rows identical in
    # everything but aliquot id are treated as the same sample.
    duplicates_mask = df.drop('id', axis=1).duplicated(keep='first')
    # NOTE(review): ISO-8859-1 suggests the mapping CSV contains non-UTF-8
    # characters — confirm the file's actual encoding.
    cmap = pd.read_csv(mapfile, encoding='ISO-8859-1')
    filt = df[~duplicates_mask]
    filt = filt.drop_duplicates()

    # Merge with the cancer type mapping file (left join keeps samples even
    # when no cancer_type mapping exists; those get NaN).
    filt = pd.merge(
        filt,
        cmap,
        right_on=['tissue_or_organ_of_origin', 'primary_diagnosis'],
        left_on=['tissue_or_organ_of_origin', 'primary_diagnosis'],
        how='left'
    )

    # Rename columns to match the schema
    filt = filt.rename(
        columns={
            "composition": "model_type",
            "case_id": "common_name",
            "id": "other_names"
        }
    )

    # Melt the dataframe to create 'other_id' and 'other_id_source': each of
    # the three identifier columns becomes its own row per sample.
    longtab = pd.melt(
        filt,
        id_vars=['common_name', 'other_names', 'model_type', 'cancer_type'],
        value_vars=['diagnosis_id', 'tumor_classification', 'sample_type']
    )
    longtab = longtab.rename(columns={'variable': 'other_id_source', 'value': 'other_id'}).drop_duplicates()

    # Handle missing 'other_names' (aliquot ids) — warn, then exclude them,
    # since the improve_sample_id numbering below keys on other_names.
    missing_other_names = longtab[longtab['other_names'].isnull()]
    if not missing_other_names.empty:
        print("Warning: Some samples have missing 'other_names' (aliquot_id). These samples will be excluded.")
        print(missing_other_names)
    longtab = longtab.dropna(subset=['other_names'])

    # Convert 'other_names' to string to ensure consistent merge keys below.
    longtab['other_names'] = longtab['other_names'].astype(str)

    # Reassign 'improve_sample_id's at the end, one per unique aliquot.
    unique_other_names = longtab['other_names'].unique()
    print("Number of unique 'other_names' after filtering:", len(unique_other_names))

    # Create a new mapping: ids continue sequentially from maxval + 1.
    mapping = pd.DataFrame({
        'other_names': unique_other_names,
        'improve_sample_id': range(int(maxval) + 1, int(maxval) + len(unique_other_names) + 1)
    })

    # Merge the mapping back into 'longtab'
    longtab = pd.merge(longtab, mapping, on='other_names', how='left')

    # Debugging: check longtab after reassigning IDs.
    print("\nlongtab columns after reassigning 'improve_sample_id':", longtab.columns)
    print("longtab head after reassigning IDs:")
    print(longtab.head())

    # Verify that all 'improve_sample_id's are assigned (a left merge can
    # leave NaNs only if a key failed to match — this should not happen).
    missing_ids = longtab[longtab['improve_sample_id'].isnull()]
    if not missing_ids.empty:
        print("\nWarning: Some samples could not be assigned an 'improve_sample_id'.")
        print(missing_ids)
    return longtab
275+
276+
def main():
    """
    Retrieve and process pancpdo (pancreatic patient-derived organoid) samples
    metadata from GDC (Genomic Data Commons) and create a samples.csv file for
    the schema.

    This function automates the workflow of:
    1. Extracting UUIDs (Unique Universal Identifiers) from the local manifest file.
    2. Fetching the metadata for the samples corresponding to the UUIDs from the GDC API via POST request.
    3. Structuring the fetched metadata into a pandas dataframe.
    4. Filtering and subsetting the dataframe to align with the schema.
    5. Writing the processed dataframe to a CSV file.

    Notes:
    ------
    The GDC API is publicly accessible, so no authentication is required.

    To Run:
    --------
    python 01-createPancPDOSamplesFile.py [--prevSamples PREV] [--mapfile MAP]

    Output:
    -------
    A local CSV file named '/tmp/pancpdo_samples.csv' containing the processed metadata.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--prevSamples', dest='prev_samps', nargs='?', type=str,
                        default='', const='', help='Previous sample file')
    parser.add_argument('--mapfile', dest='map', default='pancpdo_cancer_types.csv',
                        help='Mapping to common_cancer from primary_diagnosis and tissue_or_organ_of_origin')

    args = parser.parse_args()
    # The manifest is bundled with the container; it can alternatively be
    # fetched with download_from_github(<raw_url>, manifest_path).
    manifest_path = "full_manifest.txt"
    uuids = extract_uuids_from_manifest(manifest_path)
    metadata = fetch_metadata_for_samples(uuids)
    df = extract_data(metadata)

    # Without a previous samples file, improve_sample_id numbering starts at 1
    # and will not align with the other coderdata datasets.
    if args.prev_samps is None or args.prev_samps == '':
        print("No Previous Samples file was found. PANCPDO Data will not align with other datasets. Use ONLY for testing purposes.")
        maxval = 0
    else:
        print("Previous Samples File Provided. Running PANCPDO Sample File Generation")
        maxval = max(pd.read_csv(args.prev_samps).improve_sample_id)

    output = filter_and_subset_data(df, maxval, args.map)
    aligned = align_to_linkml_schema(output)
    print(aligned)
    aligned.to_csv("/tmp/pancpdo_samples.csv", index=False)


# Guard the entry point so importing this module does not trigger the full
# download pipeline (matches the convention used by the hcmi scripts).
if __name__ == "__main__":
    main()
326+
327+

0 commit comments

Comments
 (0)