
Commit 34c24bc

skeleton code + index extraction for splits
1 parent 83b0e34 commit 34c24bc

1 file changed: 342 additions & 0 deletions
@@ -0,0 +1,342 @@
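# Command-line workflow that prepares coderdata datasets for the IMPROVE
# pipeline: `setup` creates the folder structure, `download` fetches the
# available datasets, `process` builds the combined response table plus the
# train/test/validate split files, and `all` chains these steps together.
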
import argparse
from os import PathLike
from pathlib import Path
from typing import Union
import sys

import coderdata as cd
import pandas as pd

def main():

    main_parser = argparse.ArgumentParser(add_help=True)

    command_parsers = main_parser.add_subparsers(
        dest="command",
        title="commands",
        required=True,
    )

    # arguments shared by all subcommands
    p_shared_args = argparse.ArgumentParser(add_help=False)
    p_shared_args.add_argument(
        '-w', '--work_dir',
        dest='WORKDIR',
        type=_check_folder,
        default=Path.cwd(),
    )
    p_shared_args.add_argument(
        '--overwrite',
        dest='OVERWRITE',
        action='store_true',
    )

    p_setup_workflow = command_parsers.add_parser(
        "setup",
        parents=[p_shared_args],
        add_help=True,
    )
    p_setup_workflow.set_defaults(func=setup_workflow)

    p_download_datasets = command_parsers.add_parser(
        "download",
        parents=[p_shared_args],
        add_help=True,
    )
    p_download_datasets.set_defaults(func=download_datasets)

    p_process_datasets = command_parsers.add_parser(
        "process",
        parents=[p_shared_args],
        add_help=True,
    )
    p_process_datasets.set_defaults(func=process_datasets)
    p_process_datasets.add_argument(
        '-s', '--split_type', dest='SPLIT_TYPE',
        type=str,
        choices=['mixed-set', 'drug-blind', 'cancer-blind'],
        default='mixed-set',
    )
    p_process_datasets.add_argument(
        '-n', '--num_splits', dest='NUM_SPLITS',
        type=int,
        default=10,
    )

    p_all = command_parsers.add_parser(
        "all",
        parents=[p_shared_args],
        add_help=True,
    )
    p_all.set_defaults(func=full_workflow)

    # print the help text if no command line arguments were given
    if len(sys.argv) == 1:
        main_parser.print_help(sys.stderr)
        sys.exit(0)
    try:
        args = main_parser.parse_args()
    # `_check_folder` raises OSError (of which FileNotFoundError is a
    # subclass) for missing folders; argparse does not intercept OSError,
    # so catch it here alongside ValueError
    except (OSError, ValueError) as e:
        sys.exit(e)
    # dispatch to the function registered via `set_defaults(func=...)`
    args.func(args)

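# A minimal usage sketch (assuming the script is saved as `workflow.py`;
# the actual file name is not part of this commit). Note that the work
# directory passed via -w/--work_dir must already exist (see
# `_check_folder` below):
#
#   python workflow.py setup -w ./wrk
#   python workflow.py download -w ./wrk --overwrite
#   python workflow.py process -w ./wrk -s drug-blind -n 5
#   python workflow.py all -w ./wrk
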
def full_workflow(args):
    setup_workflow(args)
    download_datasets(args)
    # run the processing step as well so that `all` covers the
    # complete workflow
    process_datasets(args)

def process_datasets(args):

    local_path = args.WORKDIR.joinpath('data_in_tmp')

    # get the information on which datasets are available
    data_sets_info = cd.list_datasets(raw=True)

    # load all available datasets into a dict where the dataset name
    # is the key
    data_sets = {}
    for data_set in data_sets_info.keys():
        data_sets[data_set] = cd.load(name=data_set, local_path=local_path)

    #-------------------------------------------------------------------
    # concatenating all experiments / responses to create response.tsv
    #-------------------------------------------------------------------
    experiments = []
    for data_set in data_sets_info.keys():
        # not all datasets have experiments / drug response data
        if data_sets[data_set].experiments is not None:
            # format existing response data to wide
            experiment = data_sets[data_set].format(
                data_type='experiments',
                shape='wide',
                metrics=[
                    'fit_auc',
                    'fit_ic50',
                    'fit_r2',
                    'fit_ec50se',
                    'fit_einf',
                    'fit_hs',
                    'aac',
                    'auc',
                    'dss',
                ],
            )
            experiments.append(experiment)

    # concatenate the response data of all datasets and "clean up"
    response_data = pd.concat(experiments, axis=0, ignore_index=True)
    # TODO: potentially more columns must be renamed
    # (e.g. fit_auc to auc). If so, this would happen here
    response_data.rename(
        columns={'improve_drug_id': 'improve_chem_id'},
        inplace=True,
    )
    # temporary addition of an "index column" that serves as a reference
    # for the extraction of the split files
    response_data['index'] = response_data.index

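    # Lookup sketch for the index column (values hypothetical): if a split
    # contains the key ('SAMPLE_1', 'DRUG_1', 72, 'CCLE'), an inner merge
    # of that key against response_data returns the matching response
    # rows, and their 'index' values are exactly the row numbers that the
    # split files below store.
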
    #-------------------------------------------------------------------
    # creation of splits
    #-------------------------------------------------------------------

    splits_folder = args.WORKDIR.joinpath('data_out', 'splits')
    split_type = args.SPLIT_TYPE
    # TODO: potentially change these vars to be read from `args`
    ratio = (8, 1, 1)  # train/test/validate ratio, i.e. 80%/10%/10%
    stratify_by = None
    random_state = None

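    # `train_test_validate` (as used below) yields an object exposing
    # `.train`, `.test` and `.validate` subsets, each with its own
    # `experiments` table from which the identifying keys are extracted.
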
    for data_set in data_sets_info.keys():
        if data_sets[data_set].experiments is not None:
            splits = {}
            for i in range(args.NUM_SPLITS):
                splits[i] = data_sets[data_set].train_test_validate(
                    split_type=split_type,
                    ratio=ratio,
                    stratify_by=stratify_by,
                    random_state=random_state,
                )
                # train, test and validate are handled identically apart
                # from the subset that is queried and the outfile suffix
                subsets = {
                    'train': splits[i].train,
                    'test': splits[i].test,
                    'val': splits[i].validate,
                }
                for suffix, subset in subsets.items():
                    # unique keys that identify the subset's responses
                    keys = (
                        subset
                        .experiments[[
                            'improve_sample_id',
                            'improve_drug_id',
                            'time',
                            'study',
                        ]]
                        .drop_duplicates()
                    )
                    keys.rename(
                        columns={'improve_drug_id': 'improve_chem_id'},
                        inplace=True,
                    )
                    # inner merge against the combined response data to
                    # look up the row numbers of the subset's responses
                    row_nums = pd.merge(
                        response_data,
                        keys,
                        how='inner',
                        on=[
                            'improve_sample_id',
                            'improve_chem_id',
                            'time',
                            'study',
                        ],
                    )
                    # write only the row numbers to the split file
                    outfile_path = splits_folder.joinpath(
                        f"{data_set}_split_{i}_{suffix}.txt"
                    )
                    row_nums.to_csv(
                        path_or_buf=outfile_path,
                        columns=['index'],
                        index=False,
                        header=False,
                    )

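    # At this point every "<data_set>_split_<i>_(train|test|val).txt" holds
    # one row number per line, each referencing a row of the combined
    # response data (the later response.tsv).
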
    # TODO: look up the row ids for all data items of each data source to
    # create "<STUDY>_all.txt" in /splits

    # TODO: join the "metadata tables" like copynumber etc.

def download_datasets(args):
    local_path = args.WORKDIR.joinpath('data_in_tmp')
    # `--overwrite` maps onto coderdata's `exist_ok` flag; the exact
    # overwrite semantics are defined by `cd.download`
    exist_ok = args.OVERWRITE
    try:
        cd.download(name='all', local_path=local_path, exist_ok=exist_ok)
    except FileExistsError:
        sys.exit(
            "Data files already exist. To overwrite them use the command "
            "line argument '--overwrite'"
        )

def setup_workflow(args):

    # Create the folder structure according to the IMPROVE pipeline,
    # including some temporary working directories. The structure will
    # look like this:
    #
    # .
    # ├── data_in_tmp   <- will contain the downloaded datasets etc.
    # └── data_out      <- prepared data for the IMPROVE pipeline
    #     ├── splits    <- contains n split files per dataset
    #     ├── x_data    <- contains combined "master tables" of data
    #     └── y_data    <- contains drug responses

    parent = args.WORKDIR
    exist_ok = args.OVERWRITE

    data_in = parent.joinpath('data_in_tmp')
    data_out = parent.joinpath('data_out')
    splits = data_out.joinpath('splits')
    x_data = data_out.joinpath('x_data')
    y_data = data_out.joinpath('y_data')

    try:
        data_in.mkdir(exist_ok=exist_ok)
        data_out.mkdir(exist_ok=exist_ok)
        splits.mkdir(exist_ok=exist_ok)
        x_data.mkdir(exist_ok=exist_ok)
        y_data.mkdir(exist_ok=exist_ok)
    except FileExistsError:
        sys.exit(
            "Some folders already exist. To overwrite contents use the "
            "command line argument '--overwrite'"
        )

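# Note: pathlib's `mkdir(exist_ok=True)` only suppresses the
# FileExistsError for folders that already exist; it does not clear their
# contents. Any actual overwriting of previously prepared files happens in
# the steps that write into these folders.
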
def _check_folder(path: Union[str, PathLike, Path]) -> Path:
    """
    Helper function to check if a defined folder exists.

    Parameters
    ----------
    path : str | PathLike | Path
        The folder path to check.

    Returns
    -------
    Path
        Cleaned path object with the absolute path to the folder
        passed to the function.

    Raises
    ------
    TypeError
        If the passed path argument is not of the requested type.
    OSError
        If the passed path argument does not link to a valid existing
        folder.
    """

    if not isinstance(path, (str, PathLike, Path)):
        raise TypeError(
            f"'path' must be of type str, PathLike or Path. Supplied "
            f"argument is of type {type(path)}."
        )
    if not isinstance(path, Path):
        abs_path = Path(path).absolute()
    else:
        abs_path = path.absolute()

    if not abs_path.is_dir():
        raise OSError(
            f"The defined folder path '{path}' does not exist or is not "
            f"a folder."
        )

    return abs_path

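# Behaviour sketch for `_check_folder` (paths hypothetical):
#   _check_folder('.')             -> absolute Path to the current folder
#   _check_folder('/no/such/dir')  -> raises OSError
#   _check_folder(42)              -> raises TypeError
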
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        pass
