Skip to content

Commit 9184363

Browse files
committed
added copy_number -> copy_call conversion
1 parent 7c0d4b2 commit 9184363

1 file changed

Lines changed: 68 additions & 1 deletion

File tree

coderdata/dataset/dataset.py

Lines changed: 68 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
"""
2+
Dataset container class and helper functions for loading, formatting and splitting coderdata datasets.
3+
4+
"""
15

26
from __future__ import annotations
37

@@ -11,6 +15,7 @@
1115
import numpy as np
1216
from numpy.random import RandomState
1317
import pandas as pd
18+
# import polars as pl
1419

1520
from sklearn.model_selection import GroupShuffleSplit
1621
from sklearn.model_selection import ShuffleSplit
@@ -27,6 +32,25 @@ class Split:
2732

2833
class Dataset:
2934

35+
data_format_params = {
36+
"samples": (
37+
"improve_sample_id", "cancer_type", "model_type", "common_name",
38+
"other_id", "other_names", "id_source", "species"
39+
),
40+
"transcriptomics": (
41+
"improve_sample_id", "entrez_id", "transcriptomics"
42+
),
43+
"proteomics": ("improve_sample_id", "entrez_id", "proteomics"),
44+
"mutations": ("improve_sample_id", "entrez_id", "mutation"),
45+
"copy_number": ("improve_sample_id", "entrez_id", "copy_number"),
46+
"methylation": ("improve_sample_id", "entrez_id", "methylation"),
47+
"experiments": (
48+
"improve_sample_id", "improve_drug_id", "dose_response_value"
49+
),
50+
"drugs": ("improve_drug_id", "chem_name", "isoSMILES"),
51+
"genes": ("entrez_id", "gene_symbol", "other_id")
52+
}
53+
3054
def __init__(
3155
self,
3256
name: str=None,
@@ -49,6 +73,35 @@ def __init__(
4973
5074
Each attribute will be a Pandas DataFrame corresponding to a file with the dataset prefix.
5175
Attributes include transcriptomics, proteomics, mutations, etc.
76+
77+
Parameters
78+
----------
79+
name : str
80+
The name of the dataset that is stored in the object
81+
transcriptomics : pd.DataFrame, optional
82+
DataFrame holding the dataset's transcriptomics data, by default None
83+
proteomics : pd.DataFrame, optional
84+
DataFrame holding the dataset's proteomics data, by default None
85+
mutations : pd.DataFrame, optional
86+
DataFrame holding the dataset's mutation data, by default None
87+
copy_number : pd.DataFrame, optional
88+
DataFrame holding the dataset's copy number data, by default None
89+
samples : pd.DataFrame, optional
90+
DataFrame holding the dataset's sample information, by default None
91+
drugs : pd.DataFrame, optional
92+
DataFrame holding the dataset's drug information, by default None
93+
mirna : pd.DataFrame, optional
94+
DataFrame holding the dataset's miRNA data, by default None
95+
experiments : pd.DataFrame, optional
96+
DataFrame holding the dataset's experiment (dose response) data, by default None
97+
methylation : pd.DataFrame, optional
98+
DataFrame holding the dataset's methylation data, by default None
99+
metabolomics : pd.DataFrame, optional
100+
DataFrame holding the dataset's metabolomics data, by default None
101+
genes : pd.DataFrame, optional
102+
DataFrame holding the dataset's gene information, by default None
103+
full : pd.DataFrame, optional
104+
_description_, by default None
52105
"""
53106

54107
self.name = name
@@ -72,6 +125,11 @@ def __init__(
72125
# ----------------------------
73126

74127

128+
@property
129+
def data_format_params(self):
130+
return self._data_format_params
131+
132+
75133
@property
76134
def name(self):
77135
return self._name
@@ -451,15 +509,23 @@ def format(
451509
raise ValueError(
452510
f"'{data_type}' attribute of Dataset cannot be 'None'"
453511
)
512+
copy_call = kwargs.get('copy_call', False)
454513

455-
# TODO: add way to extract copy_call
456514
ret = pd.pivot_table(
457515
data=data.copy_number,
458516
index='entrez_id',
459517
columns='improve_sample_id',
460518
values='copy_number',
461519
aggfunc='mean',
462520
)
521+
if copy_call:
522+
ret = ret.apply(
523+
pd.cut,
524+
bins = [0, 0.5210507, 0.7311832, 1.214125, 1.422233, 2],
525+
labels = ['deep del', 'het loss', 'diploid', 'gain', 'amp'],
526+
include_lowest=True
527+
)
528+
463529

464530
elif data_type == "proteomics":
465531
if data.proteomics is None:
@@ -920,6 +986,7 @@ def train_test_validate(
920986
return Split(data_train, data_test, data_val)
921987

922988

989+
923990
def _load_file(file_path: Path) -> pd.DataFrame:
924991
if file_path.suffix == '.gz':
925992
return pd.read_csv(

0 commit comments

Comments
 (0)