Skip to content

Commit ff1f26d

Browse files
committed
more docstrings and typo fix
A few more changes to documentation for coderdata package
1 parent ceb330a commit ff1f26d

2 files changed

Lines changed: 150 additions & 8 deletions

File tree

coderdata/dataset/dataset.py

Lines changed: 149 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -322,7 +322,43 @@ def split_train_other(
322322
random_state: Optional[Union[int,RandomState]]=None,
323323
**kwargs: dict,
324324
) -> TwoWaySplit:
325+
"""
326+
Split the dataset into training and another subset (e.g., testing or validation).
325327
328+
Parameters
329+
----------
330+
split_type : {'mixed-set', 'drug-blind', 'cancer-blind'}, optional
331+
The type of split to perform, by default 'mixed-set'.
332+
- `mixed-set`: A random split, disregarding drug or cancer associations.
333+
- `drug-blind`: Ensures disjoint splits by drug ID.
334+
- `cancer-blind`: Ensures disjoint splits by sample or cancer association.
335+
ratio : tuple[int, int], optional
336+
The ratio of train to other subset sizes, by default (8, 2).
337+
For instance, (8, 2) translates to an 80%-20% split.
338+
stratify_by : str, optional
339+
The column used for stratification, if stratification is needed, by default None.
340+
balance : bool, optional
341+
Whether to balance the splits (equal representation of classes), by default False.
342+
random_state : int | RandomState | None, optional
343+
A seed for reproducibility of the random split, by default None.
344+
**kwargs : dict
345+
Additional arguments for advanced customization of the split.
346+
347+
Returns
348+
-------
349+
TwoWaySplit
350+
An object containing the train and other subsets as separate datasets.
351+
352+
Notes
353+
-----
354+
This method is a wrapper around the `split_train_other` utility function and
355+
ensures that the split configuration is applied to the dataset (self).
356+
357+
Examples
358+
--------
359+
>>> split = dataset.split_train_other(split_type='cancer-blind', ratio=(7,3))
360+
>>> print(split.train, split.other)
361+
"""
326362
split = split_train_other(
327363
data=self,
328364
split_type=split_type,
@@ -347,6 +383,47 @@ def split_train_test_validate(
347383
random_state: Optional[Union[int,RandomState]]=None,
348384
**kwargs: dict,
349385
) -> Split:
386+
"""
387+
Split the dataset into training, testing, and validation subsets.
388+
389+
Parameters
390+
----------
391+
split_type : {'mixed-set', 'drug-blind', 'cancer-blind'}, optional
392+
Defines the type of splitting to perform, by default 'mixed-set'.
393+
- `mixed-set`: Data is split randomly, disregarding drug or cancer associations.
394+
- `drug-blind`: Ensures disjoint splits by drug association.
395+
- `cancer-blind`: Ensures disjoint splits by sample or cancer association.
396+
ratio : tuple[int, int, int], optional
397+
Defines the ratio of train, test, and validation sizes, e.g., (8,1,1)
398+
means 80% train, 10% test, 10% validation.
399+
stratify_by : str, optional
400+
Column to use for stratification, if required, by default None.
401+
balance : bool, optional
402+
Whether to balance the splits (equal representation of classes), by default False.
403+
random_state : int | RandomState | None, optional
404+
A random seed for reproducibility, by default None.
405+
**kwargs : dict
406+
Additional arguments for customization of the split logic.
407+
408+
Returns
409+
-------
410+
Split
411+
A Split object containing the training, testing, and validation subsets.
412+
413+
Notes
414+
-----
415+
- This method uses the `split_train_test_validate` utility function internally.
416+
- Ensures disjoint subsets based on the specified splitting criteria, especially
417+
for `drug-blind` and `cancer-blind` splits.
418+
- Includes options for stratifying splits based on a drug response metric.
419+
420+
Examples
421+
--------
422+
>>> split = dataset.split_train_test_validate(
423+
... split_type='drug-blind', ratio=(7,2,1), stratify_by='auc'
424+
... )
425+
>>> print(split.train, split.test, split.validate)
426+
"""
350427
split = split_train_test_validate(
351428
data=self,
352429
split_type=split_type,
@@ -371,7 +448,46 @@ def train_test_validate(
371448
random_state: Optional[Union[int,RandomState]]=None,
372449
**kwargs: dict,
373450
) -> Split:
451+
"""
452+
Split the dataset into training, testing, and validation subsets.
374453
454+
Parameters
455+
----------
456+
split_type : {'mixed-set', 'drug-blind', 'cancer-blind'}, optional
457+
Defines the type of splitting, by default 'mixed-set'.
458+
- `mixed-set`: Random splitting, disregarding drug or cancer associations.
459+
- `drug-blind`: Ensures disjoint splits based on drug associations.
460+
- `cancer-blind`: Ensures disjoint splits based on cancer or sample associations.
461+
ratio : tuple[int, int, int], optional
462+
The proportion of data for train, test, and validation splits
463+
(e.g., (8,1,1) means 80% train, 10% test, 10% validation), by default (8,1,1).
464+
stratify_by : str, optional
465+
The column used for stratification (e.g., a drug response metric), by default None.
466+
balance : bool, optional
467+
Whether to adjust splits to ensure balanced classes, by default False.
468+
random_state : int | RandomState | None, optional
469+
Random seed for reproducibility, by default None.
470+
**kwargs : dict
471+
Additional arguments for customization, passed to the stratification logic.
472+
473+
Returns
474+
-------
475+
Split
476+
An object containing the training, testing, and validation subsets.
477+
478+
Notes
479+
-----
480+
- This method wraps around the `split_train_test_validate` utility function.
481+
- Useful for creating disjoint and optionally stratified splits of the dataset.
482+
- Supports reproducibility through `random_state`.
483+
484+
Examples
485+
--------
486+
>>> split = dataset.train_test_validate(
487+
... split_type='cancer-blind', ratio=(6,2,2), stratify_by='fit_auc'
488+
... )
489+
>>> print(split.train, split.test, split.validate)
490+
"""
375491
split = split_train_test_validate(
376492
data=self,
377493
split_type=split_type,
@@ -441,28 +557,54 @@ def load(
441557
local_path: Union[str,Path]=Path.cwd(),
442558
from_pickle:bool=False
443559
) -> Dataset:
560+
444561
"""
445-
_summary_
562+
Load a dataset from local files.
563+
564+
This function allows loading either from raw data files (e.g., CSV, TSV)
565+
or from a pickled file. The raw data is parsed and indexed into a `Dataset`
566+
object based on predefined types. If pickled data is available, it can be
567+
directly loaded for faster access.
446568
447569
Parameters
448570
----------
449571
name : str
450-
_description_
451-
directory : str | Path, optional
452-
_description_, by default Path.cwd()
572+
The name of the dataset to load (used as a filename prefix).
573+
local_path : str | Path, optional
574+
The local directory where the dataset files are located, by default the current working directory.
575+
from_pickle : bool, optional
576+
If True, attempts to load the dataset from a pickled file, by default False.
453577
454578
Returns
455579
-------
456580
Dataset
457-
_description_
581+
An object containing the loaded dataset with attributes for specific data types like 'transcriptomics',
582+
'proteomics', 'mutations', etc.
458583
459584
Raises
460585
------
461586
OSError
462-
_description_
587+
If the specified directory does not exist.
463588
TypeError
464-
_description_
589+
If `local_path` is not a valid path.
590+
FileNotFoundError
591+
If no suitable pickled file is found when `from_pickle=True`.
592+
593+
Notes
594+
-----
595+
- When loading from raw files, supported file formats are `.csv`, `.tsv`, `.csv.gz`, `.tsv.gz`.
596+
- The `genes` dataset is subset to include only genes relevant to the other sub-datasets ('transcriptomics', 'proteomics', etc.).
597+
- When loading from pickle, the function looks for files with extensions `.pkl` or `.pickle`.
598+
599+
Examples
600+
--------
601+
Load a dataset from raw files:
602+
>>> dataset = load(name='my_dataset', local_path='/data/datasets')
603+
604+
Load a dataset from a pickled file:
605+
>>> dataset = load(name='my_dataset', local_path='/data/datasets', from_pickle=True)
465606
"""
607+
466608

467609
data_types_to_load = (
468610
'transcriptomics',

coderdata/utils/stats.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def plot_2d_respones_metric(
4242
-------
4343
None
4444
Displays the 2D histogram plot.
45-
45+
"""
4646

4747
data_plot = _prepare_2d_hist_data(
4848
data=data.experiments,

0 commit comments

Comments
 (0)