@@ -322,7 +322,43 @@ def split_train_other(
322322 random_state : Optional [Union [int ,RandomState ]]= None ,
323323 ** kwargs : dict ,
324324 ) -> TwoWaySplit :
325+ """
326+ Split the dataset into training and another subset (e.g., testing or validation).
325327
328+ Parameters
329+ ----------
330+ split_type : {'mixed-set', 'drug-blind', 'cancer-blind'}, optional
331+ The type of split to perform, by default 'mixed-set'.
332+ - `mixed-set`: A random split, disregarding drug or cancer associations.
333+ - `drug-blind`: Ensures disjoint splits by drug ID.
334+ - `cancer-blind`: Ensures disjoint splits by sample or cancer association.
335+ ratio : tuple[int, int], optional
336+ The ratio of train to other subset sizes, by default (8, 2).
337+ For instance, (8, 2) translates to an 80%-20% split.
338+ stratify_by : str, optional
339+ The column used for stratification, if stratification is needed, by default None.
340+ balance : bool, optional
341+ Whether to adjust to balanced splits (equal representation of classes), by default False.
342+ random_state : int | RandomState | None, optional
343+ A seed for reproducibility of the random split, by default None.
344+ **kwargs : dict
345+ Additional arguments for advanced customization of the split.
346+
347+ Returns
348+ -------
349+ TwoWaySplit
350+ An object containing the train and other subsets as separate datasets.
351+
352+ Notes
353+ -----
354+ This method is a wrapper around the `split_train_other` utility function and
355+ ensures that the split configuration is applied to the dataset (self).
356+
357+ Examples
358+ --------
359+ >>> split = dataset.split_train_other(split_type='cancer-blind', ratio=(7,3))
360+ >>> print(split.train, split.other)
361+ """
326362 split = split_train_other (
327363 data = self ,
328364 split_type = split_type ,
@@ -347,6 +383,47 @@ def split_train_test_validate(
347383 random_state : Optional [Union [int ,RandomState ]]= None ,
348384 ** kwargs : dict ,
349385 ) -> Split :
386+ """
387+ Split the dataset into training, testing, and validation subsets.
388+
389+ Parameters
390+ ----------
391+ split_type : {'mixed-set', 'drug-blind', 'cancer-blind'}, optional
392+ Defines the type of splitting to perform, by default 'mixed-set'.
393+ - `mixed-set`: Data is split randomly, disregarding drug or cancer associations.
394+ - `drug-blind`: Ensures disjoint splits by drug association.
395+ - `cancer-blind`: Ensures disjoint splits by sample or cancer association.
396+        ratio : tuple[int, int, int], optional
397+            Defines the ratio of train, test, and validate sizes, by default (8, 1, 1).
398+            For instance, (8, 1, 1) means 80% train, 10% test, 10% validation.
399+ stratify_by : str, optional
400+ Column to use for stratification, if required, by default None.
401+ balance : bool, optional
402+ Whether to balance the splits (equal representation of classes), by default False.
403+ random_state : int | RandomState | None, optional
404+ A random seed for reproducibility, by default None.
405+ **kwargs : dict
406+ Additional arguments for customization of the split logic.
407+
408+ Returns
409+ -------
410+ Split
411+ A Split object containing the training, testing, and validation subsets.
412+
413+ Notes
414+ -----
415+ - This method uses the `split_train_test_validate` utility function internally.
416+ - Ensures disjoint subsets based on the specified splitting criteria, especially
417+ for `drug-blind` and `cancer-blind` splits.
418+ - Includes options for stratifying splits based on a drug response metric.
419+
420+ Examples
421+ --------
422+ >>> split = dataset.split_train_test_validate(
423+ ... split_type='drug-blind', ratio=(7,2,1), stratify_by='auc'
424+ ... )
425+ >>> print(split.train, split.test, split.validate)
426+ """
350427 split = split_train_test_validate (
351428 data = self ,
352429 split_type = split_type ,
@@ -371,7 +448,46 @@ def train_test_validate(
371448 random_state : Optional [Union [int ,RandomState ]]= None ,
372449 ** kwargs : dict ,
373450 ) -> Split :
451+ """
452+ Split the dataset into training, testing, and validation subsets.
374453
454+ Parameters
455+ ----------
456+ split_type : {'mixed-set', 'drug-blind', 'cancer-blind'}, optional
457+ Defines the type of splitting, by default 'mixed-set'.
458+ - `mixed-set`: Random splitting, disregarding drug or cancer associations.
459+ - `drug-blind`: Ensures disjoint splits based on drug associations.
460+ - `cancer-blind`: Ensures disjoint splits based on cancer or sample associations.
461+ ratio : tuple[int, int, int], optional
462+ The proportion of data for train, test, and validation splits
463+ (e.g., (8,1,1) means 80% train, 10% test, 10% validation), by default (8,1,1).
464+ stratify_by : str, optional
465+ The column used for stratification (e.g., a drug response metric), by default None.
466+ balance : bool, optional
467+ Whether to adjust splits to ensure balanced classes, by default False.
468+ random_state : int | RandomState | None, optional
469+ Random seed for reproducibility, by default None.
470+ **kwargs : dict
471+ Additional arguments for customization, passed to the stratification logic.
472+
473+ Returns
474+ -------
475+ Split
476+ An object containing the training, testing, and validation subsets.
477+
478+ Notes
479+ -----
480+ - This method wraps around the `split_train_test_validate` utility function.
481+ - Useful for creating disjoint and optionally stratified splits of the dataset.
482+ - Supports reproducibility through `random_state`.
483+
484+ Examples
485+ --------
486+ >>> split = dataset.train_test_validate(
487+ ... split_type='cancer-blind', ratio=(6,2,2), stratify_by='fit_auc'
488+ ... )
489+ >>> print(split.train, split.test, split.validate)
490+ """
375491 split = split_train_test_validate (
376492 data = self ,
377493 split_type = split_type ,
@@ -441,28 +557,54 @@ def load(
441557 local_path : Union [str ,Path ]= Path .cwd (),
442558 from_pickle :bool = False
443559 ) -> Dataset :
560+
444561 """
445- _summary_
562+ Load a dataset from local files.
563+
564+ This function allows loading either from raw data files (e.g., CSV, TSV)
565+ or from a pickled file. The raw data is parsed and indexed into a `Dataset`
566+ object based on predefined types. If pickled data is available, it can be
567+ directly loaded for faster access.
446568
447569 Parameters
448570 ----------
449571 name : str
450- _description_
451- directory : str | Path, optional
452- _description_, by default Path.cwd()
572+ The name of the dataset to load (used as a filename prefix).
573+ local_path : str | Path, optional
574+ The local directory where the dataset files are located, by default the current working directory.
575+ from_pickle : bool, optional
576+ If True, attempts to load the dataset from a pickled file, by default False.
453577
454578 Returns
455579 -------
456580 Dataset
457- _description_
581+ An object containing the loaded dataset with attributes for specific data types like 'transcriptomics',
582+ 'proteomics', 'mutations', etc.
458583
459584 Raises
460585 ------
461586 OSError
462- _description_
587+ If the specified directory does not exist.
463588 TypeError
464- _description_
589+            If the provided `local_path` is not a valid path type (i.e., neither ``str`` nor ``Path``).
590+ FileNotFoundError
591+ If no suitable pickled file is found when `from_pickle=True`.
592+
593+ Notes
594+ -----
595+ - When loading from raw files, supported file formats are `.csv`, `.tsv`, `.csv.gz`, `.tsv.gz`.
596+        - The `genes` dataset is filtered to include only genes relevant to other subdatasets ('transcriptomics', 'proteomics', etc.).
597+ - When loading from pickle, the function looks for files with extensions `.pkl` or `.pickle`.
598+
599+ Examples
600+ --------
601+ Load a dataset from raw files:
602+ >>> dataset = load(name='my_dataset', local_path='/data/datasets')
603+
604+ Load a dataset from a pickled file:
605+ >>> dataset = load(name='my_dataset', local_path='/data/datasets', from_pickle=True)
465606 """
607+
466608
467609 data_types_to_load = (
468610 'transcriptomics' ,
0 commit comments