diff --git a/src/leaspy/io/data/data.py b/src/leaspy/io/data/data.py index 1c685684..a0b59030 100644 --- a/src/leaspy/io/data/data.py +++ b/src/leaspy/io/data/data.py @@ -23,27 +23,30 @@ class Data(Iterable): Attributes ---------- - individuals : Dict[IDType, IndividualData] + individuals : :obj:`dict`[:class:`IDType`, :class:`IndividualData`] Included individuals and their associated data - iter_to_idx : Dict[int, IDType] + iter_to_idx : :obj:`dict`[:obj:`int`, :class:`IDType`] Maps an integer index to the associated individual ID - headers : List[FeatureType] + headers : :obj:`List`[:class:`FeatureType`] Feature names - dimension : int + dimension : :obj:`int` Number of features - n_individuals : int + n_individuals : :obj:`int` Number of individuals - n_visits : int + n_visits : :obj:`int` Total number of visits - cofactors : List[FeatureType] + cofactors : :obj:`List`[:class:`FeatureType`] Feature names corresponding to cofactors - event_time_name : str + event_time_name : :obj:`str` Name of the header that store the time at event in the original dataframe - event_bool_name : str + event_bool_name : :obj:`str` Name of the header that store the bool at event (censored or observed) in the original dataframe """ def __init__(self): + """ + Initialize the Data object + """ # Patients information self.individuals: Dict[IDType, IndividualData] = {} self.iter_to_idx: Dict[int, IDType] = {} @@ -59,25 +62,53 @@ def __init__(self): @property def dimension(self) -> Optional[int]: - """Number of features""" + """ + Number of features + + Returns + ------- + :obj:`int` or None: + Number of features in the dataset. If no features are present, returns None. + """ if self.headers is None: return None return len(self.headers) @property def n_individuals(self) -> int: - """Number of individuals""" + """ + Number of individuals + + Returns + ------- + :obj:`int`: + Number of individuals in the dataset. 
+ """ return len(self.individuals) @property def n_visits(self) -> int: - """Total number of visits""" + """ + Total number of visits + + Returns + ------- + :obj:`int`: + Total number of visits in the dataset. + """ if self.dimension: return sum(len(indiv.timepoints) for indiv in self.individuals.values()) @property def cofactors(self) -> List[FeatureType]: - """Feature names corresponding to cofactors""" + """ + Feature names corresponding to cofactors + + Returns + ------- + :obj:`List`[:class:`FeatureType`]: + List of feature names corresponding to cofactors. + """ if len(self.individuals) == 0: return [] # Consistency checks are in place to ensure that cofactors are the same @@ -88,6 +119,28 @@ def cofactors(self) -> List[FeatureType]: def __getitem__( self, key: Union[int, IDType, slice, List[int], List[IDType]] ) -> Union[IndividualData, Data]: + """ + Access the individuals in the Data object using their ID or integer index. + + Parameters + ---------- + key : :obj:`int` or :class:`IDType` or :obj:`slice` or :obj:`List`[:obj:`int`] or :obj:`List`[:class:`IDType`] + The key(s) to access the individuals. + Can be an integer index, an ID, a slice object or a list of integers or IDs. + + Returns + ------- + :class:`IndividualData` or :class:`Data`: + The individual data corresponding to the key(s). + If a single key is provided, returns the corresponding `IndividualData` object. + If a slice or list of keys is provided, returns a new `Data` object + containing the selected individuals. + + Raises + ------ + :exc:`.LeaspyTypeError` + If the key is not of a valid type or if the list of keys contains mixed types. + """ if isinstance(key, int): return self.individuals[self.iter_to_idx[key]] @@ -116,6 +169,15 @@ def __getitem__( raise LeaspyTypeError("Cannot access a Data object this way") def __iter__(self) -> Iterator: + """ + Iterate over the individuals in the Data object. 
+ + Returns + ------- + :class:`Iterator`: + An iterator over the individuals in the Data object. + """ + # Ordering the index list first ensures that the order used by the # iterator is consistent with integer indexing of individual data, # e.g. when using `enumerate` @@ -125,6 +187,24 @@ def __iter__(self) -> Iterator: return iter([self.individuals[it] for it in ordered_idx_list]) def __contains__(self, key: IDType) -> bool: + """ + Check if the Data object contains an individual with the given ID. + + Parameters + ---------- + key : :class:`IDType` + The ID of the individual to check for. + + Returns + ------- + :obj:`bool`: + True if the individual is present in the Data object, False otherwise. + + Raises + ------ + :exc:`.LeaspyTypeError` + If the key is not of a valid type. + """ if isinstance(key, IDType): return key in self.individuals.keys() else: @@ -140,17 +220,15 @@ def load_cofactors( Parameters ---------- - df : :class:`pandas.DataFrame` + df : :obj:`pandas.DataFrame` The dataframe where the cofactors are stored. Its index should be ID, the identifier of subjects and it should uniquely index the dataframe (i.e. one row per individual). - cofactors : List[FeatureType] or None (default) - Names of the column(s) of df which shall be loaded as cofactors. + cofactors : :obj:`List`[:class:`FeatureType`], optional + Names of the column(s) of dataframe which shall be loaded as cofactors. If None, all the columns from the input dataframe will be loaded as cofactors. + Default: None - Raises - ------ - :exc:`.LeaspyDataInputError` """ _check_cofactor_index(df) self._check_cofactor_index_is_consistent_with_data_index(df) @@ -163,6 +241,21 @@ def load_cofactors( self.individuals[subject_name].add_cofactors(subject_cofactors) def _check_cofactor_index_is_consistent_with_data_index(self, df: pd.DataFrame): + """ + Check that the index of the dataframe is consistent with the + index of the Data object. 
+ + Parameters + ---------- + df : :obj:`pandas.DataFrame` + The dataframe where the cofactors are stored. + + Raises + ------ + :exc:`.LeaspyDataInputError` + If the index of the dataframe is not consistent with the + index of the Data object. + """ if (cofactors_dtype_indices := pd.api.types.infer_dtype(df.index)) != ( internal_dtype_indices := pd.api.types.infer_dtype( self.iter_to_idx.values() @@ -174,6 +267,19 @@ def _check_cofactor_index_is_consistent_with_data_index(self, df: pd.DataFrame): ) def _check_no_individual_missing(self, df: pd.DataFrame): + """ + Check that the individuals in the Data object are present in the dataframe. + + Parameters + ---------- + df : :obj:`pandas.DataFrame` + The dataframe where the cofactors are stored. + + Raises + ------ + :exc:`.LeaspyDataInputError` + If some individuals are missing in the dataframe. + """ internal_indices = pd.Index(self.iter_to_idx.values()) if len(missing_individuals := internal_indices.difference(df.index)): raise LeaspyDataInputError( @@ -198,14 +304,20 @@ def from_csv_file( Parameters ---------- - path : str + path : :obj:`str` Path to the CSV file to load (with extension) - **kws - Keyword arguments that are sent to :class:`.CSVDataReader` - + data_type : :obj:`str` + Type of data to read. Can be 'visit' or 'event'. + pd_read_csv_kws : :obj:`dict` + Keyword arguments that are sent to :func:`pandas.read_csv` + facto_kws : :obj:`dict` + Keyword arguments + **df_reader_kws : + Keyword arguments that are sent to :class:`AbstractDataframeDataReader` to :func:`dataframe_data_reader_factory` Returns ------- - :class:`.Data` + :class:`Data`: + A Data object containing the data from the CSV file. 
""" # enforce ID to be interpreted as string as default (can be overwritten) pd_read_csv_kws = {"dtype": {"ID": str}, **pd_read_csv_kws} @@ -224,27 +336,32 @@ def to_dataframe( reset_index: bool = True, ) -> pd.DataFrame: """ - Convert the Data object to a :class:`pandas.DataFrame` + Convert the Data object to a :obj:`pandas.DataFrame` Parameters ---------- - cofactors : List[FeatureType], 'all', or None (default None) + cofactors : :obj:`List`[:class:`FeatureType`] or :obj:`str`, optional Cofactors to include in the DataFrame. If None (default), no cofactors are included. If "all", all the available cofactors are included. - reset_index : bool (default True) + Default: None + + reset_index : :obj:`bool`, optional Whether to reset index levels in output. + Default: True Returns ------- - :class:`pandas.DataFrame` + :obj:`pandas.DataFrame`: A DataFrame containing the individuals' ID, timepoints and associated observations (optional - and cofactors). Raises ------ :exc:`.LeaspyDataInputError` + If the Data object does not contain any cofactors. :exc:`.LeaspyTypeError` + If the cofactors argument is not of a valid type. """ cofactors_list = self._validate_cofactors_input(cofactors) df = pd.concat( @@ -267,6 +384,29 @@ def to_dataframe( def _validate_cofactors_input( self, cofactors: Optional[Union[List[FeatureType], str]] = None ) -> List[FeatureType]: + """ + Validate the cofactors input for the to_dataframe method. + + Parameters + ---------- + cofactors : :obj:`List`[:class:`FeatureType`] or :obj:`str`, optional + Cofactors to include in the DataFrame. + If None (default), no cofactors are included. + If "all", all the available cofactors are included. + Default: None + + Returns + ------- + :obj:`List`[:class:`FeatureType`]: + A list of the validated cofactors. + + Raises + ------ + :exc:`.LeaspyDataInputError` + If the Data object does not contain given cofactors. + :exc:`.LeaspyTypeError` + If the cofactors argument is not of a valid type. 
+ """ if cofactors is None: return [] if isinstance(cofactors, str): @@ -292,21 +432,39 @@ def from_dataframe( Parameters ---------- - df : :class:`pandas.DataFrame` + df : :obj:`pandas.DataFrame` Dataframe containing ID, TIME and features. + data_type : :obj:`str` + Type of data to read. Can be 'visit', 'event', 'joint' + factory_kws : :obj:`dict` + Keyword arguments that are sent to :func:`.dataframe_data_reader_factory` **kws - Keyword arguments that are sent to :class:`.DataframeDataReader` + Keyword arguments that are sent to :class:`DataframeDataReader` Returns ------- - :class:`.Data` + :class:`Data` """ reader = dataframe_data_reader_factory(data_type, **factory_kws) reader.read(df, **kws) return Data._from_reader(reader) @staticmethod - def _from_reader(reader): + def _from_reader(reader) -> Data: + """ + Create a Data object from a reader + + Parameters + ---------- + reader : :class:`AbstractDataframeDataReader` + Reader object containing the data + + Returns + ------- + :class:`Data` + A Data object containing the data from the reader. + + """ data = Data() data.individuals = reader.individuals data.iter_to_idx = reader.iter_to_idx @@ -333,23 +491,24 @@ def from_individual_values( Parameters ---------- - indices : List[IDType] + indices : :obj:`List`[:class:`IDType`] List of the individuals' unique ID - timepoints : List[List[float]] + timepoints : :obj:`List`[:obj:`List`[:obj:`float`]] For each individual ``i``, list of timepoints associated with the observations. The number of such timepoints is noted ``n_timepoints_i`` - values : List[array-like[float, 2D]] + values : :obj:`List`[:obj:`array-like`[:obj:`float`,:obj:`2D`]] For each individual ``i``, two-dimensional array-like object containing observed data points. Its expected shape is ``(n_timepoints_i, n_features)`` - headers : List[FeatureType] + headers : :obj:`List`[:class:`FeatureType`] Feature names. 
The number of features is noted ``n_features`` Returns ------- - :class:`.Data` + :class:`Data`: + A Data object containing the individuals and their data. """ # Longitudinal input check @@ -393,14 +552,15 @@ def from_individuals( Parameters ---------- - individuals : List[IndividualData] + individuals : :obj:`List`[:class:`IndividualData`] List of individuals - headers : List[FeatureType] + headers : :obj:`List`[:class:`FeatureType`] List of feature names Returns ------- - :class:`.Data` + :class:`Data`: + A Data object containing the individuals and their data. """ data = Data() @@ -428,6 +588,20 @@ def from_individuals( return data def extract_longitudinal_only(self) -> Data: + """ + Extract longitudinal data from the Data object + + Returns + ------- + :class:`Data`: + A Data object containing only longitudinal data. + + Raises + ------ + :exc:`.LeaspyDataInputError` + If the Data object does not contain any longitudinal data. + """ + if not self.headers: raise LeaspyDataInputError( "You can't extract longitudinal data from data that have none" @@ -444,6 +618,19 @@ def extract_longitudinal_only(self) -> Data: def _check_cofactor_index(df: pd.DataFrame): + """ + Check that the index of the dataframe is a valid index for cofactors + + Parameters + ---------- + df : :obj:`pandas.DataFrame` + The dataframe where the cofactors are stored. + + Raises + ------ + :exc:`.LeaspyDataInputError` + If the index of the dataframe is not a valid index for cofactors. + """ if not ( isinstance(df, pd.DataFrame) and isinstance(df.index, pd.Index)