# Source code for tempor.data.samples

"""Data handling for different data samples modalities supported by TemporAI."""

# pylint: disable=unnecessary-ellipsis

import abc
import contextlib
from typing import TYPE_CHECKING, Any, Generator, List, Optional, Tuple

import numpy as np
import pandas as pd
import pandera as pa
import pydantic
from packaging.version import Version
from typing_extensions import Self

import tempor.exc
from tempor.core import plugins, pydantic_utils
from tempor.log import log_helpers, logger

from . import data_typing, pandera_utils, utils
from .settings import DATA_SETTINGS


class DataSamples(plugins.Plugin, abc.ABC):
    """Abstract base class for all data samples classes.

    Declares the interface (constructors, converters, size accessors, indexing) that every concrete
    data samples class must implement, and provides validation, ``repr`` and ``len`` on top of it.
    """

    # The underlying data container. Concrete subclasses narrow the type.
    _data: Any

    @property
    @abc.abstractmethod
    def modality(self) -> data_typing.DataModality:  # pragma: no cover
        """Return the data modality enum corresponding to the class.

        Returns:
            data_typing.DataModality: The data modality enum.
        """
        ...

    def __init__(
        self,
        data: data_typing.DataContainer,  # pylint: disable=unused-argument
        **kwargs: Any,
    ) -> None:  # pragma: no cover
        """Initialize the plugin machinery and, unless explicitly skipped, validate the data.

        Args:
            data (data_typing.DataContainer):
                The data container.
            **kwargs (Any):
                Any additional keyword arguments. The internal flag ``_skip_validate=True`` disables validation.
        """
        plugins.Plugin.__init__(self)
        # For efficiency, `_skip_validate=True` is passed internally (e.g. in `__getitem__`) when there is no
        # need to validate. The *value* of the flag is honoured, so `_skip_validate=False` still validates
        # (a plain "key present" check would silently skip validation in that case).
        if not kwargs.get("_skip_validate", False):
            self.validate()

    def __repr__(self) -> str:
        """The `repr()` representation of the class.

        Returns:
            str: The class name followed by the representation of the data as a dataframe.
        """
        return f"{self.__class__.__name__} with data:\n{self.dataframe()}"

    def _repr_html_(self) -> str:
        """Return a HTML representation of the object, used in Jupyter notebooks.

        Returns:
            str: The HTML representation of the object.
        """
        repr_ = (
            # pylint: disable-next=protected-access
            f'<p><span style="font-family: monospace;">{self.__class__.__name__}</span> with data:</p>'
            f"{self.dataframe()._repr_html_()}"  # pyright: ignore
        )
        return repr_

    def validate(self) -> None:
        """Validate the data contained.

        Delegates to :meth:`_validate` and converts the validation errors it raises into a single
        exception type.

        Raises:
            tempor.exc.DataValidationException: Raised if data validation fails.
        """
        with log_helpers.exc_to_log():
            try:
                self._validate()
            except (
                pa.errors.SchemaError,  # pyright: ignore
                pa.errors.SchemaErrors,  # pyright: ignore
                ValueError,
                TypeError,
            ) as ex:
                # Chain the original error so that the underlying cause stays visible in the traceback.
                raise tempor.exc.DataValidationException(
                    "Data validation failed, see traceback for more details"
                ) from ex

    @abc.abstractmethod
    def _validate(self) -> None:  # pragma: no cover
        """Validate integrity of the data samples.

        Raise any of `ValueError`, `TypeError`, `pandera.errors.SchemaError`, `pandera.errors.SchemaErrors`
        (or exceptions derived from these) to indicate validation failure.
        """
        ...

    @staticmethod
    @abc.abstractmethod
    def from_numpy(
        array: np.ndarray,
        *,
        sample_index: Optional[data_typing.SampleIndex] = None,
        feature_index: Optional[data_typing.FeatureIndex] = None,
        **kwargs: Any,
    ) -> "DataSamples":  # pragma: no cover
        """Create :class:`DataSamples` from `numpy.ndarray`.

        Args:
            array (np.ndarray):
                The array that represents the data.
            sample_index (Optional[data_typing.SampleIndex], optional):
                List with sample (row) index for each sample. Optional, if `None`, will be of form
                ``[0, 1, ...]``. Defaults to `None`.
            feature_index (Optional[data_typing.FeatureIndex], optional):
                List with feature (column) index for each feature. Optional, if `None`, will be of form
                ``["feat_0", "feat_1", ...]``. Defaults to `None`.
            **kwargs (Any):
                Any additional keyword arguments.

        Returns:
            DataSamples: :class:`DataSamples` object from ``array``.
        """
        ...

    @staticmethod
    @abc.abstractmethod
    def from_dataframe(dataframe: pd.DataFrame, **kwargs: Any) -> "DataSamples":  # pragma: no cover
        """Create :class:`DataSamples` from `pandas.DataFrame`."""
        ...

    @abc.abstractmethod
    def numpy(self, **kwargs: Any) -> np.ndarray:  # pragma: no cover
        """Return `numpy.ndarray` representation of the data."""
        ...

    @abc.abstractmethod
    def dataframe(self, **kwargs: Any) -> pd.DataFrame:  # pragma: no cover
        """Return `pandas.DataFrame` representation of the data."""
        ...

    @property
    @abc.abstractmethod
    def num_samples(self) -> int:  # pragma: no cover
        """Return number of samples."""
        ...

    @abc.abstractmethod
    def sample_index(self) -> data_typing.SampleIndex:  # pragma: no cover
        """Return a list representing sample indexes."""
        ...

    def __len__(self) -> int:
        """The length, which is the number of samples.

        Returns:
            int: The number of samples.
        """
        return self.num_samples

    @property
    @abc.abstractmethod
    def num_features(self) -> int:  # pragma: no cover
        """Return number of features."""
        ...

    @abc.abstractmethod
    def short_repr(self) -> str:  # pragma: no cover
        """A short string representation of the object.

        Returns:
            str: The short string representation of the object.
        """
        ...

    @abc.abstractmethod
    def __getitem__(self, key: data_typing.GetItemKey) -> Self:  # pragma: no cover
        """Return a new subset :class:`DataSamples` object with the data indexed by the ``key``.

        Args:
            key (data_typing.GetItemKey): The key to index the data by.

        Returns:
            Self: A new subset :class:`DataSamples` object.
        """
        ...
class StaticSamplesBase(DataSamples):
    """Base class for data samples of the static modality."""

    @property
    def modality(self) -> data_typing.DataModality:
        """The modality of this kind of data samples, which is always ``STATIC``.

        Returns:
            data_typing.DataModality: The ``STATIC`` member of the data modality enum.
        """
        static_modality = data_typing.DataModality.STATIC
        return static_modality
class TimeSeriesSamplesBase(DataSamples):
    """Base class for data samples of the time series modality."""

    @property
    def modality(self) -> data_typing.DataModality:
        """The modality of this kind of data samples, which is always ``TIME_SERIES``.

        Returns:
            data_typing.DataModality: The ``TIME_SERIES`` member of the data modality enum.
        """
        time_series_modality = data_typing.DataModality.TIME_SERIES
        return time_series_modality

    @abc.abstractmethod
    def time_indexes(self) -> data_typing.TimeIndexList:  # pragma: no cover
        """Get the time index of every sample, as a list.

        Each time index is itself a list of time step elements.

        Returns:
            data_typing.TimeIndexList: One time index per sample.
        """
        ...

    @abc.abstractmethod
    def time_indexes_as_dict(self) -> data_typing.SampleToTimeIndexDict:  # pragma: no cover
        """Get the time index of every sample, keyed by sample index.

        Each time index is a list of time step elements.

        Returns:
            data_typing.SampleToTimeIndexDict: Dictionary mapping each sample index to its time index.
        """
        ...

    @abc.abstractmethod
    def time_indexes_float(self) -> List[np.ndarray]:  # pragma: no cover
        """Get the time indexes with their elements converted to `float` values.

        Date-time time index will be converted using :obj:`~tempor.data.utils.datetime_time_index_to_float`.

        Returns:
            List[np.ndarray]: List of 1D `numpy.ndarray` s of `float` values, corresponding to the time index.
        """
        ...

    @abc.abstractmethod
    def num_timesteps(self) -> List[int]:  # pragma: no cover
        """Get the number of timesteps of every sample, as a list.

        Returns:
            List[int]: The number of timesteps for each sample.
        """
        ...

    @abc.abstractmethod
    def num_timesteps_as_dict(self) -> data_typing.SampleToNumTimestepsDict:  # pragma: no cover
        """Get the number of timesteps of every sample, keyed by sample index.

        Returns:
            data_typing.SampleToNumTimestepsDict: Dictionary mapping each sample index to its number of timesteps.
        """
        ...

    @abc.abstractmethod
    def num_timesteps_equal(self) -> bool:  # pragma: no cover
        """Tell whether all samples share the same number of timesteps.

        Returns:
            bool: `True` if all samples share the same number of timesteps, `False` otherwise.
        """
        ...

    @abc.abstractmethod
    def list_of_dataframes(self) -> List[pd.DataFrame]:  # pragma: no cover
        """Get the data as a list with one dataframe per sample.

        Returns:
            List[pd.DataFrame]: List of dataframes for each sample.
        """
        ...
# Default suffix used to name the split-off time column of an event feature
# (``"<original column name><suffix>"``); the default of the `split*` methods of event samples.
_DEFAULT_EVENTS_TIME_FEATURE_SUFFIX = "_time"
class EventSamplesBase(DataSamples):
    """Base class for data samples of the event modality."""

    @property
    def modality(self) -> data_typing.DataModality:
        """The modality of this kind of data samples, which is always ``EVENT``.

        Returns:
            data_typing.DataModality: The ``EVENT`` member of the data modality enum.
        """
        event_modality = data_typing.DataModality.EVENT
        return event_modality

    @abc.abstractmethod
    def split(self, time_feature_suffix: str = _DEFAULT_EVENTS_TIME_FEATURE_SUFFIX) -> pd.DataFrame:
        """Return a `pandas.DataFrame` in which each event feature's time component has its own column.

        Every time column is named ``"<original column name><time_feature_suffix>"`` and is inserted right before
        its ``<original column name>`` column, which then holds only the event value.

        Args:
            time_feature_suffix (str, optional):
                A column name suffix string to identify the time columns that will be split off.
                Defaults to ``"_time"``.

        Returns:
            pd.DataFrame: The output dataframe.
        """
        ...

    @abc.abstractmethod
    def split_as_two_dataframes(
        self, time_feature_suffix: str = _DEFAULT_EVENTS_TIME_FEATURE_SUFFIX
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Like :func:`~tempor.data.samples.EventSamples.split`, but return two `pandas.DataFrame` s.

        - The first dataframe contains the event times of each feature.
        - The second dataframe contains the event values (`True`/`False`) of each feature.

        Args:
            time_feature_suffix (str, optional):
                A column name suffix string to identify the time columns that will be split off.
                Defaults to ``"_time"``.

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]:
                Two `pandas.DataFrame` s containing event times and values respectively.
        """
        ...
def _array_default_sample_index(array: np.ndarray) -> List[int]:
    """Build the default sample index ``[0, 1, ...]``, one entry per element of the first dimension of ``array``."""
    sample_count, *_ = array.shape
    return list(range(sample_count))


def _array_default_feature_index(array: np.ndarray) -> List[str]:
    """Build the default feature index ``["feat_0", "feat_1", ...]`` from the last dimension of ``array``."""
    *_, feature_count = array.shape
    return [f"feat_{position}" for position in range(feature_count)]


def _array_default_time_indexes(array: np.ndarray, padding_indicator: Any) -> List[List[int]]:
    """Build default time indexes ``[0, 1, ...]``, one per sample, as long as each sample's sequence length.

    The sequence lengths are obtained from `utils.get_seq_lengths_timeseries_array3d`.
    """
    seq_lengths = utils.get_seq_lengths_timeseries_array3d(array, padding_indicator)
    return [list(range(seq_length)) for seq_length in seq_lengths]


# Register one plugin category per data samples base class (all under the "dataformat" plugin type).
plugins.register_plugin_category("static_samples", StaticSamplesBase, plugin_type="dataformat")
plugins.register_plugin_category("time_series_samples", TimeSeriesSamplesBase, plugin_type="dataformat")
plugins.register_plugin_category("event_samples", EventSamplesBase, plugin_type="dataformat")
@plugins.register_plugin(name="static_samples_df", category="static_samples", plugin_type="dataformat")
class StaticSamples(StaticSamplesBase):
    """Static data samples stored in a `pandas.DataFrame` (rows are samples, columns are features)."""

    # The data, and the pandera schema produced by the last successful `_validate` call.
    _data: pd.DataFrame
    _schema: pa.DataFrameSchema

    @pydantic_utils.validate_arguments(config=pydantic.ConfigDict(arbitrary_types_allowed=True))
    def __init__(
        self,
        data: data_typing.DataContainer,
        *,
        sample_index: Optional[data_typing.SampleIndex] = None,
        feature_index: Optional[data_typing.FeatureIndex] = None,
        **kwargs: Any,
    ) -> None:
        """Create a :class:`StaticSamples` object from the ``data``.

        Args:
            data (data_typing.DataContainer):
                A container with the data.
            sample_index (Optional[data_typing.SampleIndex], optional):
                Used only if ``data`` is a `numpy.ndarray`. List with sample (row) index for each sample.
                Optional, if `None`, will be of form ``[0, 1, ...]``. Defaults to `None`.
            feature_index (Optional[data_typing.FeatureIndex], optional):
                Used only if ``data`` is a `numpy.ndarray`. List with feature (column) index for each feature.
                Optional, if `None`, will be of form ``["feat_0", "feat_1", ...]``. Defaults to `None`.
            **kwargs (Any):
                Any additional keyword arguments to pass to the constructor.

        Raises:
            ValueError: If ``data`` is neither a `pandas.DataFrame` nor a `numpy.ndarray`.
        """
        if isinstance(data, pd.DataFrame):
            self._data = data
        elif isinstance(data, np.ndarray):
            # The internal `_skip_validate` flag is meant for `DataSamples.__init__` only. Forwarding it to
            # `_array_to_df` would pass it on to `pd.DataFrame(...)`, which rejects unknown keyword arguments.
            df_kwargs = {name: value for name, value in kwargs.items() if name != "_skip_validate"}
            self._data = self._array_to_df(data, sample_index=sample_index, feature_index=feature_index, **df_kwargs)
        else:  # pragma: no cover
            # Prevented by pydantic check.
            raise ValueError(f"Data object {type(data)} not supported")
        super().__init__(data, **kwargs)

    def _validate(self) -> None:
        """Validate the dataframe in three stages: dataframe-level checks, values, then the sample index.

        The (possibly updated) dataframe returned by each schema validation is stored back in ``_data`` and the
        final schema is stored in ``_schema``.
        """
        schema = pandera_utils.init_schema(self._data, coerce=False)
        if TYPE_CHECKING:  # pragma: no cover
            assert isinstance(schema, pa.DataFrameSchema)  # nosec B101
        logger.debug(f"Inferred schema:\n{schema}")

        # DataFrame-level validation:
        schema = pandera_utils.add_df_checks(
            schema,
            checks_list=[
                pandera_utils.checks.forbid_multiindex_index,
                pandera_utils.checks.forbid_multiindex_columns,
                pandera_utils.checks.configurable.column_index_satisfies_dtype(
                    pandera_utils.UnionDtype[DATA_SETTINGS.feature_index_dtypes],  # type: ignore
                    nullable=DATA_SETTINGS.feature_index_nullable,
                ),
            ],
        )
        self._data = schema.validate(self._data)

        # Values validation:
        schema = pandera_utils.add_regex_column_checks(
            schema,
            regex=".*",
            dtype=pandera_utils.UnionDtype[DATA_SETTINGS.static_value_dtypes],  # type: ignore
            nullable=DATA_SETTINGS.static_values_nullable,
        )
        self._data = schema.validate(self._data)

        # Index validation:
        schema, data = pandera_utils.set_up_index(
            schema,
            self._data,
            dtype=pandera_utils.UnionDtype[DATA_SETTINGS.sample_index_dtypes],  # type: ignore
            name=DATA_SETTINGS.sample_index_name,
            nullable=DATA_SETTINGS.sample_index_nullable,
            coerce=False,
            unique=DATA_SETTINGS.sample_index_unique,
        )
        self._data = schema.validate(data)

        logger.debug(f"Final schema:\n{schema}")
        self._schema = schema

    @staticmethod
    def from_dataframe(
        dataframe: pd.DataFrame,
        **kwargs: Any,
    ) -> "StaticSamples":  # pyright: ignore
        """Create :class:`StaticSamples` from `pandas.DataFrame`.

        The rows represent samples, the columns represent features.

        Args:
            dataframe (pd.DataFrame): The dataframe that represents the data.
            **kwargs (Any): Any additional keyword arguments to pass to the constructor.

        Returns:
            StaticSamples: :class:`StaticSamples` object from ``dataframe``.
        """
        return StaticSamples(dataframe, **kwargs)

    @staticmethod
    def from_numpy(
        array: np.ndarray,
        *,
        sample_index: Optional[data_typing.SampleIndex] = None,
        feature_index: Optional[data_typing.FeatureIndex] = None,
        **kwargs: Any,
    ) -> "StaticSamples":  # pyright: ignore
        """Create :class:`StaticSamples` from `numpy.ndarray`.

        The 0th dimension represents samples, the 1st dimension represents features.

        Args:
            array (np.ndarray):
                The array with the data.
            sample_index (Optional[data_typing.SampleIndex], optional):
                Sample indices to assign. Defaults to None.
            feature_index (Optional[data_typing.FeatureIndex], optional):
                Feature indices to assign. Defaults to None.
            **kwargs (Any):
                Any additional keyword arguments to pass to the constructor.

        Returns:
            StaticSamples: :class:`StaticSamples` object created from the ``array``.
        """
        return StaticSamples(array, sample_index=sample_index, feature_index=feature_index, **kwargs)

    @staticmethod
    def _array_to_df(
        array: np.ndarray,
        *,
        sample_index: Optional[data_typing.SampleIndex] = None,
        feature_index: Optional[data_typing.FeatureIndex] = None,
        **kwargs: Any,
    ) -> pd.DataFrame:
        """Wrap ``array`` in a `pandas.DataFrame`, filling in default row / column indexes where not given.

        Any ``kwargs`` are passed on to the `pandas.DataFrame` constructor.
        """
        if sample_index is None:
            sample_index = _array_default_sample_index(array)  # pyright: ignore
        if feature_index is None:
            feature_index = _array_default_feature_index(array)
        return pd.DataFrame(data=array, index=sample_index, columns=feature_index, **kwargs)

    def numpy(self, **kwargs: Any) -> np.ndarray:
        """Return the data as a `numpy.ndarray`.

        Args:
            **kwargs (Any): Any additional keyword arguments. Currently unused.

        Returns:
            np.ndarray: The `numpy.ndarray`.
        """
        return self._data.to_numpy()

    def dataframe(self, **kwargs: Any) -> pd.DataFrame:
        """Return the data as a `pandas.DataFrame`.

        The internally stored dataframe itself is returned (no copy is made).

        Args:
            **kwargs (Any): Any additional keyword arguments. Currently unused.

        Returns:
            pd.DataFrame: The dataframe.
        """
        return self._data

    def sample_index(self) -> data_typing.SampleIndex:
        """Return a list representing sample indexes.

        Returns:
            data_typing.SampleIndex: Sample indexes.
        """
        return list(self._data.index)  # pyright: ignore

    @property
    def num_samples(self) -> int:
        """Return number of samples.

        Returns:
            int: Number of samples.
        """
        return self._data.shape[0]

    @property
    def num_features(self) -> int:
        """Return number of features.

        Returns:
            int: Number of features.
        """
        return self._data.shape[1]

    def short_repr(self) -> str:
        """A short string representation of the object.

        Returns:
            str: The short representation, of form ``"<class name>([<num samples>, <num features>])"``.
        """
        return f"{self.__class__.__name__}([{self.num_samples}, {self.num_features}])"

    def __getitem__(self, key: data_typing.GetItemKey) -> Self:
        """Return a new subset :class:`StaticSamples` object with the data indexed by the ``key``.

        The key selects rows (samples) by position. The subset is not re-validated.

        Args:
            key (data_typing.GetItemKey): The key to index the data by.

        Returns:
            Self: A new subset :class:`StaticSamples` object.
        """
        key_ = utils.ensure_pd_iloc_key_returns_df(key)
        return StaticSamples(  # type: ignore [return-value]
            self._data.iloc[key_, :],  # pyright: ignore
            _skip_validate=True,
        )
@contextlib.contextmanager
def workaround_pandera_pd2_1_0_multiindex_compatibility(schema: pa.DataFrameSchema, data: pd.DataFrame) -> Generator:
    """Work around a version compatibility issue between pandera and pandas 2.1.0.

    The issue is reported here: https://github.com/unionai-oss/pandera/issues/1328

    With the affected versions, multiindex uniqueness validation fails with an unexpected `ValueError`.
    Code run inside this context manager gets that error replaced with the `pandera.errors.SchemaError`
    that would be expected from pandera. Any other `ValueError` propagates unchanged.
    """
    unexpected_error_text = "Columns with duplicate values are not supported in stack"

    def problem_versions() -> bool:  # pragma: no cover
        installed_pandas = Version(pd.__version__)
        return installed_pandas >= Version("2.1.0")
        # TODO: When/if fixed in pandera, add the below condition:
        # and Version(pa.__version__) < Version("0.XX.YY")

    try:
        yield
    except ValueError as ex:
        is_known_issue = problem_versions() and (unexpected_error_text in str(ex))
        if not is_known_issue:  # pragma: no cover
            raise
        # "Manually" raise the error that is expected from pandera.
        cols = data.index.names  # pragma: no cover
        raise pa.errors.SchemaError(  # type: ignore [no-untyped-call]  # pragma: no cover
            schema=schema,
            data=data,
            message=f"columns {cols} not unique",
        )
[docs]@plugins.register_plugin(name="time_series_samples_df", category="time_series_samples", plugin_type="dataformat") class TimeSeriesSamples(TimeSeriesSamplesBase): _data: pd.DataFrame _schema: pa.DataFrameSchema @pydantic_utils.validate_arguments(config=pydantic.ConfigDict(arbitrary_types_allowed=True)) def __init__( self, data: data_typing.DataContainer, *, padding_indicator: Any = None, sample_index: Optional[data_typing.SampleIndex] = None, time_indexes: Optional[data_typing.TimeIndexList] = None, feature_index: Optional[data_typing.FeatureIndex] = None, **kwargs: Any, ) -> None: """Create a :class:`TimeSeriesSamples` object from the ``data``. If ``data`` is a `pandas.DataFrame`, this should be a 2-level multiindex (sample, timestep) dataframe. If ``data`` is a `numpy.ndarray`, this should be a 3D array, with dimensions ``(sample, timestep, feature)``. Optionally, padding values of ``padding_indicator`` can be set inside the array to pad out the length of arrays of different samples in case they differ. Padding needs to go at the end of the timesteps (dim 1). Padding must be the same across the feature dimension (dim 2) for each sample. Args: data (data_typing.DataContainer): A container with the data. padding_indicator (Any, optional): Padding indicator used in ``data`` to indicate padding. Defaults to `None`. sample_index (Optional[data_typing.SampleIndex], optional): Used only if ``data`` is a `numpy.ndarray`. List with sample (row) index for each sample. Optional, if `None`, will be of form ``[0, 1, ...]``. Defaults to `None`. time_indexes (Optional[data_typing.TimeIndexList], optional): Used only if ``data`` is a `numpy.ndarray`. List of lists containing timesteps for each sample (outer list should be the same length as dim 0 of `data`, inner list should contain as many elements as each sample has timesteps). Optional, if `None`, will be of form ``[[0, 1, ...], [0, 1, ...], ...]`` Defaults to `None`. 
feature_index (Optional[data_typing.FeatureIndex], optional): Used only if ``data`` is a `numpy.ndarray`. List with feature (column) index for each feature. Optional, if `None`, will be of form ``["feat_0", "feat_1", ...]``. Defaults to `None`. **kwargs (Any): Any additional keyword arguments to pass to the constructor. """ if isinstance(data, pd.DataFrame): self._data = data elif isinstance(data, np.ndarray): self._data = self._array_to_df( data, padding_indicator=padding_indicator, sample_index=sample_index, time_indexes=time_indexes, feature_index=feature_index, **kwargs, ) else: # pragma: no cover # Prevented by pydantic check. raise ValueError(f"Data object {type(data)} not supported") super().__init__(data, **kwargs) def _validate(self) -> None: schema = pandera_utils.init_schema(self._data, coerce=False) if TYPE_CHECKING: # pragma: no cover assert isinstance(schema, pa.DataFrameSchema) # nosec B101 logger.debug(f"Inferred schema:\n{schema}") # DataFrame-level validation: schema = pandera_utils.add_df_checks( schema, checks_list=[ pandera_utils.checks.forbid_multiindex_columns, pandera_utils.checks.require_2level_multiindex_index, pandera_utils.checks.configurable.column_index_satisfies_dtype( pandera_utils.UnionDtype[DATA_SETTINGS.feature_index_dtypes], # type: ignore nullable=DATA_SETTINGS.feature_index_nullable, ), ], ) self._data = schema.validate(self._data) # Values validation: schema = pandera_utils.add_regex_column_checks( schema, regex=".*", dtype=pandera_utils.UnionDtype[DATA_SETTINGS.time_series_value_dtypes], # type: ignore nullable=DATA_SETTINGS.time_series_values_nullable, ) self._data = schema.validate(self._data) # Index validation: if not (DATA_SETTINGS.sample_index_unique and DATA_SETTINGS.sample_timestep_index_unique): raise NotImplementedError("Only supported case: unique sample and unique timestep indexes") multiindex_unique_def = (DATA_SETTINGS.sample_index_name, DATA_SETTINGS.time_index_name) schema, data = 
pandera_utils.set_up_2level_multiindex( schema, self._data, dtypes=( pandera_utils.UnionDtype[DATA_SETTINGS.sample_index_dtypes], # type: ignore pandera_utils.UnionDtype[DATA_SETTINGS.time_index_dtypes], # type: ignore ), names=(DATA_SETTINGS.sample_index_name, DATA_SETTINGS.time_index_name), nullable=(DATA_SETTINGS.sample_index_nullable, DATA_SETTINGS.time_index_nullable), coerce=False, unique=multiindex_unique_def, ) with workaround_pandera_pd2_1_0_multiindex_compatibility(schema, data): self._data = schema.validate(data) logger.debug(f"Final schema:\n{schema}") self._schema = schema # TODO: # Possible additional validation checks: # - Ensure time index sorted ascending within each sample. # - Time index float / int expected non-negative values.
[docs] @staticmethod def from_dataframe( dataframe: pd.DataFrame, **kwargs: Any, ) -> "TimeSeriesSamples": # pyright: ignore """Create :class:`TimeSeriesSamples` from `pandas.DataFrame`. This row index of the dataframe should be a 2-level multiindex (sample, timestep). The columns should be the features. Args: dataframe (pd.DataFrame): The dataframe that contains the data. **kwargs (Any): Any additional keyword arguments to pass to the constructor. Returns: TimeSeriesSamples: The :class:`TimeSeriesSamples` object created from the ``dataframe``. """ return TimeSeriesSamples(dataframe, **kwargs)
[docs] @staticmethod def from_numpy( array: np.ndarray, *, padding_indicator: Any = None, sample_index: Optional[data_typing.SampleIndex] = None, time_indexes: Optional[data_typing.TimeIndexList] = None, feature_index: Optional[data_typing.FeatureIndex] = None, **kwargs: Any, ) -> "TimeSeriesSamples": # pyright: ignore """Create :class:`TimeSeriesSamples` from `numpy.ndarray`. This should be a 3D array, with dimensions ``(sample, timestep, feature)``. Optionally, padding values of ``padding_indicator`` can be set inside the array to pad out the length of arrays of different samples in case they differ. Padding needs to go at the end of the timesteps (dim 1). Padding must be the same across the feature dimension (dim 2) for each sample. Args: array (np.ndarray): The array that contains the data. padding_indicator (Any, optional): The padding indicator value. Defaults to `None`. sample_index (Optional[data_typing.SampleIndex], optional): Sample indexes as a list. Defaults to `None`. time_indexes (Optional[data_typing.TimeIndexList], optional): Time indexes as a list of list (that is, time indexes per sample). Defaults to `None`. feature_index (Optional[data_typing.FeatureIndex], optional): Feature indexes as a list. Defaults to `None`. **kwargs (Any): Any additional keyword arguments. Returns: TimeSeriesSamples: The :class:`TimeSeriesSamples` object created from the ``array``. """ return TimeSeriesSamples( array, padding_indicator=padding_indicator, sample_index=sample_index, time_indexes=time_indexes, feature_index=feature_index, **kwargs, )
@staticmethod def _array_to_df( # pylint: disable=unused-argument array: np.ndarray, *, padding_indicator: Any, sample_index: Optional[data_typing.SampleIndex] = None, time_indexes: Optional[data_typing.TimeIndexList] = None, feature_index: Optional[data_typing.FeatureIndex] = None, **kwargs: Any, ) -> pd.DataFrame: if sample_index is None: sample_index = _array_default_sample_index(array) # pyright: ignore if feature_index is None: feature_index = _array_default_feature_index(array) if time_indexes is None: time_indexes = _array_default_time_indexes(array, padding_indicator) if TYPE_CHECKING: # pragma: no cover assert sample_index is not None and feature_index is not None and time_indexes is not None # nosec B101 return utils.array3d_to_multiindex_timeseries_dataframe( array, sample_index=sample_index, time_indexes=time_indexes, feature_index=feature_index, padding_indicator=padding_indicator, )
[docs] def numpy(self, *, padding_indicator: Any = DATA_SETTINGS.default_padding_indicator, **kwargs: Any) -> np.ndarray: """Return the data as a `numpy.ndarray`. Args: padding_indicator (Any, optional): Padding indicator value. Defaults to `DATA_SETTINGS.default_padding_indicator`. **kwargs (Any): Any additional keyword arguments. Currently unused. Returns: np.ndarray: The `numpy.ndarray`. """ return utils.multiindex_timeseries_dataframe_to_array3d( df=self._data, padding_indicator=padding_indicator, max_timesteps=None )
[docs] def dataframe(self, **kwargs: Any) -> pd.DataFrame: """Return the data as a `pandas.DataFrame`. Args: **kwargs (Any): Any additional keyword arguments. Currently unused. Returns: pd.DataFrame: The `pandas.DataFrame`. """ return self._data
[docs] def sample_index(self) -> data_typing.SampleIndex: """Get a list containing sample indexes. Returns: data_typing.SampleIndex: A list containing sample indexes. """ return list(utils.get_df_index_level0_unique(self._data)) # pyright: ignore
[docs] def time_indexes(self) -> data_typing.TimeIndexList: """Get a list containing time indexes for each sample. Each time index is represented as a list of time step elements. Returns: data_typing.TimeIndexList: A list containing time indexes for each sample. """ return list(self.time_indexes_as_dict().values()) # pyright: ignore
[docs] def time_indexes_as_dict(self) -> data_typing.SampleToTimeIndexDict: """Get a dictionary mapping each sample index to its time index. Time index is represented as a list of time step elements. Returns: data_typing.SampleToTimeIndexDict: The dictionary mapping each sample index to its time index. """ multiindex = self._data.index if TYPE_CHECKING: # pragma: no cover assert isinstance(multiindex, pd.MultiIndex) # nosec B101 sample_index = self.sample_index() d = dict() for s in sample_index: time_index_locs = multiindex.get_locs([s, slice(None)]) d[s] = list(multiindex.get_level_values(1)[time_index_locs]) return d # type: ignore[return-value]
[docs] def time_indexes_float(self) -> List[np.ndarray]: """Return time indexes but converting their elements to `float` values. Date-time time index will be converted using :obj:`~tempor.data.utils.datetime_time_index_to_float`. Returns: List[np.ndarray]: List of 1D `numpy.ndarray` s of `float` values, corresponding to the time index. """ return [utils.datetime_time_index_to_float(ti) for ti in self.time_indexes()]
[docs] def num_timesteps(self) -> List[int]: """Get the number of timesteps for each sample. Returns: List[int]: List containing the number of timesteps for each sample. """ return [len(x) for x in self.time_indexes()]
[docs] def num_timesteps_as_dict(self) -> data_typing.SampleToNumTimestepsDict: """Get a dictionary mapping each sample index to its the number of timesteps. Returns: data_typing.SampleToNumTimestepsDict: List containing the number of timesteps for each sample. """ return {key: len(x) for key, x in self.time_indexes_as_dict().items()} # type: ignore
[docs] def num_timesteps_equal(self) -> bool: """Returns `True` if all samples share the same number of timesteps, `False` otherwise. Returns: bool: whether all samples share the same number of timesteps. """ timesteps = self.num_timesteps() return True if len(timesteps) == 0 else all([x == timesteps[0] for x in timesteps])
[docs] def list_of_dataframes(self) -> List[pd.DataFrame]: """Returns a list of dataframes where each dataframe has the data for each sample. Returns: List[pd.DataFrame]: List of dataframes for each sample. """ return utils.multiindex_timeseries_dataframe_to_list_of_dataframes(self._data)
@property def num_samples(self) -> int: """Return number of samples. Returns: int: Number of samples. """ sample_ids = utils.get_df_index_level0_unique(self._data) return len(sample_ids) @property def num_features(self) -> int: """Return number of features. Returns: int: Number of features. """ return self._data.shape[1]
[docs] def short_repr(self) -> str: """A short string representation of the object. Returns: str: The short representation. """ return f"{self.__class__.__name__}([{self.num_samples}, *, {self.num_features}])"
def __getitem__(self, key: data_typing.GetItemKey) -> Self:
    """Select a subset of samples and wrap it in a new :class:`TimeSeriesSamples` object.

    The ``key`` is applied to the unique level-0 (sample) index values of the data, and all rows that
    belong to the selected samples are kept.

    Args:
        key (data_typing.GetItemKey): The key to index the data by.

    Returns:
        Self: A new subset :class:`TimeSeriesSamples` object.
    """
    # Normalize the key first (presumably so that a scalar key still selects a dataframe rather than a
    # single element - see `utils.ensure_pd_iloc_key_returns_df`).
    normalized_key = utils.ensure_pd_iloc_key_returns_df(key)
    unique_sample_ids = utils.get_df_index_level0_unique(self._data)
    chosen_ids = list(unique_sample_ids[normalized_key])  # pyright: ignore
    # Keep every timestep (second index level) of the chosen samples.
    subset = self._data.loc[(chosen_ids, slice(None)), :]  # pyright: ignore
    # `_skip_validate` is passed so that the new object does not re-run validation in its constructor.
    return TimeSeriesSamples(subset, _skip_validate=True)  # type: ignore [return-value]
@plugins.register_plugin(name="event_samples_df", category="event_samples", plugin_type="dataformat")
class EventSamples(EventSamplesBase):
    """Event samples backed by a `pandas.DataFrame`.

    The dataframe has one row per sample and one column per feature. Each cell holds a two-element
    ``(time, value)`` item representing the event: element ``0`` is the event time and element ``1`` is
    the event value.
    """

    _data: pd.DataFrame
    _schema: pa.DataFrameSchema
    _schema_split: pa.DataFrameSchema

    @pydantic_utils.validate_arguments(config=pydantic.ConfigDict(arbitrary_types_allowed=True))
    def __init__(
        self,
        data: data_typing.DataContainer,
        *,
        sample_index: Optional[data_typing.SampleIndex] = None,
        feature_index: Optional[data_typing.FeatureIndex] = None,
        **kwargs: Any,
    ) -> None:
        """Create an :class:`EventSamples` object from the ``data``.

        Args:
            data (data_typing.DataContainer):
                A container with the data.
            sample_index (Optional[data_typing.SampleIndex], optional):
                Used only if ``data`` is a `numpy.ndarray`. List with sample (row) index for each sample.
                Optional, if `None`, will be of form ``[0, 1, ...]``. Defaults to `None`.
            feature_index (Optional[data_typing.FeatureIndex], optional):
                Used only if ``data`` is a `numpy.ndarray`. List with feature (column) index for each feature.
                Optional, if `None`, will be of form ``["feat_0", "feat_1", ...]``. Defaults to `None`.
            **kwargs (Any):
                Any additional keyword arguments. They are passed to the parent constructor and, when
                ``data`` is a `numpy.ndarray`, also to the `pandas.DataFrame` constructor (with the
                exception of the internal ``_skip_validate`` flag).

        Raises:
            ValueError: If ``data`` is neither a `pandas.DataFrame` nor a `numpy.ndarray`.
        """
        if isinstance(data, pd.DataFrame):
            self._data = data
        elif isinstance(data, np.ndarray):
            # `_skip_validate` is an internal flag that is only meaningful to the parent constructor.
            # `pandas.DataFrame` does not accept it, so it must not reach the dataframe construction.
            df_kwargs = {k: v for k, v in kwargs.items() if k != "_skip_validate"}
            self._data = self._array_to_df(data, sample_index=sample_index, feature_index=feature_index, **df_kwargs)
        else:  # pragma: no cover
            # Prevented by pydantic check.
            raise ValueError(f"Data object {type(data)} not supported")
        super().__init__(data, **kwargs)

    def _validate(self) -> None:
        """Validate ``self._data`` in stages and store the resulting schemas.

        The stages are: dataframe-level checks, per-cell checks, separate checks of the event time and
        event value components (on the dataframe returned by :meth:`split`), and finally the sample index
        checks. ``self._data`` is replaced by the validated dataframe after each stage that validates it.
        The final schema is stored in ``self._schema`` and the schema of the split dataframe in
        ``self._schema_split``.
        """
        schema = pandera_utils.init_schema(self._data, coerce=False)
        if TYPE_CHECKING:  # pragma: no cover
            assert isinstance(schema, pa.DataFrameSchema)  # nosec B101
        logger.debug(f"Inferred schema:\n{schema}")

        # DataFrame-level validation:
        schema = pandera_utils.add_df_checks(
            schema,
            checks_list=[
                pandera_utils.checks.forbid_multiindex_index,
                pandera_utils.checks.forbid_multiindex_columns,
                pandera_utils.checks.configurable.column_index_satisfies_dtype(
                    pandera_utils.UnionDtype[DATA_SETTINGS.feature_index_dtypes],  # type: ignore
                    nullable=DATA_SETTINGS.feature_index_nullable,
                ),
            ],
        )
        self._data = schema.validate(self._data)

        # Values validation:
        schema = pandera_utils.add_regex_column_checks(
            schema,
            regex=".*",
            dtype=None,
            nullable=DATA_SETTINGS.event_values_nullable,
            checks_list=[pandera_utils.checks.require_element_len_2],
        )
        self._data = schema.validate(self._data)

        # Validate event time and value components:
        suffix = _DEFAULT_EVENTS_TIME_FEATURE_SUFFIX
        data_split = self.split(time_feature_suffix=suffix)
        schema_split = pandera_utils.init_schema(data_split, coerce=False)
        schema_split = pandera_utils.add_regex_column_checks(
            schema_split,
            regex=f".*{suffix}$",  # Event time columns, end in "_time".
            dtype=pandera_utils.UnionDtype[DATA_SETTINGS.time_index_dtypes],  # type: ignore
            nullable=DATA_SETTINGS.time_index_nullable,
        )
        schema_split = pandera_utils.add_regex_column_checks(
            schema_split,
            regex=f"^((?!{suffix}$).)*$",  # Event value columns, do not end in "_time".
            dtype=pandera_utils.UnionDtype[DATA_SETTINGS.event_value_dtypes],  # type: ignore
            nullable=DATA_SETTINGS.event_values_nullable,
        )
        logger.debug(f"Time split-off schema (checks event time and values separately):\n{schema_split}")
        schema_split.validate(data_split)
        self._schema_split = schema_split

        # Index validation:
        schema, data = pandera_utils.set_up_index(
            schema,
            self._data,
            dtype=pandera_utils.UnionDtype[DATA_SETTINGS.sample_index_dtypes],  # type: ignore
            name=DATA_SETTINGS.sample_index_name,
            nullable=DATA_SETTINGS.sample_index_nullable,
            coerce=False,
            unique=DATA_SETTINGS.sample_index_unique,
        )
        self._data = schema.validate(data)

        logger.debug(f"Final schema:\n{schema}")
        self._schema = schema

    @staticmethod
    def from_dataframe(
        dataframe: pd.DataFrame,
        **kwargs: Any,
    ) -> "EventSamples":  # pyright: ignore
        """Create :class:`EventSamples` from `pandas.DataFrame`.

        The row index of the dataframe should be the sample indexes. The columns should be the features.
        Each feature should contain a tuple of ``(time, value)`` representing the event.

        Args:
            dataframe (pd.DataFrame): The dataframe that contains the data.
            **kwargs (Any): Any additional keyword arguments to pass to the constructor.

        Returns:
            EventSamples: The :class:`EventSamples` object created from the ``dataframe``.
        """
        return EventSamples(dataframe, **kwargs)

    @staticmethod
    def from_numpy(
        array: np.ndarray,
        *,
        sample_index: Optional[data_typing.SampleIndex] = None,
        feature_index: Optional[data_typing.FeatureIndex] = None,
        **kwargs: Any,
    ) -> "EventSamples":  # pyright: ignore
        """Create :class:`EventSamples` from `numpy.ndarray`.

        The array should be a 2D array, with dimensions ``(sample, feature)``.
        Each element should contain a tuple of ``(time, value)`` representing the event.

        Args:
            array (np.ndarray): The array that contains the data.
            sample_index (Optional[data_typing.SampleIndex], optional): Sample indexes. Defaults to `None`.
            feature_index (Optional[data_typing.FeatureIndex], optional): Feature index. Defaults to `None`.
            **kwargs (Any): Any additional keyword arguments to pass to the constructor.

        Returns:
            EventSamples: The :class:`EventSamples` object created from the ``array``.
        """
        return EventSamples(array, sample_index=sample_index, feature_index=feature_index, **kwargs)

    @staticmethod
    def _array_to_df(
        array: np.ndarray,
        *,
        sample_index: Optional[data_typing.SampleIndex] = None,
        feature_index: Optional[data_typing.FeatureIndex] = None,
        **kwargs: Any,
    ) -> pd.DataFrame:
        """Build the backing `pandas.DataFrame` from a `numpy.ndarray`.

        Args:
            array (np.ndarray): The array that contains the data.
            sample_index (Optional[data_typing.SampleIndex], optional):
                Row index to use. If `None`, a default one is generated from the ``array``. Defaults to `None`.
            feature_index (Optional[data_typing.FeatureIndex], optional):
                Column index to use. If `None`, a default one is generated from the ``array``.
                Defaults to `None`.
            **kwargs (Any): Any additional keyword arguments to pass to the `pandas.DataFrame` constructor.

        Returns:
            pd.DataFrame: The dataframe created from the ``array``.
        """
        if sample_index is None:
            sample_index = _array_default_sample_index(array)  # pyright: ignore
        if feature_index is None:
            feature_index = _array_default_feature_index(array)
        return pd.DataFrame(data=array, index=sample_index, columns=feature_index, **kwargs)

    def numpy(self, **kwargs: Any) -> np.ndarray:
        """Return the data as a `numpy.ndarray`.

        Args:
            **kwargs (Any): Any additional keyword arguments. Currently unused.

        Returns:
            np.ndarray: The `numpy.ndarray`.
        """
        # TODO: May want an option to return a scikit-survival -style array.
        return self._data.to_numpy()

    def dataframe(self, **kwargs: Any) -> pd.DataFrame:
        """Return the data as a `pandas.DataFrame`.

        Note that the underlying dataframe itself is returned, not a copy.

        Args:
            **kwargs (Any): Any additional keyword arguments. Currently unused.

        Returns:
            pd.DataFrame: The `pandas.DataFrame`.
        """
        return self._data

    def sample_index(self) -> data_typing.SampleIndex:
        """Return a list representing sample indexes.

        Returns:
            data_typing.SampleIndex: Sample indexes.
        """
        return list(self._data.index)  # pyright: ignore

    @property
    def num_samples(self) -> int:
        """Return number of samples.

        Returns:
            int: Number of samples.
        """
        return self._data.shape[0]

    @property
    def num_features(self) -> int:
        """Return number of features.

        Returns:
            int: Number of features.
        """
        return self._data.shape[1]

    @pydantic_utils.validate_arguments(config=pydantic.ConfigDict(arbitrary_types_allowed=True))
    def split(self, time_feature_suffix: str = _DEFAULT_EVENTS_TIME_FEATURE_SUFFIX) -> pd.DataFrame:
        """Return a `pandas.DataFrame` where the time component of each event feature has been split off to its own
        column.

        The new columns that contain the times will be named ``"<original column name><time_feature_suffix>"``
        and will be inserted before each corresponding ``<original column name>`` column.
        The ``<original column name>`` columns will contain only the event value.

        The object's own data is not modified; the split is performed on a copy.

        Args:
            time_feature_suffix (str, optional):
                A column name suffix string to identify the time columns that will be split off.
                Defaults to ``"_time"``.

        Raises:
            ValueError: If any existing column name already contains ``time_feature_suffix``.

        Returns:
            pd.DataFrame: The output dataframe.
        """
        df = self._data.copy()
        features = list(df.columns)
        # Reject clashing names up front, otherwise time and value columns could not be told apart later.
        if any(time_feature_suffix in str(c) for c in features):
            raise ValueError(f"Column names must not contain '{time_feature_suffix}'")
        # Each earlier iteration has already inserted one extra column, hence position `f_idx * 2` is
        # directly before the original column `f`.
        for f_idx, f in enumerate(features):
            df.insert(f_idx * 2, f"{f}{time_feature_suffix}", df[f].apply(lambda x: x[0]))
        for f in features:
            df[f] = df[f].apply(lambda x: x[1])
        return df

    def split_as_two_dataframes(
        self, time_feature_suffix: str = _DEFAULT_EVENTS_TIME_FEATURE_SUFFIX
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Analogous to :func:`~tempor.data.samples.EventSamples.split` but returns two `pandas.DataFrame` s:

        - first dataframe contains the event times of each feature.
        - second dataframe contains the event values (`True`/`False`) of each feature.

        Args:
            time_feature_suffix (str, optional):
                A column name suffix string to identify the time columns that will be split off.
                Defaults to ``"_time"``.

        Raises:
            ValueError: If any existing column name already contains ``time_feature_suffix``
                (raised by :func:`~tempor.data.samples.EventSamples.split`).

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]:
                Two `pandas.DataFrame` s containing event times and values respectively.
        """
        df_split = self.split(time_feature_suffix=time_feature_suffix)
        # Column labels are compared via `str(...)`, exactly like in `split`, so that non-string labels
        # (which `split` leaves untouched for the value columns) do not break the substring test.
        time_columns = [c for c in df_split.columns if time_feature_suffix in str(c)]
        value_columns = [c for c in df_split.columns if time_feature_suffix not in str(c)]
        df_event_times = df_split.loc[:, time_columns]
        df_event_values = df_split.loc[:, value_columns]
        return df_event_times, df_event_values

    def short_repr(self) -> str:
        """A short string representation of the object.

        Returns:
            str: The short representation.
        """
        return f"{self.__class__.__name__}([{self.num_samples}, {self.num_features}])"

    def __getitem__(self, key: data_typing.GetItemKey) -> Self:
        """Return a new subset :class:`EventSamples` object with the data indexed by the ``key``.

        Args:
            key (data_typing.GetItemKey): The key to index the data by.

        Returns:
            Self: A new subset :class:`EventSamples` object.
        """
        key_ = utils.ensure_pd_iloc_key_returns_df(key)
        # `_skip_validate` is passed so that the new object does not re-run validation in its constructor.
        return EventSamples(  # type: ignore [return-value]
            self._data.iloc[key_, :],  # pyright: ignore
            _skip_validate=True,
        )