Source code for tempor.models.clairvoyance2.datasets.dummy

# mypy: ignore-errors

import math
from typing import Any, Optional, Sequence

import numpy as np

from ..data import Dataset, StaticSamples, TimeSeriesSamples


class DummyDatasetGenerator:
    """Generate small synthetic datasets for tests and examples.

    A generated dataset always has continuous temporal covariates and may
    additionally have static covariates, temporal targets, and temporal
    treatments. Each optional component is produced only when its
    ``*_n_features`` setting is greater than zero.

    Temporal targets/treatments are categorical (integers in
    ``[0, n_categories)``) when the matching ``*_n_categories`` is not
    ``None`` and continuous otherwise. They reuse the per-sample lengths of
    the temporal covariates and never contain missing values.

    NOTE: every component is generated from a fresh generator seeded with the
    same ``random_seed``. As a consequence, targets and treatments configured
    identically (same kind, feature count, and category count) contain
    identical values.
    """

    def __init__(
        self,
        n_samples: int,
        temporal_covariates_n_features: int,
        temporal_covariates_max_len: int,
        temporal_covariates_missing_prob: float,
        static_covariates_n_features: int,
        static_covariates_missing_prob: float,
        temporal_targets_n_features: int,
        temporal_targets_n_categories: Optional[int],
        temporal_treatments_n_features: int,
        temporal_treatments_n_categories: Optional[int],
    ) -> None:
        """Store the generation settings; no data is generated here.

        Args:
            n_samples: Number of samples in each generated component.
            temporal_covariates_n_features: Feature count of the temporal covariates.
            temporal_covariates_max_len: Maximum number of timesteps per sample.
            temporal_covariates_missing_prob: Per-element probability of replacing a
                temporal covariate value with NaN.
            static_covariates_n_features: Feature count of the static covariates;
                ``0`` disables them.
            static_covariates_missing_prob: Per-element probability of replacing a
                static covariate value with NaN.
            temporal_targets_n_features: Feature count of the temporal targets;
                ``0`` disables them.
            temporal_targets_n_categories: Number of categories for categorical
                targets, or ``None`` for continuous targets.
            temporal_treatments_n_features: Feature count of the temporal treatments;
                ``0`` disables them.
            temporal_treatments_n_categories: Number of categories for categorical
                treatments, or ``None`` for continuous treatments.
        """
        self.n_samples = n_samples
        self.temporal_covariates_n_features = temporal_covariates_n_features
        self.temporal_covariates_max_len = temporal_covariates_max_len
        self.temporal_covariates_missing_prob = temporal_covariates_missing_prob
        self.static_covariates_n_features = static_covariates_n_features
        self.static_covariates_missing_prob = static_covariates_missing_prob
        self.temporal_targets_n_features = temporal_targets_n_features
        self.temporal_targets_n_categories = temporal_targets_n_categories
        self.temporal_treatments_n_features = temporal_treatments_n_features
        self.temporal_treatments_n_categories = temporal_treatments_n_categories

    @staticmethod
    def _random_lengths(rng, max_len, n_samples) -> np.ndarray:
        """Draw ``n_samples`` random sequence lengths, none exceeding ``max_len``.

        Lengths are integers in ``[ceil(max_len / 2) + 1, max_len]``. When
        ``max_len`` is 1 that interval is empty, so every length is 1.

        Args:
            rng: A ``numpy.random.Generator``.
            max_len: Largest allowed length; must be at least 1.
            n_samples: Number of lengths to draw.

        Returns:
            Integer array of shape ``(n_samples,)``.

        Raises:
            ValueError: If ``max_len`` is smaller than 1.
        """
        if max_len < 1:
            raise ValueError(f"`max_len` must be at least 1, got {max_len}")
        low = math.ceil(max_len / 2)
        if low >= max_len:
            # Happens for max_len == 1: `rng.integers` rejects low >= high, and
            # the only admissible length is `max_len` itself.
            return np.full((n_samples,), max_len, dtype=np.int64)
        # `high` is exclusive, so the draw is in [low, max_len - 1]; the +1
        # shifts it so that `max_len` itself is attainable.
        return rng.integers(low=low, high=max_len, size=(n_samples,)) + 1

    @staticmethod
    def _apply_missing(array: np.ndarray, rng, missing_prob: float) -> np.ndarray:
        """Return a copy of ``array`` with random elements replaced by NaN.

        Each element is replaced independently when a uniform draw from
        ``[0, 1)`` is below ``missing_prob``. The input array is not modified.
        """
        mask = rng.uniform(low=0.0, high=1.0, size=array.shape) < missing_prob
        result = array.copy()
        result[mask] = np.nan
        return result

    @staticmethod
    def _generate_temporal_continuous(
        random_seed: int,
        max_len: int,
        n_samples: int,
        n_features: int,
        missing_prob: float,
        lengths: Optional[Sequence[int]] = None,
    ) -> TimeSeriesSamples:
        """Generate continuous time series: a linear trend plus Gaussian noise.

        Every feature of a sample follows the trend ``0, 1, 2, ...`` over its
        timesteps, offset by noise whose per-feature mean and scale are drawn
        once and shared across samples. Values are then randomly set to NaN
        with probability ``missing_prob``.

        Args:
            random_seed: Seed for the random generator.
            max_len: Maximum sample length; only used when ``lengths`` is ``None``.
            n_samples: Number of samples; only used when ``lengths`` is ``None``.
            n_features: Number of features per timestep.
            missing_prob: Per-element probability of a NaN.
            lengths: Explicit per-sample lengths. When given, one sample is
                generated per entry instead of drawing random lengths.

        Returns:
            ``TimeSeriesSamples`` built from one ``(length, n_features)`` array
            per sample.
        """
        rng = np.random.default_rng(seed=random_seed)
        sample_lengths: Any = (
            DummyDatasetGenerator._random_lengths(rng, max_len, n_samples) if lengths is None else lengths
        )

        noise_mus = rng.standard_normal(size=(n_features,))
        noise_sigmas = 1.0 + rng.standard_normal(size=(n_features,))

        samples = []
        for sample_len in sample_lengths:
            noise = noise_mus + noise_sigmas * rng.standard_normal(size=(sample_len, n_features))
            # Column vector 0..sample_len-1 broadcasts across all features.
            values = np.arange(sample_len)[:, np.newaxis] + noise
            samples.append(DummyDatasetGenerator._apply_missing(values, rng, missing_prob))

        return TimeSeriesSamples(data=samples, sample_indices=None)

    @staticmethod
    def _generate_temporal_categorical(
        random_seed: int,
        max_len: int,
        n_samples: int,
        n_features: int,
        n_categories: int,
        lengths: Optional[Sequence[int]] = None,
    ) -> TimeSeriesSamples:
        """Generate categorical time series of uniformly random integer codes.

        Args:
            random_seed: Seed for the random generator.
            max_len: Maximum sample length; only used when ``lengths`` is ``None``.
            n_samples: Number of samples; only used when ``lengths`` is ``None``.
            n_features: Number of features per timestep.
            n_categories: Codes are drawn from ``[0, n_categories)``.
            lengths: Explicit per-sample lengths. When given, one sample is
                generated per entry instead of drawing random lengths.

        Returns:
            ``TimeSeriesSamples`` built from one ``(length, n_features)``
            integer array per sample.
        """
        rng = np.random.default_rng(seed=random_seed)
        sample_lengths: Any = (
            DummyDatasetGenerator._random_lengths(rng, max_len, n_samples) if lengths is None else lengths
        )
        samples = [
            rng.integers(low=0, high=n_categories, size=(sample_len, n_features)) for sample_len in sample_lengths
        ]
        return TimeSeriesSamples(data=samples, sample_indices=None)

    @staticmethod
    def _generate_static_continuous(
        random_seed: int,
        n_samples: int,
        n_features: int,
        missing_prob: float,
    ) -> StaticSamples:
        """Generate static continuous features with per-feature mean and scale.

        Per-feature means are drawn around 2.0; scales are half a standard
        normal draw. Values are then randomly set to NaN with probability
        ``missing_prob``.

        Returns:
            ``StaticSamples`` built from an ``(n_samples, n_features)`` array.
        """
        rng = np.random.default_rng(seed=random_seed)
        mus = 2.0 + rng.standard_normal(size=(n_features,))
        sigmas = 0.5 * rng.standard_normal(size=(n_features,))
        values = mus + sigmas * rng.standard_normal(size=(n_samples, n_features))
        return StaticSamples(data=DummyDatasetGenerator._apply_missing(values, rng, missing_prob))

    def _generate_aligned_temporal(
        self,
        random_seed: int,
        n_features: int,
        n_categories: Optional[int],
        temporal_covariates: TimeSeriesSamples,
    ) -> Optional[TimeSeriesSamples]:
        """Generate targets/treatments with the same lengths as the covariates.

        Returns ``None`` when ``n_features`` is not positive, categorical
        series when ``n_categories`` is given, and continuous series without
        missing values otherwise.
        """
        if n_features <= 0:
            return None

        # `max_len=-1` is a placeholder: it is ignored because explicit
        # lengths are supplied.
        # NOTE(review): `n_timesteps_per_sample` is presumably the per-sample
        # length sequence of the covariates; confirm against `TimeSeriesSamples`.
        if n_categories is not None:
            return self._generate_temporal_categorical(
                random_seed,
                max_len=-1,
                n_samples=self.n_samples,
                n_features=n_features,
                n_categories=n_categories,
                lengths=temporal_covariates.n_timesteps_per_sample,
            )
        return self._generate_temporal_continuous(
            random_seed,
            max_len=-1,
            n_samples=self.n_samples,
            n_features=n_features,
            missing_prob=0.0,
            lengths=temporal_covariates.n_timesteps_per_sample,
        )

    def generate(self, random_seed: int) -> Dataset:
        """Generate a dataset according to the stored settings.

        Args:
            random_seed: Seed used for every component, making the result
                reproducible.

        Returns:
            A ``Dataset`` with temporal covariates and, where enabled, static
            covariates, temporal targets, and temporal treatments (disabled
            components are ``None``).
        """
        temporal_covariates = self._generate_temporal_continuous(
            random_seed,
            max_len=self.temporal_covariates_max_len,
            n_samples=self.n_samples,
            n_features=self.temporal_covariates_n_features,
            missing_prob=self.temporal_covariates_missing_prob,
        )

        static_covariates = None
        if self.static_covariates_n_features > 0:
            static_covariates = self._generate_static_continuous(
                random_seed,
                n_samples=self.n_samples,
                n_features=self.static_covariates_n_features,
                missing_prob=self.static_covariates_missing_prob,
            )

        temporal_targets = self._generate_aligned_temporal(
            random_seed,
            n_features=self.temporal_targets_n_features,
            n_categories=self.temporal_targets_n_categories,
            temporal_covariates=temporal_covariates,
        )
        temporal_treatments = self._generate_aligned_temporal(
            random_seed,
            n_features=self.temporal_treatments_n_features,
            n_categories=self.temporal_treatments_n_categories,
            temporal_covariates=temporal_covariates,
        )

        return Dataset(
            temporal_covariates=temporal_covariates,
            static_covariates=static_covariates,
            temporal_targets=temporal_targets,
            temporal_treatments=temporal_treatments,
        )
def dummy_dataset(
    n_samples: int = 100,
    temporal_covariates_n_features: int = 5,
    temporal_covariates_max_len: int = 20,
    temporal_covariates_missing_prob: float = 0.1,
    static_covariates_n_features: int = 0,
    static_covariates_missing_prob: float = 0.0,
    temporal_targets_n_features: int = 0,
    temporal_targets_n_categories: Optional[int] = None,
    temporal_treatments_n_features: int = 0,
    temporal_treatments_n_categories: Optional[int] = None,
    random_seed: int = 12345,
) -> Dataset:
    """Build a synthetic dataset in a single call.

    Convenience wrapper: every argument except ``random_seed`` is forwarded
    unchanged to ``DummyDatasetGenerator``, whose ``generate`` method is then
    invoked with ``random_seed``.

    Returns:
        The ``Dataset`` produced by ``DummyDatasetGenerator.generate``.
    """
    generator_settings = {
        "n_samples": n_samples,
        "temporal_covariates_n_features": temporal_covariates_n_features,
        "temporal_covariates_max_len": temporal_covariates_max_len,
        "temporal_covariates_missing_prob": temporal_covariates_missing_prob,
        "static_covariates_n_features": static_covariates_n_features,
        "static_covariates_missing_prob": static_covariates_missing_prob,
        "temporal_targets_n_features": temporal_targets_n_features,
        "temporal_targets_n_categories": temporal_targets_n_categories,
        "temporal_treatments_n_features": temporal_treatments_n_features,
        "temporal_treatments_n_categories": temporal_treatments_n_categories,
    }
    return DummyDatasetGenerator(**generator_settings).generate(random_seed=random_seed)