Source code for tempor.datasources.prediction.one_off.plugin_google_stocks

"""Module with the Google stocks data source."""

import io
from pathlib import Path
from typing import Any

import numpy as np
import pandas as pd
import requests
from sklearn.preprocessing import MinMaxScaler

from tempor.core import plugins
from tempor.data import dataset
from tempor.datasources import datasource


[docs]@plugins.register_plugin(name="google_stocks", category="prediction.one_off", plugin_type="datasource") class GoogleStocksDataSource(datasource.OneOffPredictionDataSource): def __init__(self, seq_len: int = 10, **kwargs: Any) -> None: """Google stocks data source, as found here: https://raw.githubusercontent.com/PacktPublishing/Learning-Pandas-Second-Edition/master/data/goog.csv The next day's opening price is the target. The samples are the sequence chunks of length ``seq_len``. Args: seq_len (int, optional): The number of time steps to use. Defaults to ``10``. **kwargs (Any): Keyword arguments to be passed to the parent class. """ super().__init__(**kwargs) self.seq_len = seq_len self.df_path = Path(self.dataset_dir()) / "goog.csv"
[docs] @staticmethod def url() -> str: # noqa: D102 return "https://raw.githubusercontent.com/PacktPublishing/Learning-Pandas-Second-Edition/master/data/goog.csv"
[docs] @staticmethod def dataset_dir() -> str: # noqa: D102 return str(Path(GoogleStocksDataSource.data_root_dir) / "google_stocks")
[docs] def load(self, **kwargs: Any) -> dataset.OneOffPredictionDataset: # noqa: D102 # Load Google Data if not self.df_path.exists(): s = requests.get(self.url(), timeout=5).content df = pd.read_csv(io.StringIO(s.decode("utf-8"))) df.to_csv(self.df_path, index=False) else: df = pd.read_csv(self.df_path) # Flip the data to make chronological data df = pd.DataFrame(df.values[::-1], columns=df.columns) T = pd.to_datetime(df["Date"]).astype(np.int64).astype(np.float64) / 10**9 # pyright: ignore T = pd.Series(MinMaxScaler().fit_transform(T.values.reshape(-1, 1)).squeeze()) # pyright: ignore df = df.drop(columns=["Date"]) df = pd.DataFrame(MinMaxScaler().fit_transform(df), columns=df.columns) # Build dataset dataX = [] outcome = [] # Cut data by sequence length sample_idxs = [] for i in range(0, len(df) - self.seq_len - 1): df_seq = df.loc[i : i + self.seq_len - 1].copy() horizons = T.loc[i : i + self.seq_len - 1].copy() out = df["Open"].loc[i + self.seq_len].copy() df_seq["time_idx"] = horizons df_seq["sample_idx"] = str(i) dataX.append(df_seq) outcome.append(out) sample_idxs.append(str(i)) time_series_df = pd.concat(dataX, ignore_index=True) time_series_df.set_index(keys=["sample_idx", "time_idx"], drop=True, inplace=True) outcome_df = pd.DataFrame(outcome) outcome_df.index = sample_idxs # pyright: ignore outcome_df.columns = ["out"] time_series_df.sort_index(level=[0, 1], inplace=True) outcome_df.sort_index(inplace=True) return dataset.OneOffPredictionDataset( time_series=time_series_df, targets=outcome_df, sample_index="sample_idx", time_index="time_idx", )