Source code for srvar.data.dataset
from __future__ import annotations
from collections.abc import Iterable, Sequence
from dataclasses import dataclass
import numpy as np
import pandas as pd
[docs]
@dataclass(frozen=True, slots=True)
class Dataset:
"""A lightweight container for multivariate time series data.
The library consistently represents a dataset as a matrix ``values`` with shape
``(T, N)``, where:
- ``T`` is the number of time points (observations)
- ``N`` is the number of variables (series)
Parameters
----------
time_index:
Time index for the observations. Can be a :class:`pandas.Index` (e.g. a
:class:`pandas.DatetimeIndex`) or anything coercible to one.
variables:
Variable names of length ``N``.
values:
Numeric array of shape ``(T, N)``.
Notes
-----
The class is immutable (``frozen=True``) and performs validation in
:meth:`~Dataset.__post_init__`.
"""
time_index: pd.Index
variables: list[str]
values: np.ndarray
[docs]
@staticmethod
def from_arrays(
*,
values: np.ndarray,
variables: Sequence[str],
time_index: Iterable[object] | pd.Index | None = None,
) -> Dataset:
"""Construct a :class:`~srvar.data.dataset.Dataset` from array-like inputs.
Parameters
----------
values:
A numeric array of shape ``(T, N)``.
variables:
Sequence of variable names of length ``N``.
time_index:
Optional time index. If omitted, a :class:`pandas.RangeIndex` with
``start=0`` is used.
Returns
-------
Dataset
Validated dataset instance.
Raises
------
ValueError
If shapes are inconsistent (e.g. ``len(variables) != values.shape[1]``)
or if ``values`` is not two-dimensional.
"""
x = np.asarray(values, dtype=float)
if x.ndim != 2:
raise ValueError("values must be a 2D array of shape (T, N)")
vars_list = list(variables)
if len(vars_list) != x.shape[1]:
raise ValueError("len(variables) must equal values.shape[1]")
if time_index is None:
idx = pd.RangeIndex(start=0, stop=x.shape[0], step=1)
else:
idx = time_index if isinstance(time_index, pd.Index) else pd.Index(list(time_index))
if len(idx) != x.shape[0]:
raise ValueError("len(time_index) must equal values.shape[0]")
return Dataset(time_index=idx, variables=vars_list, values=x)
def __post_init__(self) -> None:
x = np.asarray(self.values, dtype=float)
if x.ndim != 2:
raise ValueError("values must be a 2D array of shape (T, N)")
if len(self.variables) != x.shape[1]:
raise ValueError("len(variables) must equal values.shape[1]")
if len(self.time_index) != x.shape[0]:
raise ValueError("len(time_index) must equal values.shape[0]")
object.__setattr__(self, "values", x)
if not isinstance(self.time_index, pd.Index):
object.__setattr__(self, "time_index", pd.Index(self.time_index))
@property
def T(self) -> int:
"""Number of time points (rows) in the dataset."""
return int(self.values.shape[0])
@property
def N(self) -> int:
"""Number of variables (columns) in the dataset."""
return int(self.values.shape[1])