# Source code for autogluon.timeseries.dataset.ts_dataframe
from __future__ import annotations
import itertools
from typing import Any, Tuple, Type, Optional
from collections.abc import Iterable
import pandas as pd
from pandas.core.internals import ArrayManager, BlockManager
# Names of the two index levels every ``TimeSeriesDataFrame`` is keyed by.
ITEMID = "item_id"
TIMESTAMP = "timestamp"


class TimeSeriesDataFrame(pd.DataFrame):
    """``TimeSeriesDataFrame`` s represent a collection of time series, where each row
    identifies the values of an (``item_id``, ``timestamp``) pair.

    For example, a time series data frame could represent the daily sales of a collection
    of products, where each ``item_id`` identifies a product and ``timestamp`` s correspond to
    the days.

    Parameters
    ----------
    data: Any
        Time-series data to construct a ``TimeSeriesDataFrame``. The class currently supports three
        input formats.

        1. Time-series data in Iterable format. For example::

                iterable_dataset = [
                    {"target": [0, 1, 2], "start": pd.Timestamp("01-01-2019", freq='D')},
                    {"target": [3, 4, 5], "start": pd.Timestamp("01-01-2019", freq='D')},
                    {"target": [6, 7, 8], "start": pd.Timestamp("01-01-2019", freq='D')}
                ]

        2. Time-series data in a pandas DataFrame format without multi-index. For example::

                   item_id  timestamp  target
                0        0 2019-01-01       0
                1        0 2019-01-02       1
                2        0 2019-01-03       2
                3        1 2019-01-01       3
                4        1 2019-01-02       4
                5        1 2019-01-03       5
                6        2 2019-01-01       6
                7        2 2019-01-02       7
                8        2 2019-01-03       8

        3. Time-series data in pandas DataFrame format with multi-index on item_id and timestamp. For example::

                                    target
                item_id timestamp
                0       2019-01-01       0
                        2019-01-02       1
                        2019-01-03       2
                1       2019-01-01       3
                        2019-01-02       4
                        2019-01-03       5
                2       2019-01-01       6
                        2019-01-02       7
                        2019-01-03       8

    Attributes
    ----------
    freq: str
        A pandas and gluon-ts compatible string describing the frequency of the time series. For example
        "D" is daily data, etc. Also see,
        https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
    num_items: int
        Number of items (time series) in the data set.
    """

    index: pd.MultiIndex

    def __init__(self, data: Any, *args, **kwargs):
        """Validate / convert ``data`` and delegate to ``pd.DataFrame.__init__``.

        Raises
        ------
        ValueError
            If ``data`` is neither a pandas data manager, a ``pd.DataFrame`` nor an iterable,
            or if it fails the validation for its input format.
        """
        if isinstance(data, (BlockManager, ArrayManager)):
            # necessary for copy constructor to work. see _constructor
            # and pandas.DataFrame
            pass
        elif isinstance(data, pd.DataFrame):
            if isinstance(data.index, pd.MultiIndex):
                self._validate_multi_index_data_frame(data)
            else:
                data = self._construct_pandas_frame_from_data_frame(data)
        elif isinstance(data, Iterable):
            data = self._construct_pandas_frame_from_iterable_dataset(data)
        else:
            raise ValueError(
                "Data input type not recognized, must be DataFrame or iterable."
            )
        # ``data`` is passed positionally: combining ``data=data`` with ``*args``
        # raises "got multiple values for argument 'data'" as soon as one
        # positional argument is forwarded.
        super().__init__(data, *args, **kwargs)

    @property
    def _constructor(self) -> Type["TimeSeriesDataFrame"]:
        # Tells pandas which class to use for results derived from this frame.
        return TimeSeriesDataFrame

    @property
    def freq(self):
        """Frequency of the time series as a pandas offset alias.

        The frequency is taken from (or inferred from) the timestamps present in the
        index; if that fails, the timestamps of the first item are used.

        Raises
        ------
        ValueError
            If the frequency is neither set nor inferable.
        """
        # ``levels`` keeps values that slicing removed from the frame, so prune
        # them before looking at the timestamps / picking the first item.
        index = self.index.remove_unused_levels()
        timestamps = index.levels[1]
        freq = timestamps.freq or timestamps.inferred_freq
        if freq is None and len(index.levels[0]) > 0:
            # fall back to freq of first item (whatever its id is)
            first_item = index.levels[0][0]
            first_item_timestamps = index.get_level_values(1)[
                index.get_level_values(0) == first_item
            ]
            freq = first_item_timestamps.freq or first_item_timestamps.inferred_freq
        if freq is None:
            raise ValueError("Frequency not provided and cannot be inferred")
        if isinstance(freq, str):
            return freq
        if isinstance(freq, pd.offsets.BaseOffset):
            return freq.freqstr
        return freq

    def iter_items(self) -> "Iterable[Any]":
        """Iterate over the item ids that are present in the data frame."""
        return iter(self.index.remove_unused_levels().levels[0])

    @property
    def num_items(self):
        """Number of items (time series) present in the data frame."""
        return len(self.index.remove_unused_levels().levels[0])

    @classmethod
    def _validate_iterable(cls, data: Iterable):
        """Check that ``data`` is a non-empty iterable of valid time-series dicts.

        Each element must be a dict with a ``target`` entry and a ``start`` entry
        holding a ``pd.Timestamp`` whose ``freq`` is set. Note that validating a
        one-shot iterator consumes it.

        Raises
        ------
        ValueError
            If any of the above conditions is violated.
        """
        if not isinstance(data, Iterable):
            raise ValueError("data must be of type Iterable.")

        is_empty = True
        # Single pass, so indices in the messages match positions in ``data``.
        for i, ts in enumerate(data):
            is_empty = False
            if not isinstance(ts, dict):
                raise ValueError(
                    f"{i}'th time-series in data must be a dict, got{type(ts)}"
                )
            if not ("target" in ts and "start" in ts):
                raise ValueError(
                    f"{i}'th time-series in data must have 'target' and 'start', got{ts.keys()}"
                )
            start = ts["start"]
            # ``getattr`` so that a Timestamp without a ``freq`` attribute is
            # reported through the ValueError below, not an AttributeError.
            if not isinstance(start, pd.Timestamp) or getattr(start, "freq", None) is None:
                raise ValueError(
                    f"{i}'th time-series must have timestamp as 'start' with freq specified, got {ts['start']}"
                )
        if is_empty:
            raise ValueError("data has no time-series.")

    @classmethod
    def _validate_data_frame(cls, df: pd.DataFrame):
        """Check that a flat ``pd.DataFrame`` can be converted to a ``TimeSeriesDataFrame``.

        Raises
        ------
        ValueError
            If ``df`` is not a ``pd.DataFrame``, lacks the item id / timestamp columns,
            has missing values in them, or the timestamp column is not ``datetime64[ns]``.
        """
        if not isinstance(df, pd.DataFrame):
            raise ValueError(f"data must be a pd.DataFrame, got {type(df)}")
        if ITEMID not in df.columns:
            raise ValueError(f"data must have a `{ITEMID}` column")
        if TIMESTAMP not in df.columns:
            raise ValueError(f"data must have a `{TIMESTAMP}` column")
        if df[ITEMID].isnull().any():
            raise ValueError(f"`{ITEMID}` column can not have nan")
        if df[TIMESTAMP].isnull().any():
            raise ValueError(f"`{TIMESTAMP}` column can not have nan")
        if not df[TIMESTAMP].dtype == "datetime64[ns]":
            raise ValueError(
                f"for {TIMESTAMP}, the only pandas dtype allowed is ‘datetime64[ns]’."
            )
        # TODO: check if time series are irregularly sampled. this check was removed as
        # TODO: pandas is inconsistent in identifying freq when period-end timestamps
        # TODO: are provided.

    @classmethod
    def _validate_multi_index_data_frame(cls, data: pd.DataFrame):
        """Validate a multi-index pd.DataFrame can be converted to TimeSeriesDataFrame

        Parameters
        ----------
        data: pd.DataFrame
            a data frame in pd.DataFrame format.

        Raises
        ------
        ValueError
            If ``data`` is not a ``pd.DataFrame`` with a ``pd.MultiIndex`` named
            (``item_id``, ``timestamp``) whose timestamp level is ``datetime64[ns]``.
        """
        if not isinstance(data, pd.DataFrame):
            raise ValueError(f"data must be a pd.DataFrame, got {type(data)}")
        if not isinstance(data.index, pd.MultiIndex):
            raise ValueError(f"data must have pd.MultiIndex, got {type(data.index)}")
        # Names are checked first: this guarantees exactly two levels, so looking
        # up the dtype of level 1 below cannot fail with an IndexError.
        if list(data.index.names) != [ITEMID, TIMESTAMP]:
            raise ValueError(
                f"data must have index names as ('{ITEMID}', '{TIMESTAMP}'), got {data.index.names}"
            )
        if not data.index.levels[1].dtype == "datetime64[ns]":
            raise ValueError(
                f"for {TIMESTAMP}, the only pandas dtype allowed is ‘datetime64[ns]’."
            )

    @classmethod
    def _construct_pandas_frame_from_iterable_dataset(
        cls, iterable_dataset: Iterable
    ) -> pd.DataFrame:
        """Build a multi-indexed ``pd.DataFrame`` with a ``target`` column from an
        iterable of time-series dicts. Items are numbered by their position."""
        if isinstance(iterable_dataset, Iterable):
            # Materialize once: validation and construction both iterate the data,
            # which would leave a one-shot iterator empty for the second pass.
            iterable_dataset = list(iterable_dataset)
        cls._validate_iterable(iterable_dataset)

        all_ts = []
        for i, ts in enumerate(iterable_dataset):
            start_timestamp = ts["start"]
            target = ts["target"]
            datetime_index = tuple(
                pd.date_range(
                    start_timestamp, periods=len(target), freq=start_timestamp.freq
                )
            )
            idx = pd.MultiIndex.from_product(
                [(i,), datetime_index], names=[ITEMID, TIMESTAMP]
            )
            all_ts.append(pd.Series(target, name="target", index=idx).to_frame())
        return pd.concat(all_ts)

    @classmethod
    def from_iterable_dataset(cls, iterable_dataset: Iterable) -> "TimeSeriesDataFrame":
        """Construct a ``TimeSeriesDataFrame`` from an Iterable of dictionaries each of which
        represent a single time series.

        This function also offers compatibility with GluonTS data sets, see
        https://ts.gluon.ai/_modules/gluonts/dataset/common.html#ListDataset.

        Parameters
        ----------
        iterable_dataset: Iterable
            An iterator over dictionaries, each with a ``target`` field specifying the value of the
            (univariate) time series, and a ``start`` field that features a pandas Timestamp
            with its ``freq`` set.
            Example::

                iterable_dataset = [
                    {"target": [0, 1, 2], "start": pd.Timestamp("01-01-2019", freq='D')},
                    {"target": [3, 4, 5], "start": pd.Timestamp("01-01-2019", freq='D')},
                    {"target": [6, 7, 8], "start": pd.Timestamp("01-01-2019", freq='D')}
                ]

        Returns
        -------
        ts_df: TimeSeriesDataFrame
            A data frame in TimeSeriesDataFrame format.
        """
        return cls(cls._construct_pandas_frame_from_iterable_dataset(iterable_dataset))

    @classmethod
    def _construct_pandas_frame_from_data_frame(
        cls,
        df: pd.DataFrame,
        id_column: Optional[str] = None,
        timestamp_column: Optional[str] = None,
    ) -> pd.DataFrame:
        """Return a copy of ``df`` indexed by (``item_id``, ``timestamp``), after optionally
        renaming ``id_column`` / ``timestamp_column`` to the standard names and validating."""
        df = df.copy()
        # The asserts (and hence AssertionError) are kept for backward compatibility.
        if id_column is not None:
            assert id_column in df.columns, f"Column {id_column} not found!"
            df.rename(columns={id_column: ITEMID}, inplace=True)
        if timestamp_column is not None:
            assert (
                timestamp_column in df.columns
            ), f"Column {timestamp_column} not found!"
            df.rename(columns={timestamp_column: TIMESTAMP}, inplace=True)
        cls._validate_data_frame(df)
        return df.set_index([ITEMID, TIMESTAMP])

    @classmethod
    def from_data_frame(
        cls,
        df: pd.DataFrame,
        id_column: Optional[str] = None,
        timestamp_column: Optional[str] = None,
    ) -> "TimeSeriesDataFrame":
        """Construct a ``TimeSeriesDataFrame`` from a pandas DataFrame.

        Parameters
        ----------
        df: pd.DataFrame
            A pd.DataFrame with 'item_id' and 'timestamp' as columns. For example:

            .. code-block::

                   item_id  timestamp  target
                0        0 2019-01-01       0
                1        0 2019-01-02       1
                2        0 2019-01-03       2
                3        1 2019-01-01       3
                4        1 2019-01-02       4
                5        1 2019-01-03       5
                6        2 2019-01-01       6
                7        2 2019-01-02       7
                8        2 2019-01-03       8
        id_column: str
            Name of the 'item_id' column if column name is different
        timestamp_column: str
            Name of the 'timestamp' column if column name is different

        Returns
        -------
        ts_df: TimeSeriesDataFrame
            A data frame in TimeSeriesDataFrame format.
        """
        return cls(
            cls._construct_pandas_frame_from_data_frame(
                df, id_column=id_column, timestamp_column=timestamp_column
            )
        )

    def split_by_time(
        self, cutoff_time: pd.Timestamp
    ) -> Tuple["TimeSeriesDataFrame", "TimeSeriesDataFrame"]:
        """Split dataframe to two different ``TimeSeriesDataFrame`` s before and after a certain
        ``cutoff_time``.

        Parameters
        ----------
        cutoff_time: pd.Timestamp
            The time to split the current data frame into two data frames.

        Returns
        -------
        data_before: TimeSeriesDataFrame
            Data frame containing time series before the ``cutoff_time`` (exclude ``cutoff_time``).
        data_after: TimeSeriesDataFrame
            Data frame containing time series after the ``cutoff_time`` (include ``cutoff_time``).
        """
        # Boolean masks on the timestamp level work regardless of whether the
        # index is lexsorted, which label-based MultiIndex slicing requires.
        frame = pd.DataFrame(self)
        timestamps = frame.index.get_level_values(1)
        data_before = frame.loc[timestamps < cutoff_time]
        data_after = frame.loc[timestamps >= cutoff_time]
        return TimeSeriesDataFrame(data_before), TimeSeriesDataFrame(data_after)

    def split_by_item(
        self, cutoff_item: int
    ) -> Tuple["TimeSeriesDataFrame", "TimeSeriesDataFrame"]:
        """Split dataframe to two data frames containing items before and after a ``cutoff_item``.

        Parameters
        ----------
        cutoff_item: int
            The item_id to split the current data frame into two data frames.

        Returns
        -------
        data_before: TimeSeriesDataFrame
            Data frame containing time-series before the ``cutoff_item`` (exclude ``cutoff_item``).
        data_after: TimeSeriesDataFrame
            Data frame containing time-series after the ``cutoff_item`` (include ``cutoff_item``).
        """
        # Comparing the item ids directly avoids the ``cutoff_item - 1`` arithmetic,
        # so any orderable id type works and the index need not be lexsorted.
        frame = pd.DataFrame(self)
        item_ids = frame.index.get_level_values(0)
        data_before = frame.loc[item_ids < cutoff_item]
        data_after = frame.loc[item_ids >= cutoff_item]
        return TimeSeriesDataFrame(data_before), TimeSeriesDataFrame(data_after)

    def slice_by_timestep(self, time_step_slice: slice) -> "TimeSeriesDataFrame":
        """Return a slice of time steps (with no regards to the actual timestamp) from within
        each item in a time series data frame. For example, if a data frame is constructed as::

            item_id  timestamp  target
                  0 2019-01-01       0
                  0 2019-01-02       1
                  0 2019-01-03       2
                  1 2019-01-02       3
                  1 2019-01-03       4
                  1 2019-01-04       5
                  2 2019-01-03       6
                  2 2019-01-04       7
                  2 2019-01-05       8

        then :code:`df.slice_by_timestep(time_step_slice=slice(-2, None))` would return the last two
        time steps from each item::

            item_id  timestamp  target
                  0 2019-01-02       1
                  0 2019-01-03       2
                  1 2019-01-03       4
                  1 2019-01-04       5
                  2 2019-01-04       7
                  2 2019-01-05       8

        Note that this function returns a copy of the original data. This function is useful for
        constructing holdout sets for validation.

        Parameters
        ----------
        time_step_slice: slice
            A python slice object representing the slices to return from each item

        Returns
        -------
        ts_df: TimeSeriesDataFrame
            Data frame containing only the time steps of each ``item_id`` sliced according to the
            input ``time_step_slice``.
        """
        frame = pd.DataFrame(self)
        # Grouping only visits items that actually have rows (``index.levels`` may
        # still list items removed by earlier slicing) and each group keeps its
        # (item_id, timestamp) index, so no index needs to be rebuilt.
        slices = [
            item_frame.iloc[time_step_slice]
            for _, item_frame in frame.groupby(level=0, sort=True)
        ]
        if not slices:
            # ``pd.concat`` rejects an empty list; an empty frame slices to itself.
            return self.__class__(frame.iloc[:0])
        return self.__class__(pd.concat(slices))

    def subsequence(
        self, start: pd.Timestamp, end: pd.Timestamp
    ) -> "TimeSeriesDataFrame":
        """Extract time-series between start (inclusive) and end (exclusive) time.

        Parameters
        ----------
        start: pd.Timestamp
            The start time (inclusive) of a time range that will be used for subsequence.
        end: pd.Timestamp
            The end time (exclusive) of a time range that will be used for subsequence.

        Returns
        -------
        ts_df: TimeSeriesDataFrame
            A new data frame in ``TimeSeriesDataFrame`` format contains time-series in a time range
            defined between start and end time.

        Raises
        ------
        ValueError
            If ``end`` is earlier than ``start``.
        """
        if end < start:
            raise ValueError(f"end time {end} is earlier than start time {start}")

        frame = pd.DataFrame(self)
        timestamps = frame.index.get_level_values(1)
        in_range = (timestamps >= start) & (timestamps < end)
        return TimeSeriesDataFrame(frame.loc[in_range])

    @classmethod
    def from_pickle(cls, filepath_or_buffer: Any) -> "TimeSeriesDataFrame":
        """Convenience method to read pickled time series data frames. If the read pickle
        file refers to a plain pandas DataFrame, it will be cast to a TimeSeriesDataFrame.

        Parameters
        ----------
        filepath_or_buffer: Any
            Filename provided as a string or an ``IOBuffer`` containing the pickled object.

        Returns
        -------
        ts_df: TimeSeriesDataFrame
            The pickled time series data frame.

        Raises
        ------
        IOError
            If the file cannot be read or its content cannot be converted.
        """
        # NOTE(security): unpickling can execute arbitrary code; only load pickles
        # that come from a trusted source.
        try:
            data = pd.read_pickle(filepath_or_buffer)
            return data if isinstance(data, cls) else cls(data)
        except Exception as err:  # noqa
            # Chain the original exception so the root cause stays in the traceback.
            raise IOError(
                f"Could not load pickled data set due to error: {str(err)}"
            ) from err