Source code for autogluon.timeseries.dataset.ts_dataframe

from __future__ import annotations

import itertools
from typing import Any, Tuple, Type, Optional
from collections.abc import Iterable

import pandas as pd
from pandas.core.internals import ArrayManager, BlockManager

ITEMID = "item_id"
TIMESTAMP = "timestamp"


[docs]class TimeSeriesDataFrame(pd.DataFrame):
    """``TimeSeriesDataFrame`` s represent a collection of time series, where each row
    identifies the values of an (``item_id``, ``timestamp``) pair.

    For example, a time series data frame could represent the daily sales of a collection
    of products, where each ``item_id`` identifies a product and ``timestamp`` s correspond to
    the days.

    Parameters
    ----------
    data: Any
        Time-series data to construct a ``TimeSeriesDataFrame``. The class currently supports three
        input formats.

        1. Time-series data in Iterable format. For example::

                iterable_dataset = [
                    {"target": [0, 1, 2], "start": pd.Timestamp("01-01-2019", freq='D')},
                    {"target": [3, 4, 5], "start": pd.Timestamp("01-01-2019", freq='D')},
                    {"target": [6, 7, 8], "start": pd.Timestamp("01-01-2019", freq='D')}
                ]

        2. Time-series data in a pandas DataFrame format without multi-index. For example::

                   item_id  timestamp  target
                0        0 2019-01-01       0
                1        0 2019-01-02       1
                2        0 2019-01-03       2
                3        1 2019-01-01       3
                4        1 2019-01-02       4
                5        1 2019-01-03       5
                6        2 2019-01-01       6
                7        2 2019-01-02       7
                8        2 2019-01-03       8

        3. Time-series data in pandas DataFrame format with multi-index on item_id and timestamp. For example::

                                        target
                item_id timestamp
                0       2019-01-01       0
                        2019-01-02       1
                        2019-01-03       2
                1       2019-01-01       3
                        2019-01-02       4
                        2019-01-03       5
                2       2019-01-01       6
                        2019-01-02       7
                        2019-01-03       8

    Attributes
    ----------
    freq: str
        A pandas and gluon-ts compatible string describing the frequency of the time series. For example,
        "D" denotes daily data. See also
        https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
    num_items: int
        Number of items (time series) in the data set.
    """

    index: pd.MultiIndex

    def __init__(self, data: Any, *args, **kwargs):
        if isinstance(data, (BlockManager, ArrayManager)):
            # necessary for copy constructor to work. see _constructor
            # and pandas.DataFrame
            pass
        elif isinstance(data, pd.DataFrame):
            if isinstance(data.index, pd.MultiIndex):
                self._validate_multi_index_data_frame(data)
            else:
                data = self._construct_pandas_frame_from_data_frame(data)
        elif isinstance(data, Iterable):
            data = self._construct_pandas_frame_from_iterable_dataset(data)
        else:
            raise ValueError(
                "Data input type not recognized, must be DataFrame or iterable."
            )
        super().__init__(data=data, *args, **kwargs)

    @property
    def _constructor(self) -> Type[TimeSeriesDataFrame]:
        return TimeSeriesDataFrame

    @property
    def freq(self):
        ts_index = self.index.levels[1]  # noqa
        freq = (
            ts_index.freq
            or ts_index.inferred_freq
            or self.loc[0].index.freq  # fall back to freq of first item
            or self.loc[0].index.inferred_freq
        )
        if freq is None:
            raise ValueError("Frequency not provided and cannot be inferred")
        if isinstance(freq, str):
            return freq
        elif isinstance(freq, pd._libs.tslibs.BaseOffset):
            return freq.freqstr
        return freq

    def iter_items(self) -> Iterable[Any]:
        return iter(self.index.levels[0])

    @property
    def num_items(self):
        return len(self.index.levels[0])

    @classmethod
    def _validate_iterable(cls, data: Iterable):
        if not isinstance(data, Iterable):
            raise ValueError("data must be of type Iterable.")

        first = next(iter(data), None)
        if first is None:
            raise ValueError("data has no time-series.")

        for i, ts in enumerate(itertools.chain([first], data)):
            if not isinstance(ts, dict):
                raise ValueError(
                    f"{i}'th time-series in data must be a dict, got{type(ts)}"
                )
            if not ("target" in ts and "start" in ts):
                raise ValueError(
                    f"{i}'th time-series in data must have 'target' and 'start', got{ts.keys()}"
                )
            if not isinstance(ts["start"], pd.Timestamp) or ts["start"].freq is None:
                raise ValueError(
                    f"{i}'th time-series must have timestamp as 'start' with freq specified, got {ts['start']}"
                )

    @classmethod
    def _validate_data_frame(cls, df: pd.DataFrame):
        if not isinstance(df, pd.DataFrame):
            raise ValueError(f"data must be a pd.DataFrame, got {type(df)}")
        if ITEMID not in df.columns:
            raise ValueError(f"data must have a `{ITEMID}` column")
        if TIMESTAMP not in df.columns:
            raise ValueError(f"data must have a `{TIMESTAMP}` column")
        if df[ITEMID].isnull().any():
            raise ValueError(f"`{ITEMID}` column can not have nan")
        if df[TIMESTAMP].isnull().any():
            raise ValueError(f"`{TIMESTAMP}` column can not have nan")
        if not df[TIMESTAMP].dtype == "datetime64[ns]":
            raise ValueError(
                f"for {TIMESTAMP}, the only pandas dtype allowed is ‘datetime64[ns]’."
            )

        # TODO: check if time series are irregularly sampled. this check was removed as
        # TODO: pandas is inconsistent in identifying freq when period-end timestamps
        # TODO: are provided.

    @classmethod
    def _validate_multi_index_data_frame(cls, data: pd.DataFrame):
        """Validate a multi-index pd.DataFrame can be converted to TimeSeriesDataFrame

        Parameters
        ----------
        data: pd.DataFrame
            a data frame in pd.DataFrame format.
        """

        if not isinstance(data, pd.DataFrame):
            raise ValueError(f"data must be a pd.DataFrame, got {type(data)}")
        if not isinstance(data.index, pd.MultiIndex):
            raise ValueError(f"data must have pd.MultiIndex, got {type(data.index)}")
        if not data.index.dtypes.array[1] == "datetime64[ns]":
            raise ValueError(
                f"for {TIMESTAMP}, the only pandas dtype allowed is ‘datetime64[ns]’."
            )
        if not data.index.names == (f"{ITEMID}", f"{TIMESTAMP}"):
            raise ValueError(
                f"data must have index names as ('{ITEMID}', '{TIMESTAMP}'), got {data.index.names}"
            )

    @classmethod
    def _construct_pandas_frame_from_iterable_dataset(
        cls, iterable_dataset: Iterable
    ) -> pd.DataFrame:
        cls._validate_iterable(iterable_dataset)

        all_ts = []
        for i, ts in enumerate(iterable_dataset):
            start_timestamp = ts["start"]
            target = ts["target"]
            datetime_index = tuple(
                pd.date_range(
                    start_timestamp, periods=len(target), freq=start_timestamp.freq
                )
            )
            idx = pd.MultiIndex.from_product(
                [(i,), datetime_index], names=[ITEMID, TIMESTAMP]
            )
            ts_df = pd.Series(target, name="target", index=idx).to_frame()
            all_ts.append(ts_df)
        return pd.concat(all_ts)

[docs]    @classmethod
    def from_iterable_dataset(cls, iterable_dataset: Iterable) -> pd.DataFrame:
        """Construct a ``TimeSeriesDataFrame`` from an Iterable of dictionaries each of which
        represent a single time series.

        This function also offers compatibility with GluonTS data sets, see
        https://ts.gluon.ai/_modules/gluonts/dataset/common.html#ListDataset.

        Parameters
        ----------
        iterable_dataset: Iterable
            An iterator over dictionaries, each with a ``target`` field specifying the value of the
            (univariate) time series, and a ``start`` field that features a pandas Timestamp with features.
            Example::

                iterable_dataset = [
                    {"target": [0, 1, 2], "start": pd.Timestamp("01-01-2019", freq='D')},
                    {"target": [3, 4, 5], "start": pd.Timestamp("01-01-2019", freq='D')},
                    {"target": [6, 7, 8], "start": pd.Timestamp("01-01-2019", freq='D')}
                ]

        Returns
        -------
        ts_df: TimeSeriesDataFrame
            A data frame in TimeSeriesDataFrame format.
        """
        return cls(cls._construct_pandas_frame_from_iterable_dataset(iterable_dataset))

    @classmethod
    def _construct_pandas_frame_from_data_frame(
        cls,
        df: pd.DataFrame,
        id_column: Optional[str] = None,
        timestamp_column: Optional[str] = None,
    ) -> pd.DataFrame:

        df = df.copy()
        if id_column is not None:
            assert id_column in df.columns, f"Column {id_column} not found!"
            df.rename(columns={id_column: ITEMID}, inplace=True)

        if timestamp_column is not None:
            assert (
                timestamp_column in df.columns
            ), f"Column {timestamp_column} not found!"
            df.rename(columns={timestamp_column: TIMESTAMP}, inplace=True)

        cls._validate_data_frame(df)
        return df.set_index([ITEMID, TIMESTAMP])

[docs]    @classmethod
    def from_data_frame(
        cls,
        df: pd.DataFrame,
        id_column: Optional[str] = None,
        timestamp_column: Optional[str] = None,
    ) -> TimeSeriesDataFrame:
        """Construct a ``TimeSeriesDataFrame`` from a pandas DataFrame.

        Parameters
        ----------
        df: pd.DataFrame
            A pd.DataFrame with 'item_id' and 'timestamp' as columns. For example:

            .. code-block::

                   item_id  timestamp  target
                0        0 2019-01-01       0
                1        0 2019-01-02       1
                2        0 2019-01-03       2
                3        1 2019-01-01       3
                4        1 2019-01-02       4
                5        1 2019-01-03       5
                6        2 2019-01-01       6
                7        2 2019-01-02       7
                8        2 2019-01-03       8
        id_column: str
            Name of the 'item_id' column if column name is different
        timestamp_column: str
            Name of the 'timestamp' column if column name is different

        Returns
        -------
        ts_df: TimeSeriesDataFrame
            A data frame in TimeSeriesDataFrame format.
        """
        return cls(
            cls._construct_pandas_frame_from_data_frame(
                df, id_column=id_column, timestamp_column=timestamp_column
            )
        )

[docs]    def split_by_time(
        self, cutoff_time: pd.Timestamp
    ) -> Tuple[TimeSeriesDataFrame, TimeSeriesDataFrame]:
        """Split dataframe to two different ``TimeSeriesDataFrame`` s before and after a certain
        ``cutoff_time``.

        Parameters
        ----------
        cutoff_time: pd.Timestamp
            The time to split the current data frame into two data frames.

        Returns
        -------
        data_before: TimeSeriesDataFrame
            Data frame containing time series before the ``cutoff_time`` (exclude ``cutoff_time``).
        data_after: TimeSeriesDataFrame
            Data frame containing time series after the ``cutoff_time`` (include ``cutoff_time``).
        """

        nanosecond_before_cutoff = cutoff_time - pd.Timedelta(nanoseconds=1)
        data_before = self.loc[(slice(None), slice(None, nanosecond_before_cutoff)), :]
        data_after = self.loc[(slice(None), slice(cutoff_time, None)), :]
        return TimeSeriesDataFrame(data_before), TimeSeriesDataFrame(data_after)

[docs]    def split_by_item(
        self, cutoff_item: int
    ) -> Tuple[TimeSeriesDataFrame, TimeSeriesDataFrame]:
        """Split dataframe to two data frames containing items before and after a ``cutoff_item``.

        Parameters
        ----------
        cutoff_item: int
            The item_id to split the current data frame into two data frames.

        Returns
        -------
        data_before: TimeSeriesDataFrame
            Data frame containing time-series before the ``cutoff_item`` (exclude ``cutoff_item``).
        data_after: TimeSeriesDataFrame
            Data frame containing time-series after the ``cutoff_item`` (include ``cutoff_item``).
        """

        data_before = self.loc[(slice(None, cutoff_item - 1), slice(None)), :]
        data_after = self.loc[(slice(cutoff_item, None), slice(None)), :]
        return TimeSeriesDataFrame(data_before), TimeSeriesDataFrame(data_after)

[docs]    def slice_by_timestep(self, time_step_slice: slice) -> TimeSeriesDataFrame:
        """Return a slice of time steps (with no regards to the actual timestamp) from within
        each item in a time series data frame. For example, if a data frame is constructed as::

            item_id  timestamp  target
                  0 2019-01-01       0
                  0 2019-01-02       1
                  0 2019-01-03       2
                  1 2019-01-02       3
                  1 2019-01-03       4
                  1 2019-01-04       5
                  2 2019-01-03       6
                  2 2019-01-04       7
                  2 2019-01-05       8

        then :code:`df.slice_by_timestep(time_step_slice=slice(-2, None))` would return the last two
        time steps from each item::

            item_id  timestamp  target
                  0 2019-01-02       1
                  0 2019-01-03       2
                  1 2019-01-03       4
                  1 2019-01-04       5
                  2 2019-01-04       7
                  2 2019-01-05       8

        Note that this function returns a copy of the original data. This function is useful for
        constructing holdout sets for validation.

        Parameters
        ----------
        time_step_slice: slice
            A python slice object representing the slices to return from each item

        Returns
        -------
        ts_df: TimeSeriesDataFrame
            Data frame containing only the time steps of each ``item_id`` sliced according to the
            input ``time_step_slice``.
        """
        slice_gen = (
            (i, self.loc[i].iloc[time_step_slice]) for i in self.index.levels[0]
        )
        slices = []
        for ix, data_slice in slice_gen:
            idx = pd.MultiIndex.from_product(
                [(ix,), data_slice.index], names=[ITEMID, TIMESTAMP]
            )
            data_slice.set_index(idx, inplace=True)
            slices.append(data_slice)
        return self.__class__(pd.concat(slices))

[docs]    def subsequence(
        self, start: pd.Timestamp, end: pd.Timestamp
    ) -> TimeSeriesDataFrame:
        """Extract time-series between start (inclusive) and end (exclusive) time.

        Parameters
        ----------
        start: pd.Timestamp
            The start time (inclusive) of a time range that will be used for subsequence.
        end: pd.Timestamp
            The end time (exclusive) of a time range that will be used for subsequence.

        Returns
        -------
        ts_df: TimeSeriesDataFrame
            A new data frame in ``TimeSeriesDataFrame`` format contains time-series in a time range
            defined between start and end time.
        """

        if end < start:
            raise ValueError(f"end time {end} is earlier than stat time {start}")

        nanosecond_before_end = end - pd.Timedelta(nanoseconds=1)
        return TimeSeriesDataFrame(
            self.loc[(slice(None), slice(start, nanosecond_before_end)), :]
        )

[docs]    @classmethod
    def from_pickle(cls, filepath_or_buffer: Any) -> "TimeSeriesDataFrame":
        """Convenience method to read pickled time series data frames. If the read pickle
        file refers to a plain pandas DataFrame, it will be cast to a TimeSeriesDataFrame.

        Parameters
        ----------
        filepath_or_buffer: Any
            Filename provided as a string or an ``IOBuffer`` containing the pickled object.

        Returns
        -------
        ts_df: TimeSeriesDataFrame
            The pickled time series data frame.
        """
        try:
            data = pd.read_pickle(filepath_or_buffer)
            return data if isinstance(data, cls) else cls(data)
        except Exception as err:  # noqa
            raise IOError(f"Could not load pickled data set due to error: {str(err)}")