Source code for autogluon.features.generators.datetime

import logging

import pandas as pd
from pandas import DataFrame

from autogluon.core.features.types import R_DATETIME, S_DATETIME_AS_OBJECT

from .abstract import AbstractFeatureGenerator

logger = logging.getLogger(__name__)


class DatetimeFeatureGenerator(AbstractFeatureGenerator):
    """Convert datetime-like features into numeric columns.

    Every input feature is parsed with ``pd.to_datetime`` (values that cannot
    be parsed are coerced to NaT), missing entries are replaced by the
    per-feature mean datetime learned during fit, and the result is passed
    through ``pd.to_numeric``.
    """

    def _fit_transform(self, X: DataFrame, **kwargs) -> (DataFrame, dict):
        """Learn the per-feature fill values from ``X`` and transform it.

        Returns the transformed frame together with a dict that lists every
        output column under the ``datetime_as_int`` special type group.
        """
        self._fillna_map = self._compute_fillna_map(X)
        X_out = self._transform(X)
        special_groups = {'datetime_as_int': list(X_out.columns)}
        return X_out, special_groups

    def _transform(self, X: DataFrame) -> DataFrame:
        """Return the numeric datetime features generated from ``X``."""
        return self._generate_features_datetime(X)

    @staticmethod
    def get_default_infer_features_in_args() -> dict:
        """Return the default (raw type, special types) pairs used to select input features."""
        accepted_pairs = [
            (R_DATETIME, None),
            (None, [S_DATETIME_AS_OBJECT]),
        ]
        return {'required_raw_special_pairs': accepted_pairs}

    def _compute_fillna_map(self, X: DataFrame):
        """Map each input feature to the datetime used to replace its missing values.

        Best guess is currently to fill by the mean of the parsed column.
        """
        return {
            feature: pd.to_datetime(X[feature], errors='coerce').mean()
            for feature in self.features_in
        }

    # TODO: Improve handling of missing datetimes
    def _generate_features_datetime(self, X: DataFrame) -> DataFrame:
        """Build a frame, indexed like ``X``, with one numeric column per input feature."""
        X_datetime = DataFrame(index=X.index)
        for feature in self.features_in:
            # TODO: Be aware: When converted to float32 by downstream models, the seconds value
            #  will be up to 3 seconds off the true time due to rounding error. If seconds matter,
            #  find a separate way to generate (Possibly subtract smallest datetime from all values).
            # TODO: could also return an extra boolean column is_nan which could provide predictive signal.
            parsed = pd.to_datetime(X[feature], errors='coerce')
            filled = parsed.fillna(self._fillna_map[feature])
            X_datetime[feature] = pd.to_numeric(filled)
            # TODO: Use actual date info, e.g. via pd.to_timedelta(...).dt.total_seconds()
            # TODO: Add fastai date features
        return X_datetime

    def _remove_features_in(self, features: list):
        """Remove ``features`` from the inputs and discard their stored fill values."""
        super()._remove_features_in(features)
        if not self._fillna_map:
            return
        for removed in features:
            # pop with a default so features without a stored fill value are simply skipped.
            self._fillna_map.pop(removed, None)