Source code for autogluon.features.generators.drop_unique

import logging

from pandas import DataFrame

from autogluon.core.features.types import R_CATEGORY, R_OBJECT, S_TEXT
from autogluon.core.features.feature_metadata import FeatureMetadata

from .abstract import AbstractFeatureGenerator

logger = logging.getLogger(__name__)

# TODO: Not necessary to exist after fitting, can just update outer context feature_out/feature_in and then delete this
[docs]class DropUniqueFeatureGenerator(AbstractFeatureGenerator): """Drops features which only have 1 unique value or which have nearly no repeated values (based on max_unique_ratio) and are of category or object type.""" def __init__(self, max_unique_ratio=0.99, **kwargs): super().__init__(**kwargs) self.max_unique_ratio = max_unique_ratio def _fit_transform(self, X: DataFrame, **kwargs) -> (DataFrame, dict): features_to_drop = self._drop_unique_features(X, self.feature_metadata_in, max_unique_ratio=self.max_unique_ratio) self._remove_features_in(features_to_drop) X_out = X[self.features_in] return X_out, self.feature_metadata_in.type_group_map_special def _transform(self, X: DataFrame) -> DataFrame: return X @staticmethod def get_default_infer_features_in_args() -> dict: return dict() # TODO: Consider NaN? @staticmethod def _drop_unique_features(X: DataFrame, feature_metadata: FeatureMetadata, max_unique_ratio) -> list: features_to_drop = [] X_len = len(X) max_unique_value_count = X_len * max_unique_ratio for column in X: unique_value_count = len(X[column].unique()) # Drop features that are always the same if unique_value_count == 1: features_to_drop.append(column) elif feature_metadata.get_feature_type_raw(column) in [R_CATEGORY, R_OBJECT]\ and (unique_value_count > max_unique_value_count): if S_TEXT in feature_metadata.get_feature_types_special(column): # We should not drop a text column continue else: features_to_drop.append(column) return features_to_drop def _more_tags(self): return {'feature_interactions': False}