class CategoryMemoryMinimizeFeatureGenerator(AbstractFeatureGenerator):
    """
    Shrink the memory footprint of category features by relabelling their categories
    with monotonically increasing integers.

    Categories backed by strings can occupy significant memory even though the string
    content itself is not used downstream; replacing the labels with a compact integer
    range removes that overhead.
    """

    def _fit_transform(self, X: DataFrame, **kwargs) -> (DataFrame, dict):
        """Learn the per-column category relabelling from ``X``, apply it, and return the
        transformed frame together with ``feature_metadata_in.type_group_map_special``."""
        self._category_maps = self._get_category_map(X=X)
        transformed = self._transform(X)
        return transformed, self.feature_metadata_in.type_group_map_special

    def _transform(self, X: DataFrame) -> DataFrame:
        """Apply the fitted category relabelling to ``X``."""
        return self._minimize_categorical_memory_usage(X)

    @staticmethod
    def get_default_infer_features_in_args() -> dict:
        """Return the default feature-inference arguments: only raw category features."""
        return {"valid_raw_types": [R_CATEGORY]}

    def _get_category_map(self, X: DataFrame) -> dict:
        """Build ``{column: RangeIndex}`` with one integer label per existing category."""
        # A RangeIndex is the memory optimal set of replacement categories.
        return {column: RangeIndex(len(X[column].cat.categories)) for column in X}

    def _minimize_categorical_memory_usage(self, X: DataFrame):
        """Return ``X`` with the categories of every mapped column renamed to integers.

        When no category maps are stored, ``X`` is returned unchanged. Otherwise a new
        DataFrame is built that contains only the mapped columns.
        """
        if not self._category_maps:
            return X
        # rename_categories(inplace=True) is faster but it is deprecated as of pandas 1.3.0
        relabelled = {
            column: X[column].cat.rename_categories(new_categories)
            for column, new_categories in self._category_maps.items()
        }
        return DataFrame(relabelled)

    def _remove_features_in(self, features: list):
        """Drop ``features`` from the generator and discard their stored category maps."""
        super()._remove_features_in(features)
        if self._category_maps:
            for feature in features:
                # Features without a stored map are simply ignored.
                self._category_maps.pop(feature, None)

    def _more_tags(self):
        """Return the tag dictionary for this generator."""
        return dict(feature_interactions=False)
# TODO: What about nulls / unknowns?
class NumericMemoryMinimizeFeatureGenerator(AbstractFeatureGenerator):
    """
    Clips and converts dtype of int features to minimize memory usage.

    Parameters
    ----------
    dtype_out : np.dtype, default np.uint8
        dtype to clip and convert features to.
        Clipping bounds are taken automatically from the min and max values of the dtype provided.
    **kwargs :
        Refer to :class:`AbstractFeatureGenerator` documentation for details on valid key word arguments.
    """

    def __init__(self, dtype_out=np.uint8, **kwargs):
        super().__init__(**kwargs)
        resolved_dtype, lower_bound, upper_bound = self._get_dtype_clip_args(dtype_out)
        self.dtype_out = resolved_dtype
        self._clip_min = lower_bound
        self._clip_max = upper_bound

    def _fit_transform(self, X: DataFrame, **kwargs) -> (DataFrame, dict):
        """Transform ``X`` (nothing is learned from the data) and return it together with
        ``feature_metadata_in.type_group_map_special``."""
        transformed = self._transform(X)
        return transformed, self.feature_metadata_in.type_group_map_special

    def _transform(self, X):
        """Clip and cast ``X`` using the configured bounds and dtype."""
        return self._minimize_numeric_memory_usage(X)

    @staticmethod
    def get_default_infer_features_in_args() -> dict:
        """Return the default feature-inference arguments: only raw int features."""
        return {"valid_raw_types": [R_INT]}

    @staticmethod
    def _get_dtype_clip_args(dtype) -> (np.dtype, int, int):
        """Return ``(dtype, min, max)`` for ``dtype``.

        Integer limits are looked up first; if ``np.iinfo`` rejects the dtype with a
        ``ValueError``, floating-point limits from ``np.finfo`` are used instead.
        """
        try:
            limits = np.iinfo(dtype)
        except ValueError:
            limits = np.finfo(dtype)
        return limits.dtype, limits.min, limits.max

    def _minimize_numeric_memory_usage(self, X: DataFrame):
        """Delegate to ``clip_and_astype`` with the stored clip bounds and output dtype."""
        return clip_and_astype(
            df=X,
            clip_min=self._clip_min,
            clip_max=self._clip_max,
            dtype=self.dtype_out,
        )

    def _more_tags(self):
        """Return the tag dictionary for this generator."""
        return dict(feature_interactions=False)