import warnings
from typing import Any, Dict, List, Optional, Union
import numpy as np
import pandas as pd
import phik # noqa - required for significance_matrix instrumentation on pandas dataframes
from pandas.core.dtypes.common import is_numeric_dtype
from scipy import stats
from scipy.cluster import hierarchy as hc
from scipy.stats import spearmanr
from .. import AnalysisState
from .base import AbstractAnalysis
__all__ = ["Correlation", "CorrelationSignificance", "FeatureInteraction", "DistributionFit"]
from autogluon.common.features.types import R_FLOAT, R_INT
class Correlation(AbstractAnalysis):
"""
Correlation analysis.
    Note: it is recommended to apply AutoGluon standard pre-processing - this will allow categorical variables to be included in the analysis.
    This can be done by wrapping the analysis into :py:class:`~autogluon.eda.analysis.transform.ApplyFeatureGenerator`.
Parameters
----------
method: str {'pearson', 'kendall', 'spearman', 'phik'}, default='spearman'
Method of correlation:
* pearson : standard correlation coefficient
* kendall : Kendall Tau correlation coefficient
* spearman : Spearman rank correlation
* phik : phi_k correlation
            Correlation matrix of a bivariate gaussian derived from the chi2 value. The chi2 value
            is converted into the correlation coefficient of a bivariate gaussian with correlation
            value rho, assuming the given binning and number of records. The correlation coefficient
            value is between 0 and 1. The bivariate gaussian's range is set to [-5,5] by construction.
            See the `phik <https://github.com/KaveIO/PhiK>`_ documentation for details.
focus_field: Optional[str], default = None
        field name to focus on. When specified, only correlations with an absolute value >= `focus_field_threshold`
        against this field are kept. This is helpful when dealing with a large number of variables.
focus_field_threshold: float, default = 0.5
a cut-off threshold when `focus_field` is specified
parent: Optional[AbstractAnalysis], default = None
parent Analysis
children: List[AbstractAnalysis], default = []
wrapped analyses; these will receive sampled `args` during `fit` call
Examples
--------
>>> import autogluon.eda.analysis as eda
>>> import autogluon.eda.visualization as viz
>>> import autogluon.eda.auto as auto
>>> import pandas as pd
>>> import numpy as np
>>> df_train = pd.DataFrame(...)
>>>
    >>> auto.analyze(train_data=df_train, label=target_col, anlz_facets=[
>>> # Apply standard AutoGluon pre-processing to transform categorical variables to numbers to ensure correlation includes them.
>>> eda.transform.ApplyFeatureGenerator(category_to_numbers=True, children=[
    >>>         # We use `spearman` correlation to capture non-linear interactions, since it is based on rank order.
>>> eda.interaction.Correlation(method='spearman', focus_field=target_col, focus_field_threshold=0.3),
>>> ])
>>> ], viz_facets=[
    >>>     viz.interaction.CorrelationVisualization(fig_args=dict(figsize=(12,8))),
>>> ])
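    >>>
    >>> # Sketch: inspecting the outputs, assuming the state is captured (e.g. via
    >>> # `return_state=True`) and the dataset is registered as 'train_data':
    >>> state.correlations['train_data']  # correlation matrix as a DataFrame
    >>> state.correlations_focus_high_corr['train_data']  # features passing `focus_field_threshold`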
See Also
--------
:py:class:`~autogluon.eda.analysis.transform.ApplyFeatureGenerator`
"""
def __init__(
self,
method: str = "spearman",
focus_field: Optional[str] = None,
focus_field_threshold: float = 0.5,
parent: Optional[AbstractAnalysis] = None,
children: Optional[List[AbstractAnalysis]] = None,
**kwargs,
) -> None:
super().__init__(parent, children, **kwargs)
assert method in ["pearson", "kendall", "spearman", "phik"]
self.method = method
self.focus_field = focus_field
self.focus_field_threshold = focus_field_threshold
def can_handle(self, state: AnalysisState, args: AnalysisState) -> bool:
return True
def _fit(self, state: AnalysisState, args: AnalysisState, **fit_kwargs) -> None:
state.correlations = {}
state.correlations_method = self.method
for ds, df in self.available_datasets(args):
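            # Encode a non-numeric label column as category codes so it can take part
            # in the correlation calculation.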
if args.label in df.columns and df[args.label].dtype not in [R_INT, R_FLOAT]:
df[args.label] = df[args.label].astype("category").cat.codes
if self.method == "phik":
state.correlations[ds] = df.phik_matrix(**self.args, verbose=False)
else:
state.correlations[ds] = df.corr(method=self.method, numeric_only=True, **self.args)
if self.focus_field is not None and self.focus_field in state.correlations[ds].columns:
state.correlations_focus_field = self.focus_field
state.correlations_focus_field_threshold = self.focus_field_threshold
state.correlations_focus_high_corr = {}
df_corr = state.correlations[ds]
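                # Keep only rows (and matching columns) whose absolute correlation with
                # the focus field meets the threshold.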
df_corr = df_corr[df_corr[self.focus_field].abs() >= self.focus_field_threshold]
keep_cols = df_corr.index.values
state.correlations[ds] = df_corr[keep_cols]
high_corr = (
state.correlations[ds][[self.focus_field]]
.sort_values(self.focus_field, ascending=False)
.drop(self.focus_field)
)
state.correlations_focus_high_corr[ds] = high_corr
class CorrelationSignificance(AbstractAnalysis):
"""
Significance of correlation of all variable combinations in the DataFrame.
See :py:meth:`~phik.significance.significance_matrix` for more details.
This analysis requires :py:class:`~autogluon.eda.analysis.interaction.Correlation` results to be
available in the state.
    Note: it is recommended to apply AutoGluon standard pre-processing - this will allow categorical variables to be included in the analysis.
    This can be done by wrapping the analysis into :py:class:`~autogluon.eda.analysis.transform.ApplyFeatureGenerator`.
Examples
--------
>>> import autogluon.eda.analysis as eda
>>> import autogluon.eda.visualization as viz
>>> import autogluon.eda.auto as auto
>>> import pandas as pd
>>> df_train = pd.DataFrame(...)
>>>
    >>> auto.analyze(train_data=df_train, label=target_col, anlz_facets=[
>>> # Apply standard AutoGluon pre-processing to transform categorical variables to numbers to ensure correlation includes them.
>>> eda.transform.ApplyFeatureGenerator(category_to_numbers=True, children=[
    >>>         # We use `spearman` correlation to capture non-linear interactions, since it is based on rank order.
>>> eda.interaction.Correlation(method='spearman', focus_field=target_col, focus_field_threshold=0.3),
>>> eda.interaction.CorrelationSignificance()
>>> ])
>>> ], viz_facets=[
>>> viz.interaction.CorrelationSignificanceVisualization(fig_args=dict(figsize=(12,8))),
>>> ])
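    >>>
    >>> # Sketch: assuming the state is captured (e.g. via `return_state=True`), the
    >>> # per-dataset significance DataFrame is then available as:
    >>> state.significance_matrix['train_data']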
See Also
--------
:py:meth:`~phik.significance.significance_matrix`
:py:class:`~autogluon.eda.analysis.interaction.Correlation`
:py:class:`~autogluon.eda.analysis.transform.ApplyFeatureGenerator`
"""
def can_handle(self, state: AnalysisState, args: AnalysisState) -> bool:
return self.all_keys_must_be_present(state, "correlations", "correlations_method")
def _fit(self, state: AnalysisState, args: AnalysisState, **fit_kwargs) -> None:
state.significance_matrix = {}
for ds, df in self.available_datasets(args):
state.significance_matrix[ds] = df[state.correlations[ds].columns].significance_matrix(
**self.args, verbose=False
)
class FeatureInteraction(AbstractAnalysis):
"""
Feature interaction analysis
Parameters
----------
    x: Optional[str], default = None
        variable to analyse; it will be placed on the x-axis
    y: Optional[str], default = None
        variable to analyse; it will be placed on the y-axis
    hue: Optional[str], default = None
        variable to use as hue in the x/y analysis.
key: Optional[str], default = None
        key to use to store the analysis in the state; the value is later used by FeatureInteractionVisualization.
        If the key is not provided, one of the form 'x:A|y:B|hue:C' is generated (the corresponding x/y/hue part is omitted if its value is not provided).
See also :py:class:`~autogluon.eda.visualization.interaction.FeatureInteractionVisualization`
parent: Optional[AbstractAnalysis], default = None
parent Analysis
children: Optional[List[AbstractAnalysis]], default None
wrapped analyses; these will receive sampled `args` during `fit` call
kwargs
Examples
--------
>>> import pandas as pd
>>> import numpy as np
>>> import autogluon.eda.analysis as eda
>>> import autogluon.eda.visualization as viz
>>> import autogluon.eda.auto as auto
>>>
>>> df_train = pd.DataFrame(...)
>>>
>>> state = auto.analyze(
>>> train_data=df_train, label='Survived',
>>> anlz_facets=[
>>> eda.dataset.RawTypesAnalysis(),
>>> eda.interaction.FeatureInteraction(key='target_col', x='Survived'),
>>> eda.interaction.FeatureInteraction(key='target_col_vs_age', x='Survived', y='Age')
>>> ],
>>> viz_facets=[
>>> # Bar Plot with counts per each of the values in Survived
>>> viz.interaction.FeatureInteractionVisualization(key='target_col', headers=True),
>>> # Box Plot Survived vs Age
>>> viz.interaction.FeatureInteractionVisualization(key='target_col_vs_age', headers=True),
>>> ]
>>> )
>>>
>>> # Simplified shortcut for interactions: scatter plot of Fare vs Age colored based on Survived values.
>>> auto.analyze_interaction(x='Fare', y='Age', hue='Survived', train_data=df_train)
"""
def __init__(
self,
x: Optional[str] = None,
y: Optional[str] = None,
hue: Optional[str] = None,
key: Optional[str] = None,
parent: Optional[AbstractAnalysis] = None,
children: Optional[List[AbstractAnalysis]] = None,
**kwargs,
) -> None:
super().__init__(parent, children, **kwargs)
self.x = x
self.y = y
self.hue = hue
self.key = key
def can_handle(self, state: AnalysisState, args: AnalysisState) -> bool:
return self.all_keys_must_be_present(state, "raw_type")
def _fit(self, state: AnalysisState, args: AnalysisState, **fit_kwargs):
cols = {
"x": self.x,
"y": self.y,
"hue": self.hue,
}
self.key = self._generate_key_if_not_provided(self.key, cols)
cols = {k: v for k, v in cols.items() if v is not None}
interactions: Dict[str, Dict[str, Any]] = state.get("interactions", {})
for ds, df in self.available_datasets(args):
missing_cols = [c for c in cols.values() if c not in df.columns]
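            # Record the interaction only when every requested column exists in this dataset.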
if len(missing_cols) == 0:
df = df[cols.values()]
interaction = {
"features": cols,
"data": df,
}
if ds not in interactions:
interactions[ds] = {}
interactions[ds][self.key] = interaction
state.interactions = interactions
def _generate_key_if_not_provided(self, key: Optional[str], cols: Dict[str, Optional[str]]) -> str:
        # if key is not provided, generate one of the form 'x:A|y:B|hue:C'; parts whose value is not provided are skipped
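        # e.g. cols={"x": "Fare", "y": None, "hue": "Survived"} -> "x:Fare|hue:Survived" (illustrative names)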
if key is None:
key_parts = []
for k, v in cols.items():
if v is not None:
key_parts.append(f"{k}:{v}")
key = "|".join(key_parts)
return key
class DistributionFit(AbstractAnalysis):
"""
This component attempts to fit various distributions for further plotting via
:py:class:`~autogluon.eda.visualization.interaction.FeatureInteractionVisualization`.
    The data specified in `columns` must be numeric to be considered for fitting (categorical variables are not supported).
    Only distributions whose fit reaches a p-value above the `pvalue_min` threshold will be included in the results.
    Note: this analysis is an augmentation for :py:class:`~autogluon.eda.analysis.interaction.FeatureInteraction` and should be used together with it
    to be visualized via :py:class:`~autogluon.eda.visualization.interaction.FeatureInteractionVisualization`.
Parameters
----------
    columns: Union[str, List[str]]
        columns to be included in the analysis. Can be passed as a string or a list of strings.
    pvalue_min: float, default = 0.01
        minimum p-value required for a distribution fit to be included in the results.
    keep_top_n: Optional[int], default = None
        how many of the distributions exceeding `pvalue_min` to include in the results, i.e. if `keep_top_n=3`
        but 10 distributions satisfied `pvalue_min`, only the top 3 will be included.
        If neither this nor `distributions_to_fit` is provided, only the top 3 will be included in the results.
    distributions_to_fit: Optional[Union[str, List[str]]], default = None
        list of distributions to fit. See `DistributionFit.AVAILABLE_DISTRIBUTIONS` for the list of supported values.
        See the `scipy <https://docs.scipy.org/doc/scipy/reference/stats.html>`_ documentation for details on each distribution.
        If not specified, then fitting will be attempted for all supported distributions.
parent: Optional[AbstractAnalysis], default = None
parent Analysis
children: Optional[List[AbstractAnalysis]], default None
wrapped analyses; these will receive sampled `args` during `fit` call
kwargs
Examples
--------
>>> import autogluon.eda.analysis as eda
>>> import autogluon.eda.visualization as viz
>>> import autogluon.eda.auto as auto
>>> import pandas as pd
>>> import numpy as np
>>>
>>> df_train = pd.DataFrame(...)
>>>
>>> auto.analyze(
>>> train_data=df_train, label=target_col,
>>> anlz_facets=[
>>> eda.dataset.RawTypesAnalysis(),
>>> eda.interaction.DistributionFit(columns=['Fare', 'Age'], distributions_to_fit=['lognorm', 'beta', 'gamma', 'fisk']),
>>> eda.interaction.FeatureInteraction(key='age-chart', x='Age'),
>>>
>>> ],
>>> viz_facets=[
>>> viz.interaction.FeatureInteractionVisualization(key='age-chart', headers=True),
>>> ]
>>> )
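    >>>
    >>> # Sketch: assuming the state is captured (e.g. via `return_state=True`), each fitted
    >>> # column maps distribution names to their fitted `param`, `shapes`, `statistic` and `pvalue`:
    >>> state.distributions_fit['train_data']['Age']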
See Also
--------
:py:class:`~autogluon.eda.analysis.interaction.FeatureInteraction`
:py:class:`~autogluon.eda.visualization.interaction.FeatureInteractionVisualization`
"""
# Getting the list of distributions: https://docs.scipy.org/doc/scipy/tutorial/stats.html#getting-help
AVAILABLE_DISTRIBUTIONS = sorted(
[
dist
for dist in dir(stats)
if isinstance(getattr(stats, dist), stats.rv_continuous)
# kstwo can't be fit on a single variable
# levy_stable, studentized_range are too slow
and dist not in ["kstwo", "levy_stable", "studentized_range"]
]
)
def __init__(
self,
columns: Union[str, List[str]],
pvalue_min: float = 0.01,
keep_top_n: Optional[int] = None,
distributions_to_fit: Optional[Union[str, List[str]]] = None,
parent: Optional[AbstractAnalysis] = None,
children: Optional[List[AbstractAnalysis]] = None,
**kwargs,
) -> None:
super().__init__(parent, children, **kwargs)
if keep_top_n is None and distributions_to_fit is None:
keep_top_n = 3
if isinstance(columns, str):
columns = [columns]
self.columns = columns
self.pvalue_min = pvalue_min
self.keep_top_n = keep_top_n
if distributions_to_fit is None:
distributions_to_fit = self.AVAILABLE_DISTRIBUTIONS
if isinstance(distributions_to_fit, str):
distributions_to_fit = [distributions_to_fit]
not_supported = [d for d in distributions_to_fit if d not in self.AVAILABLE_DISTRIBUTIONS]
if len(not_supported) > 0:
raise ValueError(
f"The following distributions are not supported: {sorted(not_supported)}. "
f"Supported distributions are {sorted(self.AVAILABLE_DISTRIBUTIONS)}"
)
self.distributions_to_fit = distributions_to_fit
def can_handle(self, state: AnalysisState, args: AnalysisState) -> bool:
return True
def _fit(self, state: AnalysisState, args: AnalysisState, **fit_kwargs) -> None:
state.distributions_fit = {}
state.distributions_fit_pvalue_min = self.pvalue_min
for ds, df in self.available_datasets(args):
state.distributions_fit[ds] = {}
for c in self.columns:
if c in df.columns:
col = df[c]
col = col[col.notna()] # skip NaNs
dist = self._fit_dist(col, self.pvalue_min)
if dist is not None:
state.distributions_fit[ds][c] = dist
def _fit_dist(self, series, pvalue_min=0.01):
results = {}
if not is_numeric_dtype(series):
self.logger.warning(f"{series.name}: distribution cannot be fit; only numeric columns are supported")
return None
with warnings.catch_warnings():
warnings.simplefilter("ignore")
for i in self.distributions_to_fit:
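                # Fit the distribution parameters via maximum likelihood, then assess the
                # goodness of fit with a Kolmogorov-Smirnov test against the fitted parameters.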
dist = getattr(stats, i)
param = dist.fit(series)
statistic, pvalue = stats.kstest(series, i, args=param)
if pvalue >= pvalue_min:
results[i] = {
"param": param,
"shapes": self._list_parameters(dist),
"statistic": statistic,
"pvalue": pvalue,
}
if len(results) == 0:
return None
df = pd.DataFrame(results).T.sort_values("pvalue", ascending=False)
if self.keep_top_n is not None:
df = df[: self.keep_top_n]
results = df.T.to_dict()
return results
def _list_parameters(self, distribution):
"""List parameters for scipy.stats.distribution.
# Arguments
distribution: a string or scipy.stats distribution object.
# Returns
A list of distribution parameter strings.
"""
if isinstance(distribution, str):
distribution = getattr(stats, distribution)
if distribution.shapes:
parameters = [name.strip() for name in distribution.shapes.split(",")]
else:
parameters = []
if distribution.name in stats._discrete_distns._distn_names:
parameters += ["loc"]
elif distribution.name in stats._continuous_distns._distn_names:
parameters += ["loc", "scale"]
return parameters
class FeatureDistanceAnalysis(AbstractAnalysis):
"""
The component performs feature correlation distance analysis using Spearman rank correlation and hierarchical clustering
for the data passed in `train_data` excluding `label`.
    Groups of near-duplicate features are automatically suggested based on `near_duplicates_threshold`.
The results can be visualized using :py:class:`~autogluon.eda.visualization.interaction.FeatureDistanceAnalysisVisualization`.
Note: it is recommended to apply :py:class:`~autogluon.eda.analysis.transform.ApplyFeatureGenerator` before the analysis
to ensure correlations are calculated for categorical variables.
Parameters
----------
    near_duplicates_threshold: float, default = 0.01
        defines the maximum distance at which features are considered near duplicates
parent: Optional[AbstractAnalysis], default = None
parent Analysis
children: Optional[List[AbstractAnalysis]], default None
wrapped analyses; these will receive sampled `args` during `fit` call
kwargs
Examples
--------
>>> import autogluon.eda.analysis as eda
>>> import autogluon.eda.visualization as viz
>>> import autogluon.eda.auto as auto
>>> import pandas as pd
>>> import numpy as np
>>>
>>> df_train = pd.DataFrame(...)
>>>
>>> auto.analyze(
>>> train_data=df_train, label=target_col,
>>> anlz_facets=[
>>> eda.transform.ApplyFeatureGenerator(category_to_numbers=True, children=[
>>> eda.interaction.FeatureDistanceAnalysis(near_duplicates_threshold=0.7),
>>> ])
>>> ],
>>> viz_facets=[
>>> viz.interaction.FeatureDistanceAnalysisVisualization(fig_args=dict(figsize=(12,6))),
>>> ]
>>> )
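    >>>
    >>> # Sketch: assuming the state is captured (e.g. via `return_state=True`), each suggested
    >>> # group is a dict of sorted `nodes` and the merge `distance`:
    >>> state.feature_distance['near_duplicates']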
See Also
--------
:py:class:`~autogluon.eda.analysis.transform.ApplyFeatureGenerator`
:py:class:`~autogluon.eda.visualization.interaction.FeatureDistanceAnalysisVisualization`
`Removing redundant features <https://github.com/fastai/book_nbs/blob/master/10_tabular.ipynb>`_ section of
Jeremy Howard's "Deep Learning for Coders with Fastai and PyTorch" book.
"""
def __init__(
self,
near_duplicates_threshold: float = 0.01,
parent: Optional[AbstractAnalysis] = None,
children: Optional[List[AbstractAnalysis]] = None,
**kwargs,
) -> None:
super().__init__(parent, children, **kwargs)
self.near_duplicates_threshold = near_duplicates_threshold
def can_handle(self, state: AnalysisState, args: AnalysisState) -> bool:
return self.all_keys_must_be_present(args, "train_data", "label", "feature_generator")
def _fit(self, state: AnalysisState, args: AnalysisState, **fit_kwargs) -> None:
x = args.train_data
if args.label is not None:
x = x.drop(labels=[args.label], axis=1)
corr = np.round(spearmanr(x).correlation, 4)
np.fill_diagonal(corr, 1)
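        # Convert the correlation matrix to a condensed distance matrix (distance = 1 - correlation)
        # and build an average-linkage hierarchical clustering over the features.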
corr_condensed = hc.distance.squareform(1 - np.nan_to_num(corr))
z = hc.linkage(corr_condensed, method="average")
columns = list(x.columns)
s = {
"columns": columns,
"linkage": z,
"near_duplicates_threshold": self.near_duplicates_threshold,
"near_duplicates": self.__get_linkage_clusters(z, columns, self.near_duplicates_threshold),
}
state["feature_distance"] = s
@staticmethod
def __get_linkage_clusters(linkage, columns, threshold: float):
idx_to_col = {i: v for i, v in enumerate(columns)}
idx_to_dist: Dict[int, float] = {}
clusters: Dict[int, List[int]] = {}
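        # Each scipy linkage row is (idx_1, idx_2, distance, sample_count); the cluster formed by
        # row j gets id len(columns) + j. Clusters are merged bottom-up while the merge distance
        # stays within the threshold.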
for (f1, f2, d, _l), i in zip(linkage, np.arange(len(idx_to_col), len(idx_to_col) + len(linkage))):
idx_to_dist[i] = d
f1 = int(f1)
f2 = int(f2)
if d <= threshold:
clusters[i] = [*clusters.pop(f1, [f1]), *clusters.pop(f2, [f2])]
results = []
for i, nodes in clusters.items():
d = idx_to_dist[i]
nodes = [idx_to_col[n] for n in nodes]
results.append(
{
"nodes": sorted(nodes),
"distance": d,
}
)
return results