Source code for autogluon.eda.visualization.dataset
from typing import Any, Dict, List, Optional
import pandas as pd
from pandas import DataFrame
from ..state import AnalysisState
from .base import AbstractVisualization
from .jupyter import JupyterMixin
# Names exported by `from <this module> import *`: the three visualization facets defined below.
__all__ = ["DatasetStatistics", "DatasetTypeMismatch", "LabelInsightsVisualization"]
class DatasetStatistics(AbstractVisualization, JupyterMixin):
    """
    Display aggregate dataset statistics and dataset-level information.

    The report is a composite view of a combination of performed analyses:
    :py:class:`~autogluon.eda.analysis.dataset.DatasetSummary`,
    :py:class:`~autogluon.eda.analysis.dataset.RawTypesAnalysis`,
    :py:class:`~autogluon.eda.analysis.dataset.VariableTypeAnalysis`,
    :py:class:`~autogluon.eda.analysis.dataset.SpecialTypesAnalysis`,
    :py:class:`~autogluon.eda.analysis.missing.MissingValuesAnalysis`.

    The components can be present in any combination (assuming their dependencies are satisfied).

    The report requires at least one of the analyses present to be rendered.

    Parameters
    ----------
    headers: bool, default = False
        if `True` then render headers
    namespace: str, default = None
        namespace to use; can be nested like `ns_a.ns_b.ns_c`
    sort_by: Optional[str], default = None
        column to sort the resulting table by; ignored if the column is not present in the table
    sort_asc: bool, default = True
        if `sort_by` is provided, whether to sort ascending (`True`) or descending (`False`);
        rows without a value in the `sort_by` column are always placed last

    Examples
    --------
    >>> import autogluon.eda.analysis as eda
    >>> import autogluon.eda.visualization as viz
    >>> import autogluon.eda.auto as auto
    >>> state = auto.analyze(
    >>>     train_data=..., label=..., return_state=True,
    >>>     anlz_facets=[
    >>>         eda.dataset.DatasetSummary(),
    >>>         eda.dataset.RawTypesAnalysis(),
    >>>         eda.dataset.VariableTypeAnalysis(),
    >>>         eda.dataset.SpecialTypesAnalysis(),
    >>>         eda.missing.MissingValuesAnalysis(),
    >>>     ],
    >>>     viz_facets=[
    >>>         viz.dataset.DatasetStatistics()
    >>>     ]
    >>> )

    See Also
    --------
    :py:class:`~autogluon.eda.analysis.dataset.DatasetSummary`
    :py:class:`~autogluon.eda.analysis.dataset.RawTypesAnalysis`
    :py:class:`~autogluon.eda.analysis.dataset.VariableTypeAnalysis`
    :py:class:`~autogluon.eda.analysis.dataset.SpecialTypesAnalysis`
    :py:class:`~autogluon.eda.analysis.missing.MissingValuesAnalysis`
    """

    # State keys the report can be built from. Shared by `can_handle` and `_render`
    # so the two cannot drift apart (previously `variable_type` was missing from `can_handle`).
    _FACET_KEYS = ("dataset_stats", "missing_statistics", "raw_type", "variable_type", "special_types")

    def __init__(
        self,
        headers: bool = False,
        namespace: Optional[str] = None,
        sort_by: Optional[str] = None,
        sort_asc: bool = True,
        **kwargs,
    ) -> None:
        super().__init__(namespace, **kwargs)
        self.headers = headers
        self.sort_by = sort_by
        self.sort_asc = sort_asc

    def can_handle(self, state: AnalysisState) -> bool:
        """Return whether at least one of the supported analysis facets is present in `state`."""
        return self.at_least_one_key_must_be_present(state, *self._FACET_KEYS)

    def _render(self, state: AnalysisState) -> None:
        """Render one summary table per dataset found in `state`."""
        # The dataset names are taken from the first facet present in the state
        datasets = []
        for key in self._FACET_KEYS:
            if key in state:
                datasets = state[key].keys()
                break

        for ds in datasets:
            # Merge different metrics
            df = pd.DataFrame(self._merge_analysis_facets(ds, state))

            # Sort while the missing cells are still NaN: once they are replaced with ""
            # below, numeric columns become a mix of numbers and strings and
            # `sort_values` raises TypeError. NaN rows are placed last by pandas.
            if self.sort_by is not None and self.sort_by in df.columns:
                df = df.sort_values(by=self.sort_by, ascending=self.sort_asc)

            # Fix counts
            if "dataset_stats" in state:
                df = self._fix_counts(df, ["unique", "freq"])
            if "missing_statistics" in state:
                df = self._fix_counts(df, ["missing_count"])
            df = df.fillna("")

            self.render_header_if_needed(state, f"`{ds}` dataset summary")
            # Show tables of up to 100 rows in full; truncate longer ones to 20 rows
            with pd.option_context("display.max_rows", 100 if len(df) <= 100 else 20):
                self.display_obj(df)

    @staticmethod
    def _merge_analysis_facets(ds: str, state: AnalysisState) -> Dict[str, Any]:
        """
        Collect all facets available in `state` for dataset `ds` into a single dict
        suitable for passing to the `pd.DataFrame` constructor.

        Missing-value statistics are limited to `count` and `ratio` and are stored
        under the `missing_` prefix.
        """
        stats: Dict[str, Any] = {}
        if "dataset_stats" in state:
            # Copy so the keys added below do not leak back into the analysis state
            stats = state.dataset_stats[ds].copy()
        if "missing_statistics" in state:
            stats = {
                **stats,
                **{f"missing_{k}": v for k, v in state.missing_statistics[ds].items() if k in ["count", "ratio"]},
            }
        if "raw_type" in state:
            stats["raw_type"] = state.raw_type[ds]
        if "variable_type" in state:
            stats["variable_type"] = state.variable_type[ds]
        if "special_types" in state:
            stats["special_types"] = state.special_types[ds]
        return stats

    @staticmethod
    def _fix_counts(df: DataFrame, cols: List[str]) -> DataFrame:
        """
        Convert the count columns listed in `cols` to integers, leaving missing cells blank.

        The conversion is done in place on `df`; the same frame is returned for convenience.
        Columns from `cols` that are not present in `df` are skipped.
        """
        for k in cols:
            if k in df.columns:
                # -1 is a temporary placeholder for NaN so the column can be cast to int;
                # it is then swapped for an empty string
                df[k] = df[k].fillna(-1).astype(int).replace({-1: ""})
        return df
class DatasetTypeMismatch(AbstractVisualization, JupyterMixin):
    """
    Display mismatches of raw types between the datasets provided. If a mismatch is found, mark the row with a warning.

    The report requires :py:class:`~autogluon.eda.analysis.dataset.RawTypesAnalysis` analysis present.

    Parameters
    ----------
    headers: bool, default = False
        if `True` then render headers
    namespace: str, default = None
        namespace to use; can be nested like `ns_a.ns_b.ns_c`

    Examples
    --------
    >>> import autogluon.eda.analysis as eda
    >>> import autogluon.eda.visualization as viz
    >>> import autogluon.eda.auto as auto
    >>> auto.analyze(
    >>>     train_data=..., test_data=...,
    >>>     anlz_facets=[
    >>>         eda.dataset.RawTypesAnalysis(),
    >>>     ],
    >>>     viz_facets=[
    >>>         viz.dataset.DatasetTypeMismatch()
    >>>     ]
    >>> )

    See Also
    --------
    :py:class:`~autogluon.eda.analysis.dataset.RawTypesAnalysis`
    """

    def __init__(self, headers: bool = False, namespace: Optional[str] = None, **kwargs) -> None:
        super().__init__(namespace, **kwargs)
        self.headers = headers

    def can_handle(self, state: AnalysisState) -> bool:
        """Return whether the `raw_type` facet is present in `state`."""
        return self.all_keys_must_be_present(state, "raw_type")

    def _render(self, state: AnalysisState) -> None:
        """Render a table of the rows whose raw type is not the same in every dataset."""
        # One column per dataset, one row per entry of `raw_type`
        df = pd.DataFrame(state.raw_type).sort_index()
        if df.shape[1] == 0:
            # `raw_type` is present but empty: there is no first column to compare
            # against (`df.iloc[:, 0]` would raise IndexError) and nothing to report
            return

        # Every dataset is compared against the first one; a cell missing in any dataset
        # (NaN) never compares equal, so such rows are flagged as well
        same_as_first = df.eq(df.iloc[:, 0], axis=0)
        df["warnings"] = same_as_first.all(axis=1).map({True: "", False: "warning"})
        df.fillna("--", inplace=True)

        # Keep only the rows with a mismatch
        df = df[df["warnings"] != ""]
        if len(df) > 0:
            self.render_header_if_needed(state, "Types warnings summary")
            # Show tables of up to 100 rows in full; truncate longer ones to 20 rows
            with pd.option_context("display.max_rows", 100 if len(df) <= 100 else 20):
                self.display_obj(df)
class LabelInsightsVisualization(AbstractVisualization, JupyterMixin):
    """
    Render label insights performed by :py:class:`~autogluon.eda.analysis.dataset.LabelInsightsAnalysis`.

    The following insights can be rendered:

    - classification: low cardinality classes detection
    - classification: minority class imbalance detection
    - classification: classes present in test data, but not in the train data
    - regression: out-of-domain labels detection

    Examples
    --------
    >>> import autogluon.eda.analysis as eda
    >>> import autogluon.eda.visualization as viz
    >>> import autogluon.eda.auto as auto
    >>> auto.analyze(
    >>>     train_data=..., test_data=..., label=...,
    >>>     anlz_facets=[
    >>>         eda.dataset.ProblemTypeControl(),
    >>>         eda.dataset.LabelInsightsAnalysis(low_cardinality_classes_threshold=50, regression_ood_threshold=0.01),
    >>>     ],
    >>>     viz_facets=[
    >>>         viz.dataset.LabelInsightsVisualization()
    >>>     ]
    >>> )

    Parameters
    ----------
    headers: bool, default = False
        if `True` then render headers
    namespace: str, default = None
        namespace to use; can be nested like `ns_a.ns_b.ns_c`

    See Also
    --------
    :py:class:`~autogluon.eda.analysis.dataset.ProblemTypeControl`
    :py:class:`~autogluon.eda.analysis.dataset.LabelInsightsAnalysis`
    """

    def __init__(self, headers: bool = False, namespace: Optional[str] = None, **kwargs) -> None:
        super().__init__(namespace, **kwargs)
        self.headers = headers

    def can_handle(self, state: AnalysisState) -> bool:
        """Return whether the `label_insights` facet is present in `state`."""
        return "label_insights" in state

    def _render(self, state: AnalysisState) -> None:
        """Collect the markdown for every available insight and render it as a single list."""
        insights = state.label_insights
        md_lines: List[str] = []

        # Each helper appends at most one markdown item; the call order is the display order
        self._classification_add_low_cardinality_classes_insights(insights, md_lines)
        self._classification_add_minority_class_imbalance_insights(insights, md_lines)
        self._classification_add_missing_classes_insights(insights, md_lines)
        self._regression_add_out_of_domain_insights(insights, md_lines)

        # Render nothing (not even the header) when no insight was produced
        if md_lines:
            self.render_header_if_needed(state, "Label insights")
            self.render_markdown("\n".join(md_lines))

    @staticmethod
    def _regression_add_out_of_domain_insights(insights: AnalysisState, md_lines: List[str]):
        """Append the out-of-domain labels item to `md_lines` if `insights.ood` is set."""
        if insights.ood is not None:
            md_lines.append(
                f" - Rows with out-of-domain labels were found. Consider removing rows with labels outside of this range or expand training data since "
                f"some algorithms (i.e. trees) are unable to extrapolate beyond data present in the training data.\n"
                f" - `{insights.ood.count}` rows\n"
                f" - `train_data` values range `{insights.ood.train_range}`\n"
                f" - `test_data` values range `{insights.ood.test_range}`"
            )

    @staticmethod
    def _classification_add_missing_classes_insights(insights: AnalysisState, md_lines: List[str]):
        """Append the item listing classes absent from `train_data` if `insights.not_present_in_train` is set."""
        if insights.not_present_in_train is not None:
            md_lines.append(
                f" - the following classes are found in `test_data`, but not present in `train_data`: "
                f"`{'`, `'.join(map(str, insights.not_present_in_train))}`. "
                f"Consider either removing the rows with classes not covered or adding more training data covering the classes."
            )

    @staticmethod
    def _classification_add_minority_class_imbalance_insights(insights: AnalysisState, md_lines: List[str]):
        """
        Append the class imbalance item to `md_lines` if `insights.minority_class_imbalance` is set.

        Severity is derived from the imbalance ratio: below 1% is `Extreme`,
        up to and including 20% is `Moderate`, anything above is `Mild`.
        """
        imbalance = insights.minority_class_imbalance
        if imbalance is not None:
            if imbalance.ratio < 0.01:
                severity = "Extreme"
            elif imbalance.ratio <= 0.2:
                severity = "Moderate"
            else:
                severity = "Mild"
            md_lines.append(
                f" - {severity} minority class imbalance detected - imbalance ratio is `{imbalance.ratio:.2%}`. "
                f"Recommendations:\n"
                f" - downsample majority class `{imbalance.majority_class}` to improve the balance\n"
                # The trailing space keeps the sentence separated from the link that follows
                f" - upweight downsampled class so that `sample_weight = original_weight x downsampling_factor`. "
                f"[TabularPredictor](https://auto.gluon.ai/stable/api/autogluon.predictor.html#module-0) "
                f"supports this via `sample_weight` parameter"
            )

    @staticmethod
    def _classification_add_low_cardinality_classes_insights(insights: AnalysisState, md_lines: List[str]):
        """Append the low-cardinality classes item to `md_lines` if `insights.low_cardinality_classes` is set."""
        if insights.low_cardinality_classes is not None:
            # One sub-item per affected class with its number of instances
            classes_info = "\n".join(
                [f" - class `{k}`: `{v}` instances" for k, v in insights.low_cardinality_classes.instances.items()]
            )
            md_lines.append(
                f" - Low-cardinality classes are detected. It is recommended to have at least `{insights.low_cardinality_classes.threshold}` "
                f"instances per class. Consider adding more data to cover the classes or remove such rows.\n"
                f"{classes_info}"
            )