Source code for autogluon.eda.visualization.dataset

from typing import Any, Dict, List, Optional

import pandas as pd
from pandas import DataFrame

from ..state import AnalysisState
from .base import AbstractVisualization
from .jupyter import JupyterMixin

__all__ = ["DatasetStatistics", "DatasetTypeMismatch", "LabelInsightsVisualization"]


class DatasetStatistics(AbstractVisualization, JupyterMixin):
    """
    Display aggregate dataset statistics and dataset-level information.

    The report is a composite view of combination of performed analyses:
    :py:class:`~autogluon.eda.analysis.dataset.DatasetSummary`,
    :py:class:`~autogluon.eda.analysis.dataset.RawTypesAnalysis`,
    :py:class:`~autogluon.eda.analysis.dataset.VariableTypeAnalysis`,
    :py:class:`~autogluon.eda.analysis.dataset.SpecialTypesAnalysis`,
    :py:class:`~autogluon.eda.analysis.missing.MissingValuesAnalysis`.

    The components can be present in any combination (assuming their dependencies are satisfied).

    The report requires at least one of the analyses present to be rendered.

    Parameters
    ----------
    headers: bool, default = False
        if `True` then render headers
    namespace: str, default = None
        namespace to use; can be nested like `ns_a.ns_b.ns_c`
    sort_by: Optional[str], default = None
        column to sort the resulting table by; ignored when the column is not present in the table
    sort_asc: bool, default = True
        if `sort_by` is provided, whether to sort ascending (`True`) or descending (`False`)

    Examples
    --------
    >>> import autogluon.eda.analysis as eda
    >>> import autogluon.eda.visualization as viz
    >>> import autogluon.eda.auto as auto
    >>> state = auto.analyze(
    >>>     train_data=..., label=..., return_state=True,
    >>>     anlz_facets=[
    >>>         eda.dataset.DatasetSummary(),
    >>>         eda.dataset.RawTypesAnalysis(),
    >>>         eda.dataset.VariableTypeAnalysis(),
    >>>         eda.dataset.SpecialTypesAnalysis(),
    >>>         eda.missing.MissingValuesAnalysis(),
    >>>     ],
    >>>     viz_facets=[
    >>>         viz.dataset.DatasetStatistics()
    >>>     ]
    >>> )

    See Also
    --------
    :py:class:`~autogluon.eda.analysis.dataset.DatasetSummary`
    :py:class:`~autogluon.eda.analysis.dataset.RawTypesAnalysis`
    :py:class:`~autogluon.eda.analysis.dataset.VariableTypeAnalysis`
    :py:class:`~autogluon.eda.analysis.dataset.SpecialTypesAnalysis`
    :py:class:`~autogluon.eda.analysis.missing.MissingValuesAnalysis`
    """

    # State keys this report knows how to merge. Shared by `can_handle` and `_render` so the
    # "can I render?" check and the dataset discovery can never drift apart again
    # (previously `variable_type` was rendered but not accepted by `can_handle`).
    _FACET_KEYS = ("dataset_stats", "missing_statistics", "raw_type", "variable_type", "special_types")

    def __init__(
        self,
        headers: bool = False,
        namespace: Optional[str] = None,
        sort_by: Optional[str] = None,
        sort_asc: bool = True,
        **kwargs,
    ) -> None:
        super().__init__(namespace, **kwargs)
        self.headers = headers
        self.sort_by = sort_by
        self.sort_asc = sort_asc

    def can_handle(self, state: AnalysisState) -> bool:
        """Return `True` when at least one of the supported analysis facets is present in `state`."""
        return self.at_least_one_key_must_be_present(state, *self._FACET_KEYS)

    def _render(self, state: AnalysisState) -> None:
        """Render one summary table per dataset found in the first available facet."""
        # Dataset names are taken from the first facet that is present in the state.
        datasets = next((list(state[key].keys()) for key in self._FACET_KEYS if key in state), [])

        for ds in datasets:
            # Merge different metrics
            stats = self._merge_analysis_facets(ds, state)

            # Fix counts: merging facets introduces NaNs which turn integer counts into floats
            df = pd.DataFrame(stats)
            if "dataset_stats" in state:
                df = self._fix_counts(df, ["unique", "freq"])
            if "missing_statistics" in state:
                df = self._fix_counts(df, ["missing_count"])
            df = df.fillna("")

            self.render_header_if_needed(state, f"`{ds}` dataset summary")
            if self.sort_by is not None and self.sort_by in df.columns:
                df = df.sort_values(by=self.sort_by, ascending=self.sort_asc)
            # Show small tables in full; truncate larger ones to 20 rows
            with pd.option_context("display.max_rows", 100 if len(df) <= 100 else 20):
                self.display_obj(df)

    @staticmethod
    def _merge_analysis_facets(ds: str, state: AnalysisState) -> Dict[str, Any]:
        """
        Collect every available facet for dataset `ds` into a single mapping.

        The result is passed to `pd.DataFrame`, so each entry becomes one column of the report.
        Only the `count` and `ratio` entries of the missing-values statistics are kept and they are
        prefixed with `missing_`.
        """
        stats: Dict[str, Any] = {}
        if "dataset_stats" in state:
            # copy so that the entries added below do not leak into the stored analysis state
            stats = state.dataset_stats[ds].copy()
        if "missing_statistics" in state:
            stats = {
                **stats,
                **{f"missing_{k}": v for k, v in state.missing_statistics[ds].items() if k in ["count", "ratio"]},
            }
        if "raw_type" in state:
            stats["raw_type"] = state.raw_type[ds]
        if "variable_type" in state:
            stats["variable_type"] = state.variable_type[ds]
        if "special_types" in state:
            stats["special_types"] = state.special_types[ds]
        return stats

    @staticmethod
    def _fix_counts(df: DataFrame, cols: List[str]) -> DataFrame:
        """
        Convert the listed columns to integers, showing missing entries as empty strings.

        Columns from `cols` that are not present in `df` are skipped. `df` is modified in place
        and also returned.
        """
        for k in cols:
            if k in df.columns:
                # -1 is a temporary placeholder so the column can be cast to int before blanking
                df[k] = df[k].fillna(-1).astype(int).replace({-1: ""})
        return df
class DatasetTypeMismatch(AbstractVisualization, JupyterMixin):
    """
    Display mismatches of raw column types between the datasets provided.

    Only rows where a mismatch was found are shown; each of them is marked with a warning.

    The report requires :py:class:`~autogluon.eda.analysis.dataset.RawTypesAnalysis` analysis present.

    Parameters
    ----------
    headers: bool, default = False
        if `True` then render headers
    namespace: str, default = None
        namespace to use; can be nested like `ns_a.ns_b.ns_c`

    Examples
    --------
    >>> import autogluon.eda.analysis as eda
    >>> import autogluon.eda.visualization as viz
    >>> import autogluon.eda.auto as auto
    >>> auto.analyze(
    >>>     train_data=..., test_data=...,
    >>>     anlz_facets=[
    >>>         eda.dataset.RawTypesAnalysis(),
    >>>     ],
    >>>     viz_facets=[
    >>>         viz.dataset.DatasetTypeMismatch()
    >>>     ]
    >>> )

    See Also
    --------
    :py:class:`~autogluon.eda.analysis.dataset.RawTypesAnalysis`
    """

    def __init__(self, headers: bool = False, namespace: Optional[str] = None, **kwargs) -> None:
        super().__init__(namespace, **kwargs)
        self.headers = headers

    def can_handle(self, state: AnalysisState) -> bool:
        """The report can only be rendered when `raw_type` is present in `state`."""
        return self.all_keys_must_be_present(state, "raw_type")

    def _render(self, state: AnalysisState) -> None:
        """Show the rows whose raw type is not identical across all datasets."""
        types = pd.DataFrame(state.raw_type).sort_index()

        # Every dataset column is compared against the first one; a row is consistent
        # only when all of the comparisons hold.
        reference = types.iloc[:, 0]
        consistent = types.eq(reference, axis=0).all(axis=1)
        types["warnings"] = consistent.map({True: "", False: "warning"})

        # Cells without a value are displayed as `--`
        types = types.fillna("--")
        mismatched = types[types["warnings"] != ""]
        if len(mismatched) == 0:
            return

        self.render_header_if_needed(state, "Types warnings summary")
        # Show small tables in full; truncate larger ones to 20 rows
        max_rows = 100 if len(mismatched) <= 100 else 20
        with pd.option_context("display.max_rows", max_rows):
            self.display_obj(mismatched)
class LabelInsightsVisualization(AbstractVisualization, JupyterMixin):
    """
    Render label insights performed by :py:class:`~autogluon.eda.analysis.dataset.LabelInsightsAnalysis`.

    The following insights can be rendered:

    - classification: low cardinality classes detection
    - classification: minority class imbalance detection
    - classification: classes present in test data, but not in the train data
    - regression: out-of-domain labels detection

    Examples
    --------
    >>> import autogluon.eda.analysis as eda
    >>> import autogluon.eda.visualization as viz
    >>> import autogluon.eda.auto as auto
    >>>
    >>> auto.analyze(train_data=..., test_data=..., label=..., anlz_facets=[
    >>>     eda.dataset.ProblemTypeControl(),
    >>>     eda.dataset.LabelInsightsAnalysis(low_cardinality_classes_threshold=50, regression_ood_threshold=0.01),
    >>> ], viz_facets=[
    >>>     viz.dataset.LabelInsightsVisualization()
    >>> ])

    Parameters
    ----------
    headers: bool, default = False
        if `True` then render headers
    namespace: str, default = None
        namespace to use; can be nested like `ns_a.ns_b.ns_c`

    See Also
    --------
    :py:class:`~autogluon.eda.analysis.dataset.ProblemTypeControl`
    :py:class:`~autogluon.eda.analysis.dataset.LabelInsightsAnalysis`
    """

    def __init__(self, headers: bool = False, namespace: Optional[str] = None, **kwargs) -> None:
        super().__init__(namespace, **kwargs)
        self.headers = headers

    def can_handle(self, state: AnalysisState) -> bool:
        """The report can only be rendered when `label_insights` is present in `state`."""
        return "label_insights" in state

    def _render(self, state: AnalysisState) -> None:
        """Collect all applicable insights as markdown bullets and render them as one block."""
        insights = state.label_insights
        md_lines: List[str] = []
        self._classification_add_low_cardinality_classes_insights(insights, md_lines)
        self._classification_add_minority_class_imbalance_insights(insights, md_lines)
        self._classification_add_missing_classes_insights(insights, md_lines)
        self._regression_add_out_of_domain_insights(insights, md_lines)
        # Nothing (not even the header) is rendered when no insight was produced
        if md_lines:
            self.render_header_if_needed(state, "Label insights")
            self.render_markdown("\n".join(md_lines))

    @staticmethod
    def _regression_add_out_of_domain_insights(insights: AnalysisState, md_lines: List[str]) -> None:
        """Append the out-of-domain labels insight to `md_lines` when `insights.ood` is set."""
        if insights.ood is not None:
            # Detail bullets are indented so that markdown nests them under the insight
            md_lines.append(
                f" - Rows with out-of-domain labels were found. Consider removing rows with labels outside of this range or expand training data since "
                f"some algorithms (i.e. trees) are unable to extrapolate beyond data present in the training data.\n"
                f"   - `{insights.ood.count}` rows\n"
                f"   - `train_data` values range `{insights.ood.train_range}`\n"
                f"   - `test_data` values range `{insights.ood.test_range}`"
            )

    @staticmethod
    def _classification_add_missing_classes_insights(insights: AnalysisState, md_lines: List[str]) -> None:
        """Append the insight listing classes from `insights.not_present_in_train` when it is set."""
        if insights.not_present_in_train is not None:
            md_lines.append(
                f" - the following classes are found in `test_data`, but not present in `train_data`: "
                f"`{'`, `'.join(map(str, insights.not_present_in_train))}`. "
                f"Consider either removing the rows with classes not covered or adding more training data covering the classes."
            )

    @staticmethod
    def _classification_add_minority_class_imbalance_insights(insights: AnalysisState, md_lines: List[str]) -> None:
        """
        Append the class imbalance insight when `insights.minority_class_imbalance` is set.

        Severity is derived from the imbalance ratio: below 1% is `Extreme`, up to and including
        20% is `Moderate`, anything above is `Mild`.
        """
        imbalance = insights.minority_class_imbalance
        if imbalance is not None:
            if imbalance.ratio < 0.01:
                severity = "Extreme"
            elif imbalance.ratio <= 0.2:
                severity = "Moderate"
            else:
                severity = "Mild"
            # Recommendations are indented so that markdown nests them under the insight; the
            # second recommendation ends with ". " so the link that follows starts a new sentence.
            md_lines.append(
                f" - {severity} minority class imbalance detected - imbalance ratio is `{imbalance.ratio:.2%}`. "
                f"Recommendations:\n"
                f"   - downsample majority class `{imbalance.majority_class}` to improve the balance\n"
                f"   - upweight downsampled class so that `sample_weight = original_weight x downsampling_factor`. "
                f"[TabularPredictor](https://auto.gluon.ai/stable/api/autogluon.predictor.html#module-0) "
                f"supports this via `sample_weight` parameter"
            )

    @staticmethod
    def _classification_add_low_cardinality_classes_insights(insights: AnalysisState, md_lines: List[str]) -> None:
        """Append the low-cardinality classes insight when `insights.low_cardinality_classes` is set."""
        low_cardinality = insights.low_cardinality_classes
        if low_cardinality is not None:
            # One nested bullet per affected class
            classes_info = "\n".join(
                f"   - class `{k}`: `{v}` instances" for k, v in low_cardinality.instances.items()
            )
            md_lines.append(
                f" - Low-cardinality classes are detected. It is recommended to have at least `{low_cardinality.threshold}` "
                f"instances per class. Consider adding more data to cover the classes or remove such rows.\n"
                f"{classes_info}"
            )