Source code for summarease.summarize_target

import pandas as pd
import altair as alt
import warnings

def summarize_target_df(
    dataset_name: pd.DataFrame,
    target_variable: str,
    target_type: str,
    threshold=None,
):
    """Summarize and evaluate a categorical or numerical target variable.

    Parameters
    ----------
    dataset_name : DataFrame
        The input dataset containing the target variable.
    target_variable : str
        The name of the target column in ``dataset_name``.
    target_type : str, within {"categorical", "numerical"}
        The type of the target variable.
    threshold : float, optional
        Only used when ``target_type="categorical"``; it sets the width of
        the balance range used to flag class imbalance and must lie in
        ``[0, 1]``. When omitted (``None``) the categorical summary uses
        0.2, i.e. a class is balanced if its proportion is within 20% of
        the average proportion. Passing a value together with
        ``target_type="numerical"`` emits a ``UserWarning`` because the
        value is ignored.

    Returns
    -------
    DataFrame
        If ``target_type="categorical"``, a summary with the columns
        ``['class', 'proportion', 'imbalanced', 'threshold']``, one row per
        class, sorted by class label. If the target column yields no
        classes, an empty DataFrame with those columns is returned.
        If ``target_type="numerical"``, a one-row DataFrame holding the
        output of ``Series.describe()``; an empty DataFrame if the target
        column has no rows.

    Raises
    ------
    ValueError
        If ``target_type`` is neither "categorical" nor "numerical", or if
        the threshold for a categorical target is outside ``[0, 1]``
        (including NaN).

    Notes
    -----
    For the categorical type, the function does not distinguish between
    binary and multi-class classification.

    Balance criterion: with ``n`` classes, a class is considered balanced
    when its proportion lies within
    ``[(1 - threshold) / n, (1 + threshold) / n]``; proportions strictly
    outside that range are flagged as imbalanced. Users can choose a
    narrower range, e.g. ``threshold=0.1``.

    Examples
    --------
    >>> summarize_target_df(
    ...     data,
    ...     target_variable='target',
    ...     target_type='categorical',
    ...     threshold=0.2,
    ... )  # doctest: +SKIP
    """
    if target_type == "categorical":
        # ``None`` means "not supplied": fall back to the documented 0.2.
        effective_threshold = 0.2 if threshold is None else threshold

        # Written as a chained comparison so that NaN is rejected as well.
        if not 0 <= effective_threshold <= 1:
            raise ValueError("Threshold must be between 0 and 1.")

        # Class proportions, ordered by class label.
        proportions = (
            dataset_name[target_variable]
            .value_counts(normalize=True)
            .sort_index()
        )
        n_classes = len(proportions)

        # No observed classes: return an empty frame with the usual schema.
        if n_classes == 0:
            return pd.DataFrame(
                columns=['class', 'proportion', 'imbalanced', 'threshold']
            )

        # Balance range around the uniform proportion 1 / n_classes.
        expected_proportion = 1 / n_classes
        lower_bound = expected_proportion * (1 - effective_threshold)
        upper_bound = expected_proportion * (1 + effective_threshold)
        imbalance_flag = (proportions < lower_bound) | (proportions > upper_bound)

        summary_df = pd.DataFrame({
            'class': proportions.index,
            'proportion': proportions.values,
            'imbalanced': imbalance_flag.values,
        })
        summary_df['threshold'] = effective_threshold
        return summary_df

    if target_type == "numerical":
        target = dataset_name[target_variable]

        # Nothing to describe for an empty column.
        if target.empty:
            return pd.DataFrame()

        # Warn only when the caller explicitly supplied a threshold; the
        # default (None) must stay silent.
        if threshold is not None:
            warnings.warn(
                "Threshold is not used for numerical targets.", UserWarning
            )

        # One-row frame of the basic descriptive statistics.
        return target.describe().to_frame().T

    raise ValueError("Invalid target_type. Must be 'categorical' or 'numerical'.")
def summarize_target_balance_plot(summary_df: pd.DataFrame):
    """Visualize the balance condition of a categorical target.

    Parameters
    ----------
    summary_df : DataFrame
        The input DataFrame, expected to match the output of
        summarize_target_df() with target_type="categorical". It must
        contain the columns ['class', 'proportion', 'imbalanced',
        'threshold']. The frame is not modified; the expected-range columns
        needed for plotting are added to an internal copy.

    Returns
    -------
    Altair chart
        A layered chart visualizing the balance of the categorical target
        variable, or a text-only placeholder chart when ``summary_df`` has
        no rows.

    Raises
    ------
    ValueError
        If any of the required columns is missing from ``summary_df``.

    Notes
    -----
    The chart includes the following:

    - A bar plot for actual class proportions.
    - Expected proportion range (lower and upper bounds) as balance range.
    - Imbalance status for each class indicated by color.
    - Highlighted ticks for expected lower and upper bounds.

    The expected range is derived from the number of rows (one per class)
    and the threshold stored in the first row of ``summary_df``.
    """
    # Validate input DataFrame
    required_columns = {'class', 'proportion', 'imbalanced', 'threshold'}
    if not required_columns.issubset(summary_df.columns):
        raise ValueError(
            f"Input DataFrame must contain columns: {', '.join(sorted(required_columns))}"
        )

    # Handle empty DataFrame with a placeholder chart
    if summary_df.empty:
        return alt.Chart(pd.DataFrame()).mark_text().encode(
            text=alt.value("No data available for visualization.")
        ).properties(
            title="Categorical Target Balance Visualization (Empty)"
        )

    # Expected balance range around the uniform proportion 1 / n_classes.
    n_classes = len(summary_df)
    expected_proportion = 1 / n_classes
    threshold = summary_df['threshold'].iloc[0]

    # assign() returns a new frame, so the caller's DataFrame is left
    # untouched (the range columns are plotting details only).
    plot_df = summary_df.assign(
        expected_lower=expected_proportion * (1 - threshold),
        expected_upper=expected_proportion * (1 + threshold),
    )

    # All layers share the same data source.
    base = alt.Chart(plot_df)

    # Bar chart for actual proportions, colored by imbalance status
    actual_dist = base.mark_bar(opacity=0.6).encode(
        x=alt.X('class:N', title='Class'),
        y=alt.Y('proportion:Q', title='Proportion'),
        color=alt.Color(
            'imbalanced:N',
            scale=alt.Scale(domain=[True, False], range=['red', 'green']),
            legend=alt.Legend(title="Imbalanced"),
        ),
        tooltip=['class', 'proportion', 'imbalanced'],
    )

    # Error bars spanning the expected range
    error_bar = base.mark_errorbar(color='black').encode(
        x=alt.X('class:N', title='Class'),
        y=alt.Y('expected_lower:Q', title='Expected Proportion Range'),
        y2='expected_upper:Q',
    )

    # Ticks to highlight the lower and upper bounds
    lower_ticks = base.mark_tick(color='black', thickness=2, size=20).encode(
        x=alt.X('class:N', title='Class'),
        y=alt.Y('expected_lower:Q'),
    )
    upper_ticks = base.mark_tick(color='black', thickness=2, size=20).encode(
        x=alt.X('class:N'),
        y=alt.Y('expected_upper:Q'),
    )

    balance_chart = (actual_dist + error_bar + lower_ticks + upper_ticks).properties(
        width=600,
        height=400,
        title="Categorical Target Balance Visualization",
    )
    return balance_chart