import pandas as pd
import altair as alt
import warnings
# [docs]
def summarize_target_df(dataset_name: pd.DataFrame, target_variable: str,
                        target_type: str, threshold=0.2):
    """Summarize the target column of a dataset as a small report table.

    Parameters
    ----------
    dataset_name : DataFrame
        The input dataset containing the target column.
    target_variable : str
        Name of the target column in ``dataset_name``.
    target_type : str, one of {"categorical", "numerical"}
        How the target column is summarized.
    threshold : float, optional
        Relative tolerance used to flag class imbalance. Only used when
        ``target_type="categorical"``, where it must lie in [0, 1].
        Default is 0.2.

    Returns
    -------
    DataFrame
        If ``target_type="categorical"``: one row per class, ordered by
        class label, with the columns ``class``, ``proportion``,
        ``imbalanced`` and ``threshold``. When the target has no classes,
        an empty frame with those four columns is returned.
        If ``target_type="numerical"``: the output of ``Series.describe()``
        for the target column, transposed into a single row. When the
        target column is empty, a completely empty DataFrame is returned.

    Raises
    ------
    ValueError
        If ``target_type`` is neither "categorical" nor "numerical", or if
        ``target_type="categorical"`` and ``threshold`` is below 0 or
        above 1.

    Warns
    -----
    UserWarning
        For a non-empty numerical target whenever ``threshold`` is not
        ``None``. Because the default is 0.2, this includes calls that
        leave ``threshold`` unspecified.

    Notes
    -----
    For the categorical type, no distinction is made between binary and
    multi-class targets. With ``n`` observed classes, a class counts as
    balanced when its proportion lies within
    ``[(1 - threshold) / n, (1 + threshold) / n]`` (bounds inclusive);
    e.g. ``threshold=0.2`` tolerates a 20% deviation from the average
    share ``1 / n``, and a smaller value gives a narrower balance range.

    Examples
    --------
    >>> summarize_target_df(
    ...     data, target_variable='target', target_type='categorical', threshold=0.2
    ... )  # doctest: +SKIP
    """
    # Reject unknown target types up front; nothing else is inspected first.
    if target_type not in ("categorical", "numerical"):
        raise ValueError("Invalid target_type. Must be 'categorical' or 'numerical'.")

    if target_type == "numerical":
        target = dataset_name[target_variable]
        # An empty column is reported as an empty frame, without a warning.
        if target.empty:
            return pd.DataFrame()
        # The threshold has no meaning here; tell the caller it is ignored.
        if threshold is not None:
            warnings.warn("Threshold is not used for numerical targets.", UserWarning)
        return target.describe().to_frame().T

    # --- categorical target from here on ---
    # The threshold is validated before the column is looked up.
    if threshold < 0 or threshold > 1:
        raise ValueError("Threshold must be between 0 and 1.")

    # Share of each class, ordered by class label.
    shares = dataset_name[target_variable].value_counts(normalize=True).sort_index()
    class_count = len(shares)
    if class_count == 0:
        return pd.DataFrame(columns=['class', 'proportion', 'imbalanced', 'threshold'])

    # Tolerance band around the perfectly balanced share 1 / n.
    balanced_share = 1 / class_count
    floor = balanced_share * (1 - threshold)
    ceiling = balanced_share * (1 + threshold)
    outside_band = (shares > ceiling) | (shares < floor)

    report = pd.DataFrame({
        'class': shares.index,
        'proportion': shares.values,
        'imbalanced': outside_band.values,
    })
    report['threshold'] = threshold
    return report
# [docs]
def summarize_target_balance_plot(summary_df: pd.DataFrame):
    """
    Visualize the balance condition of a categorical target.

    Parameters
    ----------
    summary_df : DataFrame
        The input DataFrame, expected to match the output of summarize_target_df()
        with target_type="categorical".
        It must contain the columns ['class', 'proportion', 'imbalanced', 'threshold'].
        The frame is not modified; the expected-range columns used for
        plotting are added to a separate copy.

    Returns
    -------
    alt.Chart
        For an empty ``summary_df``, a text-only chart carrying a
        "no data" message. Otherwise the layered chart obtained by
        combining the bar, error-bar and tick layers with ``+``.

    Raises
    ------
    ValueError
        If any of the required columns is missing from ``summary_df``.

    Notes
    -----
    The chart includes the following:
    - A bar plot for actual class proportions.
    - Expected proportion range (lower and upper bounds) as balance range.
    - Imbalance status for each class indicated by color.
    - Highlighted ticks for expected lower and upper bounds.

    The number of classes is taken to be the number of rows, and the
    threshold is read from the first row of the ``threshold`` column.
    """
    # Validate input DataFrame
    required_columns = {'class', 'proportion', 'imbalanced', 'threshold'}
    if not required_columns.issubset(summary_df.columns):
        raise ValueError(f"Input DataFrame must contain columns: {', '.join(sorted(required_columns))}")

    # Handle empty DataFrame
    if summary_df.empty:
        return alt.Chart(pd.DataFrame()).mark_text().encode(
            text=alt.value("No data available for visualization.")
        ).properties(
            title="Categorical Target Balance Visualization (Empty)"
        )

    # Expected proportion range: one row per class, so n = number of rows.
    n_classes = len(summary_df)
    expected_proportion = 1 / n_classes
    threshold = summary_df['threshold'].iloc[0]

    # assign() returns a new frame, so the caller's summary_df does not
    # silently gain the two helper columns as a side effect of plotting.
    plot_df = summary_df.assign(
        expected_lower=expected_proportion * (1 - threshold),
        expected_upper=expected_proportion * (1 + threshold),
    )

    # Bar chart for actual proportions
    actual_dist = alt.Chart(plot_df).mark_bar(opacity=0.6).encode(
        x=alt.X('class:N', title='Class'),
        y=alt.Y('proportion:Q', title='Proportion'),
        color=alt.Color('imbalanced:N', scale=alt.Scale(domain=[True, False], range=['red', 'green']),
                        legend=alt.Legend(title="Imbalanced")),
        tooltip=['class', 'proportion', 'imbalanced']
    )

    # Error bars for expected range
    error_bar = alt.Chart(plot_df).mark_errorbar(color='black').encode(
        x=alt.X('class:N', title='Class'),
        y=alt.Y('expected_lower:Q', title='Expected Proportion Range'),
        y2='expected_upper:Q'
    )

    # Add ticks to highlight lower and upper bounds
    lower_ticks = alt.Chart(plot_df).mark_tick(
        color='black',
        thickness=2,
        size=20
    ).encode(
        x=alt.X('class:N', title='Class'),
        y=alt.Y('expected_lower:Q')
    )
    upper_ticks = alt.Chart(plot_df).mark_tick(
        color='black',
        thickness=2,
        size=20
    ).encode(
        x=alt.X('class:N'),
        y=alt.Y('expected_upper:Q')
    )

    balance_chart = (actual_dist + error_bar + lower_ticks + upper_ticks).properties(
        width=600,
        height=400,
        title="Categorical Target Balance Visualization"
    )
    return balance_chart