Functions for data exploration and statistical testing
This document provides an overview of the functions defined in src.utils.exploration.
Each function is listed with its signature and docstring.
get_table_one
def get_table_one(
ed_pts: pl.DataFrame | pl.LazyFrame,
outcome: str,
outcome_label: str,
output_path: str = "../outputs/reference",
disp_dict_path: str = "../outputs/reference/feat_name_map.json",
sensitive_attr_list: list = "None",
nn_attr: list = "None",
adjust_method="bonferroni",
cat_cols: list = None,
verbose: bool = False,
) -> TableOne:
"""
Generate a baseline patient summary table (Table 1) with adjusted p-values grouped by outcome.
Args:
ed_pts (pl.DataFrame | pl.LazyFrame): Patient data.
outcome (str): Outcome variable name.
outcome_label (str): Display name for the outcome.
output_path (str): Directory to save the HTML summary.
disp_dict_path (str): Path to JSON mapping feature names to display names.
sensitive_attr_list (list): List of sensitive attribute names.
nn_attr (list): List of non-normal columns.
adjust_method (str): Method for p-value adjustment.
cat_cols (list): List of categorical columns.
verbose (bool): If True, print summary information.
Returns:
TableOne: Generated TableOne summary object.
"""
assign_age_groups
def assign_age_groups(
ed_pts: pl.DataFrame | pl.LazyFrame,
age_col: str = "anchor_age",
bins: list = None,
labels: list = None,
use_lazy: bool = False,
) -> pl.DataFrame:
"""
Assign age groups to patients based on age column and specified bins/labels.
Args:
ed_pts (pl.DataFrame | pl.LazyFrame): Patient data.
age_col (str): Name of the age column.
bins (list): List of bin edges for age groups.
labels (list): List of labels for age groups.
use_lazy (bool): If True, return a LazyFrame.
Returns:
pl.DataFrame: DataFrame with an added 'age_group' column (a pl.LazyFrame when use_lazy is True).
"""
get_age_table_by_sensitive_attr
def get_age_table_by_sensitive_attr(
ed_pts: pl.DataFrame | pl.LazyFrame,
attr_name: str,
outcome: str,
value_name_col: str,
) -> pl.DataFrame:
"""
Transform the dataset into long format to group samples with the outcome by age group and sensitive attribute.
Args:
ed_pts (pl.DataFrame | pl.LazyFrame): Patient data.
attr_name (str): Sensitive attribute column name.
outcome (str): Outcome variable name.
value_name_col (str): Name for the value column in the melted DataFrame.
Returns:
pd.DataFrame: Long-format DataFrame with counts and percentages by group.
"""
plot_outcome_dist_by_sensitive_attr
def plot_outcome_dist_by_sensitive_attr(
ed_pts: pl.DataFrame | pl.LazyFrame,
attr_col: str,
attr_xlabel: str,
output_path: str = "../outputs/reference",
outcome_list: list = None,
outcome_title: list = None,
outcome_legend: dict = None,
maxi: int = 2,
maxj: int = 2,
rot: int = 0,
figsize: tuple = (8, 6),
palette: list = None,
):
"""
Plot the distribution of health outcomes by a specified sensitive attribute.
Args:
ed_pts (pl.DataFrame | pl.LazyFrame): Patient data.
attr_col (str): Sensitive attribute column name.
attr_xlabel (str): Display label for the sensitive attribute.
output_path (str): Directory to save the plot.
outcome_list (list): List of outcome variable names.
outcome_title (list): List of outcome display names.
outcome_legend (dict): Mapping of outcome titles to legend labels.
maxi (int): Number of rows in subplot grid.
maxj (int): Number of columns in subplot grid.
rot (int): Rotation angle for x-axis labels.
figsize (tuple): Figure size.
palette (list): List of colors for plotting.
Returns:
None
"""
plot_age_dist_by_sensitive_attr
def plot_age_dist_by_sensitive_attr(
ed_pts: pl.DataFrame | pl.LazyFrame,
attr_col: str,
attr_xlabel: str,
output_path: str = "../outputs/reference",
outcome_list: list = None,
outcome_title: list = None,
maxi: int = 2,
maxj: int = 2,
colors: list = None,
labels: list = None,
figsize: tuple = (12, 8),
rot: int = 0,
):
"""
Plot the distribution of age groups by a specified sensitive attribute for patients with adverse events.
Args:
ed_pts (pl.DataFrame | pl.LazyFrame): Patient data.
attr_col (str): Sensitive attribute column name.
attr_xlabel (str): Display label for the sensitive attribute.
output_path (str): Directory to save the plot.
outcome_list (list): List of outcome variable names.
outcome_title (list): List of outcome display names.
maxi (int): Number of rows in subplot grid.
maxj (int): Number of columns in subplot grid.
colors (list): List of colors for age groups.
labels (list): List of age group labels.
figsize (tuple): Figure size.
rot (int): Rotation angle for x-axis labels.
Returns:
None
"""
plot_token_length_by_attribute
def plot_token_length_by_attribute(
ed_pts: pl.DataFrame | pl.LazyFrame,
output_path: str = "../outputs/reference",
sensitive_attr_list: list = None,
attr_title: list = None,
out_fname: str = "bhc_dist_by_attr.png",
maxi: int = 2,
maxj: int = 2,
figsize: tuple = (8, 6),
rot: int = 0,
ylim: tuple = (0, 12),
gr_pairs: dict = None,
suptitle: str = "BHC token length by sensitive variable in patients with ED attendance.",
outcome_mode: bool = False,
unique_value_order: list = None,
adjust_method: str = "bonferroni",
test_type: str = "t-test_welch",
):
"""
Display violin plots of aggregated BHC length by sensitive attribute, with statistical annotation.
Args:
ed_pts (pl.DataFrame | pl.LazyFrame): Patient data.
output_path (str): Directory to save the plot.
sensitive_attr_list (list): List of sensitive attribute names.
attr_title (list): List of attribute display names.
out_fname (str): Output filename for the plot.
maxi (int): Number of rows in subplot grid.
maxj (int): Number of columns in subplot grid.
figsize (tuple): Figure size.
rot (int): Rotation angle for x-axis labels.
ylim (tuple): Y-axis limits.
gr_pairs (dict): Dictionary of group pairs for statistical annotation.
suptitle (str): Figure title.
outcome_mode (bool): If True, relabel categories for outcome mode.
unique_value_order (list): Order of unique values for plotting.
adjust_method (str): Method for p-value adjustment.
test_type (str): Statistical test type.
Returns:
None
"""