Generic functions for manipulating Python files and objects
This document provides an overview of the functions defined in src.utils.functions
. Each function is listed with its signature and docstring.
load_pickle
def load_pickle(filepath: str) -> Any:
"""Load a pickled object.
Args:
filepath (str): Path to pickle (.pkl) file.
Returns:
Any: Loaded object.
"""
save_pickle
def save_pickle(target: dict, filepath: str, fname: str = "mm_feat.pkl") -> Any:
"""
Save a Python object as a pickle file.
Args:
target (dict): Object to pickle.
filepath (str): Directory to save the pickle file.
fname (str): Filename for the pickle file (default: "mm_feat.pkl").
Returns:
None
"""
impute_from_df
def impute_from_df(
impute_to: pl.DataFrame | pl.LazyFrame,
impute_from: pl.DataFrame,
use_col: str = None,
key_col: str = None,
) -> pl.DataFrame | pl.LazyFrame:
"""
Impute values from one dataframe to another using a key column.
Args:
impute_to (pl.DataFrame | pl.LazyFrame): Table to impute values into.
impute_from (pl.DataFrame): Table to impute values from.
use_col (str, optional): Column containing values to impute.
key_col (str, optional): Column to use to identify matching rows.
Returns:
pl.DataFrame | pl.LazyFrame: DataFrame with imputed values.
"""
get_final_episodes
def get_final_episodes(stays: pl.DataFrame | pl.LazyFrame) -> pl.DataFrame:
"""Extracts the final ED episode with hospitalisation for creating a unique patient cohort.
Args:
stays (pl.DataFrame): Stays data.
Returns:
pl.DataFrame: Patient-level data.
"""
get_n_unique_values
def get_n_unique_values(
table: pl.DataFrame | pl.LazyFrame, use_col: str = "subject_id"
) -> int:
"""Compute number of unique values in particular column in table.
Args:
table (pl.DataFrame | pl.LazyFrame): Table.
use_col (str, optional): Column to use. Defaults to "subject_id".
Returns:
int: Number of unique values.
"""
scale_numeric_features
def scale_numeric_features(
table: pl.DataFrame, numeric_cols: list = None, over: str = None
) -> pl.DataFrame:
"""Applies min/max scaling to numeric columns and rounds to 1 d.p.
Args:
table (pl.DataFrame): Table.
numeric_cols (list, optional): List of columns to apply to. Defaults to None.
over (str, optional): Column to group by before computing min/max. Defaults to None.
Returns:
pl.DataFrame: Updated table.
"""
read_icd_mapping
def read_icd_mapping(map_path: str) -> pl.DataFrame:
"""
Reads ICD-9 to ICD-10 mapping file for chronic conditions.
"""
contains_both_ltc_types
def contains_both_ltc_types(ltc_set: set) -> bool:
"""
Helper util function for physical-mental multimorbidity detection.
Args:
ltc_set (set): Set containing LTC codes.
Returns:
bool: True if both physical and mental LTC types are present, False otherwise.
"""
preview_data
def preview_data(filepath: str) -> None:
"""Prints a single example from data dictionary.
Args:
filepath (str): Path to .pkl file containing data dictionary.
"""
get_demographics_summary
def get_demographics_summary(ed_pts: pl.DataFrame | pl.LazyFrame) -> None:
"""
Summarises sensitive attributes and outcome prevalence.
Args:
demographics (pl.DataFrame): Demographics data.
Returns:
pl.DataFrame: Summary table.
"""
get_train_split_summary
def get_train_split_summary(
train: pd.DataFrame,
val: pd.DataFrame,
test: pd.DataFrame,
outcome: str = "in_hosp_death",
output_path: str = "../outputs/exp_data",
cont_cols: list = None,
nn_cols: list = None,
disp_dict: dict = None,
cat_cols: list = None,
verbose: bool = True,
) -> None:
"""
Print and save a statistical summary for the train, validation, and test splits.
Args:
train (pd.DataFrame): Training set DataFrame.
val (pd.DataFrame): Validation set DataFrame.
test (pd.DataFrame): Test set DataFrame.
outcome (str): Name of the outcome variable (default: "in_hosp_death").
output_path (str): Directory to save the summary HTML file.
cont_cols (list): List of continuous columns.
nn_cols (list): List of non-normal columns.
disp_dict (dict): Dictionary mapping original to display column names.
cat_cols (list): List of categorical columns.
verbose (bool): If True, print progress messages.
Returns:
None
"""
rename_fields
def rename_fields(col):
"""
Helper function to rename drug and specialty feature names.
Args:
col (Any): Column name or tuple of column names.
Returns:
str: Joined string if input is a tuple, otherwise the original column name.
"""