Skip to content

Generic functions for manipulating Python files and objects

This document provides an overview of the functions defined in src.utils.functions. Each function is listed with its signature and docstring.


load_pickle

def load_pickle(filepath: str) -> Any:
    """Load a pickled object.

    Args:
        filepath (str): Path to pickle (.pkl) file.

    Returns:
        Any: Loaded object.
    """

save_pickle

def save_pickle(target: dict, filepath: str, fname: str = "mm_feat.pkl") -> Any:
    """
    Save a Python object as a pickle file.

    Args:
        target (dict): Object to pickle.
        filepath (str): Directory to save the pickle file.
        fname (str): Filename for the pickle file (default: "mm_feat.pkl").

    Returns:
        None
    """

impute_from_df

def impute_from_df(
    impute_to: pl.DataFrame | pl.LazyFrame,
    impute_from: pl.DataFrame,
    use_col: str = None,
    key_col: str = None,
) -> pl.DataFrame | pl.LazyFrame:
    """
    Impute values from one dataframe to another using a key column.

    Args:
        impute_to (pl.DataFrame | pl.LazyFrame): Table to impute values into.
        impute_from (pl.DataFrame): Table to impute values from.
        use_col (str, optional): Column containing values to impute.
        key_col (str, optional): Column to use to identify matching rows.

    Returns:
        pl.DataFrame | pl.LazyFrame: DataFrame with imputed values.
    """

get_final_episodes

def get_final_episodes(stays: pl.DataFrame | pl.LazyFrame) -> pl.DataFrame:
    """Extracts the final ED episode with hospitalisation for creating a unique patient cohort.

    Args:
        stays (pl.DataFrame | pl.LazyFrame): Stays data.

    Returns:
        pl.DataFrame: Patient-level data.
    """

get_n_unique_values

def get_n_unique_values(
    table: pl.DataFrame | pl.LazyFrame, use_col: str = "subject_id"
) -> int:
    """Compute number of unique values in particular column in table.

    Args:
        table (pl.DataFrame | pl.LazyFrame): Table.
        use_col (str, optional): Column to use. Defaults to "subject_id".

    Returns:
        int: Number of unique values.
    """

scale_numeric_features

def scale_numeric_features(
    table: pl.DataFrame, numeric_cols: list = None, over: str = None
) -> pl.DataFrame:
    """Applies min/max scaling to numeric columns and rounds to 1 decimal place.

    Args:
        table (pl.DataFrame): Table.
        numeric_cols (list, optional): List of columns to apply to. Defaults to None.
        over (str, optional): Column to group by before computing min/max. Defaults to None.

    Returns:
        pl.DataFrame: Updated table.
    """

read_icd_mapping

def read_icd_mapping(map_path: str) -> pl.DataFrame:
    """
    Reads ICD-9 to ICD-10 mapping file for chronic conditions.
    """

contains_both_ltc_types

def contains_both_ltc_types(ltc_set: set) -> bool:
    """
    Helper util function for physical-mental multimorbidity detection.

    Args:
        ltc_set (set): Set containing LTC codes.

    Returns:
        bool: True if both physical and mental LTC types are present, False otherwise.
    """

preview_data

def preview_data(filepath: str) -> None:
    """Prints a single example from data dictionary.

    Args:
        filepath (str): Path to .pkl file containing data dictionary.
    """

get_demographics_summary

def get_demographics_summary(ed_pts: pl.DataFrame | pl.LazyFrame) -> None:
    """
    Summarises sensitive attributes and outcome prevalence.

    Args:
        ed_pts (pl.DataFrame | pl.LazyFrame): Demographics data.

    Returns:
        pl.DataFrame: Summary table.
    """

get_train_split_summary

def get_train_split_summary(
    train: pd.DataFrame,
    val: pd.DataFrame,
    test: pd.DataFrame,
    outcome: str = "in_hosp_death",
    output_path: str = "../outputs/exp_data",
    cont_cols: list = None,
    nn_cols: list = None,
    disp_dict: dict = None,
    cat_cols: list = None,
    verbose: bool = True,
) -> None:
    """
    Print and save a statistical summary for the train, validation, and test splits.

    Args:
        train (pd.DataFrame): Training set DataFrame.
        val (pd.DataFrame): Validation set DataFrame.
        test (pd.DataFrame): Test set DataFrame.
        outcome (str): Name of the outcome variable (default: "in_hosp_death").
        output_path (str): Directory to save the summary HTML file.
        cont_cols (list): List of continuous columns.
        nn_cols (list): List of non-normal columns.
        disp_dict (dict): Dictionary mapping original to display column names.
        cat_cols (list): List of categorical columns.
        verbose (bool): If True, print progress messages.

    Returns:
        None
    """

rename_fields

def rename_fields(col):
    """
    Helper function to rename drug and specialty feature names.

    Args:
        col (Any): Column name or tuple of column names.

    Returns:
        str: Joined string if input is a tuple, otherwise the original column name.
    """

read_from_txt

def read_from_txt(filepath: str, as_type="str") -> list:
    """Read from line-separated txt file.

    Args:
        filepath (str): Path to text file.

    Returns:
        list: List containing data.
    """