Skip to content

4_Summary

Summary page for the NHSSynth Evaluation Dashboard.

Provides an overall summary of synthetic data quality with optional LLM-generated insights.

compute_summary_metrics(evaluations, experiments)

Compute summary statistics from evaluations.

Parameters:

Name Type Description Default
evaluations dict

Dictionary of evaluation DataFrames by metric group.

required
experiments DataFrame

DataFrame of experiment configurations.

required

Returns:

Type Description
dict

Dictionary containing summary metrics.

Source code in src/nhssynth/modules/dashboard/pages/4_Summary.py
def _group_metric_summary(group_evals: pd.DataFrame) -> dict:
    """
    Summarise every numeric column of a metric-group DataFrame.

    Each column is coerced to numeric (non-numeric entries, e.g. failed metric
    computations, become NaN). Columns with no numeric values at all are
    skipped, as are columns that raise during processing.

    Returns:
        Dict of {column: {"mean", "min", "max", "rating"}}.
    """
    group_summary = {}
    for col in group_evals.columns:
        try:
            numeric_vals = pd.to_numeric(group_evals[col], errors="coerce")
            if numeric_vals.isna().all():
                continue
            mean_val = numeric_vals.mean()
            group_summary[col] = {
                "mean": mean_val,
                "min": numeric_vals.min(),
                "max": numeric_vals.max(),
                "rating": get_rating(mean_val),
            }
        except Exception:
            # Skip columns that cannot be coerced or aggregated at all.
            continue
    return group_summary


def compute_summary_metrics(evaluations: dict, experiments: pd.DataFrame) -> dict:
    """
    Compute summary statistics from evaluations.

    Args:
        evaluations: Dictionary of evaluation DataFrames by metric group
            (recognised keys: "table", "task", "privacy", "fairness").
        experiments: DataFrame of experiment configurations (currently unused
            by this function; kept for interface stability).

    Returns:
        Dictionary containing summary metrics per group. Any errors hit while
        processing fairness columns are collected under "_fairness_errors".
    """
    summary = {
        "table_metrics": {},
        "task_metrics": {},
        "privacy_metrics": {},
        "fairness_metrics": {},
        "columnwise_summary": {},
    }

    # Table-, task- and privacy-level groups share the same mean/min/max/rating
    # summary. (For privacy, higher distance / lower risk is better; failed
    # computations appear as non-numeric values and are coerced to NaN.)
    for group, key in (
        ("table", "table_metrics"),
        ("task", "task_metrics"),
        ("privacy", "privacy_metrics"),
    ):
        if group in evaluations:
            summary[key] = _group_metric_summary(evaluations[group])

    # Fairness metrics: compare real vs synthetic values per column.
    if "fairness" in evaluations:
        fairness_evals = evaluations["fairness"]
        for col in fairness_evals.columns:
            try:
                numeric_col = pd.to_numeric(fairness_evals[col], errors="coerce")
                if numeric_col.isna().all():
                    continue

                # Split the index into "Real" rows and everything else.
                # NOTE(review): plain pandas Indexes also expose
                # get_level_values (returning themselves for level 0), so the
                # tuple-based fallback below is only reached for non-pandas
                # index objects — confirm whether it is still needed.
                if hasattr(fairness_evals.index, "get_level_values"):
                    idx_values = fairness_evals.index.get_level_values(0)
                    real_idx = fairness_evals.index[idx_values == "Real"]
                    synth_idx = fairness_evals.index[idx_values != "Real"]
                else:
                    real_idx = [
                        i for i in fairness_evals.index if (isinstance(i, tuple) and i[0] == "Real") or i == "Real"
                    ]
                    synth_idx = [
                        i
                        for i in fairness_evals.index
                        if not ((isinstance(i, tuple) and i[0] == "Real") or i == "Real")
                    ]

                real_vals = (
                    pd.to_numeric(fairness_evals.loc[real_idx, col], errors="coerce") if len(real_idx) > 0 else None
                )
                synth_vals = (
                    pd.to_numeric(fairness_evals.loc[synth_idx, col], errors="coerce") if len(synth_idx) > 0 else None
                )

                # Hoist the repeated "has any usable numbers" checks.
                has_real = real_vals is not None and len(real_vals) > 0 and not real_vals.isna().all()
                has_synth = synth_vals is not None and len(synth_vals) > 0 and not synth_vals.isna().all()

                real_val = real_vals.mean() if has_real else None
                synth_val = synth_vals.mean() if has_synth else None

                # For fairness, lower is better (less disparity between groups).
                summary["fairness_metrics"][col] = {
                    "real": real_val,
                    "synthetic_mean": synth_val,
                    "synthetic_min": synth_vals.min() if has_synth else None,
                    "synthetic_max": synth_vals.max() if has_synth else None,
                    "real_rating": get_fairness_rating(real_val) if real_val is not None else ("N/A", "gray"),
                    "synthetic_rating": get_fairness_rating(synth_val) if synth_val is not None else ("N/A", "gray"),
                }
            except Exception as e:
                # Keep going, but record the failure for debugging.
                summary.setdefault("_fairness_errors", []).append(f"{col}: {str(e)}")
                continue

    return summary

generate_llm_summary(summary, api_key, model='claude-3-haiku-20240307')

Generate an LLM-powered summary of evaluation results.

Parameters:

Name Type Description Default
summary dict

Dictionary of computed summary metrics.

required
api_key str

Anthropic API key.

required
model str

Model to use for generation.

'claude-3-haiku-20240307'

Returns:

Type Description
Optional[str]

LLM-generated summary string, or None if generation fails.

Source code in src/nhssynth/modules/dashboard/pages/4_Summary.py
def generate_llm_summary(summary: dict, api_key: str, model: str = "claude-3-haiku-20240307") -> Optional[str]:
    """
    Generate an LLM-powered summary of evaluation results.

    Args:
        summary: Dictionary of computed summary metrics.
        api_key: Anthropic API key.
        model: Model to use for generation.

    Returns:
        LLM-generated summary string, or None if generation fails.
    """
    # The anthropic dependency is optional — degrade gracefully if absent.
    try:
        import anthropic
    except ImportError:
        st.warning(
            "The `anthropic` package is not installed. Install it with `pip install anthropic` to enable LLM summaries."
        )
        return None

    # Assemble the prompt from whichever metric groups are populated.
    prompt_parts = ["Analyze these synthetic data evaluation metrics and provide a concise summary:\n"]

    # The first three groups share the same mean/range line format.
    numeric_sections = (
        ("table_metrics", "\nTable-Level Fidelity Metrics:"),
        ("task_metrics", "\nDownstream Task Utility Metrics:"),
        ("privacy_metrics", "\nPrivacy Metrics:"),
    )
    for key, heading in numeric_sections:
        if not summary[key]:
            continue
        prompt_parts.append(heading)
        for metric, data in summary[key].items():
            if pd.isna(data["mean"]):
                continue
            prompt_parts.append(f"- {metric}: {data['mean']:.3f} (range: {data['min']:.3f}-{data['max']:.3f})")

    # Fairness uses real-vs-synthetic pairs rather than mean/range.
    if summary["fairness_metrics"]:
        prompt_parts.append("\nFairness Metrics (lower is better - less disparity between groups):")
        for metric, data in summary["fairness_metrics"].items():
            real_str = "N/A" if data["real"] is None else f"{data['real']:.3f}"
            synth_str = "N/A" if data["synthetic_mean"] is None else f"{data['synthetic_mean']:.3f}"
            prompt_parts.append(f"- {metric}: Real={real_str}, Synthetic={synth_str}")

    prompt_parts.extend(
        [
            "\nProvide a brief summary (2-3 paragraphs) that:",
            "1. Assesses overall synthetic data quality",
            "2. Highlights strengths and areas for improvement",
            "3. Provides actionable recommendations if applicable",
            "\nUse clear, non-technical language where possible.",
        ]
    )

    prompt = "\n".join(prompt_parts)

    try:
        client = anthropic.Anthropic(api_key=api_key)
        response = client.messages.create(model=model, max_tokens=1024, messages=[{"role": "user", "content": prompt}])
        return response.content[0].text
    except Exception as e:
        st.error(f"Error generating LLM summary: {e}")
        return None

generate_static_summary(summary)

Generate a static text summary without LLM.

Parameters:

Name Type Description Default
summary dict

Dictionary of computed summary metrics.

required

Returns:

Type Description
str

Formatted summary string.

Source code in src/nhssynth/modules/dashboard/pages/4_Summary.py
def _rating_table(title: str, metrics: dict) -> list[str]:
    """
    Render one "| Metric | Mean Score | Rating |" markdown table section.

    Args:
        title: Section heading text (without the "### " prefix).
        metrics: Dict of {metric: {"mean", "rating": (label, color)}}.

    Returns:
        List of markdown lines, ending with a blank spacer line.
    """
    table_lines = [
        f"### {title}\n",
        "| Metric | Mean Score | Rating |",
        "|--------|------------|--------|",
    ]
    for metric, data in metrics.items():
        rating, color = data["rating"]
        table_lines.append(f"| {metric} | {data['mean']:.3f} | :{color}[{rating}] |")
    table_lines.append("")
    return table_lines


def generate_static_summary(summary: dict) -> str:
    """
    Generate a static text summary without LLM.

    Args:
        summary: Dictionary of computed summary metrics.

    Returns:
        Formatted (markdown) summary string.
    """
    lines = []
    lines.append("## Evaluation Summary\n")

    # The three numeric metric groups share the same table layout.
    if summary["table_metrics"]:
        lines.extend(_rating_table("Table-Level Fidelity", summary["table_metrics"]))
    if summary["task_metrics"]:
        lines.extend(_rating_table("Downstream Task Utility", summary["task_metrics"]))
    if summary["privacy_metrics"]:
        lines.extend(_rating_table("Privacy Metrics", summary["privacy_metrics"]))

    # Fairness metrics: compare real vs synthetic disparity per metric.
    if summary["fairness_metrics"]:
        lines.append("### Fairness Metrics\n")
        lines.append("*Lower values indicate better fairness (less disparity between groups)*\n")
        lines.append("| Metric | Real Data | Synthetic Data | Fairness Preserved? |")
        lines.append("|--------|-----------|----------------|---------------------|")
        for metric, data in summary["fairness_metrics"].items():
            real_val = f"{data['real']:.3f}" if data["real"] is not None else "N/A"
            synth_val = f"{data['synthetic_mean']:.3f}" if data["synthetic_mean"] is not None else "N/A"
            real_rating, real_color = data["real_rating"]
            synth_rating, synth_color = data["synthetic_rating"]

            # Label how well the synthetic data preserves the real data's
            # fairness (synthetic similar or better counts as preserved).
            if data["real"] is not None and data["synthetic_mean"] is not None:
                diff = abs(data["synthetic_mean"] - data["real"])
                if diff <= 0.05:
                    preserved = ":green[Yes]"
                elif data["synthetic_mean"] < data["real"]:
                    preserved = ":green[Improved]"
                elif diff <= 0.10:
                    preserved = ":orange[Mostly]"
                else:
                    preserved = ":red[No]"
            else:
                preserved = "N/A"

            lines.append(
                f"| {metric} | {real_val} (:{real_color}[{real_rating}]) | {synth_val} (:{synth_color}[{synth_rating}]) | {preserved} |"
            )
        lines.append("")

    # Interpretation: prose assessment derived from group averages.
    lines.append("### Interpretation\n")

    # Overall fidelity assessment.
    if summary["table_metrics"]:
        fidelity_scores = [d["mean"] for d in summary["table_metrics"].values() if not pd.isna(d["mean"])]
        if fidelity_scores:
            avg_fidelity = sum(fidelity_scores) / len(fidelity_scores)
            rating, _ = get_rating(avg_fidelity)
            lines.append(f"**Overall Fidelity**: {rating} (average score: {avg_fidelity:.3f})")
            if avg_fidelity >= 0.90:
                lines.append("- The synthetic data closely matches the statistical properties of the real data.")
            elif avg_fidelity >= 0.80:
                lines.append("- The synthetic data reasonably captures the main patterns in the real data.")
            else:
                lines.append("- There are notable differences between synthetic and real data distributions.")

    # Task utility assessment.
    if summary["task_metrics"]:
        task_scores = [d["mean"] for d in summary["task_metrics"].values() if not pd.isna(d["mean"])]
        if task_scores:
            avg_task = sum(task_scores) / len(task_scores)
            rating, _ = get_rating(avg_task)
            lines.append(f"\n**Downstream Utility**: {rating} (average score: {avg_task:.3f})")
            if avg_task >= 0.85:
                lines.append("- Models trained on synthetic data perform comparably to those trained on real data.")
            elif avg_task >= 0.70:
                lines.append("- Moderate utility for downstream tasks; some performance degradation expected.")
            else:
                lines.append("- Limited utility for downstream ML tasks; consider tuning generation parameters.")

    # Fairness assessment: count metrics whose disparity is close to, or
    # better than, the real data's. NOTE(review): this uses a slightly looser
    # criterion (diff <= 0.10 or improved) than the per-row table labels.
    if summary["fairness_metrics"]:
        preserved_count = 0
        total_count = 0
        for data in summary["fairness_metrics"].values():
            if data["real"] is not None and data["synthetic_mean"] is not None:
                total_count += 1
                diff = abs(data["synthetic_mean"] - data["real"])
                if diff <= 0.10 or data["synthetic_mean"] < data["real"]:
                    preserved_count += 1

        if total_count > 0:
            preservation_rate = preserved_count / total_count
            lines.append(f"\n**Fairness Preservation**: {preserved_count}/{total_count} metrics maintained")
            if preservation_rate >= 0.8:
                lines.append("- Synthetic data maintains similar fairness properties to the original data.")
            elif preservation_rate >= 0.5:
                lines.append("- Some fairness properties are preserved, but others show increased disparity.")
            else:
                lines.append("- Fairness properties differ significantly; review protected attribute handling.")

    return "\n".join(lines)

get_fairness_rating(value)

Get a rating and color for a fairness metric value. For fairness metrics, LOWER values are better (less disparity between groups).

Parameters:

Name Type Description Default
value float

The fairness metric value (difference between groups, 0-1).

required

Returns:

Type Description
tuple[str, str]

Tuple of (rating string, color).

Source code in src/nhssynth/modules/dashboard/pages/4_Summary.py
def get_fairness_rating(value: float) -> tuple[str, str]:
    """
    Get a rating and color for a fairness metric value.
    For fairness metrics, LOWER values are better (less disparity between groups).

    Args:
        value: The fairness metric value (difference between groups, 0-1).

    Returns:
        Tuple of (rating string, color).
    """
    if pd.isna(value):
        return "N/A", "gray"

    # Walk the bands from strictest to loosest; first threshold met wins.
    bands = (
        ("excellent", "Excellent", "green"),
        ("good", "Good", "blue"),
        ("fair", "Fair", "orange"),
    )
    for threshold_key, label, colour in bands:
        if value <= FAIRNESS_THRESHOLDS[threshold_key]:
            return label, colour
    return "Poor", "red"

get_rating(value, higher_is_better=True)

Get a rating and color for a metric value.

Parameters:

Name Type Description Default
value float

The metric value (typically 0-1).

required
higher_is_better bool

Whether higher values are better.

True

Returns:

Type Description
tuple[str, str]

Tuple of (rating string, color).

Source code in src/nhssynth/modules/dashboard/pages/4_Summary.py
def get_rating(value: float, higher_is_better: bool = True) -> tuple[str, str]:
    """
    Get a rating and color for a metric value.

    Args:
        value: The metric value (typically 0-1).
        higher_is_better: Whether higher values are better.

    Returns:
        Tuple of (rating string, color).
    """
    if pd.isna(value):
        return "N/A", "gray"

    # Normalise so that a larger score is always better before banding.
    score = value if higher_is_better else 1 - value

    bands = (
        ("excellent", "Excellent", "green"),
        ("good", "Good", "blue"),
        ("fair", "Fair", "orange"),
    )
    for threshold_key, label, colour in bands:
        if score >= METRIC_THRESHOLDS[threshold_key]:
            return label, colour
    return "Poor", "red"