Web UI

Warning

The components described here are not part of the standard EvalSense API and may change without notice. This part of the documentation is only meant to be used for development purposes.

App

Module evalsense.webui.app.

Functions:

- launch_webui: Launches the EvalSense Gradio web UI.

launch_webui

launch_webui(
    password: str | None = None,
    no_auth: bool = False,
    share: bool = False,
    port: int = 7860,
)

Launches the EvalSense Gradio web UI.

Parameters:

- password (str | None): Password for authentication. If None, a random password is generated. Default: None.
- no_auth (bool): If True, disables authentication. Default: False.
- share (bool): If True, enables Gradio public sharing. This will make the app publicly accessible over the internet. Use with caution. Default: False.
- port (int): Port to run the Gradio server on. Default: 7860.
Source code in evalsense/webui/app.py
def launch_webui(
    password: str | None = None,
    no_auth: bool = False,
    share: bool = False,
    port: int = 7860,
):
    """Launches the EvalSense Gradio web UI.

    Args:
        password: Password for authentication. If None, a random password is generated.
        no_auth: If True, disables authentication.
        share: If True, enables Gradio public sharing. This will make the app publicly
            accessible over the internet. Use with caution.
        port: Port to run the Gradio server on.
    """
    theme = Default(primary_hue="blue")
    with gr.Blocks(theme=theme, title="EvalSense") as demo:
        state = gr.State(get_initial_state())
        gr.Markdown("# 🔎 EvalSense")
        gr.Markdown(
            "To run an evaluation, configure its settings on the individual tabs and start it from the **Execution** tab. For EvalSense documentation and guidance regarding the available evaluation metrics, please visit the [EvalSense homepage](https://nhsengland.github.io/evalsense/)."
        )
        with gr.Tab("Data"):
            data_tab(state)
        with gr.Tab("Generation"):
            generation_tab(state)
        with gr.Tab("Models"):
            models_tab(state)
        with gr.Tab("Evaluators"):
            evaluators_tab(state)
        with gr.Tab("Execution"):
            execution_tab(state)
        with gr.Tab("Results"):
            results_tab(state)

        # Regularly discover projects and update the state
        timer = gr.Timer(3, active=True)
        timer.tick(fn=discover_projects, inputs=[state], outputs=[state])

    if share:
        print("* CAUTION: Gradio public sharing is enabled!")

    if no_auth:
        print("* CAUTION: Authentication disabled!")
        demo.launch(share=share, server_port=port)
    else:
        if password is None:
            password = secrets.token_urlsafe(20)
        print("* Server username: user")
        print(f"* Server password: {password}")
        demo.launch(
            share=share,
            auth=("user", password),
            server_port=port,
        )
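
A minimal usage sketch based on the signature above (the password value and port are illustrative; with share left at its default, the server is only reachable locally):

from evalsense.webui.app import launch_webui

# Start the UI on a custom port with a fixed password instead of a generated one.
launch_webui(password="my-dev-password", port=8080)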

Execution

Module evalsense.webui.execution.

Functions:

- execute_evaluation: Executes the evaluation based on the current application state.
- execute_meta_evaluation: Executes a meta-evaluation based on the current application state.
- execute_standard_evaluation: Executes a standard evaluation for the given application state.
- get_dataset_manager: Creates and returns a DatasetManager based on the current application state.
- get_evaluators: Creates and returns a list of Evaluator instances based on the current application state.
- get_model_configs: Creates and returns a list of ModelConfig instances based on the current application state.

execute_evaluation

execute_evaluation(state: AppState)

Executes the evaluation based on the current application state.

Parameters:

- state (AppState): The current application state. Required.
Source code in evalsense/webui/execution.py
def execute_evaluation(state: AppState):
    """Executes the evaluation based on the current application state.

    Args:
        state (AppState): The current application state.
    """
    if state["is_meta_eval"]:
        execute_meta_evaluation(state)
    else:
        execute_standard_evaluation(state)
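
A dispatch sketch (assumes a state already populated as in the examples below; with is_meta_eval unset, the call is equivalent to execute_standard_evaluation(state)):

from evalsense.webui.execution import execute_evaluation
from evalsense.webui.state import get_initial_state

state = get_initial_state()
# ... populate dataset, prompt, model, and evaluator fields here ...
state["is_meta_eval"] = False
execute_evaluation(state)  # routes to execute_standard_evaluation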

execute_meta_evaluation

execute_meta_evaluation(state: AppState)

Executes a meta-evaluation based on the current application state.

Parameters:

- state (AppState): The current application state. Required.
Source code in evalsense/webui/execution.py
def execute_meta_evaluation(state: AppState):
    """Executes a meta-evaluation based on the current application state.

    Args:
        state (AppState): The current application state.
    """
    dataset_manager = get_dataset_manager(state)

    tasks = []
    for tier_id, perturbation_tier_subprompt in enumerate(
        state["perturbation_tier_subprompts"]
    ):
        system_prompt = state["system_prompt"].replace(
            "{perturbation_tier_subprompt}", perturbation_tier_subprompt
        )
        user_prompt = state["user_prompt"].replace(
            "{perturbation_tier_subprompt}", perturbation_tier_subprompt
        )
        generation_steps = GenerationSteps(
            name=f"{state['generation_steps_name']} (Tier {tier_id + 1})",
            steps=[
                system_message(system_prompt),
                prompt_template(user_prompt),
                generate(),
            ],
        )

        # We use a RecordToSample function to add the perturbation tier
        # to the metadata
        def perturbation_record_to_sample(
            record: dict[str, Any],
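            # Binding tier_id as a default argument freezes the current tier
            # for this closure (avoids the late-binding closure pitfall).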
            tier_id: int = tier_id,
        ) -> Sample:
            return Sample(
                input=record[state["input_field_name"]],
                target=record.get(state["target_field_name"], ""),
                choices=record.get(state["choices_field_name"]),
                id=record.get(state["id_field_name"]),
                metadata={k: record[k] for k in state["metadata_fields"]}
                | {"perturbation_tier": tier_id},
            )

        perturb_task_preprocessor = DefaultTaskPreprocessor(name="Perturbation")
        task_config = TaskConfig(
            dataset_manager=dataset_manager,
            generation_steps=generation_steps,
            field_spec=perturbation_record_to_sample,
            task_preprocessor=perturb_task_preprocessor,
        )
        tasks.append(task_config)

    model_configs = get_model_configs(state)
    evaluators = get_evaluators(state)
    experiment_config = ExperimentBatchConfig(
        tasks=tasks, model_configs=model_configs, evaluators=evaluators
    )
    project = Project(name=state["project_name"])
    pipeline = Pipeline(experiments=experiment_config, project=project)
    pipeline.run()
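
A configuration sketch for the meta-evaluation path. The {perturbation_tier_subprompt} placeholder is substituted per tier exactly as in the source above; the dataset name and subprompts are illustrative placeholders, and the {prompt} placeholder assumes the Inspect-style prompt_template used in the generation steps:

from evalsense.webui.execution import execute_meta_evaluation
from evalsense.webui.state import get_initial_state

state = get_initial_state()
state["is_meta_eval"] = True
state["dataset_name"] = "my_dataset"  # hypothetical registered dataset name
state["system_prompt"] = "You are a careful summarizer. {perturbation_tier_subprompt}"
state["user_prompt"] = "Summarize the following text: {prompt}"
state["perturbation_tier_subprompts"] = [
    "Write an accurate summary.",        # tier 1
    "Introduce subtle factual errors.",  # tier 2
]
# model_configs and evaluator_configs are set as in the standard example below.
execute_meta_evaluation(state)  # runs one task per tier, tagged via perturbation_tier metadata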

execute_standard_evaluation

execute_standard_evaluation(state: AppState)

Executes a standard evaluation for the given application state.

Parameters:

- state (AppState): The current application state. Required.
Source code in evalsense/webui/execution.py
def execute_standard_evaluation(state: AppState):
    """Executes a standard evaluation for the given application state.

    Args:
        state (AppState): The current application state.
    """
    dataset_manager = get_dataset_manager(state)
    generation_steps = GenerationSteps(
        name=state["generation_steps_name"],
        steps=[
            system_message(state["system_prompt"]),
            prompt_template(state["user_prompt"]),
            generate(),
        ],
    )
    field_spec = FieldSpec(
        input=state["input_field_name"],
        target=state["target_field_name"],
        choices=state["choices_field_name"],
        id=state["id_field_name"],
        metadata=state["metadata_fields"],
    )
    model_configs = get_model_configs(state)
    evaluators = get_evaluators(state)
    task_config = TaskConfig(
        dataset_manager=dataset_manager,
        generation_steps=generation_steps,
        field_spec=field_spec,
    )
    experiment_config = ExperimentBatchConfig(
        tasks=[task_config], model_configs=model_configs, evaluators=evaluators
    )
    project = Project(name=state["project_name"])
    pipeline = Pipeline(experiments=experiment_config, project=project)
    pipeline.run()
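
A minimal end-to-end sketch for the standard path. The dataset name, model, and evaluator values are illustrative placeholders rather than names shipped with EvalSense:

from evalsense.webui.execution import execute_standard_evaluation
from evalsense.webui.state import get_initial_state

state = get_initial_state()
state["dataset_name"] = "my_dataset"  # hypothetical registered dataset name
state["dataset_splits"] = ["test"]
state["system_prompt"] = "You are a helpful assistant."
state["user_prompt"] = "{prompt}"  # assumes Inspect-style prompt templating
state["model_configs"] = [
    {"model_name": "openai/gpt-4o-mini", "model_args": {}, "generation_args": {}},
]
state["evaluator_configs"] = [
    {"evaluator_name": "BLEU", "evaluator_args": {}},  # hypothetical evaluator name
]
state["project_name"] = "Demo"
execute_standard_evaluation(state)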

get_dataset_manager

get_dataset_manager(state: AppState) -> DatasetManager

Creates and returns a DatasetManager based on the current application state.

Parameters:

- state (AppState): The current application state. Required.

Returns:

- DatasetManager: The instantiated DatasetManager.

Source code in evalsense/webui/execution.py
def get_dataset_manager(state: AppState) -> DatasetManager:
    """Creates and returns a DatasetManager based on the current application state.

    Args:
        state (AppState): The current application state.

    Returns:
        DatasetManager: The instantiated DatasetManager.
    """
    return DatasetManager.create(
        name=state["dataset_name"],
        splits=state["dataset_splits"],
        version=state["dataset_version"],
    )
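
The helper is a thin wrapper over DatasetManager.create. A sketch with placeholder values:

from evalsense.webui.execution import get_dataset_manager
from evalsense.webui.state import get_initial_state

state = get_initial_state()
state["dataset_name"] = "my_dataset"  # hypothetical registered dataset name
state["dataset_splits"] = ["validation", "test"]
state["dataset_version"] = None       # None selects the default version

manager = get_dataset_manager(state)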

get_evaluators

get_evaluators(state: AppState) -> list[Evaluator]

Creates and returns a list of Evaluator instances based on the current application state.

Parameters:

- state (AppState): The current application state. Required.

Returns:

- list[Evaluator]: The list of instantiated Evaluator objects.

Source code in evalsense/webui/execution.py
def get_evaluators(state: AppState) -> list[Evaluator]:
    """Creates and returns a list of Evaluator instances based on the current application state.

    Args:
        state (AppState): The current application state.

    Returns:
        list[Evaluator]: The list of instantiated Evaluator objects.
    """
    evaluators: list[Evaluator] = []
    for evaluator_config in state["evaluator_configs"]:
        configurator = EvaluatorConfigurator.create(evaluator_config["evaluator_name"])
        evaluator = configurator.instantiate_evaluator(
            **evaluator_config["evaluator_args"]
        )
        evaluators.append(evaluator)
    return evaluators
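
Each entry in state["evaluator_configs"] follows the AppEvaluatorConfig shape documented under State below. A sketch, with a hypothetical evaluator name that would need to exist in the EvaluatorConfigurator registry:

from evalsense.webui.execution import get_evaluators
from evalsense.webui.state import get_initial_state

state = get_initial_state()
state["evaluator_configs"] = [
    {"evaluator_name": "BLEU", "evaluator_args": {}},  # hypothetical registry name
]
evaluators = get_evaluators(state)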

get_model_configs

get_model_configs(state: AppState) -> list[ModelConfig]

Creates and returns a list of ModelConfig instances based on the current application state.

Parameters:

- state (AppState): The current application state. Required.

Returns:

- list[ModelConfig]: The list of instantiated ModelConfig objects.

Source code in evalsense/webui/execution.py
def get_model_configs(state: AppState) -> list[ModelConfig]:
    """Creates and returns a list of ModelConfig based on the current application state.

    Args:
        state (AppState): The current application state.

    Returns:
        list[ModelConfig]: The list of instantiated ModelConfig objects.
    """
    return [
        ModelConfig(
            m["model_name"],
            model_args=m["model_args"],
            generation_args=GenerateConfigArgs(**m["generation_args"]),
        )
        for m in state["model_configs"]
    ]
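
Each entry in state["model_configs"] follows the AppModelConfig shape documented under State below; generation_args must contain keys accepted by Inspect's GenerateConfigArgs. A sketch with an illustrative model name:

from evalsense.webui.execution import get_model_configs
from evalsense.webui.state import get_initial_state

state = get_initial_state()
state["model_configs"] = [
    {
        "model_name": "openai/gpt-4o-mini",        # illustrative model identifier
        "model_args": {},
        "generation_args": {"temperature": 0.0},   # assumes a GenerateConfigArgs key
    },
]
model_configs = get_model_configs(state)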

State

Module evalsense.webui.state.

Classes:

- AppEvaluatorConfig: Evaluator configuration to be used within the Gradio application.
- AppModelConfig: Model configuration to be used within the Gradio application.
- AppState: Application state to be used within the Gradio application.

Functions:

- get_initial_state: Provides the initial application state.

AppEvaluatorConfig

Bases: TypedDict

Evaluator configuration to be used within the Gradio application.

Attributes:

- evaluator_name (str): The name of the evaluator to use.
- evaluator_args (dict[str, Any]): The arguments to pass to the evaluator.

Source code in evalsense/webui/state.py
class AppEvaluatorConfig(TypedDict):
    """Evaluator configuration to be used within the Gradio application.

    Attributes:
        evaluator_name (str): The name of the evaluator to use.
        evaluator_args (dict[str, Any]): The arguments to pass to the evaluator.
    """

    evaluator_name: str
    evaluator_args: dict[str, Any]

AppModelConfig

Bases: TypedDict

Model configuration to be used within the Gradio application.

Attributes:

- model_name (str): The name of the model to use.
- model_args (dict[str, Any]): The arguments to pass to the model.
- generation_args (dict[str, Any]): The arguments to use for text generation.

Source code in evalsense/webui/state.py
class AppModelConfig(TypedDict):
    """Model configuration to be used within the Gradio application.

    Attributes:
        model_name (str): The name of the model to use.
        model_args (dict[str, Any]): The arguments to pass to the model.
        generation_args (dict[str, Any]): The arguments to use for text generation.
    """

    model_name: str
    model_args: dict[str, Any]
    generation_args: dict[str, Any]

AppState

Bases: TypedDict

Application state to be used within the Gradio application.

Attributes:

- dataset_name (str): The name of the dataset to evaluate on.
- dataset_splits (list[str]): The used splits of the dataset.
- dataset_version (str | None): The used version of the dataset.
- input_field_name (str): The name of the main input field in the dataset.
- target_field_name (str): The name of the target field in the dataset.
- choices_field_name (str): The name of the answer choices field in the dataset.
- id_field_name (str): The name of the ID field in the dataset.
- metadata_fields (list[str]): The names of the metadata fields in the dataset.
- is_meta_eval (bool): Whether the evaluation to be performed is a meta-evaluation.
- perturbation_tiers (int): The number of perturbation tiers to use for meta-evaluation.
- perturbation_tier_subprompts (list[str]): The subprompts to use for each perturbation tier.
- generation_steps_name (str): The name of the used generation strategy.
- system_prompt (str): The system prompt to use for generation.
- user_prompt (str): The user prompt to use for generation.
- model_configs (list[AppModelConfig]): The model configurations to use for generation.
- evaluator_configs (list[AppEvaluatorConfig]): The evaluator configurations to use for evaluation.
- project_name (str): The name of the evaluation project.
- existing_projects (list[str]): The list of existing evaluation projects.

Source code in evalsense/webui/state.py
class AppState(TypedDict):
    """Application state to be used within the Gradio application.

    Attributes:
        dataset_name (str): The name of the dataset to evaluate on.
        dataset_splits (list[str]): The used splits of the dataset.
        dataset_version (str | None): The used version of the dataset.
        input_field_name (str): The name of the main input field in the dataset.
        target_field_name (str): The name of the target field in the dataset.
        choices_field_name (str): The name of the answer choices field in the dataset.
        id_field_name (str): The name of the ID field in the dataset.
        metadata_fields (list[str]): The names of the metadata fields in the dataset.
        is_meta_eval (bool): Whether the evaluation to be performed is a meta-evaluation.
        perturbation_tiers (int): The number of perturbation tiers to use for
            meta-evaluation.
        perturbation_tier_subprompts (list[str]): The subprompts to use for each
            perturbation tier.
        generation_steps_name (str): The name of the used generation strategy.
        system_prompt (str): The system prompt to use for generation.
        user_prompt (str): The user prompt to use for generation.
        model_configs (list[AppModelConfig]): The model configurations to use for
            generation.
        evaluator_configs (list[AppEvaluatorConfig]): The evaluator configurations
            to use for evaluation.
        project_name (str): The name of the evaluation project.
        existing_projects (list[str]): The list of existing evaluation projects.
    """

    dataset_name: str
    dataset_splits: list[str]
    dataset_version: str | None
    input_field_name: str
    target_field_name: str
    choices_field_name: str
    id_field_name: str
    metadata_fields: list[str]
    is_meta_eval: bool
    perturbation_tiers: int
    perturbation_tier_subprompts: list[str]
    generation_steps_name: str
    system_prompt: str
    user_prompt: str
    model_configs: list[AppModelConfig]
    evaluator_configs: list[AppEvaluatorConfig]
    project_name: str
    existing_projects: list[str]

get_initial_state

get_initial_state() -> AppState

Provides the initial application state.

Returns:

- AppState: The initial application state.

Source code in evalsense/webui/state.py
def get_initial_state() -> AppState:
    """Provides the initial application state.

    Returns:
        AppState: The initial application state.
    """
    return {
        "dataset_name": "",
        "dataset_splits": list(),
        "dataset_version": None,
        "input_field_name": "input",
        "target_field_name": "target",
        "choices_field_name": "choices",
        "id_field_name": "id",
        "metadata_fields": list(),
        "is_meta_eval": False,
        "perturbation_tiers": 2,
        "perturbation_tier_subprompts": list(),
        "generation_steps_name": "Default",
        "system_prompt": "",
        "user_prompt": "",
        "model_configs": list(),
        "evaluator_configs": list(),
        "project_name": "Default",
        "existing_projects": list(),
    }
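
The returned value is a plain dictionary (AppState is a TypedDict), so fields can be updated directly:

from evalsense.webui.state import get_initial_state

state = get_initial_state()
assert state["perturbation_tiers"] == 2
state["dataset_name"] = "my_dataset"  # illustrative dataset name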

Utils

Module evalsense.webui.utils.

Classes:

- ListenerConfig: Configuration for a textbox listener.

Functions:

- dict_parser: Parses a string representation of a dictionary into an actual dictionary.
- discover_projects: Discovers existing evaluation projects in the projects directory.
- empty_is_none_parser_for: Returns a parser function that returns None for empty strings.
- list_parser: Parses a comma-separated string into a list of strings.
- setup_listeners: Sets up listeners updating the application state based on user inputs.

ListenerConfig

Bases: TypedDict

Configuration for a textbox listener.

Attributes:

- state_field (str): The name of the state field to update.
- parser (Callable[[str], Any] | None): An optional parser function to process the input value.

Source code in evalsense/webui/utils.py
class ListenerConfig(TypedDict):
    """Configuration for a textbox listener.

    Attributes:
        state_field (str): The name of the state field to update.
        parser (Callable[[str], Any] | None): An optional parser function
            to process the input value.
    """

    state_field: str
    parser: Callable[[str], Any] | None

dict_parser

dict_parser(input_string: str) -> dict[str, Any]

Parses a string representation of a dictionary into an actual dictionary.

Parameters:

- input_string (str): The input string to parse. Required.

Returns:

- dict[str, Any]: The parsed dictionary.

Source code in evalsense/webui/utils.py
def dict_parser(input_string: str) -> dict[str, Any]:
    """Parses a string representation of a dictionary into an actual dictionary.

    Args:
        input_string (str): The input string to parse.

    Returns:
        dict[str, Any]: The parsed dictionary.
    """
    if not input_string:
        return {}
    try:
        parsed = ast.literal_eval(input_string)
        # Reject literals that parse successfully but are not dictionaries.
        if not isinstance(parsed, dict):
            raise ValueError
        return parsed
    except Exception:
        raise gr.Error(f"Invalid dictionary format: {input_string}")
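
Because the parser relies on ast.literal_eval, the input must use Python literal syntax:

from evalsense.webui.utils import dict_parser

dict_parser("{'temperature': 0.7, 'max_tokens': 256}")  # -> {'temperature': 0.7, 'max_tokens': 256}
dict_parser("")                                          # -> {}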

discover_projects

discover_projects(state: AppState) -> AppState

Discovers existing evaluation projects in the projects directory.

Parameters:

- state (AppState): The current application state. Required.

Returns:

- AppState: The updated application state with the list of existing projects.

Source code in evalsense/webui/utils.py
def discover_projects(state: AppState) -> AppState:
    """Discovers existing evaluation projects in the projects directory.

    Args:
        state (AppState): The current application state.

    Returns:
        AppState: The updated application state with the list of existing projects.
    """
    try:
        projects = [entry.name for entry in PROJECTS_PATH.iterdir() if entry.is_dir()]
    except FileNotFoundError:
        projects = []
    state["existing_projects"] = projects
    return state
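
A usage sketch (the projects directory is given by PROJECTS_PATH; if it does not exist yet, the list is simply empty):

from evalsense.webui.state import get_initial_state
from evalsense.webui.utils import discover_projects

state = get_initial_state()
state = discover_projects(state)
print(state["existing_projects"])  # e.g. [] on a fresh installation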

empty_is_none_parser_for

empty_is_none_parser_for(
    type: type,
) -> Callable[[str], Any | None]

Returns a parser function that returns None for empty strings.

Parameters:

- type (type): The type of the value to parse. Required.

Returns:

- Callable[[str], Any | None]: The parser function.

Source code in evalsense/webui/utils.py
def empty_is_none_parser_for(type: type) -> Callable[[str], Any | None]:
    """Returns a parser function that returns None for empty strings.

    Args:
        type (type): The type of the value to parse.

    Returns:
        Callable[[str], Any | None]: The parser function.
    """

    def parser(input_string: str) -> Any | None:
        if not input_string:
            return None
        try:
            return type(input_string)
        except Exception:
            raise ValueError(f"Unable to parse {input_string} as {type.__name__}.")

    return parser
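
A runnable sketch with int as the target type:

from evalsense.webui.utils import empty_is_none_parser_for

parse_port = empty_is_none_parser_for(int)
parse_port("8080")  # -> 8080
parse_port("")      # -> None
parse_port("abc")   # raises ValueError: Unable to parse abc as int.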

list_parser

list_parser(input_string: str) -> list[str]

Parses a comma-separated string into a list of strings.

Parameters:

- input_string (str): The input string to parse. Required.

Returns:

- list[str]: A list containing the parsed strings.

Source code in evalsense/webui/utils.py
def list_parser(input_string: str) -> list[str]:
    """Parses a comma-separated string into a list of strings.

    Args:
        input_string (str): The input string to parse.

    Returns:
        list[str]: A list containing the parsed strings.
    """
    # An empty input yields no items rather than a single empty string.
    if not input_string:
        return []
    return input_string.replace(" ", "").split(",")
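
A runnable sketch (spaces are stripped, so individual items cannot contain spaces):

from evalsense.webui.utils import list_parser

list_parser("input, target, id")  # -> ['input', 'target', 'id']
list_parser("")                   # -> []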

setup_listeners

setup_listeners(
    listener_config: dict[GradioInput, ListenerConfig],
    state: State,
)

Sets up listeners updating the application state based on user inputs.

Parameters:

- listener_config (dict[GradioInput, ListenerConfig]): The configuration specifying the parsers for processing user inputs and the corresponding state fields to update. Required.
- state (gr.State): The current state of the Gradio application. Required.
Source code in evalsense/webui/utils.py
def setup_listeners(
    listener_config: dict[GradioInput, ListenerConfig],
    state: gr.State,
):
    """Sets up listeners updating the application state based on user inputs.

    Args:
        listener_config (dict[GradioInput, ListenerConfig]): The configuration
            specifying the parsers for processing user inputs and the corresponding
            state fields to update.
        state (gr.State): The current state of the Gradio application.
    """
    for input_element, element_config in listener_config.items():

        @input_element.change(inputs=[input_element, state], outputs=[state])
        def update_field(
            entered_value: str,
            state: AppState,
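            # Binding element_config as a default argument freezes the current
            # config for each listener (avoids the late-binding closure pitfall).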
            config: ListenerConfig = element_config,
        ):
            value = entered_value
            if config["parser"] is not None:
                value = config["parser"](entered_value)
            state[config["state_field"]] = value
            return state
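
A wiring sketch inside a Gradio Blocks context (the textboxes and state fields are illustrative; list_parser and get_initial_state are documented above):

import gradio as gr

from evalsense.webui.state import get_initial_state
from evalsense.webui.utils import list_parser, setup_listeners

with gr.Blocks() as demo:
    state = gr.State(get_initial_state())
    dataset_name = gr.Textbox(label="Dataset name")
    metadata_fields = gr.Textbox(label="Metadata fields (comma-separated)")
    setup_listeners(
        {
            dataset_name: {"state_field": "dataset_name", "parser": None},
            metadata_fields: {"state_field": "metadata_fields", "parser": list_parser},
        },
        state,
    )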