Web UI

Warning

The components described here are not part of the standard EvalSense API and may change without notice. This part of the documentation is only meant to be used for development purposes.

App

Module evalsense.webui.app.

Functions:

- launch_webui: Launches the EvalSense Gradio web UI.

launch_webui

launch_webui(
    password: str | None = None,
    no_auth: bool = False,
    share: bool = False,
    port: int = 7860,
)

Launches the EvalSense Gradio web UI.

Parameters:

- password (str | None): Password for authentication. If None, a random password is generated. Default: None.
- no_auth (bool): If True, disables authentication. Default: False.
- share (bool): If True, enables Gradio public sharing. This will make the app publicly accessible over the internet. Use with caution. Default: False.
- port (int): Port to run the Gradio server on. Default: 7860.
Source code in evalsense/webui/app.py
def launch_webui(
    password: str | None = None,
    no_auth: bool = False,
    share: bool = False,
    port: int = 7860,
):
    """Launches the EvalSense Gradio web UI.

    Args:
        password: Password for authentication. If None, a random password is generated.
        no_auth: If True, disables authentication.
        share: If True, enables Gradio public sharing. This will make the app publicly
            accessible over the internet. Use with caution.
        port: Port to run the Gradio server on.
    """
    theme = Default(primary_hue="blue")
    with gr.Blocks(theme=theme, title="EvalSense") as demo:
        state = gr.State(get_initial_state())
        gr.Markdown("# 🔎 EvalSense")
        gr.Markdown(
            "To run an evaluation, configure its settings on the individual tabs and start it from the **Execution** tab. For EvalSense documentation and guidance regarding the available evaluation metrics, please visit the [EvalSense homepage](https://nhsengland.github.io/evalsense/)."
        )
        with gr.Tab("Data"):
            data_tab(state)
        with gr.Tab("Generation"):
            generation_tab(state)
        with gr.Tab("Models"):
            models_tab(state)
        with gr.Tab("Evaluators"):
            evaluators_tab(state)
        with gr.Tab("Execution"):
            execution_tab(state)
        with gr.Tab("Results"):
            results_tab(state)

        # Regularly discover projects and update the state
        timer = gr.Timer(3, active=True)
        timer.tick(fn=discover_projects, inputs=[state], outputs=[state])

    if share:
        print("* CAUTION: Gradio public sharing is enabled!")

    if no_auth:
        print("* CAUTION: Authentication disabled!")
        demo.launch(share=share, server_port=port)
    else:
        if password is None:
            password = secrets.token_urlsafe(20)
        print("* Server username: user")
        print(f"* Server password: {password}")
        demo.launch(
            share=share,
            auth=("user", password),
            server_port=port,
        )
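
A minimal usage sketch based on the signature above (the password value and port are illustrative; with share left at its default, the server is only reachable locally):

from evalsense.webui.app import launch_webui

# Start the UI on a custom port with a fixed password instead of a generated one.
launch_webui(password="my-dev-password", port=8080)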

Execution

Module evalsense.webui.execution.

Functions:

- execute_evaluation: Executes the evaluation based on the current application state.
- execute_meta_evaluation: Executes a meta-evaluation based on the current application state.
- execute_standard_evaluation: Executes a standard evaluation for the given application state.
- get_dataset_manager: Creates and returns a DatasetManager based on the current application state.
- get_evaluators: Creates and returns a list of Evaluator instances based on the current application state.
- get_model_configs: Creates and returns a list of ModelConfig instances based on the current application state.

execute_evaluation

execute_evaluation(state: AppState)

Executes the evaluation based on the current application state.

Parameters:

- state (AppState): The current application state. Required.
Source code in evalsense/webui/execution.py
def execute_evaluation(state: AppState):
    """Executes the evaluation based on the current application state.

    Args:
        state (AppState): The current application state.
    """
    if state["is_meta_eval"]:
        execute_meta_evaluation(state)
    else:
        execute_standard_evaluation(state)
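
A dispatch sketch (assumes a state already populated as in the examples below; with is_meta_eval unset, the call is equivalent to execute_standard_evaluation(state)):

from evalsense.webui.execution import execute_evaluation
from evalsense.webui.state import get_initial_state

state = get_initial_state()
# ... populate dataset, prompt, model, and evaluator fields here ...
state["is_meta_eval"] = False
execute_evaluation(state)  # routes to execute_standard_evaluation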

execute_meta_evaluation

execute_meta_evaluation(state: AppState)

Executes a meta-evaluation based on the current application state.

Parameters:

- state (AppState): The current application state. Required.
Source code in evalsense/webui/execution.py
def execute_meta_evaluation(state: AppState):
    """Executes a meta-evaluation based on the current application state.

    Args:
        state (AppState): The current application state.
    """
    dataset_manager = get_dataset_manager(state)

    tasks = []
    for tier_id, perturbation_tier_subprompt in enumerate(
        state["perturbation_tier_subprompts"]
    ):
        system_prompt = state["system_prompt"].replace(
            "{perturbation_tier_subprompt}", perturbation_tier_subprompt
        )
        user_prompt = state["user_prompt"].replace(
            "{perturbation_tier_subprompt}", perturbation_tier_subprompt
        )
        generation_steps = GenerationSteps(
            name=f"{state['generation_steps_name']} (Tier {tier_id + 1})",
            steps=[
                system_message(system_prompt),
                prompt_template(user_prompt),
                generate(),
            ],
        )

        # We use a RecordToSample function to add the perturbation tier
        # to the metadata
        def perturbation_record_to_sample(
            record: dict[str, Any],
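            # Binding tier_id as a default argument freezes the current tier
            # for this closure (avoids the late-binding closure pitfall).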
            tier_id: int = tier_id,
        ) -> Sample:
            return Sample(
                input=record[state["input_field_name"]],
                target=record.get(state["target_field_name"], ""),
                choices=record.get(state["choices_field_name"]),
                id=record.get(state["id_field_name"]),
                metadata={k: record[k] for k in state["metadata_fields"]}
                | {"perturbation_tier": tier_id},
            )

        perturb_task_preprocessor = DefaultTaskPreprocessor(name="Perturbation")
        task_config = TaskConfig(
            dataset_manager=dataset_manager,
            generation_steps=generation_steps,
            field_spec=perturbation_record_to_sample,
            task_preprocessor=perturb_task_preprocessor,
        )
        tasks.append(task_config)

    model_configs = get_model_configs(state)
    evaluators = get_evaluators(state)
    experiment_config = ExperimentBatchConfig(
        tasks=tasks, model_configs=model_configs, evaluators=evaluators
    )
    project = Project(name=state["project_name"])
    pipeline = Pipeline(experiments=experiment_config, project=project)
    pipeline.run()
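
A configuration sketch for the meta-evaluation path. The {perturbation_tier_subprompt} placeholder is substituted per tier exactly as in the source above; the dataset name and subprompts are illustrative placeholders, and the {prompt} placeholder assumes the Inspect-style prompt_template used in the generation steps:

from evalsense.webui.execution import execute_meta_evaluation
from evalsense.webui.state import get_initial_state

state = get_initial_state()
state["is_meta_eval"] = True
state["dataset_name"] = "my_dataset"  # hypothetical registered dataset name
state["system_prompt"] = "You are a careful summarizer. {perturbation_tier_subprompt}"
state["user_prompt"] = "Summarize the following text: {prompt}"
state["perturbation_tier_subprompts"] = [
    "Write an accurate summary.",        # tier 1
    "Introduce subtle factual errors.",  # tier 2
]
# model_configs and evaluator_configs are set as in the standard example below.
execute_meta_evaluation(state)  # runs one task per tier, tagged via perturbation_tier metadata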

execute_standard_evaluation

execute_standard_evaluation(state: AppState)

Executes a standard evaluation for the given application state.

Parameters:

- state (AppState): The current application state. Required.
Source code in evalsense/webui/execution.py
def execute_standard_evaluation(state: AppState):
    """Executes a standard evaluation for the given application state.

    Args:
        state (AppState): The current application state.
    """
    dataset_manager = get_dataset_manager(state)
    generation_steps = GenerationSteps(
        name=state["generation_steps_name"],
        steps=[
            system_message(state["system_prompt"]),
            prompt_template(state["user_prompt"]),
            generate(),
        ],
    )
    field_spec = FieldSpec(
        input=state["input_field_name"],
        target=state["target_field_name"],
        choices=state["choices_field_name"],
        id=state["id_field_name"],
        metadata=state["metadata_fields"],
    )
    model_configs = get_model_configs(state)
    evaluators = get_evaluators(state)
    task_config = TaskConfig(
        dataset_manager=dataset_manager,
        generation_steps=generation_steps,
        field_spec=field_spec,
    )
    experiment_config = ExperimentBatchConfig(
        tasks=[task_config], model_configs=model_configs, evaluators=evaluators
    )
    project = Project(name=state["project_name"])
    pipeline = Pipeline(experiments=experiment_config, project=project)
    pipeline.run()
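
A minimal end-to-end sketch for the standard path. The dataset name, model, and evaluator values are illustrative placeholders rather than names shipped with EvalSense:

from evalsense.webui.execution import execute_standard_evaluation
from evalsense.webui.state import get_initial_state

state = get_initial_state()
state["dataset_name"] = "my_dataset"  # hypothetical registered dataset name
state["dataset_splits"] = ["test"]
state["system_prompt"] = "You are a helpful assistant."
state["user_prompt"] = "{prompt}"  # assumes Inspect-style prompt templating
state["model_configs"] = [
    {"model_name": "openai/gpt-4o-mini", "model_args": {}, "generation_args": {}},
]
state["evaluator_configs"] = [
    {"evaluator_name": "BLEU", "evaluator_args": {}},  # hypothetical evaluator name
]
state["project_name"] = "Demo"
execute_standard_evaluation(state)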

get_dataset_manager

get_dataset_manager(state: AppState) -> DatasetManager

Creates and returns a DatasetManager based on the current application state.

Parameters:

- state (AppState): The current application state. Required.

Returns:

- DatasetManager: The instantiated DatasetManager.

Source code in evalsense/webui/execution.py
def get_dataset_manager(state: AppState) -> DatasetManager:
    """Creates and returns a DatasetManager based on the current application state.

    Args:
        state (AppState): The current application state.

    Returns:
        DatasetManager: The instantiated DatasetManager.
    """
    return DatasetManager.create(
        name=state["dataset_name"],
        splits=state["dataset_splits"],
        version=state["dataset_version"],
    )
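
The helper is a thin wrapper over DatasetManager.create. A sketch with placeholder values:

from evalsense.webui.execution import get_dataset_manager
from evalsense.webui.state import get_initial_state

state = get_initial_state()
state["dataset_name"] = "my_dataset"  # hypothetical registered dataset name
state["dataset_splits"] = ["validation", "test"]
state["dataset_version"] = None       # None selects the default version

manager = get_dataset_manager(state)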

get_evaluators

get_evaluators(state: AppState) -> list[Evaluator]

Creates and returns a list of Evaluator instances based on the current application state.

Parameters:

- state (AppState): The current application state. Required.

Returns:

- list[Evaluator]: The list of instantiated Evaluator objects.

Source code in evalsense/webui/execution.py
def get_evaluators(state: AppState) -> list[Evaluator]:
    """Creates and returns a list of Evaluator instances based on the current application state.

    Args:
        state (AppState): The current application state.

    Returns:
        list[Evaluator]: The list of instantiated Evaluator objects.
    """
    evaluators: list[Evaluator] = []
    for evaluator_config in state["evaluator_configs"]:
        configurator = EvaluatorConfigurator.create(evaluator_config["evaluator_name"])
        evaluator = configurator.instantiate_evaluator(
            **evaluator_config["evaluator_args"]
        )
        evaluators.append(evaluator)
    return evaluators
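
Each entry in state["evaluator_configs"] follows the AppEvaluatorConfig shape documented under State below. A sketch, with a hypothetical evaluator name that would need to exist in the EvaluatorConfigurator registry:

from evalsense.webui.execution import get_evaluators
from evalsense.webui.state import get_initial_state

state = get_initial_state()
state["evaluator_configs"] = [
    {"evaluator_name": "BLEU", "evaluator_args": {}},  # hypothetical registry name
]
evaluators = get_evaluators(state)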

get_model_configs

get_model_configs(state: AppState) -> list[ModelConfig]

Creates and returns a list of ModelConfig instances based on the current application state.

Parameters:

- state (AppState): The current application state. Required.

Returns:

- list[ModelConfig]: The list of instantiated ModelConfig objects.

Source code in evalsense/webui/execution.py
def get_model_configs(state: AppState) -> list[ModelConfig]:
    """Creates and returns a list of ModelConfig based on the current application state.

    Args:
        state (AppState): The current application state.

    Returns:
        list[ModelConfig]: The list of instantiated ModelConfig objects.
    """
    return [
        ModelConfig(
            m["model_name"],
            model_args=m["model_args"],
            generation_args=GenerateConfigArgs(**m["generation_args"]),
        )
        for m in state["model_configs"]
    ]
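
Each entry in state["model_configs"] follows the AppModelConfig shape documented under State below; generation_args must contain keys accepted by Inspect's GenerateConfigArgs. A sketch with an illustrative model name:

from evalsense.webui.execution import get_model_configs
from evalsense.webui.state import get_initial_state

state = get_initial_state()
state["model_configs"] = [
    {
        "model_name": "openai/gpt-4o-mini",        # illustrative model identifier
        "model_args": {},
        "generation_args": {"temperature": 0.0},   # assumes a GenerateConfigArgs key
    },
]
model_configs = get_model_configs(state)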

State

Module evalsense.webui.state.

Classes:

- AppEvaluatorConfig: Evaluator configuration to be used within the Gradio application.
- AppModelConfig: Model configuration to be used within the Gradio application.
- AppState: Application state to be used within the Gradio application.

Functions:

- get_initial_state: Provides the initial application state.

AppEvaluatorConfig

Bases: TypedDict

Evaluator configuration to be used within the Gradio application.

Attributes:

- evaluator_name (str): The name of the evaluator to use.
- evaluator_args (dict[str, Any]): The arguments to pass to the evaluator.

Source code in evalsense/webui/state.py
class AppEvaluatorConfig(TypedDict):
    """Evaluator configuration to be used within the Gradio application.

    Attributes:
        evaluator_name (str): The name of the evaluator to use.
        evaluator_args (dict[str, Any]): The arguments to pass to the evaluator.
    """

    evaluator_name: str
    evaluator_args: dict[str, Any]

AppModelConfig

Bases: TypedDict

Model configuration to be used within the Gradio application.

Attributes:

- model_name (str): The name of the model to use.
- model_args (dict[str, Any]): The arguments to pass to the model.
- generation_args (dict[str, Any]): The arguments to use for text generation.

Source code in evalsense/webui/state.py
class AppModelConfig(TypedDict):
    """Model configuration to be used within the Gradio application.

    Attributes:
        model_name (str): The name of the model to use.
        model_args (dict[str, Any]): The arguments to pass to the model.
        generation_args (dict[str, Any]): The arguments to use for text generation.
    """

    model_name: str
    model_args: dict[str, Any]
    generation_args: dict[str, Any]

AppState

Bases: TypedDict

Application state to be used within the Gradio application.

Attributes:

- dataset_name (str): The name of the dataset to evaluate on.
- dataset_splits (list[str]): The used splits of the dataset.
- dataset_version (str | None): The used version of the dataset.
- input_field_name (str): The name of the main input field in the dataset.
- target_field_name (str): The name of the target field in the dataset.
- choices_field_name (str): The name of the answer choices field in the dataset.
- id_field_name (str): The name of the ID field in the dataset.
- metadata_fields (list[str]): The names of the metadata fields in the dataset.
- is_meta_eval (bool): Whether the evaluation to be performed is a meta-evaluation.
- perturbation_tiers (int): The number of perturbation tiers to use for meta-evaluation.
- perturbation_tier_subprompts (list[str]): The subprompts to use for each perturbation tier.
- generation_steps_name (str): The name of the used generation strategy.
- system_prompt (str): The system prompt to use for generation.
- user_prompt (str): The user prompt to use for generation.
- model_configs (list[AppModelConfig]): The model configurations to use for generation.
- evaluator_configs (list[AppEvaluatorConfig]): The evaluator configurations to use for evaluation.
- project_name (str): The name of the evaluation project.
- existing_projects (list[str]): The list of existing evaluation projects.

Source code in evalsense/webui/state.py
class AppState(TypedDict):
    """Application state to be used within the Gradio application.

    Attributes:
        dataset_name (str): The name of the dataset to evaluate on.
        dataset_splits (list[str]): The used splits of the dataset.
        dataset_version (str | None): The used version of the dataset.
        input_field_name (str): The name of the main input field in the dataset.
        target_field_name (str): The name of the target field in the dataset.
        choices_field_name (str): The name of the answer choices field in the dataset.
        id_field_name (str): The name of the ID field in the dataset.
        metadata_fields (list[str]): The names of the metadata fields in the dataset.
        is_meta_eval (bool): Whether the evaluation to be performed is a meta-evaluation.
        perturbation_tiers (int): The number of perturbation tiers to use for
            meta-evaluation.
        perturbation_tier_subprompts (list[str]): The subprompts to use for each
            perturbation tier.
        generation_steps_name (str): The name of the used generation strategy.
        system_prompt (str): The system prompt to use for generation.
        user_prompt (str): The user prompt to use for generation.
        model_configs (list[AppModelConfig]): The model configurations to use for
            generation.
        evaluator_configs (list[AppEvaluatorConfig]): The evaluator configurations
            to use for evaluation.
        project_name (str): The name of the evaluation project.
        existing_projects (list[str]): The list of existing evaluation projects.
    """

    dataset_name: str
    dataset_splits: list[str]
    dataset_version: str | None
    input_field_name: str
    target_field_name: str
    choices_field_name: str
    id_field_name: str
    metadata_fields: list[str]
    is_meta_eval: bool
    perturbation_tiers: int
    perturbation_tier_subprompts: list[str]
    generation_steps_name: str
    system_prompt: str
    user_prompt: str
    model_configs: list[AppModelConfig]
    evaluator_configs: list[AppEvaluatorConfig]
    project_name: str
    existing_projects: list[str]

get_initial_state

get_initial_state() -> AppState

Provides the initial application state.

Returns:

- AppState: The initial application state.

Source code in evalsense/webui/state.py
def get_initial_state() -> AppState:
    """Provides the initial application state.

    Returns:
        AppState: The initial application state.
    """
    return {
        "dataset_name": "",
        "dataset_splits": list(),
        "dataset_version": None,
        "input_field_name": "input",
        "target_field_name": "target",
        "choices_field_name": "choices",
        "id_field_name": "id",
        "metadata_fields": list(),
        "is_meta_eval": False,
        "perturbation_tiers": 2,
        "perturbation_tier_subprompts": list(),
        "generation_steps_name": "Default",
        "system_prompt": "",
        "user_prompt": "",
        "model_configs": list(),
        "evaluator_configs": list(),
        "project_name": "Default",
        "existing_projects": list(),
    }
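
The returned value is a plain dictionary (AppState is a TypedDict), so fields can be updated directly:

from evalsense.webui.state import get_initial_state

state = get_initial_state()
assert state["perturbation_tiers"] == 2
state["dataset_name"] = "my_dataset"  # illustrative dataset name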

Utils

Module evalsense.webui.utils.

Classes:

- ListenerConfig: Configuration for a textbox listener.

Functions:

- dict_parser: Parses a string representation of a dictionary into an actual dictionary.
- discover_projects: Discovers existing evaluation projects in the projects directory.
- empty_is_none_parser_for: Returns a parser function that returns None for empty strings.
- list_parser: Parses a comma-separated string into a list of strings.
- setup_listeners: Sets up listeners updating the application state based on user inputs.

ListenerConfig

Bases: TypedDict

Configuration for a textbox listener.

Attributes:

- state_field (str): The name of the state field to update.
- parser (Callable[[str], Any] | None): An optional parser function to process the input value.

Source code in evalsense/webui/utils.py
class ListenerConfig(TypedDict):
    """Configuration for a textbox listener.

    Attributes:
        state_field (str): The name of the state field to update.
        parser (Callable[[str], Any] | None): An optional parser function
            to process the input value.
    """

    state_field: str
    parser: Callable[[str], Any] | None

dict_parser

dict_parser(input_string: str) -> dict[str, Any]

Parses a string representation of a dictionary into an actual dictionary.

Parameters:

- input_string (str): The input string to parse. Required.

Returns:

- dict[str, Any]: The parsed dictionary.

Source code in evalsense/webui/utils.py
def dict_parser(input_string: str) -> dict[str, Any]:
    """Parses a string representation of a dictionary into an actual dictionary.

    Args:
        input_string (str): The input string to parse.

    Returns:
        dict[str, Any]: The parsed dictionary.
    """
    if not input_string:
        return {}
    try:
        parsed = ast.literal_eval(input_string)
        # Reject literals that parse successfully but are not dictionaries.
        if not isinstance(parsed, dict):
            raise ValueError
        return parsed
    except Exception:
        raise gr.Error(f"Invalid dictionary format: {input_string}")
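
Because the parser relies on ast.literal_eval, the input must use Python literal syntax:

from evalsense.webui.utils import dict_parser

dict_parser("{'temperature': 0.7, 'max_tokens': 256}")  # -> {'temperature': 0.7, 'max_tokens': 256}
dict_parser("")                                          # -> {}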

discover_projects

discover_projects(state: AppState) -> AppState

Discovers existing evaluation projects in the projects directory.

Parameters:

- state (AppState): The current application state. Required.

Returns:

- AppState: The updated application state with the list of existing projects.

Source code in evalsense/webui/utils.py
def discover_projects(state: AppState) -> AppState:
    """Discovers existing evaluation projects in the projects directory.

    Args:
        state (AppState): The current application state.

    Returns:
        AppState: The updated application state with the list of existing projects.
    """
    try:
        projects = [entry.name for entry in PROJECTS_PATH.iterdir() if entry.is_dir()]
    except FileNotFoundError:
        projects = []
    state["existing_projects"] = projects
    return state
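
A usage sketch (the projects directory is given by PROJECTS_PATH; if it does not exist yet, the list is simply empty):

from evalsense.webui.state import get_initial_state
from evalsense.webui.utils import discover_projects

state = get_initial_state()
state = discover_projects(state)
print(state["existing_projects"])  # e.g. [] on a fresh installation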

empty_is_none_parser_for

empty_is_none_parser_for(
    type: type,
) -> Callable[[str], Any | None]

Returns a parser function that returns None for empty strings.

Parameters:

- type (type): The type of the value to parse. Required.

Returns:

- Callable[[str], Any | None]: The parser function.

Source code in evalsense/webui/utils.py
def empty_is_none_parser_for(type: type) -> Callable[[str], Any | None]:
    """Returns a parser function that returns None for empty strings.

    Args:
        type (type): The type of the value to parse.

    Returns:
        Callable[[str], Any | None]: The parser function.
    """

    def parser(input_string: str) -> Any | None:
        if not input_string:
            return None
        try:
            return type(input_string)
        except Exception:
            raise ValueError(f"Unable to parse {input_string} as {type.__name__}.")

    return parser
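
A runnable sketch with int as the target type:

from evalsense.webui.utils import empty_is_none_parser_for

parse_port = empty_is_none_parser_for(int)
parse_port("8080")  # -> 8080
parse_port("")      # -> None
parse_port("abc")   # raises ValueError: Unable to parse abc as int.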

list_parser

list_parser(input_string: str) -> list[str]

Parses a comma-separated string into a list of strings.

Parameters:

- input_string (str): The input string to parse. Required.

Returns:

- list[str]: A list containing the parsed strings.

Source code in evalsense/webui/utils.py
def list_parser(input_string: str) -> list[str]:
    """Parses a comma-separated string into a list of strings.

    Args:
        input_string (str): The input string to parse.

    Returns:
        list[str]: A list containing the parsed strings.
    """
    # An empty input yields no items rather than a single empty string.
    if not input_string:
        return []
    return input_string.replace(" ", "").split(",")
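
A runnable sketch (spaces are stripped, so individual items cannot contain spaces):

from evalsense.webui.utils import list_parser

list_parser("input, target, id")  # -> ['input', 'target', 'id']
list_parser("")                   # -> []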

setup_listeners

setup_listeners(
    listener_config: dict[GradioInput, ListenerConfig],
    state: State,
)

Sets up listeners updating the application state based on user inputs.

Parameters:

- listener_config (dict[GradioInput, ListenerConfig]): The configuration specifying the parsers for processing user inputs and the corresponding state fields to update. Required.
- state (gr.State): The current state of the Gradio application. Required.
Source code in evalsense/webui/utils.py
def setup_listeners(
    listener_config: dict[GradioInput, ListenerConfig],
    state: gr.State,
):
    """Sets up listeners updating the application state based on user inputs.

    Args:
        listener_config (dict[GradioInput, ListenerConfig]): The configuration
            specifying the parsers for processing user inputs and the corresponding
            state fields to update.
        state (gr.State): The current state of the Gradio application.
    """
    for input_element, element_config in listener_config.items():

        @input_element.change(inputs=[input_element, state], outputs=[state])
        def update_field(
            entered_value: str,
            state: AppState,
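            # Binding element_config as a default argument freezes the current
            # config for each listener (avoids the late-binding closure pitfall).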
            config: ListenerConfig = element_config,
        ):
            value = entered_value
            if config["parser"] is not None:
                value = config["parser"](entered_value)
            state[config["state_field"]] = value
            return state
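
A wiring sketch inside a Gradio Blocks context (the textboxes and state fields are illustrative; list_parser and get_initial_state are documented above):

import gradio as gr

from evalsense.webui.state import get_initial_state
from evalsense.webui.utils import list_parser, setup_listeners

with gr.Blocks() as demo:
    state = gr.State(get_initial_state())
    dataset_name = gr.Textbox(label="Dataset name")
    metadata_fields = gr.Textbox(label="Metadata fields (comma-separated)")
    setup_listeners(
        {
            dataset_name: {"state_field": "dataset_name", "parser": None},
            metadata_fields: {"state_field": "metadata_fields", "parser": list_parser},
        },
        state,
    )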