Skip to content

Tasks

Modules:

Name Description
task_preprocessor

Classes:

Name Description
DefaultTaskPreprocessor

Default task preprocessor that returns the input dataset unchanged.

TaskPreprocessingFunction

A protocol for a function that preprocesses datasets.

TaskPreprocessor

A class preprocessing a dataset for a specific task.

DefaultTaskPreprocessor

Bases: TaskPreprocessor

Default task preprocessor that returns the input dataset unchanged.

Methods:

Name Description
__call__

Preprocesses the input dataset for a specific task.

__init__

Initializes the default task preprocessor.

Source code in evalsense/tasks/task_preprocessor.py
class DefaultTaskPreprocessor(TaskPreprocessor):
    """Default task preprocessor that returns the input dataset unchanged."""

    def __init__(self, name="Identity") -> None:
        """Initializes the default task preprocessor."""
        super().__init__(
            name=name,
            preprocessing_function=default_task_preprocessing_function,
        )

__call__

__call__(
    hf_dataset: Dataset,
    dataset_manager: DatasetManager,
    field_spec: FieldSpec | RecordToSample | None = None,
    force_reprocess: bool = False,
) -> Dataset

Preprocesses the input dataset for a specific task.

Parameters:

Name Type Description Default
hf_dataset Dataset

The input dataset to preprocess, in HuggingFace format.

required
dataset_manager DatasetManager

The dataset manager used to retrieve the dataset.

required
field_spec FieldSpec

Specification mapping dataset fields to sample fields. See Inspect AI documentation for more details.

None
force_reprocess bool

Whether to force reprocess the dataset even if it already exists. Defaults to False.

False

Returns:

Type Description
Dataset

The preprocessed dataset.

Source code in evalsense/tasks/task_preprocessor.py
def __call__(
    self,
    hf_dataset: datasets.Dataset,
    dataset_manager: DatasetManager,
    field_spec: FieldSpec | RecordToSample | None = None,
    force_reprocess: bool = False,
) -> Dataset:
    """Preprocesses the input dataset for a specific task.

    Args:
        hf_dataset (datasets.Dataset): The input dataset to preprocess,
            in HuggingFace format.
        dataset_manager (DatasetManager): The dataset manager used to
            retrieve the dataset.
        field_spec (FieldSpec): Specification mapping dataset fields to
            sample fields. See Inspect AI documentation for more details.
        force_reprocess (bool): Whether to force reprocess the dataset
            even if it already exists. Defaults to False.

    Returns:
        (Dataset): The preprocessed dataset.
    """
    tasks_path = Path(dataset_manager.version_path / "tasks")
    tasks_path.mkdir(parents=True, exist_ok=True)
    dataset_filename = (
        f"{to_safe_filename(dataset_manager.name)}-"
        f"{to_safe_filename(self.name)}.jsonl"
    )
    task_data_path = tasks_path / dataset_filename

    if not task_data_path.exists() or force_reprocess:
        preprocessed_hf_dataset = self.preprocessing_function(
            hf_dataset, dataset_manager
        )
        with disable_dataset_progress_bars():
            preprocessed_hf_dataset.to_json(task_data_path, lines=True)

    dataset = json_dataset(str(task_data_path), sample_fields=field_spec)
    return dataset

__init__

__init__(name='Identity') -> None

Initializes the default task preprocessor.

Source code in evalsense/tasks/task_preprocessor.py
def __init__(self, name="Identity") -> None:
    """Initializes the default task preprocessor."""
    super().__init__(
        name=name,
        preprocessing_function=default_task_preprocessing_function,
    )

TaskPreprocessingFunction

Bases: Protocol

A protocol for a function that preprocesses datasets.

You can pass this function to a TaskPreprocessor to perform some task-specific preprocessing on a dataset. This is especially useful in cases in which a single dataset can be used for multiple different tasks, with each requiring different preprocessing steps.

Methods:

Name Description
__call__

Preprocesses the input dataset for a specific task.

Source code in evalsense/tasks/task_preprocessor.py
class TaskPreprocessingFunction(Protocol):
    """A protocol for a function that preprocesses datasets.

    You can pass this function to a TaskPreprocessor to perform some
    task-specific preprocessing on a dataset. This is especially useful
    in cases in which a single dataset can be used for multiple different
    tasks, with each requiring different preprocessing steps.
    """

    def __call__(
        self, hf_dataset: datasets.Dataset, dataset_manager: DatasetManager
    ) -> datasets.Dataset:
        """Preprocesses the input dataset for a specific task.

        Args:
            hf_dataset (datasets.Dataset): The input dataset to preprocess,
                in HuggingFace format.
            dataset_manager (DatasetManager): The dataset manager used to
                retrieve the dataset.

        Returns:
            (datasets.Dataset): The preprocessed dataset.
        """
        ...

__call__

__call__(
    hf_dataset: Dataset, dataset_manager: DatasetManager
) -> datasets.Dataset

Preprocesses the input dataset for a specific task.

Parameters:

Name Type Description Default
hf_dataset Dataset

The input dataset to preprocess, in HuggingFace format.

required
dataset_manager DatasetManager

The dataset manager used to retrieve the dataset.

required

Returns:

Type Description
Dataset

The preprocessed dataset.

Source code in evalsense/tasks/task_preprocessor.py
def __call__(
    self, hf_dataset: datasets.Dataset, dataset_manager: DatasetManager
) -> datasets.Dataset:
    """Preprocesses the input dataset for a specific task.

    Args:
        hf_dataset (datasets.Dataset): The input dataset to preprocess,
            in HuggingFace format.
        dataset_manager (DatasetManager): The dataset manager used to
            retrieve the dataset.

    Returns:
        (datasets.Dataset): The preprocessed dataset.
    """
    ...

TaskPreprocessor

A class preprocessing a dataset for a specific task.

Methods:

Name Description
__call__

Preprocesses the input dataset for a specific task.

__init__

Initializes the task preprocessor.

Source code in evalsense/tasks/task_preprocessor.py
class TaskPreprocessor:
    """A class preprocessing a dataset for a specific task."""

    def __init__(
        self,
        name: str,
        preprocessing_function: TaskPreprocessingFunction,
    ) -> None:
        """Initializes the task preprocessor.

        Args:
            name (str): The name of the task preprocessor.
            preprocessing_function (TaskPreprocessingFunction): The function used to
                preprocess the dataset.
        """
        self.name = name
        self.preprocessing_function = preprocessing_function

    def __call__(
        self,
        hf_dataset: datasets.Dataset,
        dataset_manager: DatasetManager,
        field_spec: FieldSpec | RecordToSample | None = None,
        force_reprocess: bool = False,
    ) -> Dataset:
        """Preprocesses the input dataset for a specific task.

        Args:
            hf_dataset (datasets.Dataset): The input dataset to preprocess,
                in HuggingFace format.
            dataset_manager (DatasetManager): The dataset manager used to
                retrieve the dataset.
            field_spec (FieldSpec): Specification mapping dataset fields to
                sample fields. See Inspect AI documentation for more details.
            force_reprocess (bool): Whether to force reprocess the dataset
                even if it already exists. Defaults to False.

        Returns:
            (Dataset): The preprocessed dataset.
        """
        tasks_path = Path(dataset_manager.version_path / "tasks")
        tasks_path.mkdir(parents=True, exist_ok=True)
        dataset_filename = (
            f"{to_safe_filename(dataset_manager.name)}-"
            f"{to_safe_filename(self.name)}.jsonl"
        )
        task_data_path = tasks_path / dataset_filename

        if not task_data_path.exists() or force_reprocess:
            preprocessed_hf_dataset = self.preprocessing_function(
                hf_dataset, dataset_manager
            )
            with disable_dataset_progress_bars():
                preprocessed_hf_dataset.to_json(task_data_path, lines=True)

        dataset = json_dataset(str(task_data_path), sample_fields=field_spec)
        return dataset

__call__

__call__(
    hf_dataset: Dataset,
    dataset_manager: DatasetManager,
    field_spec: FieldSpec | RecordToSample | None = None,
    force_reprocess: bool = False,
) -> Dataset

Preprocesses the input dataset for a specific task.

Parameters:

Name Type Description Default
hf_dataset Dataset

The input dataset to preprocess, in HuggingFace format.

required
dataset_manager DatasetManager

The dataset manager used to retrieve the dataset.

required
field_spec FieldSpec

Specification mapping dataset fields to sample fields. See Inspect AI documentation for more details.

None
force_reprocess bool

Whether to force reprocess the dataset even if it already exists. Defaults to False.

False

Returns:

Type Description
Dataset

The preprocessed dataset.

Source code in evalsense/tasks/task_preprocessor.py
def __call__(
    self,
    hf_dataset: datasets.Dataset,
    dataset_manager: DatasetManager,
    field_spec: FieldSpec | RecordToSample | None = None,
    force_reprocess: bool = False,
) -> Dataset:
    """Preprocesses the input dataset for a specific task.

    Args:
        hf_dataset (datasets.Dataset): The input dataset to preprocess,
            in HuggingFace format.
        dataset_manager (DatasetManager): The dataset manager used to
            retrieve the dataset.
        field_spec (FieldSpec): Specification mapping dataset fields to
            sample fields. See Inspect AI documentation for more details.
        force_reprocess (bool): Whether to force reprocess the dataset
            even if it already exists. Defaults to False.

    Returns:
        (Dataset): The preprocessed dataset.
    """
    tasks_path = Path(dataset_manager.version_path / "tasks")
    tasks_path.mkdir(parents=True, exist_ok=True)
    dataset_filename = (
        f"{to_safe_filename(dataset_manager.name)}-"
        f"{to_safe_filename(self.name)}.jsonl"
    )
    task_data_path = tasks_path / dataset_filename

    if not task_data_path.exists() or force_reprocess:
        preprocessed_hf_dataset = self.preprocessing_function(
            hf_dataset, dataset_manager
        )
        with disable_dataset_progress_bars():
            preprocessed_hf_dataset.to_json(task_data_path, lines=True)

    dataset = json_dataset(str(task_data_path), sample_fields=field_spec)
    return dataset

__init__

__init__(
    name: str,
    preprocessing_function: TaskPreprocessingFunction,
) -> None

Initializes the task preprocessor.

Parameters:

Name Type Description Default
name str

The name of the task preprocessor.

required
preprocessing_function TaskPreprocessingFunction

The function used to preprocess the dataset.

required
Source code in evalsense/tasks/task_preprocessor.py
def __init__(
    self,
    name: str,
    preprocessing_function: TaskPreprocessingFunction,
) -> None:
    """Initializes the task preprocessor.

    Args:
        name (str): The name of the task preprocessor.
        preprocessing_function (TaskPreprocessingFunction): The function used to
            preprocess the dataset.
    """
    self.name = name
    self.preprocessing_function = preprocessing_function