Skip to content

Dataset Managers

Modules:

Name Description
aci_bench
huggingface

Classes:

Name Description
AciBenchDatasetManager

A dataset manager for the ACI Bench dataset.

HuggingFaceDatasetManager

A dataset manager for Hugging Face datasets.

AciBenchDatasetManager

Bases: FileBasedDatasetManager

A dataset manager for the ACI Bench dataset.

Methods:

Name Description
__init__

Initializes a new AciBenchDatasetManager.

can_handle

Checks if the DatasetManager can handle the given dataset.

create

Creates a new dataset manager for the specified dataset.

is_retrieved

Checks if the dataset at the specific version is already downloaded.

load

Loads the dataset as a HuggingFace dataset.

remove

Deletes the dataset at the specific version from disk.

retrieve

Downloads and preprocesses a dataset.

unload

Unloads the dataset from memory.

Attributes:

Name Type Description
dataset_path Path

The top-level directory for storing this dataset.

main_data_path Path

The path for storing the preprocessed dataset files for a specific version.

record DatasetRecord

Returns a record identifying the dataset.

version_path Path

The directory for storing a specific version of this dataset.

Source code in evalsense/datasets/managers/aci_bench.py
@manager
class AciBenchDatasetManager(FileBasedDatasetManager):
    """Dataset manager dedicated to the ACI Bench dataset."""

    _DATASET_NAME = "ACI-BENCH"
    priority = 7

    def __init__(
        self,
        version: str | None = _DEFAULT_VERSION,
        splits: list[str] | None = None,
        data_dir: str | None = None,
        **kwargs,
    ):
        """Sets up a manager bound to the ACI Bench dataset.

        Args:
            version (str, optional): The dataset version to retrieve. Passing
                `None` selects `_DEFAULT_VERSION`.
            splits (list[str], optional): The dataset splits to retrieve.
            data_dir (str, optional): The top-level directory for storing all
                datasets. Defaults to "datasets" in the user cache directory.
            **kwargs (dict): Additional keyword arguments forwarded to the
                parent constructor. A `name` entry, if present, is discarded.
        """
        # The dataset name is fixed by this class, so a caller-supplied
        # `name` must not reach the parent constructor a second time.
        kwargs.pop("name", None)
        resolved_version = _DEFAULT_VERSION if version is None else version
        super().__init__(
            name=self._DATASET_NAME,
            version=resolved_version,
            splits=splits,
            data_dir=data_dir,
            **kwargs,
        )

    @override
    def _preprocess_files(self, **kwargs) -> None:
        """Converts the downloaded CSV files into a HuggingFace DatasetDict.

        For every split, the CSV files listed in the dataset config are read
        from `self.version_path` and inner-joined on the `dataset` and
        `encounter_id` columns. The per-split results are then stored as a
        DatasetDict in the `self.main_data_path` directory.

        Args:
            **kwargs (dict): Additional keyword arguments.

        Raises:
            RuntimeError: If the config lists no files for a split.
        """
        join_keys = ["dataset", "encounter_id"]
        per_split = {}
        for split in self.all_splits:
            split_files = self.config.get_files(self.version, [split])
            merged = None
            for entry in split_files.values():
                frame = pl.read_csv(self.version_path / entry.name)
                # The first file seeds the frame; later ones are joined in.
                merged = (
                    frame
                    if merged is None
                    else merged.join(frame, on=join_keys, how="inner")
                )
            if merged is None:
                raise RuntimeError(f"No data found for split '{split}'.")
            per_split[split] = Dataset.from_polars(merged)

        # Persist all splits together, without progress bar output
        with disable_dataset_progress_bars():
            DatasetDict(per_split).save_to_disk(self.main_data_path)

    @classmethod
    @override
    def can_handle(cls, name: str) -> bool:
        """Reports whether this manager is responsible for the named dataset.

        The comparison against the ACI Bench dataset name ignores case.

        Args:
            name (str): The name of the dataset.

        Returns:
            (bool): True if the manager can handle the dataset, False otherwise.
        """
        requested = name.lower()
        supported = cls._DATASET_NAME.lower()
        return requested == supported

dataset_path property

dataset_path: Path

The top-level directory for storing this dataset.

Returns:

Type Description
Path

The dataset directory.

main_data_path property

main_data_path: Path

The path for storing the preprocessed dataset files for a specific version.

Returns:

Type Description
Path

The main dataset directory.

record property

record: DatasetRecord

Returns a record identifying the dataset.

Returns:

Type Description
DatasetRecord

The dataset record.

version_path property

version_path: Path

The directory for storing a specific version of this dataset.

Returns:

Type Description
Path

The dataset version directory.

__init__

__init__(
    version: str | None = _DEFAULT_VERSION,
    splits: list[str] | None = None,
    data_dir: str | None = None,
    **kwargs,
)

Initializes a new AciBenchDatasetManager.

Parameters:

Name Type Description Default
version str

The dataset version to retrieve.

_DEFAULT_VERSION
splits list[str]

The dataset splits to retrieve.

None
data_dir str

The top-level directory for storing all datasets. Defaults to "datasets" in the user cache directory.

None
**kwargs dict

Additional keyword arguments.

{}
Source code in evalsense/datasets/managers/aci_bench.py
def __init__(
    self,
    version: str | None = _DEFAULT_VERSION,
    splits: list[str] | None = None,
    data_dir: str | None = None,
    **kwargs,
):
    """Sets up a manager bound to the ACI Bench dataset.

    Args:
        version (str, optional): The dataset version to retrieve. Passing
            `None` selects `_DEFAULT_VERSION`.
        splits (list[str], optional): The dataset splits to retrieve.
        data_dir (str, optional): The top-level directory for storing all
            datasets. Defaults to "datasets" in the user cache directory.
        **kwargs (dict): Additional keyword arguments forwarded to the
            parent constructor. A `name` entry, if present, is discarded.
    """
    # The dataset name is fixed by this class, so a caller-supplied
    # `name` must not reach the parent constructor a second time.
    kwargs.pop("name", None)
    resolved_version = _DEFAULT_VERSION if version is None else version
    super().__init__(
        name=self._DATASET_NAME,
        version=resolved_version,
        splits=splits,
        data_dir=data_dir,
        **kwargs,
    )

can_handle classmethod

can_handle(name: str) -> bool

Checks if the DatasetManager can handle the given dataset.

Parameters:

Name Type Description Default
name str

The name of the dataset.

required

Returns:

Type Description
bool

True if the manager can handle the dataset, False otherwise.

Source code in evalsense/datasets/managers/aci_bench.py
@classmethod
@override
def can_handle(cls, name: str) -> bool:
    """Checks if the DatasetManager can handle the given dataset.

    Args:
        name (str): The name of the dataset.

    Returns:
        (bool): True if the manager can handle the dataset, False otherwise.
    """
    return name.lower() == cls._DATASET_NAME.lower()

create classmethod

create(
    name: str,
    splits: list[str],
    version: str | None = None,
    data_dir: str | None = None,
    **kwargs: dict,
) -> DatasetManager

Creates a new dataset manager for the specified dataset.

Parameters:

Name Type Description Default
name str

The name of the dataset.

required
splits list[str]

The dataset splits to retrieve.

required
version str | None

The dataset version to retrieve.

None
data_dir str | None

The top-level directory for storing all datasets.

None
**kwargs dict

Additional keyword arguments.

{}

Returns:

Type Description
DatasetManager

The created dataset manager.

Source code in evalsense/datasets/dataset_manager.py
@classmethod
def create(
    cls,
    name: str,
    splits: list[str],
    version: str | None = None,
    data_dir: str | None = None,
    **kwargs: dict,
) -> "DatasetManager":
    """Builds a dataset manager suited to the named dataset.

    The manager class is looked up in `DatasetManagerRegistry` by dataset
    name and instantiated with the given arguments.

    Args:
        name (str): The name of the dataset.
        splits (list[str]): The dataset splits to retrieve.
        version (str | None): The dataset version to retrieve.
        data_dir (str | None): The top-level directory for storing all datasets.
        **kwargs (dict): Additional keyword arguments.

    Returns:
        (DatasetManager): The created dataset manager.

    Raises:
        ValueError: If the registry has no manager for the dataset.
    """
    manager_cls = DatasetManagerRegistry.get(name)
    if manager_cls is None:
        raise ValueError(f"No suitable dataset manager found for {name}")
    return manager_cls(
        name=name,
        splits=splits,
        version=version,
        data_dir=data_dir,
        **kwargs,
    )

is_retrieved

is_retrieved() -> bool

Checks if the dataset at the specific version is already downloaded.

Returns:

Type Description
bool

True if the dataset exists locally, False otherwise.

Source code in evalsense/datasets/dataset_manager.py
def is_retrieved(self) -> bool:
    """Reports whether the dataset at the specific version exists on disk.

    Returns:
        (bool): True if `self.main_data_path` exists, False otherwise.
    """
    data_location = self.main_data_path
    return data_location.exists()

load

load(
    *,
    retrieve: bool = True,
    cache: bool = True,
    force_retrieve: bool = False,
    load_as_dict: Literal[False] = ...,
) -> Dataset
load(
    *,
    retrieve: bool = True,
    cache: bool = True,
    force_retrieve: bool = False,
    load_as_dict: Literal[True],
) -> DatasetDict
load(
    *,
    retrieve: bool = True,
    cache: bool = True,
    force_retrieve: bool = False,
    load_as_dict: bool = False,
) -> Dataset | DatasetDict

Loads the dataset as a HuggingFace dataset.

Parameters:

Name Type Description Default
retrieve bool

Whether to retrieve the dataset if it does not exist locally. Defaults to True.

True
cache bool

Whether to cache the dataset in memory. Defaults to True.

True
force_retrieve bool

Whether to force retrieving and reloading the dataset even if it is already cached. Overrides the retrieve flag if set to True. Defaults to False.

False
load_as_dict bool

Whether to load the dataset with multiple splits as a DatasetDict. If False (the default), the selected dataset splits are concatenated into a single dataset.

False

Returns:

Type Description
Dataset | DatasetDict

The loaded dataset.

Source code in evalsense/datasets/dataset_manager.py
def load(
    self,
    *,
    retrieve: bool = True,
    cache: bool = True,
    force_retrieve: bool = False,
    load_as_dict: bool = False,
) -> Dataset | DatasetDict:
    """Loads the dataset as a HuggingFace dataset.

    Args:
        retrieve (bool, optional): Whether to retrieve the dataset if it
            does not exist locally. Defaults to True.
        cache (bool, optional): Whether to cache the dataset in memory.
            Defaults to True.
        force_retrieve (bool, optional): Whether to force retrieving and
            reloading the dataset even if it is already cached. Overrides
            the `retrieve` flag if set to True. Defaults to False.
        load_as_dict (bool, optional): Whether to load the dataset with
            multiple splits as a DatasetDict. If False (the default),
            the selected dataset splits are concatenated into a single
            dataset.

    Returns:
        (Dataset | DatasetDict): The loaded dataset.

    Raises:
        ValueError: If the dataset is not available locally and `retrieve`
            is False, if the data stored on disk is not a DatasetDict, or
            if one of the selected splits is missing from the stored data.
    """
    # Return quickly if we already have the dataset cached
    if not load_as_dict and self.dataset is not None and not force_retrieve:
        return self.dataset
    if load_as_dict and self.dataset_dict is not None and not force_retrieve:
        return self.dataset_dict

    # Retrieve the dataset if needed
    if (not self.is_retrieved() and retrieve) or force_retrieve:
        self.retrieve()
    elif not self.is_retrieved():
        raise ValueError(
            f"Dataset {self.name} is not available locally and "
            "retrieve is set to False. Either `retrieve` the dataset first or "
            "set the retrieve flag to True."
        )

    # Load the retrieved dataset
    hf_dataset = load_from_disk(self.main_data_path)
    if not isinstance(hf_dataset, DatasetDict):
        raise ValueError(
            "Expected dataset to be DatasetDict, but got regular Dataset."
        )
    try:
        # Keep only the selected splits, in the order given by self.splits
        hf_dataset = DatasetDict({sid: hf_dataset[sid] for sid in self.splits})
    except KeyError as e:
        # Chain explicitly so the traceback marks the failed split lookup
        # as the direct cause of this error.
        raise ValueError(f"No such split {e}.") from e

    if load_as_dict:
        # Return the dataset as a dictionary
        if cache:
            self.dataset_dict = hf_dataset
        return hf_dataset

    # Concatenate the splits and return the data as a single Dataset object.
    # Every split is cast to the features of the first selected split before
    # concatenation.
    hf_dataset = concatenate_datasets(
        [
            hf_dataset[s].cast(hf_dataset[self.splits[0]].features)
            for s in self.splits
        ]
    )
    if cache:
        self.dataset = hf_dataset
    return hf_dataset

remove

remove() -> None

Deletes the dataset at the specific version from disk.

Source code in evalsense/datasets/dataset_manager.py
def remove(self) -> None:
    """Removes the on-disk directory holding this dataset version, if any."""
    # Nothing to delete when the version directory was never created.
    if not self.version_path.exists():
        return
    shutil.rmtree(self.version_path)

retrieve

retrieve(**kwargs) -> None

Downloads and preprocesses a dataset.

Parameters:

Name Type Description Default
**kwargs dict

Additional keyword arguments.

{}
Source code in evalsense/datasets/dataset_manager.py
@override
def retrieve(self, **kwargs) -> None:
    """Downloads and preprocesses a dataset.

    Args:
        **kwargs (dict): Additional keyword arguments.
    """
    self.version_path.mkdir(parents=True, exist_ok=True)
    self._retrieve_files(**kwargs)
    self._preprocess_files(**kwargs)

unload

unload() -> None

Unloads the dataset from memory.

Source code in evalsense/datasets/dataset_manager.py
def unload(self) -> None:
    """Drops both in-memory copies of the dataset held by this manager."""
    # Clear the concatenated dataset first, then the per-split dictionary.
    self.dataset = self.dataset_dict = None

HuggingFaceDatasetManager

Bases: DatasetManager

A dataset manager for Hugging Face datasets.

Methods:

Name Description
__init__

Initializes a new HuggingFaceDatasetManager.

can_handle

Checks if the DatasetManager can handle the given dataset.

create

Creates a new dataset manager for the specified dataset.

is_retrieved

Checks if the dataset at the specific version is already downloaded.

load

Loads the dataset as a HuggingFace dataset.

remove

Deletes the dataset at the specific version from disk.

retrieve

Downloads and preprocesses a dataset.

unload

Unloads the dataset from memory.

Attributes:

Name Type Description
dataset_path Path

The top-level directory for storing this dataset.

main_data_path Path

The path for storing the preprocessed dataset files for a specific version.

record DatasetRecord

Returns a record identifying the dataset.

version_path Path

The directory for storing a specific version of this dataset.

Source code in evalsense/datasets/managers/huggingface.py
@manager
class HuggingFaceDatasetManager(DatasetManager):
    """A dataset manager for Hugging Face datasets."""

    priority = 3

    def __init__(
        self,
        name: str,
        version: str | None = "main",
        splits: list[str] | None = None,
        data_dir: str | None = None,
        **kwargs,
    ):
        """Initializes a new HuggingFaceDatasetManager.

        Args:
            name (str): The name of the dataset.
            version (str, optional): The dataset version to retrieve.
                Defaults to "main"; `None` is treated as "main" as well.
            splits (list[str], optional): The dataset splits to retrieve.
                When omitted, the split names are looked up with
                `get_dataset_split_names` for the given name and version.
            data_dir (str, optional): The top-level directory for storing all
                datasets. Defaults to "datasets" in the user cache directory.
            **kwargs (dict): Additional keyword arguments.
        """
        # `DatasetManager.create` forwards `version=None` when the caller
        # does not pick a version, so normalise it to the default here
        # (AciBenchDatasetManager substitutes its default the same way).
        if version is None:
            version = "main"

        if splits is None:
            splits = cast(list[str], get_dataset_split_names(name, revision=version))

        super().__init__(
            name=name,
            version=version,
            splits=splits,
            data_dir=data_dir,
            **kwargs,
        )

    @override
    def retrieve(self, **kwargs) -> None:
        """Downloads and preprocesses a dataset.

        The dataset is loaded with `load_dataset` at the configured version
        and saved to `self.main_data_path`.

        Args:
            **kwargs (dict): Additional keyword arguments. Not used by this
                implementation.

        Raises:
            ValueError: If the loaded object is not a DatasetDict.
        """
        dataset = load_dataset(self.name, revision=self.version)
        if not isinstance(dataset, DatasetDict):
            raise ValueError(f"Unexpected dataset type: {type(dataset)}.")
        with disable_dataset_progress_bars():
            dataset.save_to_disk(self.main_data_path)

    @classmethod
    @override
    def can_handle(cls, name: str) -> bool:
        """Checks if the DatasetManager can handle the given dataset.

        Args:
            name (str): The name of the dataset.

        Returns:
            (bool): True if the manager can handle the dataset, False otherwise.
        """
        # Delegates to `repo_exists`, restricted to dataset repositories.
        return repo_exists(name, repo_type="dataset")

dataset_path property

dataset_path: Path

The top-level directory for storing this dataset.

Returns:

Type Description
Path

The dataset directory.

main_data_path property

main_data_path: Path

The path for storing the preprocessed dataset files for a specific version.

Returns:

Type Description
Path

The main dataset directory.

record property

record: DatasetRecord

Returns a record identifying the dataset.

Returns:

Type Description
DatasetRecord

The dataset record.

version_path property

version_path: Path

The directory for storing a specific version of this dataset.

Returns:

Type Description
Path

The dataset version directory.

__init__

__init__(
    name: str,
    version: str = "main",
    splits: list[str] | None = None,
    data_dir: str | None = None,
    **kwargs,
)

Initializes a new HuggingFaceDatasetManager.

Parameters:

Name Type Description Default
name str

The name of the dataset.

required
version str

The dataset version to retrieve.

'main'
splits list[str]

The dataset splits to retrieve.

None
data_dir str

The top-level directory for storing all datasets. Defaults to "datasets" in the user cache directory.

None
**kwargs dict

Additional keyword arguments.

{}
Source code in evalsense/datasets/managers/huggingface.py
def __init__(
    self,
    name: str,
    version: str | None = "main",
    splits: list[str] | None = None,
    data_dir: str | None = None,
    **kwargs,
):
    """Initializes a new HuggingFaceDatasetManager.

    Args:
        name (str): The name of the dataset.
        version (str, optional): The dataset version to retrieve.
            Defaults to "main"; `None` is treated as "main" as well.
        splits (list[str], optional): The dataset splits to retrieve.
            When omitted, the split names are looked up with
            `get_dataset_split_names` for the given name and version.
        data_dir (str, optional): The top-level directory for storing all
            datasets. Defaults to "datasets" in the user cache directory.
        **kwargs (dict): Additional keyword arguments.
    """
    # `DatasetManager.create` forwards `version=None` when the caller does
    # not pick a version, so normalise it to the default here
    # (AciBenchDatasetManager substitutes its default the same way).
    if version is None:
        version = "main"

    if splits is None:
        splits = cast(list[str], get_dataset_split_names(name, revision=version))

    super().__init__(
        name=name,
        version=version,
        splits=splits,
        data_dir=data_dir,
        **kwargs,
    )

can_handle classmethod

can_handle(name: str) -> bool

Checks if the DatasetManager can handle the given dataset.

Parameters:

Name Type Description Default
name str

The name of the dataset.

required

Returns:

Type Description
bool

True if the manager can handle the dataset, False otherwise.

Source code in evalsense/datasets/managers/huggingface.py
@classmethod
@override
def can_handle(cls, name: str) -> bool:
    """Reports whether this manager is responsible for the named dataset.

    The answer is whatever `repo_exists` returns for a dataset repository
    with the given name.

    Args:
        name (str): The name of the dataset.

    Returns:
        (bool): True if the manager can handle the dataset, False otherwise.
    """
    found = repo_exists(name, repo_type="dataset")
    return found

create classmethod

create(
    name: str,
    splits: list[str],
    version: str | None = None,
    data_dir: str | None = None,
    **kwargs: dict,
) -> DatasetManager

Creates a new dataset manager for the specified dataset.

Parameters:

Name Type Description Default
name str

The name of the dataset.

required
splits list[str]

The dataset splits to retrieve.

required
version str | None

The dataset version to retrieve.

None
data_dir str | None

The top-level directory for storing all datasets.

None
**kwargs dict

Additional keyword arguments.

{}

Returns:

Type Description
DatasetManager

The created dataset manager.

Source code in evalsense/datasets/dataset_manager.py
@classmethod
def create(
    cls,
    name: str,
    splits: list[str],
    version: str | None = None,
    data_dir: str | None = None,
    **kwargs: dict,
) -> "DatasetManager":
    """Builds a dataset manager suited to the named dataset.

    The manager class is looked up in `DatasetManagerRegistry` by dataset
    name and instantiated with the given arguments.

    Args:
        name (str): The name of the dataset.
        splits (list[str]): The dataset splits to retrieve.
        version (str | None): The dataset version to retrieve.
        data_dir (str | None): The top-level directory for storing all datasets.
        **kwargs (dict): Additional keyword arguments.

    Returns:
        (DatasetManager): The created dataset manager.

    Raises:
        ValueError: If the registry has no manager for the dataset.
    """
    manager_cls = DatasetManagerRegistry.get(name)
    if manager_cls is None:
        raise ValueError(f"No suitable dataset manager found for {name}")
    return manager_cls(
        name=name,
        splits=splits,
        version=version,
        data_dir=data_dir,
        **kwargs,
    )

is_retrieved

is_retrieved() -> bool

Checks if the dataset at the specific version is already downloaded.

Returns:

Type Description
bool

True if the dataset exists locally, False otherwise.

Source code in evalsense/datasets/dataset_manager.py
def is_retrieved(self) -> bool:
    """Reports whether the dataset at the specific version exists on disk.

    Returns:
        (bool): True if `self.main_data_path` exists, False otherwise.
    """
    data_location = self.main_data_path
    return data_location.exists()

load

load(
    *,
    retrieve: bool = True,
    cache: bool = True,
    force_retrieve: bool = False,
    load_as_dict: Literal[False] = ...,
) -> Dataset
load(
    *,
    retrieve: bool = True,
    cache: bool = True,
    force_retrieve: bool = False,
    load_as_dict: Literal[True],
) -> DatasetDict
load(
    *,
    retrieve: bool = True,
    cache: bool = True,
    force_retrieve: bool = False,
    load_as_dict: bool = False,
) -> Dataset | DatasetDict

Loads the dataset as a HuggingFace dataset.

Parameters:

Name Type Description Default
retrieve bool

Whether to retrieve the dataset if it does not exist locally. Defaults to True.

True
cache bool

Whether to cache the dataset in memory. Defaults to True.

True
force_retrieve bool

Whether to force retrieving and reloading the dataset even if it is already cached. Overrides the retrieve flag if set to True. Defaults to False.

False
load_as_dict bool

Whether to load the dataset with multiple splits as a DatasetDict. If False (the default), the selected dataset splits are concatenated into a single dataset.

False

Returns:

Type Description
Dataset | DatasetDict

The loaded dataset.

Source code in evalsense/datasets/dataset_manager.py
def load(
    self,
    *,
    retrieve: bool = True,
    cache: bool = True,
    force_retrieve: bool = False,
    load_as_dict: bool = False,
) -> Dataset | DatasetDict:
    """Loads the dataset as a HuggingFace dataset.

    Args:
        retrieve (bool, optional): Whether to retrieve the dataset if it
            does not exist locally. Defaults to True.
        cache (bool, optional): Whether to cache the dataset in memory.
            Defaults to True.
        force_retrieve (bool, optional): Whether to force retrieving and
            reloading the dataset even if it is already cached. Overrides
            the `retrieve` flag if set to True. Defaults to False.
        load_as_dict (bool, optional): Whether to load the dataset with
            multiple splits as a DatasetDict. If False (the default),
            the selected dataset splits are concatenated into a single
            dataset.

    Returns:
        (Dataset | DatasetDict): The loaded dataset.

    Raises:
        ValueError: If the dataset is not available locally and `retrieve`
            is False, if the data stored on disk is not a DatasetDict, or
            if one of the selected splits is missing from the stored data.
    """
    # Return quickly if we already have the dataset cached
    if not load_as_dict and self.dataset is not None and not force_retrieve:
        return self.dataset
    if load_as_dict and self.dataset_dict is not None and not force_retrieve:
        return self.dataset_dict

    # Retrieve the dataset if needed
    if (not self.is_retrieved() and retrieve) or force_retrieve:
        self.retrieve()
    elif not self.is_retrieved():
        raise ValueError(
            f"Dataset {self.name} is not available locally and "
            "retrieve is set to False. Either `retrieve` the dataset first or "
            "set the retrieve flag to True."
        )

    # Load the retrieved dataset
    hf_dataset = load_from_disk(self.main_data_path)
    if not isinstance(hf_dataset, DatasetDict):
        raise ValueError(
            "Expected dataset to be DatasetDict, but got regular Dataset."
        )
    try:
        # Keep only the selected splits, in the order given by self.splits
        hf_dataset = DatasetDict({sid: hf_dataset[sid] for sid in self.splits})
    except KeyError as e:
        # Chain explicitly so the traceback marks the failed split lookup
        # as the direct cause of this error.
        raise ValueError(f"No such split {e}.") from e

    if load_as_dict:
        # Return the dataset as a dictionary
        if cache:
            self.dataset_dict = hf_dataset
        return hf_dataset

    # Concatenate the splits and return the data as a single Dataset object.
    # Every split is cast to the features of the first selected split before
    # concatenation.
    hf_dataset = concatenate_datasets(
        [
            hf_dataset[s].cast(hf_dataset[self.splits[0]].features)
            for s in self.splits
        ]
    )
    if cache:
        self.dataset = hf_dataset
    return hf_dataset

remove

remove() -> None

Deletes the dataset at the specific version from disk.

Source code in evalsense/datasets/dataset_manager.py
def remove(self) -> None:
    """Removes the on-disk directory holding this dataset version, if any."""
    # Nothing to delete when the version directory was never created.
    if not self.version_path.exists():
        return
    shutil.rmtree(self.version_path)

retrieve

retrieve(**kwargs) -> None

Downloads and preprocesses a dataset.

Parameters:

Name Type Description Default
**kwargs dict

Additional keyword arguments.

{}
Source code in evalsense/datasets/managers/huggingface.py
@override
def retrieve(self, **kwargs) -> None:
    """Fetches the dataset and stores it on disk.

    The dataset is loaded with `load_dataset` at the configured version and
    saved to `self.main_data_path` with dataset progress bars disabled.

    Args:
        **kwargs (dict): Additional keyword arguments.

    Raises:
        ValueError: If the loaded object is not a DatasetDict.
    """
    downloaded = load_dataset(self.name, revision=self.version)
    if isinstance(downloaded, DatasetDict):
        with disable_dataset_progress_bars():
            downloaded.save_to_disk(self.main_data_path)
        return
    raise ValueError(f"Unexpected dataset type: {type(downloaded)}.")

unload

unload() -> None

Unloads the dataset from memory.

Source code in evalsense/datasets/dataset_manager.py
def unload(self) -> None:
    """Drops both in-memory copies of the dataset held by this manager."""
    # Clear the concatenated dataset first, then the per-split dictionary.
    self.dataset = self.dataset_dict = None