Skip to content

Dataset Managers

Modules:

Name Description
aci_bench

Classes:

Name Description
AciBenchDatasetManager

A dataset manager for the ACI Bench dataset.

AciBenchDatasetManager

Bases: DatasetManager

A dataset manager for the ACI Bench dataset.

Methods:

Name Description
__init__

Initializes a new AciBenchDatasetManager.

can_handle

Checks if the DatasetManager can handle the given dataset.

get

Downloads and preprocesses a dataset.

is_retrieved

Checks if the dataset at the specific version is already downloaded.

load

Loads the dataset as a HuggingFace dataset.

load_dict

Loads the dataset as a HuggingFace dataset dictionary.

remove

Deletes the dataset at the specific version from disk.

unload

Unloads the dataset from memory.

unload_dict

Unloads the dataset dictionary from memory.

Attributes:

Name Type Description
dataset_path Path

The top-level directory for storing this dataset.

main_data_path Path

The path for storing the preprocessed dataset files for a specific version.

record DatasetRecord

Returns a record identifying the dataset.

version_path Path

The directory for storing a specific version of this dataset.

Source code in evalsense/datasets/managers/aci_bench.py
class AciBenchDatasetManager(DatasetManager):
    """A dataset manager for the ACI Bench dataset."""

    _DATASET_NAME = "ACI-BENCH"

    def __init__(
        self,
        version: str = "5d3cd4d8a25b4ebb5b2b87c3923a7b2b7150e33d",
        splits: list[str] | None = None,
        data_dir: str | None = None,
        **kwargs,
    ):
        """Initializes a new AciBenchDatasetManager.

        Args:
            version (str, optional): The dataset version to retrieve.
            splits (list[str], optional): The dataset splits to retrieve.
            data_dir (str, optional): The top-level directory for storing all
                datasets. Defaults to "datasets" in the user cache directory.
            **kwargs (dict): Additional keyword arguments.
        """
        super().__init__(
            self._DATASET_NAME,
            version=version,
            splits=splits,
            priority=7,
            data_dir=data_dir,
            **kwargs,
        )

    @override
    def _preprocess_files(self, **kwargs) -> None:
        """Preprocesses the downloaded dataset files.

        For each configured split, the raw CSV files are read and inner-joined
        on their shared keys, then the resulting splits are persisted as a
        HuggingFace `DatasetDict` under `self.main_data_path`.

        Args:
            **kwargs (dict): Additional keyword arguments.
        """
        split_datasets: dict[str, Dataset] = {}
        for split in self.splits:
            # Read every raw file belonging to this split up front.
            split_files = self.config.get_files(self.version, [split]).values()
            frames = [
                pl.read_csv(self.version_path / file.name) for file in split_files
            ]
            if not frames:
                raise RuntimeError(f"No data found for split '{split}'.")
            # Fold all frames into one by inner-joining on the shared keys.
            merged = frames[0]
            for frame in frames[1:]:
                merged = merged.join(
                    frame, on=["dataset", "encounter_id"], how="inner"
                )
            split_datasets[split] = Dataset.from_polars(merged)

        # Persist the assembled splits to disk without progress-bar noise.
        with disable_dataset_progress_bars():
            DatasetDict(split_datasets).save_to_disk(self.main_data_path)

    @classmethod
    @override
    def can_handle(cls, name: str) -> bool:
        """Checks if the DatasetManager can handle the given dataset.

        Args:
            name (str): The name of the dataset.

        Returns:
            (bool): True if the manager can handle the dataset, False otherwise.
        """
        return cls._DATASET_NAME == name

dataset_path property

dataset_path: Path

The top-level directory for storing this dataset.

Returns:

Type Description
Path

The dataset directory.

main_data_path property

main_data_path: Path

The path for storing the preprocessed dataset files for a specific version.

Returns:

Type Description
Path

The main dataset directory.

record property

record: DatasetRecord

Returns a record identifying the dataset.

Returns:

Type Description
DatasetRecord

The dataset record.

version_path property

version_path: Path

The directory for storing a specific version of this dataset.

Returns:

Type Description
Path

The dataset version directory.

__init__

__init__(
    version: str = "5d3cd4d8a25b4ebb5b2b87c3923a7b2b7150e33d",
    splits: list[str] | None = None,
    data_dir: str | None = None,
    **kwargs,
)

Initializes a new AciBenchDatasetManager.

Parameters:

Name Type Description Default
version str

The dataset version to retrieve.

'5d3cd4d8a25b4ebb5b2b87c3923a7b2b7150e33d'
splits list[str]

The dataset splits to retrieve.

None
data_dir str

The top-level directory for storing all datasets. Defaults to "datasets" in the user cache directory.

None
**kwargs dict

Additional keyword arguments.

{}
Source code in evalsense/datasets/managers/aci_bench.py
def __init__(
    self,
    version: str = "5d3cd4d8a25b4ebb5b2b87c3923a7b2b7150e33d",
    splits: list[str] | None = None,
    data_dir: str | None = None,
    **kwargs,
):
    """Creates a new AciBenchDatasetManager.

    Delegates to the base DatasetManager with a fixed dataset name and
    priority, forwarding the remaining configuration untouched.

    Args:
        version (str, optional): The dataset version to retrieve.
        splits (list[str], optional): The dataset splits to retrieve.
        data_dir (str, optional): The top-level directory for storing all
            datasets. Defaults to "datasets" in the user cache directory.
        **kwargs (dict): Additional keyword arguments.
    """
    super().__init__(
        self._DATASET_NAME,
        data_dir=data_dir,
        splits=splits,
        version=version,
        priority=7,
        **kwargs,
    )

can_handle classmethod

can_handle(name: str) -> bool

Checks if the DatasetManager can handle the given dataset.

Parameters:

Name Type Description Default
name str

The name of the dataset.

required

Returns:

Type Description
bool

True if the manager can handle the dataset, False otherwise.

Source code in evalsense/datasets/managers/aci_bench.py
@classmethod
@override
def can_handle(cls, name: str) -> bool:
    """Determines whether this manager supports the named dataset.

    Args:
        name (str): The name of the dataset.

    Returns:
        (bool): True if the manager can handle the dataset, False otherwise.
    """
    return cls._DATASET_NAME == name

get

get(**kwargs) -> None

Downloads and preprocesses a dataset.

Parameters:

Name Type Description Default
**kwargs dict

Additional keyword arguments.

{}
Source code in evalsense/datasets/dataset_manager.py
def get(self, **kwargs) -> None:
    """Downloads and preprocesses a dataset.

    Ensures the version directory exists, then runs the retrieval and
    preprocessing stages in order.

    Args:
        **kwargs (dict): Additional keyword arguments.
    """
    target_dir = self.version_path
    target_dir.mkdir(parents=True, exist_ok=True)
    for stage in (self._retrieve_files, self._preprocess_files):
        stage(**kwargs)

is_retrieved

is_retrieved() -> bool

Checks if the dataset at the specific version is already downloaded.

Returns:

Type Description
bool

True if the dataset exists locally, False otherwise.

Source code in evalsense/datasets/dataset_manager.py
def is_retrieved(self) -> bool:
    """Reports whether the preprocessed dataset is already present on disk.

    Returns:
        (bool): True if the dataset exists locally, False otherwise.
    """
    # Presence of the main data directory is the retrieval marker.
    return self.main_data_path.exists()

load

load(
    retrieve: bool = True,
    cache: bool = True,
    force_retrieve: bool = False,
) -> Dataset

Loads the dataset as a HuggingFace dataset.

If multiple splits are specified, they are concatenated into a single dataset. See the load_dict method if you wish to load the dataset as a DatasetDict.

Parameters:

Name Type Description Default
retrieve bool

Whether to retrieve the dataset if it does not exist locally. Defaults to True.

True
cache bool

Whether to cache the dataset in memory. Defaults to True.

True
force_retrieve bool

Whether to force retrieving and reloading the dataset even if it is already cached. Overrides the retrieve flag if set to True. Defaults to False.

False

Returns:

Type Description
Dataset

The loaded dataset.

Source code in evalsense/datasets/dataset_manager.py
def load(
    self, retrieve: bool = True, cache: bool = True, force_retrieve: bool = False
) -> Dataset:
    """Loads the dataset as a HuggingFace dataset.

    If multiple splits are specified, they are concatenated into a single
    dataset. See the `load_dict` method if you wish to load the dataset as a
    `DatasetDict`.

    Args:
        retrieve (bool, optional): Whether to retrieve the dataset if it
            does not exist locally. Defaults to True.
        cache (bool, optional): Whether to cache the dataset in memory.
            Defaults to True.
        force_retrieve (bool, optional): Whether to force retrieving and
            reloading the dataset even if it is already cached. Overrides
            the `retrieve` flag if set to True. Defaults to False.

    Returns:
        (Dataset): The loaded dataset.
    """
    # Serve the in-memory copy unless a reload is explicitly forced.
    if self.dataset is not None and not force_retrieve:
        return self.dataset

    if (retrieve and not self.is_retrieved()) or force_retrieve:
        self.get()
    elif not self.is_retrieved():
        raise ValueError(
            f"Dataset {self.name} is not available locally and "
            "retrieve is set to False. Either `get` the dataset first or "
            "set the retrieve flag to True."
        )

    stored = load_from_disk(self.main_data_path)
    # Split selection is meaningless for a dataset saved without splits.
    if isinstance(stored, Dataset) and self.splits is not None:
        raise ValueError(
            f"Cannot load specific splits for an unpartitioned dataset {self.name}."
        )
    if isinstance(stored, DatasetDict):
        if self.splits is None:
            stored = concatenate_datasets(list(stored.values()))
        else:
            # Align all splits to the feature schema of the first split
            # before concatenating.
            reference_features = stored[self.splits[0]].features
            stored = concatenate_datasets(
                [stored[split].cast(reference_features) for split in self.splits]
            )
    if cache:
        self.dataset = stored
    return stored

load_dict

load_dict(
    retrieve: bool = True,
    cache: bool = True,
    force_retrieve: bool = False,
) -> DatasetDict

Loads the dataset as a HuggingFace dataset dictionary.

See the load method if you wish to concatenate the splits into a single dataset.

Parameters:

Name Type Description Default
retrieve bool

Whether to retrieve the dataset if it does not exist locally. Defaults to True.

True
cache bool

Whether to cache the dataset in memory. Defaults to True.

True
force_retrieve bool

Whether to force retrieving and reloading the dataset even if it is already cached. Overrides the retrieve flag if set to True. Defaults to False.

False

Returns:

Type Description
DatasetDict

The loaded dataset dictionary.

Source code in evalsense/datasets/dataset_manager.py
def load_dict(
    self, retrieve: bool = True, cache: bool = True, force_retrieve: bool = False
) -> DatasetDict:
    """Loads the dataset as a HuggingFace dataset dictionary.

    See the `load` method if you wish to concatenate the splits into
    a single dataset.

    Args:
        retrieve (bool, optional): Whether to retrieve the dataset if it
            does not exist locally. Defaults to True.
        cache (bool, optional): Whether to cache the dataset in memory.
            Defaults to True.
        force_retrieve (bool, optional): Whether to force retrieving and
            reloading the dataset even if it is already cached. Overrides
            the `retrieve` flag if set to True. Defaults to False.

    Returns:
        (DatasetDict): The loaded dataset dictionary.
    """
    if self.dataset_dict is not None and not force_retrieve:
        return self.dataset_dict

    if (not self.is_retrieved() and retrieve) or force_retrieve:
        self.get()
    elif not self.is_retrieved():
        raise ValueError(
            f"Dataset {self.name} is not available locally and "
            "retrieve is set to False. Either `get` the dataset first or "
            "set the retrieve flag to True."
        )
    hf_dataset = load_from_disk(self.main_data_path)
    if isinstance(hf_dataset, Dataset):
        raise ValueError(
            f"Cannot load an unpartitioned dataset {self.name} as dict."
        )
    if self.splits is not None:
        # Fix: `hf_dataset[self.splits]` indexed the DatasetDict (a dict
        # subclass) with a *list* of split names, which raises
        # `TypeError: unhashable type: 'list'`. Select the requested splits
        # by building a new DatasetDict instead.
        hf_dataset = DatasetDict(
            {split: hf_dataset[split] for split in self.splits}
        )
    if cache:
        self.dataset_dict = hf_dataset
    return hf_dataset

remove

remove() -> None

Deletes the dataset at the specific version from disk.

Source code in evalsense/datasets/dataset_manager.py
def remove(self) -> None:
    """Deletes the dataset at the specific version from disk."""
    target = self.version_path
    # Removing a version that was never downloaded is a no-op.
    if target.exists():
        shutil.rmtree(target)

unload

unload() -> None

Unloads the dataset from memory.

Source code in evalsense/datasets/dataset_manager.py
def unload(self) -> None:
    """Drops the cached dataset so the memory can be reclaimed."""
    self.dataset = None

unload_dict

unload_dict() -> None

Unloads the dataset dictionary from memory.

Source code in evalsense/datasets/dataset_manager.py
def unload_dict(self) -> None:
    """Drops the cached dataset dictionary so the memory can be reclaimed."""
    self.dataset_dict = None