Skip to content

Dataset Managers

Modules:

Name Description
aci_bench

Classes:

Name Description
AciBenchDatasetManager

A dataset manager for the ACI Bench dataset.

AciBenchDatasetManager

Bases: DatasetManager

A dataset manager for the ACI Bench dataset.

Methods:

Name Description
__init__

Initializes a new AciBenchDatasetManager.

can_handle

Checks if the DatasetManager can handle the given dataset.

get

Downloads and preprocesses a dataset.

is_retrieved

Checks if the dataset at the specific version is already downloaded.

load

Loads the dataset as a HuggingFace dataset.

load_dict

Loads the dataset as a HuggingFace dataset dictionary.

remove

Deletes the dataset at the specific version from disk.

unload

Unloads the dataset from memory.

unload_dict

Unloads the dataset dictionary from memory.

Attributes:

Name Type Description
dataset_path Path

The top-level directory for storing this dataset.

main_data_path Path

The path for storing the preprocessed dataset files for a specific version.

record DatasetRecord

Returns a record identifying the dataset.

version_path Path

The directory for storing a specific version of this dataset.

Source code in evalsense/datasets/managers/aci_bench.py
class AciBenchDatasetManager(DatasetManager):
    """A dataset manager for the ACI Bench dataset."""

    _DATASET_NAME = "ACI-BENCH"

    def __init__(
        self,
        version: str = "5d3cd4d8a25b4ebb5b2b87c3923a7b2b7150e33d",
        splits: list[str] | None = None,
        data_dir: str | None = None,
        **kwargs,
    ):
        """Initializes a new AciBenchDatasetManager.

        Args:
            version (str, optional): The dataset version to retrieve.
            splits (list[str], optional): The dataset splits to retrieve.
            data_dir (str, optional): The top-level directory for storing all
                datasets. Defaults to "datasets" in the user cache directory.
            **kwargs (dict): Additional keyword arguments.
        """
        super().__init__(
            self._DATASET_NAME,
            version=version,
            splits=splits,
            priority=7,
            data_dir=data_dir,
            **kwargs,
        )

    @override
    def _preprocess_files(self, **kwargs) -> None:
        """Preprocesses the downloaded dataset files.

        For each configured split, the raw CSV files are read and inner-joined
        on their shared keys, then the resulting splits are persisted as a
        HuggingFace `DatasetDict` under `self.main_data_path`.

        Args:
            **kwargs (dict): Additional keyword arguments.
        """
        split_datasets: dict[str, Dataset] = {}
        for split in self.splits:
            # Read every raw file belonging to this split up front.
            split_files = self.config.get_files(self.version, [split]).values()
            frames = [
                pl.read_csv(self.version_path / file.name) for file in split_files
            ]
            if not frames:
                raise RuntimeError(f"No data found for split '{split}'.")
            # Fold all frames into one by inner-joining on the shared keys.
            merged = frames[0]
            for frame in frames[1:]:
                merged = merged.join(
                    frame, on=["dataset", "encounter_id"], how="inner"
                )
            split_datasets[split] = Dataset.from_polars(merged)

        # Persist the assembled splits to disk without progress-bar noise.
        with disable_dataset_progress_bars():
            DatasetDict(split_datasets).save_to_disk(self.main_data_path)

    @classmethod
    @override
    def can_handle(cls, name: str) -> bool:
        """Checks if the DatasetManager can handle the given dataset.

        Args:
            name (str): The name of the dataset.

        Returns:
            (bool): True if the manager can handle the dataset, False otherwise.
        """
        return cls._DATASET_NAME == name

dataset_path property

dataset_path: Path

The top-level directory for storing this dataset.

Returns:

Type Description
Path

The dataset directory.

main_data_path property

main_data_path: Path

The path for storing the preprocessed dataset files for a specific version.

Returns:

Type Description
Path

The main dataset directory.

record property

record: DatasetRecord

Returns a record identifying the dataset.

Returns:

Type Description
DatasetRecord

The dataset record.

version_path property

version_path: Path

The directory for storing a specific version of this dataset.

Returns:

Type Description
Path

The dataset version directory.

__init__

__init__(
    version: str = "5d3cd4d8a25b4ebb5b2b87c3923a7b2b7150e33d",
    splits: list[str] | None = None,
    data_dir: str | None = None,
    **kwargs,
)

Initializes a new AciBenchDatasetManager.

Parameters:

Name Type Description Default
version str

The dataset version to retrieve.

'5d3cd4d8a25b4ebb5b2b87c3923a7b2b7150e33d'
splits list[str]

The dataset splits to retrieve.

None
data_dir str

The top-level directory for storing all datasets. Defaults to "datasets" in the user cache directory.

None
**kwargs dict

Additional keyword arguments.

{}
Source code in evalsense/datasets/managers/aci_bench.py
def __init__(
    self,
    version: str = "5d3cd4d8a25b4ebb5b2b87c3923a7b2b7150e33d",
    splits: list[str] | None = None,
    data_dir: str | None = None,
    **kwargs,
):
    """Creates a new AciBenchDatasetManager.

    Delegates to the base DatasetManager with a fixed dataset name and
    priority, forwarding the remaining configuration untouched.

    Args:
        version (str, optional): The dataset version to retrieve.
        splits (list[str], optional): The dataset splits to retrieve.
        data_dir (str, optional): The top-level directory for storing all
            datasets. Defaults to "datasets" in the user cache directory.
        **kwargs (dict): Additional keyword arguments.
    """
    super().__init__(
        self._DATASET_NAME,
        data_dir=data_dir,
        splits=splits,
        version=version,
        priority=7,
        **kwargs,
    )

can_handle classmethod

can_handle(name: str) -> bool

Checks if the DatasetManager can handle the given dataset.

Parameters:

Name Type Description Default
name str

The name of the dataset.

required

Returns:

Type Description
bool

True if the manager can handle the dataset, False otherwise.

Source code in evalsense/datasets/managers/aci_bench.py
@classmethod
@override
def can_handle(cls, name: str) -> bool:
    """Determines whether this manager supports the named dataset.

    Args:
        name (str): The name of the dataset.

    Returns:
        (bool): True if the manager can handle the dataset, False otherwise.
    """
    return cls._DATASET_NAME == name

get

get(**kwargs) -> None

Downloads and preprocesses a dataset.

Parameters:

Name Type Description Default
**kwargs dict

Additional keyword arguments.

{}
Source code in evalsense/datasets/dataset_manager.py
def get(self, **kwargs) -> None:
    """Downloads and preprocesses a dataset.

    Ensures the version directory exists, then runs the retrieval and
    preprocessing stages in order.

    Args:
        **kwargs (dict): Additional keyword arguments.
    """
    target_dir = self.version_path
    target_dir.mkdir(parents=True, exist_ok=True)
    for stage in (self._retrieve_files, self._preprocess_files):
        stage(**kwargs)

is_retrieved

is_retrieved() -> bool

Checks if the dataset at the specific version is already downloaded.

Returns:

Type Description
bool

True if the dataset exists locally, False otherwise.

Source code in evalsense/datasets/dataset_manager.py
def is_retrieved(self) -> bool:
    """Reports whether the preprocessed dataset is already present on disk.

    Returns:
        (bool): True if the dataset exists locally, False otherwise.
    """
    # Presence of the main data directory is the retrieval marker.
    return self.main_data_path.exists()

load

load(
    retrieve: bool = True,
    cache: bool = True,
    force_retrieve: bool = False,
) -> Dataset

Loads the dataset as a HuggingFace dataset.

If multiple splits are specified, they are concatenated into a single dataset. See the load_dict method if you wish to load the dataset as a DatasetDict.

Parameters:

Name Type Description Default
retrieve bool

Whether to retrieve the dataset if it does not exist locally. Defaults to True.

True
cache bool

Whether to cache the dataset in memory. Defaults to True.

True
force_retrieve bool

Whether to force retrieving and reloading the dataset even if it is already cached. Overrides the retrieve flag if set to True. Defaults to False.

False

Returns:

Type Description
Dataset

The loaded dataset.

Source code in evalsense/datasets/dataset_manager.py
def load(
    self, retrieve: bool = True, cache: bool = True, force_retrieve: bool = False
) -> Dataset:
    """Loads the dataset as a HuggingFace dataset.

    If multiple splits are specified, they are concatenated into a single
    dataset. See the `load_dict` method if you wish to load the dataset as a
    `DatasetDict`.

    Args:
        retrieve (bool, optional): Whether to retrieve the dataset if it
            does not exist locally. Defaults to True.
        cache (bool, optional): Whether to cache the dataset in memory.
            Defaults to True.
        force_retrieve (bool, optional): Whether to force retrieving and
            reloading the dataset even if it is already cached. Overrides
            the `retrieve` flag if set to True. Defaults to False.

    Returns:
        (Dataset): The loaded dataset.
    """
    # Serve the in-memory copy unless a reload is explicitly forced.
    if self.dataset is not None and not force_retrieve:
        return self.dataset

    if (retrieve and not self.is_retrieved()) or force_retrieve:
        self.get()
    elif not self.is_retrieved():
        raise ValueError(
            f"Dataset {self.name} is not available locally and "
            "retrieve is set to False. Either `get` the dataset first or "
            "set the retrieve flag to True."
        )

    stored = load_from_disk(self.main_data_path)
    # Split selection is meaningless for a dataset saved without splits.
    if isinstance(stored, Dataset) and self.splits is not None:
        raise ValueError(
            f"Cannot load specific splits for an unpartitioned dataset {self.name}."
        )
    if isinstance(stored, DatasetDict):
        if self.splits is None:
            stored = concatenate_datasets(list(stored.values()))
        else:
            # Align all splits to the feature schema of the first split
            # before concatenating.
            reference_features = stored[self.splits[0]].features
            stored = concatenate_datasets(
                [stored[split].cast(reference_features) for split in self.splits]
            )
    if cache:
        self.dataset = stored
    return stored

load_dict

load_dict(
    retrieve: bool = True,
    cache: bool = True,
    force_retrieve: bool = False,
) -> DatasetDict

Loads the dataset as a HuggingFace dataset dictionary.

See the load method if you wish to concatenate the splits into a single dataset.

Parameters:

Name Type Description Default
retrieve bool

Whether to retrieve the dataset if it does not exist locally. Defaults to True.

True
cache bool

Whether to cache the dataset in memory. Defaults to True.

True
force_retrieve bool

Whether to force retrieving and reloading the dataset even if it is already cached. Overrides the retrieve flag if set to True. Defaults to False.

False

Returns:

Type Description
DatasetDict

The loaded dataset dictionary.

Source code in evalsense/datasets/dataset_manager.py
def load_dict(
    self, retrieve: bool = True, cache: bool = True, force_retrieve: bool = False
) -> DatasetDict:
    """Loads the dataset as a HuggingFace dataset dictionary.

    See the `load` method if you wish to concatenate the splits into
    a single dataset.

    Args:
        retrieve (bool, optional): Whether to retrieve the dataset if it
            does not exist locally. Defaults to True.
        cache (bool, optional): Whether to cache the dataset in memory.
            Defaults to True.
        force_retrieve (bool, optional): Whether to force retrieving and
            reloading the dataset even if it is already cached. Overrides
            the `retrieve` flag if set to True. Defaults to False.

    Returns:
        (DatasetDict): The loaded dataset dictionary.
    """
    if self.dataset_dict is not None and not force_retrieve:
        return self.dataset_dict

    if (not self.is_retrieved() and retrieve) or force_retrieve:
        self.get()
    elif not self.is_retrieved():
        raise ValueError(
            f"Dataset {self.name} is not available locally and "
            "retrieve is set to False. Either `get` the dataset first or "
            "set the retrieve flag to True."
        )
    hf_dataset = load_from_disk(self.main_data_path)
    if isinstance(hf_dataset, Dataset):
        raise ValueError(
            f"Cannot load an unpartitioned dataset {self.name} as dict."
        )
    if self.splits is not None:
        # Fix: `hf_dataset[self.splits]` indexed the DatasetDict (a dict
        # subclass) with a *list* of split names, which raises
        # `TypeError: unhashable type: 'list'`. Select the requested splits
        # by building a new DatasetDict instead.
        hf_dataset = DatasetDict(
            {split: hf_dataset[split] for split in self.splits}
        )
    if cache:
        self.dataset_dict = hf_dataset
    return hf_dataset

remove

remove() -> None

Deletes the dataset at the specific version from disk.

Source code in evalsense/datasets/dataset_manager.py
def remove(self) -> None:
    """Deletes the dataset at the specific version from disk."""
    target = self.version_path
    # Removing a version that was never downloaded is a no-op.
    if target.exists():
        shutil.rmtree(target)

unload

unload() -> None

Unloads the dataset from memory.

Source code in evalsense/datasets/dataset_manager.py
def unload(self) -> None:
    """Drops the cached dataset so the memory can be reclaimed."""
    self.dataset = None

unload_dict

unload_dict() -> None

Unloads the dataset dictionary from memory.

Source code in evalsense/datasets/dataset_manager.py
def unload_dict(self) -> None:
    """Drops the cached dataset dictionary so the memory can be reclaimed."""
    self.dataset_dict = None