Skip to content

Generating Useful Synthetic Data

model

nhsengland/NHSSynth

model

`Model`

Bases: Module, ABC

Abstract base class for all NHSSynth models

Parameters:

Name	Type	Description	Default
`data`	`DataFrame`	The data to train on	required
`metatransformer`	`MetaTransformer`	A `MetaTransformer` to use for converting the generated data to match the original data	required
`batch_size`	`int`	The batch size to use during training	`32`
`use_gpu`	`bool`	Flag to determine whether to use the GPU (if available)	`False`

Attributes:

Name	Type	Description
`nrows`		The number of rows in the `data`
`ncols`		The number of columns in the `data`
`columns`	`Index`	The names of the columns in the `data`
`metatransformer`		The `MetaTransformer` (potentially) associated with the model
`multi_column_indices`	`list[list[int]]`	A list of lists of column indices, where each sublist contains the indices for a one-hot encoded column
`single_column_indices`	`list[int]`	Indices of all non-onehot columns
`data_loader`	`DataLoader`	A PyTorch DataLoader for the `data`
`private`	`bool`	Whether the model is private, i.e. whether the `DPMixin` class has been inherited
`device`	`torch.device`	The device to use for training (CPU or GPU)

Raises:

Type	Description
`TypeError`	If the `Model` class is directly instantiated (i.e. not inherited)
`AssertionError`	If the number of columns in the `data` does not match the number of indices in `multi_column_indices` and `single_column_indices`
`UserWarning`	If `use_gpu` is True but no GPU is available

Source code in src/nhssynth/modules/model/common/model.py

class Model(nn.Module, ABC):
    """
    Abstract base class for all NHSSynth models

    Args:
        data: The data to train on
        metatransformer: A `MetaTransformer` to use for converting the generated data to match the original data
        cond: Optional conditioning values (one row per row of `data`); they are one-hot encoded and served alongside `data` by the data loader
        batch_size: The batch size to use during training
        use_gpu: Flag to determine whether to use the GPU (if available)

    Attributes:
        nrows: The number of rows in the `data`
        ncols: The number of columns in the `data`
        columns: The names of the columns in the `data`
        metatransformer: The `MetaTransformer` (potentially) associated with the model
        multi_column_indices: A list of lists of column indices, where each sublist contains the indices for a one-hot encoded column
        single_column_indices: Indices of all non-onehot columns
        cond_encoder: The `OneHotEncoder` fitted on `cond`, or `None` when no `cond` was provided
        n_units_conditional: The width of the one-hot encoded `cond` (0 when no `cond` was provided)
        data_loader: A PyTorch DataLoader for the `data`
        private: Whether the model is private, i.e. whether the `DPMixin` class has been inherited
        device: The device to use for training (CPU or GPU)

    Raises:
        TypeError: If the `Model` class is directly instantiated (i.e. not inherited)
        AssertionError: If the number of columns in the `data` does not match the number of indices in `multi_column_indices` and `single_column_indices`
        UserWarning: If `use_gpu` is True but no GPU is available
    """

    def __init__(
        self,
        data: pd.DataFrame,
        metatransformer: MetaTransformer,
        cond: Optional[Union[pd.DataFrame, pd.Series, np.ndarray]] = None,
        batch_size: int = 32,
        use_gpu: bool = False,
    ) -> None:
        if type(self) is Model:
            raise TypeError("Cannot directly instantiate the `Model` class")
        super().__init__()

        self.nrows, self.ncols = data.shape
        self.columns: pd.Index = data.columns

        self.batch_size = batch_size

        self.metatransformer = metatransformer
        self.multi_column_indices: list[list[int]] = metatransformer.multi_column_indices
        self.single_column_indices: list[int] = metatransformer.single_column_indices
        # Every column must be accounted for exactly once by the two index collections.
        assert len(self.single_column_indices) + sum(len(x) for x in self.multi_column_indices) == self.ncols

        tensor_data = torch.Tensor(data.to_numpy())
        self.cond_encoder: Optional[OneHotEncoder] = None
        if cond is not None:
            cond = np.asarray(cond)
            if cond.ndim == 1:
                # The encoder expects a 2D (n_samples, n_features) array.
                cond = cond.reshape(-1, 1)
            self.cond_encoder = OneHotEncoder(handle_unknown="ignore").fit(cond)
            encoded_cond = self.cond_encoder.transform(cond).toarray()
            self.n_units_conditional = encoded_cond.shape[-1]
            # `TensorDataset` only accepts tensors, so the encoded NumPy array must be converted first.
            dataset = TensorDataset(tensor_data, torch.Tensor(encoded_cond))
        else:
            self.n_units_conditional = 0
            dataset = TensorDataset(tensor_data)

        self.data_loader: DataLoader = DataLoader(
            dataset,
            pin_memory=True,
            batch_size=self.batch_size,
        )
        self.setup_device(use_gpu)

    def setup_device(self, use_gpu: bool) -> None:
        """
        Sets up the device to use for training (CPU or GPU) depending on `use_gpu` and device availability.

        Args:
            use_gpu: Whether a GPU was requested; if it was but none is available a warning is issued and the CPU is used
        """
        # Default to the CPU and only switch when a GPU was both requested and found.
        self.device: torch.device = torch.device("cpu")
        if use_gpu:
            if torch.cuda.is_available():
                self.device = torch.device("cuda:0")
            else:
                warnings.warn("`use_gpu` was provided but no GPU is available, using CPU")

    def save(self, filename: str) -> None:
        """Saves the model to `filename`."""
        torch.save(self.state_dict(), filename)

    def load(self, path: str) -> None:
        """Loads the model from `path`."""
        self.load_state_dict(torch.load(path))

    @classmethod
    @abstractmethod
    def get_args(cls) -> list[str]:
        """Returns the list of arguments to look for in an `argparse.Namespace`, these must map to the arguments of the inheritor."""
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def get_metrics(cls) -> list[str]:
        """Returns the list of metrics to track during training."""
        raise NotImplementedError

    def _start_training(self, num_epochs: int, patience: int, displayed_metrics: list[str], notebook_run: bool) -> None:
        """
        Initialises the training process.

        Args:
            num_epochs: The number of epochs to train for
            patience: The number of epochs to wait before stopping training early if the loss does not improve
            displayed_metrics: The metrics to display during training; an empty list falls back to every metric returned by `get_metrics`
            notebook_run: Whether training is run from a notebook, in which case no tqdm status bars are created

        Attributes:
            metrics: A dictionary of arrays of tracked metrics, where each array contains the values for each batch
            stats_bars: A dictionary of tqdm status bars for each displayed metric (empty if `notebook_run`)
            max_length: The maximum length of the displayed metric names, used for formatting the tqdm status bars
            start_time: The time at which training started
            update_time: The time at which the tqdm status bars were last updated
        """
        self.num_epochs = num_epochs
        self.patience = patience
        self.metrics = {metric: np.empty(0, dtype=float) for metric in self.get_metrics()}
        displayed_metrics = displayed_metrics or self.get_metrics()
        if notebook_run:
            self.stats_bars = {}
        else:
            self.stats_bars = {
                metric: tqdm(total=0, desc="", position=i, bar_format="{desc}", leave=True)
                for i, metric in enumerate(displayed_metrics)
            }
        # The trailing 20 is a floor on the label width, and keeps `max` well-defined.
        self.max_length = max([len(add_spaces_before_caps(s)) + 5 for s in displayed_metrics] + [20])
        self.start_time = self.update_time = time.time()

    def _generate_metric_str(self, key) -> str:
        """Generates a string to display the mean of the last `len(self.data_loader)` recorded values of the metric `key`."""
        label = (add_spaces_before_caps(key) + ":").ljust(self.max_length)
        recent_values = self.metrics[key][-len(self.data_loader) :]
        return f"{label}  {np.mean(recent_values):.4f}"

    def _record_metrics(self, losses):
        """
        Appends the metrics for the current batch to `self.metrics` and periodically refreshes the tqdm status bars.

        Args:
            losses: A mapping from metric name to its value for the current batch (a `torch.Tensor` or a plain number)
        """
        for key in self.metrics:
            # NOTE(review): falsy values (including an exact 0.0) are skipped, as in the original implementation.
            if key in losses and losses[key]:
                value = losses[key]
                self.metrics[key] = np.append(
                    self.metrics[key], value.item() if isinstance(value, torch.Tensor) else value
                )
        # Throttle the status bar refresh to at most once every half second.
        if time.time() - self.update_time > 0.5:
            for key, stats_bar in self.stats_bars.items():
                stats_bar.set_description_str(self._generate_metric_str(key))
            self.update_time = time.time()

    def _check_patience(self, epoch: int, metric: float) -> bool:
        """
        Maintains `_min_metric` and `_stop_counter` to determine whether to stop training early according to `patience`.

        Args:
            epoch: The index of the current epoch; the tracking state is (re)initialised when this is 0
            metric: The value of the monitored metric for this epoch (lower is better)

        Returns:
            `True` once the metric has failed to improve for `patience` consecutive checks
        """
        if epoch == 0:
            self._stop_counter = 0
            self._min_metric = metric
            # NOTE(review): the tolerance is relative to the first metric, so it is negative if that metric is negative.
            self._patience_delta = self._min_metric / 1e4
        if metric < (self._min_metric - self._patience_delta):
            self._min_metric = metric
            self._stop_counter = 0  # Improvement: restart the count
        else:  # The metric has not improved
            self._stop_counter += 1
        return self._stop_counter == self.patience

    def _finish_training(self, num_epochs: int) -> None:
        """Closes each of the tqdm status bars and prints the time taken to do `num_epochs`."""
        for stats_bar in self.stats_bars.values():
            stats_bar.close()
        tqdm.write(f"Completed {num_epochs} epochs in {time.time() - self.start_time:.2f} seconds.\033[0m")

`get_args()` `abstractmethod` `classmethod`

Returns the list of arguments to look for in an argparse.Namespace, these must map to the arguments of the inheritor.

Source code in src/nhssynth/modules/model/common/model.py

@classmethod
@abstractmethod
def get_args(cls) -> list[str]:
    """
    Returns the list of arguments to look for in an `argparse.Namespace`, these must map to the arguments of the inheritor.

    Raises:
        NotImplementedError: Always, as inheritors must provide their own implementation
    """
    # A classmethod always receives the class as its first argument, so `cls` must be declared.
    raise NotImplementedError

`get_metrics()` `abstractmethod` `classmethod`

Returns the list of metrics to track during training.

Source code in src/nhssynth/modules/model/common/model.py

@classmethod
@abstractmethod
def get_metrics(cls) -> list[str]:
    """
    Returns the list of metrics to track during training.

    Raises:
        NotImplementedError: Always, as inheritors must provide their own implementation
    """
    # A classmethod always receives the class as its first argument, so `cls` must be declared.
    raise NotImplementedError

`load(path)`

Loads the model from path.

Source code in src/nhssynth/modules/model/common/model.py

def load(self, path: str) -> None:
    """Restores the model's parameters from the state dict stored at `path`."""
    stored_state = torch.load(path)
    self.load_state_dict(stored_state)

`save(filename)`

Saves the model to filename.

Source code in src/nhssynth/modules/model/common/model.py

def save(self, filename: str) -> None:
    """Writes the model's state dict to `filename`."""
    current_state = self.state_dict()
    torch.save(current_state, filename)

`setup_device(use_gpu)`

Sets up the device to use for training (CPU or GPU) depending on use_gpu and device availability.

Source code in src/nhssynth/modules/model/common/model.py

def setup_device(self, use_gpu: bool) -> None:
    """
    Sets up the device to use for training (CPU or GPU) depending on `use_gpu` and device availability.

    Args:
        use_gpu: Whether a GPU was requested; if it was but none is available a warning is issued and the CPU is used
    """
    # Default to the CPU and only switch when a GPU was both requested and found.
    self.device: torch.device = torch.device("cpu")
    if use_gpu:
        if torch.cuda.is_available():
            self.device = torch.device("cuda:0")
        else:
            warnings.warn("`use_gpu` was provided but no GPU is available, using CPU")