SeldonIO · ascillitoe · Jan 24, 2023 · Sep 2, 2022 · Sep 7, 2022 · Nov 3, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,7 @@
 See the [documentation](https://docs.seldon.io/projects/alibi-detect/en/latest/cd/methods/mmddrift.html) and [example notebook](https://docs.seldon.io/projects/alibi-detect/en/latest/examples/cd_mmd_keops.html) for more info ([#548](https://github.com/SeldonIO/alibi-detect/pull/548)).
 - **New feature** Added support for serializing detectors with PyTorch backends, and detectors containing PyTorch models in their proprocessing functions ([#656](https://github.com/SeldonIO/alibi-detect/pull/656)).
 - **New feature** Added support for serializing detectors with KeOps backends ([#681](https://github.com/SeldonIO/alibi-detect/pull/681)).
+- **New feature** Added support for saving and loading online detectors' state. This allows a detector to be restarted from previously generated checkpoints ([#604](https://github.com/SeldonIO/alibi-detect/pull/604)).
 - **New feature** Added a PyTorch version of the `UAE` preprocessing utility function ([#656](https://github.com/SeldonIO/alibi-detect/pull/656)).
 - If a `categories_per_feature` dictionary is not passed to `TabularDrift`, a warning is now raised to inform the user that all features are assumed to be numerical ([#606](https://github.com/SeldonIO/alibi-detect/pull/606)).
 - For the `ClassifierDrift` and `SpotTheDiffDrift` detectors, we can also return the out-of-fold instances of the reference and test sets. When using `train_size` for training the detector, this allows to associate the returned prediction probabilities with the correct instances.

diff --git a/alibi_detect/base.py b/alibi_detect/base.py
@@ -216,11 +216,7 @@ def predict(self) -> Any: ...
 class ConfigurableDetector(Detector, Protocol):
     """Type Protocol for detectors that have support for saving via config.
 
-    Used for typing save and load functionality in `alibi_detect.saving.saving.py`.
-
-    Note:
-        This exists to distinguish between detectors with and without support for config saving and loading. Once all
-        detector support this then this protocol will be removed.
+    Used for typing save and load functionality in `alibi_detect.saving.saving`.
     """
     def get_config(self): ...
 
@@ -229,6 +225,17 @@ def from_config(self): ...
     def _set_config(self): ...
 
 
+@runtime_checkable
+class StatefulDetectorOnline(ConfigurableDetector, Protocol):
+    """Type Protocol for detectors that have support for saving/loading of online state.
+
+    Used for typing save and load functionality in `alibi_detect.saving.saving`.
+    """
+    def save_state(self, filepath): ...
+
+    def load_state(self, filepath): ...
+
+
 class NumpyEncoder(json.JSONEncoder):
     def default(self, obj):
         if isinstance(

diff --git a/alibi_detect/cd/base_online.py b/alibi_detect/cd/base_online.py
@@ -1,11 +1,16 @@
+import os
+from pathlib import Path
 import logging
+import warnings
 from abc import abstractmethod
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Union, Tuple
 
 import numpy as np
 from alibi_detect.base import BaseDetector, concept_drift_dict
 from alibi_detect.cd.utils import get_input_shape
-from alibi_detect.utils.frameworks import has_pytorch, has_tensorflow
+from alibi_detect.utils.frameworks import Framework, has_pytorch, has_tensorflow
+from alibi_detect.utils._state import save_state_dict, load_state_dict
+from alibi_detect.utils._types import Literal
 
 if has_pytorch:
     import torch
@@ -18,6 +23,8 @@
 
 class BaseMultiDriftOnline(BaseDetector):
     thresholds: np.ndarray
+    backend: Literal['pytorch', 'tensorflow']
+    online_state_keys: Tuple[str, ...]
 
     def __init__(
             self,
@@ -106,6 +113,66 @@ def _configure_ref_subset(self):
     def _update_state(self, x_t: Union[np.ndarray, 'tf.Tensor', 'torch.Tensor']):
         pass
 
+    def _set_state_dir(self, dirpath: Union[str, os.PathLike]):
+        """
+        Set the directory path to store state in, and create an empty directory if it doesn't already exist.
+
+        Parameters
+        ----------
+        dirpath
+            The directory in which to save the state file.
+        """
+        self.state_dir = Path(dirpath)
+        self.state_dir.mkdir(parents=True, exist_ok=True)
+
+    def save_state(self, filepath: Union[str, os.PathLike]):
+        """
+        Save a detector's state to disk in order to generate a checkpoint.
+
+        Parameters
+        ----------
+        filepath
+            The directory to save state to.
+        """
+        self._set_state_dir(filepath)
+        self._save_state()
+
+    def load_state(self, filepath: Union[str, os.PathLike]):
+        """
+        Load the detector's state from disk, in order to restart from a checkpoint previously generated with
+        `save_state`.
+
+        Parameters
+        ----------
+        filepath
+            The directory to load state from.
+        """
+        self._set_state_dir(filepath)
+        self._load_state()
+        logger.info('State loaded for t={} from {}'.format(self.t, self.state_dir))
+
+    def _save_state(self):
+        """
+        Private method to save a detector's state to disk.
+
+        TODO - Method slightly verbose as designed to facilitate saving of "offline" state in follow-up PR.
+        """
+        suffix = '.pt' if self.backend == Framework.PYTORCH else '.npz'
+        filename = 'state'
+        keys = self.online_state_keys
+        save_state_dict(self, keys, self.state_dir.joinpath(filename + suffix))
+        logger.info('Saved state for t={} to {}'.format(self.t, self.state_dir))
+
+    def _load_state(self, offline: bool = False):
+        """
+        Private method to load a detector's state from disk.
+
+        TODO - Method slightly verbose as designed to facilitate loading of "offline" state in follow-up PR.
+        """
+        suffix = '.pt' if self.backend == Framework.PYTORCH else '.npz'
+        filename = 'state'
+        load_state_dict(self, self.state_dir.joinpath(filename + suffix), raise_error=True)
+
     def _preprocess_xt(self, x_t: Union[np.ndarray, Any]) -> np.ndarray:
         """
         Private method to preprocess a single test instance ready for _update_state.
@@ -126,17 +193,46 @@ def _preprocess_xt(self, x_t: Union[np.ndarray, Any]) -> np.ndarray:
         return x_t[None, :]
 
     def get_threshold(self, t: int) -> float:
+        """
+        Return the threshold for timestep `t`.
+
+        Parameters
+        ----------
+        t
+            The timestep to return a threshold for.
+
+        Returns
+        -------
+        The threshold at timestep `t`.
+        """
         return self.thresholds[t] if t < self.window_size else self.thresholds[-1]
 
-    def _initialise(self) -> None:
+    def _initialise_state(self) -> None:
+        """
+        Initialise online state (the stateful attributes updated by `score` and `predict`).
+
+        If a subclassed detector has additional online state, an additional `_initialise_state` should be defined,
+        with a call to `super()._initialise_state()` included (see `LSDDDriftOnlineTorch._initialise_state()` for
+        an example).
+        """
         self.t = 0  # corresponds to a test set of ref data
         self.test_stats = np.array([])  # type: ignore[var-annotated]
         self.drift_preds = np.array([])  # type: ignore[var-annotated]
-        self._configure_ref_subset()
 
     def reset(self) -> None:
-        "Resets the detector but does not reconfigure thresholds."
-        self._initialise()
+        """
+        Deprecated reset method. This method will be repurposed or removed in the future. To reset the detector to
+        its initial state (`t=0`) use :meth:`reset_state`.
+        """
+        self.reset_state()
+        warnings.warn('This method is deprecated and will be removed/repurposed in the future. To reset the detector '
+                      'to its initial state use `reset_state`.', DeprecationWarning)
+
+    def reset_state(self) -> None:
+        """
+        Resets the detector to its initial state (`t=0`). This does not include reconfiguring thresholds.
+        """
+        self._initialise_state()
 
     def predict(self, x_t: Union[np.ndarray, Any], return_test_stat: bool = True,
                 ) -> Dict[Dict[str, str], Dict[str, Union[int, float]]]:
@@ -179,6 +275,7 @@ def predict(self, x_t: Union[np.ndarray, Any], return_test_stat: bool = True,
 
 class BaseUniDriftOnline(BaseDetector):
     thresholds: np.ndarray
+    online_state_keys: Tuple[str, ...]
 
     def __init__(
             self,
@@ -290,7 +387,79 @@ def _configure_ref(self):
     def _update_state(self, x_t: np.ndarray):
         pass
 
+    def _set_state_dir(self, dirpath: Union[str, os.PathLike]):
+        """
+        Set the directory path to store state in, and create an empty directory if it doesn't already exist.
+
+        Parameters
+        ----------
+        dirpath
+            The directory in which to save the state file.
+        """
+        self.state_dir = Path(dirpath)
+        self.state_dir.mkdir(parents=True, exist_ok=True)
+
+    def save_state(self, filepath: Union[str, os.PathLike]):
+        """
+        Save a detector's state to disk in order to generate a checkpoint.
+
+        Parameters
+        ----------
+        filepath
+            The directory to save state to.
+        """
+        self._set_state_dir(filepath)
+        self._save_state()
+        logger.info('Saved state for t={} to {}'.format(self.t, self.state_dir))
+
+    def load_state(self, filepath: Union[str, os.PathLike]):
+        """
+        Load the detector's state from disk, in order to restart from a checkpoint previously generated with
+        `save_state`.
+
+        Parameters
+        ----------
+        filepath
+            The directory to load state from.
+        """
+        self._set_state_dir(filepath)
+        self._load_state()
+        logger.info('State loaded for t={} from {}'.format(self.t, self.state_dir))
+
+    def _save_state(self):
+        """
+        Private method to save a detector's state to disk.
+
+        TODO - Method slightly verbose as designed to facilitate saving of "offline" state in follow-up PR.
+        """
+        filename = 'state'
+        keys = self.online_state_keys
+        save_state_dict(self, keys, self.state_dir.joinpath(filename + '.npz'))
+
+    def _load_state(self, offline: bool = False):
+        """
+        Private method to load a detector's state from disk.
+
+        TODO - Method slightly verbose as designed to facilitate loading of "offline" state in follow-up PR.
+        """
+        filename = 'state'
+        load_state_dict(self, self.state_dir.joinpath(filename + '.npz'), raise_error=True)
+
     def _check_x(self, x: Any, x_ref: bool = False) -> np.ndarray:
+        """
+        Check the type and shape of the data `x`, and coerce it to the correct shape if possible.
+
+        Parameters
+        ----------
+        x
+            The data to be checked.
+        x_ref
+            Whether `x` is a batch of reference data instances (if `True`), or a single test data instance (if `False`).
+
+        Returns
+        -------
+        The checked data, coerced to be a np.ndarray of the correct shape.
+        """
         # Check the type of x
         if isinstance(x, np.ndarray):
             pass
@@ -333,21 +502,51 @@ def _preprocess_xt(self, x_t: Union[np.ndarray, Any]) -> np.ndarray:
         return x_t
 
     def get_threshold(self, t: int) -> np.ndarray:
+        """
+        Return the threshold for timestep `t`.
+
+        Parameters
+        ----------
+        t
+            The timestep to return a threshold for.
+
+        Returns
+        -------
+        The threshold at timestep `t`.
+        """
         return self.thresholds[t] if t < len(self.thresholds) else self.thresholds[-1]
 
-    def _initialise(self) -> None:
+    def _initialise_state(self) -> None:
+        """
+        Initialise online state (the stateful attributes updated by `score` and `predict`).
+
+        If a subclassed detector has additional online state, an additional `_initialise_state` should be defined,
+        with a call to `super()._initialise_state()` included (see `CVMDriftOnlineTorch._initialise_state()` for
+        an example).
+        """
         self.t = 0
+        self.xs = np.array([])
         self.test_stats = np.empty([0, len(self.window_sizes), self.n_features])
         self.drift_preds = np.array([])  # type: ignore[var-annotated]
-        self._configure_ref()
 
     @abstractmethod
     def _check_drift(self, test_stats: np.ndarray, thresholds: np.ndarray) -> int:
         pass
 
     def reset(self) -> None:
-        "Resets the detector but does not reconfigure thresholds."
-        self._initialise()
+        """
+        Deprecated reset method. This method will be repurposed or removed in the future. To reset the detector to
+        its initial state (`t=0`) use :meth:`reset_state`.
+        """
+        self.reset_state()
+        warnings.warn('This method is deprecated and will be removed/repurposed in the future. To reset the detector '
+                      'to its initial state use `reset_state`.', DeprecationWarning)
+
+    def reset_state(self) -> None:
+        """
+        Resets the detector to its initial state (`t=0`). This does not include reconfiguring thresholds.
+        """
+        self._initialise_state()
 
     def predict(self, x_t: Union[np.ndarray, Any], return_test_stat: bool = True,
                 ) -> Dict[Dict[str, str], Dict[str, Union[int, float]]]:

diff --git a/alibi_detect/cd/cvm_online.py b/alibi_detect/cd/cvm_online.py
@@ -9,6 +9,8 @@
 
 
 class CVMDriftOnline(BaseUniDriftOnline, DriftConfigMixin):
+    online_state_keys = ('t', 'test_stats', 'drift_preds', 'xs', 'ids_ref_wins', 'ids_wins_ref', 'ids_wins_wins')
+
     def __init__(
             self,
             x_ref: Union[np.ndarray, list],
@@ -92,10 +94,14 @@ def __init__(
         self.batch_size = n_bootstraps if batch_size is None else batch_size
 
         # Configure thresholds and initialise detector
-        self._initialise()
+        self._initialise_state()
         self._configure_thresholds()
+        self._configure_ref()
 
     def _configure_ref(self) -> None:
+        """
+        Configure the reference data.
+        """
         ids_ref_ref = self.x_ref[None, :, :] >= self.x_ref[:, None, :]
         self.ref_cdf_ref = np.sum(ids_ref_ref, axis=0) / self.n
 
@@ -162,6 +168,14 @@ def _simulate_streams(self, t_max: int) -> np.ndarray:
         return stats
 
     def _update_state(self, x_t: np.ndarray):
+        """
+        Update online state based on the provided test instance.
+
+        Parameters
+        ----------
+        x_t
+            The test instance.
+        """
         self.t += 1
         if self.t == 1:
             # Initialise stream
@@ -186,6 +200,15 @@ def _update_state(self, x_t: np.ndarray):
                 [self.ids_wins_wins, (x_t <= self.xs[-self.max_ws:, :])[None, :, :]], 0
             )
 
+    def _initialise_state(self) -> None:
+        """
+        Initialise online state (the stateful attributes updated by `score` and `predict`).
+        """
+        super()._initialise_state()
+        self.ids_ref_wins = np.array([])
+        self.ids_wins_ref = np.array([])
+        self.ids_wins_wins = np.array([])
+
     def score(self, x_t: Union[np.ndarray, Any]) -> np.ndarray:
         """
         Compute the test-statistic (CVM) between the reference window(s) and test window.