diff --git a/docs/notebooks/dmri_covariance.ipynb b/docs/notebooks/dmri_covariance.ipynb
index aeecc707..e27ceff8 100644
--- a/docs/notebooks/dmri_covariance.ipynb
+++ b/docs/notebooks/dmri_covariance.ipynb
@@ -24,7 +24,7 @@
     "import matplotlib.pyplot as plt\n",
     "import numpy as np\n",
     "\n",
-    "from eddymotion.model._sklearn import (\n",
+    "from eddymotion.model.gpr import (\n",
     "    compute_pairwise_angles,\n",
     "    exponential_covariance,\n",
     "    spherical_covariance,\n",
@@ -345,7 +345,7 @@
    }
   ],
   "source": [
-    "from eddymotion.model._sklearn import EddyMotionGPR, SphericalKriging\n",
+    "from eddymotion.model.gpr import EddyMotionGPR, SphericalKriging\n",
     "\n",
     "K = SphericalKriging(beta_a=PARAMETER_SPHERICAL_a, beta_l=PARAMETER_lambda)(X_real)\n",
     "K -= K.min()\n",
diff --git a/scripts/dwi_gp_estimation_error_analysis.py b/scripts/dwi_gp_estimation_error_analysis.py
index 0d68dba7..2591f58d 100644
--- a/scripts/dwi_gp_estimation_error_analysis.py
+++ b/scripts/dwi_gp_estimation_error_analysis.py
@@ -36,7 +36,7 @@
 import pandas as pd
 from sklearn.model_selection import KFold, RepeatedKFold, cross_val_predict, cross_val_score
 
-from eddymotion.model._sklearn import (
+from eddymotion.model.gpr import (
     EddyMotionGPR,
     SphericalKriging,
 )
@@ -63,7 +63,7 @@ def cross_validate(
         Number of folds.
     n_repeats : :obj:`int`
         Number of times the cross-validator needs to be repeated.
-    gpr : obj:`~eddymotion.model._sklearn.EddyMotionGPR`
+    gpr : obj:`~eddymotion.model.gpr.EddyMotionGPR`
         The eddymotion Gaussian process regressor object.
 
     Returns
diff --git a/scripts/dwi_gp_estimation_simulated_signal.py b/scripts/dwi_gp_estimation_simulated_signal.py
index 3b534c68..ade883f7 100644
--- a/scripts/dwi_gp_estimation_simulated_signal.py
+++ b/scripts/dwi_gp_estimation_simulated_signal.py
@@ -33,7 +33,7 @@
 import numpy as np
 from dipy.core.sphere import Sphere
 
-from eddymotion.model._sklearn import EddyMotionGPR, SphericalKriging
+from eddymotion.model.gpr import EddyMotionGPR, SphericalKriging
 from eddymotion.testing import simulations as testsims
 
 SAMPLING_DIRECTIONS = 200
diff --git a/src/eddymotion/estimator.py b/src/eddymotion/estimator.py
index 0322fc05..ada6bccb 100644
--- a/src/eddymotion/estimator.py
+++ b/src/eddymotion/estimator.py
@@ -30,7 +30,7 @@
 
 from eddymotion import utils as eutils
 from eddymotion.data.splitting import lovo_split
-from eddymotion.model import ModelFactory
+from eddymotion.model.base import ModelFactory
 from eddymotion.registration.ants import _prepare_registration_data, _run_registration
 
diff --git a/src/eddymotion/model/_dipy.py b/src/eddymotion/model/_dipy.py
index f1a1f0d9..d7eb1773 100644
--- a/src/eddymotion/model/_dipy.py
+++ b/src/eddymotion/model/_dipy.py
@@ -31,7 +31,7 @@
 from dipy.reconst.base import ReconstModel
 from sklearn.gaussian_process import GaussianProcessRegressor
 
-from eddymotion.model._sklearn import (
+from eddymotion.model.gpr import (
     EddyMotionGPR,
     ExponentialKriging,
     SphericalKriging,
diff --git a/src/eddymotion/model/_sklearn.py b/src/eddymotion/model/gpr.py
similarity index 86%
rename from src/eddymotion/model/_sklearn.py
rename to src/eddymotion/model/gpr.py
index a3b333ad..6798d4bd 100644
--- a/src/eddymotion/model/_sklearn.py
+++ b/src/eddymotion/model/gpr.py
@@ -20,70 +20,7 @@
 #
 # https://www.nipreps.org/community/licensing/
 #
-r"""
-Derivations from scikit-learn for Gaussian Processes.
-
-Gaussian Process Model: Pairwise orientation angles
----------------------------------------------------
-Squared Exponential covariance kernel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Kernel based on a squared exponential function for Gaussian processes on
-multi-shell DWI data following to eqs. 14 and 16 in [Andersson15]_.
-For a 2-shell case, the :math:`\mathbf{K}` kernel can be written as:
-
-.. math::
-    \begin{equation}
-    \mathbf{K} = \left[
-    \begin{matrix}
-        \lambda C_{\theta}(\theta (\mathbf{G}_{1}); a) + \sigma_{1}^{2} \mathbf{I} &
-        \lambda C_{\theta}(\theta (\mathbf{G}_{2}, \mathbf{G}_{1}); a) C_{b}(b_{2}, b_{1}; \ell) \\
-        \lambda C_{\theta}(\theta (\mathbf{G}_{1}, \mathbf{G}_{2}); a) C_{b}(b_{1}, b_{2}; \ell) &
-        \lambda C_{\theta}(\theta (\mathbf{G}_{2}); a) + \sigma_{2}^{2} \mathbf{I} \\
-    \end{matrix}
-    \right]
-    \end{equation}
-
-**Squared exponential shell-wise covariance kernel**:
-Compute the squared exponential smooth function describing how the
-covariance changes along the b direction.
-It uses the log of the b-values as the measure of distance along the
-b-direction according to eq. 15 in [Andersson15]_.
-
-.. math::
-    C_{b}(b, b'; \ell) = \exp\left( - \frac{(\log b - \log b')^2}{2 \ell^2} \right).
-
-**Squared exponential covariance kernel**:
-Compute the squared exponential covariance matrix following to eq. 14 in [Andersson15]_.
-
-.. math::
-    k(\textbf{x}, \textbf{x'}) = C_{\theta}(\mathbf{g}, \mathbf{g'}; a) C_{b}(|b - b'|; \ell)
-
-where :math:`C_{\theta}` is given by:
-
-.. math::
-    \begin{equation}
-    C(\theta) =
-    \begin{cases}
-        1 - \frac{3 \theta}{2 a} + \frac{\theta^3}{2 a^3} & \textnormal{if} \; \theta \leq a \\
-        0 & \textnormal{if} \; \theta > a
-    \end{cases}
-    \end{equation}
-
-:math:`\theta` being computed as:
-
-.. math::
-    \theta(\mathbf{g}, \mathbf{g'}) = \arccos(|\langle \mathbf{g}, \mathbf{g'} \rangle|)
-
-and :math:`C_{b}` is given by:
-
-.. math::
-    C_{b}(b, b'; \ell) = \exp\left( - \frac{(\log b - \log b')^2}{2 \ell^2} \right)
-
-:math:`b` and :math:`b'` being the b-values, and :math:`\mathbf{g}` and
-:math:`\mathbf{g'}` the unit diffusion-encoding gradient unit vectors of the
-shells; and :math:`{a, \ell}` some hyperparameters.
-
-"""
+"""Derivations from scikit-learn for Gaussian Processes."""
 
 from __future__ import annotations
 
@@ -101,10 +38,10 @@
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.utils._param_validation import Interval, StrOptions
 
-BOUNDS_A: tuple[float, float] = (0.1, np.pi)
-"""The limits for the parameter *a*."""
+BOUNDS_A: tuple[float, float] = (0.1, 2.35)
+"""The limits for the parameter *a* (angular distance in rad)."""
 BOUNDS_LAMBDA: tuple[float, float] = (1e-3, 1000)
-"""The limits for the parameter lambda."""
+"""The limits for the parameter λ (signal scaling factor)."""
 THETA_EPSILON: float = 1e-5
 """Minimum nonzero angle."""
 LBFGS_CONFIGURABLE_OPTIONS = {"disp", "maxiter", "ftol", "gtol"}
@@ -143,8 +80,7 @@ class EddyMotionGPR(GaussianProcessRegressor):
     In principle, Scikit-Learn's implementation normalizes the training data
     as in [Andersson15]_ (see
-    `FSL's souce code `__).
+    `FSL's souce code `__).
     From their paper (p. 167, end of first column):
 
         Typically one just substracts the mean (:math:`\bar{\mathbf{f}}`)
@@ -161,7 +97,7 @@ class EddyMotionGPR(GaussianProcessRegressor):
     I believe this is overlooked in [Andersson15]_, or they actually did not
     use analytical gradient-descent:
 
-        _A note on optimisation_
+        *A note on optimisation*
 
         It is suggested, for example in Rasmussen and Williams (2006), that an
         optimisation method that uses derivative information should be
@@ -175,6 +111,44 @@ class EddyMotionGPR(GaussianProcessRegressor):
         frequently better at avoiding local maxima. Hence, that was the method
         we used for all optimisations in the present paper.
 
+    **Multi-shell regression (TODO).**
+    For multi-shell modeling, the kernel :math:`k(\textbf{x}, \textbf{x'})`
+    is updated following Eq. (14) in [Andersson15]_.
+
+    .. math::
+        k(\textbf{x}, \textbf{x'}) = C_{\theta}(\mathbf{g}, \mathbf{g'}; a) C_{b}(|b - b'|; \ell)
+
+    and :math:`C_{b}` is based the log of the b-values ratio, a measure of distance along the
+    b-direction, according to Eq. (15) given by:
+
+    .. math::
+        C_{b}(b, b'; \ell) = \exp\left( - \frac{(\log b - \log b')^2}{2 \ell^2} \right),
+
+    :math:`b` and :math:`b'` being the b-values, and :math:`\mathbf{g}` and
+    :math:`\mathbf{g'}` the unit diffusion-encoding gradient unit vectors of the
+    shells; and :math:`{a, \ell}` some hyperparameters.
+
+    The full GP regression kernel :math:`\mathbf{K}` is then updated for a 2-shell case as
+    follows (Eq. (16) in [Andersson15]_):
+
+    .. math::
+        \begin{equation}
+        \mathbf{K} = \left[
+        \begin{matrix}
+            \lambda C_{\theta}(\theta (\mathbf{G}_{1}); a) + \sigma_{1}^{2} \mathbf{I} &
+            \lambda C_{\theta}(\theta (\mathbf{G}_{2}, \mathbf{G}_{1}); a) C_{b}(b_{2}, b_{1}; \ell) \\
+            \lambda C_{\theta}(\theta (\mathbf{G}_{1}, \mathbf{G}_{2}); a) C_{b}(b_{1}, b_{2}; \ell) &
+            \lambda C_{\theta}(\theta (\mathbf{G}_{2}); a) + \sigma_{2}^{2} \mathbf{I} \\
+        \end{matrix}
+        \right]
+        \end{equation}
+
+    References
+    ----------
+    .. [Andersson15] J. L. R. Andersson. et al., An integrated approach to
+       correction for off-resonance effects and subject movement in diffusion MR
+       imaging, NeuroImage 125 (2016) 1063-11078
     """
 
@@ -184,7 +158,7 @@ class EddyMotionGPR(GaussianProcessRegressor):
         "optimizer": [StrOptions(SUPPORTED_OPTIMIZERS), callable, None],
         "n_restarts_optimizer": [Interval(Integral, 0, None, closed="left")],
         "copy_X_train": ["boolean"],
-        "zeromean_y": ["boolean"],
+        "normalize_y": ["boolean"],
         "n_targets": [Interval(Integral, 1, None, closed="left"), None],
         "random_state": ["random_state"],
     }
@@ -497,9 +471,21 @@ def __repr__(self) -> str:
 def exponential_covariance(theta: np.ndarray, a: float) -> np.ndarray:
-    """
+    r"""
     Compute the exponential covariance for given distances and scale parameter.
 
+    Implements :math:`C_{\theta}`, following Eq. (9) in [Andersson15]_:
+
+    .. math::
+        \begin{equation}
+        C(\theta) = e^{-\theta/a} \,\, \text{for} \, 0 \leq \theta \leq \pi,
+        \end{equation}
+
+    :math:`\theta` being computed as:
+
+    .. math::
+        \theta(\mathbf{g}, \mathbf{g'}) = \arccos(|\langle \mathbf{g}, \mathbf{g'} \rangle|)
+
     Parameters
     ----------
     theta : :obj:`~numpy.ndarray`
@@ -517,9 +503,25 @@
 def spherical_covariance(theta: np.ndarray, a: float) -> np.ndarray:
-    """
+    r"""
     Compute the spherical covariance for given distances and scale parameter.
 
+    Implements :math:`C_{\theta}`, following Eq. (10) in [Andersson15]_:
+
+    .. math::
+        \begin{equation}
+        C(\theta) =
+        \begin{cases}
+            1 - \frac{3 \theta}{2 a} + \frac{\theta^3}{2 a^3} & \textnormal{if} \; \theta \leq a \\
+            0 & \textnormal{if} \; \theta > a
+        \end{cases}
+        \end{equation}
+
+    :math:`\theta` being computed as:
+
+    .. math::
+        \theta(\mathbf{g}, \mathbf{g'}) = \arccos(|\langle \mathbf{g}, \mathbf{g'} \rangle|)
+
     Parameters
     ----------
     theta : :obj:`~numpy.ndarray`
@@ -583,12 +585,6 @@ def compute_pairwise_angles(
     >>> compute_pairwise_angles(X)[0, 1]
     0.0
 
-    References
-    ----------
-    .. [Andersson15] J. L. R. Andersson. et al., An integrated approach to
-       correction for off-resonance effects and subject movement in diffusion MR
-       imaging, NeuroImage 125 (2016) 1063-11078
-
     """
     cosines = np.clip(cosine_similarity(X, Y, dense_output=dense_output), -1.0, 1.0)
diff --git a/test/test_sklearn.py b/test/test_gpr.py
similarity index 98%
rename from test/test_sklearn.py
rename to test/test_gpr.py
index 277ba0b9..efc42b26 100644
--- a/test/test_sklearn.py
+++ b/test/test_gpr.py
@@ -26,7 +26,7 @@
 import pytest
 from dipy.io import read_bvals_bvecs
 
-from eddymotion.model import _sklearn as ems
+from eddymotion.model import gpr
 
 GradientTablePatch = namedtuple("gtab", ["bvals", "bvecs"])
 
@@ -263,7 +263,7 @@ def test_compute_pairwise_angles(bvecs1, bvecs2, closest_polarity, expected):
     if bvecs2 is not None:
         _bvecs2 = (bvecs2 / np.linalg.norm(bvecs2, axis=0)).T
 
-    obtained = ems.compute_pairwise_angles(_bvecs1, _bvecs2, closest_polarity)
+    obtained = gpr.compute_pairwise_angles(_bvecs1, _bvecs2, closest_polarity)
 
     if _bvecs2 is not None:
         assert (_bvecs1.shape[0], _bvecs2.shape[0]) == obtained.shape
@@ -282,7 +282,7 @@ def test_kernel(repodata, covariance):
 
     bvecs = bvecs[bvals > 10]
 
-    KernelType = getattr(ems, f"{covariance}Kriging")
+    KernelType = getattr(gpr, f"{covariance}Kriging")
     kernel = KernelType()
     K = kernel(bvecs)
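Reviewer note: the short standalone sketch below illustrates the spherical covariance of Eq. (10) in [Andersson15]_ and the polarity-agnostic pairwise angles documented in the relocated eddymotion.model.gpr module above. It is an illustration only, not part of the patch; the helper names, the value of *a*, and the toy gradient directions are assumptions made here for the example, while the package's own implementations are gpr.spherical_covariance, gpr.compute_pairwise_angles, and gpr.SphericalKriging.

# Illustration only -- a minimal NumPy sketch, not the patched implementation.
import numpy as np


def spherical_covariance(theta: np.ndarray, a: float) -> np.ndarray:
    """Eq. (10): 1 - 3*theta/(2a) + theta**3/(2a**3) where theta <= a, else 0."""
    theta = np.asarray(theta, dtype=float)
    return np.where(theta <= a, 1.0 - 1.5 * theta / a + 0.5 * theta**3 / a**3, 0.0)


def pairwise_angles(bvecs: np.ndarray) -> np.ndarray:
    """theta(g, g') = arccos(|<g, g'>|) for unit row vectors, ignoring gradient polarity."""
    cosines = np.clip(np.abs(bvecs @ bvecs.T), 0.0, 1.0)
    return np.arccos(cosines)


# Toy example with three unit diffusion-encoding directions (illustrative values).
bvecs = np.array([
    [1.0, 0.0, 0.0],
    [0.0, 1.0, 0.0],
    [np.sqrt(0.5), np.sqrt(0.5), 0.0],
])
theta = pairwise_angles(bvecs)
K = spherical_covariance(theta, a=1.23)  # a: angular length-scale, chosen within BOUNDS_A
print(np.round(K, 3))  # unit diagonal; zero where the angle exceeds a

Scaling K by λ and adding a noise term σ² on the diagonal, as in the single-shell block of Eq. (16), gives the kernel that SphericalKriging and EddyMotionGPR assemble in the renamed module.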