diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 6fc32ce34e2..22a9f457ab4 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -77,7 +77,8 @@ default:
      CC: 'gcc-9'
      CXX: 'g++-9'
   script:
-    - export with_cuda=false myconfig=default with_coverage=true with_scafacos=true
+    - export with_cuda=false myconfig=default with_coverage=true
+    - export with_scafacos=true with_stokesian_dynamics=true
     - bash maintainer/CI/build_cmake.sh
   tags:
     - docker
@@ -90,7 +91,8 @@ maxset:
      CC: 'gcc-9'
      CXX: 'g++-9'
   script:
-    - export with_cuda=false myconfig=maxset with_coverage=true with_scafacos=true
+    - export with_cuda=false myconfig=maxset with_coverage=true
+    - export with_scafacos=true with_stokesian_dynamics=true
     - bash maintainer/CI/build_cmake.sh
   tags:
     - docker
@@ -127,7 +129,7 @@ debian:10:
   stage: build
   image: docker.pkg.github.com/espressomd/docker/debian:446ff604bbfa63f30ddb462697fa0d0dc2630460
   script:
-    - export with_cuda=false myconfig=maxset make_check_python=false
+    - export with_cuda=false myconfig=maxset make_check_python=false with_stokesian_dynamics=true
     - bash maintainer/CI/build_cmake.sh
   tags:
     - docker
@@ -138,7 +140,7 @@ opensuse:15.1:
   stage: build
   image: docker.pkg.github.com/espressomd/docker/opensuse:446ff604bbfa63f30ddb462697fa0d0dc2630460
   script:
-    - export with_cuda=false myconfig=maxset make_check_python=false
+    - export with_cuda=false myconfig=maxset make_check_python=false with_stokesian_dynamics=true
     - bash maintainer/CI/build_cmake.sh
   tags:
     - docker
@@ -149,7 +151,7 @@ centos:7:
   stage: build
   image: docker.pkg.github.com/espressomd/docker/centos:446ff604bbfa63f30ddb462697fa0d0dc2630460
   script:
-    - export with_cuda=false myconfig=maxset make_check_python=true
+    - export with_cuda=false myconfig=maxset make_check_python=true with_stokesian_dynamics=true
     - bash maintainer/CI/build_cmake.sh
   tags:
     - docker
@@ -160,7 +162,7 @@ fedora:32:
   stage: build
   image: docker.pkg.github.com/espressomd/docker/fedora:fc7628d32de0fce605976ba9edebe7eff186e618
   script:
-    - export with_cuda=false myconfig=maxset make_check_python=false
+    - export with_cuda=false myconfig=maxset make_check_python=false with_stokesian_dynamics=true
     - bash maintainer/CI/build_cmake.sh
   tags:
     - docker
@@ -187,8 +189,9 @@ clang-sanitizer:
      CC: 'clang-9'
      CXX: 'clang++-9'
   script:
-    - export myconfig=maxset with_cuda=true with_cuda_compiler=clang with_coverage=false with_scafacos=true
+    - export myconfig=maxset with_cuda=true with_cuda_compiler=clang with_coverage=false
     - export with_static_analysis=true test_timeout=900 with_asan=true with_ubsan=true
+    - export with_scafacos=true with_stokesian_dynamics=true
     - bash maintainer/CI/build_cmake.sh
   timeout: 2h
   tags:
@@ -203,7 +206,8 @@ cuda10-maxset:
      CC: 'gcc-8'
      CXX: 'g++-8'
   script:
-    - export myconfig=maxset with_cuda=true with_coverage=false with_scafacos=true test_timeout=900 srcdir=${CI_PROJECT_DIR}
+    - export myconfig=maxset with_cuda=true with_coverage=false test_timeout=900 srcdir=${CI_PROJECT_DIR}
+    - export with_scafacos=true with_stokesian_dynamics=true
     - bash maintainer/CI/build_cmake.sh
   artifacts:
     paths:
@@ -223,7 +227,8 @@ cuda9-maxset:
      CXX: 'g++-6'
      GCOV: 'gcov-6'
   script:
-    - export myconfig=maxset with_cuda=true with_coverage=true with_scafacos=true test_timeout=900 srcdir=${CI_PROJECT_DIR}
+    - export myconfig=maxset with_cuda=true with_coverage=true test_timeout=900 srcdir=${CI_PROJECT_DIR}
+    - export with_scafacos=true with_stokesian_dynamics=true
     - bash maintainer/CI/build_cmake.sh
   artifacts:
     paths:
@@ -307,7 +312,8 @@ installation:
      CC: 'gcc-8'
      CXX: 'g++-8'
   script:
-    - export myconfig=maxset with_cuda=true with_coverage=false with_scafacos=true make_check_unit_tests=false make_check_python=false
+    - export myconfig=maxset with_cuda=true with_coverage=false make_check_unit_tests=false make_check_python=false
+    - export with_scafacos=true with_stokesian_dynamics=true
     - export srcdir=${CI_PROJECT_DIR} build_type=Release
     - bash maintainer/CI/build_cmake.sh
     - cd build
@@ -335,7 +341,8 @@ empty:
      CC: 'clang-9'
      CXX: 'clang++-9'
   script:
-    - export myconfig=empty with_cuda=true with_cuda_compiler=clang with_static_analysis=true with_scafacos=false
+    - export myconfig=empty with_cuda=true with_cuda_compiler=clang with_static_analysis=true
+    - export with_scafacos=false with_stokesian_dynamics=false
     - bash maintainer/CI/build_cmake.sh
   tags:
     - docker
@@ -347,9 +354,10 @@ empty:
 rocm-maxset:
   <<: *global_job_definition
   stage: build
-  image: docker.pkg.github.com/espressomd/docker/rocm:06b6216c7aa3555bcf28c90734dbb84e7285c96f
+  image: docker.pkg.github.com/espressomd/docker/rocm:fd7ed514a00d38bf336299ddb34ffcdc8a42e906
   script:
     - export myconfig=maxset with_cuda=true with_cuda_compiler=hip
+    - export with_stokesian_dynamics=false
     - bash maintainer/CI/build_cmake.sh
   tags:
     - amdgpu
@@ -360,6 +368,7 @@ rocm:latest:
   image: docker.pkg.github.com/espressomd/docker/rocm:latest_base
   script:
     - export myconfig=maxset with_cuda=true with_cuda_compiler=hip
+    - export with_stokesian_dynamics=false
     - bash maintainer/CI/build_cmake.sh
   tags:
     - amdgpu
@@ -385,6 +394,7 @@ intel:19:
   script:
     - export myconfig=maxset with_cuda=true with_coverage=false I_MPI_SHM_LMT=shm
     - export cmake_params="-DCMAKE_CXX_FLAGS=-O2"
+    - export with_stokesian_dynamics=true
     - bash maintainer/CI/build_cmake.sh
   tags:
     - docker
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fde9eaf1344..e60c00f0a78 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -74,6 +74,7 @@ option(WITH_CUDA "Build with GPU support" OFF)
 option_if_available(WITH_HDF5 "Build with HDF5 support" ON)
 option(WITH_TESTS "Enable tests" ON)
 option_if_available(WITH_SCAFACOS "Build with ScaFaCoS support" OFF)
+option_if_available(WITH_STOKESIAN_DYNAMICS "Build with Stokesian Dynamics" ON)
 option(WITH_BENCHMARKS "Enable benchmarks" OFF)
 option(WITH_VALGRIND_INSTRUMENTATION
        "Build with valgrind instrumentation markers" OFF)
@@ -253,6 +254,45 @@ if(WITH_GSL)
   endif(GSL_FOUND)
 endif(WITH_GSL)
 
+find_package(BLAS)
+if(BLAS_FOUND)
+  set(BLAS 1)
+endif()
+find_package(LAPACK)
+if(LAPACK_FOUND)
+  set(LAPACK 1)
+endif()
+
+if(WITH_STOKESIAN_DYNAMICS)
+  if(BLAS AND LAPACK)
+    set(STOKESIAN_DYNAMICS 1)
+  endif()
+  if(CUDA)
+    set(STOKESIAN_DYNAMICS_GPU 1)
+    if(HIP)
+      find_gpu_library(VARNAME CUDA_CUBLAS_LIBRARIES NAMES rocblas)
+      find_gpu_library(VARNAME CUDA_cusolver_LIBRARY NAMES rocsolver)
+    else()
+      if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 10.0)
+        # when importing package cublas, CMake versions older than 3.12.2 will
+        # look for a library cublas_device that no longer exists in CUDA 10
+        cmake_minimum_required(VERSION 3.12.2)
+      endif()
+      find_gpu_library(VARNAME CUDA_CUBLAS_LIBRARIES NAMES cublas)
+      find_gpu_library(VARNAME CUDA_cusolver_LIBRARY NAMES cusolver)
+    endif()
+  endif(CUDA)
+  if(NOT (STOKESIAN_DYNAMICS
+          OR (STOKESIAN_DYNAMICS_GPU AND CUDA_CUBLAS_LIBRARIES
+              AND CUDA_cusolver_LIBRARY))
+     AND NOT WITH_STOKESIAN_DYNAMICS_IS_DEFAULT_VALUE)
+    message(
+      FATAL_ERROR
+        "Optional feature Stokesian Dynamics explicitly requested, but dependencies not found."
+    )
+  endif()
+endif(WITH_STOKESIAN_DYNAMICS)
+
 if(WITH_VALGRIND_INSTRUMENTATION)
   find_package(PkgConfig)
   pkg_check_modules(VALGRIND valgrind)
diff --git a/cmake/FindCUDACompilerHIP.cmake b/cmake/FindCUDACompilerHIP.cmake
index 34278c17e65..3b38bb29da7 100644
--- a/cmake/FindCUDACompilerHIP.cmake
+++ b/cmake/FindCUDACompilerHIP.cmake
@@ -27,15 +27,18 @@ find_package(HIP ${CUDACompilerHIP_FIND_VERSION} MODULE REQUIRED)
 # patch HCC_PATH environment variable and reload HIP
 if(HIP_VERSION VERSION_LESS 3.1)
   set(HCC_PATH "${HIP_ROOT_DIR}")
-else()
+elseif(HIP_VERSION VERSION_LESS 3.5)
   set(HCC_PATH "${ROCM_HOME}/hcc")
+else()
+  set(HIP_HIPCC_CMAKE_LINKER_HELPER "${HIP_HIPCC_EXECUTABLE}")
+  unset(HCC_PATH)
 endif()
 find_package(HIP ${CUDACompilerHIP_FIND_VERSION} MODULE REQUIRED)
 
 set(CUDA 1)
 set(HIP 1)
 
-list(APPEND HIP_HCC_FLAGS
+list(APPEND HIP_HIPCC_FLAGS
        -std=c++${CMAKE_CUDA_STANDARD} -pedantic -Wall -Wextra
        -Wno-sign-compare -Wno-unused-function -Wno-unused-variable
        -Wno-unused-parameter -Wno-missing-braces -Wno-gnu-anonymous-struct
diff --git a/cmake/cmake_config.cmakein b/cmake/cmake_config.cmakein
index 9389e251138..1bc7dcd59e2 100644
--- a/cmake/cmake_config.cmakein
+++ b/cmake/cmake_config.cmakein
@@ -13,6 +13,14 @@
 
 #cmakedefine GSL
 
+#cmakedefine BLAS
+
+#cmakedefine LAPACK
+
+#cmakedefine STOKESIAN_DYNAMICS
+
+#cmakedefine STOKESIAN_DYNAMICS_GPU
+
 #cmakedefine VALGRIND_INSTRUMENTATION
 
 #define PACKAGE_NAME "${PROJECT_NAME}"
diff --git a/doc/doxygen/Doxyfile.in b/doc/doxygen/Doxyfile.in
index 813762115cb..ef637dcb38f 100644
--- a/doc/doxygen/Doxyfile.in
+++ b/doc/doxygen/Doxyfile.in
@@ -790,6 +790,7 @@ WARN_LOGFILE           =
 # Note: If this tag is empty the current directory is searched.
 
 INPUT                  = "@CMAKE_SOURCE_DIR@/src" \
+                         "@CMAKE_SOURCE_DIR@/libs/stokesian_dynamics" \
                          "@CMAKE_SOURCE_DIR@/doc/doxygen/main.dox"
 
 # This tag can be used to specify the character encoding of the source files
diff --git a/doc/doxygen/bibliography.bib b/doc/doxygen/bibliography.bib
index 967580da4d2..6df6aa756c6 100644
--- a/doc/doxygen/bibliography.bib
+++ b/doc/doxygen/bibliography.bib
@@ -22,6 +22,16 @@ @Article{ahlrichs99a
   doi                      = {10.1063/1.480156},
 }
 
+@book{allen2017,
+  title={{Computer simulation of liquids}},
+  author={Allen, Michael P and Tildesley, Dominic J},
+  year={2017},
+  publisher={Oxford University Press},
+  doi={10.1093/oso/9780198803195.001.0001},
+  isbn={9780198803195},
+  edition={2nd},
+}
+
 @Article{andersen83a,
   title                    = {{Rattle: A "velocity" version of the shake algorithm for molecular dynamics calculations}},
   author                   = {Andersen, Hans C.},
@@ -55,6 +65,27 @@ @Article{arnold13c
   issn                     = {1099-4300},
 }
 
+@Article{banchio03a,
+  author  = {Adolfo J. Banchio and John F. Brady},
+  title   = {{Accelerated Stokesian dynamics: Brownian motion}},
+  journal = {Journal of Chemical Physics},
+  year    = {2003},
+  volume  = {118},
+  number  = {22},
+  pages   = {10323},
+  doi     = {10.1063/1.1571819},
+}
+
+@Article{brady88a,
+  author  = {J. F. Brady and G. Bossis},
+  title   = {{Stokesian dynamics}},
+  journal = {Annual Review of Fluid Mechanics},
+  year    = {1988},
+  volume  = {20},
+  pages   = {111-157},
+  doi     = {10.1146/annurev.fl.20.010188.000551},
+}
+
 @Article{brodka04a,
   author    = {Br\'{o}dka, A.},
   title     = {{Ewald summation method with electrostatic layer correction for interactions of point dipoles in slab geometry}},
@@ -91,6 +122,16 @@ @Article{cerda08d
   doi           = {10.1063/1.3000389},
 }
 
+@Article{cortez15a,
+  author  = {Cortez, Ricardo and Varela, Douglas},
+  title   = {{A general system of images for regularized Stokeslets and other elements near a plane wall}},
+  journal = {Journal of Computational Physics},
+  year    = {2015},
+  volume  = {285},
+  pages   = {41-54},
+  doi     = {10.1016/j.jcp.2015.01.019},
+}
+
 @PhdThesis{deserno00b,
   author        = {Deserno, Markus},
   title         = {Counterion condensation for rigid linear polyelectrolytes},
@@ -192,6 +233,16 @@ @Article{dupin08a
 doi = {10.1080/10618560802238242},
 }
 
+@Article{durlofsky87a,
+  author  = {L. Durlofsky and J. F. Brady and G. Bossis},
+  title   = {{Dynamic simulation of hydrodynamically interacting particles}},
+  journal = {Journal of Fluid Mechanics},
+  year    = {1987},
+  volume  = {180},
+  pages   = {21-49},
+  doi     = {10.1017/S002211208700171X},
+}
+
 @Article{essmann95a,
   title                    = {{A smooth Particle Mesh Ewald method}},
   author                   = {Essmann, U. and Perera, L. and Berkowitz, M. L. and Darden, T. and Lee, H. and Pedersen, L.},
@@ -265,6 +316,27 @@ @Book{hockney88a
   url       = {https://www.crcpress.com/Computer-Simulation-Using-Particles/Hockney-Eastwood/p/book/9780852743928},
 }
 
+@Article{jancigova16a,
+author = {Jan\v{c}igov\'{a}, Iveta and Cimr\'{a}k, Ivan},
+title = {{Non-uniform force allocation for area preservation in spring network models}},
+journal = {International Journal for Numerical Methods in Biomedical Engineering},
+volume = {32},
+number = {10},
+pages = {e02757},
+year = {2016},
+doi = {10.1002/cnm.2757},
+}
+
+@Article{jeffrey84a,
+  author  = {D. J. Jeffrey and Y. Onishi},
+  title   = {{Calculation of the resistance and mobility functions for two unequal rigid spheres in low-Reynolds-number flow}},
+  journal = {Journal of Fluid Mechanics},
+  year    = {1984},
+  volume  = {139},
+  pages   = {261-290},
+  doi     = {10.1017/S0022112084000355},
+}
+
 @Article{kolafa92a,
   title                    = {{Cutoff errors in the Ewald summation formulae for point charge systems}},
   author                   = {Kolafa, Jiri and Perram, John W.},
@@ -286,17 +358,6 @@ @Book{kruger12a
 doi={10.1007/978-3-8348-2376-2},
 }
 
-@Article{jancigova16a,
-author = {Jan\v{c}igov\'{a}, Iveta and Cimr\'{a}k, Ivan},
-title = {{Non-uniform force allocation for area preservation in spring network models}},
-journal = {International Journal for Numerical Methods in Biomedical Engineering},
-volume = {32},
-number = {10},
-pages = {e02757},
-year = {2016},
-doi = {10.1002/cnm.2757},
-}
-
 @Article{ladd01a,
   title                    = {{Lattice-Boltzmann simulations of particle-fluid suspensions}},
   author                   = {Ladd, A. J. C. and Verberg, R.},
@@ -329,6 +390,17 @@ @Article{marsili10a
 year        = {2010},
 }
 
+@article{martys99a,
+  title = {{Velocity Verlet algorithm for dissipative-particle-dynamics-based models of suspensions}},
+  author = {Martys, Nicos S. and Mountain, Raymond D.},
+  journal = {Physical Review E},
+  volume = {59},
+  number = {3},
+  pages = {3733--3736},
+  year = {1999},
+  doi = {10.1103/PhysRevE.59.3733},
+}
+
 @Article{moss96a,
 author={Moss, G. P.},
 journal={Pure and Applied Chemistry},
@@ -351,6 +423,17 @@ @Article{neumann85b
 doi = {10.1063/1.448553},
 }
 
+@article{omelyan98a,
+author = {Omelyan, Igor P.},
+title = {{On the numerical integration of motion for rigid polyatomics: The modified quaternion approach}},
+journal = {Computers in Physics},
+volume = {12},
+number = {1},
+pages = {97-103},
+year = {1998},
+doi = {10.1063/1.168642},
+}
+
 @article{panneton06a,
 author = {Panneton, Fran\c{c}ois and L'Ecuyer, Pierre and Matsumoto, Makoto},
 title = {Improved long-period generators based on linear recurrences modulo 2},
@@ -442,38 +525,6 @@ @Article{sonnenschein85a
 doi = {10.1016/0021-9991(85)90151-2},
 }
 
-@book{allen2017,
-  title={{Computer simulation of liquids}},
-  author={Allen, Michael P and Tildesley, Dominic J},
-  year={2017},
-  publisher={Oxford University Press},
-  doi={10.1093/oso/9780198803195.001.0001},
-  isbn={9780198803195},
-  edition={2nd},
-}
-
-@article{martys99a,
-  title = {{Velocity Verlet algorithm for dissipative-particle-dynamics-based models of suspensions}},
-  author = {Martys, Nicos S. and Mountain, Raymond D.},
-  journal = {Physical Review E},
-  volume = {59},
-  number = {3},
-  pages = {3733--3736},
-  year = {1999},
-  doi = {10.1103/PhysRevE.59.3733},
-}
-
-@article{omelyan98a,
-author = {Omelyan, Igor P.},
-title = {{On the numerical integration of motion for rigid polyatomics: The modified quaternion approach}},
-journal = {Computers in Physics},
-volume = {12},
-number = {1},
-pages = {97-103},
-year = {1998},
-doi = {10.1063/1.168642},
-}
-
 @Article{swope92a,
 author = {Swope, William C. and Ferguson, David M.},
 title = {{Alternative expressions for energies and forces due to angle bending and torsional energy}},
diff --git a/doc/sphinx/installation.rst b/doc/sphinx/installation.rst
index f58459b419e..06e5c084e18 100644
--- a/doc/sphinx/installation.rst
+++ b/doc/sphinx/installation.rst
@@ -485,9 +485,15 @@ Finally, there is a flag for debugging:
 Features marked as experimental
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Some of the above features are marked as EXPERIMENTAL. Activating these features can have unexpected side effects and some of them have known issues. If you activate any of these features, you should understand the corresponding source code and do extensive testing. Furthermore, it is necessary to define ``EXPERIMENTAL_FEATURES`` in :file:`myconfig.hpp`.
+Some of the above features are marked as EXPERIMENTAL. Activating these
+features can have unexpected side effects and some of them have known issues.
+If you activate any of these features, you should understand the corresponding
+source code and do extensive testing. Furthermore, it is necessary to define
+``EXPERIMENTAL_FEATURES`` in :file:`myconfig.hpp`.
 
 
+.. _External features:
+
 External features
 ^^^^^^^^^^^^^^^^^
 
@@ -508,6 +514,14 @@ using a CMake flag (see :ref:`Options and Variables`).
 - ``GSL`` Enables features relying on the GNU Scientific Library, e.g.
   :meth:`espressomd.cluster_analysis.Cluster.fractal_dimension`.
 
+- ``STOKESIAN_DYNAMICS`` Enables the Stokesian Dynamics feature for CPU
+  (see :ref:`Stokesian Dynamics`). Requires BLAS and LAPACK.
+
+- ``STOKESIAN_DYNAMICS_GPU`` Enables the Stokesian Dynamics feature for GPU
+  (see :ref:`Stokesian Dynamics`). Requires thrust/cuBLAS/cuSolver for NVIDIA
+  GPUs or rocrand/rocthrust/rocblas/rocsolver for AMD GPUs.
+  Requires ``EXPERIMENTAL_FEATURES``.
+
 
 
 .. _Configuring:
@@ -563,11 +577,11 @@ Then you can simply compile two different versions of |es| via:
 
 .. code-block:: bash
 
-    cd builddir1
+    cd $builddir1
     cmake ..
     make
 
-    cd builddir2
+    cd $builddir2
     cmake ..
     make
 
@@ -670,6 +684,8 @@ options are available:
 
 * ``WITH_SCAFACOS``: Build with ScaFaCoS support
 
+* ``WITH_STOKESIAN_DYNAMICS`` Build with Stokesian Dynamics support
+
 * ``WITH_VALGRIND_INSTRUMENTATION``: Build with valgrind instrumentation
   markers
 
@@ -697,6 +713,40 @@ correct one with ``CUDA_BIN_PATH=/usr/local/cuda-10.0 cmake .. -DWITH_CUDA=ON``
 path with ``-DCMAKE_CXX_FLAGS=--cuda-path=/usr/local/cuda-10.0``).
 
 
+.. _Configuring without a network connection:
+
+Configuring without a network connection
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Several :ref:`external features <External features>` in |es| rely on
+external libraries that are downloaded automatically by CMake. When a
+network connection cannot be established due to firewall restrictions,
+the CMake logic needs editing:
+
+* ``WITH_HDF5``: when cloning |es|, the :file:`/libs/h5xx` folder will be
+  a git submodule containing a :file:`.git` subfolder. To prevent CMake from
+  updating this submodule with git, delete the corresponding command with:
+
+  .. code-block:: bash
+
+    sed -i '/execute_process(COMMAND ${GIT_EXECUTABLE} submodule update -- libs\/h5xx/,+1 d' CMakeLists.txt
+
+  When installing a release version of |es|, no network communication
+  is needed for HDF5.
+
+* ``WITH_STOKESIAN_DYNAMICS``: this library is installed using `FetchContent
+  <https://cmake.org/cmake/help/latest/module/FetchContent.html>`_.
+  The repository URL can be found in the ``GIT_REPOSITORY`` field of the
+  corresponding ``FetchContent_Declare()`` command. The ``GIT_TAG`` field
+  provides the commit. Clone this repository locally next to the |es|
+  folder and edit the |es| build system such that ``GIT_REPOSITORY`` points
+  to the absolute path of the Stokesian Dynamics clone, for example with:
+
+  .. code-block:: bash
+
+    sed -ri 's|GIT_REPOSITORY +.+/stokesian_dynamics|GIT_REPOSITORY /work/username/stokesian_dynamics|' CMakeLists.txt
+
+
 Compiling, testing and installing
 ---------------------------------
 
diff --git a/doc/sphinx/introduction.rst b/doc/sphinx/introduction.rst
index 2277afedde7..8010ff8dd8f 100644
--- a/doc/sphinx/introduction.rst
+++ b/doc/sphinx/introduction.rst
@@ -434,6 +434,10 @@ report so to the developers using the instructions in :ref:`Contributing`.
 +--------------------------------+------------------------+------------------+------------+
 | Quaternion Integrator          | Core                   | Good             | Yes        |
 +--------------------------------+------------------------+------------------+------------+
+| Stokesian Dynamics on CPU      | Single                 | None             | Yes        |
++--------------------------------+------------------------+------------------+------------+
+| Stokesian Dynamics on GPU      | Experimental           | None             | No         |
++--------------------------------+------------------------+------------------+------------+
 |                                **Interactions**                                         |
 +--------------------------------+------------------------+------------------+------------+
 | Short-range Interactions       | Core                   | Core             | Yes        |
diff --git a/doc/sphinx/running.rst b/doc/sphinx/running.rst
index ea6b9367ce2..5c608e2ca29 100644
--- a/doc/sphinx/running.rst
+++ b/doc/sphinx/running.rst
@@ -273,3 +273,70 @@ This convenience function only exists for historical reasons and remains availab
 for backward compatibility. New scripts should setup the steepest descent
 integrator with the :meth:`~espressomd.integrate.IntegratorHandle.set_steepest_descent`
 handle directly.
+
+.. _Stokesian Dynamics:
+
+Stokesian Dynamics
+------------------
+
+:meth:`espressomd.integrate.IntegratorHandle.set_stokesian_dynamics`
+
+The Stokesian Dynamics method allows to study the behaviour of spherical
+particles in a viscous fluid. It is targeted at systems with very low Reynolds
+numbers. In such systems, particles stop moving immediately as soon as any
+force on them is removed. In other words, motion has no memory of the past.
+
+The integration scheme is relatively simple. Only the particle's positions,
+radii and forces (including torques) are needed to compute the momentary
+velocities (including angular velocities). The particle positions are
+integrated by the simple Euler scheme.
+
+The computation of the velocities is an approximation with good results
+in the far field.
+The Stokesian Dynamics method is only available for open systems,
+i.e. no periodic boundary conditions are supported. The box size has
+no effect either.
+
+The Stokesian Dynamics method is outlined in :cite:`durlofsky87a`.
+
+The following minimal example illustrates how to use the SDM in |es|::
+
+    import espressomd
+    system = espressomd.System(box_l=[1.0, 1.0, 1.0])
+    system.time_step = 0.01
+    system.cell_system.skin = 0.4
+    system.part.add(pos=[0, 0, 0], rotation=[1, 0, 0])
+    system.integrator.set_stokesian_dynamics(viscosity=1.0, radii={0: 1.0})
+
+    system.integrator.run(100)
+
+Because there is no force on the particle yet, nothing will move. You will need
+to add your own actors to the system. The parameter ``radii`` is a dictionary
+that maps particle types to different radii. ``viscosity`` is the dynamic
+viscosity of the ambient infinite fluid. There are additional optional
+parameters for ``set_stokesian_dynamics()``. For more information, see
+:py:meth:`espressomd.integrate.IntegratorHandle.set_stokesian_dynamics()`.
+
+Note that this setup represents a system at zero temperature. In order to
+thermalize the system, the SD thermostat needs to be activated (see
+:ref:`Stokesian thermostat`).
+
+.. _Important_SD:
+
+Important
+~~~~~~~~~
+
+The particles must be prevented from overlapping. It is mathematically allowed
+for the particles to overlap to a certain degree. However, once the distance
+of the sphere centers is less than 2/3 of the sphere diameter, the mobility
+matrix is no longer positive definite and the Stokesian Dynamics integrator
+will fail. Therefore, the particle centers must be kept apart from each
+other by a strongly repulsive potential, for example the WCA potential
+that is set to the appropriate particle radius (for more information about
+the available interaction types see :ref:`Non-bonded interactions`).
+
+The current implementation of SD only includes the far field approximation.
+The near field (so-called lubrication) correction is planned. For now,
+Stokesian Dynamics provides a good approximation of the hydrodynamics
+in dilute systems where the average distance between particles is several
+sphere diameters.
diff --git a/doc/sphinx/system_setup.rst b/doc/sphinx/system_setup.rst
index be5d738e122..b4149093150 100644
--- a/doc/sphinx/system_setup.rst
+++ b/doc/sphinx/system_setup.rst
@@ -473,6 +473,21 @@ can be omitted in subsequent calls of ``set_brownian()``. It is the user's
 responsibility to decide whether the thermostat should be deterministic (by
 using a fixed seed) or not (by using a randomized seed).
 
+.. _Stokesian thermostat:
+
+Stokesian thermostat
+~~~~~~~~~~~~~~~~~~~~
+
+In order to thermalize a Stokesian Dynamics simulation, the SD thermostat
+needs to be activated via::
+
+    system.thermostat.set_stokesian(kT=1.0, seed=43)
+
+where ``kT`` denotes the desired temperature of the system, and ``seed`` the
+seed for the random number generator of the Stokesian Dynamics thermostat.
+It is independent from the other random number generators in |es|.
+
+
 .. _CUDA:
 
 CUDA
diff --git a/doc/sphinx/zrefs.bib b/doc/sphinx/zrefs.bib
index 4908486963d..2e503448604 100644
--- a/doc/sphinx/zrefs.bib
+++ b/doc/sphinx/zrefs.bib
@@ -1,7 +1,6 @@
-% This file was created with JabRef 2.9.
 % Encoding: US-ASCII
 @article{ahlrichs99,
-author = {Ahlrichs,Patrick  and Dünweg,Burkhard },
+author = {Ahlrichs, Patrick and D\"{u}nweg, Burkhard},
 title = {Simulation of a single polymer chain in solution by combining lattice {B}oltzmann and molecular dynamics},
 journal = {The Journal of Chemical Physics},
 volume = {111},
@@ -998,3 +997,13 @@ @article{yaghoubi2015a
   doi={10.1209/0295-5075/110/24002},
   publisher={IOP Publishing}
 }
+
+@Article{durlofsky87a,
+  author  = {L. Durlofsky and J. F. Brady and G. Bossis},
+  title   = {{Dynamic simulation of hydrodynamically interacting particles}},
+  journal = {Journal of Fluid Mechanics},
+  year    = {1987},
+  volume  = {180},
+  pages   = {21-49},
+  doi     = {10.1017/S002211208700171X},
+}
diff --git a/libs/CMakeLists.txt b/libs/CMakeLists.txt
index ac344dca2fb..9b8583c44b7 100644
--- a/libs/CMakeLists.txt
+++ b/libs/CMakeLists.txt
@@ -4,3 +4,10 @@ target_compile_definitions(Random123 INTERFACE R123_USE_MULHILO64_C99)
 
 add_library(h5xx INTERFACE)
 target_include_directories(h5xx SYSTEM INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/h5xx)
+
+if(STOKESIAN_DYNAMICS OR STOKESIAN_DYNAMICS_GPU)
+  add_library(stokesian_dynamics INTERFACE)
+  target_include_directories(stokesian_dynamics INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/stokesian_dynamics/include>)
+  set(CMAKE_INSTALL_LIBDIR "${CMAKE_INSTALL_PREFIX}/${PYTHON_INSTDIR}/espressomd")
+  add_subdirectory(stokesian_dynamics/src)
+endif()
diff --git a/libs/stokesian_dynamics/CMakeLists.txt b/libs/stokesian_dynamics/CMakeLists.txt
new file mode 100644
index 00000000000..716ea134b6a
--- /dev/null
+++ b/libs/stokesian_dynamics/CMakeLists.txt
@@ -0,0 +1,12 @@
+# TODO: License header
+project(StokesianDynamics LANGUAGES CXX)
+
+add_library(stokesian_dynamics INTERFACE)
+target_include_directories(stokesian_dynamics INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
+add_subdirectory(src)
+
+export(EXPORT stokesiandynamics-targets
+       FILE ${CMAKE_CURRENT_BINARY_DIR}/StokesianDynamicsTargets.cmake
+       NAMESPACE StokesianDynamics::)
+
+export(PACKAGE StokesianDynamics)
diff --git a/libs/stokesian_dynamics/include/stokesian_dynamics/sd_cpu.hpp b/libs/stokesian_dynamics/include/stokesian_dynamics/sd_cpu.hpp
new file mode 100644
index 00000000000..331ece54f43
--- /dev/null
+++ b/libs/stokesian_dynamics/include/stokesian_dynamics/sd_cpu.hpp
@@ -0,0 +1,12 @@
+#ifndef SD_CPU_HPP
+#define SD_CPU_HPP
+
+#include <vector>
+
+std::vector<double> sd_cpu(std::vector<double> const &x_host,
+                           std::vector<double> const &f_host,
+                           std::vector<double> const &a_host,
+                           std::size_t n_part, double eta, double sqrt_kT_Dt,
+                           std::size_t offset, std::size_t seed, int flg);
+
+#endif
diff --git a/libs/stokesian_dynamics/include/stokesian_dynamics/sd_gpu.hpp b/libs/stokesian_dynamics/include/stokesian_dynamics/sd_gpu.hpp
new file mode 100644
index 00000000000..4721ad7bfc7
--- /dev/null
+++ b/libs/stokesian_dynamics/include/stokesian_dynamics/sd_gpu.hpp
@@ -0,0 +1,12 @@
+#ifndef SD_GPU_HPP
+#define SD_GPU_HPP
+
+#include <vector>
+
+std::vector<double> sd_gpu(std::vector<double> const &x_host,
+                           std::vector<double> const &f_host,
+                           std::vector<double> const &a_host,
+                           std::size_t n_part, double eta, double sqrt_kT_Dt,
+                           std::size_t offset, std::size_t seed, int flg);
+
+#endif
diff --git a/libs/stokesian_dynamics/src/CMakeLists.txt b/libs/stokesian_dynamics/src/CMakeLists.txt
new file mode 100644
index 00000000000..85cb1e6544f
--- /dev/null
+++ b/libs/stokesian_dynamics/src/CMakeLists.txt
@@ -0,0 +1,51 @@
+# TODO: License header
+
+if(STOKESIAN_DYNAMICS_GPU)
+  add_gpu_library(sd_gpu SHARED sd_gpu.cu)
+  add_library(StokesianDynamics::sd_gpu ALIAS sd_gpu)
+  target_link_libraries(sd_gpu PRIVATE
+    stokesian_dynamics
+    ${CUDA_CUBLAS_LIBRARIES}
+    ${CUDA_cusolver_LIBRARY})
+  target_compile_definitions(sd_gpu PRIVATE SD_USE_THRUST)
+  install(TARGETS sd_gpu
+    EXPORT stokesiandynamics-targets
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
+  set_target_properties(sd_gpu PROPERTIES EXPORT_NAME sd_gpu)
+endif()
+
+if(STOKESIAN_DYNAMICS)
+  find_package(BLAS REQUIRED)
+  find_package(LAPACK REQUIRED)
+  find_package(Boost 1.65 REQUIRED)
+  add_library(sd_cpu SHARED sd_cpu.cpp)
+  add_library(StokesianDynamics::sd_cpu ALIAS sd_cpu)
+  target_link_libraries(sd_cpu
+    PRIVATE
+      stokesian_dynamics
+      ${BLAS_LIBRARIES}
+      ${LAPACK_LIBRARIES}
+      Boost::boost
+      Random123)
+
+  # In case the GPU is used, Thrust is present and can be used to parallelize
+  # the CPU code, too. The standard compiler needs to be told the location of
+  # Thrust
+  if(STOKESIAN_DYNAMICS_GPU)
+    target_compile_definitions(sd_cpu PUBLIC SD_USE_THRUST PRIVATE THRUST_DEVICE_SYSTEM=4)
+    if(HIP_VERSION)
+      target_include_directories(sd_cpu PRIVATE
+        ${HIP_ROOT_DIR}/include
+        ${ROCM_HOME}/include)
+    else()
+      target_include_directories(sd_cpu PRIVATE
+        ${CUDA_INCLUDE_DIRS})
+    endif()
+  endif()
+  install(TARGETS sd_cpu
+    EXPORT stokesiandynamics-targets
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
+  set_target_properties(sd_cpu PROPERTIES EXPORT_NAME sd_cpu)
+endif()
diff --git a/libs/stokesian_dynamics/src/device_matrix.hpp b/libs/stokesian_dynamics/src/device_matrix.hpp
new file mode 100644
index 00000000000..d123fd6c284
--- /dev/null
+++ b/libs/stokesian_dynamics/src/device_matrix.hpp
@@ -0,0 +1,803 @@
+/** @file
+ *  In this file, various types are provided which are useful for matrix
+ *  arithmetics on a parallel computing "device". Various BLAS and LAPACK
+ *  routines are accessed for efficient matrix calculations.
+ */
+
+#ifndef SD_DEVICE_MATRIX_HPP
+#define SD_DEVICE_MATRIX_HPP
+
+#include <iomanip>
+#include <limits>
+#include <type_traits>
+
+#include "thrust_wrapper.hpp"
+
+#if defined(__CUDACC__)
+#include <cublas_v2.h>
+#include <cusolverDn.h>
+#elif defined(__HIPCC__)
+#include <rocblas.h>
+#include <rocsolver.h>
+#endif
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+#define DEVICE_FUNC __host__ __device__
+#else
+#define DEVICE_FUNC
+#endif
+
+#ifdef __GNUC__
+#define MAYBE_UNUSED __attribute__((unused))
+#else
+#define MAYBE_UNUSED
+#endif
+
+/** The `host` and `device` structs are used to distinguish between routines
+ *  that are executed on the Thrust "host" and on the Thrust "device". These
+ *  types are passed to certain routines within this file as a template
+ *  parameter. Within those routines, its `vector` type denotes either
+ *  `host_vector` or `device_vector` depending on whether the routine is called
+ *  for host or device. The `par()` function returns the according Thrust
+ *  execution policy that is needed when Thrust routines are called.
+ */
+namespace policy {
+
+struct host {
+    template <typename T>
+    using vector = thrust_wrapper::host_vector<T>;
+    // the remove_const is necessary, because things cannot be static and const
+    // at the same time
+    static std::remove_const<decltype(thrust_wrapper::host)>::type par()
+      { return thrust_wrapper::host; }
+};
+
+struct device {
+    template <typename T>
+    using vector = thrust_wrapper::device_vector<T>;
+    static std::remove_const<decltype(thrust_wrapper::device)>::type par()
+      { return thrust_wrapper::device; }
+};
+
+template <typename...>
+using void_t = void;
+
+template <typename Policy, typename = void>
+struct is_policy : std::false_type {};
+
+template <typename Policy>
+struct is_policy<Policy, void_t<typename Policy::template vector<class T>,
+                                decltype(Policy::par())>> : std::true_type {};
+
+} // namespace policy
+
+DEVICE_FUNC inline thrust_wrapper::tuple<std::size_t, std::size_t>
+unravel_index(std::size_t index, std::size_t lda) {
+    std::size_t i, j;
+    i = index % lda;
+    index /= lda;
+    j = index;
+    return thrust_wrapper::make_tuple(i, j);
+    ;
+}
+
+/// \cond
+
+// LAPACK prototypes
+#if !defined(__CUDACC__) && !defined(__HIPCC__)
+extern "C" {
+int dgemm_(char *transa, char *transb, int *m, int *n, int *k, double *alpha,
+           double *a, int *lda, double *b, int *ldb, double *beta, double *c,
+           int *ldc);
+
+int dgemv_(char *trans, int *m, int *n, double *alpha, double *a, int *lda,
+           double *x, int *incx, double *beta, double *y, int *incy);
+
+int dpotrf_(char *uplo, int *n, double *a, int *lda, int *info);
+
+int dpotrs_(char *uplo, int *n, int *nrhs, double *a, int *lda, double *b,
+            int *ldb, int *info);
+}
+#endif
+
+namespace internal {
+
+/** The `cublas` struct channels access to efficient basic matrix operations
+ *  provided by either the cuBLAS library (after which it is named), which
+ *  executes on an Nvidia GPU, rocBLAS, which executes on an AMD GPU, or by
+ *  BLAS (Basic Linear Algebra Subprograms), which executes on the CPU.
+ *  \tparam Policy Specifies whether the routines are executed on host or on
+ *                 device, by either passing \ref policy::host or \ref
+ *                 policy::device
+ *  \tparam T      Specifies the data type (only `double` is available).
+ */
+template <typename Policy, typename T>
+struct cublas {};
+
+#if defined(__CUDACC__)
+/** Basic matrix operations on device (Nvidia-GPU) using the cuBLAS library
+ */
+template <>
+struct cublas<policy::device, double> {
+    /** Transpose matrix
+     *  \param A buffer on device for the matrix to be transposed.
+     *  \param C buffer on device to store the result. May be the same as A.
+     *  \param m number of rows of A
+     *  \param n number of columns of A
+     */
+    static void geam(double const *A, double *C, int m, int n) {
+        double const alpha = 1;
+        double const beta = 0;
+
+        MAYBE_UNUSED cublasStatus_t stat;
+        cublasHandle_t handle;
+        stat = cublasCreate(&handle);
+        assert(CUBLAS_STATUS_SUCCESS == stat);
+
+        stat = cublasDgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, n, m, &alpha, A, m,
+                           &beta, A, m, C, n);
+        assert(CUBLAS_STATUS_SUCCESS == stat);
+
+        cublasDestroy(handle);
+    }
+
+    /** Matrix matrix multiplication
+     *  \param A buffer on device for the matrix: first factor
+     *  \param B buffer on device for the matrix: second factor
+     *  \param C buffer on device to store the result
+     *  \param m number of rows of A
+     *  \param n number of columns of B
+     *  \param k number of columns of A, rows of B
+     */
+    static void gemm(const double *A, const double *B, double *C, int m, int k,
+                     int n) {
+        int lda = m, ldb = k, ldc = m;
+        double alpha = 1;
+        double beta = 0;
+
+        MAYBE_UNUSED cublasStatus_t stat;
+        cublasHandle_t handle;
+        stat = cublasCreate(&handle);
+        assert(CUBLAS_STATUS_SUCCESS == stat);
+
+        stat = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A,
+                           lda, B, ldb, &beta, C, ldc);
+        assert(CUBLAS_STATUS_SUCCESS == stat);
+
+        cublasDestroy(handle);
+    }
+
+    /** Matrix vector multiplication
+     *  \param A buffer on device for the matrix
+     *  \param x buffer on device for the vector
+     *  \param y buffer on device for the result
+     *  \param m number of rows of A
+     *  \param n number of columns of A
+     */
+    static void gemv(const double *A, const double *x, double *y, int m,
+                     int n) {
+        int lda = m;
+        double alpha = 1;
+        double beta = 0;
+        int incx = 1;
+        int incy = 1;
+
+        MAYBE_UNUSED cublasStatus_t stat;
+        cublasHandle_t handle;
+        stat = cublasCreate(&handle);
+        assert(CUBLAS_STATUS_SUCCESS == stat);
+
+        stat = cublasDgemv(handle, CUBLAS_OP_N, m, n, &alpha, A, lda, x, incx,
+                           &beta, y, incy);
+        assert(CUBLAS_STATUS_SUCCESS == stat);
+
+        cublasDestroy(handle);
+    }
+};
+#elif defined(__HIPCC__)
+/** Basic matrix operations on device (AMD-GPU) using the rocBLAS library
+ */
+template <>
+struct cublas<policy::device, double> {
+    /** Transpose matrix
+     *  \param A buffer on device for the matrix to be transposed.
+     *  \param C buffer on device to store the result. May be the same as A.
+     *  \param m number of rows of A
+     *  \param n number of columns of A
+     */
+    static void geam(double const *A, double *C, int m, int n) {
+        double const alpha = 1;
+        double const beta = 0;
+
+        MAYBE_UNUSED rocblas_status stat;
+        rocblas_handle handle;
+        stat = rocblas_create_handle(&handle);
+        assert(rocblas_status_success == stat);
+
+        stat = rocblas_dgeam(handle, rocblas_operation_transpose,
+                             rocblas_operation_transpose, n, m, &alpha, A, m,
+                             &beta, A, m, C, n);
+        assert(rocblas_status_success == stat);
+
+        rocblas_destroy_handle(handle);
+    }
+
+    /** Matrix matrix multiplication
+     *  \param A buffer on device for the matrix: first factor
+     *  \param B buffer on device for the matrix: second factor
+     *  \param C buffer on device to store the result
+     *  \param m number of rows of A
+     *  \param n number of columns of B
+     *  \param k number of columns of A, rows of B
+     */
+    static void gemm(const double *A, const double *B, double *C, int m, int k,
+                     int n) {
+        int lda = m, ldb = k, ldc = m;
+        double alpha = 1;
+        double beta = 0;
+
+        MAYBE_UNUSED rocblas_status stat;
+        rocblas_handle handle;
+        stat = rocblas_create_handle(&handle);
+        assert(rocblas_status_success == stat);
+
+        stat = rocblas_dgemm(handle, rocblas_operation_none,
+                             rocblas_operation_none, m, n, k, &alpha, A, lda,
+                             B, ldb, &beta, C, ldc);
+        assert(rocblas_status_success == stat);
+
+        rocblas_destroy_handle(handle);
+    }
+
+    /** Matrix vector multiplication
+     *  \param A buffer on device for the matrix
+     *  \param x buffer on device for the vector
+     *  \param y buffer on device for the result
+     *  \param m number of rows of A
+     *  \param n number of columns of A
+     */
+    static void gemv(const double *A, const double *x, double *y, int m,
+                     int n) {
+        int lda = m;
+        double alpha = 1;
+        double beta = 0;
+        int incx = 1;
+        int incy = 1;
+
+        MAYBE_UNUSED rocblas_status stat;
+        rocblas_handle handle;
+        stat = rocblas_create_handle(&handle);
+        assert(rocblas_status_success == stat);
+
+        stat = rocblas_dgemv(handle, rocblas_operation_none, m, n, &alpha, A,
+                             lda, x, incx, &beta, y, incy);
+        assert(rocblas_status_success == stat);
+
+        rocblas_destroy_handle(handle);
+    }
+};
+#else
+/** Basic matrix operations on host (CPU) using the BLAS library
+ */
+template <>
+struct cublas<policy::host, double> {
+    /** Transpose matrix
+     *  \param A buffer on host for the matrix to be transposed.
+     *  \param C buffer on host to store the result. May be the same as A.
+     *  \param m number of rows of A
+     *  \param n number of columns of A
+     */
+    static void geam(double const *A, double *C, int m, int n) {
+        // m = m_rows, n = m_cols
+        for (int i = 0; i < m; ++i) {
+            for (int j = 0; j < n; ++j) {
+                // (row,col) = row + col * m_rows
+                C[j + i * n] = A[i + j * m];
+            }
+        }
+    }
+
+    /** Matrix matrix multiplication
+     *  \param A buffer on host for the matrix: first factor
+     *  \param B buffer on host for the matrix: second factor
+     *  \param C buffer on host to store the result
+     *  \param m number of rows of A
+     *  \param n number of columns of B
+     *  \param k number of columns of A, rows of B
+     */
+    static void gemm(const double *A, const double *B, double *C, int m, int k,
+                     int n) {
+        int lda = m, ldb = k, ldc = m;
+        double alpha = 1;
+        double beta = 0;
+
+        char N = 'N';
+        dgemm_(&N, &N, &m, &n, &k, &alpha, const_cast<double *>(A), &lda,
+               const_cast<double *>(B), &ldb, &beta, C, &ldc);
+    }
+
+    /** Matrix vector multiplication
+     *  \param A buffer on host for the matrix
+     *  \param x buffer on host for the vector
+     *  \param y buffer on host for the result
+     *  \param m number of rows of A
+     *  \param n number of columns of A
+     */
+    static void gemv(const double *A, const double *x, double *y, int m,
+                     int n) {
+        int lda = m;
+        double alpha = 1;
+        double beta = 0;
+        int incx = 1;
+        int incy = 1;
+
+        char N = 'N';
+        dgemv_(&N, &m, &n, &alpha, const_cast<double *>(A), &lda,
+               const_cast<double *>(x), &incx, &beta, y, &incy);
+    }
+};
+#endif
+
+/** The `cusolver` struct channels access to efficient linear algebra solvers
+ *  provided by either the cuSolver library (after which it is named), which
+ *  executes on an Nvidia GPU, rocSolver, which executes on an AMD GPU, or by
+ *  LAPACK (Linear Algebra PACKage), which executes on the CPU.
+ *  The first template parameter specifies whether the routines are executed on
+ *  host or on device, by either passing `policy::host` or `policy::device`.
+ *  The second template parameter specifies the data type (only `double` is
+ *  available).
+ *
+ *  Note: since at the time of writing, the rocSolver library lacks the
+ *  `dpotrs` function, the implementation for AMD GPUs is less efficient than
+ *  for the other platforms.
+ */
+template <typename, typename>
+struct cusolver;
+
+#if defined(__CUDACC__)
+template <>
+struct cusolver<policy::device, double> {
+    /** Computes the Cholesky factorization and the inverse of a real symmetric
+     *  positive definite matrix, looking only in the top half of the symmetric
+     *  matrix.
+     *
+     *  \param A buffer on device for the symmetric input matrix,
+     *           serves as output for the Cholesky decomposition
+     *  \param B buffer on device expects identity matrix,
+     *           serves as output for the inverse
+     *  \param N size of the matrix
+     */
+    static void potrf(double *A, double *B, int N) {
+        MAYBE_UNUSED cusolverStatus_t stat;
+        cusolverDnHandle_t handle;
+        stat = cusolverDnCreate(&handle);
+        assert(CUSOLVER_STATUS_SUCCESS == stat);
+
+        int lwork = -1;
+        stat = cusolverDnDpotrf_bufferSize(handle, CUBLAS_FILL_MODE_UPPER, N, A,
+                                           N, &lwork);
+        assert(CUSOLVER_STATUS_SUCCESS == stat);
+
+        assert(lwork != -1);
+
+        thrust_wrapper::device_vector<double> workspace(lwork);
+        thrust_wrapper::device_vector<int> info(1);
+        stat = cusolverDnDpotrf(handle, CUBLAS_FILL_MODE_UPPER, N, A, N,
+                                thrust_wrapper::raw_pointer_cast(workspace.data()),
+                                lwork, thrust_wrapper::raw_pointer_cast(info.data()));
+        assert(CUSOLVER_STATUS_SUCCESS == stat);
+        assert(info[0] == 0);
+
+        stat = cusolverDnDpotrs(handle, CUBLAS_FILL_MODE_UPPER, N, N, A, N, B,
+                                N, thrust_wrapper::raw_pointer_cast(info.data()));
+        assert(CUSOLVER_STATUS_SUCCESS == stat);
+        assert(info[0] == 0);
+
+        cusolverDnDestroy(handle);
+    }
+};
+#elif defined(__HIPCC__)
+// ROCm API documentation
+// https://rocsolver.readthedocs.io/en/latest/userguidedocu.html
+template <>
+struct cusolver<policy::device, double> {
+    /** Computes the Cholesky factorization and the inverse of a real symmetric
+     *  positive definite matrix, looking only in the top half of the symmetric
+     *  matrix.
+     *
+     *  \param A buffer on device for the symmetric input matrix,
+     *           serves as output for the Cholesky decomposition
+     *  \param B buffer on device expects identity matrix,
+     *           serves as output for the inverse
+     *  \param N size of the matrix
+     */
+    static void potrf(double *A, double *B, int N) {
+        // copy input so that it is available twice
+        thrust_wrapper::device_vector<double> C(N*N);
+        thrust_wrapper::copy_n(policy::device::par(), A, N*N, C.begin());
+
+        MAYBE_UNUSED rocsolver_status stat;
+        rocsolver_handle handle;
+        stat = rocsolver_create_handle(&handle);
+        assert(rocblas_status_success == stat);
+
+        // Cholesky decomposition
+        thrust_wrapper::device_vector<rocsolver_int> info(1);
+        stat = rocsolver_dpotrf(handle, rocblas_fill_upper, N, A, N,
+                                thrust_wrapper::raw_pointer_cast(info.data()));
+        assert(rocblas_status_success == stat);
+        assert(info[0] == 0);
+
+        // Matrix inversion - relatively inefficient due to lack of rocsolver_dpotrs
+        thrust_wrapper::device_vector<rocsolver_int> ipiv(N);
+        stat = rocsolver_dgetrf(handle, N, N,
+                                thrust_wrapper::raw_pointer_cast(C.data()), N,
+                                thrust_wrapper::raw_pointer_cast(ipiv.data()),
+                                thrust_wrapper::raw_pointer_cast(info.data()));
+        assert(rocblas_status_success == stat);
+        assert(info[0] == 0);
+
+        stat = rocsolver_dgetrs(handle, rocblas_operation_none, N, N,
+                                thrust_wrapper::raw_pointer_cast(C.data()), N,
+                                thrust_wrapper::raw_pointer_cast(ipiv.data()),
+                                B, N);
+        assert(rocblas_status_success == stat);
+
+        rocblas_destroy_handle(handle);
+    }
+};
+#else
+template <>
+struct cusolver<policy::host, double> {
+    /** Computes the Cholesky factorization and the inverse of a real symmetric
+     *  positive definite matrix, looking only in the top half of the symmetric
+     *  matrix.
+     *
+     *  \param A buffer on host for the symmetric input matrix,
+     *           serves as output for the Cholesky decomposition
+     *  \param B buffer on host expects identity matrix,
+     *           serves as output for the inverse
+     *  \param N size of the matrix
+     */
+    static void potrf(double *A, double *B, int N) {
+        char uplo = 'U';
+        int info;
+
+        // compute the cholesky factorization, result in A
+        dpotrf_(&uplo, &N, A, &N, &info);
+        assert(info == 0);
+
+        // compute the inverse of the original matrix in A,
+        // result in B, cholesky factorization remains in A
+        dpotrs_(&uplo, &N, &N, A, &N, B, &N, &info);
+        assert(info == 0);
+    }
+};
+#endif
+
+template <typename T>
+struct almostEqual {
+    T epsilon;
+    DEVICE_FUNC bool operator()(T x, T y) const {
+        return std::abs(x - y) < epsilon;
+    }
+};
+
+template <typename T>
+struct IdentityGenerator {
+    std::size_t lda;
+
+    DEVICE_FUNC T operator()(std::size_t index) {
+        std::size_t i, j;
+
+        thrust_wrapper::tie(i, j) = unravel_index(index, lda);
+        return T(i == j ? 1.0 : 0.0);
+    }
+};
+
+} // namespace internal
+
+/// \endcond
+
+
+/** Matrix datatype with arithemtic operations that can run on a THRUST DEVICE.
+ *  Storage is column-major order.
+ */
+template <typename T, typename Policy = policy::host>
+class device_matrix { // NOLINT(bugprone-exception-escape)
+    static_assert(policy::is_policy<Policy>::value,
+                  "The execution policy must meet the requirements");
+
+public:
+    using storage_type = typename Policy::template vector<T>;
+    using value_type = typename storage_type::value_type;
+    using size_type = typename storage_type::size_type;
+    using difference_type = typename storage_type::difference_type;
+    using reference = typename storage_type::reference;
+    using const_reference = typename storage_type::const_reference;
+    using pointer = typename storage_type::pointer;
+    using const_pointer = typename storage_type::const_pointer;
+    using iterator = typename storage_type::iterator;
+
+private:
+    // Storage
+    size_type m_rows;
+    size_type m_cols;
+    storage_type m_data;
+
+public:
+    device_matrix() : m_rows(0), m_cols(0), m_data(m_rows * m_cols) {}
+
+    explicit device_matrix(size_type rows, size_type cols)
+        : m_rows(rows), m_cols(cols), m_data(m_rows * m_cols) {}
+
+    reference operator()(size_type row, size_type col) noexcept {
+        return m_data[row + col * m_rows];
+    }
+
+    const_reference operator()(size_type row, size_type col) const noexcept {
+        return m_data[row + col * m_rows];
+    }
+
+    void fill(value_type const &value) noexcept(
+        std::is_nothrow_copy_assignable<value_type>::value) {
+        thrust_wrapper::fill(Policy::par(), m_data.begin(), m_data.end(), value);
+    }
+
+    void
+    swap(device_matrix &other) noexcept(noexcept(m_data.swap(other.m_data))) {
+        std::swap(m_rows, other.m_rows);
+        std::swap(m_cols, other.m_cols);
+        m_data.swap(other.m_data);
+    }
+
+    pointer data() noexcept { return m_data.data(); }
+    const_pointer data() const noexcept { return m_data.data(); }
+    size_type rows() const noexcept { return m_rows; }
+    size_type cols() const noexcept { return m_cols; }
+    size_type size() const noexcept { return m_rows * m_cols; }
+
+    /// \defgroup arithmetic Arithmetic operators
+    /// \{
+
+    /// Matrix-matrix multiplication
+    device_matrix operator*(device_matrix const &B) const {
+        static_assert(std::is_arithmetic<T>::value,
+                      "Data type of device_matrix must be arithmetic for "
+                      "arithmetic operations");
+        assert(m_cols == B.m_rows);
+        device_matrix C(m_rows, B.m_cols);
+        internal::cublas<Policy, value_type>::gemm(
+            thrust_wrapper::raw_pointer_cast(data()),
+            thrust_wrapper::raw_pointer_cast(B.data()),
+            thrust_wrapper::raw_pointer_cast(C.data()), m_rows, m_cols, B.m_cols);
+        return C;
+    }
+
+    /// Matrix-vector multiplication
+    storage_type operator*(storage_type const &x) const {
+        static_assert(std::is_arithmetic<T>::value,
+                      "Data type of device_matrix must be arithmetic for "
+                      "arithmetic operations");
+        assert(m_cols == x.size());
+        storage_type y(m_rows);
+        internal::cublas<Policy, value_type>::gemv(
+            thrust_wrapper::raw_pointer_cast(data()),
+            thrust_wrapper::raw_pointer_cast(x.data()),
+            thrust_wrapper::raw_pointer_cast(y.data()), m_rows, m_cols);
+        return y;
+    }
+
+    /// Add element-wise
+    device_matrix operator+(device_matrix const &B) const {
+        static_assert(std::is_arithmetic<T>::value,
+                      "Data type of device_matrix must be arithmetic for "
+                      "arithmetic operations");
+        assert(m_rows == B.m_rows);
+        assert(m_cols == B.m_cols);
+        device_matrix C(m_rows, m_cols);
+        thrust_wrapper::transform(Policy::par(), data(), data() + size(),
+                                  B.data(), C.data(),
+                                  thrust_wrapper::plus<value_type>{});
+        return C;
+    }
+
+    /// Subtract element-wise
+    device_matrix operator-(device_matrix const &B) const {
+        static_assert(std::is_arithmetic<T>::value,
+                      "Data type of device_matrix must be arithmetic for "
+                      "arithmetic operations");
+        assert(m_rows == B.m_rows);
+        assert(m_cols == B.m_cols);
+        device_matrix C(m_rows, m_cols);
+        thrust_wrapper::transform(Policy::par(), data(), data() + size(),
+                                  B.data(), C.data(),
+                                  thrust_wrapper::minus<value_type>{});
+        return C;
+    }
+
+    /// Negate element-wise
+    device_matrix &operator-() {
+        static_assert(std::is_arithmetic<T>::value,
+                      "Data type of device_matrix must be arithmetic for "
+                      "arithmetic operations");
+        thrust_wrapper::transform(Policy::par(), data(), data() + size(),
+                                  data(),
+                                  thrust_wrapper::negate<value_type>{});
+        return *this;
+    }
+
+    /// \}
+
+    /// \defgroup compare Comparison
+    /// \{
+
+    /// Compare for exact bit-wise equality
+    bool operator==(device_matrix const &B) const {
+        return m_rows == B.m_rows && m_cols == B.m_cols &&
+               thrust_wrapper::equal(Policy::par(), data(),
+                                     data() + size(), B.data());
+    }
+
+    /// Compare for equality with threshold \p epsilon.
+    ///
+    /// \param epsilon maximum allowed difference
+    bool almostEqual(device_matrix const &B,
+                     value_type const epsilon =
+                         std::numeric_limits<value_type>::epsilon()) const {
+        static_assert(std::is_arithmetic<T>::value,
+                      "Data type of device_matrix must be arithmetic for "
+                      "arithmetic operations");
+        return m_rows == B.m_rows && m_cols == B.m_cols &&
+               thrust_wrapper::equal(Policy::par(), data(), data() + size(),
+                      B.data(), internal::almostEqual<value_type>{epsilon});
+    }
+
+    /// \}
+
+    /// \defgroup linalg Linear algebra
+    /// \{
+
+    /// Compute the transpose.
+    device_matrix transpose() const {
+        static_assert(std::is_same<T, double>::value,
+                      "Data type of device_matrix must be floating point for "
+                      "BLAS/LAPACK operations");
+        device_matrix C(m_cols, m_rows);
+        internal::cublas<Policy, value_type>::geam(
+            thrust_wrapper::raw_pointer_cast(data()),
+            thrust_wrapper::raw_pointer_cast(C.data()), m_rows, m_cols);
+        return C;
+    }
+
+    /// Compute the inverse and the Cholesky decomposition.
+    thrust_wrapper::tuple<device_matrix, device_matrix> inverse_and_cholesky() const {
+        static_assert(std::is_same<T, double>::value,
+                      "Data type of device_matrix must be floating point for "
+                      "BLAS/LAPACK operations");
+        assert(m_rows == m_cols);
+        device_matrix A = *this;
+        device_matrix B = device_matrix::Identity(m_rows, m_cols);
+
+        internal::cusolver<Policy, double>::potrf(
+            thrust_wrapper::raw_pointer_cast(A.data()),
+            thrust_wrapper::raw_pointer_cast(B.data()), m_rows);
+
+        return thrust_wrapper::make_tuple(B, A);
+    }
+
+    /// Compute the inverse.
+    device_matrix inverse() const {
+        return thrust_wrapper::get<0>(inverse_and_cholesky());
+    }
+
+    /// \}
+
+    /// \defgroup generators Convenience generator functions
+    /// \{
+
+    /// Generate an identity matrix with size \p rows x \p cols.
+    static device_matrix Identity(size_type rows, size_type cols) {
+        static_assert(std::is_same<T, double>::value,
+                      "Data type of device_matrix must be floating point for "
+                      "BLAS/LAPACK operations");
+        device_matrix I(rows, cols);
+        thrust_wrapper::tabulate(Policy::par(), I.data(), I.data() + I.size(),
+                                 internal::IdentityGenerator<value_type>{rows});
+        return I;
+    }
+
+    /// \}
+};
+
+
+/** Read-only reference to another device_matrix object. More accurately, it
+ *  is a separate object that has different routines by which the same region
+ *  of memory can be viewed.
+ *
+ *  Interestingly, by clever abuse of the rows and cols parameters and the data
+ *  pointer, certain areas (e.g. blocks) of the referenced matrix could be
+ *  accessed conveniently through a device_matrix_view object.
+ */
+template <typename T, typename Policy>
+class device_matrix_view {
+public:
+    using storage_type = typename Policy::template vector<T>;
+    using value_type = T;
+    using size_type = std::size_t;
+    using difference_type = std::ptrdiff_t;
+    using reference = value_type &;
+    using const_reference = value_type const &;
+    using pointer = T *;
+    using const_pointer = T const *;
+    using iterator = pointer;
+
+private:
+    // Storage
+    size_type m_rows;
+    size_type m_cols;
+    T *m_data;
+
+public:
+    device_matrix_view(pointer data, size_type rows, size_type cols)
+        : m_rows(rows), m_cols(cols), m_data(data) {}
+
+    device_matrix_view(device_matrix<T, Policy> &v)
+        : m_rows(v.rows()), m_cols(v.cols()),
+          m_data(thrust_wrapper::raw_pointer_cast(v.data())) {}
+
+    DEVICE_FUNC reference operator()(size_type row, size_type col) noexcept {
+        return m_data[row + col * m_rows];
+    }
+
+    DEVICE_FUNC const_reference operator()(size_type row, size_type col) const
+        noexcept {
+        return m_data[row + col * m_rows];
+    }
+
+    DEVICE_FUNC pointer data() noexcept { return m_data; }
+    DEVICE_FUNC const_pointer data() const noexcept { return m_data; }
+    DEVICE_FUNC size_type rows() const noexcept { return m_rows; }
+    DEVICE_FUNC size_type cols() const noexcept { return m_cols; }
+    DEVICE_FUNC size_type size() const noexcept { return m_rows * m_cols; }
+};
+
+template <typename T, typename Policy = policy::host>
+class device_vector_view {
+public:
+    using storage_type = typename Policy::template vector<T>;
+    using value_type = T;
+    using size_type = std::size_t;
+    using difference_type = std::ptrdiff_t;
+    using reference = value_type &;
+    using const_reference = value_type const &;
+    using pointer = T *;
+    using const_pointer = T const *;
+    using iterator = pointer;
+
+private:
+    // Storage
+    size_type m_size;
+    T *m_data;
+
+public:
+    device_vector_view(pointer data, size_type size)
+        : m_size(size), m_data(data) {}
+
+    device_vector_view(storage_type &v)
+        : m_size(v.size()), m_data(thrust_wrapper::raw_pointer_cast(v.data())) {}
+
+    DEVICE_FUNC reference operator()(size_type i) noexcept { return m_data[i]; }
+
+    DEVICE_FUNC const_reference operator()(size_type i) const noexcept {
+        return m_data[i];
+    }
+
+    DEVICE_FUNC pointer data() noexcept { return m_data; }
+    DEVICE_FUNC const_pointer data() const noexcept { return m_data; }
+    DEVICE_FUNC size_type size() const noexcept { return m_size; }
+};
+
+#undef DEVICE_FUNC
+#undef MAYBE_UNUSED
+#endif
diff --git a/libs/stokesian_dynamics/src/lubrication_data.inl b/libs/stokesian_dynamics/src/lubrication_data.inl
new file mode 100644
index 00000000000..e7601d8ef6a
--- /dev/null
+++ b/libs/stokesian_dynamics/src/lubrication_data.inl
@@ -0,0 +1,285 @@
+static constexpr double rsabc[] = {
+    2.10000000e+00, 2.15000000e+00, 2.20000000e+00, 2.25000000e+00,
+    2.30000000e+00, 2.35000000e+00, 2.40000000e+00, 2.45000000e+00,
+    2.50000000e+00, 2.55000000e+00, 2.60000000e+00, 2.65000000e+00,
+    2.70000000e+00, 2.75000000e+00, 2.80000000e+00, 2.85000000e+00,
+    2.90000000e+00, 2.95000000e+00, 3.00000000e+00, 3.05000000e+00,
+    3.10000000e+00, 3.15000000e+00, 3.20000000e+00, 3.25000000e+00,
+    3.30000000e+00, 3.35000000e+00, 3.40000000e+00, 3.45000000e+00,
+    3.50000000e+00, 3.55000000e+00, 3.60000000e+00, 3.65000000e+00,
+    3.70000000e+00, 3.75000000e+00, 3.80000000e+00, 3.85000000e+00,
+    3.90000000e+00, 3.95000000e+00, 4.00000000e+00};
+
+static constexpr double x11as[] = {
+    1.98069000e+00, 1.14261000e+00, 7.40120000e-01, 5.11730000e-01,
+    3.69290000e-01, 2.74840000e-01, 2.09440000e-01, 1.62630000e-01,
+    1.28260000e-01, 1.02480000e-01, 8.28000000e-02, 6.75400000e-02,
+    5.55600000e-02, 4.60300000e-02, 3.83800000e-02, 3.21700000e-02,
+    2.71000000e-02, 2.29200000e-02, 1.94500000e-02, 1.65500000e-02,
+    1.40900000e-02, 1.20200000e-02, 1.02800000e-02, 8.78000000e-03,
+    7.50000000e-03, 6.39000000e-03, 5.43000000e-03, 4.61000000e-03,
+    3.88000000e-03, 3.25000000e-03, 2.69000000e-03, 2.20000000e-03,
+    1.77000000e-03, 1.38000000e-03, 1.04000000e-03, 7.40000000e-04,
+    4.60000000e-04, 2.20000000e-04, 0.00000000e+00};
+
+static constexpr double x12as[] = {
+    -1.97823200e+00, -1.14019200e+00, -7.37752000e-01, -5.09422000e-01,
+    -3.67042000e-01, -2.72662000e-01, -2.07332000e-01, -1.60603000e-01,
+    -1.26311000e-01, -1.00617000e-01, -8.10230000e-02, -6.58530000e-02,
+    -5.39560000e-02, -4.45190000e-02, -3.69560000e-02, -3.08400000e-02,
+    -2.58530000e-02, -2.17570000e-02, -1.83680000e-02, -1.55470000e-02,
+    -1.31570000e-02, -1.11750000e-02, -9.49800000e-03, -8.07200000e-03,
+    -6.85500000e-03, -5.81300000e-03, -4.91500000e-03, -4.14100000e-03,
+    -3.47000000e-03, -2.89300000e-03, -2.38300000e-03, -1.93800000e-03,
+    -1.54700000e-03, -1.20400000e-03, -9.01000000e-04, -6.34000000e-04,
+    -3.97000000e-04, -1.87000000e-04, 0.00000000e+00};
+
+static constexpr double y11as[] = {
+    8.52200000e-02, 5.37500000e-02, 3.67000000e-02, 2.63700000e-02,
+    1.96700000e-02, 1.50800000e-02, 1.18300000e-02, 9.46000000e-03,
+    7.67000000e-03, 6.30000000e-03, 5.24000000e-03, 4.39000000e-03,
+    3.71000000e-03, 3.15000000e-03, 2.70000000e-03, 2.32000000e-03,
+    2.00000000e-03, 1.73000000e-03, 1.50000000e-03, 1.30000000e-03,
+    1.13000000e-03, 9.90000000e-04, 8.60000000e-04, 7.40000000e-04,
+    6.50000000e-04, 5.60000000e-04, 4.90000000e-04, 4.10000000e-04,
+    3.50000000e-04, 3.00000000e-04, 2.50000000e-04, 2.10000000e-04,
+    1.60000000e-04, 1.30000000e-04, 1.00000000e-04, 7.00000000e-05,
+    4.00000000e-05, 2.00000000e-05, 0.00000000e+00};
+
+static constexpr double y12as[] = {
+    -8.14540000e-02, -5.03280000e-02, -3.35920000e-02, -2.35600000e-02,
+    -1.71210000e-02, -1.27850000e-02, -9.75800000e-03, -7.58300000e-03,
+    -5.98200000e-03, -4.77900000e-03, -3.86200000e-03, -3.15100000e-03,
+    -2.59300000e-03, -2.14900000e-03, -1.79200000e-03, -1.50300000e-03,
+    -1.26700000e-03, -1.07100000e-03, -9.10000000e-04, -7.74000000e-04,
+    -6.60000000e-04, -5.63000000e-04, -4.82000000e-04, -4.12000000e-04,
+    -3.51000000e-04, -3.00000000e-04, -2.54000000e-04, -2.15000000e-04,
+    -1.82000000e-04, -1.52000000e-04, -1.26000000e-04, -1.03000000e-04,
+    -8.20000000e-05, -6.40000000e-05, -4.80000000e-05, -3.40000000e-05,
+    -2.10000000e-05, -1.00000000e-05, 0.00000000e+00};
+
+static constexpr double y11bs[] = {
+    -7.87158000e-02, -4.82308000e-02, -3.18458000e-02, -2.20328000e-02,
+    -1.57498000e-02, -1.15348000e-02, -8.60950000e-03, -6.52500000e-03,
+    -5.00680000e-03, -3.88220000e-03, -3.03670000e-03, -2.39320000e-03,
+    -1.89830000e-03, -1.51410000e-03, -1.21350000e-03, -9.76500000e-04,
+    -7.88600000e-04, -6.38700000e-04, -5.18600000e-04, -4.21900000e-04,
+    -3.43600000e-04, -2.80100000e-04, -2.28500000e-04, -1.86200000e-04,
+    -1.51700000e-04, -1.23300000e-04, -9.99000000e-05, -8.07000000e-05,
+    -6.48000000e-05, -5.16000000e-05, -4.08000000e-05, -3.18000000e-05,
+    -2.43000000e-05, -1.82000000e-05, -1.30000000e-05, -8.80000000e-06,
+    -5.30000000e-06, -2.40000000e-06, 0.00000000e+00};
+
+static constexpr double y12bs[] = {
+    8.02421000e-02, 4.95261000e-02, 3.29531000e-02, 2.29871000e-02,
+    1.65771000e-02, 1.22561000e-02, 9.24210000e-03, 7.08110000e-03,
+    5.49710000e-03, 4.31510000e-03, 3.42010000e-03, 2.73410000e-03,
+    2.20110000e-03, 1.78310000e-03, 1.45210000e-03, 1.18910000e-03,
+    9.78100000e-04, 8.07100000e-04, 6.68100000e-04, 5.55100000e-04,
+    4.62100000e-04, 3.85100000e-04, 3.21100000e-04, 2.68100000e-04,
+    2.23000000e-04, 1.85700000e-04, 1.54300000e-04, 1.27800000e-04,
+    1.05300000e-04, 8.61000000e-05, 6.99000000e-05, 5.60000000e-05,
+    4.40000000e-05, 3.37000000e-05, 2.49000000e-05, 1.73000000e-05,
+    1.07000000e-05, 4.90000000e-06, 0.00000000e+00};
+
+static constexpr double x11cs[] = {
+    2.19500000e-02, 1.67400000e-02, 1.30800000e-02, 1.04000000e-02,
+    8.37000000e-03, 6.79000000e-03, 5.56000000e-03, 4.58000000e-03,
+    3.80000000e-03, 3.17000000e-03, 2.65000000e-03, 2.23000000e-03,
+    1.88000000e-03, 1.59000000e-03, 1.34000000e-03, 1.14000000e-03,
+    9.70000000e-04, 8.30000000e-04, 7.20000000e-04, 6.10000000e-04,
+    5.20000000e-04, 4.40000000e-04, 3.80000000e-04, 3.20000000e-04,
+    2.70000000e-04, 2.30000000e-04, 2.00000000e-04, 1.70000000e-04,
+    1.40000000e-04, 1.20000000e-04, 9.00000000e-05, 8.00000000e-05,
+    7.00000000e-05, 5.00000000e-05, 3.00000000e-05, 3.00000000e-05,
+    2.00000000e-05, 1.00000000e-05, 0.00000000e+00};
+
+static constexpr double x12cs[] = {
+    -1.00995000e-02, -6.79550000e-03, -4.73150000e-03, -3.37250000e-03,
+    -2.45050000e-03, -1.80550000e-03, -1.34810000e-03, -1.01760000e-03,
+    -7.75600000e-04, -5.96300000e-04, -4.62100000e-04, -3.60600000e-04,
+    -2.83200000e-04, -2.23800000e-04, -1.77800000e-04, -1.41900000e-04,
+    -1.13900000e-04, -9.17000000e-05, -7.41000000e-05, -6.01000000e-05,
+    -4.88000000e-05, -3.97000000e-05, -3.25000000e-05, -2.65000000e-05,
+    -2.17000000e-05, -1.76000000e-05, -1.44000000e-05, -1.17000000e-05,
+    -9.50000000e-06, -7.60000000e-06, -6.10000000e-06, -4.80000000e-06,
+    -3.70000000e-06, -2.90000000e-06, -2.10000000e-06, -1.40000000e-06,
+    -9.00000000e-07, -4.00000000e-07, 0.00000000e+00};
+
+static constexpr double y11cs[] = {
+    1.59990000e-01, 1.04880000e-01, 7.36000000e-02, 5.38700000e-02,
+    4.06100000e-02, 3.12900000e-02, 2.45200000e-02, 1.94900000e-02,
+    1.56600000e-02, 1.27100000e-02, 1.04000000e-02, 8.57000000e-03,
+    7.11000000e-03, 5.92000000e-03, 4.95000000e-03, 4.17000000e-03,
+    3.52000000e-03, 2.97000000e-03, 2.51000000e-03, 2.14000000e-03,
+    1.82000000e-03, 1.55000000e-03, 1.32000000e-03, 1.12000000e-03,
+    9.50000000e-04, 8.10000000e-04, 6.80000000e-04, 5.70000000e-04,
+    4.80000000e-04, 4.00000000e-04, 3.20000000e-04, 2.70000000e-04,
+    2.10000000e-04, 1.60000000e-04, 1.20000000e-04, 8.00000000e-05,
+    5.00000000e-05, 2.00000000e-05, 0.00000000e+00};
+
+static constexpr double y12cs[] = {
+    5.80310000e-03,  -1.96900000e-04, -2.24620000e-03, -2.82460000e-03,
+    -2.81500000e-03, -2.57600000e-03, -2.26330000e-03, -1.94570000e-03,
+    -1.65200000e-03, -1.39250000e-03, -1.16880000e-03, -9.78800000e-04,
+    -8.18900000e-04, -6.84900000e-04, -5.73000000e-04, -4.79600000e-04,
+    -4.01600000e-04, -3.36500000e-04, -2.82300000e-04, -2.36900000e-04,
+    -1.98700000e-04, -1.66800000e-04, -1.40000000e-04, -1.17300000e-04,
+    -9.82000000e-05, -8.20000000e-05, -6.84000000e-05, -5.67000000e-05,
+    -4.68000000e-05, -3.84000000e-05, -3.12000000e-05, -2.50000000e-05,
+    -1.97000000e-05, -1.51000000e-05, -1.12000000e-05, -7.80000000e-06,
+    -4.90000000e-06, -2.30000000e-06, 0.00000000e+00};
+
+static constexpr double rsgh[] = {
+    2.10000000e+00, 2.11000000e+00, 2.12000000e+00, 2.13000000e+00,
+    2.14000000e+00, 2.15000000e+00, 2.16000000e+00, 2.17000000e+00,
+    2.18000000e+00, 2.19000000e+00, 2.20000000e+00, 2.25000000e+00,
+    2.30000000e+00, 2.35000000e+00, 2.40000000e+00, 2.45000000e+00,
+    2.50000000e+00, 2.55000000e+00, 2.60000000e+00, 2.65000000e+00,
+    2.70000000e+00, 2.75000000e+00, 2.80000000e+00, 2.85000000e+00,
+    2.90000000e+00, 2.95000000e+00, 3.00000000e+00, 3.05000000e+00,
+    3.10000000e+00, 3.15000000e+00, 3.20000000e+00, 3.25000000e+00,
+    3.30000000e+00, 3.35000000e+00, 3.40000000e+00, 3.45000000e+00,
+    3.50000000e+00, 3.55000000e+00, 3.60000000e+00, 3.65000000e+00,
+    3.70000000e+00, 3.75000000e+00, 3.80000000e+00, 3.85000000e+00,
+    3.90000000e+00, 3.95000000e+00, 4.00000000e+00};
+
+static constexpr double x11gs[] = {
+    2.02675700e+00, 1.79415700e+00, 1.60105700e+00, 1.43840700e+00,
+    1.29974700e+00, 1.18030700e+00, 1.07649700e+00, 9.85595000e-01,
+    9.05431000e-01, 8.34337000e-01, 7.70931000e-01, 5.36914000e-01,
+    3.89924000e-01, 2.91795000e-01, 2.23413000e-01, 1.74196000e-01,
+    1.37866000e-01, 1.10486000e-01, 8.94920000e-02, 7.31530000e-02,
+    6.02730000e-02, 5.00040000e-02, 4.17330000e-02, 3.50110000e-02,
+    2.95020000e-02, 2.49530000e-02, 2.11710000e-02, 1.80080000e-02,
+    1.53460000e-02, 1.30940000e-02, 1.11780000e-02, 9.54200000e-03,
+    8.13800000e-03, 6.92901000e-03, 5.88300000e-03, 4.97500000e-03,
+    4.18500000e-03, 3.49400000e-03, 2.88800000e-03, 2.35590000e-03,
+    1.88630000e-03, 1.47180000e-03, 1.10500000e-03, 7.79300000e-04,
+    4.89100000e-04, 2.30600000e-04, 0.00000000e+00};
+
+static constexpr double x12gs[] = {
+    -2.01782500e+00, -1.78525500e+00, -1.59219500e+00, -1.42959500e+00,
+    -1.29095500e+00, -1.17156500e+00, -1.06780500e+00, -9.76940000e-01,
+    -8.96828000e-01, -8.25778000e-01, -7.62428000e-01, -5.28699000e-01,
+    -3.82021000e-01, -2.84233000e-01, -2.16214000e-01, -1.67376000e-01,
+    -1.31428000e-01, -1.04435000e-01, -8.38270000e-02, -6.78690000e-02,
+    -5.53630000e-02, -4.54540000e-02, -3.75310000e-02, -3.11420000e-02,
+    -2.59500000e-02, -2.17020000e-02, -1.82060000e-02, -1.53100000e-02,
+    -1.28990000e-02, -1.08820000e-02, -9.18700000e-03, -7.75598000e-03,
+    -6.54299000e-03, -5.51198000e-03, -4.63101000e-03, -3.87700000e-03,
+    -3.22699000e-03, -2.66901000e-03, -2.18600000e-03, -1.76600000e-03,
+    -1.40199000e-03, -1.08500000e-03, -8.08000000e-04, -5.66010000e-04,
+    -3.52990000e-04, -1.65000000e-04, 0.00000000e+00};
+
+static constexpr double y11gs[] = {
+    4.13886800e-02, 3.75377800e-02, 3.42245800e-02, 3.13491800e-02,
+    2.88322800e-02, 2.66164800e-02, 2.46530800e-02, 2.29039800e-02,
+    2.13386800e-02, 1.99318800e-02, 1.86613800e-02, 1.38389800e-02,
+    1.06807800e-02, 8.49198000e-03, 6.90678000e-03, 5.71738000e-03,
+    4.79848000e-03, 4.07128000e-03, 3.48418000e-03, 3.00218000e-03,
+    2.60068000e-03, 2.26268000e-03, 1.97518000e-03, 1.72848000e-03,
+    1.51548000e-03, 1.33048000e-03, 1.16878000e-03, 1.02707000e-03,
+    9.02260000e-04, 7.92050000e-04, 6.94320000e-04, 6.07540000e-04,
+    5.30230000e-04, 4.61250000e-04, 3.99530000e-04, 3.44250000e-04,
+    2.94620000e-04, 2.50000000e-04, 2.09830000e-04, 1.73600000e-04,
+    1.40880000e-04, 1.11300000e-04, 8.45200000e-05, 6.02500000e-05,
+    3.82200000e-05, 1.82000000e-05, 0.00000000e+00};
+
+static constexpr double y12gs[] = {
+    -3.20663998e-02, -2.84543998e-02, -2.53723998e-02, -2.27203998e-02,
+    -2.04252998e-02, -1.84231998e-02, -1.66682998e-02, -1.51228998e-02,
+    -1.37559998e-02, -1.25411998e-02, -1.14591998e-02, -7.50679980e-03,
+    -5.11039980e-03, -3.58899980e-03, -2.58849980e-03, -1.91129980e-03,
+    -1.44179980e-03, -1.10879980e-03, -8.67899800e-04, -6.90400800e-04,
+    -5.57201800e-04, -4.55599800e-04, -3.76798800e-04, -3.14699800e-04,
+    -2.65099800e-04, -2.24899800e-04, -1.91899800e-04, -1.64399800e-04,
+    -1.41497800e-04, -1.21899800e-04, -1.05299800e-04, -9.09988000e-05,
+    -7.84998000e-05, -6.77002000e-05, -5.81001000e-05, -4.96996000e-05,
+    -4.22695000e-05, -3.56794000e-05, -2.98401000e-05, -2.45399000e-05,
+    -1.98698000e-05, -1.56597000e-05, -1.18399000e-05, -8.42010000e-06,
+    -5.33000000e-06, -2.52950000e-06, 0.00000000e+00};
+
+static constexpr double y11hs[] = {
+    -1.93897170e-02, -1.93518170e-02, -1.91942170e-02, -1.89486170e-02,
+    -1.86386170e-02, -1.82817170e-02, -1.78912170e-02, -1.74775170e-02,
+    -1.70483170e-02, -1.66097170e-02, -1.61666170e-02, -1.39867170e-02,
+    -1.20005170e-02, -1.02675170e-02, -8.78215700e-03, -7.51826700e-03,
+    -6.44545700e-03, -5.53487700e-03, -4.76101700e-03, -4.10210700e-03,
+    -3.53977700e-03, -3.05875700e-03, -2.64624700e-03, -2.29165700e-03,
+    -1.98608700e-03, -1.72215700e-03, -1.49368700e-03, -1.29544700e-03,
+    -1.12308700e-03, -9.72917000e-04, -8.41807000e-04, -7.27120000e-04,
+    -6.26604000e-04, -5.38357000e-04, -4.60730000e-04, -3.92344000e-04,
+    -3.31976000e-04, -2.78612000e-04, -2.31361000e-04, -1.89457000e-04,
+    -1.52240000e-04, -1.19136000e-04, -8.96460000e-05, -6.33400000e-05,
+    -3.98430000e-05, -1.88240000e-05, 0.00000000e+00};
+
+static constexpr double y12hs[] = {
+    7.95613997e-02, 7.23663997e-02, 6.60993997e-02, 6.05963997e-02,
+    5.57303997e-02, 5.14003997e-02, 4.75263997e-02, 4.40443997e-02,
+    4.09023997e-02, 3.80543997e-02, 3.54663997e-02, 2.54763997e-02,
+    1.88113997e-02, 1.41783997e-02, 1.08573997e-02, 8.42039970e-03,
+    6.60139970e-03, 5.22039970e-03, 4.16079970e-03, 3.33870970e-03,
+    2.69489970e-03, 2.18679970e-03, 1.78249970e-03, 1.45918970e-03,
+    1.19879970e-03, 9.87899700e-04, 8.16400700e-04, 6.76300700e-04,
+    5.61199700e-04, 4.66298700e-04, 3.87701700e-04, 3.22401700e-04,
+    2.68101700e-04, 2.22399700e-04, 1.84301700e-04, 1.52200700e-04,
+    1.24897700e-04, 1.01998700e-04, 8.25007000e-05, 6.57997000e-05,
+    5.15990000e-05, 3.94993000e-05, 2.89995000e-05, 2.00998000e-05,
+    1.23996000e-05, 5.69970000e-06, 0.00000000e+00};
+
+static constexpr double rsm[] = {
+    2.10000000e+00, 2.11000000e+00, 2.12000000e+00, 2.13000000e+00,
+    2.14000000e+00, 2.15000000e+00, 2.16000000e+00, 2.17000000e+00,
+    2.18000000e+00, 2.19000000e+00, 2.20000000e+00, 2.25000000e+00,
+    2.30000000e+00, 2.35000000e+00, 2.40000000e+00, 2.45000000e+00,
+    2.50000000e+00, 2.55000000e+00, 2.60000000e+00, 2.65000000e+00,
+    2.70000000e+00, 2.75000000e+00, 2.80000000e+00, 2.85000000e+00,
+    2.90000000e+00, 2.95000000e+00, 3.00000000e+00, 3.05000000e+00,
+    3.10000000e+00, 3.15000000e+00, 3.20000000e+00, 3.25000000e+00,
+    3.30000000e+00, 3.35000000e+00, 3.40000000e+00, 3.45000000e+00,
+    3.50000000e+00, 3.55000000e+00, 3.60000000e+00, 3.65000000e+00,
+    3.70000000e+00, 3.75000000e+00, 3.80000000e+00, 3.85000000e+00,
+    3.90000000e+00, 3.95000000e+00, 4.00000000e+00};
+
+static constexpr double xms[] = {
+    2.75887000e+00, 2.44633000e+00, 2.18653000e+00, 1.96742000e+00,
+    1.78033000e+00, 1.61896000e+00, 1.47851000e+00, 1.35532000e+00,
+    1.24655000e+00, 1.14993000e+00, 1.06364000e+00, 7.43872000e-01,
+    5.41620000e-01, 4.05796000e-01, 3.10632000e-01, 2.41868000e-01,
+    1.90947000e-01, 1.52487000e-01, 1.22971000e-01, 9.99960000e-02,
+    8.19111000e-02, 6.75067000e-02, 5.59467000e-02, 4.65911000e-02,
+    3.89544000e-02, 3.26756000e-02, 2.74956000e-02, 2.31911000e-02,
+    1.95933000e-02, 1.65811000e-02, 1.40344000e-02, 1.18778000e-02,
+    1.00367000e-02, 8.48110000e-03, 7.14000000e-03, 5.98444000e-03,
+    4.99778000e-03, 4.14222000e-03, 3.39333000e-03, 2.74444000e-03,
+    2.18333000e-03, 1.68333000e-03, 1.26333000e-03, 8.88890000e-04,
+    5.56670000e-04, 2.64440000e-04, 0.00000000e+00};
+
+static constexpr double yms[] = {
+    8.07955110e-02, 7.36955110e-02, 6.76211110e-02, 6.23600110e-02,
+    5.77778110e-02, 5.37511110e-02, 5.01989110e-02, 4.70433110e-02,
+    4.42178110e-02, 4.16755110e-02, 3.93933110e-02, 3.06900110e-02,
+    2.49100110e-02, 2.07933110e-02, 1.76922110e-02, 1.52489110e-02,
+    1.32633110e-02, 1.16033110e-02, 1.01933110e-02, 8.98000100e-03,
+    7.91889100e-03, 6.99667100e-03, 6.18000100e-03, 5.46222100e-03,
+    4.83111100e-03, 4.26222100e-03, 3.75889100e-03, 3.31222100e-03,
+    2.91778100e-03, 2.56444100e-03, 2.24444100e-03, 1.96555100e-03,
+    1.71333100e-03, 1.48000100e-03, 1.28555100e-03, 1.10444100e-03,
+    9.41111000e-04, 7.90001000e-04, 6.67781000e-04, 5.51111000e-04,
+    4.37781000e-04, 3.45555000e-04, 2.63333000e-04, 1.88889000e-04,
+    1.12222000e-04, 5.33330000e-05, 0.00000000e+00};
+
+static constexpr double zms[] = {
+    8.64000330e-03, 8.31444330e-03, 8.00222330e-03, 7.69444330e-03,
+    7.41000330e-03, 7.13111330e-03, 6.86555330e-03, 6.61666330e-03,
+    6.37222330e-03, 6.14222330e-03, 5.91889330e-03, 4.92555330e-03,
+    4.11889330e-03, 3.46000330e-03, 2.91666330e-03, 2.47444330e-03,
+    2.09777330e-03, 1.78000330e-03, 1.52000330e-03, 1.30555330e-03,
+    1.11333330e-03, 9.58893300e-04, 8.23333300e-04, 7.08889300e-04,
+    6.15555300e-04, 5.33333300e-04, 4.58889300e-04, 4.01111300e-04,
+    3.45555300e-04, 2.98889300e-04, 2.54444300e-04, 2.17777300e-04,
+    1.92222300e-04, 1.63333300e-04, 1.42222300e-04, 1.23333300e-04,
+    9.88893000e-05, 9.00003000e-05, 7.11113000e-05, 6.22223000e-05,
+    4.44444000e-05, 4.22222000e-05, 3.44444000e-05, 2.33333000e-05,
+    2.00000000e-05, 1.55555000e-05, 0.00000000e+00};
diff --git a/libs/stokesian_dynamics/src/multi_array.hpp b/libs/stokesian_dynamics/src/multi_array.hpp
new file mode 100644
index 00000000000..368e78ecb36
--- /dev/null
+++ b/libs/stokesian_dynamics/src/multi_array.hpp
@@ -0,0 +1,253 @@
+/** @file
+ *  In this file, an array type of arbitrary dimensionality is provided,
+ *  which is suited for use on a parallel computing "device".
+ */
+
+#ifndef SD_MULTI_ARRAY_HPP
+#define SD_MULTI_ARRAY_HPP
+
+#include <cmath>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+
+#if defined(__HIPCC__) or defined(__CUDACC__)
+#define DEVICE_FUNC __host__ __device__
+#else
+#define DEVICE_FUNC
+#endif
+
+/// \cond
+namespace meta {
+
+// product
+
+template <typename T, T...>
+struct product;
+
+template <typename T, T dim0, T... dim>
+struct product<T, dim0, dim...> {
+    static constexpr T const value = dim0 * product<T, dim...>::value;
+};
+
+template <typename T>
+struct product<T> {
+    static constexpr T const value = 1;
+};
+
+///
+/// compile-time logical AND between all parameters of the parameter pack
+///
+template <bool...>
+struct all;
+
+// If current parameter is true, it depends on the other parameters.
+template <bool... b>
+struct all<true, b...> {
+    static constexpr bool const value = all<b...>::value;
+};
+
+// If current parameter is false, the whole thing is false.
+template <bool... b>
+struct all<false, b...> {
+    static constexpr bool const value = false;
+};
+
+template <>
+struct all<> {
+    static constexpr bool const value = true;
+};
+
+// linearized_index
+
+template <size_t n, size_t... N>
+struct linearized_index {
+    template <typename... Idx>
+    DEVICE_FUNC constexpr std::size_t operator()(Idx... idx) const noexcept {
+        using unpack = std::size_t[];
+        return unpack{std::size_t(idx)...}[n] +
+               unpack{std::size_t(N)...}[n] *
+                   linearized_index<n - 1, N...>{}(idx...);
+    }
+};
+
+template <size_t... N>
+struct linearized_index<0, N...> {
+    template <typename... Idx>
+    DEVICE_FUNC constexpr std::size_t operator()(Idx... idx) const noexcept {
+        using unpack = std::size_t[];
+        return unpack{std::size_t(idx)...}[0];
+    }
+};
+
+// check_bounds
+
+template <size_t n, size_t... N>
+struct check_bounds {
+    template <typename... Idx>
+    constexpr bool operator()(Idx... idx) const {
+        using unpack = std::size_t[];
+        return unpack{std::size_t(idx)...}[n] < unpack{std::size_t(N)...}[n]
+                   ? check_bounds<n - 1, N...>{}(idx...)
+                   : throw std::out_of_range("index out of bounds: " +
+                                             std::to_string(n));
+    }
+};
+
+template <size_t... N>
+struct check_bounds<0, N...> {
+    template <typename... Idx>
+    constexpr bool operator()(Idx... idx) const {
+        using unpack = std::size_t[];
+        return unpack{std::size_t(idx)...}[0] < unpack{std::size_t(N)...}[0]
+                   ? false
+                   : throw std::out_of_range("index out of bounds: " +
+                                             std::to_string(0));
+    }
+};
+
+} // namespace meta
+/// \endcond
+
+/// Multidimensional array similar to `std::array`
+///
+/// Storage is always in row-major order for easy list initialization
+///
+/// \tparam T value type
+/// \tparam N list of dimensions
+template <typename T, std::size_t... N>
+class multi_array {
+public:
+    using value_type = T;
+    using size_type = std::size_t;
+    using difference_type = std::ptrdiff_t;
+    using reference = value_type &;
+    using const_reference = const value_type &;
+    using pointer = value_type *;
+    using const_pointer = const value_type *;
+    using iterator = pointer;
+
+private:
+    template <size_type... s>
+    using size_product = meta::product<size_type, s...>;
+
+    // Storage
+    value_type m_data[size_product<N...>::value] = {0};
+
+public:
+    DEVICE_FUNC constexpr multi_array() = default;
+
+    /// Constructor for aggregate initializers
+    template <typename... U>
+    DEVICE_FUNC constexpr multi_array(U... data)
+        : m_data{value_type(data)...} {}
+
+    /// fill the container with specified value
+    DEVICE_FUNC void fill(value_type const &value) noexcept(
+        std::is_nothrow_copy_assignable<value_type>::value) {
+        for (iterator it = begin(); it != end(); ++it) {
+            *it = value;
+        }
+    }
+
+    /// swaps the contents
+    DEVICE_FUNC void swap(multi_array &other) noexcept(
+        std::is_nothrow_move_assignable<value_type>::value) {
+        iterator oit = other.begin();
+        for (iterator it = begin(); it != end(); ++it, (void)++oit) {
+            value_type tmp = std::move(*it);
+            *it = std::move(*oit);
+            *oit = std::move(tmp);
+        }
+    }
+
+    /// returns an iterator to the beginning
+    DEVICE_FUNC constexpr iterator begin() const noexcept {
+        return iterator(data());
+    }
+
+    /// returns an iterator to the end
+    DEVICE_FUNC constexpr iterator end() const noexcept {
+        return iterator(data() + size_product<N...>::value);
+    }
+
+    /// direct access to the underlying array
+    DEVICE_FUNC constexpr pointer data() noexcept { return m_data; }
+    /// \overload DEVICE_FUNC constexpr pointer data()
+    DEVICE_FUNC constexpr const_pointer data() const noexcept {
+        return m_data;
+    }
+
+    /// access specified element
+    ///
+    /// \throws std::out_of_range if index is out of range (only in DEBUG mode)
+    template <typename... Idx>
+    DEVICE_FUNC constexpr reference operator()(Idx... idx)
+#ifdef NDEBUG
+        noexcept
+#endif
+    {
+        static_assert(sizeof...(idx) == sizeof...(N), "dimension mismatch");
+        static_assert(
+            meta::all<std::is_convertible<Idx, size_type>::value...>::value,
+            "type mismatch");
+        return
+#if !defined(NDEBUG) && !defined(__CUDACC__) && !defined(__HIPCC__)
+            meta::check_bounds<sizeof...(idx) - 1, N...>{}(idx...),
+#endif
+            m_data[meta::linearized_index<sizeof...(idx) - 1, N...>{}(idx...)];
+    }
+
+    /// \overload
+    template <typename... Idx>
+    DEVICE_FUNC constexpr const_reference operator()(Idx... idx) const
+#ifdef NDEBUG
+        noexcept
+#endif
+    {
+        static_assert(sizeof...(idx) == sizeof...(N), "dimension mismatch");
+        static_assert(
+            meta::all<std::is_convertible<Idx, size_type>::value...>::value,
+            "type mismatch");
+        return
+#if !defined(NDEBUG) && !defined(__CUDACC__) && !defined(__HIPCC__)
+            meta::check_bounds<sizeof...(idx) - 1, N...>{}(idx...),
+#endif
+            m_data[meta::linearized_index<sizeof...(idx) - 1, N...>{}(idx...)];
+    }
+
+    /// returns the size along dimension \p i
+    ///
+    /// \tparam i dimension
+    template <size_type i>
+    DEVICE_FUNC constexpr size_type size() const noexcept {
+        static_assert(i < sizeof...(N), "index out of bounds");
+        using unpack = std::size_t[];
+        return unpack{std::size_t(N)...}[i];
+    }
+
+    /// returns the number of elements
+    DEVICE_FUNC constexpr size_type size() const noexcept {
+        return size_product<N...>::value;
+    }
+};
+
+/// Computes the outer product of two vectors, i.e. one-dimensional variable of
+/// type \ref multi_array.
+///
+/// \return two-dimensional \ref multi_array
+template <typename T, std::size_t M, std::size_t N>
+DEVICE_FUNC constexpr multi_array<T,M,N> outer(multi_array<T,M> const & a, multi_array<T,N> const & b) {
+    multi_array<T,M,N> c;
+    for (std::size_t i = 0; i < M; ++i) {
+        for (std::size_t j = 0; j < N; ++j) {
+            c(i, j) = a(i) * b(j);
+        }
+    }
+    return c;
+}
+
+
+#undef DEVICE_FUNC
+
+#endif
diff --git a/libs/stokesian_dynamics/src/sd.hpp b/libs/stokesian_dynamics/src/sd.hpp
new file mode 100644
index 00000000000..b66ac5c6c3c
--- /dev/null
+++ b/libs/stokesian_dynamics/src/sd.hpp
@@ -0,0 +1,1594 @@
+#ifndef SD_HPP
+#define SD_HPP
+
+#include <vector>
+#include <cmath>
+
+#include "device_matrix.hpp"
+#include "multi_array.hpp"
+#include "thrust_wrapper.hpp"
+
+
+#if defined(__CUDACC__)
+#include <curand_kernel.h>
+#elif defined(__HIPCC__)
+#include <rocrand/rocrand.hpp>
+#else
+#include <Random123/philox.h>
+typedef r123::Philox2x64 RNG;
+#endif
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+#define DEVICE_FUNC __host__ __device__
+#else
+#define DEVICE_FUNC
+#endif
+
+
+/** \file
+ *  This file contains computations required for Stokesian Dynamics, namely
+ *  the particle's translational and angular velocities are computed from the
+ *  particle's positions and radii and the forces and torques that are acting
+ *  on the particles.
+ *
+ *  In its current implementation, the neccessary subroutines are implemented
+ *  as functors. The main functor is called solver, which wraps up the
+ *  functionality of all the other functors in this file. For details, please
+ *  see the description of the solver functor.
+ *
+ *  For details on the thermalization, please see the description of the
+ *  thermalizer functor.
+ *
+ *  To understand the Stokesian Dynamics method, it is recommended to read
+ *  @cite durlofsky87a, because it describes the method that is
+ *  implemented here, apart from the thermalization.
+ *
+ *      "Dynamic simulation of hydrodynamically interacting suspensions"
+ *      https://core.ac.uk/download/pdf/4895234.pdf
+ *
+ *  All references to formulae in this file refer to the paper
+ *  @cite durlofsky87a, unless otherwise noted.
+ *
+ *  As they state in the paragraph below equation (2.17) it may be useful to
+ *  also read @cite jeffrey84a to better understand the notation used to
+ *  compute the grand mobility matrix in this Stokesian Dynamics method.
+ *
+ *      "Calculation of the resistance and mobility functions for two unequal
+ *      rigid spheres in low-Reynolds-number flow"
+ *      https://doi.org/10.1017/S0022112084000355
+ *
+ *  Both the F-T-version and the F-T-S-version have been implemented and can be
+ *  selected via the FTS flag.
+ *
+ *  The general Stokesian Dynamics method allows to impose an external shear
+ *  flow on the system. This has not been included in the ESPResSo interface,
+ *  but could easily be included by passing the shear flow tensor to the SD
+ *  routine and some almost trivial additional initialization.
+ *
+ *  Also, the figures in @cite cortez15a might help with an intuitive
+ *  understanding of dipole, stokeslet and rotlet flow (missing out only on
+ *  the stresslet flow).
+ *
+ *      https://doi.org/10.1016/j.jcp.2015.01.019
+ *
+ */
+
+
+namespace sd {
+
+namespace flags {
+enum flags {
+    NONE = 0,
+    SELF_MOBILITY = 1 << 0,
+    PAIR_MOBILITY = 1 << 1,
+    LUBRICATION = 1 << 2,
+    FTS = 1 << 3,
+};
+}
+
+/** This functor computes the distance between all particle pairs and checks
+ *  for overlaps.
+ *  A matrix containing unit vectors along particle center connection lines,
+ *  and actual distances is the result of this operation.
+ *  If there is a pair of particles which overlap, their distance
+ *  will be set to NAN, which eventually causes NAN to appear all over the
+ *  simulation. No warning is printed out, because the sheer presence of a
+ *  print command could impair the parallel execution performance, even if
+ *  there are no overlaps.
+ */
+template <typename Policy, typename T>
+struct check_dist {
+    /** contains particle positions */
+    device_vector_view<T, Policy> const x;
+    /** contains particle radii */
+    device_vector_view<T, Policy> const a;
+    /** contains various distance information.
+     *  pd[0] through pd[2] contain a unit vector connecting the particles
+     *  pd[3] contains the actual distance
+     */
+    device_matrix_view<T, Policy> pd;
+    /** this is a list of all pairs, containing the indices of particles
+     *  that belong to each pair
+     */
+    device_matrix_view<std::size_t, Policy> const part_id;
+
+    DEVICE_FUNC void operator()(std::size_t i) {
+        std::size_t k = 6 * part_id(0, i);
+        std::size_t j = 6 * part_id(1, i);
+
+        T dx = x(j + 0) - x(k + 0);
+        T dy = x(j + 1) - x(k + 1);
+        T dz = x(j + 2) - x(k + 2);
+#if defined(__HIPCC__)
+        T dr = sqrtf(dx * dx + dy * dy + dz * dz);
+#else
+        T dr = std::sqrt(dx * dx + dy * dy + dz * dz);
+#endif
+        T dr_inv = 1 / dr;
+
+        // Check if particles overlap and if so, cause NAN to appear
+        // throughout the simulation
+        // Currently out of use, though, since Lubrication isn't supported
+        // and this check isn't necessary for far field approximation
+        // if (dr <= a(part_id(0, i)) + a(part_id(1, i))) {
+        //     dr = NAN;
+        // }
+
+        pd(0, i) = dx * dr_inv;
+        pd(1, i) = dy * dr_inv;
+        pd(2, i) = dz * dr_inv;
+        pd(3, i) = dr;
+    }
+};
+
+
+
+
+// The expression for the mobility matrix is given in Appendix A of
+//
+//   Durlofsky, L. and Brady, J.F. and Bossis G., J . Fluid Mech. 180, 21-49
+//   (1987) https://authors.library.caltech.edu/32068/1/DURjfm87.pdf
+//
+// The submatrices mob_a, mob_b, mob_c, mob_gt, mob_ht, mob_m are
+// given by equation (A 2).
+//
+// Alternatively, the same expressions are given in
+//
+//   Kim, S. and Mifflin, R.T., Physics of Fluids 28, 2033 (1985)
+//   https://doi.org/10.1063/1.865384
+
+
+
+
+
+/** The mobility functors fill in the elements of the grand mobility matrix.
+ *  The goal is to have a numerical representation of the grand mobility
+ *  matrix that can be found in equations (A 1) to (A 3).
+ *  There are two mobility functors. One with isself=true and one with
+ *  isself=false. They are described below.
+ *
+ *  To achieve this goal, all the sub-tensors a_mn, b_mn, c_mn, g_mn, h_mn and
+ *  m_mn have to be computed. (m and n are used here instead of alpha and
+ *  beta). The subscript indices denote single particles.
+ *  This is done using the detailed description of the grand mobility matrix
+ *  in equation (A 2).
+ *
+ *  In this source file, those sub-tensors are merged into larger sub-tensors
+ *  of the grand mobility matrix. They are called mob_a, mob_b, mob_c, mob_gt,
+ *  mob_ht and mob_m, respectively. (The blocks of the grand mobility matrix in
+ *  (A 1) with common letters)
+ *
+ *  Those blocks are further merged. mob_a, mob_b and its mirror, and mob_c
+ *  are merged into zmuf in the source code. mob_gt and mob_ht are merged into
+ *  zmus. And mob_m is represented by zmes. To make that clear, compare
+ *  equations (A 1) and (2.17). (Not entirely correct!! Blocks are organized
+ *  in (UF) Blocks per particle)
+ *
+ *  The functor with isself=true computes the self-contributions of the grand
+ *  mobility matrix, i.e. the sub-tensors a_mn, c_mn and m_mn that lie on the
+ *  diagonal. For those sub-tensors, x_11 and y_11 from equation (A 3) have to
+ *  be plugged into equation (A 2) (choose the one with matching superscript).
+ *
+ *  The functor with isself=false computes all the other contributions. Because
+ *  there is always two particles involved for each contribution, they depend
+ *  on the distance between the particle centers. For these expressions, x_12
+ *  and y_12 from equation (A 3) have to be plugged into equation (A 2).
+ */
+template <typename Policy, typename T, bool isself>
+struct mobility;
+
+/** The functor that computes the self contribution to the mobility matrix of
+ *  all particles. For further information, see the description of
+ * \ref mobility .
+ */
+template <typename Policy, typename T>
+struct mobility<Policy, T, true> {
+    device_matrix_view<T, Policy> zmuf, zmus, zmes;
+    device_vector_view<T, Policy> const a;
+    T const eta;
+    int const flg;
+
+    // Determine the self contribution
+    // This is independent of dr_inv, dx, dy, dz
+    // (that is, the distance between particle centers)
+    DEVICE_FUNC void operator()(std::size_t part_id) {
+        // Fill the self mobility terms
+        // These are the sub-tensors of the grand mobility matrix as shown
+        // in equation (A 1) that are located on the diagonal.
+        // This functor fills in the sub-tensors for one single particle and
+        // has to be executed for each particle.
+
+        // mob_a, mob_c and mob_m are templates for those sub-tensors that just
+        // have to be rescaled by a constant factor.
+        static constexpr multi_array<T, 3, 3> const mob_a = {
+            // clang-format off
+            1, 0, 0,
+            0, 1, 0,
+            0, 0, 1
+            // clang-format on
+        };
+        static constexpr multi_array<T, 3, 3> const mob_c = {
+            // clang-format off
+            3./4., 0    , 0    ,
+            0    , 3./4., 0    ,
+            0    , 0    , 3./4.
+            // clang-format on
+        };
+        static constexpr multi_array<T, 5, 5> const mob_m = {
+            // clang-format off
+            9./5. , 0    , 0    , 0    , 9./10.,
+            0     , 9./5., 0    , 0    , 0     ,
+            0     , 0    , 9./5., 0    , 0     ,
+            0     , 0    , 0    , 9./5., 0     ,
+            9./10., 0    , 0    , 0    , 9./5.
+            // clang-format on
+        };
+
+
+        // Compute where the self mobility submatrices of the current particle
+        // are located in the grand mobility matrix.
+        // For velocities/forces, there are 6 independent components.
+        // (3 translation and 3 rotation)
+        // For shear rate/stresslets there are 5 independent components.
+        std::size_t const ph1 = 6 * part_id;
+        std::size_t const ph2 = ph1 + 3;
+        std::size_t const ph3 = 5 * part_id;
+
+        // These are the non-dimensionalizations as stated in the paragraph
+        // below equation (A 1).
+        auto const visc1 = T{M_1_PI / 6. / eta / a(part_id)};
+        auto const visc2 = T{visc1 / a(part_id)};
+        auto const visc3 = T{visc2 / a(part_id)};
+
+        // Now put the entries into the grand mobility matrix.
+        for (std::size_t i = 0; i < 3; ++i) {
+            zmuf(ph1 + i, ph1 + i) = visc1 * mob_a(i, i);
+            zmuf(ph2 + i, ph2 + i) = visc3 * mob_c(i, i);
+        }
+
+        for (std::size_t i = 0; i < 5; ++i) {
+            for (std::size_t j = 0; j < 5; ++j) {
+                zmes(ph3 + i, ph3 + j) = visc3 * mob_m(i, j);
+            }
+        }
+    }
+};
+
+/** The functor that computes all pair contributions to the mobility matrix.
+ *  For further information, see the description of \ref mobility .
+ */
+template <typename Policy, typename T>
+struct mobility<Policy, T, false> {
+    device_matrix_view<T, Policy> zmuf, zmus, zmes;
+    device_matrix_view<T, Policy> const pd;
+    device_matrix_view<std::size_t, Policy> const part_id;
+    device_vector_view<T, Policy> const a;
+    T const eta;
+    int const flg;
+
+    // Determine the pair contribution
+    DEVICE_FUNC void operator()(std::size_t pair_id) {
+        // Kronecker-Delta
+        static constexpr multi_array<T, 3, 3> const delta = {
+            // clang-format off
+            1, 0, 0,
+            0, 1, 0,
+            0, 0, 1
+            // clang-format on
+        };
+
+        // Levi-Civita tensor
+        static constexpr multi_array<T, 3, 3, 3> const eps = {
+            // clang-format off
+            0, 0, 0,   0, 0, 1,   0,-1, 0,
+            0, 0,-1,   0, 0, 0,   1, 0, 0,
+            0, 1, 0,  -1, 0, 0,   0, 0, 0
+            // clang-format on
+        };
+
+        // Lookup table for the linearization of the shear rate tensor E and
+        // the stresslet tensor S in equation (A 1).
+        static constexpr multi_array<std::size_t, 2, 5> const mesid = {
+            // clang-format off
+            0, 0, 0, 1, 1,
+            2, 1, 2, 2, 2
+            // clang-format on
+        };
+
+        // particle ids of the involved particles
+        std::size_t ph1 = part_id(0, pair_id);
+        std::size_t ph2 = part_id(1, pair_id);
+        // These are the non-dimensionalizations as stated in the paragraph
+        // below equation (A 1).
+        // However, modified, so that the case with two unequal spheres is
+        // covered.
+        T a12 = T{.5} * (a(ph1) + a(ph2));
+        auto const visc1 = T{M_1_PI / 6. / eta / a12};
+        auto const visc2 = T{visc1 / a12};
+        auto const visc3 = T{visc2 / a12};
+
+        // Components of unit vector along particle connection line as
+        // described in paragraph below equation (A 1).
+        T dx = pd(0, pair_id);
+        T dy = pd(1, pair_id);
+        T dz = pd(2, pair_id);
+        // Non-dimensionalized inverted distance between particles 1/r
+        T dr_inv = a12 / pd(3, pair_id);
+
+        // Combine components into unit vector along particle connection line
+        multi_array<T, 3> e = {dx, dy, dz};
+
+        // This creates a lookup-table for the many e_i * e_j like
+        // multiplications in equation (A 2).
+        auto ee = outer(e, e);
+
+        // Several powers of inverted inter-particle distance
+        T dr_inv2 = dr_inv * dr_inv;
+        T dr_inv3 = dr_inv2 * dr_inv;
+        T dr_inv4 = dr_inv3 * dr_inv;
+        T dr_inv5 = dr_inv4 * dr_inv;
+
+        // The following scalar mobility functions can be found in
+        // equation (A 3).
+        T x12a = T{3. / 2.} * dr_inv - dr_inv3;
+        T y12a = T{3. / 4.} * dr_inv + T{1. / 2.} * dr_inv3;
+
+        T y12b = T{-3. / 4.} * dr_inv2;
+
+        T x12c = T{3. / 4.} * dr_inv3;
+        T y12c = T{-3. / 8.} * dr_inv3;
+
+        T x12g = T{9. / 4.} * dr_inv2 - T{18. / 5.} * dr_inv4;
+        T y12g = T{6. / 5.} * dr_inv4;
+
+        T y12h = T{-1.0 * 9. / 8.} * dr_inv3;
+
+        T x12m = T{-1.0 * 9. / 2.} * dr_inv3 + T{54. / 5.} * dr_inv5;
+        T y12m = T{9. / 4.} * dr_inv3 - T{36. / 5.} * dr_inv5;
+        T z12m = T{9. / 5.} * dr_inv5;
+
+        // Equation (A 2) first, second, and third line
+        multi_array<T, 3, 3> mob_a;
+        multi_array<T, 3, 3> mob_b;
+        multi_array<T, 3, 3> mob_c;
+        for (std::size_t i = 0; i < 3; ++i) {
+            for (std::size_t j = 0; j < 3; ++j) {
+                // if i and j are different from one another, this returns the
+                // "other" index missing in the set {0, 1, 2}
+                std::size_t k = (3 - i - j) % 3;
+
+                mob_a(i, j) = x12a * ee(i, j) + y12a * (delta(i, j) - ee(i, j));
+                mob_b(i, j) = y12b * eps(i, j, k) * e(k);
+                mob_c(i, j) = x12c * ee(i, j) + y12c * (delta(i, j) - ee(i, j));
+            }
+        }
+
+        // Equation (A 2) fourth and fifth line
+        multi_array<T, 3, 3, 3> gt;
+        multi_array<T, 3, 3, 3> ht;
+        for (std::size_t k = 0; k < 3; ++k) {
+            for (std::size_t i = 0; i < 3; ++i) {
+                for (std::size_t j = 0; j < 3; ++j) {
+                    gt(k, i, j) =
+                        -(x12g * (ee(i, j) - T{1. / 3.} * delta(i, j)) * e(k) +
+                          y12g * (e(i) * delta(j, k) + e(j) * delta(i, k) -
+                                  T{2.0} * ee(i, j) * e(k)));
+
+                    std::size_t l = (3 - j - k) % 3;
+                    std::size_t m = (3 - i - k) % 3;
+                    ht(k, i, j) = y12h * (ee(i, l) * eps(j, k, l) +
+                                          ee(j, m) * eps(i, k, m));
+                }
+            }
+        }
+
+        // Equation (A 2) sixth line
+        multi_array<T, 3, 3, 3, 3> m;
+        for (std::size_t i = 0; i < 3; ++i) {
+            for (std::size_t j = 0; j < 3; ++j) {
+                for (std::size_t k = 0; k < 3; ++k) {
+                    for (std::size_t l = 0; l < 3; ++l) {
+                        m(i, j, k, l) =
+                            T{3. / 2.} * x12m *
+                                (ee(i, j) - T{1. / 3.} * delta(i, j)) *
+                                (ee(k, l) - T{1. / 3.} * delta(k, l)) +
+                            T{1. / 2.} * y12m *
+                                (ee(i, k) * delta(j, l) +
+                                 ee(j, k) * delta(i, l) +
+                                 ee(i, l) * delta(j, k) +
+                                 ee(j, l) * delta(i, k) -
+                                 T{4.0} * ee(i, j) * ee(k, l)) +
+                            T{1. / 2.} * z12m *
+                                (delta(i, k) * delta(j, l) +
+                                 delta(j, k) * delta(i, l) -
+                                 delta(i, j) * delta(k, l) +
+                                 ee(i, j) * delta(k, l) +
+                                 ee(k, l) * delta(i, j) -
+                                 ee(i, k) * delta(j, l) -
+                                 ee(j, k) * delta(i, l) -
+                                 ee(i, l) * delta(j, k) -
+                                 ee(j, l) * delta(i, k) + ee(i, j) * ee(k, l));
+                    }
+                }
+            }
+        }
+
+        // This segment of code converts the pair contributions to the grand
+        // mobility tensor into a symmetric matrix.  Using the following
+        // conversions of the shear rate E and the stresslet S into the vectors
+        // EV and SV respectively. EV_1 = E_11 - E_33, EV_2 = 2 E_12, EV_3 = 2
+        // E_13, EV_4 = 2 E_23, EV_5 = E_22 - E_33 SV_1 = S_11, SV_2 = S_12 =
+        // S_21, SV_3 = S_13 = S_31, SV_4 = S_23 = S_32, SV_5 = S_22
+        multi_array<T, 3, 5> mob_gt;
+        multi_array<T, 3, 5> mob_ht;
+        for (std::size_t i = 0; i < 3; ++i) {
+            mob_gt(i, 0) = gt(i, 0, 0) - gt(i, 2, 2);
+            mob_gt(i, 1) = T{2.0} * gt(i, 0, 1);
+            mob_gt(i, 2) = T{2.0} * gt(i, 0, 2);
+            mob_gt(i, 3) = T{2.0} * gt(i, 1, 2);
+            mob_gt(i, 4) = gt(i, 1, 1) - gt(i, 2, 2);
+
+            mob_ht(i, 0) = ht(i, 0, 0) - ht(i, 2, 2);
+            mob_ht(i, 1) = T{2.0} * ht(i, 0, 1);
+            mob_ht(i, 2) = T{2.0} * ht(i, 0, 2);
+            mob_ht(i, 3) = T{2.0} * ht(i, 1, 2);
+            mob_ht(i, 4) = ht(i, 1, 1) - ht(i, 2, 2);
+        }
+
+        multi_array<T, 5, 5> mob_m;
+        for (std::size_t i = 0; i < 5; ++i) {
+            if (i == 0 || i == 4) {
+                mob_m(i, 0) = m(mesid(0, i), mesid(0, i), 0, 0) -
+                              m(mesid(0, i), mesid(0, i), 2, 2) -
+                              (m(mesid(1, i), mesid(1, i), 0, 0) -
+                               m(mesid(1, i), mesid(1, i), 2, 2));
+                mob_m(i, 1) = T{2.0} * (m(mesid(0, i), mesid(0, i), 0, 1) -
+                                        m(mesid(1, i), mesid(1, i), 0, 1));
+                mob_m(i, 2) = T{2.0} * (m(mesid(0, i), mesid(0, i), 0, 2) -
+                                        m(mesid(1, i), mesid(1, i), 0, 2));
+                mob_m(i, 3) = T{2.0} * (m(mesid(0, i), mesid(0, i), 1, 2) -
+                                        m(mesid(1, i), mesid(1, i), 1, 2));
+                mob_m(i, 4) = m(mesid(0, i), mesid(0, i), 1, 1) -
+                              m(mesid(0, i), mesid(0, i), 2, 2) -
+                              (m(mesid(1, i), mesid(1, i), 1, 1) -
+                               m(mesid(1, i), mesid(1, i), 2, 2));
+            } else {
+                mob_m(i, 0) = T{2.0} * (m(mesid(0, i), mesid(1, i), 0, 0) -
+                                        m(mesid(0, i), mesid(1, i), 2, 2));
+                mob_m(i, 1) = T{4.0} * m(mesid(0, i), mesid(1, i), 0, 1);
+                mob_m(i, 2) = T{4.0} * m(mesid(0, i), mesid(1, i), 0, 2);
+                mob_m(i, 3) = T{4.0} * m(mesid(0, i), mesid(1, i), 1, 2);
+                mob_m(i, 4) = T{2.0} * (m(mesid(0, i), mesid(1, i), 1, 1) -
+                                        m(mesid(0, i), mesid(1, i), 2, 2));
+            }
+        }
+
+        // Fill the pair mobility terms. All the necessary values have been
+        // computed in the previous lines of code. Now they need to be
+        // distributed to the correct locations in the grand mobility matrix.
+
+        // Compute where the various submatrices of the current particle pair
+        // are located in the grand mobility matrix.
+        // For velocities/forces, there are 6 independent components.
+        // (3 translation and 3 rotation)
+        // For shear rate/stresslets there are 5 independent components.
+        std::size_t ph5 = 5 * ph1;
+        std::size_t ph6 = 5 * ph2;
+
+        ph1 = 6 * ph1;
+        ph2 = 6 * ph2;
+
+        std::size_t ph3 = ph1 + 3;
+        std::size_t ph4 = ph2 + 3;
+
+        // Now copy values into the correct locations in the "big" matrix
+        // and apply scaling
+        for (std::size_t i = 0; i < 3; ++i) {
+            for (std::size_t j = 0; j < 3; ++j) {
+                zmuf(ph1 + i, ph2 + j) = visc1 * mob_a(i, j);
+                zmuf(ph3 + i, ph2 + j) = visc2 * mob_b(i, j);
+                zmuf(ph1 + i, ph4 + j) =
+                    -visc2 * mob_b(j, i); // mob_b transpose
+                zmuf(ph3 + i, ph4 + j) = visc3 * mob_c(i, j);
+
+                zmuf(ph2 + i, ph1 + j) = visc1 * mob_a(j, i);
+                zmuf(ph4 + i, ph1 + j) = visc2 * mob_b(j, i);
+                zmuf(ph2 + i, ph3 + j) =
+                    -visc2 * mob_b(i, j); // mob_b transpose
+                zmuf(ph4 + i, ph3 + j) = visc3 * mob_c(j, i);
+            }
+
+            for (std::size_t j = 0; j < 5; ++j) {
+                // The paragraph under equation (A 1) claims that we would need
+                // exponent n=3, but n=2 yields correct results.
+                // Needs to be analytically verified.
+                zmus(ph1 + i, ph6 + j) = visc2 * mob_gt(i, j);
+                zmus(ph2 + i, ph5 + j) = T{-1.0} * visc2 * mob_gt(i, j);
+
+                // We don't know whether this is the correct exponent.
+                zmus(ph3 + i, ph6 + j) = visc3 * mob_ht(i, j);
+                zmus(ph4 + i, ph5 + j) = visc3 * mob_ht(i, j);
+            }
+        }
+
+        for (std::size_t i = 0; i < 5; ++i) {
+            for (std::size_t j = 0; j < 5; ++j) {
+                // We don't know whether this is the correct exponent.
+                zmes(ph5 + i, ph6 + j) = visc3 * mob_m(i, j);
+                zmes(ph6 + i, ph5 + j) = visc3 * mob_m(j, i);
+            }
+        }
+    }
+};
+
+
+
+/** The lubrication functor adds pair wise interactions to the resistance
+ *  matrix. This section of code has not been thoroughly commented yet.
+ *  Most of the comments are an educated guess of what is happening.
+ */
+template <typename Policy, typename T>
+struct lubrication {
+    device_matrix_view<T, Policy> rfu, rfe, rse;
+    device_matrix_view<T, Policy> const pd;
+    device_matrix_view<std::size_t, Policy> const part_id;
+    device_vector_view<T, Policy> const a;
+    T const eta;
+    int const flg;
+
+    // Add the lubrication forces to the mobility inverse (i.e. the grand
+    // resistance matrix).
+    // Lubrication interactions are added pair-wise.
+    DEVICE_FUNC void operator()(std::size_t pair_id) {
+        T dx = pd(0, pair_id);
+        T dy = pd(1, pair_id);
+        T dz = pd(2, pair_id);
+        multi_array<T, 3> d = {dx, dy, dz};
+        T dr = pd(3, pair_id);
+
+        // non-dimensionalization of the distance for lubrication cutoff
+        // TODO: is that actually correct for spheres with different radii?
+        T a12 = T{.5} * (a(part_id(0, pair_id)) + a(part_id(1, pair_id)));
+
+        if (dr/a12 < 4.0) {
+
+            // Compute indices that are needed to fill in the results of
+            // calc_lub() into the correct locations of the grand resistance
+            // matrix (the mobility inverse).
+            std::size_t i = part_id(0, pair_id);
+            std::size_t j = part_id(1, pair_id);
+
+            std::size_t ira = i * 6;
+            std::size_t irg = ira;
+            std::size_t irm = i * 5;
+            std::size_t icg = irm;
+
+            std::size_t jca = j * 6;
+            std::size_t jrg = jca;
+            std::size_t jcm = j * 5;
+            std::size_t jcg = jcm;
+
+            // Compute the actual lubrication correction
+            multi_array<T, 12, 12> tabc;
+            multi_array<T, 12, 10> tght;
+            multi_array<T, 10, 10> tzm;
+            calc_lub(pair_id, dr, d, tabc, tght, tzm);
+
+            // Fill in the values to the appropriate locations in the
+            // mobility inverse.
+            for (std::size_t jc = 0; jc < 6; ++jc) {
+                std::size_t jl = jc + 6;
+                std::size_t j1 = ira + jc;
+                std::size_t j2 = jca + jc;
+
+                for (std::size_t ir = 0; ir < jc + 1; ++ir) {
+                    std::size_t il = ir + 6;
+                    std::size_t i1 = ira + ir;
+                    std::size_t i2 = jca + ir;
+
+                    rfu(i1, j1) += tabc(ir, jc);
+                    rfu(i2, j2) += tabc(il, jl);
+                }
+            }
+            for (std::size_t jc = 6; jc < 12; ++jc) {
+                std::size_t j1 = jca + jc - 6;
+
+                for (std::size_t ir = 0; ir < 6; ++ir) {
+                    std::size_t i1 = ira + ir;
+
+                    rfu(i1, j1) += tabc(ir, jc);
+                }
+            }
+
+            if (flg & flags::FTS) {
+                for (std::size_t jc = 0; jc < 5; ++jc) {
+                    std::size_t jl = jc + 5;
+                    std::size_t j1 = icg + jc;
+                    std::size_t j2 = jcg + jc;
+
+                    for (std::size_t ir = 0; ir < 6; ++ir) {
+                        std::size_t il = ir + 6;
+                        std::size_t i1 = irg + ir;
+                        std::size_t i2 = jrg + ir;
+
+                        rfe(i1, j1) += tght(ir, jc);
+                        rfe(i2, j2) += tght(il, jl);
+                        rfe(i1, j2) += tght(ir, jl);
+                        rfe(i2, j1) += tght(il, jc);
+                    }
+                }
+                for (std::size_t jc = 0; jc < 5; ++jc) {
+                    std::size_t jl = jc + 5;
+                    std::size_t j1 = irm + jc;
+                    std::size_t j2 = jcm + jc;
+
+                    for (std::size_t ir = 0; ir < jc + 1; ++ir) {
+                        std::size_t il = ir + 5;
+                        std::size_t i1 = irm + ir;
+                        std::size_t i2 = jcm + ir;
+
+                        rse(i1, j1) += tzm(ir, jc);
+                        rse(i2, j2) += tzm(il, jl);
+                    }
+                }
+                for (std::size_t jc = 5; jc < 10; ++jc) {
+                    std::size_t j1 = jcm + jc - 5;
+
+                    for (std::size_t ir = 0; ir < 5; ++ir) {
+                        std::size_t i1 = irm + ir;
+
+                        rse(i1, j1) += tzm(ir, jc);
+                    }
+                }
+            }
+        }
+    }
+
+    // Computes the pair-wise lubrication interactions between particle pairs
+    DEVICE_FUNC void calc_lub(size_t pair_id, double dr,
+                              multi_array<T, 3> const &d,
+                              multi_array<T, 12, 12> &tabc,
+                              multi_array<T, 12, 10> &tght,
+                              multi_array<T, 10, 10> &tzm) {
+        static constexpr multi_array<T, 3, 3> const delta = {
+            // clang-format off
+            1, 0, 0,
+            0, 1, 0,
+            0, 0, 1
+            // clang-format on
+        };
+
+        static constexpr multi_array<T, 3, 3, 3> const eps = {
+            // clang-format off
+            0, 0, 0,   0, 0, 1,   0,-1, 0,
+            0, 0,-1,   0, 0, 0,   1, 0, 0,
+            0, 1, 0,  -1, 0, 0,   0, 0, 0
+            // clang-format on
+        };
+
+        static constexpr multi_array<std::size_t, 2, 5> const mesid = {
+            // clang-format off
+            0, 0, 0, 1, 1,
+            2, 1, 2, 2, 2
+            // clang-format on
+        };
+
+#include "lubrication_data.inl"
+
+        std::size_t ph1 = part_id(0, pair_id);
+        std::size_t ph2 = part_id(1, pair_id);
+
+        T a11 = a(ph1);
+        auto const visc11_1 = T{M_PI * 6. * eta * a11};
+        auto const visc11_2 = T{visc11_1 * a11};
+        auto const visc11_3 = T{visc11_2 * a11};
+
+        T a22 = a(ph2);
+        auto const visc22_1 = T{M_PI * 6. * eta * a22};
+        auto const visc22_2 = T{visc22_1 * a22};
+        auto const visc22_3 = T{visc22_2 * a22};
+
+        T a12 = T{.5} * (a(ph1) + a(ph2));
+        auto const visc12_1 = T{M_PI * 6. * eta * a12};
+        auto const visc12_2 = T{visc12_1 * a12};
+        auto const visc12_3 = T{visc12_2 * a12};
+
+        double x11a, x12a, y11a, y12a, y11b, y12b, x11c, x12c, y11c, y12c, x11g,
+            x12g, y11g, y12g, y11h, y12h, xm, ym, zm;
+
+        // The following section of code computes the so-called scalar
+        // resistance functions (?). Depending on the distance of the two
+        // spheres, there are two possible cases:
+        //   - almost touching spheres. An analytic expansion around the case
+        //     of touching spheres is used.
+        //   - intermediate regime. Tabulated values are used, because in this
+        //     regime, it would be very expensive to get the functions up to a
+        //     decent accuracy.
+
+        // Approximation for nearly touching spheres
+        if (dr <= 2.1) { // TODO: non-dimensionalize cutoff
+            double xi = dr - 2;
+
+            double xi1 = 1 / xi;
+#if defined(__HIPCC__)
+            double dlx = logf(xi1);
+#else
+            double dlx = std::log(xi1);
+#endif
+
+            double xdlx = xi * dlx;
+            double dlx1 = dlx + xdlx;
+
+            double csa1 = dlx * 1. / 6.;
+            double csa2 = xdlx * 1. / 6.;
+            double csa3 = dlx1 * 1. / 6.;
+            double csa4 = 0.25 * xi1 + 0.225 * dlx;
+            double csa5 = dlx * 1. / 15.;
+
+            //*** a, btilda, and c terms for rfu.
+
+            x11a = csa4 - 1.23041 + 3. / 112. * xdlx + 1.8918 * xi;
+            x12a = -x11a + 0.00312 - 0.0011 * xi;
+            y11a = csa1 - 0.39394 + 0.95665 * xi;
+            y12a = -y11a + 0.00463606 - 0.007049 * xi;
+
+            y11b = -csa1 + 0.408286 - xdlx * 1. / 12. - 0.84055 * xi;
+            y12b = -y11b + 0.00230818 - 0.007508 * xi;
+
+            x11c = 0.0479 - csa2 + 0.12494 * xi;
+            x12c = -0.031031 + csa2 - 0.174476 * xi;
+            y11c = 4. * csa5 - 0.605434 + 94. / 375. * xdlx + 0.939139 * xi;
+            y12c = csa5 - 0.212032 + 31. / 375. * xdlx + 0.452843 * xi;
+
+            //*** g and h terms for rsu.
+
+            double csg1 = csa4 + 39. / 280. * xdlx;
+            double csg2 = dlx * 1. / 12. + xdlx * 1. / 24.;
+
+            x11g = csg1 - 1.16897 + 1.47882 * xi;
+            x12g = -csg1 + 1.178967 - 1.480493 * xi;
+            y11g = csg2 - 0.2041 + 0.442226 * xi;
+            y12g = -csg2 + 0.216365 - 0.469830 * xi;
+
+            y11h = 0.5 * csa5 - 0.143777 + 137. / 1500. * xdlx + 0.264207 * xi;
+            y12h = 2. * csa5 - 0.298166 + 113. / 1500. * xdlx + 0.534123 * xi;
+
+            //*** m term for rse.
+
+            xm = 1. / 3. * xi1 + 0.3 * dlx - 1.48163 + 0.335714 * xdlx +
+                 1.413604 * xi;
+            ym = csa3 - 0.423489 + 0.827286 * xi;
+            zm = 0.0129151 - 0.042284 * xi;
+
+        // Intermediate regime
+        } else {
+            auto ida = static_cast<int>(20. * (dr - 2));
+            int ib = -2 + ida;
+            int ia = ib + 1;
+
+            double c1 = (dr - rsabc[ib]) / (rsabc[ia] - rsabc[ib]);
+
+            x11a = (x11as[ia] - x11as[ib]) * c1 + x11as[ib];
+            x12a = (x12as[ia] - x12as[ib]) * c1 + x12as[ib];
+            y11a = (y11as[ia] - y11as[ib]) * c1 + y11as[ib];
+            y12a = (y12as[ia] - y12as[ib]) * c1 + y12as[ib];
+
+            y11b = (y11bs[ia] - y11bs[ib]) * c1 + y11bs[ib];
+            y12b = (y12bs[ia] - y12bs[ib]) * c1 + y12bs[ib];
+
+            y11c = (y11cs[ia] - y11cs[ib]) * c1 + y11cs[ib];
+            y12c = (y12cs[ia] - y12cs[ib]) * c1 + y12cs[ib];
+            x11c = (x11cs[ia] - x11cs[ib]) * c1 + x11cs[ib];
+            x12c = (x12cs[ia] - x12cs[ib]) * c1 + x12cs[ib];
+
+            if (dr < 2.2) {
+                ib = -10 + static_cast<int>(100. * (dr - 2));
+            } else {
+                ib = 6 + ida;
+            }
+
+            ia = ib + 1;
+
+            double cgh = (dr - rsgh[ib]) / (rsgh[ia] - rsgh[ib]);
+
+            x11g = (x11gs[ia] - x11gs[ib]) * cgh + x11gs[ib];
+            x12g = (x12gs[ia] - x12gs[ib]) * cgh + x12gs[ib];
+            y11g = (y11gs[ia] - y11gs[ib]) * cgh + y11gs[ib];
+            y12g = (y12gs[ia] - y12gs[ib]) * cgh + y12gs[ib];
+
+            y11h = (y11hs[ia] - y11hs[ib]) * cgh + y11hs[ib];
+            y12h = (y12hs[ia] - y12hs[ib]) * cgh + y12hs[ib];
+
+            double cm = (dr - rsm[ib]) / (rsm[ia] - rsm[ib]);
+
+            xm = (xms[ia] - xms[ib]) * cm + xms[ib];
+            ym = (yms[ia] - yms[ib]) * cm + yms[ib];
+            zm = (zms[ia] - zms[ib]) * cm + zms[ib];
+        }
+        // The scalar resistance functions have been computed.
+        // The next goal is to form all the sub-tensors, i.e. contributions to
+        // the grand resistance matrix using the scalar functions from above.
+
+        //***************************************************************************c
+        //***************************************************************************c
+
+        auto ee = outer(d, d);
+
+        //***************************************************************************c
+        //***************************************************************************c
+        //****************************** form tabc for rfu
+        //**************************c
+
+        double xmy11a = x11a - y11a;
+        double xmy12a = x12a - y12a;
+        double xmy11c = x11c - y11c;
+        double xmy12c = x12c - y12c;
+
+        //*** insert upper half of a11.
+
+        T tabc00 = xmy11a * ee(0, 0) + y11a;
+        T tabc11 = xmy11a * ee(1, 1) + y11a;
+        T tabc22 = xmy11a * ee(2, 2) + y11a;
+        T tabc01 = xmy11a * ee(0, 1);
+        T tabc02 = xmy11a * ee(0, 2);
+        T tabc12 = xmy11a * ee(1, 2);
+        tabc(0, 0) = visc11_1 * tabc00;
+        tabc(1, 1) = visc11_1 * tabc11;
+        tabc(2, 2) = visc11_1 * tabc22;
+        tabc(0, 1) = visc11_1 * tabc01;
+        tabc(0, 2) = visc11_1 * tabc02;
+        tabc(1, 2) = visc11_1 * tabc12;
+
+        //*** insert a12.
+
+        tabc(0, 6) = visc12_1 * xmy12a * ee(0, 0) + y12a;
+        tabc(1, 7) = visc12_1 * xmy12a * ee(1, 1) + y12a;
+        tabc(2, 8) = visc12_1 * xmy12a * ee(2, 2) + y12a;
+        tabc(0, 7) = visc12_1 * xmy12a * ee(0, 1);
+        tabc(0, 8) = visc12_1 * xmy12a * ee(0, 2);
+        tabc(1, 8) = visc12_1 * xmy12a * ee(1, 2);
+        tabc(1, 6) = tabc(0, 7);
+        tabc(2, 6) = tabc(0, 8);
+        tabc(2, 7) = tabc(1, 8);
+
+        //*** insert upper half of c11.
+
+        T tabc33 = xmy11c * ee(0, 0) + y11c;
+        T tabc44 = xmy11c * ee(1, 1) + y11c;
+        T tabc55 = xmy11c * ee(2, 2) + y11c;
+        T tabc34 = xmy11c * ee(0, 1);
+        T tabc35 = xmy11c * ee(0, 2);
+        T tabc45 = xmy11c * ee(1, 2);
+        tabc(3, 3) = visc11_3 * tabc33;
+        tabc(4, 4) = visc11_3 * tabc44;
+        tabc(5, 5) = visc11_3 * tabc55;
+        tabc(3, 4) = visc11_3 * tabc34;
+        tabc(3, 5) = visc11_3 * tabc35;
+        tabc(4, 5) = visc11_3 * tabc45;
+
+        //*** insert c12.
+
+        tabc(3, 9) = visc12_3 * xmy12c * ee(0, 0) + y12c;
+        tabc(4, 10) = visc12_3 * xmy12c * ee(1, 1) + y12c;
+        tabc(5, 11) = visc12_3 * xmy12c * ee(2, 2) + y12c;
+        tabc(3, 10) = visc12_3 * xmy12c * ee(0, 1);
+        tabc(3, 11) = visc12_3 * xmy12c * ee(0, 2);
+        tabc(4, 11) = visc12_3 * xmy12c * ee(1, 2);
+        tabc(4, 9) = tabc(3, 10);
+        tabc(5, 9) = tabc(3, 11);
+        tabc(5, 10) = tabc(4, 11);
+
+        //*** fill in upper half of a22 (=a11).
+
+        tabc(6, 6) = visc22_1 * tabc00;
+        tabc(6, 7) = visc22_1 * tabc01;
+        tabc(6, 8) = visc22_1 * tabc02;
+        tabc(7, 7) = visc22_1 * tabc11;
+        tabc(7, 8) = visc22_1 * tabc12;
+        tabc(8, 8) = visc22_1 * tabc22;
+
+        //*** fill in upper half of c22 (=c11).
+
+        tabc(9, 9) = visc22_3 * tabc33;
+        tabc(9, 10) = visc22_3 * tabc34;
+        tabc(9, 11) = visc22_3 * tabc35;
+        tabc(10, 10) = visc22_3 * tabc44;
+        tabc(10, 11) = visc22_3 * tabc45;
+        tabc(11, 11) = visc22_3 * tabc55;
+
+        //*** insert bt11.
+
+        tabc(0, 3) = 0.0;
+        tabc(0, 4) = -visc11_2 * y11b * d(2);
+        tabc(0, 5) = visc11_2 * y11b * d(1);
+        tabc(1, 4) = 0.0;
+        tabc(1, 5) = -visc11_2 * y11b * d(0);
+        tabc(1, 3) = -tabc(0, 4);
+        tabc(2, 3) = -tabc(0, 5);
+        tabc(2, 4) = -tabc(1, 5);
+        tabc(2, 5) = 0.0;
+
+        //*** insert bt12.
+
+        tabc(0, 9) = 0.0;
+        tabc(0, 10) = visc12_2 * y12b * d(2);
+        tabc(0, 11) = -visc12_2 * y12b * d(1);
+        tabc(1, 10) = 0.0;
+        tabc(1, 11) = visc12_2 * y12b * d(0);
+        tabc(1, 9) = -tabc(0, 10);
+        tabc(2, 9) = -tabc(0, 11);
+        tabc(2, 10) = -tabc(1, 11);
+        tabc(2, 11) = 0.0;
+
+        //***************************************************************************c
+        //***************************************************************************c
+        //*** fill in bt22 (=-bt11) and b12 (=bt12).
+
+        for (std::size_t j3 = 3; j3 < 6; ++j3) {
+            std::size_t j6 = j3 + 3;
+            std::size_t j9 = j3 + 6;
+
+            for (std::size_t i = 0; i < 3; ++i) {
+                std::size_t i3 = i + 3;
+                std::size_t i6 = i + 6;
+
+                tabc(i3, j6) = tabc(i, j9);
+                tabc(i6, j9) = -tabc(i, j3);
+            }
+        }
+
+        if (!(flg & flags::FTS)) {
+            return;
+        }
+
+        //***************************************************************************c
+        //***************************************************************************c
+        //****************************** form tght for rfe
+        //**************************c
+        //*** insert gt11.
+        multi_array<T, 3, 3, 3> gt;
+        multi_array<T, 3, 3, 3> ht;
+        for (std::size_t k = 0; k < 3; ++k) {
+            for (std::size_t i = 0; i < 3;
+                 ++i) { // TODO: Is the 3 actually correct?  In the Fortran
+                        // version it's 2
+                for (std::size_t j = 0; j < 3; ++j) {
+                    gt(k, i, j) =
+                        x11g * (ee(i, j) - T{1. / 3.} * delta(i, j)) * d(k) +
+                        y11g * (d(i) * delta(j, k) + d(j) * delta(i, k) -
+                                T{2.0} * ee(i, j) * d(k));
+
+                    std::size_t l = (3 - j - k) % 3;
+                    std::size_t m = (3 - i - k) % 3;
+                    ht(k, i, j) = y11h * (ee(i, l) * eps(j, k, l) +
+                                          ee(j, m) * eps(i, k, m));
+                }
+            }
+        }
+
+        for (std::size_t i = 0; i < 3; ++i) {
+            std::size_t k = i + 6;
+            std::size_t j = i + 3;
+
+            tght(k, 0) = visc11_3 * gt(i, 0, 0) - gt(i, 2, 2);
+            tght(k, 1) = visc11_3 * 2.0 * gt(i, 0, 1);
+            tght(k, 2) = visc11_3 * 2.0 * gt(i, 0, 2);
+            tght(k, 3) = visc11_3 * 2.0 * gt(i, 1, 2);
+            tght(k, 4) = visc11_3 * gt(i, 1, 1) - gt(i, 2, 2);
+
+            tght(j, 5) = visc11_3 * ht(i, 0, 0) - ht(i, 2, 2);
+            tght(j, 6) = visc11_3 * 2.0 * ht(i, 0, 1);
+            tght(j, 7) = visc11_3 * 2.0 * ht(i, 0, 2);
+            tght(j, 8) = visc11_3 * 2.0 * ht(i, 1, 2);
+            tght(j, 9) = visc11_3 * ht(i, 1, 1) - ht(i, 2, 2);
+        }
+
+        double c13x11g = 1. / 3. * x11g;
+        double c2y11g = 2.0 * y11g;
+        double xm2y11g = x11g - c2y11g;
+        double comd11 = ee(0, 0) * xm2y11g;
+        double comd22 = ee(1, 1) * xm2y11g;
+        double comd33 = ee(2, 2) * xm2y11g;
+        double c2ymx11 = c2y11g - c13x11g;
+        double con34 = comd11 - c13x11g;
+        double con56 = comd11 + y11g;
+        double con712 = comd22 + y11g;
+        double con89 = comd33 + y11g;
+        double con1011 = comd22 - c13x11g;
+
+        tght(0, 0) = visc11_3 * d(0) * (comd11 + c2ymx11);
+        tght(0, 1) = visc11_3 * d(1) * con56;
+        tght(0, 2) = visc11_3 * d(2) * con56;
+        tght(0, 3) = visc11_3 * d(0) * ee(1, 2) * xm2y11g;
+        tght(0, 4) = visc11_3 * d(0) * con1011;
+        tght(1, 0) = visc11_3 * d(1) * con34;
+        tght(1, 1) = visc11_3 * d(0) * con712;
+        tght(1, 2) = tght(0, 3);
+        tght(1, 3) = visc11_3 * d(2) * con712;
+        tght(1, 4) = visc11_3 * d(1) * (comd22 + c2ymx11);
+        tght(1, 0) = visc11_3 * d(2) * con34;
+        tght(2, 1) = tght(0, 3);
+        tght(2, 2) = visc11_3 * d(0) * con89;
+        tght(2, 3) = visc11_3 * d(1) * con89;
+        tght(2, 4) = visc11_3 * d(2) * con1011;
+
+        //*** insert gt21.
+
+        double c13x12g = 1. / 3. * x12g;
+        double c2y12g = 2.0 * y12g;
+        double xm2y12g = x12g - c2y12g;
+        double cumd11 = ee(0, 0) * xm2y12g;
+        double cumd22 = ee(1, 1) * xm2y12g;
+        double cumd33 = ee(2, 2) * xm2y12g;
+        double c2ymx12 = c2y12g - c13x12g;
+        double cun34 = cumd11 - c13x12g;
+        double cun56 = cumd11 + y12g;
+        double cun712 = cumd22 + y12g;
+        double cun89 = cumd33 + y12g;
+        double cun1011 = cumd22 - c13x12g;
+
+        tght(6, 0) = visc12_3 * d(0) * (cumd11 + c2ymx12);
+        tght(6, 1) = visc12_3 * d(1) * cun56;
+        tght(6, 2) = visc12_3 * d(2) * cun56;
+        tght(6, 3) = visc12_3 * d(0) * ee(1, 2) * xm2y12g;
+        tght(6, 4) = visc12_3 * d(0) * cun1011;
+        tght(7, 0) = visc12_3 * d(1) * cun34;
+        tght(7, 1) = visc12_3 * d(0) * cun712;
+        tght(7, 2) = tght(6, 3);
+        tght(7, 3) = visc12_3 * d(2) * cun712;
+        tght(7, 4) = visc12_3 * d(1) * (cumd22 + c2ymx12);
+        tght(8, 0) = visc12_3 * d(2) * cun34;
+        tght(8, 1) = tght(6, 3);
+        tght(8, 2) = visc12_3 * d(0) * cun89;
+        tght(8, 3) = visc12_3 * d(1) * cun89;
+        tght(8, 4) = visc12_3 * d(2) * cun1011;
+
+        //*** insert ht11.
+
+        double d11md22 = ee(0, 0) - ee(1, 1);
+        double d22md33 = ee(1, 1) - ee(2, 2);
+        double d33md11 = ee(2, 2) - ee(0, 0);
+        double y11hd12 = y11h * ee(0, 1);
+        double y11hd13 = y11h * ee(0, 2);
+        double y11hd23 = y11h * ee(1, 2);
+        double cyhd12a = 2.0 * y11hd12;
+
+        tght(3, 0) = 0.0;
+        tght(3, 1) = -visc11_3 * y11hd13;
+        tght(3, 2) = visc11_3 * y11hd12;
+        tght(3, 3) = visc11_3 * y11h * d22md33;
+        tght(3, 4) = -visc11_3 * 2.0 * y11hd23;
+        tght(4, 0) = visc11_3 * 2.0 * y11hd13;
+        tght(4, 1) = visc11_3 * y11hd23;
+        tght(4, 2) = visc11_3 * y11h * d33md11;
+        tght(4, 3) = -visc11_3 * y11hd12;
+        tght(4, 4) = 0.0;
+        tght(5, 0) = -visc11_3 * cyhd12a;
+        tght(5, 1) = visc11_3 * y11h * d11md22;
+        tght(5, 2) = -visc11_3 * y11hd23;
+        tght(5, 3) = visc11_3 * y11hd13;
+        tght(5, 4) = visc11_3 * cyhd12a;
+
+        //*** insert ht12.
+
+        double y12hd12 = y12h * ee(0, 1);
+        double y12hd13 = y12h * ee(0, 2);
+        double y12hd23 = y12h * ee(1, 2);
+        double cyhd12b = 2.0 * y12hd12;
+
+        tght(3, 5) = 0.0;
+        tght(3, 6) = -visc12_3 * y12h * ee(0, 2);
+        tght(3, 7) = visc12_3 * y12h * ee(0, 1);
+        tght(3, 8) = visc12_3 * y12h * d22md33;
+        tght(3, 9) = -visc12_3 * 2.0 * y12hd23;
+        tght(4, 5) = visc12_3 * 2.0 * y12hd13;
+        tght(4, 6) = visc12_3 * y12hd23;
+        tght(4, 7) = visc12_3 * y12h * d33md11;
+        tght(4, 8) = -visc12_3 * y12hd12;
+        tght(4, 9) = 0.0;
+        tght(5, 5) = -visc12_3 * cyhd12b;
+        tght(5, 6) = visc12_3 * y12h * d11md22;
+        tght(5, 7) = -visc12_3 * y12hd23;
+        tght(5, 8) = visc12_3 * y12hd13;
+        tght(5, 9) = visc12_3 * cyhd12b;
+
+        //***************************************************************************c
+        //***************************************************************************c
+        //*** insert gt12 (=-gt21), gt22(=-gt11), ht21 (=ht12), ht22 (=ht11).
+
+        for (std::size_t i = 0; i < 3; ++i) {
+            std::size_t i3 = i + 3;
+            std::size_t i6 = i + 6;
+            std::size_t i9 = i + 9;
+
+            for (std::size_t j = 0; j < 5; ++j) {
+                std::size_t j5 = j + 5;
+
+                tght(i, j5) = -tght(i6, j);
+                tght(i6, j5) = -tght(i, j);
+                tght(i9, j) = tght(i3, j5);
+                tght(i9, j5) = tght(i3, j);
+            }
+        }
+        //***************************************************************************c
+        //***************************************************************************c
+        //**************************** form tzm for rse
+        //*****************************c
+        multi_array<T, 3, 3, 3, 3> m;
+        for (std::size_t i = 0; i < 3; ++i) {
+            for (std::size_t j = i; j < 3; ++j) {
+                for (std::size_t k = 0; k < 3; ++k) {
+                    for (std::size_t l = k; l < 3; ++l) {
+                        m(i, j, k, l) =
+                            T{3. / 2.} * xm *
+                                (ee(i, j) - T{1. / 3.} * delta(i, j)) *
+                                (ee(k, l) - T{1. / 3.} * delta(k, l)) +
+                            T{1. / 2.} * ym *
+                                (ee(i, k) * delta(j, l) +
+                                 ee(j, k) * delta(i, l) +
+                                 ee(i, l) * delta(j, k) +
+                                 ee(j, l) * delta(i, k) -
+                                 T{4.0} * ee(i, j) * ee(k, l)) +
+                            T{1. / 2.} * zm *
+                                (delta(i, k) * delta(j, l) +
+                                 delta(j, k) * delta(i, l) -
+                                 delta(i, j) * delta(k, l) +
+                                 ee(i, j) * delta(k, l) +
+                                 ee(k, l) * delta(i, j) -
+                                 ee(i, k) * delta(j, l) -
+                                 ee(j, k) * delta(i, l) -
+                                 ee(i, l) * delta(j, k) -
+                                 ee(j, l) * delta(i, k) + ee(i, j) * ee(k, l));
+                    }
+                }
+            }
+        }
+
+        for (std::size_t i = 0; i < 5; ++i) {
+
+            if (i == 0 || i == 4) {
+                tzm(i, 0) = m(mesid(0, i), mesid(0, i), 0, 0) -
+                            m(mesid(0, i), mesid(0, i), 2, 2) -
+                            (m(mesid(1, i), mesid(1, i), 0, 0) -
+                             m(mesid(1, i), mesid(1, i), 2, 2));
+                tzm(i, 1) = 2.0 * (m(mesid(0, i), mesid(0, i), 0, 1) -
+                                   m(mesid(1, i), mesid(1, i), 0, 1));
+                tzm(i, 2) = 2.0 * (m(mesid(0, i), mesid(0, i), 0, 2) -
+                                   m(mesid(1, i), mesid(1, i), 0, 2));
+                tzm(i, 3) = 2.0 * (m(mesid(0, i), mesid(0, i), 1, 2) -
+                                   m(mesid(1, i), mesid(1, i), 1, 2));
+                tzm(i, 4) = m(mesid(0, i), mesid(0, i), 1, 1) -
+                            m(mesid(0, i), mesid(0, i), 2, 2) -
+                            (m(mesid(1, i), mesid(1, i), 1, 1) -
+                             m(mesid(1, i), mesid(1, i), 2, 2));
+
+            } else {
+
+                tzm(i, 0) = 2.0 * (m(mesid(0, i), mesid(1, i), 0, 0) -
+                                   m(mesid(0, i), mesid(1, i), 2, 2));
+                tzm(i, 1) = 4.0 * m(mesid(0, i), mesid(1, i), 0, 1);
+                tzm(i, 2) = 4.0 * m(mesid(0, i), mesid(1, i), 0, 2);
+                tzm(i, 3) = 4.0 * m(mesid(0, i), mesid(1, i), 1, 2);
+                tzm(i, 4) = 2.0 * (m(mesid(0, i), mesid(1, i), 1, 1) -
+                                   m(mesid(0, i), mesid(1, i), 2, 2));
+            }
+        }
+        //***************************************************************************c
+        //***************************************************************************c
+        //*** fill in upper half of m12 (=m11) and m22 (=m11).
+
+        for (std::size_t j = 0; j < 5; ++j) {
+
+            std::size_t j5 = j + 5;
+
+            for (std::size_t i = 0; i < j + 1; ++i) {
+
+                std::size_t i5 = i + 5;
+
+                tzm(i, j5) = visc12_3 * tzm(i, j);
+                tzm(i5, j5) = visc22_3 * tzm(i, j);
+            }
+        }
+        //*** fill in the lower half of m12.
+        for (std::size_t i = 0; i < 5; ++i) {
+
+            std::size_t i5 = i + 5;
+            for (std::size_t j = i + 1; j < 5; ++j) {
+
+                std::size_t j5 = j + 5;
+
+                tzm(j, i5) = visc12_3 * tzm(i, j5);
+            }
+        }
+
+        for (std::size_t i = 0; i < 5; ++i) {
+            for (std::size_t j = 0; j < 5; ++j) {
+                tzm(i, j) = visc11_3 * tzm(i, j);
+            }
+        }
+    }
+};
+
+template <typename T>
+struct thermalizer {
+    T sqrt_kT_Dt;
+    std::size_t offset;
+    std::size_t seed;
+#if defined(__CUDACC__) || defined(__HIPCC__)
+    __device__
+#endif
+    T operator()(std::size_t index) {
+#if defined(__CUDACC__)
+        uint4 rnd_ints = curand_Philox4x32_10(make_uint4(offset >> 32, seed >> 32, index >> 32, index),
+                                              make_uint2(offset, seed));
+        T rnd = _curand_uniform_double_hq(rnd_ints.w, rnd_ints.x);
+#elif defined (__HIPCC__)
+        rocrand_state_philox4x32_10 state(seed, index, offset);
+        T rnd = rocrand_uniform(&state);
+#else
+        // get two 64-bit random unsigned integers (of which only one is used)
+        typedef r123::Philox2x64 RNG;
+        RNG rng;
+        RNG::ctr_type c;
+        RNG::key_type k;
+        k[0] = seed;
+        c[0] = offset;
+        c[1] = index;
+        RNG::ctr_type rint = rng(c, k);
+
+        // convert to uniform distribution
+        uint64_t constexpr const max = std::numeric_limits<uint64_t>::max();
+        double constexpr const fac = 1. / (max + 1.);
+        T rnd = (rint[0] + 0.5) * fac;
+#endif
+        return std::sqrt(T{2.0}) * sqrt_kT_Dt * std::sqrt(T{12.0}) * (rnd - 0.5);
+
+        // sqrt(12) * (rnd - 0.5) is a uniformly distributed random number
+        // with zero mean and unit variance.
+
+        // sqrt(2 * kT * \Delta t) is the desired standard deviation for
+        // random displacement.
+
+        // NOTE: Here, we do not compute a random displacement but a random
+        // velocity, therefore the standard deviation has been divided by
+        // the time step (see sd_interface.cpp).
+
+        // why use the _curand_uniform_double_hq function?
+        // I don't think it is meant to be used outside of the
+        // curand_kernel.h file.
+        // Apparently it works, so keep it for now.
+    }
+};
+
+/** Functor that takes all the relevant particle data (i.e. positions, radii,
+    external forces and torques) and computes the translational and angular
+    velocities. These can be used to propagate the system.
+
+    The basic idea of the method is as follows: The Stokes equation is linear,
+    i.e. the relations between force and motion (velocity) is purely linear.
+    All forces and torques that act on individual particles are merged into
+    one large vector. All velocities and angular velocities of individual
+    particles are merged into one vector alike.
+    The relationship between force and velocity is now given by the so-called
+    grand mobility matrix, or its inverse, the resistance matrix:
+
+            \f[\vec{U} = M_{UF} \vec{F}\f]
+
+    where \f$\vec{F}\f$ is the force, \f$\vec{U}\f$ the velocities and \f$M\f$
+    the grand mobility matrix. The subscript \f$_{UF}\f$ indicates that this
+    tensor describes the relationship between forces and velocities.
+    the relationship between forces and velocities.
+    The mobility matrix depends on the positions of all particles and therefore
+    is only valid for the momentary configuration. It needs to be recalculated
+    each time step.
+
+    To obtain the velocities, the mobility matrix must be applied to the forces
+    that act on the particles, giving the velocities.
+
+    These are the steps that are taken to compute the grand mobility matrix:
+      -# Starting point is an empty mobility matrix.
+      -# All self mobility terms are added. Its translational part is widely
+         known as Stokes' Law.
+      -# All pair mobility terms are added.
+      -# The mobility matrix is inverted to become a resistance matrix.
+      -# Lubrication corrections are added to the resistance matrix. They are
+         necessary for particles in close vicinity because simplifications
+         have been made to efficiently cover long range interactions.
+      -# In the end, the finished resistance matrix is inverted again. The
+         result is the finished mobility matrix that includes the short-ranged
+         lubrication interactions.
+
+    The thermalisation is achieved by a generalization of the
+    Einstein-Smoluchowski equation, which relates mobility and diffusion:
+
+            \f[D = \mu k_B T\f]
+
+    where \f$\mu\f$ is the mobility and \f$D\f$ is the diffusion coefficient.
+    The mean square displacement during a time step of length \f$\Delta t\f$,
+    and along one degree of freedom is given by
+
+            \f[\langle \Delta x^2 \rangle / \Delta t = 2 D\f]
+
+    By combining these two equations we get the random displacement, which we
+    can then directly use to propagate the particles:
+
+            \f[\Delta x = \sqrt{2 k_B T \Delta t} \times \mu^{1/2} \cdot \Phi\f]
+
+    where \f$\Phi\f$ is a random number drawn from a zero mean and unit
+    variance distribution.
+    That way we can determine the random displacement that our system
+    experiences along one of its many degrees of freedom. In our case,
+    \f$\mu\f$ is a matrix and we need its square root. The latter is
+    obtained via Cholesky decomposition. And \f$\Phi\f$ is a vector filled with
+    random numbers from a zero mean and unit variance distribution.
+ */
+template <typename Policy, typename T>
+struct solver {
+    static_assert(policy::is_policy<Policy>::value,
+                  "The execution policy must meet the requirements");
+    static_assert(std::is_arithmetic<T>::value,
+                  "Data type of device_matrix must be arithmetic for "
+                  "arithmetic operations");
+
+    template <typename U>
+    using vector_type = typename Policy::template vector<U>;
+
+    /** viscosity of the Stokes fluid */
+    T const eta;
+    /** number of particles */
+    std::size_t const n_part;
+    /** number of pairs of particles = n_part*(n_part-1)/2 */
+    std::size_t const n_pair;
+
+
+
+    /** Initialization of the SD solver */
+    solver(T eta, std::size_t const n_part)
+        : eta{eta}, n_part(n_part), n_pair(n_part * (n_part - 1) / 2) {}
+
+
+    /** Invert the grand-mobility tensor.  This is done in several steps
+     *  which minimize the computation time. Input matrices are not preserved.
+     *
+     *    \param zmuf sub-tensor of the input mobility matrix, relating forces
+     *                to velocities
+     *    \param zmus input, relating stresslets to velocities
+     *    \param zmes input, relating ambient shear flow to stresslets
+     *    \param rfu sub-tensor of the output resistance matrix, relating
+     *               velocities to forces
+     *    \param rfe output, relating ambient shear flow to forces
+     *    \param rse output, relating ambient shear flow to stresslets
+     */
+    void invert_grand_mobility_matrix(device_matrix<T, Policy> &zmuf,
+                                      device_matrix<T, Policy> &zmus,
+                                      device_matrix<T, Policy> &zmes,
+                                      device_matrix<T, Policy> &rfu,
+                                      device_matrix<T, Policy> &rfe,
+                                      device_matrix<T, Policy> &rse,
+                                      int const flg) {
+        // TODO: Where are these steps from?
+        // Invert R1 = Muf ^ -1 => zmuf = zmuf ^ -1
+        zmuf = zmuf.inverse();
+
+        if (flg & flags::FTS) {
+            // Compute R2 = Mus(t) * R1 => rsu = zmus(t) * zmuf
+            device_matrix<T, Policy> const &rsu = zmus.transpose() * zmuf;
+
+            // Compute R3 = R2 * Mus - Mes => zmes = rsu * zmus - zmes
+            zmes = zmes - rsu * zmus;
+
+            // Invert  R4 = R3 ^ -1 => zmes = zmes ^ -1
+            zmes = zmes.inverse();
+
+            // Compute R5 = -R3 * R4 => zmus = -rsu(t) * zmes
+            zmus = -(rsu.transpose() * zmes);
+
+            // Compute R6 = R1 - R5  => zmuf = zmuf - zmus * rsu
+            zmuf = zmuf - zmus * rsu;
+        }
+        rfu = zmuf;
+        rfe = zmus;
+        rse = zmes;
+    }
+
+    /** Thermalization with stochastic force, making use of the fluctuation-
+     *  dissipation-theorem.
+     *
+     *  \return stochastic force vector
+     *  \param rfu_sqrt Square root of the resistance matrix
+     *  \param size size of the force vector, is 6 times number of particles
+     *  \param sqrt_kT_Dt Square root of kT / Delta t
+     *  \param offset Simulation time, serves as RNG seed for each step
+     *  \param seed global seed for the whole simulation
+     */
+    vector_type<T> thermalization(device_matrix<T, Policy> &rfu_sqrt,
+                                  std::size_t size, T sqrt_kT_Dt,
+                                  std::size_t offset, std::size_t seed) {
+
+            // This method is combined from two locations,
+            // namely @cite banchio03a,
+            // Banchio, Brady 2002 https://doi.org/10.1063/1.1571819
+            // equation (6)
+            //  -- and --
+            // @cite brady88a
+            // Brady, Bossis 1988
+            // https://doi.org/10.1146/annurev.fl.20.010188.000551
+            // equation (3)
+
+            // We adopt the more detailed method of the paper @cite banchio03a.
+
+            // However, the matrix A in that paper is NOT a byproduct of the
+            // matrix inversion of R_{FU} as they claim. (We get the
+            // decomposition of R_{FU} but not of its inverse)
+            // Also, the random displacement between formulas (5) and (6) is
+            // wrong, since the square root needs to include kT and Delta t
+            // as well.
+
+            // Luckily, with the decomposition of R_{FU} we CAN compute an
+            // appropriate random force with the variance given by the paper
+            // @cite brady88a (as opposed to random displacement).
+
+            // Psi is a vector filled with random numbers, scaled correctly
+            vector_type<T> psi(size);
+            thrust_wrapper::tabulate(Policy::par(), psi.begin(), psi.end(),
+                             thermalizer<T>{sqrt_kT_Dt, offset, seed});
+
+            return rfu_sqrt * psi;
+
+            // There is possibly an additional term for the thermalization
+            //
+            //     \nabla \cdot R_{FU}^{-1} \Delta t
+            //
+            // But this seems to be omitted in most cases in the literature.
+            // It is also very unclear how to actually calculate it.
+    }
+
+
+    /** main function doing the SD calculation */
+    std::vector<T> calc_vel(std::vector<T> const &x_host,
+                            std::vector<T> const &f_host,
+                            std::vector<T> const &a_host,
+                            T sqrt_kT_Dt,
+                            std::size_t offset,
+                            std::size_t seed,
+                            int const flg = flags::SELF_MOBILITY | flags::PAIR_MOBILITY | flags::FTS) {
+        assert(x_host.size() == 6 * n_part);
+        vector_type<T> x(x_host.begin(), x_host.end());
+        assert(a_host.size() == n_part);
+        vector_type<T> a(a_host.begin(), a_host.end());
+
+        // TODO: More efficient!
+        // generate lookup table of all pairs
+        device_matrix<std::size_t, Policy> part_id(2, n_pair);
+        std::size_t k = 0;
+        for (std::size_t i = 0; i < n_part; ++i) {
+            for (std::size_t j = i + 1; j < n_part; ++j) {
+                part_id(0, k) = i;
+                part_id(1, k) = j;
+                k += 1;
+            }
+        }
+
+        device_matrix<T, Policy> pd(4, n_pair);
+        thrust_wrapper::counting_iterator<std::size_t> begin(0UL);
+
+        // check_dist
+        thrust_wrapper::for_each(Policy::par(), begin, begin + n_pair,
+                         check_dist<Policy, T>{x, a, pd, part_id});
+
+        // 1. Generate empty grand mobility matrix
+
+        // The following (sub-)tensors can be found in equation (2.17)
+        // part of the grand mobility matrix that relates forces (f)
+        // to velocities (u)
+        device_matrix<T, Policy> zmuf(n_part * 6, n_part * 6);
+        // part of the grand mobility matrix that relates stresslets (s)
+        // to velocities (u)
+        device_matrix<T, Policy> zmus(n_part * 6, n_part * 5);
+        // part of the grand mobility matrix that relates stresslets (s)
+        // to rate of strain (e)
+        device_matrix<T, Policy> zmes(n_part * 5, n_part * 5);
+
+        // 2. add self mobility terms to the grand mobility matrix
+        if (flg & flags::SELF_MOBILITY) {
+            thrust_wrapper::for_each(
+                Policy::par(), begin, begin + n_part,
+                mobility<Policy, T, true>{zmuf, zmus, zmes, a, eta, flg});
+        }
+
+        // 3. add pair mobility terms to the grand mobility matrix
+        if (flg & flags::PAIR_MOBILITY) {
+            thrust_wrapper::for_each(Policy::par(), begin, begin + n_pair,
+                                     mobility<Policy, T, false>{zmuf, zmus, zmes, pd,
+                                                                part_id, a, eta, flg});
+        }
+
+        // 4. invert M to obtain grand resistance matrix
+        device_matrix<T, Policy> rfu(n_part * 6, n_part * 6);
+        device_matrix<T, Policy> rfe(n_part * 6, n_part * 5);
+        device_matrix<T, Policy> rse(n_part * 5, n_part * 5);
+        invert_grand_mobility_matrix(zmuf, zmus, zmes, rfu, rfe, rse, flg);
+
+        // 5. add lubrication corrections (equation (2.18) or (2.21) resp.)
+        if (flg & flags::LUBRICATION) {
+            thrust_wrapper::for_each(Policy::par(), begin, begin + n_pair,
+                                     lubrication<Policy, T>{rfu, rfe, rse, pd, part_id,
+                                                            a, eta, flg});
+
+            for (std::size_t i = 0; i < 6 * n_part; ++i) {
+                for (std::size_t j = 0; j < i; ++j) {
+                    rfu(i, j) = rfu(j, i);
+                }
+            }
+
+            for (std::size_t i = 0; i < 5 * n_part; ++i) {
+                for (std::size_t j = 0; j < i; ++j) {
+                    rse(i, j) = rse(j, i);
+                }
+            }
+        }
+
+        // The inverse of the resistance matrix will be the mobility matrix
+        // which we need in the end
+        device_matrix<T, Policy> rfu_inv;
+        // The square root of R_FU is a byproduct of the matrix inversion,
+        // which we use for the thermalization
+        device_matrix<T, Policy> rfu_sqrt;
+        // 6. invert resistance matrix to obtain mobility matrix
+        // The grand mobility matrix is now finished.
+        thrust_wrapper::tie(rfu_inv, rfu_sqrt) = rfu.inverse_and_cholesky();
+
+        // Prepare the thermal stochastic forces
+        vector_type<T> frnd(n_part * 6, T{0.0});
+        if (sqrt_kT_Dt > 0.0) {
+            frnd = thermalization(rfu_sqrt, f_host.size(), sqrt_kT_Dt,
+                                  offset, seed);
+        }
+        // Finally, perform the matrix-multiplication
+        // multiply the force vector onto the mobility matrix (Eq. 2.22)
+
+        // prepare the force vector
+        assert(f_host.size() == 6 * n_part);
+        vector_type<T> fext(f_host.begin(), f_host.end());
+        // initialize ambient flow
+        vector_type<T> uinf(n_part * 6, T{0.0});
+        // initialize ambient shear flow
+        vector_type<T> einf(n_part * 5, T{0.0});
+        // Note: if we were to implement the case einf != 0 we would need to
+        // initialize the ambient flow according to the particle's positions.
+        // E.g. like   uinf_i = einf * r_i   where i is particle index.
+
+        // This is equation (2.22), plus thermal forces.
+        vector_type<T> u = rfu_inv * (fext + rfe * einf + frnd) + uinf;
+
+        // return the velocities due to hydrodynamic interactions
+        std::vector<T> out(u.size());
+        thrust_wrapper::copy(u.begin(), u.end(), out.begin());
+
+        return out;
+    }
+};
+
+} // namespace sd
+
+#endif
diff --git a/libs/stokesian_dynamics/src/sd_cpu.cpp b/libs/stokesian_dynamics/src/sd_cpu.cpp
new file mode 100644
index 00000000000..b3760db949a
--- /dev/null
+++ b/libs/stokesian_dynamics/src/sd_cpu.cpp
@@ -0,0 +1,31 @@
+#include <vector>
+
+#if THRUST_VERSION < 100908
+#define host_pinned_memory_resource universal_host_pinned_memory_resource
+#endif
+
+#include "sd.hpp"
+#include "stokesian_dynamics/sd_cpu.hpp"
+
+/** This executes the Stokesian Dynamics solver on the CPU. "_host" refers to
+ *  data being stored on the host (in THRUST terminology, as opposed to
+ *  "device" storage), which is ordinary RAM.
+ *
+ *  \param x_host particle positions
+ *  \param f_host particle forces and torques (total 6 entries per particle)
+ *  \param a_host particle radii
+ *  \param n_part number of particles
+ *  \param eta dynamic viscosity of the ambient fluid
+ *  \param sqrt_kT_Dt square root of kT/Dt, necessary for thermalization
+ *  \param offset integer simulation time (steps) used as seed for the RNG
+ *  \param seed global seed for the RNG used for the whole simulation
+ *  \param flg certain bits set in this register correspond to certain features activated
+ */
+std::vector<double> sd_cpu(std::vector<double> const &x_host,
+                           std::vector<double> const &f_host,
+                           std::vector<double> const &a_host,
+                           std::size_t n_part, double eta, double sqrt_kT_Dt,
+                           std::size_t offset, std::size_t seed, int flg) {
+  sd::solver<policy::host, double> viscous_force{eta, n_part};
+  return viscous_force.calc_vel(x_host, f_host, a_host, sqrt_kT_Dt, offset, seed, flg);
+}
diff --git a/libs/stokesian_dynamics/src/sd_gpu.cu b/libs/stokesian_dynamics/src/sd_gpu.cu
new file mode 100644
index 00000000000..118ce5ebfde
--- /dev/null
+++ b/libs/stokesian_dynamics/src/sd_gpu.cu
@@ -0,0 +1,28 @@
+#include <vector>
+
+#include "sd.hpp"
+#include "stokesian_dynamics/sd_gpu.hpp"
+
+/** This executes the Stokesian Dynamics solver on the GPU. "_host" refers to
+ *  data being stored on the host (in THRUST terminology, as opposed to
+ *  "device" storage), which is ordinary RAM.
+ *
+ *  \param x_host particle positions
+ *  \param f_host particle forces and torques (total 6 entries per particle)
+ *  \param a_host particle radii
+ *  \param n_part number of particles
+ *  \param eta dynamic viscosity of the ambient fluid
+ *  \param sqrt_kT_Dt square root of kT/Dt, necessary for thermalization
+ *  \param offset integer simulation time (steps) used as seed for the RNG
+ *  \param seed global seed for the RNG used for the whole simulation
+ *  \param flg certain bits set in this register correspond to certain features activated
+ */
+std::vector<double> sd_gpu(std::vector<double> const &x_host,
+                           std::vector<double> const &f_host,
+                           std::vector<double> const &a_host,
+                           std::size_t n_part, double eta, double sqrt_kT_Dt,
+                           std::size_t offset, std::size_t seed, int flg) {
+  sd::solver<policy::device, double> viscous_force{eta, n_part};
+  // NOLINTNEXTLINE(clang-analyzer-core.NonNullParamChecker)
+  return viscous_force.calc_vel(x_host, f_host, a_host, sqrt_kT_Dt, offset, seed, flg);
+}
diff --git a/libs/stokesian_dynamics/src/thrust_wrapper.hpp b/libs/stokesian_dynamics/src/thrust_wrapper.hpp
new file mode 100644
index 00000000000..0602c1b16d7
--- /dev/null
+++ b/libs/stokesian_dynamics/src/thrust_wrapper.hpp
@@ -0,0 +1,195 @@
+#ifndef THRUST_WRAPPER_HPP
+#define THRUST_WRAPPER_HPP
+
+/** \file
+ *  This file provides a wrapper around THRUST functions and types. In case
+ *  that CUDA/THRUST is not present, equivalent standard C++ types and
+ *  functions are used.
+ */
+
+
+
+// Dependencies with THRUST
+#ifdef SD_USE_THRUST
+#  include <thrust/device_vector.h>
+#  include <thrust/execution_policy.h>
+#  include <thrust/tabulate.h>
+#  include <thrust/tuple.h>
+
+namespace thrust_wrapper {
+  // types and constants
+  using thrust::counting_iterator;
+  using thrust::device_vector;
+  using thrust::host_vector;
+  using thrust::minus;
+  using thrust::negate;
+  using thrust::plus;
+  using thrust::tuple;
+
+  using thrust::host;
+  using thrust::device;
+
+  // routines
+  using thrust::copy;
+  using thrust::copy_n;
+  using thrust::equal;
+  using thrust::fill;
+  using thrust::for_each;
+  using thrust::get;
+  using thrust::make_tuple;
+  using thrust::raw_pointer_cast;
+  using thrust::tabulate;
+  using thrust::tie;
+  using thrust::transform;
+}
+
+
+// vector addition
+template <typename T>
+thrust::device_vector<T> operator+(thrust::device_vector<T> const &x,
+                                   thrust::device_vector<T> const &y) {
+    assert(x.size() == y.size());
+    thrust::device_vector<T> z(x.size());
+    thrust::transform(x.begin(), x.end(), y.begin(), z.begin(),
+                      thrust::plus<T>{});
+    return z;
+}
+
+template <typename T>
+thrust::host_vector<T> operator+(thrust::host_vector<T> const &x,
+                                 thrust::host_vector<T> const &y) {
+    assert(x.size() == y.size());
+    thrust::host_vector<T> z(x.size());
+    thrust::transform(x.begin(), x.end(), y.begin(), z.begin(),
+                      thrust::plus<T>{});
+    return z;
+}
+
+
+
+
+
+
+
+
+#else // Dependencies without THRUST
+#  include <algorithm>
+#  include <cassert>
+#  include <iterator>
+#  include <tuple>
+#  include <vector>
+
+#  include <boost/iterator/counting_iterator.hpp>
+
+
+namespace thrust_wrapper {
+
+  // types
+  using boost::counting_iterator;
+  template<typename T> using device_vector = std::vector<T>;
+  template<typename T> using host_vector   = std::vector<T>;
+  using std::minus;
+  using std::negate;
+  using std::plus;
+  using std::tuple;
+
+  // dummy constants, they could be anything
+  const int host = 0;
+  const int device = 0;
+
+  // routines
+  using std::copy;
+  using std::copy_n;
+  using std::for_each;
+  using std::get;
+  using std::make_tuple;
+  using std::tie;
+
+  // tabulate doesn't exist in the standard library
+  template <typename DerivedPolicy, typename ForwardIterator,
+            typename UnaryOperation>
+  void tabulate(const DerivedPolicy &,
+                ForwardIterator first,
+                ForwardIterator last,
+                UnaryOperation unary_op) {
+    ForwardIterator i = first;
+    while (i != last) {
+      *i = unary_op(i - first);
+      ++i;
+    }
+  }
+
+  // literally does nothing with its argument
+  template <typename T>
+  T *raw_pointer_cast(T *ptr) {
+    return ptr;
+  }
+
+  // These wrapper functions are needed because the THRUST variant gets the
+  // execution policy as argument, which the STD library one doesn't.
+  template <typename DerivedPolicy, typename InputIterator1,
+            typename InputIterator2>
+  bool equal(const DerivedPolicy &,
+             InputIterator1 first1,
+             InputIterator1 last1,
+             InputIterator2 first2) {
+    return std::equal(first1, last1, first2);
+  }
+
+  template <typename DerivedPolicy, typename ForwardIterator, typename T>
+  void fill(const DerivedPolicy &,
+            ForwardIterator first,
+            ForwardIterator last,
+            const T &value) {
+    std::fill(first, last, value);
+  }
+
+  template <typename DerivedPolicy, typename InputIterator,
+            typename UnaryFunction>
+  void for_each(const DerivedPolicy &,
+                InputIterator first,
+                InputIterator last,
+                UnaryFunction f) {
+    std::for_each(first, last, f);
+  }
+
+  // unary transform
+  template <typename DerivedPolicy, typename ForwardIterator,
+            typename OutputIterator, typename UnaryFunction>
+  void transform(const DerivedPolicy &,
+                 ForwardIterator first,
+                 ForwardIterator last,
+                 OutputIterator result,
+                 UnaryFunction op) {
+    std::transform(first, last, result, op);
+  }
+
+  // binary transform
+  template <typename DerivedPolicy, typename InputIterator1,
+            typename InputIterator2, typename OutputIterator,
+            typename BinaryFunction>
+  bool transform(const DerivedPolicy &,
+                 InputIterator1 first1,
+                 InputIterator1 last1,
+                 InputIterator2 first2,
+                 OutputIterator result,
+                 BinaryFunction op) {
+    return std::transform(first1, last1, first2, result, op);
+  }
+
+}
+
+
+// vector addition
+template <typename T>
+std::vector<T> operator+(std::vector<T> const &x,
+                         std::vector<T> const &y) {
+    assert(x.size() == y.size());
+    std::vector<T> z(x.size());
+    std::transform(x.begin(), x.end(), y.begin(), z.begin(),
+                   std::plus<T>{});
+    return z;
+}
+
+#endif
+#endif
diff --git a/maintainer/CI/build_cmake.sh b/maintainer/CI/build_cmake.sh
index 341d87f63ae..fa351b7ac82 100755
--- a/maintainer/CI/build_cmake.sh
+++ b/maintainer/CI/build_cmake.sh
@@ -97,6 +97,7 @@ set_default_value with_cuda_compiler "nvcc"
 set_default_value build_type "RelWithAssert"
 set_default_value with_ccache false
 set_default_value with_scafacos false
+set_default_value with_stokesian_dynamics false
 set_default_value test_timeout 300
 set_default_value hide_gpu false
 
@@ -120,6 +121,13 @@ if [ "${with_ccache}" = true ]; then
 fi
 if [ "${with_scafacos}" = true ]; then
     cmake_params="${cmake_params} -DWITH_SCAFACOS=ON"
+else
+    cmake_params="${cmake_params} -DWITH_SCAFACOS=OFF"
+fi
+if [ "${with_stokesian_dynamics}" = true ]; then
+    cmake_params="${cmake_params} -DWITH_STOKESIAN_DYNAMICS=ON"
+else
+    cmake_params="${cmake_params} -DWITH_STOKESIAN_DYNAMICS=OFF"
 fi
 
 if [ "${with_fftw}" = true ]; then
diff --git a/pylint.log b/pylint.log
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/samples/dancing.py b/samples/dancing.py
new file mode 100644
index 00000000000..67c0b5390b9
--- /dev/null
+++ b/samples/dancing.py
@@ -0,0 +1,101 @@
+#
+# Copyright (C) 2019-2020 The ESPResSo project
+#
+# This file is part of ESPResSo.
+#
+# ESPResSo is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# ESPResSo is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+"""
+Stokesian Dynamics simulation of particle sedimentation.
+Reproduce the trajectory in Figure 5b from :cite:`durlofsky87a`.
+"""
+import espressomd
+import espressomd.constraints
+import espressomd.observables
+import espressomd.accumulators
+import numpy as np
+import matplotlib.pyplot as plt
+
+import argparse
+
+parser = argparse.ArgumentParser(epilog=__doc__)
+group = parser.add_mutually_exclusive_group()
+group.add_argument('--cpu', action='store_true', help='Use CPU implementation')
+group.add_argument('--gpu', action='store_true', help='Use GPU implementation')
+group = parser.add_mutually_exclusive_group()
+group.add_argument('--ft', action='store_true', help='Use FT approximation')
+group.add_argument('--fts', action='store_true', help='Use FTS approximation')
+args = parser.parse_args()
+
+required_features = ["STOKESIAN_DYNAMICS"]
+
+if args.gpu:
+    print("Using GPU implementation")
+    required_features.append("CUDA")
+    required_features.append("STOKESIAN_DYNAMICS_GPU")
+    sd_device = "gpu"
+else:
+    print("Using CPU implementation")
+    sd_device = "cpu"
+
+if args.ft:
+    print("Using FT approximation method")
+    sd_method = "ft"
+else:
+    print("Using FTS approximation method")
+    sd_method = "fts"
+
+espressomd.assert_features(required_features)
+
+system = espressomd.System(box_l=[10, 10, 10])
+system.time_step = 1.5
+system.cell_system.skin = 0.4
+system.periodicity = [False, False, False]
+
+system.integrator.set_stokesian_dynamics(
+    viscosity=1.0, radii={0: 1.0}, device=sd_device,
+    approximation_method=sd_method)
+
+system.part.add(pos=[-5, 0, 0], rotation=[1, 1, 1])
+system.part.add(pos=[0, 0, 0], rotation=[1, 1, 1])
+system.part.add(pos=[7, 0, 0], rotation=[1, 1, 1])
+
+gravity = espressomd.constraints.Gravity(g=[0, -1, 0])
+system.constraints.add(gravity)
+
+obs = espressomd.observables.ParticlePositions(ids=system.part[:].id)
+acc = espressomd.accumulators.TimeSeries(obs=obs, delta_N=1)
+system.auto_update_accumulators.add(acc)
+acc.update()
+intsteps = int(10500 / system.time_step)
+system.integrator.run(intsteps)
+
+positions = acc.time_series()
+ref_data = "../testsuite/python/data/dancing.txt"
+data = np.loadtxt(ref_data)
+
+for i in range(3):
+    plt.plot(positions[:, i, 0], positions[:, i, 1], linestyle='solid')
+
+plt.gca().set_prop_cycle(None)
+
+for i in range(0, 6, 2):
+    plt.plot(data[:, i], data[:, i + 1], linestyle='dashed')
+
+plt.title("Trajectory of sedimenting spheres\nsolid line: simulation "
+          "({} on {}), dashed line: paper (FTS)"
+          .format(sd_method.upper(), sd_device.upper()))
+plt.xlabel("x")
+plt.ylabel("y")
+plt.show()
diff --git a/src/config/features.def b/src/config/features.def
index f72b4f287f0..a18167fafa2 100644
--- a/src/config/features.def
+++ b/src/config/features.def
@@ -24,6 +24,12 @@ COLLISION_DETECTION
 NPT
 ENGINE                          implies ROTATION, EXTERNAL_FORCES
 PARTICLE_ANISOTROPY             implies ROTATION
+STOKESIAN_DYNAMICS              requires BLAS and LAPACK
+STOKESIAN_DYNAMICS              implies ROTATION
+# Needs cuBLAS, which is expected to be included with CUDA
+STOKESIAN_DYNAMICS_GPU          requires CUDA
+STOKESIAN_DYNAMICS_GPU          implies ROTATION
+STOKESIAN_DYNAMICS_GPU          requires EXPERIMENTAL_FEATURES
 
 /* Rotation */
 ROTATION
@@ -96,8 +102,13 @@ ADDITIONAL_CHECKS
 
 # External switches
 # Switches that are set by configure or gcc, not to be set manually
+# All these switches must also be present in cmake/cmake_config.cmakein
 CUDA external
 FFTW external
 H5MD external
 SCAFACOS external
 GSL external
+BLAS external
+LAPACK external
+STOKESIAN_DYNAMICS external
+STOKESIAN_DYNAMICS_GPU external
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index e1439128472..e04fe271a02 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -110,3 +110,13 @@ add_subdirectory(virtual_sites)
 if(WITH_UNIT_TESTS)
   add_subdirectory(unit_tests)
 endif(WITH_UNIT_TESTS)
+
+if(STOKESIAN_DYNAMICS OR STOKESIAN_DYNAMICS_GPU)
+  add_subdirectory(stokesian_dynamics)
+  if(STOKESIAN_DYNAMICS)
+    target_link_libraries(EspressoCore PRIVATE StokesianDynamics::sd_cpu)
+  endif()
+  if(STOKESIAN_DYNAMICS_GPU)
+    target_link_libraries(EspressoCore PRIVATE StokesianDynamics::sd_gpu)
+  endif()
+endif()
diff --git a/src/core/communication.cpp b/src/core/communication.cpp
index 693e9d5e67d..902b0011df3 100644
--- a/src/core/communication.cpp
+++ b/src/core/communication.cpp
@@ -57,6 +57,7 @@
 #include "particle_data.hpp"
 #include "pressure.hpp"
 #include "rotation.hpp"
+#include "stokesian_dynamics/sd_interface.hpp"
 #include "virtual_sites.hpp"
 
 #include "electrostatics_magnetostatics/coulomb.hpp"
diff --git a/src/core/event.cpp b/src/core/event.cpp
index 517d121c9be..8f17c290b16 100644
--- a/src/core/event.cpp
+++ b/src/core/event.cpp
@@ -301,6 +301,15 @@ void on_parameter_change(int field) {
       Scafacos::update_system_params();
     }
 #endif
+#endif
+#ifdef STOKESIAN_DYNAMICS
+    if (integ_switch == INTEG_METHOD_SD) {
+      if (box_geo.periodic(0) || box_geo.periodic(1) || box_geo.periodic(2))
+        runtimeErrorMsg() << "Illegal box periodicity for Stokesian Dynamics: "
+                          << box_geo.periodic(0) << " " << box_geo.periodic(1)
+                          << " " << box_geo.periodic(2) << "\n"
+                          << "  Required: 0 0 0\n";
+    }
 #endif
   case FIELD_MIN_GLOBAL_CUT:
   case FIELD_SKIN:
diff --git a/src/core/integrate.cpp b/src/core/integrate.cpp
index 9f5a048f4ac..6ac545174f3 100644
--- a/src/core/integrate.cpp
+++ b/src/core/integrate.cpp
@@ -40,6 +40,7 @@
 #include "event.hpp"
 #include "forces.hpp"
 #include "global.hpp"
+#include "grid.hpp"
 #include "grid_based_algorithms/lb_interface.hpp"
 #include "grid_based_algorithms/lb_particle_coupling.hpp"
 #include "nonbonded_interactions/nonbonded_interaction_data.hpp"
@@ -53,6 +54,7 @@
 #include "integrators/brownian_inline.hpp"
 #include "integrators/langevin_inline.hpp"
 #include "integrators/steepest_descent.hpp"
+#include "integrators/stokesian_dynamics_inline.hpp"
 #include "integrators/velocity_verlet_inline.hpp"
 #include "integrators/velocity_verlet_npt.hpp"
 
@@ -125,6 +127,11 @@ bool integrator_step_1(ParticleRange &particles) {
     // the Ermak-McCammon's Brownian Dynamics requires a single step
     // so, just skip here
     break;
+#ifdef STOKESIAN_DYNAMICS
+  case INTEG_METHOD_SD:
+    stokesian_dynamics_step_1(particles);
+    break;
+#endif // STOKESIAN_DYNAMICS
   default:
     throw std::runtime_error("Unknown value for integ_switch");
   }
@@ -150,6 +157,11 @@ void integrator_step_2(ParticleRange &particles) {
     // the Ermak-McCammon's Brownian Dynamics requires a single step
     brownian_dynamics_propagator(brownian, particles);
     break;
+#ifdef STOKESIAN_DYNAMICS
+  case INTEG_METHOD_SD:
+    stokesian_dynamics_step_2(particles);
+    break;
+#endif // STOKESIAN_DYNAMICS
   default:
     throw std::runtime_error("Unknown value for INTEG_SWITCH");
   }
@@ -202,7 +214,6 @@ int integrate(int n_steps, int reuse_forces) {
 #ifdef VALGRIND_INSTRUMENTATION
   CALLGRIND_START_INSTRUMENTATION;
 #endif
-
   /* Integration loop */
   ESPRESSO_PROFILER_CXX_MARK_LOOP_BEGIN(integration_loop, "Integration loop");
   int integrated_steps = 0;
@@ -280,6 +291,7 @@ int integrate(int n_steps, int reuse_forces) {
 
   } // for-loop over integration steps
   ESPRESSO_PROFILER_CXX_MARK_LOOP_END(integration_loop);
+
 #ifdef VALGRIND_INSTRUMENTATION
   CALLGRIND_STOP_INSTRUMENTATION;
 #endif
@@ -418,6 +430,16 @@ void integrate_set_bd() {
   mpi_bcast_parameter(FIELD_INTEG_SWITCH);
 }
 
+int integrate_set_sd() {
+  if (box_geo.periodic(0) || box_geo.periodic(1) || box_geo.periodic(2)) {
+    runtimeErrorMsg() << "Stokesian Dynamics requires periodicity 0 0 0\n";
+    return ES_ERROR;
+  }
+  integ_switch = INTEG_METHOD_SD;
+  mpi_bcast_parameter(FIELD_INTEG_SWITCH);
+  return ES_OK;
+}
+
 #ifdef NPT
 int integrate_set_npt_isotropic(double ext_pressure, double piston,
                                 bool xdir_rescale, bool ydir_rescale,
diff --git a/src/core/integrate.hpp b/src/core/integrate.hpp
index aee38748e44..6bd8e859819 100644
--- a/src/core/integrate.hpp
+++ b/src/core/integrate.hpp
@@ -31,6 +31,7 @@
 #define INTEG_METHOD_NVT 1
 #define INTEG_METHOD_STEEPEST_DESCENT 2
 #define INTEG_METHOD_BD 3
+#define INTEG_METHOD_SD 7
 
 /** Switch determining which integrator to use. */
 extern int integ_switch;
@@ -126,6 +127,9 @@ void integrate_set_nvt();
 /** @brief Set the Brownian Dynamics integrator. */
 void integrate_set_bd();
 
+/** @brief Set the Stokesian Dynamics integrator. */
+int integrate_set_sd();
+
 /** @brief Set the velocity Verlet integrator modified for the NpT ensemble
  *  with isotropic rescaling.
  *
diff --git a/src/core/integrators/stokesian_dynamics_inline.hpp b/src/core/integrators/stokesian_dynamics_inline.hpp
new file mode 100644
index 00000000000..f44268942b7
--- /dev/null
+++ b/src/core/integrators/stokesian_dynamics_inline.hpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2010-2019 The ESPResSo project
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef STOKESIAN_DYNAMICS_INLINE_HPP
+#define STOKESIAN_DYNAMICS_INLINE_HPP
+
+#include "ParticleRange.hpp"
+#include "config.hpp"
+#include "particle_data.hpp"
+#include "rotation.hpp"
+#include "stokesian_dynamics/sd_interface.hpp"
+
+#include <utils/Vector.hpp>
+
+#ifdef STOKESIAN_DYNAMICS
+
+inline void
+stokesian_dynamics_propagate_vel_pos(const ParticleRange &particles) {
+  auto const skin2 = Utils::sqr(0.5 * skin);
+
+  // Compute new (translational and rotational) velocities
+  propagate_vel_pos_sd(particles);
+
+  for (auto &p : particles) {
+
+    // Translate particle
+    p.r.p += p.m.v * time_step;
+
+    // Perform rotation
+    double norm = p.m.omega.norm();
+    if (norm != 0) {
+      Utils::Vector3d omega_unit = (1 / norm) * p.m.omega;
+      local_rotate_particle(p, omega_unit, norm * time_step);
+    }
+
+    // Verlet criterion check
+    if ((p.r.p - p.l.p_old).norm2() > skin2)
+      cell_structure.set_resort_particles(Cells::RESORT_LOCAL);
+  }
+}
+
+inline void stokesian_dynamics_step_1(const ParticleRange &particles) {
+  stokesian_dynamics_propagate_vel_pos(particles);
+  sim_time += time_step;
+}
+
+inline void stokesian_dynamics_step_2(const ParticleRange &particles) {}
+
+#endif // STOKESIAN_DYNAMICS
+#endif
diff --git a/src/core/serialization/SD_particle_data.hpp b/src/core/serialization/SD_particle_data.hpp
new file mode 100644
index 00000000000..84fd2162839
--- /dev/null
+++ b/src/core/serialization/SD_particle_data.hpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2010-2020 The ESPResSo project
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef SERIALIZATION_SD_PARTICLE_DATA_HPP
+#define SERIALIZATION_SD_PARTICLE_DATA_HPP
+
+#include "stokesian_dynamics/sd_interface.hpp"
+
+#include <boost/serialization/is_bitwise_serializable.hpp>
+#include <boost/serialization/split_free.hpp>
+
+#if defined(STOKESIAN_DYNAMICS) or defined(STOKESIAN_DYNAMICS_GPU)
+
+BOOST_IS_BITWISE_SERIALIZABLE(SD_particle_data)
+
+namespace boost {
+namespace serialization {
+template <typename Archive>
+void load(Archive &ar, SD_particle_data &p,
+          const unsigned int /* file_version */) {
+  ar >> make_array(reinterpret_cast<char *>(&p), sizeof(SD_particle_data));
+}
+
+template <typename Archive>
+void save(Archive &ar, SD_particle_data const &p,
+          const unsigned int /* file_version */) {
+  /* Cruel but effective */
+  ar << make_array(reinterpret_cast<char const *>(&p),
+                   sizeof(SD_particle_data));
+}
+
+template <class Archive>
+void serialize(Archive &ar, SD_particle_data &p,
+               const unsigned int file_version) {
+  split_free(ar, p, file_version);
+}
+} // namespace serialization
+} // namespace boost
+
+#endif
+#endif
diff --git a/src/core/stokesian_dynamics/CMakeLists.txt b/src/core/stokesian_dynamics/CMakeLists.txt
new file mode 100644
index 00000000000..ffaaa02af88
--- /dev/null
+++ b/src/core/stokesian_dynamics/CMakeLists.txt
@@ -0,0 +1,25 @@
+#
+# Copyright (C) 2019-2020 The ESPResSo project
+#
+# This file is part of ESPResSo.
+#
+# ESPResSo is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# ESPResSo is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+target_include_directories(
+  EspressoCore PRIVATE ${CMAKE_SOURCE_DIR}/libs/stokesian_dynamics/include
+                       ${CMAKE_SOURCE_DIR}/src/core/stokesian_dynamics)
+
+target_sources(EspressoCore
+               PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/sd_interface.cpp)
diff --git a/src/core/stokesian_dynamics/sd_interface.cpp b/src/core/stokesian_dynamics/sd_interface.cpp
new file mode 100644
index 00000000000..bfb8ef08e31
--- /dev/null
+++ b/src/core/stokesian_dynamics/sd_interface.cpp
@@ -0,0 +1,308 @@
+/*
+ * Copyright (C) 2010-2020 The ESPResSo project
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "config.hpp"
+
+#if defined(STOKESIAN_DYNAMICS) || defined(STOKESIAN_DYNAMICS_GPU)
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include <boost/config.hpp>
+
+#include "ParticleRange.hpp"
+#include "communication.hpp"
+#include "errorhandling.hpp"
+#include "integrate.hpp"
+#include "particle_data.hpp"
+#include "rotation.hpp"
+#include "thermostat.hpp"
+
+#include "serialization/SD_particle_data.hpp"
+
+#include <utils/Vector.hpp>
+#include <utils/mpi/gather_buffer.hpp>
+#include <utils/mpi/scatter_buffer.hpp>
+
+#ifdef STOKESIAN_DYNAMICS
+#include "stokesian_dynamics/sd_cpu.hpp"
+#endif
+
+#ifdef STOKESIAN_DYNAMICS_GPU
+#include "stokesian_dynamics/sd_gpu.hpp"
+#endif
+
+#include "sd_interface.hpp"
+
+namespace {
+double sd_viscosity = -1.0;
+
+enum { CPU, GPU, INVALID } device = INVALID;
+
+std::unordered_map<int, double> radius_dict;
+
+double sd_kT = 0.0;
+
+std::size_t sd_seed = 0UL;
+
+int sd_flags = 0;
+
+/** Buffer that holds local particle data, and all particles on the master node
+ *  used for sending particle data to master node. */
+std::vector<SD_particle_data> parts_buffer{};
+
+/** Buffer that holds the (translational and angular) velocities of the local
+ *  particles on each node, used for returning results. */
+std::vector<double> v_sd{};
+
+} // namespace
+
+/** Pack selected properties of all local particles into a buffer. */
+void sd_gather_local_particles(ParticleRange const &parts) {
+  std::size_t n_part = parts.size();
+
+  parts_buffer.resize(n_part);
+  std::size_t i = 0;
+
+  for (auto const &p : parts) {
+    parts_buffer[i].id = p.p.is_virtual ? -1 : p.p.identity;
+    parts_buffer[i].type = p.p.type;
+
+    parts_buffer[i].pos[0] = p.r.p[0];
+    parts_buffer[i].pos[1] = p.r.p[1];
+    parts_buffer[i].pos[2] = p.r.p[2];
+
+    // Velocity is not needed for SD, thus omit while gathering.
+    // Particle orientation is also not needed for SD
+    // since all particles are assumed spherical.
+
+    parts_buffer[i].ext_force[0] = p.f.f[0];
+    parts_buffer[i].ext_force[1] = p.f.f[1];
+    parts_buffer[i].ext_force[2] = p.f.f[2];
+
+    parts_buffer[i].ext_torque[0] = p.f.torque[0];
+    parts_buffer[i].ext_torque[1] = p.f.torque[1];
+    parts_buffer[i].ext_torque[2] = p.f.torque[2];
+
+    // radius_dict is not initialized on slave nodes -> need to assign radius
+    // later on master node
+    parts_buffer[i].r = 0;
+
+    i++;
+  }
+}
+
+/** Update translational and rotational velocities of all particles. */
+void sd_update_locally(ParticleRange const &parts) {
+  std::size_t i = 0;
+
+  // Even though on the master node, the v_sd vector is larger than
+  // the (local) parts vector, this should still work. Because the local
+  // particles correspond to the first 6*n entries in the master's v_sd
+  // (which holds the velocities of ALL particles).
+
+  for (auto &p : parts) {
+    // skip virtual particles
+    if (p.p.is_virtual) {
+      continue;
+    }
+
+    // Copy velocities
+    p.m.v[0] = v_sd[6 * i];
+    p.m.v[1] = v_sd[6 * i + 1];
+    p.m.v[2] = v_sd[6 * i + 2];
+
+    p.m.omega[0] = v_sd[6 * i + 3];
+    p.m.omega[1] = v_sd[6 * i + 4];
+    p.m.omega[2] = v_sd[6 * i + 5];
+
+    i++;
+  }
+}
+
+void set_sd_viscosity(double eta) { sd_viscosity = eta; }
+
+double get_sd_viscosity() { return sd_viscosity; }
+
+void set_sd_device(std::string const &dev) {
+  device = INVALID;
+#ifdef STOKESIAN_DYNAMICS
+  if (dev == "cpu") {
+    device = CPU;
+  }
+#endif
+#ifdef STOKESIAN_DYNAMICS_GPU
+  if (dev == "gpu") {
+    device = GPU;
+  }
+#endif
+}
+
+std::string get_sd_device() {
+  switch (device) {
+  case CPU:
+    return "cpu";
+  case GPU:
+    return "gpu";
+  default:
+    return "invalid";
+  }
+}
+
+void set_sd_radius_dict(std::unordered_map<int, double> const &x) {
+  radius_dict = x;
+}
+
+std::unordered_map<int, double> get_sd_radius_dict() { return radius_dict; }
+
+void set_sd_kT(double kT) { sd_kT = kT; }
+
+double get_sd_kT() { return sd_kT; }
+
+void set_sd_seed(std::size_t seed) { sd_seed = seed; }
+
+std::size_t get_sd_seed() { return sd_seed; }
+
+void set_sd_flags(int flg) { sd_flags = flg; }
+
+int get_sd_flags() { return sd_flags; }
+
+void propagate_vel_pos_sd(const ParticleRange &particles) {
+  std::size_t n_part_local = particles.size();
+
+  if (this_node == 0) {
+    if (integ_switch & INTEG_METHOD_SD) {
+      if (BOOST_UNLIKELY(sd_viscosity < 0.0)) {
+        runtimeErrorMsg() << "sd_viscosity has an invalid value: " +
+                                 std::to_string(sd_viscosity);
+        return;
+      }
+
+      if (BOOST_UNLIKELY(sd_kT < 0.0)) {
+        runtimeErrorMsg() << "sd_kT has an invalid value: " +
+                                 std::to_string(sd_viscosity);
+        return;
+      }
+
+      sd_gather_local_particles(particles);
+      Utils::Mpi::gather_buffer(parts_buffer, comm_cart, 0);
+
+      std::size_t n_part = parts_buffer.size();
+
+      static std::vector<int> id{};
+      static std::vector<double> x_host{};
+      static std::vector<double> f_host{};
+      static std::vector<double> a_host{};
+
+      // n_part not expected to change, but anyway ...
+      id.resize(n_part);
+      x_host.resize(6 * n_part);
+      f_host.resize(6 * n_part);
+      a_host.resize(n_part);
+
+      std::size_t i = 0;
+      for (auto const &p : parts_buffer) {
+        id[i] = parts_buffer[i].id;
+
+        x_host[6 * i + 0] = p.pos[0];
+        x_host[6 * i + 1] = p.pos[1];
+        x_host[6 * i + 2] = p.pos[2];
+        // Actual orientation is not needed, just need default.
+        x_host[6 * i + 3] = 1;
+        x_host[6 * i + 4] = 0;
+        x_host[6 * i + 5] = 0;
+
+        f_host[6 * i + 0] = p.ext_force[0];
+        f_host[6 * i + 1] = p.ext_force[1];
+        f_host[6 * i + 2] = p.ext_force[2];
+
+        f_host[6 * i + 3] = p.ext_torque[0];
+        f_host[6 * i + 4] = p.ext_torque[1];
+        f_host[6 * i + 5] = p.ext_torque[2];
+
+        double radius = radius_dict[p.type];
+
+        if (BOOST_UNLIKELY(radius < 0.0)) {
+          runtimeErrorMsg()
+              << "particle of type " +
+                     std::to_string(int(parts_buffer[i].type)) +
+                     " has an invalid radius: " + std::to_string(radius);
+          return;
+        }
+
+        a_host[i] = radius;
+
+        ++i;
+      }
+
+      std::size_t offset = std::round(sim_time / time_step);
+      switch (device) {
+
+#ifdef STOKESIAN_DYNAMICS
+      case CPU:
+        v_sd = sd_cpu(x_host, f_host, a_host, n_part, sd_viscosity,
+                      std::sqrt(sd_kT / time_step), offset, sd_seed, sd_flags);
+        break;
+#endif
+
+#ifdef STOKESIAN_DYNAMICS_GPU
+      case GPU:
+        v_sd = sd_gpu(x_host, f_host, a_host, n_part, sd_viscosity,
+                      std::sqrt(sd_kT / time_step), offset, sd_seed, sd_flags);
+        break;
+#endif
+
+      default:
+        runtimeErrorMsg()
+            << "Invalid device for Stokesian dynamics. Available devices:"
+#ifdef STOKESIAN_DYNAMICS
+               " cpu"
+#endif
+#ifdef STOKESIAN_DYNAMICS_GPU
+               " gpu"
+#endif
+            ;
+        return;
+      }
+
+      Utils::Mpi::scatter_buffer(
+          v_sd.data(), static_cast<int>(n_part_local * 6), comm_cart, 0);
+      sd_update_locally(particles);
+    }
+
+  } else { // if (this_node == 0)
+
+    if (thermo_switch & THERMO_SD) {
+      sd_gather_local_particles(particles);
+      Utils::Mpi::gather_buffer(parts_buffer, comm_cart, 0);
+
+      v_sd.resize(n_part_local * 6);
+
+      // now wait while master node is busy ...
+
+      Utils::Mpi::scatter_buffer(
+          v_sd.data(), static_cast<int>(n_part_local * 6), comm_cart, 0);
+      sd_update_locally(particles);
+    }
+
+  } // if (this_node == 0) {...} else
+}
+
+#endif // STOKESIAN_DYNAMICS
diff --git a/src/core/stokesian_dynamics/sd_interface.hpp b/src/core/stokesian_dynamics/sd_interface.hpp
new file mode 100644
index 00000000000..e36a0efe520
--- /dev/null
+++ b/src/core/stokesian_dynamics/sd_interface.hpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (C) 2010-2020 The ESPResSo project
+ *
+ * This file is part of ESPResSo.
+ *
+ * ESPResSo is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * ESPResSo is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file
+ *  See @cite durlofsky87a for the Stokesian dynamics method used here.
+ *  See @cite banchio03a and @cite brady88a for the thermalization method.
+ */
+
+#ifndef STOKESIAN_DYNAMICS_INTERFACE_H
+#define STOKESIAN_DYNAMICS_INTERFACE_H
+
+#include "ParticleRange.hpp"
+#include "config.hpp"
+
+#include <string>
+#include <unordered_map>
+#include <utils/Vector.hpp>
+
+#ifdef STOKESIAN_DYNAMICS
+
+/* type for particle data transfer between nodes */
+struct SD_particle_data {
+  int on_node = -1;
+  int id = -1;
+  int type = 0;
+
+  /* particle radius */
+  double r = -1;
+
+  /* particle position */
+  Utils::Vector3d pos = {0., 0., 0.};
+
+  /* particle velocity */
+  Utils::Vector3d vel = {0., 0., 0.};
+
+  /* particle rotational velocity */
+  Utils::Vector3d omega = {0., 0., 0.};
+
+  /* external force */
+  Utils::Vector3d ext_force = {0.0, 0.0, 0.0};
+
+  /* external torque */
+  Utils::Vector3d ext_torque = {0.0, 0.0, 0.0};
+};
+
+void set_sd_viscosity(double eta);
+double get_sd_viscosity();
+
+void set_sd_device(std::string const &dev);
+std::string get_sd_device();
+
+void set_sd_radius_dict(std::unordered_map<int, double> const &x);
+std::unordered_map<int, double> get_sd_radius_dict();
+
+void set_sd_kT(double kT);
+double get_sd_kT();
+
+void set_sd_flags(int flg);
+int get_sd_flags();
+
+void set_sd_seed(std::size_t seed);
+std::size_t get_sd_seed();
+
+/** Takes the forces and torques on all particles and computes their
+ *  velocities. Acts globally on particles on all nodes; i.e. particle data
+ *  is gathered from all nodes and their velocities and angular velocities are
+ *  set according to the Stokesian Dynamics method.
+ */
+void propagate_vel_pos_sd(const ParticleRange &particles);
+
+#endif // STOKESIAN_DYNAMICS
+
+#endif // STOKESIAN_DYNAMICS_INTERFACE_H
diff --git a/src/core/thermostat.hpp b/src/core/thermostat.hpp
index cc7115a4e83..f87b600ede1 100644
--- a/src/core/thermostat.hpp
+++ b/src/core/thermostat.hpp
@@ -46,6 +46,7 @@
 #define THERMO_NPT_ISO 4
 #define THERMO_LB 8
 #define THERMO_BROWNIAN 16
+#define THERMO_SD 32
 /*@}*/
 
 namespace Thermostat {
diff --git a/src/python/espressomd/globals.pyx b/src/python/espressomd/globals.pyx
index 719cf68761c..20ca445af16 100644
--- a/src/python/espressomd/globals.pyx
+++ b/src/python/espressomd/globals.pyx
@@ -25,7 +25,7 @@ from .globals cimport timing_samples
 from .globals cimport forcecap_set
 from .globals cimport forcecap_get
 from .utils import array_locked
-from .utils cimport Vector3d, make_array_locked
+from .utils cimport Vector3d, make_array_locked, handle_errors
 
 cdef class Globals:
     property box_l:
@@ -68,6 +68,7 @@ cdef class Globals:
                 box_geo.set_periodic(i, _periodic[i])
 
             mpi_bcast_parameter(FIELD_PERIODIC)
+            handle_errors("Error while assigning system periodicity")
 
         def __get__(self):
             periodicity = np.zeros(3)
diff --git a/src/python/espressomd/integrate.pxd b/src/python/espressomd/integrate.pxd
index 60a6577adfb..6fe47ebeaa2 100644
--- a/src/python/espressomd/integrate.pxd
+++ b/src/python/espressomd/integrate.pxd
@@ -17,6 +17,8 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 from libcpp cimport bool as cbool
+from libcpp.string cimport string
+from libcpp.unordered_map cimport unordered_map
 
 include "myconfig.pxi"
 
@@ -25,6 +27,7 @@ cdef extern from "config.hpp":
 
 cdef extern from "integrate.hpp" nogil:
     cdef int python_integrate(int n_steps, cbool recalc_forces, int reuse_forces)
+    cdef void integrate_set_sd()
     cdef void integrate_set_nvt()
     cdef int integrate_set_steepest_descent(const double f_max, const double gamma,
                                             const int max_steps, const double max_displacement)
@@ -38,6 +41,28 @@ IF NPT:
                                              cbool xdir_rescale, cbool ydir_rescale,
                                              cbool zdir_rescale, cbool cubic_box)
 
+cdef extern from "stokesian_dynamics/sd_interface.hpp":
+    IF(STOKESIAN_DYNAMICS or STOKESIAN_DYNAMICS_GPU):
+        void set_sd_viscosity(double eta)
+        double get_sd_viscosity()
+
+        void set_sd_device(const string & dev)
+        string get_sd_device()
+
+        void set_sd_radius_dict(const unordered_map[int, double] & radius_dict)
+        unordered_map[int, double] get_sd_radius_dict()
+
+        void set_sd_flags(int flg)
+        int get_sd_flags()
+
+IF(STOKESIAN_DYNAMICS or STOKESIAN_DYNAMICS_GPU):
+    cpdef enum flags:
+        NONE = 0,
+        SELF_MOBILITY = 1 << 0,
+        PAIR_MOBILITY = 1 << 1,
+        LUBRICATION = 1 << 2,
+        FTS = 1 << 3
+
 cdef inline int _integrate(int nSteps, cbool recalc_forces, int reuse_forces):
     with nogil:
         return python_integrate(nSteps, recalc_forces, reuse_forces)
diff --git a/src/python/espressomd/integrate.pyx b/src/python/espressomd/integrate.pyx
index 044a4640dd1..9b698850f45 100644
--- a/src/python/espressomd/integrate.pyx
+++ b/src/python/espressomd/integrate.pyx
@@ -19,6 +19,7 @@
 from cpython.exc cimport PyErr_CheckSignals, PyErr_SetInterrupt
 include "myconfig.pxi"
 from .utils cimport handle_errors, check_type_or_throw_except
+from .utils import to_char_pointer
 from . cimport integrate
 
 cdef class IntegratorHandle:
@@ -100,6 +101,13 @@ cdef class IntegratorHandle:
         """
         self._integrator = BrownianDynamics()
 
+    def set_stokesian_dynamics(self, *args, **kwargs):
+        """
+        Set the integration method to Stokesian Dynamics (:class:`StokesianDynamics`).
+
+        """
+        self._integrator = StokesianDynamics(*args, **kwargs)
+
 
 cdef class Integrator:
     """
@@ -405,3 +413,105 @@ cdef class BrownianDynamics(Integrator):
 
     def _set_params_in_es_core(self):
         integrate_set_bd()
+
+
+IF(STOKESIAN_DYNAMICS or STOKESIAN_DYNAMICS_GPU):
+    cdef class StokesianDynamics(Integrator):
+        """
+        Stokesian Dynamics integrator.
+
+        Parameters
+        ----------
+        viscosity : :obj:`float`
+            Bulk viscosity.
+        radii : :obj:`dict`
+            Dictionary that maps particle types to radii.
+        device : :obj:`str`, optional, \{'cpu', 'gpu'\}
+            Device to execute on.
+        approximation_method : :obj:`str`, optional, \{'ft', 'fts'\}
+            Chooses the method of the mobility approximation.
+            ``'fts'`` is more accurate. Default is ``'fts'``.
+        self_mobility : :obj:`bool`, optional
+            Switches off or on the mobility terms for single particles. Default
+            is ``True``.
+        pair_mobility : :obj:`bool`, optional
+            Switches off or on the hydrodynamic interactions between particles.
+            Default is ``True``.
+
+        """
+
+        def default_params(self):
+            IF STOKESIAN_DYNAMICS:
+                sd_device_str = "cpu"
+            ELIF STOKESIAN_DYNAMICS_GPU:
+                sd_device_str = "gpu"
+            return {"lubrication": False, "approximation_method": "fts",
+                    "self_mobility": True, "pair_mobility": True,
+                    "device": sd_device_str}
+
+        def valid_keys(self):
+            """All parameters that can be set.
+
+            """
+            return {"radii", "viscosity", "device", "lubrication",
+                    "approximation_method", "self_mobility", "pair_mobility"}
+
+        def required_keys(self):
+            """Parameters that have to be set.
+
+            """
+            return {"radii", "viscosity"}
+
+        def validate_params(self):
+            check_type_or_throw_except(
+                self._params["viscosity"], 1, float,
+                "viscosity must be a number")
+            check_type_or_throw_except(
+                self._params["device"], 1, str,
+                "device must be a string")
+            check_type_or_throw_except(
+                self._params["radii"], 1, dict,
+                "radii must be a dictionary")
+            check_type_or_throw_except(
+                self._params["lubrication"], 1, bool,
+                "lubrication must be a bool")
+            if self._params["lubrication"]:
+                raise NotImplementedError(
+                    "Stokesian Dynamics lubrication is not available yet")
+            check_type_or_throw_except(
+                self._params["approximation_method"], 1, str,
+                "approximation_method must be a string")
+            if self._params["approximation_method"].lower() not in {
+                    "ft", "fts"}:
+                raise ValueError(
+                    "approximation_method must be either 'ft' or 'fts'")
+            check_type_or_throw_except(
+                self._params["self_mobility"], 1, bool,
+                "self_mobility must be a bool")
+            check_type_or_throw_except(
+                self._params["pair_mobility"], 1, bool,
+                "pair_mobility must be a bool")
+
+        def _set_params_in_es_core(self):
+            integrate_set_sd()
+            set_sd_radius_dict(self._params["radii"])
+            set_sd_device(to_char_pointer(self._params["device"].lower()))
+            set_sd_viscosity(self._params["viscosity"])
+            fl = flags.NONE
+            if self._params["lubrication"]:
+                fl = fl | flags.LUBRICATION
+            if self._params["approximation_method"].lower() == "fts":
+                fl = fl | flags.FTS
+            if self._params["self_mobility"]:
+                fl = fl | flags.SELF_MOBILITY
+            if self._params["pair_mobility"]:
+                fl = fl | flags.PAIR_MOBILITY
+            set_sd_flags(fl)
+
+            handle_errors(
+                "Encountered error while setting integration method to SD")
+
+ELSE:
+    cdef class StokesianDynamics(Integrator):
+        def __init__(self, *args, **kwargs):
+            raise Exception("Stokesian Dynamics not compiled in.")
diff --git a/src/python/espressomd/thermostat.pxd b/src/python/espressomd/thermostat.pxd
index 707cd2de124..622cda0f333 100644
--- a/src/python/espressomd/thermostat.pxd
+++ b/src/python/espressomd/thermostat.pxd
@@ -32,6 +32,7 @@ cdef extern from "thermostat.hpp":
     int THERMO_NPT_ISO
     int THERMO_DPD
     int THERMO_BROWNIAN
+    int THERMO_SD
 
     IF PARTICLE_ANISOTROPY:
         ctypedef struct langevin_thermostat_struct "LangevinThermostat":
@@ -69,6 +70,14 @@ cdef extern from "thermostat.hpp":
     IF DPD:
         stdint.uint64_t dpd_get_rng_state()
 
+cdef extern from "stokesian_dynamics/sd_interface.hpp":
+    IF(STOKESIAN_DYNAMICS or STOKESIAN_DYNAMICS_GPU):
+        void set_sd_kT(double kT)
+        double get_sd_kT()
+
+        void set_sd_seed(size_t seed)
+        size_t get_sd_seed()
+
 cdef extern from "script_interface/Globals.hpp":
     # links intern C-struct with python object
     cdef extern langevin_thermostat_struct langevin
diff --git a/src/python/espressomd/thermostat.pyx b/src/python/espressomd/thermostat.pyx
index a89bf3994ac..2636ac0255e 100644
--- a/src/python/espressomd/thermostat.pyx
+++ b/src/python/espressomd/thermostat.pyx
@@ -124,6 +124,8 @@ cdef class Thermostat:
                                   gamma_rotation=thmst["gamma_rotation"],
                                   act_on_virtual=thmst["act_on_virtual"],
                                   seed=thmst["seed"])
+            if thmst["type"] == "SD":
+                self.set_stokesian(kT=thmst["kT"], seed=thmst["seed"])
 
     def get_ts(self):
         return thermo_switch
@@ -205,6 +207,13 @@ cdef class Thermostat:
                 dpd_dict["kT"] = temperature
                 dpd_dict["seed"] = int(dpd_get_rng_state())
                 thermo_list.append(dpd_dict)
+        if (thermo_switch & THERMO_SD):
+            IF STOKESIAN_DYNAMICS:
+                sd_dict = {}
+                sd_dict["type"] = "SD"
+                sd_dict["kT"] = get_sd_kT()
+                sd_dict["seed"] = get_sd_seed()
+                thermo_list.append(sd_dict)
         return thermo_list
 
     def turn_off(self):
@@ -698,3 +707,35 @@ cdef class Thermostat:
 
             mpi_bcast_parameter(FIELD_THERMO_SWITCH)
             mpi_bcast_parameter(FIELD_TEMPERATURE)
+
+    IF STOKESIAN_DYNAMICS:
+        def set_stokesian(self, kT=None, seed=None):
+            """
+            Sets the SD thermostat with required parameters.
+
+            This thermostat requires the feature ``STOKESIAN_DYNAMICS`` or
+            ``STOKESIAN_DYNAMICS_GPU``.
+
+            Parameters
+            ----------
+            kT : :obj:`float`, optional
+                Temperature
+            seed : :obj:`int`, optional
+                Seed for the random number generator
+
+            """
+
+            if (kT is None) or (kT == 0):
+                set_sd_kT(0.0)
+            else:
+                utils.check_type_or_throw_except(
+                    kT, 1, float, "kT must be a float")
+                set_sd_kT(kT)
+
+                utils.check_type_or_throw_except(
+                    seed, 1, int, "seed must be an integer")
+                set_sd_seed(seed)
+
+            global thermo_switch
+            thermo_switch = (thermo_switch | THERMO_SD)
+            mpi_bcast_parameter(FIELD_THERMO_SWITCH)
diff --git a/testsuite/python/CMakeLists.txt b/testsuite/python/CMakeLists.txt
index 0faeccab91e..37ff49f305e 100644
--- a/testsuite/python/CMakeLists.txt
+++ b/testsuite/python/CMakeLists.txt
@@ -56,7 +56,7 @@ endfunction(PYTHON_TEST)
 # Separate features with hyphens, use a period to add an optional flag.
 foreach(
   TEST_COMBINATION
-  lb.cpu-p3m.cpu-lj-therm.lb;lb.gpu-p3m.cpu-lj-therm.lb;ek.gpu;lb.off-therm.npt-int.npt-minimization;lb.off-therm.langevin-int.nvt;lb.off-therm.dpd-int.sd;lb.off-therm.bd-int.bd
+  lb.cpu-p3m.cpu-lj-therm.lb;lb.gpu-p3m.cpu-lj-therm.lb;ek.gpu;lb.off-therm.npt-int.npt-minimization;lb.off-therm.langevin-int.nvt;lb.off-therm.dpd-int.sd;lb.off-therm.bd-int.bd;lb.off-therm.sdm-int.sdm.cpu;lb.off-therm.sdm-int.sdm.gpu
 )
   if(${TEST_COMBINATION} MATCHES "\\.gpu")
     set(TEST_LABELS "gpu")
@@ -200,9 +200,11 @@ python_test(FILE gpu_availability.py MAX_NUM_PROC 1 LABELS gpu)
 python_test(FILE features.py MAX_NUM_PROC 1)
 python_test(FILE galilei.py MAX_NUM_PROC 32)
 python_test(FILE time_series.py MAX_NUM_PROC 1)
-python_test(FILE linear_momentum.py)
+python_test(FILE linear_momentum.py MAX_NUM_PROC 4)
 python_test(FILE linear_momentum_lb.py MAX_NUM_PROC 2 LABELS gpu)
 python_test(FILE mmm1d.py MAX_NUM_PROC 2 LABELS gpu)
+python_test(FILE stokesian_dynamics_cpu.py MAX_NUM_PROC 2)
+python_test(FILE stokesian_dynamics_gpu.py MAX_NUM_PROC 4 LABELS gpu)
 python_test(FILE elc.py MAX_NUM_PROC 2)
 python_test(FILE elc_vs_analytic.py MAX_NUM_PROC 2)
 python_test(FILE rotation.py MAX_NUM_PROC 1)
@@ -224,6 +226,9 @@ add_custom_target(
     ${CMAKE_CURRENT_BINARY_DIR}
   COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/ek_common.py
           ${CMAKE_CURRENT_BINARY_DIR}
+  COMMAND
+    ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/stokesian_dynamics.py
+    ${CMAKE_CURRENT_BINARY_DIR}
   COMMAND
     ${CMAKE_COMMAND} -E copy
     ${CMAKE_CURRENT_SOURCE_DIR}/ek_eof_one_species_base.py
diff --git a/testsuite/python/data/dancing.txt b/testsuite/python/data/dancing.txt
new file mode 100644
index 00000000000..e4628ebf48f
--- /dev/null
+++ b/testsuite/python/data/dancing.txt
@@ -0,0 +1,125 @@
+# F-T-S trajectory digitized from Figure 5b in Durlofsky, Brady, Bossis
+# (1987). Dynamic simulation of hydrodynamically interacting particles.
+# Journal of Fluid Mechanics, 180, 21-49. doi:10.1017/S002211208700171X
+-5.0 0.0	0.0 0.0	7.0 0.0
+-4.99668 -0.713	-0.01829 -0.487	6.94144 -0.271
+-4.98734 -6.788	-0.02043 -6.451	6.96641 -5.903
+-4.96921 -12.86	-0.02558 -10.758	6.8727 -11.347
+-4.90709 -19.304	-0.03301 -20.568	6.77196 -16.189
+-4.82728 -26.884	-0.02689 -27.026	6.67141 -23.141
+-4.6861 -32.553	-0.0403 -31.814	6.51147 -29.351
+-4.54493 -38.222	-0.04236 -39.709	6.24687 -35.582
+-4.36852 -44.644	-0.00852 -46.52	5.99622 -41.659
+-4.17456 -50.682	0.01422 -52.975	5.7281 -47.588
+-3.95402 -58.994	0.07287 -58.586	5.41122 -54.431
+-3.71617 -64.265	0.104 -66.752	5.07323 -59.32
+-3.52218 -70.683	0.23208 -73.373	4.71101 -66.322
+-3.2843 -76.334	0.30747 -80.764	4.33477 -72.574
+-3.08145 -83.51	0.43989 -86.815	3.98316 -81.233
+-2.95778 -89.942	0.57249 -94.954	3.63823 -86.424
+-2.8606 -95.239	0.74897 -102.135	3.26553 -93.278
+-2.78066 -104.338	0.91214 -107.99	2.94153 -98.616
+-2.77128 -110.792	1.11486 -113.647	2.61085 -107.119
+-2.73554 -117.241	1.33543 -122.339	2.29731 -112.304
+-2.72613 -124.075	1.58657 -128.746	1.98406 -120.654
+-2.72545 -131.67	1.82438 -133.637	1.73691 -127.032
+-2.76869 -139.654	2.11099 -143.455	1.4199 -132.368
+-2.80344 -144.218	2.42352 -148.141	1.17984 -139.95
+-2.85537 -153.342	2.73175 -153.968	0.92575 -146.63
+-2.97797 -158.682	3.06642 -160.549	0.71708 -154.055
+-3.09168 -165.16	3.40558 -168.078	0.57461 -160.413
+-3.24062 -170.886	3.8016 -172.559	0.40082 -167.68
+-3.4247 -176.997	4.18904 -179.319	0.25487 -174.189
+-3.55606 -182.719	4.58096 -187.028	0.14042 -181.897
+-3.79271 -190.74	4.98141 -191.888	0.06416 -187.338
+-3.96795 -197.23	5.31609 -198.469	0.05085 -194.574
+-4.11683 -203.715	5.66836 -205.236	0.04098 -201.207
+-4.20433 -208.289	5.9678 -211.064	0.12173 -206.768
+-4.27395 -216.277	6.24972 -217.655	0.18532 -215.497
+-4.2736 -220.075	6.54027 -222.346	0.28694 -220.45
+-4.27296 -227.29	6.76055 -227.81	0.40972 -227.811
+-4.25479 -233.743	6.90168 -232.909	0.48357 -234.277
+-4.17502 -240.943	7.04732 -239.337	0.63075 -241.483
+-4.10424 -245.866	7.19739 -246.144	0.75692 -247.788
+-3.95417 -252.673	7.24187 -252.021	0.88652 -253.49
+-3.74263 -258.708	7.26443 -258.473	0.99859 -258.14
+-3.56612 -266.269	7.19456 -263.613	1.00915 -259.193
+-3.38095 -272.309	7.11152 -268.946	1.13539 -266.252
+-3.14307 -277.959	6.98464 -275.616	1.19164 -271.064
+-2.83482 -283.976	6.76989 -282.684	1.30061 -280.085
+-2.55295 -289.997	6.54177 -287.855	1.33956 -286.407
+-2.22712 -296.01	6.20834 -295.135	1.44128 -292.416
+-1.93656 -300.891	5.84403 -301.092	1.45917 -297.386
+-1.65444 -309.571	5.52372 -307.61	1.53676 -306.564
+-1.43408 -315.984	5.20323 -312.039	1.58247 -310.473
+-1.24005 -322.782	4.86109 -320.27	1.64259 -319.504
+-1.09891 -328.071	4.56718 -326.973	1.69221 -328.085
+-1.00155 -335.268	4.21171 -333.498	1.7311 -333.804
+-0.94812 -342.853	3.84755 -341.164	1.84686 -340.715
+-0.92989 -350.065	3.5536 -347.487	1.9244 -349.441
+-0.97323 -356.909	3.22455 -354.576	2.08887 -354.834
+-0.99016 -364.127	2.93498 -360.708	2.20809 -361.442
+-1.09512 -370.224	2.64551 -367.98	2.43208 -369.236
+-1.12963 -377.446	2.37366 -375.628	2.63853 -375.978
+-1.20821 -383.538	2.12367 -381.942	2.90775 -382.255
+-1.2691 -390.765	1.91341 -390.148	3.20145 -389.281
+-1.3123 -399.128	1.80408 -396.435	3.57531 -395.237
+-1.3381 -405.589	1.75649 -404.989	3.92491 -403.157
+-1.27601 -411.653	1.84504 -411.997	4.41045 -409.543
+-1.10832 -418.836	2.05227 -418.983	4.89598 -415.779
+-0.84379 -427.519	2.45288 -425.551	5.39202 -422.615
+-0.42119 -434.273	2.92399 -434.004	5.94386 -429.14
+0.04537 -441.018	3.48293 -441.491	6.49245 -438.227
+0.55576 -446.236	4.20446 -448.377	7.03008 -441.89
+1.08382 -452.59	4.82041 -454.334	7.54701 -448.27
+1.61196 -459.703	5.40148 -463.525	8.05703 -455.255
+2.14878 -465.675	5.82411 -470.469	8.55671 -463.748
+2.65038 -470.894	6.19403 -477.992	8.93058 -469.855
+3.10828 -479.16	6.44075 -484.211	9.23483 -477.783
+3.55709 -484.01	6.5294 -492.358	9.42714 -482.418
+3.94463 -491.91	6.4949 -499.77	9.47669 -490.245
+4.34086 -498.669	6.39002 -506.626	9.46054 -504.566
+4.63156 -505.068	6.22348 -512.165	9.37751 -512.419
+4.93971 -509.946	6.00879 -519.992	9.31517 -517.555
+5.13387 -518.263	5.81163 -527.245	9.15885 -525.12
+5.31914 -525.442	5.5661 -534.319	9.06867 -531.016
+5.48672 -531.486	5.32066 -542.341	8.95083 -539.78
+5.60312 -537.658	5.02225 -547.906	8.85011 -544.923
+5.6895 -543.622	4.77684 -556.308	8.68331 -552.34
+5.7635 -550.485	4.52669 -560.914	8.55833 -559.297
+5.83875 -555.794	4.28115 -567.797	8.46127 -566.248
+5.85595 -562.25	4.03131 -575.82	8.3676 -572.144
+5.90105 -571.093	3.73739 -582.523	8.27406 -579.547
+5.89598 -576.357	3.52696 -588.83	8.16984 -584.541
+5.89652 -582.338	3.33401 -593.994	8.09043 -593.749
+5.89717 -589.514	3.1367 -599.539	8.03155 -598.583
+5.88109 -595.498	2.92653 -608.694	7.95536 -604.928
+5.89271 -601.477	2.73357 -613.858	7.89312 -611.119
+5.86555 -607.463	2.58029 -620.154	7.84145 -618.514
+5.8384 -613.449	2.42707 -627.209	7.79681 -626.51
+5.83336 -618.953	2.26923 -631.797	7.79035 -632.238
+5.80075 -625.897	2.12917 -638.47	7.75944 -637.669
+5.77909 -631.403	2.02867 -645.135	7.77052 -644.449
+5.7353 -637.153	1.84456 -650.867	7.77796 -649.572
+5.69149 -642.664	1.77029 -656.008	7.77162 -656.656
+5.66458 -651.282	1.61702 -662.494	7.80708 -662.979
+5.67063 -657.022	1.49023 -670.114	7.80755 -668.254
+5.61581 -663.253	1.35904 -677.734	7.83945 -673.824
+5.59983 -670.314	1.30229 -682.113	7.87851 -681.352
+5.54213 -675.349	1.21042 -687.067	7.93472 -685.711
+5.50668 -681.456	1.08801 -694.496	7.99107 -691.578
+5.47959 -688.16	0.97429 -700.785	8.051 -698.348
+5.44966 -694.027	0.90881 -705.924	8.06904 -704.976
+5.40864 -699.777	0.7995 -712.401	8.15679 -710.837
+5.37055 -707.32	0.70767 -717.736	8.22705 -716.248
+5.31297 -713.671	0.64245 -725.723	8.31842 -723.465
+5.30231 -718.338	0.50232 -731.637	8.38871 -729.178
+5.31399 -725.034	0.41486 -736.78	8.48694 -735.187
+5.27302 -731.382	0.35396 -743.818	8.58866 -741.196
+5.23482 -737.609	0.20513 -750.872	8.68342 -747.357
+5.21315 -742.996	0.15274 -754.87	8.80963 -754.114
+5.14998 -748.87	0.02588 -761.73	8.9148 -759.67
+5.15612 -755.448	-0.03085 -766.298	9.02002 -765.828
+5.11519 -762.274	-0.12256 -772.962	9.11844 -773.948
+5.11009 -767.179	-0.30214 -780.212	9.24798 -778.896
+5.09677 -772.923	-0.37187 -786.871	9.36707 -784.148
diff --git a/testsuite/python/save_checkpoint.py b/testsuite/python/save_checkpoint.py
index 78ffe793041..1260a9d0715 100644
--- a/testsuite/python/save_checkpoint.py
+++ b/testsuite/python/save_checkpoint.py
@@ -153,6 +153,9 @@
         system.thermostat.set_npt(kT=1.0, gamma0=2.0, gammav=0.1, seed=42)
     elif 'THERM.DPD' in modes and has_features('DPD'):
         system.thermostat.set_dpd(kT=1.0, seed=42)
+    elif 'THERM.SDM' in modes and (has_features('STOKESIAN_DYNAMICS') or has_features('STOKESIAN_DYNAMICS_GPU')):
+        system.periodicity = [0, 0, 0]
+        system.thermostat.set_stokesian(kT=1.0, seed=42)
     # set integrator
     if 'INT.NPT' in modes and has_features('NPT'):
         system.integrator.set_isotropic_npt(ext_pressure=2.0, piston=0.01,
@@ -164,6 +167,16 @@
         system.integrator.set_nvt()
     elif 'INT.BD' in modes:
         system.integrator.set_brownian_dynamics()
+    elif 'INT.SDM.CPU' in modes and has_features('STOKESIAN_DYNAMICS'):
+        system.periodicity = [0, 0, 0]
+        system.integrator.set_stokesian_dynamics(
+            approximation_method='ft', device='cpu', viscosity=0.5,
+            radii={0: 1.5}, pair_mobility=False, self_mobility=True)
+    elif 'INT.SDM.GPU' in modes and has_features('STOKESIAN_DYNAMICS_GPU') and espressomd.gpu_available():
+        system.periodicity = [0, 0, 0]
+        system.integrator.set_stokesian_dynamics(
+            approximation_method='fts', device='gpu', viscosity=2.0,
+            radii={0: 1.0}, pair_mobility=True, self_mobility=False)
     # set minimization
     if 'MINIMIZATION' in modes:
         steepest_descent(system, f_max=1, gamma=10, max_steps=0,
diff --git a/testsuite/python/stokesian_dynamics.py b/testsuite/python/stokesian_dynamics.py
new file mode 100644
index 00000000000..3f710126421
--- /dev/null
+++ b/testsuite/python/stokesian_dynamics.py
@@ -0,0 +1,203 @@
+#
+# Copyright (C) 2019-2020 The ESPResSo project
+#
+# This file is part of ESPResSo.
+#
+# ESPResSo is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# ESPResSo is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+import espressomd
+import espressomd.observables
+import espressomd.accumulators
+import espressomd.constraints
+import numpy as np
+import unittest as ut
+from tests_common import abspath
+
+s = espressomd.System(box_l=[1.0, 1.0, 1.0])
+
+
+class StokesianDynamicsSetupTest(ut.TestCase):
+    system = s
+    device = 'none'
+
+    def setUp(self):
+        self.system.box_l = [10] * 3
+
+        self.system.time_step = 1.0
+        self.system.cell_system.skin = 0.4
+
+        # unset SD integrator so we can test whether set_stokesian_dynamics()
+        # fails. set_nvt() is the only way to ensure that integ_switch is
+        # set to a different value than INTEG_METHOD_SD
+        self.system.integrator.set_nvt()
+
+    def pbc_checks(self):
+        self.system.periodicity = [0, 0, 1]
+        with self.assertRaises(Exception):
+            self.system.integrator.set_stokesian_dynamics(
+                viscosity=1.0, device=self.device, radii={0: 1.0})
+
+        self.system.periodicity = [0, 0, 0]
+        self.system.integrator.set_stokesian_dynamics(
+            viscosity=1.0, device=self.device, radii={0: 1.0})
+
+        with self.assertRaises(Exception):
+            self.system.periodicity = [0, 1, 0]
+
+
+class StokesianDynamicsTest(ut.TestCase):
+    system = s
+    device = 'none'
+
+    # Digitized reference data of Figure 5b from
+    # Durlofsky et al., J. Fluid Mech. 180, 21 (1987)
+    # https://doi.org/10.1017/S002211208700171X
+    data = np.loadtxt(abspath('data/dancing.txt'))
+
+    def setUp(self):
+        self.system.box_l = [10] * 3
+        self.system.periodicity = [0, 0, 0]
+        self.system.cell_system.skin = 0.4
+
+    def falling_spheres(self, time_step, l_factor, t_factor,
+                        sd_method='fts', sd_short=False):
+        self.system.time_step = time_step
+        self.system.part.add(pos=[-5 * l_factor, 0, 0], rotation=[1, 1, 1])
+        self.system.part.add(pos=[0 * l_factor, 0, 0], rotation=[1, 1, 1])
+        self.system.part.add(pos=[7 * l_factor, 0, 0], rotation=[1, 1, 1])
+
+        self.system.integrator.set_stokesian_dynamics(
+            viscosity=1.0 / (t_factor * l_factor),
+            device=self.device, radii={0: 1.0 * l_factor},
+            approximation_method=sd_method)
+
+        gravity = espressomd.constraints.Gravity(
+            g=[0, -1.0 * l_factor / (t_factor**2), 0])
+        self.system.constraints.add(gravity)
+        self.system.time_step = 1.0 * t_factor
+
+        obs = espressomd.observables.ParticlePositions(ids=(0, 1, 2))
+        acc = espressomd.accumulators.TimeSeries(obs=obs, delta_N=1)
+        self.system.auto_update_accumulators.add(acc)
+        acc.update()
+
+        if sd_short:
+            y_min = -80
+            intsteps = 1300
+        elif sd_method == 'fts':
+            y_min = -555
+            intsteps = 8000
+        else:
+            y_min = -200
+            intsteps = 3000
+        intsteps = int(intsteps * t_factor / self.system.time_step)
+
+        self.system.integrator.run(intsteps)
+
+        simul = acc.time_series()[:, :, 0:2]
+        paper = self.data.reshape([-1, 3, 2])
+
+        for pid in range(3):
+            dist = []
+            # the simulated trajectory is oversampled by a ratio of
+            # (90/t_factor):1 compared to the published trajectory
+            for desired in paper[:, pid] * l_factor:
+                if desired[1] < y_min * l_factor:
+                    break
+                # find the closest point in the simulated trajectory
+                idx = np.abs(simul[:, pid, 1] - desired[1]).argmin()
+                actual = simul[idx, pid]
+                dist.append(np.linalg.norm(actual - desired))
+            self.assertLess(idx, intsteps, msg='Insufficient sampling')
+            np.testing.assert_allclose(dist, 0, rtol=0, atol=0.5 * l_factor)
+
+    def tearDown(self):
+        self.system.constraints.clear()
+        self.system.part.clear()
+
+
+class StokesianDiffusionTest(ut.TestCase):
+    system = s
+    device = 'none'
+
+    kT = 1e-4
+    R = 1.5
+    eta = 2.4
+
+    def setUp(self):
+        self.system.box_l = [10] * 3
+        self.system.periodicity = [0, 0, 0]
+
+        self.system.time_step = 1.0
+        self.system.cell_system.skin = 0.4
+
+        self.system.part.add(pos=[0, 0, 0], rotation=[1, 1, 1])
+
+    def check(self):
+        self.system.integrator.set_stokesian_dynamics(
+            viscosity=self.eta, device=self.device, radii={0: self.R})
+        self.system.thermostat.set_stokesian(kT=self.kT, seed=42)
+
+        intsteps = int(100000 / self.system.time_step)
+        pos = np.empty([intsteps + 1, 3])
+        orientation = np.empty((intsteps + 1, 3))
+
+        pos[0, :] = self.system.part[0].pos
+        orientation[0, :] = self.system.part[0].director
+        for i in range(intsteps):
+            self.system.integrator.run(1)
+            pos[i + 1, :] = self.system.part[0].pos
+            orientation[i + 1, :] = self.system.part[0].director
+
+        # translational diffusion coefficient
+        D_expected = self.kT / (6 * np.pi * self.eta * self.R)
+
+        # NOTE on steps_per_slice:
+        # The shorter these trajectories are, the more trajectories we get
+        # and the more accurate the result is.
+        # However since we want to test that diffusion works for
+        # "long" trajectories we won't go too low.
+        n_steps_per_slice = 200
+        n_slices = int(intsteps / n_steps_per_slice)
+
+        squared_displacement_per_slice = np.empty(n_slices)
+        for i in range(n_slices):
+            squared_displacement_per_slice[i] = np.linalg.norm(
+                pos[i * n_steps_per_slice] - pos[(i + 1) * n_steps_per_slice])**2
+        D_measured = np.mean(squared_displacement_per_slice) / \
+            (6 * n_steps_per_slice * self.system.time_step)
+        self.assertAlmostEqual(D_expected, D_measured, delta=D_expected * 0.1)
+
+        # rotational diffusion coefficient
+        Dr_expected = self.kT / (8 * np.pi * self.eta * self.R**3)
+        n_steps_per_slice = 200
+        n_slices = int(intsteps / n_steps_per_slice)
+
+        # This is the displacement equivalent to measure rotational diffusion
+        costheta_per_slice = np.empty(n_slices)
+
+        for i in range(n_slices):
+            costheta_per_slice[i] = np.dot(orientation[i * n_steps_per_slice, :],
+                                           orientation[(i + 1) * n_steps_per_slice, :])
+        Dr_measured = -np.log(np.mean(costheta_per_slice)) / \
+            (2 * n_steps_per_slice * self.system.time_step)
+        self.assertAlmostEqual(
+            Dr_expected,
+            Dr_measured,
+            delta=Dr_expected * 0.1)
+
+    def tearDown(self):
+        self.system.constraints.clear()
+        self.system.part.clear()
+        self.system.thermostat.set_stokesian(kT=0)
diff --git a/testsuite/python/stokesian_dynamics_cpu.py b/testsuite/python/stokesian_dynamics_cpu.py
new file mode 100644
index 00000000000..4959da98631
--- /dev/null
+++ b/testsuite/python/stokesian_dynamics_cpu.py
@@ -0,0 +1,58 @@
+#
+# Copyright (C) 2019-2020 The ESPResSo project
+#
+# This file is part of ESPResSo.
+#
+# ESPResSo is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# ESPResSo is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+import unittest as ut
+import unittest_decorators as utx
+import stokesian_dynamics as sd
+
+
+@utx.skipIfMissingFeatures(["STOKESIAN_DYNAMICS"])
+class StokesianDynamicsSetupTest(sd.StokesianDynamicsSetupTest):
+    device = 'cpu'
+
+    def test_pbc_checks(self):
+        self.pbc_checks()
+
+
+@utx.skipIfMissingFeatures(["STOKESIAN_DYNAMICS"])
+class StokesianDynamicsTest(sd.StokesianDynamicsTest):
+    device = 'cpu'
+
+    def test_default(self):
+        self.falling_spheres(1.0, 1.0, 1.0)
+
+    def test_rescaled(self):
+        self.falling_spheres(1.0, 4.5, 2.5)
+
+    def test_different_time_step(self):
+        self.falling_spheres(0.7, 1.0, 1.0)
+
+    def test_default_ft(self):
+        self.falling_spheres(1.0, 1.0, 1.0, 'ft')
+
+
+@utx.skipIfMissingFeatures(["STOKESIAN_DYNAMICS"])
+class StokesianDiffusionTest(sd.StokesianDiffusionTest):
+    device = 'cpu'
+
+    def test(self):
+        self.check()
+
+
+if __name__ == '__main__':
+    ut.main()
diff --git a/testsuite/python/stokesian_dynamics_gpu.py b/testsuite/python/stokesian_dynamics_gpu.py
new file mode 100644
index 00000000000..b9ca6d0a73e
--- /dev/null
+++ b/testsuite/python/stokesian_dynamics_gpu.py
@@ -0,0 +1,55 @@
+#
+# Copyright (C) 2019-2020 The ESPResSo project
+#
+# This file is part of ESPResSo.
+#
+# ESPResSo is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# ESPResSo is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+import unittest as ut
+import unittest_decorators as utx
+import stokesian_dynamics as sd
+
+
+@utx.skipIfMissingGPU()
+@utx.skipIfMissingFeatures(["STOKESIAN_DYNAMICS_GPU"])
+class StokesianDynamicsSetupTest(sd.StokesianDynamicsSetupTest):
+    device = 'gpu'
+
+    def test_pbc_checks(self):
+        self.pbc_checks()
+
+
+@utx.skipIfMissingGPU()
+@utx.skipIfMissingFeatures(["STOKESIAN_DYNAMICS_GPU"])
+class StokesianDynamicsTest(sd.StokesianDynamicsTest):
+    device = 'gpu'
+
+    def test_default_fts(self):
+        self.falling_spheres(1.0, 1.0, 1.0, 'fts', sd_short=True)
+
+    def test_default_ft(self):
+        self.falling_spheres(1.0, 1.0, 1.0, 'ft', sd_short=True)
+
+
+@utx.skipIfMissingGPU()
+@utx.skipIfMissingFeatures(["STOKESIAN_DYNAMICS_GPU"])
+class StokesianDiffusionTest(sd.StokesianDiffusionTest):
+    device = 'gpu'
+
+    def test(self):
+        pass
+
+
+if __name__ == '__main__':
+    ut.main()
diff --git a/testsuite/python/test_checkpoint.py b/testsuite/python/test_checkpoint.py
index 4faf7b9d1f9..1ebba9fae27 100644
--- a/testsuite/python/test_checkpoint.py
+++ b/testsuite/python/test_checkpoint.py
@@ -35,6 +35,14 @@
       and espressomd.has_features('ELECTROKINETICS'))
 
 
+def skipIfMissingFeatureStokesianDynamics():
+    """Specialized unittest skipIf decorator for missing Stokesian Dynamics."""
+    if not espressomd.has_features(["STOKESIAN_DYNAMICS"]) and (not espressomd.has_features(
+            ["STOKESIAN_DYNAMICS_GPU"]) or not espressomd.gpu_available()):
+        return ut.skip("Skipping test: feature STOKESIAN_DYNAMICS unavailable")
+    return utx._id
+
+
 class CheckpointTest(ut.TestCase):
 
     @classmethod
@@ -191,6 +199,14 @@ def test_thermostat_NPT(self):
         self.assertEqual(thmst['gamma0'], 2.0)
         self.assertEqual(thmst['gammav'], 0.1)
 
+    @skipIfMissingFeatureStokesianDynamics()
+    @ut.skipIf('THERM.SDM' not in modes, 'SDM thermostat not in modes')
+    def test_thermostat_SDM(self):
+        thmst = system.thermostat.get_state()[0]
+        self.assertEqual(thmst['type'], 'SD')
+        self.assertEqual(thmst['kT'], 1.0)
+        self.assertEqual(thmst['seed'], 42)
+
     @utx.skipIfMissingFeatures('NPT')
     @ut.skipIf('INT.NPT' not in modes, 'NPT integrator not in modes')
     def test_integrator_NPT(self):
@@ -233,6 +249,31 @@ def test_integrator_BD(self):
         params = integ.get_params()
         self.assertEqual(params, {})
 
+    @utx.skipIfMissingFeatures('STOKESIAN_DYNAMICS')
+    @ut.skipIf('INT.SDM.CPU' not in modes, 'SDM CPU integrator not in modes')
+    def test_integrator_SDM_cpu(self):
+        integ = system.integrator.get_state()
+        self.assertIsInstance(integ, espressomd.integrate.StokesianDynamics)
+        expected_params = {
+            'approximation_method': 'ft', 'device': 'cpu', 'radii': {0: 1.5},
+            'viscosity': 0.5, 'lubrication': False, 'pair_mobility': False,
+            'self_mobility': True}
+        params = integ.get_params()
+        self.assertEqual(params, expected_params)
+
+    @utx.skipIfMissingGPU()
+    @utx.skipIfMissingFeatures('STOKESIAN_DYNAMICS_GPU')
+    @ut.skipIf('INT.SDM.GPU' not in modes, 'SDM GPU integrator not in modes')
+    def test_integrator_SDM_gpu(self):
+        integ = system.integrator.get_state()
+        self.assertIsInstance(integ, espressomd.integrate.StokesianDynamics)
+        expected_params = {
+            'approximation_method': 'fts', 'device': 'gpu', 'radii': {0: 1.0},
+            'viscosity': 2.0, 'lubrication': False, 'pair_mobility': True,
+            'self_mobility': False}
+        params = integ.get_params()
+        self.assertEqual(params, expected_params)
+
     @utx.skipIfMissingFeatures('LENNARD_JONES')
     @ut.skipIf('LJ' not in modes, "Skipping test due to missing mode.")
     def test_non_bonded_inter(self):
diff --git a/testsuite/scripts/samples/CMakeLists.txt b/testsuite/scripts/samples/CMakeLists.txt
index 7fd4d80f588..803d6aa0e83 100644
--- a/testsuite/scripts/samples/CMakeLists.txt
+++ b/testsuite/scripts/samples/CMakeLists.txt
@@ -24,6 +24,10 @@ add_custom_target(
 sample_test(FILE test_billiard.py)
 sample_test(FILE test_chamber_game.py)
 sample_test(FILE test_constraints.py)
+sample_test(FILE test_dancing.py SUFFIX cpu_ft)
+sample_test(FILE test_dancing.py SUFFIX cpu_fts)
+sample_test(FILE test_dancing.py SUFFIX gpu_ft LABELS "gpu")
+sample_test(FILE test_dancing.py SUFFIX gpu_fts LABELS "gpu")
 sample_test(FILE test_diffusion_coefficient.py)
 sample_test(FILE test_dpd.py)
 sample_test(FILE test_drude_bmimpf6.py SUFFIX cpu)
diff --git a/testsuite/scripts/samples/test_dancing.py b/testsuite/scripts/samples/test_dancing.py
new file mode 100644
index 00000000000..1d322504438
--- /dev/null
+++ b/testsuite/scripts/samples/test_dancing.py
@@ -0,0 +1,63 @@
+# Copyright (C) 2020 The ESPResSo project
+#
+# This file is part of ESPResSo.
+#
+# ESPResSo is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# ESPResSo is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import unittest as ut
+import importlib_wrapper
+import numpy as np
+
+sd_device = "--gpu" if "gpu" in "@TEST_SUFFIX@" else "--cpu"
+sd_method = "--fts" if "fts" in "@TEST_SUFFIX@" else "--ft"
+if sd_method == "--fts" and sd_device == "--cpu":
+    y_min = -555
+    intsteps = 5500
+else:
+    y_min = -200
+    intsteps = 2100
+
+sample, skipIfMissingFeatures = importlib_wrapper.configure_and_import(
+    "@SAMPLES_DIR@/dancing.py", script_suffix="@TEST_SUFFIX@",
+    gpu=(sd_device == "--gpu"), cmd_arguments=[sd_device, sd_method],
+    intsteps=intsteps,
+    ref_data="@CMAKE_SOURCE_DIR@/testsuite/python/data/dancing.txt")
+
+
+@skipIfMissingFeatures
+class Sample(ut.TestCase):
+    system = sample.system
+
+    def test_trajectory(self):
+        # compare the simulated and published trajectories
+        simul = sample.positions[:, :, 0:2]
+        paper = sample.data.reshape([-1, 3, 2])
+
+        for pid in range(3):
+            dist = []
+            # the simulated trajectory is oversampled by a ratio of 60:1
+            # compared to the published trajectory (60 +/- 5 to 1)
+            for desired in paper[:, pid]:
+                if desired[1] < y_min:
+                    break
+                # find the closest point in the simulated trajectory
+                idx = np.abs(simul[:, pid, 1] - desired[1]).argmin()
+                actual = simul[idx, pid]
+                dist.append(np.linalg.norm(actual - desired))
+            self.assertLess(idx, sample.intsteps, msg='Insufficient sampling')
+            np.testing.assert_allclose(dist, 0, rtol=0, atol=0.7)
+
+
+if __name__ == "__main__":
+    ut.main()