From 01183f8ffcdf3943a32b273bfc0d2dd11d694dfa Mon Sep 17 00:00:00 2001
From: Henri Menke <henri@icp.uni-stuttgart.de>
Date: Thu, 20 Apr 2017 10:13:17 +1200
Subject: [PATCH] [core] Remove SD

---
 doc/sphinx/features.rst               |   13 -
 doc/sphinx/introduction.rst           |    2 -
 doc/sphinx/setup.rst                  |   24 -
 doc/ug/features.tex                   |    9 -
 doc/ug/introduction.tex               |    1 -
 doc/ug/run.tex                        |   72 -
 doc/ug/setup.tex                      |   26 -
 src/core/global.cpp                   |   18 +-
 src/core/global.hpp                   |   20 +-
 src/core/integrate.cpp                |    9 -
 src/core/integrate_sd.cpp             |  648 ---------
 src/core/integrate_sd.hpp             |  123 --
 src/core/integrate_sd_cuda.cu         | 1743 -----------------------
 src/core/integrate_sd_cuda.hpp        |  251 ----
 src/core/integrate_sd_cuda_debug.cu   |  337 -----
 src/core/integrate_sd_cuda_debug.hpp  |  143 --
 src/core/integrate_sd_cuda_device.cu  |  130 --
 src/core/integrate_sd_cuda_device.hpp |   43 -
 src/core/integrate_sd_cuda_kernel.cu  | 1884 -------------------------
 src/core/integrate_sd_cuda_kernel.hpp |  191 ---
 src/core/integrate_sd_matrix.cu       |  283 ----
 src/core/integrate_sd_matrix.hpp      |   72 -
 src/core/thermostat.hpp               |    2 -
 23 files changed, 11 insertions(+), 6033 deletions(-)
 delete mode 100644 src/core/integrate_sd.cpp
 delete mode 100644 src/core/integrate_sd.hpp
 delete mode 100644 src/core/integrate_sd_cuda.cu
 delete mode 100644 src/core/integrate_sd_cuda.hpp
 delete mode 100644 src/core/integrate_sd_cuda_debug.cu
 delete mode 100644 src/core/integrate_sd_cuda_debug.hpp
 delete mode 100644 src/core/integrate_sd_cuda_device.cu
 delete mode 100644 src/core/integrate_sd_cuda_device.hpp
 delete mode 100644 src/core/integrate_sd_cuda_kernel.cu
 delete mode 100644 src/core/integrate_sd_cuda_kernel.hpp
 delete mode 100644 src/core/integrate_sd_matrix.cu
 delete mode 100644 src/core/integrate_sd_matrix.hpp

diff --git a/doc/sphinx/features.rst b/doc/sphinx/features.rst
index 504acdebcf3..bd5e4afb696 100644
--- a/doc/sphinx/features.rst
+++ b/doc/sphinx/features.rst
@@ -138,16 +138,6 @@ integrator or thermostat:
 -  Enables the implicit calculation of electro-hydrodynamics for charged
    particles and salt ions in an electric field.
 
--  enable Stokesian Dynamics.
-
--  disable periodic boundary conditions in Stokesian Dynamics.
-
--  use float instead of double in Stokesian Dynamics.
-
--  disable nearfield of Stokesian Dynamics.
-
--  enable debug Stokesian Dynamics.
-
 Interactions
 ------------
 
@@ -279,6 +269,3 @@ looking directly at the code.
 -  Causes to try to provoke a core dump when exiting unexpectedly.
 
 -  Causes to try this even with MPI errors.
-
--  Causes to check more things in the Stokesian Dynamics code. If is
-   larger 1, SD prints more informations.
diff --git a/doc/sphinx/introduction.rst b/doc/sphinx/introduction.rst
index b8db0ab80a6..5df4a43398b 100644
--- a/doc/sphinx/introduction.rst
+++ b/doc/sphinx/introduction.rst
@@ -183,8 +183,6 @@ report so to the developers.
 +--------------------------------+------------------------+------------------+
 | Tunable Slip Boundary          | Single                 | Single           |
 +--------------------------------+------------------------+------------------+
-| Stokesian Dynamics             | Single                 | Single           |
-+--------------------------------+------------------------+------------------+
 |                             **Analysis**                                   |
 +--------------------------------+------------------------+------------------+
 | uwerr                          | None                   | Good             |
diff --git a/doc/sphinx/setup.rst b/doc/sphinx/setup.rst
index 70f2b7779b9..883578e0420 100644
--- a/doc/sphinx/setup.rst
+++ b/doc/sphinx/setup.rst
@@ -132,19 +132,6 @@ re-used.
 (int) if non-zero (default), some warnings are printed out. Set this to
 zero if you get annoyed by them.
 
-(double) viscosity of the fluid for the Stokesian Dynamics simulation.
-
-(double) hydrodynamic radius of the particles used in Stokesian
-Dynamics.
-
-(int[2]) seed of the Stokes Dynamics random number generator.
-
-(int[2]) offset of the random number generator. Together with the seed,
-the state of the random number generator is well defined.
-
-(double) precision used for the approximation of the square root of the
-mobility. Sometimes higher accuracy can speedup the simulation.
-
 Setting global variables in Python
 ----------------------------------
 
@@ -508,17 +495,6 @@ unit operating at a constant temperature.
 
 Be aware that this thermostat requires to be given in Kelvin.
 
-Stokesian Dynamics thermostat
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. todo::
-    This is not implemented yet for the python interface
-
-This thermostat should be used together with the Stokesian Dynamics
-implementation. No other thermostat is able to thermalize SD correctly.
-The precision of the farfield contribution of the thermostat can be
-tuned with
-
 .. _nemd:
 
 ``nemd``: Setting up non-equilibrium MD
diff --git a/doc/ug/features.tex b/doc/ug/features.tex
index 0447f3b08a8..d3a2130a944 100755
--- a/doc/ug/features.tex
+++ b/doc/ug/features.tex
@@ -146,13 +146,6 @@ \section{General features}
 \item \newfeature{LB_ELECTROHYDRODYNAMICS} Enables the implicit
   calculation of electro-hydrodynamics for charged particles and salt
   ions in an electric field.
-\item \newfeature{SD} enable Stokesian Dynamics.
-\item \newfeature{SD_NOT_PERIODIC} disable periodic boundary conditions in
-  Stokesian Dynamics.
-\item \newfeature{SD_USE_FLOAT} use float instead of double in Stokesian
-  Dynamics.
-\item \newfeature{SD_FF_ONLY} disable nearfield of Stokesian Dynamics.
-\item \newfeature{SD_DEBUG} enable debug Stokesian Dynamics.
   
 \end{itemize}
 
@@ -264,8 +257,6 @@ \section{Debug messages}
 \item \newfeature{FORCE\_CORE} Causes \es{} to try to provoke a core dump
   when exiting unexpectedly.
 \item \newfeature{MPI\_CORE} Causes \es{} to try this even with MPI errors.
-\item \newfeature{SD_DEBUG} Causes \es{} to check more things in the Stokesian
-  Dynamics code. If \var{warnings} is larger 1, SD prints more information.
 \end{itemize}
 
 %%% Local Variables: 
diff --git a/doc/ug/introduction.tex b/doc/ug/introduction.tex
index fdef43da751..9b0cefd88c1 100755
--- a/doc/ug/introduction.tex
+++ b/doc/ug/introduction.tex
@@ -166,7 +166,6 @@ \section{Available simulation methods}
   DPD                                 & None   & Good   \\
   Shan-Chen Multicomponent Fluid      & Group  & Group  \\
   Tunable Slip Boundary               & Single & Single \\
-  Stokesian Dynamics                  & Single & Single \\
   \multicolumn{3}{|c|}{\textbf{Analysis}} \\
   uwerr                               & None   & Good   \\
   \multicolumn{3}{|c|}{\textbf{Input/Output}} \\
diff --git a/doc/ug/run.tex b/doc/ug/run.tex
index 4048abd9de5..0a52327f051 100755
--- a/doc/ug/run.tex
+++ b/doc/ug/run.tex
@@ -588,78 +588,6 @@ \subsubsection{Details on implementation}
 codes) but so-called Lucy functions. See \cite{marsili09} for more
 details. These avoid the calculation of exponentials.
 
-\section{\texttt{integrate_sd}: Running a stokesian dynamics simulation}
-\newescommand[integrate-sd]{integrate_sd}
-
-\begin{essyntax}
-  \variant{1} integrate_sd \var{steps}
-\end{essyntax}
-
-\es uses in the stokesian dynamics algorithm  the euler integrater for the
-equations of motion. The motion are overdamped. The velocities of the
-particles are related to the displacement in the last timestep. This are, at
-least if the system is thermalized, however not usefull, as the displacements
-don't scale linear with the timestep. The command \texttt{integrate_sd} with
-an integer \var{steps} as parameter integrates the system for
-\var{steps} time steps. This is implemented using CUDA, so \required{CUDA}
-has to be available in the system.
-
-Currently there is no parallel implementation of this integrator, so all
-particles have to be in a single process.
-
-\subsection{Setting up the system}
-Before running a stokesian dynamics simulation, you have to set a view
-constants:
-\begin{globvar}
-\item[sd_radius] The hydrodynamic particle radius of the (sperical)
-  particles. Only one particle size is supported.
-\item[sd_viscosity] The viscisity of the fluid. Remember this is only a
-  scaling of the timestep, so you can set it without problems to one.
-\item[sd_seed] (int[2]) seed of the Stokes Dynamics random number generator.
-  As the generator used for the SD runes on the GPU (cuRAND) it has its own
-  seed and own state.
-\item[sd_random_state] (int[2]) offset of the random number
-  generator. Together with the seed, the state of the random number generator
-  is well defined.
-\item[sd_precision_random] (double) precision used for the approximation of
-  the square root of the mobility. Sometimes higher accuracy can speedup the
-  simulation.
-\end{globvar}
-
-Make sure to only use the stokesian dynamics thermostat. If you made a warmup
-integration without SD, set it with:
-\begin{tclcode}
-thermostat off
-thermostat sd \$temp
-\end{tclcode}
-
-\subsection{Periodicity}
-The Code uses the ewald-summation (as derived in \cite{beenakker86a}) of the
-Rotne-Prager tensor to be usefull in periodic boundary conditions. To disable
-the peridoicity of the hydrodynamic, use the feature
-\feature{SD_NOT_PERIODIC}. 
-
-The ewald summation is required if the system is periodic, as otherwise an
-cutoff is introduced, which can lead to negative eigenvalues of the
-mobility. This leads to a break down of the code, as it is not possible to
-thermalize such a system.
-
-\subsection{Floatingpoint precision}
-It is possible to switch between double and float as datatype in Stokesian
-Dynamics. As GPUs have more single than double precision floating point
-units, the single precision is faster. However this can lead to problems, if
-the eigenvalues are not precise enough and get negative. Therfore the default
-is double precision. If you want to use single precision, use the feature
-\feature{SD_USE_FLOAT}.
-
-\subsection{Farfield only}
-The mobility matrix consists of two contribution, the farfield and the
-nearfield lubrication correction.
-If the nearfield is neglegible, than the feature \feature{SD_FF_ONLY} can be
-used.
-This should speedup the simulation, as the nearfield doesn't need to be
-inverted. Additional the thermal displacements can be directly calculated.
-
 \section{Multi-timestepping}
 \newescommand[multitimestepping]{multitimestepping}
 
diff --git a/doc/ug/setup.tex b/doc/ug/setup.tex
index 7c6918f3835..99882f3e3eb 100755
--- a/doc/ug/setup.tex
+++ b/doc/ug/setup.tex
@@ -151,17 +151,6 @@ \section{\texttt{setmd}: Setting global variables in TCL}
   verlet list has been re-used.
 \item[warnings] (int) if non-zero (default), some warnings are printed
   out. Set this to zero if you get annoyed by them.
-\item[sd_viscosity] (double) viscosity of the fluid for the Stokesian Dynamics
-  simulation.
-\item[sd_radius] (double) hydrodynamic radius of the particles used in
-  Stokesian Dynamics.
-\item[sd_seed] (int[2]) seed of the Stokes Dynamics random number generator.
-\item[sd_random_state] (int[2]) offset of the random number
-  generator. Together with the seed, the state of the random number generator
-  is well defined.
-\item[sd_precision_random] (double) precision used for the approximation of
-  the square root of the mobility. Sometimes higher accuracy can speedup the
-  simulation.
 \end{globvar}
 
 
@@ -602,21 +591,6 @@ \subsection{CPU thermostat}
 Be aware that this thermostat requires \var{temperature} to be given
 in Kelvin.
 
-\subsection{Stokesian Dynamics thermostat}
-\label{ssec:SDthermostat}
-\begin{essyntax}
-  thermostat sd \var{temperature}
-  \begin{features}
-    \required{CUDA} and \required{SD}
-  \end{features}
-\end{essyntax}
-
-This thermostat should be used together with the Stokesian Dynamics
-implementation.
-No other thermostat is able to thermalize SD correctly.
-The precision of the farfield contribution of the thermostat can be tuned
-with
-
 \section{\texttt{nemd}: Setting up non-equilibrium MD}
 \newescommand{nemd}
 \label{sec:NEMD}
diff --git a/src/core/global.cpp b/src/core/global.cpp
index 6bb972f3abe..2b22b44f8c7 100755
--- a/src/core/global.cpp
+++ b/src/core/global.cpp
@@ -34,7 +34,6 @@
 #include "imd.hpp"
 #include "ghmc.hpp"
 #include "lb.hpp"
-#include "integrate_sd.hpp"
 
 /** This array contains the description of all global variables.
 
@@ -99,19 +98,14 @@ const Datafield fields[] = {
   {&lb_components,      TYPE_INT, 1, "lb_components",     2, false },         /* 51 from ghmc.cpp */
   {&warnings,           TYPE_INT, 1, "warnings",          1, false },         /* 52 from global.cpp */
   {&dpd_ignore_fixed_particles, TYPE_INT, 1, "dpd_ignore_fixed_particles", 1, false },         /* 53 from global.cpp */
-  {&sd_viscosity,            TYPE_DOUBLE, 1, "sd_viscosoity",              4, false },         /* 54 from integrate_sd.cpp */
-  {&sd_radius,               TYPE_DOUBLE, 1, "sd_radius",                  4, false },         /* 55 from integrate_sd.cpp */
-  {&sd_seed,                    TYPE_INT, 2, "sd_seed",                    4, false },         /* 56 from integrate_sd.cpp */
-  {&sd_random_state,            TYPE_INT, 2, "sd_random_state",            4, false },         /* 57 from integrate_sd.cpp */
-  {&sd_random_precision,     TYPE_DOUBLE, 1, "sd_precision_random",        4, false },         /* 58 from integrate_sd.cpp */
-  {&smaller_time_step,TYPE_DOUBLE,1, "smaller_time_step", 5, false },         /* 59 from integrate.cpp */
-  {configtemp,       TYPE_DOUBLE, 2, "configtemp",        1, false },         /* 60 from integrate.cpp */
-  {&langevin_trans,  TYPE_BOOL, 1, "langevin_trans_switch", 1, false },       /* 61 from thermostat.cpp */
-  {&langevin_rotate,  TYPE_BOOL, 1, "langevin_rotate_switch", 1, false },     /* 62 from thermostat.cpp */
+  {&smaller_time_step,TYPE_DOUBLE,1, "smaller_time_step", 5, false },         /* 54 from integrate.cpp */
+  {configtemp,       TYPE_DOUBLE, 2, "configtemp",        1, false },         /* 55 from integrate.cpp */
+  {&langevin_trans,  TYPE_BOOL, 1, "langevin_trans_switch", 1, false },       /* 56 from thermostat.cpp */
+  {&langevin_rotate,  TYPE_BOOL, 1, "langevin_rotate_switch", 1, false },     /* 57 from thermostat.cpp */
 #ifndef ROTATIONAL_INERTIA
-  {&langevin_gamma_rotation,  TYPE_DOUBLE, 1, "gamma_rot",1, false },    /* 63 from thermostat.cpp */
+  {&langevin_gamma_rotation,  TYPE_DOUBLE, 1, "gamma_rot",1, false },    /* 58 from thermostat.cpp */
 #else
-  {langevin_gamma_rotation,  TYPE_DOUBLE, 3, "gamma_rot",1, false },    /* 63 from thermostat.cpp */
+  {langevin_gamma_rotation,  TYPE_DOUBLE, 3, "gamma_rot",1, false },    /* 58 from thermostat.cpp */
 #endif
   { NULL, 0, 0, NULL, 0, false }
 };
diff --git a/src/core/global.hpp b/src/core/global.hpp
index 54e7004be9e..5c3dbcd61ac 100755
--- a/src/core/global.hpp
+++ b/src/core/global.hpp
@@ -185,26 +185,16 @@ extern const Datafield fields[];
 #define FIELD_WARNINGS            52
 /** DPD_IGNORE_FIXED_PARTICLES */
 #define FIELD_DPD_IGNORE_FIXED_PARTICLES 53
-/** index of \ref sd_viscosity in \ref #fields */
-#define FIELD_SD_VISCOSITY        54
-/** index of \ref sd_radius in \ref_#fields */
-#define FIELD_SD_RADIUS           55
-/** index of \ref sd_seed in \ref #fields */
-#define FIELD_SD_SEED             56
-/** index of \ref sd_random_state i_ \ref #fields */
-#define FIELD_SD_RANDOM_STATE     57
-/** index of \ref sd_precision_random in \ref #fields */
-#define FIELD_SD_RANDOM_PRECISION 58
 /** index of \ref smaller_timestep in \ref #fields */
-#define FIELD_SMALLERTIMESTEP     59
+#define FIELD_SMALLERTIMESTEP     54
 /** index of \ref configtemp in \ref #fields */
-#define FIELD_CONFIGTEMP          60
+#define FIELD_CONFIGTEMP          55
 /** index of \ref langevin_trans in \ref #fields */
-#define FIELD_LANGEVIN_TRANS_SWITCH 61
+#define FIELD_LANGEVIN_TRANS_SWITCH 56
 /** index of \ref langevin_rotate in \ref #fields */
-#define FIELD_LANGEVIN_ROT_SWITCH 62
+#define FIELD_LANGEVIN_ROT_SWITCH 57
 /** index of \ref langevin_gamma_rotation in  \ref #fields */
-#define FIELD_LANGEVIN_GAMMA_ROTATION 63
+#define FIELD_LANGEVIN_GAMMA_ROTATION 58
 
 /*@}*/
 
diff --git a/src/core/integrate.cpp b/src/core/integrate.cpp
index e10871348d8..eff1ef9e32e 100755
--- a/src/core/integrate.cpp
+++ b/src/core/integrate.cpp
@@ -361,15 +361,6 @@ void integrate_vv(int n_steps, int reuse_forces) {
     }
 #endif
 
-#ifdef SD
-    if (thermo_switch & THERMO_SD) {
-      runtimeWarning("Use integrate_sd to use Stokesian Dynamics Thermalizer.");
-    }
-    if (thermo_switch & THERMO_BD) {
-      runtimeWarning("Use integrate_sd to use Brownian Dynamics Thermalizer.");
-    }
-#endif
-
     /* Integration Steps: Step 1 and 2 of Velocity Verlet scheme:
        v(t+0.5*dt) = v(t) + 0.5*dt * f(t)
        p(t + dt)   = p(t) + dt * v(t+0.5*dt)
diff --git a/src/core/integrate_sd.cpp b/src/core/integrate_sd.cpp
deleted file mode 100644
index 7b61f94eaeb..00000000000
--- a/src/core/integrate_sd.cpp
+++ /dev/null
@@ -1,648 +0,0 @@
-/*
-  Copyright (C) 2010,2011,2012,2013,2014,2015,2016 The ESPResSo project
-  Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010 
-    Max-Planck-Institute for Polymer Research, Theory Group
-  
-  This file is part of ESPResSo.
-  
-  ESPResSo is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-  
-  ESPResSo is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-  
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>. 
-0*/
-
-/** \file integrate_sd.cpp   Stokesian dynamics integrator.
- *
- *  For more information about the integrator 
- *  see \ref integrate_sd.hpp "integrate_sd.hpp".
-*/
-
-#include <mpi.h>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <cmath>
-#include <assert.h>
-#include "utils.hpp"
-#include "integrate_sd.hpp"
-#include "integrate.hpp"
-#include "reaction.hpp"
-#include "electrokinetics.hpp"
-#include "interaction_data.hpp"
-#include "particle_data.hpp"
-#include "communication.hpp"
-#include "grid.hpp"
-#include "cells.hpp"
-#include "verlet.hpp"
-#include "rotation.hpp"
-#include "ghosts.hpp"
-#include "pressure.hpp"
-#include "p3m.hpp"
-#include "maggs.hpp"
-#include "thermostat.hpp"
-#include "initialize.hpp"
-#include "forces.hpp"
-#include "nsquare.hpp"
-#include "domain_decomposition.hpp"
-#include "layered.hpp"
-#include "nemd.hpp"
-#include "rattle.hpp"
-#include "errorhandling.hpp"
-#include "lattice.hpp"
-#include "lb.hpp"
-#include "virtual_sites.hpp"
-#include "correlators/Correlator.hpp"
-#include "ghmc.hpp"
-
-using std::ostringstream;
-
-/************************************************
- * DEFINES
- ************************************************/
-
-/** Tag for communication in verlet fix: propagate_positions()  */
-#define REQ_INT_VERLET   400
-
-/*******************  variables  *******************/
-
-/// switch between double and float
-#ifdef SD_USE_FLOAT
-typedef float real;
-#else
-typedef double real;
-#endif
-
-/*******************  variables  *******************/
-
-double sd_viscosity=1;
-double sd_radius=-1;
-
-int sd_seed[]={0,('E'+'S'+'P'+'R'+'e'+'s'+'S'+'o')};
-int sd_random_state[]={0,0};
-double sd_random_precision=1e-3;
-
-/** \name Privat Functions */
-/************************************************************/
-/*@{*/
-
-/** Propagate the positions. Integration step:<br>
-    \f[ p(t+\Delta t) = p(t) + \Delta t  \mu f(t) \f] */
-void propagate_pos_sd();
-
-#ifdef CUDA
-void propagate_pos_sd_cuda(real * box_l, int N,real * pos_h, real * force_h, real * velo_h);
-#endif
-
-void propagate_pos_bd(int N, real * pos, real * force, real * velocity);
-
-int sd_get_particle_num();
-/*@}*/
-
-void integrator_sanity_checks_sd()
-{
-  if ( time_step < 0.0 ) {
-      ostringstream msg;
-      msg <<"time_step not set";
-      runtimeError(msg);
-  }
-  if ( temperature < 0.0 ) {
-      ostringstream msg;
-      msg <<"thermostat not initialized";
-      runtimeError(msg);
-  }
-  if (sd_radius < 0) {
-      ostringstream msg;
-      msg <<"Stokesian Dynamics Hydrodynamic particle radius not initialized";
-      runtimeError(msg);
-  }
-  if (sd_viscosity < 0) {
-      ostringstream msg;
-      msg <<"Stokesian Dynamics fluid viscosity not initialized";
-      runtimeError(msg);
-  }
-}
-
-/************************************************************/
-
-
-/************************************************************/
-
-#if defined(SD) || defined(BD)
-
-void integrate_sd(int n_steps)
-{
-  /* Prepare the Integrator */
-  on_integration_start();
-
-  /* if any method vetoes (P3M not initialized), immediately bail out */
-  if (check_runtime_errors())
-    return;
-
-  INTEG_TRACE(fprintf(stderr,"%d: integrate_vv: integrating %d steps (recalc_forces=%d)\n",
-		      this_node, n_steps, recalc_forces));
-   
-  /* Integration Step:
-     Calculate forces f(t) as function of positions p(t) ( and velocities v(t) ) */
-  //if (recalc_forces) { 
-  //thermo_heat_up();
-
-
-  ghost_communicator(&cell_structure.collect_ghost_force_comm);
-
-#ifdef ROTATION
-  convert_initial_torques();
-#endif
-
-  //thermo_cool_down();
-
-  /* Communication Step: ghost forces */
-
-
-  /*apply trap forces to trapped molecules*/
-#ifdef MOLFORCES
-  // prob. works only with harmonic bounds
-  calc_and_apply_mol_constraints();
-#endif
-
-  /* should be pretty late, since it needs to zero out the total force */
-#ifdef COMFIXED
-  calc_comfixed();
-#endif
-
-  //rescale_forces();
-    
-#ifdef COLLISION_DETECTION
-  //should not be neccessery, as integrator avoids collision
-  handle_collisions();
-#endif
-  // end of force calculation
-
-#ifdef GHMC
-  if(thermo_switch & THERMO_GHMC)
-    ghmc_init();
-#endif
-  
-  if (check_runtime_errors())
-    return;
-
-  n_verlet_updates = 0;
-
-  /* Integration loop */
-  for(int step=0;step<n_steps;step++) {
-    INTEG_TRACE(fprintf(stderr,"%d: STEP %d\n",this_node,step));
-    //sd_set_particles_apart();
-#ifdef BOND_CONSTRAINT
-    save_old_pos();
-#endif
-
-#ifdef GHMC
-    if(thermo_switch & THERMO_GHMC) {
-      if (step % ghmc_nmd == 0)
-        ghmc_momentum_update();
-    }
-#endif
-    if(thermo_switch & ~(THERMO_SD|THERMO_BD) ){
-      static bool warned_thermo_sd_other=false;
-      if (!warned_thermo_sd_other){
-	fprintf (stderr, "Warning, using another thermo than the one provided by StokesDynamics breaks (most likely) StokesDynamics.\n");
-	warned_thermo_sd_other=true;
-      }
-    }
-    if (thermo_switch & THERMO_SD && thermo_switch &THERMO_BD) {
-      fprintf (stderr, "Warning: cannot use BD and SD. Disabeling BD!\n");
-      thermo_switch &= ~THERMO_BD;
-    }
-
-    /* Integration Step: Step 3 of Velocity Verlet scheme:
-       Calculate f(t) as function of positions p(t) ( and ``velocities'' v(t) ) */
-
-#ifdef LB
-    transfer_momentum = 1;
-#endif
-#ifdef LB_GPU
-    transfer_momentum_gpu = 1;
-#endif
-
-    force_calc();
-
-#ifdef CATALYTIC_REACTIONS
-    integrate_reaction();
-#endif
-
-    if (check_runtime_errors())
-      break;
-
-    
-#ifdef LB
-    if (lattice_switch & LATTICE_LB)
-      lattice_boltzmann_update();
-      
-    if (check_runtime_errors())
-      break;
-#endif
-
-#ifdef LB_GPU
-    if(this_node == 0){
-#ifdef ELECTROKINETICS
-      if (ek_initialized) {
-        ek_integrate();
-      }
-      else {
-#endif
-        if (lattice_switch & LATTICE_LB_GPU)
-          lattice_boltzmann_update_gpu();
-#ifdef ELECTROKINETICS
-      }
-#endif
-    }
-#endif //LB_GPU
-
-#ifdef ELECTROSTATICS
-    if(coulomb.method == COULOMB_MAGGS) {
-      maggs_propagate_B_field(0.5*time_step); 
-    }
-#endif
-
-#ifdef NPT
-    if((this_node==0) && (integ_switch == INTEG_METHOD_NPT_ISO))
-      nptiso.p_inst_av += nptiso.p_inst;
-#endif
-
-#ifdef GHMC
-    if(thermo_switch & THERMO_GHMC) {
-      if (step % ghmc_nmd == ghmc_nmd-1)
-        ghmc_mc();
-    }
-#endif
-
-
-
-    /** Integration Steps: Update the Positions
-      \[ p_i(t + dt)   = p_i(t) + dt * \mu_{ij} * f_j(t) + dt * \mu_{ij} * f^B_j \]
-    */
-    propagate_pos_sd(); // we dont have velocities
-
-#ifdef BOND_CONSTRAINT
-    static bool bond_constraint_with_sd_warned=false;
-    if (!bond_constraint_with_sd_warned){ // warn only once
-      fprintf (stderr, "Warning, using BOND_CONSTRAINT with StokesDynamics might not work as expected!.\n");    
-      bond_constraint_with_sd_warned=true;
-    }
-    /**Correct those particle positions that participate in a rigid/constrained bond */
-    cells_update_ghosts();
-
-    correct_pos_shake();
-#endif
-
-#ifdef ELECTROSTATICS
-    if(coulomb.method == COULOMB_MAGGS) {
-      maggs_propagate_B_field(0.5*time_step); 
-    }
-#endif
-
-#ifdef NPT
-    if (check_runtime_errors())
-      break;
-#endif
-
-
-    /* Propagate time: t = t+dt */
-    sim_time += time_step;
-  }
-
-  /* verlet list statistics */
-  if(n_verlet_updates>0) verlet_reuse = n_steps/(double) n_verlet_updates;
-  else verlet_reuse = 0;
-
-#ifdef NPT
-  if(integ_switch == INTEG_METHOD_NPT_ISO) {
-    nptiso.invalidate_p_vel = 0;
-    MPI_Bcast(&nptiso.p_inst, 1, MPI_DOUBLE, 0, comm_cart);
-    MPI_Bcast(&nptiso.p_diff, 1, MPI_DOUBLE, 0, comm_cart);
-    MPI_Bcast(&nptiso.volume, 1, MPI_DOUBLE, 0, comm_cart);
-    if(this_node==0) nptiso.p_inst_av /= 1.0*n_steps;
-    MPI_Bcast(&nptiso.p_inst_av, 1, MPI_DOUBLE, 0, comm_cart);
-  }
-#endif
-
-#ifdef GHMC
-  if(thermo_switch & THERMO_GHMC)
-    ghmc_close();
-#endif
-
-}
-
-//#endif /* SD */
-
-/* Privat functions */
-/************************************************************/
-
-//#ifdef SD
-
-void propagate_pos_sd()
-{
-  /* Verlet list criterion */
-  double skin2 = SQR(0.5 * skin);
-
-  INTEG_TRACE(fprintf(stderr,"%d: propagate_pos:\n",this_node));
-  Cell *cell;
-  Particle *p;
-  int c, i, np;
-  //get total number of particles
-  int N=sd_get_particle_num();
-  // gather all the data for mobility calculation
-  real * pos=NULL;
-  pos=(real *)Utils::malloc(DIM*N*sizeof(double));
-  assert(pos!=NULL);
-  real * force=NULL;
-  force=(real *)Utils::malloc(DIM*N*sizeof(double));
-  assert(force!=NULL);
-  real * velocity=NULL;
-  velocity=(real *)Utils::malloc(DIM*N*sizeof(real));
-  assert(velocity!=NULL);
-#ifdef EXTERNAL_FORCES
-  const int COORD_ALL=COORD_FIXED(0)&COORD_FIXED(1)&COORD_FIXED(2);
-#endif
-  int j=0; // total particle counter
-  for (c = 0; c < local_cells.n; c++){
-    cell = local_cells.cell[c];
-    p    = cell->part;
-    np   = cell->n;
-    for (i = 0; i < np; i++) { // only count nonVirtual Particles
-#ifdef EXTERNAL_FORCES
-      if (p[i].p.ext_flag & COORD_ALL)
-	{
-	  fprintf (stderr, "Warning: Fixing particle in StokesDynamics this way with EXTERNAL_FORCES is not possible (and will be ignored). Please try to bind them e.g. harmonicaly.\n");
-	}
-#endif
-#ifdef  VIRTUAL_SITES
-      if (!ifParticleIsVirtual(&p[i]))
-#endif
-      {
-#ifdef SD_USE_FLOAT
-	for (int d=0;d<3;d++){
-	  pos[3*j+d]        = p[i].r.p[d];
-	  pos[3*j+d]        -=rint(pos[3*j+d]/box_l[d])*box_l[d];
-	  force[3*j+d]      = p[i].f.f[d];
-	}
-#else
-        memmove(&pos[3*j], p[i].r.p, 3*sizeof(double));
-        memmove(&force[3*j], p[i].f.f, 3*sizeof(double));
-	for (int d=0;d<3;d++){
-	  pos[3*j+d]        -=rint(pos[3*j+d]/box_l[d])*box_l[d];
-	}
-#endif
-	j++;
-      }
-    }
-  }
-  if (!(thermo_switch & THERMO_SD) && thermo_switch & THERMO_BD){
-    propagate_pos_bd(N,pos,force, velocity);
-  } else {
-    // cuda part
-#ifdef CUDA
-    //void propagate_pos_sd_cuda(double * box_l_h, int N,double * pos_h, double * force_h, double * velo_h){
-#ifdef SD_USE_FLOAT
-    real box_size[3];
-    for (int d=0;d<3;d++){
-      box_size[d]=box_l[d];
-    }
-#else
-    real * box_size = box_l;
-#endif
-    if (!(thermo_switch & THERMO_SD)){
-      temperature*=-1;
-    }
-  propagate_pos_sd_cuda(box_size,N,pos,force, velocity);
-  if (!(thermo_switch & THERMO_SD)){
-    temperature*=-1;
-  }
-#else
-  fprintf(stderr, "Warning - CUDA is currently required for SD\n");
-  fprintf(stderr, "So i am just sitting here and copying stupidly stuff :'(\n");
-#endif
-}
-  
-
-#ifdef NEMD
-  /* change momentum of each particle in top and bottom slab */
-  fprintf (stderr, "Warning: NEMD is in SD not supported.\n");
-#endif
-  j=0;
-  for (c = 0; c < local_cells.n; c++) {
-    cell = local_cells.cell[c];
-    p    = cell->part;
-    np   = cell->n;
-    for (i = 0; i < np; i++) {
-#ifdef VIRTUAL_SITES
-      if (ifParticleIsVirtual(&p[i])) continue;
-#endif
-      // write back of position and velocity data
-#ifdef SD_USE_FLOAT
-      for (int d=0;d<3;d++){
-	p[i].r.p[d] = pos[3*j+d]+box_l[d]*rint(p[i].r.p[d]/box_l[d]);
-	p[i].m.v[d] = velocity[3*j+d];
-	//p[i].f.f[d] *= (0.5*time_step*time_step)/(*part).p.mass;
-      }
-#else
-      for (int d=0;d<3;d++){
-	p[i].r.p[d] = pos[3*j+d]+box_l[d]*rint(p[i].r.p[d]/box_l[d]);
-      }
-      memmove(p[i].m.v, &velocity[DIM*j], 3*sizeof(double));
-#endif
-      // somehow this does not effect anything, although it is called ...
-      for (int d=0;d<3;d++){
-	p[i].f.f[d] *= (0.5*time_step*time_step)/(*p).p.mass;
-      }
-      for (int d=0;d<DIM;d++){
-        assert(!isnan(pos[DIM*i+d]));
-      }
-      j++;
-      ONEPART_TRACE(if(p[i].p.identity==check_id) fprintf(stderr,"%d: OPT: PV_1 v_new = (%.3e,%.3e,%.3e)\n",this_node,p[i].m.v[0],p[i].m.v[1],p[i].m.v[2]));
-      ONEPART_TRACE(if(p[i].p.identity==check_id) fprintf(stderr,"%d: OPT: PPOS p = (%.3f,%.3f,%.3f)\n",this_node,p[i].r.p[0],p[i].r.p[1],p[i].r.p[2]));
-   
-
-#ifdef ROTATION
-      propagate_omega_quat_particle(&p[i]);
-#endif
-
-      /* Verlet criterion check */
-      if(distance2(p[i].r.p,p[i].l.p_old) > skin2 ) resort_particles = 1;
-    }
-  }
-  free(pos);
-  free(force);
-  free(velocity);
-  announce_resort_particles();
-}
-
-
-int sd_set_particles_apart(){
-  {
-    double V=box_l[0];
-    V*=box_l[1];
-    V*=box_l[2];
-    int N=sd_get_particle_num();
-    double Vpart=4.*M_PI/3.*sd_radius*SQR(sd_radius)*N;
-    double phi = Vpart/V;
-    //fprintf(stderr,"info: sd_set_particles_apart: Volume fraction of particles is %f.\n",phi);
-    if (phi > M_PI/3./sqrt(2)){
-      return -5;
-    }
-    if (phi > 0.7){
-      return -4;
-    }
-    if (phi > 0.5){
-      fprintf(stderr,"warning: sd_set_particles_apart: Volume fraction of particles is %f.\n\
-It could be difficult to set particles apart!\n",phi);
-    }
-  }
-  bool outer=false;
-  bool moved=false;
-  do {
-    outer=false;
-    for (int c = 0; c < local_cells.n; c++){
-      Cell * cell  = local_cells.cell[c];
-      Particle * p = cell->part;
-      int np   = cell->n;
-      bool inner=false;
-      do {
-	inner=false;
-	for (int i = 0; i < np; i++) { // only count nonVirtual Particles
-	  for (int j = i+1; j <np; j++){
-	    //position: p[i].r.p
-	    double dr2=0;
-	    double dr[3];
-	    for (int d=0; d<3;d++){
-	      dr[d]=p[i].r.p[d]-p[j].r.p[d];
-	      dr[d]-=box_l[d]*rint(dr[d]/box_l[d]);
-	      dr2+=SQR(dr[d]);
-	    }
-	    if (dr2 <= SQR(2*sd_radius*(1+1e-5))){
-	      double drn = sqrt(dr2);
-	      // push to a distance of 1e-5;
-	      double fac=(sd_radius*(1+1.2e-5)-drn/2)/drn;
-	      assert(!isnan(fac));
-	      //printf("%d %d\t\t",i,j);
-	      for (int d=0; d<3;d++){
-		if(isnan(dr[d]*fac)){
-		  fprintf(stderr, "%4d %4d %6e %6e ",i,j,dr[d],fac);
-		}
-		assert(!isnan(dr[d]));
-		assert(!isnan(dr[d]*fac));
-		p[i].r.p[d]+=fac*dr[d];
-		p[j].r.p[d]-=fac*dr[d];
-	      }
-	      dr2=0;
-	      for (int d=0; d<3;d++){
-		dr[d]=p[i].r.p[d]-p[j].r.p[d];
-		dr[d]-=box_l[d]*rint(dr[d]/box_l[d]);
-		dr2+=SQR(dr[d]);
-	      }
-	      assert(dr2 > SQR(2*sd_radius*(1+1e-5)));
-	      assert(dr2 < SQR(2*sd_radius*(1+2e-5)));
-	      inner=true;
-	      moved=true;
-	    }
-	  }
-	}
-      } while (inner);
-      for (int cj =0; cj < local_cells.n;cj++){
-	Cell * cellJ  = local_cells.cell[cj];
-	Particle * pj = cellJ->part;
-	int npj       = cellJ->n;
-	for (int i = 0; i < np; i++) {
-	  for (int j = (c==cj?i+1:0); j <npj; j++){
-	    //position: p[i].r.p
-	    double dr2=0;
-	    double dr[3];
-	    for (int d=0; d<3;d++){
-	      dr[d]=p[i].r.p[d]-pj[j].r.p[d];
-	      dr[d]-=box_l[d]*rint(dr[d]/box_l[d]);
-	      dr2+=SQR(dr[d]);
-	    }
-	    if (dr2 <= SQR(2*sd_radius*(1+1e-5))){
-	      double drn = sqrt(dr2);
-	      // push to a distance of 1e-5;
-	      double fac=(sd_radius*(1+1.2e-5)-drn/2)/drn;
-	      assert(!isnan(fac));
-	      assert (fac > 0);
-	      for (int d=0; d<3;d++){
-		p[i].r.p[d] +=fac*dr[d];
-		pj[j].r.p[d]-=fac*dr[d];
-	      }
-	      outer=true;
-	      moved=true;
-	    }
-	    for (int d=0;d<3;d++){
-	      assert(!isnan( pj[j].r.p[d]));
-	      assert(!isnan( p[i].r.p[d]));
-	    }
-	  }
-	}
-      }
-    }
-    if (moved) fprintf(stderr,"+");
-  } while (outer);
-  //fprintf(stderr,"set_apart suceeded ");
-  for (int c = 0; c < local_cells.n; c++){
-    Cell * cell  = local_cells.cell[c];
-    Particle * p = cell->part;
-    int np   = cell->n;
-    for (int cj =0; cj < local_cells.n;cj++){
-      Cell * cellJ  = local_cells.cell[cj];
-      Particle * pj = cellJ->part;
-      int npj       = cellJ->n;
-      for (int i = 0; i < np; i++) {
-	for (int j = (c==cj?i+1:0); j <npj; j++){
-	  double dr2=0;
-	  double dr[3];
-	  for (int d=0; d<3;d++){
-	    dr[d]=p[i].r.p[d]-pj[j].r.p[d];
-	    dr[d]-=box_l[d]*rint(dr[d]/box_l[d]);
-	    dr2+=SQR(dr[d]);
-	  }
-	  assert((dr2 > SQR(2*sd_radius*(1+1e-5))));
-	}
-      }
-    }
-  }
-  return 0;
-}
-
-int sd_get_particle_num(){
-  int N=0;
-  for (int c = 0; c < local_cells.n; c++){
-#ifdef VIRTUAL_SITES
-    Cell * cell = local_cells.cell[c];
-    Particle * p    = cell->part;
-    int np   = cell->n;
-    for (int i = 0; i < np; i++) { // only count nonVirtual Particles
-      if (!ifParticleIsVirtual(&p[i])) ++N;
-    } 
-#else
-    N  += local_cells.cell[c]->n;
-#endif
-  }
-  return N;
-}
-
-void propagate_pos_bd(int N, real * pos, real * force, real * velocity){
-  real self_mobility=1/(6*M_PI*sd_viscosity*sd_radius);
-  real scal_f = time_step*self_mobility;
-  real scal_r=sqrt(2*temperature*time_step*self_mobility);
-  for (int i=0;i<3*N;i++){
-    pos[i]+=scal_f * force[i]
-          + scal_r * gaussian_random();
-  }
-}
-
-#endif /* SD */
diff --git a/src/core/integrate_sd.hpp b/src/core/integrate_sd.hpp
deleted file mode 100644
index fc3ae150d39..00000000000
--- a/src/core/integrate_sd.hpp
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
-  Copyright (C) 2010,2012,2013,2014,2015,2016 The ESPResSo project
-  Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010 
-    Max-Planck-Institute for Polymer Research, Theory Group
-  
-  This file is part of ESPResSo.
-  
-  ESPResSo is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-  
-  ESPResSo is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-  
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>. 
-*/
-#ifndef __INTEGRATE_SD_H
-#define __INTEGRATE_SD_H
-
-/** Average number of integration steps the verlet list has been re
-    used. */
-extern double verlet_reuse;
-
-extern double sd_viscosity;
-
-extern double sd_radius;
-
-extern int sd_seed[2];
-extern int sd_random_state[2];
-
-extern double sd_random_precision;
-
-#ifdef SD
-
-#ifdef CUDA
-#  define HAVE_CUBLAS
-#  include <cublas_v2.h>
-//#include <magma.h>
-#  ifdef __CUDA_ARCH__
-#    if __CUDA_ARCH__ < 200
-#      error "Stokesian Dynamics needs at least CUDA ComputeCapability 2.0"
-#    endif //#if __CUDA_ARCH__ < 200
-#  endif //#ifdef  __CUDA_ARCH__
-#else
-#  ifdef SD
-#    error "CUDA is not given!"
-#    error "StokesDynamics requires CUDA"
-#  endif
-#endif
-
-#endif
-
-#define DIM (3)
-
-/** \file integrate_sd.hpp    Stokes dynamics integrator.
- *
- *  For more information see \ref integrate_sd.cpp "integrate_sd.cpp".
-*/
-
-/************************************************************/
-/** \name Exported Variables */
-/************************************************************/
-/*@{*/
-
-#ifdef SD_USE_FLOAT
-typedef float real;
-#else
-typedef double real;
-#endif
-
-/** Switch determining which Integrator to use. */
-extern int integ_switch;
-
-/** incremented if a Verlet update is done, aka particle resorting. */
-extern int n_verlet_updates;
-
-/** Time step for the integration. */
-extern double time_step;
-extern double time_step_half;
-extern double time_step_squared;
-extern double time_step_squared_half;
-
-/** Old time step needed for rescaling of forces. */
-extern double old_time_step;
-/** Actual simulation time (only on MASTER NODE). */
-extern double sim_time;
-/** Maximal interaction cutoff. */
-extern double max_cut;
-/** Verlet list skin. */
-extern double skin;
-
-/** If non-zero, the particle data will be resorted before the next integration. */
-extern int    resort_particles;
-/** If non-zero, the forces will be recalculated before the next integration. */
-extern int    recalc_forces;
-
-/*@}*/
-
-/** \name Exported Functions */
-/************************************************************/
-/*@{*/
-
-/** check sanity of integrator params */
-void integrator_sanity_checks_sd();
-
-/** integrate with euler integrator.
-    \param n_steps number of steps to integrate.
- */
-void integrate_sd(int n_steps);
-
-/** set particle apart so they dont overlap.
- */
-int sd_set_particles_apart();
-
-int sd_test(int size, int type);
-
-/*@}*/
-//#endif /* SD */
-#endif
diff --git a/src/core/integrate_sd_cuda.cu b/src/core/integrate_sd_cuda.cu
deleted file mode 100644
index e214420ea3a..00000000000
--- a/src/core/integrate_sd_cuda.cu
+++ /dev/null
@@ -1,1743 +0,0 @@
-
-/*
-   Copyright (C) 2010,2011,2012,2016 The ESPResSo project
-
-   This file is part of ESPResSo.
-
-   ESPResSo is free software: you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation, either version 3 of the License, or
-   (at your option) any later version.
-
-   ESPResSo is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-#include "config.hpp"
-#ifdef SD /* Terminates at end of file */
-//#define SD_PC
-
-/** \file integrate_sd_cuda.cu    Stokes dynamics integrator.
- *
- *  Here the calculation of the displacements is implemented.
-*/
-
-
-#include <stdio.h>
-#include <iostream>
-#include "cuda_runtime.h"
-#include "curand.h"
-#include <device_functions.h>
-// for std::swap:
-// C++98:
-#include <algorithm>
-// C++11:
-//#include <utility>
-
-#include "assert.h"
-#include "integrate_sd_cuda.hpp"
-#include "integrate_sd_cuda_debug.hpp"
-#include "integrate_sd_cuda_kernel.hpp"
-#include "integrate_sd.hpp"
-#include <cublas_v2.h>
-//#include "errorhandling.hpp"
-#include "global.hpp"
-
-#if defined(OMPI_MPI_H) || defined(_MPI_H)
-#error CU-file includes mpi.h! This should not happen!
-#endif
-
-
-// global variables for usage in this file
-cublasHandle_t     cublas    = NULL;
-curandGenerator_t  generator = NULL;
-int reprint=-1;
-/* *************************************************************************************************************** *
- * ********************************************     implementation    ******************************************** *
- * *************************************************************************************************************** */
-/* *************************************************************************************************************** *
- * *******     III MM   MM PPP  L     EEEE MM   MM EEEE NN    N TTTTTTT  AAA  TTTTTTT III  OOO  NN    N    ******* *
- * *******      I  M M M M P  P L     E    M M M M E    N N   N    T    A   A    T     I  O   O N N   N    ******* *
- * *******      I  M  M  M PPP  L     EEE  M  M  M EEE  N  N  N    T    AAAAA    T     I  O   O N  N  N    ******* *
- * *******      I  M     M P    L     E    M     M E    N   N N    T    A   A    T     I  O   O N   N N    ******* *
- * *******     III M     M P    LLLL  EEEE M     M EEEE N    NN    T    A   A    T    III  OOO  N    NN    ******* *
- * *************************************************************************************************************** */
-/* *************************************************************************************************************** */
-
-/* *************************************************************************************************************** *
- * ********************************************     HOST-Functions    ******************************************** *
- * *************************************************************************************************************** */
-
-// this calls all the functions to:
-//  * generate the mobility matrix (farfield and nearfield)
-//  * compute the displacements
-//  * add the displacements to the positions
-// PARAMTERS:
-// box_l_h : the size of the box in x,y and z-direction, on the host (in)
-// N       : Number of particles (in)
-// pos_h   : position of the particles, simple* array on host (in and out)
-// force_h : forces on the particles, simple* array on host (in)
-// velo_h  : velocities of the particles, simple* array on host (in and out)
-// * : a simple array is e.g. [x_1, y_1, z_1, x_2, y_2, z_2, ...]
-void propagate_pos_sd_cuda(real * box_l_h, int N,real * pos_h, real * force_h, real * velo_h){
-  //printVectorHost(pos_h,3*N,"pos after call");
-  real viscosity=sd_viscosity;
-  real radius   =sd_radius;
-  if (viscosity  < 0){
-    std::cerr << "The viscosity for SD was not set\n";
-    errexit();
-  }
-  if (radius  < 0){
-    std::cerr << "The particle radius for SD was not set\n";
-    errexit();
-  }
-  if (time_step < 0){
-    std::cerr << "The timestep was not set\n";
-    errexit();
-  }
-  
-  int lda=((3*N+31)/32)*32;
-  
-  //static cublasHandle_t cublas=NULL;
-  if (cublas==NULL){
-    cublasCall(cublasCreate(&cublas));
-    //magma_init();
-  }
-
-  real * box_l_d=NULL;
-  cuda_safe_mem(cudaMalloc((void**)&box_l_d, 3*sizeof(real)));
-  cuda_safe_mem(cudaMemcpy(box_l_d,box_l_h,3*sizeof(real),cudaMemcpyHostToDevice));
-  real * pos_d=NULL;
-  cuda_safe_mem(cudaMalloc((void**)&pos_d, (DIM)*(((N+31)/32)*32)*sizeof(real)));
-  cuda_safe_mem(cudaMemcpy(pos_d,pos_h,N*DIM*sizeof(real),cudaMemcpyHostToDevice));
-  //printVectorDev(pos_d,3*N,"pos after copy");
-  real * force_d=NULL;
-  cuda_safe_mem(cudaMalloc((void**)&force_d, DIM*(((N+31)/32)*32)*sizeof(real)));
-  cuda_safe_mem(cudaMemcpy(force_d,force_h,N*DIM*sizeof(real),cudaMemcpyHostToDevice));
-  real * mobility_d=NULL;
-  cuda_safe_mem(cudaMalloc((void**)&mobility_d, lda*N*3*sizeof(real)));
-  real * disp_d=NULL;
-  cuda_safe_mem(cudaMalloc((void**)&disp_d, DIM*(((N+31)/32)*32)*sizeof(real)));
-  int myInfo_h[]={0,0,0};
-  int * myInfo_d=NULL;
-  cuda_safe_mem(cudaMalloc((void**)&myInfo_d, 3*sizeof(int)));
-  cuda_safe_mem(cudaMemcpy(myInfo_d,myInfo_h,3*sizeof(int),cudaMemcpyHostToDevice));
-  
-  sd_compute_displacement( pos_d, N, viscosity, radius, box_l_d, box_l_h, mobility_d, force_d, disp_d, myInfo_d);
-  
-  int numBlocks = (N+numThreadsPerBlock-1)/numThreadsPerBlock;
-
-  // copy before rescaling to get velocities
-  cuda_safe_mem(cudaMemcpy(velo_h,disp_d,N*DIM*sizeof(real),cudaMemcpyDeviceToHost));
-  // rescale displacements
-  real alpha=time_step;
-  cublasCall(cublasRscal( cublas, 3*N, &alpha, disp_d, 1));  
-#ifdef SD_MAX_STEP
-  sd_real_integrate_prepare<<< numBlocks , numThreadsPerBlock  >>>(pos_d , disp_d, box_l_d, sd_radius, N);
-#endif
-  sd_real_integrate<<< numBlocks , numThreadsPerBlock  >>>(pos_d , disp_d, box_l_d, sd_radius, N);
-  
-  // copy back the positions
-  cuda_safe_mem(cudaMemcpy(pos_h,pos_d,N*DIM*sizeof(real),cudaMemcpyDeviceToHost));
-  // save the displacements as velocities (maybe somebody is interested)
-  //alpha=1/time_step;
-  //cublasCall(cublasRscal(cublas, DIM*N, &alpha, disp_d, 1));
-  
-  
-  cuda_safe_mem(cudaFree((void*)box_l_d));
-  cuda_safe_mem(cudaFree((void*)pos_d));
-  cuda_safe_mem(cudaFree((void*)force_d));
-  cuda_safe_mem(cudaFree((void*)mobility_d));
-  cuda_safe_mem(cudaFree((void*)disp_d));
-  cuda_safe_mem(cudaFree((void*)myInfo_d));
-}
-
-
-
-// calculate the farfield and the nearfield and add them
-// PARAMETERS:
-// cublas    : a valid handle of cublas (in)
-// r_d       : position of the particles on the device, size 3*N (in)
-//             the form has to be [x_1, y_1, z_1, x_2, y_2, z_2, ...]
-// N         : Number of particles (in)
-// viscosity : viscositiy of the fluid (in)
-// radius    : Particle radius (in)
-// L_d       : boxsize in x y and z-directions (in)
-// total_mobility_d: matrix of the computed total mobility, size 3*3*N*N (in/out, is overwritten)
-void sd_compute_displacement(const real * r_d, int N, real viscosity, real radius,const real * L_d,const real * L_h,
-			     const real * total_mobility_d, const real * force_d, real * disp_d, int * myInfo_d)
-{
-#ifndef SD_FF_ONLY
-  real det_prec=1e-3;
-#endif
-  real ran_prec=sd_random_precision;
-  static real ran_prec_last=1e-3;
-  if ( ran_prec_last != ran_prec){
-    ran_prec_last=ran_prec;
-    printf("\nSetting the precision for the random part to %e\n",ran_prec);
-  }
-  cudaThreadSynchronize(); // just for debugging
-  cudaCheckError("START");
-  const int lda=((3*N+31)/32)*32;
-  const int numBlocks = (N+numThreadsPerBlock-1)/numThreadsPerBlock;
-  // sort particles in buckets
-#ifndef SD_FF_ONLY
-  bool _use_buckets=false;
-  for (int d=0;d<3;d++){
-    if (L_h[d] > 4*4*radius){ // check if we can make at least 4 boxes in any direction
-      //_use_buckets=true;
-      //#warning "buckets disabled - they are still buggy"
-    }
-  }
-  const bool use_buckets=_use_buckets;
-  // variables for the bucket version
-  real * bucket_size=NULL;
-  int * bucket_num=NULL;
-  int * particle_count=NULL;
-  int * particle_sorted_list=NULL;
-  int * particle_to_bucket_list=NULL;
-  int total_bucket_num=1;
-  int max_particles_per_bucket=N;
-  //const int bucket_size_factor=3; // is global
-  if ( use_buckets ){
-    int bucket_num_h[3];
-    real bucket_size_h[3];
-    real bucket_volume=1;
-    real bucket_volume_with_border=1;
-    for (int d=0;d<3;d++){
-      bucket_num_h[d]=L_h[d]/4/radius*bucket_size_factor;
-      //bucket_num_h[d]=bucket_num_h[d]>(2*bucket_size_factor+1)?bucket_num_h[d]:1; // buckets are propably only usefull if there are more than 3 ...
-      bucket_num_h[d]=bucket_num_h[d]>N?N:bucket_num_h[d]; // never more than N**3 buckets ...
-      total_bucket_num *= bucket_num_h[d];
-      bucket_size_h[d]=L_h[d]/bucket_num_h[d];
-      bucket_volume*=bucket_size_h[d];
-      bucket_volume_with_border*=bucket_size_h[d]+2*radius;
-    }
-    max_particles_per_bucket=bucket_volume_with_border/(4*M_PI*radius*radius*radius/3)*0.74048;
-    max_particles_per_bucket=max_particles_per_bucket > N || max_particles_per_bucket < 0?N:max_particles_per_bucket; // there cannot be more than all particles in one bucket
-    cuda_safe_mem(cudaMalloc((void **) &bucket_size,             sizeof(real)*3));
-    cuda_safe_mem(cudaMemcpy(bucket_size,bucket_size_h,          sizeof(real)*3, cudaMemcpyHostToDevice));
-    cuda_safe_mem(cudaMalloc((void **) &bucket_num,              sizeof(int) *3));
-    cuda_safe_mem(cudaMemcpy(bucket_num,bucket_num_h,            sizeof(int) *3, cudaMemcpyHostToDevice));
-    cuda_safe_mem(cudaMalloc((void **) &particle_count,          sizeof(int) *total_bucket_num));
-    sd_set_value<<<32,32>>>(particle_count,total_bucket_num,0);
-    cuda_safe_mem(cudaMalloc((void **) &particle_to_bucket_list, sizeof(int) *N));
-    cuda_safe_mem(cudaMalloc((void **) &particle_sorted_list,    sizeof(int) *total_bucket_num*max_particles_per_bucket));
-    sd_bucket_sort<<<numBlocks, numThreadsPerBlock >>>(r_d, bucket_size, bucket_num, N, particle_count, particle_sorted_list,
-						       max_particles_per_bucket, total_bucket_num, particle_to_bucket_list);
-  }
-#endif  
-  
-  // compute the mobility Matrix
-  matrix mobility;
-  mobility.size      = N*3;
-  mobility.ldd       = ((N*3+31)/32)*32;
-  mobility.ldd_short = ((N+31)/32)*32;
-  real * &mobility_d=mobility.data;
-  cuda_safe_mem(cudaMalloc( (void**)&mobility_d, lda*DIM*N*sizeof(real) ));
-  assert(mobility_d);
-  sd_set_zero_matrix<<<numBlocks, numThreadsPerBlock >>>(mobility_d,mobility.size, mobility.ldd);
-  cudaThreadSynchronize(); // just for debugging
-  cudaCheckError("");
-#ifdef SD_NOT_PERIODIC
-  sd_compute_mobility_matrix<<< numBlocks , numThreadsPerBlock  >>>(r_d,N,1./(6.*M_PI*viscosity*radius), radius, mobility_d);
-  cudaThreadSynchronize(); // just for debugging
-  cudaCheckError("compute mobility error");
-#else
-  real ew_prec=1e-8;
-  real L_min=L_h[0];
-  L_min=min(L_min,L_h[1]);
-  L_min=min(L_min,L_h[2]);
-  real L_max=L_h[0];
-  L_max=max(L_max,L_h[1]);
-  L_max=max(L_max,L_h[2]);
-  assert(ew_prec < 1);
-  real xi=2./L_min*sqrt(log(1./ew_prec));
-  real kmax2=-4*xi*xi*log(ew_prec);
-  kmax2*=SQR(L_max/2./M_PI);
-  {
-    static int printed=0;
-    if ((printed%reprint == 1)){
-      fprintf(stderr,"\nL_min is %f\n",L_min);
-      fprintf(stderr,"xi is %f\n",xi);
-      fprintf(stderr,"kmax is %e\n",sqrt(kmax2));
-      fprintf(stderr,"selfmobility is %e\n",1./(6.*M_PI*viscosity*radius));
-      //printed=1;
-    }
-    printed++;
-  }
-  //fprintf(stderr,"kmax is %e\n",sqrt(kmax2));
-  sd_compute_mobility_matrix_real_short<<<numBlocks, numThreadsPerBlock>>>(r_d, N, 1./(6.*M_PI*viscosity*radius), radius, L_d, 
-									   mobility.data, L_min/2., xi, xi*radius, xi*radius*xi*radius*xi*radius);
-  //std::cerr << "kmax is " << ceil(sqrt(kmax2)) << std::endl;
-  sd_compute_mobility_matrix_wave_cpu(ceil(sqrt(kmax2))+1, (ceil(sqrt(kmax2))+1)*(ceil(sqrt(kmax2))+1), mobility,
-				      radius, L_h, xi, 1./(6.*M_PI*viscosity*radius), ew_prec);
-  
-  //cudaCheckError("");
-  //mobility.hash_data();
-  if (mobility.wavespace){
-    cudaThreadSynchronize(); 
-    cudaCheckError("sd_compute_mobility_matrix_real_short");
-    sd_compute_mobility_sines<<<numBlocks, numThreadsPerBlock>>>(r_d, N, mobility.wavespace->vecs, mobility.wavespace->num,
-								 mobility.wavespace->sines, mobility.wavespace->cosines, 
-								 mobility.ldd_short);
-    cudaThreadSynchronize(); 
-    cudaCheckError("sd_compute_mobility_sines");
-
-    //cuda_safe_mem(cudaMalloc( (void**)&mobility.dense,    mobility.ldd*mobility.size*sizeof(real) ));
-    //cublasCall(cublasRcopy(cublas, mobility.size*mobility.ldd,mobility.data, 1, mobility.dense,1));
-    if (false) {
-      sd_wavepart_addto_matrix<<<dim3(numBlocks,N,1), dim3(numThreadsPerBlock,1,1),
-                                 numThreadsPerBlock*3*sizeof(real)>>>(mobility.wavespace->num,   mobility.wavespace->matrices,
-								      mobility.wavespace->sines, mobility.wavespace->cosines,
-								      mobility.ldd_short,        N,
-								      mobility.data,            mobility.ldd);
-      cudaThreadSynchronize(); 
-      cudaCheckError("add wavepart to matrix");
-      mobility._free_wavespace();
-    }
-#ifdef SD_DEBUG    
-    /* *********************************
-     * **   Test to check wavespace   **
-     * ********************************* *
-    real * testvec=NULL;
-    cuda_safe_mem(cudaMalloc( (void**)&testvec,    mobility.ldd*3*sizeof(real) ));
-    if (true){
-      if (generator == NULL){
-	curandCall(curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT));
-      }
-      curandCall(curandGenerateNormalReal( generator, testvec, ((mobility.size+1)/2)*2, 0, sqrt(2.)));
-    } else {
-      sd_set_zero<<<numBlocks, numThreadsPerBlock>>>(testvec, mobility.size);
-      //real val=1e-20;
-      //for (int i = 0 ; i < mobility.size;i++){
-      //if (i > 30)
-      //sd_set_value<<<1,1>>>(testvec+i,1,val);
-      //val*=10;
-      //}
-      sd_set_value<<<1,1>>>(testvec+6,3,1);
-    }
-    std::cout << "called ";
-    real * testout1 = testvec+mobility.ldd*1;
-    real * testout2 = testvec+mobility.ldd*2;
-    const real one=1;
-    sd_Rgemv(&one, mobility, testvec, testout1);
-    if (warnings > 2){
-      mobility.print();
-      }
-#endif
-    //sd_wavepart_addto_matrix<<<dim3(3,N,1),dim3(2,1,1),numThreadsPerBlock*3*sizeof(real)>>>(mobility.wavespace->num, 
-    sd_wavepart_addto_matrix<<<dim3(numBlocks,N,1), dim3(numThreadsPerBlock,1,1),
-                               numThreadsPerBlock*3*sizeof(real)>>>(mobility.wavespace->num,   mobility.wavespace->matrices,
-								    mobility.wavespace->sines, mobility.wavespace->cosines,
-								    mobility.ldd_short,        N,
-								    mobility.data,             mobility.ldd);
-    
-    cudaCheckError("");
-    mobility._free_wavespace();
-    cudaCheckError("");
-#ifdef SD_DEBUG
-    sd_Rgemv(&one, mobility, testvec, testout2);
-    const real minusone=-1;
-    cublasCall(cublasRaxpy(cublas, mobility.size, &minusone, testout2, 1, testout1, 1));
-    real erg;
-    cublasCall(cublasRnrm2(cublas, mobility.size, testout1, 1, &erg));
-    if (erg > 0.01){
-      printVectorDev(testvec,  mobility.size, "in         ");
-      printVectorDev(testout2, mobility.size, "correct out");
-      printVectorDev(testout1, mobility.size, "difference ");
-      mobility.print();
-      fflush(stdout);
-    }
-    assert(erg < 0.01);
-    */
-#endif
-  }
-  //mobility.printStats();
-  if (!isSymmetricDev(mobility)){
-    if (warnings > 2){
-      printMatrixDev(mobility);
-    }
-  }
-  assert(isSymmetricDev(mobility));
-  if (warnings > 20){
-    //printMatrixDev(mobility);
-    //mobility.printWavespace();
-  }
-#endif
-    
-#ifndef SD_FF_ONLY
-  // compute the resistance matrix
-  matrix resistance;
-  resistance.size      = N*3;
-  resistance.ldd       = ((N*3+31)/32)*32;
-  resistance.ldd_short = ((N+31)/32)*32;
-  int lda_short=resistance.ldd_short;
-  resistance.is_sparse=true;
-  const int max_part_in_lc=125;
-  cuda_safe_mem(cudaMalloc( (void**)&resistance.data,    lda* 3*    max_part_in_lc* sizeof(real) ));
-  cuda_safe_mem(cudaMalloc( (void**)&resistance.col_idx, lda_short* max_part_in_lc* sizeof(int) ));
-  cuda_safe_mem(cudaMalloc( (void**)&resistance.row_l,   lda_short*sizeof(int) ));
-  
-  if (use_buckets){
-    cudaThreadSynchronize(); 
-    cudaCheckError("finding interaction error");
-    sd_find_interacting_particles<<<numBlocks, numThreadsPerBlock>>>(r_d, L_d, N, resistance.col_idx, resistance.row_l, 
-								     particle_count, particle_sorted_list, bucket_size,
-								     bucket_num, particle_to_bucket_list, 4*radius,total_bucket_num);
-    cudaThreadSynchronize(); 
-    cudaCheckError("finding interaction error");
-  } else {
-    cudaThreadSynchronize(); 
-    cudaCheckError("finding interaction error");
-    sd_find_interacting_particles<<<numBlocks, numThreadsPerBlock>>>(r_d, L_d, N, resistance.col_idx, resistance.row_l, 
-								     4*radius);
-    cudaThreadSynchronize(); 
-    cudaCheckError("finding interaction error");
-    
-  }
-  //#ifdef SD_DEBUG
-  {
-    assert(sd_find_max(resistance.row_l,N) < max_part_in_lc );
-  }
-  //#endif // SD_DEBUG
-  sd_compute_resistance_matrix_sparse<<<numBlocks, numThreadsPerBlock>>>(r_d, N, -1./(6.*M_PI*viscosity*radius), radius, L_d,
-									   resistance.data, resistance.col_idx, resistance.row_l, myInfo_d);
-    /*} else {
-  resistance.is_sparse=false;
-    cuda_safe_mem(cudaMalloc( (void**)&resistance_d, lda*N*3*sizeof(real) ));
-    sd_set_zero_matrix<<<numBlocks, numThreadsPerBlock >>>(resistance.data,resistance.size,resistance.ldd);
-    cudaThreadSynchronize(); // debug
-    cudaCheckError("sd_set_zero");
-    sd_compute_resistance_matrix<<< numBlocks , numThreadsPerBlock  >>>(r_d,N,-1./(6.*M_PI*viscosity*radius), radius, L_d, resistance_d, myInfo_d);
-    //#warning "bug"
-    }*/
-  cudaThreadSynchronize(); // we need both matrices to continue;
-  cudaCheckError("compute resistance error");
-#endif
-  //printVectorDev(resistance.data,resistance.size,"resis first line");
-#ifdef SD_DEBUG
-  assert(!hasAnyNanDev(mobility_d,N*3*lda));
-#ifndef SD_FF_ONLY
-  if (hasAnyNanDev(resistance_d,N*3*lda)){
-    printPosDev(r_d,N,"positions");
-    //printMatrixDev(resistance_d,lda, 3*N,"Resistance with nans ...");
-  }
-  assert(!hasAnyNanDev(resistance_d,N*3*lda));
-  assert(isSymmetricDev(resistance_d,lda,N*3));
-#endif
-#endif
-  // initialize displacement
-  static int last_N=-1;
-  static real * last_det_disp=NULL;
-  if (N == last_N && last_det_disp!=NULL){
-    cublasCall(cublasRcopy(cublas, N*3, last_det_disp,1, disp_d,1));
-  } else {
-    sd_set_zero<<<numBlocks, numThreadsPerBlock>>>(disp_d,N*3);
-  }
-  //printVectorDev(r_d, 3*N,"pos");
-#ifdef SD_FF_ONLY
-  const real one=1;
-  //cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, 3*N, 3*N, &one, mobility_d, lda, force_d, 1, &zero, disp_d, 1));
-  sd_Rgemv(&one, mobility,force_d,disp_d);
-#else
-  real err_force = sd_iterative_solver(mobility, resistance, force_d, disp_d,det_prec,0, true);
-#endif
-#ifdef SD_DEBUG
-  if (hasAnyNanDev(disp_d,N*3)){
-    printVectorDev(disp_d,N*3,"disp");
-    printVectorDev(force_d,N*3,"force");
-#ifndef SD_FF_ONLY
-    //printMatrixDev(resistance_d,lda,N*3,"resistance produces nans?");
-#endif
-  }
-  assert(!hasAnyNanDev(disp_d,N*3));
-#endif
-  // save deterministic displacement
-  if (N > last_N){
-    if (last_det_disp!= NULL){
-      cuda_safe_mem(cudaFree((void*)last_det_disp));
-      last_det_disp=NULL;
-    }
-  }
-  if (last_det_disp==NULL){
-    cuda_safe_mem(cudaMalloc( (void **) &last_det_disp, N*3*sizeof(real)));        assert(last_det_disp !=NULL);
-  }
-  cublasCall(cublasRcopy(cublas, N*3, disp_d, 1, last_det_disp,1));
-  last_N=N;
-  {
-    static int printed=0;
-    if (!printed){
-      ;//printMatrixDev(mobility);
-    }
-  }
-  // brownian part
-  if (temperature > 0 ){
-#ifndef SD_FF_ONLY
-    int myInfo_h[3];
-    cuda_safe_mem(cudaMemcpy(myInfo_h,myInfo_d,3*sizeof(int),cudaMemcpyDeviceToHost));
-#else
-    const int myInfo_h[3]={0,0,0};
-#endif
-    int N_ldd = ((N+31)/32)*32;
-    int num_of_rands = N_ldd*myInfo_h[2]*2*DIM+N_ldd*DIM; // has to be even!
-    if (myInfo_h[0]){
-      fprintf(stderr,"We had %d overlapping particles!\n",myInfo_h[0]);
-    }
-    real * brownian_force_nf = NULL;
-    if (myInfo_h[2]){
-      cuda_safe_mem(cudaMalloc( (void**)&brownian_force_nf, (3*N)*sizeof(real) ));     assert(brownian_force_nf != NULL);
-    }
-    real * brownian_force_ff = NULL;
-    cuda_safe_mem(cudaMalloc( (void**)&brownian_force_ff, (3*N)*sizeof(real) ));     assert(brownian_force_ff != NULL);
-    real * gaussian = NULL;
-    cuda_safe_mem(cudaMalloc( (void**)&gaussian, (num_of_rands)*sizeof(real) ));     assert(gaussian != NULL);
-    real * gaussian_ff = gaussian;
-    real * gaussian_nf = gaussian+N_ldd*DIM;
-    unsigned long long *      sd_random_generator_offset     = (unsigned long long *)sd_random_state;
-    unsigned long long *      sd_seed_ull                    = (unsigned long long *)sd_seed;
-    static unsigned long long sd_random_generator_offset_last= *sd_random_generator_offset;
-    if (generator == NULL){
-      curandCall(curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT));
-      curandCall(curandSetPseudoRandomGeneratorSeed(generator, *sd_seed_ull));
-      curandCall(curandSetGeneratorOrdering( generator, CURAND_ORDERING_PSEUDO_BEST));
-      curandCall(curandSetGeneratorOffset( generator, *sd_random_generator_offset));
-    }
-    if (* sd_random_generator_offset != sd_random_generator_offset_last){
-      curandCall(curandSetGeneratorOffset( generator, *sd_random_generator_offset));
-    }
-    //#ifdef FLATNOISE
-    // this does not work yet:
-    //curandCall(curandGenerateUniformReal(generator, gaussian_d, num_of_rands, 0, sqrt(24.*temperature/time_step)));
-    //#else
-    sd_random_generator_offset[0]+=num_of_rands/2;
-    curandCall(curandGenerateNormalReal(generator, gaussian, num_of_rands, 0, sqrt(4*temperature/time_step)));
-    sd_random_generator_offset_last=sd_random_generator_offset[0];
-    //#endif
-    if (myInfo_h[2]){
-      sd_set_zero<<<64,192>>>(brownian_force_nf,3*N);
-      sd_compute_brownian_force_nearfield<<<numBlocks, numThreadsPerBlock>>>(r_d, gaussian_nf, N, L_d, radius,
-									     1./(6.*M_PI*viscosity*radius), brownian_force_nf);
-    }// end of near field
-    int size=N*3;
-    real alpha=1/sqrt(2.);
-    // in the case of FF_ONLY this computes directly the displacement
-    real E_cheby=sd_compute_brownian_force_farfield(mobility, gaussian_ff, ran_prec, brownian_force_ff);
-    //real bff_nrm;
-    //cublasCall(cublasRnrm2(cublas, size, brownian_force_ff, 1, &bff_nrm));
-    if (E_cheby > 10*ran_prec){
-      E_cheby=sd_compute_brownian_force_farfield(mobility, gaussian_ff, ran_prec, brownian_force_ff);
-      //cublasCall(cublasRnrm2(cublas, size, brownian_force_ff, 1, &bff_nrm));
-      if (warnings > 1) fprintf(stderr, "Recalculating the Chebyshev-polynome\n");
-    }
-    if ((E_cheby>100*ran_prec && warnings) || (E_cheby > 10*ran_prec &&  warnings > 1) ){
-      fprintf(stderr, "The error of the Chebyshev-approximation was %7.3f%%\n",E_cheby*100);
-      //printVectorDev(mobility.wavespace->sines,mobility.wavespace->max*mobility.ldd_short,"sines");
-      //printVectorDev(mobility.wavespace->matrices,mobility.wavespace->max*6,"matrices");
-    }
-    //printVectorDev(gaussian_ff,size, "gaussian");
-    //printVectorDev(brownian_force_ff, size, "brownian force");
-    //printVectorDev(mobility_d, 1, "mobility");
-    //printf("E_Cheby is %e   and bff is %e\n",E_cheby,bff_nrm);
-    //printVectorDev(brownian_force_ff,size,"FBff: ");
-    //printVectorDev(brownian_force_nf,size,"FBnf: ");
-    real * brownian_disp;
-#ifndef SD_FF_ONLY
-    cuda_safe_mem(cudaMalloc( (void**)&brownian_disp, size*sizeof(real) ));    assert(brownian_disp != NULL);
-    sd_set_zero<<<32,32>>>(brownian_disp,size);
-    const real one=1;
-    if (myInfo_h[2]){
-      cublasCall(cublasRaxpy(cublas, size, &one, brownian_force_nf, 1, brownian_force_ff, 1 ));
-    }
-    sd_iterative_solver(mobility, resistance, brownian_force_ff, brownian_disp, ran_prec,0, false);//(E_cheby/2+err_force/2)*1e-3
-#else
-    //real one=1, zero=0;
-    //cuda_safe_mem(cudaMalloc( (void**)&brownian_disp, size*sizeof(real) ));    assert(brownian_disp != NULL);
-    //cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &one, mobility_d, lda, brownian_force_ff, 1, &zero, brownian_disp, 1));
-    brownian_disp = brownian_force_ff;
-#endif
-    //real tmp;
-    //printVectorDev(brownian_disp, size, "brownian disp");
-    //printVectorDev(disp_d, size, "det disp");
-    //cublasCall(cublasRnrm2(cublas, size, brownian_disp, 1, &tmp));
-    //printf("Brownian disp is: %e   ",tmp);
-    //cublasCall(cublasRnrm2(cublas, size, disp_d, 1, &tmp));
-    //printf("Deterministic disp is: %e   \n",tmp);
-    
-    cublasCall(cublasRaxpy( cublas, size, &alpha, brownian_disp, 1, disp_d, 1 ));
-#ifndef SD_FF_ONLY
-    cuda_safe_mem(cudaFree((void*)brownian_disp));
-#endif
-    cuda_safe_mem(cudaFree((void*)brownian_force_ff));
-    if (myInfo_h[2]){
-      cuda_safe_mem(cudaFree((void*)brownian_force_nf));
-    }
-    cuda_safe_mem(cudaFree((void*)gaussian));
-  }// end of brownian motion
-  cudaCheckError("brownian motion error");
-  
-  // free everything
-#ifndef SD_FF_ONLY
-  if (use_buckets) {
-    cuda_safe_mem(cudaFree((void*)bucket_size));
-    cuda_safe_mem(cudaFree((void*)bucket_num));
-    cuda_safe_mem(cudaFree((void*)particle_count));
-    cuda_safe_mem(cudaFree((void*)particle_to_bucket_list));
-    cuda_safe_mem(cudaFree((void*)particle_sorted_list));
-  }
-#endif
-  cudaCheckError("in mobility");
-}
-
-/// this calls an iterative solver to solve the problem: 
-/// disp * (1+resistance*mobility) = mobility_d *  force_d 
-/// and returnes \param disp
-/// \param mobility and \param resistance are square matrizes with \param size <size> and ldd <((size+31)/32)*32>
-/// \param force and \param disp are vectors of \param size <size>
-real sd_iterative_solver(const matrix& mobility, const matrix& resistance, const real * force, real * disp, real rel_tol, real abs_tol, bool recalc)
-{
-  int size=mobility.size;
-  if (abs(abs_tol) > 1e-8){
-    if (warnings > 1) fprintf(stderr,"Solving for brownian forces.\n");
-  } else {
-    if (warnings > 1) fprintf(stderr,"Solving for interparticle forces.\n");
-  }
-#ifdef SD_DEBUG
-  assert(!hasAnyNanDev(mobility.data,mobility.size*mobility.ldd));
-  assert(!hasAnyNanDev(resistance.data,resistance.size*mobility.ldd));
-  assert(!hasAnyNanDev(force,size));
-  assert(!hasAnyNanDev(disp,size));
-#endif
-#ifdef SD_PC
-  static real * mat_a = NULL;
-  if (mat_a==NULL){
-    cuda_safe_mem(cudaMalloc( (void**)&mat_a, lda*size*sizeof(real) ));
-	recalc=true;
-  }       assert(mat_a != NULL);
-  if (recalc){
-    sd_set_zero_matrix<<<32,192>>>(mat_a,size,lda);
-  }
-#endif
-  real * mob_force=NULL;
-  cuda_safe_mem(cudaMalloc( (void**)&mob_force, size*sizeof(real) ));       assert(mob_force !=NULL);
-  real * result_checker=NULL;
-  cuda_safe_mem(cudaMalloc( (void**)&result_checker, size*sizeof(real) ));  assert(result_checker !=NULL);
-  // vars for cuBLAS calls
-  real alpha=1;
-#ifdef SD_PC
-  // mat_a = (1+resistance*mobility)
-  if (recalc){
-    assert(!resistance.is_sparse);
-    assert(!mobility.is_sparse);
-    assert(mobility.wavepart==NULL);
-    cublasCall(cublasRgemm(cublas,CUBLAS_OP_N,CUBLAS_OP_N, size , size ,size, &alpha, mobility.data, lda,resistance.data, lda, &beta,mat_a, lda));
-    sd_add_identity_matrix<<<64,192>>>(mat_a,size,lda);
-  }
-#endif
-  // mob_force = mobility * force
-  //cublasCall(cublasRgemv( cublas, CUBLAS_OP_N, size, size, &alpha, mobility.data, lda, force, 1, &beta, mob_force, 1));
-  sd_Rgemv(&alpha, mobility, force,mob_force);
-#ifdef SD_DEBUG
-  assert(!hasAnyNanDev(mob_force,size));
-#  ifdef SD_PC
-  assert(!hasAnyNanDev(mat_a,size*lda));
-#  endif
-#endif
-  int info;
-  real res;
-  //printVectorDev((real *)force,size,"Kraft");
-  //printMatrixDev((real *)mobility, lda, size, "mobility");
-  //printMatrixDev((real* )mat_a,lda,size,"A");
-  //printVectorDev(disp,6,"before");
-  static int last_solv_info=4;
-  if (last_solv_info>1){
-    sd_set_zero<<<16,192>>>(disp,size);
-  }
-  //info = sd_bicgstab_solver(cublas ,size, mat_a,lda, mob_force, rel_tol, abs_tol, 10*size+100, disp, &res);
-#ifdef SD_PC
-  info = sd_gmres_solver(cublas ,size, mat_a,                lda, mob_force, rel_tol, abs_tol, size, disp, &res);
-  //info = sd_bicgstab_solver(cublas ,size, mat_a,                lda, mob_force, rel_tol, abs_tol, size, disp, &res);
-#else
-  info = sd_gmres_solver(mobility, resistance, mob_force, rel_tol, abs_tol, size, disp, &res);
-  //info = sd_bicgstab_solver(cublas ,size, mobility, resistance, lda, mob_force, rel_tol, abs_tol, size, disp, &res);
-#endif
-  last_solv_info=info;
-  //printVectorDev(disp,6,"after");
-  // compary to expected result
-  //cuda_safe_mem(cudaMemcpy(mat_a, mat_a_bak, lda*size*sizeof(real),cudaMemcpyDeviceToDevice));
-  
-  if (info != 0){
-    if (info == 1){
-      if (warnings>1) fprintf(stderr, "Iterative solver did not fully converge ... the residuum was %6e\nWe will continue anyway ...\n",res);
-    }
-    else{ // info == 2 || info == 4
-      if (info == 1){
-	if (warnings>1) fprintf(stderr, "Iterative solver did not fully converge ... the residuum was %6e\nWe will continue anyway ...\n",res);
-      }
-      else if (info == 2){
-	if(warnings) fprintf(stdout, "Iterative solver failed ... the residuum was %6e\nWe will continue but the results may be problematic ...\n",res);
-      }
-    }
-    // dgetrs is not better - the contrary: results are worse ...
-    /*int ipiv[size];
-      magma_dgetrf_gpu( size, size,mat_a, lda, ipiv, &info);
-      assert(info==0);
-      magma_dgetrs_gpu('N', size, 1,
-      mat_a, lda, ipiv,
-      disp, size, &info);
-      assert(info==0);
-      // compary to expected result
-      cuda_safe_mem(cudaMemcpy(mat_a, mat_a_bak, lda*size*sizeof(real),cudaMemcpyDeviceToDevice));
-      cublasCall(cublasRgemv( cublas, CUBLAS_OP_N, size, size, &alpha, mat_a, lda, disp, 1, &beta, result_checker, 1));
-      alpha=-1;
-      cublasCall(cublasRaxpy( cublas, size, &alpha, mob_force, 1, result_checker, 1));
-      alpha=1;
-      cublasCall(cublasRdot( cublas, size, result_checker, 1, result_checker, 1,&res));
-      if (res > 1e-1){
-      fprintf(stderr, "All methods failed :(. The residuum from getrs was %e\n",res);
-      //cublasCall(cublasRgemv( cublas, CUBLAS_OP_N, size, size, &alpha, mat_a, lda, disp, 1, &beta, result_checker, 1));
-      //printVectorDev(mob_force, size, "mob_force");
-      //printVectorDev(result_checker, size, "result_checker");
-      //printVectorDev(disp, size, "disp");
-      //printMatrixDev((real *)mobility,lda,size,"mobility");
-      //printMatrixDev((real *)resistance,lda,size,"res");
-      //printMatrixDev((real *)mat_a,lda,size,"mat_a");
-      }*/
-    //magma_int_t magma_dgetrs_gpu( magma_trans_t trans, magma_int_t n, magma_int_t nrhs,
-    //				  double *dA, magma_int_t ldda, magma_int_t *ipiv,
-    //				  double *dB, magma_int_t lddb, magma_int_t *info);
-  }
-#ifdef SD_DEBUG
-  assert(!hasAnyNanDev(disp,size));
-#endif
-  //assert(info==0);
-  //cuda_safe_mem(cudaFree((void*)mat_a));
-  //cuda_safe_mem(cudaFree((void*)mat_a_bak));
-  cuda_safe_mem(cudaFree((void*)mob_force));
-  cuda_safe_mem(cudaFree((void*)result_checker));
-  return res;
-}
-
-// BICGSTAB-Solver
-// GMRes works better, therefore no support for BiCGStab
-// implimented as given in Numerik linearer Gleichungssysteme by Prof. Dr. Andreas Meister
-// this solves A*x=b
-// cublas a handle for cublas
-// size   the size n of the matrix
-// A      the given n*n matrix = A1*A2+1 (in)
-// A1     the two matrices
-// A2     
-// lda    the leading demension of A(1,2)
-// b      the given solution vector (in)
-// tol    requested tolerance of the solution
-// maxit  maximum number of iterations
-// x      the requested solution with an initial guess (in/out)
-// returns 0 on success, else error code
-/*#define sd_bicgstab_solver_exit()  cuda_safe_mem(cudaFree((void*)r0)); \
-  cuda_safe_mem(cudaFree((void*)r));cuda_safe_mem(cudaFree((void*)p));	\
-  cuda_safe_mem(cudaFree((void*)v));cuda_safe_mem(cudaFree((void*)t));	\
-  cuda_safe_mem(cudaFree((void*)tmp));					\
-  res[0] = normr;							\
-  if (warnings > 1) printf("BICSTAB used %d iterations.\n", iteration);	\
-  if (tolb*1.01 > normr)     { return 0;}				\
-  if (tolb*100 > normr)      { return 1;}				\
-  if (initnorm*1e10 > normr) { return 2;}				\
-  else                       { return 4;}
-#ifdef SD_PC
-int sd_bicgstab_solver(cublasHandle_t cublas ,int size, const real * A, int lda,
-		       real * b, real tol,real abs_tol, int maxit, real * x, real * res){
-#else
-int sd_bicgstab_solver(cublasHandle_t cublas ,int size, const real * A1, const real * A2, int lda,
-		       real * b, real tol,real abs_tol, int maxit, real * x, real * res){
-#endif
-  // vector malloc
-  real * r0=NULL;
-  cuda_safe_mem(cudaMalloc( (void**)&r0, size*sizeof(real) ));       assert(r0 != NULL);
-  real * r=NULL;
-  cuda_safe_mem(cudaMalloc( (void**)&r, size*sizeof(real) ));        assert(r != NULL);
-  real * p=NULL;
-  cuda_safe_mem(cudaMalloc( (void**)&p, size*sizeof(real) ));        assert(p != NULL);
-  real * v=NULL;
-  cuda_safe_mem(cudaMalloc( (void**)&v, size*sizeof(real) ));        assert(v != NULL);
-  real * t=NULL;
-  cuda_safe_mem(cudaMalloc( (void**)&t, size*sizeof(real) ));        assert(t != NULL);
-  real * tmp=NULL;
-  cuda_safe_mem(cudaMalloc( (void**)&tmp, 2*lda*sizeof(real) ));      assert(tmp != NULL);
-  //printMatrixDev(A,lda,size,"mat_a");
-  //printVectorDev(b,size,"force");
-#ifdef SD_PC
-  cublasCall(cublasRtrsv(cublas, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, size, A, lda, b, 1)); 
-#endif
-  // constants
-  real eps;
-  if (sizeof(real) == sizeof(double)){
-    eps = 1e-15;
-  } else {
-    eps = 1e-7;
-  }
-  eps = min(eps,tol*1e-2);
-  // other variables
-  real alpha=1;
-  real beta=0;
-  real tolb;
-  // compute the norm of b
-  real normb;
-  cublasCall(cublasRnrm2( cublas, size, b, 1, &normb));
-  // compute the tolerance we want to achieve
-  tolb=tol*normb+abs_tol;
-  // r0 = b-A*x
-  alpha = -1;
-#ifdef SD_PC
-  cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A, lda, x, 1, &beta, r0, 1));
-  cublasCall(cublasRtrsv(cublas, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, size, A, lda, r0, 1)); 
-#else
-  cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A2, lda, x, 1, &beta, tmp, 1));
-  alpha=1;
-  cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A1, lda, tmp, 1, &beta, r0, 1));
-  alpha=-1;
-  cublasCall(cublasRaxpy(cublas, size, &alpha, x, 1, r0, 1));
-#endif
-  alpha = 1;
-  cublasCall(cublasRaxpy(cublas, size, &alpha, b, 1, r0, 1));
-  // r = r0
-  cublasCall(cublasRcopy(cublas, size,r0,1,r, 1));
-  // rr0 = r*r0
-  real rr0;
-  cublasCall(cublasRdot( cublas, size, r0, 1, r0, 1, &rr0));
-  // p =r
-  cublasCall(cublasRcopy(cublas, size,r0,1,p, 1));
-  // normr=norm(r)
-  real normr=sqrt(rr0);
-  int iteration=0;
-  real lastnorm=normr;
-  real initnorm=normr;
-  // check for conversion or max iterations
-  while (iteration < maxit && normr >= tolb){
-    // v=A*p
-#ifdef SD_PC
-    cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A, lda, p, 1, &beta, v, 1));
-    cublasCall(cublasRtrsv(cublas, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, size, A, lda, v, 1)); 
-#else
-    cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A1, lda, p, 1, &beta, tmp, 1));
-    cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A1, lda, tmp, 1, &beta, v, 1));
-    cublasCall(cublasRaxpy(cublas, size, &alpha, p, 1, v, 1));
-#endif
-    // vr0 = v*r0
-    real vr0;
-    cublasCall(cublasRdot( cublas, size, v, 1, r0, 1, &vr0));
-    if (fabs(vr0) < eps || rr0 == 0){
-      if (fabs(vr0) < eps){
-	if (warnings > 1) fprintf(stderr, "BICGSTAB break-down.\n");
-      }else{
-	if (warnings > 1) fprintf(stderr, "BICGSTAB solution stagnates.\n");
-      }
-      sd_bicgstab_solver_exit();
-    }
-    // alpha = rr0/vr0
-    real myAlpha=rr0/vr0;
-    real minusMyAlpha = -myAlpha;
-    // s = r - alpha v
-    //cublasCall(cublasRcopy(cublas, size,r,1,s, 1));
-    cublasCall(cublasRaxpy(cublas, size, &minusMyAlpha, v, 1, r, 1)); //s->r
-    // t = A * s
-#ifdef SD_PC
-    cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A, lda, r, 1, &beta, t, 1));// s->r
-    cublasCall(cublasRtrsv(cublas, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, size, A, lda, t, 1)); 
-#else
-    cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A2, lda, r, 1, &beta, tmp, 1));// s->r
-    cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A1, lda, tmp, 1, &beta, t, 1));// s->r
-    cublasCall(cublasRaxpy(cublas, size, &alpha, r, 1, t, 1));
-#endif
-    // ts = s * t
-    real ts;
-    cublasCall(cublasRdot( cublas, size, t, 1, r, 1, &ts));// s->r
-    // tt = t * t
-    real tt;
-    cublasCall(cublasRdot( cublas, size, t, 1, t, 1, &tt));
-    if (abs(tt)<eps || ts == 0){
-      if (warnings > 1) fprintf(stderr, "Exit: abs(tt)<eps || ts == 0\n");
-      if (warnings > 1) fprintf(stderr, "BICGSTAB break-down.\n");
-      sd_bicgstab_solver_exit();
-    }
-    // omega = ts/tt
-    real myOmega=ts/tt;
-    // x = x + alpha p + omega s
-    cublasCall(cublasRaxpy(cublas, size, &myAlpha, p, 1, x, 1));
-    cublasCall(cublasRaxpy(cublas, size, &myOmega, r, 1, x, 1));
-    // copyback of s to r
-    // r = s - omega t
-    real minusMyOmega=-1*myOmega;
-    cublasCall(cublasRaxpy(cublas, size, &minusMyOmega, t, 1, r, 1));
-    //myOmega*=-1;
-    // r1r0 = r * r0
-    real r1r0;
-    cublasCall(cublasRdot( cublas, size, r, 1, r0, 1, &r1r0));
-    // beta = (alpha * r1r0 ) / (omega rr0)
-    real myBeta = (myAlpha*r1r0)/(myOmega*rr0);
-    if (abs(myBeta)>1/eps){
-      if (warnings > 1) fprintf(stderr,"Exit: abs(myBeta)<1/eps\n");
-      sd_bicgstab_solver_exit()
-    }
-    // p = r + beta ( p - omega v)= beta p + r - beta omega v
-    cublasCall(cublasRscal(cublas, size, &myBeta, p, 1));
-    cublasCall(cublasRaxpy(cublas, size, &alpha, r, 1, p, 1));
-    alpha=-myBeta*myOmega;
-    cublasCall(cublasRaxpy(cublas, size, &alpha, v, 1, p, 1));
-    alpha=1;
-    rr0=r1r0;
-    cublasCall(cublasRnrm2( cublas, size, r, 1, &normr));
-    if (lastnorm*sqrt(eps) > normr){ // restart
-      cublasCall(cublasRcopy(cublas, size,b,1,r, 1));
-      alpha=-1;beta=1; // r <-- r-A*x
-#ifdef SD_PC
-      cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A, lda, x, 1, &beta, r, 1));
-      cublasCall(cublasRtrsv(cublas, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, size, A, lda, r, 1)); 
-#else // TODO
-      alpha=1, beta=0;
-      cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A2, lda, x, 1, &beta, tmp+lda, 1));// s->r
-      cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A1, lda, tmp+lda, 1, &beta, tmp, 1));// s->r
-      cublasCall(cublasRaxpy(cublas, size, &alpha, x, 1, tmp, 1));
-      alpha=-1;
-      cublasCall(cublasRaxpy(cublas, size, &alpha, tmp, 1, r, 1));
-#endif
-      alpha= 1;beta=0;
-      cublasCall(cublasRdot( cublas, size, r, 1, r, 1, &rr0));
-      normr=sqrt(rr0);
-      lastnorm = normr;
-      // r = r0
-      cublasCall(cublasRcopy(cublas, size,r,1,r0, 1));
-      // p =r
-      cublasCall(cublasRcopy(cublas, size,r,1,p, 1));
-    }
-    if (iteration%50000 == 0){ // enable debugging by setting this to a lower value
-      real realnorm;
-      {// recalculate normr
-        alpha=-1;beta=0;
-#ifdef SD_PC
-	cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A, lda, x, 1, &beta, tmp, 1));
-	cublasCall(cublasRtrsv(cublas, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, size, A, lda, tmp, 1)); 
-#else
-	cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A2, lda, x, 1, &beta, tmp+lda, 1));// s->r
-	alpha=1;
-	cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, A1, lda, tmp+lda, 1, &beta, tmp, 1));// s->r
-	alpha=-1;
-	cublasCall(cublasRaxpy(cublas, size, &alpha, r, 1, tmp, 1));
-#endif
-	alpha=1;
-	cublasCall(cublasRaxpy(cublas, size, &alpha, b,1,tmp, 1));
-	alpha= 1;beta=0;
-	cublasCall(cublasRnrm2(cublas, size, tmp, 1, &realnorm));
-      }
-      fprintf(stderr,"  Iteration: %6d Residuum: %12f RealResiduum: %12f\n",iteration, normr, realnorm);
-    }
-    iteration++;
-    if (initnorm*1e10 < normr){ // somehow our solution explodes ...
-      if (warnings) fprintf(stderr, "BICGSTAB did not converge, residuum exploded. Aborting.\n");
-      sd_bicgstab_solver_exit();
-    }
-  }
-  res[0]=normr;
-  if (normr > tolb*1.01){
-    if (warnings) fprintf(stderr, "BICGSTAB solution did not converge after %d iterations. Error was %e1 %% to high.\n",iteration,(normr/tolb-1)*100);
-  }
-  //fprintf(stderr, "BICGSTAB solution did converge after %d iterations.\n",iteration);
-  sd_bicgstab_solver_exit();
-}
-*/
-
-// GMRes-Solver
-// implimented as given in Numerik linearer Gleichungssysteme by Prof. Dr. Andreas Meister
-// this solves A*x=b
-// cublas a handle for cublas
-// size   the size n of the matrix
-// A      the given n*n matrix (in)
-// lda    the leading demension of A
-// b      the given solution vector (in)
-// tol    requested tolerance of the solution
-// maxit  maximum number of iterations
-// x      the requested solution with an initial guess (in/out)
-// returns 0 on success, else error code
-#define sd_gmres_solver_exit()  cuda_safe_mem(cudaFree((void*)r0));	\
-  cuda_safe_mem(cudaFree((void*)v));cuda_safe_mem(cudaFree((void*)w));	\
-  cuda_safe_mem(cudaFree((void*)tmp)); res[0] = normr;			\
-  if (warnings > 1) printf("GMRes used %d iterations.\n", iteration);	\
-  if (tolb*1.01 >= normr)     { return 0;}				\
-  printf("tolb: %e, normr: %e, eps: %e \n",tolb, normr, eps);		\
-  if (tolb*100 > normr)       { return 1;}				\
-  if (initnorm*1e10 >= normr) { return 2;}				\
-  else                        { return 4;}
-#ifdef SD_PC
-int sd_gmres_solver(int size, const real * A,int lda, const real * b, real tol,real abs_tol, int maxit, real * x, real * res){
-#else
-int sd_gmres_solver(const matrix & A1, const matrix & A2 , const real * b, real tol,real abs_tol, int maxit, real * x, real * res){
-#endif
-  int size=A1.size;
-  int lda =A1.ldd;
-  const int m=60;
-  // vector malloc device
-  real * r0=NULL;
-  cuda_safe_mem(cudaMalloc( (void**)&r0,  size*sizeof(real) ));      assert(r0  != NULL);
-  real * v=NULL;
-  cuda_safe_mem(cudaMalloc( (void**)&v,   m*lda*sizeof(real) ));     assert(v   != NULL);
-  real * w=NULL;
-  cuda_safe_mem(cudaMalloc( (void**)&w,   size*sizeof(real) ));      assert(w   != NULL);
-  real * tmp=NULL;
-  cuda_safe_mem(cudaMalloc( (void**)&tmp, size*sizeof(real) ));      assert(tmp != NULL);
-  // malloc host
-  const real minusone=-1;
-  const real one=1;
-  real gamma[m+1];
-  real alpha[m];
-  real h[(m+1)*(m+1)];
-  real s[m];
-  real c[m];
-#ifdef SD_DEBUG
-  for (int i=0;i<m;i++){
-    s[i]=0;
-    c[i]=0;
-    alpha[i]=0;
-    gamma[i]=0;
-  }
-  gamma[m]=0;
-  for (int i=0;i<m*(m+1);i++){
-    h[i]=0;
-  }
-#endif
-#ifdef SD_PC
-  // we use left preconditioner, so we dont need to do triang-solv on exit
-  // on a random test upper reduced the conditional number more
-  cublasCall(cublasRtrsv(cublas, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, size, A, lda, b, 1)); 
-#endif
-  // constants
-  real eps;
-  if (sizeof(real) == sizeof(double)){
-    eps = 1e-15;
-  } else {
-    eps = 1e-7;
-  }
-  eps = min(eps,tol*1e-2);
-  // other variables
-  real tolb;
-  // compute the norm of b
-  real normb;
-  cublasCall(cublasRnrm2( cublas, size, b, 1, &normb));
-  // compute the tolerance we want to achieve
-  tolb=tol*normb+abs_tol;
-  // r0 = b-A*x
-#ifdef SD_PC
-  cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &minusone, A, lda, x, 1, &zero, r0, 1));
-  cublasCall(cublasRtrsv(cublas, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, size, A, lda, r0, 1)); 
-#else
-  sd_Rgemv(&minusone, A2, x,  tmp);
-  sd_Rgemv(&one,      A1, tmp, r0);
-  //cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &one,      A1, lda, tmp, 1, &zero, r0, 1));
-  cublasCall(cublasRaxpy(cublas, size, &minusone, x, 1, r0, 1));
-#endif
-  cublasCall(cublasRaxpy(cublas, size, &one, b, 1, r0, 1));
-  real normr;
-  cublasCall(cublasRnrm2(cublas, size, r0, 1, &normr));
-  real initnorm=normr;
-  if (warnings > 1) printf("inital normr: %e\n",normr);
-  int iteration=0;
-  if (normr==0){
-    sd_gmres_solver_exit();
-  }
-  if (normb==0){
-    sd_set_zero<<<192,192>>>(x,size);
-    normr=0;
-    sd_gmres_solver_exit();
-  }
-  do {
-    cublasCall(cublasRcopy(cublas, size, r0, 1, v  ,1));
-    gamma[0]=normr;
-    real igamma=1/normr;
-    cublasCall(cublasRscal(cublas, size, &igamma, v, 1));
-    int j;
-    for (j=0;j<m && normr > tolb ;j++){
-#ifdef SD_PC
-      cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &one, A, lda, v+lda*j, 1, &zero, w, 1));
-      cublasCall(cublasRtrsv(cublas, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, size, A, lda, w, 1)); 
-#else
-      sd_Rgemv(&one, A2, v+lda*j, tmp);
-      sd_Rgemv(&one, A1, tmp, w);
-      //cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &one, A2, lda, v+lda*j, 1, &zero, tmp, 1));
-      //cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &one, A1, lda, tmp, 1, &zero, w, 1));
-      cublasCall(cublasRaxpy(cublas, size, &one, v+lda*j, 1, w, 1));
-#endif
-      for (int i=0;i<j+1;i++){
-	cublasCall(cublasRdot(cublas, size, v+i*lda, 1, w, 1, h+j*(m+1)+i));
-	real tmp=-h[(m+1)*j+i];
-	cublasCall(cublasRaxpy(cublas, size, &tmp , v+lda*i, 1, w, 1));
-      }
-      cublasCall(cublasRnrm2(cublas, size, w, 1, h+j*(m+1)+j+1));
-      for (int i=0;i<j;i++){
-	real hij =h[j*(m+1)+i];
-	real hipj=h[j*(m+1)+i+1];
-	h[j*(m+1)+i]  = c[i]*hij +s[i]*hipj;
-	h[j*(m+1)+i+1]=-s[i]*hij +c[i]*hipj;
-      }
-      real hjj =h[j*(m+1)+j];
-      real hjjp=h[j*(m+1)+j+1];
-      real beta=sqrt(SQR(hjj)+SQR(hjjp));
-      s[j]     =hjjp/beta;
-      c[j]     =hjj /beta;
-      
-      
-      gamma[j+1]  =-s[j]*gamma[j];
-      gamma[j]    = c[j]*gamma[j];
-      
-      normr=abs(gamma[j+1]);
-      if (j+1<m){
-	// TODO: optimize: set w = v+j+1*lda
-	cublasCall(cublasRcopy(cublas, size, w, 1, v+(j+1)*lda, 1));
-	real tmp =1/h[j*(m+1)+j+1];
-	cublasCall(cublasRscal(cublas, size, &tmp, v+(j+1)*lda, 1));
-      }
-      h[j*(m+1)+j]=beta;
-      h[j*(m+1)+j+1]=0;
-    }
-    for (int i=j-1;i>=0;i--){
-      real tmp=gamma[i];
-      for (int k=i+1;k<j;k++){
-	tmp-=h[k*(m+1)+i]*alpha[k];
-      }
-      alpha[i]=tmp/(h[i*(m+1)+i]);
-    }
-    for (int i=0;i<j;i++){
-      cublasCall(cublasRaxpy(cublas, size, alpha+i, v+i*lda, 1, x, 1));
-    }
-    if (warnings > 1) printf("Iteration: %d, j was %d, residiuum: %e\n", iteration, j, normr );
-    iteration+=j;
-    {
-      real newnormr;
-      // r0 = b-A*x
-#ifdef SD_PC
-      cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &minusone, A, lda, x, 1, &zero, r0, 1));
-      cublasCall(cublasRtrsv(cublas, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, size, A, lda, r0, 1));
-#else
-      sd_Rgemv(&minusone, A2, x,  tmp);
-      sd_Rgemv(&one,      A1, tmp, r0);
-      //cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &minusone, A2, lda, x,  1, &zero, tmp, 1));
-      //cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &one,      A1, lda, tmp, 1, &zero, r0, 1));
-      cublasCall(cublasRaxpy(cublas, size, &minusone, x, 1, r0, 1));
-#endif
-      cublasCall(cublasRaxpy(cublas, size, &one, b, 1, r0, 1));
-      cublasCall(cublasRnrm2(cublas, size, r0, 1, &newnormr));
-      if ((newnormr/normr > 1.1 || newnormr/normr < 0.9) && newnormr > sqrt(eps)){
-        if (warnings) printf("Our norm changed strangely: Expected value: %e, real value: %e - eps %e- Bug in GMRes Solver?\n",normr,newnormr, eps);
-      }
-      normr=newnormr;
-    }
-  } while (iteration < maxit && normr > tolb);
-  sd_gmres_solver_exit();
-}
-
-// calculates the largest and snalles eigenvalue of the matrix
-// size        : size of the eigenvector / the matrix           (IN)
-// mobility_d  : handle of the mobility matrix (on the device)  (IN)
-// lambda_min  : smalles eigenvalue                            (OUT)
-// lambda_max  : largest eigenvalue                            (OUT)
-void calculate_maxmin_eigenvalues(const matrix & mobility, real * lambda_min, real * lambda_max, real tol){
-  int size=mobility.size;
-  int lda = ((size+31)/32)*32;
-  int maxit=max(500,size);
-  int IDO;
-  char BMAT='I'; // standard eigenvalue problem
-  char WHICH[]="LR"; // start with largest eigenvalue
-  int NEV = 1; // only one eigenvalue
-  // TUNING: these could be adjusted?
-  real TOL=tol;
-  if (sizeof(double) == sizeof(real)){
-    TOL=max(1e-12,TOL);
-  } else {
-    TOL=max(1e-6,TOL);
-  }
-  // TUNING: make some tests to find good value ...
-  int NCV=min(size, 6); // must be at least 3, but a bit bigger should be better ...
-  int LDV=lda;
-  int LWORKL=3*NCV*(NCV + 2);
-  int mode=1;
-  int IPARAM[11] = {1,0,maxit,1,0,0,mode,0,0,0,0};
-  int IPNTR[14];
-  int INFO=0;
-  real RESID[size];
-  real V[LDV*NCV];
-  real WORKD[3*size];
-  real WORKL[LWORKL];
-  real * vec_in_d;
-  real * vec_out_d;
-  cuda_safe_mem(cudaMalloc((void**)&vec_in_d , lda*sizeof(real)));
-  cuda_safe_mem(cudaMalloc((void**)&vec_out_d, lda*sizeof(real)));
-  for (int minmax=0;minmax<2;minmax++){
-    IDO=0;
-    if (minmax){
-      sprintf(WHICH,"SR");
-      INFO=0;
-      IPARAM[2]=maxit;
-      TOL=sqrt(tol);
-    }
-    while (IDO != 99){
-      //dnaupd_(&IDO,&BMAT,&N,WHICH,&NEV,&TOL,RESID.memptr(),&NCV,V.memptr(),&LDV,IPARAM,IPNTR,WORKD,WORKL,&LWORKL,&INFONAUP);
-      rnaupd(&IDO,&BMAT,&size, WHICH, &NEV, &TOL, RESID, &NCV, V, &LDV, IPARAM, IPNTR, WORKD, WORKL, &LWORKL, &INFO);
-      switch (IDO){
-      case 1:
-	cuda_safe_mem(cudaMemcpy(vec_in_d,WORKD+IPNTR[0]-1,size*sizeof(real),cudaMemcpyHostToDevice));
-	{
-	  const real one=1;
-	  //cublasCall(cublasRgemv( cublas, CUBLAS_OP_N, size, size, &alpha, mobility_d, lda, vec_in_d, 1, &zero, vec_out_d, 1));
-	  sd_Rgemv(&one, mobility, vec_in_d, vec_out_d);
-	}
-	cuda_safe_mem(cudaMemcpy(WORKD+IPNTR[1]-1,vec_out_d,size*sizeof(real),cudaMemcpyDeviceToHost));
-	break;
-      case -1:
-      case 2:
-      case 3:
-      case 4:
-	fprintf(stderr,"Error in %s l. %d: unexpected work from rnaupd: %d: Not Implemented!\n",__FILE__,__LINE__,IDO);
-	break;
-      case 99: //we are done
-	break;
-      default:
-	fprintf(stderr,"Error in %s l. %d: unexpected work from rnaupd: %d: Not Understood!\n",__FILE__,__LINE__,IDO);
-	break;
-      }
-    }
-    if (warnings > 1) fprintf(stderr,"calculating eigenvalue needed %d iterations and %d gemv operations (tolerance is %e, EW is %e).\n"
-	    ,IPARAM[2], IPARAM[8], TOL,WORKL[IPNTR[5]-1]);
-    if (INFO){
-      if (INFO == 1 && warnings > 1){
-	fprintf(stderr,"Maximum iterations taken to find Eigenvalues.\n");
-      } else if( INFO > 1 && warnings){
-	fprintf(stderr,"Unexpected return value in %s l. %d from rnaupd_: %d (debug info: %d %e)\n",__FILE__,__LINE__,INFO, minmax, V[0]);
-      }
-      /*
-	c  INFO    Integer.  (INPUT/OUTPUT)
-	c          If INFO .EQ. 0, a randomly initial residual vector is used.
-	c          If INFO .NE. 0, RESID contains the initial residual vector,
-	c                          possibly from a previous run.
-	c          Error flag on output.
-	c          =  0: Normal exit.
-	c          =  1: Maximum number of iterations taken.
-	c                All possible eigenvalues of OP has been found. IPARAM(5)
-	c                returns the number of wanted converged Ritz values.
-	c          =  2: No longer an informational error. Deprecated starting
-	c                with release 2 of ARPACK.
-	c          =  3: No shifts could be applied during a cycle of the
-	c                Implicitly restarted Arnoldi iteration. One possibility
-	c                is to increase the size of NCV relative to NEV.
-	c                See remark 4 below.
-	c          = -1: N must be positive.
-	c          = -2: NEV must be positive.
-	c          = -3: NCV-NEV >= 2 and less than or equal to N.
-	c          = -4: The maximum number of Arnoldi update iteration
-	c                must be greater than zero.
-	c          = -5: WHICH must be one of 'LM', 'SM', 'LR', 'SR', 'LI', 'SI'
-	c          = -6: BMAT must be one of 'I' or 'G'.
-	c          = -7: Length of private work array is not sufficient.
-	c          = -8: Error return from LAPACK eigenvalue calculation;
-	c          = -9: Starting vector is zero.
-	c          = -10: IPARAM(7) must be 1,2,3,4.
-	c          = -11: IPARAM(7) = 1 and BMAT = 'G' are incompatable.
-	c          = -12: IPARAM(1) must be equal to 0 or 1.
-	c          = -9999: Could not build an Arnoldi factorization.
-	c                   IPARAM(5) returns the size of the current Arnoldi
-	c                   factorization.
-      */
-    }
-    if (WORKL[IPNTR[5]-1]<0 && TOL > 1e-3){
-      minmax--;
-      TOL=min(tol,1e-4);
-      tol=min(tol*tol, 1e-8);
-      INFO=1;
-      IPARAM[2]=maxit*100;
-    }
-    if (minmax){ // make them a bit larger/smaller to be sure that we are in the interval of interrest ...
-      *lambda_min=WORKL[IPNTR[5]-1]*(1+TOL);
-    } else {
-      *lambda_max=WORKL[IPNTR[5]-1]*(1-TOL);
-    }
-  }
-  /* FORTRAN Comments of the dnaupd
-     c          IPNTR(6): pointer to the real part of the ritz value array     
-     c                    RITZR in WORKL.                                          
-     c          IPNTR(7): pointer to the imaginary part of the ritz value array    
-     c                    RITZI in WORKL.                                          
-     c          IPNTR(8): pointer to the Ritz estimates in array WORKL associated
-     c                    with the Ritz values located in RITZR and RITZI in WORK
-  */
-  if (warnings > 1) {
-    printf("Range of Farfield-Mobility: [%e, %e]\n", *lambda_min, *lambda_max);
-  }
-  cuda_safe_mem(cudaFree((void*)vec_in_d));
-  cuda_safe_mem(cudaFree((void*)vec_out_d));
-}
-
-/// \brief caclulate the chebyshev coefficents for the squareroot of the matrix
-/// It needs the largest and smalles eigenvalue which have to be computed before.
-/// Using chebyshev-gausquadrature \link https://en.wikipedia.org/wiki/Chebyshev%E2%80%93Gauss_quadrature \endlink.
-/// 
-/// \param lambda_min   : the lower boundery
-/// \param lambda_max   : the upper boundery of the interval
-/// \param tol          : the given tollerance which should be achieved
-/// \param coefficents  : the pointer where the data will be stored
-real calculate_chebyshev_coefficents(real lambda_min, real lambda_max, real tol,real ** coefficents){
-  const int steps=1024*128; // with 1024 it should fit in L1
-  unsigned int N=1024; // guess the number of coefficents we need, if more are needed -> realocate
-  if (*coefficents==NULL){
-    *coefficents = (real *)Utils::malloc(N*sizeof(real));
-  } else {
-    *coefficents = (real *)Utils::realloc(*coefficents,N*sizeof(real));
-  }
-  real * current_polynome=NULL;
-  real * last_polynome=NULL;
-  last_polynome    = (real *) Utils::malloc(steps * sizeof(real));
-  current_polynome = (real *) Utils::malloc(steps * sizeof(real));
-  real x[steps];
-  real weight_and_func[steps];
-  real lambda_m=lambda_max-lambda_min;
-  real lambda_p=lambda_max+lambda_min;
-  //fprintf(stderr,"lambda_min: %e lambda_max: %e  ", lambda_min, lambda_max);
-  //fprintf(stderr,"lambda_minus: %e lambda_plusminus: %e\n", lambda_m, lambda_pm);
-  // init
-  real fac=2./steps;
-  //fprintf(stderr,"fac: %e\n", fac);
-  double ai=0;
-  double a1=0;
-  for (int i=0;i<steps;i++){
-    last_polynome[i]=1; //T0(x)=1
-    //real x = -1.+(i*2.+1.)/steps;
-    x[i]=cos((2.*i+1)/2./steps*M_PI);
-    current_polynome[i]= x[i];  //T1(x)=x
-    #ifdef SD_FF_ONLY
-    weight_and_func[i]=fac*sqrt(x[i]*lambda_m/2.+lambda_p/2.);
-    #else
-    weight_and_func[i]=fac*1./sqrt(x[i]*lambda_m/2.+lambda_p/2.);// /sqrt(1-x*x);// this could be big, but should not be inf
-    #endif
-    ai+=weight_and_func[i];
-    a1+=weight_and_func[i]*x[i];
-  }
-  real error;
-  int loop=0;
-  //fprintf(stderr,"%s l. %d: a[%d]: %e\n",__FILE__,__LINE__,0,ai);
-  //fprintf(stderr,"%s l. %d: a[%d]: %e\n",__FILE__,__LINE__,1,a1);
-  (*coefficents)[loop]=ai/2.;
-  loop++;
-  (*coefficents)[loop]=a1;
-  //double sumfacmax=lambda_max;
-  //double totalsum=abs(ai)+abs(a1)*sumfacmax;
-  double totalsum=abs(ai)+abs(a1);
-  //sumfacmax*=lambda_max;
-  const int miniloop=10;
-  do{
-    error=0;
-    do {
-      std::swap(last_polynome,current_polynome);
-      //{	real * tmp       = last_polynome; last_polynome    = current_polynome; current_polynome = tmp; }
-      //printf("addresses: 0x%08x  0x%08x ",last_polynome,current_polynome);
-      ai=0;
-      for (int i=0;i<steps;i++){
-	//real x = -1.+(i*2.+1.)/steps;
-	current_polynome[i]=-1.*current_polynome[i]+((real)2)*x[i]*last_polynome[i];
-	//printf ("%e %e %e\n", x, weight_and_func[i], current_polynome[i]);
-	ai+=current_polynome[i]*weight_and_func[i];
-      }
-      //fprintf(stderr,"%s l. %d: a[%d]: %e\n",__FILE__,__LINE__,loop,ai);
-      //printf("\n\n");
-      //sumfacmax*=lambda_max;
-      error+=abs(ai);//*sumfacmax;
-      totalsum+=abs(ai);//*sumfacmax;
-      loop++;
-      (*coefficents)[loop]=ai;
-    } while (loop%miniloop);
-    if (loop+miniloop > N){
-      N*=2;
-      *coefficents=(real *)Utils::realloc(*coefficents,N*sizeof(real));
-    }
-  } while ((error > tol*totalsum/10 || loop < 20 ) && loop < sqrt(steps));
-  if (loop >=steps/10 -1 ){
-    fprintf(stderr,"to few steps to get sufficent results in %s l. %d\n",__FILE__,__LINE__);
-  }
-  error=0;
-  while (error < tol*totalsum){ // approximate error
-    loop--;
-    error+=abs((*coefficents)[loop]);//*sumfacmax;
-    //sumfacmax/=lambda_max;
-  }
-  //fprintf(stderr,"sum: %e   error: %e",totalsum,error);
-  loop++;
-  free(last_polynome);
-  free(current_polynome);
-  if (warnings > 1) fprintf(stderr,"chebyshev needs %d gemv\n",loop);
-  return loop;
-}
-/// \brief Compute the brownian force resulting from the farfield
-/// To compute the farfield contribution of the brownian motion the
-/// squareroot of the inverse of the mobility is required.
-/// Instead of calculating it directly, its action on the vector
-/// containing the randomnumbers is computed via the chebyshev-polynomials.
-/// In the case of FF_ONLY this computes directly the displacement!
-real sd_compute_brownian_force_farfield(const matrix & mobility, const real * gaussian_ff,
-					real tol, real * brownian_force_ff ){
-
-  cudaCheckError("Unknown Error");
-  int size=mobility.size;
-  static real * cheby_coefficents=NULL;
-  static int N_chebyshev;
-  static bool recalc_ew = true;
-  static real lambda_min, lambda_max;
-  static int count_recalc=0;
-  static int count_total=0;
-  count_total++;
-  cudaCheckError("Unknown Error");
-  if (recalc_ew){
-    count_recalc++;
-    calculate_maxmin_eigenvalues(mobility,&lambda_min, &lambda_max, tol);
-    //printf("lambda min: %e, lambda_max: %e",lambda_min, lambda_max);
-    N_chebyshev = calculate_chebyshev_coefficents(lambda_min, lambda_max,tol*tol,&cheby_coefficents);
-    //printf("\nWe need %d iteration for Chebyshev.\n", N_chebyshev);
-    recalc_ew=false;
-    if (warnings>1){printf("recalc ratio: %6f\n",(((double)count_recalc)/((double)count_total)));}
-  }
-  if (lambda_min < 0){
-    std::cerr << "Mobility has negative eigenvalues!\n" << std::endl;
-    std::cout << "Mobility has negative eigenvalues!\n" << std::endl;
-    if (mobility.wavespace != NULL){
-      //printMatrixDev(mobility);
-    }
-    if (warnings > 2){
-      //printMatrixDev(mobility);
-    }
-    //printMatrixDev(mobility.data,mobility.ldd,mobility.size,"Mobility has negative eigenvalues!\n");
-    errexit();
-  }
-  real * chebyshev_vec_curr, * chebyshev_vec_last, * chebyshev_vec_next;
-  sd_set_zero<<<64,192>>>(brownian_force_ff,size);
-  cudaThreadSynchronize();
-  cudaCheckError("set zero");
-  chebyshev_vec_curr=NULL;
-  cuda_safe_mem(cudaMalloc( (void**)&chebyshev_vec_curr, size*sizeof(real) ));    assert(chebyshev_vec_curr != NULL);
-#ifdef SD_FF_ONLY
-  const real one  =1;
-  real gMg;
-  //cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &one, mobility_d, lda, gaussian_ff, 1, &zero,  chebyshev_vec_curr, 1));
-  sd_Rgemv(&one, mobility, gaussian_ff, chebyshev_vec_curr);
-  cublasCall(cublasRdot(cublas, size, chebyshev_vec_curr, 1, gaussian_ff, 1, &gMg));
-#else
-  real gaussian_ff_norm;
-  cublasCall(cublasRnrm2(cublas, size, gaussian_ff, 1, &gaussian_ff_norm));
-#endif
-  cublasCall(cublasRcopy( cublas, size, gaussian_ff, 1, chebyshev_vec_curr, 1));
-  cublasCall(cublasRaxpy( cublas, size, cheby_coefficents+0, chebyshev_vec_curr, 1, brownian_force_ff, 1 ));
-  //chebyshev_vec_last=chebyshev_vec_curr;
-  chebyshev_vec_last=NULL;
-  cuda_safe_mem(cudaMalloc( (void**)&chebyshev_vec_last, size*sizeof(real) ));    assert(chebyshev_vec_last != NULL);
-  chebyshev_vec_next=NULL;
-  cuda_safe_mem(cudaMalloc( (void**)&chebyshev_vec_next, size*sizeof(real) ));    assert(chebyshev_vec_next != NULL);
-  //sd_set_zero<<<64,192>>>(chebyshev_vec_????,size);
-  real lambda_minus=lambda_max-lambda_min;
-  real alpha=2./lambda_minus;
-  sd_Rgemv(&alpha, mobility,gaussian_ff, chebyshev_vec_next);
-  //cublasCall(cublasRgemv( cublas, CUBLAS_OP_N, size, size, &alpha, mobility_d, lda, gaussian_ff, 1, &zero, chebyshev_vec_next , 1));
-  alpha=-(lambda_min+lambda_max)/lambda_minus;
-  cublasCall(cublasRaxpy( cublas, size, &alpha, chebyshev_vec_curr, 1, chebyshev_vec_next, 1 ));
-  std::swap(chebyshev_vec_curr,chebyshev_vec_next);
-  std::swap(chebyshev_vec_last,chebyshev_vec_next);
-  cublasCall(cublasRaxpy( cublas, size, cheby_coefficents+1, chebyshev_vec_curr, 1, brownian_force_ff, 1 ));
-  for (int i=2;i<=N_chebyshev;i++){
-    alpha=4./lambda_minus;
-    sd_Rgemv(&alpha, mobility, chebyshev_vec_curr, chebyshev_vec_next);
-    //cublasCall(cublasRgemv( cublas, CUBLAS_OP_N, size, size, &alpha, mobility_d, lda, chebyshev_vec_curr, 1, &zero, chebyshev_vec_next , 1));
-    alpha=-2*(lambda_min+lambda_max)/lambda_minus;
-    cublasCall(cublasRaxpy( cublas, size, &alpha, chebyshev_vec_curr, 1, chebyshev_vec_next, 1 ));
-    alpha=-1;
-    cublasCall(cublasRaxpy( cublas, size, &alpha, chebyshev_vec_last, 1, chebyshev_vec_next, 1 ));
-    std::swap(chebyshev_vec_curr,chebyshev_vec_next);
-    std::swap(chebyshev_vec_last,chebyshev_vec_next);
-    cublasCall(cublasRaxpy( cublas, size, cheby_coefficents+i, chebyshev_vec_curr, 1, brownian_force_ff, 1 ));
-  }
-#ifdef SD_DEBUG
-  assert(isSymmetricDev(mobility.data,mobility.ldd,mobility.size));
-#endif
-  // errorcheck of chebyshev polynomial
-#ifdef SD_FF_ONLY
-  real dispdisp;
-  cublasCall(cublasRdot(cublas, size, brownian_force_ff, 1,brownian_force_ff, 1, &dispdisp));
-  real E_cheby = sqrt(abs(dispdisp-gMg)/gMg);
-#else
-  real zMz;
-  alpha = 1;
-  sd_Rgemv(&alpha, mobility, brownian_force_ff, chebyshev_vec_last);
-  //cublasCall(cublasRgemv(cublas, CUBLAS_OP_N, size, size, &alpha, mobility_d, lda, brownian_force_ff, 1, &zero,  chebyshev_vec_last, 1));
-  cublasCall(cublasRdot(cublas, size, chebyshev_vec_last, 1, brownian_force_ff, 1, &zMz));
-  real E_cheby = sqrt(abs(zMz-gaussian_ff_norm*gaussian_ff_norm))/gaussian_ff_norm;
-#endif
-  if (E_cheby > tol){
-    recalc_ew=true;
-  }
-  cuda_safe_mem(cudaFree((void*)chebyshev_vec_last));
-  cuda_safe_mem(cudaFree((void*)chebyshev_vec_curr));
-  cuda_safe_mem(cudaFree((void*)chebyshev_vec_next));
-  return E_cheby;
-}
-
-void sd_Rgemv(const real * factor, const matrix & A, const real * in, real * out){
-  A.assert_all();
-  if(A.is_sparse){
-    int threads=A.size;
-    const int tpB = numThreadsPerBlock;
-    int blocks = ((threads+tpB-1)/tpB);
-    sd_multiply_sparse_matrix_vector<<<blocks, tpB>>>(A.size, *factor, A.data, A.ldd, A.ldd_short, A.col_idx, A.row_l, in, out );
-    cudaThreadSynchronize(); 
-    real tmp;
-    cublasCall(cublasRnrm2(cublas, A.size, out,1, &tmp));
-    cudaCheckError("Sparse Matrix Vector Multiplication");
-  }
-  else{
-    assert(A.ldd);
-    const real zero=0;
-    #ifdef SD_USE_FLOAT
-    cublasCall(cublasSgemv(cublas, CUBLAS_OP_N, A.size, A.size, factor, A.data, A.ldd, in,  1, &zero, out, 1));
-    #else
-    cublasCall(cublasDgemv(cublas, CUBLAS_OP_N, A.size, A.size, factor, A.data, A.ldd, in,  1, &zero, out, 1));
-    #endif
-  }
-  if(A.wavespace != NULL){
-    /*real insize;
-    cublasCall(cublasRnrm2(cublas, A.size, in, 1, &insize));
-    real before;
-    cublasCall(cublasRnrm2(cublas, A.size, out, 1, &before));*/
-    wavepart  & wave=* A.wavespace;
-    int buf_size = A.wavespace->max*3;
-    real * sin_sum=NULL;
-    cuda_safe_mem(cudaMalloc( (void**)& sin_sum, buf_size*sizeof(real)));
-    real max;
-    sd_nrm1<<<1,numThreadsPerBlock>>>(A.size,in,sin_sum);
-    cuda_safe_mem(cudaMemcpy(&max, sin_sum, sizeof(real), cudaMemcpyDeviceToHost));
-    sd_set_zero<<<64,192>>>(sin_sum,buf_size);
-    //fprintf(stderr,"buf_size is %d\n",buf_size);
-    real * cos_sum=NULL;
-    cuda_safe_mem(cudaMalloc( (void**)& cos_sum, buf_size*sizeof(real)));
-    sd_set_zero<<<64,192>>>(cos_sum,buf_size);
-    cudaThreadSynchronize();
-    cudaCheckError("");
-    // thread number has to be a power of two!
-    //sd_wavepart_sum<<<wave.num,64,64*3*sizeof(real)>>>(in, wave.vecs, wave.num, wave.matrices, wave.sines, wave.cosines, \
-    //fprintf(stderr,"calling sd_wavepart_sum<<<%d,%d,%d>>>\n",wave.num,8,8*3*sizeof(real));
-    //                         ,------ has to be power of two and at least 8 (and there might be still a bug if larger than 32)
-    //                        \|/
-    //                         v   
-    sd_wavepart_sum<<<wave.num,32,32*3*sizeof(real)>>>(in, wave.vecs, wave.num, wave.matrices, wave.sines, wave.cosines, \
-						       A.ldd_short, A.size/3, sin_sum, cos_sum, max);
-    cudaThreadSynchronize();
-    cudaCheckError("");
-    int tpB = 32;
-    int blocks = (A.ldd_short)/tpB;
-    sd_wavepart_assemble<<<blocks,tpB,tpB*3*sizeof(real)>>>(wave.num, wave.sines, wave.cosines, sin_sum, cos_sum,
-    							    A.ldd_short, out, max, A.size/3, *factor);
-    //sd_wavepart_assemble_block<<<A.size/3,tpB,tpB*3*sizeof(real)>>>(wave.num, wave.sines, wave.cosines, sin_sum, cos_sum,
-    //							              A.ldd_short, out, max, A.size/3, *factor);
-    cudaThreadSynchronize();
-    cudaCheckError("");
-    /*#ifdef SD_DEBUG
-    if (A.dense){
-      printVectorDev(sin_sum,buf_size,"insin_sum");
-      printVectorDev(cos_sum,buf_size,"incos_sum");
-      real * correct;
-      cuda_safe_mem(cudaMalloc( (void**)&correct,    A.ldd*sizeof(real) ));
-      const real zero=0;
-      #ifdef SD_USE_FLOAT
-      cublasCall(cublasSgemv(cublas, CUBLAS_OP_N, A.size, A.size, factor, A.dense, A.ldd, in,  1, &zero, correct, 1));
-      #else
-      cublasCall(cublasDgemv(cublas, CUBLAS_OP_N, A.size, A.size, factor, A.dense, A.ldd, in,  1, &zero, correct, 1));
-      #endif
-      const real minusone = -1;
-      cublasCall(cublasRaxpy(cublas, A.size, &minusone, out, 1 ,correct, 1));
-      real erg;
-      cublasCall(cublasRnrm2(cublas, A.size, correct, 1 , &erg));
-      if (erg < 1e-3){
-	std::cout << "s";
-	printVectorDev(in     ,A.size,"\nin, worked  ");
-	printVectorDev(correct,A.size,"correct     ");
-	printVectorDev(out    ,A.size,"out         ");
-      } else {
-	A.print();
-	cublasCall(cublasRaxpy(cublas, A.size, &minusone, out, 1 ,correct, 1));
-	printVectorDev(in     ,A.size,"in, failed  ");
-	printVectorDev(correct,A.size,"correct     ");
-	printVectorDev(out    ,A.size,"out         ");
-	
-      }
-      cuda_safe_mem(cudaFree((void*) correct));
-    }
-    if (warnings > 20){
-      printVectorDev(sin_sum,buf_size,"sin_sum");
-      printVectorDev(cos_sum,buf_size,"cos_sum");
-    }
-    #endif*/
-    cuda_safe_mem(cudaFree((void*) sin_sum));
-    cuda_safe_mem(cudaFree((void*) cos_sum));
-  }
-}
-
-
-void sd_compute_mobility_matrix_wave_cpu(int kmax, real kmax2, matrix & mobility, real a,const real * L_h, real xi, real selfmobility, real ew_prec){
-#ifdef SD_DEBUG
-  //kmax=1;
-  //kmax2=kmax*kmax;
-#endif
-  cudaCheckError("");
-  real fak=selfmobility;
-  for(int k=0;k<3;k++){
-    fak/=L_h[k];
-  }
-  //std::cerr <<fak<<std::endl;
-  int cki[3];
-  real k[3];
-  real ki[3];
-  real xi2=1/xi/xi;
-  real kvol=1;
-  for (int i =0 ; i < 3; i++){
-    ki[i]=2*M_PI/L_h[i];
-    kvol*=ki[i];
-  }
-  // count how many wave vectors we need
-  int vc = 0;
-  for (cki[0]=-kmax;cki[0]<kmax+1;cki[0]+=1){
-    for (cki[1]=-kmax;cki[1]<kmax+1;cki[1]+=1){
-      for (cki[2]=-kmax;cki[2]<kmax+1;cki[2]+=1){
-	int cki2=cki[0]*cki[0]+ cki[1]*cki[1] + cki[2]*cki[2];
-	if (cki2 <= kmax2 && cki2 > 0 ){
-	  vc++;
-	}
-      }
-    }
-  }
-  if (!vc){ // nothing to do
-    return;
-  }
-  if (mobility.wavespace == NULL){
-    mobility.wavespace = new wavepart(mobility.ldd_short);
-    //mobility.wavespace = (wavepart *) Utils::malloc(sizeof(wavepart));
-    //mobility.wavespace->wavepart();
-  }
-  mobility.wavespace->num      = vc;
-  int max = vc;
-  if (max !=  mobility.wavespace->num){
-    mobility.wavespace->num=max;
-  }
-  assert(max);
-  if (max > mobility.wavespace->max){
-    mobility.wavespace->max = ((max+31)/32)*32;
-    max=mobility.wavespace->max;
-    //fprintf(stderr,"there are %d wavespace contribution taking %d places",vc, max);
-    cuda_safe_mem(cudaMalloc( (void**)& (mobility.wavespace->vecs)     , max*3*sizeof(real) ));
-    cuda_safe_mem(cudaMalloc( (void**)& (mobility.wavespace->matrices) , max*6*sizeof(real) ));
-    cuda_safe_mem(cudaMalloc( (void**)& (mobility.wavespace->sines)    , max*mobility.ldd_short*sizeof(real) ));
-    cuda_safe_mem(cudaMalloc( (void**)& (mobility.wavespace->cosines)  , max*mobility.ldd_short*sizeof(real) ));
-    cudaCheckError("");
-    sd_set_zero<<<640,64>>>(mobility.wavespace->sines   , max*mobility.ldd_short);
-    sd_set_zero<<<640,64>>>(mobility.wavespace->cosines , max*mobility.ldd_short);
-    cudaCheckError("");
-  }
-  int ldd = mobility.wavespace->max;
-  real vecs_h[3 * ldd];
-  real matrices_h[6 * ldd];
-  vc=0;
-  for (cki[0]=-kmax;cki[0]<kmax+1;cki[0]+=1){
-    k[0]=cki[0]*ki[0];
-    for (cki[1]=-kmax;cki[1]<kmax+1;cki[1]+=1){
-      k[1]=cki[1]*ki[1];
-      for (cki[2]=-kmax;cki[2]<kmax+1;cki[2]+=1){
-	int cki2=cki[0]*cki[0]+ cki[1]*cki[1] + cki[2]*cki[2];
-	if (cki2 <= kmax2 && cki2 > 0){
-	  k[2]=cki[2]*ki[2];
-	  real k2=k[0]*k[0]+ k[1]*k[1] + k[2]*k[2];
-	  for (int i = 0;i<3;i++){
-	    vecs_h[vc*3+i]=k[i];
-	  }
-	  real scal = (a-1./3.*a*a*a*k2)*(1+0.25*xi2*k2+0.125*xi2*xi2*k2*k2)*6.*M_PI/k2*exp(-0.25*xi2*k2)*fak;
-	  //if (cki[0]==-kmax){
-	  //  printf("scal is %e, exp is %e\n",scal/fak,exp(-0.25*xi2*k2));
-	  //}
-	  //if (scal/kvol > 1e-6*ew_prec){
-	    for (int i = 0 ; i < 3 ; i++ ){
-	      matrices_h[vc*6+i] = (1-(k[i]*k[i]/k2))*scal;
-	    }
-	    matrices_h[vc*6+3] = (-k[0]*k[1]/k2)*scal;
-	    matrices_h[vc*6+4] = (-k[0]*k[2]/k2)*scal;
-	    matrices_h[vc*6+5] = (-k[1]*k[2]/k2)*scal;
-	    /*if (scal/fak > 1e-5){
-	      printf("scal is %e, k is %e",scal/fak,sqrt(k2));
-	      printf("\tk=[%4.0f, %4.0f, %4.0f]  ",k[0]*L_h[0]/2/M_PI,k[1]*L_h[1]/2/M_PI,k[2]*L_h[2]/2/M_PI);
-	      printf("  m=[%12.4e, %12.4e, %12.4e,%12.4e, %12.4e, %12.4e]  \n",matrices_h[vc*6],matrices_h[vc*6+1],matrices_h[vc*6+2],matrices_h[vc*6+3],matrices_h[vc*6+4],matrices_h[vc*6+5]);
-	      }*/
-	    vc++;
-	    //}
-	}
-      }
-    }
-  }
-  mobility.wavespace->num      = vc;
-  {
-    static int printed=0;
-    if ((printed%reprint)==1){
-      printf("\nwe need %d k vectors\n",vc);
-    }
-    printed++;
-  }
-  cudaCheckError("");
-  //fprintf(stderr,"h: 0x%x d: 0x%x, size: %d",vecs_h,mobility.wavespace->vecs,  ldd*3*sizeof(real));
-  cuda_safe_mem(cudaMemcpy(mobility.wavespace->vecs,    vecs_h,    ldd*3*sizeof(real),cudaMemcpyHostToDevice));
-  cuda_safe_mem(cudaMemcpy(mobility.wavespace->matrices,matrices_h,ldd*6*sizeof(real),cudaMemcpyHostToDevice));
-  //mobility.wavespace->hash_vecs();
-}
-
-void _cuda_check_error(char *file, unsigned int line){
-  cudaError_t err=cudaGetLastError();
-  if (err != cudaSuccess) {
-    fprintf(stderr, "Error found during memory operation. Possibly however from an failed operation before. %s:%u.\n", file, line);
-    printf("CUDA error: %s\n", cudaGetErrorString(err));
-    if ( err == cudaErrorInvalidValue )
-      fprintf(stderr, "You may have tried to allocate zero memory before %s:%u.\n", file, line);
-    exit(EXIT_FAILURE);
-  }
-}
-
-
-
-int sd_find_max(int * data, int length){
-  int tlength=length;
-  int * tdata=data;
-  if (tlength >= 512){
-    int bs=32;
-    if (tlength > 4*1024){
-      bs=128;
-    } else if (tlength > 1024){
-      bs=64;
-    }
-    tlength = (length+bs-1)/bs;
-    cuda_safe_mem(cudaMalloc((void **) &tdata,tlength*sizeof(int)));
-    sd_nrm_inf<<<tlength,bs,bs*sizeof(int)>>>(length,data,tdata);
-  }
-  //if (length < 64){ // copy to host and compare
-  int tmp[tlength];
-  cuda_safe_mem(cudaMemcpy(tmp,tdata,tlength*sizeof(int),cudaMemcpyDeviceToHost));
-  int result=tmp[0];
-  for (int i=1;i<tlength;i++){
-    if (tmp[i] > result){
-      result=tmp[i];
-    }
-  }
-  if (length>=512) {
-    cuda_safe_mem(cudaFree((void*)tdata));
-  }
-  return result;
-}
-
-
-
-#endif
diff --git a/src/core/integrate_sd_cuda.hpp b/src/core/integrate_sd_cuda.hpp
deleted file mode 100644
index 115ce5437e8..00000000000
--- a/src/core/integrate_sd_cuda.hpp
+++ /dev/null
@@ -1,251 +0,0 @@
-/*
-  Copyright (C) 2010,2012,2013,2014,2015,2016 The ESPResSo project
-  Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010 
-    Max-Planck-Institute for Polymer Research, Theory Group
-  
-  This file is part of ESPResSo.
-  
-  ESPResSo is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-  
-  ESPResSo is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-  
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>. 
-*/
-
-#ifndef __INTEGRATE_SD_CUDA_HPP
-#define __INTEGRATE_SD_CUDA_HPP
-
-#include "integrate_sd.hpp"
-
-#ifdef SD
-#ifdef CUDA
-#include <cublas_v2.h>
-
-void errexit();
-
-#ifndef SD_USE_FLOAT
-typedef double real;
-#define cublasRgemm(...)                cublasDgemm( __VA_ARGS__)
-#define cublasRdot(...)                 cublasDdot( __VA_ARGS__)
-//#define cublasRgemv(...)                cublasDgemv( __VA_ARGS__) // always use sd_Rgemv
-#define cublasRcopy(...)                cublasDcopy( __VA_ARGS__)
-#define cublasRaxpy(...)                cublasDaxpy( __VA_ARGS__)
-#define cublasRscal(...)                cublasDscal( __VA_ARGS__)
-//#define cublasRnrm2(...)              cublasDnrm2( __VA_ARGS__)
-#define cublasRnrm2(a1, a2, a3, a4, a5) cublasDdot(a1,a2, a3, a4, a3, a4, a5));*(a5)=sqrt(*(a5)
-#define cublasRtrsv(...)                cublasDtrsv( __VA_ARGS__)
-#define curandGenerateNormalReal(...)   curandGenerateNormalDouble(__VA_ARGS__)
-#define __real2int_rd(...)              __double2int_rd(__VA_ARGS__)
-#define __real2int_ru(...)              __double2int_ru(__VA_ARGS__)
-#define rnaupd(...)                     dnaupd_(__VA_ARGS__)
-#else // use float
-typedef float real;
-#define cublasRgemm(...)                cublasSgemm( __VA_ARGS__)
-#define cublasRdot(...)                 cublasSdot( __VA_ARGS__)
-//#define cublasRgemv(...)                cublasSgemv( __VA_ARGS__) // always use sd_Rgemv
-#define cublasRcopy(...)                cublasScopy( __VA_ARGS__)
-#define cublasRaxpy(...)                cublasSaxpy( __VA_ARGS__)
-#define cublasRscal(...)                cublasSscal( __VA_ARGS__)
-//#define cublasRnrm2(...)              cublasSnrm2( __VA_ARGS__)
-#define cublasRnrm2(a1, a2, a3, a4, a5) cublasSdot(a1,a2, a3, a4, a3, a4, a5));*(a5)=sqrt(*(a5)
-#define cublasRtrsv(...)                cublasStrsv( __VA_ARGS__)
-#define curandGenerateNormalReal(...)   curandGenerateNormal(__VA_ARGS__)
-#define __real2int_rd(...)              __float2int_rd(__VA_ARGS__)
-#define __real2int_ru(...)              __float2int_ru(__VA_ARGS__)
-#define rnaupd(...)                     snaupd_(__VA_ARGS__)
-#endif //#ifndef SD_USE_FLOAT
-
-#include "cuda_utils.hpp"
-#include "integrate_sd.hpp"
-#include "integrate_sd_cuda.hpp"
-#include "integrate_sd_matrix.hpp"
-
-extern cublasHandle_t cublas;
-
-extern double temperature; ///< this is defined in \file thermostat.cpp
-
-const int numThreadsPerBlock = 32; ///< gives the number of threads per cuda block. should be a multiple of 32 and much smaller then the numbers of particles
-#define numThreadsPerBlock_is_power_of_two TRUE
-
-#ifndef SD_FF_ONLY
-const int bucket_size_factor = 3;  //TODO: find optimal parameter for given implementation
-#endif
-
-void _cudaCheckError(const char *msg, const char * file, const int line);
-#define cudaCheckError(msg)  cudaThreadSynchronize();_cudaCheckError((msg),__FILE__,__LINE__)
-
-#define myindex(i,j) ((i)*(lda)+(j))
-
-#define SQR(x) (x)*(x)
-
-#define cublasCall(call) { cublasStatus_t stat=(call);			\
-    if (stat != CUBLAS_STATUS_SUCCESS) {				\
-      std::cerr << "CUBLAS failed in " << __FILE__			\
-		<< " l. " << __LINE__ <<"\n\t"  ;			\
-      if (stat == CUBLAS_STATUS_NOT_INITIALIZED){			\
-	std::cerr << "the CUDA Runtime initialization failed.\n";	\
-      } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) {			\
-	std::cerr << "the resources could not be allocated\n";		\
-      } else {								\
-	std::cerr << "unknown error\n";					\
-      }									\
-      errexit();							\
-    }									\
-  }
-#define curandCall(call) { curandStatus_t stat =(call);			\
-    if (stat != CURAND_STATUS_SUCCESS) {				\
-      std::cerr << "CURAND failed in " << __FILE__			\
-		<< " l. " << __LINE__ <<"\n\t"  ;			\
-      if (stat == CURAND_STATUS_NOT_INITIALIZED){			\
-	std::cerr << "the CUDA Runtime initialization failed.\n";	\
-      } else if (stat == CURAND_STATUS_VERSION_MISMATCH) {		\
-	std::cerr << "Header file and linked library version do not match.\n"; \
-      } else if (stat == CURAND_STATUS_ALLOCATION_FAILED) {		\
-	std::cerr << "Memory allocation failed.\n";			\
-      } else if (stat == CURAND_STATUS_TYPE_ERROR) {			\
-	std::cerr << "Generator is wrong type.\n";			\
-      } else if (stat == CURAND_STATUS_OUT_OF_RANGE) {			\
-	std::cerr << "the argument was out of range.\n";		\
-      } else if (stat == CURAND_STATUS_LENGTH_NOT_MULTIPLE) {		\
-	std::cerr << "Length requested is not a multple of dimension.\n"; \
-      } else if (stat == CURAND_STATUS_DOUBLE_PRECISION_REQUIRED) {	\
-	std::cerr << "GPU does not have double precision required by MRG32k3a.\n"; \
-      } else if (stat == CURAND_STATUS_LAUNCH_FAILURE) {		\
-	std::cerr << "Kernel launch failure.\n";			\
-      } else if (stat == CURAND_STATUS_PREEXISTING_FAILURE) {		\
-	std::cerr << "Preexisting failure on library entry.\n";		\
-      } else if (stat == CURAND_STATUS_INITIALIZATION_FAILED) {		\
-	std::cerr << "Initialization of CUDA failed.\n";		\
-      } else if (stat == CURAND_STATUS_ARCH_MISMATCH) {			\
-	std::cerr << "Architecture mismatch, GPU does not support requested feature.\n"; \
-      } else if (stat == CURAND_STATUS_INTERNAL_ERROR) {		\
-	std::cerr << "Internal library error.\n";\
-      } else {								\
-	fprintf(stderr,"unknown error (error number: %d)\n",stat);	\
-      }									\
-      errexit();							\
-    }									\
-  }
-
-// headers for ARPACK-NG: http://forge.scilab.org/index.php/p/arpack-ng/
-extern "C"
-{
-  void dnaupd_(int* IDO, char* BMAT, int* N, char WHICH[], int* NEV, double* TOL, double RESID[], int* NCV, double V[], int* LDV, int IPARAM[],
-	       int IPNTR[], double WORKD[], double WORKL[], int* LWORKL, int* INFO);
-  void snaupd_(int* IDO, char* BMAT, int* N, char WHICH[], int* NEV, float* TOL, float RESID[], int* NCV, float V[], int* LDV, int IPARAM[],
-	       int IPNTR[], float WORKD[], float WORKL[], int* LWORKL, int* INFO);
-}
-
-#define SD_RESISTANCE_CORRECT
-
-/* ************************************* *
- * *******   private functions   ******* *
- * ************************************* */
-void sd_compute_displacement(const real * r_d, int N, real viscosity, real radius, const real * L_d, const real * L_h,
-			     const real * total_mobility_d, const real * force_d, real * disp_d, int * myInfo);
-
-
-// this solves iteratively using CG
-// disp * (1+resistance*mobility) = mobility_d *  force_d 
-// and returnes disp
-// mobility and resistance are square matrizes with size <size> and lda <((size+31)/32)*32>
-// force and disp are vectors of size <size>
-// recalc whether to mobility has changed since the last call or not
-real sd_iterative_solver(const matrix & mobility, const matrix & resistance, const real * force, real * disp, real rel_err, real abs_err, bool recalc);
-
-// description below
-// in the case of FF_ONLY this computes directly the displacement
-real sd_compute_brownian_force_farfield(const matrix & mobility, const real * gaussian_ff,
-					real tol, real * brownian_force_ff );
-
-
-/// This computes the k-vectors and the wavespace mobility matrices as defined be Beenakker 1986
-void sd_compute_mobility_matrix_wave_cpu(int kmax, real kmax2, matrix & mobility, real a, const real * L_h, real xi, real selfmobility, real ew_prec);
-
-
-
-
-void _cuda_check_error(char *file, unsigned int line);
-#define cuda_check_error(); _cuda_check_error(__FILE__,__LINE__);
-// BICGSTAB-Solver
-// implimented as given in `Numerik linearer Gleichungssysteme` by Prof. Dr. Andreas Meister
-// additional a preconditioner can be used (compile-time-switch). This overwrites the soluten vector b
-// this solves A*x=b
-// cublas a handle for cublas
-// size   the size n of the matrix
-// A      the given n*n matrix (in)
-// lda    the leading demension of A
-// b      the given solution vector (in)
-// tol    requested tolerance of the solution
-// maxit  maximum number of iterations
-// x      the requested solution with an initial guess (in/out)
-// returns 0 on success, else error code
-/*#ifdef SD_PC
-int sd_bicgstab_solver(cublasHandle_t cublas ,int size, const real * A,                   int lda,
-		       real * b, real tol,real abs_tol,  int maxit, real * x, real * res);
-#else
-int sd_bicgstab_solver(cublasHandle_t cublas ,int size, const real * A1, const real * A2, int lda,
-		       real * b, real tol,real abs_tol,  int maxit, real * x, real * res);
-		       #endif*/
-
-/// GMRes-Solver
-// implimented as given in Numerik linearer Gleichungssysteme by Prof. Dr. Andreas Meister
-// additional a preconditioner can be used (compile-time-switch). This overwrites the soluten vector b
-// this solves A*x=b
-// cublas a handle for cublas
-// size   the size n of the matrix
-// A      the given n*n matrix (in) (to use with preconditioning)
-// A1,A2  the matrizes that give A=A1*A2+1 (to use without preconditioning)
-// lda    the leading demension of A
-// b      the given solution vector (in)
-// tol    requested tolerance of the solution
-// maxit  maximum number of iterations
-// x      the requested solution with an initial guess (in/out)
-// returns 0 on success, else error code
-#ifdef SD_PC
-int sd_gmres_solver(int size, const real * A, int lda,
-		    real * b, real tol,real abs_tol, int maxit, real * x, real * res);
-#else
-int sd_gmres_solver(const matrix & A1, const matrix & A2,
-		    const real * b, real tol,real abs_tol, int maxit, real * x, real * res);
-#endif
-
-
-// calculates the largest and snalles eigenvalue of the matrix
-// mobility    : the mobility matrix (data on the device)      (IN)
-// lambda_min  : smalles eigenvalue                            (OUT)
-// lambda_max  : largest eigenvalue                            (OUT)
-void calculate_maxmin_eigenvalues(const matrix & mobility, real * lamba_min, real * lambda_max, real tol);
-
-/// matrix vector multiplication
-/// this solves \[ out = factor * A * in \]
-/// where A is a matrix (see struct matrix)
-/// in and out are vectors
-/// factor is a scalar
-void sd_Rgemv(const real * factor,const matrix & A,const real * in, real * out);
-
-
-// compute the chebyshev coeffeicents which are needed to achieve a given accuracy.
-// lambda_min   : the lower boundery
-// lambda_max   : the upper boundery of the interval
-// tol          : the given tollerance which should be achieved
-// coefficents  : the pointer where the data will be stored
-real calculate_chebyshev_coefficents(real lambda_min, real lambda_max, real tol,real ** coefficents);
-
-// find the maximal value of an array on the GPU
-// data         : the array
-// length       : number of elements
-// return value : the maximal value
-int sd_find_max(int * data, int length);
-
-#endif /* SD */
-#endif /* CUDA */
-
-#endif
diff --git a/src/core/integrate_sd_cuda_debug.cu b/src/core/integrate_sd_cuda_debug.cu
deleted file mode 100644
index 678633130ae..00000000000
--- a/src/core/integrate_sd_cuda_debug.cu
+++ /dev/null
@@ -1,337 +0,0 @@
-/*
-  Copyright (C) 2010,2012,2013,2014,2015,2016 The ESPResSo project
-  Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010 
-    Max-Planck-Institute for Polymer Research, Theory Group
-  
-  This file is part of ESPResSo.
-  
-  ESPResSo is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-  
-  ESPResSo is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-  
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>. 
-*/
-
-#include "config.hpp"
-
-#include <iostream>
-#include <iomanip>
-#include <stdio.h>
-#include <cublas_v2.h>
-#include <curand.h>
-
-#include "integrate_sd_cuda.hpp"
-#include "integrate_sd_cuda_debug.hpp"
-#include "integrate_sd_cuda_kernel.hpp"
-//#include "errorhandling.hpp"
-#include "cuda_utils.hpp"
-
-#if defined(OMPI_MPI_H) || defined(_MPI_H)
-#error CU-file includes mpi.h! This should not happen!
-#endif
-
-
-/* *************************************************************************************************************** *
- * ********************************************    DEBUGING & TESTs   ******************************************** *
- * *************************************************************************************************************** */
-#ifdef SD
- 
-int sd_test(int size, int type){
-  assert(size%3 == 0);
-  int lda=((size+31)/32)*32;
-  matrix mat;
-  mat.ldd=lda;
-  mat.size=size;
-  real * x1=NULL, * x2=NULL, * b =NULL;
-  cuda_safe_mem(cudaMalloc((void**) &mat.data   ,size*lda*sizeof(real)));
-  cuda_safe_mem(cudaMalloc((void**) &mat.col_idx,size*lda*sizeof(int)));
-  cuda_safe_mem(cudaMalloc((void**) &mat.row_l  ,size*sizeof(int)));
-  cuda_safe_mem(cudaMalloc((void**) &x1         ,size*sizeof(real)));
-  cuda_safe_mem(cudaMalloc((void**) &x2         ,size*sizeof(real)));
-  cuda_safe_mem(cudaMalloc((void**) &b          ,size*sizeof(real))); 
-  sd_set_zero_matrix<<<32,32>>>(mat.data,mat.size,mat.ldd);
-  sd_set_zero<<<32,32>>>(b,size);
-  for (int i=0;i<size/3;i++){
-    sd_set_value<<<32,32>>>(mat.col_idx+i*mat.ldd,lda,i);
-  }
-  sd_set_value<<<32,32>>>(mat.row_l,mat.ldd, mat.size/3);
-  //cublasHandle_t cublas=NULL; // use global
-  cublasCall(cublasCreate(&cublas));
-  curandGenerator_t generator = NULL;
-  curandCall(curandCreateGenerator(&generator, CURAND_RNG_PSEUDO_DEFAULT));
-  curandCall(curandSetPseudoRandomGeneratorSeed(generator, (unsigned long long)('E'+'S'+'P'+'R'+'e'+'s'+'S'+'o')));
-  curandCall(curandSetGeneratorOrdering( generator, CURAND_ORDERING_PSEUDO_BEST));
-  curandCall(curandSetGeneratorOffset( generator, 0));
-  curandCall(curandGenerateNormalReal( generator, b,  size, 0, sqrt(2.)));
-  curandCall(curandGenerateNormalReal( generator, x1, size, 0, sqrt(2.)));
-  curandCall(curandGenerateNormalReal( generator, x2, size, 0, sqrt(2.)));
-  if (type==0){
-    ;
-  }
-  else{
-    curandCall(curandGenerateNormalReal( generator, mat.data, mat.size*mat.ldd, 0, sqrt(2.)));
-  }
-  mat.is_sparse=false;
-  const real one=1;
-  const real minusone=-1;
-  sd_Rgemv(&one, mat, b,x1);
-  mat.is_sparse=true;
-  sd_Rgemv(&one, mat, b,x2);
-  real tmp;
-  cublasCall(cublasRnrm2(cublas, size, x1, 1, &tmp));
-  std::cout << "x1 norm: " << tmp;
-  cublasCall(cublasRnrm2(cublas, size, x2, 1, &tmp));
-  std::cout << "   x2 norm: " << tmp;
-  cublasCall(cublasRaxpy(cublas, size, &minusone, x1,1,x2,1));
-  cublasCall(cublasRnrm2(cublas, size, x2, 1, &tmp));
-  std::cout << "   diff norm: " << tmp;
-  return 0;
-}
-
-
-void printVectorDev( const real * data, const int m, const char * msg){
-  std::cout << msg;
-  printVectorDev(data,m);
-}
-void printVectorDev( const real * data, const int m){
-  real host[m];
-  assert(m>0);
-  cuda_safe_mem(cudaMemcpy( host, data, m*sizeof(*data), cudaMemcpyDeviceToHost ));
-  int count = 1;
-  char komma[2]={0,0};
-  std::cout << " = [";
-  for (int j=0; j < m;j++){
-    bool print=true;
-    //std::cout << ",\t" << std::setw(20) << std::setprecision(15) << host[j];
-    if (j < m-1){
-      if (host[j]==host[j+1]){
-	count++;
-	print=false;
-      }
-    }
-    if (print){
-      if (count > 3){
-	printf("%s\t%12.4e <repeats %d times>",komma,host[j],count);
-	komma[0]=',';
-	count = 1;
-      } else {
-	for (; count > 0 ; count--){
-	  printf("%s\t%12.4e",komma,host[j]);
-	  komma[0]=',';
-	}
-	count = 1;
-      }
-    }
-  }
-  std::cout << "];\n";
-}
-
-
-void printVectorHost( const real * host, const int m, const char * msg){
-  std::cout << msg;
-  printVectorHost(host,m);
-}
-
-void printVectorHost( const real * host, const int m){
-  for (int j=0; j < m;j++){
-    std::cout << ",\t" << std::setw(20) << std::setprecision(15) << host[j];
-  }
-  std::cout << "\n";
-}
-
-void printPosDev( real * data, int m, const char * msg){
-  std::cout << msg;
-  real host[m*3];
-  cuda_safe_mem(cudaMemcpy( host, data,3*m*sizeof(*data), cudaMemcpyDeviceToHost ));
-  for (int l=0; l<m;l++){
-    printf("%5d : [ %e, %e, %e]\n",l,host[3*l],host[3*l+1],host[3*l+2]);
-  }
-}
-
-/*void _cudaCheckError(const char *msg, const char * file, const int line)
-{
-  cudaError_t err = cudaGetLastError();
-  if( cudaSuccess != err)
-    {
-      std::cerr <<  "Cuda error:" <<  msg << ": " <<  cudaGetErrorString( err) << " in "<<file << "l. "<<line<<"\n";
-      errexit();
-    }
-    }*/
-
- 
-// check whether there was any cuda error so far.
-// do not use this function directly but use the macro cudaCheckError(const char *msg);
-// which requires only the first paramter
-// PARAMTERS:
-// msg   : the message which should be printed in case of an error
-// file  : the file in which the function is called
-// line  : the line in which the function is called
-void _cudaCheckError(const char *msg, const char * file, const int line)
-{
-  cudaError_t err = cudaGetLastError();
-  if( cudaSuccess != err)
-    {
-      std::cerr <<  "Cuda error:" <<  msg << ": '" <<  cudaGetErrorString( err) << "' in "<<file << " l. "<<line<<"\n";
-      errexit();
-    }
-}
-
-
-
-myTimer::myTimer(unsigned int _numCounters):numCounters(_numCounters),counters(NULL){
-  counters=(unsigned long long *)Utils::malloc(sizeof(unsigned long long)*numCounters);
-  assert(counters!=NULL);
-  for (unsigned int i=0;i<numCounters;i++){
-    counters[i]=0;
-  }
-  clock_gettime(CLOCK_MONOTONIC_RAW,&last);
-}
-
-myTimer::~myTimer(){
-  free(counters);
-}
-
-void myTimer::add(unsigned int _counter){
-  struct timespec current;
-  clock_gettime(CLOCK_MONOTONIC_RAW,&current);
-  // use usec
-  counters[_counter]+=(current.tv_sec-last.tv_sec)*1e6+(current.tv_nsec-last.tv_nsec+500)*1e-3;
-  clock_gettime(CLOCK_MONOTONIC_RAW,&last);
-}
-
-void myTimer::print(unsigned int _counter,const char * msg){
-  std::cout << msg << " took "<< counters[_counter]*1e-3<<" ms\n";
-}
-
-void myTimer::printAll(const char *msg){
-  std::cout << "\n" << msg;
-  for (unsigned int i=0;i<numCounters;i++){
-    std::cout << "\t" << counters[i]*1e-3;
-  }
-  std::cout << "\n";
-}
-
-
-
-real getAbsMax(const real * data, int length){
-  real max=0;
-  for (int i=0;i<length;i++){
-    if (abs(data[i])>max){
-      max=abs(data[i]);
-    }
-  }
-  return max;
-}
-
-
-real getAbsMaxDev(const real * data_d, int length){
-  real host[length];
-  cuda_safe_mem(cudaMemcpy( host, data_d, length*sizeof(*data_d), cudaMemcpyDeviceToHost ));
-  return getAbsMax(host,length);
-}
-
-
-int countMinusDev(const real * data_d, int length){
-  real host[length];
-  cuda_safe_mem(cudaMemcpy( host, data_d, length*sizeof(*data_d), cudaMemcpyDeviceToHost ));
-  int c=0;
-  for (int i=0;i<length;i++){
-    if (host[i]<0)
-      c++;
-  }
-  return c;
-}
-
-bool hasAnyNanDev(const real * data, int length){
-  real host[length];
-  cuda_safe_mem(cudaMemcpy( host, data, length*sizeof(*data), cudaMemcpyDeviceToHost ));
-  for (int c=0;c<length;c++){
-    if (isnan(host[c])){
-      return true;
-    }
-  }
-  return false;
-}
-
-bool isSymmetricDev(const matrix & mat){
-  return isSymmetricDev(mat.data,mat.ldd,mat.size);
-}
-
-bool  isSymmetricDev(const real * data, int lda, int size){
-  #ifdef SD_DEBUG
-  real host[lda*size];
-  cuda_safe_mem(cudaMemcpy( host, data, size*lda*sizeof(real), cudaMemcpyDeviceToHost ));
-  for (int i=0;i<size;i++){
-    for (int j=0;j<size;j++){
-      if (fabs(host[i*lda+j]-host[j*lda+i])/(fabs(host[i*lda+j])+fabs(host[j*lda+i])) > 1e-8){
-	fprintf(stderr,"not symmetric: elements (%d,%d) and (%d,%d)\n",i,j,j,i);
-	return false;
-      }
-    }
-  }
-  #endif
-  return true;
-}
-
-
-
-void printMatrixDev( const matrix & mat){
-  /*if (mat.wavespace){
-    printf("wavespace:: max entries: %d, actual entries: %d\n",mat.wavespace->max,mat.wavespace->num);
-    printVectorDev(mat.wavespace->matrices,mat.wavespace->max*6, "matrices");
-    printVectorDev(mat.wavespace->vecs    ,mat.wavespace->max*3, "vecs");
-  }*/
-  mat.printWavespace();
-  printMatrixDev(mat.data,mat.ldd,mat.size);
-}
-
-
-void  matrix::printWavespace() const{
-  if (wavespace){
-    wavespace->print(size/3,ldd_short);
-  }else {
-    printf("No wavespace available!\n");
-  }
-}
-void matrix::print() const{
-  printMatrixDev(*this);
-}
-
-void wavepart::print(int N, int ldd) const{
-  printVectorDev(sines,max*ldd);
-  printVectorDev(cosines,max*ldd);
-  assert(max > 0);
-  real vec_h[max*3];
-  real mat_h[max*6];
-  real sin_h[max*ldd];
-  real cos_h[max*ldd];
-  //real box_l[]={8,8,8};
-  extern double box_l[3];
-  cuda_safe_mem(cudaMemcpy( vec_h, vecs,     max*3*sizeof(real),   cudaMemcpyDeviceToHost ));
-  cuda_safe_mem(cudaMemcpy( mat_h, matrices, max*6*sizeof(real),   cudaMemcpyDeviceToHost ));
-  cuda_safe_mem(cudaMemcpy( sin_h, sines,    max*ldd*sizeof(real), cudaMemcpyDeviceToHost ));
-  cuda_safe_mem(cudaMemcpy( cos_h, cosines,  max*ldd*sizeof(real), cudaMemcpyDeviceToHost ));
-  printf("printing wavespace:\n");
-  for (int i=0; i<num; i++){
-    printf("\tk=[%4.0f, %4.0f, %4.0f]  ",vec_h[i*3]*box_l[0]/2/M_PI,vec_h[i*3+1]*box_l[1]/2/M_PI,vec_h[i*3+2]*box_l[2]/2/M_PI);
-    printf("  m=[%12.4e, %12.4e, %12.4e,%12.4e, %12.4e, %12.4e]  ",mat_h[i*6],mat_h[i*6+1],mat_h[i*6+2],mat_h[i*6+3],mat_h[i*6+4],mat_h[i*6+5]);
-    printf("\tsines:  \t");
-    for (int p=0;p<N;p++){
-      printf("%12.4e,  ",sin_h[p+ldd*i]);
-    }
-    printf("\tcosines:\t");
-    for (int p=0;p<N;p++){
-      printf("%12.4e,  ",cos_h[p+ldd*i]);
-    }
-    printf("\n");
-  }
-}
-
-#endif /* SD */
diff --git a/src/core/integrate_sd_cuda_debug.hpp b/src/core/integrate_sd_cuda_debug.hpp
deleted file mode 100644
index 8fbbb2039ff..00000000000
--- a/src/core/integrate_sd_cuda_debug.hpp
+++ /dev/null
@@ -1,143 +0,0 @@
-#ifndef __helper_functions_hpp__
-#define __helper_functions_hpp__
-
-#include "config.hpp"
-
-#include <sys/time.h>
-#include <assert.h>
-#include <iostream>
-#include <iomanip> 
-
-#include "integrate_sd_cuda.hpp"
-
-#ifdef SD
-
-// template implementation below
-template<typename T> void printMatrixDev( T * data, int m, int n, const char * msg);
-template<typename T> void printMatrixDev( T * data, int m, int n);
-void printMatrixDev( const matrix& mat);
-template<typename T> void printMatrixHost( T * data, int m, int n, const char * msg);
-
-template<typename T> void printDistances( T * r_h, T, int N, T min, T max);
-template<typename T> void printDistances( T * r_h, T, int N);
-
-void printPosDev( real * data, int m, const char * msg);
-void printVectorDev( const real * data, const int m);
-void printVectorDev( const real * data, const int m, const char * msg);
-//void printVectorDev( float * data, int m);
-void printVectorHost( const real * data, const int m);
-void printVectorHost( const real * data, const int m, const char * msg);
-//void cudaCheckError(const char *msg);
-bool hasAnyNanDev(const real * data, int length);
-bool isSymmetricDev(const matrix & mat);
-bool isSymmetricDev(const real * data, int lda, int size);
-
-
-real getAbsMax(const real * data, int length);
-real getAbsMaxDev(const real * data_d, int length);
-int countMinusDev(const real * data_d, int length);
-
-class myTimer {
-public:
-  // init numCounters counter
-  myTimer(unsigned int _numCounters);
-  ~myTimer();
-  // add the time since the last call to counter _counter
-  void add(unsigned int _counter);
-  void print(unsigned int _counter,const char * msg);
-  void printAll(const char * msg);
-private:
-  myTimer(const myTimer &t); // this far not implemented - do not copy by accident
-  unsigned int numCounters;
-  unsigned long long int * counters;
-  struct timespec last;
-};
-
-
-/***********************************
- ***   Template Implementation   ***
- **********************************/
-template<typename T>
-T mymin(T a, T b){
-  return a>b?b:a;
-}
-
-template<typename T>
-void printMatrixHost( T * host, int n, int m, const char * msg){
-  std::cout << msg;
-  int maxn,maxm;
-  if (false){
-    maxm=mymin(1000,m);
-    maxn=mymin(1000,n);
-  } else {
-    int sym = mymin(m,n);
-    maxm=mymin(1000,sym);
-    maxn=mymin(1000,sym);
-  }
-  if ( m<=0 || n<=0){
-    return;
-  }
-  std::cout << "\nA=[";
-  for (int j=0; j < maxm;j++){
-    std::cout <<"[";
-    std::cout << std::setw(12)<< host[n*j];
-    for (int i =1; i<maxn;i++){
-      std::cout << " , " <<std::setw(12) << std::setprecision(4)<< host[n*j+i];
-      //printf(",\t%20.15e",host[j]);
-    }
-    //std::cerr << j <<" (" <<n<<","<<m<<") " << std::endl;
-    if (j==n-1){
-      std::cout << "]";
-    } else {
-      std::cout << "]\n   ";
-    }
-  }
-  std::cout << "];\neig(A)\n";
-}
-template<typename T>
-void printMatrixDev( T * data, int n, int m){
-  T host[m*n];
-  if ( m<=0 || n<=0){
-    return;
-  }
-  else{
-    cudaMemcpy( host, data, m*n*sizeof(*data), cudaMemcpyDeviceToHost );
-    printMatrixHost(host,n,m,"");
-  }
-}
-
-
-template<typename T>
-void printMatrixDev( T * data, int m, int n, const char * msg){
-  std::cout << msg;
-  printMatrixDev(data,m,n);
-}
-
-
-template<typename T> void printDistances( T * r_h, T L, int N, T min, T max){
-#ifndef DIM
-  const int DIM=3;
-#endif
-  for (int i = 0;i<N;i++){
-    for (int j=0; j < i;j++){
-      T dr2=0;
-      for (int k=0;k<DIM; k++){
-	T tmp=r_h[DIM*i+k]-r_h[DIM*j+k];
-	tmp-=L*rint(tmp/L);
-	dr2+=tmp*tmp;
-      }
-      T drn=sqrt(dr2);
-      if (min < drn && drn < max){
-	std::cout << drn << "\n";
-      }
-    }
-  }
-}
-template<typename T> void printDistances( T * r_h, T L, int N){
-  printDistances(r_h, L, N,0,L);
-}
-
-
-#endif /* SD */
-
-#endif
diff --git a/src/core/integrate_sd_cuda_device.cu b/src/core/integrate_sd_cuda_device.cu
deleted file mode 100644
index e11366bd9f7..00000000000
--- a/src/core/integrate_sd_cuda_device.cu
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
-  Copyright (C) 2010,2012,2013,2014,2015,2016 The ESPResSo project
-  Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010 
-    Max-Planck-Institute for Polymer Research, Theory Group
-  
-  This file is part of ESPResSo.
-  
-  ESPResSo is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-  
-  ESPResSo is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-  
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>. 
-*/
-
-#include "integrate_sd_cuda.hpp"
-#include "integrate_sd_cuda_device.hpp"
-
-#if defined(OMPI_MPI_H) || defined(_MPI_H)
-#error CU-file includes mpi.h! This should not happen!
-#endif
-
-#ifdef SD
-
-/* *************************************************************************************************************** *
- * ********************************************    DEVICE-Functions   ******************************************** *
- * *************************************************************************************************************** */
-
-/// atomic add for doubles
-/// address  : pointer to which the value should be added
-/// inc      : value which should be added
-__device__ double atomicAdd(double * address, double inc){
-
-  ull *addressUll = (ull*) address;
-  ull oldValue=*addressUll;
-  ull assumedValue;
-  assert(!isnan(inc));
-  do {
-    assumedValue=oldValue;
-    ull newValue = __double_as_longlong (__longlong_as_double(assumedValue)+inc);
-    oldValue = atomicCAS(addressUll,assumedValue,newValue);
-  }
-  while (oldValue != assumedValue);
-  return __longlong_as_double(oldValue);
-}
-
-/// modulo function for integers as implemented in python
-/// returns value % max 
-/// which is always positive
-__device__ __inline__ int fold_back_up(int value, int max){
-  while (value < 0){
-    value+=max;
-  }
-  while (value >= max){
-    value-=max;
-  }
-  return value;
-}
-
-/// reduction function returning maximum of all value.
-/// has to be called by all threads
-/// shared_cache should be a pointer to shared memory of at least size blockDim.x
-/// blockDim.x has to be an even number
-__device__ int reduce_max(int * shared_cache){
-  for (int t=(blockDim.x+1)/2;t>1;t=(t+1)/2){
-    if (threadIdx.x < t){
-      if (shared_cache[threadIdx.x]<shared_cache[threadIdx.x+t]){
-	shared_cache[threadIdx.x]=shared_cache[threadIdx.x+t];
-      }
-    }
-    __syncthreads();
-  }
-  if (threadIdx.x==0){
-    shared_cache[0]=shared_cache[0]>shared_cache[1]?shared_cache[0]:shared_cache[1];
-  }
-  return shared_cache[0];
-}
-
-
-__device__ int reduce_max(int * shared_cache, int value){
-  shared_cache[threadIdx.x]=value;
-  /*if (threadIdx.x == 0){
-    if ((((blockDim.x+1)/2)*2) > blockDim.x){
-      shared_cache[blockDim.x+1]=0;
-    }
-  }*/
-  //shared_cache[threadIdx.x+blockDim.x]=0;
-  __syncthreads();
-  return reduce_max(shared_cache);
-}
-
-
-
-/// reduction function returning sum of all values in shared_cache[0:blockDim.x-1]
-/// has to be called by all threads
-/// shared_cache should be a pointer to shared memory of at least size blockDim.x
-__device__ void reduce_sum(real * shared_cache){
-#ifndef numThreadsPerBlock_is_power_of_two
-#error numThreadsPerBlock has to be a power of two for effective reduction
-#endif
-  //shared_cache[threadIdx.x]=value;
-  for (int t=(blockDim.x)/2;t>0;t=t/2){
-    if (threadIdx.x < t){
-      shared_cache[threadIdx.x]=shared_cache[threadIdx.x]+shared_cache[threadIdx.x+t];
-    }
-    __syncthreads();
-  }
-}
-
-/// function to avoid caching if reading from global memory
-/// addr     : address from which the value should be read
-__device__ __inline__ real read_without_caching( const real * addr){
-   real tmp;
-#ifdef SD_USE_FLOAT
-   asm("ld.global.cs.f32 %0,[%1];\n"
-       : "=f"(tmp) : "l"(addr) : );
-#else
-   asm("ld.global.cs.f64 %0,[%1];\n"
-       : "=d"(tmp) : "l"(addr) : );
-#endif
-   return tmp;
-}
-
-#endif
diff --git a/src/core/integrate_sd_cuda_device.hpp b/src/core/integrate_sd_cuda_device.hpp
deleted file mode 100644
index 67bb68bf772..00000000000
--- a/src/core/integrate_sd_cuda_device.hpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
-  Copyright (C) 2010,2012,2013,2014,2015,2016 The ESPResSo project
-  Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010 
-    Max-Planck-Institute for Polymer Research, Theory Group
-  
-  This file is part of ESPResSo.
-  
-  ESPResSo is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-  
-  ESPResSo is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-  
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>. 
-*/
-
-#ifndef __INTEGRATE_SD_CUDA_DEVICE_HPP
-#define __INTEGRATE_SD_CUDA_DEVICE_HPP
-
-#include "integrate_sd_cuda.hpp"
-
-#ifdef SD
-
-typedef unsigned long long ull;
-// atomicAdd implementation for double
-__device__ double atomicAdd(double * address, double inc);
-
-__device__ int fold_back_up(int value, int max);
-
-__device__ int reduce_max(int * shared_cache, int value);
-
-__device__ __inline__ real read_without_caching( const real * addr);
-
-__device__ void reduce_sum(real * shared_cache);
-
-#endif /* SD */
-
-#endif
diff --git a/src/core/integrate_sd_cuda_kernel.cu b/src/core/integrate_sd_cuda_kernel.cu
deleted file mode 100644
index 5977c2eecee..00000000000
--- a/src/core/integrate_sd_cuda_kernel.cu
+++ /dev/null
@@ -1,1884 +0,0 @@
-/*
-  Copyright (C) 2010,2012,2013,2014,2015,2016 The ESPResSo project
-  Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010 
-    Max-Planck-Institute for Polymer Research, Theory Group
-  
-  This file is part of ESPResSo.
-  
-  ESPResSo is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-  
-  ESPResSo is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-  
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>. 
-*/
-
-#include "config.hpp"
-
-#include <stdio.h>
-#include <assert.h>
-#include "cuda_runtime.h"
-#include <device_functions.h>
-
-#include "integrate_sd_cuda.hpp"
-#include "integrate_sd_cuda_device.cu"
-
-#if defined(OMPI_MPI_H) || defined(_MPI_H)
-#error CU-file includes mpi.h! This should not happen!
-#endif
-
-#ifdef SD
-
-/* *************************************************************************************************************** *
- * ********************************************      CUDA-KERNELS     ******************************************** *
- * *************************************************************************************************************** */
-
-
-/// This computes the farfield contribution of the mobility in the case of no
-/// periodic boundary conditions.
-/// r is the vector of [x_1, y_1, z_1, x_2, y_2, z_2, ...]
-/// N is the number of particles
-/// self_mobility is 1./(6.*PI*eta*a)
-/// a is the particle radius
-/// mobility is the mobility matrix which will be retruned
-#define mydebug(str,...)
-// if (threadIdx.x < 3 && (blockIdx.x == 0 || blockIdx.x == 1)){printf("line: %d thread: %2d, block: %2d "str,__LINE__,threadIdx.x,blockIdx.x,__VA_ARGS__);}
-__global__ void sd_compute_mobility_matrix(const real * r, int N, real self_mobility, real a, real * mobility){
-  real mypos[3];
-  const int lda=((3*N+31)/32)*32;
-  __shared__ real cachedPos[3*numThreadsPerBlock];
-  __shared__ real writeCache[3*numThreadsPerBlock];
-  int i = blockIdx.x*blockDim.x + threadIdx.x;
-  // get data for myposition - using coalscaled memory access
-  for (int l=0;l<3;l++){
-    mydebug(" 0x%08x -> 0x%08x  \n",numThreadsPerBlock*(l+blockIdx.x*3)+threadIdx.x,numThreadsPerBlock*l+threadIdx.x);
-    cachedPos[numThreadsPerBlock*l+threadIdx.x] = r[numThreadsPerBlock*(l+blockIdx.x*3)+threadIdx.x];
-  }
-  __syncthreads();
-  for (int l=0;l<3;l++){
-    mypos[l] = cachedPos[threadIdx.x*3+l];
-    mydebug("mypos[%d]:  %e\n",l,mypos[l]);
-  }
-
-  /*if (i < N){
-    // first write the self contribution
-#pragma unroll
-    for (int k=0; k < DIM; k++){
-      //#pragma unroll
-      //for (int l=0; l < DIM; l++){
-      //mobility[myindex(DIM*i+k,DIM*i+l)]=0;
-      //}
-      mobility[myindex(DIM*i+k,DIM*i+k)]=self_mobility;
-    }
-    }*/
-  for (int offset=0;offset<N;offset+=numThreadsPerBlock){
-    mydebug("offset: %d\n",offset)
-    // copy positions to shared memory
-#pragma unroll
-    for (int l=0;l<3;l++){
-      mydebug("fuu:: 0x%08x  0x%08x  0x%08x  0x%08x %e\n",r, offset*3,numThreadsPerBlock*l,threadIdx.x,cachedPos[numThreadsPerBlock*l+threadIdx.x]);
-      cachedPos[numThreadsPerBlock*l+threadIdx.x] = r[offset*3+numThreadsPerBlock*l+threadIdx.x];
-    }
-    __syncthreads();
-    if (i < N){
-      for (int j=offset;j<min(offset+numThreadsPerBlock,N);j++){
-	real dr[DIM];
-	real dr2=0;
-#pragma unroll 3
-	for (int k=0;k<DIM;k++){
-	  dr[k]=mypos[k]-cachedPos[DIM*(j-offset)+k]; // r_ij
-	  dr2+=dr[k]*dr[k];
-	}
-	dr2=max(dr2,0.01);
-	real drn= sqrt(dr2); // length of dr
-	real b = a/drn;
-      
-	/*if (0.5 < b){  // drn < 2*a
-	  /*real t=3./32./drn/a*self_mobility;
-	  real t2=(1-9./32.*drn/a)*self_mobility;
-	  for (k=0; k < DIM; k++){
-	  for (l=0;l < DIM; l++){
-	  mobility[myindex(DIM*i+k,DIM*j+l)]=dr[k]*dr[l]*t;
-	  }
-	  mobility[myindex(DIM*i+k,DIM*j+k)]+=t2;
-	  }*/ // this should not happen ...
-	real t,t2;
-	// this also catches the case i == j
-	if (0.5 < b ){  // drn < 2*a
-	  t=0;
-	  t2=0;
-	  if (i==j){
-	    t2=self_mobility;
-	  }
-	} else {
-	  real b2=(a*a)/dr2;
-	  // Rotne Prager
-	  //t=(0.75-1.5*b2)*b/dr2*self_mobility;
-	  //t2=(0.75+0.5*b2)*b*self_mobility;
-	  
-	  t=(0.75-1.5*b2)*b/dr2*self_mobility;
-	  t2=(0.75+0.5*b2)*b*self_mobility;
-	}
-	//mobility[threadIdx.x]=3+threadIdx.x;
-	real tmp_el13;
-#pragma unroll 3
-	for (int k=0; k < DIM; k++){
-	  if (k ==0){ // these ifs should be removed at compile time ... after unrolling
-#pragma unroll 3
-	    for (int l=0;l < 3; l++){
-	      //mobility[myindex(DIM*i+k,DIM*j+l)]=dr[k]*dr[l]*t;
-	      writeCache[3*threadIdx.x+l]=dr[k]*dr[l]*t;
-	    }
-	  }
-	  else if(k==1){
-	    tmp_el13 = writeCache[3*threadIdx.x+2];
-	    writeCache[3*threadIdx.x+0]=writeCache[3*threadIdx.x+1];
-#pragma unroll 2
-	    for (int l=1;l < DIM; l++){
-	      //mobility[myindex(DIM*i+k,DIM*j+l)]=dr[k]*dr[l]*t;
-	      writeCache[3*threadIdx.x+l]=dr[k]*dr[l]*t;
-	    }	
-	  }
-	  else{
-	    writeCache[3*threadIdx.x+0]=tmp_el13;
-	    writeCache[3*threadIdx.x+1]=writeCache[3*threadIdx.x+2];
-	    writeCache[3*threadIdx.x+2]=dr[k]*dr[2]*t;
-	  }
-	  writeCache[3*threadIdx.x+k]+=t2;
-	    
-	  __syncthreads();
-	  //int max = min(blockDim.x, N-(blockIdx.x*blockDim.x));
-	  int max = min(blockDim.x,N-blockDim.x*blockIdx.x);
-	  for (int l=0;l<3;l++){
-	    //mobility[(DIM*j+k)*3*N+blockIdx.x*blockDim.x+threadIdx.x+blockDim.x*l]=writeCache[threadIdx.x+blockDim.x*l];
-	    mobility[(DIM*j+k)*lda+blockIdx.x*blockDim.x*3+max*l+threadIdx.x]=writeCache[max*l+threadIdx.x];
-	  }
-	  //mobility[myindex(DIM*i+k,DIM*j+k)]+=t2;
-	}
-	// python implementation:
-	// T=one*(0.75+0.5*b2)*b+(0.75-1.5*b2)*b*drt*dr/dr2;
-	//} // if (j <N)
-      } // for (j = ...
-    } // if (i < N)
-  }// for offset = ...
-}
-#undef mydebug
-
-/// This computes the farfield contribution of the mobility with ewald summation
-/// this version assumes that the real-space cutoff is smaller than the boxlength
-/// r is the vector of [x_1, y_1, z_1, x_2, y_2, z_2, ...]
-/// N is the number of particles
-/// self_mobility is 1./(6.*PI*eta*a)
-/// a is the particle radius
-/// L_g is the boxlength
-/// mobility is the mobility matrix which will be retruned
-/// cutoff the realspace cutoff distance
-/// xi the splitting parameter as defined by Beenakker 1986
-/// xa  = xi * a
-/// xa3 = xa * xa * xa
-#define mydebug(str,...)
-// if (threadIdx.x < 3 && (blockIdx.x == 0 || blockIdx.x == 1)){printf("line: %d thread: %2d, block: %2d "str,__LINE__,threadIdx.x,blockIdx.x,__VA_ARGS__);}
- __global__ void sd_compute_mobility_matrix_real_short(const real * r, int N, real self_mobility, real a, const real * L_g,
-						       real * mobility, real cutoff, real xi, real xa, real xa3){
-  real mypos[3];
-  const int lda=((3*N+31)/32)*32;
-  __shared__ real L[3];
-  __shared__ real cachedPos[3*numThreadsPerBlock];
-  __shared__ real writeCache[3*numThreadsPerBlock];
-  int i = blockIdx.x*blockDim.x + threadIdx.x;
-  if (threadIdx.x < 3){ // copy L to shared memory
-    L[threadIdx.x]=L_g[threadIdx.x];
-  }
-  __syncthreads();
-  // get data for myposition - using coalscaled memory access
-  for (int l=0;l<3;l++){
-    cachedPos[numThreadsPerBlock*l+threadIdx.x] = r[numThreadsPerBlock*(l+blockIdx.x*3)+threadIdx.x];
-  }
-  __syncthreads();
-  for (int l=0;l<3;l++){
-    mypos[l] = cachedPos[threadIdx.x*3+l];
-    mydebug("mypos[%d]:  %e\n",l,mypos[l]);
-  }
-
-  for (int offset=0;offset<N;offset+=numThreadsPerBlock){
-    mydebug("offset: %d\n",offset)
-    // copy positions to shared memory
-#pragma unroll
-    for (int l=0;l<3;l++){
-      mydebug("fuu:: 0x%08x  0x%08x  0x%08x  0x%08x %e\n",r, offset*3,numThreadsPerBlock*l,threadIdx.x,cachedPos[numThreadsPerBlock*l+threadIdx.x]);
-      cachedPos[numThreadsPerBlock*l+threadIdx.x] = r[offset*3+numThreadsPerBlock*l+threadIdx.x];
-    }
-    __syncthreads();
-    if (i < N){
-      for (int j=offset;j<min(offset+numThreadsPerBlock,N);j++){
-	real dr[DIM];
-	real dr2=0;
-#pragma unroll 3
-	for (int k=0;k<DIM;k++){
-	  dr[k]=mypos[k]-cachedPos[DIM*(j-offset)+k]; // r_ij
-	  dr[k]-=rint(dr[k]/L[k])*L[k]; // fold back
-	  dr2+=dr[k]*dr[k];
-	}
-	dr2=max(dr2,0.01);
-	real drn= sqrt(dr2); // length of dr
-	real ar = a/drn;
-	real xr = xi*drn;
-	//real xa is given
-	
-	real t,t2;
-	// this also catches the case i == j
-	if (0.5 < ar || drn > cutoff){  // drn < 2*a
-	  t=0;
-	  t2=0;
-	  if (i==j){
-	    //t2=1-6./sqrt(M_PI)*xa+40./3./sqrt(M_PI)*xa*xa*xa;
-#ifdef SD_USE_FLOAT
-	    t2=1-3.385137501286537721688476709364635515064303775f*xa + 7.52252778063675049264105935414363447792067505f*xa3;
-#else
-	    t2=1-3.385137501286537721688476709364635515064303775*xa + 7.52252778063675049264105935414363447792067505*xa3;
-#endif
-	  }
-	} else {
-	  // Rotne Prager
-	  real xr2=xr*xr;
-	  real ar2=ar*ar;
-#ifdef SD_USE_FLOAT
-	  t=(0.75f-1.5f*ar2)*ar*erfcf(xr)+(-4.f*xa3*xr2*xr2-3.f*xa*xr2+16.f*xa3*xr2+1.5f*xa-2.f *xa3-3.f*xa*ar2)*0.5641895835477562869480794515607725858440506f*exp(-xr2);
-	  t2=(0.75f+0.5f*ar2)*ar*erfcf(xr)+(4.f*xa3*xr2*xr2+3.f*xa*xr2-20.f*xa3*xr2-4.5f*xa+14.f*xa3+    xa*ar2)*0.5641895835477562869480794515607725858440506f*exp(-xr2);
-#else
-	  //assert(erfc(xr)>1e-15);
-	  t=(0.75-1.5*ar2)*ar*erfc(xr);
-	  //assert(t>1e-15);
-	  real tmp1=(-4.*xa3*xr2*xr2-3.*xa*xr2+16.*xa3*xr2+1.5*xa-2. *xa3-3.*xa*ar2);
-	  real tmp2=0.5641895835477562869480794515607725858440506*exp(-xr2);//
-	  //assert(abs(tmp1)>1e-15);
-	  //assert(abs(tmp2)>1e-15);
-	  //assert(abs(tmp1*tmp2)>1e-15);
-	  t+=tmp1*tmp2;
-	  //assert(abs(t)>1e-15);
-	  t2=(0.75+0.5*ar2)*ar*erfc(xr)+(4.*xa3*xr2*xr2+3.*xa*xr2-20.*xa3*xr2-4.5*xa+14.*xa3+   xa*ar2)*0.5641895835477562869480794515607725858440506*exp(-xr2);
-#endif
-	  //assert(t>1e-15);
-	  //assert(t2>1e-15);
-	}
-	t*=self_mobility;
-	t2*=self_mobility;
-	t/=dr2;
-	real tmp_el13;
-#pragma unroll 3
-	for (int k=0; k < DIM; k++){
-	  if (k ==0){ // these ifs should be removed at compile time ... after unrolling
-#pragma unroll 3
-	    for (int l=0;l < 3; l++){
-	      writeCache[3*threadIdx.x+l]=dr[k]*dr[l]*t;
-	    }
-	  }
-	  else if(k==1){
-	    tmp_el13 = writeCache[3*threadIdx.x+2];
-	    writeCache[3*threadIdx.x+0]=writeCache[3*threadIdx.x+1];
-#pragma unroll 2
-	    for (int l=1;l < DIM; l++){
-	      writeCache[3*threadIdx.x+l]=dr[k]*dr[l]*t;
-	    }	
-	  }
-	  else{
-	    writeCache[3*threadIdx.x+0]=tmp_el13;
-	    writeCache[3*threadIdx.x+1]=writeCache[3*threadIdx.x+2];
-	    writeCache[3*threadIdx.x+2]=dr[k]*dr[2]*t;
-	  }
-	  writeCache[3*threadIdx.x+k]+=t2;
-	    
-	  __syncthreads();
-	  int max = min(blockDim.x,N-blockDim.x*blockIdx.x);
-	  for (int l=0;l<3;l++){
-	    mobility[(DIM*j+k)*lda+blockIdx.x*blockDim.x*3+max*l+threadIdx.x]=writeCache[max*l+threadIdx.x];
-	  }
-	}
-      } // for (j = ...
-    } // if (i < N)
-  }// for offset = ...
-}
-#undef mydebug
-
-
-/// This computes the farfield contribution of the mobility with ewald summation
-/// this kernel computes the sines and cosinus.
-/// r        : the vector of positions [x_1, y_1, z_1, x_2, y_2, z_2, ...]
-/// N        : the number of particles
-/// vecs *   : a pointer to the k-vectors
-/// num      : number of k-vectors
-/// sines    : the values for sin(pos_i k_j)
-/// cosines  : the values for cos(pos_i k_j)
-/// ldd      : the leading dimension of the sines and cosines array
-#define mydebug(str,...)
-// if (threadIdx.x < 3 && (blockIdx.x == 0 || blockIdx.x == 1)){printf("line: %d thread: %2d, block: %2d "str,__LINE__,threadIdx.x,blockIdx.x,__VA_ARGS__);}
-__global__ void sd_compute_mobility_sines(const real * r, int N, const real * vecs, int num, real * sines, real * cosines, int ldd){
-  real mypos[3];
-  __shared__ real cache[3*numThreadsPerBlock];
-  int i = blockIdx.x*blockDim.x + threadIdx.x;
-  // get data for myposition - using coalscaled memory access
-  for (int l=0;l<3;l++){
-    mydebug(" 0x%08x -> 0x%08x  \n",numThreadsPerBlock*(l+blockIdx.x*3)+threadIdx.x,numThreadsPerBlock*l+threadIdx.x);
-    cache[numThreadsPerBlock*l+threadIdx.x] = r[numThreadsPerBlock*(l+blockIdx.x*3)+threadIdx.x];
-  }
-  __syncthreads();
-  for (int l=0;l<3;l++){
-    mypos[l] = cache[threadIdx.x*3+l];
-    mydebug("mypos[%d]:  %e\n",l,mypos[l]);
-  }
-  
-  for (int offset=0;offset<num;offset+=numThreadsPerBlock){
-    mydebug("offset: %d\n",offset)
-    // copy positions to shared memory
-#pragma unroll
-    for (int l=0;l<3;l++){
-      mydebug("fuu:: 0x%08x  0x%08x  0x%08x  0x%08x %e\n",r, offset*3,numThreadsPerBlock*l,threadIdx.x,cachedPos[numThreadsPerBlock*l+threadIdx.x]);
-      cache[numThreadsPerBlock*l+threadIdx.x] = vecs[offset*3+numThreadsPerBlock*l+threadIdx.x];
-    }
-    __syncthreads();
-    for (int j=offset;j<offset+numThreadsPerBlock&& j < num;j++){
-      real vec_r=0;
-#pragma unroll 3
-      for (int k=0;k<DIM;k++){
-	vec_r+=mypos[k]*cache[3*(j-offset)+k];
-      }
-      real tmp;
-      if (i < N){
-	tmp=sin(vec_r);
-      } else {
-	tmp=0;
-      }
-      sines[ldd*j+i]  =tmp;
-      if (i < N){
-	tmp=cos(vec_r);
-      }
-      cosines[ldd*j+i]=tmp;
-    } // for (j = ...
-  }// for offset = ...
-}
-#undef mydebug
-
-
-
-/// This computes the farfield contribution of the mobility with ewald summation
-/// this kernel computes the sum in eq. (3.34) and (3.35) in the master thesis 
-/// "rotne-prager based hydrodynamics" by D. Schwoerer: M^{wave} \sum F_i cos / sin.
-/// The number of threads have to be a power of two e.g. 32, 64 or 128.
-/// forces is the input vector of form [fx_1, fy_1, fz_1, fx_2, fy_2, fz_2, ...]
-__global__ void sd_wavepart_sum(const real * const forces, const real * const vecs,    int num, const  real * const matrices_d,
-				const real * const sines,  const real * const cosines, int ldd, int N, real * sin_sum, real * cos_sum, real max){
-  // each block summs over all particles
-  // with constant wave vektor
-  real myforcesin[3]={0,0,0};
-  real myforcecos[3]={0,0,0};
-  extern __shared__ real cache[];//3*blockDim.x
-  for (int offset=0;offset<N;offset+=blockDim.x){
-    // for (int j=offset;j< offset_next;j++){ // is parallel
-#pragma unroll 3
-    for (int l=0;l<3;l++){
-      cache[blockDim.x*l+threadIdx.x] = forces[blockDim.x*l+offset*3+threadIdx.x];
-    }
-    __syncthreads();
-    real sine;
-    real cosine;
-    if (offset+threadIdx.x < N){
-      sine   = sines  [blockIdx.x*ldd+offset+threadIdx.x];
-      cosine = cosines[blockIdx.x*ldd+offset+threadIdx.x];
-      //} else {
-      //sine   = 0;
-      //cosine = 0;
-      //}
-      //assert(!isnan(sine));
-      //assert(!isnan(cosine));
-      for (int k=0;k<3;k++){
-	//assert(abs(cache[threadIdx.x*3+k]) <= max);
-	//assert(abs(myforcesin[k]) <= max);
-	//assert(abs(myforcecos[k]) <= max);
-	//assert(!isnan(myforcesin[k]));
-	//assert(!isnan(myforcecos[k]));
-	real tmp=  sine*cache[threadIdx.x*3+k];
-	//assert(abs(tmp) <= max);
-	myforcesin[k]+=tmp;
-	tmp     =cosine*cache[threadIdx.x*3+k];
-	//assert(abs(tmp) <= max);
-	myforcecos[k]+=tmp;
-	//assert(!isnan(myforcesin[k]));
-	//assert(!isnan(myforcecos[k]));
-      } // for k
-    } // if offset + threadIdx.x < N
-  }// for offset = ...
-  #pragma unroll 3
-  for (int k=0 ; k < 3; k++){
-    cache[threadIdx.x+blockDim.x*k]=myforcesin[k];
-  }
-  __syncthreads();
-  reduce_sum(cache);
-  //assert(cache[0] <= max);
-  reduce_sum(cache+blockDim.x);
-  //assert(cache[blockDim.x] <= max);
-  reduce_sum(cache+blockDim.x*2);
-  //assert(cache[blockDim.x*2] <= max);
-  __syncthreads();
-  if (threadIdx.x < 3){
-    myforcesin[0]=cache[threadIdx.x*blockDim.x];
-  }
-  __syncthreads();
-  for (int k=0 ; k < 3; k++){
-    cache[threadIdx.x+blockDim.x*k]=myforcecos[k];
-  }
-  __syncthreads();
-  reduce_sum(cache);
-  //assert(cache[0] <= max);
-  reduce_sum(cache+blockDim.x);
-  //assert(cache[blockDim.x] <= max);
-  reduce_sum(cache+blockDim.x*2);
-  //assert(cache[blockDim.x*2] <= max);
-  __syncthreads();
-  if (threadIdx.x < 3){
-    cache[threadIdx.x]   = cache[threadIdx.x*blockDim.x];
-    cache[threadIdx.x+3] = myforcesin[0];
-  }
-  __syncthreads();
-  real * mat = cache + 6;
-  if (threadIdx.x < 6){
-    //assert(cache[threadIdx.x] <= max);
-    mat[threadIdx.x] = matrices_d[blockIdx.x*6+threadIdx.x];
-    __syncthreads();
-    cache[threadIdx.x+12]=cache[threadIdx.x]*mat[threadIdx.x%3];
-  }
-  __syncthreads();
-  if (threadIdx.x < 2){
-    cache[threadIdx.x*3+1+12]+=mat[3]*cache[threadIdx.x*3+0];
-    cache[threadIdx.x*3+0+12]+=mat[3]*cache[threadIdx.x*3+1];
-    cache[threadIdx.x*3+2+12]+=mat[4]*cache[threadIdx.x*3+0];
-    cache[threadIdx.x*3+0+12]+=mat[4]*cache[threadIdx.x*3+2];
-    cache[threadIdx.x*3+2+12]+=mat[5]*cache[threadIdx.x*3+1];
-    cache[threadIdx.x*3+1+12]+=mat[5]*cache[threadIdx.x*3+2];
-  }
-  __syncthreads();
-  if (threadIdx.x < 3){
-    //assert(cache[threadIdx.x+12] <= max);
-    //assert(cache[threadIdx.x+15] <= max);
-    cos_sum[threadIdx.x+blockIdx.x*3]=cache[threadIdx.x+12];
-    sin_sum[threadIdx.x+blockIdx.x*3]=cache[threadIdx.x+15];
-  }
-}
-
-
-
-/// This computes the wave space contribution of the matrix vector product of
-/// the farfield. This uses the pre computed values from the kernel
-/// sd_wavepart_sum
-/// num      : number of wave vectors
-/// sines    : sinus of the scalar product of position and wave vector
-/// cosines  : cosines of the scalar product of position and wave vector
-/// sin_sum  : precomputed sum for sinus
-/// cos_sum  : precomputed sum for cosinus
-/// ldd      : leading dimension of the sin_sum and cos_sum arrays
-/// out      : out vector upon which the result gets added
-/// max      : maximal value of which the sin_sum and cos_sum can have
-/// N        : number of particles
-/// factor   : the scalar factor c of the matrix product c A \cdot x
-__global__ void sd_wavepart_assemble(const int num, const real * const __restrict__ sines, const real * const __restrict__ cosines,
-				     const real * const __restrict__ sin_sum, const real * const __restrict__ cos_sum, 
-				     const int ldd, real * __restrict__ out, const real max, const int N, const real factor){
-  // particle index of the particle for which we sum:
-  int i = threadIdx.x+blockIdx.x*blockDim.x;
-  real myout[3]={0,0,0};
-  real sine;
-  real cosine;
-  
-  int offset_next;
-  for (int offset=0;offset<num;offset=offset_next){
-    offset_next=offset+blockDim.x;
-    if (offset_next > num){
-      offset_next=num;
-    }
-    // j: wave vector index
-    for (int j=offset;j< offset_next ;j++){
-      sine   = sines  [i + j*ldd];
-      //sine = read_without_caching(sines + i+j*ldd);
-#pragma unroll 3
-      for (int k=0;k < 3;k++){
-	real tmp = sin_sum[k + j*3];
-	//assert(tmp <= max);
-	myout[k]+=sine*tmp;
-      }
-      cosine = cosines[i + j*ldd];
-      //cosine = read_without_caching(cosines + i+j*ldd);
-#pragma unroll 3
-      for (int k=0;k<DIM;k++){
-	real tmp = cos_sum[k + j*3];
-	//assert(tmp <= max);
-	myout[k]+=cosine*tmp;
-      }
-    } // for (j = ...
-  }// for offset = ...
-  extern __shared__ real cache[];
-#pragma unroll 3
-  for (int k=0;k<3;k++){
-    cache[threadIdx.x*3+k]=myout[k]*factor;;
-  }
-  __syncthreads();
-#pragma unroll 3
-  for (int k=0;k<3;k++){
-    if (threadIdx.x+(blockIdx.x*3+k)*blockDim.x < 3*N){
-      out[threadIdx.x+(blockIdx.x*3+k)*blockDim.x]+=cache[threadIdx.x + k*blockDim.x];
-    }
-    //cache[threadIdx.x+k*numThreadsPerBlock]+=out[threadIdx.x+(blockIdx.x*3+k)*numThreadsPerBlock];
-    //out[threadIdx.x+(blockIdx.x*3+k)*numThreadsPerBlock]=cache[threadIdx.x+k*numThreadsPerBlock];
-  }
-}
-/* this is slower (if we use a block per particle instead of a thread
-__global__ void sd_wavepart_assemble_block(const int num, const real * const sines, const real * const cosines, const real * const sin_sum,
-				     const real * const cos_sum, const int ldd, real * out, real max, int N, const real factor){
-  // particle index of the particle for which we sum:
-  int i = blockIdx.x;
-  real myout[3]={0,0,0};
-  real sine;
-  real cosine;
-  
-  extern __shared__ real cache[];
-  int offset_next;
-  for (int offset=0;offset<num;offset=offset_next){
-    offset_next=offset+blockDim.x;
-    // j: wave vector index
-    //for (int j=offset;j< offset_next ;j++){
-    int j=offset+threadIdx.x;
-    if (j < num){
-      sine   = sines  [i + j*ldd];
-#pragma unroll 3
-      for (int k=0;k < 3;k++){
-	real tmp = sin_sum[k + j*3];
-	//assert(tmp <= max);
-	myout[k]+=sine*tmp;
-      }
-      cosine = cosines[i + j*ldd];
-#pragma unroll 3
-      for (int k=0;k<DIM;k++){
-	real tmp = cos_sum[k + j*3];
-	//assert(tmp <= max);
-	myout[k]+=cosine*tmp;
-      }
-    } // for (j = ...
-  }// for offset = ...
-#pragma unroll 3
-  for (int k=0;k<3;k++){
-    cache[threadIdx.x]=myout[k]*factor;
-    __syncthreads();
-    reduce_sum(cache);
-    __syncthreads();
-    if (threadIdx.x == k){
-      myout[0]=cache[0];
-    }
-  }
-  if (threadIdx.x < 3){
-    out[blockIdx.x * 3 + threadIdx.x]+= myout[0];
-  }
-}*/
-
-
-
-/// Add the wavepart contribution of the mobility to the matrix
-/// num      : number of wave space vectors
-/// matrices_d : pointer to the matrices of the wave space contribution in the
-///            ewald sum
-/// sines    : sines of the scalar product of wave space vector and particle
-///            positions
-/// cosines  : cosines of the scalar product of wave space vector and
-///            particle positions
-/// ldd      : leading dimension of sines/cosines
-/// N        : number of particles
-/// mobility_d : mobility matrix to which the wave space contribution is added
-/// mat_ldd  : leading dimension of the mobility matrix
-__global__ void sd_wavepart_addto_matrix(const int num, const  real * const matrices_d, const real * const sines,  const real * const cosines,
-					 const int ldd, const int N, real * const mobility_d, const int mat_ldd){
-  const int i= threadIdx.x + blockDim.x*blockIdx.x;
-  const int j= blockIdx.y;
-  
-  real mymat[6]={0,0,0,
-		 0,0,0};
-  if (i < N){
-    for (int k = 0; k < num; k++){
-      real mycos = cosines[i+k*ldd];
-      mycos *= cosines[j+k*ldd];
-      mycos += sines[i+k*ldd] * sines[j+k*ldd];
-      for (int l=0;l<6;l++){
-	mymat[l]+=matrices_d[6*k+l]*mycos;
-      }
-    }
-  }
-  extern __shared__ real share[];//3*blockDim.x
-  #pragma unroll 3
-  for (int d=0;d<3;d++){
-    for (int l=0;l<3;l++){
-      if (threadIdx.x+3*blockDim.x*blockIdx.x+l*blockDim.x<3*N)
-	share[threadIdx.x+l*blockDim.x]=mobility_d[threadIdx.x+3*blockDim.x*blockIdx.x+l*blockDim.x+mat_ldd*(j*3+d)];
-#ifdef SD_DEBUG
-      else
-	share[threadIdx.x+l*blockDim.x]=0;
-#endif
-    }
-    __syncthreads();
-    if (d==0){
-      share[threadIdx.x*3+0]+=mymat[0];
-      share[threadIdx.x*3+1]+=mymat[3];
-      share[threadIdx.x*3+2]+=mymat[4];
-    } else if (d==1){
-      share[threadIdx.x*3+0]+=mymat[3];
-      share[threadIdx.x*3+1]+=mymat[1];
-      share[threadIdx.x*3+2]+=mymat[5];
-    } else {
-      share[threadIdx.x*3+0]+=mymat[4];
-      share[threadIdx.x*3+1]+=mymat[5];
-      share[threadIdx.x*3+2]+=mymat[2];
-    }
-    __syncthreads();
-    for (int l=0;l<3;l++){
-      if (threadIdx.x+3*blockDim.x*blockIdx.x+l*blockDim.x<3*N)
-	mobility_d[threadIdx.x+3*blockDim.x*blockIdx.x+l*blockDim.x+mat_ldd*(j*3+d)]=share[threadIdx.x+l*blockDim.x];
-    }
-    __syncthreads();
-  }
-}
-
-
-
-#define mydebug(str,...)
-// if (threadIdx.x < 3 && blockIdx.x < 2){printf("line: %d thread: %2d, block: %2d "str,__LINE__,threadIdx.x,blockIdx.x,__VA_ARGS__);}
-/// this computes the near field as a  ResistanceMatrix
-/// pos           : is the vector of [x_1, y_1, z_1, x_2, y_2, z_2, ...]
-/// N             : is the number of particles
-/// self_mobility : is 1./(6.*PI*eta*a)
-/// a             : is the particle radius
-/// L_g           : is the boxlength
-/// resistance    : is the resistance matrix which will be retruned
-/// myInfo        : contains infos about the operation:
-///                myInfo[0] : number of overlapping particles
-///                myInfo[1] : number of interacting particles (via nf)
-///                myInfo[2] : max number of interacting particles per particle
-__global__ void sd_compute_resistance_matrix(const real * pos, int N, real self_mobility, real a, const real * L_g, real * resistance, int * myInfo){
-  int interactions=0;
-  real mypos[3];
-#ifdef SD_USE_FLOAT
-  __shared__ real cachedPos[4*numThreadsPerBlock];
-#else
-  __shared__ real cachedPos[3*numThreadsPerBlock];
-#endif
-  const int lda=(((N*3)+31)/32)*32;
-  real myresistance[6]={0,0,0,0,0,0};
-  int i = blockIdx.x*blockDim.x + threadIdx.x;
-#ifndef SD_NOT_PERIODIC
-  __shared__ real L[3];
-  if (threadIdx.x < 3){ // copy L to shared memory
-    L[threadIdx.x]=L_g[threadIdx.x];
-  }
-#endif
-  for (int l=0;l<3;l++){
-    cachedPos[threadIdx.x+l*numThreadsPerBlock] = pos[threadIdx.x+l*numThreadsPerBlock+blockIdx.x*blockDim.x*3];
-  }
-
-  __syncthreads();
-  for (int d=0;d<3;d++){
-    mypos[d] = cachedPos[threadIdx.x*3+d];
-  }
-  
-  for (int offset=0;offset<N;offset+=numThreadsPerBlock){
-    // copy positions to shared memory
-#pragma unroll
-    for (int l=0;l<3;l++){
-      cachedPos[threadIdx.x+l*numThreadsPerBlock] = pos[threadIdx.x+l*numThreadsPerBlock+offset*3];
-    }
-    __syncthreads();
-    for (int j=offset;j<min(offset+numThreadsPerBlock,N);j++){
-      real dr[DIM];
-      real dr2=0;
-#pragma unroll
-      for (int k=0;k<DIM;k++){
-	dr[k]=mypos[k]-cachedPos[3*(j-offset)+k]; // r_ij
-#ifndef SD_NOT_PERIODIC
-	dr[k]-=L[k]*rint(dr[k]/L[k]); // fold back
-#endif
-	dr2+=dr[k]*dr[k];
-	mydebug("dr[%d]: %f\n",k,dr[k]);
-      }
-#ifdef SD_RESISTANCE_CORRECT
-      real r2bcorr_diag_self     = 0;
-      real r2bcorr_diag_mix      = 0;
-      real r2bcorr_offdiag_self  = 0;
-      real r2bcorr_offdiag_mix   = 0;
-#else
-      real offdiag_fac=0;
-      real diag_fac=0;
-#endif
-      if (i >= N || i ==j || j >= N){
-	;
-      }
-      else if (dr2 < 4*a*4*a){
-	if (!(2*a*2*a < dr2 )){
-	  atomicAdd(myInfo,1); // count overlapping particles
-	}
-	else {// 2*a < drn < 4*a 
-	  interactions++;
-	  // python code:
-	  // # Use only singular therms, namely to order O(s_ij^0)                                                                  
-	  // T=(1./4./s-1/4-9./40.*ls)*dr*drt/dr2
-	  // #           ^ this additonal constant is so that the mobility is smooth
-	  // # c.f. N.-Q. Nguyen and A. J. C. Ladd, PHYSICAL REVIEW E 66, 046708 (2002) equation (34)                               
-	  // T+=1./6.*ls*(-one+dr*drt/dr2)
-	  // R[3*i:3*i+3,3*j:3*j+3]=-T
-	  // R[3*i:3*i+3,3*i:3*i+3]+=T
-	  real drn= sqrt(dr2); // length of dr
-	  real s = drn/a-2;
-	  real ls = log(s);
-	  
-	  real const para_fac_c=-0.125+(9./40.)*log(2.)+3./112.*2.*log(2.);
-#ifdef SD_RESISTANCE_CORRECT
-	  real para_fac    =(-0.25/s+(9./40.)*ls+(3./112.)*s*ls);
-	  real perp_fac    =((1./6.)*ls);
-#else
-	  real const perp_fac_c=1./6.*log(2.);
-	  real para_fac    =(-0.25/s+(9./40.)*ls+(3./112.)*s*ls-para_fac_c)/dr2/self_mobility;
-	  real perp_fac    =((1./6.)*ls-perp_fac_c)/self_mobility;
-	  diag_fac    = perp_fac;
-	  offdiag_fac = para_fac-perp_fac;
-#endif
-#ifdef SD_RESISTANCE_CORRECT
-	  real dr4=dr2*dr2;
-	  real dr6=dr4*dr2;
-	  // constants for correction at cutoff
-	  const real dr_c1 = 4;
-	  const real dr_c2 = 4*4;
-	  const real dr_c3 = 4*4*4;
-	  const real dr_c4 = 4*4*4*4;
-	  const real dr_c5 = 4*4*4*4*4;
-	  const real dr_c6 = 4*4*4*4*4*4;
-	  const real r2bcorr_para_self_c  =                  1./(1.-9./4./dr_c2+3./dr_c4-1./dr_c6) + para_fac_c;
-	  const real r2bcorr_para_mix_c   = (6.*dr_c5-4.*dr_c3)/(4.*dr_c6-9.*dr_c4+12.*dr_c2-4.)   + para_fac_c;
-	  const real r2bcorr_perp_self_c  =                  1./(1.-25./16./dr_c2)                 + 1./6.*log(2.);
-	  const real r2bcorr_perp_mix_c   =                  1./(16./20.*dr_c1-25./20./dr_c1)      + 1./6.*log(2.);
-	  // TODO: Make sure to use (real) and not double ...
-	  // real computation
-	  real r2bcorr_para_self     =-( para_fac  + (                      1./(1.-9./4./dr2+3./dr4-1./dr6)  - r2bcorr_para_self_c ));
-	  real r2bcorr_para_mix      = ( para_fac  + ( (6.*dr4*drn-4.*dr2*drn)/(4.*dr6-9.*dr4+12.*dr2-4.)    - r2bcorr_para_mix_c  ));
-	  real r2bcorr_perp_self     =-( perp_fac  + (                      1./(1.-((real)25./16.)/dr2)      - r2bcorr_perp_self_c ));
-	  real r2bcorr_perp_mix      = ( perp_fac  + (                      1./(16./20.*drn-25./20./drn)     - r2bcorr_perp_mix_c  ));
-	  //printf("%d %d   show  [ %e, %e,  %e, %e ]\n",i,j,r2bcorr_para_self,r2bcorr_perp_self,r2bcorr_para_mix,r2bcorr_perp_mix);
-	    
-	  r2bcorr_diag_self     = (r2bcorr_perp_self)/self_mobility;
-	  r2bcorr_diag_mix      = (r2bcorr_perp_mix )/self_mobility;
-	  r2bcorr_offdiag_self  = (r2bcorr_para_self - r2bcorr_perp_self) /self_mobility/dr2;
-	  r2bcorr_offdiag_mix   = (r2bcorr_para_mix  - r2bcorr_perp_mix ) /self_mobility/dr2;
-#endif
-	}
-      }
-      if (i < N){
-#pragma unroll 3
-	for (int k=0; k < DIM; k++){
-#pragma unroll 3
-	  for (int l=0;l < DIM; l++){
-#ifdef SD_RESISTANCE_CORRECT
-	    resistance[myindex(DIM*i+k,DIM*j+l)]=dr[k]*dr[l]*r2bcorr_offdiag_mix;
-#else
-	    resistance[myindex(DIM*i+k,DIM*j+l)]=dr[k]*dr[l]*offdiag_fac;
-#endif
-	  }
-#ifdef SD_RESISTANCE_CORRECT
-	  myresistance[k]-=dr[k]*dr[k]*r2bcorr_offdiag_self;
-	  resistance[myindex(DIM*i+k,DIM*j+k)]+=r2bcorr_diag_mix;
-	  myresistance[k]-=r2bcorr_diag_self;
-#else
-	  myresistance[k]-=dr[k]*dr[k]*offdiag_fac;
-	  resistance[myindex(DIM*i+k,DIM*j+k)]+=diag_fac;
-	  myresistance[k]-=diag_fac;
-#endif
-	}
-      }
-#ifdef SD_RESISTANCE_CORRECT
-      myresistance[3]-=r2bcorr_offdiag_self*dr[0]*dr[1];
-      myresistance[4]-=r2bcorr_offdiag_self*dr[0]*dr[2];
-      myresistance[5]-=r2bcorr_offdiag_self*dr[1]*dr[2];
-#else
-      myresistance[3]-=offdiag_fac*dr[0]*dr[1];
-      myresistance[4]-=offdiag_fac*dr[0]*dr[2];
-      myresistance[5]-=offdiag_fac*dr[1]*dr[2];
-#endif
-      // python implementation:
-      //T=one*(1-9./32.*drn/a)+3./32.*dr*drt/drn/a;
-    }
-  }
-  if ( i < N){
-#pragma unroll
-    for (int k=0;k<3;k++){
-      resistance[myindex(DIM*i+k,DIM*i+k)]=myresistance[k];
-    }
-    resistance[myindex(DIM*i+0,DIM*i+1)]=myresistance[3];
-    resistance[myindex(DIM*i+1,DIM*i+0)]=myresistance[3];
-    resistance[myindex(DIM*i+0,DIM*i+2)]=myresistance[4];
-    resistance[myindex(DIM*i+2,DIM*i+0)]=myresistance[4];
-    resistance[myindex(DIM*i+1,DIM*i+2)]=myresistance[5];
-    resistance[myindex(DIM*i+2,DIM*i+1)]=myresistance[5];
-  }
-  __syncthreads();
-  int * sharedInteractions = (int *) cachedPos; // reuse shared memory
-  int * maxInteractions    = sharedInteractions + blockDim.x*2;
-  sharedInteractions[threadIdx.x]=interactions;
-  sharedInteractions[threadIdx.x+blockDim.x]=0;
-  maxInteractions[threadIdx.x]   =interactions;
-  maxInteractions[threadIdx.x+blockDim.x]   =0;
-  for (int t=(blockDim.x+1)/2;t>1;t=(t+1)/2){
-    if (threadIdx.x < t){
-      sharedInteractions[threadIdx.x]+=sharedInteractions[threadIdx.x+t];
-      sharedInteractions[threadIdx.x+t]=0;
-      maxInteractions[threadIdx.x]=max(maxInteractions[threadIdx.x+t],maxInteractions[threadIdx.x]);
-    }
-    __syncthreads();
-  }
-  if (threadIdx.x==0){
-    sharedInteractions[0]+=sharedInteractions[1];
-    atomicAdd(myInfo+1, sharedInteractions[0]);
-    maxInteractions[0]=max(maxInteractions[0],maxInteractions[1]);
-    atomicMax(myInfo+2, maxInteractions[0]);
-  }
-}
-
-/// Find particle pairs which are within a certain cutoff range
-/// pos           : is the vector of [x_1, y_1, z_1, x_2, y_2, z_2, ...]
-/// L_g           : is the boxlength
-/// N             : is the number of particles
-/// interacting   : list of interaction particles. First entry is the first
-///                 particle close enough to particle 1. Followed by the first
-///                 one close enough to the second and so on.
-///                 The second interaction particle of the first is at
-///                 lda_short=((N+31)/32)*32 stored.
-/// num_interacting : for each particle the number of interacting particles.
-/// interaction_range : the cutoff range for interaction. Has to be smaller
-///                 than half the boxlength.
-__global__ void sd_find_interacting_particles(const real * pos,const real * L_g, const int N, int * interacting, int * num_interacting,
-					      const real interaction_range){
-  const int lda_short=((N+31)/32)*32;
-  const real interaction_range_squared=SQR(interaction_range);
-  __shared__ real cachedPos[3*numThreadsPerBlock];
-#ifndef SD_NOT_PERIODIC
-  __shared__ real L[3];
-  if (threadIdx.x < 3){
-    L[threadIdx.x]=L_g[threadIdx.x];
-  }
-#endif
-  int i = blockIdx.x*blockDim.x + threadIdx.x;
-  for (int l=0;l<3;l++){
-    cachedPos[threadIdx.x+l*blockDim.x] = pos[threadIdx.x+l*numThreadsPerBlock+blockIdx.x*blockDim.x*3];
-  }
-  __syncthreads();
-  real mypos[3];
-  for (int l=0;l<3;l++){
-    mypos[l]=cachedPos[threadIdx.x*3+l];
-  }
-  if (i < N){
-    int interactions=0;
-    int j0next;
-    for (int j0=0;j0<N;j0+=j0next){
-      j0next=min(j0+blockDim.x,N);
-      j0next-=j0;
-      for (int l=0;l<3;l++){
-	cachedPos[threadIdx.x+l*numThreadsPerBlock] = pos[threadIdx.x+l*blockDim.x+j0*3];
-      }
-      for (int j=0;j<j0next;j++){
-	real dr2=0;
-	for (int l=0;l<3;l++){
-	  real dr=mypos[l]-cachedPos[j*3+l];
-#ifndef SD_NOT_PERIODIC
-	  dr-=L[l]*rint(dr/L[l]);
-#endif
-	  dr2+=dr*dr;
-	}
-	if (dr2 < interaction_range_squared && i != j+j0) {
-	  interacting[i+interactions*lda_short]=j+j0;
-	  interactions++;
-	}
-      }
-    }
-    num_interacting[i]=interactions;
-  }
-}
-
-
-/// Find particle pairs which are within a certain cutoff range using buckets
-/// pos           : the vector of [x_1, y_1, z_1, x_2, y_2, z_2, ...]
-/// _L            : the boxlength
-/// N             : the number of particles
-/// interacting   : list of interaction particles. First entry is the first
-///                 particle close enough to particle 1. Followed by the first
-///                 one close enough to the second and so on.
-///                 The second interaction particle of the first is at
-///                 lda_short=((N+31)/32)*32 stored.
-/// num_interacting : for each particle the number of interacting particles.
-/// particle_count : the number of particles within a certain bucket
-/// particle_sorted_list : the list of the particles in the buckets
-/// bucket_size_  : real[3] array of the dimensions of a bucket
-/// bucket_num_   : int[3] array of the number of buckets in each direction
-/// particle_to_bucket_list : lookup table what particle is in what bucket
-/// interaction_range : the cutoff range for interaction. Has to be smaller
-///                 than half the boxlength.
-/// total_bucket_num : number of buckets
-__global__ void sd_find_interacting_particles(const real * pos,const real * _L, const int N, int * interacting, int * num_interacting,
-					      const int * particle_count, const int * particle_sorted_list, const real * bucket_size_,
-					      const int * bucket_num_, const int * particle_to_bucket_list, const real interaction_range,
-					      const int total_bucket_num){
-  const int lda_short=((N+31)/32)*32;
-  const real interaction_range_squared=SQR(interaction_range);
-  __shared__ real bucket_size[3];
-  __shared__ int bucket_num[6];
-#ifdef SD_USE_FLOAT
-  __shared__ real cachedPos[4*numThreadsPerBlock];
-#else
-  __shared__ real cachedPos[3*numThreadsPerBlock];
-#endif
-  // copy small vectors to shared memory
-#ifndef SD_NOT_PERIODIC
-  __shared__ real L[3];
-  if (threadIdx.x < 3){
-    L[threadIdx.x]=_L[threadIdx.x];
-  }
-#endif
-  if (threadIdx.x < 3){
-    bucket_size[threadIdx.x]=bucket_size_[threadIdx.x];
-    bucket_num[threadIdx.x]=bucket_num_[threadIdx.x];
-  }
-  int i = blockIdx.x*blockDim.x + threadIdx.x;
-  for (int l=0;l<3;l++){
-    cachedPos[threadIdx.x+l*numThreadsPerBlock] = pos[threadIdx.x+l*numThreadsPerBlock+blockIdx.x*blockDim.x*3];
-  }
-  __syncthreads();
-  int interactions=0;
-  if ( i < N){
-    int my_bucket=particle_to_bucket_list[i];
-    int bucket_x=my_bucket%bucket_num[0];
-    int bucket_y=(my_bucket/bucket_num[0])%bucket_num[1];
-    int bucket_z=my_bucket/(bucket_num[0]*bucket_num[1]);
-    int search_x=__real2int_ru(interaction_range/bucket_size[0]);
-    int search_y=__real2int_ru(interaction_range/bucket_size[1]);
-    int search_z=__real2int_ru(interaction_range/bucket_size[2]);
-    int cz = bucket_z-search_z;
-    while (cz < 0){
-      cz+=bucket_num[2];
-    }
-    int cy0 = bucket_y-search_y;
-    while (cy0 < 0){
-      cy0 += bucket_num[1];
-    }
-    int cx0=bucket_x-search_x;
-    while (cx0 < 0){
-      cx0 += bucket_num[0];
-    }
-    int current_bucket  = cz*bucket_num[1];
-    current_bucket += cy0;
-    current_bucket *= bucket_num[0];
-    current_bucket += cx0;
-    for (int z = -search_z;z<=search_z;z++){
-      int cy=cy0;
-      int z_bucket=current_bucket;
-      for (int y=-search_y;y<=search_y;y++){
-	if (cy == bucket_num[1]){
-	  cy = 0;
-	  current_bucket -= bucket_num[0]*bucket_num[1];
-	}
-	int y_bucket = current_bucket;
-	int cx=cx0;
-	for (int x=-search_x;x<=search_x;x++){
-	  if (cx == bucket_num[2]){
-	    cx = 0;
-	    current_bucket -= bucket_num[0];
-	  }
-	
-	  //assert((current_bucket>=0 && current_bucket < total_bucket_num));
-	  int in_loop=particle_count[current_bucket];
-	  for (int j_loop=0;j_loop<in_loop;j_loop++){
-	    int j=particle_sorted_list[current_bucket+j_loop*total_bucket_num];
-	    real dr2=0;
-#pragma unroll 3
-	    for (int d=0;d<3;d++){
-	      real dr=cachedPos[threadIdx.x*3+d]-pos[3*j+d];
-#ifndef SD_NOT_PERIODIC
-	      dr-=L[d]*rint(dr/L[d]);
-#endif
-	      //#warning folding back
-	      dr2+=dr*dr;
-	    }
-	    if (dr2<interaction_range_squared){
-	      interacting[i+interactions*lda_short]=j;
-	      interactions++;
-	    } 
-	  }
-	  cx++;
-	  if (cx == bucket_num[0]){
-	    cx = 0;
-	    current_bucket -= bucket_num[0];
-	  }
-	  current_bucket++;
-	} // end of x loop
-	current_bucket = y_bucket;
-	cy++;
-	if (cy == bucket_num[1]){
-	  cy = 0;
-	  current_bucket -= bucket_num[0]*bucket_num[1];
-	}
-	current_bucket += bucket_num[0];
-      }
-      current_bucket  = z_bucket;
-      cz++;
-      if (cz == bucket_num[2]){
-	cz = 0;
-	current_bucket -= total_bucket_num;
-      }
-      current_bucket += bucket_num[0]*bucket_num[1];
-    }
-  }
-  num_interacting[i]=interactions;
-  // sort the interactions
-  /*int loop = reduce_max((int *)cachedPos,interactions); // buggy
-  if ( i < N){
-    for (int j=0;j < loop;j++){
-      if (j>=interactions){
-	interacting[i+j*lda_short]=N+1;
-      }
-    }
-    bool swapped;
-    do {
-      swapped=false;
-      int last=interacting[i];
-      int cur;
-      for (int j=1;j<loop;j++){
-	cur=interacting[i+lda_short*j];
-	if (last > cur){
-	  //swap
-	  interacting[i+lda_short*(j-1)]=cur;
-	  interacting[i+lda_short*j]    =last;
-	  swapped=true;
-	} else {
-	  last=cur;
-	}
-      }
-      loop--;
-    } while (swapped);
-  }*/
-  if ( i < N){
-    int loop = interactions;
-    do {
-      int last=interacting[i];
-      int cur;
-      int last_j=0;
-      for (int j=1;j<loop;j++){
-	cur=interacting[i+lda_short*j];
-	if (last > cur){
-	  //swap
-	  interacting[i+lda_short*(j-1)]=cur;
-	  interacting[i+lda_short*j]    =last;
-	  last_j=j;
-	} else {
-	  last=cur;
-	}
-      }
-      loop=last_j;
-    } while (loop>0);
-  }
-  
-}
-
-/// Compute the lubrication correction in the resistance formulation.
-/// pos           : is the vector of [x_1, y_1, z_1, x_2, y_2, z_2, ...]
-/// N             : is the number of particles
-/// self_mobility : is 1./(6.*PI*eta*a)
-/// a             : is the particle radius
-/// _L            : is the boxlength
-/// resistance    : is the sparse resistance matrix which will be returned
-/// col_idx       : list of interacting partilces
-/// row_l         : number of col_idx entries per particle
-/// myInfo        : contains infos about the operation:
-///                myInfo[0] : number of overlapping particles
-///                myInfo[1] : number of interacting particles (via nf)
-///                myInfo[2] : max number of interacting particles per particle
-__global__ void sd_compute_resistance_matrix_sparse(const real * pos, const int N, const real self_mobility,const real a,const real * _L,
-  						    real * resistance,const int * col_idx,const int * row_l, int * myInfo){
-   const int lda_short=((N+31)/32)*32;
-   const int lda = ((N*3+31)/32)*32;
-#ifdef SD_USE_FLOAT
-  __shared__ real cachedPos[4*numThreadsPerBlock];
-#else
-  __shared__ real cachedPos[3*numThreadsPerBlock];
-#endif
-  // copy small vectors to shared memory
-#ifndef SD_NOT_PERIODIC
-  __shared__ real L[3];
-  if (threadIdx.x < 3){
-    L[threadIdx.x]=_L[threadIdx.x];
-  }
-#endif
-  real myresistance[6]={0,0,0,0,0,0};
-  int i = blockIdx.x*blockDim.x + threadIdx.x;
-  // get data for myposition - but coalscaled
-#pragma unroll 3
-  for (int l=0;l<3;l++){
-    cachedPos[threadIdx.x+l*numThreadsPerBlock] = pos[threadIdx.x+l*numThreadsPerBlock+blockIdx.x*blockDim.x*3];
-  }
-  
-  __syncthreads();
-  
-  int num_interactions=i<N?row_l[i]:0;
-  int i_index;
-  for (int j_loop=0;j_loop<num_interactions;j_loop++){
-    int j=col_idx[i+j_loop*lda_short];
-    real dr[DIM];
-    real dr2=0;
-#pragma unroll
-    for (int d=0;d<DIM;d++){
-      dr[d]=cachedPos[threadIdx.x*3+d]-pos[3*j+d]; // r_ij
-#ifndef SD_NOT_PERIODIC
-      dr[d]-=L[d]*rint(dr[d]/L[d]); // fold back
-#endif
-      dr2+=dr[d]*dr[d];
-      mydebug("dr[%d]: %f\n",d,dr[d]);
-    }
-#ifdef SD_RESISTANCE_CORRECT
-    real r2bcorr_diag_self     = 0;
-    real r2bcorr_diag_mix      = 0;
-    real r2bcorr_offdiag_self  = 0;
-    real r2bcorr_offdiag_mix   = 0;
-#else
-    real offdiag_fac=0;
-    real diag_fac=0;
-#endif
-    if (i >= N || i ==j || j >= N){
-      ;
-    }
-    else if (dr2 < 4*a*4*a){
-      if (!(2*a*2*a < dr2 )){
-	if (i==j){
-	  i_index=j_loop;
-	} else {
-	  atomicAdd(myInfo,1); // count overlapping particles
-	}
-      }
-      else {// 2*a < drn < 4*a 
-	// python code:
-	// # Use only singular therms, namely to order O(s_ij^0)                                                                  
-	// T=(1./4./s-1/4-9./40.*ls)*dr*drt/dr2
-	// #           ^ this additonal constant is so that the mobility is smooth
-	// # c.f. N.-Q. Nguyen and A. J. C. Ladd, PHYSICAL REVIEW E 66, 046708 (2002) equation (34)                               
-	// T+=1./6.*ls*(-one+dr*drt/dr2)
-	// R[3*i:3*i+3,3*j:3*j+3]=-T
-	// R[3*i:3*i+3,3*i:3*i+3]+=T
-	real drn= sqrt(dr2); // length of dr
-	real s = drn/a-2;
-	real ls = log(s);
-	
-	real const para_fac_c=-0.125+(9./40.)*log(2.)+3./112.*2.*log(2.);
-#ifdef SD_RESISTANCE_CORRECT
-	real para_fac    =(-0.25/s+(9./40.)*ls+(3./112.)*s*ls);
-	real perp_fac    =((1./6.)*ls);
-#else
-	real const perp_fac_c=1./6.*log(2.);
-	real para_fac    =(-0.25/s+(9./40.)*ls+(3./112.)*s*ls-para_fac_c)/dr2/self_mobility;
-	real perp_fac    =((1./6.)*ls-perp_fac_c)/self_mobility;
-	diag_fac    = perp_fac;
-	offdiag_fac = para_fac-perp_fac;
-#endif
-#ifdef SD_RESISTANCE_CORRECT
-	real dr4=dr2*dr2;
-	real dr6=dr4*dr2;
-	// constants for correction at cutoff
-	const real dr_c1 = 4;
-	const real dr_c2 = 4*4;
-	const real dr_c3 = 4*4*4;
-	const real dr_c4 = 4*4*4*4;
-	const real dr_c5 = 4*4*4*4*4;
-	const real dr_c6 = 4*4*4*4*4*4;
-	
-	const real r2bcorr_para_self_c  =                  1./(1.-9./4./dr_c2+3./dr_c4-1./dr_c6) + para_fac_c;
-	const real r2bcorr_para_mix_c   = (6.*dr_c5-4.*dr_c3)/(4.*dr_c6-9.*dr_c4+12.*dr_c2-4.)   + para_fac_c;
-	const real r2bcorr_perp_self_c  =                  1./(1.-25./16./dr_c2)                 + 1./6.*log(2.);
-	const real r2bcorr_perp_mix_c   =                  1./(16./20.*dr_c1-25./20./dr_c1)      + 1./6.*log(2.);
-	// TODO: Make sure to use (real) and not double ...
-	// real computation
-	real r2bcorr_para_self     =-( para_fac  + (                      1./(1.-9./4./dr2+3./dr4-1./dr6)  - r2bcorr_para_self_c ));
-	real r2bcorr_para_mix      = ( para_fac  + ( (6.*dr4*drn-4.*dr2*drn)/(4.*dr6-9.*dr4+12.*dr2-4.)    - r2bcorr_para_mix_c  ));
-	real r2bcorr_perp_self     =-( perp_fac  + (                      1./(1.-((real)25./16.)/dr2)      - r2bcorr_perp_self_c ));
-	real r2bcorr_perp_mix      = ( perp_fac  + (                      1./(16./20.*drn-25./20./drn)     - r2bcorr_perp_mix_c  ));
-	//printf("%d %d   show  [ %e, %e,  %e, %e ]\n",i,j,r2bcorr_para_self,r2bcorr_perp_self,r2bcorr_para_mix,r2bcorr_perp_mix);
-	
-	r2bcorr_diag_self     = (r2bcorr_perp_self)/self_mobility;
-	r2bcorr_diag_mix      = (r2bcorr_perp_mix )/self_mobility;
-	r2bcorr_offdiag_self  = (r2bcorr_para_self - r2bcorr_perp_self) /self_mobility/dr2;
-	r2bcorr_offdiag_mix   = (r2bcorr_para_mix  - r2bcorr_perp_mix ) /self_mobility/dr2;
-#endif
-      }
-    }
-    if (i < N){
-#pragma unroll 3
-      for (int k=0; k < DIM; k++){
-#pragma unroll 3
-	for (int l=0;l < DIM; l++){
-#ifdef SD_RESISTANCE_CORRECT
-	  resistance[DIM*i+k+(DIM*j_loop+l)*lda]=dr[k]*dr[l]*r2bcorr_offdiag_mix;
-#else
-	  resistance[DIM*i+k+(DIM*j_loop+l)*lda]=dr[k]*dr[l]*offdiag_fac;
-#endif
-	}
-#ifdef SD_RESISTANCE_CORRECT
-	myresistance[k]-=dr[k]*dr[k]*r2bcorr_offdiag_self;
-	resistance[DIM*i+k+(DIM*j_loop+k)*lda]+=r2bcorr_diag_mix;
-	myresistance[k]-=r2bcorr_diag_self;
-#else
-	myresistance[k]-=dr[k]*dr[k]*offdiag_fac;
-	resistance[DIM*i+k+(DIM*j_loop+k)*lda]+=diag_fac;
-	myresistance[k]-=diag_fac;
-#endif
-      }
-    }  
-#ifdef SD_RESISTANCE_CORRECT
-    myresistance[3]-=r2bcorr_offdiag_self*dr[0]*dr[1];
-    myresistance[4]-=r2bcorr_offdiag_self*dr[0]*dr[2];
-    myresistance[5]-=r2bcorr_offdiag_self*dr[1]*dr[2];
-#else
-    myresistance[3]-=offdiag_fac*dr[0]*dr[1];
-    myresistance[4]-=offdiag_fac*dr[0]*dr[2];
-    myresistance[5]-=offdiag_fac*dr[1]*dr[2];
-#endif
-    // python implementation:
-    //T=one*(1-9./32.*drn/a)+3./32.*dr*drt/drn/a;
-  }
-  
-  /*else{ // set the block to zero
-  // it might be faster to set everything in the beginning to zero ...
-  // or use sparse matrices ...
-  #pragma unroll 3
-  for (int k=0; k < DIM; k++){
-  #pragma unroll 3
-  for (int l=0;l < DIM; l++){
-  resistance[myindex(DIM*i+k,DIM*j+l)]=0;
-  }
-  }  
-  }*/
-  
-  if ( i < N){
-    resistance[DIM*i+0+(DIM*i_index+0)*lda]=myresistance[0];
-    resistance[DIM*i+0+(DIM*i_index+1)*lda]=myresistance[3];
-    resistance[DIM*i+0+(DIM*i_index+2)*lda]=myresistance[4];
-    resistance[DIM*i+1+(DIM*i_index+0)*lda]=myresistance[3];
-    resistance[DIM*i+1+(DIM*i_index+1)*lda]=myresistance[1];
-    resistance[DIM*i+1+(DIM*i_index+2)*lda]=myresistance[5];
-    resistance[DIM*i+2+(DIM*i_index+0)*lda]=myresistance[4];
-    resistance[DIM*i+2+(DIM*i_index+1)*lda]=myresistance[5];
-    resistance[DIM*i+2+(DIM*i_index+2)*lda]=myresistance[2];
-  }
-  /*
-  __syncthreads();
-  int * sharedInteractions = (int *) cachedPos; // reuse shared memory
-  int * maxInteractions    = sharedInteractions + blockDim.x*2;
-  sharedInteractions[threadIdx.x]=num_iterations-1;
-  sharedInteractions[threadIdx.x+blockDim.x]=0;
-  maxInteractions[threadIdx.x]   =num_iterations-1;
-  maxInteractions[threadIdx.x+blockDim.x]   =0;
-  for (int t=(blockDim.x+1)/2;t>1;t=(t+1)/2){
-    if (threadIdx.x < t){
-      sharedInteractions[threadIdx.x]+=sharedInteractions[threadIdx.x+t];
-      sharedInteractions[threadIdx.x+t]=0;
-      maxInteractions[threadIdx.x]=max(maxInteractions[threadIdx.x+t],maxInteractions[threadIdx.x]);
-    }
-    __syncthreads();
-  }
-  if (threadIdx.x==0){
-    sharedInteractions[0]+=sharedInteractions[1];
-    atomicAdd(myInfo+1, sharedInteractions[0]);
-    maxInteractions[0]=max(maxInteractions[0],maxInteractions[1]);
-    atomicMax(myInfo+2, maxInteractions[0]);
-    }*/
-}
-
-
-#ifndef SD_RESISTANCE_CORRECT
-#warning "SD Brownian motion only support corrected resistance calculation ..."
-#endif
-/// Computing the brownian forces which arise from the nearfield contribution
-/// of the mobility matrix
-/// r        : position of the particles
-/// gaussian : pointer to a sufficent amount of gaussian distributed random
-///            numbers 
-/// N        : number of particles
-/// L_g      : size of periodic box
-/// a        : hydrodynamic particle radius
-/// self_mobility : is 1./(6.*PI*eta*a)
-/// brownian_force_nf : computed brownian force
-__global__ void sd_compute_brownian_force_nearfield(const real * r, const real * gaussian,int N,const real * L_g,
-						    real a, real self_mobility,real * brownian_force_nf){
-  const int gaussian_ldd=((N+31)/32)*32;
-  int interactions=0;
-  real mypos[3];
-  real writeCache[6];
-  //real otherWriteCache[3];
-  __shared__ real cachedPos[3*numThreadsPerBlock];
-  __shared__ real choleskyCache[12*numThreadsPerBlock];
-  //const int lda=(((N*3)+31)/32)*32;
-  //__shared__ real myresistance[6*numThreadsPerBlock];
-  //real myresistance[6];
-  //__shared__ real otherresistance[6*numThreadsPerBlock];
-  int i = blockIdx.x*blockDim.x + threadIdx.x;
-#ifndef SD_NOT_PERIODIC
-  __shared__ real L[3];
-  if (threadIdx.x < 3){ // copy L to shared memory
-    L[threadIdx.x]=L_g[threadIdx.x];
-  }
-#endif
-  for (int l=0;l<3;l++){
-    cachedPos[threadIdx.x+l*numThreadsPerBlock] = r[threadIdx.x+l*numThreadsPerBlock+blockIdx.x*blockDim.x*3];
-    writeCache[l]= 0;
-  }
-  __syncthreads();
-  for (int d=0;d<3;d++){
-    mypos[d] = cachedPos[threadIdx.x*3+d];
-  }
-  
-  for (int offset=0;offset<N;offset+=numThreadsPerBlock){
-    // copy positions to shared memory
-#pragma unroll
-    for (int l=0;l<3;l++){
-      cachedPos[threadIdx.x+l*numThreadsPerBlock] = r[threadIdx.x+l*numThreadsPerBlock+offset*3];
-    }
-    __syncthreads();
-for (int j=max(offset, blockIdx.x*numThreadsPerBlock+1);j<min(offset+numThreadsPerBlock,N);j++){
-      real dr[DIM];
-      real dr2=0;
-#pragma unroll
-      for (int k=0;k<DIM;k++){
-	dr[k]=mypos[k]-cachedPos[3*(j-offset)+k]; // r_ij
-#ifndef SD_NOT_PERIODIC
-	dr[k]-=L[k]*rint(dr[k]/L[k]); // fold back
-#endif
-	dr[k]/=a;
-	dr2+=dr[k]*dr[k];
-      }
-      real r2bcorr_diag_self     = 0;
-      real r2bcorr_diag_mix      = 0;
-      real r2bcorr_offdiag_self  = 0;
-      real r2bcorr_offdiag_mix   = 0;
-      
-#ifdef SD_DEBUG
-      for (int t=0;t<12;t++){
-	choleskyCache[threadIdx.x+ (t)*numThreadsPerBlock]=0;
-      }
-#endif
-      int wasInLoop = 0;
-      if (i >= N || i >= j || j >= N){
-	writeCache[3]=0;
-	writeCache[4]=0;
-	writeCache[5]=0;
-      }
-      // j > i
-      else if (dr2 < 16  && 4 < dr2 ){// 2*a < drn < 4*a 
-	wasInLoop = 1;
-	{
-	  real drn= sqrt(dr2); // length of dr
-	  real s = drn-2;
-	  real ls = log(s);
-	  
-	  real const para_fac_c=-0.125+(9./40.)*log(2.)+3./112.*2.*log(2.);
-	  real para_fac    =(-0.25/s+(9./40.)*ls+(3./112.)*s*ls);
-	  real perp_fac    =((1./6.)*ls);
-	  
-	  real dr4=dr2*dr2;
-	  real dr6=dr4*dr2;
-	  // constants for correction at cutoff
-	  const real dr_c1 = 4;
-	  const real dr_c2 = 4*4;
-	  const real dr_c3 = 4*4*4;
-	  const real dr_c4 = 4*4*4*4;
-	  const real dr_c5 = 4*4*4*4*4;
-	  const real dr_c6 = 4*4*4*4*4*4;
-	  
-	  const real r2bcorr_para_self_c  =                  1./(1.-9./4./dr_c2+3./dr_c4-1./dr_c6) + para_fac_c;
-	  const real r2bcorr_para_mix_c   = (6.*dr_c5-4.*dr_c3)/(4.*dr_c6-9.*dr_c4+12.*dr_c2-4.)   + para_fac_c;
-	  const real r2bcorr_perp_self_c  =                  1./(1.-25./16./dr_c2)                 + 1./6.*log(2.);
-	  const real r2bcorr_perp_mix_c   =                  1./(16./20.*dr_c1-25./20./dr_c1)      + 1./6.*log(2.);
-	  // TODO: Make sure to use (real) and not double ...
-	  // real computation
-	  real r2bcorr_para_self     =-( para_fac  + (                      1./(1.-9./4./dr2+3./dr4-1./dr6)  - r2bcorr_para_self_c ));
-	  real r2bcorr_para_mix      = ( para_fac  + ( (6.*dr4*drn-4.*dr2*drn)/(4.*dr6-9.*dr4+12.*dr2-4.)    - r2bcorr_para_mix_c  ));
-	  real r2bcorr_perp_self     =-( perp_fac  + (                      1./(1.-((real)25./16.)/dr2)      - r2bcorr_perp_self_c ));
-	  real r2bcorr_perp_mix      = ( perp_fac  + (                      1./(16./20.*drn-25./20./drn)     - r2bcorr_perp_mix_c  ));
-	  //printf("%d %d   show  [ %e, %e,  %e, %e ]\n",i,j,r2bcorr_para_self,r2bcorr_perp_self,r2bcorr_para_mix,r2bcorr_perp_mix);
-	  
-	  r2bcorr_diag_self     = (r2bcorr_perp_self)/self_mobility;
-	  r2bcorr_diag_mix      = (r2bcorr_perp_mix )/self_mobility;
-	  r2bcorr_offdiag_self  = (r2bcorr_para_self - r2bcorr_perp_self) /self_mobility/dr2;
-	  r2bcorr_offdiag_mix   = (r2bcorr_para_mix  - r2bcorr_perp_mix ) /self_mobility/dr2;
-	  //printf("%d %d   shoz     [%e, %e, %e, %e]\n",i,j,r2bcorr_diag_self,r2bcorr_offdiag_self,r2bcorr_diag_mix,r2bcorr_offdiag_mix);
-	  //r2bcorr_offdiag_self /=  dr2;
-	  //r2bcorr_offdiag_mix  /=  dr2;
-	}
-	// This is the cholesky decomposition.
-	// note that we try to avoid the usage of registers, so we use shared mem
-	// myCC is a makro, defined here to shorten the lines:
-#define myCC(pos) choleskyCache[threadIdx.x+ (pos)*numThreadsPerBlock]
-	// without it would look more like this:
-	//choleskyCache[threadIdx.x+ 0*numThreadsPerBlock] = sqrt(r2bcorr_diag_self+r2bcorr_offdiag_self*dr[0]*dr[0]);
-	//choleskyCache[threadIdx.x+ 1*numThreadsPerBlock] = r2bcorr_offdiag_self*dr[0]*dr[1] / choleskyCache[threadIdx.x+ 0*numThreadsPerBlock];
-	// L_{1,1} to L_{6,1}
-	myCC(0)  = sqrt(r2bcorr_diag_self+r2bcorr_offdiag_self*dr[0]*dr[0]);
-	myCC(1)  =                        r2bcorr_offdiag_self*dr[0]*dr[1] / myCC(0);
-	myCC(2)  =                        r2bcorr_offdiag_self*dr[0]*dr[2] / myCC(0);
-	myCC(3)  =    (r2bcorr_diag_mix + r2bcorr_offdiag_mix *dr[0]*dr[0])/ myCC(0);
-	myCC(4)  =                        r2bcorr_offdiag_mix *dr[0]*dr[1] / myCC(0);
-	myCC(5)  =                        r2bcorr_offdiag_mix *dr[0]*dr[2] / myCC(0);
-	
-	writeCache[0]+=myCC(0)  * gaussian[0*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions];
-	writeCache[1]+=myCC(1)  * gaussian[0*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions];
-	writeCache[2]+=myCC(2)  * gaussian[0*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions];
-	writeCache[3] =myCC(3)  * gaussian[0*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions];
-	writeCache[4] =myCC(4)  * gaussian[0*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions];
-	writeCache[5] =myCC(5)  * gaussian[0*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions];
-	// used: 6
-	// L_{2,2} to L_{6,2}
-	myCC(0)  = sqrt(r2bcorr_diag_self+r2bcorr_offdiag_self*dr[1]*dr[1] - SQR(    myCC(1)));
-	myCC(6)  =                       (r2bcorr_offdiag_self*dr[1]*dr[2] - myCC(2)*myCC(1))/myCC(0);
-	myCC(7)  =                       (r2bcorr_offdiag_mix *dr[1]*dr[0] - myCC(3)*myCC(1))/myCC(0);
-	myCC(8)  =     (r2bcorr_diag_mix +r2bcorr_offdiag_mix *dr[1]*dr[1] - myCC(4)*myCC(1))/myCC(0);
-	myCC(9)  =                       (r2bcorr_offdiag_mix *dr[1]*dr[2] - myCC(5)*myCC(1))/myCC(0);
-	writeCache[1]+=myCC(0)  * gaussian[1*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions];
-	writeCache[2]+=myCC(6)  * gaussian[1*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions];
-	writeCache[3]+=myCC(7)  * gaussian[1*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions];
-	writeCache[4]+=myCC(8)  * gaussian[1*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions];
-	writeCache[5]+=myCC(9)  * gaussian[1*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions];
-	// used: 11 - 1
-	// L_{3,3} to L_{6,3}
-	myCC(0)  = sqrt(r2bcorr_diag_self+r2bcorr_offdiag_self*dr[2]*dr[2] - SQR(    myCC(2))- SQR(    myCC(6)));
-	myCC(1)  =                       (r2bcorr_offdiag_mix *dr[2]*dr[0] - myCC(3)*myCC(2) - myCC(7)*myCC(6))/myCC(0);
-	myCC(10) =                       (r2bcorr_offdiag_mix *dr[2]*dr[1] - myCC(4)*myCC(2) - myCC(8)*myCC(6))/myCC(0);
-	myCC(11) =     (r2bcorr_diag_mix +r2bcorr_offdiag_mix *dr[2]*dr[2] - myCC(5)*myCC(2) - myCC(9)*myCC(6))/myCC(0);
-	writeCache[2]+=myCC(0)  * gaussian[2*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions];
-	writeCache[3]+=myCC(1)  * gaussian[2*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions];
-	writeCache[4]+=myCC(10) * gaussian[2*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions];
-	writeCache[5]+=myCC(11) * gaussian[2*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions];
-	// used: 15 - 3
-	// L_{4,4} to L_{6,4}
-	myCC(0)  = sqrt(r2bcorr_diag_self+r2bcorr_offdiag_self*dr[0]*dr[0] - SQR(    myCC(3))- SQR(    myCC(7))
-			- SQR(     myCC(1)));
-	if (isnan(myCC(0))){
-	  printf("%d %d : [%e %e %e]\n",i,j,dr[0],dr[1],dr[2]);
-	}
-	myCC(2)  =                       (r2bcorr_offdiag_self*dr[0]*dr[1] - myCC(4)*myCC(3) - myCC(8)*myCC(7) 
-					  - myCC(10)*myCC(1))/myCC(0);
-	myCC(6)  =                       (r2bcorr_offdiag_self*dr[0]*dr[2] - myCC(5)*myCC(3) - myCC(9)*myCC(7) 
-					  - myCC(11)*myCC(1))/myCC(0);
-	writeCache[3]+=myCC(0)  * gaussian[3*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions];
-	writeCache[4]+=myCC(2)  * gaussian[3*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions];
-	writeCache[5]+=myCC(6)  * gaussian[3*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions];
-	// used: 18 - 6
-	// L_{5,5} and L_{6,5}
-	myCC(0)  = sqrt(r2bcorr_diag_self+r2bcorr_offdiag_self*dr[1]*dr[1] - SQR(    myCC(4))- SQR(    myCC(8))
-			- SQR(     myCC(10))- SQR(    myCC(2)));
-	if (isnan(myCC(0))){
-	  printf("%d %d : [%e %e %e] \n",i,j,dr[0],dr[1],dr[2]);
-	}
-	myCC(3)  =                       (r2bcorr_offdiag_self*dr[1]*dr[2] - myCC(5)*myCC(4) - myCC(9)*myCC(8) 
-					  - myCC(11)*myCC(10) - myCC(6)*myCC(2))/myCC(0);
-	writeCache[4]+=myCC(0)  * gaussian[4*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions];
-	writeCache[5]+=myCC(3)  * gaussian[4*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions];
-	// used: 20 - 10
-	// L_{6,6} would be:
-	myCC(0) = sqrt(r2bcorr_diag_self+r2bcorr_offdiag_self*dr[2]*dr[2] - SQR(myCC(5))    - SQR(myCC(9))     
-		       - SQR(myCC(11)) - SQR(myCC(6)) - SQR(myCC(3)));
-	if (isnan(myCC(0))){
-	  printf("%d %d : [%e %e %e] \n",i,j,dr[0],dr[1],dr[2]);
-	}
-	writeCache[5]+=myCC(0)  * gaussian[5*gaussian_ldd+threadIdx.x+blockDim.x*blockIdx.x+6*gaussian_ldd*interactions];
-	// used 21 - 15
-	interactions++;
-	for (int l=0;l<6;l++){
-	  if (isnan(writeCache[l])){
-	    printf("%d %d, %d: dr=[%e, %e, %e]\n",i,j,l,dr[0],dr[1],dr[2]);
-	  }
-	}
-      }
-      // for the particle j (writeCache[3-5]) we can reduce localy:
-      
-      int * haveInteraction = (int *) choleskyCache+6*numThreadsPerBlock; // reuse shared memory
-      choleskyCache[threadIdx.x+0*numThreadsPerBlock]=writeCache[3];
-      choleskyCache[threadIdx.x+1*numThreadsPerBlock]=0;
-      choleskyCache[threadIdx.x+2*numThreadsPerBlock]=writeCache[4];
-      choleskyCache[threadIdx.x+3*numThreadsPerBlock]=0;
-      choleskyCache[threadIdx.x+4*numThreadsPerBlock]=writeCache[5];
-      choleskyCache[threadIdx.x+5*numThreadsPerBlock]=0;
-      haveInteraction[threadIdx.x]=wasInLoop;
-      haveInteraction[threadIdx.x+numThreadsPerBlock]=0;
-      for (int t=(blockDim.x+1)/2;t>1;t=(t+1)/2){
-	if (threadIdx.x < t){
-	  choleskyCache[threadIdx.x]+=choleskyCache[threadIdx.x+t];
-	  choleskyCache[threadIdx.x+2*numThreadsPerBlock]+=choleskyCache[threadIdx.x+t +2*numThreadsPerBlock];
-	  choleskyCache[threadIdx.x+4*numThreadsPerBlock]+=choleskyCache[threadIdx.x+t +2*numThreadsPerBlock];
-	  haveInteraction[threadIdx.x]|=haveInteraction[threadIdx.x+t];
-	  choleskyCache[threadIdx.x+t]=0;
-	  choleskyCache[threadIdx.x+t +2*numThreadsPerBlock]=0;
-	  choleskyCache[threadIdx.x+t +4*numThreadsPerBlock]=0;
-	  haveInteraction[threadIdx.x+t]=0;
-	}
-	__syncthreads();
-      }
-      if (threadIdx.x==0){
-	if (haveInteraction[0] || haveInteraction[1]){
-	  choleskyCache[0]+=choleskyCache[1];
-	  choleskyCache[2*numThreadsPerBlock]+=choleskyCache[1+2*numThreadsPerBlock];
-	  choleskyCache[4*numThreadsPerBlock]+=choleskyCache[1+4*numThreadsPerBlock];
-	  real tmp=atomicAdd(brownian_force_nf+j*3,   choleskyCache[0]);
-	  bool error=isnan(tmp);
-	  tmp = atomicAdd(brownian_force_nf+j*3+1, choleskyCache[2*numThreadsPerBlock]);
-	  error|=isnan(tmp);
-	  tmp = atomicAdd(brownian_force_nf+j*3+2, choleskyCache[4*numThreadsPerBlock]);
-	  error|=isnan(tmp);
-	  if (error){
-	    printf("%d %d: dr=[%e, %e, %e]\n",i,j,dr[0],dr[1],dr[2]);
-	  }
-	}
-      }
-    }
-  }
-  if ( i < N){
-#pragma unroll 3
-    for (int k=0;k<3;k++){
-      real tmp = atomicAdd(brownian_force_nf+i*3+k, writeCache[k]);
-      //assert(!isnan(tmp));
-    }
-  }
-}
-
-/// computing the matrix vector y = c * A * x product with a square sparse
-/// matrix A and a scalar factor c
-/// size     : size of the matrix and of the input and output vectors
-/// factor   : factor $c$ is the given prefactor
-/// matrix   : data of the sparse matrix
-/// ldd      : leading dimension of the matrix
-/// ldd_short : leading dimension of col_idx
-/// col_idx  : indices of the entries in the matrix
-/// row_l    : number of entris per row
-/// vec_in   : the input vecor $x$
-/// vec_out  : the output vector $y$
-__global__ void sd_multiply_sparse_matrix_vector(const int size, const real factor, const real * matrix, const int ldd, const int ldd_short,
-						 const int * col_idx, const int * row_l, const real * vec_in, real * vec_out ){
-  int id = threadIdx.x + blockIdx.x*blockDim.x;
-  if (id<size){
-    int i=id/3;
-    int rows=row_l[i];
-    real erg=0;
-    for (int row=0;row<rows;row++){
-      int colid=col_idx[i+row*ldd_short];
-      for (int k=0;k<3;k++){
-	// asm version is about 33% faster
-	erg+=read_without_caching(matrix+id+(row*3+k)*ldd)*vec_in[colid*3+k];
-	//erg+=matrix[id+(row*3+k)*ldd]*vec_in[colid*3+k];
-      }
-    }
-    vec_out[id]=erg*factor;
-  }
-}
-
-/// this adds the identity matrix to a given matrix of ld=size
-/// matrix   : pointer to the given matrix
-/// size     : the size of the matrix (in the example below 3N)
-/// block    : (ignored) the number of elements to process per thread
-///            if this is e.g. 3 and the matrix is 3Nx3N, than N threads have
-///            to be started 
-__global__ void sd_add_identity_matrix(real * matrix, int size, int lda){
-  //int lda=((size+31)/32)*32;
-  int idx = blockIdx.x*blockDim.x + threadIdx.x;
-  //for (int i = idx*block; i< (idx+1)*block; i++){
-  for (int i = idx;i< size; i+=blockDim.x*gridDim.x){
-    matrix[i+i*lda]+=1;
-  }
-}
-
-/// this sets a block to zero
-/// matrix   : pointer to the given matrix
-/// size     : the size of the matrix (in the example below 3N)
-/// ldd      : the leading dimension of the matrix
-__global__ void sd_set_zero_matrix(real * matrix, int size, int ldd){
-  int idx = blockIdx.x*blockDim.x + threadIdx.x;
-  int matsize=ldd;
-  matsize*=size;
-  for (int i = idx;i< matsize; i+=blockDim.x*gridDim.x){
-    matrix[i]=0;
-  }
-}
-
-
-/// this sets a block to zero
-/// data     : pointer to the given data
-/// size     : the size of the data
-__global__ void sd_set_zero(real * data, int size){
-  int idx = blockIdx.x*blockDim.x + threadIdx.x;
-  for (int i = idx;i< size; i+=blockDim.x*gridDim.x){
-    data[i]=0;
-  }
-}
-
-/// this sets a block to an given integer
-/// data     : pointer to the given data
-/// size     : the size of the data
-/// value    : the value written to the data block
-__global__ void sd_set_value(int * data, int size, int value){
-  int idx = blockIdx.x*blockDim.x + threadIdx.x;
-  for (int i = idx;i< size; i+=blockDim.x*gridDim.x){
-    data[i]=value;
-  }
-}
-
-/// this sets a block to an given real
-/// data     : pointer to the given data
-/// size     : the size of the data
-/// value    : the value written to the data block
-__global__ void sd_set_value(real * data, int size, real value){
-  int idx = blockIdx.x*blockDim.x + threadIdx.x;
-  for (int i = idx;i< size; i+=blockDim.x*gridDim.x){
-    data[i]=value;
-  }
-}
-
-
-
-
-#define DIST (2+1e-1)
-#define DISP_MAX (1000)
-/// Checking before the actual integration whether any particles moves more
-/// than DISP_MAX and if so, reducing the displacement to DISP_MAX
-/// r_d      : position of the particles
-/// disp_d   : displacements of the particles
-/// L        : box dimension
-/// a        : particle radius
-/// N        : number of particles
-__global__ void sd_real_integrate_prepare( const real * r_d , real * disp_d, const real * L, real a, int N){
-  int i=blockIdx.x*blockDim.x + threadIdx.x;
-  i*=3;
-  real disp2;
-#pragma unroll
-  for (int d=0;d<3;d++){
-    disp2+=disp_d[i+d]*disp_d[i+d];
-  }
-  if (disp2> DISP_MAX*DISP_MAX){
-    disp2=DISP_MAX/sqrt(disp2);
-#pragma unroll
-    for (int d=0;d<3;d++){
-      disp_d[i+d]*=disp2;
-    }
-  }
-}
-
-/// if SD_AVOID_COLLISION is set avoid that particles are overlapping after
-/// the displacement, else only perform the integration step
-/// r_d      : position of the particles
-/// disp_d   : displacement of the particles
-/// L        : box dimensions
-/// a        : particle radius
-/// N        : number of particles
-__global__ void sd_real_integrate( real * r_d , const real * disp_d, const real * L, real a, int N)
-{
-  int idx =  blockIdx.x*blockDim.x + threadIdx.x;
-  // t is the factor how far of disp_d we will move.
-  // in case everything is fine, we will move t, if there is some trouble,
-  // we will move less to avoid collision
-  real t=1;
-#ifdef SD_AVOID_COLLISON
-  real rnew[DIM];
-  for (int d=0;d<DIM;d++){
-    rnew[d]=r_d[DIM*idx+d]+disp_d[DIM*idx+d];
-  }
-  const real distmin=(3*a)*(3*a);
-  for (int i=0;i<N;i++){
-    if (idx==i){
-      i++;
-      if (i >N){
-	continue;
-      }
-    }
-    real dr2=0;
-    for (int d=0;d<DIM;d++){
-      real tmp=r_d[i*DIM+d]-rnew[d];
-#ifndef SD_NOT_PERIODIC
-      tmp-=L[d]*rint(tmp/L[d]);
-#endif
-      dr2+=tmp*tmp;
-    }
-    if (dr2 <distmin){ // possible colision - check better
-      dr2=0;
-      //real dr2o=0; // or do we need old distance?
-      for (int d=0;d<DIM;d++){
-	real tmp=r_d[i*DIM+d]+disp_d[i*DIM+d]-rnew[d];
-#ifndef SD_NOT_PERIODIC
-	tmp-=L[d]*rint(tmp/L[d]);
-#endif
-	dr2+=tmp*tmp;
-	//tmp=r_d[i*DIM+d]-r_d[idx*DIM+d];
-	//tmp-=L*rint(tmp/L);
-	//dr2o+=tmp*tmp;
-      }
-      if (dr2 < DIST*DIST*a*a){ // do they collide after the step?
-	// ideal: the motion which is responsible for the crash: avoid it.
-	// just move them that far that they nearly touch each other.
-	// therefore we need the soluten of an quadratic equation
-	// in case they are already closer than DIST*a this will move them appart.
-	// first: get the coefficents
-	real alpha=0,beta=0,gamma=0;
-	for (int d=0;d<DIM;d++){
-	  real t1=r_d[i*DIM+d]-r_d[idx*DIM+d];
-#ifndef SD_NOT_PERIODIC
-	  t1-=L[d]*rint(t1/L[d]);
-#endif
-	  real t2=disp_d[i*DIM+d]-disp_d[idx*DIM+d];
-	  //t2-=L*rint(t2/L); // we would have a problem if we would need to fold back these ...
-	  alpha +=t2*t2;
-	  beta  +=2*t1*t2;
-	  gamma +=t1*t1;
-	} 
-	// now we want to solve for t: alpha*t**2+beta*t+gamma=DIST*a
-	// we want the solution with the minus in the 'mitternachtsformel'
-	// because the other solution is when the particles moved through each other
-	real tnew = (-beta-sqrt(beta*beta-4*alpha*gamma))/(2*alpha);
-	if (tnew < t){ // use the smallest t
-	  t=tnew;
-	}
-      }
-    }
-  }
-#endif
-  for (int d=0;d<DIM;d++){ // actually do the integration
-    r_d[DIM*idx+d]+=disp_d[DIM*idx+d]*t;
-  }
-  //#warning "Debug is still enabaled"
-    //pos_d[DIM*N+idx]=t;
-}
-
-/// sorting the particles in buckets
-/// pos      : position of the particles
-/// bucketSize : dimension of bucket
-/// bucketNum : number of buckets in each direction
-/// N        : number of particles
-/// particleCount : number of particles within a bucket
-/// particleList : list of the particles within a given bucket
-/// totalBucketNum : number of buckets, product of the three values of
-///            bucketNum
-/// particleToBucketList : array containg the bucket number for each particle
-__global__ void sd_bucket_sort( const real * pos , const real * bucketSize, const int * bucketNum, int N, int * particleCount,
-				int * particleList, int maxParticlePerCell, int totalBucketNum, int * particleToBucketList){
-  for (int i = blockIdx.x*blockDim.x + threadIdx.x;
-       i<N ;
-       i+=blockDim.x*gridDim.x){
-    int3 bucket;
-#pragma unroll 3
-    for (int d =0; d<3; d++){
-      real tmp;
-      // no asm version:
-      // tmp = pos[i*3+d];
-      // asm version avoids caching
-#ifdef SD_USE_FLOAT
-      asm("ld.global.cs.f32 %0,[%1];\n"
-	: "=f"(tmp) : "l"(pos+i*3+d) : );
-#else
-      asm("ld.global.cs.f64 %0,[%1];\n"
-	: "=d"(tmp) : "l"(pos+i*3+d) : );
-#endif
-      tmp/=bucketSize[d];
-      int x;
-      // this should work with compute capability >= 2.0
-      x=__real2int_rd(tmp);
-      //x=floor(tmp);
-      x%=bucketNum[d];
-      // avoid negativ numbers
-      x= (x < 0)?x+bucketNum[d]: x;
-      //x+=bucketNum[d];
-      //x%=bucketNum[d];
-      switch (d){
-      case 0:
-	bucket.x = x;
-	break;
-      case 1:
-	bucket.y = x;
-	break;
-      case 2:
-	bucket.z = x;
-	break;
-      }
-    }
-    int myBucket = bucket.x + bucket.y*bucketNum[0] + bucket.z*bucketNum[0]*bucketNum[1];
-    int num = atomicAdd(particleCount+myBucket, 1);
-    if (num < maxParticlePerCell){
-      particleList[myBucket+num*totalBucketNum]=i;
-      particleToBucketList[i]=myBucket;
-    }else{
-      // Note: printf in device code works only with c.c.>=2.0 
-#if (__CUDA_ARCH__>=200)
-      printf("error: overflow in grid cell (%i,%i,%i)\n",bucket.x,bucket.y,bucket.z);
-#endif
-    }
-  }
-}
-
-/// This function reads the diagonal element entries of a square matrix
-/// and stores them in the vector diag and the inverse of them in diag_i
-/// size     : size of the matrix
-/// mat_a    : the matrix
-/// lda      : leading dimension of the matrix mat_a
-/// diag     : the diagonal (OUT)
-/// diag_i   : the inverse of the diagonal (OUT)
-__global__ void sd_get_diag(int size, const real * mat_a,int lda,real * diag,real * diag_i){
-  const int stepw=lda+1;
-  for (int l=threadIdx.x+blockDim.x*blockIdx.x;l<size;l+=blockDim.x*gridDim.x){
-    real tmp;
-    tmp = mat_a[stepw*l];
-    diag[l]=tmp;
-    diag_i[l]=1/tmp;
-  }
-}
-
-/// computing the 1 norm of a given vector
-/// size     : size of the vector
-/// vec      : data of the vector
-/// res      : the 1 norm (OUT)
-__global__ void sd_nrm1(const int size, const real * const vec, real * res){
-  __shared__ real cache[numThreadsPerBlock];
-  if (blockIdx.x == 0){ // only use one block
-    //assert(blockDim.x == numThreadsPerBlock);
-    cache[threadIdx.x]=0;
-    for (int i = threadIdx.x; i < size ; i+=blockDim.x){
-      cache[threadIdx.x]+=abs(vec[i]);
-    }
-    reduce_sum(cache);
-    if (threadIdx.x == 0 ){
-      res[0]=cache[0];
-    }
-  }
-}
-
-/// computing the 1 norm of a given vector
-/// size     : size of the vector
-/// vec      : data of the vector
-/// res      : the 1 norm (OUT)
-__global__ void sd_nrm_inf(const int size, const int * const vec, int * res){
-  extern __shared__ int cache2[]; // blockDim.x * sizeof(int)
-  //if (blockIdx.x == 0){ // only use one block
-  cache2[threadIdx.x]=vec[threadIdx.x];
-  for (int i = threadIdx.x+blockDim.x; i < size ; i+=blockDim.x){
-    if ( cache2[threadIdx.x] < vec[i])
-      cache2[threadIdx.x]=vec[i];
-  }
-  __syncthreads();
-  reduce_max(cache2);
-  if (threadIdx.x == 0 ){
-    res[blockIdx.x]=cache2[0];
-  }
-  //}
-}
-
-#endif /* SD */
diff --git a/src/core/integrate_sd_cuda_kernel.hpp b/src/core/integrate_sd_cuda_kernel.hpp
deleted file mode 100644
index f0adc41df02..00000000000
--- a/src/core/integrate_sd_cuda_kernel.hpp
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
-  Copyright (C) 2010,2012,2013,2014,2015,2016 The ESPResSo project
-  Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010 
-    Max-Planck-Institute for Polymer Research, Theory Group
-  
-  This file is part of ESPResSo.
-  
-  ESPResSo is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-  
-  ESPResSo is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-  
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>. 
-*/
-
-#ifndef __INTEGRATE_SD_CUDA_KERNEL_HPP
-#define __INTEGRATE_SD_CUDA_KERNEL_HPP
-
-#include "integrate_sd.hpp"
-
-#ifdef SD
-
-// This computes the farfield contribution.
-// r is the vector of [x_1, y_1, z_1, x_2, y_2, z_2, ...]
-// N is the number of particles
-// self_mobility is 1./(6.*PI*viscosity*radius)
-// a is the particle radius
-// mobility is the mobility matrix which will be retruned
-// L is the boxlength
-__global__ void sd_compute_mobility_matrix(const real * r, int N, real self_mobility, real a, real * mobility);
-
-
-
-/// This computes the farfield contribution of the mobility with ewald summation
-/// this version assumes that the real-space cutoff is smaller than the boxlength
-// r is the vector of [x_1, y_1, z_1, x_2, y_2, z_2, ...]
-// N is the number of particles
-// self_mobility is 1./(6.*PI*eta*a)
-// a is the particle radius
-// mobility is the mobility matrix which will be retruned
-// L_d is the boxlength
-// cutoff the realspace cutoff distance
-// xi the splitting parameter as defined by Beenakker 1986
-// xa  = xi * a
-// xa3 = xa * xa * xa
-__global__ void sd_compute_mobility_matrix_real_short(const real * r, int N, real self_mobility, real a, const real * L_g, real * mobility,
-						      real cutoff, real xi, real xa, real xa3);
-
-/// This computes the farfield contribution of the mobility with ewald summation
-/// this kernel computes the sines and cosinus.
-// r is the vector of [x_1, y_1, z_1, x_2, y_2, z_2, ...]
-// N is the number of particles
-// vecs are the k vectors
-// num is the number of k vectors
-// cosines is the pointer where the cosines : cos( position \times k-vector) are saved
-// sines the same for sines
-// ldd the rounded number of particles
-__global__ void sd_compute_mobility_sines(const real * r, int N, const real * vecs, int num, real * sines, real * cosines, int ldd);
-
-/// adds to each of the diagonal elemnts of the size*size matrix \param matrix
-/// with lda \param lda 1
-__global__ void sd_add_identity_matrix(real * matrix, int size, int lda);
-
-void _cudaCheckError(const char *msg, const char * file, const int line);
-// this computes the near field
-// it calculates the ResistanceMatrix
-__global__ void sd_compute_resistance_matrix(const real * r, int N, real self_mobility, real a, const real * L, real * resistance, int * myInfo);
-// this computes the near field
-// it calculates the Sparse ResistanceMatrix
-
-__global__ void sd_compute_resistance_matrix_sparse(const real * pos, const int N, const real self_mobility,const real a,const real * _L,
-  						    real * resistance,const int * col_idx,const int * row_l, int * myInfo);
-// TODO: make the order of arguments uniform (and logical?)
-// TODO: description here
-__global__ void sd_compute_brownian_force_nearfield(const real * r,const real * gaussian_nf,int N,const real * L, real a, real self_mobility,real * brownian_force_nf);
-
-
-/// This computes the farfield contribution of the mobility with ewald summation
-/// this kernel computes the sines and cosinus.
-// forces is the vector of [fx_1, fy_1, fz_1, fx_2, fy_2, fz_2, ...]
-// a is the particle radius
-// mobility is the mobility matrix which will be retruned (in/out)
-// L_d is the boxlength
-// cutoff the wavespace cutoff distance
-__global__ void sd_wavepart_sum(const real * const forces, const real * const vecs,    int num, const  real * const matrices_d,
-                                const real * const sines,  const real * const cosines, int ldd, int N, real * sin_sum, real * cos_sum, real max);
-
-
-__global__ void sd_wavepart_assemble(const int num, const real * const sines, const real * const cosines, const real * const sin_sum,
-				     const real * const cos_sum, const int ldd, real * out, real max, int N, const real factor);
-
-
-/// Add the wavepart contribution of the mobility to the matrix
-__global__ void sd_wavepart_addto_matrix(const int num, const  real * const matrices_d, const real * const sines,  const real * const cosines,
-					 const int ldd, const int N, real * const mobility_d, const int mat_ldd);
-
-// make sure to have one thread per particle
-__global__ void sd_real_integrate_prepare( const real * r_d , real * disp_d, const real * L, real a, int N);
-__global__ void sd_real_integrate( real * r_d , const real * disp_d, const real * L, real a, int N);
-
-
-// this sets a block to zero
-// matrix: pointer to the given matrix
-// size  : the size of the matrix (in the example below 3N)
-// ldd   : the leading dimension of the matrix
-__global__ void sd_set_zero_matrix(real * matrix, int size, int ldd);
-
-
-// this sets a block to zero
-// data  : pointer to the given data
-// size  : the size of the data
-__global__ void sd_set_zero(real * data, int size);
-
-// this sets a block to an given integer
-// data  : pointer to the given data
-// size  : the size of the data
-// value : the value written to the data block
-__global__ void sd_set_value(int * data, int size, int value);
-
-// this sets a block to an given real
-// data  : pointer to the given data
-// size  : the size of the data
-// value : the value written to the data block
-__global__ void sd_set_value(real * data, int size, real value);
-
-
-// This function reads the diagonal element entries of a matrix
-// and stores them in the vector diag
-// and the inverse of them in diag_i
-__global__ void sd_get_diag(int size, const real * mat_a,int lda,real * diag,real * diag_i);
-
-
-
-// implementation of a bucket sort algorithm
-// puts all the N particles with given position pos 
-// and particle radius a within the periodic boundary 
-// conditions of boxSize L_i = bucketSize_i * bucketNum_i
-// puts them in the list particleList
-// pos                device array of particle position xyz
-// bucketSize         device array with the number of buckets in x y and z direction
-// bucketNum          device array with the size of a bucket in x y and z direction
-// N                  number of particles
-// particleCount      device array of the numbers of particles per bucket. must be initalized to zero
-// particleList       device array of the partilces in each bucket
-// maxParticlePerCell maximum particles per cell
-// totalBucketNUm     bucketNum[0]*bucketNum[1]*bucketNum[2] - the total number of buckets
-__global__ void sd_bucket_sort( const real * pos , const real * bucketSize, const int * bucketNum, int N, int * particleCount,
-				int * particleList, int maxParticlePerCell, int totalBucketNum, int * particleToBucketList);
-
-
-// matrix multiplication for sparse matrizes
-// never use direct, use always wrapper
-__global__ void sd_multiply_sparse_matrix_vector(const int size, const real factor, const real * matrix, const int ldd, const int ldd_short,
-						 const int * col_idx, const int * row_l, const real * vec_in, real * vec_out );
-
-// finding all interacting particles, given a certain iteraction range.
-// this requires bucket sort and is therefor O(N)
-// if interaction_range is > min(L)/2 particles can appeare more often in the list.
-// self_interaction is excluded.
-__global__ void sd_find_interacting_particles(const real * pos,const real * _L, const int N, int * interacting, int * num_interacting,
-					      const int * particle_count, const int * particle_sorted_list, const real * bucket_size_,
-					      const int * bucket_num_, const int * particle_to_bucket_list, const real interaction_range,
-					      const int total_bucket_num);
-
-// finding all interacting particles, given a certain iteraction range.
-// this works without bucket sort and is O(N^2)
-__global__ void sd_find_interacting_particles(const real * pos,const real * L_g, const int N, int * interacting, int * num_interacting,
-					      const real interaction_range);
-
-
-
-/// computing the inf norm of a given vector (or in other words the largest value)
-/// if more than one block is used, this kernel does not perform the final reduction
-/// size     : size of the vector
-/// vec      : data of the vector
-/// res      : the 1 norm (OUT)
-__global__ void sd_nrm_inf(const int size, const int * const vec, int * res);
-
-
-
-__global__ void sd_nrm1(const int size,const real * const vec, real * erg);
-
-#endif /* SD */
-
-#endif
diff --git a/src/core/integrate_sd_matrix.cu b/src/core/integrate_sd_matrix.cu
deleted file mode 100644
index 5d670d58358..00000000000
--- a/src/core/integrate_sd_matrix.cu
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
-  Copyright (C) 2010,2012,2013,2014,2015,2016 The ESPResSo project
-  Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010 
-    Max-Planck-Institute for Polymer Research, Theory Group
-  
-  This file is part of ESPResSo.
-  
-  ESPResSo is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-  
-  ESPResSo is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-  
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>. 
-*/
-
-#include "config.hpp"
-
-#ifdef SD
-
-#include <stdio.h>
-#include <assert.h>
-#include "cuda_runtime.h"
-
-#include "integrate_sd.hpp"
-#include "integrate_sd_cuda.hpp"
-#include "integrate_sd_matrix.hpp"
-
-#include <openssl/md5.h>
-
-#if defined(OMPI_MPI_H) || defined(_MPI_H)
-#error CU-file includes mpi.h! This should not happen!
-#endif
-
-/// constructor for wave part
-/// _ldd_short : ldd_short of the matrix
-wavepart::wavepart(int _ldd_short):ldd_short(_ldd_short){
-  vecs          = NULL;
-  matrices      = NULL;
-  cosines       = NULL;
-  sines         = NULL;
-  num           = 0;
-  max           = 0;
-#ifdef SD_DEBUG
-  vecs_hash     = NULL;
-  matrices_hash = NULL;
-  cosines_hash  = NULL;
-  sines_hash    = NULL;
-#endif
-}
-
-/// destructor of the wave part
-wavepart::~wavepart(){
-  if (vecs){
-    cuda_safe_mem(cudaFree((void*)vecs));
-    vecs=NULL;
-  }
-  if (matrices){
-    cuda_safe_mem(cudaFree((void*)matrices));
-    matrices=NULL;
-  }
-  if (cosines){
-    cuda_safe_mem(cudaFree((void*)cosines));
-    cosines=NULL;
-  }
-  if (sines){
-    cuda_safe_mem(cudaFree((void*)sines));
-    sines=NULL;
-  }
-#ifdef SD_DEBUG
-  if (vecs_hash){
-    free(vecs_hash);
-    vecs_hash=NULL;
-  }
-  if (matrices_hash){
-    free(matrices_hash);
-    matrices_hash=NULL;
-  }
-  if (sines_hash){
-    free(sines_hash);
-    sines_hash=NULL;
-  }
-  if (cosines_hash){
-    free(cosines_hash);
-    cosines_hash=NULL;
-  }
-#endif
-  max=0;
-}
-
-
-/// free the wave part of the matrix
-void matrix::_free_wavespace(){
-  if (wavespace){
-    //wavespace->~wavepart();
-    //free((void*)wavespace);
-    delete wavespace;
-    wavespace=NULL;
-  }
-}
-
-/// constructor of the matrix
-matrix::matrix(){
-  data=NULL;
-  col_idx=NULL;
-  row_l=NULL;
-  wavespace=NULL;
-  is_sparse=false;
-  size=0;
-  ldd=0;
-  ldd_short=0;
-#ifdef SD_DEBUG
-  dense=NULL;
-  data_hash=NULL;
-  dense_hash=NULL;
-#endif
-}
-
-/// destructor of the matrix
-matrix::~matrix(){
-  if (data){
-    cuda_safe_mem(cudaFree((void*)data));
-    data=NULL;
-  }
-  if (col_idx){
-    cuda_safe_mem(cudaFree((void*)col_idx));
-    col_idx=NULL;
-  }
-  if (row_l){
-    cuda_safe_mem(cudaFree((void*)row_l));
-    row_l=NULL;
-  }
-#ifdef SD_DEBUG
-  if (dense){
-    cuda_safe_mem(cudaFree((void*)dense));
-    dense=NULL;
-  }
-  if (data_hash){
-    free(data_hash);
-    data_hash=NULL;
-  }
-  if (dense_hash){
-    free(dense_hash);
-    dense_hash=NULL;
-  }
-#endif
-  _free_wavespace();
-}
-
-
-/// print some basic properties of the matrix, if SD_DEBUG is enabled
-void matrix::printStats() const{
-#ifdef SD_DEBUG
-  printf("addr:%p\tsize:%d\tldd:%d\tsparse:%d",this,size,ldd,is_sparse);
-#endif
-}
-
-void getAndHash(const real * data, const int size, unsigned char result[MD5_DIGEST_LENGTH]){
-#ifdef SD_DEBUG
-  real host[size];
-  cuda_safe_mem(cudaMemcpy( host, data, size*sizeof(real), cudaMemcpyDeviceToHost ));
-  MD5((unsigned char*) host, size, result);
-#endif
-}
-
-void matrix::hash_data(){
-#ifdef SD_DEBUG
-  if (data_hash == NULL){
-    data_hash=(unsigned char *) Utils::malloc(MD5_DIGEST_LENGTH);
-  }
-  getAndHash(data, size*ldd, data_hash);
-#endif
-}
-
-void matrix::hash_dense(){
-#ifdef SD_DEBUG
-  if (dense_hash == NULL){
-    dense_hash=(unsigned char *) Utils::malloc(MD5_DIGEST_LENGTH);
-  }
-  getAndHash(dense, size*ldd, dense_hash);
-#endif
-}
-
-void matrix::assert_all() const{
-#ifdef SD_DEBUG
-  unsigned char tmp[MD5_DIGEST_LENGTH];
-  if (data_hash){
-    getAndHash(data, size*ldd, tmp);
-    for (int i=0; i < MD5_DIGEST_LENGTH; i++){
-      assert (tmp[i]==data_hash[i]);
-      printf("%02x",tmp[i]);
-    }
-  }
-  if (dense_hash){
-    getAndHash(dense, size*ldd, tmp);
-    for (int i=0; i < MD5_DIGEST_LENGTH; i++){
-      assert (tmp[i]==dense_hash[i]);
-      printf("%02x",tmp[i]);
-    }
-  }
-  if (wavespace){
-    wavespace->assert_all();
-  }
-#endif
-}
-
-
-void wavepart::hash_vecs(){
-#ifdef SD_DEBUG
-  fprintf(stderr,"going to allocate %d bytes, vecs_hash is %p\n",MD5_DIGEST_LENGTH, vecs_hash);
-  if (vecs_hash == NULL){
-    vecs_hash=(unsigned char *) Utils::malloc(MD5_DIGEST_LENGTH);
-    fprintf(stderr,"allocated %d bytes\n",MD5_DIGEST_LENGTH);
-  }
-  getAndHash(vecs, num*3 , vecs_hash);
-#endif
-}
-void wavepart::hash_matrices(){
-#ifdef SD_DEBUG
-  if (matrices_hash == NULL){
-    matrices_hash=(unsigned char *) Utils::malloc(MD5_DIGEST_LENGTH);
-  }
-  getAndHash(matrices, num*6, matrices_hash);
-#endif
-}
-void wavepart::hash_cosines(){
-#ifdef SD_DEBUG
-  if (cosines_hash == NULL){
-    cosines_hash=(unsigned char *) Utils::malloc(MD5_DIGEST_LENGTH);
-  }
-  getAndHash(cosines, num*ldd_short, cosines_hash);
-#endif
-}
-void wavepart::hash_sines(){
-#ifdef SD_DEBUG
-  if (sines_hash == NULL){
-    sines_hash=(unsigned char *) Utils::malloc(MD5_DIGEST_LENGTH);
-  }
-  getAndHash(sines, num*ldd_short, sines_hash);
-#endif
-}
-
-void wavepart::assert_all() const{
-#ifdef SD_DEBUG
-  unsigned char tmp[MD5_DIGEST_LENGTH];
-  if (vecs_hash){
-    getAndHash(vecs, 3*num, tmp);
-    for (int i=0; i < MD5_DIGEST_LENGTH; i++){
-      assert (tmp[i]==vecs_hash[i]);
-      printf("%02x",tmp[i]);
-    }
-  }
-  if (matrices_hash){
-    getAndHash(matrices, 6*num, tmp);
-    for (int i=0; i < MD5_DIGEST_LENGTH; i++){
-      assert (tmp[i]==matrices_hash[i]);
-      printf("%02x",tmp[i]);
-    }
-  }
-  if (sines_hash){
-    getAndHash(sines, num*ldd_short, tmp);
-    for (int i=0; i < MD5_DIGEST_LENGTH; i++){
-      assert (tmp[i]==sines_hash[i]);
-      printf("%02x",tmp[i]);
-    }
-  }
-  if (cosines_hash){
-    getAndHash(cosines, num*ldd_short, tmp);
-    for (int i=0; i < MD5_DIGEST_LENGTH; i++){
-      assert (tmp[i]==cosines_hash[i]);
-      printf("%02x",tmp[i]);
-    }
-  }
-#endif
-}
-
-#endif /* SD */
-
diff --git a/src/core/integrate_sd_matrix.hpp b/src/core/integrate_sd_matrix.hpp
deleted file mode 100644
index 30a6fe5f954..00000000000
--- a/src/core/integrate_sd_matrix.hpp
+++ /dev/null
@@ -1,72 +0,0 @@
-#pragma once
-
-/// stuct for sparse matrix: (blocked version off ELLPACK-R, see \link http://elib.uni-stuttgart.de/opus/volltexte/2010/5033/pdf/DIP_2938.pdf \endlink
-/// for dense matrizes use only data
-
-struct wavepart{
-public:
-  real * vecs;
-  real * matrices;
-  real * cosines;
-  real * sines;
-  int num;
-  int max;
-  int & ldd_short;
-  wavepart(int _ldd_short);
-  ~wavepart();
-private:
-  wavepart(wavepart & other);
-public:
-  void print( int N, int ldd) const;
-  void hash_vecs();
-  void hash_matrices();
-  void hash_cosines();
-  void hash_sines();
-  void assert_all() const;
-#ifdef SD_DEBUG
-private:
-  unsigned char * vecs_hash;
-  unsigned char * matrices_hash;
-  unsigned char * cosines_hash;
-  unsigned char * sines_hash;
-#endif
-};
-
-
-
-struct matrix{
-  //public:
-  real * data;
-  int  * col_idx;
-  int  * row_l;
-  wavepart * wavespace;
-  bool is_sparse;
-  int size;
-  int ldd;
-  int ldd_short;
-  void _init();
-  matrix();
-  ~matrix();
-  void _free_wavespace();
-  void _free();
-  void printWavespace() const;
-  void print() const;
-  void printStats() const;
-private:
-  matrix(matrix & other); // disable copy operator
-#ifdef SD_DEBUG
-  unsigned char * data_hash;
-  unsigned char * dense_hash;
-#endif
-public:
-  real * dense;
-  void hash_data();
-  void hash_dense();
-  void assert_all() const;
-};
-  
-  
-#ifdef SD_DEBUG
-  void getAndHash(const real * data, const int size, unsigned char * result);
-#endif
-  
diff --git a/src/core/thermostat.hpp b/src/core/thermostat.hpp
index f36735bf0a5..e3b93fdcb69 100755
--- a/src/core/thermostat.hpp
+++ b/src/core/thermostat.hpp
@@ -48,8 +48,6 @@
 #define THERMO_INTER_DPD  16
 #define THERMO_GHMC       32
 #define THERMO_CPU        64
-#define THERMO_SD         128
-#define THERMO_BD         256
 /*@}*/
 
 // Handle switching of noise function flat vs Gaussian